/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 1990 Mentat Inc.
 */

/*
 * This file contains the interface control functions for IP.
 */

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/dlpi.h>
#include <sys/stropts.h>
#include <sys/strsun.h>
#include <sys/sysmacros.h>
#include <sys/strsubr.h>
#include <sys/strlog.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/debug.h>
#include <sys/zone.h>
#include <sys/sunldi.h>
#include <sys/file.h>
#include <sys/bitmap.h>
#include <sys/cpuvar.h>
#include <sys/time.h>
#include <sys/ctype.h>
#include <sys/kmem.h>
#include <sys/systm.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/isa_defs.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/if_types.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <sys/sockio.h>
#include <netinet/in.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet/igmp_var.h>
#include <sys/policy.h>
#include <sys/ethernet.h>
#include <sys/callb.h>
#include <sys/md5.h>

#include <inet/common.h>	/* for various inet/mi.h and inet/nd.h needs */
#include <inet/mi.h>
#include <inet/nd.h>
#include <inet/tunables.h>
#include <inet/arp.h>
#include <inet/ip_arp.h>
#include <inet/mib2.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/ip6_asp.h>
#include <inet/tcp.h>
#include <inet/ip_multi.h>
#include <inet/ip_ire.h>
#include <inet/ip_ftable.h>
#include <inet/ip_rts.h>
#include <inet/ip_ndp.h>
#include <inet/ip_if.h>
#include <inet/ip_impl.h>
#include <inet/sctp_ip.h>
#include <inet/ip_netinfo.h>
#include <inet/ilb_ip.h>

#include <netinet/igmp.h>
#include <inet/ip_listutils.h>
#include <inet/ipclassifier.h>
#include <sys/mac_client.h>
#include <sys/dld.h>
#include <sys/mac_flow.h>

#include <sys/systeminfo.h>
#include <sys/bootconf.h>

#include <sys/tsol/tndb.h>
#include <sys/tsol/tnet.h>

#include <inet/rawip_impl.h>	/* needed for icmp_stack_t */
#include <inet/udp_impl.h>	/* needed for udp_stack_t */

/* The character which tells where the ill_name ends */
#define	IPIF_SEPARATOR_CHAR	':'

/* IP ioctl function table entry */
typedef struct ipft_s {
	int	ipft_cmd;
	pfi_t	ipft_pfi;
	int	ipft_min_size;
	int	ipft_flags;
} ipft_t;
#define	IPFT_F_NO_REPLY		0x1	/* IP ioctl does not expect any reply */
#define	IPFT_F_SELF_REPLY	0x2	/* ioctl callee does the ioctl reply */

static int	nd_ill_forward_get(queue_t *, mblk_t *, caddr_t, cred_t *);
static int	nd_ill_forward_set(queue_t *q, mblk_t *mp,
		    char *value, caddr_t cp, cred_t *ioc_cr);

static boolean_t ill_is_quiescent(ill_t *);
static boolean_t ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask);
static ip_m_t	*ip_m_lookup(t_uscalar_t mac_type);
static int	ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q,
    mblk_t *mp);
static int	ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp);
static int	ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t, in6_addr_t,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp,
    int ioccmd, struct linkblk *li);
static ipaddr_t	ip_subnet_mask(ipaddr_t addr, ipif_t **, ip_stack_t *);
static void	ip_wput_ioctl(queue_t *q, mblk_t *mp);
static void	ipsq_flush(ill_t *ill);

static int	ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static void	ipsq_delete(ipsq_t *);

static ipif_t	*ipif_allocate(ill_t *ill, int id, uint_t ire_type,
    boolean_t initialize, boolean_t insert, int *errorp);
static ire_t	**ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep);
static void	ipif_delete_bcast_ires(ipif_t *ipif);
static int	ipif_add_ires_v4(ipif_t *, boolean_t);
static boolean_t ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif,
		    boolean_t isv6);
static int	ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp);
static void	ipif_free(ipif_t *ipif);
static void	ipif_free_tail(ipif_t *ipif);
static void	ipif_set_default(ipif_t *ipif);
static int	ipif_set_values(queue_t *q, mblk_t *mp,
    char *interf_name, uint_t *ppa);
static int	ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp,
    queue_t *q);
static ipif_t	*ipif_lookup_on_name(char *name, size_t namelen,
    boolean_t do_alloc, boolean_t *exists, boolean_t isv6, zoneid_t zoneid,
    ip_stack_t *);
static ipif_t	*ipif_lookup_on_name_async(char *name, size_t namelen,
    boolean_t isv6, zoneid_t zoneid, queue_t *q, mblk_t *mp, ipsq_func_t func,
    int *error, ip_stack_t *);

static int	ill_alloc_ppa(ill_if_t *, ill_t *);
static void	ill_delete_interface_type(ill_if_t *);
static int	ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q);
static void	ill_dl_down(ill_t *ill);
static void	ill_down(ill_t *ill);
static void	ill_down_ipifs(ill_t *, boolean_t);
static void	ill_free_mib(ill_t *ill);
static void	ill_glist_delete(ill_t *);
static void	ill_phyint_reinit(ill_t *ill);
static void	ill_set_nce_router_flags(ill_t *, boolean_t);
static void	ill_set_phys_addr_tail(ipsq_t *, queue_t *, mblk_t *, void *);
static void	ill_replumb_tail(ipsq_t *, queue_t *, mblk_t *, void *);

static ip_v6intfid_func_t ip_ether_v6intfid, ip_ib_v6intfid;
static ip_v6intfid_func_t ip_ipv4_v6intfid, ip_ipv6_v6intfid;
static ip_v6intfid_func_t ip_ipmp_v6intfid, ip_nodef_v6intfid;
static ip_v6intfid_func_t ip_ipv4_v6destintfid, ip_ipv6_v6destintfid;
static ip_v4mapinfo_func_t ip_ether_v4_mapping;
static ip_v6mapinfo_func_t ip_ether_v6_mapping;
static ip_v4mapinfo_func_t ip_ib_v4_mapping;
static ip_v6mapinfo_func_t ip_ib_v6_mapping;
static ip_v4mapinfo_func_t ip_mbcast_mapping;
static void	ip_cgtp_bcast_add(ire_t *, ip_stack_t *);
static void	ip_cgtp_bcast_delete(ire_t *, ip_stack_t *);
static void	phyint_free(phyint_t *);

static void ill_capability_dispatch(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_id_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_vrrp_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_hcksum_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void ill_capability_hcksum_reset_fill(ill_t *, mblk_t *);
static void ill_capability_zerocopy_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void ill_capability_zerocopy_reset_fill(ill_t *, mblk_t *);
static void ill_capability_dld_reset_fill(ill_t *, mblk_t *);
static void ill_capability_dld_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void ill_capability_dld_enable(ill_t *);
static void ill_capability_ack_thr(void *);
static void ill_capability_lso_enable(ill_t *);

static ill_t	*ill_prev_usesrc(ill_t *);
static int	ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t);
static void	ill_disband_usesrc_group(ill_t *);
static void	ip_sioctl_garp_reply(mblk_t *, ill_t *, void *, int);

#ifdef DEBUG
static void	ill_trace_cleanup(const ill_t *);
static void	ipif_trace_cleanup(const ipif_t *);
#endif

static void	ill_dlpi_clear_deferred(ill_t *ill);

/*
 * if we go over the memory footprint limit more than once in this msec
 * interval, we'll start pruning aggressively.
 */
int ip_min_frag_prune_time = 0;

static ipft_t	ip_ioctl_ftbl[] = {
	{ IP_IOC_IRE_DELETE, ip_ire_delete, sizeof (ipid_t), 0 },
	{ IP_IOC_IRE_DELETE_NO_REPLY, ip_ire_delete, sizeof (ipid_t),
	    IPFT_F_NO_REPLY },
	{ IP_IOC_RTS_REQUEST, ip_rts_request, 0, IPFT_F_SELF_REPLY },
	{ 0 }
};

/* Simple ICMP IP Header Template */
static ipha_t icmp_ipha = {
	IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
};

static uchar_t	ip_six_byte_all_ones[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };

static ip_m_t	ip_m_tbl[] = {
	{ DL_ETHER, IFT_ETHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_CSMACD, IFT_ISO88023, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_TPB, IFT_ISO88024, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_TPR, IFT_ISO88025, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_FDDI, IFT_FDDI, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_IB, IFT_IB, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ib_v4_mapping, ip_ib_v6_mapping, ip_ib_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_IPV4, IFT_IPV4, IPPROTO_ENCAP, IPPROTO_IPV6,
	    ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid,
	    ip_ipv4_v6destintfid },
	{ DL_IPV6, IFT_IPV6, IPPROTO_ENCAP, IPPROTO_IPV6,
	    ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv6_v6intfid,
	    ip_ipv6_v6destintfid },
	{ DL_6TO4, IFT_6TO4, IPPROTO_ENCAP, IPPROTO_IPV6,
	    ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid,
	    ip_nodef_v6intfid },
	{ SUNW_DL_VNI, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    NULL, NULL, ip_nodef_v6intfid, ip_nodef_v6intfid },
	{ SUNW_DL_IPMP, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    NULL, NULL, ip_ipmp_v6intfid, ip_nodef_v6intfid },
	{ DL_OTHER, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid }
};
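
/*
 * ip_m_lookup() (declared above, defined later in this file) selects an
 * entry from ip_m_tbl[] by the DLPI mac type in its first field; the
 * DL_OTHER entry serves as the catch-all, so an unrecognized device gets
 * Ethernet-style multicast mappings but no stable IPv6 interface id
 * (ip_nodef_v6intfid).
 */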

static ill_t	ill_null;		/* Empty ILL for init. */
char	ipif_loopback_name[] = "lo0";

/* These are used by all IP network modules. */
sin6_t	sin6_null;	/* Zero address for quick clears */
sin_t	sin_null;	/* Zero address for quick clears */

/* When set search for unused ipif_seqid */
static ipif_t	ipif_zero;

/*
 * ppa arena is created after these many
 * interfaces have been plumbed.
 */
uint_t	ill_no_arena = 12;	/* Settable in /etc/system */

/*
 * Allocate per-interface mibs.
 * Returns B_TRUE if ok, B_FALSE otherwise.
 * ipsq may not yet be allocated (loopback case).
 */
static boolean_t
ill_allocate_mibs(ill_t *ill)
{
	/* Already allocated? */
	if (ill->ill_ip_mib != NULL) {
		if (ill->ill_isv6)
			ASSERT(ill->ill_icmp6_mib != NULL);
		return (B_TRUE);
	}

	ill->ill_ip_mib = kmem_zalloc(sizeof (*ill->ill_ip_mib),
	    KM_NOSLEEP);
	if (ill->ill_ip_mib == NULL) {
		return (B_FALSE);
	}

	/* Setup static information */
	SET_MIB(ill->ill_ip_mib->ipIfStatsEntrySize,
	    sizeof (mib2_ipIfStatsEntry_t));
	if (ill->ill_isv6) {
		ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv6;
		SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
		    sizeof (mib2_ipv6AddrEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
		    sizeof (mib2_ipv6RouteEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
		    sizeof (mib2_ipv6NetToMediaEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
		    sizeof (ipv6_member_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
		    sizeof (ipv6_grpsrc_t));
	} else {
		ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv4;
		SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
		    sizeof (mib2_ipAddrEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
		    sizeof (mib2_ipRouteEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
		    sizeof (mib2_ipNetToMediaEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
		    sizeof (ip_member_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
		    sizeof (ip_grpsrc_t));

		/*
		 * For a v4 ill, we are done at this point, because per ill
		 * icmp mibs are only used for v6.
		 */
		return (B_TRUE);
	}

	ill->ill_icmp6_mib = kmem_zalloc(sizeof (*ill->ill_icmp6_mib),
	    KM_NOSLEEP);
	if (ill->ill_icmp6_mib == NULL) {
		kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib));
		ill->ill_ip_mib = NULL;
		return (B_FALSE);
	}
	/* static icmp info */
	ill->ill_icmp6_mib->ipv6IfIcmpEntrySize =
	    sizeof (mib2_ipv6IfIcmpEntry_t);
	/*
	 * The ipIfStatsIfindex and ipv6IfIcmpIndex will be assigned later
	 * after the phyint merge occurs in ipif_set_values -> ill_glist_insert
	 * -> ill_phyint_reinit
	 */
	return (B_TRUE);
}

/*
 * Completely vaporize a lower level tap and all associated interfaces.
 * ill_delete is called only out of ip_close when the device control
 * stream is being closed.
 */
void
ill_delete(ill_t *ill)
{
	ipif_t	*ipif;
	ill_t	*prev_ill;
	ip_stack_t	*ipst = ill->ill_ipst;

	/*
	 * ill_delete may be forcibly entering the ipsq. The previous
	 * ioctl may not have completed and may need to be aborted.
	 * ipsq_flush takes care of it. If we don't need to enter the
	 * ipsq forcibly, the 2nd invocation of ipsq_flush in
	 * ill_delete_tail is sufficient.
	 */
	ipsq_flush(ill);

	/*
	 * Nuke all interfaces. ipif_free will take down the interface,
	 * remove it from the list, and free the data structure.
	 * Walk down the ipif list and remove the logical interfaces
	 * first before removing the main ipif. We can't unplumb
	 * zeroth interface first in the case of IPv6 as update_conn_ill
	 * -> ip_ll_multireq dereferences ill_ipif for checking
	 * POINTOPOINT.
	 *
	 * If ill_ipif was not properly initialized (i.e., low on memory),
	 * then there are no interfaces to clean up. In this case just clean
	 * up the ill.
	 */
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		ipif_free(ipif);

	/*
	 * clean out all the nce_t entries that depend on this
	 * ill for the ill_phys_addr.
	 */
	nce_flush(ill, B_TRUE);

	/* Clean up msgs on pending upcalls for mrouted */
	reset_mrt_ill(ill);

	update_conn_ill(ill, ipst);

	/*
	 * Remove multicast references added as a result of calls to
	 * ip_join_allmulti().
	 */
	ip_purge_allmulti(ill);

	/*
	 * If the ill being deleted is under IPMP, boot it out of the illgrp.
	 */
	if (IS_UNDER_IPMP(ill))
		ipmp_ill_leave_illgrp(ill);

	/*
	 * ill_down will arrange to blow off any IRE's dependent on this
	 * ILL, and shut down fragmentation reassembly.
	 */
	ill_down(ill);

	/* Let SCTP know, so that it can remove this from its list. */
	sctp_update_ill(ill, SCTP_ILL_REMOVE);

	/*
	 * Walk all CONNs that can have a reference on an ire or nce for this
	 * ill (we actually walk all that now have stale references).
	 */
	ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ipst);

	/* With IPv6 we have dce_ifindex. Cleanup for neatness */
	if (ill->ill_isv6)
		dce_cleanup(ill->ill_phyint->phyint_ifindex, ipst);

	/*
	 * If an address on this ILL is being used as a source address then
	 * clear out the pointers in other ILLs that point to this ILL.
	 */
	rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER);
	if (ill->ill_usesrc_grp_next != NULL) {
		if (ill->ill_usesrc_ifindex == 0) { /* usesrc ILL ? */
			ill_disband_usesrc_group(ill);
		} else {	/* consumer of the usesrc ILL */
			prev_ill = ill_prev_usesrc(ill);
			prev_ill->ill_usesrc_grp_next =
			    ill->ill_usesrc_grp_next;
		}
	}
	rw_exit(&ipst->ips_ill_g_usesrc_lock);
}

static void
ipif_non_duplicate(ipif_t *ipif)
{
	ill_t *ill = ipif->ipif_ill;

	mutex_enter(&ill->ill_lock);
	if (ipif->ipif_flags & IPIF_DUPLICATE) {
		ipif->ipif_flags &= ~IPIF_DUPLICATE;
		ASSERT(ill->ill_ipif_dup_count > 0);
		ill->ill_ipif_dup_count--;
	}
	mutex_exit(&ill->ill_lock);
}

/*
 * ill_delete_tail is called from ip_modclose after all references
 * to the closing ill are gone. The wait is done in ip_modclose.
 */
void
ill_delete_tail(ill_t *ill)
{
	mblk_t	**mpp;
	ipif_t	*ipif;
	ip_stack_t	*ipst = ill->ill_ipst;

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		ipif_non_duplicate(ipif);
		(void) ipif_down_tail(ipif);
	}

	ASSERT(ill->ill_ipif_dup_count == 0);

	/*
	 * If polling capability is enabled (which signifies direct
	 * upcall into IP and driver has ill saved as a handle),
	 * we need to make sure that unbind has completed before we
	 * let the ill disappear and driver no longer has any reference
	 * to this ill.
	 */
	mutex_enter(&ill->ill_lock);
	while (ill->ill_state_flags & ILL_DL_UNBIND_IN_PROGRESS)
		cv_wait(&ill->ill_cv, &ill->ill_lock);
	mutex_exit(&ill->ill_lock);
	ASSERT(!(ill->ill_capabilities &
	    (ILL_CAPAB_DLD | ILL_CAPAB_DLD_POLL | ILL_CAPAB_DLD_DIRECT)));

	if (ill->ill_net_type != IRE_LOOPBACK)
		qprocsoff(ill->ill_rq);

	/*
	 * We do an ipsq_flush once again now. New messages could have
	 * landed up from below (M_ERROR or M_HANGUP). Similarly, ioctls
	 * could also have landed up if an ioctl thread had looked up
	 * the ill before we set the ILL_CONDEMNED flag, but had not yet
	 * enqueued the ioctl when we did the ipsq_flush last time.
	 */
	ipsq_flush(ill);

	/*
	 * Free capabilities.
	 */
	if (ill->ill_hcksum_capab != NULL) {
		kmem_free(ill->ill_hcksum_capab, sizeof (ill_hcksum_capab_t));
		ill->ill_hcksum_capab = NULL;
	}

	if (ill->ill_zerocopy_capab != NULL) {
		kmem_free(ill->ill_zerocopy_capab,
		    sizeof (ill_zerocopy_capab_t));
		ill->ill_zerocopy_capab = NULL;
	}

	if (ill->ill_lso_capab != NULL) {
		kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t));
		ill->ill_lso_capab = NULL;
	}

	if (ill->ill_dld_capab != NULL) {
		kmem_free(ill->ill_dld_capab, sizeof (ill_dld_capab_t));
		ill->ill_dld_capab = NULL;
	}

	/* Clean up ill_allowed_ips* related state */
	if (ill->ill_allowed_ips != NULL) {
		ASSERT(ill->ill_allowed_ips_cnt > 0);
		kmem_free(ill->ill_allowed_ips,
		    ill->ill_allowed_ips_cnt * sizeof (in6_addr_t));
		ill->ill_allowed_ips = NULL;
		ill->ill_allowed_ips_cnt = 0;
	}

	while (ill->ill_ipif != NULL)
		ipif_free_tail(ill->ill_ipif);

	/*
	 * We have removed all references to ilm from conn and the ones joined
	 * within the kernel.
	 *
	 * We don't walk conns, mrts and ires because
	 *
	 * 1) update_conn_ill and reset_mrt_ill cleans up conns and mrts.
	 * 2) ill_down -> ill_downi walks all the ires and cleans up
	 *    ill references.
	 */

	/*
	 * If this ill is an IPMP meta-interface, blow away the illgrp. This
	 * is safe to do because the illgrp has already been unlinked from the
	 * group by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find it.
	 */
	if (IS_IPMP(ill)) {
		ipmp_illgrp_destroy(ill->ill_grp);
		ill->ill_grp = NULL;
	}

	if (ill->ill_mphysaddr_list != NULL) {
		multiphysaddr_t *mpa, *tmpa;

		mpa = ill->ill_mphysaddr_list;
		ill->ill_mphysaddr_list = NULL;
		while (mpa) {
			tmpa = mpa->mpa_next;
			kmem_free(mpa, sizeof (*mpa));
			mpa = tmpa;
		}
	}

	/*
	 * Take us out of the list of ILLs. ill_glist_delete -> phyint_free
	 * could free the phyint. No more reference to the phyint after this
	 * point.
	 */
	(void) ill_glist_delete(ill);

	if (ill->ill_frag_ptr != NULL) {
		uint_t count;

		for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) {
			mutex_destroy(&ill->ill_frag_hash_tbl[count].ipfb_lock);
		}
		mi_free(ill->ill_frag_ptr);
		ill->ill_frag_ptr = NULL;
		ill->ill_frag_hash_tbl = NULL;
	}

	freemsg(ill->ill_nd_lla_mp);
	/* Free all retained control messages. */
	mpp = &ill->ill_first_mp_to_free;
	do {
		while (mpp[0]) {
			mblk_t  *mp;
			mblk_t  *mp1;

			mp = mpp[0];
			mpp[0] = mp->b_next;
			for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) {
				mp1->b_next = NULL;
				mp1->b_prev = NULL;
			}
			freemsg(mp);
		}
	} while (mpp++ != &ill->ill_last_mp_to_free);

	ill_free_mib(ill);

#ifdef DEBUG
	ill_trace_cleanup(ill);
#endif

	/* The default multicast interface might have changed */
	ire_increment_multicast_generation(ipst, ill->ill_isv6);

	/* Drop refcnt here */
	netstack_rele(ill->ill_ipst->ips_netstack);
	ill->ill_ipst = NULL;
}

static void
ill_free_mib(ill_t *ill)
{
	ip_stack_t *ipst = ill->ill_ipst;

	/*
	 * MIB statistics must not be lost, so when an interface
	 * goes away the counter values will be added to the global
	 * MIBs.
	 */
	if (ill->ill_ip_mib != NULL) {
		if (ill->ill_isv6) {
			ip_mib2_add_ip_stats(&ipst->ips_ip6_mib,
			    ill->ill_ip_mib);
		} else {
			ip_mib2_add_ip_stats(&ipst->ips_ip_mib,
			    ill->ill_ip_mib);
		}

		kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib));
		ill->ill_ip_mib = NULL;
	}
	if (ill->ill_icmp6_mib != NULL) {
		ip_mib2_add_icmp6_stats(&ipst->ips_icmp6_mib,
		    ill->ill_icmp6_mib);
		kmem_free(ill->ill_icmp6_mib, sizeof (*ill->ill_icmp6_mib));
		ill->ill_icmp6_mib = NULL;
	}
}

/*
 * Concatenate together a physical address and a sap.
 *
 * Sap_lengths are interpreted as follows:
 *   sap_length == 0	==> no sap
 *   sap_length > 0	==> sap is at the head of the dlpi address
 *   sap_length < 0	==> sap is at the tail of the dlpi address
 */
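/*
 * For example (illustrative only): with an Ethernet-style address of
 * phys_length == 6 and sap_length == -2, the 8-byte result is laid out
 * as
 *
 *	dst[0..5] = physical address, dst[6..7] = 16-bit sap
 *
 * whereas sap_length == 2 places the sap at dst[0..1], followed by the
 * physical address at dst[2..7].
 */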
static void
ill_dlur_copy_address(uchar_t *phys_src, uint_t phys_length,
    t_scalar_t sap_src, t_scalar_t sap_length, uchar_t *dst)
{
	uint16_t sap_addr = (uint16_t)sap_src;

	if (sap_length == 0) {
		if (phys_src == NULL)
			bzero(dst, phys_length);
		else
			bcopy(phys_src, dst, phys_length);
	} else if (sap_length < 0) {
		if (phys_src == NULL)
			bzero(dst, phys_length);
		else
			bcopy(phys_src, dst, phys_length);
		bcopy(&sap_addr, (char *)dst + phys_length, sizeof (sap_addr));
	} else {
		bcopy(&sap_addr, dst, sizeof (sap_addr));
		if (phys_src == NULL)
			bzero((char *)dst + sap_length, phys_length);
		else
			bcopy(phys_src, (char *)dst + sap_length, phys_length);
	}
}

/*
 * Generate a dl_unitdata_req mblk for the device and address given.
 * addr_length is the length of the physical portion of the address.
 * If addr is NULL, include an all-zero address of the specified length.
 * In any case, addr_length is taken to be the entire length of the
 * dlpi address, including the absolute value of sap_length.
 */
mblk_t *
ill_dlur_gen(uchar_t *addr, uint_t addr_length, t_uscalar_t sap,
    t_scalar_t sap_length)
{
	dl_unitdata_req_t *dlur;
	mblk_t	*mp;
	t_scalar_t	abs_sap_length;		/* absolute value */

	abs_sap_length = ABS(sap_length);
	mp = ip_dlpi_alloc(sizeof (*dlur) + addr_length + abs_sap_length,
	    DL_UNITDATA_REQ);
	if (mp == NULL)
		return (NULL);
	dlur = (dl_unitdata_req_t *)mp->b_rptr;
	/* HACK: accommodate incompatible DLPI drivers */
	if (addr_length == 8)
		addr_length = 6;
	dlur->dl_dest_addr_length = addr_length + abs_sap_length;
	dlur->dl_dest_addr_offset = sizeof (*dlur);
	dlur->dl_priority.dl_min = 0;
	dlur->dl_priority.dl_max = 0;
	ill_dlur_copy_address(addr, addr_length, sap, sap_length,
	    (uchar_t *)&dlur[1]);
	return (mp);
}

/*
 * Add the pending mp to the list. There can be only 1 pending mp
 * in the list. Any exclusive ioctl that needs to wait for a response
 * from another module or driver needs to use this function to set
 * the ipx_pending_mp to the ioctl mblk and wait for the response from
 * the other module/driver. This is also used while waiting for the
 * ipif/ill/ire refcnts to drop to zero in bringing down an ipif.
 */
boolean_t
ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp,
    int waitfor)
{
	ipxop_t	*ipx = ipif->ipif_ill->ill_phyint->phyint_ipsq->ipsq_xop;

	ASSERT(IAM_WRITER_IPIF(ipif));
	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
	ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL));
	ASSERT(ipx->ipx_pending_mp == NULL);
	/*
	 * The caller may be using a different ipif than the one passed into
	 * ipsq_current_start() (e.g., suppose an ioctl that came in on the V4
	 * ill needs to wait for the V6 ill to quiesce). So we can't ASSERT
	 * that `ipx_current_ipif == ipif'.
	 */
	ASSERT(ipx->ipx_current_ipif != NULL);

	/*
	 * M_IOCDATA from ioctls, M_ERROR/M_HANGUP/M_PROTO/M_PCPROTO from the
	 * driver.
	 */
	ASSERT((DB_TYPE(add_mp) == M_IOCDATA) || (DB_TYPE(add_mp) == M_ERROR) ||
	    (DB_TYPE(add_mp) == M_HANGUP) || (DB_TYPE(add_mp) == M_PROTO) ||
	    (DB_TYPE(add_mp) == M_PCPROTO));

	if (connp != NULL) {
		ASSERT(MUTEX_HELD(&connp->conn_lock));
		/*
		 * Return error if the conn has started closing. The conn
		 * could have finished cleaning up the pending mp list.
		 * If so we should not add another mp to the list, negating
		 * the cleanup.
		 */
		if (connp->conn_state_flags & CONN_CLOSING)
			return (B_FALSE);
	}
	mutex_enter(&ipx->ipx_lock);
	ipx->ipx_pending_ipif = ipif;
	/*
	 * Note down the queue in b_queue. This will be returned by
	 * ipsq_pending_mp_get. Caller will then use these values to restart
	 * the processing.
	 */
	add_mp->b_next = NULL;
	add_mp->b_queue = q;
	ipx->ipx_pending_mp = add_mp;
	ipx->ipx_waitfor = waitfor;
	mutex_exit(&ipx->ipx_lock);

	if (connp != NULL)
		connp->conn_oper_pending_ill = ipif->ipif_ill;

	return (B_TRUE);
}
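
/*
 * Illustrative lifecycle (a sketch, not a specific ioctl): an exclusive
 * operation that must wait for a driver response does, while holding
 * ill_lock,
 *
 *	(void) ipsq_pending_mp_add(connp, ipif, q, mp, 0);
 *
 * and returns without completing the operation; a refcnt wait instead
 * passes a waitfor value such as ILL_DOWN (as in ill_down_start() below).
 * When the reply arrives, the ack path calls ipsq_pending_mp_get() to
 * retrieve the mblk and the queue saved in b_queue, and restarts the
 * processing.
 */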

/*
 * Retrieve the ipx_pending_mp and return it. There can be only 1 mp
 * queued in the list.
 */
mblk_t *
ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp)
{
	mblk_t	*curr = NULL;
	ipxop_t	*ipx = ipsq->ipsq_xop;

	*connpp = NULL;
	mutex_enter(&ipx->ipx_lock);
	if (ipx->ipx_pending_mp == NULL) {
		mutex_exit(&ipx->ipx_lock);
		return (NULL);
	}

	/* There can be only 1 such excl message */
	curr = ipx->ipx_pending_mp;
	ASSERT(curr->b_next == NULL);
	ipx->ipx_pending_ipif = NULL;
	ipx->ipx_pending_mp = NULL;
	ipx->ipx_waitfor = 0;
	mutex_exit(&ipx->ipx_lock);

	if (CONN_Q(curr->b_queue)) {
		/*
		 * This mp did a refhold on the conn, at the start of the ioctl.
		 * So we can safely return a pointer to the conn to the caller.
		 */
		*connpp = Q_TO_CONN(curr->b_queue);
	} else {
		*connpp = NULL;
	}
	curr->b_next = NULL;
	curr->b_prev = NULL;
	return (curr);
}

/*
 * Cleanup the ioctl mp queued in ipx_pending_mp
 * - Called in the ill_delete path
 * - Called in the M_ERROR or M_HANGUP path on the ill.
 * - Called in the conn close path.
 *
 * Returns success on finding the pending mblk associated with the ioctl or
 * exclusive operation in progress, failure otherwise.
 */
boolean_t
ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp)
{
	mblk_t	*mp;
	ipxop_t	*ipx;
	queue_t	*q;
	ipif_t	*ipif;
	int	cmd;

	ASSERT(IAM_WRITER_ILL(ill));
	ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop;

	mutex_enter(&ipx->ipx_lock);
	mp = ipx->ipx_pending_mp;
	if (connp != NULL) {
		if (mp == NULL || mp->b_queue != CONNP_TO_WQ(connp)) {
			/*
			 * Nothing to clean since the conn that is closing
			 * does not have a matching pending mblk in
			 * ipx_pending_mp.
			 */
			mutex_exit(&ipx->ipx_lock);
			return (B_FALSE);
		}
	} else {
		/*
		 * A non-zero ill_error signifies we are called in the
		 * M_ERROR or M_HANGUP path and we need to unconditionally
		 * abort any current ioctl and do the corresponding cleanup.
		 * A zero ill_error means we are in the ill_delete path and
		 * we do the cleanup only if there is a pending mp.
		 */
		if (mp == NULL && ill->ill_error == 0) {
			mutex_exit(&ipx->ipx_lock);
			return (B_FALSE);
		}
	}

	/* Now remove from the ipx_pending_mp */
	ipx->ipx_pending_mp = NULL;
	ipif = ipx->ipx_pending_ipif;
	ipx->ipx_pending_ipif = NULL;
	ipx->ipx_waitfor = 0;
	ipx->ipx_current_ipif = NULL;
	cmd = ipx->ipx_current_ioctl;
	ipx->ipx_current_ioctl = 0;
	ipx->ipx_current_done = B_TRUE;
	mutex_exit(&ipx->ipx_lock);

	if (mp == NULL)
		return (B_FALSE);

	q = mp->b_queue;
	mp->b_next = NULL;
	mp->b_prev = NULL;
	mp->b_queue = NULL;

	if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) {
		DTRACE_PROBE4(ipif__ioctl,
		    char *, "ipsq_pending_mp_cleanup",
		    int, cmd, ill_t *, ipif == NULL ? NULL : ipif->ipif_ill,
		    ipif_t *, ipif);
		if (connp == NULL) {
			ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL);
		} else {
			ip_ioctl_finish(q, mp, ENXIO, CONN_CLOSE, NULL);
			mutex_enter(&ipif->ipif_ill->ill_lock);
			ipif->ipif_state_flags &= ~IPIF_CHANGING;
			mutex_exit(&ipif->ipif_ill->ill_lock);
		}
	} else {
		inet_freemsg(mp);
	}
	return (B_TRUE);
}

/*
 * Called in the conn close path and ill delete path
 */
static void
ipsq_xopq_mp_cleanup(ill_t *ill, conn_t *connp)
{
	ipsq_t	*ipsq;
	mblk_t	*prev;
	mblk_t	*curr;
	mblk_t	*next;
	queue_t	*wq, *rq = NULL;
	mblk_t	*tmp_list = NULL;

	ASSERT(IAM_WRITER_ILL(ill));
	if (connp != NULL)
		wq = CONNP_TO_WQ(connp);
	else
		wq = ill->ill_wq;

	/*
	 * In the case of lo0 being unplumbed, ill_wq will be NULL. Guard
	 * against this here.
	 */
	if (wq != NULL)
		rq = RD(wq);

	ipsq = ill->ill_phyint->phyint_ipsq;
	/*
	 * Cleanup the ioctl mp's queued in ipsq_xopq_pending_mp if any.
	 * In the case of ioctl from a conn, there can be only 1 mp
	 * queued on the ipsq. If an ill is being unplumbed, flush all
	 * the messages.
	 */
	mutex_enter(&ipsq->ipsq_lock);
	for (prev = NULL, curr = ipsq->ipsq_xopq_mphead; curr != NULL;
	    curr = next) {
		next = curr->b_next;
		if (connp == NULL ||
		    (curr->b_queue == wq || curr->b_queue == rq)) {
			/* Unlink the mblk from the pending mp list */
			if (prev != NULL) {
				prev->b_next = curr->b_next;
			} else {
				ASSERT(ipsq->ipsq_xopq_mphead == curr);
				ipsq->ipsq_xopq_mphead = curr->b_next;
			}
			if (ipsq->ipsq_xopq_mptail == curr)
				ipsq->ipsq_xopq_mptail = prev;
			/*
			 * Create a temporary list and release the ipsq lock.
			 * New elements are added to the head of the tmp_list.
			 */
			curr->b_next = tmp_list;
			tmp_list = curr;
		} else {
			prev = curr;
		}
	}
	mutex_exit(&ipsq->ipsq_lock);

	while (tmp_list != NULL) {
		curr = tmp_list;
		tmp_list = curr->b_next;
		curr->b_next = NULL;
		curr->b_prev = NULL;
		wq = curr->b_queue;
		curr->b_queue = NULL;
		if (DB_TYPE(curr) == M_IOCTL || DB_TYPE(curr) == M_IOCDATA) {
			DTRACE_PROBE4(ipif__ioctl,
			    char *, "ipsq_xopq_mp_cleanup",
			    int, 0, ill_t *, NULL, ipif_t *, NULL);
			ip_ioctl_finish(wq, curr, ENXIO, connp != NULL ?
			    CONN_CLOSE : NO_COPYOUT, NULL);
		} else {
			/*
			 * IP-MT XXX In the case of TLI/XTI bind / optmgmt
			 * this can't be just inet_freemsg. We have to
			 * restart it, otherwise the thread will be stuck.
			 */
			inet_freemsg(curr);
		}
	}
}

/*
 * This conn has started closing. Cleanup any pending ioctl from this conn.
 * STREAMS ensures that there can be at most 1 active ioctl on a stream.
 */
void
conn_ioctl_cleanup(conn_t *connp)
{
	ipsq_t	*ipsq;
	ill_t	*ill;
	boolean_t refheld;

	/*
	 * Check for a queued ioctl. If the ioctl has not yet started, the mp
	 * is pending in the list headed by ipsq_xopq_head. If the ioctl has
	 * started the mp could be present in ipx_pending_mp. Note that if
	 * conn_oper_pending_ill is NULL, the ioctl may still be in flight and
	 * not yet queued anywhere. In this case, the conn close code will wait
	 * until the conn_ref is dropped. If the stream was a tcp stream, then
	 * tcp_close will wait first until all ioctls have completed for this
	 * conn.
	 */
	mutex_enter(&connp->conn_lock);
	ill = connp->conn_oper_pending_ill;
	if (ill == NULL) {
		mutex_exit(&connp->conn_lock);
		return;
	}

	/*
	 * We may not be able to refhold the ill if the ill/ipif
	 * is changing. But we need to make sure that the ill will
	 * not vanish. So we just bump up the ill_waiter count.
	 */
	refheld = ill_waiter_inc(ill);
	mutex_exit(&connp->conn_lock);
	if (refheld) {
		if (ipsq_enter(ill, B_TRUE, NEW_OP)) {
			ill_waiter_dcr(ill);
			/*
			 * Check whether this ioctl has started and is
			 * pending. If it is not found there then check
			 * whether this ioctl has not even started and is in
			 * the ipsq_xopq list.
			 */
			if (!ipsq_pending_mp_cleanup(ill, connp))
				ipsq_xopq_mp_cleanup(ill, connp);
			ipsq = ill->ill_phyint->phyint_ipsq;
			ipsq_exit(ipsq);
			return;
		}
	}

	/*
	 * The ill is also closing and we could not bump up the
	 * ill_waiter_count or we could not enter the ipsq. Leave
	 * the cleanup to ill_delete.
	 */
	mutex_enter(&connp->conn_lock);
	while (connp->conn_oper_pending_ill != NULL)
		cv_wait(&connp->conn_refcv, &connp->conn_lock);
	mutex_exit(&connp->conn_lock);
	if (refheld)
		ill_waiter_dcr(ill);
}

/*
 * ipcl_walk function for cleaning up conn_*_ill fields.
 * Note that we leave ixa_multicast_ifindex, conn_incoming_ifindex, and
 * conn_bound_if in place. We prefer dropping
 * packets instead of sending them out the wrong interface, or accepting
 * packets from the wrong ifindex.
 */
static void
conn_cleanup_ill(conn_t *connp, caddr_t arg)
{
	ill_t	*ill = (ill_t *)arg;

	mutex_enter(&connp->conn_lock);
	if (connp->conn_dhcpinit_ill == ill) {
		connp->conn_dhcpinit_ill = NULL;
		ASSERT(ill->ill_dhcpinit != 0);
		atomic_dec_32(&ill->ill_dhcpinit);
		ill_set_inputfn(ill);
	}
	mutex_exit(&connp->conn_lock);
}

static int
ill_down_ipifs_tail(ill_t *ill)
{
	ipif_t	*ipif;
	int err;

	ASSERT(IAM_WRITER_ILL(ill));
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		ipif_non_duplicate(ipif);
		/*
		 * ipif_down_tail will call arp_ll_down on the last ipif
		 * and typically return EINPROGRESS when the DL_UNBIND is sent.
		 */
		if ((err = ipif_down_tail(ipif)) != 0)
			return (err);
	}
	return (0);
}

/* ARGSUSED */
void
ipif_all_down_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
{
	ASSERT(IAM_WRITER_IPSQ(ipsq));
	(void) ill_down_ipifs_tail(q->q_ptr);
	freemsg(mp);
	ipsq_current_finish(ipsq);
}

/*
 * ill_down_start is called when we want to down this ill and bring it up
 * again. It is called when we receive an M_ERROR / M_HANGUP. In this case
 * we shut down all interfaces, but don't tear down any plumbing.
 */
boolean_t
ill_down_start(queue_t *q, mblk_t *mp)
{
	ill_t	*ill = q->q_ptr;
	ipif_t	*ipif;

	ASSERT(IAM_WRITER_ILL(ill));
	/*
	 * It is possible that some ioctl is already in progress while we
	 * received the M_ERROR / M_HANGUP in which case, we need to abort
	 * the ioctl. ill_down_start() is being processed as CUR_OP rather
	 * than as NEW_OP since the cause of the M_ERROR / M_HANGUP may prevent
	 * the in progress ioctl from ever completing.
	 *
	 * The thread that started the ioctl (if any) must have returned,
	 * since we are now executing as writer. After the 2 calls below,
	 * the state of the ipsq and the ill would reflect no trace of any
	 * pending operation. Subsequently if there is any response to the
	 * original ioctl from the driver, it would be discarded as an
	 * unsolicited message from the driver.
	 */
	(void) ipsq_pending_mp_cleanup(ill, NULL);
	ill_dlpi_clear_deferred(ill);

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		(void) ipif_down(ipif, NULL, NULL);

	ill_down(ill);

	/*
	 * Walk all CONNs that can have a reference on an ire or nce for this
	 * ill (we actually walk all that now have stale references).
	 */
	ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ill->ill_ipst);

	/* With IPv6 we have dce_ifindex. Cleanup for neatness */
	if (ill->ill_isv6)
		dce_cleanup(ill->ill_phyint->phyint_ifindex, ill->ill_ipst);

	ipsq_current_start(ill->ill_phyint->phyint_ipsq, ill->ill_ipif, 0);

	/*
	 * Atomically test and add the pending mp if references are active.
	 */
	mutex_enter(&ill->ill_lock);
	if (!ill_is_quiescent(ill)) {
		/* call cannot fail since `conn_t *' argument is NULL */
		(void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
		    mp, ILL_DOWN);
		mutex_exit(&ill->ill_lock);
		return (B_FALSE);
	}
	mutex_exit(&ill->ill_lock);
	return (B_TRUE);
}

static void
ill_down(ill_t *ill)
{
	mblk_t	*mp;
	ip_stack_t	*ipst = ill->ill_ipst;

	/*
	 * Blow off any IREs dependent on this ILL.
	 * The caller needs to handle conn_ixa_cleanup.
	 */
	ill_delete_ires(ill);

	ire_walk_ill(0, 0, ill_downi, ill, ill);

	/* Remove any conn_*_ill depending on this ill */
	ipcl_walk(conn_cleanup_ill, (caddr_t)ill, ipst);

	/*
	 * Free state for additional IREs.
	 */
	mutex_enter(&ill->ill_saved_ire_lock);
	mp = ill->ill_saved_ire_mp;
	ill->ill_saved_ire_mp = NULL;
	ill->ill_saved_ire_cnt = 0;
	mutex_exit(&ill->ill_saved_ire_lock);
	freemsg(mp);
}

/*
 * ire_walk routine used to delete every IRE that depends on
 * 'ill'. (Always called as writer, and may only be called from ire_walk.)
 *
 * Note: since the routes added by the kernel are deleted separately,
 * this will only be 1) IRE_IF_CLONE and 2) manually added IRE_INTERFACE.
 *
 * We also remove references on ire_nce_cache entries that refer to the ill.
 */
void
ill_downi(ire_t *ire, char *ill_arg)
{
	ill_t	*ill = (ill_t *)ill_arg;
	nce_t	*nce;

	mutex_enter(&ire->ire_lock);
	nce = ire->ire_nce_cache;
	if (nce != NULL && nce->nce_ill == ill)
		ire->ire_nce_cache = NULL;
	else
		nce = NULL;
	mutex_exit(&ire->ire_lock);
	if (nce != NULL)
		nce_refrele(nce);
	if (ire->ire_ill == ill) {
		/*
		 * The existing interface binding for ire must be
		 * deleted before trying to bind the route to another
		 * interface. However, since we are using the contents of the
		 * ire after ire_delete, the caller has to ensure that
		 * CONDEMNED (deleted) ire's are not removed from the list
		 * when ire_delete() returns.
		 * Currently ill_downi() is only called as part of the
		 * ire_walk*() routines, so that the irb_refhold() done by
		 * ire_walk*() will ensure that ire_delete() does not lead
		 * to ire_inactive().
		 */
		ASSERT(ire->ire_bucket->irb_refcnt > 0);
		ire_delete(ire);
		if (ire->ire_unbound)
			ire_rebind(ire);
	}
}

/* Remove IRE_IF_CLONE on this ill */
void
ill_downi_if_clone(ire_t *ire, char *ill_arg)
{
	ill_t	*ill = (ill_t *)ill_arg;

	ASSERT(ire->ire_type & IRE_IF_CLONE);
	if (ire->ire_ill == ill)
		ire_delete(ire);
}

/* Consume an M_IOCACK of the fastpath probe. */
void
ill_fastpath_ack(ill_t *ill, mblk_t *mp)
{
	mblk_t	*mp1 = mp;

	/*
	 * If this was the first attempt, turn on the fastpath probing.
	 */
	mutex_enter(&ill->ill_lock);
	if (ill->ill_dlpi_fastpath_state == IDS_INPROGRESS)
		ill->ill_dlpi_fastpath_state = IDS_OK;
	mutex_exit(&ill->ill_lock);

	/* Free the M_IOCACK mblk, hold on to the data */
	mp = mp->b_cont;
	freeb(mp1);
	if (mp == NULL)
		return;
	if (mp->b_cont != NULL)
		nce_fastpath_update(ill, mp);
	else
		ip0dbg(("ill_fastpath_ack: no b_cont\n"));
	freemsg(mp);
}

/*
 * Throw an M_IOCTL message downstream asking "do you know fastpath?"
 * The data portion of the request is a dl_unitdata_req_t template for
 * what we would send downstream in the absence of a fastpath confirmation.
 */
int
ill_fastpath_probe(ill_t *ill, mblk_t *dlur_mp)
{
	struct iocblk	*ioc;
	mblk_t	*mp;

	if (dlur_mp == NULL)
		return (EINVAL);

	mutex_enter(&ill->ill_lock);
	switch (ill->ill_dlpi_fastpath_state) {
	case IDS_FAILED:
		/*
		 * Driver NAKed the first fastpath ioctl - assume it doesn't
		 * support it.
		 */
		mutex_exit(&ill->ill_lock);
		return (ENOTSUP);
	case IDS_UNKNOWN:
		/* This is the first probe */
		ill->ill_dlpi_fastpath_state = IDS_INPROGRESS;
		break;
	default:
		break;
	}
	mutex_exit(&ill->ill_lock);

	if ((mp = mkiocb(DL_IOC_HDR_INFO)) == NULL)
		return (EAGAIN);

	mp->b_cont = copyb(dlur_mp);
	if (mp->b_cont == NULL) {
		freeb(mp);
		return (EAGAIN);
	}

	ioc = (struct iocblk *)mp->b_rptr;
	ioc->ioc_count = msgdsize(mp->b_cont);

	DTRACE_PROBE3(ill__dlpi, char *, "ill_fastpath_probe",
	    char *, "DL_IOC_HDR_INFO", ill_t *, ill);
	putnext(ill->ill_wq, mp);
	return (0);
}
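
/*
 * Illustrative exchange (a sketch of the probe/ack flow above):
 *
 *	IP -> driver:	M_IOCTL DL_IOC_HDR_INFO, b_cont = dl_unitdata_req_t
 *	driver -> IP:	M_IOCACK, with the dl_unitdata_req_t followed by the
 *			prebuilt link-layer header; ill_fastpath_ack() frees
 *			the iocblk and hands the rest to nce_fastpath_update().
 */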

void
ill_capability_probe(ill_t *ill)
{
	mblk_t	*mp;

	ASSERT(IAM_WRITER_ILL(ill));

	if (ill->ill_dlpi_capab_state != IDCS_UNKNOWN &&
	    ill->ill_dlpi_capab_state != IDCS_FAILED)
		return;

	/*
	 * We are starting a new cycle of capability negotiation.
	 * Free up the capab reset messages of any previous incarnation.
	 * We will do a fresh allocation when we get the response to our probe.
	 */
	if (ill->ill_capab_reset_mp != NULL) {
		freemsg(ill->ill_capab_reset_mp);
		ill->ill_capab_reset_mp = NULL;
	}

	ip1dbg(("ill_capability_probe: starting capability negotiation\n"));

	mp = ip_dlpi_alloc(sizeof (dl_capability_req_t), DL_CAPABILITY_REQ);
	if (mp == NULL)
		return;

	ill_capability_send(ill, mp);
	ill->ill_dlpi_capab_state = IDCS_PROBE_SENT;
}

void
ill_capability_reset(ill_t *ill, boolean_t reneg)
{
	ASSERT(IAM_WRITER_ILL(ill));

	if (ill->ill_dlpi_capab_state != IDCS_OK)
		return;

	ill->ill_dlpi_capab_state = reneg ? IDCS_RENEG : IDCS_RESET_SENT;

	ill_capability_send(ill, ill->ill_capab_reset_mp);
	ill->ill_capab_reset_mp = NULL;
	/*
	 * We turn off all capabilities except those pertaining to
	 * direct function call capabilities viz. ILL_CAPAB_DLD*
	 * which will be turned off by the corresponding reset functions.
	 */
	ill->ill_capabilities &= ~(ILL_CAPAB_HCKSUM | ILL_CAPAB_ZEROCOPY);
}

static void
ill_capability_reset_alloc(ill_t *ill)
{
	mblk_t *mp;
	size_t	size = 0;
	int	err;
	dl_capability_req_t	*capb;

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(ill->ill_capab_reset_mp == NULL);

	if (ILL_HCKSUM_CAPABLE(ill)) {
		size += sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_hcksum_t);
	}

	if (ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) {
		size += sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_zerocopy_t);
	}

	if (ill->ill_capabilities & ILL_CAPAB_DLD) {
		size += sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_dld_t);
	}

	mp = allocb_wait(size + sizeof (dl_capability_req_t), BPRI_MED,
	    STR_NOSIG, &err);

	mp->b_datap->db_type = M_PROTO;
	bzero(mp->b_rptr, size + sizeof (dl_capability_req_t));

	capb = (dl_capability_req_t *)mp->b_rptr;
	capb->dl_primitive = DL_CAPABILITY_REQ;
	capb->dl_sub_offset = sizeof (dl_capability_req_t);
	capb->dl_sub_length = size;

	mp->b_wptr += sizeof (dl_capability_req_t);

	/*
	 * Each handler fills in the corresponding dl_capability_sub_t
	 * inside the mblk.
	 */
	ill_capability_hcksum_reset_fill(ill, mp);
	ill_capability_zerocopy_reset_fill(ill, mp);
	ill_capability_dld_reset_fill(ill, mp);

	ill->ill_capab_reset_mp = mp;
}
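
/*
 * Illustratively, for an ill with all three capabilities enabled, the
 * reset message built above is laid out as
 *
 *	dl_capability_req_t
 *	dl_capability_sub_t (DL_CAPAB_HCKSUM)   + dl_capab_hcksum_t
 *	dl_capability_sub_t (DL_CAPAB_ZEROCOPY) + dl_capab_zerocopy_t
 *	dl_capability_sub_t (DL_CAPAB_DLD)      + dl_capab_dld_t
 *
 * with each *_reset_fill() routine advancing b_wptr past the
 * sub-capability it wrote.
 */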

static void
ill_capability_id_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *outers)
{
	dl_capab_id_t	*id_ic;
	uint_t		sub_dl_cap = outers->dl_cap;
	dl_capability_sub_t *inners;
	uint8_t		*capend;

	ASSERT(sub_dl_cap == DL_CAPAB_ID_WRAPPER);

	/*
	 * Note: range checks here are not absolutely sufficient to
	 * make us robust against malformed messages sent by drivers;
	 * this is in keeping with the rest of IP's dlpi handling.
	 * (Remember, it's coming from something else in the kernel
	 * address space)
	 */

	capend = (uint8_t *)(outers + 1) + outers->dl_length;
	if (capend > mp->b_wptr) {
		cmn_err(CE_WARN, "ill_capability_id_ack: "
		    "malformed sub-capability too long for mblk");
		return;
	}

	id_ic = (dl_capab_id_t *)(outers + 1);

	if (outers->dl_length < sizeof (*id_ic) ||
	    (inners = &id_ic->id_subcap,
	    inners->dl_length > (outers->dl_length - sizeof (*inners)))) {
		cmn_err(CE_WARN, "ill_capability_id_ack: malformed "
		    "encapsulated capab type %d too long for mblk",
		    inners->dl_cap);
		return;
	}

	if (!dlcapabcheckqid(&id_ic->id_mid, ill->ill_lmod_rq)) {
		ip1dbg(("ill_capability_id_ack: mid token for capab type %d "
		    "isn't as expected; pass-thru module(s) detected, "
		    "discarding capability\n", inners->dl_cap));
		return;
	}

	/* Process the encapsulated sub-capability */
	ill_capability_dispatch(ill, mp, inners);
}

static void
ill_capability_dld_reset_fill(ill_t *ill, mblk_t *mp)
{
	dl_capability_sub_t *dl_subcap;

	if (!(ill->ill_capabilities & ILL_CAPAB_DLD))
		return;

	/*
	 * The dl_capab_dld_t that follows the dl_capability_sub_t is not
	 * initialized below since it is not used by DLD.
	 */
	dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
	dl_subcap->dl_cap = DL_CAPAB_DLD;
	dl_subcap->dl_length = sizeof (dl_capab_dld_t);

	mp->b_wptr += sizeof (dl_capability_sub_t) + sizeof (dl_capab_dld_t);
}

static void
ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp)
{
	/*
	 * If no ipif was brought up over this ill, this DL_CAPABILITY_REQ/ACK
	 * is only to get the VRRP capability.
	 *
	 * Note that we cannot check ill_ipif_up_count here since
	 * ill_ipif_up_count is only incremented when the resolver is setup.
	 * That is done asynchronously, and can race with this function.
	 */
	if (!ill->ill_dl_up) {
		if (subp->dl_cap == DL_CAPAB_VRRP)
			ill_capability_vrrp_ack(ill, mp, subp);
		return;
	}

	switch (subp->dl_cap) {
	case DL_CAPAB_HCKSUM:
		ill_capability_hcksum_ack(ill, mp, subp);
		break;
	case DL_CAPAB_ZEROCOPY:
		ill_capability_zerocopy_ack(ill, mp, subp);
		break;
	case DL_CAPAB_DLD:
		ill_capability_dld_ack(ill, mp, subp);
		break;
	case DL_CAPAB_VRRP:
		break;
	default:
		ip1dbg(("ill_capability_dispatch: unknown capab type %d\n",
		    subp->dl_cap));
	}
}

/*
 * Process the vrrp capability received from a DLS Provider. isub must point
 * to the sub-capability (DL_CAPAB_VRRP) of a DL_CAPABILITY_ACK message.
 */
static void
ill_capability_vrrp_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
	dl_capab_vrrp_t	*vrrp;
	uint_t		sub_dl_cap = isub->dl_cap;
	uint8_t		*capend;

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(sub_dl_cap == DL_CAPAB_VRRP);

	/*
	 * Note: range checks here are not absolutely sufficient to
	 * make us robust against malformed messages sent by drivers;
	 * this is in keeping with the rest of IP's dlpi handling.
	 * (Remember, it's coming from something else in the kernel
	 * address space)
	 */
	capend = (uint8_t *)(isub + 1) + isub->dl_length;
	if (capend > mp->b_wptr) {
		cmn_err(CE_WARN, "ill_capability_vrrp_ack: "
		    "malformed sub-capability too long for mblk");
		return;
	}
	vrrp = (dl_capab_vrrp_t *)(isub + 1);

	/*
	 * Compare the IP address family and set ILLF_VRRP for the right ill.
	 */
	if ((vrrp->vrrp_af == AF_INET6 && ill->ill_isv6) ||
	    (vrrp->vrrp_af == AF_INET && !ill->ill_isv6)) {
		ill->ill_flags |= ILLF_VRRP;
	}
}

/*
 * Process a hardware checksum offload capability negotiation ack received
 * from a DLS Provider. isub must point to the sub-capability
 * (DL_CAPAB_HCKSUM) of a DL_CAPABILITY_ACK message.
 */
static void
ill_capability_hcksum_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
	dl_capability_req_t	*ocap;
	dl_capab_hcksum_t	*ihck, *ohck;
	ill_hcksum_capab_t	**ill_hcksum;
	mblk_t			*nmp = NULL;
	uint_t			sub_dl_cap = isub->dl_cap;
	uint8_t			*capend;

	ASSERT(sub_dl_cap == DL_CAPAB_HCKSUM);

	ill_hcksum = (ill_hcksum_capab_t **)&ill->ill_hcksum_capab;

	/*
	 * Note: range checks here are not absolutely sufficient to
	 * make us robust against malformed messages sent by drivers;
	 * this is in keeping with the rest of IP's dlpi handling.
	 * (Remember, it's coming from something else in the kernel
	 * address space)
	 */
	capend = (uint8_t *)(isub + 1) + isub->dl_length;
	if (capend > mp->b_wptr) {
		cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
		    "malformed sub-capability too long for mblk");
		return;
	}

	/*
	 * There are two types of acks we process here:
	 * 1. acks in reply to a (first form) generic capability req
	 *    (no ENABLE flag set)
	 * 2. acks in reply to an ENABLE capability req.
	 *    (ENABLE flag set)
	 */
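	/*
	 * Illustrative exchange for case 2 (assuming a driver advertising
	 * full IPv4 hardware checksum support):
	 *
	 *	IP -> driver:  DL_CAPABILITY_REQ (generic probe)
	 *	driver -> IP:  DL_CAPABILITY_ACK, hcksum_txflags =
	 *		       HCKSUM_INET_FULL_V4
	 *	IP -> driver:  DL_CAPABILITY_REQ with HCKSUM_ENABLE set
	 *		       (built below)
	 *	driver -> IP:  DL_CAPABILITY_ACK with HCKSUM_ENABLE still set,
	 *		       at which point ILL_CAPAB_HCKSUM is turned on.
	 */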
	ihck = (dl_capab_hcksum_t *)(isub + 1);

	if (ihck->hcksum_version != HCKSUM_VERSION_1) {
		cmn_err(CE_CONT, "ill_capability_hcksum_ack: "
		    "unsupported hardware checksum "
		    "sub-capability (version %d, expected %d)",
		    ihck->hcksum_version, HCKSUM_VERSION_1);
		return;
	}

	if (!dlcapabcheckqid(&ihck->hcksum_mid, ill->ill_lmod_rq)) {
		ip1dbg(("ill_capability_hcksum_ack: mid token for hardware "
		    "checksum capability isn't as expected; pass-thru "
		    "module(s) detected, discarding capability\n"));
		return;
	}

#define	CURR_HCKSUM_CAPAB				\
	(HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4 |	\
	HCKSUM_INET_FULL_V6 | HCKSUM_IPHDRCKSUM)

	if ((ihck->hcksum_txflags & HCKSUM_ENABLE) &&
	    (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB)) {
		/* do ENABLE processing */
		if (*ill_hcksum == NULL) {
			*ill_hcksum = kmem_zalloc(sizeof (ill_hcksum_capab_t),
			    KM_NOSLEEP);

			if (*ill_hcksum == NULL) {
				cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
				    "could not enable hcksum version %d "
				    "for %s (ENOMEM)\n", HCKSUM_CURRENT_VERSION,
				    ill->ill_name);
				return;
			}
		}

		(*ill_hcksum)->ill_hcksum_version = ihck->hcksum_version;
		(*ill_hcksum)->ill_hcksum_txflags = ihck->hcksum_txflags;
		ill->ill_capabilities |= ILL_CAPAB_HCKSUM;
		ip1dbg(("ill_capability_hcksum_ack: interface %s "
		    "has enabled hardware checksumming\n ",
		    ill->ill_name));
	} else if (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB) {
		/*
		 * Enabling hardware checksum offload
		 * Currently IP supports {TCP,UDP}/IPv4
		 * partial and full cksum offload and
		 * IPv4 header checksum offload.
		 * Allocate new mblk which will
		 * contain a new capability request
		 * to enable hardware checksum offload.
		 */
		uint_t	size;
		uchar_t	*rptr;

		size = sizeof (dl_capability_req_t) +
		    sizeof (dl_capability_sub_t) + isub->dl_length;

		if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
			cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
			    "could not enable hardware cksum for %s (ENOMEM)\n",
			    ill->ill_name);
			return;
		}

		rptr = nmp->b_rptr;
		/* initialize dl_capability_req_t */
		ocap = (dl_capability_req_t *)nmp->b_rptr;
		ocap->dl_sub_offset =
		    sizeof (dl_capability_req_t);
		ocap->dl_sub_length =
		    sizeof (dl_capability_sub_t) +
		    isub->dl_length;
		nmp->b_rptr += sizeof (dl_capability_req_t);

		/* initialize dl_capability_sub_t */
		bcopy(isub, nmp->b_rptr, sizeof (*isub));
		nmp->b_rptr += sizeof (*isub);

		/* initialize dl_capab_hcksum_t */
		ohck = (dl_capab_hcksum_t *)nmp->b_rptr;
		bcopy(ihck, ohck, sizeof (*ihck));

		nmp->b_rptr = rptr;
		ASSERT(nmp->b_wptr == (nmp->b_rptr + size));

		/* Set ENABLE flag */
		ohck->hcksum_txflags &= CURR_HCKSUM_CAPAB;
		ohck->hcksum_txflags |= HCKSUM_ENABLE;

		/*
		 * nmp points to a DL_CAPABILITY_REQ message to enable
		 * hardware checksum acceleration.
		 */
		ill_capability_send(ill, nmp);
	} else {
		ip1dbg(("ill_capability_hcksum_ack: interface %s has "
		    "advertised %x hardware checksum capability flags\n",
		    ill->ill_name, ihck->hcksum_txflags));
	}
}

static void
ill_capability_hcksum_reset_fill(ill_t *ill, mblk_t *mp)
{
	dl_capab_hcksum_t *hck_subcap;
	dl_capability_sub_t *dl_subcap;

	if (!ILL_HCKSUM_CAPABLE(ill))
		return;

	ASSERT(ill->ill_hcksum_capab != NULL);

	dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
	dl_subcap->dl_cap = DL_CAPAB_HCKSUM;
	dl_subcap->dl_length = sizeof (*hck_subcap);

	hck_subcap = (dl_capab_hcksum_t *)(dl_subcap + 1);
	hck_subcap->hcksum_version = ill->ill_hcksum_capab->ill_hcksum_version;
	hck_subcap->hcksum_txflags = 0;

	mp->b_wptr += sizeof (*dl_subcap) + sizeof (*hck_subcap);
}

static void
ill_capability_zerocopy_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
	mblk_t *nmp = NULL;
	dl_capability_req_t *oc;
	dl_capab_zerocopy_t *zc_ic, *zc_oc;
	ill_zerocopy_capab_t **ill_zerocopy_capab;
	uint_t sub_dl_cap = isub->dl_cap;
	uint8_t *capend;

	ASSERT(sub_dl_cap == DL_CAPAB_ZEROCOPY);

	ill_zerocopy_capab = (ill_zerocopy_capab_t **)&ill->ill_zerocopy_capab;

	/*
	 * Note: range checks here are not absolutely sufficient to
	 * make us robust against malformed messages sent by drivers;
	 * this is in keeping with the rest of IP's dlpi handling.
	 * (Remember, it's coming from something else in the kernel
	 * address space)
	 */
	capend = (uint8_t *)(isub + 1) + isub->dl_length;
	if (capend > mp->b_wptr) {
		cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
		    "malformed sub-capability too long for mblk");
		return;
	}

	zc_ic = (dl_capab_zerocopy_t *)(isub + 1);
	if (zc_ic->zerocopy_version != ZEROCOPY_VERSION_1) {
		cmn_err(CE_CONT, "ill_capability_zerocopy_ack: "
		    "unsupported ZEROCOPY sub-capability (version %d, "
		    "expected %d)", zc_ic->zerocopy_version,
		    ZEROCOPY_VERSION_1);
		return;
	}

	if (!dlcapabcheckqid(&zc_ic->zerocopy_mid, ill->ill_lmod_rq)) {
		ip1dbg(("ill_capability_zerocopy_ack: mid token for zerocopy "
		    "capability isn't as expected; pass-thru module(s) "
		    "detected, discarding capability\n"));
		return;
	}

	if ((zc_ic->zerocopy_flags & DL_CAPAB_VMSAFE_MEM) != 0) {
		if (*ill_zerocopy_capab == NULL) {
			*ill_zerocopy_capab =
			    kmem_zalloc(sizeof (ill_zerocopy_capab_t),
			    KM_NOSLEEP);

			if (*ill_zerocopy_capab == NULL) {
				cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
				    "could not enable Zero-copy version %d "
				    "for %s (ENOMEM)\n", ZEROCOPY_VERSION_1,
				    ill->ill_name);
				return;
			}
		}

		ip1dbg(("ill_capability_zerocopy_ack: interface %s "
		    "supports Zero-copy version %d\n", ill->ill_name,
		    ZEROCOPY_VERSION_1));

		(*ill_zerocopy_capab)->ill_zerocopy_version =
		    zc_ic->zerocopy_version;
		(*ill_zerocopy_capab)->ill_zerocopy_flags =
		    zc_ic->zerocopy_flags;

		ill->ill_capabilities |= ILL_CAPAB_ZEROCOPY;
	} else {
		uint_t size;
		uchar_t *rptr;

		size = sizeof (dl_capability_req_t) +
		    sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_zerocopy_t);

		if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
			cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
			    "could not enable zerocopy for %s (ENOMEM)\n",
			    ill->ill_name);
			return;
		}
not enable zerocopy for %s (ENOMEM)\n", 1833 ill->ill_name); 1834 return; 1835 } 1836 1837 rptr = nmp->b_rptr; 1838 /* initialize dl_capability_req_t */ 1839 oc = (dl_capability_req_t *)rptr; 1840 oc->dl_sub_offset = sizeof (dl_capability_req_t); 1841 oc->dl_sub_length = sizeof (dl_capability_sub_t) + 1842 sizeof (dl_capab_zerocopy_t); 1843 rptr += sizeof (dl_capability_req_t); 1844 1845 /* initialize dl_capability_sub_t */ 1846 bcopy(isub, rptr, sizeof (*isub)); 1847 rptr += sizeof (*isub); 1848 1849 /* initialize dl_capab_zerocopy_t */ 1850 zc_oc = (dl_capab_zerocopy_t *)rptr; 1851 *zc_oc = *zc_ic; 1852 1853 ip1dbg(("ill_capability_zerocopy_ack: asking interface %s " 1854 "to enable zero-copy version %d\n", ill->ill_name, 1855 ZEROCOPY_VERSION_1)); 1856 1857 /* set VMSAFE_MEM flag */ 1858 zc_oc->zerocopy_flags |= DL_CAPAB_VMSAFE_MEM; 1859 1860 /* nmp points to a DL_CAPABILITY_REQ message to enable zcopy */ 1861 ill_capability_send(ill, nmp); 1862 } 1863 } 1864 1865 static void 1866 ill_capability_zerocopy_reset_fill(ill_t *ill, mblk_t *mp) 1867 { 1868 dl_capab_zerocopy_t *zerocopy_subcap; 1869 dl_capability_sub_t *dl_subcap; 1870 1871 if (!(ill->ill_capabilities & ILL_CAPAB_ZEROCOPY)) 1872 return; 1873 1874 ASSERT(ill->ill_zerocopy_capab != NULL); 1875 1876 dl_subcap = (dl_capability_sub_t *)mp->b_wptr; 1877 dl_subcap->dl_cap = DL_CAPAB_ZEROCOPY; 1878 dl_subcap->dl_length = sizeof (*zerocopy_subcap); 1879 1880 zerocopy_subcap = (dl_capab_zerocopy_t *)(dl_subcap + 1); 1881 zerocopy_subcap->zerocopy_version = 1882 ill->ill_zerocopy_capab->ill_zerocopy_version; 1883 zerocopy_subcap->zerocopy_flags = 0; 1884 1885 mp->b_wptr += sizeof (*dl_subcap) + sizeof (*zerocopy_subcap); 1886 } 1887 1888 /* 1889 * DLD capability 1890 * Refer to dld.h for more information regarding the purpose and usage 1891 * of this capability. 1892 */ 1893 static void 1894 ill_capability_dld_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 1895 { 1896 dl_capab_dld_t *dld_ic, dld; 1897 uint_t sub_dl_cap = isub->dl_cap; 1898 uint8_t *capend; 1899 ill_dld_capab_t *idc; 1900 1901 ASSERT(IAM_WRITER_ILL(ill)); 1902 ASSERT(sub_dl_cap == DL_CAPAB_DLD); 1903 1904 /* 1905 * Note: range checks here are not absolutely sufficient to 1906 * make us robust against malformed messages sent by drivers; 1907 * this is in keeping with the rest of IP's dlpi handling. 1908 * (Remember, it's coming from something else in the kernel 1909 * address space) 1910 */ 1911 capend = (uint8_t *)(isub + 1) + isub->dl_length; 1912 if (capend > mp->b_wptr) { 1913 cmn_err(CE_WARN, "ill_capability_dld_ack: " 1914 "malformed sub-capability too long for mblk"); 1915 return; 1916 } 1917 dld_ic = (dl_capab_dld_t *)(isub + 1); 1918 if (dld_ic->dld_version != DLD_CURRENT_VERSION) { 1919 cmn_err(CE_CONT, "ill_capability_dld_ack: " 1920 "unsupported DLD sub-capability (version %d, " 1921 "expected %d)", dld_ic->dld_version, 1922 DLD_CURRENT_VERSION); 1923 return; 1924 } 1925 if (!dlcapabcheckqid(&dld_ic->dld_mid, ill->ill_lmod_rq)) { 1926 ip1dbg(("ill_capability_dld_ack: mid token for dld " 1927 "capability isn't as expected; pass-thru module(s) " 1928 "detected, discarding capability\n")); 1929 return; 1930 } 1931 1932 /* 1933 * Copy locally to ensure alignment. 
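* dld_ic points into the middle of the mblk and is only guaranteed
* byte alignment; dereferencing its fields directly could fault on
* strict-alignment platforms, so the bcopy() below first moves the
* data into the suitably aligned local 'dld' before any fields are
* read.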
1934 */
1935 bcopy(dld_ic, &dld, sizeof (dl_capab_dld_t));
1936
1937 if ((idc = ill->ill_dld_capab) == NULL) {
1938 idc = kmem_zalloc(sizeof (ill_dld_capab_t), KM_NOSLEEP);
1939 if (idc == NULL) {
1940 cmn_err(CE_WARN, "ill_capability_dld_ack: "
1941 "could not enable DLD version %d "
1942 "for %s (ENOMEM)\n", DLD_CURRENT_VERSION,
1943 ill->ill_name);
1944 return;
1945 }
1946 ill->ill_dld_capab = idc;
1947 }
1948 idc->idc_capab_df = (ip_capab_func_t)dld.dld_capab;
1949 idc->idc_capab_dh = (void *)dld.dld_capab_handle;
1950 ip1dbg(("ill_capability_dld_ack: interface %s "
1951 "supports DLD version %d\n", ill->ill_name, DLD_CURRENT_VERSION));
1952
1953 ill_capability_dld_enable(ill);
1954 }
1955
1956 /*
1957 * Typically capability negotiation between IP and the driver happens via
1958 * DLPI message exchange. However GLD also offers a direct function call
1959 * mechanism to exchange the DLD_DIRECT_CAPAB and DLD_POLL_CAPAB capabilities.
1960 * But arbitrary function calls into IP or GLD are not permitted, since both
1961 * of them are protected by their own perimeter mechanism. The perimeter can
1962 * be viewed as a coarse lock or serialization mechanism. The hierarchy of
1963 * these perimeters is IP -> MAC. Thus, for example, to enable squeue
1964 * polling, IP needs to enter its perimeter, then call ill_mac_perim_enter
1965 * to enter the mac perimeter and then do the direct function calls into
1966 * GLD to enable squeue polling. The ring-related callbacks from the mac into
1967 * the stack to add, bind, quiesce, restart or clean up a ring are all
1968 * protected by the mac perimeter.
1969 */
1970 static void
1971 ill_mac_perim_enter(ill_t *ill, mac_perim_handle_t *mphp)
1972 {
1973 ill_dld_capab_t *idc = ill->ill_dld_capab;
1974 int err;
1975
1976 err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mphp,
1977 DLD_ENABLE);
1978 ASSERT(err == 0);
1979 }
1980
1981 static void
1982 ill_mac_perim_exit(ill_t *ill, mac_perim_handle_t mph)
1983 {
1984 ill_dld_capab_t *idc = ill->ill_dld_capab;
1985 int err;
1986
1987 err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mph,
1988 DLD_DISABLE);
1989 ASSERT(err == 0);
1990 }
1991
1992 boolean_t
1993 ill_mac_perim_held(ill_t *ill)
1994 {
1995 ill_dld_capab_t *idc = ill->ill_dld_capab;
1996
1997 return (idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, NULL,
1998 DLD_QUERY));
1999 }
2000
2001 static void
2002 ill_capability_direct_enable(ill_t *ill)
2003 {
2004 ill_dld_capab_t *idc = ill->ill_dld_capab;
2005 ill_dld_direct_t *idd = &idc->idc_direct;
2006 dld_capab_direct_t direct;
2007 int rc;
2008
2009 ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill));
2010
2011 bzero(&direct, sizeof (direct));
2012 direct.di_rx_cf = (uintptr_t)ip_input;
2013 direct.di_rx_ch = ill;
2014
2015 rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT, &direct,
2016 DLD_ENABLE);
2017 if (rc == 0) {
2018 idd->idd_tx_df = (ip_dld_tx_t)direct.di_tx_df;
2019 idd->idd_tx_dh = direct.di_tx_dh;
2020 idd->idd_tx_cb_df = (ip_dld_callb_t)direct.di_tx_cb_df;
2021 idd->idd_tx_cb_dh = direct.di_tx_cb_dh;
2022 idd->idd_tx_fctl_df = (ip_dld_fctl_t)direct.di_tx_fctl_df;
2023 idd->idd_tx_fctl_dh = direct.di_tx_fctl_dh;
2024 ASSERT(idd->idd_tx_cb_df != NULL);
2025 ASSERT(idd->idd_tx_fctl_df != NULL);
2026 ASSERT(idd->idd_tx_df != NULL);
2027 /*
2028 * One-time registration of the flow enable callback function
2029 */
2030 ill->ill_flownotify_mh = idd->idd_tx_cb_df(idd->idd_tx_cb_dh,
2031 ill_flow_enable, ill);
2032 ill->ill_capabilities |= ILL_CAPAB_DLD_DIRECT;
2033
DTRACE_PROBE1(direct_on, (ill_t *), ill); 2034 } else { 2035 cmn_err(CE_WARN, "warning: could not enable DIRECT " 2036 "capability, rc = %d\n", rc); 2037 DTRACE_PROBE2(direct_off, (ill_t *), ill, (int), rc); 2038 } 2039 } 2040 2041 static void 2042 ill_capability_poll_enable(ill_t *ill) 2043 { 2044 ill_dld_capab_t *idc = ill->ill_dld_capab; 2045 dld_capab_poll_t poll; 2046 int rc; 2047 2048 ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill)); 2049 2050 bzero(&poll, sizeof (poll)); 2051 poll.poll_ring_add_cf = (uintptr_t)ip_squeue_add_ring; 2052 poll.poll_ring_remove_cf = (uintptr_t)ip_squeue_clean_ring; 2053 poll.poll_ring_quiesce_cf = (uintptr_t)ip_squeue_quiesce_ring; 2054 poll.poll_ring_restart_cf = (uintptr_t)ip_squeue_restart_ring; 2055 poll.poll_ring_bind_cf = (uintptr_t)ip_squeue_bind_ring; 2056 poll.poll_ring_ch = ill; 2057 rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL, &poll, 2058 DLD_ENABLE); 2059 if (rc == 0) { 2060 ill->ill_capabilities |= ILL_CAPAB_DLD_POLL; 2061 DTRACE_PROBE1(poll_on, (ill_t *), ill); 2062 } else { 2063 ip1dbg(("warning: could not enable POLL " 2064 "capability, rc = %d\n", rc)); 2065 DTRACE_PROBE2(poll_off, (ill_t *), ill, (int), rc); 2066 } 2067 } 2068 2069 /* 2070 * Enable the LSO capability. 2071 */ 2072 static void 2073 ill_capability_lso_enable(ill_t *ill) 2074 { 2075 ill_dld_capab_t *idc = ill->ill_dld_capab; 2076 dld_capab_lso_t lso; 2077 int rc; 2078 2079 ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill)); 2080 2081 if (ill->ill_lso_capab == NULL) { 2082 ill->ill_lso_capab = kmem_zalloc(sizeof (ill_lso_capab_t), 2083 KM_NOSLEEP); 2084 if (ill->ill_lso_capab == NULL) { 2085 cmn_err(CE_WARN, "ill_capability_lso_enable: " 2086 "could not enable LSO for %s (ENOMEM)\n", 2087 ill->ill_name); 2088 return; 2089 } 2090 } 2091 2092 bzero(&lso, sizeof (lso)); 2093 if ((rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO, &lso, 2094 DLD_ENABLE)) == 0) { 2095 ill->ill_lso_capab->ill_lso_flags = lso.lso_flags; 2096 ill->ill_lso_capab->ill_lso_max = lso.lso_max; 2097 ill->ill_capabilities |= ILL_CAPAB_LSO; 2098 ip1dbg(("ill_capability_lso_enable: interface %s " 2099 "has enabled LSO\n ", ill->ill_name)); 2100 } else { 2101 kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t)); 2102 ill->ill_lso_capab = NULL; 2103 DTRACE_PROBE2(lso_off, (ill_t *), ill, (int), rc); 2104 } 2105 } 2106 2107 static void 2108 ill_capability_dld_enable(ill_t *ill) 2109 { 2110 mac_perim_handle_t mph; 2111 2112 ASSERT(IAM_WRITER_ILL(ill)); 2113 2114 if (ill->ill_isv6) 2115 return; 2116 2117 ill_mac_perim_enter(ill, &mph); 2118 if (!ill->ill_isv6) { 2119 ill_capability_direct_enable(ill); 2120 ill_capability_poll_enable(ill); 2121 ill_capability_lso_enable(ill); 2122 } 2123 ill->ill_capabilities |= ILL_CAPAB_DLD; 2124 ill_mac_perim_exit(ill, mph); 2125 } 2126 2127 static void 2128 ill_capability_dld_disable(ill_t *ill) 2129 { 2130 ill_dld_capab_t *idc; 2131 ill_dld_direct_t *idd; 2132 mac_perim_handle_t mph; 2133 2134 ASSERT(IAM_WRITER_ILL(ill)); 2135 2136 if (!(ill->ill_capabilities & ILL_CAPAB_DLD)) 2137 return; 2138 2139 ill_mac_perim_enter(ill, &mph); 2140 2141 idc = ill->ill_dld_capab; 2142 if ((ill->ill_capabilities & ILL_CAPAB_DLD_DIRECT) != 0) { 2143 /* 2144 * For performance we avoid locks in the transmit data path 2145 * and don't maintain a count of the number of threads using 2146 * direct calls. Thus some threads could be using direct 2147 * transmit calls to GLD, even after the capability mechanism 2148 * turns it off. 
This is still safe since the handles used in
2149 * the direct calls continue to be valid until the unplumb is
2150 * completed. Remove the callback that was added (1-time) at
2151 * capab enable time.
2152 */
2153 mutex_enter(&ill->ill_lock);
2154 ill->ill_capabilities &= ~ILL_CAPAB_DLD_DIRECT;
2155 mutex_exit(&ill->ill_lock);
2156 if (ill->ill_flownotify_mh != NULL) {
2157 idd = &idc->idc_direct;
2158 idd->idd_tx_cb_df(idd->idd_tx_cb_dh, NULL,
2159 ill->ill_flownotify_mh);
2160 ill->ill_flownotify_mh = NULL;
2161 }
2162 (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT,
2163 NULL, DLD_DISABLE);
2164 }
2165
2166 if ((ill->ill_capabilities & ILL_CAPAB_DLD_POLL) != 0) {
2167 ill->ill_capabilities &= ~ILL_CAPAB_DLD_POLL;
2168 ip_squeue_clean_all(ill);
2169 (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL,
2170 NULL, DLD_DISABLE);
2171 }
2172
2173 if ((ill->ill_capabilities & ILL_CAPAB_LSO) != 0) {
2174 ASSERT(ill->ill_lso_capab != NULL);
2175 /*
2176 * Clear the capability flag for LSO but retain the
2177 * ill_lso_capab structure since it's possible that another
2178 * thread is still referring to it. The structure only gets
2179 * deallocated when we destroy the ill.
2180 */
2181
2182 ill->ill_capabilities &= ~ILL_CAPAB_LSO;
2183 (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO,
2184 NULL, DLD_DISABLE);
2185 }
2186
2187 ill->ill_capabilities &= ~ILL_CAPAB_DLD;
2188 ill_mac_perim_exit(ill, mph);
2189 }
2190
2191 /*
2192 * Capability Negotiation protocol
2193 *
2194 * We don't wait for DLPI capability operations to finish during interface
2195 * bringup or teardown. Doing so would introduce more asynchrony, and the
2196 * interface up/down operations would need multiple returns and restarts.
2197 * Instead, the 'ipsq_current_ipif' of the ipsq is not cleared as long as
2198 * the 'ill_dlpi_deferred' chain is non-empty. This ensures that the next
2199 * exclusive operation won't start until the DLPI operations of the previous
2200 * exclusive operation complete.
2201 *
2202 * The capability state machine is shown below.
2203 *
2204 * state next state event, action
2205 *
2206 * IDCS_UNKNOWN IDCS_PROBE_SENT ill_capability_probe
2207 * IDCS_PROBE_SENT IDCS_OK ill_capability_ack
2208 * IDCS_PROBE_SENT IDCS_FAILED ip_rput_dlpi_writer (nack)
2209 * IDCS_OK IDCS_RENEG Receipt of DL_NOTE_CAPAB_RENEG
2210 * IDCS_OK IDCS_RESET_SENT ill_capability_reset
2211 * IDCS_RESET_SENT IDCS_UNKNOWN ill_capability_ack_thr
2212 * IDCS_RENEG IDCS_PROBE_SENT ill_capability_ack_thr ->
2213 * ill_capability_probe.
2214 */
2215
2216 /*
2217 * Dedicated thread started from ip_stack_init that handles capability
2218 * disable. This thread ensures the taskq dispatch does not fail by waiting
2219 * for resources using TQ_SLEEP. The taskq mechanism is used to ensure
2220 * that direct calls to DLD are done in a cv_waitable context.
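* The producer side is ill_capability_ack() below: if its TQ_NOSLEEP
* taskq_dispatch() fails, it chains the mblk on ips_capab_taskq_head/
* ips_capab_taskq_tail and signals ips_capab_taskq_cv; this thread
* then re-dispatches each queued mblk with TQ_SLEEP, which may block
* but cannot fail.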
2221 */
2222 void
2223 ill_taskq_dispatch(ip_stack_t *ipst)
2224 {
2225 callb_cpr_t cprinfo;
2226 char name[64];
2227 mblk_t *mp;
2228
2229 (void) snprintf(name, sizeof (name), "ill_taskq_dispatch_%d",
2230 ipst->ips_netstack->netstack_stackid);
2231 CALLB_CPR_INIT(&cprinfo, &ipst->ips_capab_taskq_lock, callb_generic_cpr,
2232 name);
2233 mutex_enter(&ipst->ips_capab_taskq_lock);
2234
2235 for (;;) {
2236 mp = ipst->ips_capab_taskq_head;
2237 while (mp != NULL) {
2238 ipst->ips_capab_taskq_head = mp->b_next;
2239 if (ipst->ips_capab_taskq_head == NULL)
2240 ipst->ips_capab_taskq_tail = NULL;
2241 mutex_exit(&ipst->ips_capab_taskq_lock);
2242 mp->b_next = NULL;
2243
2244 VERIFY(taskq_dispatch(system_taskq,
2245 ill_capability_ack_thr, mp, TQ_SLEEP) != 0);
2246 mutex_enter(&ipst->ips_capab_taskq_lock);
2247 mp = ipst->ips_capab_taskq_head;
2248 }
2249
2250 if (ipst->ips_capab_taskq_quit)
2251 break;
2252 CALLB_CPR_SAFE_BEGIN(&cprinfo);
2253 cv_wait(&ipst->ips_capab_taskq_cv, &ipst->ips_capab_taskq_lock);
2254 CALLB_CPR_SAFE_END(&cprinfo, &ipst->ips_capab_taskq_lock);
2255 }
2256 VERIFY(ipst->ips_capab_taskq_head == NULL);
2257 VERIFY(ipst->ips_capab_taskq_tail == NULL);
2258 CALLB_CPR_EXIT(&cprinfo);
2259 thread_exit();
2260 }
2261
2262 /*
2263 * Consume a new-style hardware capabilities negotiation ack.
2264 * Called via taskq on receipt of DL_CAPABILITY_ACK.
2265 */
2266 static void
2267 ill_capability_ack_thr(void *arg)
2268 {
2269 mblk_t *mp = arg;
2270 dl_capability_ack_t *capp;
2271 dl_capability_sub_t *subp, *endp;
2272 ill_t *ill;
2273 boolean_t reneg;
2274
2275 ill = (ill_t *)mp->b_prev;
2276 mp->b_prev = NULL;
2277
2278 VERIFY(ipsq_enter(ill, B_FALSE, CUR_OP) == B_TRUE);
2279
2280 if (ill->ill_dlpi_capab_state == IDCS_RESET_SENT ||
2281 ill->ill_dlpi_capab_state == IDCS_RENEG) {
2282 /*
2283 * We have received the ack for our DL_CAPAB reset request.
2284 * There isn't anything in the message that needs processing.
2285 * All message-based capabilities have been disabled; now
2286 * do the function-call-based capability disable.
2287 */
2288 reneg = ill->ill_dlpi_capab_state == IDCS_RENEG;
2289 ill_capability_dld_disable(ill);
2290 ill->ill_dlpi_capab_state = IDCS_UNKNOWN;
2291 if (reneg)
2292 ill_capability_probe(ill);
2293 goto done;
2294 }
2295
2296 if (ill->ill_dlpi_capab_state == IDCS_PROBE_SENT)
2297 ill->ill_dlpi_capab_state = IDCS_OK;
2298
2299 capp = (dl_capability_ack_t *)mp->b_rptr;
2300
2301 if (capp->dl_sub_length == 0) {
2302 /* no new-style capabilities */
2303 goto done;
2304 }
2305
2306 /* make sure the driver supplied a correct dl_sub_length */
2307 if ((sizeof (*capp) + capp->dl_sub_length) > MBLKL(mp)) {
2308 ip0dbg(("ill_capability_ack: bad DL_CAPABILITY_ACK, "
2309 "invalid dl_sub_length (%d)\n", capp->dl_sub_length));
2310 goto done;
2311 }
2312
2313 #define SC(base, offset) (dl_capability_sub_t *)(((uchar_t *)(base))+(offset))
2314 /*
2315 * There are sub-capabilities. Process the ones we know about.
2316 * Loop until we don't have room for another sub-cap header.
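* As a sketch, the ack payload walked below is laid out as:
*
*	capp (dl_capability_ack_t)
*	capp + dl_sub_offset:  dl_capability_sub_t { dl_cap, dl_length }
*	                       <dl_length bytes of sub-capability data>
*	                       dl_capability_sub_t { ... }  and so on,
*	                       for dl_sub_length bytes in total
*
* The SC() macro simply performs this byte-offset arithmetic.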
2317 */
2318 for (subp = SC(capp, capp->dl_sub_offset),
2319 endp = SC(subp, capp->dl_sub_length - sizeof (*subp));
2320 subp <= endp;
2321 subp = SC(subp, sizeof (dl_capability_sub_t) + subp->dl_length)) {
2322
2323 switch (subp->dl_cap) {
2324 case DL_CAPAB_ID_WRAPPER:
2325 ill_capability_id_ack(ill, mp, subp);
2326 break;
2327 default:
2328 ill_capability_dispatch(ill, mp, subp);
2329 break;
2330 }
2331 }
2332 #undef SC
2333 done:
2334 inet_freemsg(mp);
2335 ill_capability_done(ill);
2336 ipsq_exit(ill->ill_phyint->phyint_ipsq);
2337 }
2338
2339 /*
2340 * This needs to be started in a taskq thread to provide a cv_waitable
2341 * context.
2342 */
2343 void
2344 ill_capability_ack(ill_t *ill, mblk_t *mp)
2345 {
2346 ip_stack_t *ipst = ill->ill_ipst;
2347
2348 mp->b_prev = (mblk_t *)ill;
2349 ASSERT(mp->b_next == NULL);
2350
2351 if (taskq_dispatch(system_taskq, ill_capability_ack_thr, mp,
2352 TQ_NOSLEEP) != 0)
2353 return;
2354
2355 /*
2356 * The taskq dispatch failed. Signal the ill_taskq_dispatch thread,
2357 * which will do the dispatch using TQ_SLEEP to guarantee success.
2358 */
2359 mutex_enter(&ipst->ips_capab_taskq_lock);
2360 if (ipst->ips_capab_taskq_head == NULL) {
2361 ASSERT(ipst->ips_capab_taskq_tail == NULL);
2362 ipst->ips_capab_taskq_head = mp;
2363 } else {
2364 ipst->ips_capab_taskq_tail->b_next = mp;
2365 }
2366 ipst->ips_capab_taskq_tail = mp;
2367
2368 cv_signal(&ipst->ips_capab_taskq_cv);
2369 mutex_exit(&ipst->ips_capab_taskq_lock);
2370 }
2371
2372 /*
2373 * This routine is called to scan the fragmentation reassembly table for
2374 * the specified ILL for any packets that are starting to smell.
2375 * dead_interval is the maximum time in seconds that will be tolerated. It
2376 * will either be the value specified in ip_g_frag_timeout, or zero if the
2377 * ILL is shutting down and it is time to blow everything off.
2378 *
2379 * It returns the number of seconds (as a time_t) that the next frag timer
2380 * should be scheduled for, 0 meaning that the timer doesn't need to be
2381 * re-started. Note that the method of calculating next_timeout isn't
2382 * entirely accurate since time will flow between the time we grab
2383 * current_time and the time we schedule the next timeout. This isn't a
2384 * big problem since this is the timer for sending ICMP reassembly time
2385 * exceeded messages, and it doesn't have to be exactly accurate.
2386 *
2387 * This function is
2388 * sometimes called as writer, although this is not required.
2389 */
2390 time_t
2391 ill_frag_timeout(ill_t *ill, time_t dead_interval)
2392 {
2393 ipfb_t *ipfb;
2394 ipfb_t *endp;
2395 ipf_t *ipf;
2396 ipf_t *ipfnext;
2397 mblk_t *mp;
2398 time_t current_time = gethrestime_sec();
2399 time_t next_timeout = 0;
2400 uint32_t hdr_length;
2401 mblk_t *send_icmp_head;
2402 mblk_t *send_icmp_head_v6;
2403 ip_stack_t *ipst = ill->ill_ipst;
2404 ip_recv_attr_t iras;
2405
2406 bzero(&iras, sizeof (iras));
2407 iras.ira_flags = 0;
2408 iras.ira_ill = iras.ira_rill = ill;
2409 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
2410 iras.ira_rifindex = iras.ira_ruifindex;
2411
2412 ipfb = ill->ill_frag_hash_tbl;
2413 if (ipfb == NULL)
2414 return (0); /* no table; the return type is time_t, not boolean_t */
2415 endp = &ipfb[ILL_FRAG_HASH_TBL_COUNT];
2416 /* Walk the frag hash table.
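* For each bucket: under ipfb_lock, unlink every packet that has been
* around longer than dead_interval and chain it on a local list; the
* ICMP time exceeded errors are generated only after the lock is
* dropped (see the comment inside the loop for why).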
*/ 2417 for (; ipfb < endp; ipfb++) { 2418 send_icmp_head = NULL; 2419 send_icmp_head_v6 = NULL; 2420 mutex_enter(&ipfb->ipfb_lock); 2421 while ((ipf = ipfb->ipfb_ipf) != 0) { 2422 time_t frag_time = current_time - ipf->ipf_timestamp; 2423 time_t frag_timeout; 2424 2425 if (frag_time < dead_interval) { 2426 /* 2427 * There are some outstanding fragments 2428 * that will timeout later. Make note of 2429 * the time so that we can reschedule the 2430 * next timeout appropriately. 2431 */ 2432 frag_timeout = dead_interval - frag_time; 2433 if (next_timeout == 0 || 2434 frag_timeout < next_timeout) { 2435 next_timeout = frag_timeout; 2436 } 2437 break; 2438 } 2439 /* Time's up. Get it out of here. */ 2440 hdr_length = ipf->ipf_nf_hdr_len; 2441 ipfnext = ipf->ipf_hash_next; 2442 if (ipfnext) 2443 ipfnext->ipf_ptphn = ipf->ipf_ptphn; 2444 *ipf->ipf_ptphn = ipfnext; 2445 mp = ipf->ipf_mp->b_cont; 2446 for (; mp; mp = mp->b_cont) { 2447 /* Extra points for neatness. */ 2448 IP_REASS_SET_START(mp, 0); 2449 IP_REASS_SET_END(mp, 0); 2450 } 2451 mp = ipf->ipf_mp->b_cont; 2452 atomic_add_32(&ill->ill_frag_count, -ipf->ipf_count); 2453 ASSERT(ipfb->ipfb_count >= ipf->ipf_count); 2454 ipfb->ipfb_count -= ipf->ipf_count; 2455 ASSERT(ipfb->ipfb_frag_pkts > 0); 2456 ipfb->ipfb_frag_pkts--; 2457 /* 2458 * We do not send any icmp message from here because 2459 * we currently are holding the ipfb_lock for this 2460 * hash chain. If we try and send any icmp messages 2461 * from here we may end up via a put back into ip 2462 * trying to get the same lock, causing a recursive 2463 * mutex panic. Instead we build a list and send all 2464 * the icmp messages after we have dropped the lock. 2465 */ 2466 if (ill->ill_isv6) { 2467 if (hdr_length != 0) { 2468 mp->b_next = send_icmp_head_v6; 2469 send_icmp_head_v6 = mp; 2470 } else { 2471 freemsg(mp); 2472 } 2473 } else { 2474 if (hdr_length != 0) { 2475 mp->b_next = send_icmp_head; 2476 send_icmp_head = mp; 2477 } else { 2478 freemsg(mp); 2479 } 2480 } 2481 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails); 2482 ip_drop_input("ipIfStatsReasmFails", ipf->ipf_mp, ill); 2483 freeb(ipf->ipf_mp); 2484 } 2485 mutex_exit(&ipfb->ipfb_lock); 2486 /* 2487 * Now need to send any icmp messages that we delayed from 2488 * above. 2489 */ 2490 while (send_icmp_head_v6 != NULL) { 2491 ip6_t *ip6h; 2492 2493 mp = send_icmp_head_v6; 2494 send_icmp_head_v6 = send_icmp_head_v6->b_next; 2495 mp->b_next = NULL; 2496 ip6h = (ip6_t *)mp->b_rptr; 2497 iras.ira_flags = 0; 2498 /* 2499 * This will result in an incorrect ALL_ZONES zoneid 2500 * for multicast packets, but we 2501 * don't send ICMP errors for those in any case. 2502 */ 2503 iras.ira_zoneid = 2504 ipif_lookup_addr_zoneid_v6(&ip6h->ip6_dst, 2505 ill, ipst); 2506 ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill); 2507 icmp_time_exceeded_v6(mp, 2508 ICMP_REASSEMBLY_TIME_EXCEEDED, B_FALSE, 2509 &iras); 2510 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); 2511 } 2512 while (send_icmp_head != NULL) { 2513 ipaddr_t dst; 2514 2515 mp = send_icmp_head; 2516 send_icmp_head = send_icmp_head->b_next; 2517 mp->b_next = NULL; 2518 2519 dst = ((ipha_t *)mp->b_rptr)->ipha_dst; 2520 2521 iras.ira_flags = IRAF_IS_IPV4; 2522 /* 2523 * This will result in an incorrect ALL_ZONES zoneid 2524 * for broadcast and multicast packets, but we 2525 * don't send ICMP errors for those in any case. 
2526 */
2527 iras.ira_zoneid = ipif_lookup_addr_zoneid(dst,
2528 ill, ipst);
2529 ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill);
2530 icmp_time_exceeded(mp,
2531 ICMP_REASSEMBLY_TIME_EXCEEDED, &iras);
2532 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
2533 }
2534 }
2535 /*
2536 * A non-dying ILL will use the return value to decide whether to
2537 * restart the frag timer, and for how long.
2538 */
2539 return (next_timeout);
2540 }
2541
2542 /*
2543 * This routine is called when the approximate count of mblk memory used
2544 * for the specified ILL has exceeded max_count.
2545 */
2546 void
2547 ill_frag_prune(ill_t *ill, uint_t max_count)
2548 {
2549 ipfb_t *ipfb;
2550 ipf_t *ipf;
2551 size_t count;
2552 clock_t now;
2553
2554 /*
2555 * If we are called again within ip_min_frag_prune_time msecs of the
2556 * last prune, increment ill_frag_free_num_pkts, the number of oldest
2557 * packets to free from each bucket below; otherwise reset it to zero.
2558 */
2559 mutex_enter(&ill->ill_lock);
2560 now = ddi_get_lbolt();
2561 if (TICK_TO_MSEC(now - ill->ill_last_frag_clean_time) <=
2562 (ip_min_frag_prune_time != 0 ?
2563 ip_min_frag_prune_time : msec_per_tick)) {
2564
2565 ill->ill_frag_free_num_pkts++;
2566
2567 } else {
2568 ill->ill_frag_free_num_pkts = 0;
2569 }
2570 ill->ill_last_frag_clean_time = now;
2571 mutex_exit(&ill->ill_lock);
2572
2573 /*
2574 * Free ill_frag_free_num_pkts oldest packets from each bucket.
2575 */
2576 if (ill->ill_frag_free_num_pkts != 0) {
2577 int ix;
2578
2579 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) {
2580 ipfb = &ill->ill_frag_hash_tbl[ix];
2581 mutex_enter(&ipfb->ipfb_lock);
2582 if (ipfb->ipfb_ipf != NULL) {
2583 ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf,
2584 ill->ill_frag_free_num_pkts);
2585 }
2586 mutex_exit(&ipfb->ipfb_lock);
2587 }
2588 }
2589 /*
2590 * While the reassembly list for this ILL is too big, prune a fragment
2591 * queue by age, oldest first.
2592 */
2593 while (ill->ill_frag_count > max_count) {
2594 int ix;
2595 ipfb_t *oipfb = NULL;
2596 uint_t oldest = UINT_MAX;
2597
2598 count = 0;
2599 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) {
2600 ipfb = &ill->ill_frag_hash_tbl[ix];
2601 mutex_enter(&ipfb->ipfb_lock);
2602 ipf = ipfb->ipfb_ipf;
2603 if (ipf != NULL && ipf->ipf_gen < oldest) {
2604 oldest = ipf->ipf_gen;
2605 oipfb = ipfb;
2606 }
2607 count += ipfb->ipfb_count;
2608 mutex_exit(&ipfb->ipfb_lock);
2609 }
2610 if (oipfb == NULL)
2611 break;
2612
2613 if (count <= max_count)
2614 return; /* Somebody beat us to it, nothing to do */
2615 mutex_enter(&oipfb->ipfb_lock);
2616 ipf = oipfb->ipfb_ipf;
2617 if (ipf != NULL) {
2618 ill_frag_free_pkts(ill, oipfb, ipf, 1);
2619 }
2620 mutex_exit(&oipfb->ipfb_lock);
2621 }
2622 }
2623
2624 /*
2625 * Free 'free_cnt' fragmented packets starting at ipf.
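* The caller must hold the bucket's ipfb_lock (asserted below).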
2626 */ 2627 void 2628 ill_frag_free_pkts(ill_t *ill, ipfb_t *ipfb, ipf_t *ipf, int free_cnt) 2629 { 2630 size_t count; 2631 mblk_t *mp; 2632 mblk_t *tmp; 2633 ipf_t **ipfp = ipf->ipf_ptphn; 2634 2635 ASSERT(MUTEX_HELD(&ipfb->ipfb_lock)); 2636 ASSERT(ipfp != NULL); 2637 ASSERT(ipf != NULL); 2638 2639 while (ipf != NULL && free_cnt-- > 0) { 2640 count = ipf->ipf_count; 2641 mp = ipf->ipf_mp; 2642 ipf = ipf->ipf_hash_next; 2643 for (tmp = mp; tmp; tmp = tmp->b_cont) { 2644 IP_REASS_SET_START(tmp, 0); 2645 IP_REASS_SET_END(tmp, 0); 2646 } 2647 atomic_add_32(&ill->ill_frag_count, -count); 2648 ASSERT(ipfb->ipfb_count >= count); 2649 ipfb->ipfb_count -= count; 2650 ASSERT(ipfb->ipfb_frag_pkts > 0); 2651 ipfb->ipfb_frag_pkts--; 2652 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails); 2653 ip_drop_input("ipIfStatsReasmFails", mp, ill); 2654 freemsg(mp); 2655 } 2656 2657 if (ipf) 2658 ipf->ipf_ptphn = ipfp; 2659 ipfp[0] = ipf; 2660 } 2661 2662 /* 2663 * Helper function for ill_forward_set(). 2664 */ 2665 static void 2666 ill_forward_set_on_ill(ill_t *ill, boolean_t enable) 2667 { 2668 ip_stack_t *ipst = ill->ill_ipst; 2669 2670 ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock)); 2671 2672 ip1dbg(("ill_forward_set: %s %s forwarding on %s", 2673 (enable ? "Enabling" : "Disabling"), 2674 (ill->ill_isv6 ? "IPv6" : "IPv4"), ill->ill_name)); 2675 mutex_enter(&ill->ill_lock); 2676 if (enable) 2677 ill->ill_flags |= ILLF_ROUTER; 2678 else 2679 ill->ill_flags &= ~ILLF_ROUTER; 2680 mutex_exit(&ill->ill_lock); 2681 if (ill->ill_isv6) 2682 ill_set_nce_router_flags(ill, enable); 2683 /* Notify routing socket listeners of this change. */ 2684 if (ill->ill_ipif != NULL) 2685 ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT); 2686 } 2687 2688 /* 2689 * Set an ill's ILLF_ROUTER flag appropriately. Send up RTS_IFINFO routing 2690 * socket messages for each interface whose flags we change. 2691 */ 2692 int 2693 ill_forward_set(ill_t *ill, boolean_t enable) 2694 { 2695 ipmp_illgrp_t *illg; 2696 ip_stack_t *ipst = ill->ill_ipst; 2697 2698 ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock)); 2699 2700 if ((enable && (ill->ill_flags & ILLF_ROUTER)) || 2701 (!enable && !(ill->ill_flags & ILLF_ROUTER))) 2702 return (0); 2703 2704 if (IS_LOOPBACK(ill)) 2705 return (EINVAL); 2706 2707 if (enable && ill->ill_allowed_ips_cnt > 0) 2708 return (EPERM); 2709 2710 if (IS_IPMP(ill) || IS_UNDER_IPMP(ill)) { 2711 /* 2712 * Update all of the interfaces in the group. 2713 */ 2714 illg = ill->ill_grp; 2715 ill = list_head(&illg->ig_if); 2716 for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) 2717 ill_forward_set_on_ill(ill, enable); 2718 2719 /* 2720 * Update the IPMP meta-interface. 2721 */ 2722 ill_forward_set_on_ill(ipmp_illgrp_ipmp_ill(illg), enable); 2723 return (0); 2724 } 2725 2726 ill_forward_set_on_ill(ill, enable); 2727 return (0); 2728 } 2729 2730 /* 2731 * Based on the ILLF_ROUTER flag of an ill, make sure all local nce's for 2732 * addresses assigned to the ill have the NCE_F_ISROUTER flag appropriately 2733 * set or clear. 2734 */ 2735 static void 2736 ill_set_nce_router_flags(ill_t *ill, boolean_t enable) 2737 { 2738 ipif_t *ipif; 2739 ncec_t *ncec; 2740 nce_t *nce; 2741 2742 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 2743 /* 2744 * NOTE: we match across the illgrp because nce's for 2745 * addresses on IPMP interfaces have an nce_ill that points to 2746 * the bound underlying ill. 
2747 */
2748 nce = nce_lookup_v6(ill, &ipif->ipif_v6lcl_addr);
2749 if (nce != NULL) {
2750 ncec = nce->nce_common;
2751 mutex_enter(&ncec->ncec_lock);
2752 if (enable)
2753 ncec->ncec_flags |= NCE_F_ISROUTER;
2754 else
2755 ncec->ncec_flags &= ~NCE_F_ISROUTER;
2756 mutex_exit(&ncec->ncec_lock);
2757 nce_refrele(nce);
2758 }
2759 }
2760 }
2761
2762 /*
2763 * Initializes the context structure and returns the first ill in the list.
2764 * Currently start_list and end_list can have the values:
2765 * MAX_G_HEADS Traverse both IPV4 and IPV6 lists.
2766 * IP_V4_G_HEAD Traverse IPV4 list only.
2767 * IP_V6_G_HEAD Traverse IPV6 list only.
2768 */
2769
2770 /*
2771 * We don't check for CONDEMNED ills here. Caller must do that if
2772 * necessary under the ill lock.
2773 */
2774 ill_t *
2775 ill_first(int start_list, int end_list, ill_walk_context_t *ctx,
2776 ip_stack_t *ipst)
2777 {
2778 ill_if_t *ifp;
2779 ill_t *ill;
2780 avl_tree_t *avl_tree;
2781
2782 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
2783 ASSERT(end_list <= MAX_G_HEADS && start_list >= 0);
2784
2785 /*
2786 * set up the lists to search
2787 */
2788 if (end_list != MAX_G_HEADS) {
2789 ctx->ctx_current_list = start_list;
2790 ctx->ctx_last_list = end_list;
2791 } else {
2792 ctx->ctx_last_list = MAX_G_HEADS - 1;
2793 ctx->ctx_current_list = 0;
2794 }
2795
2796 while (ctx->ctx_current_list <= ctx->ctx_last_list) {
2797 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst);
2798 if (ifp != (ill_if_t *)
2799 &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) {
2800 avl_tree = &ifp->illif_avl_by_ppa;
2801 ill = avl_first(avl_tree);
2802 /*
2803 * ill is guaranteed to be non-NULL, or ifp would
2804 * not have existed.
2805 */
2806 ASSERT(ill != NULL);
2807 return (ill);
2808 }
2809 ctx->ctx_current_list++;
2810 }
2811
2812 return (NULL);
2813 }
2814
2815 /*
2816 * Returns the next ill in the list. ill_first() must have been called
2817 * before calling ill_next() or bad things will happen.
2818 */
2819
2820 /*
2821 * We don't check for CONDEMNED ills here. Caller must do that if
2822 * necessary under the ill lock.
2823 */
2824 ill_t *
2825 ill_next(ill_walk_context_t *ctx, ill_t *lastill)
2826 {
2827 ill_if_t *ifp;
2828 ill_t *ill;
2829 ip_stack_t *ipst = lastill->ill_ipst;
2830
2831 ASSERT(lastill->ill_ifptr != (ill_if_t *)
2832 &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst));
2833 if ((ill = avl_walk(&lastill->ill_ifptr->illif_avl_by_ppa, lastill,
2834 AVL_AFTER)) != NULL) {
2835 return (ill);
2836 }
2837
2838 /* go to the next ill_ifp in the list. */
2839 ifp = lastill->ill_ifptr->illif_next;
2840
2841 /* make sure not at end of circular list */
2842 while (ifp ==
2843 (ill_if_t *)&IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) {
2844 if (++ctx->ctx_current_list > ctx->ctx_last_list)
2845 return (NULL);
2846 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst);
2847 }
2848
2849 return (avl_first(&ifp->illif_avl_by_ppa));
2850 }
2851
2852 /*
2853 * Check interface name for correct format: [a-zA-Z]+[a-zA-Z0-9._]*[0-9]+
2854 * The final number (PPA) must not have any leading zeros. Upon success, a
2855 * pointer to the start of the PPA is returned; otherwise NULL is returned.
2856 */
2857 static char *
2858 ill_get_ppa_ptr(char *name)
2859 {
2860 int namelen = strlen(name);
2861 int end_ndx = namelen - 1;
2862 int ppa_ndx, i;
2863
2864 /*
2865 * Check that the first character is [a-zA-Z], and that the last
2866 * character is [0-9].
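* Together with the leading-zero and intermediate-character checks
* below, this accepts names such as "bge0" or "e1000g11" and rejects,
* for example, "bge" (no trailing digit), "0bge" (leading digit) and
* "bge007" (leading zero in the PPA).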
2867 */
2868 if (namelen == 0 || !isalpha(name[0]) || !isdigit(name[end_ndx]))
2869 return (NULL);
2870
2871 /*
2872 * Set `ppa_ndx' to the PPA start, and check for leading zeroes.
2873 */
2874 for (ppa_ndx = end_ndx; ppa_ndx > 0; ppa_ndx--)
2875 if (!isdigit(name[ppa_ndx - 1]))
2876 break;
2877
2878 if (name[ppa_ndx] == '0' && ppa_ndx < end_ndx)
2879 return (NULL);
2880
2881 /*
2882 * Check that the intermediate characters are [a-zA-Z0-9._]
2883 */
2884 for (i = 1; i < ppa_ndx; i++) {
2885 if (!isalpha(name[i]) && !isdigit(name[i]) &&
2886 name[i] != '.' && name[i] != '_') {
2887 return (NULL);
2888 }
2889 }
2890
2891 return (name + ppa_ndx);
2892 }
2893
2894 /*
2895 * use avl tree to locate the ill.
2896 */
2897 static ill_t *
2898 ill_find_by_name(char *name, boolean_t isv6, ip_stack_t *ipst)
2899 {
2900 char *ppa_ptr = NULL;
2901 int len;
2902 uint_t ppa;
2903 ill_t *ill = NULL;
2904 ill_if_t *ifp;
2905 int list;
2906
2907 /*
2908 * get ppa ptr
2909 */
2910 if (isv6)
2911 list = IP_V6_G_HEAD;
2912 else
2913 list = IP_V4_G_HEAD;
2914
2915 if ((ppa_ptr = ill_get_ppa_ptr(name)) == NULL) {
2916 return (NULL);
2917 }
2918
2919 len = ppa_ptr - name + 1;
2920
2921 ppa = stoi(&ppa_ptr);
2922
2923 ifp = IP_VX_ILL_G_LIST(list, ipst);
2924
2925 while (ifp != (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) {
2926 /*
2927 * The match is done on len - 1 as the name is not null
2928 * terminated; it contains the ppa in addition to the
2929 * interface name.
2930 */
2931 if ((ifp->illif_name_len == len) &&
2932 bcmp(ifp->illif_name, name, len - 1) == 0) {
2933 break;
2934 } else {
2935 ifp = ifp->illif_next;
2936 }
2937 }
2938
2939 if (ifp == (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) {
2940 /*
2941 * The interface type itself does not exist.
2942 */
2943 return (NULL);
2944 }
2945
2946 ill = avl_find(&ifp->illif_avl_by_ppa, (void *) &ppa, NULL);
2947 if (ill != NULL) {
2948 mutex_enter(&ill->ill_lock);
2949 if (ILL_CAN_LOOKUP(ill)) {
2950 ill_refhold_locked(ill);
2951 mutex_exit(&ill->ill_lock);
2952 return (ill);
2953 }
2954 mutex_exit(&ill->ill_lock);
2955 }
2956 return (NULL);
2957 }
2958
2959 /*
2960 * comparison function for use with avl.
2961 */
2962 static int
2963 ill_compare_ppa(const void *ppa_ptr, const void *ill_ptr)
2964 {
2965 uint_t ppa;
2966 uint_t ill_ppa;
2967
2968 ASSERT(ppa_ptr != NULL && ill_ptr != NULL);
2969
2970 ppa = *((uint_t *)ppa_ptr);
2971 ill_ppa = ((const ill_t *)ill_ptr)->ill_ppa;
2972 /*
2973 * We want the ill with the lowest ppa to be on the
2974 * top.
2975 */
2976 if (ill_ppa < ppa)
2977 return (1);
2978 if (ill_ppa > ppa)
2979 return (-1);
2980 return (0);
2981 }
2982
2983 /*
2984 * remove an interface type from the global list.
2985 */
2986 static void
2987 ill_delete_interface_type(ill_if_t *interface)
2988 {
2989 ASSERT(interface != NULL);
2990 ASSERT(avl_numnodes(&interface->illif_avl_by_ppa) == 0);
2991
2992 avl_destroy(&interface->illif_avl_by_ppa);
2993 if (interface->illif_ppa_arena != NULL)
2994 vmem_destroy(interface->illif_ppa_arena);
2995
2996 remque(interface);
2997
2998 mi_free(interface);
2999 }
3000
3001 /*
3002 * remove ill from the global list.
3003 */
3004 static void
3005 ill_glist_delete(ill_t *ill)
3006 {
3007 ip_stack_t *ipst;
3008 phyint_t *phyi;
3009
3010 if (ill == NULL)
3011 return;
3012 ipst = ill->ill_ipst;
3013 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
3014
3015 /*
3016 * If the ill was never inserted into the AVL tree
3017 * we skip the if branch.
3018 */
3019 if (ill->ill_ifptr != NULL) {
3020 /*
3021 * remove from AVL tree and free ppa number
3022 */
3023 avl_remove(&ill->ill_ifptr->illif_avl_by_ppa, ill);
3024
3025 if (ill->ill_ifptr->illif_ppa_arena != NULL) {
3026 vmem_free(ill->ill_ifptr->illif_ppa_arena,
3027 (void *)(uintptr_t)(ill->ill_ppa+1), 1);
3028 }
3029 if (avl_numnodes(&ill->ill_ifptr->illif_avl_by_ppa) == 0) {
3030 ill_delete_interface_type(ill->ill_ifptr);
3031 }
3032
3033 /*
3034 * Indicate ill is no longer in the list.
3035 */
3036 ill->ill_ifptr = NULL;
3037 ill->ill_name_length = 0;
3038 ill->ill_name[0] = '\0';
3039 ill->ill_ppa = UINT_MAX;
3040 }
3041
3042 /* Generate one last event for this ill. */
3043 ill_nic_event_dispatch(ill, 0, NE_UNPLUMB, ill->ill_name,
3044 ill->ill_name_length);
3045
3046 ASSERT(ill->ill_phyint != NULL);
3047 phyi = ill->ill_phyint;
3048 ill->ill_phyint = NULL;
3049
3050 /*
3051 * ill_init always allocates a phyint to store the copy
3052 * of flags relevant to phyint. At that point in time, we could
3053 * not assign the name and hence phyint_illv4/v6 could not be
3054 * initialized. Later in ipif_set_values, we assign the name to
3055 * the ill, at which point in time we assign phyint_illv4/v6.
3056 * Thus we don't rely on phyint_illv6 to be initialized always.
3057 */
3058 if (ill->ill_flags & ILLF_IPV6)
3059 phyi->phyint_illv6 = NULL;
3060 else
3061 phyi->phyint_illv4 = NULL;
3062
3063 if (phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL) {
3064 rw_exit(&ipst->ips_ill_g_lock);
3065 return;
3066 }
3067
3068 /*
3069 * There are no ills left on this phyint; pull it out of the phyint
3070 * avl trees, and free it.
3071 */
3072 if (phyi->phyint_ifindex > 0) {
3073 avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
3074 phyi);
3075 avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
3076 phyi);
3077 }
3078 rw_exit(&ipst->ips_ill_g_lock);
3079
3080 phyint_free(phyi);
3081 }
3082
3083 /*
3084 * Allocate a ppa. If the number of plumbed interfaces of this type is
3085 * less than ill_no_arena, do a linear search to find an unused ppa.
3086 * When the number goes beyond ill_no_arena, switch to using an arena.
3087 * Note: a ppa value of zero cannot be allocated from the vmem arena as it
3088 * is the return value for an error condition, so allocation starts at one
3089 * and is decremented by one.
3090 */
3091 static int
3092 ill_alloc_ppa(ill_if_t *ifp, ill_t *ill)
3093 {
3094 ill_t *tmp_ill;
3095 uint_t start, end;
3096 int ppa;
3097
3098 if (ifp->illif_ppa_arena == NULL &&
3099 (avl_numnodes(&ifp->illif_avl_by_ppa) + 1 > ill_no_arena)) {
3100 /*
3101 * Create an arena.
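* The arena stores ppa + 1 rather than the ppa itself, since vmem
* uses 0 as its failure return and so cannot represent an allocation
* at address 0; e.g. ppa 0 is tracked as arena address 1.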
3102 */
3103 ifp->illif_ppa_arena = vmem_create(ifp->illif_name,
3104 (void *)1, UINT_MAX - 1, 1, NULL, NULL,
3105 NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
3106 /* allocate what has already been assigned */
3107 for (tmp_ill = avl_first(&ifp->illif_avl_by_ppa);
3108 tmp_ill != NULL; tmp_ill = avl_walk(&ifp->illif_avl_by_ppa,
3109 tmp_ill, AVL_AFTER)) {
3110 ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena,
3111 1, /* size */
3112 1, /* align/quantum */
3113 0, /* phase */
3114 0, /* nocross */
3115 /* minaddr */
3116 (void *)((uintptr_t)tmp_ill->ill_ppa + 1),
3117 /* maxaddr */
3118 (void *)((uintptr_t)tmp_ill->ill_ppa + 2),
3119 VM_NOSLEEP|VM_FIRSTFIT);
3120 if (ppa == 0) {
3121 ip1dbg(("ill_alloc_ppa: ppa allocation"
3122 " failed while switching"));
3123 vmem_destroy(ifp->illif_ppa_arena);
3124 ifp->illif_ppa_arena = NULL;
3125 break;
3126 }
3127 }
3128 }
3129
3130 if (ifp->illif_ppa_arena != NULL) {
3131 if (ill->ill_ppa == UINT_MAX) {
3132 ppa = (int)(uintptr_t)vmem_alloc(ifp->illif_ppa_arena,
3133 1, VM_NOSLEEP|VM_FIRSTFIT);
3134 if (ppa == 0)
3135 return (EAGAIN);
3136 ill->ill_ppa = --ppa;
3137 } else {
3138 ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena,
3139 1, /* size */
3140 1, /* align/quantum */
3141 0, /* phase */
3142 0, /* nocross */
3143 (void *)(uintptr_t)(ill->ill_ppa + 1), /* minaddr */
3144 (void *)(uintptr_t)(ill->ill_ppa + 2), /* maxaddr */
3145 VM_NOSLEEP|VM_FIRSTFIT);
3146 /*
3147 * Most likely the allocation failed because
3148 * the requested ppa was in use.
3149 */
3150 if (ppa == 0)
3151 return (EEXIST);
3152 }
3153 return (0);
3154 }
3155
3156 /*
3157 * No arena is in use and not enough (>ill_no_arena) interfaces have
3158 * been plumbed to create one. Do a linear search to get an unused ppa.
3159 */
3160 if (ill->ill_ppa == UINT_MAX) {
3161 end = UINT_MAX - 1;
3162 start = 0;
3163 } else {
3164 end = start = ill->ill_ppa;
3165 }
3166
3167 tmp_ill = avl_find(&ifp->illif_avl_by_ppa, (void *)&start, NULL);
3168 while (tmp_ill != NULL && tmp_ill->ill_ppa == start) {
3169 if (start++ >= end) {
3170 if (ill->ill_ppa == UINT_MAX)
3171 return (EAGAIN);
3172 else
3173 return (EEXIST);
3174 }
3175 tmp_ill = avl_walk(&ifp->illif_avl_by_ppa, tmp_ill, AVL_AFTER);
3176 }
3177 ill->ill_ppa = start;
3178 return (0);
3179 }
3180
3181 /*
3182 * Insert ill into the list of configured ills. Once this function completes,
3183 * the ill is globally visible and is available through lookups. More
3184 * precisely, this happens after the caller drops the ill_g_lock.
3185 */
3186 static int
3187 ill_glist_insert(ill_t *ill, char *name, boolean_t isv6)
3188 {
3189 ill_if_t *ill_interface;
3190 avl_index_t where = 0;
3191 int error;
3192 int name_length;
3193 int index;
3194 boolean_t check_length = B_FALSE;
3195 ip_stack_t *ipst = ill->ill_ipst;
3196
3197 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
3198
3199 name_length = mi_strlen(name) + 1;
3200
3201 if (isv6)
3202 index = IP_V6_G_HEAD;
3203 else
3204 index = IP_V4_G_HEAD;
3205
3206 ill_interface = IP_VX_ILL_G_LIST(index, ipst);
3207 /*
3208 * Search for interface type based on name
3209 */
3210 while (ill_interface != (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) {
3211 if ((ill_interface->illif_name_len == name_length) &&
3212 (strcmp(ill_interface->illif_name, name) == 0)) {
3213 break;
3214 }
3215 ill_interface = ill_interface->illif_next;
3216 }
3217
3218 /*
3219 * Interface type not found, create one.
3220 */
3221 if (ill_interface == (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) {
3222 ill_g_head_t ghead;
3223
3224 /*
3225 * allocate ill_if_t structure
3226 */
3227 ill_interface = (ill_if_t *)mi_zalloc(sizeof (ill_if_t));
3228 if (ill_interface == NULL) {
3229 return (ENOMEM);
3230 }
3231
3232 (void) strcpy(ill_interface->illif_name, name);
3233 ill_interface->illif_name_len = name_length;
3234
3235 avl_create(&ill_interface->illif_avl_by_ppa,
3236 ill_compare_ppa, sizeof (ill_t),
3237 offsetof(struct ill_s, ill_avl_byppa));
3238
3239 /*
3240 * link the structure in the back to maintain order
3241 * of configuration for ifconfig output.
3242 */
3243 ghead = ipst->ips_ill_g_heads[index];
3244 insque(ill_interface, ghead.ill_g_list_tail);
3245 }
3246
3247 if (ill->ill_ppa == UINT_MAX)
3248 check_length = B_TRUE;
3249
3250 error = ill_alloc_ppa(ill_interface, ill);
3251 if (error != 0) {
3252 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0)
3253 ill_delete_interface_type(ill->ill_ifptr);
3254 return (error);
3255 }
3256
3257 /*
3258 * When the ppa is chosen by the system, check that there is
3259 * enough space to insert the ppa. If a specific ppa was passed in,
3260 * this check is not required as the interface name passed in will
3261 * have the right ppa in it.
3262 */
3263 if (check_length) {
3264 /*
3265 * UINT_MAX - 1 should fit in 10 chars, alloc 12 chars.
3266 */
3267 char buf[sizeof (uint_t) * 3];
3268
3269 /*
3270 * convert ppa to string to calculate the amount of space
3271 * required for it in the name.
3272 */
3273 numtos(ill->ill_ppa, buf);
3274
3275 /* Do we have enough space to insert ppa ? */
3276
3277 if ((mi_strlen(name) + mi_strlen(buf) + 1) > LIFNAMSIZ) {
3278 /* Free ppa and interface type struct */
3279 if (ill_interface->illif_ppa_arena != NULL) {
3280 vmem_free(ill_interface->illif_ppa_arena,
3281 (void *)(uintptr_t)(ill->ill_ppa+1), 1);
3282 }
3283 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0)
3284 ill_delete_interface_type(ill->ill_ifptr);
3285
3286 return (EINVAL);
3287 }
3288 }
3289
3290 (void) sprintf(ill->ill_name, "%s%u", name, ill->ill_ppa);
3291 ill->ill_name_length = mi_strlen(ill->ill_name) + 1;
3292
3293 (void) avl_find(&ill_interface->illif_avl_by_ppa, &ill->ill_ppa,
3294 &where);
3295 ill->ill_ifptr = ill_interface;
3296 avl_insert(&ill_interface->illif_avl_by_ppa, ill, where);
3297
3298 ill_phyint_reinit(ill);
3299 return (0);
3300 }
3301
3302 /* Initialize the per phyint ipsq used for serialization */
3303 static boolean_t
3304 ipsq_init(ill_t *ill, boolean_t enter)
3305 {
3306 ipsq_t *ipsq;
3307 ipxop_t *ipx;
3308
3309 if ((ipsq = kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP)) == NULL)
3310 return (B_FALSE);
3311
3312 ill->ill_phyint->phyint_ipsq = ipsq;
3313 ipx = ipsq->ipsq_xop = &ipsq->ipsq_ownxop;
3314 ipx->ipx_ipsq = ipsq;
3315 ipsq->ipsq_next = ipsq;
3316 ipsq->ipsq_phyint = ill->ill_phyint;
3317 mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, 0);
3318 mutex_init(&ipx->ipx_lock, NULL, MUTEX_DEFAULT, 0);
3319 ipsq->ipsq_ipst = ill->ill_ipst; /* No netstack_hold */
3320 if (enter) {
3321 ipx->ipx_writer = curthread;
3322 ipx->ipx_forced = B_FALSE;
3323 ipx->ipx_reentry_cnt = 1;
3324 #ifdef DEBUG
3325 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
3326 #endif
3327 }
3328 return (B_TRUE);
3329 }
3330
3331 /*
3332 * ill_init is called by ip_open when a device control stream is opened.
3333 * It does a few initializations, and shoots a DL_INFO_REQ message down
3334 * to the driver.
The response is later picked up in ip_rput_dlpi and 3335 * used to set up default mechanisms for talking to the driver. (Always 3336 * called as writer.) 3337 * 3338 * If this function returns error, ip_open will call ip_close which in 3339 * turn will call ill_delete to clean up any memory allocated here that 3340 * is not yet freed. 3341 */ 3342 int 3343 ill_init(queue_t *q, ill_t *ill) 3344 { 3345 int count; 3346 dl_info_req_t *dlir; 3347 mblk_t *info_mp; 3348 uchar_t *frag_ptr; 3349 3350 /* 3351 * The ill is initialized to zero by mi_alloc*(). In addition 3352 * some fields already contain valid values, initialized in 3353 * ip_open(), before we reach here. 3354 */ 3355 mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, 0); 3356 mutex_init(&ill->ill_saved_ire_lock, NULL, MUTEX_DEFAULT, NULL); 3357 ill->ill_saved_ire_cnt = 0; 3358 3359 ill->ill_rq = q; 3360 ill->ill_wq = WR(q); 3361 3362 info_mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)), 3363 BPRI_HI); 3364 if (info_mp == NULL) 3365 return (ENOMEM); 3366 3367 /* 3368 * Allocate sufficient space to contain our fragment hash table and 3369 * the device name. 3370 */ 3371 frag_ptr = (uchar_t *)mi_zalloc(ILL_FRAG_HASH_TBL_SIZE + 2 * LIFNAMSIZ); 3372 if (frag_ptr == NULL) { 3373 freemsg(info_mp); 3374 return (ENOMEM); 3375 } 3376 ill->ill_frag_ptr = frag_ptr; 3377 ill->ill_frag_free_num_pkts = 0; 3378 ill->ill_last_frag_clean_time = 0; 3379 ill->ill_frag_hash_tbl = (ipfb_t *)frag_ptr; 3380 ill->ill_name = (char *)(frag_ptr + ILL_FRAG_HASH_TBL_SIZE); 3381 for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) { 3382 mutex_init(&ill->ill_frag_hash_tbl[count].ipfb_lock, 3383 NULL, MUTEX_DEFAULT, NULL); 3384 } 3385 3386 ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t)); 3387 if (ill->ill_phyint == NULL) { 3388 freemsg(info_mp); 3389 mi_free(frag_ptr); 3390 return (ENOMEM); 3391 } 3392 3393 mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0); 3394 /* 3395 * For now pretend this is a v4 ill. We need to set phyint_ill* 3396 * at this point because of the following reason. If we can't 3397 * enter the ipsq at some point and cv_wait, the writer that 3398 * wakes us up tries to locate us using the list of all phyints 3399 * in an ipsq and the ills from the phyint thru the phyint_ill*. 3400 * If we don't set it now, we risk a missed wakeup. 3401 */ 3402 ill->ill_phyint->phyint_illv4 = ill; 3403 ill->ill_ppa = UINT_MAX; 3404 list_create(&ill->ill_nce, sizeof (nce_t), offsetof(nce_t, nce_node)); 3405 3406 ill_set_inputfn(ill); 3407 3408 if (!ipsq_init(ill, B_TRUE)) { 3409 freemsg(info_mp); 3410 mi_free(frag_ptr); 3411 mi_free(ill->ill_phyint); 3412 return (ENOMEM); 3413 } 3414 3415 ill->ill_state_flags |= ILL_LL_SUBNET_PENDING; 3416 3417 /* Frag queue limit stuff */ 3418 ill->ill_frag_count = 0; 3419 ill->ill_ipf_gen = 0; 3420 3421 rw_init(&ill->ill_mcast_lock, NULL, RW_DEFAULT, NULL); 3422 mutex_init(&ill->ill_mcast_serializer, NULL, MUTEX_DEFAULT, NULL); 3423 ill->ill_global_timer = INFINITY; 3424 ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0; 3425 ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0; 3426 ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS; 3427 ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL; 3428 3429 /* 3430 * Initialize IPv6 configuration variables. The IP module is always 3431 * opened as an IPv4 module. 
Instead of tracking down the cases where
3432 * it switches to do ipv6, we'll just initialize the IPv6 configuration
3433 * here for convenience; this has no effect until the ill is set to do
3434 * IPv6.
3435 */
3436 ill->ill_reachable_time = ND_REACHABLE_TIME;
3437 ill->ill_xmit_count = ND_MAX_MULTICAST_SOLICIT;
3438 ill->ill_max_buf = ND_MAX_Q;
3439 ill->ill_refcnt = 0;
3440
3441 /* Send down the Info Request to the driver. */
3442 info_mp->b_datap->db_type = M_PCPROTO;
3443 dlir = (dl_info_req_t *)info_mp->b_rptr;
3444 info_mp->b_wptr = (uchar_t *)&dlir[1];
3445 dlir->dl_primitive = DL_INFO_REQ;
3446
3447 ill->ill_dlpi_pending = DL_PRIM_INVAL;
3448
3449 qprocson(q);
3450 ill_dlpi_send(ill, info_mp);
3451
3452 return (0);
3453 }
3454
3455 /*
3456 * ill_dls_info
3457 * creates datalink socket info from the device.
3458 */
3459 int
3460 ill_dls_info(struct sockaddr_dl *sdl, const ill_t *ill)
3461 {
3462 size_t len;
3463
3464 sdl->sdl_family = AF_LINK;
3465 sdl->sdl_index = ill_get_upper_ifindex(ill);
3466 sdl->sdl_type = ill->ill_type;
3467 ill_get_name(ill, sdl->sdl_data, sizeof (sdl->sdl_data));
3468 len = strlen(sdl->sdl_data);
3469 ASSERT(len < 256);
3470 sdl->sdl_nlen = (uchar_t)len;
3471 sdl->sdl_alen = ill->ill_phys_addr_length;
3472 sdl->sdl_slen = 0;
3473 if (ill->ill_phys_addr_length != 0 && ill->ill_phys_addr != NULL)
3474 bcopy(ill->ill_phys_addr, &sdl->sdl_data[len], sdl->sdl_alen);
3475
3476 return (sizeof (struct sockaddr_dl));
3477 }
3478
3479 /*
3480 * ill_xarp_info
3481 * creates xarp info from the device.
3482 */
3483 static int
3484 ill_xarp_info(struct sockaddr_dl *sdl, ill_t *ill)
3485 {
3486 sdl->sdl_family = AF_LINK;
3487 sdl->sdl_index = ill->ill_phyint->phyint_ifindex;
3488 sdl->sdl_type = ill->ill_type;
3489 ill_get_name(ill, sdl->sdl_data, sizeof (sdl->sdl_data));
3490 sdl->sdl_nlen = (uchar_t)mi_strlen(sdl->sdl_data);
3491 sdl->sdl_alen = ill->ill_phys_addr_length;
3492 sdl->sdl_slen = 0;
3493 return (sdl->sdl_nlen);
3494 }
3495
3496 static int
3497 loopback_kstat_update(kstat_t *ksp, int rw)
3498 {
3499 kstat_named_t *kn;
3500 netstackid_t stackid;
3501 netstack_t *ns;
3502 ip_stack_t *ipst;
3503
3504 if (ksp == NULL || ksp->ks_data == NULL)
3505 return (EIO);
3506
3507 if (rw == KSTAT_WRITE)
3508 return (EACCES);
3509
3510 kn = KSTAT_NAMED_PTR(ksp);
3511 stackid = (zoneid_t)(uintptr_t)ksp->ks_private;
3512
3513 ns = netstack_find_by_stackid(stackid);
3514 if (ns == NULL)
3515 return (-1);
3516
3517 ipst = ns->netstack_ip;
3518 if (ipst == NULL) {
3519 netstack_rele(ns);
3520 return (-1);
3521 }
3522 kn[0].value.ui32 = ipst->ips_loopback_packets;
3523 kn[1].value.ui32 = ipst->ips_loopback_packets;
3524 netstack_rele(ns);
3525 return (0);
3526 }
3527
3528 /*
3529 * Has ifindex been plumbed already?
3530 */
3531 static boolean_t
3532 phyint_exists(uint_t index, ip_stack_t *ipst)
3533 {
3534 ASSERT(index != 0);
3535 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
3536
3537 return (avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
3538 &index, NULL) != NULL);
3539 }
3540
3541 /*
3542 * Pick a unique ifindex.
3543 * When the index counter passes IF_INDEX_MAX for the first time, the wrap
3544 * flag is set so that the next time ip_assign_ifindex() is called, it
3545 * falls through and resets the index counter back to 1, the minimum value
3546 * for the interface index. The logic below assumes that ips_ill_index
3547 * can hold a value of IF_INDEX_MAX+1 without there being any loss
3548 * (i.e. reset back to 0.)
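* As a worked example (with a hypothetically tiny IF_INDEX_MAX of 3
* for illustration): indexes are handed out 1, 2, 3; the call that
* assigns 3 leaves ips_ill_index at 4 (IF_INDEX_MAX + 1) and sets the
* wrap flag; the next call resets the counter to 1 and probes 1, 2, 3
* with phyint_exists() until it finds an unused index, or fails if
* all are in use.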
3549 */
3550 boolean_t
3551 ip_assign_ifindex(uint_t *indexp, ip_stack_t *ipst)
3552 {
3553 uint_t loops;
3554
3555 if (!ipst->ips_ill_index_wrap) {
3556 *indexp = ipst->ips_ill_index++;
3557 if (ipst->ips_ill_index > IF_INDEX_MAX) {
3558 /*
3559 * Reached the maximum ifindex value, set the wrap
3560 * flag to indicate that it is no longer possible
3561 * to assume that a given index is unallocated.
3562 */
3563 ipst->ips_ill_index_wrap = B_TRUE;
3564 }
3565 return (B_TRUE);
3566 }
3567
3568 if (ipst->ips_ill_index > IF_INDEX_MAX)
3569 ipst->ips_ill_index = 1;
3570
3571 /*
3572 * Start reusing unused indexes. Note that we hold the ill_g_lock
3573 * at this point and don't want to call any function that attempts
3574 * to get the lock again.
3575 */
3576 for (loops = IF_INDEX_MAX; loops > 0; loops--) {
3577 if (!phyint_exists(ipst->ips_ill_index, ipst)) {
3578 /* found unused index - use it */
3579 *indexp = ipst->ips_ill_index;
3580 return (B_TRUE);
3581 }
3582
3583 ipst->ips_ill_index++;
3584 if (ipst->ips_ill_index > IF_INDEX_MAX)
3585 ipst->ips_ill_index = 1;
3586 }
3587
3588 /*
3589 * All interface indices are in use.
3590 */
3591 return (B_FALSE);
3592 }
3593
3594 /*
3595 * Assign a unique interface index for the phyint.
3596 */
3597 static boolean_t
3598 phyint_assign_ifindex(phyint_t *phyi, ip_stack_t *ipst)
3599 {
3600 ASSERT(phyi->phyint_ifindex == 0);
3601 return (ip_assign_ifindex(&phyi->phyint_ifindex, ipst));
3602 }
3603
3604 /*
3605 * Initialize the flags on `phyi' as per the provided mactype.
3606 */
3607 static void
3608 phyint_flags_init(phyint_t *phyi, t_uscalar_t mactype)
3609 {
3610 uint64_t flags = 0;
3611
3612 /*
3613 * Initialize PHYI_RUNNING and PHYI_FAILED. For non-IPMP interfaces,
3614 * we always presume the underlying hardware is working and set
3615 * PHYI_RUNNING (if it's not, the driver will subsequently send a
3616 * DL_NOTE_LINK_DOWN message). For IPMP interfaces, at initialization
3617 * there are no active interfaces in the group so we set PHYI_FAILED.
3618 */
3619 if (mactype == SUNW_DL_IPMP)
3620 flags |= PHYI_FAILED;
3621 else
3622 flags |= PHYI_RUNNING;
3623
3624 switch (mactype) {
3625 case SUNW_DL_VNI:
3626 flags |= PHYI_VIRTUAL;
3627 break;
3628 case SUNW_DL_IPMP:
3629 flags |= PHYI_IPMP;
3630 break;
3631 case DL_LOOP:
3632 flags |= (PHYI_LOOPBACK | PHYI_VIRTUAL);
3633 break;
3634 }
3635
3636 mutex_enter(&phyi->phyint_lock);
3637 phyi->phyint_flags |= flags;
3638 mutex_exit(&phyi->phyint_lock);
3639 }
3640
3641 /*
3642 * Return a pointer to the ill which matches the supplied name. Note that
3643 * the ill name length includes the null termination character. (May be
3644 * called as writer.)
3645 * If do_alloc and the interface is "lo0" it will be automatically created.
3646 * We cannot bump up the reference count on condemned ills, so duplicate
3647 * detection can't be done using this function.
3648 */
3649 ill_t *
3650 ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6,
3651 boolean_t *did_alloc, ip_stack_t *ipst)
3652 {
3653 ill_t *ill;
3654 ipif_t *ipif;
3655 ipsq_t *ipsq;
3656 kstat_named_t *kn;
3657 boolean_t isloopback;
3658 in6_addr_t ov6addr;
3659
3660 isloopback = mi_strcmp(name, ipif_loopback_name) == 0;
3661
3662 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
3663 ill = ill_find_by_name(name, isv6, ipst);
3664 rw_exit(&ipst->ips_ill_g_lock);
3665 if (ill != NULL)
3666 return (ill);
3667
3668 /*
3669 * Couldn't find it. Does this happen to be a lookup for the
3670 * loopback device and are we allowed to allocate it?
3671 */
3672 if (!isloopback || !do_alloc)
3673 return (NULL);
3674
3675 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
3676 ill = ill_find_by_name(name, isv6, ipst);
3677 if (ill != NULL) {
3678 rw_exit(&ipst->ips_ill_g_lock);
3679 return (ill);
3680 }
3681
3682 /* Create the loopback device on demand */
3683 ill = (ill_t *)(mi_alloc(sizeof (ill_t) +
3684 sizeof (ipif_loopback_name), BPRI_MED));
3685 if (ill == NULL)
3686 goto done;
3687
3688 *ill = ill_null;
3689 mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, NULL);
3690 ill->ill_ipst = ipst;
3691 list_create(&ill->ill_nce, sizeof (nce_t), offsetof(nce_t, nce_node));
3692 netstack_hold(ipst->ips_netstack);
3693 /*
3694 * For exclusive stacks we set the zoneid to zero
3695 * to make IP operate as if in the global zone.
3696 */
3697 ill->ill_zoneid = GLOBAL_ZONEID;
3698
3699 ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t));
3700 if (ill->ill_phyint == NULL)
3701 goto done;
3702
3703 if (isv6)
3704 ill->ill_phyint->phyint_illv6 = ill;
3705 else
3706 ill->ill_phyint->phyint_illv4 = ill;
3707 mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0);
3708 phyint_flags_init(ill->ill_phyint, DL_LOOP);
3709
3710 if (isv6) {
3711 ill->ill_isv6 = B_TRUE;
3712 ill->ill_max_frag = ip_loopback_mtu_v6plus;
3713 } else {
3714 ill->ill_max_frag = ip_loopback_mtuplus;
3715 }
3716 if (!ill_allocate_mibs(ill))
3717 goto done;
3718 ill->ill_current_frag = ill->ill_max_frag;
3719 ill->ill_mtu = ill->ill_max_frag; /* Initial value */
3720 ill->ill_mc_mtu = ill->ill_mtu;
3721 /*
3722 * ipif_loopback_name can't be pointed at directly because it's used
3723 * by both the ipv4 and ipv6 interfaces. When the ill is removed
3724 * from the glist, ill_glist_delete() sets the first character of
3725 * ill_name to '\0'.
3726 */
3727 ill->ill_name = (char *)ill + sizeof (*ill);
3728 (void) strcpy(ill->ill_name, ipif_loopback_name);
3729 ill->ill_name_length = sizeof (ipif_loopback_name);
3730 /* Set ill_dlpi_pending for ipsq_current_finish() to work properly */
3731 ill->ill_dlpi_pending = DL_PRIM_INVAL;
3732
3733 rw_init(&ill->ill_mcast_lock, NULL, RW_DEFAULT, NULL);
3734 mutex_init(&ill->ill_mcast_serializer, NULL, MUTEX_DEFAULT, NULL);
3735 ill->ill_global_timer = INFINITY;
3736 ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0;
3737 ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0;
3738 ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
3739 ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL;
3740
3741 /* No resolver here. */
3742 ill->ill_net_type = IRE_LOOPBACK;
3743
3744 /* Initialize the ipsq */
3745 if (!ipsq_init(ill, B_FALSE))
3746 goto done;
3747
3748 ipif = ipif_allocate(ill, 0L, IRE_LOOPBACK, B_TRUE, B_TRUE, NULL);
3749 if (ipif == NULL)
3750 goto done;
3751
3752 ill->ill_flags = ILLF_MULTICAST;
3753
3754 ov6addr = ipif->ipif_v6lcl_addr;
3755 /* Set up default loopback address and mask. */
3756 if (!isv6) {
3757 ipaddr_t inaddr_loopback = htonl(INADDR_LOOPBACK);
3758
3759 IN6_IPADDR_TO_V4MAPPED(inaddr_loopback, &ipif->ipif_v6lcl_addr);
3760 V4MASK_TO_V6(htonl(IN_CLASSA_NET), ipif->ipif_v6net_mask);
3761 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
3762 ipif->ipif_v6subnet);
3763 ill->ill_flags |= ILLF_IPV4;
3764 } else {
3765 ipif->ipif_v6lcl_addr = ipv6_loopback;
3766 ipif->ipif_v6net_mask = ipv6_all_ones;
3767 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
3768 ipif->ipif_v6subnet);
3769 ill->ill_flags |= ILLF_IPV6;
3770 }
3771
3772 /*
3773 * Chain us in at the end of the ill list.
Hold the ill 3774 * before we make it globally visible. 1 for the lookup. 3775 */ 3776 ill->ill_refcnt = 0; 3777 ill_refhold(ill); 3778 3779 ill->ill_frag_count = 0; 3780 ill->ill_frag_free_num_pkts = 0; 3781 ill->ill_last_frag_clean_time = 0; 3782 3783 ipsq = ill->ill_phyint->phyint_ipsq; 3784 3785 ill_set_inputfn(ill); 3786 3787 if (ill_glist_insert(ill, "lo", isv6) != 0) 3788 cmn_err(CE_PANIC, "cannot insert loopback interface"); 3789 3790 /* Let SCTP know so that it can add this to its list */ 3791 sctp_update_ill(ill, SCTP_ILL_INSERT); 3792 3793 /* 3794 * We have already assigned ipif_v6lcl_addr above, but we need to 3795 * call sctp_update_ipif_addr() after SCTP_ILL_INSERT, which 3796 * must happen after ill_glist_insert() since we need the 3797 * ill_index set. Pass on ipv6_loopback as the old address. 3798 */ 3799 sctp_update_ipif_addr(ipif, ov6addr); 3800 3801 ip_rts_newaddrmsg(RTM_CHGADDR, 0, ipif, RTSQ_DEFAULT); 3802 3803 /* 3804 * ill_glist_insert() -> ill_phyint_reinit() may have merged IPSQs. 3805 * If so, free our original one. 3806 */ 3807 if (ipsq != ill->ill_phyint->phyint_ipsq) 3808 ipsq_delete(ipsq); 3809 3810 if (ipst->ips_loopback_ksp == NULL) { 3811 /* Export loopback interface statistics */ 3812 ipst->ips_loopback_ksp = kstat_create_netstack("lo", 0, 3813 ipif_loopback_name, "net", 3814 KSTAT_TYPE_NAMED, 2, 0, 3815 ipst->ips_netstack->netstack_stackid); 3816 if (ipst->ips_loopback_ksp != NULL) { 3817 ipst->ips_loopback_ksp->ks_update = 3818 loopback_kstat_update; 3819 kn = KSTAT_NAMED_PTR(ipst->ips_loopback_ksp); 3820 kstat_named_init(&kn[0], "ipackets", KSTAT_DATA_UINT32); 3821 kstat_named_init(&kn[1], "opackets", KSTAT_DATA_UINT32); 3822 ipst->ips_loopback_ksp->ks_private = 3823 (void *)(uintptr_t)ipst->ips_netstack-> 3824 netstack_stackid; 3825 kstat_install(ipst->ips_loopback_ksp); 3826 } 3827 } 3828 3829 *did_alloc = B_TRUE; 3830 rw_exit(&ipst->ips_ill_g_lock); 3831 ill_nic_event_dispatch(ill, MAP_IPIF_ID(ill->ill_ipif->ipif_id), 3832 NE_PLUMB, ill->ill_name, ill->ill_name_length); 3833 return (ill); 3834 done: 3835 if (ill != NULL) { 3836 if (ill->ill_phyint != NULL) { 3837 ipsq = ill->ill_phyint->phyint_ipsq; 3838 if (ipsq != NULL) { 3839 ipsq->ipsq_phyint = NULL; 3840 ipsq_delete(ipsq); 3841 } 3842 mi_free(ill->ill_phyint); 3843 } 3844 ill_free_mib(ill); 3845 if (ill->ill_ipst != NULL) 3846 netstack_rele(ill->ill_ipst->ips_netstack); 3847 mi_free(ill); 3848 } 3849 rw_exit(&ipst->ips_ill_g_lock); 3850 return (NULL); 3851 } 3852 3853 /* 3854 * For IPP calls - use the ip_stack_t for global stack. 3855 */ 3856 ill_t * 3857 ill_lookup_on_ifindex_global_instance(uint_t index, boolean_t isv6) 3858 { 3859 ip_stack_t *ipst; 3860 ill_t *ill; 3861 3862 ipst = netstack_find_by_stackid(GLOBAL_NETSTACKID)->netstack_ip; 3863 if (ipst == NULL) { 3864 cmn_err(CE_WARN, "No ip_stack_t for zoneid zero!\n"); 3865 return (NULL); 3866 } 3867 3868 ill = ill_lookup_on_ifindex(index, isv6, ipst); 3869 netstack_rele(ipst->ips_netstack); 3870 return (ill); 3871 } 3872 3873 /* 3874 * Return a pointer to the ill which matches the index and IP version type. 3875 */ 3876 ill_t * 3877 ill_lookup_on_ifindex(uint_t index, boolean_t isv6, ip_stack_t *ipst) 3878 { 3879 ill_t *ill; 3880 phyint_t *phyi; 3881 3882 /* 3883 * Indexes are stored in the phyint - a common structure 3884 * to both IPv4 and IPv6.
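* A minimal usage sketch (the returned ill, if non-NULL, has been refheld and must be released with ill_refrele() when the caller is done): ill = ill_lookup_on_ifindex(ifindex, B_FALSE, ipst); if (ill != NULL) { ... use the ill ... ill_refrele(ill); }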
*/ 3886 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 3887 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 3888 (void *) &index, NULL); 3889 if (phyi != NULL) { 3890 ill = isv6 ? phyi->phyint_illv6 : phyi->phyint_illv4; 3891 if (ill != NULL) { 3892 mutex_enter(&ill->ill_lock); 3893 if (!ILL_IS_CONDEMNED(ill)) { 3894 ill_refhold_locked(ill); 3895 mutex_exit(&ill->ill_lock); 3896 rw_exit(&ipst->ips_ill_g_lock); 3897 return (ill); 3898 } 3899 mutex_exit(&ill->ill_lock); 3900 } 3901 } 3902 rw_exit(&ipst->ips_ill_g_lock); 3903 return (NULL); 3904 } 3905 3906 /* 3907 * Verify whether or not an interface index is valid for the specified zoneid 3908 * to transmit packets. 3909 * It can be zero (meaning "reset") or an interface index assigned 3910 * to a non-VNI interface. (We don't use VNI interfaces to send packets.) 3911 */ 3912 boolean_t 3913 ip_xmit_ifindex_valid(uint_t ifindex, zoneid_t zoneid, boolean_t isv6, 3914 ip_stack_t *ipst) 3915 { 3916 ill_t *ill; 3917 3918 if (ifindex == 0) 3919 return (B_TRUE); 3920 3921 ill = ill_lookup_on_ifindex_zoneid(ifindex, zoneid, isv6, ipst); 3922 if (ill == NULL) 3923 return (B_FALSE); 3924 if (IS_VNI(ill)) { 3925 ill_refrele(ill); 3926 return (B_FALSE); 3927 } 3928 ill_refrele(ill); 3929 return (B_TRUE); 3930 } 3931 3932 /* 3933 * Return the ifindex next in sequence after the passed in ifindex. 3934 * If there is no next ifindex for the given protocol, return 0. 3935 */ 3936 uint_t 3937 ill_get_next_ifindex(uint_t index, boolean_t isv6, ip_stack_t *ipst) 3938 { 3939 phyint_t *phyi; 3940 phyint_t *phyi_initial; 3941 uint_t ifindex; 3942 3943 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 3944 3945 if (index == 0) { 3946 phyi = avl_first( 3947 &ipst->ips_phyint_g_list->phyint_list_avl_by_index); 3948 } else { 3949 phyi = phyi_initial = avl_find( 3950 &ipst->ips_phyint_g_list->phyint_list_avl_by_index, 3951 (void *) &index, NULL); 3952 } 3953 3954 for (; phyi != NULL; 3955 phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 3956 phyi, AVL_AFTER)) { 3957 /* 3958 * If we're not returning the first interface in the tree 3959 * and we still haven't moved past the phyint_t that 3960 * corresponds to index, avl_walk needs to be called again 3961 */ 3962 if (!((index != 0) && (phyi == phyi_initial))) { 3963 if (isv6) { 3964 if ((phyi->phyint_illv6) && 3965 ILL_CAN_LOOKUP(phyi->phyint_illv6) && 3966 (phyi->phyint_illv6->ill_isv6 == 1)) 3967 break; 3968 } else { 3969 if ((phyi->phyint_illv4) && 3970 ILL_CAN_LOOKUP(phyi->phyint_illv4) && 3971 (phyi->phyint_illv4->ill_isv6 == 0)) 3972 break; 3973 } 3974 } 3975 } 3976 3977 rw_exit(&ipst->ips_ill_g_lock); 3978 3979 if (phyi != NULL) 3980 ifindex = phyi->phyint_ifindex; 3981 else 3982 ifindex = 0; 3983 3984 return (ifindex); 3985 } 3986 3987 /* 3988 * Return the ifindex for the named interface. 3989 * If no such interface exists, return 0. 3990 */ 3991 uint_t 3992 ill_get_ifindex_by_name(char *name, ip_stack_t *ipst) 3993 { 3994 phyint_t *phyi; 3995 avl_index_t where = 0; 3996 uint_t ifindex; 3997 3998 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 3999 4000 if ((phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 4001 name, &where)) == NULL) { 4002 rw_exit(&ipst->ips_ill_g_lock); 4003 return (0); 4004 } 4005 4006 ifindex = phyi->phyint_ifindex; 4007 4008 rw_exit(&ipst->ips_ill_g_lock); 4009 4010 return (ifindex); 4011 } 4012 4013 /* 4014 * Return the ifindex to be used by upper layer protocols, for instance 4015 * for IPV6_RECVPKTINFO.
If the ill is under IPMP, this is the ifindex of the upper (IPMP) ill. 4016 */ 4017 uint_t 4018 ill_get_upper_ifindex(const ill_t *ill) 4019 { 4020 if (IS_UNDER_IPMP(ill)) 4021 return (ipmp_ill_get_ipmp_ifindex(ill)); 4022 else 4023 return (ill->ill_phyint->phyint_ifindex); 4024 } 4025 4026 4027 /* 4028 * Obtain a reference to the ill. The ill_refcnt is a dynamic refcnt 4029 * that gives a running thread a reference to the ill. This reference must be 4030 * released by the thread when it is done accessing the ill and related 4031 * objects. ill_refcnt cannot be used to account for static references 4032 * such as other structures pointing to an ill. Callers must generally 4033 * check whether an ill can be refheld by using ILL_CAN_LOOKUP macros 4034 * or be sure that the ill is not being deleted or changing state before 4035 * calling the refhold functions. A non-zero ill_refcnt ensures that the 4036 * ill won't change any of its critical state such as address, netmask etc. 4037 */ 4038 void 4039 ill_refhold(ill_t *ill) 4040 { 4041 mutex_enter(&ill->ill_lock); 4042 ill->ill_refcnt++; 4043 ILL_TRACE_REF(ill); 4044 mutex_exit(&ill->ill_lock); 4045 } 4046 4047 void 4048 ill_refhold_locked(ill_t *ill) 4049 { 4050 ASSERT(MUTEX_HELD(&ill->ill_lock)); 4051 ill->ill_refcnt++; 4052 ILL_TRACE_REF(ill); 4053 } 4054 4055 /* Returns true if we managed to get a refhold */ 4056 boolean_t 4057 ill_check_and_refhold(ill_t *ill) 4058 { 4059 mutex_enter(&ill->ill_lock); 4060 if (!ILL_IS_CONDEMNED(ill)) { 4061 ill_refhold_locked(ill); 4062 mutex_exit(&ill->ill_lock); 4063 return (B_TRUE); 4064 } 4065 mutex_exit(&ill->ill_lock); 4066 return (B_FALSE); 4067 } 4068 4069 /* 4070 * Must not be called while holding any locks. Otherwise if this is 4071 * the last reference to be released, there is a chance of recursive mutex 4072 * panic due to ill_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying 4073 * to restart an ioctl. 4074 */ 4075 void 4076 ill_refrele(ill_t *ill) 4077 { 4078 mutex_enter(&ill->ill_lock); 4079 ASSERT(ill->ill_refcnt != 0); 4080 ill->ill_refcnt--; 4081 ILL_UNTRACE_REF(ill); 4082 if (ill->ill_refcnt != 0) { 4083 /* Every ire pointing to the ill adds 1 to ill_refcnt */ 4084 mutex_exit(&ill->ill_lock); 4085 return; 4086 } 4087 4088 /* Drops the ill_lock */ 4089 ipif_ill_refrele_tail(ill); 4090 } 4091 4092 /* 4093 * Obtain a weak reference count on the ill. This reference ensures the 4094 * ill won't be freed, but the ill may change any of its critical state 4095 * such as netmask, address etc. Returns B_FALSE if the ill has started 4096 * closing. 4097 */ 4098 boolean_t 4099 ill_waiter_inc(ill_t *ill) 4100 { 4101 mutex_enter(&ill->ill_lock); 4102 if (ill->ill_state_flags & ILL_CONDEMNED) { 4103 mutex_exit(&ill->ill_lock); 4104 return (B_FALSE); 4105 } 4106 ill->ill_waiters++; 4107 mutex_exit(&ill->ill_lock); 4108 return (B_TRUE); 4109 } 4110 4111 void 4112 ill_waiter_dcr(ill_t *ill) 4113 { 4114 mutex_enter(&ill->ill_lock); 4115 ill->ill_waiters--; 4116 if (ill->ill_waiters == 0) 4117 cv_broadcast(&ill->ill_cv); 4118 mutex_exit(&ill->ill_lock); 4119 } 4120 4121 /* 4122 * ip_ll_subnet_defaults is called when we get the DL_INFO_ACK back from the 4123 * driver. We construct best guess defaults for lower level information that 4124 * we need. If an interface is brought up without injection of any overriding 4125 * information from outside, we have to be ready to go with these defaults. 4126 * When we get the first DL_INFO_ACK (from ip_open() sending a DL_INFO_REQ) 4127 * we primarily want the dl_provider_style.
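* (In outline, the exchange as described here and in ipif_set_values() is: ip_open() sends DL_INFO_REQ and the first DL_INFO_ACK supplies dl_provider_style; after DL_ATTACH/DL_BIND, ipif_set_values() sends another DL_INFO_REQ and the second DL_INFO_ACK supplies the link-layer details handled below.)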
4128 * The subsequent DL_INFO_ACK is received after doing a DL_ATTACH and DL_BIND 4129 * at which point we assume the other part of the information is valid. 4130 */ 4131 void 4132 ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp) 4133 { 4134 uchar_t *brdcst_addr; 4135 uint_t brdcst_addr_length, phys_addr_length; 4136 t_scalar_t sap_length; 4137 dl_info_ack_t *dlia; 4138 ip_m_t *ipm; 4139 dl_qos_cl_sel1_t *sel1; 4140 int min_mtu; 4141 4142 ASSERT(IAM_WRITER_ILL(ill)); 4143 4144 /* 4145 * Till the ill is fully up the ill is not globally visible. 4146 * So no need for a lock. 4147 */ 4148 dlia = (dl_info_ack_t *)mp->b_rptr; 4149 ill->ill_mactype = dlia->dl_mac_type; 4150 4151 ipm = ip_m_lookup(dlia->dl_mac_type); 4152 if (ipm == NULL) { 4153 ipm = ip_m_lookup(DL_OTHER); 4154 ASSERT(ipm != NULL); 4155 } 4156 ill->ill_media = ipm; 4157 4158 /* 4159 * When the new DLPI stuff is ready we'll pull lengths 4160 * from dlia. 4161 */ 4162 if (dlia->dl_version == DL_VERSION_2) { 4163 brdcst_addr_length = dlia->dl_brdcst_addr_length; 4164 brdcst_addr = mi_offset_param(mp, dlia->dl_brdcst_addr_offset, 4165 brdcst_addr_length); 4166 if (brdcst_addr == NULL) { 4167 brdcst_addr_length = 0; 4168 } 4169 sap_length = dlia->dl_sap_length; 4170 phys_addr_length = dlia->dl_addr_length - ABS(sap_length); 4171 ip1dbg(("ip: bcast_len %d, sap_len %d, phys_len %d\n", 4172 brdcst_addr_length, sap_length, phys_addr_length)); 4173 } else { 4174 brdcst_addr_length = 6; 4175 brdcst_addr = ip_six_byte_all_ones; 4176 sap_length = -2; 4177 phys_addr_length = brdcst_addr_length; 4178 } 4179 4180 ill->ill_bcast_addr_length = brdcst_addr_length; 4181 ill->ill_phys_addr_length = phys_addr_length; 4182 ill->ill_sap_length = sap_length; 4183 4184 /* 4185 * Synthetic DLPI types such as SUNW_DL_IPMP specify a zero SDU, 4186 * but we must ensure a minimum IP MTU is used since other bits of 4187 * IP will fly apart otherwise. 4188 */ 4189 min_mtu = ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU; 4190 ill->ill_max_frag = MAX(min_mtu, dlia->dl_max_sdu); 4191 ill->ill_current_frag = ill->ill_max_frag; 4192 ill->ill_mtu = ill->ill_max_frag; 4193 ill->ill_mc_mtu = ill->ill_mtu; /* Overridden by DL_NOTE_SDU_SIZE2 */ 4194 4195 ill->ill_type = ipm->ip_m_type; 4196 4197 if (!ill->ill_dlpi_style_set) { 4198 if (dlia->dl_provider_style == DL_STYLE2) 4199 ill->ill_needs_attach = 1; 4200 4201 phyint_flags_init(ill->ill_phyint, ill->ill_mactype); 4202 4203 /* 4204 * Allocate the first ipif on this ill. We don't delay it 4205 * further as ioctl handling assumes at least one ipif exists. 4206 * 4207 * At this point we don't know whether the ill is v4 or v6. 4208 * We will know this when the SIOCSLIFNAME happens and 4209 * the correct value for ill_isv6 will be assigned in 4210 * ipif_set_values(). We need to hold the ill lock and 4211 * clear the ILL_LL_SUBNET_PENDING flag and atomically do 4212 * the wakeup. 4213 */ 4214 (void) ipif_allocate(ill, 0, IRE_LOCAL, 4215 dlia->dl_provider_style != DL_STYLE2, B_TRUE, NULL); 4216 mutex_enter(&ill->ill_lock); 4217 ASSERT(ill->ill_dlpi_style_set == 0); 4218 ill->ill_dlpi_style_set = 1; 4219 ill->ill_state_flags &= ~ILL_LL_SUBNET_PENDING; 4220 cv_broadcast(&ill->ill_cv); 4221 mutex_exit(&ill->ill_lock); 4222 freemsg(mp); 4223 return; 4224 } 4225 ASSERT(ill->ill_ipif != NULL); 4226 /* 4227 * We know whether it is IPv4 or IPv6 now, as this is the 4228 * second DL_INFO_ACK we are receiving in response to the 4229 * DL_INFO_REQ sent in ipif_set_values. 4230 */ 4231 ill->ill_sap = (ill->ill_isv6) ?
ipm->ip_m_ipv6sap : ipm->ip_m_ipv4sap; 4232 /* 4233 * Clear all the flags that were set based on ill_bcast_addr_length 4234 * and ill_phys_addr_length (in ipif_set_values) as these could have 4235 * changed now and we need to re-evaluate. 4236 */ 4237 ill->ill_flags &= ~(ILLF_MULTICAST | ILLF_NONUD | ILLF_NOARP); 4238 ill->ill_ipif->ipif_flags &= ~(IPIF_BROADCAST | IPIF_POINTOPOINT); 4239 4240 /* 4241 * Free ill_bcast_mp as things could have changed now. 4242 * 4243 * NOTE: The IPMP meta-interface is special-cased because it starts 4244 * with no underlying interfaces (and thus an unknown broadcast 4245 * address length), but we enforce that an interface is broadcast- 4246 * capable as part of allowing it to join a group. 4247 */ 4248 if (ill->ill_bcast_addr_length == 0 && !IS_IPMP(ill)) { 4249 if (ill->ill_bcast_mp != NULL) 4250 freemsg(ill->ill_bcast_mp); 4251 ill->ill_net_type = IRE_IF_NORESOLVER; 4252 4253 ill->ill_bcast_mp = ill_dlur_gen(NULL, 4254 ill->ill_phys_addr_length, 4255 ill->ill_sap, 4256 ill->ill_sap_length); 4257 4258 if (ill->ill_isv6) 4259 /* 4260 * Note: xresolv interfaces will eventually need NOARP 4261 * set here as well, but that will require those 4262 * external resolvers to have some knowledge of 4263 * that flag and act appropriately. Not to be changed 4264 * at present. 4265 */ 4266 ill->ill_flags |= ILLF_NONUD; 4267 else 4268 ill->ill_flags |= ILLF_NOARP; 4269 4270 if (ill->ill_mactype == SUNW_DL_VNI) { 4271 ill->ill_ipif->ipif_flags |= IPIF_NOXMIT; 4272 } else if (ill->ill_phys_addr_length == 0 || 4273 ill->ill_mactype == DL_IPV4 || 4274 ill->ill_mactype == DL_IPV6) { 4275 /* 4276 * The underlying link is point-to-point, so mark the 4277 * interface as such. We can do IP multicast over 4278 * such a link since it transmits all network-layer 4279 * packets to the remote side the same way. 4280 */ 4281 ill->ill_flags |= ILLF_MULTICAST; 4282 ill->ill_ipif->ipif_flags |= IPIF_POINTOPOINT; 4283 } 4284 } else { 4285 ill->ill_net_type = IRE_IF_RESOLVER; 4286 if (ill->ill_bcast_mp != NULL) 4287 freemsg(ill->ill_bcast_mp); 4288 ill->ill_bcast_mp = ill_dlur_gen(brdcst_addr, 4289 ill->ill_bcast_addr_length, ill->ill_sap, 4290 ill->ill_sap_length); 4291 /* 4292 * Later detect lack of DLPI driver multicast 4293 * capability by catching DL_ENABMULTI errors in 4294 * ip_rput_dlpi. 4295 */ 4296 ill->ill_flags |= ILLF_MULTICAST; 4297 if (!ill->ill_isv6) 4298 ill->ill_ipif->ipif_flags |= IPIF_BROADCAST; 4299 } 4300 4301 /* For IPMP, PHYI_IPMP should already be set by phyint_flags_init() */ 4302 if (ill->ill_mactype == SUNW_DL_IPMP) 4303 ASSERT(ill->ill_phyint->phyint_flags & PHYI_IPMP); 4304 4305 /* By default an interface does not support any CoS marking */ 4306 ill->ill_flags &= ~ILLF_COS_ENABLED; 4307 4308 /* 4309 * If we get QoS information in DL_INFO_ACK, the device supports 4310 * some form of CoS marking, set ILLF_COS_ENABLED. 4311 */ 4312 sel1 = (dl_qos_cl_sel1_t *)mi_offset_param(mp, dlia->dl_qos_offset, 4313 dlia->dl_qos_length); 4314 if ((sel1 != NULL) && (sel1->dl_qos_type == DL_QOS_CL_SEL1)) { 4315 ill->ill_flags |= ILLF_COS_ENABLED; 4316 } 4317 4318 /* Clear any previous error indication. */ 4319 ill->ill_error = 0; 4320 freemsg(mp); 4321 } 4322 4323 /* 4324 * Perform various checks to verify that an address would make sense as a 4325 * local, remote, or subnet interface address.
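* For example (a sketch of the checks below with ordinary dotted-quad values): given subnet_mask 255.255.255.0, the subnet address 192.0.2.0 and the subnet broadcast 192.0.2.255 are rejected, while 192.0.2.1 is accepted; 255.255.255.255 and class D (multicast) addresses are always rejected.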
*/ 4327 static boolean_t 4328 ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask) 4329 { 4330 ipaddr_t net_mask; 4331 4332 /* 4333 * Don't allow all zeroes, or all ones, but allow 4334 * all ones netmask. 4335 */ 4336 if ((net_mask = ip_net_mask(addr)) == 0) 4337 return (B_FALSE); 4338 /* A given netmask overrides the "guess" netmask */ 4339 if (subnet_mask != 0) 4340 net_mask = subnet_mask; 4341 if ((net_mask != ~(ipaddr_t)0) && ((addr == (addr & net_mask)) || 4342 (addr == (addr | ~net_mask)))) { 4343 return (B_FALSE); 4344 } 4345 4346 /* 4347 * Even if the netmask is all ones, we do not allow address to be 4348 * 255.255.255.255 4349 */ 4350 if (addr == INADDR_BROADCAST) 4351 return (B_FALSE); 4352 4353 if (CLASSD(addr)) 4354 return (B_FALSE); 4355 4356 return (B_TRUE); 4357 } 4358 4359 #define V6_IPIF_LINKLOCAL(p) \ 4360 IN6_IS_ADDR_LINKLOCAL(&(p)->ipif_v6lcl_addr) 4361 4362 /* 4363 * Compare two given ipifs and check if the second one is better than 4364 * the first one using the order of preference (not taking deprecated 4365 * into account) specified in ipif_lookup_multicast(). 4366 */ 4367 static boolean_t 4368 ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif, boolean_t isv6) 4369 { 4370 /* Check the least preferred first. */ 4371 if (IS_LOOPBACK(old_ipif->ipif_ill)) { 4372 /* If both ipifs are the same, use the first one. */ 4373 if (IS_LOOPBACK(new_ipif->ipif_ill)) 4374 return (B_FALSE); 4375 else 4376 return (B_TRUE); 4377 } 4378 4379 /* For IPv6, check for link local address. */ 4380 if (isv6 && V6_IPIF_LINKLOCAL(old_ipif)) { 4381 if (IS_LOOPBACK(new_ipif->ipif_ill) || 4382 V6_IPIF_LINKLOCAL(new_ipif)) { 4383 /* The second one is equal or less preferred. */ 4384 return (B_FALSE); 4385 } else { 4386 return (B_TRUE); 4387 } 4388 } 4389 4390 /* Then check for point to point interface. */ 4391 if (old_ipif->ipif_flags & IPIF_POINTOPOINT) { 4392 if (IS_LOOPBACK(new_ipif->ipif_ill) || 4393 (isv6 && V6_IPIF_LINKLOCAL(new_ipif)) || 4394 (new_ipif->ipif_flags & IPIF_POINTOPOINT)) { 4395 return (B_FALSE); 4396 } else { 4397 return (B_TRUE); 4398 } 4399 } 4400 4401 /* old_ipif is a normal interface, so no need to use the new one. */ 4402 return (B_FALSE); 4403 } 4404 4405 /* 4406 * Find a multicast-capable ipif given an IP instance and zoneid. 4407 * The ipif must be up, and its ill must be multicast-capable, not 4408 * condemned, not an underlying interface in an IPMP group, and 4409 * not a VNI interface. Order of preference: 4410 * 4411 * 1a. normal 4412 * 1b. normal, but deprecated 4413 * 2a. point to point 4414 * 2b. point to point, but deprecated 4415 * 3a. link local 4416 * 3b. link local, but deprecated 4417 * 4. loopback.
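* (By this ordering, for example, a deprecated address on a normal interface (1b) is still preferred over a non-deprecated point-to-point address (2a).)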
*/ 4419 static ipif_t * 4420 ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6) 4421 { 4422 ill_t *ill; 4423 ill_walk_context_t ctx; 4424 ipif_t *ipif; 4425 ipif_t *saved_ipif = NULL; 4426 ipif_t *dep_ipif = NULL; 4427 4428 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 4429 if (isv6) 4430 ill = ILL_START_WALK_V6(&ctx, ipst); 4431 else 4432 ill = ILL_START_WALK_V4(&ctx, ipst); 4433 4434 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 4435 mutex_enter(&ill->ill_lock); 4436 if (IS_VNI(ill) || IS_UNDER_IPMP(ill) || 4437 ILL_IS_CONDEMNED(ill) || 4438 !(ill->ill_flags & ILLF_MULTICAST)) { 4439 mutex_exit(&ill->ill_lock); 4440 continue; 4441 } 4442 for (ipif = ill->ill_ipif; ipif != NULL; 4443 ipif = ipif->ipif_next) { 4444 if (zoneid != ipif->ipif_zoneid && 4445 zoneid != ALL_ZONES && 4446 ipif->ipif_zoneid != ALL_ZONES) { 4447 continue; 4448 } 4449 if (!(ipif->ipif_flags & IPIF_UP) || 4450 IPIF_IS_CONDEMNED(ipif)) { 4451 continue; 4452 } 4453 4454 /* 4455 * Found one candidate. If it is deprecated, 4456 * remember it in dep_ipif. If it is not deprecated, 4457 * remember it in saved_ipif. 4458 */ 4459 if (ipif->ipif_flags & IPIF_DEPRECATED) { 4460 if (dep_ipif == NULL) { 4461 dep_ipif = ipif; 4462 } else if (ipif_comp_multi(dep_ipif, ipif, 4463 isv6)) { 4464 /* 4465 * If the previous dep_ipif does not 4466 * belong to the same ill, we've done 4467 * an ipif_refhold() on it. So we need 4468 * to release it. 4469 */ 4470 if (dep_ipif->ipif_ill != ill) 4471 ipif_refrele(dep_ipif); 4472 dep_ipif = ipif; 4473 } 4474 continue; 4475 } 4476 if (saved_ipif == NULL) { 4477 saved_ipif = ipif; 4478 } else { 4479 if (ipif_comp_multi(saved_ipif, ipif, isv6)) { 4480 if (saved_ipif->ipif_ill != ill) 4481 ipif_refrele(saved_ipif); 4482 saved_ipif = ipif; 4483 } 4484 } 4485 } 4486 /* 4487 * Before going to the next ill, do an ipif_refhold() on the 4488 * saved ones. 4489 */ 4490 if (saved_ipif != NULL && saved_ipif->ipif_ill == ill) 4491 ipif_refhold_locked(saved_ipif); 4492 if (dep_ipif != NULL && dep_ipif->ipif_ill == ill) 4493 ipif_refhold_locked(dep_ipif); 4494 mutex_exit(&ill->ill_lock); 4495 } 4496 rw_exit(&ipst->ips_ill_g_lock); 4497 4498 /* 4499 * If we have only the saved_ipif, return it. But if we have both 4500 * saved_ipif and dep_ipif, check to see which one is better. 4501 */ 4502 if (saved_ipif != NULL) { 4503 if (dep_ipif != NULL) { 4504 if (ipif_comp_multi(saved_ipif, dep_ipif, isv6)) { 4505 ipif_refrele(saved_ipif); 4506 return (dep_ipif); 4507 } else { 4508 ipif_refrele(dep_ipif); 4509 return (saved_ipif); 4510 } 4511 } 4512 return (saved_ipif); 4513 } else { 4514 return (dep_ipif); 4515 } 4516 } 4517 4518 ill_t * 4519 ill_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6) 4520 { 4521 ipif_t *ipif; 4522 ill_t *ill; 4523 4524 ipif = ipif_lookup_multicast(ipst, zoneid, isv6); 4525 if (ipif == NULL) 4526 return (NULL); 4527 4528 ill = ipif->ipif_ill; 4529 ill_refhold(ill); 4530 ipif_refrele(ipif); 4531 return (ill); 4532 } 4533 4534 /* 4535 * This function is called when an application does not specify an interface 4536 * to be used for multicast traffic (joining a group/sending data). It 4537 * calls ire_lookup_multi() to look for an interface route for the 4538 * specified multicast group. Doing this allows the administrator to add 4539 * prefix routes for multicast to indicate which interface should be used for 4540 * multicast traffic in the above scenario.
The route could be for all 4541 * multicast (224.0/4), for a single multicast group (a /32 route) or 4542 * anything in between. If there is no such multicast route, we just find 4543 * any multicast capable interface and return it. The returned ipif 4544 * is refhold'ed. 4545 * 4546 * We support MULTIRT and RTF_SETSRC on the multicast routes added to the 4547 * unicast table. This is used by CGTP. 4548 */ 4549 ill_t * 4550 ill_lookup_group_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst, 4551 boolean_t *multirtp, ipaddr_t *setsrcp) 4552 { 4553 ill_t *ill; 4554 4555 ill = ire_lookup_multi_ill_v4(group, zoneid, ipst, multirtp, setsrcp); 4556 if (ill != NULL) 4557 return (ill); 4558 4559 return (ill_lookup_multicast(ipst, zoneid, B_FALSE)); 4560 } 4561 4562 /* 4563 * Look for an ipif with the specified interface address and destination. 4564 * The destination address is used only for matching point-to-point interfaces. 4565 */ 4566 ipif_t * 4567 ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, ip_stack_t *ipst) 4568 { 4569 ipif_t *ipif; 4570 ill_t *ill; 4571 ill_walk_context_t ctx; 4572 4573 /* 4574 * First match all the point-to-point interfaces 4575 * before looking at non-point-to-point interfaces. 4576 * This is done to avoid returning non-point-to-point 4577 * ipif instead of unnumbered point-to-point ipif. 4578 */ 4579 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 4580 ill = ILL_START_WALK_V4(&ctx, ipst); 4581 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 4582 mutex_enter(&ill->ill_lock); 4583 for (ipif = ill->ill_ipif; ipif != NULL; 4584 ipif = ipif->ipif_next) { 4585 /* Allow the ipif to be down */ 4586 if ((ipif->ipif_flags & IPIF_POINTOPOINT) && 4587 (ipif->ipif_lcl_addr == if_addr) && 4588 (ipif->ipif_pp_dst_addr == dst)) { 4589 if (!IPIF_IS_CONDEMNED(ipif)) { 4590 ipif_refhold_locked(ipif); 4591 mutex_exit(&ill->ill_lock); 4592 rw_exit(&ipst->ips_ill_g_lock); 4593 return (ipif); 4594 } 4595 } 4596 } 4597 mutex_exit(&ill->ill_lock); 4598 } 4599 rw_exit(&ipst->ips_ill_g_lock); 4600 4601 /* lookup the ipif based on interface address */ 4602 ipif = ipif_lookup_addr(if_addr, NULL, ALL_ZONES, ipst); 4603 ASSERT(ipif == NULL || !ipif->ipif_isv6); 4604 return (ipif); 4605 } 4606 4607 /* 4608 * Common function for ipif_lookup_addr() and ipif_lookup_addr_exact(). 4609 */ 4610 static ipif_t * 4611 ipif_lookup_addr_common(ipaddr_t addr, ill_t *match_ill, uint32_t match_flags, 4612 zoneid_t zoneid, ip_stack_t *ipst) 4613 { 4614 ipif_t *ipif; 4615 ill_t *ill; 4616 boolean_t ptp = B_FALSE; 4617 ill_walk_context_t ctx; 4618 boolean_t match_illgrp = (match_flags & IPIF_MATCH_ILLGRP); 4619 boolean_t no_duplicate = (match_flags & IPIF_MATCH_NONDUP); 4620 4621 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 4622 /* 4623 * Repeat twice, first based on local addresses and 4624 * next time for pointopoint. 
*/ 4626 repeat: 4627 ill = ILL_START_WALK_V4(&ctx, ipst); 4628 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 4629 if (match_ill != NULL && ill != match_ill && 4630 (!match_illgrp || !IS_IN_SAME_ILLGRP(ill, match_ill))) { 4631 continue; 4632 } 4633 mutex_enter(&ill->ill_lock); 4634 for (ipif = ill->ill_ipif; ipif != NULL; 4635 ipif = ipif->ipif_next) { 4636 if (zoneid != ALL_ZONES && 4637 zoneid != ipif->ipif_zoneid && 4638 ipif->ipif_zoneid != ALL_ZONES) 4639 continue; 4640 4641 if (no_duplicate && !(ipif->ipif_flags & IPIF_UP)) 4642 continue; 4643 4644 /* Allow the ipif to be down */ 4645 if ((!ptp && (ipif->ipif_lcl_addr == addr) && 4646 ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) || 4647 (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) && 4648 (ipif->ipif_pp_dst_addr == addr))) { 4649 if (!IPIF_IS_CONDEMNED(ipif)) { 4650 ipif_refhold_locked(ipif); 4651 mutex_exit(&ill->ill_lock); 4652 rw_exit(&ipst->ips_ill_g_lock); 4653 return (ipif); 4654 } 4655 } 4656 } 4657 mutex_exit(&ill->ill_lock); 4658 } 4659 4660 /* If we already did the ptp case, then we are done */ 4661 if (ptp) { 4662 rw_exit(&ipst->ips_ill_g_lock); 4663 return (NULL); 4664 } 4665 ptp = B_TRUE; 4666 goto repeat; 4667 } 4668 4669 /* 4670 * Lookup an ipif with the specified address. For point-to-point links we 4671 * look for matches on either the destination address or the local address, 4672 * but we skip the local address check if IPIF_UNNUMBERED is set. If the 4673 * `match_ill' argument is non-NULL, the lookup is restricted to that ill 4674 * (or illgrp if `match_ill' is in an IPMP group). 4675 */ 4676 ipif_t * 4677 ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, 4678 ip_stack_t *ipst) 4679 { 4680 return (ipif_lookup_addr_common(addr, match_ill, IPIF_MATCH_ILLGRP, 4681 zoneid, ipst)); 4682 } 4683 4684 /* 4685 * Lookup an ipif with the specified address. Similar to ipif_lookup_addr, 4686 * except that we will only return an address if it is not marked as 4687 * IPIF_DUPLICATE. 4688 */ 4689 ipif_t * 4690 ipif_lookup_addr_nondup(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, 4691 ip_stack_t *ipst) 4692 { 4693 return (ipif_lookup_addr_common(addr, match_ill, 4694 (IPIF_MATCH_ILLGRP | IPIF_MATCH_NONDUP), 4695 zoneid, ipst)); 4696 } 4697 4698 /* 4699 * Special abbreviated version of ipif_lookup_addr() that doesn't match 4700 * `match_ill' across the IPMP group. This function is only needed in some 4701 * corner-cases; almost everything should use ipif_lookup_addr(). 4702 */ 4703 ipif_t * 4704 ipif_lookup_addr_exact(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst) 4705 { 4706 ASSERT(match_ill != NULL); 4707 return (ipif_lookup_addr_common(addr, match_ill, 0, ALL_ZONES, 4708 ipst)); 4709 } 4710 4711 /* 4712 * Look for an ipif with the specified address. For point-to-point links 4713 * we look for matches on either the destination address or the local 4714 * address, but we ignore the check on the local address if IPIF_UNNUMBERED 4715 * is set. 4716 * If the `match_ill' argument is non-NULL, the lookup is restricted to that 4717 * ill (or illgrp if `match_ill' is in an IPMP group). 4718 * Return the zoneid for the ipif which matches. ALL_ZONES if no match.
*/ 4720 zoneid_t 4721 ipif_lookup_addr_zoneid(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst) 4722 { 4723 zoneid_t zoneid; 4724 ipif_t *ipif; 4725 ill_t *ill; 4726 boolean_t ptp = B_FALSE; 4727 ill_walk_context_t ctx; 4728 4729 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 4730 /* 4731 * Repeat twice, first based on local addresses and 4732 * next time for pointopoint. 4733 */ 4734 repeat: 4735 ill = ILL_START_WALK_V4(&ctx, ipst); 4736 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 4737 if (match_ill != NULL && ill != match_ill && 4738 !IS_IN_SAME_ILLGRP(ill, match_ill)) { 4739 continue; 4740 } 4741 mutex_enter(&ill->ill_lock); 4742 for (ipif = ill->ill_ipif; ipif != NULL; 4743 ipif = ipif->ipif_next) { 4744 /* Allow the ipif to be down */ 4745 if ((!ptp && (ipif->ipif_lcl_addr == addr) && 4746 ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) || 4747 (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) && 4748 (ipif->ipif_pp_dst_addr == addr)) && 4749 !(ipif->ipif_state_flags & IPIF_CONDEMNED)) { 4750 zoneid = ipif->ipif_zoneid; 4751 mutex_exit(&ill->ill_lock); 4752 rw_exit(&ipst->ips_ill_g_lock); 4753 /* 4754 * If ipif_zoneid was ALL_ZONES then we have 4755 * a trusted extensions shared IP address. 4756 * In that case GLOBAL_ZONEID works to send. 4757 */ 4758 if (zoneid == ALL_ZONES) 4759 zoneid = GLOBAL_ZONEID; 4760 return (zoneid); 4761 } 4762 } 4763 mutex_exit(&ill->ill_lock); 4764 } 4765 4766 /* If we already did the ptp case, then we are done */ 4767 if (ptp) { 4768 rw_exit(&ipst->ips_ill_g_lock); 4769 return (ALL_ZONES); 4770 } 4771 ptp = B_TRUE; 4772 goto repeat; 4773 } 4774 4775 /* 4776 * Look for an ipif that matches the specified remote address i.e. the 4777 * ipif that would receive the specified packet. 4778 * First look for directly connected interfaces and then do a recursive 4779 * IRE lookup and pick the first ipif corresponding to the source address in the 4780 * ire. 4781 * Returns: held ipif 4782 * 4783 * This is only used for ICMP_ADDRESS_MASK_REQUESTs 4784 */ 4785 ipif_t * 4786 ipif_lookup_remote(ill_t *ill, ipaddr_t addr, zoneid_t zoneid) 4787 { 4788 ipif_t *ipif; 4789 4790 ASSERT(!ill->ill_isv6); 4791 4792 /* 4793 * Someone could be changing this ipif currently or change it 4794 * after we return this. Thus a few packets could use the old 4795 * values. However, structure updates/creates (ire, ilg, ilm etc.) 4796 * will atomically be updated or cleaned up with the new value. 4797 * Thus we don't need a lock to check the flags or other attrs below. 4798 */ 4799 mutex_enter(&ill->ill_lock); 4800 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 4801 if (IPIF_IS_CONDEMNED(ipif)) 4802 continue; 4803 if (zoneid != ALL_ZONES && zoneid != ipif->ipif_zoneid && 4804 ipif->ipif_zoneid != ALL_ZONES) 4805 continue; 4806 /* Allow the ipif to be down */ 4807 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 4808 if ((ipif->ipif_pp_dst_addr == addr) || 4809 (!(ipif->ipif_flags & IPIF_UNNUMBERED) && 4810 ipif->ipif_lcl_addr == addr)) { 4811 ipif_refhold_locked(ipif); 4812 mutex_exit(&ill->ill_lock); 4813 return (ipif); 4814 } 4815 } else if (ipif->ipif_subnet == (addr & ipif->ipif_net_mask)) { 4816 ipif_refhold_locked(ipif); 4817 mutex_exit(&ill->ill_lock); 4818 return (ipif); 4819 } 4820 } 4821 mutex_exit(&ill->ill_lock); 4822 /* 4823 * For a remote destination it isn't possible to nail down a particular 4824 * ipif.
4825 */ 4826 4827 /* Pick the first interface */ 4828 ipif = ipif_get_next_ipif(NULL, ill); 4829 return (ipif); 4830 } 4831 4832 /* 4833 * This func does not prevent refcnt from increasing. But if 4834 * the caller has taken steps to that effect, then this func 4835 * can be used to determine whether the ill has become quiescent 4836 */ 4837 static boolean_t 4838 ill_is_quiescent(ill_t *ill) 4839 { 4840 ipif_t *ipif; 4841 4842 ASSERT(MUTEX_HELD(&ill->ill_lock)); 4843 4844 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 4845 if (ipif->ipif_refcnt != 0) 4846 return (B_FALSE); 4847 } 4848 if (!ILL_DOWN_OK(ill) || ill->ill_refcnt != 0) { 4849 return (B_FALSE); 4850 } 4851 return (B_TRUE); 4852 } 4853 4854 boolean_t 4855 ill_is_freeable(ill_t *ill) 4856 { 4857 ipif_t *ipif; 4858 4859 ASSERT(MUTEX_HELD(&ill->ill_lock)); 4860 4861 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 4862 if (ipif->ipif_refcnt != 0) { 4863 return (B_FALSE); 4864 } 4865 } 4866 if (!ILL_FREE_OK(ill) || ill->ill_refcnt != 0) { 4867 return (B_FALSE); 4868 } 4869 return (B_TRUE); 4870 } 4871 4872 /* 4873 * This func does not prevent refcnt from increasing. But if 4874 * the caller has taken steps to that effect, then this func 4875 * can be used to determine whether the ipif has become quiescent 4876 */ 4877 static boolean_t 4878 ipif_is_quiescent(ipif_t *ipif) 4879 { 4880 ill_t *ill; 4881 4882 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 4883 4884 if (ipif->ipif_refcnt != 0) 4885 return (B_FALSE); 4886 4887 ill = ipif->ipif_ill; 4888 if (ill->ill_ipif_up_count != 0 || ill->ill_ipif_dup_count != 0 || 4889 ill->ill_logical_down) { 4890 return (B_TRUE); 4891 } 4892 4893 /* This is the last ipif going down or being deleted on this ill */ 4894 if (ill->ill_ire_cnt != 0 || ill->ill_refcnt != 0) { 4895 return (B_FALSE); 4896 } 4897 4898 return (B_TRUE); 4899 } 4900 4901 /* 4902 * return true if the ipif can be destroyed: the ipif has to be quiescent 4903 * with zero references from ire/ilm to it. 4904 */ 4905 static boolean_t 4906 ipif_is_freeable(ipif_t *ipif) 4907 { 4908 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 4909 ASSERT(ipif->ipif_id != 0); 4910 return (ipif->ipif_refcnt == 0); 4911 } 4912 4913 /* 4914 * The ipif/ill/ire has been refreled. Do the tail processing. 
4915 * Determine if the ipif or ill in question has become quiescent and if so 4916 * wake up close and/or restart any queued pending ioctl that is waiting 4917 * for the ipif_down (or ill_down) 4918 */ 4919 void 4920 ipif_ill_refrele_tail(ill_t *ill) 4921 { 4922 mblk_t *mp; 4923 conn_t *connp; 4924 ipsq_t *ipsq; 4925 ipxop_t *ipx; 4926 ipif_t *ipif; 4927 dl_notify_ind_t *dlindp; 4928 4929 ASSERT(MUTEX_HELD(&ill->ill_lock)); 4930 4931 if ((ill->ill_state_flags & ILL_CONDEMNED) && ill_is_freeable(ill)) { 4932 /* ip_modclose() may be waiting */ 4933 cv_broadcast(&ill->ill_cv); 4934 } 4935 4936 ipsq = ill->ill_phyint->phyint_ipsq; 4937 mutex_enter(&ipsq->ipsq_lock); 4938 ipx = ipsq->ipsq_xop; 4939 mutex_enter(&ipx->ipx_lock); 4940 if (ipx->ipx_waitfor == 0) /* no one's waiting; bail */ 4941 goto unlock; 4942 4943 ASSERT(ipx->ipx_pending_mp != NULL && ipx->ipx_pending_ipif != NULL); 4944 4945 ipif = ipx->ipx_pending_ipif; 4946 if (ipif->ipif_ill != ill) /* wait is for another ill; bail */ 4947 goto unlock; 4948 4949 switch (ipx->ipx_waitfor) { 4950 case IPIF_DOWN: 4951 if (!ipif_is_quiescent(ipif)) 4952 goto unlock; 4953 break; 4954 case IPIF_FREE: 4955 if (!ipif_is_freeable(ipif)) 4956 goto unlock; 4957 break; 4958 case ILL_DOWN: 4959 if (!ill_is_quiescent(ill)) 4960 goto unlock; 4961 break; 4962 case ILL_FREE: 4963 /* 4964 * ILL_FREE is only for loopback; normal ill teardown waits 4965 * synchronously in ip_modclose() without using ipx_waitfor, 4966 * handled by the cv_broadcast() at the top of this function. 4967 */ 4968 if (!ill_is_freeable(ill)) 4969 goto unlock; 4970 break; 4971 default: 4972 cmn_err(CE_PANIC, "ipsq: %p unknown ipx_waitfor %d\n", 4973 (void *)ipsq, ipx->ipx_waitfor); 4974 } 4975 4976 ill_refhold_locked(ill); /* for qwriter_ip() call below */ 4977 mutex_exit(&ipx->ipx_lock); 4978 mp = ipsq_pending_mp_get(ipsq, &connp); 4979 mutex_exit(&ipsq->ipsq_lock); 4980 mutex_exit(&ill->ill_lock); 4981 4982 ASSERT(mp != NULL); 4983 /* 4984 * NOTE: all of the qwriter_ip() calls below use CUR_OP since 4985 * we can only get here when the current operation decides it 4986 * needs to quiesce via ipsq_pending_mp_add(). 4987 */ 4988 switch (mp->b_datap->db_type) { 4989 case M_PCPROTO: 4990 case M_PROTO: 4991 /* 4992 * For now, only DL_NOTIFY_IND messages can use this facility. 4993 */ 4994 dlindp = (dl_notify_ind_t *)mp->b_rptr; 4995 ASSERT(dlindp->dl_primitive == DL_NOTIFY_IND); 4996 4997 switch (dlindp->dl_notification) { 4998 case DL_NOTE_PHYS_ADDR: 4999 qwriter_ip(ill, ill->ill_rq, mp, 5000 ill_set_phys_addr_tail, CUR_OP, B_TRUE); 5001 return; 5002 case DL_NOTE_REPLUMB: 5003 qwriter_ip(ill, ill->ill_rq, mp, 5004 ill_replumb_tail, CUR_OP, B_TRUE); 5005 return; 5006 default: 5007 ASSERT(0); 5008 ill_refrele(ill); 5009 } 5010 break; 5011 5012 case M_ERROR: 5013 case M_HANGUP: 5014 qwriter_ip(ill, ill->ill_rq, mp, ipif_all_down_tail, CUR_OP, 5015 B_TRUE); 5016 return; 5017 5018 case M_IOCTL: 5019 case M_IOCDATA: 5020 qwriter_ip(ill, (connp != NULL ?
CONNP_TO_WQ(connp) : 5021 ill->ill_wq), mp, ip_reprocess_ioctl, CUR_OP, B_TRUE); 5022 return; 5023 5024 default: 5025 cmn_err(CE_PANIC, "ipif_ill_refrele_tail mp %p " 5026 "db_type %d\n", (void *)mp, mp->b_datap->db_type); 5027 } 5028 return; 5029 unlock: 5030 mutex_exit(&ipsq->ipsq_lock); 5031 mutex_exit(&ipx->ipx_lock); 5032 mutex_exit(&ill->ill_lock); 5033 } 5034 5035 #ifdef DEBUG 5036 /* Reuse trace buffer from beginning (if reached the end) and record trace */ 5037 static void 5038 th_trace_rrecord(th_trace_t *th_trace) 5039 { 5040 tr_buf_t *tr_buf; 5041 uint_t lastref; 5042 5043 lastref = th_trace->th_trace_lastref; 5044 lastref++; 5045 if (lastref == TR_BUF_MAX) 5046 lastref = 0; 5047 th_trace->th_trace_lastref = lastref; 5048 tr_buf = &th_trace->th_trbuf[lastref]; 5049 tr_buf->tr_time = ddi_get_lbolt(); 5050 tr_buf->tr_depth = getpcstack(tr_buf->tr_stack, TR_STACK_DEPTH); 5051 } 5052 5053 static void 5054 th_trace_free(void *value) 5055 { 5056 th_trace_t *th_trace = value; 5057 5058 ASSERT(th_trace->th_refcnt == 0); 5059 kmem_free(th_trace, sizeof (*th_trace)); 5060 } 5061 5062 /* 5063 * Find or create the per-thread hash table used to track object references. 5064 * The ipst argument is NULL if we shouldn't allocate. 5065 * 5066 * Accesses per-thread data, so there's no need to lock here. 5067 */ 5068 static mod_hash_t * 5069 th_trace_gethash(ip_stack_t *ipst) 5070 { 5071 th_hash_t *thh; 5072 5073 if ((thh = tsd_get(ip_thread_data)) == NULL && ipst != NULL) { 5074 mod_hash_t *mh; 5075 char name[256]; 5076 size_t objsize, rshift; 5077 int retv; 5078 5079 if ((thh = kmem_alloc(sizeof (*thh), KM_NOSLEEP)) == NULL) 5080 return (NULL); 5081 (void) snprintf(name, sizeof (name), "th_trace_%p", 5082 (void *)curthread); 5083 5084 /* 5085 * We use mod_hash_create_extended here rather than the more 5086 * obvious mod_hash_create_ptrhash because the latter has a 5087 * hard-coded KM_SLEEP, and we'd prefer to fail rather than 5088 * block. 5089 */ 5090 objsize = MAX(MAX(sizeof (ill_t), sizeof (ipif_t)), 5091 MAX(sizeof (ire_t), sizeof (ncec_t))); 5092 rshift = highbit(objsize); 5093 mh = mod_hash_create_extended(name, 64, mod_hash_null_keydtor, 5094 th_trace_free, mod_hash_byptr, (void *)rshift, 5095 mod_hash_ptrkey_cmp, KM_NOSLEEP); 5096 if (mh == NULL) { 5097 kmem_free(thh, sizeof (*thh)); 5098 return (NULL); 5099 } 5100 thh->thh_hash = mh; 5101 thh->thh_ipst = ipst; 5102 /* 5103 * We trace ills, ipifs, ires, and nces. All of these are 5104 * per-IP-stack, so the lock on the thread list is as well. 5105 */ 5106 rw_enter(&ip_thread_rwlock, RW_WRITER); 5107 list_insert_tail(&ip_thread_list, thh); 5108 rw_exit(&ip_thread_rwlock); 5109 retv = tsd_set(ip_thread_data, thh); 5110 ASSERT(retv == 0); 5111 } 5112 return (thh != NULL ? thh->thh_hash : NULL); 5113 } 5114 5115 boolean_t 5116 th_trace_ref(const void *obj, ip_stack_t *ipst) 5117 { 5118 th_trace_t *th_trace; 5119 mod_hash_t *mh; 5120 mod_hash_val_t val; 5121 5122 if ((mh = th_trace_gethash(ipst)) == NULL) 5123 return (B_FALSE); 5124 5125 /* 5126 * Attempt to locate the trace buffer for this obj and thread. 5127 * If it does not exist, then allocate a new trace buffer and 5128 * insert into the hash. 
*/ 5130 if (mod_hash_find(mh, (mod_hash_key_t)obj, &val) == MH_ERR_NOTFOUND) { 5131 th_trace = kmem_zalloc(sizeof (th_trace_t), KM_NOSLEEP); 5132 if (th_trace == NULL) 5133 return (B_FALSE); 5134 5135 th_trace->th_id = curthread; 5136 if (mod_hash_insert(mh, (mod_hash_key_t)obj, 5137 (mod_hash_val_t)th_trace) != 0) { 5138 kmem_free(th_trace, sizeof (th_trace_t)); 5139 return (B_FALSE); 5140 } 5141 } else { 5142 th_trace = (th_trace_t *)val; 5143 } 5144 5145 ASSERT(th_trace->th_refcnt >= 0 && 5146 th_trace->th_refcnt < TR_BUF_MAX - 1); 5147 5148 th_trace->th_refcnt++; 5149 th_trace_rrecord(th_trace); 5150 return (B_TRUE); 5151 } 5152 5153 /* 5154 * For the purpose of tracing a reference release, we assume that global 5155 * tracing is always on and that the same thread that initiated the reference 5156 * hold is the one releasing it. 5157 */ 5158 void 5159 th_trace_unref(const void *obj) 5160 { 5161 int retv; 5162 mod_hash_t *mh; 5163 th_trace_t *th_trace; 5164 mod_hash_val_t val; 5165 5166 mh = th_trace_gethash(NULL); 5167 retv = mod_hash_find(mh, (mod_hash_key_t)obj, &val); 5168 ASSERT(retv == 0); 5169 th_trace = (th_trace_t *)val; 5170 5171 ASSERT(th_trace->th_refcnt > 0); 5172 th_trace->th_refcnt--; 5173 th_trace_rrecord(th_trace); 5174 } 5175 5176 /* 5177 * If tracing has been disabled, then we assume that the reference counts are 5178 * now useless, and we clear them out before destroying the entries. 5179 */ 5180 void 5181 th_trace_cleanup(const void *obj, boolean_t trace_disable) 5182 { 5183 th_hash_t *thh; 5184 mod_hash_t *mh; 5185 mod_hash_val_t val; 5186 th_trace_t *th_trace; 5187 int retv; 5188 5189 rw_enter(&ip_thread_rwlock, RW_READER); 5190 for (thh = list_head(&ip_thread_list); thh != NULL; 5191 thh = list_next(&ip_thread_list, thh)) { 5192 if (mod_hash_find(mh = thh->thh_hash, (mod_hash_key_t)obj, 5193 &val) == 0) { 5194 th_trace = (th_trace_t *)val; 5195 if (trace_disable) 5196 th_trace->th_refcnt = 0; 5197 retv = mod_hash_destroy(mh, (mod_hash_key_t)obj); 5198 ASSERT(retv == 0); 5199 } 5200 } 5201 rw_exit(&ip_thread_rwlock); 5202 } 5203 5204 void 5205 ipif_trace_ref(ipif_t *ipif) 5206 { 5207 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 5208 5209 if (ipif->ipif_trace_disable) 5210 return; 5211 5212 if (!th_trace_ref(ipif, ipif->ipif_ill->ill_ipst)) { 5213 ipif->ipif_trace_disable = B_TRUE; 5214 ipif_trace_cleanup(ipif); 5215 } 5216 } 5217 5218 void 5219 ipif_untrace_ref(ipif_t *ipif) 5220 { 5221 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 5222 5223 if (!ipif->ipif_trace_disable) 5224 th_trace_unref(ipif); 5225 } 5226 5227 void 5228 ill_trace_ref(ill_t *ill) 5229 { 5230 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5231 5232 if (ill->ill_trace_disable) 5233 return; 5234 5235 if (!th_trace_ref(ill, ill->ill_ipst)) { 5236 ill->ill_trace_disable = B_TRUE; 5237 ill_trace_cleanup(ill); 5238 } 5239 } 5240 5241 void 5242 ill_untrace_ref(ill_t *ill) 5243 { 5244 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5245 5246 if (!ill->ill_trace_disable) 5247 th_trace_unref(ill); 5248 } 5249 5250 /* 5251 * Called when ipif is unplumbed or when memory alloc fails. Note that on 5252 * failure, ipif_trace_disable is set. 5253 */ 5254 static void 5255 ipif_trace_cleanup(const ipif_t *ipif) 5256 { 5257 th_trace_cleanup(ipif, ipif->ipif_trace_disable); 5258 } 5259 5260 /* 5261 * Called when ill is unplumbed or when memory alloc fails. Note that on 5262 * failure, ill_trace_disable is set.
*/ 5264 static void 5265 ill_trace_cleanup(const ill_t *ill) 5266 { 5267 th_trace_cleanup(ill, ill->ill_trace_disable); 5268 } 5269 #endif /* DEBUG */ 5270 5271 void 5272 ipif_refhold_locked(ipif_t *ipif) 5273 { 5274 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 5275 ipif->ipif_refcnt++; 5276 IPIF_TRACE_REF(ipif); 5277 } 5278 5279 void 5280 ipif_refhold(ipif_t *ipif) 5281 { 5282 ill_t *ill; 5283 5284 ill = ipif->ipif_ill; 5285 mutex_enter(&ill->ill_lock); 5286 ipif->ipif_refcnt++; 5287 IPIF_TRACE_REF(ipif); 5288 mutex_exit(&ill->ill_lock); 5289 } 5290 5291 /* 5292 * Must not be called while holding any locks. Otherwise if this is 5293 * the last reference to be released there is a chance of recursive mutex 5294 * panic due to ipif_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying 5295 * to restart an ioctl. 5296 */ 5297 void 5298 ipif_refrele(ipif_t *ipif) 5299 { 5300 ill_t *ill; 5301 5302 ill = ipif->ipif_ill; 5303 5304 mutex_enter(&ill->ill_lock); 5305 ASSERT(ipif->ipif_refcnt != 0); 5306 ipif->ipif_refcnt--; 5307 IPIF_UNTRACE_REF(ipif); 5308 if (ipif->ipif_refcnt != 0) { 5309 mutex_exit(&ill->ill_lock); 5310 return; 5311 } 5312 5313 /* Drops the ill_lock */ 5314 ipif_ill_refrele_tail(ill); 5315 } 5316 5317 ipif_t * 5318 ipif_get_next_ipif(ipif_t *curr, ill_t *ill) 5319 { 5320 ipif_t *ipif; 5321 5322 mutex_enter(&ill->ill_lock); 5323 for (ipif = (curr == NULL ? ill->ill_ipif : curr->ipif_next); 5324 ipif != NULL; ipif = ipif->ipif_next) { 5325 if (IPIF_IS_CONDEMNED(ipif)) 5326 continue; 5327 ipif_refhold_locked(ipif); 5328 mutex_exit(&ill->ill_lock); 5329 return (ipif); 5330 } 5331 mutex_exit(&ill->ill_lock); 5332 return (NULL); 5333 } 5334 5335 /* 5336 * TODO: make this table extendible at run time 5337 * Return a pointer to the mac type info for 'mac_type' 5338 */ 5339 static ip_m_t * 5340 ip_m_lookup(t_uscalar_t mac_type) 5341 { 5342 ip_m_t *ipm; 5343 5344 for (ipm = ip_m_tbl; ipm < A_END(ip_m_tbl); ipm++) 5345 if (ipm->ip_m_mac_type == mac_type) 5346 return (ipm); 5347 return (NULL); 5348 } 5349 5350 /* 5351 * Make a link layer address from the multicast IP address *addr. 5352 * To form the link layer address, invoke the ip_m_v*mapping function 5353 * associated with the link-layer type. 5354 */ 5355 void 5356 ip_mcast_mapping(ill_t *ill, uchar_t *addr, uchar_t *hwaddr) 5357 { 5358 ip_m_t *ipm; 5359 5360 if (ill->ill_net_type == IRE_IF_NORESOLVER) 5361 return; 5362 5363 ASSERT(addr != NULL); 5364 5365 ipm = ip_m_lookup(ill->ill_mactype); 5366 if (ipm == NULL || 5367 (ill->ill_isv6 && ipm->ip_m_v6mapping == NULL) || 5368 (!ill->ill_isv6 && ipm->ip_m_v4mapping == NULL)) { 5369 ip0dbg(("no mapping for ill %s mactype 0x%x\n", 5370 ill->ill_name, ill->ill_mactype)); 5371 return; 5372 } 5373 if (ill->ill_isv6) 5374 (*ipm->ip_m_v6mapping)(ill, addr, hwaddr); 5375 else 5376 (*ipm->ip_m_v4mapping)(ill, addr, hwaddr); 5377 } 5378 5379 /* 5380 * Returns B_FALSE if the IPv4 netmask pointed to by `mask' is non-contiguous. 5381 * Otherwise returns B_TRUE. 5382 * 5383 * The netmask can be verified to be contiguous with 32 shift-and-OR 5384 * operations. Take the contiguous mask (in host byte order) and compute 5385 * mask | mask << 1 | mask << 2 | ... | mask << 31 5386 * the result will be the same as the 'mask' for a contiguous mask.
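* For example (a quick check of the identity above): the contiguous mask 0xffffff00 (255.255.255.0) ORs back to 0xffffff00 and passes, while the non-contiguous 0xff00ff00 (255.0.255.0) ORs up to 0xffffff00 != 0xff00ff00 and is rejected.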
*/ 5388 static boolean_t 5389 ip_contiguous_mask(uint32_t mask) 5390 { 5391 uint32_t m = mask; 5392 int i; 5393 5394 for (i = 1; i < 32; i++) 5395 m |= (mask << i); 5396 5397 return (m == mask); 5398 } 5399 5400 /* 5401 * ip_rt_add is called to add an IPv4 route to the forwarding table. 5402 * ill is passed in to associate it with the correct interface. 5403 * If ire_arg is set, then we return the held IRE in that location. 5404 */ 5405 int 5406 ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, 5407 ipaddr_t src_addr, int flags, ill_t *ill, ire_t **ire_arg, 5408 boolean_t ioctl_msg, struct rtsa_s *sp, ip_stack_t *ipst, zoneid_t zoneid) 5409 { 5410 ire_t *ire, *nire; 5411 ire_t *gw_ire = NULL; 5412 ipif_t *ipif = NULL; 5413 uint_t type; 5414 int match_flags = MATCH_IRE_TYPE; 5415 tsol_gc_t *gc = NULL; 5416 tsol_gcgrp_t *gcgrp = NULL; 5417 boolean_t gcgrp_xtraref = B_FALSE; 5418 boolean_t cgtp_broadcast; 5419 boolean_t unbound = B_FALSE; 5420 5421 ip1dbg(("ip_rt_add:")); 5422 5423 if (ire_arg != NULL) 5424 *ire_arg = NULL; 5425 5426 /* disallow non-contiguous netmasks */ 5427 if (!ip_contiguous_mask(ntohl(mask))) 5428 return (ENOTSUP); 5429 5430 /* 5431 * If this is the case of RTF_HOST being set, then we set the netmask 5432 * to all ones (regardless of whether one was supplied). 5433 */ 5434 if (flags & RTF_HOST) 5435 mask = IP_HOST_MASK; 5436 5437 /* 5438 * Prevent routes with a zero gateway from being created (since 5439 * interfaces can currently be plumbed and brought up with no assigned 5440 * address). 5441 */ 5442 if (gw_addr == 0) 5443 return (ENETUNREACH); 5444 /* 5445 * Get the ipif, if any, corresponding to the gw_addr. 5446 * If -ifp was specified we restrict ourselves to the ill, otherwise 5447 * we match on the gateway and destination to handle unnumbered pt-pt 5448 * interfaces. 5449 */ 5450 if (ill != NULL) 5451 ipif = ipif_lookup_addr(gw_addr, ill, ALL_ZONES, ipst); 5452 else 5453 ipif = ipif_lookup_interface(gw_addr, dst_addr, ipst); 5454 if (ipif != NULL) { 5455 if (IS_VNI(ipif->ipif_ill)) { 5456 ipif_refrele(ipif); 5457 return (EINVAL); 5458 } 5459 } 5460 5461 /* 5462 * GateD will attempt to create routes with a loopback interface 5463 * address as the gateway and with RTF_GATEWAY set. We allow 5464 * these routes to be added, but create them as interface routes 5465 * since the gateway is an interface address. 5466 */ 5467 if ((ipif != NULL) && (ipif->ipif_ire_type == IRE_LOOPBACK)) { 5468 flags &= ~RTF_GATEWAY; 5469 if (gw_addr == INADDR_LOOPBACK && dst_addr == INADDR_LOOPBACK && 5470 mask == IP_HOST_MASK) { 5471 ire = ire_ftable_lookup_v4(dst_addr, 0, 0, IRE_LOOPBACK, 5472 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst, 5473 NULL); 5474 if (ire != NULL) { 5475 ire_refrele(ire); 5476 ipif_refrele(ipif); 5477 return (EEXIST); 5478 } 5479 ip1dbg(("ip_rt_add: 0x%p creating IRE 0x%x" 5480 "for 0x%x\n", (void *)ipif, 5481 ipif->ipif_ire_type, 5482 ntohl(ipif->ipif_lcl_addr))); 5483 ire = ire_create( 5484 (uchar_t *)&dst_addr, /* dest address */ 5485 (uchar_t *)&mask, /* mask */ 5486 NULL, /* no gateway */ 5487 ipif->ipif_ire_type, /* LOOPBACK */ 5488 ipif->ipif_ill, 5489 zoneid, 5490 (ipif->ipif_flags & IPIF_PRIVATE) ? RTF_PRIVATE : 0, 5491 NULL, 5492 ipst); 5493 5494 if (ire == NULL) { 5495 ipif_refrele(ipif); 5496 return (ENOMEM); 5497 } 5498 /* src address assigned by the caller?
*/ 5499 if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC)) 5500 ire->ire_setsrc_addr = src_addr; 5501 5502 nire = ire_add(ire); 5503 if (nire == NULL) { 5504 /* 5505 * In the event of failure, ire_add() will have 5506 * already deleted the ire in question, so there 5507 * is no need to do that here. 5508 */ 5509 ipif_refrele(ipif); 5510 return (ENOMEM); 5511 } 5512 /* 5513 * Check if it was a duplicate entry. This handles 5514 * the case of two racing route adds for the same route 5515 */ 5516 if (nire != ire) { 5517 ASSERT(nire->ire_identical_ref > 1); 5518 ire_delete(nire); 5519 ire_refrele(nire); 5520 ipif_refrele(ipif); 5521 return (EEXIST); 5522 } 5523 ire = nire; 5524 goto save_ire; 5525 } 5526 } 5527 5528 /* 5529 * The routes for multicast with CGTP are quite special in that 5530 * the gateway is the local interface address, yet RTF_GATEWAY 5531 * is set. We turn off RTF_GATEWAY to provide compatibility with 5532 * this undocumented and unusual use of multicast routes. 5533 */ 5534 if ((flags & RTF_MULTIRT) && ipif != NULL) 5535 flags &= ~RTF_GATEWAY; 5536 5537 /* 5538 * Traditionally, interface routes are ones where RTF_GATEWAY isn't set 5539 * and the gateway address provided is one of the system's interface 5540 * addresses. By using the routing socket interface and supplying an 5541 * RTA_IFP sockaddr with an interface index, an alternate method of 5542 * specifying an interface route to be created is available which uses 5543 * the interface index that specifies the outgoing interface rather than 5544 * the address of an outgoing interface (which may not be able to 5545 * uniquely identify an interface). When coupled with the RTF_GATEWAY 5546 * flag, routes can be specified which not only specify the next-hop to 5547 * be used when routing to a certain prefix, but also which outgoing 5548 * interface should be used. 5549 * 5550 * Previously, interfaces would have unique addresses assigned to them 5551 * and so the address assigned to a particular interface could be used 5552 * to identify a particular interface. One exception to this was the 5553 * case of an unnumbered interface (where IPIF_UNNUMBERED was set). 5554 * 5555 * With the advent of IPv6 and its link-local addresses, this 5556 * restriction was relaxed and interfaces could share addresses between 5557 * themselves. In fact, typically all of the link-local interfaces on 5558 * an IPv6 node or router will have the same link-local address. In 5559 * order to differentiate between these interfaces, the use of an 5560 * interface index is necessary and this index can be carried inside a 5561 * RTA_IFP sockaddr (which is actually a sockaddr_dl). One restriction 5562 * of using the interface index, however, is that all of the ipif's that 5563 * are part of an ill have the same index and so the RTA_IFP sockaddr 5564 * cannot be used to differentiate between ipif's (or logical 5565 * interfaces) that belong to the same ill (physical interface). 5566 * 5567 * For example, in the following case involving IPv4 interfaces and 5568 * logical interfaces 5569 * 5570 * 192.0.2.32 255.255.255.224 192.0.2.33 U if0 5571 * 192.0.2.32 255.255.255.224 192.0.2.34 U if0 5572 * 192.0.2.32 255.255.255.224 192.0.2.35 U if0 5573 * 5574 * the ipif's corresponding to each of these interface routes can be 5575 * uniquely identified by the "gateway" (actually interface address).
5576 * 5577 * In this case involving multiple IPv6 default routes to a particular 5578 * link-local gateway, the use of RTA_IFP is necessary to specify which 5579 * default route is of interest: 5580 * 5581 * default fe80::123:4567:89ab:cdef U if0 5582 * default fe80::123:4567:89ab:cdef U if1 5583 */ 5584 5585 /* RTF_GATEWAY not set */ 5586 if (!(flags & RTF_GATEWAY)) { 5587 if (sp != NULL) { 5588 ip2dbg(("ip_rt_add: gateway security attributes " 5589 "cannot be set with interface route\n")); 5590 if (ipif != NULL) 5591 ipif_refrele(ipif); 5592 return (EINVAL); 5593 } 5594 5595 /* 5596 * Whether or not ill (RTA_IFP) is set, we require that 5597 * the gateway is one of our local addresses. 5598 */ 5599 if (ipif == NULL) 5600 return (ENETUNREACH); 5601 5602 /* 5603 * We use MATCH_IRE_ILL here. If the caller specified an 5604 * interface (from the RTA_IFP sockaddr) we use it, otherwise 5605 * we use the ill derived from the gateway address. 5606 * We can always match the gateway address since we record it 5607 * in ire_gateway_addr. 5608 * We don't allow RTA_IFP to specify a different ill than the 5609 * one matching the ipif to make sure we can delete the route. 5610 */ 5611 match_flags |= MATCH_IRE_GW | MATCH_IRE_ILL; 5612 if (ill == NULL) { 5613 ill = ipif->ipif_ill; 5614 } else if (ill != ipif->ipif_ill) { 5615 ipif_refrele(ipif); 5616 return (EINVAL); 5617 } 5618 5619 /* 5620 * We check for an existing entry at this point. 5621 * 5622 * Since a netmask isn't passed in via the ioctl interface 5623 * (SIOCADDRT), we don't check for a matching netmask in that 5624 * case. 5625 */ 5626 if (!ioctl_msg) 5627 match_flags |= MATCH_IRE_MASK; 5628 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, 5629 IRE_INTERFACE, ill, ALL_ZONES, NULL, match_flags, 0, ipst, 5630 NULL); 5631 if (ire != NULL) { 5632 ire_refrele(ire); 5633 ipif_refrele(ipif); 5634 return (EEXIST); 5635 } 5636 5637 /* 5638 * Some software (for example, GateD and Sun Cluster) attempts 5639 * to create (what amount to) IRE_PREFIX routes with the 5640 * loopback address as the gateway. This is primarily done to 5641 * set up prefixes with the RTF_REJECT flag set (for example, 5642 * when generating aggregate routes.) 5643 * 5644 * If the IRE type (as defined by ill->ill_net_type) would be 5645 * IRE_LOOPBACK, then we map the request into an 5646 * IRE_IF_NORESOLVER. We also OR in the RTF_BLACKHOLE flag as 5647 * these interface routes, by definition, can only be that. 5648 * 5649 * Needless to say, the real IRE_LOOPBACK is NOT created by this 5650 * routine, but rather using ire_create() directly. 5651 * 5652 */ 5653 type = ill->ill_net_type; 5654 if (type == IRE_LOOPBACK) { 5655 type = IRE_IF_NORESOLVER; 5656 flags |= RTF_BLACKHOLE; 5657 } 5658 5659 /* 5660 * Create a copy of the IRE_IF_NORESOLVER or 5661 * IRE_IF_RESOLVER with the modified address, netmask, and 5662 * gateway. 5663 */ 5664 ire = ire_create( 5665 (uchar_t *)&dst_addr, 5666 (uint8_t *)&mask, 5667 (uint8_t *)&gw_addr, 5668 type, 5669 ill, 5670 zoneid, 5671 flags, 5672 NULL, 5673 ipst); 5674 if (ire == NULL) { 5675 ipif_refrele(ipif); 5676 return (ENOMEM); 5677 } 5678 5679 /* src address assigned by the caller? */ 5680 if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC)) 5681 ire->ire_setsrc_addr = src_addr; 5682 5683 nire = ire_add(ire); 5684 if (nire == NULL) { 5685 /* 5686 * In the event of failure, ire_add() will have 5687 * already deleted the ire in question, so there 5688 * is no need to do that here.
5689 */ 5690 ipif_refrele(ipif); 5691 return (ENOMEM); 5692 } 5693 /* 5694 * Check if it was a duplicate entry. This handles 5695 * the case of two racing route adds for the same route 5696 */ 5697 if (nire != ire) { 5698 ire_delete(nire); 5699 ire_refrele(nire); 5700 ipif_refrele(ipif); 5701 return (EEXIST); 5702 } 5703 ire = nire; 5704 goto save_ire; 5705 } 5706 5707 /* 5708 * Get an interface IRE for the specified gateway. 5709 * If we don't have an IRE_IF_NORESOLVER or IRE_IF_RESOLVER for the 5710 * gateway, it is currently unreachable and we fail the request 5711 * accordingly. We reject any RTF_GATEWAY routes where the gateway 5712 * is an IRE_LOCAL or IRE_LOOPBACK. 5713 * If RTA_IFP was specified we look on that particular ill. 5714 */ 5715 if (ill != NULL) 5716 match_flags |= MATCH_IRE_ILL; 5717 5718 /* Check whether the gateway is reachable. */ 5719 again: 5720 type = IRE_INTERFACE | IRE_LOCAL | IRE_LOOPBACK; 5721 if (flags & RTF_INDIRECT) 5722 type |= IRE_OFFLINK; 5723 5724 gw_ire = ire_ftable_lookup_v4(gw_addr, 0, 0, type, ill, 5725 ALL_ZONES, NULL, match_flags, 0, ipst, NULL); 5726 if (gw_ire == NULL) { 5727 /* 5728 * With IPMP, we allow host routes to influence in.mpathd's 5729 * target selection. However, if the test addresses are on 5730 * their own network, the above lookup will fail since the 5731 * underlying IRE_INTERFACEs are marked hidden. So allow 5732 * hidden test IREs to be found and try again. 5733 */ 5734 if (!(match_flags & MATCH_IRE_TESTHIDDEN)) { 5735 match_flags |= MATCH_IRE_TESTHIDDEN; 5736 goto again; 5737 } 5738 if (ipif != NULL) 5739 ipif_refrele(ipif); 5740 return (ENETUNREACH); 5741 } 5742 if (gw_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) { 5743 ire_refrele(gw_ire); 5744 if (ipif != NULL) 5745 ipif_refrele(ipif); 5746 return (ENETUNREACH); 5747 } 5748 5749 if (ill == NULL && !(flags & RTF_INDIRECT)) { 5750 unbound = B_TRUE; 5751 if (ipst->ips_ip_strict_src_multihoming > 0) 5752 ill = gw_ire->ire_ill; 5753 } 5754 5755 /* 5756 * We create one of three types of IREs as a result of this request 5757 * based on the netmask. A netmask of all ones (which is automatically 5758 * assumed when RTF_HOST is set) results in an IRE_HOST being created. 5759 * An all zeroes netmask implies a default route so an IRE_DEFAULT is 5760 * created. Otherwise, an IRE_PREFIX route is created for the 5761 * destination prefix. 5762 */ 5763 if (mask == IP_HOST_MASK) 5764 type = IRE_HOST; 5765 else if (mask == 0) 5766 type = IRE_DEFAULT; 5767 else 5768 type = IRE_PREFIX; 5769 5770 /* check for a duplicate entry */ 5771 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, type, ill, 5772 ALL_ZONES, NULL, match_flags | MATCH_IRE_MASK | MATCH_IRE_GW, 5773 0, ipst, NULL); 5774 if (ire != NULL) { 5775 if (ipif != NULL) 5776 ipif_refrele(ipif); 5777 ire_refrele(gw_ire); 5778 ire_refrele(ire); 5779 return (EEXIST); 5780 } 5781 5782 /* Security attribute exists */ 5783 if (sp != NULL) { 5784 tsol_gcgrp_addr_t ga; 5785 5786 /* find or create the gateway credentials group */ 5787 ga.ga_af = AF_INET; 5788 IN6_IPADDR_TO_V4MAPPED(gw_addr, &ga.ga_addr); 5789 5790 /* we hold reference to it upon success */ 5791 gcgrp = gcgrp_lookup(&ga, B_TRUE); 5792 if (gcgrp == NULL) { 5793 if (ipif != NULL) 5794 ipif_refrele(ipif); 5795 ire_refrele(gw_ire); 5796 return (ENOMEM); 5797 } 5798 5799 /* 5800 * Create and add the security attribute to the group; a 5801 * reference to the group is made upon allocating a new 5802 * entry successfully. 
If it finds an already-existing
5803		 * entry for the security attribute in the group, it simply
5804		 * returns it and no new reference is made to the group.
5805		 */
5806		gc = gc_create(sp, gcgrp, &gcgrp_xtraref);
5807		if (gc == NULL) {
5808			if (ipif != NULL)
5809				ipif_refrele(ipif);
5810			/* release reference held by gcgrp_lookup */
5811			GCGRP_REFRELE(gcgrp);
5812			ire_refrele(gw_ire);
5813			return (ENOMEM);
5814		}
5815	}
5816
5817	/* Create the IRE. */
5818	ire = ire_create(
5819	    (uchar_t *)&dst_addr,		/* dest address */
5820	    (uchar_t *)&mask,			/* mask */
5821	    (uchar_t *)&gw_addr,		/* gateway address */
5822	    (ushort_t)type,			/* IRE type */
5823	    ill,
5824	    zoneid,
5825	    flags,
5826	    gc,					/* security attribute */
5827	    ipst);
5828
5829	/*
5830	 * The ire holds a reference to the 'gc' and the 'gc' holds a
5831	 * reference to the 'gcgrp'. We can now release the extra reference
5832	 * the 'gcgrp' acquired in the gcgrp_lookup, if it was not used.
5833	 */
5834	if (gcgrp_xtraref)
5835		GCGRP_REFRELE(gcgrp);
5836	if (ire == NULL) {
5837		if (gc != NULL)
5838			GC_REFRELE(gc);
5839		if (ipif != NULL)
5840			ipif_refrele(ipif);
5841		ire_refrele(gw_ire);
5842		return (ENOMEM);
5843	}
5844
5845	/* Before we add, check if an extra CGTP broadcast is needed */
5846	cgtp_broadcast = ((flags & RTF_MULTIRT) &&
5847	    ip_type_v4(ire->ire_addr, ipst) == IRE_BROADCAST);
5848
5849	/* src address assigned by the caller? */
5850	if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC))
5851		ire->ire_setsrc_addr = src_addr;
5852
5853	ire->ire_unbound = unbound;
5854
5855	/*
5856	 * POLICY: should we allow an RTF_HOST with address INADDR_ANY?
5857	 * SunOS socket code does, but do we really want to allow 0.0.0.0?
5858	 */
5859
5860	/* Add the new IRE. */
5861	nire = ire_add(ire);
5862	if (nire == NULL) {
5863		/*
5864		 * In the event of failure, ire_add() will have
5865		 * already deleted the ire in question, so there
5866		 * is no need to do that here.
5867		 */
5868		if (ipif != NULL)
5869			ipif_refrele(ipif);
5870		ire_refrele(gw_ire);
5871		return (ENOMEM);
5872	}
5873	/*
5874	 * Check if it was a duplicate entry. This handles
5875	 * the case of two racing route adds for the same route.
5876	 */
5877	if (nire != ire) {
5878		ire_delete(nire);
5879		ire_refrele(nire);
5880		if (ipif != NULL)
5881			ipif_refrele(ipif);
5882		ire_refrele(gw_ire);
5883		return (EEXIST);
5884	}
5885	ire = nire;
5886
5887	if (flags & RTF_MULTIRT) {
5888		/*
5889		 * Invoke the CGTP (multirouting) filtering module
5890		 * to add the dst address to the filtering database.
5891		 * Replicated inbound packets coming from that address
5892		 * will be filtered to discard the duplicates.
5893		 * It is not necessary to call the CGTP filter hook
5894		 * when the dst address is a broadcast or multicast,
5895		 * because an IP source address cannot be a broadcast
5896		 * or a multicast.
5897 */ 5898 if (cgtp_broadcast) { 5899 ip_cgtp_bcast_add(ire, ipst); 5900 goto save_ire; 5901 } 5902 if (ipst->ips_ip_cgtp_filter_ops != NULL && 5903 !CLASSD(ire->ire_addr)) { 5904 int res; 5905 ipif_t *src_ipif; 5906 5907 /* Find the source address corresponding to gw_ire */ 5908 src_ipif = ipif_lookup_addr(gw_ire->ire_gateway_addr, 5909 NULL, zoneid, ipst); 5910 if (src_ipif != NULL) { 5911 res = ipst->ips_ip_cgtp_filter_ops-> 5912 cfo_add_dest_v4( 5913 ipst->ips_netstack->netstack_stackid, 5914 ire->ire_addr, 5915 ire->ire_gateway_addr, 5916 ire->ire_setsrc_addr, 5917 src_ipif->ipif_lcl_addr); 5918 ipif_refrele(src_ipif); 5919 } else { 5920 res = EADDRNOTAVAIL; 5921 } 5922 if (res != 0) { 5923 if (ipif != NULL) 5924 ipif_refrele(ipif); 5925 ire_refrele(gw_ire); 5926 ire_delete(ire); 5927 ire_refrele(ire); /* Held in ire_add */ 5928 return (res); 5929 } 5930 } 5931 } 5932 5933 save_ire: 5934 if (gw_ire != NULL) { 5935 ire_refrele(gw_ire); 5936 gw_ire = NULL; 5937 } 5938 if (ill != NULL) { 5939 /* 5940 * Save enough information so that we can recreate the IRE if 5941 * the interface goes down and then up. The metrics associated 5942 * with the route will be saved as well when rts_setmetrics() is 5943 * called after the IRE has been created. In the case where 5944 * memory cannot be allocated, none of this information will be 5945 * saved. 5946 */ 5947 ill_save_ire(ill, ire); 5948 } 5949 if (ioctl_msg) 5950 ip_rts_rtmsg(RTM_OLDADD, ire, 0, ipst); 5951 if (ire_arg != NULL) { 5952 /* 5953 * Store the ire that was successfully added into where ire_arg 5954 * points to so that callers don't have to look it up 5955 * themselves (but they are responsible for ire_refrele()ing 5956 * the ire when they are finished with it). 5957 */ 5958 *ire_arg = ire; 5959 } else { 5960 ire_refrele(ire); /* Held in ire_add */ 5961 } 5962 if (ipif != NULL) 5963 ipif_refrele(ipif); 5964 return (0); 5965 } 5966 5967 /* 5968 * ip_rt_delete is called to delete an IPv4 route. 5969 * ill is passed in to associate it with the correct interface. 5970 */ 5971 /* ARGSUSED4 */ 5972 int 5973 ip_rt_delete(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, 5974 uint_t rtm_addrs, int flags, ill_t *ill, boolean_t ioctl_msg, 5975 ip_stack_t *ipst, zoneid_t zoneid) 5976 { 5977 ire_t *ire = NULL; 5978 ipif_t *ipif; 5979 uint_t type; 5980 uint_t match_flags = MATCH_IRE_TYPE; 5981 int err = 0; 5982 5983 ip1dbg(("ip_rt_delete:")); 5984 /* 5985 * If this is the case of RTF_HOST being set, then we set the netmask 5986 * to all ones. Otherwise, we use the netmask if one was supplied. 5987 */ 5988 if (flags & RTF_HOST) { 5989 mask = IP_HOST_MASK; 5990 match_flags |= MATCH_IRE_MASK; 5991 } else if (rtm_addrs & RTA_NETMASK) { 5992 match_flags |= MATCH_IRE_MASK; 5993 } 5994 5995 /* 5996 * Note that RTF_GATEWAY is never set on a delete, therefore 5997 * we check if the gateway address is one of our interfaces first, 5998 * and fall back on RTF_GATEWAY routes. 5999 * 6000 * This makes it possible to delete an original 6001 * IRE_IF_NORESOLVER/IRE_IF_RESOLVER - consistent with SunOS 4.1. 6002 * However, we have RTF_KERNEL set on the ones created by ipif_up 6003 * and those can not be deleted here. 6004 * 6005 * We use MATCH_IRE_ILL if we know the interface. If the caller 6006 * specified an interface (from the RTA_IFP sockaddr) we use it, 6007 * otherwise we use the ill derived from the gateway address. 6008 * We can always match the gateway address since we record it 6009 * in ire_gateway_addr. 
6010 * 6011 * For more detail on specifying routes by gateway address and by 6012 * interface index, see the comments in ip_rt_add(). 6013 */ 6014 ipif = ipif_lookup_interface(gw_addr, dst_addr, ipst); 6015 if (ipif != NULL) { 6016 ill_t *ill_match; 6017 6018 if (ill != NULL) 6019 ill_match = ill; 6020 else 6021 ill_match = ipif->ipif_ill; 6022 6023 match_flags |= MATCH_IRE_ILL; 6024 if (ipif->ipif_ire_type == IRE_LOOPBACK) { 6025 ire = ire_ftable_lookup_v4(dst_addr, mask, 0, 6026 IRE_LOOPBACK, ill_match, ALL_ZONES, NULL, 6027 match_flags, 0, ipst, NULL); 6028 } 6029 if (ire == NULL) { 6030 match_flags |= MATCH_IRE_GW; 6031 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, 6032 IRE_INTERFACE, ill_match, ALL_ZONES, NULL, 6033 match_flags, 0, ipst, NULL); 6034 } 6035 /* Avoid deleting routes created by kernel from an ipif */ 6036 if (ire != NULL && (ire->ire_flags & RTF_KERNEL)) { 6037 ire_refrele(ire); 6038 ire = NULL; 6039 } 6040 6041 /* Restore in case we didn't find a match */ 6042 match_flags &= ~(MATCH_IRE_GW|MATCH_IRE_ILL); 6043 } 6044 6045 if (ire == NULL) { 6046 /* 6047 * At this point, the gateway address is not one of our own 6048 * addresses or a matching interface route was not found. We 6049 * set the IRE type to lookup based on whether 6050 * this is a host route, a default route or just a prefix. 6051 * 6052 * If an ill was passed in, then the lookup is based on an 6053 * interface index so MATCH_IRE_ILL is added to match_flags. 6054 */ 6055 match_flags |= MATCH_IRE_GW; 6056 if (ill != NULL) 6057 match_flags |= MATCH_IRE_ILL; 6058 if (mask == IP_HOST_MASK) 6059 type = IRE_HOST; 6060 else if (mask == 0) 6061 type = IRE_DEFAULT; 6062 else 6063 type = IRE_PREFIX; 6064 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, type, ill, 6065 ALL_ZONES, NULL, match_flags, 0, ipst, NULL); 6066 } 6067 6068 if (ipif != NULL) { 6069 ipif_refrele(ipif); 6070 ipif = NULL; 6071 } 6072 6073 if (ire == NULL) 6074 return (ESRCH); 6075 6076 if (ire->ire_flags & RTF_MULTIRT) { 6077 /* 6078 * Invoke the CGTP (multirouting) filtering module 6079 * to remove the dst address from the filtering database. 6080 * Packets coming from that address will no longer be 6081 * filtered to remove duplicates. 6082 */ 6083 if (ipst->ips_ip_cgtp_filter_ops != NULL) { 6084 err = ipst->ips_ip_cgtp_filter_ops->cfo_del_dest_v4( 6085 ipst->ips_netstack->netstack_stackid, 6086 ire->ire_addr, ire->ire_gateway_addr); 6087 } 6088 ip_cgtp_bcast_delete(ire, ipst); 6089 } 6090 6091 ill = ire->ire_ill; 6092 if (ill != NULL) 6093 ill_remove_saved_ire(ill, ire); 6094 if (ioctl_msg) 6095 ip_rts_rtmsg(RTM_OLDDEL, ire, 0, ipst); 6096 ire_delete(ire); 6097 ire_refrele(ire); 6098 return (err); 6099 } 6100 6101 /* 6102 * ip_siocaddrt is called to complete processing of an SIOCADDRT IOCTL. 
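 *
 * For illustration only (a hypothetical user-level sketch, not part of
 * this file): the request completed here is the classic BSD-style route
 * add issued against an IP socket, roughly:
 *
 *	struct rtentry rt;
 *	struct sockaddr_in *sin;
 *
 *	bzero(&rt, sizeof (rt));
 *	sin = (struct sockaddr_in *)&rt.rt_dst;
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = inet_addr("192.0.2.0");
 *	sin = (struct sockaddr_in *)&rt.rt_gateway;
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = inet_addr("192.0.2.1");
 *	rt.rt_flags = RTF_UP | RTF_GATEWAY;
 *	(void) ioctl(s, SIOCADDRT, &rt);	(s is an open IP socket)
 *
 * Note that struct rtentry carries no netmask, which is why the code
 * below derives one via ip_subnet_mask() unless RTF_HOST is set.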
6103 */ 6104 /* ARGSUSED */ 6105 int 6106 ip_siocaddrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 6107 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 6108 { 6109 ipaddr_t dst_addr; 6110 ipaddr_t gw_addr; 6111 ipaddr_t mask; 6112 int error = 0; 6113 mblk_t *mp1; 6114 struct rtentry *rt; 6115 ipif_t *ipif = NULL; 6116 ip_stack_t *ipst; 6117 6118 ASSERT(q->q_next == NULL); 6119 ipst = CONNQ_TO_IPST(q); 6120 6121 ip1dbg(("ip_siocaddrt:")); 6122 /* Existence of mp1 verified in ip_wput_nondata */ 6123 mp1 = mp->b_cont->b_cont; 6124 rt = (struct rtentry *)mp1->b_rptr; 6125 6126 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr; 6127 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr; 6128 6129 /* 6130 * If the RTF_HOST flag is on, this is a request to assign a gateway 6131 * to a particular host address. In this case, we set the netmask to 6132 * all ones for the particular destination address. Otherwise, 6133 * determine the netmask to be used based on dst_addr and the interfaces 6134 * in use. 6135 */ 6136 if (rt->rt_flags & RTF_HOST) { 6137 mask = IP_HOST_MASK; 6138 } else { 6139 /* 6140 * Note that ip_subnet_mask returns a zero mask in the case of 6141 * default (an all-zeroes address). 6142 */ 6143 mask = ip_subnet_mask(dst_addr, &ipif, ipst); 6144 } 6145 6146 error = ip_rt_add(dst_addr, mask, gw_addr, 0, rt->rt_flags, NULL, NULL, 6147 B_TRUE, NULL, ipst, ALL_ZONES); 6148 if (ipif != NULL) 6149 ipif_refrele(ipif); 6150 return (error); 6151 } 6152 6153 /* 6154 * ip_siocdelrt is called to complete processing of an SIOCDELRT IOCTL. 6155 */ 6156 /* ARGSUSED */ 6157 int 6158 ip_siocdelrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 6159 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 6160 { 6161 ipaddr_t dst_addr; 6162 ipaddr_t gw_addr; 6163 ipaddr_t mask; 6164 int error; 6165 mblk_t *mp1; 6166 struct rtentry *rt; 6167 ipif_t *ipif = NULL; 6168 ip_stack_t *ipst; 6169 6170 ASSERT(q->q_next == NULL); 6171 ipst = CONNQ_TO_IPST(q); 6172 6173 ip1dbg(("ip_siocdelrt:")); 6174 /* Existence of mp1 verified in ip_wput_nondata */ 6175 mp1 = mp->b_cont->b_cont; 6176 rt = (struct rtentry *)mp1->b_rptr; 6177 6178 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr; 6179 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr; 6180 6181 /* 6182 * If the RTF_HOST flag is on, this is a request to delete a gateway 6183 * to a particular host address. In this case, we set the netmask to 6184 * all ones for the particular destination address. Otherwise, 6185 * determine the netmask to be used based on dst_addr and the interfaces 6186 * in use. 6187 */ 6188 if (rt->rt_flags & RTF_HOST) { 6189 mask = IP_HOST_MASK; 6190 } else { 6191 /* 6192 * Note that ip_subnet_mask returns a zero mask in the case of 6193 * default (an all-zeroes address). 6194 */ 6195 mask = ip_subnet_mask(dst_addr, &ipif, ipst); 6196 } 6197 6198 error = ip_rt_delete(dst_addr, mask, gw_addr, 6199 RTA_DST | RTA_GATEWAY | RTA_NETMASK, rt->rt_flags, NULL, B_TRUE, 6200 ipst, ALL_ZONES); 6201 if (ipif != NULL) 6202 ipif_refrele(ipif); 6203 return (error); 6204 } 6205 6206 /* 6207 * Enqueue the mp onto the ipsq, chained by b_next. 6208 * b_prev stores the function to be executed later, and b_queue the queue 6209 * where this mp originated. 
6210 */ 6211 void 6212 ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, 6213 ill_t *pending_ill) 6214 { 6215 conn_t *connp; 6216 ipxop_t *ipx = ipsq->ipsq_xop; 6217 6218 ASSERT(MUTEX_HELD(&ipsq->ipsq_lock)); 6219 ASSERT(MUTEX_HELD(&ipx->ipx_lock)); 6220 ASSERT(func != NULL); 6221 6222 mp->b_queue = q; 6223 mp->b_prev = (void *)func; 6224 mp->b_next = NULL; 6225 6226 switch (type) { 6227 case CUR_OP: 6228 if (ipx->ipx_mptail != NULL) { 6229 ASSERT(ipx->ipx_mphead != NULL); 6230 ipx->ipx_mptail->b_next = mp; 6231 } else { 6232 ASSERT(ipx->ipx_mphead == NULL); 6233 ipx->ipx_mphead = mp; 6234 } 6235 ipx->ipx_mptail = mp; 6236 break; 6237 6238 case NEW_OP: 6239 if (ipsq->ipsq_xopq_mptail != NULL) { 6240 ASSERT(ipsq->ipsq_xopq_mphead != NULL); 6241 ipsq->ipsq_xopq_mptail->b_next = mp; 6242 } else { 6243 ASSERT(ipsq->ipsq_xopq_mphead == NULL); 6244 ipsq->ipsq_xopq_mphead = mp; 6245 } 6246 ipsq->ipsq_xopq_mptail = mp; 6247 ipx->ipx_ipsq_queued = B_TRUE; 6248 break; 6249 6250 case SWITCH_OP: 6251 ASSERT(ipsq->ipsq_swxop != NULL); 6252 /* only one switch operation is currently allowed */ 6253 ASSERT(ipsq->ipsq_switch_mp == NULL); 6254 ipsq->ipsq_switch_mp = mp; 6255 ipx->ipx_ipsq_queued = B_TRUE; 6256 break; 6257 default: 6258 cmn_err(CE_PANIC, "ipsq_enq %d type \n", type); 6259 } 6260 6261 if (CONN_Q(q) && pending_ill != NULL) { 6262 connp = Q_TO_CONN(q); 6263 ASSERT(MUTEX_HELD(&connp->conn_lock)); 6264 connp->conn_oper_pending_ill = pending_ill; 6265 } 6266 } 6267 6268 /* 6269 * Dequeue the next message that requested exclusive access to this IPSQ's 6270 * xop. Specifically: 6271 * 6272 * 1. If we're still processing the current operation on `ipsq', then 6273 * dequeue the next message for the operation (from ipx_mphead), or 6274 * return NULL if there are no queued messages for the operation. 6275 * These messages are queued via CUR_OP to qwriter_ip() and friends. 6276 * 6277 * 2. If the current operation on `ipsq' has completed (ipx_current_ipif is 6278 * not set) see if the ipsq has requested an xop switch. If so, switch 6279 * `ipsq' to a different xop. Xop switches only happen when joining or 6280 * leaving IPMP groups and require a careful dance -- see the comments 6281 * in-line below for details. If we're leaving a group xop or if we're 6282 * joining a group xop and become writer on it, then we proceed to (3). 6283 * Otherwise, we return NULL and exit the xop. 6284 * 6285 * 3. For each IPSQ in the xop, return any switch operation stored on 6286 * ipsq_switch_mp (set via SWITCH_OP); these must be processed before 6287 * any other messages queued on the IPSQ. Otherwise, dequeue the next 6288 * exclusive operation (queued via NEW_OP) stored on ipsq_xopq_mphead. 6289 * Note that if the phyint tied to `ipsq' is not using IPMP there will 6290 * only be one IPSQ in the xop. Otherwise, there will be one IPSQ for 6291 * each phyint in the group, including the IPMP meta-interface phyint. 6292 */ 6293 static mblk_t * 6294 ipsq_dq(ipsq_t *ipsq) 6295 { 6296 ill_t *illv4, *illv6; 6297 mblk_t *mp; 6298 ipsq_t *xopipsq; 6299 ipsq_t *leftipsq = NULL; 6300 ipxop_t *ipx; 6301 phyint_t *phyi = ipsq->ipsq_phyint; 6302 ip_stack_t *ipst = ipsq->ipsq_ipst; 6303 boolean_t emptied = B_FALSE; 6304 6305 /* 6306 * Grab all the locks we need in the defined order (ill_g_lock -> 6307 * ipsq_lock -> ipx_lock); ill_g_lock is needed to use ipsq_next. 6308 */ 6309 rw_enter(&ipst->ips_ill_g_lock, 6310 ipsq->ipsq_swxop != NULL ? 
RW_WRITER : RW_READER); 6311 mutex_enter(&ipsq->ipsq_lock); 6312 ipx = ipsq->ipsq_xop; 6313 mutex_enter(&ipx->ipx_lock); 6314 6315 /* 6316 * Dequeue the next message associated with the current exclusive 6317 * operation, if any. 6318 */ 6319 if ((mp = ipx->ipx_mphead) != NULL) { 6320 ipx->ipx_mphead = mp->b_next; 6321 if (ipx->ipx_mphead == NULL) 6322 ipx->ipx_mptail = NULL; 6323 mp->b_next = (void *)ipsq; 6324 goto out; 6325 } 6326 6327 if (ipx->ipx_current_ipif != NULL) 6328 goto empty; 6329 6330 if (ipsq->ipsq_swxop != NULL) { 6331 /* 6332 * The exclusive operation that is now being completed has 6333 * requested a switch to a different xop. This happens 6334 * when an interface joins or leaves an IPMP group. Joins 6335 * happen through SIOCSLIFGROUPNAME (ip_sioctl_groupname()). 6336 * Leaves happen via SIOCSLIFGROUPNAME, interface unplumb 6337 * (phyint_free()), or interface plumb for an ill type 6338 * not in the IPMP group (ip_rput_dlpi_writer()). 6339 * 6340 * Xop switches are not allowed on the IPMP meta-interface. 6341 */ 6342 ASSERT(phyi == NULL || !(phyi->phyint_flags & PHYI_IPMP)); 6343 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 6344 DTRACE_PROBE1(ipsq__switch, (ipsq_t *), ipsq); 6345 6346 if (ipsq->ipsq_swxop == &ipsq->ipsq_ownxop) { 6347 /* 6348 * We're switching back to our own xop, so we have two 6349 * xop's to drain/exit: our own, and the group xop 6350 * that we are leaving. 6351 * 6352 * First, pull ourselves out of the group ipsq list. 6353 * This is safe since we're writer on ill_g_lock. 6354 */ 6355 ASSERT(ipsq->ipsq_xop != &ipsq->ipsq_ownxop); 6356 6357 xopipsq = ipx->ipx_ipsq; 6358 while (xopipsq->ipsq_next != ipsq) 6359 xopipsq = xopipsq->ipsq_next; 6360 6361 xopipsq->ipsq_next = ipsq->ipsq_next; 6362 ipsq->ipsq_next = ipsq; 6363 ipsq->ipsq_xop = ipsq->ipsq_swxop; 6364 ipsq->ipsq_swxop = NULL; 6365 6366 /* 6367 * Second, prepare to exit the group xop. The actual 6368 * ipsq_exit() is done at the end of this function 6369 * since we cannot hold any locks across ipsq_exit(). 6370 * Note that although we drop the group's ipx_lock, no 6371 * threads can proceed since we're still ipx_writer. 6372 */ 6373 leftipsq = xopipsq; 6374 mutex_exit(&ipx->ipx_lock); 6375 6376 /* 6377 * Third, set ipx to point to our own xop (which was 6378 * inactive and therefore can be entered). 6379 */ 6380 ipx = ipsq->ipsq_xop; 6381 mutex_enter(&ipx->ipx_lock); 6382 ASSERT(ipx->ipx_writer == NULL); 6383 ASSERT(ipx->ipx_current_ipif == NULL); 6384 } else { 6385 /* 6386 * We're switching from our own xop to a group xop. 6387 * The requestor of the switch must ensure that the 6388 * group xop cannot go away (e.g. by ensuring the 6389 * phyint associated with the xop cannot go away). 6390 * 6391 * If we can become writer on our new xop, then we'll 6392 * do the drain. Otherwise, the current writer of our 6393 * new xop will do the drain when it exits. 6394 * 6395 * First, splice ourselves into the group IPSQ list. 6396 * This is safe since we're writer on ill_g_lock. 6397 */ 6398 ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop); 6399 6400 xopipsq = ipsq->ipsq_swxop->ipx_ipsq; 6401 while (xopipsq->ipsq_next != ipsq->ipsq_swxop->ipx_ipsq) 6402 xopipsq = xopipsq->ipsq_next; 6403 6404 xopipsq->ipsq_next = ipsq; 6405 ipsq->ipsq_next = ipsq->ipsq_swxop->ipx_ipsq; 6406 ipsq->ipsq_xop = ipsq->ipsq_swxop; 6407 ipsq->ipsq_swxop = NULL; 6408 6409 /* 6410 * Second, exit our own xop, since it's now unused. 6411 * This is safe since we've got the only reference. 
6412 */ 6413 ASSERT(ipx->ipx_writer == curthread); 6414 ipx->ipx_writer = NULL; 6415 VERIFY(--ipx->ipx_reentry_cnt == 0); 6416 ipx->ipx_ipsq_queued = B_FALSE; 6417 mutex_exit(&ipx->ipx_lock); 6418 6419 /* 6420 * Third, set ipx to point to our new xop, and check 6421 * if we can become writer on it. If we cannot, then 6422 * the current writer will drain the IPSQ group when 6423 * it exits. Our ipsq_xop is guaranteed to be stable 6424 * because we're still holding ipsq_lock. 6425 */ 6426 ipx = ipsq->ipsq_xop; 6427 mutex_enter(&ipx->ipx_lock); 6428 if (ipx->ipx_writer != NULL || 6429 ipx->ipx_current_ipif != NULL) { 6430 goto out; 6431 } 6432 } 6433 6434 /* 6435 * Fourth, become writer on our new ipx before we continue 6436 * with the drain. Note that we never dropped ipsq_lock 6437 * above, so no other thread could've raced with us to 6438 * become writer first. Also, we're holding ipx_lock, so 6439 * no other thread can examine the ipx right now. 6440 */ 6441 ASSERT(ipx->ipx_current_ipif == NULL); 6442 ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL); 6443 VERIFY(ipx->ipx_reentry_cnt++ == 0); 6444 ipx->ipx_writer = curthread; 6445 ipx->ipx_forced = B_FALSE; 6446 #ifdef DEBUG 6447 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); 6448 #endif 6449 } 6450 6451 xopipsq = ipsq; 6452 do { 6453 /* 6454 * So that other operations operate on a consistent and 6455 * complete phyint, a switch message on an IPSQ must be 6456 * handled prior to any other operations on that IPSQ. 6457 */ 6458 if ((mp = xopipsq->ipsq_switch_mp) != NULL) { 6459 xopipsq->ipsq_switch_mp = NULL; 6460 ASSERT(mp->b_next == NULL); 6461 mp->b_next = (void *)xopipsq; 6462 goto out; 6463 } 6464 6465 if ((mp = xopipsq->ipsq_xopq_mphead) != NULL) { 6466 xopipsq->ipsq_xopq_mphead = mp->b_next; 6467 if (xopipsq->ipsq_xopq_mphead == NULL) 6468 xopipsq->ipsq_xopq_mptail = NULL; 6469 mp->b_next = (void *)xopipsq; 6470 goto out; 6471 } 6472 } while ((xopipsq = xopipsq->ipsq_next) != ipsq); 6473 empty: 6474 /* 6475 * There are no messages. Further, we are holding ipx_lock, hence no 6476 * new messages can end up on any IPSQ in the xop. 6477 */ 6478 ipx->ipx_writer = NULL; 6479 ipx->ipx_forced = B_FALSE; 6480 VERIFY(--ipx->ipx_reentry_cnt == 0); 6481 ipx->ipx_ipsq_queued = B_FALSE; 6482 emptied = B_TRUE; 6483 #ifdef DEBUG 6484 ipx->ipx_depth = 0; 6485 #endif 6486 out: 6487 mutex_exit(&ipx->ipx_lock); 6488 mutex_exit(&ipsq->ipsq_lock); 6489 6490 /* 6491 * If we completely emptied the xop, then wake up any threads waiting 6492 * to enter any of the IPSQ's associated with it. 6493 */ 6494 if (emptied) { 6495 xopipsq = ipsq; 6496 do { 6497 if ((phyi = xopipsq->ipsq_phyint) == NULL) 6498 continue; 6499 6500 illv4 = phyi->phyint_illv4; 6501 illv6 = phyi->phyint_illv6; 6502 6503 GRAB_ILL_LOCKS(illv4, illv6); 6504 if (illv4 != NULL) 6505 cv_broadcast(&illv4->ill_cv); 6506 if (illv6 != NULL) 6507 cv_broadcast(&illv6->ill_cv); 6508 RELEASE_ILL_LOCKS(illv4, illv6); 6509 } while ((xopipsq = xopipsq->ipsq_next) != ipsq); 6510 } 6511 rw_exit(&ipst->ips_ill_g_lock); 6512 6513 /* 6514 * Now that all locks are dropped, exit the IPSQ we left. 6515 */ 6516 if (leftipsq != NULL) 6517 ipsq_exit(leftipsq); 6518 6519 return (mp); 6520 } 6521 6522 /* 6523 * Return completion status of previously initiated DLPI operations on 6524 * ills in the purview of an ipsq. 
6525	 */
6526	static boolean_t
6527	ipsq_dlpi_done(ipsq_t *ipsq)
6528	{
6529		ipsq_t	*ipsq_start;
6530		phyint_t *phyi;
6531		ill_t	*ill;
6532
6533		ASSERT(RW_LOCK_HELD(&ipsq->ipsq_ipst->ips_ill_g_lock));
6534		ipsq_start = ipsq;
6535
6536		do {
6537			/*
6538			 * The only current users of this function are ipsq_try_enter
6539			 * and ipsq_enter which have made sure that ipsq_writer is
6540			 * NULL before we reach here. ill_dlpi_pending is modified
6541			 * only by an ipsq writer.
6542			 */
6543			ASSERT(ipsq->ipsq_xop->ipx_writer == NULL);
6544			phyi = ipsq->ipsq_phyint;
6545			/*
6546			 * phyi could be NULL if a phyint that is part of an
6547			 * IPMP group is being unplumbed. A more detailed
6548			 * comment is in ipmp_grp_update_kstats().
6549			 */
6550			if (phyi != NULL) {
6551				ill = phyi->phyint_illv4;
6552				if (ill != NULL &&
6553				    (ill->ill_dlpi_pending != DL_PRIM_INVAL ||
6554				    ill->ill_arl_dlpi_pending))
6555					return (B_FALSE);
6556
6557				ill = phyi->phyint_illv6;
6558				if (ill != NULL &&
6559				    ill->ill_dlpi_pending != DL_PRIM_INVAL)
6560					return (B_FALSE);
6561			}
6562
6563		} while ((ipsq = ipsq->ipsq_next) != ipsq_start);
6564
6565		return (B_TRUE);
6566	}
6567
6568	/*
6569	 * Enter the ipsq corresponding to ill, by waiting synchronously till
6570	 * we can enter the ipsq exclusively. Unless 'force' is used, the ipsq
6571	 * will have to drain completely before ipsq_enter returns success.
6572	 * ipx_current_ipif will be set if some exclusive op is in progress,
6573	 * and the ipsq_exit logic will start the next enqueued op after
6574	 * completion of the current op. If 'force' is used, we don't wait
6575	 * for the enqueued ops. This is needed when a conn_close wants to
6576	 * enter the ipsq and abort an ioctl that is somehow stuck. Unplumb
6577	 * of an ill can also use this option, but we don't use it currently.
6578	 */
6579	#define	ENTER_SQ_WAIT_TICKS 100
6580	boolean_t
6581	ipsq_enter(ill_t *ill, boolean_t force, int type)
6582	{
6583		ipsq_t	*ipsq;
6584		ipxop_t	*ipx;
6585		boolean_t waited_enough = B_FALSE;
6586		ip_stack_t *ipst = ill->ill_ipst;
6587
6588		/*
6589		 * Note that the relationship between ill and ipsq is fixed as long as
6590		 * the ill is not ILL_CONDEMNED. Holding ipsq_lock ensures the
6591		 * relationship between the IPSQ and xop cannot change. However,
6592		 * since we cannot hold ipsq_lock across the cv_wait(), it may change
6593		 * while we're waiting. We wait on ill_cv and rely on ipsq_exit()
6594		 * waking up all ills in the xop when it becomes available.
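	 *
	 * An illustrative sketch of a caller (hypothetical, not a specific
	 * caller in this file):
	 *
	 *	if (ipsq_enter(ill, B_FALSE, NEW_OP)) {
	 *		... operate on the ill as writer ...
	 *		ipsq_exit(ill->ill_phyint->phyint_ipsq);
	 *	}
	 *
	 * A B_FALSE return means the ill was condemned while we waited.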
6595 */ 6596 for (;;) { 6597 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 6598 mutex_enter(&ill->ill_lock); 6599 if (ill->ill_state_flags & ILL_CONDEMNED) { 6600 mutex_exit(&ill->ill_lock); 6601 rw_exit(&ipst->ips_ill_g_lock); 6602 return (B_FALSE); 6603 } 6604 6605 ipsq = ill->ill_phyint->phyint_ipsq; 6606 mutex_enter(&ipsq->ipsq_lock); 6607 ipx = ipsq->ipsq_xop; 6608 mutex_enter(&ipx->ipx_lock); 6609 6610 if (ipx->ipx_writer == NULL && (type == CUR_OP || 6611 (ipx->ipx_current_ipif == NULL && ipsq_dlpi_done(ipsq)) || 6612 waited_enough)) 6613 break; 6614 6615 rw_exit(&ipst->ips_ill_g_lock); 6616 6617 if (!force || ipx->ipx_writer != NULL) { 6618 mutex_exit(&ipx->ipx_lock); 6619 mutex_exit(&ipsq->ipsq_lock); 6620 cv_wait(&ill->ill_cv, &ill->ill_lock); 6621 } else { 6622 mutex_exit(&ipx->ipx_lock); 6623 mutex_exit(&ipsq->ipsq_lock); 6624 (void) cv_reltimedwait(&ill->ill_cv, 6625 &ill->ill_lock, ENTER_SQ_WAIT_TICKS, TR_CLOCK_TICK); 6626 waited_enough = B_TRUE; 6627 } 6628 mutex_exit(&ill->ill_lock); 6629 } 6630 6631 ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL); 6632 ASSERT(ipx->ipx_reentry_cnt == 0); 6633 ipx->ipx_writer = curthread; 6634 ipx->ipx_forced = (ipx->ipx_current_ipif != NULL); 6635 ipx->ipx_reentry_cnt++; 6636 #ifdef DEBUG 6637 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); 6638 #endif 6639 mutex_exit(&ipx->ipx_lock); 6640 mutex_exit(&ipsq->ipsq_lock); 6641 mutex_exit(&ill->ill_lock); 6642 rw_exit(&ipst->ips_ill_g_lock); 6643 6644 return (B_TRUE); 6645 } 6646 6647 /* 6648 * ipif_set_values() has a constraint that it cannot drop the ips_ill_g_lock 6649 * across the call to the core interface ipsq_try_enter() and hence calls this 6650 * function directly. This is explained more fully in ipif_set_values(). 6651 * In order to support the above constraint, ipsq_try_enter is implemented as 6652 * a wrapper that grabs the ips_ill_g_lock and calls this function subsequently 6653 */ 6654 static ipsq_t * 6655 ipsq_try_enter_internal(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func, 6656 int type, boolean_t reentry_ok) 6657 { 6658 ipsq_t *ipsq; 6659 ipxop_t *ipx; 6660 ip_stack_t *ipst = ill->ill_ipst; 6661 6662 /* 6663 * lock ordering: 6664 * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock -> ipx_lock. 6665 * 6666 * ipx of an ipsq can't change when ipsq_lock is held. 6667 */ 6668 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); 6669 GRAB_CONN_LOCK(q); 6670 mutex_enter(&ill->ill_lock); 6671 ipsq = ill->ill_phyint->phyint_ipsq; 6672 mutex_enter(&ipsq->ipsq_lock); 6673 ipx = ipsq->ipsq_xop; 6674 mutex_enter(&ipx->ipx_lock); 6675 6676 /* 6677 * 1. Enter the ipsq if we are already writer and reentry is ok. 6678 * (Note: If the caller does not specify reentry_ok then neither 6679 * 'func' nor any of its callees must ever attempt to enter the ipsq 6680 * again. Otherwise it can lead to an infinite loop 6681 * 2. Enter the ipsq if there is no current writer and this attempted 6682 * entry is part of the current operation 6683 * 3. Enter the ipsq if there is no current writer and this is a new 6684 * operation and the operation queue is empty and there is no 6685 * operation currently in progress and if all previously initiated 6686 * DLPI operations have completed. 6687 */ 6688 if ((ipx->ipx_writer == curthread && reentry_ok) || 6689 (ipx->ipx_writer == NULL && (type == CUR_OP || (type == NEW_OP && 6690 !ipx->ipx_ipsq_queued && ipx->ipx_current_ipif == NULL && 6691 ipsq_dlpi_done(ipsq))))) { 6692 /* Success. 
 */
6693		ipx->ipx_reentry_cnt++;
6694		ipx->ipx_writer = curthread;
6695		ipx->ipx_forced = B_FALSE;
6696		mutex_exit(&ipx->ipx_lock);
6697		mutex_exit(&ipsq->ipsq_lock);
6698		mutex_exit(&ill->ill_lock);
6699		RELEASE_CONN_LOCK(q);
6700	#ifdef DEBUG
6701		ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
6702	#endif
6703		return (ipsq);
6704	}
6705
6706	if (func != NULL)
6707		ipsq_enq(ipsq, q, mp, func, type, ill);
6708
6709	mutex_exit(&ipx->ipx_lock);
6710	mutex_exit(&ipsq->ipsq_lock);
6711	mutex_exit(&ill->ill_lock);
6712	RELEASE_CONN_LOCK(q);
6713	return (NULL);
6714	}
6715
6716	/*
6717	 * The ipsq_t (ipsq) is the synchronization data structure used to serialize
6718	 * certain critical operations like plumbing (i.e. most set ioctls), etc.
6719	 * There is one ipsq per phyint. The ipsq
6720	 * serializes exclusive ioctls issued by applications on a per ipsq basis in
6721	 * ipsq_xopq_mphead. It also protects against multiple threads executing in
6722	 * the ipsq. Responses from the driver pertain to the current ioctl (say a
6723	 * DL_BIND_ACK in response to a DL_BIND_REQ initiated as part of bringing
6724	 * up the interface) and are enqueued in ipx_mphead.
6725	 *
6726	 * If a thread does not want to reenter the ipsq when it is already writer,
6727	 * it must ensure that neither the specified reentry point to be called
6728	 * later when the ipsq is empty, nor any code path starting from that
6729	 * reentry point, ever tries to enter the ipsq again. Otherwise it can lead
6730	 * to an infinite loop. The reentry point ip_rput_dlpi_writer is an example.
6731	 * When the thread that is currently exclusive finishes, it (ipsq_exit)
6732	 * dequeues the requests waiting to become exclusive in ipx_mphead and calls
6733	 * the reentry point. When the list at ipx_mphead becomes empty ipsq_exit
6734	 * proceeds to dequeue the next ioctl in ipsq_xopq_mphead and start the next
6735	 * ioctl if the current ioctl has completed. If the current ioctl is still
6736	 * in progress it simply returns. The current ioctl could be waiting for
6737	 * a response from another module (the driver), or could be waiting for
6738	 * the ipif/ill/ire refcnts to drop to zero. In such a case the ipx_pending_mp
6739	 * and ipx_pending_ipif are set. ipx_current_ipif is set throughout the
6740	 * execution of the ioctl and ipsq_exit does not start the next ioctl unless
6741	 * ipx_current_ipif is NULL, which happens only once the ioctl is complete and
6742	 * all associated DLPI operations have completed.
6743	 */
6744
6745	/*
6746	 * Try to enter the IPSQ corresponding to `ipif' or `ill' exclusively (`ipif'
6747	 * and `ill' cannot both be specified). Returns a pointer to the entered IPSQ
6748	 * on success, or NULL on failure. The caller ensures ipif/ill is valid by
6749	 * refholding it as necessary. If the IPSQ cannot be entered and `func' is
6750	 * non-NULL, then `func' will be called back with `q' and `mp' once the IPSQ
6751	 * can be entered. If `func' is NULL, then `q' and `mp' are ignored.
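 *
 * An illustrative sketch of the retry pattern (hypothetical caller):
 *
 *	ipsq = ipsq_try_enter(ipif, NULL, q, mp, ip_process_ioctl,
 *	    NEW_OP, B_TRUE);
 *	if (ipsq == NULL)
 *		return;		(ip_process_ioctl will be called back
 *				with `q' and `mp' when the IPSQ frees up)
 *	... perform the exclusive operation ...
 *	ipsq_exit(ipsq);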
6752 */ 6753 ipsq_t * 6754 ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp, 6755 ipsq_func_t func, int type, boolean_t reentry_ok) 6756 { 6757 ip_stack_t *ipst; 6758 ipsq_t *ipsq; 6759 6760 /* Only 1 of ipif or ill can be specified */ 6761 ASSERT((ipif != NULL) ^ (ill != NULL)); 6762 6763 if (ipif != NULL) 6764 ill = ipif->ipif_ill; 6765 ipst = ill->ill_ipst; 6766 6767 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 6768 ipsq = ipsq_try_enter_internal(ill, q, mp, func, type, reentry_ok); 6769 rw_exit(&ipst->ips_ill_g_lock); 6770 6771 return (ipsq); 6772 } 6773 6774 /* 6775 * Try to enter the IPSQ corresponding to `ill' as writer. The caller ensures 6776 * ill is valid by refholding it if necessary; we will refrele. If the IPSQ 6777 * cannot be entered, the mp is queued for completion. 6778 */ 6779 void 6780 qwriter_ip(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, 6781 boolean_t reentry_ok) 6782 { 6783 ipsq_t *ipsq; 6784 6785 ipsq = ipsq_try_enter(NULL, ill, q, mp, func, type, reentry_ok); 6786 6787 /* 6788 * Drop the caller's refhold on the ill. This is safe since we either 6789 * entered the IPSQ (and thus are exclusive), or failed to enter the 6790 * IPSQ, in which case we return without accessing ill anymore. This 6791 * is needed because func needs to see the correct refcount. 6792 * e.g. removeif can work only then. 6793 */ 6794 ill_refrele(ill); 6795 if (ipsq != NULL) { 6796 (*func)(ipsq, q, mp, NULL); 6797 ipsq_exit(ipsq); 6798 } 6799 } 6800 6801 /* 6802 * Exit the specified IPSQ. If this is the final exit on it then drain it 6803 * prior to exiting. Caller must be writer on the specified IPSQ. 6804 */ 6805 void 6806 ipsq_exit(ipsq_t *ipsq) 6807 { 6808 mblk_t *mp; 6809 ipsq_t *mp_ipsq; 6810 queue_t *q; 6811 phyint_t *phyi; 6812 ipsq_func_t func; 6813 6814 ASSERT(IAM_WRITER_IPSQ(ipsq)); 6815 6816 ASSERT(ipsq->ipsq_xop->ipx_reentry_cnt >= 1); 6817 if (ipsq->ipsq_xop->ipx_reentry_cnt != 1) { 6818 ipsq->ipsq_xop->ipx_reentry_cnt--; 6819 return; 6820 } 6821 6822 for (;;) { 6823 phyi = ipsq->ipsq_phyint; 6824 mp = ipsq_dq(ipsq); 6825 mp_ipsq = (mp == NULL) ? NULL : (ipsq_t *)mp->b_next; 6826 6827 /* 6828 * If we've changed to a new IPSQ, and the phyint associated 6829 * with the old one has gone away, free the old IPSQ. Note 6830 * that this cannot happen while the IPSQ is in a group. 6831 */ 6832 if (mp_ipsq != ipsq && phyi == NULL) { 6833 ASSERT(ipsq->ipsq_next == ipsq); 6834 ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop); 6835 ipsq_delete(ipsq); 6836 } 6837 6838 if (mp == NULL) 6839 break; 6840 6841 q = mp->b_queue; 6842 func = (ipsq_func_t)mp->b_prev; 6843 ipsq = mp_ipsq; 6844 mp->b_next = mp->b_prev = NULL; 6845 mp->b_queue = NULL; 6846 6847 /* 6848 * If 'q' is an conn queue, it is valid, since we did a 6849 * a refhold on the conn at the start of the ioctl. 6850 * If 'q' is an ill queue, it is valid, since close of an 6851 * ill will clean up its IPSQ. 6852 */ 6853 (*func)(ipsq, q, mp, NULL); 6854 } 6855 } 6856 6857 /* 6858 * Used to start any igmp or mld timers that could not be started 6859 * while holding ill_mcast_lock. The timers can't be started while holding 6860 * the lock, since mld/igmp_start_timers may need to call untimeout() 6861 * which can't be done while holding the lock which the timeout handler 6862 * acquires. Otherwise 6863 * there could be a deadlock since the timeout handlers 6864 * mld_timeout_handler_per_ill/igmp_timeout_handler_per_ill also acquire 6865 * ill_mcast_lock. 
6866 */ 6867 void 6868 ill_mcast_timer_start(ip_stack_t *ipst) 6869 { 6870 int next; 6871 6872 mutex_enter(&ipst->ips_igmp_timer_lock); 6873 next = ipst->ips_igmp_deferred_next; 6874 ipst->ips_igmp_deferred_next = INFINITY; 6875 mutex_exit(&ipst->ips_igmp_timer_lock); 6876 6877 if (next != INFINITY) 6878 igmp_start_timers(next, ipst); 6879 6880 mutex_enter(&ipst->ips_mld_timer_lock); 6881 next = ipst->ips_mld_deferred_next; 6882 ipst->ips_mld_deferred_next = INFINITY; 6883 mutex_exit(&ipst->ips_mld_timer_lock); 6884 6885 if (next != INFINITY) 6886 mld_start_timers(next, ipst); 6887 } 6888 6889 /* 6890 * Start the current exclusive operation on `ipsq'; associate it with `ipif' 6891 * and `ioccmd'. 6892 */ 6893 void 6894 ipsq_current_start(ipsq_t *ipsq, ipif_t *ipif, int ioccmd) 6895 { 6896 ill_t *ill = ipif->ipif_ill; 6897 ipxop_t *ipx = ipsq->ipsq_xop; 6898 6899 ASSERT(IAM_WRITER_IPSQ(ipsq)); 6900 ASSERT(ipx->ipx_current_ipif == NULL); 6901 ASSERT(ipx->ipx_current_ioctl == 0); 6902 6903 ipx->ipx_current_done = B_FALSE; 6904 ipx->ipx_current_ioctl = ioccmd; 6905 mutex_enter(&ipx->ipx_lock); 6906 ipx->ipx_current_ipif = ipif; 6907 mutex_exit(&ipx->ipx_lock); 6908 6909 /* 6910 * Set IPIF_CHANGING on one or more ipifs associated with the 6911 * current exclusive operation. IPIF_CHANGING prevents any new 6912 * references to the ipif (so that the references will eventually 6913 * drop to zero) and also prevents any "get" operations (e.g., 6914 * SIOCGLIFFLAGS) from being able to access the ipif until the 6915 * operation has completed and the ipif is again in a stable state. 6916 * 6917 * For ioctls, IPIF_CHANGING is set on the ipif associated with the 6918 * ioctl. For internal operations (where ioccmd is zero), all ipifs 6919 * on the ill are marked with IPIF_CHANGING since it's unclear which 6920 * ipifs will be affected. 6921 * 6922 * Note that SIOCLIFREMOVEIF is a special case as it sets 6923 * IPIF_CONDEMNED internally after identifying the right ipif to 6924 * operate on. 6925 */ 6926 switch (ioccmd) { 6927 case SIOCLIFREMOVEIF: 6928 break; 6929 case 0: 6930 mutex_enter(&ill->ill_lock); 6931 ipif = ipif->ipif_ill->ill_ipif; 6932 for (; ipif != NULL; ipif = ipif->ipif_next) 6933 ipif->ipif_state_flags |= IPIF_CHANGING; 6934 mutex_exit(&ill->ill_lock); 6935 break; 6936 default: 6937 mutex_enter(&ill->ill_lock); 6938 ipif->ipif_state_flags |= IPIF_CHANGING; 6939 mutex_exit(&ill->ill_lock); 6940 } 6941 } 6942 6943 /* 6944 * Finish the current exclusive operation on `ipsq'. Usually, this will allow 6945 * the next exclusive operation to begin once we ipsq_exit(). However, if 6946 * pending DLPI operations remain, then we will wait for the queue to drain 6947 * before allowing the next exclusive operation to begin. This ensures that 6948 * DLPI operations from one exclusive operation are never improperly processed 6949 * as part of a subsequent exclusive operation. 6950 */ 6951 void 6952 ipsq_current_finish(ipsq_t *ipsq) 6953 { 6954 ipxop_t *ipx = ipsq->ipsq_xop; 6955 t_uscalar_t dlpi_pending = DL_PRIM_INVAL; 6956 ipif_t *ipif = ipx->ipx_current_ipif; 6957 6958 ASSERT(IAM_WRITER_IPSQ(ipsq)); 6959 6960 /* 6961 * For SIOCLIFREMOVEIF, the ipif has been already been blown away 6962 * (but in that case, IPIF_CHANGING will already be clear and no 6963 * pending DLPI messages can remain). 
6964 */ 6965 if (ipx->ipx_current_ioctl != SIOCLIFREMOVEIF) { 6966 ill_t *ill = ipif->ipif_ill; 6967 6968 mutex_enter(&ill->ill_lock); 6969 dlpi_pending = ill->ill_dlpi_pending; 6970 if (ipx->ipx_current_ioctl == 0) { 6971 ipif = ill->ill_ipif; 6972 for (; ipif != NULL; ipif = ipif->ipif_next) 6973 ipif->ipif_state_flags &= ~IPIF_CHANGING; 6974 } else { 6975 ipif->ipif_state_flags &= ~IPIF_CHANGING; 6976 } 6977 mutex_exit(&ill->ill_lock); 6978 } 6979 6980 ASSERT(!ipx->ipx_current_done); 6981 ipx->ipx_current_done = B_TRUE; 6982 ipx->ipx_current_ioctl = 0; 6983 if (dlpi_pending == DL_PRIM_INVAL) { 6984 mutex_enter(&ipx->ipx_lock); 6985 ipx->ipx_current_ipif = NULL; 6986 mutex_exit(&ipx->ipx_lock); 6987 } 6988 } 6989 6990 /* 6991 * The ill is closing. Flush all messages on the ipsq that originated 6992 * from this ill. Usually there wont' be any messages on the ipsq_xopq_mphead 6993 * for this ill since ipsq_enter could not have entered until then. 6994 * New messages can't be queued since the CONDEMNED flag is set. 6995 */ 6996 static void 6997 ipsq_flush(ill_t *ill) 6998 { 6999 queue_t *q; 7000 mblk_t *prev; 7001 mblk_t *mp; 7002 mblk_t *mp_next; 7003 ipxop_t *ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop; 7004 7005 ASSERT(IAM_WRITER_ILL(ill)); 7006 7007 /* 7008 * Flush any messages sent up by the driver. 7009 */ 7010 mutex_enter(&ipx->ipx_lock); 7011 for (prev = NULL, mp = ipx->ipx_mphead; mp != NULL; mp = mp_next) { 7012 mp_next = mp->b_next; 7013 q = mp->b_queue; 7014 if (q == ill->ill_rq || q == ill->ill_wq) { 7015 /* dequeue mp */ 7016 if (prev == NULL) 7017 ipx->ipx_mphead = mp->b_next; 7018 else 7019 prev->b_next = mp->b_next; 7020 if (ipx->ipx_mptail == mp) { 7021 ASSERT(mp_next == NULL); 7022 ipx->ipx_mptail = prev; 7023 } 7024 inet_freemsg(mp); 7025 } else { 7026 prev = mp; 7027 } 7028 } 7029 mutex_exit(&ipx->ipx_lock); 7030 (void) ipsq_pending_mp_cleanup(ill, NULL); 7031 ipsq_xopq_mp_cleanup(ill, NULL); 7032 } 7033 7034 /* 7035 * Parse an ifreq or lifreq struct coming down ioctls and refhold 7036 * and return the associated ipif. 7037 * Return value: 7038 * Non zero: An error has occurred. ci may not be filled out. 7039 * zero : ci is filled out with the ioctl cmd in ci.ci_name, and 7040 * a held ipif in ci.ci_ipif. 7041 */ 7042 int 7043 ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, 7044 cmd_info_t *ci) 7045 { 7046 char *name; 7047 struct ifreq *ifr; 7048 struct lifreq *lifr; 7049 ipif_t *ipif = NULL; 7050 ill_t *ill; 7051 conn_t *connp; 7052 boolean_t isv6; 7053 int err; 7054 mblk_t *mp1; 7055 zoneid_t zoneid; 7056 ip_stack_t *ipst; 7057 7058 if (q->q_next != NULL) { 7059 ill = (ill_t *)q->q_ptr; 7060 isv6 = ill->ill_isv6; 7061 connp = NULL; 7062 zoneid = ALL_ZONES; 7063 ipst = ill->ill_ipst; 7064 } else { 7065 ill = NULL; 7066 connp = Q_TO_CONN(q); 7067 isv6 = (connp->conn_family == AF_INET6); 7068 zoneid = connp->conn_zoneid; 7069 if (zoneid == GLOBAL_ZONEID) { 7070 /* global zone can access ipifs in all zones */ 7071 zoneid = ALL_ZONES; 7072 } 7073 ipst = connp->conn_netstack->netstack_ip; 7074 } 7075 7076 /* Has been checked in ip_wput_nondata */ 7077 mp1 = mp->b_cont->b_cont; 7078 7079 if (ipip->ipi_cmd_type == IF_CMD) { 7080 /* This a old style SIOC[GS]IF* command */ 7081 ifr = (struct ifreq *)mp1->b_rptr; 7082 /* 7083 * Null terminate the string to protect against buffer 7084 * overrun. String was generated by user code and may not 7085 * be trusted. 
7086 */ 7087 ifr->ifr_name[IFNAMSIZ - 1] = '\0'; 7088 name = ifr->ifr_name; 7089 ci->ci_sin = (sin_t *)&ifr->ifr_addr; 7090 ci->ci_sin6 = NULL; 7091 ci->ci_lifr = (struct lifreq *)ifr; 7092 } else { 7093 /* This a new style SIOC[GS]LIF* command */ 7094 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 7095 lifr = (struct lifreq *)mp1->b_rptr; 7096 /* 7097 * Null terminate the string to protect against buffer 7098 * overrun. String was generated by user code and may not 7099 * be trusted. 7100 */ 7101 lifr->lifr_name[LIFNAMSIZ - 1] = '\0'; 7102 name = lifr->lifr_name; 7103 ci->ci_sin = (sin_t *)&lifr->lifr_addr; 7104 ci->ci_sin6 = (sin6_t *)&lifr->lifr_addr; 7105 ci->ci_lifr = lifr; 7106 } 7107 7108 if (ipip->ipi_cmd == SIOCSLIFNAME) { 7109 /* 7110 * The ioctl will be failed if the ioctl comes down 7111 * an conn stream 7112 */ 7113 if (ill == NULL) { 7114 /* 7115 * Not an ill queue, return EINVAL same as the 7116 * old error code. 7117 */ 7118 return (ENXIO); 7119 } 7120 ipif = ill->ill_ipif; 7121 ipif_refhold(ipif); 7122 } else { 7123 /* 7124 * Ensure that ioctls don't see any internal state changes 7125 * caused by set ioctls by deferring them if IPIF_CHANGING is 7126 * set. 7127 */ 7128 ipif = ipif_lookup_on_name_async(name, mi_strlen(name), 7129 isv6, zoneid, q, mp, ip_process_ioctl, &err, ipst); 7130 if (ipif == NULL) { 7131 if (err == EINPROGRESS) 7132 return (err); 7133 err = 0; /* Ensure we don't use it below */ 7134 } 7135 } 7136 7137 /* 7138 * Old style [GS]IFCMD does not admit IPv6 ipif 7139 */ 7140 if (ipif != NULL && ipif->ipif_isv6 && ipip->ipi_cmd_type == IF_CMD) { 7141 ipif_refrele(ipif); 7142 return (ENXIO); 7143 } 7144 7145 if (ipif == NULL && ill != NULL && ill->ill_ipif != NULL && 7146 name[0] == '\0') { 7147 /* 7148 * Handle a or a SIOC?IF* with a null name 7149 * during plumb (on the ill queue before the I_PLINK). 7150 */ 7151 ipif = ill->ill_ipif; 7152 ipif_refhold(ipif); 7153 } 7154 7155 if (ipif == NULL) 7156 return (ENXIO); 7157 7158 DTRACE_PROBE4(ipif__ioctl, char *, "ip_extract_lifreq", 7159 int, ipip->ipi_cmd, ill_t *, ipif->ipif_ill, ipif_t *, ipif); 7160 7161 ci->ci_ipif = ipif; 7162 return (0); 7163 } 7164 7165 /* 7166 * Return the total number of ipifs. 7167 */ 7168 static uint_t 7169 ip_get_numifs(zoneid_t zoneid, ip_stack_t *ipst) 7170 { 7171 uint_t numifs = 0; 7172 ill_t *ill; 7173 ill_walk_context_t ctx; 7174 ipif_t *ipif; 7175 7176 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 7177 ill = ILL_START_WALK_V4(&ctx, ipst); 7178 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 7179 if (IS_UNDER_IPMP(ill)) 7180 continue; 7181 for (ipif = ill->ill_ipif; ipif != NULL; 7182 ipif = ipif->ipif_next) { 7183 if (ipif->ipif_zoneid == zoneid || 7184 ipif->ipif_zoneid == ALL_ZONES) 7185 numifs++; 7186 } 7187 } 7188 rw_exit(&ipst->ips_ill_g_lock); 7189 return (numifs); 7190 } 7191 7192 /* 7193 * Return the total number of ipifs. 
7194 */ 7195 static uint_t 7196 ip_get_numlifs(int family, int lifn_flags, zoneid_t zoneid, ip_stack_t *ipst) 7197 { 7198 uint_t numifs = 0; 7199 ill_t *ill; 7200 ipif_t *ipif; 7201 ill_walk_context_t ctx; 7202 7203 ip1dbg(("ip_get_numlifs(%d %u %d)\n", family, lifn_flags, (int)zoneid)); 7204 7205 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 7206 if (family == AF_INET) 7207 ill = ILL_START_WALK_V4(&ctx, ipst); 7208 else if (family == AF_INET6) 7209 ill = ILL_START_WALK_V6(&ctx, ipst); 7210 else 7211 ill = ILL_START_WALK_ALL(&ctx, ipst); 7212 7213 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 7214 if (IS_UNDER_IPMP(ill) && !(lifn_flags & LIFC_UNDER_IPMP)) 7215 continue; 7216 7217 for (ipif = ill->ill_ipif; ipif != NULL; 7218 ipif = ipif->ipif_next) { 7219 if ((ipif->ipif_flags & IPIF_NOXMIT) && 7220 !(lifn_flags & LIFC_NOXMIT)) 7221 continue; 7222 if ((ipif->ipif_flags & IPIF_TEMPORARY) && 7223 !(lifn_flags & LIFC_TEMPORARY)) 7224 continue; 7225 if (((ipif->ipif_flags & 7226 (IPIF_NOXMIT|IPIF_NOLOCAL| 7227 IPIF_DEPRECATED)) || 7228 IS_LOOPBACK(ill) || 7229 !(ipif->ipif_flags & IPIF_UP)) && 7230 (lifn_flags & LIFC_EXTERNAL_SOURCE)) 7231 continue; 7232 7233 if (zoneid != ipif->ipif_zoneid && 7234 ipif->ipif_zoneid != ALL_ZONES && 7235 (zoneid != GLOBAL_ZONEID || 7236 !(lifn_flags & LIFC_ALLZONES))) 7237 continue; 7238 7239 numifs++; 7240 } 7241 } 7242 rw_exit(&ipst->ips_ill_g_lock); 7243 return (numifs); 7244 } 7245 7246 uint_t 7247 ip_get_lifsrcofnum(ill_t *ill) 7248 { 7249 uint_t numifs = 0; 7250 ill_t *ill_head = ill; 7251 ip_stack_t *ipst = ill->ill_ipst; 7252 7253 /* 7254 * ill_g_usesrc_lock protects ill_usesrc_grp_next, for example, some 7255 * other thread may be trying to relink the ILLs in this usesrc group 7256 * and adjusting the ill_usesrc_grp_next pointers 7257 */ 7258 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER); 7259 if ((ill->ill_usesrc_ifindex == 0) && 7260 (ill->ill_usesrc_grp_next != NULL)) { 7261 for (; (ill != NULL) && (ill->ill_usesrc_grp_next != ill_head); 7262 ill = ill->ill_usesrc_grp_next) 7263 numifs++; 7264 } 7265 rw_exit(&ipst->ips_ill_g_usesrc_lock); 7266 7267 return (numifs); 7268 } 7269 7270 /* Null values are passed in for ipif, sin, and ifreq */ 7271 /* ARGSUSED */ 7272 int 7273 ip_sioctl_get_ifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 7274 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 7275 { 7276 int *nump; 7277 conn_t *connp = Q_TO_CONN(q); 7278 7279 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */ 7280 7281 /* Existence of b_cont->b_cont checked in ip_wput_nondata */ 7282 nump = (int *)mp->b_cont->b_cont->b_rptr; 7283 7284 *nump = ip_get_numifs(connp->conn_zoneid, 7285 connp->conn_netstack->netstack_ip); 7286 ip1dbg(("ip_sioctl_get_ifnum numifs %d", *nump)); 7287 return (0); 7288 } 7289 7290 /* Null values are passed in for ipif, sin, and ifreq */ 7291 /* ARGSUSED */ 7292 int 7293 ip_sioctl_get_lifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, 7294 queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 7295 { 7296 struct lifnum *lifn; 7297 mblk_t *mp1; 7298 conn_t *connp = Q_TO_CONN(q); 7299 7300 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */ 7301 7302 /* Existence checked in ip_wput_nondata */ 7303 mp1 = mp->b_cont->b_cont; 7304 7305 lifn = (struct lifnum *)mp1->b_rptr; 7306 switch (lifn->lifn_family) { 7307 case AF_UNSPEC: 7308 case AF_INET: 7309 case AF_INET6: 7310 break; 7311 default: 7312 return (EAFNOSUPPORT); 7313 } 7314 7315 lifn->lifn_count = ip_get_numlifs(lifn->lifn_family, 
lifn->lifn_flags,
7316	    connp->conn_zoneid, connp->conn_netstack->netstack_ip);
7317	ip1dbg(("ip_sioctl_get_lifnum numifs %d", lifn->lifn_count));
7318	return (0);
7319	}
7320
7321	/* ARGSUSED */
7322	int
7323	ip_sioctl_get_ifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
7324	    mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
7325	{
7326		STRUCT_HANDLE(ifconf, ifc);
7327		mblk_t *mp1;
7328		struct iocblk *iocp;
7329		struct ifreq *ifr;
7330		ill_walk_context_t	ctx;
7331		ill_t	*ill;
7332		ipif_t	*ipif;
7333		struct sockaddr_in *sin;
7334		int32_t	ifclen;
7335		zoneid_t zoneid;
7336		ip_stack_t *ipst = CONNQ_TO_IPST(q);
7337
7338		ASSERT(q->q_next == NULL); /* not valid ioctls for ip as a module */
7339
7340		ip1dbg(("ip_sioctl_get_ifconf"));
7341		/* Existence verified in ip_wput_nondata */
7342		mp1 = mp->b_cont->b_cont;
7343		iocp = (struct iocblk *)mp->b_rptr;
7344		zoneid = Q_TO_CONN(q)->conn_zoneid;
7345
7346		/*
7347		 * The original SIOCGIFCONF passed in a struct ifconf which specified
7348		 * the user buffer address and length into which the list of struct
7349		 * ifreqs was to be copied. Since AT&T Streams does not seem to
7350		 * allow M_COPYOUT to be used in conjunction with I_STR IOCTLS,
7351		 * the SIOCGIFCONF operation was redefined to simply provide
7352		 * a large output buffer into which we are supposed to jam the ifreq
7353		 * array. The same ioctl command code was used, despite the fact that
7354		 * both the applications and the kernel code had to change, thus making
7355		 * it impossible to support both interfaces.
7356		 *
7357		 * For reasons not good enough to try to explain, the following
7358		 * algorithm is used for deciding what to do with one of these:
7359		 * If the IOCTL comes in as an I_STR, it is assumed to be of the new
7360		 * form with the output buffer coming down as the continuation message.
7361		 * If it arrives as a TRANSPARENT IOCTL, it is assumed to be old style,
7362		 * and we have to copy in the ifconf structure to find out how big the
7363		 * output buffer is and where to copy out to. Sure no problem...
7364		 *
7365		 */
7366		STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, NULL);
7367		if ((mp1->b_wptr - mp1->b_rptr) == STRUCT_SIZE(ifc)) {
7368			int numifs = 0;
7369			size_t ifc_bufsize;
7370
7371			/*
7372			 * Must be (better be!) continuation of a TRANSPARENT
7373			 * IOCTL. We just copied in the ifconf structure.
7374			 */
7375			STRUCT_SET_HANDLE(ifc, iocp->ioc_flag,
7376			    (struct ifconf *)mp1->b_rptr);
7377
7378			/*
7379			 * Allocate a buffer to hold requested information.
7380			 *
7381			 * If ifc_len is larger than what is needed, we only
7382			 * allocate what we will use.
7383			 *
7384			 * If ifc_len is smaller than what is needed, return
7385			 * EINVAL.
7386			 *
7387			 * XXX: the ill_t structure can have 2 counters, for
7388			 * v4 and v6 (not just ill_ipif_up_count) to store the
7389			 * number of interfaces for a device, so we don't need
7390			 * to count them here...
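		 *
		 * For example (illustrative numbers only): with 3 matching
		 * ipifs and a 32-byte struct ifreq, the code below wants
		 * ifc_bufsize = 96; if the caller passed ifc_len = 64, an
		 * O_SIOCGIFCONF request fails with EINVAL, while the newer
		 * form quietly truncates the answer to the 64 bytes provided.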
7391 */ 7392 numifs = ip_get_numifs(zoneid, ipst); 7393 7394 ifclen = STRUCT_FGET(ifc, ifc_len); 7395 ifc_bufsize = numifs * sizeof (struct ifreq); 7396 if (ifc_bufsize > ifclen) { 7397 if (iocp->ioc_cmd == O_SIOCGIFCONF) { 7398 /* old behaviour */ 7399 return (EINVAL); 7400 } else { 7401 ifc_bufsize = ifclen; 7402 } 7403 } 7404 7405 mp1 = mi_copyout_alloc(q, mp, 7406 STRUCT_FGETP(ifc, ifc_buf), ifc_bufsize, B_FALSE); 7407 if (mp1 == NULL) 7408 return (ENOMEM); 7409 7410 mp1->b_wptr = mp1->b_rptr + ifc_bufsize; 7411 } 7412 bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr); 7413 /* 7414 * the SIOCGIFCONF ioctl only knows about 7415 * IPv4 addresses, so don't try to tell 7416 * it about interfaces with IPv6-only 7417 * addresses. (Last parm 'isv6' is B_FALSE) 7418 */ 7419 7420 ifr = (struct ifreq *)mp1->b_rptr; 7421 7422 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 7423 ill = ILL_START_WALK_V4(&ctx, ipst); 7424 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 7425 if (IS_UNDER_IPMP(ill)) 7426 continue; 7427 for (ipif = ill->ill_ipif; ipif != NULL; 7428 ipif = ipif->ipif_next) { 7429 if (zoneid != ipif->ipif_zoneid && 7430 ipif->ipif_zoneid != ALL_ZONES) 7431 continue; 7432 if ((uchar_t *)&ifr[1] > mp1->b_wptr) { 7433 if (iocp->ioc_cmd == O_SIOCGIFCONF) { 7434 /* old behaviour */ 7435 rw_exit(&ipst->ips_ill_g_lock); 7436 return (EINVAL); 7437 } else { 7438 goto if_copydone; 7439 } 7440 } 7441 ipif_get_name(ipif, ifr->ifr_name, 7442 sizeof (ifr->ifr_name)); 7443 sin = (sin_t *)&ifr->ifr_addr; 7444 *sin = sin_null; 7445 sin->sin_family = AF_INET; 7446 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 7447 ifr++; 7448 } 7449 } 7450 if_copydone: 7451 rw_exit(&ipst->ips_ill_g_lock); 7452 mp1->b_wptr = (uchar_t *)ifr; 7453 7454 if (STRUCT_BUF(ifc) != NULL) { 7455 STRUCT_FSET(ifc, ifc_len, 7456 (int)((uchar_t *)ifr - mp1->b_rptr)); 7457 } 7458 return (0); 7459 } 7460 7461 /* 7462 * Get the interfaces using the address hosted on the interface passed in, 7463 * as a source adddress 7464 */ 7465 /* ARGSUSED */ 7466 int 7467 ip_sioctl_get_lifsrcof(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 7468 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 7469 { 7470 mblk_t *mp1; 7471 ill_t *ill, *ill_head; 7472 ipif_t *ipif, *orig_ipif; 7473 int numlifs = 0; 7474 size_t lifs_bufsize, lifsmaxlen; 7475 struct lifreq *lifr; 7476 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 7477 uint_t ifindex; 7478 zoneid_t zoneid; 7479 boolean_t isv6 = B_FALSE; 7480 struct sockaddr_in *sin; 7481 struct sockaddr_in6 *sin6; 7482 STRUCT_HANDLE(lifsrcof, lifs); 7483 ip_stack_t *ipst; 7484 7485 ipst = CONNQ_TO_IPST(q); 7486 7487 ASSERT(q->q_next == NULL); 7488 7489 zoneid = Q_TO_CONN(q)->conn_zoneid; 7490 7491 /* Existence verified in ip_wput_nondata */ 7492 mp1 = mp->b_cont->b_cont; 7493 7494 /* 7495 * Must be (better be!) continuation of a TRANSPARENT 7496 * IOCTL. We just copied in the lifsrcof structure. 
7497 */ 7498 STRUCT_SET_HANDLE(lifs, iocp->ioc_flag, 7499 (struct lifsrcof *)mp1->b_rptr); 7500 7501 if (MBLKL(mp1) != STRUCT_SIZE(lifs)) 7502 return (EINVAL); 7503 7504 ifindex = STRUCT_FGET(lifs, lifs_ifindex); 7505 isv6 = (Q_TO_CONN(q))->conn_family == AF_INET6; 7506 ipif = ipif_lookup_on_ifindex(ifindex, isv6, zoneid, ipst); 7507 if (ipif == NULL) { 7508 ip1dbg(("ip_sioctl_get_lifsrcof: no ipif for ifindex %d\n", 7509 ifindex)); 7510 return (ENXIO); 7511 } 7512 7513 /* Allocate a buffer to hold requested information */ 7514 numlifs = ip_get_lifsrcofnum(ipif->ipif_ill); 7515 lifs_bufsize = numlifs * sizeof (struct lifreq); 7516 lifsmaxlen = STRUCT_FGET(lifs, lifs_maxlen); 7517 /* The actual size needed is always returned in lifs_len */ 7518 STRUCT_FSET(lifs, lifs_len, lifs_bufsize); 7519 7520 /* If the amount we need is more than what is passed in, abort */ 7521 if (lifs_bufsize > lifsmaxlen || lifs_bufsize == 0) { 7522 ipif_refrele(ipif); 7523 return (0); 7524 } 7525 7526 mp1 = mi_copyout_alloc(q, mp, 7527 STRUCT_FGETP(lifs, lifs_buf), lifs_bufsize, B_FALSE); 7528 if (mp1 == NULL) { 7529 ipif_refrele(ipif); 7530 return (ENOMEM); 7531 } 7532 7533 mp1->b_wptr = mp1->b_rptr + lifs_bufsize; 7534 bzero(mp1->b_rptr, lifs_bufsize); 7535 7536 lifr = (struct lifreq *)mp1->b_rptr; 7537 7538 ill = ill_head = ipif->ipif_ill; 7539 orig_ipif = ipif; 7540 7541 /* ill_g_usesrc_lock protects ill_usesrc_grp_next */ 7542 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER); 7543 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 7544 7545 ill = ill->ill_usesrc_grp_next; /* start from next ill */ 7546 for (; (ill != NULL) && (ill != ill_head); 7547 ill = ill->ill_usesrc_grp_next) { 7548 7549 if ((uchar_t *)&lifr[1] > mp1->b_wptr) 7550 break; 7551 7552 ipif = ill->ill_ipif; 7553 ipif_get_name(ipif, lifr->lifr_name, sizeof (lifr->lifr_name)); 7554 if (ipif->ipif_isv6) { 7555 sin6 = (sin6_t *)&lifr->lifr_addr; 7556 *sin6 = sin6_null; 7557 sin6->sin6_family = AF_INET6; 7558 sin6->sin6_addr = ipif->ipif_v6lcl_addr; 7559 lifr->lifr_addrlen = ip_mask_to_plen_v6( 7560 &ipif->ipif_v6net_mask); 7561 } else { 7562 sin = (sin_t *)&lifr->lifr_addr; 7563 *sin = sin_null; 7564 sin->sin_family = AF_INET; 7565 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 7566 lifr->lifr_addrlen = ip_mask_to_plen( 7567 ipif->ipif_net_mask); 7568 } 7569 lifr++; 7570 } 7571 rw_exit(&ipst->ips_ill_g_lock); 7572 rw_exit(&ipst->ips_ill_g_usesrc_lock); 7573 ipif_refrele(orig_ipif); 7574 mp1->b_wptr = (uchar_t *)lifr; 7575 STRUCT_FSET(lifs, lifs_len, (int)((uchar_t *)lifr - mp1->b_rptr)); 7576 7577 return (0); 7578 } 7579 7580 /* ARGSUSED */ 7581 int 7582 ip_sioctl_get_lifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 7583 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 7584 { 7585 mblk_t *mp1; 7586 int list; 7587 ill_t *ill; 7588 ipif_t *ipif; 7589 int flags; 7590 int numlifs = 0; 7591 size_t lifc_bufsize; 7592 struct lifreq *lifr; 7593 sa_family_t family; 7594 struct sockaddr_in *sin; 7595 struct sockaddr_in6 *sin6; 7596 ill_walk_context_t ctx; 7597 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 7598 int32_t lifclen; 7599 zoneid_t zoneid; 7600 STRUCT_HANDLE(lifconf, lifc); 7601 ip_stack_t *ipst = CONNQ_TO_IPST(q); 7602 7603 ip1dbg(("ip_sioctl_get_lifconf")); 7604 7605 ASSERT(q->q_next == NULL); 7606 7607 zoneid = Q_TO_CONN(q)->conn_zoneid; 7608 7609 /* Existence verified in ip_wput_nondata */ 7610 mp1 = mp->b_cont->b_cont; 7611 7612 /* 7613 * An extended version of SIOCGIFCONF that takes an 7614 * additional address family and flags field. 
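	 *
	 * For illustration only, a hypothetical caller wanting every
	 * interface visible to it, v4 and v6, might fill in (the flag
	 * semantics follow below):
	 *
	 *	struct lifconf lifc;
	 *
	 *	lifc.lifc_family = AF_UNSPEC;
	 *	lifc.lifc_flags = LIFC_NOXMIT | LIFC_TEMPORARY |
	 *	    LIFC_UNDER_IPMP;
	 *	lifc.lifc_len = bufsize;
	 *	lifc.lifc_buf = buf;
	 *	(void) ioctl(s, SIOCGLIFCONF, &lifc);
	 *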
7615 	 * AF_UNSPEC retrieves both IPv4 and IPv6.
7616 	 * Unless LIFC_NOXMIT is specified the IPIF_NOXMIT
7617 	 * interfaces are omitted.
7618 	 * Similarly, IPIF_TEMPORARY interfaces are omitted
7619 	 * unless LIFC_TEMPORARY is specified.
7620 	 * If LIFC_EXTERNAL_SOURCE is specified, interfaces that are
7621 	 * IPIF_NOXMIT, IPIF_NOLOCAL, PHYI_LOOPBACK, IPIF_DEPRECATED,
7622 	 * or not IPIF_UP are omitted.  LIFC_EXTERNAL_SOURCE
7623 	 * has priority over LIFC_NOXMIT.
7624 	 */
7625 	STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, NULL);
7626 
7627 	if ((mp1->b_wptr - mp1->b_rptr) != STRUCT_SIZE(lifc))
7628 		return (EINVAL);
7629 
7630 	/*
7631 	 * Must be (better be!) continuation of a TRANSPARENT
7632 	 * IOCTL.  We just copied in the lifconf structure.
7633 	 */
7634 	STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, (struct lifconf *)mp1->b_rptr);
7635 
7636 	family = STRUCT_FGET(lifc, lifc_family);
7637 	flags = STRUCT_FGET(lifc, lifc_flags);
7638 
7639 	switch (family) {
7640 	case AF_UNSPEC:
7641 		/*
7642 		 * walk all ILLs.
7643 		 */
7644 		list = MAX_G_HEADS;
7645 		break;
7646 	case AF_INET:
7647 		/*
7648 		 * walk only IPv4 ILLs.
7649 		 */
7650 		list = IP_V4_G_HEAD;
7651 		break;
7652 	case AF_INET6:
7653 		/*
7654 		 * walk only IPv6 ILLs.
7655 		 */
7656 		list = IP_V6_G_HEAD;
7657 		break;
7658 	default:
7659 		return (EAFNOSUPPORT);
7660 	}
7661 
7662 	/*
7663 	 * Allocate a buffer to hold requested information.
7664 	 *
7665 	 * If lifc_len is larger than what is needed, we only
7666 	 * allocate what we will use.
7667 	 *
7668 	 * If lifc_len is smaller than what is needed, return
7669 	 * EINVAL.
7670 	 */
7671 	numlifs = ip_get_numlifs(family, flags, zoneid, ipst);
7672 	lifc_bufsize = numlifs * sizeof (struct lifreq);
7673 	lifclen = STRUCT_FGET(lifc, lifc_len);
7674 	if (lifc_bufsize > lifclen) {
7675 		if (iocp->ioc_cmd == O_SIOCGLIFCONF)
7676 			return (EINVAL);
7677 		else
7678 			lifc_bufsize = lifclen;
7679 	}
7680 
7681 	mp1 = mi_copyout_alloc(q, mp,
7682 	    STRUCT_FGETP(lifc, lifc_buf), lifc_bufsize, B_FALSE);
7683 	if (mp1 == NULL)
7684 		return (ENOMEM);
7685 
7686 	mp1->b_wptr = mp1->b_rptr + lifc_bufsize;
7687 	bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr);
7688 
7689 	lifr = (struct lifreq *)mp1->b_rptr;
7690 
7691 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
7692 	ill = ill_first(list, list, &ctx, ipst);
7693 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
7694 		if (IS_UNDER_IPMP(ill) && !(flags & LIFC_UNDER_IPMP))
7695 			continue;
7696 
7697 		for (ipif = ill->ill_ipif; ipif != NULL;
7698 		    ipif = ipif->ipif_next) {
7699 			if ((ipif->ipif_flags & IPIF_NOXMIT) &&
7700 			    !(flags & LIFC_NOXMIT))
7701 				continue;
7702 
7703 			if ((ipif->ipif_flags & IPIF_TEMPORARY) &&
7704 			    !(flags & LIFC_TEMPORARY))
7705 				continue;
7706 
7707 			if (((ipif->ipif_flags &
7708 			    (IPIF_NOXMIT|IPIF_NOLOCAL|
7709 			    IPIF_DEPRECATED)) ||
7710 			    IS_LOOPBACK(ill) ||
7711 			    !(ipif->ipif_flags & IPIF_UP)) &&
7712 			    (flags & LIFC_EXTERNAL_SOURCE))
7713 				continue;
7714 
7715 			if (zoneid != ipif->ipif_zoneid &&
7716 			    ipif->ipif_zoneid != ALL_ZONES &&
7717 			    (zoneid != GLOBAL_ZONEID ||
7718 			    !(flags & LIFC_ALLZONES)))
7719 				continue;
7720 
7721 			if ((uchar_t *)&lifr[1] > mp1->b_wptr) {
7722 				if (iocp->ioc_cmd == O_SIOCGLIFCONF) {
7723 					rw_exit(&ipst->ips_ill_g_lock);
7724 					return (EINVAL);
7725 				} else {
7726 					goto lif_copydone;
7727 				}
7728 			}
7729 
7730 			ipif_get_name(ipif, lifr->lifr_name,
7731 			    sizeof (lifr->lifr_name));
7732 			lifr->lifr_type = ill->ill_type;
7733 			if (ipif->ipif_isv6) {
7734 				sin6 = (sin6_t *)&lifr->lifr_addr;
7735 				*sin6 = sin6_null;
7736 				sin6->sin6_family = AF_INET6;
7737 				sin6->sin6_addr =
7738 				    ipif->ipif_v6lcl_addr;
7739 				lifr->lifr_addrlen =
7740 				    ip_mask_to_plen_v6(
&ipif->ipif_v6net_mask);
7742 			} else {
7743 				sin = (sin_t *)&lifr->lifr_addr;
7744 				*sin = sin_null;
7745 				sin->sin_family = AF_INET;
7746 				sin->sin_addr.s_addr =
7747 				    ipif->ipif_lcl_addr;
7748 				lifr->lifr_addrlen =
7749 				    ip_mask_to_plen(
7750 				    ipif->ipif_net_mask);
7751 			}
7752 			lifr++;
7753 		}
7754 	}
7755 lif_copydone:
7756 	rw_exit(&ipst->ips_ill_g_lock);
7757 
7758 	mp1->b_wptr = (uchar_t *)lifr;
7759 	if (STRUCT_BUF(lifc) != NULL) {
7760 		STRUCT_FSET(lifc, lifc_len,
7761 		    (int)((uchar_t *)lifr - mp1->b_rptr));
7762 	}
7763 	return (0);
7764 }
7765 
7766 static void
7767 ip_sioctl_ip6addrpolicy(queue_t *q, mblk_t *mp)
7768 {
7769 	ip6_asp_t *table;
7770 	size_t table_size;
7771 	mblk_t *data_mp;
7772 	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
7773 	ip_stack_t	*ipst;
7774 
7775 	if (q->q_next == NULL)
7776 		ipst = CONNQ_TO_IPST(q);
7777 	else
7778 		ipst = ILLQ_TO_IPST(q);
7779 
7780 	/* These two ioctls are I_STR only */
7781 	if (iocp->ioc_count == TRANSPARENT) {
7782 		miocnak(q, mp, 0, EINVAL);
7783 		return;
7784 	}
7785 
7786 	data_mp = mp->b_cont;
7787 	if (data_mp == NULL) {
7788 		/* The user passed us a NULL argument */
7789 		table = NULL;
7790 		table_size = iocp->ioc_count;
7791 	} else {
7792 		/*
7793 		 * The user provided a table.  The stream head
7794 		 * may have copied in the user data in chunks,
7795 		 * so make sure everything is pulled up
7796 		 * properly.
7797 		 */
7798 		if (MBLKL(data_mp) < iocp->ioc_count) {
7799 			mblk_t *new_data_mp;
7800 			if ((new_data_mp = msgpullup(data_mp, -1)) ==
7801 			    NULL) {
7802 				miocnak(q, mp, 0, ENOMEM);
7803 				return;
7804 			}
7805 			freemsg(data_mp);
7806 			data_mp = new_data_mp;
7807 			mp->b_cont = data_mp;
7808 		}
7809 		table = (ip6_asp_t *)data_mp->b_rptr;
7810 		table_size = iocp->ioc_count;
7811 	}
7812 
7813 	switch (iocp->ioc_cmd) {
7814 	case SIOCGIP6ADDRPOLICY:
7815 		iocp->ioc_rval = ip6_asp_get(table, table_size, ipst);
7816 		if (iocp->ioc_rval == -1)
7817 			iocp->ioc_error = EINVAL;
7818 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4
7819 		else if (table != NULL &&
7820 		    (iocp->ioc_flag & IOC_MODELS) == IOC_ILP32) {
7821 			ip6_asp_t *src = table;
7822 			ip6_asp32_t *dst = (void *)table;
7823 			int count = table_size / sizeof (ip6_asp_t);
7824 			int i;
7825 
7826 			/*
7827 			 * We need to do an in-place shrink of the array
7828 			 * to match the alignment attributes of the
7829 			 * 32-bit ABI that is looking at it.
7830 			 */
7831 			/* LINTED: logical expression always true: op "||" */
7832 			ASSERT(sizeof (*src) > sizeof (*dst));
7833 			for (i = 1; i < count; i++)
7834 				bcopy(src + i, dst + i, sizeof (*dst));
7835 		}
7836 #endif
7837 		break;
7838 
7839 	case SIOCSIP6ADDRPOLICY:
7840 		ASSERT(mp->b_prev == NULL);
7841 		mp->b_prev = (void *)q;
7842 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4
7843 		/*
7844 		 * We pass in the datamodel here so that the ip6_asp_replace()
7845 		 * routine can handle converting from 32-bit to native formats
7846 		 * where necessary.
7847 		 *
7848 		 * A better way to handle this might be to convert the inbound
7849 		 * data structure here, and hang it off a new 'mp'; thus the
7850 		 * ip6_asp_replace() logic would always be dealing with native
7851 		 * format data structures.
7852 		 *
7853 		 * (An even simpler way to handle these ioctls is to just
7854 		 * add a 32-bit trailing 'pad' field to the ip6_asp_t structure
7855 		 * and just recompile everything that depends on it.)
7856 		 */
7857 #endif
7858 		ip6_asp_replace(mp, table, table_size, B_FALSE, ipst,
7859 		    iocp->ioc_flag & IOC_MODELS);
7860 		return;
7861 	}
7862 
7863 	DB_TYPE(mp) = (iocp->ioc_error == 0) ?
M_IOCACK : M_IOCNAK; 7864 qreply(q, mp); 7865 } 7866 7867 static void 7868 ip_sioctl_dstinfo(queue_t *q, mblk_t *mp) 7869 { 7870 mblk_t *data_mp; 7871 struct dstinforeq *dir; 7872 uint8_t *end, *cur; 7873 in6_addr_t *daddr, *saddr; 7874 ipaddr_t v4daddr; 7875 ire_t *ire; 7876 ipaddr_t v4setsrc; 7877 in6_addr_t v6setsrc; 7878 char *slabel, *dlabel; 7879 boolean_t isipv4; 7880 int match_ire; 7881 ill_t *dst_ill; 7882 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 7883 conn_t *connp = Q_TO_CONN(q); 7884 zoneid_t zoneid = IPCL_ZONEID(connp); 7885 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 7886 uint64_t ipif_flags; 7887 7888 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 7889 7890 /* 7891 * This ioctl is I_STR only, and must have a 7892 * data mblk following the M_IOCTL mblk. 7893 */ 7894 data_mp = mp->b_cont; 7895 if (iocp->ioc_count == TRANSPARENT || data_mp == NULL) { 7896 miocnak(q, mp, 0, EINVAL); 7897 return; 7898 } 7899 7900 if (MBLKL(data_mp) < iocp->ioc_count) { 7901 mblk_t *new_data_mp; 7902 7903 if ((new_data_mp = msgpullup(data_mp, -1)) == NULL) { 7904 miocnak(q, mp, 0, ENOMEM); 7905 return; 7906 } 7907 freemsg(data_mp); 7908 data_mp = new_data_mp; 7909 mp->b_cont = data_mp; 7910 } 7911 match_ire = MATCH_IRE_DSTONLY; 7912 7913 for (cur = data_mp->b_rptr, end = data_mp->b_wptr; 7914 end - cur >= sizeof (struct dstinforeq); 7915 cur += sizeof (struct dstinforeq)) { 7916 dir = (struct dstinforeq *)cur; 7917 daddr = &dir->dir_daddr; 7918 saddr = &dir->dir_saddr; 7919 7920 /* 7921 * ip_addr_scope_v6() and ip6_asp_lookup() handle 7922 * v4 mapped addresses; ire_ftable_lookup_v6() 7923 * and ip_select_source_v6() do not. 7924 */ 7925 dir->dir_dscope = ip_addr_scope_v6(daddr); 7926 dlabel = ip6_asp_lookup(daddr, &dir->dir_precedence, ipst); 7927 7928 isipv4 = IN6_IS_ADDR_V4MAPPED(daddr); 7929 if (isipv4) { 7930 IN6_V4MAPPED_TO_IPADDR(daddr, v4daddr); 7931 v4setsrc = INADDR_ANY; 7932 ire = ire_route_recursive_v4(v4daddr, 0, NULL, zoneid, 7933 NULL, match_ire, IRR_ALLOCATE, 0, ipst, &v4setsrc, 7934 NULL, NULL); 7935 } else { 7936 v6setsrc = ipv6_all_zeros; 7937 ire = ire_route_recursive_v6(daddr, 0, NULL, zoneid, 7938 NULL, match_ire, IRR_ALLOCATE, 0, ipst, &v6setsrc, 7939 NULL, NULL); 7940 } 7941 ASSERT(ire != NULL); 7942 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 7943 ire_refrele(ire); 7944 dir->dir_dreachable = 0; 7945 7946 /* move on to next dst addr */ 7947 continue; 7948 } 7949 dir->dir_dreachable = 1; 7950 7951 dst_ill = ire_nexthop_ill(ire); 7952 if (dst_ill == NULL) { 7953 ire_refrele(ire); 7954 continue; 7955 } 7956 7957 /* With ipmp we most likely look at the ipmp ill here */ 7958 dir->dir_dmactype = dst_ill->ill_mactype; 7959 7960 if (isipv4) { 7961 ipaddr_t v4saddr; 7962 7963 if (ip_select_source_v4(dst_ill, v4setsrc, v4daddr, 7964 connp->conn_ixa->ixa_multicast_ifaddr, zoneid, ipst, 7965 &v4saddr, NULL, &ipif_flags) != 0) { 7966 v4saddr = INADDR_ANY; 7967 ipif_flags = 0; 7968 } 7969 IN6_IPADDR_TO_V4MAPPED(v4saddr, saddr); 7970 } else { 7971 if (ip_select_source_v6(dst_ill, &v6setsrc, daddr, 7972 zoneid, ipst, B_FALSE, IPV6_PREFER_SRC_DEFAULT, 7973 saddr, NULL, &ipif_flags) != 0) { 7974 *saddr = ipv6_all_zeros; 7975 ipif_flags = 0; 7976 } 7977 } 7978 7979 dir->dir_sscope = ip_addr_scope_v6(saddr); 7980 slabel = ip6_asp_lookup(saddr, NULL, ipst); 7981 dir->dir_labelmatch = ip6_asp_labelcmp(dlabel, slabel); 7982 dir->dir_sdeprecated = (ipif_flags & IPIF_DEPRECATED) ? 
1 : 0; 7983 ire_refrele(ire); 7984 ill_refrele(dst_ill); 7985 } 7986 miocack(q, mp, iocp->ioc_count, 0); 7987 } 7988 7989 /* 7990 * Check if this is an address assigned to this machine. 7991 * Skips interfaces that are down by using ire checks. 7992 * Translates mapped addresses to v4 addresses and then 7993 * treats them as such, returning true if the v4 address 7994 * associated with this mapped address is configured. 7995 * Note: Applications will have to be careful what they do 7996 * with the response; use of mapped addresses limits 7997 * what can be done with the socket, especially with 7998 * respect to socket options and ioctls - neither IPv4 7999 * options nor IPv6 sticky options/ancillary data options 8000 * may be used. 8001 */ 8002 /* ARGSUSED */ 8003 int 8004 ip_sioctl_tmyaddr(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 8005 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 8006 { 8007 struct sioc_addrreq *sia; 8008 sin_t *sin; 8009 ire_t *ire; 8010 mblk_t *mp1; 8011 zoneid_t zoneid; 8012 ip_stack_t *ipst; 8013 8014 ip1dbg(("ip_sioctl_tmyaddr")); 8015 8016 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 8017 zoneid = Q_TO_CONN(q)->conn_zoneid; 8018 ipst = CONNQ_TO_IPST(q); 8019 8020 /* Existence verified in ip_wput_nondata */ 8021 mp1 = mp->b_cont->b_cont; 8022 sia = (struct sioc_addrreq *)mp1->b_rptr; 8023 sin = (sin_t *)&sia->sa_addr; 8024 switch (sin->sin_family) { 8025 case AF_INET6: { 8026 sin6_t *sin6 = (sin6_t *)sin; 8027 8028 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 8029 ipaddr_t v4_addr; 8030 8031 IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, 8032 v4_addr); 8033 ire = ire_ftable_lookup_v4(v4_addr, 0, 0, 8034 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, NULL, 8035 MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL); 8036 } else { 8037 in6_addr_t v6addr; 8038 8039 v6addr = sin6->sin6_addr; 8040 ire = ire_ftable_lookup_v6(&v6addr, 0, 0, 8041 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, NULL, 8042 MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL); 8043 } 8044 break; 8045 } 8046 case AF_INET: { 8047 ipaddr_t v4addr; 8048 8049 v4addr = sin->sin_addr.s_addr; 8050 ire = ire_ftable_lookup_v4(v4addr, 0, 0, 8051 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, 8052 NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL); 8053 break; 8054 } 8055 default: 8056 return (EAFNOSUPPORT); 8057 } 8058 if (ire != NULL) { 8059 sia->sa_res = 1; 8060 ire_refrele(ire); 8061 } else { 8062 sia->sa_res = 0; 8063 } 8064 return (0); 8065 } 8066 8067 /* 8068 * Check if this is an address assigned on-link i.e. neighbor, 8069 * and makes sure it's reachable from the current zone. 8070 * Returns true for my addresses as well. 8071 * Translates mapped addresses to v4 addresses and then 8072 * treats them as such, returning true if the v4 address 8073 * associated with this mapped address is configured. 8074 * Note: Applications will have to be careful what they do 8075 * with the response; use of mapped addresses limits 8076 * what can be done with the socket, especially with 8077 * respect to socket options and ioctls - neither IPv4 8078 * options nor IPv6 sticky options/ancillary data options 8079 * may be used. 
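 *
 * Illustration only -- a hypothetical userland caller (not part of
 * this file):
 *
 *	struct sioc_addrreq sar;
 *	sin_t *sin = (sin_t *)&sar.sa_addr;
 *
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = inet_addr("192.0.2.1");
 *	if (ioctl(s, SIOCTMYADDR, &sar) == 0 && sar.sa_res != 0)
 *		the address is configured on this node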
8080  */
8081 /* ARGSUSED */
8082 int
8083 ip_sioctl_tonlink(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
8084     ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
8085 {
8086 	struct sioc_addrreq *sia;
8087 	sin_t *sin;
8088 	mblk_t	*mp1;
8089 	ire_t *ire = NULL;
8090 	zoneid_t zoneid;
8091 	ip_stack_t *ipst;
8092 
8093 	ip1dbg(("ip_sioctl_tonlink"));
8094 
8095 	ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */
8096 	zoneid = Q_TO_CONN(q)->conn_zoneid;
8097 	ipst = CONNQ_TO_IPST(q);
8098 
8099 	/* Existence verified in ip_wput_nondata */
8100 	mp1 = mp->b_cont->b_cont;
8101 	sia = (struct sioc_addrreq *)mp1->b_rptr;
8102 	sin = (sin_t *)&sia->sa_addr;
8103 
8104 	/*
8105 	 * We check for IRE_ONLINK and exclude IRE_BROADCAST|IRE_MULTICAST
8106 	 * to make sure we only look at on-link unicast address.
8107 	 */
8108 	switch (sin->sin_family) {
8109 	case AF_INET6: {
8110 		sin6_t *sin6 = (sin6_t *)sin;
8111 
8112 		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
8113 			ipaddr_t v4_addr;
8114 
8115 			IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr,
8116 			    v4_addr);
8117 			if (!CLASSD(v4_addr)) {
8118 				ire = ire_ftable_lookup_v4(v4_addr, 0, 0, 0,
8119 				    NULL, zoneid, NULL, MATCH_IRE_DSTONLY,
8120 				    0, ipst, NULL);
8121 			}
8122 		} else {
8123 			in6_addr_t v6addr;
8124 
8125 			v6addr = sin6->sin6_addr;
8126 			if (!IN6_IS_ADDR_MULTICAST(&v6addr)) {
8127 				ire = ire_ftable_lookup_v6(&v6addr, 0, 0, 0,
8128 				    NULL, zoneid, NULL, MATCH_IRE_DSTONLY, 0,
8129 				    ipst, NULL);
8130 			}
8131 		}
8132 		break;
8133 	}
8134 	case AF_INET: {
8135 		ipaddr_t v4addr;
8136 
8137 		v4addr = sin->sin_addr.s_addr;
8138 		if (!CLASSD(v4addr)) {
8139 			ire = ire_ftable_lookup_v4(v4addr, 0, 0, 0, NULL,
8140 			    zoneid, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL);
8141 		}
8142 		break;
8143 	}
8144 	default:
8145 		return (EAFNOSUPPORT);
8146 	}
8147 	sia->sa_res = 0;
8148 	if (ire != NULL) {
8149 		ASSERT(!(ire->ire_type & IRE_MULTICAST));
8150 
8151 		if ((ire->ire_type & IRE_ONLINK) &&
8152 		    !(ire->ire_type & IRE_BROADCAST))
8153 			sia->sa_res = 1;
8154 		ire_refrele(ire);
8155 	}
8156 	return (0);
8157 }
8158 
8159 /*
8160  * TBD: implement when kernel maintains a list of site prefixes.
8161  */
8162 /* ARGSUSED */
8163 int
8164 ip_sioctl_tmysite(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
8165     ip_ioctl_cmd_t *ipip, void *ifreq)
8166 {
8167 	return (ENXIO);
8168 }
8169 
8170 /* ARP IOCTLs. */
8171 /* ARGSUSED */
8172 int
8173 ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
8174     ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
8175 {
8176 	int		err;
8177 	ipaddr_t	ipaddr;
8178 	struct iocblk	*iocp;
8179 	conn_t		*connp;
8180 	struct arpreq	*ar;
8181 	struct xarpreq	*xar;
8182 	int		arp_flags, flags, alength;
8183 	uchar_t		*lladdr;
8184 	ip_stack_t	*ipst;
8185 	ill_t		*ill = ipif->ipif_ill;
8186 	ill_t		*proxy_ill = NULL;
8187 	ipmp_arpent_t	*entp = NULL;
8188 	boolean_t	proxyarp = B_FALSE;
8189 	boolean_t	if_arp_ioctl = B_FALSE;
8190 	ncec_t		*ncec = NULL;
8191 	nce_t		*nce;
8192 
8193 	ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL);
8194 	connp = Q_TO_CONN(q);
8195 	ipst = connp->conn_netstack->netstack_ip;
8196 	iocp = (struct iocblk *)mp->b_rptr;
8197 
8198 	if (ipip->ipi_cmd_type == XARP_CMD) {
8199 		/* We have a chain - M_IOCTL-->MI_COPY_MBLK-->XARPREQ_MBLK */
8200 		xar = (struct xarpreq *)mp->b_cont->b_cont->b_rptr;
8201 		ar = NULL;
8202 
8203 		arp_flags = xar->xarp_flags;
8204 		lladdr = (uchar_t *)LLADDR(&xar->xarp_ha);
8205 		if_arp_ioctl = (xar->xarp_ha.sdl_nlen != 0);
8206 		/*
8207 		 * Validate against user's link layer address length
8208 		 * input and name and addr length limits.
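		 *
		 * For example (Ethernet assumed), a SIOCSXARP caller that
		 * names the interface packs the request as:
		 *
		 *	xar->xarp_ha.sdl_family = AF_LINK;
		 *	xar->xarp_ha.sdl_nlen = 4;	/- "net0", no NUL -/
		 *	xar->xarp_ha.sdl_alen = 6;	/- MAC length -/
		 *	/- name bytes, then MAC bytes, in sdl_data[] -/
		 *
		 * and the checks below insist that sdl_alen matches the
		 * ill's physical address length and that name plus address
		 * fit in sdl_data[].  ("net0" is a hypothetical name.)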
8209 		 */
8210 		alength = ill->ill_phys_addr_length;
8211 		if (ipip->ipi_cmd == SIOCSXARP) {
8212 			if (alength != xar->xarp_ha.sdl_alen ||
8213 			    (alength + xar->xarp_ha.sdl_nlen >
8214 			    sizeof (xar->xarp_ha.sdl_data)))
8215 				return (EINVAL);
8216 		}
8217 	} else {
8218 		/* We have a chain - M_IOCTL-->MI_COPY_MBLK-->ARPREQ_MBLK */
8219 		ar = (struct arpreq *)mp->b_cont->b_cont->b_rptr;
8220 		xar = NULL;
8221 
8222 		arp_flags = ar->arp_flags;
8223 		lladdr = (uchar_t *)ar->arp_ha.sa_data;
8224 		/*
8225 		 * Theoretically, the sa_family could tell us what link
8226 		 * layer type this operation is trying to deal with.  By
8227 		 * common usage AF_UNSPEC means ethernet.  We'll assume
8228 		 * any attempt to use the SIOC?ARP ioctls is for ethernet,
8229 		 * for now.  Our new SIOC*XARP ioctls can be used more
8230 		 * generally.
8231 		 *
8232 		 * If the underlying media happens to have a non-6-byte
8233 		 * address, the arp module will fail set/get, but the del
8234 		 * operation will succeed.
8235 		 */
8236 		alength = 6;
8237 		if ((ipip->ipi_cmd != SIOCDARP) &&
8238 		    (alength != ill->ill_phys_addr_length)) {
8239 			return (EINVAL);
8240 		}
8241 	}
8242 
8243 	/* Translate ATF* flags to NCE* flags */
8244 	flags = 0;
8245 	if (arp_flags & ATF_AUTHORITY)
8246 		flags |= NCE_F_AUTHORITY;
8247 	if (arp_flags & ATF_PERM)
8248 		flags |= NCE_F_NONUD; /* not subject to aging */
8249 	if (arp_flags & ATF_PUBL)
8250 		flags |= NCE_F_PUBLISH;
8251 
8252 	/*
8253 	 * IPMP ARP special handling:
8254 	 *
8255 	 * 1. Since ARP mappings must appear consistent across the group,
8256 	 *    prohibit changing ARP mappings on the underlying interfaces.
8257 	 *
8258 	 * 2. Since ARP mappings for IPMP data addresses are maintained by
8259 	 *    IP itself, prohibit changing them.
8260 	 *
8261 	 * 3. For proxy ARP, use a functioning hardware address in the group,
8262 	 *    provided one exists.  If one doesn't, just add the entry as-is;
8263 	 *    ipmp_illgrp_refresh_arpent() will refresh it if things change.
8264 	 */
8265 	if (IS_UNDER_IPMP(ill)) {
8266 		if (ipip->ipi_cmd != SIOCGARP && ipip->ipi_cmd != SIOCGXARP)
8267 			return (EPERM);
8268 	}
8269 	if (IS_IPMP(ill)) {
8270 		ipmp_illgrp_t *illg = ill->ill_grp;
8271 
8272 		switch (ipip->ipi_cmd) {
8273 		case SIOCSARP:
8274 		case SIOCSXARP:
8275 			proxy_ill = ipmp_illgrp_find_ill(illg, lladdr, alength);
8276 			if (proxy_ill != NULL) {
8277 				proxyarp = B_TRUE;
8278 				if (!ipmp_ill_is_active(proxy_ill))
8279 					proxy_ill = ipmp_illgrp_next_ill(illg);
8280 				if (proxy_ill != NULL)
8281 					lladdr = proxy_ill->ill_phys_addr;
8282 			}
8283 			/* FALLTHRU */
8284 		}
8285 	}
8286 
8287 	ipaddr = sin->sin_addr.s_addr;
8288 	/*
8289 	 * don't match across illgrp per case (1) and (2).
8290 	 * XXX use IS_IPMP(ill) like ndp_sioc_update?
8291 	 */
8292 	nce = nce_lookup_v4(ill, &ipaddr);
8293 	if (nce != NULL)
8294 		ncec = nce->nce_common;
8295 
8296 	switch (iocp->ioc_cmd) {
8297 	case SIOCDARP:
8298 	case SIOCDXARP: {
8299 		/*
8300 		 * Delete the NCE if any.
8301 		 */
8302 		if (ncec == NULL) {
8303 			iocp->ioc_error = ENXIO;
8304 			break;
8305 		}
8306 		/* Don't allow changes to arp mappings of local addresses. */
8307 		if (NCE_MYADDR(ncec)) {
8308 			nce_refrele(nce);
8309 			return (ENOTSUP);
8310 		}
8311 		iocp->ioc_error = 0;
8312 
8313 		/*
8314 		 * Delete the nce_common which has ncec_ill set to ipmp_ill.
8315 		 * This will delete all the nce entries on the under_ills.
8316 		 */
8317 		ncec_delete(ncec);
8318 		/*
8319 		 * Once the NCE has been deleted, then the ire_dep* consistency
8320 		 * mechanism will find any IRE which depended on the now
8321 		 * condemned NCE (as part of sending packets).
8322 * That mechanism handles redirects by deleting redirects 8323 * that refer to UNREACHABLE nces. 8324 */ 8325 break; 8326 } 8327 case SIOCGARP: 8328 case SIOCGXARP: 8329 if (ncec != NULL) { 8330 lladdr = ncec->ncec_lladdr; 8331 flags = ncec->ncec_flags; 8332 iocp->ioc_error = 0; 8333 ip_sioctl_garp_reply(mp, ncec->ncec_ill, lladdr, flags); 8334 } else { 8335 iocp->ioc_error = ENXIO; 8336 } 8337 break; 8338 case SIOCSARP: 8339 case SIOCSXARP: 8340 /* Don't allow changes to arp mappings of local addresses. */ 8341 if (ncec != NULL && NCE_MYADDR(ncec)) { 8342 nce_refrele(nce); 8343 return (ENOTSUP); 8344 } 8345 8346 /* static arp entries will undergo NUD if ATF_PERM is not set */ 8347 flags |= NCE_F_STATIC; 8348 if (!if_arp_ioctl) { 8349 ip_nce_lookup_and_update(&ipaddr, NULL, ipst, 8350 lladdr, alength, flags); 8351 } else { 8352 ipif_t *ipif = ipif_get_next_ipif(NULL, ill); 8353 if (ipif != NULL) { 8354 ip_nce_lookup_and_update(&ipaddr, ipif, ipst, 8355 lladdr, alength, flags); 8356 ipif_refrele(ipif); 8357 } 8358 } 8359 if (nce != NULL) { 8360 nce_refrele(nce); 8361 nce = NULL; 8362 } 8363 /* 8364 * NCE_F_STATIC entries will be added in state ND_REACHABLE 8365 * by nce_add_common() 8366 */ 8367 err = nce_lookup_then_add_v4(ill, lladdr, 8368 ill->ill_phys_addr_length, &ipaddr, flags, ND_UNCHANGED, 8369 &nce); 8370 if (err == EEXIST) { 8371 ncec = nce->nce_common; 8372 mutex_enter(&ncec->ncec_lock); 8373 ncec->ncec_state = ND_REACHABLE; 8374 ncec->ncec_flags = flags; 8375 nce_update(ncec, ND_UNCHANGED, lladdr); 8376 mutex_exit(&ncec->ncec_lock); 8377 err = 0; 8378 } 8379 if (nce != NULL) { 8380 nce_refrele(nce); 8381 nce = NULL; 8382 } 8383 if (IS_IPMP(ill) && err == 0) { 8384 entp = ipmp_illgrp_create_arpent(ill->ill_grp, 8385 proxyarp, ipaddr, lladdr, ill->ill_phys_addr_length, 8386 flags); 8387 if (entp == NULL || (proxyarp && proxy_ill == NULL)) { 8388 iocp->ioc_error = (entp == NULL ? ENOMEM : 0); 8389 break; 8390 } 8391 } 8392 iocp->ioc_error = err; 8393 } 8394 8395 if (nce != NULL) { 8396 nce_refrele(nce); 8397 } 8398 8399 /* 8400 * If we created an IPMP ARP entry, mark that we've notified ARP. 8401 */ 8402 if (entp != NULL) 8403 ipmp_illgrp_mark_arpent(ill->ill_grp, entp); 8404 8405 return (iocp->ioc_error); 8406 } 8407 8408 /* 8409 * Parse an [x]arpreq structure coming down SIOC[GSD][X]ARP ioctls, identify 8410 * the associated sin and refhold and return the associated ipif via `ci'. 
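 *
 * Illustration only -- a classic (non-extended) request as parsed here,
 * from a hypothetical userland caller:
 *
 *	struct arpreq ar;
 *	sin_t *sin = (sin_t *)&ar.arp_pa;
 *
 *	bzero(&ar, sizeof (ar));
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = inet_addr("192.0.2.44");
 *	(void) ioctl(s, SIOCGARP, &ar);	/- fills arp_ha, arp_flags -/
 *
 * With no interface name supplied, the ipif must be located from the
 * address alone, as done in the lookup logic below.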
8411  */
8412 int
8413 ip_extract_arpreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip,
8414     cmd_info_t *ci)
8415 {
8416 	mblk_t	*mp1;
8417 	sin_t	*sin;
8418 	conn_t	*connp;
8419 	ipif_t	*ipif;
8420 	ire_t	*ire = NULL;
8421 	ill_t	*ill = NULL;
8422 	boolean_t exists;
8423 	ip_stack_t *ipst;
8424 	struct arpreq *ar;
8425 	struct xarpreq *xar;
8426 	struct sockaddr_dl *sdl;
8427 
8428 	/* ioctl comes down on a conn */
8429 	ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL);
8430 	connp = Q_TO_CONN(q);
8431 	if (connp->conn_family == AF_INET6)
8432 		return (ENXIO);
8433 
8434 	ipst = connp->conn_netstack->netstack_ip;
8435 
8436 	/* Verified in ip_wput_nondata */
8437 	mp1 = mp->b_cont->b_cont;
8438 
8439 	if (ipip->ipi_cmd_type == XARP_CMD) {
8440 		ASSERT(MBLKL(mp1) >= sizeof (struct xarpreq));
8441 		xar = (struct xarpreq *)mp1->b_rptr;
8442 		sin = (sin_t *)&xar->xarp_pa;
8443 		sdl = &xar->xarp_ha;
8444 
8445 		if (sdl->sdl_family != AF_LINK || sin->sin_family != AF_INET)
8446 			return (ENXIO);
8447 		if (sdl->sdl_nlen >= LIFNAMSIZ)
8448 			return (EINVAL);
8449 	} else {
8450 		ASSERT(ipip->ipi_cmd_type == ARP_CMD);
8451 		ASSERT(MBLKL(mp1) >= sizeof (struct arpreq));
8452 		ar = (struct arpreq *)mp1->b_rptr;
8453 		sin = (sin_t *)&ar->arp_pa;
8454 	}
8455 
8456 	if (ipip->ipi_cmd_type == XARP_CMD && sdl->sdl_nlen != 0) {
8457 		ipif = ipif_lookup_on_name(sdl->sdl_data, sdl->sdl_nlen,
8458 		    B_FALSE, &exists, B_FALSE, ALL_ZONES, ipst);
8459 		if (ipif == NULL)
8460 			return (ENXIO);
8461 		if (ipif->ipif_id != 0) {
8462 			ipif_refrele(ipif);
8463 			return (ENXIO);
8464 		}
8465 	} else {
8466 		/*
8467 		 * Either an SIOC[DGS]ARP or an SIOC[DGS]XARP with an sdl_nlen
8468 		 * of 0: use the IP address to find the ipif.  If the IP
8469 		 * address is an IPMP test address, ire_ftable_lookup() will
8470 		 * find the wrong ill, so we first do an ipif_lookup_addr().
8471 		 */
8472 		ipif = ipif_lookup_addr(sin->sin_addr.s_addr, NULL, ALL_ZONES,
8473 		    ipst);
8474 		if (ipif == NULL) {
8475 			ire = ire_ftable_lookup_v4(sin->sin_addr.s_addr,
8476 			    0, 0, IRE_IF_RESOLVER, NULL, ALL_ZONES,
8477 			    NULL, MATCH_IRE_TYPE, 0, ipst, NULL);
8478 			if (ire == NULL || ((ill = ire->ire_ill) == NULL)) {
8479 				if (ire != NULL)
8480 					ire_refrele(ire);
8481 				return (ENXIO);
8482 			}
8483 			ASSERT(ire != NULL && ill != NULL);
8484 			ipif = ill->ill_ipif;
8485 			ipif_refhold(ipif);
8486 			ire_refrele(ire);
8487 		}
8488 	}
8489 
8490 	if (ipif->ipif_ill->ill_net_type != IRE_IF_RESOLVER) {
8491 		ipif_refrele(ipif);
8492 		return (ENXIO);
8493 	}
8494 
8495 	ci->ci_sin = sin;
8496 	ci->ci_ipif = ipif;
8497 	return (0);
8498 }
8499 
8500 /*
8501  * Link or unlink the illgrp on IPMP meta-interface `ill' depending on the
8502  * value of `ioccmd'.  While an illgrp is linked to an ipmp_grp_t, it is
8503  * accessible from that ipmp_grp_t, which means SIOCSLIFGROUPNAME can look it
8504  * up and thus an ill can join that illgrp.
8505  *
8506  * We use I_PLINK/I_PUNLINK to do the link/unlink operations rather than
8507  * open()/close() primarily because close() is not allowed to fail or block
8508  * forever.  On the other hand, I_PUNLINK *can* fail, and there's no reason
8509  * why anyone should ever need to I_PUNLINK an in-use IPMP stream.  To ensure
8510  * symmetric behavior (e.g., doing an I_PLINK after an I_PUNLINK undoes the
8511  * I_PUNLINK) we defer linking to I_PLINK.  Separately, we also fail attempts
8512  * to I_LINK since I_UNLINK is optional and we'd end up in an inconsistent
8513  * state if I_UNLINK didn't occur.
8514  *
8515  * Note that for each plumb/unplumb operation, we may end up here more than
8516  * once because of the way ifconfig works.  However, it's OK to link the same
8517  * illgrp more than once, or unlink an illgrp that's already unlinked.
8518  */
8519 static int
8520 ip_sioctl_plink_ipmp(ill_t *ill, int ioccmd)
8521 {
8522 	int err;
8523 	ip_stack_t *ipst = ill->ill_ipst;
8524 
8525 	ASSERT(IS_IPMP(ill));
8526 	ASSERT(IAM_WRITER_ILL(ill));
8527 
8528 	switch (ioccmd) {
8529 	case I_LINK:
8530 		return (ENOTSUP);
8531 
8532 	case I_PLINK:
8533 		rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
8534 		ipmp_illgrp_link_grp(ill->ill_grp, ill->ill_phyint->phyint_grp);
8535 		rw_exit(&ipst->ips_ipmp_lock);
8536 		break;
8537 
8538 	case I_PUNLINK:
8539 		/*
8540 		 * Require all UP ipifs be brought down prior to unlinking the
8541 		 * illgrp so any associated IREs (and other state) is torched.
8542 		 */
8543 		if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0)
8544 			return (EBUSY);
8545 
8546 		/*
8547 		 * NOTE: We hold ipmp_lock across the unlink to prevent a race
8548 		 * with an SIOCSLIFGROUPNAME request from an ill trying to
8549 		 * join this group.  Specifically: ills trying to join grab
8550 		 * ipmp_lock and bump a "pending join" counter checked by
8551 		 * ipmp_illgrp_unlink_grp().  During the unlink no new pending
8552 		 * joins can occur (since we have ipmp_lock).  Once we drop
8553 		 * ipmp_lock, subsequent SIOCSLIFGROUPNAME requests will not
8554 		 * find the illgrp (since we unlinked it) and will return
8555 		 * EAFNOSUPPORT.  This will then take them back through the
8556 		 * IPMP meta-interface plumbing logic in ifconfig, and thus
8557 		 * back through I_PLINK above.
8558 		 */
8559 		rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
8560 		err = ipmp_illgrp_unlink_grp(ill->ill_grp);
8561 		rw_exit(&ipst->ips_ipmp_lock);
8562 		return (err);
8563 	default:
8564 		break;
8565 	}
8566 	return (0);
8567 }
8568 
8569 /*
8570  * Do I_PLINK/I_LINK or I_PUNLINK/I_UNLINK with consistency checks and also
8571  * atomically set/clear the muxids.  Also complete the ioctl by acking or
8572  * naking it.  Note that the code is structured such that the link type,
8573  * whether it's persistent or not, is treated equally.  ifconfig(1M) and
8574  * its clones use the persistent link, while pppd(1M) and perhaps many
8575  * other daemons may use a non-persistent link.  When combined with some
8576  * ill_t states, linking and unlinking lower streams may be used as
8577  * indicators of dynamic re-plumbing events [see PSARC/1999/348].
8578  */
8579 /* ARGSUSED */
8580 void
8581 ip_sioctl_plink(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
8582 {
8583 	mblk_t *mp1;
8584 	struct linkblk *li;
8585 	int ioccmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd;
8586 	int err = 0;
8587 
8588 	ASSERT(ioccmd == I_PLINK || ioccmd == I_PUNLINK ||
8589 	    ioccmd == I_LINK || ioccmd == I_UNLINK);
8590 
8591 	mp1 = mp->b_cont;	/* This is the linkblk info */
8592 	li = (struct linkblk *)mp1->b_rptr;
8593 
8594 	err = ip_sioctl_plink_ipmod(ipsq, q, mp, ioccmd, li);
8595 	if (err == EINPROGRESS)
8596 		return;
8597 	if (err == 0)
8598 		miocack(q, mp, 0, 0);
8599 	else
8600 		miocnak(q, mp, 0, err);
8601 
8602 	/* Conn was refheld in ip_sioctl_copyin_setup */
8603 	if (CONN_Q(q)) {
8604 		CONN_DEC_IOCTLREF(Q_TO_CONN(q));
8605 		CONN_OPER_PENDING_DONE(Q_TO_CONN(q));
8606 	}
8607 }
8608 
8609 /*
8610  * Process I_{P}LINK and I_{P}UNLINK requests named by `ioccmd' and pointed to
8611  * by `mp' and `li' for the IP module stream (if li->l_qbot is in fact an IP
8612  * module stream).
8613 * Returns zero on success, EINPROGRESS if the operation is still pending, or 8614 * an error code on failure. 8615 */ 8616 static int 8617 ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd, 8618 struct linkblk *li) 8619 { 8620 int err = 0; 8621 ill_t *ill; 8622 queue_t *ipwq, *dwq; 8623 const char *name; 8624 struct qinit *qinfo; 8625 boolean_t islink = (ioccmd == I_PLINK || ioccmd == I_LINK); 8626 boolean_t entered_ipsq = B_FALSE; 8627 boolean_t is_ip = B_FALSE; 8628 arl_t *arl; 8629 8630 /* 8631 * Walk the lower stream to verify it's the IP module stream. 8632 * The IP module is identified by its name, wput function, 8633 * and non-NULL q_next. STREAMS ensures that the lower stream 8634 * (li->l_qbot) will not vanish until this ioctl completes. 8635 */ 8636 for (ipwq = li->l_qbot; ipwq != NULL; ipwq = ipwq->q_next) { 8637 qinfo = ipwq->q_qinfo; 8638 name = qinfo->qi_minfo->mi_idname; 8639 if (name != NULL && strcmp(name, ip_mod_info.mi_idname) == 0 && 8640 qinfo->qi_putp != (pfi_t)ip_lwput && ipwq->q_next != NULL) { 8641 is_ip = B_TRUE; 8642 break; 8643 } 8644 if (name != NULL && strcmp(name, arp_mod_info.mi_idname) == 0 && 8645 qinfo->qi_putp != (pfi_t)ip_lwput && ipwq->q_next != NULL) { 8646 break; 8647 } 8648 } 8649 8650 /* 8651 * If this isn't an IP module stream, bail. 8652 */ 8653 if (ipwq == NULL) 8654 return (0); 8655 8656 if (!is_ip) { 8657 arl = (arl_t *)ipwq->q_ptr; 8658 ill = arl_to_ill(arl); 8659 if (ill == NULL) 8660 return (0); 8661 } else { 8662 ill = ipwq->q_ptr; 8663 } 8664 ASSERT(ill != NULL); 8665 8666 if (ipsq == NULL) { 8667 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink, 8668 NEW_OP, B_FALSE); 8669 if (ipsq == NULL) { 8670 if (!is_ip) 8671 ill_refrele(ill); 8672 return (EINPROGRESS); 8673 } 8674 entered_ipsq = B_TRUE; 8675 } 8676 ASSERT(IAM_WRITER_ILL(ill)); 8677 mutex_enter(&ill->ill_lock); 8678 if (!is_ip) { 8679 if (islink && ill->ill_muxid == 0) { 8680 /* 8681 * Plumbing has to be done with IP plumbed first, arp 8682 * second, but here we have arp being plumbed first. 8683 */ 8684 mutex_exit(&ill->ill_lock); 8685 if (entered_ipsq) 8686 ipsq_exit(ipsq); 8687 ill_refrele(ill); 8688 return (EINVAL); 8689 } 8690 } 8691 mutex_exit(&ill->ill_lock); 8692 if (!is_ip) { 8693 arl->arl_muxid = islink ? li->l_index : 0; 8694 ill_refrele(ill); 8695 goto done; 8696 } 8697 8698 if (IS_IPMP(ill) && (err = ip_sioctl_plink_ipmp(ill, ioccmd)) != 0) 8699 goto done; 8700 8701 /* 8702 * As part of I_{P}LINKing, stash the number of downstream modules and 8703 * the read queue of the module immediately below IP in the ill. 8704 * These are used during the capability negotiation below. 8705 */ 8706 ill->ill_lmod_rq = NULL; 8707 ill->ill_lmod_cnt = 0; 8708 if (islink && ((dwq = ipwq->q_next) != NULL)) { 8709 ill->ill_lmod_rq = RD(dwq); 8710 for (; dwq != NULL; dwq = dwq->q_next) 8711 ill->ill_lmod_cnt++; 8712 } 8713 8714 ill->ill_muxid = islink ? li->l_index : 0; 8715 8716 /* 8717 * Mark the ipsq busy until the capability operations initiated below 8718 * complete. The PLINK/UNLINK ioctl itself completes when our caller 8719 * returns, but the capability operation may complete asynchronously 8720 * much later. 8721 */ 8722 ipsq_current_start(ipsq, ill->ill_ipif, ioccmd); 8723 /* 8724 * If there's at least one up ipif on this ill, then we're bound to 8725 * the underlying driver via DLPI. In that case, renegotiate 8726 * capabilities to account for any possible change in modules 8727 * interposed between IP and the driver. 
8728 	 */
8729 	if (ill->ill_ipif_up_count > 0) {
8730 		if (islink)
8731 			ill_capability_probe(ill);
8732 		else
8733 			ill_capability_reset(ill, B_FALSE);
8734 	}
8735 	ipsq_current_finish(ipsq);
8736 done:
8737 	if (entered_ipsq)
8738 		ipsq_exit(ipsq);
8739 
8740 	return (err);
8741 }
8742 
8743 /*
8744  * Search the ioctl command in the ioctl tables and return a pointer
8745  * to the ioctl command information.  The ioctl command tables are
8746  * static and fully populated at compile time.
8747  */
8748 ip_ioctl_cmd_t *
8749 ip_sioctl_lookup(int ioc_cmd)
8750 {
8751 	int index;
8752 	ip_ioctl_cmd_t *ipip;
8753 	ip_ioctl_cmd_t *ipip_end;
8754 
8755 	if (ioc_cmd == IPI_DONTCARE)
8756 		return (NULL);
8757 
8758 	/*
8759 	 * Do a 2 step search.  First search the indexed table
8760 	 * based on the least significant byte of the ioctl cmd.
8761 	 * If we don't find a match, then search the misc table
8762 	 * serially.
8763 	 */
8764 	index = ioc_cmd & 0xFF;
8765 	if (index < ip_ndx_ioctl_count) {
8766 		ipip = &ip_ndx_ioctl_table[index];
8767 		if (ipip->ipi_cmd == ioc_cmd) {
8768 			/* Found a match in the ndx table */
8769 			return (ipip);
8770 		}
8771 	}
8772 
8773 	/* Search the misc table */
8774 	ipip_end = &ip_misc_ioctl_table[ip_misc_ioctl_count];
8775 	for (ipip = ip_misc_ioctl_table; ipip < ipip_end; ipip++) {
8776 		if (ipip->ipi_cmd == ioc_cmd)
8777 			/* Found a match in the misc table */
8778 			return (ipip);
8779 	}
8780 
8781 	return (NULL);
8782 }
8783 
8784 /*
8785  * Helper function for ip_sioctl_getsetprop(), which does some sanity checks
8786  */
8787 static boolean_t
8788 getset_ioctl_checks(mblk_t *mp)
8789 {
8790 	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
8791 	mblk_t *mp1 = mp->b_cont;
8792 	mod_ioc_prop_t *pioc;
8793 	uint_t flags;
8794 	uint_t pioc_size;
8795 
8796 	/* do sanity checks on various arguments */
8797 	if (mp1 == NULL || iocp->ioc_count == 0 ||
8798 	    iocp->ioc_count == TRANSPARENT) {
8799 		return (B_FALSE);
8800 	}
8801 	if (msgdsize(mp1) < iocp->ioc_count) {
8802 		if (!pullupmsg(mp1, iocp->ioc_count))
8803 			return (B_FALSE);
8804 	}
8805 
8806 	pioc = (mod_ioc_prop_t *)mp1->b_rptr;
8807 
8808 	/* sanity checks on mpr_valsize */
8809 	pioc_size = sizeof (mod_ioc_prop_t);
8810 	if (pioc->mpr_valsize != 0)
8811 		pioc_size += pioc->mpr_valsize - 1;
8812 
8813 	if (iocp->ioc_count != pioc_size)
8814 		return (B_FALSE);
8815 
8816 	flags = pioc->mpr_flags;
8817 	if (iocp->ioc_cmd == SIOCSETPROP) {
8818 		/*
8819 		 * One can either reset the value to its default value or
8820 		 * change the current value or append/remove the value from
8821 		 * a multi-valued property.
8822 		 */
8823 		if ((flags & MOD_PROP_DEFAULT) != MOD_PROP_DEFAULT &&
8824 		    flags != MOD_PROP_ACTIVE &&
8825 		    flags != (MOD_PROP_ACTIVE|MOD_PROP_APPEND) &&
8826 		    flags != (MOD_PROP_ACTIVE|MOD_PROP_REMOVE))
8827 			return (B_FALSE);
8828 	} else {
8829 		ASSERT(iocp->ioc_cmd == SIOCGETPROP);
8830 
8831 		/*
8832 		 * One can retrieve only one kind of property information
8833 		 * at a time.
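		 *
		 * For example, a SIOCGETPROP caller sets mpr_flags to one
		 * of MOD_PROP_ACTIVE (current value), MOD_PROP_DEFAULT,
		 * MOD_PROP_POSSIBLE (allowed range) or MOD_PROP_PERM
		 * (permissions); the check below only insists that at
		 * least one of these kinds was requested.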
8834 		 */
8835 		if ((flags & MOD_PROP_ACTIVE) != MOD_PROP_ACTIVE &&
8836 		    (flags & MOD_PROP_DEFAULT) != MOD_PROP_DEFAULT &&
8837 		    (flags & MOD_PROP_POSSIBLE) != MOD_PROP_POSSIBLE &&
8838 		    (flags & MOD_PROP_PERM) != MOD_PROP_PERM)
8839 			return (B_FALSE);
8840 	}
8841 
8842 	return (B_TRUE);
8843 }
8844 
8845 /*
8846  * Process the SIOC{SET|GET}PROP ioctls
8847  */
8848 /* ARGSUSED */
8849 static void
8850 ip_sioctl_getsetprop(queue_t *q, mblk_t *mp)
8851 {
8852 	struct iocblk	*iocp = (struct iocblk *)mp->b_rptr;
8853 	mblk_t		*mp1 = mp->b_cont;
8854 	mod_ioc_prop_t	*pioc;
8855 	mod_prop_info_t *ptbl = NULL, *pinfo = NULL;
8856 	ip_stack_t	*ipst;
8857 	icmp_stack_t	*is;
8858 	tcp_stack_t	*tcps;
8859 	sctp_stack_t	*sctps;
8860 	udp_stack_t	*us;
8861 	netstack_t	*stack;
8862 	void		*cbarg;
8863 	cred_t		*cr;
8864 	boolean_t	set;
8865 	int		err;
8866 
8867 	ASSERT(q->q_next == NULL);
8868 	ASSERT(CONN_Q(q));
8869 
8870 	if (!getset_ioctl_checks(mp)) {
8871 		miocnak(q, mp, 0, EINVAL);
8872 		return;
8873 	}
8874 	ipst = CONNQ_TO_IPST(q);
8875 	stack = ipst->ips_netstack;
8876 	pioc = (mod_ioc_prop_t *)mp1->b_rptr;
8877 
8878 	switch (pioc->mpr_proto) {
8879 	case MOD_PROTO_IP:
8880 	case MOD_PROTO_IPV4:
8881 	case MOD_PROTO_IPV6:
8882 		ptbl = ipst->ips_propinfo_tbl;
8883 		cbarg = ipst;
8884 		break;
8885 	case MOD_PROTO_RAWIP:
8886 		is = stack->netstack_icmp;
8887 		ptbl = is->is_propinfo_tbl;
8888 		cbarg = is;
8889 		break;
8890 	case MOD_PROTO_TCP:
8891 		tcps = stack->netstack_tcp;
8892 		ptbl = tcps->tcps_propinfo_tbl;
8893 		cbarg = tcps;
8894 		break;
8895 	case MOD_PROTO_UDP:
8896 		us = stack->netstack_udp;
8897 		ptbl = us->us_propinfo_tbl;
8898 		cbarg = us;
8899 		break;
8900 	case MOD_PROTO_SCTP:
8901 		sctps = stack->netstack_sctp;
8902 		ptbl = sctps->sctps_propinfo_tbl;
8903 		cbarg = sctps;
8904 		break;
8905 	default:
8906 		miocnak(q, mp, 0, EINVAL);
8907 		return;
8908 	}
8909 
8910 	/* search for given property in respective protocol propinfo table */
8911 	for (pinfo = ptbl; pinfo->mpi_name != NULL; pinfo++) {
8912 		if (strcmp(pinfo->mpi_name, pioc->mpr_name) == 0 &&
8913 		    pinfo->mpi_proto == pioc->mpr_proto)
8914 			break;
8915 	}
8916 	if (pinfo->mpi_name == NULL) {
8917 		miocnak(q, mp, 0, ENOENT);
8918 		return;
8919 	}
8920 
8921 	set = (iocp->ioc_cmd == SIOCSETPROP) ? B_TRUE : B_FALSE;
8922 	if (set && pinfo->mpi_setf != NULL) {
8923 		cr = msg_getcred(mp, NULL);
8924 		if (cr == NULL)
8925 			cr = iocp->ioc_cr;
8926 		err = pinfo->mpi_setf(cbarg, cr, pinfo, pioc->mpr_ifname,
8927 		    pioc->mpr_val, pioc->mpr_flags);
8928 	} else if (!set && pinfo->mpi_getf != NULL) {
8929 		err = pinfo->mpi_getf(cbarg, pinfo, pioc->mpr_ifname,
8930 		    pioc->mpr_val, pioc->mpr_valsize, pioc->mpr_flags);
8931 	} else {
8932 		err = EPERM;
8933 	}
8934 
8935 	if (err != 0) {
8936 		miocnak(q, mp, 0, err);
8937 	} else {
8938 		if (set)
8939 			miocack(q, mp, 0, 0);
8940 		else	/* For get, we need to return back the data */
8941 			miocack(q, mp, iocp->ioc_count, 0);
8942 	}
8943 }
8944 
8945 /*
8946  * Process the legacy ND_GET, ND_SET ioctls just for {ip|ip6}_forwarding,
8947  * as several routing daemons have unfortunately used these 'unpublished'
8948  * but well-known ioctls.
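 *
 * Illustration only -- a hypothetical daemon issues ND_SET with the
 * <name>'\0'<value>'\0' layout parsed below:
 *
 *	char buf[] = "ip_forwarding\0" "1";	/- trailing NUL implied -/
 *	struct strioctl si;
 *
 *	si.ic_cmd = ND_SET;
 *	si.ic_timout = 0;
 *	si.ic_len = sizeof (buf);
 *	si.ic_dp = buf;
 *	(void) ioctl(s, I_STR, &si);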
8949  */
8950 /* ARGSUSED */
8951 static void
8952 ip_process_legacy_nddprop(queue_t *q, mblk_t *mp)
8953 {
8954 	struct iocblk	*iocp = (struct iocblk *)mp->b_rptr;
8955 	mblk_t		*mp1 = mp->b_cont;
8956 	char		*pname, *pval, *buf;
8957 	uint_t		bufsize, proto;
8958 	mod_prop_info_t *ptbl = NULL, *pinfo = NULL;
8959 	ip_stack_t	*ipst;
8960 	int		err = 0;
8961 
8962 	ASSERT(CONN_Q(q));
8963 	ipst = CONNQ_TO_IPST(q);
8964 
8965 	if (iocp->ioc_count == 0 || mp1 == NULL) {
8966 		miocnak(q, mp, 0, EINVAL);
8967 		return;
8968 	}
8969 
8970 	mp1->b_datap->db_lim[-1] = '\0';	/* Force null termination */
8971 	pval = buf = pname = (char *)mp1->b_rptr;
8972 	bufsize = MBLKL(mp1);
8973 
8974 	if (strcmp(pname, "ip_forwarding") == 0) {
8975 		pname = "forwarding";
8976 		proto = MOD_PROTO_IPV4;
8977 	} else if (strcmp(pname, "ip6_forwarding") == 0) {
8978 		pname = "forwarding";
8979 		proto = MOD_PROTO_IPV6;
8980 	} else {
8981 		miocnak(q, mp, 0, EINVAL);
8982 		return;
8983 	}
8984 
8985 	ptbl = ipst->ips_propinfo_tbl;
8986 	for (pinfo = ptbl; pinfo->mpi_name != NULL; pinfo++) {
8987 		if (strcmp(pinfo->mpi_name, pname) == 0 &&
8988 		    pinfo->mpi_proto == proto)
8989 			break;
8990 	}
8991 
8992 	ASSERT(pinfo->mpi_name != NULL);
8993 
8994 	switch (iocp->ioc_cmd) {
8995 	case ND_GET:
8996 		if ((err = pinfo->mpi_getf(ipst, pinfo, NULL, buf, bufsize,
8997 		    0)) == 0) {
8998 			miocack(q, mp, iocp->ioc_count, 0);
8999 			return;
9000 		}
9001 		break;
9002 	case ND_SET:
9003 		/*
9004 		 * buffer will have property name and value in the following
9005 		 * format,
9006 		 * <property name>'\0'<property value>'\0', extract them;
9007 		 */
9008 		while (*pval++)
9009 			noop;
9010 
9011 		if (!*pval || pval >= (char *)mp1->b_wptr) {
9012 			err = EINVAL;
9013 		} else if ((err = pinfo->mpi_setf(ipst, NULL, pinfo, NULL,
9014 		    pval, 0)) == 0) {
9015 			miocack(q, mp, 0, 0);
9016 			return;
9017 		}
9018 		break;
9019 	default:
9020 		err = EINVAL;
9021 		break;
9022 	}
9023 	miocnak(q, mp, 0, err);
9024 }
9025 
9026 /*
9027  * Wrapper function for resuming deferred ioctl processing
9028  * Used for SIOCGDSTINFO, SIOCGIP6ADDRPOLICY, SIOCGMSFILTER,
9029  * SIOCSMSFILTER, SIOCGIPMSFILTER, and SIOCSIPMSFILTER currently.
9030  */
9031 /* ARGSUSED */
9032 void
9033 ip_sioctl_copyin_resume(ipsq_t *dummy_ipsq, queue_t *q, mblk_t *mp,
9034     void *dummy_arg)
9035 {
9036 	ip_sioctl_copyin_setup(q, mp);
9037 }
9038 
9039 /*
9040  * ip_sioctl_copyin_setup is called by ip_wput_nondata with any M_IOCTL message
9041  * that arrives.  Most of the IOCTLs are "socket" IOCTLs which we handle
9042  * in either I_STR or TRANSPARENT form, using the mi_copy facility.
9043  * We establish here the size of the block to be copied in.  mi_copyin
9044  * arranges for this to happen, and processing continues in ip_wput_nondata
9045  * with an M_IOCDATA message.
9046  */
9047 void
9048 ip_sioctl_copyin_setup(queue_t *q, mblk_t *mp)
9049 {
9050 	int	copyin_size;
9051 	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
9052 	ip_ioctl_cmd_t *ipip;
9053 	cred_t *cr;
9054 	ip_stack_t	*ipst;
9055 
9056 	if (CONN_Q(q))
9057 		ipst = CONNQ_TO_IPST(q);
9058 	else
9059 		ipst = ILLQ_TO_IPST(q);
9060 
9061 	ipip = ip_sioctl_lookup(iocp->ioc_cmd);
9062 	if (ipip == NULL) {
9063 		/*
9064 		 * The ioctl is not one we understand or own.
9065 		 * Pass it along to be processed down stream,
9066 		 * if this is a module instance of IP, else nak
9067 		 * the ioctl.
9068 		 */
9069 		if (q->q_next == NULL) {
9070 			goto nak;
9071 		} else {
9072 			putnext(q, mp);
9073 			return;
9074 		}
9075 	}
9076 
9077 	/*
9078 	 * If this is deferred, then we will do all the checks when we
9079 	 * come back.
9080 */ 9081 if ((iocp->ioc_cmd == SIOCGDSTINFO || 9082 iocp->ioc_cmd == SIOCGIP6ADDRPOLICY) && !ip6_asp_can_lookup(ipst)) { 9083 ip6_asp_pending_op(q, mp, ip_sioctl_copyin_resume); 9084 return; 9085 } 9086 9087 /* 9088 * Only allow a very small subset of IP ioctls on this stream if 9089 * IP is a module and not a driver. Allowing ioctls to be processed 9090 * in this case may cause assert failures or data corruption. 9091 * Typically G[L]IFFLAGS, SLIFNAME/IF_UNITSEL are the only few 9092 * ioctls allowed on an IP module stream, after which this stream 9093 * normally becomes a multiplexor (at which time the stream head 9094 * will fail all ioctls). 9095 */ 9096 if ((q->q_next != NULL) && !(ipip->ipi_flags & IPI_MODOK)) { 9097 goto nak; 9098 } 9099 9100 /* Make sure we have ioctl data to process. */ 9101 if (mp->b_cont == NULL && !(ipip->ipi_flags & IPI_NULL_BCONT)) 9102 goto nak; 9103 9104 /* 9105 * Prefer dblk credential over ioctl credential; some synthesized 9106 * ioctls have kcred set because there's no way to crhold() 9107 * a credential in some contexts. (ioc_cr is not crfree() by 9108 * the framework; the caller of ioctl needs to hold the reference 9109 * for the duration of the call). 9110 */ 9111 cr = msg_getcred(mp, NULL); 9112 if (cr == NULL) 9113 cr = iocp->ioc_cr; 9114 9115 /* Make sure normal users don't send down privileged ioctls */ 9116 if ((ipip->ipi_flags & IPI_PRIV) && 9117 (cr != NULL) && secpolicy_ip_config(cr, B_TRUE) != 0) { 9118 /* We checked the privilege earlier but log it here */ 9119 miocnak(q, mp, 0, secpolicy_ip_config(cr, B_FALSE)); 9120 return; 9121 } 9122 9123 /* 9124 * The ioctl command tables can only encode fixed length 9125 * ioctl data. If the length is variable, the table will 9126 * encode the length as zero. Such special cases are handled 9127 * below in the switch. 9128 */ 9129 if (ipip->ipi_copyin_size != 0) { 9130 mi_copyin(q, mp, NULL, ipip->ipi_copyin_size); 9131 return; 9132 } 9133 9134 switch (iocp->ioc_cmd) { 9135 case O_SIOCGIFCONF: 9136 case SIOCGIFCONF: 9137 /* 9138 * This IOCTL is hilarious. See comments in 9139 * ip_sioctl_get_ifconf for the story. 9140 */ 9141 if (iocp->ioc_count == TRANSPARENT) 9142 copyin_size = SIZEOF_STRUCT(ifconf, 9143 iocp->ioc_flag); 9144 else 9145 copyin_size = iocp->ioc_count; 9146 mi_copyin(q, mp, NULL, copyin_size); 9147 return; 9148 9149 case O_SIOCGLIFCONF: 9150 case SIOCGLIFCONF: 9151 copyin_size = SIZEOF_STRUCT(lifconf, iocp->ioc_flag); 9152 mi_copyin(q, mp, NULL, copyin_size); 9153 return; 9154 9155 case SIOCGLIFSRCOF: 9156 copyin_size = SIZEOF_STRUCT(lifsrcof, iocp->ioc_flag); 9157 mi_copyin(q, mp, NULL, copyin_size); 9158 return; 9159 9160 case SIOCGIP6ADDRPOLICY: 9161 ip_sioctl_ip6addrpolicy(q, mp); 9162 ip6_asp_table_refrele(ipst); 9163 return; 9164 9165 case SIOCSIP6ADDRPOLICY: 9166 ip_sioctl_ip6addrpolicy(q, mp); 9167 return; 9168 9169 case SIOCGDSTINFO: 9170 ip_sioctl_dstinfo(q, mp); 9171 ip6_asp_table_refrele(ipst); 9172 return; 9173 9174 case ND_SET: 9175 case ND_GET: 9176 ip_process_legacy_nddprop(q, mp); 9177 return; 9178 9179 case SIOCSETPROP: 9180 case SIOCGETPROP: 9181 ip_sioctl_getsetprop(q, mp); 9182 return; 9183 9184 case I_PLINK: 9185 case I_PUNLINK: 9186 case I_LINK: 9187 case I_UNLINK: 9188 /* 9189 * We treat non-persistent link similarly as the persistent 9190 * link case, in terms of plumbing/unplumbing, as well as 9191 * dynamic re-plumbing events indicator. See comments 9192 * in ip_sioctl_plink() for more. 
9193 * 9194 * Request can be enqueued in the 'ipsq' while waiting 9195 * to become exclusive. So bump up the conn ref. 9196 */ 9197 if (CONN_Q(q)) { 9198 CONN_INC_REF(Q_TO_CONN(q)); 9199 CONN_INC_IOCTLREF(Q_TO_CONN(q)) 9200 } 9201 ip_sioctl_plink(NULL, q, mp, NULL); 9202 return; 9203 9204 case IP_IOCTL: 9205 ip_wput_ioctl(q, mp); 9206 return; 9207 9208 case SIOCILB: 9209 /* The ioctl length varies depending on the ILB command. */ 9210 copyin_size = iocp->ioc_count; 9211 if (copyin_size < sizeof (ilb_cmd_t)) 9212 goto nak; 9213 mi_copyin(q, mp, NULL, copyin_size); 9214 return; 9215 9216 default: 9217 cmn_err(CE_PANIC, "should not happen "); 9218 } 9219 nak: 9220 if (mp->b_cont != NULL) { 9221 freemsg(mp->b_cont); 9222 mp->b_cont = NULL; 9223 } 9224 iocp->ioc_error = EINVAL; 9225 mp->b_datap->db_type = M_IOCNAK; 9226 iocp->ioc_count = 0; 9227 qreply(q, mp); 9228 } 9229 9230 static void 9231 ip_sioctl_garp_reply(mblk_t *mp, ill_t *ill, void *hwaddr, int flags) 9232 { 9233 struct arpreq *ar; 9234 struct xarpreq *xar; 9235 mblk_t *tmp; 9236 struct iocblk *iocp; 9237 int x_arp_ioctl = B_FALSE; 9238 int *flagsp; 9239 char *storage = NULL; 9240 9241 ASSERT(ill != NULL); 9242 9243 iocp = (struct iocblk *)mp->b_rptr; 9244 ASSERT(iocp->ioc_cmd == SIOCGXARP || iocp->ioc_cmd == SIOCGARP); 9245 9246 tmp = (mp->b_cont)->b_cont; /* xarpreq/arpreq */ 9247 if ((iocp->ioc_cmd == SIOCGXARP) || 9248 (iocp->ioc_cmd == SIOCSXARP)) { 9249 x_arp_ioctl = B_TRUE; 9250 xar = (struct xarpreq *)tmp->b_rptr; 9251 flagsp = &xar->xarp_flags; 9252 storage = xar->xarp_ha.sdl_data; 9253 } else { 9254 ar = (struct arpreq *)tmp->b_rptr; 9255 flagsp = &ar->arp_flags; 9256 storage = ar->arp_ha.sa_data; 9257 } 9258 9259 /* 9260 * We're done if this is not an SIOCG{X}ARP 9261 */ 9262 if (x_arp_ioctl) { 9263 storage += ill_xarp_info(&xar->xarp_ha, ill); 9264 if ((ill->ill_phys_addr_length + ill->ill_name_length) > 9265 sizeof (xar->xarp_ha.sdl_data)) { 9266 iocp->ioc_error = EINVAL; 9267 return; 9268 } 9269 } 9270 *flagsp = ATF_INUSE; 9271 /* 9272 * If /sbin/arp told us we are the authority using the "permanent" 9273 * flag, or if this is one of my addresses print "permanent" 9274 * in the /sbin/arp output. 9275 */ 9276 if ((flags & NCE_F_MYADDR) || (flags & NCE_F_AUTHORITY)) 9277 *flagsp |= ATF_AUTHORITY; 9278 if (flags & NCE_F_NONUD) 9279 *flagsp |= ATF_PERM; /* not subject to aging */ 9280 if (flags & NCE_F_PUBLISH) 9281 *flagsp |= ATF_PUBL; 9282 if (hwaddr != NULL) { 9283 *flagsp |= ATF_COM; 9284 bcopy((char *)hwaddr, storage, ill->ill_phys_addr_length); 9285 } 9286 } 9287 9288 /* 9289 * Create a new logical interface. If ipif_id is zero (i.e. not a logical 9290 * interface) create the next available logical interface for this 9291 * physical interface. 9292 * If ipif is NULL (i.e. the lookup didn't find one) attempt to create an 9293 * ipif with the specified name. 9294 * 9295 * If the address family is not AF_UNSPEC then set the address as well. 9296 * 9297 * If ip_sioctl_addr returns EINPROGRESS then the ioctl (the copyout) 9298 * is completed when the DL_BIND_ACK arrive in ip_rput_dlpi_writer. 9299 * 9300 * Executed as a writer on the ill. 9301 * So no lock is needed to traverse the ipif chain, or examine the 9302 * phyint flags. 
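 *
 * Illustration only -- a hypothetical userland caller ("net0" is an
 * assumed interface name):
 *
 *	struct lifreq lifr;
 *
 *	bzero(&lifr, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, "net0", sizeof (lifr.lifr_name));
 *	lifr.lifr_addr.ss_family = AF_UNSPEC;	/- no address yet -/
 *	if (ioctl(s, SIOCLIFADDIF, &lifr) == 0)
 *		lifr.lifr_name now holds the new name, e.g. "net0:1"
 *
 * Passing "net0:3" instead requests that specific unit and fails with
 * EEXIST if it is already plumbed (checked below).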
9303  */
9304 /* ARGSUSED */
9305 int
9306 ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
9307     ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq)
9308 {
9309 	mblk_t	*mp1;
9310 	struct lifreq *lifr;
9311 	boolean_t	isv6;
9312 	boolean_t	exists;
9313 	char	*name;
9314 	char	*endp;
9315 	char	*cp;
9316 	int	namelen;
9317 	ipif_t	*ipif;
9318 	long	id;
9319 	ipsq_t	*ipsq;
9320 	ill_t	*ill;
9321 	sin_t	*sin;
9322 	int	err = 0;
9323 	boolean_t found_sep = B_FALSE;
9324 	conn_t	*connp;
9325 	zoneid_t zoneid;
9326 	ip_stack_t *ipst = CONNQ_TO_IPST(q);
9327 
9328 	ASSERT(q->q_next == NULL);
9329 	ip1dbg(("ip_sioctl_addif\n"));
9330 	/* Existence of mp1 has been checked in ip_wput_nondata */
9331 	mp1 = mp->b_cont->b_cont;
9332 	/*
9333 	 * Null terminate the string to protect against buffer
9334 	 * overrun.  String was generated by user code and may not
9335 	 * be trusted.
9336 	 */
9337 	lifr = (struct lifreq *)mp1->b_rptr;
9338 	lifr->lifr_name[LIFNAMSIZ - 1] = '\0';
9339 	name = lifr->lifr_name;
9340 	ASSERT(CONN_Q(q));
9341 	connp = Q_TO_CONN(q);
9342 	isv6 = (connp->conn_family == AF_INET6);
9343 	zoneid = connp->conn_zoneid;
9344 	namelen = mi_strlen(name);
9345 	if (namelen == 0)
9346 		return (EINVAL);
9347 
9348 	exists = B_FALSE;
9349 	if ((namelen + 1 == sizeof (ipif_loopback_name)) &&
9350 	    (mi_strcmp(name, ipif_loopback_name) == 0)) {
9351 		/*
9352 		 * Allow creating lo0 using SIOCLIFADDIF.  There can't be
9353 		 * any other writer thread, so the ipif_lookup_on_name()
9354 		 * below is safe.
9355 		 */
9356 		ipif = ipif_lookup_on_name(lifr->lifr_name, namelen, B_TRUE,
9357 		    &exists, isv6, zoneid, ipst);
9358 		/* Prevent any further action */
9359 		if (ipif == NULL) {
9360 			return (ENOBUFS);
9361 		} else if (!exists) {
9362 			/* We created the ipif now and as writer */
9363 			ipif_refrele(ipif);
9364 			return (0);
9365 		} else {
9366 			ill = ipif->ipif_ill;
9367 			ill_refhold(ill);
9368 			ipif_refrele(ipif);
9369 		}
9370 	} else {
9371 		/* Look for a colon in the name. */
9372 		endp = &name[namelen];
9373 		for (cp = endp; --cp > name; ) {
9374 			if (*cp == IPIF_SEPARATOR_CHAR) {
9375 				found_sep = B_TRUE;
9376 				/*
9377 				 * Reject any non-decimal aliases for plumbing
9378 				 * of logical interfaces.  Aliases with leading
9379 				 * zeroes are also rejected as they introduce
9380 				 * ambiguity in the naming of the interfaces.
9381 				 * Comparing with "0" takes care of all such
9382 				 * cases.
9383 				 */
9384 				if ((strncmp("0", cp+1, 1)) == 0)
9385 					return (EINVAL);
9386 
9387 				if (ddi_strtol(cp+1, &endp, 10, &id) != 0 ||
9388 				    id <= 0 || *endp != '\0') {
9389 					return (EINVAL);
9390 				}
9391 				*cp = '\0';
9392 				break;
9393 			}
9394 		}
9395 		ill = ill_lookup_on_name(name, B_FALSE, isv6, NULL, ipst);
9396 		if (found_sep)
9397 			*cp = IPIF_SEPARATOR_CHAR;
9398 		if (ill == NULL)
9399 			return (ENXIO);
9400 	}
9401 
9402 	ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_process_ioctl, NEW_OP,
9403 	    B_TRUE);
9404 
9405 	/*
9406 	 * Release the refhold due to the lookup, now that we are excl
9407 	 * or we are just returning
9408 	 */
9409 	ill_refrele(ill);
9410 
9411 	if (ipsq == NULL)
9412 		return (EINPROGRESS);
9413 
9414 	/* We are now exclusive on the IPSQ */
9415 	ASSERT(IAM_WRITER_ILL(ill));
9416 
9417 	if (found_sep) {
9418 		/* Now see if there is an IPIF with this unit number. */
9419 		for (ipif = ill->ill_ipif; ipif != NULL;
9420 		    ipif = ipif->ipif_next) {
9421 			if (ipif->ipif_id == id) {
9422 				err = EEXIST;
9423 				goto done;
9424 			}
9425 		}
9426 	}
9427 
9428 	/*
9429 	 * We use IRE_LOCAL for lo0:1 etc. for "receive only" use
9430 	 * of lo0.  Plumbing for lo0:0 happens in ipif_lookup_on_name()
9431 	 * instead.
9432 */ 9433 if ((ipif = ipif_allocate(ill, found_sep ? id : -1, IRE_LOCAL, 9434 B_TRUE, B_TRUE, &err)) == NULL) { 9435 goto done; 9436 } 9437 9438 /* Return created name with ioctl */ 9439 (void) sprintf(lifr->lifr_name, "%s%c%d", ill->ill_name, 9440 IPIF_SEPARATOR_CHAR, ipif->ipif_id); 9441 ip1dbg(("created %s\n", lifr->lifr_name)); 9442 9443 /* Set address */ 9444 sin = (sin_t *)&lifr->lifr_addr; 9445 if (sin->sin_family != AF_UNSPEC) { 9446 err = ip_sioctl_addr(ipif, sin, q, mp, 9447 &ip_ndx_ioctl_table[SIOCLIFADDR_NDX], lifr); 9448 } 9449 9450 done: 9451 ipsq_exit(ipsq); 9452 return (err); 9453 } 9454 9455 /* 9456 * Remove an existing logical interface. If ipif_id is zero (i.e. not a logical 9457 * interface) delete it based on the IP address (on this physical interface). 9458 * Otherwise delete it based on the ipif_id. 9459 * Also, special handling to allow a removeif of lo0. 9460 */ 9461 /* ARGSUSED */ 9462 int 9463 ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9464 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 9465 { 9466 conn_t *connp; 9467 ill_t *ill = ipif->ipif_ill; 9468 boolean_t success; 9469 ip_stack_t *ipst; 9470 9471 ipst = CONNQ_TO_IPST(q); 9472 9473 ASSERT(q->q_next == NULL); 9474 ip1dbg(("ip_sioctl_remove_if(%s:%u %p)\n", 9475 ill->ill_name, ipif->ipif_id, (void *)ipif)); 9476 ASSERT(IAM_WRITER_IPIF(ipif)); 9477 9478 connp = Q_TO_CONN(q); 9479 /* 9480 * Special case for unplumbing lo0 (the loopback physical interface). 9481 * If unplumbing lo0, the incoming address structure has been 9482 * initialized to all zeros. When unplumbing lo0, all its logical 9483 * interfaces must be removed too. 9484 * 9485 * Note that this interface may be called to remove a specific 9486 * loopback logical interface (eg, lo0:1). But in that case 9487 * ipif->ipif_id != 0 so that the code path for that case is the 9488 * same as any other interface (meaning it skips the code directly 9489 * below). 9490 */ 9491 if (ipif->ipif_id == 0 && ill->ill_net_type == IRE_LOOPBACK) { 9492 if (sin->sin_family == AF_UNSPEC && 9493 (IN6_IS_ADDR_UNSPECIFIED(&((sin6_t *)sin)->sin6_addr))) { 9494 /* 9495 * Mark it condemned. No new ref. will be made to ill. 
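			 * Every logical instance (lo0:1, lo0:2, ...) is
			 * condemned here along with lo0 itself before
			 * ill_delete() runs, so no lookup can return any
			 * of them once ill_lock is dropped.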
			 */
			mutex_enter(&ill->ill_lock);
			ill->ill_state_flags |= ILL_CONDEMNED;
			for (ipif = ill->ill_ipif; ipif != NULL;
			    ipif = ipif->ipif_next) {
				ipif->ipif_state_flags |= IPIF_CONDEMNED;
			}
			mutex_exit(&ill->ill_lock);

			ipif = ill->ill_ipif;
			/* unplumb the loopback interface */
			ill_delete(ill);
			mutex_enter(&connp->conn_lock);
			mutex_enter(&ill->ill_lock);

			/* Are any references to this ill active? */
			if (ill_is_freeable(ill)) {
				mutex_exit(&ill->ill_lock);
				mutex_exit(&connp->conn_lock);
				ill_delete_tail(ill);
				mi_free(ill);
				return (0);
			}
			success = ipsq_pending_mp_add(connp, ipif,
			    CONNP_TO_WQ(connp), mp, ILL_FREE);
			mutex_exit(&connp->conn_lock);
			mutex_exit(&ill->ill_lock);
			if (success)
				return (EINPROGRESS);
			else
				return (EINTR);
		}
	}

	if (ipif->ipif_id == 0) {
		ipsq_t *ipsq;

		/* Find based on address */
		if (ipif->ipif_isv6) {
			sin6_t *sin6;

			if (sin->sin_family != AF_INET6)
				return (EAFNOSUPPORT);

			sin6 = (sin6_t *)sin;
			/* We are a writer, so we should be able to lookup */
			ipif = ipif_lookup_addr_exact_v6(&sin6->sin6_addr, ill,
			    ipst);
		} else {
			if (sin->sin_family != AF_INET)
				return (EAFNOSUPPORT);

			/* We are a writer, so we should be able to lookup */
			ipif = ipif_lookup_addr_exact(sin->sin_addr.s_addr,
			    ill, ipst);
		}
		if (ipif == NULL) {
			return (EADDRNOTAVAIL);
		}

		/*
		 * It is possible for a user to send an SIOCLIFREMOVEIF with
		 * lifr_name of the physical interface but with an IP address
		 * lifr_addr of a logical interface plumbed over it.
		 * So update ipx_current_ipif now that ipif points to the
		 * correct one.
		 */
		ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq;
		ipsq->ipsq_xop->ipx_current_ipif = ipif;

		/* This is a writer */
		ipif_refrele(ipif);
	}

	/*
	 * Cannot delete instance zero since it is tied to the ill.
	 */
	if (ipif->ipif_id == 0)
		return (EBUSY);

	mutex_enter(&ill->ill_lock);
	ipif->ipif_state_flags |= IPIF_CONDEMNED;
	mutex_exit(&ill->ill_lock);

	ipif_free(ipif);

	mutex_enter(&connp->conn_lock);
	mutex_enter(&ill->ill_lock);

	/* Are any references to this ipif active? */
	if (ipif_is_freeable(ipif)) {
		mutex_exit(&ill->ill_lock);
		mutex_exit(&connp->conn_lock);
		ipif_non_duplicate(ipif);
		(void) ipif_down_tail(ipif);
		ipif_free_tail(ipif); /* frees ipif */
		return (0);
	}
	success = ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), mp,
	    IPIF_FREE);
	mutex_exit(&ill->ill_lock);
	mutex_exit(&connp->conn_lock);
	if (success)
		return (EINPROGRESS);
	else
		return (EINTR);
}

/*
 * Restart the removeif ioctl. The refcnt has gone down to 0.
 * The ipif is already condemned, so it can't be found through lookups.
9607 */ 9608 /* ARGSUSED */ 9609 int 9610 ip_sioctl_removeif_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, 9611 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_if_req) 9612 { 9613 ill_t *ill = ipif->ipif_ill; 9614 9615 ASSERT(IAM_WRITER_IPIF(ipif)); 9616 ASSERT(ipif->ipif_state_flags & IPIF_CONDEMNED); 9617 9618 ip1dbg(("ip_sioctl_removeif_restart(%s:%u %p)\n", 9619 ill->ill_name, ipif->ipif_id, (void *)ipif)); 9620 9621 if (ipif->ipif_id == 0 && ill->ill_net_type == IRE_LOOPBACK) { 9622 ASSERT(ill->ill_state_flags & ILL_CONDEMNED); 9623 ill_delete_tail(ill); 9624 mi_free(ill); 9625 return (0); 9626 } 9627 9628 ipif_non_duplicate(ipif); 9629 (void) ipif_down_tail(ipif); 9630 ipif_free_tail(ipif); 9631 9632 return (0); 9633 } 9634 9635 /* 9636 * Set the local interface address using the given prefix and ill_token. 9637 */ 9638 /* ARGSUSED */ 9639 int 9640 ip_sioctl_prefix(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9641 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq) 9642 { 9643 int err; 9644 in6_addr_t v6addr; 9645 sin6_t *sin6; 9646 ill_t *ill; 9647 int i; 9648 9649 ip1dbg(("ip_sioctl_prefix(%s:%u %p)\n", 9650 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9651 9652 ASSERT(IAM_WRITER_IPIF(ipif)); 9653 9654 if (!ipif->ipif_isv6) 9655 return (EINVAL); 9656 9657 if (sin->sin_family != AF_INET6) 9658 return (EAFNOSUPPORT); 9659 9660 sin6 = (sin6_t *)sin; 9661 v6addr = sin6->sin6_addr; 9662 ill = ipif->ipif_ill; 9663 9664 if (IN6_IS_ADDR_UNSPECIFIED(&v6addr) || 9665 IN6_IS_ADDR_UNSPECIFIED(&ill->ill_token)) 9666 return (EADDRNOTAVAIL); 9667 9668 for (i = 0; i < 4; i++) 9669 sin6->sin6_addr.s6_addr32[i] |= ill->ill_token.s6_addr32[i]; 9670 9671 err = ip_sioctl_addr(ipif, sin, q, mp, 9672 &ip_ndx_ioctl_table[SIOCLIFADDR_NDX], dummy_ifreq); 9673 return (err); 9674 } 9675 9676 /* 9677 * Restart entry point to restart the address set operation after the 9678 * refcounts have dropped to zero. 9679 */ 9680 /* ARGSUSED */ 9681 int 9682 ip_sioctl_prefix_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9683 ip_ioctl_cmd_t *ipip, void *ifreq) 9684 { 9685 ip1dbg(("ip_sioctl_prefix_restart(%s:%u %p)\n", 9686 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9687 return (ip_sioctl_addr_restart(ipif, sin, q, mp, ipip, ifreq)); 9688 } 9689 9690 /* 9691 * Set the local interface address. 9692 * Allow an address of all zero when the interface is down. 9693 */ 9694 /* ARGSUSED */ 9695 int 9696 ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9697 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq) 9698 { 9699 int err = 0; 9700 in6_addr_t v6addr; 9701 boolean_t need_up = B_FALSE; 9702 ill_t *ill; 9703 int i; 9704 9705 ip1dbg(("ip_sioctl_addr(%s:%u %p)\n", 9706 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9707 9708 ASSERT(IAM_WRITER_IPIF(ipif)); 9709 9710 ill = ipif->ipif_ill; 9711 if (ipif->ipif_isv6) { 9712 sin6_t *sin6; 9713 phyint_t *phyi; 9714 9715 if (sin->sin_family != AF_INET6) 9716 return (EAFNOSUPPORT); 9717 9718 sin6 = (sin6_t *)sin; 9719 v6addr = sin6->sin6_addr; 9720 phyi = ill->ill_phyint; 9721 9722 /* 9723 * Enforce that true multicast interfaces have a link-local 9724 * address for logical unit 0. 9725 * 9726 * However for those ipif's for which link-local address was 9727 * not created by default, also allow setting :: as the address. 9728 * This scenario would arise, when we delete an address on ipif 9729 * with logical unit 0, we would want to set :: as the address. 
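		 * For example, on an ill plumbed with ILLF_NOLINKLOCAL,
		 * deleting the only address on logical unit 0 and then
		 * writing :: to it succeeds, whereas the same request
		 * fails with EADDRNOTAVAIL when the link-local address
		 * was kernel-created.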
		 */
		if (ipif->ipif_id == 0 &&
		    (ill->ill_flags & ILLF_MULTICAST) &&
		    !(ipif->ipif_flags & (IPIF_POINTOPOINT)) &&
		    !(phyi->phyint_flags & (PHYI_LOOPBACK)) &&
		    !IN6_IS_ADDR_LINKLOCAL(&v6addr)) {

			/*
			 * If the default link-local was not created by the
			 * kernel for this ill, allow setting :: as the
			 * address on ipif:0.
			 */
			if (ill->ill_flags & ILLF_NOLINKLOCAL) {
				if (!IN6_IS_ADDR_UNSPECIFIED(&v6addr))
					return (EADDRNOTAVAIL);
			} else {
				return (EADDRNOTAVAIL);
			}
		}

		/*
		 * Up interfaces shouldn't have the unspecified address
		 * unless they also have the IPIF_NOLOCAL flag set and
		 * have a subnet assigned.
		 */
		if ((ipif->ipif_flags & IPIF_UP) &&
		    IN6_IS_ADDR_UNSPECIFIED(&v6addr) &&
		    (!(ipif->ipif_flags & IPIF_NOLOCAL) ||
		    IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) {
			return (EADDRNOTAVAIL);
		}

		if (!ip_local_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask))
			return (EADDRNOTAVAIL);
	} else {
		ipaddr_t addr;

		if (sin->sin_family != AF_INET)
			return (EAFNOSUPPORT);

		addr = sin->sin_addr.s_addr;

		/* Allow INADDR_ANY as the local address. */
		if (addr != INADDR_ANY &&
		    !ip_addr_ok_v4(addr, ipif->ipif_net_mask))
			return (EADDRNOTAVAIL);

		IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
	}
	/*
	 * Verify that the address being configured is permitted by the
	 * ill_allowed_ips[] for the interface.
	 */
	if (ill->ill_allowed_ips_cnt > 0) {
		for (i = 0; i < ill->ill_allowed_ips_cnt; i++) {
			if (IN6_ARE_ADDR_EQUAL(&ill->ill_allowed_ips[i],
			    &v6addr))
				break;
		}
		if (i == ill->ill_allowed_ips_cnt) {
			pr_addr_dbg("!allowed addr %s\n", AF_INET6, &v6addr);
			return (EPERM);
		}
	}
	/*
	 * Even if there is no change we redo things just to rerun
	 * ipif_set_default.
	 */
	if (ipif->ipif_flags & IPIF_UP) {
		/*
		 * Setting a new local address. Since the interface is
		 * already marked up, call ipif_down first; it takes care
		 * of ditching any IREs (including the net and subnet
		 * bcast ire's) that were set up based on the old address.
		 */
		err = ipif_logical_down(ipif, q, mp);
		if (err == EINPROGRESS)
			return (err);
		(void) ipif_down_tail(ipif);
		need_up = B_TRUE;
	}

	err = ip_sioctl_addr_tail(ipif, sin, q, mp, need_up);
	return (err);
}

int
ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    boolean_t need_up)
{
	in6_addr_t v6addr;
	in6_addr_t ov6addr;
	ipaddr_t addr;
	sin6_t *sin6;
	int sinlen;
	int err = 0;
	ill_t *ill = ipif->ipif_ill;
	boolean_t need_dl_down;
	boolean_t need_arp_down;
	struct iocblk *iocp;

	iocp = (mp != NULL) ?
(struct iocblk *)mp->b_rptr : NULL; 9836 9837 ip1dbg(("ip_sioctl_addr_tail(%s:%u %p)\n", 9838 ill->ill_name, ipif->ipif_id, (void *)ipif)); 9839 ASSERT(IAM_WRITER_IPIF(ipif)); 9840 9841 /* Must cancel any pending timer before taking the ill_lock */ 9842 if (ipif->ipif_recovery_id != 0) 9843 (void) untimeout(ipif->ipif_recovery_id); 9844 ipif->ipif_recovery_id = 0; 9845 9846 if (ipif->ipif_isv6) { 9847 sin6 = (sin6_t *)sin; 9848 v6addr = sin6->sin6_addr; 9849 sinlen = sizeof (struct sockaddr_in6); 9850 } else { 9851 addr = sin->sin_addr.s_addr; 9852 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 9853 sinlen = sizeof (struct sockaddr_in); 9854 } 9855 mutex_enter(&ill->ill_lock); 9856 ov6addr = ipif->ipif_v6lcl_addr; 9857 ipif->ipif_v6lcl_addr = v6addr; 9858 sctp_update_ipif_addr(ipif, ov6addr); 9859 ipif->ipif_addr_ready = 0; 9860 9861 ip_rts_newaddrmsg(RTM_CHGADDR, 0, ipif, RTSQ_DEFAULT); 9862 9863 /* 9864 * If the interface was previously marked as a duplicate, then since 9865 * we've now got a "new" address, it should no longer be considered a 9866 * duplicate -- even if the "new" address is the same as the old one. 9867 * Note that if all ipifs are down, we may have a pending ARP down 9868 * event to handle. This is because we want to recover from duplicates 9869 * and thus delay tearing down ARP until the duplicates have been 9870 * removed or disabled. 9871 */ 9872 need_dl_down = need_arp_down = B_FALSE; 9873 if (ipif->ipif_flags & IPIF_DUPLICATE) { 9874 need_arp_down = !need_up; 9875 ipif->ipif_flags &= ~IPIF_DUPLICATE; 9876 if (--ill->ill_ipif_dup_count == 0 && !need_up && 9877 ill->ill_ipif_up_count == 0 && ill->ill_dl_up) { 9878 need_dl_down = B_TRUE; 9879 } 9880 } 9881 9882 ipif_set_default(ipif); 9883 9884 /* 9885 * If we've just manually set the IPv6 link-local address (0th ipif), 9886 * tag the ill so that future updates to the interface ID don't result 9887 * in this address getting automatically reconfigured from under the 9888 * administrator. 9889 */ 9890 if (ipif->ipif_isv6 && ipif->ipif_id == 0) { 9891 if (iocp == NULL || (iocp->ioc_cmd == SIOCSLIFADDR && 9892 !IN6_IS_ADDR_UNSPECIFIED(&v6addr))) 9893 ill->ill_manual_linklocal = 1; 9894 } 9895 9896 /* 9897 * When publishing an interface address change event, we only notify 9898 * the event listeners of the new address. It is assumed that if they 9899 * actively care about the addresses assigned that they will have 9900 * already discovered the previous address assigned (if there was one.) 9901 * 9902 * Don't attach nic event message for SIOCLIFADDIF ioctl. 9903 */ 9904 if (iocp != NULL && iocp->ioc_cmd != SIOCLIFADDIF) { 9905 ill_nic_event_dispatch(ill, MAP_IPIF_ID(ipif->ipif_id), 9906 NE_ADDRESS_CHANGE, sin, sinlen); 9907 } 9908 9909 mutex_exit(&ill->ill_lock); 9910 9911 if (need_up) { 9912 /* 9913 * Now bring the interface back up. If this 9914 * is the only IPIF for the ILL, ipif_up 9915 * will have to re-bind to the device, so 9916 * we may get back EINPROGRESS, in which 9917 * case, this IOCTL will get completed in 9918 * ip_rput_dlpi when we see the DL_BIND_ACK. 
9919 */ 9920 err = ipif_up(ipif, q, mp); 9921 } else { 9922 /* Perhaps ilgs should use this ill */ 9923 update_conn_ill(NULL, ill->ill_ipst); 9924 } 9925 9926 if (need_dl_down) 9927 ill_dl_down(ill); 9928 9929 if (need_arp_down && !ill->ill_isv6) 9930 (void) ipif_arp_down(ipif); 9931 9932 /* 9933 * The default multicast interface might have changed (for 9934 * instance if the IPv6 scope of the address changed) 9935 */ 9936 ire_increment_multicast_generation(ill->ill_ipst, ill->ill_isv6); 9937 9938 return (err); 9939 } 9940 9941 /* 9942 * Restart entry point to restart the address set operation after the 9943 * refcounts have dropped to zero. 9944 */ 9945 /* ARGSUSED */ 9946 int 9947 ip_sioctl_addr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9948 ip_ioctl_cmd_t *ipip, void *ifreq) 9949 { 9950 ip1dbg(("ip_sioctl_addr_restart(%s:%u %p)\n", 9951 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9952 ASSERT(IAM_WRITER_IPIF(ipif)); 9953 (void) ipif_down_tail(ipif); 9954 return (ip_sioctl_addr_tail(ipif, sin, q, mp, B_TRUE)); 9955 } 9956 9957 /* ARGSUSED */ 9958 int 9959 ip_sioctl_get_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9960 ip_ioctl_cmd_t *ipip, void *if_req) 9961 { 9962 sin6_t *sin6 = (struct sockaddr_in6 *)sin; 9963 struct lifreq *lifr = (struct lifreq *)if_req; 9964 9965 ip1dbg(("ip_sioctl_get_addr(%s:%u %p)\n", 9966 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9967 /* 9968 * The net mask and address can't change since we have a 9969 * reference to the ipif. So no lock is necessary. 9970 */ 9971 if (ipif->ipif_isv6) { 9972 *sin6 = sin6_null; 9973 sin6->sin6_family = AF_INET6; 9974 sin6->sin6_addr = ipif->ipif_v6lcl_addr; 9975 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 9976 lifr->lifr_addrlen = 9977 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 9978 } else { 9979 *sin = sin_null; 9980 sin->sin_family = AF_INET; 9981 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 9982 if (ipip->ipi_cmd_type == LIF_CMD) { 9983 lifr->lifr_addrlen = 9984 ip_mask_to_plen(ipif->ipif_net_mask); 9985 } 9986 } 9987 return (0); 9988 } 9989 9990 /* 9991 * Set the destination address for a pt-pt interface. 9992 */ 9993 /* ARGSUSED */ 9994 int 9995 ip_sioctl_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9996 ip_ioctl_cmd_t *ipip, void *if_req) 9997 { 9998 int err = 0; 9999 in6_addr_t v6addr; 10000 boolean_t need_up = B_FALSE; 10001 10002 ip1dbg(("ip_sioctl_dstaddr(%s:%u %p)\n", 10003 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10004 ASSERT(IAM_WRITER_IPIF(ipif)); 10005 10006 if (ipif->ipif_isv6) { 10007 sin6_t *sin6; 10008 10009 if (sin->sin_family != AF_INET6) 10010 return (EAFNOSUPPORT); 10011 10012 sin6 = (sin6_t *)sin; 10013 v6addr = sin6->sin6_addr; 10014 10015 if (!ip_remote_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask)) 10016 return (EADDRNOTAVAIL); 10017 } else { 10018 ipaddr_t addr; 10019 10020 if (sin->sin_family != AF_INET) 10021 return (EAFNOSUPPORT); 10022 10023 addr = sin->sin_addr.s_addr; 10024 if (addr != INADDR_ANY && 10025 !ip_addr_ok_v4(addr, ipif->ipif_net_mask)) { 10026 return (EADDRNOTAVAIL); 10027 } 10028 10029 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 10030 } 10031 10032 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6pp_dst_addr, &v6addr)) 10033 return (0); /* No change */ 10034 10035 if (ipif->ipif_flags & IPIF_UP) { 10036 /* 10037 * If the interface is already marked up, 10038 * we call ipif_down which will take care 10039 * of ditching any IREs that have been set 10040 * up based on the old pp dst address. 
10041 */ 10042 err = ipif_logical_down(ipif, q, mp); 10043 if (err == EINPROGRESS) 10044 return (err); 10045 (void) ipif_down_tail(ipif); 10046 need_up = B_TRUE; 10047 } 10048 /* 10049 * could return EINPROGRESS. If so ioctl will complete in 10050 * ip_rput_dlpi_writer 10051 */ 10052 err = ip_sioctl_dstaddr_tail(ipif, sin, q, mp, need_up); 10053 return (err); 10054 } 10055 10056 static int 10057 ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10058 boolean_t need_up) 10059 { 10060 in6_addr_t v6addr; 10061 ill_t *ill = ipif->ipif_ill; 10062 int err = 0; 10063 boolean_t need_dl_down; 10064 boolean_t need_arp_down; 10065 10066 ip1dbg(("ip_sioctl_dstaddr_tail(%s:%u %p)\n", ill->ill_name, 10067 ipif->ipif_id, (void *)ipif)); 10068 10069 /* Must cancel any pending timer before taking the ill_lock */ 10070 if (ipif->ipif_recovery_id != 0) 10071 (void) untimeout(ipif->ipif_recovery_id); 10072 ipif->ipif_recovery_id = 0; 10073 10074 if (ipif->ipif_isv6) { 10075 sin6_t *sin6; 10076 10077 sin6 = (sin6_t *)sin; 10078 v6addr = sin6->sin6_addr; 10079 } else { 10080 ipaddr_t addr; 10081 10082 addr = sin->sin_addr.s_addr; 10083 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 10084 } 10085 mutex_enter(&ill->ill_lock); 10086 /* Set point to point destination address. */ 10087 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 10088 /* 10089 * Allow this as a means of creating logical 10090 * pt-pt interfaces on top of e.g. an Ethernet. 10091 * XXX Undocumented HACK for testing. 10092 * pt-pt interfaces are created with NUD disabled. 10093 */ 10094 ipif->ipif_flags |= IPIF_POINTOPOINT; 10095 ipif->ipif_flags &= ~IPIF_BROADCAST; 10096 if (ipif->ipif_isv6) 10097 ill->ill_flags |= ILLF_NONUD; 10098 } 10099 10100 /* 10101 * If the interface was previously marked as a duplicate, then since 10102 * we've now got a "new" address, it should no longer be considered a 10103 * duplicate -- even if the "new" address is the same as the old one. 10104 * Note that if all ipifs are down, we may have a pending ARP down 10105 * event to handle. 10106 */ 10107 need_dl_down = need_arp_down = B_FALSE; 10108 if (ipif->ipif_flags & IPIF_DUPLICATE) { 10109 need_arp_down = !need_up; 10110 ipif->ipif_flags &= ~IPIF_DUPLICATE; 10111 if (--ill->ill_ipif_dup_count == 0 && !need_up && 10112 ill->ill_ipif_up_count == 0 && ill->ill_dl_up) { 10113 need_dl_down = B_TRUE; 10114 } 10115 } 10116 10117 /* 10118 * If we've just manually set the IPv6 destination link-local address 10119 * (0th ipif), tag the ill so that future updates to the destination 10120 * interface ID (as can happen with interfaces over IP tunnels) don't 10121 * result in this address getting automatically reconfigured from 10122 * under the administrator. 10123 */ 10124 if (ipif->ipif_isv6 && ipif->ipif_id == 0) 10125 ill->ill_manual_dst_linklocal = 1; 10126 10127 /* Set the new address. */ 10128 ipif->ipif_v6pp_dst_addr = v6addr; 10129 /* Make sure subnet tracks pp_dst */ 10130 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 10131 mutex_exit(&ill->ill_lock); 10132 10133 if (need_up) { 10134 /* 10135 * Now bring the interface back up. If this 10136 * is the only IPIF for the ILL, ipif_up 10137 * will have to re-bind to the device, so 10138 * we may get back EINPROGRESS, in which 10139 * case, this IOCTL will get completed in 10140 * ip_rput_dlpi when we see the DL_BIND_ACK. 
		 */
		err = ipif_up(ipif, q, mp);
	}

	if (need_dl_down)
		ill_dl_down(ill);
	if (need_arp_down && !ipif->ipif_isv6)
		(void) ipif_arp_down(ipif);

	return (err);
}

/*
 * Restart entry point to restart the dstaddress set operation after the
 * refcounts have dropped to zero.
 */
/* ARGSUSED */
int
ip_sioctl_dstaddr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *ifreq)
{
	ip1dbg(("ip_sioctl_dstaddr_restart(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
	(void) ipif_down_tail(ipif);
	return (ip_sioctl_dstaddr_tail(ipif, sin, q, mp, B_TRUE));
}

/* ARGSUSED */
int
ip_sioctl_get_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	sin6_t *sin6 = (struct sockaddr_in6 *)sin;

	ip1dbg(("ip_sioctl_get_dstaddr(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
	/*
	 * Get point to point destination address. The addresses can't
	 * change since we hold a reference to the ipif.
	 */
	if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0)
		return (EADDRNOTAVAIL);

	if (ipif->ipif_isv6) {
		ASSERT(ipip->ipi_cmd_type == LIF_CMD);
		*sin6 = sin6_null;
		sin6->sin6_family = AF_INET6;
		sin6->sin6_addr = ipif->ipif_v6pp_dst_addr;
	} else {
		*sin = sin_null;
		sin->sin_family = AF_INET;
		sin->sin_addr.s_addr = ipif->ipif_pp_dst_addr;
	}
	return (0);
}

/*
 * Check which flags will change when the given flags are set;
 * silently ignore flags which userland is not allowed to control.
 * (Because these flags may change between SIOCGLIFFLAGS and
 * SIOCSLIFFLAGS, and that's outside of userland's control,
 * we need to silently ignore them rather than fail.)
 */
static void
ip_sioctl_flags_onoff(ipif_t *ipif, uint64_t flags, uint64_t *onp,
    uint64_t *offp)
{
	ill_t *ill = ipif->ipif_ill;
	phyint_t *phyi = ill->ill_phyint;
	uint64_t cantchange_flags, intf_flags;
	uint64_t turn_on, turn_off;

	intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags;
	cantchange_flags = IFF_CANTCHANGE;
	if (IS_IPMP(ill))
		cantchange_flags |= IFF_IPMP_CANTCHANGE;
	turn_on = (flags ^ intf_flags) & ~cantchange_flags;
	turn_off = intf_flags & turn_on;
	turn_on ^= turn_off;
	*onp = turn_on;
	*offp = turn_off;
}

/*
 * Set interface flags. Many flags require special handling (e.g.,
 * bringing the interface down); see below for details.
 *
 * NOTE: We really don't enforce that ipif_id zero should be used
 * for setting any flags other than IFF_LOGINT_FLAGS. This is
 * because applications generally do SIOCGLIFFLAGS, OR in the new
 * flags (those that affect the logical interface), and then do a
 * SIOCSLIFFLAGS. Thus, "flags" below could contain bits other
 * than IFF_LOGINT_FLAGS. One could check whether "turn_on" -- the
 * flags that will be turned on -- is correct with respect to
 * ipif_id 0; for backward compatibility reasons, it is not done.
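 *
 * A minimal sketch of that userland read-modify-write pattern
 * (illustrative only; the interface name and error handling are
 * assumptions):
 *
 *	struct lifreq lifr;
 *	int s = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	bzero(&lifr, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, "bge0", sizeof (lifr.lifr_name));
 *	if (ioctl(s, SIOCGLIFFLAGS, &lifr) == 0) {
 *		lifr.lifr_flags |= IFF_UP;
 *		(void) ioctl(s, SIOCSLIFFLAGS, &lifr);
 *	}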
10236 */ 10237 /* ARGSUSED */ 10238 int 10239 ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10240 ip_ioctl_cmd_t *ipip, void *if_req) 10241 { 10242 uint64_t turn_on; 10243 uint64_t turn_off; 10244 int err = 0; 10245 phyint_t *phyi; 10246 ill_t *ill; 10247 conn_t *connp; 10248 uint64_t intf_flags; 10249 boolean_t phyint_flags_modified = B_FALSE; 10250 uint64_t flags; 10251 struct ifreq *ifr; 10252 struct lifreq *lifr; 10253 boolean_t set_linklocal = B_FALSE; 10254 10255 ip1dbg(("ip_sioctl_flags(%s:%u %p)\n", 10256 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10257 10258 ASSERT(IAM_WRITER_IPIF(ipif)); 10259 10260 ill = ipif->ipif_ill; 10261 phyi = ill->ill_phyint; 10262 10263 if (ipip->ipi_cmd_type == IF_CMD) { 10264 ifr = (struct ifreq *)if_req; 10265 flags = (uint64_t)(ifr->ifr_flags & 0x0000ffff); 10266 } else { 10267 lifr = (struct lifreq *)if_req; 10268 flags = lifr->lifr_flags; 10269 } 10270 10271 intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags; 10272 10273 /* 10274 * Have the flags been set correctly until now? 10275 */ 10276 ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0); 10277 ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0); 10278 ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0); 10279 /* 10280 * Compare the new flags to the old, and partition 10281 * into those coming on and those going off. 10282 * For the 16 bit command keep the bits above bit 16 unchanged. 10283 */ 10284 if (ipip->ipi_cmd == SIOCSIFFLAGS) 10285 flags |= intf_flags & ~0xFFFF; 10286 10287 /* 10288 * Explicitly fail attempts to change flags that are always invalid on 10289 * an IPMP meta-interface. 10290 */ 10291 if (IS_IPMP(ill) && ((flags ^ intf_flags) & IFF_IPMP_INVALID)) 10292 return (EINVAL); 10293 10294 ip_sioctl_flags_onoff(ipif, flags, &turn_on, &turn_off); 10295 if ((turn_on|turn_off) == 0) 10296 return (0); /* No change */ 10297 10298 /* 10299 * All test addresses must be IFF_DEPRECATED (to ensure source address 10300 * selection avoids them) -- so force IFF_DEPRECATED on, and do not 10301 * allow it to be turned off. 10302 */ 10303 if ((turn_off & (IFF_DEPRECATED|IFF_NOFAILOVER)) == IFF_DEPRECATED && 10304 (turn_on|intf_flags) & IFF_NOFAILOVER) 10305 return (EINVAL); 10306 10307 if ((connp = Q_TO_CONN(q)) == NULL) 10308 return (EINVAL); 10309 10310 /* 10311 * Only vrrp control socket is allowed to change IFF_UP and 10312 * IFF_NOACCEPT flags when IFF_VRRP is set. 10313 */ 10314 if ((intf_flags & IFF_VRRP) && ((turn_off | turn_on) & IFF_UP)) { 10315 if (!connp->conn_isvrrp) 10316 return (EINVAL); 10317 } 10318 10319 /* 10320 * The IFF_NOACCEPT flag can only be set on an IFF_VRRP IP address by 10321 * VRRP control socket. 10322 */ 10323 if ((turn_off | turn_on) & IFF_NOACCEPT) { 10324 if (!connp->conn_isvrrp || !(intf_flags & IFF_VRRP)) 10325 return (EINVAL); 10326 } 10327 10328 if (turn_on & IFF_NOFAILOVER) { 10329 turn_on |= IFF_DEPRECATED; 10330 flags |= IFF_DEPRECATED; 10331 } 10332 10333 /* 10334 * On underlying interfaces, only allow applications to manage test 10335 * addresses -- otherwise, they may get confused when the address 10336 * moves as part of being brought up. Likewise, prevent an 10337 * application-managed test address from being converted to a data 10338 * address. To prevent migration of administratively up addresses in 10339 * the kernel, we don't allow them to be converted either. 
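	 * (In practice the application managing test addresses is
	 * expected to be in.mpathd, which marks its probe addresses
	 * IFF_NOFAILOVER; that is an observation about usage, not a
	 * requirement enforced below.)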
10340 */ 10341 if (IS_UNDER_IPMP(ill)) { 10342 const uint64_t appflags = IFF_DHCPRUNNING | IFF_ADDRCONF; 10343 10344 if ((turn_on & appflags) && !(flags & IFF_NOFAILOVER)) 10345 return (EINVAL); 10346 10347 if ((turn_off & IFF_NOFAILOVER) && 10348 (flags & (appflags | IFF_UP | IFF_DUPLICATE))) 10349 return (EINVAL); 10350 } 10351 10352 /* 10353 * Only allow IFF_TEMPORARY flag to be set on 10354 * IPv6 interfaces. 10355 */ 10356 if ((turn_on & IFF_TEMPORARY) && !(ipif->ipif_isv6)) 10357 return (EINVAL); 10358 10359 /* 10360 * cannot turn off IFF_NOXMIT on VNI interfaces. 10361 */ 10362 if ((turn_off & IFF_NOXMIT) && IS_VNI(ipif->ipif_ill)) 10363 return (EINVAL); 10364 10365 /* 10366 * Don't allow the IFF_ROUTER flag to be turned on on loopback 10367 * interfaces. It makes no sense in that context. 10368 */ 10369 if ((turn_on & IFF_ROUTER) && (phyi->phyint_flags & PHYI_LOOPBACK)) 10370 return (EINVAL); 10371 10372 /* 10373 * For IPv6 ipif_id 0, don't allow the interface to be up without 10374 * a link local address if IFF_NOLOCAL or IFF_ANYCAST are not set. 10375 * If the link local address isn't set, and can be set, it will get 10376 * set later on in this function. 10377 */ 10378 if (ipif->ipif_id == 0 && ipif->ipif_isv6 && 10379 (flags & IFF_UP) && !(flags & (IFF_NOLOCAL|IFF_ANYCAST)) && 10380 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) { 10381 if (ipif_cant_setlinklocal(ipif)) 10382 return (EINVAL); 10383 set_linklocal = B_TRUE; 10384 } 10385 10386 /* 10387 * If we modify physical interface flags, we'll potentially need to 10388 * send up two routing socket messages for the changes (one for the 10389 * IPv4 ill, and another for the IPv6 ill). Note that here. 10390 */ 10391 if ((turn_on|turn_off) & IFF_PHYINT_FLAGS) 10392 phyint_flags_modified = B_TRUE; 10393 10394 /* 10395 * All functioning PHYI_STANDBY interfaces start life PHYI_INACTIVE 10396 * (otherwise, we'd immediately use them, defeating standby). Also, 10397 * since PHYI_INACTIVE has a separate meaning when PHYI_STANDBY is not 10398 * set, don't allow PHYI_STANDBY to be set if PHYI_INACTIVE is already 10399 * set, and clear PHYI_INACTIVE if PHYI_STANDBY is being cleared. We 10400 * also don't allow PHYI_STANDBY if VNI is enabled since its semantics 10401 * will not be honored. 10402 */ 10403 if (turn_on & PHYI_STANDBY) { 10404 /* 10405 * No need to grab ill_g_usesrc_lock here; see the 10406 * synchronization notes in ip.c. 10407 */ 10408 if (ill->ill_usesrc_grp_next != NULL || 10409 intf_flags & PHYI_INACTIVE) 10410 return (EINVAL); 10411 if (!(flags & PHYI_FAILED)) { 10412 flags |= PHYI_INACTIVE; 10413 turn_on |= PHYI_INACTIVE; 10414 } 10415 } 10416 10417 if (turn_off & PHYI_STANDBY) { 10418 flags &= ~PHYI_INACTIVE; 10419 turn_off |= PHYI_INACTIVE; 10420 } 10421 10422 /* 10423 * PHYI_FAILED and PHYI_INACTIVE are mutually exclusive; fail if both 10424 * would end up on. 10425 */ 10426 if ((flags & (PHYI_FAILED | PHYI_INACTIVE)) == 10427 (PHYI_FAILED | PHYI_INACTIVE)) 10428 return (EINVAL); 10429 10430 /* 10431 * If ILLF_ROUTER changes, we need to change the ip forwarding 10432 * status of the interface. 10433 */ 10434 if ((turn_on | turn_off) & ILLF_ROUTER) { 10435 err = ill_forward_set(ill, ((turn_on & ILLF_ROUTER) != 0)); 10436 if (err != 0) 10437 return (err); 10438 } 10439 10440 /* 10441 * If the interface is not UP and we are not going to 10442 * bring it UP, record the flags and return. When the 10443 * interface comes UP later, the right actions will be 10444 * taken. 
	 */
	if (!(ipif->ipif_flags & IPIF_UP) &&
	    !(turn_on & IPIF_UP)) {
		/* Record new flags in their respective places. */
		mutex_enter(&ill->ill_lock);
		mutex_enter(&ill->ill_phyint->phyint_lock);
		ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS);
		ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS);
		ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS);
		ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS);
		phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS);
		phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS);
		mutex_exit(&ill->ill_lock);
		mutex_exit(&ill->ill_phyint->phyint_lock);

		/*
		 * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the
		 * same to the kernel: if any of them has been set by
		 * userland, the interface cannot be used for data traffic.
		 */
		if ((turn_on|turn_off) &
		    (PHYI_FAILED | PHYI_INACTIVE | PHYI_OFFLINE)) {
			ASSERT(!IS_IPMP(ill));
			/*
			 * It's possible the ill is part of an "anonymous"
			 * IPMP group rather than a real group. In that case,
			 * there are no other interfaces in the group and thus
			 * no need to call ipmp_phyint_refresh_active().
			 */
			if (IS_UNDER_IPMP(ill))
				ipmp_phyint_refresh_active(phyi);
		}

		if (phyint_flags_modified) {
			if (phyi->phyint_illv4 != NULL) {
				ip_rts_ifmsg(phyi->phyint_illv4->
				    ill_ipif, RTSQ_DEFAULT);
			}
			if (phyi->phyint_illv6 != NULL) {
				ip_rts_ifmsg(phyi->phyint_illv6->
				    ill_ipif, RTSQ_DEFAULT);
			}
		}
		/* The default multicast interface might have changed */
		ire_increment_multicast_generation(ill->ill_ipst,
		    ill->ill_isv6);

		return (0);
	} else if (set_linklocal) {
		mutex_enter(&ill->ill_lock);
		ipif->ipif_state_flags |= IPIF_SET_LINKLOCAL;
		mutex_exit(&ill->ill_lock);
	}

	/*
	 * Disallow IPv6 interfaces coming up that have the unspecified
	 * address, or point-to-point interfaces with an unspecified
	 * destination. We do allow the address to be unspecified for
	 * IPIF_NOLOCAL interfaces that have a subnet assigned, which is
	 * how in.ndpd currently manages its onlink prefix list when no
	 * addresses are configured with those prefixes.
	 */
	if (ipif->ipif_isv6 &&
	    ((IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) &&
	    ((!(ipif->ipif_flags & IPIF_NOLOCAL) &&
	    !(turn_on & IPIF_NOLOCAL)) ||
	    IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) ||
	    ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
	    IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6pp_dst_addr)))) {
		return (EINVAL);
	}

	/*
	 * Prevent IPv4 point-to-point interfaces with a 0.0.0.0 destination
	 * from being brought up.
	 */
	if (!ipif->ipif_isv6 &&
	    ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
	    ipif->ipif_pp_dst_addr == INADDR_ANY)) {
		return (EINVAL);
	}

	/*
	 * If we are going to change one or more of the flags that are
	 * IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, IPIF_NOLOCAL, ILLF_NOARP,
	 * ILLF_NONUD, IPIF_PRIVATE, IPIF_ANYCAST, IPIF_PREFERRED, and
	 * IPIF_NOFAILOVER, we will take special action. This is
	 * done by bringing the ipif down, changing the flags and bringing
	 * it back up again. For IPIF_NOFAILOVER, the act of bringing it
	 * back up will trigger the address to be moved.
10535 * 10536 * If we are going to change IFF_NOACCEPT, we need to bring 10537 * all the ipifs down then bring them up again. The act of 10538 * bringing all the ipifs back up will trigger the local 10539 * ires being recreated with "no_accept" set/cleared. 10540 * 10541 * Note that ILLF_NOACCEPT is always set separately from the 10542 * other flags. 10543 */ 10544 if ((turn_on|turn_off) & 10545 (IPIF_UP|IPIF_DEPRECATED|IPIF_NOXMIT|IPIF_NOLOCAL|ILLF_NOARP| 10546 ILLF_NONUD|IPIF_PRIVATE|IPIF_ANYCAST|IPIF_PREFERRED| 10547 IPIF_NOFAILOVER)) { 10548 /* 10549 * ipif_down() will ire_delete bcast ire's for the subnet, 10550 * while the ire_identical_ref tracks the case of IRE_BROADCAST 10551 * entries shared between multiple ipifs on the same subnet. 10552 */ 10553 if (((ipif->ipif_flags | turn_on) & IPIF_UP) && 10554 !(turn_off & IPIF_UP)) { 10555 if (ipif->ipif_flags & IPIF_UP) 10556 ill->ill_logical_down = 1; 10557 turn_on &= ~IPIF_UP; 10558 } 10559 err = ipif_down(ipif, q, mp); 10560 ip1dbg(("ipif_down returns %d err ", err)); 10561 if (err == EINPROGRESS) 10562 return (err); 10563 (void) ipif_down_tail(ipif); 10564 } else if ((turn_on|turn_off) & ILLF_NOACCEPT) { 10565 /* 10566 * If we can quiesce the ill, then continue. If not, then 10567 * ip_sioctl_flags_tail() will be called from 10568 * ipif_ill_refrele_tail(). 10569 */ 10570 ill_down_ipifs(ill, B_TRUE); 10571 10572 mutex_enter(&connp->conn_lock); 10573 mutex_enter(&ill->ill_lock); 10574 if (!ill_is_quiescent(ill)) { 10575 boolean_t success; 10576 10577 success = ipsq_pending_mp_add(connp, ill->ill_ipif, 10578 q, mp, ILL_DOWN); 10579 mutex_exit(&ill->ill_lock); 10580 mutex_exit(&connp->conn_lock); 10581 return (success ? EINPROGRESS : EINTR); 10582 } 10583 mutex_exit(&ill->ill_lock); 10584 mutex_exit(&connp->conn_lock); 10585 } 10586 return (ip_sioctl_flags_tail(ipif, flags, q, mp)); 10587 } 10588 10589 static int 10590 ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp) 10591 { 10592 ill_t *ill; 10593 phyint_t *phyi; 10594 uint64_t turn_on, turn_off; 10595 boolean_t phyint_flags_modified = B_FALSE; 10596 int err = 0; 10597 boolean_t set_linklocal = B_FALSE; 10598 10599 ip1dbg(("ip_sioctl_flags_tail(%s:%u)\n", 10600 ipif->ipif_ill->ill_name, ipif->ipif_id)); 10601 10602 ASSERT(IAM_WRITER_IPIF(ipif)); 10603 10604 ill = ipif->ipif_ill; 10605 phyi = ill->ill_phyint; 10606 10607 ip_sioctl_flags_onoff(ipif, flags, &turn_on, &turn_off); 10608 10609 /* 10610 * IFF_UP is handled separately. 10611 */ 10612 turn_on &= ~IFF_UP; 10613 turn_off &= ~IFF_UP; 10614 10615 if ((turn_on|turn_off) & IFF_PHYINT_FLAGS) 10616 phyint_flags_modified = B_TRUE; 10617 10618 /* 10619 * Now we change the flags. Track current value of 10620 * other flags in their respective places. 
10621 */ 10622 mutex_enter(&ill->ill_lock); 10623 mutex_enter(&phyi->phyint_lock); 10624 ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS); 10625 ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS); 10626 ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS); 10627 ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS); 10628 phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS); 10629 phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS); 10630 if (ipif->ipif_state_flags & IPIF_SET_LINKLOCAL) { 10631 set_linklocal = B_TRUE; 10632 ipif->ipif_state_flags &= ~IPIF_SET_LINKLOCAL; 10633 } 10634 10635 mutex_exit(&ill->ill_lock); 10636 mutex_exit(&phyi->phyint_lock); 10637 10638 if (set_linklocal) 10639 (void) ipif_setlinklocal(ipif); 10640 10641 /* 10642 * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the same to 10643 * the kernel: if any of them has been set by userland, the interface 10644 * cannot be used for data traffic. 10645 */ 10646 if ((turn_on|turn_off) & (PHYI_FAILED | PHYI_INACTIVE | PHYI_OFFLINE)) { 10647 ASSERT(!IS_IPMP(ill)); 10648 /* 10649 * It's possible the ill is part of an "anonymous" IPMP group 10650 * rather than a real group. In that case, there are no other 10651 * interfaces in the group and thus no need for us to call 10652 * ipmp_phyint_refresh_active(). 10653 */ 10654 if (IS_UNDER_IPMP(ill)) 10655 ipmp_phyint_refresh_active(phyi); 10656 } 10657 10658 if ((turn_on|turn_off) & ILLF_NOACCEPT) { 10659 /* 10660 * If the ILLF_NOACCEPT flag is changed, bring up all the 10661 * ipifs that were brought down. 10662 * 10663 * The routing sockets messages are sent as the result 10664 * of ill_up_ipifs(), further, SCTP's IPIF list was updated 10665 * as well. 10666 */ 10667 err = ill_up_ipifs(ill, q, mp); 10668 } else if ((flags & IFF_UP) && !(ipif->ipif_flags & IPIF_UP)) { 10669 /* 10670 * XXX ipif_up really does not know whether a phyint flags 10671 * was modified or not. So, it sends up information on 10672 * only one routing sockets message. As we don't bring up 10673 * the interface and also set PHYI_ flags simultaneously 10674 * it should be okay. 10675 */ 10676 err = ipif_up(ipif, q, mp); 10677 } else { 10678 /* 10679 * Make sure routing socket sees all changes to the flags. 10680 * ipif_up_done* handles this when we use ipif_up. 10681 */ 10682 if (phyint_flags_modified) { 10683 if (phyi->phyint_illv4 != NULL) { 10684 ip_rts_ifmsg(phyi->phyint_illv4-> 10685 ill_ipif, RTSQ_DEFAULT); 10686 } 10687 if (phyi->phyint_illv6 != NULL) { 10688 ip_rts_ifmsg(phyi->phyint_illv6-> 10689 ill_ipif, RTSQ_DEFAULT); 10690 } 10691 } else { 10692 ip_rts_ifmsg(ipif, RTSQ_DEFAULT); 10693 } 10694 /* 10695 * Update the flags in SCTP's IPIF list, ipif_up() will do 10696 * this in need_up case. 10697 */ 10698 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 10699 } 10700 10701 /* The default multicast interface might have changed */ 10702 ire_increment_multicast_generation(ill->ill_ipst, ill->ill_isv6); 10703 return (err); 10704 } 10705 10706 /* 10707 * Restart the flags operation now that the refcounts have dropped to zero. 
 */
/* ARGSUSED */
int
ip_sioctl_flags_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	uint64_t flags;
	struct ifreq *ifr = if_req;
	struct lifreq *lifr = if_req;
	uint64_t turn_on, turn_off;

	ip1dbg(("ip_sioctl_flags_restart(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));

	if (ipip->ipi_cmd_type == IF_CMD) {
		/* cast to uint16_t prevents unwanted sign extension */
		flags = (uint16_t)ifr->ifr_flags;
	} else {
		flags = lifr->lifr_flags;
	}

	/*
	 * If this function call is a result of the ILLF_NOACCEPT flag
	 * change, do not call ipif_down_tail(). See ip_sioctl_flags().
	 */
	ip_sioctl_flags_onoff(ipif, flags, &turn_on, &turn_off);
	if (!((turn_on|turn_off) & ILLF_NOACCEPT))
		(void) ipif_down_tail(ipif);

	return (ip_sioctl_flags_tail(ipif, flags, q, mp));
}

/*
 * Can operate on either a module or a driver queue.
 */
/* ARGSUSED */
int
ip_sioctl_get_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	/*
	 * Have the flags been set correctly till now?
	 */
	ill_t *ill = ipif->ipif_ill;
	phyint_t *phyi = ill->ill_phyint;

	ip1dbg(("ip_sioctl_get_flags(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
	ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0);
	ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0);
	ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0);

	/*
	 * Need a lock since some flags can be set even when there are
	 * references to the ipif.
	 */
	mutex_enter(&ill->ill_lock);
	if (ipip->ipi_cmd_type == IF_CMD) {
		struct ifreq *ifr = (struct ifreq *)if_req;

		/* Get interface flags (low 16 only). */
		ifr->ifr_flags = ((ipif->ipif_flags |
		    ill->ill_flags | phyi->phyint_flags) & 0xffff);
	} else {
		struct lifreq *lifr = (struct lifreq *)if_req;

		/* Get interface flags. */
		lifr->lifr_flags = ipif->ipif_flags |
		    ill->ill_flags | phyi->phyint_flags;
	}
	mutex_exit(&ill->ill_lock);
	return (0);
}

/*
 * We allow the MTU to be set on an ILL, but not have it be different
 * for different IPIFs since we don't actually send packets on IPIFs.
 */
/* ARGSUSED */
int
ip_sioctl_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	int mtu;
	int ip_min_mtu;
	struct ifreq *ifr;
	struct lifreq *lifr;
	ill_t *ill;

	ip1dbg(("ip_sioctl_mtu(%s:%u %p)\n", ipif->ipif_ill->ill_name,
	    ipif->ipif_id, (void *)ipif));
	if (ipip->ipi_cmd_type == IF_CMD) {
		ifr = (struct ifreq *)if_req;
		mtu = ifr->ifr_metric;
	} else {
		lifr = (struct lifreq *)if_req;
		mtu = lifr->lifr_mtu;
	}
	/* Only allow for logical unit zero i.e.
not on "bge0:17" */ 10807 if (ipif->ipif_id != 0) 10808 return (EINVAL); 10809 10810 ill = ipif->ipif_ill; 10811 if (ipif->ipif_isv6) 10812 ip_min_mtu = IPV6_MIN_MTU; 10813 else 10814 ip_min_mtu = IP_MIN_MTU; 10815 10816 mutex_enter(&ill->ill_lock); 10817 if (mtu > ill->ill_max_frag || mtu < ip_min_mtu) { 10818 mutex_exit(&ill->ill_lock); 10819 return (EINVAL); 10820 } 10821 /* Avoid increasing ill_mc_mtu */ 10822 if (ill->ill_mc_mtu > mtu) 10823 ill->ill_mc_mtu = mtu; 10824 10825 /* 10826 * The dce and fragmentation code can handle changes to ill_mtu 10827 * concurrent with sending/fragmenting packets. 10828 */ 10829 ill->ill_mtu = mtu; 10830 ill->ill_flags |= ILLF_FIXEDMTU; 10831 mutex_exit(&ill->ill_lock); 10832 10833 /* 10834 * Make sure all dce_generation checks find out 10835 * that ill_mtu/ill_mc_mtu has changed. 10836 */ 10837 dce_increment_all_generations(ill->ill_isv6, ill->ill_ipst); 10838 10839 /* 10840 * Refresh IPMP meta-interface MTU if necessary. 10841 */ 10842 if (IS_UNDER_IPMP(ill)) 10843 ipmp_illgrp_refresh_mtu(ill->ill_grp); 10844 10845 /* Update the MTU in SCTP's list */ 10846 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 10847 return (0); 10848 } 10849 10850 /* Get interface MTU. */ 10851 /* ARGSUSED */ 10852 int 10853 ip_sioctl_get_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10854 ip_ioctl_cmd_t *ipip, void *if_req) 10855 { 10856 struct ifreq *ifr; 10857 struct lifreq *lifr; 10858 10859 ip1dbg(("ip_sioctl_get_mtu(%s:%u %p)\n", 10860 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10861 10862 /* 10863 * We allow a get on any logical interface even though the set 10864 * can only be done on logical unit 0. 10865 */ 10866 if (ipip->ipi_cmd_type == IF_CMD) { 10867 ifr = (struct ifreq *)if_req; 10868 ifr->ifr_metric = ipif->ipif_ill->ill_mtu; 10869 } else { 10870 lifr = (struct lifreq *)if_req; 10871 lifr->lifr_mtu = ipif->ipif_ill->ill_mtu; 10872 } 10873 return (0); 10874 } 10875 10876 /* Set interface broadcast address. */ 10877 /* ARGSUSED2 */ 10878 int 10879 ip_sioctl_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10880 ip_ioctl_cmd_t *ipip, void *if_req) 10881 { 10882 ipaddr_t addr; 10883 ire_t *ire; 10884 ill_t *ill = ipif->ipif_ill; 10885 ip_stack_t *ipst = ill->ill_ipst; 10886 10887 ip1dbg(("ip_sioctl_brdaddr(%s:%u)\n", ill->ill_name, 10888 ipif->ipif_id)); 10889 10890 ASSERT(IAM_WRITER_IPIF(ipif)); 10891 if (!(ipif->ipif_flags & IPIF_BROADCAST)) 10892 return (EADDRNOTAVAIL); 10893 10894 ASSERT(!(ipif->ipif_isv6)); /* No IPv6 broadcast */ 10895 10896 if (sin->sin_family != AF_INET) 10897 return (EAFNOSUPPORT); 10898 10899 addr = sin->sin_addr.s_addr; 10900 10901 if (ipif->ipif_flags & IPIF_UP) { 10902 /* 10903 * If we are already up, make sure the new 10904 * broadcast address makes sense. If it does, 10905 * there should be an IRE for it already. 10906 */ 10907 ire = ire_ftable_lookup_v4(addr, 0, 0, IRE_BROADCAST, 10908 ill, ipif->ipif_zoneid, NULL, 10909 (MATCH_IRE_ILL | MATCH_IRE_TYPE), 0, ipst, NULL); 10910 if (ire == NULL) { 10911 return (EINVAL); 10912 } else { 10913 ire_refrele(ire); 10914 } 10915 } 10916 /* 10917 * Changing the broadcast addr for this ipif. Since the IRE_BROADCAST 10918 * needs to already exist we never need to change the set of 10919 * IRE_BROADCASTs when we are UP. 10920 */ 10921 if (addr != ipif->ipif_brd_addr) 10922 IN6_IPADDR_TO_V4MAPPED(addr, &ipif->ipif_v6brd_addr); 10923 10924 return (0); 10925 } 10926 10927 /* Get interface broadcast address. 
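 *
 * A hedged userland sketch of the matching get (the name and error
 * handling here are illustrative assumptions):
 *
 *	struct lifreq lifr;
 *	int s = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	bzero(&lifr, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, "bge0:1", sizeof (lifr.lifr_name));
 *	if (ioctl(s, SIOCGLIFBRDADDR, &lifr) == 0) {
 *		struct sockaddr_in *bsin =
 *		    (struct sockaddr_in *)&lifr.lifr_broadaddr;
 *		... bsin->sin_addr now holds the broadcast address ...
 *	}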
*/ 10928 /* ARGSUSED */ 10929 int 10930 ip_sioctl_get_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10931 ip_ioctl_cmd_t *ipip, void *if_req) 10932 { 10933 ip1dbg(("ip_sioctl_get_brdaddr(%s:%u %p)\n", 10934 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10935 if (!(ipif->ipif_flags & IPIF_BROADCAST)) 10936 return (EADDRNOTAVAIL); 10937 10938 /* IPIF_BROADCAST not possible with IPv6 */ 10939 ASSERT(!ipif->ipif_isv6); 10940 *sin = sin_null; 10941 sin->sin_family = AF_INET; 10942 sin->sin_addr.s_addr = ipif->ipif_brd_addr; 10943 return (0); 10944 } 10945 10946 /* 10947 * This routine is called to handle the SIOCS*IFNETMASK IOCTL. 10948 */ 10949 /* ARGSUSED */ 10950 int 10951 ip_sioctl_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10952 ip_ioctl_cmd_t *ipip, void *if_req) 10953 { 10954 int err = 0; 10955 in6_addr_t v6mask; 10956 10957 ip1dbg(("ip_sioctl_netmask(%s:%u %p)\n", 10958 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10959 10960 ASSERT(IAM_WRITER_IPIF(ipif)); 10961 10962 if (ipif->ipif_isv6) { 10963 sin6_t *sin6; 10964 10965 if (sin->sin_family != AF_INET6) 10966 return (EAFNOSUPPORT); 10967 10968 sin6 = (sin6_t *)sin; 10969 v6mask = sin6->sin6_addr; 10970 } else { 10971 ipaddr_t mask; 10972 10973 if (sin->sin_family != AF_INET) 10974 return (EAFNOSUPPORT); 10975 10976 mask = sin->sin_addr.s_addr; 10977 if (!ip_contiguous_mask(ntohl(mask))) 10978 return (ENOTSUP); 10979 V4MASK_TO_V6(mask, v6mask); 10980 } 10981 10982 /* 10983 * No big deal if the interface isn't already up, or the mask 10984 * isn't really changing, or this is pt-pt. 10985 */ 10986 if (!(ipif->ipif_flags & IPIF_UP) || 10987 IN6_ARE_ADDR_EQUAL(&v6mask, &ipif->ipif_v6net_mask) || 10988 (ipif->ipif_flags & IPIF_POINTOPOINT)) { 10989 ipif->ipif_v6net_mask = v6mask; 10990 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 10991 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 10992 ipif->ipif_v6net_mask, 10993 ipif->ipif_v6subnet); 10994 } 10995 return (0); 10996 } 10997 /* 10998 * Make sure we have valid net and subnet broadcast ire's 10999 * for the old netmask, if needed by other logical interfaces. 11000 */ 11001 err = ipif_logical_down(ipif, q, mp); 11002 if (err == EINPROGRESS) 11003 return (err); 11004 (void) ipif_down_tail(ipif); 11005 err = ip_sioctl_netmask_tail(ipif, sin, q, mp); 11006 return (err); 11007 } 11008 11009 static int 11010 ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp) 11011 { 11012 in6_addr_t v6mask; 11013 int err = 0; 11014 11015 ip1dbg(("ip_sioctl_netmask_tail(%s:%u %p)\n", 11016 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11017 11018 if (ipif->ipif_isv6) { 11019 sin6_t *sin6; 11020 11021 sin6 = (sin6_t *)sin; 11022 v6mask = sin6->sin6_addr; 11023 } else { 11024 ipaddr_t mask; 11025 11026 mask = sin->sin_addr.s_addr; 11027 V4MASK_TO_V6(mask, v6mask); 11028 } 11029 11030 ipif->ipif_v6net_mask = v6mask; 11031 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 11032 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 11033 ipif->ipif_v6subnet); 11034 } 11035 err = ipif_up(ipif, q, mp); 11036 11037 if (err == 0 || err == EINPROGRESS) { 11038 /* 11039 * The interface must be DL_BOUND if this packet has to 11040 * go out on the wire. Since we only go through a logical 11041 * down and are bound with the driver during an internal 11042 * down/up that is satisfied. 11043 */ 11044 if (!ipif->ipif_isv6 && ipif->ipif_ill->ill_wq != NULL) { 11045 /* Potentially broadcast an address mask reply. 
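			 * That is, an ICMP Address Mask Reply in the
			 * sense of RFC 950, answering hosts that probe
			 * for their subnet mask at boot.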
*/ 11046 ipif_mask_reply(ipif); 11047 } 11048 } 11049 return (err); 11050 } 11051 11052 /* ARGSUSED */ 11053 int 11054 ip_sioctl_netmask_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11055 ip_ioctl_cmd_t *ipip, void *if_req) 11056 { 11057 ip1dbg(("ip_sioctl_netmask_restart(%s:%u %p)\n", 11058 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11059 (void) ipif_down_tail(ipif); 11060 return (ip_sioctl_netmask_tail(ipif, sin, q, mp)); 11061 } 11062 11063 /* Get interface net mask. */ 11064 /* ARGSUSED */ 11065 int 11066 ip_sioctl_get_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11067 ip_ioctl_cmd_t *ipip, void *if_req) 11068 { 11069 struct lifreq *lifr = (struct lifreq *)if_req; 11070 struct sockaddr_in6 *sin6 = (sin6_t *)sin; 11071 11072 ip1dbg(("ip_sioctl_get_netmask(%s:%u %p)\n", 11073 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11074 11075 /* 11076 * net mask can't change since we have a reference to the ipif. 11077 */ 11078 if (ipif->ipif_isv6) { 11079 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 11080 *sin6 = sin6_null; 11081 sin6->sin6_family = AF_INET6; 11082 sin6->sin6_addr = ipif->ipif_v6net_mask; 11083 lifr->lifr_addrlen = 11084 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 11085 } else { 11086 *sin = sin_null; 11087 sin->sin_family = AF_INET; 11088 sin->sin_addr.s_addr = ipif->ipif_net_mask; 11089 if (ipip->ipi_cmd_type == LIF_CMD) { 11090 lifr->lifr_addrlen = 11091 ip_mask_to_plen(ipif->ipif_net_mask); 11092 } 11093 } 11094 return (0); 11095 } 11096 11097 /* ARGSUSED */ 11098 int 11099 ip_sioctl_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11100 ip_ioctl_cmd_t *ipip, void *if_req) 11101 { 11102 ip1dbg(("ip_sioctl_metric(%s:%u %p)\n", 11103 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11104 11105 /* 11106 * Since no applications should ever be setting metrics on underlying 11107 * interfaces, we explicitly fail to smoke 'em out. 11108 */ 11109 if (IS_UNDER_IPMP(ipif->ipif_ill)) 11110 return (EINVAL); 11111 11112 /* 11113 * Set interface metric. We don't use this for 11114 * anything but we keep track of it in case it is 11115 * important to routing applications or such. 11116 */ 11117 if (ipip->ipi_cmd_type == IF_CMD) { 11118 struct ifreq *ifr; 11119 11120 ifr = (struct ifreq *)if_req; 11121 ipif->ipif_ill->ill_metric = ifr->ifr_metric; 11122 } else { 11123 struct lifreq *lifr; 11124 11125 lifr = (struct lifreq *)if_req; 11126 ipif->ipif_ill->ill_metric = lifr->lifr_metric; 11127 } 11128 return (0); 11129 } 11130 11131 /* ARGSUSED */ 11132 int 11133 ip_sioctl_get_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11134 ip_ioctl_cmd_t *ipip, void *if_req) 11135 { 11136 /* Get interface metric. */ 11137 ip1dbg(("ip_sioctl_get_metric(%s:%u %p)\n", 11138 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11139 11140 if (ipip->ipi_cmd_type == IF_CMD) { 11141 struct ifreq *ifr; 11142 11143 ifr = (struct ifreq *)if_req; 11144 ifr->ifr_metric = ipif->ipif_ill->ill_metric; 11145 } else { 11146 struct lifreq *lifr; 11147 11148 lifr = (struct lifreq *)if_req; 11149 lifr->lifr_metric = ipif->ipif_ill->ill_metric; 11150 } 11151 11152 return (0); 11153 } 11154 11155 /* ARGSUSED */ 11156 int 11157 ip_sioctl_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11158 ip_ioctl_cmd_t *ipip, void *if_req) 11159 { 11160 int arp_muxid; 11161 11162 ip1dbg(("ip_sioctl_muxid(%s:%u %p)\n", 11163 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11164 /* 11165 * Set the muxid returned from I_PLINK. 
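	 * ifconfig records the ids returned when it I_PLINKs the IP and
	 * ARP streams under the device, so that a later unplumb can read
	 * them back (SIOCGLIFMUXID) and I_PUNLINK both streams.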
	 */
	if (ipip->ipi_cmd_type == IF_CMD) {
		struct ifreq *ifr = (struct ifreq *)if_req;

		ipif->ipif_ill->ill_muxid = ifr->ifr_ip_muxid;
		arp_muxid = ifr->ifr_arp_muxid;
	} else {
		struct lifreq *lifr = (struct lifreq *)if_req;

		ipif->ipif_ill->ill_muxid = lifr->lifr_ip_muxid;
		arp_muxid = lifr->lifr_arp_muxid;
	}
	arl_set_muxid(ipif->ipif_ill, arp_muxid);
	return (0);
}

/* ARGSUSED */
int
ip_sioctl_get_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	int arp_muxid = 0;

	ip1dbg(("ip_sioctl_get_muxid(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
	/*
	 * Get the muxid saved in ill for I_PUNLINK.
	 */
	arp_muxid = arl_get_muxid(ipif->ipif_ill);
	if (ipip->ipi_cmd_type == IF_CMD) {
		struct ifreq *ifr = (struct ifreq *)if_req;

		ifr->ifr_ip_muxid = ipif->ipif_ill->ill_muxid;
		ifr->ifr_arp_muxid = arp_muxid;
	} else {
		struct lifreq *lifr = (struct lifreq *)if_req;

		lifr->lifr_ip_muxid = ipif->ipif_ill->ill_muxid;
		lifr->lifr_arp_muxid = arp_muxid;
	}
	return (0);
}

/*
 * Set the subnet prefix. Does not modify the broadcast address.
 */
/* ARGSUSED */
int
ip_sioctl_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	int err = 0;
	in6_addr_t v6addr;
	in6_addr_t v6mask;
	boolean_t need_up = B_FALSE;
	int addrlen;

	ip1dbg(("ip_sioctl_subnet(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));

	ASSERT(IAM_WRITER_IPIF(ipif));
	addrlen = ((struct lifreq *)if_req)->lifr_addrlen;

	if (ipif->ipif_isv6) {
		sin6_t *sin6;

		if (sin->sin_family != AF_INET6)
			return (EAFNOSUPPORT);

		sin6 = (sin6_t *)sin;
		v6addr = sin6->sin6_addr;
		if (!ip_remote_addr_ok_v6(&v6addr, &ipv6_all_ones))
			return (EADDRNOTAVAIL);
	} else {
		ipaddr_t addr;

		if (sin->sin_family != AF_INET)
			return (EAFNOSUPPORT);

		addr = sin->sin_addr.s_addr;
		if (!ip_addr_ok_v4(addr, 0xFFFFFFFF))
			return (EADDRNOTAVAIL);
		IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
		/* Add 96 bits */
		addrlen += IPV6_ABITS - IP_ABITS;
	}

	if (ip_plen_to_mask_v6(addrlen, &v6mask) == NULL)
		return (EINVAL);

	/* Check if bits in the address are set past the mask */
	if (!V6_MASK_EQ(v6addr, v6mask, v6addr))
		return (EINVAL);

	if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6subnet, &v6addr) &&
	    IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6net_mask, &v6mask))
		return (0);	/* No change */

	if (ipif->ipif_flags & IPIF_UP) {
		/*
		 * If the interface is already marked up,
		 * we call ipif_down which will take care
		 * of ditching any IREs that have been set
		 * up based on the old interface address.
11270 */ 11271 err = ipif_logical_down(ipif, q, mp); 11272 if (err == EINPROGRESS) 11273 return (err); 11274 (void) ipif_down_tail(ipif); 11275 need_up = B_TRUE; 11276 } 11277 11278 err = ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, need_up); 11279 return (err); 11280 } 11281 11282 static int 11283 ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t v6addr, in6_addr_t v6mask, 11284 queue_t *q, mblk_t *mp, boolean_t need_up) 11285 { 11286 ill_t *ill = ipif->ipif_ill; 11287 int err = 0; 11288 11289 ip1dbg(("ip_sioctl_subnet_tail(%s:%u %p)\n", 11290 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11291 11292 /* Set the new address. */ 11293 mutex_enter(&ill->ill_lock); 11294 ipif->ipif_v6net_mask = v6mask; 11295 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 11296 V6_MASK_COPY(v6addr, ipif->ipif_v6net_mask, 11297 ipif->ipif_v6subnet); 11298 } 11299 mutex_exit(&ill->ill_lock); 11300 11301 if (need_up) { 11302 /* 11303 * Now bring the interface back up. If this 11304 * is the only IPIF for the ILL, ipif_up 11305 * will have to re-bind to the device, so 11306 * we may get back EINPROGRESS, in which 11307 * case, this IOCTL will get completed in 11308 * ip_rput_dlpi when we see the DL_BIND_ACK. 11309 */ 11310 err = ipif_up(ipif, q, mp); 11311 if (err == EINPROGRESS) 11312 return (err); 11313 } 11314 return (err); 11315 } 11316 11317 /* ARGSUSED */ 11318 int 11319 ip_sioctl_subnet_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11320 ip_ioctl_cmd_t *ipip, void *if_req) 11321 { 11322 int addrlen; 11323 in6_addr_t v6addr; 11324 in6_addr_t v6mask; 11325 struct lifreq *lifr = (struct lifreq *)if_req; 11326 11327 ip1dbg(("ip_sioctl_subnet_restart(%s:%u %p)\n", 11328 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11329 (void) ipif_down_tail(ipif); 11330 11331 addrlen = lifr->lifr_addrlen; 11332 if (ipif->ipif_isv6) { 11333 sin6_t *sin6; 11334 11335 sin6 = (sin6_t *)sin; 11336 v6addr = sin6->sin6_addr; 11337 } else { 11338 ipaddr_t addr; 11339 11340 addr = sin->sin_addr.s_addr; 11341 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 11342 addrlen += IPV6_ABITS - IP_ABITS; 11343 } 11344 (void) ip_plen_to_mask_v6(addrlen, &v6mask); 11345 11346 return (ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, B_TRUE)); 11347 } 11348 11349 /* ARGSUSED */ 11350 int 11351 ip_sioctl_get_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11352 ip_ioctl_cmd_t *ipip, void *if_req) 11353 { 11354 struct lifreq *lifr = (struct lifreq *)if_req; 11355 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sin; 11356 11357 ip1dbg(("ip_sioctl_get_subnet(%s:%u %p)\n", 11358 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11359 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 11360 11361 if (ipif->ipif_isv6) { 11362 *sin6 = sin6_null; 11363 sin6->sin6_family = AF_INET6; 11364 sin6->sin6_addr = ipif->ipif_v6subnet; 11365 lifr->lifr_addrlen = 11366 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 11367 } else { 11368 *sin = sin_null; 11369 sin->sin_family = AF_INET; 11370 sin->sin_addr.s_addr = ipif->ipif_subnet; 11371 lifr->lifr_addrlen = ip_mask_to_plen(ipif->ipif_net_mask); 11372 } 11373 return (0); 11374 } 11375 11376 /* 11377 * Set the IPv6 address token. 
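 *
 * A hedged example of the semantics (values are illustrative only):
 * setting a token of ::1a:2b:3c:4d with lifr_addrlen 64 means the low
 * 64 bits of every autoconfigured address come from the token, so a
 * fe80::/64 link-local prefix yields fe80::1a:2b:3c:4d; the token
 * supplies the interface-id bits and the prefix supplies the rest.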
11378 */ 11379 /* ARGSUSED */ 11380 int 11381 ip_sioctl_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11382 ip_ioctl_cmd_t *ipi, void *if_req) 11383 { 11384 ill_t *ill = ipif->ipif_ill; 11385 int err; 11386 in6_addr_t v6addr; 11387 in6_addr_t v6mask; 11388 boolean_t need_up = B_FALSE; 11389 int i; 11390 sin6_t *sin6 = (sin6_t *)sin; 11391 struct lifreq *lifr = (struct lifreq *)if_req; 11392 int addrlen; 11393 11394 ip1dbg(("ip_sioctl_token(%s:%u %p)\n", 11395 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11396 ASSERT(IAM_WRITER_IPIF(ipif)); 11397 11398 addrlen = lifr->lifr_addrlen; 11399 /* Only allow for logical unit zero i.e. not on "le0:17" */ 11400 if (ipif->ipif_id != 0) 11401 return (EINVAL); 11402 11403 if (!ipif->ipif_isv6) 11404 return (EINVAL); 11405 11406 if (addrlen > IPV6_ABITS) 11407 return (EINVAL); 11408 11409 v6addr = sin6->sin6_addr; 11410 11411 /* 11412 * The length of the token is the length from the end. To get 11413 * the proper mask for this, compute the mask of the bits not 11414 * in the token; ie. the prefix, and then xor to get the mask. 11415 */ 11416 if (ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask) == NULL) 11417 return (EINVAL); 11418 for (i = 0; i < 4; i++) { 11419 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff; 11420 } 11421 11422 if (V6_MASK_EQ(v6addr, v6mask, ill->ill_token) && 11423 ill->ill_token_length == addrlen) 11424 return (0); /* No change */ 11425 11426 if (ipif->ipif_flags & IPIF_UP) { 11427 err = ipif_logical_down(ipif, q, mp); 11428 if (err == EINPROGRESS) 11429 return (err); 11430 (void) ipif_down_tail(ipif); 11431 need_up = B_TRUE; 11432 } 11433 err = ip_sioctl_token_tail(ipif, sin6, addrlen, q, mp, need_up); 11434 return (err); 11435 } 11436 11437 static int 11438 ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen, queue_t *q, 11439 mblk_t *mp, boolean_t need_up) 11440 { 11441 in6_addr_t v6addr; 11442 in6_addr_t v6mask; 11443 ill_t *ill = ipif->ipif_ill; 11444 int i; 11445 int err = 0; 11446 11447 ip1dbg(("ip_sioctl_token_tail(%s:%u %p)\n", 11448 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11449 v6addr = sin6->sin6_addr; 11450 /* 11451 * The length of the token is the length from the end. To get 11452 * the proper mask for this, compute the mask of the bits not 11453 * in the token; ie. the prefix, and then xor to get the mask. 11454 */ 11455 (void) ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask); 11456 for (i = 0; i < 4; i++) 11457 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff; 11458 11459 mutex_enter(&ill->ill_lock); 11460 V6_MASK_COPY(v6addr, v6mask, ill->ill_token); 11461 ill->ill_token_length = addrlen; 11462 ill->ill_manual_token = 1; 11463 11464 /* Reconfigure the link-local address based on this new token */ 11465 ipif_setlinklocal(ill->ill_ipif); 11466 11467 mutex_exit(&ill->ill_lock); 11468 11469 if (need_up) { 11470 /* 11471 * Now bring the interface back up. If this 11472 * is the only IPIF for the ILL, ipif_up 11473 * will have to re-bind to the device, so 11474 * we may get back EINPROGRESS, in which 11475 * case, this IOCTL will get completed in 11476 * ip_rput_dlpi when we see the DL_BIND_ACK. 
11477 */ 11478 err = ipif_up(ipif, q, mp); 11479 if (err == EINPROGRESS) 11480 return (err); 11481 } 11482 return (err); 11483 } 11484 11485 /* ARGSUSED */ 11486 int 11487 ip_sioctl_get_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11488 ip_ioctl_cmd_t *ipi, void *if_req) 11489 { 11490 ill_t *ill; 11491 sin6_t *sin6 = (sin6_t *)sin; 11492 struct lifreq *lifr = (struct lifreq *)if_req; 11493 11494 ip1dbg(("ip_sioctl_get_token(%s:%u %p)\n", 11495 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11496 if (ipif->ipif_id != 0) 11497 return (EINVAL); 11498 11499 ill = ipif->ipif_ill; 11500 if (!ill->ill_isv6) 11501 return (ENXIO); 11502 11503 *sin6 = sin6_null; 11504 sin6->sin6_family = AF_INET6; 11505 ASSERT(!IN6_IS_ADDR_V4MAPPED(&ill->ill_token)); 11506 sin6->sin6_addr = ill->ill_token; 11507 lifr->lifr_addrlen = ill->ill_token_length; 11508 return (0); 11509 } 11510 11511 /* 11512 * Set (hardware) link specific information that might override 11513 * what was acquired through the DL_INFO_ACK. 11514 */ 11515 /* ARGSUSED */ 11516 int 11517 ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11518 ip_ioctl_cmd_t *ipi, void *if_req) 11519 { 11520 ill_t *ill = ipif->ipif_ill; 11521 int ip_min_mtu; 11522 struct lifreq *lifr = (struct lifreq *)if_req; 11523 lif_ifinfo_req_t *lir; 11524 11525 ip1dbg(("ip_sioctl_lnkinfo(%s:%u %p)\n", 11526 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11527 lir = &lifr->lifr_ifinfo; 11528 ASSERT(IAM_WRITER_IPIF(ipif)); 11529 11530 /* Only allow for logical unit zero i.e. not on "bge0:17" */ 11531 if (ipif->ipif_id != 0) 11532 return (EINVAL); 11533 11534 /* Set interface MTU. */ 11535 if (ipif->ipif_isv6) 11536 ip_min_mtu = IPV6_MIN_MTU; 11537 else 11538 ip_min_mtu = IP_MIN_MTU; 11539 11540 /* 11541 * Verify values before we set anything. Allow zero to 11542 * mean unspecified. 11543 * 11544 * XXX We should be able to set the user-defined lir_mtu to some value 11545 * that is greater than ill_current_frag but less than ill_max_frag- the 11546 * ill_max_frag value tells us the max MTU that can be handled by the 11547 * datalink, whereas the ill_current_frag is dynamically computed for 11548 * some link-types like tunnels, based on the tunnel PMTU. However, 11549 * since there is currently no way of distinguishing between 11550 * administratively fixed link mtu values (e.g., those set via 11551 * /sbin/dladm) and dynamically discovered MTUs (e.g., those discovered 11552 * for tunnels) we conservatively choose the ill_current_frag as the 11553 * upper-bound. 11554 */ 11555 if (lir->lir_maxmtu != 0 && 11556 (lir->lir_maxmtu > ill->ill_current_frag || 11557 lir->lir_maxmtu < ip_min_mtu)) 11558 return (EINVAL); 11559 if (lir->lir_reachtime != 0 && 11560 lir->lir_reachtime > ND_MAX_REACHTIME) 11561 return (EINVAL); 11562 if (lir->lir_reachretrans != 0 && 11563 lir->lir_reachretrans > ND_MAX_REACHRETRANSTIME) 11564 return (EINVAL); 11565 11566 mutex_enter(&ill->ill_lock); 11567 /* 11568 * The dce and fragmentation code can handle changes to ill_mtu 11569 * concurrent with sending/fragmenting packets. 
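 *
 * A hedged worked example of the range checks above: on an IPv6 link
 * whose ill_current_frag is 1480, ip_min_mtu is IPV6_MIN_MTU (1280),
 * so a lir_maxmtu of 1400 is accepted while 1200 or 9000 draws EINVAL.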
11570 */ 11571 if (lir->lir_maxmtu != 0) 11572 ill->ill_user_mtu = lir->lir_maxmtu; 11573 11574 if (lir->lir_reachtime != 0) 11575 ill->ill_reachable_time = lir->lir_reachtime; 11576 11577 if (lir->lir_reachretrans != 0) 11578 ill->ill_reachable_retrans_time = lir->lir_reachretrans; 11579 11580 ill->ill_max_hops = lir->lir_maxhops; 11581 ill->ill_max_buf = ND_MAX_Q; 11582 if (!(ill->ill_flags & ILLF_FIXEDMTU) && ill->ill_user_mtu != 0) { 11583 /* 11584 * ill_mtu is the actual interface MTU, obtained as the min 11585 * of user-configured mtu and the value announced by the 11586 * driver (via DL_NOTE_SDU_SIZE/DL_INFO_ACK). Note that since 11587 * we have already made the choice of requiring 11588 * ill_user_mtu < ill_current_frag by the time we get here, 11589 * the ill_mtu effectively gets assigned to the ill_user_mtu 11590 * here. 11591 */ 11592 ill->ill_mtu = MIN(ill->ill_current_frag, ill->ill_user_mtu); 11593 ill->ill_mc_mtu = MIN(ill->ill_mc_mtu, ill->ill_user_mtu); 11594 } 11595 mutex_exit(&ill->ill_lock); 11596 11597 /* 11598 * Make sure all dce_generation checks find out 11599 * that ill_mtu/ill_mc_mtu has changed. 11600 */ 11601 if (!(ill->ill_flags & ILLF_FIXEDMTU) && (lir->lir_maxmtu != 0)) 11602 dce_increment_all_generations(ill->ill_isv6, ill->ill_ipst); 11603 11604 /* 11605 * Refresh IPMP meta-interface MTU if necessary. 11606 */ 11607 if (IS_UNDER_IPMP(ill)) 11608 ipmp_illgrp_refresh_mtu(ill->ill_grp); 11609 11610 return (0); 11611 } 11612 11613 /* ARGSUSED */ 11614 int 11615 ip_sioctl_get_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11616 ip_ioctl_cmd_t *ipi, void *if_req) 11617 { 11618 struct lif_ifinfo_req *lir; 11619 ill_t *ill = ipif->ipif_ill; 11620 11621 ip1dbg(("ip_sioctl_get_lnkinfo(%s:%u %p)\n", 11622 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11623 if (ipif->ipif_id != 0) 11624 return (EINVAL); 11625 11626 lir = &((struct lifreq *)if_req)->lifr_ifinfo; 11627 lir->lir_maxhops = ill->ill_max_hops; 11628 lir->lir_reachtime = ill->ill_reachable_time; 11629 lir->lir_reachretrans = ill->ill_reachable_retrans_time; 11630 lir->lir_maxmtu = ill->ill_mtu; 11631 11632 return (0); 11633 } 11634 11635 /* 11636 * Return best guess as to the subnet mask for the specified address. 11637 * Based on the subnet masks for all the configured interfaces. 11638 * 11639 * We end up returning a zero mask in the case of default, multicast or 11640 * experimental. 11641 */ 11642 static ipaddr_t 11643 ip_subnet_mask(ipaddr_t addr, ipif_t **ipifp, ip_stack_t *ipst) 11644 { 11645 ipaddr_t net_mask; 11646 ill_t *ill; 11647 ipif_t *ipif; 11648 ill_walk_context_t ctx; 11649 ipif_t *fallback_ipif = NULL; 11650 11651 net_mask = ip_net_mask(addr); 11652 if (net_mask == 0) { 11653 *ipifp = NULL; 11654 return (0); 11655 } 11656 11657 /* Let's check to see if this is maybe a local subnet route. */ 11658 /* this function only applies to IPv4 interfaces */ 11659 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 11660 ill = ILL_START_WALK_V4(&ctx, ipst); 11661 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 11662 mutex_enter(&ill->ill_lock); 11663 for (ipif = ill->ill_ipif; ipif != NULL; 11664 ipif = ipif->ipif_next) { 11665 if (IPIF_IS_CONDEMNED(ipif)) 11666 continue; 11667 if (!(ipif->ipif_flags & IPIF_UP)) 11668 continue; 11669 if ((ipif->ipif_subnet & net_mask) == 11670 (addr & net_mask)) { 11671 /* 11672 * Don't trust pt-pt interfaces if there are 11673 * other interfaces. 
11674 */ 11675 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 11676 if (fallback_ipif == NULL) { 11677 ipif_refhold_locked(ipif); 11678 fallback_ipif = ipif; 11679 } 11680 continue; 11681 } 11682 11683 /* 11684 * Fine. Just assume the same net mask as the 11685 * directly attached subnet interface is using. 11686 */ 11687 ipif_refhold_locked(ipif); 11688 mutex_exit(&ill->ill_lock); 11689 rw_exit(&ipst->ips_ill_g_lock); 11690 if (fallback_ipif != NULL) 11691 ipif_refrele(fallback_ipif); 11692 *ipifp = ipif; 11693 return (ipif->ipif_net_mask); 11694 } 11695 } 11696 mutex_exit(&ill->ill_lock); 11697 } 11698 rw_exit(&ipst->ips_ill_g_lock); 11699 11700 *ipifp = fallback_ipif; 11701 return ((fallback_ipif != NULL) ? 11702 fallback_ipif->ipif_net_mask : net_mask); 11703 } 11704 11705 /* 11706 * ip_sioctl_copyin_setup calls ip_wput_ioctl to process the IP_IOCTL ioctl. 11707 */ 11708 static void 11709 ip_wput_ioctl(queue_t *q, mblk_t *mp) 11710 { 11711 IOCP iocp; 11712 ipft_t *ipft; 11713 ipllc_t *ipllc; 11714 mblk_t *mp1; 11715 cred_t *cr; 11716 int error = 0; 11717 conn_t *connp; 11718 11719 ip1dbg(("ip_wput_ioctl")); 11720 iocp = (IOCP)mp->b_rptr; 11721 mp1 = mp->b_cont; 11722 if (mp1 == NULL) { 11723 iocp->ioc_error = EINVAL; 11724 mp->b_datap->db_type = M_IOCNAK; 11725 iocp->ioc_count = 0; 11726 qreply(q, mp); 11727 return; 11728 } 11729 11730 /* 11731 * These IOCTLs provide various control capabilities to 11732 * upstream agents such as ULPs and processes. There 11733 * are currently two such IOCTLs implemented. They 11734 * are used by TCP to provide update information for 11735 * existing IREs and to forcibly delete an IRE for a 11736 * host that is not responding, thereby forcing an 11737 * attempt at a new route. 11738 */ 11739 iocp->ioc_error = EINVAL; 11740 if (!pullupmsg(mp1, sizeof (ipllc->ipllc_cmd))) 11741 goto done; 11742 11743 ipllc = (ipllc_t *)mp1->b_rptr; 11744 for (ipft = ip_ioctl_ftbl; ipft->ipft_pfi; ipft++) { 11745 if (ipllc->ipllc_cmd == ipft->ipft_cmd) 11746 break; 11747 } 11748 /* 11749 * prefer credential from mblk over ioctl; 11750 * see ip_sioctl_copyin_setup 11751 */ 11752 cr = msg_getcred(mp, NULL); 11753 if (cr == NULL) 11754 cr = iocp->ioc_cr; 11755 11756 /* 11757 * Refhold the conn in case the request gets queued up in some lookup 11758 */ 11759 ASSERT(CONN_Q(q)); 11760 connp = Q_TO_CONN(q); 11761 CONN_INC_REF(connp); 11762 CONN_INC_IOCTLREF(connp); 11763 if (ipft->ipft_pfi && 11764 ((mp1->b_wptr - mp1->b_rptr) >= ipft->ipft_min_size || 11765 pullupmsg(mp1, ipft->ipft_min_size))) { 11766 error = (*ipft->ipft_pfi)(q, 11767 (ipft->ipft_flags & IPFT_F_SELF_REPLY) ? mp : mp1, cr); 11768 } 11769 if (ipft->ipft_flags & IPFT_F_SELF_REPLY) { 11770 /* 11771 * CONN_OPER_PENDING_DONE happens in the function called 11772 * through ipft_pfi above. 11773 */ 11774 return; 11775 } 11776 11777 CONN_DEC_IOCTLREF(connp); 11778 CONN_OPER_PENDING_DONE(connp); 11779 if (ipft->ipft_flags & IPFT_F_NO_REPLY) { 11780 freemsg(mp); 11781 return; 11782 } 11783 iocp->ioc_error = error; 11784 11785 done: 11786 mp->b_datap->db_type = M_IOCACK; 11787 if (iocp->ioc_error) 11788 iocp->ioc_count = 0; 11789 qreply(q, mp); 11790 } 11791 11792 /* 11793 * Assign a unique id for the ipif. This is used by sctp_addr.c 11794 * Note: remove if sctp_addr.c is redone to not shadow ill/ipif data structures. 
11795 */ 11796 static void 11797 ipif_assign_seqid(ipif_t *ipif) 11798 { 11799 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 11800 11801 ipif->ipif_seqid = atomic_add_64_nv(&ipst->ips_ipif_g_seqid, 1); 11802 } 11803 11804 /* 11805 * Clone the contents of `sipif' to `dipif'. Requires that both ipifs are 11806 * administratively down (i.e., no DAD), of the same type, and locked. Note 11807 * that the clone is complete -- including the seqid -- and the expectation is 11808 * that the caller will either free or overwrite `sipif' before it's unlocked. 11809 */ 11810 static void 11811 ipif_clone(const ipif_t *sipif, ipif_t *dipif) 11812 { 11813 ASSERT(MUTEX_HELD(&sipif->ipif_ill->ill_lock)); 11814 ASSERT(MUTEX_HELD(&dipif->ipif_ill->ill_lock)); 11815 ASSERT(!(sipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE))); 11816 ASSERT(!(dipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE))); 11817 ASSERT(sipif->ipif_ire_type == dipif->ipif_ire_type); 11818 11819 dipif->ipif_flags = sipif->ipif_flags; 11820 dipif->ipif_zoneid = sipif->ipif_zoneid; 11821 dipif->ipif_v6subnet = sipif->ipif_v6subnet; 11822 dipif->ipif_v6lcl_addr = sipif->ipif_v6lcl_addr; 11823 dipif->ipif_v6net_mask = sipif->ipif_v6net_mask; 11824 dipif->ipif_v6brd_addr = sipif->ipif_v6brd_addr; 11825 dipif->ipif_v6pp_dst_addr = sipif->ipif_v6pp_dst_addr; 11826 11827 /* 11828 * As per the comment atop the function, we assume that these sipif 11829 * fields will be changed before sipif is unlocked. 11830 */ 11831 dipif->ipif_seqid = sipif->ipif_seqid; 11832 dipif->ipif_state_flags = sipif->ipif_state_flags; 11833 } 11834 11835 /* 11836 * Transfer the contents of `sipif' to `dipif', and then free (if `virgipif' 11837 * is NULL) or overwrite `sipif' with `virgipif', which must be a virgin 11838 * (unreferenced) ipif. Also, if `sipif' is used by the current xop, then 11839 * transfer the xop to `dipif'. Requires that all ipifs are administratively 11840 * down (i.e., no DAD), of the same type, and unlocked. 11841 */ 11842 static void 11843 ipif_transfer(ipif_t *sipif, ipif_t *dipif, ipif_t *virgipif) 11844 { 11845 ipsq_t *ipsq = sipif->ipif_ill->ill_phyint->phyint_ipsq; 11846 ipxop_t *ipx = ipsq->ipsq_xop; 11847 11848 ASSERT(sipif != dipif); 11849 ASSERT(sipif != virgipif); 11850 11851 /* 11852 * Grab all of the locks that protect the ipif in a defined order. 11853 */ 11854 GRAB_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill); 11855 11856 ipif_clone(sipif, dipif); 11857 if (virgipif != NULL) { 11858 ipif_clone(virgipif, sipif); 11859 mi_free(virgipif); 11860 } 11861 11862 RELEASE_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill); 11863 11864 /* 11865 * Transfer ownership of the current xop, if necessary. 11866 */ 11867 if (ipx->ipx_current_ipif == sipif) { 11868 ASSERT(ipx->ipx_pending_ipif == NULL); 11869 mutex_enter(&ipx->ipx_lock); 11870 ipx->ipx_current_ipif = dipif; 11871 mutex_exit(&ipx->ipx_lock); 11872 } 11873 11874 if (virgipif == NULL) 11875 mi_free(sipif); 11876 } 11877 11878 /* 11879 * checks if: 11880 * - <ill_name>:<ipif_id> is at most LIFNAMSIZ - 1 and 11881 * - logical interface is within the allowed range 11882 */ 11883 static int 11884 is_lifname_valid(ill_t *ill, unsigned int ipif_id) 11885 { 11886 if (snprintf(NULL, 0, "%s:%d", ill->ill_name, ipif_id) >= LIFNAMSIZ) 11887 return (ENAMETOOLONG); 11888 11889 if (ipif_id >= ill->ill_ipst->ips_ip_addrs_per_if) 11890 return (ERANGE); 11891 return (0); 11892 } 11893 11894 /* 11895 * Insert the ipif, so that the list of ipifs on the ill will be sorted 11896 * with respect to ipif_id. 
Note that an ipif with an ipif_id of -1 will
11897  * be inserted into the first space available in the list. The value of
11898  * ipif_id will then be set to the appropriate value for its position
11899  * (e.g., with ipifs 0, 1 and 3 already in the list, an ipif created with
11900  * an id of -1 is inserted as id 2).
 */
static int
ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock)
{
11903 	ill_t *ill;
11904 	ipif_t *tipif;
11905 	ipif_t **tipifp;
11906 	int id, err;
11907 	ip_stack_t *ipst;
11908 
11909 	ASSERT(ipif->ipif_ill->ill_net_type == IRE_LOOPBACK ||
11910 	    IAM_WRITER_IPIF(ipif));
11911 
11912 	ill = ipif->ipif_ill;
11913 	ASSERT(ill != NULL);
11914 	ipst = ill->ill_ipst;
11915 
11916 	/*
11917 	 * In the case of lo0:0 we already hold the ill_g_lock.
11918 	 * ill_lookup_on_name (acquires ill_g_lock) -> ipif_allocate ->
11919 	 * ipif_insert.
11920 	 */
11921 	if (acquire_g_lock)
11922 		rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
11923 	mutex_enter(&ill->ill_lock);
11924 	id = ipif->ipif_id;
11925 	tipifp = &(ill->ill_ipif);
11926 	if (id == -1) {	/* need to find a real id */
11927 		id = 0;
11928 		while ((tipif = *tipifp) != NULL) {
11929 			ASSERT(tipif->ipif_id >= id);
11930 			if (tipif->ipif_id != id)
11931 				break;	/* non-consecutive id */
11932 			id++;
11933 			tipifp = &(tipif->ipif_next);
11934 		}
11935 		if ((err = is_lifname_valid(ill, id)) != 0) {
11936 			mutex_exit(&ill->ill_lock);
11937 			if (acquire_g_lock)
11938 				rw_exit(&ipst->ips_ill_g_lock);
11939 			return (err);
11940 		}
11941 		ipif->ipif_id = id;	/* assign new id */
11942 	} else if ((err = is_lifname_valid(ill, id)) == 0) {
11943 		/* we have a real id; insert ipif in the right place */
11944 		while ((tipif = *tipifp) != NULL) {
11945 			ASSERT(tipif->ipif_id != id);
11946 			if (tipif->ipif_id > id)
11947 				break;	/* found correct location */
11948 			tipifp = &(tipif->ipif_next);
11949 		}
11950 	} else {
11951 		mutex_exit(&ill->ill_lock);
11952 		if (acquire_g_lock)
11953 			rw_exit(&ipst->ips_ill_g_lock);
11954 		return (err);
11955 	}
11956 
11957 	ASSERT(tipifp != &(ill->ill_ipif) || id == 0);
11958 
11959 	ipif->ipif_next = tipif;
11960 	*tipifp = ipif;
11961 	mutex_exit(&ill->ill_lock);
11962 	if (acquire_g_lock)
11963 		rw_exit(&ipst->ips_ill_g_lock);
11964 
11965 	return (0);
11966 }
11967 
11968 static void
11969 ipif_remove(ipif_t *ipif)
11970 {
11971 	ipif_t **ipifp;
11972 	ill_t *ill = ipif->ipif_ill;
11973 
11974 	ASSERT(RW_WRITE_HELD(&ill->ill_ipst->ips_ill_g_lock));
11975 
11976 	mutex_enter(&ill->ill_lock);
11977 	ipifp = &ill->ill_ipif;
11978 	for (; *ipifp != NULL; ipifp = &ipifp[0]->ipif_next) {
11979 		if (*ipifp == ipif) {
11980 			*ipifp = ipif->ipif_next;
11981 			break;
11982 		}
11983 	}
11984 	mutex_exit(&ill->ill_lock);
11985 }
11986 
11987 /*
11988  * Allocate and initialize a new interface control structure. (Always
11989  * called as writer.)
11990  * When ipif_allocate() is called from ip_ll_subnet_defaults, the ill
11991  * is not part of the global linked list of ills. ipif_seqid is unique
11992  * in the system, and to preserve that uniqueness it is assigned only
11993  * when the ill becomes part of the global list. At that point the ill will
11994  * have a name. If it doesn't get assigned here, it will get assigned
11995  * in ipif_set_values() as part of SIOCSLIFNAME processing.
11996  * Additionally, if we come here from ip_ll_subnet_defaults, we don't set
11997  * the interface flags or any other information from the DL_INFO_ACK for
11998  * DL_STYLE2 drivers (initialize == B_FALSE), since we won't have them at
11999  * this point. The flags etc. will be set in ip_ll_subnet_defaults when the
12000  * second DL_INFO_ACK comes in from the driver.
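 *
 * A hedged sketch of that DL_STYLE2 flow (standard DLPI, assumed rather
 * than restated from this file): the first DL_INFO_ACK only reveals the
 * style; once SIOCSLIFNAME supplies the PPA, IP issues DL_ATTACH_REQ
 * followed by another DL_INFO_REQ, and the resulting second DL_INFO_ACK
 * carries the link details that ip_ll_subnet_defaults uses for the flags.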
12001  */
12002 static ipif_t *
12003 ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize,
12004     boolean_t insert, int *errorp)
12005 {
12006 	int err;
12007 	ipif_t *ipif;
12008 	ip_stack_t *ipst = ill->ill_ipst;
12009 
12010 	ip1dbg(("ipif_allocate(%s:%d ill %p)\n",
12011 	    ill->ill_name, id, (void *)ill));
12012 	ASSERT(ire_type == IRE_LOOPBACK || IAM_WRITER_ILL(ill));
12013 
12014 	if (errorp != NULL)
12015 		*errorp = 0;
12016 
12017 	if ((ipif = mi_alloc(sizeof (ipif_t), BPRI_MED)) == NULL) {
12018 		if (errorp != NULL)
12019 			*errorp = ENOMEM;
12020 		return (NULL);
12021 	}
12022 	*ipif = ipif_zero;	/* start clean */
12023 
12024 	ipif->ipif_ill = ill;
12025 	ipif->ipif_id = id;	/* could be -1 */
12026 	/*
12027 	 * Inherit the zoneid from the ill; for the shared stack instance
12028 	 * this is always the global zone
12029 	 */
12030 	ipif->ipif_zoneid = ill->ill_zoneid;
12031 
12032 	ipif->ipif_refcnt = 0;
12033 
12034 	if (insert) {
12035 		if ((err = ipif_insert(ipif, ire_type != IRE_LOOPBACK)) != 0) {
12036 			mi_free(ipif);
12037 			if (errorp != NULL)
12038 				*errorp = err;
12039 			return (NULL);
12040 		}
12041 		/* -1 id should have been replaced by real id */
12042 		id = ipif->ipif_id;
12043 		ASSERT(id >= 0);
12044 	}
12045 
12046 	if (ill->ill_name[0] != '\0')
12047 		ipif_assign_seqid(ipif);
12048 
12049 	/*
12050 	 * If this is the zeroth ipif on the IPMP ill, create the illgrp
12051 	 * (which must not exist yet because the zeroth ipif is created once
12052 	 * per ill). However, do not link it to the ipmp_grp_t until
12053 	 * I_PLINK is called; see ip_sioctl_plink_ipmp() for details.
12054 	 */
12055 	if (id == 0 && IS_IPMP(ill)) {
12056 		if (ipmp_illgrp_create(ill) == NULL) {
12057 			if (insert) {
12058 				rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
12059 				ipif_remove(ipif);
12060 				rw_exit(&ipst->ips_ill_g_lock);
12061 			}
12062 			mi_free(ipif);
12063 			if (errorp != NULL)
12064 				*errorp = ENOMEM;
12065 			return (NULL);
12066 		}
12067 	}
12068 
12069 	/*
12070 	 * We grab ill_lock to protect the flag changes. The ipif is still
12071 	 * not up and can't be looked up until the ioctl completes and the
12072 	 * IPIF_CHANGING flag is cleared.
12073 	 */
12074 	mutex_enter(&ill->ill_lock);
12075 
12076 	ipif->ipif_ire_type = ire_type;
12077 
12078 	if (ipif->ipif_isv6) {
12079 		ill->ill_flags |= ILLF_IPV6;
12080 	} else {
12081 		ipaddr_t inaddr_any = INADDR_ANY;
12082 
12083 		ill->ill_flags |= ILLF_IPV4;
12084 
12085 		/* Keep the IN6_IS_ADDR_V4MAPPED assertions happy */
12086 		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
12087 		    &ipif->ipif_v6lcl_addr);
12088 		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
12089 		    &ipif->ipif_v6subnet);
12090 		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
12091 		    &ipif->ipif_v6net_mask);
12092 		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
12093 		    &ipif->ipif_v6brd_addr);
12094 		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
12095 		    &ipif->ipif_v6pp_dst_addr);
12096 	}
12097 
12098 	/*
12099 	 * Don't set the interface flags etc. now, will do it in
12100 	 * ip_ll_subnet_defaults.
12101 	 */
12102 	if (!initialize)
12103 		goto out;
12104 
12105 	/*
12106 	 * NOTE: The IPMP meta-interface is special-cased because it starts
12107 	 * with no underlying interfaces (and thus an unknown broadcast
12108 	 * address length), but all interfaces that can be placed into an IPMP
12109 	 * group are required to be broadcast-capable.
12110 	 */
12111 	if (ill->ill_bcast_addr_length != 0 || IS_IPMP(ill)) {
12112 		/*
12113 		 * Later detect lack of DLPI driver multicast capability by
12114 		 * catching DL_ENABMULTI_REQ errors in ip_rput_dlpi().
12115 */ 12116 ill->ill_flags |= ILLF_MULTICAST; 12117 if (!ipif->ipif_isv6) 12118 ipif->ipif_flags |= IPIF_BROADCAST; 12119 } else { 12120 if (ill->ill_net_type != IRE_LOOPBACK) { 12121 if (ipif->ipif_isv6) 12122 /* 12123 * Note: xresolv interfaces will eventually need 12124 * NOARP set here as well, but that will require 12125 * those external resolvers to have some 12126 * knowledge of that flag and act appropriately. 12127 * Not to be changed at present. 12128 */ 12129 ill->ill_flags |= ILLF_NONUD; 12130 else 12131 ill->ill_flags |= ILLF_NOARP; 12132 } 12133 if (ill->ill_phys_addr_length == 0) { 12134 if (IS_VNI(ill)) { 12135 ipif->ipif_flags |= IPIF_NOXMIT; 12136 } else { 12137 /* pt-pt supports multicast. */ 12138 ill->ill_flags |= ILLF_MULTICAST; 12139 if (ill->ill_net_type != IRE_LOOPBACK) 12140 ipif->ipif_flags |= IPIF_POINTOPOINT; 12141 } 12142 } 12143 } 12144 out: 12145 mutex_exit(&ill->ill_lock); 12146 return (ipif); 12147 } 12148 12149 /* 12150 * Remove the neighbor cache entries associated with this logical 12151 * interface. 12152 */ 12153 int 12154 ipif_arp_down(ipif_t *ipif) 12155 { 12156 ill_t *ill = ipif->ipif_ill; 12157 int err = 0; 12158 12159 ip1dbg(("ipif_arp_down(%s:%u)\n", ill->ill_name, ipif->ipif_id)); 12160 ASSERT(IAM_WRITER_IPIF(ipif)); 12161 12162 DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_down", 12163 ill_t *, ill, ipif_t *, ipif); 12164 ipif_nce_down(ipif); 12165 12166 /* 12167 * If this is the last ipif that is going down and there are no 12168 * duplicate addresses we may yet attempt to re-probe, then we need to 12169 * clean up ARP completely. 12170 */ 12171 if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 && 12172 !ill->ill_logical_down && ill->ill_net_type == IRE_IF_RESOLVER) { 12173 /* 12174 * If this was the last ipif on an IPMP interface, purge any 12175 * static ARP entries associated with it. 12176 */ 12177 if (IS_IPMP(ill)) 12178 ipmp_illgrp_refresh_arpent(ill->ill_grp); 12179 12180 /* UNBIND, DETACH */ 12181 err = arp_ll_down(ill); 12182 } 12183 12184 return (err); 12185 } 12186 12187 /* 12188 * Get the resolver set up for a new IP address. (Always called as writer.) 12189 * Called both for IPv4 and IPv6 interfaces, though it only does some 12190 * basic DAD related initialization for IPv6. Honors ILLF_NOARP. 12191 * 12192 * The enumerated value res_act tunes the behavior: 12193 * * Res_act_initial: set up all the resolver structures for a new 12194 * IP address. 12195 * * Res_act_defend: tell ARP that it needs to send a single gratuitous 12196 * ARP message in defense of the address. 12197 * * Res_act_rebind: tell ARP to change the hardware address for an IP 12198 * address (and issue gratuitous ARPs). Used by ipmp_ill_bind_ipif(). 12199 * 12200 * Returns zero on success, or an errno upon failure. 12201 */ 12202 int 12203 ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act) 12204 { 12205 ill_t *ill = ipif->ipif_ill; 12206 int err; 12207 boolean_t was_dup; 12208 12209 ip1dbg(("ipif_resolver_up(%s:%u) flags 0x%x\n", 12210 ill->ill_name, ipif->ipif_id, (uint_t)ipif->ipif_flags)); 12211 ASSERT(IAM_WRITER_IPIF(ipif)); 12212 12213 was_dup = B_FALSE; 12214 if (res_act == Res_act_initial) { 12215 ipif->ipif_addr_ready = 0; 12216 /* 12217 * We're bringing an interface up here. There's no way that we 12218 * should need to shut down ARP now. 
12219 */ 12220 mutex_enter(&ill->ill_lock); 12221 if (ipif->ipif_flags & IPIF_DUPLICATE) { 12222 ipif->ipif_flags &= ~IPIF_DUPLICATE; 12223 ill->ill_ipif_dup_count--; 12224 was_dup = B_TRUE; 12225 } 12226 mutex_exit(&ill->ill_lock); 12227 } 12228 if (ipif->ipif_recovery_id != 0) 12229 (void) untimeout(ipif->ipif_recovery_id); 12230 ipif->ipif_recovery_id = 0; 12231 if (ill->ill_net_type != IRE_IF_RESOLVER) { 12232 ipif->ipif_addr_ready = 1; 12233 return (0); 12234 } 12235 /* NDP will set the ipif_addr_ready flag when it's ready */ 12236 if (ill->ill_isv6) 12237 return (0); 12238 12239 err = ipif_arp_up(ipif, res_act, was_dup); 12240 return (err); 12241 } 12242 12243 /* 12244 * This routine restarts IPv4/IPv6 duplicate address detection (DAD) 12245 * when a link has just gone back up. 12246 */ 12247 static void 12248 ipif_nce_start_dad(ipif_t *ipif) 12249 { 12250 ncec_t *ncec; 12251 ill_t *ill = ipif->ipif_ill; 12252 boolean_t isv6 = ill->ill_isv6; 12253 12254 if (isv6) { 12255 ncec = ncec_lookup_illgrp_v6(ipif->ipif_ill, 12256 &ipif->ipif_v6lcl_addr); 12257 } else { 12258 ipaddr_t v4addr; 12259 12260 if (ill->ill_net_type != IRE_IF_RESOLVER || 12261 (ipif->ipif_flags & IPIF_UNNUMBERED) || 12262 ipif->ipif_lcl_addr == INADDR_ANY) { 12263 /* 12264 * If we can't contact ARP for some reason, 12265 * that's not really a problem. Just send 12266 * out the routing socket notification that 12267 * DAD completion would have done, and continue. 12268 */ 12269 ipif_mask_reply(ipif); 12270 ipif_up_notify(ipif); 12271 ipif->ipif_addr_ready = 1; 12272 return; 12273 } 12274 12275 IN6_V4MAPPED_TO_IPADDR(&ipif->ipif_v6lcl_addr, v4addr); 12276 ncec = ncec_lookup_illgrp_v4(ipif->ipif_ill, &v4addr); 12277 } 12278 12279 if (ncec == NULL) { 12280 ip1dbg(("couldn't find ncec for ipif %p leaving !ready\n", 12281 (void *)ipif)); 12282 return; 12283 } 12284 if (!nce_restart_dad(ncec)) { 12285 /* 12286 * If we can't restart DAD for some reason, that's not really a 12287 * problem. Just send out the routing socket notification that 12288 * DAD completion would have done, and continue. 12289 */ 12290 ipif_up_notify(ipif); 12291 ipif->ipif_addr_ready = 1; 12292 } 12293 ncec_refrele(ncec); 12294 } 12295 12296 /* 12297 * Restart duplicate address detection on all interfaces on the given ill. 12298 * 12299 * This is called when an interface transitions from down to up 12300 * (DL_NOTE_LINK_UP) or up to down (DL_NOTE_LINK_DOWN). 12301 * 12302 * Note that since the underlying physical link has transitioned, we must cause 12303 * at least one routing socket message to be sent here, either via DAD 12304 * completion or just by default on the first ipif. (If we don't do this, then 12305 * in.mpathd will see long delays when doing link-based failure recovery.) 12306 */ 12307 void 12308 ill_restart_dad(ill_t *ill, boolean_t went_up) 12309 { 12310 ipif_t *ipif; 12311 12312 if (ill == NULL) 12313 return; 12314 12315 /* 12316 * If layer two doesn't support duplicate address detection, then just 12317 * send the routing socket message now and be done with it. 12318 */ 12319 if (!ill->ill_isv6 && arp_no_defense) { 12320 ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT); 12321 return; 12322 } 12323 12324 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 12325 if (went_up) { 12326 12327 if (ipif->ipif_flags & IPIF_UP) { 12328 ipif_nce_start_dad(ipif); 12329 } else if (ipif->ipif_flags & IPIF_DUPLICATE) { 12330 /* 12331 * kick off the bring-up process now. 
12332 */ 12333 ipif_do_recovery(ipif); 12334 } else { 12335 /* 12336 * Unfortunately, the first ipif is "special" 12337 * and represents the underlying ill in the 12338 * routing socket messages. Thus, when this 12339 * one ipif is down, we must still notify so 12340 * that the user knows the IFF_RUNNING status 12341 * change. (If the first ipif is up, then 12342 * we'll handle eventual routing socket 12343 * notification via DAD completion.) 12344 */ 12345 if (ipif == ill->ill_ipif) { 12346 ip_rts_ifmsg(ill->ill_ipif, 12347 RTSQ_DEFAULT); 12348 } 12349 } 12350 } else { 12351 /* 12352 * After link down, we'll need to send a new routing 12353 * message when the link comes back, so clear 12354 * ipif_addr_ready. 12355 */ 12356 ipif->ipif_addr_ready = 0; 12357 } 12358 } 12359 12360 /* 12361 * If we've torn down links, then notify the user right away. 12362 */ 12363 if (!went_up) 12364 ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT); 12365 } 12366 12367 static void 12368 ipsq_delete(ipsq_t *ipsq) 12369 { 12370 ipxop_t *ipx = ipsq->ipsq_xop; 12371 12372 ipsq->ipsq_ipst = NULL; 12373 ASSERT(ipsq->ipsq_phyint == NULL); 12374 ASSERT(ipsq->ipsq_xop != NULL); 12375 ASSERT(ipsq->ipsq_xopq_mphead == NULL && ipx->ipx_mphead == NULL); 12376 ASSERT(ipx->ipx_pending_mp == NULL); 12377 kmem_free(ipsq, sizeof (ipsq_t)); 12378 } 12379 12380 static int 12381 ill_up_ipifs_on_ill(ill_t *ill, queue_t *q, mblk_t *mp) 12382 { 12383 int err = 0; 12384 ipif_t *ipif; 12385 12386 if (ill == NULL) 12387 return (0); 12388 12389 ASSERT(IAM_WRITER_ILL(ill)); 12390 ill->ill_up_ipifs = B_TRUE; 12391 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 12392 if (ipif->ipif_was_up) { 12393 if (!(ipif->ipif_flags & IPIF_UP)) 12394 err = ipif_up(ipif, q, mp); 12395 ipif->ipif_was_up = B_FALSE; 12396 if (err != 0) { 12397 ASSERT(err == EINPROGRESS); 12398 return (err); 12399 } 12400 } 12401 } 12402 ill->ill_up_ipifs = B_FALSE; 12403 return (0); 12404 } 12405 12406 /* 12407 * This function is called to bring up all the ipifs that were up before 12408 * bringing the ill down via ill_down_ipifs(). 12409 */ 12410 int 12411 ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp) 12412 { 12413 int err; 12414 12415 ASSERT(IAM_WRITER_ILL(ill)); 12416 12417 if (ill->ill_replumbing) { 12418 ill->ill_replumbing = 0; 12419 /* 12420 * Send down REPLUMB_DONE notification followed by the 12421 * BIND_REQ on the arp stream. 12422 */ 12423 if (!ill->ill_isv6) 12424 arp_send_replumb_conf(ill); 12425 } 12426 err = ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv4, q, mp); 12427 if (err != 0) 12428 return (err); 12429 12430 return (ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv6, q, mp)); 12431 } 12432 12433 /* 12434 * Bring down any IPIF_UP ipifs on ill. If "logical" is B_TRUE, we bring 12435 * down the ipifs without sending DL_UNBIND_REQ to the driver. 12436 */ 12437 static void 12438 ill_down_ipifs(ill_t *ill, boolean_t logical) 12439 { 12440 ipif_t *ipif; 12441 12442 ASSERT(IAM_WRITER_ILL(ill)); 12443 12444 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 12445 /* 12446 * We go through the ipif_down logic even if the ipif 12447 * is already down, since routes can be added based 12448 * on down ipifs. Going through ipif_down once again 12449 * will delete any IREs created based on these routes. 
12450 */ 12451 if (ipif->ipif_flags & IPIF_UP) 12452 ipif->ipif_was_up = B_TRUE; 12453 12454 if (logical) { 12455 (void) ipif_logical_down(ipif, NULL, NULL); 12456 ipif_non_duplicate(ipif); 12457 (void) ipif_down_tail(ipif); 12458 } else { 12459 (void) ipif_down(ipif, NULL, NULL); 12460 } 12461 } 12462 } 12463 12464 /* 12465 * Redo source address selection. This makes IXAF_VERIFY_SOURCE take 12466 * a look again at valid source addresses. 12467 * This should be called each time after the set of source addresses has been 12468 * changed. 12469 */ 12470 void 12471 ip_update_source_selection(ip_stack_t *ipst) 12472 { 12473 /* We skip past SRC_GENERATION_VERIFY */ 12474 if (atomic_add_32_nv(&ipst->ips_src_generation, 1) == 12475 SRC_GENERATION_VERIFY) 12476 atomic_add_32(&ipst->ips_src_generation, 1); 12477 } 12478 12479 /* 12480 * Finish the group join started in ip_sioctl_groupname(). 12481 */ 12482 /* ARGSUSED */ 12483 static void 12484 ip_join_illgrps(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy) 12485 { 12486 ill_t *ill = q->q_ptr; 12487 phyint_t *phyi = ill->ill_phyint; 12488 ipmp_grp_t *grp = phyi->phyint_grp; 12489 ip_stack_t *ipst = ill->ill_ipst; 12490 12491 /* IS_UNDER_IPMP() won't work until ipmp_ill_join_illgrp() is called */ 12492 ASSERT(!IS_IPMP(ill) && grp != NULL); 12493 ASSERT(IAM_WRITER_IPSQ(ipsq)); 12494 12495 if (phyi->phyint_illv4 != NULL) { 12496 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 12497 VERIFY(grp->gr_pendv4-- > 0); 12498 rw_exit(&ipst->ips_ipmp_lock); 12499 ipmp_ill_join_illgrp(phyi->phyint_illv4, grp->gr_v4); 12500 } 12501 if (phyi->phyint_illv6 != NULL) { 12502 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 12503 VERIFY(grp->gr_pendv6-- > 0); 12504 rw_exit(&ipst->ips_ipmp_lock); 12505 ipmp_ill_join_illgrp(phyi->phyint_illv6, grp->gr_v6); 12506 } 12507 freemsg(mp); 12508 } 12509 12510 /* 12511 * Process an SIOCSLIFGROUPNAME request. 12512 */ 12513 /* ARGSUSED */ 12514 int 12515 ip_sioctl_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12516 ip_ioctl_cmd_t *ipip, void *ifreq) 12517 { 12518 struct lifreq *lifr = ifreq; 12519 ill_t *ill = ipif->ipif_ill; 12520 ip_stack_t *ipst = ill->ill_ipst; 12521 phyint_t *phyi = ill->ill_phyint; 12522 ipmp_grp_t *grp = phyi->phyint_grp; 12523 mblk_t *ipsq_mp; 12524 int err = 0; 12525 12526 /* 12527 * Note that phyint_grp can only change here, where we're exclusive. 12528 */ 12529 ASSERT(IAM_WRITER_ILL(ill)); 12530 12531 if (ipif->ipif_id != 0 || ill->ill_usesrc_grp_next != NULL || 12532 (phyi->phyint_flags & PHYI_VIRTUAL)) 12533 return (EINVAL); 12534 12535 lifr->lifr_groupname[LIFGRNAMSIZ - 1] = '\0'; 12536 12537 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 12538 12539 /* 12540 * If the name hasn't changed, there's nothing to do. 12541 */ 12542 if (grp != NULL && strcmp(grp->gr_name, lifr->lifr_groupname) == 0) 12543 goto unlock; 12544 12545 /* 12546 * Handle requests to rename an IPMP meta-interface. 12547 * 12548 * Note that creation of the IPMP meta-interface is handled in 12549 * userland through the standard plumbing sequence. As part of the 12550 * plumbing the IPMP meta-interface, its initial groupname is set to 12551 * the name of the interface (see ipif_set_values_tail()). 12552 */ 12553 if (IS_IPMP(ill)) { 12554 err = ipmp_grp_rename(grp, lifr->lifr_groupname); 12555 goto unlock; 12556 } 12557 12558 /* 12559 * Handle requests to add or remove an IP interface from a group. 
12560 */ 12561 if (lifr->lifr_groupname[0] != '\0') { /* add */ 12562 /* 12563 * Moves are handled by first removing the interface from 12564 * its existing group, and then adding it to another group. 12565 * So, fail if it's already in a group. 12566 */ 12567 if (IS_UNDER_IPMP(ill)) { 12568 err = EALREADY; 12569 goto unlock; 12570 } 12571 12572 grp = ipmp_grp_lookup(lifr->lifr_groupname, ipst); 12573 if (grp == NULL) { 12574 err = ENOENT; 12575 goto unlock; 12576 } 12577 12578 /* 12579 * Check if the phyint and its ills are suitable for 12580 * inclusion into the group. 12581 */ 12582 if ((err = ipmp_grp_vet_phyint(grp, phyi)) != 0) 12583 goto unlock; 12584 12585 /* 12586 * Checks pass; join the group, and enqueue the remaining 12587 * illgrp joins for when we've become part of the group xop 12588 * and are exclusive across its IPSQs. Since qwriter_ip() 12589 * requires an mblk_t to scribble on, and since `mp' will be 12590 * freed as part of completing the ioctl, allocate another. 12591 */ 12592 if ((ipsq_mp = allocb(0, BPRI_MED)) == NULL) { 12593 err = ENOMEM; 12594 goto unlock; 12595 } 12596 12597 /* 12598 * Before we drop ipmp_lock, bump gr_pend* to ensure that the 12599 * IPMP meta-interface ills needed by `phyi' cannot go away 12600 * before ip_join_illgrps() is called back. See the comments 12601 * in ip_sioctl_plink_ipmp() for more. 12602 */ 12603 if (phyi->phyint_illv4 != NULL) 12604 grp->gr_pendv4++; 12605 if (phyi->phyint_illv6 != NULL) 12606 grp->gr_pendv6++; 12607 12608 rw_exit(&ipst->ips_ipmp_lock); 12609 12610 ipmp_phyint_join_grp(phyi, grp); 12611 ill_refhold(ill); 12612 qwriter_ip(ill, ill->ill_rq, ipsq_mp, ip_join_illgrps, 12613 SWITCH_OP, B_FALSE); 12614 return (0); 12615 } else { 12616 /* 12617 * Request to remove the interface from a group. If the 12618 * interface is not in a group, this trivially succeeds. 12619 */ 12620 rw_exit(&ipst->ips_ipmp_lock); 12621 if (IS_UNDER_IPMP(ill)) 12622 ipmp_phyint_leave_grp(phyi); 12623 return (0); 12624 } 12625 unlock: 12626 rw_exit(&ipst->ips_ipmp_lock); 12627 return (err); 12628 } 12629 12630 /* 12631 * Process an SIOCGLIFBINDING request. 12632 */ 12633 /* ARGSUSED */ 12634 int 12635 ip_sioctl_get_binding(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12636 ip_ioctl_cmd_t *ipip, void *ifreq) 12637 { 12638 ill_t *ill; 12639 struct lifreq *lifr = ifreq; 12640 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 12641 12642 if (!IS_IPMP(ipif->ipif_ill)) 12643 return (EINVAL); 12644 12645 rw_enter(&ipst->ips_ipmp_lock, RW_READER); 12646 if ((ill = ipif->ipif_bound_ill) == NULL) 12647 lifr->lifr_binding[0] = '\0'; 12648 else 12649 (void) strlcpy(lifr->lifr_binding, ill->ill_name, LIFNAMSIZ); 12650 rw_exit(&ipst->ips_ipmp_lock); 12651 return (0); 12652 } 12653 12654 /* 12655 * Process an SIOCGLIFGROUPNAME request. 12656 */ 12657 /* ARGSUSED */ 12658 int 12659 ip_sioctl_get_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12660 ip_ioctl_cmd_t *ipip, void *ifreq) 12661 { 12662 ipmp_grp_t *grp; 12663 struct lifreq *lifr = ifreq; 12664 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 12665 12666 rw_enter(&ipst->ips_ipmp_lock, RW_READER); 12667 if ((grp = ipif->ipif_ill->ill_phyint->phyint_grp) == NULL) 12668 lifr->lifr_groupname[0] = '\0'; 12669 else 12670 (void) strlcpy(lifr->lifr_groupname, grp->gr_name, LIFGRNAMSIZ); 12671 rw_exit(&ipst->ips_ipmp_lock); 12672 return (0); 12673 } 12674 12675 /* 12676 * Process an SIOCGLIFGROUPINFO request. 
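 *
 * A hedged userland sketch of the expected caller ("ipmp0" is just an
 * illustrative group name):
 *
 *	lifgroupinfo_t lifgr;
 *
 *	(void) strlcpy(lifgr.gi_grname, "ipmp0", LIFGRNAMSIZ);
 *	if (ioctl(s, SIOCGLIFGROUPINFO, &lifgr) == -1)
 *		...		(fails with ENOENT if no such group)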
12677  */
12678 /* ARGSUSED */
12679 int
12680 ip_sioctl_groupinfo(ipif_t *dummy_ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12681     ip_ioctl_cmd_t *ipip, void *dummy)
12682 {
12683 	ipmp_grp_t *grp;
12684 	lifgroupinfo_t *lifgr;
12685 	ip_stack_t *ipst = CONNQ_TO_IPST(q);
12686 
12687 	/* ip_wput_nondata() verified mp->b_cont->b_cont */
12688 	lifgr = (lifgroupinfo_t *)mp->b_cont->b_cont->b_rptr;
12689 	lifgr->gi_grname[LIFGRNAMSIZ - 1] = '\0';
12690 
12691 	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
12692 	if ((grp = ipmp_grp_lookup(lifgr->gi_grname, ipst)) == NULL) {
12693 		rw_exit(&ipst->ips_ipmp_lock);
12694 		return (ENOENT);
12695 	}
12696 	ipmp_grp_info(grp, lifgr);
12697 	rw_exit(&ipst->ips_ipmp_lock);
12698 	return (0);
12699 }
12700 
12701 static void
12702 ill_dl_down(ill_t *ill)
12703 {
12704 	DTRACE_PROBE2(ill__downup, char *, "ill_dl_down", ill_t *, ill);
12705 
12706 	/*
12707 	 * The ill is down; unbind but stay attached since we're still
12708 	 * associated with a PPA. If we have negotiated DLPI capabilities
12709 	 * with the data link service provider (IDCS_OK) then reset them.
12710 	 * The interval between unbinding and rebinding is potentially
12711 	 * unbounded, hence we cannot assume things will be the same.
12712 	 * The DLPI capabilities will be probed again when the data link
12713 	 * is brought up.
12714 	 */
12715 	mblk_t	*mp = ill->ill_unbind_mp;
12716 
12717 	ip1dbg(("ill_dl_down(%s)\n", ill->ill_name));
12718 
12719 	if (!ill->ill_replumbing) {
12720 		/* Free all ilms for this ill */
12721 		update_conn_ill(ill, ill->ill_ipst);
12722 	} else {
12723 		ill_leave_multicast(ill);
12724 	}
12725 
12726 	ill->ill_unbind_mp = NULL;
12727 	if (mp != NULL) {
12728 		ip1dbg(("ill_dl_down: %s (%u) for %s\n",
12729 		    dl_primstr(*(int *)mp->b_rptr), *(int *)mp->b_rptr,
12730 		    ill->ill_name));
12731 		mutex_enter(&ill->ill_lock);
12732 		ill->ill_state_flags |= ILL_DL_UNBIND_IN_PROGRESS;
12733 		mutex_exit(&ill->ill_lock);
12734 		/*
12735 		 * ip_rput does not pass up normal (M_PROTO) DLPI messages
12736 		 * after ILL_CONDEMNED is set. So in the unplumb case, we call
12737 		 * ill_capability_dld_disable right away. If this is not
12738 		 * an unplumb operation then the disable happens on receipt of
12739 		 * the capab ack via ip_rput_dlpi_writer ->
12740 		 * ill_capability_ack_thr. In both cases the order of
12741 		 * the operations seen by DLD is capability disable followed
12742 		 * by DL_UNBIND. Also the DLD capability disable needs a
12743 		 * cv_wait'able context.
12744 */ 12745 if (ill->ill_state_flags & ILL_CONDEMNED) 12746 ill_capability_dld_disable(ill); 12747 ill_capability_reset(ill, B_FALSE); 12748 ill_dlpi_send(ill, mp); 12749 } 12750 mutex_enter(&ill->ill_lock); 12751 ill->ill_dl_up = 0; 12752 ill_nic_event_dispatch(ill, 0, NE_DOWN, NULL, 0); 12753 mutex_exit(&ill->ill_lock); 12754 } 12755 12756 void 12757 ill_dlpi_dispatch(ill_t *ill, mblk_t *mp) 12758 { 12759 union DL_primitives *dlp; 12760 t_uscalar_t prim; 12761 boolean_t waitack = B_FALSE; 12762 12763 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); 12764 12765 dlp = (union DL_primitives *)mp->b_rptr; 12766 prim = dlp->dl_primitive; 12767 12768 ip1dbg(("ill_dlpi_dispatch: sending %s (%u) to %s\n", 12769 dl_primstr(prim), prim, ill->ill_name)); 12770 12771 switch (prim) { 12772 case DL_PHYS_ADDR_REQ: 12773 { 12774 dl_phys_addr_req_t *dlpap = (dl_phys_addr_req_t *)mp->b_rptr; 12775 ill->ill_phys_addr_pend = dlpap->dl_addr_type; 12776 break; 12777 } 12778 case DL_BIND_REQ: 12779 mutex_enter(&ill->ill_lock); 12780 ill->ill_state_flags &= ~ILL_DL_UNBIND_IN_PROGRESS; 12781 mutex_exit(&ill->ill_lock); 12782 break; 12783 } 12784 12785 /* 12786 * Except for the ACKs for the M_PCPROTO messages, all other ACKs 12787 * are dropped by ip_rput() if ILL_CONDEMNED is set. Therefore 12788 * we only wait for the ACK of the DL_UNBIND_REQ. 12789 */ 12790 mutex_enter(&ill->ill_lock); 12791 if (!(ill->ill_state_flags & ILL_CONDEMNED) || 12792 (prim == DL_UNBIND_REQ)) { 12793 ill->ill_dlpi_pending = prim; 12794 waitack = B_TRUE; 12795 } 12796 12797 mutex_exit(&ill->ill_lock); 12798 DTRACE_PROBE3(ill__dlpi, char *, "ill_dlpi_dispatch", 12799 char *, dl_primstr(prim), ill_t *, ill); 12800 putnext(ill->ill_wq, mp); 12801 12802 /* 12803 * There is no ack for DL_NOTIFY_CONF messages 12804 */ 12805 if (waitack && prim == DL_NOTIFY_CONF) 12806 ill_dlpi_done(ill, prim); 12807 } 12808 12809 /* 12810 * Helper function for ill_dlpi_send(). 12811 */ 12812 /* ARGSUSED */ 12813 static void 12814 ill_dlpi_send_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) 12815 { 12816 ill_dlpi_send(q->q_ptr, mp); 12817 } 12818 12819 /* 12820 * Send a DLPI control message to the driver but make sure there 12821 * is only one outstanding message. Uses ill_dlpi_pending to tell 12822 * when it must queue. ip_rput_dlpi_writer calls ill_dlpi_done() 12823 * when an ACK or a NAK is received to process the next queued message. 12824 */ 12825 void 12826 ill_dlpi_send(ill_t *ill, mblk_t *mp) 12827 { 12828 mblk_t **mpp; 12829 12830 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); 12831 12832 /* 12833 * To ensure that any DLPI requests for current exclusive operation 12834 * are always completely sent before any DLPI messages for other 12835 * operations, require writer access before enqueuing. 12836 */ 12837 if (!IAM_WRITER_ILL(ill)) { 12838 ill_refhold(ill); 12839 /* qwriter_ip() does the ill_refrele() */ 12840 qwriter_ip(ill, ill->ill_wq, mp, ill_dlpi_send_writer, 12841 NEW_OP, B_TRUE); 12842 return; 12843 } 12844 12845 mutex_enter(&ill->ill_lock); 12846 if (ill->ill_dlpi_pending != DL_PRIM_INVAL) { 12847 /* Must queue message. 
Tail insertion */ 12848 mpp = &ill->ill_dlpi_deferred; 12849 while (*mpp != NULL) 12850 mpp = &((*mpp)->b_next); 12851 12852 ip1dbg(("ill_dlpi_send: deferring request for %s " 12853 "while %s pending\n", ill->ill_name, 12854 dl_primstr(ill->ill_dlpi_pending))); 12855 12856 *mpp = mp; 12857 mutex_exit(&ill->ill_lock); 12858 return; 12859 } 12860 mutex_exit(&ill->ill_lock); 12861 ill_dlpi_dispatch(ill, mp); 12862 } 12863 12864 void 12865 ill_capability_send(ill_t *ill, mblk_t *mp) 12866 { 12867 ill->ill_capab_pending_cnt++; 12868 ill_dlpi_send(ill, mp); 12869 } 12870 12871 void 12872 ill_capability_done(ill_t *ill) 12873 { 12874 ASSERT(ill->ill_capab_pending_cnt != 0); 12875 12876 ill_dlpi_done(ill, DL_CAPABILITY_REQ); 12877 12878 ill->ill_capab_pending_cnt--; 12879 if (ill->ill_capab_pending_cnt == 0 && 12880 ill->ill_dlpi_capab_state == IDCS_OK) 12881 ill_capability_reset_alloc(ill); 12882 } 12883 12884 /* 12885 * Send all deferred DLPI messages without waiting for their ACKs. 12886 */ 12887 void 12888 ill_dlpi_send_deferred(ill_t *ill) 12889 { 12890 mblk_t *mp, *nextmp; 12891 12892 /* 12893 * Clear ill_dlpi_pending so that the message is not queued in 12894 * ill_dlpi_send(). 12895 */ 12896 mutex_enter(&ill->ill_lock); 12897 ill->ill_dlpi_pending = DL_PRIM_INVAL; 12898 mp = ill->ill_dlpi_deferred; 12899 ill->ill_dlpi_deferred = NULL; 12900 mutex_exit(&ill->ill_lock); 12901 12902 for (; mp != NULL; mp = nextmp) { 12903 nextmp = mp->b_next; 12904 mp->b_next = NULL; 12905 ill_dlpi_send(ill, mp); 12906 } 12907 } 12908 12909 /* 12910 * Clear all the deferred DLPI messages. Called on receiving an M_ERROR 12911 * or M_HANGUP 12912 */ 12913 static void 12914 ill_dlpi_clear_deferred(ill_t *ill) 12915 { 12916 mblk_t *mp, *nextmp; 12917 12918 mutex_enter(&ill->ill_lock); 12919 ill->ill_dlpi_pending = DL_PRIM_INVAL; 12920 mp = ill->ill_dlpi_deferred; 12921 ill->ill_dlpi_deferred = NULL; 12922 mutex_exit(&ill->ill_lock); 12923 12924 for (; mp != NULL; mp = nextmp) { 12925 nextmp = mp->b_next; 12926 inet_freemsg(mp); 12927 } 12928 } 12929 12930 /* 12931 * Check if the DLPI primitive `prim' is pending; print a warning if not. 12932 */ 12933 boolean_t 12934 ill_dlpi_pending(ill_t *ill, t_uscalar_t prim) 12935 { 12936 t_uscalar_t pending; 12937 12938 mutex_enter(&ill->ill_lock); 12939 if (ill->ill_dlpi_pending == prim) { 12940 mutex_exit(&ill->ill_lock); 12941 return (B_TRUE); 12942 } 12943 12944 /* 12945 * During teardown, ill_dlpi_dispatch() will send DLPI requests 12946 * without waiting, so don't print any warnings in that case. 12947 */ 12948 if (ill->ill_state_flags & ILL_CONDEMNED) { 12949 mutex_exit(&ill->ill_lock); 12950 return (B_FALSE); 12951 } 12952 pending = ill->ill_dlpi_pending; 12953 mutex_exit(&ill->ill_lock); 12954 12955 if (pending == DL_PRIM_INVAL) { 12956 (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE, 12957 "received unsolicited ack for %s on %s\n", 12958 dl_primstr(prim), ill->ill_name); 12959 } else { 12960 (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE, 12961 "received unexpected ack for %s on %s (expecting %s)\n", 12962 dl_primstr(prim), ill->ill_name, dl_primstr(pending)); 12963 } 12964 return (B_FALSE); 12965 } 12966 12967 /* 12968 * Complete the current DLPI operation associated with `prim' on `ill' and 12969 * start the next queued DLPI operation (if any). 
If there are no queued DLPI
12970  * operations and the ill's current exclusive IPSQ operation has finished
12971  * (i.e., ipsq_current_finish() was called), then clear ipsq_current_ipif to
12972  * allow the next exclusive IPSQ operation to begin upon ipsq_exit(). See
12973  * the comments above ipsq_current_finish() for details.
12974  */
12975 void
12976 ill_dlpi_done(ill_t *ill, t_uscalar_t prim)
12977 {
12978 	mblk_t *mp;
12979 	ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;
12980 	ipxop_t *ipx = ipsq->ipsq_xop;
12981 
12982 	ASSERT(IAM_WRITER_IPSQ(ipsq));
12983 	mutex_enter(&ill->ill_lock);
12984 
12985 	ASSERT(prim != DL_PRIM_INVAL);
12986 	ASSERT(ill->ill_dlpi_pending == prim);
12987 
12988 	ip1dbg(("ill_dlpi_done: %s has completed %s (%u)\n", ill->ill_name,
12989 	    dl_primstr(ill->ill_dlpi_pending), ill->ill_dlpi_pending));
12990 
12991 	if ((mp = ill->ill_dlpi_deferred) == NULL) {
12992 		ill->ill_dlpi_pending = DL_PRIM_INVAL;
12993 		if (ipx->ipx_current_done) {
12994 			mutex_enter(&ipx->ipx_lock);
12995 			ipx->ipx_current_ipif = NULL;
12996 			mutex_exit(&ipx->ipx_lock);
12997 		}
12998 		cv_signal(&ill->ill_cv);
12999 		mutex_exit(&ill->ill_lock);
13000 		return;
13001 	}
13002 
13003 	ill->ill_dlpi_deferred = mp->b_next;
13004 	mp->b_next = NULL;
13005 	mutex_exit(&ill->ill_lock);
13006 
13007 	ill_dlpi_dispatch(ill, mp);
13008 }
13009 
13010 /*
13011  * Queue a (multicast) DLPI control message to be sent to the driver by
13012  * a later call to ill_dlpi_send_queued().
13013  * We queue them while holding a lock (ill_mcast_lock) to ensure that they
13014  * are sent in order, i.e., to prevent a DL_DISABMULTI_REQ and a
13015  * DL_ENABMULTI_REQ for the same group from racing.
13016  * We send DLPI control messages in order using ill_lock.
13017  * For IPMP we should be called on the cast_ill.
13018  */
13019 void
13020 ill_dlpi_queue(ill_t *ill, mblk_t *mp)
13021 {
13022 	mblk_t **mpp;
13023 
13024 	ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
13025 
13026 	mutex_enter(&ill->ill_lock);
13027 	/* Must queue message. Tail insertion */
13028 	mpp = &ill->ill_dlpi_deferred;
13029 	while (*mpp != NULL)
13030 		mpp = &((*mpp)->b_next);
13031 
13032 	*mpp = mp;
13033 	mutex_exit(&ill->ill_lock);
13034 }
13035 
13036 /*
13037  * Send the messages that were queued. Make sure there is only
13038  * one outstanding message. ip_rput_dlpi_writer calls ill_dlpi_done()
13039  * when an ACK or a NAK is received to process the next queued message.
13040  * For IPMP we are called on the upper ill, but we send what is queued
13041  * on the cast_ill.
13042  */
13043 void
13044 ill_dlpi_send_queued(ill_t *ill)
13045 {
13046 	mblk_t *mp;
13047 	union DL_primitives *dlp;
13048 	t_uscalar_t prim;
13049 	ill_t *release_ill = NULL;
13050 
13051 	if (IS_IPMP(ill)) {
13052 		/* On the upper IPMP ill. */
13053 		release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
13054 		if (release_ill == NULL) {
13055 			/* Avoid ever sending anything down to the ipmpstub */
13056 			return;
13057 		}
13058 		ill = release_ill;
13059 	}
13060 	mutex_enter(&ill->ill_lock);
13061 	while ((mp = ill->ill_dlpi_deferred) != NULL) {
13062 		if (ill->ill_dlpi_pending != DL_PRIM_INVAL) {
13063 			/* Can't send. Somebody else will send it */
13064 			mutex_exit(&ill->ill_lock);
13065 			goto done;
13066 		}
13067 		ill->ill_dlpi_deferred = mp->b_next;
13068 		mp->b_next = NULL;
13069 		if (!ill->ill_dl_up) {
13070 			/*
13071 			 * Nobody there. All multicast addresses will be
13072 			 * re-joined when we get the DL_BIND_ACK bringing the
13073 			 * interface up.
13074 			 */
13075 			freemsg(mp);
13076 			continue;
13077 		}
13078 		dlp = (union DL_primitives *)mp->b_rptr;
13079 		prim = dlp->dl_primitive;
13080 
13081 		if (!(ill->ill_state_flags & ILL_CONDEMNED) ||
13082 		    (prim == DL_UNBIND_REQ)) {
13083 			ill->ill_dlpi_pending = prim;
13084 		}
13085 		mutex_exit(&ill->ill_lock);
13086 
13087 		DTRACE_PROBE3(ill__dlpi, char *, "ill_dlpi_send_queued",
13088 		    char *, dl_primstr(prim), ill_t *, ill);
13089 		putnext(ill->ill_wq, mp);
13090 		mutex_enter(&ill->ill_lock);
13091 	}
13092 	mutex_exit(&ill->ill_lock);
13093 done:
13094 	if (release_ill != NULL)
13095 		ill_refrele(release_ill);
13096 }
13097 
13098 /*
13099  * Queue an IP (IGMP/MLD) message to be sent by IP from
13100  * ill_mcast_send_queued().
13101  * We queue them while holding a lock (ill_mcast_lock) to ensure that they
13102  * are sent in order, i.e., to prevent an IGMP leave and an IGMP join for
13103  * the same group from racing.
13104  * We send them in order using ill_lock.
13105  * For IPMP we are called on the upper ill, but we queue on the cast_ill.
13106  */
13107 void
13108 ill_mcast_queue(ill_t *ill, mblk_t *mp)
13109 {
13110 	mblk_t **mpp;
13111 	ill_t *release_ill = NULL;
13112 
13113 	ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock));
13114 
13115 	if (IS_IPMP(ill)) {
13116 		/* On the upper IPMP ill. */
13117 		release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
13118 		if (release_ill == NULL) {
13119 			/* Discard instead of queuing for the ipmp interface */
13120 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
13121 			ip_drop_output("ipIfStatsOutDiscards - no cast_ill",
13122 			    mp, ill);
13123 			freemsg(mp);
13124 			return;
13125 		}
13126 		ill = release_ill;
13127 	}
13128 
13129 	mutex_enter(&ill->ill_lock);
13130 	/* Must queue message. Tail insertion */
13131 	mpp = &ill->ill_mcast_deferred;
13132 	while (*mpp != NULL)
13133 		mpp = &((*mpp)->b_next);
13134 
13135 	*mpp = mp;
13136 	mutex_exit(&ill->ill_lock);
13137 	if (release_ill != NULL)
13138 		ill_refrele(release_ill);
13139 }
13140 
13141 /*
13142  * Send the IP packets that were queued by ill_mcast_queue.
13143  * These are IGMP/MLD packets.
13144  *
13145  * For IPMP we are called on the upper ill, but we send what is queued
13146  * on the cast_ill.
13147  *
13148  * Request loopback of the report if we are acting as a multicast
13149  * router, so that the process-level routing daemon can hear it.
13150  * This will run multiple times for the same group if there are members
13151  * of the same group on multiple ipifs of the same ill. The
13152  * igmp_input/mld_input code will suppress the duplicates via the
13153  * loopback; thus we always loop back the membership report.
13154  *
13155  * We also need to make sure that this does not get load balanced
13156  * by IPMP. We do this by passing an ill to ip_output_simple.
13157  */
13158 void
13159 ill_mcast_send_queued(ill_t *ill)
13160 {
13161 	mblk_t *mp;
13162 	ip_xmit_attr_t ixas;
13163 	ill_t *release_ill = NULL;
13164 
13165 	if (IS_IPMP(ill)) {
13166 		/* On the upper IPMP ill. */
13167 		release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
13168 		if (release_ill == NULL) {
13169 			/*
13170 			 * We should have no messages on the ipmp interface;
13171 			 * in any case there is no point in trying to send them.
13172 			 */
13173 			return;
13174 		}
13175 		ill = release_ill;
13176 	}
13177 	bzero(&ixas, sizeof (ixas));
13178 	ixas.ixa_zoneid = ALL_ZONES;
13179 	ixas.ixa_cred = kcred;
13180 	ixas.ixa_cpid = NOPID;
13181 	ixas.ixa_tsl = NULL;
13182 	/*
13183 	 * Here we set ixa_ifindex. If IPMP it will be the lower ill which
13184 	 * makes ip_select_route pick the IRE_MULTICAST for the cast_ill.
13185 * That is necessary to handle IGMP/MLD snooping switches. 13186 */ 13187 ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex; 13188 ixas.ixa_ipst = ill->ill_ipst; 13189 13190 mutex_enter(&ill->ill_lock); 13191 while ((mp = ill->ill_mcast_deferred) != NULL) { 13192 ill->ill_mcast_deferred = mp->b_next; 13193 mp->b_next = NULL; 13194 if (!ill->ill_dl_up) { 13195 /* 13196 * Nobody there. Just drop the ip packets. 13197 * IGMP/MLD will resend later, if this is a replumb. 13198 */ 13199 freemsg(mp); 13200 continue; 13201 } 13202 mutex_enter(&ill->ill_phyint->phyint_lock); 13203 if (IS_UNDER_IPMP(ill) && !ipmp_ill_is_active(ill)) { 13204 /* 13205 * When the ill is getting deactivated, we only want to 13206 * send the DLPI messages, so drop IGMP/MLD packets. 13207 * DLPI messages are handled by ill_dlpi_send_queued() 13208 */ 13209 mutex_exit(&ill->ill_phyint->phyint_lock); 13210 freemsg(mp); 13211 continue; 13212 } 13213 mutex_exit(&ill->ill_phyint->phyint_lock); 13214 mutex_exit(&ill->ill_lock); 13215 13216 /* Check whether we are sending IPv4 or IPv6. */ 13217 if (ill->ill_isv6) { 13218 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 13219 13220 ixas.ixa_multicast_ttl = ip6h->ip6_hops; 13221 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6; 13222 } else { 13223 ipha_t *ipha = (ipha_t *)mp->b_rptr; 13224 13225 ixas.ixa_multicast_ttl = ipha->ipha_ttl; 13226 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4; 13227 ixas.ixa_flags &= ~IXAF_SET_ULP_CKSUM; 13228 } 13229 ixas.ixa_flags &= ~IXAF_VERIFY_SOURCE; 13230 ixas.ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_SOURCE; 13231 (void) ip_output_simple(mp, &ixas); 13232 ixa_cleanup(&ixas); 13233 13234 mutex_enter(&ill->ill_lock); 13235 } 13236 mutex_exit(&ill->ill_lock); 13237 13238 done: 13239 if (release_ill != NULL) 13240 ill_refrele(release_ill); 13241 } 13242 13243 /* 13244 * Take down a specific interface, but don't lose any information about it. 13245 * (Always called as writer.) 13246 * This function goes through the down sequence even if the interface is 13247 * already down. There are 2 reasons. 13248 * a. Currently we permit interface routes that depend on down interfaces 13249 * to be added. This behaviour itself is questionable. However it appears 13250 * that both Solaris and 4.3 BSD have exhibited this behaviour for a long 13251 * time. We go thru the cleanup in order to remove these routes. 13252 * b. The bringup of the interface could fail in ill_dl_up i.e. we get 13253 * DL_ERROR_ACK in response to the DL_BIND request. The interface is 13254 * down, but we need to cleanup i.e. do ill_dl_down and 13255 * ip_rput_dlpi_writer (DL_ERROR_ACK) -> ipif_down. 13256 * 13257 * IP-MT notes: 13258 * 13259 * Model of reference to interfaces. 13260 * 13261 * The following members in ipif_t track references to the ipif. 13262 * int ipif_refcnt; Active reference count 13263 * 13264 * The following members in ill_t track references to the ill. 13265 * int ill_refcnt; active refcnt 13266 * uint_t ill_ire_cnt; Number of ires referencing ill 13267 * uint_t ill_ncec_cnt; Number of ncecs referencing ill 13268 * uint_t ill_nce_cnt; Number of nces referencing ill 13269 * uint_t ill_ilm_cnt; Number of ilms referencing ill 13270 * 13271 * Reference to an ipif or ill can be obtained in any of the following ways. 13272 * 13273 * Through the lookup functions ipif_lookup_* / ill_lookup_* functions 13274 * Pointers to ipif / ill from other data structures viz ire and conn. 13275 * Implicit reference to the ipif / ill by holding a reference to the ire. 
13276 *
13277 * The ipif/ill lookup functions return a reference-held ipif / ill.
13278 * ipif_refcnt and ill_refcnt track the reference counts respectively.
13279 * This is a purely dynamic reference count associated with threads holding
13280 * references to the ipif / ill. Pointers from other structures do not
13281 * count towards this reference count.
13282 *
13283 * ill_ire_cnt is the number of ire's associated with the
13284 * ill. This is incremented whenever a new ire is created referencing the
13285 * ill. This is done atomically inside ire_add_v[46] where the ire is
13286 * actually added to the ire hash table. The count is decremented in
13287 * ire_inactive where the ire is destroyed.
13288 *
13289 * ill_ncec_cnt is the number of ncec's referencing the ill thru ncec_ill.
13290 * This is incremented atomically in
13291 * ndp_add_v4()/ndp_add_v6() where the nce is actually added to the
13292 * table. Similarly it is decremented in ncec_inactive() where the ncec
13293 * is destroyed.
13294 *
13295 * ill_nce_cnt is the number of nce's referencing the ill thru nce_ill. This is
13296 * incremented atomically in nce_add() where the nce is actually added to the
13297 * ill_nce. Similarly it is decremented in nce_inactive() where the nce
13298 * is destroyed.
13299 *
13300 * ill_ilm_cnt is the ilm's reference to the ill. It is incremented in
13301 * ilm_add() and decremented before the ilm is freed in ilm_delete().
13302 *
13303 * Flow of ioctls involving interface down/up
13304 *
13305 * The following is the sequence of an attempt to set some critical flags on an
13306 * up interface.
13307 * ip_sioctl_flags
13308 * ipif_down
13309 * wait for ipif to be quiescent
13310 * ipif_down_tail
13311 * ip_sioctl_flags_tail
13312 *
13313 * All set ioctls that involve a down/up sequence would have a skeleton similar
13314 * to the above. All the *tail functions are called after the refcounts have
13315 * dropped to the appropriate values.
13316 *
13317 * SIOC ioctls during the IPIF_CHANGING interval.
13318 *
13319 * Threads handling SIOC set ioctls serialize on the ipsq, but this
13320 * is not done for SIOC get ioctls. Since a set ioctl can cause several
13321 * steps of internal changes to the state, some of which are visible in
13322 * ipif_flags (such as IFF_UP being cleared and later set), and we want
13323 * the set ioctl to be atomic with respect to the get ioctls, the SIOC get code
13324 * will wait and restart ioctls if IPIF_CHANGING is set. The mblk is then
13325 * enqueued in the ipsq and the operation is restarted by ipsq_exit() when
13326 * the current exclusive operation completes. The IPIF_CHANGING check
13327 * and enqueue is atomic using the ill_lock and ipsq_lock. The
13328 * lookup is done holding the ill_lock. Hence the ill/ipif state flags can't
13329 * change while the ill_lock is held. Before dropping the ill_lock we acquire
13330 * the ipsq_lock and call ipsq_enq. This ensures that ipsq_exit can't finish
13331 * until we release the ipsq_lock, even though the ill/ipif state flags
13332 * can change after we drop the ill_lock.
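 *
 * As a sketch only (the real get-side code lives in functions such as
 * ipif_lookup_on_name_async() below, which also takes the ipx_lock),
 * the check-and-enqueue protocol described above looks roughly like:
 *
 *	mutex_enter(&ill->ill_lock);
 *	if (IPIF_IS_CHANGING(ipif)) {
 *		ipsq = ill->ill_phyint->phyint_ipsq;
 *		mutex_enter(&ipsq->ipsq_lock);
 *		mutex_exit(&ill->ill_lock);
 *		ipsq_enq(ipsq, q, mp, func, NEW_OP, ill);
 *		mutex_exit(&ipsq->ipsq_lock);
 *		return (EINPROGRESS);
 *	}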
13333 */
13334 int
13335 ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
13336 {
13337 ill_t *ill = ipif->ipif_ill;
13338 conn_t *connp;
13339 boolean_t success;
13340 boolean_t ipif_was_up = B_FALSE;
13341 ip_stack_t *ipst = ill->ill_ipst;
13342
13343 ASSERT(IAM_WRITER_IPIF(ipif));
13344
13345 ip1dbg(("ipif_down(%s:%u)\n", ill->ill_name, ipif->ipif_id));
13346
13347 DTRACE_PROBE3(ipif__downup, char *, "ipif_down",
13348 ill_t *, ill, ipif_t *, ipif);
13349
13350 if (ipif->ipif_flags & IPIF_UP) {
13351 mutex_enter(&ill->ill_lock);
13352 ipif->ipif_flags &= ~IPIF_UP;
13353 ASSERT(ill->ill_ipif_up_count > 0);
13354 --ill->ill_ipif_up_count;
13355 mutex_exit(&ill->ill_lock);
13356 ipif_was_up = B_TRUE;
13357 /* Update status in SCTP's list */
13358 sctp_update_ipif(ipif, SCTP_IPIF_DOWN);
13359 ill_nic_event_dispatch(ipif->ipif_ill,
13360 MAP_IPIF_ID(ipif->ipif_id), NE_LIF_DOWN, NULL, 0);
13361 }
13362
13363 /*
13364 * Removal of the last ipif from an ill may result in a DL_UNBIND
13365 * being sent to the driver, and we must not send any data packets to
13366 * the driver after the DL_UNBIND_REQ. To ensure this, all the
13367 * ire and nce entries used in the data path will be cleaned
13368 * up, and we also set the ILL_DOWN_IN_PROGRESS bit to make
13369 * sure no new entries will be added until the ill is bound
13370 * again. The ILL_DOWN_IN_PROGRESS bit is turned off upon
13371 * receipt of a DL_BIND_ACK.
13372 */
13373 if (ill->ill_wq != NULL && !ill->ill_logical_down &&
13374 ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 &&
13375 ill->ill_dl_up) {
13376 ill->ill_state_flags |= ILL_DOWN_IN_PROGRESS;
13377 }
13378
13379 /*
13380 * Blow away memberships we established in ipif_multicast_up().
13381 */
13382 ipif_multicast_down(ipif);
13383
13384 /*
13385 * Remove from the mapping for __sin6_src_id. We insert only
13386 * when the address is not INADDR_ANY. As IPv4 addresses are
13387 * stored as mapped addresses, we need to check for mapped
13388 * INADDR_ANY also.
13389 */
13390 if (ipif_was_up && !IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) &&
13391 !IN6_IS_ADDR_V4MAPPED_ANY(&ipif->ipif_v6lcl_addr) &&
13392 !(ipif->ipif_flags & IPIF_NOLOCAL)) {
13393 int err;
13394
13395 err = ip_srcid_remove(&ipif->ipif_v6lcl_addr,
13396 ipif->ipif_zoneid, ipst);
13397 if (err != 0) {
13398 ip0dbg(("ipif_down: srcid_remove %d\n", err));
13399 }
13400 }
13401
13402 if (ipif_was_up) {
13403 /* only delete if we'd added ire's before */
13404 if (ipif->ipif_isv6)
13405 ipif_delete_ires_v6(ipif);
13406 else
13407 ipif_delete_ires_v4(ipif);
13408 }
13409
13410 if (ipif_was_up && ill->ill_ipif_up_count == 0) {
13411 /*
13412 * Since the interface is now down, it may have just become
13413 * inactive. Note that this needs to be done even for an
13414 * ipif_logical_down(), or ARP entries will not get correctly
13415 * restored when the interface comes back up.
13416 */
13417 if (IS_UNDER_IPMP(ill))
13418 ipmp_ill_refresh_active(ill);
13419 }
13420
13421 /*
13422 * Delete any neighbor-discovery or arp entries for this interface. The
13423 * ipif has to be quiesced, so we walk all the nce's and delete those
13424 * that point at the ipif->ipif_ill. At the same time, we also
13425 * update IPMP so that ipifs for data addresses are unbound. We don't
13426 * call ipif_arp_down to DL_UNBIND the arp stream itself here, but defer
13427 * that to ipif_down_tail().
13428 */
13429 ipif_nce_down(ipif);
13430
13431 /*
13432 * If this is the last ipif on the ill, we also need to remove
13433 * any IREs with ire_ill set.
Otherwise ipif_is_quiescent() will
13434 * never succeed.
13435 */
13436 if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0)
13437 ire_walk_ill(0, 0, ill_downi, ill, ill);
13438
13439 /*
13440 * Walk all CONNs that can have a reference on an ire for this
13441 * ipif (we actually walk all that now have stale references).
13442 */
13443 ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ipst);
13444
13445 /*
13446 * If mp is NULL the caller will wait for the appropriate refcnt.
13447 * Eg. ip_sioctl_removeif -> ipif_free -> ipif_down
13448 * and ill_delete -> ipif_free -> ipif_down
13449 */
13450 if (mp == NULL) {
13451 ASSERT(q == NULL);
13452 return (0);
13453 }
13454
13455 if (CONN_Q(q)) {
13456 connp = Q_TO_CONN(q);
13457 mutex_enter(&connp->conn_lock);
13458 } else {
13459 connp = NULL;
13460 }
13461 mutex_enter(&ill->ill_lock);
13462 /*
13463 * Are there any ire's pointing to this ipif that are still active ?
13464 * If this is the last ipif going down, are there any ire's pointing
13465 * to this ill that are still active ?
13466 */
13467 if (ipif_is_quiescent(ipif)) {
13468 mutex_exit(&ill->ill_lock);
13469 if (connp != NULL)
13470 mutex_exit(&connp->conn_lock);
13471 return (0);
13472 }
13473
13474 ip1dbg(("ipif_down: need to wait, adding pending mp %s ill %p",
13475 ill->ill_name, (void *)ill));
13476 /*
13477 * Enqueue the mp atomically in ipsq_pending_mp. When the refcount
13478 * drops down, the operation will be restarted by ipif_ill_refrele_tail
13479 * which in turn is called by the last refrele on the ipif/ill/ire.
13480 */
13481 success = ipsq_pending_mp_add(connp, ipif, q, mp, IPIF_DOWN);
13482 if (!success) {
13483 /* The conn is closing. So just return */
13484 ASSERT(connp != NULL);
13485 mutex_exit(&ill->ill_lock);
13486 mutex_exit(&connp->conn_lock);
13487 return (EINTR);
13488 }
13489
13490 mutex_exit(&ill->ill_lock);
13491 if (connp != NULL)
13492 mutex_exit(&connp->conn_lock);
13493 return (EINPROGRESS);
13494 }
13495
13496 int
13497 ipif_down_tail(ipif_t *ipif)
13498 {
13499 ill_t *ill = ipif->ipif_ill;
13500 int err = 0;
13501
13502 DTRACE_PROBE3(ipif__downup, char *, "ipif_down_tail",
13503 ill_t *, ill, ipif_t *, ipif);
13504
13505 /*
13506 * Skip any loopback interface (null wq).
13507 * If this is the last logical interface on the ill
13508 * have ill_dl_down tell the driver we are gone (unbind)
13509 * Note that lun 0 can ipif_down even though
13510 * there are other logical units that are up.
13511 * This occurs e.g. when we change a "significant" IFF_ flag.
13512 */
13513 if (ill->ill_wq != NULL && !ill->ill_logical_down &&
13514 ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 &&
13515 ill->ill_dl_up) {
13516 ill_dl_down(ill);
13517 }
13518 if (!ipif->ipif_isv6)
13519 err = ipif_arp_down(ipif);
13520
13521 ill->ill_logical_down = 0;
13522
13523 ip_rts_ifmsg(ipif, RTSQ_DEFAULT);
13524 ip_rts_newaddrmsg(RTM_DELETE, 0, ipif, RTSQ_DEFAULT);
13525 return (err);
13526 }
13527
13528 /*
13529 * Bring the interface logically down without bringing the physical interface
13530 * down, e.g. when the netmask is changed. This avoids long-lasting link
13531 * negotiations between an Ethernet interface and certain switches.
13532 */
13533 static int
13534 ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
13535 {
13536 DTRACE_PROBE3(ipif__downup, char *, "ipif_logical_down",
13537 ill_t *, ipif->ipif_ill, ipif_t *, ipif);
13538
13539 /*
13540 * The ill_logical_down flag is a transient flag.
It is set here
13541 * and is cleared once the down has completed in ipif_down_tail.
13542 * This flag does not indicate whether the ill stream is in the
13543 * DL_BOUND state with the driver. Instead this flag is used by
13544 * ipif_down_tail to determine whether to DL_UNBIND the stream with
13545 * the driver. The state of the ill stream i.e. whether it is
13546 * DL_BOUND with the driver or not is indicated by the ill_dl_up flag.
13547 */
13548 ipif->ipif_ill->ill_logical_down = 1;
13549 return (ipif_down(ipif, q, mp));
13550 }
13551
13552 /*
13553 * Initiate the deallocation of an IPIF. Always called as writer. Called by
13554 * ill_delete or ip_sioctl_removeif.
13555 */
13556 static void
13557 ipif_free(ipif_t *ipif)
13558 {
13559 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
13560
13561 ASSERT(IAM_WRITER_IPIF(ipif));
13562
13563 if (ipif->ipif_recovery_id != 0)
13564 (void) untimeout(ipif->ipif_recovery_id);
13565 ipif->ipif_recovery_id = 0;
13566
13567 /*
13568 * Take down the interface. We can be called either from ill_delete
13569 * or from ip_sioctl_removeif.
13570 */
13571 (void) ipif_down(ipif, NULL, NULL);
13572
13573 /*
13574 * Now that the interface is down, there's no chance it can still
13575 * become a duplicate. Cancel any timer that may have been set while
13576 * tearing down.
13577 */
13578 if (ipif->ipif_recovery_id != 0)
13579 (void) untimeout(ipif->ipif_recovery_id);
13580 ipif->ipif_recovery_id = 0;
13581
13582 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
13583 /* Remove pointers to this ill in the multicast routing tables */
13584 reset_mrt_vif_ipif(ipif);
13585 /* If necessary, clear the cached source ipif rotor. */
13586 if (ipif->ipif_ill->ill_src_ipif == ipif)
13587 ipif->ipif_ill->ill_src_ipif = NULL;
13588 rw_exit(&ipst->ips_ill_g_lock);
13589 }
13590
13591 static void
13592 ipif_free_tail(ipif_t *ipif)
13593 {
13594 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
13595
13596 /*
13597 * Need to hold both ill_g_lock and ill_lock while
13598 * inserting or removing an ipif from the linked list
13599 * of ipifs hanging off the ill.
13600 */
13601 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
13602
13603 #ifdef DEBUG
13604 ipif_trace_cleanup(ipif);
13605 #endif
13606
13607 /* Ask SCTP to take it out of its list */
13608 sctp_update_ipif(ipif, SCTP_IPIF_REMOVE);
13609 ip_rts_newaddrmsg(RTM_FREEADDR, 0, ipif, RTSQ_DEFAULT);
13610
13611 /* Get it out of the ILL interface list. */
13612 ipif_remove(ipif);
13613 rw_exit(&ipst->ips_ill_g_lock);
13614
13615 ASSERT(!(ipif->ipif_flags & (IPIF_UP | IPIF_DUPLICATE)));
13616 ASSERT(ipif->ipif_recovery_id == 0);
13617 ASSERT(ipif->ipif_ire_local == NULL);
13618 ASSERT(ipif->ipif_ire_if == NULL);
13619
13620 /* Free the memory. */
13621 mi_free(ipif);
13622 }
13623
13624 /*
13625 * Sets `buf' to an ipif name of the form "ill_name:id", or "ill_name" if "id"
13626 * is zero.
13627 */
13628 void
13629 ipif_get_name(const ipif_t *ipif, char *buf, int len)
13630 {
13631 char lbuf[LIFNAMSIZ];
13632 char *name;
13633 size_t name_len;
13634
13635 buf[0] = '\0';
13636 name = ipif->ipif_ill->ill_name;
13637 name_len = ipif->ipif_ill->ill_name_length;
13638 if (ipif->ipif_id != 0) {
13639 (void) sprintf(lbuf, "%s%c%d", name, IPIF_SEPARATOR_CHAR,
13640 ipif->ipif_id);
13641 name = lbuf;
13642 name_len = mi_strlen(name) + 1;
13643 }
13644 len -= 1;
13645 buf[len] = '\0';
13646 len = MIN(len, name_len);
13647 bcopy(name, buf, len);
13648 }
13649
13650 /*
13651 * Sets `buf' to an ill name.
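 * As with ipif_get_name() above, the result is NUL-terminated and
 * silently truncated to fit `len'. A typical caller (a sketch only,
 * not a quote of any actual caller) supplies a LIFNAMSIZ buffer:
 *
 *	char ifname[LIFNAMSIZ];
 *
 *	ill_get_name(ill, ifname, sizeof (ifname));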
13652 */
13653 void
13654 ill_get_name(const ill_t *ill, char *buf, int len)
13655 {
13656 char *name;
13657 size_t name_len;
13658
13659 name = ill->ill_name;
13660 name_len = ill->ill_name_length;
13661 len -= 1;
13662 buf[len] = '\0';
13663 len = MIN(len, name_len);
13664 bcopy(name, buf, len);
13665 }
13666
13667 /*
13668 * Find an IPIF based on the name passed in. Names can be of the form <phys>
13669 * (e.g., le0) or <phys>:<#> (e.g., le0:1). When there is no colon, the
13670 * implied unit id is zero. <phys> must correspond to the name of an ILL.
13671 * (May be called as writer.)
13672 */
13673 static ipif_t *
13674 ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc,
13675 boolean_t *exists, boolean_t isv6, zoneid_t zoneid, ip_stack_t *ipst)
13676 {
13677 char *cp;
13678 char *endp;
13679 long id;
13680 ill_t *ill;
13681 ipif_t *ipif;
13682 uint_t ire_type;
13683 boolean_t did_alloc = B_FALSE;
13684 char last;
13685
13686 /*
13687 * If the caller wants us to create the ipif, make sure we have a
13688 * valid zoneid
13689 */
13690 ASSERT(!do_alloc || zoneid != ALL_ZONES);
13691
13692 if (namelen == 0) {
13693 return (NULL);
13694 }
13695
13696 *exists = B_FALSE;
13697 /* Look for a colon in the name. */
13698 endp = &name[namelen];
13699 for (cp = endp; --cp > name; ) {
13700 if (*cp == IPIF_SEPARATOR_CHAR)
13701 break;
13702 }
13703
13704 if (*cp == IPIF_SEPARATOR_CHAR) {
13705 /*
13706 * Reject any non-decimal aliases for logical
13707 * interfaces. Aliases with leading zeroes
13708 * are also rejected as they introduce ambiguity
13709 * in the naming of the interfaces.
13710 * In order to conform to existing semantics,
13711 * and to not break any programs/scripts relying
13712 * on that behaviour, if<0>:0 is considered to be
13713 * a valid interface.
13714 *
13715 * If alias has two or more digits and the first
13716 * is zero, fail.
13717 */
13718 if (&cp[2] < endp && cp[1] == '0') {
13719 return (NULL);
13720 }
13721 }
13722
13723 if (cp <= name) {
13724 cp = endp;
13725 }
13726 last = *cp;
13727 *cp = '\0';
13728
13729 /*
13730 * Look up the ILL, based on the portion of the name
13731 * before the colon. ill_lookup_on_name returns a held ill.
13732 * did_alloc is a temporary used to check whether the ill already
13733 * exists; if so, ill_lookup_on_name will clear it.
13734 */
13735 ill = ill_lookup_on_name(name, do_alloc, isv6,
13736 &did_alloc, ipst);
13737 *cp = last;
13738 if (ill == NULL)
13739 return (NULL);
13740
13741 /* Establish the unit number in the name. */
13742 id = 0;
13743 if (cp < endp && *endp == '\0') {
13744 /* If there was a colon, the unit number follows. */
13745 cp++;
13746 if (ddi_strtol(cp, NULL, 0, &id) != 0) {
13747 ill_refrele(ill);
13748 return (NULL);
13749 }
13750 }
13751
13752 mutex_enter(&ill->ill_lock);
13753 /* Now see if there is an IPIF with this unit number. */
13754 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
13755 if (ipif->ipif_id == id) {
13756 if (zoneid != ALL_ZONES &&
13757 zoneid != ipif->ipif_zoneid &&
13758 ipif->ipif_zoneid != ALL_ZONES) {
13759 mutex_exit(&ill->ill_lock);
13760 ill_refrele(ill);
13761 return (NULL);
13762 }
13763 if (IPIF_CAN_LOOKUP(ipif)) {
13764 ipif_refhold_locked(ipif);
13765 mutex_exit(&ill->ill_lock);
13766 if (!did_alloc)
13767 *exists = B_TRUE;
13768 /*
13769 * Drop locks before calling ill_refrele
13770 * since it can potentially call into
13771 * ipif_ill_refrele_tail which can end up
13772 * in trying to acquire any lock.
13773 */
13774 ill_refrele(ill);
13775 return (ipif);
13776 }
13777 }
13778 }
13779
13780 if (!do_alloc) {
13781 mutex_exit(&ill->ill_lock);
13782 ill_refrele(ill);
13783 return (NULL);
13784 }
13785
13786 /*
13787 * If none found, atomically allocate and return a new one.
13788 * Historically, we used IRE_LOOPBACK only for lun 0, and IRE_LOCAL
13789 * to support "receive only" use of lo0:1 etc. as is still done
13790 * below as an initial guess.
13791 * However, this is now likely to be overridden later in ipif_up_done()
13792 * when we know for sure what address has been configured on the
13793 * interface, since we might have more than one loopback interface
13794 * with a loopback address, e.g. in the case of zones, and all the
13795 * interfaces with loopback addresses need to be marked IRE_LOOPBACK.
13796 */
13797 if (ill->ill_net_type == IRE_LOOPBACK && id == 0)
13798 ire_type = IRE_LOOPBACK;
13799 else
13800 ire_type = IRE_LOCAL;
13801 ipif = ipif_allocate(ill, id, ire_type, B_TRUE, B_TRUE, NULL);
13802 if (ipif != NULL)
13803 ipif_refhold_locked(ipif);
13804 mutex_exit(&ill->ill_lock);
13805 ill_refrele(ill);
13806 return (ipif);
13807 }
13808
13809 /*
13810 * Variant of the above that queues the request on the ipsq when
13811 * IPIF_CHANGING is set.
13812 */
13813 static ipif_t *
13814 ipif_lookup_on_name_async(char *name, size_t namelen, boolean_t isv6,
13815 zoneid_t zoneid, queue_t *q, mblk_t *mp, ipsq_func_t func, int *error,
13816 ip_stack_t *ipst)
13817 {
13818 char *cp;
13819 char *endp;
13820 long id;
13821 ill_t *ill;
13822 ipif_t *ipif;
13823 boolean_t did_alloc = B_FALSE;
13824 ipsq_t *ipsq;
13825
13826 if (error != NULL)
13827 *error = 0;
13828
13829 if (namelen == 0) {
13830 if (error != NULL)
13831 *error = ENXIO;
13832 return (NULL);
13833 }
13834
13835 /* Look for a colon in the name. */
13836 endp = &name[namelen];
13837 for (cp = endp; --cp > name; ) {
13838 if (*cp == IPIF_SEPARATOR_CHAR)
13839 break;
13840 }
13841
13842 if (*cp == IPIF_SEPARATOR_CHAR) {
13843 /*
13844 * Reject any non-decimal aliases for logical
13845 * interfaces. Aliases with leading zeroes
13846 * are also rejected as they introduce ambiguity
13847 * in the naming of the interfaces.
13848 * In order to conform to existing semantics,
13849 * and to not break any programs/scripts relying
13850 * on that behaviour, if<0>:0 is considered to be
13851 * a valid interface.
13852 *
13853 * If alias has two or more digits and the first
13854 * is zero, fail.
13855 */
13856 if (&cp[2] < endp && cp[1] == '0') {
13857 if (error != NULL)
13858 *error = EINVAL;
13859 return (NULL);
13860 }
13861 }
13862
13863 if (cp <= name) {
13864 cp = endp;
13865 } else {
13866 *cp = '\0';
13867 }
13868
13869 /*
13870 * Look up the ILL, based on the portion of the name
13871 * before the colon. ill_lookup_on_name returns a held ill.
13872 * did_alloc is a temporary used to check whether the ill already
13873 * exists; if so, ill_lookup_on_name will clear it.
13874 */
13875 ill = ill_lookup_on_name(name, B_FALSE, isv6, &did_alloc, ipst);
13876 if (cp != endp)
13877 *cp = IPIF_SEPARATOR_CHAR;
13878 if (ill == NULL)
13879 return (NULL);
13880
13881 /* Establish the unit number in the name. */
13882 id = 0;
13883 if (cp < endp && *endp == '\0') {
13884 /* If there was a colon, the unit number follows.
*/ 13885 cp++; 13886 if (ddi_strtol(cp, NULL, 0, &id) != 0) { 13887 ill_refrele(ill); 13888 if (error != NULL) 13889 *error = ENXIO; 13890 return (NULL); 13891 } 13892 } 13893 13894 GRAB_CONN_LOCK(q); 13895 mutex_enter(&ill->ill_lock); 13896 /* Now see if there is an IPIF with this unit number. */ 13897 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 13898 if (ipif->ipif_id == id) { 13899 if (zoneid != ALL_ZONES && 13900 zoneid != ipif->ipif_zoneid && 13901 ipif->ipif_zoneid != ALL_ZONES) { 13902 mutex_exit(&ill->ill_lock); 13903 RELEASE_CONN_LOCK(q); 13904 ill_refrele(ill); 13905 if (error != NULL) 13906 *error = ENXIO; 13907 return (NULL); 13908 } 13909 13910 if (!(IPIF_IS_CHANGING(ipif) || 13911 IPIF_IS_CONDEMNED(ipif)) || 13912 IAM_WRITER_IPIF(ipif)) { 13913 ipif_refhold_locked(ipif); 13914 mutex_exit(&ill->ill_lock); 13915 /* 13916 * Drop locks before calling ill_refrele 13917 * since it can potentially call into 13918 * ipif_ill_refrele_tail which can end up 13919 * in trying to acquire any lock. 13920 */ 13921 RELEASE_CONN_LOCK(q); 13922 ill_refrele(ill); 13923 return (ipif); 13924 } else if (q != NULL && !IPIF_IS_CONDEMNED(ipif)) { 13925 ipsq = ill->ill_phyint->phyint_ipsq; 13926 mutex_enter(&ipsq->ipsq_lock); 13927 mutex_enter(&ipsq->ipsq_xop->ipx_lock); 13928 mutex_exit(&ill->ill_lock); 13929 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); 13930 mutex_exit(&ipsq->ipsq_xop->ipx_lock); 13931 mutex_exit(&ipsq->ipsq_lock); 13932 RELEASE_CONN_LOCK(q); 13933 ill_refrele(ill); 13934 if (error != NULL) 13935 *error = EINPROGRESS; 13936 return (NULL); 13937 } 13938 } 13939 } 13940 RELEASE_CONN_LOCK(q); 13941 mutex_exit(&ill->ill_lock); 13942 ill_refrele(ill); 13943 if (error != NULL) 13944 *error = ENXIO; 13945 return (NULL); 13946 } 13947 13948 /* 13949 * This routine is called whenever a new address comes up on an ipif. If 13950 * we are configured to respond to address mask requests, then we are supposed 13951 * to broadcast an address mask reply at this time. This routine is also 13952 * called if we are already up, but a netmask change is made. This is legal 13953 * but might not make the system manager very popular. (May be called 13954 * as writer.) 
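 *
 * The reply built below is REPLY_LEN bytes: an IPv4 header copied
 * from icmp_ipha, an icmph_t of type ICMP_ADDRESS_MASK_REPLY, and
 * the IP_ADDR_LEN-byte ipif_net_mask, sent from ipif_lcl_addr to
 * ipif_brd_addr:
 *
 *	[ ipha_t | icmph_t | netmask ]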
13955 */ 13956 void 13957 ipif_mask_reply(ipif_t *ipif) 13958 { 13959 icmph_t *icmph; 13960 ipha_t *ipha; 13961 mblk_t *mp; 13962 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 13963 ip_xmit_attr_t ixas; 13964 13965 #define REPLY_LEN (sizeof (icmp_ipha) + sizeof (icmph_t) + IP_ADDR_LEN) 13966 13967 if (!ipst->ips_ip_respond_to_address_mask_broadcast) 13968 return; 13969 13970 /* ICMP mask reply is IPv4 only */ 13971 ASSERT(!ipif->ipif_isv6); 13972 /* ICMP mask reply is not for a loopback interface */ 13973 ASSERT(ipif->ipif_ill->ill_wq != NULL); 13974 13975 if (ipif->ipif_lcl_addr == INADDR_ANY) 13976 return; 13977 13978 mp = allocb(REPLY_LEN, BPRI_HI); 13979 if (mp == NULL) 13980 return; 13981 mp->b_wptr = mp->b_rptr + REPLY_LEN; 13982 13983 ipha = (ipha_t *)mp->b_rptr; 13984 bzero(ipha, REPLY_LEN); 13985 *ipha = icmp_ipha; 13986 ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl; 13987 ipha->ipha_src = ipif->ipif_lcl_addr; 13988 ipha->ipha_dst = ipif->ipif_brd_addr; 13989 ipha->ipha_length = htons(REPLY_LEN); 13990 ipha->ipha_ident = 0; 13991 13992 icmph = (icmph_t *)&ipha[1]; 13993 icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY; 13994 bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN); 13995 icmph->icmph_checksum = IP_CSUM(mp, sizeof (ipha_t), 0); 13996 13997 bzero(&ixas, sizeof (ixas)); 13998 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4; 13999 ixas.ixa_zoneid = ALL_ZONES; 14000 ixas.ixa_ifindex = 0; 14001 ixas.ixa_ipst = ipst; 14002 ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; 14003 (void) ip_output_simple(mp, &ixas); 14004 ixa_cleanup(&ixas); 14005 #undef REPLY_LEN 14006 } 14007 14008 /* 14009 * Join the ipif specific multicast groups. 14010 * Must be called after a mapping has been set up in the resolver. (Always 14011 * called as writer.) 14012 */ 14013 void 14014 ipif_multicast_up(ipif_t *ipif) 14015 { 14016 int err; 14017 ill_t *ill; 14018 ilm_t *ilm; 14019 14020 ASSERT(IAM_WRITER_IPIF(ipif)); 14021 14022 ill = ipif->ipif_ill; 14023 14024 ip1dbg(("ipif_multicast_up\n")); 14025 if (!(ill->ill_flags & ILLF_MULTICAST) || 14026 ipif->ipif_allhosts_ilm != NULL) 14027 return; 14028 14029 if (ipif->ipif_isv6) { 14030 in6_addr_t v6allmc = ipv6_all_hosts_mcast; 14031 in6_addr_t v6solmc = ipv6_solicited_node_mcast; 14032 14033 v6solmc.s6_addr32[3] |= ipif->ipif_v6lcl_addr.s6_addr32[3]; 14034 14035 if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) 14036 return; 14037 14038 ip1dbg(("ipif_multicast_up - addmulti\n")); 14039 14040 /* 14041 * Join the all hosts multicast address. We skip this for 14042 * underlying IPMP interfaces since they should be invisible. 14043 */ 14044 if (!IS_UNDER_IPMP(ill)) { 14045 ilm = ip_addmulti(&v6allmc, ill, ipif->ipif_zoneid, 14046 &err); 14047 if (ilm == NULL) { 14048 ASSERT(err != 0); 14049 ip0dbg(("ipif_multicast_up: " 14050 "all_hosts_mcast failed %d\n", err)); 14051 return; 14052 } 14053 ipif->ipif_allhosts_ilm = ilm; 14054 } 14055 14056 /* 14057 * Enable multicast for the solicited node multicast address. 14058 * If IPMP we need to put the membership on the upper ill. 
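 * The earlier OR of the local address's low-order 32 bits into
 * v6solmc forms the solicited-node group. For example (illustrative
 * address only), a local address of 2001:db8::abcd:1234 joins
 * ff02::1:ffcd:1234.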
14059 */ 14060 if (!(ipif->ipif_flags & IPIF_NOLOCAL)) { 14061 ill_t *mcast_ill = NULL; 14062 boolean_t need_refrele; 14063 14064 if (IS_UNDER_IPMP(ill) && 14065 (mcast_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) { 14066 need_refrele = B_TRUE; 14067 } else { 14068 mcast_ill = ill; 14069 need_refrele = B_FALSE; 14070 } 14071 14072 ilm = ip_addmulti(&v6solmc, mcast_ill, 14073 ipif->ipif_zoneid, &err); 14074 if (need_refrele) 14075 ill_refrele(mcast_ill); 14076 14077 if (ilm == NULL) { 14078 ASSERT(err != 0); 14079 ip0dbg(("ipif_multicast_up: solicited MC" 14080 " failed %d\n", err)); 14081 if ((ilm = ipif->ipif_allhosts_ilm) != NULL) { 14082 ipif->ipif_allhosts_ilm = NULL; 14083 (void) ip_delmulti(ilm); 14084 } 14085 return; 14086 } 14087 ipif->ipif_solmulti_ilm = ilm; 14088 } 14089 } else { 14090 in6_addr_t v6group; 14091 14092 if (ipif->ipif_lcl_addr == INADDR_ANY || IS_UNDER_IPMP(ill)) 14093 return; 14094 14095 /* Join the all hosts multicast address */ 14096 ip1dbg(("ipif_multicast_up - addmulti\n")); 14097 IN6_IPADDR_TO_V4MAPPED(htonl(INADDR_ALLHOSTS_GROUP), &v6group); 14098 14099 ilm = ip_addmulti(&v6group, ill, ipif->ipif_zoneid, &err); 14100 if (ilm == NULL) { 14101 ASSERT(err != 0); 14102 ip0dbg(("ipif_multicast_up: failed %d\n", err)); 14103 return; 14104 } 14105 ipif->ipif_allhosts_ilm = ilm; 14106 } 14107 } 14108 14109 /* 14110 * Blow away any multicast groups that we joined in ipif_multicast_up(). 14111 * (ilms from explicit memberships are handled in conn_update_ill.) 14112 */ 14113 void 14114 ipif_multicast_down(ipif_t *ipif) 14115 { 14116 ASSERT(IAM_WRITER_IPIF(ipif)); 14117 14118 ip1dbg(("ipif_multicast_down\n")); 14119 14120 if (ipif->ipif_allhosts_ilm != NULL) { 14121 (void) ip_delmulti(ipif->ipif_allhosts_ilm); 14122 ipif->ipif_allhosts_ilm = NULL; 14123 } 14124 if (ipif->ipif_solmulti_ilm != NULL) { 14125 (void) ip_delmulti(ipif->ipif_solmulti_ilm); 14126 ipif->ipif_solmulti_ilm = NULL; 14127 } 14128 } 14129 14130 /* 14131 * Used when an interface comes up to recreate any extra routes on this 14132 * interface. 14133 */ 14134 int 14135 ill_recover_saved_ire(ill_t *ill) 14136 { 14137 mblk_t *mp; 14138 ip_stack_t *ipst = ill->ill_ipst; 14139 14140 ip1dbg(("ill_recover_saved_ire(%s)", ill->ill_name)); 14141 14142 mutex_enter(&ill->ill_saved_ire_lock); 14143 for (mp = ill->ill_saved_ire_mp; mp != NULL; mp = mp->b_cont) { 14144 ire_t *ire, *nire; 14145 ifrt_t *ifrt; 14146 14147 ifrt = (ifrt_t *)mp->b_rptr; 14148 /* 14149 * Create a copy of the IRE with the saved address and netmask. 
14150 */
14151 if (ill->ill_isv6) {
14152 ire = ire_create_v6(
14153 &ifrt->ifrt_v6addr,
14154 &ifrt->ifrt_v6mask,
14155 &ifrt->ifrt_v6gateway_addr,
14156 ifrt->ifrt_type,
14157 ill,
14158 ifrt->ifrt_zoneid,
14159 ifrt->ifrt_flags,
14160 NULL,
14161 ipst);
14162 } else {
14163 ire = ire_create(
14164 (uint8_t *)&ifrt->ifrt_addr,
14165 (uint8_t *)&ifrt->ifrt_mask,
14166 (uint8_t *)&ifrt->ifrt_gateway_addr,
14167 ifrt->ifrt_type,
14168 ill,
14169 ifrt->ifrt_zoneid,
14170 ifrt->ifrt_flags,
14171 NULL,
14172 ipst);
14173 }
14174 if (ire == NULL) {
14175 mutex_exit(&ill->ill_saved_ire_lock);
14176 return (ENOMEM);
14177 }
14178
14179 if (ifrt->ifrt_flags & RTF_SETSRC) {
14180 if (ill->ill_isv6) {
14181 ire->ire_setsrc_addr_v6 =
14182 ifrt->ifrt_v6setsrc_addr;
14183 } else {
14184 ire->ire_setsrc_addr = ifrt->ifrt_setsrc_addr;
14185 }
14186 }
14187
14188 /*
14189 * Some software (for example, GateD and Sun Cluster) attempts
14190 * to create (what amount to) IRE_PREFIX routes with the
14191 * loopback address as the gateway. This is primarily done to
14192 * set up prefixes with the RTF_REJECT flag set (for example,
14193 * when generating aggregate routes.)
14194 *
14195 * If the IRE type (as defined by ill->ill_net_type) is
14196 * IRE_LOOPBACK, then we map the request into an
14197 * IRE_IF_NORESOLVER.
14198 */
14199 if (ill->ill_net_type == IRE_LOOPBACK)
14200 ire->ire_type = IRE_IF_NORESOLVER;
14201
14202 /*
14203 * The ire is held by ire_add and will be refrele'd toward
14204 * the end of ipif_up_done.
14205 */
14206 nire = ire_add(ire);
14207 /*
14208 * Check if it was a duplicate entry. This handles
14209 * the case of two racing route adds for the same route.
14210 */
14211 if (nire == NULL) {
14212 ip1dbg(("ill_recover_saved_ire: FAILED\n"));
14213 } else if (nire != ire) {
14214 ip1dbg(("ill_recover_saved_ire: duplicate ire %p\n",
14215 (void *)nire));
14216 ire_delete(nire);
14217 } else {
14218 ip1dbg(("ill_recover_saved_ire: added ire %p\n",
14219 (void *)nire));
14220 }
14221 if (nire != NULL)
14222 ire_refrele(nire);
14223 }
14224 mutex_exit(&ill->ill_saved_ire_lock);
14225 return (0);
14226 }
14227
14228 /*
14229 * Used to set the netmask and broadcast address to default values when the
14230 * interface is brought up. (Always called as writer.)
14231 */
14232 static void
14233 ipif_set_default(ipif_t *ipif)
14234 {
14235 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
14236
14237 if (!ipif->ipif_isv6) {
14238 /*
14239 * Interface holds an IPv4 address. Default
14240 * mask is the natural netmask.
14241 */
14242 if (!ipif->ipif_net_mask) {
14243 ipaddr_t v4mask;
14244
14245 v4mask = ip_net_mask(ipif->ipif_lcl_addr);
14246 V4MASK_TO_V6(v4mask, ipif->ipif_v6net_mask);
14247 }
14248 if (ipif->ipif_flags & IPIF_POINTOPOINT) {
14249 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */
14250 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr;
14251 } else {
14252 V6_MASK_COPY(ipif->ipif_v6lcl_addr,
14253 ipif->ipif_v6net_mask, ipif->ipif_v6subnet);
14254 }
14255 /*
14256 * NOTE: SunOS 4.X does this even if the broadcast address
14257 * has already been set; thus we do the same here.
14258 */
14259 if (ipif->ipif_flags & IPIF_BROADCAST) {
14260 ipaddr_t v4addr;
14261
14262 v4addr = ipif->ipif_subnet | ~ipif->ipif_net_mask;
14263 IN6_IPADDR_TO_V4MAPPED(v4addr, &ipif->ipif_v6brd_addr);
14264 }
14265 } else {
14266 /*
14267 * Interface holds an IPv6-only address. Default
14268 * mask is all-ones.
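 *
 * (To illustrate the IPv4 branch above with an example address:
 * a local address of 192.168.1.5 with no netmask configured gets
 * the natural mask 255.255.255.0, making ipif_subnet 192.168.1.0;
 * with IPIF_BROADCAST set, the default broadcast address becomes
 * 192.168.1.0 | ~255.255.255.0, i.e. 192.168.1.255.)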
14269 */ 14270 if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6net_mask)) 14271 ipif->ipif_v6net_mask = ipv6_all_ones; 14272 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 14273 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 14274 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 14275 } else { 14276 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 14277 ipif->ipif_v6net_mask, ipif->ipif_v6subnet); 14278 } 14279 } 14280 } 14281 14282 /* 14283 * Return 0 if this address can be used as local address without causing 14284 * duplicate address problems. Otherwise, return EADDRNOTAVAIL if the address 14285 * is already up on a different ill, and EADDRINUSE if it's up on the same ill. 14286 * Note that the same IPv6 link-local address is allowed as long as the ills 14287 * are not on the same link. 14288 */ 14289 int 14290 ip_addr_availability_check(ipif_t *new_ipif) 14291 { 14292 in6_addr_t our_v6addr; 14293 ill_t *ill; 14294 ipif_t *ipif; 14295 ill_walk_context_t ctx; 14296 ip_stack_t *ipst = new_ipif->ipif_ill->ill_ipst; 14297 14298 ASSERT(IAM_WRITER_IPIF(new_ipif)); 14299 ASSERT(MUTEX_HELD(&ipst->ips_ip_addr_avail_lock)); 14300 ASSERT(RW_READ_HELD(&ipst->ips_ill_g_lock)); 14301 14302 new_ipif->ipif_flags &= ~IPIF_UNNUMBERED; 14303 if (IN6_IS_ADDR_UNSPECIFIED(&new_ipif->ipif_v6lcl_addr) || 14304 IN6_IS_ADDR_V4MAPPED_ANY(&new_ipif->ipif_v6lcl_addr)) 14305 return (0); 14306 14307 our_v6addr = new_ipif->ipif_v6lcl_addr; 14308 14309 if (new_ipif->ipif_isv6) 14310 ill = ILL_START_WALK_V6(&ctx, ipst); 14311 else 14312 ill = ILL_START_WALK_V4(&ctx, ipst); 14313 14314 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 14315 for (ipif = ill->ill_ipif; ipif != NULL; 14316 ipif = ipif->ipif_next) { 14317 if ((ipif == new_ipif) || 14318 !(ipif->ipif_flags & IPIF_UP) || 14319 (ipif->ipif_flags & IPIF_UNNUMBERED) || 14320 !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, 14321 &our_v6addr)) 14322 continue; 14323 14324 if (new_ipif->ipif_flags & IPIF_POINTOPOINT) 14325 new_ipif->ipif_flags |= IPIF_UNNUMBERED; 14326 else if (ipif->ipif_flags & IPIF_POINTOPOINT) 14327 ipif->ipif_flags |= IPIF_UNNUMBERED; 14328 else if ((IN6_IS_ADDR_LINKLOCAL(&our_v6addr) || 14329 IN6_IS_ADDR_SITELOCAL(&our_v6addr)) && 14330 !IS_ON_SAME_LAN(ill, new_ipif->ipif_ill)) 14331 continue; 14332 else if (new_ipif->ipif_zoneid != ipif->ipif_zoneid && 14333 ipif->ipif_zoneid != ALL_ZONES && IS_LOOPBACK(ill)) 14334 continue; 14335 else if (new_ipif->ipif_ill == ill) 14336 return (EADDRINUSE); 14337 else 14338 return (EADDRNOTAVAIL); 14339 } 14340 } 14341 14342 return (0); 14343 } 14344 14345 /* 14346 * Bring up an ipif: bring up arp/ndp, bring up the DLPI stream, and add 14347 * IREs for the ipif. 14348 * When the routine returns EINPROGRESS then mp has been consumed and 14349 * the ioctl will be acked from ip_rput_dlpi. 14350 */ 14351 int 14352 ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) 14353 { 14354 ill_t *ill = ipif->ipif_ill; 14355 boolean_t isv6 = ipif->ipif_isv6; 14356 int err = 0; 14357 boolean_t success; 14358 uint_t ipif_orig_id; 14359 ip_stack_t *ipst = ill->ill_ipst; 14360 14361 ASSERT(IAM_WRITER_IPIF(ipif)); 14362 14363 ip1dbg(("ipif_up(%s:%u)\n", ill->ill_name, ipif->ipif_id)); 14364 DTRACE_PROBE3(ipif__downup, char *, "ipif_up", 14365 ill_t *, ill, ipif_t *, ipif); 14366 14367 /* Shouldn't get here if it is already up. 
*/
14368 if (ipif->ipif_flags & IPIF_UP)
14369 return (EALREADY);
14370
14371 /*
14372 * If this is a request to bring up a data address on an interface
14373 * under IPMP, then move the address to its IPMP meta-interface and
14374 * try to bring it up. One complication is that the zeroth ipif for
14375 * an ill is special, in that every ill always has one, and that code
14376 * throughout IP dereferences ill->ill_ipif without holding any locks.
14377 */
14378 if (IS_UNDER_IPMP(ill) && ipmp_ipif_is_dataaddr(ipif) &&
14379 (!ipif->ipif_isv6 || !V6_IPIF_LINKLOCAL(ipif))) {
14380 ipif_t *stubipif = NULL, *moveipif = NULL;
14381 ill_t *ipmp_ill = ipmp_illgrp_ipmp_ill(ill->ill_grp);
14382
14383 /*
14384 * The ipif being brought up should be quiesced. If it's not,
14385 * something has gone amiss and we need to bail out. (If it's
14386 * quiesced, we know it will remain so via IPIF_CONDEMNED.)
14387 */
14388 mutex_enter(&ill->ill_lock);
14389 if (!ipif_is_quiescent(ipif)) {
14390 mutex_exit(&ill->ill_lock);
14391 return (EINVAL);
14392 }
14393 mutex_exit(&ill->ill_lock);
14394
14395 /*
14396 * If we're going to need to allocate ipifs, do it prior
14397 * to starting the move (and grabbing locks).
14398 */
14399 if (ipif->ipif_id == 0) {
14400 if ((moveipif = ipif_allocate(ill, 0, IRE_LOCAL, B_TRUE,
14401 B_FALSE, &err)) == NULL) {
14402 return (err);
14403 }
14404 if ((stubipif = ipif_allocate(ill, 0, IRE_LOCAL, B_TRUE,
14405 B_FALSE, &err)) == NULL) {
14406 mi_free(moveipif);
14407 return (err);
14408 }
14409 }
14410
14411 /*
14412 * Grab or transfer the ipif to move. During the move, keep
14413 * ill_g_lock held to prevent any ill walker threads from
14414 * seeing things in an inconsistent state.
14415 */
14416 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
14417 if (ipif->ipif_id != 0) {
14418 ipif_remove(ipif);
14419 } else {
14420 ipif_transfer(ipif, moveipif, stubipif);
14421 ipif = moveipif;
14422 }
14423
14424 /*
14425 * Place the ipif on the IPMP ill. If the zeroth ipif on
14426 * the IPMP ill is a stub (0.0.0.0 down address) then we
14427 * replace that one. Otherwise, pick the next available slot.
14428 */
14429 ipif->ipif_ill = ipmp_ill;
14430 ipif_orig_id = ipif->ipif_id;
14431
14432 if (ipmp_ipif_is_stubaddr(ipmp_ill->ill_ipif)) {
14433 ipif_transfer(ipif, ipmp_ill->ill_ipif, NULL);
14434 ipif = ipmp_ill->ill_ipif;
14435 } else {
14436 ipif->ipif_id = -1;
14437 if ((err = ipif_insert(ipif, B_FALSE)) != 0) {
14438 /*
14439 * No more available ipif_id's -- put it back
14440 * on the original ill and fail the operation.
14441 * Since we're writer on the ill, we can be
14442 * sure our old slot is still available.
14443 */
14444 ipif->ipif_id = ipif_orig_id;
14445 ipif->ipif_ill = ill;
14446 if (ipif_orig_id == 0) {
14447 ipif_transfer(ipif, ill->ill_ipif,
14448 NULL);
14449 } else {
14450 VERIFY(ipif_insert(ipif, B_FALSE) == 0);
14451 }
14452 rw_exit(&ipst->ips_ill_g_lock);
14453 return (err);
14454 }
14455 }
14456 rw_exit(&ipst->ips_ill_g_lock);
14457
14458 /*
14459 * Tell SCTP that the ipif has moved. Note that even if we
14460 * had to allocate a new ipif, the original sequence id was
14461 * preserved and therefore SCTP won't know.
14462 */
14463 sctp_move_ipif(ipif, ill, ipmp_ill);
14464
14465 /*
14466 * If the ipif being brought up was on slot zero, then we
14467 * first need to bring up the placeholder we stuck there.
In
14468 * ip_rput_dlpi_writer(), arp_bringup_done(), or the recursive
14469 * call to ipif_up() itself, if we successfully bring up the
14470 * placeholder, we'll check ill_move_ipif and bring it up too.
14471 */
14472 if (ipif_orig_id == 0) {
14473 ASSERT(ill->ill_move_ipif == NULL);
14474 ill->ill_move_ipif = ipif;
14475 if ((err = ipif_up(ill->ill_ipif, q, mp)) == 0)
14476 ASSERT(ill->ill_move_ipif == NULL);
14477 if (err != EINPROGRESS)
14478 ill->ill_move_ipif = NULL;
14479 return (err);
14480 }
14481
14482 /*
14483 * Bring it up on the IPMP ill.
14484 */
14485 return (ipif_up(ipif, q, mp));
14486 }
14487
14488 /* Skip arp/ndp for any loopback interface. */
14489 if (ill->ill_wq != NULL) {
14490 conn_t *connp = CONN_Q(q) ? Q_TO_CONN(q) : NULL;
14491 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;
14492
14493 if (!ill->ill_dl_up) {
14494 /*
14495 * ill_dl_up is not yet set. i.e. we are yet to
14496 * DL_BIND with the driver and this is the first
14497 * logical interface on the ill to become "up".
14498 * Tell the driver to get going (via DL_BIND_REQ).
14499 * Note that changing "significant" IFF_ flags
14500 * (address/netmask etc.) causes a down/up dance,
14501 * but does not cause an unbind (DL_UNBIND) with
14502 * the driver.
14503 */
14504 return (ill_dl_up(ill, ipif, mp, q));
14505 }
14506
14507 /*
14508 * ipif_resolver_up may end up needing to bind/attach
14509 * the ARP stream, which in turn necessitates a
14510 * DLPI message exchange with the driver. ioctls are
14511 * serialized and so we cannot send more than one
14512 * interface up message at a time. If ipif_resolver_up
14513 * does need to wait for the DLPI handshake for the ARP stream,
14514 * we get EINPROGRESS and we will complete in arp_bringup_done.
14515 */
14516
14517 ASSERT(connp != NULL || !CONN_Q(q));
14518 if (connp != NULL)
14519 mutex_enter(&connp->conn_lock);
14520 mutex_enter(&ill->ill_lock);
14521 success = ipsq_pending_mp_add(connp, ipif, q, mp, 0);
14522 mutex_exit(&ill->ill_lock);
14523 if (connp != NULL)
14524 mutex_exit(&connp->conn_lock);
14525 if (!success)
14526 return (EINTR);
14527
14528 /*
14529 * Crank up IPv6 neighbor discovery. Unlike ARP, this should
14530 * complete when ipif_ndp_up returns.
14531 */
14532 err = ipif_resolver_up(ipif, Res_act_initial);
14533 if (err == EINPROGRESS) {
14534 /* We will complete it in arp_bringup_done() */
14535 return (err);
14536 }
14537
14538 if (isv6 && err == 0)
14539 err = ipif_ndp_up(ipif, B_TRUE);
14540
14541 ASSERT(err != EINPROGRESS);
14542 mp = ipsq_pending_mp_get(ipsq, &connp);
14543 ASSERT(mp != NULL);
14544 if (err != 0)
14545 return (err);
14546 } else {
14547 /*
14548 * Interfaces without underlying hardware don't do duplicate
14549 * address detection.
14550 */
14551 ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
14552 ipif->ipif_addr_ready = 1;
14553 err = ill_add_ires(ill);
14554 /* allocation failure? */
14555 if (err != 0)
14556 return (err);
14557 }
14558
14559 err = (isv6 ? ipif_up_done_v6(ipif) : ipif_up_done(ipif));
14560 if (err == 0 && ill->ill_move_ipif != NULL) {
14561 ipif = ill->ill_move_ipif;
14562 ill->ill_move_ipif = NULL;
14563 return (ipif_up(ipif, q, mp));
14564 }
14565 return (err);
14566 }
14567
14568 /*
14569 * Add any IREs tied to the ill. For now this is just an IRE_MULTICAST.
14570 * The identical set of IREs needs to be removed in ill_delete_ires().
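 *
 * The entry is created with a dummy address (INADDR_ALLHOSTS_GROUP
 * for IPv4) since the address itself is never matched; a consumer
 * such as ire_multicast() simply hands back this one cached entry
 * for any multicast destination on the ill.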
14570 */ 14571 int 14572 ill_add_ires(ill_t *ill) 14573 { 14574 ire_t *ire; 14575 in6_addr_t dummy6 = {(uint32_t)V6_MCAST, 0, 0, 1}; 14576 in_addr_t dummy4 = htonl(INADDR_ALLHOSTS_GROUP); 14577 14578 if (ill->ill_ire_multicast != NULL) 14579 return (0); 14580 14581 /* 14582 * provide some dummy ire_addr for creating the ire. 14583 */ 14584 if (ill->ill_isv6) { 14585 ire = ire_create_v6(&dummy6, 0, 0, IRE_MULTICAST, ill, 14586 ALL_ZONES, RTF_UP, NULL, ill->ill_ipst); 14587 } else { 14588 ire = ire_create((uchar_t *)&dummy4, 0, 0, IRE_MULTICAST, ill, 14589 ALL_ZONES, RTF_UP, NULL, ill->ill_ipst); 14590 } 14591 if (ire == NULL) 14592 return (ENOMEM); 14593 14594 ill->ill_ire_multicast = ire; 14595 return (0); 14596 } 14597 14598 void 14599 ill_delete_ires(ill_t *ill) 14600 { 14601 if (ill->ill_ire_multicast != NULL) { 14602 /* 14603 * BIND/ATTACH completed; Release the ref for ill_ire_multicast 14604 * which was taken without any th_tracing enabled. 14605 * We also mark it as condemned (note that it was never added) 14606 * so that caching conn's can move off of it. 14607 */ 14608 ire_make_condemned(ill->ill_ire_multicast); 14609 ire_refrele_notr(ill->ill_ire_multicast); 14610 ill->ill_ire_multicast = NULL; 14611 } 14612 } 14613 14614 /* 14615 * Perform a bind for the physical device. 14616 * When the routine returns EINPROGRESS then mp has been consumed and 14617 * the ioctl will be acked from ip_rput_dlpi. 14618 * Allocate an unbind message and save it until ipif_down. 14619 */ 14620 static int 14621 ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) 14622 { 14623 mblk_t *bind_mp = NULL; 14624 mblk_t *unbind_mp = NULL; 14625 conn_t *connp; 14626 boolean_t success; 14627 int err; 14628 14629 DTRACE_PROBE2(ill__downup, char *, "ill_dl_up", ill_t *, ill); 14630 14631 ip1dbg(("ill_dl_up(%s)\n", ill->ill_name)); 14632 ASSERT(IAM_WRITER_ILL(ill)); 14633 ASSERT(mp != NULL); 14634 14635 /* 14636 * Make sure we have an IRE_MULTICAST in case we immediately 14637 * start receiving packets. 14638 */ 14639 err = ill_add_ires(ill); 14640 if (err != 0) 14641 goto bad; 14642 14643 bind_mp = ip_dlpi_alloc(sizeof (dl_bind_req_t) + sizeof (long), 14644 DL_BIND_REQ); 14645 if (bind_mp == NULL) 14646 goto bad; 14647 ((dl_bind_req_t *)bind_mp->b_rptr)->dl_sap = ill->ill_sap; 14648 ((dl_bind_req_t *)bind_mp->b_rptr)->dl_service_mode = DL_CLDLS; 14649 14650 /* 14651 * ill_unbind_mp would be non-null if the following sequence had 14652 * happened: 14653 * - send DL_BIND_REQ to driver, wait for response 14654 * - multiple ioctls that need to bring the ipif up are encountered, 14655 * but they cannot enter the ipsq due to the outstanding DL_BIND_REQ. 14656 * These ioctls will then be enqueued on the ipsq 14657 * - a DL_ERROR_ACK is returned for the DL_BIND_REQ 14658 * At this point, the pending ioctls in the ipsq will be drained, and 14659 * since ill->ill_dl_up was not set, ill_dl_up would be invoked with 14660 * a non-null ill->ill_unbind_mp 14661 */ 14662 if (ill->ill_unbind_mp == NULL) { 14663 unbind_mp = ip_dlpi_alloc(sizeof (dl_unbind_req_t), 14664 DL_UNBIND_REQ); 14665 if (unbind_mp == NULL) 14666 goto bad; 14667 } 14668 /* 14669 * Record state needed to complete this operation when the 14670 * DL_BIND_ACK shows up. Also remember the pre-allocated mblks. 14671 */ 14672 connp = CONN_Q(q) ? 
Q_TO_CONN(q) : NULL; 14673 ASSERT(connp != NULL || !CONN_Q(q)); 14674 GRAB_CONN_LOCK(q); 14675 mutex_enter(&ipif->ipif_ill->ill_lock); 14676 success = ipsq_pending_mp_add(connp, ipif, q, mp, 0); 14677 mutex_exit(&ipif->ipif_ill->ill_lock); 14678 RELEASE_CONN_LOCK(q); 14679 if (!success) 14680 goto bad; 14681 14682 /* 14683 * Save the unbind message for ill_dl_down(); it will be consumed when 14684 * the interface goes down. 14685 */ 14686 if (ill->ill_unbind_mp == NULL) 14687 ill->ill_unbind_mp = unbind_mp; 14688 14689 ill_dlpi_send(ill, bind_mp); 14690 /* Send down link-layer capabilities probe if not already done. */ 14691 ill_capability_probe(ill); 14692 14693 /* 14694 * Sysid used to rely on the fact that netboots set domainname 14695 * and the like. Now that miniroot boots aren't strictly netboots 14696 * and miniroot network configuration is driven from userland 14697 * these things still need to be set. This situation can be detected 14698 * by comparing the interface being configured here to the one 14699 * dhcifname was set to reference by the boot loader. Once sysid is 14700 * converted to use dhcp_ipc_getinfo() this call can go away. 14701 */ 14702 if ((ipif->ipif_flags & IPIF_DHCPRUNNING) && 14703 (strcmp(ill->ill_name, dhcifname) == 0) && 14704 (strlen(srpc_domain) == 0)) { 14705 if (dhcpinit() != 0) 14706 cmn_err(CE_WARN, "no cached dhcp response"); 14707 } 14708 14709 /* 14710 * This operation will complete in ip_rput_dlpi with either 14711 * a DL_BIND_ACK or DL_ERROR_ACK. 14712 */ 14713 return (EINPROGRESS); 14714 bad: 14715 ip1dbg(("ill_dl_up(%s) FAILED\n", ill->ill_name)); 14716 14717 freemsg(bind_mp); 14718 freemsg(unbind_mp); 14719 return (ENOMEM); 14720 } 14721 14722 /* Add room for tcp+ip headers */ 14723 uint_t ip_loopback_mtuplus = IP_LOOPBACK_MTU + IP_SIMPLE_HDR_LENGTH + 20; 14724 14725 /* 14726 * DLPI and ARP is up. 14727 * Create all the IREs associated with an interface. Bring up multicast. 14728 * Set the interface flag and finish other initialization 14729 * that potentially had to be deferred to after DL_BIND_ACK. 14730 */ 14731 int 14732 ipif_up_done(ipif_t *ipif) 14733 { 14734 ill_t *ill = ipif->ipif_ill; 14735 int err = 0; 14736 boolean_t loopback = B_FALSE; 14737 boolean_t update_src_selection = B_TRUE; 14738 ipif_t *tmp_ipif; 14739 14740 ip1dbg(("ipif_up_done(%s:%u)\n", 14741 ipif->ipif_ill->ill_name, ipif->ipif_id)); 14742 DTRACE_PROBE3(ipif__downup, char *, "ipif_up_done", 14743 ill_t *, ill, ipif_t *, ipif); 14744 14745 /* Check if this is a loopback interface */ 14746 if (ipif->ipif_ill->ill_wq == NULL) 14747 loopback = B_TRUE; 14748 14749 ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 14750 14751 /* 14752 * If all other interfaces for this ill are down or DEPRECATED, 14753 * or otherwise unsuitable for source address selection, 14754 * reset the src generation numbers to make sure source 14755 * address selection gets to take this new ipif into account. 
14756 * No need to hold ill_lock while traversing the ipif list since 14757 * we are writer 14758 */ 14759 for (tmp_ipif = ill->ill_ipif; tmp_ipif; 14760 tmp_ipif = tmp_ipif->ipif_next) { 14761 if (((tmp_ipif->ipif_flags & 14762 (IPIF_NOXMIT|IPIF_ANYCAST|IPIF_NOLOCAL|IPIF_DEPRECATED)) || 14763 !(tmp_ipif->ipif_flags & IPIF_UP)) || 14764 (tmp_ipif == ipif)) 14765 continue; 14766 /* first useable pre-existing interface */ 14767 update_src_selection = B_FALSE; 14768 break; 14769 } 14770 if (update_src_selection) 14771 ip_update_source_selection(ill->ill_ipst); 14772 14773 if (IS_LOOPBACK(ill) || ill->ill_net_type == IRE_IF_NORESOLVER) { 14774 nce_t *loop_nce = NULL; 14775 uint16_t flags = (NCE_F_MYADDR | NCE_F_AUTHORITY | NCE_F_NONUD); 14776 14777 /* 14778 * lo0:1 and subsequent ipifs were marked IRE_LOCAL in 14779 * ipif_lookup_on_name(), but in the case of zones we can have 14780 * several loopback addresses on lo0. So all the interfaces with 14781 * loopback addresses need to be marked IRE_LOOPBACK. 14782 */ 14783 if (V4_PART_OF_V6(ipif->ipif_v6lcl_addr) == 14784 htonl(INADDR_LOOPBACK)) 14785 ipif->ipif_ire_type = IRE_LOOPBACK; 14786 else 14787 ipif->ipif_ire_type = IRE_LOCAL; 14788 if (ill->ill_net_type != IRE_LOOPBACK) 14789 flags |= NCE_F_PUBLISH; 14790 14791 /* add unicast nce for the local addr */ 14792 err = nce_lookup_then_add_v4(ill, NULL, 14793 ill->ill_phys_addr_length, &ipif->ipif_lcl_addr, flags, 14794 ND_REACHABLE, &loop_nce); 14795 /* A shared-IP zone sees EEXIST for lo0:N */ 14796 if (err == 0 || err == EEXIST) { 14797 ipif->ipif_added_nce = 1; 14798 loop_nce->nce_ipif_cnt++; 14799 nce_refrele(loop_nce); 14800 err = 0; 14801 } else { 14802 ASSERT(loop_nce == NULL); 14803 return (err); 14804 } 14805 } 14806 14807 /* Create all the IREs associated with this interface */ 14808 err = ipif_add_ires_v4(ipif, loopback); 14809 if (err != 0) { 14810 /* 14811 * see comments about return value from 14812 * ip_addr_availability_check() in ipif_add_ires_v4(). 14813 */ 14814 if (err != EADDRINUSE) { 14815 (void) ipif_arp_down(ipif); 14816 } else { 14817 /* 14818 * Make IPMP aware of the deleted ipif so that 14819 * the needed ipmp cleanup (e.g., of ipif_bound_ill) 14820 * can be completed. Note that we do not want to 14821 * destroy the nce that was created on the ipmp_ill 14822 * for the active copy of the duplicate address in 14823 * use. 14824 */ 14825 if (IS_IPMP(ill)) 14826 ipmp_illgrp_del_ipif(ill->ill_grp, ipif); 14827 err = EADDRNOTAVAIL; 14828 } 14829 return (err); 14830 } 14831 14832 if (ill->ill_ipif_up_count == 1 && !loopback) { 14833 /* Recover any additional IREs entries for this ill */ 14834 (void) ill_recover_saved_ire(ill); 14835 } 14836 14837 if (ill->ill_need_recover_multicast) { 14838 /* 14839 * Need to recover all multicast memberships in the driver. 14840 * This had to be deferred until we had attached. The same 14841 * code exists in ipif_up_done_v6() to recover IPv6 14842 * memberships. 14843 * 14844 * Note that it would be preferable to unconditionally do the 14845 * ill_recover_multicast() in ill_dl_up(), but we cannot do 14846 * that since ill_join_allmulti() depends on ill_dl_up being 14847 * set, and it is not set until we receive a DL_BIND_ACK after 14848 * having called ill_dl_up(). 14849 */ 14850 ill_recover_multicast(ill); 14851 } 14852 14853 if (ill->ill_ipif_up_count == 1) { 14854 /* 14855 * Since the interface is now up, it may now be active. 
14856 */ 14857 if (IS_UNDER_IPMP(ill)) 14858 ipmp_ill_refresh_active(ill); 14859 14860 /* 14861 * If this is an IPMP interface, we may now be able to 14862 * establish ARP entries. 14863 */ 14864 if (IS_IPMP(ill)) 14865 ipmp_illgrp_refresh_arpent(ill->ill_grp); 14866 } 14867 14868 /* Join the allhosts multicast address */ 14869 ipif_multicast_up(ipif); 14870 14871 if (!loopback && !update_src_selection && 14872 !(ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) 14873 ip_update_source_selection(ill->ill_ipst); 14874 14875 if (!loopback && ipif->ipif_addr_ready) { 14876 /* Broadcast an address mask reply. */ 14877 ipif_mask_reply(ipif); 14878 } 14879 /* Perhaps ilgs should use this ill */ 14880 update_conn_ill(NULL, ill->ill_ipst); 14881 14882 /* 14883 * This had to be deferred until we had bound. Tell routing sockets and 14884 * others that this interface is up if it looks like the address has 14885 * been validated. Otherwise, if it isn't ready yet, wait for 14886 * duplicate address detection to do its thing. 14887 */ 14888 if (ipif->ipif_addr_ready) 14889 ipif_up_notify(ipif); 14890 return (0); 14891 } 14892 14893 /* 14894 * Add the IREs associated with the ipif. 14895 * Those MUST be explicitly removed in ipif_delete_ires_v4. 14896 */ 14897 static int 14898 ipif_add_ires_v4(ipif_t *ipif, boolean_t loopback) 14899 { 14900 ill_t *ill = ipif->ipif_ill; 14901 ip_stack_t *ipst = ill->ill_ipst; 14902 ire_t *ire_array[20]; 14903 ire_t **irep = ire_array; 14904 ire_t **irep1; 14905 ipaddr_t net_mask = 0; 14906 ipaddr_t subnet_mask, route_mask; 14907 int err; 14908 ire_t *ire_local = NULL; /* LOCAL or LOOPBACK */ 14909 ire_t *ire_if = NULL; 14910 uchar_t *gw; 14911 14912 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 14913 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 14914 /* 14915 * If we're on a labeled system then make sure that zone- 14916 * private addresses have proper remote host database entries. 14917 */ 14918 if (is_system_labeled() && 14919 ipif->ipif_ire_type != IRE_LOOPBACK && 14920 !tsol_check_interface_address(ipif)) 14921 return (EINVAL); 14922 14923 /* Register the source address for __sin6_src_id */ 14924 err = ip_srcid_insert(&ipif->ipif_v6lcl_addr, 14925 ipif->ipif_zoneid, ipst); 14926 if (err != 0) { 14927 ip0dbg(("ipif_add_ires: srcid_insert %d\n", err)); 14928 return (err); 14929 } 14930 14931 if (loopback) 14932 gw = (uchar_t *)&ipif->ipif_lcl_addr; 14933 else 14934 gw = NULL; 14935 14936 /* If the interface address is set, create the local IRE. */ 14937 ire_local = ire_create( 14938 (uchar_t *)&ipif->ipif_lcl_addr, /* dest address */ 14939 (uchar_t *)&ip_g_all_ones, /* mask */ 14940 gw, /* gateway */ 14941 ipif->ipif_ire_type, /* LOCAL or LOOPBACK */ 14942 ipif->ipif_ill, 14943 ipif->ipif_zoneid, 14944 ((ipif->ipif_flags & IPIF_PRIVATE) ? 
14945 RTF_PRIVATE : 0) | RTF_KERNEL, 14946 NULL, 14947 ipst); 14948 ip1dbg(("ipif_add_ires: 0x%p creating IRE %p type 0x%x" 14949 " for 0x%x\n", (void *)ipif, (void *)ire_local, 14950 ipif->ipif_ire_type, 14951 ntohl(ipif->ipif_lcl_addr))); 14952 if (ire_local == NULL) { 14953 ip1dbg(("ipif_up_done: NULL ire_local\n")); 14954 err = ENOMEM; 14955 goto bad; 14956 } 14957 } else { 14958 ip1dbg(( 14959 "ipif_add_ires: not creating IRE %d for 0x%x: flags 0x%x\n", 14960 ipif->ipif_ire_type, 14961 ntohl(ipif->ipif_lcl_addr), 14962 (uint_t)ipif->ipif_flags)); 14963 } 14964 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 14965 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 14966 net_mask = ip_net_mask(ipif->ipif_lcl_addr); 14967 } else { 14968 net_mask = htonl(IN_CLASSA_NET); /* fallback */ 14969 } 14970 14971 subnet_mask = ipif->ipif_net_mask; 14972 14973 /* 14974 * If mask was not specified, use natural netmask of 14975 * interface address. Also, store this mask back into the 14976 * ipif struct. 14977 */ 14978 if (subnet_mask == 0) { 14979 subnet_mask = net_mask; 14980 V4MASK_TO_V6(subnet_mask, ipif->ipif_v6net_mask); 14981 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 14982 ipif->ipif_v6subnet); 14983 } 14984 14985 /* Set up the IRE_IF_RESOLVER or IRE_IF_NORESOLVER, as appropriate. */ 14986 if (!loopback && !(ipif->ipif_flags & IPIF_NOXMIT) && 14987 ipif->ipif_subnet != INADDR_ANY) { 14988 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 14989 14990 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 14991 route_mask = IP_HOST_MASK; 14992 } else { 14993 route_mask = subnet_mask; 14994 } 14995 14996 ip1dbg(("ipif_add_ires: ipif 0x%p ill 0x%p " 14997 "creating if IRE ill_net_type 0x%x for 0x%x\n", 14998 (void *)ipif, (void *)ill, ill->ill_net_type, 14999 ntohl(ipif->ipif_subnet))); 15000 ire_if = ire_create( 15001 (uchar_t *)&ipif->ipif_subnet, 15002 (uchar_t *)&route_mask, 15003 (uchar_t *)&ipif->ipif_lcl_addr, 15004 ill->ill_net_type, 15005 ill, 15006 ipif->ipif_zoneid, 15007 ((ipif->ipif_flags & IPIF_PRIVATE) ? 15008 RTF_PRIVATE: 0) | RTF_KERNEL, 15009 NULL, 15010 ipst); 15011 if (ire_if == NULL) { 15012 ip1dbg(("ipif_up_done: NULL ire_if\n")); 15013 err = ENOMEM; 15014 goto bad; 15015 } 15016 } 15017 15018 /* 15019 * Create any necessary broadcast IREs. 15020 */ 15021 if ((ipif->ipif_flags & IPIF_BROADCAST) && 15022 !(ipif->ipif_flags & IPIF_NOXMIT)) 15023 irep = ipif_create_bcast_ires(ipif, irep); 15024 15025 /* If an earlier ire_create failed, get out now */ 15026 for (irep1 = irep; irep1 > ire_array; ) { 15027 irep1--; 15028 if (*irep1 == NULL) { 15029 ip1dbg(("ipif_up_done: NULL ire found in ire_array\n")); 15030 err = ENOMEM; 15031 goto bad; 15032 } 15033 } 15034 15035 /* 15036 * Need to atomically check for IP address availability under 15037 * ip_addr_avail_lock. ill_g_lock is held as reader to ensure no new 15038 * ills or new ipifs can be added while we are checking availability. 15039 */ 15040 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 15041 mutex_enter(&ipst->ips_ip_addr_avail_lock); 15042 /* Mark it up, and increment counters. */ 15043 ipif->ipif_flags |= IPIF_UP; 15044 ill->ill_ipif_up_count++; 15045 err = ip_addr_availability_check(ipif); 15046 mutex_exit(&ipst->ips_ip_addr_avail_lock); 15047 rw_exit(&ipst->ips_ill_g_lock); 15048 15049 if (err != 0) { 15050 /* 15051 * Our address may already be up on the same ill. In this case, 15052 * the ARP entry for our ipif replaced the one for the other 15053 * ipif. 
So we don't want to delete it (otherwise the other ipif 15054 * would be unable to send packets). 15055 * ip_addr_availability_check() identifies this case for us and 15056 * returns EADDRINUSE; Caller should turn it into EADDRNOTAVAIL 15057 * which is the expected error code. 15058 */ 15059 ill->ill_ipif_up_count--; 15060 ipif->ipif_flags &= ~IPIF_UP; 15061 goto bad; 15062 } 15063 15064 /* 15065 * Add in all newly created IREs. ire_create_bcast() has 15066 * already checked for duplicates of the IRE_BROADCAST type. 15067 * We add the IRE_INTERFACE before the IRE_LOCAL to ensure 15068 * that lookups find the IRE_LOCAL even if the IRE_INTERFACE is 15069 * a /32 route. 15070 */ 15071 if (ire_if != NULL) { 15072 ire_if = ire_add(ire_if); 15073 if (ire_if == NULL) { 15074 err = ENOMEM; 15075 goto bad2; 15076 } 15077 #ifdef DEBUG 15078 ire_refhold_notr(ire_if); 15079 ire_refrele(ire_if); 15080 #endif 15081 } 15082 if (ire_local != NULL) { 15083 ire_local = ire_add(ire_local); 15084 if (ire_local == NULL) { 15085 err = ENOMEM; 15086 goto bad2; 15087 } 15088 #ifdef DEBUG 15089 ire_refhold_notr(ire_local); 15090 ire_refrele(ire_local); 15091 #endif 15092 } 15093 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 15094 if (ire_local != NULL) 15095 ipif->ipif_ire_local = ire_local; 15096 if (ire_if != NULL) 15097 ipif->ipif_ire_if = ire_if; 15098 rw_exit(&ipst->ips_ill_g_lock); 15099 ire_local = NULL; 15100 ire_if = NULL; 15101 15102 /* 15103 * We first add all of them, and if that succeeds we refrele the 15104 * bunch. That enables us to delete all of them should any of the 15105 * ire_adds fail. 15106 */ 15107 for (irep1 = irep; irep1 > ire_array; ) { 15108 irep1--; 15109 ASSERT(!MUTEX_HELD(&((*irep1)->ire_ill->ill_lock))); 15110 *irep1 = ire_add(*irep1); 15111 if (*irep1 == NULL) { 15112 err = ENOMEM; 15113 goto bad2; 15114 } 15115 } 15116 15117 for (irep1 = irep; irep1 > ire_array; ) { 15118 irep1--; 15119 /* refheld by ire_add. */ 15120 if (*irep1 != NULL) { 15121 ire_refrele(*irep1); 15122 *irep1 = NULL; 15123 } 15124 } 15125 15126 if (!loopback) { 15127 /* 15128 * If the broadcast address has been set, make sure it makes 15129 * sense based on the interface address. 15130 * Only match on ill since we are sharing broadcast addresses. 15131 */ 15132 if ((ipif->ipif_brd_addr != INADDR_ANY) && 15133 (ipif->ipif_flags & IPIF_BROADCAST)) { 15134 ire_t *ire; 15135 15136 ire = ire_ftable_lookup_v4(ipif->ipif_brd_addr, 0, 0, 15137 IRE_BROADCAST, ipif->ipif_ill, ALL_ZONES, NULL, 15138 (MATCH_IRE_TYPE | MATCH_IRE_ILL), 0, ipst, NULL); 15139 15140 if (ire == NULL) { 15141 /* 15142 * If there isn't a matching broadcast IRE, 15143 * revert to the default for this netmask. 
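 * (Illustrative example, hypothetical addresses: if the ipif is
 * 192.168.1.10/24 but a stale broadcast address such as
 * 10.255.255.255 was configured, no IRE_BROADCAST matches it, so the
 * code below zeroes it and lets ipif_set_default() reinstate the
 * default for this netmask, 192.168.1.255.)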
15144 */
15145 ipif->ipif_v6brd_addr = ipv6_all_zeros;
15146 mutex_enter(&ipif->ipif_ill->ill_lock);
15147 ipif_set_default(ipif);
15148 mutex_exit(&ipif->ipif_ill->ill_lock);
15149 } else {
15150 ire_refrele(ire);
15151 }
15152 }
15153
15154 }
15155 return (0);
15156
15157 bad2:
15158 ill->ill_ipif_up_count--;
15159 ipif->ipif_flags &= ~IPIF_UP;
15160
15161 bad:
15162 ip1dbg(("ipif_add_ires: FAILED \n"));
15163 if (ire_local != NULL)
15164 ire_delete(ire_local);
15165 if (ire_if != NULL)
15166 ire_delete(ire_if);
15167
15168 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
15169 ire_local = ipif->ipif_ire_local;
15170 ipif->ipif_ire_local = NULL;
15171 ire_if = ipif->ipif_ire_if;
15172 ipif->ipif_ire_if = NULL;
15173 rw_exit(&ipst->ips_ill_g_lock);
15174 if (ire_local != NULL) {
15175 ire_delete(ire_local);
15176 ire_refrele_notr(ire_local);
15177 }
15178 if (ire_if != NULL) {
15179 ire_delete(ire_if);
15180 ire_refrele_notr(ire_if);
15181 }
15182
15183 while (irep > ire_array) {
15184 irep--;
15185 if (*irep != NULL) {
15186 ire_delete(*irep);
15187 }
15188 }
15189 (void) ip_srcid_remove(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid, ipst);
15190
15191 return (err);
15192 }
15193
15194 /* Remove all the IREs created by ipif_add_ires_v4 */
15195 void
15196 ipif_delete_ires_v4(ipif_t *ipif)
15197 {
15198 ill_t *ill = ipif->ipif_ill;
15199 ip_stack_t *ipst = ill->ill_ipst;
15200 ire_t *ire;
15201
15202 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
15203 ire = ipif->ipif_ire_local;
15204 ipif->ipif_ire_local = NULL;
15205 rw_exit(&ipst->ips_ill_g_lock);
15206 if (ire != NULL) {
15207 /*
15208 * Move count to ipif so we don't lose the count due to
15209 * a down/up dance.
15210 */
15211 atomic_add_32(&ipif->ipif_ib_pkt_count, ire->ire_ib_pkt_count);
15212
15213 ire_delete(ire);
15214 ire_refrele_notr(ire);
15215 }
15216 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
15217 ire = ipif->ipif_ire_if;
15218 ipif->ipif_ire_if = NULL;
15219 rw_exit(&ipst->ips_ill_g_lock);
15220 if (ire != NULL) {
15221 ire_delete(ire);
15222 ire_refrele_notr(ire);
15223 }
15224
15225 /*
15226 * Delete the broadcast IREs.
15227 */
15228 if ((ipif->ipif_flags & IPIF_BROADCAST) &&
15229 !(ipif->ipif_flags & IPIF_NOXMIT))
15230 ipif_delete_bcast_ires(ipif);
15231 }
15232
15233 /*
15234 * Checks for availability of a usable source address (if there is one) when
15235 * the destination ILL has the ill_usesrc_ifindex pointing to another ILL. Note
15236 * this selection is done regardless of the destination.
15237 */
15238 boolean_t
15239 ipif_zone_avail(uint_t ifindex, boolean_t isv6, zoneid_t zoneid,
15240 ip_stack_t *ipst)
15241 {
15242 ipif_t *ipif = NULL;
15243 ill_t *uill;
15244
15245 ASSERT(ifindex != 0);
15246
15247 uill = ill_lookup_on_ifindex(ifindex, isv6, ipst);
15248 if (uill == NULL)
15249 return (B_FALSE);
15250
15251 mutex_enter(&uill->ill_lock);
15252 for (ipif = uill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
15253 if (IPIF_IS_CONDEMNED(ipif))
15254 continue;
15255 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST))
15256 continue;
15257 if (!(ipif->ipif_flags & IPIF_UP))
15258 continue;
15259 if (ipif->ipif_zoneid != zoneid)
15260 continue;
15261 if (isv6 ?
IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) : 15262 ipif->ipif_lcl_addr == INADDR_ANY) 15263 continue; 15264 mutex_exit(&uill->ill_lock); 15265 ill_refrele(uill); 15266 return (B_TRUE); 15267 } 15268 mutex_exit(&uill->ill_lock); 15269 ill_refrele(uill); 15270 return (B_FALSE); 15271 } 15272 15273 /* 15274 * Find an ipif with a good local address on the ill+zoneid. 15275 */ 15276 ipif_t * 15277 ipif_good_addr(ill_t *ill, zoneid_t zoneid) 15278 { 15279 ipif_t *ipif; 15280 15281 mutex_enter(&ill->ill_lock); 15282 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 15283 if (IPIF_IS_CONDEMNED(ipif)) 15284 continue; 15285 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) 15286 continue; 15287 if (!(ipif->ipif_flags & IPIF_UP)) 15288 continue; 15289 if (ipif->ipif_zoneid != zoneid && 15290 ipif->ipif_zoneid != ALL_ZONES && zoneid != ALL_ZONES) 15291 continue; 15292 if (ill->ill_isv6 ? 15293 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) : 15294 ipif->ipif_lcl_addr == INADDR_ANY) 15295 continue; 15296 ipif_refhold_locked(ipif); 15297 mutex_exit(&ill->ill_lock); 15298 return (ipif); 15299 } 15300 mutex_exit(&ill->ill_lock); 15301 return (NULL); 15302 } 15303 15304 /* 15305 * IP source address type, sorted from worst to best. For a given type, 15306 * always prefer IP addresses on the same subnet. All-zones addresses are 15307 * suboptimal because they pose problems with unlabeled destinations. 15308 */ 15309 typedef enum { 15310 IPIF_NONE, 15311 IPIF_DIFFNET_DEPRECATED, /* deprecated and different subnet */ 15312 IPIF_SAMENET_DEPRECATED, /* deprecated and same subnet */ 15313 IPIF_DIFFNET_ALLZONES, /* allzones and different subnet */ 15314 IPIF_SAMENET_ALLZONES, /* allzones and same subnet */ 15315 IPIF_DIFFNET, /* normal and different subnet */ 15316 IPIF_SAMENET, /* normal and same subnet */ 15317 IPIF_LOCALADDR /* local loopback */ 15318 } ipif_type_t; 15319 15320 /* 15321 * Pick the optimal ipif on `ill' for sending to destination `dst' from zone 15322 * `zoneid'. We rate usable ipifs from low -> high as per the ipif_type_t 15323 * enumeration, and return the highest-rated ipif. If there's a tie, we pick 15324 * the first one, unless IPMP is used in which case we round-robin among them; 15325 * see below for more. 15326 * 15327 * Returns NULL if there is no suitable source address for the ill. 15328 * This only occurs when there is no valid source address for the ill. 15329 */ 15330 ipif_t * 15331 ipif_select_source_v4(ill_t *ill, ipaddr_t dst, zoneid_t zoneid, 15332 boolean_t allow_usesrc, boolean_t *notreadyp) 15333 { 15334 ill_t *usill = NULL; 15335 ill_t *ipmp_ill = NULL; 15336 ipif_t *start_ipif, *next_ipif, *ipif, *best_ipif; 15337 ipif_type_t type, best_type; 15338 tsol_tpc_t *src_rhtp, *dst_rhtp; 15339 ip_stack_t *ipst = ill->ill_ipst; 15340 boolean_t samenet; 15341 15342 if (ill->ill_usesrc_ifindex != 0 && allow_usesrc) { 15343 usill = ill_lookup_on_ifindex(ill->ill_usesrc_ifindex, 15344 B_FALSE, ipst); 15345 if (usill != NULL) 15346 ill = usill; /* Select source from usesrc ILL */ 15347 else 15348 return (NULL); 15349 } 15350 15351 /* 15352 * Test addresses should never be used for source address selection, 15353 * so if we were passed one, switch to the IPMP meta-interface. 
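 * (Sketch of the flow: test addresses live on the underlying ills
 * while data addresses live on the IPMP meta-interface, so we hold
 * the group's ipmp_ill below and select among its addresses instead.)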
15354 */ 15355 if (IS_UNDER_IPMP(ill)) { 15356 if ((ipmp_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) 15357 ill = ipmp_ill; /* Select source from IPMP ill */ 15358 else 15359 return (NULL); 15360 } 15361 15362 /* 15363 * If we're dealing with an unlabeled destination on a labeled system, 15364 * make sure that we ignore source addresses that are incompatible with 15365 * the destination's default label. That destination's default label 15366 * must dominate the minimum label on the source address. 15367 */ 15368 dst_rhtp = NULL; 15369 if (is_system_labeled()) { 15370 dst_rhtp = find_tpc(&dst, IPV4_VERSION, B_FALSE); 15371 if (dst_rhtp == NULL) 15372 return (NULL); 15373 if (dst_rhtp->tpc_tp.host_type != UNLABELED) { 15374 TPC_RELE(dst_rhtp); 15375 dst_rhtp = NULL; 15376 } 15377 } 15378 15379 /* 15380 * Hold the ill_g_lock as reader. This makes sure that no ipif/ill 15381 * can be deleted. But an ipif/ill can get CONDEMNED any time. 15382 * After selecting the right ipif, under ill_lock make sure ipif is 15383 * not condemned, and increment refcnt. If ipif is CONDEMNED, 15384 * we retry. Inside the loop we still need to check for CONDEMNED, 15385 * but not under a lock. 15386 */ 15387 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 15388 retry: 15389 /* 15390 * For source address selection, we treat the ipif list as circular 15391 * and continue until we get back to where we started. This allows 15392 * IPMP to vary source address selection (which improves inbound load 15393 * spreading) by caching its last ending point and starting from 15394 * there. NOTE: we don't have to worry about ill_src_ipif changing 15395 * ills since that can't happen on the IPMP ill. 15396 */ 15397 start_ipif = ill->ill_ipif; 15398 if (IS_IPMP(ill) && ill->ill_src_ipif != NULL) 15399 start_ipif = ill->ill_src_ipif; 15400 15401 ipif = start_ipif; 15402 best_ipif = NULL; 15403 best_type = IPIF_NONE; 15404 do { 15405 if ((next_ipif = ipif->ipif_next) == NULL) 15406 next_ipif = ill->ill_ipif; 15407 15408 if (IPIF_IS_CONDEMNED(ipif)) 15409 continue; 15410 /* Always skip NOLOCAL and ANYCAST interfaces */ 15411 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) 15412 continue; 15413 /* Always skip NOACCEPT interfaces */ 15414 if (ipif->ipif_ill->ill_flags & ILLF_NOACCEPT) 15415 continue; 15416 if (!(ipif->ipif_flags & IPIF_UP)) 15417 continue; 15418 15419 if (!ipif->ipif_addr_ready) { 15420 if (notreadyp != NULL) 15421 *notreadyp = B_TRUE; 15422 continue; 15423 } 15424 15425 if (zoneid != ALL_ZONES && 15426 ipif->ipif_zoneid != zoneid && 15427 ipif->ipif_zoneid != ALL_ZONES) 15428 continue; 15429 15430 /* 15431 * Interfaces with 0.0.0.0 address are allowed to be UP, but 15432 * are not valid as source addresses. 15433 */ 15434 if (ipif->ipif_lcl_addr == INADDR_ANY) 15435 continue; 15436 15437 /* 15438 * Check compatibility of local address for destination's 15439 * default label if we're on a labeled system. Incompatible 15440 * addresses can't be used at all. 
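 * (As implemented below: the source is usable only if its template is
 * SUN_CIPSO with the same DOI as the destination's, and the
 * destination's default label falls within the source's CIPSO label
 * range or label set.)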
15441 */
15442 if (dst_rhtp != NULL) {
15443 boolean_t incompat;
15444
15445 src_rhtp = find_tpc(&ipif->ipif_lcl_addr,
15446 IPV4_VERSION, B_FALSE);
15447 if (src_rhtp == NULL)
15448 continue;
15449 incompat = src_rhtp->tpc_tp.host_type != SUN_CIPSO ||
15450 src_rhtp->tpc_tp.tp_doi !=
15451 dst_rhtp->tpc_tp.tp_doi ||
15452 (!_blinrange(&dst_rhtp->tpc_tp.tp_def_label,
15453 &src_rhtp->tpc_tp.tp_sl_range_cipso) &&
15454 !blinlset(&dst_rhtp->tpc_tp.tp_def_label,
15455 src_rhtp->tpc_tp.tp_sl_set_cipso));
15456 TPC_RELE(src_rhtp);
15457 if (incompat)
15458 continue;
15459 }
15460
15461 samenet = ((ipif->ipif_net_mask & dst) == ipif->ipif_subnet);
15462
15463 if (ipif->ipif_lcl_addr == dst) {
15464 type = IPIF_LOCALADDR;
15465 } else if (ipif->ipif_flags & IPIF_DEPRECATED) {
15466 type = samenet ? IPIF_SAMENET_DEPRECATED :
15467 IPIF_DIFFNET_DEPRECATED;
15468 } else if (ipif->ipif_zoneid == ALL_ZONES) {
15469 type = samenet ? IPIF_SAMENET_ALLZONES :
15470 IPIF_DIFFNET_ALLZONES;
15471 } else {
15472 type = samenet ? IPIF_SAMENET : IPIF_DIFFNET;
15473 }
15474
15475 if (type > best_type) {
15476 best_type = type;
15477 best_ipif = ipif;
15478 if (best_type == IPIF_LOCALADDR)
15479 break; /* can't get better */
15480 }
15481 } while ((ipif = next_ipif) != start_ipif);
15482
15483 if ((ipif = best_ipif) != NULL) {
15484 mutex_enter(&ipif->ipif_ill->ill_lock);
15485 if (IPIF_IS_CONDEMNED(ipif)) {
15486 mutex_exit(&ipif->ipif_ill->ill_lock);
15487 goto retry;
15488 }
15489 ipif_refhold_locked(ipif);
15490
15491 /*
15492 * For IPMP, update the source ipif rotor to the next ipif,
15493 * provided we can look it up. (We must not use it if it's
15494 * IPIF_CONDEMNED since we may have grabbed ill_g_lock after
15495 * ipif_free() checked ill_src_ipif.)
15496 */
15497 if (IS_IPMP(ill) && ipif != NULL) {
15498 next_ipif = ipif->ipif_next;
15499 if (next_ipif != NULL && !IPIF_IS_CONDEMNED(next_ipif))
15500 ill->ill_src_ipif = next_ipif;
15501 else
15502 ill->ill_src_ipif = NULL;
15503 }
15504 mutex_exit(&ipif->ipif_ill->ill_lock);
15505 }
15506
15507 rw_exit(&ipst->ips_ill_g_lock);
15508 if (usill != NULL)
15509 ill_refrele(usill);
15510 if (ipmp_ill != NULL)
15511 ill_refrele(ipmp_ill);
15512 if (dst_rhtp != NULL)
15513 TPC_RELE(dst_rhtp);
15514
15515 #ifdef DEBUG
15516 if (ipif == NULL) {
15517 char buf1[INET6_ADDRSTRLEN];
15518
15519 ip1dbg(("ipif_select_source_v4(%s, %s) -> NULL\n",
15520 ill->ill_name,
15521 inet_ntop(AF_INET, &dst, buf1, sizeof (buf1))));
15522 } else {
15523 char buf1[INET6_ADDRSTRLEN];
15524 char buf2[INET6_ADDRSTRLEN];
15525
15526 ip1dbg(("ipif_select_source_v4(%s, %s) -> %s\n",
15527 ipif->ipif_ill->ill_name,
15528 inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)),
15529 inet_ntop(AF_INET, &ipif->ipif_lcl_addr,
15530 buf2, sizeof (buf2))));
15531 }
15532 #endif /* DEBUG */
15533 return (ipif);
15534 }
15535
15536 /*
15537 * Pick a source address based on the destination ill and an optional setsrc
15538 * address.
15539 * The result is stored in srcp. If generation is set, then put the source
15540 * generation number there before we look for the source address (to avoid
15541 * missing changes in the set of source addresses).
15542 * If flagsp is set, then use it to pass back ipif_flags.
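 *
 * A minimal usage sketch (hypothetical caller, error handling elided):
 *
 *	ipaddr_t src;
 *	uint32_t gen;
 *
 *	if (ip_select_source_v4(ill, INADDR_ANY, dst, INADDR_ANY,
 *	    zoneid, ipst, &src, &gen, NULL) == 0)
 *		use src, revalidating once gen != ips_src_generation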
15543 *
15544 * If the caller wants to cache the returned source address and detect when
15545 * that might be stale, the caller should pass in a generation argument,
15546 * which the caller can later compare against ips_src_generation.
15547 *
15548 * The precedence order for selecting an IPv4 source address is:
15549 * - RTF_SETSRC on the offlink ire always wins.
15550 * - If usesrc is set, swap the ill to be the usesrc one.
15551 * - If IPMP is used on the ill, select a random address from the most
15552 * preferred ones below:
15553 * 1. If onlink destination, same subnet and not deprecated, not ALL_ZONES
15554 * 2. Not deprecated, not ALL_ZONES
15555 * 3. If onlink destination, same subnet and not deprecated, ALL_ZONES
15556 * 4. Not deprecated, ALL_ZONES
15557 * 5. If onlink destination, same subnet and deprecated
15558 * 6. Deprecated.
15559 *
15560 * We have lower preference for ALL_ZONES IP addresses,
15561 * as they pose problems with unlabeled destinations.
15562 *
15563 * Note that when multiple IP addresses match, e.g., #1, we pick
15564 * the first one if IPMP is not in use. With IPMP we randomize.
15565 */
15566 int
15567 ip_select_source_v4(ill_t *ill, ipaddr_t setsrc, ipaddr_t dst,
15568 ipaddr_t multicast_ifaddr,
15569 zoneid_t zoneid, ip_stack_t *ipst, ipaddr_t *srcp,
15570 uint32_t *generation, uint64_t *flagsp)
15571 {
15572 ipif_t *ipif;
15573 boolean_t notready = B_FALSE; /* Set if !ipif_addr_ready found */
15574
15575 if (flagsp != NULL)
15576 *flagsp = 0;
15577
15578 /*
15579 * Need to grab the generation number before we check to
15580 * avoid a race with a change to the set of local addresses.
15581 * No lock needed since the thread which updates the set of local
15582 * addresses uses ipif/ill locks and exits those (hence a store memory
15583 * barrier) before doing the atomic increase of ips_src_generation.
15584 */
15585 if (generation != NULL) {
15586 *generation = ipst->ips_src_generation;
15587 }
15588
15589 if (CLASSD(dst) && multicast_ifaddr != INADDR_ANY) {
15590 *srcp = multicast_ifaddr;
15591 return (0);
15592 }
15593
15594 /* Was RTF_SETSRC set on the first IRE in the recursive lookup? */
15595 if (setsrc != INADDR_ANY) {
15596 *srcp = setsrc;
15597 return (0);
15598 }
15599 ipif = ipif_select_source_v4(ill, dst, zoneid, B_TRUE, &notready);
15600 if (ipif == NULL) {
15601 if (notready)
15602 return (ENETDOWN);
15603 else
15604 return (EADDRNOTAVAIL);
15605 }
15606 *srcp = ipif->ipif_lcl_addr;
15607 if (flagsp != NULL)
15608 *flagsp = ipif->ipif_flags;
15609 ipif_refrele(ipif);
15610 return (0);
15611 }
15612
15613 /* ARGSUSED */
15614 int
15615 if_unitsel_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
15616 ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
15617 {
15618 /*
15619 * ill_phyint_reinit merged the v4 and v6 into a single
15620 * ipsq. We might not have been able to complete the
15621 * operation in ipif_set_values, if we could not become
15622 * exclusive. If so restart it here.
15623 */
15624 return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q));
15625 }
15626
15627 /*
15628 * Can operate on either a module or a driver queue.
15629 * Returns an error if not a module queue.
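 * (A "module queue" means IP has been pushed as a STREAMS module over
 * the device stream, so q->q_next is non-NULL; the q_next == NULL
 * driver-queue case is rejected below with EINVAL.)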
15630 */
15631 /* ARGSUSED */
15632 int
15633 if_unitsel(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
15634 ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
15635 {
15636 queue_t *q1 = q;
15637 char *cp;
15638 char interf_name[LIFNAMSIZ];
15639 uint_t ppa = *(uint_t *)mp->b_cont->b_cont->b_rptr;
15640
15641 if (q->q_next == NULL) {
15642 ip1dbg((
15643 "if_unitsel: IF_UNITSEL: no q_next\n"));
15644 return (EINVAL);
15645 }
15646
15647 if (((ill_t *)(q->q_ptr))->ill_name[0] != '\0')
15648 return (EALREADY);
15649
15650 do {
15651 q1 = q1->q_next;
15652 } while (q1->q_next);
15653 cp = q1->q_qinfo->qi_minfo->mi_idname;
15654 (void) sprintf(interf_name, "%s%d", cp, ppa);
15655
15656 /*
15657 * Here we are not going to delay the ioack until after
15658 * ACKs from DL_ATTACH_REQ/DL_BIND_REQ. So no need to save the
15659 * original ioctl message before sending the requests.
15660 */
15661 return (ipif_set_values(q, mp, interf_name, &ppa));
15662 }
15663
15664 /* ARGSUSED */
15665 int
15666 ip_sioctl_sifname(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
15667 ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
15668 {
15669 return (ENXIO);
15670 }
15671
15672 /*
15673 * Create any IRE_BROADCAST entries for `ipif', and store those entries in
15674 * `irep'. Returns a pointer to the next free `irep' entry.
15675 * A mirror exists in ipif_delete_bcast_ires().
15676 *
15677 * The management of any "extra" or seemingly duplicate IRE_BROADCASTs is
15678 * done in ire_add.
15679 */
15680 static ire_t **
15681 ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep)
15682 {
15683 ipaddr_t addr;
15684 ipaddr_t netmask = ip_net_mask(ipif->ipif_lcl_addr);
15685 ipaddr_t subnetmask = ipif->ipif_net_mask;
15686 ill_t *ill = ipif->ipif_ill;
15687 zoneid_t zoneid = ipif->ipif_zoneid;
15688
15689 ip1dbg(("ipif_create_bcast_ires: creating broadcast IREs\n"));
15690
15691 ASSERT(ipif->ipif_flags & IPIF_BROADCAST);
15692 ASSERT(!(ipif->ipif_flags & IPIF_NOXMIT));
15693
15694 if (ipif->ipif_lcl_addr == INADDR_ANY ||
15695 (ipif->ipif_flags & IPIF_NOLOCAL))
15696 netmask = htonl(IN_CLASSA_NET); /* fallback */
15697
15698 irep = ire_create_bcast(ill, 0, zoneid, irep);
15699 irep = ire_create_bcast(ill, INADDR_BROADCAST, zoneid, irep);
15700
15701 /*
15702 * For backward compatibility, we create net broadcast IREs based on
15703 * the old "IP address class system", since some old machines only
15704 * respond to these class-derived net broadcasts. However, we must not
15705 * create these net broadcast IREs if the subnetmask is shorter than
15706 * the IP address class based derived netmask. Otherwise, we may
15707 * create a net broadcast address which is the same as an IP address
15708 * on the subnet -- and then TCP will refuse to talk to that address.
15709 */
15710 if (netmask < subnetmask) {
15711 addr = netmask & ipif->ipif_subnet;
15712 irep = ire_create_bcast(ill, addr, zoneid, irep);
15713 irep = ire_create_bcast(ill, ~netmask | addr, zoneid, irep);
15714 }
15715
15716 /*
15717 * Don't create IRE_BROADCAST IREs for the interface if the subnetmask
15718 * is 0xFFFFFFFF, as an IRE_LOCAL for that interface is already
15719 * created. Creating these broadcast IREs will only create confusion
15720 * as `addr' will be the same as the IP address.
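 *
 * (Worked example, hypothetical addresses: for 192.168.1.10/24 this
 * function creates IRE_BROADCASTs for 0.0.0.0, 255.255.255.255,
 * 192.168.1.0 and 192.168.1.255; for a subnetted class A address such
 * as 10.1.2.3/24 it additionally creates the class-derived pair
 * 10.0.0.0 and 10.255.255.255, since netmask < subnetmask.)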
15721 */
15722 if (subnetmask != 0xFFFFFFFF) {
15723 addr = ipif->ipif_subnet;
15724 irep = ire_create_bcast(ill, addr, zoneid, irep);
15725 irep = ire_create_bcast(ill, ~subnetmask | addr, zoneid, irep);
15726 }
15727
15728 return (irep);
15729 }
15730
15731 /*
15732 * Mirror of ipif_create_bcast_ires()
15733 */
15734 static void
15735 ipif_delete_bcast_ires(ipif_t *ipif)
15736 {
15737 ipaddr_t addr;
15738 ipaddr_t netmask = ip_net_mask(ipif->ipif_lcl_addr);
15739 ipaddr_t subnetmask = ipif->ipif_net_mask;
15740 ill_t *ill = ipif->ipif_ill;
15741 zoneid_t zoneid = ipif->ipif_zoneid;
15742 ire_t *ire;
15743
15744 ASSERT(ipif->ipif_flags & IPIF_BROADCAST);
15745 ASSERT(!(ipif->ipif_flags & IPIF_NOXMIT));
15746
15747 if (ipif->ipif_lcl_addr == INADDR_ANY ||
15748 (ipif->ipif_flags & IPIF_NOLOCAL))
15749 netmask = htonl(IN_CLASSA_NET); /* fallback */
15750
15751 ire = ire_lookup_bcast(ill, 0, zoneid);
15752 ASSERT(ire != NULL);
15753 ire_delete(ire); ire_refrele(ire);
15754 ire = ire_lookup_bcast(ill, INADDR_BROADCAST, zoneid);
15755 ASSERT(ire != NULL);
15756 ire_delete(ire); ire_refrele(ire);
15757
15758 /*
15759 * For backward compatibility, we create net broadcast IREs based on
15760 * the old "IP address class system", since some old machines only
15761 * respond to these class-derived net broadcasts. However, we must not
15762 * create these net broadcast IREs if the subnetmask is shorter than
15763 * the IP address class based derived netmask. Otherwise, we may
15764 * create a net broadcast address which is the same as an IP address
15765 * on the subnet -- and then TCP will refuse to talk to that address.
15766 */
15767 if (netmask < subnetmask) {
15768 addr = netmask & ipif->ipif_subnet;
15769 ire = ire_lookup_bcast(ill, addr, zoneid);
15770 ASSERT(ire != NULL);
15771 ire_delete(ire); ire_refrele(ire);
15772 ire = ire_lookup_bcast(ill, ~netmask | addr, zoneid);
15773 ASSERT(ire != NULL);
15774 ire_delete(ire); ire_refrele(ire);
15775 }
15776
15777 /*
15778 * Don't create IRE_BROADCAST IREs for the interface if the subnetmask
15779 * is 0xFFFFFFFF, as an IRE_LOCAL for that interface is already
15780 * created. Creating these broadcast IREs will only create confusion
15781 * as `addr' will be the same as the IP address.
15782 */
15783 if (subnetmask != 0xFFFFFFFF) {
15784 addr = ipif->ipif_subnet;
15785 ire = ire_lookup_bcast(ill, addr, zoneid);
15786 ASSERT(ire != NULL);
15787 ire_delete(ire); ire_refrele(ire);
15788 ire = ire_lookup_bcast(ill, ~subnetmask | addr, zoneid);
15789 ASSERT(ire != NULL);
15790 ire_delete(ire); ire_refrele(ire);
15791 }
15792 }
15793
15794 /*
15795 * Extract both the flags (including IFF_CANTCHANGE) such as IFF_IPV*
15796 * from lifr_flags and the name from lifr_name.
15797 * Set IFF_IPV* and ill_isv6 prior to doing the lookup
15798 * since ipif_lookup_on_name uses the _isv6 flags when matching.
15799 * Returns EINPROGRESS when mp has been consumed by queueing it on
15800 * ipx_pending_mp and the ioctl will complete in ip_rput.
15801 *
15802 * Can operate on either a module or a driver queue.
15803 * Returns an error if not a module queue.
15804 */
15805 /* ARGSUSED */
15806 int
15807 ip_sioctl_slifname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
15808 ip_ioctl_cmd_t *ipip, void *if_req)
15809 {
15810 ill_t *ill = q->q_ptr;
15811 phyint_t *phyi;
15812 ip_stack_t *ipst;
15813 struct lifreq *lifr = if_req;
15814 uint64_t new_flags;
15815
15816 ASSERT(ipif != NULL);
15817 ip1dbg(("ip_sioctl_slifname %s\n", lifr->lifr_name));
15818
15819 if (q->q_next == NULL) {
15820 ip1dbg(("if_sioctl_slifname: SIOCSLIFNAME: no q_next\n"));
15821 return (EINVAL);
15822 }
15823
15824 /*
15825 * If we are not writer on 'q' then this interface exists already
15826 * and previous lookups (ip_extract_lifreq()) found this ipif --
15827 * so return EALREADY.
15828 */
15829 if (ill != ipif->ipif_ill)
15830 return (EALREADY);
15831
15832 if (ill->ill_name[0] != '\0')
15833 return (EALREADY);
15834
15835 /*
15836 * If there's another ill already with the requested name, ensure
15837 * that it's of the same type. Otherwise, ill_phyint_reinit() will
15838 * fuse together two unrelated ills, which will cause chaos.
15839 */
15840 ipst = ill->ill_ipst;
15841 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
15842 lifr->lifr_name, NULL);
15843 if (phyi != NULL) {
15844 ill_t *ill_mate = phyi->phyint_illv4;
15845
15846 if (ill_mate == NULL)
15847 ill_mate = phyi->phyint_illv6;
15848 ASSERT(ill_mate != NULL);
15849
15850 if (ill_mate->ill_media->ip_m_mac_type !=
15851 ill->ill_media->ip_m_mac_type) {
15852 ip1dbg(("if_sioctl_slifname: SIOCSLIFNAME: attempt to "
15853 "use the same ill name on differing media\n"));
15854 return (EINVAL);
15855 }
15856 }
15857
15858 /*
15859 * We start off as IFF_IPV4 in ipif_allocate and become
15860 * IFF_IPV4 or IFF_IPV6 here depending on the lifr_flags value.
15861 * The only flags that we read from user space are IFF_IPV4,
15862 * IFF_IPV6, and IFF_BROADCAST.
15863 *
15864 * This ill has not been inserted into the global list.
15865 * So we are still single threaded and don't need any lock.
15866 *
15867 * Sanity check the flags.
15868 */
15869
15870 if ((lifr->lifr_flags & IFF_BROADCAST) &&
15871 ((lifr->lifr_flags & IFF_IPV6) ||
15872 (!ill->ill_needs_attach && ill->ill_bcast_addr_length == 0))) {
15873 ip1dbg(("ip_sioctl_slifname: link not broadcast capable "
15874 "or IPv6 i.e., no broadcast \n"));
15875 return (EINVAL);
15876 }
15877
15878 new_flags =
15879 lifr->lifr_flags & (IFF_IPV6|IFF_IPV4|IFF_BROADCAST);
15880
15881 if ((new_flags ^ (IFF_IPV6|IFF_IPV4)) == 0) {
15882 ip1dbg(("ip_sioctl_slifname: flags must be exactly one of "
15883 "IFF_IPV4 or IFF_IPV6\n"));
15884 return (EINVAL);
15885 }
15886
15887 /*
15888 * We always start off as IPv4, so only need to check for IPv6.
15889 */
15890 if ((new_flags & IFF_IPV6) != 0) {
15891 ill->ill_flags |= ILLF_IPV6;
15892 ill->ill_flags &= ~ILLF_IPV4;
15893
15894 if (lifr->lifr_flags & IFF_NOLINKLOCAL)
15895 ill->ill_flags |= ILLF_NOLINKLOCAL;
15896 }
15897
15898 if ((new_flags & IFF_BROADCAST) != 0)
15899 ipif->ipif_flags |= IPIF_BROADCAST;
15900 else
15901 ipif->ipif_flags &= ~IPIF_BROADCAST;
15902
15903 /* We started off as V4.
*/ 15904 if (ill->ill_flags & ILLF_IPV6) { 15905 ill->ill_phyint->phyint_illv6 = ill; 15906 ill->ill_phyint->phyint_illv4 = NULL; 15907 } 15908 15909 return (ipif_set_values(q, mp, lifr->lifr_name, &lifr->lifr_ppa)); 15910 } 15911 15912 /* ARGSUSED */ 15913 int 15914 ip_sioctl_slifname_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15915 ip_ioctl_cmd_t *ipip, void *if_req) 15916 { 15917 /* 15918 * ill_phyint_reinit merged the v4 and v6 into a single 15919 * ipsq. We might not have been able to complete the 15920 * slifname in ipif_set_values, if we could not become 15921 * exclusive. If so restart it here 15922 */ 15923 return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q)); 15924 } 15925 15926 /* 15927 * Return a pointer to the ipif which matches the index, IP version type and 15928 * zoneid. 15929 */ 15930 ipif_t * 15931 ipif_lookup_on_ifindex(uint_t index, boolean_t isv6, zoneid_t zoneid, 15932 ip_stack_t *ipst) 15933 { 15934 ill_t *ill; 15935 ipif_t *ipif = NULL; 15936 15937 ill = ill_lookup_on_ifindex(index, isv6, ipst); 15938 if (ill != NULL) { 15939 mutex_enter(&ill->ill_lock); 15940 for (ipif = ill->ill_ipif; ipif != NULL; 15941 ipif = ipif->ipif_next) { 15942 if (!IPIF_IS_CONDEMNED(ipif) && (zoneid == ALL_ZONES || 15943 zoneid == ipif->ipif_zoneid || 15944 ipif->ipif_zoneid == ALL_ZONES)) { 15945 ipif_refhold_locked(ipif); 15946 break; 15947 } 15948 } 15949 mutex_exit(&ill->ill_lock); 15950 ill_refrele(ill); 15951 } 15952 return (ipif); 15953 } 15954 15955 /* 15956 * Change an existing physical interface's index. If the new index 15957 * is acceptable we update the index and the phyint_list_avl_by_index tree. 15958 * Finally, we update other systems which may have a dependence on the 15959 * index value. 15960 */ 15961 /* ARGSUSED */ 15962 int 15963 ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15964 ip_ioctl_cmd_t *ipip, void *ifreq) 15965 { 15966 ill_t *ill; 15967 phyint_t *phyi; 15968 struct ifreq *ifr = (struct ifreq *)ifreq; 15969 struct lifreq *lifr = (struct lifreq *)ifreq; 15970 uint_t old_index, index; 15971 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 15972 avl_index_t where; 15973 15974 if (ipip->ipi_cmd_type == IF_CMD) 15975 index = ifr->ifr_index; 15976 else 15977 index = lifr->lifr_index; 15978 15979 /* 15980 * Only allow on physical interface. Also, index zero is illegal. 15981 */ 15982 ill = ipif->ipif_ill; 15983 phyi = ill->ill_phyint; 15984 if (ipif->ipif_id != 0 || index == 0 || index > IF_INDEX_MAX) { 15985 return (EINVAL); 15986 } 15987 15988 /* If the index is not changing, no work to do */ 15989 if (phyi->phyint_ifindex == index) 15990 return (0); 15991 15992 /* 15993 * Use phyint_exists() to determine if the new interface index 15994 * is already in use. If the index is unused then we need to 15995 * change the phyint's position in the phyint_list_avl_by_index 15996 * tree. If we do not do this, subsequent lookups (using the new 15997 * index value) will not find the phyint. 15998 */ 15999 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 16000 if (phyint_exists(index, ipst)) { 16001 rw_exit(&ipst->ips_ill_g_lock); 16002 return (EEXIST); 16003 } 16004 16005 /* 16006 * The new index is unused. Set it in the phyint. However we must not 16007 * forget to trigger NE_IFINDEX_CHANGE event before the ifindex 16008 * changes. The event must be bound to old ifindex value. 
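 * (Hence the dispatch below runs while phyint_ifindex still holds the
 * old value; the new index is carried as the event payload.)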
16009 */ 16010 ill_nic_event_dispatch(ill, 0, NE_IFINDEX_CHANGE, 16011 &index, sizeof (index)); 16012 16013 old_index = phyi->phyint_ifindex; 16014 phyi->phyint_ifindex = index; 16015 16016 avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, phyi); 16017 (void) avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 16018 &index, &where); 16019 avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 16020 phyi, where); 16021 rw_exit(&ipst->ips_ill_g_lock); 16022 16023 /* Update SCTP's ILL list */ 16024 sctp_ill_reindex(ill, old_index); 16025 16026 /* Send the routing sockets message */ 16027 ip_rts_ifmsg(ipif, RTSQ_DEFAULT); 16028 if (ILL_OTHER(ill)) 16029 ip_rts_ifmsg(ILL_OTHER(ill)->ill_ipif, RTSQ_DEFAULT); 16030 16031 /* Perhaps ilgs should use this ill */ 16032 update_conn_ill(NULL, ill->ill_ipst); 16033 return (0); 16034 } 16035 16036 /* ARGSUSED */ 16037 int 16038 ip_sioctl_get_lifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 16039 ip_ioctl_cmd_t *ipip, void *ifreq) 16040 { 16041 struct ifreq *ifr = (struct ifreq *)ifreq; 16042 struct lifreq *lifr = (struct lifreq *)ifreq; 16043 16044 ip1dbg(("ip_sioctl_get_lifindex(%s:%u %p)\n", 16045 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 16046 /* Get the interface index */ 16047 if (ipip->ipi_cmd_type == IF_CMD) { 16048 ifr->ifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex; 16049 } else { 16050 lifr->lifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex; 16051 } 16052 return (0); 16053 } 16054 16055 /* ARGSUSED */ 16056 int 16057 ip_sioctl_get_lifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 16058 ip_ioctl_cmd_t *ipip, void *ifreq) 16059 { 16060 struct lifreq *lifr = (struct lifreq *)ifreq; 16061 16062 ip1dbg(("ip_sioctl_get_lifzone(%s:%u %p)\n", 16063 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 16064 /* Get the interface zone */ 16065 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 16066 lifr->lifr_zoneid = ipif->ipif_zoneid; 16067 return (0); 16068 } 16069 16070 /* 16071 * Set the zoneid of an interface. 16072 */ 16073 /* ARGSUSED */ 16074 int 16075 ip_sioctl_slifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 16076 ip_ioctl_cmd_t *ipip, void *ifreq) 16077 { 16078 struct lifreq *lifr = (struct lifreq *)ifreq; 16079 int err = 0; 16080 boolean_t need_up = B_FALSE; 16081 zone_t *zptr; 16082 zone_status_t status; 16083 zoneid_t zoneid; 16084 16085 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 16086 if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES) { 16087 if (!is_system_labeled()) 16088 return (ENOTSUP); 16089 zoneid = GLOBAL_ZONEID; 16090 } 16091 16092 /* cannot assign instance zero to a non-global zone */ 16093 if (ipif->ipif_id == 0 && zoneid != GLOBAL_ZONEID) 16094 return (ENOTSUP); 16095 16096 /* 16097 * Cannot assign to a zone that doesn't exist or is shutting down. In 16098 * the event of a race with the zone shutdown processing, since IP 16099 * serializes this ioctl and SIOCGLIFCONF/SIOCLIFREMOVEIF, we know the 16100 * interface will be cleaned up even if the zone is shut down 16101 * immediately after the status check. If the interface can't be brought 16102 * down right away, and the zone is shut down before the restart 16103 * function is called, we resolve the possible races by rechecking the 16104 * zone status in the restart function. 
16105 */ 16106 if ((zptr = zone_find_by_id(zoneid)) == NULL) 16107 return (EINVAL); 16108 status = zone_status_get(zptr); 16109 zone_rele(zptr); 16110 16111 if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING) 16112 return (EINVAL); 16113 16114 if (ipif->ipif_flags & IPIF_UP) { 16115 /* 16116 * If the interface is already marked up, 16117 * we call ipif_down which will take care 16118 * of ditching any IREs that have been set 16119 * up based on the old interface address. 16120 */ 16121 err = ipif_logical_down(ipif, q, mp); 16122 if (err == EINPROGRESS) 16123 return (err); 16124 (void) ipif_down_tail(ipif); 16125 need_up = B_TRUE; 16126 } 16127 16128 err = ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp, need_up); 16129 return (err); 16130 } 16131 16132 static int 16133 ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid, 16134 queue_t *q, mblk_t *mp, boolean_t need_up) 16135 { 16136 int err = 0; 16137 ip_stack_t *ipst; 16138 16139 ip1dbg(("ip_sioctl_zoneid_tail(%s:%u %p)\n", 16140 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 16141 16142 if (CONN_Q(q)) 16143 ipst = CONNQ_TO_IPST(q); 16144 else 16145 ipst = ILLQ_TO_IPST(q); 16146 16147 /* 16148 * For exclusive stacks we don't allow a different zoneid than 16149 * global. 16150 */ 16151 if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID && 16152 zoneid != GLOBAL_ZONEID) 16153 return (EINVAL); 16154 16155 /* Set the new zone id. */ 16156 ipif->ipif_zoneid = zoneid; 16157 16158 /* Update sctp list */ 16159 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 16160 16161 /* The default multicast interface might have changed */ 16162 ire_increment_multicast_generation(ipst, ipif->ipif_ill->ill_isv6); 16163 16164 if (need_up) { 16165 /* 16166 * Now bring the interface back up. If this 16167 * is the only IPIF for the ILL, ipif_up 16168 * will have to re-bind to the device, so 16169 * we may get back EINPROGRESS, in which 16170 * case, this IOCTL will get completed in 16171 * ip_rput_dlpi when we see the DL_BIND_ACK. 16172 */ 16173 err = ipif_up(ipif, q, mp); 16174 } 16175 return (err); 16176 } 16177 16178 /* ARGSUSED */ 16179 int 16180 ip_sioctl_slifzone_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 16181 ip_ioctl_cmd_t *ipip, void *if_req) 16182 { 16183 struct lifreq *lifr = (struct lifreq *)if_req; 16184 zoneid_t zoneid; 16185 zone_t *zptr; 16186 zone_status_t status; 16187 16188 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 16189 if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES) 16190 zoneid = GLOBAL_ZONEID; 16191 16192 ip1dbg(("ip_sioctl_slifzone_restart(%s:%u %p)\n", 16193 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 16194 16195 /* 16196 * We recheck the zone status to resolve the following race condition: 16197 * 1) process sends SIOCSLIFZONE to put hme0:1 in zone "myzone"; 16198 * 2) hme0:1 is up and can't be brought down right away; 16199 * ip_sioctl_slifzone() returns EINPROGRESS and the request is queued; 16200 * 3) zone "myzone" is halted; the zone status switches to 16201 * 'shutting_down' and the zones framework sends SIOCGLIFCONF to list 16202 * the interfaces to remove - hme0:1 is not returned because it's not 16203 * yet in "myzone", so it won't be removed; 16204 * 4) the restart function for SIOCSLIFZONE is called; without the 16205 * status check here, we would have hme0:1 in "myzone" after it's been 16206 * destroyed. 
16207 * Note that if the status check fails, we need to bring the interface
16208 * back to its state prior to ip_sioctl_slifzone(), hence the call to
16209 * ipif_up_done[_v6]().
16210 */
16211 status = ZONE_IS_UNINITIALIZED;
16212 if ((zptr = zone_find_by_id(zoneid)) != NULL) {
16213 status = zone_status_get(zptr);
16214 zone_rele(zptr);
16215 }
16216 if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING) {
16217 if (ipif->ipif_isv6) {
16218 (void) ipif_up_done_v6(ipif);
16219 } else {
16220 (void) ipif_up_done(ipif);
16221 }
16222 return (EINVAL);
16223 }
16224
16225 (void) ipif_down_tail(ipif);
16226
16227 return (ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp,
16228 B_TRUE));
16229 }
16230
16231 /*
16232 * Return the number of addresses on `ill' with one or more of the values
16233 * in `set' set and all of the values in `clear' clear.
16234 */
16235 static uint_t
16236 ill_flagaddr_cnt(const ill_t *ill, uint64_t set, uint64_t clear)
16237 {
16238 ipif_t *ipif;
16239 uint_t cnt = 0;
16240
16241 ASSERT(IAM_WRITER_ILL(ill));
16242
16243 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
16244 if ((ipif->ipif_flags & set) && !(ipif->ipif_flags & clear))
16245 cnt++;
16246
16247 return (cnt);
16248 }
16249
16250 /*
16251 * Return the number of migratable addresses on `ill' that are under
16252 * application control.
16253 */
16254 uint_t
16255 ill_appaddr_cnt(const ill_t *ill)
16256 {
16257 return (ill_flagaddr_cnt(ill, IPIF_DHCPRUNNING | IPIF_ADDRCONF,
16258 IPIF_NOFAILOVER));
16259 }
16260
16261 /*
16262 * Return the number of point-to-point addresses on `ill'.
16263 */
16264 uint_t
16265 ill_ptpaddr_cnt(const ill_t *ill)
16266 {
16267 return (ill_flagaddr_cnt(ill, IPIF_POINTOPOINT, 0));
16268 }
16269
16270 /* ARGSUSED */
16271 int
16272 ip_sioctl_get_lifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
16273 ip_ioctl_cmd_t *ipip, void *ifreq)
16274 {
16275 struct lifreq *lifr = ifreq;
16276
16277 ASSERT(q->q_next == NULL);
16278 ASSERT(CONN_Q(q));
16279
16280 ip1dbg(("ip_sioctl_get_lifusesrc(%s:%u %p)\n",
16281 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
16282 lifr->lifr_index = ipif->ipif_ill->ill_usesrc_ifindex;
16283 ip1dbg(("ip_sioctl_get_lifusesrc:lifr_index = %d\n", lifr->lifr_index));
16284
16285 return (0);
16286 }
16287
16288 /* Find the previous ILL in this usesrc group */
16289 static ill_t *
16290 ill_prev_usesrc(ill_t *uill)
16291 {
16292 ill_t *ill;
16293
16294 for (ill = uill->ill_usesrc_grp_next;
16295 ASSERT(ill), ill->ill_usesrc_grp_next != uill;
16296 ill = ill->ill_usesrc_grp_next)
16297 /* do nothing */;
16298 return (ill);
16299 }
16300
16301 /*
16302 * Release all members of the usesrc group. This routine is called
16303 * from ill_delete when the interface being unplumbed is the
16304 * group head.
16305 *
16306 * This silently clears the usesrc that ifconfig set up.
16307 * An alternative would be to keep that ifindex, and drop packets on the floor
16308 * since no source address can be selected.
16309 * Even if we keep the current semantics, we don't need a lock and a linked
16310 * list. We can walk all the ills checking if they have an ill_usesrc_ifindex
16311 * matching the one that is being removed. The issue is how we return the
16312 * usesrc users (SIOCGLIFSRCOF). We want to be able to find the ills which
16313 * have an ill_usesrc_ifindex matching a target ill. We could also do that
16314 * with an ill walk, but the walker would need to insert in the ioctl response.
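 *
 * (Structure note: the usesrc group is a circular singly-linked list
 * through ill_usesrc_grp_next. The head is the usesrc ill itself,
 * with ill_usesrc_ifindex == 0; each client has ill_usesrc_ifindex
 * set to the head's ifindex. The loop below unlinks clients until it
 * wraps back around to the head.)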
16315 */
16316 static void
16317 ill_disband_usesrc_group(ill_t *uill)
16318 {
16319 ill_t *next_ill, *tmp_ill;
16320 ip_stack_t *ipst = uill->ill_ipst;
16321
16322 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_usesrc_lock));
16323 next_ill = uill->ill_usesrc_grp_next;
16324
16325 do {
16326 ASSERT(next_ill != NULL);
16327 tmp_ill = next_ill->ill_usesrc_grp_next;
16328 ASSERT(tmp_ill != NULL);
16329 next_ill->ill_usesrc_grp_next = NULL;
16330 next_ill->ill_usesrc_ifindex = 0;
16331 next_ill = tmp_ill;
16332 } while (next_ill->ill_usesrc_ifindex != 0);
16333 uill->ill_usesrc_grp_next = NULL;
16334 }
16335
16336 /*
16337 * Remove the client usesrc ILL from the list and relink it to a new list.
16338 */
16339 int
16340 ill_relink_usesrc_ills(ill_t *ucill, ill_t *uill, uint_t ifindex)
16341 {
16342 ill_t *ill, *tmp_ill;
16343 ip_stack_t *ipst = ucill->ill_ipst;
16344
16345 ASSERT((ucill != NULL) && (ucill->ill_usesrc_grp_next != NULL) &&
16346 (uill != NULL) && RW_WRITE_HELD(&ipst->ips_ill_g_usesrc_lock));
16347
16348 /*
16349 * Check that the usesrc client ILL passed in is not already
16350 * in use as a usesrc ILL, i.e., one whose source address is
16351 * in use, and that the usesrc ILL is not already in use as a
16352 * usesrc client ILL.
16353 */
16354 if ((ucill->ill_usesrc_ifindex == 0) ||
16355 (uill->ill_usesrc_ifindex != 0)) {
16356 return (-1);
16357 }
16358
16359 ill = ill_prev_usesrc(ucill);
16360 ASSERT(ill->ill_usesrc_grp_next != NULL);
16361
16362 /* Remove from the current list */
16363 if (ill->ill_usesrc_grp_next->ill_usesrc_grp_next == ill) {
16364 /* Only two elements in the list */
16365 ASSERT(ill->ill_usesrc_ifindex == 0);
16366 ill->ill_usesrc_grp_next = NULL;
16367 } else {
16368 ill->ill_usesrc_grp_next = ucill->ill_usesrc_grp_next;
16369 }
16370
16371 if (ifindex == 0) {
16372 ucill->ill_usesrc_ifindex = 0;
16373 ucill->ill_usesrc_grp_next = NULL;
16374 return (0);
16375 }
16376
16377 ucill->ill_usesrc_ifindex = ifindex;
16378 tmp_ill = uill->ill_usesrc_grp_next;
16379 uill->ill_usesrc_grp_next = ucill;
16380 ucill->ill_usesrc_grp_next =
16381 (tmp_ill != NULL) ? tmp_ill : uill;
16382 return (0);
16383 }
16384
16385 /*
16386 * Set the ill_usesrc and ill_usesrc_head fields. See synchronization notes in
16387 * ip.c for locking details.
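 *
 * A hypothetical ifconfig(8) sequence that ends up here:
 *	ifconfig vni0 usesrc hme0	(lifr_index = hme0's ifindex)
 *	ifconfig vni0 usesrc none	(lifr_index = 0, i.e., a reset)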
16388 */ 16389 /* ARGSUSED */ 16390 int 16391 ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 16392 ip_ioctl_cmd_t *ipip, void *ifreq) 16393 { 16394 struct lifreq *lifr = (struct lifreq *)ifreq; 16395 boolean_t isv6 = B_FALSE, reset_flg = B_FALSE; 16396 ill_t *usesrc_ill, *usesrc_cli_ill = ipif->ipif_ill; 16397 int err = 0, ret; 16398 uint_t ifindex; 16399 ipsq_t *ipsq = NULL; 16400 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 16401 16402 ASSERT(IAM_WRITER_IPIF(ipif)); 16403 ASSERT(q->q_next == NULL); 16404 ASSERT(CONN_Q(q)); 16405 16406 isv6 = (Q_TO_CONN(q))->conn_family == AF_INET6; 16407 16408 ifindex = lifr->lifr_index; 16409 if (ifindex == 0) { 16410 if (usesrc_cli_ill->ill_usesrc_grp_next == NULL) { 16411 /* non usesrc group interface, nothing to reset */ 16412 return (0); 16413 } 16414 ifindex = usesrc_cli_ill->ill_usesrc_ifindex; 16415 /* valid reset request */ 16416 reset_flg = B_TRUE; 16417 } 16418 16419 usesrc_ill = ill_lookup_on_ifindex(ifindex, isv6, ipst); 16420 if (usesrc_ill == NULL) 16421 return (ENXIO); 16422 if (usesrc_ill == ipif->ipif_ill) { 16423 ill_refrele(usesrc_ill); 16424 return (EINVAL); 16425 } 16426 16427 ipsq = ipsq_try_enter(NULL, usesrc_ill, q, mp, ip_process_ioctl, 16428 NEW_OP, B_TRUE); 16429 if (ipsq == NULL) { 16430 err = EINPROGRESS; 16431 /* Operation enqueued on the ipsq of the usesrc ILL */ 16432 goto done; 16433 } 16434 16435 /* USESRC isn't currently supported with IPMP */ 16436 if (IS_IPMP(usesrc_ill) || IS_UNDER_IPMP(usesrc_ill)) { 16437 err = ENOTSUP; 16438 goto done; 16439 } 16440 16441 /* 16442 * USESRC isn't compatible with the STANDBY flag. (STANDBY is only 16443 * used by IPMP underlying interfaces, but someone might think it's 16444 * more general and try to use it independently with VNI.) 16445 */ 16446 if (usesrc_ill->ill_phyint->phyint_flags & PHYI_STANDBY) { 16447 err = ENOTSUP; 16448 goto done; 16449 } 16450 16451 /* 16452 * If the client is already in use as a usesrc_ill or a usesrc_ill is 16453 * already a client then return EINVAL 16454 */ 16455 if (IS_USESRC_ILL(usesrc_cli_ill) || IS_USESRC_CLI_ILL(usesrc_ill)) { 16456 err = EINVAL; 16457 goto done; 16458 } 16459 16460 /* 16461 * If the ill_usesrc_ifindex field is already set to what it needs to 16462 * be then this is a duplicate operation. 16463 */ 16464 if (!reset_flg && usesrc_cli_ill->ill_usesrc_ifindex == ifindex) { 16465 err = 0; 16466 goto done; 16467 } 16468 16469 ip1dbg(("ip_sioctl_slifusesrc: usesrc_cli_ill %s, usesrc_ill %s," 16470 " v6 = %d", usesrc_cli_ill->ill_name, usesrc_ill->ill_name, 16471 usesrc_ill->ill_isv6)); 16472 16473 /* 16474 * ill_g_usesrc_lock global lock protects the ill_usesrc_grp_next 16475 * and the ill_usesrc_ifindex fields 16476 */ 16477 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER); 16478 16479 if (reset_flg) { 16480 ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill, 0); 16481 if (ret != 0) { 16482 err = EINVAL; 16483 } 16484 rw_exit(&ipst->ips_ill_g_usesrc_lock); 16485 goto done; 16486 } 16487 16488 /* 16489 * Four possibilities to consider: 16490 * 1. Both usesrc_ill and usesrc_cli_ill are not part of any usesrc grp 16491 * 2. usesrc_ill is part of a group but usesrc_cli_ill isn't 16492 * 3. usesrc_cli_ill is part of a group but usesrc_ill isn't 16493 * 4. 
Both are part of their respective usesrc groups 16494 */ 16495 if ((usesrc_ill->ill_usesrc_grp_next == NULL) && 16496 (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) { 16497 ASSERT(usesrc_ill->ill_usesrc_ifindex == 0); 16498 usesrc_cli_ill->ill_usesrc_ifindex = ifindex; 16499 usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill; 16500 usesrc_cli_ill->ill_usesrc_grp_next = usesrc_ill; 16501 } else if ((usesrc_ill->ill_usesrc_grp_next != NULL) && 16502 (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) { 16503 usesrc_cli_ill->ill_usesrc_ifindex = ifindex; 16504 /* Insert at head of list */ 16505 usesrc_cli_ill->ill_usesrc_grp_next = 16506 usesrc_ill->ill_usesrc_grp_next; 16507 usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill; 16508 } else { 16509 ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill, 16510 ifindex); 16511 if (ret != 0) 16512 err = EINVAL; 16513 } 16514 rw_exit(&ipst->ips_ill_g_usesrc_lock); 16515 16516 done: 16517 if (ipsq != NULL) 16518 ipsq_exit(ipsq); 16519 /* The refrele on the lifr_name ipif is done by ip_process_ioctl */ 16520 ill_refrele(usesrc_ill); 16521 16522 /* Let conn_ixa caching know that source address selection changed */ 16523 ip_update_source_selection(ipst); 16524 16525 return (err); 16526 } 16527 16528 /* ARGSUSED */ 16529 int 16530 ip_sioctl_get_dadstate(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 16531 ip_ioctl_cmd_t *ipip, void *if_req) 16532 { 16533 struct lifreq *lifr = (struct lifreq *)if_req; 16534 ill_t *ill = ipif->ipif_ill; 16535 16536 /* 16537 * Need a lock since IFF_UP can be set even when there are 16538 * references to the ipif. 16539 */ 16540 mutex_enter(&ill->ill_lock); 16541 if ((ipif->ipif_flags & IPIF_UP) && ipif->ipif_addr_ready == 0) 16542 lifr->lifr_dadstate = DAD_IN_PROGRESS; 16543 else 16544 lifr->lifr_dadstate = DAD_DONE; 16545 mutex_exit(&ill->ill_lock); 16546 return (0); 16547 } 16548 16549 /* 16550 * comparison function used by avl. 16551 */ 16552 static int 16553 ill_phyint_compare_index(const void *index_ptr, const void *phyip) 16554 { 16555 16556 uint_t index; 16557 16558 ASSERT(phyip != NULL && index_ptr != NULL); 16559 16560 index = *((uint_t *)index_ptr); 16561 /* 16562 * let the phyint with the lowest index be on top. 16563 */ 16564 if (((phyint_t *)phyip)->phyint_ifindex < index) 16565 return (1); 16566 if (((phyint_t *)phyip)->phyint_ifindex > index) 16567 return (-1); 16568 return (0); 16569 } 16570 16571 /* 16572 * comparison function used by avl. 16573 */ 16574 static int 16575 ill_phyint_compare_name(const void *name_ptr, const void *phyip) 16576 { 16577 ill_t *ill; 16578 int res = 0; 16579 16580 ASSERT(phyip != NULL && name_ptr != NULL); 16581 16582 if (((phyint_t *)phyip)->phyint_illv4) 16583 ill = ((phyint_t *)phyip)->phyint_illv4; 16584 else 16585 ill = ((phyint_t *)phyip)->phyint_illv6; 16586 ASSERT(ill != NULL); 16587 16588 res = strcmp(ill->ill_name, (char *)name_ptr); 16589 if (res > 0) 16590 return (1); 16591 else if (res < 0) 16592 return (-1); 16593 return (0); 16594 } 16595 16596 /* 16597 * This function is called on the unplumb path via ill_glist_delete() when 16598 * there are no ills left on the phyint and thus the phyint can be freed. 16599 */ 16600 static void 16601 phyint_free(phyint_t *phyi) 16602 { 16603 ip_stack_t *ipst = PHYINT_TO_IPST(phyi); 16604 16605 ASSERT(phyi->phyint_illv4 == NULL && phyi->phyint_illv6 == NULL); 16606 16607 /* 16608 * If this phyint was an IPMP meta-interface, blow away the group. 
16609 * This is safe to do because all of the illgrps have already been 16610 * removed by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find us. 16611 * If we're cleaning up as a result of failed initialization, 16612 * phyint_grp may be NULL. 16613 */ 16614 if ((phyi->phyint_flags & PHYI_IPMP) && (phyi->phyint_grp != NULL)) { 16615 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 16616 ipmp_grp_destroy(phyi->phyint_grp); 16617 phyi->phyint_grp = NULL; 16618 rw_exit(&ipst->ips_ipmp_lock); 16619 } 16620 16621 /* 16622 * If this interface was under IPMP, take it out of the group. 16623 */ 16624 if (phyi->phyint_grp != NULL) 16625 ipmp_phyint_leave_grp(phyi); 16626 16627 /* 16628 * Delete the phyint and disassociate its ipsq. The ipsq itself 16629 * will be freed in ipsq_exit(). 16630 */ 16631 phyi->phyint_ipsq->ipsq_phyint = NULL; 16632 phyi->phyint_name[0] = '\0'; 16633 16634 mi_free(phyi); 16635 } 16636 16637 /* 16638 * Attach the ill to the phyint structure which can be shared by both 16639 * IPv4 and IPv6 ill. ill_init allocates a phyint to just hold flags. This 16640 * function is called from ipif_set_values and ill_lookup_on_name (for 16641 * loopback) where we know the name of the ill. We lookup the ill and if 16642 * there is one present already with the name use that phyint. Otherwise 16643 * reuse the one allocated by ill_init. 16644 */ 16645 static void 16646 ill_phyint_reinit(ill_t *ill) 16647 { 16648 boolean_t isv6 = ill->ill_isv6; 16649 phyint_t *phyi_old; 16650 phyint_t *phyi; 16651 avl_index_t where = 0; 16652 ill_t *ill_other = NULL; 16653 ip_stack_t *ipst = ill->ill_ipst; 16654 16655 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 16656 16657 phyi_old = ill->ill_phyint; 16658 ASSERT(isv6 || (phyi_old->phyint_illv4 == ill && 16659 phyi_old->phyint_illv6 == NULL)); 16660 ASSERT(!isv6 || (phyi_old->phyint_illv6 == ill && 16661 phyi_old->phyint_illv4 == NULL)); 16662 ASSERT(phyi_old->phyint_ifindex == 0); 16663 16664 /* 16665 * Now that our ill has a name, set it in the phyint. 16666 */ 16667 (void) strlcpy(ill->ill_phyint->phyint_name, ill->ill_name, LIFNAMSIZ); 16668 16669 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 16670 ill->ill_name, &where); 16671 16672 /* 16673 * 1. We grabbed the ill_g_lock before inserting this ill into 16674 * the global list of ills. So no other thread could have located 16675 * this ill and hence the ipsq of this ill is guaranteed to be empty. 16676 * 2. Now locate the other protocol instance of this ill. 16677 * 3. Now grab both ill locks in the right order, and the phyint lock of 16678 * the new ipsq. Holding ill locks + ill_g_lock ensures that the ipsq 16679 * of neither ill can change. 16680 * 4. Merge the phyint and thus the ipsq as well of this ill onto the 16681 * other ill. 16682 * 5. Release all locks. 16683 */ 16684 16685 /* 16686 * Look for IPv4 if we are initializing IPv6 or look for IPv6 if 16687 * we are initializing IPv4. 16688 */ 16689 if (phyi != NULL) { 16690 ill_other = (isv6) ? phyi->phyint_illv4 : phyi->phyint_illv6; 16691 ASSERT(ill_other->ill_phyint != NULL); 16692 ASSERT((isv6 && !ill_other->ill_isv6) || 16693 (!isv6 && ill_other->ill_isv6)); 16694 GRAB_ILL_LOCKS(ill, ill_other); 16695 /* 16696 * We are potentially throwing away phyint_flags which 16697 * could be different from the one that we obtain from 16698 * ill_other->ill_phyint. But it is okay as we are assuming 16699 * that the state maintained within IP is correct. 
16700 */ 16701 mutex_enter(&phyi->phyint_lock); 16702 if (isv6) { 16703 ASSERT(phyi->phyint_illv6 == NULL); 16704 phyi->phyint_illv6 = ill; 16705 } else { 16706 ASSERT(phyi->phyint_illv4 == NULL); 16707 phyi->phyint_illv4 = ill; 16708 } 16709 16710 /* 16711 * Delete the old phyint and make its ipsq eligible 16712 * to be freed in ipsq_exit(). 16713 */ 16714 phyi_old->phyint_illv4 = NULL; 16715 phyi_old->phyint_illv6 = NULL; 16716 phyi_old->phyint_ipsq->ipsq_phyint = NULL; 16717 phyi_old->phyint_name[0] = '\0'; 16718 mi_free(phyi_old); 16719 } else { 16720 mutex_enter(&ill->ill_lock); 16721 /* 16722 * We don't need to acquire any lock, since 16723 * the ill is not yet visible globally and we 16724 * have not yet released the ill_g_lock. 16725 */ 16726 phyi = phyi_old; 16727 mutex_enter(&phyi->phyint_lock); 16728 /* XXX We need a recovery strategy here. */ 16729 if (!phyint_assign_ifindex(phyi, ipst)) 16730 cmn_err(CE_PANIC, "phyint_assign_ifindex() failed"); 16731 16732 avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 16733 (void *)phyi, where); 16734 16735 (void) avl_find(&ipst->ips_phyint_g_list-> 16736 phyint_list_avl_by_index, 16737 &phyi->phyint_ifindex, &where); 16738 avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 16739 (void *)phyi, where); 16740 } 16741 16742 /* 16743 * Reassigning ill_phyint automatically reassigns the ipsq also. 16744 * The pending mp is not affected because that is kept on a per-ill basis. 16745 */ 16746 ill->ill_phyint = phyi; 16747 16748 /* 16749 * Now that the phyint's ifindex has been assigned, complete the 16750 * remaining MIB ifindex and multicast version initialization below. 16751 */ 16752 ill->ill_ip_mib->ipIfStatsIfIndex = ill->ill_phyint->phyint_ifindex; 16753 if (ill->ill_isv6) { 16754 ill->ill_icmp6_mib->ipv6IfIcmpIfIndex = 16755 ill->ill_phyint->phyint_ifindex; 16756 ill->ill_mcast_type = ipst->ips_mld_max_version; 16757 } else { 16758 ill->ill_mcast_type = ipst->ips_igmp_max_version; 16759 } 16760 16761 /* 16762 * Generate an event within the hooks framework to indicate that 16763 * a new interface has just been added to IP. For this event to 16764 * be generated, the network interface must, at least, have an 16765 * ifindex assigned to it. (We don't generate the event for 16766 * loopback since ill_lookup_on_name() has its own NE_PLUMB event.) 16767 * 16768 * This needs to be run inside the ill_g_lock perimeter to ensure 16769 * that the ordering of delivered events to listeners matches the 16770 * order of them in the kernel. 16771 */ 16772 if (!IS_LOOPBACK(ill)) { 16773 ill_nic_event_dispatch(ill, 0, NE_PLUMB, ill->ill_name, 16774 ill->ill_name_length); 16775 } 16776 RELEASE_ILL_LOCKS(ill, ill_other); 16777 mutex_exit(&phyi->phyint_lock); 16778 } 16779 16780 /* 16781 * Notify any downstream modules of the name of this interface. 16782 * An M_IOCTL is used even though we don't expect a successful reply. 16783 * Any reply message from the driver (presumably an M_IOCNAK) will 16784 * eventually get discarded somewhere upstream. The message format is 16785 * simply an SIOCSLIFNAME ioctl just as might be sent from ifconfig 16786 * to IP.
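 * The chain built below is thus an M_IOCTL mblk (a struct iocblk with
 * ioc_cmd set to SIOCSLIFNAME) whose b_cont carries a struct lifreq
 * holding lifr_name, lifr_ppa, and the ILLF_IPV4/ILLF_IPV6 flags.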
16787 */ 16788 static void 16789 ip_ifname_notify(ill_t *ill, queue_t *q) 16790 { 16791 mblk_t *mp1, *mp2; 16792 struct iocblk *iocp; 16793 struct lifreq *lifr; 16794 16795 mp1 = mkiocb(SIOCSLIFNAME); 16796 if (mp1 == NULL) 16797 return; 16798 mp2 = allocb(sizeof (struct lifreq), BPRI_HI); 16799 if (mp2 == NULL) { 16800 freeb(mp1); 16801 return; 16802 } 16803 16804 mp1->b_cont = mp2; 16805 iocp = (struct iocblk *)mp1->b_rptr; 16806 iocp->ioc_count = sizeof (struct lifreq); 16807 16808 lifr = (struct lifreq *)mp2->b_rptr; 16809 mp2->b_wptr += sizeof (struct lifreq); 16810 bzero(lifr, sizeof (struct lifreq)); 16811 16812 (void) strncpy(lifr->lifr_name, ill->ill_name, LIFNAMSIZ); 16813 lifr->lifr_ppa = ill->ill_ppa; 16814 lifr->lifr_flags = (ill->ill_flags & (ILLF_IPV4|ILLF_IPV6)); 16815 16816 DTRACE_PROBE3(ill__dlpi, char *, "ip_ifname_notify", 16817 char *, "SIOCSLIFNAME", ill_t *, ill); 16818 putnext(q, mp1); 16819 } 16820 16821 static int 16822 ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) 16823 { 16824 int err; 16825 ip_stack_t *ipst = ill->ill_ipst; 16826 phyint_t *phyi = ill->ill_phyint; 16827 16828 /* 16829 * Now that ill_name is set, the configuration for the IPMP 16830 * meta-interface can be performed. 16831 */ 16832 if (IS_IPMP(ill)) { 16833 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 16834 /* 16835 * If phyi->phyint_grp is NULL, then this is the first IPMP 16836 * meta-interface and we need to create the IPMP group. 16837 */ 16838 if (phyi->phyint_grp == NULL) { 16839 /* 16840 * If someone has renamed another IPMP group to have 16841 * the same name as our interface, bail. 16842 */ 16843 if (ipmp_grp_lookup(ill->ill_name, ipst) != NULL) { 16844 rw_exit(&ipst->ips_ipmp_lock); 16845 return (EEXIST); 16846 } 16847 phyi->phyint_grp = ipmp_grp_create(ill->ill_name, phyi); 16848 if (phyi->phyint_grp == NULL) { 16849 rw_exit(&ipst->ips_ipmp_lock); 16850 return (ENOMEM); 16851 } 16852 } 16853 rw_exit(&ipst->ips_ipmp_lock); 16854 } 16855 16856 /* Tell downstream modules where they are. */ 16857 ip_ifname_notify(ill, q); 16858 16859 /* 16860 * ill_dl_phys returns EINPROGRESS in the usual case. 16861 * Error cases are ENOMEM ... 16862 */ 16863 err = ill_dl_phys(ill, ipif, mp, q); 16864 16865 if (ill->ill_isv6) { 16866 mutex_enter(&ipst->ips_mld_slowtimeout_lock); 16867 if (ipst->ips_mld_slowtimeout_id == 0) { 16868 ipst->ips_mld_slowtimeout_id = timeout(mld_slowtimo, 16869 (void *)ipst, 16870 MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL)); 16871 } 16872 mutex_exit(&ipst->ips_mld_slowtimeout_lock); 16873 } else { 16874 mutex_enter(&ipst->ips_igmp_slowtimeout_lock); 16875 if (ipst->ips_igmp_slowtimeout_id == 0) { 16876 ipst->ips_igmp_slowtimeout_id = timeout(igmp_slowtimo, 16877 (void *)ipst, 16878 MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL)); 16879 } 16880 mutex_exit(&ipst->ips_igmp_slowtimeout_lock); 16881 } 16882 16883 return (err); 16884 } 16885 16886 /* 16887 * Common routine for ppa and ifname setting. Should be called exclusive. 16888 * 16889 * Returns EINPROGRESS when mp has been consumed by queueing it on 16890 * ipx_pending_mp and the ioctl will complete in ip_rput. 16891 * 16892 * NOTE : If ppa is UNIT_MAX, we assign the next valid ppa and return 16893 * the new name and new ppa in lifr_name and lifr_ppa respectively. 16894 * For SLIFNAME, we pass these values back to the userland. 
16895 */ 16896 static int 16897 ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr) 16898 { 16899 ill_t *ill; 16900 ipif_t *ipif; 16901 ipsq_t *ipsq; 16902 char *ppa_ptr; 16903 char *old_ptr; 16904 char old_char; 16905 int error; 16906 ip_stack_t *ipst; 16907 16908 ip1dbg(("ipif_set_values: interface %s\n", interf_name)); 16909 ASSERT(q->q_next != NULL); 16910 ASSERT(interf_name != NULL); 16911 16912 ill = (ill_t *)q->q_ptr; 16913 ipst = ill->ill_ipst; 16914 16915 ASSERT(ill->ill_ipst != NULL); 16916 ASSERT(ill->ill_name[0] == '\0'); 16917 ASSERT(IAM_WRITER_ILL(ill)); 16918 ASSERT((mi_strlen(interf_name) + 1) <= LIFNAMSIZ); 16919 ASSERT(ill->ill_ppa == UINT_MAX); 16920 16921 ill->ill_defend_start = ill->ill_defend_count = 0; 16922 /* The ppa is sent down by ifconfig or is chosen */ 16923 if ((ppa_ptr = ill_get_ppa_ptr(interf_name)) == NULL) { 16924 return (EINVAL); 16925 } 16926 16927 /* 16928 * make sure ppa passed in is same as ppa in the name. 16929 * This check is not made when ppa == UINT_MAX in that case ppa 16930 * in the name could be anything. System will choose a ppa and 16931 * update new_ppa_ptr and inter_name to contain the choosen ppa. 16932 */ 16933 if (*new_ppa_ptr != UINT_MAX) { 16934 /* stoi changes the pointer */ 16935 old_ptr = ppa_ptr; 16936 /* 16937 * ifconfig passed in 0 for the ppa for DLPI 1 style devices 16938 * (they don't have an externally visible ppa). We assign one 16939 * here so that we can manage the interface. Note that in 16940 * the past this value was always 0 for DLPI 1 drivers. 16941 */ 16942 if (*new_ppa_ptr == 0) 16943 *new_ppa_ptr = stoi(&old_ptr); 16944 else if (*new_ppa_ptr != (uint_t)stoi(&old_ptr)) 16945 return (EINVAL); 16946 } 16947 /* 16948 * terminate string before ppa 16949 * save char at that location. 16950 */ 16951 old_char = ppa_ptr[0]; 16952 ppa_ptr[0] = '\0'; 16953 16954 ill->ill_ppa = *new_ppa_ptr; 16955 /* 16956 * Finish as much work now as possible before calling ill_glist_insert 16957 * which makes the ill globally visible and also merges it with the 16958 * other protocol instance of this phyint. The remaining work is 16959 * done after entering the ipsq which may happen sometime later. 16960 */ 16961 ipif = ill->ill_ipif; 16962 16963 /* We didn't do this when we allocated ipif in ip_ll_subnet_defaults */ 16964 ipif_assign_seqid(ipif); 16965 16966 if (!(ill->ill_flags & (ILLF_IPV4|ILLF_IPV6))) 16967 ill->ill_flags |= ILLF_IPV4; 16968 16969 ASSERT(ipif->ipif_next == NULL); /* Only one ipif on ill */ 16970 ASSERT((ipif->ipif_flags & IPIF_UP) == 0); 16971 16972 if (ill->ill_flags & ILLF_IPV6) { 16973 16974 ill->ill_isv6 = B_TRUE; 16975 ill_set_inputfn(ill); 16976 if (ill->ill_rq != NULL) { 16977 ill->ill_rq->q_qinfo = &iprinitv6; 16978 } 16979 16980 /* Keep the !IN6_IS_ADDR_V4MAPPED assertions happy */ 16981 ipif->ipif_v6lcl_addr = ipv6_all_zeros; 16982 ipif->ipif_v6subnet = ipv6_all_zeros; 16983 ipif->ipif_v6net_mask = ipv6_all_zeros; 16984 ipif->ipif_v6brd_addr = ipv6_all_zeros; 16985 ipif->ipif_v6pp_dst_addr = ipv6_all_zeros; 16986 ill->ill_reachable_retrans_time = ND_RETRANS_TIMER; 16987 /* 16988 * point-to-point or Non-mulicast capable 16989 * interfaces won't do NUD unless explicitly 16990 * configured to do so. 
16991 */ 16992 if (ipif->ipif_flags & IPIF_POINTOPOINT || 16993 !(ill->ill_flags & ILLF_MULTICAST)) { 16994 ill->ill_flags |= ILLF_NONUD; 16995 } 16996 /* Make sure IPv4 specific flag is not set on IPv6 if */ 16997 if (ill->ill_flags & ILLF_NOARP) { 16998 /* 16999 * Note: xresolv interfaces will eventually need 17000 * NOARP set here as well, but that will require 17001 * those external resolvers to have some 17002 * knowledge of that flag and act appropriately. 17003 * Not to be changed at present. 17004 */ 17005 ill->ill_flags &= ~ILLF_NOARP; 17006 } 17007 /* 17008 * Set the ILLF_ROUTER flag according to the global 17009 * IPv6 forwarding policy. 17010 */ 17011 if (ipst->ips_ipv6_forwarding != 0) 17012 ill->ill_flags |= ILLF_ROUTER; 17013 } else if (ill->ill_flags & ILLF_IPV4) { 17014 ill->ill_isv6 = B_FALSE; 17015 ill_set_inputfn(ill); 17016 ill->ill_reachable_retrans_time = ARP_RETRANS_TIMER; 17017 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6lcl_addr); 17018 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6subnet); 17019 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6net_mask); 17020 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6brd_addr); 17021 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6pp_dst_addr); 17022 /* 17023 * Set the ILLF_ROUTER flag according to the global 17024 * IPv4 forwarding policy. 17025 */ 17026 if (ipst->ips_ip_forwarding != 0) 17027 ill->ill_flags |= ILLF_ROUTER; 17028 } 17029 17030 ASSERT(ill->ill_phyint != NULL); 17031 17032 /* 17033 * The ipIfStatsIfindex and ipv6IfIcmpIfIndex assignments will 17034 * be completed in ill_glist_insert -> ill_phyint_reinit 17035 */ 17036 if (!ill_allocate_mibs(ill)) 17037 return (ENOMEM); 17038 17039 /* 17040 * Pick a default sap until we get the DL_INFO_ACK back from 17041 * the driver. 17042 */ 17043 ill->ill_sap = (ill->ill_isv6) ? ill->ill_media->ip_m_ipv6sap : 17044 ill->ill_media->ip_m_ipv4sap; 17045 17046 ill->ill_ifname_pending = 1; 17047 ill->ill_ifname_pending_err = 0; 17048 17049 /* 17050 * When the first ipif comes up in ipif_up_done(), multicast groups 17051 * that were joined while this ill was not bound to the DLPI link need 17052 * to be recovered by ill_recover_multicast(). 17053 */ 17054 ill->ill_need_recover_multicast = 1; 17055 17056 ill_refhold(ill); 17057 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 17058 if ((error = ill_glist_insert(ill, interf_name, 17059 (ill->ill_flags & ILLF_IPV6) == ILLF_IPV6)) > 0) { 17060 ill->ill_ppa = UINT_MAX; 17061 ill->ill_name[0] = '\0'; 17062 /* 17063 * undo null termination done above. 17064 */ 17065 ppa_ptr[0] = old_char; 17066 rw_exit(&ipst->ips_ill_g_lock); 17067 ill_refrele(ill); 17068 return (error); 17069 } 17070 17071 ASSERT(ill->ill_name_length <= LIFNAMSIZ); 17072 17073 /* 17074 * When we return the buffer pointed to by interf_name should contain 17075 * the same name as in ill_name. 17076 * If a ppa was choosen by the system (ppa passed in was UINT_MAX) 17077 * the buffer pointed to by new_ppa_ptr would not contain the right ppa 17078 * so copy full name and update the ppa ptr. 17079 * When ppa passed in != UINT_MAX all values are correct just undo 17080 * null termination, this saves a bcopy. 17081 */ 17082 if (*new_ppa_ptr == UINT_MAX) { 17083 bcopy(ill->ill_name, interf_name, ill->ill_name_length); 17084 *new_ppa_ptr = ill->ill_ppa; 17085 } else { 17086 /* 17087 * undo null termination done above. 
17088 */ 17089 ppa_ptr[0] = old_char; 17090 } 17091 17092 /* Let SCTP know about this ILL */ 17093 sctp_update_ill(ill, SCTP_ILL_INSERT); 17094 17095 /* 17096 * ill_glist_insert has made the ill visible globally, and 17097 * ill_phyint_reinit could have changed the ipsq. At this point, 17098 * we need to hold the ips_ill_g_lock across the call to enter the 17099 * ipsq to enforce atomicity and prevent reordering. In the event 17100 * the ipsq has changed, and if the new ipsq is currently busy, 17101 * we need to make sure that this half-completed ioctl is ahead of 17102 * any subsequent ioctl. We achieve this by not dropping the 17103 * ips_ill_g_lock which prevents any ill lookup itself thereby 17104 * ensuring that new ioctls can't start. 17105 */ 17106 ipsq = ipsq_try_enter_internal(ill, q, mp, ip_reprocess_ioctl, NEW_OP, 17107 B_TRUE); 17108 17109 rw_exit(&ipst->ips_ill_g_lock); 17110 ill_refrele(ill); 17111 if (ipsq == NULL) 17112 return (EINPROGRESS); 17113 17114 /* 17115 * If ill_phyint_reinit() changed our ipsq, then start on the new ipsq. 17116 */ 17117 if (ipsq->ipsq_xop->ipx_current_ipif == NULL) 17118 ipsq_current_start(ipsq, ipif, SIOCSLIFNAME); 17119 else 17120 ASSERT(ipsq->ipsq_xop->ipx_current_ipif == ipif); 17121 17122 error = ipif_set_values_tail(ill, ipif, mp, q); 17123 ipsq_exit(ipsq); 17124 if (error != 0 && error != EINPROGRESS) { 17125 /* 17126 * restore previous values 17127 */ 17128 ill->ill_isv6 = B_FALSE; 17129 ill_set_inputfn(ill); 17130 } 17131 return (error); 17132 } 17133 17134 void 17135 ipif_init(ip_stack_t *ipst) 17136 { 17137 int i; 17138 17139 for (i = 0; i < MAX_G_HEADS; i++) { 17140 ipst->ips_ill_g_heads[i].ill_g_list_head = 17141 (ill_if_t *)&ipst->ips_ill_g_heads[i]; 17142 ipst->ips_ill_g_heads[i].ill_g_list_tail = 17143 (ill_if_t *)&ipst->ips_ill_g_heads[i]; 17144 } 17145 17146 avl_create(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 17147 ill_phyint_compare_index, 17148 sizeof (phyint_t), 17149 offsetof(struct phyint, phyint_avl_by_index)); 17150 avl_create(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 17151 ill_phyint_compare_name, 17152 sizeof (phyint_t), 17153 offsetof(struct phyint, phyint_avl_by_name)); 17154 } 17155 17156 /* 17157 * Save enough information so that we can recreate the IRE if 17158 * the interface goes down and then up. 
17159 */ 17160 void 17161 ill_save_ire(ill_t *ill, ire_t *ire) 17162 { 17163 mblk_t *save_mp; 17164 17165 save_mp = allocb(sizeof (ifrt_t), BPRI_MED); 17166 if (save_mp != NULL) { 17167 ifrt_t *ifrt; 17168 17169 save_mp->b_wptr += sizeof (ifrt_t); 17170 ifrt = (ifrt_t *)save_mp->b_rptr; 17171 bzero(ifrt, sizeof (ifrt_t)); 17172 ifrt->ifrt_type = ire->ire_type; 17173 if (ire->ire_ipversion == IPV4_VERSION) { 17174 ASSERT(!ill->ill_isv6); 17175 ifrt->ifrt_addr = ire->ire_addr; 17176 ifrt->ifrt_gateway_addr = ire->ire_gateway_addr; 17177 ifrt->ifrt_setsrc_addr = ire->ire_setsrc_addr; 17178 ifrt->ifrt_mask = ire->ire_mask; 17179 } else { 17180 ASSERT(ill->ill_isv6); 17181 ifrt->ifrt_v6addr = ire->ire_addr_v6; 17182 /* ire_gateway_addr_v6 can change due to RTM_CHANGE */ 17183 mutex_enter(&ire->ire_lock); 17184 ifrt->ifrt_v6gateway_addr = ire->ire_gateway_addr_v6; 17185 mutex_exit(&ire->ire_lock); 17186 ifrt->ifrt_v6setsrc_addr = ire->ire_setsrc_addr_v6; 17187 ifrt->ifrt_v6mask = ire->ire_mask_v6; 17188 } 17189 ifrt->ifrt_flags = ire->ire_flags; 17190 ifrt->ifrt_zoneid = ire->ire_zoneid; 17191 mutex_enter(&ill->ill_saved_ire_lock); 17192 save_mp->b_cont = ill->ill_saved_ire_mp; 17193 ill->ill_saved_ire_mp = save_mp; 17194 ill->ill_saved_ire_cnt++; 17195 mutex_exit(&ill->ill_saved_ire_lock); 17196 } 17197 } 17198 17199 /* 17200 * Remove one entry from ill_saved_ire_mp. 17201 */ 17202 void 17203 ill_remove_saved_ire(ill_t *ill, ire_t *ire) 17204 { 17205 mblk_t **mpp; 17206 mblk_t *mp; 17207 ifrt_t *ifrt; 17208 17209 /* Remove from ill_saved_ire_mp list if it is there */ 17210 mutex_enter(&ill->ill_saved_ire_lock); 17211 for (mpp = &ill->ill_saved_ire_mp; *mpp != NULL; 17212 mpp = &(*mpp)->b_cont) { 17213 in6_addr_t gw_addr_v6; 17214 17215 /* 17216 * On a given ill, the tuple of address, gateway, mask, 17217 * ire_type, and zoneid is unique for each saved IRE. 17218 */ 17219 mp = *mpp; 17220 ifrt = (ifrt_t *)mp->b_rptr; 17221 /* ire_gateway_addr_v6 can change - need lock */ 17222 mutex_enter(&ire->ire_lock); 17223 gw_addr_v6 = ire->ire_gateway_addr_v6; 17224 mutex_exit(&ire->ire_lock); 17225 17226 if (ifrt->ifrt_zoneid != ire->ire_zoneid || 17227 ifrt->ifrt_type != ire->ire_type) 17228 continue; 17229 17230 if (ill->ill_isv6 ? 17231 (IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6addr, 17232 &ire->ire_addr_v6) && 17233 IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6gateway_addr, 17234 &gw_addr_v6) && 17235 IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6mask, 17236 &ire->ire_mask_v6)) : 17237 (ifrt->ifrt_addr == ire->ire_addr && 17238 ifrt->ifrt_gateway_addr == ire->ire_gateway_addr && 17239 ifrt->ifrt_mask == ire->ire_mask)) { 17240 *mpp = mp->b_cont; 17241 ill->ill_saved_ire_cnt--; 17242 freeb(mp); 17243 break; 17244 } 17245 } 17246 mutex_exit(&ill->ill_saved_ire_lock); 17247 } 17248 17249 /* 17250 * IP multirouting broadcast routes handling 17251 * Append CGTP broadcast IREs to regular ones created 17252 * at ifconfig time. 17253 * The usage is a route add <cgtp_bc> <nic_bc> -multirt i.e., both 17254 * the destination and the gateway are broadcast addresses. 17255 * The caller has verified that the destination is an IRE_BROADCAST and that 17256 * RTF_MULTIRT was set. Here if the gateway is a broadcast address, then 17257 * we create a MULTIRT IRE_BROADCAST. 17258 * Note that the IRE_HOST created by ire_rt_add doesn't get found by anything 17259 * since the IRE_BROADCAST takes precedence; ire_add_v4 does head insertion. 
17260 */ 17261 static void 17262 ip_cgtp_bcast_add(ire_t *ire, ip_stack_t *ipst) 17263 { 17264 ire_t *ire_prim; 17265 17266 ASSERT(ire != NULL); 17267 17268 ire_prim = ire_ftable_lookup_v4(ire->ire_gateway_addr, 0, 0, 17269 IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst, 17270 NULL); 17271 if (ire_prim != NULL) { 17272 /* 17273 * We are in the special case of broadcasts for 17274 * CGTP. We add an IRE_BROADCAST that holds 17275 * the RTF_MULTIRT flag, the destination 17276 * address and the low level 17277 * info of ire_prim. In other words, CGTP 17278 * broadcast is added to the redundant ipif. 17279 */ 17280 ill_t *ill_prim; 17281 ire_t *bcast_ire; 17282 17283 ill_prim = ire_prim->ire_ill; 17284 17285 ip2dbg(("ip_cgtp_filter_bcast_add: ire_prim %p, ill_prim %p\n", 17286 (void *)ire_prim, (void *)ill_prim)); 17287 17288 bcast_ire = ire_create( 17289 (uchar_t *)&ire->ire_addr, 17290 (uchar_t *)&ip_g_all_ones, 17291 (uchar_t *)&ire->ire_gateway_addr, 17292 IRE_BROADCAST, 17293 ill_prim, 17294 GLOBAL_ZONEID, /* CGTP is only for the global zone */ 17295 ire->ire_flags | RTF_KERNEL, 17296 NULL, 17297 ipst); 17298 17299 /* 17300 * Here we assume that ire_add does head insertion so that 17301 * the added IRE_BROADCAST comes before the existing IRE_HOST. 17302 */ 17303 if (bcast_ire != NULL) { 17304 if (ire->ire_flags & RTF_SETSRC) { 17305 bcast_ire->ire_setsrc_addr = 17306 ire->ire_setsrc_addr; 17307 } 17308 bcast_ire = ire_add(bcast_ire); 17309 if (bcast_ire != NULL) { 17310 ip2dbg(("ip_cgtp_filter_bcast_add: " 17311 "added bcast_ire %p\n", 17312 (void *)bcast_ire)); 17313 17314 ill_save_ire(ill_prim, bcast_ire); 17315 ire_refrele(bcast_ire); 17316 } 17317 } 17318 ire_refrele(ire_prim); 17319 } 17320 } 17321 17322 /* 17323 * IP multirouting broadcast routes handling 17324 * Remove the broadcast ire. 17325 * The usage is a route delete <cgtp_bc> <nic_bc> -multirt i.e., both 17326 * the destination and the gateway are broadcast addresses. 17327 * The caller has only verified that RTF_MULTIRT was set. We check 17328 * that the destination is broadcast and that the gateway is a broadcast 17329 * address, and if so delete the IRE added by ip_cgtp_bcast_add(). 17330 */ 17331 static void 17332 ip_cgtp_bcast_delete(ire_t *ire, ip_stack_t *ipst) 17333 { 17334 ASSERT(ire != NULL); 17335 17336 if (ip_type_v4(ire->ire_addr, ipst) == IRE_BROADCAST) { 17337 ire_t *ire_prim; 17338 17339 ire_prim = ire_ftable_lookup_v4(ire->ire_gateway_addr, 0, 0, 17340 IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, 17341 ipst, NULL); 17342 if (ire_prim != NULL) { 17343 ill_t *ill_prim; 17344 ire_t *bcast_ire; 17345 17346 ill_prim = ire_prim->ire_ill; 17347 17348 ip2dbg(("ip_cgtp_filter_bcast_delete: " 17349 "ire_prim %p, ill_prim %p\n", 17350 (void *)ire_prim, (void *)ill_prim)); 17351 17352 bcast_ire = ire_ftable_lookup_v4(ire->ire_addr, 0, 17353 ire->ire_gateway_addr, IRE_BROADCAST, 17354 ill_prim, ALL_ZONES, NULL, 17355 MATCH_IRE_TYPE | MATCH_IRE_GW | MATCH_IRE_ILL | 17356 MATCH_IRE_MASK, 0, ipst, NULL); 17357 17358 if (bcast_ire != NULL) { 17359 ip2dbg(("ip_cgtp_filter_bcast_delete: " 17360 "looked up bcast_ire %p\n", 17361 (void *)bcast_ire)); 17362 ill_remove_saved_ire(bcast_ire->ire_ill, 17363 bcast_ire); 17364 ire_delete(bcast_ire); 17365 ire_refrele(bcast_ire); 17366 } 17367 ire_refrele(ire_prim); 17368 } 17369 } 17370 } 17371 17372 /* 17373 * Derive an interface id from the link layer address. 17374 * Knows about IEEE 802 and IEEE EUI-64 mappings. 
17375 */ 17376 static void 17377 ip_ether_v6intfid(ill_t *ill, in6_addr_t *v6addr) 17378 { 17379 char *addr; 17380 17381 /* 17382 * Note that some IPv6 interfaces get plumbed over links that claim to 17383 * be DL_ETHER, but don't actually have Ethernet MAC addresses (e.g. 17384 * PPP links). The ETHERADDRL check here ensures that we only set the 17385 * interface ID on IPv6 interfaces above links that actually have real 17386 * Ethernet addresses. 17387 */ 17388 if (ill->ill_phys_addr_length == ETHERADDRL) { 17389 /* Form EUI-64 like address */ 17390 addr = (char *)&v6addr->s6_addr32[2]; 17391 bcopy(ill->ill_phys_addr, addr, 3); 17392 addr[0] ^= 0x2; /* Toggle Universal/Local bit */ 17393 addr[3] = (char)0xff; 17394 addr[4] = (char)0xfe; 17395 bcopy(ill->ill_phys_addr + 3, addr + 5, 3); 17396 } 17397 } 17398 17399 /* ARGSUSED */ 17400 static void 17401 ip_nodef_v6intfid(ill_t *ill, in6_addr_t *v6addr) 17402 { 17403 } 17404 17405 typedef struct ipmp_ifcookie { 17406 uint32_t ic_hostid; 17407 char ic_ifname[LIFNAMSIZ]; 17408 char ic_zonename[ZONENAME_MAX]; 17409 } ipmp_ifcookie_t; 17410 17411 /* 17412 * Construct a pseudo-random interface ID for the IPMP interface that's both 17413 * predictable and (almost) guaranteed to be unique. 17414 */ 17415 static void 17416 ip_ipmp_v6intfid(ill_t *ill, in6_addr_t *v6addr) 17417 { 17418 zone_t *zp; 17419 uint8_t *addr; 17420 uchar_t hash[16]; 17421 ulong_t hostid; 17422 MD5_CTX ctx; 17423 ipmp_ifcookie_t ic = { 0 }; 17424 17425 ASSERT(IS_IPMP(ill)); 17426 17427 (void) ddi_strtoul(hw_serial, NULL, 10, &hostid); 17428 ic.ic_hostid = htonl((uint32_t)hostid); 17429 17430 (void) strlcpy(ic.ic_ifname, ill->ill_name, LIFNAMSIZ); 17431 17432 if ((zp = zone_find_by_id(ill->ill_zoneid)) != NULL) { 17433 (void) strlcpy(ic.ic_zonename, zp->zone_name, ZONENAME_MAX); 17434 zone_rele(zp); 17435 } 17436 17437 MD5Init(&ctx); 17438 MD5Update(&ctx, &ic, sizeof (ic)); 17439 MD5Final(hash, &ctx); 17440 17441 /* 17442 * Map the hash to an interface ID per the basic approach in RFC3041. 17443 */ 17444 addr = &v6addr->s6_addr8[8]; 17445 bcopy(hash + 8, addr, sizeof (uint64_t)); 17446 addr[0] &= ~0x2; /* set local bit */ 17447 } 17448 17449 /* 17450 * Map the multicast in6_addr_t in m_ip6addr to the physaddr for ethernet. 17451 */ 17452 static void 17453 ip_ether_v6_mapping(ill_t *ill, uchar_t *m_ip6addr, uchar_t *m_physaddr) 17454 { 17455 phyint_t *phyi = ill->ill_phyint; 17456 17457 /* 17458 * Check PHYI_MULTI_BCAST and length of physical 17459 * address to determine if we use the mapping or the 17460 * broadcast address. 17461 */ 17462 if ((phyi->phyint_flags & PHYI_MULTI_BCAST) != 0 || 17463 ill->ill_phys_addr_length != ETHERADDRL) { 17464 ip_mbcast_mapping(ill, m_ip6addr, m_physaddr); 17465 return; 17466 } 17467 m_physaddr[0] = 0x33; 17468 m_physaddr[1] = 0x33; 17469 m_physaddr[2] = m_ip6addr[12]; 17470 m_physaddr[3] = m_ip6addr[13]; 17471 m_physaddr[4] = m_ip6addr[14]; 17472 m_physaddr[5] = m_ip6addr[15]; 17473 } 17474 17475 /* 17476 * Map the multicast ipaddr_t in m_ipaddr to the physaddr for ethernet. 17477 */ 17478 static void 17479 ip_ether_v4_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr) 17480 { 17481 phyint_t *phyi = ill->ill_phyint; 17482 17483 /* 17484 * Check PHYI_MULTI_BCAST and length of physical 17485 * address to determine if we use the mapping or the 17486 * broadcast address. 
17487 */ 17488 if ((phyi->phyint_flags & PHYI_MULTI_BCAST) != 0 || 17489 ill->ill_phys_addr_length != ETHERADDRL) { 17490 ip_mbcast_mapping(ill, m_ipaddr, m_physaddr); 17491 return; 17492 } 17493 m_physaddr[0] = 0x01; 17494 m_physaddr[1] = 0x00; 17495 m_physaddr[2] = 0x5e; 17496 m_physaddr[3] = m_ipaddr[1] & 0x7f; 17497 m_physaddr[4] = m_ipaddr[2]; 17498 m_physaddr[5] = m_ipaddr[3]; 17499 } 17500 17501 /* ARGSUSED */ 17502 static void 17503 ip_mbcast_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr) 17504 { 17505 /* 17506 * for the MULTI_BCAST case and other cases when we want to 17507 * use the link-layer broadcast address for multicast. 17508 */ 17509 uint8_t *bphys_addr; 17510 dl_unitdata_req_t *dlur; 17511 17512 dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr; 17513 if (ill->ill_sap_length < 0) { 17514 bphys_addr = (uchar_t *)dlur + 17515 dlur->dl_dest_addr_offset; 17516 } else { 17517 bphys_addr = (uchar_t *)dlur + 17518 dlur->dl_dest_addr_offset + ill->ill_sap_length; 17519 } 17520 17521 bcopy(bphys_addr, m_physaddr, ill->ill_phys_addr_length); 17522 } 17523 17524 /* 17525 * Derive IPoIB interface id from the link layer address. 17526 */ 17527 static void 17528 ip_ib_v6intfid(ill_t *ill, in6_addr_t *v6addr) 17529 { 17530 char *addr; 17531 17532 ASSERT(ill->ill_phys_addr_length == 20); 17533 addr = (char *)&v6addr->s6_addr32[2]; 17534 bcopy(ill->ill_phys_addr + 12, addr, 8); 17535 /* 17536 * In IBA 1.1 timeframe, some vendors erroneously set the u/l bit 17537 * in the globally assigned EUI-64 GUID to 1, in violation of IEEE 17538 * rules. In these cases, the IBA considers these GUIDs to be in 17539 * "Modified EUI-64" format, and thus toggling the u/l bit is not 17540 * required; vendors are required not to assign global EUI-64's 17541 * that differ only in u/l bit values, thus guaranteeing uniqueness 17542 * of the interface identifier. Whether the GUID is in modified 17543 * or proper EUI-64 format, the ipv6 identifier must have the u/l 17544 * bit set to 1. 17545 */ 17546 addr[0] |= 2; /* Set Universal/Local bit to 1 */ 17547 } 17548 17549 /* 17550 * Map the multicast ipaddr_t in m_ipaddr to the physaddr for InfiniBand. 17551 * Note on mapping from multicast IP addresses to IPoIB multicast link 17552 * addresses. IPoIB multicast link addresses are based on IBA link addresses. 17553 * The format of an IPoIB multicast address is: 17554 * 17555 * 4 byte QPN Scope Sign. Pkey 17556 * +--------------------------------------------+ 17557 * | 00FFFFFF | FF | 1X | X01B | Pkey | GroupID | 17558 * +--------------------------------------------+ 17559 * 17560 * The Scope and Pkey components are properties of the IBA port and 17561 * network interface. They can be ascertained from the broadcast address. 17562 * The Sign. part is the signature, and is 401B for IPv4 and 601B for IPv6. 17563 */ 17564 static void 17565 ip_ib_v4_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr) 17566 { 17567 static uint8_t ipv4_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff, 17568 0xff, 0x10, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00, 17569 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; 17570 uint8_t *bphys_addr; 17571 dl_unitdata_req_t *dlur; 17572 17573 bcopy(ipv4_g_phys_ibmulti_addr, m_physaddr, ill->ill_phys_addr_length); 17574 17575 /* 17576 * RFC 4391: IPv4 MGID is 28-bit long. 
17577 */ 17578 m_physaddr[16] = m_ipaddr[0] & 0x0f; 17579 m_physaddr[17] = m_ipaddr[1]; 17580 m_physaddr[18] = m_ipaddr[2]; 17581 m_physaddr[19] = m_ipaddr[3]; 17582 17583 17584 dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr; 17585 if (ill->ill_sap_length < 0) { 17586 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset; 17587 } else { 17588 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset + 17589 ill->ill_sap_length; 17590 } 17591 /* 17592 * Now fill in the IBA scope/Pkey values from the broadcast address. 17593 */ 17594 m_physaddr[5] = bphys_addr[5]; 17595 m_physaddr[8] = bphys_addr[8]; 17596 m_physaddr[9] = bphys_addr[9]; 17597 } 17598 17599 static void 17600 ip_ib_v6_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr) 17601 { 17602 static uint8_t ipv4_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff, 17603 0xff, 0x10, 0x60, 0x1b, 0x00, 0x00, 0x00, 0x00, 17604 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; 17605 uint8_t *bphys_addr; 17606 dl_unitdata_req_t *dlur; 17607 17608 bcopy(ipv4_g_phys_ibmulti_addr, m_physaddr, ill->ill_phys_addr_length); 17609 17610 /* 17611 * RFC 4391: IPv4 MGID is 80-bit long. 17612 */ 17613 bcopy(&m_ipaddr[6], &m_physaddr[10], 10); 17614 17615 dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr; 17616 if (ill->ill_sap_length < 0) { 17617 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset; 17618 } else { 17619 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset + 17620 ill->ill_sap_length; 17621 } 17622 /* 17623 * Now fill in the IBA scope/Pkey values from the broadcast address. 17624 */ 17625 m_physaddr[5] = bphys_addr[5]; 17626 m_physaddr[8] = bphys_addr[8]; 17627 m_physaddr[9] = bphys_addr[9]; 17628 } 17629 17630 /* 17631 * Derive IPv6 interface id from an IPv4 link-layer address (e.g. from an IPv4 17632 * tunnel). The IPv4 address simply get placed in the lower 4 bytes of the 17633 * IPv6 interface id. This is a suggested mechanism described in section 3.7 17634 * of RFC4213. 17635 */ 17636 static void 17637 ip_ipv4_genv6intfid(ill_t *ill, uint8_t *physaddr, in6_addr_t *v6addr) 17638 { 17639 ASSERT(ill->ill_phys_addr_length == sizeof (ipaddr_t)); 17640 v6addr->s6_addr32[2] = 0; 17641 bcopy(physaddr, &v6addr->s6_addr32[3], sizeof (ipaddr_t)); 17642 } 17643 17644 /* 17645 * Derive IPv6 interface id from an IPv6 link-layer address (e.g. from an IPv6 17646 * tunnel). The lower 8 bytes of the IPv6 address simply become the interface 17647 * id. 17648 */ 17649 static void 17650 ip_ipv6_genv6intfid(ill_t *ill, uint8_t *physaddr, in6_addr_t *v6addr) 17651 { 17652 in6_addr_t *v6lladdr = (in6_addr_t *)physaddr; 17653 17654 ASSERT(ill->ill_phys_addr_length == sizeof (in6_addr_t)); 17655 bcopy(&v6lladdr->s6_addr32[2], &v6addr->s6_addr32[2], 8); 17656 } 17657 17658 static void 17659 ip_ipv6_v6intfid(ill_t *ill, in6_addr_t *v6addr) 17660 { 17661 ip_ipv6_genv6intfid(ill, ill->ill_phys_addr, v6addr); 17662 } 17663 17664 static void 17665 ip_ipv6_v6destintfid(ill_t *ill, in6_addr_t *v6addr) 17666 { 17667 ip_ipv6_genv6intfid(ill, ill->ill_dest_addr, v6addr); 17668 } 17669 17670 static void 17671 ip_ipv4_v6intfid(ill_t *ill, in6_addr_t *v6addr) 17672 { 17673 ip_ipv4_genv6intfid(ill, ill->ill_phys_addr, v6addr); 17674 } 17675 17676 static void 17677 ip_ipv4_v6destintfid(ill_t *ill, in6_addr_t *v6addr) 17678 { 17679 ip_ipv4_genv6intfid(ill, ill->ill_dest_addr, v6addr); 17680 } 17681 17682 /* 17683 * Lookup an ill and verify that the zoneid has an ipif on that ill. 17684 * Returns an held ill, or NULL. 
17685 */ 17686 ill_t * 17687 ill_lookup_on_ifindex_zoneid(uint_t index, zoneid_t zoneid, boolean_t isv6, 17688 ip_stack_t *ipst) 17689 { 17690 ill_t *ill; 17691 ipif_t *ipif; 17692 17693 ill = ill_lookup_on_ifindex(index, isv6, ipst); 17694 if (ill == NULL) 17695 return (NULL); 17696 17697 mutex_enter(&ill->ill_lock); 17698 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 17699 if (IPIF_IS_CONDEMNED(ipif)) 17700 continue; 17701 if (zoneid != ALL_ZONES && ipif->ipif_zoneid != zoneid && 17702 ipif->ipif_zoneid != ALL_ZONES) 17703 continue; 17704 17705 mutex_exit(&ill->ill_lock); 17706 return (ill); 17707 } 17708 mutex_exit(&ill->ill_lock); 17709 ill_refrele(ill); 17710 return (NULL); 17711 } 17712 17713 /* 17714 * Return a pointer to an ipif_t given a combination of (ill_idx,ipif_id) 17715 * If a pointer to an ipif_t is returned then the caller will need to do 17716 * an ill_refrele(). 17717 */ 17718 ipif_t * 17719 ipif_getby_indexes(uint_t ifindex, uint_t lifidx, boolean_t isv6, 17720 ip_stack_t *ipst) 17721 { 17722 ipif_t *ipif; 17723 ill_t *ill; 17724 17725 ill = ill_lookup_on_ifindex(ifindex, isv6, ipst); 17726 if (ill == NULL) 17727 return (NULL); 17728 17729 mutex_enter(&ill->ill_lock); 17730 if (ill->ill_state_flags & ILL_CONDEMNED) { 17731 mutex_exit(&ill->ill_lock); 17732 ill_refrele(ill); 17733 return (NULL); 17734 } 17735 17736 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 17737 if (!IPIF_CAN_LOOKUP(ipif)) 17738 continue; 17739 if (lifidx == ipif->ipif_id) { 17740 ipif_refhold_locked(ipif); 17741 break; 17742 } 17743 } 17744 17745 mutex_exit(&ill->ill_lock); 17746 ill_refrele(ill); 17747 return (ipif); 17748 } 17749 17750 /* 17751 * Set ill_inputfn based on the current know state. 17752 * This needs to be called when any of the factors taken into 17753 * account changes. 17754 */ 17755 void 17756 ill_set_inputfn(ill_t *ill) 17757 { 17758 ip_stack_t *ipst = ill->ill_ipst; 17759 17760 if (ill->ill_isv6) { 17761 if (is_system_labeled()) 17762 ill->ill_inputfn = ill_input_full_v6; 17763 else 17764 ill->ill_inputfn = ill_input_short_v6; 17765 } else { 17766 if (is_system_labeled()) 17767 ill->ill_inputfn = ill_input_full_v4; 17768 else if (ill->ill_dhcpinit != 0) 17769 ill->ill_inputfn = ill_input_full_v4; 17770 else if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_RSVP].connf_head 17771 != NULL) 17772 ill->ill_inputfn = ill_input_full_v4; 17773 else if (ipst->ips_ip_cgtp_filter && 17774 ipst->ips_ip_cgtp_filter_ops != NULL) 17775 ill->ill_inputfn = ill_input_full_v4; 17776 else 17777 ill->ill_inputfn = ill_input_short_v4; 17778 } 17779 } 17780 17781 /* 17782 * Re-evaluate ill_inputfn for all the IPv4 ills. 17783 * Used when RSVP and CGTP comes and goes. 17784 */ 17785 void 17786 ill_set_inputfn_all(ip_stack_t *ipst) 17787 { 17788 ill_walk_context_t ctx; 17789 ill_t *ill; 17790 17791 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 17792 ill = ILL_START_WALK_V4(&ctx, ipst); 17793 for (; ill != NULL; ill = ill_next(&ctx, ill)) 17794 ill_set_inputfn(ill); 17795 17796 rw_exit(&ipst->ips_ill_g_lock); 17797 } 17798 17799 /* 17800 * Set the physical address information for `ill' to the contents of the 17801 * dl_notify_ind_t pointed to by `mp'. Must be called as writer, and will be 17802 * asynchronous if `ill' cannot immediately be quiesced -- in which case 17803 * EINPROGRESS will be returned. 
17804 */ 17805 int 17806 ill_set_phys_addr(ill_t *ill, mblk_t *mp) 17807 { 17808 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 17809 dl_notify_ind_t *dlindp = (dl_notify_ind_t *)mp->b_rptr; 17810 17811 ASSERT(IAM_WRITER_IPSQ(ipsq)); 17812 17813 if (dlindp->dl_data != DL_IPV6_LINK_LAYER_ADDR && 17814 dlindp->dl_data != DL_CURR_DEST_ADDR && 17815 dlindp->dl_data != DL_CURR_PHYS_ADDR) { 17816 /* Changing DL_IPV6_TOKEN is not yet supported */ 17817 return (0); 17818 } 17819 17820 /* 17821 * We need to store up to two copies of `mp' in `ill'. Due to the 17822 * design of ipsq_pending_mp_add(), we can't pass them as separate 17823 * arguments to ill_set_phys_addr_tail(). Instead, chain them 17824 * together here, then pull 'em apart in ill_set_phys_addr_tail(). 17825 */ 17826 if ((mp = copyb(mp)) == NULL || (mp->b_cont = copyb(mp)) == NULL) { 17827 freemsg(mp); 17828 return (ENOMEM); 17829 } 17830 17831 ipsq_current_start(ipsq, ill->ill_ipif, 0); 17832 17833 /* 17834 * Since we'll only do a logical down, we can't rely on ipif_down 17835 * to turn on ILL_DOWN_IN_PROGRESS, or for the DL_BIND_ACK to reset 17836 * ILL_DOWN_IN_PROGRESS. We instead manage this separately for this 17837 * case, to quiesce ire's and nce's for ill_is_quiescent. 17838 */ 17839 mutex_enter(&ill->ill_lock); 17840 ill->ill_state_flags |= ILL_DOWN_IN_PROGRESS; 17841 /* no more ire/nce addition allowed */ 17842 mutex_exit(&ill->ill_lock); 17843 17844 /* 17845 * If we can quiesce the ill, then set the address. If not, then 17846 * ill_set_phys_addr_tail() will be called from ipif_ill_refrele_tail(). 17847 */ 17848 ill_down_ipifs(ill, B_TRUE); 17849 mutex_enter(&ill->ill_lock); 17850 if (!ill_is_quiescent(ill)) { 17851 /* call cannot fail since `conn_t *' argument is NULL */ 17852 (void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq, 17853 mp, ILL_DOWN); 17854 mutex_exit(&ill->ill_lock); 17855 return (EINPROGRESS); 17856 } 17857 mutex_exit(&ill->ill_lock); 17858 17859 ill_set_phys_addr_tail(ipsq, ill->ill_rq, mp, NULL); 17860 return (0); 17861 } 17862 17863 /* 17864 * When the allowed-ips link property is set on the datalink, IP receives a 17865 * DL_NOTE_ALLOWED_IPS notification that is processed in ill_set_allowed_ips() 17866 * to initialize the ill_allowed_ips[] array in the ill_t. This array is then 17867 * used to vet addresses passed to ip_sioctl_addr() and to ensure that the 17868 * only IP addresses configured on the ill_t are those in the ill_allowed_ips[] 17869 * array. 
17870 */ 17871 void 17872 ill_set_allowed_ips(ill_t *ill, mblk_t *mp) 17873 { 17874 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 17875 dl_notify_ind_t *dlip = (dl_notify_ind_t *)mp->b_rptr; 17876 mac_protect_t *mrp; 17877 int i; 17878 17879 ASSERT(IAM_WRITER_IPSQ(ipsq)); 17880 mrp = (mac_protect_t *)&dlip[1]; 17881 17882 if (mrp->mp_ipaddrcnt == 0) { /* reset allowed-ips */ 17883 kmem_free(ill->ill_allowed_ips, 17884 ill->ill_allowed_ips_cnt * sizeof (in6_addr_t)); 17885 ill->ill_allowed_ips_cnt = 0; 17886 ill->ill_allowed_ips = NULL; 17887 mutex_enter(&ill->ill_phyint->phyint_lock); 17888 ill->ill_phyint->phyint_flags &= ~PHYI_L3PROTECT; 17889 mutex_exit(&ill->ill_phyint->phyint_lock); 17890 return; 17891 } 17892 17893 if (ill->ill_allowed_ips != NULL) { 17894 kmem_free(ill->ill_allowed_ips, 17895 ill->ill_allowed_ips_cnt * sizeof (in6_addr_t)); 17896 } 17897 ill->ill_allowed_ips_cnt = mrp->mp_ipaddrcnt; 17898 ill->ill_allowed_ips = kmem_alloc( 17899 ill->ill_allowed_ips_cnt * sizeof (in6_addr_t), KM_SLEEP); 17900 for (i = 0; i < mrp->mp_ipaddrcnt; i++) 17901 ill->ill_allowed_ips[i] = mrp->mp_ipaddrs[i].ip_addr; 17902 17903 mutex_enter(&ill->ill_phyint->phyint_lock); 17904 ill->ill_phyint->phyint_flags |= PHYI_L3PROTECT; 17905 mutex_exit(&ill->ill_phyint->phyint_lock); 17906 } 17907 17908 /* 17909 * Once the ill associated with `q' has quiesced, set its physical address 17910 * information to the values in `addrmp'. Note that two copies of `addrmp' 17911 * are passed (linked by b_cont), since we sometimes need to save two distinct 17912 * copies in the ill_t, and our context doesn't permit sleeping or allocation 17913 * failure (we'll free the other copy if it's not needed). Since the ill_t 17914 * is quiesced, we know any stale nce's with the old address information have 17915 * already been removed, so we don't need to call nce_flush(). 17916 */ 17917 /* ARGSUSED */ 17918 static void 17919 ill_set_phys_addr_tail(ipsq_t *ipsq, queue_t *q, mblk_t *addrmp, void *dummy) 17920 { 17921 ill_t *ill = q->q_ptr; 17922 mblk_t *addrmp2 = unlinkb(addrmp); 17923 dl_notify_ind_t *dlindp = (dl_notify_ind_t *)addrmp->b_rptr; 17924 uint_t addrlen, addroff; 17925 int status; 17926 17927 ASSERT(IAM_WRITER_IPSQ(ipsq)); 17928 17929 addroff = dlindp->dl_addr_offset; 17930 addrlen = dlindp->dl_addr_length - ABS(ill->ill_sap_length); 17931 17932 switch (dlindp->dl_data) { 17933 case DL_IPV6_LINK_LAYER_ADDR: 17934 ill_set_ndmp(ill, addrmp, addroff, addrlen); 17935 freemsg(addrmp2); 17936 break; 17937 17938 case DL_CURR_DEST_ADDR: 17939 freemsg(ill->ill_dest_addr_mp); 17940 ill->ill_dest_addr = addrmp->b_rptr + addroff; 17941 ill->ill_dest_addr_mp = addrmp; 17942 if (ill->ill_isv6) { 17943 ill_setdesttoken(ill); 17944 ipif_setdestlinklocal(ill->ill_ipif); 17945 } 17946 freemsg(addrmp2); 17947 break; 17948 17949 case DL_CURR_PHYS_ADDR: 17950 freemsg(ill->ill_phys_addr_mp); 17951 ill->ill_phys_addr = addrmp->b_rptr + addroff; 17952 ill->ill_phys_addr_mp = addrmp; 17953 ill->ill_phys_addr_length = addrlen; 17954 if (ill->ill_isv6) 17955 ill_set_ndmp(ill, addrmp2, addroff, addrlen); 17956 else 17957 freemsg(addrmp2); 17958 if (ill->ill_isv6) { 17959 ill_setdefaulttoken(ill); 17960 ipif_setlinklocal(ill->ill_ipif); 17961 } 17962 break; 17963 default: 17964 ASSERT(0); 17965 } 17966 17967 /* 17968 * reset ILL_DOWN_IN_PROGRESS so that we can successfully add ires 17969 * as we bring the ipifs up again. 
17970 */ 17971 mutex_enter(&ill->ill_lock); 17972 ill->ill_state_flags &= ~ILL_DOWN_IN_PROGRESS; 17973 mutex_exit(&ill->ill_lock); 17974 /* 17975 * If there are ipifs to bring up, ill_up_ipifs() will return 17976 * EINPROGRESS, and ipsq_current_finish() will be called by 17977 * ip_rput_dlpi_writer() or arp_bringup_done() when the last ipif is 17978 * brought up. 17979 */ 17980 status = ill_up_ipifs(ill, q, addrmp); 17981 if (status != EINPROGRESS) 17982 ipsq_current_finish(ipsq); 17983 } 17984 17985 /* 17986 * Helper routine for setting the ill_nd_lla fields. 17987 */ 17988 void 17989 ill_set_ndmp(ill_t *ill, mblk_t *ndmp, uint_t addroff, uint_t addrlen) 17990 { 17991 freemsg(ill->ill_nd_lla_mp); 17992 ill->ill_nd_lla = ndmp->b_rptr + addroff; 17993 ill->ill_nd_lla_mp = ndmp; 17994 ill->ill_nd_lla_len = addrlen; 17995 } 17996 17997 /* 17998 * Replumb the ill. 17999 */ 18000 int 18001 ill_replumb(ill_t *ill, mblk_t *mp) 18002 { 18003 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 18004 18005 ASSERT(IAM_WRITER_IPSQ(ipsq)); 18006 18007 ipsq_current_start(ipsq, ill->ill_ipif, 0); 18008 18009 /* 18010 * If we can quiesce the ill, then continue. If not, then 18011 * ill_replumb_tail() will be called from ipif_ill_refrele_tail(). 18012 */ 18013 ill_down_ipifs(ill, B_FALSE); 18014 18015 mutex_enter(&ill->ill_lock); 18016 if (!ill_is_quiescent(ill)) { 18017 /* call cannot fail since `conn_t *' argument is NULL */ 18018 (void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq, 18019 mp, ILL_DOWN); 18020 mutex_exit(&ill->ill_lock); 18021 return (EINPROGRESS); 18022 } 18023 mutex_exit(&ill->ill_lock); 18024 18025 ill_replumb_tail(ipsq, ill->ill_rq, mp, NULL); 18026 return (0); 18027 } 18028 18029 /* ARGSUSED */ 18030 static void 18031 ill_replumb_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy) 18032 { 18033 ill_t *ill = q->q_ptr; 18034 int err; 18035 conn_t *connp = NULL; 18036 18037 ASSERT(IAM_WRITER_IPSQ(ipsq)); 18038 freemsg(ill->ill_replumb_mp); 18039 ill->ill_replumb_mp = copyb(mp); 18040 18041 if (ill->ill_replumb_mp == NULL) { 18042 /* out of memory */ 18043 ipsq_current_finish(ipsq); 18044 return; 18045 } 18046 18047 mutex_enter(&ill->ill_lock); 18048 ill->ill_up_ipifs = ipsq_pending_mp_add(NULL, ill->ill_ipif, 18049 ill->ill_rq, ill->ill_replumb_mp, 0); 18050 mutex_exit(&ill->ill_lock); 18051 18052 if (!ill->ill_up_ipifs) { 18053 /* already closing */ 18054 ipsq_current_finish(ipsq); 18055 return; 18056 } 18057 ill->ill_replumbing = 1; 18058 err = ill_down_ipifs_tail(ill); 18059 18060 /* 18061 * Successfully quiesced and brought down the interface, now we send 18062 * the DL_NOTE_REPLUMB_DONE message down to the driver. Reuse the 18063 * DL_NOTE_REPLUMB message. 18064 */ 18065 mp = mexchange(NULL, mp, sizeof (dl_notify_conf_t), M_PROTO, 18066 DL_NOTIFY_CONF); 18067 ASSERT(mp != NULL); 18068 ((dl_notify_conf_t *)mp->b_rptr)->dl_notification = 18069 DL_NOTE_REPLUMB_DONE; 18070 ill_dlpi_send(ill, mp); 18071 18072 /* 18073 * For IPv4, we would usually get EINPROGRESS because the ETHERTYPE_ARP 18074 * streams have to be unbound. When all the DLPI exchanges are done, 18075 * ipsq_current_finish() will be called by arp_bringup_done(). The 18076 * remainder of ipif bringup via ill_up_ipifs() will also be done in 18077 * arp_bringup_done(). 
18078 */ 18079 ASSERT(ill->ill_replumb_mp != NULL); 18080 if (err == EINPROGRESS) 18081 return; 18082 else 18083 ill->ill_replumb_mp = ipsq_pending_mp_get(ipsq, &connp); 18084 ASSERT(connp == NULL); 18085 if (err == 0 && ill->ill_replumb_mp != NULL && 18086 ill_up_ipifs(ill, q, ill->ill_replumb_mp) == EINPROGRESS) { 18087 return; 18088 } 18089 ipsq_current_finish(ipsq); 18090 } 18091 18092 /* 18093 * Issue ioctl `cmd' on `lh'; caller provides the initial payload in `buf' 18094 * which is `bufsize' bytes. On success, zero is returned and `buf' updated 18095 * as per the ioctl. On failure, an errno is returned. 18096 */ 18097 static int 18098 ip_ioctl(ldi_handle_t lh, int cmd, void *buf, uint_t bufsize, cred_t *cr) 18099 { 18100 int rval; 18101 struct strioctl iocb; 18102 18103 iocb.ic_cmd = cmd; 18104 iocb.ic_timout = 15; 18105 iocb.ic_len = bufsize; 18106 iocb.ic_dp = buf; 18107 18108 return (ldi_ioctl(lh, I_STR, (intptr_t)&iocb, FKIOCTL, cr, &rval)); 18109 } 18110 18111 /* 18112 * Issue an SIOCGLIFCONF for address family `af' and store the result into a 18113 * dynamically-allocated `lifcp' that will be `bufsizep' bytes on success. 18114 */ 18115 static int 18116 ip_lifconf_ioctl(ldi_handle_t lh, int af, struct lifconf *lifcp, 18117 uint_t *bufsizep, cred_t *cr) 18118 { 18119 int err; 18120 struct lifnum lifn; 18121 18122 bzero(&lifn, sizeof (lifn)); 18123 lifn.lifn_family = af; 18124 lifn.lifn_flags = LIFC_UNDER_IPMP; 18125 18126 if ((err = ip_ioctl(lh, SIOCGLIFNUM, &lifn, sizeof (lifn), cr)) != 0) 18127 return (err); 18128 18129 /* 18130 * Pad the interface count to account for additional interfaces that 18131 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF. 18132 */ 18133 lifn.lifn_count += 4; 18134 bzero(lifcp, sizeof (*lifcp)); 18135 lifcp->lifc_flags = LIFC_UNDER_IPMP; 18136 lifcp->lifc_family = af; 18137 lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq); 18138 lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP); 18139 18140 err = ip_ioctl(lh, SIOCGLIFCONF, lifcp, sizeof (*lifcp), cr); 18141 if (err != 0) { 18142 kmem_free(lifcp->lifc_buf, *bufsizep); 18143 return (err); 18144 } 18145 18146 return (0); 18147 } 18148 18149 /* 18150 * Helper for ip_interface_cleanup() that removes the loopback interface. 18151 */ 18152 static void 18153 ip_loopback_removeif(ldi_handle_t lh, boolean_t isv6, cred_t *cr) 18154 { 18155 int err; 18156 struct lifreq lifr; 18157 18158 bzero(&lifr, sizeof (lifr)); 18159 (void) strcpy(lifr.lifr_name, ipif_loopback_name); 18160 18161 /* 18162 * Attempt to remove the interface. It may legitimately not exist 18163 * (e.g. the zone administrator unplumbed it), so ignore ENXIO. 18164 */ 18165 err = ip_ioctl(lh, SIOCLIFREMOVEIF, &lifr, sizeof (lifr), cr); 18166 if (err != 0 && err != ENXIO) { 18167 ip0dbg(("ip_loopback_removeif: IP%s SIOCLIFREMOVEIF failed: " 18168 "error %d\n", isv6 ? "v6" : "v4", err)); 18169 } 18170 } 18171 18172 /* 18173 * Helper for ip_interface_cleanup() that ensures no IP interfaces are in IPMP 18174 * groups and that IPMP data addresses are down. These conditions must be met 18175 * so that IPMP interfaces can be I_PUNLINK'd, as per ip_sioctl_plink_ipmp(). 18176 */ 18177 static void 18178 ip_ipmp_cleanup(ldi_handle_t lh, boolean_t isv6, cred_t *cr) 18179 { 18180 int af = isv6 ? 
AF_INET6 : AF_INET; 18181 int i, nifs; 18182 int err; 18183 uint_t bufsize; 18184 uint_t lifrsize = sizeof (struct lifreq); 18185 struct lifconf lifc; 18186 struct lifreq *lifrp; 18187 18188 if ((err = ip_lifconf_ioctl(lh, af, &lifc, &bufsize, cr)) != 0) { 18189 cmn_err(CE_WARN, "ip_ipmp_cleanup: cannot get interface list " 18190 "(error %d); any IPMP interfaces cannot be shut down", err); 18191 return; 18192 } 18193 18194 nifs = lifc.lifc_len / lifrsize; 18195 for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) { 18196 err = ip_ioctl(lh, SIOCGLIFFLAGS, lifrp, lifrsize, cr); 18197 if (err != 0) { 18198 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot get " 18199 "flags: error %d", lifrp->lifr_name, err); 18200 continue; 18201 } 18202 18203 if (lifrp->lifr_flags & IFF_IPMP) { 18204 if ((lifrp->lifr_flags & (IFF_UP|IFF_DUPLICATE)) == 0) 18205 continue; 18206 18207 lifrp->lifr_flags &= ~IFF_UP; 18208 err = ip_ioctl(lh, SIOCSLIFFLAGS, lifrp, lifrsize, cr); 18209 if (err != 0) { 18210 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot " 18211 "bring down (error %d); IPMP interface may " 18212 "not be shut down", lifrp->lifr_name, err); 18213 } 18214 18215 /* 18216 * Check if IFF_DUPLICATE is still set -- and if so, 18217 * reset the address to clear it. 18218 */ 18219 err = ip_ioctl(lh, SIOCGLIFFLAGS, lifrp, lifrsize, cr); 18220 if (err != 0 || !(lifrp->lifr_flags & IFF_DUPLICATE)) 18221 continue; 18222 18223 err = ip_ioctl(lh, SIOCGLIFADDR, lifrp, lifrsize, cr); 18224 if (err != 0 || (err = ip_ioctl(lh, SIOCSLIFADDR, 18225 lifrp, lifrsize, cr)) != 0) { 18226 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot " 18227 "reset DAD (error %d); IPMP interface may " 18228 "not be shut down", lifrp->lifr_name, err); 18229 } 18230 continue; 18231 } 18232 18233 if (strchr(lifrp->lifr_name, IPIF_SEPARATOR_CHAR) == NULL) { 18234 lifrp->lifr_groupname[0] = '\0'; 18235 if ((err = ip_ioctl(lh, SIOCSLIFGROUPNAME, lifrp, 18236 lifrsize, cr)) != 0) { 18237 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot " 18238 "leave IPMP group (error %d); associated " 18239 "IPMP interface may not be shut down", 18240 lifrp->lifr_name, err); 18241 continue; 18242 } 18243 } 18244 } 18245 18246 kmem_free(lifc.lifc_buf, bufsize); 18247 } 18248 18249 #define UDPDEV "/devices/pseudo/udp@0:udp" 18250 #define UDP6DEV "/devices/pseudo/udp6@0:udp6" 18251 18252 /* 18253 * Remove the loopback interfaces and prep the IPMP interfaces to be torn down. 18254 * Non-loopback interfaces are either I_LINK'd or I_PLINK'd; the former go away 18255 * when the user-level processes in the zone are killed and the latter are 18256 * cleaned up by str_stack_shutdown(). 18257 */ 18258 void 18259 ip_interface_cleanup(ip_stack_t *ipst) 18260 { 18261 ldi_handle_t lh; 18262 ldi_ident_t li; 18263 cred_t *cr; 18264 int err; 18265 int i; 18266 char *devs[] = { UDP6DEV, UDPDEV }; 18267 netstackid_t stackid = ipst->ips_netstack->netstack_stackid; 18268 18269 if ((err = ldi_ident_from_major(ddi_name_to_major("ip"), &li)) != 0) { 18270 cmn_err(CE_WARN, "ip_interface_cleanup: cannot get ldi ident:" 18271 " error %d", err); 18272 return; 18273 } 18274 18275 cr = zone_get_kcred(netstackid_to_zoneid(stackid)); 18276 ASSERT(cr != NULL); 18277 18278 /* 18279 * NOTE: loop executes exactly twice and is hardcoded to know that the 18280 * first iteration is IPv6. (Unrolling yields repetitious code, hence 18281 * the loop.)
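 * devs[0] is UDP6DEV, so the `i == 0' arguments passed below mean
 * isv6 == B_TRUE on the first pass.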
18282 */ 18283 for (i = 0; i < 2; i++) { 18284 err = ldi_open_by_name(devs[i], FREAD|FWRITE, cr, &lh, li); 18285 if (err != 0) { 18286 cmn_err(CE_WARN, "ip_interface_cleanup: cannot open %s:" 18287 " error %d", devs[i], err); 18288 continue; 18289 } 18290 18291 ip_loopback_removeif(lh, i == 0, cr); 18292 ip_ipmp_cleanup(lh, i == 0, cr); 18293 18294 (void) ldi_close(lh, FREAD|FWRITE, cr); 18295 } 18296 18297 ldi_ident_release(li); 18298 crfree(cr); 18299 } 18300 18301 /* 18302 * This needs to be in-sync with nic_event_t definition 18303 */ 18304 static const char * 18305 ill_hook_event2str(nic_event_t event) 18306 { 18307 switch (event) { 18308 case NE_PLUMB: 18309 return ("PLUMB"); 18310 case NE_UNPLUMB: 18311 return ("UNPLUMB"); 18312 case NE_UP: 18313 return ("UP"); 18314 case NE_DOWN: 18315 return ("DOWN"); 18316 case NE_ADDRESS_CHANGE: 18317 return ("ADDRESS_CHANGE"); 18318 case NE_LIF_UP: 18319 return ("LIF_UP"); 18320 case NE_LIF_DOWN: 18321 return ("LIF_DOWN"); 18322 case NE_IFINDEX_CHANGE: 18323 return ("IFINDEX_CHANGE"); 18324 default: 18325 return ("UNKNOWN"); 18326 } 18327 } 18328 18329 void 18330 ill_nic_event_dispatch(ill_t *ill, lif_if_t lif, nic_event_t event, 18331 nic_event_data_t data, size_t datalen) 18332 { 18333 ip_stack_t *ipst = ill->ill_ipst; 18334 hook_nic_event_int_t *info; 18335 const char *str = NULL; 18336 18337 /* create a new nic event info */ 18338 if ((info = kmem_alloc(sizeof (*info), KM_NOSLEEP)) == NULL) 18339 goto fail; 18340 18341 info->hnei_event.hne_nic = ill->ill_phyint->phyint_ifindex; 18342 info->hnei_event.hne_lif = lif; 18343 info->hnei_event.hne_event = event; 18344 info->hnei_event.hne_protocol = ill->ill_isv6 ? 18345 ipst->ips_ipv6_net_data : ipst->ips_ipv4_net_data; 18346 info->hnei_event.hne_data = NULL; 18347 info->hnei_event.hne_datalen = 0; 18348 info->hnei_stackid = ipst->ips_netstack->netstack_stackid; 18349 18350 if (data != NULL && datalen != 0) { 18351 info->hnei_event.hne_data = kmem_alloc(datalen, KM_NOSLEEP); 18352 if (info->hnei_event.hne_data == NULL) 18353 goto fail; 18354 bcopy(data, info->hnei_event.hne_data, datalen); 18355 info->hnei_event.hne_datalen = datalen; 18356 } 18357 18358 if (ddi_taskq_dispatch(eventq_queue_nic, ip_ne_queue_func, info, 18359 DDI_NOSLEEP) == DDI_SUCCESS) 18360 return; 18361 18362 fail: 18363 if (info != NULL) { 18364 if (info->hnei_event.hne_data != NULL) { 18365 kmem_free(info->hnei_event.hne_data, 18366 info->hnei_event.hne_datalen); 18367 } 18368 kmem_free(info, sizeof (hook_nic_event_t)); 18369 } 18370 str = ill_hook_event2str(event); 18371 ip2dbg(("ill_nic_event_dispatch: could not dispatch %s nic event " 18372 "information for %s (ENOMEM)\n", str, ill->ill_name)); 18373 } 18374 18375 static int 18376 ipif_arp_up_done_tail(ipif_t *ipif, enum ip_resolver_action res_act) 18377 { 18378 int err = 0; 18379 const in_addr_t *addr = NULL; 18380 nce_t *nce = NULL; 18381 ill_t *ill = ipif->ipif_ill; 18382 ill_t *bound_ill; 18383 boolean_t added_ipif = B_FALSE; 18384 uint16_t state; 18385 uint16_t flags; 18386 18387 DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_up_done_tail", 18388 ill_t *, ill, ipif_t *, ipif); 18389 if (ipif->ipif_lcl_addr != INADDR_ANY) { 18390 addr = &ipif->ipif_lcl_addr; 18391 } 18392 18393 if ((ipif->ipif_flags & IPIF_UNNUMBERED) || addr == NULL) { 18394 if (res_act != Res_act_initial) 18395 return (EINVAL); 18396 } 18397 18398 if (addr != NULL) { 18399 ipmp_illgrp_t *illg = ill->ill_grp; 18400 18401 /* add unicast nce for the local addr */ 18402 18403 if (IS_IPMP(ill)) { 18404 /* 
18405 * If we're here via ipif_up(), then the ipif 18406 * won't be bound yet -- add it to the group, 18407 * which will bind it if possible. (We would 18408 * add it in ipif_up(), but deleting on failure 18409 * there is gruesome.) If we're here via 18410 * ipmp_ill_bind_ipif(), then the ipif has 18411 * already been added to the group and we 18412 * just need to use the binding. 18413 */ 18414 if ((bound_ill = ipmp_ipif_bound_ill(ipif)) == NULL) { 18415 bound_ill = ipmp_illgrp_add_ipif(illg, ipif); 18416 if (bound_ill == NULL) { 18417 /* 18418 * We couldn't bind the ipif to an ill 18419 * yet, so we have nothing to publish. 18420 * Mark the address as ready and return. 18421 */ 18422 ipif->ipif_addr_ready = 1; 18423 return (0); 18424 } 18425 added_ipif = B_TRUE; 18426 } 18427 } else { 18428 bound_ill = ill; 18429 } 18430 18431 flags = (NCE_F_MYADDR | NCE_F_PUBLISH | NCE_F_AUTHORITY | 18432 NCE_F_NONUD); 18433 /* 18434 * If this is an initial bring-up (or the ipif was never 18435 * completely brought up), do DAD. Otherwise, we're here 18436 * because IPMP has rebound an address to this ill: send 18437 * unsolicited advertisements (ARP announcements) to 18438 * inform others. 18439 */ 18440 if (res_act == Res_act_initial || !ipif->ipif_addr_ready) { 18441 state = ND_UNCHANGED; /* compute in nce_add_common() */ 18442 } else { 18443 state = ND_REACHABLE; 18444 flags |= NCE_F_UNSOL_ADV; 18445 } 18446 18447 retry: 18448 err = nce_lookup_then_add_v4(ill, 18449 bound_ill->ill_phys_addr, bound_ill->ill_phys_addr_length, 18450 addr, flags, state, &nce); 18451 18452 /* 18453 * note that we may encounter EEXIST if we are moving 18454 * the nce as a result of a rebind operation. 18455 */ 18456 switch (err) { 18457 case 0: 18458 ipif->ipif_added_nce = 1; 18459 nce->nce_ipif_cnt++; 18460 break; 18461 case EEXIST: 18462 ip1dbg(("ipif_arp_up: NCE already exists for %s\n", 18463 ill->ill_name)); 18464 if (!NCE_MYADDR(nce->nce_common)) { 18465 /* 18466 * A leftover nce from before this address 18467 * existed 18468 */ 18469 ncec_delete(nce->nce_common); 18470 nce_refrele(nce); 18471 nce = NULL; 18472 goto retry; 18473 } 18474 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 18475 nce_refrele(nce); 18476 nce = NULL; 18477 ip1dbg(("ipif_arp_up: NCE already exists " 18478 "for %s:%u\n", ill->ill_name, 18479 ipif->ipif_id)); 18480 goto arp_up_done; 18481 } 18482 /* 18483 * Duplicate local addresses are permissible for 18484 * IPIF_POINTOPOINT interfaces which will get marked 18485 * IPIF_UNNUMBERED later in 18486 * ip_addr_availability_check(). 18487 * 18488 * The nce_ipif_cnt field tracks the number of 18489 * ipifs that have nce_addr as their local address. 18490 */ 18491 ipif->ipif_addr_ready = 1; 18492 ipif->ipif_added_nce = 1; 18493 nce->nce_ipif_cnt++; 18494 err = 0; 18495 break; 18496 default: 18497 ASSERT(nce == NULL); 18498 goto arp_up_done; 18499 } 18500 if (arp_no_defense) { 18501 if ((ipif->ipif_flags & IPIF_UP) && 18502 !ipif->ipif_addr_ready) 18503 ipif_up_notify(ipif); 18504 ipif->ipif_addr_ready = 1; 18505 } 18506 } else { 18507 /* zero address. 
18518 int
18519 ipif_arp_up(ipif_t *ipif, enum ip_resolver_action res_act, boolean_t was_dup)
18520 {
18521 	int err = 0;
18522 	ill_t *ill = ipif->ipif_ill;
18523 	boolean_t first_interface, wait_for_dlpi = B_FALSE;
18524 
18525 	DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_up",
18526 	    ill_t *, ill, ipif_t *, ipif);
18527 
18528 	/*
18529 	 * We need to bring up ARP or set up multicast mapping only
18530 	 * when the first interface is coming UP.
18531 	 */
18532 	first_interface = (ill->ill_ipif_up_count == 0 &&
18533 	    ill->ill_ipif_dup_count == 0 && !was_dup);
18534 
18535 	if (res_act == Res_act_initial && first_interface) {
18536 		/*
18537 		 * Send ATTACH + BIND.
18538 		 */
18539 		err = arp_ll_up(ill);
18540 		if (err != EINPROGRESS && err != 0)
18541 			return (err);
18542 
18543 		/*
18544 		 * Add an NCE for the local address and start DAD.
18545 		 * We'll wait to hear that DAD has finished
18546 		 * before using the interface.
18547 		 */
18548 		if (err == EINPROGRESS)
18549 			wait_for_dlpi = B_TRUE;
18550 	}
18551 
18552 	if (!wait_for_dlpi)
18553 		(void) ipif_arp_up_done_tail(ipif, res_act);
18554 
18555 	return (!wait_for_dlpi ? 0 : EINPROGRESS);
18556 }
18557 
18558 /*
18559  * Finish processing of "arp_up" after all the DLPI message
18560  * exchanges have completed between arp and the driver.
18561  */
18562 void
18563 arp_bringup_done(ill_t *ill, int err)
18564 {
18565 	mblk_t *mp1;
18566 	ipif_t *ipif;
18567 	conn_t *connp = NULL;
18568 	ipsq_t *ipsq;
18569 	queue_t *q;
18570 
18571 	ip1dbg(("arp_bringup_done(%s)\n", ill->ill_name));
18572 
18573 	ASSERT(IAM_WRITER_ILL(ill));
18574 
18575 	ipsq = ill->ill_phyint->phyint_ipsq;
18576 	ipif = ipsq->ipsq_xop->ipx_pending_ipif;
18577 	mp1 = ipsq_pending_mp_get(ipsq, &connp);
18578 	ASSERT(!((mp1 != NULL) ^ (ipif != NULL)));
18579 	if (mp1 == NULL) /* bringup was aborted by the user */
18580 		return;
18581 
18582 	/*
18583 	 * If an IOCTL is waiting on this (ipsq_current_ioctl != 0), then we
18584 	 * must have an associated conn_t.  Otherwise, we're bringing this
18585 	 * interface back up as part of handling an asynchronous event (e.g.,
18586 	 * physical address change).
18587 	 */
18588 	if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
18589 		ASSERT(connp != NULL);
18590 		q = CONNP_TO_WQ(connp);
18591 	} else {
18592 		ASSERT(connp == NULL);
18593 		q = ill->ill_rq;
18594 	}
18595 	if (err == 0) {
18596 		if (ipif->ipif_isv6) {
18597 			if ((err = ipif_up_done_v6(ipif)) != 0)
18598 				ip0dbg(("arp_bringup_done: init failed\n"));
18599 		} else {
18600 			err = ipif_arp_up_done_tail(ipif, Res_act_initial);
18601 			if (err != 0 ||
18602 			    (err = ipif_up_done(ipif)) != 0) {
18603 				ip0dbg(("arp_bringup_done: "
18604 				    "init failed err %x\n", err));
18605 				(void) ipif_arp_down(ipif);
18606 			}
18608 		}
18609 	} else {
18610 		ip0dbg(("arp_bringup_done: DL_BIND_REQ failed\n"));
18611 	}
18612 
18613 	if ((err == 0) && (ill->ill_up_ipifs)) {
18614 		err = ill_up_ipifs(ill, q, mp1);
18615 		if (err == EINPROGRESS)
18616 			return;
18617 	}
18618 
18619 	/*
18620 	 * If we have a moved ipif to bring up, and everything has succeeded
18621 	 * to this point, bring it up on the IPMP ill.  Otherwise, leave it
18622 	 * down -- the admin can try to bring it up by hand if need be.
18623 	 */
18624 	if (ill->ill_move_ipif != NULL) {
18625 		ipif = ill->ill_move_ipif;
18626 		ip1dbg(("bringing up ipif %p on ill %s\n", (void *)ipif,
18627 		    ipif->ipif_ill->ill_name));
18628 		ill->ill_move_ipif = NULL;
18629 		if (err == 0) {
18630 			err = ipif_up(ipif, q, mp1);
18631 			if (err == EINPROGRESS)
18632 				return;
18633 		}
18634 	}
18635 
18636 	/*
18637 	 * The operation must complete without EINPROGRESS since
18638 	 * ipsq_pending_mp_get() has removed the mblk from ipsq_pending_mp.
18639 	 * Otherwise, the operation will be stuck forever in the ipsq.
18640 	 */
18641 	ASSERT(err != EINPROGRESS);
18642 	if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
18643 		DTRACE_PROBE4(ipif__ioctl, char *, "arp_bringup_done finish",
18644 		    int, ipsq->ipsq_xop->ipx_current_ioctl,
18645 		    ill_t *, ill, ipif_t *, ipif);
18646 		ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq);
18647 	} else {
18648 		ipsq_current_finish(ipsq);
18649 	}
18650 }
18651 
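/*
 * Editorial sketch: arp_bringup_done() above and arp_replumb_done() below
 * share the same ipsq completion skeleton.  Schematically, using only the
 * interfaces that appear in those functions:
 *
 *	ipsq = ill->ill_phyint->phyint_ipsq;
 *	ipif = ipsq->ipsq_xop->ipx_pending_ipif;
 *	mp1 = ipsq_pending_mp_get(ipsq, &connp);
 *	if (mp1 == NULL)
 *		return;			// operation aborted by the user
 *	... do the completion work; if it returns EINPROGRESS a later
 *	... callback will finish the job, so simply return ...
 *	ASSERT(err != EINPROGRESS);	// mblk is already off ipsq_pending_mp
 *	if (ipsq->ipsq_xop->ipx_current_ioctl != 0)
 *		ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq);
 *	else
 *		ipsq_current_finish(ipsq);
 */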
18652 /*
18653  * Finish processing of "arp_replumb" after all the DLPI message
18654  * exchanges have completed between arp and the driver.
18655  */
18656 void
18657 arp_replumb_done(ill_t *ill, int err)
18658 {
18659 	mblk_t *mp1;
18660 	ipif_t *ipif;
18661 	conn_t *connp = NULL;
18662 	ipsq_t *ipsq;
18663 	queue_t *q;
18664 
18665 	ASSERT(IAM_WRITER_ILL(ill));
18666 
18667 	ipsq = ill->ill_phyint->phyint_ipsq;
18668 	ipif = ipsq->ipsq_xop->ipx_pending_ipif;
18669 	mp1 = ipsq_pending_mp_get(ipsq, &connp);
18670 	ASSERT(!((mp1 != NULL) ^ (ipif != NULL)));
18671 	if (mp1 == NULL) {
18672 		ip0dbg(("arp_replumb_done: bringup aborted ioctl %x\n",
18673 		    ipsq->ipsq_xop->ipx_current_ioctl));
18674 		/* bringup was aborted by the user */
18675 		return;
18676 	}
18677 	/*
18678 	 * If an IOCTL is waiting on this (ipsq_current_ioctl != 0), then we
18679 	 * must have an associated conn_t.  Otherwise, we're bringing this
18680 	 * interface back up as part of handling an asynchronous event (e.g.,
18681 	 * physical address change).
18682 	 */
18683 	if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
18684 		ASSERT(connp != NULL);
18685 		q = CONNP_TO_WQ(connp);
18686 	} else {
18687 		ASSERT(connp == NULL);
18688 		q = ill->ill_rq;
18689 	}
18690 	if ((err == 0) && (ill->ill_up_ipifs)) {
18691 		err = ill_up_ipifs(ill, q, mp1);
18692 		if (err == EINPROGRESS)
18693 			return;
18694 	}
18695 	/*
18696 	 * The operation must complete without EINPROGRESS since
18697 	 * ipsq_pending_mp_get() has removed the mblk from ipsq_pending_mp.
18698 	 * Otherwise, the operation will be stuck forever in the ipsq.
18699 	 */
18700 	ASSERT(err != EINPROGRESS);
18701 	if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
18702 		DTRACE_PROBE4(ipif__ioctl, char *,
18703 		    "arp_replumb_done finish",
18704 		    int, ipsq->ipsq_xop->ipx_current_ioctl,
18705 		    ill_t *, ill, ipif_t *, ipif);
18706 		ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq);
18707 	} else {
18708 		ipsq_current_finish(ipsq);
18709 	}
18710 }
18711 
/*
 * Notify interested consumers (routing sockets, SCTP, and the network
 * event hooks) that this ipif is now up.
 */
18712 void
18713 ipif_up_notify(ipif_t *ipif)
18714 {
18715 	ip_rts_ifmsg(ipif, RTSQ_DEFAULT);
18716 	ip_rts_newaddrmsg(RTM_ADD, 0, ipif, RTSQ_DEFAULT);
18717 	sctp_update_ipif(ipif, SCTP_IPIF_UP);
18718 	ill_nic_event_dispatch(ipif->ipif_ill, MAP_IPIF_ID(ipif->ipif_id),
18719 	    NE_LIF_UP, NULL, 0);
18720 }
18721 
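/*
 * Editorial note: the messages generated by ipif_up_notify() above are
 * visible to userland routing-socket listeners.  A minimal, hypothetical
 * consumer (standard PF_ROUTE interfaces; not part of this file):
 *
 *	int s = socket(PF_ROUTE, SOCK_RAW, AF_UNSPEC);
 *	char buf[2048];
 *	ssize_t n;
 *
 *	while ((n = read(s, buf, sizeof (buf))) > 0) {
 *		struct rt_msghdr *rtm = (struct rt_msghdr *)buf;
 *
 *		if (rtm->rtm_type == RTM_IFINFO ||
 *		    rtm->rtm_type == RTM_NEWADDR)
 *			break;		// interface/address came up
 *	}
 */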
18722 /*
18723  * The ILB ioctl uses cv_wait (e.g., when deleting a rule or adding a
18724  * server), so it assumes the calling context is cv_wait'able.  Hence it
18725  * shouldn't be used on TPI end points with STREAMS modules pushed above;
18726  * this is assured by not setting the IPI_MODOK flag.  IP also ensures the
18727  * ILB ioctl never ends up on an ipsq; otherwise we might process it while
18728  * unwinding from the ipsq, and that could be a thread from the bottom.
18729  */
18730 /* ARGSUSED */
18731 int
18732 ip_sioctl_ilb_cmd(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
18733     ip_ioctl_cmd_t *ipip, void *arg)
18734 {
18735 	mblk_t *cmd_mp = mp->b_cont->b_cont;
18736 	ilb_cmd_t command = *((ilb_cmd_t *)cmd_mp->b_rptr);
18737 	int ret = 0;
18738 	int i;
18739 	size_t size;
18740 	ip_stack_t *ipst;
18741 	zoneid_t zoneid;
18742 	ilb_stack_t *ilbs;
18743 
18744 	ipst = CONNQ_TO_IPST(q);
18745 	ilbs = ipst->ips_netstack->netstack_ilb;
18746 	zoneid = Q_TO_CONN(q)->conn_zoneid;
18747 
18748 	switch (command) {
18749 	case ILB_CREATE_RULE: {
18750 		ilb_rule_cmd_t *cmd = (ilb_rule_cmd_t *)cmd_mp->b_rptr;
18751 
18752 		if (MBLKL(cmd_mp) != sizeof (ilb_rule_cmd_t)) {
18753 			ret = EINVAL;
18754 			break;
18755 		}
18756 
18757 		ret = ilb_rule_add(ilbs, zoneid, cmd);
18758 		break;
18759 	}
18760 	case ILB_DESTROY_RULE:
18761 	case ILB_ENABLE_RULE:
18762 	case ILB_DISABLE_RULE: {
18763 		ilb_name_cmd_t *cmd = (ilb_name_cmd_t *)cmd_mp->b_rptr;
18764 
18765 		if (MBLKL(cmd_mp) != sizeof (ilb_name_cmd_t)) {
18766 			ret = EINVAL;
18767 			break;
18768 		}
18769 
18770 		if (cmd->flags & ILB_RULE_ALLRULES) {
18771 			if (command == ILB_DESTROY_RULE) {
18772 				ilb_rule_del_all(ilbs, zoneid);
18773 				break;
18774 			} else if (command == ILB_ENABLE_RULE) {
18775 				ilb_rule_enable_all(ilbs, zoneid);
18776 				break;
18777 			} else if (command == ILB_DISABLE_RULE) {
18778 				ilb_rule_disable_all(ilbs, zoneid);
18779 				break;
18780 			}
18781 		} else {
18782 			if (command == ILB_DESTROY_RULE) {
18783 				ret = ilb_rule_del(ilbs, zoneid, cmd->name);
18784 			} else if (command == ILB_ENABLE_RULE) {
18785 				ret = ilb_rule_enable(ilbs, zoneid, cmd->name,
18786 				    NULL);
18787 			} else if (command == ILB_DISABLE_RULE) {
18788 				ret = ilb_rule_disable(ilbs, zoneid, cmd->name,
18789 				    NULL);
18790 			}
18791 		}
18792 		break;
18793 	}
18794 	case ILB_NUM_RULES: {
18795 		ilb_num_rules_cmd_t *cmd;
18796 
18797 		if (MBLKL(cmd_mp) != sizeof (ilb_num_rules_cmd_t)) {
18798 			ret = EINVAL;
18799 			break;
18800 		}
18801 		cmd = (ilb_num_rules_cmd_t *)cmd_mp->b_rptr;
18802 		ilb_get_num_rules(ilbs, zoneid, &(cmd->num));
18803 		break;
18804 	}
18805 	case ILB_RULE_NAMES: {
18806 		ilb_rule_names_cmd_t *cmd;
18807 
18808 		cmd = (ilb_rule_names_cmd_t *)cmd_mp->b_rptr;
18809 		if (MBLKL(cmd_mp) < sizeof (ilb_rule_names_cmd_t) ||
18810 		    cmd->num_names == 0) {
18811 			ret = EINVAL;
18812 			break;
18813 		}
18814 		size = cmd->num_names * ILB_RULE_NAMESZ;
18815 		if (cmd_mp->b_rptr + offsetof(ilb_rule_names_cmd_t, buf) +
18816 		    size != cmd_mp->b_wptr) {
18817 			ret = EINVAL;
18818 			break;
18819 		}
18820 		ilb_get_rulenames(ilbs, zoneid, &cmd->num_names, cmd->buf);
18821 		break;
18822 	}
18823 	case ILB_NUM_SERVERS: {
18824 		ilb_num_servers_cmd_t *cmd;
18825 
18826 		if (MBLKL(cmd_mp) != sizeof (ilb_num_servers_cmd_t)) {
18827 			ret = EINVAL;
18828 			break;
18829 		}
18830 		cmd = (ilb_num_servers_cmd_t *)cmd_mp->b_rptr;
18831 		ret = ilb_get_num_servers(ilbs, zoneid, cmd->name,
18832 		    &(cmd->num));
18833 		break;
18834 	}
18835 	case ILB_LIST_RULE: {
18836 		ilb_rule_cmd_t *cmd = (ilb_rule_cmd_t *)cmd_mp->b_rptr;
18837 
18838 		if (MBLKL(cmd_mp) != sizeof (ilb_rule_cmd_t)) {
18839 			ret = EINVAL;
18840 			break;
18841 		}
18842 		ret = ilb_rule_list(ilbs, zoneid, cmd);
18843 		break;
18844 	}
18845 	case ILB_LIST_SERVERS: {
18846 		ilb_servers_info_cmd_t *cmd;
18847 
18848 		cmd = (ilb_servers_info_cmd_t *)cmd_mp->b_rptr;
18849 		if (MBLKL(cmd_mp) < sizeof (ilb_servers_info_cmd_t) ||
18850 		    cmd->num_servers == 0) {
18851 			ret = EINVAL;
18852 			break;
18853 		}
18854 		size = cmd->num_servers * sizeof (ilb_server_info_t);
18855 		if (cmd_mp->b_rptr + offsetof(ilb_servers_info_cmd_t, servers) +
18856 		    size != cmd_mp->b_wptr) {
18857 			ret = EINVAL;
18858 			break;
18859 		}
18860 
18861 		ret = ilb_get_servers(ilbs, zoneid, cmd->name, cmd->servers,
18862 		    &cmd->num_servers);
18863 		break;
18864 	}
18865 	case ILB_ADD_SERVERS: {
18866 		ilb_servers_info_cmd_t *cmd;
18867 		ilb_rule_t *rule;
18868 
18869 		cmd = (ilb_servers_info_cmd_t *)cmd_mp->b_rptr;
18870 		if (MBLKL(cmd_mp) < sizeof (ilb_servers_info_cmd_t)) {
18871 			ret = EINVAL;
18872 			break;
18873 		}
18874 		size = cmd->num_servers * sizeof (ilb_server_info_t);
18875 		if (cmd_mp->b_rptr + offsetof(ilb_servers_info_cmd_t, servers) +
18876 		    size != cmd_mp->b_wptr) {
18877 			ret = EINVAL;
18878 			break;
18879 		}
18880 		rule = ilb_find_rule(ilbs, zoneid, cmd->name, &ret);
18881 		if (rule == NULL) {
18882 			ASSERT(ret != 0);
18883 			break;
18884 		}
18885 		for (i = 0; i < cmd->num_servers; i++) {
18886 			ilb_server_info_t *s;
18887 
18888 			s = &cmd->servers[i];
18889 			s->err = ilb_server_add(ilbs, rule, s);
18890 		}
18891 		ILB_RULE_REFRELE(rule);
18892 		break;
18893 	}
18894 	case ILB_DEL_SERVERS:
18895 	case ILB_ENABLE_SERVERS:
18896 	case ILB_DISABLE_SERVERS: {
18897 		ilb_servers_cmd_t *cmd;
18898 		ilb_rule_t *rule;
18899 		int (*f)();
18900 
18901 		cmd = (ilb_servers_cmd_t *)cmd_mp->b_rptr;
18902 		if (MBLKL(cmd_mp) < sizeof (ilb_servers_cmd_t)) {
18903 			ret = EINVAL;
18904 			break;
18905 		}
18906 		size = cmd->num_servers * sizeof (ilb_server_arg_t);
18907 		if (cmd_mp->b_rptr + offsetof(ilb_servers_cmd_t, servers) +
18908 		    size != cmd_mp->b_wptr) {
18909 			ret = EINVAL;
18910 			break;
18911 		}
18912 
18913 		if (command == ILB_DEL_SERVERS)
18914 			f = ilb_server_del;
18915 		else if (command == ILB_ENABLE_SERVERS)
18916 			f = ilb_server_enable;
18917 		else if (command == ILB_DISABLE_SERVERS)
18918 			f = ilb_server_disable;
18919 
18920 		rule = ilb_find_rule(ilbs, zoneid, cmd->name, &ret);
18921 		if (rule == NULL) {
18922 			ASSERT(ret != 0);
18923 			break;
18924 		}
18925 
18926 		for (i = 0; i < cmd->num_servers; i++) {
18927 			ilb_server_arg_t *s;
18928 
18929 			s = &cmd->servers[i];
18930 			s->err = f(ilbs, zoneid, NULL, rule, &s->addr);
18931 		}
18932 		ILB_RULE_REFRELE(rule);
18933 		break;
18934 	}
18935 	case ILB_LIST_NAT_TABLE: {
18936 		ilb_list_nat_cmd_t *cmd;
18937 
18938 		cmd = (ilb_list_nat_cmd_t *)cmd_mp->b_rptr;
18939 		if (MBLKL(cmd_mp) < sizeof (ilb_list_nat_cmd_t)) {
18940 			ret = EINVAL;
18941 			break;
18942 		}
18943 		size = cmd->num_nat * sizeof (ilb_nat_entry_t);
18944 		if (cmd_mp->b_rptr + offsetof(ilb_list_nat_cmd_t, entries) +
18945 		    size != cmd_mp->b_wptr) {
18946 			ret = EINVAL;
18947 			break;
18948 		}
18949 
18950 		ret = ilb_list_nat(ilbs, zoneid, cmd->entries, &cmd->num_nat,
18951 		    &cmd->flags);
18952 		break;
18953 	}
18954 	case ILB_LIST_STICKY_TABLE: {
18955 		ilb_list_sticky_cmd_t *cmd;
18956 
18957 		cmd = (ilb_list_sticky_cmd_t *)cmd_mp->b_rptr;
18958 		if (MBLKL(cmd_mp) < sizeof (ilb_list_sticky_cmd_t)) {
18959 			ret = EINVAL;
18960 			break;
18961 		}
18962 		size = cmd->num_sticky * sizeof (ilb_sticky_entry_t);
18963 		if (cmd_mp->b_rptr + offsetof(ilb_list_sticky_cmd_t, entries) +
18964 		    size != cmd_mp->b_wptr) {
18965 			ret = EINVAL;
18966 			break;
18967 		}
18968 
18969 		ret = ilb_list_sticky(ilbs, zoneid, cmd->entries,
18970 		    &cmd->num_sticky, &cmd->flags);
18971 		break;
18972 	}
18973 	default:
18974 		ret = EINVAL;
18975 		break;
18976 	}
18978 	return (ret);
18979 }
18980 
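/*
 * Editorial sketch: the variable-length ILB commands above all validate
 * the ioctl mblk the same way before touching the trailing array.  With
 * a hypothetical command layout of { header fields; uint32_t num;
 * entry_t entries[]; } the pattern is:
 *
 *	cmd = (cmd_t *)cmd_mp->b_rptr;
 *	if (MBLKL(cmd_mp) < sizeof (cmd_t) || cmd->num == 0)
 *		return (EINVAL);
 *	size = cmd->num * sizeof (entry_t);
 *	if (cmd_mp->b_rptr + offsetof(cmd_t, entries) + size !=
 *	    cmd_mp->b_wptr)
 *		return (EINVAL);
 *
 * i.e., the fixed header must be present and the declared entry count
 * must account for exactly the rest of the message, which guards against
 * both short reads and writes past the end of the mblk.
 */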
18981 /* Remove all cache entries for this logical interface. */
18982 void
18983 ipif_nce_down(ipif_t *ipif)
18984 {
18985 	ill_t *ill = ipif->ipif_ill;
18986 	nce_t *nce;
18987 
18988 	DTRACE_PROBE3(ipif__downup, char *, "ipif_nce_down",
18989 	    ill_t *, ill, ipif_t *, ipif);
18990 	if (ipif->ipif_added_nce) {
18991 		if (ipif->ipif_isv6)
18992 			nce = nce_lookup_v6(ill, &ipif->ipif_v6lcl_addr);
18993 		else
18994 			nce = nce_lookup_v4(ill, &ipif->ipif_lcl_addr);
18995 		if (nce != NULL) {
18996 			if (--nce->nce_ipif_cnt == 0)
18997 				ncec_delete(nce->nce_common);
18998 			ipif->ipif_added_nce = 0;
18999 			nce_refrele(nce);
19000 		} else {
19001 			/*
19002 			 * nce may already be NULL because it was already
19003 			 * flushed, e.g., due to a call to nce_flush().
19004 			 */
19005 			ipif->ipif_added_nce = 0;
19006 		}
19007 	}
19008 	/*
19009 	 * Make IPMP aware of the deleted data address.
19010 	 */
19011 	if (IS_IPMP(ill))
19012 		ipmp_illgrp_del_ipif(ill->ill_grp, ipif);
19013 
19014 	/*
19015 	 * Remove all other nces dependent on this ill when the last ipif
19016 	 * is going away.
19017 	 */
19018 	if (ill->ill_ipif_up_count == 0) {
19019 		ncec_walk(ill, (pfi_t)ncec_delete_per_ill,
19020 		    (uchar_t *)ill, ill->ill_ipst);
19021 		if (IS_UNDER_IPMP(ill))
19022 			nce_flush(ill, B_TRUE);
19023 	}
19024 }
19025 
19026 /*
19027  * Find the first interface that uses usill for its source address.
19028  */
19029 ill_t *
19030 ill_lookup_usesrc(ill_t *usill)
19031 {
19032 	ip_stack_t *ipst;
19033 	ill_t *ill;
19034 
19035 	ASSERT(usill != NULL);
19036 	ipst = usill->ill_ipst;
19037 	/* ill_g_usesrc_lock protects ill_usesrc_grp_next */
19038 	rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER);
19039 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
19040 	for (ill = usill->ill_usesrc_grp_next; ill != NULL && ill != usill;
19041 	    ill = ill->ill_usesrc_grp_next) {
19042 		if (!IS_UNDER_IPMP(ill) && (ill->ill_flags & ILLF_MULTICAST) &&
19043 		    !ILL_IS_CONDEMNED(ill)) {
19044 			ill_refhold(ill);
19045 			break;
19046 		}
19047 	}
19048 	rw_exit(&ipst->ips_ill_g_lock);
19049 	rw_exit(&ipst->ips_ill_g_usesrc_lock);
19050 	return (ill);
19051 }
19052 
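/*
 * Editorial note: ill_lookup_usesrc() returns its result with a reference
 * held (ill_refhold() above), so a caller must release it when done.
 * Sketch:
 *
 *	ill_t *ill = ill_lookup_usesrc(usill);
 *
 *	if (ill != NULL) {
 *		... use ill ...
 *		ill_refrele(ill);
 *	}
 */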
19053 /*
19054  * This comment applies to both ip_sioctl_get_ifhwaddr and
19055  * ip_sioctl_get_lifhwaddr, since both perform the same basic function.
19056  *
19057  * The goal here is to find an IP interface that corresponds to the name
19058  * provided by the caller in the ifreq/lifreq structure held in the mblk_t
19059  * chain and to fill out a sockaddr/sockaddr_storage structure with the
19060  * mac address.
19061  *
19062  * The SIOCGIFHWADDR/SIOCGLIFHWADDR ioctl may return an error for a number
19063  * of different reasons:
19064  * ENXIO - the device name is not known to IP.
19065  * EADDRNOTAVAIL - the device has no hardware address.  This is indicated
19066  * by ill_phys_addr not pointing to an actual address.
19067  * EPFNOSUPPORT - the request is for a mac address that will not fit in
19068  * the data structure supplied (struct sockaddr).
19069  */
19073 /* ARGSUSED */
19074 int
19075 ip_sioctl_get_ifhwaddr(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
19076     ip_ioctl_cmd_t *ipip, void *if_req)
19077 {
19078 	struct sockaddr *sock;
19079 	struct ifreq *ifr;
19080 	mblk_t *mp1;
19081 	ill_t *ill;
19082 
19083 	ASSERT(ipif != NULL);
19084 	ill = ipif->ipif_ill;
19085 
19086 	if (ill->ill_phys_addr == NULL) {
19087 		return (EADDRNOTAVAIL);
19088 	}
19089 	if (ill->ill_phys_addr_length > sizeof (sock->sa_data)) {
19090 		return (EPFNOSUPPORT);
19091 	}
19092 
19093 	ip1dbg(("ip_sioctl_get_ifhwaddr(%s)\n", ill->ill_name));
19094 
19095 	/* Existence of mp1 has been checked in ip_wput_nondata */
19096 	mp1 = mp->b_cont->b_cont;
19097 	ifr = (struct ifreq *)mp1->b_rptr;
19098 
19099 	sock = &ifr->ifr_addr;
19100 	/*
19101 	 * The "family" field in the returned structure is set to a value
19102 	 * that represents the type of device to which the address belongs.
19103 	 * The value returned may differ from that on Linux, but it will
19104 	 * still represent the correct symbol on Solaris.
19105 	 */
19106 	sock->sa_family = arp_hw_type(ill->ill_mactype);
19107 	bcopy(ill->ill_phys_addr, &sock->sa_data, ill->ill_phys_addr_length);
19108 
19109 	return (0);
19110 }
19111 
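/*
 * Editorial sketch: a hypothetical userland consumer of SIOCGIFHWADDR
 * ("net0" is an assumed interface name):
 *
 *	struct ifreq ifr;
 *	int s = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	bzero(&ifr, sizeof (ifr));
 *	(void) strlcpy(ifr.ifr_name, "net0", sizeof (ifr.ifr_name));
 *	if (ioctl(s, SIOCGIFHWADDR, &ifr) == 0) {
 *		// ifr.ifr_addr.sa_family holds the ARP hardware type
 *		// (e.g. ARPHRD_ETHER) and sa_data the address bytes.
 *	}
 *
 * EPFNOSUPPORT from the ioctl means the address did not fit in sa_data.
 */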
19112 /*
19113  * The expectation of applications using SIOCGIFHWADDR is that data will
19114  * be returned in the sa_data field of the sockaddr structure.  With
19115  * SIOCGLIFHWADDR, we're breaking new ground as there is no Linux
19116  * equivalent.  In light of this, struct sockaddr_dl is used as it
19117  * offers more space for address storage in sdl_data.
19118  */
19119 /* ARGSUSED */
19120 int
19121 ip_sioctl_get_lifhwaddr(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
19122     ip_ioctl_cmd_t *ipip, void *if_req)
19123 {
19124 	struct sockaddr_dl *sock;
19125 	struct lifreq *lifr;
19126 	mblk_t *mp1;
19127 	ill_t *ill;
19128 
19129 	ASSERT(ipif != NULL);
19130 	ill = ipif->ipif_ill;
19131 
19132 	if (ill->ill_phys_addr == NULL) {
19133 		return (EADDRNOTAVAIL);
19134 	}
19135 	if (ill->ill_phys_addr_length > sizeof (sock->sdl_data)) {
19136 		return (EPFNOSUPPORT);
19137 	}
19138 
19139 	ip1dbg(("ip_sioctl_get_lifhwaddr(%s)\n", ill->ill_name));
19140 
19141 	/* Existence of mp1 has been checked in ip_wput_nondata */
19142 	mp1 = mp->b_cont->b_cont;
19143 	lifr = (struct lifreq *)mp1->b_rptr;
19144 
19145 	/*
19146 	 * struct sockaddr_dl is used here because, unlike the plain
19147 	 * sockaddr used by SIOCGIFHWADDR, it can describe the address in
19148 	 * full: the interface index, MAC type and address length are
19149 	 * filled in below along with the address bytes themselves.
19150 	 */
19151 	lifr->lifr_type = ill->ill_type;
19152 	sock = (struct sockaddr_dl *)&lifr->lifr_addr;
19153 	sock->sdl_family = AF_LINK;
19154 	sock->sdl_index = ill->ill_phyint->phyint_ifindex;
19155 	sock->sdl_type = ill->ill_mactype;
19156 	sock->sdl_nlen = 0;
19157 	sock->sdl_slen = 0;
19158 	sock->sdl_alen = ill->ill_phys_addr_length;
19159 	bcopy(ill->ill_phys_addr, sock->sdl_data, ill->ill_phys_addr_length);
19160 
19161 	return (0);
19162 }
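/*
 * Editorial sketch: a hypothetical userland consumer of SIOCGLIFHWADDR,
 * matching the sockaddr_dl layout filled in above ("net0" is an assumed
 * interface name):
 *
 *	struct lifreq lifr;
 *	int s = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	bzero(&lifr, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, "net0", sizeof (lifr.lifr_name));
 *	if (ioctl(s, SIOCGLIFHWADDR, &lifr) == 0) {
 *		struct sockaddr_dl *sdl =
 *		    (struct sockaddr_dl *)&lifr.lifr_addr;
 *
 *		// sdl->sdl_alen address bytes start at sdl->sdl_data;
 *		// since sdl_nlen is 0, LLADDR(sdl) points at them too.
 *	}
 */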