1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 /*
  28  * The ipnet device defined here provides access to packets at the IP layer. To
  29  * provide access to packets at this layer it registers a callback function in
  30  * the ip module and when there are open instances of the device ip will pass
  31  * packets into the device. Packets from ip are passed on the input, output and
  32  * loopback paths. Internally the module returns to ip as soon as possible by
  33  * deferring processing using a taskq.
  34  *
  35  * Management of the devices in /dev/ipnet/ is handled by the devname
  36  * filesystem and use of the neti interfaces.  This module registers for NIC
 * events using the neti framework so that when IP interfaces are brought up,
  38  * taken down etc. the ipnet module is notified and its view of the interfaces
  39  * configured on the system adjusted.  On attach, the module gets an initial
  40  * view of the system again using the neti framework but as it has already
  41  * registered for IP interface events, it is still up-to-date with any changes.
  42  */
  43 
  44 #include <sys/types.h>
  45 #include <sys/conf.h>
  46 #include <sys/cred.h>
  47 #include <sys/stat.h>
  48 #include <sys/ddi.h>
  49 #include <sys/sunddi.h>
  50 #include <sys/modctl.h>
  51 #include <sys/dlpi.h>
  52 #include <sys/strsun.h>
  53 #include <sys/id_space.h>
  54 #include <sys/kmem.h>
  55 #include <sys/mkdev.h>
  56 #include <sys/neti.h>
  57 #include <net/if.h>
  58 #include <sys/errno.h>
  59 #include <sys/list.h>
  60 #include <sys/ksynch.h>
  61 #include <sys/hook_event.h>
  62 #include <sys/sdt.h>
  63 #include <sys/stropts.h>
  64 #include <sys/sysmacros.h>
  65 #include <inet/ip.h>
  66 #include <inet/ip_if.h>
  67 #include <inet/ip_multi.h>
  68 #include <inet/ip6.h>
  69 #include <inet/ipnet.h>
  70 #include <net/bpf.h>
  71 #include <net/bpfdesc.h>
  72 #include <net/dlt.h>
  73 
/*
 * STREAMS module information for the ipnet driver.  The same structure is
 * referenced by both the read-side (ipnet_rinit) and write-side
 * (ipnet_winit) queue initialization tables below.
 */
static struct module_info ipnet_minfo = {
        1,              /* mi_idnum */
        "ipnet",        /* mi_idname */
        0,              /* mi_minpsz */
        INFPSZ,         /* mi_maxpsz */
        2048,           /* mi_hiwat */
        0               /* mi_lowat */
};
  82 
  83 /*
  84  * List to hold static view of ipnetif_t's on the system. This is needed to
  85  * avoid holding the lock protecting the avl tree of ipnetif's over the
  86  * callback into the dev filesystem.
  87  */
typedef struct ipnetif_cbdata {
        char            ic_ifname[LIFNAMSIZ];   /* interface name */
        dev_t           ic_dev;                 /* device number for entry */
        list_node_t     ic_next;                /* list linkage */
} ipnetif_cbdata_t;
  93 
  94 /*
  95  * Convenience enumerated type for ipnet_accept().  It describes the
  96  * properties of a given ipnet_addrp_t relative to a single ipnet_t
  97  * client stream.  The values represent whether the address is ...
  98  */
typedef enum {
        IPNETADDR_MYADDR,       /* an address on the stream's own ipnetif_t */
        IPNETADDR_MBCAST,       /* a multicast or broadcast address */
        IPNETADDR_UNKNOWN       /* neither of the above */
} ipnet_addrtype_t;
 104 
/*
 * Argument used for the ipnet_nicevent_taskq callback.
 * NOTE(review): presumably filled in by ipnet_nicevent_cb() and consumed by
 * ipnet_nicevent_task(); neither is visible here -- confirm.
 */
typedef struct ipnet_nicevent_s {
        nic_event_t             ipne_event;     /* NIC event type */
        net_handle_t            ipne_protocol;  /* protocol handle */
        netstackid_t            ipne_stackid;   /* netstack identifier */
        uint64_t                ipne_ifindex;   /* interface index */
        uint64_t                ipne_lifindex;  /* logical interface index */
        char                    ipne_ifname[LIFNAMSIZ]; /* interface name */
} ipnet_nicevent_t;
 114 
/*
 * Module-global state.  ipnet_dip is set by ipnet_attach() and cleared by
 * ipnet_detach().  ipnet_major, the two taskqs and ipnet_minor_space are
 * initialized in _init(); the taskqs and the minor space are destroyed
 * again in _fini().
 */
static dev_info_t       *ipnet_dip;
static major_t          ipnet_major;
static ddi_taskq_t      *ipnet_taskq;           /* taskq for packets */
static ddi_taskq_t      *ipnet_nicevent_taskq;  /* taskq for NIC events */
static id_space_t       *ipnet_minor_space;
static const int        IPNET_MINOR_LO = 1;     /* minor of the "lo0" node */
static const int        IPNET_MINOR_MIN = 2;    /* start of dynamic minors */
static dl_info_ack_t    ipnet_infoack = IPNET_INFO_ACK_INIT;
static ipnet_acceptfn_t ipnet_accept, ipnet_loaccept;
static bpf_itap_fn_t    ipnet_itap;
 125 
/*
 * Forward declarations for the file-local functions of this driver.
 */
static void     ipnet_input(mblk_t *);
static int      ipnet_wput(queue_t *, mblk_t *);
static int      ipnet_rsrv(queue_t *);
static int      ipnet_open(queue_t *, dev_t *, int, int, cred_t *);
static int      ipnet_close(queue_t *);
static void     ipnet_ioctl(queue_t *, mblk_t *);
static void     ipnet_iocdata(queue_t *, mblk_t *);
static void     ipnet_wputnondata(queue_t *, mblk_t *);
static int      ipnet_attach(dev_info_t *, ddi_attach_cmd_t);
static int      ipnet_detach(dev_info_t *, ddi_detach_cmd_t);
static int      ipnet_devinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static void     ipnet_inforeq(queue_t *q, mblk_t *mp);
static void     ipnet_bindreq(queue_t *q, mblk_t *mp);
static void     ipnet_unbindreq(queue_t *q, mblk_t *mp);
static void     ipnet_dlpromisconreq(queue_t *q, mblk_t *mp);
static void     ipnet_dlpromiscoffreq(queue_t *q, mblk_t *mp);
static int      ipnet_join_allmulti(ipnetif_t *, ipnet_stack_t *);
static void     ipnet_leave_allmulti(ipnetif_t *, ipnet_stack_t *);
static int      ipnet_nicevent_cb(hook_event_token_t, hook_data_t, void *);
static void     ipnet_nicevent_task(void *);
static ipnetif_t *ipnetif_create(const char *, uint64_t, ipnet_stack_t *,
    uint64_t);
static void     ipnetif_remove(ipnetif_t *, ipnet_stack_t *);
static ipnetif_addr_t *ipnet_match_lif(ipnetif_t *, lif_if_t, boolean_t);
static ipnetif_t *ipnetif_getby_index(uint64_t, ipnet_stack_t *);
static ipnetif_t *ipnetif_getby_dev(dev_t, ipnet_stack_t *);
static boolean_t ipnetif_in_zone(ipnetif_t *, zoneid_t, ipnet_stack_t *);
static void     ipnetif_zonecheck(ipnetif_t *, ipnet_stack_t *);
static int      ipnet_populate_if(net_handle_t, ipnet_stack_t *, boolean_t);
static int      ipnetif_compare_name(const void *, const void *);
static int      ipnetif_compare_name_zone(const void *, const void *);
static int      ipnetif_compare_index(const void *, const void *);
static void     ipnet_add_ifaddr(uint64_t, ipnetif_t *, net_handle_t);
static void     ipnet_delete_ifaddr(ipnetif_addr_t *, ipnetif_t *, boolean_t);
static void     ipnetif_refhold(ipnetif_t *);
static void     ipnetif_refrele(ipnetif_t *);
static void     ipnet_walkers_inc(ipnet_stack_t *);
static void     ipnet_walkers_dec(ipnet_stack_t *);
static void     ipnet_register_netihook(ipnet_stack_t *);
static void     *ipnet_stack_init(netstackid_t, netstack_t *);
static void     ipnet_stack_fini(netstackid_t, void *);
static void     ipnet_dispatch(void *);
static int      ipobs_bounce_func(hook_event_token_t, hook_data_t, void *);
static int      ipnet_bpf_bounce(hook_event_token_t, hook_data_t, void *);
static ipnetif_t *ipnetif_clone_create(ipnetif_t *, zoneid_t);
static void     ipnetif_clone_release(ipnetif_t *);
 172 
/*
 * Read-side queue initialization: there is no put procedure, ipnet_rsrv()
 * is the service procedure, and the stream open/close entry points live
 * here.
 */
static struct qinit ipnet_rinit = {
        NULL,           /* qi_putp */
        ipnet_rsrv,     /* qi_srvp */
        ipnet_open,     /* qi_qopen */
        ipnet_close,    /* qi_qclose */
        NULL,           /* qi_qadmin */
        &ipnet_minfo,       /* qi_minfo */
};
 181 
/*
 * Write-side queue initialization: ipnet_wput() is the put procedure and
 * there is no service procedure.
 */
static struct qinit ipnet_winit = {
        ipnet_wput,     /* qi_putp */
        NULL,           /* qi_srvp */
        NULL,           /* qi_qopen */
        NULL,           /* qi_qclose */
        NULL,           /* qi_qadmin */
        &ipnet_minfo,       /* qi_minfo */
};
 190 
/* Stream table tying together the read and write queue initializers. */
static struct streamtab ipnet_info = {
        &ipnet_rinit, &ipnet_winit
};
 194 
/*
 * Define the driver's ipnet_ops structure (referenced from modldrv below).
 * It wires in ipnet_attach(), ipnet_detach() and ipnet_devinfo(), uses the
 * ipnet_info stream table with the D_MP | D_MTPERMOD flags, and declares
 * that quiesce is not supported.
 */
DDI_DEFINE_STREAM_OPS(ipnet_ops, nulldev, nulldev, ipnet_attach,
    ipnet_detach, nodev, ipnet_devinfo, D_MP | D_MTPERMOD, &ipnet_info,
    ddi_quiesce_not_supported);
 198 
/* Loadable driver linkage: describes this module as a device driver. */
static struct modldrv modldrv = {
        &mod_driverops,
        "STREAMS ipnet driver",
        &ipnet_ops
};
 204 
/* Module linkage handed to mod_install(), mod_remove() and mod_info(). */
static struct modlinkage modlinkage = {
        MODREV_1, { &modldrv, NULL }
};
 208 
/*
 * This structure contains the template data (names and type) that is
 * copied, in bulk, into the per-stack kstats structure (ips_stats) by
 * ipnet_register_netihook() after net_kstat_create() succeeds.
 * No actual statistical information is stored in this instance of the
 * ipnet_kstats_t structure.
 *
 * NOTE(review): the entries presumably have to stay in the same order as
 * the members of ipnet_kstats_t -- confirm against inet/ipnet.h.
 */
static ipnet_kstats_t stats_template = {
        { "duplicationFail",    KSTAT_DATA_UINT64 },
        { "dispatchOk",         KSTAT_DATA_UINT64 },
        { "dispatchFail",       KSTAT_DATA_UINT64 },
        { "dispatchHeaderDrop", KSTAT_DATA_UINT64 },
        { "dispatchDupDrop",    KSTAT_DATA_UINT64 },
        { "dispatchDeliver",    KSTAT_DATA_UINT64 },
        { "acceptOk",           KSTAT_DATA_UINT64 },
        { "acceptFail",         KSTAT_DATA_UINT64 }
};
 225 
 226 /*
 227  * Walk the list of physical interfaces on the machine, for each
 228  * interface create a new ipnetif_t and add any addresses to it. We
 229  * need to do the walk twice, once for IPv4 and once for IPv6.
 230  *
 231  * The interfaces are destroyed as part of ipnet_stack_fini() for each
 232  * stack.  Note that we cannot do this initialization in
 233  * ipnet_stack_init(), since ipnet_stack_init() cannot fail.
 234  */
 235 static int
 236 ipnetif_init(void)
 237 {
 238         netstack_handle_t       nh;
 239         netstack_t              *ns;
 240         ipnet_stack_t           *ips;
 241         int                     ret = 0;
 242 
 243         netstack_next_init(&nh);
 244         while ((ns = netstack_next(&nh)) != NULL) {
 245                 ips = ns->netstack_ipnet;
 246                 if ((ret = ipnet_populate_if(ips->ips_ndv4, ips, B_FALSE)) == 0)
 247                         ret = ipnet_populate_if(ips->ips_ndv6, ips, B_TRUE);
 248                 netstack_rele(ns);
 249                 if (ret != 0)
 250                         break;
 251         }
 252         netstack_next_fini(&nh);
 253         return (ret);
 254 }
 255 
 256 /*
 257  * Standard module entry points.
 258  */
 259 int
 260 _init(void)
 261 {
 262         int             ret;
 263         boolean_t       netstack_registered = B_FALSE;
 264 
 265         if ((ipnet_major = ddi_name_to_major("ipnet")) == (major_t)-1)
 266                 return (ENODEV);
 267         ipnet_minor_space = id_space_create("ipnet_minor_space",
 268             IPNET_MINOR_MIN, MAXMIN32);
 269 
 270         /*
 271          * We call ddi_taskq_create() with nthread == 1 to ensure in-order
 272          * delivery of packets to clients.  Note that we need to create the
 273          * taskqs before calling netstack_register() since ipnet_stack_init()
 274          * registers callbacks that use 'em.
 275          */
 276         ipnet_taskq = ddi_taskq_create(NULL, "ipnet", 1, TASKQ_DEFAULTPRI, 0);
 277         ipnet_nicevent_taskq = ddi_taskq_create(NULL, "ipnet_nic_event_queue",
 278             1, TASKQ_DEFAULTPRI, 0);
 279         if (ipnet_taskq == NULL || ipnet_nicevent_taskq == NULL) {
 280                 ret = ENOMEM;
 281                 goto done;
 282         }
 283 
 284         netstack_register(NS_IPNET, ipnet_stack_init, NULL, ipnet_stack_fini);
 285         netstack_registered = B_TRUE;
 286 
 287         if ((ret = ipnetif_init()) == 0)
 288                 ret = mod_install(&modlinkage);
 289 done:
 290         if (ret != 0) {
 291                 if (ipnet_taskq != NULL)
 292                         ddi_taskq_destroy(ipnet_taskq);
 293                 if (ipnet_nicevent_taskq != NULL)
 294                         ddi_taskq_destroy(ipnet_nicevent_taskq);
 295                 if (netstack_registered)
 296                         netstack_unregister(NS_IPNET);
 297                 id_space_destroy(ipnet_minor_space);
 298         }
 299         return (ret);
 300 }
 301 
 302 int
 303 _fini(void)
 304 {
 305         int     err;
 306 
 307         if ((err = mod_remove(&modlinkage)) != 0)
 308                 return (err);
 309 
 310         netstack_unregister(NS_IPNET);
 311         ddi_taskq_destroy(ipnet_nicevent_taskq);
 312         ddi_taskq_destroy(ipnet_taskq);
 313         id_space_destroy(ipnet_minor_space);
 314         return (0);
 315 }
 316 
 317 int
 318 _info(struct modinfo *modinfop)
 319 {
 320         return (mod_info(&modlinkage, modinfop));
 321 }
 322 
 323 static void
 324 ipnet_register_netihook(ipnet_stack_t *ips)
 325 {
 326         int             ret;
 327         zoneid_t        zoneid;
 328         netid_t         netid;
 329 
 330         HOOK_INIT(ips->ips_nicevents, ipnet_nicevent_cb, "ipnet_nicevents",
 331             ips);
 332 
 333         /*
 334          * It is possible for an exclusive stack to be in the process of
 335          * shutting down here, and the netid and protocol lookups could fail
 336          * in that case.
 337          */
 338         zoneid = netstackid_to_zoneid(ips->ips_netstack->netstack_stackid);
 339         if ((netid = net_zoneidtonetid(zoneid)) == -1)
 340                 return;
 341 
 342         if ((ips->ips_ndv4 = net_protocol_lookup(netid, NHF_INET)) != NULL) {
 343                 if ((ret = net_hook_register(ips->ips_ndv4, NH_NIC_EVENTS,
 344                     ips->ips_nicevents)) != 0) {
 345                         VERIFY(net_protocol_release(ips->ips_ndv4) == 0);
 346                         ips->ips_ndv4 = NULL;
 347                         cmn_err(CE_WARN, "unable to register IPv4 netinfo hooks"
 348                             " in zone %d: %d", zoneid, ret);
 349                 }
 350         }
 351         if ((ips->ips_ndv6 = net_protocol_lookup(netid, NHF_INET6)) != NULL) {
 352                 if ((ret = net_hook_register(ips->ips_ndv6, NH_NIC_EVENTS,
 353                     ips->ips_nicevents)) != 0) {
 354                         VERIFY(net_protocol_release(ips->ips_ndv6) == 0);
 355                         ips->ips_ndv6 = NULL;
 356                         cmn_err(CE_WARN, "unable to register IPv6 netinfo hooks"
 357                             " in zone %d: %d", zoneid, ret);
 358                 }
 359         }
 360 
 361         /*
 362          * Create a local set of kstats for each zone.
 363          */
 364         ips->ips_kstatp = net_kstat_create(netid, "ipnet", 0, "ipnet_stats",
 365             "misc", KSTAT_TYPE_NAMED,
 366             sizeof (ipnet_kstats_t) / sizeof (kstat_named_t), 0);
 367         if (ips->ips_kstatp != NULL) {
 368                 bcopy(&stats_template, &ips->ips_stats,
 369                     sizeof (ips->ips_stats));
 370                 ips->ips_kstatp->ks_data = &ips->ips_stats;
 371                 ips->ips_kstatp->ks_private =
 372                     (void *)(uintptr_t)ips->ips_netstack->netstack_stackid;
 373                 kstat_install(ips->ips_kstatp);
 374         } else {
 375                 cmn_err(CE_WARN, "net_kstat_create(%s,%s,%s) failed",
 376                     "ipnet", "ipnet_stats", "misc");
 377         }
 378 }
 379 
 380 /*
 381  * This function is called on attach to build an initial view of the
 382  * interfaces on the system. It will be called once for IPv4 and once
 383  * for IPv6, although there is only one ipnet interface for both IPv4
 384  * and IPv6 there are separate address lists.
 385  */
 386 static int
 387 ipnet_populate_if(net_handle_t nd, ipnet_stack_t *ips, boolean_t isv6)
 388 {
 389         phy_if_t        phyif;
 390         lif_if_t        lif;
 391         ipnetif_t       *ipnetif;
 392         char            name[LIFNAMSIZ];
 393         boolean_t       new_if = B_FALSE;
 394         uint64_t        ifflags;
 395         int             ret = 0;
 396 
 397         /*
 398          * If ipnet_register_netihook() was unable to initialize this
 399          * stack's net_handle_t, then we cannot populate any interface
 400          * information.  This usually happens when we attempted to
 401          * grab a net_handle_t as a stack was shutting down.  We don't
 402          * want to fail the entire _init() operation because of a
 403          * stack shutdown (other stacks will continue to work just
 404          * fine), so we silently return success here.
 405          */
 406         if (nd == NULL)
 407                 return (0);
 408 
 409         /*
 410          * Make sure we're not processing NIC events during the
 411          * population of our interfaces and address lists.
 412          */
 413         mutex_enter(&ips->ips_event_lock);
 414 
 415         for (phyif = net_phygetnext(nd, 0); phyif != 0;
 416             phyif = net_phygetnext(nd, phyif)) {
 417                 if (net_getifname(nd, phyif, name, LIFNAMSIZ) != 0)
 418                         continue;
 419                 ifflags =  0;
 420                 (void) net_getlifflags(nd, phyif, 0, &ifflags);
 421                 if ((ipnetif = ipnetif_getby_index(phyif, ips)) == NULL) {
 422                         ipnetif = ipnetif_create(name, phyif, ips, ifflags);
 423                         if (ipnetif == NULL) {
 424                                 ret = ENOMEM;
 425                                 goto done;
 426                         }
 427                         new_if = B_TRUE;
 428                 }
 429                 ipnetif->if_flags |=
 430                     isv6 ? IPNETIF_IPV6PLUMBED : IPNETIF_IPV4PLUMBED;
 431 
 432                 for (lif = net_lifgetnext(nd, phyif, 0); lif != 0;
 433                     lif = net_lifgetnext(nd, phyif, lif)) {
 434                         /*
 435                          * Skip addresses that aren't up.  We'll add
 436                          * them when we receive an NE_LIF_UP event.
 437                          */
 438                         if (net_getlifflags(nd, phyif, lif, &ifflags) != 0 ||
 439                             !(ifflags & IFF_UP))
 440                                 continue;
 441                         /* Don't add it if we already have it. */
 442                         if (ipnet_match_lif(ipnetif, lif, isv6) != NULL)
 443                                 continue;
 444                         ipnet_add_ifaddr(lif, ipnetif, nd);
 445                 }
 446                 if (!new_if)
 447                         ipnetif_refrele(ipnetif);
 448         }
 449 
 450 done:
 451         mutex_exit(&ips->ips_event_lock);
 452         return (ret);
 453 }
 454 
 455 static int
 456 ipnet_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
 457 {
 458         if (cmd != DDI_ATTACH)
 459                 return (DDI_FAILURE);
 460 
 461         if (ddi_create_minor_node(dip, "lo0", S_IFCHR, IPNET_MINOR_LO,
 462             DDI_PSEUDO, 0) == DDI_FAILURE)
 463                 return (DDI_FAILURE);
 464 
 465         ipnet_dip = dip;
 466         return (DDI_SUCCESS);
 467 }
 468 
 469 static int
 470 ipnet_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
 471 {
 472         if (cmd != DDI_DETACH)
 473                 return (DDI_FAILURE);
 474 
 475         ASSERT(dip == ipnet_dip);
 476         ddi_remove_minor_node(ipnet_dip, NULL);
 477         ipnet_dip = NULL;
 478         return (DDI_SUCCESS);
 479 }
 480 
 481 /* ARGSUSED */
 482 static int
 483 ipnet_devinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
 484 {
 485         int     error = DDI_FAILURE;
 486 
 487         switch (infocmd) {
 488         case DDI_INFO_DEVT2INSTANCE:
 489                 *result = (void *)0;
 490                 error = DDI_SUCCESS;
 491                 break;
 492         case DDI_INFO_DEVT2DEVINFO:
 493                 if (ipnet_dip != NULL) {
 494                         *result = ipnet_dip;
 495                         error = DDI_SUCCESS;
 496                 }
 497                 break;
 498         }
 499         return (error);
 500 }
 501 
 502 /* ARGSUSED */
 503 static int
 504 ipnet_open(queue_t *rq, dev_t *dev, int oflag, int sflag, cred_t *crp)
 505 {
 506         ipnet_t         *ipnet;
 507         netstack_t      *ns = NULL;
 508         ipnet_stack_t   *ips;
 509         int             err = 0;
 510         zoneid_t        zoneid = crgetzoneid(crp);
 511 
 512         /*
 513          * If the system is labeled, only the global zone is allowed to open
 514          * IP observability nodes.
 515          */
 516         if (is_system_labeled() && zoneid != GLOBAL_ZONEID)
 517                 return (EACCES);
 518 
 519         /* We don't support open as a module */
 520         if (sflag & MODOPEN)
 521                 return (ENOTSUP);
 522 
 523         /* This driver is self-cloning, we don't support re-open. */
 524         if (rq->q_ptr != NULL)
 525                 return (EBUSY);
 526 
 527         if ((ipnet = kmem_zalloc(sizeof (*ipnet), KM_NOSLEEP)) == NULL)
 528                 return (ENOMEM);
 529 
 530         VERIFY((ns = netstack_find_by_cred(crp)) != NULL);
 531         ips = ns->netstack_ipnet;
 532 
 533         rq->q_ptr = WR(rq)->q_ptr = ipnet;
 534         ipnet->ipnet_rq = rq;
 535         ipnet->ipnet_minor = (minor_t)id_alloc(ipnet_minor_space);
 536         ipnet->ipnet_zoneid = zoneid;
 537         ipnet->ipnet_dlstate = DL_UNBOUND;
 538         ipnet->ipnet_ns = ns;
 539 
 540         /*
 541          * We need to hold ips_event_lock here as any NE_LIF_DOWN events need
 542          * to be processed after ipnet_if is set and the ipnet_t has been
 543          * inserted in the ips_str_list.
 544          */
 545         mutex_enter(&ips->ips_event_lock);
 546         if (getminor(*dev) == IPNET_MINOR_LO) {
 547                 ipnet->ipnet_flags |= IPNET_LOMODE;
 548                 ipnet->ipnet_acceptfn = ipnet_loaccept;
 549         } else {
 550                 ipnet->ipnet_acceptfn = ipnet_accept;
 551                 ipnet->ipnet_if = ipnetif_getby_dev(*dev, ips);
 552                 if (ipnet->ipnet_if == NULL ||
 553                     !ipnetif_in_zone(ipnet->ipnet_if, zoneid, ips)) {
 554                         err = ENODEV;
 555                         goto done;
 556                 }
 557         }
 558 
 559         mutex_enter(&ips->ips_walkers_lock);
 560         while (ips->ips_walkers_cnt != 0)
 561                 cv_wait(&ips->ips_walkers_cv, &ips->ips_walkers_lock);
 562         list_insert_head(&ips->ips_str_list, ipnet);
 563         *dev = makedevice(getmajor(*dev), ipnet->ipnet_minor);
 564         qprocson(rq);
 565 
 566         /*
 567          * Only register our callback if we're the first open client; we call
 568          * unregister in close() for the last open client.
 569          */
 570         if (list_head(&ips->ips_str_list) == list_tail(&ips->ips_str_list))
 571                 ips->ips_hook = ipobs_register_hook(ns, ipnet_input);
 572         mutex_exit(&ips->ips_walkers_lock);
 573 
 574 done:
 575         mutex_exit(&ips->ips_event_lock);
 576         if (err != 0) {
 577                 netstack_rele(ns);
 578                 id_free(ipnet_minor_space, ipnet->ipnet_minor);
 579                 if (ipnet->ipnet_if != NULL)
 580                         ipnetif_refrele(ipnet->ipnet_if);
 581                 kmem_free(ipnet, sizeof (*ipnet));
 582         }
 583         return (err);
 584 }
 585 
/*
 * STREAMS close entry point.  Undoes everything ipnet_open() and the
 * promiscuous-mode requests set up for this stream, then frees the
 * ipnet_t.  Always returns 0.
 */
static int
ipnet_close(queue_t *rq)
{
        ipnet_t         *ipnet = rq->q_ptr;
        /* Fetched up front: ips is still needed after ipnet is freed. */
        ipnet_stack_t   *ips = ipnet->ipnet_ns->netstack_ipnet;

        /* One leave for each promiscuous level that is still enabled. */
        if (ipnet->ipnet_flags & IPNET_PROMISC_PHYS)
                ipnet_leave_allmulti(ipnet->ipnet_if, ips);
        if (ipnet->ipnet_flags & IPNET_PROMISC_MULTI)
                ipnet_leave_allmulti(ipnet->ipnet_if, ips);

        /* Wait until nobody is walking ips_str_list before changing it. */
        mutex_enter(&ips->ips_walkers_lock);
        while (ips->ips_walkers_cnt != 0)
                cv_wait(&ips->ips_walkers_cv, &ips->ips_walkers_lock);

        qprocsoff(rq);

        list_remove(&ips->ips_str_list, ipnet);
        /* Loopback-mode streams have no ipnetif_t attached. */
        if (ipnet->ipnet_if != NULL)
                ipnetif_refrele(ipnet->ipnet_if);
        id_free(ipnet_minor_space, ipnet->ipnet_minor);

        /* Last client on this stack: drop the hook registered in open. */
        if (list_is_empty(&ips->ips_str_list)) {
                ipobs_unregister_hook(ips->ips_netstack, ips->ips_hook);
                ips->ips_hook = NULL;
        }

        kmem_free(ipnet, sizeof (*ipnet));

        mutex_exit(&ips->ips_walkers_lock);
        /* Presumably balances netstack_find_by_cred() in ipnet_open(). */
        netstack_rele(ips->ips_netstack);
        return (0);
}
 619 
 620 static int
 621 ipnet_wput(queue_t *q, mblk_t *mp)
 622 {
 623         switch (mp->b_datap->db_type) {
 624         case M_FLUSH:
 625                 if (*mp->b_rptr & FLUSHW) {
 626                         flushq(q, FLUSHDATA);
 627                         *mp->b_rptr &= ~FLUSHW;
 628                 }
 629                 if (*mp->b_rptr & FLUSHR)
 630                         qreply(q, mp);
 631                 else
 632                         freemsg(mp);
 633                 break;
 634         case M_PROTO:
 635         case M_PCPROTO:
 636                 ipnet_wputnondata(q, mp);
 637                 break;
 638         case M_IOCTL:
 639                 ipnet_ioctl(q, mp);
 640                 break;
 641         case M_IOCDATA:
 642                 ipnet_iocdata(q, mp);
 643                 break;
 644         default:
 645                 freemsg(mp);
 646                 break;
 647         }
 648         return (0);
 649 }
 650 
 651 static int
 652 ipnet_rsrv(queue_t *q)
 653 {
 654         mblk_t  *mp;
 655 
 656         while ((mp = getq(q)) != NULL) {
 657                 ASSERT(DB_TYPE(mp) == M_DATA);
 658                 if (canputnext(q)) {
 659                         putnext(q, mp);
 660                 } else {
 661                         (void) putbq(q, mp);
 662                         break;
 663                 }
 664         }
 665         return (0);
 666 }
 667 
 668 static void
 669 ipnet_ioctl(queue_t *q, mblk_t *mp)
 670 {
 671         struct iocblk   *iocp = (struct iocblk *)mp->b_rptr;
 672 
 673         switch (iocp->ioc_cmd) {
 674         case DLIOCRAW:
 675                 miocack(q, mp, 0, 0);
 676                 break;
 677         case DLIOCIPNETINFO:
 678                 if (iocp->ioc_count == TRANSPARENT) {
 679                         mcopyin(mp, NULL, sizeof (uint_t), NULL);
 680                         qreply(q, mp);
 681                         break;
 682                 }
 683                 /* Fallthrough, we don't support I_STR with DLIOCIPNETINFO. */
 684         default:
 685                 miocnak(q, mp, 0, EINVAL);
 686                 break;
 687         }
 688 }
 689 
 690 static void
 691 ipnet_iocdata(queue_t *q, mblk_t *mp)
 692 {
 693         struct iocblk   *iocp = (struct iocblk *)mp->b_rptr;
 694         ipnet_t *ipnet = q->q_ptr;
 695 
 696         switch (iocp->ioc_cmd) {
 697         case DLIOCIPNETINFO:
 698                 if (*(int *)mp->b_cont->b_rptr == 1)
 699                         ipnet->ipnet_flags |= IPNET_INFO;
 700                 else if (*(int *)mp->b_cont->b_rptr == 0)
 701                         ipnet->ipnet_flags &= ~IPNET_INFO;
 702                 else
 703                         goto iocnak;
 704                 miocack(q, mp, 0, DL_IPNETINFO_VERSION);
 705                 break;
 706         default:
 707 iocnak:
 708                 miocnak(q, mp, 0, EINVAL);
 709                 break;
 710         }
 711 }
 712 
 713 static void
 714 ipnet_wputnondata(queue_t *q, mblk_t *mp)
 715 {
 716         union DL_primitives     *dlp = (union DL_primitives *)mp->b_rptr;
 717         t_uscalar_t             prim = dlp->dl_primitive;
 718 
 719         switch (prim) {
 720         case DL_INFO_REQ:
 721                 ipnet_inforeq(q, mp);
 722                 break;
 723         case DL_UNBIND_REQ:
 724                 ipnet_unbindreq(q, mp);
 725                 break;
 726         case DL_BIND_REQ:
 727                 ipnet_bindreq(q, mp);
 728                 break;
 729         case DL_PROMISCON_REQ:
 730                 ipnet_dlpromisconreq(q, mp);
 731                 break;
 732         case DL_PROMISCOFF_REQ:
 733                 ipnet_dlpromiscoffreq(q, mp);
 734                 break;
 735         case DL_UNITDATA_REQ:
 736         case DL_DETACH_REQ:
 737         case DL_PHYS_ADDR_REQ:
 738         case DL_SET_PHYS_ADDR_REQ:
 739         case DL_ENABMULTI_REQ:
 740         case DL_DISABMULTI_REQ:
 741         case DL_ATTACH_REQ:
 742                 dlerrorack(q, mp, prim, DL_UNSUPPORTED, 0);
 743                 break;
 744         default:
 745                 dlerrorack(q, mp, prim, DL_BADPRIM, 0);
 746                 break;
 747         }
 748 }
 749 
 750 static void
 751 ipnet_inforeq(queue_t *q, mblk_t *mp)
 752 {
 753         dl_info_ack_t   *dlip;
 754         size_t          size = sizeof (dl_info_ack_t) + sizeof (ushort_t);
 755 
 756         if (MBLKL(mp) < DL_INFO_REQ_SIZE) {
 757                 dlerrorack(q, mp, DL_INFO_REQ, DL_BADPRIM, 0);
 758                 return;
 759         }
 760 
 761         if ((mp = mexchange(q, mp, size, M_PCPROTO, DL_INFO_ACK)) == NULL)
 762                 return;
 763 
 764         dlip = (dl_info_ack_t *)mp->b_rptr;
 765         *dlip = ipnet_infoack;
 766         qreply(q, mp);
 767 }
 768 
 769 static void
 770 ipnet_bindreq(queue_t *q, mblk_t *mp)
 771 {
 772         union DL_primitives     *dlp = (union DL_primitives *)mp->b_rptr;
 773         ipnet_t                 *ipnet = q->q_ptr;
 774 
 775         if (MBLKL(mp) < DL_BIND_REQ_SIZE) {
 776                 dlerrorack(q, mp, DL_BIND_REQ, DL_BADPRIM, 0);
 777                 return;
 778         }
 779 
 780         switch (dlp->bind_req.dl_sap) {
 781         case 0 :
 782                 ipnet->ipnet_family = AF_UNSPEC;
 783                 break;
 784         case IPV4_VERSION :
 785                 ipnet->ipnet_family = AF_INET;
 786                 break;
 787         case IPV6_VERSION :
 788                 ipnet->ipnet_family = AF_INET6;
 789                 break;
 790         default :
 791                 dlerrorack(q, mp, DL_BIND_REQ, DL_BADSAP, 0);
 792                 return;
 793                 /*NOTREACHED*/
 794         }
 795 
 796         ipnet->ipnet_dlstate = DL_IDLE;
 797         dlbindack(q, mp, dlp->bind_req.dl_sap, 0, 0, 0, 0);
 798 }
 799 
 800 static void
 801 ipnet_unbindreq(queue_t *q, mblk_t *mp)
 802 {
 803         ipnet_t *ipnet = q->q_ptr;
 804 
 805         if (MBLKL(mp) < DL_UNBIND_REQ_SIZE) {
 806                 dlerrorack(q, mp, DL_UNBIND_REQ, DL_BADPRIM, 0);
 807                 return;
 808         }
 809 
 810         if (ipnet->ipnet_dlstate != DL_IDLE) {
 811                 dlerrorack(q, mp, DL_UNBIND_REQ, DL_OUTSTATE, 0);
 812         } else {
 813                 ipnet->ipnet_dlstate = DL_UNBOUND;
 814                 ipnet->ipnet_family = AF_UNSPEC;
 815                 dlokack(q, mp, DL_UNBIND_REQ);
 816         }
 817 }
 818 
 819 static void
 820 ipnet_dlpromisconreq(queue_t *q, mblk_t *mp)
 821 {
 822         ipnet_t         *ipnet = q->q_ptr;
 823         t_uscalar_t     level;
 824         int             err;
 825 
 826         if (MBLKL(mp) < DL_PROMISCON_REQ_SIZE) {
 827                 dlerrorack(q, mp, DL_PROMISCON_REQ, DL_BADPRIM, 0);
 828                 return;
 829         }
 830 
 831         if (ipnet->ipnet_flags & IPNET_LOMODE) {
 832                 dlokack(q, mp, DL_PROMISCON_REQ);
 833                 return;
 834         }
 835 
 836         level = ((dl_promiscon_req_t *)mp->b_rptr)->dl_level;
 837         if (level == DL_PROMISC_PHYS || level == DL_PROMISC_MULTI) {
 838                 if ((err = ipnet_join_allmulti(ipnet->ipnet_if,
 839                     ipnet->ipnet_ns->netstack_ipnet)) != 0) {
 840                         dlerrorack(q, mp, DL_PROMISCON_REQ, DL_SYSERR, err);
 841                         return;
 842                 }
 843         }
 844 
 845         switch (level) {
 846         case DL_PROMISC_PHYS:
 847                 ipnet->ipnet_flags |= IPNET_PROMISC_PHYS;
 848                 break;
 849         case DL_PROMISC_SAP:
 850                 ipnet->ipnet_flags |= IPNET_PROMISC_SAP;
 851                 break;
 852         case DL_PROMISC_MULTI:
 853                 ipnet->ipnet_flags |= IPNET_PROMISC_MULTI;
 854                 break;
 855         default:
 856                 dlerrorack(q, mp, DL_PROMISCON_REQ, DL_BADPRIM, 0);
 857                 return;
 858         }
 859 
 860         dlokack(q, mp, DL_PROMISCON_REQ);
 861 }
 862 
 863 static void
 864 ipnet_dlpromiscoffreq(queue_t *q, mblk_t *mp)
 865 {
 866         ipnet_t         *ipnet = q->q_ptr;
 867         t_uscalar_t     level;
 868         uint16_t        orig_ipnet_flags = ipnet->ipnet_flags;
 869 
 870         if (MBLKL(mp) < DL_PROMISCOFF_REQ_SIZE) {
 871                 dlerrorack(q, mp, DL_PROMISCOFF_REQ, DL_BADPRIM, 0);
 872                 return;
 873         }
 874 
 875         if (ipnet->ipnet_flags & IPNET_LOMODE) {
 876                 dlokack(q, mp, DL_PROMISCOFF_REQ);
 877                 return;
 878         }
 879 
 880         level = ((dl_promiscon_req_t *)mp->b_rptr)->dl_level;
 881         switch (level) {
 882         case DL_PROMISC_PHYS:
 883                 if (ipnet->ipnet_flags & IPNET_PROMISC_PHYS)
 884                         ipnet->ipnet_flags &= ~IPNET_PROMISC_PHYS;
 885                 break;
 886         case DL_PROMISC_SAP:
 887                 if (ipnet->ipnet_flags & IPNET_PROMISC_SAP)
 888                         ipnet->ipnet_flags &= ~IPNET_PROMISC_SAP;
 889                 break;
 890         case DL_PROMISC_MULTI:
 891                 if (ipnet->ipnet_flags & IPNET_PROMISC_MULTI)
 892                         ipnet->ipnet_flags &= ~IPNET_PROMISC_MULTI;
 893                 break;
 894         default:
 895                 dlerrorack(q, mp, DL_PROMISCOFF_REQ, DL_BADPRIM, 0);
 896                 return;
 897         }
 898 
 899         if (orig_ipnet_flags == ipnet->ipnet_flags) {
 900                 dlerrorack(q, mp, DL_PROMISCOFF_REQ, DL_NOTENAB, 0);
 901                 return;
 902         }
 903 
 904         if (level == DL_PROMISC_PHYS || level == DL_PROMISC_MULTI) {
 905                 ipnet_leave_allmulti(ipnet->ipnet_if,
 906                     ipnet->ipnet_ns->netstack_ipnet);
 907         }
 908 
 909         dlokack(q, mp, DL_PROMISCOFF_REQ);
 910 }
 911 
 912 static int
 913 ipnet_join_allmulti(ipnetif_t *ipnetif, ipnet_stack_t *ips)
 914 {
 915         int             err = 0;
 916         ip_stack_t      *ipst = ips->ips_netstack->netstack_ip;
 917         uint64_t        index = ipnetif->if_index;
 918 
 919         mutex_enter(&ips->ips_event_lock);
 920         if (ipnetif->if_multicnt == 0) {
 921                 ASSERT((ipnetif->if_flags &
 922                     (IPNETIF_IPV4ALLMULTI | IPNETIF_IPV6ALLMULTI)) == 0);
 923                 if (ipnetif->if_flags & IPNETIF_IPV4PLUMBED) {
 924                         err = ip_join_allmulti(index, B_FALSE, ipst);
 925                         if (err != 0)
 926                                 goto done;
 927                         ipnetif->if_flags |= IPNETIF_IPV4ALLMULTI;
 928                 }
 929                 if (ipnetif->if_flags & IPNETIF_IPV6PLUMBED) {
 930                         err = ip_join_allmulti(index, B_TRUE, ipst);
 931                         if (err != 0 &&
 932                             (ipnetif->if_flags & IPNETIF_IPV4ALLMULTI)) {
 933                                 (void) ip_leave_allmulti(index, B_FALSE, ipst);
 934                                 ipnetif->if_flags &= ~IPNETIF_IPV4ALLMULTI;
 935                                 goto done;
 936                         }
 937                         ipnetif->if_flags |= IPNETIF_IPV6ALLMULTI;
 938                 }
 939         }
 940         ipnetif->if_multicnt++;
 941 
 942 done:
 943         mutex_exit(&ips->ips_event_lock);
 944         return (err);
 945 }
 946 
 947 static void
 948 ipnet_leave_allmulti(ipnetif_t *ipnetif, ipnet_stack_t *ips)
 949 {
 950         int             err;
 951         ip_stack_t      *ipst = ips->ips_netstack->netstack_ip;
 952         uint64_t        index = ipnetif->if_index;
 953 
 954         mutex_enter(&ips->ips_event_lock);
 955         ASSERT(ipnetif->if_multicnt != 0);
 956         if (--ipnetif->if_multicnt == 0) {
 957                 if (ipnetif->if_flags & IPNETIF_IPV4ALLMULTI) {
 958                         err = ip_leave_allmulti(index, B_FALSE, ipst);
 959                         ASSERT(err == 0 || err == ENODEV);
 960                         ipnetif->if_flags &= ~IPNETIF_IPV4ALLMULTI;
 961                 }
 962                 if (ipnetif->if_flags & IPNETIF_IPV6ALLMULTI) {
 963                         err = ip_leave_allmulti(index, B_TRUE, ipst);
 964                         ASSERT(err == 0 || err == ENODEV);
 965                         ipnetif->if_flags &= ~IPNETIF_IPV6ALLMULTI;
 966                 }
 967         }
 968         mutex_exit(&ips->ips_event_lock);
 969 }
 970 
 971 /*
 972  * Allocate a new mblk_t and put a dl_ipnetinfo_t in it.
 973  * The structure it copies the header information from,
 974  * hook_pkt_observe_t, is constructed using network byte
 975  * order in ipobs_hook(), so there is no conversion here.
 976  */
 977 static mblk_t *
 978 ipnet_addheader(hook_pkt_observe_t *hdr, mblk_t *mp)
 979 {
 980         mblk_t          *dlhdr;
 981         dl_ipnetinfo_t  *dl;
 982 
 983         if ((dlhdr = allocb(sizeof (dl_ipnetinfo_t), BPRI_HI)) == NULL) {
 984                 freemsg(mp);
 985                 return (NULL);
 986         }
 987         dl = (dl_ipnetinfo_t *)dlhdr->b_rptr;
 988         dl->dli_version = DL_IPNETINFO_VERSION;
 989         dl->dli_family = hdr->hpo_family;
 990         dl->dli_htype = hdr->hpo_htype;
 991         dl->dli_pktlen = hdr->hpo_pktlen;
 992         dl->dli_ifindex = hdr->hpo_ifindex;
 993         dl->dli_grifindex = hdr->hpo_grifindex;
 994         dl->dli_zsrc = hdr->hpo_zsrc;
 995         dl->dli_zdst = hdr->hpo_zdst;
 996         dlhdr->b_wptr += sizeof (*dl);
 997         dlhdr->b_cont = mp;
 998 
 999         return (dlhdr);
1000 }
1001 
1002 static ipnet_addrtype_t
1003 ipnet_get_addrtype(ipnet_t *ipnet, ipnet_addrp_t *addr)
1004 {
1005         list_t                  *list;
1006         ipnetif_t               *ipnetif = ipnet->ipnet_if;
1007         ipnetif_addr_t          *ifaddr;
1008         ipnet_addrtype_t        addrtype = IPNETADDR_UNKNOWN;
1009 
1010         /* First check if the address is multicast or limited broadcast. */
1011         switch (addr->iap_family) {
1012         case AF_INET:
1013                 if (CLASSD(*(addr->iap_addr4)) ||
1014                     *(addr->iap_addr4) == INADDR_BROADCAST)
1015                         return (IPNETADDR_MBCAST);
1016                 break;
1017         case AF_INET6:
1018                 if (IN6_IS_ADDR_MULTICAST(addr->iap_addr6))
1019                         return (IPNETADDR_MBCAST);
1020                 break;
1021         }
1022 
1023         /*
1024          * Walk the address list to see if the address belongs to our
1025          * interface or is one of our subnet broadcast addresses.
1026          */
1027         mutex_enter(&ipnetif->if_addr_lock);
1028         list = (addr->iap_family == AF_INET) ?
1029             &ipnetif->if_ip4addr_list : &ipnetif->if_ip6addr_list;
1030         for (ifaddr = list_head(list);
1031             ifaddr != NULL && addrtype == IPNETADDR_UNKNOWN;
1032             ifaddr = list_next(list, ifaddr)) {
1033                 /*
1034                  * If we're not in the global zone, then only look at
1035                  * addresses in our zone.
1036                  */
1037                 if (ipnet->ipnet_zoneid != GLOBAL_ZONEID &&
1038                     ipnet->ipnet_zoneid != ifaddr->ifa_zone)
1039                         continue;
1040                 switch (addr->iap_family) {
1041                 case AF_INET:
1042                         if (ifaddr->ifa_ip4addr != INADDR_ANY &&
1043                             *(addr->iap_addr4) == ifaddr->ifa_ip4addr)
1044                                 addrtype = IPNETADDR_MYADDR;
1045                         else if (ifaddr->ifa_brdaddr != INADDR_ANY &&
1046                             *(addr->iap_addr4) == ifaddr->ifa_brdaddr)
1047                                 addrtype = IPNETADDR_MBCAST;
1048                         break;
1049                 case AF_INET6:
1050                         if (IN6_ARE_ADDR_EQUAL(addr->iap_addr6,
1051                             &ifaddr->ifa_ip6addr))
1052                                 addrtype = IPNETADDR_MYADDR;
1053                         break;
1054                 }
1055         }
1056         mutex_exit(&ipnetif->if_addr_lock);
1057 
1058         return (addrtype);
1059 }
1060 
1061 /*
1062  * Verify if the packet contained in hdr should be passed up to the
1063  * ipnet client stream.
1064  */
1065 static boolean_t
1066 ipnet_accept(ipnet_t *ipnet, hook_pkt_observe_t *hdr, ipnet_addrp_t *src,
1067     ipnet_addrp_t *dst)
1068 {
1069         boolean_t               obsif;
1070         uint64_t                ifindex = ipnet->ipnet_if->if_index;
1071         ipnet_addrtype_t        srctype;
1072         ipnet_addrtype_t        dsttype;
1073 
1074         srctype = ipnet_get_addrtype(ipnet, src);
1075         dsttype = ipnet_get_addrtype(ipnet, dst);
1076 
1077         /*
1078          * If the packet's ifindex matches ours, or the packet's group ifindex
1079          * matches ours, it's on the interface we're observing.  (Thus,
1080          * observing on the group ifindex matches all ifindexes in the group.)
1081          */
1082         obsif = (ntohl(hdr->hpo_ifindex) == ifindex ||
1083             ntohl(hdr->hpo_grifindex) == ifindex);
1084 
1085         DTRACE_PROBE5(ipnet_accept__addr,
1086             ipnet_addrtype_t, srctype, ipnet_addrp_t *, src,
1087             ipnet_addrtype_t, dsttype, ipnet_addrp_t *, dst,
1088             boolean_t, obsif);
1089 
1090         /*
1091          * Do not allow an ipnet stream to see packets that are not from or to
1092          * its zone.  The exception is when zones are using the shared stack
1093          * model.  In this case, streams in the global zone have visibility
1094          * into other shared-stack zones, and broadcast and multicast traffic
1095          * is visible by all zones in the stack.
1096          */
1097         if (ipnet->ipnet_zoneid != GLOBAL_ZONEID &&
1098             dsttype != IPNETADDR_MBCAST) {
1099                 if (ipnet->ipnet_zoneid != ntohl(hdr->hpo_zsrc) &&
1100                     ipnet->ipnet_zoneid != ntohl(hdr->hpo_zdst))
1101                         return (B_FALSE);
1102         }
1103 
1104         /*
1105          * If DL_PROMISC_SAP isn't enabled, then the bound SAP must match the
1106          * packet's IP version.
1107          */
1108         if (!(ipnet->ipnet_flags & IPNET_PROMISC_SAP) &&
1109             ipnet->ipnet_family != hdr->hpo_family)
1110                 return (B_FALSE);
1111 
1112         /* If the destination address is ours, then accept the packet. */
1113         if (dsttype == IPNETADDR_MYADDR)
1114                 return (B_TRUE);
1115 
1116         /*
1117          * If DL_PROMISC_PHYS is enabled, then we can see all packets that are
1118          * sent or received on the interface we're observing, or packets that
1119          * have our source address (this allows us to see packets we send).
1120          */
1121         if (ipnet->ipnet_flags & IPNET_PROMISC_PHYS) {
1122                 if (srctype == IPNETADDR_MYADDR || obsif)
1123                         return (B_TRUE);
1124         }
1125 
1126         /*
1127          * We accept multicast and broadcast packets transmitted or received
1128          * on the interface we're observing.
1129          */
1130         if (dsttype == IPNETADDR_MBCAST && obsif)
1131                 return (B_TRUE);
1132 
1133         return (B_FALSE);
1134 }
1135 
1136 /*
1137  * Verify if the packet contained in hdr should be passed up to the ipnet
1138  * client stream that's in IPNET_LOMODE.
1139  */
1140 /* ARGSUSED */
1141 static boolean_t
1142 ipnet_loaccept(ipnet_t *ipnet, hook_pkt_observe_t *hdr, ipnet_addrp_t *src,
1143     ipnet_addrp_t *dst)
1144 {
1145         if (hdr->hpo_htype != htons(IPOBS_HOOK_LOCAL)) {
1146                 /*
1147                  * ipnet_if is only NULL for IPNET_MINOR_LO devices.
1148                  */
1149                 if (ipnet->ipnet_if == NULL)
1150                         return (B_FALSE);
1151         }
1152 
1153         /*
1154          * An ipnet stream must not see packets that are not from/to its zone.
1155          */
1156         if (ipnet->ipnet_zoneid != GLOBAL_ZONEID) {
1157                 if (ipnet->ipnet_zoneid != ntohl(hdr->hpo_zsrc) &&
1158                     ipnet->ipnet_zoneid != ntohl(hdr->hpo_zdst))
1159                         return (B_FALSE);
1160         }
1161 
1162         return (ipnet->ipnet_family == AF_UNSPEC ||
1163             ipnet->ipnet_family == hdr->hpo_family);
1164 }
1165 
1166 static void
1167 ipnet_dispatch(void *arg)
1168 {
1169         mblk_t                  *mp = arg;
1170         hook_pkt_observe_t      *hdr = (hook_pkt_observe_t *)mp->b_rptr;
1171         ipnet_t                 *ipnet;
1172         mblk_t                  *netmp;
1173         list_t                  *list;
1174         ipnet_stack_t           *ips;
1175         ipnet_addrp_t           src;
1176         ipnet_addrp_t           dst;
1177 
1178         ips = ((netstack_t *)hdr->hpo_ctx)->netstack_ipnet;
1179 
1180         netmp = hdr->hpo_pkt->b_cont;
1181         src.iap_family = hdr->hpo_family;
1182         dst.iap_family = hdr->hpo_family;
1183 
1184         if (hdr->hpo_family == AF_INET) {
1185                 src.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_src;
1186                 dst.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_dst;
1187         } else {
1188                 src.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_src;
1189                 dst.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_dst;
1190         }
1191 
1192         ipnet_walkers_inc(ips);
1193 
1194         list = &ips->ips_str_list;
1195         for (ipnet = list_head(list); ipnet != NULL;
1196             ipnet = list_next(list, ipnet)) {
1197                 if (!(*ipnet->ipnet_acceptfn)(ipnet, hdr, &src, &dst)) {
1198                         IPSK_BUMP(ips, ik_acceptFail);
1199                         continue;
1200                 }
1201                 IPSK_BUMP(ips, ik_acceptOk);
1202 
1203                 if (list_next(list, ipnet) == NULL) {
1204                         netmp = hdr->hpo_pkt->b_cont;
1205                         hdr->hpo_pkt->b_cont = NULL;
1206                 } else {
1207                         if ((netmp = dupmsg(hdr->hpo_pkt->b_cont)) == NULL &&
1208                             (netmp = copymsg(hdr->hpo_pkt->b_cont)) == NULL) {
1209                                 IPSK_BUMP(ips, ik_duplicationFail);
1210                                 continue;
1211                         }
1212                 }
1213 
1214                 if (ipnet->ipnet_flags & IPNET_INFO) {
1215                         if ((netmp = ipnet_addheader(hdr, netmp)) == NULL) {
1216                                 IPSK_BUMP(ips, ik_dispatchHeaderDrop);
1217                                 continue;
1218                         }
1219                 }
1220 
1221                 if (ipnet->ipnet_rq->q_first == NULL &&
1222                     canputnext(ipnet->ipnet_rq)) {
1223                         putnext(ipnet->ipnet_rq, netmp);
1224                         IPSK_BUMP(ips, ik_dispatchDeliver);
1225                 } else if (canput(ipnet->ipnet_rq)) {
1226                         (void) putq(ipnet->ipnet_rq, netmp);
1227                         IPSK_BUMP(ips, ik_dispatchDeliver);
1228                 } else {
1229                         freemsg(netmp);
1230                         IPSK_BUMP(ips, ik_dispatchPutDrop);
1231                 }
1232         }
1233 
1234         ipnet_walkers_dec(ips);
1235 
1236         freemsg(mp);
1237 }
1238 
1239 static void
1240 ipnet_input(mblk_t *mp)
1241 {
1242         hook_pkt_observe_t      *hdr = (hook_pkt_observe_t *)mp->b_rptr;
1243         ipnet_stack_t           *ips;
1244 
1245         ips = ((netstack_t *)hdr->hpo_ctx)->netstack_ipnet;
1246 
1247         if (ddi_taskq_dispatch(ipnet_taskq, ipnet_dispatch, mp, DDI_NOSLEEP) !=
1248             DDI_SUCCESS) {
1249                 IPSK_BUMP(ips, ik_dispatchFail);
1250                 freemsg(mp);
1251         } else {
1252                 IPSK_BUMP(ips, ik_dispatchOk);
1253         }
1254 }
1255 
1256 static ipnetif_t *
1257 ipnet_alloc_if(ipnet_stack_t *ips)
1258 {
1259         ipnetif_t       *ipnetif;
1260 
1261         if ((ipnetif = kmem_zalloc(sizeof (*ipnetif), KM_NOSLEEP)) == NULL)
1262                 return (NULL);
1263 
1264         mutex_init(&ipnetif->if_addr_lock, NULL, MUTEX_DEFAULT, 0);
1265         list_create(&ipnetif->if_ip4addr_list, sizeof (ipnetif_addr_t),
1266             offsetof(ipnetif_addr_t, ifa_link));
1267         list_create(&ipnetif->if_ip6addr_list, sizeof (ipnetif_addr_t),
1268             offsetof(ipnetif_addr_t, ifa_link));
1269         mutex_init(&ipnetif->if_reflock, NULL, MUTEX_DEFAULT, 0);
1270 
1271         ipnetif->if_stackp = ips;
1272 
1273         return (ipnetif);
1274 }
1275 
1276 /*
1277  * Create a new ipnetif_t and new minor node for it.  If creation is
1278  * successful the new ipnetif_t is inserted into an avl_tree
1279  * containing ipnetif's for this stack instance.
1280  */
1281 static ipnetif_t *
1282 ipnetif_create(const char *name, uint64_t index, ipnet_stack_t *ips,
1283     uint64_t ifflags)
1284 {
1285         ipnetif_t       *ipnetif;
1286         avl_index_t     where = 0;
1287         minor_t         ifminor;
1288 
1289         /*
1290          * Because ipnetif_create() can be called from a NIC event
1291          * callback, it should not block.
1292          */
1293         ifminor = (minor_t)id_alloc_nosleep(ipnet_minor_space);
1294         if (ifminor == (minor_t)-1)
1295                 return (NULL);
1296         if ((ipnetif = ipnet_alloc_if(ips)) == NULL) {
1297                 id_free(ipnet_minor_space, ifminor);
1298                 return (NULL);
1299         }
1300 
1301         (void) strlcpy(ipnetif->if_name, name, LIFNAMSIZ);
1302         ipnetif->if_index = (uint_t)index;
1303         ipnetif->if_zoneid = netstack_get_zoneid(ips->ips_netstack);
1304         ipnetif->if_dev = makedevice(ipnet_major, ifminor);
1305 
1306         ipnetif->if_refcnt = 1;
1307         if ((ifflags & IFF_LOOPBACK) != 0)
1308                 ipnetif->if_flags = IPNETIF_LOOPBACK;
1309 
1310         mutex_enter(&ips->ips_avl_lock);
1311         VERIFY(avl_find(&ips->ips_avl_by_index, &index, &where) == NULL);
1312         avl_insert(&ips->ips_avl_by_index, ipnetif, where);
1313         VERIFY(avl_find(&ips->ips_avl_by_name, (void *)name, &where) == NULL);
1314         avl_insert(&ips->ips_avl_by_name, ipnetif, where);
1315         mutex_exit(&ips->ips_avl_lock);
1316 
1317         return (ipnetif);
1318 }
1319 
1320 static void
1321 ipnetif_remove(ipnetif_t *ipnetif, ipnet_stack_t *ips)
1322 {
1323         ipnet_t *ipnet;
1324 
1325         ipnet_walkers_inc(ips);
1326         /* Send a SIGHUP to all open streams associated with this ipnetif. */
1327         for (ipnet = list_head(&ips->ips_str_list); ipnet != NULL;
1328             ipnet = list_next(&ips->ips_str_list, ipnet)) {
1329                 if (ipnet->ipnet_if == ipnetif)
1330                         (void) putnextctl(ipnet->ipnet_rq, M_HANGUP);
1331         }
1332         ipnet_walkers_dec(ips);
1333         mutex_enter(&ips->ips_avl_lock);
1334         avl_remove(&ips->ips_avl_by_index, ipnetif);
1335         avl_remove(&ips->ips_avl_by_name, ipnetif);
1336         mutex_exit(&ips->ips_avl_lock);
1337         /*
1338          * Release the reference we implicitly held in ipnetif_create().
1339          */
1340         ipnetif_refrele(ipnetif);
1341 }
1342 
1343 static void
1344 ipnet_purge_addrlist(list_t *addrlist)
1345 {
1346         ipnetif_addr_t  *ifa;
1347 
1348         while ((ifa = list_head(addrlist)) != NULL) {
1349                 list_remove(addrlist, ifa);
1350                 if (ifa->ifa_shared != NULL)
1351                         ipnetif_clone_release(ifa->ifa_shared);
1352                 kmem_free(ifa, sizeof (*ifa));
1353         }
1354 }
1355 
1356 static void
1357 ipnetif_free(ipnetif_t *ipnetif)
1358 {
1359         ASSERT(ipnetif->if_refcnt == 0);
1360         ASSERT(ipnetif->if_sharecnt == 0);
1361 
1362         /* Remove IPv4/v6 address lists from the ipnetif */
1363         ipnet_purge_addrlist(&ipnetif->if_ip4addr_list);
1364         list_destroy(&ipnetif->if_ip4addr_list);
1365         ipnet_purge_addrlist(&ipnetif->if_ip6addr_list);
1366         list_destroy(&ipnetif->if_ip6addr_list);
1367         mutex_destroy(&ipnetif->if_addr_lock);
1368         mutex_destroy(&ipnetif->if_reflock);
1369         if (ipnetif->if_dev != 0)
1370                 id_free(ipnet_minor_space, getminor(ipnetif->if_dev));
1371         kmem_free(ipnetif, sizeof (*ipnetif));
1372 }
1373 
1374 /*
1375  * Create an ipnetif_addr_t with the given logical interface id (lif)
1376  * and add it to the supplied ipnetif.  The lif is the netinfo
1377  * representation of logical interface id, and we use this id to match
1378  * incoming netinfo events against our lists of addresses.
1379  */
1380 static void
1381 ipnet_add_ifaddr(uint64_t lif, ipnetif_t *ipnetif, net_handle_t nd)
1382 {
1383         ipnetif_addr_t          *ifaddr;
1384         zoneid_t                zoneid;
1385         struct sockaddr_in      bcast;
1386         struct sockaddr_storage addr;
1387         net_ifaddr_t            type = NA_ADDRESS;
1388         uint64_t                phyif = ipnetif->if_index;
1389 
1390         if (net_getlifaddr(nd, phyif, lif, 1, &type, &addr) != 0 ||
1391             net_getlifzone(nd, phyif, lif, &zoneid) != 0)
1392                 return;
1393 
1394         if ((ifaddr = kmem_alloc(sizeof (*ifaddr), KM_NOSLEEP)) == NULL)
1395                 return;
1396         ifaddr->ifa_zone = zoneid;
1397         ifaddr->ifa_id = lif;
1398         ifaddr->ifa_shared = NULL;
1399 
1400         switch (addr.ss_family) {
1401         case AF_INET:
1402                 ifaddr->ifa_ip4addr =
1403                     ((struct sockaddr_in *)&addr)->sin_addr.s_addr;
1404                 /*
1405                  * Try and get the broadcast address.  Note that it's okay for
1406                  * an interface to not have a broadcast address, so we don't
1407                  * fail the entire operation if net_getlifaddr() fails here.
1408                  */
1409                 type = NA_BROADCAST;
1410                 if (net_getlifaddr(nd, phyif, lif, 1, &type, &bcast) == 0)
1411                         ifaddr->ifa_brdaddr = bcast.sin_addr.s_addr;
1412                 break;
1413         case AF_INET6:
1414                 ifaddr->ifa_ip6addr = ((struct sockaddr_in6 *)&addr)->sin6_addr;
1415                 break;
1416         }
1417 
1418         /*
1419          * The zoneid stored in ipnetif_t needs to correspond to the actual
1420          * zone the address is being used in. This facilitates finding the
1421          * correct netstack_t pointer, amongst other things, later.
1422          */
1423         if (zoneid == ALL_ZONES)
1424                 zoneid = GLOBAL_ZONEID;
1425 
1426         mutex_enter(&ipnetif->if_addr_lock);
1427         if (zoneid != ipnetif->if_zoneid) {
1428                 ipnetif_t *ifp2;
1429 
1430                 ifp2 = ipnetif_clone_create(ipnetif, zoneid);
1431                 ifaddr->ifa_shared = ifp2;
1432         }
1433         list_insert_tail(addr.ss_family == AF_INET ?
1434             &ipnetif->if_ip4addr_list : &ipnetif->if_ip6addr_list, ifaddr);
1435         mutex_exit(&ipnetif->if_addr_lock);
1436 }
1437 
1438 static void
1439 ipnet_delete_ifaddr(ipnetif_addr_t *ifaddr, ipnetif_t *ipnetif, boolean_t isv6)
1440 {
1441         mutex_enter(&ipnetif->if_addr_lock);
1442         if (ifaddr->ifa_shared != NULL)
1443                 ipnetif_clone_release(ifaddr->ifa_shared);
1444 
1445         list_remove(isv6 ?
1446             &ipnetif->if_ip6addr_list : &ipnetif->if_ip4addr_list, ifaddr);
1447         mutex_exit(&ipnetif->if_addr_lock);
1448         kmem_free(ifaddr, sizeof (*ifaddr));
1449 }
1450 
1451 static void
1452 ipnet_plumb_ev(ipnet_nicevent_t *ipne, ipnet_stack_t *ips, boolean_t isv6)
1453 {
1454         ipnetif_t       *ipnetif;
1455         boolean_t       refrele_needed = B_TRUE;
1456         uint64_t        ifflags;
1457         uint64_t        ifindex;
1458         char            *ifname;
1459 
1460         ifflags = 0;
1461         ifname = ipne->ipne_ifname;
1462         ifindex = ipne->ipne_ifindex;
1463 
1464         (void) net_getlifflags(ipne->ipne_protocol, ifindex, 0, &ifflags);
1465 
1466         if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL) {
1467                 ipnetif = ipnetif_create(ifname, ifindex, ips, ifflags);
1468                 refrele_needed = B_FALSE;
1469         }
1470         if (ipnetif != NULL) {
1471                 ipnetif->if_flags |=
1472                     isv6 ? IPNETIF_IPV6PLUMBED : IPNETIF_IPV4PLUMBED;
1473         }
1474 
1475         if (ipnetif->if_multicnt != 0) {
1476                 if (ip_join_allmulti(ifindex, isv6,
1477                     ips->ips_netstack->netstack_ip) == 0) {
1478                         ipnetif->if_flags |=
1479                             isv6 ? IPNETIF_IPV6ALLMULTI : IPNETIF_IPV4ALLMULTI;
1480                 }
1481         }
1482 
1483         if (refrele_needed)
1484                 ipnetif_refrele(ipnetif);
1485 }
1486 
1487 static void
1488 ipnet_unplumb_ev(uint64_t ifindex, ipnet_stack_t *ips, boolean_t isv6)
1489 {
1490         ipnetif_t       *ipnetif;
1491 
1492         if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL)
1493                 return;
1494 
1495         mutex_enter(&ipnetif->if_addr_lock);
1496         ipnet_purge_addrlist(isv6 ?
1497             &ipnetif->if_ip6addr_list : &ipnetif->if_ip4addr_list);
1498         mutex_exit(&ipnetif->if_addr_lock);
1499 
1500         /*
1501          * Note that we have one ipnetif for both IPv4 and IPv6, but we receive
1502          * separate NE_UNPLUMB events for IPv4 and IPv6.  We remove the ipnetif
1503          * if both IPv4 and IPv6 interfaces have been unplumbed.
1504          */
1505         ipnetif->if_flags &= isv6 ? ~IPNETIF_IPV6PLUMBED : ~IPNETIF_IPV4PLUMBED;
1506         if (!(ipnetif->if_flags & (IPNETIF_IPV4PLUMBED | IPNETIF_IPV6PLUMBED)))
1507                 ipnetif_remove(ipnetif, ips);
1508         ipnetif_refrele(ipnetif);
1509 }
1510 
1511 static void
1512 ipnet_lifup_ev(uint64_t ifindex, uint64_t lifindex, net_handle_t nd,
1513     ipnet_stack_t *ips, boolean_t isv6)
1514 {
1515         ipnetif_t       *ipnetif;
1516         ipnetif_addr_t  *ifaddr;
1517 
1518         if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL)
1519                 return;
1520         if ((ifaddr = ipnet_match_lif(ipnetif, lifindex, isv6)) != NULL) {
1521                 /*
1522                  * We must have missed a NE_LIF_DOWN event.  Delete this
1523                  * ifaddr and re-create it.
1524                  */
1525                 ipnet_delete_ifaddr(ifaddr, ipnetif, isv6);
1526         }
1527 
1528         ipnet_add_ifaddr(lifindex, ipnetif, nd);
1529         ipnetif_refrele(ipnetif);
1530 }
1531 
1532 static void
1533 ipnet_lifdown_ev(uint64_t ifindex, uint64_t lifindex, ipnet_stack_t *ips,
1534     boolean_t isv6)
1535 {
1536         ipnetif_t       *ipnetif;
1537         ipnetif_addr_t  *ifaddr;
1538 
1539         if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL)
1540                 return;
1541         if ((ifaddr = ipnet_match_lif(ipnetif, lifindex, isv6)) != NULL)
1542                 ipnet_delete_ifaddr(ifaddr, ipnetif, isv6);
1543         ipnetif_refrele(ipnetif);
1544         /*
1545          * Make sure that open streams on this ipnetif are still allowed to
1546          * have it open.
1547          */
1548         ipnetif_zonecheck(ipnetif, ips);
1549 }
1550 
1551 /*
1552  * This callback from the NIC event framework dispatches a taskq as the event
1553  * handlers may block.
1554  */
1555 /* ARGSUSED */
1556 static int
1557 ipnet_nicevent_cb(hook_event_token_t token, hook_data_t info, void *arg)
1558 {
1559         ipnet_stack_t           *ips = arg;
1560         hook_nic_event_t        *hn = (hook_nic_event_t *)info;
1561         ipnet_nicevent_t        *ipne;
1562 
1563         if ((ipne = kmem_alloc(sizeof (ipnet_nicevent_t), KM_NOSLEEP)) == NULL)
1564                 return (0);
1565         ipne->ipne_event = hn->hne_event;
1566         ipne->ipne_protocol = hn->hne_protocol;
1567         ipne->ipne_stackid = ips->ips_netstack->netstack_stackid;
1568         ipne->ipne_ifindex = hn->hne_nic;
1569         ipne->ipne_lifindex = hn->hne_lif;
1570         if (hn->hne_datalen != 0) {
1571                 (void) strlcpy(ipne->ipne_ifname, hn->hne_data,
1572                     sizeof (ipne->ipne_ifname));
1573         }
1574         (void) ddi_taskq_dispatch(ipnet_nicevent_taskq, ipnet_nicevent_task,
1575             ipne, DDI_NOSLEEP);
1576         return (0);
1577 }
1578 
1579 static void
1580 ipnet_nicevent_task(void *arg)
1581 {
1582         ipnet_nicevent_t        *ipne = arg;
1583         netstack_t              *ns;
1584         ipnet_stack_t           *ips;
1585         boolean_t               isv6;
1586 
1587         if ((ns = netstack_find_by_stackid(ipne->ipne_stackid)) == NULL)
1588                 goto done;
1589         ips = ns->netstack_ipnet;
1590         isv6 = (ipne->ipne_protocol == ips->ips_ndv6);
1591 
1592         mutex_enter(&ips->ips_event_lock);
1593         switch (ipne->ipne_event) {
1594         case NE_PLUMB:
1595                 ipnet_plumb_ev(ipne, ips, isv6);
1596                 break;
1597         case NE_UNPLUMB:
1598                 ipnet_unplumb_ev(ipne->ipne_ifindex, ips, isv6);
1599                 break;
1600         case NE_LIF_UP:
1601                 ipnet_lifup_ev(ipne->ipne_ifindex, ipne->ipne_lifindex,
1602                     ipne->ipne_protocol, ips, isv6);
1603                 break;
1604         case NE_LIF_DOWN:
1605                 ipnet_lifdown_ev(ipne->ipne_ifindex, ipne->ipne_lifindex, ips,
1606                     isv6);
1607                 break;
1608         default:
1609                 break;
1610         }
1611         mutex_exit(&ips->ips_event_lock);
1612 done:
1613         if (ns != NULL)
1614                 netstack_rele(ns);
1615         kmem_free(ipne, sizeof (ipnet_nicevent_t));
1616 }
1617 
1618 dev_t
1619 ipnet_if_getdev(char *name, zoneid_t zoneid)
1620 {
1621         netstack_t      *ns;
1622         ipnet_stack_t   *ips;
1623         ipnetif_t       *ipnetif;
1624         dev_t           dev = (dev_t)-1;
1625 
1626         if (is_system_labeled() && zoneid != GLOBAL_ZONEID)
1627                 return (dev);
1628         if ((ns = netstack_find_by_zoneid(zoneid)) == NULL)
1629                 return (dev);
1630 
1631         ips = ns->netstack_ipnet;
1632         mutex_enter(&ips->ips_avl_lock);
1633         if ((ipnetif = avl_find(&ips->ips_avl_by_name, name, NULL)) != NULL) {
1634                 if (ipnetif_in_zone(ipnetif, zoneid, ips))
1635                         dev = ipnetif->if_dev;
1636         }
1637         mutex_exit(&ips->ips_avl_lock);
1638         netstack_rele(ns);
1639 
1640         return (dev);
1641 }
1642 
1643 static ipnetif_t *
1644 ipnetif_getby_index(uint64_t id, ipnet_stack_t *ips)
1645 {
1646         ipnetif_t       *ipnetif;
1647 
1648         mutex_enter(&ips->ips_avl_lock);
1649         if ((ipnetif = avl_find(&ips->ips_avl_by_index, &id, NULL)) != NULL)
1650                 ipnetif_refhold(ipnetif);
1651         mutex_exit(&ips->ips_avl_lock);
1652         return (ipnetif);
1653 }
1654 
1655 static ipnetif_t *
1656 ipnetif_getby_dev(dev_t dev, ipnet_stack_t *ips)
1657 {
1658         ipnetif_t       *ipnetif;
1659         avl_tree_t      *tree;
1660 
1661         mutex_enter(&ips->ips_avl_lock);
1662         tree = &ips->ips_avl_by_index;
1663         for (ipnetif = avl_first(tree); ipnetif != NULL;
1664             ipnetif = avl_walk(tree, ipnetif, AVL_AFTER)) {
1665                 if (ipnetif->if_dev == dev) {
1666                         ipnetif_refhold(ipnetif);
1667                         break;
1668                 }
1669         }
1670         mutex_exit(&ips->ips_avl_lock);
1671         return (ipnetif);
1672 }
1673 
1674 static ipnetif_addr_t *
1675 ipnet_match_lif(ipnetif_t *ipnetif, lif_if_t lid, boolean_t isv6)
1676 {
1677         ipnetif_addr_t  *ifaddr;
1678         list_t  *list;
1679 
1680         mutex_enter(&ipnetif->if_addr_lock);
1681         list = isv6 ? &ipnetif->if_ip6addr_list : &ipnetif->if_ip4addr_list;
1682         for (ifaddr = list_head(list); ifaddr != NULL;
1683             ifaddr = list_next(list, ifaddr)) {
1684                 if (lid == ifaddr->ifa_id)
1685                         break;
1686         }
1687         mutex_exit(&ipnetif->if_addr_lock);
1688         return (ifaddr);
1689 }
1690 
1691 /* ARGSUSED */
1692 static void *
1693 ipnet_stack_init(netstackid_t stackid, netstack_t *ns)
1694 {
1695         ipnet_stack_t   *ips;
1696 
1697         ips = kmem_zalloc(sizeof (*ips), KM_SLEEP);
1698         ips->ips_netstack = ns;
1699         mutex_init(&ips->ips_avl_lock, NULL, MUTEX_DEFAULT, 0);
1700         avl_create(&ips->ips_avl_by_index, ipnetif_compare_index,
1701             sizeof (ipnetif_t), offsetof(ipnetif_t, if_avl_by_index));
1702         avl_create(&ips->ips_avl_by_name, ipnetif_compare_name,
1703             sizeof (ipnetif_t), offsetof(ipnetif_t, if_avl_by_name));
1704         avl_create(&ips->ips_avl_by_shared, ipnetif_compare_name_zone,
1705             sizeof (ipnetif_t), offsetof(ipnetif_t, if_avl_by_shared));
1706         mutex_init(&ips->ips_walkers_lock, NULL, MUTEX_DEFAULT, NULL);
1707         cv_init(&ips->ips_walkers_cv, NULL, CV_DRIVER, NULL);
1708         list_create(&ips->ips_str_list, sizeof (ipnet_t),
1709             offsetof(ipnet_t, ipnet_next));
1710         ipnet_register_netihook(ips);
1711         return (ips);
1712 }
1713 
1714 /* ARGSUSED */
1715 static void
1716 ipnet_stack_fini(netstackid_t stackid, void *arg)
1717 {
1718         ipnet_stack_t   *ips = arg;
1719         ipnetif_t       *ipnetif, *nipnetif;
1720 
1721         if (ips->ips_kstatp != NULL) {
1722                 zoneid_t zoneid;
1723 
1724                 zoneid = netstackid_to_zoneid(stackid);
1725                 net_kstat_delete(net_zoneidtonetid(zoneid), ips->ips_kstatp);
1726         }
1727         if (ips->ips_ndv4 != NULL) {
1728                 VERIFY(net_hook_unregister(ips->ips_ndv4, NH_NIC_EVENTS,
1729                     ips->ips_nicevents) == 0);
1730                 VERIFY(net_protocol_release(ips->ips_ndv4) == 0);
1731         }
1732         if (ips->ips_ndv6 != NULL) {
1733                 VERIFY(net_hook_unregister(ips->ips_ndv6, NH_NIC_EVENTS,
1734                     ips->ips_nicevents) == 0);
1735                 VERIFY(net_protocol_release(ips->ips_ndv6) == 0);
1736         }
1737         hook_free(ips->ips_nicevents);
1738 
1739         for (ipnetif = avl_first(&ips->ips_avl_by_index); ipnetif != NULL;
1740             ipnetif = nipnetif) {
1741                 nipnetif = AVL_NEXT(&ips->ips_avl_by_index, ipnetif);
1742                 ipnetif_remove(ipnetif, ips);
1743         }
1744         avl_destroy(&ips->ips_avl_by_shared);
1745         avl_destroy(&ips->ips_avl_by_index);
1746         avl_destroy(&ips->ips_avl_by_name);
1747         mutex_destroy(&ips->ips_avl_lock);
1748         mutex_destroy(&ips->ips_walkers_lock);
1749         cv_destroy(&ips->ips_walkers_cv);
1750         list_destroy(&ips->ips_str_list);
1751         kmem_free(ips, sizeof (*ips));
1752 }
1753 
1754 /* Do any of the addresses in addrlist belong the supplied zoneid? */
1755 static boolean_t
1756 ipnet_addrs_in_zone(list_t *addrlist, zoneid_t zoneid)
1757 {
1758         ipnetif_addr_t  *ifa;
1759 
1760         for (ifa = list_head(addrlist); ifa != NULL;
1761             ifa = list_next(addrlist, ifa)) {
1762                 if (ifa->ifa_zone == zoneid)
1763                         return (B_TRUE);
1764         }
1765         return (B_FALSE);
1766 }
1767 
1768 /* Should the supplied ipnetif be visible from the supplied zoneid? */
1769 static boolean_t
1770 ipnetif_in_zone(ipnetif_t *ipnetif, zoneid_t zoneid, ipnet_stack_t *ips)
1771 {
1772         int     ret;
1773 
1774         /*
1775          * The global zone has visibility into all interfaces in the global
1776          * stack, and exclusive stack zones have visibility into all
1777          * interfaces in their stack.
1778          */
1779         if (zoneid == GLOBAL_ZONEID ||
1780             ips->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID)
1781                 return (B_TRUE);
1782 
1783         /*
1784          * Shared-stack zones only have visibility for interfaces that have
1785          * addresses in their zone.
1786          */
1787         mutex_enter(&ipnetif->if_addr_lock);
1788         ret = ipnet_addrs_in_zone(&ipnetif->if_ip4addr_list, zoneid) ||
1789             ipnet_addrs_in_zone(&ipnetif->if_ip6addr_list, zoneid);
1790         mutex_exit(&ipnetif->if_addr_lock);
1791         return (ret);
1792 }
1793 
1794 /*
1795  * Verify that any ipnet_t that has a reference to the supplied ipnetif should
1796  * still be allowed to have it open.  A given ipnet_t may no longer be allowed
1797  * to have an ipnetif open if there are no longer any addresses that belong to
1798  * the ipnetif in the ipnet_t's non-global shared-stack zoneid.  If that's the
1799  * case, send the ipnet_t an M_HANGUP.
1800  */
1801 static void
1802 ipnetif_zonecheck(ipnetif_t *ipnetif, ipnet_stack_t *ips)
1803 {
1804         list_t  *strlist = &ips->ips_str_list;
1805         ipnet_t *ipnet;
1806 
1807         ipnet_walkers_inc(ips);
1808         for (ipnet = list_head(strlist); ipnet != NULL;
1809             ipnet = list_next(strlist, ipnet)) {
1810                 if (ipnet->ipnet_if != ipnetif)
1811                         continue;
1812                 if (!ipnetif_in_zone(ipnetif, ipnet->ipnet_zoneid, ips))
1813                         (void) putnextctl(ipnet->ipnet_rq, M_HANGUP);
1814         }
1815         ipnet_walkers_dec(ips);
1816 }
1817 
1818 void
1819 ipnet_walk_if(ipnet_walkfunc_t *cb, void *arg, zoneid_t zoneid)
1820 {
1821         ipnetif_t               *ipnetif;
1822         list_t                  cbdata;
1823         ipnetif_cbdata_t        *cbnode;
1824         netstack_t              *ns;
1825         ipnet_stack_t           *ips;
1826 
1827         /*
1828          * On labeled systems, non-global zones shouldn't see anything
1829          * in /dev/ipnet.
1830          */
1831         if (is_system_labeled() && zoneid != GLOBAL_ZONEID)
1832                 return;
1833 
1834         if ((ns = netstack_find_by_zoneid(zoneid)) == NULL)
1835                 return;
1836 
1837         ips = ns->netstack_ipnet;
1838         list_create(&cbdata, sizeof (ipnetif_cbdata_t),
1839             offsetof(ipnetif_cbdata_t, ic_next));
1840 
1841         mutex_enter(&ips->ips_avl_lock);
1842         for (ipnetif = avl_first(&ips->ips_avl_by_index); ipnetif != NULL;
1843             ipnetif = avl_walk(&ips->ips_avl_by_index, ipnetif, AVL_AFTER)) {
1844                 if (!ipnetif_in_zone(ipnetif, zoneid, ips))
1845                         continue;
1846                 cbnode = kmem_zalloc(sizeof (ipnetif_cbdata_t), KM_SLEEP);
1847                 (void) strlcpy(cbnode->ic_ifname, ipnetif->if_name, LIFNAMSIZ);
1848                 cbnode->ic_dev = ipnetif->if_dev;
1849                 list_insert_head(&cbdata, cbnode);
1850         }
1851         mutex_exit(&ips->ips_avl_lock);
1852 
1853         while ((cbnode = list_head(&cbdata)) != NULL) {
1854                 cb(cbnode->ic_ifname, arg, cbnode->ic_dev);
1855                 list_remove(&cbdata, cbnode);
1856                 kmem_free(cbnode, sizeof (ipnetif_cbdata_t));
1857         }
1858         list_destroy(&cbdata);
1859         netstack_rele(ns);
1860 }
1861 
1862 static int
1863 ipnetif_compare_index(const void *index_ptr, const void *ipnetifp)
1864 {
1865         int64_t index1 = *((int64_t *)index_ptr);
1866         int64_t index2 = (int64_t)((ipnetif_t *)ipnetifp)->if_index;
1867 
1868         return (SIGNOF(index2 - index1));
1869 }
1870 
1871 static int
1872 ipnetif_compare_name(const void *name_ptr, const void *ipnetifp)
1873 {
1874         int     res;
1875 
1876         res = strcmp(((ipnetif_t *)ipnetifp)->if_name, name_ptr);
1877         return (SIGNOF(res));
1878 }
1879 
1880 static int
1881 ipnetif_compare_name_zone(const void *key_ptr, const void *ipnetifp)
1882 {
1883         const uintptr_t *ptr = key_ptr;
1884         const ipnetif_t *ifp;
1885         int             res;
1886 
1887         ifp = ipnetifp;
1888         res = ifp->if_zoneid - ptr[0];
1889         if (res != 0)
1890                 return (SIGNOF(res));
1891         res = strcmp(ifp->if_name, (char *)ptr[1]);
1892         return (SIGNOF(res));
1893 }
1894 
1895 static void
1896 ipnetif_refhold(ipnetif_t *ipnetif)
1897 {
1898         mutex_enter(&ipnetif->if_reflock);
1899         ipnetif->if_refcnt++;
1900         mutex_exit(&ipnetif->if_reflock);
1901 }
1902 
1903 static void
1904 ipnetif_refrele(ipnetif_t *ipnetif)
1905 {
1906         mutex_enter(&ipnetif->if_reflock);
1907         ASSERT(ipnetif->if_refcnt > 0);
1908         if (--ipnetif->if_refcnt == 0)
1909                 ipnetif_free(ipnetif);
1910         else
1911                 mutex_exit(&ipnetif->if_reflock);
1912 }
1913 
1914 static void
1915 ipnet_walkers_inc(ipnet_stack_t *ips)
1916 {
1917         mutex_enter(&ips->ips_walkers_lock);
1918         ips->ips_walkers_cnt++;
1919         mutex_exit(&ips->ips_walkers_lock);
1920 }
1921 
1922 static void
1923 ipnet_walkers_dec(ipnet_stack_t *ips)
1924 {
1925         mutex_enter(&ips->ips_walkers_lock);
1926         ASSERT(ips->ips_walkers_cnt != 0);
1927         if (--ips->ips_walkers_cnt == 0)
1928                 cv_broadcast(&ips->ips_walkers_cv);
1929         mutex_exit(&ips->ips_walkers_lock);
1930 }
1931 
1932 /*ARGSUSED*/
1933 static int
1934 ipobs_bounce_func(hook_event_token_t token, hook_data_t info, void *arg)
1935 {
1936         hook_pkt_observe_t      *hdr;
1937         pfv_t                   func = (pfv_t)arg;
1938         mblk_t                  *mp;
1939 
1940         hdr = (hook_pkt_observe_t *)info;
1941         /*
1942          * Code in ip_input() expects that it is the only one accessing the
1943          * packet.
1944          */
1945         mp = copymsg(hdr->hpo_pkt);
1946         if (mp == NULL)  {
1947                 netstack_t *ns = hdr->hpo_ctx;
1948                 ipnet_stack_t *ips = ns->netstack_ipnet;
1949 
1950                 IPSK_BUMP(ips, ik_dispatchDupDrop);
1951                 return (0);
1952         }
1953 
1954         hdr = (hook_pkt_observe_t *)mp->b_rptr;
1955         hdr->hpo_pkt = mp;
1956 
1957         func(mp);
1958 
1959         return (0);
1960 }
1961 
1962 hook_t *
1963 ipobs_register_hook(netstack_t *ns, pfv_t func)
1964 {
1965         ip_stack_t      *ipst = ns->netstack_ip;
1966         char            name[32];
1967         hook_t          *hook;
1968 
1969         HOOK_INIT(hook, ipobs_bounce_func, "", (void *)func);
1970         VERIFY(hook != NULL);
1971 
1972         /*
1973          * To register multiple hooks with he same callback function,
1974          * a unique name is needed.
1975          */
1976         (void) snprintf(name, sizeof (name), "ipobserve_%p", (void *)hook);
1977         hook->h_name = strdup(name);
1978 
1979         (void) net_hook_register(ipst->ips_ip4_observe_pr, NH_OBSERVE, hook);
1980         (void) net_hook_register(ipst->ips_ip6_observe_pr, NH_OBSERVE, hook);
1981 
1982         return (hook);
1983 }
1984 
1985 void
1986 ipobs_unregister_hook(netstack_t *ns, hook_t *hook)
1987 {
1988         ip_stack_t      *ipst = ns->netstack_ip;
1989 
1990         (void) net_hook_unregister(ipst->ips_ip4_observe_pr, NH_OBSERVE, hook);
1991 
1992         (void) net_hook_unregister(ipst->ips_ip6_observe_pr, NH_OBSERVE, hook);
1993 
1994         strfree(hook->h_name);
1995 
1996         hook_free(hook);
1997 }
1998 
1999 /* ******************************************************************** */
2000 /* BPF Functions below                                                  */
2001 /* ******************************************************************** */
2002 
2003 /*
2004  * Convenience function to make mapping a zoneid to an ipnet_stack_t easy.
2005  */
2006 ipnet_stack_t *
2007 ipnet_find_by_zoneid(zoneid_t zoneid)
2008 {
2009         netstack_t      *ns;
2010 
2011         VERIFY((ns = netstack_find_by_zoneid(zoneid)) != NULL);
2012         return (ns->netstack_ipnet);
2013 }
2014 
2015 /*
2016  * Functions, such as the above ipnet_find_by_zoneid(), will return a
2017  * pointer to ipnet_stack_t by calling a netstack lookup function.
2018  * The netstack_find_*() functions return a pointer after doing a "hold"
2019  * on the data structure and thereby require a "release" when the caller
2020  * is finished with it. We need to mirror that API here and thus a caller
2021  * of ipnet_find_by_zoneid() is required to call ipnet_rele().
2022  */
2023 void
2024 ipnet_rele(ipnet_stack_t *ips)
2025 {
2026         netstack_rele(ips->ips_netstack);
2027 }
2028 
2029 /*
2030  */
2031 void
2032 ipnet_set_itap(bpf_itap_fn_t tapfunc)
2033 {
2034         ipnet_itap = tapfunc;
2035 }
2036 
2037 /*
2038  * The list of interfaces available via ipnet is private for each zone,
2039  * so the AVL tree of each zone must be searched for a given name, even
2040  * if all names are unique.
2041  */
2042 int
2043 ipnet_open_byname(const char *name, ipnetif_t **ptr, zoneid_t zoneid)
2044 {
2045         ipnet_stack_t   *ips;
2046         ipnetif_t       *ipnetif;
2047 
2048         ASSERT(ptr != NULL);
2049         VERIFY((ips = ipnet_find_by_zoneid(zoneid)) != NULL);
2050 
2051         mutex_enter(&ips->ips_avl_lock);
2052 
2053         /*
2054          * Shared instance zone?
2055          */
2056         if (netstackid_to_zoneid(zoneid_to_netstackid(zoneid)) != zoneid) {
2057                 uintptr_t key[2] = { zoneid, (uintptr_t)name };
2058 
2059                 ipnetif = avl_find(&ips->ips_avl_by_shared, (void *)key, NULL);
2060         } else {
2061                 ipnetif = avl_find(&ips->ips_avl_by_name, (void *)name, NULL);
2062         }
2063         if (ipnetif != NULL)
2064                 ipnetif_refhold(ipnetif);
2065         mutex_exit(&ips->ips_avl_lock);
2066 
2067         *ptr = ipnetif;
2068         ipnet_rele(ips);
2069 
2070         if (ipnetif == NULL)
2071                 return (ESRCH);
2072         return (0);
2073 }
2074 
2075 void
2076 ipnet_close_byhandle(ipnetif_t *ifp)
2077 {
2078         ASSERT(ifp != NULL);
2079         ipnetif_refrele(ifp);
2080 }
2081 
2082 const char *
2083 ipnet_name(ipnetif_t *ifp)
2084 {
2085         ASSERT(ifp != NULL);
2086         return (ifp->if_name);
2087 }
2088 
2089 /*
2090  * To find the linkid for a given name, it is necessary to know which zone
2091  * the interface name belongs to and to search the avl tree for that zone
2092  * as there is no master list of all interfaces and which zone they belong
2093  * to. It is assumed that the caller of this function is somehow already
2094  * working with the ipnet interfaces and hence the ips_event_lock is held.
2095  * When BPF calls into this function, it is doing so because of an event
2096  * in ipnet, and thus ipnet holds the ips_event_lock. Thus the datalink id
2097  * value returned has meaning without the need for grabbing a hold on the
2098  * owning structure.
2099  */
2100 int
2101 ipnet_get_linkid_byname(const char *name, uint_t *idp, zoneid_t zoneid)
2102 {
2103         ipnet_stack_t   *ips;
2104         ipnetif_t       *ifp;
2105 
2106         VERIFY((ips = ipnet_find_by_zoneid(zoneid)) != NULL);
2107         ASSERT(mutex_owned(&ips->ips_event_lock));
2108 
2109         mutex_enter(&ips->ips_avl_lock);
2110         ifp = avl_find(&ips->ips_avl_by_name, (void *)name, NULL);
2111         if (ifp != NULL)
2112                 *idp = (uint_t)ifp->if_index;
2113 
2114         /*
2115          * Shared instance zone?
2116          */
2117         if (netstackid_to_zoneid(zoneid_to_netstackid(zoneid)) != zoneid) {
2118                 uintptr_t key[2] = { zoneid, (uintptr_t)name };
2119 
2120                 ifp = avl_find(&ips->ips_avl_by_shared, (void *)key, NULL);
2121                 if (ifp != NULL)
2122                         *idp = (uint_t)ifp->if_index;
2123         }
2124 
2125         mutex_exit(&ips->ips_avl_lock);
2126         ipnet_rele(ips);
2127 
2128         if (ifp == NULL)
2129                 return (ESRCH);
2130         return (0);
2131 }
2132 
2133 /*
2134  * Strictly speaking, there is no such thing as a "client" in ipnet, like
2135  * there is in mac. BPF only needs to have this because it is required as
2136  * part of interfacing correctly with mac. The reuse of the original
2137  * ipnetif_t as a client poses no danger, so long as it is done with its
2138  * own ref-count'd hold that is given up on close.
2139  */
2140 int
2141 ipnet_client_open(ipnetif_t *ptr, ipnetif_t **result)
2142 {
2143         ASSERT(ptr != NULL);
2144         ASSERT(result != NULL);
2145         ipnetif_refhold(ptr);
2146         *result = ptr;
2147 
2148         return (0);
2149 }
2150 
2151 void
2152 ipnet_client_close(ipnetif_t *ptr)
2153 {
2154         ASSERT(ptr != NULL);
2155         ipnetif_refrele(ptr);
2156 }
2157 
2158 /*
2159  * This is called from BPF when it needs to start receiving packets
2160  * from ipnet.
2161  *
2162  * The use of the ipnet_t structure here is somewhat lightweight when
2163  * compared to how it is used elsewhere but it already has all of the
2164  * right fields in it, so reuse here doesn't seem out of order. Its
2165  * primary purpose here is to provide the means to store pointers for
2166  * use when ipnet_promisc_remove() needs to be called.
2167  *
2168  * This should never be called for the IPNET_MINOR_LO device as it is
2169  * never created via ipnetif_create.
2170  */
2171 /*ARGSUSED*/
2172 int
2173 ipnet_promisc_add(void *handle, uint_t how, void *data, uintptr_t *mhandle,
2174     int flags)
2175 {
2176         ip_stack_t      *ipst;
2177         netstack_t      *ns;
2178         ipnetif_t       *ifp;
2179         ipnet_t         *ipnet;
2180         char            name[32];
2181         int             error;
2182 
2183         ifp = (ipnetif_t *)handle;
2184         ns = netstack_find_by_zoneid(ifp->if_zoneid);
2185 
2186         if ((how == DL_PROMISC_PHYS) || (how == DL_PROMISC_MULTI)) {
2187                 error = ipnet_join_allmulti(ifp, ns->netstack_ipnet);
2188                 if (error != 0)
2189                         return (error);
2190         } else {
2191                 return (EINVAL);
2192         }
2193 
2194         ipnet = kmem_zalloc(sizeof (*ipnet), KM_SLEEP);
2195         ipnet->ipnet_if = ifp;
2196         ipnet->ipnet_ns = ns;
2197         ipnet->ipnet_flags = flags;
2198 
2199         if ((ifp->if_flags & IPNETIF_LOOPBACK) != 0) {
2200                 ipnet->ipnet_acceptfn = ipnet_loaccept;
2201         } else {
2202                 ipnet->ipnet_acceptfn = ipnet_accept;
2203         }
2204 
2205         /*
2206          * To register multiple hooks with the same callback function,
2207          * a unique name is needed.
2208          */
2209         HOOK_INIT(ipnet->ipnet_hook, ipnet_bpf_bounce, "", ipnet);
2210         (void) snprintf(name, sizeof (name), "ipnet_promisc_%p",
2211             (void *)ipnet->ipnet_hook);
2212         ipnet->ipnet_hook->h_name = strdup(name);
2213         ipnet->ipnet_data = data;
2214         ipnet->ipnet_zoneid = ifp->if_zoneid;
2215 
2216         ipst = ns->netstack_ip;
2217 
2218         error = net_hook_register(ipst->ips_ip4_observe_pr, NH_OBSERVE,
2219             ipnet->ipnet_hook);
2220         if (error != 0)
2221                 goto regfail;
2222 
2223         error = net_hook_register(ipst->ips_ip6_observe_pr, NH_OBSERVE,
2224             ipnet->ipnet_hook);
2225         if (error != 0) {
2226                 (void) net_hook_unregister(ipst->ips_ip4_observe_pr,
2227                     NH_OBSERVE, ipnet->ipnet_hook);
2228                 goto regfail;
2229         }
2230 
2231         *mhandle = (uintptr_t)ipnet;
2232         netstack_rele(ns);
2233 
2234         return (0);
2235 
2236 regfail:
2237         cmn_err(CE_WARN, "net_hook_register failed: %d", error);
2238         strfree(ipnet->ipnet_hook->h_name);
2239         hook_free(ipnet->ipnet_hook);
2240         netstack_rele(ns);
2241         return (error);
2242 }
2243 
2244 void
2245 ipnet_promisc_remove(void *data)
2246 {
2247         ip_stack_t      *ipst;
2248         ipnet_t         *ipnet;
2249         hook_t          *hook;
2250 
2251         ipnet = data;
2252         ipst = ipnet->ipnet_ns->netstack_ip;
2253         hook = ipnet->ipnet_hook;
2254 
2255         VERIFY(net_hook_unregister(ipst->ips_ip4_observe_pr, NH_OBSERVE,
2256             hook) == 0);
2257 
2258         VERIFY(net_hook_unregister(ipst->ips_ip6_observe_pr, NH_OBSERVE,
2259             hook) == 0);
2260 
2261         strfree(hook->h_name);
2262 
2263         hook_free(hook);
2264 
2265         kmem_free(ipnet, sizeof (*ipnet));
2266 }
2267 
2268 /*
2269  * arg here comes from the ipnet_t allocated in ipnet_promisc_add.
2270  * An important field from that structure is "ipnet_data" that
2271  * contains the "data" pointer passed into ipnet_promisc_add: it needs
2272  * to be passed back to bpf when we call into ipnet_itap.
2273  *
2274  * ipnet_itap is set by ipnet_set_bpfattach, which in turn is called
2275  * from BPF.
2276  */
2277 /*ARGSUSED*/
2278 static int
2279 ipnet_bpf_bounce(hook_event_token_t token, hook_data_t info, void *arg)
2280 {
2281         hook_pkt_observe_t      *hdr;
2282         ipnet_addrp_t           src;
2283         ipnet_addrp_t           dst;
2284         ipnet_stack_t           *ips;
2285         ipnet_t                 *ipnet;
2286         mblk_t                  *netmp;
2287         mblk_t                  *mp;
2288 
2289         hdr = (hook_pkt_observe_t *)info;
2290         mp = hdr->hpo_pkt;
2291         ipnet = (ipnet_t *)arg;
2292         ips = ((netstack_t *)hdr->hpo_ctx)->netstack_ipnet;
2293 
2294         netmp = hdr->hpo_pkt->b_cont;
2295         src.iap_family = hdr->hpo_family;
2296         dst.iap_family = hdr->hpo_family;
2297 
2298         if (hdr->hpo_family == AF_INET) {
2299                 src.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_src;
2300                 dst.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_dst;
2301         } else {
2302                 src.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_src;
2303                 dst.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_dst;
2304         }
2305 
2306         if (!(*ipnet->ipnet_acceptfn)(ipnet, hdr, &src, &dst)) {
2307                 IPSK_BUMP(ips, ik_acceptFail);
2308                 return (0);
2309         }
2310         IPSK_BUMP(ips, ik_acceptOk);
2311 
2312         ipnet_itap(ipnet->ipnet_data, mp,
2313             hdr->hpo_htype == htons(IPOBS_HOOK_OUTBOUND),
2314             ntohl(hdr->hpo_pktlen) + MBLKL(mp));
2315 
2316         return (0);
2317 }
2318 
2319 /*
2320  * clone'd ipnetif_t's are created when a shared IP instance zone comes
2321  * to life and configures an IP address. The model that BPF uses is that
2322  * each interface must have a unique pointer and each interface must be
2323  * representative of what it can capture. They are limited to one DLT
2324  * per interface and one zone per interface. Thus every interface that
2325  * can be seen in a zone must be announced via an attach to bpf. For
2326  * shared instance zones, this means the ipnet driver needs to detect
2327  * when an address is added to an interface in a zone for the first
2328  * time (and also when the last address is removed.)
2329  */
2330 static ipnetif_t *
2331 ipnetif_clone_create(ipnetif_t *ifp, zoneid_t zoneid)
2332 {
2333         uintptr_t       key[2] = { zoneid, (uintptr_t)ifp->if_name };
2334         ipnet_stack_t   *ips = ifp->if_stackp;
2335         avl_index_t     where = 0;
2336         ipnetif_t       *newif;
2337 
2338         mutex_enter(&ips->ips_avl_lock);
2339         newif = avl_find(&ips->ips_avl_by_shared, (void *)key, &where);
2340         if (newif != NULL) {
2341                 ipnetif_refhold(newif);
2342                 newif->if_sharecnt++;
2343                 mutex_exit(&ips->ips_avl_lock);
2344                 return (newif);
2345         }
2346 
2347         newif = ipnet_alloc_if(ips);
2348         if (newif == NULL) {
2349                 mutex_exit(&ips->ips_avl_lock);
2350                 return (NULL);
2351         }
2352 
2353         newif->if_refcnt = 1;
2354         newif->if_sharecnt = 1;
2355         newif->if_zoneid = zoneid;
2356         (void) strlcpy(newif->if_name, ifp->if_name, LIFNAMSIZ);
2357         newif->if_flags = ifp->if_flags & IPNETIF_LOOPBACK;
2358         newif->if_index = ifp->if_index;
2359 
2360         avl_insert(&ips->ips_avl_by_shared, newif, where);
2361         mutex_exit(&ips->ips_avl_lock);
2362 
2363         return (newif);
2364 }
2365 
2366 static void
2367 ipnetif_clone_release(ipnetif_t *ipnetif)
2368 {
2369         boolean_t       dofree = B_FALSE;
2370         boolean_t       doremove = B_FALSE;
2371         ipnet_stack_t   *ips = ipnetif->if_stackp;
2372 
2373         mutex_enter(&ipnetif->if_reflock);
2374         ASSERT(ipnetif->if_refcnt > 0);
2375         if (--ipnetif->if_refcnt == 0)
2376                 dofree = B_TRUE;
2377         ASSERT(ipnetif->if_sharecnt > 0);
2378         if (--ipnetif->if_sharecnt == 0)
2379                 doremove = B_TRUE;
2380         mutex_exit(&ipnetif->if_reflock);
2381         if (doremove) {
2382                 mutex_enter(&ips->ips_avl_lock);
2383                 avl_remove(&ips->ips_avl_by_shared, ipnetif);
2384                 mutex_exit(&ips->ips_avl_lock);
2385         }
2386         if (dofree) {
2387                 ASSERT(ipnetif->if_sharecnt == 0);
2388                 ipnetif_free(ipnetif);
2389         }
2390 }