/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 1990 Mentat Inc.
 */

/*
 * This file contains the interface control functions for IP.
 */

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/dlpi.h>
#include <sys/stropts.h>
#include <sys/strsun.h>
#include <sys/sysmacros.h>
#include <sys/strsubr.h>
#include <sys/strlog.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/debug.h>
#include <sys/zone.h>
#include <sys/sunldi.h>
#include <sys/file.h>
#include <sys/bitmap.h>
#include <sys/cpuvar.h>
#include <sys/time.h>
#include <sys/ctype.h>
#include <sys/kmem.h>
#include <sys/systm.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/isa_defs.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/if_types.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <sys/sockio.h>
#include <netinet/in.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet/igmp_var.h>
#include <sys/policy.h>
#include <sys/ethernet.h>
#include <sys/callb.h>
#include <sys/md5.h>

#include <inet/common.h>	/* for various inet/mi.h and inet/nd.h needs */
#include <inet/mi.h>
#include <inet/nd.h>
#include <inet/tunables.h>
#include <inet/arp.h>
#include <inet/ip_arp.h>
#include <inet/mib2.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/ip6_asp.h>
#include <inet/tcp.h>
#include <inet/ip_multi.h>
#include <inet/ip_ire.h>
#include <inet/ip_ftable.h>
#include <inet/ip_rts.h>
#include <inet/ip_ndp.h>
#include <inet/ip_if.h>
#include <inet/ip_impl.h>
#include <inet/sctp_ip.h>
#include <inet/ip_netinfo.h>
#include <inet/ilb_ip.h>

#include <netinet/igmp.h>
#include <inet/ip_listutils.h>
#include <inet/ipclassifier.h>
#include <sys/mac_client.h>
#include <sys/dld.h>
#include <sys/mac_flow.h>

#include <sys/systeminfo.h>
#include <sys/bootconf.h>

#include <sys/tsol/tndb.h>
#include <sys/tsol/tnet.h>

#include <inet/rawip_impl.h>	/* needed for icmp_stack_t */
#include <inet/udp_impl.h>	/* needed for udp_stack_t */

/* The character which tells where the ill_name ends */
#define	IPIF_SEPARATOR_CHAR	':'

/* IP ioctl function table entry */
typedef struct ipft_s {
	int	ipft_cmd;
	pfi_t	ipft_pfi;
	int	ipft_min_size;
	int	ipft_flags;
} ipft_t;
#define	IPFT_F_NO_REPLY		0x1	/* IP ioctl does not expect any reply */
#define	IPFT_F_SELF_REPLY	0x2	/* ioctl callee does the ioctl reply */

static int	nd_ill_forward_get(queue_t *, mblk_t *, caddr_t, cred_t *);
static int	nd_ill_forward_set(queue_t *q, mblk_t *mp,
		    char *value, caddr_t cp, cred_t *ioc_cr);

static boolean_t ill_is_quiescent(ill_t *);
static boolean_t ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask);
static ip_m_t	*ip_m_lookup(t_uscalar_t mac_type);
static int	ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q,
    mblk_t *mp);
static int	ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp);
static int	ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t, in6_addr_t,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp,
    int ioccmd, struct linkblk *li);
static ipaddr_t	ip_subnet_mask(ipaddr_t addr, ipif_t **, ip_stack_t *);
static void	ip_wput_ioctl(queue_t *q, mblk_t *mp);
static void	ipsq_flush(ill_t *ill);

static int	ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static void	ipsq_delete(ipsq_t *);

static ipif_t	*ipif_allocate(ill_t *ill, int id, uint_t ire_type,
    boolean_t initialize, boolean_t insert, int *errorp);
static ire_t	**ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep);
static void	ipif_delete_bcast_ires(ipif_t *ipif);
static int	ipif_add_ires_v4(ipif_t *, boolean_t);
static boolean_t ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif,
		    boolean_t isv6);
static int	ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp);
static void	ipif_free(ipif_t *ipif);
static void	ipif_free_tail(ipif_t *ipif);
static void	ipif_set_default(ipif_t *ipif);
static int	ipif_set_values(queue_t *q, mblk_t *mp,
    char *interf_name, uint_t *ppa);
static int	ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp,
    queue_t *q);
static ipif_t	*ipif_lookup_on_name(char *name, size_t namelen,
    boolean_t do_alloc, boolean_t *exists, boolean_t isv6, zoneid_t zoneid,
    ip_stack_t *);
static ipif_t	*ipif_lookup_on_name_async(char *name, size_t namelen,
    boolean_t isv6, zoneid_t zoneid, queue_t *q, mblk_t *mp, ipsq_func_t func,
    int *error, ip_stack_t *);

static int	ill_alloc_ppa(ill_if_t *, ill_t *);
static void	ill_delete_interface_type(ill_if_t *);
static int	ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q);
static void	ill_dl_down(ill_t *ill);
static void	ill_down(ill_t *ill);
static void	ill_down_ipifs(ill_t *, boolean_t);
static void	ill_free_mib(ill_t *ill);
static void	ill_glist_delete(ill_t *);
static void	ill_phyint_reinit(ill_t *ill);
static void	ill_set_nce_router_flags(ill_t *, boolean_t);
static void	ill_set_phys_addr_tail(ipsq_t *, queue_t *, mblk_t *, void *);
static void	ill_replumb_tail(ipsq_t *, queue_t *, mblk_t *, void *);

static ip_v6intfid_func_t ip_ether_v6intfid, ip_ib_v6intfid;
static ip_v6intfid_func_t ip_ipv4_v6intfid, ip_ipv6_v6intfid;
static ip_v6intfid_func_t ip_ipmp_v6intfid, ip_nodef_v6intfid;
static ip_v6intfid_func_t ip_ipv4_v6destintfid, ip_ipv6_v6destintfid;
static ip_v4mapinfo_func_t ip_ether_v4_mapping;
static ip_v6mapinfo_func_t ip_ether_v6_mapping;
static ip_v4mapinfo_func_t ip_ib_v4_mapping;
static ip_v6mapinfo_func_t ip_ib_v6_mapping;
static ip_v4mapinfo_func_t ip_mbcast_mapping;
static void	ip_cgtp_bcast_add(ire_t *, ip_stack_t *);
static void	ip_cgtp_bcast_delete(ire_t *, ip_stack_t *);
static void	phyint_free(phyint_t *);

static void ill_capability_dispatch(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_id_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_vrrp_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_hcksum_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void ill_capability_hcksum_reset_fill(ill_t *, mblk_t *);
static void ill_capability_zerocopy_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void ill_capability_zerocopy_reset_fill(ill_t *, mblk_t *);
static void ill_capability_dld_reset_fill(ill_t *, mblk_t *);
static void ill_capability_dld_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void ill_capability_dld_enable(ill_t *);
static void ill_capability_ack_thr(void *);
static void ill_capability_lso_enable(ill_t *);

static ill_t	*ill_prev_usesrc(ill_t *);
static int	ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t);
static void	ill_disband_usesrc_group(ill_t *);
static void	ip_sioctl_garp_reply(mblk_t *, ill_t *, void *, int);

#ifdef DEBUG
static void	ill_trace_cleanup(const ill_t *);
static void	ipif_trace_cleanup(const ipif_t *);
#endif

static void	ill_dlpi_clear_deferred(ill_t *ill);

/*
 * if we go over the memory footprint limit more than once in this msec
 * interval, we'll start pruning aggressively.
 */
int ip_min_frag_prune_time = 0;

static ipft_t	ip_ioctl_ftbl[] = {
	{ IP_IOC_IRE_DELETE, ip_ire_delete, sizeof (ipid_t), 0 },
	{ IP_IOC_IRE_DELETE_NO_REPLY, ip_ire_delete, sizeof (ipid_t),
	    IPFT_F_NO_REPLY },
	{ IP_IOC_RTS_REQUEST, ip_rts_request, 0, IPFT_F_SELF_REPLY },
	{ 0 }
};

/* Simple ICMP IP Header Template */
static ipha_t icmp_ipha = {
	IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
};

static uchar_t	ip_six_byte_all_ones[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };

static ip_m_t	ip_m_tbl[] = {
	{ DL_ETHER, IFT_ETHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_CSMACD, IFT_ISO88023, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_TPB, IFT_ISO88024, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_TPR, IFT_ISO88025, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_FDDI, IFT_FDDI, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_IB, IFT_IB, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ib_v4_mapping, ip_ib_v6_mapping, ip_ib_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_IPV4, IFT_IPV4, IPPROTO_ENCAP, IPPROTO_IPV6,
	    ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid,
	    ip_ipv4_v6destintfid },
	{ DL_IPV6, IFT_IPV6, IPPROTO_ENCAP, IPPROTO_IPV6,
	    ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv6_v6intfid,
	    ip_ipv6_v6destintfid },
	{ DL_6TO4, IFT_6TO4, IPPROTO_ENCAP, IPPROTO_IPV6,
	    ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid,
	    ip_nodef_v6intfid },
	{ SUNW_DL_VNI, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    NULL, NULL, ip_nodef_v6intfid, ip_nodef_v6intfid },
	{ SUNW_DL_IPMP, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    NULL, NULL, ip_ipmp_v6intfid, ip_nodef_v6intfid },
	{ DL_OTHER, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid }
};
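
/*
 * ip_m_lookup() (declared above, defined later in this file) selects an
 * entry from ip_m_tbl[] by the DLPI mac type in its first field; the
 * DL_OTHER entry serves as the catch-all, so an unrecognized device gets
 * Ethernet-style multicast mappings but no stable IPv6 interface id
 * (ip_nodef_v6intfid).
 */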

static ill_t	ill_null;		/* Empty ILL for init. */
char	ipif_loopback_name[] = "lo0";

/* These are used by all IP network modules. */
sin6_t	sin6_null;	/* Zero address for quick clears */
sin_t	sin_null;	/* Zero address for quick clears */

/* When set search for unused ipif_seqid */
static ipif_t	ipif_zero;

/*
 * ppa arena is created after these many
 * interfaces have been plumbed.
 */
uint_t	ill_no_arena = 12;	/* Settable in /etc/system */

/*
 * Allocate per-interface mibs.
 * Returns B_TRUE if ok, B_FALSE otherwise.
 * ipsq may not yet be allocated (loopback case).
 */
static boolean_t
ill_allocate_mibs(ill_t *ill)
{
	/* Already allocated? */
	if (ill->ill_ip_mib != NULL) {
		if (ill->ill_isv6)
			ASSERT(ill->ill_icmp6_mib != NULL);
		return (B_TRUE);
	}

	ill->ill_ip_mib = kmem_zalloc(sizeof (*ill->ill_ip_mib),
	    KM_NOSLEEP);
	if (ill->ill_ip_mib == NULL) {
		return (B_FALSE);
	}

	/* Setup static information */
	SET_MIB(ill->ill_ip_mib->ipIfStatsEntrySize,
	    sizeof (mib2_ipIfStatsEntry_t));
	if (ill->ill_isv6) {
		ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv6;
		SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
		    sizeof (mib2_ipv6AddrEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
		    sizeof (mib2_ipv6RouteEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
		    sizeof (mib2_ipv6NetToMediaEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
		    sizeof (ipv6_member_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
		    sizeof (ipv6_grpsrc_t));
	} else {
		ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv4;
		SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
		    sizeof (mib2_ipAddrEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
		    sizeof (mib2_ipRouteEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
		    sizeof (mib2_ipNetToMediaEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
		    sizeof (ip_member_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
		    sizeof (ip_grpsrc_t));

		/*
		 * For a v4 ill, we are done at this point, because per ill
		 * icmp mibs are only used for v6.
		 */
		return (B_TRUE);
	}

	ill->ill_icmp6_mib = kmem_zalloc(sizeof (*ill->ill_icmp6_mib),
	    KM_NOSLEEP);
	if (ill->ill_icmp6_mib == NULL) {
		kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib));
		ill->ill_ip_mib = NULL;
		return (B_FALSE);
	}
	/* static icmp info */
	ill->ill_icmp6_mib->ipv6IfIcmpEntrySize =
	    sizeof (mib2_ipv6IfIcmpEntry_t);
	/*
	 * The ipIfStatsIfindex and ipv6IfIcmpIndex will be assigned later
	 * after the phyint merge occurs in ipif_set_values -> ill_glist_insert
	 * -> ill_phyint_reinit
	 */
	return (B_TRUE);
}

/*
 * Completely vaporize a lower level tap and all associated interfaces.
 * ill_delete is called only out of ip_close when the device control
 * stream is being closed.
 */
void
ill_delete(ill_t *ill)
{
	ipif_t	*ipif;
	ill_t	*prev_ill;
	ip_stack_t	*ipst = ill->ill_ipst;

	/*
	 * ill_delete may be forcibly entering the ipsq. The previous
	 * ioctl may not have completed and may need to be aborted.
	 * ipsq_flush takes care of it. If we don't need to enter the
	 * ipsq forcibly, the 2nd invocation of ipsq_flush in
	 * ill_delete_tail is sufficient.
	 */
	ipsq_flush(ill);

	/*
	 * Nuke all interfaces. ipif_free will take down the interface,
	 * remove it from the list, and free the data structure.
	 * Walk down the ipif list and remove the logical interfaces
	 * first before removing the main ipif. We can't unplumb
	 * zeroth interface first in the case of IPv6 as update_conn_ill
	 * -> ip_ll_multireq dereferences ill_ipif for checking
	 * POINTOPOINT.
	 *
	 * If ill_ipif was not properly initialized (i.e., low on memory),
	 * then there are no interfaces to clean up. In this case just clean
	 * up the ill.
	 */
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		ipif_free(ipif);

	/*
	 * clean out all the nce_t entries that depend on this
	 * ill for the ill_phys_addr.
	 */
	nce_flush(ill, B_TRUE);

	/* Clean up msgs on pending upcalls for mrouted */
	reset_mrt_ill(ill);

	update_conn_ill(ill, ipst);

	/*
	 * Remove multicast references added as a result of calls to
	 * ip_join_allmulti().
	 */
	ip_purge_allmulti(ill);

	/*
	 * If the ill being deleted is under IPMP, boot it out of the illgrp.
	 */
	if (IS_UNDER_IPMP(ill))
		ipmp_ill_leave_illgrp(ill);

	/*
	 * ill_down will arrange to blow off any IRE's dependent on this
	 * ILL, and shut down fragmentation reassembly.
	 */
	ill_down(ill);

	/* Let SCTP know, so that it can remove this from its list. */
	sctp_update_ill(ill, SCTP_ILL_REMOVE);

	/*
	 * Walk all CONNs that can have a reference on an ire or nce for this
	 * ill (we actually walk all that now have stale references).
	 */
	ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ipst);

	/* With IPv6 we have dce_ifindex. Cleanup for neatness */
	if (ill->ill_isv6)
		dce_cleanup(ill->ill_phyint->phyint_ifindex, ipst);

	/*
	 * If an address on this ILL is being used as a source address then
	 * clear out the pointers in other ILLs that point to this ILL.
	 */
	rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER);
	if (ill->ill_usesrc_grp_next != NULL) {
		if (ill->ill_usesrc_ifindex == 0) { /* usesrc ILL ? */
			ill_disband_usesrc_group(ill);
		} else {	/* consumer of the usesrc ILL */
			prev_ill = ill_prev_usesrc(ill);
			prev_ill->ill_usesrc_grp_next =
			    ill->ill_usesrc_grp_next;
		}
	}
	rw_exit(&ipst->ips_ill_g_usesrc_lock);
}

static void
ipif_non_duplicate(ipif_t *ipif)
{
	ill_t *ill = ipif->ipif_ill;

	mutex_enter(&ill->ill_lock);
	if (ipif->ipif_flags & IPIF_DUPLICATE) {
		ipif->ipif_flags &= ~IPIF_DUPLICATE;
		ASSERT(ill->ill_ipif_dup_count > 0);
		ill->ill_ipif_dup_count--;
	}
	mutex_exit(&ill->ill_lock);
}

/*
 * ill_delete_tail is called from ip_modclose after all references
 * to the closing ill are gone. The wait is done in ip_modclose.
 */
void
ill_delete_tail(ill_t *ill)
{
	mblk_t	**mpp;
	ipif_t	*ipif;
	ip_stack_t	*ipst = ill->ill_ipst;

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		ipif_non_duplicate(ipif);
		(void) ipif_down_tail(ipif);
	}

	ASSERT(ill->ill_ipif_dup_count == 0);

	/*
	 * If polling capability is enabled (which signifies direct
	 * upcall into IP and driver has ill saved as a handle),
	 * we need to make sure that unbind has completed before we
	 * let the ill disappear and driver no longer has any reference
	 * to this ill.
	 */
	mutex_enter(&ill->ill_lock);
	while (ill->ill_state_flags & ILL_DL_UNBIND_IN_PROGRESS)
		cv_wait(&ill->ill_cv, &ill->ill_lock);
	mutex_exit(&ill->ill_lock);
	ASSERT(!(ill->ill_capabilities &
	    (ILL_CAPAB_DLD | ILL_CAPAB_DLD_POLL | ILL_CAPAB_DLD_DIRECT)));

	if (ill->ill_net_type != IRE_LOOPBACK)
		qprocsoff(ill->ill_rq);

	/*
	 * We do an ipsq_flush once again now. New messages could have
	 * landed up from below (M_ERROR or M_HANGUP). Similarly, ioctls
	 * could also have landed up if an ioctl thread had looked up
	 * the ill before we set the ILL_CONDEMNED flag, but had not yet
	 * enqueued the ioctl when we did the ipsq_flush last time.
	 */
	ipsq_flush(ill);

	/*
	 * Free capabilities.
	 */
	if (ill->ill_hcksum_capab != NULL) {
		kmem_free(ill->ill_hcksum_capab, sizeof (ill_hcksum_capab_t));
		ill->ill_hcksum_capab = NULL;
	}

	if (ill->ill_zerocopy_capab != NULL) {
		kmem_free(ill->ill_zerocopy_capab,
		    sizeof (ill_zerocopy_capab_t));
		ill->ill_zerocopy_capab = NULL;
	}

	if (ill->ill_lso_capab != NULL) {
		kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t));
		ill->ill_lso_capab = NULL;
	}

	if (ill->ill_dld_capab != NULL) {
		kmem_free(ill->ill_dld_capab, sizeof (ill_dld_capab_t));
		ill->ill_dld_capab = NULL;
	}

	/* Clean up ill_allowed_ips* related state */
	if (ill->ill_allowed_ips != NULL) {
		ASSERT(ill->ill_allowed_ips_cnt > 0);
		kmem_free(ill->ill_allowed_ips,
		    ill->ill_allowed_ips_cnt * sizeof (in6_addr_t));
		ill->ill_allowed_ips = NULL;
		ill->ill_allowed_ips_cnt = 0;
	}

	while (ill->ill_ipif != NULL)
		ipif_free_tail(ill->ill_ipif);

	/*
	 * We have removed all references to ilm from conn and the ones joined
	 * within the kernel.
	 *
	 * We don't walk conns, mrts and ires because
	 *
	 * 1) update_conn_ill and reset_mrt_ill cleans up conns and mrts.
	 * 2) ill_down -> ill_downi walks all the ires and cleans up
	 *    ill references.
	 */

	/*
	 * If this ill is an IPMP meta-interface, blow away the illgrp. This
	 * is safe to do because the illgrp has already been unlinked from the
	 * group by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find it.
	 */
	if (IS_IPMP(ill)) {
		ipmp_illgrp_destroy(ill->ill_grp);
		ill->ill_grp = NULL;
	}

	if (ill->ill_mphysaddr_list != NULL) {
		multiphysaddr_t *mpa, *tmpa;

		mpa = ill->ill_mphysaddr_list;
		ill->ill_mphysaddr_list = NULL;
		while (mpa) {
			tmpa = mpa->mpa_next;
			kmem_free(mpa, sizeof (*mpa));
			mpa = tmpa;
		}
	}

	/*
	 * Take us out of the list of ILLs. ill_glist_delete -> phyint_free
	 * could free the phyint. No more reference to the phyint after this
	 * point.
	 */
	(void) ill_glist_delete(ill);

	if (ill->ill_frag_ptr != NULL) {
		uint_t count;

		for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) {
			mutex_destroy(&ill->ill_frag_hash_tbl[count].ipfb_lock);
		}
		mi_free(ill->ill_frag_ptr);
		ill->ill_frag_ptr = NULL;
		ill->ill_frag_hash_tbl = NULL;
	}

	freemsg(ill->ill_nd_lla_mp);
	/* Free all retained control messages. */
	mpp = &ill->ill_first_mp_to_free;
	do {
		while (mpp[0]) {
			mblk_t  *mp;
			mblk_t  *mp1;

			mp = mpp[0];
			mpp[0] = mp->b_next;
			for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) {
				mp1->b_next = NULL;
				mp1->b_prev = NULL;
			}
			freemsg(mp);
		}
	} while (mpp++ != &ill->ill_last_mp_to_free);

	ill_free_mib(ill);

#ifdef DEBUG
	ill_trace_cleanup(ill);
#endif

	/* The default multicast interface might have changed */
	ire_increment_multicast_generation(ipst, ill->ill_isv6);

	/* Drop refcnt here */
	netstack_rele(ill->ill_ipst->ips_netstack);
	ill->ill_ipst = NULL;
}

static void
ill_free_mib(ill_t *ill)
{
	ip_stack_t *ipst = ill->ill_ipst;

	/*
	 * MIB statistics must not be lost, so when an interface
	 * goes away the counter values will be added to the global
	 * MIBs.
	 */
	if (ill->ill_ip_mib != NULL) {
		if (ill->ill_isv6) {
			ip_mib2_add_ip_stats(&ipst->ips_ip6_mib,
			    ill->ill_ip_mib);
		} else {
			ip_mib2_add_ip_stats(&ipst->ips_ip_mib,
			    ill->ill_ip_mib);
		}

		kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib));
		ill->ill_ip_mib = NULL;
	}
	if (ill->ill_icmp6_mib != NULL) {
		ip_mib2_add_icmp6_stats(&ipst->ips_icmp6_mib,
		    ill->ill_icmp6_mib);
		kmem_free(ill->ill_icmp6_mib, sizeof (*ill->ill_icmp6_mib));
		ill->ill_icmp6_mib = NULL;
	}
}

/*
 * Concatenate together a physical address and a sap.
 *
 * Sap_lengths are interpreted as follows:
 *   sap_length == 0	==> no sap
 *   sap_length > 0	==> sap is at the head of the dlpi address
 *   sap_length < 0	==> sap is at the tail of the dlpi address
 */
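/*
 * For example (illustrative only): with an Ethernet-style address of
 * phys_length == 6 and sap_length == -2, the 8-byte result is laid out
 * as
 *
 *	dst[0..5] = physical address, dst[6..7] = 16-bit sap
 *
 * whereas sap_length == 2 places the sap at dst[0..1], followed by the
 * physical address at dst[2..7].
 */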
static void
ill_dlur_copy_address(uchar_t *phys_src, uint_t phys_length,
    t_scalar_t sap_src, t_scalar_t sap_length, uchar_t *dst)
{
	uint16_t sap_addr = (uint16_t)sap_src;

	if (sap_length == 0) {
		if (phys_src == NULL)
			bzero(dst, phys_length);
		else
			bcopy(phys_src, dst, phys_length);
	} else if (sap_length < 0) {
		if (phys_src == NULL)
			bzero(dst, phys_length);
		else
			bcopy(phys_src, dst, phys_length);
		bcopy(&sap_addr, (char *)dst + phys_length, sizeof (sap_addr));
	} else {
		bcopy(&sap_addr, dst, sizeof (sap_addr));
		if (phys_src == NULL)
			bzero((char *)dst + sap_length, phys_length);
		else
			bcopy(phys_src, (char *)dst + sap_length, phys_length);
	}
}

/*
 * Generate a dl_unitdata_req mblk for the device and address given.
 * addr_length is the length of the physical portion of the address.
 * If addr is NULL, include an all-zero address of the specified length.
 * In any case, addr_length is taken to be the entire length of the
 * dlpi address, including the absolute value of sap_length.
 */
mblk_t *
ill_dlur_gen(uchar_t *addr, uint_t addr_length, t_uscalar_t sap,
    t_scalar_t sap_length)
{
	dl_unitdata_req_t *dlur;
	mblk_t	*mp;
	t_scalar_t	abs_sap_length;		/* absolute value */

	abs_sap_length = ABS(sap_length);
	mp = ip_dlpi_alloc(sizeof (*dlur) + addr_length + abs_sap_length,
	    DL_UNITDATA_REQ);
	if (mp == NULL)
		return (NULL);
	dlur = (dl_unitdata_req_t *)mp->b_rptr;
	/* HACK: accommodate incompatible DLPI drivers */
	if (addr_length == 8)
		addr_length = 6;
	dlur->dl_dest_addr_length = addr_length + abs_sap_length;
	dlur->dl_dest_addr_offset = sizeof (*dlur);
	dlur->dl_priority.dl_min = 0;
	dlur->dl_priority.dl_max = 0;
	ill_dlur_copy_address(addr, addr_length, sap, sap_length,
	    (uchar_t *)&dlur[1]);
	return (mp);
}

/*
 * Add the pending mp to the list. There can be only 1 pending mp
 * in the list. Any exclusive ioctl that needs to wait for a response
 * from another module or driver needs to use this function to set
 * the ipx_pending_mp to the ioctl mblk and wait for the response from
 * the other module/driver. This is also used while waiting for the
 * ipif/ill/ire refcnts to drop to zero in bringing down an ipif.
 */
boolean_t
ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp,
    int waitfor)
{
	ipxop_t	*ipx = ipif->ipif_ill->ill_phyint->phyint_ipsq->ipsq_xop;

	ASSERT(IAM_WRITER_IPIF(ipif));
	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
	ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL));
	ASSERT(ipx->ipx_pending_mp == NULL);
	/*
	 * The caller may be using a different ipif than the one passed into
	 * ipsq_current_start() (e.g., suppose an ioctl that came in on the V4
	 * ill needs to wait for the V6 ill to quiesce). So we can't ASSERT
	 * that `ipx_current_ipif == ipif'.
	 */
	ASSERT(ipx->ipx_current_ipif != NULL);

	/*
	 * M_IOCDATA from ioctls, M_ERROR/M_HANGUP/M_PROTO/M_PCPROTO from the
	 * driver.
	 */
	ASSERT((DB_TYPE(add_mp) == M_IOCDATA) || (DB_TYPE(add_mp) == M_ERROR) ||
	    (DB_TYPE(add_mp) == M_HANGUP) || (DB_TYPE(add_mp) == M_PROTO) ||
	    (DB_TYPE(add_mp) == M_PCPROTO));

	if (connp != NULL) {
		ASSERT(MUTEX_HELD(&connp->conn_lock));
		/*
		 * Return error if the conn has started closing. The conn
		 * could have finished cleaning up the pending mp list.
		 * If so we should not add another mp to the list, negating
		 * the cleanup.
		 */
		if (connp->conn_state_flags & CONN_CLOSING)
			return (B_FALSE);
	}
	mutex_enter(&ipx->ipx_lock);
	ipx->ipx_pending_ipif = ipif;
	/*
	 * Note down the queue in b_queue. This will be returned by
	 * ipsq_pending_mp_get. Caller will then use these values to restart
	 * the processing.
	 */
	add_mp->b_next = NULL;
	add_mp->b_queue = q;
	ipx->ipx_pending_mp = add_mp;
	ipx->ipx_waitfor = waitfor;
	mutex_exit(&ipx->ipx_lock);

	if (connp != NULL)
		connp->conn_oper_pending_ill = ipif->ipif_ill;

	return (B_TRUE);
}
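
/*
 * Illustrative lifecycle (a sketch, not a specific ioctl): an exclusive
 * operation that must wait for a driver response does, while holding
 * ill_lock,
 *
 *	(void) ipsq_pending_mp_add(connp, ipif, q, mp, 0);
 *
 * and returns without completing the operation; a refcnt wait instead
 * passes a waitfor value such as ILL_DOWN (as in ill_down_start() below).
 * When the reply arrives, the ack path calls ipsq_pending_mp_get() to
 * retrieve the mblk and the queue saved in b_queue, and restarts the
 * processing.
 */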

/*
 * Retrieve the ipx_pending_mp and return it. There can be only 1 mp
 * queued in the list.
 */
mblk_t *
ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp)
{
	mblk_t	*curr = NULL;
	ipxop_t	*ipx = ipsq->ipsq_xop;

	*connpp = NULL;
	mutex_enter(&ipx->ipx_lock);
	if (ipx->ipx_pending_mp == NULL) {
		mutex_exit(&ipx->ipx_lock);
		return (NULL);
	}

	/* There can be only 1 such excl message */
	curr = ipx->ipx_pending_mp;
	ASSERT(curr->b_next == NULL);
	ipx->ipx_pending_ipif = NULL;
	ipx->ipx_pending_mp = NULL;
	ipx->ipx_waitfor = 0;
	mutex_exit(&ipx->ipx_lock);

	if (CONN_Q(curr->b_queue)) {
		/*
		 * This mp did a refhold on the conn, at the start of the ioctl.
		 * So we can safely return a pointer to the conn to the caller.
		 */
		*connpp = Q_TO_CONN(curr->b_queue);
	} else {
		*connpp = NULL;
	}
	curr->b_next = NULL;
	curr->b_prev = NULL;
	return (curr);
}

/*
 * Cleanup the ioctl mp queued in ipx_pending_mp
 * - Called in the ill_delete path
 * - Called in the M_ERROR or M_HANGUP path on the ill.
 * - Called in the conn close path.
 *
 * Returns success on finding the pending mblk associated with the ioctl or
 * exclusive operation in progress, failure otherwise.
 */
boolean_t
ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp)
{
	mblk_t	*mp;
	ipxop_t	*ipx;
	queue_t	*q;
	ipif_t	*ipif;
	int	cmd;

	ASSERT(IAM_WRITER_ILL(ill));
	ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop;

	mutex_enter(&ipx->ipx_lock);
	mp = ipx->ipx_pending_mp;
	if (connp != NULL) {
		if (mp == NULL || mp->b_queue != CONNP_TO_WQ(connp)) {
			/*
			 * Nothing to clean since the conn that is closing
			 * does not have a matching pending mblk in
			 * ipx_pending_mp.
			 */
			mutex_exit(&ipx->ipx_lock);
			return (B_FALSE);
		}
	} else {
		/*
		 * A non-zero ill_error signifies we are called in the
		 * M_ERROR or M_HANGUP path and we need to unconditionally
		 * abort any current ioctl and do the corresponding cleanup.
		 * A zero ill_error means we are in the ill_delete path and
		 * we do the cleanup only if there is a pending mp.
		 */
		if (mp == NULL && ill->ill_error == 0) {
			mutex_exit(&ipx->ipx_lock);
			return (B_FALSE);
		}
	}

	/* Now remove from the ipx_pending_mp */
	ipx->ipx_pending_mp = NULL;
	ipif = ipx->ipx_pending_ipif;
	ipx->ipx_pending_ipif = NULL;
	ipx->ipx_waitfor = 0;
	ipx->ipx_current_ipif = NULL;
	cmd = ipx->ipx_current_ioctl;
	ipx->ipx_current_ioctl = 0;
	ipx->ipx_current_done = B_TRUE;
	mutex_exit(&ipx->ipx_lock);

	if (mp == NULL)
		return (B_FALSE);

	q = mp->b_queue;
	mp->b_next = NULL;
	mp->b_prev = NULL;
	mp->b_queue = NULL;

	if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) {
		DTRACE_PROBE4(ipif__ioctl,
		    char *, "ipsq_pending_mp_cleanup",
		    int, cmd, ill_t *, ipif == NULL ? NULL : ipif->ipif_ill,
		    ipif_t *, ipif);
		if (connp == NULL) {
			ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL);
		} else {
			ip_ioctl_finish(q, mp, ENXIO, CONN_CLOSE, NULL);
			mutex_enter(&ipif->ipif_ill->ill_lock);
			ipif->ipif_state_flags &= ~IPIF_CHANGING;
			mutex_exit(&ipif->ipif_ill->ill_lock);
		}
	} else {
		inet_freemsg(mp);
	}
	return (B_TRUE);
}

/*
 * Called in the conn close path and ill delete path
 */
static void
ipsq_xopq_mp_cleanup(ill_t *ill, conn_t *connp)
{
	ipsq_t	*ipsq;
	mblk_t	*prev;
	mblk_t	*curr;
	mblk_t	*next;
	queue_t	*wq, *rq = NULL;
	mblk_t	*tmp_list = NULL;

	ASSERT(IAM_WRITER_ILL(ill));
	if (connp != NULL)
		wq = CONNP_TO_WQ(connp);
	else
		wq = ill->ill_wq;

	/*
	 * In the case of lo0 being unplumbed, ill_wq will be NULL. Guard
	 * against this here.
	 */
	if (wq != NULL)
		rq = RD(wq);

	ipsq = ill->ill_phyint->phyint_ipsq;
	/*
	 * Cleanup the ioctl mp's queued in ipsq_xopq_pending_mp if any.
	 * In the case of ioctl from a conn, there can be only 1 mp
	 * queued on the ipsq. If an ill is being unplumbed, flush all
	 * the messages.
	 */
	mutex_enter(&ipsq->ipsq_lock);
	for (prev = NULL, curr = ipsq->ipsq_xopq_mphead; curr != NULL;
	    curr = next) {
		next = curr->b_next;
		if (connp == NULL ||
		    (curr->b_queue == wq || curr->b_queue == rq)) {
			/* Unlink the mblk from the pending mp list */
			if (prev != NULL) {
				prev->b_next = curr->b_next;
			} else {
				ASSERT(ipsq->ipsq_xopq_mphead == curr);
				ipsq->ipsq_xopq_mphead = curr->b_next;
			}
			if (ipsq->ipsq_xopq_mptail == curr)
				ipsq->ipsq_xopq_mptail = prev;
			/*
			 * Create a temporary list and release the ipsq lock.
			 * New elements are added to the head of the tmp_list.
			 */
			curr->b_next = tmp_list;
			tmp_list = curr;
		} else {
			prev = curr;
		}
	}
	mutex_exit(&ipsq->ipsq_lock);

	while (tmp_list != NULL) {
		curr = tmp_list;
		tmp_list = curr->b_next;
		curr->b_next = NULL;
		curr->b_prev = NULL;
		wq = curr->b_queue;
		curr->b_queue = NULL;
		if (DB_TYPE(curr) == M_IOCTL || DB_TYPE(curr) == M_IOCDATA) {
			DTRACE_PROBE4(ipif__ioctl,
			    char *, "ipsq_xopq_mp_cleanup",
			    int, 0, ill_t *, NULL, ipif_t *, NULL);
			ip_ioctl_finish(wq, curr, ENXIO, connp != NULL ?
			    CONN_CLOSE : NO_COPYOUT, NULL);
		} else {
			/*
			 * IP-MT XXX In the case of TLI/XTI bind / optmgmt
			 * this can't be just inet_freemsg. We have to
			 * restart it, otherwise the thread will be stuck.
			 */
			inet_freemsg(curr);
		}
	}
}

/*
 * This conn has started closing. Cleanup any pending ioctl from this conn.
 * STREAMS ensures that there can be at most 1 active ioctl on a stream.
 */
void
conn_ioctl_cleanup(conn_t *connp)
{
	ipsq_t	*ipsq;
	ill_t	*ill;
	boolean_t refheld;

	/*
	 * Check for a queued ioctl. If the ioctl has not yet started, the mp
	 * is pending in the list headed by ipsq_xopq_head. If the ioctl has
	 * started the mp could be present in ipx_pending_mp. Note that if
	 * conn_oper_pending_ill is NULL, the ioctl may still be in flight and
	 * not yet queued anywhere. In this case, the conn close code will wait
	 * until the conn_ref is dropped. If the stream was a tcp stream, then
	 * tcp_close will wait first until all ioctls have completed for this
	 * conn.
	 */
	mutex_enter(&connp->conn_lock);
	ill = connp->conn_oper_pending_ill;
	if (ill == NULL) {
		mutex_exit(&connp->conn_lock);
		return;
	}

	/*
	 * We may not be able to refhold the ill if the ill/ipif
	 * is changing. But we need to make sure that the ill will
	 * not vanish. So we just bump up the ill_waiter count.
	 */
	refheld = ill_waiter_inc(ill);
	mutex_exit(&connp->conn_lock);
	if (refheld) {
		if (ipsq_enter(ill, B_TRUE, NEW_OP)) {
			ill_waiter_dcr(ill);
			/*
			 * Check whether this ioctl has started and is
			 * pending. If it is not found there then check
			 * whether this ioctl has not even started and is in
			 * the ipsq_xopq list.
			 */
			if (!ipsq_pending_mp_cleanup(ill, connp))
				ipsq_xopq_mp_cleanup(ill, connp);
			ipsq = ill->ill_phyint->phyint_ipsq;
			ipsq_exit(ipsq);
			return;
		}
	}

	/*
	 * The ill is also closing and we could not bump up the
	 * ill_waiter_count or we could not enter the ipsq. Leave
	 * the cleanup to ill_delete.
	 */
	mutex_enter(&connp->conn_lock);
	while (connp->conn_oper_pending_ill != NULL)
		cv_wait(&connp->conn_refcv, &connp->conn_lock);
	mutex_exit(&connp->conn_lock);
	if (refheld)
		ill_waiter_dcr(ill);
}

/*
 * ipcl_walk function for cleaning up conn_*_ill fields.
 * Note that we leave ixa_multicast_ifindex, conn_incoming_ifindex, and
 * conn_bound_if in place. We prefer dropping
 * packets instead of sending them out the wrong interface, or accepting
 * packets from the wrong ifindex.
 */
static void
conn_cleanup_ill(conn_t *connp, caddr_t arg)
{
	ill_t	*ill = (ill_t *)arg;

	mutex_enter(&connp->conn_lock);
	if (connp->conn_dhcpinit_ill == ill) {
		connp->conn_dhcpinit_ill = NULL;
		ASSERT(ill->ill_dhcpinit != 0);
		atomic_dec_32(&ill->ill_dhcpinit);
		ill_set_inputfn(ill);
	}
	mutex_exit(&connp->conn_lock);
}

static int
ill_down_ipifs_tail(ill_t *ill)
{
	ipif_t	*ipif;
	int err;

	ASSERT(IAM_WRITER_ILL(ill));
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		ipif_non_duplicate(ipif);
		/*
		 * ipif_down_tail will call arp_ll_down on the last ipif
		 * and typically return EINPROGRESS when the DL_UNBIND is sent.
		 */
		if ((err = ipif_down_tail(ipif)) != 0)
			return (err);
	}
	return (0);
}

/* ARGSUSED */
void
ipif_all_down_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
{
	ASSERT(IAM_WRITER_IPSQ(ipsq));
	(void) ill_down_ipifs_tail(q->q_ptr);
	freemsg(mp);
	ipsq_current_finish(ipsq);
}

/*
 * ill_down_start is called when we want to down this ill and bring it up
 * again. It is called when we receive an M_ERROR / M_HANGUP. In this case
 * we shut down all interfaces, but don't tear down any plumbing.
 */
boolean_t
ill_down_start(queue_t *q, mblk_t *mp)
{
	ill_t	*ill = q->q_ptr;
	ipif_t	*ipif;

	ASSERT(IAM_WRITER_ILL(ill));
	/*
	 * It is possible that some ioctl is already in progress while we
	 * received the M_ERROR / M_HANGUP in which case, we need to abort
	 * the ioctl. ill_down_start() is being processed as CUR_OP rather
	 * than as NEW_OP since the cause of the M_ERROR / M_HANGUP may prevent
	 * the in progress ioctl from ever completing.
	 *
	 * The thread that started the ioctl (if any) must have returned,
	 * since we are now executing as writer. After the 2 calls below,
	 * the state of the ipsq and the ill would reflect no trace of any
	 * pending operation. Subsequently if there is any response to the
	 * original ioctl from the driver, it would be discarded as an
	 * unsolicited message from the driver.
	 */
	(void) ipsq_pending_mp_cleanup(ill, NULL);
	ill_dlpi_clear_deferred(ill);

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		(void) ipif_down(ipif, NULL, NULL);

	ill_down(ill);

	/*
	 * Walk all CONNs that can have a reference on an ire or nce for this
	 * ill (we actually walk all that now have stale references).
	 */
	ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ill->ill_ipst);

	/* With IPv6 we have dce_ifindex. Cleanup for neatness */
	if (ill->ill_isv6)
		dce_cleanup(ill->ill_phyint->phyint_ifindex, ill->ill_ipst);

	ipsq_current_start(ill->ill_phyint->phyint_ipsq, ill->ill_ipif, 0);

	/*
	 * Atomically test and add the pending mp if references are active.
	 */
	mutex_enter(&ill->ill_lock);
	if (!ill_is_quiescent(ill)) {
		/* call cannot fail since `conn_t *' argument is NULL */
		(void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
		    mp, ILL_DOWN);
		mutex_exit(&ill->ill_lock);
		return (B_FALSE);
	}
	mutex_exit(&ill->ill_lock);
	return (B_TRUE);
}

static void
ill_down(ill_t *ill)
{
	mblk_t	*mp;
	ip_stack_t	*ipst = ill->ill_ipst;

	/*
	 * Blow off any IREs dependent on this ILL.
	 * The caller needs to handle conn_ixa_cleanup.
	 */
	ill_delete_ires(ill);

	ire_walk_ill(0, 0, ill_downi, ill, ill);

	/* Remove any conn_*_ill depending on this ill */
	ipcl_walk(conn_cleanup_ill, (caddr_t)ill, ipst);

	/*
	 * Free state for additional IREs.
	 */
	mutex_enter(&ill->ill_saved_ire_lock);
	mp = ill->ill_saved_ire_mp;
	ill->ill_saved_ire_mp = NULL;
	ill->ill_saved_ire_cnt = 0;
	mutex_exit(&ill->ill_saved_ire_lock);
	freemsg(mp);
}

/*
 * ire_walk routine used to delete every IRE that depends on
 * 'ill'. (Always called as writer, and may only be called from ire_walk.)
 *
 * Note: since the routes added by the kernel are deleted separately,
 * this will only be 1) IRE_IF_CLONE and 2) manually added IRE_INTERFACE.
 *
 * We also remove references on ire_nce_cache entries that refer to the ill.
 */
void
ill_downi(ire_t *ire, char *ill_arg)
{
	ill_t	*ill = (ill_t *)ill_arg;
	nce_t	*nce;

	mutex_enter(&ire->ire_lock);
	nce = ire->ire_nce_cache;
	if (nce != NULL && nce->nce_ill == ill)
		ire->ire_nce_cache = NULL;
	else
		nce = NULL;
	mutex_exit(&ire->ire_lock);
	if (nce != NULL)
		nce_refrele(nce);
	if (ire->ire_ill == ill) {
		/*
		 * The existing interface binding for ire must be
		 * deleted before trying to bind the route to another
		 * interface. However, since we are using the contents of the
		 * ire after ire_delete, the caller has to ensure that
		 * CONDEMNED (deleted) ire's are not removed from the list
		 * when ire_delete() returns.
		 * Currently ill_downi() is only called as part of the
		 * ire_walk*() routines, so that the irb_refhold() done by
		 * ire_walk*() will ensure that ire_delete() does not lead
		 * to ire_inactive().
		 */
		ASSERT(ire->ire_bucket->irb_refcnt > 0);
		ire_delete(ire);
		if (ire->ire_unbound)
			ire_rebind(ire);
	}
}

/* Remove IRE_IF_CLONE on this ill */
void
ill_downi_if_clone(ire_t *ire, char *ill_arg)
{
	ill_t	*ill = (ill_t *)ill_arg;

	ASSERT(ire->ire_type & IRE_IF_CLONE);
	if (ire->ire_ill == ill)
		ire_delete(ire);
}

/* Consume an M_IOCACK of the fastpath probe. */
void
ill_fastpath_ack(ill_t *ill, mblk_t *mp)
{
	mblk_t	*mp1 = mp;

	/*
	 * If this was the first attempt, turn on the fastpath probing.
	 */
	mutex_enter(&ill->ill_lock);
	if (ill->ill_dlpi_fastpath_state == IDS_INPROGRESS)
		ill->ill_dlpi_fastpath_state = IDS_OK;
	mutex_exit(&ill->ill_lock);

	/* Free the M_IOCACK mblk, hold on to the data */
	mp = mp->b_cont;
	freeb(mp1);
	if (mp == NULL)
		return;
	if (mp->b_cont != NULL)
		nce_fastpath_update(ill, mp);
	else
		ip0dbg(("ill_fastpath_ack: no b_cont\n"));
	freemsg(mp);
}

/*
 * Throw an M_IOCTL message downstream asking "do you know fastpath?"
 * The data portion of the request is a dl_unitdata_req_t template for
 * what we would send downstream in the absence of a fastpath confirmation.
 */
int
ill_fastpath_probe(ill_t *ill, mblk_t *dlur_mp)
{
	struct iocblk	*ioc;
	mblk_t	*mp;

	if (dlur_mp == NULL)
		return (EINVAL);

	mutex_enter(&ill->ill_lock);
	switch (ill->ill_dlpi_fastpath_state) {
	case IDS_FAILED:
		/*
		 * Driver NAKed the first fastpath ioctl - assume it doesn't
		 * support it.
		 */
		mutex_exit(&ill->ill_lock);
		return (ENOTSUP);
	case IDS_UNKNOWN:
		/* This is the first probe */
		ill->ill_dlpi_fastpath_state = IDS_INPROGRESS;
		break;
	default:
		break;
	}
	mutex_exit(&ill->ill_lock);

	if ((mp = mkiocb(DL_IOC_HDR_INFO)) == NULL)
		return (EAGAIN);

	mp->b_cont = copyb(dlur_mp);
	if (mp->b_cont == NULL) {
		freeb(mp);
		return (EAGAIN);
	}

	ioc = (struct iocblk *)mp->b_rptr;
	ioc->ioc_count = msgdsize(mp->b_cont);

	DTRACE_PROBE3(ill__dlpi, char *, "ill_fastpath_probe",
	    char *, "DL_IOC_HDR_INFO", ill_t *, ill);
	putnext(ill->ill_wq, mp);
	return (0);
}
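
/*
 * Illustrative exchange (a sketch of the probe/ack flow above):
 *
 *	IP -> driver:	M_IOCTL DL_IOC_HDR_INFO, b_cont = dl_unitdata_req_t
 *	driver -> IP:	M_IOCACK, with the dl_unitdata_req_t followed by the
 *			prebuilt link-layer header; ill_fastpath_ack() frees
 *			the iocblk and hands the rest to nce_fastpath_update().
 */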

void
ill_capability_probe(ill_t *ill)
{
	mblk_t	*mp;

	ASSERT(IAM_WRITER_ILL(ill));

	if (ill->ill_dlpi_capab_state != IDCS_UNKNOWN &&
	    ill->ill_dlpi_capab_state != IDCS_FAILED)
		return;

	/*
	 * We are starting a new cycle of capability negotiation.
	 * Free up the capab reset messages of any previous incarnation.
	 * We will do a fresh allocation when we get the response to our probe.
	 */
	if (ill->ill_capab_reset_mp != NULL) {
		freemsg(ill->ill_capab_reset_mp);
		ill->ill_capab_reset_mp = NULL;
	}

	ip1dbg(("ill_capability_probe: starting capability negotiation\n"));

	mp = ip_dlpi_alloc(sizeof (dl_capability_req_t), DL_CAPABILITY_REQ);
	if (mp == NULL)
		return;

	ill_capability_send(ill, mp);
	ill->ill_dlpi_capab_state = IDCS_PROBE_SENT;
}

void
ill_capability_reset(ill_t *ill, boolean_t reneg)
{
	ASSERT(IAM_WRITER_ILL(ill));

	if (ill->ill_dlpi_capab_state != IDCS_OK)
		return;

	ill->ill_dlpi_capab_state = reneg ? IDCS_RENEG : IDCS_RESET_SENT;

	ill_capability_send(ill, ill->ill_capab_reset_mp);
	ill->ill_capab_reset_mp = NULL;
	/*
	 * We turn off all capabilities except those pertaining to
	 * direct function call capabilities viz. ILL_CAPAB_DLD*
	 * which will be turned off by the corresponding reset functions.
	 */
	ill->ill_capabilities &= ~(ILL_CAPAB_HCKSUM | ILL_CAPAB_ZEROCOPY);
}

static void
ill_capability_reset_alloc(ill_t *ill)
{
	mblk_t *mp;
	size_t	size = 0;
	int	err;
	dl_capability_req_t	*capb;

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(ill->ill_capab_reset_mp == NULL);

	if (ILL_HCKSUM_CAPABLE(ill)) {
		size += sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_hcksum_t);
	}

	if (ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) {
		size += sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_zerocopy_t);
	}

	if (ill->ill_capabilities & ILL_CAPAB_DLD) {
		size += sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_dld_t);
	}

	mp = allocb_wait(size + sizeof (dl_capability_req_t), BPRI_MED,
	    STR_NOSIG, &err);

	mp->b_datap->db_type = M_PROTO;
	bzero(mp->b_rptr, size + sizeof (dl_capability_req_t));

	capb = (dl_capability_req_t *)mp->b_rptr;
	capb->dl_primitive = DL_CAPABILITY_REQ;
	capb->dl_sub_offset = sizeof (dl_capability_req_t);
	capb->dl_sub_length = size;

	mp->b_wptr += sizeof (dl_capability_req_t);

	/*
	 * Each handler fills in the corresponding dl_capability_sub_t
	 * inside the mblk.
	 */
	ill_capability_hcksum_reset_fill(ill, mp);
	ill_capability_zerocopy_reset_fill(ill, mp);
	ill_capability_dld_reset_fill(ill, mp);

	ill->ill_capab_reset_mp = mp;
}
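
/*
 * Illustratively, for an ill with all three capabilities enabled, the
 * reset message built above is laid out as
 *
 *	dl_capability_req_t
 *	dl_capability_sub_t (DL_CAPAB_HCKSUM)   + dl_capab_hcksum_t
 *	dl_capability_sub_t (DL_CAPAB_ZEROCOPY) + dl_capab_zerocopy_t
 *	dl_capability_sub_t (DL_CAPAB_DLD)      + dl_capab_dld_t
 *
 * with each *_reset_fill() routine advancing b_wptr past the
 * sub-capability it wrote.
 */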

static void
ill_capability_id_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *outers)
{
	dl_capab_id_t	*id_ic;
	uint_t		sub_dl_cap = outers->dl_cap;
	dl_capability_sub_t *inners;
	uint8_t		*capend;

	ASSERT(sub_dl_cap == DL_CAPAB_ID_WRAPPER);

	/*
	 * Note: range checks here are not absolutely sufficient to
	 * make us robust against malformed messages sent by drivers;
	 * this is in keeping with the rest of IP's dlpi handling.
	 * (Remember, it's coming from something else in the kernel
	 * address space)
	 */

	capend = (uint8_t *)(outers + 1) + outers->dl_length;
	if (capend > mp->b_wptr) {
		cmn_err(CE_WARN, "ill_capability_id_ack: "
		    "malformed sub-capability too long for mblk");
		return;
	}

	id_ic = (dl_capab_id_t *)(outers + 1);

	if (outers->dl_length < sizeof (*id_ic) ||
	    (inners = &id_ic->id_subcap,
	    inners->dl_length > (outers->dl_length - sizeof (*inners)))) {
		cmn_err(CE_WARN, "ill_capability_id_ack: malformed "
		    "encapsulated capab type %d too long for mblk",
		    inners->dl_cap);
		return;
	}

	if (!dlcapabcheckqid(&id_ic->id_mid, ill->ill_lmod_rq)) {
		ip1dbg(("ill_capability_id_ack: mid token for capab type %d "
		    "isn't as expected; pass-thru module(s) detected, "
		    "discarding capability\n", inners->dl_cap));
		return;
	}

	/* Process the encapsulated sub-capability */
	ill_capability_dispatch(ill, mp, inners);
}

static void
ill_capability_dld_reset_fill(ill_t *ill, mblk_t *mp)
{
	dl_capability_sub_t *dl_subcap;

	if (!(ill->ill_capabilities & ILL_CAPAB_DLD))
		return;

	/*
	 * The dl_capab_dld_t that follows the dl_capability_sub_t is not
	 * initialized below since it is not used by DLD.
	 */
	dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
	dl_subcap->dl_cap = DL_CAPAB_DLD;
	dl_subcap->dl_length = sizeof (dl_capab_dld_t);

	mp->b_wptr += sizeof (dl_capability_sub_t) + sizeof (dl_capab_dld_t);
}

static void
ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp)
{
	/*
	 * If no ipif was brought up over this ill, this DL_CAPABILITY_REQ/ACK
	 * is only to get the VRRP capability.
	 *
	 * Note that we cannot check ill_ipif_up_count here since
	 * ill_ipif_up_count is only incremented when the resolver is setup.
	 * That is done asynchronously, and can race with this function.
	 */
	if (!ill->ill_dl_up) {
		if (subp->dl_cap == DL_CAPAB_VRRP)
			ill_capability_vrrp_ack(ill, mp, subp);
		return;
	}

	switch (subp->dl_cap) {
	case DL_CAPAB_HCKSUM:
		ill_capability_hcksum_ack(ill, mp, subp);
		break;
	case DL_CAPAB_ZEROCOPY:
		ill_capability_zerocopy_ack(ill, mp, subp);
		break;
	case DL_CAPAB_DLD:
		ill_capability_dld_ack(ill, mp, subp);
		break;
	case DL_CAPAB_VRRP:
		break;
	default:
		ip1dbg(("ill_capability_dispatch: unknown capab type %d\n",
		    subp->dl_cap));
	}
}

/*
 * Process the vrrp capability received from a DLS Provider. isub must point
 * to the sub-capability (DL_CAPAB_VRRP) of a DL_CAPABILITY_ACK message.
 */
static void
ill_capability_vrrp_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
	dl_capab_vrrp_t	*vrrp;
	uint_t		sub_dl_cap = isub->dl_cap;
	uint8_t		*capend;

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(sub_dl_cap == DL_CAPAB_VRRP);

	/*
	 * Note: range checks here are not absolutely sufficient to
	 * make us robust against malformed messages sent by drivers;
	 * this is in keeping with the rest of IP's dlpi handling.
	 * (Remember, it's coming from something else in the kernel
	 * address space)
	 */
	capend = (uint8_t *)(isub + 1) + isub->dl_length;
	if (capend > mp->b_wptr) {
		cmn_err(CE_WARN, "ill_capability_vrrp_ack: "
		    "malformed sub-capability too long for mblk");
		return;
	}
	vrrp = (dl_capab_vrrp_t *)(isub + 1);

	/*
	 * Compare the IP address family and set ILLF_VRRP for the right ill.
	 */
	if ((vrrp->vrrp_af == AF_INET6 && ill->ill_isv6) ||
	    (vrrp->vrrp_af == AF_INET && !ill->ill_isv6)) {
		ill->ill_flags |= ILLF_VRRP;
	}
}

/*
 * Process a hardware checksum offload capability negotiation ack received
 * from a DLS Provider. isub must point to the sub-capability
 * (DL_CAPAB_HCKSUM) of a DL_CAPABILITY_ACK message.
 */
static void
ill_capability_hcksum_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
	dl_capability_req_t	*ocap;
	dl_capab_hcksum_t	*ihck, *ohck;
	ill_hcksum_capab_t	**ill_hcksum;
	mblk_t			*nmp = NULL;
	uint_t			sub_dl_cap = isub->dl_cap;
	uint8_t			*capend;

	ASSERT(sub_dl_cap == DL_CAPAB_HCKSUM);

	ill_hcksum = (ill_hcksum_capab_t **)&ill->ill_hcksum_capab;

	/*
	 * Note: range checks here are not absolutely sufficient to
	 * make us robust against malformed messages sent by drivers;
	 * this is in keeping with the rest of IP's dlpi handling.
	 * (Remember, it's coming from something else in the kernel
	 * address space)
	 */
	capend = (uint8_t *)(isub + 1) + isub->dl_length;
	if (capend > mp->b_wptr) {
		cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
		    "malformed sub-capability too long for mblk");
		return;
	}

	/*
	 * There are two types of acks we process here:
	 * 1. acks in reply to a (first form) generic capability req
	 *    (no ENABLE flag set)
	 * 2. acks in reply to an ENABLE capability req.
	 *    (ENABLE flag set)
	 */
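	/*
	 * Illustrative exchange for case 2 (assuming a driver advertising
	 * full IPv4 hardware checksum support):
	 *
	 *	IP -> driver:  DL_CAPABILITY_REQ (generic probe)
	 *	driver -> IP:  DL_CAPABILITY_ACK, hcksum_txflags =
	 *		       HCKSUM_INET_FULL_V4
	 *	IP -> driver:  DL_CAPABILITY_REQ with HCKSUM_ENABLE set
	 *		       (built below)
	 *	driver -> IP:  DL_CAPABILITY_ACK with HCKSUM_ENABLE still set,
	 *		       at which point ILL_CAPAB_HCKSUM is turned on.
	 */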
	ihck = (dl_capab_hcksum_t *)(isub + 1);

	if (ihck->hcksum_version != HCKSUM_VERSION_1) {
		cmn_err(CE_CONT, "ill_capability_hcksum_ack: "
		    "unsupported hardware checksum "
		    "sub-capability (version %d, expected %d)",
		    ihck->hcksum_version, HCKSUM_VERSION_1);
		return;
	}

	if (!dlcapabcheckqid(&ihck->hcksum_mid, ill->ill_lmod_rq)) {
		ip1dbg(("ill_capability_hcksum_ack: mid token for hardware "
		    "checksum capability isn't as expected; pass-thru "
		    "module(s) detected, discarding capability\n"));
		return;
	}

#define	CURR_HCKSUM_CAPAB				\
	(HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4 |	\
	HCKSUM_INET_FULL_V6 | HCKSUM_IPHDRCKSUM)

	if ((ihck->hcksum_txflags & HCKSUM_ENABLE) &&
	    (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB)) {
		/* do ENABLE processing */
		if (*ill_hcksum == NULL) {
			*ill_hcksum = kmem_zalloc(sizeof (ill_hcksum_capab_t),
			    KM_NOSLEEP);

			if (*ill_hcksum == NULL) {
				cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
				    "could not enable hcksum version %d "
				    "for %s (ENOMEM)\n", HCKSUM_CURRENT_VERSION,
				    ill->ill_name);
				return;
			}
		}

		(*ill_hcksum)->ill_hcksum_version = ihck->hcksum_version;
		(*ill_hcksum)->ill_hcksum_txflags = ihck->hcksum_txflags;
		ill->ill_capabilities |= ILL_CAPAB_HCKSUM;
		ip1dbg(("ill_capability_hcksum_ack: interface %s "
		    "has enabled hardware checksumming\n ",
		    ill->ill_name));
	} else if (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB) {
		/*
		 * Enabling hardware checksum offload
		 * Currently IP supports {TCP,UDP}/IPv4
		 * partial and full cksum offload and
		 * IPv4 header checksum offload.
		 * Allocate new mblk which will
		 * contain a new capability request
		 * to enable hardware checksum offload.
		 */
		uint_t	size;
		uchar_t	*rptr;

		size = sizeof (dl_capability_req_t) +
		    sizeof (dl_capability_sub_t) + isub->dl_length;

		if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
			cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
			    "could not enable hardware cksum for %s (ENOMEM)\n",
			    ill->ill_name);
			return;
		}

		rptr = nmp->b_rptr;
		/* initialize dl_capability_req_t */
		ocap = (dl_capability_req_t *)nmp->b_rptr;
		ocap->dl_sub_offset =
		    sizeof (dl_capability_req_t);
		ocap->dl_sub_length =
		    sizeof (dl_capability_sub_t) +
		    isub->dl_length;
		nmp->b_rptr += sizeof (dl_capability_req_t);

		/* initialize dl_capability_sub_t */
		bcopy(isub, nmp->b_rptr, sizeof (*isub));
		nmp->b_rptr += sizeof (*isub);

		/* initialize dl_capab_hcksum_t */
		ohck = (dl_capab_hcksum_t *)nmp->b_rptr;
		bcopy(ihck, ohck, sizeof (*ihck));

		nmp->b_rptr = rptr;
		ASSERT(nmp->b_wptr == (nmp->b_rptr + size));

		/* Set ENABLE flag */
		ohck->hcksum_txflags &= CURR_HCKSUM_CAPAB;
		ohck->hcksum_txflags |= HCKSUM_ENABLE;

		/*
		 * nmp points to a DL_CAPABILITY_REQ message to enable
		 * hardware checksum acceleration.
		 */
		ill_capability_send(ill, nmp);
	} else {
		ip1dbg(("ill_capability_hcksum_ack: interface %s has "
		    "advertised %x hardware checksum capability flags\n",
		    ill->ill_name, ihck->hcksum_txflags));
	}
}

static void
ill_capability_hcksum_reset_fill(ill_t *ill, mblk_t *mp)
{
	dl_capab_hcksum_t *hck_subcap;
	dl_capability_sub_t *dl_subcap;

	if (!ILL_HCKSUM_CAPABLE(ill))
		return;

	ASSERT(ill->ill_hcksum_capab != NULL);

	dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
	dl_subcap->dl_cap = DL_CAPAB_HCKSUM;
	dl_subcap->dl_length = sizeof (*hck_subcap);

	hck_subcap = (dl_capab_hcksum_t *)(dl_subcap + 1);
	hck_subcap->hcksum_version = ill->ill_hcksum_capab->ill_hcksum_version;
	hck_subcap->hcksum_txflags = 0;

	mp->b_wptr += sizeof (*dl_subcap) + sizeof (*hck_subcap);
}

static void
ill_capability_zerocopy_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
	mblk_t *nmp = NULL;
	dl_capability_req_t *oc;
	dl_capab_zerocopy_t *zc_ic, *zc_oc;
	ill_zerocopy_capab_t **ill_zerocopy_capab;
	uint_t sub_dl_cap = isub->dl_cap;
	uint8_t *capend;

	ASSERT(sub_dl_cap == DL_CAPAB_ZEROCOPY);

	ill_zerocopy_capab = (ill_zerocopy_capab_t **)&ill->ill_zerocopy_capab;

	/*
	 * Note: range checks here are not absolutely sufficient to
	 * make us robust against malformed messages sent by drivers;
	 * this is in keeping with the rest of IP's dlpi handling.
	 * (Remember, it's coming from something else in the kernel
	 * address space)
	 */
	capend = (uint8_t *)(isub + 1) + isub->dl_length;
	if (capend > mp->b_wptr) {
		cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
		    "malformed sub-capability too long for mblk");
		return;
	}

	zc_ic = (dl_capab_zerocopy_t *)(isub + 1);
	if (zc_ic->zerocopy_version != ZEROCOPY_VERSION_1) {
		cmn_err(CE_CONT, "ill_capability_zerocopy_ack: "
		    "unsupported ZEROCOPY sub-capability (version %d, "
		    "expected %d)", zc_ic->zerocopy_version,
		    ZEROCOPY_VERSION_1);
		return;
	}

	if (!dlcapabcheckqid(&zc_ic->zerocopy_mid, ill->ill_lmod_rq)) {
		ip1dbg(("ill_capability_zerocopy_ack: mid token for zerocopy "
		    "capability isn't as expected; pass-thru module(s) "
		    "detected, discarding capability\n"));
		return;
	}

	if ((zc_ic->zerocopy_flags & DL_CAPAB_VMSAFE_MEM) != 0) {
		if (*ill_zerocopy_capab == NULL) {
			*ill_zerocopy_capab =
			    kmem_zalloc(sizeof (ill_zerocopy_capab_t),
			    KM_NOSLEEP);

			if (*ill_zerocopy_capab == NULL) {
				cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
				    "could not enable Zero-copy version %d "
				    "for %s (ENOMEM)\n", ZEROCOPY_VERSION_1,
				    ill->ill_name);
				return;
			}
		}

		ip1dbg(("ill_capability_zerocopy_ack: interface %s "
		    "supports Zero-copy version %d\n", ill->ill_name,
		    ZEROCOPY_VERSION_1));

		(*ill_zerocopy_capab)->ill_zerocopy_version =
		    zc_ic->zerocopy_version;
		(*ill_zerocopy_capab)->ill_zerocopy_flags =
		    zc_ic->zerocopy_flags;

		ill->ill_capabilities |= ILL_CAPAB_ZEROCOPY;
	} else {
		uint_t size;
		uchar_t *rptr;

		size = sizeof (dl_capability_req_t) +
		    sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_zerocopy_t);

		if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
			cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
			    "could not enable zerocopy for %s (ENOMEM)\n",
			    ill->ill_name);
			return;
		}
not enable zerocopy for %s (ENOMEM)\n", 1833 ill->ill_name); 1834 return; 1835 } 1836 1837 rptr = nmp->b_rptr; 1838 /* initialize dl_capability_req_t */ 1839 oc = (dl_capability_req_t *)rptr; 1840 oc->dl_sub_offset = sizeof (dl_capability_req_t); 1841 oc->dl_sub_length = sizeof (dl_capability_sub_t) + 1842 sizeof (dl_capab_zerocopy_t); 1843 rptr += sizeof (dl_capability_req_t); 1844 1845 /* initialize dl_capability_sub_t */ 1846 bcopy(isub, rptr, sizeof (*isub)); 1847 rptr += sizeof (*isub); 1848 1849 /* initialize dl_capab_zerocopy_t */ 1850 zc_oc = (dl_capab_zerocopy_t *)rptr; 1851 *zc_oc = *zc_ic; 1852 1853 ip1dbg(("ill_capability_zerocopy_ack: asking interface %s " 1854 "to enable zero-copy version %d\n", ill->ill_name, 1855 ZEROCOPY_VERSION_1)); 1856 1857 /* set VMSAFE_MEM flag */ 1858 zc_oc->zerocopy_flags |= DL_CAPAB_VMSAFE_MEM; 1859 1860 /* nmp points to a DL_CAPABILITY_REQ message to enable zcopy */ 1861 ill_capability_send(ill, nmp); 1862 } 1863 } 1864 1865 static void 1866 ill_capability_zerocopy_reset_fill(ill_t *ill, mblk_t *mp) 1867 { 1868 dl_capab_zerocopy_t *zerocopy_subcap; 1869 dl_capability_sub_t *dl_subcap; 1870 1871 if (!(ill->ill_capabilities & ILL_CAPAB_ZEROCOPY)) 1872 return; 1873 1874 ASSERT(ill->ill_zerocopy_capab != NULL); 1875 1876 dl_subcap = (dl_capability_sub_t *)mp->b_wptr; 1877 dl_subcap->dl_cap = DL_CAPAB_ZEROCOPY; 1878 dl_subcap->dl_length = sizeof (*zerocopy_subcap); 1879 1880 zerocopy_subcap = (dl_capab_zerocopy_t *)(dl_subcap + 1); 1881 zerocopy_subcap->zerocopy_version = 1882 ill->ill_zerocopy_capab->ill_zerocopy_version; 1883 zerocopy_subcap->zerocopy_flags = 0; 1884 1885 mp->b_wptr += sizeof (*dl_subcap) + sizeof (*zerocopy_subcap); 1886 } 1887 1888 /* 1889 * DLD capability 1890 * Refer to dld.h for more information regarding the purpose and usage 1891 * of this capability. 1892 */ 1893 static void 1894 ill_capability_dld_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) 1895 { 1896 dl_capab_dld_t *dld_ic, dld; 1897 uint_t sub_dl_cap = isub->dl_cap; 1898 uint8_t *capend; 1899 ill_dld_capab_t *idc; 1900 1901 ASSERT(IAM_WRITER_ILL(ill)); 1902 ASSERT(sub_dl_cap == DL_CAPAB_DLD); 1903 1904 /* 1905 * Note: range checks here are not absolutely sufficient to 1906 * make us robust against malformed messages sent by drivers; 1907 * this is in keeping with the rest of IP's dlpi handling. 1908 * (Remember, it's coming from something else in the kernel 1909 * address space) 1910 */ 1911 capend = (uint8_t *)(isub + 1) + isub->dl_length; 1912 if (capend > mp->b_wptr) { 1913 cmn_err(CE_WARN, "ill_capability_dld_ack: " 1914 "malformed sub-capability too long for mblk"); 1915 return; 1916 } 1917 dld_ic = (dl_capab_dld_t *)(isub + 1); 1918 if (dld_ic->dld_version != DLD_CURRENT_VERSION) { 1919 cmn_err(CE_CONT, "ill_capability_dld_ack: " 1920 "unsupported DLD sub-capability (version %d, " 1921 "expected %d)", dld_ic->dld_version, 1922 DLD_CURRENT_VERSION); 1923 return; 1924 } 1925 if (!dlcapabcheckqid(&dld_ic->dld_mid, ill->ill_lmod_rq)) { 1926 ip1dbg(("ill_capability_dld_ack: mid token for dld " 1927 "capability isn't as expected; pass-thru module(s) " 1928 "detected, discarding capability\n")); 1929 return; 1930 } 1931 1932 /* 1933 * Copy locally to ensure alignment. 
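* dld_ic points into the middle of the mblk and is only guaranteed
* byte alignment; dereferencing its fields directly could fault on
* strict-alignment platforms, so the bcopy() below first moves the
* data into the suitably aligned local 'dld' before any fields are
* read.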
1934 */
1935 bcopy(dld_ic, &dld, sizeof (dl_capab_dld_t));
1936
1937 if ((idc = ill->ill_dld_capab) == NULL) {
1938 idc = kmem_zalloc(sizeof (ill_dld_capab_t), KM_NOSLEEP);
1939 if (idc == NULL) {
1940 cmn_err(CE_WARN, "ill_capability_dld_ack: "
1941 "could not enable DLD version %d "
1942 "for %s (ENOMEM)\n", DLD_CURRENT_VERSION,
1943 ill->ill_name);
1944 return;
1945 }
1946 ill->ill_dld_capab = idc;
1947 }
1948 idc->idc_capab_df = (ip_capab_func_t)dld.dld_capab;
1949 idc->idc_capab_dh = (void *)dld.dld_capab_handle;
1950 ip1dbg(("ill_capability_dld_ack: interface %s "
1951 "supports DLD version %d\n", ill->ill_name, DLD_CURRENT_VERSION));
1952
1953 ill_capability_dld_enable(ill);
1954 }
1955
1956 /*
1957 * Typically capability negotiation between IP and the driver happens via
1958 * DLPI message exchange. However GLD also offers a direct function call
1959 * mechanism to exchange the DLD_DIRECT_CAPAB and DLD_POLL_CAPAB capabilities.
1960 * But arbitrary function calls into IP or GLD are not permitted, since both
1961 * of them are protected by their own perimeter mechanism. The perimeter can
1962 * be viewed as a coarse lock or serialization mechanism. The hierarchy of
1963 * these perimeters is IP -> MAC. Thus, for example, to enable squeue
1964 * polling, IP needs to enter its perimeter, then call ill_mac_perim_enter
1965 * to enter the mac perimeter and then do the direct function calls into
1966 * GLD to enable squeue polling. The ring-related callbacks from the mac into
1967 * the stack to add, bind, quiesce, restart or clean up a ring are all
1968 * protected by the mac perimeter.
1969 */
1970 static void
1971 ill_mac_perim_enter(ill_t *ill, mac_perim_handle_t *mphp)
1972 {
1973 ill_dld_capab_t *idc = ill->ill_dld_capab;
1974 int err;
1975
1976 err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mphp,
1977 DLD_ENABLE);
1978 ASSERT(err == 0);
1979 }
1980
1981 static void
1982 ill_mac_perim_exit(ill_t *ill, mac_perim_handle_t mph)
1983 {
1984 ill_dld_capab_t *idc = ill->ill_dld_capab;
1985 int err;
1986
1987 err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mph,
1988 DLD_DISABLE);
1989 ASSERT(err == 0);
1990 }
1991
1992 boolean_t
1993 ill_mac_perim_held(ill_t *ill)
1994 {
1995 ill_dld_capab_t *idc = ill->ill_dld_capab;
1996
1997 return (idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, NULL,
1998 DLD_QUERY));
1999 }
2000
2001 static void
2002 ill_capability_direct_enable(ill_t *ill)
2003 {
2004 ill_dld_capab_t *idc = ill->ill_dld_capab;
2005 ill_dld_direct_t *idd = &idc->idc_direct;
2006 dld_capab_direct_t direct;
2007 int rc;
2008
2009 ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill));
2010
2011 bzero(&direct, sizeof (direct));
2012 direct.di_rx_cf = (uintptr_t)ip_input;
2013 direct.di_rx_ch = ill;
2014
2015 rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT, &direct,
2016 DLD_ENABLE);
2017 if (rc == 0) {
2018 idd->idd_tx_df = (ip_dld_tx_t)direct.di_tx_df;
2019 idd->idd_tx_dh = direct.di_tx_dh;
2020 idd->idd_tx_cb_df = (ip_dld_callb_t)direct.di_tx_cb_df;
2021 idd->idd_tx_cb_dh = direct.di_tx_cb_dh;
2022 idd->idd_tx_fctl_df = (ip_dld_fctl_t)direct.di_tx_fctl_df;
2023 idd->idd_tx_fctl_dh = direct.di_tx_fctl_dh;
2024 ASSERT(idd->idd_tx_cb_df != NULL);
2025 ASSERT(idd->idd_tx_fctl_df != NULL);
2026 ASSERT(idd->idd_tx_df != NULL);
2027 /*
2028 * One-time registration of the flow enable callback function
2029 */
2030 ill->ill_flownotify_mh = idd->idd_tx_cb_df(idd->idd_tx_cb_dh,
2031 ill_flow_enable, ill);
2032 ill->ill_capabilities |= ILL_CAPAB_DLD_DIRECT;
2033
DTRACE_PROBE1(direct_on, (ill_t *), ill); 2034 } else { 2035 cmn_err(CE_WARN, "warning: could not enable DIRECT " 2036 "capability, rc = %d\n", rc); 2037 DTRACE_PROBE2(direct_off, (ill_t *), ill, (int), rc); 2038 } 2039 } 2040 2041 static void 2042 ill_capability_poll_enable(ill_t *ill) 2043 { 2044 ill_dld_capab_t *idc = ill->ill_dld_capab; 2045 dld_capab_poll_t poll; 2046 int rc; 2047 2048 ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill)); 2049 2050 bzero(&poll, sizeof (poll)); 2051 poll.poll_ring_add_cf = (uintptr_t)ip_squeue_add_ring; 2052 poll.poll_ring_remove_cf = (uintptr_t)ip_squeue_clean_ring; 2053 poll.poll_ring_quiesce_cf = (uintptr_t)ip_squeue_quiesce_ring; 2054 poll.poll_ring_restart_cf = (uintptr_t)ip_squeue_restart_ring; 2055 poll.poll_ring_bind_cf = (uintptr_t)ip_squeue_bind_ring; 2056 poll.poll_ring_ch = ill; 2057 rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL, &poll, 2058 DLD_ENABLE); 2059 if (rc == 0) { 2060 ill->ill_capabilities |= ILL_CAPAB_DLD_POLL; 2061 DTRACE_PROBE1(poll_on, (ill_t *), ill); 2062 } else { 2063 ip1dbg(("warning: could not enable POLL " 2064 "capability, rc = %d\n", rc)); 2065 DTRACE_PROBE2(poll_off, (ill_t *), ill, (int), rc); 2066 } 2067 } 2068 2069 /* 2070 * Enable the LSO capability. 2071 */ 2072 static void 2073 ill_capability_lso_enable(ill_t *ill) 2074 { 2075 ill_dld_capab_t *idc = ill->ill_dld_capab; 2076 dld_capab_lso_t lso; 2077 int rc; 2078 2079 ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill)); 2080 2081 if (ill->ill_lso_capab == NULL) { 2082 ill->ill_lso_capab = kmem_zalloc(sizeof (ill_lso_capab_t), 2083 KM_NOSLEEP); 2084 if (ill->ill_lso_capab == NULL) { 2085 cmn_err(CE_WARN, "ill_capability_lso_enable: " 2086 "could not enable LSO for %s (ENOMEM)\n", 2087 ill->ill_name); 2088 return; 2089 } 2090 } 2091 2092 bzero(&lso, sizeof (lso)); 2093 if ((rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO, &lso, 2094 DLD_ENABLE)) == 0) { 2095 ill->ill_lso_capab->ill_lso_flags = lso.lso_flags; 2096 ill->ill_lso_capab->ill_lso_max = lso.lso_max; 2097 ill->ill_capabilities |= ILL_CAPAB_LSO; 2098 ip1dbg(("ill_capability_lso_enable: interface %s " 2099 "has enabled LSO\n ", ill->ill_name)); 2100 } else { 2101 kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t)); 2102 ill->ill_lso_capab = NULL; 2103 DTRACE_PROBE2(lso_off, (ill_t *), ill, (int), rc); 2104 } 2105 } 2106 2107 static void 2108 ill_capability_dld_enable(ill_t *ill) 2109 { 2110 mac_perim_handle_t mph; 2111 2112 ASSERT(IAM_WRITER_ILL(ill)); 2113 2114 if (ill->ill_isv6) 2115 return; 2116 2117 ill_mac_perim_enter(ill, &mph); 2118 if (!ill->ill_isv6) { 2119 ill_capability_direct_enable(ill); 2120 ill_capability_poll_enable(ill); 2121 ill_capability_lso_enable(ill); 2122 } 2123 ill->ill_capabilities |= ILL_CAPAB_DLD; 2124 ill_mac_perim_exit(ill, mph); 2125 } 2126 2127 static void 2128 ill_capability_dld_disable(ill_t *ill) 2129 { 2130 ill_dld_capab_t *idc; 2131 ill_dld_direct_t *idd; 2132 mac_perim_handle_t mph; 2133 2134 ASSERT(IAM_WRITER_ILL(ill)); 2135 2136 if (!(ill->ill_capabilities & ILL_CAPAB_DLD)) 2137 return; 2138 2139 ill_mac_perim_enter(ill, &mph); 2140 2141 idc = ill->ill_dld_capab; 2142 if ((ill->ill_capabilities & ILL_CAPAB_DLD_DIRECT) != 0) { 2143 /* 2144 * For performance we avoid locks in the transmit data path 2145 * and don't maintain a count of the number of threads using 2146 * direct calls. Thus some threads could be using direct 2147 * transmit calls to GLD, even after the capability mechanism 2148 * turns it off. 
This is still safe since the handles used in
2149 * the direct calls continue to be valid until the unplumb is
2150 * completed. Remove the callback that was added (1-time) at
2151 * capab enable time.
2152 */
2153 mutex_enter(&ill->ill_lock);
2154 ill->ill_capabilities &= ~ILL_CAPAB_DLD_DIRECT;
2155 mutex_exit(&ill->ill_lock);
2156 if (ill->ill_flownotify_mh != NULL) {
2157 idd = &idc->idc_direct;
2158 idd->idd_tx_cb_df(idd->idd_tx_cb_dh, NULL,
2159 ill->ill_flownotify_mh);
2160 ill->ill_flownotify_mh = NULL;
2161 }
2162 (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT,
2163 NULL, DLD_DISABLE);
2164 }
2165
2166 if ((ill->ill_capabilities & ILL_CAPAB_DLD_POLL) != 0) {
2167 ill->ill_capabilities &= ~ILL_CAPAB_DLD_POLL;
2168 ip_squeue_clean_all(ill);
2169 (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL,
2170 NULL, DLD_DISABLE);
2171 }
2172
2173 if ((ill->ill_capabilities & ILL_CAPAB_LSO) != 0) {
2174 ASSERT(ill->ill_lso_capab != NULL);
2175 /*
2176 * Clear the capability flag for LSO but retain the
2177 * ill_lso_capab structure since it's possible that another
2178 * thread is still referring to it. The structure only gets
2179 * deallocated when we destroy the ill.
2180 */
2181
2182 ill->ill_capabilities &= ~ILL_CAPAB_LSO;
2183 (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO,
2184 NULL, DLD_DISABLE);
2185 }
2186
2187 ill->ill_capabilities &= ~ILL_CAPAB_DLD;
2188 ill_mac_perim_exit(ill, mph);
2189 }
2190
2191 /*
2192 * Capability Negotiation protocol
2193 *
2194 * We don't wait for DLPI capability operations to finish during interface
2195 * bringup or teardown. Doing so would introduce more asynchrony, and the
2196 * interface up/down operations would need multiple returns and restarts.
2197 * Instead, the 'ipsq_current_ipif' of the ipsq is not cleared as long as
2198 * the 'ill_dlpi_deferred' chain is non-empty. This ensures that the next
2199 * exclusive operation won't start until the DLPI operations of the previous
2200 * exclusive operation complete.
2201 *
2202 * The capability state machine is shown below.
2203 *
2204 * state next state event, action
2205 *
2206 * IDCS_UNKNOWN IDCS_PROBE_SENT ill_capability_probe
2207 * IDCS_PROBE_SENT IDCS_OK ill_capability_ack
2208 * IDCS_PROBE_SENT IDCS_FAILED ip_rput_dlpi_writer (nack)
2209 * IDCS_OK IDCS_RENEG Receipt of DL_NOTE_CAPAB_RENEG
2210 * IDCS_OK IDCS_RESET_SENT ill_capability_reset
2211 * IDCS_RESET_SENT IDCS_UNKNOWN ill_capability_ack_thr
2212 * IDCS_RENEG IDCS_PROBE_SENT ill_capability_ack_thr ->
2213 * ill_capability_probe.
2214 */
2215
2216 /*
2217 * Dedicated thread started from ip_stack_init that handles capability
2218 * disable. This thread ensures the taskq dispatch does not fail by waiting
2219 * for resources using TQ_SLEEP. The taskq mechanism is used to ensure
2220 * that direct calls to DLD are done in a cv_waitable context.
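* The producer side is ill_capability_ack() below: if its TQ_NOSLEEP
* taskq_dispatch() fails, it chains the mblk on ips_capab_taskq_head/
* ips_capab_taskq_tail and signals ips_capab_taskq_cv; this thread
* then re-dispatches each queued mblk with TQ_SLEEP, which may block
* but cannot fail.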
2221 */
2222 void
2223 ill_taskq_dispatch(ip_stack_t *ipst)
2224 {
2225 callb_cpr_t cprinfo;
2226 char name[64];
2227 mblk_t *mp;
2228
2229 (void) snprintf(name, sizeof (name), "ill_taskq_dispatch_%d",
2230 ipst->ips_netstack->netstack_stackid);
2231 CALLB_CPR_INIT(&cprinfo, &ipst->ips_capab_taskq_lock, callb_generic_cpr,
2232 name);
2233 mutex_enter(&ipst->ips_capab_taskq_lock);
2234
2235 for (;;) {
2236 mp = ipst->ips_capab_taskq_head;
2237 while (mp != NULL) {
2238 ipst->ips_capab_taskq_head = mp->b_next;
2239 if (ipst->ips_capab_taskq_head == NULL)
2240 ipst->ips_capab_taskq_tail = NULL;
2241 mutex_exit(&ipst->ips_capab_taskq_lock);
2242 mp->b_next = NULL;
2243
2244 VERIFY(taskq_dispatch(system_taskq,
2245 ill_capability_ack_thr, mp, TQ_SLEEP) != 0);
2246 mutex_enter(&ipst->ips_capab_taskq_lock);
2247 mp = ipst->ips_capab_taskq_head;
2248 }
2249
2250 if (ipst->ips_capab_taskq_quit)
2251 break;
2252 CALLB_CPR_SAFE_BEGIN(&cprinfo);
2253 cv_wait(&ipst->ips_capab_taskq_cv, &ipst->ips_capab_taskq_lock);
2254 CALLB_CPR_SAFE_END(&cprinfo, &ipst->ips_capab_taskq_lock);
2255 }
2256 VERIFY(ipst->ips_capab_taskq_head == NULL);
2257 VERIFY(ipst->ips_capab_taskq_tail == NULL);
2258 CALLB_CPR_EXIT(&cprinfo);
2259 thread_exit();
2260 }
2261
2262 /*
2263 * Consume a new-style hardware capabilities negotiation ack.
2264 * Called via taskq on receipt of DL_CAPABILITY_ACK.
2265 */
2266 static void
2267 ill_capability_ack_thr(void *arg)
2268 {
2269 mblk_t *mp = arg;
2270 dl_capability_ack_t *capp;
2271 dl_capability_sub_t *subp, *endp;
2272 ill_t *ill;
2273 boolean_t reneg;
2274
2275 ill = (ill_t *)mp->b_prev;
2276 mp->b_prev = NULL;
2277
2278 VERIFY(ipsq_enter(ill, B_FALSE, CUR_OP) == B_TRUE);
2279
2280 if (ill->ill_dlpi_capab_state == IDCS_RESET_SENT ||
2281 ill->ill_dlpi_capab_state == IDCS_RENEG) {
2282 /*
2283 * We have received the ack for our DL_CAPAB reset request.
2284 * There isn't anything in the message that needs processing.
2285 * All message-based capabilities have been disabled; now
2286 * do the function-call-based capability disable.
2287 */
2288 reneg = ill->ill_dlpi_capab_state == IDCS_RENEG;
2289 ill_capability_dld_disable(ill);
2290 ill->ill_dlpi_capab_state = IDCS_UNKNOWN;
2291 if (reneg)
2292 ill_capability_probe(ill);
2293 goto done;
2294 }
2295
2296 if (ill->ill_dlpi_capab_state == IDCS_PROBE_SENT)
2297 ill->ill_dlpi_capab_state = IDCS_OK;
2298
2299 capp = (dl_capability_ack_t *)mp->b_rptr;
2300
2301 if (capp->dl_sub_length == 0) {
2302 /* no new-style capabilities */
2303 goto done;
2304 }
2305
2306 /* make sure the driver supplied a correct dl_sub_length */
2307 if ((sizeof (*capp) + capp->dl_sub_length) > MBLKL(mp)) {
2308 ip0dbg(("ill_capability_ack: bad DL_CAPABILITY_ACK, "
2309 "invalid dl_sub_length (%d)\n", capp->dl_sub_length));
2310 goto done;
2311 }
2312
2313 #define SC(base, offset) (dl_capability_sub_t *)(((uchar_t *)(base))+(offset))
2314 /*
2315 * There are sub-capabilities. Process the ones we know about.
2316 * Loop until we don't have room for another sub-cap header.
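* As a sketch, the ack payload walked below is laid out as:
*
*	capp (dl_capability_ack_t)
*	capp + dl_sub_offset:  dl_capability_sub_t { dl_cap, dl_length }
*	                       <dl_length bytes of sub-capability data>
*	                       dl_capability_sub_t { ... }  and so on,
*	                       for dl_sub_length bytes in total
*
* The SC() macro simply performs this byte-offset arithmetic.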
2317 */
2318 for (subp = SC(capp, capp->dl_sub_offset),
2319 endp = SC(subp, capp->dl_sub_length - sizeof (*subp));
2320 subp <= endp;
2321 subp = SC(subp, sizeof (dl_capability_sub_t) + subp->dl_length)) {
2322
2323 switch (subp->dl_cap) {
2324 case DL_CAPAB_ID_WRAPPER:
2325 ill_capability_id_ack(ill, mp, subp);
2326 break;
2327 default:
2328 ill_capability_dispatch(ill, mp, subp);
2329 break;
2330 }
2331 }
2332 #undef SC
2333 done:
2334 inet_freemsg(mp);
2335 ill_capability_done(ill);
2336 ipsq_exit(ill->ill_phyint->phyint_ipsq);
2337 }
2338
2339 /*
2340 * This needs to be started in a taskq thread to provide a cv_waitable
2341 * context.
2342 */
2343 void
2344 ill_capability_ack(ill_t *ill, mblk_t *mp)
2345 {
2346 ip_stack_t *ipst = ill->ill_ipst;
2347
2348 mp->b_prev = (mblk_t *)ill;
2349 ASSERT(mp->b_next == NULL);
2350
2351 if (taskq_dispatch(system_taskq, ill_capability_ack_thr, mp,
2352 TQ_NOSLEEP) != 0)
2353 return;
2354
2355 /*
2356 * The taskq dispatch failed. Signal the ill_taskq_dispatch thread,
2357 * which will do the dispatch using TQ_SLEEP to guarantee success.
2358 */
2359 mutex_enter(&ipst->ips_capab_taskq_lock);
2360 if (ipst->ips_capab_taskq_head == NULL) {
2361 ASSERT(ipst->ips_capab_taskq_tail == NULL);
2362 ipst->ips_capab_taskq_head = mp;
2363 } else {
2364 ipst->ips_capab_taskq_tail->b_next = mp;
2365 }
2366 ipst->ips_capab_taskq_tail = mp;
2367
2368 cv_signal(&ipst->ips_capab_taskq_cv);
2369 mutex_exit(&ipst->ips_capab_taskq_lock);
2370 }
2371
2372 /*
2373 * This routine is called to scan the fragmentation reassembly table for
2374 * the specified ILL for any packets that are starting to smell.
2375 * dead_interval is the maximum time in seconds that will be tolerated. It
2376 * will either be the value specified in ip_g_frag_timeout, or zero if the
2377 * ILL is shutting down and it is time to blow everything off.
2378 *
2379 * It returns the number of seconds (as a time_t) that the next frag timer
2380 * should be scheduled for, 0 meaning that the timer doesn't need to be
2381 * re-started. Note that the method of calculating next_timeout isn't
2382 * entirely accurate since time will flow between the time we grab
2383 * current_time and the time we schedule the next timeout. This isn't a
2384 * big problem since this is the timer for sending ICMP reassembly time
2385 * exceeded messages, and it doesn't have to be exactly accurate.
2386 *
2387 * This function is
2388 * sometimes called as writer, although this is not required.
2389 */
2390 time_t
2391 ill_frag_timeout(ill_t *ill, time_t dead_interval)
2392 {
2393 ipfb_t *ipfb;
2394 ipfb_t *endp;
2395 ipf_t *ipf;
2396 ipf_t *ipfnext;
2397 mblk_t *mp;
2398 time_t current_time = gethrestime_sec();
2399 time_t next_timeout = 0;
2400 uint32_t hdr_length;
2401 mblk_t *send_icmp_head;
2402 mblk_t *send_icmp_head_v6;
2403 ip_stack_t *ipst = ill->ill_ipst;
2404 ip_recv_attr_t iras;
2405
2406 bzero(&iras, sizeof (iras));
2407 iras.ira_flags = 0;
2408 iras.ira_ill = iras.ira_rill = ill;
2409 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
2410 iras.ira_rifindex = iras.ira_ruifindex;
2411
2412 ipfb = ill->ill_frag_hash_tbl;
2413 if (ipfb == NULL)
2414 return (0); /* no table; the return type is time_t, not boolean_t */
2415 endp = &ipfb[ILL_FRAG_HASH_TBL_COUNT];
2416 /* Walk the frag hash table.
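* For each bucket: under ipfb_lock, unlink every packet that has been
* around longer than dead_interval and chain it on a local list; the
* ICMP time exceeded errors are generated only after the lock is
* dropped (see the comment inside the loop for why).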
*/ 2417 for (; ipfb < endp; ipfb++) { 2418 send_icmp_head = NULL; 2419 send_icmp_head_v6 = NULL; 2420 mutex_enter(&ipfb->ipfb_lock); 2421 while ((ipf = ipfb->ipfb_ipf) != 0) { 2422 time_t frag_time = current_time - ipf->ipf_timestamp; 2423 time_t frag_timeout; 2424 2425 if (frag_time < dead_interval) { 2426 /* 2427 * There are some outstanding fragments 2428 * that will timeout later. Make note of 2429 * the time so that we can reschedule the 2430 * next timeout appropriately. 2431 */ 2432 frag_timeout = dead_interval - frag_time; 2433 if (next_timeout == 0 || 2434 frag_timeout < next_timeout) { 2435 next_timeout = frag_timeout; 2436 } 2437 break; 2438 } 2439 /* Time's up. Get it out of here. */ 2440 hdr_length = ipf->ipf_nf_hdr_len; 2441 ipfnext = ipf->ipf_hash_next; 2442 if (ipfnext) 2443 ipfnext->ipf_ptphn = ipf->ipf_ptphn; 2444 *ipf->ipf_ptphn = ipfnext; 2445 mp = ipf->ipf_mp->b_cont; 2446 for (; mp; mp = mp->b_cont) { 2447 /* Extra points for neatness. */ 2448 IP_REASS_SET_START(mp, 0); 2449 IP_REASS_SET_END(mp, 0); 2450 } 2451 mp = ipf->ipf_mp->b_cont; 2452 atomic_add_32(&ill->ill_frag_count, -ipf->ipf_count); 2453 ASSERT(ipfb->ipfb_count >= ipf->ipf_count); 2454 ipfb->ipfb_count -= ipf->ipf_count; 2455 ASSERT(ipfb->ipfb_frag_pkts > 0); 2456 ipfb->ipfb_frag_pkts--; 2457 /* 2458 * We do not send any icmp message from here because 2459 * we currently are holding the ipfb_lock for this 2460 * hash chain. If we try and send any icmp messages 2461 * from here we may end up via a put back into ip 2462 * trying to get the same lock, causing a recursive 2463 * mutex panic. Instead we build a list and send all 2464 * the icmp messages after we have dropped the lock. 2465 */ 2466 if (ill->ill_isv6) { 2467 if (hdr_length != 0) { 2468 mp->b_next = send_icmp_head_v6; 2469 send_icmp_head_v6 = mp; 2470 } else { 2471 freemsg(mp); 2472 } 2473 } else { 2474 if (hdr_length != 0) { 2475 mp->b_next = send_icmp_head; 2476 send_icmp_head = mp; 2477 } else { 2478 freemsg(mp); 2479 } 2480 } 2481 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails); 2482 ip_drop_input("ipIfStatsReasmFails", ipf->ipf_mp, ill); 2483 freeb(ipf->ipf_mp); 2484 } 2485 mutex_exit(&ipfb->ipfb_lock); 2486 /* 2487 * Now need to send any icmp messages that we delayed from 2488 * above. 2489 */ 2490 while (send_icmp_head_v6 != NULL) { 2491 ip6_t *ip6h; 2492 2493 mp = send_icmp_head_v6; 2494 send_icmp_head_v6 = send_icmp_head_v6->b_next; 2495 mp->b_next = NULL; 2496 ip6h = (ip6_t *)mp->b_rptr; 2497 iras.ira_flags = 0; 2498 /* 2499 * This will result in an incorrect ALL_ZONES zoneid 2500 * for multicast packets, but we 2501 * don't send ICMP errors for those in any case. 2502 */ 2503 iras.ira_zoneid = 2504 ipif_lookup_addr_zoneid_v6(&ip6h->ip6_dst, 2505 ill, ipst); 2506 ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill); 2507 icmp_time_exceeded_v6(mp, 2508 ICMP_REASSEMBLY_TIME_EXCEEDED, B_FALSE, 2509 &iras); 2510 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); 2511 } 2512 while (send_icmp_head != NULL) { 2513 ipaddr_t dst; 2514 2515 mp = send_icmp_head; 2516 send_icmp_head = send_icmp_head->b_next; 2517 mp->b_next = NULL; 2518 2519 dst = ((ipha_t *)mp->b_rptr)->ipha_dst; 2520 2521 iras.ira_flags = IRAF_IS_IPV4; 2522 /* 2523 * This will result in an incorrect ALL_ZONES zoneid 2524 * for broadcast and multicast packets, but we 2525 * don't send ICMP errors for those in any case. 
2526 */
2527 iras.ira_zoneid = ipif_lookup_addr_zoneid(dst,
2528 ill, ipst);
2529 ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill);
2530 icmp_time_exceeded(mp,
2531 ICMP_REASSEMBLY_TIME_EXCEEDED, &iras);
2532 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
2533 }
2534 }
2535 /*
2536 * A non-dying ILL will use the return value to decide whether to
2537 * restart the frag timer, and for how long.
2538 */
2539 return (next_timeout);
2540 }
2541
2542 /*
2543 * This routine is called when the approximate count of mblk memory used
2544 * for the specified ILL has exceeded max_count.
2545 */
2546 void
2547 ill_frag_prune(ill_t *ill, uint_t max_count)
2548 {
2549 ipfb_t *ipfb;
2550 ipf_t *ipf;
2551 size_t count;
2552 clock_t now;
2553
2554 /*
2555 * If we are called again within ip_min_frag_prune_time msecs of the
2556 * last prune, increment ill_frag_free_num_pkts, the number of oldest
2557 * packets to free from each bucket below; otherwise reset it to zero.
2558 */
2559 mutex_enter(&ill->ill_lock);
2560 now = ddi_get_lbolt();
2561 if (TICK_TO_MSEC(now - ill->ill_last_frag_clean_time) <=
2562 (ip_min_frag_prune_time != 0 ?
2563 ip_min_frag_prune_time : msec_per_tick)) {
2564
2565 ill->ill_frag_free_num_pkts++;
2566
2567 } else {
2568 ill->ill_frag_free_num_pkts = 0;
2569 }
2570 ill->ill_last_frag_clean_time = now;
2571 mutex_exit(&ill->ill_lock);
2572
2573 /*
2574 * Free ill_frag_free_num_pkts oldest packets from each bucket.
2575 */
2576 if (ill->ill_frag_free_num_pkts != 0) {
2577 int ix;
2578
2579 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) {
2580 ipfb = &ill->ill_frag_hash_tbl[ix];
2581 mutex_enter(&ipfb->ipfb_lock);
2582 if (ipfb->ipfb_ipf != NULL) {
2583 ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf,
2584 ill->ill_frag_free_num_pkts);
2585 }
2586 mutex_exit(&ipfb->ipfb_lock);
2587 }
2588 }
2589 /*
2590 * While the reassembly list for this ILL is too big, prune a fragment
2591 * queue by age, oldest first.
2592 */
2593 while (ill->ill_frag_count > max_count) {
2594 int ix;
2595 ipfb_t *oipfb = NULL;
2596 uint_t oldest = UINT_MAX;
2597
2598 count = 0;
2599 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) {
2600 ipfb = &ill->ill_frag_hash_tbl[ix];
2601 mutex_enter(&ipfb->ipfb_lock);
2602 ipf = ipfb->ipfb_ipf;
2603 if (ipf != NULL && ipf->ipf_gen < oldest) {
2604 oldest = ipf->ipf_gen;
2605 oipfb = ipfb;
2606 }
2607 count += ipfb->ipfb_count;
2608 mutex_exit(&ipfb->ipfb_lock);
2609 }
2610 if (oipfb == NULL)
2611 break;
2612
2613 if (count <= max_count)
2614 return; /* Somebody beat us to it, nothing to do */
2615 mutex_enter(&oipfb->ipfb_lock);
2616 ipf = oipfb->ipfb_ipf;
2617 if (ipf != NULL) {
2618 ill_frag_free_pkts(ill, oipfb, ipf, 1);
2619 }
2620 mutex_exit(&oipfb->ipfb_lock);
2621 }
2622 }
2623
2624 /*
2625 * Free 'free_cnt' fragmented packets starting at ipf.
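* The caller must hold the bucket's ipfb_lock (asserted below).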
2626 */ 2627 void 2628 ill_frag_free_pkts(ill_t *ill, ipfb_t *ipfb, ipf_t *ipf, int free_cnt) 2629 { 2630 size_t count; 2631 mblk_t *mp; 2632 mblk_t *tmp; 2633 ipf_t **ipfp = ipf->ipf_ptphn; 2634 2635 ASSERT(MUTEX_HELD(&ipfb->ipfb_lock)); 2636 ASSERT(ipfp != NULL); 2637 ASSERT(ipf != NULL); 2638 2639 while (ipf != NULL && free_cnt-- > 0) { 2640 count = ipf->ipf_count; 2641 mp = ipf->ipf_mp; 2642 ipf = ipf->ipf_hash_next; 2643 for (tmp = mp; tmp; tmp = tmp->b_cont) { 2644 IP_REASS_SET_START(tmp, 0); 2645 IP_REASS_SET_END(tmp, 0); 2646 } 2647 atomic_add_32(&ill->ill_frag_count, -count); 2648 ASSERT(ipfb->ipfb_count >= count); 2649 ipfb->ipfb_count -= count; 2650 ASSERT(ipfb->ipfb_frag_pkts > 0); 2651 ipfb->ipfb_frag_pkts--; 2652 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails); 2653 ip_drop_input("ipIfStatsReasmFails", mp, ill); 2654 freemsg(mp); 2655 } 2656 2657 if (ipf) 2658 ipf->ipf_ptphn = ipfp; 2659 ipfp[0] = ipf; 2660 } 2661 2662 /* 2663 * Helper function for ill_forward_set(). 2664 */ 2665 static void 2666 ill_forward_set_on_ill(ill_t *ill, boolean_t enable) 2667 { 2668 ip_stack_t *ipst = ill->ill_ipst; 2669 2670 ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock)); 2671 2672 ip1dbg(("ill_forward_set: %s %s forwarding on %s", 2673 (enable ? "Enabling" : "Disabling"), 2674 (ill->ill_isv6 ? "IPv6" : "IPv4"), ill->ill_name)); 2675 mutex_enter(&ill->ill_lock); 2676 if (enable) 2677 ill->ill_flags |= ILLF_ROUTER; 2678 else 2679 ill->ill_flags &= ~ILLF_ROUTER; 2680 mutex_exit(&ill->ill_lock); 2681 if (ill->ill_isv6) 2682 ill_set_nce_router_flags(ill, enable); 2683 /* Notify routing socket listeners of this change. */ 2684 if (ill->ill_ipif != NULL) 2685 ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT); 2686 } 2687 2688 /* 2689 * Set an ill's ILLF_ROUTER flag appropriately. Send up RTS_IFINFO routing 2690 * socket messages for each interface whose flags we change. 2691 */ 2692 int 2693 ill_forward_set(ill_t *ill, boolean_t enable) 2694 { 2695 ipmp_illgrp_t *illg; 2696 ip_stack_t *ipst = ill->ill_ipst; 2697 2698 ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock)); 2699 2700 if ((enable && (ill->ill_flags & ILLF_ROUTER)) || 2701 (!enable && !(ill->ill_flags & ILLF_ROUTER))) 2702 return (0); 2703 2704 if (IS_LOOPBACK(ill)) 2705 return (EINVAL); 2706 2707 if (enable && ill->ill_allowed_ips_cnt > 0) 2708 return (EPERM); 2709 2710 if (IS_IPMP(ill) || IS_UNDER_IPMP(ill)) { 2711 /* 2712 * Update all of the interfaces in the group. 2713 */ 2714 illg = ill->ill_grp; 2715 ill = list_head(&illg->ig_if); 2716 for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) 2717 ill_forward_set_on_ill(ill, enable); 2718 2719 /* 2720 * Update the IPMP meta-interface. 2721 */ 2722 ill_forward_set_on_ill(ipmp_illgrp_ipmp_ill(illg), enable); 2723 return (0); 2724 } 2725 2726 ill_forward_set_on_ill(ill, enable); 2727 return (0); 2728 } 2729 2730 /* 2731 * Based on the ILLF_ROUTER flag of an ill, make sure all local nce's for 2732 * addresses assigned to the ill have the NCE_F_ISROUTER flag appropriately 2733 * set or clear. 2734 */ 2735 static void 2736 ill_set_nce_router_flags(ill_t *ill, boolean_t enable) 2737 { 2738 ipif_t *ipif; 2739 ncec_t *ncec; 2740 nce_t *nce; 2741 2742 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 2743 /* 2744 * NOTE: we match across the illgrp because nce's for 2745 * addresses on IPMP interfaces have an nce_ill that points to 2746 * the bound underlying ill. 
2747 */
2748 nce = nce_lookup_v6(ill, &ipif->ipif_v6lcl_addr);
2749 if (nce != NULL) {
2750 ncec = nce->nce_common;
2751 mutex_enter(&ncec->ncec_lock);
2752 if (enable)
2753 ncec->ncec_flags |= NCE_F_ISROUTER;
2754 else
2755 ncec->ncec_flags &= ~NCE_F_ISROUTER;
2756 mutex_exit(&ncec->ncec_lock);
2757 nce_refrele(nce);
2758 }
2759 }
2760 }
2761
2762 /*
2763 * Initializes the context structure and returns the first ill in the list.
2764 * Currently start_list and end_list can have the values:
2765 * MAX_G_HEADS Traverse both IPV4 and IPV6 lists.
2766 * IP_V4_G_HEAD Traverse IPV4 list only.
2767 * IP_V6_G_HEAD Traverse IPV6 list only.
2768 */
2769
2770 /*
2771 * We don't check for CONDEMNED ills here. Caller must do that if
2772 * necessary under the ill lock.
2773 */
2774 ill_t *
2775 ill_first(int start_list, int end_list, ill_walk_context_t *ctx,
2776 ip_stack_t *ipst)
2777 {
2778 ill_if_t *ifp;
2779 ill_t *ill;
2780 avl_tree_t *avl_tree;
2781
2782 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
2783 ASSERT(end_list <= MAX_G_HEADS && start_list >= 0);
2784
2785 /*
2786 * set up the lists to search
2787 */
2788 if (end_list != MAX_G_HEADS) {
2789 ctx->ctx_current_list = start_list;
2790 ctx->ctx_last_list = end_list;
2791 } else {
2792 ctx->ctx_last_list = MAX_G_HEADS - 1;
2793 ctx->ctx_current_list = 0;
2794 }
2795
2796 while (ctx->ctx_current_list <= ctx->ctx_last_list) {
2797 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst);
2798 if (ifp != (ill_if_t *)
2799 &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) {
2800 avl_tree = &ifp->illif_avl_by_ppa;
2801 ill = avl_first(avl_tree);
2802 /*
2803 * ill is guaranteed to be non-NULL, or ifp would
2804 * not have existed.
2805 */
2806 ASSERT(ill != NULL);
2807 return (ill);
2808 }
2809 ctx->ctx_current_list++;
2810 }
2811
2812 return (NULL);
2813 }
2814
2815 /*
2816 * Returns the next ill in the list. ill_first() must have been called
2817 * before calling ill_next() or bad things will happen.
2818 */
2819
2820 /*
2821 * We don't check for CONDEMNED ills here. Caller must do that if
2822 * necessary under the ill lock.
2823 */
2824 ill_t *
2825 ill_next(ill_walk_context_t *ctx, ill_t *lastill)
2826 {
2827 ill_if_t *ifp;
2828 ill_t *ill;
2829 ip_stack_t *ipst = lastill->ill_ipst;
2830
2831 ASSERT(lastill->ill_ifptr != (ill_if_t *)
2832 &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst));
2833 if ((ill = avl_walk(&lastill->ill_ifptr->illif_avl_by_ppa, lastill,
2834 AVL_AFTER)) != NULL) {
2835 return (ill);
2836 }
2837
2838 /* go to the next ill_ifp in the list. */
2839 ifp = lastill->ill_ifptr->illif_next;
2840
2841 /* make sure not at end of circular list */
2842 while (ifp ==
2843 (ill_if_t *)&IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) {
2844 if (++ctx->ctx_current_list > ctx->ctx_last_list)
2845 return (NULL);
2846 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst);
2847 }
2848
2849 return (avl_first(&ifp->illif_avl_by_ppa));
2850 }
2851
2852 /*
2853 * Check interface name for correct format: [a-zA-Z]+[a-zA-Z0-9._]*[0-9]+
2854 * The final number (PPA) must not have any leading zeros. Upon success, a
2855 * pointer to the start of the PPA is returned; otherwise NULL is returned.
2856 */
2857 static char *
2858 ill_get_ppa_ptr(char *name)
2859 {
2860 int namelen = strlen(name);
2861 int end_ndx = namelen - 1;
2862 int ppa_ndx, i;
2863
2864 /*
2865 * Check that the first character is [a-zA-Z], and that the last
2866 * character is [0-9].
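* Together with the leading-zero and intermediate-character checks
* below, this accepts names such as "bge0" or "e1000g11" and rejects,
* for example, "bge" (no trailing digit), "0bge" (leading digit) and
* "bge007" (leading zero in the PPA).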
2867 */
2868 if (namelen == 0 || !isalpha(name[0]) || !isdigit(name[end_ndx]))
2869 return (NULL);
2870
2871 /*
2872 * Set `ppa_ndx' to the PPA start, and check for leading zeroes.
2873 */
2874 for (ppa_ndx = end_ndx; ppa_ndx > 0; ppa_ndx--)
2875 if (!isdigit(name[ppa_ndx - 1]))
2876 break;
2877
2878 if (name[ppa_ndx] == '0' && ppa_ndx < end_ndx)
2879 return (NULL);
2880
2881 /*
2882 * Check that the intermediate characters are [a-zA-Z0-9._]
2883 */
2884 for (i = 1; i < ppa_ndx; i++) {
2885 if (!isalpha(name[i]) && !isdigit(name[i]) &&
2886 name[i] != '.' && name[i] != '_') {
2887 return (NULL);
2888 }
2889 }
2890
2891 return (name + ppa_ndx);
2892 }
2893
2894 /*
2895 * use avl tree to locate the ill.
2896 */
2897 static ill_t *
2898 ill_find_by_name(char *name, boolean_t isv6, ip_stack_t *ipst)
2899 {
2900 char *ppa_ptr = NULL;
2901 int len;
2902 uint_t ppa;
2903 ill_t *ill = NULL;
2904 ill_if_t *ifp;
2905 int list;
2906
2907 /*
2908 * get ppa ptr
2909 */
2910 if (isv6)
2911 list = IP_V6_G_HEAD;
2912 else
2913 list = IP_V4_G_HEAD;
2914
2915 if ((ppa_ptr = ill_get_ppa_ptr(name)) == NULL) {
2916 return (NULL);
2917 }
2918
2919 len = ppa_ptr - name + 1;
2920
2921 ppa = stoi(&ppa_ptr);
2922
2923 ifp = IP_VX_ILL_G_LIST(list, ipst);
2924
2925 while (ifp != (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) {
2926 /*
2927 * The match is done on len - 1 as the name is not null
2928 * terminated; it contains the ppa in addition to the
2929 * interface name.
2930 */
2931 if ((ifp->illif_name_len == len) &&
2932 bcmp(ifp->illif_name, name, len - 1) == 0) {
2933 break;
2934 } else {
2935 ifp = ifp->illif_next;
2936 }
2937 }
2938
2939 if (ifp == (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) {
2940 /*
2941 * The interface type itself does not exist.
2942 */
2943 return (NULL);
2944 }
2945
2946 ill = avl_find(&ifp->illif_avl_by_ppa, (void *) &ppa, NULL);
2947 if (ill != NULL) {
2948 mutex_enter(&ill->ill_lock);
2949 if (ILL_CAN_LOOKUP(ill)) {
2950 ill_refhold_locked(ill);
2951 mutex_exit(&ill->ill_lock);
2952 return (ill);
2953 }
2954 mutex_exit(&ill->ill_lock);
2955 }
2956 return (NULL);
2957 }
2958
2959 /*
2960 * comparison function for use with avl.
2961 */
2962 static int
2963 ill_compare_ppa(const void *ppa_ptr, const void *ill_ptr)
2964 {
2965 uint_t ppa;
2966 uint_t ill_ppa;
2967
2968 ASSERT(ppa_ptr != NULL && ill_ptr != NULL);
2969
2970 ppa = *((uint_t *)ppa_ptr);
2971 ill_ppa = ((const ill_t *)ill_ptr)->ill_ppa;
2972 /*
2973 * We want the ill with the lowest ppa to be on the
2974 * top.
2975 */
2976 if (ill_ppa < ppa)
2977 return (1);
2978 if (ill_ppa > ppa)
2979 return (-1);
2980 return (0);
2981 }
2982
2983 /*
2984 * remove an interface type from the global list.
2985 */
2986 static void
2987 ill_delete_interface_type(ill_if_t *interface)
2988 {
2989 ASSERT(interface != NULL);
2990 ASSERT(avl_numnodes(&interface->illif_avl_by_ppa) == 0);
2991
2992 avl_destroy(&interface->illif_avl_by_ppa);
2993 if (interface->illif_ppa_arena != NULL)
2994 vmem_destroy(interface->illif_ppa_arena);
2995
2996 remque(interface);
2997
2998 mi_free(interface);
2999 }
3000
3001 /*
3002 * remove ill from the global list.
3003 */
3004 static void
3005 ill_glist_delete(ill_t *ill)
3006 {
3007 ip_stack_t *ipst;
3008 phyint_t *phyi;
3009
3010 if (ill == NULL)
3011 return;
3012 ipst = ill->ill_ipst;
3013 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
3014
3015 /*
3016 * If the ill was never inserted into the AVL tree
3017 * we skip the if branch.
3018 */
3019 if (ill->ill_ifptr != NULL) {
3020 /*
3021 * remove from AVL tree and free ppa number
3022 */
3023 avl_remove(&ill->ill_ifptr->illif_avl_by_ppa, ill);
3024
3025 if (ill->ill_ifptr->illif_ppa_arena != NULL) {
3026 vmem_free(ill->ill_ifptr->illif_ppa_arena,
3027 (void *)(uintptr_t)(ill->ill_ppa+1), 1);
3028 }
3029 if (avl_numnodes(&ill->ill_ifptr->illif_avl_by_ppa) == 0) {
3030 ill_delete_interface_type(ill->ill_ifptr);
3031 }
3032
3033 /*
3034 * Indicate ill is no longer in the list.
3035 */
3036 ill->ill_ifptr = NULL;
3037 ill->ill_name_length = 0;
3038 ill->ill_name[0] = '\0';
3039 ill->ill_ppa = UINT_MAX;
3040 }
3041
3042 /* Generate one last event for this ill. */
3043 ill_nic_event_dispatch(ill, 0, NE_UNPLUMB, ill->ill_name,
3044 ill->ill_name_length);
3045
3046 ASSERT(ill->ill_phyint != NULL);
3047 phyi = ill->ill_phyint;
3048 ill->ill_phyint = NULL;
3049
3050 /*
3051 * ill_init always allocates a phyint to store the copy
3052 * of flags relevant to phyint. At that point in time, we could
3053 * not assign the name and hence phyint_illv4/v6 could not be
3054 * initialized. Later in ipif_set_values, we assign the name to
3055 * the ill, at which point in time we assign phyint_illv4/v6.
3056 * Thus we don't rely on phyint_illv6 to be initialized always.
3057 */
3058 if (ill->ill_flags & ILLF_IPV6)
3059 phyi->phyint_illv6 = NULL;
3060 else
3061 phyi->phyint_illv4 = NULL;
3062
3063 if (phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL) {
3064 rw_exit(&ipst->ips_ill_g_lock);
3065 return;
3066 }
3067
3068 /*
3069 * There are no ills left on this phyint; pull it out of the phyint
3070 * avl trees, and free it.
3071 */
3072 if (phyi->phyint_ifindex > 0) {
3073 avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
3074 phyi);
3075 avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
3076 phyi);
3077 }
3078 rw_exit(&ipst->ips_ill_g_lock);
3079
3080 phyint_free(phyi);
3081 }
3082
3083 /*
3084 * Allocate a ppa. If the number of plumbed interfaces of this type is
3085 * less than ill_no_arena, do a linear search to find an unused ppa.
3086 * When the number goes beyond ill_no_arena, switch to using an arena.
3087 * Note: a ppa value of zero cannot be allocated from the vmem arena as it
3088 * is the return value for an error condition, so allocation starts at one
3089 * and is decremented by one.
3090 */
3091 static int
3092 ill_alloc_ppa(ill_if_t *ifp, ill_t *ill)
3093 {
3094 ill_t *tmp_ill;
3095 uint_t start, end;
3096 int ppa;
3097
3098 if (ifp->illif_ppa_arena == NULL &&
3099 (avl_numnodes(&ifp->illif_avl_by_ppa) + 1 > ill_no_arena)) {
3100 /*
3101 * Create an arena.
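* The arena stores ppa + 1 rather than the ppa itself, since vmem
* uses 0 as its failure return and so cannot represent an allocation
* at address 0; e.g. ppa 0 is tracked as arena address 1.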
3102 */
3103 ifp->illif_ppa_arena = vmem_create(ifp->illif_name,
3104 (void *)1, UINT_MAX - 1, 1, NULL, NULL,
3105 NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
3106 /* allocate what has already been assigned */
3107 for (tmp_ill = avl_first(&ifp->illif_avl_by_ppa);
3108 tmp_ill != NULL; tmp_ill = avl_walk(&ifp->illif_avl_by_ppa,
3109 tmp_ill, AVL_AFTER)) {
3110 ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena,
3111 1, /* size */
3112 1, /* align/quantum */
3113 0, /* phase */
3114 0, /* nocross */
3115 /* minaddr */
3116 (void *)((uintptr_t)tmp_ill->ill_ppa + 1),
3117 /* maxaddr */
3118 (void *)((uintptr_t)tmp_ill->ill_ppa + 2),
3119 VM_NOSLEEP|VM_FIRSTFIT);
3120 if (ppa == 0) {
3121 ip1dbg(("ill_alloc_ppa: ppa allocation"
3122 " failed while switching"));
3123 vmem_destroy(ifp->illif_ppa_arena);
3124 ifp->illif_ppa_arena = NULL;
3125 break;
3126 }
3127 }
3128 }
3129
3130 if (ifp->illif_ppa_arena != NULL) {
3131 if (ill->ill_ppa == UINT_MAX) {
3132 ppa = (int)(uintptr_t)vmem_alloc(ifp->illif_ppa_arena,
3133 1, VM_NOSLEEP|VM_FIRSTFIT);
3134 if (ppa == 0)
3135 return (EAGAIN);
3136 ill->ill_ppa = --ppa;
3137 } else {
3138 ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena,
3139 1, /* size */
3140 1, /* align/quantum */
3141 0, /* phase */
3142 0, /* nocross */
3143 (void *)(uintptr_t)(ill->ill_ppa + 1), /* minaddr */
3144 (void *)(uintptr_t)(ill->ill_ppa + 2), /* maxaddr */
3145 VM_NOSLEEP|VM_FIRSTFIT);
3146 /*
3147 * Most likely the allocation failed because
3148 * the requested ppa was in use.
3149 */
3150 if (ppa == 0)
3151 return (EEXIST);
3152 }
3153 return (0);
3154 }
3155
3156 /*
3157 * No arena is in use and not enough (>ill_no_arena) interfaces have
3158 * been plumbed to create one. Do a linear search to get an unused ppa.
3159 */
3160 if (ill->ill_ppa == UINT_MAX) {
3161 end = UINT_MAX - 1;
3162 start = 0;
3163 } else {
3164 end = start = ill->ill_ppa;
3165 }
3166
3167 tmp_ill = avl_find(&ifp->illif_avl_by_ppa, (void *)&start, NULL);
3168 while (tmp_ill != NULL && tmp_ill->ill_ppa == start) {
3169 if (start++ >= end) {
3170 if (ill->ill_ppa == UINT_MAX)
3171 return (EAGAIN);
3172 else
3173 return (EEXIST);
3174 }
3175 tmp_ill = avl_walk(&ifp->illif_avl_by_ppa, tmp_ill, AVL_AFTER);
3176 }
3177 ill->ill_ppa = start;
3178 return (0);
3179 }
3180
3181 /*
3182 * Insert ill into the list of configured ills. Once this function completes,
3183 * the ill is globally visible and is available through lookups. More
3184 * precisely, this happens after the caller drops the ill_g_lock.
3185 */
3186 static int
3187 ill_glist_insert(ill_t *ill, char *name, boolean_t isv6)
3188 {
3189 ill_if_t *ill_interface;
3190 avl_index_t where = 0;
3191 int error;
3192 int name_length;
3193 int index;
3194 boolean_t check_length = B_FALSE;
3195 ip_stack_t *ipst = ill->ill_ipst;
3196
3197 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
3198
3199 name_length = mi_strlen(name) + 1;
3200
3201 if (isv6)
3202 index = IP_V6_G_HEAD;
3203 else
3204 index = IP_V4_G_HEAD;
3205
3206 ill_interface = IP_VX_ILL_G_LIST(index, ipst);
3207 /*
3208 * Search for interface type based on name
3209 */
3210 while (ill_interface != (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) {
3211 if ((ill_interface->illif_name_len == name_length) &&
3212 (strcmp(ill_interface->illif_name, name) == 0)) {
3213 break;
3214 }
3215 ill_interface = ill_interface->illif_next;
3216 }
3217
3218 /*
3219 * Interface type not found, create one.
3220 */
3221 if (ill_interface == (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) {
3222 ill_g_head_t ghead;
3223
3224 /*
3225 * allocate ill_if_t structure
3226 */
3227 ill_interface = (ill_if_t *)mi_zalloc(sizeof (ill_if_t));
3228 if (ill_interface == NULL) {
3229 return (ENOMEM);
3230 }
3231
3232 (void) strcpy(ill_interface->illif_name, name);
3233 ill_interface->illif_name_len = name_length;
3234
3235 avl_create(&ill_interface->illif_avl_by_ppa,
3236 ill_compare_ppa, sizeof (ill_t),
3237 offsetof(struct ill_s, ill_avl_byppa));
3238
3239 /*
3240 * link the structure in the back to maintain order
3241 * of configuration for ifconfig output.
3242 */
3243 ghead = ipst->ips_ill_g_heads[index];
3244 insque(ill_interface, ghead.ill_g_list_tail);
3245 }
3246
3247 if (ill->ill_ppa == UINT_MAX)
3248 check_length = B_TRUE;
3249
3250 error = ill_alloc_ppa(ill_interface, ill);
3251 if (error != 0) {
3252 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0)
3253 ill_delete_interface_type(ill->ill_ifptr);
3254 return (error);
3255 }
3256
3257 /*
3258 * When the ppa is chosen by the system, check that there is
3259 * enough space to insert the ppa. If a specific ppa was passed in,
3260 * this check is not required as the interface name passed in will
3261 * have the right ppa in it.
3262 */
3263 if (check_length) {
3264 /*
3265 * UINT_MAX - 1 should fit in 10 chars, alloc 12 chars.
3266 */
3267 char buf[sizeof (uint_t) * 3];
3268
3269 /*
3270 * convert ppa to string to calculate the amount of space
3271 * required for it in the name.
3272 */
3273 numtos(ill->ill_ppa, buf);
3274
3275 /* Do we have enough space to insert ppa ? */
3276
3277 if ((mi_strlen(name) + mi_strlen(buf) + 1) > LIFNAMSIZ) {
3278 /* Free ppa and interface type struct */
3279 if (ill_interface->illif_ppa_arena != NULL) {
3280 vmem_free(ill_interface->illif_ppa_arena,
3281 (void *)(uintptr_t)(ill->ill_ppa+1), 1);
3282 }
3283 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0)
3284 ill_delete_interface_type(ill->ill_ifptr);
3285
3286 return (EINVAL);
3287 }
3288 }
3289
3290 (void) sprintf(ill->ill_name, "%s%u", name, ill->ill_ppa);
3291 ill->ill_name_length = mi_strlen(ill->ill_name) + 1;
3292
3293 (void) avl_find(&ill_interface->illif_avl_by_ppa, &ill->ill_ppa,
3294 &where);
3295 ill->ill_ifptr = ill_interface;
3296 avl_insert(&ill_interface->illif_avl_by_ppa, ill, where);
3297
3298 ill_phyint_reinit(ill);
3299 return (0);
3300 }
3301
3302 /* Initialize the per phyint ipsq used for serialization */
3303 static boolean_t
3304 ipsq_init(ill_t *ill, boolean_t enter)
3305 {
3306 ipsq_t *ipsq;
3307 ipxop_t *ipx;
3308
3309 if ((ipsq = kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP)) == NULL)
3310 return (B_FALSE);
3311
3312 ill->ill_phyint->phyint_ipsq = ipsq;
3313 ipx = ipsq->ipsq_xop = &ipsq->ipsq_ownxop;
3314 ipx->ipx_ipsq = ipsq;
3315 ipsq->ipsq_next = ipsq;
3316 ipsq->ipsq_phyint = ill->ill_phyint;
3317 mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, 0);
3318 mutex_init(&ipx->ipx_lock, NULL, MUTEX_DEFAULT, 0);
3319 ipsq->ipsq_ipst = ill->ill_ipst; /* No netstack_hold */
3320 if (enter) {
3321 ipx->ipx_writer = curthread;
3322 ipx->ipx_forced = B_FALSE;
3323 ipx->ipx_reentry_cnt = 1;
3324 #ifdef DEBUG
3325 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
3326 #endif
3327 }
3328 return (B_TRUE);
3329 }
3330
3331 /*
3332 * ill_init is called by ip_open when a device control stream is opened.
3333 * It does a few initializations, and shoots a DL_INFO_REQ message down
3334 * to the driver.
The response is later picked up in ip_rput_dlpi and 3335 * used to set up default mechanisms for talking to the driver. (Always 3336 * called as writer.) 3337 * 3338 * If this function returns error, ip_open will call ip_close which in 3339 * turn will call ill_delete to clean up any memory allocated here that 3340 * is not yet freed. 3341 */ 3342 int 3343 ill_init(queue_t *q, ill_t *ill) 3344 { 3345 int count; 3346 dl_info_req_t *dlir; 3347 mblk_t *info_mp; 3348 uchar_t *frag_ptr; 3349 3350 /* 3351 * The ill is initialized to zero by mi_alloc*(). In addition 3352 * some fields already contain valid values, initialized in 3353 * ip_open(), before we reach here. 3354 */ 3355 mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, 0); 3356 mutex_init(&ill->ill_saved_ire_lock, NULL, MUTEX_DEFAULT, NULL); 3357 ill->ill_saved_ire_cnt = 0; 3358 3359 ill->ill_rq = q; 3360 ill->ill_wq = WR(q); 3361 3362 info_mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)), 3363 BPRI_HI); 3364 if (info_mp == NULL) 3365 return (ENOMEM); 3366 3367 /* 3368 * Allocate sufficient space to contain our fragment hash table and 3369 * the device name. 3370 */ 3371 frag_ptr = (uchar_t *)mi_zalloc(ILL_FRAG_HASH_TBL_SIZE + 2 * LIFNAMSIZ); 3372 if (frag_ptr == NULL) { 3373 freemsg(info_mp); 3374 return (ENOMEM); 3375 } 3376 ill->ill_frag_ptr = frag_ptr; 3377 ill->ill_frag_free_num_pkts = 0; 3378 ill->ill_last_frag_clean_time = 0; 3379 ill->ill_frag_hash_tbl = (ipfb_t *)frag_ptr; 3380 ill->ill_name = (char *)(frag_ptr + ILL_FRAG_HASH_TBL_SIZE); 3381 for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) { 3382 mutex_init(&ill->ill_frag_hash_tbl[count].ipfb_lock, 3383 NULL, MUTEX_DEFAULT, NULL); 3384 } 3385 3386 ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t)); 3387 if (ill->ill_phyint == NULL) { 3388 freemsg(info_mp); 3389 mi_free(frag_ptr); 3390 return (ENOMEM); 3391 } 3392 3393 mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0); 3394 /* 3395 * For now pretend this is a v4 ill. We need to set phyint_ill* 3396 * at this point because of the following reason. If we can't 3397 * enter the ipsq at some point and cv_wait, the writer that 3398 * wakes us up tries to locate us using the list of all phyints 3399 * in an ipsq and the ills from the phyint thru the phyint_ill*. 3400 * If we don't set it now, we risk a missed wakeup. 3401 */ 3402 ill->ill_phyint->phyint_illv4 = ill; 3403 ill->ill_ppa = UINT_MAX; 3404 list_create(&ill->ill_nce, sizeof (nce_t), offsetof(nce_t, nce_node)); 3405 3406 ill_set_inputfn(ill); 3407 3408 if (!ipsq_init(ill, B_TRUE)) { 3409 freemsg(info_mp); 3410 mi_free(frag_ptr); 3411 mi_free(ill->ill_phyint); 3412 return (ENOMEM); 3413 } 3414 3415 ill->ill_state_flags |= ILL_LL_SUBNET_PENDING; 3416 3417 /* Frag queue limit stuff */ 3418 ill->ill_frag_count = 0; 3419 ill->ill_ipf_gen = 0; 3420 3421 rw_init(&ill->ill_mcast_lock, NULL, RW_DEFAULT, NULL); 3422 mutex_init(&ill->ill_mcast_serializer, NULL, MUTEX_DEFAULT, NULL); 3423 ill->ill_global_timer = INFINITY; 3424 ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0; 3425 ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0; 3426 ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS; 3427 ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL; 3428 3429 /* 3430 * Initialize IPv6 configuration variables. The IP module is always 3431 * opened as an IPv4 module. 
Instead of tracking down the cases where
3432 * it switches to do ipv6, we'll just initialize the IPv6 configuration
3433 * here for convenience; this has no effect until the ill is set to do
3434 * IPv6.
3435 */
3436 ill->ill_reachable_time = ND_REACHABLE_TIME;
3437 ill->ill_xmit_count = ND_MAX_MULTICAST_SOLICIT;
3438 ill->ill_max_buf = ND_MAX_Q;
3439 ill->ill_refcnt = 0;
3440
3441 /* Send down the Info Request to the driver. */
3442 info_mp->b_datap->db_type = M_PCPROTO;
3443 dlir = (dl_info_req_t *)info_mp->b_rptr;
3444 info_mp->b_wptr = (uchar_t *)&dlir[1];
3445 dlir->dl_primitive = DL_INFO_REQ;
3446
3447 ill->ill_dlpi_pending = DL_PRIM_INVAL;
3448
3449 qprocson(q);
3450 ill_dlpi_send(ill, info_mp);
3451
3452 return (0);
3453 }
3454
3455 /*
3456 * ill_dls_info
3457 * creates datalink socket info from the device.
3458 */
3459 int
3460 ill_dls_info(struct sockaddr_dl *sdl, const ill_t *ill)
3461 {
3462 size_t len;
3463
3464 sdl->sdl_family = AF_LINK;
3465 sdl->sdl_index = ill_get_upper_ifindex(ill);
3466 sdl->sdl_type = ill->ill_type;
3467 ill_get_name(ill, sdl->sdl_data, sizeof (sdl->sdl_data));
3468 len = strlen(sdl->sdl_data);
3469 ASSERT(len < 256);
3470 sdl->sdl_nlen = (uchar_t)len;
3471 sdl->sdl_alen = ill->ill_phys_addr_length;
3472 sdl->sdl_slen = 0;
3473 if (ill->ill_phys_addr_length != 0 && ill->ill_phys_addr != NULL)
3474 bcopy(ill->ill_phys_addr, &sdl->sdl_data[len], sdl->sdl_alen);
3475
3476 return (sizeof (struct sockaddr_dl));
3477 }
3478
3479 /*
3480 * ill_xarp_info
3481 * creates xarp info from the device.
3482 */
3483 static int
3484 ill_xarp_info(struct sockaddr_dl *sdl, ill_t *ill)
3485 {
3486 sdl->sdl_family = AF_LINK;
3487 sdl->sdl_index = ill->ill_phyint->phyint_ifindex;
3488 sdl->sdl_type = ill->ill_type;
3489 ill_get_name(ill, sdl->sdl_data, sizeof (sdl->sdl_data));
3490 sdl->sdl_nlen = (uchar_t)mi_strlen(sdl->sdl_data);
3491 sdl->sdl_alen = ill->ill_phys_addr_length;
3492 sdl->sdl_slen = 0;
3493 return (sdl->sdl_nlen);
3494 }
3495
3496 static int
3497 loopback_kstat_update(kstat_t *ksp, int rw)
3498 {
3499 kstat_named_t *kn;
3500 netstackid_t stackid;
3501 netstack_t *ns;
3502 ip_stack_t *ipst;
3503
3504 if (ksp == NULL || ksp->ks_data == NULL)
3505 return (EIO);
3506
3507 if (rw == KSTAT_WRITE)
3508 return (EACCES);
3509
3510 kn = KSTAT_NAMED_PTR(ksp);
3511 stackid = (zoneid_t)(uintptr_t)ksp->ks_private;
3512
3513 ns = netstack_find_by_stackid(stackid);
3514 if (ns == NULL)
3515 return (-1);
3516
3517 ipst = ns->netstack_ip;
3518 if (ipst == NULL) {
3519 netstack_rele(ns);
3520 return (-1);
3521 }
3522 kn[0].value.ui32 = ipst->ips_loopback_packets;
3523 kn[1].value.ui32 = ipst->ips_loopback_packets;
3524 netstack_rele(ns);
3525 return (0);
3526 }
3527
3528 /*
3529 * Has ifindex been plumbed already?
3530 */
3531 static boolean_t
3532 phyint_exists(uint_t index, ip_stack_t *ipst)
3533 {
3534 ASSERT(index != 0);
3535 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
3536
3537 return (avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
3538 &index, NULL) != NULL);
3539 }
3540
3541 /*
3542 * Pick a unique ifindex.
3543 * When the index counter passes IF_INDEX_MAX for the first time, the wrap
3544 * flag is set so that the next time ip_assign_ifindex() is called, it
3545 * falls through and resets the index counter back to 1, the minimum value
3546 * for the interface index. The logic below assumes that ips_ill_index
3547 * can hold a value of IF_INDEX_MAX+1 without there being any loss
3548 * (i.e. reset back to 0.)
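* As a worked example (with a hypothetically tiny IF_INDEX_MAX of 3
* for illustration): indexes are handed out 1, 2, 3; the call that
* assigns 3 leaves ips_ill_index at 4 (IF_INDEX_MAX + 1) and sets the
* wrap flag; the next call resets the counter to 1 and probes 1, 2, 3
* with phyint_exists() until it finds an unused index, or fails if
* all are in use.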
3549 */
3550 boolean_t
3551 ip_assign_ifindex(uint_t *indexp, ip_stack_t *ipst)
3552 {
3553 uint_t loops;
3554
3555 if (!ipst->ips_ill_index_wrap) {
3556 *indexp = ipst->ips_ill_index++;
3557 if (ipst->ips_ill_index > IF_INDEX_MAX) {
3558 /*
3559 * Reached the maximum ifindex value, set the wrap
3560 * flag to indicate that it is no longer possible
3561 * to assume that a given index is unallocated.
3562 */
3563 ipst->ips_ill_index_wrap = B_TRUE;
3564 }
3565 return (B_TRUE);
3566 }
3567
3568 if (ipst->ips_ill_index > IF_INDEX_MAX)
3569 ipst->ips_ill_index = 1;
3570
3571 /*
3572 * Start reusing unused indexes. Note that we hold the ill_g_lock
3573 * at this point and don't want to call any function that attempts
3574 * to get the lock again.
3575 */
3576 for (loops = IF_INDEX_MAX; loops > 0; loops--) {
3577 if (!phyint_exists(ipst->ips_ill_index, ipst)) {
3578 /* found unused index - use it */
3579 *indexp = ipst->ips_ill_index;
3580 return (B_TRUE);
3581 }
3582
3583 ipst->ips_ill_index++;
3584 if (ipst->ips_ill_index > IF_INDEX_MAX)
3585 ipst->ips_ill_index = 1;
3586 }
3587
3588 /*
3589 * All interface indices are in use.
3590 */
3591 return (B_FALSE);
3592 }
3593
3594 /*
3595 * Assign a unique interface index for the phyint.
3596 */
3597 static boolean_t
3598 phyint_assign_ifindex(phyint_t *phyi, ip_stack_t *ipst)
3599 {
3600 ASSERT(phyi->phyint_ifindex == 0);
3601 return (ip_assign_ifindex(&phyi->phyint_ifindex, ipst));
3602 }
3603
3604 /*
3605 * Initialize the flags on `phyi' as per the provided mactype.
3606 */
3607 static void
3608 phyint_flags_init(phyint_t *phyi, t_uscalar_t mactype)
3609 {
3610 uint64_t flags = 0;
3611
3612 /*
3613 * Initialize PHYI_RUNNING and PHYI_FAILED. For non-IPMP interfaces,
3614 * we always presume the underlying hardware is working and set
3615 * PHYI_RUNNING (if it's not, the driver will subsequently send a
3616 * DL_NOTE_LINK_DOWN message). For IPMP interfaces, at initialization
3617 * there are no active interfaces in the group so we set PHYI_FAILED.
3618 */
3619 if (mactype == SUNW_DL_IPMP)
3620 flags |= PHYI_FAILED;
3621 else
3622 flags |= PHYI_RUNNING;
3623
3624 switch (mactype) {
3625 case SUNW_DL_VNI:
3626 flags |= PHYI_VIRTUAL;
3627 break;
3628 case SUNW_DL_IPMP:
3629 flags |= PHYI_IPMP;
3630 break;
3631 case DL_LOOP:
3632 flags |= (PHYI_LOOPBACK | PHYI_VIRTUAL);
3633 break;
3634 }
3635
3636 mutex_enter(&phyi->phyint_lock);
3637 phyi->phyint_flags |= flags;
3638 mutex_exit(&phyi->phyint_lock);
3639 }
3640
3641 /*
3642 * Return a pointer to the ill which matches the supplied name. Note that
3643 * the ill name length includes the null termination character. (May be
3644 * called as writer.)
3645 * If do_alloc and the interface is "lo0" it will be automatically created.
3646 * We cannot bump up the reference count on condemned ills, so duplicate
3647 * detection can't be done using this function.
3648 */
3649 ill_t *
3650 ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6,
3651 boolean_t *did_alloc, ip_stack_t *ipst)
3652 {
3653 ill_t *ill;
3654 ipif_t *ipif;
3655 ipsq_t *ipsq;
3656 kstat_named_t *kn;
3657 boolean_t isloopback;
3658 in6_addr_t ov6addr;
3659
3660 isloopback = mi_strcmp(name, ipif_loopback_name) == 0;
3661
3662 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
3663 ill = ill_find_by_name(name, isv6, ipst);
3664 rw_exit(&ipst->ips_ill_g_lock);
3665 if (ill != NULL)
3666 return (ill);
3667
3668 /*
3669 * Couldn't find it. Does this happen to be a lookup for the
3670 * loopback device and are we allowed to allocate it?
3671 */
3672 if (!isloopback || !do_alloc)
3673 return (NULL);
3674
3675 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
3676 ill = ill_find_by_name(name, isv6, ipst);
3677 if (ill != NULL) {
3678 rw_exit(&ipst->ips_ill_g_lock);
3679 return (ill);
3680 }
3681
3682 /* Create the loopback device on demand */
3683 ill = (ill_t *)(mi_alloc(sizeof (ill_t) +
3684 sizeof (ipif_loopback_name), BPRI_MED));
3685 if (ill == NULL)
3686 goto done;
3687
3688 *ill = ill_null;
3689 mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, NULL);
3690 ill->ill_ipst = ipst;
3691 list_create(&ill->ill_nce, sizeof (nce_t), offsetof(nce_t, nce_node));
3692 netstack_hold(ipst->ips_netstack);
3693 /*
3694 * For exclusive stacks we set the zoneid to zero
3695 * to make IP operate as if in the global zone.
3696 */
3697 ill->ill_zoneid = GLOBAL_ZONEID;
3698
3699 ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t));
3700 if (ill->ill_phyint == NULL)
3701 goto done;
3702
3703 if (isv6)
3704 ill->ill_phyint->phyint_illv6 = ill;
3705 else
3706 ill->ill_phyint->phyint_illv4 = ill;
3707 mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0);
3708 phyint_flags_init(ill->ill_phyint, DL_LOOP);
3709
3710 if (isv6) {
3711 ill->ill_isv6 = B_TRUE;
3712 ill->ill_max_frag = ip_loopback_mtu_v6plus;
3713 } else {
3714 ill->ill_max_frag = ip_loopback_mtuplus;
3715 }
3716 if (!ill_allocate_mibs(ill))
3717 goto done;
3718 ill->ill_current_frag = ill->ill_max_frag;
3719 ill->ill_mtu = ill->ill_max_frag; /* Initial value */
3720 ill->ill_mc_mtu = ill->ill_mtu;
3721 /*
3722 * ipif_loopback_name can't be pointed at directly because it's used
3723 * by both the ipv4 and ipv6 interfaces. When the ill is removed
3724 * from the glist, ill_glist_delete() sets the first character of
3725 * ill_name to '\0'.
3726 */
3727 ill->ill_name = (char *)ill + sizeof (*ill);
3728 (void) strcpy(ill->ill_name, ipif_loopback_name);
3729 ill->ill_name_length = sizeof (ipif_loopback_name);
3730 /* Set ill_dlpi_pending for ipsq_current_finish() to work properly */
3731 ill->ill_dlpi_pending = DL_PRIM_INVAL;
3732
3733 rw_init(&ill->ill_mcast_lock, NULL, RW_DEFAULT, NULL);
3734 mutex_init(&ill->ill_mcast_serializer, NULL, MUTEX_DEFAULT, NULL);
3735 ill->ill_global_timer = INFINITY;
3736 ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0;
3737 ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0;
3738 ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
3739 ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL;
3740
3741 /* No resolver here. */
3742 ill->ill_net_type = IRE_LOOPBACK;
3743
3744 /* Initialize the ipsq */
3745 if (!ipsq_init(ill, B_FALSE))
3746 goto done;
3747
3748 ipif = ipif_allocate(ill, 0L, IRE_LOOPBACK, B_TRUE, B_TRUE, NULL);
3749 if (ipif == NULL)
3750 goto done;
3751
3752 ill->ill_flags = ILLF_MULTICAST;
3753
3754 ov6addr = ipif->ipif_v6lcl_addr;
3755 /* Set up default loopback address and mask. */
3756 if (!isv6) {
3757 ipaddr_t inaddr_loopback = htonl(INADDR_LOOPBACK);
3758
3759 IN6_IPADDR_TO_V4MAPPED(inaddr_loopback, &ipif->ipif_v6lcl_addr);
3760 V4MASK_TO_V6(htonl(IN_CLASSA_NET), ipif->ipif_v6net_mask);
3761 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
3762 ipif->ipif_v6subnet);
3763 ill->ill_flags |= ILLF_IPV4;
3764 } else {
3765 ipif->ipif_v6lcl_addr = ipv6_loopback;
3766 ipif->ipif_v6net_mask = ipv6_all_ones;
3767 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
3768 ipif->ipif_v6subnet);
3769 ill->ill_flags |= ILLF_IPV6;
3770 }
3771
3772 /*
3773 * Chain us in at the end of the ill list.
Hold the ill 3774 * before we make it globally visible. 1 for the lookup. 3775 */ 3776 ill->ill_refcnt = 0; 3777 ill_refhold(ill); 3778 3779 ill->ill_frag_count = 0; 3780 ill->ill_frag_free_num_pkts = 0; 3781 ill->ill_last_frag_clean_time = 0; 3782 3783 ipsq = ill->ill_phyint->phyint_ipsq; 3784 3785 ill_set_inputfn(ill); 3786 3787 if (ill_glist_insert(ill, "lo", isv6) != 0) 3788 cmn_err(CE_PANIC, "cannot insert loopback interface"); 3789 3790 /* Let SCTP know so that it can add this to its list */ 3791 sctp_update_ill(ill, SCTP_ILL_INSERT); 3792 3793 /* 3794 * We have already assigned ipif_v6lcl_addr above, but we need to 3795 * call sctp_update_ipif_addr() after SCTP_ILL_INSERT, which 3796 * must happen after ill_glist_insert() since we need the 3797 * ill_index set. Pass on ipv6_loopback as the old address. 3798 */ 3799 sctp_update_ipif_addr(ipif, ov6addr); 3800 3801 ip_rts_newaddrmsg(RTM_CHGADDR, 0, ipif, RTSQ_DEFAULT); 3802 3803 /* 3804 * ill_glist_insert() -> ill_phyint_reinit() may have merged IPSQs. 3805 * If so, free our original one. 3806 */ 3807 if (ipsq != ill->ill_phyint->phyint_ipsq) 3808 ipsq_delete(ipsq); 3809 3810 if (ipst->ips_loopback_ksp == NULL) { 3811 /* Export loopback interface statistics */ 3812 ipst->ips_loopback_ksp = kstat_create_netstack("lo", 0, 3813 ipif_loopback_name, "net", 3814 KSTAT_TYPE_NAMED, 2, 0, 3815 ipst->ips_netstack->netstack_stackid); 3816 if (ipst->ips_loopback_ksp != NULL) { 3817 ipst->ips_loopback_ksp->ks_update = 3818 loopback_kstat_update; 3819 kn = KSTAT_NAMED_PTR(ipst->ips_loopback_ksp); 3820 kstat_named_init(&kn[0], "ipackets", KSTAT_DATA_UINT32); 3821 kstat_named_init(&kn[1], "opackets", KSTAT_DATA_UINT32); 3822 ipst->ips_loopback_ksp->ks_private = 3823 (void *)(uintptr_t)ipst->ips_netstack-> 3824 netstack_stackid; 3825 kstat_install(ipst->ips_loopback_ksp); 3826 } 3827 } 3828 3829 *did_alloc = B_TRUE; 3830 rw_exit(&ipst->ips_ill_g_lock); 3831 ill_nic_event_dispatch(ill, MAP_IPIF_ID(ill->ill_ipif->ipif_id), 3832 NE_PLUMB, ill->ill_name, ill->ill_name_length); 3833 return (ill); 3834 done: 3835 if (ill != NULL) { 3836 if (ill->ill_phyint != NULL) { 3837 ipsq = ill->ill_phyint->phyint_ipsq; 3838 if (ipsq != NULL) { 3839 ipsq->ipsq_phyint = NULL; 3840 ipsq_delete(ipsq); 3841 } 3842 mi_free(ill->ill_phyint); 3843 } 3844 ill_free_mib(ill); 3845 if (ill->ill_ipst != NULL) 3846 netstack_rele(ill->ill_ipst->ips_netstack); 3847 mi_free(ill); 3848 } 3849 rw_exit(&ipst->ips_ill_g_lock); 3850 return (NULL); 3851 } 3852 3853 /* 3854 * For IPP calls - use the ip_stack_t for global stack. 3855 */ 3856 ill_t * 3857 ill_lookup_on_ifindex_global_instance(uint_t index, boolean_t isv6) 3858 { 3859 ip_stack_t *ipst; 3860 ill_t *ill; 3861 3862 ipst = netstack_find_by_stackid(GLOBAL_NETSTACKID)->netstack_ip; 3863 if (ipst == NULL) { 3864 cmn_err(CE_WARN, "No ip_stack_t for zoneid zero!\n"); 3865 return (NULL); 3866 } 3867 3868 ill = ill_lookup_on_ifindex(index, isv6, ipst); 3869 netstack_rele(ipst->ips_netstack); 3870 return (ill); 3871 } 3872 3873 /* 3874 * Return a pointer to the ill which matches the index and IP version type. 3875 */ 3876 ill_t * 3877 ill_lookup_on_ifindex(uint_t index, boolean_t isv6, ip_stack_t *ipst) 3878 { 3879 ill_t *ill; 3880 phyint_t *phyi; 3881 3882 /* 3883 * Indexes are stored in the phyint - a common structure 3884 * to both IPv4 and IPv6.
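* A minimal usage sketch (the returned ill, if non-NULL, has been refheld and must be released with ill_refrele() when the caller is done): ill = ill_lookup_on_ifindex(ifindex, B_FALSE, ipst); if (ill != NULL) { ... use the ill ... ill_refrele(ill); }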
*/ 3886 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 3887 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 3888 (void *) &index, NULL); 3889 if (phyi != NULL) { 3890 ill = isv6 ? phyi->phyint_illv6 : phyi->phyint_illv4; 3891 if (ill != NULL) { 3892 mutex_enter(&ill->ill_lock); 3893 if (!ILL_IS_CONDEMNED(ill)) { 3894 ill_refhold_locked(ill); 3895 mutex_exit(&ill->ill_lock); 3896 rw_exit(&ipst->ips_ill_g_lock); 3897 return (ill); 3898 } 3899 mutex_exit(&ill->ill_lock); 3900 } 3901 } 3902 rw_exit(&ipst->ips_ill_g_lock); 3903 return (NULL); 3904 } 3905 3906 /* 3907 * Verify whether or not an interface index is valid for the specified zoneid 3908 * to transmit packets. 3909 * It can be zero (meaning "reset") or an interface index assigned 3910 * to a non-VNI interface. (We don't use VNI interfaces to send packets.) 3911 */ 3912 boolean_t 3913 ip_xmit_ifindex_valid(uint_t ifindex, zoneid_t zoneid, boolean_t isv6, 3914 ip_stack_t *ipst) 3915 { 3916 ill_t *ill; 3917 3918 if (ifindex == 0) 3919 return (B_TRUE); 3920 3921 ill = ill_lookup_on_ifindex_zoneid(ifindex, zoneid, isv6, ipst); 3922 if (ill == NULL) 3923 return (B_FALSE); 3924 if (IS_VNI(ill)) { 3925 ill_refrele(ill); 3926 return (B_FALSE); 3927 } 3928 ill_refrele(ill); 3929 return (B_TRUE); 3930 } 3931 3932 /* 3933 * Return the ifindex next in sequence after the passed in ifindex. 3934 * If there is no next ifindex for the given protocol, return 0. 3935 */ 3936 uint_t 3937 ill_get_next_ifindex(uint_t index, boolean_t isv6, ip_stack_t *ipst) 3938 { 3939 phyint_t *phyi; 3940 phyint_t *phyi_initial; 3941 uint_t ifindex; 3942 3943 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 3944 3945 if (index == 0) { 3946 phyi = avl_first( 3947 &ipst->ips_phyint_g_list->phyint_list_avl_by_index); 3948 } else { 3949 phyi = phyi_initial = avl_find( 3950 &ipst->ips_phyint_g_list->phyint_list_avl_by_index, 3951 (void *) &index, NULL); 3952 } 3953 3954 for (; phyi != NULL; 3955 phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 3956 phyi, AVL_AFTER)) { 3957 /* 3958 * If we're not returning the first interface in the tree 3959 * and we still haven't moved past the phyint_t that 3960 * corresponds to index, avl_walk needs to be called again 3961 */ 3962 if (!((index != 0) && (phyi == phyi_initial))) { 3963 if (isv6) { 3964 if ((phyi->phyint_illv6) && 3965 ILL_CAN_LOOKUP(phyi->phyint_illv6) && 3966 (phyi->phyint_illv6->ill_isv6 == 1)) 3967 break; 3968 } else { 3969 if ((phyi->phyint_illv4) && 3970 ILL_CAN_LOOKUP(phyi->phyint_illv4) && 3971 (phyi->phyint_illv4->ill_isv6 == 0)) 3972 break; 3973 } 3974 } 3975 } 3976 3977 rw_exit(&ipst->ips_ill_g_lock); 3978 3979 if (phyi != NULL) 3980 ifindex = phyi->phyint_ifindex; 3981 else 3982 ifindex = 0; 3983 3984 return (ifindex); 3985 } 3986 3987 /* 3988 * Return the ifindex for the named interface. 3989 * If no such interface exists, return 0. 3990 */ 3991 uint_t 3992 ill_get_ifindex_by_name(char *name, ip_stack_t *ipst) 3993 { 3994 phyint_t *phyi; 3995 avl_index_t where = 0; 3996 uint_t ifindex; 3997 3998 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 3999 4000 if ((phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 4001 name, &where)) == NULL) { 4002 rw_exit(&ipst->ips_ill_g_lock); 4003 return (0); 4004 } 4005 4006 ifindex = phyi->phyint_ifindex; 4007 4008 rw_exit(&ipst->ips_ill_g_lock); 4009 4010 return (ifindex); 4011 } 4012 4013 /* 4014 * Return the ifindex to be used by upper layer protocols, for instance 4015 * for IPV6_RECVPKTINFO.
If the ill is under IPMP, this is the ifindex of the upper (IPMP) ill. 4016 */ 4017 uint_t 4018 ill_get_upper_ifindex(const ill_t *ill) 4019 { 4020 if (IS_UNDER_IPMP(ill)) 4021 return (ipmp_ill_get_ipmp_ifindex(ill)); 4022 else 4023 return (ill->ill_phyint->phyint_ifindex); 4024 } 4025 4026 4027 /* 4028 * Obtain a reference to the ill. The ill_refcnt is a dynamic refcnt 4029 * that gives a running thread a reference to the ill. This reference must be 4030 * released by the thread when it is done accessing the ill and related 4031 * objects. ill_refcnt cannot be used to account for static references 4032 * such as other structures pointing to an ill. Callers must generally 4033 * check whether an ill can be refheld by using ILL_CAN_LOOKUP macros 4034 * or be sure that the ill is not being deleted or changing state before 4035 * calling the refhold functions. A non-zero ill_refcnt ensures that the 4036 * ill won't change any of its critical state such as address, netmask etc. 4037 */ 4038 void 4039 ill_refhold(ill_t *ill) 4040 { 4041 mutex_enter(&ill->ill_lock); 4042 ill->ill_refcnt++; 4043 ILL_TRACE_REF(ill); 4044 mutex_exit(&ill->ill_lock); 4045 } 4046 4047 void 4048 ill_refhold_locked(ill_t *ill) 4049 { 4050 ASSERT(MUTEX_HELD(&ill->ill_lock)); 4051 ill->ill_refcnt++; 4052 ILL_TRACE_REF(ill); 4053 } 4054 4055 /* Returns true if we managed to get a refhold */ 4056 boolean_t 4057 ill_check_and_refhold(ill_t *ill) 4058 { 4059 mutex_enter(&ill->ill_lock); 4060 if (!ILL_IS_CONDEMNED(ill)) { 4061 ill_refhold_locked(ill); 4062 mutex_exit(&ill->ill_lock); 4063 return (B_TRUE); 4064 } 4065 mutex_exit(&ill->ill_lock); 4066 return (B_FALSE); 4067 } 4068 4069 /* 4070 * Must not be called while holding any locks. Otherwise if this is 4071 * the last reference to be released, there is a chance of recursive mutex 4072 * panic due to ill_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying 4073 * to restart an ioctl. 4074 */ 4075 void 4076 ill_refrele(ill_t *ill) 4077 { 4078 mutex_enter(&ill->ill_lock); 4079 ASSERT(ill->ill_refcnt != 0); 4080 ill->ill_refcnt--; 4081 ILL_UNTRACE_REF(ill); 4082 if (ill->ill_refcnt != 0) { 4083 /* Every ire pointing to the ill adds 1 to ill_refcnt */ 4084 mutex_exit(&ill->ill_lock); 4085 return; 4086 } 4087 4088 /* Drops the ill_lock */ 4089 ipif_ill_refrele_tail(ill); 4090 } 4091 4092 /* 4093 * Obtain a weak reference count on the ill. This reference ensures the 4094 * ill won't be freed, but the ill may change any of its critical state 4095 * such as netmask, address etc. Returns B_FALSE if the ill has started 4096 * closing. 4097 */ 4098 boolean_t 4099 ill_waiter_inc(ill_t *ill) 4100 { 4101 mutex_enter(&ill->ill_lock); 4102 if (ill->ill_state_flags & ILL_CONDEMNED) { 4103 mutex_exit(&ill->ill_lock); 4104 return (B_FALSE); 4105 } 4106 ill->ill_waiters++; 4107 mutex_exit(&ill->ill_lock); 4108 return (B_TRUE); 4109 } 4110 4111 void 4112 ill_waiter_dcr(ill_t *ill) 4113 { 4114 mutex_enter(&ill->ill_lock); 4115 ill->ill_waiters--; 4116 if (ill->ill_waiters == 0) 4117 cv_broadcast(&ill->ill_cv); 4118 mutex_exit(&ill->ill_lock); 4119 } 4120 4121 /* 4122 * ip_ll_subnet_defaults is called when we get the DL_INFO_ACK back from the 4123 * driver. We construct best guess defaults for lower level information that 4124 * we need. If an interface is brought up without injection of any overriding 4125 * information from outside, we have to be ready to go with these defaults. 4126 * When we get the first DL_INFO_ACK (from ip_open() sending a DL_INFO_REQ) 4127 * we primarily want the dl_provider_style.
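* (In outline, the exchange as described here and in ipif_set_values() is: ip_open() sends DL_INFO_REQ and the first DL_INFO_ACK supplies dl_provider_style; after DL_ATTACH/DL_BIND, ipif_set_values() sends another DL_INFO_REQ and the second DL_INFO_ACK supplies the link-layer details handled below.)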
4128 * The subsequent DL_INFO_ACK is received after doing a DL_ATTACH and DL_BIND 4129 * at which point we assume the other part of the information is valid. 4130 */ 4131 void 4132 ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp) 4133 { 4134 uchar_t *brdcst_addr; 4135 uint_t brdcst_addr_length, phys_addr_length; 4136 t_scalar_t sap_length; 4137 dl_info_ack_t *dlia; 4138 ip_m_t *ipm; 4139 dl_qos_cl_sel1_t *sel1; 4140 int min_mtu; 4141 4142 ASSERT(IAM_WRITER_ILL(ill)); 4143 4144 /* 4145 * Till the ill is fully up the ill is not globally visible. 4146 * So no need for a lock. 4147 */ 4148 dlia = (dl_info_ack_t *)mp->b_rptr; 4149 ill->ill_mactype = dlia->dl_mac_type; 4150 4151 ipm = ip_m_lookup(dlia->dl_mac_type); 4152 if (ipm == NULL) { 4153 ipm = ip_m_lookup(DL_OTHER); 4154 ASSERT(ipm != NULL); 4155 } 4156 ill->ill_media = ipm; 4157 4158 /* 4159 * When the new DLPI stuff is ready we'll pull lengths 4160 * from dlia. 4161 */ 4162 if (dlia->dl_version == DL_VERSION_2) { 4163 brdcst_addr_length = dlia->dl_brdcst_addr_length; 4164 brdcst_addr = mi_offset_param(mp, dlia->dl_brdcst_addr_offset, 4165 brdcst_addr_length); 4166 if (brdcst_addr == NULL) { 4167 brdcst_addr_length = 0; 4168 } 4169 sap_length = dlia->dl_sap_length; 4170 phys_addr_length = dlia->dl_addr_length - ABS(sap_length); 4171 ip1dbg(("ip: bcast_len %d, sap_len %d, phys_len %d\n", 4172 brdcst_addr_length, sap_length, phys_addr_length)); 4173 } else { 4174 brdcst_addr_length = 6; 4175 brdcst_addr = ip_six_byte_all_ones; 4176 sap_length = -2; 4177 phys_addr_length = brdcst_addr_length; 4178 } 4179 4180 ill->ill_bcast_addr_length = brdcst_addr_length; 4181 ill->ill_phys_addr_length = phys_addr_length; 4182 ill->ill_sap_length = sap_length; 4183 4184 /* 4185 * Synthetic DLPI types such as SUNW_DL_IPMP specify a zero SDU, 4186 * but we must ensure a minimum IP MTU is used since other bits of 4187 * IP will fly apart otherwise. 4188 */ 4189 min_mtu = ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU; 4190 ill->ill_max_frag = MAX(min_mtu, dlia->dl_max_sdu); 4191 ill->ill_current_frag = ill->ill_max_frag; 4192 ill->ill_mtu = ill->ill_max_frag; 4193 ill->ill_mc_mtu = ill->ill_mtu; /* Overridden by DL_NOTE_SDU_SIZE2 */ 4194 4195 ill->ill_type = ipm->ip_m_type; 4196 4197 if (!ill->ill_dlpi_style_set) { 4198 if (dlia->dl_provider_style == DL_STYLE2) 4199 ill->ill_needs_attach = 1; 4200 4201 phyint_flags_init(ill->ill_phyint, ill->ill_mactype); 4202 4203 /* 4204 * Allocate the first ipif on this ill. We don't delay it 4205 * further as ioctl handling assumes at least one ipif exists. 4206 * 4207 * At this point we don't know whether the ill is v4 or v6. 4208 * We will know this when the SIOCSLIFNAME happens and 4209 * the correct value for ill_isv6 will be assigned in 4210 * ipif_set_values(). We need to hold the ill lock and 4211 * clear the ILL_LL_SUBNET_PENDING flag and atomically do 4212 * the wakeup. 4213 */ 4214 (void) ipif_allocate(ill, 0, IRE_LOCAL, 4215 dlia->dl_provider_style != DL_STYLE2, B_TRUE, NULL); 4216 mutex_enter(&ill->ill_lock); 4217 ASSERT(ill->ill_dlpi_style_set == 0); 4218 ill->ill_dlpi_style_set = 1; 4219 ill->ill_state_flags &= ~ILL_LL_SUBNET_PENDING; 4220 cv_broadcast(&ill->ill_cv); 4221 mutex_exit(&ill->ill_lock); 4222 freemsg(mp); 4223 return; 4224 } 4225 ASSERT(ill->ill_ipif != NULL); 4226 /* 4227 * We know whether it is IPv4 or IPv6 now, as this is the 4228 * second DL_INFO_ACK we are receiving in response to the 4229 * DL_INFO_REQ sent in ipif_set_values. 4230 */ 4231 ill->ill_sap = (ill->ill_isv6) ?
ipm->ip_m_ipv6sap : ipm->ip_m_ipv4sap; 4232 /* 4233 * Clear all the flags that were set based on ill_bcast_addr_length 4234 * and ill_phys_addr_length (in ipif_set_values) as these could have 4235 * changed now and we need to re-evaluate. 4236 */ 4237 ill->ill_flags &= ~(ILLF_MULTICAST | ILLF_NONUD | ILLF_NOARP); 4238 ill->ill_ipif->ipif_flags &= ~(IPIF_BROADCAST | IPIF_POINTOPOINT); 4239 4240 /* 4241 * Free ill_bcast_mp as things could have changed now. 4242 * 4243 * NOTE: The IPMP meta-interface is special-cased because it starts 4244 * with no underlying interfaces (and thus an unknown broadcast 4245 * address length), but we enforce that an interface is broadcast- 4246 * capable as part of allowing it to join a group. 4247 */ 4248 if (ill->ill_bcast_addr_length == 0 && !IS_IPMP(ill)) { 4249 if (ill->ill_bcast_mp != NULL) 4250 freemsg(ill->ill_bcast_mp); 4251 ill->ill_net_type = IRE_IF_NORESOLVER; 4252 4253 ill->ill_bcast_mp = ill_dlur_gen(NULL, 4254 ill->ill_phys_addr_length, 4255 ill->ill_sap, 4256 ill->ill_sap_length); 4257 4258 if (ill->ill_isv6) 4259 /* 4260 * Note: xresolv interfaces will eventually need NOARP 4261 * set here as well, but that will require those 4262 * external resolvers to have some knowledge of 4263 * that flag and act appropriately. Not to be changed 4264 * at present. 4265 */ 4266 ill->ill_flags |= ILLF_NONUD; 4267 else 4268 ill->ill_flags |= ILLF_NOARP; 4269 4270 if (ill->ill_mactype == SUNW_DL_VNI) { 4271 ill->ill_ipif->ipif_flags |= IPIF_NOXMIT; 4272 } else if (ill->ill_phys_addr_length == 0 || 4273 ill->ill_mactype == DL_IPV4 || 4274 ill->ill_mactype == DL_IPV6) { 4275 /* 4276 * The underlying link is point-to-point, so mark the 4277 * interface as such. We can do IP multicast over 4278 * such a link since it transmits all network-layer 4279 * packets to the remote side the same way. 4280 */ 4281 ill->ill_flags |= ILLF_MULTICAST; 4282 ill->ill_ipif->ipif_flags |= IPIF_POINTOPOINT; 4283 } 4284 } else { 4285 ill->ill_net_type = IRE_IF_RESOLVER; 4286 if (ill->ill_bcast_mp != NULL) 4287 freemsg(ill->ill_bcast_mp); 4288 ill->ill_bcast_mp = ill_dlur_gen(brdcst_addr, 4289 ill->ill_bcast_addr_length, ill->ill_sap, 4290 ill->ill_sap_length); 4291 /* 4292 * Later detect lack of DLPI driver multicast 4293 * capability by catching DL_ENABMULTI errors in 4294 * ip_rput_dlpi. 4295 */ 4296 ill->ill_flags |= ILLF_MULTICAST; 4297 if (!ill->ill_isv6) 4298 ill->ill_ipif->ipif_flags |= IPIF_BROADCAST; 4299 } 4300 4301 /* For IPMP, PHYI_IPMP should already be set by phyint_flags_init() */ 4302 if (ill->ill_mactype == SUNW_DL_IPMP) 4303 ASSERT(ill->ill_phyint->phyint_flags & PHYI_IPMP); 4304 4305 /* By default an interface does not support any CoS marking */ 4306 ill->ill_flags &= ~ILLF_COS_ENABLED; 4307 4308 /* 4309 * If we get QoS information in DL_INFO_ACK, the device supports 4310 * some form of CoS marking, set ILLF_COS_ENABLED. 4311 */ 4312 sel1 = (dl_qos_cl_sel1_t *)mi_offset_param(mp, dlia->dl_qos_offset, 4313 dlia->dl_qos_length); 4314 if ((sel1 != NULL) && (sel1->dl_qos_type == DL_QOS_CL_SEL1)) { 4315 ill->ill_flags |= ILLF_COS_ENABLED; 4316 } 4317 4318 /* Clear any previous error indication. */ 4319 ill->ill_error = 0; 4320 freemsg(mp); 4321 } 4322 4323 /* 4324 * Perform various checks to verify that an address would make sense as a 4325 * local, remote, or subnet interface address.
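* For example (a sketch of the checks below with ordinary dotted-quad values): given subnet_mask 255.255.255.0, the subnet address 192.0.2.0 and the subnet broadcast 192.0.2.255 are rejected, while 192.0.2.1 is accepted; 255.255.255.255 and class D (multicast) addresses are always rejected.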
*/ 4327 static boolean_t 4328 ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask) 4329 { 4330 ipaddr_t net_mask; 4331 4332 /* 4333 * Don't allow all zeroes, or all ones, but allow 4334 * all ones netmask. 4335 */ 4336 if ((net_mask = ip_net_mask(addr)) == 0) 4337 return (B_FALSE); 4338 /* A given netmask overrides the "guess" netmask */ 4339 if (subnet_mask != 0) 4340 net_mask = subnet_mask; 4341 if ((net_mask != ~(ipaddr_t)0) && ((addr == (addr & net_mask)) || 4342 (addr == (addr | ~net_mask)))) { 4343 return (B_FALSE); 4344 } 4345 4346 /* 4347 * Even if the netmask is all ones, we do not allow address to be 4348 * 255.255.255.255 4349 */ 4350 if (addr == INADDR_BROADCAST) 4351 return (B_FALSE); 4352 4353 if (CLASSD(addr)) 4354 return (B_FALSE); 4355 4356 return (B_TRUE); 4357 } 4358 4359 #define V6_IPIF_LINKLOCAL(p) \ 4360 IN6_IS_ADDR_LINKLOCAL(&(p)->ipif_v6lcl_addr) 4361 4362 /* 4363 * Compare two given ipifs and check if the second one is better than 4364 * the first one using the order of preference (not taking deprecated 4365 * into account) specified in ipif_lookup_multicast(). 4366 */ 4367 static boolean_t 4368 ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif, boolean_t isv6) 4369 { 4370 /* Check the least preferred first. */ 4371 if (IS_LOOPBACK(old_ipif->ipif_ill)) { 4372 /* If both ipifs are the same, use the first one. */ 4373 if (IS_LOOPBACK(new_ipif->ipif_ill)) 4374 return (B_FALSE); 4375 else 4376 return (B_TRUE); 4377 } 4378 4379 /* For IPv6, check for link local address. */ 4380 if (isv6 && V6_IPIF_LINKLOCAL(old_ipif)) { 4381 if (IS_LOOPBACK(new_ipif->ipif_ill) || 4382 V6_IPIF_LINKLOCAL(new_ipif)) { 4383 /* The second one is equal or less preferred. */ 4384 return (B_FALSE); 4385 } else { 4386 return (B_TRUE); 4387 } 4388 } 4389 4390 /* Then check for point to point interface. */ 4391 if (old_ipif->ipif_flags & IPIF_POINTOPOINT) { 4392 if (IS_LOOPBACK(new_ipif->ipif_ill) || 4393 (isv6 && V6_IPIF_LINKLOCAL(new_ipif)) || 4394 (new_ipif->ipif_flags & IPIF_POINTOPOINT)) { 4395 return (B_FALSE); 4396 } else { 4397 return (B_TRUE); 4398 } 4399 } 4400 4401 /* old_ipif is a normal interface, so no need to use the new one. */ 4402 return (B_FALSE); 4403 } 4404 4405 /* 4406 * Find a multicast-capable ipif given an IP instance and zoneid. 4407 * The ipif must be up, and its ill must be multicast-capable, not 4408 * condemned, not an underlying interface in an IPMP group, and 4409 * not a VNI interface. Order of preference: 4410 * 4411 * 1a. normal 4412 * 1b. normal, but deprecated 4413 * 2a. point to point 4414 * 2b. point to point, but deprecated 4415 * 3a. link local 4416 * 3b. link local, but deprecated 4417 * 4. loopback.
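* (By this ordering, for example, a deprecated address on a normal interface (1b) is still preferred over a non-deprecated point-to-point address (2a).)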
*/ 4419 static ipif_t * 4420 ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6) 4421 { 4422 ill_t *ill; 4423 ill_walk_context_t ctx; 4424 ipif_t *ipif; 4425 ipif_t *saved_ipif = NULL; 4426 ipif_t *dep_ipif = NULL; 4427 4428 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 4429 if (isv6) 4430 ill = ILL_START_WALK_V6(&ctx, ipst); 4431 else 4432 ill = ILL_START_WALK_V4(&ctx, ipst); 4433 4434 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 4435 mutex_enter(&ill->ill_lock); 4436 if (IS_VNI(ill) || IS_UNDER_IPMP(ill) || 4437 ILL_IS_CONDEMNED(ill) || 4438 !(ill->ill_flags & ILLF_MULTICAST)) { 4439 mutex_exit(&ill->ill_lock); 4440 continue; 4441 } 4442 for (ipif = ill->ill_ipif; ipif != NULL; 4443 ipif = ipif->ipif_next) { 4444 if (zoneid != ipif->ipif_zoneid && 4445 zoneid != ALL_ZONES && 4446 ipif->ipif_zoneid != ALL_ZONES) { 4447 continue; 4448 } 4449 if (!(ipif->ipif_flags & IPIF_UP) || 4450 IPIF_IS_CONDEMNED(ipif)) { 4451 continue; 4452 } 4453 4454 /* 4455 * Found one candidate. If it is deprecated, 4456 * remember it in dep_ipif. If it is not deprecated, 4457 * remember it in saved_ipif. 4458 */ 4459 if (ipif->ipif_flags & IPIF_DEPRECATED) { 4460 if (dep_ipif == NULL) { 4461 dep_ipif = ipif; 4462 } else if (ipif_comp_multi(dep_ipif, ipif, 4463 isv6)) { 4464 /* 4465 * If the previous dep_ipif does not 4466 * belong to the same ill, we've done 4467 * an ipif_refhold() on it. So we need 4468 * to release it. 4469 */ 4470 if (dep_ipif->ipif_ill != ill) 4471 ipif_refrele(dep_ipif); 4472 dep_ipif = ipif; 4473 } 4474 continue; 4475 } 4476 if (saved_ipif == NULL) { 4477 saved_ipif = ipif; 4478 } else { 4479 if (ipif_comp_multi(saved_ipif, ipif, isv6)) { 4480 if (saved_ipif->ipif_ill != ill) 4481 ipif_refrele(saved_ipif); 4482 saved_ipif = ipif; 4483 } 4484 } 4485 } 4486 /* 4487 * Before going to the next ill, do an ipif_refhold() on the 4488 * saved ones. 4489 */ 4490 if (saved_ipif != NULL && saved_ipif->ipif_ill == ill) 4491 ipif_refhold_locked(saved_ipif); 4492 if (dep_ipif != NULL && dep_ipif->ipif_ill == ill) 4493 ipif_refhold_locked(dep_ipif); 4494 mutex_exit(&ill->ill_lock); 4495 } 4496 rw_exit(&ipst->ips_ill_g_lock); 4497 4498 /* 4499 * If we have only the saved_ipif, return it. But if we have both 4500 * saved_ipif and dep_ipif, check to see which one is better. 4501 */ 4502 if (saved_ipif != NULL) { 4503 if (dep_ipif != NULL) { 4504 if (ipif_comp_multi(saved_ipif, dep_ipif, isv6)) { 4505 ipif_refrele(saved_ipif); 4506 return (dep_ipif); 4507 } else { 4508 ipif_refrele(dep_ipif); 4509 return (saved_ipif); 4510 } 4511 } 4512 return (saved_ipif); 4513 } else { 4514 return (dep_ipif); 4515 } 4516 } 4517 4518 ill_t * 4519 ill_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6) 4520 { 4521 ipif_t *ipif; 4522 ill_t *ill; 4523 4524 ipif = ipif_lookup_multicast(ipst, zoneid, isv6); 4525 if (ipif == NULL) 4526 return (NULL); 4527 4528 ill = ipif->ipif_ill; 4529 ill_refhold(ill); 4530 ipif_refrele(ipif); 4531 return (ill); 4532 } 4533 4534 /* 4535 * This function is called when an application does not specify an interface 4536 * to be used for multicast traffic (joining a group/sending data). It 4537 * calls ire_lookup_multi() to look for an interface route for the 4538 * specified multicast group. Doing this allows the administrator to add 4539 * prefix routes for multicast to indicate which interface should be used for 4540 * multicast traffic in the above scenario.
The route could be for all 4541 * multicast (224.0/4), for a single multicast group (a /32 route) or 4542 * anything in between. If there is no such multicast route, we just find 4543 * any multicast capable interface and return it. The returned ipif 4544 * is refhold'ed. 4545 * 4546 * We support MULTIRT and RTF_SETSRC on the multicast routes added to the 4547 * unicast table. This is used by CGTP. 4548 */ 4549 ill_t * 4550 ill_lookup_group_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst, 4551 boolean_t *multirtp, ipaddr_t *setsrcp) 4552 { 4553 ill_t *ill; 4554 4555 ill = ire_lookup_multi_ill_v4(group, zoneid, ipst, multirtp, setsrcp); 4556 if (ill != NULL) 4557 return (ill); 4558 4559 return (ill_lookup_multicast(ipst, zoneid, B_FALSE)); 4560 } 4561 4562 /* 4563 * Look for an ipif with the specified interface address and destination. 4564 * The destination address is used only for matching point-to-point interfaces. 4565 */ 4566 ipif_t * 4567 ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, ip_stack_t *ipst) 4568 { 4569 ipif_t *ipif; 4570 ill_t *ill; 4571 ill_walk_context_t ctx; 4572 4573 /* 4574 * First match all the point-to-point interfaces 4575 * before looking at non-point-to-point interfaces. 4576 * This is done to avoid returning non-point-to-point 4577 * ipif instead of unnumbered point-to-point ipif. 4578 */ 4579 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 4580 ill = ILL_START_WALK_V4(&ctx, ipst); 4581 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 4582 mutex_enter(&ill->ill_lock); 4583 for (ipif = ill->ill_ipif; ipif != NULL; 4584 ipif = ipif->ipif_next) { 4585 /* Allow the ipif to be down */ 4586 if ((ipif->ipif_flags & IPIF_POINTOPOINT) && 4587 (ipif->ipif_lcl_addr == if_addr) && 4588 (ipif->ipif_pp_dst_addr == dst)) { 4589 if (!IPIF_IS_CONDEMNED(ipif)) { 4590 ipif_refhold_locked(ipif); 4591 mutex_exit(&ill->ill_lock); 4592 rw_exit(&ipst->ips_ill_g_lock); 4593 return (ipif); 4594 } 4595 } 4596 } 4597 mutex_exit(&ill->ill_lock); 4598 } 4599 rw_exit(&ipst->ips_ill_g_lock); 4600 4601 /* lookup the ipif based on interface address */ 4602 ipif = ipif_lookup_addr(if_addr, NULL, ALL_ZONES, ipst); 4603 ASSERT(ipif == NULL || !ipif->ipif_isv6); 4604 return (ipif); 4605 } 4606 4607 /* 4608 * Common function for ipif_lookup_addr() and ipif_lookup_addr_exact(). 4609 */ 4610 static ipif_t * 4611 ipif_lookup_addr_common(ipaddr_t addr, ill_t *match_ill, uint32_t match_flags, 4612 zoneid_t zoneid, ip_stack_t *ipst) 4613 { 4614 ipif_t *ipif; 4615 ill_t *ill; 4616 boolean_t ptp = B_FALSE; 4617 ill_walk_context_t ctx; 4618 boolean_t match_illgrp = (match_flags & IPIF_MATCH_ILLGRP); 4619 boolean_t no_duplicate = (match_flags & IPIF_MATCH_NONDUP); 4620 4621 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 4622 /* 4623 * Repeat twice, first based on local addresses and 4624 * next time for pointopoint. 
*/ 4626 repeat: 4627 ill = ILL_START_WALK_V4(&ctx, ipst); 4628 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 4629 if (match_ill != NULL && ill != match_ill && 4630 (!match_illgrp || !IS_IN_SAME_ILLGRP(ill, match_ill))) { 4631 continue; 4632 } 4633 mutex_enter(&ill->ill_lock); 4634 for (ipif = ill->ill_ipif; ipif != NULL; 4635 ipif = ipif->ipif_next) { 4636 if (zoneid != ALL_ZONES && 4637 zoneid != ipif->ipif_zoneid && 4638 ipif->ipif_zoneid != ALL_ZONES) 4639 continue; 4640 4641 if (no_duplicate && !(ipif->ipif_flags & IPIF_UP)) 4642 continue; 4643 4644 /* Allow the ipif to be down */ 4645 if ((!ptp && (ipif->ipif_lcl_addr == addr) && 4646 ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) || 4647 (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) && 4648 (ipif->ipif_pp_dst_addr == addr))) { 4649 if (!IPIF_IS_CONDEMNED(ipif)) { 4650 ipif_refhold_locked(ipif); 4651 mutex_exit(&ill->ill_lock); 4652 rw_exit(&ipst->ips_ill_g_lock); 4653 return (ipif); 4654 } 4655 } 4656 } 4657 mutex_exit(&ill->ill_lock); 4658 } 4659 4660 /* If we already did the ptp case, then we are done */ 4661 if (ptp) { 4662 rw_exit(&ipst->ips_ill_g_lock); 4663 return (NULL); 4664 } 4665 ptp = B_TRUE; 4666 goto repeat; 4667 } 4668 4669 /* 4670 * Lookup an ipif with the specified address. For point-to-point links we 4671 * look for matches on either the destination address or the local address, 4672 * but we skip the local address check if IPIF_UNNUMBERED is set. If the 4673 * `match_ill' argument is non-NULL, the lookup is restricted to that ill 4674 * (or illgrp if `match_ill' is in an IPMP group). 4675 */ 4676 ipif_t * 4677 ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, 4678 ip_stack_t *ipst) 4679 { 4680 return (ipif_lookup_addr_common(addr, match_ill, IPIF_MATCH_ILLGRP, 4681 zoneid, ipst)); 4682 } 4683 4684 /* 4685 * Lookup an ipif with the specified address. Similar to ipif_lookup_addr, 4686 * except that we will only return an address if it is not marked as 4687 * IPIF_DUPLICATE. 4688 */ 4689 ipif_t * 4690 ipif_lookup_addr_nondup(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, 4691 ip_stack_t *ipst) 4692 { 4693 return (ipif_lookup_addr_common(addr, match_ill, 4694 (IPIF_MATCH_ILLGRP | IPIF_MATCH_NONDUP), 4695 zoneid, ipst)); 4696 } 4697 4698 /* 4699 * Special abbreviated version of ipif_lookup_addr() that doesn't match 4700 * `match_ill' across the IPMP group. This function is only needed in some 4701 * corner-cases; almost everything should use ipif_lookup_addr(). 4702 */ 4703 ipif_t * 4704 ipif_lookup_addr_exact(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst) 4705 { 4706 ASSERT(match_ill != NULL); 4707 return (ipif_lookup_addr_common(addr, match_ill, 0, ALL_ZONES, 4708 ipst)); 4709 } 4710 4711 /* 4712 * Look for an ipif with the specified address. For point-to-point links 4713 * we look for matches on either the destination address or the local 4714 * address, but we ignore the check on the local address if IPIF_UNNUMBERED 4715 * is set. 4716 * If the `match_ill' argument is non-NULL, the lookup is restricted to that 4717 * ill (or illgrp if `match_ill' is in an IPMP group). 4718 * Return the zoneid for the ipif which matches. ALL_ZONES if no match.
*/ 4720 zoneid_t 4721 ipif_lookup_addr_zoneid(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst) 4722 { 4723 zoneid_t zoneid; 4724 ipif_t *ipif; 4725 ill_t *ill; 4726 boolean_t ptp = B_FALSE; 4727 ill_walk_context_t ctx; 4728 4729 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 4730 /* 4731 * Repeat twice, first based on local addresses and 4732 * next time for pointopoint. 4733 */ 4734 repeat: 4735 ill = ILL_START_WALK_V4(&ctx, ipst); 4736 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 4737 if (match_ill != NULL && ill != match_ill && 4738 !IS_IN_SAME_ILLGRP(ill, match_ill)) { 4739 continue; 4740 } 4741 mutex_enter(&ill->ill_lock); 4742 for (ipif = ill->ill_ipif; ipif != NULL; 4743 ipif = ipif->ipif_next) { 4744 /* Allow the ipif to be down */ 4745 if ((!ptp && (ipif->ipif_lcl_addr == addr) && 4746 ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) || 4747 (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) && 4748 (ipif->ipif_pp_dst_addr == addr)) && 4749 !(ipif->ipif_state_flags & IPIF_CONDEMNED)) { 4750 zoneid = ipif->ipif_zoneid; 4751 mutex_exit(&ill->ill_lock); 4752 rw_exit(&ipst->ips_ill_g_lock); 4753 /* 4754 * If ipif_zoneid was ALL_ZONES then we have 4755 * a trusted extensions shared IP address. 4756 * In that case GLOBAL_ZONEID works to send. 4757 */ 4758 if (zoneid == ALL_ZONES) 4759 zoneid = GLOBAL_ZONEID; 4760 return (zoneid); 4761 } 4762 } 4763 mutex_exit(&ill->ill_lock); 4764 } 4765 4766 /* If we already did the ptp case, then we are done */ 4767 if (ptp) { 4768 rw_exit(&ipst->ips_ill_g_lock); 4769 return (ALL_ZONES); 4770 } 4771 ptp = B_TRUE; 4772 goto repeat; 4773 } 4774 4775 /* 4776 * Look for an ipif that matches the specified remote address i.e. the 4777 * ipif that would receive the specified packet. 4778 * First look for directly connected interfaces and then do a recursive 4779 * IRE lookup and pick the first ipif corresponding to the source address in the 4780 * ire. 4781 * Returns: held ipif 4782 * 4783 * This is only used for ICMP_ADDRESS_MASK_REQUESTs 4784 */ 4785 ipif_t * 4786 ipif_lookup_remote(ill_t *ill, ipaddr_t addr, zoneid_t zoneid) 4787 { 4788 ipif_t *ipif; 4789 4790 ASSERT(!ill->ill_isv6); 4791 4792 /* 4793 * Someone could be changing this ipif currently or change it 4794 * after we return this. Thus a few packets could use the old 4795 * values. However, structure updates/creates (ire, ilg, ilm etc.) 4796 * will atomically be updated or cleaned up with the new value. 4797 * Thus we don't need a lock to check the flags or other attrs below. 4798 */ 4799 mutex_enter(&ill->ill_lock); 4800 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 4801 if (IPIF_IS_CONDEMNED(ipif)) 4802 continue; 4803 if (zoneid != ALL_ZONES && zoneid != ipif->ipif_zoneid && 4804 ipif->ipif_zoneid != ALL_ZONES) 4805 continue; 4806 /* Allow the ipif to be down */ 4807 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 4808 if ((ipif->ipif_pp_dst_addr == addr) || 4809 (!(ipif->ipif_flags & IPIF_UNNUMBERED) && 4810 ipif->ipif_lcl_addr == addr)) { 4811 ipif_refhold_locked(ipif); 4812 mutex_exit(&ill->ill_lock); 4813 return (ipif); 4814 } 4815 } else if (ipif->ipif_subnet == (addr & ipif->ipif_net_mask)) { 4816 ipif_refhold_locked(ipif); 4817 mutex_exit(&ill->ill_lock); 4818 return (ipif); 4819 } 4820 } 4821 mutex_exit(&ill->ill_lock); 4822 /* 4823 * For a remote destination it isn't possible to nail down a particular 4824 * ipif.
4825 */ 4826 4827 /* Pick the first interface */ 4828 ipif = ipif_get_next_ipif(NULL, ill); 4829 return (ipif); 4830 } 4831 4832 /* 4833 * This func does not prevent refcnt from increasing. But if 4834 * the caller has taken steps to that effect, then this func 4835 * can be used to determine whether the ill has become quiescent 4836 */ 4837 static boolean_t 4838 ill_is_quiescent(ill_t *ill) 4839 { 4840 ipif_t *ipif; 4841 4842 ASSERT(MUTEX_HELD(&ill->ill_lock)); 4843 4844 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 4845 if (ipif->ipif_refcnt != 0) 4846 return (B_FALSE); 4847 } 4848 if (!ILL_DOWN_OK(ill) || ill->ill_refcnt != 0) { 4849 return (B_FALSE); 4850 } 4851 return (B_TRUE); 4852 } 4853 4854 boolean_t 4855 ill_is_freeable(ill_t *ill) 4856 { 4857 ipif_t *ipif; 4858 4859 ASSERT(MUTEX_HELD(&ill->ill_lock)); 4860 4861 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 4862 if (ipif->ipif_refcnt != 0) { 4863 return (B_FALSE); 4864 } 4865 } 4866 if (!ILL_FREE_OK(ill) || ill->ill_refcnt != 0) { 4867 return (B_FALSE); 4868 } 4869 return (B_TRUE); 4870 } 4871 4872 /* 4873 * This func does not prevent refcnt from increasing. But if 4874 * the caller has taken steps to that effect, then this func 4875 * can be used to determine whether the ipif has become quiescent 4876 */ 4877 static boolean_t 4878 ipif_is_quiescent(ipif_t *ipif) 4879 { 4880 ill_t *ill; 4881 4882 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 4883 4884 if (ipif->ipif_refcnt != 0) 4885 return (B_FALSE); 4886 4887 ill = ipif->ipif_ill; 4888 if (ill->ill_ipif_up_count != 0 || ill->ill_ipif_dup_count != 0 || 4889 ill->ill_logical_down) { 4890 return (B_TRUE); 4891 } 4892 4893 /* This is the last ipif going down or being deleted on this ill */ 4894 if (ill->ill_ire_cnt != 0 || ill->ill_refcnt != 0) { 4895 return (B_FALSE); 4896 } 4897 4898 return (B_TRUE); 4899 } 4900 4901 /* 4902 * return true if the ipif can be destroyed: the ipif has to be quiescent 4903 * with zero references from ire/ilm to it. 4904 */ 4905 static boolean_t 4906 ipif_is_freeable(ipif_t *ipif) 4907 { 4908 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 4909 ASSERT(ipif->ipif_id != 0); 4910 return (ipif->ipif_refcnt == 0); 4911 } 4912 4913 /* 4914 * The ipif/ill/ire has been refreled. Do the tail processing. 
4915 * Determine if the ipif or ill in question has become quiescent and if so 4916 * wake up close and/or restart any queued pending ioctl that is waiting 4917 * for the ipif_down (or ill_down) 4918 */ 4919 void 4920 ipif_ill_refrele_tail(ill_t *ill) 4921 { 4922 mblk_t *mp; 4923 conn_t *connp; 4924 ipsq_t *ipsq; 4925 ipxop_t *ipx; 4926 ipif_t *ipif; 4927 dl_notify_ind_t *dlindp; 4928 4929 ASSERT(MUTEX_HELD(&ill->ill_lock)); 4930 4931 if ((ill->ill_state_flags & ILL_CONDEMNED) && ill_is_freeable(ill)) { 4932 /* ip_modclose() may be waiting */ 4933 cv_broadcast(&ill->ill_cv); 4934 } 4935 4936 ipsq = ill->ill_phyint->phyint_ipsq; 4937 mutex_enter(&ipsq->ipsq_lock); 4938 ipx = ipsq->ipsq_xop; 4939 mutex_enter(&ipx->ipx_lock); 4940 if (ipx->ipx_waitfor == 0) /* no one's waiting; bail */ 4941 goto unlock; 4942 4943 ASSERT(ipx->ipx_pending_mp != NULL && ipx->ipx_pending_ipif != NULL); 4944 4945 ipif = ipx->ipx_pending_ipif; 4946 if (ipif->ipif_ill != ill) /* wait is for another ill; bail */ 4947 goto unlock; 4948 4949 switch (ipx->ipx_waitfor) { 4950 case IPIF_DOWN: 4951 if (!ipif_is_quiescent(ipif)) 4952 goto unlock; 4953 break; 4954 case IPIF_FREE: 4955 if (!ipif_is_freeable(ipif)) 4956 goto unlock; 4957 break; 4958 case ILL_DOWN: 4959 if (!ill_is_quiescent(ill)) 4960 goto unlock; 4961 break; 4962 case ILL_FREE: 4963 /* 4964 * ILL_FREE is only for loopback; normal ill teardown waits 4965 * synchronously in ip_modclose() without using ipx_waitfor, 4966 * handled by the cv_broadcast() at the top of this function. 4967 */ 4968 if (!ill_is_freeable(ill)) 4969 goto unlock; 4970 break; 4971 default: 4972 cmn_err(CE_PANIC, "ipsq: %p unknown ipx_waitfor %d\n", 4973 (void *)ipsq, ipx->ipx_waitfor); 4974 } 4975 4976 ill_refhold_locked(ill); /* for qwriter_ip() call below */ 4977 mutex_exit(&ipx->ipx_lock); 4978 mp = ipsq_pending_mp_get(ipsq, &connp); 4979 mutex_exit(&ipsq->ipsq_lock); 4980 mutex_exit(&ill->ill_lock); 4981 4982 ASSERT(mp != NULL); 4983 /* 4984 * NOTE: all of the qwriter_ip() calls below use CUR_OP since 4985 * we can only get here when the current operation decides it 4986 * needs to quiesce via ipsq_pending_mp_add(). 4987 */ 4988 switch (mp->b_datap->db_type) { 4989 case M_PCPROTO: 4990 case M_PROTO: 4991 /* 4992 * For now, only DL_NOTIFY_IND messages can use this facility. 4993 */ 4994 dlindp = (dl_notify_ind_t *)mp->b_rptr; 4995 ASSERT(dlindp->dl_primitive == DL_NOTIFY_IND); 4996 4997 switch (dlindp->dl_notification) { 4998 case DL_NOTE_PHYS_ADDR: 4999 qwriter_ip(ill, ill->ill_rq, mp, 5000 ill_set_phys_addr_tail, CUR_OP, B_TRUE); 5001 return; 5002 case DL_NOTE_REPLUMB: 5003 qwriter_ip(ill, ill->ill_rq, mp, 5004 ill_replumb_tail, CUR_OP, B_TRUE); 5005 return; 5006 default: 5007 ASSERT(0); 5008 ill_refrele(ill); 5009 } 5010 break; 5011 5012 case M_ERROR: 5013 case M_HANGUP: 5014 qwriter_ip(ill, ill->ill_rq, mp, ipif_all_down_tail, CUR_OP, 5015 B_TRUE); 5016 return; 5017 5018 case M_IOCTL: 5019 case M_IOCDATA: 5020 qwriter_ip(ill, (connp != NULL ?
CONNP_TO_WQ(connp) : 5021 ill->ill_wq), mp, ip_reprocess_ioctl, CUR_OP, B_TRUE); 5022 return; 5023 5024 default: 5025 cmn_err(CE_PANIC, "ipif_ill_refrele_tail mp %p " 5026 "db_type %d\n", (void *)mp, mp->b_datap->db_type); 5027 } 5028 return; 5029 unlock: 5030 mutex_exit(&ipsq->ipsq_lock); 5031 mutex_exit(&ipx->ipx_lock); 5032 mutex_exit(&ill->ill_lock); 5033 } 5034 5035 #ifdef DEBUG 5036 /* Reuse trace buffer from beginning (if reached the end) and record trace */ 5037 static void 5038 th_trace_rrecord(th_trace_t *th_trace) 5039 { 5040 tr_buf_t *tr_buf; 5041 uint_t lastref; 5042 5043 lastref = th_trace->th_trace_lastref; 5044 lastref++; 5045 if (lastref == TR_BUF_MAX) 5046 lastref = 0; 5047 th_trace->th_trace_lastref = lastref; 5048 tr_buf = &th_trace->th_trbuf[lastref]; 5049 tr_buf->tr_time = ddi_get_lbolt(); 5050 tr_buf->tr_depth = getpcstack(tr_buf->tr_stack, TR_STACK_DEPTH); 5051 } 5052 5053 static void 5054 th_trace_free(void *value) 5055 { 5056 th_trace_t *th_trace = value; 5057 5058 ASSERT(th_trace->th_refcnt == 0); 5059 kmem_free(th_trace, sizeof (*th_trace)); 5060 } 5061 5062 /* 5063 * Find or create the per-thread hash table used to track object references. 5064 * The ipst argument is NULL if we shouldn't allocate. 5065 * 5066 * Accesses per-thread data, so there's no need to lock here. 5067 */ 5068 static mod_hash_t * 5069 th_trace_gethash(ip_stack_t *ipst) 5070 { 5071 th_hash_t *thh; 5072 5073 if ((thh = tsd_get(ip_thread_data)) == NULL && ipst != NULL) { 5074 mod_hash_t *mh; 5075 char name[256]; 5076 size_t objsize, rshift; 5077 int retv; 5078 5079 if ((thh = kmem_alloc(sizeof (*thh), KM_NOSLEEP)) == NULL) 5080 return (NULL); 5081 (void) snprintf(name, sizeof (name), "th_trace_%p", 5082 (void *)curthread); 5083 5084 /* 5085 * We use mod_hash_create_extended here rather than the more 5086 * obvious mod_hash_create_ptrhash because the latter has a 5087 * hard-coded KM_SLEEP, and we'd prefer to fail rather than 5088 * block. 5089 */ 5090 objsize = MAX(MAX(sizeof (ill_t), sizeof (ipif_t)), 5091 MAX(sizeof (ire_t), sizeof (ncec_t))); 5092 rshift = highbit(objsize); 5093 mh = mod_hash_create_extended(name, 64, mod_hash_null_keydtor, 5094 th_trace_free, mod_hash_byptr, (void *)rshift, 5095 mod_hash_ptrkey_cmp, KM_NOSLEEP); 5096 if (mh == NULL) { 5097 kmem_free(thh, sizeof (*thh)); 5098 return (NULL); 5099 } 5100 thh->thh_hash = mh; 5101 thh->thh_ipst = ipst; 5102 /* 5103 * We trace ills, ipifs, ires, and nces. All of these are 5104 * per-IP-stack, so the lock on the thread list is as well. 5105 */ 5106 rw_enter(&ip_thread_rwlock, RW_WRITER); 5107 list_insert_tail(&ip_thread_list, thh); 5108 rw_exit(&ip_thread_rwlock); 5109 retv = tsd_set(ip_thread_data, thh); 5110 ASSERT(retv == 0); 5111 } 5112 return (thh != NULL ? thh->thh_hash : NULL); 5113 } 5114 5115 boolean_t 5116 th_trace_ref(const void *obj, ip_stack_t *ipst) 5117 { 5118 th_trace_t *th_trace; 5119 mod_hash_t *mh; 5120 mod_hash_val_t val; 5121 5122 if ((mh = th_trace_gethash(ipst)) == NULL) 5123 return (B_FALSE); 5124 5125 /* 5126 * Attempt to locate the trace buffer for this obj and thread. 5127 * If it does not exist, then allocate a new trace buffer and 5128 * insert into the hash. 
*/ 5130 if (mod_hash_find(mh, (mod_hash_key_t)obj, &val) == MH_ERR_NOTFOUND) { 5131 th_trace = kmem_zalloc(sizeof (th_trace_t), KM_NOSLEEP); 5132 if (th_trace == NULL) 5133 return (B_FALSE); 5134 5135 th_trace->th_id = curthread; 5136 if (mod_hash_insert(mh, (mod_hash_key_t)obj, 5137 (mod_hash_val_t)th_trace) != 0) { 5138 kmem_free(th_trace, sizeof (th_trace_t)); 5139 return (B_FALSE); 5140 } 5141 } else { 5142 th_trace = (th_trace_t *)val; 5143 } 5144 5145 ASSERT(th_trace->th_refcnt >= 0 && 5146 th_trace->th_refcnt < TR_BUF_MAX - 1); 5147 5148 th_trace->th_refcnt++; 5149 th_trace_rrecord(th_trace); 5150 return (B_TRUE); 5151 } 5152 5153 /* 5154 * For the purpose of tracing a reference release, we assume that global 5155 * tracing is always on and that the same thread that initiated the reference 5156 * hold is the one releasing it. 5157 */ 5158 void 5159 th_trace_unref(const void *obj) 5160 { 5161 int retv; 5162 mod_hash_t *mh; 5163 th_trace_t *th_trace; 5164 mod_hash_val_t val; 5165 5166 mh = th_trace_gethash(NULL); 5167 retv = mod_hash_find(mh, (mod_hash_key_t)obj, &val); 5168 ASSERT(retv == 0); 5169 th_trace = (th_trace_t *)val; 5170 5171 ASSERT(th_trace->th_refcnt > 0); 5172 th_trace->th_refcnt--; 5173 th_trace_rrecord(th_trace); 5174 } 5175 5176 /* 5177 * If tracing has been disabled, then we assume that the reference counts are 5178 * now useless, and we clear them out before destroying the entries. 5179 */ 5180 void 5181 th_trace_cleanup(const void *obj, boolean_t trace_disable) 5182 { 5183 th_hash_t *thh; 5184 mod_hash_t *mh; 5185 mod_hash_val_t val; 5186 th_trace_t *th_trace; 5187 int retv; 5188 5189 rw_enter(&ip_thread_rwlock, RW_READER); 5190 for (thh = list_head(&ip_thread_list); thh != NULL; 5191 thh = list_next(&ip_thread_list, thh)) { 5192 if (mod_hash_find(mh = thh->thh_hash, (mod_hash_key_t)obj, 5193 &val) == 0) { 5194 th_trace = (th_trace_t *)val; 5195 if (trace_disable) 5196 th_trace->th_refcnt = 0; 5197 retv = mod_hash_destroy(mh, (mod_hash_key_t)obj); 5198 ASSERT(retv == 0); 5199 } 5200 } 5201 rw_exit(&ip_thread_rwlock); 5202 } 5203 5204 void 5205 ipif_trace_ref(ipif_t *ipif) 5206 { 5207 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 5208 5209 if (ipif->ipif_trace_disable) 5210 return; 5211 5212 if (!th_trace_ref(ipif, ipif->ipif_ill->ill_ipst)) { 5213 ipif->ipif_trace_disable = B_TRUE; 5214 ipif_trace_cleanup(ipif); 5215 } 5216 } 5217 5218 void 5219 ipif_untrace_ref(ipif_t *ipif) 5220 { 5221 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 5222 5223 if (!ipif->ipif_trace_disable) 5224 th_trace_unref(ipif); 5225 } 5226 5227 void 5228 ill_trace_ref(ill_t *ill) 5229 { 5230 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5231 5232 if (ill->ill_trace_disable) 5233 return; 5234 5235 if (!th_trace_ref(ill, ill->ill_ipst)) { 5236 ill->ill_trace_disable = B_TRUE; 5237 ill_trace_cleanup(ill); 5238 } 5239 } 5240 5241 void 5242 ill_untrace_ref(ill_t *ill) 5243 { 5244 ASSERT(MUTEX_HELD(&ill->ill_lock)); 5245 5246 if (!ill->ill_trace_disable) 5247 th_trace_unref(ill); 5248 } 5249 5250 /* 5251 * Called when ipif is unplumbed or when memory alloc fails. Note that on 5252 * failure, ipif_trace_disable is set. 5253 */ 5254 static void 5255 ipif_trace_cleanup(const ipif_t *ipif) 5256 { 5257 th_trace_cleanup(ipif, ipif->ipif_trace_disable); 5258 } 5259 5260 /* 5261 * Called when ill is unplumbed or when memory alloc fails. Note that on 5262 * failure, ill_trace_disable is set.
*/ 5264 static void 5265 ill_trace_cleanup(const ill_t *ill) 5266 { 5267 th_trace_cleanup(ill, ill->ill_trace_disable); 5268 } 5269 #endif /* DEBUG */ 5270 5271 void 5272 ipif_refhold_locked(ipif_t *ipif) 5273 { 5274 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 5275 ipif->ipif_refcnt++; 5276 IPIF_TRACE_REF(ipif); 5277 } 5278 5279 void 5280 ipif_refhold(ipif_t *ipif) 5281 { 5282 ill_t *ill; 5283 5284 ill = ipif->ipif_ill; 5285 mutex_enter(&ill->ill_lock); 5286 ipif->ipif_refcnt++; 5287 IPIF_TRACE_REF(ipif); 5288 mutex_exit(&ill->ill_lock); 5289 } 5290 5291 /* 5292 * Must not be called while holding any locks. Otherwise if this is 5293 * the last reference to be released there is a chance of recursive mutex 5294 * panic due to ipif_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying 5295 * to restart an ioctl. 5296 */ 5297 void 5298 ipif_refrele(ipif_t *ipif) 5299 { 5300 ill_t *ill; 5301 5302 ill = ipif->ipif_ill; 5303 5304 mutex_enter(&ill->ill_lock); 5305 ASSERT(ipif->ipif_refcnt != 0); 5306 ipif->ipif_refcnt--; 5307 IPIF_UNTRACE_REF(ipif); 5308 if (ipif->ipif_refcnt != 0) { 5309 mutex_exit(&ill->ill_lock); 5310 return; 5311 } 5312 5313 /* Drops the ill_lock */ 5314 ipif_ill_refrele_tail(ill); 5315 } 5316 5317 ipif_t * 5318 ipif_get_next_ipif(ipif_t *curr, ill_t *ill) 5319 { 5320 ipif_t *ipif; 5321 5322 mutex_enter(&ill->ill_lock); 5323 for (ipif = (curr == NULL ? ill->ill_ipif : curr->ipif_next); 5324 ipif != NULL; ipif = ipif->ipif_next) { 5325 if (IPIF_IS_CONDEMNED(ipif)) 5326 continue; 5327 ipif_refhold_locked(ipif); 5328 mutex_exit(&ill->ill_lock); 5329 return (ipif); 5330 } 5331 mutex_exit(&ill->ill_lock); 5332 return (NULL); 5333 } 5334 5335 /* 5336 * TODO: make this table extendible at run time 5337 * Return a pointer to the mac type info for 'mac_type' 5338 */ 5339 static ip_m_t * 5340 ip_m_lookup(t_uscalar_t mac_type) 5341 { 5342 ip_m_t *ipm; 5343 5344 for (ipm = ip_m_tbl; ipm < A_END(ip_m_tbl); ipm++) 5345 if (ipm->ip_m_mac_type == mac_type) 5346 return (ipm); 5347 return (NULL); 5348 } 5349 5350 /* 5351 * Make a link layer address from the multicast IP address *addr. 5352 * To form the link layer address, invoke the ip_m_v*mapping function 5353 * associated with the link-layer type. 5354 */ 5355 void 5356 ip_mcast_mapping(ill_t *ill, uchar_t *addr, uchar_t *hwaddr) 5357 { 5358 ip_m_t *ipm; 5359 5360 if (ill->ill_net_type == IRE_IF_NORESOLVER) 5361 return; 5362 5363 ASSERT(addr != NULL); 5364 5365 ipm = ip_m_lookup(ill->ill_mactype); 5366 if (ipm == NULL || 5367 (ill->ill_isv6 && ipm->ip_m_v6mapping == NULL) || 5368 (!ill->ill_isv6 && ipm->ip_m_v4mapping == NULL)) { 5369 ip0dbg(("no mapping for ill %s mactype 0x%x\n", 5370 ill->ill_name, ill->ill_mactype)); 5371 return; 5372 } 5373 if (ill->ill_isv6) 5374 (*ipm->ip_m_v6mapping)(ill, addr, hwaddr); 5375 else 5376 (*ipm->ip_m_v4mapping)(ill, addr, hwaddr); 5377 } 5378 5379 /* 5380 * Returns B_FALSE if the IPv4 netmask pointed to by `mask' is non-contiguous. 5381 * Otherwise returns B_TRUE. 5382 * 5383 * The netmask can be verified to be contiguous with 32 shift-and-OR 5384 * operations. Take the contiguous mask (in host byte order) and compute 5385 * mask | mask << 1 | mask << 2 | ... | mask << 31 5386 * the result will be the same as the 'mask' for a contiguous mask.
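* For example (a quick check of the identity above): the contiguous mask 0xffffff00 (255.255.255.0) ORs back to 0xffffff00 and passes, while the non-contiguous 0xff00ff00 (255.0.255.0) ORs up to 0xffffff00 != 0xff00ff00 and is rejected.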
*/ 5388 static boolean_t 5389 ip_contiguous_mask(uint32_t mask) 5390 { 5391 uint32_t m = mask; 5392 int i; 5393 5394 for (i = 1; i < 32; i++) 5395 m |= (mask << i); 5396 5397 return (m == mask); 5398 } 5399 5400 /* 5401 * ip_rt_add is called to add an IPv4 route to the forwarding table. 5402 * ill is passed in to associate it with the correct interface. 5403 * If ire_arg is set, then we return the held IRE in that location. 5404 */ 5405 int 5406 ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, 5407 ipaddr_t src_addr, int flags, ill_t *ill, ire_t **ire_arg, 5408 boolean_t ioctl_msg, struct rtsa_s *sp, ip_stack_t *ipst, zoneid_t zoneid) 5409 { 5410 ire_t *ire, *nire; 5411 ire_t *gw_ire = NULL; 5412 ipif_t *ipif = NULL; 5413 uint_t type; 5414 int match_flags = MATCH_IRE_TYPE; 5415 tsol_gc_t *gc = NULL; 5416 tsol_gcgrp_t *gcgrp = NULL; 5417 boolean_t gcgrp_xtraref = B_FALSE; 5418 boolean_t cgtp_broadcast; 5419 boolean_t unbound = B_FALSE; 5420 5421 ip1dbg(("ip_rt_add:")); 5422 5423 if (ire_arg != NULL) 5424 *ire_arg = NULL; 5425 5426 /* disallow non-contiguous netmasks */ 5427 if (!ip_contiguous_mask(ntohl(mask))) 5428 return (ENOTSUP); 5429 5430 /* 5431 * If this is the case of RTF_HOST being set, then we set the netmask 5432 * to all ones (regardless of whether one was supplied). 5433 */ 5434 if (flags & RTF_HOST) 5435 mask = IP_HOST_MASK; 5436 5437 /* 5438 * Prevent routes with a zero gateway from being created (since 5439 * interfaces can currently be plumbed and brought up with no assigned 5440 * address). 5441 */ 5442 if (gw_addr == 0) 5443 return (ENETUNREACH); 5444 /* 5445 * Get the ipif, if any, corresponding to the gw_addr. 5446 * If -ifp was specified we restrict ourselves to the ill, otherwise 5447 * we match on the gateway and destination to handle unnumbered pt-pt 5448 * interfaces. 5449 */ 5450 if (ill != NULL) 5451 ipif = ipif_lookup_addr(gw_addr, ill, ALL_ZONES, ipst); 5452 else 5453 ipif = ipif_lookup_interface(gw_addr, dst_addr, ipst); 5454 if (ipif != NULL) { 5455 if (IS_VNI(ipif->ipif_ill)) { 5456 ipif_refrele(ipif); 5457 return (EINVAL); 5458 } 5459 } 5460 5461 /* 5462 * GateD will attempt to create routes with a loopback interface 5463 * address as the gateway and with RTF_GATEWAY set. We allow 5464 * these routes to be added, but create them as interface routes 5465 * since the gateway is an interface address. 5466 */ 5467 if ((ipif != NULL) && (ipif->ipif_ire_type == IRE_LOOPBACK)) { 5468 flags &= ~RTF_GATEWAY; 5469 if (gw_addr == INADDR_LOOPBACK && dst_addr == INADDR_LOOPBACK && 5470 mask == IP_HOST_MASK) { 5471 ire = ire_ftable_lookup_v4(dst_addr, 0, 0, IRE_LOOPBACK, 5472 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst, 5473 NULL); 5474 if (ire != NULL) { 5475 ire_refrele(ire); 5476 ipif_refrele(ipif); 5477 return (EEXIST); 5478 } 5479 ip1dbg(("ip_rt_add: 0x%p creating IRE 0x%x" 5480 "for 0x%x\n", (void *)ipif, 5481 ipif->ipif_ire_type, 5482 ntohl(ipif->ipif_lcl_addr))); 5483 ire = ire_create( 5484 (uchar_t *)&dst_addr, /* dest address */ 5485 (uchar_t *)&mask, /* mask */ 5486 NULL, /* no gateway */ 5487 ipif->ipif_ire_type, /* LOOPBACK */ 5488 ipif->ipif_ill, 5489 zoneid, 5490 (ipif->ipif_flags & IPIF_PRIVATE) ? RTF_PRIVATE : 0, 5491 NULL, 5492 ipst); 5493 5494 if (ire == NULL) { 5495 ipif_refrele(ipif); 5496 return (ENOMEM); 5497 } 5498 /* src address assigned by the caller?
*/ 5499 if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC)) 5500 ire->ire_setsrc_addr = src_addr; 5501 5502 nire = ire_add(ire); 5503 if (nire == NULL) { 5504 /* 5505 * In the event of failure, ire_add() will have 5506 * already deleted the ire in question, so there 5507 * is no need to do that here. 5508 */ 5509 ipif_refrele(ipif); 5510 return (ENOMEM); 5511 } 5512 /* 5513 * Check if it was a duplicate entry. This handles 5514 * the case of two racing route adds for the same route 5515 */ 5516 if (nire != ire) { 5517 ASSERT(nire->ire_identical_ref > 1); 5518 ire_delete(nire); 5519 ire_refrele(nire); 5520 ipif_refrele(ipif); 5521 return (EEXIST); 5522 } 5523 ire = nire; 5524 goto save_ire; 5525 } 5526 } 5527 5528 /* 5529 * The routes for multicast with CGTP are quite special in that 5530 * the gateway is the local interface address, yet RTF_GATEWAY 5531 * is set. We turn off RTF_GATEWAY to provide compatibility with 5532 * this undocumented and unusual use of multicast routes. 5533 */ 5534 if ((flags & RTF_MULTIRT) && ipif != NULL) 5535 flags &= ~RTF_GATEWAY; 5536 5537 /* 5538 * Traditionally, interface routes are ones where RTF_GATEWAY isn't set 5539 * and the gateway address provided is one of the system's interface 5540 * addresses. By using the routing socket interface and supplying an 5541 * RTA_IFP sockaddr with an interface index, an alternate method of 5542 * specifying an interface route to be created is available which uses 5543 * the interface index that specifies the outgoing interface rather than 5544 * the address of an outgoing interface (which may not be able to 5545 * uniquely identify an interface). When coupled with the RTF_GATEWAY 5546 * flag, routes can be specified which not only specify the next-hop to 5547 * be used when routing to a certain prefix, but also which outgoing 5548 * interface should be used. 5549 * 5550 * Previously, interfaces would have unique addresses assigned to them 5551 * and so the address assigned to a particular interface could be used 5552 * to identify a particular interface. One exception to this was the 5553 * case of an unnumbered interface (where IPIF_UNNUMBERED was set). 5554 * 5555 * With the advent of IPv6 and its link-local addresses, this 5556 * restriction was relaxed and interfaces could share addresses between 5557 * themselves. In fact, typically all of the link-local interfaces on 5558 * an IPv6 node or router will have the same link-local address. In 5559 * order to differentiate between these interfaces, the use of an 5560 * interface index is necessary and this index can be carried inside a 5561 * RTA_IFP sockaddr (which is actually a sockaddr_dl). One restriction 5562 * of using the interface index, however, is that all of the ipif's that 5563 * are part of an ill have the same index and so the RTA_IFP sockaddr 5564 * cannot be used to differentiate between ipif's (or logical 5565 * interfaces) that belong to the same ill (physical interface). 5566 * 5567 * For example, in the following case involving IPv4 interfaces and 5568 * logical interfaces 5569 * 5570 * 192.0.2.32 255.255.255.224 192.0.2.33 U if0 5571 * 192.0.2.32 255.255.255.224 192.0.2.34 U if0 5572 * 192.0.2.32 255.255.255.224 192.0.2.35 U if0 5573 * 5574 * the ipif's corresponding to each of these interface routes can be 5575 * uniquely identified by the "gateway" (actually interface address).
5576 * 5577 * In this case involving multiple IPv6 default routes to a particular 5578 * link-local gateway, the use of RTA_IFP is necessary to specify which 5579 * default route is of interest: 5580 * 5581 * default fe80::123:4567:89ab:cdef U if0 5582 * default fe80::123:4567:89ab:cdef U if1 5583 */ 5584 5585 /* RTF_GATEWAY not set */ 5586 if (!(flags & RTF_GATEWAY)) { 5587 if (sp != NULL) { 5588 ip2dbg(("ip_rt_add: gateway security attributes " 5589 "cannot be set with interface route\n")); 5590 if (ipif != NULL) 5591 ipif_refrele(ipif); 5592 return (EINVAL); 5593 } 5594 5595 /* 5596 * Whether or not ill (RTA_IFP) is set, we require that 5597 * the gateway is one of our local addresses. 5598 */ 5599 if (ipif == NULL) 5600 return (ENETUNREACH); 5601 5602 /* 5603 * We use MATCH_IRE_ILL here. If the caller specified an 5604 * interface (from the RTA_IFP sockaddr) we use it, otherwise 5605 * we use the ill derived from the gateway address. 5606 * We can always match the gateway address since we record it 5607 * in ire_gateway_addr. 5608 * We don't allow RTA_IFP to specify a different ill than the 5609 * one matching the ipif to make sure we can delete the route. 5610 */ 5611 match_flags |= MATCH_IRE_GW | MATCH_IRE_ILL; 5612 if (ill == NULL) { 5613 ill = ipif->ipif_ill; 5614 } else if (ill != ipif->ipif_ill) { 5615 ipif_refrele(ipif); 5616 return (EINVAL); 5617 } 5618 5619 /* 5620 * We check for an existing entry at this point. 5621 * 5622 * Since a netmask isn't passed in via the ioctl interface 5623 * (SIOCADDRT), we don't check for a matching netmask in that 5624 * case. 5625 */ 5626 if (!ioctl_msg) 5627 match_flags |= MATCH_IRE_MASK; 5628 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, 5629 IRE_INTERFACE, ill, ALL_ZONES, NULL, match_flags, 0, ipst, 5630 NULL); 5631 if (ire != NULL) { 5632 ire_refrele(ire); 5633 ipif_refrele(ipif); 5634 return (EEXIST); 5635 } 5636 5637 /* 5638 * Some software (for example, GateD and Sun Cluster) attempts 5639 * to create (what amount to) IRE_PREFIX routes with the 5640 * loopback address as the gateway. This is primarily done to 5641 * set up prefixes with the RTF_REJECT flag set (for example, 5642 * when generating aggregate routes.) 5643 * 5644 * If the IRE type (as defined by ill->ill_net_type) would be 5645 * IRE_LOOPBACK, then we map the request into an 5646 * IRE_IF_NORESOLVER. We also OR in the RTF_BLACKHOLE flag as 5647 * these interface routes, by definition, can only be that. 5648 * 5649 * Needless to say, the real IRE_LOOPBACK is NOT created by this 5650 * routine, but rather using ire_create() directly. 5651 * 5652 */ 5653 type = ill->ill_net_type; 5654 if (type == IRE_LOOPBACK) { 5655 type = IRE_IF_NORESOLVER; 5656 flags |= RTF_BLACKHOLE; 5657 } 5658 5659 /* 5660 * Create a copy of the IRE_IF_NORESOLVER or 5661 * IRE_IF_RESOLVER with the modified address, netmask, and 5662 * gateway. 5663 */ 5664 ire = ire_create( 5665 (uchar_t *)&dst_addr, 5666 (uint8_t *)&mask, 5667 (uint8_t *)&gw_addr, 5668 type, 5669 ill, 5670 zoneid, 5671 flags, 5672 NULL, 5673 ipst); 5674 if (ire == NULL) { 5675 ipif_refrele(ipif); 5676 return (ENOMEM); 5677 } 5678 5679 /* src address assigned by the caller? */ 5680 if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC)) 5681 ire->ire_setsrc_addr = src_addr; 5682 5683 nire = ire_add(ire); 5684 if (nire == NULL) { 5685 /* 5686 * In the event of failure, ire_add() will have 5687 * already deleted the ire in question, so there 5688 * is no need to do that here.
5689 */ 5690 ipif_refrele(ipif); 5691 return (ENOMEM); 5692 } 5693 /* 5694 * Check if it was a duplicate entry. This handles 5695 * the case of two racing route adds for the same route 5696 */ 5697 if (nire != ire) { 5698 ire_delete(nire); 5699 ire_refrele(nire); 5700 ipif_refrele(ipif); 5701 return (EEXIST); 5702 } 5703 ire = nire; 5704 goto save_ire; 5705 } 5706 5707 /* 5708 * Get an interface IRE for the specified gateway. 5709 * If we don't have an IRE_IF_NORESOLVER or IRE_IF_RESOLVER for the 5710 * gateway, it is currently unreachable and we fail the request 5711 * accordingly. We reject any RTF_GATEWAY routes where the gateway 5712 * is an IRE_LOCAL or IRE_LOOPBACK. 5713 * If RTA_IFP was specified we look on that particular ill. 5714 */ 5715 if (ill != NULL) 5716 match_flags |= MATCH_IRE_ILL; 5717 5718 /* Check whether the gateway is reachable. */ 5719 again: 5720 type = IRE_INTERFACE | IRE_LOCAL | IRE_LOOPBACK; 5721 if (flags & RTF_INDIRECT) 5722 type |= IRE_OFFLINK; 5723 5724 gw_ire = ire_ftable_lookup_v4(gw_addr, 0, 0, type, ill, 5725 ALL_ZONES, NULL, match_flags, 0, ipst, NULL); 5726 if (gw_ire == NULL) { 5727 /* 5728 * With IPMP, we allow host routes to influence in.mpathd's 5729 * target selection. However, if the test addresses are on 5730 * their own network, the above lookup will fail since the 5731 * underlying IRE_INTERFACEs are marked hidden. So allow 5732 * hidden test IREs to be found and try again. 5733 */ 5734 if (!(match_flags & MATCH_IRE_TESTHIDDEN)) { 5735 match_flags |= MATCH_IRE_TESTHIDDEN; 5736 goto again; 5737 } 5738 if (ipif != NULL) 5739 ipif_refrele(ipif); 5740 return (ENETUNREACH); 5741 } 5742 if (gw_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) { 5743 ire_refrele(gw_ire); 5744 if (ipif != NULL) 5745 ipif_refrele(ipif); 5746 return (ENETUNREACH); 5747 } 5748 5749 if (ill == NULL && !(flags & RTF_INDIRECT)) { 5750 unbound = B_TRUE; 5751 if (ipst->ips_ip_strict_src_multihoming > 0) 5752 ill = gw_ire->ire_ill; 5753 } 5754 5755 /* 5756 * We create one of three types of IREs as a result of this request 5757 * based on the netmask. A netmask of all ones (which is automatically 5758 * assumed when RTF_HOST is set) results in an IRE_HOST being created. 5759 * An all zeroes netmask implies a default route so an IRE_DEFAULT is 5760 * created. Otherwise, an IRE_PREFIX route is created for the 5761 * destination prefix. 5762 */ 5763 if (mask == IP_HOST_MASK) 5764 type = IRE_HOST; 5765 else if (mask == 0) 5766 type = IRE_DEFAULT; 5767 else 5768 type = IRE_PREFIX; 5769 5770 /* check for a duplicate entry */ 5771 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, type, ill, 5772 ALL_ZONES, NULL, match_flags | MATCH_IRE_MASK | MATCH_IRE_GW, 5773 0, ipst, NULL); 5774 if (ire != NULL) { 5775 if (ipif != NULL) 5776 ipif_refrele(ipif); 5777 ire_refrele(gw_ire); 5778 ire_refrele(ire); 5779 return (EEXIST); 5780 } 5781 5782 /* Security attribute exists */ 5783 if (sp != NULL) { 5784 tsol_gcgrp_addr_t ga; 5785 5786 /* find or create the gateway credentials group */ 5787 ga.ga_af = AF_INET; 5788 IN6_IPADDR_TO_V4MAPPED(gw_addr, &ga.ga_addr); 5789 5790 /* we hold reference to it upon success */ 5791 gcgrp = gcgrp_lookup(&ga, B_TRUE); 5792 if (gcgrp == NULL) { 5793 if (ipif != NULL) 5794 ipif_refrele(ipif); 5795 ire_refrele(gw_ire); 5796 return (ENOMEM); 5797 } 5798 5799 /* 5800 * Create and add the security attribute to the group; a 5801 * reference to the group is made upon allocating a new 5802 * entry successfully. 
If it finds an already-existing
5803		 * entry for the security attribute in the group, it simply
5804		 * returns it and no new reference is made to the group.
5805		 */
5806		gc = gc_create(sp, gcgrp, &gcgrp_xtraref);
5807		if (gc == NULL) {
5808			if (ipif != NULL)
5809				ipif_refrele(ipif);
5810			/* release reference held by gcgrp_lookup */
5811			GCGRP_REFRELE(gcgrp);
5812			ire_refrele(gw_ire);
5813			return (ENOMEM);
5814		}
5815	}
5816
5817	/* Create the IRE. */
5818	ire = ire_create(
5819	    (uchar_t *)&dst_addr,		/* dest address */
5820	    (uchar_t *)&mask,			/* mask */
5821	    (uchar_t *)&gw_addr,		/* gateway address */
5822	    (ushort_t)type,			/* IRE type */
5823	    ill,
5824	    zoneid,
5825	    flags,
5826	    gc,					/* security attribute */
5827	    ipst);
5828
5829	/*
5830	 * The ire holds a reference to the 'gc' and the 'gc' holds a
5831	 * reference to the 'gcgrp'. We can now release the extra reference
5832	 * the 'gcgrp' acquired in the gcgrp_lookup, if it was not used.
5833	 */
5834	if (gcgrp_xtraref)
5835		GCGRP_REFRELE(gcgrp);
5836	if (ire == NULL) {
5837		if (gc != NULL)
5838			GC_REFRELE(gc);
5839		if (ipif != NULL)
5840			ipif_refrele(ipif);
5841		ire_refrele(gw_ire);
5842		return (ENOMEM);
5843	}
5844
5845	/* Before we add, check if an extra CGTP broadcast is needed */
5846	cgtp_broadcast = ((flags & RTF_MULTIRT) &&
5847	    ip_type_v4(ire->ire_addr, ipst) == IRE_BROADCAST);
5848
5849	/* src address assigned by the caller? */
5850	if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC))
5851		ire->ire_setsrc_addr = src_addr;
5852
5853	ire->ire_unbound = unbound;
5854
5855	/*
5856	 * POLICY: should we allow an RTF_HOST with address INADDR_ANY?
5857	 * SunOS socket code does, but do we really want to allow 0.0.0.0?
5858	 */
5859
5860	/* Add the new IRE. */
5861	nire = ire_add(ire);
5862	if (nire == NULL) {
5863		/*
5864		 * In the event of failure, ire_add() will have
5865		 * already deleted the ire in question, so there
5866		 * is no need to do that here.
5867		 */
5868		if (ipif != NULL)
5869			ipif_refrele(ipif);
5870		ire_refrele(gw_ire);
5871		return (ENOMEM);
5872	}
5873	/*
5874	 * Check if it was a duplicate entry. This handles
5875	 * the case of two racing route adds for the same route.
5876	 */
5877	if (nire != ire) {
5878		ire_delete(nire);
5879		ire_refrele(nire);
5880		if (ipif != NULL)
5881			ipif_refrele(ipif);
5882		ire_refrele(gw_ire);
5883		return (EEXIST);
5884	}
5885	ire = nire;
5886
5887	if (flags & RTF_MULTIRT) {
5888		/*
5889		 * Invoke the CGTP (multirouting) filtering module
5890		 * to add the dst address to the filtering database.
5891		 * Replicated inbound packets coming from that address
5892		 * will be filtered to discard the duplicates.
5893		 * It is not necessary to call the CGTP filter hook
5894		 * when the dst address is a broadcast or multicast,
5895		 * because an IP source address cannot be a broadcast
5896		 * or a multicast.
5897 */ 5898 if (cgtp_broadcast) { 5899 ip_cgtp_bcast_add(ire, ipst); 5900 goto save_ire; 5901 } 5902 if (ipst->ips_ip_cgtp_filter_ops != NULL && 5903 !CLASSD(ire->ire_addr)) { 5904 int res; 5905 ipif_t *src_ipif; 5906 5907 /* Find the source address corresponding to gw_ire */ 5908 src_ipif = ipif_lookup_addr(gw_ire->ire_gateway_addr, 5909 NULL, zoneid, ipst); 5910 if (src_ipif != NULL) { 5911 res = ipst->ips_ip_cgtp_filter_ops-> 5912 cfo_add_dest_v4( 5913 ipst->ips_netstack->netstack_stackid, 5914 ire->ire_addr, 5915 ire->ire_gateway_addr, 5916 ire->ire_setsrc_addr, 5917 src_ipif->ipif_lcl_addr); 5918 ipif_refrele(src_ipif); 5919 } else { 5920 res = EADDRNOTAVAIL; 5921 } 5922 if (res != 0) { 5923 if (ipif != NULL) 5924 ipif_refrele(ipif); 5925 ire_refrele(gw_ire); 5926 ire_delete(ire); 5927 ire_refrele(ire); /* Held in ire_add */ 5928 return (res); 5929 } 5930 } 5931 } 5932 5933 save_ire: 5934 if (gw_ire != NULL) { 5935 ire_refrele(gw_ire); 5936 gw_ire = NULL; 5937 } 5938 if (ill != NULL) { 5939 /* 5940 * Save enough information so that we can recreate the IRE if 5941 * the interface goes down and then up. The metrics associated 5942 * with the route will be saved as well when rts_setmetrics() is 5943 * called after the IRE has been created. In the case where 5944 * memory cannot be allocated, none of this information will be 5945 * saved. 5946 */ 5947 ill_save_ire(ill, ire); 5948 } 5949 if (ioctl_msg) 5950 ip_rts_rtmsg(RTM_OLDADD, ire, 0, ipst); 5951 if (ire_arg != NULL) { 5952 /* 5953 * Store the ire that was successfully added into where ire_arg 5954 * points to so that callers don't have to look it up 5955 * themselves (but they are responsible for ire_refrele()ing 5956 * the ire when they are finished with it). 5957 */ 5958 *ire_arg = ire; 5959 } else { 5960 ire_refrele(ire); /* Held in ire_add */ 5961 } 5962 if (ipif != NULL) 5963 ipif_refrele(ipif); 5964 return (0); 5965 } 5966 5967 /* 5968 * ip_rt_delete is called to delete an IPv4 route. 5969 * ill is passed in to associate it with the correct interface. 5970 */ 5971 /* ARGSUSED4 */ 5972 int 5973 ip_rt_delete(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, 5974 uint_t rtm_addrs, int flags, ill_t *ill, boolean_t ioctl_msg, 5975 ip_stack_t *ipst, zoneid_t zoneid) 5976 { 5977 ire_t *ire = NULL; 5978 ipif_t *ipif; 5979 uint_t type; 5980 uint_t match_flags = MATCH_IRE_TYPE; 5981 int err = 0; 5982 5983 ip1dbg(("ip_rt_delete:")); 5984 /* 5985 * If this is the case of RTF_HOST being set, then we set the netmask 5986 * to all ones. Otherwise, we use the netmask if one was supplied. 5987 */ 5988 if (flags & RTF_HOST) { 5989 mask = IP_HOST_MASK; 5990 match_flags |= MATCH_IRE_MASK; 5991 } else if (rtm_addrs & RTA_NETMASK) { 5992 match_flags |= MATCH_IRE_MASK; 5993 } 5994 5995 /* 5996 * Note that RTF_GATEWAY is never set on a delete, therefore 5997 * we check if the gateway address is one of our interfaces first, 5998 * and fall back on RTF_GATEWAY routes. 5999 * 6000 * This makes it possible to delete an original 6001 * IRE_IF_NORESOLVER/IRE_IF_RESOLVER - consistent with SunOS 4.1. 6002 * However, we have RTF_KERNEL set on the ones created by ipif_up 6003 * and those can not be deleted here. 6004 * 6005 * We use MATCH_IRE_ILL if we know the interface. If the caller 6006 * specified an interface (from the RTA_IFP sockaddr) we use it, 6007 * otherwise we use the ill derived from the gateway address. 6008 * We can always match the gateway address since we record it 6009 * in ire_gateway_addr. 
6010 * 6011 * For more detail on specifying routes by gateway address and by 6012 * interface index, see the comments in ip_rt_add(). 6013 */ 6014 ipif = ipif_lookup_interface(gw_addr, dst_addr, ipst); 6015 if (ipif != NULL) { 6016 ill_t *ill_match; 6017 6018 if (ill != NULL) 6019 ill_match = ill; 6020 else 6021 ill_match = ipif->ipif_ill; 6022 6023 match_flags |= MATCH_IRE_ILL; 6024 if (ipif->ipif_ire_type == IRE_LOOPBACK) { 6025 ire = ire_ftable_lookup_v4(dst_addr, mask, 0, 6026 IRE_LOOPBACK, ill_match, ALL_ZONES, NULL, 6027 match_flags, 0, ipst, NULL); 6028 } 6029 if (ire == NULL) { 6030 match_flags |= MATCH_IRE_GW; 6031 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, 6032 IRE_INTERFACE, ill_match, ALL_ZONES, NULL, 6033 match_flags, 0, ipst, NULL); 6034 } 6035 /* Avoid deleting routes created by kernel from an ipif */ 6036 if (ire != NULL && (ire->ire_flags & RTF_KERNEL)) { 6037 ire_refrele(ire); 6038 ire = NULL; 6039 } 6040 6041 /* Restore in case we didn't find a match */ 6042 match_flags &= ~(MATCH_IRE_GW|MATCH_IRE_ILL); 6043 } 6044 6045 if (ire == NULL) { 6046 /* 6047 * At this point, the gateway address is not one of our own 6048 * addresses or a matching interface route was not found. We 6049 * set the IRE type to lookup based on whether 6050 * this is a host route, a default route or just a prefix. 6051 * 6052 * If an ill was passed in, then the lookup is based on an 6053 * interface index so MATCH_IRE_ILL is added to match_flags. 6054 */ 6055 match_flags |= MATCH_IRE_GW; 6056 if (ill != NULL) 6057 match_flags |= MATCH_IRE_ILL; 6058 if (mask == IP_HOST_MASK) 6059 type = IRE_HOST; 6060 else if (mask == 0) 6061 type = IRE_DEFAULT; 6062 else 6063 type = IRE_PREFIX; 6064 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, type, ill, 6065 ALL_ZONES, NULL, match_flags, 0, ipst, NULL); 6066 } 6067 6068 if (ipif != NULL) { 6069 ipif_refrele(ipif); 6070 ipif = NULL; 6071 } 6072 6073 if (ire == NULL) 6074 return (ESRCH); 6075 6076 if (ire->ire_flags & RTF_MULTIRT) { 6077 /* 6078 * Invoke the CGTP (multirouting) filtering module 6079 * to remove the dst address from the filtering database. 6080 * Packets coming from that address will no longer be 6081 * filtered to remove duplicates. 6082 */ 6083 if (ipst->ips_ip_cgtp_filter_ops != NULL) { 6084 err = ipst->ips_ip_cgtp_filter_ops->cfo_del_dest_v4( 6085 ipst->ips_netstack->netstack_stackid, 6086 ire->ire_addr, ire->ire_gateway_addr); 6087 } 6088 ip_cgtp_bcast_delete(ire, ipst); 6089 } 6090 6091 ill = ire->ire_ill; 6092 if (ill != NULL) 6093 ill_remove_saved_ire(ill, ire); 6094 if (ioctl_msg) 6095 ip_rts_rtmsg(RTM_OLDDEL, ire, 0, ipst); 6096 ire_delete(ire); 6097 ire_refrele(ire); 6098 return (err); 6099 } 6100 6101 /* 6102 * ip_siocaddrt is called to complete processing of an SIOCADDRT IOCTL. 
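 *
 * For illustration only (a hypothetical user-level sketch, not part of
 * this file): the request completed here is the classic BSD-style route
 * add issued against an IP socket, roughly:
 *
 *	struct rtentry rt;
 *	struct sockaddr_in *sin;
 *
 *	bzero(&rt, sizeof (rt));
 *	sin = (struct sockaddr_in *)&rt.rt_dst;
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = inet_addr("192.0.2.0");
 *	sin = (struct sockaddr_in *)&rt.rt_gateway;
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = inet_addr("192.0.2.1");
 *	rt.rt_flags = RTF_UP | RTF_GATEWAY;
 *	(void) ioctl(s, SIOCADDRT, &rt);	(s is an open IP socket)
 *
 * Note that struct rtentry carries no netmask, which is why the code
 * below derives one via ip_subnet_mask() unless RTF_HOST is set.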
6103 */ 6104 /* ARGSUSED */ 6105 int 6106 ip_siocaddrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 6107 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 6108 { 6109 ipaddr_t dst_addr; 6110 ipaddr_t gw_addr; 6111 ipaddr_t mask; 6112 int error = 0; 6113 mblk_t *mp1; 6114 struct rtentry *rt; 6115 ipif_t *ipif = NULL; 6116 ip_stack_t *ipst; 6117 6118 ASSERT(q->q_next == NULL); 6119 ipst = CONNQ_TO_IPST(q); 6120 6121 ip1dbg(("ip_siocaddrt:")); 6122 /* Existence of mp1 verified in ip_wput_nondata */ 6123 mp1 = mp->b_cont->b_cont; 6124 rt = (struct rtentry *)mp1->b_rptr; 6125 6126 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr; 6127 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr; 6128 6129 /* 6130 * If the RTF_HOST flag is on, this is a request to assign a gateway 6131 * to a particular host address. In this case, we set the netmask to 6132 * all ones for the particular destination address. Otherwise, 6133 * determine the netmask to be used based on dst_addr and the interfaces 6134 * in use. 6135 */ 6136 if (rt->rt_flags & RTF_HOST) { 6137 mask = IP_HOST_MASK; 6138 } else { 6139 /* 6140 * Note that ip_subnet_mask returns a zero mask in the case of 6141 * default (an all-zeroes address). 6142 */ 6143 mask = ip_subnet_mask(dst_addr, &ipif, ipst); 6144 } 6145 6146 error = ip_rt_add(dst_addr, mask, gw_addr, 0, rt->rt_flags, NULL, NULL, 6147 B_TRUE, NULL, ipst, ALL_ZONES); 6148 if (ipif != NULL) 6149 ipif_refrele(ipif); 6150 return (error); 6151 } 6152 6153 /* 6154 * ip_siocdelrt is called to complete processing of an SIOCDELRT IOCTL. 6155 */ 6156 /* ARGSUSED */ 6157 int 6158 ip_siocdelrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 6159 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 6160 { 6161 ipaddr_t dst_addr; 6162 ipaddr_t gw_addr; 6163 ipaddr_t mask; 6164 int error; 6165 mblk_t *mp1; 6166 struct rtentry *rt; 6167 ipif_t *ipif = NULL; 6168 ip_stack_t *ipst; 6169 6170 ASSERT(q->q_next == NULL); 6171 ipst = CONNQ_TO_IPST(q); 6172 6173 ip1dbg(("ip_siocdelrt:")); 6174 /* Existence of mp1 verified in ip_wput_nondata */ 6175 mp1 = mp->b_cont->b_cont; 6176 rt = (struct rtentry *)mp1->b_rptr; 6177 6178 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr; 6179 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr; 6180 6181 /* 6182 * If the RTF_HOST flag is on, this is a request to delete a gateway 6183 * to a particular host address. In this case, we set the netmask to 6184 * all ones for the particular destination address. Otherwise, 6185 * determine the netmask to be used based on dst_addr and the interfaces 6186 * in use. 6187 */ 6188 if (rt->rt_flags & RTF_HOST) { 6189 mask = IP_HOST_MASK; 6190 } else { 6191 /* 6192 * Note that ip_subnet_mask returns a zero mask in the case of 6193 * default (an all-zeroes address). 6194 */ 6195 mask = ip_subnet_mask(dst_addr, &ipif, ipst); 6196 } 6197 6198 error = ip_rt_delete(dst_addr, mask, gw_addr, 6199 RTA_DST | RTA_GATEWAY | RTA_NETMASK, rt->rt_flags, NULL, B_TRUE, 6200 ipst, ALL_ZONES); 6201 if (ipif != NULL) 6202 ipif_refrele(ipif); 6203 return (error); 6204 } 6205 6206 /* 6207 * Enqueue the mp onto the ipsq, chained by b_next. 6208 * b_prev stores the function to be executed later, and b_queue the queue 6209 * where this mp originated. 
6210 */ 6211 void 6212 ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, 6213 ill_t *pending_ill) 6214 { 6215 conn_t *connp; 6216 ipxop_t *ipx = ipsq->ipsq_xop; 6217 6218 ASSERT(MUTEX_HELD(&ipsq->ipsq_lock)); 6219 ASSERT(MUTEX_HELD(&ipx->ipx_lock)); 6220 ASSERT(func != NULL); 6221 6222 mp->b_queue = q; 6223 mp->b_prev = (void *)func; 6224 mp->b_next = NULL; 6225 6226 switch (type) { 6227 case CUR_OP: 6228 if (ipx->ipx_mptail != NULL) { 6229 ASSERT(ipx->ipx_mphead != NULL); 6230 ipx->ipx_mptail->b_next = mp; 6231 } else { 6232 ASSERT(ipx->ipx_mphead == NULL); 6233 ipx->ipx_mphead = mp; 6234 } 6235 ipx->ipx_mptail = mp; 6236 break; 6237 6238 case NEW_OP: 6239 if (ipsq->ipsq_xopq_mptail != NULL) { 6240 ASSERT(ipsq->ipsq_xopq_mphead != NULL); 6241 ipsq->ipsq_xopq_mptail->b_next = mp; 6242 } else { 6243 ASSERT(ipsq->ipsq_xopq_mphead == NULL); 6244 ipsq->ipsq_xopq_mphead = mp; 6245 } 6246 ipsq->ipsq_xopq_mptail = mp; 6247 ipx->ipx_ipsq_queued = B_TRUE; 6248 break; 6249 6250 case SWITCH_OP: 6251 ASSERT(ipsq->ipsq_swxop != NULL); 6252 /* only one switch operation is currently allowed */ 6253 ASSERT(ipsq->ipsq_switch_mp == NULL); 6254 ipsq->ipsq_switch_mp = mp; 6255 ipx->ipx_ipsq_queued = B_TRUE; 6256 break; 6257 default: 6258 cmn_err(CE_PANIC, "ipsq_enq %d type \n", type); 6259 } 6260 6261 if (CONN_Q(q) && pending_ill != NULL) { 6262 connp = Q_TO_CONN(q); 6263 ASSERT(MUTEX_HELD(&connp->conn_lock)); 6264 connp->conn_oper_pending_ill = pending_ill; 6265 } 6266 } 6267 6268 /* 6269 * Dequeue the next message that requested exclusive access to this IPSQ's 6270 * xop. Specifically: 6271 * 6272 * 1. If we're still processing the current operation on `ipsq', then 6273 * dequeue the next message for the operation (from ipx_mphead), or 6274 * return NULL if there are no queued messages for the operation. 6275 * These messages are queued via CUR_OP to qwriter_ip() and friends. 6276 * 6277 * 2. If the current operation on `ipsq' has completed (ipx_current_ipif is 6278 * not set) see if the ipsq has requested an xop switch. If so, switch 6279 * `ipsq' to a different xop. Xop switches only happen when joining or 6280 * leaving IPMP groups and require a careful dance -- see the comments 6281 * in-line below for details. If we're leaving a group xop or if we're 6282 * joining a group xop and become writer on it, then we proceed to (3). 6283 * Otherwise, we return NULL and exit the xop. 6284 * 6285 * 3. For each IPSQ in the xop, return any switch operation stored on 6286 * ipsq_switch_mp (set via SWITCH_OP); these must be processed before 6287 * any other messages queued on the IPSQ. Otherwise, dequeue the next 6288 * exclusive operation (queued via NEW_OP) stored on ipsq_xopq_mphead. 6289 * Note that if the phyint tied to `ipsq' is not using IPMP there will 6290 * only be one IPSQ in the xop. Otherwise, there will be one IPSQ for 6291 * each phyint in the group, including the IPMP meta-interface phyint. 6292 */ 6293 static mblk_t * 6294 ipsq_dq(ipsq_t *ipsq) 6295 { 6296 ill_t *illv4, *illv6; 6297 mblk_t *mp; 6298 ipsq_t *xopipsq; 6299 ipsq_t *leftipsq = NULL; 6300 ipxop_t *ipx; 6301 phyint_t *phyi = ipsq->ipsq_phyint; 6302 ip_stack_t *ipst = ipsq->ipsq_ipst; 6303 boolean_t emptied = B_FALSE; 6304 6305 /* 6306 * Grab all the locks we need in the defined order (ill_g_lock -> 6307 * ipsq_lock -> ipx_lock); ill_g_lock is needed to use ipsq_next. 6308 */ 6309 rw_enter(&ipst->ips_ill_g_lock, 6310 ipsq->ipsq_swxop != NULL ? 
RW_WRITER : RW_READER); 6311 mutex_enter(&ipsq->ipsq_lock); 6312 ipx = ipsq->ipsq_xop; 6313 mutex_enter(&ipx->ipx_lock); 6314 6315 /* 6316 * Dequeue the next message associated with the current exclusive 6317 * operation, if any. 6318 */ 6319 if ((mp = ipx->ipx_mphead) != NULL) { 6320 ipx->ipx_mphead = mp->b_next; 6321 if (ipx->ipx_mphead == NULL) 6322 ipx->ipx_mptail = NULL; 6323 mp->b_next = (void *)ipsq; 6324 goto out; 6325 } 6326 6327 if (ipx->ipx_current_ipif != NULL) 6328 goto empty; 6329 6330 if (ipsq->ipsq_swxop != NULL) { 6331 /* 6332 * The exclusive operation that is now being completed has 6333 * requested a switch to a different xop. This happens 6334 * when an interface joins or leaves an IPMP group. Joins 6335 * happen through SIOCSLIFGROUPNAME (ip_sioctl_groupname()). 6336 * Leaves happen via SIOCSLIFGROUPNAME, interface unplumb 6337 * (phyint_free()), or interface plumb for an ill type 6338 * not in the IPMP group (ip_rput_dlpi_writer()). 6339 * 6340 * Xop switches are not allowed on the IPMP meta-interface. 6341 */ 6342 ASSERT(phyi == NULL || !(phyi->phyint_flags & PHYI_IPMP)); 6343 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 6344 DTRACE_PROBE1(ipsq__switch, (ipsq_t *), ipsq); 6345 6346 if (ipsq->ipsq_swxop == &ipsq->ipsq_ownxop) { 6347 /* 6348 * We're switching back to our own xop, so we have two 6349 * xop's to drain/exit: our own, and the group xop 6350 * that we are leaving. 6351 * 6352 * First, pull ourselves out of the group ipsq list. 6353 * This is safe since we're writer on ill_g_lock. 6354 */ 6355 ASSERT(ipsq->ipsq_xop != &ipsq->ipsq_ownxop); 6356 6357 xopipsq = ipx->ipx_ipsq; 6358 while (xopipsq->ipsq_next != ipsq) 6359 xopipsq = xopipsq->ipsq_next; 6360 6361 xopipsq->ipsq_next = ipsq->ipsq_next; 6362 ipsq->ipsq_next = ipsq; 6363 ipsq->ipsq_xop = ipsq->ipsq_swxop; 6364 ipsq->ipsq_swxop = NULL; 6365 6366 /* 6367 * Second, prepare to exit the group xop. The actual 6368 * ipsq_exit() is done at the end of this function 6369 * since we cannot hold any locks across ipsq_exit(). 6370 * Note that although we drop the group's ipx_lock, no 6371 * threads can proceed since we're still ipx_writer. 6372 */ 6373 leftipsq = xopipsq; 6374 mutex_exit(&ipx->ipx_lock); 6375 6376 /* 6377 * Third, set ipx to point to our own xop (which was 6378 * inactive and therefore can be entered). 6379 */ 6380 ipx = ipsq->ipsq_xop; 6381 mutex_enter(&ipx->ipx_lock); 6382 ASSERT(ipx->ipx_writer == NULL); 6383 ASSERT(ipx->ipx_current_ipif == NULL); 6384 } else { 6385 /* 6386 * We're switching from our own xop to a group xop. 6387 * The requestor of the switch must ensure that the 6388 * group xop cannot go away (e.g. by ensuring the 6389 * phyint associated with the xop cannot go away). 6390 * 6391 * If we can become writer on our new xop, then we'll 6392 * do the drain. Otherwise, the current writer of our 6393 * new xop will do the drain when it exits. 6394 * 6395 * First, splice ourselves into the group IPSQ list. 6396 * This is safe since we're writer on ill_g_lock. 6397 */ 6398 ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop); 6399 6400 xopipsq = ipsq->ipsq_swxop->ipx_ipsq; 6401 while (xopipsq->ipsq_next != ipsq->ipsq_swxop->ipx_ipsq) 6402 xopipsq = xopipsq->ipsq_next; 6403 6404 xopipsq->ipsq_next = ipsq; 6405 ipsq->ipsq_next = ipsq->ipsq_swxop->ipx_ipsq; 6406 ipsq->ipsq_xop = ipsq->ipsq_swxop; 6407 ipsq->ipsq_swxop = NULL; 6408 6409 /* 6410 * Second, exit our own xop, since it's now unused. 6411 * This is safe since we've got the only reference. 
6412 */ 6413 ASSERT(ipx->ipx_writer == curthread); 6414 ipx->ipx_writer = NULL; 6415 VERIFY(--ipx->ipx_reentry_cnt == 0); 6416 ipx->ipx_ipsq_queued = B_FALSE; 6417 mutex_exit(&ipx->ipx_lock); 6418 6419 /* 6420 * Third, set ipx to point to our new xop, and check 6421 * if we can become writer on it. If we cannot, then 6422 * the current writer will drain the IPSQ group when 6423 * it exits. Our ipsq_xop is guaranteed to be stable 6424 * because we're still holding ipsq_lock. 6425 */ 6426 ipx = ipsq->ipsq_xop; 6427 mutex_enter(&ipx->ipx_lock); 6428 if (ipx->ipx_writer != NULL || 6429 ipx->ipx_current_ipif != NULL) { 6430 goto out; 6431 } 6432 } 6433 6434 /* 6435 * Fourth, become writer on our new ipx before we continue 6436 * with the drain. Note that we never dropped ipsq_lock 6437 * above, so no other thread could've raced with us to 6438 * become writer first. Also, we're holding ipx_lock, so 6439 * no other thread can examine the ipx right now. 6440 */ 6441 ASSERT(ipx->ipx_current_ipif == NULL); 6442 ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL); 6443 VERIFY(ipx->ipx_reentry_cnt++ == 0); 6444 ipx->ipx_writer = curthread; 6445 ipx->ipx_forced = B_FALSE; 6446 #ifdef DEBUG 6447 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); 6448 #endif 6449 } 6450 6451 xopipsq = ipsq; 6452 do { 6453 /* 6454 * So that other operations operate on a consistent and 6455 * complete phyint, a switch message on an IPSQ must be 6456 * handled prior to any other operations on that IPSQ. 6457 */ 6458 if ((mp = xopipsq->ipsq_switch_mp) != NULL) { 6459 xopipsq->ipsq_switch_mp = NULL; 6460 ASSERT(mp->b_next == NULL); 6461 mp->b_next = (void *)xopipsq; 6462 goto out; 6463 } 6464 6465 if ((mp = xopipsq->ipsq_xopq_mphead) != NULL) { 6466 xopipsq->ipsq_xopq_mphead = mp->b_next; 6467 if (xopipsq->ipsq_xopq_mphead == NULL) 6468 xopipsq->ipsq_xopq_mptail = NULL; 6469 mp->b_next = (void *)xopipsq; 6470 goto out; 6471 } 6472 } while ((xopipsq = xopipsq->ipsq_next) != ipsq); 6473 empty: 6474 /* 6475 * There are no messages. Further, we are holding ipx_lock, hence no 6476 * new messages can end up on any IPSQ in the xop. 6477 */ 6478 ipx->ipx_writer = NULL; 6479 ipx->ipx_forced = B_FALSE; 6480 VERIFY(--ipx->ipx_reentry_cnt == 0); 6481 ipx->ipx_ipsq_queued = B_FALSE; 6482 emptied = B_TRUE; 6483 #ifdef DEBUG 6484 ipx->ipx_depth = 0; 6485 #endif 6486 out: 6487 mutex_exit(&ipx->ipx_lock); 6488 mutex_exit(&ipsq->ipsq_lock); 6489 6490 /* 6491 * If we completely emptied the xop, then wake up any threads waiting 6492 * to enter any of the IPSQ's associated with it. 6493 */ 6494 if (emptied) { 6495 xopipsq = ipsq; 6496 do { 6497 if ((phyi = xopipsq->ipsq_phyint) == NULL) 6498 continue; 6499 6500 illv4 = phyi->phyint_illv4; 6501 illv6 = phyi->phyint_illv6; 6502 6503 GRAB_ILL_LOCKS(illv4, illv6); 6504 if (illv4 != NULL) 6505 cv_broadcast(&illv4->ill_cv); 6506 if (illv6 != NULL) 6507 cv_broadcast(&illv6->ill_cv); 6508 RELEASE_ILL_LOCKS(illv4, illv6); 6509 } while ((xopipsq = xopipsq->ipsq_next) != ipsq); 6510 } 6511 rw_exit(&ipst->ips_ill_g_lock); 6512 6513 /* 6514 * Now that all locks are dropped, exit the IPSQ we left. 6515 */ 6516 if (leftipsq != NULL) 6517 ipsq_exit(leftipsq); 6518 6519 return (mp); 6520 } 6521 6522 /* 6523 * Return completion status of previously initiated DLPI operations on 6524 * ills in the purview of an ipsq. 
6525	 */
6526	static boolean_t
6527	ipsq_dlpi_done(ipsq_t *ipsq)
6528	{
6529		ipsq_t	*ipsq_start;
6530		phyint_t *phyi;
6531		ill_t	*ill;
6532
6533		ASSERT(RW_LOCK_HELD(&ipsq->ipsq_ipst->ips_ill_g_lock));
6534		ipsq_start = ipsq;
6535
6536		do {
6537			/*
6538			 * The only current users of this function are ipsq_try_enter
6539			 * and ipsq_enter which have made sure that ipsq_writer is
6540			 * NULL before we reach here. ill_dlpi_pending is modified
6541			 * only by an ipsq writer.
6542			 */
6543			ASSERT(ipsq->ipsq_xop->ipx_writer == NULL);
6544			phyi = ipsq->ipsq_phyint;
6545			/*
6546			 * phyi could be NULL if a phyint that is part of an
6547			 * IPMP group is being unplumbed. A more detailed
6548			 * comment is in ipmp_grp_update_kstats().
6549			 */
6550			if (phyi != NULL) {
6551				ill = phyi->phyint_illv4;
6552				if (ill != NULL &&
6553				    (ill->ill_dlpi_pending != DL_PRIM_INVAL ||
6554				    ill->ill_arl_dlpi_pending))
6555					return (B_FALSE);
6556
6557				ill = phyi->phyint_illv6;
6558				if (ill != NULL &&
6559				    ill->ill_dlpi_pending != DL_PRIM_INVAL)
6560					return (B_FALSE);
6561			}
6562
6563		} while ((ipsq = ipsq->ipsq_next) != ipsq_start);
6564
6565		return (B_TRUE);
6566	}
6567
6568	/*
6569	 * Enter the ipsq corresponding to ill, by waiting synchronously till
6570	 * we can enter the ipsq exclusively. Unless 'force' is used, the ipsq
6571	 * will have to drain completely before ipsq_enter returns success.
6572	 * ipx_current_ipif will be set if some exclusive op is in progress,
6573	 * and the ipsq_exit logic will start the next enqueued op after
6574	 * completion of the current op. If 'force' is used, we don't wait
6575	 * for the enqueued ops. This is needed when a conn_close wants to
6576	 * enter the ipsq and abort an ioctl that is somehow stuck. Unplumb
6577	 * of an ill can also use this option, but we don't use it currently.
6578	 */
6579	#define	ENTER_SQ_WAIT_TICKS 100
6580	boolean_t
6581	ipsq_enter(ill_t *ill, boolean_t force, int type)
6582	{
6583		ipsq_t	*ipsq;
6584		ipxop_t	*ipx;
6585		boolean_t waited_enough = B_FALSE;
6586		ip_stack_t *ipst = ill->ill_ipst;
6587
6588		/*
6589		 * Note that the relationship between ill and ipsq is fixed as long as
6590		 * the ill is not ILL_CONDEMNED. Holding ipsq_lock ensures the
6591		 * relationship between the IPSQ and xop cannot change. However,
6592		 * since we cannot hold ipsq_lock across the cv_wait(), it may change
6593		 * while we're waiting. We wait on ill_cv and rely on ipsq_exit()
6594		 * waking up all ills in the xop when it becomes available.
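	 *
	 * An illustrative sketch of a caller (hypothetical, not a specific
	 * caller in this file):
	 *
	 *	if (ipsq_enter(ill, B_FALSE, NEW_OP)) {
	 *		... operate on the ill as writer ...
	 *		ipsq_exit(ill->ill_phyint->phyint_ipsq);
	 *	}
	 *
	 * A B_FALSE return means the ill was condemned while we waited.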
6595 */ 6596 for (;;) { 6597 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 6598 mutex_enter(&ill->ill_lock); 6599 if (ill->ill_state_flags & ILL_CONDEMNED) { 6600 mutex_exit(&ill->ill_lock); 6601 rw_exit(&ipst->ips_ill_g_lock); 6602 return (B_FALSE); 6603 } 6604 6605 ipsq = ill->ill_phyint->phyint_ipsq; 6606 mutex_enter(&ipsq->ipsq_lock); 6607 ipx = ipsq->ipsq_xop; 6608 mutex_enter(&ipx->ipx_lock); 6609 6610 if (ipx->ipx_writer == NULL && (type == CUR_OP || 6611 (ipx->ipx_current_ipif == NULL && ipsq_dlpi_done(ipsq)) || 6612 waited_enough)) 6613 break; 6614 6615 rw_exit(&ipst->ips_ill_g_lock); 6616 6617 if (!force || ipx->ipx_writer != NULL) { 6618 mutex_exit(&ipx->ipx_lock); 6619 mutex_exit(&ipsq->ipsq_lock); 6620 cv_wait(&ill->ill_cv, &ill->ill_lock); 6621 } else { 6622 mutex_exit(&ipx->ipx_lock); 6623 mutex_exit(&ipsq->ipsq_lock); 6624 (void) cv_reltimedwait(&ill->ill_cv, 6625 &ill->ill_lock, ENTER_SQ_WAIT_TICKS, TR_CLOCK_TICK); 6626 waited_enough = B_TRUE; 6627 } 6628 mutex_exit(&ill->ill_lock); 6629 } 6630 6631 ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL); 6632 ASSERT(ipx->ipx_reentry_cnt == 0); 6633 ipx->ipx_writer = curthread; 6634 ipx->ipx_forced = (ipx->ipx_current_ipif != NULL); 6635 ipx->ipx_reentry_cnt++; 6636 #ifdef DEBUG 6637 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH); 6638 #endif 6639 mutex_exit(&ipx->ipx_lock); 6640 mutex_exit(&ipsq->ipsq_lock); 6641 mutex_exit(&ill->ill_lock); 6642 rw_exit(&ipst->ips_ill_g_lock); 6643 6644 return (B_TRUE); 6645 } 6646 6647 /* 6648 * ipif_set_values() has a constraint that it cannot drop the ips_ill_g_lock 6649 * across the call to the core interface ipsq_try_enter() and hence calls this 6650 * function directly. This is explained more fully in ipif_set_values(). 6651 * In order to support the above constraint, ipsq_try_enter is implemented as 6652 * a wrapper that grabs the ips_ill_g_lock and calls this function subsequently 6653 */ 6654 static ipsq_t * 6655 ipsq_try_enter_internal(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func, 6656 int type, boolean_t reentry_ok) 6657 { 6658 ipsq_t *ipsq; 6659 ipxop_t *ipx; 6660 ip_stack_t *ipst = ill->ill_ipst; 6661 6662 /* 6663 * lock ordering: 6664 * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock -> ipx_lock. 6665 * 6666 * ipx of an ipsq can't change when ipsq_lock is held. 6667 */ 6668 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock)); 6669 GRAB_CONN_LOCK(q); 6670 mutex_enter(&ill->ill_lock); 6671 ipsq = ill->ill_phyint->phyint_ipsq; 6672 mutex_enter(&ipsq->ipsq_lock); 6673 ipx = ipsq->ipsq_xop; 6674 mutex_enter(&ipx->ipx_lock); 6675 6676 /* 6677 * 1. Enter the ipsq if we are already writer and reentry is ok. 6678 * (Note: If the caller does not specify reentry_ok then neither 6679 * 'func' nor any of its callees must ever attempt to enter the ipsq 6680 * again. Otherwise it can lead to an infinite loop 6681 * 2. Enter the ipsq if there is no current writer and this attempted 6682 * entry is part of the current operation 6683 * 3. Enter the ipsq if there is no current writer and this is a new 6684 * operation and the operation queue is empty and there is no 6685 * operation currently in progress and if all previously initiated 6686 * DLPI operations have completed. 6687 */ 6688 if ((ipx->ipx_writer == curthread && reentry_ok) || 6689 (ipx->ipx_writer == NULL && (type == CUR_OP || (type == NEW_OP && 6690 !ipx->ipx_ipsq_queued && ipx->ipx_current_ipif == NULL && 6691 ipsq_dlpi_done(ipsq))))) { 6692 /* Success. 
 */
6693		ipx->ipx_reentry_cnt++;
6694		ipx->ipx_writer = curthread;
6695		ipx->ipx_forced = B_FALSE;
6696		mutex_exit(&ipx->ipx_lock);
6697		mutex_exit(&ipsq->ipsq_lock);
6698		mutex_exit(&ill->ill_lock);
6699		RELEASE_CONN_LOCK(q);
6700	#ifdef DEBUG
6701		ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
6702	#endif
6703		return (ipsq);
6704	}
6705
6706	if (func != NULL)
6707		ipsq_enq(ipsq, q, mp, func, type, ill);
6708
6709	mutex_exit(&ipx->ipx_lock);
6710	mutex_exit(&ipsq->ipsq_lock);
6711	mutex_exit(&ill->ill_lock);
6712	RELEASE_CONN_LOCK(q);
6713	return (NULL);
6714	}
6715
6716	/*
6717	 * The ipsq_t (ipsq) is the synchronization data structure used to serialize
6718	 * certain critical operations like plumbing (i.e. most set ioctls), etc.
6719	 * There is one ipsq per phyint. The ipsq
6720	 * serializes exclusive ioctls issued by applications on a per ipsq basis in
6721	 * ipsq_xopq_mphead. It also protects against multiple threads executing in
6722	 * the ipsq. Responses from the driver pertain to the current ioctl (say a
6723	 * DL_BIND_ACK in response to a DL_BIND_REQ initiated as part of bringing
6724	 * up the interface) and are enqueued in ipx_mphead.
6725	 *
6726	 * If a thread does not want to reenter the ipsq when it is already writer,
6727	 * it must ensure that neither the specified reentry point to be called
6728	 * later when the ipsq is empty, nor any code path starting from that
6729	 * reentry point, ever tries to enter the ipsq again. Otherwise it can lead
6730	 * to an infinite loop. The reentry point ip_rput_dlpi_writer is an example.
6731	 * When the thread that is currently exclusive finishes, it (ipsq_exit)
6732	 * dequeues the requests waiting to become exclusive in ipx_mphead and calls
6733	 * the reentry point. When the list at ipx_mphead becomes empty ipsq_exit
6734	 * proceeds to dequeue the next ioctl in ipsq_xopq_mphead and start the next
6735	 * ioctl if the current ioctl has completed. If the current ioctl is still
6736	 * in progress it simply returns. The current ioctl could be waiting for
6737	 * a response from another module (the driver), or could be waiting for
6738	 * the ipif/ill/ire refcnts to drop to zero. In such a case the ipx_pending_mp
6739	 * and ipx_pending_ipif are set. ipx_current_ipif is set throughout the
6740	 * execution of the ioctl and ipsq_exit does not start the next ioctl unless
6741	 * ipx_current_ipif is NULL, which happens only once the ioctl is complete and
6742	 * all associated DLPI operations have completed.
6743	 */
6744
6745	/*
6746	 * Try to enter the IPSQ corresponding to `ipif' or `ill' exclusively (`ipif'
6747	 * and `ill' cannot both be specified). Returns a pointer to the entered IPSQ
6748	 * on success, or NULL on failure. The caller ensures ipif/ill is valid by
6749	 * refholding it as necessary. If the IPSQ cannot be entered and `func' is
6750	 * non-NULL, then `func' will be called back with `q' and `mp' once the IPSQ
6751	 * can be entered. If `func' is NULL, then `q' and `mp' are ignored.
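 *
 * An illustrative sketch of the retry pattern (hypothetical caller):
 *
 *	ipsq = ipsq_try_enter(ipif, NULL, q, mp, ip_process_ioctl,
 *	    NEW_OP, B_TRUE);
 *	if (ipsq == NULL)
 *		return;		(ip_process_ioctl will be called back
 *				with `q' and `mp' when the IPSQ frees up)
 *	... perform the exclusive operation ...
 *	ipsq_exit(ipsq);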
6752 */ 6753 ipsq_t * 6754 ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp, 6755 ipsq_func_t func, int type, boolean_t reentry_ok) 6756 { 6757 ip_stack_t *ipst; 6758 ipsq_t *ipsq; 6759 6760 /* Only 1 of ipif or ill can be specified */ 6761 ASSERT((ipif != NULL) ^ (ill != NULL)); 6762 6763 if (ipif != NULL) 6764 ill = ipif->ipif_ill; 6765 ipst = ill->ill_ipst; 6766 6767 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 6768 ipsq = ipsq_try_enter_internal(ill, q, mp, func, type, reentry_ok); 6769 rw_exit(&ipst->ips_ill_g_lock); 6770 6771 return (ipsq); 6772 } 6773 6774 /* 6775 * Try to enter the IPSQ corresponding to `ill' as writer. The caller ensures 6776 * ill is valid by refholding it if necessary; we will refrele. If the IPSQ 6777 * cannot be entered, the mp is queued for completion. 6778 */ 6779 void 6780 qwriter_ip(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func, int type, 6781 boolean_t reentry_ok) 6782 { 6783 ipsq_t *ipsq; 6784 6785 ipsq = ipsq_try_enter(NULL, ill, q, mp, func, type, reentry_ok); 6786 6787 /* 6788 * Drop the caller's refhold on the ill. This is safe since we either 6789 * entered the IPSQ (and thus are exclusive), or failed to enter the 6790 * IPSQ, in which case we return without accessing ill anymore. This 6791 * is needed because func needs to see the correct refcount. 6792 * e.g. removeif can work only then. 6793 */ 6794 ill_refrele(ill); 6795 if (ipsq != NULL) { 6796 (*func)(ipsq, q, mp, NULL); 6797 ipsq_exit(ipsq); 6798 } 6799 } 6800 6801 /* 6802 * Exit the specified IPSQ. If this is the final exit on it then drain it 6803 * prior to exiting. Caller must be writer on the specified IPSQ. 6804 */ 6805 void 6806 ipsq_exit(ipsq_t *ipsq) 6807 { 6808 mblk_t *mp; 6809 ipsq_t *mp_ipsq; 6810 queue_t *q; 6811 phyint_t *phyi; 6812 ipsq_func_t func; 6813 6814 ASSERT(IAM_WRITER_IPSQ(ipsq)); 6815 6816 ASSERT(ipsq->ipsq_xop->ipx_reentry_cnt >= 1); 6817 if (ipsq->ipsq_xop->ipx_reentry_cnt != 1) { 6818 ipsq->ipsq_xop->ipx_reentry_cnt--; 6819 return; 6820 } 6821 6822 for (;;) { 6823 phyi = ipsq->ipsq_phyint; 6824 mp = ipsq_dq(ipsq); 6825 mp_ipsq = (mp == NULL) ? NULL : (ipsq_t *)mp->b_next; 6826 6827 /* 6828 * If we've changed to a new IPSQ, and the phyint associated 6829 * with the old one has gone away, free the old IPSQ. Note 6830 * that this cannot happen while the IPSQ is in a group. 6831 */ 6832 if (mp_ipsq != ipsq && phyi == NULL) { 6833 ASSERT(ipsq->ipsq_next == ipsq); 6834 ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop); 6835 ipsq_delete(ipsq); 6836 } 6837 6838 if (mp == NULL) 6839 break; 6840 6841 q = mp->b_queue; 6842 func = (ipsq_func_t)mp->b_prev; 6843 ipsq = mp_ipsq; 6844 mp->b_next = mp->b_prev = NULL; 6845 mp->b_queue = NULL; 6846 6847 /* 6848 * If 'q' is an conn queue, it is valid, since we did a 6849 * a refhold on the conn at the start of the ioctl. 6850 * If 'q' is an ill queue, it is valid, since close of an 6851 * ill will clean up its IPSQ. 6852 */ 6853 (*func)(ipsq, q, mp, NULL); 6854 } 6855 } 6856 6857 /* 6858 * Used to start any igmp or mld timers that could not be started 6859 * while holding ill_mcast_lock. The timers can't be started while holding 6860 * the lock, since mld/igmp_start_timers may need to call untimeout() 6861 * which can't be done while holding the lock which the timeout handler 6862 * acquires. Otherwise 6863 * there could be a deadlock since the timeout handlers 6864 * mld_timeout_handler_per_ill/igmp_timeout_handler_per_ill also acquire 6865 * ill_mcast_lock. 
6866 */ 6867 void 6868 ill_mcast_timer_start(ip_stack_t *ipst) 6869 { 6870 int next; 6871 6872 mutex_enter(&ipst->ips_igmp_timer_lock); 6873 next = ipst->ips_igmp_deferred_next; 6874 ipst->ips_igmp_deferred_next = INFINITY; 6875 mutex_exit(&ipst->ips_igmp_timer_lock); 6876 6877 if (next != INFINITY) 6878 igmp_start_timers(next, ipst); 6879 6880 mutex_enter(&ipst->ips_mld_timer_lock); 6881 next = ipst->ips_mld_deferred_next; 6882 ipst->ips_mld_deferred_next = INFINITY; 6883 mutex_exit(&ipst->ips_mld_timer_lock); 6884 6885 if (next != INFINITY) 6886 mld_start_timers(next, ipst); 6887 } 6888 6889 /* 6890 * Start the current exclusive operation on `ipsq'; associate it with `ipif' 6891 * and `ioccmd'. 6892 */ 6893 void 6894 ipsq_current_start(ipsq_t *ipsq, ipif_t *ipif, int ioccmd) 6895 { 6896 ill_t *ill = ipif->ipif_ill; 6897 ipxop_t *ipx = ipsq->ipsq_xop; 6898 6899 ASSERT(IAM_WRITER_IPSQ(ipsq)); 6900 ASSERT(ipx->ipx_current_ipif == NULL); 6901 ASSERT(ipx->ipx_current_ioctl == 0); 6902 6903 ipx->ipx_current_done = B_FALSE; 6904 ipx->ipx_current_ioctl = ioccmd; 6905 mutex_enter(&ipx->ipx_lock); 6906 ipx->ipx_current_ipif = ipif; 6907 mutex_exit(&ipx->ipx_lock); 6908 6909 /* 6910 * Set IPIF_CHANGING on one or more ipifs associated with the 6911 * current exclusive operation. IPIF_CHANGING prevents any new 6912 * references to the ipif (so that the references will eventually 6913 * drop to zero) and also prevents any "get" operations (e.g., 6914 * SIOCGLIFFLAGS) from being able to access the ipif until the 6915 * operation has completed and the ipif is again in a stable state. 6916 * 6917 * For ioctls, IPIF_CHANGING is set on the ipif associated with the 6918 * ioctl. For internal operations (where ioccmd is zero), all ipifs 6919 * on the ill are marked with IPIF_CHANGING since it's unclear which 6920 * ipifs will be affected. 6921 * 6922 * Note that SIOCLIFREMOVEIF is a special case as it sets 6923 * IPIF_CONDEMNED internally after identifying the right ipif to 6924 * operate on. 6925 */ 6926 switch (ioccmd) { 6927 case SIOCLIFREMOVEIF: 6928 break; 6929 case 0: 6930 mutex_enter(&ill->ill_lock); 6931 ipif = ipif->ipif_ill->ill_ipif; 6932 for (; ipif != NULL; ipif = ipif->ipif_next) 6933 ipif->ipif_state_flags |= IPIF_CHANGING; 6934 mutex_exit(&ill->ill_lock); 6935 break; 6936 default: 6937 mutex_enter(&ill->ill_lock); 6938 ipif->ipif_state_flags |= IPIF_CHANGING; 6939 mutex_exit(&ill->ill_lock); 6940 } 6941 } 6942 6943 /* 6944 * Finish the current exclusive operation on `ipsq'. Usually, this will allow 6945 * the next exclusive operation to begin once we ipsq_exit(). However, if 6946 * pending DLPI operations remain, then we will wait for the queue to drain 6947 * before allowing the next exclusive operation to begin. This ensures that 6948 * DLPI operations from one exclusive operation are never improperly processed 6949 * as part of a subsequent exclusive operation. 6950 */ 6951 void 6952 ipsq_current_finish(ipsq_t *ipsq) 6953 { 6954 ipxop_t *ipx = ipsq->ipsq_xop; 6955 t_uscalar_t dlpi_pending = DL_PRIM_INVAL; 6956 ipif_t *ipif = ipx->ipx_current_ipif; 6957 6958 ASSERT(IAM_WRITER_IPSQ(ipsq)); 6959 6960 /* 6961 * For SIOCLIFREMOVEIF, the ipif has been already been blown away 6962 * (but in that case, IPIF_CHANGING will already be clear and no 6963 * pending DLPI messages can remain). 
6964 */ 6965 if (ipx->ipx_current_ioctl != SIOCLIFREMOVEIF) { 6966 ill_t *ill = ipif->ipif_ill; 6967 6968 mutex_enter(&ill->ill_lock); 6969 dlpi_pending = ill->ill_dlpi_pending; 6970 if (ipx->ipx_current_ioctl == 0) { 6971 ipif = ill->ill_ipif; 6972 for (; ipif != NULL; ipif = ipif->ipif_next) 6973 ipif->ipif_state_flags &= ~IPIF_CHANGING; 6974 } else { 6975 ipif->ipif_state_flags &= ~IPIF_CHANGING; 6976 } 6977 mutex_exit(&ill->ill_lock); 6978 } 6979 6980 ASSERT(!ipx->ipx_current_done); 6981 ipx->ipx_current_done = B_TRUE; 6982 ipx->ipx_current_ioctl = 0; 6983 if (dlpi_pending == DL_PRIM_INVAL) { 6984 mutex_enter(&ipx->ipx_lock); 6985 ipx->ipx_current_ipif = NULL; 6986 mutex_exit(&ipx->ipx_lock); 6987 } 6988 } 6989 6990 /* 6991 * The ill is closing. Flush all messages on the ipsq that originated 6992 * from this ill. Usually there wont' be any messages on the ipsq_xopq_mphead 6993 * for this ill since ipsq_enter could not have entered until then. 6994 * New messages can't be queued since the CONDEMNED flag is set. 6995 */ 6996 static void 6997 ipsq_flush(ill_t *ill) 6998 { 6999 queue_t *q; 7000 mblk_t *prev; 7001 mblk_t *mp; 7002 mblk_t *mp_next; 7003 ipxop_t *ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop; 7004 7005 ASSERT(IAM_WRITER_ILL(ill)); 7006 7007 /* 7008 * Flush any messages sent up by the driver. 7009 */ 7010 mutex_enter(&ipx->ipx_lock); 7011 for (prev = NULL, mp = ipx->ipx_mphead; mp != NULL; mp = mp_next) { 7012 mp_next = mp->b_next; 7013 q = mp->b_queue; 7014 if (q == ill->ill_rq || q == ill->ill_wq) { 7015 /* dequeue mp */ 7016 if (prev == NULL) 7017 ipx->ipx_mphead = mp->b_next; 7018 else 7019 prev->b_next = mp->b_next; 7020 if (ipx->ipx_mptail == mp) { 7021 ASSERT(mp_next == NULL); 7022 ipx->ipx_mptail = prev; 7023 } 7024 inet_freemsg(mp); 7025 } else { 7026 prev = mp; 7027 } 7028 } 7029 mutex_exit(&ipx->ipx_lock); 7030 (void) ipsq_pending_mp_cleanup(ill, NULL); 7031 ipsq_xopq_mp_cleanup(ill, NULL); 7032 } 7033 7034 /* 7035 * Parse an ifreq or lifreq struct coming down ioctls and refhold 7036 * and return the associated ipif. 7037 * Return value: 7038 * Non zero: An error has occurred. ci may not be filled out. 7039 * zero : ci is filled out with the ioctl cmd in ci.ci_name, and 7040 * a held ipif in ci.ci_ipif. 7041 */ 7042 int 7043 ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, 7044 cmd_info_t *ci) 7045 { 7046 char *name; 7047 struct ifreq *ifr; 7048 struct lifreq *lifr; 7049 ipif_t *ipif = NULL; 7050 ill_t *ill; 7051 conn_t *connp; 7052 boolean_t isv6; 7053 int err; 7054 mblk_t *mp1; 7055 zoneid_t zoneid; 7056 ip_stack_t *ipst; 7057 7058 if (q->q_next != NULL) { 7059 ill = (ill_t *)q->q_ptr; 7060 isv6 = ill->ill_isv6; 7061 connp = NULL; 7062 zoneid = ALL_ZONES; 7063 ipst = ill->ill_ipst; 7064 } else { 7065 ill = NULL; 7066 connp = Q_TO_CONN(q); 7067 isv6 = (connp->conn_family == AF_INET6); 7068 zoneid = connp->conn_zoneid; 7069 if (zoneid == GLOBAL_ZONEID) { 7070 /* global zone can access ipifs in all zones */ 7071 zoneid = ALL_ZONES; 7072 } 7073 ipst = connp->conn_netstack->netstack_ip; 7074 } 7075 7076 /* Has been checked in ip_wput_nondata */ 7077 mp1 = mp->b_cont->b_cont; 7078 7079 if (ipip->ipi_cmd_type == IF_CMD) { 7080 /* This a old style SIOC[GS]IF* command */ 7081 ifr = (struct ifreq *)mp1->b_rptr; 7082 /* 7083 * Null terminate the string to protect against buffer 7084 * overrun. String was generated by user code and may not 7085 * be trusted. 
7086 */ 7087 ifr->ifr_name[IFNAMSIZ - 1] = '\0'; 7088 name = ifr->ifr_name; 7089 ci->ci_sin = (sin_t *)&ifr->ifr_addr; 7090 ci->ci_sin6 = NULL; 7091 ci->ci_lifr = (struct lifreq *)ifr; 7092 } else { 7093 /* This a new style SIOC[GS]LIF* command */ 7094 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 7095 lifr = (struct lifreq *)mp1->b_rptr; 7096 /* 7097 * Null terminate the string to protect against buffer 7098 * overrun. String was generated by user code and may not 7099 * be trusted. 7100 */ 7101 lifr->lifr_name[LIFNAMSIZ - 1] = '\0'; 7102 name = lifr->lifr_name; 7103 ci->ci_sin = (sin_t *)&lifr->lifr_addr; 7104 ci->ci_sin6 = (sin6_t *)&lifr->lifr_addr; 7105 ci->ci_lifr = lifr; 7106 } 7107 7108 if (ipip->ipi_cmd == SIOCSLIFNAME) { 7109 /* 7110 * The ioctl will be failed if the ioctl comes down 7111 * an conn stream 7112 */ 7113 if (ill == NULL) { 7114 /* 7115 * Not an ill queue, return EINVAL same as the 7116 * old error code. 7117 */ 7118 return (ENXIO); 7119 } 7120 ipif = ill->ill_ipif; 7121 ipif_refhold(ipif); 7122 } else { 7123 /* 7124 * Ensure that ioctls don't see any internal state changes 7125 * caused by set ioctls by deferring them if IPIF_CHANGING is 7126 * set. 7127 */ 7128 ipif = ipif_lookup_on_name_async(name, mi_strlen(name), 7129 isv6, zoneid, q, mp, ip_process_ioctl, &err, ipst); 7130 if (ipif == NULL) { 7131 if (err == EINPROGRESS) 7132 return (err); 7133 err = 0; /* Ensure we don't use it below */ 7134 } 7135 } 7136 7137 /* 7138 * Old style [GS]IFCMD does not admit IPv6 ipif 7139 */ 7140 if (ipif != NULL && ipif->ipif_isv6 && ipip->ipi_cmd_type == IF_CMD) { 7141 ipif_refrele(ipif); 7142 return (ENXIO); 7143 } 7144 7145 if (ipif == NULL && ill != NULL && ill->ill_ipif != NULL && 7146 name[0] == '\0') { 7147 /* 7148 * Handle a or a SIOC?IF* with a null name 7149 * during plumb (on the ill queue before the I_PLINK). 7150 */ 7151 ipif = ill->ill_ipif; 7152 ipif_refhold(ipif); 7153 } 7154 7155 if (ipif == NULL) 7156 return (ENXIO); 7157 7158 DTRACE_PROBE4(ipif__ioctl, char *, "ip_extract_lifreq", 7159 int, ipip->ipi_cmd, ill_t *, ipif->ipif_ill, ipif_t *, ipif); 7160 7161 ci->ci_ipif = ipif; 7162 return (0); 7163 } 7164 7165 /* 7166 * Return the total number of ipifs. 7167 */ 7168 static uint_t 7169 ip_get_numifs(zoneid_t zoneid, ip_stack_t *ipst) 7170 { 7171 uint_t numifs = 0; 7172 ill_t *ill; 7173 ill_walk_context_t ctx; 7174 ipif_t *ipif; 7175 7176 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 7177 ill = ILL_START_WALK_V4(&ctx, ipst); 7178 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 7179 if (IS_UNDER_IPMP(ill)) 7180 continue; 7181 for (ipif = ill->ill_ipif; ipif != NULL; 7182 ipif = ipif->ipif_next) { 7183 if (ipif->ipif_zoneid == zoneid || 7184 ipif->ipif_zoneid == ALL_ZONES) 7185 numifs++; 7186 } 7187 } 7188 rw_exit(&ipst->ips_ill_g_lock); 7189 return (numifs); 7190 } 7191 7192 /* 7193 * Return the total number of ipifs. 
7194 */ 7195 static uint_t 7196 ip_get_numlifs(int family, int lifn_flags, zoneid_t zoneid, ip_stack_t *ipst) 7197 { 7198 uint_t numifs = 0; 7199 ill_t *ill; 7200 ipif_t *ipif; 7201 ill_walk_context_t ctx; 7202 7203 ip1dbg(("ip_get_numlifs(%d %u %d)\n", family, lifn_flags, (int)zoneid)); 7204 7205 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 7206 if (family == AF_INET) 7207 ill = ILL_START_WALK_V4(&ctx, ipst); 7208 else if (family == AF_INET6) 7209 ill = ILL_START_WALK_V6(&ctx, ipst); 7210 else 7211 ill = ILL_START_WALK_ALL(&ctx, ipst); 7212 7213 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 7214 if (IS_UNDER_IPMP(ill) && !(lifn_flags & LIFC_UNDER_IPMP)) 7215 continue; 7216 7217 for (ipif = ill->ill_ipif; ipif != NULL; 7218 ipif = ipif->ipif_next) { 7219 if ((ipif->ipif_flags & IPIF_NOXMIT) && 7220 !(lifn_flags & LIFC_NOXMIT)) 7221 continue; 7222 if ((ipif->ipif_flags & IPIF_TEMPORARY) && 7223 !(lifn_flags & LIFC_TEMPORARY)) 7224 continue; 7225 if (((ipif->ipif_flags & 7226 (IPIF_NOXMIT|IPIF_NOLOCAL| 7227 IPIF_DEPRECATED)) || 7228 IS_LOOPBACK(ill) || 7229 !(ipif->ipif_flags & IPIF_UP)) && 7230 (lifn_flags & LIFC_EXTERNAL_SOURCE)) 7231 continue; 7232 7233 if (zoneid != ipif->ipif_zoneid && 7234 ipif->ipif_zoneid != ALL_ZONES && 7235 (zoneid != GLOBAL_ZONEID || 7236 !(lifn_flags & LIFC_ALLZONES))) 7237 continue; 7238 7239 numifs++; 7240 } 7241 } 7242 rw_exit(&ipst->ips_ill_g_lock); 7243 return (numifs); 7244 } 7245 7246 uint_t 7247 ip_get_lifsrcofnum(ill_t *ill) 7248 { 7249 uint_t numifs = 0; 7250 ill_t *ill_head = ill; 7251 ip_stack_t *ipst = ill->ill_ipst; 7252 7253 /* 7254 * ill_g_usesrc_lock protects ill_usesrc_grp_next, for example, some 7255 * other thread may be trying to relink the ILLs in this usesrc group 7256 * and adjusting the ill_usesrc_grp_next pointers 7257 */ 7258 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER); 7259 if ((ill->ill_usesrc_ifindex == 0) && 7260 (ill->ill_usesrc_grp_next != NULL)) { 7261 for (; (ill != NULL) && (ill->ill_usesrc_grp_next != ill_head); 7262 ill = ill->ill_usesrc_grp_next) 7263 numifs++; 7264 } 7265 rw_exit(&ipst->ips_ill_g_usesrc_lock); 7266 7267 return (numifs); 7268 } 7269 7270 /* Null values are passed in for ipif, sin, and ifreq */ 7271 /* ARGSUSED */ 7272 int 7273 ip_sioctl_get_ifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 7274 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 7275 { 7276 int *nump; 7277 conn_t *connp = Q_TO_CONN(q); 7278 7279 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */ 7280 7281 /* Existence of b_cont->b_cont checked in ip_wput_nondata */ 7282 nump = (int *)mp->b_cont->b_cont->b_rptr; 7283 7284 *nump = ip_get_numifs(connp->conn_zoneid, 7285 connp->conn_netstack->netstack_ip); 7286 ip1dbg(("ip_sioctl_get_ifnum numifs %d", *nump)); 7287 return (0); 7288 } 7289 7290 /* Null values are passed in for ipif, sin, and ifreq */ 7291 /* ARGSUSED */ 7292 int 7293 ip_sioctl_get_lifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, 7294 queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 7295 { 7296 struct lifnum *lifn; 7297 mblk_t *mp1; 7298 conn_t *connp = Q_TO_CONN(q); 7299 7300 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */ 7301 7302 /* Existence checked in ip_wput_nondata */ 7303 mp1 = mp->b_cont->b_cont; 7304 7305 lifn = (struct lifnum *)mp1->b_rptr; 7306 switch (lifn->lifn_family) { 7307 case AF_UNSPEC: 7308 case AF_INET: 7309 case AF_INET6: 7310 break; 7311 default: 7312 return (EAFNOSUPPORT); 7313 } 7314 7315 lifn->lifn_count = ip_get_numlifs(lifn->lifn_family, 
lifn->lifn_flags,
7316	    connp->conn_zoneid, connp->conn_netstack->netstack_ip);
7317	ip1dbg(("ip_sioctl_get_lifnum numifs %d", lifn->lifn_count));
7318	return (0);
7319	}
7320
7321	/* ARGSUSED */
7322	int
7323	ip_sioctl_get_ifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
7324	    mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
7325	{
7326		STRUCT_HANDLE(ifconf, ifc);
7327		mblk_t *mp1;
7328		struct iocblk *iocp;
7329		struct ifreq *ifr;
7330		ill_walk_context_t	ctx;
7331		ill_t	*ill;
7332		ipif_t	*ipif;
7333		struct sockaddr_in *sin;
7334		int32_t	ifclen;
7335		zoneid_t zoneid;
7336		ip_stack_t *ipst = CONNQ_TO_IPST(q);
7337
7338		ASSERT(q->q_next == NULL); /* not valid ioctls for ip as a module */
7339
7340		ip1dbg(("ip_sioctl_get_ifconf"));
7341		/* Existence verified in ip_wput_nondata */
7342		mp1 = mp->b_cont->b_cont;
7343		iocp = (struct iocblk *)mp->b_rptr;
7344		zoneid = Q_TO_CONN(q)->conn_zoneid;
7345
7346		/*
7347		 * The original SIOCGIFCONF passed in a struct ifconf which specified
7348		 * the user buffer address and length into which the list of struct
7349		 * ifreqs was to be copied. Since AT&T Streams does not seem to
7350		 * allow M_COPYOUT to be used in conjunction with I_STR IOCTLS,
7351		 * the SIOCGIFCONF operation was redefined to simply provide
7352		 * a large output buffer into which we are supposed to jam the ifreq
7353		 * array. The same ioctl command code was used, despite the fact that
7354		 * both the applications and the kernel code had to change, thus making
7355		 * it impossible to support both interfaces.
7356		 *
7357		 * For reasons not good enough to try to explain, the following
7358		 * algorithm is used for deciding what to do with one of these:
7359		 * If the IOCTL comes in as an I_STR, it is assumed to be of the new
7360		 * form with the output buffer coming down as the continuation message.
7361		 * If it arrives as a TRANSPARENT IOCTL, it is assumed to be old style,
7362		 * and we have to copy in the ifconf structure to find out how big the
7363		 * output buffer is and where to copy out to. Sure no problem...
7364		 *
7365		 */
7366		STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, NULL);
7367		if ((mp1->b_wptr - mp1->b_rptr) == STRUCT_SIZE(ifc)) {
7368			int numifs = 0;
7369			size_t ifc_bufsize;
7370
7371			/*
7372			 * Must be (better be!) continuation of a TRANSPARENT
7373			 * IOCTL. We just copied in the ifconf structure.
7374			 */
7375			STRUCT_SET_HANDLE(ifc, iocp->ioc_flag,
7376			    (struct ifconf *)mp1->b_rptr);
7377
7378			/*
7379			 * Allocate a buffer to hold requested information.
7380			 *
7381			 * If ifc_len is larger than what is needed, we only
7382			 * allocate what we will use.
7383			 *
7384			 * If ifc_len is smaller than what is needed, return
7385			 * EINVAL.
7386			 *
7387			 * XXX: the ill_t structure can have 2 counters, for
7388			 * v4 and v6 (not just ill_ipif_up_count) to store the
7389			 * number of interfaces for a device, so we don't need
7390			 * to count them here...
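		 *
		 * For example (illustrative numbers only): with 3 matching
		 * ipifs and a 32-byte struct ifreq, the code below wants
		 * ifc_bufsize = 96; if the caller passed ifc_len = 64, an
		 * O_SIOCGIFCONF request fails with EINVAL, while the newer
		 * form quietly truncates the answer to the 64 bytes provided.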
7391 */ 7392 numifs = ip_get_numifs(zoneid, ipst); 7393 7394 ifclen = STRUCT_FGET(ifc, ifc_len); 7395 ifc_bufsize = numifs * sizeof (struct ifreq); 7396 if (ifc_bufsize > ifclen) { 7397 if (iocp->ioc_cmd == O_SIOCGIFCONF) { 7398 /* old behaviour */ 7399 return (EINVAL); 7400 } else { 7401 ifc_bufsize = ifclen; 7402 } 7403 } 7404 7405 mp1 = mi_copyout_alloc(q, mp, 7406 STRUCT_FGETP(ifc, ifc_buf), ifc_bufsize, B_FALSE); 7407 if (mp1 == NULL) 7408 return (ENOMEM); 7409 7410 mp1->b_wptr = mp1->b_rptr + ifc_bufsize; 7411 } 7412 bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr); 7413 /* 7414 * the SIOCGIFCONF ioctl only knows about 7415 * IPv4 addresses, so don't try to tell 7416 * it about interfaces with IPv6-only 7417 * addresses. (Last parm 'isv6' is B_FALSE) 7418 */ 7419 7420 ifr = (struct ifreq *)mp1->b_rptr; 7421 7422 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 7423 ill = ILL_START_WALK_V4(&ctx, ipst); 7424 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 7425 if (IS_UNDER_IPMP(ill)) 7426 continue; 7427 for (ipif = ill->ill_ipif; ipif != NULL; 7428 ipif = ipif->ipif_next) { 7429 if (zoneid != ipif->ipif_zoneid && 7430 ipif->ipif_zoneid != ALL_ZONES) 7431 continue; 7432 if ((uchar_t *)&ifr[1] > mp1->b_wptr) { 7433 if (iocp->ioc_cmd == O_SIOCGIFCONF) { 7434 /* old behaviour */ 7435 rw_exit(&ipst->ips_ill_g_lock); 7436 return (EINVAL); 7437 } else { 7438 goto if_copydone; 7439 } 7440 } 7441 ipif_get_name(ipif, ifr->ifr_name, 7442 sizeof (ifr->ifr_name)); 7443 sin = (sin_t *)&ifr->ifr_addr; 7444 *sin = sin_null; 7445 sin->sin_family = AF_INET; 7446 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 7447 ifr++; 7448 } 7449 } 7450 if_copydone: 7451 rw_exit(&ipst->ips_ill_g_lock); 7452 mp1->b_wptr = (uchar_t *)ifr; 7453 7454 if (STRUCT_BUF(ifc) != NULL) { 7455 STRUCT_FSET(ifc, ifc_len, 7456 (int)((uchar_t *)ifr - mp1->b_rptr)); 7457 } 7458 return (0); 7459 } 7460 7461 /* 7462 * Get the interfaces using the address hosted on the interface passed in, 7463 * as a source adddress 7464 */ 7465 /* ARGSUSED */ 7466 int 7467 ip_sioctl_get_lifsrcof(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 7468 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 7469 { 7470 mblk_t *mp1; 7471 ill_t *ill, *ill_head; 7472 ipif_t *ipif, *orig_ipif; 7473 int numlifs = 0; 7474 size_t lifs_bufsize, lifsmaxlen; 7475 struct lifreq *lifr; 7476 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 7477 uint_t ifindex; 7478 zoneid_t zoneid; 7479 boolean_t isv6 = B_FALSE; 7480 struct sockaddr_in *sin; 7481 struct sockaddr_in6 *sin6; 7482 STRUCT_HANDLE(lifsrcof, lifs); 7483 ip_stack_t *ipst; 7484 7485 ipst = CONNQ_TO_IPST(q); 7486 7487 ASSERT(q->q_next == NULL); 7488 7489 zoneid = Q_TO_CONN(q)->conn_zoneid; 7490 7491 /* Existence verified in ip_wput_nondata */ 7492 mp1 = mp->b_cont->b_cont; 7493 7494 /* 7495 * Must be (better be!) continuation of a TRANSPARENT 7496 * IOCTL. We just copied in the lifsrcof structure. 
7497 */ 7498 STRUCT_SET_HANDLE(lifs, iocp->ioc_flag, 7499 (struct lifsrcof *)mp1->b_rptr); 7500 7501 if (MBLKL(mp1) != STRUCT_SIZE(lifs)) 7502 return (EINVAL); 7503 7504 ifindex = STRUCT_FGET(lifs, lifs_ifindex); 7505 isv6 = (Q_TO_CONN(q))->conn_family == AF_INET6; 7506 ipif = ipif_lookup_on_ifindex(ifindex, isv6, zoneid, ipst); 7507 if (ipif == NULL) { 7508 ip1dbg(("ip_sioctl_get_lifsrcof: no ipif for ifindex %d\n", 7509 ifindex)); 7510 return (ENXIO); 7511 } 7512 7513 /* Allocate a buffer to hold requested information */ 7514 numlifs = ip_get_lifsrcofnum(ipif->ipif_ill); 7515 lifs_bufsize = numlifs * sizeof (struct lifreq); 7516 lifsmaxlen = STRUCT_FGET(lifs, lifs_maxlen); 7517 /* The actual size needed is always returned in lifs_len */ 7518 STRUCT_FSET(lifs, lifs_len, lifs_bufsize); 7519 7520 /* If the amount we need is more than what is passed in, abort */ 7521 if (lifs_bufsize > lifsmaxlen || lifs_bufsize == 0) { 7522 ipif_refrele(ipif); 7523 return (0); 7524 } 7525 7526 mp1 = mi_copyout_alloc(q, mp, 7527 STRUCT_FGETP(lifs, lifs_buf), lifs_bufsize, B_FALSE); 7528 if (mp1 == NULL) { 7529 ipif_refrele(ipif); 7530 return (ENOMEM); 7531 } 7532 7533 mp1->b_wptr = mp1->b_rptr + lifs_bufsize; 7534 bzero(mp1->b_rptr, lifs_bufsize); 7535 7536 lifr = (struct lifreq *)mp1->b_rptr; 7537 7538 ill = ill_head = ipif->ipif_ill; 7539 orig_ipif = ipif; 7540 7541 /* ill_g_usesrc_lock protects ill_usesrc_grp_next */ 7542 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER); 7543 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 7544 7545 ill = ill->ill_usesrc_grp_next; /* start from next ill */ 7546 for (; (ill != NULL) && (ill != ill_head); 7547 ill = ill->ill_usesrc_grp_next) { 7548 7549 if ((uchar_t *)&lifr[1] > mp1->b_wptr) 7550 break; 7551 7552 ipif = ill->ill_ipif; 7553 ipif_get_name(ipif, lifr->lifr_name, sizeof (lifr->lifr_name)); 7554 if (ipif->ipif_isv6) { 7555 sin6 = (sin6_t *)&lifr->lifr_addr; 7556 *sin6 = sin6_null; 7557 sin6->sin6_family = AF_INET6; 7558 sin6->sin6_addr = ipif->ipif_v6lcl_addr; 7559 lifr->lifr_addrlen = ip_mask_to_plen_v6( 7560 &ipif->ipif_v6net_mask); 7561 } else { 7562 sin = (sin_t *)&lifr->lifr_addr; 7563 *sin = sin_null; 7564 sin->sin_family = AF_INET; 7565 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 7566 lifr->lifr_addrlen = ip_mask_to_plen( 7567 ipif->ipif_net_mask); 7568 } 7569 lifr++; 7570 } 7571 rw_exit(&ipst->ips_ill_g_lock); 7572 rw_exit(&ipst->ips_ill_g_usesrc_lock); 7573 ipif_refrele(orig_ipif); 7574 mp1->b_wptr = (uchar_t *)lifr; 7575 STRUCT_FSET(lifs, lifs_len, (int)((uchar_t *)lifr - mp1->b_rptr)); 7576 7577 return (0); 7578 } 7579 7580 /* ARGSUSED */ 7581 int 7582 ip_sioctl_get_lifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, 7583 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) 7584 { 7585 mblk_t *mp1; 7586 int list; 7587 ill_t *ill; 7588 ipif_t *ipif; 7589 int flags; 7590 int numlifs = 0; 7591 size_t lifc_bufsize; 7592 struct lifreq *lifr; 7593 sa_family_t family; 7594 struct sockaddr_in *sin; 7595 struct sockaddr_in6 *sin6; 7596 ill_walk_context_t ctx; 7597 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 7598 int32_t lifclen; 7599 zoneid_t zoneid; 7600 STRUCT_HANDLE(lifconf, lifc); 7601 ip_stack_t *ipst = CONNQ_TO_IPST(q); 7602 7603 ip1dbg(("ip_sioctl_get_lifconf")); 7604 7605 ASSERT(q->q_next == NULL); 7606 7607 zoneid = Q_TO_CONN(q)->conn_zoneid; 7608 7609 /* Existence verified in ip_wput_nondata */ 7610 mp1 = mp->b_cont->b_cont; 7611 7612 /* 7613 * An extended version of SIOCGIFCONF that takes an 7614 * additional address family and flags field. 
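	 *
	 * For illustration only, a hypothetical caller wanting every
	 * interface visible to it, v4 and v6, might fill in (the flag
	 * semantics follow below):
	 *
	 *	struct lifconf lifc;
	 *
	 *	lifc.lifc_family = AF_UNSPEC;
	 *	lifc.lifc_flags = LIFC_NOXMIT | LIFC_TEMPORARY |
	 *	    LIFC_UNDER_IPMP;
	 *	lifc.lifc_len = bufsize;
	 *	lifc.lifc_buf = buf;
	 *	(void) ioctl(s, SIOCGLIFCONF, &lifc);
	 *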
7615 	 * AF_UNSPEC retrieves both IPv4 and IPv6.
7616 	 * Unless LIFC_NOXMIT is specified the IPIF_NOXMIT
7617 	 * interfaces are omitted.
7618 	 * Similarly, IPIF_TEMPORARY interfaces are omitted
7619 	 * unless LIFC_TEMPORARY is specified.
7620 	 * If LIFC_EXTERNAL_SOURCE is specified, interfaces that are
7621 	 * IPIF_NOXMIT, IPIF_NOLOCAL, PHYI_LOOPBACK, IPIF_DEPRECATED,
7622 	 * or not IPIF_UP are omitted.  LIFC_EXTERNAL_SOURCE
7623 	 * has priority over LIFC_NOXMIT.
7624 	 */
7625 	STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, NULL);
7626 
7627 	if ((mp1->b_wptr - mp1->b_rptr) != STRUCT_SIZE(lifc))
7628 		return (EINVAL);
7629 
7630 	/*
7631 	 * Must be (better be!) continuation of a TRANSPARENT
7632 	 * IOCTL.  We just copied in the lifconf structure.
7633 	 */
7634 	STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, (struct lifconf *)mp1->b_rptr);
7635 
7636 	family = STRUCT_FGET(lifc, lifc_family);
7637 	flags = STRUCT_FGET(lifc, lifc_flags);
7638 
7639 	switch (family) {
7640 	case AF_UNSPEC:
7641 		/*
7642 		 * walk all ILLs.
7643 		 */
7644 		list = MAX_G_HEADS;
7645 		break;
7646 	case AF_INET:
7647 		/*
7648 		 * walk only IPv4 ILLs.
7649 		 */
7650 		list = IP_V4_G_HEAD;
7651 		break;
7652 	case AF_INET6:
7653 		/*
7654 		 * walk only IPv6 ILLs.
7655 		 */
7656 		list = IP_V6_G_HEAD;
7657 		break;
7658 	default:
7659 		return (EAFNOSUPPORT);
7660 	}
7661 
7662 	/*
7663 	 * Allocate a buffer to hold requested information.
7664 	 *
7665 	 * If lifc_len is larger than what is needed, we only
7666 	 * allocate what we will use.
7667 	 *
7668 	 * If lifc_len is smaller than what is needed, return
7669 	 * EINVAL.
7670 	 */
7671 	numlifs = ip_get_numlifs(family, flags, zoneid, ipst);
7672 	lifc_bufsize = numlifs * sizeof (struct lifreq);
7673 	lifclen = STRUCT_FGET(lifc, lifc_len);
7674 	if (lifc_bufsize > lifclen) {
7675 		if (iocp->ioc_cmd == O_SIOCGLIFCONF)
7676 			return (EINVAL);
7677 		else
7678 			lifc_bufsize = lifclen;
7679 	}
7680 
7681 	mp1 = mi_copyout_alloc(q, mp,
7682 	    STRUCT_FGETP(lifc, lifc_buf), lifc_bufsize, B_FALSE);
7683 	if (mp1 == NULL)
7684 		return (ENOMEM);
7685 
7686 	mp1->b_wptr = mp1->b_rptr + lifc_bufsize;
7687 	bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr);
7688 
7689 	lifr = (struct lifreq *)mp1->b_rptr;
7690 
7691 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
7692 	ill = ill_first(list, list, &ctx, ipst);
7693 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
7694 		if (IS_UNDER_IPMP(ill) && !(flags & LIFC_UNDER_IPMP))
7695 			continue;
7696 
7697 		for (ipif = ill->ill_ipif; ipif != NULL;
7698 		    ipif = ipif->ipif_next) {
7699 			if ((ipif->ipif_flags & IPIF_NOXMIT) &&
7700 			    !(flags & LIFC_NOXMIT))
7701 				continue;
7702 
7703 			if ((ipif->ipif_flags & IPIF_TEMPORARY) &&
7704 			    !(flags & LIFC_TEMPORARY))
7705 				continue;
7706 
7707 			if (((ipif->ipif_flags &
7708 			    (IPIF_NOXMIT|IPIF_NOLOCAL|
7709 			    IPIF_DEPRECATED)) ||
7710 			    IS_LOOPBACK(ill) ||
7711 			    !(ipif->ipif_flags & IPIF_UP)) &&
7712 			    (flags & LIFC_EXTERNAL_SOURCE))
7713 				continue;
7714 
7715 			if (zoneid != ipif->ipif_zoneid &&
7716 			    ipif->ipif_zoneid != ALL_ZONES &&
7717 			    (zoneid != GLOBAL_ZONEID ||
7718 			    !(flags & LIFC_ALLZONES)))
7719 				continue;
7720 
7721 			if ((uchar_t *)&lifr[1] > mp1->b_wptr) {
7722 				if (iocp->ioc_cmd == O_SIOCGLIFCONF) {
7723 					rw_exit(&ipst->ips_ill_g_lock);
7724 					return (EINVAL);
7725 				} else {
7726 					goto lif_copydone;
7727 				}
7728 			}
7729 
7730 			ipif_get_name(ipif, lifr->lifr_name,
7731 			    sizeof (lifr->lifr_name));
7732 			lifr->lifr_type = ill->ill_type;
7733 			if (ipif->ipif_isv6) {
7734 				sin6 = (sin6_t *)&lifr->lifr_addr;
7735 				*sin6 = sin6_null;
7736 				sin6->sin6_family = AF_INET6;
7737 				sin6->sin6_addr =
7738 				    ipif->ipif_v6lcl_addr;
7739 				lifr->lifr_addrlen =
7740 				    ip_mask_to_plen_v6(
&ipif->ipif_v6net_mask);
7742 			} else {
7743 				sin = (sin_t *)&lifr->lifr_addr;
7744 				*sin = sin_null;
7745 				sin->sin_family = AF_INET;
7746 				sin->sin_addr.s_addr =
7747 				    ipif->ipif_lcl_addr;
7748 				lifr->lifr_addrlen =
7749 				    ip_mask_to_plen(
7750 				    ipif->ipif_net_mask);
7751 			}
7752 			lifr++;
7753 		}
7754 	}
7755 lif_copydone:
7756 	rw_exit(&ipst->ips_ill_g_lock);
7757 
7758 	mp1->b_wptr = (uchar_t *)lifr;
7759 	if (STRUCT_BUF(lifc) != NULL) {
7760 		STRUCT_FSET(lifc, lifc_len,
7761 		    (int)((uchar_t *)lifr - mp1->b_rptr));
7762 	}
7763 	return (0);
7764 }
7765 
7766 static void
7767 ip_sioctl_ip6addrpolicy(queue_t *q, mblk_t *mp)
7768 {
7769 	ip6_asp_t *table;
7770 	size_t table_size;
7771 	mblk_t *data_mp;
7772 	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
7773 	ip_stack_t	*ipst;
7774 
7775 	if (q->q_next == NULL)
7776 		ipst = CONNQ_TO_IPST(q);
7777 	else
7778 		ipst = ILLQ_TO_IPST(q);
7779 
7780 	/* These two ioctls are I_STR only */
7781 	if (iocp->ioc_count == TRANSPARENT) {
7782 		miocnak(q, mp, 0, EINVAL);
7783 		return;
7784 	}
7785 
7786 	data_mp = mp->b_cont;
7787 	if (data_mp == NULL) {
7788 		/* The user passed us a NULL argument */
7789 		table = NULL;
7790 		table_size = iocp->ioc_count;
7791 	} else {
7792 		/*
7793 		 * The user provided a table.  The stream head
7794 		 * may have copied in the user data in chunks,
7795 		 * so make sure everything is pulled up
7796 		 * properly.
7797 		 */
7798 		if (MBLKL(data_mp) < iocp->ioc_count) {
7799 			mblk_t *new_data_mp;
7800 			if ((new_data_mp = msgpullup(data_mp, -1)) ==
7801 			    NULL) {
7802 				miocnak(q, mp, 0, ENOMEM);
7803 				return;
7804 			}
7805 			freemsg(data_mp);
7806 			data_mp = new_data_mp;
7807 			mp->b_cont = data_mp;
7808 		}
7809 		table = (ip6_asp_t *)data_mp->b_rptr;
7810 		table_size = iocp->ioc_count;
7811 	}
7812 
7813 	switch (iocp->ioc_cmd) {
7814 	case SIOCGIP6ADDRPOLICY:
7815 		iocp->ioc_rval = ip6_asp_get(table, table_size, ipst);
7816 		if (iocp->ioc_rval == -1)
7817 			iocp->ioc_error = EINVAL;
7818 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4
7819 		else if (table != NULL &&
7820 		    (iocp->ioc_flag & IOC_MODELS) == IOC_ILP32) {
7821 			ip6_asp_t *src = table;
7822 			ip6_asp32_t *dst = (void *)table;
7823 			int count = table_size / sizeof (ip6_asp_t);
7824 			int i;
7825 
7826 			/*
7827 			 * We need to do an in-place shrink of the array
7828 			 * to match the alignment attributes of the
7829 			 * 32-bit ABI that is looking at it.
7830 			 */
7831 			/* LINTED: logical expression always true: op "||" */
7832 			ASSERT(sizeof (*src) > sizeof (*dst));
7833 			for (i = 1; i < count; i++)
7834 				bcopy(src + i, dst + i, sizeof (*dst));
7835 		}
7836 #endif
7837 		break;
7838 
7839 	case SIOCSIP6ADDRPOLICY:
7840 		ASSERT(mp->b_prev == NULL);
7841 		mp->b_prev = (void *)q;
7842 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4
7843 		/*
7844 		 * We pass in the datamodel here so that the ip6_asp_replace()
7845 		 * routine can handle converting from 32-bit to native formats
7846 		 * where necessary.
7847 		 *
7848 		 * A better way to handle this might be to convert the inbound
7849 		 * data structure here, and hang it off a new 'mp'; thus the
7850 		 * ip6_asp_replace() logic would always be dealing with native
7851 		 * format data structures.
7852 		 *
7853 		 * (An even simpler way to handle these ioctls is to just
7854 		 * add a 32-bit trailing 'pad' field to the ip6_asp_t structure
7855 		 * and just recompile everything that depends on it.)
7856 		 */
7857 #endif
7858 		ip6_asp_replace(mp, table, table_size, B_FALSE, ipst,
7859 		    iocp->ioc_flag & IOC_MODELS);
7860 		return;
7861 	}
7862 
7863 	DB_TYPE(mp) = (iocp->ioc_error == 0) ?
M_IOCACK : M_IOCNAK; 7864 qreply(q, mp); 7865 } 7866 7867 static void 7868 ip_sioctl_dstinfo(queue_t *q, mblk_t *mp) 7869 { 7870 mblk_t *data_mp; 7871 struct dstinforeq *dir; 7872 uint8_t *end, *cur; 7873 in6_addr_t *daddr, *saddr; 7874 ipaddr_t v4daddr; 7875 ire_t *ire; 7876 ipaddr_t v4setsrc; 7877 in6_addr_t v6setsrc; 7878 char *slabel, *dlabel; 7879 boolean_t isipv4; 7880 int match_ire; 7881 ill_t *dst_ill; 7882 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 7883 conn_t *connp = Q_TO_CONN(q); 7884 zoneid_t zoneid = IPCL_ZONEID(connp); 7885 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 7886 uint64_t ipif_flags; 7887 7888 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 7889 7890 /* 7891 * This ioctl is I_STR only, and must have a 7892 * data mblk following the M_IOCTL mblk. 7893 */ 7894 data_mp = mp->b_cont; 7895 if (iocp->ioc_count == TRANSPARENT || data_mp == NULL) { 7896 miocnak(q, mp, 0, EINVAL); 7897 return; 7898 } 7899 7900 if (MBLKL(data_mp) < iocp->ioc_count) { 7901 mblk_t *new_data_mp; 7902 7903 if ((new_data_mp = msgpullup(data_mp, -1)) == NULL) { 7904 miocnak(q, mp, 0, ENOMEM); 7905 return; 7906 } 7907 freemsg(data_mp); 7908 data_mp = new_data_mp; 7909 mp->b_cont = data_mp; 7910 } 7911 match_ire = MATCH_IRE_DSTONLY; 7912 7913 for (cur = data_mp->b_rptr, end = data_mp->b_wptr; 7914 end - cur >= sizeof (struct dstinforeq); 7915 cur += sizeof (struct dstinforeq)) { 7916 dir = (struct dstinforeq *)cur; 7917 daddr = &dir->dir_daddr; 7918 saddr = &dir->dir_saddr; 7919 7920 /* 7921 * ip_addr_scope_v6() and ip6_asp_lookup() handle 7922 * v4 mapped addresses; ire_ftable_lookup_v6() 7923 * and ip_select_source_v6() do not. 7924 */ 7925 dir->dir_dscope = ip_addr_scope_v6(daddr); 7926 dlabel = ip6_asp_lookup(daddr, &dir->dir_precedence, ipst); 7927 7928 isipv4 = IN6_IS_ADDR_V4MAPPED(daddr); 7929 if (isipv4) { 7930 IN6_V4MAPPED_TO_IPADDR(daddr, v4daddr); 7931 v4setsrc = INADDR_ANY; 7932 ire = ire_route_recursive_v4(v4daddr, 0, NULL, zoneid, 7933 NULL, match_ire, IRR_ALLOCATE, 0, ipst, &v4setsrc, 7934 NULL, NULL); 7935 } else { 7936 v6setsrc = ipv6_all_zeros; 7937 ire = ire_route_recursive_v6(daddr, 0, NULL, zoneid, 7938 NULL, match_ire, IRR_ALLOCATE, 0, ipst, &v6setsrc, 7939 NULL, NULL); 7940 } 7941 ASSERT(ire != NULL); 7942 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 7943 ire_refrele(ire); 7944 dir->dir_dreachable = 0; 7945 7946 /* move on to next dst addr */ 7947 continue; 7948 } 7949 dir->dir_dreachable = 1; 7950 7951 dst_ill = ire_nexthop_ill(ire); 7952 if (dst_ill == NULL) { 7953 ire_refrele(ire); 7954 continue; 7955 } 7956 7957 /* With ipmp we most likely look at the ipmp ill here */ 7958 dir->dir_dmactype = dst_ill->ill_mactype; 7959 7960 if (isipv4) { 7961 ipaddr_t v4saddr; 7962 7963 if (ip_select_source_v4(dst_ill, v4setsrc, v4daddr, 7964 connp->conn_ixa->ixa_multicast_ifaddr, zoneid, ipst, 7965 &v4saddr, NULL, &ipif_flags) != 0) { 7966 v4saddr = INADDR_ANY; 7967 ipif_flags = 0; 7968 } 7969 IN6_IPADDR_TO_V4MAPPED(v4saddr, saddr); 7970 } else { 7971 if (ip_select_source_v6(dst_ill, &v6setsrc, daddr, 7972 zoneid, ipst, B_FALSE, IPV6_PREFER_SRC_DEFAULT, 7973 saddr, NULL, &ipif_flags) != 0) { 7974 *saddr = ipv6_all_zeros; 7975 ipif_flags = 0; 7976 } 7977 } 7978 7979 dir->dir_sscope = ip_addr_scope_v6(saddr); 7980 slabel = ip6_asp_lookup(saddr, NULL, ipst); 7981 dir->dir_labelmatch = ip6_asp_labelcmp(dlabel, slabel); 7982 dir->dir_sdeprecated = (ipif_flags & IPIF_DEPRECATED) ? 
1 : 0; 7983 ire_refrele(ire); 7984 ill_refrele(dst_ill); 7985 } 7986 miocack(q, mp, iocp->ioc_count, 0); 7987 } 7988 7989 /* 7990 * Check if this is an address assigned to this machine. 7991 * Skips interfaces that are down by using ire checks. 7992 * Translates mapped addresses to v4 addresses and then 7993 * treats them as such, returning true if the v4 address 7994 * associated with this mapped address is configured. 7995 * Note: Applications will have to be careful what they do 7996 * with the response; use of mapped addresses limits 7997 * what can be done with the socket, especially with 7998 * respect to socket options and ioctls - neither IPv4 7999 * options nor IPv6 sticky options/ancillary data options 8000 * may be used. 8001 */ 8002 /* ARGSUSED */ 8003 int 8004 ip_sioctl_tmyaddr(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, 8005 ip_ioctl_cmd_t *ipip, void *dummy_ifreq) 8006 { 8007 struct sioc_addrreq *sia; 8008 sin_t *sin; 8009 ire_t *ire; 8010 mblk_t *mp1; 8011 zoneid_t zoneid; 8012 ip_stack_t *ipst; 8013 8014 ip1dbg(("ip_sioctl_tmyaddr")); 8015 8016 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ 8017 zoneid = Q_TO_CONN(q)->conn_zoneid; 8018 ipst = CONNQ_TO_IPST(q); 8019 8020 /* Existence verified in ip_wput_nondata */ 8021 mp1 = mp->b_cont->b_cont; 8022 sia = (struct sioc_addrreq *)mp1->b_rptr; 8023 sin = (sin_t *)&sia->sa_addr; 8024 switch (sin->sin_family) { 8025 case AF_INET6: { 8026 sin6_t *sin6 = (sin6_t *)sin; 8027 8028 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 8029 ipaddr_t v4_addr; 8030 8031 IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, 8032 v4_addr); 8033 ire = ire_ftable_lookup_v4(v4_addr, 0, 0, 8034 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, NULL, 8035 MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL); 8036 } else { 8037 in6_addr_t v6addr; 8038 8039 v6addr = sin6->sin6_addr; 8040 ire = ire_ftable_lookup_v6(&v6addr, 0, 0, 8041 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, NULL, 8042 MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL); 8043 } 8044 break; 8045 } 8046 case AF_INET: { 8047 ipaddr_t v4addr; 8048 8049 v4addr = sin->sin_addr.s_addr; 8050 ire = ire_ftable_lookup_v4(v4addr, 0, 0, 8051 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, 8052 NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL); 8053 break; 8054 } 8055 default: 8056 return (EAFNOSUPPORT); 8057 } 8058 if (ire != NULL) { 8059 sia->sa_res = 1; 8060 ire_refrele(ire); 8061 } else { 8062 sia->sa_res = 0; 8063 } 8064 return (0); 8065 } 8066 8067 /* 8068 * Check if this is an address assigned on-link i.e. neighbor, 8069 * and makes sure it's reachable from the current zone. 8070 * Returns true for my addresses as well. 8071 * Translates mapped addresses to v4 addresses and then 8072 * treats them as such, returning true if the v4 address 8073 * associated with this mapped address is configured. 8074 * Note: Applications will have to be careful what they do 8075 * with the response; use of mapped addresses limits 8076 * what can be done with the socket, especially with 8077 * respect to socket options and ioctls - neither IPv4 8078 * options nor IPv6 sticky options/ancillary data options 8079 * may be used. 
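 *
 * Illustration only -- a hypothetical userland caller (not part of
 * this file):
 *
 *	struct sioc_addrreq sar;
 *	sin_t *sin = (sin_t *)&sar.sa_addr;
 *
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = inet_addr("192.0.2.1");
 *	if (ioctl(s, SIOCTMYADDR, &sar) == 0 && sar.sa_res != 0)
 *		the address is configured on this node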
8080  */
8081 /* ARGSUSED */
8082 int
8083 ip_sioctl_tonlink(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
8084     ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
8085 {
8086 	struct sioc_addrreq *sia;
8087 	sin_t *sin;
8088 	mblk_t	*mp1;
8089 	ire_t *ire = NULL;
8090 	zoneid_t zoneid;
8091 	ip_stack_t *ipst;
8092 
8093 	ip1dbg(("ip_sioctl_tonlink"));
8094 
8095 	ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */
8096 	zoneid = Q_TO_CONN(q)->conn_zoneid;
8097 	ipst = CONNQ_TO_IPST(q);
8098 
8099 	/* Existence verified in ip_wput_nondata */
8100 	mp1 = mp->b_cont->b_cont;
8101 	sia = (struct sioc_addrreq *)mp1->b_rptr;
8102 	sin = (sin_t *)&sia->sa_addr;
8103 
8104 	/*
8105 	 * We check for IRE_ONLINK and exclude IRE_BROADCAST|IRE_MULTICAST
8106 	 * to make sure we only look at on-link unicast address.
8107 	 */
8108 	switch (sin->sin_family) {
8109 	case AF_INET6: {
8110 		sin6_t *sin6 = (sin6_t *)sin;
8111 
8112 		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
8113 			ipaddr_t v4_addr;
8114 
8115 			IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr,
8116 			    v4_addr);
8117 			if (!CLASSD(v4_addr)) {
8118 				ire = ire_ftable_lookup_v4(v4_addr, 0, 0, 0,
8119 				    NULL, zoneid, NULL, MATCH_IRE_DSTONLY,
8120 				    0, ipst, NULL);
8121 			}
8122 		} else {
8123 			in6_addr_t v6addr;
8124 
8125 			v6addr = sin6->sin6_addr;
8126 			if (!IN6_IS_ADDR_MULTICAST(&v6addr)) {
8127 				ire = ire_ftable_lookup_v6(&v6addr, 0, 0, 0,
8128 				    NULL, zoneid, NULL, MATCH_IRE_DSTONLY, 0,
8129 				    ipst, NULL);
8130 			}
8131 		}
8132 		break;
8133 	}
8134 	case AF_INET: {
8135 		ipaddr_t v4addr;
8136 
8137 		v4addr = sin->sin_addr.s_addr;
8138 		if (!CLASSD(v4addr)) {
8139 			ire = ire_ftable_lookup_v4(v4addr, 0, 0, 0, NULL,
8140 			    zoneid, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL);
8141 		}
8142 		break;
8143 	}
8144 	default:
8145 		return (EAFNOSUPPORT);
8146 	}
8147 	sia->sa_res = 0;
8148 	if (ire != NULL) {
8149 		ASSERT(!(ire->ire_type & IRE_MULTICAST));
8150 
8151 		if ((ire->ire_type & IRE_ONLINK) &&
8152 		    !(ire->ire_type & IRE_BROADCAST))
8153 			sia->sa_res = 1;
8154 		ire_refrele(ire);
8155 	}
8156 	return (0);
8157 }
8158 
8159 /*
8160  * TBD: implement when kernel maintains a list of site prefixes.
8161  */
8162 /* ARGSUSED */
8163 int
8164 ip_sioctl_tmysite(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
8165     ip_ioctl_cmd_t *ipip, void *ifreq)
8166 {
8167 	return (ENXIO);
8168 }
8169 
8170 /* ARP IOCTLs. */
8171 /* ARGSUSED */
8172 int
8173 ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
8174     ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
8175 {
8176 	int		err;
8177 	ipaddr_t	ipaddr;
8178 	struct iocblk	*iocp;
8179 	conn_t		*connp;
8180 	struct arpreq	*ar;
8181 	struct xarpreq	*xar;
8182 	int		arp_flags, flags, alength;
8183 	uchar_t		*lladdr;
8184 	ip_stack_t	*ipst;
8185 	ill_t		*ill = ipif->ipif_ill;
8186 	ill_t		*proxy_ill = NULL;
8187 	ipmp_arpent_t	*entp = NULL;
8188 	boolean_t	proxyarp = B_FALSE;
8189 	boolean_t	if_arp_ioctl = B_FALSE;
8190 	ncec_t		*ncec = NULL;
8191 	nce_t		*nce;
8192 
8193 	ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL);
8194 	connp = Q_TO_CONN(q);
8195 	ipst = connp->conn_netstack->netstack_ip;
8196 	iocp = (struct iocblk *)mp->b_rptr;
8197 
8198 	if (ipip->ipi_cmd_type == XARP_CMD) {
8199 		/* We have a chain - M_IOCTL-->MI_COPY_MBLK-->XARPREQ_MBLK */
8200 		xar = (struct xarpreq *)mp->b_cont->b_cont->b_rptr;
8201 		ar = NULL;
8202 
8203 		arp_flags = xar->xarp_flags;
8204 		lladdr = (uchar_t *)LLADDR(&xar->xarp_ha);
8205 		if_arp_ioctl = (xar->xarp_ha.sdl_nlen != 0);
8206 		/*
8207 		 * Validate against user's link layer address length
8208 		 * input and name and addr length limits.
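		 *
		 * For example (Ethernet assumed), a SIOCSXARP caller that
		 * names the interface packs the request as:
		 *
		 *	xar->xarp_ha.sdl_family = AF_LINK;
		 *	xar->xarp_ha.sdl_nlen = 4;	/- "net0", no NUL -/
		 *	xar->xarp_ha.sdl_alen = 6;	/- MAC length -/
		 *	/- name bytes, then MAC bytes, in sdl_data[] -/
		 *
		 * and the checks below insist that sdl_alen matches the
		 * ill's physical address length and that name plus address
		 * fit in sdl_data[].  ("net0" is a hypothetical name.)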
8209 		 */
8210 		alength = ill->ill_phys_addr_length;
8211 		if (ipip->ipi_cmd == SIOCSXARP) {
8212 			if (alength != xar->xarp_ha.sdl_alen ||
8213 			    (alength + xar->xarp_ha.sdl_nlen >
8214 			    sizeof (xar->xarp_ha.sdl_data)))
8215 				return (EINVAL);
8216 		}
8217 	} else {
8218 		/* We have a chain - M_IOCTL-->MI_COPY_MBLK-->ARPREQ_MBLK */
8219 		ar = (struct arpreq *)mp->b_cont->b_cont->b_rptr;
8220 		xar = NULL;
8221 
8222 		arp_flags = ar->arp_flags;
8223 		lladdr = (uchar_t *)ar->arp_ha.sa_data;
8224 		/*
8225 		 * Theoretically, the sa_family could tell us what link
8226 		 * layer type this operation is trying to deal with.  By
8227 		 * common usage AF_UNSPEC means ethernet.  We'll assume
8228 		 * any attempt to use the SIOC?ARP ioctls is for ethernet,
8229 		 * for now.  Our new SIOC*XARP ioctls can be used more
8230 		 * generally.
8231 		 *
8232 		 * If the underlying media happens to have a non-6-byte
8233 		 * address, the arp module will fail set/get, but the del
8234 		 * operation will succeed.
8235 		 */
8236 		alength = 6;
8237 		if ((ipip->ipi_cmd != SIOCDARP) &&
8238 		    (alength != ill->ill_phys_addr_length)) {
8239 			return (EINVAL);
8240 		}
8241 	}
8242 
8243 	/* Translate ATF* flags to NCE* flags */
8244 	flags = 0;
8245 	if (arp_flags & ATF_AUTHORITY)
8246 		flags |= NCE_F_AUTHORITY;
8247 	if (arp_flags & ATF_PERM)
8248 		flags |= NCE_F_NONUD; /* not subject to aging */
8249 	if (arp_flags & ATF_PUBL)
8250 		flags |= NCE_F_PUBLISH;
8251 
8252 	/*
8253 	 * IPMP ARP special handling:
8254 	 *
8255 	 * 1. Since ARP mappings must appear consistent across the group,
8256 	 *    prohibit changing ARP mappings on the underlying interfaces.
8257 	 *
8258 	 * 2. Since ARP mappings for IPMP data addresses are maintained by
8259 	 *    IP itself, prohibit changing them.
8260 	 *
8261 	 * 3. For proxy ARP, use a functioning hardware address in the group,
8262 	 *    provided one exists.  If one doesn't, just add the entry as-is;
8263 	 *    ipmp_illgrp_refresh_arpent() will refresh it if things change.
8264 	 */
8265 	if (IS_UNDER_IPMP(ill)) {
8266 		if (ipip->ipi_cmd != SIOCGARP && ipip->ipi_cmd != SIOCGXARP)
8267 			return (EPERM);
8268 	}
8269 	if (IS_IPMP(ill)) {
8270 		ipmp_illgrp_t *illg = ill->ill_grp;
8271 
8272 		switch (ipip->ipi_cmd) {
8273 		case SIOCSARP:
8274 		case SIOCSXARP:
8275 			proxy_ill = ipmp_illgrp_find_ill(illg, lladdr, alength);
8276 			if (proxy_ill != NULL) {
8277 				proxyarp = B_TRUE;
8278 				if (!ipmp_ill_is_active(proxy_ill))
8279 					proxy_ill = ipmp_illgrp_next_ill(illg);
8280 				if (proxy_ill != NULL)
8281 					lladdr = proxy_ill->ill_phys_addr;
8282 			}
8283 			/* FALLTHRU */
8284 		}
8285 	}
8286 
8287 	ipaddr = sin->sin_addr.s_addr;
8288 	/*
8289 	 * don't match across illgrp per case (1) and (2).
8290 	 * XXX use IS_IPMP(ill) like ndp_sioc_update?
8291 	 */
8292 	nce = nce_lookup_v4(ill, &ipaddr);
8293 	if (nce != NULL)
8294 		ncec = nce->nce_common;
8295 
8296 	switch (iocp->ioc_cmd) {
8297 	case SIOCDARP:
8298 	case SIOCDXARP: {
8299 		/*
8300 		 * Delete the NCE if any.
8301 		 */
8302 		if (ncec == NULL) {
8303 			iocp->ioc_error = ENXIO;
8304 			break;
8305 		}
8306 		/* Don't allow changes to arp mappings of local addresses. */
8307 		if (NCE_MYADDR(ncec)) {
8308 			nce_refrele(nce);
8309 			return (ENOTSUP);
8310 		}
8311 		iocp->ioc_error = 0;
8312 
8313 		/*
8314 		 * Delete the nce_common which has ncec_ill set to ipmp_ill.
8315 		 * This will delete all the nce entries on the under_ills.
8316 		 */
8317 		ncec_delete(ncec);
8318 		/*
8319 		 * Once the NCE has been deleted, then the ire_dep* consistency
8320 		 * mechanism will find any IRE which depended on the now
8321 		 * condemned NCE (as part of sending packets).
8322 * That mechanism handles redirects by deleting redirects 8323 * that refer to UNREACHABLE nces. 8324 */ 8325 break; 8326 } 8327 case SIOCGARP: 8328 case SIOCGXARP: 8329 if (ncec != NULL) { 8330 lladdr = ncec->ncec_lladdr; 8331 flags = ncec->ncec_flags; 8332 iocp->ioc_error = 0; 8333 ip_sioctl_garp_reply(mp, ncec->ncec_ill, lladdr, flags); 8334 } else { 8335 iocp->ioc_error = ENXIO; 8336 } 8337 break; 8338 case SIOCSARP: 8339 case SIOCSXARP: 8340 /* Don't allow changes to arp mappings of local addresses. */ 8341 if (ncec != NULL && NCE_MYADDR(ncec)) { 8342 nce_refrele(nce); 8343 return (ENOTSUP); 8344 } 8345 8346 /* static arp entries will undergo NUD if ATF_PERM is not set */ 8347 flags |= NCE_F_STATIC; 8348 if (!if_arp_ioctl) { 8349 ip_nce_lookup_and_update(&ipaddr, NULL, ipst, 8350 lladdr, alength, flags); 8351 } else { 8352 ipif_t *ipif = ipif_get_next_ipif(NULL, ill); 8353 if (ipif != NULL) { 8354 ip_nce_lookup_and_update(&ipaddr, ipif, ipst, 8355 lladdr, alength, flags); 8356 ipif_refrele(ipif); 8357 } 8358 } 8359 if (nce != NULL) { 8360 nce_refrele(nce); 8361 nce = NULL; 8362 } 8363 /* 8364 * NCE_F_STATIC entries will be added in state ND_REACHABLE 8365 * by nce_add_common() 8366 */ 8367 err = nce_lookup_then_add_v4(ill, lladdr, 8368 ill->ill_phys_addr_length, &ipaddr, flags, ND_UNCHANGED, 8369 &nce); 8370 if (err == EEXIST) { 8371 ncec = nce->nce_common; 8372 mutex_enter(&ncec->ncec_lock); 8373 ncec->ncec_state = ND_REACHABLE; 8374 ncec->ncec_flags = flags; 8375 nce_update(ncec, ND_UNCHANGED, lladdr); 8376 mutex_exit(&ncec->ncec_lock); 8377 err = 0; 8378 } 8379 if (nce != NULL) { 8380 nce_refrele(nce); 8381 nce = NULL; 8382 } 8383 if (IS_IPMP(ill) && err == 0) { 8384 entp = ipmp_illgrp_create_arpent(ill->ill_grp, 8385 proxyarp, ipaddr, lladdr, ill->ill_phys_addr_length, 8386 flags); 8387 if (entp == NULL || (proxyarp && proxy_ill == NULL)) { 8388 iocp->ioc_error = (entp == NULL ? ENOMEM : 0); 8389 break; 8390 } 8391 } 8392 iocp->ioc_error = err; 8393 } 8394 8395 if (nce != NULL) { 8396 nce_refrele(nce); 8397 } 8398 8399 /* 8400 * If we created an IPMP ARP entry, mark that we've notified ARP. 8401 */ 8402 if (entp != NULL) 8403 ipmp_illgrp_mark_arpent(ill->ill_grp, entp); 8404 8405 return (iocp->ioc_error); 8406 } 8407 8408 /* 8409 * Parse an [x]arpreq structure coming down SIOC[GSD][X]ARP ioctls, identify 8410 * the associated sin and refhold and return the associated ipif via `ci'. 
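 *
 * Illustration only -- a classic (non-extended) request as parsed here,
 * from a hypothetical userland caller:
 *
 *	struct arpreq ar;
 *	sin_t *sin = (sin_t *)&ar.arp_pa;
 *
 *	bzero(&ar, sizeof (ar));
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = inet_addr("192.0.2.44");
 *	(void) ioctl(s, SIOCGARP, &ar);	/- fills arp_ha, arp_flags -/
 *
 * With no interface name supplied, the ipif must be located from the
 * address alone, as done in the lookup logic below.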
8411  */
8412 int
8413 ip_extract_arpreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip,
8414     cmd_info_t *ci)
8415 {
8416 	mblk_t	*mp1;
8417 	sin_t	*sin;
8418 	conn_t	*connp;
8419 	ipif_t	*ipif;
8420 	ire_t	*ire = NULL;
8421 	ill_t	*ill = NULL;
8422 	boolean_t exists;
8423 	ip_stack_t *ipst;
8424 	struct arpreq *ar;
8425 	struct xarpreq *xar;
8426 	struct sockaddr_dl *sdl;
8427 
8428 	/* ioctl comes down on a conn */
8429 	ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL);
8430 	connp = Q_TO_CONN(q);
8431 	if (connp->conn_family == AF_INET6)
8432 		return (ENXIO);
8433 
8434 	ipst = connp->conn_netstack->netstack_ip;
8435 
8436 	/* Verified in ip_wput_nondata */
8437 	mp1 = mp->b_cont->b_cont;
8438 
8439 	if (ipip->ipi_cmd_type == XARP_CMD) {
8440 		ASSERT(MBLKL(mp1) >= sizeof (struct xarpreq));
8441 		xar = (struct xarpreq *)mp1->b_rptr;
8442 		sin = (sin_t *)&xar->xarp_pa;
8443 		sdl = &xar->xarp_ha;
8444 
8445 		if (sdl->sdl_family != AF_LINK || sin->sin_family != AF_INET)
8446 			return (ENXIO);
8447 		if (sdl->sdl_nlen >= LIFNAMSIZ)
8448 			return (EINVAL);
8449 	} else {
8450 		ASSERT(ipip->ipi_cmd_type == ARP_CMD);
8451 		ASSERT(MBLKL(mp1) >= sizeof (struct arpreq));
8452 		ar = (struct arpreq *)mp1->b_rptr;
8453 		sin = (sin_t *)&ar->arp_pa;
8454 	}
8455 
8456 	if (ipip->ipi_cmd_type == XARP_CMD && sdl->sdl_nlen != 0) {
8457 		ipif = ipif_lookup_on_name(sdl->sdl_data, sdl->sdl_nlen,
8458 		    B_FALSE, &exists, B_FALSE, ALL_ZONES, ipst);
8459 		if (ipif == NULL)
8460 			return (ENXIO);
8461 		if (ipif->ipif_id != 0) {
8462 			ipif_refrele(ipif);
8463 			return (ENXIO);
8464 		}
8465 	} else {
8466 		/*
8467 		 * Either an SIOC[DGS]ARP or an SIOC[DGS]XARP with an sdl_nlen
8468 		 * of 0: use the IP address to find the ipif.  If the IP
8469 		 * address is an IPMP test address, ire_ftable_lookup() will
8470 		 * find the wrong ill, so we first do an ipif_lookup_addr().
8471 		 */
8472 		ipif = ipif_lookup_addr(sin->sin_addr.s_addr, NULL, ALL_ZONES,
8473 		    ipst);
8474 		if (ipif == NULL) {
8475 			ire = ire_ftable_lookup_v4(sin->sin_addr.s_addr,
8476 			    0, 0, IRE_IF_RESOLVER, NULL, ALL_ZONES,
8477 			    NULL, MATCH_IRE_TYPE, 0, ipst, NULL);
8478 			if (ire == NULL || ((ill = ire->ire_ill) == NULL)) {
8479 				if (ire != NULL)
8480 					ire_refrele(ire);
8481 				return (ENXIO);
8482 			}
8483 			ASSERT(ire != NULL && ill != NULL);
8484 			ipif = ill->ill_ipif;
8485 			ipif_refhold(ipif);
8486 			ire_refrele(ire);
8487 		}
8488 	}
8489 
8490 	if (ipif->ipif_ill->ill_net_type != IRE_IF_RESOLVER) {
8491 		ipif_refrele(ipif);
8492 		return (ENXIO);
8493 	}
8494 
8495 	ci->ci_sin = sin;
8496 	ci->ci_ipif = ipif;
8497 	return (0);
8498 }
8499 
8500 /*
8501  * Link or unlink the illgrp on IPMP meta-interface `ill' depending on the
8502  * value of `ioccmd'.  While an illgrp is linked to an ipmp_grp_t, it is
8503  * accessible from that ipmp_grp_t, which means SIOCSLIFGROUPNAME can look it
8504  * up and thus an ill can join that illgrp.
8505  *
8506  * We use I_PLINK/I_PUNLINK to do the link/unlink operations rather than
8507  * open()/close() primarily because close() is not allowed to fail or block
8508  * forever.  On the other hand, I_PUNLINK *can* fail, and there's no reason
8509  * why anyone should ever need to I_PUNLINK an in-use IPMP stream.  To ensure
8510  * symmetric behavior (e.g., doing an I_PLINK after an I_PUNLINK undoes the
8511  * I_PUNLINK) we defer linking to I_PLINK.  Separately, we also fail attempts
8512  * to I_LINK since I_UNLINK is optional and we'd end up in an inconsistent
8513  * state if I_UNLINK didn't occur.
8514  *
8515  * Note that for each plumb/unplumb operation, we may end up here more than
8516  * once because of the way ifconfig works.  However, it's OK to link the same
8517  * illgrp more than once, or unlink an illgrp that's already unlinked.
8518  */
8519 static int
8520 ip_sioctl_plink_ipmp(ill_t *ill, int ioccmd)
8521 {
8522 	int err;
8523 	ip_stack_t *ipst = ill->ill_ipst;
8524 
8525 	ASSERT(IS_IPMP(ill));
8526 	ASSERT(IAM_WRITER_ILL(ill));
8527 
8528 	switch (ioccmd) {
8529 	case I_LINK:
8530 		return (ENOTSUP);
8531 
8532 	case I_PLINK:
8533 		rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
8534 		ipmp_illgrp_link_grp(ill->ill_grp, ill->ill_phyint->phyint_grp);
8535 		rw_exit(&ipst->ips_ipmp_lock);
8536 		break;
8537 
8538 	case I_PUNLINK:
8539 		/*
8540 		 * Require all UP ipifs be brought down prior to unlinking the
8541 		 * illgrp so any associated IREs (and other state) is torched.
8542 		 */
8543 		if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0)
8544 			return (EBUSY);
8545 
8546 		/*
8547 		 * NOTE: We hold ipmp_lock across the unlink to prevent a race
8548 		 * with an SIOCSLIFGROUPNAME request from an ill trying to
8549 		 * join this group.  Specifically: ills trying to join grab
8550 		 * ipmp_lock and bump a "pending join" counter checked by
8551 		 * ipmp_illgrp_unlink_grp().  During the unlink no new pending
8552 		 * joins can occur (since we have ipmp_lock).  Once we drop
8553 		 * ipmp_lock, subsequent SIOCSLIFGROUPNAME requests will not
8554 		 * find the illgrp (since we unlinked it) and will return
8555 		 * EAFNOSUPPORT.  This will then take them back through the
8556 		 * IPMP meta-interface plumbing logic in ifconfig, and thus
8557 		 * back through I_PLINK above.
8558 		 */
8559 		rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
8560 		err = ipmp_illgrp_unlink_grp(ill->ill_grp);
8561 		rw_exit(&ipst->ips_ipmp_lock);
8562 		return (err);
8563 	default:
8564 		break;
8565 	}
8566 	return (0);
8567 }
8568 
8569 /*
8570  * Do I_PLINK/I_LINK or I_PUNLINK/I_UNLINK with consistency checks and also
8571  * atomically set/clear the muxids.  Also complete the ioctl by acking or
8572  * naking it.  Note that the code is structured such that the link type,
8573  * whether it's persistent or not, is treated equally.  ifconfig(1M) and
8574  * its clones use the persistent link, while pppd(1M) and perhaps many
8575  * other daemons may use a non-persistent link.  When combined with some
8576  * ill_t states, linking and unlinking lower streams may be used as
8577  * indicators of dynamic re-plumbing events [see PSARC/1999/348].
8578  */
8579 /* ARGSUSED */
8580 void
8581 ip_sioctl_plink(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
8582 {
8583 	mblk_t *mp1;
8584 	struct linkblk *li;
8585 	int ioccmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd;
8586 	int err = 0;
8587 
8588 	ASSERT(ioccmd == I_PLINK || ioccmd == I_PUNLINK ||
8589 	    ioccmd == I_LINK || ioccmd == I_UNLINK);
8590 
8591 	mp1 = mp->b_cont;	/* This is the linkblk info */
8592 	li = (struct linkblk *)mp1->b_rptr;
8593 
8594 	err = ip_sioctl_plink_ipmod(ipsq, q, mp, ioccmd, li);
8595 	if (err == EINPROGRESS)
8596 		return;
8597 	if (err == 0)
8598 		miocack(q, mp, 0, 0);
8599 	else
8600 		miocnak(q, mp, 0, err);
8601 
8602 	/* Conn was refheld in ip_sioctl_copyin_setup */
8603 	if (CONN_Q(q)) {
8604 		CONN_DEC_IOCTLREF(Q_TO_CONN(q));
8605 		CONN_OPER_PENDING_DONE(Q_TO_CONN(q));
8606 	}
8607 }
8608 
8609 /*
8610  * Process I_{P}LINK and I_{P}UNLINK requests named by `ioccmd' and pointed to
8611  * by `mp' and `li' for the IP module stream (if li->l_qbot is in fact an IP
8612  * module stream).
8613 * Returns zero on success, EINPROGRESS if the operation is still pending, or 8614 * an error code on failure. 8615 */ 8616 static int 8617 ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd, 8618 struct linkblk *li) 8619 { 8620 int err = 0; 8621 ill_t *ill; 8622 queue_t *ipwq, *dwq; 8623 const char *name; 8624 struct qinit *qinfo; 8625 boolean_t islink = (ioccmd == I_PLINK || ioccmd == I_LINK); 8626 boolean_t entered_ipsq = B_FALSE; 8627 boolean_t is_ip = B_FALSE; 8628 arl_t *arl; 8629 8630 /* 8631 * Walk the lower stream to verify it's the IP module stream. 8632 * The IP module is identified by its name, wput function, 8633 * and non-NULL q_next. STREAMS ensures that the lower stream 8634 * (li->l_qbot) will not vanish until this ioctl completes. 8635 */ 8636 for (ipwq = li->l_qbot; ipwq != NULL; ipwq = ipwq->q_next) { 8637 qinfo = ipwq->q_qinfo; 8638 name = qinfo->qi_minfo->mi_idname; 8639 if (name != NULL && strcmp(name, ip_mod_info.mi_idname) == 0 && 8640 qinfo->qi_putp != (pfi_t)ip_lwput && ipwq->q_next != NULL) { 8641 is_ip = B_TRUE; 8642 break; 8643 } 8644 if (name != NULL && strcmp(name, arp_mod_info.mi_idname) == 0 && 8645 qinfo->qi_putp != (pfi_t)ip_lwput && ipwq->q_next != NULL) { 8646 break; 8647 } 8648 } 8649 8650 /* 8651 * If this isn't an IP module stream, bail. 8652 */ 8653 if (ipwq == NULL) 8654 return (0); 8655 8656 if (!is_ip) { 8657 arl = (arl_t *)ipwq->q_ptr; 8658 ill = arl_to_ill(arl); 8659 if (ill == NULL) 8660 return (0); 8661 } else { 8662 ill = ipwq->q_ptr; 8663 } 8664 ASSERT(ill != NULL); 8665 8666 if (ipsq == NULL) { 8667 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink, 8668 NEW_OP, B_FALSE); 8669 if (ipsq == NULL) { 8670 if (!is_ip) 8671 ill_refrele(ill); 8672 return (EINPROGRESS); 8673 } 8674 entered_ipsq = B_TRUE; 8675 } 8676 ASSERT(IAM_WRITER_ILL(ill)); 8677 mutex_enter(&ill->ill_lock); 8678 if (!is_ip) { 8679 if (islink && ill->ill_muxid == 0) { 8680 /* 8681 * Plumbing has to be done with IP plumbed first, arp 8682 * second, but here we have arp being plumbed first. 8683 */ 8684 mutex_exit(&ill->ill_lock); 8685 if (entered_ipsq) 8686 ipsq_exit(ipsq); 8687 ill_refrele(ill); 8688 return (EINVAL); 8689 } 8690 } 8691 mutex_exit(&ill->ill_lock); 8692 if (!is_ip) { 8693 arl->arl_muxid = islink ? li->l_index : 0; 8694 ill_refrele(ill); 8695 goto done; 8696 } 8697 8698 if (IS_IPMP(ill) && (err = ip_sioctl_plink_ipmp(ill, ioccmd)) != 0) 8699 goto done; 8700 8701 /* 8702 * As part of I_{P}LINKing, stash the number of downstream modules and 8703 * the read queue of the module immediately below IP in the ill. 8704 * These are used during the capability negotiation below. 8705 */ 8706 ill->ill_lmod_rq = NULL; 8707 ill->ill_lmod_cnt = 0; 8708 if (islink && ((dwq = ipwq->q_next) != NULL)) { 8709 ill->ill_lmod_rq = RD(dwq); 8710 for (; dwq != NULL; dwq = dwq->q_next) 8711 ill->ill_lmod_cnt++; 8712 } 8713 8714 ill->ill_muxid = islink ? li->l_index : 0; 8715 8716 /* 8717 * Mark the ipsq busy until the capability operations initiated below 8718 * complete. The PLINK/UNLINK ioctl itself completes when our caller 8719 * returns, but the capability operation may complete asynchronously 8720 * much later. 8721 */ 8722 ipsq_current_start(ipsq, ill->ill_ipif, ioccmd); 8723 /* 8724 * If there's at least one up ipif on this ill, then we're bound to 8725 * the underlying driver via DLPI. In that case, renegotiate 8726 * capabilities to account for any possible change in modules 8727 * interposed between IP and the driver. 
8728 	 */
8729 	if (ill->ill_ipif_up_count > 0) {
8730 		if (islink)
8731 			ill_capability_probe(ill);
8732 		else
8733 			ill_capability_reset(ill, B_FALSE);
8734 	}
8735 	ipsq_current_finish(ipsq);
8736 done:
8737 	if (entered_ipsq)
8738 		ipsq_exit(ipsq);
8739 
8740 	return (err);
8741 }
8742 
8743 /*
8744  * Search the ioctl command in the ioctl tables and return a pointer
8745  * to the ioctl command information.  The ioctl command tables are
8746  * static and fully populated at compile time.
8747  */
8748 ip_ioctl_cmd_t *
8749 ip_sioctl_lookup(int ioc_cmd)
8750 {
8751 	int index;
8752 	ip_ioctl_cmd_t *ipip;
8753 	ip_ioctl_cmd_t *ipip_end;
8754 
8755 	if (ioc_cmd == IPI_DONTCARE)
8756 		return (NULL);
8757 
8758 	/*
8759 	 * Do a 2 step search.  First search the indexed table
8760 	 * based on the least significant byte of the ioctl cmd.
8761 	 * If we don't find a match, then search the misc table
8762 	 * serially.
8763 	 */
8764 	index = ioc_cmd & 0xFF;
8765 	if (index < ip_ndx_ioctl_count) {
8766 		ipip = &ip_ndx_ioctl_table[index];
8767 		if (ipip->ipi_cmd == ioc_cmd) {
8768 			/* Found a match in the ndx table */
8769 			return (ipip);
8770 		}
8771 	}
8772 
8773 	/* Search the misc table */
8774 	ipip_end = &ip_misc_ioctl_table[ip_misc_ioctl_count];
8775 	for (ipip = ip_misc_ioctl_table; ipip < ipip_end; ipip++) {
8776 		if (ipip->ipi_cmd == ioc_cmd)
8777 			/* Found a match in the misc table */
8778 			return (ipip);
8779 	}
8780 
8781 	return (NULL);
8782 }
8783 
8784 /*
8785  * Helper function for ip_sioctl_getsetprop(), which does some sanity checks
8786  */
8787 static boolean_t
8788 getset_ioctl_checks(mblk_t *mp)
8789 {
8790 	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
8791 	mblk_t *mp1 = mp->b_cont;
8792 	mod_ioc_prop_t *pioc;
8793 	uint_t flags;
8794 	uint_t pioc_size;
8795 
8796 	/* do sanity checks on various arguments */
8797 	if (mp1 == NULL || iocp->ioc_count == 0 ||
8798 	    iocp->ioc_count == TRANSPARENT) {
8799 		return (B_FALSE);
8800 	}
8801 	if (msgdsize(mp1) < iocp->ioc_count) {
8802 		if (!pullupmsg(mp1, iocp->ioc_count))
8803 			return (B_FALSE);
8804 	}
8805 
8806 	pioc = (mod_ioc_prop_t *)mp1->b_rptr;
8807 
8808 	/* sanity checks on mpr_valsize */
8809 	pioc_size = sizeof (mod_ioc_prop_t);
8810 	if (pioc->mpr_valsize != 0)
8811 		pioc_size += pioc->mpr_valsize - 1;
8812 
8813 	if (iocp->ioc_count != pioc_size)
8814 		return (B_FALSE);
8815 
8816 	flags = pioc->mpr_flags;
8817 	if (iocp->ioc_cmd == SIOCSETPROP) {
8818 		/*
8819 		 * One can either reset the value to its default value or
8820 		 * change the current value or append/remove the value from
8821 		 * a multi-valued property.
8822 		 */
8823 		if ((flags & MOD_PROP_DEFAULT) != MOD_PROP_DEFAULT &&
8824 		    flags != MOD_PROP_ACTIVE &&
8825 		    flags != (MOD_PROP_ACTIVE|MOD_PROP_APPEND) &&
8826 		    flags != (MOD_PROP_ACTIVE|MOD_PROP_REMOVE))
8827 			return (B_FALSE);
8828 	} else {
8829 		ASSERT(iocp->ioc_cmd == SIOCGETPROP);
8830 
8831 		/*
8832 		 * One can retrieve only one kind of property information
8833 		 * at a time.
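		 *
		 * For example, a SIOCGETPROP caller sets mpr_flags to one
		 * of MOD_PROP_ACTIVE (current value), MOD_PROP_DEFAULT,
		 * MOD_PROP_POSSIBLE (allowed range) or MOD_PROP_PERM
		 * (permissions); the check below only insists that at
		 * least one of these kinds was requested.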
8834 		 */
8835 		if ((flags & MOD_PROP_ACTIVE) != MOD_PROP_ACTIVE &&
8836 		    (flags & MOD_PROP_DEFAULT) != MOD_PROP_DEFAULT &&
8837 		    (flags & MOD_PROP_POSSIBLE) != MOD_PROP_POSSIBLE &&
8838 		    (flags & MOD_PROP_PERM) != MOD_PROP_PERM)
8839 			return (B_FALSE);
8840 	}
8841 
8842 	return (B_TRUE);
8843 }
8844 
8845 /*
8846  * Process the SIOC{SET|GET}PROP ioctls
8847  */
8848 /* ARGSUSED */
8849 static void
8850 ip_sioctl_getsetprop(queue_t *q, mblk_t *mp)
8851 {
8852 	struct iocblk	*iocp = (struct iocblk *)mp->b_rptr;
8853 	mblk_t		*mp1 = mp->b_cont;
8854 	mod_ioc_prop_t	*pioc;
8855 	mod_prop_info_t *ptbl = NULL, *pinfo = NULL;
8856 	ip_stack_t	*ipst;
8857 	icmp_stack_t	*is;
8858 	tcp_stack_t	*tcps;
8859 	sctp_stack_t	*sctps;
8860 	udp_stack_t	*us;
8861 	netstack_t	*stack;
8862 	void		*cbarg;
8863 	cred_t		*cr;
8864 	boolean_t	set;
8865 	int		err;
8866 
8867 	ASSERT(q->q_next == NULL);
8868 	ASSERT(CONN_Q(q));
8869 
8870 	if (!getset_ioctl_checks(mp)) {
8871 		miocnak(q, mp, 0, EINVAL);
8872 		return;
8873 	}
8874 	ipst = CONNQ_TO_IPST(q);
8875 	stack = ipst->ips_netstack;
8876 	pioc = (mod_ioc_prop_t *)mp1->b_rptr;
8877 
8878 	switch (pioc->mpr_proto) {
8879 	case MOD_PROTO_IP:
8880 	case MOD_PROTO_IPV4:
8881 	case MOD_PROTO_IPV6:
8882 		ptbl = ipst->ips_propinfo_tbl;
8883 		cbarg = ipst;
8884 		break;
8885 	case MOD_PROTO_RAWIP:
8886 		is = stack->netstack_icmp;
8887 		ptbl = is->is_propinfo_tbl;
8888 		cbarg = is;
8889 		break;
8890 	case MOD_PROTO_TCP:
8891 		tcps = stack->netstack_tcp;
8892 		ptbl = tcps->tcps_propinfo_tbl;
8893 		cbarg = tcps;
8894 		break;
8895 	case MOD_PROTO_UDP:
8896 		us = stack->netstack_udp;
8897 		ptbl = us->us_propinfo_tbl;
8898 		cbarg = us;
8899 		break;
8900 	case MOD_PROTO_SCTP:
8901 		sctps = stack->netstack_sctp;
8902 		ptbl = sctps->sctps_propinfo_tbl;
8903 		cbarg = sctps;
8904 		break;
8905 	default:
8906 		miocnak(q, mp, 0, EINVAL);
8907 		return;
8908 	}
8909 
8910 	/* search for given property in respective protocol propinfo table */
8911 	for (pinfo = ptbl; pinfo->mpi_name != NULL; pinfo++) {
8912 		if (strcmp(pinfo->mpi_name, pioc->mpr_name) == 0 &&
8913 		    pinfo->mpi_proto == pioc->mpr_proto)
8914 			break;
8915 	}
8916 	if (pinfo->mpi_name == NULL) {
8917 		miocnak(q, mp, 0, ENOENT);
8918 		return;
8919 	}
8920 
8921 	set = (iocp->ioc_cmd == SIOCSETPROP) ? B_TRUE : B_FALSE;
8922 	if (set && pinfo->mpi_setf != NULL) {
8923 		cr = msg_getcred(mp, NULL);
8924 		if (cr == NULL)
8925 			cr = iocp->ioc_cr;
8926 		err = pinfo->mpi_setf(cbarg, cr, pinfo, pioc->mpr_ifname,
8927 		    pioc->mpr_val, pioc->mpr_flags);
8928 	} else if (!set && pinfo->mpi_getf != NULL) {
8929 		err = pinfo->mpi_getf(cbarg, pinfo, pioc->mpr_ifname,
8930 		    pioc->mpr_val, pioc->mpr_valsize, pioc->mpr_flags);
8931 	} else {
8932 		err = EPERM;
8933 	}
8934 
8935 	if (err != 0) {
8936 		miocnak(q, mp, 0, err);
8937 	} else {
8938 		if (set)
8939 			miocack(q, mp, 0, 0);
8940 		else	/* For get, we need to return back the data */
8941 			miocack(q, mp, iocp->ioc_count, 0);
8942 	}
8943 }
8944 
8945 /*
8946  * Process the legacy ND_GET, ND_SET ioctls just for {ip|ip6}_forwarding,
8947  * as several routing daemons have unfortunately used these 'unpublished'
8948  * but well-known ioctls.
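 *
 * Illustration only -- a hypothetical daemon issues ND_SET with the
 * <name>'\0'<value>'\0' layout parsed below:
 *
 *	char buf[] = "ip_forwarding\0" "1";	/- trailing NUL implied -/
 *	struct strioctl si;
 *
 *	si.ic_cmd = ND_SET;
 *	si.ic_timout = 0;
 *	si.ic_len = sizeof (buf);
 *	si.ic_dp = buf;
 *	(void) ioctl(s, I_STR, &si);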
8949  */
8950 /* ARGSUSED */
8951 static void
8952 ip_process_legacy_nddprop(queue_t *q, mblk_t *mp)
8953 {
8954 	struct iocblk	*iocp = (struct iocblk *)mp->b_rptr;
8955 	mblk_t		*mp1 = mp->b_cont;
8956 	char		*pname, *pval, *buf;
8957 	uint_t		bufsize, proto;
8958 	mod_prop_info_t *ptbl = NULL, *pinfo = NULL;
8959 	ip_stack_t	*ipst;
8960 	int		err = 0;
8961 
8962 	ASSERT(CONN_Q(q));
8963 	ipst = CONNQ_TO_IPST(q);
8964 
8965 	if (iocp->ioc_count == 0 || mp1 == NULL) {
8966 		miocnak(q, mp, 0, EINVAL);
8967 		return;
8968 	}
8969 
8970 	mp1->b_datap->db_lim[-1] = '\0';	/* Force null termination */
8971 	pval = buf = pname = (char *)mp1->b_rptr;
8972 	bufsize = MBLKL(mp1);
8973 
8974 	if (strcmp(pname, "ip_forwarding") == 0) {
8975 		pname = "forwarding";
8976 		proto = MOD_PROTO_IPV4;
8977 	} else if (strcmp(pname, "ip6_forwarding") == 0) {
8978 		pname = "forwarding";
8979 		proto = MOD_PROTO_IPV6;
8980 	} else {
8981 		miocnak(q, mp, 0, EINVAL);
8982 		return;
8983 	}
8984 
8985 	ptbl = ipst->ips_propinfo_tbl;
8986 	for (pinfo = ptbl; pinfo->mpi_name != NULL; pinfo++) {
8987 		if (strcmp(pinfo->mpi_name, pname) == 0 &&
8988 		    pinfo->mpi_proto == proto)
8989 			break;
8990 	}
8991 
8992 	ASSERT(pinfo->mpi_name != NULL);
8993 
8994 	switch (iocp->ioc_cmd) {
8995 	case ND_GET:
8996 		if ((err = pinfo->mpi_getf(ipst, pinfo, NULL, buf, bufsize,
8997 		    0)) == 0) {
8998 			miocack(q, mp, iocp->ioc_count, 0);
8999 			return;
9000 		}
9001 		break;
9002 	case ND_SET:
9003 		/*
9004 		 * buffer will have property name and value in the following
9005 		 * format,
9006 		 * <property name>'\0'<property value>'\0', extract them;
9007 		 */
9008 		while (*pval++)
9009 			noop;
9010 
9011 		if (!*pval || pval >= (char *)mp1->b_wptr) {
9012 			err = EINVAL;
9013 		} else if ((err = pinfo->mpi_setf(ipst, NULL, pinfo, NULL,
9014 		    pval, 0)) == 0) {
9015 			miocack(q, mp, 0, 0);
9016 			return;
9017 		}
9018 		break;
9019 	default:
9020 		err = EINVAL;
9021 		break;
9022 	}
9023 	miocnak(q, mp, 0, err);
9024 }
9025 
9026 /*
9027  * Wrapper function for resuming deferred ioctl processing
9028  * Used for SIOCGDSTINFO, SIOCGIP6ADDRPOLICY, SIOCGMSFILTER,
9029  * SIOCSMSFILTER, SIOCGIPMSFILTER, and SIOCSIPMSFILTER currently.
9030  */
9031 /* ARGSUSED */
9032 void
9033 ip_sioctl_copyin_resume(ipsq_t *dummy_ipsq, queue_t *q, mblk_t *mp,
9034     void *dummy_arg)
9035 {
9036 	ip_sioctl_copyin_setup(q, mp);
9037 }
9038 
9039 /*
9040  * ip_sioctl_copyin_setup is called by ip_wput_nondata with any M_IOCTL message
9041  * that arrives.  Most of the IOCTLs are "socket" IOCTLs which we handle
9042  * in either I_STR or TRANSPARENT form, using the mi_copy facility.
9043  * We establish here the size of the block to be copied in.  mi_copyin
9044  * arranges for this to happen, and processing continues in ip_wput_nondata
9045  * with an M_IOCDATA message.
9046  */
9047 void
9048 ip_sioctl_copyin_setup(queue_t *q, mblk_t *mp)
9049 {
9050 	int	copyin_size;
9051 	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
9052 	ip_ioctl_cmd_t *ipip;
9053 	cred_t *cr;
9054 	ip_stack_t	*ipst;
9055 
9056 	if (CONN_Q(q))
9057 		ipst = CONNQ_TO_IPST(q);
9058 	else
9059 		ipst = ILLQ_TO_IPST(q);
9060 
9061 	ipip = ip_sioctl_lookup(iocp->ioc_cmd);
9062 	if (ipip == NULL) {
9063 		/*
9064 		 * The ioctl is not one we understand or own.
9065 		 * Pass it along to be processed down stream,
9066 		 * if this is a module instance of IP, else nak
9067 		 * the ioctl.
9068 		 */
9069 		if (q->q_next == NULL) {
9070 			goto nak;
9071 		} else {
9072 			putnext(q, mp);
9073 			return;
9074 		}
9075 	}
9076 
9077 	/*
9078 	 * If this is deferred, then we will do all the checks when we
9079 	 * come back.
9080 */ 9081 if ((iocp->ioc_cmd == SIOCGDSTINFO || 9082 iocp->ioc_cmd == SIOCGIP6ADDRPOLICY) && !ip6_asp_can_lookup(ipst)) { 9083 ip6_asp_pending_op(q, mp, ip_sioctl_copyin_resume); 9084 return; 9085 } 9086 9087 /* 9088 * Only allow a very small subset of IP ioctls on this stream if 9089 * IP is a module and not a driver. Allowing ioctls to be processed 9090 * in this case may cause assert failures or data corruption. 9091 * Typically G[L]IFFLAGS, SLIFNAME/IF_UNITSEL are the only few 9092 * ioctls allowed on an IP module stream, after which this stream 9093 * normally becomes a multiplexor (at which time the stream head 9094 * will fail all ioctls). 9095 */ 9096 if ((q->q_next != NULL) && !(ipip->ipi_flags & IPI_MODOK)) { 9097 goto nak; 9098 } 9099 9100 /* Make sure we have ioctl data to process. */ 9101 if (mp->b_cont == NULL && !(ipip->ipi_flags & IPI_NULL_BCONT)) 9102 goto nak; 9103 9104 /* 9105 * Prefer dblk credential over ioctl credential; some synthesized 9106 * ioctls have kcred set because there's no way to crhold() 9107 * a credential in some contexts. (ioc_cr is not crfree() by 9108 * the framework; the caller of ioctl needs to hold the reference 9109 * for the duration of the call). 9110 */ 9111 cr = msg_getcred(mp, NULL); 9112 if (cr == NULL) 9113 cr = iocp->ioc_cr; 9114 9115 /* Make sure normal users don't send down privileged ioctls */ 9116 if ((ipip->ipi_flags & IPI_PRIV) && 9117 (cr != NULL) && secpolicy_ip_config(cr, B_TRUE) != 0) { 9118 /* We checked the privilege earlier but log it here */ 9119 miocnak(q, mp, 0, secpolicy_ip_config(cr, B_FALSE)); 9120 return; 9121 } 9122 9123 /* 9124 * The ioctl command tables can only encode fixed length 9125 * ioctl data. If the length is variable, the table will 9126 * encode the length as zero. Such special cases are handled 9127 * below in the switch. 9128 */ 9129 if (ipip->ipi_copyin_size != 0) { 9130 mi_copyin(q, mp, NULL, ipip->ipi_copyin_size); 9131 return; 9132 } 9133 9134 switch (iocp->ioc_cmd) { 9135 case O_SIOCGIFCONF: 9136 case SIOCGIFCONF: 9137 /* 9138 * This IOCTL is hilarious. See comments in 9139 * ip_sioctl_get_ifconf for the story. 9140 */ 9141 if (iocp->ioc_count == TRANSPARENT) 9142 copyin_size = SIZEOF_STRUCT(ifconf, 9143 iocp->ioc_flag); 9144 else 9145 copyin_size = iocp->ioc_count; 9146 mi_copyin(q, mp, NULL, copyin_size); 9147 return; 9148 9149 case O_SIOCGLIFCONF: 9150 case SIOCGLIFCONF: 9151 copyin_size = SIZEOF_STRUCT(lifconf, iocp->ioc_flag); 9152 mi_copyin(q, mp, NULL, copyin_size); 9153 return; 9154 9155 case SIOCGLIFSRCOF: 9156 copyin_size = SIZEOF_STRUCT(lifsrcof, iocp->ioc_flag); 9157 mi_copyin(q, mp, NULL, copyin_size); 9158 return; 9159 9160 case SIOCGIP6ADDRPOLICY: 9161 ip_sioctl_ip6addrpolicy(q, mp); 9162 ip6_asp_table_refrele(ipst); 9163 return; 9164 9165 case SIOCSIP6ADDRPOLICY: 9166 ip_sioctl_ip6addrpolicy(q, mp); 9167 return; 9168 9169 case SIOCGDSTINFO: 9170 ip_sioctl_dstinfo(q, mp); 9171 ip6_asp_table_refrele(ipst); 9172 return; 9173 9174 case ND_SET: 9175 case ND_GET: 9176 ip_process_legacy_nddprop(q, mp); 9177 return; 9178 9179 case SIOCSETPROP: 9180 case SIOCGETPROP: 9181 ip_sioctl_getsetprop(q, mp); 9182 return; 9183 9184 case I_PLINK: 9185 case I_PUNLINK: 9186 case I_LINK: 9187 case I_UNLINK: 9188 /* 9189 * We treat non-persistent link similarly as the persistent 9190 * link case, in terms of plumbing/unplumbing, as well as 9191 * dynamic re-plumbing events indicator. See comments 9192 * in ip_sioctl_plink() for more. 
9193 * 9194 * Request can be enqueued in the 'ipsq' while waiting 9195 * to become exclusive. So bump up the conn ref. 9196 */ 9197 if (CONN_Q(q)) { 9198 CONN_INC_REF(Q_TO_CONN(q)); 9199 CONN_INC_IOCTLREF(Q_TO_CONN(q)) 9200 } 9201 ip_sioctl_plink(NULL, q, mp, NULL); 9202 return; 9203 9204 case IP_IOCTL: 9205 ip_wput_ioctl(q, mp); 9206 return; 9207 9208 case SIOCILB: 9209 /* The ioctl length varies depending on the ILB command. */ 9210 copyin_size = iocp->ioc_count; 9211 if (copyin_size < sizeof (ilb_cmd_t)) 9212 goto nak; 9213 mi_copyin(q, mp, NULL, copyin_size); 9214 return; 9215 9216 default: 9217 cmn_err(CE_PANIC, "should not happen "); 9218 } 9219 nak: 9220 if (mp->b_cont != NULL) { 9221 freemsg(mp->b_cont); 9222 mp->b_cont = NULL; 9223 } 9224 iocp->ioc_error = EINVAL; 9225 mp->b_datap->db_type = M_IOCNAK; 9226 iocp->ioc_count = 0; 9227 qreply(q, mp); 9228 } 9229 9230 static void 9231 ip_sioctl_garp_reply(mblk_t *mp, ill_t *ill, void *hwaddr, int flags) 9232 { 9233 struct arpreq *ar; 9234 struct xarpreq *xar; 9235 mblk_t *tmp; 9236 struct iocblk *iocp; 9237 int x_arp_ioctl = B_FALSE; 9238 int *flagsp; 9239 char *storage = NULL; 9240 9241 ASSERT(ill != NULL); 9242 9243 iocp = (struct iocblk *)mp->b_rptr; 9244 ASSERT(iocp->ioc_cmd == SIOCGXARP || iocp->ioc_cmd == SIOCGARP); 9245 9246 tmp = (mp->b_cont)->b_cont; /* xarpreq/arpreq */ 9247 if ((iocp->ioc_cmd == SIOCGXARP) || 9248 (iocp->ioc_cmd == SIOCSXARP)) { 9249 x_arp_ioctl = B_TRUE; 9250 xar = (struct xarpreq *)tmp->b_rptr; 9251 flagsp = &xar->xarp_flags; 9252 storage = xar->xarp_ha.sdl_data; 9253 } else { 9254 ar = (struct arpreq *)tmp->b_rptr; 9255 flagsp = &ar->arp_flags; 9256 storage = ar->arp_ha.sa_data; 9257 } 9258 9259 /* 9260 * We're done if this is not an SIOCG{X}ARP 9261 */ 9262 if (x_arp_ioctl) { 9263 storage += ill_xarp_info(&xar->xarp_ha, ill); 9264 if ((ill->ill_phys_addr_length + ill->ill_name_length) > 9265 sizeof (xar->xarp_ha.sdl_data)) { 9266 iocp->ioc_error = EINVAL; 9267 return; 9268 } 9269 } 9270 *flagsp = ATF_INUSE; 9271 /* 9272 * If /sbin/arp told us we are the authority using the "permanent" 9273 * flag, or if this is one of my addresses print "permanent" 9274 * in the /sbin/arp output. 9275 */ 9276 if ((flags & NCE_F_MYADDR) || (flags & NCE_F_AUTHORITY)) 9277 *flagsp |= ATF_AUTHORITY; 9278 if (flags & NCE_F_NONUD) 9279 *flagsp |= ATF_PERM; /* not subject to aging */ 9280 if (flags & NCE_F_PUBLISH) 9281 *flagsp |= ATF_PUBL; 9282 if (hwaddr != NULL) { 9283 *flagsp |= ATF_COM; 9284 bcopy((char *)hwaddr, storage, ill->ill_phys_addr_length); 9285 } 9286 } 9287 9288 /* 9289 * Create a new logical interface. If ipif_id is zero (i.e. not a logical 9290 * interface) create the next available logical interface for this 9291 * physical interface. 9292 * If ipif is NULL (i.e. the lookup didn't find one) attempt to create an 9293 * ipif with the specified name. 9294 * 9295 * If the address family is not AF_UNSPEC then set the address as well. 9296 * 9297 * If ip_sioctl_addr returns EINPROGRESS then the ioctl (the copyout) 9298 * is completed when the DL_BIND_ACK arrive in ip_rput_dlpi_writer. 9299 * 9300 * Executed as a writer on the ill. 9301 * So no lock is needed to traverse the ipif chain, or examine the 9302 * phyint flags. 
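 *
 * Illustration only -- a hypothetical userland caller ("net0" is an
 * assumed interface name):
 *
 *	struct lifreq lifr;
 *
 *	bzero(&lifr, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, "net0", sizeof (lifr.lifr_name));
 *	lifr.lifr_addr.ss_family = AF_UNSPEC;	/- no address yet -/
 *	if (ioctl(s, SIOCLIFADDIF, &lifr) == 0)
 *		lifr.lifr_name now holds the new name, e.g. "net0:1"
 *
 * Passing "net0:3" instead requests that specific unit and fails with
 * EEXIST if it is already plumbed (checked below).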
9303  */
9304 /* ARGSUSED */
9305 int
9306 ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
9307     ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq)
9308 {
9309 	mblk_t	*mp1;
9310 	struct lifreq *lifr;
9311 	boolean_t	isv6;
9312 	boolean_t	exists;
9313 	char	*name;
9314 	char	*endp;
9315 	char	*cp;
9316 	int	namelen;
9317 	ipif_t	*ipif;
9318 	long	id;
9319 	ipsq_t	*ipsq;
9320 	ill_t	*ill;
9321 	sin_t	*sin;
9322 	int	err = 0;
9323 	boolean_t found_sep = B_FALSE;
9324 	conn_t	*connp;
9325 	zoneid_t zoneid;
9326 	ip_stack_t *ipst = CONNQ_TO_IPST(q);
9327 
9328 	ASSERT(q->q_next == NULL);
9329 	ip1dbg(("ip_sioctl_addif\n"));
9330 	/* Existence of mp1 has been checked in ip_wput_nondata */
9331 	mp1 = mp->b_cont->b_cont;
9332 	/*
9333 	 * Null terminate the string to protect against buffer
9334 	 * overrun.  String was generated by user code and may not
9335 	 * be trusted.
9336 	 */
9337 	lifr = (struct lifreq *)mp1->b_rptr;
9338 	lifr->lifr_name[LIFNAMSIZ - 1] = '\0';
9339 	name = lifr->lifr_name;
9340 	ASSERT(CONN_Q(q));
9341 	connp = Q_TO_CONN(q);
9342 	isv6 = (connp->conn_family == AF_INET6);
9343 	zoneid = connp->conn_zoneid;
9344 	namelen = mi_strlen(name);
9345 	if (namelen == 0)
9346 		return (EINVAL);
9347 
9348 	exists = B_FALSE;
9349 	if ((namelen + 1 == sizeof (ipif_loopback_name)) &&
9350 	    (mi_strcmp(name, ipif_loopback_name) == 0)) {
9351 		/*
9352 		 * Allow creating lo0 using SIOCLIFADDIF.  There can't be
9353 		 * any other writer thread, so the ipif_lookup_on_name()
9354 		 * below is safe.
9355 		 */
9356 		ipif = ipif_lookup_on_name(lifr->lifr_name, namelen, B_TRUE,
9357 		    &exists, isv6, zoneid, ipst);
9358 		/* Prevent any further action */
9359 		if (ipif == NULL) {
9360 			return (ENOBUFS);
9361 		} else if (!exists) {
9362 			/* We created the ipif now and as writer */
9363 			ipif_refrele(ipif);
9364 			return (0);
9365 		} else {
9366 			ill = ipif->ipif_ill;
9367 			ill_refhold(ill);
9368 			ipif_refrele(ipif);
9369 		}
9370 	} else {
9371 		/* Look for a colon in the name. */
9372 		endp = &name[namelen];
9373 		for (cp = endp; --cp > name; ) {
9374 			if (*cp == IPIF_SEPARATOR_CHAR) {
9375 				found_sep = B_TRUE;
9376 				/*
9377 				 * Reject any non-decimal aliases for plumbing
9378 				 * of logical interfaces.  Aliases with leading
9379 				 * zeroes are also rejected as they introduce
9380 				 * ambiguity in the naming of the interfaces.
9381 				 * Comparing with "0" takes care of all such
9382 				 * cases.
9383 				 */
9384 				if ((strncmp("0", cp+1, 1)) == 0)
9385 					return (EINVAL);
9386 
9387 				if (ddi_strtol(cp+1, &endp, 10, &id) != 0 ||
9388 				    id <= 0 || *endp != '\0') {
9389 					return (EINVAL);
9390 				}
9391 				*cp = '\0';
9392 				break;
9393 			}
9394 		}
9395 		ill = ill_lookup_on_name(name, B_FALSE, isv6, NULL, ipst);
9396 		if (found_sep)
9397 			*cp = IPIF_SEPARATOR_CHAR;
9398 		if (ill == NULL)
9399 			return (ENXIO);
9400 	}
9401 
9402 	ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_process_ioctl, NEW_OP,
9403 	    B_TRUE);
9404 
9405 	/*
9406 	 * Release the refhold due to the lookup, now that we are excl
9407 	 * or we are just returning
9408 	 */
9409 	ill_refrele(ill);
9410 
9411 	if (ipsq == NULL)
9412 		return (EINPROGRESS);
9413 
9414 	/* We are now exclusive on the IPSQ */
9415 	ASSERT(IAM_WRITER_ILL(ill));
9416 
9417 	if (found_sep) {
9418 		/* Now see if there is an IPIF with this unit number. */
9419 		for (ipif = ill->ill_ipif; ipif != NULL;
9420 		    ipif = ipif->ipif_next) {
9421 			if (ipif->ipif_id == id) {
9422 				err = EEXIST;
9423 				goto done;
9424 			}
9425 		}
9426 	}
9427 
9428 	/*
9429 	 * We use IRE_LOCAL for lo0:1 etc. for "receive only" use
9430 	 * of lo0.  Plumbing for lo0:0 happens in ipif_lookup_on_name()
9431 	 * instead.
9432 */ 9433 if ((ipif = ipif_allocate(ill, found_sep ? id : -1, IRE_LOCAL, 9434 B_TRUE, B_TRUE, &err)) == NULL) { 9435 goto done; 9436 } 9437 9438 /* Return created name with ioctl */ 9439 (void) sprintf(lifr->lifr_name, "%s%c%d", ill->ill_name, 9440 IPIF_SEPARATOR_CHAR, ipif->ipif_id); 9441 ip1dbg(("created %s\n", lifr->lifr_name)); 9442 9443 /* Set address */ 9444 sin = (sin_t *)&lifr->lifr_addr; 9445 if (sin->sin_family != AF_UNSPEC) { 9446 err = ip_sioctl_addr(ipif, sin, q, mp, 9447 &ip_ndx_ioctl_table[SIOCLIFADDR_NDX], lifr); 9448 } 9449 9450 done: 9451 ipsq_exit(ipsq); 9452 return (err); 9453 } 9454 9455 /* 9456 * Remove an existing logical interface. If ipif_id is zero (i.e. not a logical 9457 * interface) delete it based on the IP address (on this physical interface). 9458 * Otherwise delete it based on the ipif_id. 9459 * Also, special handling to allow a removeif of lo0. 9460 */ 9461 /* ARGSUSED */ 9462 int 9463 ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9464 ip_ioctl_cmd_t *ipip, void *dummy_if_req) 9465 { 9466 conn_t *connp; 9467 ill_t *ill = ipif->ipif_ill; 9468 boolean_t success; 9469 ip_stack_t *ipst; 9470 9471 ipst = CONNQ_TO_IPST(q); 9472 9473 ASSERT(q->q_next == NULL); 9474 ip1dbg(("ip_sioctl_remove_if(%s:%u %p)\n", 9475 ill->ill_name, ipif->ipif_id, (void *)ipif)); 9476 ASSERT(IAM_WRITER_IPIF(ipif)); 9477 9478 connp = Q_TO_CONN(q); 9479 /* 9480 * Special case for unplumbing lo0 (the loopback physical interface). 9481 * If unplumbing lo0, the incoming address structure has been 9482 * initialized to all zeros. When unplumbing lo0, all its logical 9483 * interfaces must be removed too. 9484 * 9485 * Note that this interface may be called to remove a specific 9486 * loopback logical interface (eg, lo0:1). But in that case 9487 * ipif->ipif_id != 0 so that the code path for that case is the 9488 * same as any other interface (meaning it skips the code directly 9489 * below). 9490 */ 9491 if (ipif->ipif_id == 0 && ill->ill_net_type == IRE_LOOPBACK) { 9492 if (sin->sin_family == AF_UNSPEC && 9493 (IN6_IS_ADDR_UNSPECIFIED(&((sin6_t *)sin)->sin6_addr))) { 9494 /* 9495 * Mark it condemned. No new ref. will be made to ill. 
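			 * Every logical instance (lo0:1, lo0:2, ...) is
			 * condemned here along with lo0 itself before
			 * ill_delete() runs, so no lookup can return any
			 * of them once ill_lock is dropped.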
			 */
			mutex_enter(&ill->ill_lock);
			ill->ill_state_flags |= ILL_CONDEMNED;
			for (ipif = ill->ill_ipif; ipif != NULL;
			    ipif = ipif->ipif_next) {
				ipif->ipif_state_flags |= IPIF_CONDEMNED;
			}
			mutex_exit(&ill->ill_lock);

			ipif = ill->ill_ipif;
			/* unplumb the loopback interface */
			ill_delete(ill);
			mutex_enter(&connp->conn_lock);
			mutex_enter(&ill->ill_lock);

			/* Are any references to this ill active? */
			if (ill_is_freeable(ill)) {
				mutex_exit(&ill->ill_lock);
				mutex_exit(&connp->conn_lock);
				ill_delete_tail(ill);
				mi_free(ill);
				return (0);
			}
			success = ipsq_pending_mp_add(connp, ipif,
			    CONNP_TO_WQ(connp), mp, ILL_FREE);
			mutex_exit(&connp->conn_lock);
			mutex_exit(&ill->ill_lock);
			if (success)
				return (EINPROGRESS);
			else
				return (EINTR);
		}
	}

	if (ipif->ipif_id == 0) {
		ipsq_t *ipsq;

		/* Find based on address */
		if (ipif->ipif_isv6) {
			sin6_t *sin6;

			if (sin->sin_family != AF_INET6)
				return (EAFNOSUPPORT);

			sin6 = (sin6_t *)sin;
			/* We are a writer, so we should be able to lookup */
			ipif = ipif_lookup_addr_exact_v6(&sin6->sin6_addr, ill,
			    ipst);
		} else {
			if (sin->sin_family != AF_INET)
				return (EAFNOSUPPORT);

			/* We are a writer, so we should be able to lookup */
			ipif = ipif_lookup_addr_exact(sin->sin_addr.s_addr,
			    ill, ipst);
		}
		if (ipif == NULL) {
			return (EADDRNOTAVAIL);
		}

		/*
		 * It is possible for a user to send an SIOCLIFREMOVEIF with
		 * lifr_name of the physical interface but with an IP address
		 * lifr_addr of a logical interface plumbed over it.
		 * So update ipx_current_ipif now that ipif points to the
		 * correct one.
		 */
		ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq;
		ipsq->ipsq_xop->ipx_current_ipif = ipif;

		/* This is a writer */
		ipif_refrele(ipif);
	}

	/*
	 * Cannot delete instance zero since it is tied to the ill.
	 */
	if (ipif->ipif_id == 0)
		return (EBUSY);

	mutex_enter(&ill->ill_lock);
	ipif->ipif_state_flags |= IPIF_CONDEMNED;
	mutex_exit(&ill->ill_lock);

	ipif_free(ipif);

	mutex_enter(&connp->conn_lock);
	mutex_enter(&ill->ill_lock);

	/* Are any references to this ipif active? */
	if (ipif_is_freeable(ipif)) {
		mutex_exit(&ill->ill_lock);
		mutex_exit(&connp->conn_lock);
		ipif_non_duplicate(ipif);
		(void) ipif_down_tail(ipif);
		ipif_free_tail(ipif); /* frees ipif */
		return (0);
	}
	success = ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), mp,
	    IPIF_FREE);
	mutex_exit(&ill->ill_lock);
	mutex_exit(&connp->conn_lock);
	if (success)
		return (EINPROGRESS);
	else
		return (EINTR);
}

/*
 * Restart the removeif ioctl. The refcnt has gone down to 0.
 * The ipif is already condemned, so it can't be found through lookups.
9607 */ 9608 /* ARGSUSED */ 9609 int 9610 ip_sioctl_removeif_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, 9611 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_if_req) 9612 { 9613 ill_t *ill = ipif->ipif_ill; 9614 9615 ASSERT(IAM_WRITER_IPIF(ipif)); 9616 ASSERT(ipif->ipif_state_flags & IPIF_CONDEMNED); 9617 9618 ip1dbg(("ip_sioctl_removeif_restart(%s:%u %p)\n", 9619 ill->ill_name, ipif->ipif_id, (void *)ipif)); 9620 9621 if (ipif->ipif_id == 0 && ill->ill_net_type == IRE_LOOPBACK) { 9622 ASSERT(ill->ill_state_flags & ILL_CONDEMNED); 9623 ill_delete_tail(ill); 9624 mi_free(ill); 9625 return (0); 9626 } 9627 9628 ipif_non_duplicate(ipif); 9629 (void) ipif_down_tail(ipif); 9630 ipif_free_tail(ipif); 9631 9632 return (0); 9633 } 9634 9635 /* 9636 * Set the local interface address using the given prefix and ill_token. 9637 */ 9638 /* ARGSUSED */ 9639 int 9640 ip_sioctl_prefix(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9641 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq) 9642 { 9643 int err; 9644 in6_addr_t v6addr; 9645 sin6_t *sin6; 9646 ill_t *ill; 9647 int i; 9648 9649 ip1dbg(("ip_sioctl_prefix(%s:%u %p)\n", 9650 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9651 9652 ASSERT(IAM_WRITER_IPIF(ipif)); 9653 9654 if (!ipif->ipif_isv6) 9655 return (EINVAL); 9656 9657 if (sin->sin_family != AF_INET6) 9658 return (EAFNOSUPPORT); 9659 9660 sin6 = (sin6_t *)sin; 9661 v6addr = sin6->sin6_addr; 9662 ill = ipif->ipif_ill; 9663 9664 if (IN6_IS_ADDR_UNSPECIFIED(&v6addr) || 9665 IN6_IS_ADDR_UNSPECIFIED(&ill->ill_token)) 9666 return (EADDRNOTAVAIL); 9667 9668 for (i = 0; i < 4; i++) 9669 sin6->sin6_addr.s6_addr32[i] |= ill->ill_token.s6_addr32[i]; 9670 9671 err = ip_sioctl_addr(ipif, sin, q, mp, 9672 &ip_ndx_ioctl_table[SIOCLIFADDR_NDX], dummy_ifreq); 9673 return (err); 9674 } 9675 9676 /* 9677 * Restart entry point to restart the address set operation after the 9678 * refcounts have dropped to zero. 9679 */ 9680 /* ARGSUSED */ 9681 int 9682 ip_sioctl_prefix_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9683 ip_ioctl_cmd_t *ipip, void *ifreq) 9684 { 9685 ip1dbg(("ip_sioctl_prefix_restart(%s:%u %p)\n", 9686 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9687 return (ip_sioctl_addr_restart(ipif, sin, q, mp, ipip, ifreq)); 9688 } 9689 9690 /* 9691 * Set the local interface address. 9692 * Allow an address of all zero when the interface is down. 9693 */ 9694 /* ARGSUSED */ 9695 int 9696 ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9697 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq) 9698 { 9699 int err = 0; 9700 in6_addr_t v6addr; 9701 boolean_t need_up = B_FALSE; 9702 ill_t *ill; 9703 int i; 9704 9705 ip1dbg(("ip_sioctl_addr(%s:%u %p)\n", 9706 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9707 9708 ASSERT(IAM_WRITER_IPIF(ipif)); 9709 9710 ill = ipif->ipif_ill; 9711 if (ipif->ipif_isv6) { 9712 sin6_t *sin6; 9713 phyint_t *phyi; 9714 9715 if (sin->sin_family != AF_INET6) 9716 return (EAFNOSUPPORT); 9717 9718 sin6 = (sin6_t *)sin; 9719 v6addr = sin6->sin6_addr; 9720 phyi = ill->ill_phyint; 9721 9722 /* 9723 * Enforce that true multicast interfaces have a link-local 9724 * address for logical unit 0. 9725 * 9726 * However for those ipif's for which link-local address was 9727 * not created by default, also allow setting :: as the address. 9728 * This scenario would arise, when we delete an address on ipif 9729 * with logical unit 0, we would want to set :: as the address. 
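		 * For example, on an ill plumbed with ILLF_NOLINKLOCAL,
		 * deleting the only address on logical unit 0 and then
		 * writing :: to it succeeds, whereas the same request
		 * fails with EADDRNOTAVAIL when the link-local address
		 * was kernel-created.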
		 */
		if (ipif->ipif_id == 0 &&
		    (ill->ill_flags & ILLF_MULTICAST) &&
		    !(ipif->ipif_flags & (IPIF_POINTOPOINT)) &&
		    !(phyi->phyint_flags & (PHYI_LOOPBACK)) &&
		    !IN6_IS_ADDR_LINKLOCAL(&v6addr)) {

			/*
			 * If the default link-local was not created by the
			 * kernel for this ill, allow setting :: as the
			 * address on ipif:0.
			 */
			if (ill->ill_flags & ILLF_NOLINKLOCAL) {
				if (!IN6_IS_ADDR_UNSPECIFIED(&v6addr))
					return (EADDRNOTAVAIL);
			} else {
				return (EADDRNOTAVAIL);
			}
		}

		/*
		 * Up interfaces shouldn't have the unspecified address
		 * unless they also have the IPIF_NOLOCAL flag set and
		 * have a subnet assigned.
		 */
		if ((ipif->ipif_flags & IPIF_UP) &&
		    IN6_IS_ADDR_UNSPECIFIED(&v6addr) &&
		    (!(ipif->ipif_flags & IPIF_NOLOCAL) ||
		    IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) {
			return (EADDRNOTAVAIL);
		}

		if (!ip_local_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask))
			return (EADDRNOTAVAIL);
	} else {
		ipaddr_t addr;

		if (sin->sin_family != AF_INET)
			return (EAFNOSUPPORT);

		addr = sin->sin_addr.s_addr;

		/* Allow INADDR_ANY as the local address. */
		if (addr != INADDR_ANY &&
		    !ip_addr_ok_v4(addr, ipif->ipif_net_mask))
			return (EADDRNOTAVAIL);

		IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
	}
	/*
	 * Verify that the address being configured is permitted by the
	 * ill_allowed_ips[] for the interface.
	 */
	if (ill->ill_allowed_ips_cnt > 0) {
		for (i = 0; i < ill->ill_allowed_ips_cnt; i++) {
			if (IN6_ARE_ADDR_EQUAL(&ill->ill_allowed_ips[i],
			    &v6addr))
				break;
		}
		if (i == ill->ill_allowed_ips_cnt) {
			pr_addr_dbg("!allowed addr %s\n", AF_INET6, &v6addr);
			return (EPERM);
		}
	}
	/*
	 * Even if there is no change we redo things just to rerun
	 * ipif_set_default.
	 */
	if (ipif->ipif_flags & IPIF_UP) {
		/*
		 * Setting a new local address. Since the interface is
		 * already marked up, call ipif_down first; it takes care
		 * of ditching any IREs (including the net and subnet
		 * bcast ire's) that were set up based on the old address.
		 */
		err = ipif_logical_down(ipif, q, mp);
		if (err == EINPROGRESS)
			return (err);
		(void) ipif_down_tail(ipif);
		need_up = B_TRUE;
	}

	err = ip_sioctl_addr_tail(ipif, sin, q, mp, need_up);
	return (err);
}

int
ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    boolean_t need_up)
{
	in6_addr_t v6addr;
	in6_addr_t ov6addr;
	ipaddr_t addr;
	sin6_t *sin6;
	int sinlen;
	int err = 0;
	ill_t *ill = ipif->ipif_ill;
	boolean_t need_dl_down;
	boolean_t need_arp_down;
	struct iocblk *iocp;

	iocp = (mp != NULL) ?
(struct iocblk *)mp->b_rptr : NULL; 9836 9837 ip1dbg(("ip_sioctl_addr_tail(%s:%u %p)\n", 9838 ill->ill_name, ipif->ipif_id, (void *)ipif)); 9839 ASSERT(IAM_WRITER_IPIF(ipif)); 9840 9841 /* Must cancel any pending timer before taking the ill_lock */ 9842 if (ipif->ipif_recovery_id != 0) 9843 (void) untimeout(ipif->ipif_recovery_id); 9844 ipif->ipif_recovery_id = 0; 9845 9846 if (ipif->ipif_isv6) { 9847 sin6 = (sin6_t *)sin; 9848 v6addr = sin6->sin6_addr; 9849 sinlen = sizeof (struct sockaddr_in6); 9850 } else { 9851 addr = sin->sin_addr.s_addr; 9852 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 9853 sinlen = sizeof (struct sockaddr_in); 9854 } 9855 mutex_enter(&ill->ill_lock); 9856 ov6addr = ipif->ipif_v6lcl_addr; 9857 ipif->ipif_v6lcl_addr = v6addr; 9858 sctp_update_ipif_addr(ipif, ov6addr); 9859 ipif->ipif_addr_ready = 0; 9860 9861 ip_rts_newaddrmsg(RTM_CHGADDR, 0, ipif, RTSQ_DEFAULT); 9862 9863 /* 9864 * If the interface was previously marked as a duplicate, then since 9865 * we've now got a "new" address, it should no longer be considered a 9866 * duplicate -- even if the "new" address is the same as the old one. 9867 * Note that if all ipifs are down, we may have a pending ARP down 9868 * event to handle. This is because we want to recover from duplicates 9869 * and thus delay tearing down ARP until the duplicates have been 9870 * removed or disabled. 9871 */ 9872 need_dl_down = need_arp_down = B_FALSE; 9873 if (ipif->ipif_flags & IPIF_DUPLICATE) { 9874 need_arp_down = !need_up; 9875 ipif->ipif_flags &= ~IPIF_DUPLICATE; 9876 if (--ill->ill_ipif_dup_count == 0 && !need_up && 9877 ill->ill_ipif_up_count == 0 && ill->ill_dl_up) { 9878 need_dl_down = B_TRUE; 9879 } 9880 } 9881 9882 ipif_set_default(ipif); 9883 9884 /* 9885 * If we've just manually set the IPv6 link-local address (0th ipif), 9886 * tag the ill so that future updates to the interface ID don't result 9887 * in this address getting automatically reconfigured from under the 9888 * administrator. 9889 */ 9890 if (ipif->ipif_isv6 && ipif->ipif_id == 0) { 9891 if (iocp == NULL || (iocp->ioc_cmd == SIOCSLIFADDR && 9892 !IN6_IS_ADDR_UNSPECIFIED(&v6addr))) 9893 ill->ill_manual_linklocal = 1; 9894 } 9895 9896 /* 9897 * When publishing an interface address change event, we only notify 9898 * the event listeners of the new address. It is assumed that if they 9899 * actively care about the addresses assigned that they will have 9900 * already discovered the previous address assigned (if there was one.) 9901 * 9902 * Don't attach nic event message for SIOCLIFADDIF ioctl. 9903 */ 9904 if (iocp != NULL && iocp->ioc_cmd != SIOCLIFADDIF) { 9905 ill_nic_event_dispatch(ill, MAP_IPIF_ID(ipif->ipif_id), 9906 NE_ADDRESS_CHANGE, sin, sinlen); 9907 } 9908 9909 mutex_exit(&ill->ill_lock); 9910 9911 if (need_up) { 9912 /* 9913 * Now bring the interface back up. If this 9914 * is the only IPIF for the ILL, ipif_up 9915 * will have to re-bind to the device, so 9916 * we may get back EINPROGRESS, in which 9917 * case, this IOCTL will get completed in 9918 * ip_rput_dlpi when we see the DL_BIND_ACK. 
9919 */ 9920 err = ipif_up(ipif, q, mp); 9921 } else { 9922 /* Perhaps ilgs should use this ill */ 9923 update_conn_ill(NULL, ill->ill_ipst); 9924 } 9925 9926 if (need_dl_down) 9927 ill_dl_down(ill); 9928 9929 if (need_arp_down && !ill->ill_isv6) 9930 (void) ipif_arp_down(ipif); 9931 9932 /* 9933 * The default multicast interface might have changed (for 9934 * instance if the IPv6 scope of the address changed) 9935 */ 9936 ire_increment_multicast_generation(ill->ill_ipst, ill->ill_isv6); 9937 9938 return (err); 9939 } 9940 9941 /* 9942 * Restart entry point to restart the address set operation after the 9943 * refcounts have dropped to zero. 9944 */ 9945 /* ARGSUSED */ 9946 int 9947 ip_sioctl_addr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9948 ip_ioctl_cmd_t *ipip, void *ifreq) 9949 { 9950 ip1dbg(("ip_sioctl_addr_restart(%s:%u %p)\n", 9951 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9952 ASSERT(IAM_WRITER_IPIF(ipif)); 9953 (void) ipif_down_tail(ipif); 9954 return (ip_sioctl_addr_tail(ipif, sin, q, mp, B_TRUE)); 9955 } 9956 9957 /* ARGSUSED */ 9958 int 9959 ip_sioctl_get_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9960 ip_ioctl_cmd_t *ipip, void *if_req) 9961 { 9962 sin6_t *sin6 = (struct sockaddr_in6 *)sin; 9963 struct lifreq *lifr = (struct lifreq *)if_req; 9964 9965 ip1dbg(("ip_sioctl_get_addr(%s:%u %p)\n", 9966 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 9967 /* 9968 * The net mask and address can't change since we have a 9969 * reference to the ipif. So no lock is necessary. 9970 */ 9971 if (ipif->ipif_isv6) { 9972 *sin6 = sin6_null; 9973 sin6->sin6_family = AF_INET6; 9974 sin6->sin6_addr = ipif->ipif_v6lcl_addr; 9975 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 9976 lifr->lifr_addrlen = 9977 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 9978 } else { 9979 *sin = sin_null; 9980 sin->sin_family = AF_INET; 9981 sin->sin_addr.s_addr = ipif->ipif_lcl_addr; 9982 if (ipip->ipi_cmd_type == LIF_CMD) { 9983 lifr->lifr_addrlen = 9984 ip_mask_to_plen(ipif->ipif_net_mask); 9985 } 9986 } 9987 return (0); 9988 } 9989 9990 /* 9991 * Set the destination address for a pt-pt interface. 9992 */ 9993 /* ARGSUSED */ 9994 int 9995 ip_sioctl_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 9996 ip_ioctl_cmd_t *ipip, void *if_req) 9997 { 9998 int err = 0; 9999 in6_addr_t v6addr; 10000 boolean_t need_up = B_FALSE; 10001 10002 ip1dbg(("ip_sioctl_dstaddr(%s:%u %p)\n", 10003 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10004 ASSERT(IAM_WRITER_IPIF(ipif)); 10005 10006 if (ipif->ipif_isv6) { 10007 sin6_t *sin6; 10008 10009 if (sin->sin_family != AF_INET6) 10010 return (EAFNOSUPPORT); 10011 10012 sin6 = (sin6_t *)sin; 10013 v6addr = sin6->sin6_addr; 10014 10015 if (!ip_remote_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask)) 10016 return (EADDRNOTAVAIL); 10017 } else { 10018 ipaddr_t addr; 10019 10020 if (sin->sin_family != AF_INET) 10021 return (EAFNOSUPPORT); 10022 10023 addr = sin->sin_addr.s_addr; 10024 if (addr != INADDR_ANY && 10025 !ip_addr_ok_v4(addr, ipif->ipif_net_mask)) { 10026 return (EADDRNOTAVAIL); 10027 } 10028 10029 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 10030 } 10031 10032 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6pp_dst_addr, &v6addr)) 10033 return (0); /* No change */ 10034 10035 if (ipif->ipif_flags & IPIF_UP) { 10036 /* 10037 * If the interface is already marked up, 10038 * we call ipif_down which will take care 10039 * of ditching any IREs that have been set 10040 * up based on the old pp dst address. 
10041 */ 10042 err = ipif_logical_down(ipif, q, mp); 10043 if (err == EINPROGRESS) 10044 return (err); 10045 (void) ipif_down_tail(ipif); 10046 need_up = B_TRUE; 10047 } 10048 /* 10049 * could return EINPROGRESS. If so ioctl will complete in 10050 * ip_rput_dlpi_writer 10051 */ 10052 err = ip_sioctl_dstaddr_tail(ipif, sin, q, mp, need_up); 10053 return (err); 10054 } 10055 10056 static int 10057 ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10058 boolean_t need_up) 10059 { 10060 in6_addr_t v6addr; 10061 ill_t *ill = ipif->ipif_ill; 10062 int err = 0; 10063 boolean_t need_dl_down; 10064 boolean_t need_arp_down; 10065 10066 ip1dbg(("ip_sioctl_dstaddr_tail(%s:%u %p)\n", ill->ill_name, 10067 ipif->ipif_id, (void *)ipif)); 10068 10069 /* Must cancel any pending timer before taking the ill_lock */ 10070 if (ipif->ipif_recovery_id != 0) 10071 (void) untimeout(ipif->ipif_recovery_id); 10072 ipif->ipif_recovery_id = 0; 10073 10074 if (ipif->ipif_isv6) { 10075 sin6_t *sin6; 10076 10077 sin6 = (sin6_t *)sin; 10078 v6addr = sin6->sin6_addr; 10079 } else { 10080 ipaddr_t addr; 10081 10082 addr = sin->sin_addr.s_addr; 10083 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 10084 } 10085 mutex_enter(&ill->ill_lock); 10086 /* Set point to point destination address. */ 10087 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 10088 /* 10089 * Allow this as a means of creating logical 10090 * pt-pt interfaces on top of e.g. an Ethernet. 10091 * XXX Undocumented HACK for testing. 10092 * pt-pt interfaces are created with NUD disabled. 10093 */ 10094 ipif->ipif_flags |= IPIF_POINTOPOINT; 10095 ipif->ipif_flags &= ~IPIF_BROADCAST; 10096 if (ipif->ipif_isv6) 10097 ill->ill_flags |= ILLF_NONUD; 10098 } 10099 10100 /* 10101 * If the interface was previously marked as a duplicate, then since 10102 * we've now got a "new" address, it should no longer be considered a 10103 * duplicate -- even if the "new" address is the same as the old one. 10104 * Note that if all ipifs are down, we may have a pending ARP down 10105 * event to handle. 10106 */ 10107 need_dl_down = need_arp_down = B_FALSE; 10108 if (ipif->ipif_flags & IPIF_DUPLICATE) { 10109 need_arp_down = !need_up; 10110 ipif->ipif_flags &= ~IPIF_DUPLICATE; 10111 if (--ill->ill_ipif_dup_count == 0 && !need_up && 10112 ill->ill_ipif_up_count == 0 && ill->ill_dl_up) { 10113 need_dl_down = B_TRUE; 10114 } 10115 } 10116 10117 /* 10118 * If we've just manually set the IPv6 destination link-local address 10119 * (0th ipif), tag the ill so that future updates to the destination 10120 * interface ID (as can happen with interfaces over IP tunnels) don't 10121 * result in this address getting automatically reconfigured from 10122 * under the administrator. 10123 */ 10124 if (ipif->ipif_isv6 && ipif->ipif_id == 0) 10125 ill->ill_manual_dst_linklocal = 1; 10126 10127 /* Set the new address. */ 10128 ipif->ipif_v6pp_dst_addr = v6addr; 10129 /* Make sure subnet tracks pp_dst */ 10130 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 10131 mutex_exit(&ill->ill_lock); 10132 10133 if (need_up) { 10134 /* 10135 * Now bring the interface back up. If this 10136 * is the only IPIF for the ILL, ipif_up 10137 * will have to re-bind to the device, so 10138 * we may get back EINPROGRESS, in which 10139 * case, this IOCTL will get completed in 10140 * ip_rput_dlpi when we see the DL_BIND_ACK. 
		 */
		err = ipif_up(ipif, q, mp);
	}

	if (need_dl_down)
		ill_dl_down(ill);
	if (need_arp_down && !ipif->ipif_isv6)
		(void) ipif_arp_down(ipif);

	return (err);
}

/*
 * Restart entry point to restart the dstaddress set operation after the
 * refcounts have dropped to zero.
 */
/* ARGSUSED */
int
ip_sioctl_dstaddr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *ifreq)
{
	ip1dbg(("ip_sioctl_dstaddr_restart(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
	(void) ipif_down_tail(ipif);
	return (ip_sioctl_dstaddr_tail(ipif, sin, q, mp, B_TRUE));
}

/* ARGSUSED */
int
ip_sioctl_get_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	sin6_t *sin6 = (struct sockaddr_in6 *)sin;

	ip1dbg(("ip_sioctl_get_dstaddr(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
	/*
	 * Get point to point destination address. The addresses can't
	 * change since we hold a reference to the ipif.
	 */
	if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0)
		return (EADDRNOTAVAIL);

	if (ipif->ipif_isv6) {
		ASSERT(ipip->ipi_cmd_type == LIF_CMD);
		*sin6 = sin6_null;
		sin6->sin6_family = AF_INET6;
		sin6->sin6_addr = ipif->ipif_v6pp_dst_addr;
	} else {
		*sin = sin_null;
		sin->sin_family = AF_INET;
		sin->sin_addr.s_addr = ipif->ipif_pp_dst_addr;
	}
	return (0);
}

/*
 * Check which flags will change when the given flags are set;
 * silently ignore flags which userland is not allowed to control.
 * (Because these flags may change between SIOCGLIFFLAGS and
 * SIOCSLIFFLAGS, and that's outside of userland's control,
 * we need to silently ignore them rather than fail.)
 */
static void
ip_sioctl_flags_onoff(ipif_t *ipif, uint64_t flags, uint64_t *onp,
    uint64_t *offp)
{
	ill_t *ill = ipif->ipif_ill;
	phyint_t *phyi = ill->ill_phyint;
	uint64_t cantchange_flags, intf_flags;
	uint64_t turn_on, turn_off;

	intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags;
	cantchange_flags = IFF_CANTCHANGE;
	if (IS_IPMP(ill))
		cantchange_flags |= IFF_IPMP_CANTCHANGE;
	turn_on = (flags ^ intf_flags) & ~cantchange_flags;
	turn_off = intf_flags & turn_on;
	turn_on ^= turn_off;
	*onp = turn_on;
	*offp = turn_off;
}

/*
 * Set interface flags. Many flags require special handling (e.g.,
 * bringing the interface down); see below for details.
 *
 * NOTE: We really don't enforce that ipif_id zero should be used
 * for setting any flags other than IFF_LOGINT_FLAGS. This is
 * because applications generally do SIOCGLIFFLAGS, OR in the new
 * flags (those that affect the logical interface), and then do a
 * SIOCSLIFFLAGS. Thus, "flags" below could contain bits other
 * than IFF_LOGINT_FLAGS. One could check whether "turn_on" -- the
 * flags that will be turned on -- is correct with respect to
 * ipif_id 0; for backward compatibility reasons, it is not done.
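 *
 * A minimal sketch of that userland read-modify-write pattern
 * (illustrative only; the interface name and error handling are
 * assumptions):
 *
 *	struct lifreq lifr;
 *	int s = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	bzero(&lifr, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, "bge0", sizeof (lifr.lifr_name));
 *	if (ioctl(s, SIOCGLIFFLAGS, &lifr) == 0) {
 *		lifr.lifr_flags |= IFF_UP;
 *		(void) ioctl(s, SIOCSLIFFLAGS, &lifr);
 *	}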
10236 */ 10237 /* ARGSUSED */ 10238 int 10239 ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10240 ip_ioctl_cmd_t *ipip, void *if_req) 10241 { 10242 uint64_t turn_on; 10243 uint64_t turn_off; 10244 int err = 0; 10245 phyint_t *phyi; 10246 ill_t *ill; 10247 conn_t *connp; 10248 uint64_t intf_flags; 10249 boolean_t phyint_flags_modified = B_FALSE; 10250 uint64_t flags; 10251 struct ifreq *ifr; 10252 struct lifreq *lifr; 10253 boolean_t set_linklocal = B_FALSE; 10254 10255 ip1dbg(("ip_sioctl_flags(%s:%u %p)\n", 10256 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10257 10258 ASSERT(IAM_WRITER_IPIF(ipif)); 10259 10260 ill = ipif->ipif_ill; 10261 phyi = ill->ill_phyint; 10262 10263 if (ipip->ipi_cmd_type == IF_CMD) { 10264 ifr = (struct ifreq *)if_req; 10265 flags = (uint64_t)(ifr->ifr_flags & 0x0000ffff); 10266 } else { 10267 lifr = (struct lifreq *)if_req; 10268 flags = lifr->lifr_flags; 10269 } 10270 10271 intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags; 10272 10273 /* 10274 * Have the flags been set correctly until now? 10275 */ 10276 ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0); 10277 ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0); 10278 ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0); 10279 /* 10280 * Compare the new flags to the old, and partition 10281 * into those coming on and those going off. 10282 * For the 16 bit command keep the bits above bit 16 unchanged. 10283 */ 10284 if (ipip->ipi_cmd == SIOCSIFFLAGS) 10285 flags |= intf_flags & ~0xFFFF; 10286 10287 /* 10288 * Explicitly fail attempts to change flags that are always invalid on 10289 * an IPMP meta-interface. 10290 */ 10291 if (IS_IPMP(ill) && ((flags ^ intf_flags) & IFF_IPMP_INVALID)) 10292 return (EINVAL); 10293 10294 ip_sioctl_flags_onoff(ipif, flags, &turn_on, &turn_off); 10295 if ((turn_on|turn_off) == 0) 10296 return (0); /* No change */ 10297 10298 /* 10299 * All test addresses must be IFF_DEPRECATED (to ensure source address 10300 * selection avoids them) -- so force IFF_DEPRECATED on, and do not 10301 * allow it to be turned off. 10302 */ 10303 if ((turn_off & (IFF_DEPRECATED|IFF_NOFAILOVER)) == IFF_DEPRECATED && 10304 (turn_on|intf_flags) & IFF_NOFAILOVER) 10305 return (EINVAL); 10306 10307 if ((connp = Q_TO_CONN(q)) == NULL) 10308 return (EINVAL); 10309 10310 /* 10311 * Only vrrp control socket is allowed to change IFF_UP and 10312 * IFF_NOACCEPT flags when IFF_VRRP is set. 10313 */ 10314 if ((intf_flags & IFF_VRRP) && ((turn_off | turn_on) & IFF_UP)) { 10315 if (!connp->conn_isvrrp) 10316 return (EINVAL); 10317 } 10318 10319 /* 10320 * The IFF_NOACCEPT flag can only be set on an IFF_VRRP IP address by 10321 * VRRP control socket. 10322 */ 10323 if ((turn_off | turn_on) & IFF_NOACCEPT) { 10324 if (!connp->conn_isvrrp || !(intf_flags & IFF_VRRP)) 10325 return (EINVAL); 10326 } 10327 10328 if (turn_on & IFF_NOFAILOVER) { 10329 turn_on |= IFF_DEPRECATED; 10330 flags |= IFF_DEPRECATED; 10331 } 10332 10333 /* 10334 * On underlying interfaces, only allow applications to manage test 10335 * addresses -- otherwise, they may get confused when the address 10336 * moves as part of being brought up. Likewise, prevent an 10337 * application-managed test address from being converted to a data 10338 * address. To prevent migration of administratively up addresses in 10339 * the kernel, we don't allow them to be converted either. 
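	 * (In practice the application managing test addresses is
	 * expected to be in.mpathd, which marks its probe addresses
	 * IFF_NOFAILOVER; that is an observation about usage, not a
	 * requirement enforced below.)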
10340 */ 10341 if (IS_UNDER_IPMP(ill)) { 10342 const uint64_t appflags = IFF_DHCPRUNNING | IFF_ADDRCONF; 10343 10344 if ((turn_on & appflags) && !(flags & IFF_NOFAILOVER)) 10345 return (EINVAL); 10346 10347 if ((turn_off & IFF_NOFAILOVER) && 10348 (flags & (appflags | IFF_UP | IFF_DUPLICATE))) 10349 return (EINVAL); 10350 } 10351 10352 /* 10353 * Only allow IFF_TEMPORARY flag to be set on 10354 * IPv6 interfaces. 10355 */ 10356 if ((turn_on & IFF_TEMPORARY) && !(ipif->ipif_isv6)) 10357 return (EINVAL); 10358 10359 /* 10360 * cannot turn off IFF_NOXMIT on VNI interfaces. 10361 */ 10362 if ((turn_off & IFF_NOXMIT) && IS_VNI(ipif->ipif_ill)) 10363 return (EINVAL); 10364 10365 /* 10366 * Don't allow the IFF_ROUTER flag to be turned on on loopback 10367 * interfaces. It makes no sense in that context. 10368 */ 10369 if ((turn_on & IFF_ROUTER) && (phyi->phyint_flags & PHYI_LOOPBACK)) 10370 return (EINVAL); 10371 10372 /* 10373 * For IPv6 ipif_id 0, don't allow the interface to be up without 10374 * a link local address if IFF_NOLOCAL or IFF_ANYCAST are not set. 10375 * If the link local address isn't set, and can be set, it will get 10376 * set later on in this function. 10377 */ 10378 if (ipif->ipif_id == 0 && ipif->ipif_isv6 && 10379 (flags & IFF_UP) && !(flags & (IFF_NOLOCAL|IFF_ANYCAST)) && 10380 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) { 10381 if (ipif_cant_setlinklocal(ipif)) 10382 return (EINVAL); 10383 set_linklocal = B_TRUE; 10384 } 10385 10386 /* 10387 * If we modify physical interface flags, we'll potentially need to 10388 * send up two routing socket messages for the changes (one for the 10389 * IPv4 ill, and another for the IPv6 ill). Note that here. 10390 */ 10391 if ((turn_on|turn_off) & IFF_PHYINT_FLAGS) 10392 phyint_flags_modified = B_TRUE; 10393 10394 /* 10395 * All functioning PHYI_STANDBY interfaces start life PHYI_INACTIVE 10396 * (otherwise, we'd immediately use them, defeating standby). Also, 10397 * since PHYI_INACTIVE has a separate meaning when PHYI_STANDBY is not 10398 * set, don't allow PHYI_STANDBY to be set if PHYI_INACTIVE is already 10399 * set, and clear PHYI_INACTIVE if PHYI_STANDBY is being cleared. We 10400 * also don't allow PHYI_STANDBY if VNI is enabled since its semantics 10401 * will not be honored. 10402 */ 10403 if (turn_on & PHYI_STANDBY) { 10404 /* 10405 * No need to grab ill_g_usesrc_lock here; see the 10406 * synchronization notes in ip.c. 10407 */ 10408 if (ill->ill_usesrc_grp_next != NULL || 10409 intf_flags & PHYI_INACTIVE) 10410 return (EINVAL); 10411 if (!(flags & PHYI_FAILED)) { 10412 flags |= PHYI_INACTIVE; 10413 turn_on |= PHYI_INACTIVE; 10414 } 10415 } 10416 10417 if (turn_off & PHYI_STANDBY) { 10418 flags &= ~PHYI_INACTIVE; 10419 turn_off |= PHYI_INACTIVE; 10420 } 10421 10422 /* 10423 * PHYI_FAILED and PHYI_INACTIVE are mutually exclusive; fail if both 10424 * would end up on. 10425 */ 10426 if ((flags & (PHYI_FAILED | PHYI_INACTIVE)) == 10427 (PHYI_FAILED | PHYI_INACTIVE)) 10428 return (EINVAL); 10429 10430 /* 10431 * If ILLF_ROUTER changes, we need to change the ip forwarding 10432 * status of the interface. 10433 */ 10434 if ((turn_on | turn_off) & ILLF_ROUTER) { 10435 err = ill_forward_set(ill, ((turn_on & ILLF_ROUTER) != 0)); 10436 if (err != 0) 10437 return (err); 10438 } 10439 10440 /* 10441 * If the interface is not UP and we are not going to 10442 * bring it UP, record the flags and return. When the 10443 * interface comes UP later, the right actions will be 10444 * taken. 
	 */
	if (!(ipif->ipif_flags & IPIF_UP) &&
	    !(turn_on & IPIF_UP)) {
		/* Record new flags in their respective places. */
		mutex_enter(&ill->ill_lock);
		mutex_enter(&ill->ill_phyint->phyint_lock);
		ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS);
		ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS);
		ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS);
		ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS);
		phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS);
		phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS);
		mutex_exit(&ill->ill_lock);
		mutex_exit(&ill->ill_phyint->phyint_lock);

		/*
		 * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the
		 * same to the kernel: if any of them has been set by
		 * userland, the interface cannot be used for data traffic.
		 */
		if ((turn_on|turn_off) &
		    (PHYI_FAILED | PHYI_INACTIVE | PHYI_OFFLINE)) {
			ASSERT(!IS_IPMP(ill));
			/*
			 * It's possible the ill is part of an "anonymous"
			 * IPMP group rather than a real group. In that case,
			 * there are no other interfaces in the group and thus
			 * no need to call ipmp_phyint_refresh_active().
			 */
			if (IS_UNDER_IPMP(ill))
				ipmp_phyint_refresh_active(phyi);
		}

		if (phyint_flags_modified) {
			if (phyi->phyint_illv4 != NULL) {
				ip_rts_ifmsg(phyi->phyint_illv4->
				    ill_ipif, RTSQ_DEFAULT);
			}
			if (phyi->phyint_illv6 != NULL) {
				ip_rts_ifmsg(phyi->phyint_illv6->
				    ill_ipif, RTSQ_DEFAULT);
			}
		}
		/* The default multicast interface might have changed */
		ire_increment_multicast_generation(ill->ill_ipst,
		    ill->ill_isv6);

		return (0);
	} else if (set_linklocal) {
		mutex_enter(&ill->ill_lock);
		ipif->ipif_state_flags |= IPIF_SET_LINKLOCAL;
		mutex_exit(&ill->ill_lock);
	}

	/*
	 * Disallow IPv6 interfaces coming up that have the unspecified
	 * address, or point-to-point interfaces with an unspecified
	 * destination. We do allow the address to be unspecified for
	 * IPIF_NOLOCAL interfaces that have a subnet assigned, which is
	 * how in.ndpd currently manages its onlink prefix list when no
	 * addresses are configured with those prefixes.
	 */
	if (ipif->ipif_isv6 &&
	    ((IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) &&
	    ((!(ipif->ipif_flags & IPIF_NOLOCAL) &&
	    !(turn_on & IPIF_NOLOCAL)) ||
	    IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) ||
	    ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
	    IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6pp_dst_addr)))) {
		return (EINVAL);
	}

	/*
	 * Prevent IPv4 point-to-point interfaces with a 0.0.0.0 destination
	 * from being brought up.
	 */
	if (!ipif->ipif_isv6 &&
	    ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
	    ipif->ipif_pp_dst_addr == INADDR_ANY)) {
		return (EINVAL);
	}

	/*
	 * If we are going to change one or more of the flags that are
	 * IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, IPIF_NOLOCAL, ILLF_NOARP,
	 * ILLF_NONUD, IPIF_PRIVATE, IPIF_ANYCAST, IPIF_PREFERRED, and
	 * IPIF_NOFAILOVER, we will take special action. This is
	 * done by bringing the ipif down, changing the flags and bringing
	 * it back up again. For IPIF_NOFAILOVER, the act of bringing it
	 * back up will trigger the address to be moved.
10535 * 10536 * If we are going to change IFF_NOACCEPT, we need to bring 10537 * all the ipifs down then bring them up again. The act of 10538 * bringing all the ipifs back up will trigger the local 10539 * ires being recreated with "no_accept" set/cleared. 10540 * 10541 * Note that ILLF_NOACCEPT is always set separately from the 10542 * other flags. 10543 */ 10544 if ((turn_on|turn_off) & 10545 (IPIF_UP|IPIF_DEPRECATED|IPIF_NOXMIT|IPIF_NOLOCAL|ILLF_NOARP| 10546 ILLF_NONUD|IPIF_PRIVATE|IPIF_ANYCAST|IPIF_PREFERRED| 10547 IPIF_NOFAILOVER)) { 10548 /* 10549 * ipif_down() will ire_delete bcast ire's for the subnet, 10550 * while the ire_identical_ref tracks the case of IRE_BROADCAST 10551 * entries shared between multiple ipifs on the same subnet. 10552 */ 10553 if (((ipif->ipif_flags | turn_on) & IPIF_UP) && 10554 !(turn_off & IPIF_UP)) { 10555 if (ipif->ipif_flags & IPIF_UP) 10556 ill->ill_logical_down = 1; 10557 turn_on &= ~IPIF_UP; 10558 } 10559 err = ipif_down(ipif, q, mp); 10560 ip1dbg(("ipif_down returns %d err ", err)); 10561 if (err == EINPROGRESS) 10562 return (err); 10563 (void) ipif_down_tail(ipif); 10564 } else if ((turn_on|turn_off) & ILLF_NOACCEPT) { 10565 /* 10566 * If we can quiesce the ill, then continue. If not, then 10567 * ip_sioctl_flags_tail() will be called from 10568 * ipif_ill_refrele_tail(). 10569 */ 10570 ill_down_ipifs(ill, B_TRUE); 10571 10572 mutex_enter(&connp->conn_lock); 10573 mutex_enter(&ill->ill_lock); 10574 if (!ill_is_quiescent(ill)) { 10575 boolean_t success; 10576 10577 success = ipsq_pending_mp_add(connp, ill->ill_ipif, 10578 q, mp, ILL_DOWN); 10579 mutex_exit(&ill->ill_lock); 10580 mutex_exit(&connp->conn_lock); 10581 return (success ? EINPROGRESS : EINTR); 10582 } 10583 mutex_exit(&ill->ill_lock); 10584 mutex_exit(&connp->conn_lock); 10585 } 10586 return (ip_sioctl_flags_tail(ipif, flags, q, mp)); 10587 } 10588 10589 static int 10590 ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp) 10591 { 10592 ill_t *ill; 10593 phyint_t *phyi; 10594 uint64_t turn_on, turn_off; 10595 boolean_t phyint_flags_modified = B_FALSE; 10596 int err = 0; 10597 boolean_t set_linklocal = B_FALSE; 10598 10599 ip1dbg(("ip_sioctl_flags_tail(%s:%u)\n", 10600 ipif->ipif_ill->ill_name, ipif->ipif_id)); 10601 10602 ASSERT(IAM_WRITER_IPIF(ipif)); 10603 10604 ill = ipif->ipif_ill; 10605 phyi = ill->ill_phyint; 10606 10607 ip_sioctl_flags_onoff(ipif, flags, &turn_on, &turn_off); 10608 10609 /* 10610 * IFF_UP is handled separately. 10611 */ 10612 turn_on &= ~IFF_UP; 10613 turn_off &= ~IFF_UP; 10614 10615 if ((turn_on|turn_off) & IFF_PHYINT_FLAGS) 10616 phyint_flags_modified = B_TRUE; 10617 10618 /* 10619 * Now we change the flags. Track current value of 10620 * other flags in their respective places. 
10621 */ 10622 mutex_enter(&ill->ill_lock); 10623 mutex_enter(&phyi->phyint_lock); 10624 ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS); 10625 ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS); 10626 ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS); 10627 ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS); 10628 phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS); 10629 phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS); 10630 if (ipif->ipif_state_flags & IPIF_SET_LINKLOCAL) { 10631 set_linklocal = B_TRUE; 10632 ipif->ipif_state_flags &= ~IPIF_SET_LINKLOCAL; 10633 } 10634 10635 mutex_exit(&ill->ill_lock); 10636 mutex_exit(&phyi->phyint_lock); 10637 10638 if (set_linklocal) 10639 (void) ipif_setlinklocal(ipif); 10640 10641 /* 10642 * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the same to 10643 * the kernel: if any of them has been set by userland, the interface 10644 * cannot be used for data traffic. 10645 */ 10646 if ((turn_on|turn_off) & (PHYI_FAILED | PHYI_INACTIVE | PHYI_OFFLINE)) { 10647 ASSERT(!IS_IPMP(ill)); 10648 /* 10649 * It's possible the ill is part of an "anonymous" IPMP group 10650 * rather than a real group. In that case, there are no other 10651 * interfaces in the group and thus no need for us to call 10652 * ipmp_phyint_refresh_active(). 10653 */ 10654 if (IS_UNDER_IPMP(ill)) 10655 ipmp_phyint_refresh_active(phyi); 10656 } 10657 10658 if ((turn_on|turn_off) & ILLF_NOACCEPT) { 10659 /* 10660 * If the ILLF_NOACCEPT flag is changed, bring up all the 10661 * ipifs that were brought down. 10662 * 10663 * The routing sockets messages are sent as the result 10664 * of ill_up_ipifs(), further, SCTP's IPIF list was updated 10665 * as well. 10666 */ 10667 err = ill_up_ipifs(ill, q, mp); 10668 } else if ((flags & IFF_UP) && !(ipif->ipif_flags & IPIF_UP)) { 10669 /* 10670 * XXX ipif_up really does not know whether a phyint flags 10671 * was modified or not. So, it sends up information on 10672 * only one routing sockets message. As we don't bring up 10673 * the interface and also set PHYI_ flags simultaneously 10674 * it should be okay. 10675 */ 10676 err = ipif_up(ipif, q, mp); 10677 } else { 10678 /* 10679 * Make sure routing socket sees all changes to the flags. 10680 * ipif_up_done* handles this when we use ipif_up. 10681 */ 10682 if (phyint_flags_modified) { 10683 if (phyi->phyint_illv4 != NULL) { 10684 ip_rts_ifmsg(phyi->phyint_illv4-> 10685 ill_ipif, RTSQ_DEFAULT); 10686 } 10687 if (phyi->phyint_illv6 != NULL) { 10688 ip_rts_ifmsg(phyi->phyint_illv6-> 10689 ill_ipif, RTSQ_DEFAULT); 10690 } 10691 } else { 10692 ip_rts_ifmsg(ipif, RTSQ_DEFAULT); 10693 } 10694 /* 10695 * Update the flags in SCTP's IPIF list, ipif_up() will do 10696 * this in need_up case. 10697 */ 10698 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 10699 } 10700 10701 /* The default multicast interface might have changed */ 10702 ire_increment_multicast_generation(ill->ill_ipst, ill->ill_isv6); 10703 return (err); 10704 } 10705 10706 /* 10707 * Restart the flags operation now that the refcounts have dropped to zero. 
 */
/* ARGSUSED */
int
ip_sioctl_flags_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	uint64_t flags;
	struct ifreq *ifr = if_req;
	struct lifreq *lifr = if_req;
	uint64_t turn_on, turn_off;

	ip1dbg(("ip_sioctl_flags_restart(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));

	if (ipip->ipi_cmd_type == IF_CMD) {
		/* cast to uint16_t prevents unwanted sign extension */
		flags = (uint16_t)ifr->ifr_flags;
	} else {
		flags = lifr->lifr_flags;
	}

	/*
	 * If this function call is a result of the ILLF_NOACCEPT flag
	 * change, do not call ipif_down_tail(). See ip_sioctl_flags().
	 */
	ip_sioctl_flags_onoff(ipif, flags, &turn_on, &turn_off);
	if (!((turn_on|turn_off) & ILLF_NOACCEPT))
		(void) ipif_down_tail(ipif);

	return (ip_sioctl_flags_tail(ipif, flags, q, mp));
}

/*
 * Can operate on either a module or a driver queue.
 */
/* ARGSUSED */
int
ip_sioctl_get_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	/*
	 * Have the flags been set correctly till now?
	 */
	ill_t *ill = ipif->ipif_ill;
	phyint_t *phyi = ill->ill_phyint;

	ip1dbg(("ip_sioctl_get_flags(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
	ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0);
	ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0);
	ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0);

	/*
	 * Need a lock since some flags can be set even when there are
	 * references to the ipif.
	 */
	mutex_enter(&ill->ill_lock);
	if (ipip->ipi_cmd_type == IF_CMD) {
		struct ifreq *ifr = (struct ifreq *)if_req;

		/* Get interface flags (low 16 only). */
		ifr->ifr_flags = ((ipif->ipif_flags |
		    ill->ill_flags | phyi->phyint_flags) & 0xffff);
	} else {
		struct lifreq *lifr = (struct lifreq *)if_req;

		/* Get interface flags. */
		lifr->lifr_flags = ipif->ipif_flags |
		    ill->ill_flags | phyi->phyint_flags;
	}
	mutex_exit(&ill->ill_lock);
	return (0);
}

/*
 * We allow the MTU to be set on an ILL, but not have it be different
 * for different IPIFs since we don't actually send packets on IPIFs.
 */
/* ARGSUSED */
int
ip_sioctl_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	int mtu;
	int ip_min_mtu;
	struct ifreq *ifr;
	struct lifreq *lifr;
	ill_t *ill;

	ip1dbg(("ip_sioctl_mtu(%s:%u %p)\n", ipif->ipif_ill->ill_name,
	    ipif->ipif_id, (void *)ipif));
	if (ipip->ipi_cmd_type == IF_CMD) {
		ifr = (struct ifreq *)if_req;
		mtu = ifr->ifr_metric;
	} else {
		lifr = (struct lifreq *)if_req;
		mtu = lifr->lifr_mtu;
	}
	/* Only allow for logical unit zero i.e.
not on "bge0:17" */ 10807 if (ipif->ipif_id != 0) 10808 return (EINVAL); 10809 10810 ill = ipif->ipif_ill; 10811 if (ipif->ipif_isv6) 10812 ip_min_mtu = IPV6_MIN_MTU; 10813 else 10814 ip_min_mtu = IP_MIN_MTU; 10815 10816 mutex_enter(&ill->ill_lock); 10817 if (mtu > ill->ill_max_frag || mtu < ip_min_mtu) { 10818 mutex_exit(&ill->ill_lock); 10819 return (EINVAL); 10820 } 10821 /* Avoid increasing ill_mc_mtu */ 10822 if (ill->ill_mc_mtu > mtu) 10823 ill->ill_mc_mtu = mtu; 10824 10825 /* 10826 * The dce and fragmentation code can handle changes to ill_mtu 10827 * concurrent with sending/fragmenting packets. 10828 */ 10829 ill->ill_mtu = mtu; 10830 ill->ill_flags |= ILLF_FIXEDMTU; 10831 mutex_exit(&ill->ill_lock); 10832 10833 /* 10834 * Make sure all dce_generation checks find out 10835 * that ill_mtu/ill_mc_mtu has changed. 10836 */ 10837 dce_increment_all_generations(ill->ill_isv6, ill->ill_ipst); 10838 10839 /* 10840 * Refresh IPMP meta-interface MTU if necessary. 10841 */ 10842 if (IS_UNDER_IPMP(ill)) 10843 ipmp_illgrp_refresh_mtu(ill->ill_grp); 10844 10845 /* Update the MTU in SCTP's list */ 10846 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 10847 return (0); 10848 } 10849 10850 /* Get interface MTU. */ 10851 /* ARGSUSED */ 10852 int 10853 ip_sioctl_get_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10854 ip_ioctl_cmd_t *ipip, void *if_req) 10855 { 10856 struct ifreq *ifr; 10857 struct lifreq *lifr; 10858 10859 ip1dbg(("ip_sioctl_get_mtu(%s:%u %p)\n", 10860 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10861 10862 /* 10863 * We allow a get on any logical interface even though the set 10864 * can only be done on logical unit 0. 10865 */ 10866 if (ipip->ipi_cmd_type == IF_CMD) { 10867 ifr = (struct ifreq *)if_req; 10868 ifr->ifr_metric = ipif->ipif_ill->ill_mtu; 10869 } else { 10870 lifr = (struct lifreq *)if_req; 10871 lifr->lifr_mtu = ipif->ipif_ill->ill_mtu; 10872 } 10873 return (0); 10874 } 10875 10876 /* Set interface broadcast address. */ 10877 /* ARGSUSED2 */ 10878 int 10879 ip_sioctl_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10880 ip_ioctl_cmd_t *ipip, void *if_req) 10881 { 10882 ipaddr_t addr; 10883 ire_t *ire; 10884 ill_t *ill = ipif->ipif_ill; 10885 ip_stack_t *ipst = ill->ill_ipst; 10886 10887 ip1dbg(("ip_sioctl_brdaddr(%s:%u)\n", ill->ill_name, 10888 ipif->ipif_id)); 10889 10890 ASSERT(IAM_WRITER_IPIF(ipif)); 10891 if (!(ipif->ipif_flags & IPIF_BROADCAST)) 10892 return (EADDRNOTAVAIL); 10893 10894 ASSERT(!(ipif->ipif_isv6)); /* No IPv6 broadcast */ 10895 10896 if (sin->sin_family != AF_INET) 10897 return (EAFNOSUPPORT); 10898 10899 addr = sin->sin_addr.s_addr; 10900 10901 if (ipif->ipif_flags & IPIF_UP) { 10902 /* 10903 * If we are already up, make sure the new 10904 * broadcast address makes sense. If it does, 10905 * there should be an IRE for it already. 10906 */ 10907 ire = ire_ftable_lookup_v4(addr, 0, 0, IRE_BROADCAST, 10908 ill, ipif->ipif_zoneid, NULL, 10909 (MATCH_IRE_ILL | MATCH_IRE_TYPE), 0, ipst, NULL); 10910 if (ire == NULL) { 10911 return (EINVAL); 10912 } else { 10913 ire_refrele(ire); 10914 } 10915 } 10916 /* 10917 * Changing the broadcast addr for this ipif. Since the IRE_BROADCAST 10918 * needs to already exist we never need to change the set of 10919 * IRE_BROADCASTs when we are UP. 10920 */ 10921 if (addr != ipif->ipif_brd_addr) 10922 IN6_IPADDR_TO_V4MAPPED(addr, &ipif->ipif_v6brd_addr); 10923 10924 return (0); 10925 } 10926 10927 /* Get interface broadcast address. 
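 *
 * A hedged userland sketch of the matching get (the name and error
 * handling here are illustrative assumptions):
 *
 *	struct lifreq lifr;
 *	int s = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	bzero(&lifr, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, "bge0:1", sizeof (lifr.lifr_name));
 *	if (ioctl(s, SIOCGLIFBRDADDR, &lifr) == 0) {
 *		struct sockaddr_in *bsin =
 *		    (struct sockaddr_in *)&lifr.lifr_broadaddr;
 *		... bsin->sin_addr now holds the broadcast address ...
 *	}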
*/ 10928 /* ARGSUSED */ 10929 int 10930 ip_sioctl_get_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10931 ip_ioctl_cmd_t *ipip, void *if_req) 10932 { 10933 ip1dbg(("ip_sioctl_get_brdaddr(%s:%u %p)\n", 10934 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10935 if (!(ipif->ipif_flags & IPIF_BROADCAST)) 10936 return (EADDRNOTAVAIL); 10937 10938 /* IPIF_BROADCAST not possible with IPv6 */ 10939 ASSERT(!ipif->ipif_isv6); 10940 *sin = sin_null; 10941 sin->sin_family = AF_INET; 10942 sin->sin_addr.s_addr = ipif->ipif_brd_addr; 10943 return (0); 10944 } 10945 10946 /* 10947 * This routine is called to handle the SIOCS*IFNETMASK IOCTL. 10948 */ 10949 /* ARGSUSED */ 10950 int 10951 ip_sioctl_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 10952 ip_ioctl_cmd_t *ipip, void *if_req) 10953 { 10954 int err = 0; 10955 in6_addr_t v6mask; 10956 10957 ip1dbg(("ip_sioctl_netmask(%s:%u %p)\n", 10958 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 10959 10960 ASSERT(IAM_WRITER_IPIF(ipif)); 10961 10962 if (ipif->ipif_isv6) { 10963 sin6_t *sin6; 10964 10965 if (sin->sin_family != AF_INET6) 10966 return (EAFNOSUPPORT); 10967 10968 sin6 = (sin6_t *)sin; 10969 v6mask = sin6->sin6_addr; 10970 } else { 10971 ipaddr_t mask; 10972 10973 if (sin->sin_family != AF_INET) 10974 return (EAFNOSUPPORT); 10975 10976 mask = sin->sin_addr.s_addr; 10977 if (!ip_contiguous_mask(ntohl(mask))) 10978 return (ENOTSUP); 10979 V4MASK_TO_V6(mask, v6mask); 10980 } 10981 10982 /* 10983 * No big deal if the interface isn't already up, or the mask 10984 * isn't really changing, or this is pt-pt. 10985 */ 10986 if (!(ipif->ipif_flags & IPIF_UP) || 10987 IN6_ARE_ADDR_EQUAL(&v6mask, &ipif->ipif_v6net_mask) || 10988 (ipif->ipif_flags & IPIF_POINTOPOINT)) { 10989 ipif->ipif_v6net_mask = v6mask; 10990 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 10991 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 10992 ipif->ipif_v6net_mask, 10993 ipif->ipif_v6subnet); 10994 } 10995 return (0); 10996 } 10997 /* 10998 * Make sure we have valid net and subnet broadcast ire's 10999 * for the old netmask, if needed by other logical interfaces. 11000 */ 11001 err = ipif_logical_down(ipif, q, mp); 11002 if (err == EINPROGRESS) 11003 return (err); 11004 (void) ipif_down_tail(ipif); 11005 err = ip_sioctl_netmask_tail(ipif, sin, q, mp); 11006 return (err); 11007 } 11008 11009 static int 11010 ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp) 11011 { 11012 in6_addr_t v6mask; 11013 int err = 0; 11014 11015 ip1dbg(("ip_sioctl_netmask_tail(%s:%u %p)\n", 11016 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11017 11018 if (ipif->ipif_isv6) { 11019 sin6_t *sin6; 11020 11021 sin6 = (sin6_t *)sin; 11022 v6mask = sin6->sin6_addr; 11023 } else { 11024 ipaddr_t mask; 11025 11026 mask = sin->sin_addr.s_addr; 11027 V4MASK_TO_V6(mask, v6mask); 11028 } 11029 11030 ipif->ipif_v6net_mask = v6mask; 11031 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 11032 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 11033 ipif->ipif_v6subnet); 11034 } 11035 err = ipif_up(ipif, q, mp); 11036 11037 if (err == 0 || err == EINPROGRESS) { 11038 /* 11039 * The interface must be DL_BOUND if this packet has to 11040 * go out on the wire. Since we only go through a logical 11041 * down and are bound with the driver during an internal 11042 * down/up that is satisfied. 11043 */ 11044 if (!ipif->ipif_isv6 && ipif->ipif_ill->ill_wq != NULL) { 11045 /* Potentially broadcast an address mask reply. 
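			 * That is, an ICMP Address Mask Reply in the
			 * sense of RFC 950, answering hosts that probe
			 * for their subnet mask at boot.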
*/ 11046 ipif_mask_reply(ipif); 11047 } 11048 } 11049 return (err); 11050 } 11051 11052 /* ARGSUSED */ 11053 int 11054 ip_sioctl_netmask_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11055 ip_ioctl_cmd_t *ipip, void *if_req) 11056 { 11057 ip1dbg(("ip_sioctl_netmask_restart(%s:%u %p)\n", 11058 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11059 (void) ipif_down_tail(ipif); 11060 return (ip_sioctl_netmask_tail(ipif, sin, q, mp)); 11061 } 11062 11063 /* Get interface net mask. */ 11064 /* ARGSUSED */ 11065 int 11066 ip_sioctl_get_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11067 ip_ioctl_cmd_t *ipip, void *if_req) 11068 { 11069 struct lifreq *lifr = (struct lifreq *)if_req; 11070 struct sockaddr_in6 *sin6 = (sin6_t *)sin; 11071 11072 ip1dbg(("ip_sioctl_get_netmask(%s:%u %p)\n", 11073 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11074 11075 /* 11076 * net mask can't change since we have a reference to the ipif. 11077 */ 11078 if (ipif->ipif_isv6) { 11079 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 11080 *sin6 = sin6_null; 11081 sin6->sin6_family = AF_INET6; 11082 sin6->sin6_addr = ipif->ipif_v6net_mask; 11083 lifr->lifr_addrlen = 11084 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 11085 } else { 11086 *sin = sin_null; 11087 sin->sin_family = AF_INET; 11088 sin->sin_addr.s_addr = ipif->ipif_net_mask; 11089 if (ipip->ipi_cmd_type == LIF_CMD) { 11090 lifr->lifr_addrlen = 11091 ip_mask_to_plen(ipif->ipif_net_mask); 11092 } 11093 } 11094 return (0); 11095 } 11096 11097 /* ARGSUSED */ 11098 int 11099 ip_sioctl_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11100 ip_ioctl_cmd_t *ipip, void *if_req) 11101 { 11102 ip1dbg(("ip_sioctl_metric(%s:%u %p)\n", 11103 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11104 11105 /* 11106 * Since no applications should ever be setting metrics on underlying 11107 * interfaces, we explicitly fail to smoke 'em out. 11108 */ 11109 if (IS_UNDER_IPMP(ipif->ipif_ill)) 11110 return (EINVAL); 11111 11112 /* 11113 * Set interface metric. We don't use this for 11114 * anything but we keep track of it in case it is 11115 * important to routing applications or such. 11116 */ 11117 if (ipip->ipi_cmd_type == IF_CMD) { 11118 struct ifreq *ifr; 11119 11120 ifr = (struct ifreq *)if_req; 11121 ipif->ipif_ill->ill_metric = ifr->ifr_metric; 11122 } else { 11123 struct lifreq *lifr; 11124 11125 lifr = (struct lifreq *)if_req; 11126 ipif->ipif_ill->ill_metric = lifr->lifr_metric; 11127 } 11128 return (0); 11129 } 11130 11131 /* ARGSUSED */ 11132 int 11133 ip_sioctl_get_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11134 ip_ioctl_cmd_t *ipip, void *if_req) 11135 { 11136 /* Get interface metric. */ 11137 ip1dbg(("ip_sioctl_get_metric(%s:%u %p)\n", 11138 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11139 11140 if (ipip->ipi_cmd_type == IF_CMD) { 11141 struct ifreq *ifr; 11142 11143 ifr = (struct ifreq *)if_req; 11144 ifr->ifr_metric = ipif->ipif_ill->ill_metric; 11145 } else { 11146 struct lifreq *lifr; 11147 11148 lifr = (struct lifreq *)if_req; 11149 lifr->lifr_metric = ipif->ipif_ill->ill_metric; 11150 } 11151 11152 return (0); 11153 } 11154 11155 /* ARGSUSED */ 11156 int 11157 ip_sioctl_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11158 ip_ioctl_cmd_t *ipip, void *if_req) 11159 { 11160 int arp_muxid; 11161 11162 ip1dbg(("ip_sioctl_muxid(%s:%u %p)\n", 11163 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11164 /* 11165 * Set the muxid returned from I_PLINK. 
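	 * ifconfig records the ids returned when it I_PLINKs the IP and
	 * ARP streams under the device, so that a later unplumb can read
	 * them back (SIOCGLIFMUXID) and I_PUNLINK both streams.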
	 */
	if (ipip->ipi_cmd_type == IF_CMD) {
		struct ifreq *ifr = (struct ifreq *)if_req;

		ipif->ipif_ill->ill_muxid = ifr->ifr_ip_muxid;
		arp_muxid = ifr->ifr_arp_muxid;
	} else {
		struct lifreq *lifr = (struct lifreq *)if_req;

		ipif->ipif_ill->ill_muxid = lifr->lifr_ip_muxid;
		arp_muxid = lifr->lifr_arp_muxid;
	}
	arl_set_muxid(ipif->ipif_ill, arp_muxid);
	return (0);
}

/* ARGSUSED */
int
ip_sioctl_get_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	int arp_muxid = 0;

	ip1dbg(("ip_sioctl_get_muxid(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
	/*
	 * Get the muxid saved in ill for I_PUNLINK.
	 */
	arp_muxid = arl_get_muxid(ipif->ipif_ill);
	if (ipip->ipi_cmd_type == IF_CMD) {
		struct ifreq *ifr = (struct ifreq *)if_req;

		ifr->ifr_ip_muxid = ipif->ipif_ill->ill_muxid;
		ifr->ifr_arp_muxid = arp_muxid;
	} else {
		struct lifreq *lifr = (struct lifreq *)if_req;

		lifr->lifr_ip_muxid = ipif->ipif_ill->ill_muxid;
		lifr->lifr_arp_muxid = arp_muxid;
	}
	return (0);
}

/*
 * Set the subnet prefix. Does not modify the broadcast address.
 */
/* ARGSUSED */
int
ip_sioctl_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	int err = 0;
	in6_addr_t v6addr;
	in6_addr_t v6mask;
	boolean_t need_up = B_FALSE;
	int addrlen;

	ip1dbg(("ip_sioctl_subnet(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));

	ASSERT(IAM_WRITER_IPIF(ipif));
	addrlen = ((struct lifreq *)if_req)->lifr_addrlen;

	if (ipif->ipif_isv6) {
		sin6_t *sin6;

		if (sin->sin_family != AF_INET6)
			return (EAFNOSUPPORT);

		sin6 = (sin6_t *)sin;
		v6addr = sin6->sin6_addr;
		if (!ip_remote_addr_ok_v6(&v6addr, &ipv6_all_ones))
			return (EADDRNOTAVAIL);
	} else {
		ipaddr_t addr;

		if (sin->sin_family != AF_INET)
			return (EAFNOSUPPORT);

		addr = sin->sin_addr.s_addr;
		if (!ip_addr_ok_v4(addr, 0xFFFFFFFF))
			return (EADDRNOTAVAIL);
		IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
		/* Add 96 bits */
		addrlen += IPV6_ABITS - IP_ABITS;
	}

	if (ip_plen_to_mask_v6(addrlen, &v6mask) == NULL)
		return (EINVAL);

	/* Check if bits in the address are set past the mask */
	if (!V6_MASK_EQ(v6addr, v6mask, v6addr))
		return (EINVAL);

	if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6subnet, &v6addr) &&
	    IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6net_mask, &v6mask))
		return (0);	/* No change */

	if (ipif->ipif_flags & IPIF_UP) {
		/*
		 * If the interface is already marked up,
		 * we call ipif_down which will take care
		 * of ditching any IREs that have been set
		 * up based on the old interface address.
11270 */ 11271 err = ipif_logical_down(ipif, q, mp); 11272 if (err == EINPROGRESS) 11273 return (err); 11274 (void) ipif_down_tail(ipif); 11275 need_up = B_TRUE; 11276 } 11277 11278 err = ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, need_up); 11279 return (err); 11280 } 11281 11282 static int 11283 ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t v6addr, in6_addr_t v6mask, 11284 queue_t *q, mblk_t *mp, boolean_t need_up) 11285 { 11286 ill_t *ill = ipif->ipif_ill; 11287 int err = 0; 11288 11289 ip1dbg(("ip_sioctl_subnet_tail(%s:%u %p)\n", 11290 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11291 11292 /* Set the new address. */ 11293 mutex_enter(&ill->ill_lock); 11294 ipif->ipif_v6net_mask = v6mask; 11295 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 11296 V6_MASK_COPY(v6addr, ipif->ipif_v6net_mask, 11297 ipif->ipif_v6subnet); 11298 } 11299 mutex_exit(&ill->ill_lock); 11300 11301 if (need_up) { 11302 /* 11303 * Now bring the interface back up. If this 11304 * is the only IPIF for the ILL, ipif_up 11305 * will have to re-bind to the device, so 11306 * we may get back EINPROGRESS, in which 11307 * case, this IOCTL will get completed in 11308 * ip_rput_dlpi when we see the DL_BIND_ACK. 11309 */ 11310 err = ipif_up(ipif, q, mp); 11311 if (err == EINPROGRESS) 11312 return (err); 11313 } 11314 return (err); 11315 } 11316 11317 /* ARGSUSED */ 11318 int 11319 ip_sioctl_subnet_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11320 ip_ioctl_cmd_t *ipip, void *if_req) 11321 { 11322 int addrlen; 11323 in6_addr_t v6addr; 11324 in6_addr_t v6mask; 11325 struct lifreq *lifr = (struct lifreq *)if_req; 11326 11327 ip1dbg(("ip_sioctl_subnet_restart(%s:%u %p)\n", 11328 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11329 (void) ipif_down_tail(ipif); 11330 11331 addrlen = lifr->lifr_addrlen; 11332 if (ipif->ipif_isv6) { 11333 sin6_t *sin6; 11334 11335 sin6 = (sin6_t *)sin; 11336 v6addr = sin6->sin6_addr; 11337 } else { 11338 ipaddr_t addr; 11339 11340 addr = sin->sin_addr.s_addr; 11341 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 11342 addrlen += IPV6_ABITS - IP_ABITS; 11343 } 11344 (void) ip_plen_to_mask_v6(addrlen, &v6mask); 11345 11346 return (ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, B_TRUE)); 11347 } 11348 11349 /* ARGSUSED */ 11350 int 11351 ip_sioctl_get_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11352 ip_ioctl_cmd_t *ipip, void *if_req) 11353 { 11354 struct lifreq *lifr = (struct lifreq *)if_req; 11355 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sin; 11356 11357 ip1dbg(("ip_sioctl_get_subnet(%s:%u %p)\n", 11358 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11359 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 11360 11361 if (ipif->ipif_isv6) { 11362 *sin6 = sin6_null; 11363 sin6->sin6_family = AF_INET6; 11364 sin6->sin6_addr = ipif->ipif_v6subnet; 11365 lifr->lifr_addrlen = 11366 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); 11367 } else { 11368 *sin = sin_null; 11369 sin->sin_family = AF_INET; 11370 sin->sin_addr.s_addr = ipif->ipif_subnet; 11371 lifr->lifr_addrlen = ip_mask_to_plen(ipif->ipif_net_mask); 11372 } 11373 return (0); 11374 } 11375 11376 /* 11377 * Set the IPv6 address token. 
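 *
 * A hedged example of the semantics (values are illustrative only):
 * setting a token of ::1a:2b:3c:4d with lifr_addrlen 64 means the low
 * 64 bits of every autoconfigured address come from the token, so a
 * fe80::/64 link-local prefix yields fe80::1a:2b:3c:4d; the token
 * supplies the interface-id bits and the prefix supplies the rest.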
11378 */ 11379 /* ARGSUSED */ 11380 int 11381 ip_sioctl_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11382 ip_ioctl_cmd_t *ipi, void *if_req) 11383 { 11384 ill_t *ill = ipif->ipif_ill; 11385 int err; 11386 in6_addr_t v6addr; 11387 in6_addr_t v6mask; 11388 boolean_t need_up = B_FALSE; 11389 int i; 11390 sin6_t *sin6 = (sin6_t *)sin; 11391 struct lifreq *lifr = (struct lifreq *)if_req; 11392 int addrlen; 11393 11394 ip1dbg(("ip_sioctl_token(%s:%u %p)\n", 11395 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11396 ASSERT(IAM_WRITER_IPIF(ipif)); 11397 11398 addrlen = lifr->lifr_addrlen; 11399 /* Only allow for logical unit zero i.e. not on "le0:17" */ 11400 if (ipif->ipif_id != 0) 11401 return (EINVAL); 11402 11403 if (!ipif->ipif_isv6) 11404 return (EINVAL); 11405 11406 if (addrlen > IPV6_ABITS) 11407 return (EINVAL); 11408 11409 v6addr = sin6->sin6_addr; 11410 11411 /* 11412 * The length of the token is the length from the end. To get 11413 * the proper mask for this, compute the mask of the bits not 11414 * in the token; ie. the prefix, and then xor to get the mask. 11415 */ 11416 if (ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask) == NULL) 11417 return (EINVAL); 11418 for (i = 0; i < 4; i++) { 11419 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff; 11420 } 11421 11422 if (V6_MASK_EQ(v6addr, v6mask, ill->ill_token) && 11423 ill->ill_token_length == addrlen) 11424 return (0); /* No change */ 11425 11426 if (ipif->ipif_flags & IPIF_UP) { 11427 err = ipif_logical_down(ipif, q, mp); 11428 if (err == EINPROGRESS) 11429 return (err); 11430 (void) ipif_down_tail(ipif); 11431 need_up = B_TRUE; 11432 } 11433 err = ip_sioctl_token_tail(ipif, sin6, addrlen, q, mp, need_up); 11434 return (err); 11435 } 11436 11437 static int 11438 ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen, queue_t *q, 11439 mblk_t *mp, boolean_t need_up) 11440 { 11441 in6_addr_t v6addr; 11442 in6_addr_t v6mask; 11443 ill_t *ill = ipif->ipif_ill; 11444 int i; 11445 int err = 0; 11446 11447 ip1dbg(("ip_sioctl_token_tail(%s:%u %p)\n", 11448 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11449 v6addr = sin6->sin6_addr; 11450 /* 11451 * The length of the token is the length from the end. To get 11452 * the proper mask for this, compute the mask of the bits not 11453 * in the token; ie. the prefix, and then xor to get the mask. 11454 */ 11455 (void) ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask); 11456 for (i = 0; i < 4; i++) 11457 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff; 11458 11459 mutex_enter(&ill->ill_lock); 11460 V6_MASK_COPY(v6addr, v6mask, ill->ill_token); 11461 ill->ill_token_length = addrlen; 11462 ill->ill_manual_token = 1; 11463 11464 /* Reconfigure the link-local address based on this new token */ 11465 ipif_setlinklocal(ill->ill_ipif); 11466 11467 mutex_exit(&ill->ill_lock); 11468 11469 if (need_up) { 11470 /* 11471 * Now bring the interface back up. If this 11472 * is the only IPIF for the ILL, ipif_up 11473 * will have to re-bind to the device, so 11474 * we may get back EINPROGRESS, in which 11475 * case, this IOCTL will get completed in 11476 * ip_rput_dlpi when we see the DL_BIND_ACK. 
11477 */ 11478 err = ipif_up(ipif, q, mp); 11479 if (err == EINPROGRESS) 11480 return (err); 11481 } 11482 return (err); 11483 } 11484 11485 /* ARGSUSED */ 11486 int 11487 ip_sioctl_get_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11488 ip_ioctl_cmd_t *ipi, void *if_req) 11489 { 11490 ill_t *ill; 11491 sin6_t *sin6 = (sin6_t *)sin; 11492 struct lifreq *lifr = (struct lifreq *)if_req; 11493 11494 ip1dbg(("ip_sioctl_get_token(%s:%u %p)\n", 11495 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11496 if (ipif->ipif_id != 0) 11497 return (EINVAL); 11498 11499 ill = ipif->ipif_ill; 11500 if (!ill->ill_isv6) 11501 return (ENXIO); 11502 11503 *sin6 = sin6_null; 11504 sin6->sin6_family = AF_INET6; 11505 ASSERT(!IN6_IS_ADDR_V4MAPPED(&ill->ill_token)); 11506 sin6->sin6_addr = ill->ill_token; 11507 lifr->lifr_addrlen = ill->ill_token_length; 11508 return (0); 11509 } 11510 11511 /* 11512 * Set (hardware) link specific information that might override 11513 * what was acquired through the DL_INFO_ACK. 11514 */ 11515 /* ARGSUSED */ 11516 int 11517 ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11518 ip_ioctl_cmd_t *ipi, void *if_req) 11519 { 11520 ill_t *ill = ipif->ipif_ill; 11521 int ip_min_mtu; 11522 struct lifreq *lifr = (struct lifreq *)if_req; 11523 lif_ifinfo_req_t *lir; 11524 11525 ip1dbg(("ip_sioctl_lnkinfo(%s:%u %p)\n", 11526 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11527 lir = &lifr->lifr_ifinfo; 11528 ASSERT(IAM_WRITER_IPIF(ipif)); 11529 11530 /* Only allow for logical unit zero i.e. not on "bge0:17" */ 11531 if (ipif->ipif_id != 0) 11532 return (EINVAL); 11533 11534 /* Set interface MTU. */ 11535 if (ipif->ipif_isv6) 11536 ip_min_mtu = IPV6_MIN_MTU; 11537 else 11538 ip_min_mtu = IP_MIN_MTU; 11539 11540 /* 11541 * Verify values before we set anything. Allow zero to 11542 * mean unspecified. 11543 * 11544 * XXX We should be able to set the user-defined lir_mtu to some value 11545 * that is greater than ill_current_frag but less than ill_max_frag- the 11546 * ill_max_frag value tells us the max MTU that can be handled by the 11547 * datalink, whereas the ill_current_frag is dynamically computed for 11548 * some link-types like tunnels, based on the tunnel PMTU. However, 11549 * since there is currently no way of distinguishing between 11550 * administratively fixed link mtu values (e.g., those set via 11551 * /sbin/dladm) and dynamically discovered MTUs (e.g., those discovered 11552 * for tunnels) we conservatively choose the ill_current_frag as the 11553 * upper-bound. 11554 */ 11555 if (lir->lir_maxmtu != 0 && 11556 (lir->lir_maxmtu > ill->ill_current_frag || 11557 lir->lir_maxmtu < ip_min_mtu)) 11558 return (EINVAL); 11559 if (lir->lir_reachtime != 0 && 11560 lir->lir_reachtime > ND_MAX_REACHTIME) 11561 return (EINVAL); 11562 if (lir->lir_reachretrans != 0 && 11563 lir->lir_reachretrans > ND_MAX_REACHRETRANSTIME) 11564 return (EINVAL); 11565 11566 mutex_enter(&ill->ill_lock); 11567 /* 11568 * The dce and fragmentation code can handle changes to ill_mtu 11569 * concurrent with sending/fragmenting packets. 
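 *
 * A hedged worked example of the range checks above: on an IPv6 link
 * whose ill_current_frag is 1480, ip_min_mtu is IPV6_MIN_MTU (1280),
 * so a lir_maxmtu of 1400 is accepted while 1200 or 9000 draws EINVAL.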
11570 */ 11571 if (lir->lir_maxmtu != 0) 11572 ill->ill_user_mtu = lir->lir_maxmtu; 11573 11574 if (lir->lir_reachtime != 0) 11575 ill->ill_reachable_time = lir->lir_reachtime; 11576 11577 if (lir->lir_reachretrans != 0) 11578 ill->ill_reachable_retrans_time = lir->lir_reachretrans; 11579 11580 ill->ill_max_hops = lir->lir_maxhops; 11581 ill->ill_max_buf = ND_MAX_Q; 11582 if (!(ill->ill_flags & ILLF_FIXEDMTU) && ill->ill_user_mtu != 0) { 11583 /* 11584 * ill_mtu is the actual interface MTU, obtained as the min 11585 * of user-configured mtu and the value announced by the 11586 * driver (via DL_NOTE_SDU_SIZE/DL_INFO_ACK). Note that since 11587 * we have already made the choice of requiring 11588 * ill_user_mtu < ill_current_frag by the time we get here, 11589 * the ill_mtu effectively gets assigned to the ill_user_mtu 11590 * here. 11591 */ 11592 ill->ill_mtu = MIN(ill->ill_current_frag, ill->ill_user_mtu); 11593 ill->ill_mc_mtu = MIN(ill->ill_mc_mtu, ill->ill_user_mtu); 11594 } 11595 mutex_exit(&ill->ill_lock); 11596 11597 /* 11598 * Make sure all dce_generation checks find out 11599 * that ill_mtu/ill_mc_mtu has changed. 11600 */ 11601 if (!(ill->ill_flags & ILLF_FIXEDMTU) && (lir->lir_maxmtu != 0)) 11602 dce_increment_all_generations(ill->ill_isv6, ill->ill_ipst); 11603 11604 /* 11605 * Refresh IPMP meta-interface MTU if necessary. 11606 */ 11607 if (IS_UNDER_IPMP(ill)) 11608 ipmp_illgrp_refresh_mtu(ill->ill_grp); 11609 11610 return (0); 11611 } 11612 11613 /* ARGSUSED */ 11614 int 11615 ip_sioctl_get_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 11616 ip_ioctl_cmd_t *ipi, void *if_req) 11617 { 11618 struct lif_ifinfo_req *lir; 11619 ill_t *ill = ipif->ipif_ill; 11620 11621 ip1dbg(("ip_sioctl_get_lnkinfo(%s:%u %p)\n", 11622 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 11623 if (ipif->ipif_id != 0) 11624 return (EINVAL); 11625 11626 lir = &((struct lifreq *)if_req)->lifr_ifinfo; 11627 lir->lir_maxhops = ill->ill_max_hops; 11628 lir->lir_reachtime = ill->ill_reachable_time; 11629 lir->lir_reachretrans = ill->ill_reachable_retrans_time; 11630 lir->lir_maxmtu = ill->ill_mtu; 11631 11632 return (0); 11633 } 11634 11635 /* 11636 * Return best guess as to the subnet mask for the specified address. 11637 * Based on the subnet masks for all the configured interfaces. 11638 * 11639 * We end up returning a zero mask in the case of default, multicast or 11640 * experimental. 11641 */ 11642 static ipaddr_t 11643 ip_subnet_mask(ipaddr_t addr, ipif_t **ipifp, ip_stack_t *ipst) 11644 { 11645 ipaddr_t net_mask; 11646 ill_t *ill; 11647 ipif_t *ipif; 11648 ill_walk_context_t ctx; 11649 ipif_t *fallback_ipif = NULL; 11650 11651 net_mask = ip_net_mask(addr); 11652 if (net_mask == 0) { 11653 *ipifp = NULL; 11654 return (0); 11655 } 11656 11657 /* Let's check to see if this is maybe a local subnet route. */ 11658 /* this function only applies to IPv4 interfaces */ 11659 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 11660 ill = ILL_START_WALK_V4(&ctx, ipst); 11661 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 11662 mutex_enter(&ill->ill_lock); 11663 for (ipif = ill->ill_ipif; ipif != NULL; 11664 ipif = ipif->ipif_next) { 11665 if (IPIF_IS_CONDEMNED(ipif)) 11666 continue; 11667 if (!(ipif->ipif_flags & IPIF_UP)) 11668 continue; 11669 if ((ipif->ipif_subnet & net_mask) == 11670 (addr & net_mask)) { 11671 /* 11672 * Don't trust pt-pt interfaces if there are 11673 * other interfaces. 
11674 */ 11675 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 11676 if (fallback_ipif == NULL) { 11677 ipif_refhold_locked(ipif); 11678 fallback_ipif = ipif; 11679 } 11680 continue; 11681 } 11682 11683 /* 11684 * Fine. Just assume the same net mask as the 11685 * directly attached subnet interface is using. 11686 */ 11687 ipif_refhold_locked(ipif); 11688 mutex_exit(&ill->ill_lock); 11689 rw_exit(&ipst->ips_ill_g_lock); 11690 if (fallback_ipif != NULL) 11691 ipif_refrele(fallback_ipif); 11692 *ipifp = ipif; 11693 return (ipif->ipif_net_mask); 11694 } 11695 } 11696 mutex_exit(&ill->ill_lock); 11697 } 11698 rw_exit(&ipst->ips_ill_g_lock); 11699 11700 *ipifp = fallback_ipif; 11701 return ((fallback_ipif != NULL) ? 11702 fallback_ipif->ipif_net_mask : net_mask); 11703 } 11704 11705 /* 11706 * ip_sioctl_copyin_setup calls ip_wput_ioctl to process the IP_IOCTL ioctl. 11707 */ 11708 static void 11709 ip_wput_ioctl(queue_t *q, mblk_t *mp) 11710 { 11711 IOCP iocp; 11712 ipft_t *ipft; 11713 ipllc_t *ipllc; 11714 mblk_t *mp1; 11715 cred_t *cr; 11716 int error = 0; 11717 conn_t *connp; 11718 11719 ip1dbg(("ip_wput_ioctl")); 11720 iocp = (IOCP)mp->b_rptr; 11721 mp1 = mp->b_cont; 11722 if (mp1 == NULL) { 11723 iocp->ioc_error = EINVAL; 11724 mp->b_datap->db_type = M_IOCNAK; 11725 iocp->ioc_count = 0; 11726 qreply(q, mp); 11727 return; 11728 } 11729 11730 /* 11731 * These IOCTLs provide various control capabilities to 11732 * upstream agents such as ULPs and processes. There 11733 * are currently two such IOCTLs implemented. They 11734 * are used by TCP to provide update information for 11735 * existing IREs and to forcibly delete an IRE for a 11736 * host that is not responding, thereby forcing an 11737 * attempt at a new route. 11738 */ 11739 iocp->ioc_error = EINVAL; 11740 if (!pullupmsg(mp1, sizeof (ipllc->ipllc_cmd))) 11741 goto done; 11742 11743 ipllc = (ipllc_t *)mp1->b_rptr; 11744 for (ipft = ip_ioctl_ftbl; ipft->ipft_pfi; ipft++) { 11745 if (ipllc->ipllc_cmd == ipft->ipft_cmd) 11746 break; 11747 } 11748 /* 11749 * prefer credential from mblk over ioctl; 11750 * see ip_sioctl_copyin_setup 11751 */ 11752 cr = msg_getcred(mp, NULL); 11753 if (cr == NULL) 11754 cr = iocp->ioc_cr; 11755 11756 /* 11757 * Refhold the conn in case the request gets queued up in some lookup 11758 */ 11759 ASSERT(CONN_Q(q)); 11760 connp = Q_TO_CONN(q); 11761 CONN_INC_REF(connp); 11762 CONN_INC_IOCTLREF(connp); 11763 if (ipft->ipft_pfi && 11764 ((mp1->b_wptr - mp1->b_rptr) >= ipft->ipft_min_size || 11765 pullupmsg(mp1, ipft->ipft_min_size))) { 11766 error = (*ipft->ipft_pfi)(q, 11767 (ipft->ipft_flags & IPFT_F_SELF_REPLY) ? mp : mp1, cr); 11768 } 11769 if (ipft->ipft_flags & IPFT_F_SELF_REPLY) { 11770 /* 11771 * CONN_OPER_PENDING_DONE happens in the function called 11772 * through ipft_pfi above. 11773 */ 11774 return; 11775 } 11776 11777 CONN_DEC_IOCTLREF(connp); 11778 CONN_OPER_PENDING_DONE(connp); 11779 if (ipft->ipft_flags & IPFT_F_NO_REPLY) { 11780 freemsg(mp); 11781 return; 11782 } 11783 iocp->ioc_error = error; 11784 11785 done: 11786 mp->b_datap->db_type = M_IOCACK; 11787 if (iocp->ioc_error) 11788 iocp->ioc_count = 0; 11789 qreply(q, mp); 11790 } 11791 11792 /* 11793 * Assign a unique id for the ipif. This is used by sctp_addr.c 11794 * Note: remove if sctp_addr.c is redone to not shadow ill/ipif data structures. 
11795 */ 11796 static void 11797 ipif_assign_seqid(ipif_t *ipif) 11798 { 11799 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 11800 11801 ipif->ipif_seqid = atomic_add_64_nv(&ipst->ips_ipif_g_seqid, 1); 11802 } 11803 11804 /* 11805 * Clone the contents of `sipif' to `dipif'. Requires that both ipifs are 11806 * administratively down (i.e., no DAD), of the same type, and locked. Note 11807 * that the clone is complete -- including the seqid -- and the expectation is 11808 * that the caller will either free or overwrite `sipif' before it's unlocked. 11809 */ 11810 static void 11811 ipif_clone(const ipif_t *sipif, ipif_t *dipif) 11812 { 11813 ASSERT(MUTEX_HELD(&sipif->ipif_ill->ill_lock)); 11814 ASSERT(MUTEX_HELD(&dipif->ipif_ill->ill_lock)); 11815 ASSERT(!(sipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE))); 11816 ASSERT(!(dipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE))); 11817 ASSERT(sipif->ipif_ire_type == dipif->ipif_ire_type); 11818 11819 dipif->ipif_flags = sipif->ipif_flags; 11820 dipif->ipif_zoneid = sipif->ipif_zoneid; 11821 dipif->ipif_v6subnet = sipif->ipif_v6subnet; 11822 dipif->ipif_v6lcl_addr = sipif->ipif_v6lcl_addr; 11823 dipif->ipif_v6net_mask = sipif->ipif_v6net_mask; 11824 dipif->ipif_v6brd_addr = sipif->ipif_v6brd_addr; 11825 dipif->ipif_v6pp_dst_addr = sipif->ipif_v6pp_dst_addr; 11826 11827 /* 11828 * As per the comment atop the function, we assume that these sipif 11829 * fields will be changed before sipif is unlocked. 11830 */ 11831 dipif->ipif_seqid = sipif->ipif_seqid; 11832 dipif->ipif_state_flags = sipif->ipif_state_flags; 11833 } 11834 11835 /* 11836 * Transfer the contents of `sipif' to `dipif', and then free (if `virgipif' 11837 * is NULL) or overwrite `sipif' with `virgipif', which must be a virgin 11838 * (unreferenced) ipif. Also, if `sipif' is used by the current xop, then 11839 * transfer the xop to `dipif'. Requires that all ipifs are administratively 11840 * down (i.e., no DAD), of the same type, and unlocked. 11841 */ 11842 static void 11843 ipif_transfer(ipif_t *sipif, ipif_t *dipif, ipif_t *virgipif) 11844 { 11845 ipsq_t *ipsq = sipif->ipif_ill->ill_phyint->phyint_ipsq; 11846 ipxop_t *ipx = ipsq->ipsq_xop; 11847 11848 ASSERT(sipif != dipif); 11849 ASSERT(sipif != virgipif); 11850 11851 /* 11852 * Grab all of the locks that protect the ipif in a defined order. 11853 */ 11854 GRAB_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill); 11855 11856 ipif_clone(sipif, dipif); 11857 if (virgipif != NULL) { 11858 ipif_clone(virgipif, sipif); 11859 mi_free(virgipif); 11860 } 11861 11862 RELEASE_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill); 11863 11864 /* 11865 * Transfer ownership of the current xop, if necessary. 11866 */ 11867 if (ipx->ipx_current_ipif == sipif) { 11868 ASSERT(ipx->ipx_pending_ipif == NULL); 11869 mutex_enter(&ipx->ipx_lock); 11870 ipx->ipx_current_ipif = dipif; 11871 mutex_exit(&ipx->ipx_lock); 11872 } 11873 11874 if (virgipif == NULL) 11875 mi_free(sipif); 11876 } 11877 11878 /* 11879 * checks if: 11880 * - <ill_name>:<ipif_id> is at most LIFNAMSIZ - 1 and 11881 * - logical interface is within the allowed range 11882 */ 11883 static int 11884 is_lifname_valid(ill_t *ill, unsigned int ipif_id) 11885 { 11886 if (snprintf(NULL, 0, "%s:%d", ill->ill_name, ipif_id) >= LIFNAMSIZ) 11887 return (ENAMETOOLONG); 11888 11889 if (ipif_id >= ill->ill_ipst->ips_ip_addrs_per_if) 11890 return (ERANGE); 11891 return (0); 11892 } 11893 11894 /* 11895 * Insert the ipif, so that the list of ipifs on the ill will be sorted 11896 * with respect to ipif_id. 
Note that an ipif with an ipif_id of -1 will
11897  * be inserted into the first space available in the list. The value of
11898  * ipif_id will then be set to the appropriate value for its position
11899  * (e.g., with ipifs 0, 1 and 3 already in the list, an ipif created with
11900  * an id of -1 is inserted as id 2).
 */
static int
ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock)
{
11903 	ill_t *ill;
11904 	ipif_t *tipif;
11905 	ipif_t **tipifp;
11906 	int id, err;
11907 	ip_stack_t *ipst;
11908 
11909 	ASSERT(ipif->ipif_ill->ill_net_type == IRE_LOOPBACK ||
11910 	    IAM_WRITER_IPIF(ipif));
11911 
11912 	ill = ipif->ipif_ill;
11913 	ASSERT(ill != NULL);
11914 	ipst = ill->ill_ipst;
11915 
11916 	/*
11917 	 * In the case of lo0:0 we already hold the ill_g_lock.
11918 	 * ill_lookup_on_name (acquires ill_g_lock) -> ipif_allocate ->
11919 	 * ipif_insert.
11920 	 */
11921 	if (acquire_g_lock)
11922 		rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
11923 	mutex_enter(&ill->ill_lock);
11924 	id = ipif->ipif_id;
11925 	tipifp = &(ill->ill_ipif);
11926 	if (id == -1) {	/* need to find a real id */
11927 		id = 0;
11928 		while ((tipif = *tipifp) != NULL) {
11929 			ASSERT(tipif->ipif_id >= id);
11930 			if (tipif->ipif_id != id)
11931 				break;	/* non-consecutive id */
11932 			id++;
11933 			tipifp = &(tipif->ipif_next);
11934 		}
11935 		if ((err = is_lifname_valid(ill, id)) != 0) {
11936 			mutex_exit(&ill->ill_lock);
11937 			if (acquire_g_lock)
11938 				rw_exit(&ipst->ips_ill_g_lock);
11939 			return (err);
11940 		}
11941 		ipif->ipif_id = id;	/* assign new id */
11942 	} else if ((err = is_lifname_valid(ill, id)) == 0) {
11943 		/* we have a real id; insert ipif in the right place */
11944 		while ((tipif = *tipifp) != NULL) {
11945 			ASSERT(tipif->ipif_id != id);
11946 			if (tipif->ipif_id > id)
11947 				break;	/* found correct location */
11948 			tipifp = &(tipif->ipif_next);
11949 		}
11950 	} else {
11951 		mutex_exit(&ill->ill_lock);
11952 		if (acquire_g_lock)
11953 			rw_exit(&ipst->ips_ill_g_lock);
11954 		return (err);
11955 	}
11956 
11957 	ASSERT(tipifp != &(ill->ill_ipif) || id == 0);
11958 
11959 	ipif->ipif_next = tipif;
11960 	*tipifp = ipif;
11961 	mutex_exit(&ill->ill_lock);
11962 	if (acquire_g_lock)
11963 		rw_exit(&ipst->ips_ill_g_lock);
11964 
11965 	return (0);
11966 }
11967 
11968 static void
11969 ipif_remove(ipif_t *ipif)
11970 {
11971 	ipif_t **ipifp;
11972 	ill_t *ill = ipif->ipif_ill;
11973 
11974 	ASSERT(RW_WRITE_HELD(&ill->ill_ipst->ips_ill_g_lock));
11975 
11976 	mutex_enter(&ill->ill_lock);
11977 	ipifp = &ill->ill_ipif;
11978 	for (; *ipifp != NULL; ipifp = &ipifp[0]->ipif_next) {
11979 		if (*ipifp == ipif) {
11980 			*ipifp = ipif->ipif_next;
11981 			break;
11982 		}
11983 	}
11984 	mutex_exit(&ill->ill_lock);
11985 }
11986 
11987 /*
11988  * Allocate and initialize a new interface control structure. (Always
11989  * called as writer.)
11990  * When ipif_allocate() is called from ip_ll_subnet_defaults, the ill
11991  * is not part of the global linked list of ills. ipif_seqid is unique
11992  * in the system, and to preserve that uniqueness it is assigned only
11993  * when the ill becomes part of the global list. At that point the ill will
11994  * have a name. If it doesn't get assigned here, it will get assigned
11995  * in ipif_set_values() as part of SIOCSLIFNAME processing.
11996  * Additionally, if we come here from ip_ll_subnet_defaults, we don't set
11997  * the interface flags or any other information from the DL_INFO_ACK for
11998  * DL_STYLE2 drivers (initialize == B_FALSE), since we won't have them at
11999  * this point. The flags etc. will be set in ip_ll_subnet_defaults when the
12000  * second DL_INFO_ACK comes in from the driver.
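 *
 * A hedged sketch of that DL_STYLE2 flow (standard DLPI, assumed rather
 * than restated from this file): the first DL_INFO_ACK only reveals the
 * style; once SIOCSLIFNAME supplies the PPA, IP issues DL_ATTACH_REQ
 * followed by another DL_INFO_REQ, and the resulting second DL_INFO_ACK
 * carries the link details that ip_ll_subnet_defaults uses for the flags.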
12001  */
12002 static ipif_t *
12003 ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize,
12004     boolean_t insert, int *errorp)
12005 {
12006 	int err;
12007 	ipif_t *ipif;
12008 	ip_stack_t *ipst = ill->ill_ipst;
12009 
12010 	ip1dbg(("ipif_allocate(%s:%d ill %p)\n",
12011 	    ill->ill_name, id, (void *)ill));
12012 	ASSERT(ire_type == IRE_LOOPBACK || IAM_WRITER_ILL(ill));
12013 
12014 	if (errorp != NULL)
12015 		*errorp = 0;
12016 
12017 	if ((ipif = mi_alloc(sizeof (ipif_t), BPRI_MED)) == NULL) {
12018 		if (errorp != NULL)
12019 			*errorp = ENOMEM;
12020 		return (NULL);
12021 	}
12022 	*ipif = ipif_zero;	/* start clean */
12023 
12024 	ipif->ipif_ill = ill;
12025 	ipif->ipif_id = id;	/* could be -1 */
12026 	/*
12027 	 * Inherit the zoneid from the ill; for the shared stack instance
12028 	 * this is always the global zone
12029 	 */
12030 	ipif->ipif_zoneid = ill->ill_zoneid;
12031 
12032 	ipif->ipif_refcnt = 0;
12033 
12034 	if (insert) {
12035 		if ((err = ipif_insert(ipif, ire_type != IRE_LOOPBACK)) != 0) {
12036 			mi_free(ipif);
12037 			if (errorp != NULL)
12038 				*errorp = err;
12039 			return (NULL);
12040 		}
12041 		/* -1 id should have been replaced by real id */
12042 		id = ipif->ipif_id;
12043 		ASSERT(id >= 0);
12044 	}
12045 
12046 	if (ill->ill_name[0] != '\0')
12047 		ipif_assign_seqid(ipif);
12048 
12049 	/*
12050 	 * If this is the zeroth ipif on the IPMP ill, create the illgrp
12051 	 * (which must not exist yet because the zeroth ipif is created once
12052 	 * per ill). However, do not link it to the ipmp_grp_t until
12053 	 * I_PLINK is called; see ip_sioctl_plink_ipmp() for details.
12054 	 */
12055 	if (id == 0 && IS_IPMP(ill)) {
12056 		if (ipmp_illgrp_create(ill) == NULL) {
12057 			if (insert) {
12058 				rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
12059 				ipif_remove(ipif);
12060 				rw_exit(&ipst->ips_ill_g_lock);
12061 			}
12062 			mi_free(ipif);
12063 			if (errorp != NULL)
12064 				*errorp = ENOMEM;
12065 			return (NULL);
12066 		}
12067 	}
12068 
12069 	/*
12070 	 * We grab ill_lock to protect the flag changes. The ipif is still
12071 	 * not up and can't be looked up until the ioctl completes and the
12072 	 * IPIF_CHANGING flag is cleared.
12073 	 */
12074 	mutex_enter(&ill->ill_lock);
12075 
12076 	ipif->ipif_ire_type = ire_type;
12077 
12078 	if (ipif->ipif_isv6) {
12079 		ill->ill_flags |= ILLF_IPV6;
12080 	} else {
12081 		ipaddr_t inaddr_any = INADDR_ANY;
12082 
12083 		ill->ill_flags |= ILLF_IPV4;
12084 
12085 		/* Keep the IN6_IS_ADDR_V4MAPPED assertions happy */
12086 		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
12087 		    &ipif->ipif_v6lcl_addr);
12088 		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
12089 		    &ipif->ipif_v6subnet);
12090 		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
12091 		    &ipif->ipif_v6net_mask);
12092 		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
12093 		    &ipif->ipif_v6brd_addr);
12094 		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
12095 		    &ipif->ipif_v6pp_dst_addr);
12096 	}
12097 
12098 	/*
12099 	 * Don't set the interface flags etc. now, will do it in
12100 	 * ip_ll_subnet_defaults.
12101 	 */
12102 	if (!initialize)
12103 		goto out;
12104 
12105 	/*
12106 	 * NOTE: The IPMP meta-interface is special-cased because it starts
12107 	 * with no underlying interfaces (and thus an unknown broadcast
12108 	 * address length), but all interfaces that can be placed into an IPMP
12109 	 * group are required to be broadcast-capable.
12110 	 */
12111 	if (ill->ill_bcast_addr_length != 0 || IS_IPMP(ill)) {
12112 		/*
12113 		 * Later detect lack of DLPI driver multicast capability by
12114 		 * catching DL_ENABMULTI_REQ errors in ip_rput_dlpi().
12115 */ 12116 ill->ill_flags |= ILLF_MULTICAST; 12117 if (!ipif->ipif_isv6) 12118 ipif->ipif_flags |= IPIF_BROADCAST; 12119 } else { 12120 if (ill->ill_net_type != IRE_LOOPBACK) { 12121 if (ipif->ipif_isv6) 12122 /* 12123 * Note: xresolv interfaces will eventually need 12124 * NOARP set here as well, but that will require 12125 * those external resolvers to have some 12126 * knowledge of that flag and act appropriately. 12127 * Not to be changed at present. 12128 */ 12129 ill->ill_flags |= ILLF_NONUD; 12130 else 12131 ill->ill_flags |= ILLF_NOARP; 12132 } 12133 if (ill->ill_phys_addr_length == 0) { 12134 if (IS_VNI(ill)) { 12135 ipif->ipif_flags |= IPIF_NOXMIT; 12136 } else { 12137 /* pt-pt supports multicast. */ 12138 ill->ill_flags |= ILLF_MULTICAST; 12139 if (ill->ill_net_type != IRE_LOOPBACK) 12140 ipif->ipif_flags |= IPIF_POINTOPOINT; 12141 } 12142 } 12143 } 12144 out: 12145 mutex_exit(&ill->ill_lock); 12146 return (ipif); 12147 } 12148 12149 /* 12150 * Remove the neighbor cache entries associated with this logical 12151 * interface. 12152 */ 12153 int 12154 ipif_arp_down(ipif_t *ipif) 12155 { 12156 ill_t *ill = ipif->ipif_ill; 12157 int err = 0; 12158 12159 ip1dbg(("ipif_arp_down(%s:%u)\n", ill->ill_name, ipif->ipif_id)); 12160 ASSERT(IAM_WRITER_IPIF(ipif)); 12161 12162 DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_down", 12163 ill_t *, ill, ipif_t *, ipif); 12164 ipif_nce_down(ipif); 12165 12166 /* 12167 * If this is the last ipif that is going down and there are no 12168 * duplicate addresses we may yet attempt to re-probe, then we need to 12169 * clean up ARP completely. 12170 */ 12171 if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 && 12172 !ill->ill_logical_down && ill->ill_net_type == IRE_IF_RESOLVER) { 12173 /* 12174 * If this was the last ipif on an IPMP interface, purge any 12175 * static ARP entries associated with it. 12176 */ 12177 if (IS_IPMP(ill)) 12178 ipmp_illgrp_refresh_arpent(ill->ill_grp); 12179 12180 /* UNBIND, DETACH */ 12181 err = arp_ll_down(ill); 12182 } 12183 12184 return (err); 12185 } 12186 12187 /* 12188 * Get the resolver set up for a new IP address. (Always called as writer.) 12189 * Called both for IPv4 and IPv6 interfaces, though it only does some 12190 * basic DAD related initialization for IPv6. Honors ILLF_NOARP. 12191 * 12192 * The enumerated value res_act tunes the behavior: 12193 * * Res_act_initial: set up all the resolver structures for a new 12194 * IP address. 12195 * * Res_act_defend: tell ARP that it needs to send a single gratuitous 12196 * ARP message in defense of the address. 12197 * * Res_act_rebind: tell ARP to change the hardware address for an IP 12198 * address (and issue gratuitous ARPs). Used by ipmp_ill_bind_ipif(). 12199 * 12200 * Returns zero on success, or an errno upon failure. 12201 */ 12202 int 12203 ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act) 12204 { 12205 ill_t *ill = ipif->ipif_ill; 12206 int err; 12207 boolean_t was_dup; 12208 12209 ip1dbg(("ipif_resolver_up(%s:%u) flags 0x%x\n", 12210 ill->ill_name, ipif->ipif_id, (uint_t)ipif->ipif_flags)); 12211 ASSERT(IAM_WRITER_IPIF(ipif)); 12212 12213 was_dup = B_FALSE; 12214 if (res_act == Res_act_initial) { 12215 ipif->ipif_addr_ready = 0; 12216 /* 12217 * We're bringing an interface up here. There's no way that we 12218 * should need to shut down ARP now. 
12219 */ 12220 mutex_enter(&ill->ill_lock); 12221 if (ipif->ipif_flags & IPIF_DUPLICATE) { 12222 ipif->ipif_flags &= ~IPIF_DUPLICATE; 12223 ill->ill_ipif_dup_count--; 12224 was_dup = B_TRUE; 12225 } 12226 mutex_exit(&ill->ill_lock); 12227 } 12228 if (ipif->ipif_recovery_id != 0) 12229 (void) untimeout(ipif->ipif_recovery_id); 12230 ipif->ipif_recovery_id = 0; 12231 if (ill->ill_net_type != IRE_IF_RESOLVER) { 12232 ipif->ipif_addr_ready = 1; 12233 return (0); 12234 } 12235 /* NDP will set the ipif_addr_ready flag when it's ready */ 12236 if (ill->ill_isv6) 12237 return (0); 12238 12239 err = ipif_arp_up(ipif, res_act, was_dup); 12240 return (err); 12241 } 12242 12243 /* 12244 * This routine restarts IPv4/IPv6 duplicate address detection (DAD) 12245 * when a link has just gone back up. 12246 */ 12247 static void 12248 ipif_nce_start_dad(ipif_t *ipif) 12249 { 12250 ncec_t *ncec; 12251 ill_t *ill = ipif->ipif_ill; 12252 boolean_t isv6 = ill->ill_isv6; 12253 12254 if (isv6) { 12255 ncec = ncec_lookup_illgrp_v6(ipif->ipif_ill, 12256 &ipif->ipif_v6lcl_addr); 12257 } else { 12258 ipaddr_t v4addr; 12259 12260 if (ill->ill_net_type != IRE_IF_RESOLVER || 12261 (ipif->ipif_flags & IPIF_UNNUMBERED) || 12262 ipif->ipif_lcl_addr == INADDR_ANY) { 12263 /* 12264 * If we can't contact ARP for some reason, 12265 * that's not really a problem. Just send 12266 * out the routing socket notification that 12267 * DAD completion would have done, and continue. 12268 */ 12269 ipif_mask_reply(ipif); 12270 ipif_up_notify(ipif); 12271 ipif->ipif_addr_ready = 1; 12272 return; 12273 } 12274 12275 IN6_V4MAPPED_TO_IPADDR(&ipif->ipif_v6lcl_addr, v4addr); 12276 ncec = ncec_lookup_illgrp_v4(ipif->ipif_ill, &v4addr); 12277 } 12278 12279 if (ncec == NULL) { 12280 ip1dbg(("couldn't find ncec for ipif %p leaving !ready\n", 12281 (void *)ipif)); 12282 return; 12283 } 12284 if (!nce_restart_dad(ncec)) { 12285 /* 12286 * If we can't restart DAD for some reason, that's not really a 12287 * problem. Just send out the routing socket notification that 12288 * DAD completion would have done, and continue. 12289 */ 12290 ipif_up_notify(ipif); 12291 ipif->ipif_addr_ready = 1; 12292 } 12293 ncec_refrele(ncec); 12294 } 12295 12296 /* 12297 * Restart duplicate address detection on all interfaces on the given ill. 12298 * 12299 * This is called when an interface transitions from down to up 12300 * (DL_NOTE_LINK_UP) or up to down (DL_NOTE_LINK_DOWN). 12301 * 12302 * Note that since the underlying physical link has transitioned, we must cause 12303 * at least one routing socket message to be sent here, either via DAD 12304 * completion or just by default on the first ipif. (If we don't do this, then 12305 * in.mpathd will see long delays when doing link-based failure recovery.) 12306 */ 12307 void 12308 ill_restart_dad(ill_t *ill, boolean_t went_up) 12309 { 12310 ipif_t *ipif; 12311 12312 if (ill == NULL) 12313 return; 12314 12315 /* 12316 * If layer two doesn't support duplicate address detection, then just 12317 * send the routing socket message now and be done with it. 12318 */ 12319 if (!ill->ill_isv6 && arp_no_defense) { 12320 ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT); 12321 return; 12322 } 12323 12324 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 12325 if (went_up) { 12326 12327 if (ipif->ipif_flags & IPIF_UP) { 12328 ipif_nce_start_dad(ipif); 12329 } else if (ipif->ipif_flags & IPIF_DUPLICATE) { 12330 /* 12331 * kick off the bring-up process now. 
12332 */ 12333 ipif_do_recovery(ipif); 12334 } else { 12335 /* 12336 * Unfortunately, the first ipif is "special" 12337 * and represents the underlying ill in the 12338 * routing socket messages. Thus, when this 12339 * one ipif is down, we must still notify so 12340 * that the user knows the IFF_RUNNING status 12341 * change. (If the first ipif is up, then 12342 * we'll handle eventual routing socket 12343 * notification via DAD completion.) 12344 */ 12345 if (ipif == ill->ill_ipif) { 12346 ip_rts_ifmsg(ill->ill_ipif, 12347 RTSQ_DEFAULT); 12348 } 12349 } 12350 } else { 12351 /* 12352 * After link down, we'll need to send a new routing 12353 * message when the link comes back, so clear 12354 * ipif_addr_ready. 12355 */ 12356 ipif->ipif_addr_ready = 0; 12357 } 12358 } 12359 12360 /* 12361 * If we've torn down links, then notify the user right away. 12362 */ 12363 if (!went_up) 12364 ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT); 12365 } 12366 12367 static void 12368 ipsq_delete(ipsq_t *ipsq) 12369 { 12370 ipxop_t *ipx = ipsq->ipsq_xop; 12371 12372 ipsq->ipsq_ipst = NULL; 12373 ASSERT(ipsq->ipsq_phyint == NULL); 12374 ASSERT(ipsq->ipsq_xop != NULL); 12375 ASSERT(ipsq->ipsq_xopq_mphead == NULL && ipx->ipx_mphead == NULL); 12376 ASSERT(ipx->ipx_pending_mp == NULL); 12377 kmem_free(ipsq, sizeof (ipsq_t)); 12378 } 12379 12380 static int 12381 ill_up_ipifs_on_ill(ill_t *ill, queue_t *q, mblk_t *mp) 12382 { 12383 int err = 0; 12384 ipif_t *ipif; 12385 12386 if (ill == NULL) 12387 return (0); 12388 12389 ASSERT(IAM_WRITER_ILL(ill)); 12390 ill->ill_up_ipifs = B_TRUE; 12391 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 12392 if (ipif->ipif_was_up) { 12393 if (!(ipif->ipif_flags & IPIF_UP)) 12394 err = ipif_up(ipif, q, mp); 12395 ipif->ipif_was_up = B_FALSE; 12396 if (err != 0) { 12397 ASSERT(err == EINPROGRESS); 12398 return (err); 12399 } 12400 } 12401 } 12402 ill->ill_up_ipifs = B_FALSE; 12403 return (0); 12404 } 12405 12406 /* 12407 * This function is called to bring up all the ipifs that were up before 12408 * bringing the ill down via ill_down_ipifs(). 12409 */ 12410 int 12411 ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp) 12412 { 12413 int err; 12414 12415 ASSERT(IAM_WRITER_ILL(ill)); 12416 12417 if (ill->ill_replumbing) { 12418 ill->ill_replumbing = 0; 12419 /* 12420 * Send down REPLUMB_DONE notification followed by the 12421 * BIND_REQ on the arp stream. 12422 */ 12423 if (!ill->ill_isv6) 12424 arp_send_replumb_conf(ill); 12425 } 12426 err = ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv4, q, mp); 12427 if (err != 0) 12428 return (err); 12429 12430 return (ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv6, q, mp)); 12431 } 12432 12433 /* 12434 * Bring down any IPIF_UP ipifs on ill. If "logical" is B_TRUE, we bring 12435 * down the ipifs without sending DL_UNBIND_REQ to the driver. 12436 */ 12437 static void 12438 ill_down_ipifs(ill_t *ill, boolean_t logical) 12439 { 12440 ipif_t *ipif; 12441 12442 ASSERT(IAM_WRITER_ILL(ill)); 12443 12444 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 12445 /* 12446 * We go through the ipif_down logic even if the ipif 12447 * is already down, since routes can be added based 12448 * on down ipifs. Going through ipif_down once again 12449 * will delete any IREs created based on these routes. 
12450 */ 12451 if (ipif->ipif_flags & IPIF_UP) 12452 ipif->ipif_was_up = B_TRUE; 12453 12454 if (logical) { 12455 (void) ipif_logical_down(ipif, NULL, NULL); 12456 ipif_non_duplicate(ipif); 12457 (void) ipif_down_tail(ipif); 12458 } else { 12459 (void) ipif_down(ipif, NULL, NULL); 12460 } 12461 } 12462 } 12463 12464 /* 12465 * Redo source address selection. This makes IXAF_VERIFY_SOURCE take 12466 * a look again at valid source addresses. 12467 * This should be called each time after the set of source addresses has been 12468 * changed. 12469 */ 12470 void 12471 ip_update_source_selection(ip_stack_t *ipst) 12472 { 12473 /* We skip past SRC_GENERATION_VERIFY */ 12474 if (atomic_add_32_nv(&ipst->ips_src_generation, 1) == 12475 SRC_GENERATION_VERIFY) 12476 atomic_add_32(&ipst->ips_src_generation, 1); 12477 } 12478 12479 /* 12480 * Finish the group join started in ip_sioctl_groupname(). 12481 */ 12482 /* ARGSUSED */ 12483 static void 12484 ip_join_illgrps(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy) 12485 { 12486 ill_t *ill = q->q_ptr; 12487 phyint_t *phyi = ill->ill_phyint; 12488 ipmp_grp_t *grp = phyi->phyint_grp; 12489 ip_stack_t *ipst = ill->ill_ipst; 12490 12491 /* IS_UNDER_IPMP() won't work until ipmp_ill_join_illgrp() is called */ 12492 ASSERT(!IS_IPMP(ill) && grp != NULL); 12493 ASSERT(IAM_WRITER_IPSQ(ipsq)); 12494 12495 if (phyi->phyint_illv4 != NULL) { 12496 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 12497 VERIFY(grp->gr_pendv4-- > 0); 12498 rw_exit(&ipst->ips_ipmp_lock); 12499 ipmp_ill_join_illgrp(phyi->phyint_illv4, grp->gr_v4); 12500 } 12501 if (phyi->phyint_illv6 != NULL) { 12502 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 12503 VERIFY(grp->gr_pendv6-- > 0); 12504 rw_exit(&ipst->ips_ipmp_lock); 12505 ipmp_ill_join_illgrp(phyi->phyint_illv6, grp->gr_v6); 12506 } 12507 freemsg(mp); 12508 } 12509 12510 /* 12511 * Process an SIOCSLIFGROUPNAME request. 12512 */ 12513 /* ARGSUSED */ 12514 int 12515 ip_sioctl_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12516 ip_ioctl_cmd_t *ipip, void *ifreq) 12517 { 12518 struct lifreq *lifr = ifreq; 12519 ill_t *ill = ipif->ipif_ill; 12520 ip_stack_t *ipst = ill->ill_ipst; 12521 phyint_t *phyi = ill->ill_phyint; 12522 ipmp_grp_t *grp = phyi->phyint_grp; 12523 mblk_t *ipsq_mp; 12524 int err = 0; 12525 12526 /* 12527 * Note that phyint_grp can only change here, where we're exclusive. 12528 */ 12529 ASSERT(IAM_WRITER_ILL(ill)); 12530 12531 if (ipif->ipif_id != 0 || ill->ill_usesrc_grp_next != NULL || 12532 (phyi->phyint_flags & PHYI_VIRTUAL)) 12533 return (EINVAL); 12534 12535 lifr->lifr_groupname[LIFGRNAMSIZ - 1] = '\0'; 12536 12537 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 12538 12539 /* 12540 * If the name hasn't changed, there's nothing to do. 12541 */ 12542 if (grp != NULL && strcmp(grp->gr_name, lifr->lifr_groupname) == 0) 12543 goto unlock; 12544 12545 /* 12546 * Handle requests to rename an IPMP meta-interface. 12547 * 12548 * Note that creation of the IPMP meta-interface is handled in 12549 * userland through the standard plumbing sequence. As part of the 12550 * plumbing the IPMP meta-interface, its initial groupname is set to 12551 * the name of the interface (see ipif_set_values_tail()). 12552 */ 12553 if (IS_IPMP(ill)) { 12554 err = ipmp_grp_rename(grp, lifr->lifr_groupname); 12555 goto unlock; 12556 } 12557 12558 /* 12559 * Handle requests to add or remove an IP interface from a group. 
12560 */ 12561 if (lifr->lifr_groupname[0] != '\0') { /* add */ 12562 /* 12563 * Moves are handled by first removing the interface from 12564 * its existing group, and then adding it to another group. 12565 * So, fail if it's already in a group. 12566 */ 12567 if (IS_UNDER_IPMP(ill)) { 12568 err = EALREADY; 12569 goto unlock; 12570 } 12571 12572 grp = ipmp_grp_lookup(lifr->lifr_groupname, ipst); 12573 if (grp == NULL) { 12574 err = ENOENT; 12575 goto unlock; 12576 } 12577 12578 /* 12579 * Check if the phyint and its ills are suitable for 12580 * inclusion into the group. 12581 */ 12582 if ((err = ipmp_grp_vet_phyint(grp, phyi)) != 0) 12583 goto unlock; 12584 12585 /* 12586 * Checks pass; join the group, and enqueue the remaining 12587 * illgrp joins for when we've become part of the group xop 12588 * and are exclusive across its IPSQs. Since qwriter_ip() 12589 * requires an mblk_t to scribble on, and since `mp' will be 12590 * freed as part of completing the ioctl, allocate another. 12591 */ 12592 if ((ipsq_mp = allocb(0, BPRI_MED)) == NULL) { 12593 err = ENOMEM; 12594 goto unlock; 12595 } 12596 12597 /* 12598 * Before we drop ipmp_lock, bump gr_pend* to ensure that the 12599 * IPMP meta-interface ills needed by `phyi' cannot go away 12600 * before ip_join_illgrps() is called back. See the comments 12601 * in ip_sioctl_plink_ipmp() for more. 12602 */ 12603 if (phyi->phyint_illv4 != NULL) 12604 grp->gr_pendv4++; 12605 if (phyi->phyint_illv6 != NULL) 12606 grp->gr_pendv6++; 12607 12608 rw_exit(&ipst->ips_ipmp_lock); 12609 12610 ipmp_phyint_join_grp(phyi, grp); 12611 ill_refhold(ill); 12612 qwriter_ip(ill, ill->ill_rq, ipsq_mp, ip_join_illgrps, 12613 SWITCH_OP, B_FALSE); 12614 return (0); 12615 } else { 12616 /* 12617 * Request to remove the interface from a group. If the 12618 * interface is not in a group, this trivially succeeds. 12619 */ 12620 rw_exit(&ipst->ips_ipmp_lock); 12621 if (IS_UNDER_IPMP(ill)) 12622 ipmp_phyint_leave_grp(phyi); 12623 return (0); 12624 } 12625 unlock: 12626 rw_exit(&ipst->ips_ipmp_lock); 12627 return (err); 12628 } 12629 12630 /* 12631 * Process an SIOCGLIFBINDING request. 12632 */ 12633 /* ARGSUSED */ 12634 int 12635 ip_sioctl_get_binding(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12636 ip_ioctl_cmd_t *ipip, void *ifreq) 12637 { 12638 ill_t *ill; 12639 struct lifreq *lifr = ifreq; 12640 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 12641 12642 if (!IS_IPMP(ipif->ipif_ill)) 12643 return (EINVAL); 12644 12645 rw_enter(&ipst->ips_ipmp_lock, RW_READER); 12646 if ((ill = ipif->ipif_bound_ill) == NULL) 12647 lifr->lifr_binding[0] = '\0'; 12648 else 12649 (void) strlcpy(lifr->lifr_binding, ill->ill_name, LIFNAMSIZ); 12650 rw_exit(&ipst->ips_ipmp_lock); 12651 return (0); 12652 } 12653 12654 /* 12655 * Process an SIOCGLIFGROUPNAME request. 12656 */ 12657 /* ARGSUSED */ 12658 int 12659 ip_sioctl_get_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 12660 ip_ioctl_cmd_t *ipip, void *ifreq) 12661 { 12662 ipmp_grp_t *grp; 12663 struct lifreq *lifr = ifreq; 12664 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 12665 12666 rw_enter(&ipst->ips_ipmp_lock, RW_READER); 12667 if ((grp = ipif->ipif_ill->ill_phyint->phyint_grp) == NULL) 12668 lifr->lifr_groupname[0] = '\0'; 12669 else 12670 (void) strlcpy(lifr->lifr_groupname, grp->gr_name, LIFGRNAMSIZ); 12671 rw_exit(&ipst->ips_ipmp_lock); 12672 return (0); 12673 } 12674 12675 /* 12676 * Process an SIOCGLIFGROUPINFO request. 
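 *
 * A hedged userland sketch of the expected caller ("ipmp0" is just an
 * illustrative group name):
 *
 *	lifgroupinfo_t lifgr;
 *
 *	(void) strlcpy(lifgr.gi_grname, "ipmp0", LIFGRNAMSIZ);
 *	if (ioctl(s, SIOCGLIFGROUPINFO, &lifgr) == -1)
 *		...		(fails with ENOENT if no such group)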
12677  */
12678 /* ARGSUSED */
12679 int
12680 ip_sioctl_groupinfo(ipif_t *dummy_ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12681     ip_ioctl_cmd_t *ipip, void *dummy)
12682 {
12683 	ipmp_grp_t *grp;
12684 	lifgroupinfo_t *lifgr;
12685 	ip_stack_t *ipst = CONNQ_TO_IPST(q);
12686 
12687 	/* ip_wput_nondata() verified mp->b_cont->b_cont */
12688 	lifgr = (lifgroupinfo_t *)mp->b_cont->b_cont->b_rptr;
12689 	lifgr->gi_grname[LIFGRNAMSIZ - 1] = '\0';
12690 
12691 	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
12692 	if ((grp = ipmp_grp_lookup(lifgr->gi_grname, ipst)) == NULL) {
12693 		rw_exit(&ipst->ips_ipmp_lock);
12694 		return (ENOENT);
12695 	}
12696 	ipmp_grp_info(grp, lifgr);
12697 	rw_exit(&ipst->ips_ipmp_lock);
12698 	return (0);
12699 }
12700 
12701 static void
12702 ill_dl_down(ill_t *ill)
12703 {
12704 	DTRACE_PROBE2(ill__downup, char *, "ill_dl_down", ill_t *, ill);
12705 
12706 	/*
12707 	 * The ill is down; unbind but stay attached since we're still
12708 	 * associated with a PPA. If we have negotiated DLPI capabilities
12709 	 * with the data link service provider (IDCS_OK) then reset them.
12710 	 * The interval between unbinding and rebinding is potentially
12711 	 * unbounded, hence we cannot assume things will be the same.
12712 	 * The DLPI capabilities will be probed again when the data link
12713 	 * is brought up.
12714 	 */
12715 	mblk_t	*mp = ill->ill_unbind_mp;
12716 
12717 	ip1dbg(("ill_dl_down(%s)\n", ill->ill_name));
12718 
12719 	if (!ill->ill_replumbing) {
12720 		/* Free all ilms for this ill */
12721 		update_conn_ill(ill, ill->ill_ipst);
12722 	} else {
12723 		ill_leave_multicast(ill);
12724 	}
12725 
12726 	ill->ill_unbind_mp = NULL;
12727 	if (mp != NULL) {
12728 		ip1dbg(("ill_dl_down: %s (%u) for %s\n",
12729 		    dl_primstr(*(int *)mp->b_rptr), *(int *)mp->b_rptr,
12730 		    ill->ill_name));
12731 		mutex_enter(&ill->ill_lock);
12732 		ill->ill_state_flags |= ILL_DL_UNBIND_IN_PROGRESS;
12733 		mutex_exit(&ill->ill_lock);
12734 		/*
12735 		 * ip_rput does not pass up normal (M_PROTO) DLPI messages
12736 		 * after ILL_CONDEMNED is set. So in the unplumb case, we call
12737 		 * ill_capability_dld_disable right away. If this is not
12738 		 * an unplumb operation then the disable happens on receipt of
12739 		 * the capab ack via ip_rput_dlpi_writer ->
12740 		 * ill_capability_ack_thr. In both cases the order of
12741 		 * the operations seen by DLD is capability disable followed
12742 		 * by DL_UNBIND. Also the DLD capability disable needs a
12743 		 * cv_wait'able context.
12744 */ 12745 if (ill->ill_state_flags & ILL_CONDEMNED) 12746 ill_capability_dld_disable(ill); 12747 ill_capability_reset(ill, B_FALSE); 12748 ill_dlpi_send(ill, mp); 12749 } 12750 mutex_enter(&ill->ill_lock); 12751 ill->ill_dl_up = 0; 12752 ill_nic_event_dispatch(ill, 0, NE_DOWN, NULL, 0); 12753 mutex_exit(&ill->ill_lock); 12754 } 12755 12756 void 12757 ill_dlpi_dispatch(ill_t *ill, mblk_t *mp) 12758 { 12759 union DL_primitives *dlp; 12760 t_uscalar_t prim; 12761 boolean_t waitack = B_FALSE; 12762 12763 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); 12764 12765 dlp = (union DL_primitives *)mp->b_rptr; 12766 prim = dlp->dl_primitive; 12767 12768 ip1dbg(("ill_dlpi_dispatch: sending %s (%u) to %s\n", 12769 dl_primstr(prim), prim, ill->ill_name)); 12770 12771 switch (prim) { 12772 case DL_PHYS_ADDR_REQ: 12773 { 12774 dl_phys_addr_req_t *dlpap = (dl_phys_addr_req_t *)mp->b_rptr; 12775 ill->ill_phys_addr_pend = dlpap->dl_addr_type; 12776 break; 12777 } 12778 case DL_BIND_REQ: 12779 mutex_enter(&ill->ill_lock); 12780 ill->ill_state_flags &= ~ILL_DL_UNBIND_IN_PROGRESS; 12781 mutex_exit(&ill->ill_lock); 12782 break; 12783 } 12784 12785 /* 12786 * Except for the ACKs for the M_PCPROTO messages, all other ACKs 12787 * are dropped by ip_rput() if ILL_CONDEMNED is set. Therefore 12788 * we only wait for the ACK of the DL_UNBIND_REQ. 12789 */ 12790 mutex_enter(&ill->ill_lock); 12791 if (!(ill->ill_state_flags & ILL_CONDEMNED) || 12792 (prim == DL_UNBIND_REQ)) { 12793 ill->ill_dlpi_pending = prim; 12794 waitack = B_TRUE; 12795 } 12796 12797 mutex_exit(&ill->ill_lock); 12798 DTRACE_PROBE3(ill__dlpi, char *, "ill_dlpi_dispatch", 12799 char *, dl_primstr(prim), ill_t *, ill); 12800 putnext(ill->ill_wq, mp); 12801 12802 /* 12803 * There is no ack for DL_NOTIFY_CONF messages 12804 */ 12805 if (waitack && prim == DL_NOTIFY_CONF) 12806 ill_dlpi_done(ill, prim); 12807 } 12808 12809 /* 12810 * Helper function for ill_dlpi_send(). 12811 */ 12812 /* ARGSUSED */ 12813 static void 12814 ill_dlpi_send_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) 12815 { 12816 ill_dlpi_send(q->q_ptr, mp); 12817 } 12818 12819 /* 12820 * Send a DLPI control message to the driver but make sure there 12821 * is only one outstanding message. Uses ill_dlpi_pending to tell 12822 * when it must queue. ip_rput_dlpi_writer calls ill_dlpi_done() 12823 * when an ACK or a NAK is received to process the next queued message. 12824 */ 12825 void 12826 ill_dlpi_send(ill_t *ill, mblk_t *mp) 12827 { 12828 mblk_t **mpp; 12829 12830 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); 12831 12832 /* 12833 * To ensure that any DLPI requests for current exclusive operation 12834 * are always completely sent before any DLPI messages for other 12835 * operations, require writer access before enqueuing. 12836 */ 12837 if (!IAM_WRITER_ILL(ill)) { 12838 ill_refhold(ill); 12839 /* qwriter_ip() does the ill_refrele() */ 12840 qwriter_ip(ill, ill->ill_wq, mp, ill_dlpi_send_writer, 12841 NEW_OP, B_TRUE); 12842 return; 12843 } 12844 12845 mutex_enter(&ill->ill_lock); 12846 if (ill->ill_dlpi_pending != DL_PRIM_INVAL) { 12847 /* Must queue message. 
Tail insertion */ 12848 mpp = &ill->ill_dlpi_deferred; 12849 while (*mpp != NULL) 12850 mpp = &((*mpp)->b_next); 12851 12852 ip1dbg(("ill_dlpi_send: deferring request for %s " 12853 "while %s pending\n", ill->ill_name, 12854 dl_primstr(ill->ill_dlpi_pending))); 12855 12856 *mpp = mp; 12857 mutex_exit(&ill->ill_lock); 12858 return; 12859 } 12860 mutex_exit(&ill->ill_lock); 12861 ill_dlpi_dispatch(ill, mp); 12862 } 12863 12864 void 12865 ill_capability_send(ill_t *ill, mblk_t *mp) 12866 { 12867 ill->ill_capab_pending_cnt++; 12868 ill_dlpi_send(ill, mp); 12869 } 12870 12871 void 12872 ill_capability_done(ill_t *ill) 12873 { 12874 ASSERT(ill->ill_capab_pending_cnt != 0); 12875 12876 ill_dlpi_done(ill, DL_CAPABILITY_REQ); 12877 12878 ill->ill_capab_pending_cnt--; 12879 if (ill->ill_capab_pending_cnt == 0 && 12880 ill->ill_dlpi_capab_state == IDCS_OK) 12881 ill_capability_reset_alloc(ill); 12882 } 12883 12884 /* 12885 * Send all deferred DLPI messages without waiting for their ACKs. 12886 */ 12887 void 12888 ill_dlpi_send_deferred(ill_t *ill) 12889 { 12890 mblk_t *mp, *nextmp; 12891 12892 /* 12893 * Clear ill_dlpi_pending so that the message is not queued in 12894 * ill_dlpi_send(). 12895 */ 12896 mutex_enter(&ill->ill_lock); 12897 ill->ill_dlpi_pending = DL_PRIM_INVAL; 12898 mp = ill->ill_dlpi_deferred; 12899 ill->ill_dlpi_deferred = NULL; 12900 mutex_exit(&ill->ill_lock); 12901 12902 for (; mp != NULL; mp = nextmp) { 12903 nextmp = mp->b_next; 12904 mp->b_next = NULL; 12905 ill_dlpi_send(ill, mp); 12906 } 12907 } 12908 12909 /* 12910 * Clear all the deferred DLPI messages. Called on receiving an M_ERROR 12911 * or M_HANGUP 12912 */ 12913 static void 12914 ill_dlpi_clear_deferred(ill_t *ill) 12915 { 12916 mblk_t *mp, *nextmp; 12917 12918 mutex_enter(&ill->ill_lock); 12919 ill->ill_dlpi_pending = DL_PRIM_INVAL; 12920 mp = ill->ill_dlpi_deferred; 12921 ill->ill_dlpi_deferred = NULL; 12922 mutex_exit(&ill->ill_lock); 12923 12924 for (; mp != NULL; mp = nextmp) { 12925 nextmp = mp->b_next; 12926 inet_freemsg(mp); 12927 } 12928 } 12929 12930 /* 12931 * Check if the DLPI primitive `prim' is pending; print a warning if not. 12932 */ 12933 boolean_t 12934 ill_dlpi_pending(ill_t *ill, t_uscalar_t prim) 12935 { 12936 t_uscalar_t pending; 12937 12938 mutex_enter(&ill->ill_lock); 12939 if (ill->ill_dlpi_pending == prim) { 12940 mutex_exit(&ill->ill_lock); 12941 return (B_TRUE); 12942 } 12943 12944 /* 12945 * During teardown, ill_dlpi_dispatch() will send DLPI requests 12946 * without waiting, so don't print any warnings in that case. 12947 */ 12948 if (ill->ill_state_flags & ILL_CONDEMNED) { 12949 mutex_exit(&ill->ill_lock); 12950 return (B_FALSE); 12951 } 12952 pending = ill->ill_dlpi_pending; 12953 mutex_exit(&ill->ill_lock); 12954 12955 if (pending == DL_PRIM_INVAL) { 12956 (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE, 12957 "received unsolicited ack for %s on %s\n", 12958 dl_primstr(prim), ill->ill_name); 12959 } else { 12960 (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE, 12961 "received unexpected ack for %s on %s (expecting %s)\n", 12962 dl_primstr(prim), ill->ill_name, dl_primstr(pending)); 12963 } 12964 return (B_FALSE); 12965 } 12966 12967 /* 12968 * Complete the current DLPI operation associated with `prim' on `ill' and 12969 * start the next queued DLPI operation (if any). 
If there are no queued DLPI
12970  * operations and the ill's current exclusive IPSQ operation has finished
12971  * (i.e., ipsq_current_finish() was called), then clear ipsq_current_ipif to
12972  * allow the next exclusive IPSQ operation to begin upon ipsq_exit(). See
12973  * the comments above ipsq_current_finish() for details.
12974  */
12975 void
12976 ill_dlpi_done(ill_t *ill, t_uscalar_t prim)
12977 {
12978 	mblk_t *mp;
12979 	ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;
12980 	ipxop_t *ipx = ipsq->ipsq_xop;
12981 
12982 	ASSERT(IAM_WRITER_IPSQ(ipsq));
12983 	mutex_enter(&ill->ill_lock);
12984 
12985 	ASSERT(prim != DL_PRIM_INVAL);
12986 	ASSERT(ill->ill_dlpi_pending == prim);
12987 
12988 	ip1dbg(("ill_dlpi_done: %s has completed %s (%u)\n", ill->ill_name,
12989 	    dl_primstr(ill->ill_dlpi_pending), ill->ill_dlpi_pending));
12990 
12991 	if ((mp = ill->ill_dlpi_deferred) == NULL) {
12992 		ill->ill_dlpi_pending = DL_PRIM_INVAL;
12993 		if (ipx->ipx_current_done) {
12994 			mutex_enter(&ipx->ipx_lock);
12995 			ipx->ipx_current_ipif = NULL;
12996 			mutex_exit(&ipx->ipx_lock);
12997 		}
12998 		cv_signal(&ill->ill_cv);
12999 		mutex_exit(&ill->ill_lock);
13000 		return;
13001 	}
13002 
13003 	ill->ill_dlpi_deferred = mp->b_next;
13004 	mp->b_next = NULL;
13005 	mutex_exit(&ill->ill_lock);
13006 
13007 	ill_dlpi_dispatch(ill, mp);
13008 }
13009 
13010 /*
13011  * Queue a (multicast) DLPI control message to be sent to the driver by
13012  * a later call to ill_dlpi_send_queued().
13013  * We queue them while holding a lock (ill_mcast_lock) to ensure that they
13014  * are sent in order, i.e., to prevent a DL_DISABMULTI_REQ and a
13015  * DL_ENABMULTI_REQ for the same group from racing.
13016  * We send DLPI control messages in order using ill_lock.
13017  * For IPMP we should be called on the cast_ill.
13018  */
13019 void
13020 ill_dlpi_queue(ill_t *ill, mblk_t *mp)
13021 {
13022 	mblk_t **mpp;
13023 
13024 	ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
13025 
13026 	mutex_enter(&ill->ill_lock);
13027 	/* Must queue message. Tail insertion */
13028 	mpp = &ill->ill_dlpi_deferred;
13029 	while (*mpp != NULL)
13030 		mpp = &((*mpp)->b_next);
13031 
13032 	*mpp = mp;
13033 	mutex_exit(&ill->ill_lock);
13034 }
13035 
13036 /*
13037  * Send the messages that were queued. Make sure there is only
13038  * one outstanding message. ip_rput_dlpi_writer calls ill_dlpi_done()
13039  * when an ACK or a NAK is received to process the next queued message.
13040  * For IPMP we are called on the upper ill, but we send what is queued
13041  * on the cast_ill.
13042  */
13043 void
13044 ill_dlpi_send_queued(ill_t *ill)
13045 {
13046 	mblk_t *mp;
13047 	union DL_primitives *dlp;
13048 	t_uscalar_t prim;
13049 	ill_t *release_ill = NULL;
13050 
13051 	if (IS_IPMP(ill)) {
13052 		/* On the upper IPMP ill. */
13053 		release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
13054 		if (release_ill == NULL) {
13055 			/* Avoid ever sending anything down to the ipmpstub */
13056 			return;
13057 		}
13058 		ill = release_ill;
13059 	}
13060 	mutex_enter(&ill->ill_lock);
13061 	while ((mp = ill->ill_dlpi_deferred) != NULL) {
13062 		if (ill->ill_dlpi_pending != DL_PRIM_INVAL) {
13063 			/* Can't send. Somebody else will send it */
13064 			mutex_exit(&ill->ill_lock);
13065 			goto done;
13066 		}
13067 		ill->ill_dlpi_deferred = mp->b_next;
13068 		mp->b_next = NULL;
13069 		if (!ill->ill_dl_up) {
13070 			/*
13071 			 * Nobody there. All multicast addresses will be
13072 			 * re-joined when we get the DL_BIND_ACK bringing the
13073 			 * interface up.
13074 			 */
13075 			freemsg(mp);
13076 			continue;
13077 		}
13078 		dlp = (union DL_primitives *)mp->b_rptr;
13079 		prim = dlp->dl_primitive;
13080 
13081 		if (!(ill->ill_state_flags & ILL_CONDEMNED) ||
13082 		    (prim == DL_UNBIND_REQ)) {
13083 			ill->ill_dlpi_pending = prim;
13084 		}
13085 		mutex_exit(&ill->ill_lock);
13086 
13087 		DTRACE_PROBE3(ill__dlpi, char *, "ill_dlpi_send_queued",
13088 		    char *, dl_primstr(prim), ill_t *, ill);
13089 		putnext(ill->ill_wq, mp);
13090 		mutex_enter(&ill->ill_lock);
13091 	}
13092 	mutex_exit(&ill->ill_lock);
13093 done:
13094 	if (release_ill != NULL)
13095 		ill_refrele(release_ill);
13096 }
13097 
13098 /*
13099  * Queue an IP (IGMP/MLD) message to be sent by IP from
13100  * ill_mcast_send_queued().
13101  * We queue them while holding a lock (ill_mcast_lock) to ensure that they
13102  * are sent in order, i.e., to prevent an IGMP leave and an IGMP join for
13103  * the same group from racing.
13104  * We send them in order using ill_lock.
13105  * For IPMP we are called on the upper ill, but we queue on the cast_ill.
13106  */
13107 void
13108 ill_mcast_queue(ill_t *ill, mblk_t *mp)
13109 {
13110 	mblk_t **mpp;
13111 	ill_t *release_ill = NULL;
13112 
13113 	ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock));
13114 
13115 	if (IS_IPMP(ill)) {
13116 		/* On the upper IPMP ill. */
13117 		release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
13118 		if (release_ill == NULL) {
13119 			/* Discard instead of queuing for the ipmp interface */
13120 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
13121 			ip_drop_output("ipIfStatsOutDiscards - no cast_ill",
13122 			    mp, ill);
13123 			freemsg(mp);
13124 			return;
13125 		}
13126 		ill = release_ill;
13127 	}
13128 
13129 	mutex_enter(&ill->ill_lock);
13130 	/* Must queue message. Tail insertion */
13131 	mpp = &ill->ill_mcast_deferred;
13132 	while (*mpp != NULL)
13133 		mpp = &((*mpp)->b_next);
13134 
13135 	*mpp = mp;
13136 	mutex_exit(&ill->ill_lock);
13137 	if (release_ill != NULL)
13138 		ill_refrele(release_ill);
13139 }
13140 
13141 /*
13142  * Send the IP packets that were queued by ill_mcast_queue.
13143  * These are IGMP/MLD packets.
13144  *
13145  * For IPMP we are called on the upper ill, but we send what is queued
13146  * on the cast_ill.
13147  *
13148  * Request loopback of the report if we are acting as a multicast
13149  * router, so that the process-level routing daemon can hear it.
13150  * This will run multiple times for the same group if there are members
13151  * of the same group on multiple ipifs of the same ill. The
13152  * igmp_input/mld_input code will suppress the duplicates via the
13153  * loopback; thus we always loop back the membership report.
13154  *
13155  * We also need to make sure that this does not get load balanced
13156  * by IPMP. We do this by passing an ill to ip_output_simple.
13157  */
13158 void
13159 ill_mcast_send_queued(ill_t *ill)
13160 {
13161 	mblk_t *mp;
13162 	ip_xmit_attr_t ixas;
13163 	ill_t *release_ill = NULL;
13164 
13165 	if (IS_IPMP(ill)) {
13166 		/* On the upper IPMP ill. */
13167 		release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
13168 		if (release_ill == NULL) {
13169 			/*
13170 			 * We should have no messages on the ipmp interface;
13171 			 * in any case there is no point in trying to send them.
13172 			 */
13173 			return;
13174 		}
13175 		ill = release_ill;
13176 	}
13177 	bzero(&ixas, sizeof (ixas));
13178 	ixas.ixa_zoneid = ALL_ZONES;
13179 	ixas.ixa_cred = kcred;
13180 	ixas.ixa_cpid = NOPID;
13181 	ixas.ixa_tsl = NULL;
13182 	/*
13183 	 * Here we set ixa_ifindex. If IPMP it will be the lower ill which
13184 	 * makes ip_select_route pick the IRE_MULTICAST for the cast_ill.
13185 * That is necessary to handle IGMP/MLD snooping switches. 13186 */ 13187 ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex; 13188 ixas.ixa_ipst = ill->ill_ipst; 13189 13190 mutex_enter(&ill->ill_lock); 13191 while ((mp = ill->ill_mcast_deferred) != NULL) { 13192 ill->ill_mcast_deferred = mp->b_next; 13193 mp->b_next = NULL; 13194 if (!ill->ill_dl_up) { 13195 /* 13196 * Nobody there. Just drop the ip packets. 13197 * IGMP/MLD will resend later, if this is a replumb. 13198 */ 13199 freemsg(mp); 13200 continue; 13201 } 13202 mutex_enter(&ill->ill_phyint->phyint_lock); 13203 if (IS_UNDER_IPMP(ill) && !ipmp_ill_is_active(ill)) { 13204 /* 13205 * When the ill is getting deactivated, we only want to 13206 * send the DLPI messages, so drop IGMP/MLD packets. 13207 * DLPI messages are handled by ill_dlpi_send_queued() 13208 */ 13209 mutex_exit(&ill->ill_phyint->phyint_lock); 13210 freemsg(mp); 13211 continue; 13212 } 13213 mutex_exit(&ill->ill_phyint->phyint_lock); 13214 mutex_exit(&ill->ill_lock); 13215 13216 /* Check whether we are sending IPv4 or IPv6. */ 13217 if (ill->ill_isv6) { 13218 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 13219 13220 ixas.ixa_multicast_ttl = ip6h->ip6_hops; 13221 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6; 13222 } else { 13223 ipha_t *ipha = (ipha_t *)mp->b_rptr; 13224 13225 ixas.ixa_multicast_ttl = ipha->ipha_ttl; 13226 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4; 13227 ixas.ixa_flags &= ~IXAF_SET_ULP_CKSUM; 13228 } 13229 ixas.ixa_flags &= ~IXAF_VERIFY_SOURCE; 13230 ixas.ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_SOURCE; 13231 (void) ip_output_simple(mp, &ixas); 13232 ixa_cleanup(&ixas); 13233 13234 mutex_enter(&ill->ill_lock); 13235 } 13236 mutex_exit(&ill->ill_lock); 13237 13238 done: 13239 if (release_ill != NULL) 13240 ill_refrele(release_ill); 13241 } 13242 13243 /* 13244 * Take down a specific interface, but don't lose any information about it. 13245 * (Always called as writer.) 13246 * This function goes through the down sequence even if the interface is 13247 * already down. There are 2 reasons. 13248 * a. Currently we permit interface routes that depend on down interfaces 13249 * to be added. This behaviour itself is questionable. However it appears 13250 * that both Solaris and 4.3 BSD have exhibited this behaviour for a long 13251 * time. We go thru the cleanup in order to remove these routes. 13252 * b. The bringup of the interface could fail in ill_dl_up i.e. we get 13253 * DL_ERROR_ACK in response to the DL_BIND request. The interface is 13254 * down, but we need to cleanup i.e. do ill_dl_down and 13255 * ip_rput_dlpi_writer (DL_ERROR_ACK) -> ipif_down. 13256 * 13257 * IP-MT notes: 13258 * 13259 * Model of reference to interfaces. 13260 * 13261 * The following members in ipif_t track references to the ipif. 13262 * int ipif_refcnt; Active reference count 13263 * 13264 * The following members in ill_t track references to the ill. 13265 * int ill_refcnt; active refcnt 13266 * uint_t ill_ire_cnt; Number of ires referencing ill 13267 * uint_t ill_ncec_cnt; Number of ncecs referencing ill 13268 * uint_t ill_nce_cnt; Number of nces referencing ill 13269 * uint_t ill_ilm_cnt; Number of ilms referencing ill 13270 * 13271 * Reference to an ipif or ill can be obtained in any of the following ways. 13272 * 13273 * Through the lookup functions ipif_lookup_* / ill_lookup_* functions 13274 * Pointers to ipif / ill from other data structures viz ire and conn. 13275 * Implicit reference to the ipif / ill by holding a reference to the ire. 
13276 *
13277 * The ipif/ill lookup functions return a reference-held ipif / ill.
13278 * ipif_refcnt and ill_refcnt track the reference counts respectively.
13279 * This is a purely dynamic reference count associated with threads holding
13280 * references to the ipif / ill. Pointers from other structures do not
13281 * count towards this reference count.
13282 *
13283 * ill_ire_cnt is the number of ire's associated with the
13284 * ill. This is incremented whenever a new ire is created referencing the
13285 * ill. This is done atomically inside ire_add_v[46] where the ire is
13286 * actually added to the ire hash table. The count is decremented in
13287 * ire_inactive where the ire is destroyed.
13288 *
13289 * ill_ncec_cnt is the number of ncec's referencing the ill thru ncec_ill.
13290 * This is incremented atomically in
13291 * ndp_add_v4()/ndp_add_v6() where the nce is actually added to the
13292 * table. Similarly it is decremented in ncec_inactive() where the ncec
13293 * is destroyed.
13294 *
13295 * ill_nce_cnt is the number of nce's referencing the ill thru nce_ill. This is
13296 * incremented atomically in nce_add() where the nce is actually added to the
13297 * ill_nce. Similarly it is decremented in nce_inactive() where the nce
13298 * is destroyed.
13299 *
13300 * ill_ilm_cnt is the ilm's reference to the ill. It is incremented in
13301 * ilm_add() and decremented before the ilm is freed in ilm_delete().
13302 *
13303 * Flow of ioctls involving interface down/up
13304 *
13305 * The following is the sequence of an attempt to set some critical flags on an
13306 * up interface.
13307 * ip_sioctl_flags
13308 * ipif_down
13309 * wait for ipif to be quiescent
13310 * ipif_down_tail
13311 * ip_sioctl_flags_tail
13312 *
13313 * All set ioctls that involve a down/up sequence would have a skeleton similar
13314 * to the above. All the *tail functions are called after the refcounts have
13315 * dropped to the appropriate values.
13316 *
13317 * SIOC ioctls during the IPIF_CHANGING interval.
13318 *
13319 * Threads handling SIOC set ioctls serialize on the ipsq, but this
13320 * is not done for SIOC get ioctls. Since a set ioctl can cause several
13321 * steps of internal changes to the state, some of which are visible in
13322 * ipif_flags (such as IFF_UP being cleared and later set), and we want
13323 * the set ioctl to be atomic with respect to the get ioctls, the SIOC get code
13324 * will wait and restart ioctls if IPIF_CHANGING is set. The mblk is then
13325 * enqueued in the ipsq and the operation is restarted by ipsq_exit() when
13326 * the current exclusive operation completes. The IPIF_CHANGING check
13327 * and enqueue is atomic using the ill_lock and ipsq_lock. The
13328 * lookup is done holding the ill_lock. Hence the ill/ipif state flags can't
13329 * change while the ill_lock is held. Before dropping the ill_lock we acquire
13330 * the ipsq_lock and call ipsq_enq. This ensures that ipsq_exit can't finish
13331 * until we release the ipsq_lock, even though the ill/ipif state flags
13332 * can change after we drop the ill_lock.
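 *
 * As a sketch only (the real get-side code lives in functions such as
 * ipif_lookup_on_name_async() below, which also takes the ipx_lock),
 * the check-and-enqueue protocol described above looks roughly like:
 *
 *	mutex_enter(&ill->ill_lock);
 *	if (IPIF_IS_CHANGING(ipif)) {
 *		ipsq = ill->ill_phyint->phyint_ipsq;
 *		mutex_enter(&ipsq->ipsq_lock);
 *		mutex_exit(&ill->ill_lock);
 *		ipsq_enq(ipsq, q, mp, func, NEW_OP, ill);
 *		mutex_exit(&ipsq->ipsq_lock);
 *		return (EINPROGRESS);
 *	}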
13333 */
13334 int
13335 ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
13336 {
13337 ill_t *ill = ipif->ipif_ill;
13338 conn_t *connp;
13339 boolean_t success;
13340 boolean_t ipif_was_up = B_FALSE;
13341 ip_stack_t *ipst = ill->ill_ipst;
13342
13343 ASSERT(IAM_WRITER_IPIF(ipif));
13344
13345 ip1dbg(("ipif_down(%s:%u)\n", ill->ill_name, ipif->ipif_id));
13346
13347 DTRACE_PROBE3(ipif__downup, char *, "ipif_down",
13348 ill_t *, ill, ipif_t *, ipif);
13349
13350 if (ipif->ipif_flags & IPIF_UP) {
13351 mutex_enter(&ill->ill_lock);
13352 ipif->ipif_flags &= ~IPIF_UP;
13353 ASSERT(ill->ill_ipif_up_count > 0);
13354 --ill->ill_ipif_up_count;
13355 mutex_exit(&ill->ill_lock);
13356 ipif_was_up = B_TRUE;
13357 /* Update status in SCTP's list */
13358 sctp_update_ipif(ipif, SCTP_IPIF_DOWN);
13359 ill_nic_event_dispatch(ipif->ipif_ill,
13360 MAP_IPIF_ID(ipif->ipif_id), NE_LIF_DOWN, NULL, 0);
13361 }
13362
13363 /*
13364 * Removal of the last ipif from an ill may result in a DL_UNBIND
13365 * being sent to the driver, and we must not send any data packets to
13366 * the driver after the DL_UNBIND_REQ. To ensure this, all the
13367 * ire and nce entries used in the data path will be cleaned
13368 * up, and we also set the ILL_DOWN_IN_PROGRESS bit to make
13369 * sure no new entries will be added until the ill is bound
13370 * again. The ILL_DOWN_IN_PROGRESS bit is turned off upon
13371 * receipt of a DL_BIND_ACK.
13372 */
13373 if (ill->ill_wq != NULL && !ill->ill_logical_down &&
13374 ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 &&
13375 ill->ill_dl_up) {
13376 ill->ill_state_flags |= ILL_DOWN_IN_PROGRESS;
13377 }
13378
13379 /*
13380 * Blow away memberships we established in ipif_multicast_up().
13381 */
13382 ipif_multicast_down(ipif);
13383
13384 /*
13385 * Remove from the mapping for __sin6_src_id. We insert only
13386 * when the address is not INADDR_ANY. As IPv4 addresses are
13387 * stored as mapped addresses, we need to check for mapped
13388 * INADDR_ANY also.
13389 */
13390 if (ipif_was_up && !IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) &&
13391 !IN6_IS_ADDR_V4MAPPED_ANY(&ipif->ipif_v6lcl_addr) &&
13392 !(ipif->ipif_flags & IPIF_NOLOCAL)) {
13393 int err;
13394
13395 err = ip_srcid_remove(&ipif->ipif_v6lcl_addr,
13396 ipif->ipif_zoneid, ipst);
13397 if (err != 0) {
13398 ip0dbg(("ipif_down: srcid_remove %d\n", err));
13399 }
13400 }
13401
13402 if (ipif_was_up) {
13403 /* only delete if we'd added ire's before */
13404 if (ipif->ipif_isv6)
13405 ipif_delete_ires_v6(ipif);
13406 else
13407 ipif_delete_ires_v4(ipif);
13408 }
13409
13410 if (ipif_was_up && ill->ill_ipif_up_count == 0) {
13411 /*
13412 * Since the interface is now down, it may have just become
13413 * inactive. Note that this needs to be done even for an
13414 * ipif_logical_down(), or ARP entries will not get correctly
13415 * restored when the interface comes back up.
13416 */
13417 if (IS_UNDER_IPMP(ill))
13418 ipmp_ill_refresh_active(ill);
13419 }
13420
13421 /*
13422 * Delete any neighbor-discovery or arp entries for this interface. The
13423 * ipif has to be quiesced, so we walk all the nce's and delete those
13424 * that point at the ipif->ipif_ill. At the same time, we also
13425 * update IPMP so that ipifs for data addresses are unbound. We don't
13426 * call ipif_arp_down to DL_UNBIND the arp stream itself here, but defer
13427 * that to ipif_down_tail().
13428 */
13429 ipif_nce_down(ipif);
13430
13431 /*
13432 * If this is the last ipif on the ill, we also need to remove
13433 * any IREs with ire_ill set.
Otherwise ipif_is_quiescent() will
13434 * never succeed.
13435 */
13436 if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0)
13437 ire_walk_ill(0, 0, ill_downi, ill, ill);
13438
13439 /*
13440 * Walk all CONNs that can have a reference on an ire for this
13441 * ipif (we actually walk all that now have stale references).
13442 */
13443 ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ipst);
13444
13445 /*
13446 * If mp is NULL the caller will wait for the appropriate refcnt.
13447 * Eg. ip_sioctl_removeif -> ipif_free -> ipif_down
13448 * and ill_delete -> ipif_free -> ipif_down
13449 */
13450 if (mp == NULL) {
13451 ASSERT(q == NULL);
13452 return (0);
13453 }
13454
13455 if (CONN_Q(q)) {
13456 connp = Q_TO_CONN(q);
13457 mutex_enter(&connp->conn_lock);
13458 } else {
13459 connp = NULL;
13460 }
13461 mutex_enter(&ill->ill_lock);
13462 /*
13463 * Are there any ire's pointing to this ipif that are still active ?
13464 * If this is the last ipif going down, are there any ire's pointing
13465 * to this ill that are still active ?
13466 */
13467 if (ipif_is_quiescent(ipif)) {
13468 mutex_exit(&ill->ill_lock);
13469 if (connp != NULL)
13470 mutex_exit(&connp->conn_lock);
13471 return (0);
13472 }
13473
13474 ip1dbg(("ipif_down: need to wait, adding pending mp %s ill %p",
13475 ill->ill_name, (void *)ill));
13476 /*
13477 * Enqueue the mp atomically in ipsq_pending_mp. When the refcount
13478 * drops down, the operation will be restarted by ipif_ill_refrele_tail
13479 * which in turn is called by the last refrele on the ipif/ill/ire.
13480 */
13481 success = ipsq_pending_mp_add(connp, ipif, q, mp, IPIF_DOWN);
13482 if (!success) {
13483 /* The conn is closing. So just return */
13484 ASSERT(connp != NULL);
13485 mutex_exit(&ill->ill_lock);
13486 mutex_exit(&connp->conn_lock);
13487 return (EINTR);
13488 }
13489
13490 mutex_exit(&ill->ill_lock);
13491 if (connp != NULL)
13492 mutex_exit(&connp->conn_lock);
13493 return (EINPROGRESS);
13494 }
13495
13496 int
13497 ipif_down_tail(ipif_t *ipif)
13498 {
13499 ill_t *ill = ipif->ipif_ill;
13500 int err = 0;
13501
13502 DTRACE_PROBE3(ipif__downup, char *, "ipif_down_tail",
13503 ill_t *, ill, ipif_t *, ipif);
13504
13505 /*
13506 * Skip any loopback interface (null wq).
13507 * If this is the last logical interface on the ill
13508 * have ill_dl_down tell the driver we are gone (unbind)
13509 * Note that lun 0 can ipif_down even though
13510 * there are other logical units that are up.
13511 * This occurs e.g. when we change a "significant" IFF_ flag.
13512 */
13513 if (ill->ill_wq != NULL && !ill->ill_logical_down &&
13514 ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 &&
13515 ill->ill_dl_up) {
13516 ill_dl_down(ill);
13517 }
13518 if (!ipif->ipif_isv6)
13519 err = ipif_arp_down(ipif);
13520
13521 ill->ill_logical_down = 0;
13522
13523 ip_rts_ifmsg(ipif, RTSQ_DEFAULT);
13524 ip_rts_newaddrmsg(RTM_DELETE, 0, ipif, RTSQ_DEFAULT);
13525 return (err);
13526 }
13527
13528 /*
13529 * Bring the interface logically down without bringing the physical interface
13530 * down, e.g. when the netmask is changed. This avoids long-lasting link
13531 * negotiations between an Ethernet interface and certain switches.
13532 */
13533 static int
13534 ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
13535 {
13536 DTRACE_PROBE3(ipif__downup, char *, "ipif_logical_down",
13537 ill_t *, ipif->ipif_ill, ipif_t *, ipif);
13538
13539 /*
13540 * The ill_logical_down flag is a transient flag.
It is set here
13541 * and is cleared once the down has completed in ipif_down_tail.
13542 * This flag does not indicate whether the ill stream is in the
13543 * DL_BOUND state with the driver. Instead this flag is used by
13544 * ipif_down_tail to determine whether to DL_UNBIND the stream with
13545 * the driver. The state of the ill stream i.e. whether it is
13546 * DL_BOUND with the driver or not is indicated by the ill_dl_up flag.
13547 */
13548 ipif->ipif_ill->ill_logical_down = 1;
13549 return (ipif_down(ipif, q, mp));
13550 }
13551
13552 /*
13553 * Initiate the deallocation of an IPIF. Always called as writer. Called by
13554 * ill_delete or ip_sioctl_removeif.
13555 */
13556 static void
13557 ipif_free(ipif_t *ipif)
13558 {
13559 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
13560
13561 ASSERT(IAM_WRITER_IPIF(ipif));
13562
13563 if (ipif->ipif_recovery_id != 0)
13564 (void) untimeout(ipif->ipif_recovery_id);
13565 ipif->ipif_recovery_id = 0;
13566
13567 /*
13568 * Take down the interface. We can be called either from ill_delete
13569 * or from ip_sioctl_removeif.
13570 */
13571 (void) ipif_down(ipif, NULL, NULL);
13572
13573 /*
13574 * Now that the interface is down, there's no chance it can still
13575 * become a duplicate. Cancel any timer that may have been set while
13576 * tearing down.
13577 */
13578 if (ipif->ipif_recovery_id != 0)
13579 (void) untimeout(ipif->ipif_recovery_id);
13580 ipif->ipif_recovery_id = 0;
13581
13582 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
13583 /* Remove pointers to this ill in the multicast routing tables */
13584 reset_mrt_vif_ipif(ipif);
13585 /* If necessary, clear the cached source ipif rotor. */
13586 if (ipif->ipif_ill->ill_src_ipif == ipif)
13587 ipif->ipif_ill->ill_src_ipif = NULL;
13588 rw_exit(&ipst->ips_ill_g_lock);
13589 }
13590
13591 static void
13592 ipif_free_tail(ipif_t *ipif)
13593 {
13594 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
13595
13596 /*
13597 * Need to hold both ill_g_lock and ill_lock while
13598 * inserting or removing an ipif from the linked list
13599 * of ipifs hanging off the ill.
13600 */
13601 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
13602
13603 #ifdef DEBUG
13604 ipif_trace_cleanup(ipif);
13605 #endif
13606
13607 /* Ask SCTP to take it out of its list */
13608 sctp_update_ipif(ipif, SCTP_IPIF_REMOVE);
13609 ip_rts_newaddrmsg(RTM_FREEADDR, 0, ipif, RTSQ_DEFAULT);
13610
13611 /* Get it out of the ILL interface list. */
13612 ipif_remove(ipif);
13613 rw_exit(&ipst->ips_ill_g_lock);
13614
13615 ASSERT(!(ipif->ipif_flags & (IPIF_UP | IPIF_DUPLICATE)));
13616 ASSERT(ipif->ipif_recovery_id == 0);
13617 ASSERT(ipif->ipif_ire_local == NULL);
13618 ASSERT(ipif->ipif_ire_if == NULL);
13619
13620 /* Free the memory. */
13621 mi_free(ipif);
13622 }
13623
13624 /*
13625 * Sets `buf' to an ipif name of the form "ill_name:id", or "ill_name" if "id"
13626 * is zero.
13627 */
13628 void
13629 ipif_get_name(const ipif_t *ipif, char *buf, int len)
13630 {
13631 char lbuf[LIFNAMSIZ];
13632 char *name;
13633 size_t name_len;
13634
13635 buf[0] = '\0';
13636 name = ipif->ipif_ill->ill_name;
13637 name_len = ipif->ipif_ill->ill_name_length;
13638 if (ipif->ipif_id != 0) {
13639 (void) sprintf(lbuf, "%s%c%d", name, IPIF_SEPARATOR_CHAR,
13640 ipif->ipif_id);
13641 name = lbuf;
13642 name_len = mi_strlen(name) + 1;
13643 }
13644 len -= 1;
13645 buf[len] = '\0';
13646 len = MIN(len, name_len);
13647 bcopy(name, buf, len);
13648 }
13649
13650 /*
13651 * Sets `buf' to an ill name.
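 * As with ipif_get_name() above, the result is NUL-terminated and
 * silently truncated to fit `len'. A typical caller (a sketch only,
 * not a quote of any actual caller) supplies a LIFNAMSIZ buffer:
 *
 *	char ifname[LIFNAMSIZ];
 *
 *	ill_get_name(ill, ifname, sizeof (ifname));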
13652 */
13653 void
13654 ill_get_name(const ill_t *ill, char *buf, int len)
13655 {
13656 char *name;
13657 size_t name_len;
13658
13659 name = ill->ill_name;
13660 name_len = ill->ill_name_length;
13661 len -= 1;
13662 buf[len] = '\0';
13663 len = MIN(len, name_len);
13664 bcopy(name, buf, len);
13665 }
13666
13667 /*
13668 * Find an IPIF based on the name passed in. Names can be of the form <phys>
13669 * (e.g., le0) or <phys>:<#> (e.g., le0:1). When there is no colon, the
13670 * implied unit id is zero. <phys> must correspond to the name of an ILL.
13671 * (May be called as writer.)
13672 */
13673 static ipif_t *
13674 ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc,
13675 boolean_t *exists, boolean_t isv6, zoneid_t zoneid, ip_stack_t *ipst)
13676 {
13677 char *cp;
13678 char *endp;
13679 long id;
13680 ill_t *ill;
13681 ipif_t *ipif;
13682 uint_t ire_type;
13683 boolean_t did_alloc = B_FALSE;
13684 char last;
13685
13686 /*
13687 * If the caller wants us to create the ipif, make sure we have a
13688 * valid zoneid
13689 */
13690 ASSERT(!do_alloc || zoneid != ALL_ZONES);
13691
13692 if (namelen == 0) {
13693 return (NULL);
13694 }
13695
13696 *exists = B_FALSE;
13697 /* Look for a colon in the name. */
13698 endp = &name[namelen];
13699 for (cp = endp; --cp > name; ) {
13700 if (*cp == IPIF_SEPARATOR_CHAR)
13701 break;
13702 }
13703
13704 if (*cp == IPIF_SEPARATOR_CHAR) {
13705 /*
13706 * Reject any non-decimal aliases for logical
13707 * interfaces. Aliases with leading zeroes
13708 * are also rejected as they introduce ambiguity
13709 * in the naming of the interfaces.
13710 * In order to conform to existing semantics,
13711 * and to not break any programs/scripts relying
13712 * on that behaviour, if<0>:0 is considered to be
13713 * a valid interface.
13714 *
13715 * If alias has two or more digits and the first
13716 * is zero, fail.
13717 */
13718 if (&cp[2] < endp && cp[1] == '0') {
13719 return (NULL);
13720 }
13721 }
13722
13723 if (cp <= name) {
13724 cp = endp;
13725 }
13726 last = *cp;
13727 *cp = '\0';
13728
13729 /*
13730 * Look up the ILL, based on the portion of the name
13731 * before the colon. ill_lookup_on_name returns a held ill.
13732 * did_alloc is a temporary used to check whether the ill already
13733 * exists; if so, ill_lookup_on_name will clear it.
13734 */
13735 ill = ill_lookup_on_name(name, do_alloc, isv6,
13736 &did_alloc, ipst);
13737 *cp = last;
13738 if (ill == NULL)
13739 return (NULL);
13740
13741 /* Establish the unit number in the name. */
13742 id = 0;
13743 if (cp < endp && *endp == '\0') {
13744 /* If there was a colon, the unit number follows. */
13745 cp++;
13746 if (ddi_strtol(cp, NULL, 0, &id) != 0) {
13747 ill_refrele(ill);
13748 return (NULL);
13749 }
13750 }
13751
13752 mutex_enter(&ill->ill_lock);
13753 /* Now see if there is an IPIF with this unit number. */
13754 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
13755 if (ipif->ipif_id == id) {
13756 if (zoneid != ALL_ZONES &&
13757 zoneid != ipif->ipif_zoneid &&
13758 ipif->ipif_zoneid != ALL_ZONES) {
13759 mutex_exit(&ill->ill_lock);
13760 ill_refrele(ill);
13761 return (NULL);
13762 }
13763 if (IPIF_CAN_LOOKUP(ipif)) {
13764 ipif_refhold_locked(ipif);
13765 mutex_exit(&ill->ill_lock);
13766 if (!did_alloc)
13767 *exists = B_TRUE;
13768 /*
13769 * Drop locks before calling ill_refrele
13770 * since it can potentially call into
13771 * ipif_ill_refrele_tail which can end up
13772 * in trying to acquire any lock.
13773 */
13774 ill_refrele(ill);
13775 return (ipif);
13776 }
13777 }
13778 }
13779
13780 if (!do_alloc) {
13781 mutex_exit(&ill->ill_lock);
13782 ill_refrele(ill);
13783 return (NULL);
13784 }
13785
13786 /*
13787 * If none found, atomically allocate and return a new one.
13788 * Historically, we used IRE_LOOPBACK only for lun 0, and IRE_LOCAL
13789 * to support "receive only" use of lo0:1 etc. as is still done
13790 * below as an initial guess.
13791 * However, this is now likely to be overridden later in ipif_up_done()
13792 * when we know for sure what address has been configured on the
13793 * interface, since we might have more than one loopback interface
13794 * with a loopback address, e.g. in the case of zones, and all the
13795 * interfaces with loopback addresses need to be marked IRE_LOOPBACK.
13796 */
13797 if (ill->ill_net_type == IRE_LOOPBACK && id == 0)
13798 ire_type = IRE_LOOPBACK;
13799 else
13800 ire_type = IRE_LOCAL;
13801 ipif = ipif_allocate(ill, id, ire_type, B_TRUE, B_TRUE, NULL);
13802 if (ipif != NULL)
13803 ipif_refhold_locked(ipif);
13804 mutex_exit(&ill->ill_lock);
13805 ill_refrele(ill);
13806 return (ipif);
13807 }
13808
13809 /*
13810 * Variant of the above that queues the request on the ipsq when
13811 * IPIF_CHANGING is set.
13812 */
13813 static ipif_t *
13814 ipif_lookup_on_name_async(char *name, size_t namelen, boolean_t isv6,
13815 zoneid_t zoneid, queue_t *q, mblk_t *mp, ipsq_func_t func, int *error,
13816 ip_stack_t *ipst)
13817 {
13818 char *cp;
13819 char *endp;
13820 long id;
13821 ill_t *ill;
13822 ipif_t *ipif;
13823 boolean_t did_alloc = B_FALSE;
13824 ipsq_t *ipsq;
13825
13826 if (error != NULL)
13827 *error = 0;
13828
13829 if (namelen == 0) {
13830 if (error != NULL)
13831 *error = ENXIO;
13832 return (NULL);
13833 }
13834
13835 /* Look for a colon in the name. */
13836 endp = &name[namelen];
13837 for (cp = endp; --cp > name; ) {
13838 if (*cp == IPIF_SEPARATOR_CHAR)
13839 break;
13840 }
13841
13842 if (*cp == IPIF_SEPARATOR_CHAR) {
13843 /*
13844 * Reject any non-decimal aliases for logical
13845 * interfaces. Aliases with leading zeroes
13846 * are also rejected as they introduce ambiguity
13847 * in the naming of the interfaces.
13848 * In order to conform to existing semantics,
13849 * and to not break any programs/scripts relying
13850 * on that behaviour, if<0>:0 is considered to be
13851 * a valid interface.
13852 *
13853 * If alias has two or more digits and the first
13854 * is zero, fail.
13855 */
13856 if (&cp[2] < endp && cp[1] == '0') {
13857 if (error != NULL)
13858 *error = EINVAL;
13859 return (NULL);
13860 }
13861 }
13862
13863 if (cp <= name) {
13864 cp = endp;
13865 } else {
13866 *cp = '\0';
13867 }
13868
13869 /*
13870 * Look up the ILL, based on the portion of the name
13871 * before the colon. ill_lookup_on_name returns a held ill.
13872 * did_alloc is a temporary used to check whether the ill already
13873 * exists; if so, ill_lookup_on_name will clear it.
13874 */
13875 ill = ill_lookup_on_name(name, B_FALSE, isv6, &did_alloc, ipst);
13876 if (cp != endp)
13877 *cp = IPIF_SEPARATOR_CHAR;
13878 if (ill == NULL)
13879 return (NULL);
13880
13881 /* Establish the unit number in the name. */
13882 id = 0;
13883 if (cp < endp && *endp == '\0') {
13884 /* If there was a colon, the unit number follows.
*/ 13885 cp++; 13886 if (ddi_strtol(cp, NULL, 0, &id) != 0) { 13887 ill_refrele(ill); 13888 if (error != NULL) 13889 *error = ENXIO; 13890 return (NULL); 13891 } 13892 } 13893 13894 GRAB_CONN_LOCK(q); 13895 mutex_enter(&ill->ill_lock); 13896 /* Now see if there is an IPIF with this unit number. */ 13897 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 13898 if (ipif->ipif_id == id) { 13899 if (zoneid != ALL_ZONES && 13900 zoneid != ipif->ipif_zoneid && 13901 ipif->ipif_zoneid != ALL_ZONES) { 13902 mutex_exit(&ill->ill_lock); 13903 RELEASE_CONN_LOCK(q); 13904 ill_refrele(ill); 13905 if (error != NULL) 13906 *error = ENXIO; 13907 return (NULL); 13908 } 13909 13910 if (!(IPIF_IS_CHANGING(ipif) || 13911 IPIF_IS_CONDEMNED(ipif)) || 13912 IAM_WRITER_IPIF(ipif)) { 13913 ipif_refhold_locked(ipif); 13914 mutex_exit(&ill->ill_lock); 13915 /* 13916 * Drop locks before calling ill_refrele 13917 * since it can potentially call into 13918 * ipif_ill_refrele_tail which can end up 13919 * in trying to acquire any lock. 13920 */ 13921 RELEASE_CONN_LOCK(q); 13922 ill_refrele(ill); 13923 return (ipif); 13924 } else if (q != NULL && !IPIF_IS_CONDEMNED(ipif)) { 13925 ipsq = ill->ill_phyint->phyint_ipsq; 13926 mutex_enter(&ipsq->ipsq_lock); 13927 mutex_enter(&ipsq->ipsq_xop->ipx_lock); 13928 mutex_exit(&ill->ill_lock); 13929 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); 13930 mutex_exit(&ipsq->ipsq_xop->ipx_lock); 13931 mutex_exit(&ipsq->ipsq_lock); 13932 RELEASE_CONN_LOCK(q); 13933 ill_refrele(ill); 13934 if (error != NULL) 13935 *error = EINPROGRESS; 13936 return (NULL); 13937 } 13938 } 13939 } 13940 RELEASE_CONN_LOCK(q); 13941 mutex_exit(&ill->ill_lock); 13942 ill_refrele(ill); 13943 if (error != NULL) 13944 *error = ENXIO; 13945 return (NULL); 13946 } 13947 13948 /* 13949 * This routine is called whenever a new address comes up on an ipif. If 13950 * we are configured to respond to address mask requests, then we are supposed 13951 * to broadcast an address mask reply at this time. This routine is also 13952 * called if we are already up, but a netmask change is made. This is legal 13953 * but might not make the system manager very popular. (May be called 13954 * as writer.) 
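 *
 * The reply built below is REPLY_LEN bytes: an IPv4 header copied
 * from icmp_ipha, an icmph_t of type ICMP_ADDRESS_MASK_REPLY, and
 * the IP_ADDR_LEN-byte ipif_net_mask, sent from ipif_lcl_addr to
 * ipif_brd_addr:
 *
 *	[ ipha_t | icmph_t | netmask ]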
13955 */ 13956 void 13957 ipif_mask_reply(ipif_t *ipif) 13958 { 13959 icmph_t *icmph; 13960 ipha_t *ipha; 13961 mblk_t *mp; 13962 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 13963 ip_xmit_attr_t ixas; 13964 13965 #define REPLY_LEN (sizeof (icmp_ipha) + sizeof (icmph_t) + IP_ADDR_LEN) 13966 13967 if (!ipst->ips_ip_respond_to_address_mask_broadcast) 13968 return; 13969 13970 /* ICMP mask reply is IPv4 only */ 13971 ASSERT(!ipif->ipif_isv6); 13972 /* ICMP mask reply is not for a loopback interface */ 13973 ASSERT(ipif->ipif_ill->ill_wq != NULL); 13974 13975 if (ipif->ipif_lcl_addr == INADDR_ANY) 13976 return; 13977 13978 mp = allocb(REPLY_LEN, BPRI_HI); 13979 if (mp == NULL) 13980 return; 13981 mp->b_wptr = mp->b_rptr + REPLY_LEN; 13982 13983 ipha = (ipha_t *)mp->b_rptr; 13984 bzero(ipha, REPLY_LEN); 13985 *ipha = icmp_ipha; 13986 ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl; 13987 ipha->ipha_src = ipif->ipif_lcl_addr; 13988 ipha->ipha_dst = ipif->ipif_brd_addr; 13989 ipha->ipha_length = htons(REPLY_LEN); 13990 ipha->ipha_ident = 0; 13991 13992 icmph = (icmph_t *)&ipha[1]; 13993 icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY; 13994 bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN); 13995 icmph->icmph_checksum = IP_CSUM(mp, sizeof (ipha_t), 0); 13996 13997 bzero(&ixas, sizeof (ixas)); 13998 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4; 13999 ixas.ixa_zoneid = ALL_ZONES; 14000 ixas.ixa_ifindex = 0; 14001 ixas.ixa_ipst = ipst; 14002 ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; 14003 (void) ip_output_simple(mp, &ixas); 14004 ixa_cleanup(&ixas); 14005 #undef REPLY_LEN 14006 } 14007 14008 /* 14009 * Join the ipif specific multicast groups. 14010 * Must be called after a mapping has been set up in the resolver. (Always 14011 * called as writer.) 14012 */ 14013 void 14014 ipif_multicast_up(ipif_t *ipif) 14015 { 14016 int err; 14017 ill_t *ill; 14018 ilm_t *ilm; 14019 14020 ASSERT(IAM_WRITER_IPIF(ipif)); 14021 14022 ill = ipif->ipif_ill; 14023 14024 ip1dbg(("ipif_multicast_up\n")); 14025 if (!(ill->ill_flags & ILLF_MULTICAST) || 14026 ipif->ipif_allhosts_ilm != NULL) 14027 return; 14028 14029 if (ipif->ipif_isv6) { 14030 in6_addr_t v6allmc = ipv6_all_hosts_mcast; 14031 in6_addr_t v6solmc = ipv6_solicited_node_mcast; 14032 14033 v6solmc.s6_addr32[3] |= ipif->ipif_v6lcl_addr.s6_addr32[3]; 14034 14035 if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) 14036 return; 14037 14038 ip1dbg(("ipif_multicast_up - addmulti\n")); 14039 14040 /* 14041 * Join the all hosts multicast address. We skip this for 14042 * underlying IPMP interfaces since they should be invisible. 14043 */ 14044 if (!IS_UNDER_IPMP(ill)) { 14045 ilm = ip_addmulti(&v6allmc, ill, ipif->ipif_zoneid, 14046 &err); 14047 if (ilm == NULL) { 14048 ASSERT(err != 0); 14049 ip0dbg(("ipif_multicast_up: " 14050 "all_hosts_mcast failed %d\n", err)); 14051 return; 14052 } 14053 ipif->ipif_allhosts_ilm = ilm; 14054 } 14055 14056 /* 14057 * Enable multicast for the solicited node multicast address. 14058 * If IPMP we need to put the membership on the upper ill. 
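 * The earlier OR of the local address's low-order 32 bits into
 * v6solmc forms the solicited-node group. For example (illustrative
 * address only), a local address of 2001:db8::abcd:1234 joins
 * ff02::1:ffcd:1234.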
14059 */ 14060 if (!(ipif->ipif_flags & IPIF_NOLOCAL)) { 14061 ill_t *mcast_ill = NULL; 14062 boolean_t need_refrele; 14063 14064 if (IS_UNDER_IPMP(ill) && 14065 (mcast_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) { 14066 need_refrele = B_TRUE; 14067 } else { 14068 mcast_ill = ill; 14069 need_refrele = B_FALSE; 14070 } 14071 14072 ilm = ip_addmulti(&v6solmc, mcast_ill, 14073 ipif->ipif_zoneid, &err); 14074 if (need_refrele) 14075 ill_refrele(mcast_ill); 14076 14077 if (ilm == NULL) { 14078 ASSERT(err != 0); 14079 ip0dbg(("ipif_multicast_up: solicited MC" 14080 " failed %d\n", err)); 14081 if ((ilm = ipif->ipif_allhosts_ilm) != NULL) { 14082 ipif->ipif_allhosts_ilm = NULL; 14083 (void) ip_delmulti(ilm); 14084 } 14085 return; 14086 } 14087 ipif->ipif_solmulti_ilm = ilm; 14088 } 14089 } else { 14090 in6_addr_t v6group; 14091 14092 if (ipif->ipif_lcl_addr == INADDR_ANY || IS_UNDER_IPMP(ill)) 14093 return; 14094 14095 /* Join the all hosts multicast address */ 14096 ip1dbg(("ipif_multicast_up - addmulti\n")); 14097 IN6_IPADDR_TO_V4MAPPED(htonl(INADDR_ALLHOSTS_GROUP), &v6group); 14098 14099 ilm = ip_addmulti(&v6group, ill, ipif->ipif_zoneid, &err); 14100 if (ilm == NULL) { 14101 ASSERT(err != 0); 14102 ip0dbg(("ipif_multicast_up: failed %d\n", err)); 14103 return; 14104 } 14105 ipif->ipif_allhosts_ilm = ilm; 14106 } 14107 } 14108 14109 /* 14110 * Blow away any multicast groups that we joined in ipif_multicast_up(). 14111 * (ilms from explicit memberships are handled in conn_update_ill.) 14112 */ 14113 void 14114 ipif_multicast_down(ipif_t *ipif) 14115 { 14116 ASSERT(IAM_WRITER_IPIF(ipif)); 14117 14118 ip1dbg(("ipif_multicast_down\n")); 14119 14120 if (ipif->ipif_allhosts_ilm != NULL) { 14121 (void) ip_delmulti(ipif->ipif_allhosts_ilm); 14122 ipif->ipif_allhosts_ilm = NULL; 14123 } 14124 if (ipif->ipif_solmulti_ilm != NULL) { 14125 (void) ip_delmulti(ipif->ipif_solmulti_ilm); 14126 ipif->ipif_solmulti_ilm = NULL; 14127 } 14128 } 14129 14130 /* 14131 * Used when an interface comes up to recreate any extra routes on this 14132 * interface. 14133 */ 14134 int 14135 ill_recover_saved_ire(ill_t *ill) 14136 { 14137 mblk_t *mp; 14138 ip_stack_t *ipst = ill->ill_ipst; 14139 14140 ip1dbg(("ill_recover_saved_ire(%s)", ill->ill_name)); 14141 14142 mutex_enter(&ill->ill_saved_ire_lock); 14143 for (mp = ill->ill_saved_ire_mp; mp != NULL; mp = mp->b_cont) { 14144 ire_t *ire, *nire; 14145 ifrt_t *ifrt; 14146 14147 ifrt = (ifrt_t *)mp->b_rptr; 14148 /* 14149 * Create a copy of the IRE with the saved address and netmask. 
14150 */
14151 if (ill->ill_isv6) {
14152 ire = ire_create_v6(
14153 &ifrt->ifrt_v6addr,
14154 &ifrt->ifrt_v6mask,
14155 &ifrt->ifrt_v6gateway_addr,
14156 ifrt->ifrt_type,
14157 ill,
14158 ifrt->ifrt_zoneid,
14159 ifrt->ifrt_flags,
14160 NULL,
14161 ipst);
14162 } else {
14163 ire = ire_create(
14164 (uint8_t *)&ifrt->ifrt_addr,
14165 (uint8_t *)&ifrt->ifrt_mask,
14166 (uint8_t *)&ifrt->ifrt_gateway_addr,
14167 ifrt->ifrt_type,
14168 ill,
14169 ifrt->ifrt_zoneid,
14170 ifrt->ifrt_flags,
14171 NULL,
14172 ipst);
14173 }
14174 if (ire == NULL) {
14175 mutex_exit(&ill->ill_saved_ire_lock);
14176 return (ENOMEM);
14177 }
14178
14179 if (ifrt->ifrt_flags & RTF_SETSRC) {
14180 if (ill->ill_isv6) {
14181 ire->ire_setsrc_addr_v6 =
14182 ifrt->ifrt_v6setsrc_addr;
14183 } else {
14184 ire->ire_setsrc_addr = ifrt->ifrt_setsrc_addr;
14185 }
14186 }
14187
14188 /*
14189 * Some software (for example, GateD and Sun Cluster) attempts
14190 * to create (what amount to) IRE_PREFIX routes with the
14191 * loopback address as the gateway. This is primarily done to
14192 * set up prefixes with the RTF_REJECT flag set (for example,
14193 * when generating aggregate routes.)
14194 *
14195 * If the IRE type (as defined by ill->ill_net_type) is
14196 * IRE_LOOPBACK, then we map the request into an
14197 * IRE_IF_NORESOLVER.
14198 */
14199 if (ill->ill_net_type == IRE_LOOPBACK)
14200 ire->ire_type = IRE_IF_NORESOLVER;
14201
14202 /*
14203 * The ire is held by ire_add and will be refrele'd toward
14204 * the end of ipif_up_done.
14205 */
14206 nire = ire_add(ire);
14207 /*
14208 * Check if it was a duplicate entry. This handles
14209 * the case of two racing route adds for the same route.
14210 */
14211 if (nire == NULL) {
14212 ip1dbg(("ill_recover_saved_ire: FAILED\n"));
14213 } else if (nire != ire) {
14214 ip1dbg(("ill_recover_saved_ire: duplicate ire %p\n",
14215 (void *)nire));
14216 ire_delete(nire);
14217 } else {
14218 ip1dbg(("ill_recover_saved_ire: added ire %p\n",
14219 (void *)nire));
14220 }
14221 if (nire != NULL)
14222 ire_refrele(nire);
14223 }
14224 mutex_exit(&ill->ill_saved_ire_lock);
14225 return (0);
14226 }
14227
14228 /*
14229 * Used to set the netmask and broadcast address to default values when the
14230 * interface is brought up. (Always called as writer.)
14231 */
14232 static void
14233 ipif_set_default(ipif_t *ipif)
14234 {
14235 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
14236
14237 if (!ipif->ipif_isv6) {
14238 /*
14239 * Interface holds an IPv4 address. Default
14240 * mask is the natural netmask.
14241 */
14242 if (!ipif->ipif_net_mask) {
14243 ipaddr_t v4mask;
14244
14245 v4mask = ip_net_mask(ipif->ipif_lcl_addr);
14246 V4MASK_TO_V6(v4mask, ipif->ipif_v6net_mask);
14247 }
14248 if (ipif->ipif_flags & IPIF_POINTOPOINT) {
14249 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */
14250 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr;
14251 } else {
14252 V6_MASK_COPY(ipif->ipif_v6lcl_addr,
14253 ipif->ipif_v6net_mask, ipif->ipif_v6subnet);
14254 }
14255 /*
14256 * NOTE: SunOS 4.X does this even if the broadcast address
14257 * has already been set; thus we do the same here.
14258 */
14259 if (ipif->ipif_flags & IPIF_BROADCAST) {
14260 ipaddr_t v4addr;
14261
14262 v4addr = ipif->ipif_subnet | ~ipif->ipif_net_mask;
14263 IN6_IPADDR_TO_V4MAPPED(v4addr, &ipif->ipif_v6brd_addr);
14264 }
14265 } else {
14266 /*
14267 * Interface holds an IPv6-only address. Default
14268 * mask is all-ones.
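 *
 * (To illustrate the IPv4 branch above with an example address:
 * a local address of 192.168.1.5 with no netmask configured gets
 * the natural mask 255.255.255.0, making ipif_subnet 192.168.1.0;
 * with IPIF_BROADCAST set, the default broadcast address becomes
 * 192.168.1.0 | ~255.255.255.0, i.e. 192.168.1.255.)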
14269 */ 14270 if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6net_mask)) 14271 ipif->ipif_v6net_mask = ipv6_all_ones; 14272 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 14273 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 14274 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; 14275 } else { 14276 V6_MASK_COPY(ipif->ipif_v6lcl_addr, 14277 ipif->ipif_v6net_mask, ipif->ipif_v6subnet); 14278 } 14279 } 14280 } 14281 14282 /* 14283 * Return 0 if this address can be used as local address without causing 14284 * duplicate address problems. Otherwise, return EADDRNOTAVAIL if the address 14285 * is already up on a different ill, and EADDRINUSE if it's up on the same ill. 14286 * Note that the same IPv6 link-local address is allowed as long as the ills 14287 * are not on the same link. 14288 */ 14289 int 14290 ip_addr_availability_check(ipif_t *new_ipif) 14291 { 14292 in6_addr_t our_v6addr; 14293 ill_t *ill; 14294 ipif_t *ipif; 14295 ill_walk_context_t ctx; 14296 ip_stack_t *ipst = new_ipif->ipif_ill->ill_ipst; 14297 14298 ASSERT(IAM_WRITER_IPIF(new_ipif)); 14299 ASSERT(MUTEX_HELD(&ipst->ips_ip_addr_avail_lock)); 14300 ASSERT(RW_READ_HELD(&ipst->ips_ill_g_lock)); 14301 14302 new_ipif->ipif_flags &= ~IPIF_UNNUMBERED; 14303 if (IN6_IS_ADDR_UNSPECIFIED(&new_ipif->ipif_v6lcl_addr) || 14304 IN6_IS_ADDR_V4MAPPED_ANY(&new_ipif->ipif_v6lcl_addr)) 14305 return (0); 14306 14307 our_v6addr = new_ipif->ipif_v6lcl_addr; 14308 14309 if (new_ipif->ipif_isv6) 14310 ill = ILL_START_WALK_V6(&ctx, ipst); 14311 else 14312 ill = ILL_START_WALK_V4(&ctx, ipst); 14313 14314 for (; ill != NULL; ill = ill_next(&ctx, ill)) { 14315 for (ipif = ill->ill_ipif; ipif != NULL; 14316 ipif = ipif->ipif_next) { 14317 if ((ipif == new_ipif) || 14318 !(ipif->ipif_flags & IPIF_UP) || 14319 (ipif->ipif_flags & IPIF_UNNUMBERED) || 14320 !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, 14321 &our_v6addr)) 14322 continue; 14323 14324 if (new_ipif->ipif_flags & IPIF_POINTOPOINT) 14325 new_ipif->ipif_flags |= IPIF_UNNUMBERED; 14326 else if (ipif->ipif_flags & IPIF_POINTOPOINT) 14327 ipif->ipif_flags |= IPIF_UNNUMBERED; 14328 else if ((IN6_IS_ADDR_LINKLOCAL(&our_v6addr) || 14329 IN6_IS_ADDR_SITELOCAL(&our_v6addr)) && 14330 !IS_ON_SAME_LAN(ill, new_ipif->ipif_ill)) 14331 continue; 14332 else if (new_ipif->ipif_zoneid != ipif->ipif_zoneid && 14333 ipif->ipif_zoneid != ALL_ZONES && IS_LOOPBACK(ill)) 14334 continue; 14335 else if (new_ipif->ipif_ill == ill) 14336 return (EADDRINUSE); 14337 else 14338 return (EADDRNOTAVAIL); 14339 } 14340 } 14341 14342 return (0); 14343 } 14344 14345 /* 14346 * Bring up an ipif: bring up arp/ndp, bring up the DLPI stream, and add 14347 * IREs for the ipif. 14348 * When the routine returns EINPROGRESS then mp has been consumed and 14349 * the ioctl will be acked from ip_rput_dlpi. 14350 */ 14351 int 14352 ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) 14353 { 14354 ill_t *ill = ipif->ipif_ill; 14355 boolean_t isv6 = ipif->ipif_isv6; 14356 int err = 0; 14357 boolean_t success; 14358 uint_t ipif_orig_id; 14359 ip_stack_t *ipst = ill->ill_ipst; 14360 14361 ASSERT(IAM_WRITER_IPIF(ipif)); 14362 14363 ip1dbg(("ipif_up(%s:%u)\n", ill->ill_name, ipif->ipif_id)); 14364 DTRACE_PROBE3(ipif__downup, char *, "ipif_up", 14365 ill_t *, ill, ipif_t *, ipif); 14366 14367 /* Shouldn't get here if it is already up. 
*/
14368 if (ipif->ipif_flags & IPIF_UP)
14369 return (EALREADY);
14370
14371 /*
14372 * If this is a request to bring up a data address on an interface
14373 * under IPMP, then move the address to its IPMP meta-interface and
14374 * try to bring it up. One complication is that the zeroth ipif for
14375 * an ill is special, in that every ill always has one, and that code
14376 * throughout IP dereferences ill->ill_ipif without holding any locks.
14377 */
14378 if (IS_UNDER_IPMP(ill) && ipmp_ipif_is_dataaddr(ipif) &&
14379 (!ipif->ipif_isv6 || !V6_IPIF_LINKLOCAL(ipif))) {
14380 ipif_t *stubipif = NULL, *moveipif = NULL;
14381 ill_t *ipmp_ill = ipmp_illgrp_ipmp_ill(ill->ill_grp);
14382
14383 /*
14384 * The ipif being brought up should be quiesced. If it's not,
14385 * something has gone amiss and we need to bail out. (If it's
14386 * quiesced, we know it will remain so via IPIF_CONDEMNED.)
14387 */
14388 mutex_enter(&ill->ill_lock);
14389 if (!ipif_is_quiescent(ipif)) {
14390 mutex_exit(&ill->ill_lock);
14391 return (EINVAL);
14392 }
14393 mutex_exit(&ill->ill_lock);
14394
14395 /*
14396 * If we're going to need to allocate ipifs, do it prior
14397 * to starting the move (and grabbing locks).
14398 */
14399 if (ipif->ipif_id == 0) {
14400 if ((moveipif = ipif_allocate(ill, 0, IRE_LOCAL, B_TRUE,
14401 B_FALSE, &err)) == NULL) {
14402 return (err);
14403 }
14404 if ((stubipif = ipif_allocate(ill, 0, IRE_LOCAL, B_TRUE,
14405 B_FALSE, &err)) == NULL) {
14406 mi_free(moveipif);
14407 return (err);
14408 }
14409 }
14410
14411 /*
14412 * Grab or transfer the ipif to move. During the move, keep
14413 * ill_g_lock held to prevent any ill walker threads from
14414 * seeing things in an inconsistent state.
14415 */
14416 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
14417 if (ipif->ipif_id != 0) {
14418 ipif_remove(ipif);
14419 } else {
14420 ipif_transfer(ipif, moveipif, stubipif);
14421 ipif = moveipif;
14422 }
14423
14424 /*
14425 * Place the ipif on the IPMP ill. If the zeroth ipif on
14426 * the IPMP ill is a stub (0.0.0.0 down address) then we
14427 * replace that one. Otherwise, pick the next available slot.
14428 */
14429 ipif->ipif_ill = ipmp_ill;
14430 ipif_orig_id = ipif->ipif_id;
14431
14432 if (ipmp_ipif_is_stubaddr(ipmp_ill->ill_ipif)) {
14433 ipif_transfer(ipif, ipmp_ill->ill_ipif, NULL);
14434 ipif = ipmp_ill->ill_ipif;
14435 } else {
14436 ipif->ipif_id = -1;
14437 if ((err = ipif_insert(ipif, B_FALSE)) != 0) {
14438 /*
14439 * No more available ipif_id's -- put it back
14440 * on the original ill and fail the operation.
14441 * Since we're writer on the ill, we can be
14442 * sure our old slot is still available.
14443 */
14444 ipif->ipif_id = ipif_orig_id;
14445 ipif->ipif_ill = ill;
14446 if (ipif_orig_id == 0) {
14447 ipif_transfer(ipif, ill->ill_ipif,
14448 NULL);
14449 } else {
14450 VERIFY(ipif_insert(ipif, B_FALSE) == 0);
14451 }
14452 rw_exit(&ipst->ips_ill_g_lock);
14453 return (err);
14454 }
14455 }
14456 rw_exit(&ipst->ips_ill_g_lock);
14457
14458 /*
14459 * Tell SCTP that the ipif has moved. Note that even if we
14460 * had to allocate a new ipif, the original sequence id was
14461 * preserved and therefore SCTP won't know.
14462 */
14463 sctp_move_ipif(ipif, ill, ipmp_ill);
14464
14465 /*
14466 * If the ipif being brought up was on slot zero, then we
14467 * first need to bring up the placeholder we stuck there.
In
14468 * ip_rput_dlpi_writer(), arp_bringup_done(), or the recursive
14469 * call to ipif_up() itself, if we successfully bring up the
14470 * placeholder, we'll check ill_move_ipif and bring it up too.
14471 */
14472 if (ipif_orig_id == 0) {
14473 ASSERT(ill->ill_move_ipif == NULL);
14474 ill->ill_move_ipif = ipif;
14475 if ((err = ipif_up(ill->ill_ipif, q, mp)) == 0)
14476 ASSERT(ill->ill_move_ipif == NULL);
14477 if (err != EINPROGRESS)
14478 ill->ill_move_ipif = NULL;
14479 return (err);
14480 }
14481
14482 /*
14483 * Bring it up on the IPMP ill.
14484 */
14485 return (ipif_up(ipif, q, mp));
14486 }
14487
14488 /* Skip arp/ndp for any loopback interface. */
14489 if (ill->ill_wq != NULL) {
14490 conn_t *connp = CONN_Q(q) ? Q_TO_CONN(q) : NULL;
14491 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;
14492
14493 if (!ill->ill_dl_up) {
14494 /*
14495 * ill_dl_up is not yet set. i.e. we are yet to
14496 * DL_BIND with the driver and this is the first
14497 * logical interface on the ill to become "up".
14498 * Tell the driver to get going (via DL_BIND_REQ).
14499 * Note that changing "significant" IFF_ flags
14500 * (address/netmask etc.) causes a down/up dance,
14501 * but does not cause an unbind (DL_UNBIND) with
14502 * the driver.
14503 */
14504 return (ill_dl_up(ill, ipif, mp, q));
14505 }
14506
14507 /*
14508 * ipif_resolver_up may end up needing to bind/attach
14509 * the ARP stream, which in turn necessitates a
14510 * DLPI message exchange with the driver. ioctls are
14511 * serialized and so we cannot send more than one
14512 * interface up message at a time. If ipif_resolver_up
14513 * does need to wait for the DLPI handshake for the ARP stream,
14514 * we get EINPROGRESS and we will complete in arp_bringup_done.
14515 */
14516
14517 ASSERT(connp != NULL || !CONN_Q(q));
14518 if (connp != NULL)
14519 mutex_enter(&connp->conn_lock);
14520 mutex_enter(&ill->ill_lock);
14521 success = ipsq_pending_mp_add(connp, ipif, q, mp, 0);
14522 mutex_exit(&ill->ill_lock);
14523 if (connp != NULL)
14524 mutex_exit(&connp->conn_lock);
14525 if (!success)
14526 return (EINTR);
14527
14528 /*
14529 * Crank up IPv6 neighbor discovery. Unlike ARP, this should
14530 * complete when ipif_ndp_up returns.
14531 */
14532 err = ipif_resolver_up(ipif, Res_act_initial);
14533 if (err == EINPROGRESS) {
14534 /* We will complete it in arp_bringup_done() */
14535 return (err);
14536 }
14537
14538 if (isv6 && err == 0)
14539 err = ipif_ndp_up(ipif, B_TRUE);
14540
14541 ASSERT(err != EINPROGRESS);
14542 mp = ipsq_pending_mp_get(ipsq, &connp);
14543 ASSERT(mp != NULL);
14544 if (err != 0)
14545 return (err);
14546 } else {
14547 /*
14548 * Interfaces without underlying hardware don't do duplicate
14549 * address detection.
14550 */
14551 ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
14552 ipif->ipif_addr_ready = 1;
14553 err = ill_add_ires(ill);
14554 /* allocation failure? */
14555 if (err != 0)
14556 return (err);
14557 }
14558
14559 err = (isv6 ? ipif_up_done_v6(ipif) : ipif_up_done(ipif));
14560 if (err == 0 && ill->ill_move_ipif != NULL) {
14561 ipif = ill->ill_move_ipif;
14562 ill->ill_move_ipif = NULL;
14563 return (ipif_up(ipif, q, mp));
14564 }
14565 return (err);
14566 }
14567
14568 /*
14569 * Add any IREs tied to the ill. For now this is just an IRE_MULTICAST.
14570 * The identical set of IREs needs to be removed in ill_delete_ires().
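 *
 * The entry is created with a dummy address (INADDR_ALLHOSTS_GROUP
 * for IPv4) since the address itself is never matched; a consumer
 * such as ire_multicast() simply hands back this one cached entry
 * for any multicast destination on the ill.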
14570 */ 14571 int 14572 ill_add_ires(ill_t *ill) 14573 { 14574 ire_t *ire; 14575 in6_addr_t dummy6 = {(uint32_t)V6_MCAST, 0, 0, 1}; 14576 in_addr_t dummy4 = htonl(INADDR_ALLHOSTS_GROUP); 14577 14578 if (ill->ill_ire_multicast != NULL) 14579 return (0); 14580 14581 /* 14582 * provide some dummy ire_addr for creating the ire. 14583 */ 14584 if (ill->ill_isv6) { 14585 ire = ire_create_v6(&dummy6, 0, 0, IRE_MULTICAST, ill, 14586 ALL_ZONES, RTF_UP, NULL, ill->ill_ipst); 14587 } else { 14588 ire = ire_create((uchar_t *)&dummy4, 0, 0, IRE_MULTICAST, ill, 14589 ALL_ZONES, RTF_UP, NULL, ill->ill_ipst); 14590 } 14591 if (ire == NULL) 14592 return (ENOMEM); 14593 14594 ill->ill_ire_multicast = ire; 14595 return (0); 14596 } 14597 14598 void 14599 ill_delete_ires(ill_t *ill) 14600 { 14601 if (ill->ill_ire_multicast != NULL) { 14602 /* 14603 * BIND/ATTACH completed; Release the ref for ill_ire_multicast 14604 * which was taken without any th_tracing enabled. 14605 * We also mark it as condemned (note that it was never added) 14606 * so that caching conn's can move off of it. 14607 */ 14608 ire_make_condemned(ill->ill_ire_multicast); 14609 ire_refrele_notr(ill->ill_ire_multicast); 14610 ill->ill_ire_multicast = NULL; 14611 } 14612 } 14613 14614 /* 14615 * Perform a bind for the physical device. 14616 * When the routine returns EINPROGRESS then mp has been consumed and 14617 * the ioctl will be acked from ip_rput_dlpi. 14618 * Allocate an unbind message and save it until ipif_down. 14619 */ 14620 static int 14621 ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) 14622 { 14623 mblk_t *bind_mp = NULL; 14624 mblk_t *unbind_mp = NULL; 14625 conn_t *connp; 14626 boolean_t success; 14627 int err; 14628 14629 DTRACE_PROBE2(ill__downup, char *, "ill_dl_up", ill_t *, ill); 14630 14631 ip1dbg(("ill_dl_up(%s)\n", ill->ill_name)); 14632 ASSERT(IAM_WRITER_ILL(ill)); 14633 ASSERT(mp != NULL); 14634 14635 /* 14636 * Make sure we have an IRE_MULTICAST in case we immediately 14637 * start receiving packets. 14638 */ 14639 err = ill_add_ires(ill); 14640 if (err != 0) 14641 goto bad; 14642 14643 bind_mp = ip_dlpi_alloc(sizeof (dl_bind_req_t) + sizeof (long), 14644 DL_BIND_REQ); 14645 if (bind_mp == NULL) 14646 goto bad; 14647 ((dl_bind_req_t *)bind_mp->b_rptr)->dl_sap = ill->ill_sap; 14648 ((dl_bind_req_t *)bind_mp->b_rptr)->dl_service_mode = DL_CLDLS; 14649 14650 /* 14651 * ill_unbind_mp would be non-null if the following sequence had 14652 * happened: 14653 * - send DL_BIND_REQ to driver, wait for response 14654 * - multiple ioctls that need to bring the ipif up are encountered, 14655 * but they cannot enter the ipsq due to the outstanding DL_BIND_REQ. 14656 * These ioctls will then be enqueued on the ipsq 14657 * - a DL_ERROR_ACK is returned for the DL_BIND_REQ 14658 * At this point, the pending ioctls in the ipsq will be drained, and 14659 * since ill->ill_dl_up was not set, ill_dl_up would be invoked with 14660 * a non-null ill->ill_unbind_mp 14661 */ 14662 if (ill->ill_unbind_mp == NULL) { 14663 unbind_mp = ip_dlpi_alloc(sizeof (dl_unbind_req_t), 14664 DL_UNBIND_REQ); 14665 if (unbind_mp == NULL) 14666 goto bad; 14667 } 14668 /* 14669 * Record state needed to complete this operation when the 14670 * DL_BIND_ACK shows up. Also remember the pre-allocated mblks. 14671 */ 14672 connp = CONN_Q(q) ? 
Q_TO_CONN(q) : NULL; 14673 ASSERT(connp != NULL || !CONN_Q(q)); 14674 GRAB_CONN_LOCK(q); 14675 mutex_enter(&ipif->ipif_ill->ill_lock); 14676 success = ipsq_pending_mp_add(connp, ipif, q, mp, 0); 14677 mutex_exit(&ipif->ipif_ill->ill_lock); 14678 RELEASE_CONN_LOCK(q); 14679 if (!success) 14680 goto bad; 14681 14682 /* 14683 * Save the unbind message for ill_dl_down(); it will be consumed when 14684 * the interface goes down. 14685 */ 14686 if (ill->ill_unbind_mp == NULL) 14687 ill->ill_unbind_mp = unbind_mp; 14688 14689 ill_dlpi_send(ill, bind_mp); 14690 /* Send down link-layer capabilities probe if not already done. */ 14691 ill_capability_probe(ill); 14692 14693 /* 14694 * Sysid used to rely on the fact that netboots set domainname 14695 * and the like. Now that miniroot boots aren't strictly netboots 14696 * and miniroot network configuration is driven from userland 14697 * these things still need to be set. This situation can be detected 14698 * by comparing the interface being configured here to the one 14699 * dhcifname was set to reference by the boot loader. Once sysid is 14700 * converted to use dhcp_ipc_getinfo() this call can go away. 14701 */ 14702 if ((ipif->ipif_flags & IPIF_DHCPRUNNING) && 14703 (strcmp(ill->ill_name, dhcifname) == 0) && 14704 (strlen(srpc_domain) == 0)) { 14705 if (dhcpinit() != 0) 14706 cmn_err(CE_WARN, "no cached dhcp response"); 14707 } 14708 14709 /* 14710 * This operation will complete in ip_rput_dlpi with either 14711 * a DL_BIND_ACK or DL_ERROR_ACK. 14712 */ 14713 return (EINPROGRESS); 14714 bad: 14715 ip1dbg(("ill_dl_up(%s) FAILED\n", ill->ill_name)); 14716 14717 freemsg(bind_mp); 14718 freemsg(unbind_mp); 14719 return (ENOMEM); 14720 } 14721 14722 /* Add room for tcp+ip headers */ 14723 uint_t ip_loopback_mtuplus = IP_LOOPBACK_MTU + IP_SIMPLE_HDR_LENGTH + 20; 14724 14725 /* 14726 * DLPI and ARP is up. 14727 * Create all the IREs associated with an interface. Bring up multicast. 14728 * Set the interface flag and finish other initialization 14729 * that potentially had to be deferred to after DL_BIND_ACK. 14730 */ 14731 int 14732 ipif_up_done(ipif_t *ipif) 14733 { 14734 ill_t *ill = ipif->ipif_ill; 14735 int err = 0; 14736 boolean_t loopback = B_FALSE; 14737 boolean_t update_src_selection = B_TRUE; 14738 ipif_t *tmp_ipif; 14739 14740 ip1dbg(("ipif_up_done(%s:%u)\n", 14741 ipif->ipif_ill->ill_name, ipif->ipif_id)); 14742 DTRACE_PROBE3(ipif__downup, char *, "ipif_up_done", 14743 ill_t *, ill, ipif_t *, ipif); 14744 14745 /* Check if this is a loopback interface */ 14746 if (ipif->ipif_ill->ill_wq == NULL) 14747 loopback = B_TRUE; 14748 14749 ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); 14750 14751 /* 14752 * If all other interfaces for this ill are down or DEPRECATED, 14753 * or otherwise unsuitable for source address selection, 14754 * reset the src generation numbers to make sure source 14755 * address selection gets to take this new ipif into account. 
14756 * No need to hold ill_lock while traversing the ipif list since 14757 * we are writer 14758 */ 14759 for (tmp_ipif = ill->ill_ipif; tmp_ipif; 14760 tmp_ipif = tmp_ipif->ipif_next) { 14761 if (((tmp_ipif->ipif_flags & 14762 (IPIF_NOXMIT|IPIF_ANYCAST|IPIF_NOLOCAL|IPIF_DEPRECATED)) || 14763 !(tmp_ipif->ipif_flags & IPIF_UP)) || 14764 (tmp_ipif == ipif)) 14765 continue; 14766 /* first useable pre-existing interface */ 14767 update_src_selection = B_FALSE; 14768 break; 14769 } 14770 if (update_src_selection) 14771 ip_update_source_selection(ill->ill_ipst); 14772 14773 if (IS_LOOPBACK(ill) || ill->ill_net_type == IRE_IF_NORESOLVER) { 14774 nce_t *loop_nce = NULL; 14775 uint16_t flags = (NCE_F_MYADDR | NCE_F_AUTHORITY | NCE_F_NONUD); 14776 14777 /* 14778 * lo0:1 and subsequent ipifs were marked IRE_LOCAL in 14779 * ipif_lookup_on_name(), but in the case of zones we can have 14780 * several loopback addresses on lo0. So all the interfaces with 14781 * loopback addresses need to be marked IRE_LOOPBACK. 14782 */ 14783 if (V4_PART_OF_V6(ipif->ipif_v6lcl_addr) == 14784 htonl(INADDR_LOOPBACK)) 14785 ipif->ipif_ire_type = IRE_LOOPBACK; 14786 else 14787 ipif->ipif_ire_type = IRE_LOCAL; 14788 if (ill->ill_net_type != IRE_LOOPBACK) 14789 flags |= NCE_F_PUBLISH; 14790 14791 /* add unicast nce for the local addr */ 14792 err = nce_lookup_then_add_v4(ill, NULL, 14793 ill->ill_phys_addr_length, &ipif->ipif_lcl_addr, flags, 14794 ND_REACHABLE, &loop_nce); 14795 /* A shared-IP zone sees EEXIST for lo0:N */ 14796 if (err == 0 || err == EEXIST) { 14797 ipif->ipif_added_nce = 1; 14798 loop_nce->nce_ipif_cnt++; 14799 nce_refrele(loop_nce); 14800 err = 0; 14801 } else { 14802 ASSERT(loop_nce == NULL); 14803 return (err); 14804 } 14805 } 14806 14807 /* Create all the IREs associated with this interface */ 14808 err = ipif_add_ires_v4(ipif, loopback); 14809 if (err != 0) { 14810 /* 14811 * see comments about return value from 14812 * ip_addr_availability_check() in ipif_add_ires_v4(). 14813 */ 14814 if (err != EADDRINUSE) { 14815 (void) ipif_arp_down(ipif); 14816 } else { 14817 /* 14818 * Make IPMP aware of the deleted ipif so that 14819 * the needed ipmp cleanup (e.g., of ipif_bound_ill) 14820 * can be completed. Note that we do not want to 14821 * destroy the nce that was created on the ipmp_ill 14822 * for the active copy of the duplicate address in 14823 * use. 14824 */ 14825 if (IS_IPMP(ill)) 14826 ipmp_illgrp_del_ipif(ill->ill_grp, ipif); 14827 err = EADDRNOTAVAIL; 14828 } 14829 return (err); 14830 } 14831 14832 if (ill->ill_ipif_up_count == 1 && !loopback) { 14833 /* Recover any additional IREs entries for this ill */ 14834 (void) ill_recover_saved_ire(ill); 14835 } 14836 14837 if (ill->ill_need_recover_multicast) { 14838 /* 14839 * Need to recover all multicast memberships in the driver. 14840 * This had to be deferred until we had attached. The same 14841 * code exists in ipif_up_done_v6() to recover IPv6 14842 * memberships. 14843 * 14844 * Note that it would be preferable to unconditionally do the 14845 * ill_recover_multicast() in ill_dl_up(), but we cannot do 14846 * that since ill_join_allmulti() depends on ill_dl_up being 14847 * set, and it is not set until we receive a DL_BIND_ACK after 14848 * having called ill_dl_up(). 14849 */ 14850 ill_recover_multicast(ill); 14851 } 14852 14853 if (ill->ill_ipif_up_count == 1) { 14854 /* 14855 * Since the interface is now up, it may now be active. 
14856 */ 14857 if (IS_UNDER_IPMP(ill)) 14858 ipmp_ill_refresh_active(ill); 14859 14860 /* 14861 * If this is an IPMP interface, we may now be able to 14862 * establish ARP entries. 14863 */ 14864 if (IS_IPMP(ill)) 14865 ipmp_illgrp_refresh_arpent(ill->ill_grp); 14866 } 14867 14868 /* Join the allhosts multicast address */ 14869 ipif_multicast_up(ipif); 14870 14871 if (!loopback && !update_src_selection && 14872 !(ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) 14873 ip_update_source_selection(ill->ill_ipst); 14874 14875 if (!loopback && ipif->ipif_addr_ready) { 14876 /* Broadcast an address mask reply. */ 14877 ipif_mask_reply(ipif); 14878 } 14879 /* Perhaps ilgs should use this ill */ 14880 update_conn_ill(NULL, ill->ill_ipst); 14881 14882 /* 14883 * This had to be deferred until we had bound. Tell routing sockets and 14884 * others that this interface is up if it looks like the address has 14885 * been validated. Otherwise, if it isn't ready yet, wait for 14886 * duplicate address detection to do its thing. 14887 */ 14888 if (ipif->ipif_addr_ready) 14889 ipif_up_notify(ipif); 14890 return (0); 14891 } 14892 14893 /* 14894 * Add the IREs associated with the ipif. 14895 * Those MUST be explicitly removed in ipif_delete_ires_v4. 14896 */ 14897 static int 14898 ipif_add_ires_v4(ipif_t *ipif, boolean_t loopback) 14899 { 14900 ill_t *ill = ipif->ipif_ill; 14901 ip_stack_t *ipst = ill->ill_ipst; 14902 ire_t *ire_array[20]; 14903 ire_t **irep = ire_array; 14904 ire_t **irep1; 14905 ipaddr_t net_mask = 0; 14906 ipaddr_t subnet_mask, route_mask; 14907 int err; 14908 ire_t *ire_local = NULL; /* LOCAL or LOOPBACK */ 14909 ire_t *ire_if = NULL; 14910 uchar_t *gw; 14911 14912 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 14913 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 14914 /* 14915 * If we're on a labeled system then make sure that zone- 14916 * private addresses have proper remote host database entries. 14917 */ 14918 if (is_system_labeled() && 14919 ipif->ipif_ire_type != IRE_LOOPBACK && 14920 !tsol_check_interface_address(ipif)) 14921 return (EINVAL); 14922 14923 /* Register the source address for __sin6_src_id */ 14924 err = ip_srcid_insert(&ipif->ipif_v6lcl_addr, 14925 ipif->ipif_zoneid, ipst); 14926 if (err != 0) { 14927 ip0dbg(("ipif_add_ires: srcid_insert %d\n", err)); 14928 return (err); 14929 } 14930 14931 if (loopback) 14932 gw = (uchar_t *)&ipif->ipif_lcl_addr; 14933 else 14934 gw = NULL; 14935 14936 /* If the interface address is set, create the local IRE. */ 14937 ire_local = ire_create( 14938 (uchar_t *)&ipif->ipif_lcl_addr, /* dest address */ 14939 (uchar_t *)&ip_g_all_ones, /* mask */ 14940 gw, /* gateway */ 14941 ipif->ipif_ire_type, /* LOCAL or LOOPBACK */ 14942 ipif->ipif_ill, 14943 ipif->ipif_zoneid, 14944 ((ipif->ipif_flags & IPIF_PRIVATE) ? 
14945 RTF_PRIVATE : 0) | RTF_KERNEL, 14946 NULL, 14947 ipst); 14948 ip1dbg(("ipif_add_ires: 0x%p creating IRE %p type 0x%x" 14949 " for 0x%x\n", (void *)ipif, (void *)ire_local, 14950 ipif->ipif_ire_type, 14951 ntohl(ipif->ipif_lcl_addr))); 14952 if (ire_local == NULL) { 14953 ip1dbg(("ipif_up_done: NULL ire_local\n")); 14954 err = ENOMEM; 14955 goto bad; 14956 } 14957 } else { 14958 ip1dbg(( 14959 "ipif_add_ires: not creating IRE %d for 0x%x: flags 0x%x\n", 14960 ipif->ipif_ire_type, 14961 ntohl(ipif->ipif_lcl_addr), 14962 (uint_t)ipif->ipif_flags)); 14963 } 14964 if ((ipif->ipif_lcl_addr != INADDR_ANY) && 14965 !(ipif->ipif_flags & IPIF_NOLOCAL)) { 14966 net_mask = ip_net_mask(ipif->ipif_lcl_addr); 14967 } else { 14968 net_mask = htonl(IN_CLASSA_NET); /* fallback */ 14969 } 14970 14971 subnet_mask = ipif->ipif_net_mask; 14972 14973 /* 14974 * If mask was not specified, use natural netmask of 14975 * interface address. Also, store this mask back into the 14976 * ipif struct. 14977 */ 14978 if (subnet_mask == 0) { 14979 subnet_mask = net_mask; 14980 V4MASK_TO_V6(subnet_mask, ipif->ipif_v6net_mask); 14981 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, 14982 ipif->ipif_v6subnet); 14983 } 14984 14985 /* Set up the IRE_IF_RESOLVER or IRE_IF_NORESOLVER, as appropriate. */ 14986 if (!loopback && !(ipif->ipif_flags & IPIF_NOXMIT) && 14987 ipif->ipif_subnet != INADDR_ANY) { 14988 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ 14989 14990 if (ipif->ipif_flags & IPIF_POINTOPOINT) { 14991 route_mask = IP_HOST_MASK; 14992 } else { 14993 route_mask = subnet_mask; 14994 } 14995 14996 ip1dbg(("ipif_add_ires: ipif 0x%p ill 0x%p " 14997 "creating if IRE ill_net_type 0x%x for 0x%x\n", 14998 (void *)ipif, (void *)ill, ill->ill_net_type, 14999 ntohl(ipif->ipif_subnet))); 15000 ire_if = ire_create( 15001 (uchar_t *)&ipif->ipif_subnet, 15002 (uchar_t *)&route_mask, 15003 (uchar_t *)&ipif->ipif_lcl_addr, 15004 ill->ill_net_type, 15005 ill, 15006 ipif->ipif_zoneid, 15007 ((ipif->ipif_flags & IPIF_PRIVATE) ? 15008 RTF_PRIVATE: 0) | RTF_KERNEL, 15009 NULL, 15010 ipst); 15011 if (ire_if == NULL) { 15012 ip1dbg(("ipif_up_done: NULL ire_if\n")); 15013 err = ENOMEM; 15014 goto bad; 15015 } 15016 } 15017 15018 /* 15019 * Create any necessary broadcast IREs. 15020 */ 15021 if ((ipif->ipif_flags & IPIF_BROADCAST) && 15022 !(ipif->ipif_flags & IPIF_NOXMIT)) 15023 irep = ipif_create_bcast_ires(ipif, irep); 15024 15025 /* If an earlier ire_create failed, get out now */ 15026 for (irep1 = irep; irep1 > ire_array; ) { 15027 irep1--; 15028 if (*irep1 == NULL) { 15029 ip1dbg(("ipif_up_done: NULL ire found in ire_array\n")); 15030 err = ENOMEM; 15031 goto bad; 15032 } 15033 } 15034 15035 /* 15036 * Need to atomically check for IP address availability under 15037 * ip_addr_avail_lock. ill_g_lock is held as reader to ensure no new 15038 * ills or new ipifs can be added while we are checking availability. 15039 */ 15040 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 15041 mutex_enter(&ipst->ips_ip_addr_avail_lock); 15042 /* Mark it up, and increment counters. */ 15043 ipif->ipif_flags |= IPIF_UP; 15044 ill->ill_ipif_up_count++; 15045 err = ip_addr_availability_check(ipif); 15046 mutex_exit(&ipst->ips_ip_addr_avail_lock); 15047 rw_exit(&ipst->ips_ill_g_lock); 15048 15049 if (err != 0) { 15050 /* 15051 * Our address may already be up on the same ill. In this case, 15052 * the ARP entry for our ipif replaced the one for the other 15053 * ipif. 
So we don't want to delete it (otherwise the other ipif 15054 * would be unable to send packets). 15055 * ip_addr_availability_check() identifies this case for us and 15056 * returns EADDRINUSE; Caller should turn it into EADDRNOTAVAIL 15057 * which is the expected error code. 15058 */ 15059 ill->ill_ipif_up_count--; 15060 ipif->ipif_flags &= ~IPIF_UP; 15061 goto bad; 15062 } 15063 15064 /* 15065 * Add in all newly created IREs. ire_create_bcast() has 15066 * already checked for duplicates of the IRE_BROADCAST type. 15067 * We add the IRE_INTERFACE before the IRE_LOCAL to ensure 15068 * that lookups find the IRE_LOCAL even if the IRE_INTERFACE is 15069 * a /32 route. 15070 */ 15071 if (ire_if != NULL) { 15072 ire_if = ire_add(ire_if); 15073 if (ire_if == NULL) { 15074 err = ENOMEM; 15075 goto bad2; 15076 } 15077 #ifdef DEBUG 15078 ire_refhold_notr(ire_if); 15079 ire_refrele(ire_if); 15080 #endif 15081 } 15082 if (ire_local != NULL) { 15083 ire_local = ire_add(ire_local); 15084 if (ire_local == NULL) { 15085 err = ENOMEM; 15086 goto bad2; 15087 } 15088 #ifdef DEBUG 15089 ire_refhold_notr(ire_local); 15090 ire_refrele(ire_local); 15091 #endif 15092 } 15093 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 15094 if (ire_local != NULL) 15095 ipif->ipif_ire_local = ire_local; 15096 if (ire_if != NULL) 15097 ipif->ipif_ire_if = ire_if; 15098 rw_exit(&ipst->ips_ill_g_lock); 15099 ire_local = NULL; 15100 ire_if = NULL; 15101 15102 /* 15103 * We first add all of them, and if that succeeds we refrele the 15104 * bunch. That enables us to delete all of them should any of the 15105 * ire_adds fail. 15106 */ 15107 for (irep1 = irep; irep1 > ire_array; ) { 15108 irep1--; 15109 ASSERT(!MUTEX_HELD(&((*irep1)->ire_ill->ill_lock))); 15110 *irep1 = ire_add(*irep1); 15111 if (*irep1 == NULL) { 15112 err = ENOMEM; 15113 goto bad2; 15114 } 15115 } 15116 15117 for (irep1 = irep; irep1 > ire_array; ) { 15118 irep1--; 15119 /* refheld by ire_add. */ 15120 if (*irep1 != NULL) { 15121 ire_refrele(*irep1); 15122 *irep1 = NULL; 15123 } 15124 } 15125 15126 if (!loopback) { 15127 /* 15128 * If the broadcast address has been set, make sure it makes 15129 * sense based on the interface address. 15130 * Only match on ill since we are sharing broadcast addresses. 15131 */ 15132 if ((ipif->ipif_brd_addr != INADDR_ANY) && 15133 (ipif->ipif_flags & IPIF_BROADCAST)) { 15134 ire_t *ire; 15135 15136 ire = ire_ftable_lookup_v4(ipif->ipif_brd_addr, 0, 0, 15137 IRE_BROADCAST, ipif->ipif_ill, ALL_ZONES, NULL, 15138 (MATCH_IRE_TYPE | MATCH_IRE_ILL), 0, ipst, NULL); 15139 15140 if (ire == NULL) { 15141 /* 15142 * If there isn't a matching broadcast IRE, 15143 * revert to the default for this netmask. 
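 * (Illustrative example, hypothetical addresses: if the ipif is
 * 192.168.1.10/24 but a stale broadcast address such as
 * 10.255.255.255 was configured, no IRE_BROADCAST matches it, so the
 * code below zeroes it and lets ipif_set_default() reinstate the
 * default for this netmask, 192.168.1.255.)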
15144 */
15145 ipif->ipif_v6brd_addr = ipv6_all_zeros;
15146 mutex_enter(&ipif->ipif_ill->ill_lock);
15147 ipif_set_default(ipif);
15148 mutex_exit(&ipif->ipif_ill->ill_lock);
15149 } else {
15150 ire_refrele(ire);
15151 }
15152 }
15153
15154 }
15155 return (0);
15156
15157 bad2:
15158 ill->ill_ipif_up_count--;
15159 ipif->ipif_flags &= ~IPIF_UP;
15160
15161 bad:
15162 ip1dbg(("ipif_add_ires: FAILED \n"));
15163 if (ire_local != NULL)
15164 ire_delete(ire_local);
15165 if (ire_if != NULL)
15166 ire_delete(ire_if);
15167
15168 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
15169 ire_local = ipif->ipif_ire_local;
15170 ipif->ipif_ire_local = NULL;
15171 ire_if = ipif->ipif_ire_if;
15172 ipif->ipif_ire_if = NULL;
15173 rw_exit(&ipst->ips_ill_g_lock);
15174 if (ire_local != NULL) {
15175 ire_delete(ire_local);
15176 ire_refrele_notr(ire_local);
15177 }
15178 if (ire_if != NULL) {
15179 ire_delete(ire_if);
15180 ire_refrele_notr(ire_if);
15181 }
15182
15183 while (irep > ire_array) {
15184 irep--;
15185 if (*irep != NULL) {
15186 ire_delete(*irep);
15187 }
15188 }
15189 (void) ip_srcid_remove(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid, ipst);
15190
15191 return (err);
15192 }
15193
15194 /* Remove all the IREs created by ipif_add_ires_v4 */
15195 void
15196 ipif_delete_ires_v4(ipif_t *ipif)
15197 {
15198 ill_t *ill = ipif->ipif_ill;
15199 ip_stack_t *ipst = ill->ill_ipst;
15200 ire_t *ire;
15201
15202 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
15203 ire = ipif->ipif_ire_local;
15204 ipif->ipif_ire_local = NULL;
15205 rw_exit(&ipst->ips_ill_g_lock);
15206 if (ire != NULL) {
15207 /*
15208 * Move count to ipif so we don't lose the count due to
15209 * a down/up dance.
15210 */
15211 atomic_add_32(&ipif->ipif_ib_pkt_count, ire->ire_ib_pkt_count);
15212
15213 ire_delete(ire);
15214 ire_refrele_notr(ire);
15215 }
15216 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
15217 ire = ipif->ipif_ire_if;
15218 ipif->ipif_ire_if = NULL;
15219 rw_exit(&ipst->ips_ill_g_lock);
15220 if (ire != NULL) {
15221 ire_delete(ire);
15222 ire_refrele_notr(ire);
15223 }
15224
15225 /*
15226 * Delete the broadcast IREs.
15227 */
15228 if ((ipif->ipif_flags & IPIF_BROADCAST) &&
15229 !(ipif->ipif_flags & IPIF_NOXMIT))
15230 ipif_delete_bcast_ires(ipif);
15231 }
15232
15233 /*
15234 * Checks for availability of a usable source address (if there is one) when
15235 * the destination ILL has the ill_usesrc_ifindex pointing to another ILL. Note
15236 * this selection is done regardless of the destination.
15237 */
15238 boolean_t
15239 ipif_zone_avail(uint_t ifindex, boolean_t isv6, zoneid_t zoneid,
15240 ip_stack_t *ipst)
15241 {
15242 ipif_t *ipif = NULL;
15243 ill_t *uill;
15244
15245 ASSERT(ifindex != 0);
15246
15247 uill = ill_lookup_on_ifindex(ifindex, isv6, ipst);
15248 if (uill == NULL)
15249 return (B_FALSE);
15250
15251 mutex_enter(&uill->ill_lock);
15252 for (ipif = uill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
15253 if (IPIF_IS_CONDEMNED(ipif))
15254 continue;
15255 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST))
15256 continue;
15257 if (!(ipif->ipif_flags & IPIF_UP))
15258 continue;
15259 if (ipif->ipif_zoneid != zoneid)
15260 continue;
15261 if (isv6 ?
IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) : 15262 ipif->ipif_lcl_addr == INADDR_ANY) 15263 continue; 15264 mutex_exit(&uill->ill_lock); 15265 ill_refrele(uill); 15266 return (B_TRUE); 15267 } 15268 mutex_exit(&uill->ill_lock); 15269 ill_refrele(uill); 15270 return (B_FALSE); 15271 } 15272 15273 /* 15274 * Find an ipif with a good local address on the ill+zoneid. 15275 */ 15276 ipif_t * 15277 ipif_good_addr(ill_t *ill, zoneid_t zoneid) 15278 { 15279 ipif_t *ipif; 15280 15281 mutex_enter(&ill->ill_lock); 15282 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 15283 if (IPIF_IS_CONDEMNED(ipif)) 15284 continue; 15285 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) 15286 continue; 15287 if (!(ipif->ipif_flags & IPIF_UP)) 15288 continue; 15289 if (ipif->ipif_zoneid != zoneid && 15290 ipif->ipif_zoneid != ALL_ZONES && zoneid != ALL_ZONES) 15291 continue; 15292 if (ill->ill_isv6 ? 15293 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) : 15294 ipif->ipif_lcl_addr == INADDR_ANY) 15295 continue; 15296 ipif_refhold_locked(ipif); 15297 mutex_exit(&ill->ill_lock); 15298 return (ipif); 15299 } 15300 mutex_exit(&ill->ill_lock); 15301 return (NULL); 15302 } 15303 15304 /* 15305 * IP source address type, sorted from worst to best. For a given type, 15306 * always prefer IP addresses on the same subnet. All-zones addresses are 15307 * suboptimal because they pose problems with unlabeled destinations. 15308 */ 15309 typedef enum { 15310 IPIF_NONE, 15311 IPIF_DIFFNET_DEPRECATED, /* deprecated and different subnet */ 15312 IPIF_SAMENET_DEPRECATED, /* deprecated and same subnet */ 15313 IPIF_DIFFNET_ALLZONES, /* allzones and different subnet */ 15314 IPIF_SAMENET_ALLZONES, /* allzones and same subnet */ 15315 IPIF_DIFFNET, /* normal and different subnet */ 15316 IPIF_SAMENET, /* normal and same subnet */ 15317 IPIF_LOCALADDR /* local loopback */ 15318 } ipif_type_t; 15319 15320 /* 15321 * Pick the optimal ipif on `ill' for sending to destination `dst' from zone 15322 * `zoneid'. We rate usable ipifs from low -> high as per the ipif_type_t 15323 * enumeration, and return the highest-rated ipif. If there's a tie, we pick 15324 * the first one, unless IPMP is used in which case we round-robin among them; 15325 * see below for more. 15326 * 15327 * Returns NULL if there is no suitable source address for the ill. 15328 * This only occurs when there is no valid source address for the ill. 15329 */ 15330 ipif_t * 15331 ipif_select_source_v4(ill_t *ill, ipaddr_t dst, zoneid_t zoneid, 15332 boolean_t allow_usesrc, boolean_t *notreadyp) 15333 { 15334 ill_t *usill = NULL; 15335 ill_t *ipmp_ill = NULL; 15336 ipif_t *start_ipif, *next_ipif, *ipif, *best_ipif; 15337 ipif_type_t type, best_type; 15338 tsol_tpc_t *src_rhtp, *dst_rhtp; 15339 ip_stack_t *ipst = ill->ill_ipst; 15340 boolean_t samenet; 15341 15342 if (ill->ill_usesrc_ifindex != 0 && allow_usesrc) { 15343 usill = ill_lookup_on_ifindex(ill->ill_usesrc_ifindex, 15344 B_FALSE, ipst); 15345 if (usill != NULL) 15346 ill = usill; /* Select source from usesrc ILL */ 15347 else 15348 return (NULL); 15349 } 15350 15351 /* 15352 * Test addresses should never be used for source address selection, 15353 * so if we were passed one, switch to the IPMP meta-interface. 
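 * (Sketch of the flow: test addresses live on the underlying ills
 * while data addresses live on the IPMP meta-interface, so we hold
 * the group's ipmp_ill below and select among its addresses instead.)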
15354 */ 15355 if (IS_UNDER_IPMP(ill)) { 15356 if ((ipmp_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) 15357 ill = ipmp_ill; /* Select source from IPMP ill */ 15358 else 15359 return (NULL); 15360 } 15361 15362 /* 15363 * If we're dealing with an unlabeled destination on a labeled system, 15364 * make sure that we ignore source addresses that are incompatible with 15365 * the destination's default label. That destination's default label 15366 * must dominate the minimum label on the source address. 15367 */ 15368 dst_rhtp = NULL; 15369 if (is_system_labeled()) { 15370 dst_rhtp = find_tpc(&dst, IPV4_VERSION, B_FALSE); 15371 if (dst_rhtp == NULL) 15372 return (NULL); 15373 if (dst_rhtp->tpc_tp.host_type != UNLABELED) { 15374 TPC_RELE(dst_rhtp); 15375 dst_rhtp = NULL; 15376 } 15377 } 15378 15379 /* 15380 * Hold the ill_g_lock as reader. This makes sure that no ipif/ill 15381 * can be deleted. But an ipif/ill can get CONDEMNED any time. 15382 * After selecting the right ipif, under ill_lock make sure ipif is 15383 * not condemned, and increment refcnt. If ipif is CONDEMNED, 15384 * we retry. Inside the loop we still need to check for CONDEMNED, 15385 * but not under a lock. 15386 */ 15387 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 15388 retry: 15389 /* 15390 * For source address selection, we treat the ipif list as circular 15391 * and continue until we get back to where we started. This allows 15392 * IPMP to vary source address selection (which improves inbound load 15393 * spreading) by caching its last ending point and starting from 15394 * there. NOTE: we don't have to worry about ill_src_ipif changing 15395 * ills since that can't happen on the IPMP ill. 15396 */ 15397 start_ipif = ill->ill_ipif; 15398 if (IS_IPMP(ill) && ill->ill_src_ipif != NULL) 15399 start_ipif = ill->ill_src_ipif; 15400 15401 ipif = start_ipif; 15402 best_ipif = NULL; 15403 best_type = IPIF_NONE; 15404 do { 15405 if ((next_ipif = ipif->ipif_next) == NULL) 15406 next_ipif = ill->ill_ipif; 15407 15408 if (IPIF_IS_CONDEMNED(ipif)) 15409 continue; 15410 /* Always skip NOLOCAL and ANYCAST interfaces */ 15411 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) 15412 continue; 15413 /* Always skip NOACCEPT interfaces */ 15414 if (ipif->ipif_ill->ill_flags & ILLF_NOACCEPT) 15415 continue; 15416 if (!(ipif->ipif_flags & IPIF_UP)) 15417 continue; 15418 15419 if (!ipif->ipif_addr_ready) { 15420 if (notreadyp != NULL) 15421 *notreadyp = B_TRUE; 15422 continue; 15423 } 15424 15425 if (zoneid != ALL_ZONES && 15426 ipif->ipif_zoneid != zoneid && 15427 ipif->ipif_zoneid != ALL_ZONES) 15428 continue; 15429 15430 /* 15431 * Interfaces with 0.0.0.0 address are allowed to be UP, but 15432 * are not valid as source addresses. 15433 */ 15434 if (ipif->ipif_lcl_addr == INADDR_ANY) 15435 continue; 15436 15437 /* 15438 * Check compatibility of local address for destination's 15439 * default label if we're on a labeled system. Incompatible 15440 * addresses can't be used at all. 
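 * (As implemented below: the source is usable only if its template is
 * SUN_CIPSO with the same DOI as the destination's, and the
 * destination's default label falls within the source's CIPSO label
 * range or label set.)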
15441 */
15442 if (dst_rhtp != NULL) {
15443 boolean_t incompat;
15444
15445 src_rhtp = find_tpc(&ipif->ipif_lcl_addr,
15446 IPV4_VERSION, B_FALSE);
15447 if (src_rhtp == NULL)
15448 continue;
15449 incompat = src_rhtp->tpc_tp.host_type != SUN_CIPSO ||
15450 src_rhtp->tpc_tp.tp_doi !=
15451 dst_rhtp->tpc_tp.tp_doi ||
15452 (!_blinrange(&dst_rhtp->tpc_tp.tp_def_label,
15453 &src_rhtp->tpc_tp.tp_sl_range_cipso) &&
15454 !blinlset(&dst_rhtp->tpc_tp.tp_def_label,
15455 src_rhtp->tpc_tp.tp_sl_set_cipso));
15456 TPC_RELE(src_rhtp);
15457 if (incompat)
15458 continue;
15459 }
15460
15461 samenet = ((ipif->ipif_net_mask & dst) == ipif->ipif_subnet);
15462
15463 if (ipif->ipif_lcl_addr == dst) {
15464 type = IPIF_LOCALADDR;
15465 } else if (ipif->ipif_flags & IPIF_DEPRECATED) {
15466 type = samenet ? IPIF_SAMENET_DEPRECATED :
15467 IPIF_DIFFNET_DEPRECATED;
15468 } else if (ipif->ipif_zoneid == ALL_ZONES) {
15469 type = samenet ? IPIF_SAMENET_ALLZONES :
15470 IPIF_DIFFNET_ALLZONES;
15471 } else {
15472 type = samenet ? IPIF_SAMENET : IPIF_DIFFNET;
15473 }
15474
15475 if (type > best_type) {
15476 best_type = type;
15477 best_ipif = ipif;
15478 if (best_type == IPIF_LOCALADDR)
15479 break; /* can't get better */
15480 }
15481 } while ((ipif = next_ipif) != start_ipif);
15482
15483 if ((ipif = best_ipif) != NULL) {
15484 mutex_enter(&ipif->ipif_ill->ill_lock);
15485 if (IPIF_IS_CONDEMNED(ipif)) {
15486 mutex_exit(&ipif->ipif_ill->ill_lock);
15487 goto retry;
15488 }
15489 ipif_refhold_locked(ipif);
15490
15491 /*
15492 * For IPMP, update the source ipif rotor to the next ipif,
15493 * provided we can look it up. (We must not use it if it's
15494 * IPIF_CONDEMNED since we may have grabbed ill_g_lock after
15495 * ipif_free() checked ill_src_ipif.)
15496 */
15497 if (IS_IPMP(ill) && ipif != NULL) {
15498 next_ipif = ipif->ipif_next;
15499 if (next_ipif != NULL && !IPIF_IS_CONDEMNED(next_ipif))
15500 ill->ill_src_ipif = next_ipif;
15501 else
15502 ill->ill_src_ipif = NULL;
15503 }
15504 mutex_exit(&ipif->ipif_ill->ill_lock);
15505 }
15506
15507 rw_exit(&ipst->ips_ill_g_lock);
15508 if (usill != NULL)
15509 ill_refrele(usill);
15510 if (ipmp_ill != NULL)
15511 ill_refrele(ipmp_ill);
15512 if (dst_rhtp != NULL)
15513 TPC_RELE(dst_rhtp);
15514
15515 #ifdef DEBUG
15516 if (ipif == NULL) {
15517 char buf1[INET6_ADDRSTRLEN];
15518
15519 ip1dbg(("ipif_select_source_v4(%s, %s) -> NULL\n",
15520 ill->ill_name,
15521 inet_ntop(AF_INET, &dst, buf1, sizeof (buf1))));
15522 } else {
15523 char buf1[INET6_ADDRSTRLEN];
15524 char buf2[INET6_ADDRSTRLEN];
15525
15526 ip1dbg(("ipif_select_source_v4(%s, %s) -> %s\n",
15527 ipif->ipif_ill->ill_name,
15528 inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)),
15529 inet_ntop(AF_INET, &ipif->ipif_lcl_addr,
15530 buf2, sizeof (buf2))));
15531 }
15532 #endif /* DEBUG */
15533 return (ipif);
15534 }
15535
15536 /*
15537 * Pick a source address based on the destination ill and an optional setsrc
15538 * address.
15539 * The result is stored in srcp. If generation is set, then put the source
15540 * generation number there before we look for the source address (to avoid
15541 * missing changes in the set of source addresses).
15542 * If flagsp is set, then use it to pass back ipif_flags.
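 *
 * A minimal usage sketch (hypothetical caller, error handling elided):
 *
 *	ipaddr_t src;
 *	uint32_t gen;
 *
 *	if (ip_select_source_v4(ill, INADDR_ANY, dst, INADDR_ANY,
 *	    zoneid, ipst, &src, &gen, NULL) == 0)
 *		use src, revalidating once gen != ips_src_generation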
15543 *
15544 * If the caller wants to cache the returned source address and detect when
15545 * that might be stale, the caller should pass in a generation argument,
15546 * which the caller can later compare against ips_src_generation.
15547 *
15548 * The precedence order for selecting an IPv4 source address is:
15549 * - RTF_SETSRC on the offlink ire always wins.
15550 * - If usesrc is set, swap the ill to be the usesrc one.
15551 * - If IPMP is used on the ill, select a random address from the most
15552 * preferred ones below:
15553 * 1. If onlink destination, same subnet and not deprecated, not ALL_ZONES
15554 * 2. Not deprecated, not ALL_ZONES
15555 * 3. If onlink destination, same subnet and not deprecated, ALL_ZONES
15556 * 4. Not deprecated, ALL_ZONES
15557 * 5. If onlink destination, same subnet and deprecated
15558 * 6. Deprecated.
15559 *
15560 * We have lower preference for ALL_ZONES IP addresses,
15561 * as they pose problems with unlabeled destinations.
15562 *
15563 * Note that when multiple IP addresses match, e.g., #1, we pick
15564 * the first one if IPMP is not in use. With IPMP we randomize.
15565 */
15566 int
15567 ip_select_source_v4(ill_t *ill, ipaddr_t setsrc, ipaddr_t dst,
15568 ipaddr_t multicast_ifaddr,
15569 zoneid_t zoneid, ip_stack_t *ipst, ipaddr_t *srcp,
15570 uint32_t *generation, uint64_t *flagsp)
15571 {
15572 ipif_t *ipif;
15573 boolean_t notready = B_FALSE; /* Set if !ipif_addr_ready found */
15574
15575 if (flagsp != NULL)
15576 *flagsp = 0;
15577
15578 /*
15579 * Need to grab the generation number before we check to
15580 * avoid a race with a change to the set of local addresses.
15581 * No lock needed since the thread which updates the set of local
15582 * addresses uses ipif/ill locks and exits those (hence a store memory
15583 * barrier) before doing the atomic increase of ips_src_generation.
15584 */
15585 if (generation != NULL) {
15586 *generation = ipst->ips_src_generation;
15587 }
15588
15589 if (CLASSD(dst) && multicast_ifaddr != INADDR_ANY) {
15590 *srcp = multicast_ifaddr;
15591 return (0);
15592 }
15593
15594 /* Was RTF_SETSRC set on the first IRE in the recursive lookup? */
15595 if (setsrc != INADDR_ANY) {
15596 *srcp = setsrc;
15597 return (0);
15598 }
15599 ipif = ipif_select_source_v4(ill, dst, zoneid, B_TRUE, &notready);
15600 if (ipif == NULL) {
15601 if (notready)
15602 return (ENETDOWN);
15603 else
15604 return (EADDRNOTAVAIL);
15605 }
15606 *srcp = ipif->ipif_lcl_addr;
15607 if (flagsp != NULL)
15608 *flagsp = ipif->ipif_flags;
15609 ipif_refrele(ipif);
15610 return (0);
15611 }
15612
15613 /* ARGSUSED */
15614 int
15615 if_unitsel_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
15616 ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
15617 {
15618 /*
15619 * ill_phyint_reinit merged the v4 and v6 into a single
15620 * ipsq. We might not have been able to complete the
15621 * operation in ipif_set_values, if we could not become
15622 * exclusive. If so restart it here.
15623 */
15624 return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q));
15625 }
15626
15627 /*
15628 * Can operate on either a module or a driver queue.
15629 * Returns an error if not a module queue.
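 * (A "module queue" means IP has been pushed as a STREAMS module over
 * the device stream, so q->q_next is non-NULL; the q_next == NULL
 * driver-queue case is rejected below with EINVAL.)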
15630 */
15631 /* ARGSUSED */
15632 int
15633 if_unitsel(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
15634 ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
15635 {
15636 queue_t *q1 = q;
15637 char *cp;
15638 char interf_name[LIFNAMSIZ];
15639 uint_t ppa = *(uint_t *)mp->b_cont->b_cont->b_rptr;
15640
15641 if (q->q_next == NULL) {
15642 ip1dbg((
15643 "if_unitsel: IF_UNITSEL: no q_next\n"));
15644 return (EINVAL);
15645 }
15646
15647 if (((ill_t *)(q->q_ptr))->ill_name[0] != '\0')
15648 return (EALREADY);
15649
15650 do {
15651 q1 = q1->q_next;
15652 } while (q1->q_next);
15653 cp = q1->q_qinfo->qi_minfo->mi_idname;
15654 (void) sprintf(interf_name, "%s%d", cp, ppa);
15655
15656 /*
15657 * Here we are not going to delay the ioack until after
15658 * ACKs from DL_ATTACH_REQ/DL_BIND_REQ. So no need to save the
15659 * original ioctl message before sending the requests.
15660 */
15661 return (ipif_set_values(q, mp, interf_name, &ppa));
15662 }
15663
15664 /* ARGSUSED */
15665 int
15666 ip_sioctl_sifname(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
15667 ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
15668 {
15669 return (ENXIO);
15670 }
15671
15672 /*
15673 * Create any IRE_BROADCAST entries for `ipif', and store those entries in
15674 * `irep'. Returns a pointer to the next free `irep' entry.
15675 * A mirror exists in ipif_delete_bcast_ires().
15676 *
15677 * The management of any "extra" or seemingly duplicate IRE_BROADCASTs is
15678 * done in ire_add.
15679 */
15680 static ire_t **
15681 ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep)
15682 {
15683 ipaddr_t addr;
15684 ipaddr_t netmask = ip_net_mask(ipif->ipif_lcl_addr);
15685 ipaddr_t subnetmask = ipif->ipif_net_mask;
15686 ill_t *ill = ipif->ipif_ill;
15687 zoneid_t zoneid = ipif->ipif_zoneid;
15688
15689 ip1dbg(("ipif_create_bcast_ires: creating broadcast IREs\n"));
15690
15691 ASSERT(ipif->ipif_flags & IPIF_BROADCAST);
15692 ASSERT(!(ipif->ipif_flags & IPIF_NOXMIT));
15693
15694 if (ipif->ipif_lcl_addr == INADDR_ANY ||
15695 (ipif->ipif_flags & IPIF_NOLOCAL))
15696 netmask = htonl(IN_CLASSA_NET); /* fallback */
15697
15698 irep = ire_create_bcast(ill, 0, zoneid, irep);
15699 irep = ire_create_bcast(ill, INADDR_BROADCAST, zoneid, irep);
15700
15701 /*
15702 * For backward compatibility, we create net broadcast IREs based on
15703 * the old "IP address class system", since some old machines only
15704 * respond to these class-derived net broadcasts. However, we must not
15705 * create these net broadcast IREs if the subnetmask is shorter than
15706 * the IP address class based derived netmask. Otherwise, we may
15707 * create a net broadcast address which is the same as an IP address
15708 * on the subnet -- and then TCP will refuse to talk to that address.
15709 */
15710 if (netmask < subnetmask) {
15711 addr = netmask & ipif->ipif_subnet;
15712 irep = ire_create_bcast(ill, addr, zoneid, irep);
15713 irep = ire_create_bcast(ill, ~netmask | addr, zoneid, irep);
15714 }
15715
15716 /*
15717 * Don't create IRE_BROADCAST IREs for the interface if the subnetmask
15718 * is 0xFFFFFFFF, as an IRE_LOCAL for that interface is already
15719 * created. Creating these broadcast IREs will only create confusion
15720 * as `addr' will be the same as the IP address.
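 *
 * (Worked example, hypothetical addresses: for 192.168.1.10/24 this
 * function creates IRE_BROADCASTs for 0.0.0.0, 255.255.255.255,
 * 192.168.1.0 and 192.168.1.255; for a subnetted class A address such
 * as 10.1.2.3/24 it additionally creates the class-derived pair
 * 10.0.0.0 and 10.255.255.255, since netmask < subnetmask.)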
15721 */
15722 if (subnetmask != 0xFFFFFFFF) {
15723 addr = ipif->ipif_subnet;
15724 irep = ire_create_bcast(ill, addr, zoneid, irep);
15725 irep = ire_create_bcast(ill, ~subnetmask | addr, zoneid, irep);
15726 }
15727
15728 return (irep);
15729 }
15730
15731 /*
15732 * Mirror of ipif_create_bcast_ires()
15733 */
15734 static void
15735 ipif_delete_bcast_ires(ipif_t *ipif)
15736 {
15737 ipaddr_t addr;
15738 ipaddr_t netmask = ip_net_mask(ipif->ipif_lcl_addr);
15739 ipaddr_t subnetmask = ipif->ipif_net_mask;
15740 ill_t *ill = ipif->ipif_ill;
15741 zoneid_t zoneid = ipif->ipif_zoneid;
15742 ire_t *ire;
15743
15744 ASSERT(ipif->ipif_flags & IPIF_BROADCAST);
15745 ASSERT(!(ipif->ipif_flags & IPIF_NOXMIT));
15746
15747 if (ipif->ipif_lcl_addr == INADDR_ANY ||
15748 (ipif->ipif_flags & IPIF_NOLOCAL))
15749 netmask = htonl(IN_CLASSA_NET); /* fallback */
15750
15751 ire = ire_lookup_bcast(ill, 0, zoneid);
15752 ASSERT(ire != NULL);
15753 ire_delete(ire); ire_refrele(ire);
15754 ire = ire_lookup_bcast(ill, INADDR_BROADCAST, zoneid);
15755 ASSERT(ire != NULL);
15756 ire_delete(ire); ire_refrele(ire);
15757
15758 /*
15759 * For backward compatibility, we create net broadcast IREs based on
15760 * the old "IP address class system", since some old machines only
15761 * respond to these class-derived net broadcasts. However, we must not
15762 * create these net broadcast IREs if the subnetmask is shorter than
15763 * the IP address class based derived netmask. Otherwise, we may
15764 * create a net broadcast address which is the same as an IP address
15765 * on the subnet -- and then TCP will refuse to talk to that address.
15766 */
15767 if (netmask < subnetmask) {
15768 addr = netmask & ipif->ipif_subnet;
15769 ire = ire_lookup_bcast(ill, addr, zoneid);
15770 ASSERT(ire != NULL);
15771 ire_delete(ire); ire_refrele(ire);
15772 ire = ire_lookup_bcast(ill, ~netmask | addr, zoneid);
15773 ASSERT(ire != NULL);
15774 ire_delete(ire); ire_refrele(ire);
15775 }
15776
15777 /*
15778 * Don't create IRE_BROADCAST IREs for the interface if the subnetmask
15779 * is 0xFFFFFFFF, as an IRE_LOCAL for that interface is already
15780 * created. Creating these broadcast IREs will only create confusion
15781 * as `addr' will be the same as the IP address.
15782 */
15783 if (subnetmask != 0xFFFFFFFF) {
15784 addr = ipif->ipif_subnet;
15785 ire = ire_lookup_bcast(ill, addr, zoneid);
15786 ASSERT(ire != NULL);
15787 ire_delete(ire); ire_refrele(ire);
15788 ire = ire_lookup_bcast(ill, ~subnetmask | addr, zoneid);
15789 ASSERT(ire != NULL);
15790 ire_delete(ire); ire_refrele(ire);
15791 }
15792 }
15793
15794 /*
15795 * Extract both the flags (including IFF_CANTCHANGE) such as IFF_IPV*
15796 * from lifr_flags and the name from lifr_name.
15797 * Set IFF_IPV* and ill_isv6 prior to doing the lookup
15798 * since ipif_lookup_on_name uses the _isv6 flags when matching.
15799 * Returns EINPROGRESS when mp has been consumed by queueing it on
15800 * ipx_pending_mp and the ioctl will complete in ip_rput.
15801 *
15802 * Can operate on either a module or a driver queue.
15803 * Returns an error if not a module queue.
15804 */
15805 /* ARGSUSED */
15806 int
15807 ip_sioctl_slifname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
15808 ip_ioctl_cmd_t *ipip, void *if_req)
15809 {
15810 ill_t *ill = q->q_ptr;
15811 phyint_t *phyi;
15812 ip_stack_t *ipst;
15813 struct lifreq *lifr = if_req;
15814 uint64_t new_flags;
15815
15816 ASSERT(ipif != NULL);
15817 ip1dbg(("ip_sioctl_slifname %s\n", lifr->lifr_name));
15818
15819 if (q->q_next == NULL) {
15820 ip1dbg(("if_sioctl_slifname: SIOCSLIFNAME: no q_next\n"));
15821 return (EINVAL);
15822 }
15823
15824 /*
15825 * If we are not writer on 'q' then this interface exists already
15826 * and previous lookups (ip_extract_lifreq()) found this ipif --
15827 * so return EALREADY.
15828 */
15829 if (ill != ipif->ipif_ill)
15830 return (EALREADY);
15831
15832 if (ill->ill_name[0] != '\0')
15833 return (EALREADY);
15834
15835 /*
15836 * If there's another ill already with the requested name, ensure
15837 * that it's of the same type. Otherwise, ill_phyint_reinit() will
15838 * fuse together two unrelated ills, which will cause chaos.
15839 */
15840 ipst = ill->ill_ipst;
15841 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
15842 lifr->lifr_name, NULL);
15843 if (phyi != NULL) {
15844 ill_t *ill_mate = phyi->phyint_illv4;
15845
15846 if (ill_mate == NULL)
15847 ill_mate = phyi->phyint_illv6;
15848 ASSERT(ill_mate != NULL);
15849
15850 if (ill_mate->ill_media->ip_m_mac_type !=
15851 ill->ill_media->ip_m_mac_type) {
15852 ip1dbg(("if_sioctl_slifname: SIOCSLIFNAME: attempt to "
15853 "use the same ill name on differing media\n"));
15854 return (EINVAL);
15855 }
15856 }
15857
15858 /*
15859 * We start off as IFF_IPV4 in ipif_allocate and become
15860 * IFF_IPV4 or IFF_IPV6 here depending on the lifr_flags value.
15861 * The only flags that we read from user space are IFF_IPV4,
15862 * IFF_IPV6, and IFF_BROADCAST.
15863 *
15864 * This ill has not been inserted into the global list.
15865 * So we are still single threaded and don't need any lock.
15866 *
15867 * Sanity check the flags.
15868 */
15869
15870 if ((lifr->lifr_flags & IFF_BROADCAST) &&
15871 ((lifr->lifr_flags & IFF_IPV6) ||
15872 (!ill->ill_needs_attach && ill->ill_bcast_addr_length == 0))) {
15873 ip1dbg(("ip_sioctl_slifname: link not broadcast capable "
15874 "or IPv6 i.e., no broadcast \n"));
15875 return (EINVAL);
15876 }
15877
15878 new_flags =
15879 lifr->lifr_flags & (IFF_IPV6|IFF_IPV4|IFF_BROADCAST);
15880
15881 if ((new_flags ^ (IFF_IPV6|IFF_IPV4)) == 0) {
15882 ip1dbg(("ip_sioctl_slifname: flags must be exactly one of "
15883 "IFF_IPV4 or IFF_IPV6\n"));
15884 return (EINVAL);
15885 }
15886
15887 /*
15888 * We always start off as IPv4, so only need to check for IPv6.
15889 */
15890 if ((new_flags & IFF_IPV6) != 0) {
15891 ill->ill_flags |= ILLF_IPV6;
15892 ill->ill_flags &= ~ILLF_IPV4;
15893
15894 if (lifr->lifr_flags & IFF_NOLINKLOCAL)
15895 ill->ill_flags |= ILLF_NOLINKLOCAL;
15896 }
15897
15898 if ((new_flags & IFF_BROADCAST) != 0)
15899 ipif->ipif_flags |= IPIF_BROADCAST;
15900 else
15901 ipif->ipif_flags &= ~IPIF_BROADCAST;
15902
15903 /* We started off as V4.
*/ 15904 if (ill->ill_flags & ILLF_IPV6) { 15905 ill->ill_phyint->phyint_illv6 = ill; 15906 ill->ill_phyint->phyint_illv4 = NULL; 15907 } 15908 15909 return (ipif_set_values(q, mp, lifr->lifr_name, &lifr->lifr_ppa)); 15910 } 15911 15912 /* ARGSUSED */ 15913 int 15914 ip_sioctl_slifname_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15915 ip_ioctl_cmd_t *ipip, void *if_req) 15916 { 15917 /* 15918 * ill_phyint_reinit merged the v4 and v6 into a single 15919 * ipsq. We might not have been able to complete the 15920 * slifname in ipif_set_values, if we could not become 15921 * exclusive. If so restart it here 15922 */ 15923 return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q)); 15924 } 15925 15926 /* 15927 * Return a pointer to the ipif which matches the index, IP version type and 15928 * zoneid. 15929 */ 15930 ipif_t * 15931 ipif_lookup_on_ifindex(uint_t index, boolean_t isv6, zoneid_t zoneid, 15932 ip_stack_t *ipst) 15933 { 15934 ill_t *ill; 15935 ipif_t *ipif = NULL; 15936 15937 ill = ill_lookup_on_ifindex(index, isv6, ipst); 15938 if (ill != NULL) { 15939 mutex_enter(&ill->ill_lock); 15940 for (ipif = ill->ill_ipif; ipif != NULL; 15941 ipif = ipif->ipif_next) { 15942 if (!IPIF_IS_CONDEMNED(ipif) && (zoneid == ALL_ZONES || 15943 zoneid == ipif->ipif_zoneid || 15944 ipif->ipif_zoneid == ALL_ZONES)) { 15945 ipif_refhold_locked(ipif); 15946 break; 15947 } 15948 } 15949 mutex_exit(&ill->ill_lock); 15950 ill_refrele(ill); 15951 } 15952 return (ipif); 15953 } 15954 15955 /* 15956 * Change an existing physical interface's index. If the new index 15957 * is acceptable we update the index and the phyint_list_avl_by_index tree. 15958 * Finally, we update other systems which may have a dependence on the 15959 * index value. 15960 */ 15961 /* ARGSUSED */ 15962 int 15963 ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 15964 ip_ioctl_cmd_t *ipip, void *ifreq) 15965 { 15966 ill_t *ill; 15967 phyint_t *phyi; 15968 struct ifreq *ifr = (struct ifreq *)ifreq; 15969 struct lifreq *lifr = (struct lifreq *)ifreq; 15970 uint_t old_index, index; 15971 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 15972 avl_index_t where; 15973 15974 if (ipip->ipi_cmd_type == IF_CMD) 15975 index = ifr->ifr_index; 15976 else 15977 index = lifr->lifr_index; 15978 15979 /* 15980 * Only allow on physical interface. Also, index zero is illegal. 15981 */ 15982 ill = ipif->ipif_ill; 15983 phyi = ill->ill_phyint; 15984 if (ipif->ipif_id != 0 || index == 0 || index > IF_INDEX_MAX) { 15985 return (EINVAL); 15986 } 15987 15988 /* If the index is not changing, no work to do */ 15989 if (phyi->phyint_ifindex == index) 15990 return (0); 15991 15992 /* 15993 * Use phyint_exists() to determine if the new interface index 15994 * is already in use. If the index is unused then we need to 15995 * change the phyint's position in the phyint_list_avl_by_index 15996 * tree. If we do not do this, subsequent lookups (using the new 15997 * index value) will not find the phyint. 15998 */ 15999 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 16000 if (phyint_exists(index, ipst)) { 16001 rw_exit(&ipst->ips_ill_g_lock); 16002 return (EEXIST); 16003 } 16004 16005 /* 16006 * The new index is unused. Set it in the phyint. However we must not 16007 * forget to trigger NE_IFINDEX_CHANGE event before the ifindex 16008 * changes. The event must be bound to old ifindex value. 
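 * (Hence the dispatch below runs while phyint_ifindex still holds the
 * old value; the new index is carried as the event payload.)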
16009 */ 16010 ill_nic_event_dispatch(ill, 0, NE_IFINDEX_CHANGE, 16011 &index, sizeof (index)); 16012 16013 old_index = phyi->phyint_ifindex; 16014 phyi->phyint_ifindex = index; 16015 16016 avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, phyi); 16017 (void) avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 16018 &index, &where); 16019 avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 16020 phyi, where); 16021 rw_exit(&ipst->ips_ill_g_lock); 16022 16023 /* Update SCTP's ILL list */ 16024 sctp_ill_reindex(ill, old_index); 16025 16026 /* Send the routing sockets message */ 16027 ip_rts_ifmsg(ipif, RTSQ_DEFAULT); 16028 if (ILL_OTHER(ill)) 16029 ip_rts_ifmsg(ILL_OTHER(ill)->ill_ipif, RTSQ_DEFAULT); 16030 16031 /* Perhaps ilgs should use this ill */ 16032 update_conn_ill(NULL, ill->ill_ipst); 16033 return (0); 16034 } 16035 16036 /* ARGSUSED */ 16037 int 16038 ip_sioctl_get_lifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 16039 ip_ioctl_cmd_t *ipip, void *ifreq) 16040 { 16041 struct ifreq *ifr = (struct ifreq *)ifreq; 16042 struct lifreq *lifr = (struct lifreq *)ifreq; 16043 16044 ip1dbg(("ip_sioctl_get_lifindex(%s:%u %p)\n", 16045 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 16046 /* Get the interface index */ 16047 if (ipip->ipi_cmd_type == IF_CMD) { 16048 ifr->ifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex; 16049 } else { 16050 lifr->lifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex; 16051 } 16052 return (0); 16053 } 16054 16055 /* ARGSUSED */ 16056 int 16057 ip_sioctl_get_lifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 16058 ip_ioctl_cmd_t *ipip, void *ifreq) 16059 { 16060 struct lifreq *lifr = (struct lifreq *)ifreq; 16061 16062 ip1dbg(("ip_sioctl_get_lifzone(%s:%u %p)\n", 16063 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 16064 /* Get the interface zone */ 16065 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 16066 lifr->lifr_zoneid = ipif->ipif_zoneid; 16067 return (0); 16068 } 16069 16070 /* 16071 * Set the zoneid of an interface. 16072 */ 16073 /* ARGSUSED */ 16074 int 16075 ip_sioctl_slifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 16076 ip_ioctl_cmd_t *ipip, void *ifreq) 16077 { 16078 struct lifreq *lifr = (struct lifreq *)ifreq; 16079 int err = 0; 16080 boolean_t need_up = B_FALSE; 16081 zone_t *zptr; 16082 zone_status_t status; 16083 zoneid_t zoneid; 16084 16085 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 16086 if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES) { 16087 if (!is_system_labeled()) 16088 return (ENOTSUP); 16089 zoneid = GLOBAL_ZONEID; 16090 } 16091 16092 /* cannot assign instance zero to a non-global zone */ 16093 if (ipif->ipif_id == 0 && zoneid != GLOBAL_ZONEID) 16094 return (ENOTSUP); 16095 16096 /* 16097 * Cannot assign to a zone that doesn't exist or is shutting down. In 16098 * the event of a race with the zone shutdown processing, since IP 16099 * serializes this ioctl and SIOCGLIFCONF/SIOCLIFREMOVEIF, we know the 16100 * interface will be cleaned up even if the zone is shut down 16101 * immediately after the status check. If the interface can't be brought 16102 * down right away, and the zone is shut down before the restart 16103 * function is called, we resolve the possible races by rechecking the 16104 * zone status in the restart function. 
16105 */ 16106 if ((zptr = zone_find_by_id(zoneid)) == NULL) 16107 return (EINVAL); 16108 status = zone_status_get(zptr); 16109 zone_rele(zptr); 16110 16111 if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING) 16112 return (EINVAL); 16113 16114 if (ipif->ipif_flags & IPIF_UP) { 16115 /* 16116 * If the interface is already marked up, 16117 * we call ipif_down which will take care 16118 * of ditching any IREs that have been set 16119 * up based on the old interface address. 16120 */ 16121 err = ipif_logical_down(ipif, q, mp); 16122 if (err == EINPROGRESS) 16123 return (err); 16124 (void) ipif_down_tail(ipif); 16125 need_up = B_TRUE; 16126 } 16127 16128 err = ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp, need_up); 16129 return (err); 16130 } 16131 16132 static int 16133 ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid, 16134 queue_t *q, mblk_t *mp, boolean_t need_up) 16135 { 16136 int err = 0; 16137 ip_stack_t *ipst; 16138 16139 ip1dbg(("ip_sioctl_zoneid_tail(%s:%u %p)\n", 16140 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 16141 16142 if (CONN_Q(q)) 16143 ipst = CONNQ_TO_IPST(q); 16144 else 16145 ipst = ILLQ_TO_IPST(q); 16146 16147 /* 16148 * For exclusive stacks we don't allow a different zoneid than 16149 * global. 16150 */ 16151 if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID && 16152 zoneid != GLOBAL_ZONEID) 16153 return (EINVAL); 16154 16155 /* Set the new zone id. */ 16156 ipif->ipif_zoneid = zoneid; 16157 16158 /* Update sctp list */ 16159 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); 16160 16161 /* The default multicast interface might have changed */ 16162 ire_increment_multicast_generation(ipst, ipif->ipif_ill->ill_isv6); 16163 16164 if (need_up) { 16165 /* 16166 * Now bring the interface back up. If this 16167 * is the only IPIF for the ILL, ipif_up 16168 * will have to re-bind to the device, so 16169 * we may get back EINPROGRESS, in which 16170 * case, this IOCTL will get completed in 16171 * ip_rput_dlpi when we see the DL_BIND_ACK. 16172 */ 16173 err = ipif_up(ipif, q, mp); 16174 } 16175 return (err); 16176 } 16177 16178 /* ARGSUSED */ 16179 int 16180 ip_sioctl_slifzone_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 16181 ip_ioctl_cmd_t *ipip, void *if_req) 16182 { 16183 struct lifreq *lifr = (struct lifreq *)if_req; 16184 zoneid_t zoneid; 16185 zone_t *zptr; 16186 zone_status_t status; 16187 16188 ASSERT(ipip->ipi_cmd_type == LIF_CMD); 16189 if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES) 16190 zoneid = GLOBAL_ZONEID; 16191 16192 ip1dbg(("ip_sioctl_slifzone_restart(%s:%u %p)\n", 16193 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); 16194 16195 /* 16196 * We recheck the zone status to resolve the following race condition: 16197 * 1) process sends SIOCSLIFZONE to put hme0:1 in zone "myzone"; 16198 * 2) hme0:1 is up and can't be brought down right away; 16199 * ip_sioctl_slifzone() returns EINPROGRESS and the request is queued; 16200 * 3) zone "myzone" is halted; the zone status switches to 16201 * 'shutting_down' and the zones framework sends SIOCGLIFCONF to list 16202 * the interfaces to remove - hme0:1 is not returned because it's not 16203 * yet in "myzone", so it won't be removed; 16204 * 4) the restart function for SIOCSLIFZONE is called; without the 16205 * status check here, we would have hme0:1 in "myzone" after it's been 16206 * destroyed. 
16207 * Note that if the status check fails, we need to bring the interface
16208 * back to its state prior to ip_sioctl_slifzone(), hence the call to
16209 * ipif_up_done[_v6]().
16210 */
16211 status = ZONE_IS_UNINITIALIZED;
16212 if ((zptr = zone_find_by_id(zoneid)) != NULL) {
16213 status = zone_status_get(zptr);
16214 zone_rele(zptr);
16215 }
16216 if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING) {
16217 if (ipif->ipif_isv6) {
16218 (void) ipif_up_done_v6(ipif);
16219 } else {
16220 (void) ipif_up_done(ipif);
16221 }
16222 return (EINVAL);
16223 }
16224
16225 (void) ipif_down_tail(ipif);
16226
16227 return (ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp,
16228 B_TRUE));
16229 }
16230
16231 /*
16232 * Return the number of addresses on `ill' with one or more of the values
16233 * in `set' set and all of the values in `clear' clear.
16234 */
16235 static uint_t
16236 ill_flagaddr_cnt(const ill_t *ill, uint64_t set, uint64_t clear)
16237 {
16238 ipif_t *ipif;
16239 uint_t cnt = 0;
16240
16241 ASSERT(IAM_WRITER_ILL(ill));
16242
16243 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
16244 if ((ipif->ipif_flags & set) && !(ipif->ipif_flags & clear))
16245 cnt++;
16246
16247 return (cnt);
16248 }
16249
16250 /*
16251 * Return the number of migratable addresses on `ill' that are under
16252 * application control.
16253 */
16254 uint_t
16255 ill_appaddr_cnt(const ill_t *ill)
16256 {
16257 return (ill_flagaddr_cnt(ill, IPIF_DHCPRUNNING | IPIF_ADDRCONF,
16258 IPIF_NOFAILOVER));
16259 }
16260
16261 /*
16262 * Return the number of point-to-point addresses on `ill'.
16263 */
16264 uint_t
16265 ill_ptpaddr_cnt(const ill_t *ill)
16266 {
16267 return (ill_flagaddr_cnt(ill, IPIF_POINTOPOINT, 0));
16268 }
16269
16270 /* ARGSUSED */
16271 int
16272 ip_sioctl_get_lifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
16273 ip_ioctl_cmd_t *ipip, void *ifreq)
16274 {
16275 struct lifreq *lifr = ifreq;
16276
16277 ASSERT(q->q_next == NULL);
16278 ASSERT(CONN_Q(q));
16279
16280 ip1dbg(("ip_sioctl_get_lifusesrc(%s:%u %p)\n",
16281 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
16282 lifr->lifr_index = ipif->ipif_ill->ill_usesrc_ifindex;
16283 ip1dbg(("ip_sioctl_get_lifusesrc:lifr_index = %d\n", lifr->lifr_index));
16284
16285 return (0);
16286 }
16287
16288 /* Find the previous ILL in this usesrc group */
16289 static ill_t *
16290 ill_prev_usesrc(ill_t *uill)
16291 {
16292 ill_t *ill;
16293
16294 for (ill = uill->ill_usesrc_grp_next;
16295 ASSERT(ill), ill->ill_usesrc_grp_next != uill;
16296 ill = ill->ill_usesrc_grp_next)
16297 /* do nothing */;
16298 return (ill);
16299 }
16300
16301 /*
16302 * Release all members of the usesrc group. This routine is called
16303 * from ill_delete when the interface being unplumbed is the
16304 * group head.
16305 *
16306 * This silently clears the usesrc that ifconfig set up.
16307 * An alternative would be to keep that ifindex, and drop packets on the floor
16308 * since no source address can be selected.
16309 * Even if we keep the current semantics, we don't need a lock and a linked
16310 * list. We can walk all the ills checking if they have an ill_usesrc_ifindex
16311 * matching the one that is being removed. The issue is how we return the
16312 * usesrc users (SIOCGLIFSRCOF). We want to be able to find the ills which
16313 * have an ill_usesrc_ifindex matching a target ill. We could also do that
16314 * with an ill walk, but the walker would need to insert in the ioctl response.
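 *
 * (Structure note: the usesrc group is a circular singly-linked list
 * through ill_usesrc_grp_next. The head is the usesrc ill itself,
 * with ill_usesrc_ifindex == 0; each client has ill_usesrc_ifindex
 * set to the head's ifindex. The loop below unlinks clients until it
 * wraps back around to the head.)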
16315 */
16316 static void
16317 ill_disband_usesrc_group(ill_t *uill)
16318 {
16319 ill_t *next_ill, *tmp_ill;
16320 ip_stack_t *ipst = uill->ill_ipst;
16321
16322 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_usesrc_lock));
16323 next_ill = uill->ill_usesrc_grp_next;
16324
16325 do {
16326 ASSERT(next_ill != NULL);
16327 tmp_ill = next_ill->ill_usesrc_grp_next;
16328 ASSERT(tmp_ill != NULL);
16329 next_ill->ill_usesrc_grp_next = NULL;
16330 next_ill->ill_usesrc_ifindex = 0;
16331 next_ill = tmp_ill;
16332 } while (next_ill->ill_usesrc_ifindex != 0);
16333 uill->ill_usesrc_grp_next = NULL;
16334 }
16335
16336 /*
16337 * Remove the client usesrc ILL from the list and relink it to a new list.
16338 */
16339 int
16340 ill_relink_usesrc_ills(ill_t *ucill, ill_t *uill, uint_t ifindex)
16341 {
16342 ill_t *ill, *tmp_ill;
16343 ip_stack_t *ipst = ucill->ill_ipst;
16344
16345 ASSERT((ucill != NULL) && (ucill->ill_usesrc_grp_next != NULL) &&
16346 (uill != NULL) && RW_WRITE_HELD(&ipst->ips_ill_g_usesrc_lock));
16347
16348 /*
16349 * Check that the usesrc client ILL passed in is not already
16350 * in use as a usesrc ILL, i.e., one whose source address is
16351 * in use, and that the usesrc ILL is not already in use as a
16352 * usesrc client ILL.
16353 */
16354 if ((ucill->ill_usesrc_ifindex == 0) ||
16355 (uill->ill_usesrc_ifindex != 0)) {
16356 return (-1);
16357 }
16358
16359 ill = ill_prev_usesrc(ucill);
16360 ASSERT(ill->ill_usesrc_grp_next != NULL);
16361
16362 /* Remove from the current list */
16363 if (ill->ill_usesrc_grp_next->ill_usesrc_grp_next == ill) {
16364 /* Only two elements in the list */
16365 ASSERT(ill->ill_usesrc_ifindex == 0);
16366 ill->ill_usesrc_grp_next = NULL;
16367 } else {
16368 ill->ill_usesrc_grp_next = ucill->ill_usesrc_grp_next;
16369 }
16370
16371 if (ifindex == 0) {
16372 ucill->ill_usesrc_ifindex = 0;
16373 ucill->ill_usesrc_grp_next = NULL;
16374 return (0);
16375 }
16376
16377 ucill->ill_usesrc_ifindex = ifindex;
16378 tmp_ill = uill->ill_usesrc_grp_next;
16379 uill->ill_usesrc_grp_next = ucill;
16380 ucill->ill_usesrc_grp_next =
16381 (tmp_ill != NULL) ? tmp_ill : uill;
16382 return (0);
16383 }
16384
16385 /*
16386 * Set the ill_usesrc and ill_usesrc_head fields. See synchronization notes in
16387 * ip.c for locking details.
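 *
 * A hypothetical ifconfig(8) sequence that ends up here:
 *	ifconfig vni0 usesrc hme0	(lifr_index = hme0's ifindex)
 *	ifconfig vni0 usesrc none	(lifr_index = 0, i.e., a reset)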
16388 */ 16389 /* ARGSUSED */ 16390 int 16391 ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 16392 ip_ioctl_cmd_t *ipip, void *ifreq) 16393 { 16394 struct lifreq *lifr = (struct lifreq *)ifreq; 16395 boolean_t isv6 = B_FALSE, reset_flg = B_FALSE; 16396 ill_t *usesrc_ill, *usesrc_cli_ill = ipif->ipif_ill; 16397 int err = 0, ret; 16398 uint_t ifindex; 16399 ipsq_t *ipsq = NULL; 16400 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 16401 16402 ASSERT(IAM_WRITER_IPIF(ipif)); 16403 ASSERT(q->q_next == NULL); 16404 ASSERT(CONN_Q(q)); 16405 16406 isv6 = (Q_TO_CONN(q))->conn_family == AF_INET6; 16407 16408 ifindex = lifr->lifr_index; 16409 if (ifindex == 0) { 16410 if (usesrc_cli_ill->ill_usesrc_grp_next == NULL) { 16411 /* non usesrc group interface, nothing to reset */ 16412 return (0); 16413 } 16414 ifindex = usesrc_cli_ill->ill_usesrc_ifindex; 16415 /* valid reset request */ 16416 reset_flg = B_TRUE; 16417 } 16418 16419 usesrc_ill = ill_lookup_on_ifindex(ifindex, isv6, ipst); 16420 if (usesrc_ill == NULL) 16421 return (ENXIO); 16422 if (usesrc_ill == ipif->ipif_ill) { 16423 ill_refrele(usesrc_ill); 16424 return (EINVAL); 16425 } 16426 16427 ipsq = ipsq_try_enter(NULL, usesrc_ill, q, mp, ip_process_ioctl, 16428 NEW_OP, B_TRUE); 16429 if (ipsq == NULL) { 16430 err = EINPROGRESS; 16431 /* Operation enqueued on the ipsq of the usesrc ILL */ 16432 goto done; 16433 } 16434 16435 /* USESRC isn't currently supported with IPMP */ 16436 if (IS_IPMP(usesrc_ill) || IS_UNDER_IPMP(usesrc_ill)) { 16437 err = ENOTSUP; 16438 goto done; 16439 } 16440 16441 /* 16442 * USESRC isn't compatible with the STANDBY flag. (STANDBY is only 16443 * used by IPMP underlying interfaces, but someone might think it's 16444 * more general and try to use it independently with VNI.) 16445 */ 16446 if (usesrc_ill->ill_phyint->phyint_flags & PHYI_STANDBY) { 16447 err = ENOTSUP; 16448 goto done; 16449 } 16450 16451 /* 16452 * If the client is already in use as a usesrc_ill or a usesrc_ill is 16453 * already a client then return EINVAL 16454 */ 16455 if (IS_USESRC_ILL(usesrc_cli_ill) || IS_USESRC_CLI_ILL(usesrc_ill)) { 16456 err = EINVAL; 16457 goto done; 16458 } 16459 16460 /* 16461 * If the ill_usesrc_ifindex field is already set to what it needs to 16462 * be then this is a duplicate operation. 16463 */ 16464 if (!reset_flg && usesrc_cli_ill->ill_usesrc_ifindex == ifindex) { 16465 err = 0; 16466 goto done; 16467 } 16468 16469 ip1dbg(("ip_sioctl_slifusesrc: usesrc_cli_ill %s, usesrc_ill %s," 16470 " v6 = %d", usesrc_cli_ill->ill_name, usesrc_ill->ill_name, 16471 usesrc_ill->ill_isv6)); 16472 16473 /* 16474 * ill_g_usesrc_lock global lock protects the ill_usesrc_grp_next 16475 * and the ill_usesrc_ifindex fields 16476 */ 16477 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER); 16478 16479 if (reset_flg) { 16480 ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill, 0); 16481 if (ret != 0) { 16482 err = EINVAL; 16483 } 16484 rw_exit(&ipst->ips_ill_g_usesrc_lock); 16485 goto done; 16486 } 16487 16488 /* 16489 * Four possibilities to consider: 16490 * 1. Both usesrc_ill and usesrc_cli_ill are not part of any usesrc grp 16491 * 2. usesrc_ill is part of a group but usesrc_cli_ill isn't 16492 * 3. usesrc_cli_ill is part of a group but usesrc_ill isn't 16493 * 4. 
Both are part of their respective usesrc groups 16494 */ 16495 if ((usesrc_ill->ill_usesrc_grp_next == NULL) && 16496 (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) { 16497 ASSERT(usesrc_ill->ill_usesrc_ifindex == 0); 16498 usesrc_cli_ill->ill_usesrc_ifindex = ifindex; 16499 usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill; 16500 usesrc_cli_ill->ill_usesrc_grp_next = usesrc_ill; 16501 } else if ((usesrc_ill->ill_usesrc_grp_next != NULL) && 16502 (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) { 16503 usesrc_cli_ill->ill_usesrc_ifindex = ifindex; 16504 /* Insert at head of list */ 16505 usesrc_cli_ill->ill_usesrc_grp_next = 16506 usesrc_ill->ill_usesrc_grp_next; 16507 usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill; 16508 } else { 16509 ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill, 16510 ifindex); 16511 if (ret != 0) 16512 err = EINVAL; 16513 } 16514 rw_exit(&ipst->ips_ill_g_usesrc_lock); 16515 16516 done: 16517 if (ipsq != NULL) 16518 ipsq_exit(ipsq); 16519 /* The refrele on the lifr_name ipif is done by ip_process_ioctl */ 16520 ill_refrele(usesrc_ill); 16521 16522 /* Let conn_ixa caching know that source address selection changed */ 16523 ip_update_source_selection(ipst); 16524 16525 return (err); 16526 } 16527 16528 /* ARGSUSED */ 16529 int 16530 ip_sioctl_get_dadstate(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, 16531 ip_ioctl_cmd_t *ipip, void *if_req) 16532 { 16533 struct lifreq *lifr = (struct lifreq *)if_req; 16534 ill_t *ill = ipif->ipif_ill; 16535 16536 /* 16537 * Need a lock since IFF_UP can be set even when there are 16538 * references to the ipif. 16539 */ 16540 mutex_enter(&ill->ill_lock); 16541 if ((ipif->ipif_flags & IPIF_UP) && ipif->ipif_addr_ready == 0) 16542 lifr->lifr_dadstate = DAD_IN_PROGRESS; 16543 else 16544 lifr->lifr_dadstate = DAD_DONE; 16545 mutex_exit(&ill->ill_lock); 16546 return (0); 16547 } 16548 16549 /* 16550 * comparison function used by avl. 16551 */ 16552 static int 16553 ill_phyint_compare_index(const void *index_ptr, const void *phyip) 16554 { 16555 16556 uint_t index; 16557 16558 ASSERT(phyip != NULL && index_ptr != NULL); 16559 16560 index = *((uint_t *)index_ptr); 16561 /* 16562 * let the phyint with the lowest index be on top. 16563 */ 16564 if (((phyint_t *)phyip)->phyint_ifindex < index) 16565 return (1); 16566 if (((phyint_t *)phyip)->phyint_ifindex > index) 16567 return (-1); 16568 return (0); 16569 } 16570 16571 /* 16572 * comparison function used by avl. 16573 */ 16574 static int 16575 ill_phyint_compare_name(const void *name_ptr, const void *phyip) 16576 { 16577 ill_t *ill; 16578 int res = 0; 16579 16580 ASSERT(phyip != NULL && name_ptr != NULL); 16581 16582 if (((phyint_t *)phyip)->phyint_illv4) 16583 ill = ((phyint_t *)phyip)->phyint_illv4; 16584 else 16585 ill = ((phyint_t *)phyip)->phyint_illv6; 16586 ASSERT(ill != NULL); 16587 16588 res = strcmp(ill->ill_name, (char *)name_ptr); 16589 if (res > 0) 16590 return (1); 16591 else if (res < 0) 16592 return (-1); 16593 return (0); 16594 } 16595 16596 /* 16597 * This function is called on the unplumb path via ill_glist_delete() when 16598 * there are no ills left on the phyint and thus the phyint can be freed. 16599 */ 16600 static void 16601 phyint_free(phyint_t *phyi) 16602 { 16603 ip_stack_t *ipst = PHYINT_TO_IPST(phyi); 16604 16605 ASSERT(phyi->phyint_illv4 == NULL && phyi->phyint_illv6 == NULL); 16606 16607 /* 16608 * If this phyint was an IPMP meta-interface, blow away the group. 
16609 * This is safe to do because all of the illgrps have already been 16610 * removed by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find us. 16611 * If we're cleaning up as a result of failed initialization, 16612 * phyint_grp may be NULL. 16613 */ 16614 if ((phyi->phyint_flags & PHYI_IPMP) && (phyi->phyint_grp != NULL)) { 16615 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 16616 ipmp_grp_destroy(phyi->phyint_grp); 16617 phyi->phyint_grp = NULL; 16618 rw_exit(&ipst->ips_ipmp_lock); 16619 } 16620 16621 /* 16622 * If this interface was under IPMP, take it out of the group. 16623 */ 16624 if (phyi->phyint_grp != NULL) 16625 ipmp_phyint_leave_grp(phyi); 16626 16627 /* 16628 * Delete the phyint and disassociate its ipsq. The ipsq itself 16629 * will be freed in ipsq_exit(). 16630 */ 16631 phyi->phyint_ipsq->ipsq_phyint = NULL; 16632 phyi->phyint_name[0] = '\0'; 16633 16634 mi_free(phyi); 16635 } 16636 16637 /* 16638 * Attach the ill to the phyint structure which can be shared by both 16639 * IPv4 and IPv6 ill. ill_init allocates a phyint to just hold flags. This 16640 * function is called from ipif_set_values and ill_lookup_on_name (for 16641 * loopback) where we know the name of the ill. We lookup the ill and if 16642 * there is one present already with the name use that phyint. Otherwise 16643 * reuse the one allocated by ill_init. 16644 */ 16645 static void 16646 ill_phyint_reinit(ill_t *ill) 16647 { 16648 boolean_t isv6 = ill->ill_isv6; 16649 phyint_t *phyi_old; 16650 phyint_t *phyi; 16651 avl_index_t where = 0; 16652 ill_t *ill_other = NULL; 16653 ip_stack_t *ipst = ill->ill_ipst; 16654 16655 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock)); 16656 16657 phyi_old = ill->ill_phyint; 16658 ASSERT(isv6 || (phyi_old->phyint_illv4 == ill && 16659 phyi_old->phyint_illv6 == NULL)); 16660 ASSERT(!isv6 || (phyi_old->phyint_illv6 == ill && 16661 phyi_old->phyint_illv4 == NULL)); 16662 ASSERT(phyi_old->phyint_ifindex == 0); 16663 16664 /* 16665 * Now that our ill has a name, set it in the phyint. 16666 */ 16667 (void) strlcpy(ill->ill_phyint->phyint_name, ill->ill_name, LIFNAMSIZ); 16668 16669 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 16670 ill->ill_name, &where); 16671 16672 /* 16673 * 1. We grabbed the ill_g_lock before inserting this ill into 16674 * the global list of ills. So no other thread could have located 16675 * this ill and hence the ipsq of this ill is guaranteed to be empty. 16676 * 2. Now locate the other protocol instance of this ill. 16677 * 3. Now grab both ill locks in the right order, and the phyint lock of 16678 * the new ipsq. Holding ill locks + ill_g_lock ensures that the ipsq 16679 * of neither ill can change. 16680 * 4. Merge the phyint and thus the ipsq as well of this ill onto the 16681 * other ill. 16682 * 5. Release all locks. 16683 */ 16684 16685 /* 16686 * Look for IPv4 if we are initializing IPv6 or look for IPv6 if 16687 * we are initializing IPv4. 16688 */ 16689 if (phyi != NULL) { 16690 ill_other = (isv6) ? phyi->phyint_illv4 : phyi->phyint_illv6; 16691 ASSERT(ill_other->ill_phyint != NULL); 16692 ASSERT((isv6 && !ill_other->ill_isv6) || 16693 (!isv6 && ill_other->ill_isv6)); 16694 GRAB_ILL_LOCKS(ill, ill_other); 16695 /* 16696 * We are potentially throwing away phyint_flags which 16697 * could be different from the one that we obtain from 16698 * ill_other->ill_phyint. But it is okay as we are assuming 16699 * that the state maintained within IP is correct. 
16700 */ 16701 mutex_enter(&phyi->phyint_lock); 16702 if (isv6) { 16703 ASSERT(phyi->phyint_illv6 == NULL); 16704 phyi->phyint_illv6 = ill; 16705 } else { 16706 ASSERT(phyi->phyint_illv4 == NULL); 16707 phyi->phyint_illv4 = ill; 16708 } 16709 16710 /* 16711 * Delete the old phyint and make its ipsq eligible 16712 * to be freed in ipsq_exit(). 16713 */ 16714 phyi_old->phyint_illv4 = NULL; 16715 phyi_old->phyint_illv6 = NULL; 16716 phyi_old->phyint_ipsq->ipsq_phyint = NULL; 16717 phyi_old->phyint_name[0] = '\0'; 16718 mi_free(phyi_old); 16719 } else { 16720 mutex_enter(&ill->ill_lock); 16721 /* 16722 * We don't need to acquire any lock, since 16723 * the ill is not yet visible globally and we 16724 * have not yet released the ill_g_lock. 16725 */ 16726 phyi = phyi_old; 16727 mutex_enter(&phyi->phyint_lock); 16728 /* XXX We need a recovery strategy here. */ 16729 if (!phyint_assign_ifindex(phyi, ipst)) 16730 cmn_err(CE_PANIC, "phyint_assign_ifindex() failed"); 16731 16732 avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 16733 (void *)phyi, where); 16734 16735 (void) avl_find(&ipst->ips_phyint_g_list-> 16736 phyint_list_avl_by_index, 16737 &phyi->phyint_ifindex, &where); 16738 avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 16739 (void *)phyi, where); 16740 } 16741 16742 /* 16743 * Reassigning ill_phyint automatically reassigns the ipsq also. 16744 * The pending mp is not affected because that is kept on a per-ill basis. 16745 */ 16746 ill->ill_phyint = phyi; 16747 16748 /* 16749 * Now that the phyint's ifindex has been assigned, complete the 16750 * remaining MIB ifindex and multicast version initialization below. 16751 */ 16752 ill->ill_ip_mib->ipIfStatsIfIndex = ill->ill_phyint->phyint_ifindex; 16753 if (ill->ill_isv6) { 16754 ill->ill_icmp6_mib->ipv6IfIcmpIfIndex = 16755 ill->ill_phyint->phyint_ifindex; 16756 ill->ill_mcast_type = ipst->ips_mld_max_version; 16757 } else { 16758 ill->ill_mcast_type = ipst->ips_igmp_max_version; 16759 } 16760 16761 /* 16762 * Generate an event within the hooks framework to indicate that 16763 * a new interface has just been added to IP. For this event to 16764 * be generated, the network interface must, at least, have an 16765 * ifindex assigned to it. (We don't generate the event for 16766 * loopback since ill_lookup_on_name() has its own NE_PLUMB event.) 16767 * 16768 * This needs to be run inside the ill_g_lock perimeter to ensure 16769 * that the ordering of delivered events to listeners matches the 16770 * order of them in the kernel. 16771 */ 16772 if (!IS_LOOPBACK(ill)) { 16773 ill_nic_event_dispatch(ill, 0, NE_PLUMB, ill->ill_name, 16774 ill->ill_name_length); 16775 } 16776 RELEASE_ILL_LOCKS(ill, ill_other); 16777 mutex_exit(&phyi->phyint_lock); 16778 } 16779 16780 /* 16781 * Notify any downstream modules of the name of this interface. 16782 * An M_IOCTL is used even though we don't expect a successful reply. 16783 * Any reply message from the driver (presumably an M_IOCNAK) will 16784 * eventually get discarded somewhere upstream. The message format is 16785 * simply an SIOCSLIFNAME ioctl just as might be sent from ifconfig 16786 * to IP.
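 * The chain built below is thus an M_IOCTL mblk (a struct iocblk with
 * ioc_cmd set to SIOCSLIFNAME) whose b_cont carries a struct lifreq
 * holding lifr_name, lifr_ppa, and the ILLF_IPV4/ILLF_IPV6 flags.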
16787 */ 16788 static void 16789 ip_ifname_notify(ill_t *ill, queue_t *q) 16790 { 16791 mblk_t *mp1, *mp2; 16792 struct iocblk *iocp; 16793 struct lifreq *lifr; 16794 16795 mp1 = mkiocb(SIOCSLIFNAME); 16796 if (mp1 == NULL) 16797 return; 16798 mp2 = allocb(sizeof (struct lifreq), BPRI_HI); 16799 if (mp2 == NULL) { 16800 freeb(mp1); 16801 return; 16802 } 16803 16804 mp1->b_cont = mp2; 16805 iocp = (struct iocblk *)mp1->b_rptr; 16806 iocp->ioc_count = sizeof (struct lifreq); 16807 16808 lifr = (struct lifreq *)mp2->b_rptr; 16809 mp2->b_wptr += sizeof (struct lifreq); 16810 bzero(lifr, sizeof (struct lifreq)); 16811 16812 (void) strncpy(lifr->lifr_name, ill->ill_name, LIFNAMSIZ); 16813 lifr->lifr_ppa = ill->ill_ppa; 16814 lifr->lifr_flags = (ill->ill_flags & (ILLF_IPV4|ILLF_IPV6)); 16815 16816 DTRACE_PROBE3(ill__dlpi, char *, "ip_ifname_notify", 16817 char *, "SIOCSLIFNAME", ill_t *, ill); 16818 putnext(q, mp1); 16819 } 16820 16821 static int 16822 ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) 16823 { 16824 int err; 16825 ip_stack_t *ipst = ill->ill_ipst; 16826 phyint_t *phyi = ill->ill_phyint; 16827 16828 /* 16829 * Now that ill_name is set, the configuration for the IPMP 16830 * meta-interface can be performed. 16831 */ 16832 if (IS_IPMP(ill)) { 16833 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 16834 /* 16835 * If phyi->phyint_grp is NULL, then this is the first IPMP 16836 * meta-interface and we need to create the IPMP group. 16837 */ 16838 if (phyi->phyint_grp == NULL) { 16839 /* 16840 * If someone has renamed another IPMP group to have 16841 * the same name as our interface, bail. 16842 */ 16843 if (ipmp_grp_lookup(ill->ill_name, ipst) != NULL) { 16844 rw_exit(&ipst->ips_ipmp_lock); 16845 return (EEXIST); 16846 } 16847 phyi->phyint_grp = ipmp_grp_create(ill->ill_name, phyi); 16848 if (phyi->phyint_grp == NULL) { 16849 rw_exit(&ipst->ips_ipmp_lock); 16850 return (ENOMEM); 16851 } 16852 } 16853 rw_exit(&ipst->ips_ipmp_lock); 16854 } 16855 16856 /* Tell downstream modules where they are. */ 16857 ip_ifname_notify(ill, q); 16858 16859 /* 16860 * ill_dl_phys returns EINPROGRESS in the usual case. 16861 * Error cases are ENOMEM ... 16862 */ 16863 err = ill_dl_phys(ill, ipif, mp, q); 16864 16865 if (ill->ill_isv6) { 16866 mutex_enter(&ipst->ips_mld_slowtimeout_lock); 16867 if (ipst->ips_mld_slowtimeout_id == 0) { 16868 ipst->ips_mld_slowtimeout_id = timeout(mld_slowtimo, 16869 (void *)ipst, 16870 MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL)); 16871 } 16872 mutex_exit(&ipst->ips_mld_slowtimeout_lock); 16873 } else { 16874 mutex_enter(&ipst->ips_igmp_slowtimeout_lock); 16875 if (ipst->ips_igmp_slowtimeout_id == 0) { 16876 ipst->ips_igmp_slowtimeout_id = timeout(igmp_slowtimo, 16877 (void *)ipst, 16878 MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL)); 16879 } 16880 mutex_exit(&ipst->ips_igmp_slowtimeout_lock); 16881 } 16882 16883 return (err); 16884 } 16885 16886 /* 16887 * Common routine for ppa and ifname setting. Should be called exclusive. 16888 * 16889 * Returns EINPROGRESS when mp has been consumed by queueing it on 16890 * ipx_pending_mp and the ioctl will complete in ip_rput. 16891 * 16892 * NOTE : If ppa is UNIT_MAX, we assign the next valid ppa and return 16893 * the new name and new ppa in lifr_name and lifr_ppa respectively. 16894 * For SLIFNAME, we pass these values back to the userland. 
16895 */ 16896 static int 16897 ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr) 16898 { 16899 ill_t *ill; 16900 ipif_t *ipif; 16901 ipsq_t *ipsq; 16902 char *ppa_ptr; 16903 char *old_ptr; 16904 char old_char; 16905 int error; 16906 ip_stack_t *ipst; 16907 16908 ip1dbg(("ipif_set_values: interface %s\n", interf_name)); 16909 ASSERT(q->q_next != NULL); 16910 ASSERT(interf_name != NULL); 16911 16912 ill = (ill_t *)q->q_ptr; 16913 ipst = ill->ill_ipst; 16914 16915 ASSERT(ill->ill_ipst != NULL); 16916 ASSERT(ill->ill_name[0] == '\0'); 16917 ASSERT(IAM_WRITER_ILL(ill)); 16918 ASSERT((mi_strlen(interf_name) + 1) <= LIFNAMSIZ); 16919 ASSERT(ill->ill_ppa == UINT_MAX); 16920 16921 ill->ill_defend_start = ill->ill_defend_count = 0; 16922 /* The ppa is sent down by ifconfig or is chosen */ 16923 if ((ppa_ptr = ill_get_ppa_ptr(interf_name)) == NULL) { 16924 return (EINVAL); 16925 } 16926 16927 /* 16928 * make sure ppa passed in is same as ppa in the name. 16929 * This check is not made when ppa == UINT_MAX in that case ppa 16930 * in the name could be anything. System will choose a ppa and 16931 * update new_ppa_ptr and inter_name to contain the choosen ppa. 16932 */ 16933 if (*new_ppa_ptr != UINT_MAX) { 16934 /* stoi changes the pointer */ 16935 old_ptr = ppa_ptr; 16936 /* 16937 * ifconfig passed in 0 for the ppa for DLPI 1 style devices 16938 * (they don't have an externally visible ppa). We assign one 16939 * here so that we can manage the interface. Note that in 16940 * the past this value was always 0 for DLPI 1 drivers. 16941 */ 16942 if (*new_ppa_ptr == 0) 16943 *new_ppa_ptr = stoi(&old_ptr); 16944 else if (*new_ppa_ptr != (uint_t)stoi(&old_ptr)) 16945 return (EINVAL); 16946 } 16947 /* 16948 * terminate string before ppa 16949 * save char at that location. 16950 */ 16951 old_char = ppa_ptr[0]; 16952 ppa_ptr[0] = '\0'; 16953 16954 ill->ill_ppa = *new_ppa_ptr; 16955 /* 16956 * Finish as much work now as possible before calling ill_glist_insert 16957 * which makes the ill globally visible and also merges it with the 16958 * other protocol instance of this phyint. The remaining work is 16959 * done after entering the ipsq which may happen sometime later. 16960 */ 16961 ipif = ill->ill_ipif; 16962 16963 /* We didn't do this when we allocated ipif in ip_ll_subnet_defaults */ 16964 ipif_assign_seqid(ipif); 16965 16966 if (!(ill->ill_flags & (ILLF_IPV4|ILLF_IPV6))) 16967 ill->ill_flags |= ILLF_IPV4; 16968 16969 ASSERT(ipif->ipif_next == NULL); /* Only one ipif on ill */ 16970 ASSERT((ipif->ipif_flags & IPIF_UP) == 0); 16971 16972 if (ill->ill_flags & ILLF_IPV6) { 16973 16974 ill->ill_isv6 = B_TRUE; 16975 ill_set_inputfn(ill); 16976 if (ill->ill_rq != NULL) { 16977 ill->ill_rq->q_qinfo = &iprinitv6; 16978 } 16979 16980 /* Keep the !IN6_IS_ADDR_V4MAPPED assertions happy */ 16981 ipif->ipif_v6lcl_addr = ipv6_all_zeros; 16982 ipif->ipif_v6subnet = ipv6_all_zeros; 16983 ipif->ipif_v6net_mask = ipv6_all_zeros; 16984 ipif->ipif_v6brd_addr = ipv6_all_zeros; 16985 ipif->ipif_v6pp_dst_addr = ipv6_all_zeros; 16986 ill->ill_reachable_retrans_time = ND_RETRANS_TIMER; 16987 /* 16988 * point-to-point or Non-mulicast capable 16989 * interfaces won't do NUD unless explicitly 16990 * configured to do so. 
16991 */ 16992 if (ipif->ipif_flags & IPIF_POINTOPOINT || 16993 !(ill->ill_flags & ILLF_MULTICAST)) { 16994 ill->ill_flags |= ILLF_NONUD; 16995 } 16996 /* Make sure IPv4 specific flag is not set on IPv6 if */ 16997 if (ill->ill_flags & ILLF_NOARP) { 16998 /* 16999 * Note: xresolv interfaces will eventually need 17000 * NOARP set here as well, but that will require 17001 * those external resolvers to have some 17002 * knowledge of that flag and act appropriately. 17003 * Not to be changed at present. 17004 */ 17005 ill->ill_flags &= ~ILLF_NOARP; 17006 } 17007 /* 17008 * Set the ILLF_ROUTER flag according to the global 17009 * IPv6 forwarding policy. 17010 */ 17011 if (ipst->ips_ipv6_forwarding != 0) 17012 ill->ill_flags |= ILLF_ROUTER; 17013 } else if (ill->ill_flags & ILLF_IPV4) { 17014 ill->ill_isv6 = B_FALSE; 17015 ill_set_inputfn(ill); 17016 ill->ill_reachable_retrans_time = ARP_RETRANS_TIMER; 17017 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6lcl_addr); 17018 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6subnet); 17019 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6net_mask); 17020 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6brd_addr); 17021 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6pp_dst_addr); 17022 /* 17023 * Set the ILLF_ROUTER flag according to the global 17024 * IPv4 forwarding policy. 17025 */ 17026 if (ipst->ips_ip_forwarding != 0) 17027 ill->ill_flags |= ILLF_ROUTER; 17028 } 17029 17030 ASSERT(ill->ill_phyint != NULL); 17031 17032 /* 17033 * The ipIfStatsIfindex and ipv6IfIcmpIfIndex assignments will 17034 * be completed in ill_glist_insert -> ill_phyint_reinit 17035 */ 17036 if (!ill_allocate_mibs(ill)) 17037 return (ENOMEM); 17038 17039 /* 17040 * Pick a default sap until we get the DL_INFO_ACK back from 17041 * the driver. 17042 */ 17043 ill->ill_sap = (ill->ill_isv6) ? ill->ill_media->ip_m_ipv6sap : 17044 ill->ill_media->ip_m_ipv4sap; 17045 17046 ill->ill_ifname_pending = 1; 17047 ill->ill_ifname_pending_err = 0; 17048 17049 /* 17050 * When the first ipif comes up in ipif_up_done(), multicast groups 17051 * that were joined while this ill was not bound to the DLPI link need 17052 * to be recovered by ill_recover_multicast(). 17053 */ 17054 ill->ill_need_recover_multicast = 1; 17055 17056 ill_refhold(ill); 17057 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 17058 if ((error = ill_glist_insert(ill, interf_name, 17059 (ill->ill_flags & ILLF_IPV6) == ILLF_IPV6)) > 0) { 17060 ill->ill_ppa = UINT_MAX; 17061 ill->ill_name[0] = '\0'; 17062 /* 17063 * undo null termination done above. 17064 */ 17065 ppa_ptr[0] = old_char; 17066 rw_exit(&ipst->ips_ill_g_lock); 17067 ill_refrele(ill); 17068 return (error); 17069 } 17070 17071 ASSERT(ill->ill_name_length <= LIFNAMSIZ); 17072 17073 /* 17074 * When we return the buffer pointed to by interf_name should contain 17075 * the same name as in ill_name. 17076 * If a ppa was choosen by the system (ppa passed in was UINT_MAX) 17077 * the buffer pointed to by new_ppa_ptr would not contain the right ppa 17078 * so copy full name and update the ppa ptr. 17079 * When ppa passed in != UINT_MAX all values are correct just undo 17080 * null termination, this saves a bcopy. 17081 */ 17082 if (*new_ppa_ptr == UINT_MAX) { 17083 bcopy(ill->ill_name, interf_name, ill->ill_name_length); 17084 *new_ppa_ptr = ill->ill_ppa; 17085 } else { 17086 /* 17087 * undo null termination done above. 
17088 */ 17089 ppa_ptr[0] = old_char; 17090 } 17091 17092 /* Let SCTP know about this ILL */ 17093 sctp_update_ill(ill, SCTP_ILL_INSERT); 17094 17095 /* 17096 * ill_glist_insert has made the ill visible globally, and 17097 * ill_phyint_reinit could have changed the ipsq. At this point, 17098 * we need to hold the ips_ill_g_lock across the call to enter the 17099 * ipsq to enforce atomicity and prevent reordering. In the event 17100 * the ipsq has changed, and if the new ipsq is currently busy, 17101 * we need to make sure that this half-completed ioctl is ahead of 17102 * any subsequent ioctl. We achieve this by not dropping the 17103 * ips_ill_g_lock which prevents any ill lookup itself thereby 17104 * ensuring that new ioctls can't start. 17105 */ 17106 ipsq = ipsq_try_enter_internal(ill, q, mp, ip_reprocess_ioctl, NEW_OP, 17107 B_TRUE); 17108 17109 rw_exit(&ipst->ips_ill_g_lock); 17110 ill_refrele(ill); 17111 if (ipsq == NULL) 17112 return (EINPROGRESS); 17113 17114 /* 17115 * If ill_phyint_reinit() changed our ipsq, then start on the new ipsq. 17116 */ 17117 if (ipsq->ipsq_xop->ipx_current_ipif == NULL) 17118 ipsq_current_start(ipsq, ipif, SIOCSLIFNAME); 17119 else 17120 ASSERT(ipsq->ipsq_xop->ipx_current_ipif == ipif); 17121 17122 error = ipif_set_values_tail(ill, ipif, mp, q); 17123 ipsq_exit(ipsq); 17124 if (error != 0 && error != EINPROGRESS) { 17125 /* 17126 * restore previous values 17127 */ 17128 ill->ill_isv6 = B_FALSE; 17129 ill_set_inputfn(ill); 17130 } 17131 return (error); 17132 } 17133 17134 void 17135 ipif_init(ip_stack_t *ipst) 17136 { 17137 int i; 17138 17139 for (i = 0; i < MAX_G_HEADS; i++) { 17140 ipst->ips_ill_g_heads[i].ill_g_list_head = 17141 (ill_if_t *)&ipst->ips_ill_g_heads[i]; 17142 ipst->ips_ill_g_heads[i].ill_g_list_tail = 17143 (ill_if_t *)&ipst->ips_ill_g_heads[i]; 17144 } 17145 17146 avl_create(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, 17147 ill_phyint_compare_index, 17148 sizeof (phyint_t), 17149 offsetof(struct phyint, phyint_avl_by_index)); 17150 avl_create(&ipst->ips_phyint_g_list->phyint_list_avl_by_name, 17151 ill_phyint_compare_name, 17152 sizeof (phyint_t), 17153 offsetof(struct phyint, phyint_avl_by_name)); 17154 } 17155 17156 /* 17157 * Save enough information so that we can recreate the IRE if 17158 * the interface goes down and then up. 
17159 */ 17160 void 17161 ill_save_ire(ill_t *ill, ire_t *ire) 17162 { 17163 mblk_t *save_mp; 17164 17165 save_mp = allocb(sizeof (ifrt_t), BPRI_MED); 17166 if (save_mp != NULL) { 17167 ifrt_t *ifrt; 17168 17169 save_mp->b_wptr += sizeof (ifrt_t); 17170 ifrt = (ifrt_t *)save_mp->b_rptr; 17171 bzero(ifrt, sizeof (ifrt_t)); 17172 ifrt->ifrt_type = ire->ire_type; 17173 if (ire->ire_ipversion == IPV4_VERSION) { 17174 ASSERT(!ill->ill_isv6); 17175 ifrt->ifrt_addr = ire->ire_addr; 17176 ifrt->ifrt_gateway_addr = ire->ire_gateway_addr; 17177 ifrt->ifrt_setsrc_addr = ire->ire_setsrc_addr; 17178 ifrt->ifrt_mask = ire->ire_mask; 17179 } else { 17180 ASSERT(ill->ill_isv6); 17181 ifrt->ifrt_v6addr = ire->ire_addr_v6; 17182 /* ire_gateway_addr_v6 can change due to RTM_CHANGE */ 17183 mutex_enter(&ire->ire_lock); 17184 ifrt->ifrt_v6gateway_addr = ire->ire_gateway_addr_v6; 17185 mutex_exit(&ire->ire_lock); 17186 ifrt->ifrt_v6setsrc_addr = ire->ire_setsrc_addr_v6; 17187 ifrt->ifrt_v6mask = ire->ire_mask_v6; 17188 } 17189 ifrt->ifrt_flags = ire->ire_flags; 17190 ifrt->ifrt_zoneid = ire->ire_zoneid; 17191 mutex_enter(&ill->ill_saved_ire_lock); 17192 save_mp->b_cont = ill->ill_saved_ire_mp; 17193 ill->ill_saved_ire_mp = save_mp; 17194 ill->ill_saved_ire_cnt++; 17195 mutex_exit(&ill->ill_saved_ire_lock); 17196 } 17197 } 17198 17199 /* 17200 * Remove one entry from ill_saved_ire_mp. 17201 */ 17202 void 17203 ill_remove_saved_ire(ill_t *ill, ire_t *ire) 17204 { 17205 mblk_t **mpp; 17206 mblk_t *mp; 17207 ifrt_t *ifrt; 17208 17209 /* Remove from ill_saved_ire_mp list if it is there */ 17210 mutex_enter(&ill->ill_saved_ire_lock); 17211 for (mpp = &ill->ill_saved_ire_mp; *mpp != NULL; 17212 mpp = &(*mpp)->b_cont) { 17213 in6_addr_t gw_addr_v6; 17214 17215 /* 17216 * On a given ill, the tuple of address, gateway, mask, 17217 * ire_type, and zoneid is unique for each saved IRE. 17218 */ 17219 mp = *mpp; 17220 ifrt = (ifrt_t *)mp->b_rptr; 17221 /* ire_gateway_addr_v6 can change - need lock */ 17222 mutex_enter(&ire->ire_lock); 17223 gw_addr_v6 = ire->ire_gateway_addr_v6; 17224 mutex_exit(&ire->ire_lock); 17225 17226 if (ifrt->ifrt_zoneid != ire->ire_zoneid || 17227 ifrt->ifrt_type != ire->ire_type) 17228 continue; 17229 17230 if (ill->ill_isv6 ? 17231 (IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6addr, 17232 &ire->ire_addr_v6) && 17233 IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6gateway_addr, 17234 &gw_addr_v6) && 17235 IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6mask, 17236 &ire->ire_mask_v6)) : 17237 (ifrt->ifrt_addr == ire->ire_addr && 17238 ifrt->ifrt_gateway_addr == ire->ire_gateway_addr && 17239 ifrt->ifrt_mask == ire->ire_mask)) { 17240 *mpp = mp->b_cont; 17241 ill->ill_saved_ire_cnt--; 17242 freeb(mp); 17243 break; 17244 } 17245 } 17246 mutex_exit(&ill->ill_saved_ire_lock); 17247 } 17248 17249 /* 17250 * IP multirouting broadcast routes handling 17251 * Append CGTP broadcast IREs to regular ones created 17252 * at ifconfig time. 17253 * The usage is a route add <cgtp_bc> <nic_bc> -multirt i.e., both 17254 * the destination and the gateway are broadcast addresses. 17255 * The caller has verified that the destination is an IRE_BROADCAST and that 17256 * RTF_MULTIRT was set. Here if the gateway is a broadcast address, then 17257 * we create a MULTIRT IRE_BROADCAST. 17258 * Note that the IRE_HOST created by ire_rt_add doesn't get found by anything 17259 * since the IRE_BROADCAST takes precedence; ire_add_v4 does head insertion. 
17260 */ 17261 static void 17262 ip_cgtp_bcast_add(ire_t *ire, ip_stack_t *ipst) 17263 { 17264 ire_t *ire_prim; 17265 17266 ASSERT(ire != NULL); 17267 17268 ire_prim = ire_ftable_lookup_v4(ire->ire_gateway_addr, 0, 0, 17269 IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst, 17270 NULL); 17271 if (ire_prim != NULL) { 17272 /* 17273 * We are in the special case of broadcasts for 17274 * CGTP. We add an IRE_BROADCAST that holds 17275 * the RTF_MULTIRT flag, the destination 17276 * address and the low level 17277 * info of ire_prim. In other words, CGTP 17278 * broadcast is added to the redundant ipif. 17279 */ 17280 ill_t *ill_prim; 17281 ire_t *bcast_ire; 17282 17283 ill_prim = ire_prim->ire_ill; 17284 17285 ip2dbg(("ip_cgtp_filter_bcast_add: ire_prim %p, ill_prim %p\n", 17286 (void *)ire_prim, (void *)ill_prim)); 17287 17288 bcast_ire = ire_create( 17289 (uchar_t *)&ire->ire_addr, 17290 (uchar_t *)&ip_g_all_ones, 17291 (uchar_t *)&ire->ire_gateway_addr, 17292 IRE_BROADCAST, 17293 ill_prim, 17294 GLOBAL_ZONEID, /* CGTP is only for the global zone */ 17295 ire->ire_flags | RTF_KERNEL, 17296 NULL, 17297 ipst); 17298 17299 /* 17300 * Here we assume that ire_add does head insertion so that 17301 * the added IRE_BROADCAST comes before the existing IRE_HOST. 17302 */ 17303 if (bcast_ire != NULL) { 17304 if (ire->ire_flags & RTF_SETSRC) { 17305 bcast_ire->ire_setsrc_addr = 17306 ire->ire_setsrc_addr; 17307 } 17308 bcast_ire = ire_add(bcast_ire); 17309 if (bcast_ire != NULL) { 17310 ip2dbg(("ip_cgtp_filter_bcast_add: " 17311 "added bcast_ire %p\n", 17312 (void *)bcast_ire)); 17313 17314 ill_save_ire(ill_prim, bcast_ire); 17315 ire_refrele(bcast_ire); 17316 } 17317 } 17318 ire_refrele(ire_prim); 17319 } 17320 } 17321 17322 /* 17323 * IP multirouting broadcast routes handling 17324 * Remove the broadcast ire. 17325 * The usage is a route delete <cgtp_bc> <nic_bc> -multirt i.e., both 17326 * the destination and the gateway are broadcast addresses. 17327 * The caller has only verified that RTF_MULTIRT was set. We check 17328 * that the destination is broadcast and that the gateway is a broadcast 17329 * address, and if so delete the IRE added by ip_cgtp_bcast_add(). 17330 */ 17331 static void 17332 ip_cgtp_bcast_delete(ire_t *ire, ip_stack_t *ipst) 17333 { 17334 ASSERT(ire != NULL); 17335 17336 if (ip_type_v4(ire->ire_addr, ipst) == IRE_BROADCAST) { 17337 ire_t *ire_prim; 17338 17339 ire_prim = ire_ftable_lookup_v4(ire->ire_gateway_addr, 0, 0, 17340 IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, 17341 ipst, NULL); 17342 if (ire_prim != NULL) { 17343 ill_t *ill_prim; 17344 ire_t *bcast_ire; 17345 17346 ill_prim = ire_prim->ire_ill; 17347 17348 ip2dbg(("ip_cgtp_filter_bcast_delete: " 17349 "ire_prim %p, ill_prim %p\n", 17350 (void *)ire_prim, (void *)ill_prim)); 17351 17352 bcast_ire = ire_ftable_lookup_v4(ire->ire_addr, 0, 17353 ire->ire_gateway_addr, IRE_BROADCAST, 17354 ill_prim, ALL_ZONES, NULL, 17355 MATCH_IRE_TYPE | MATCH_IRE_GW | MATCH_IRE_ILL | 17356 MATCH_IRE_MASK, 0, ipst, NULL); 17357 17358 if (bcast_ire != NULL) { 17359 ip2dbg(("ip_cgtp_filter_bcast_delete: " 17360 "looked up bcast_ire %p\n", 17361 (void *)bcast_ire)); 17362 ill_remove_saved_ire(bcast_ire->ire_ill, 17363 bcast_ire); 17364 ire_delete(bcast_ire); 17365 ire_refrele(bcast_ire); 17366 } 17367 ire_refrele(ire_prim); 17368 } 17369 } 17370 } 17371 17372 /* 17373 * Derive an interface id from the link layer address. 17374 * Knows about IEEE 802 and IEEE EUI-64 mappings. 
17375 */ 17376 static void 17377 ip_ether_v6intfid(ill_t *ill, in6_addr_t *v6addr) 17378 { 17379 char *addr; 17380 17381 /* 17382 * Note that some IPv6 interfaces get plumbed over links that claim to 17383 * be DL_ETHER, but don't actually have Ethernet MAC addresses (e.g. 17384 * PPP links). The ETHERADDRL check here ensures that we only set the 17385 * interface ID on IPv6 interfaces above links that actually have real 17386 * Ethernet addresses. 17387 */ 17388 if (ill->ill_phys_addr_length == ETHERADDRL) { 17389 /* Form EUI-64 like address */ 17390 addr = (char *)&v6addr->s6_addr32[2]; 17391 bcopy(ill->ill_phys_addr, addr, 3); 17392 addr[0] ^= 0x2; /* Toggle Universal/Local bit */ 17393 addr[3] = (char)0xff; 17394 addr[4] = (char)0xfe; 17395 bcopy(ill->ill_phys_addr + 3, addr + 5, 3); 17396 } 17397 } 17398 17399 /* ARGSUSED */ 17400 static void 17401 ip_nodef_v6intfid(ill_t *ill, in6_addr_t *v6addr) 17402 { 17403 } 17404 17405 typedef struct ipmp_ifcookie { 17406 uint32_t ic_hostid; 17407 char ic_ifname[LIFNAMSIZ]; 17408 char ic_zonename[ZONENAME_MAX]; 17409 } ipmp_ifcookie_t; 17410 17411 /* 17412 * Construct a pseudo-random interface ID for the IPMP interface that's both 17413 * predictable and (almost) guaranteed to be unique. 17414 */ 17415 static void 17416 ip_ipmp_v6intfid(ill_t *ill, in6_addr_t *v6addr) 17417 { 17418 zone_t *zp; 17419 uint8_t *addr; 17420 uchar_t hash[16]; 17421 ulong_t hostid; 17422 MD5_CTX ctx; 17423 ipmp_ifcookie_t ic = { 0 }; 17424 17425 ASSERT(IS_IPMP(ill)); 17426 17427 (void) ddi_strtoul(hw_serial, NULL, 10, &hostid); 17428 ic.ic_hostid = htonl((uint32_t)hostid); 17429 17430 (void) strlcpy(ic.ic_ifname, ill->ill_name, LIFNAMSIZ); 17431 17432 if ((zp = zone_find_by_id(ill->ill_zoneid)) != NULL) { 17433 (void) strlcpy(ic.ic_zonename, zp->zone_name, ZONENAME_MAX); 17434 zone_rele(zp); 17435 } 17436 17437 MD5Init(&ctx); 17438 MD5Update(&ctx, &ic, sizeof (ic)); 17439 MD5Final(hash, &ctx); 17440 17441 /* 17442 * Map the hash to an interface ID per the basic approach in RFC3041. 17443 */ 17444 addr = &v6addr->s6_addr8[8]; 17445 bcopy(hash + 8, addr, sizeof (uint64_t)); 17446 addr[0] &= ~0x2; /* set local bit */ 17447 } 17448 17449 /* 17450 * Map the multicast in6_addr_t in m_ip6addr to the physaddr for ethernet. 17451 */ 17452 static void 17453 ip_ether_v6_mapping(ill_t *ill, uchar_t *m_ip6addr, uchar_t *m_physaddr) 17454 { 17455 phyint_t *phyi = ill->ill_phyint; 17456 17457 /* 17458 * Check PHYI_MULTI_BCAST and length of physical 17459 * address to determine if we use the mapping or the 17460 * broadcast address. 17461 */ 17462 if ((phyi->phyint_flags & PHYI_MULTI_BCAST) != 0 || 17463 ill->ill_phys_addr_length != ETHERADDRL) { 17464 ip_mbcast_mapping(ill, m_ip6addr, m_physaddr); 17465 return; 17466 } 17467 m_physaddr[0] = 0x33; 17468 m_physaddr[1] = 0x33; 17469 m_physaddr[2] = m_ip6addr[12]; 17470 m_physaddr[3] = m_ip6addr[13]; 17471 m_physaddr[4] = m_ip6addr[14]; 17472 m_physaddr[5] = m_ip6addr[15]; 17473 } 17474 17475 /* 17476 * Map the multicast ipaddr_t in m_ipaddr to the physaddr for ethernet. 17477 */ 17478 static void 17479 ip_ether_v4_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr) 17480 { 17481 phyint_t *phyi = ill->ill_phyint; 17482 17483 /* 17484 * Check PHYI_MULTI_BCAST and length of physical 17485 * address to determine if we use the mapping or the 17486 * broadcast address. 
17487 */ 17488 if ((phyi->phyint_flags & PHYI_MULTI_BCAST) != 0 || 17489 ill->ill_phys_addr_length != ETHERADDRL) { 17490 ip_mbcast_mapping(ill, m_ipaddr, m_physaddr); 17491 return; 17492 } 17493 m_physaddr[0] = 0x01; 17494 m_physaddr[1] = 0x00; 17495 m_physaddr[2] = 0x5e; 17496 m_physaddr[3] = m_ipaddr[1] & 0x7f; 17497 m_physaddr[4] = m_ipaddr[2]; 17498 m_physaddr[5] = m_ipaddr[3]; 17499 } 17500 17501 /* ARGSUSED */ 17502 static void 17503 ip_mbcast_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr) 17504 { 17505 /* 17506 * for the MULTI_BCAST case and other cases when we want to 17507 * use the link-layer broadcast address for multicast. 17508 */ 17509 uint8_t *bphys_addr; 17510 dl_unitdata_req_t *dlur; 17511 17512 dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr; 17513 if (ill->ill_sap_length < 0) { 17514 bphys_addr = (uchar_t *)dlur + 17515 dlur->dl_dest_addr_offset; 17516 } else { 17517 bphys_addr = (uchar_t *)dlur + 17518 dlur->dl_dest_addr_offset + ill->ill_sap_length; 17519 } 17520 17521 bcopy(bphys_addr, m_physaddr, ill->ill_phys_addr_length); 17522 } 17523 17524 /* 17525 * Derive IPoIB interface id from the link layer address. 17526 */ 17527 static void 17528 ip_ib_v6intfid(ill_t *ill, in6_addr_t *v6addr) 17529 { 17530 char *addr; 17531 17532 ASSERT(ill->ill_phys_addr_length == 20); 17533 addr = (char *)&v6addr->s6_addr32[2]; 17534 bcopy(ill->ill_phys_addr + 12, addr, 8); 17535 /* 17536 * In IBA 1.1 timeframe, some vendors erroneously set the u/l bit 17537 * in the globally assigned EUI-64 GUID to 1, in violation of IEEE 17538 * rules. In these cases, the IBA considers these GUIDs to be in 17539 * "Modified EUI-64" format, and thus toggling the u/l bit is not 17540 * required; vendors are required not to assign global EUI-64's 17541 * that differ only in u/l bit values, thus guaranteeing uniqueness 17542 * of the interface identifier. Whether the GUID is in modified 17543 * or proper EUI-64 format, the ipv6 identifier must have the u/l 17544 * bit set to 1. 17545 */ 17546 addr[0] |= 2; /* Set Universal/Local bit to 1 */ 17547 } 17548 17549 /* 17550 * Map the multicast ipaddr_t in m_ipaddr to the physaddr for InfiniBand. 17551 * Note on mapping from multicast IP addresses to IPoIB multicast link 17552 * addresses. IPoIB multicast link addresses are based on IBA link addresses. 17553 * The format of an IPoIB multicast address is: 17554 * 17555 * 4 byte QPN Scope Sign. Pkey 17556 * +--------------------------------------------+ 17557 * | 00FFFFFF | FF | 1X | X01B | Pkey | GroupID | 17558 * +--------------------------------------------+ 17559 * 17560 * The Scope and Pkey components are properties of the IBA port and 17561 * network interface. They can be ascertained from the broadcast address. 17562 * The Sign. part is the signature, and is 401B for IPv4 and 601B for IPv6. 17563 */ 17564 static void 17565 ip_ib_v4_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr) 17566 { 17567 static uint8_t ipv4_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff, 17568 0xff, 0x10, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00, 17569 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; 17570 uint8_t *bphys_addr; 17571 dl_unitdata_req_t *dlur; 17572 17573 bcopy(ipv4_g_phys_ibmulti_addr, m_physaddr, ill->ill_phys_addr_length); 17574 17575 /* 17576 * RFC 4391: IPv4 MGID is 28-bit long. 
17577 */ 17578 m_physaddr[16] = m_ipaddr[0] & 0x0f; 17579 m_physaddr[17] = m_ipaddr[1]; 17580 m_physaddr[18] = m_ipaddr[2]; 17581 m_physaddr[19] = m_ipaddr[3]; 17582 17583 17584 dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr; 17585 if (ill->ill_sap_length < 0) { 17586 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset; 17587 } else { 17588 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset + 17589 ill->ill_sap_length; 17590 } 17591 /* 17592 * Now fill in the IBA scope/Pkey values from the broadcast address. 17593 */ 17594 m_physaddr[5] = bphys_addr[5]; 17595 m_physaddr[8] = bphys_addr[8]; 17596 m_physaddr[9] = bphys_addr[9]; 17597 } 17598 17599 static void 17600 ip_ib_v6_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr) 17601 { 17602 static uint8_t ipv4_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff, 17603 0xff, 0x10, 0x60, 0x1b, 0x00, 0x00, 0x00, 0x00, 17604 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; 17605 uint8_t *bphys_addr; 17606 dl_unitdata_req_t *dlur; 17607 17608 bcopy(ipv4_g_phys_ibmulti_addr, m_physaddr, ill->ill_phys_addr_length); 17609 17610 /* 17611 * RFC 4391: IPv4 MGID is 80-bit long. 17612 */ 17613 bcopy(&m_ipaddr[6], &m_physaddr[10], 10); 17614 17615 dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr; 17616 if (ill->ill_sap_length < 0) { 17617 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset; 17618 } else { 17619 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset + 17620 ill->ill_sap_length; 17621 } 17622 /* 17623 * Now fill in the IBA scope/Pkey values from the broadcast address. 17624 */ 17625 m_physaddr[5] = bphys_addr[5]; 17626 m_physaddr[8] = bphys_addr[8]; 17627 m_physaddr[9] = bphys_addr[9]; 17628 } 17629 17630 /* 17631 * Derive IPv6 interface id from an IPv4 link-layer address (e.g. from an IPv4 17632 * tunnel). The IPv4 address simply get placed in the lower 4 bytes of the 17633 * IPv6 interface id. This is a suggested mechanism described in section 3.7 17634 * of RFC4213. 17635 */ 17636 static void 17637 ip_ipv4_genv6intfid(ill_t *ill, uint8_t *physaddr, in6_addr_t *v6addr) 17638 { 17639 ASSERT(ill->ill_phys_addr_length == sizeof (ipaddr_t)); 17640 v6addr->s6_addr32[2] = 0; 17641 bcopy(physaddr, &v6addr->s6_addr32[3], sizeof (ipaddr_t)); 17642 } 17643 17644 /* 17645 * Derive IPv6 interface id from an IPv6 link-layer address (e.g. from an IPv6 17646 * tunnel). The lower 8 bytes of the IPv6 address simply become the interface 17647 * id. 17648 */ 17649 static void 17650 ip_ipv6_genv6intfid(ill_t *ill, uint8_t *physaddr, in6_addr_t *v6addr) 17651 { 17652 in6_addr_t *v6lladdr = (in6_addr_t *)physaddr; 17653 17654 ASSERT(ill->ill_phys_addr_length == sizeof (in6_addr_t)); 17655 bcopy(&v6lladdr->s6_addr32[2], &v6addr->s6_addr32[2], 8); 17656 } 17657 17658 static void 17659 ip_ipv6_v6intfid(ill_t *ill, in6_addr_t *v6addr) 17660 { 17661 ip_ipv6_genv6intfid(ill, ill->ill_phys_addr, v6addr); 17662 } 17663 17664 static void 17665 ip_ipv6_v6destintfid(ill_t *ill, in6_addr_t *v6addr) 17666 { 17667 ip_ipv6_genv6intfid(ill, ill->ill_dest_addr, v6addr); 17668 } 17669 17670 static void 17671 ip_ipv4_v6intfid(ill_t *ill, in6_addr_t *v6addr) 17672 { 17673 ip_ipv4_genv6intfid(ill, ill->ill_phys_addr, v6addr); 17674 } 17675 17676 static void 17677 ip_ipv4_v6destintfid(ill_t *ill, in6_addr_t *v6addr) 17678 { 17679 ip_ipv4_genv6intfid(ill, ill->ill_dest_addr, v6addr); 17680 } 17681 17682 /* 17683 * Lookup an ill and verify that the zoneid has an ipif on that ill. 17684 * Returns an held ill, or NULL. 
17685 */ 17686 ill_t * 17687 ill_lookup_on_ifindex_zoneid(uint_t index, zoneid_t zoneid, boolean_t isv6, 17688 ip_stack_t *ipst) 17689 { 17690 ill_t *ill; 17691 ipif_t *ipif; 17692 17693 ill = ill_lookup_on_ifindex(index, isv6, ipst); 17694 if (ill == NULL) 17695 return (NULL); 17696 17697 mutex_enter(&ill->ill_lock); 17698 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 17699 if (IPIF_IS_CONDEMNED(ipif)) 17700 continue; 17701 if (zoneid != ALL_ZONES && ipif->ipif_zoneid != zoneid && 17702 ipif->ipif_zoneid != ALL_ZONES) 17703 continue; 17704 17705 mutex_exit(&ill->ill_lock); 17706 return (ill); 17707 } 17708 mutex_exit(&ill->ill_lock); 17709 ill_refrele(ill); 17710 return (NULL); 17711 } 17712 17713 /* 17714 * Return a pointer to an ipif_t given a combination of (ill_idx,ipif_id) 17715 * If a pointer to an ipif_t is returned then the caller will need to do 17716 * an ill_refrele(). 17717 */ 17718 ipif_t * 17719 ipif_getby_indexes(uint_t ifindex, uint_t lifidx, boolean_t isv6, 17720 ip_stack_t *ipst) 17721 { 17722 ipif_t *ipif; 17723 ill_t *ill; 17724 17725 ill = ill_lookup_on_ifindex(ifindex, isv6, ipst); 17726 if (ill == NULL) 17727 return (NULL); 17728 17729 mutex_enter(&ill->ill_lock); 17730 if (ill->ill_state_flags & ILL_CONDEMNED) { 17731 mutex_exit(&ill->ill_lock); 17732 ill_refrele(ill); 17733 return (NULL); 17734 } 17735 17736 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { 17737 if (!IPIF_CAN_LOOKUP(ipif)) 17738 continue; 17739 if (lifidx == ipif->ipif_id) { 17740 ipif_refhold_locked(ipif); 17741 break; 17742 } 17743 } 17744 17745 mutex_exit(&ill->ill_lock); 17746 ill_refrele(ill); 17747 return (ipif); 17748 } 17749 17750 /* 17751 * Set ill_inputfn based on the current know state. 17752 * This needs to be called when any of the factors taken into 17753 * account changes. 17754 */ 17755 void 17756 ill_set_inputfn(ill_t *ill) 17757 { 17758 ip_stack_t *ipst = ill->ill_ipst; 17759 17760 if (ill->ill_isv6) { 17761 if (is_system_labeled()) 17762 ill->ill_inputfn = ill_input_full_v6; 17763 else 17764 ill->ill_inputfn = ill_input_short_v6; 17765 } else { 17766 if (is_system_labeled()) 17767 ill->ill_inputfn = ill_input_full_v4; 17768 else if (ill->ill_dhcpinit != 0) 17769 ill->ill_inputfn = ill_input_full_v4; 17770 else if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_RSVP].connf_head 17771 != NULL) 17772 ill->ill_inputfn = ill_input_full_v4; 17773 else if (ipst->ips_ip_cgtp_filter && 17774 ipst->ips_ip_cgtp_filter_ops != NULL) 17775 ill->ill_inputfn = ill_input_full_v4; 17776 else 17777 ill->ill_inputfn = ill_input_short_v4; 17778 } 17779 } 17780 17781 /* 17782 * Re-evaluate ill_inputfn for all the IPv4 ills. 17783 * Used when RSVP and CGTP comes and goes. 17784 */ 17785 void 17786 ill_set_inputfn_all(ip_stack_t *ipst) 17787 { 17788 ill_walk_context_t ctx; 17789 ill_t *ill; 17790 17791 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 17792 ill = ILL_START_WALK_V4(&ctx, ipst); 17793 for (; ill != NULL; ill = ill_next(&ctx, ill)) 17794 ill_set_inputfn(ill); 17795 17796 rw_exit(&ipst->ips_ill_g_lock); 17797 } 17798 17799 /* 17800 * Set the physical address information for `ill' to the contents of the 17801 * dl_notify_ind_t pointed to by `mp'. Must be called as writer, and will be 17802 * asynchronous if `ill' cannot immediately be quiesced -- in which case 17803 * EINPROGRESS will be returned. 
17804 */ 17805 int 17806 ill_set_phys_addr(ill_t *ill, mblk_t *mp) 17807 { 17808 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 17809 dl_notify_ind_t *dlindp = (dl_notify_ind_t *)mp->b_rptr; 17810 17811 ASSERT(IAM_WRITER_IPSQ(ipsq)); 17812 17813 if (dlindp->dl_data != DL_IPV6_LINK_LAYER_ADDR && 17814 dlindp->dl_data != DL_CURR_DEST_ADDR && 17815 dlindp->dl_data != DL_CURR_PHYS_ADDR) { 17816 /* Changing DL_IPV6_TOKEN is not yet supported */ 17817 return (0); 17818 } 17819 17820 /* 17821 * We need to store up to two copies of `mp' in `ill'. Due to the 17822 * design of ipsq_pending_mp_add(), we can't pass them as separate 17823 * arguments to ill_set_phys_addr_tail(). Instead, chain them 17824 * together here, then pull 'em apart in ill_set_phys_addr_tail(). 17825 */ 17826 if ((mp = copyb(mp)) == NULL || (mp->b_cont = copyb(mp)) == NULL) { 17827 freemsg(mp); 17828 return (ENOMEM); 17829 } 17830 17831 ipsq_current_start(ipsq, ill->ill_ipif, 0); 17832 17833 /* 17834 * Since we'll only do a logical down, we can't rely on ipif_down 17835 * to turn on ILL_DOWN_IN_PROGRESS, or for the DL_BIND_ACK to reset 17836 * ILL_DOWN_IN_PROGRESS. We instead manage this separately for this 17837 * case, to quiesce ire's and nce's for ill_is_quiescent. 17838 */ 17839 mutex_enter(&ill->ill_lock); 17840 ill->ill_state_flags |= ILL_DOWN_IN_PROGRESS; 17841 /* no more ire/nce addition allowed */ 17842 mutex_exit(&ill->ill_lock); 17843 17844 /* 17845 * If we can quiesce the ill, then set the address. If not, then 17846 * ill_set_phys_addr_tail() will be called from ipif_ill_refrele_tail(). 17847 */ 17848 ill_down_ipifs(ill, B_TRUE); 17849 mutex_enter(&ill->ill_lock); 17850 if (!ill_is_quiescent(ill)) { 17851 /* call cannot fail since `conn_t *' argument is NULL */ 17852 (void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq, 17853 mp, ILL_DOWN); 17854 mutex_exit(&ill->ill_lock); 17855 return (EINPROGRESS); 17856 } 17857 mutex_exit(&ill->ill_lock); 17858 17859 ill_set_phys_addr_tail(ipsq, ill->ill_rq, mp, NULL); 17860 return (0); 17861 } 17862 17863 /* 17864 * When the allowed-ips link property is set on the datalink, IP receives a 17865 * DL_NOTE_ALLOWED_IPS notification that is processed in ill_set_allowed_ips() 17866 * to initialize the ill_allowed_ips[] array in the ill_t. This array is then 17867 * used to vet addresses passed to ip_sioctl_addr() and to ensure that the 17868 * only IP addresses configured on the ill_t are those in the ill_allowed_ips[] 17869 * array. 
17870 */ 17871 void 17872 ill_set_allowed_ips(ill_t *ill, mblk_t *mp) 17873 { 17874 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 17875 dl_notify_ind_t *dlip = (dl_notify_ind_t *)mp->b_rptr; 17876 mac_protect_t *mrp; 17877 int i; 17878 17879 ASSERT(IAM_WRITER_IPSQ(ipsq)); 17880 mrp = (mac_protect_t *)&dlip[1]; 17881 17882 if (mrp->mp_ipaddrcnt == 0) { /* reset allowed-ips */ 17883 kmem_free(ill->ill_allowed_ips, 17884 ill->ill_allowed_ips_cnt * sizeof (in6_addr_t)); 17885 ill->ill_allowed_ips_cnt = 0; 17886 ill->ill_allowed_ips = NULL; 17887 mutex_enter(&ill->ill_phyint->phyint_lock); 17888 ill->ill_phyint->phyint_flags &= ~PHYI_L3PROTECT; 17889 mutex_exit(&ill->ill_phyint->phyint_lock); 17890 return; 17891 } 17892 17893 if (ill->ill_allowed_ips != NULL) { 17894 kmem_free(ill->ill_allowed_ips, 17895 ill->ill_allowed_ips_cnt * sizeof (in6_addr_t)); 17896 } 17897 ill->ill_allowed_ips_cnt = mrp->mp_ipaddrcnt; 17898 ill->ill_allowed_ips = kmem_alloc( 17899 ill->ill_allowed_ips_cnt * sizeof (in6_addr_t), KM_SLEEP); 17900 for (i = 0; i < mrp->mp_ipaddrcnt; i++) 17901 ill->ill_allowed_ips[i] = mrp->mp_ipaddrs[i].ip_addr; 17902 17903 mutex_enter(&ill->ill_phyint->phyint_lock); 17904 ill->ill_phyint->phyint_flags |= PHYI_L3PROTECT; 17905 mutex_exit(&ill->ill_phyint->phyint_lock); 17906 } 17907 17908 /* 17909 * Once the ill associated with `q' has quiesced, set its physical address 17910 * information to the values in `addrmp'. Note that two copies of `addrmp' 17911 * are passed (linked by b_cont), since we sometimes need to save two distinct 17912 * copies in the ill_t, and our context doesn't permit sleeping or allocation 17913 * failure (we'll free the other copy if it's not needed). Since the ill_t 17914 * is quiesced, we know any stale nce's with the old address information have 17915 * already been removed, so we don't need to call nce_flush(). 17916 */ 17917 /* ARGSUSED */ 17918 static void 17919 ill_set_phys_addr_tail(ipsq_t *ipsq, queue_t *q, mblk_t *addrmp, void *dummy) 17920 { 17921 ill_t *ill = q->q_ptr; 17922 mblk_t *addrmp2 = unlinkb(addrmp); 17923 dl_notify_ind_t *dlindp = (dl_notify_ind_t *)addrmp->b_rptr; 17924 uint_t addrlen, addroff; 17925 int status; 17926 17927 ASSERT(IAM_WRITER_IPSQ(ipsq)); 17928 17929 addroff = dlindp->dl_addr_offset; 17930 addrlen = dlindp->dl_addr_length - ABS(ill->ill_sap_length); 17931 17932 switch (dlindp->dl_data) { 17933 case DL_IPV6_LINK_LAYER_ADDR: 17934 ill_set_ndmp(ill, addrmp, addroff, addrlen); 17935 freemsg(addrmp2); 17936 break; 17937 17938 case DL_CURR_DEST_ADDR: 17939 freemsg(ill->ill_dest_addr_mp); 17940 ill->ill_dest_addr = addrmp->b_rptr + addroff; 17941 ill->ill_dest_addr_mp = addrmp; 17942 if (ill->ill_isv6) { 17943 ill_setdesttoken(ill); 17944 ipif_setdestlinklocal(ill->ill_ipif); 17945 } 17946 freemsg(addrmp2); 17947 break; 17948 17949 case DL_CURR_PHYS_ADDR: 17950 freemsg(ill->ill_phys_addr_mp); 17951 ill->ill_phys_addr = addrmp->b_rptr + addroff; 17952 ill->ill_phys_addr_mp = addrmp; 17953 ill->ill_phys_addr_length = addrlen; 17954 if (ill->ill_isv6) 17955 ill_set_ndmp(ill, addrmp2, addroff, addrlen); 17956 else 17957 freemsg(addrmp2); 17958 if (ill->ill_isv6) { 17959 ill_setdefaulttoken(ill); 17960 ipif_setlinklocal(ill->ill_ipif); 17961 } 17962 break; 17963 default: 17964 ASSERT(0); 17965 } 17966 17967 /* 17968 * reset ILL_DOWN_IN_PROGRESS so that we can successfully add ires 17969 * as we bring the ipifs up again. 
17970 */ 17971 mutex_enter(&ill->ill_lock); 17972 ill->ill_state_flags &= ~ILL_DOWN_IN_PROGRESS; 17973 mutex_exit(&ill->ill_lock); 17974 /* 17975 * If there are ipifs to bring up, ill_up_ipifs() will return 17976 * EINPROGRESS, and ipsq_current_finish() will be called by 17977 * ip_rput_dlpi_writer() or arp_bringup_done() when the last ipif is 17978 * brought up. 17979 */ 17980 status = ill_up_ipifs(ill, q, addrmp); 17981 if (status != EINPROGRESS) 17982 ipsq_current_finish(ipsq); 17983 } 17984 17985 /* 17986 * Helper routine for setting the ill_nd_lla fields. 17987 */ 17988 void 17989 ill_set_ndmp(ill_t *ill, mblk_t *ndmp, uint_t addroff, uint_t addrlen) 17990 { 17991 freemsg(ill->ill_nd_lla_mp); 17992 ill->ill_nd_lla = ndmp->b_rptr + addroff; 17993 ill->ill_nd_lla_mp = ndmp; 17994 ill->ill_nd_lla_len = addrlen; 17995 } 17996 17997 /* 17998 * Replumb the ill. 17999 */ 18000 int 18001 ill_replumb(ill_t *ill, mblk_t *mp) 18002 { 18003 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; 18004 18005 ASSERT(IAM_WRITER_IPSQ(ipsq)); 18006 18007 ipsq_current_start(ipsq, ill->ill_ipif, 0); 18008 18009 /* 18010 * If we can quiesce the ill, then continue. If not, then 18011 * ill_replumb_tail() will be called from ipif_ill_refrele_tail(). 18012 */ 18013 ill_down_ipifs(ill, B_FALSE); 18014 18015 mutex_enter(&ill->ill_lock); 18016 if (!ill_is_quiescent(ill)) { 18017 /* call cannot fail since `conn_t *' argument is NULL */ 18018 (void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq, 18019 mp, ILL_DOWN); 18020 mutex_exit(&ill->ill_lock); 18021 return (EINPROGRESS); 18022 } 18023 mutex_exit(&ill->ill_lock); 18024 18025 ill_replumb_tail(ipsq, ill->ill_rq, mp, NULL); 18026 return (0); 18027 } 18028 18029 /* ARGSUSED */ 18030 static void 18031 ill_replumb_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy) 18032 { 18033 ill_t *ill = q->q_ptr; 18034 int err; 18035 conn_t *connp = NULL; 18036 18037 ASSERT(IAM_WRITER_IPSQ(ipsq)); 18038 freemsg(ill->ill_replumb_mp); 18039 ill->ill_replumb_mp = copyb(mp); 18040 18041 if (ill->ill_replumb_mp == NULL) { 18042 /* out of memory */ 18043 ipsq_current_finish(ipsq); 18044 return; 18045 } 18046 18047 mutex_enter(&ill->ill_lock); 18048 ill->ill_up_ipifs = ipsq_pending_mp_add(NULL, ill->ill_ipif, 18049 ill->ill_rq, ill->ill_replumb_mp, 0); 18050 mutex_exit(&ill->ill_lock); 18051 18052 if (!ill->ill_up_ipifs) { 18053 /* already closing */ 18054 ipsq_current_finish(ipsq); 18055 return; 18056 } 18057 ill->ill_replumbing = 1; 18058 err = ill_down_ipifs_tail(ill); 18059 18060 /* 18061 * Successfully quiesced and brought down the interface, now we send 18062 * the DL_NOTE_REPLUMB_DONE message down to the driver. Reuse the 18063 * DL_NOTE_REPLUMB message. 18064 */ 18065 mp = mexchange(NULL, mp, sizeof (dl_notify_conf_t), M_PROTO, 18066 DL_NOTIFY_CONF); 18067 ASSERT(mp != NULL); 18068 ((dl_notify_conf_t *)mp->b_rptr)->dl_notification = 18069 DL_NOTE_REPLUMB_DONE; 18070 ill_dlpi_send(ill, mp); 18071 18072 /* 18073 * For IPv4, we would usually get EINPROGRESS because the ETHERTYPE_ARP 18074 * streams have to be unbound. When all the DLPI exchanges are done, 18075 * ipsq_current_finish() will be called by arp_bringup_done(). The 18076 * remainder of ipif bringup via ill_up_ipifs() will also be done in 18077 * arp_bringup_done(). 
18078 */ 18079 ASSERT(ill->ill_replumb_mp != NULL); 18080 if (err == EINPROGRESS) 18081 return; 18082 else 18083 ill->ill_replumb_mp = ipsq_pending_mp_get(ipsq, &connp); 18084 ASSERT(connp == NULL); 18085 if (err == 0 && ill->ill_replumb_mp != NULL && 18086 ill_up_ipifs(ill, q, ill->ill_replumb_mp) == EINPROGRESS) { 18087 return; 18088 } 18089 ipsq_current_finish(ipsq); 18090 } 18091 18092 /* 18093 * Issue ioctl `cmd' on `lh'; caller provides the initial payload in `buf' 18094 * which is `bufsize' bytes. On success, zero is returned and `buf' updated 18095 * as per the ioctl. On failure, an errno is returned. 18096 */ 18097 static int 18098 ip_ioctl(ldi_handle_t lh, int cmd, void *buf, uint_t bufsize, cred_t *cr) 18099 { 18100 int rval; 18101 struct strioctl iocb; 18102 18103 iocb.ic_cmd = cmd; 18104 iocb.ic_timout = 15; 18105 iocb.ic_len = bufsize; 18106 iocb.ic_dp = buf; 18107 18108 return (ldi_ioctl(lh, I_STR, (intptr_t)&iocb, FKIOCTL, cr, &rval)); 18109 } 18110 18111 /* 18112 * Issue an SIOCGLIFCONF for address family `af' and store the result into a 18113 * dynamically-allocated `lifcp' that will be `bufsizep' bytes on success. 18114 */ 18115 static int 18116 ip_lifconf_ioctl(ldi_handle_t lh, int af, struct lifconf *lifcp, 18117 uint_t *bufsizep, cred_t *cr) 18118 { 18119 int err; 18120 struct lifnum lifn; 18121 18122 bzero(&lifn, sizeof (lifn)); 18123 lifn.lifn_family = af; 18124 lifn.lifn_flags = LIFC_UNDER_IPMP; 18125 18126 if ((err = ip_ioctl(lh, SIOCGLIFNUM, &lifn, sizeof (lifn), cr)) != 0) 18127 return (err); 18128 18129 /* 18130 * Pad the interface count to account for additional interfaces that 18131 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF. 18132 */ 18133 lifn.lifn_count += 4; 18134 bzero(lifcp, sizeof (*lifcp)); 18135 lifcp->lifc_flags = LIFC_UNDER_IPMP; 18136 lifcp->lifc_family = af; 18137 lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq); 18138 lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP); 18139 18140 err = ip_ioctl(lh, SIOCGLIFCONF, lifcp, sizeof (*lifcp), cr); 18141 if (err != 0) { 18142 kmem_free(lifcp->lifc_buf, *bufsizep); 18143 return (err); 18144 } 18145 18146 return (0); 18147 } 18148 18149 /* 18150 * Helper for ip_interface_cleanup() that removes the loopback interface. 18151 */ 18152 static void 18153 ip_loopback_removeif(ldi_handle_t lh, boolean_t isv6, cred_t *cr) 18154 { 18155 int err; 18156 struct lifreq lifr; 18157 18158 bzero(&lifr, sizeof (lifr)); 18159 (void) strcpy(lifr.lifr_name, ipif_loopback_name); 18160 18161 /* 18162 * Attempt to remove the interface. It may legitimately not exist 18163 * (e.g. the zone administrator unplumbed it), so ignore ENXIO. 18164 */ 18165 err = ip_ioctl(lh, SIOCLIFREMOVEIF, &lifr, sizeof (lifr), cr); 18166 if (err != 0 && err != ENXIO) { 18167 ip0dbg(("ip_loopback_removeif: IP%s SIOCLIFREMOVEIF failed: " 18168 "error %d\n", isv6 ? "v6" : "v4", err)); 18169 } 18170 } 18171 18172 /* 18173 * Helper for ip_interface_cleanup() that ensures no IP interfaces are in IPMP 18174 * groups and that IPMP data addresses are down. These conditions must be met 18175 * so that IPMP interfaces can be I_PUNLINK'd, as per ip_sioctl_plink_ipmp(). 18176 */ 18177 static void 18178 ip_ipmp_cleanup(ldi_handle_t lh, boolean_t isv6, cred_t *cr) 18179 { 18180 int af = isv6 ? 
AF_INET6 : AF_INET; 18181 int i, nifs; 18182 int err; 18183 uint_t bufsize; 18184 uint_t lifrsize = sizeof (struct lifreq); 18185 struct lifconf lifc; 18186 struct lifreq *lifrp; 18187 18188 if ((err = ip_lifconf_ioctl(lh, af, &lifc, &bufsize, cr)) != 0) { 18189 cmn_err(CE_WARN, "ip_ipmp_cleanup: cannot get interface list " 18190 "(error %d); any IPMP interfaces cannot be shut down", err); 18191 return; 18192 } 18193 18194 nifs = lifc.lifc_len / lifrsize; 18195 for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) { 18196 err = ip_ioctl(lh, SIOCGLIFFLAGS, lifrp, lifrsize, cr); 18197 if (err != 0) { 18198 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot get " 18199 "flags: error %d", lifrp->lifr_name, err); 18200 continue; 18201 } 18202 18203 if (lifrp->lifr_flags & IFF_IPMP) { 18204 if ((lifrp->lifr_flags & (IFF_UP|IFF_DUPLICATE)) == 0) 18205 continue; 18206 18207 lifrp->lifr_flags &= ~IFF_UP; 18208 err = ip_ioctl(lh, SIOCSLIFFLAGS, lifrp, lifrsize, cr); 18209 if (err != 0) { 18210 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot " 18211 "bring down (error %d); IPMP interface may " 18212 "not be shut down", lifrp->lifr_name, err); 18213 } 18214 18215 /* 18216 * Check if IFF_DUPLICATE is still set -- and if so, 18217 * reset the address to clear it. 18218 */ 18219 err = ip_ioctl(lh, SIOCGLIFFLAGS, lifrp, lifrsize, cr); 18220 if (err != 0 || !(lifrp->lifr_flags & IFF_DUPLICATE)) 18221 continue; 18222 18223 err = ip_ioctl(lh, SIOCGLIFADDR, lifrp, lifrsize, cr); 18224 if (err != 0 || (err = ip_ioctl(lh, SIOCSLIFADDR, 18225 lifrp, lifrsize, cr)) != 0) { 18226 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot " 18227 "reset DAD (error %d); IPMP interface may " 18228 "not be shut down", lifrp->lifr_name, err); 18229 } 18230 continue; 18231 } 18232 18233 if (strchr(lifrp->lifr_name, IPIF_SEPARATOR_CHAR) == NULL) { 18234 lifrp->lifr_groupname[0] = '\0'; 18235 if ((err = ip_ioctl(lh, SIOCSLIFGROUPNAME, lifrp, 18236 lifrsize, cr)) != 0) { 18237 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot " 18238 "leave IPMP group (error %d); associated " 18239 "IPMP interface may not be shut down", 18240 lifrp->lifr_name, err); 18241 continue; 18242 } 18243 } 18244 } 18245 18246 kmem_free(lifc.lifc_buf, bufsize); 18247 } 18248 18249 #define UDPDEV "/devices/pseudo/udp@0:udp" 18250 #define UDP6DEV "/devices/pseudo/udp6@0:udp6" 18251 18252 /* 18253 * Remove the loopback interfaces and prep the IPMP interfaces to be torn down. 18254 * Non-loopback interfaces are either I_LINK'd or I_PLINK'd; the former go away 18255 * when the user-level processes in the zone are killed and the latter are 18256 * cleaned up by str_stack_shutdown(). 18257 */ 18258 void 18259 ip_interface_cleanup(ip_stack_t *ipst) 18260 { 18261 ldi_handle_t lh; 18262 ldi_ident_t li; 18263 cred_t *cr; 18264 int err; 18265 int i; 18266 char *devs[] = { UDP6DEV, UDPDEV }; 18267 netstackid_t stackid = ipst->ips_netstack->netstack_stackid; 18268 18269 if ((err = ldi_ident_from_major(ddi_name_to_major("ip"), &li)) != 0) { 18270 cmn_err(CE_WARN, "ip_interface_cleanup: cannot get ldi ident:" 18271 " error %d", err); 18272 return; 18273 } 18274 18275 cr = zone_get_kcred(netstackid_to_zoneid(stackid)); 18276 ASSERT(cr != NULL); 18277 18278 /* 18279 * NOTE: loop executes exactly twice and is hardcoded to know that the 18280 * first iteration is IPv6. (Unrolling yields repetitious code, hence 18281 * the loop.)
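 * devs[0] is UDP6DEV, so the `i == 0' arguments passed below mean
 * isv6 == B_TRUE on the first pass.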
18282 */ 18283 for (i = 0; i < 2; i++) { 18284 err = ldi_open_by_name(devs[i], FREAD|FWRITE, cr, &lh, li); 18285 if (err != 0) { 18286 cmn_err(CE_WARN, "ip_interface_cleanup: cannot open %s:" 18287 " error %d", devs[i], err); 18288 continue; 18289 } 18290 18291 ip_loopback_removeif(lh, i == 0, cr); 18292 ip_ipmp_cleanup(lh, i == 0, cr); 18293 18294 (void) ldi_close(lh, FREAD|FWRITE, cr); 18295 } 18296 18297 ldi_ident_release(li); 18298 crfree(cr); 18299 } 18300 18301 /* 18302 * This needs to be in-sync with nic_event_t definition 18303 */ 18304 static const char * 18305 ill_hook_event2str(nic_event_t event) 18306 { 18307 switch (event) { 18308 case NE_PLUMB: 18309 return ("PLUMB"); 18310 case NE_UNPLUMB: 18311 return ("UNPLUMB"); 18312 case NE_UP: 18313 return ("UP"); 18314 case NE_DOWN: 18315 return ("DOWN"); 18316 case NE_ADDRESS_CHANGE: 18317 return ("ADDRESS_CHANGE"); 18318 case NE_LIF_UP: 18319 return ("LIF_UP"); 18320 case NE_LIF_DOWN: 18321 return ("LIF_DOWN"); 18322 case NE_IFINDEX_CHANGE: 18323 return ("IFINDEX_CHANGE"); 18324 default: 18325 return ("UNKNOWN"); 18326 } 18327 } 18328 18329 void 18330 ill_nic_event_dispatch(ill_t *ill, lif_if_t lif, nic_event_t event, 18331 nic_event_data_t data, size_t datalen) 18332 { 18333 ip_stack_t *ipst = ill->ill_ipst; 18334 hook_nic_event_int_t *info; 18335 const char *str = NULL; 18336 18337 /* create a new nic event info */ 18338 if ((info = kmem_alloc(sizeof (*info), KM_NOSLEEP)) == NULL) 18339 goto fail; 18340 18341 info->hnei_event.hne_nic = ill->ill_phyint->phyint_ifindex; 18342 info->hnei_event.hne_lif = lif; 18343 info->hnei_event.hne_event = event; 18344 info->hnei_event.hne_protocol = ill->ill_isv6 ? 18345 ipst->ips_ipv6_net_data : ipst->ips_ipv4_net_data; 18346 info->hnei_event.hne_data = NULL; 18347 info->hnei_event.hne_datalen = 0; 18348 info->hnei_stackid = ipst->ips_netstack->netstack_stackid; 18349 18350 if (data != NULL && datalen != 0) { 18351 info->hnei_event.hne_data = kmem_alloc(datalen, KM_NOSLEEP); 18352 if (info->hnei_event.hne_data == NULL) 18353 goto fail; 18354 bcopy(data, info->hnei_event.hne_data, datalen); 18355 info->hnei_event.hne_datalen = datalen; 18356 } 18357 18358 if (ddi_taskq_dispatch(eventq_queue_nic, ip_ne_queue_func, info, 18359 DDI_NOSLEEP) == DDI_SUCCESS) 18360 return; 18361 18362 fail: 18363 if (info != NULL) { 18364 if (info->hnei_event.hne_data != NULL) { 18365 kmem_free(info->hnei_event.hne_data, 18366 info->hnei_event.hne_datalen); 18367 } 18368 kmem_free(info, sizeof (hook_nic_event_t)); 18369 } 18370 str = ill_hook_event2str(event); 18371 ip2dbg(("ill_nic_event_dispatch: could not dispatch %s nic event " 18372 "information for %s (ENOMEM)\n", str, ill->ill_name)); 18373 } 18374 18375 static int 18376 ipif_arp_up_done_tail(ipif_t *ipif, enum ip_resolver_action res_act) 18377 { 18378 int err = 0; 18379 const in_addr_t *addr = NULL; 18380 nce_t *nce = NULL; 18381 ill_t *ill = ipif->ipif_ill; 18382 ill_t *bound_ill; 18383 boolean_t added_ipif = B_FALSE; 18384 uint16_t state; 18385 uint16_t flags; 18386 18387 DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_up_done_tail", 18388 ill_t *, ill, ipif_t *, ipif); 18389 if (ipif->ipif_lcl_addr != INADDR_ANY) { 18390 addr = &ipif->ipif_lcl_addr; 18391 } 18392 18393 if ((ipif->ipif_flags & IPIF_UNNUMBERED) || addr == NULL) { 18394 if (res_act != Res_act_initial) 18395 return (EINVAL); 18396 } 18397 18398 if (addr != NULL) { 18399 ipmp_illgrp_t *illg = ill->ill_grp; 18400 18401 /* add unicast nce for the local addr */ 18402 18403 if (IS_IPMP(ill)) { 18404 /* 
18405 * If we're here via ipif_up(), then the ipif 18406 * won't be bound yet -- add it to the group, 18407 * which will bind it if possible. (We would 18408 * add it in ipif_up(), but deleting on failure 18409 * there is gruesome.) If we're here via 18410 * ipmp_ill_bind_ipif(), then the ipif has 18411 * already been added to the group and we 18412 * just need to use the binding. 18413 */ 18414 if ((bound_ill = ipmp_ipif_bound_ill(ipif)) == NULL) { 18415 bound_ill = ipmp_illgrp_add_ipif(illg, ipif); 18416 if (bound_ill == NULL) { 18417 /* 18418 * We couldn't bind the ipif to an ill 18419 * yet, so we have nothing to publish. 18420 * Mark the address as ready and return. 18421 */ 18422 ipif->ipif_addr_ready = 1; 18423 return (0); 18424 } 18425 added_ipif = B_TRUE; 18426 } 18427 } else { 18428 bound_ill = ill; 18429 } 18430 18431 flags = (NCE_F_MYADDR | NCE_F_PUBLISH | NCE_F_AUTHORITY | 18432 NCE_F_NONUD); 18433 /* 18434 * If this is an initial bring-up (or the ipif was never 18435 * completely brought up), do DAD. Otherwise, we're here 18436 * because IPMP has rebound an address to this ill: send 18437 * unsolicited advertisements (ARP announcements) to 18438 * inform others. 18439 */ 18440 if (res_act == Res_act_initial || !ipif->ipif_addr_ready) { 18441 state = ND_UNCHANGED; /* compute in nce_add_common() */ 18442 } else { 18443 state = ND_REACHABLE; 18444 flags |= NCE_F_UNSOL_ADV; 18445 } 18446 18447 retry: 18448 err = nce_lookup_then_add_v4(ill, 18449 bound_ill->ill_phys_addr, bound_ill->ill_phys_addr_length, 18450 addr, flags, state, &nce); 18451 18452 /* 18453 * note that we may encounter EEXIST if we are moving 18454 * the nce as a result of a rebind operation. 18455 */ 18456 switch (err) { 18457 case 0: 18458 ipif->ipif_added_nce = 1; 18459 nce->nce_ipif_cnt++; 18460 break; 18461 case EEXIST: 18462 ip1dbg(("ipif_arp_up: NCE already exists for %s\n", 18463 ill->ill_name)); 18464 if (!NCE_MYADDR(nce->nce_common)) { 18465 /* 18466 * A leftover nce from before this address 18467 * existed 18468 */ 18469 ncec_delete(nce->nce_common); 18470 nce_refrele(nce); 18471 nce = NULL; 18472 goto retry; 18473 } 18474 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { 18475 nce_refrele(nce); 18476 nce = NULL; 18477 ip1dbg(("ipif_arp_up: NCE already exists " 18478 "for %s:%u\n", ill->ill_name, 18479 ipif->ipif_id)); 18480 goto arp_up_done; 18481 } 18482 /* 18483 * Duplicate local addresses are permissible for 18484 * IPIF_POINTOPOINT interfaces which will get marked 18485 * IPIF_UNNUMBERED later in 18486 * ip_addr_availability_check(). 18487 * 18488 * The nce_ipif_cnt field tracks the number of 18489 * ipifs that have nce_addr as their local address. 18490 */ 18491 ipif->ipif_addr_ready = 1; 18492 ipif->ipif_added_nce = 1; 18493 nce->nce_ipif_cnt++; 18494 err = 0; 18495 break; 18496 default: 18497 ASSERT(nce == NULL); 18498 goto arp_up_done; 18499 } 18500 if (arp_no_defense) { 18501 if ((ipif->ipif_flags & IPIF_UP) && 18502 !ipif->ipif_addr_ready) 18503 ipif_up_notify(ipif); 18504 ipif->ipif_addr_ready = 1; 18505 } 18506 } else { 18507 /* zero address. 
18518 int
18519 ipif_arp_up(ipif_t *ipif, enum ip_resolver_action res_act, boolean_t was_dup)
18520 {
18521 	int err = 0;
18522 	ill_t *ill = ipif->ipif_ill;
18523 	boolean_t first_interface, wait_for_dlpi = B_FALSE;
18524 
18525 	DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_up",
18526 	    ill_t *, ill, ipif_t *, ipif);
18527 
18528 	/*
18529 	 * We need to bring up ARP or set up multicast mapping only
18530 	 * when the first interface is coming UP.
18531 	 */
18532 	first_interface = (ill->ill_ipif_up_count == 0 &&
18533 	    ill->ill_ipif_dup_count == 0 && !was_dup);
18534 
18535 	if (res_act == Res_act_initial && first_interface) {
18536 		/*
18537 		 * Send ATTACH + BIND.
18538 		 */
18539 		err = arp_ll_up(ill);
18540 		if (err != EINPROGRESS && err != 0)
18541 			return (err);
18542 
18543 		/*
18544 		 * Add an NCE for the local address and start DAD.
18545 		 * We'll wait to hear that DAD has finished
18546 		 * before using the interface.
18547 		 */
18548 		if (err == EINPROGRESS)
18549 			wait_for_dlpi = B_TRUE;
18550 	}
18551 
18552 	if (!wait_for_dlpi)
18553 		(void) ipif_arp_up_done_tail(ipif, res_act);
18554 
18555 	return (!wait_for_dlpi ? 0 : EINPROGRESS);
18556 }
18557 
18558 /*
18559  * Finish processing of "arp_up" after all the DLPI message
18560  * exchanges have completed between arp and the driver.
18561  */
18562 void
18563 arp_bringup_done(ill_t *ill, int err)
18564 {
18565 	mblk_t *mp1;
18566 	ipif_t *ipif;
18567 	conn_t *connp = NULL;
18568 	ipsq_t *ipsq;
18569 	queue_t *q;
18570 
18571 	ip1dbg(("arp_bringup_done(%s)\n", ill->ill_name));
18572 
18573 	ASSERT(IAM_WRITER_ILL(ill));
18574 
18575 	ipsq = ill->ill_phyint->phyint_ipsq;
18576 	ipif = ipsq->ipsq_xop->ipx_pending_ipif;
18577 	mp1 = ipsq_pending_mp_get(ipsq, &connp);
18578 	ASSERT(!((mp1 != NULL) ^ (ipif != NULL)));
18579 	if (mp1 == NULL) /* bringup was aborted by the user */
18580 		return;
18581 
18582 	/*
18583 	 * If an IOCTL is waiting on this (ipsq_current_ioctl != 0), then we
18584 	 * must have an associated conn_t.  Otherwise, we're bringing this
18585 	 * interface back up as part of handling an asynchronous event (e.g.,
18586 	 * physical address change).
18587 	 */
18588 	if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
18589 		ASSERT(connp != NULL);
18590 		q = CONNP_TO_WQ(connp);
18591 	} else {
18592 		ASSERT(connp == NULL);
18593 		q = ill->ill_rq;
18594 	}
18595 	if (err == 0) {
18596 		if (ipif->ipif_isv6) {
18597 			if ((err = ipif_up_done_v6(ipif)) != 0)
18598 				ip0dbg(("arp_bringup_done: init failed\n"));
18599 		} else {
18600 			err = ipif_arp_up_done_tail(ipif, Res_act_initial);
18601 			if (err != 0 ||
18602 			    (err = ipif_up_done(ipif)) != 0) {
18603 				ip0dbg(("arp_bringup_done: "
18604 				    "init failed err %x\n", err));
18605 				(void) ipif_arp_down(ipif);
18606 			}
18608 		}
18609 	} else {
18610 		ip0dbg(("arp_bringup_done: DL_BIND_REQ failed\n"));
18611 	}
18612 
18613 	if ((err == 0) && (ill->ill_up_ipifs)) {
18614 		err = ill_up_ipifs(ill, q, mp1);
18615 		if (err == EINPROGRESS)
18616 			return;
18617 	}
18618 
18619 	/*
18620 	 * If we have a moved ipif to bring up, and everything has succeeded
18621 	 * to this point, bring it up on the IPMP ill.  Otherwise, leave it
18622 	 * down -- the admin can try to bring it up by hand if need be.
18623 	 */
18624 	if (ill->ill_move_ipif != NULL) {
18625 		ipif = ill->ill_move_ipif;
18626 		ip1dbg(("bringing up ipif %p on ill %s\n", (void *)ipif,
18627 		    ipif->ipif_ill->ill_name));
18628 		ill->ill_move_ipif = NULL;
18629 		if (err == 0) {
18630 			err = ipif_up(ipif, q, mp1);
18631 			if (err == EINPROGRESS)
18632 				return;
18633 		}
18634 	}
18635 
18636 	/*
18637 	 * The operation must complete without EINPROGRESS since
18638 	 * ipsq_pending_mp_get() has removed the mblk from ipsq_pending_mp.
18639 	 * Otherwise, the operation will be stuck forever in the ipsq.
18640 	 */
18641 	ASSERT(err != EINPROGRESS);
18642 	if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
18643 		DTRACE_PROBE4(ipif__ioctl, char *, "arp_bringup_done finish",
18644 		    int, ipsq->ipsq_xop->ipx_current_ioctl,
18645 		    ill_t *, ill, ipif_t *, ipif);
18646 		ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq);
18647 	} else {
18648 		ipsq_current_finish(ipsq);
18649 	}
18650 }
18651 
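/*
 * Editorial sketch: arp_bringup_done() above and arp_replumb_done() below
 * share the same ipsq completion skeleton.  Schematically, using only the
 * interfaces that appear in those functions:
 *
 *	ipsq = ill->ill_phyint->phyint_ipsq;
 *	ipif = ipsq->ipsq_xop->ipx_pending_ipif;
 *	mp1 = ipsq_pending_mp_get(ipsq, &connp);
 *	if (mp1 == NULL)
 *		return;			// operation aborted by the user
 *	... do the completion work; if it returns EINPROGRESS a later
 *	... callback will finish the job, so simply return ...
 *	ASSERT(err != EINPROGRESS);	// mblk is already off ipsq_pending_mp
 *	if (ipsq->ipsq_xop->ipx_current_ioctl != 0)
 *		ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq);
 *	else
 *		ipsq_current_finish(ipsq);
 */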
18652 /*
18653  * Finish processing of "arp_replumb" after all the DLPI message
18654  * exchanges have completed between arp and the driver.
18655  */
18656 void
18657 arp_replumb_done(ill_t *ill, int err)
18658 {
18659 	mblk_t *mp1;
18660 	ipif_t *ipif;
18661 	conn_t *connp = NULL;
18662 	ipsq_t *ipsq;
18663 	queue_t *q;
18664 
18665 	ASSERT(IAM_WRITER_ILL(ill));
18666 
18667 	ipsq = ill->ill_phyint->phyint_ipsq;
18668 	ipif = ipsq->ipsq_xop->ipx_pending_ipif;
18669 	mp1 = ipsq_pending_mp_get(ipsq, &connp);
18670 	ASSERT(!((mp1 != NULL) ^ (ipif != NULL)));
18671 	if (mp1 == NULL) {
18672 		ip0dbg(("arp_replumb_done: bringup aborted ioctl %x\n",
18673 		    ipsq->ipsq_xop->ipx_current_ioctl));
18674 		/* bringup was aborted by the user */
18675 		return;
18676 	}
18677 	/*
18678 	 * If an IOCTL is waiting on this (ipsq_current_ioctl != 0), then we
18679 	 * must have an associated conn_t.  Otherwise, we're bringing this
18680 	 * interface back up as part of handling an asynchronous event (e.g.,
18681 	 * physical address change).
18682 	 */
18683 	if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
18684 		ASSERT(connp != NULL);
18685 		q = CONNP_TO_WQ(connp);
18686 	} else {
18687 		ASSERT(connp == NULL);
18688 		q = ill->ill_rq;
18689 	}
18690 	if ((err == 0) && (ill->ill_up_ipifs)) {
18691 		err = ill_up_ipifs(ill, q, mp1);
18692 		if (err == EINPROGRESS)
18693 			return;
18694 	}
18695 	/*
18696 	 * The operation must complete without EINPROGRESS since
18697 	 * ipsq_pending_mp_get() has removed the mblk from ipsq_pending_mp.
18698 	 * Otherwise, the operation will be stuck forever in the ipsq.
18699 	 */
18700 	ASSERT(err != EINPROGRESS);
18701 	if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
18702 		DTRACE_PROBE4(ipif__ioctl, char *,
18703 		    "arp_replumb_done finish",
18704 		    int, ipsq->ipsq_xop->ipx_current_ioctl,
18705 		    ill_t *, ill, ipif_t *, ipif);
18706 		ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq);
18707 	} else {
18708 		ipsq_current_finish(ipsq);
18709 	}
18710 }
18711 
/*
 * Notify interested consumers (routing sockets, SCTP, and the network
 * event hooks) that this ipif is now up.
 */
18712 void
18713 ipif_up_notify(ipif_t *ipif)
18714 {
18715 	ip_rts_ifmsg(ipif, RTSQ_DEFAULT);
18716 	ip_rts_newaddrmsg(RTM_ADD, 0, ipif, RTSQ_DEFAULT);
18717 	sctp_update_ipif(ipif, SCTP_IPIF_UP);
18718 	ill_nic_event_dispatch(ipif->ipif_ill, MAP_IPIF_ID(ipif->ipif_id),
18719 	    NE_LIF_UP, NULL, 0);
18720 }
18721 
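/*
 * Editorial note: the messages generated by ipif_up_notify() above are
 * visible to userland routing-socket listeners.  A minimal, hypothetical
 * consumer (standard PF_ROUTE interfaces; not part of this file):
 *
 *	int s = socket(PF_ROUTE, SOCK_RAW, AF_UNSPEC);
 *	char buf[2048];
 *	ssize_t n;
 *
 *	while ((n = read(s, buf, sizeof (buf))) > 0) {
 *		struct rt_msghdr *rtm = (struct rt_msghdr *)buf;
 *
 *		if (rtm->rtm_type == RTM_IFINFO ||
 *		    rtm->rtm_type == RTM_NEWADDR)
 *			break;		// interface/address came up
 *	}
 */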
18722 /*
18723  * The ILB ioctl uses cv_wait (e.g., when deleting a rule or adding a
18724  * server), so it assumes the calling context is cv_wait'able.  Hence it
18725  * shouldn't be used on TPI end points with STREAMS modules pushed above;
18726  * this is assured by not setting the IPI_MODOK flag.  IP also ensures the
18727  * ILB ioctl never ends up on an ipsq; otherwise we might process it while
18728  * unwinding from the ipsq, and that could be a thread from the bottom.
18729  */
18730 /* ARGSUSED */
18731 int
18732 ip_sioctl_ilb_cmd(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
18733     ip_ioctl_cmd_t *ipip, void *arg)
18734 {
18735 	mblk_t *cmd_mp = mp->b_cont->b_cont;
18736 	ilb_cmd_t command = *((ilb_cmd_t *)cmd_mp->b_rptr);
18737 	int ret = 0;
18738 	int i;
18739 	size_t size;
18740 	ip_stack_t *ipst;
18741 	zoneid_t zoneid;
18742 	ilb_stack_t *ilbs;
18743 
18744 	ipst = CONNQ_TO_IPST(q);
18745 	ilbs = ipst->ips_netstack->netstack_ilb;
18746 	zoneid = Q_TO_CONN(q)->conn_zoneid;
18747 
18748 	switch (command) {
18749 	case ILB_CREATE_RULE: {
18750 		ilb_rule_cmd_t *cmd = (ilb_rule_cmd_t *)cmd_mp->b_rptr;
18751 
18752 		if (MBLKL(cmd_mp) != sizeof (ilb_rule_cmd_t)) {
18753 			ret = EINVAL;
18754 			break;
18755 		}
18756 
18757 		ret = ilb_rule_add(ilbs, zoneid, cmd);
18758 		break;
18759 	}
18760 	case ILB_DESTROY_RULE:
18761 	case ILB_ENABLE_RULE:
18762 	case ILB_DISABLE_RULE: {
18763 		ilb_name_cmd_t *cmd = (ilb_name_cmd_t *)cmd_mp->b_rptr;
18764 
18765 		if (MBLKL(cmd_mp) != sizeof (ilb_name_cmd_t)) {
18766 			ret = EINVAL;
18767 			break;
18768 		}
18769 
18770 		if (cmd->flags & ILB_RULE_ALLRULES) {
18771 			if (command == ILB_DESTROY_RULE) {
18772 				ilb_rule_del_all(ilbs, zoneid);
18773 				break;
18774 			} else if (command == ILB_ENABLE_RULE) {
18775 				ilb_rule_enable_all(ilbs, zoneid);
18776 				break;
18777 			} else if (command == ILB_DISABLE_RULE) {
18778 				ilb_rule_disable_all(ilbs, zoneid);
18779 				break;
18780 			}
18781 		} else {
18782 			if (command == ILB_DESTROY_RULE) {
18783 				ret = ilb_rule_del(ilbs, zoneid, cmd->name);
18784 			} else if (command == ILB_ENABLE_RULE) {
18785 				ret = ilb_rule_enable(ilbs, zoneid, cmd->name,
18786 				    NULL);
18787 			} else if (command == ILB_DISABLE_RULE) {
18788 				ret = ilb_rule_disable(ilbs, zoneid, cmd->name,
18789 				    NULL);
18790 			}
18791 		}
18792 		break;
18793 	}
18794 	case ILB_NUM_RULES: {
18795 		ilb_num_rules_cmd_t *cmd;
18796 
18797 		if (MBLKL(cmd_mp) != sizeof (ilb_num_rules_cmd_t)) {
18798 			ret = EINVAL;
18799 			break;
18800 		}
18801 		cmd = (ilb_num_rules_cmd_t *)cmd_mp->b_rptr;
18802 		ilb_get_num_rules(ilbs, zoneid, &(cmd->num));
18803 		break;
18804 	}
18805 	case ILB_RULE_NAMES: {
18806 		ilb_rule_names_cmd_t *cmd;
18807 
18808 		cmd = (ilb_rule_names_cmd_t *)cmd_mp->b_rptr;
18809 		if (MBLKL(cmd_mp) < sizeof (ilb_rule_names_cmd_t) ||
18810 		    cmd->num_names == 0) {
18811 			ret = EINVAL;
18812 			break;
18813 		}
18814 		size = cmd->num_names * ILB_RULE_NAMESZ;
18815 		if (cmd_mp->b_rptr + offsetof(ilb_rule_names_cmd_t, buf) +
18816 		    size != cmd_mp->b_wptr) {
18817 			ret = EINVAL;
18818 			break;
18819 		}
18820 		ilb_get_rulenames(ilbs, zoneid, &cmd->num_names, cmd->buf);
18821 		break;
18822 	}
18823 	case ILB_NUM_SERVERS: {
18824 		ilb_num_servers_cmd_t *cmd;
18825 
18826 		if (MBLKL(cmd_mp) != sizeof (ilb_num_servers_cmd_t)) {
18827 			ret = EINVAL;
18828 			break;
18829 		}
18830 		cmd = (ilb_num_servers_cmd_t *)cmd_mp->b_rptr;
18831 		ret = ilb_get_num_servers(ilbs, zoneid, cmd->name,
18832 		    &(cmd->num));
18833 		break;
18834 	}
18835 	case ILB_LIST_RULE: {
18836 		ilb_rule_cmd_t *cmd = (ilb_rule_cmd_t *)cmd_mp->b_rptr;
18837 
18838 		if (MBLKL(cmd_mp) != sizeof (ilb_rule_cmd_t)) {
18839 			ret = EINVAL;
18840 			break;
18841 		}
18842 		ret = ilb_rule_list(ilbs, zoneid, cmd);
18843 		break;
18844 	}
18845 	case ILB_LIST_SERVERS: {
18846 		ilb_servers_info_cmd_t *cmd;
18847 
18848 		cmd = (ilb_servers_info_cmd_t *)cmd_mp->b_rptr;
18849 		if (MBLKL(cmd_mp) < sizeof (ilb_servers_info_cmd_t) ||
18850 		    cmd->num_servers == 0) {
18851 			ret = EINVAL;
18852 			break;
18853 		}
18854 		size = cmd->num_servers * sizeof (ilb_server_info_t);
18855 		if (cmd_mp->b_rptr + offsetof(ilb_servers_info_cmd_t, servers) +
18856 		    size != cmd_mp->b_wptr) {
18857 			ret = EINVAL;
18858 			break;
18859 		}
18860 
18861 		ret = ilb_get_servers(ilbs, zoneid, cmd->name, cmd->servers,
18862 		    &cmd->num_servers);
18863 		break;
18864 	}
18865 	case ILB_ADD_SERVERS: {
18866 		ilb_servers_info_cmd_t *cmd;
18867 		ilb_rule_t *rule;
18868 
18869 		cmd = (ilb_servers_info_cmd_t *)cmd_mp->b_rptr;
18870 		if (MBLKL(cmd_mp) < sizeof (ilb_servers_info_cmd_t)) {
18871 			ret = EINVAL;
18872 			break;
18873 		}
18874 		size = cmd->num_servers * sizeof (ilb_server_info_t);
18875 		if (cmd_mp->b_rptr + offsetof(ilb_servers_info_cmd_t, servers) +
18876 		    size != cmd_mp->b_wptr) {
18877 			ret = EINVAL;
18878 			break;
18879 		}
18880 		rule = ilb_find_rule(ilbs, zoneid, cmd->name, &ret);
18881 		if (rule == NULL) {
18882 			ASSERT(ret != 0);
18883 			break;
18884 		}
18885 		for (i = 0; i < cmd->num_servers; i++) {
18886 			ilb_server_info_t *s;
18887 
18888 			s = &cmd->servers[i];
18889 			s->err = ilb_server_add(ilbs, rule, s);
18890 		}
18891 		ILB_RULE_REFRELE(rule);
18892 		break;
18893 	}
18894 	case ILB_DEL_SERVERS:
18895 	case ILB_ENABLE_SERVERS:
18896 	case ILB_DISABLE_SERVERS: {
18897 		ilb_servers_cmd_t *cmd;
18898 		ilb_rule_t *rule;
18899 		int (*f)();
18900 
18901 		cmd = (ilb_servers_cmd_t *)cmd_mp->b_rptr;
18902 		if (MBLKL(cmd_mp) < sizeof (ilb_servers_cmd_t)) {
18903 			ret = EINVAL;
18904 			break;
18905 		}
18906 		size = cmd->num_servers * sizeof (ilb_server_arg_t);
18907 		if (cmd_mp->b_rptr + offsetof(ilb_servers_cmd_t, servers) +
18908 		    size != cmd_mp->b_wptr) {
18909 			ret = EINVAL;
18910 			break;
18911 		}
18912 
18913 		if (command == ILB_DEL_SERVERS)
18914 			f = ilb_server_del;
18915 		else if (command == ILB_ENABLE_SERVERS)
18916 			f = ilb_server_enable;
18917 		else if (command == ILB_DISABLE_SERVERS)
18918 			f = ilb_server_disable;
18919 
18920 		rule = ilb_find_rule(ilbs, zoneid, cmd->name, &ret);
18921 		if (rule == NULL) {
18922 			ASSERT(ret != 0);
18923 			break;
18924 		}
18925 
18926 		for (i = 0; i < cmd->num_servers; i++) {
18927 			ilb_server_arg_t *s;
18928 
18929 			s = &cmd->servers[i];
18930 			s->err = f(ilbs, zoneid, NULL, rule, &s->addr);
18931 		}
18932 		ILB_RULE_REFRELE(rule);
18933 		break;
18934 	}
18935 	case ILB_LIST_NAT_TABLE: {
18936 		ilb_list_nat_cmd_t *cmd;
18937 
18938 		cmd = (ilb_list_nat_cmd_t *)cmd_mp->b_rptr;
18939 		if (MBLKL(cmd_mp) < sizeof (ilb_list_nat_cmd_t)) {
18940 			ret = EINVAL;
18941 			break;
18942 		}
18943 		size = cmd->num_nat * sizeof (ilb_nat_entry_t);
18944 		if (cmd_mp->b_rptr + offsetof(ilb_list_nat_cmd_t, entries) +
18945 		    size != cmd_mp->b_wptr) {
18946 			ret = EINVAL;
18947 			break;
18948 		}
18949 
18950 		ret = ilb_list_nat(ilbs, zoneid, cmd->entries, &cmd->num_nat,
18951 		    &cmd->flags);
18952 		break;
18953 	}
18954 	case ILB_LIST_STICKY_TABLE: {
18955 		ilb_list_sticky_cmd_t *cmd;
18956 
18957 		cmd = (ilb_list_sticky_cmd_t *)cmd_mp->b_rptr;
18958 		if (MBLKL(cmd_mp) < sizeof (ilb_list_sticky_cmd_t)) {
18959 			ret = EINVAL;
18960 			break;
18961 		}
18962 		size = cmd->num_sticky * sizeof (ilb_sticky_entry_t);
18963 		if (cmd_mp->b_rptr + offsetof(ilb_list_sticky_cmd_t, entries) +
18964 		    size != cmd_mp->b_wptr) {
18965 			ret = EINVAL;
18966 			break;
18967 		}
18968 
18969 		ret = ilb_list_sticky(ilbs, zoneid, cmd->entries,
18970 		    &cmd->num_sticky, &cmd->flags);
18971 		break;
18972 	}
18973 	default:
18974 		ret = EINVAL;
18975 		break;
18976 	}
18978 	return (ret);
18979 }
18980 
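/*
 * Editorial sketch: the variable-length ILB commands above all validate
 * the ioctl mblk the same way before touching the trailing array.  With
 * a hypothetical command layout of { header fields; uint32_t num;
 * entry_t entries[]; } the pattern is:
 *
 *	cmd = (cmd_t *)cmd_mp->b_rptr;
 *	if (MBLKL(cmd_mp) < sizeof (cmd_t) || cmd->num == 0)
 *		return (EINVAL);
 *	size = cmd->num * sizeof (entry_t);
 *	if (cmd_mp->b_rptr + offsetof(cmd_t, entries) + size !=
 *	    cmd_mp->b_wptr)
 *		return (EINVAL);
 *
 * i.e., the fixed header must be present and the declared entry count
 * must account for exactly the rest of the message, which guards against
 * both short reads and writes past the end of the mblk.
 */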
18981 /* Remove all cache entries for this logical interface. */
18982 void
18983 ipif_nce_down(ipif_t *ipif)
18984 {
18985 	ill_t *ill = ipif->ipif_ill;
18986 	nce_t *nce;
18987 
18988 	DTRACE_PROBE3(ipif__downup, char *, "ipif_nce_down",
18989 	    ill_t *, ill, ipif_t *, ipif);
18990 	if (ipif->ipif_added_nce) {
18991 		if (ipif->ipif_isv6)
18992 			nce = nce_lookup_v6(ill, &ipif->ipif_v6lcl_addr);
18993 		else
18994 			nce = nce_lookup_v4(ill, &ipif->ipif_lcl_addr);
18995 		if (nce != NULL) {
18996 			if (--nce->nce_ipif_cnt == 0)
18997 				ncec_delete(nce->nce_common);
18998 			ipif->ipif_added_nce = 0;
18999 			nce_refrele(nce);
19000 		} else {
19001 			/*
19002 			 * nce may already be NULL because it was already
19003 			 * flushed, e.g., due to a call to nce_flush().
19004 			 */
19005 			ipif->ipif_added_nce = 0;
19006 		}
19007 	}
19008 	/*
19009 	 * Make IPMP aware of the deleted data address.
19010 	 */
19011 	if (IS_IPMP(ill))
19012 		ipmp_illgrp_del_ipif(ill->ill_grp, ipif);
19013 
19014 	/*
19015 	 * Remove all other nces dependent on this ill when the last ipif
19016 	 * is going away.
19017 	 */
19018 	if (ill->ill_ipif_up_count == 0) {
19019 		ncec_walk(ill, (pfi_t)ncec_delete_per_ill,
19020 		    (uchar_t *)ill, ill->ill_ipst);
19021 		if (IS_UNDER_IPMP(ill))
19022 			nce_flush(ill, B_TRUE);
19023 	}
19024 }
19025 
19026 /*
19027  * Find the first interface that uses usill for its source address.
19028  */
19029 ill_t *
19030 ill_lookup_usesrc(ill_t *usill)
19031 {
19032 	ip_stack_t *ipst;
19033 	ill_t *ill;
19034 
19035 	ASSERT(usill != NULL);
19036 	ipst = usill->ill_ipst;
19037 	/* ill_g_usesrc_lock protects ill_usesrc_grp_next */
19038 	rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER);
19039 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
19040 	for (ill = usill->ill_usesrc_grp_next; ill != NULL && ill != usill;
19041 	    ill = ill->ill_usesrc_grp_next) {
19042 		if (!IS_UNDER_IPMP(ill) && (ill->ill_flags & ILLF_MULTICAST) &&
19043 		    !ILL_IS_CONDEMNED(ill)) {
19044 			ill_refhold(ill);
19045 			break;
19046 		}
19047 	}
19048 	rw_exit(&ipst->ips_ill_g_lock);
19049 	rw_exit(&ipst->ips_ill_g_usesrc_lock);
19050 	return (ill);
19051 }
19052 
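/*
 * Editorial note: ill_lookup_usesrc() returns its result with a reference
 * held (ill_refhold() above), so a caller must release it when done.
 * Sketch:
 *
 *	ill_t *ill = ill_lookup_usesrc(usill);
 *
 *	if (ill != NULL) {
 *		... use ill ...
 *		ill_refrele(ill);
 *	}
 */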
19053 /*
19054  * This comment applies to both ip_sioctl_get_ifhwaddr and
19055  * ip_sioctl_get_lifhwaddr, since both perform the same basic function.
19056  *
19057  * The goal here is to find an IP interface that corresponds to the name
19058  * provided by the caller in the ifreq/lifreq structure held in the mblk_t
19059  * chain and to fill out a sockaddr/sockaddr_storage structure with the
19060  * mac address.
19061  *
19062  * The SIOCGIFHWADDR/SIOCGLIFHWADDR ioctl may return an error for a number
19063  * of different reasons:
19064  * ENXIO - the device name is not known to IP.
19065  * EADDRNOTAVAIL - the device has no hardware address.  This is indicated
19066  * by ill_phys_addr not pointing to an actual address.
19067  * EPFNOSUPPORT - the request is for a mac address that will not fit in
19068  * the data structure supplied (struct sockaddr).
19069  */
19073 /* ARGSUSED */
19074 int
19075 ip_sioctl_get_ifhwaddr(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
19076     ip_ioctl_cmd_t *ipip, void *if_req)
19077 {
19078 	struct sockaddr *sock;
19079 	struct ifreq *ifr;
19080 	mblk_t *mp1;
19081 	ill_t *ill;
19082 
19083 	ASSERT(ipif != NULL);
19084 	ill = ipif->ipif_ill;
19085 
19086 	if (ill->ill_phys_addr == NULL) {
19087 		return (EADDRNOTAVAIL);
19088 	}
19089 	if (ill->ill_phys_addr_length > sizeof (sock->sa_data)) {
19090 		return (EPFNOSUPPORT);
19091 	}
19092 
19093 	ip1dbg(("ip_sioctl_get_ifhwaddr(%s)\n", ill->ill_name));
19094 
19095 	/* Existence of mp1 has been checked in ip_wput_nondata */
19096 	mp1 = mp->b_cont->b_cont;
19097 	ifr = (struct ifreq *)mp1->b_rptr;
19098 
19099 	sock = &ifr->ifr_addr;
19100 	/*
19101 	 * The "family" field in the returned structure is set to a value
19102 	 * that represents the type of device to which the address belongs.
19103 	 * The value returned may differ from that on Linux, but it will
19104 	 * still represent the correct symbol on Solaris.
19105 	 */
19106 	sock->sa_family = arp_hw_type(ill->ill_mactype);
19107 	bcopy(ill->ill_phys_addr, &sock->sa_data, ill->ill_phys_addr_length);
19108 
19109 	return (0);
19110 }
19111 
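/*
 * Editorial sketch: a hypothetical userland consumer of SIOCGIFHWADDR
 * ("net0" is an assumed interface name):
 *
 *	struct ifreq ifr;
 *	int s = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	bzero(&ifr, sizeof (ifr));
 *	(void) strlcpy(ifr.ifr_name, "net0", sizeof (ifr.ifr_name));
 *	if (ioctl(s, SIOCGIFHWADDR, &ifr) == 0) {
 *		// ifr.ifr_addr.sa_family holds the ARP hardware type
 *		// (e.g. ARPHRD_ETHER) and sa_data the address bytes.
 *	}
 *
 * EPFNOSUPPORT from the ioctl means the address did not fit in sa_data.
 */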
19112 /*
19113  * The expectation of applications using SIOCGIFHWADDR is that data will
19114  * be returned in the sa_data field of the sockaddr structure.  With
19115  * SIOCGLIFHWADDR, we're breaking new ground as there is no Linux
19116  * equivalent.  In light of this, struct sockaddr_dl is used as it
19117  * offers more space for address storage in sdl_data.
19118  */
19119 /* ARGSUSED */
19120 int
19121 ip_sioctl_get_lifhwaddr(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
19122     ip_ioctl_cmd_t *ipip, void *if_req)
19123 {
19124 	struct sockaddr_dl *sock;
19125 	struct lifreq *lifr;
19126 	mblk_t *mp1;
19127 	ill_t *ill;
19128 
19129 	ASSERT(ipif != NULL);
19130 	ill = ipif->ipif_ill;
19131 
19132 	if (ill->ill_phys_addr == NULL) {
19133 		return (EADDRNOTAVAIL);
19134 	}
19135 	if (ill->ill_phys_addr_length > sizeof (sock->sdl_data)) {
19136 		return (EPFNOSUPPORT);
19137 	}
19138 
19139 	ip1dbg(("ip_sioctl_get_lifhwaddr(%s)\n", ill->ill_name));
19140 
19141 	/* Existence of mp1 has been checked in ip_wput_nondata */
19142 	mp1 = mp->b_cont->b_cont;
19143 	lifr = (struct lifreq *)mp1->b_rptr;
19144 
19145 	/*
19146 	 * struct sockaddr_dl is used here because, unlike the plain
19147 	 * sockaddr used by SIOCGIFHWADDR, it can describe the address in
19148 	 * full: the interface index, MAC type and address length are
19149 	 * filled in below along with the address bytes themselves.
19150 	 */
19151 	lifr->lifr_type = ill->ill_type;
19152 	sock = (struct sockaddr_dl *)&lifr->lifr_addr;
19153 	sock->sdl_family = AF_LINK;
19154 	sock->sdl_index = ill->ill_phyint->phyint_ifindex;
19155 	sock->sdl_type = ill->ill_mactype;
19156 	sock->sdl_nlen = 0;
19157 	sock->sdl_slen = 0;
19158 	sock->sdl_alen = ill->ill_phys_addr_length;
19159 	bcopy(ill->ill_phys_addr, sock->sdl_data, ill->ill_phys_addr_length);
19160 
19161 	return (0);
19162 }
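/*
 * Editorial sketch: a hypothetical userland consumer of SIOCGLIFHWADDR,
 * matching the sockaddr_dl layout filled in above ("net0" is an assumed
 * interface name):
 *
 *	struct lifreq lifr;
 *	int s = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	bzero(&lifr, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, "net0", sizeof (lifr.lifr_name));
 *	if (ioctl(s, SIOCGLIFHWADDR, &lifr) == 0) {
 *		struct sockaddr_dl *sdl =
 *		    (struct sockaddr_dl *)&lifr.lifr_addr;
 *
 *		// sdl->sdl_alen address bytes start at sdl->sdl_data;
 *		// since sdl_nlen is 0, LLADDR(sdl) points at them too.
 *	}
 */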