/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 1990 Mentat Inc.
 */
/*
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 */

/*
 * This file contains the interface control functions for IP.
 */

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/dlpi.h>
#include <sys/stropts.h>
#include <sys/strsun.h>
#include <sys/sysmacros.h>
#include <sys/strsubr.h>
#include <sys/strlog.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/debug.h>
#include <sys/zone.h>
#include <sys/sunldi.h>
#include <sys/file.h>
#include <sys/bitmap.h>
#include <sys/cpuvar.h>
#include <sys/time.h>
#include <sys/ctype.h>
#include <sys/kmem.h>
#include <sys/systm.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/isa_defs.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/if_types.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <sys/sockio.h>
#include <netinet/in.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet/igmp_var.h>
#include <sys/policy.h>
#include <sys/ethernet.h>
#include <sys/callb.h>
#include <sys/md5.h>

#include <inet/common.h>    /* for various inet/mi.h and inet/nd.h needs */
#include <inet/mi.h>
#include <inet/nd.h>
#include <inet/tunables.h>
#include <inet/arp.h>
#include <inet/ip_arp.h>
#include <inet/mib2.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/ip6_asp.h>
#include <inet/tcp.h>
#include <inet/ip_multi.h>
#include <inet/ip_ire.h>
#include <inet/ip_ftable.h>
#include <inet/ip_rts.h>
#include <inet/ip_ndp.h>
#include <inet/ip_if.h>
#include <inet/ip_impl.h>
#include <inet/sctp_ip.h>
#include <inet/ip_netinfo.h>
#include <inet/ilb_ip.h>

#include <netinet/igmp.h>
#include <inet/ip_listutils.h>
#include <inet/ipclassifier.h>
#include <sys/mac_client.h>
#include <sys/dld.h>
#include <sys/mac_flow.h>

#include <sys/systeminfo.h>
#include <sys/bootconf.h>

#include <sys/tsol/tndb.h>
#include <sys/tsol/tnet.h>

#include <inet/rawip_impl.h>    /* needed for icmp_stack_t */
#include <inet/udp_impl.h>    /* needed for udp_stack_t */

/* The character which tells where the ill_name ends */
#define IPIF_SEPARATOR_CHAR ':'
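/*
 * For example, in the logical interface name "hme0:1", everything up to
 * the ':' ("hme0") is the ill_name and the trailing "1" is the ipif id.
 */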

/* IP ioctl function table entry */
typedef struct ipft_s {
    int ipft_cmd;
    pfi_t ipft_pfi;
    int ipft_min_size;
    int ipft_flags;
} ipft_t;
#define IPFT_F_NO_REPLY 0x1 /* IP ioctl does not expect any reply */
#define IPFT_F_SELF_REPLY 0x2 /* ioctl callee does the ioctl reply */

static int nd_ill_forward_get(queue_t *, mblk_t *, caddr_t, cred_t *);
static int nd_ill_forward_set(queue_t *q, mblk_t *mp,
    char *value, caddr_t cp, cred_t *ioc_cr);

static boolean_t ill_is_quiescent(ill_t *);
static boolean_t ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask);
static ip_m_t *ip_m_lookup(t_uscalar_t mac_type);
static int ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp, boolean_t need_up);
static int ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp, boolean_t need_up);
static int ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static int ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q,
    mblk_t *mp);
static int ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp);
static int ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t, in6_addr_t,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static int ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp,
    int ioccmd, struct linkblk *li);
static ipaddr_t ip_subnet_mask(ipaddr_t addr, ipif_t **, ip_stack_t *);
static void ip_wput_ioctl(queue_t *q, mblk_t *mp);
static void ipsq_flush(ill_t *ill);

static int ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static void ipsq_delete(ipsq_t *);

static ipif_t *ipif_allocate(ill_t *ill, int id, uint_t ire_type,
    boolean_t initialize, boolean_t insert, int *errorp);
static ire_t **ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep);
static void ipif_delete_bcast_ires(ipif_t *ipif);
static int ipif_add_ires_v4(ipif_t *, boolean_t);
static boolean_t ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif,
    boolean_t isv6);
static int ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp);
static void ipif_free(ipif_t *ipif);
static void ipif_free_tail(ipif_t *ipif);
static void ipif_set_default(ipif_t *ipif);
static int ipif_set_values(queue_t *q, mblk_t *mp,
    char *interf_name, uint_t *ppa);
static int ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp,
    queue_t *q);
static ipif_t *ipif_lookup_on_name(char *name, size_t namelen,
    boolean_t do_alloc, boolean_t *exists, boolean_t isv6, zoneid_t zoneid,
    ip_stack_t *);
static ipif_t *ipif_lookup_on_name_async(char *name, size_t namelen,
    boolean_t isv6, zoneid_t zoneid, queue_t *q, mblk_t *mp, ipsq_func_t func,
    int *error, ip_stack_t *);

static int ill_alloc_ppa(ill_if_t *, ill_t *);
static void ill_delete_interface_type(ill_if_t *);
static int ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q);
static void ill_dl_down(ill_t *ill);
static void ill_down(ill_t *ill);
static void ill_down_ipifs(ill_t *, boolean_t);
static void ill_free_mib(ill_t *ill);
static void ill_glist_delete(ill_t *);
static void ill_phyint_reinit(ill_t *ill);
static void ill_set_nce_router_flags(ill_t *, boolean_t);
static void ill_set_phys_addr_tail(ipsq_t *, queue_t *, mblk_t *, void *);
static void ill_replumb_tail(ipsq_t *, queue_t *, mblk_t *, void *);

static ip_v6intfid_func_t ip_ether_v6intfid, ip_ib_v6intfid;
static ip_v6intfid_func_t ip_ipv4_v6intfid, ip_ipv6_v6intfid;
static ip_v6intfid_func_t ip_ipmp_v6intfid, ip_nodef_v6intfid;
static ip_v6intfid_func_t ip_ipv4_v6destintfid, ip_ipv6_v6destintfid;
static ip_v4mapinfo_func_t ip_ether_v4_mapping;
static ip_v6mapinfo_func_t ip_ether_v6_mapping;
static ip_v4mapinfo_func_t ip_ib_v4_mapping;
static ip_v6mapinfo_func_t ip_ib_v6_mapping;
static ip_v4mapinfo_func_t ip_mbcast_mapping;
static void ip_cgtp_bcast_add(ire_t *, ip_stack_t *);
static void ip_cgtp_bcast_delete(ire_t *, ip_stack_t *);
static void phyint_free(phyint_t *);

static void ill_capability_dispatch(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_id_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_vrrp_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_hcksum_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_hcksum_reset_fill(ill_t *, mblk_t *);
static void ill_capability_zerocopy_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void ill_capability_zerocopy_reset_fill(ill_t *, mblk_t *);
static void ill_capability_dld_reset_fill(ill_t *, mblk_t *);
static void ill_capability_dld_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void ill_capability_dld_enable(ill_t *);
static void ill_capability_ack_thr(void *);
static void ill_capability_lso_enable(ill_t *);

static ill_t *ill_prev_usesrc(ill_t *);
static int ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t);
static void ill_disband_usesrc_group(ill_t *);
static void ip_sioctl_garp_reply(mblk_t *, ill_t *, void *, int);

#ifdef DEBUG
static void ill_trace_cleanup(const ill_t *);
static void ipif_trace_cleanup(const ipif_t *);
#endif

static void ill_dlpi_clear_deferred(ill_t *ill);

static void phyint_flags_init(phyint_t *, t_uscalar_t);

/*
 * if we go over the memory footprint limit more than once in this msec
 * interval, we'll start pruning aggressively.
 */
int ip_min_frag_prune_time = 0;

static ipft_t ip_ioctl_ftbl[] = {
    { IP_IOC_IRE_DELETE, ip_ire_delete, sizeof (ipid_t), 0 },
    { IP_IOC_IRE_DELETE_NO_REPLY, ip_ire_delete, sizeof (ipid_t),
        IPFT_F_NO_REPLY },
    { IP_IOC_RTS_REQUEST, ip_rts_request, 0, IPFT_F_SELF_REPLY },
    { 0 }
};

/* Simple ICMP IP Header Template */
static ipha_t icmp_ipha = {
    IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
};
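/*
 * For reference: the initializers above fill the leading ipha_t fields in
 * declaration order: ipha_version_and_hdr_length, then zeroed
 * type-of-service, length, ident, fragment offset/flags and ttl, and
 * finally ipha_protocol = IPPROTO_ICMP. The remaining fields (checksum,
 * source and destination addresses) are left zero in the template.
 */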

static uchar_t ip_six_byte_all_ones[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };

static ip_m_t ip_m_tbl[] = {
    { DL_ETHER, IFT_ETHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
        ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid,
        ip_nodef_v6intfid },
    { DL_CSMACD, IFT_ISO88023, ETHERTYPE_IP, ETHERTYPE_IPV6,
        ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
        ip_nodef_v6intfid },
    { DL_TPB, IFT_ISO88024, ETHERTYPE_IP, ETHERTYPE_IPV6,
        ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
        ip_nodef_v6intfid },
    { DL_TPR, IFT_ISO88025, ETHERTYPE_IP, ETHERTYPE_IPV6,
        ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
        ip_nodef_v6intfid },
    { DL_FDDI, IFT_FDDI, ETHERTYPE_IP, ETHERTYPE_IPV6,
        ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid,
        ip_nodef_v6intfid },
    { DL_IB, IFT_IB, ETHERTYPE_IP, ETHERTYPE_IPV6,
        ip_ib_v4_mapping, ip_ib_v6_mapping, ip_ib_v6intfid,
        ip_nodef_v6intfid },
    { DL_IPV4, IFT_IPV4, IPPROTO_ENCAP, IPPROTO_IPV6,
        ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid,
        ip_ipv4_v6destintfid },
    { DL_IPV6, IFT_IPV6, IPPROTO_ENCAP, IPPROTO_IPV6,
        ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv6_v6intfid,
        ip_ipv6_v6destintfid },
    { DL_6TO4, IFT_6TO4, IPPROTO_ENCAP, IPPROTO_IPV6,
        ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid,
        ip_nodef_v6intfid },
    { SUNW_DL_VNI, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
        NULL, NULL, ip_nodef_v6intfid, ip_nodef_v6intfid },
    { SUNW_DL_IPMP, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
        NULL, NULL, ip_ipmp_v6intfid, ip_nodef_v6intfid },
    { DL_OTHER, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
        ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
        ip_nodef_v6intfid }
};
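/*
 * The DL_OTHER entry above acts as the catch-all for media types with no
 * explicit entry in ip_m_tbl[] (see ip_m_lookup()).
 */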

char ipif_loopback_name[] = "lo0";

/* These are used by all IP network modules. */
sin6_t sin6_null;    /* Zero address for quick clears */
sin_t sin_null;    /* Zero address for quick clears */

/* When set search for unused ipif_seqid */
static ipif_t ipif_zero;

/*
 * The ppa arena is created after this many
 * interfaces have been plumbed.
 */
uint_t ill_no_arena = 12;    /* Settable in /etc/system */

/*
 * Allocate per-interface mibs.
 * Returns B_TRUE if ok, B_FALSE otherwise.
 * ipsq may not yet be allocated (loopback case).
 */
static boolean_t
ill_allocate_mibs(ill_t *ill)
{
    /* Already allocated? */
    if (ill->ill_ip_mib != NULL) {
        if (ill->ill_isv6)
            ASSERT(ill->ill_icmp6_mib != NULL);
        return (B_TRUE);
    }

    ill->ill_ip_mib = kmem_zalloc(sizeof (*ill->ill_ip_mib),
        KM_NOSLEEP);
    if (ill->ill_ip_mib == NULL) {
        return (B_FALSE);
    }

    /* Setup static information */
    SET_MIB(ill->ill_ip_mib->ipIfStatsEntrySize,
        sizeof (mib2_ipIfStatsEntry_t));
    if (ill->ill_isv6) {
        ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv6;
        SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
            sizeof (mib2_ipv6AddrEntry_t));
        SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
            sizeof (mib2_ipv6RouteEntry_t));
        SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
            sizeof (mib2_ipv6NetToMediaEntry_t));
        SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
            sizeof (ipv6_member_t));
        SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
            sizeof (ipv6_grpsrc_t));
    } else {
        ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv4;
        SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
            sizeof (mib2_ipAddrEntry_t));
        SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
            sizeof (mib2_ipRouteEntry_t));
        SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
            sizeof (mib2_ipNetToMediaEntry_t));
        SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
            sizeof (ip_member_t));
        SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
            sizeof (ip_grpsrc_t));

        /*
         * For a v4 ill, we are done at this point, because per ill
         * icmp mibs are only used for v6.
         */
        return (B_TRUE);
    }

    ill->ill_icmp6_mib = kmem_zalloc(sizeof (*ill->ill_icmp6_mib),
        KM_NOSLEEP);
    if (ill->ill_icmp6_mib == NULL) {
        kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib));
        ill->ill_ip_mib = NULL;
        return (B_FALSE);
    }
    /* static icmp info */
    ill->ill_icmp6_mib->ipv6IfIcmpEntrySize =
        sizeof (mib2_ipv6IfIcmpEntry_t);
    /*
     * The ipIfStatsIfindex and ipv6IfIcmpIndex will be assigned later
     * after the phyint merge occurs in ipif_set_values -> ill_glist_insert
     * -> ill_phyint_reinit
     */
    return (B_TRUE);
}

/*
 * Completely vaporize a lower level tap and all associated interfaces.
 * ill_delete is called only out of ip_close when the device control
 * stream is being closed.
 */
void
ill_delete(ill_t *ill)
{
    ipif_t *ipif;
    ill_t *prev_ill;
    ip_stack_t *ipst = ill->ill_ipst;

    /*
     * ill_delete may be forcibly entering the ipsq. The previous
     * ioctl may not have completed and may need to be aborted.
     * ipsq_flush takes care of it. If we don't need to enter the
     * ipsq forcibly, the second invocation of ipsq_flush in
     * ill_delete_tail is sufficient.
     */
    ipsq_flush(ill);

    /*
     * Nuke all interfaces. ipif_free will take down the interface,
     * remove it from the list, and free the data structure.
     * Walk down the ipif list and remove the logical interfaces
     * first before removing the main ipif. We can't unplumb
     * the zeroth interface first in the case of IPv6, as update_conn_ill
     * -> ip_ll_multireq de-references ill_ipif for checking
     * POINTOPOINT.
     *
     * If ill_ipif was not properly initialized (i.e., low on memory),
     * then there are no interfaces to clean up. In this case just clean
     * up the ill.
     */
    for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
        ipif_free(ipif);

    /*
     * clean out all the nce_t entries that depend on this
     * ill for the ill_phys_addr.
     */
    nce_flush(ill, B_TRUE);

    /* Clean up msgs on pending upcalls for mrouted */
    reset_mrt_ill(ill);

    update_conn_ill(ill, ipst);

    /*
     * Remove multicast references added as a result of calls to
     * ip_join_allmulti().
     */
    ip_purge_allmulti(ill);

    /*
     * If the ill being deleted is under IPMP, boot it out of the illgrp.
     */
    if (IS_UNDER_IPMP(ill))
        ipmp_ill_leave_illgrp(ill);

    /*
     * ill_down will arrange to blow off any IRE's dependent on this
     * ILL, and shut down fragmentation reassembly.
     */
    ill_down(ill);

    /* Let SCTP know, so that it can remove this from its list. */
    sctp_update_ill(ill, SCTP_ILL_REMOVE);

    /*
     * Walk all CONNs that can have a reference on an ire or nce for this
     * ill (we actually walk all that now have stale references).
     */
    ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ipst);

    /* With IPv6 we have dce_ifindex. Cleanup for neatness */
    if (ill->ill_isv6)
        dce_cleanup(ill->ill_phyint->phyint_ifindex, ipst);

    /*
     * If an address on this ILL is being used as a source address then
     * clear out the pointers in other ILLs that point to this ILL.
     */
    rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER);
    if (ill->ill_usesrc_grp_next != NULL) {
        if (ill->ill_usesrc_ifindex == 0) { /* usesrc ILL ? */
            ill_disband_usesrc_group(ill);
        } else { /* consumer of the usesrc ILL */
            prev_ill = ill_prev_usesrc(ill);
            prev_ill->ill_usesrc_grp_next =
                ill->ill_usesrc_grp_next;
        }
    }
    rw_exit(&ipst->ips_ill_g_usesrc_lock);
}

static void
ipif_non_duplicate(ipif_t *ipif)
{
    ill_t *ill = ipif->ipif_ill;
    mutex_enter(&ill->ill_lock);
    if (ipif->ipif_flags & IPIF_DUPLICATE) {
        ipif->ipif_flags &= ~IPIF_DUPLICATE;
        ASSERT(ill->ill_ipif_dup_count > 0);
        ill->ill_ipif_dup_count--;
    }
    mutex_exit(&ill->ill_lock);
}

/*
 * ill_delete_tail is called from ip_modclose after all references
 * to the closing ill are gone. The wait is done in ip_modclose
 */
void
ill_delete_tail(ill_t *ill)
{
    mblk_t **mpp;
    ipif_t *ipif;
    ip_stack_t *ipst = ill->ill_ipst;

    for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
        ipif_non_duplicate(ipif);
        (void) ipif_down_tail(ipif);
    }

    ASSERT(ill->ill_ipif_dup_count == 0);

    /*
     * If polling capability is enabled (which signifies direct
     * upcall into IP and that the driver has the ill saved as a handle),
     * we need to make sure that unbind has completed before we
     * let the ill disappear and the driver no longer has any reference
     * to this ill.
     */
    mutex_enter(&ill->ill_lock);
    while (ill->ill_state_flags & ILL_DL_UNBIND_IN_PROGRESS)
        cv_wait(&ill->ill_cv, &ill->ill_lock);
    mutex_exit(&ill->ill_lock);
    ASSERT(!(ill->ill_capabilities &
        (ILL_CAPAB_DLD | ILL_CAPAB_DLD_POLL | ILL_CAPAB_DLD_DIRECT)));

    if (ill->ill_net_type != IRE_LOOPBACK)
        qprocsoff(ill->ill_rq);

    /*
     * We do an ipsq_flush once again now. New messages could have
     * landed up from below (M_ERROR or M_HANGUP). Similarly ioctls
     * could also have landed up if an ioctl thread had looked up
     * the ill before we set the ILL_CONDEMNED flag, but not yet
     * enqueued the ioctl when we did the ipsq_flush last time.
     */
    ipsq_flush(ill);

    /*
     * Free capabilities.
     */
    if (ill->ill_hcksum_capab != NULL) {
        kmem_free(ill->ill_hcksum_capab, sizeof (ill_hcksum_capab_t));
        ill->ill_hcksum_capab = NULL;
    }

    if (ill->ill_zerocopy_capab != NULL) {
        kmem_free(ill->ill_zerocopy_capab,
            sizeof (ill_zerocopy_capab_t));
        ill->ill_zerocopy_capab = NULL;
    }

    if (ill->ill_lso_capab != NULL) {
        kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t));
        ill->ill_lso_capab = NULL;
    }

    if (ill->ill_dld_capab != NULL) {
        kmem_free(ill->ill_dld_capab, sizeof (ill_dld_capab_t));
        ill->ill_dld_capab = NULL;
    }

    /* Clean up ill_allowed_ips* related state */
    if (ill->ill_allowed_ips != NULL) {
        ASSERT(ill->ill_allowed_ips_cnt > 0);
        kmem_free(ill->ill_allowed_ips,
            ill->ill_allowed_ips_cnt * sizeof (in6_addr_t));
        ill->ill_allowed_ips = NULL;
        ill->ill_allowed_ips_cnt = 0;
    }

    while (ill->ill_ipif != NULL)
        ipif_free_tail(ill->ill_ipif);

    /*
     * We have removed all references to ilm from conn and the ones joined
     * within the kernel.
     *
     * We don't walk conns, mrts and ires because
     *
     * 1) update_conn_ill and reset_mrt_ill cleans up conns and mrts.
     * 2) ill_down ->ill_downi walks all the ires and cleans up
     *    ill references.
     */

    /*
     * If this ill is an IPMP meta-interface, blow away the illgrp. This
     * is safe to do because the illgrp has already been unlinked from the
     * group by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find it.
     */
    if (IS_IPMP(ill)) {
        ipmp_illgrp_destroy(ill->ill_grp);
        ill->ill_grp = NULL;
    }

    if (ill->ill_mphysaddr_list != NULL) {
        multiphysaddr_t *mpa, *tmpa;

        mpa = ill->ill_mphysaddr_list;
        ill->ill_mphysaddr_list = NULL;
        while (mpa) {
            tmpa = mpa->mpa_next;
            kmem_free(mpa, sizeof (*mpa));
            mpa = tmpa;
        }
    }
    /*
     * Take us out of the list of ILLs. ill_glist_delete -> phyint_free
     * could free the phyint. No more reference to the phyint after this
     * point.
     */
    (void) ill_glist_delete(ill);

    if (ill->ill_frag_ptr != NULL) {
        uint_t count;

        for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) {
            mutex_destroy(&ill->ill_frag_hash_tbl[count].ipfb_lock);
        }
        mi_free(ill->ill_frag_ptr);
        ill->ill_frag_ptr = NULL;
        ill->ill_frag_hash_tbl = NULL;
    }

    freemsg(ill->ill_nd_lla_mp);
    /* Free all retained control messages. */
    mpp = &ill->ill_first_mp_to_free;
    do {
        while (mpp[0]) {
            mblk_t *mp;
            mblk_t *mp1;

            mp = mpp[0];
            mpp[0] = mp->b_next;
            for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) {
                mp1->b_next = NULL;
                mp1->b_prev = NULL;
            }
            freemsg(mp);
        }
    } while (mpp++ != &ill->ill_last_mp_to_free);

    ill_free_mib(ill);

#ifdef DEBUG
    ill_trace_cleanup(ill);
#endif

    /* The default multicast interface might have changed */
    ire_increment_multicast_generation(ipst, ill->ill_isv6);

    /* Drop refcnt here */
    netstack_rele(ill->ill_ipst->ips_netstack);
    ill->ill_ipst = NULL;
}

static void
ill_free_mib(ill_t *ill)
{
    ip_stack_t *ipst = ill->ill_ipst;

    /*
     * MIB statistics must not be lost, so when an interface
     * goes away the counter values will be added to the global
     * MIBs.
     */
    if (ill->ill_ip_mib != NULL) {
        if (ill->ill_isv6) {
            ip_mib2_add_ip_stats(&ipst->ips_ip6_mib,
                ill->ill_ip_mib);
        } else {
            ip_mib2_add_ip_stats(&ipst->ips_ip_mib,
                ill->ill_ip_mib);
        }

        kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib));
        ill->ill_ip_mib = NULL;
    }
    if (ill->ill_icmp6_mib != NULL) {
        ip_mib2_add_icmp6_stats(&ipst->ips_icmp6_mib,
            ill->ill_icmp6_mib);
        kmem_free(ill->ill_icmp6_mib, sizeof (*ill->ill_icmp6_mib));
        ill->ill_icmp6_mib = NULL;
    }
}

/*
 * Concatenate together a physical address and a sap.
 *
 * Sap_lengths are interpreted as follows:
 *   sap_length == 0 ==> no sap
 *   sap_length > 0  ==> sap is at the head of the dlpi address
 *   sap_length < 0  ==> sap is at the tail of the dlpi address
 */
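/*
 * For example, with hypothetical Ethernet values (phys_length == 6,
 * sap == ETHERTYPE_IP, sap_length == -2), dst ends up holding the
 * 6-byte physical address followed by the 2-byte sap; with a positive
 * sap_length of 2, the sap would precede the physical address instead.
 */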
static void
ill_dlur_copy_address(uchar_t *phys_src, uint_t phys_length,
    t_scalar_t sap_src, t_scalar_t sap_length, uchar_t *dst)
{
    uint16_t sap_addr = (uint16_t)sap_src;

    if (sap_length == 0) {
        if (phys_src == NULL)
            bzero(dst, phys_length);
        else
            bcopy(phys_src, dst, phys_length);
    } else if (sap_length < 0) {
        if (phys_src == NULL)
            bzero(dst, phys_length);
        else
            bcopy(phys_src, dst, phys_length);
        bcopy(&sap_addr, (char *)dst + phys_length, sizeof (sap_addr));
    } else {
        bcopy(&sap_addr, dst, sizeof (sap_addr));
        if (phys_src == NULL)
            bzero((char *)dst + sap_length, phys_length);
        else
            bcopy(phys_src, (char *)dst + sap_length, phys_length);
    }
}

/*
 * Generate a dl_unitdata_req mblk for the device and address given.
 * addr_length is the length of the physical portion of the address.
 * If addr is NULL, an all-zero address of the specified length is included.
 * In any case, addr_length is taken to be the entire length of the
 * dlpi address, including the absolute value of sap_length.
 */
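/*
 * A typical call (a sketch only; the argument names below are the ill
 * fields commonly passed by callers) looks like:
 *
 *    mp = ill_dlur_gen(ill->ill_phys_addr, ill->ill_phys_addr_length,
 *        ill->ill_sap, ill->ill_sap_length);
 */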
mblk_t *
ill_dlur_gen(uchar_t *addr, uint_t addr_length, t_uscalar_t sap,
    t_scalar_t sap_length)
{
    dl_unitdata_req_t *dlur;
    mblk_t *mp;
    t_scalar_t abs_sap_length; /* absolute value */

    abs_sap_length = ABS(sap_length);
    mp = ip_dlpi_alloc(sizeof (*dlur) + addr_length + abs_sap_length,
        DL_UNITDATA_REQ);
    if (mp == NULL)
        return (NULL);
    dlur = (dl_unitdata_req_t *)mp->b_rptr;
    /* HACK: accommodate incompatible DLPI drivers */
    if (addr_length == 8)
        addr_length = 6;
    dlur->dl_dest_addr_length = addr_length + abs_sap_length;
    dlur->dl_dest_addr_offset = sizeof (*dlur);
    dlur->dl_priority.dl_min = 0;
    dlur->dl_priority.dl_max = 0;
    ill_dlur_copy_address(addr, addr_length, sap, sap_length,
        (uchar_t *)&dlur[1]);
    return (mp);
}

/*
 * Add the pending mp to the list. There can be only 1 pending mp
 * in the list. Any exclusive ioctl that needs to wait for a response
 * from another module or driver needs to use this function to set
 * the ipx_pending_mp to the ioctl mblk and wait for the response from
 * the other module/driver. This is also used while waiting for the
 * ipif/ill/ire refcnts to drop to zero in bringing down an ipif.
 */
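/*
 * Note the contract spelled out by the ASSERTs below: the caller must be
 * the exclusive (writer) thread for the ipif's ipsq, must hold the ill's
 * ill_lock, and at most one pending mp may be outstanding at a time.
 */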
boolean_t
ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp,
    int waitfor)
{
    ipxop_t *ipx = ipif->ipif_ill->ill_phyint->phyint_ipsq->ipsq_xop;

    ASSERT(IAM_WRITER_IPIF(ipif));
    ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
    ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL));
    ASSERT(ipx->ipx_pending_mp == NULL);
    /*
     * The caller may be using a different ipif than the one passed into
     * ipsq_current_start() (e.g., suppose an ioctl that came in on the V4
     * ill needs to wait for the V6 ill to quiesce). So we can't ASSERT
     * that `ipx_current_ipif == ipif'.
     */
    ASSERT(ipx->ipx_current_ipif != NULL);

    /*
     * M_IOCDATA from ioctls, M_ERROR/M_HANGUP/M_PROTO/M_PCPROTO from the
     * driver.
     */
    ASSERT((DB_TYPE(add_mp) == M_IOCDATA) || (DB_TYPE(add_mp) == M_ERROR) ||
        (DB_TYPE(add_mp) == M_HANGUP) || (DB_TYPE(add_mp) == M_PROTO) ||
        (DB_TYPE(add_mp) == M_PCPROTO));

    if (connp != NULL) {
        ASSERT(MUTEX_HELD(&connp->conn_lock));
        /*
         * Return an error if the conn has started closing. The conn
         * could have finished cleaning up the pending mp list; if so,
         * we should not add another mp to the list and negate the
         * cleanup.
         */
        if (connp->conn_state_flags & CONN_CLOSING)
            return (B_FALSE);
    }
    mutex_enter(&ipx->ipx_lock);
    ipx->ipx_pending_ipif = ipif;
    /*
     * Note down the queue in b_queue. This will be returned by
     * ipsq_pending_mp_get. Caller will then use these values to restart
     * the processing
     */
    add_mp->b_next = NULL;
    add_mp->b_queue = q;
    ipx->ipx_pending_mp = add_mp;
    ipx->ipx_waitfor = waitfor;
    mutex_exit(&ipx->ipx_lock);

    if (connp != NULL)
        connp->conn_oper_pending_ill = ipif->ipif_ill;

    return (B_TRUE);
}

/*
 * Retrieve the ipx_pending_mp and return it. There can be only 1 mp
 * queued in the list.
 */
mblk_t *
ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp)
{
    mblk_t *curr = NULL;
    ipxop_t *ipx = ipsq->ipsq_xop;

    *connpp = NULL;
    mutex_enter(&ipx->ipx_lock);
    if (ipx->ipx_pending_mp == NULL) {
        mutex_exit(&ipx->ipx_lock);
        return (NULL);
    }

    /* There can be only 1 such excl message */
    curr = ipx->ipx_pending_mp;
    ASSERT(curr->b_next == NULL);
    ipx->ipx_pending_ipif = NULL;
    ipx->ipx_pending_mp = NULL;
    ipx->ipx_waitfor = 0;
    mutex_exit(&ipx->ipx_lock);

    if (CONN_Q(curr->b_queue)) {
        /*
         * This mp did a refhold on the conn, at the start of the ioctl.
         * So we can safely return a pointer to the conn to the caller.
         */
        *connpp = Q_TO_CONN(curr->b_queue);
    } else {
        *connpp = NULL;
    }
    curr->b_next = NULL;
    curr->b_prev = NULL;
    return (curr);
}

/*
 * Cleanup the ioctl mp queued in ipx_pending_mp
 * - Called in the ill_delete path
 * - Called in the M_ERROR or M_HANGUP path on the ill.
 * - Called in the conn close path.
 *
 * Returns success on finding the pending mblk associated with the ioctl or
 * exclusive operation in progress, failure otherwise.
 */
boolean_t
ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp)
{
    mblk_t *mp;
    ipxop_t *ipx;
    queue_t *q;
    ipif_t *ipif;
    int cmd;

    ASSERT(IAM_WRITER_ILL(ill));
    ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop;

    mutex_enter(&ipx->ipx_lock);
    mp = ipx->ipx_pending_mp;
    if (connp != NULL) {
        if (mp == NULL || mp->b_queue != CONNP_TO_WQ(connp)) {
            /*
             * Nothing to clean since the conn that is closing
             * does not have a matching pending mblk in
             * ipx_pending_mp.
             */
            mutex_exit(&ipx->ipx_lock);
            return (B_FALSE);
        }
    } else {
        /*
         * A non-zero ill_error signifies we are called in the
         * M_ERROR or M_HANGUP path and we need to unconditionally
         * abort any current ioctl and do the corresponding cleanup.
         * A zero ill_error means we are in the ill_delete path and
         * we do the cleanup only if there is a pending mp.
         */
        if (mp == NULL && ill->ill_error == 0) {
            mutex_exit(&ipx->ipx_lock);
            return (B_FALSE);
        }
    }

    /* Now remove from the ipx_pending_mp */
    ipx->ipx_pending_mp = NULL;
    ipif = ipx->ipx_pending_ipif;
    ipx->ipx_pending_ipif = NULL;
    ipx->ipx_waitfor = 0;
    ipx->ipx_current_ipif = NULL;
    cmd = ipx->ipx_current_ioctl;
    ipx->ipx_current_ioctl = 0;
    ipx->ipx_current_done = B_TRUE;
    mutex_exit(&ipx->ipx_lock);

    if (mp == NULL)
        return (B_FALSE);

    q = mp->b_queue;
    mp->b_next = NULL;
    mp->b_prev = NULL;
    mp->b_queue = NULL;

    if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) {
        DTRACE_PROBE4(ipif__ioctl,
            char *, "ipsq_pending_mp_cleanup",
            int, cmd, ill_t *, ipif == NULL ? NULL : ipif->ipif_ill,
            ipif_t *, ipif);
        if (connp == NULL) {
            ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL);
        } else {
            ip_ioctl_finish(q, mp, ENXIO, CONN_CLOSE, NULL);
            mutex_enter(&ipif->ipif_ill->ill_lock);
            ipif->ipif_state_flags &= ~IPIF_CHANGING;
            mutex_exit(&ipif->ipif_ill->ill_lock);
        }
    } else {
        inet_freemsg(mp);
    }
    return (B_TRUE);
}

/*
 * Called in the conn close path and ill delete path
 */
static void
ipsq_xopq_mp_cleanup(ill_t *ill, conn_t *connp)
{
    ipsq_t *ipsq;
    mblk_t *prev;
    mblk_t *curr;
    mblk_t *next;
    queue_t *wq, *rq = NULL;
    mblk_t *tmp_list = NULL;

    ASSERT(IAM_WRITER_ILL(ill));
    if (connp != NULL)
        wq = CONNP_TO_WQ(connp);
    else
        wq = ill->ill_wq;

    /*
     * In the case of lo0 being unplumbed, ill_wq will be NULL. Guard
     * against this here.
     */
    if (wq != NULL)
        rq = RD(wq);

    ipsq = ill->ill_phyint->phyint_ipsq;
    /*
     * Cleanup the ioctl mp's queued in ipsq_xopq_pending_mp if any.
     * In the case of ioctl from a conn, there can be only 1 mp
     * queued on the ipsq. If an ill is being unplumbed flush all
     * the messages.
     */
    mutex_enter(&ipsq->ipsq_lock);
    for (prev = NULL, curr = ipsq->ipsq_xopq_mphead; curr != NULL;
        curr = next) {
        next = curr->b_next;
        if (connp == NULL ||
            (curr->b_queue == wq || curr->b_queue == rq)) {
            /* Unlink the mblk from the pending mp list */
            if (prev != NULL) {
                prev->b_next = curr->b_next;
            } else {
                ASSERT(ipsq->ipsq_xopq_mphead == curr);
                ipsq->ipsq_xopq_mphead = curr->b_next;
            }
            if (ipsq->ipsq_xopq_mptail == curr)
                ipsq->ipsq_xopq_mptail = prev;
            /*
             * Create a temporary list and release the ipsq lock.
             * New elements are added to the head of the tmp_list.
             */
            curr->b_next = tmp_list;
            tmp_list = curr;
        } else {
            prev = curr;
        }
    }
    mutex_exit(&ipsq->ipsq_lock);

    while (tmp_list != NULL) {
        curr = tmp_list;
        tmp_list = curr->b_next;
        curr->b_next = NULL;
        curr->b_prev = NULL;
        wq = curr->b_queue;
        curr->b_queue = NULL;
        if (DB_TYPE(curr) == M_IOCTL || DB_TYPE(curr) == M_IOCDATA) {
            DTRACE_PROBE4(ipif__ioctl,
                char *, "ipsq_xopq_mp_cleanup",
                int, 0, ill_t *, NULL, ipif_t *, NULL);
            ip_ioctl_finish(wq, curr, ENXIO, connp != NULL ?
                CONN_CLOSE : NO_COPYOUT, NULL);
        } else {
            /*
             * IP-MT XXX In the case of TLI/XTI bind / optmgmt this
             * can't be just inet_freemsg; we have to restart it,
             * otherwise the thread will be stuck.
             */
            inet_freemsg(curr);
        }
    }
}

/*
 * This conn has started closing. Cleanup any pending ioctl from this conn.
 * STREAMS ensures that there can be at most 1 active ioctl on a stream.
 */
void
conn_ioctl_cleanup(conn_t *connp)
{
    ipsq_t *ipsq;
    ill_t *ill;
    boolean_t refheld;

    /*
     * Check for a queued ioctl. If the ioctl has not yet started, the mp
     * is pending in the list headed by ipsq_xopq_head. If the ioctl has
     * started the mp could be present in ipx_pending_mp. Note that if
     * conn_oper_pending_ill is NULL, the ioctl may still be in flight and
     * not yet queued anywhere. In this case, the conn close code will wait
     * until the conn_ref is dropped. If the stream was a tcp stream, then
     * tcp_close will wait first until all ioctls have completed for this
     * conn.
     */
    mutex_enter(&connp->conn_lock);
    ill = connp->conn_oper_pending_ill;
    if (ill == NULL) {
        mutex_exit(&connp->conn_lock);
        return;
    }

    /*
     * We may not be able to refhold the ill if the ill/ipif
     * is changing. But we need to make sure that the ill will
     * not vanish. So we just bump up the ill_waiter count.
     */
    refheld = ill_waiter_inc(ill);
    mutex_exit(&connp->conn_lock);
    if (refheld) {
        if (ipsq_enter(ill, B_TRUE, NEW_OP)) {
            ill_waiter_dcr(ill);
            /*
             * Check whether this ioctl has started and is
             * pending. If it is not found there then check
             * whether this ioctl has not even started and is in
             * the ipsq_xopq list.
             */
            if (!ipsq_pending_mp_cleanup(ill, connp))
                ipsq_xopq_mp_cleanup(ill, connp);
            ipsq = ill->ill_phyint->phyint_ipsq;
            ipsq_exit(ipsq);
            return;
        }
    }

    /*
     * The ill is also closing and we could not bump up the
     * ill_waiter_count or we could not enter the ipsq. Leave
     * the cleanup to ill_delete
     */
    mutex_enter(&connp->conn_lock);
    while (connp->conn_oper_pending_ill != NULL)
        cv_wait(&connp->conn_refcv, &connp->conn_lock);
    mutex_exit(&connp->conn_lock);
    if (refheld)
        ill_waiter_dcr(ill);
}

/*
 * ipcl_walk function for cleaning up conn_*_ill fields.
 * Note that we leave ixa_multicast_ifindex, conn_incoming_ifindex, and
 * conn_bound_if in place. We prefer dropping
 * packets instead of sending them out the wrong interface, or accepting
 * packets from the wrong ifindex.
 */
static void
conn_cleanup_ill(conn_t *connp, caddr_t arg)
{
    ill_t *ill = (ill_t *)arg;

    mutex_enter(&connp->conn_lock);
    if (connp->conn_dhcpinit_ill == ill) {
        connp->conn_dhcpinit_ill = NULL;
        ASSERT(ill->ill_dhcpinit != 0);
        atomic_dec_32(&ill->ill_dhcpinit);
        ill_set_inputfn(ill);
    }
    mutex_exit(&connp->conn_lock);
}

static int
ill_down_ipifs_tail(ill_t *ill)
{
    ipif_t *ipif;
    int err;

    ASSERT(IAM_WRITER_ILL(ill));
    for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
        ipif_non_duplicate(ipif);
        /*
         * ipif_down_tail will call arp_ll_down on the last ipif
         * and typically return EINPROGRESS when the DL_UNBIND is sent.
         */
        if ((err = ipif_down_tail(ipif)) != 0)
            return (err);
    }
    return (0);
}

/* ARGSUSED */
void
ipif_all_down_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
{
    ASSERT(IAM_WRITER_IPSQ(ipsq));
    (void) ill_down_ipifs_tail(q->q_ptr);
    freemsg(mp);
    ipsq_current_finish(ipsq);
}

/*
 * ill_down_start is called when we want to down this ill and bring it up
 * again. It is called when we receive an M_ERROR / M_HANGUP. In this case
 * we shut down all interfaces, but don't tear down any plumbing.
 */
boolean_t
ill_down_start(queue_t *q, mblk_t *mp)
{
    ill_t *ill = q->q_ptr;
    ipif_t *ipif;

    ASSERT(IAM_WRITER_ILL(ill));
    /*
     * It is possible that some ioctl is already in progress while we
     * received the M_ERROR / M_HANGUP in which case, we need to abort
     * the ioctl. ill_down_start() is being processed as CUR_OP rather
     * than as NEW_OP since the cause of the M_ERROR / M_HANGUP may prevent
     * the in progress ioctl from ever completing.
     *
     * The thread that started the ioctl (if any) must have returned,
     * since we are now executing as writer. After the 2 calls below,
     * the state of the ipsq and the ill would reflect no trace of any
     * pending operation. Subsequently if there is any response to the
     * original ioctl from the driver, it would be discarded as an
     * unsolicited message from the driver.
     */
    (void) ipsq_pending_mp_cleanup(ill, NULL);
    ill_dlpi_clear_deferred(ill);

    for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
        (void) ipif_down(ipif, NULL, NULL);

    ill_down(ill);

    /*
     * Walk all CONNs that can have a reference on an ire or nce for this
     * ill (we actually walk all that now have stale references).
     */
    ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ill->ill_ipst);

    /* With IPv6 we have dce_ifindex. Cleanup for neatness */
    if (ill->ill_isv6)
        dce_cleanup(ill->ill_phyint->phyint_ifindex, ill->ill_ipst);

    ipsq_current_start(ill->ill_phyint->phyint_ipsq, ill->ill_ipif, 0);

    /*
     * Atomically test and add the pending mp if references are active.
     */
    mutex_enter(&ill->ill_lock);
    if (!ill_is_quiescent(ill)) {
        /* call cannot fail since `conn_t *' argument is NULL */
        (void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
            mp, ILL_DOWN);
        mutex_exit(&ill->ill_lock);
        return (B_FALSE);
    }
    mutex_exit(&ill->ill_lock);
    return (B_TRUE);
}

static void
ill_down(ill_t *ill)
{
    mblk_t *mp;
    ip_stack_t *ipst = ill->ill_ipst;

    /*
     * Blow off any IREs dependent on this ILL.
     * The caller needs to handle conn_ixa_cleanup
     */
    ill_delete_ires(ill);

    ire_walk_ill(0, 0, ill_downi, ill, ill);

    /* Remove any conn_*_ill depending on this ill */
    ipcl_walk(conn_cleanup_ill, (caddr_t)ill, ipst);

    /*
     * Free state for additional IREs.
     */
    mutex_enter(&ill->ill_saved_ire_lock);
    mp = ill->ill_saved_ire_mp;
    ill->ill_saved_ire_mp = NULL;
    ill->ill_saved_ire_cnt = 0;
    mutex_exit(&ill->ill_saved_ire_lock);
    freemsg(mp);
}

/*
 * ire_walk routine used to delete every IRE that depends on
 * 'ill'. (Always called as writer, and may only be called from ire_walk.)
 *
 * Note: since the routes added by the kernel are deleted separately,
 * this will only be 1) IRE_IF_CLONE and 2) manually added IRE_INTERFACE.
 *
 * We also remove references on ire_nce_cache entries that refer to the ill.
 */
void
ill_downi(ire_t *ire, char *ill_arg)
{
    ill_t *ill = (ill_t *)ill_arg;
    nce_t *nce;

    mutex_enter(&ire->ire_lock);
    nce = ire->ire_nce_cache;
    if (nce != NULL && nce->nce_ill == ill)
        ire->ire_nce_cache = NULL;
    else
        nce = NULL;
    mutex_exit(&ire->ire_lock);
    if (nce != NULL)
        nce_refrele(nce);
    if (ire->ire_ill == ill) {
        /*
         * The existing interface binding for ire must be
         * deleted before trying to bind the route to another
         * interface. However, since we are using the contents of the
         * ire after ire_delete, the caller has to ensure that
         * CONDEMNED (deleted) ire's are not removed from the list
         * when ire_delete() returns. Currently ill_downi() is
         * only called as part of ire_walk*() routines, so that
         * the irb_refhold() done by ire_walk*() will ensure that
         * ire_delete() does not lead to ire_inactive().
         */
        ASSERT(ire->ire_bucket->irb_refcnt > 0);
        ire_delete(ire);
        if (ire->ire_unbound)
            ire_rebind(ire);
    }
}

/* Remove IRE_IF_CLONE on this ill */
void
ill_downi_if_clone(ire_t *ire, char *ill_arg)
{
    ill_t *ill = (ill_t *)ill_arg;

    ASSERT(ire->ire_type & IRE_IF_CLONE);
    if (ire->ire_ill == ill)
        ire_delete(ire);
}

/* Consume an M_IOCACK of the fastpath probe. */
void
ill_fastpath_ack(ill_t *ill, mblk_t *mp)
{
    mblk_t *mp1 = mp;

    /*
     * If this was the first attempt, turn on fastpath probing.
     */
    mutex_enter(&ill->ill_lock);
    if (ill->ill_dlpi_fastpath_state == IDS_INPROGRESS)
        ill->ill_dlpi_fastpath_state = IDS_OK;
    mutex_exit(&ill->ill_lock);

    /* Free the M_IOCACK mblk, hold on to the data */
    mp = mp->b_cont;
    freeb(mp1);
    if (mp == NULL)
        return;
    if (mp->b_cont != NULL)
        nce_fastpath_update(ill, mp);
    else
        ip0dbg(("ill_fastpath_ack: no b_cont\n"));
    freemsg(mp);
}

/*
 * Throw an M_IOCTL message downstream asking "do you know fastpath?"
 * The data portion of the request is a dl_unitdata_req_t template for
 * what we would send downstream in the absence of a fastpath confirmation.
 */
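/*
 * The driver's M_IOCACK reply to this probe is consumed by
 * ill_fastpath_ack() above.
 */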
int
ill_fastpath_probe(ill_t *ill, mblk_t *dlur_mp)
{
    struct iocblk *ioc;
    mblk_t *mp;

    if (dlur_mp == NULL)
        return (EINVAL);

    mutex_enter(&ill->ill_lock);
    switch (ill->ill_dlpi_fastpath_state) {
    case IDS_FAILED:
        /*
         * Driver NAKed the first fastpath ioctl - assume it doesn't
         * support it.
         */
        mutex_exit(&ill->ill_lock);
        return (ENOTSUP);
    case IDS_UNKNOWN:
        /* This is the first probe */
        ill->ill_dlpi_fastpath_state = IDS_INPROGRESS;
        break;
    default:
        break;
    }
    mutex_exit(&ill->ill_lock);

    if ((mp = mkiocb(DL_IOC_HDR_INFO)) == NULL)
        return (EAGAIN);

    mp->b_cont = copyb(dlur_mp);
    if (mp->b_cont == NULL) {
        freeb(mp);
        return (EAGAIN);
    }

    ioc = (struct iocblk *)mp->b_rptr;
    ioc->ioc_count = msgdsize(mp->b_cont);

    DTRACE_PROBE3(ill__dlpi, char *, "ill_fastpath_probe",
        char *, "DL_IOC_HDR_INFO", ill_t *, ill);
    putnext(ill->ill_wq, mp);
    return (0);
}

void
ill_capability_probe(ill_t *ill)
{
    mblk_t *mp;

    ASSERT(IAM_WRITER_ILL(ill));

    if (ill->ill_dlpi_capab_state != IDCS_UNKNOWN &&
        ill->ill_dlpi_capab_state != IDCS_FAILED)
        return;

    /*
     * We are starting a new cycle of capability negotiation.
     * Free up the capab reset messages of any previous incarnation.
     * We will do a fresh allocation when we get the response to our probe
     */
    if (ill->ill_capab_reset_mp != NULL) {
        freemsg(ill->ill_capab_reset_mp);
        ill->ill_capab_reset_mp = NULL;
    }

    ip1dbg(("ill_capability_probe: starting capability negotiation\n"));

    mp = ip_dlpi_alloc(sizeof (dl_capability_req_t), DL_CAPABILITY_REQ);
    if (mp == NULL)
        return;

    ill_capability_send(ill, mp);
    ill->ill_dlpi_capab_state = IDCS_PROBE_SENT;
}

void
ill_capability_reset(ill_t *ill, boolean_t reneg)
{
    ASSERT(IAM_WRITER_ILL(ill));

    if (ill->ill_dlpi_capab_state != IDCS_OK)
        return;

    ill->ill_dlpi_capab_state = reneg ? IDCS_RENEG : IDCS_RESET_SENT;

    ill_capability_send(ill, ill->ill_capab_reset_mp);
    ill->ill_capab_reset_mp = NULL;
    /*
     * We turn off all capabilities except the direct function call
     * capabilities (ILL_CAPAB_DLD*), which will be turned off by the
     * corresponding reset functions.
     */
    ill->ill_capabilities &= ~(ILL_CAPAB_HCKSUM | ILL_CAPAB_ZEROCOPY);
}

static void
ill_capability_reset_alloc(ill_t *ill)
{
    mblk_t *mp;
    size_t size = 0;
    int err;
    dl_capability_req_t *capb;

    ASSERT(IAM_WRITER_ILL(ill));
    ASSERT(ill->ill_capab_reset_mp == NULL);

    if (ILL_HCKSUM_CAPABLE(ill)) {
        size += sizeof (dl_capability_sub_t) +
            sizeof (dl_capab_hcksum_t);
    }

    if (ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) {
        size += sizeof (dl_capability_sub_t) +
            sizeof (dl_capab_zerocopy_t);
    }

    if (ill->ill_capabilities & ILL_CAPAB_DLD) {
        size += sizeof (dl_capability_sub_t) +
            sizeof (dl_capab_dld_t);
    }

    mp = allocb_wait(size + sizeof (dl_capability_req_t), BPRI_MED,
        STR_NOSIG, &err);

    mp->b_datap->db_type = M_PROTO;
    bzero(mp->b_rptr, size + sizeof (dl_capability_req_t));

    capb = (dl_capability_req_t *)mp->b_rptr;
    capb->dl_primitive = DL_CAPABILITY_REQ;
    capb->dl_sub_offset = sizeof (dl_capability_req_t);
    capb->dl_sub_length = size;

    mp->b_wptr += sizeof (dl_capability_req_t);

    /*
     * Each handler fills in the corresponding dl_capability_sub_t
     * inside the mblk.
     */
    ill_capability_hcksum_reset_fill(ill, mp);
    ill_capability_zerocopy_reset_fill(ill, mp);
    ill_capability_dld_reset_fill(ill, mp);

    ill->ill_capab_reset_mp = mp;
}

static void
ill_capability_id_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *outers)
{
    dl_capab_id_t *id_ic;
    uint_t sub_dl_cap = outers->dl_cap;
    dl_capability_sub_t *inners;
    uint8_t *capend;

    ASSERT(sub_dl_cap == DL_CAPAB_ID_WRAPPER);

    /*
     * Note: range checks here are not absolutely sufficient to
     * make us robust against malformed messages sent by drivers;
     * this is in keeping with the rest of IP's dlpi handling.
     * (Remember, it's coming from something else in the kernel
     * address space)
     */

    capend = (uint8_t *)(outers + 1) + outers->dl_length;
    if (capend > mp->b_wptr) {
        cmn_err(CE_WARN, "ill_capability_id_ack: "
            "malformed sub-capability too long for mblk");
        return;
    }

    id_ic = (dl_capab_id_t *)(outers + 1);

    if (outers->dl_length < sizeof (*id_ic) ||
        (inners = &id_ic->id_subcap,
        inners->dl_length > (outers->dl_length - sizeof (*inners)))) {
        cmn_err(CE_WARN, "ill_capability_id_ack: malformed "
            "encapsulated capab type %d too long for mblk",
            inners->dl_cap);
        return;
    }

    if (!dlcapabcheckqid(&id_ic->id_mid, ill->ill_lmod_rq)) {
        ip1dbg(("ill_capability_id_ack: mid token for capab type %d "
            "isn't as expected; pass-thru module(s) detected, "
            "discarding capability\n", inners->dl_cap));
        return;
    }

    /* Process the encapsulated sub-capability */
    ill_capability_dispatch(ill, mp, inners);
}

static void
ill_capability_dld_reset_fill(ill_t *ill, mblk_t *mp)
{
    dl_capability_sub_t *dl_subcap;

    if (!(ill->ill_capabilities & ILL_CAPAB_DLD))
        return;

    /*
     * The dl_capab_dld_t that follows the dl_capability_sub_t is not
     * initialized below since it is not used by DLD.
     */
    dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
    dl_subcap->dl_cap = DL_CAPAB_DLD;
    dl_subcap->dl_length = sizeof (dl_capab_dld_t);

    mp->b_wptr += sizeof (dl_capability_sub_t) + sizeof (dl_capab_dld_t);
}

static void
ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp)
{
    /*
     * If no ipif was brought up over this ill, this DL_CAPABILITY_REQ/ACK
     * is only to get the VRRP capability.
     *
     * Note that we cannot check ill_ipif_up_count here since
     * ill_ipif_up_count is only incremented when the resolver is setup.
     * That is done asynchronously, and can race with this function.
     */
    if (!ill->ill_dl_up) {
        if (subp->dl_cap == DL_CAPAB_VRRP)
            ill_capability_vrrp_ack(ill, mp, subp);
        return;
    }

    switch (subp->dl_cap) {
    case DL_CAPAB_HCKSUM:
        ill_capability_hcksum_ack(ill, mp, subp);
        break;
    case DL_CAPAB_ZEROCOPY:
        ill_capability_zerocopy_ack(ill, mp, subp);
        break;
    case DL_CAPAB_DLD:
        ill_capability_dld_ack(ill, mp, subp);
        break;
    case DL_CAPAB_VRRP:
        break;
    default:
        ip1dbg(("ill_capability_dispatch: unknown capab type %d\n",
            subp->dl_cap));
    }
}

/*
 * Process the vrrp capability received from a DLS Provider. isub must point
 * to the sub-capability (DL_CAPAB_VRRP) of a DL_CAPABILITY_ACK message.
 */
static void
ill_capability_vrrp_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
    dl_capab_vrrp_t *vrrp;
    uint_t sub_dl_cap = isub->dl_cap;
    uint8_t *capend;

    ASSERT(IAM_WRITER_ILL(ill));
    ASSERT(sub_dl_cap == DL_CAPAB_VRRP);

    /*
     * Note: range checks here are not absolutely sufficient to
     * make us robust against malformed messages sent by drivers;
     * this is in keeping with the rest of IP's dlpi handling.
     * (Remember, it's coming from something else in the kernel
     * address space)
     */
    capend = (uint8_t *)(isub + 1) + isub->dl_length;
    if (capend > mp->b_wptr) {
        cmn_err(CE_WARN, "ill_capability_vrrp_ack: "
            "malformed sub-capability too long for mblk");
        return;
    }
    vrrp = (dl_capab_vrrp_t *)(isub + 1);

    /*
     * Compare the IP address family and set ILLF_VRRP for the right ill.
     */
    if ((vrrp->vrrp_af == AF_INET6 && ill->ill_isv6) ||
        (vrrp->vrrp_af == AF_INET && !ill->ill_isv6)) {
        ill->ill_flags |= ILLF_VRRP;
    }
}

/*
 * Process a hardware checksum offload capability negotiation ack received
 * from a DLS Provider. isub must point to the sub-capability
 * (DL_CAPAB_HCKSUM) of a DL_CAPABILITY_ACK message.
 */
static void
ill_capability_hcksum_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
    dl_capability_req_t *ocap;
    dl_capab_hcksum_t *ihck, *ohck;
    ill_hcksum_capab_t **ill_hcksum;
    mblk_t *nmp = NULL;
    uint_t sub_dl_cap = isub->dl_cap;
    uint8_t *capend;

    ASSERT(sub_dl_cap == DL_CAPAB_HCKSUM);

    ill_hcksum = (ill_hcksum_capab_t **)&ill->ill_hcksum_capab;

    /*
     * Note: range checks here are not absolutely sufficient to
     * make us robust against malformed messages sent by drivers;
     * this is in keeping with the rest of IP's dlpi handling.
     * (Remember, it's coming from something else in the kernel
     * address space)
     */
    capend = (uint8_t *)(isub + 1) + isub->dl_length;
    if (capend > mp->b_wptr) {
        cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
            "malformed sub-capability too long for mblk");
        return;
    }

    /*
     * There are two types of acks we process here:
     * 1. acks in reply to a (first form) generic capability req
     *    (no ENABLE flag set)
     * 2. acks in reply to a ENABLE capability req.
     *    (ENABLE flag set)
     */
    ihck = (dl_capab_hcksum_t *)(isub + 1);

    if (ihck->hcksum_version != HCKSUM_VERSION_1) {
        cmn_err(CE_CONT, "ill_capability_hcksum_ack: "
            "unsupported hardware checksum "
            "sub-capability (version %d, expected %d)",
            ihck->hcksum_version, HCKSUM_VERSION_1);
        return;
    }

    if (!dlcapabcheckqid(&ihck->hcksum_mid, ill->ill_lmod_rq)) {
        ip1dbg(("ill_capability_hcksum_ack: mid token for hardware "
            "checksum capability isn't as expected; pass-thru "
            "module(s) detected, discarding capability\n"));
        return;
    }

#define CURR_HCKSUM_CAPAB \
    (HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4 | \
    HCKSUM_INET_FULL_V6 | HCKSUM_IPHDRCKSUM)

    if ((ihck->hcksum_txflags & HCKSUM_ENABLE) &&
        (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB)) {
        /* do ENABLE processing */
        if (*ill_hcksum == NULL) {
            *ill_hcksum = kmem_zalloc(sizeof (ill_hcksum_capab_t),
                KM_NOSLEEP);

            if (*ill_hcksum == NULL) {
                cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
                    "could not enable hcksum version %d "
                    "for %s (ENOMEM)\n", HCKSUM_CURRENT_VERSION,
                    ill->ill_name);
                return;
            }
        }

        (*ill_hcksum)->ill_hcksum_version = ihck->hcksum_version;
        (*ill_hcksum)->ill_hcksum_txflags = ihck->hcksum_txflags;
        ill->ill_capabilities |= ILL_CAPAB_HCKSUM;
        ip1dbg(("ill_capability_hcksum_ack: interface %s "
            "has enabled hardware checksumming\n ",
            ill->ill_name));
    } else if (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB) {
        /*
         * Enabling hardware checksum offload
         * Currently IP supports {TCP,UDP}/IPv4
         * partial and full cksum offload and
         * IPv4 header checksum offload.
         * Allocate new mblk which will
         * contain a new capability request
         * to enable hardware checksum offload.
         */
        uint_t size;
        uchar_t *rptr;

        size = sizeof (dl_capability_req_t) +
            sizeof (dl_capability_sub_t) + isub->dl_length;

        if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
            cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
                "could not enable hardware cksum for %s (ENOMEM)\n",
                ill->ill_name);
            return;
        }

        rptr = nmp->b_rptr;
        /* initialize dl_capability_req_t */
        ocap = (dl_capability_req_t *)nmp->b_rptr;
        ocap->dl_sub_offset =
            sizeof (dl_capability_req_t);
        ocap->dl_sub_length =
            sizeof (dl_capability_sub_t) +
            isub->dl_length;
        nmp->b_rptr += sizeof (dl_capability_req_t);

        /* initialize dl_capability_sub_t */
        bcopy(isub, nmp->b_rptr, sizeof (*isub));
        nmp->b_rptr += sizeof (*isub);

        /* initialize dl_capab_hcksum_t */
        ohck = (dl_capab_hcksum_t *)nmp->b_rptr;
        bcopy(ihck, ohck, sizeof (*ihck));

        nmp->b_rptr = rptr;
        ASSERT(nmp->b_wptr == (nmp->b_rptr + size));

        /* Set ENABLE flag */
        ohck->hcksum_txflags &= CURR_HCKSUM_CAPAB;
        ohck->hcksum_txflags |= HCKSUM_ENABLE;

        /*
         * nmp points to a DL_CAPABILITY_REQ message to enable
         * hardware checksum acceleration.
         */
        ill_capability_send(ill, nmp);
    } else {
        ip1dbg(("ill_capability_hcksum_ack: interface %s has "
            "advertised %x hardware checksum capability flags\n",
            ill->ill_name, ihck->hcksum_txflags));
    }
}

static void
ill_capability_hcksum_reset_fill(ill_t *ill, mblk_t *mp)
{
    dl_capab_hcksum_t *hck_subcap;
    dl_capability_sub_t *dl_subcap;

    if (!ILL_HCKSUM_CAPABLE(ill))
        return;

    ASSERT(ill->ill_hcksum_capab != NULL);

    dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
    dl_subcap->dl_cap = DL_CAPAB_HCKSUM;
    dl_subcap->dl_length = sizeof (*hck_subcap);

    hck_subcap = (dl_capab_hcksum_t *)(dl_subcap + 1);
    hck_subcap->hcksum_version = ill->ill_hcksum_capab->ill_hcksum_version;
    hck_subcap->hcksum_txflags = 0;

    mp->b_wptr += sizeof (*dl_subcap) + sizeof (*hck_subcap);
}

static void
ill_capability_zerocopy_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
    mblk_t *nmp = NULL;
    dl_capability_req_t *oc;
    dl_capab_zerocopy_t *zc_ic, *zc_oc;
    ill_zerocopy_capab_t **ill_zerocopy_capab;
    uint_t sub_dl_cap = isub->dl_cap;
    uint8_t *capend;

    ASSERT(sub_dl_cap == DL_CAPAB_ZEROCOPY);

    ill_zerocopy_capab = (ill_zerocopy_capab_t **)&ill->ill_zerocopy_capab;

    /*
     * Note: range checks here are not absolutely sufficient to
     * make us robust against malformed messages sent by drivers;
     * this is in keeping with the rest of IP's dlpi handling.
     * (Remember, it's coming from something else in the kernel
     * address space)
     */
    capend = (uint8_t *)(isub + 1) + isub->dl_length;
    if (capend > mp->b_wptr) {
        cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
            "malformed sub-capability too long for mblk");
        return;
    }

    zc_ic = (dl_capab_zerocopy_t *)(isub + 1);
    if (zc_ic->zerocopy_version != ZEROCOPY_VERSION_1) {
        cmn_err(CE_CONT, "ill_capability_zerocopy_ack: "
            "unsupported ZEROCOPY sub-capability (version %d, "
            "expected %d)", zc_ic->zerocopy_version,
            ZEROCOPY_VERSION_1);
        return;
    }

    if (!dlcapabcheckqid(&zc_ic->zerocopy_mid, ill->ill_lmod_rq)) {
        ip1dbg(("ill_capability_zerocopy_ack: mid token for zerocopy "
            "capability isn't as expected; pass-thru module(s) "
            "detected, discarding capability\n"));
        return;
    }

    if ((zc_ic->zerocopy_flags & DL_CAPAB_VMSAFE_MEM) != 0) {
        if (*ill_zerocopy_capab == NULL) {
            *ill_zerocopy_capab =
                kmem_zalloc(sizeof (ill_zerocopy_capab_t),
                KM_NOSLEEP);

            if (*ill_zerocopy_capab == NULL) {
                cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
                    "could not enable Zero-copy version %d "
                    "for %s (ENOMEM)\n", ZEROCOPY_VERSION_1,
                    ill->ill_name);
                return;
            }
        }

        ip1dbg(("ill_capability_zerocopy_ack: interface %s "
            "supports Zero-copy version %d\n", ill->ill_name,
            ZEROCOPY_VERSION_1));

        (*ill_zerocopy_capab)->ill_zerocopy_version =
            zc_ic->zerocopy_version;
        (*ill_zerocopy_capab)->ill_zerocopy_flags =
            zc_ic->zerocopy_flags;

        ill->ill_capabilities |= ILL_CAPAB_ZEROCOPY;
    } else {
        uint_t size;
        uchar_t *rptr;

        size = sizeof (dl_capability_req_t) +
            sizeof (dl_capability_sub_t) +
            sizeof (dl_capab_zerocopy_t);

        if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
            cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
                "could not enable zerocopy for %s (ENOMEM)\n",
                ill->ill_name);
            return;
        }

        rptr = nmp->b_rptr;
        /* initialize dl_capability_req_t */
        oc = (dl_capability_req_t *)rptr;
        oc->dl_sub_offset = sizeof (dl_capability_req_t);
        oc->dl_sub_length = sizeof (dl_capability_sub_t) +
            sizeof (dl_capab_zerocopy_t);
        rptr += sizeof (dl_capability_req_t);

        /* initialize dl_capability_sub_t */
        bcopy(isub, rptr, sizeof (*isub));
        rptr += sizeof (*isub);

        /* initialize dl_capab_zerocopy_t */
        zc_oc = (dl_capab_zerocopy_t *)rptr;
        *zc_oc = *zc_ic;

        ip1dbg(("ill_capability_zerocopy_ack: asking interface %s "
            "to enable zero-copy version %d\n", ill->ill_name,
            ZEROCOPY_VERSION_1));

        /* set VMSAFE_MEM flag */
        zc_oc->zerocopy_flags |= DL_CAPAB_VMSAFE_MEM;

        /* nmp points to a DL_CAPABILITY_REQ message to enable zcopy */
        ill_capability_send(ill, nmp);
    }
}

static void
ill_capability_zerocopy_reset_fill(ill_t *ill, mblk_t *mp)
{
    dl_capab_zerocopy_t *zerocopy_subcap;
    dl_capability_sub_t *dl_subcap;

    if (!(ill->ill_capabilities & ILL_CAPAB_ZEROCOPY))
        return;

    ASSERT(ill->ill_zerocopy_capab != NULL);

    dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
    dl_subcap->dl_cap = DL_CAPAB_ZEROCOPY;
    dl_subcap->dl_length = sizeof (*zerocopy_subcap);

    zerocopy_subcap = (dl_capab_zerocopy_t *)(dl_subcap + 1);
    zerocopy_subcap->zerocopy_version =
        ill->ill_zerocopy_capab->ill_zerocopy_version;
1887 zerocopy_subcap->zerocopy_flags = 0;
1888
1889 mp->b_wptr += sizeof (*dl_subcap) + sizeof (*zerocopy_subcap);
1890 }
1891
1892 /*
1893 * DLD capability
1894 * Refer to dld.h for more information regarding the purpose and usage
1895 * of this capability.
1896 */
1897 static void
1898 ill_capability_dld_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
1899 {
1900 dl_capab_dld_t *dld_ic, dld;
1901 uint_t sub_dl_cap = isub->dl_cap;
1902 uint8_t *capend;
1903 ill_dld_capab_t *idc;
1904
1905 ASSERT(IAM_WRITER_ILL(ill));
1906 ASSERT(sub_dl_cap == DL_CAPAB_DLD);
1907
1908 /*
1909 * Note: range checks here are not absolutely sufficient to
1910 * make us robust against malformed messages sent by drivers;
1911 * this is in keeping with the rest of IP's dlpi handling.
1912 * (Remember, it's coming from something else in the kernel
1913 * address space)
1914 */
1915 capend = (uint8_t *)(isub + 1) + isub->dl_length;
1916 if (capend > mp->b_wptr) {
1917 cmn_err(CE_WARN, "ill_capability_dld_ack: "
1918 "malformed sub-capability too long for mblk");
1919 return;
1920 }
1921 dld_ic = (dl_capab_dld_t *)(isub + 1);
1922 if (dld_ic->dld_version != DLD_CURRENT_VERSION) {
1923 cmn_err(CE_CONT, "ill_capability_dld_ack: "
1924 "unsupported DLD sub-capability (version %d, "
1925 "expected %d)", dld_ic->dld_version,
1926 DLD_CURRENT_VERSION);
1927 return;
1928 }
1929 if (!dlcapabcheckqid(&dld_ic->dld_mid, ill->ill_lmod_rq)) {
1930 ip1dbg(("ill_capability_dld_ack: mid token for dld "
1931 "capability isn't as expected; pass-thru module(s) "
1932 "detected, discarding capability\n"));
1933 return;
1934 }
1935
1936 /*
1937 * Copy locally to ensure alignment.
1938 */
1939 bcopy(dld_ic, &dld, sizeof (dl_capab_dld_t));
1940
1941 if ((idc = ill->ill_dld_capab) == NULL) {
1942 idc = kmem_zalloc(sizeof (ill_dld_capab_t), KM_NOSLEEP);
1943 if (idc == NULL) {
1944 cmn_err(CE_WARN, "ill_capability_dld_ack: "
1945 "could not enable DLD version %d "
1946 "for %s (ENOMEM)\n", DLD_CURRENT_VERSION,
1947 ill->ill_name);
1948 return;
1949 }
1950 ill->ill_dld_capab = idc;
1951 }
1952 idc->idc_capab_df = (ip_capab_func_t)dld.dld_capab;
1953 idc->idc_capab_dh = (void *)dld.dld_capab_handle;
1954 ip1dbg(("ill_capability_dld_ack: interface %s "
1955 "supports DLD version %d\n", ill->ill_name, DLD_CURRENT_VERSION));
1956
1957 ill_capability_dld_enable(ill);
1958 }
1959
1960 /*
1961 * Typically capability negotiation between IP and the driver happens via
1962 * DLPI message exchange. However GLD also offers a direct function call
1963 * mechanism to exchange the DLD_DIRECT_CAPAB and DLD_POLL_CAPAB capabilities,
1964 * But arbitrary function calls into IP or GLD are not permitted, since both
1965 * of them are protected by their own perimeter mechanism. The perimeter can
1966 * be viewed as a coarse lock or serialization mechanism. The hierarchy of
1967 * these perimeters is IP -> MAC. Thus for example to enable the squeue
1968 * polling, IP needs to enter its perimeter, then call ill_mac_perim_enter
1969 * to enter the mac perimeter and then do the direct function calls into
1970 * GLD to enable squeue polling. The ring related callbacks from the mac into
1971 * the stack to add, bind, quiesce, restart or cleanup a ring are all
1972 * protected by the mac perimeter.
1973 */
1974 static void
1975 ill_mac_perim_enter(ill_t *ill, mac_perim_handle_t *mphp)
1976 {
1977 ill_dld_capab_t *idc = ill->ill_dld_capab;
1978 int err;
1979
1980 err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mphp,
1981 DLD_ENABLE);
1982 ASSERT(err == 0);
1983 }
1984
1985 static void
1986 ill_mac_perim_exit(ill_t *ill, mac_perim_handle_t mph)
1987 {
1988 ill_dld_capab_t *idc = ill->ill_dld_capab;
1989 int err;
1990
1991 err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mph,
1992 DLD_DISABLE);
1993 ASSERT(err == 0);
1994 }
1995
1996 boolean_t
1997 ill_mac_perim_held(ill_t *ill)
1998 {
1999 ill_dld_capab_t *idc = ill->ill_dld_capab;
2000
2001 return (idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, NULL,
2002 DLD_QUERY));
2003 }
2004
2005 static void
2006 ill_capability_direct_enable(ill_t *ill)
2007 {
2008 ill_dld_capab_t *idc = ill->ill_dld_capab;
2009 ill_dld_direct_t *idd = &idc->idc_direct;
2010 dld_capab_direct_t direct;
2011 int rc;
2012
2013 ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill));
2014
2015 bzero(&direct, sizeof (direct));
2016 direct.di_rx_cf = (uintptr_t)ip_input;
2017 direct.di_rx_ch = ill;
2018
2019 rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT, &direct,
2020 DLD_ENABLE);
2021 if (rc == 0) {
2022 idd->idd_tx_df = (ip_dld_tx_t)direct.di_tx_df;
2023 idd->idd_tx_dh = direct.di_tx_dh;
2024 idd->idd_tx_cb_df = (ip_dld_callb_t)direct.di_tx_cb_df;
2025 idd->idd_tx_cb_dh = direct.di_tx_cb_dh;
2026 idd->idd_tx_fctl_df = (ip_dld_fctl_t)direct.di_tx_fctl_df;
2027 idd->idd_tx_fctl_dh = direct.di_tx_fctl_dh;
2028 ASSERT(idd->idd_tx_cb_df != NULL);
2029 ASSERT(idd->idd_tx_fctl_df != NULL);
2030 ASSERT(idd->idd_tx_df != NULL);
2031 /*
2032 * One time registration of flow enable callback function
2033 */
2034 ill->ill_flownotify_mh = idd->idd_tx_cb_df(idd->idd_tx_cb_dh,
2035 ill_flow_enable, ill);
2036 ill->ill_capabilities |= ILL_CAPAB_DLD_DIRECT;
2037 DTRACE_PROBE1(direct_on, (ill_t *), ill);
2038 } else {
2039 cmn_err(CE_WARN, "warning: could not enable DIRECT "
2040 "capability, rc = %d\n", rc);
2041 DTRACE_PROBE2(direct_off, (ill_t *), ill, (int), rc);
2042 }
2043 }
2044
2045 static void
2046 ill_capability_poll_enable(ill_t *ill)
2047 {
2048 ill_dld_capab_t *idc = ill->ill_dld_capab;
2049 dld_capab_poll_t poll;
2050 int rc;
2051
2052 ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill));
2053
2054 bzero(&poll, sizeof (poll));
2055 poll.poll_ring_add_cf = (uintptr_t)ip_squeue_add_ring;
2056 poll.poll_ring_remove_cf = (uintptr_t)ip_squeue_clean_ring;
2057 poll.poll_ring_quiesce_cf = (uintptr_t)ip_squeue_quiesce_ring;
2058 poll.poll_ring_restart_cf = (uintptr_t)ip_squeue_restart_ring;
2059 poll.poll_ring_bind_cf = (uintptr_t)ip_squeue_bind_ring;
2060 poll.poll_ring_ch = ill;
2061 rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL, &poll,
2062 DLD_ENABLE);
2063 if (rc == 0) {
2064 ill->ill_capabilities |= ILL_CAPAB_DLD_POLL;
2065 DTRACE_PROBE1(poll_on, (ill_t *), ill);
2066 } else {
2067 ip1dbg(("warning: could not enable POLL "
2068 "capability, rc = %d\n", rc));
2069 DTRACE_PROBE2(poll_off, (ill_t *), ill, (int), rc);
2070 }
2071 }
2072
2073 /*
2074 * Enable the LSO capability.
2075 */
2076 static void
2077 ill_capability_lso_enable(ill_t *ill)
2078 {
2079 ill_dld_capab_t *idc = ill->ill_dld_capab;
2080 dld_capab_lso_t lso;
2081 int rc;
2082
2083 ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill));
2084
2085 if (ill->ill_lso_capab == NULL) {
2086 ill->ill_lso_capab = kmem_zalloc(sizeof (ill_lso_capab_t),
2087 KM_NOSLEEP);
2088 if (ill->ill_lso_capab == NULL) {
2089 cmn_err(CE_WARN, "ill_capability_lso_enable: "
2090 "could not enable LSO for %s (ENOMEM)\n",
2091 ill->ill_name);
2092 return;
2093 }
2094 }
2095
2096 bzero(&lso, sizeof (lso));
2097 if ((rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO, &lso,
2098 DLD_ENABLE)) == 0) {
2099 ill->ill_lso_capab->ill_lso_flags = lso.lso_flags;
2100 ill->ill_lso_capab->ill_lso_max = lso.lso_max;
2101 ill->ill_capabilities |= ILL_CAPAB_LSO;
2102 ip1dbg(("ill_capability_lso_enable: interface %s "
2103 "has enabled LSO\n ", ill->ill_name));
2104 } else {
2105 kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t));
2106 ill->ill_lso_capab = NULL;
2107 DTRACE_PROBE2(lso_off, (ill_t *), ill, (int), rc);
2108 }
2109 }
2110
2111 static void
2112 ill_capability_dld_enable(ill_t *ill)
2113 {
2114 mac_perim_handle_t mph;
2115
2116 ASSERT(IAM_WRITER_ILL(ill));
2117
	if (ill->ill_isv6)
		return;

	ill_mac_perim_enter(ill, &mph);
	ill_capability_direct_enable(ill);
	ill_capability_poll_enable(ill);
	ill_capability_lso_enable(ill);
2127 ill->ill_capabilities |= ILL_CAPAB_DLD;
2128 ill_mac_perim_exit(ill, mph);
2129 }
2130
2131 static void
2132 ill_capability_dld_disable(ill_t *ill)
2133 {
2134 ill_dld_capab_t *idc;
2135 ill_dld_direct_t *idd;
2136 mac_perim_handle_t mph;
2137
2138 ASSERT(IAM_WRITER_ILL(ill));
2139
2140 if (!(ill->ill_capabilities & ILL_CAPAB_DLD))
2141 return;
2142
2143 ill_mac_perim_enter(ill, &mph);
2144
2145 idc = ill->ill_dld_capab;
2146 if ((ill->ill_capabilities & ILL_CAPAB_DLD_DIRECT) != 0) {
2147 /*
2148 * For performance we avoid locks in the transmit data path
2149 * and don't maintain a count of the number of threads using
2150 * direct calls. Thus some threads could be using direct
2151 * transmit calls to GLD, even after the capability mechanism
2152 * turns it off. This is still safe since the handles used in
2153 * the direct calls continue to be valid until the unplumb is
2154 * completed. Remove the callback that was added (1-time) at
2155 * capab enable time.
2156 */
2157 mutex_enter(&ill->ill_lock);
2158 ill->ill_capabilities &= ~ILL_CAPAB_DLD_DIRECT;
2159 mutex_exit(&ill->ill_lock);
2160 if (ill->ill_flownotify_mh != NULL) {
2161 idd = &idc->idc_direct;
2162 idd->idd_tx_cb_df(idd->idd_tx_cb_dh, NULL,
2163 ill->ill_flownotify_mh);
2164 ill->ill_flownotify_mh = NULL;
2165 }
2166 (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT,
2167 NULL, DLD_DISABLE);
2168 }
2169
2170 if ((ill->ill_capabilities & ILL_CAPAB_DLD_POLL) != 0) {
2171 ill->ill_capabilities &= ~ILL_CAPAB_DLD_POLL;
2172 ip_squeue_clean_all(ill);
2173 (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL,
2174 NULL, DLD_DISABLE);
2175 }
2176
2177 if ((ill->ill_capabilities & ILL_CAPAB_LSO) != 0) {
2178 ASSERT(ill->ill_lso_capab != NULL);
2179 /*
2180 * Clear the capability flag for LSO but retain the
2181 * ill_lso_capab structure since it's possible that another
2182 * thread is still referring to it. The structure only gets
2183 * deallocated when we destroy the ill.
2184 */
2185
2186 ill->ill_capabilities &= ~ILL_CAPAB_LSO;
2187 (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO,
2188 NULL, DLD_DISABLE);
2189 }
2190
2191 ill->ill_capabilities &= ~ILL_CAPAB_DLD;
2192 ill_mac_perim_exit(ill, mph);
2193 }
2194
2195 /*
2196 * Capability Negotiation protocol
2197 *
 * We don't wait for DLPI capability operations to finish during interface
 * bringup or teardown. Doing so would introduce more asynchrony, and the
 * interface up/down operations would need multiple returns and restarts.
 * Instead, the 'ipsq_current_ipif' of the ipsq is not cleared as long as
2202 * the 'ill_dlpi_deferred' chain is non-empty. This ensures that the next
2203 * exclusive operation won't start until the DLPI operations of the previous
2204 * exclusive operation complete.
2205 *
2206 * The capability state machine is shown below.
2207 *
2208 * state next state event, action
2209 *
2210 * IDCS_UNKNOWN IDCS_PROBE_SENT ill_capability_probe
2211 * IDCS_PROBE_SENT IDCS_OK ill_capability_ack
2212 * IDCS_PROBE_SENT IDCS_FAILED ip_rput_dlpi_writer (nack)
2213 * IDCS_OK IDCS_RENEG Receipt of DL_NOTE_CAPAB_RENEG
2214 * IDCS_OK IDCS_RESET_SENT ill_capability_reset
2215 * IDCS_RESET_SENT IDCS_UNKNOWN ill_capability_ack_thr
2216 * IDCS_RENEG IDCS_PROBE_SENT ill_capability_ack_thr ->
2217 * ill_capability_probe.
2218 */
2219
2220 /*
2221 * Dedicated thread started from ip_stack_init that handles capability
2222 * disable. This thread ensures the taskq dispatch does not fail by waiting
2223 * for resources using TQ_SLEEP. The taskq mechanism is used to ensure
2224 * that direct calls to DLD are done in a cv_waitable context.
2225 */
2226 void
2227 ill_taskq_dispatch(ip_stack_t *ipst)
2228 {
2229 callb_cpr_t cprinfo;
2230 char name[64];
2231 mblk_t *mp;
2232
2233 (void) snprintf(name, sizeof (name), "ill_taskq_dispatch_%d",
2234 ipst->ips_netstack->netstack_stackid);
2235 CALLB_CPR_INIT(&cprinfo, &ipst->ips_capab_taskq_lock, callb_generic_cpr,
2236 name);
2237 mutex_enter(&ipst->ips_capab_taskq_lock);
2238
2239 for (;;) {
2240 mp = ipst->ips_capab_taskq_head;
2241 while (mp != NULL) {
2242 ipst->ips_capab_taskq_head = mp->b_next;
2243 if (ipst->ips_capab_taskq_head == NULL)
2244 ipst->ips_capab_taskq_tail = NULL;
2245 mutex_exit(&ipst->ips_capab_taskq_lock);
2246 mp->b_next = NULL;
2247
2248 VERIFY(taskq_dispatch(system_taskq,
2249 ill_capability_ack_thr, mp, TQ_SLEEP) != 0);
2250 mutex_enter(&ipst->ips_capab_taskq_lock);
2251 mp = ipst->ips_capab_taskq_head;
2252 }
2253
2254 if (ipst->ips_capab_taskq_quit)
2255 break;
2256 CALLB_CPR_SAFE_BEGIN(&cprinfo);
2257 cv_wait(&ipst->ips_capab_taskq_cv, &ipst->ips_capab_taskq_lock);
2258 CALLB_CPR_SAFE_END(&cprinfo, &ipst->ips_capab_taskq_lock);
2259 }
2260 VERIFY(ipst->ips_capab_taskq_head == NULL);
2261 VERIFY(ipst->ips_capab_taskq_tail == NULL);
2262 CALLB_CPR_EXIT(&cprinfo);
2263 thread_exit();
2264 }
2265
2266 /*
2267 * Consume a new-style hardware capabilities negotiation ack.
2268 * Called via taskq on receipt of DL_CAPABILITY_ACK.
2269 */
2270 static void
2271 ill_capability_ack_thr(void *arg)
2272 {
2273 mblk_t *mp = arg;
2274 dl_capability_ack_t *capp;
2275 dl_capability_sub_t *subp, *endp;
2276 ill_t *ill;
2277 boolean_t reneg;
2278
2279 ill = (ill_t *)mp->b_prev;
2280 mp->b_prev = NULL;
2281
2282 VERIFY(ipsq_enter(ill, B_FALSE, CUR_OP) == B_TRUE);
2283
2284 if (ill->ill_dlpi_capab_state == IDCS_RESET_SENT ||
2285 ill->ill_dlpi_capab_state == IDCS_RENEG) {
2286 /*
2287 * We have received the ack for our DL_CAPAB reset request.
		 * There isn't anything in the message that needs processing.
2289 * All message based capabilities have been disabled, now
2290 * do the function call based capability disable.
2291 */
2292 reneg = ill->ill_dlpi_capab_state == IDCS_RENEG;
2293 ill_capability_dld_disable(ill);
2294 ill->ill_dlpi_capab_state = IDCS_UNKNOWN;
2295 if (reneg)
2296 ill_capability_probe(ill);
2297 goto done;
2298 }
2299
2300 if (ill->ill_dlpi_capab_state == IDCS_PROBE_SENT)
2301 ill->ill_dlpi_capab_state = IDCS_OK;
2302
2303 capp = (dl_capability_ack_t *)mp->b_rptr;
2304
2305 if (capp->dl_sub_length == 0) {
2306 /* no new-style capabilities */
2307 goto done;
2308 }
2309
	/* make sure the driver supplied a correct dl_sub_length */
2311 if ((sizeof (*capp) + capp->dl_sub_length) > MBLKL(mp)) {
2312 ip0dbg(("ill_capability_ack: bad DL_CAPABILITY_ACK, "
2313 "invalid dl_sub_length (%d)\n", capp->dl_sub_length));
2314 goto done;
2315 }
2316
2317 #define SC(base, offset) (dl_capability_sub_t *)(((uchar_t *)(base))+(offset))
2318 /*
2319 * There are sub-capabilities. Process the ones we know about.
	 * Loop until we don't have room for another sub-cap header.
2321 */
2322 for (subp = SC(capp, capp->dl_sub_offset),
2323 endp = SC(subp, capp->dl_sub_length - sizeof (*subp));
2324 subp <= endp;
2325 subp = SC(subp, sizeof (dl_capability_sub_t) + subp->dl_length)) {
2326
2327 switch (subp->dl_cap) {
2328 case DL_CAPAB_ID_WRAPPER:
2329 ill_capability_id_ack(ill, mp, subp);
2330 break;
2331 default:
2332 ill_capability_dispatch(ill, mp, subp);
2333 break;
2334 }
2335 }
2336 #undef SC
2337 done:
2338 inet_freemsg(mp);
2339 ill_capability_done(ill);
2340 ipsq_exit(ill->ill_phyint->phyint_ipsq);
2341 }
2342
2343 /*
2344 * This needs to be started in a taskq thread to provide a cv_waitable
2345 * context.
2346 */
2347 void
2348 ill_capability_ack(ill_t *ill, mblk_t *mp)
2349 {
2350 ip_stack_t *ipst = ill->ill_ipst;
2351
2352 mp->b_prev = (mblk_t *)ill;
2353 ASSERT(mp->b_next == NULL);
2354
2355 if (taskq_dispatch(system_taskq, ill_capability_ack_thr, mp,
2356 TQ_NOSLEEP) != 0)
2357 return;
2358
2359 /*
2360 * The taskq dispatch failed. Signal the ill_taskq_dispatch thread
2361 * which will do the dispatch using TQ_SLEEP to guarantee success.
2362 */
2363 mutex_enter(&ipst->ips_capab_taskq_lock);
2364 if (ipst->ips_capab_taskq_head == NULL) {
2365 ASSERT(ipst->ips_capab_taskq_tail == NULL);
2366 ipst->ips_capab_taskq_head = mp;
2367 } else {
2368 ipst->ips_capab_taskq_tail->b_next = mp;
2369 }
2370 ipst->ips_capab_taskq_tail = mp;
2371
2372 cv_signal(&ipst->ips_capab_taskq_cv);
2373 mutex_exit(&ipst->ips_capab_taskq_lock);
2374 }
2375
2376 /*
2377 * This routine is called to scan the fragmentation reassembly table for
2378 * the specified ILL for any packets that are starting to smell.
2379 * dead_interval is the maximum time in seconds that will be tolerated. It
2380 * will either be the value specified in ip_g_frag_timeout, or zero if the
2381 * ILL is shutting down and it is time to blow everything off.
2382 *
 * It returns the number of seconds (as a time_t) that the next frag timer
 * should be scheduled for, 0 meaning that the timer doesn't need to be
 * re-started. Note that the method of calculating next_timeout isn't
 * entirely accurate since time will flow between the time we grab
 * current_time and the time we schedule the next timeout. This isn't a
 * big problem since this is the timer for sending ICMP reassembly time
 * exceeded messages, and it doesn't have to be exactly accurate.
 *
 * This function is sometimes called as writer, although this is not
 * required.
2393 */
2394 time_t
2395 ill_frag_timeout(ill_t *ill, time_t dead_interval)
2396 {
2397 ipfb_t *ipfb;
2398 ipfb_t *endp;
2399 ipf_t *ipf;
2400 ipf_t *ipfnext;
2401 mblk_t *mp;
2402 time_t current_time = gethrestime_sec();
2403 time_t next_timeout = 0;
2404 uint32_t hdr_length;
2405 mblk_t *send_icmp_head;
2406 mblk_t *send_icmp_head_v6;
2407 ip_stack_t *ipst = ill->ill_ipst;
2408 ip_recv_attr_t iras;
2409
2410 bzero(&iras, sizeof (iras));
2411 iras.ira_flags = 0;
2412 iras.ira_ill = iras.ira_rill = ill;
2413 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
2414 iras.ira_rifindex = iras.ira_ruifindex;
2415
2416 ipfb = ill->ill_frag_hash_tbl;
2417 if (ipfb == NULL)
		return (0);
2419 endp = &ipfb[ILL_FRAG_HASH_TBL_COUNT];
2420 /* Walk the frag hash table. */
2421 for (; ipfb < endp; ipfb++) {
2422 send_icmp_head = NULL;
2423 send_icmp_head_v6 = NULL;
2424 mutex_enter(&ipfb->ipfb_lock);
		while ((ipf = ipfb->ipfb_ipf) != NULL) {
2426 time_t frag_time = current_time - ipf->ipf_timestamp;
2427 time_t frag_timeout;
2428
2429 if (frag_time < dead_interval) {
2430 /*
2431 * There are some outstanding fragments
2432 * that will timeout later. Make note of
2433 * the time so that we can reschedule the
2434 * next timeout appropriately.
2435 */
2436 frag_timeout = dead_interval - frag_time;
2437 if (next_timeout == 0 ||
2438 frag_timeout < next_timeout) {
2439 next_timeout = frag_timeout;
2440 }
2441 break;
2442 }
2443 /* Time's up. Get it out of here. */
2444 hdr_length = ipf->ipf_nf_hdr_len;
2445 ipfnext = ipf->ipf_hash_next;
2446 if (ipfnext)
2447 ipfnext->ipf_ptphn = ipf->ipf_ptphn;
2448 *ipf->ipf_ptphn = ipfnext;
2449 mp = ipf->ipf_mp->b_cont;
2450 for (; mp; mp = mp->b_cont) {
2451 /* Extra points for neatness. */
2452 IP_REASS_SET_START(mp, 0);
2453 IP_REASS_SET_END(mp, 0);
2454 }
2455 mp = ipf->ipf_mp->b_cont;
2456 atomic_add_32(&ill->ill_frag_count, -ipf->ipf_count);
2457 ASSERT(ipfb->ipfb_count >= ipf->ipf_count);
2458 ipfb->ipfb_count -= ipf->ipf_count;
2459 ASSERT(ipfb->ipfb_frag_pkts > 0);
2460 ipfb->ipfb_frag_pkts--;
2461 /*
2462 * We do not send any icmp message from here because
2463 * we currently are holding the ipfb_lock for this
2464 * hash chain. If we try and send any icmp messages
2465 * from here we may end up via a put back into ip
2466 * trying to get the same lock, causing a recursive
2467 * mutex panic. Instead we build a list and send all
2468 * the icmp messages after we have dropped the lock.
2469 */
2470 if (ill->ill_isv6) {
2471 if (hdr_length != 0) {
2472 mp->b_next = send_icmp_head_v6;
2473 send_icmp_head_v6 = mp;
2474 } else {
2475 freemsg(mp);
2476 }
2477 } else {
2478 if (hdr_length != 0) {
2479 mp->b_next = send_icmp_head;
2480 send_icmp_head = mp;
2481 } else {
2482 freemsg(mp);
2483 }
2484 }
2485 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails);
2486 ip_drop_input("ipIfStatsReasmFails", ipf->ipf_mp, ill);
2487 freeb(ipf->ipf_mp);
2488 }
2489 mutex_exit(&ipfb->ipfb_lock);
2490 /*
2491 * Now need to send any icmp messages that we delayed from
2492 * above.
2493 */
2494 while (send_icmp_head_v6 != NULL) {
2495 ip6_t *ip6h;
2496
2497 mp = send_icmp_head_v6;
2498 send_icmp_head_v6 = send_icmp_head_v6->b_next;
2499 mp->b_next = NULL;
2500 ip6h = (ip6_t *)mp->b_rptr;
2501 iras.ira_flags = 0;
2502 /*
2503 * This will result in an incorrect ALL_ZONES zoneid
2504 * for multicast packets, but we
2505 * don't send ICMP errors for those in any case.
2506 */
2507 iras.ira_zoneid =
2508 ipif_lookup_addr_zoneid_v6(&ip6h->ip6_dst,
2509 ill, ipst);
2510 ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill);
2511 icmp_time_exceeded_v6(mp,
2512 ICMP_REASSEMBLY_TIME_EXCEEDED, B_FALSE,
2513 &iras);
2514 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
2515 }
2516 while (send_icmp_head != NULL) {
2517 ipaddr_t dst;
2518
2519 mp = send_icmp_head;
2520 send_icmp_head = send_icmp_head->b_next;
2521 mp->b_next = NULL;
2522
2523 dst = ((ipha_t *)mp->b_rptr)->ipha_dst;
2524
2525 iras.ira_flags = IRAF_IS_IPV4;
2526 /*
2527 * This will result in an incorrect ALL_ZONES zoneid
2528 * for broadcast and multicast packets, but we
2529 * don't send ICMP errors for those in any case.
2530 */
2531 iras.ira_zoneid = ipif_lookup_addr_zoneid(dst,
2532 ill, ipst);
2533 ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill);
2534 icmp_time_exceeded(mp,
2535 ICMP_REASSEMBLY_TIME_EXCEEDED, &iras);
2536 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
2537 }
2538 }
2539 /*
2540 * A non-dying ILL will use the return value to decide whether to
2541 * restart the frag timer, and for how long.
2542 */
2543 return (next_timeout);
2544 }
2545
2546 /*
2547 * This routine is called when the approximate count of mblk memory used
2548 * for the specified ILL has exceeded max_count.
2549 */
2550 void
2551 ill_frag_prune(ill_t *ill, uint_t max_count)
2552 {
2553 ipfb_t *ipfb;
2554 ipf_t *ipf;
2555 size_t count;
2556 clock_t now;
2557
2558 /*
2559 * If we are here within ip_min_frag_prune_time msecs remove
2560 * ill_frag_free_num_pkts oldest packets from each bucket and increment
2561 * ill_frag_free_num_pkts.
2562 */
2563 mutex_enter(&ill->ill_lock);
2564 now = ddi_get_lbolt();
2565 if (TICK_TO_MSEC(now - ill->ill_last_frag_clean_time) <=
2566 (ip_min_frag_prune_time != 0 ?
2567 ip_min_frag_prune_time : msec_per_tick)) {
2568
2569 ill->ill_frag_free_num_pkts++;
2570
2571 } else {
2572 ill->ill_frag_free_num_pkts = 0;
2573 }
2574 ill->ill_last_frag_clean_time = now;
2575 mutex_exit(&ill->ill_lock);
2576
2577 /*
2578 * free ill_frag_free_num_pkts oldest packets from each bucket.
2579 */
2580 if (ill->ill_frag_free_num_pkts != 0) {
2581 int ix;
2582
2583 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) {
2584 ipfb = &ill->ill_frag_hash_tbl[ix];
2585 mutex_enter(&ipfb->ipfb_lock);
2586 if (ipfb->ipfb_ipf != NULL) {
2587 ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf,
2588 ill->ill_frag_free_num_pkts);
2589 }
2590 mutex_exit(&ipfb->ipfb_lock);
2591 }
2592 }
2593 /*
2594 * While the reassembly list for this ILL is too big, prune a fragment
2595 * queue by age, oldest first.
2596 */
2597 while (ill->ill_frag_count > max_count) {
2598 int ix;
2599 ipfb_t *oipfb = NULL;
2600 uint_t oldest = UINT_MAX;
2601
2602 count = 0;
2603 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) {
2604 ipfb = &ill->ill_frag_hash_tbl[ix];
2605 mutex_enter(&ipfb->ipfb_lock);
2606 ipf = ipfb->ipfb_ipf;
2607 if (ipf != NULL && ipf->ipf_gen < oldest) {
2608 oldest = ipf->ipf_gen;
2609 oipfb = ipfb;
2610 }
2611 count += ipfb->ipfb_count;
2612 mutex_exit(&ipfb->ipfb_lock);
2613 }
2614 if (oipfb == NULL)
2615 break;
2616
2617 if (count <= max_count)
2618 return; /* Somebody beat us to it, nothing to do */
2619 mutex_enter(&oipfb->ipfb_lock);
2620 ipf = oipfb->ipfb_ipf;
2621 if (ipf != NULL) {
2622 ill_frag_free_pkts(ill, oipfb, ipf, 1);
2623 }
2624 mutex_exit(&oipfb->ipfb_lock);
2625 }
2626 }
2627
2628 /*
2629 * free 'free_cnt' fragmented packets starting at ipf.
2630 */
2631 void
2632 ill_frag_free_pkts(ill_t *ill, ipfb_t *ipfb, ipf_t *ipf, int free_cnt)
2633 {
2634 size_t count;
2635 mblk_t *mp;
2636 mblk_t *tmp;
2637 ipf_t **ipfp = ipf->ipf_ptphn;
2638
2639 ASSERT(MUTEX_HELD(&ipfb->ipfb_lock));
2640 ASSERT(ipfp != NULL);
2641 ASSERT(ipf != NULL);
2642
2643 while (ipf != NULL && free_cnt-- > 0) {
2644 count = ipf->ipf_count;
2645 mp = ipf->ipf_mp;
2646 ipf = ipf->ipf_hash_next;
2647 for (tmp = mp; tmp; tmp = tmp->b_cont) {
2648 IP_REASS_SET_START(tmp, 0);
2649 IP_REASS_SET_END(tmp, 0);
2650 }
2651 atomic_add_32(&ill->ill_frag_count, -count);
2652 ASSERT(ipfb->ipfb_count >= count);
2653 ipfb->ipfb_count -= count;
2654 ASSERT(ipfb->ipfb_frag_pkts > 0);
2655 ipfb->ipfb_frag_pkts--;
2656 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails);
2657 ip_drop_input("ipIfStatsReasmFails", mp, ill);
2658 freemsg(mp);
2659 }
2660
2661 if (ipf)
2662 ipf->ipf_ptphn = ipfp;
2663 ipfp[0] = ipf;
2664 }
2665
2666 /*
2667 * Helper function for ill_forward_set().
2668 */
2669 static void
2670 ill_forward_set_on_ill(ill_t *ill, boolean_t enable)
2671 {
2672 ip_stack_t *ipst = ill->ill_ipst;
2673
2674 ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock));
2675
2676 ip1dbg(("ill_forward_set: %s %s forwarding on %s",
2677 (enable ? "Enabling" : "Disabling"),
2678 (ill->ill_isv6 ? "IPv6" : "IPv4"), ill->ill_name));
2679 mutex_enter(&ill->ill_lock);
2680 if (enable)
2681 ill->ill_flags |= ILLF_ROUTER;
2682 else
2683 ill->ill_flags &= ~ILLF_ROUTER;
2684 mutex_exit(&ill->ill_lock);
2685 if (ill->ill_isv6)
2686 ill_set_nce_router_flags(ill, enable);
2687 /* Notify routing socket listeners of this change. */
2688 if (ill->ill_ipif != NULL)
2689 ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT);
2690 }
2691
2692 /*
2693 * Set an ill's ILLF_ROUTER flag appropriately. Send up RTS_IFINFO routing
2694 * socket messages for each interface whose flags we change.
2695 */
2696 int
2697 ill_forward_set(ill_t *ill, boolean_t enable)
2698 {
2699 ipmp_illgrp_t *illg;
2700 ip_stack_t *ipst = ill->ill_ipst;
2701
2702 ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock));
2703
2704 if ((enable && (ill->ill_flags & ILLF_ROUTER)) ||
2705 (!enable && !(ill->ill_flags & ILLF_ROUTER)))
2706 return (0);
2707
2708 if (IS_LOOPBACK(ill))
2709 return (EINVAL);
2710
2711 if (enable && ill->ill_allowed_ips_cnt > 0)
2712 return (EPERM);
2713
2714 if (IS_IPMP(ill) || IS_UNDER_IPMP(ill)) {
2715 /*
2716 * Update all of the interfaces in the group.
2717 */
2718 illg = ill->ill_grp;
2719 ill = list_head(&illg->ig_if);
2720 for (; ill != NULL; ill = list_next(&illg->ig_if, ill))
2721 ill_forward_set_on_ill(ill, enable);
2722
2723 /*
2724 * Update the IPMP meta-interface.
2725 */
2726 ill_forward_set_on_ill(ipmp_illgrp_ipmp_ill(illg), enable);
2727 return (0);
2728 }
2729
2730 ill_forward_set_on_ill(ill, enable);
2731 return (0);
2732 }
2733
2734 /*
2735 * Based on the ILLF_ROUTER flag of an ill, make sure all local nce's for
2736 * addresses assigned to the ill have the NCE_F_ISROUTER flag appropriately
2737 * set or clear.
2738 */
2739 static void
2740 ill_set_nce_router_flags(ill_t *ill, boolean_t enable)
2741 {
2742 ipif_t *ipif;
2743 ncec_t *ncec;
2744 nce_t *nce;
2745
2746 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
2747 /*
2748 * NOTE: we match across the illgrp because nce's for
2749 * addresses on IPMP interfaces have an nce_ill that points to
2750 * the bound underlying ill.
2751 */
2752 nce = nce_lookup_v6(ill, &ipif->ipif_v6lcl_addr);
2753 if (nce != NULL) {
2754 ncec = nce->nce_common;
2755 mutex_enter(&ncec->ncec_lock);
2756 if (enable)
2757 ncec->ncec_flags |= NCE_F_ISROUTER;
2758 else
2759 ncec->ncec_flags &= ~NCE_F_ISROUTER;
2760 mutex_exit(&ncec->ncec_lock);
2761 nce_refrele(nce);
2762 }
2763 }
2764 }
2765
2766 /*
2767 * Intializes the context structure and returns the first ill in the list
2768 * cuurently start_list and end_list can have values:
2769 * MAX_G_HEADS Traverse both IPV4 and IPV6 lists.
2770 * IP_V4_G_HEAD Traverse IPV4 list only.
2771 * IP_V6_G_HEAD Traverse IPV6 list only.
2772 */
2773
2774 /*
2775 * We don't check for CONDEMNED ills here. Caller must do that if
2776 * necessary under the ill lock.
2777 */
2778 ill_t *
2779 ill_first(int start_list, int end_list, ill_walk_context_t *ctx,
2780 ip_stack_t *ipst)
2781 {
2782 ill_if_t *ifp;
2783 ill_t *ill;
2784 avl_tree_t *avl_tree;
2785
2786 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
2787 ASSERT(end_list <= MAX_G_HEADS && start_list >= 0);
2788
2789 /*
2790 * setup the lists to search
2791 */
2792 if (end_list != MAX_G_HEADS) {
2793 ctx->ctx_current_list = start_list;
2794 ctx->ctx_last_list = end_list;
2795 } else {
2796 ctx->ctx_last_list = MAX_G_HEADS - 1;
2797 ctx->ctx_current_list = 0;
2798 }
2799
2800 while (ctx->ctx_current_list <= ctx->ctx_last_list) {
2801 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst);
2802 if (ifp != (ill_if_t *)
2803 &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) {
2804 avl_tree = &ifp->illif_avl_by_ppa;
2805 ill = avl_first(avl_tree);
2806 /*
			 * ill is guaranteed to be non-NULL; otherwise ifp
			 * would not exist.
2809 */
2810 ASSERT(ill != NULL);
2811 return (ill);
2812 }
2813 ctx->ctx_current_list++;
2814 }
2815
2816 return (NULL);
2817 }
2818
2819 /*
2820 * returns the next ill in the list. ill_first() must have been called
2821 * before calling ill_next() or bad things will happen.
2822 */
2823
2824 /*
2825 * We don't check for CONDEMNED ills here. Caller must do that if
2826 * necessary under the ill lock.
2827 */
2828 ill_t *
2829 ill_next(ill_walk_context_t *ctx, ill_t *lastill)
2830 {
2831 ill_if_t *ifp;
2832 ill_t *ill;
2833 ip_stack_t *ipst = lastill->ill_ipst;
2834
2835 ASSERT(lastill->ill_ifptr != (ill_if_t *)
2836 &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst));
2837 if ((ill = avl_walk(&lastill->ill_ifptr->illif_avl_by_ppa, lastill,
2838 AVL_AFTER)) != NULL) {
2839 return (ill);
2840 }
2841
	/* go to the next ill_if_t in the list. */
2843 ifp = lastill->ill_ifptr->illif_next;
2844
2845 /* make sure not at end of circular list */
2846 while (ifp ==
2847 (ill_if_t *)&IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) {
2848 if (++ctx->ctx_current_list > ctx->ctx_last_list)
2849 return (NULL);
2850 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst);
2851 }
2852
2853 return (avl_first(&ifp->illif_avl_by_ppa));
2854 }
2855
2856 /*
2857 * Check interface name for correct format: [a-zA-Z]+[a-zA-Z0-9._]*[0-9]+
2858 * The final number (PPA) must not have any leading zeros. Upon success, a
2859 * pointer to the start of the PPA is returned; otherwise NULL is returned.
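 * For example, "e1000g0" yields a pointer to the trailing "0"; "hme01"
 * is rejected for the leading zero in its PPA, and "0bge0" is rejected
 * because the name must start with an alphabetic character.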
2860 */
2861 static char *
2862 ill_get_ppa_ptr(char *name)
2863 {
2864 int namelen = strlen(name);
2865 int end_ndx = namelen - 1;
2866 int ppa_ndx, i;
2867
2868 /*
2869 * Check that the first character is [a-zA-Z], and that the last
2870 * character is [0-9].
2871 */
2872 if (namelen == 0 || !isalpha(name[0]) || !isdigit(name[end_ndx]))
2873 return (NULL);
2874
2875 /*
2876 * Set `ppa_ndx' to the PPA start, and check for leading zeroes.
2877 */
2878 for (ppa_ndx = end_ndx; ppa_ndx > 0; ppa_ndx--)
2879 if (!isdigit(name[ppa_ndx - 1]))
2880 break;
2881
2882 if (name[ppa_ndx] == '0' && ppa_ndx < end_ndx)
2883 return (NULL);
2884
2885 /*
	 * Check that the intermediate characters are [a-zA-Z0-9._]
2887 */
2888 for (i = 1; i < ppa_ndx; i++) {
2889 if (!isalpha(name[i]) && !isdigit(name[i]) &&
2890 name[i] != '.' && name[i] != '_') {
2891 return (NULL);
2892 }
2893 }
2894
2895 return (name + ppa_ndx);
2896 }
2897
2898 /*
2899 * use avl tree to locate the ill.
2900 */
2901 static ill_t *
2902 ill_find_by_name(char *name, boolean_t isv6, ip_stack_t *ipst)
2903 {
2904 char *ppa_ptr = NULL;
2905 int len;
2906 uint_t ppa;
2907 ill_t *ill = NULL;
2908 ill_if_t *ifp;
2909 int list;
2910
2911 /*
2912 * get ppa ptr
2913 */
2914 if (isv6)
2915 list = IP_V6_G_HEAD;
2916 else
2917 list = IP_V4_G_HEAD;
2918
2919 if ((ppa_ptr = ill_get_ppa_ptr(name)) == NULL) {
2920 return (NULL);
2921 }
2922
2923 len = ppa_ptr - name + 1;
2924
2925 ppa = stoi(&ppa_ptr);
2926
2927 ifp = IP_VX_ILL_G_LIST(list, ipst);
2928
2929 while (ifp != (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) {
2930 /*
2931 * match is done on len - 1 as the name is not null
2932 * terminated it contains ppa in addition to the interface
2933 * name.
2934 */
2935 if ((ifp->illif_name_len == len) &&
2936 bcmp(ifp->illif_name, name, len - 1) == 0) {
2937 break;
2938 } else {
2939 ifp = ifp->illif_next;
2940 }
2941 }
2942
2943 if (ifp == (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) {
2944 /*
		 * Not even the interface type exists.
2946 */
2947 return (NULL);
2948 }
2949
2950 ill = avl_find(&ifp->illif_avl_by_ppa, (void *) &ppa, NULL);
2951 if (ill != NULL) {
2952 mutex_enter(&ill->ill_lock);
2953 if (ILL_CAN_LOOKUP(ill)) {
2954 ill_refhold_locked(ill);
2955 mutex_exit(&ill->ill_lock);
2956 return (ill);
2957 }
2958 mutex_exit(&ill->ill_lock);
2959 }
2960 return (NULL);
2961 }
2962
2963 /*
2964 * comparison function for use with avl.
2965 */
2966 static int
2967 ill_compare_ppa(const void *ppa_ptr, const void *ill_ptr)
2968 {
2969 uint_t ppa;
2970 uint_t ill_ppa;
2971
2972 ASSERT(ppa_ptr != NULL && ill_ptr != NULL);
2973
2974 ppa = *((uint_t *)ppa_ptr);
2975 ill_ppa = ((const ill_t *)ill_ptr)->ill_ppa;
2976 /*
2977 * We want the ill with the lowest ppa to be on the
2978 * top.
2979 */
2980 if (ill_ppa < ppa)
2981 return (1);
2982 if (ill_ppa > ppa)
2983 return (-1);
2984 return (0);
2985 }
2986
2987 /*
2988 * remove an interface type from the global list.
2989 */
2990 static void
2991 ill_delete_interface_type(ill_if_t *interface)
2992 {
2993 ASSERT(interface != NULL);
2994 ASSERT(avl_numnodes(&interface->illif_avl_by_ppa) == 0);
2995
2996 avl_destroy(&interface->illif_avl_by_ppa);
2997 if (interface->illif_ppa_arena != NULL)
2998 vmem_destroy(interface->illif_ppa_arena);
2999
3000 remque(interface);
3001
3002 mi_free(interface);
3003 }
3004
3005 /*
3006 * remove ill from the global list.
3007 */
3008 static void
3009 ill_glist_delete(ill_t *ill)
3010 {
3011 ip_stack_t *ipst;
3012 phyint_t *phyi;
3013
3014 if (ill == NULL)
3015 return;
3016 ipst = ill->ill_ipst;
3017 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
3018
3019 /*
3020 * If the ill was never inserted into the AVL tree
3021 * we skip the if branch.
3022 */
3023 if (ill->ill_ifptr != NULL) {
3024 /*
3025 * remove from AVL tree and free ppa number
3026 */
3027 avl_remove(&ill->ill_ifptr->illif_avl_by_ppa, ill);
3028
3029 if (ill->ill_ifptr->illif_ppa_arena != NULL) {
3030 vmem_free(ill->ill_ifptr->illif_ppa_arena,
3031 (void *)(uintptr_t)(ill->ill_ppa+1), 1);
3032 }
3033 if (avl_numnodes(&ill->ill_ifptr->illif_avl_by_ppa) == 0) {
3034 ill_delete_interface_type(ill->ill_ifptr);
3035 }
3036
3037 /*
3038 * Indicate ill is no longer in the list.
3039 */
3040 ill->ill_ifptr = NULL;
3041 ill->ill_name_length = 0;
3042 ill->ill_name[0] = '\0';
3043 ill->ill_ppa = UINT_MAX;
3044 }
3045
3046 /* Generate one last event for this ill. */
3047 ill_nic_event_dispatch(ill, 0, NE_UNPLUMB, ill->ill_name,
3048 ill->ill_name_length);
3049
3050 ASSERT(ill->ill_phyint != NULL);
3051 phyi = ill->ill_phyint;
3052 ill->ill_phyint = NULL;
3053
3054 /*
	 * ill_init always allocates a phyint to store the copy
	 * of flags relevant to the phyint. At that point in time, we could
	 * not assign the name and hence phyint_illv4/v6 could not be
	 * initialized. Later in ipif_set_values, we assign the name to
	 * the ill, at which point in time we assign phyint_illv4/v6.
	 * Thus we don't rely on phyint_illv6 always being initialized.
3061 */
3062 if (ill->ill_flags & ILLF_IPV6)
3063 phyi->phyint_illv6 = NULL;
3064 else
3065 phyi->phyint_illv4 = NULL;
3066
3067 if (phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL) {
3068 rw_exit(&ipst->ips_ill_g_lock);
3069 return;
3070 }
3071
3072 /*
3073 * There are no ills left on this phyint; pull it out of the phyint
3074 * avl trees, and free it.
3075 */
3076 if (phyi->phyint_ifindex > 0) {
3077 avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
3078 phyi);
3079 avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
3080 phyi);
3081 }
3082 rw_exit(&ipst->ips_ill_g_lock);
3083
3084 phyint_free(phyi);
3085 }
3086
3087 /*
 * allocate a ppa. If the number of plumbed interfaces of this type is
 * less than ill_no_arena, do a linear search to find an unused ppa.
 * When the number goes beyond ill_no_arena, switch to using an arena.
 * Note: a ppa value of zero cannot be allocated from the vmem arena as
 * zero is the return value for an error condition, so allocation starts
 * at one and the result is decremented by one.
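 * For example, a request for ppa 0 is made as a vmem allocation in the
 * address range [1, 2); the returned address 1 then maps back to ppa 0.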
3094 */
3095 static int
3096 ill_alloc_ppa(ill_if_t *ifp, ill_t *ill)
3097 {
3098 ill_t *tmp_ill;
3099 uint_t start, end;
3100 int ppa;
3101
3102 if (ifp->illif_ppa_arena == NULL &&
3103 (avl_numnodes(&ifp->illif_avl_by_ppa) + 1 > ill_no_arena)) {
3104 /*
3105 * Create an arena.
3106 */
3107 ifp->illif_ppa_arena = vmem_create(ifp->illif_name,
3108 (void *)1, UINT_MAX - 1, 1, NULL, NULL,
3109 NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
3110 /* allocate what has already been assigned */
3111 for (tmp_ill = avl_first(&ifp->illif_avl_by_ppa);
3112 tmp_ill != NULL; tmp_ill = avl_walk(&ifp->illif_avl_by_ppa,
3113 tmp_ill, AVL_AFTER)) {
3114 ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena,
3115 1, /* size */
3116 1, /* align/quantum */
3117 0, /* phase */
3118 0, /* nocross */
3119 /* minaddr */
3120 (void *)((uintptr_t)tmp_ill->ill_ppa + 1),
3121 /* maxaddr */
3122 (void *)((uintptr_t)tmp_ill->ill_ppa + 2),
3123 VM_NOSLEEP|VM_FIRSTFIT);
3124 if (ppa == 0) {
3125 ip1dbg(("ill_alloc_ppa: ppa allocation"
3126 " failed while switching"));
3127 vmem_destroy(ifp->illif_ppa_arena);
3128 ifp->illif_ppa_arena = NULL;
3129 break;
3130 }
3131 }
3132 }
3133
3134 if (ifp->illif_ppa_arena != NULL) {
3135 if (ill->ill_ppa == UINT_MAX) {
3136 ppa = (int)(uintptr_t)vmem_alloc(ifp->illif_ppa_arena,
3137 1, VM_NOSLEEP|VM_FIRSTFIT);
3138 if (ppa == 0)
3139 return (EAGAIN);
3140 ill->ill_ppa = --ppa;
3141 } else {
3142 ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena,
3143 1, /* size */
3144 1, /* align/quantum */
3145 0, /* phase */
3146 0, /* nocross */
3147 (void *)(uintptr_t)(ill->ill_ppa + 1), /* minaddr */
3148 (void *)(uintptr_t)(ill->ill_ppa + 2), /* maxaddr */
3149 VM_NOSLEEP|VM_FIRSTFIT);
3150 /*
3151 * Most likely the allocation failed because
3152 * the requested ppa was in use.
3153 */
3154 if (ppa == 0)
3155 return (EEXIST);
3156 }
3157 return (0);
3158 }
3159
3160 /*
3161 * No arena is in use and not enough (>ill_no_arena) interfaces have
	 * been plumbed to create one. Do a linear search to get an unused ppa.
3163 */
3164 if (ill->ill_ppa == UINT_MAX) {
3165 end = UINT_MAX - 1;
3166 start = 0;
3167 } else {
3168 end = start = ill->ill_ppa;
3169 }
3170
3171 tmp_ill = avl_find(&ifp->illif_avl_by_ppa, (void *)&start, NULL);
3172 while (tmp_ill != NULL && tmp_ill->ill_ppa == start) {
3173 if (start++ >= end) {
3174 if (ill->ill_ppa == UINT_MAX)
3175 return (EAGAIN);
3176 else
3177 return (EEXIST);
3178 }
3179 tmp_ill = avl_walk(&ifp->illif_avl_by_ppa, tmp_ill, AVL_AFTER);
3180 }
3181 ill->ill_ppa = start;
3182 return (0);
3183 }
3184
3185 /*
 * Insert ill into the list of configured ills. Once this function completes,
 * the ill is globally visible and is available through lookups. More
 * precisely, this happens after the caller drops the ill_g_lock.
3189 */
3190 static int
3191 ill_glist_insert(ill_t *ill, char *name, boolean_t isv6)
3192 {
3193 ill_if_t *ill_interface;
3194 avl_index_t where = 0;
3195 int error;
3196 int name_length;
3197 int index;
3198 boolean_t check_length = B_FALSE;
3199 ip_stack_t *ipst = ill->ill_ipst;
3200
3201 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
3202
3203 name_length = mi_strlen(name) + 1;
3204
3205 if (isv6)
3206 index = IP_V6_G_HEAD;
3207 else
3208 index = IP_V4_G_HEAD;
3209
3210 ill_interface = IP_VX_ILL_G_LIST(index, ipst);
3211 /*
3212 * Search for interface type based on name
3213 */
3214 while (ill_interface != (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) {
3215 if ((ill_interface->illif_name_len == name_length) &&
3216 (strcmp(ill_interface->illif_name, name) == 0)) {
3217 break;
3218 }
3219 ill_interface = ill_interface->illif_next;
3220 }
3221
3222 /*
3223 * Interface type not found, create one.
3224 */
3225 if (ill_interface == (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) {
3226 ill_g_head_t ghead;
3227
3228 /*
3229 * allocate ill_if_t structure
3230 */
3231 ill_interface = (ill_if_t *)mi_zalloc(sizeof (ill_if_t));
3232 if (ill_interface == NULL) {
3233 return (ENOMEM);
3234 }
3235
3236 (void) strcpy(ill_interface->illif_name, name);
3237 ill_interface->illif_name_len = name_length;
3238
3239 avl_create(&ill_interface->illif_avl_by_ppa,
3240 ill_compare_ppa, sizeof (ill_t),
3241 offsetof(struct ill_s, ill_avl_byppa));
3242
3243 /*
3244 * link the structure in the back to maintain order
3245 * of configuration for ifconfig output.
3246 */
3247 ghead = ipst->ips_ill_g_heads[index];
3248 insque(ill_interface, ghead.ill_g_list_tail);
3249 }
3250
3251 if (ill->ill_ppa == UINT_MAX)
3252 check_length = B_TRUE;
3253
3254 error = ill_alloc_ppa(ill_interface, ill);
3255 if (error != 0) {
3256 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0)
3257 ill_delete_interface_type(ill->ill_ifptr);
3258 return (error);
3259 }
3260
3261 /*
3262 * When the ppa is choosen by the system, check that there is
3263 * enough space to insert ppa. if a specific ppa was passed in this
3264 * check is not required as the interface name passed in will have
3265 * the right ppa in it.
3266 */
3267 if (check_length) {
3268 /*
3269 * UINT_MAX - 1 should fit in 10 chars, alloc 12 chars.
3270 */
3271 char buf[sizeof (uint_t) * 3];
3272
3273 /*
3274 * convert ppa to string to calculate the amount of space
3275 * required for it in the name.
3276 */
3277 numtos(ill->ill_ppa, buf);
3278
3279 /* Do we have enough space to insert ppa ? */
3280
3281 if ((mi_strlen(name) + mi_strlen(buf) + 1) > LIFNAMSIZ) {
3282 /* Free ppa and interface type struct */
3283 if (ill_interface->illif_ppa_arena != NULL) {
3284 vmem_free(ill_interface->illif_ppa_arena,
3285 (void *)(uintptr_t)(ill->ill_ppa+1), 1);
3286 }
3287 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0)
3288 ill_delete_interface_type(ill->ill_ifptr);
3289
3290 return (EINVAL);
3291 }
3292 }
3293
3294 (void) sprintf(ill->ill_name, "%s%u", name, ill->ill_ppa);
3295 ill->ill_name_length = mi_strlen(ill->ill_name) + 1;
3296
3297 (void) avl_find(&ill_interface->illif_avl_by_ppa, &ill->ill_ppa,
3298 &where);
3299 ill->ill_ifptr = ill_interface;
3300 avl_insert(&ill_interface->illif_avl_by_ppa, ill, where);
3301
3302 ill_phyint_reinit(ill);
3303 return (0);
3304 }
3305
3306 /* Initialize the per phyint ipsq used for serialization */
3307 static boolean_t
3308 ipsq_init(ill_t *ill, boolean_t enter)
3309 {
3310 ipsq_t *ipsq;
3311 ipxop_t *ipx;
3312
3313 if ((ipsq = kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP)) == NULL)
3314 return (B_FALSE);
3315
3316 ill->ill_phyint->phyint_ipsq = ipsq;
3317 ipx = ipsq->ipsq_xop = &ipsq->ipsq_ownxop;
3318 ipx->ipx_ipsq = ipsq;
3319 ipsq->ipsq_next = ipsq;
3320 ipsq->ipsq_phyint = ill->ill_phyint;
3321 mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, 0);
3322 mutex_init(&ipx->ipx_lock, NULL, MUTEX_DEFAULT, 0);
3323 ipsq->ipsq_ipst = ill->ill_ipst; /* No netstack_hold */
3324 if (enter) {
3325 ipx->ipx_writer = curthread;
3326 ipx->ipx_forced = B_FALSE;
3327 ipx->ipx_reentry_cnt = 1;
3328 #ifdef DEBUG
3329 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
3330 #endif
3331 }
3332 return (B_TRUE);
3333 }
3334
3335 /*
 * Here we perform initialization of the ill_t common to both regular
 * interface ILLs and the special loopback ILL created by ill_lookup_on_name.
3338 */
3339 static int
3340 ill_init_common(ill_t *ill, queue_t *q, boolean_t isv6, boolean_t is_loopback,
3341 boolean_t ipsq_enter)
3342 {
3343 int count;
3344 uchar_t *frag_ptr;
3345
3346 mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, 0);
3347 mutex_init(&ill->ill_saved_ire_lock, NULL, MUTEX_DEFAULT, NULL);
3348 ill->ill_saved_ire_cnt = 0;
3349
3350 if (is_loopback) {
3351 ill->ill_max_frag = isv6 ? ip_loopback_mtu_v6plus :
3352 ip_loopback_mtuplus;
3353 /*
3354 * No resolver here.
3355 */
3356 ill->ill_net_type = IRE_LOOPBACK;
3357 } else {
3358 ill->ill_rq = q;
3359 ill->ill_wq = WR(q);
3360 ill->ill_ppa = UINT_MAX;
3361 }
3362
3363 ill->ill_isv6 = isv6;
3364
3365 /*
3366 * Allocate sufficient space to contain our fragment hash table and
3367 * the device name.
3368 */
3369 frag_ptr = (uchar_t *)mi_zalloc(ILL_FRAG_HASH_TBL_SIZE + 2 * LIFNAMSIZ);
3370 if (frag_ptr == NULL)
3371 return (ENOMEM);
3372 ill->ill_frag_ptr = frag_ptr;
3373 ill->ill_frag_free_num_pkts = 0;
3374 ill->ill_last_frag_clean_time = 0;
3375 ill->ill_frag_hash_tbl = (ipfb_t *)frag_ptr;
3376 ill->ill_name = (char *)(frag_ptr + ILL_FRAG_HASH_TBL_SIZE);
3377 for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) {
3378 mutex_init(&ill->ill_frag_hash_tbl[count].ipfb_lock,
3379 NULL, MUTEX_DEFAULT, NULL);
3380 }
3381
3382 ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t));
3383 if (ill->ill_phyint == NULL) {
3384 mi_free(frag_ptr);
3385 return (ENOMEM);
3386 }
3387
3388 mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0);
3389 if (isv6) {
3390 ill->ill_phyint->phyint_illv6 = ill;
3391 } else {
3392 ill->ill_phyint->phyint_illv4 = ill;
3393 }
3394 if (is_loopback) {
3395 phyint_flags_init(ill->ill_phyint, DL_LOOP);
3396 }
3397
3398 list_create(&ill->ill_nce, sizeof (nce_t), offsetof(nce_t, nce_node));
3399
3400 ill_set_inputfn(ill);
3401
3402 if (!ipsq_init(ill, ipsq_enter)) {
3403 mi_free(frag_ptr);
3404 mi_free(ill->ill_phyint);
3405 return (ENOMEM);
3406 }
3407
3408 /* Frag queue limit stuff */
3409 ill->ill_frag_count = 0;
3410 ill->ill_ipf_gen = 0;
3411
3412 rw_init(&ill->ill_mcast_lock, NULL, RW_DEFAULT, NULL);
3413 mutex_init(&ill->ill_mcast_serializer, NULL, MUTEX_DEFAULT, NULL);
3414 ill->ill_global_timer = INFINITY;
3415 ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0;
3416 ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0;
3417 ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
3418 ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL;
3419
3420 /*
	 * Initialize IPv6 configuration variables. The IP module is always
	 * opened as an IPv4 module. Instead of tracking down the cases where
	 * it switches to do IPv6, we'll just initialize the IPv6 configuration
	 * here for convenience; this has no effect until the ill is set to do
	 * IPv6.
3426 */
3427 ill->ill_reachable_time = ND_REACHABLE_TIME;
3428 ill->ill_xmit_count = ND_MAX_MULTICAST_SOLICIT;
3429 ill->ill_max_buf = ND_MAX_Q;
3430 ill->ill_refcnt = 0;
3431
3432 return (0);
3433 }
3434
3435 /*
3436 * ill_init is called by ip_open when a device control stream is opened.
3437 * It does a few initializations, and shoots a DL_INFO_REQ message down
3438 * to the driver. The response is later picked up in ip_rput_dlpi and
3439 * used to set up default mechanisms for talking to the driver. (Always
3440 * called as writer.)
3441 *
3442 * If this function returns error, ip_open will call ip_close which in
3443 * turn will call ill_delete to clean up any memory allocated here that
3444 * is not yet freed.
3445 *
3446 * Note: ill_ipst and ill_zoneid must be set before calling ill_init.
3447 */
3448 int
3449 ill_init(queue_t *q, ill_t *ill)
3450 {
3451 int ret;
3452 dl_info_req_t *dlir;
3453 mblk_t *info_mp;
3454
3455 info_mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)),
3456 BPRI_HI);
3457 if (info_mp == NULL)
3458 return (ENOMEM);
3459
3460 /*
3461 * The ill is initialized to zero by mi_alloc*(). In addition
3462 * some fields already contain valid values, initialized in
3463 * ip_open(), before we reach here.
3464 *
3465 * For now pretend this is a v4 ill. We need to set phyint_ill*
3466 * at this point because of the following reason. If we can't
3467 * enter the ipsq at some point and cv_wait, the writer that
3468 * wakes us up tries to locate us using the list of all phyints
3469 * in an ipsq and the ills from the phyint thru the phyint_ill*.
3470 * If we don't set it now, we risk a missed wakeup.
3471 */
3472 if ((ret = ill_init_common(ill, q, B_FALSE, B_FALSE, B_TRUE)) != 0) {
3473 freemsg(info_mp);
3474 return (ret);
3475 }
3476
3477 ill->ill_state_flags |= ILL_LL_SUBNET_PENDING;
3478
3479 /* Send down the Info Request to the driver. */
3480 info_mp->b_datap->db_type = M_PCPROTO;
3481 dlir = (dl_info_req_t *)info_mp->b_rptr;
3482 info_mp->b_wptr = (uchar_t *)&dlir[1];
3483 dlir->dl_primitive = DL_INFO_REQ;
3484
3485 ill->ill_dlpi_pending = DL_PRIM_INVAL;
3486
3487 qprocson(q);
3488 ill_dlpi_send(ill, info_mp);
3489
3490 return (0);
3491 }
3492
3493 /*
3494 * ill_dls_info
3495 * creates datalink socket info from the device.
3496 */
3497 int
3498 ill_dls_info(struct sockaddr_dl *sdl, const ill_t *ill)
3499 {
3500 size_t len;
3501
3502 sdl->sdl_family = AF_LINK;
3503 sdl->sdl_index = ill_get_upper_ifindex(ill);
3504 sdl->sdl_type = ill->ill_type;
3505 ill_get_name(ill, sdl->sdl_data, sizeof (sdl->sdl_data));
3506 len = strlen(sdl->sdl_data);
3507 ASSERT(len < 256);
3508 sdl->sdl_nlen = (uchar_t)len;
3509 sdl->sdl_alen = ill->ill_phys_addr_length;
3510 sdl->sdl_slen = 0;
3511 if (ill->ill_phys_addr_length != 0 && ill->ill_phys_addr != NULL)
3512 bcopy(ill->ill_phys_addr, &sdl->sdl_data[len], sdl->sdl_alen);
3513
3514 return (sizeof (struct sockaddr_dl));
3515 }
3516
3517 /*
3518 * ill_xarp_info
3519 * creates xarp info from the device.
3520 */
3521 static int
3522 ill_xarp_info(struct sockaddr_dl *sdl, ill_t *ill)
3523 {
3524 sdl->sdl_family = AF_LINK;
3525 sdl->sdl_index = ill->ill_phyint->phyint_ifindex;
3526 sdl->sdl_type = ill->ill_type;
3527 ill_get_name(ill, sdl->sdl_data, sizeof (sdl->sdl_data));
3528 sdl->sdl_nlen = (uchar_t)mi_strlen(sdl->sdl_data);
3529 sdl->sdl_alen = ill->ill_phys_addr_length;
3530 sdl->sdl_slen = 0;
3531 return (sdl->sdl_nlen);
3532 }
3533
3534 static int
3535 loopback_kstat_update(kstat_t *ksp, int rw)
3536 {
3537 kstat_named_t *kn;
3538 netstackid_t stackid;
3539 netstack_t *ns;
3540 ip_stack_t *ipst;
3541
3542 if (ksp == NULL || ksp->ks_data == NULL)
3543 return (EIO);
3544
3545 if (rw == KSTAT_WRITE)
3546 return (EACCES);
3547
3548 kn = KSTAT_NAMED_PTR(ksp);
3549 stackid = (zoneid_t)(uintptr_t)ksp->ks_private;
3550
3551 ns = netstack_find_by_stackid(stackid);
3552 if (ns == NULL)
3553 return (-1);
3554
3555 ipst = ns->netstack_ip;
3556 if (ipst == NULL) {
3557 netstack_rele(ns);
3558 return (-1);
3559 }
3560 kn[0].value.ui32 = ipst->ips_loopback_packets;
3561 kn[1].value.ui32 = ipst->ips_loopback_packets;
3562 netstack_rele(ns);
3563 return (0);
3564 }
3565
3566 /*
3567 * Has ifindex been plumbed already?
3568 */
3569 static boolean_t
3570 phyint_exists(uint_t index, ip_stack_t *ipst)
3571 {
3572 ASSERT(index != 0);
3573 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
3574
3575 return (avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
3576 &index, NULL) != NULL);
3577 }
3578
3579 /*
3580 * Pick a unique ifindex.
 * When the index counter passes IF_INDEX_MAX for the first time, the wrap
 * flag is set so that the next time ip_assign_ifindex() is called, it
 * falls through and resets the index counter back to 1, the minimum value
 * for the interface index. The logic below assumes that ips_ill_index
 * can hold a value of IF_INDEX_MAX+1 without any loss (i.e. without being
 * reset back to 0).
3587 */
3588 boolean_t
3589 ip_assign_ifindex(uint_t *indexp, ip_stack_t *ipst)
3590 {
3591 uint_t loops;
3592
3593 if (!ipst->ips_ill_index_wrap) {
3594 *indexp = ipst->ips_ill_index++;
3595 if (ipst->ips_ill_index > IF_INDEX_MAX) {
3596 /*
3597 * Reached the maximum ifindex value, set the wrap
3598 * flag to indicate that it is no longer possible
3599 * to assume that a given index is unallocated.
3600 */
3601 ipst->ips_ill_index_wrap = B_TRUE;
3602 }
3603 return (B_TRUE);
3604 }
3605
3606 if (ipst->ips_ill_index > IF_INDEX_MAX)
3607 ipst->ips_ill_index = 1;
3608
3609 /*
3610 * Start reusing unused indexes. Note that we hold the ill_g_lock
3611 * at this point and don't want to call any function that attempts
3612 * to get the lock again.
3613 */
3614 for (loops = IF_INDEX_MAX; loops > 0; loops--) {
3615 if (!phyint_exists(ipst->ips_ill_index, ipst)) {
3616 /* found unused index - use it */
3617 *indexp = ipst->ips_ill_index;
3618 return (B_TRUE);
3619 }
3620
3621 ipst->ips_ill_index++;
3622 if (ipst->ips_ill_index > IF_INDEX_MAX)
3623 ipst->ips_ill_index = 1;
3624 }
3625
3626 /*
	 * all interface indices are in use.
3628 */
3629 return (B_FALSE);
3630 }
3631
3632 /*
3633 * Assign a unique interface index for the phyint.
3634 */
3635 static boolean_t
3636 phyint_assign_ifindex(phyint_t *phyi, ip_stack_t *ipst)
3637 {
3638 ASSERT(phyi->phyint_ifindex == 0);
3639 return (ip_assign_ifindex(&phyi->phyint_ifindex, ipst));
3640 }
3641
3642 /*
3643 * Initialize the flags on `phyi' as per the provided mactype.
3644 */
3645 static void
3646 phyint_flags_init(phyint_t *phyi, t_uscalar_t mactype)
3647 {
3648 uint64_t flags = 0;
3649
3650 /*
3651 * Initialize PHYI_RUNNING and PHYI_FAILED. For non-IPMP interfaces,
3652 * we always presume the underlying hardware is working and set
3653 * PHYI_RUNNING (if it's not, the driver will subsequently send a
3654 * DL_NOTE_LINK_DOWN message). For IPMP interfaces, at initialization
3655 * there are no active interfaces in the group so we set PHYI_FAILED.
3656 */
3657 if (mactype == SUNW_DL_IPMP)
3658 flags |= PHYI_FAILED;
3659 else
3660 flags |= PHYI_RUNNING;
3661
3662 switch (mactype) {
3663 case SUNW_DL_VNI:
3664 flags |= PHYI_VIRTUAL;
3665 break;
3666 case SUNW_DL_IPMP:
3667 flags |= PHYI_IPMP;
3668 break;
3669 case DL_LOOP:
3670 flags |= (PHYI_LOOPBACK | PHYI_VIRTUAL);
3671 break;
3672 }
3673
3674 mutex_enter(&phyi->phyint_lock);
3675 phyi->phyint_flags |= flags;
3676 mutex_exit(&phyi->phyint_lock);
3677 }
3678
3679 /*
3680 * Return a pointer to the ill which matches the supplied name. Note that
3681 * the ill name length includes the null termination character. (May be
3682 * called as writer.)
 * If do_alloc is set and the interface is the loopback ("lo0"), it will be
 * created automatically. We cannot bump up the reference on condemned
 * ills, so duplicate detection can't be done using this function.
3686 */
3687 ill_t *
3688 ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6,
3689 boolean_t *did_alloc, ip_stack_t *ipst)
3690 {
3691 ill_t *ill;
3692 ipif_t *ipif;
3693 ipsq_t *ipsq;
3694 kstat_named_t *kn;
3695 boolean_t isloopback;
3696 in6_addr_t ov6addr;
3697
3698 isloopback = mi_strcmp(name, ipif_loopback_name) == 0;
3699
3700 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
3701 ill = ill_find_by_name(name, isv6, ipst);
3702 rw_exit(&ipst->ips_ill_g_lock);
3703 if (ill != NULL)
3704 return (ill);
3705
3706 /*
3707 * Couldn't find it. Does this happen to be a lookup for the
3708 * loopback device and are we allowed to allocate it?
3709 */
3710 if (!isloopback || !do_alloc)
3711 return (NULL);
3712
3713 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
3714 ill = ill_find_by_name(name, isv6, ipst);
3715 if (ill != NULL) {
3716 rw_exit(&ipst->ips_ill_g_lock);
3717 return (ill);
3718 }
3719
3720 /* Create the loopback device on demand */
3721 ill = (ill_t *)(mi_alloc(sizeof (ill_t) +
3722 sizeof (ipif_loopback_name), BPRI_MED));
3723 if (ill == NULL)
3724 goto done;
3725
3726 bzero(ill, sizeof (*ill));
3727 ill->ill_ipst = ipst;
3728 netstack_hold(ipst->ips_netstack);
3729 /*
3730 * For exclusive stacks we set the zoneid to zero
3731 * to make IP operate as if in the global zone.
3732 */
3733 ill->ill_zoneid = GLOBAL_ZONEID;
3734
3735 if (ill_init_common(ill, NULL, isv6, B_TRUE, B_FALSE) != 0)
3736 goto done;
3737
3738 if (!ill_allocate_mibs(ill))
3739 goto done;
3740
3741 ill->ill_current_frag = ill->ill_max_frag;
3742 ill->ill_mtu = ill->ill_max_frag; /* Initial value */
3743 ill->ill_mc_mtu = ill->ill_mtu;
3744 /*
 * ipif_loopback_name can't be pointed at directly because it's used
3746 * by both the ipv4 and ipv6 interfaces. When the ill is removed
3747 * from the glist, ill_glist_delete() sets the first character of
3748 * ill_name to '\0'.
3749 */
3750 ill->ill_name = (char *)ill + sizeof (*ill);
3751 (void) strcpy(ill->ill_name, ipif_loopback_name);
3752 ill->ill_name_length = sizeof (ipif_loopback_name);
3753 /* Set ill_dlpi_pending for ipsq_current_finish() to work properly */
3754 ill->ill_dlpi_pending = DL_PRIM_INVAL;
3755
3756 ipif = ipif_allocate(ill, 0L, IRE_LOOPBACK, B_TRUE, B_TRUE, NULL);
3757 if (ipif == NULL)
3758 goto done;
3759
3760 ill->ill_flags = ILLF_MULTICAST;
3761
3762 ov6addr = ipif->ipif_v6lcl_addr;
3763 /* Set up default loopback address and mask. */
3764 if (!isv6) {
3765 ipaddr_t inaddr_loopback = htonl(INADDR_LOOPBACK);
3766
3767 IN6_IPADDR_TO_V4MAPPED(inaddr_loopback, &ipif->ipif_v6lcl_addr);
3768 V4MASK_TO_V6(htonl(IN_CLASSA_NET), ipif->ipif_v6net_mask);
3769 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
3770 ipif->ipif_v6subnet);
3771 ill->ill_flags |= ILLF_IPV4;
3772 } else {
3773 ipif->ipif_v6lcl_addr = ipv6_loopback;
3774 ipif->ipif_v6net_mask = ipv6_all_ones;
3775 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
3776 ipif->ipif_v6subnet);
3777 ill->ill_flags |= ILLF_IPV6;
3778 }
3779
3780 /*
	 * Chain us in at the end of the ill list. Hold the ill before we
	 * make it globally visible; this hold is for the reference we
	 * return to the caller of the lookup.
3783 */
3784 ill_refhold(ill);
3785
3786 ipsq = ill->ill_phyint->phyint_ipsq;
3787
3788 if (ill_glist_insert(ill, "lo", isv6) != 0)
3789 cmn_err(CE_PANIC, "cannot insert loopback interface");
3790
3791 /* Let SCTP know so that it can add this to its list */
3792 sctp_update_ill(ill, SCTP_ILL_INSERT);
3793
3794 /*
3795 * We have already assigned ipif_v6lcl_addr above, but we need to
	 * call sctp_update_ipif_addr() after SCTP_ILL_INSERT, which itself
	 * must come after ill_glist_insert() since we need the
3798 * ill_index set. Pass on ipv6_loopback as the old address.
3799 */
3800 sctp_update_ipif_addr(ipif, ov6addr);
3801
3802 ip_rts_newaddrmsg(RTM_CHGADDR, 0, ipif, RTSQ_DEFAULT);
3803
3804 /*
3805 * ill_glist_insert() -> ill_phyint_reinit() may have merged IPSQs.
3806 * If so, free our original one.
3807 */
3808 if (ipsq != ill->ill_phyint->phyint_ipsq)
3809 ipsq_delete(ipsq);
3810
3811 if (ipst->ips_loopback_ksp == NULL) {
3812 /* Export loopback interface statistics */
3813 ipst->ips_loopback_ksp = kstat_create_netstack("lo", 0,
3814 ipif_loopback_name, "net",
3815 KSTAT_TYPE_NAMED, 2, 0,
3816 ipst->ips_netstack->netstack_stackid);
3817 if (ipst->ips_loopback_ksp != NULL) {
3818 ipst->ips_loopback_ksp->ks_update =
3819 loopback_kstat_update;
3820 kn = KSTAT_NAMED_PTR(ipst->ips_loopback_ksp);
3821 kstat_named_init(&kn[0], "ipackets", KSTAT_DATA_UINT32);
3822 kstat_named_init(&kn[1], "opackets", KSTAT_DATA_UINT32);
3823 ipst->ips_loopback_ksp->ks_private =
3824 (void *)(uintptr_t)ipst->ips_netstack->
3825 netstack_stackid;
3826 kstat_install(ipst->ips_loopback_ksp);
3827 }
3828 }
3829
3830 *did_alloc = B_TRUE;
3831 rw_exit(&ipst->ips_ill_g_lock);
3832 ill_nic_event_dispatch(ill, MAP_IPIF_ID(ill->ill_ipif->ipif_id),
3833 NE_PLUMB, ill->ill_name, ill->ill_name_length);
3834 return (ill);
3835 done:
3836 if (ill != NULL) {
3837 if (ill->ill_phyint != NULL) {
3838 ipsq = ill->ill_phyint->phyint_ipsq;
3839 if (ipsq != NULL) {
3840 ipsq->ipsq_phyint = NULL;
3841 ipsq_delete(ipsq);
3842 }
3843 mi_free(ill->ill_phyint);
3844 }
3845 ill_free_mib(ill);
3846 if (ill->ill_ipst != NULL)
3847 netstack_rele(ill->ill_ipst->ips_netstack);
3848 mi_free(ill);
3849 }
3850 rw_exit(&ipst->ips_ill_g_lock);
3851 return (NULL);
3852 }
3853
3854 /*
 * For IPP calls - use the ip_stack_t of the global stack (zone zero).
3856 */
3857 ill_t *
3858 ill_lookup_on_ifindex_global_instance(uint_t index, boolean_t isv6)
3859 {
3860 ip_stack_t *ipst;
3861 ill_t *ill;
3862
3863 ipst = netstack_find_by_stackid(GLOBAL_NETSTACKID)->netstack_ip;
3864 if (ipst == NULL) {
		cmn_err(CE_WARN, "No ip_stack_t for zoneid zero!");
3866 return (NULL);
3867 }
3868
3869 ill = ill_lookup_on_ifindex(index, isv6, ipst);
3870 netstack_rele(ipst->ips_netstack);
3871 return (ill);
3872 }
3873
3874 /*
3875 * Return a pointer to the ill which matches the index and IP version type.
3876 */
3877 ill_t *
3878 ill_lookup_on_ifindex(uint_t index, boolean_t isv6, ip_stack_t *ipst)
3879 {
3880 ill_t *ill;
3881 phyint_t *phyi;
3882
3883 /*
3884 * Indexes are stored in the phyint - a common structure
3885 * to both IPv4 and IPv6.
3886 */
3887 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
3888 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
3889 (void *) &index, NULL);
3890 if (phyi != NULL) {
		ill = isv6 ? phyi->phyint_illv6 : phyi->phyint_illv4;
3892 if (ill != NULL) {
3893 mutex_enter(&ill->ill_lock);
3894 if (!ILL_IS_CONDEMNED(ill)) {
3895 ill_refhold_locked(ill);
3896 mutex_exit(&ill->ill_lock);
3897 rw_exit(&ipst->ips_ill_g_lock);
3898 return (ill);
3899 }
3900 mutex_exit(&ill->ill_lock);
3901 }
3902 }
3903 rw_exit(&ipst->ips_ill_g_lock);
3904 return (NULL);
3905 }
3906
3907 /*
 * Verify whether an interface index is valid for the specified zoneid
 * to transmit packets.
 * It can be zero (meaning "reset") or an interface index assigned
 * to a non-VNI interface. (We don't use VNI interfaces to send packets.)
3912 */
3913 boolean_t
3914 ip_xmit_ifindex_valid(uint_t ifindex, zoneid_t zoneid, boolean_t isv6,
3915 ip_stack_t *ipst)
3916 {
3917 ill_t *ill;
3918
3919 if (ifindex == 0)
3920 return (B_TRUE);
3921
3922 ill = ill_lookup_on_ifindex_zoneid(ifindex, zoneid, isv6, ipst);
3923 if (ill == NULL)
3924 return (B_FALSE);
3925 if (IS_VNI(ill)) {
3926 ill_refrele(ill);
3927 return (B_FALSE);
3928 }
3929 ill_refrele(ill);
3930 return (B_TRUE);
3931 }
3932
3933 /*
3934 * Return the ifindex next in sequence after the passed in ifindex.
3935 * If there is no next ifindex for the given protocol, return 0.
3936 */
3937 uint_t
3938 ill_get_next_ifindex(uint_t index, boolean_t isv6, ip_stack_t *ipst)
3939 {
3940 phyint_t *phyi;
3941 phyint_t *phyi_initial;
3942 uint_t ifindex;
3943
3944 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
3945
3946 if (index == 0) {
3947 phyi = avl_first(
3948 &ipst->ips_phyint_g_list->phyint_list_avl_by_index);
3949 } else {
3950 phyi = phyi_initial = avl_find(
3951 &ipst->ips_phyint_g_list->phyint_list_avl_by_index,
3952 (void *) &index, NULL);
3953 }
3954
3955 for (; phyi != NULL;
3956 phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
3957 phyi, AVL_AFTER)) {
3958 /*
3959 * If we're not returning the first interface in the tree
3960 * and we still haven't moved past the phyint_t that
		 * corresponds to index, avl_walk needs to be called again.
3962 */
3963 if (!((index != 0) && (phyi == phyi_initial))) {
3964 if (isv6) {
3965 if ((phyi->phyint_illv6) &&
3966 ILL_CAN_LOOKUP(phyi->phyint_illv6) &&
3967 (phyi->phyint_illv6->ill_isv6 == 1))
3968 break;
3969 } else {
3970 if ((phyi->phyint_illv4) &&
3971 ILL_CAN_LOOKUP(phyi->phyint_illv4) &&
3972 (phyi->phyint_illv4->ill_isv6 == 0))
3973 break;
3974 }
3975 }
3976 }
3977
3978 rw_exit(&ipst->ips_ill_g_lock);
3979
3980 if (phyi != NULL)
3981 ifindex = phyi->phyint_ifindex;
3982 else
3983 ifindex = 0;
3984
3985 return (ifindex);
3986 }
3987
3988 /*
 * Return the ifindex for the named interface.
 * If there is no such interface, return 0.
3991 */
3992 uint_t
3993 ill_get_ifindex_by_name(char *name, ip_stack_t *ipst)
3994 {
3995 phyint_t *phyi;
3996 avl_index_t where = 0;
3997 uint_t ifindex;
3998
3999 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
4000
4001 if ((phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
4002 name, &where)) == NULL) {
4003 rw_exit(&ipst->ips_ill_g_lock);
4004 return (0);
4005 }
4006
4007 ifindex = phyi->phyint_ifindex;
4008
4009 rw_exit(&ipst->ips_ill_g_lock);
4010
4011 return (ifindex);
4012 }
4013
4014 /*
 * Return the ifindex to be used by upper layer protocols, for instance
 * for IPV6_RECVPKTINFO. For IPMP, this is the ifindex of the upper ill.
4017 */
4018 uint_t
4019 ill_get_upper_ifindex(const ill_t *ill)
4020 {
4021 if (IS_UNDER_IPMP(ill))
4022 return (ipmp_ill_get_ipmp_ifindex(ill));
4023 else
4024 return (ill->ill_phyint->phyint_ifindex);
4025 }
4026
4028 /*
4029 * Obtain a reference to the ill. The ill_refcnt is a dynamic refcnt
4030 * that gives a running thread a reference to the ill. This reference must be
4031 * released by the thread when it is done accessing the ill and related
 * objects. ill_refcnt cannot be used to account for static references
4033 * such as other structures pointing to an ill. Callers must generally
4034 * check whether an ill can be refheld by using ILL_CAN_LOOKUP macros
4035 * or be sure that the ill is not being deleted or changing state before
4036 * calling the refhold functions. A non-zero ill_refcnt ensures that the
4037 * ill won't change any of its critical state such as address, netmask etc.
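 *
 * Typical usage (a sketch):
 *
 *	if (ill_check_and_refhold(ill)) {
 *		... use ill; its critical state is stable ...
 *		ill_refrele(ill);
 *	}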
4038 */
4039 void
4040 ill_refhold(ill_t *ill)
4041 {
4042 mutex_enter(&ill->ill_lock);
4043 ill->ill_refcnt++;
4044 ILL_TRACE_REF(ill);
4045 mutex_exit(&ill->ill_lock);
4046 }
4047
4048 void
4049 ill_refhold_locked(ill_t *ill)
4050 {
4051 ASSERT(MUTEX_HELD(&ill->ill_lock));
4052 ill->ill_refcnt++;
4053 ILL_TRACE_REF(ill);
4054 }
4055
4056 /* Returns true if we managed to get a refhold */
4057 boolean_t
4058 ill_check_and_refhold(ill_t *ill)
4059 {
4060 mutex_enter(&ill->ill_lock);
4061 if (!ILL_IS_CONDEMNED(ill)) {
4062 ill_refhold_locked(ill);
4063 mutex_exit(&ill->ill_lock);
4064 return (B_TRUE);
4065 }
4066 mutex_exit(&ill->ill_lock);
4067 return (B_FALSE);
4068 }
4069
4070 /*
4071 * Must not be called while holding any locks. Otherwise if this is
4072 * the last reference to be released, there is a chance of recursive mutex
4073 * panic due to ill_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying
4074 * to restart an ioctl.
4075 */
4076 void
4077 ill_refrele(ill_t *ill)
4078 {
4079 mutex_enter(&ill->ill_lock);
4080 ASSERT(ill->ill_refcnt != 0);
4081 ill->ill_refcnt--;
4082 ILL_UNTRACE_REF(ill);
4083 if (ill->ill_refcnt != 0) {
4084 /* Every ire pointing to the ill adds 1 to ill_refcnt */
4085 mutex_exit(&ill->ill_lock);
4086 return;
4087 }
4088
4089 /* Drops the ill_lock */
4090 ipif_ill_refrele_tail(ill);
4091 }
4092
4093 /*
4094 * Obtain a weak reference count on the ill. This reference ensures the
4095 * ill won't be freed, but the ill may change any of its critical state
 * such as netmask, address etc. Returns B_FALSE if the ill has started
 * closing.
4098 */
4099 boolean_t
4100 ill_waiter_inc(ill_t *ill)
4101 {
4102 mutex_enter(&ill->ill_lock);
4103 if (ill->ill_state_flags & ILL_CONDEMNED) {
4104 mutex_exit(&ill->ill_lock);
4105 return (B_FALSE);
4106 }
4107 ill->ill_waiters++;
4108 mutex_exit(&ill->ill_lock);
4109 return (B_TRUE);
4110 }
4111
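/*
 * Drop a weak reference taken with ill_waiter_inc(); wake up any thread
 * waiting on ill_cv once the last waiter is gone.
 */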
4112 void
4113 ill_waiter_dcr(ill_t *ill)
4114 {
4115 mutex_enter(&ill->ill_lock);
4116 ill->ill_waiters--;
4117 if (ill->ill_waiters == 0)
4118 cv_broadcast(&ill->ill_cv);
4119 mutex_exit(&ill->ill_lock);
4120 }
4121
4122 /*
4123 * ip_ll_subnet_defaults is called when we get the DL_INFO_ACK back from the
4124 * driver. We construct best guess defaults for lower level information that
4125 * we need. If an interface is brought up without injection of any overriding
4126 * information from outside, we have to be ready to go with these defaults.
4127 * When we get the first DL_INFO_ACK (from ip_open() sending a DL_INFO_REQ)
 * we primarily want the dl_provider_style.
4129 * The subsequent DL_INFO_ACK is received after doing a DL_ATTACH and DL_BIND
4130 * at which point we assume the other part of the information is valid.
4131 */
4132 void
4133 ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp)
4134 {
4135 uchar_t *brdcst_addr;
4136 uint_t brdcst_addr_length, phys_addr_length;
4137 t_scalar_t sap_length;
4138 dl_info_ack_t *dlia;
4139 ip_m_t *ipm;
4140 dl_qos_cl_sel1_t *sel1;
4141 int min_mtu;
4142
4143 ASSERT(IAM_WRITER_ILL(ill));
4144
4145 /*
	 * Until the ill is fully up, it is not globally visible,
	 * so there is no need for a lock.
4148 */
4149 dlia = (dl_info_ack_t *)mp->b_rptr;
4150 ill->ill_mactype = dlia->dl_mac_type;
4151
4152 ipm = ip_m_lookup(dlia->dl_mac_type);
4153 if (ipm == NULL) {
4154 ipm = ip_m_lookup(DL_OTHER);
4155 ASSERT(ipm != NULL);
4156 }
4157 ill->ill_media = ipm;
4158
4159 /*
4160 * When the new DLPI stuff is ready we'll pull lengths
4161 * from dlia.
4162 */
4163 if (dlia->dl_version == DL_VERSION_2) {
4164 brdcst_addr_length = dlia->dl_brdcst_addr_length;
4165 brdcst_addr = mi_offset_param(mp, dlia->dl_brdcst_addr_offset,
4166 brdcst_addr_length);
4167 if (brdcst_addr == NULL) {
4168 brdcst_addr_length = 0;
4169 }
4170 sap_length = dlia->dl_sap_length;
4171 phys_addr_length = dlia->dl_addr_length - ABS(sap_length);
4172 ip1dbg(("ip: bcast_len %d, sap_len %d, phys_len %d\n",
4173 brdcst_addr_length, sap_length, phys_addr_length));
4174 } else {
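		/*
		 * Pre-DL_VERSION_2 drivers don't report these values, so
		 * assume Ethernet-like defaults: a 6-byte all-ones
		 * broadcast address and a 2-byte SAP (a negative
		 * sap_length means the SAP follows the physical address
		 * in DLSAP addresses).
		 */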
4175 brdcst_addr_length = 6;
4176 brdcst_addr = ip_six_byte_all_ones;
4177 sap_length = -2;
4178 phys_addr_length = brdcst_addr_length;
4179 }
4180
4181 ill->ill_bcast_addr_length = brdcst_addr_length;
4182 ill->ill_phys_addr_length = phys_addr_length;
4183 ill->ill_sap_length = sap_length;
4184
4185 /*
4186 * Synthetic DLPI types such as SUNW_DL_IPMP specify a zero SDU,
4187 * but we must ensure a minimum IP MTU is used since other bits of
4188 * IP will fly apart otherwise.
4189 */
4190 min_mtu = ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU;
4191 ill->ill_max_frag = MAX(min_mtu, dlia->dl_max_sdu);
4192 ill->ill_current_frag = ill->ill_max_frag;
4193 ill->ill_mtu = ill->ill_max_frag;
4194 ill->ill_mc_mtu = ill->ill_mtu; /* Overridden by DL_NOTE_SDU_SIZE2 */
4195
4196 ill->ill_type = ipm->ip_m_type;
4197
4198 if (!ill->ill_dlpi_style_set) {
4199 if (dlia->dl_provider_style == DL_STYLE2)
4200 ill->ill_needs_attach = 1;
4201
4202 phyint_flags_init(ill->ill_phyint, ill->ill_mactype);
4203
4204 /*
4205 * Allocate the first ipif on this ill. We don't delay it
4206 * further as ioctl handling assumes at least one ipif exists.
4207 *
4208 * At this point we don't know whether the ill is v4 or v6.
		 * We will know this when the SIOCSLIFNAME happens and
4210 * the correct value for ill_isv6 will be assigned in
4211 * ipif_set_values(). We need to hold the ill lock and
4212 * clear the ILL_LL_SUBNET_PENDING flag and atomically do
4213 * the wakeup.
4214 */
4215 (void) ipif_allocate(ill, 0, IRE_LOCAL,
4216 dlia->dl_provider_style != DL_STYLE2, B_TRUE, NULL);
4217 mutex_enter(&ill->ill_lock);
4218 ASSERT(ill->ill_dlpi_style_set == 0);
4219 ill->ill_dlpi_style_set = 1;
4220 ill->ill_state_flags &= ~ILL_LL_SUBNET_PENDING;
4221 cv_broadcast(&ill->ill_cv);
4222 mutex_exit(&ill->ill_lock);
4223 freemsg(mp);
4224 return;
4225 }
4226 ASSERT(ill->ill_ipif != NULL);
4227 /*
4228 * We know whether it is IPv4 or IPv6 now, as this is the
 * second DL_INFO_ACK we are receiving in response to the
4230 * DL_INFO_REQ sent in ipif_set_values.
4231 */
4232 ill->ill_sap = (ill->ill_isv6) ? ipm->ip_m_ipv6sap : ipm->ip_m_ipv4sap;
4233 /*
4234 * Clear all the flags that were set based on ill_bcast_addr_length
4235 * and ill_phys_addr_length (in ipif_set_values) as these could have
4236 * changed now and we need to re-evaluate.
4237 */
4238 ill->ill_flags &= ~(ILLF_MULTICAST | ILLF_NONUD | ILLF_NOARP);
4239 ill->ill_ipif->ipif_flags &= ~(IPIF_BROADCAST | IPIF_POINTOPOINT);
4240
4241 /*
4242 * Free ill_bcast_mp as things could have changed now.
4243 *
4244 * NOTE: The IPMP meta-interface is special-cased because it starts
4245 * with no underlying interfaces (and thus an unknown broadcast
4246 * address length), but we enforce that an interface is broadcast-
4247 * capable as part of allowing it to join a group.
4248 */
4249 if (ill->ill_bcast_addr_length == 0 && !IS_IPMP(ill)) {
4250 if (ill->ill_bcast_mp != NULL)
4251 freemsg(ill->ill_bcast_mp);
4252 ill->ill_net_type = IRE_IF_NORESOLVER;
4253
4254 ill->ill_bcast_mp = ill_dlur_gen(NULL,
4255 ill->ill_phys_addr_length,
4256 ill->ill_sap,
4257 ill->ill_sap_length);
4258
4259 if (ill->ill_isv6)
4260 /*
4261 * Note: xresolv interfaces will eventually need NOARP
4262 * set here as well, but that will require those
4263 * external resolvers to have some knowledge of
4264 * that flag and act appropriately. Not to be changed
4265 * at present.
4266 */
4267 ill->ill_flags |= ILLF_NONUD;
4268 else
4269 ill->ill_flags |= ILLF_NOARP;
4270
4271 if (ill->ill_mactype == SUNW_DL_VNI) {
4272 ill->ill_ipif->ipif_flags |= IPIF_NOXMIT;
4273 } else if (ill->ill_phys_addr_length == 0 ||
4274 ill->ill_mactype == DL_IPV4 ||
4275 ill->ill_mactype == DL_IPV6) {
4276 /*
			 * The underlying link is point-to-point, so mark the
4278 * interface as such. We can do IP multicast over
4279 * such a link since it transmits all network-layer
4280 * packets to the remote side the same way.
4281 */
4282 ill->ill_flags |= ILLF_MULTICAST;
4283 ill->ill_ipif->ipif_flags |= IPIF_POINTOPOINT;
4284 }
4285 } else {
4286 ill->ill_net_type = IRE_IF_RESOLVER;
4287 if (ill->ill_bcast_mp != NULL)
4288 freemsg(ill->ill_bcast_mp);
4289 ill->ill_bcast_mp = ill_dlur_gen(brdcst_addr,
4290 ill->ill_bcast_addr_length, ill->ill_sap,
4291 ill->ill_sap_length);
4292 /*
4293 * Later detect lack of DLPI driver multicast
4294 * capability by catching DL_ENABMULTI errors in
4295 * ip_rput_dlpi.
4296 */
4297 ill->ill_flags |= ILLF_MULTICAST;
4298 if (!ill->ill_isv6)
4299 ill->ill_ipif->ipif_flags |= IPIF_BROADCAST;
4300 }
4301
4302 /* For IPMP, PHYI_IPMP should already be set by phyint_flags_init() */
4303 if (ill->ill_mactype == SUNW_DL_IPMP)
4304 ASSERT(ill->ill_phyint->phyint_flags & PHYI_IPMP);
4305
4306 /* By default an interface does not support any CoS marking */
4307 ill->ill_flags &= ~ILLF_COS_ENABLED;
4308
4309 /*
	 * If we get QoS information in the DL_INFO_ACK, the device
	 * supports some form of CoS marking; set ILLF_COS_ENABLED.
4312 */
4313 sel1 = (dl_qos_cl_sel1_t *)mi_offset_param(mp, dlia->dl_qos_offset,
4314 dlia->dl_qos_length);
4315 if ((sel1 != NULL) && (sel1->dl_qos_type == DL_QOS_CL_SEL1)) {
4316 ill->ill_flags |= ILLF_COS_ENABLED;
4317 }
4318
4319 /* Clear any previous error indication. */
4320 ill->ill_error = 0;
4321 freemsg(mp);
4322 }
4323
4324 /*
4325 * Perform various checks to verify that an address would make sense as a
4326 * local, remote, or subnet interface address.
4327 */
4328 static boolean_t
4329 ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask)
4330 {
4331 ipaddr_t net_mask;
4332
4333 /*
	 * Don't allow an all-zeroes or all-ones address, but do
	 * allow an all-ones netmask.
4336 */
4337 if ((net_mask = ip_net_mask(addr)) == 0)
4338 return (B_FALSE);
4339 /* A given netmask overrides the "guess" netmask */
4340 if (subnet_mask != 0)
4341 net_mask = subnet_mask;
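	/*
	 * Unless the netmask is all ones (a host address), reject the
	 * subnet address (host part all zeroes) and the directed
	 * broadcast address (host part all ones).
	 */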
4342 if ((net_mask != ~(ipaddr_t)0) && ((addr == (addr & net_mask)) ||
4343 (addr == (addr | ~net_mask)))) {
4344 return (B_FALSE);
4345 }
4346
4347 /*
4348 * Even if the netmask is all ones, we do not allow address to be
4349 * 255.255.255.255
4350 */
4351 if (addr == INADDR_BROADCAST)
4352 return (B_FALSE);
4353
4354 if (CLASSD(addr))
4355 return (B_FALSE);
4356
4357 return (B_TRUE);
4358 }
4359
4360 #define V6_IPIF_LINKLOCAL(p) \
4361 IN6_IS_ADDR_LINKLOCAL(&(p)->ipif_v6lcl_addr)
4362
4363 /*
4364 * Compare two given ipifs and check if the second one is better than
 * the first one, using the order of preference (not taking deprecated
 * into account) specified in ipif_lookup_multicast().
4367 */
4368 static boolean_t
4369 ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif, boolean_t isv6)
4370 {
4371 /* Check the least preferred first. */
4372 if (IS_LOOPBACK(old_ipif->ipif_ill)) {
4373 /* If both ipifs are the same, use the first one. */
4374 if (IS_LOOPBACK(new_ipif->ipif_ill))
4375 return (B_FALSE);
4376 else
4377 return (B_TRUE);
4378 }
4379
4380 /* For IPv6, check for link local address. */
4381 if (isv6 && V6_IPIF_LINKLOCAL(old_ipif)) {
4382 if (IS_LOOPBACK(new_ipif->ipif_ill) ||
4383 V6_IPIF_LINKLOCAL(new_ipif)) {
4384 /* The second one is equal or less preferred. */
4385 return (B_FALSE);
4386 } else {
4387 return (B_TRUE);
4388 }
4389 }
4390
4391 /* Then check for point to point interface. */
4392 if (old_ipif->ipif_flags & IPIF_POINTOPOINT) {
4393 if (IS_LOOPBACK(new_ipif->ipif_ill) ||
4394 (isv6 && V6_IPIF_LINKLOCAL(new_ipif)) ||
4395 (new_ipif->ipif_flags & IPIF_POINTOPOINT)) {
4396 return (B_FALSE);
4397 } else {
4398 return (B_TRUE);
4399 }
4400 }
4401
4402 /* old_ipif is a normal interface, so no need to use the new one. */
4403 return (B_FALSE);
4404 }
4405
4406 /*
 * Find a multicast-capable ipif given an IP instance and zoneid.
 * The ipif must be up, and its ill must be multicast-capable, not
4409 * condemned, not an underlying interface in an IPMP group, and
4410 * not a VNI interface. Order of preference:
4411 *
4412 * 1a. normal
4413 * 1b. normal, but deprecated
4414 * 2a. point to point
4415 * 2b. point to point, but deprecated
4416 * 3a. link local
4417 * 3b. link local, but deprecated
4418 * 4. loopback.
4419 */
4420 static ipif_t *
4421 ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6)
4422 {
4423 ill_t *ill;
4424 ill_walk_context_t ctx;
4425 ipif_t *ipif;
4426 ipif_t *saved_ipif = NULL;
4427 ipif_t *dep_ipif = NULL;
4428
4429 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
4430 if (isv6)
4431 ill = ILL_START_WALK_V6(&ctx, ipst);
4432 else
4433 ill = ILL_START_WALK_V4(&ctx, ipst);
4434
4435 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
4436 mutex_enter(&ill->ill_lock);
4437 if (IS_VNI(ill) || IS_UNDER_IPMP(ill) ||
4438 ILL_IS_CONDEMNED(ill) ||
4439 !(ill->ill_flags & ILLF_MULTICAST)) {
4440 mutex_exit(&ill->ill_lock);
4441 continue;
4442 }
4443 for (ipif = ill->ill_ipif; ipif != NULL;
4444 ipif = ipif->ipif_next) {
4445 if (zoneid != ipif->ipif_zoneid &&
4446 zoneid != ALL_ZONES &&
4447 ipif->ipif_zoneid != ALL_ZONES) {
4448 continue;
4449 }
4450 if (!(ipif->ipif_flags & IPIF_UP) ||
4451 IPIF_IS_CONDEMNED(ipif)) {
4452 continue;
4453 }
4454
4455 /*
4456 * Found one candidate. If it is deprecated,
4457 * remember it in dep_ipif. If it is not deprecated,
4458 * remember it in saved_ipif.
4459 */
4460 if (ipif->ipif_flags & IPIF_DEPRECATED) {
4461 if (dep_ipif == NULL) {
4462 dep_ipif = ipif;
4463 } else if (ipif_comp_multi(dep_ipif, ipif,
4464 isv6)) {
4465 /*
4466 * If the previous dep_ipif does not
4467 * belong to the same ill, we've done
				 * an ipif_refhold() on it. So we need
4469 * to release it.
4470 */
4471 if (dep_ipif->ipif_ill != ill)
4472 ipif_refrele(dep_ipif);
4473 dep_ipif = ipif;
4474 }
4475 continue;
4476 }
4477 if (saved_ipif == NULL) {
4478 saved_ipif = ipif;
4479 } else {
4480 if (ipif_comp_multi(saved_ipif, ipif, isv6)) {
4481 if (saved_ipif->ipif_ill != ill)
4482 ipif_refrele(saved_ipif);
4483 saved_ipif = ipif;
4484 }
4485 }
4486 }
4487 /*
		 * Before going to the next ill, do an ipif_refhold() on the
4489 * saved ones.
4490 */
4491 if (saved_ipif != NULL && saved_ipif->ipif_ill == ill)
4492 ipif_refhold_locked(saved_ipif);
4493 if (dep_ipif != NULL && dep_ipif->ipif_ill == ill)
4494 ipif_refhold_locked(dep_ipif);
4495 mutex_exit(&ill->ill_lock);
4496 }
4497 rw_exit(&ipst->ips_ill_g_lock);
4498
4499 /*
4500 * If we have only the saved_ipif, return it. But if we have both
4501 * saved_ipif and dep_ipif, check to see which one is better.
4502 */
4503 if (saved_ipif != NULL) {
4504 if (dep_ipif != NULL) {
4505 if (ipif_comp_multi(saved_ipif, dep_ipif, isv6)) {
4506 ipif_refrele(saved_ipif);
4507 return (dep_ipif);
4508 } else {
4509 ipif_refrele(dep_ipif);
4510 return (saved_ipif);
4511 }
4512 }
4513 return (saved_ipif);
4514 } else {
4515 return (dep_ipif);
4516 }
4517 }
4518
4519 ill_t *
4520 ill_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6)
4521 {
4522 ipif_t *ipif;
4523 ill_t *ill;
4524
4525 ipif = ipif_lookup_multicast(ipst, zoneid, isv6);
4526 if (ipif == NULL)
4527 return (NULL);
4528
4529 ill = ipif->ipif_ill;
4530 ill_refhold(ill);
4531 ipif_refrele(ipif);
4532 return (ill);
4533 }
4534
4535 /*
4536 * This function is called when an application does not specify an interface
4537 * to be used for multicast traffic (joining a group/sending data). It
4538 * calls ire_lookup_multi() to look for an interface route for the
4539 * specified multicast group. Doing this allows the administrator to add
4540 * prefix routes for multicast to indicate which interface to be used for
4541 * multicast traffic in the above scenario. The route could be for all
4542 * multicast (224.0/4), for a single multicast group (a /32 route) or
4543 * anything in between. If there is no such multicast route, we just find
 * any multicast-capable interface and return it. The returned ill
 * is refheld.
4546 *
4547 * We support MULTIRT and RTF_SETSRC on the multicast routes added to the
4548 * unicast table. This is used by CGTP.
4549 */
4550 ill_t *
4551 ill_lookup_group_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst,
4552 boolean_t *multirtp, ipaddr_t *setsrcp)
4553 {
4554 ill_t *ill;
4555
4556 ill = ire_lookup_multi_ill_v4(group, zoneid, ipst, multirtp, setsrcp);
4557 if (ill != NULL)
4558 return (ill);
4559
4560 return (ill_lookup_multicast(ipst, zoneid, B_FALSE));
4561 }
4562
4563 /*
4564 * Look for an ipif with the specified interface address and destination.
4565 * The destination address is used only for matching point-to-point interfaces.
4566 */
4567 ipif_t *
4568 ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, ip_stack_t *ipst)
4569 {
4570 ipif_t *ipif;
4571 ill_t *ill;
4572 ill_walk_context_t ctx;
4573
4574 /*
4575 * First match all the point-to-point interfaces
4576 * before looking at non-point-to-point interfaces.
	 * This is done to avoid returning a non-point-to-point
	 * ipif instead of an unnumbered point-to-point ipif.
4579 */
4580 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
4581 ill = ILL_START_WALK_V4(&ctx, ipst);
4582 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
4583 mutex_enter(&ill->ill_lock);
4584 for (ipif = ill->ill_ipif; ipif != NULL;
4585 ipif = ipif->ipif_next) {
4586 /* Allow the ipif to be down */
4587 if ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
4588 (ipif->ipif_lcl_addr == if_addr) &&
4589 (ipif->ipif_pp_dst_addr == dst)) {
4590 if (!IPIF_IS_CONDEMNED(ipif)) {
4591 ipif_refhold_locked(ipif);
4592 mutex_exit(&ill->ill_lock);
4593 rw_exit(&ipst->ips_ill_g_lock);
4594 return (ipif);
4595 }
4596 }
4597 }
4598 mutex_exit(&ill->ill_lock);
4599 }
4600 rw_exit(&ipst->ips_ill_g_lock);
4601
4602 /* lookup the ipif based on interface address */
4603 ipif = ipif_lookup_addr(if_addr, NULL, ALL_ZONES, ipst);
4604 ASSERT(ipif == NULL || !ipif->ipif_isv6);
4605 return (ipif);
4606 }
4607
4608 /*
4609 * Common function for ipif_lookup_addr() and ipif_lookup_addr_exact().
4610 */
4611 static ipif_t *
4612 ipif_lookup_addr_common(ipaddr_t addr, ill_t *match_ill, uint32_t match_flags,
4613 zoneid_t zoneid, ip_stack_t *ipst)
4614 {
4615 ipif_t *ipif;
4616 ill_t *ill;
4617 boolean_t ptp = B_FALSE;
4618 ill_walk_context_t ctx;
4619 boolean_t match_illgrp = (match_flags & IPIF_MATCH_ILLGRP);
4620 boolean_t no_duplicate = (match_flags & IPIF_MATCH_NONDUP);
4621
4622 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
4623 /*
4624 * Repeat twice, first based on local addresses and
4625 * next time for pointopoint.
4626 */
4627 repeat:
4628 ill = ILL_START_WALK_V4(&ctx, ipst);
4629 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
4630 if (match_ill != NULL && ill != match_ill &&
4631 (!match_illgrp || !IS_IN_SAME_ILLGRP(ill, match_ill))) {
4632 continue;
4633 }
4634 mutex_enter(&ill->ill_lock);
4635 for (ipif = ill->ill_ipif; ipif != NULL;
4636 ipif = ipif->ipif_next) {
4637 if (zoneid != ALL_ZONES &&
4638 zoneid != ipif->ipif_zoneid &&
4639 ipif->ipif_zoneid != ALL_ZONES)
4640 continue;
4641
4642 if (no_duplicate && !(ipif->ipif_flags & IPIF_UP))
4643 continue;
4644
4645 /* Allow the ipif to be down */
4646 if ((!ptp && (ipif->ipif_lcl_addr == addr) &&
4647 ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) ||
4648 (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) &&
4649 (ipif->ipif_pp_dst_addr == addr))) {
4650 if (!IPIF_IS_CONDEMNED(ipif)) {
4651 ipif_refhold_locked(ipif);
4652 mutex_exit(&ill->ill_lock);
4653 rw_exit(&ipst->ips_ill_g_lock);
4654 return (ipif);
4655 }
4656 }
4657 }
4658 mutex_exit(&ill->ill_lock);
4659 }
4660
4661 /* If we already did the ptp case, then we are done */
4662 if (ptp) {
4663 rw_exit(&ipst->ips_ill_g_lock);
4664 return (NULL);
4665 }
4666 ptp = B_TRUE;
4667 goto repeat;
4668 }
4669
4670 /*
4671 * Lookup an ipif with the specified address. For point-to-point links we
4672 * look for matches on either the destination address or the local address,
4673 * but we skip the local address check if IPIF_UNNUMBERED is set. If the
4674 * `match_ill' argument is non-NULL, the lookup is restricted to that ill
4675 * (or illgrp if `match_ill' is in an IPMP group).
4676 */
4677 ipif_t *
4678 ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid,
4679 ip_stack_t *ipst)
4680 {
4681 return (ipif_lookup_addr_common(addr, match_ill, IPIF_MATCH_ILLGRP,
4682 zoneid, ipst));
4683 }
4684
4685 /*
4686 * Lookup an ipif with the specified address. Similar to ipif_lookup_addr,
4687 * except that we will only return an address if it is not marked as
 * IPIF_DUPLICATE.
4689 */
4690 ipif_t *
4691 ipif_lookup_addr_nondup(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid,
4692 ip_stack_t *ipst)
4693 {
4694 return (ipif_lookup_addr_common(addr, match_ill,
4695 (IPIF_MATCH_ILLGRP | IPIF_MATCH_NONDUP),
4696 zoneid, ipst));
4697 }
4698
4699 /*
4700 * Special abbreviated version of ipif_lookup_addr() that doesn't match
4701 * `match_ill' across the IPMP group. This function is only needed in some
4702 * corner-cases; almost everything should use ipif_lookup_addr().
4703 */
4704 ipif_t *
4705 ipif_lookup_addr_exact(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst)
4706 {
4707 ASSERT(match_ill != NULL);
4708 return (ipif_lookup_addr_common(addr, match_ill, 0, ALL_ZONES,
4709 ipst));
4710 }
4711
4712 /*
 * Look for an ipif with the specified address. For point-to-point links
 * we look for matches on either the destination address or the local
 * address, but we skip the local address check if IPIF_UNNUMBERED
4716 * is set.
4717 * If the `match_ill' argument is non-NULL, the lookup is restricted to that
4718 * ill (or illgrp if `match_ill' is in an IPMP group).
4719 * Return the zoneid for the ipif which matches. ALL_ZONES if no match.
4720 */
4721 zoneid_t
4722 ipif_lookup_addr_zoneid(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst)
4723 {
4724 zoneid_t zoneid;
4725 ipif_t *ipif;
4726 ill_t *ill;
4727 boolean_t ptp = B_FALSE;
4728 ill_walk_context_t ctx;
4729
4730 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
4731 /*
4732 * Repeat twice, first based on local addresses and
4733 * next time for pointopoint.
4734 */
4735 repeat:
4736 ill = ILL_START_WALK_V4(&ctx, ipst);
4737 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
4738 if (match_ill != NULL && ill != match_ill &&
4739 !IS_IN_SAME_ILLGRP(ill, match_ill)) {
4740 continue;
4741 }
4742 mutex_enter(&ill->ill_lock);
4743 for (ipif = ill->ill_ipif; ipif != NULL;
4744 ipif = ipif->ipif_next) {
4745 /* Allow the ipif to be down */
			if (((!ptp && (ipif->ipif_lcl_addr == addr) &&
			    ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) ||
			    (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) &&
			    (ipif->ipif_pp_dst_addr == addr))) &&
			    !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
4751 zoneid = ipif->ipif_zoneid;
4752 mutex_exit(&ill->ill_lock);
4753 rw_exit(&ipst->ips_ill_g_lock);
4754 /*
4755 * If ipif_zoneid was ALL_ZONES then we have
4756 * a trusted extensions shared IP address.
4757 * In that case GLOBAL_ZONEID works to send.
4758 */
4759 if (zoneid == ALL_ZONES)
4760 zoneid = GLOBAL_ZONEID;
4761 return (zoneid);
4762 }
4763 }
4764 mutex_exit(&ill->ill_lock);
4765 }
4766
4767 /* If we already did the ptp case, then we are done */
4768 if (ptp) {
4769 rw_exit(&ipst->ips_ill_g_lock);
4770 return (ALL_ZONES);
4771 }
4772 ptp = B_TRUE;
4773 goto repeat;
4774 }
4775
4776 /*
4777 * Look for an ipif that matches the specified remote address i.e. the
4778 * ipif that would receive the specified packet.
4779 * First look for directly connected interfaces and then do a recursive
4780 * IRE lookup and pick the first ipif corresponding to the source address in the
4781 * ire.
4782 * Returns: held ipif
4783 *
4784 * This is only used for ICMP_ADDRESS_MASK_REQUESTs
4785 */
4786 ipif_t *
4787 ipif_lookup_remote(ill_t *ill, ipaddr_t addr, zoneid_t zoneid)
4788 {
4789 ipif_t *ipif;
4790
4791 ASSERT(!ill->ill_isv6);
4792
4793 /*
	 * Someone could be changing this ipif currently or change it
	 * after we return this. Thus a few packets could use the old
	 * values. However structure updates/creates (ire, ilg, ilm etc)
4797 * will atomically be updated or cleaned up with the new value
4798 * Thus we don't need a lock to check the flags or other attrs below.
4799 */
4800 mutex_enter(&ill->ill_lock);
4801 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
4802 if (IPIF_IS_CONDEMNED(ipif))
4803 continue;
4804 if (zoneid != ALL_ZONES && zoneid != ipif->ipif_zoneid &&
4805 ipif->ipif_zoneid != ALL_ZONES)
4806 continue;
4807 /* Allow the ipif to be down */
4808 if (ipif->ipif_flags & IPIF_POINTOPOINT) {
4809 if ((ipif->ipif_pp_dst_addr == addr) ||
4810 (!(ipif->ipif_flags & IPIF_UNNUMBERED) &&
4811 ipif->ipif_lcl_addr == addr)) {
4812 ipif_refhold_locked(ipif);
4813 mutex_exit(&ill->ill_lock);
4814 return (ipif);
4815 }
4816 } else if (ipif->ipif_subnet == (addr & ipif->ipif_net_mask)) {
4817 ipif_refhold_locked(ipif);
4818 mutex_exit(&ill->ill_lock);
4819 return (ipif);
4820 }
4821 }
4822 mutex_exit(&ill->ill_lock);
4823 /*
4824 * For a remote destination it isn't possible to nail down a particular
4825 * ipif.
4826 */
4827
4828 /* Pick the first interface */
4829 ipif = ipif_get_next_ipif(NULL, ill);
4830 return (ipif);
4831 }
4832
4833 /*
4834 * This func does not prevent refcnt from increasing. But if
4835 * the caller has taken steps to that effect, then this func
 * can be used to determine whether the ill has become quiescent.
4837 */
4838 static boolean_t
4839 ill_is_quiescent(ill_t *ill)
4840 {
4841 ipif_t *ipif;
4842
4843 ASSERT(MUTEX_HELD(&ill->ill_lock));
4844
4845 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
4846 if (ipif->ipif_refcnt != 0)
4847 return (B_FALSE);
4848 }
4849 if (!ILL_DOWN_OK(ill) || ill->ill_refcnt != 0) {
4850 return (B_FALSE);
4851 }
4852 return (B_TRUE);
4853 }
4854
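/*
 * Like ill_is_quiescent(), but checks whether the ill can be freed:
 * all ipif refcnts are zero, ILL_FREE_OK() holds, and ill_refcnt is zero.
 */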
4855 boolean_t
4856 ill_is_freeable(ill_t *ill)
4857 {
4858 ipif_t *ipif;
4859
4860 ASSERT(MUTEX_HELD(&ill->ill_lock));
4861
4862 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
4863 if (ipif->ipif_refcnt != 0) {
4864 return (B_FALSE);
4865 }
4866 }
4867 if (!ILL_FREE_OK(ill) || ill->ill_refcnt != 0) {
4868 return (B_FALSE);
4869 }
4870 return (B_TRUE);
4871 }
4872
4873 /*
4874 * This func does not prevent refcnt from increasing. But if
4875 * the caller has taken steps to that effect, then this func
 * can be used to determine whether the ipif has become quiescent.
4877 */
4878 static boolean_t
4879 ipif_is_quiescent(ipif_t *ipif)
4880 {
4881 ill_t *ill;
4882
4883 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
4884
4885 if (ipif->ipif_refcnt != 0)
4886 return (B_FALSE);
4887
4888 ill = ipif->ipif_ill;
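	/*
	 * If other ipifs on the ill remain up (or the ill is only
	 * logically down), only this ipif's own refcnt matters.
	 */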
4889 if (ill->ill_ipif_up_count != 0 || ill->ill_ipif_dup_count != 0 ||
4890 ill->ill_logical_down) {
4891 return (B_TRUE);
4892 }
4893
4894 /* This is the last ipif going down or being deleted on this ill */
4895 if (ill->ill_ire_cnt != 0 || ill->ill_refcnt != 0) {
4896 return (B_FALSE);
4897 }
4898
4899 return (B_TRUE);
4900 }
4901
4902 /*
 * Return B_TRUE if the ipif can be destroyed: the ipif has to be quiescent
4904 * with zero references from ire/ilm to it.
4905 */
4906 static boolean_t
4907 ipif_is_freeable(ipif_t *ipif)
4908 {
4909 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
4910 ASSERT(ipif->ipif_id != 0);
4911 return (ipif->ipif_refcnt == 0);
4912 }
4913
4914 /*
 * The ipif/ill/ire has been refrele'd. Do the tail processing.
 * Determine if the ipif or ill in question has become quiescent and if so
 * wake up close and/or restart any queued pending ioctl that is waiting
 * for ipif_down (or ill_down) to complete.
4919 */
4920 void
4921 ipif_ill_refrele_tail(ill_t *ill)
4922 {
4923 mblk_t *mp;
4924 conn_t *connp;
4925 ipsq_t *ipsq;
4926 ipxop_t *ipx;
4927 ipif_t *ipif;
4928 dl_notify_ind_t *dlindp;
4929
4930 ASSERT(MUTEX_HELD(&ill->ill_lock));
4931
4932 if ((ill->ill_state_flags & ILL_CONDEMNED) && ill_is_freeable(ill)) {
4933 /* ip_modclose() may be waiting */
4934 cv_broadcast(&ill->ill_cv);
4935 }
4936
4937 ipsq = ill->ill_phyint->phyint_ipsq;
4938 mutex_enter(&ipsq->ipsq_lock);
4939 ipx = ipsq->ipsq_xop;
4940 mutex_enter(&ipx->ipx_lock);
4941 if (ipx->ipx_waitfor == 0) /* no one's waiting; bail */
4942 goto unlock;
4943
4944 ASSERT(ipx->ipx_pending_mp != NULL && ipx->ipx_pending_ipif != NULL);
4945
4946 ipif = ipx->ipx_pending_ipif;
4947 if (ipif->ipif_ill != ill) /* wait is for another ill; bail */
4948 goto unlock;
4949
4950 switch (ipx->ipx_waitfor) {
4951 case IPIF_DOWN:
4952 if (!ipif_is_quiescent(ipif))
4953 goto unlock;
4954 break;
4955 case IPIF_FREE:
4956 if (!ipif_is_freeable(ipif))
4957 goto unlock;
4958 break;
4959 case ILL_DOWN:
4960 if (!ill_is_quiescent(ill))
4961 goto unlock;
4962 break;
4963 case ILL_FREE:
4964 /*
4965 * ILL_FREE is only for loopback; normal ill teardown waits
4966 * synchronously in ip_modclose() without using ipx_waitfor,
4967 * handled by the cv_broadcast() at the top of this function.
4968 */
4969 if (!ill_is_freeable(ill))
4970 goto unlock;
4971 break;
4972 default:
4973 cmn_err(CE_PANIC, "ipsq: %p unknown ipx_waitfor %d\n",
4974 (void *)ipsq, ipx->ipx_waitfor);
4975 }
4976
4977 ill_refhold_locked(ill); /* for qwriter_ip() call below */
4978 mutex_exit(&ipx->ipx_lock);
4979 mp = ipsq_pending_mp_get(ipsq, &connp);
4980 mutex_exit(&ipsq->ipsq_lock);
4981 mutex_exit(&ill->ill_lock);
4982
4983 ASSERT(mp != NULL);
4984 /*
4985 * NOTE: all of the qwriter_ip() calls below use CUR_OP since
	 * we can only get here when the current operation decides it
	 * needs to quiesce via ipsq_pending_mp_add().
4988 */
4989 switch (mp->b_datap->db_type) {
4990 case M_PCPROTO:
4991 case M_PROTO:
4992 /*
4993 * For now, only DL_NOTIFY_IND messages can use this facility.
4994 */
4995 dlindp = (dl_notify_ind_t *)mp->b_rptr;
4996 ASSERT(dlindp->dl_primitive == DL_NOTIFY_IND);
4997
4998 switch (dlindp->dl_notification) {
4999 case DL_NOTE_PHYS_ADDR:
5000 qwriter_ip(ill, ill->ill_rq, mp,
5001 ill_set_phys_addr_tail, CUR_OP, B_TRUE);
5002 return;
5003 case DL_NOTE_REPLUMB:
5004 qwriter_ip(ill, ill->ill_rq, mp,
5005 ill_replumb_tail, CUR_OP, B_TRUE);
5006 return;
5007 default:
5008 ASSERT(0);
5009 ill_refrele(ill);
5010 }
5011 break;
5012
5013 case M_ERROR:
5014 case M_HANGUP:
5015 qwriter_ip(ill, ill->ill_rq, mp, ipif_all_down_tail, CUR_OP,
5016 B_TRUE);
5017 return;
5018
5019 case M_IOCTL:
5020 case M_IOCDATA:
5021 qwriter_ip(ill, (connp != NULL ? CONNP_TO_WQ(connp) :
5022 ill->ill_wq), mp, ip_reprocess_ioctl, CUR_OP, B_TRUE);
5023 return;
5024
5025 default:
5026 cmn_err(CE_PANIC, "ipif_ill_refrele_tail mp %p "
5027 "db_type %d\n", (void *)mp, mp->b_datap->db_type);
5028 }
5029 return;
5030 unlock:
5031 mutex_exit(&ipsq->ipsq_lock);
5032 mutex_exit(&ipx->ipx_lock);
5033 mutex_exit(&ill->ill_lock);
5034 }
5035
5036 #ifdef DEBUG
5037 /* Reuse trace buffer from beginning (if reached the end) and record trace */
5038 static void
5039 th_trace_rrecord(th_trace_t *th_trace)
5040 {
5041 tr_buf_t *tr_buf;
5042 uint_t lastref;
5043
5044 lastref = th_trace->th_trace_lastref;
5045 lastref++;
5046 if (lastref == TR_BUF_MAX)
5047 lastref = 0;
5048 th_trace->th_trace_lastref = lastref;
5049 tr_buf = &th_trace->th_trbuf[lastref];
5050 tr_buf->tr_time = ddi_get_lbolt();
5051 tr_buf->tr_depth = getpcstack(tr_buf->tr_stack, TR_STACK_DEPTH);
5052 }
5053
5054 static void
5055 th_trace_free(void *value)
5056 {
5057 th_trace_t *th_trace = value;
5058
5059 ASSERT(th_trace->th_refcnt == 0);
5060 kmem_free(th_trace, sizeof (*th_trace));
5061 }
5062
5063 /*
5064 * Find or create the per-thread hash table used to track object references.
5065 * The ipst argument is NULL if we shouldn't allocate.
5066 *
5067 * Accesses per-thread data, so there's no need to lock here.
5068 */
5069 static mod_hash_t *
5070 th_trace_gethash(ip_stack_t *ipst)
5071 {
5072 th_hash_t *thh;
5073
5074 if ((thh = tsd_get(ip_thread_data)) == NULL && ipst != NULL) {
5075 mod_hash_t *mh;
5076 char name[256];
5077 size_t objsize, rshift;
5078 int retv;
5079
5080 if ((thh = kmem_alloc(sizeof (*thh), KM_NOSLEEP)) == NULL)
5081 return (NULL);
5082 (void) snprintf(name, sizeof (name), "th_trace_%p",
5083 (void *)curthread);
5084
5085 /*
5086 * We use mod_hash_create_extended here rather than the more
5087 * obvious mod_hash_create_ptrhash because the latter has a
5088 * hard-coded KM_SLEEP, and we'd prefer to fail rather than
5089 * block.
5090 */
5091 objsize = MAX(MAX(sizeof (ill_t), sizeof (ipif_t)),
5092 MAX(sizeof (ire_t), sizeof (ncec_t)));
5093 rshift = highbit(objsize);
5094 mh = mod_hash_create_extended(name, 64, mod_hash_null_keydtor,
5095 th_trace_free, mod_hash_byptr, (void *)rshift,
5096 mod_hash_ptrkey_cmp, KM_NOSLEEP);
5097 if (mh == NULL) {
5098 kmem_free(thh, sizeof (*thh));
5099 return (NULL);
5100 }
5101 thh->thh_hash = mh;
5102 thh->thh_ipst = ipst;
5103 /*
5104 * We trace ills, ipifs, ires, and nces. All of these are
5105 * per-IP-stack, so the lock on the thread list is as well.
5106 */
5107 rw_enter(&ip_thread_rwlock, RW_WRITER);
5108 list_insert_tail(&ip_thread_list, thh);
5109 rw_exit(&ip_thread_rwlock);
5110 retv = tsd_set(ip_thread_data, thh);
5111 ASSERT(retv == 0);
5112 }
5113 return (thh != NULL ? thh->thh_hash : NULL);
5114 }
5115
5116 boolean_t
5117 th_trace_ref(const void *obj, ip_stack_t *ipst)
5118 {
5119 th_trace_t *th_trace;
5120 mod_hash_t *mh;
5121 mod_hash_val_t val;
5122
5123 if ((mh = th_trace_gethash(ipst)) == NULL)
5124 return (B_FALSE);
5125
5126 /*
5127 * Attempt to locate the trace buffer for this obj and thread.
5128 * If it does not exist, then allocate a new trace buffer and
5129 * insert into the hash.
5130 */
5131 if (mod_hash_find(mh, (mod_hash_key_t)obj, &val) == MH_ERR_NOTFOUND) {
5132 th_trace = kmem_zalloc(sizeof (th_trace_t), KM_NOSLEEP);
5133 if (th_trace == NULL)
5134 return (B_FALSE);
5135
5136 th_trace->th_id = curthread;
5137 if (mod_hash_insert(mh, (mod_hash_key_t)obj,
5138 (mod_hash_val_t)th_trace) != 0) {
5139 kmem_free(th_trace, sizeof (th_trace_t));
5140 return (B_FALSE);
5141 }
5142 } else {
5143 th_trace = (th_trace_t *)val;
5144 }
5145
5146 ASSERT(th_trace->th_refcnt >= 0 &&
5147 th_trace->th_refcnt < TR_BUF_MAX - 1);
5148
5149 th_trace->th_refcnt++;
5150 th_trace_rrecord(th_trace);
5151 return (B_TRUE);
5152 }
5153
5154 /*
5155 * For the purpose of tracing a reference release, we assume that global
 * tracing is always on and that the same thread that initiated the
 * reference hold is the one releasing it.
5158 */
5159 void
5160 th_trace_unref(const void *obj)
5161 {
5162 int retv;
5163 mod_hash_t *mh;
5164 th_trace_t *th_trace;
5165 mod_hash_val_t val;
5166
5167 mh = th_trace_gethash(NULL);
5168 retv = mod_hash_find(mh, (mod_hash_key_t)obj, &val);
5169 ASSERT(retv == 0);
5170 th_trace = (th_trace_t *)val;
5171
5172 ASSERT(th_trace->th_refcnt > 0);
5173 th_trace->th_refcnt--;
5174 th_trace_rrecord(th_trace);
5175 }
5176
5177 /*
5178 * If tracing has been disabled, then we assume that the reference counts are
5179 * now useless, and we clear them out before destroying the entries.
5180 */
5181 void
5182 th_trace_cleanup(const void *obj, boolean_t trace_disable)
5183 {
5184 th_hash_t *thh;
5185 mod_hash_t *mh;
5186 mod_hash_val_t val;
5187 th_trace_t *th_trace;
5188 int retv;
5189
5190 rw_enter(&ip_thread_rwlock, RW_READER);
5191 for (thh = list_head(&ip_thread_list); thh != NULL;
5192 thh = list_next(&ip_thread_list, thh)) {
5193 if (mod_hash_find(mh = thh->thh_hash, (mod_hash_key_t)obj,
5194 &val) == 0) {
5195 th_trace = (th_trace_t *)val;
5196 if (trace_disable)
5197 th_trace->th_refcnt = 0;
5198 retv = mod_hash_destroy(mh, (mod_hash_key_t)obj);
5199 ASSERT(retv == 0);
5200 }
5201 }
5202 rw_exit(&ip_thread_rwlock);
5203 }
5204
5205 void
5206 ipif_trace_ref(ipif_t *ipif)
5207 {
5208 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
5209
5210 if (ipif->ipif_trace_disable)
5211 return;
5212
5213 if (!th_trace_ref(ipif, ipif->ipif_ill->ill_ipst)) {
5214 ipif->ipif_trace_disable = B_TRUE;
5215 ipif_trace_cleanup(ipif);
5216 }
5217 }
5218
5219 void
5220 ipif_untrace_ref(ipif_t *ipif)
5221 {
5222 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
5223
5224 if (!ipif->ipif_trace_disable)
5225 th_trace_unref(ipif);
5226 }
5227
5228 void
5229 ill_trace_ref(ill_t *ill)
5230 {
5231 ASSERT(MUTEX_HELD(&ill->ill_lock));
5232
5233 if (ill->ill_trace_disable)
5234 return;
5235
5236 if (!th_trace_ref(ill, ill->ill_ipst)) {
5237 ill->ill_trace_disable = B_TRUE;
5238 ill_trace_cleanup(ill);
5239 }
5240 }
5241
5242 void
5243 ill_untrace_ref(ill_t *ill)
5244 {
5245 ASSERT(MUTEX_HELD(&ill->ill_lock));
5246
5247 if (!ill->ill_trace_disable)
5248 th_trace_unref(ill);
5249 }
5250
5251 /*
5252 * Called when ipif is unplumbed or when memory alloc fails. Note that on
5253 * failure, ipif_trace_disable is set.
5254 */
5255 static void
5256 ipif_trace_cleanup(const ipif_t *ipif)
5257 {
5258 th_trace_cleanup(ipif, ipif->ipif_trace_disable);
5259 }
5260
5261 /*
5262 * Called when ill is unplumbed or when memory alloc fails. Note that on
5263 * failure, ill_trace_disable is set.
5264 */
5265 static void
5266 ill_trace_cleanup(const ill_t *ill)
5267 {
5268 th_trace_cleanup(ill, ill->ill_trace_disable);
5269 }
5270 #endif /* DEBUG */
5271
5272 void
5273 ipif_refhold_locked(ipif_t *ipif)
5274 {
5275 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
5276 ipif->ipif_refcnt++;
5277 IPIF_TRACE_REF(ipif);
5278 }
5279
5280 void
5281 ipif_refhold(ipif_t *ipif)
5282 {
5283 ill_t *ill;
5284
5285 ill = ipif->ipif_ill;
5286 mutex_enter(&ill->ill_lock);
5287 ipif->ipif_refcnt++;
5288 IPIF_TRACE_REF(ipif);
5289 mutex_exit(&ill->ill_lock);
5290 }
5291
5292 /*
5293 * Must not be called while holding any locks. Otherwise if this is
5294 * the last reference to be released there is a chance of recursive mutex
5295 * panic due to ipif_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying
5296 * to restart an ioctl.
5297 */
5298 void
5299 ipif_refrele(ipif_t *ipif)
5300 {
5301 ill_t *ill;
5302
5303 ill = ipif->ipif_ill;
5304
5305 mutex_enter(&ill->ill_lock);
5306 ASSERT(ipif->ipif_refcnt != 0);
5307 ipif->ipif_refcnt--;
5308 IPIF_UNTRACE_REF(ipif);
5309 if (ipif->ipif_refcnt != 0) {
5310 mutex_exit(&ill->ill_lock);
5311 return;
5312 }
5313
5314 /* Drops the ill_lock */
5315 ipif_ill_refrele_tail(ill);
5316 }
5317
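/*
 * Return the next non-condemned ipif on `ill' after `curr' (the first
 * ipif if `curr' is NULL), refheld; NULL if there is none. The caller
 * must ipif_refrele() the returned ipif.
 */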
5318 ipif_t *
5319 ipif_get_next_ipif(ipif_t *curr, ill_t *ill)
5320 {
5321 ipif_t *ipif;
5322
5323 mutex_enter(&ill->ill_lock);
5324 for (ipif = (curr == NULL ? ill->ill_ipif : curr->ipif_next);
5325 ipif != NULL; ipif = ipif->ipif_next) {
5326 if (IPIF_IS_CONDEMNED(ipif))
5327 continue;
5328 ipif_refhold_locked(ipif);
5329 mutex_exit(&ill->ill_lock);
5330 return (ipif);
5331 }
5332 mutex_exit(&ill->ill_lock);
5333 return (NULL);
5334 }
5335
5336 /*
5337 * TODO: make this table extendible at run time
5338 * Return a pointer to the mac type info for 'mac_type'
5339 */
5340 static ip_m_t *
5341 ip_m_lookup(t_uscalar_t mac_type)
5342 {
5343 ip_m_t *ipm;
5344
5345 for (ipm = ip_m_tbl; ipm < A_END(ip_m_tbl); ipm++)
5346 if (ipm->ip_m_mac_type == mac_type)
5347 return (ipm);
5348 return (NULL);
5349 }
5350
5351 /*
5352 * Make a link layer address from the multicast IP address *addr.
5353 * To form the link layer address, invoke the ip_m_v*mapping function
5354 * associated with the link-layer type.
5355 */
5356 void
5357 ip_mcast_mapping(ill_t *ill, uchar_t *addr, uchar_t *hwaddr)
5358 {
5359 ip_m_t *ipm;
5360
5361 if (ill->ill_net_type == IRE_IF_NORESOLVER)
5362 return;
5363
5364 ASSERT(addr != NULL);
5365
5366 ipm = ip_m_lookup(ill->ill_mactype);
5367 if (ipm == NULL ||
5368 (ill->ill_isv6 && ipm->ip_m_v6mapping == NULL) ||
5369 (!ill->ill_isv6 && ipm->ip_m_v4mapping == NULL)) {
5370 ip0dbg(("no mapping for ill %s mactype 0x%x\n",
5371 ill->ill_name, ill->ill_mactype));
5372 return;
5373 }
5374 if (ill->ill_isv6)
5375 (*ipm->ip_m_v6mapping)(ill, addr, hwaddr);
5376 else
5377 (*ipm->ip_m_v4mapping)(ill, addr, hwaddr);
5378 }
5379
5380 /*
5381 * Returns B_FALSE if the IPv4 netmask pointed by `mask' is non-contiguous.
5382 * Otherwise returns B_TRUE.
5383 *
 * The netmask can be verified to be contiguous with 32 shift-and-OR
 * operations: take the mask (in host byte order) and compute
 *	mask | mask << 1 | mask << 2 | ... | mask << 31
 * For a contiguous mask the result equals the original mask.
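 *
 * For example, 0xffffff00 is contiguous: each left-shifted copy only
 * sets bits that are already set, so the OR equals the mask.
 * 0xff00ff00 is not: mask << 1 sets bit 16, which is clear in the
 * mask, so the OR differs from the mask.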
5388 */
5389 static boolean_t
5390 ip_contiguous_mask(uint32_t mask)
5391 {
5392 uint32_t m = mask;
5393 int i;
5394
5395 for (i = 1; i < 32; i++)
5396 m |= (mask << i);
5397
5398 return (m == mask);
5399 }
5400
5401 /*
5402 * ip_rt_add is called to add an IPv4 route to the forwarding table.
5403 * ill is passed in to associate it with the correct interface.
5404 * If ire_arg is set, then we return the held IRE in that location.
5405 */
5406 int
5407 ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr,
5408 ipaddr_t src_addr, int flags, ill_t *ill, ire_t **ire_arg,
5409 boolean_t ioctl_msg, struct rtsa_s *sp, ip_stack_t *ipst, zoneid_t zoneid)
5410 {
5411 ire_t *ire, *nire;
5412 ire_t *gw_ire = NULL;
5413 ipif_t *ipif = NULL;
5414 uint_t type;
5415 int match_flags = MATCH_IRE_TYPE;
5416 tsol_gc_t *gc = NULL;
5417 tsol_gcgrp_t *gcgrp = NULL;
5418 boolean_t gcgrp_xtraref = B_FALSE;
5419 boolean_t cgtp_broadcast;
5420 boolean_t unbound = B_FALSE;
5421
5422 ip1dbg(("ip_rt_add:"));
5423
5424 if (ire_arg != NULL)
5425 *ire_arg = NULL;
5426
5427 /* disallow non-contiguous netmasks */
5428 if (!ip_contiguous_mask(ntohl(mask)))
5429 return (ENOTSUP);
5430
5431 /*
5432 * If this is the case of RTF_HOST being set, then we set the netmask
5433 * to all ones (regardless if one was supplied).
5434 */
5435 if (flags & RTF_HOST)
5436 mask = IP_HOST_MASK;
5437
5438 /*
5439 * Prevent routes with a zero gateway from being created (since
	 * interfaces can currently be plumbed and brought up with no
	 * assigned address).
5442 */
5443 if (gw_addr == 0)
5444 return (ENETUNREACH);
5445 /*
5446 * Get the ipif, if any, corresponding to the gw_addr
	 * If -ifp was specified we restrict ourselves to the ill, otherwise
	 * we match on the gateway and destination to handle unnumbered pt-pt
5449 * interfaces.
5450 */
5451 if (ill != NULL)
5452 ipif = ipif_lookup_addr(gw_addr, ill, ALL_ZONES, ipst);
5453 else
5454 ipif = ipif_lookup_interface(gw_addr, dst_addr, ipst);
5455 if (ipif != NULL) {
5456 if (IS_VNI(ipif->ipif_ill)) {
5457 ipif_refrele(ipif);
5458 return (EINVAL);
5459 }
5460 }
5461
5462 /*
5463 * GateD will attempt to create routes with a loopback interface
5464 * address as the gateway and with RTF_GATEWAY set. We allow
5465 * these routes to be added, but create them as interface routes
5466 * since the gateway is an interface address.
5467 */
5468 if ((ipif != NULL) && (ipif->ipif_ire_type == IRE_LOOPBACK)) {
5469 flags &= ~RTF_GATEWAY;
5470 if (gw_addr == INADDR_LOOPBACK && dst_addr == INADDR_LOOPBACK &&
5471 mask == IP_HOST_MASK) {
5472 ire = ire_ftable_lookup_v4(dst_addr, 0, 0, IRE_LOOPBACK,
5473 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst,
5474 NULL);
5475 if (ire != NULL) {
5476 ire_refrele(ire);
5477 ipif_refrele(ipif);
5478 return (EEXIST);
5479 }
			ip1dbg(("ip_rt_add: 0x%p creating IRE 0x%x"
			    " for 0x%x\n", (void *)ipif,
5482 ipif->ipif_ire_type,
5483 ntohl(ipif->ipif_lcl_addr)));
5484 ire = ire_create(
5485 (uchar_t *)&dst_addr, /* dest address */
5486 (uchar_t *)&mask, /* mask */
5487 NULL, /* no gateway */
5488 ipif->ipif_ire_type, /* LOOPBACK */
5489 ipif->ipif_ill,
5490 zoneid,
5491 (ipif->ipif_flags & IPIF_PRIVATE) ? RTF_PRIVATE : 0,
5492 NULL,
5493 ipst);
5494
5495 if (ire == NULL) {
5496 ipif_refrele(ipif);
5497 return (ENOMEM);
5498 }
5499 /* src address assigned by the caller? */
5500 if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC))
5501 ire->ire_setsrc_addr = src_addr;
5502
5503 nire = ire_add(ire);
5504 if (nire == NULL) {
5505 /*
				 * In the case of failure, ire_add() will have
5507 * already deleted the ire in question, so there
5508 * is no need to do that here.
5509 */
5510 ipif_refrele(ipif);
5511 return (ENOMEM);
5512 }
5513 /*
5514 * Check if it was a duplicate entry. This handles
			 * the case of two racing route adds for the same route.
5516 */
5517 if (nire != ire) {
5518 ASSERT(nire->ire_identical_ref > 1);
5519 ire_delete(nire);
5520 ire_refrele(nire);
5521 ipif_refrele(ipif);
5522 return (EEXIST);
5523 }
5524 ire = nire;
5525 goto save_ire;
5526 }
5527 }
5528
5529 /*
5530 * The routes for multicast with CGTP are quite special in that
5531 * the gateway is the local interface address, yet RTF_GATEWAY
5532 * is set. We turn off RTF_GATEWAY to provide compatibility with
5533 * this undocumented and unusual use of multicast routes.
5534 */
5535 if ((flags & RTF_MULTIRT) && ipif != NULL)
5536 flags &= ~RTF_GATEWAY;
5537
5538 /*
	 * Traditionally, interface routes are ones where RTF_GATEWAY isn't set
	 * and the gateway address provided is one of the system's interface
	 * addresses.  By using the routing socket interface and supplying an
	 * RTA_IFP sockaddr with an interface index, an alternate method of
	 * creating an interface route is available: the outgoing interface is
	 * specified by its interface index rather than by an interface address
	 * (which may not be able to uniquely identify an interface).  When
	 * coupled with the RTF_GATEWAY flag, routes can be specified which not
	 * only give the next-hop to be used when routing to a certain prefix,
	 * but also which outgoing interface should be used.
5550 *
5551 * Previously, interfaces would have unique addresses assigned to them
5552 * and so the address assigned to a particular interface could be used
5553 * to identify a particular interface. One exception to this was the
5554 * case of an unnumbered interface (where IPIF_UNNUMBERED was set).
5555 *
5556 * With the advent of IPv6 and its link-local addresses, this
5557 * restriction was relaxed and interfaces could share addresses between
5558 * themselves. In fact, typically all of the link-local interfaces on
5559 * an IPv6 node or router will have the same link-local address. In
5560 * order to differentiate between these interfaces, the use of an
5561 * interface index is necessary and this index can be carried inside a
5562 * RTA_IFP sockaddr (which is actually a sockaddr_dl). One restriction
5563 * of using the interface index, however, is that all of the ipif's that
5564 * are part of an ill have the same index and so the RTA_IFP sockaddr
5565 * cannot be used to differentiate between ipif's (or logical
5566 * interfaces) that belong to the same ill (physical interface).
5567 *
5568 * For example, in the following case involving IPv4 interfaces and
5569 * logical interfaces
5570 *
5571 * 192.0.2.32 255.255.255.224 192.0.2.33 U if0
5572 * 192.0.2.32 255.255.255.224 192.0.2.34 U if0
5573 * 192.0.2.32 255.255.255.224 192.0.2.35 U if0
5574 *
5575 * the ipif's corresponding to each of these interface routes can be
5576 * uniquely identified by the "gateway" (actually interface address).
5577 *
5578 * In this case involving multiple IPv6 default routes to a particular
5579 * link-local gateway, the use of RTA_IFP is necessary to specify which
5580 * default route is of interest:
5581 *
5582 * default fe80::123:4567:89ab:cdef U if0
5583 * default fe80::123:4567:89ab:cdef U if1
5584 */
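
	/*
	 * For illustration, a userland routing-socket request that supplies
	 * RTA_IFP might be laid out roughly as follows (a sketch only; the
	 * "net0" name is hypothetical):
	 *
	 *	struct rt_msghdr	rtm_type = RTM_ADD;
	 *				rtm_flags = RTF_UP | RTF_GATEWAY;
	 *				rtm_addrs = RTA_DST | RTA_GATEWAY |
	 *				    RTA_NETMASK | RTA_IFP;
	 *	sockaddr (RTA_DST)	the destination prefix
	 *	sockaddr (RTA_GATEWAY)	the next-hop gateway
	 *	sockaddr (RTA_NETMASK)	the netmask
	 *	sockaddr_dl (RTA_IFP)	sdl_family = AF_LINK;
	 *				sdl_index = if_nametoindex("net0");
	 *
	 * The sockaddrs follow the header in RTA_* bit order; the routing
	 * socket code resolves the RTA_IFP index to an ill and passes it to
	 * this function.
	 */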
5585
5586 /* RTF_GATEWAY not set */
5587 if (!(flags & RTF_GATEWAY)) {
5588 if (sp != NULL) {
5589 ip2dbg(("ip_rt_add: gateway security attributes "
5590 "cannot be set with interface route\n"));
5591 if (ipif != NULL)
5592 ipif_refrele(ipif);
5593 return (EINVAL);
5594 }
5595
5596 /*
5597 * Whether or not ill (RTA_IFP) is set, we require that
5598 * the gateway is one of our local addresses.
5599 */
5600 if (ipif == NULL)
5601 return (ENETUNREACH);
5602
5603 /*
5604 * We use MATCH_IRE_ILL here. If the caller specified an
5605 * interface (from the RTA_IFP sockaddr) we use it, otherwise
5606 * we use the ill derived from the gateway address.
5607 * We can always match the gateway address since we record it
5608 * in ire_gateway_addr.
5609 * We don't allow RTA_IFP to specify a different ill than the
5610 * one matching the ipif to make sure we can delete the route.
5611 */
5612 match_flags |= MATCH_IRE_GW | MATCH_IRE_ILL;
5613 if (ill == NULL) {
5614 ill = ipif->ipif_ill;
5615 } else if (ill != ipif->ipif_ill) {
5616 ipif_refrele(ipif);
5617 return (EINVAL);
5618 }
5619
5620 /*
5621 * We check for an existing entry at this point.
5622 *
5623 * Since a netmask isn't passed in via the ioctl interface
5624 * (SIOCADDRT), we don't check for a matching netmask in that
5625 * case.
5626 */
5627 if (!ioctl_msg)
5628 match_flags |= MATCH_IRE_MASK;
5629 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr,
5630 IRE_INTERFACE, ill, ALL_ZONES, NULL, match_flags, 0, ipst,
5631 NULL);
5632 if (ire != NULL) {
5633 ire_refrele(ire);
5634 ipif_refrele(ipif);
5635 return (EEXIST);
5636 }
5637
5638 /*
5639 * Some software (for example, GateD and Sun Cluster) attempts
5640 * to create (what amount to) IRE_PREFIX routes with the
5641 * loopback address as the gateway. This is primarily done to
5642 * set up prefixes with the RTF_REJECT flag set (for example,
5643 * when generating aggregate routes.)
5644 *
		 * If the IRE type (as defined by ill->ill_net_type) would be
		 * IRE_LOOPBACK, then we map the request into an
		 * IRE_IF_NORESOLVER.  We also OR in the RTF_BLACKHOLE flag,
		 * as these interface routes, by definition, can only
		 * blackhole traffic.
		 *
		 * Needless to say, the real IRE_LOOPBACK is NOT created by
		 * this routine, but rather using ire_create() directly.
5653 */
5654 type = ill->ill_net_type;
5655 if (type == IRE_LOOPBACK) {
5656 type = IRE_IF_NORESOLVER;
5657 flags |= RTF_BLACKHOLE;
5658 }
5659
5660 /*
5661 * Create a copy of the IRE_IF_NORESOLVER or
5662 * IRE_IF_RESOLVER with the modified address, netmask, and
5663 * gateway.
5664 */
5665 ire = ire_create(
5666 (uchar_t *)&dst_addr,
5667 (uint8_t *)&mask,
5668 (uint8_t *)&gw_addr,
5669 type,
5670 ill,
5671 zoneid,
5672 flags,
5673 NULL,
5674 ipst);
5675 if (ire == NULL) {
5676 ipif_refrele(ipif);
5677 return (ENOMEM);
5678 }
5679
5680 /* src address assigned by the caller? */
5681 if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC))
5682 ire->ire_setsrc_addr = src_addr;
5683
5684 nire = ire_add(ire);
5685 if (nire == NULL) {
5686 /*
			 * In the case of failure, ire_add() will have
5688 * already deleted the ire in question, so there
5689 * is no need to do that here.
5690 */
5691 ipif_refrele(ipif);
5692 return (ENOMEM);
5693 }
5694 /*
5695 * Check if it was a duplicate entry. This handles
		 * the case of two racing route adds for the same route.
5697 */
5698 if (nire != ire) {
5699 ire_delete(nire);
5700 ire_refrele(nire);
5701 ipif_refrele(ipif);
5702 return (EEXIST);
5703 }
5704 ire = nire;
5705 goto save_ire;
5706 }
5707
5708 /*
5709 * Get an interface IRE for the specified gateway.
5710 * If we don't have an IRE_IF_NORESOLVER or IRE_IF_RESOLVER for the
5711 * gateway, it is currently unreachable and we fail the request
5712 * accordingly. We reject any RTF_GATEWAY routes where the gateway
5713 * is an IRE_LOCAL or IRE_LOOPBACK.
5714 * If RTA_IFP was specified we look on that particular ill.
5715 */
5716 if (ill != NULL)
5717 match_flags |= MATCH_IRE_ILL;
5718
5719 /* Check whether the gateway is reachable. */
5720 again:
5721 type = IRE_INTERFACE | IRE_LOCAL | IRE_LOOPBACK;
5722 if (flags & RTF_INDIRECT)
5723 type |= IRE_OFFLINK;
5724
5725 gw_ire = ire_ftable_lookup_v4(gw_addr, 0, 0, type, ill,
5726 ALL_ZONES, NULL, match_flags, 0, ipst, NULL);
5727 if (gw_ire == NULL) {
5728 /*
5729 * With IPMP, we allow host routes to influence in.mpathd's
5730 * target selection. However, if the test addresses are on
5731 * their own network, the above lookup will fail since the
5732 * underlying IRE_INTERFACEs are marked hidden. So allow
5733 * hidden test IREs to be found and try again.
5734 */
5735 if (!(match_flags & MATCH_IRE_TESTHIDDEN)) {
5736 match_flags |= MATCH_IRE_TESTHIDDEN;
5737 goto again;
5738 }
5739 if (ipif != NULL)
5740 ipif_refrele(ipif);
5741 return (ENETUNREACH);
5742 }
5743 if (gw_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) {
5744 ire_refrele(gw_ire);
5745 if (ipif != NULL)
5746 ipif_refrele(ipif);
5747 return (ENETUNREACH);
5748 }
5749
5750 if (ill == NULL && !(flags & RTF_INDIRECT)) {
5751 unbound = B_TRUE;
5752 if (ipst->ips_ip_strict_src_multihoming > 0)
5753 ill = gw_ire->ire_ill;
5754 }
5755
5756 /*
5757 * We create one of three types of IREs as a result of this request
5758 * based on the netmask. A netmask of all ones (which is automatically
5759 * assumed when RTF_HOST is set) results in an IRE_HOST being created.
5760 * An all zeroes netmask implies a default route so an IRE_DEFAULT is
5761 * created. Otherwise, an IRE_PREFIX route is created for the
5762 * destination prefix.
5763 */
5764 if (mask == IP_HOST_MASK)
5765 type = IRE_HOST;
5766 else if (mask == 0)
5767 type = IRE_DEFAULT;
5768 else
5769 type = IRE_PREFIX;
5770
5771 /* check for a duplicate entry */
5772 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, type, ill,
5773 ALL_ZONES, NULL, match_flags | MATCH_IRE_MASK | MATCH_IRE_GW,
5774 0, ipst, NULL);
5775 if (ire != NULL) {
5776 if (ipif != NULL)
5777 ipif_refrele(ipif);
5778 ire_refrele(gw_ire);
5779 ire_refrele(ire);
5780 return (EEXIST);
5781 }
5782
5783 /* Security attribute exists */
5784 if (sp != NULL) {
5785 tsol_gcgrp_addr_t ga;
5786
5787 /* find or create the gateway credentials group */
5788 ga.ga_af = AF_INET;
5789 IN6_IPADDR_TO_V4MAPPED(gw_addr, &ga.ga_addr);
5790
5791 /* we hold reference to it upon success */
5792 gcgrp = gcgrp_lookup(&ga, B_TRUE);
5793 if (gcgrp == NULL) {
5794 if (ipif != NULL)
5795 ipif_refrele(ipif);
5796 ire_refrele(gw_ire);
5797 return (ENOMEM);
5798 }
5799
5800 /*
5801 * Create and add the security attribute to the group; a
5802 * reference to the group is made upon allocating a new
5803 * entry successfully. If it finds an already-existing
5804 * entry for the security attribute in the group, it simply
5805 * returns it and no new reference is made to the group.
5806 */
5807 gc = gc_create(sp, gcgrp, &gcgrp_xtraref);
5808 if (gc == NULL) {
5809 if (ipif != NULL)
5810 ipif_refrele(ipif);
5811 /* release reference held by gcgrp_lookup */
5812 GCGRP_REFRELE(gcgrp);
5813 ire_refrele(gw_ire);
5814 return (ENOMEM);
5815 }
5816 }
5817
5818 /* Create the IRE. */
5819 ire = ire_create(
5820 (uchar_t *)&dst_addr, /* dest address */
5821 (uchar_t *)&mask, /* mask */
5822 (uchar_t *)&gw_addr, /* gateway address */
5823 (ushort_t)type, /* IRE type */
5824 ill,
5825 zoneid,
5826 flags,
5827 gc, /* security attribute */
5828 ipst);
5829
5830 /*
5831 * The ire holds a reference to the 'gc' and the 'gc' holds a
5832 * reference to the 'gcgrp'. We can now release the extra reference
5833 * the 'gcgrp' acquired in the gcgrp_lookup, if it was not used.
5834 */
5835 if (gcgrp_xtraref)
5836 GCGRP_REFRELE(gcgrp);
5837 if (ire == NULL) {
5838 if (gc != NULL)
5839 GC_REFRELE(gc);
5840 if (ipif != NULL)
5841 ipif_refrele(ipif);
5842 ire_refrele(gw_ire);
5843 return (ENOMEM);
5844 }
5845
5846 /* Before we add, check if an extra CGTP broadcast is needed */
5847 cgtp_broadcast = ((flags & RTF_MULTIRT) &&
5848 ip_type_v4(ire->ire_addr, ipst) == IRE_BROADCAST);
5849
5850 /* src address assigned by the caller? */
5851 if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC))
5852 ire->ire_setsrc_addr = src_addr;
5853
5854 ire->ire_unbound = unbound;
5855
5856 /*
5857 * POLICY: should we allow an RTF_HOST with address INADDR_ANY?
	 * The SunOS socket code does, but do we really want to allow 0.0.0.0?
5859 */
5860
5861 /* Add the new IRE. */
5862 nire = ire_add(ire);
5863 if (nire == NULL) {
5864 /*
		 * In the case of failure, ire_add() will have
5866 * already deleted the ire in question, so there
5867 * is no need to do that here.
5868 */
5869 if (ipif != NULL)
5870 ipif_refrele(ipif);
5871 ire_refrele(gw_ire);
5872 return (ENOMEM);
5873 }
5874 /*
5875 * Check if it was a duplicate entry. This handles
	 * the case of two racing route adds for the same route.
5877 */
5878 if (nire != ire) {
5879 ire_delete(nire);
5880 ire_refrele(nire);
5881 if (ipif != NULL)
5882 ipif_refrele(ipif);
5883 ire_refrele(gw_ire);
5884 return (EEXIST);
5885 }
5886 ire = nire;
5887
5888 if (flags & RTF_MULTIRT) {
5889 /*
5890 * Invoke the CGTP (multirouting) filtering module
5891 * to add the dst address in the filtering database.
5892 * Replicated inbound packets coming from that address
5893 * will be filtered to discard the duplicates.
5894 * It is not necessary to call the CGTP filter hook
5895 * when the dst address is a broadcast or multicast,
5896 * because an IP source address cannot be a broadcast
5897 * or a multicast.
5898 */
5899 if (cgtp_broadcast) {
5900 ip_cgtp_bcast_add(ire, ipst);
5901 goto save_ire;
5902 }
5903 if (ipst->ips_ip_cgtp_filter_ops != NULL &&
5904 !CLASSD(ire->ire_addr)) {
5905 int res;
5906 ipif_t *src_ipif;
5907
5908 /* Find the source address corresponding to gw_ire */
5909 src_ipif = ipif_lookup_addr(gw_ire->ire_gateway_addr,
5910 NULL, zoneid, ipst);
5911 if (src_ipif != NULL) {
5912 res = ipst->ips_ip_cgtp_filter_ops->
5913 cfo_add_dest_v4(
5914 ipst->ips_netstack->netstack_stackid,
5915 ire->ire_addr,
5916 ire->ire_gateway_addr,
5917 ire->ire_setsrc_addr,
5918 src_ipif->ipif_lcl_addr);
5919 ipif_refrele(src_ipif);
5920 } else {
5921 res = EADDRNOTAVAIL;
5922 }
5923 if (res != 0) {
5924 if (ipif != NULL)
5925 ipif_refrele(ipif);
5926 ire_refrele(gw_ire);
5927 ire_delete(ire);
5928 ire_refrele(ire); /* Held in ire_add */
5929 return (res);
5930 }
5931 }
5932 }
5933
5934 save_ire:
5935 if (gw_ire != NULL) {
5936 ire_refrele(gw_ire);
5937 gw_ire = NULL;
5938 }
5939 if (ill != NULL) {
5940 /*
5941 * Save enough information so that we can recreate the IRE if
5942 * the interface goes down and then up. The metrics associated
5943 * with the route will be saved as well when rts_setmetrics() is
5944 * called after the IRE has been created. In the case where
5945 * memory cannot be allocated, none of this information will be
5946 * saved.
5947 */
5948 ill_save_ire(ill, ire);
5949 }
5950 if (ioctl_msg)
5951 ip_rts_rtmsg(RTM_OLDADD, ire, 0, ipst);
5952 if (ire_arg != NULL) {
5953 /*
5954 * Store the ire that was successfully added into where ire_arg
5955 * points to so that callers don't have to look it up
5956 * themselves (but they are responsible for ire_refrele()ing
5957 * the ire when they are finished with it).
5958 */
5959 *ire_arg = ire;
5960 } else {
5961 ire_refrele(ire); /* Held in ire_add */
5962 }
5963 if (ipif != NULL)
5964 ipif_refrele(ipif);
5965 return (0);
5966 }
5967
5968 /*
5969 * ip_rt_delete is called to delete an IPv4 route.
5970 * ill is passed in to associate it with the correct interface.
5971 */
5972 /* ARGSUSED4 */
5973 int
5974 ip_rt_delete(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr,
5975 uint_t rtm_addrs, int flags, ill_t *ill, boolean_t ioctl_msg,
5976 ip_stack_t *ipst, zoneid_t zoneid)
5977 {
5978 ire_t *ire = NULL;
5979 ipif_t *ipif;
5980 uint_t type;
5981 uint_t match_flags = MATCH_IRE_TYPE;
5982 int err = 0;
5983
5984 ip1dbg(("ip_rt_delete:"));
5985 /*
	 * If RTF_HOST is set, then we set the netmask
5987 * to all ones. Otherwise, we use the netmask if one was supplied.
5988 */
5989 if (flags & RTF_HOST) {
5990 mask = IP_HOST_MASK;
5991 match_flags |= MATCH_IRE_MASK;
5992 } else if (rtm_addrs & RTA_NETMASK) {
5993 match_flags |= MATCH_IRE_MASK;
5994 }
5995
5996 /*
5997 * Note that RTF_GATEWAY is never set on a delete, therefore
5998 * we check if the gateway address is one of our interfaces first,
5999 * and fall back on RTF_GATEWAY routes.
6000 *
6001 * This makes it possible to delete an original
6002 * IRE_IF_NORESOLVER/IRE_IF_RESOLVER - consistent with SunOS 4.1.
6003 * However, we have RTF_KERNEL set on the ones created by ipif_up
	 * and those cannot be deleted here.
6005 *
6006 * We use MATCH_IRE_ILL if we know the interface. If the caller
6007 * specified an interface (from the RTA_IFP sockaddr) we use it,
6008 * otherwise we use the ill derived from the gateway address.
6009 * We can always match the gateway address since we record it
6010 * in ire_gateway_addr.
6011 *
6012 * For more detail on specifying routes by gateway address and by
6013 * interface index, see the comments in ip_rt_add().
6014 */
6015 ipif = ipif_lookup_interface(gw_addr, dst_addr, ipst);
6016 if (ipif != NULL) {
6017 ill_t *ill_match;
6018
6019 if (ill != NULL)
6020 ill_match = ill;
6021 else
6022 ill_match = ipif->ipif_ill;
6023
6024 match_flags |= MATCH_IRE_ILL;
6025 if (ipif->ipif_ire_type == IRE_LOOPBACK) {
6026 ire = ire_ftable_lookup_v4(dst_addr, mask, 0,
6027 IRE_LOOPBACK, ill_match, ALL_ZONES, NULL,
6028 match_flags, 0, ipst, NULL);
6029 }
6030 if (ire == NULL) {
6031 match_flags |= MATCH_IRE_GW;
6032 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr,
6033 IRE_INTERFACE, ill_match, ALL_ZONES, NULL,
6034 match_flags, 0, ipst, NULL);
6035 }
		/* Avoid deleting routes created by the kernel from an ipif */
6037 if (ire != NULL && (ire->ire_flags & RTF_KERNEL)) {
6038 ire_refrele(ire);
6039 ire = NULL;
6040 }
6041
6042 /* Restore in case we didn't find a match */
6043 match_flags &= ~(MATCH_IRE_GW|MATCH_IRE_ILL);
6044 }
6045
6046 if (ire == NULL) {
6047 /*
		 * At this point, either the gateway address is not one of
		 * our own addresses, or a matching interface route was not
		 * found.  We set the IRE type to look up based on whether
		 * this is a host route, a default route, or just a prefix.
6052 *
6053 * If an ill was passed in, then the lookup is based on an
6054 * interface index so MATCH_IRE_ILL is added to match_flags.
6055 */
6056 match_flags |= MATCH_IRE_GW;
6057 if (ill != NULL)
6058 match_flags |= MATCH_IRE_ILL;
6059 if (mask == IP_HOST_MASK)
6060 type = IRE_HOST;
6061 else if (mask == 0)
6062 type = IRE_DEFAULT;
6063 else
6064 type = IRE_PREFIX;
6065 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, type, ill,
6066 ALL_ZONES, NULL, match_flags, 0, ipst, NULL);
6067 }
6068
6069 if (ipif != NULL) {
6070 ipif_refrele(ipif);
6071 ipif = NULL;
6072 }
6073
6074 if (ire == NULL)
6075 return (ESRCH);
6076
6077 if (ire->ire_flags & RTF_MULTIRT) {
6078 /*
6079 * Invoke the CGTP (multirouting) filtering module
6080 * to remove the dst address from the filtering database.
6081 * Packets coming from that address will no longer be
6082 * filtered to remove duplicates.
6083 */
6084 if (ipst->ips_ip_cgtp_filter_ops != NULL) {
6085 err = ipst->ips_ip_cgtp_filter_ops->cfo_del_dest_v4(
6086 ipst->ips_netstack->netstack_stackid,
6087 ire->ire_addr, ire->ire_gateway_addr);
6088 }
6089 ip_cgtp_bcast_delete(ire, ipst);
6090 }
6091
6092 ill = ire->ire_ill;
6093 if (ill != NULL)
6094 ill_remove_saved_ire(ill, ire);
6095 if (ioctl_msg)
6096 ip_rts_rtmsg(RTM_OLDDEL, ire, 0, ipst);
6097 ire_delete(ire);
6098 ire_refrele(ire);
6099 return (err);
6100 }
6101
6102 /*
6103 * ip_siocaddrt is called to complete processing of an SIOCADDRT IOCTL.
6104 */
6105 /* ARGSUSED */
6106 int
6107 ip_siocaddrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
6108 ip_ioctl_cmd_t *ipip, void *dummy_if_req)
6109 {
6110 ipaddr_t dst_addr;
6111 ipaddr_t gw_addr;
6112 ipaddr_t mask;
6113 int error = 0;
6114 mblk_t *mp1;
6115 struct rtentry *rt;
6116 ipif_t *ipif = NULL;
6117 ip_stack_t *ipst;
6118
6119 ASSERT(q->q_next == NULL);
6120 ipst = CONNQ_TO_IPST(q);
6121
6122 ip1dbg(("ip_siocaddrt:"));
6123 /* Existence of mp1 verified in ip_wput_nondata */
6124 mp1 = mp->b_cont->b_cont;
6125 rt = (struct rtentry *)mp1->b_rptr;
6126
6127 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr;
6128 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr;
6129
6130 /*
6131 * If the RTF_HOST flag is on, this is a request to assign a gateway
6132 * to a particular host address. In this case, we set the netmask to
6133 * all ones for the particular destination address. Otherwise,
6134 * determine the netmask to be used based on dst_addr and the interfaces
6135 * in use.
6136 */
6137 if (rt->rt_flags & RTF_HOST) {
6138 mask = IP_HOST_MASK;
6139 } else {
6140 /*
6141 * Note that ip_subnet_mask returns a zero mask in the case of
6142 * default (an all-zeroes address).
6143 */
6144 mask = ip_subnet_mask(dst_addr, &ipif, ipst);
6145 }
6146
6147 error = ip_rt_add(dst_addr, mask, gw_addr, 0, rt->rt_flags, NULL, NULL,
6148 B_TRUE, NULL, ipst, ALL_ZONES);
6149 if (ipif != NULL)
6150 ipif_refrele(ipif);
6151 return (error);
6152 }
6153
6154 /*
6155 * ip_siocdelrt is called to complete processing of an SIOCDELRT IOCTL.
6156 */
6157 /* ARGSUSED */
6158 int
6159 ip_siocdelrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
6160 ip_ioctl_cmd_t *ipip, void *dummy_if_req)
6161 {
6162 ipaddr_t dst_addr;
6163 ipaddr_t gw_addr;
6164 ipaddr_t mask;
6165 int error;
6166 mblk_t *mp1;
6167 struct rtentry *rt;
6168 ipif_t *ipif = NULL;
6169 ip_stack_t *ipst;
6170
6171 ASSERT(q->q_next == NULL);
6172 ipst = CONNQ_TO_IPST(q);
6173
6174 ip1dbg(("ip_siocdelrt:"));
6175 /* Existence of mp1 verified in ip_wput_nondata */
6176 mp1 = mp->b_cont->b_cont;
6177 rt = (struct rtentry *)mp1->b_rptr;
6178
6179 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr;
6180 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr;
6181
6182 /*
6183 * If the RTF_HOST flag is on, this is a request to delete a gateway
6184 * to a particular host address. In this case, we set the netmask to
6185 * all ones for the particular destination address. Otherwise,
6186 * determine the netmask to be used based on dst_addr and the interfaces
6187 * in use.
6188 */
6189 if (rt->rt_flags & RTF_HOST) {
6190 mask = IP_HOST_MASK;
6191 } else {
6192 /*
6193 * Note that ip_subnet_mask returns a zero mask in the case of
6194 * default (an all-zeroes address).
6195 */
6196 mask = ip_subnet_mask(dst_addr, &ipif, ipst);
6197 }
6198
6199 error = ip_rt_delete(dst_addr, mask, gw_addr,
6200 RTA_DST | RTA_GATEWAY | RTA_NETMASK, rt->rt_flags, NULL, B_TRUE,
6201 ipst, ALL_ZONES);
6202 if (ipif != NULL)
6203 ipif_refrele(ipif);
6204 return (error);
6205 }
6206
6207 /*
6208 * Enqueue the mp onto the ipsq, chained by b_next.
6209 * b_prev stores the function to be executed later, and b_queue the queue
6210 * where this mp originated.
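 *
 * For example, ipsq_try_enter_internal() below queues a deferred exclusive
 * operation this way when it cannot immediately become writer (a sketch;
 * the locks named in the ASSERTs below are already held at that point):
 *
 *	mutex_enter(&ipsq->ipsq_lock);
 *	mutex_enter(&ipsq->ipsq_xop->ipx_lock);
 *	...
 *	ipsq_enq(ipsq, q, mp, func, type, ill);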
6211 */
6212 void
6213 ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type,
6214 ill_t *pending_ill)
6215 {
6216 conn_t *connp;
6217 ipxop_t *ipx = ipsq->ipsq_xop;
6218
6219 ASSERT(MUTEX_HELD(&ipsq->ipsq_lock));
6220 ASSERT(MUTEX_HELD(&ipx->ipx_lock));
6221 ASSERT(func != NULL);
6222
6223 mp->b_queue = q;
6224 mp->b_prev = (void *)func;
6225 mp->b_next = NULL;
6226
6227 switch (type) {
6228 case CUR_OP:
6229 if (ipx->ipx_mptail != NULL) {
6230 ASSERT(ipx->ipx_mphead != NULL);
6231 ipx->ipx_mptail->b_next = mp;
6232 } else {
6233 ASSERT(ipx->ipx_mphead == NULL);
6234 ipx->ipx_mphead = mp;
6235 }
6236 ipx->ipx_mptail = mp;
6237 break;
6238
6239 case NEW_OP:
6240 if (ipsq->ipsq_xopq_mptail != NULL) {
6241 ASSERT(ipsq->ipsq_xopq_mphead != NULL);
6242 ipsq->ipsq_xopq_mptail->b_next = mp;
6243 } else {
6244 ASSERT(ipsq->ipsq_xopq_mphead == NULL);
6245 ipsq->ipsq_xopq_mphead = mp;
6246 }
6247 ipsq->ipsq_xopq_mptail = mp;
6248 ipx->ipx_ipsq_queued = B_TRUE;
6249 break;
6250
6251 case SWITCH_OP:
6252 ASSERT(ipsq->ipsq_swxop != NULL);
6253 /* only one switch operation is currently allowed */
6254 ASSERT(ipsq->ipsq_switch_mp == NULL);
6255 ipsq->ipsq_switch_mp = mp;
6256 ipx->ipx_ipsq_queued = B_TRUE;
6257 break;
6258 default:
		cmn_err(CE_PANIC, "ipsq_enq: unknown type %d\n", type);
6260 }
6261
6262 if (CONN_Q(q) && pending_ill != NULL) {
6263 connp = Q_TO_CONN(q);
6264 ASSERT(MUTEX_HELD(&connp->conn_lock));
6265 connp->conn_oper_pending_ill = pending_ill;
6266 }
6267 }
6268
6269 /*
6270 * Dequeue the next message that requested exclusive access to this IPSQ's
6271 * xop. Specifically:
6272 *
6273 * 1. If we're still processing the current operation on `ipsq', then
6274 * dequeue the next message for the operation (from ipx_mphead), or
6275 * return NULL if there are no queued messages for the operation.
6276 * These messages are queued via CUR_OP to qwriter_ip() and friends.
6277 *
6278 * 2. If the current operation on `ipsq' has completed (ipx_current_ipif is
6279 * not set) see if the ipsq has requested an xop switch. If so, switch
6280 * `ipsq' to a different xop. Xop switches only happen when joining or
6281 * leaving IPMP groups and require a careful dance -- see the comments
6282 * in-line below for details. If we're leaving a group xop or if we're
6283 * joining a group xop and become writer on it, then we proceed to (3).
6284 * Otherwise, we return NULL and exit the xop.
6285 *
6286 * 3. For each IPSQ in the xop, return any switch operation stored on
6287 * ipsq_switch_mp (set via SWITCH_OP); these must be processed before
6288 * any other messages queued on the IPSQ. Otherwise, dequeue the next
6289 * exclusive operation (queued via NEW_OP) stored on ipsq_xopq_mphead.
6290 * Note that if the phyint tied to `ipsq' is not using IPMP there will
6291 * only be one IPSQ in the xop. Otherwise, there will be one IPSQ for
6292 * each phyint in the group, including the IPMP meta-interface phyint.
6293 */
6294 static mblk_t *
6295 ipsq_dq(ipsq_t *ipsq)
6296 {
6297 ill_t *illv4, *illv6;
6298 mblk_t *mp;
6299 ipsq_t *xopipsq;
6300 ipsq_t *leftipsq = NULL;
6301 ipxop_t *ipx;
6302 phyint_t *phyi = ipsq->ipsq_phyint;
6303 ip_stack_t *ipst = ipsq->ipsq_ipst;
6304 boolean_t emptied = B_FALSE;
6305
6306 /*
6307 * Grab all the locks we need in the defined order (ill_g_lock ->
6308 * ipsq_lock -> ipx_lock); ill_g_lock is needed to use ipsq_next.
6309 */
6310 rw_enter(&ipst->ips_ill_g_lock,
6311 ipsq->ipsq_swxop != NULL ? RW_WRITER : RW_READER);
6312 mutex_enter(&ipsq->ipsq_lock);
6313 ipx = ipsq->ipsq_xop;
6314 mutex_enter(&ipx->ipx_lock);
6315
6316 /*
6317 * Dequeue the next message associated with the current exclusive
6318 * operation, if any.
6319 */
6320 if ((mp = ipx->ipx_mphead) != NULL) {
6321 ipx->ipx_mphead = mp->b_next;
6322 if (ipx->ipx_mphead == NULL)
6323 ipx->ipx_mptail = NULL;
6324 mp->b_next = (void *)ipsq;
6325 goto out;
6326 }
6327
6328 if (ipx->ipx_current_ipif != NULL)
6329 goto empty;
6330
6331 if (ipsq->ipsq_swxop != NULL) {
6332 /*
6333 * The exclusive operation that is now being completed has
6334 * requested a switch to a different xop. This happens
6335 * when an interface joins or leaves an IPMP group. Joins
6336 * happen through SIOCSLIFGROUPNAME (ip_sioctl_groupname()).
6337 * Leaves happen via SIOCSLIFGROUPNAME, interface unplumb
6338 * (phyint_free()), or interface plumb for an ill type
6339 * not in the IPMP group (ip_rput_dlpi_writer()).
6340 *
6341 * Xop switches are not allowed on the IPMP meta-interface.
6342 */
6343 ASSERT(phyi == NULL || !(phyi->phyint_flags & PHYI_IPMP));
6344 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
6345 DTRACE_PROBE1(ipsq__switch, (ipsq_t *), ipsq);
6346
6347 if (ipsq->ipsq_swxop == &ipsq->ipsq_ownxop) {
6348 /*
6349 * We're switching back to our own xop, so we have two
6350 * xop's to drain/exit: our own, and the group xop
6351 * that we are leaving.
6352 *
6353 * First, pull ourselves out of the group ipsq list.
6354 * This is safe since we're writer on ill_g_lock.
6355 */
6356 ASSERT(ipsq->ipsq_xop != &ipsq->ipsq_ownxop);
6357
6358 xopipsq = ipx->ipx_ipsq;
6359 while (xopipsq->ipsq_next != ipsq)
6360 xopipsq = xopipsq->ipsq_next;
6361
6362 xopipsq->ipsq_next = ipsq->ipsq_next;
6363 ipsq->ipsq_next = ipsq;
6364 ipsq->ipsq_xop = ipsq->ipsq_swxop;
6365 ipsq->ipsq_swxop = NULL;
6366
6367 /*
6368 * Second, prepare to exit the group xop. The actual
6369 * ipsq_exit() is done at the end of this function
6370 * since we cannot hold any locks across ipsq_exit().
6371 * Note that although we drop the group's ipx_lock, no
6372 * threads can proceed since we're still ipx_writer.
6373 */
6374 leftipsq = xopipsq;
6375 mutex_exit(&ipx->ipx_lock);
6376
6377 /*
6378 * Third, set ipx to point to our own xop (which was
6379 * inactive and therefore can be entered).
6380 */
6381 ipx = ipsq->ipsq_xop;
6382 mutex_enter(&ipx->ipx_lock);
6383 ASSERT(ipx->ipx_writer == NULL);
6384 ASSERT(ipx->ipx_current_ipif == NULL);
6385 } else {
6386 /*
6387 * We're switching from our own xop to a group xop.
6388 * The requestor of the switch must ensure that the
6389 * group xop cannot go away (e.g. by ensuring the
6390 * phyint associated with the xop cannot go away).
6391 *
6392 * If we can become writer on our new xop, then we'll
6393 * do the drain. Otherwise, the current writer of our
6394 * new xop will do the drain when it exits.
6395 *
6396 * First, splice ourselves into the group IPSQ list.
6397 * This is safe since we're writer on ill_g_lock.
6398 */
6399 ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop);
6400
6401 xopipsq = ipsq->ipsq_swxop->ipx_ipsq;
6402 while (xopipsq->ipsq_next != ipsq->ipsq_swxop->ipx_ipsq)
6403 xopipsq = xopipsq->ipsq_next;
6404
6405 xopipsq->ipsq_next = ipsq;
6406 ipsq->ipsq_next = ipsq->ipsq_swxop->ipx_ipsq;
6407 ipsq->ipsq_xop = ipsq->ipsq_swxop;
6408 ipsq->ipsq_swxop = NULL;
6409
6410 /*
6411 * Second, exit our own xop, since it's now unused.
6412 * This is safe since we've got the only reference.
6413 */
6414 ASSERT(ipx->ipx_writer == curthread);
6415 ipx->ipx_writer = NULL;
6416 VERIFY(--ipx->ipx_reentry_cnt == 0);
6417 ipx->ipx_ipsq_queued = B_FALSE;
6418 mutex_exit(&ipx->ipx_lock);
6419
6420 /*
6421 * Third, set ipx to point to our new xop, and check
6422 * if we can become writer on it. If we cannot, then
6423 * the current writer will drain the IPSQ group when
6424 * it exits. Our ipsq_xop is guaranteed to be stable
6425 * because we're still holding ipsq_lock.
6426 */
6427 ipx = ipsq->ipsq_xop;
6428 mutex_enter(&ipx->ipx_lock);
6429 if (ipx->ipx_writer != NULL ||
6430 ipx->ipx_current_ipif != NULL) {
6431 goto out;
6432 }
6433 }
6434
6435 /*
6436 * Fourth, become writer on our new ipx before we continue
6437 * with the drain. Note that we never dropped ipsq_lock
6438 * above, so no other thread could've raced with us to
6439 * become writer first. Also, we're holding ipx_lock, so
6440 * no other thread can examine the ipx right now.
6441 */
6442 ASSERT(ipx->ipx_current_ipif == NULL);
6443 ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL);
6444 VERIFY(ipx->ipx_reentry_cnt++ == 0);
6445 ipx->ipx_writer = curthread;
6446 ipx->ipx_forced = B_FALSE;
6447 #ifdef DEBUG
6448 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
6449 #endif
6450 }
6451
6452 xopipsq = ipsq;
6453 do {
6454 /*
6455 * So that other operations operate on a consistent and
6456 * complete phyint, a switch message on an IPSQ must be
6457 * handled prior to any other operations on that IPSQ.
6458 */
6459 if ((mp = xopipsq->ipsq_switch_mp) != NULL) {
6460 xopipsq->ipsq_switch_mp = NULL;
6461 ASSERT(mp->b_next == NULL);
6462 mp->b_next = (void *)xopipsq;
6463 goto out;
6464 }
6465
6466 if ((mp = xopipsq->ipsq_xopq_mphead) != NULL) {
6467 xopipsq->ipsq_xopq_mphead = mp->b_next;
6468 if (xopipsq->ipsq_xopq_mphead == NULL)
6469 xopipsq->ipsq_xopq_mptail = NULL;
6470 mp->b_next = (void *)xopipsq;
6471 goto out;
6472 }
6473 } while ((xopipsq = xopipsq->ipsq_next) != ipsq);
6474 empty:
6475 /*
6476 * There are no messages. Further, we are holding ipx_lock, hence no
6477 * new messages can end up on any IPSQ in the xop.
6478 */
6479 ipx->ipx_writer = NULL;
6480 ipx->ipx_forced = B_FALSE;
6481 VERIFY(--ipx->ipx_reentry_cnt == 0);
6482 ipx->ipx_ipsq_queued = B_FALSE;
6483 emptied = B_TRUE;
6484 #ifdef DEBUG
6485 ipx->ipx_depth = 0;
6486 #endif
6487 out:
6488 mutex_exit(&ipx->ipx_lock);
6489 mutex_exit(&ipsq->ipsq_lock);
6490
6491 /*
6492 * If we completely emptied the xop, then wake up any threads waiting
6493 * to enter any of the IPSQ's associated with it.
6494 */
6495 if (emptied) {
6496 xopipsq = ipsq;
6497 do {
6498 if ((phyi = xopipsq->ipsq_phyint) == NULL)
6499 continue;
6500
6501 illv4 = phyi->phyint_illv4;
6502 illv6 = phyi->phyint_illv6;
6503
6504 GRAB_ILL_LOCKS(illv4, illv6);
6505 if (illv4 != NULL)
6506 cv_broadcast(&illv4->ill_cv);
6507 if (illv6 != NULL)
6508 cv_broadcast(&illv6->ill_cv);
6509 RELEASE_ILL_LOCKS(illv4, illv6);
6510 } while ((xopipsq = xopipsq->ipsq_next) != ipsq);
6511 }
6512 rw_exit(&ipst->ips_ill_g_lock);
6513
6514 /*
6515 * Now that all locks are dropped, exit the IPSQ we left.
6516 */
6517 if (leftipsq != NULL)
6518 ipsq_exit(leftipsq);
6519
6520 return (mp);
6521 }
6522
6523 /*
6524 * Return completion status of previously initiated DLPI operations on
6525 * ills in the purview of an ipsq.
6526 */
6527 static boolean_t
6528 ipsq_dlpi_done(ipsq_t *ipsq)
6529 {
6530 ipsq_t *ipsq_start;
6531 phyint_t *phyi;
6532 ill_t *ill;
6533
6534 ASSERT(RW_LOCK_HELD(&ipsq->ipsq_ipst->ips_ill_g_lock));
6535 ipsq_start = ipsq;
6536
6537 do {
6538 /*
		 * The only current users of this function are ipsq_try_enter
		 * and ipsq_enter, which have made sure that ipsq_writer is
		 * NULL before we reach here.  ill_dlpi_pending is modified
		 * only by an ipsq writer.
6543 */
6544 ASSERT(ipsq->ipsq_xop->ipx_writer == NULL);
6545 phyi = ipsq->ipsq_phyint;
6546 /*
6547 * phyi could be NULL if a phyint that is part of an
6548 * IPMP group is being unplumbed. A more detailed
6549 * comment is in ipmp_grp_update_kstats()
6550 */
6551 if (phyi != NULL) {
6552 ill = phyi->phyint_illv4;
6553 if (ill != NULL &&
6554 (ill->ill_dlpi_pending != DL_PRIM_INVAL ||
6555 ill->ill_arl_dlpi_pending))
6556 return (B_FALSE);
6557
6558 ill = phyi->phyint_illv6;
6559 if (ill != NULL &&
6560 ill->ill_dlpi_pending != DL_PRIM_INVAL)
6561 return (B_FALSE);
6562 }
6563
6564 } while ((ipsq = ipsq->ipsq_next) != ipsq_start);
6565
6566 return (B_TRUE);
6567 }
6568
6569 /*
 * Enter the ipsq corresponding to ill, by waiting synchronously until
 * we can enter the ipsq exclusively.  Unless 'force' is used, the ipsq
 * will have to drain completely before ipsq_enter returns success.
 * ipx_current_ipif will be set if some exclusive op is in progress,
 * and the ipsq_exit logic will start the next enqueued op after
 * completion of the current op.  If 'force' is used, we don't wait
 * for the enqueued ops.  This is needed when a conn_close wants to
 * enter the ipsq and abort an ioctl that is somehow stuck.  Unplumb
 * of an ill can also use this option, but we don't use it currently.
6579 */
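
/*
 * A minimal usage sketch (hypothetical caller; error handling elided).
 * ipsq_enter() returns B_FALSE only if the ill is condemned:
 *
 *	if (!ipsq_enter(ill, B_FALSE, NEW_OP))
 *		return (ENXIO);
 *	... perform the exclusive operation as writer ...
 *	ipsq_exit(ill->ill_phyint->phyint_ipsq);
 */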
6580 #define ENTER_SQ_WAIT_TICKS 100
6581 boolean_t
6582 ipsq_enter(ill_t *ill, boolean_t force, int type)
6583 {
6584 ipsq_t *ipsq;
6585 ipxop_t *ipx;
6586 boolean_t waited_enough = B_FALSE;
6587 ip_stack_t *ipst = ill->ill_ipst;
6588
6589 /*
6590 * Note that the relationship between ill and ipsq is fixed as long as
6591 * the ill is not ILL_CONDEMNED. Holding ipsq_lock ensures the
6592 * relationship between the IPSQ and xop cannot change. However,
6593 * since we cannot hold ipsq_lock across the cv_wait(), it may change
6594 * while we're waiting. We wait on ill_cv and rely on ipsq_exit()
6595 * waking up all ills in the xop when it becomes available.
6596 */
6597 for (;;) {
6598 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
6599 mutex_enter(&ill->ill_lock);
6600 if (ill->ill_state_flags & ILL_CONDEMNED) {
6601 mutex_exit(&ill->ill_lock);
6602 rw_exit(&ipst->ips_ill_g_lock);
6603 return (B_FALSE);
6604 }
6605
6606 ipsq = ill->ill_phyint->phyint_ipsq;
6607 mutex_enter(&ipsq->ipsq_lock);
6608 ipx = ipsq->ipsq_xop;
6609 mutex_enter(&ipx->ipx_lock);
6610
6611 if (ipx->ipx_writer == NULL && (type == CUR_OP ||
6612 (ipx->ipx_current_ipif == NULL && ipsq_dlpi_done(ipsq)) ||
6613 waited_enough))
6614 break;
6615
6616 rw_exit(&ipst->ips_ill_g_lock);
6617
6618 if (!force || ipx->ipx_writer != NULL) {
6619 mutex_exit(&ipx->ipx_lock);
6620 mutex_exit(&ipsq->ipsq_lock);
6621 cv_wait(&ill->ill_cv, &ill->ill_lock);
6622 } else {
6623 mutex_exit(&ipx->ipx_lock);
6624 mutex_exit(&ipsq->ipsq_lock);
6625 (void) cv_reltimedwait(&ill->ill_cv,
6626 &ill->ill_lock, ENTER_SQ_WAIT_TICKS, TR_CLOCK_TICK);
6627 waited_enough = B_TRUE;
6628 }
6629 mutex_exit(&ill->ill_lock);
6630 }
6631
6632 ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL);
6633 ASSERT(ipx->ipx_reentry_cnt == 0);
6634 ipx->ipx_writer = curthread;
6635 ipx->ipx_forced = (ipx->ipx_current_ipif != NULL);
6636 ipx->ipx_reentry_cnt++;
6637 #ifdef DEBUG
6638 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
6639 #endif
6640 mutex_exit(&ipx->ipx_lock);
6641 mutex_exit(&ipsq->ipsq_lock);
6642 mutex_exit(&ill->ill_lock);
6643 rw_exit(&ipst->ips_ill_g_lock);
6644
6645 return (B_TRUE);
6646 }
6647
6648 /*
6649 * ipif_set_values() has a constraint that it cannot drop the ips_ill_g_lock
6650 * across the call to the core interface ipsq_try_enter() and hence calls this
6651 * function directly. This is explained more fully in ipif_set_values().
 * In order to support the above constraint, ipsq_try_enter() is implemented
 * as a wrapper that grabs the ips_ill_g_lock and then calls this function.
6654 */
6655 static ipsq_t *
6656 ipsq_try_enter_internal(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func,
6657 int type, boolean_t reentry_ok)
6658 {
6659 ipsq_t *ipsq;
6660 ipxop_t *ipx;
6661 ip_stack_t *ipst = ill->ill_ipst;
6662
6663 /*
6664 * lock ordering:
6665 * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock -> ipx_lock.
6666 *
6667 * ipx of an ipsq can't change when ipsq_lock is held.
6668 */
6669 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
6670 GRAB_CONN_LOCK(q);
6671 mutex_enter(&ill->ill_lock);
6672 ipsq = ill->ill_phyint->phyint_ipsq;
6673 mutex_enter(&ipsq->ipsq_lock);
6674 ipx = ipsq->ipsq_xop;
6675 mutex_enter(&ipx->ipx_lock);
6676
6677 /*
	 * 1. Enter the ipsq if we are already writer and reentry is ok.
	 *    (Note: if the caller does not specify reentry_ok, then neither
	 *    'func' nor any of its callees must ever attempt to enter the
	 *    ipsq again.  Otherwise it can lead to an infinite loop.)
	 * 2. Enter the ipsq if there is no current writer and this attempted
	 *    entry is part of the current operation.
	 * 3. Enter the ipsq if there is no current writer, this is a new
	 *    operation, the operation queue is empty, there is no operation
	 *    currently in progress, and all previously initiated DLPI
	 *    operations have completed.
6688 */
6689 if ((ipx->ipx_writer == curthread && reentry_ok) ||
6690 (ipx->ipx_writer == NULL && (type == CUR_OP || (type == NEW_OP &&
6691 !ipx->ipx_ipsq_queued && ipx->ipx_current_ipif == NULL &&
6692 ipsq_dlpi_done(ipsq))))) {
6693 /* Success. */
6694 ipx->ipx_reentry_cnt++;
6695 ipx->ipx_writer = curthread;
6696 ipx->ipx_forced = B_FALSE;
6697 mutex_exit(&ipx->ipx_lock);
6698 mutex_exit(&ipsq->ipsq_lock);
6699 mutex_exit(&ill->ill_lock);
6700 RELEASE_CONN_LOCK(q);
6701 #ifdef DEBUG
6702 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
6703 #endif
6704 return (ipsq);
6705 }
6706
6707 if (func != NULL)
6708 ipsq_enq(ipsq, q, mp, func, type, ill);
6709
6710 mutex_exit(&ipx->ipx_lock);
6711 mutex_exit(&ipsq->ipsq_lock);
6712 mutex_exit(&ill->ill_lock);
6713 RELEASE_CONN_LOCK(q);
6714 return (NULL);
6715 }
6716
6717 /*
6718 * The ipsq_t (ipsq) is the synchronization data structure used to serialize
 * certain critical operations, such as plumbing (i.e. most set ioctls).
 * There is one ipsq per phyint.  The ipsq
6721 * serializes exclusive ioctls issued by applications on a per ipsq basis in
6722 * ipsq_xopq_mphead. It also protects against multiple threads executing in
6723 * the ipsq. Responses from the driver pertain to the current ioctl (say a
6724 * DL_BIND_ACK in response to a DL_BIND_REQ initiated as part of bringing
6725 * up the interface) and are enqueued in ipx_mphead.
6726 *
 * If a thread does not want to reenter the ipsq when it is already writer,
 * it must ensure that neither the specified reentry point (to be called
 * later when the ipsq is empty) nor any code path starting from that
 * reentry point ever tries to enter the ipsq again.  Otherwise it can lead
 * to an infinite loop.  The reentry point ip_rput_dlpi_writer is an example.
6732 * When the thread that is currently exclusive finishes, it (ipsq_exit)
6733 * dequeues the requests waiting to become exclusive in ipx_mphead and calls
6734 * the reentry point. When the list at ipx_mphead becomes empty ipsq_exit
6735 * proceeds to dequeue the next ioctl in ipsq_xopq_mphead and start the next
6736 * ioctl if the current ioctl has completed. If the current ioctl is still
 * in progress it simply returns.  The current ioctl could be waiting for
 * a response from another module (the driver), or could be waiting for the
 * ipif/ill/ire refcnts to drop to zero.  In such a case the ipx_pending_mp
 * and ipx_pending_ipif are set.  ipx_current_ipif is set throughout the
 * execution of the ioctl, and ipsq_exit does not start the next ioctl
 * unless ipx_current_ipif is NULL, which happens only once the ioctl is
 * complete and all associated DLPI operations have completed.
6744 */
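
/*
 * For example, a thread that must run a function as writer but may not be
 * able to enter the ipsq immediately can defer itself via qwriter_ip()
 * (a sketch; my_writer_func is hypothetical):
 *
 *	static void
 *	my_writer_func(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
 *	{
 *		... runs as exclusive writer on the ill's ipsq ...
 *	}
 *
 *	ill_refhold(ill);	qwriter_ip() consumes this hold
 *	qwriter_ip(ill, q, mp, my_writer_func, NEW_OP, B_TRUE);
 */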
6745
6746 /*
6747 * Try to enter the IPSQ corresponding to `ipif' or `ill' exclusively (`ipif'
6748 * and `ill' cannot both be specified). Returns a pointer to the entered IPSQ
6749 * on success, or NULL on failure. The caller ensures ipif/ill is valid by
6750 * refholding it as necessary. If the IPSQ cannot be entered and `func' is
6751 * non-NULL, then `func' will be called back with `q' and `mp' once the IPSQ
6752 * can be entered. If `func' is NULL, then `q' and `mp' are ignored.
6753 */
6754 ipsq_t *
6755 ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp,
6756 ipsq_func_t func, int type, boolean_t reentry_ok)
6757 {
6758 ip_stack_t *ipst;
6759 ipsq_t *ipsq;
6760
6761 /* Only 1 of ipif or ill can be specified */
6762 ASSERT((ipif != NULL) ^ (ill != NULL));
6763
6764 if (ipif != NULL)
6765 ill = ipif->ipif_ill;
6766 ipst = ill->ill_ipst;
6767
6768 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
6769 ipsq = ipsq_try_enter_internal(ill, q, mp, func, type, reentry_ok);
6770 rw_exit(&ipst->ips_ill_g_lock);
6771
6772 return (ipsq);
6773 }
6774
6775 /*
6776 * Try to enter the IPSQ corresponding to `ill' as writer. The caller ensures
6777 * ill is valid by refholding it if necessary; we will refrele. If the IPSQ
6778 * cannot be entered, the mp is queued for completion.
6779 */
6780 void
6781 qwriter_ip(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func, int type,
6782 boolean_t reentry_ok)
6783 {
6784 ipsq_t *ipsq;
6785
6786 ipsq = ipsq_try_enter(NULL, ill, q, mp, func, type, reentry_ok);
6787
6788 /*
6789 * Drop the caller's refhold on the ill. This is safe since we either
6790 * entered the IPSQ (and thus are exclusive), or failed to enter the
6791 * IPSQ, in which case we return without accessing ill anymore. This
6792 * is needed because func needs to see the correct refcount.
	 * For example, removeif can work only then.
6794 */
6795 ill_refrele(ill);
6796 if (ipsq != NULL) {
6797 (*func)(ipsq, q, mp, NULL);
6798 ipsq_exit(ipsq);
6799 }
6800 }
6801
6802 /*
6803 * Exit the specified IPSQ. If this is the final exit on it then drain it
6804 * prior to exiting. Caller must be writer on the specified IPSQ.
6805 */
6806 void
6807 ipsq_exit(ipsq_t *ipsq)
6808 {
6809 mblk_t *mp;
6810 ipsq_t *mp_ipsq;
6811 queue_t *q;
6812 phyint_t *phyi;
6813 ipsq_func_t func;
6814
6815 ASSERT(IAM_WRITER_IPSQ(ipsq));
6816
6817 ASSERT(ipsq->ipsq_xop->ipx_reentry_cnt >= 1);
6818 if (ipsq->ipsq_xop->ipx_reentry_cnt != 1) {
6819 ipsq->ipsq_xop->ipx_reentry_cnt--;
6820 return;
6821 }
6822
6823 for (;;) {
6824 phyi = ipsq->ipsq_phyint;
6825 mp = ipsq_dq(ipsq);
6826 mp_ipsq = (mp == NULL) ? NULL : (ipsq_t *)mp->b_next;
6827
6828 /*
6829 * If we've changed to a new IPSQ, and the phyint associated
6830 * with the old one has gone away, free the old IPSQ. Note
6831 * that this cannot happen while the IPSQ is in a group.
6832 */
6833 if (mp_ipsq != ipsq && phyi == NULL) {
6834 ASSERT(ipsq->ipsq_next == ipsq);
6835 ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop);
6836 ipsq_delete(ipsq);
6837 }
6838
6839 if (mp == NULL)
6840 break;
6841
6842 q = mp->b_queue;
6843 func = (ipsq_func_t)mp->b_prev;
6844 ipsq = mp_ipsq;
6845 mp->b_next = mp->b_prev = NULL;
6846 mp->b_queue = NULL;
6847
6848 /*
		 * If 'q' is a conn queue, it is valid, since we did a
		 * refhold on the conn at the start of the ioctl.
6851 * If 'q' is an ill queue, it is valid, since close of an
6852 * ill will clean up its IPSQ.
6853 */
6854 (*func)(ipsq, q, mp, NULL);
6855 }
6856 }
6857
6858 /*
 * Used to start any igmp or mld timers that could not be started
 * while holding ill_mcast_lock.  The timers can't be started while
 * holding the lock, since mld/igmp_start_timers may need to call
 * untimeout(), which can't be done while holding a lock that the
 * timeout handler acquires.  Otherwise there could be a deadlock,
 * since the timeout handlers
 * mld_timeout_handler_per_ill/igmp_timeout_handler_per_ill also
 * acquire ill_mcast_lock.
6867 */
6868 void
6869 ill_mcast_timer_start(ip_stack_t *ipst)
6870 {
6871 int next;
6872
6873 mutex_enter(&ipst->ips_igmp_timer_lock);
6874 next = ipst->ips_igmp_deferred_next;
6875 ipst->ips_igmp_deferred_next = INFINITY;
6876 mutex_exit(&ipst->ips_igmp_timer_lock);
6877
6878 if (next != INFINITY)
6879 igmp_start_timers(next, ipst);
6880
6881 mutex_enter(&ipst->ips_mld_timer_lock);
6882 next = ipst->ips_mld_deferred_next;
6883 ipst->ips_mld_deferred_next = INFINITY;
6884 mutex_exit(&ipst->ips_mld_timer_lock);
6885
6886 if (next != INFINITY)
6887 mld_start_timers(next, ipst);
6888 }
6889
6890 /*
6891 * Start the current exclusive operation on `ipsq'; associate it with `ipif'
6892 * and `ioccmd'.
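 *
 * A typical sequence (a sketch; the surrounding ioctl machinery is
 * elided) brackets the exclusive operation with ipsq_current_start()
 * and ipsq_current_finish():
 *
 *	ipsq_current_start(ipsq, ipif, SIOCSLIFADDR);
 *	... perform the operation, possibly across DLPI round-trips ...
 *	ipsq_current_finish(ipsq);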
6893 */
6894 void
6895 ipsq_current_start(ipsq_t *ipsq, ipif_t *ipif, int ioccmd)
6896 {
6897 ill_t *ill = ipif->ipif_ill;
6898 ipxop_t *ipx = ipsq->ipsq_xop;
6899
6900 ASSERT(IAM_WRITER_IPSQ(ipsq));
6901 ASSERT(ipx->ipx_current_ipif == NULL);
6902 ASSERT(ipx->ipx_current_ioctl == 0);
6903
6904 ipx->ipx_current_done = B_FALSE;
6905 ipx->ipx_current_ioctl = ioccmd;
6906 mutex_enter(&ipx->ipx_lock);
6907 ipx->ipx_current_ipif = ipif;
6908 mutex_exit(&ipx->ipx_lock);
6909
6910 /*
6911 * Set IPIF_CHANGING on one or more ipifs associated with the
6912 * current exclusive operation. IPIF_CHANGING prevents any new
6913 * references to the ipif (so that the references will eventually
6914 * drop to zero) and also prevents any "get" operations (e.g.,
6915 * SIOCGLIFFLAGS) from being able to access the ipif until the
6916 * operation has completed and the ipif is again in a stable state.
6917 *
6918 * For ioctls, IPIF_CHANGING is set on the ipif associated with the
6919 * ioctl. For internal operations (where ioccmd is zero), all ipifs
6920 * on the ill are marked with IPIF_CHANGING since it's unclear which
6921 * ipifs will be affected.
6922 *
6923 * Note that SIOCLIFREMOVEIF is a special case as it sets
6924 * IPIF_CONDEMNED internally after identifying the right ipif to
6925 * operate on.
6926 */
6927 switch (ioccmd) {
6928 case SIOCLIFREMOVEIF:
6929 break;
6930 case 0:
6931 mutex_enter(&ill->ill_lock);
6932 ipif = ipif->ipif_ill->ill_ipif;
6933 for (; ipif != NULL; ipif = ipif->ipif_next)
6934 ipif->ipif_state_flags |= IPIF_CHANGING;
6935 mutex_exit(&ill->ill_lock);
6936 break;
6937 default:
6938 mutex_enter(&ill->ill_lock);
6939 ipif->ipif_state_flags |= IPIF_CHANGING;
6940 mutex_exit(&ill->ill_lock);
6941 }
6942 }
6943
6944 /*
6945 * Finish the current exclusive operation on `ipsq'. Usually, this will allow
6946 * the next exclusive operation to begin once we ipsq_exit(). However, if
6947 * pending DLPI operations remain, then we will wait for the queue to drain
6948 * before allowing the next exclusive operation to begin. This ensures that
6949 * DLPI operations from one exclusive operation are never improperly processed
6950 * as part of a subsequent exclusive operation.
6951 */
6952 void
6953 ipsq_current_finish(ipsq_t *ipsq)
6954 {
6955 ipxop_t *ipx = ipsq->ipsq_xop;
6956 t_uscalar_t dlpi_pending = DL_PRIM_INVAL;
6957 ipif_t *ipif = ipx->ipx_current_ipif;
6958
6959 ASSERT(IAM_WRITER_IPSQ(ipsq));
6960
6961 /*
	 * For SIOCLIFREMOVEIF, the ipif has already been blown away
6963 * (but in that case, IPIF_CHANGING will already be clear and no
6964 * pending DLPI messages can remain).
6965 */
6966 if (ipx->ipx_current_ioctl != SIOCLIFREMOVEIF) {
6967 ill_t *ill = ipif->ipif_ill;
6968
6969 mutex_enter(&ill->ill_lock);
6970 dlpi_pending = ill->ill_dlpi_pending;
6971 if (ipx->ipx_current_ioctl == 0) {
6972 ipif = ill->ill_ipif;
6973 for (; ipif != NULL; ipif = ipif->ipif_next)
6974 ipif->ipif_state_flags &= ~IPIF_CHANGING;
6975 } else {
6976 ipif->ipif_state_flags &= ~IPIF_CHANGING;
6977 }
6978 mutex_exit(&ill->ill_lock);
6979 }
6980
6981 ASSERT(!ipx->ipx_current_done);
6982 ipx->ipx_current_done = B_TRUE;
6983 ipx->ipx_current_ioctl = 0;
6984 if (dlpi_pending == DL_PRIM_INVAL) {
6985 mutex_enter(&ipx->ipx_lock);
6986 ipx->ipx_current_ipif = NULL;
6987 mutex_exit(&ipx->ipx_lock);
6988 }
6989 }
6990
6991 /*
6992 * The ill is closing. Flush all messages on the ipsq that originated
 * from this ill.  Usually there won't be any messages on the ipsq_xopq_mphead
6994 * for this ill since ipsq_enter could not have entered until then.
6995 * New messages can't be queued since the CONDEMNED flag is set.
6996 */
6997 static void
6998 ipsq_flush(ill_t *ill)
6999 {
7000 queue_t *q;
7001 mblk_t *prev;
7002 mblk_t *mp;
7003 mblk_t *mp_next;
7004 ipxop_t *ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop;
7005
7006 ASSERT(IAM_WRITER_ILL(ill));
7007
7008 /*
7009 * Flush any messages sent up by the driver.
7010 */
7011 mutex_enter(&ipx->ipx_lock);
7012 for (prev = NULL, mp = ipx->ipx_mphead; mp != NULL; mp = mp_next) {
7013 mp_next = mp->b_next;
7014 q = mp->b_queue;
7015 if (q == ill->ill_rq || q == ill->ill_wq) {
7016 /* dequeue mp */
7017 if (prev == NULL)
7018 ipx->ipx_mphead = mp->b_next;
7019 else
7020 prev->b_next = mp->b_next;
7021 if (ipx->ipx_mptail == mp) {
7022 ASSERT(mp_next == NULL);
7023 ipx->ipx_mptail = prev;
7024 }
7025 inet_freemsg(mp);
7026 } else {
7027 prev = mp;
7028 }
7029 }
7030 mutex_exit(&ipx->ipx_lock);
7031 (void) ipsq_pending_mp_cleanup(ill, NULL);
7032 ipsq_xopq_mp_cleanup(ill, NULL);
7033 }
7034
7035 /*
 * Parse an ifreq or lifreq struct coming down in an ioctl, and refhold
 * and return the associated ipif.
 * Return value:
 *	Non-zero: An error has occurred; ci may not be filled out.
 *	Zero:	  ci is filled out with the ioctl cmd in ci.ci_name, and
 *		  a held ipif in ci.ci_ipif.
7042 */
7043 int
7044 ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip,
7045 cmd_info_t *ci)
7046 {
7047 char *name;
7048 struct ifreq *ifr;
7049 struct lifreq *lifr;
7050 ipif_t *ipif = NULL;
7051 ill_t *ill;
7052 conn_t *connp;
7053 boolean_t isv6;
7054 int err;
7055 mblk_t *mp1;
7056 zoneid_t zoneid;
7057 ip_stack_t *ipst;
7058
7059 if (q->q_next != NULL) {
7060 ill = (ill_t *)q->q_ptr;
7061 isv6 = ill->ill_isv6;
7062 connp = NULL;
7063 zoneid = ALL_ZONES;
7064 ipst = ill->ill_ipst;
7065 } else {
7066 ill = NULL;
7067 connp = Q_TO_CONN(q);
7068 isv6 = (connp->conn_family == AF_INET6);
7069 zoneid = connp->conn_zoneid;
7070 if (zoneid == GLOBAL_ZONEID) {
7071 /* global zone can access ipifs in all zones */
7072 zoneid = ALL_ZONES;
7073 }
7074 ipst = connp->conn_netstack->netstack_ip;
7075 }
7076
7077 /* Has been checked in ip_wput_nondata */
7078 mp1 = mp->b_cont->b_cont;
7079
7080 if (ipip->ipi_cmd_type == IF_CMD) {
		/* This is an old style SIOC[GS]IF* command */
7082 ifr = (struct ifreq *)mp1->b_rptr;
7083 /*
7084 * Null terminate the string to protect against buffer
7085 * overrun. String was generated by user code and may not
7086 * be trusted.
7087 */
7088 ifr->ifr_name[IFNAMSIZ - 1] = '\0';
7089 name = ifr->ifr_name;
7090 ci->ci_sin = (sin_t *)&ifr->ifr_addr;
7091 ci->ci_sin6 = NULL;
7092 ci->ci_lifr = (struct lifreq *)ifr;
7093 } else {
		/* This is a new style SIOC[GS]LIF* command */
7095 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
7096 lifr = (struct lifreq *)mp1->b_rptr;
7097 /*
7098 * Null terminate the string to protect against buffer
7099 * overrun. String was generated by user code and may not
7100 * be trusted.
7101 */
7102 lifr->lifr_name[LIFNAMSIZ - 1] = '\0';
7103 name = lifr->lifr_name;
7104 ci->ci_sin = (sin_t *)&lifr->lifr_addr;
7105 ci->ci_sin6 = (sin6_t *)&lifr->lifr_addr;
7106 ci->ci_lifr = lifr;
7107 }
7108
7109 if (ipip->ipi_cmd == SIOCSLIFNAME) {
		/*
		 * The ioctl will fail if it comes down a conn stream.
		 */
		if (ill == NULL) {
			/*
			 * Not an ill queue; return ENXIO, matching the
			 * old error code.
7118 */
7119 return (ENXIO);
7120 }
7121 ipif = ill->ill_ipif;
7122 ipif_refhold(ipif);
7123 } else {
7124 /*
7125 * Ensure that ioctls don't see any internal state changes
7126 * caused by set ioctls by deferring them if IPIF_CHANGING is
7127 * set.
7128 */
7129 ipif = ipif_lookup_on_name_async(name, mi_strlen(name),
7130 isv6, zoneid, q, mp, ip_process_ioctl, &err, ipst);
7131 if (ipif == NULL) {
7132 if (err == EINPROGRESS)
7133 return (err);
7134 err = 0; /* Ensure we don't use it below */
7135 }
7136 }
7137
7138 /*
	 * Old style SIOC[GS]IF* commands do not admit an IPv6 ipif.
7140 */
7141 if (ipif != NULL && ipif->ipif_isv6 && ipip->ipi_cmd_type == IF_CMD) {
7142 ipif_refrele(ipif);
7143 return (ENXIO);
7144 }
7145
7146 if (ipif == NULL && ill != NULL && ill->ill_ipif != NULL &&
7147 name[0] == '\0') {
7148 /*
		 * Handle a SIOC?IF* ioctl with a null name
7150 * during plumb (on the ill queue before the I_PLINK).
7151 */
7152 ipif = ill->ill_ipif;
7153 ipif_refhold(ipif);
7154 }
7155
7156 if (ipif == NULL)
7157 return (ENXIO);
7158
7159 DTRACE_PROBE4(ipif__ioctl, char *, "ip_extract_lifreq",
7160 int, ipip->ipi_cmd, ill_t *, ipif->ipif_ill, ipif_t *, ipif);
7161
7162 ci->ci_ipif = ipif;
7163 return (0);
7164 }
7165
7166 /*
7167 * Return the total number of ipifs.
7168 */
7169 static uint_t
7170 ip_get_numifs(zoneid_t zoneid, ip_stack_t *ipst)
7171 {
7172 uint_t numifs = 0;
7173 ill_t *ill;
7174 ill_walk_context_t ctx;
7175 ipif_t *ipif;
7176
7177 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
7178 ill = ILL_START_WALK_V4(&ctx, ipst);
7179 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
7180 if (IS_UNDER_IPMP(ill))
7181 continue;
7182 for (ipif = ill->ill_ipif; ipif != NULL;
7183 ipif = ipif->ipif_next) {
7184 if (ipif->ipif_zoneid == zoneid ||
7185 ipif->ipif_zoneid == ALL_ZONES)
7186 numifs++;
7187 }
7188 }
7189 rw_exit(&ipst->ips_ill_g_lock);
7190 return (numifs);
7191 }
7192
7193 /*
 * Return the number of logical interfaces that match the given family,
 * lifn_flags, and zone.
7195 */
7196 static uint_t
7197 ip_get_numlifs(int family, int lifn_flags, zoneid_t zoneid, ip_stack_t *ipst)
7198 {
7199 uint_t numifs = 0;
7200 ill_t *ill;
7201 ipif_t *ipif;
7202 ill_walk_context_t ctx;
7203
7204 ip1dbg(("ip_get_numlifs(%d %u %d)\n", family, lifn_flags, (int)zoneid));
7205
7206 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
7207 if (family == AF_INET)
7208 ill = ILL_START_WALK_V4(&ctx, ipst);
7209 else if (family == AF_INET6)
7210 ill = ILL_START_WALK_V6(&ctx, ipst);
7211 else
7212 ill = ILL_START_WALK_ALL(&ctx, ipst);
7213
7214 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
7215 if (IS_UNDER_IPMP(ill) && !(lifn_flags & LIFC_UNDER_IPMP))
7216 continue;
7217
7218 for (ipif = ill->ill_ipif; ipif != NULL;
7219 ipif = ipif->ipif_next) {
7220 if ((ipif->ipif_flags & IPIF_NOXMIT) &&
7221 !(lifn_flags & LIFC_NOXMIT))
7222 continue;
7223 if ((ipif->ipif_flags & IPIF_TEMPORARY) &&
7224 !(lifn_flags & LIFC_TEMPORARY))
7225 continue;
7226 if (((ipif->ipif_flags &
7227 (IPIF_NOXMIT|IPIF_NOLOCAL|
7228 IPIF_DEPRECATED)) ||
7229 IS_LOOPBACK(ill) ||
7230 !(ipif->ipif_flags & IPIF_UP)) &&
7231 (lifn_flags & LIFC_EXTERNAL_SOURCE))
7232 continue;
7233
7234 if (zoneid != ipif->ipif_zoneid &&
7235 ipif->ipif_zoneid != ALL_ZONES &&
7236 (zoneid != GLOBAL_ZONEID ||
7237 !(lifn_flags & LIFC_ALLZONES)))
7238 continue;
7239
7240 numifs++;
7241 }
7242 }
7243 rw_exit(&ipst->ips_ill_g_lock);
7244 return (numifs);
7245 }
7246
7247 uint_t
7248 ip_get_lifsrcofnum(ill_t *ill)
7249 {
7250 uint_t numifs = 0;
7251 ill_t *ill_head = ill;
7252 ip_stack_t *ipst = ill->ill_ipst;
7253
7254 /*
	 * ill_g_usesrc_lock protects ill_usesrc_grp_next; for example, some
	 * other thread may be trying to relink the ILLs in this usesrc group
	 * and adjusting the ill_usesrc_grp_next pointers.
7258 */
7259 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER);
7260 if ((ill->ill_usesrc_ifindex == 0) &&
7261 (ill->ill_usesrc_grp_next != NULL)) {
7262 for (; (ill != NULL) && (ill->ill_usesrc_grp_next != ill_head);
7263 ill = ill->ill_usesrc_grp_next)
7264 numifs++;
7265 }
7266 rw_exit(&ipst->ips_ill_g_usesrc_lock);
7267
7268 return (numifs);
7269 }
7270
7271 /* Null values are passed in for ipif, sin, and ifreq */
7272 /* ARGSUSED */
7273 int
7274 ip_sioctl_get_ifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
7275 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
7276 {
7277 int *nump;
7278 conn_t *connp = Q_TO_CONN(q);
7279
7280 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */
7281
7282 /* Existence of b_cont->b_cont checked in ip_wput_nondata */
7283 nump = (int *)mp->b_cont->b_cont->b_rptr;
7284
7285 *nump = ip_get_numifs(connp->conn_zoneid,
7286 connp->conn_netstack->netstack_ip);
7287 ip1dbg(("ip_sioctl_get_ifnum numifs %d", *nump));
7288 return (0);
7289 }
7290
7291 /* Null values are passed in for ipif, sin, and ifreq */
7292 /* ARGSUSED */
7293 int
7294 ip_sioctl_get_lifnum(ipif_t *dummy_ipif, sin_t *dummy_sin,
7295 queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
7296 {
7297 struct lifnum *lifn;
7298 mblk_t *mp1;
7299 conn_t *connp = Q_TO_CONN(q);
7300
7301 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */
7302
7303 /* Existence checked in ip_wput_nondata */
7304 mp1 = mp->b_cont->b_cont;
7305
7306 lifn = (struct lifnum *)mp1->b_rptr;
7307 switch (lifn->lifn_family) {
7308 case AF_UNSPEC:
7309 case AF_INET:
7310 case AF_INET6:
7311 break;
7312 default:
7313 return (EAFNOSUPPORT);
7314 }
7315
7316 lifn->lifn_count = ip_get_numlifs(lifn->lifn_family, lifn->lifn_flags,
7317 connp->conn_zoneid, connp->conn_netstack->netstack_ip);
7318 ip1dbg(("ip_sioctl_get_lifnum numifs %d", lifn->lifn_count));
7319 return (0);
7320 }
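
/*
 * A hedged userland sketch of the SIOCGLIFNUM request serviced above,
 * typically issued to size a subsequent SIOCGLIFCONF buffer (names and
 * flag choices below are illustrative):
 *
 *	struct lifnum lifn;
 *
 *	(void) memset(&lifn, 0, sizeof (lifn));
 *	lifn.lifn_family = AF_UNSPEC;	(count IPv4 and IPv6)
 *	lifn.lifn_flags = LIFC_NOXMIT | LIFC_TEMPORARY;
 *	if (ioctl(s, SIOCGLIFNUM, (char *)&lifn) < 0)
 *		return (-1);
 *	bufsize = lifn.lifn_count * sizeof (struct lifreq);
 */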
7321
7322 /* ARGSUSED */
7323 int
7324 ip_sioctl_get_ifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
7325 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
7326 {
7327 STRUCT_HANDLE(ifconf, ifc);
7328 mblk_t *mp1;
7329 struct iocblk *iocp;
7330 struct ifreq *ifr;
7331 ill_walk_context_t ctx;
7332 ill_t *ill;
7333 ipif_t *ipif;
7334 struct sockaddr_in *sin;
7335 int32_t ifclen;
7336 zoneid_t zoneid;
7337 ip_stack_t *ipst = CONNQ_TO_IPST(q);
7338
7339 ASSERT(q->q_next == NULL); /* not valid ioctls for ip as a module */
7340
7341 ip1dbg(("ip_sioctl_get_ifconf"));
7342 /* Existence verified in ip_wput_nondata */
7343 mp1 = mp->b_cont->b_cont;
7344 iocp = (struct iocblk *)mp->b_rptr;
7345 zoneid = Q_TO_CONN(q)->conn_zoneid;
7346
7347 /*
7348 * The original SIOCGIFCONF passed in a struct ifconf which specified
7349 * the user buffer address and length into which the list of struct
7350 * ifreqs was to be copied. Since AT&T Streams does not seem to
7351 * allow M_COPYOUT to be used in conjunction with I_STR IOCTLS,
7352 * the SIOCGIFCONF operation was redefined to simply provide
7353 * a large output buffer into which we are supposed to jam the ifreq
7354 * array. The same ioctl command code was used, despite the fact that
7355 * both the applications and the kernel code had to change, thus making
7356 * it impossible to support both interfaces.
7357 *
7358 * For reasons not good enough to try to explain, the following
7359 * algorithm is used for deciding what to do with one of these:
7360 * If the IOCTL comes in as an I_STR, it is assumed to be of the new
7361 * form with the output buffer coming down as the continuation message.
7362 * If it arrives as a TRANSPARENT IOCTL, it is assumed to be old style,
7363 * and we have to copy in the ifconf structure to find out how big the
7364 * output buffer is and where to copy out to. Sure no problem...
7365 *
7366 */
7367 STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, NULL);
7368 if ((mp1->b_wptr - mp1->b_rptr) == STRUCT_SIZE(ifc)) {
7369 int numifs = 0;
7370 size_t ifc_bufsize;
7371
7372 /*
7373 * Must be (better be!) continuation of a TRANSPARENT
7374 * IOCTL. We just copied in the ifconf structure.
7375 */
7376 STRUCT_SET_HANDLE(ifc, iocp->ioc_flag,
7377 (struct ifconf *)mp1->b_rptr);
7378
7379 /*
7380 * Allocate a buffer to hold requested information.
7381 *
7382 * If ifc_len is larger than what is needed, we only
7383 * allocate what we will use.
7384 *
7385 * If ifc_len is smaller than what is needed, return
7386 * EINVAL.
7387 *
		 * XXX: the ill_t structure can have two counters, for
7389 * v4 and v6 (not just ill_ipif_up_count) to store the
7390 * number of interfaces for a device, so we don't need
7391 * to count them here...
7392 */
7393 numifs = ip_get_numifs(zoneid, ipst);
7394
7395 ifclen = STRUCT_FGET(ifc, ifc_len);
7396 ifc_bufsize = numifs * sizeof (struct ifreq);
7397 if (ifc_bufsize > ifclen) {
7398 if (iocp->ioc_cmd == O_SIOCGIFCONF) {
7399 /* old behaviour */
7400 return (EINVAL);
7401 } else {
7402 ifc_bufsize = ifclen;
7403 }
7404 }
7405
7406 mp1 = mi_copyout_alloc(q, mp,
7407 STRUCT_FGETP(ifc, ifc_buf), ifc_bufsize, B_FALSE);
7408 if (mp1 == NULL)
7409 return (ENOMEM);
7410
7411 mp1->b_wptr = mp1->b_rptr + ifc_bufsize;
7412 }
7413 bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr);
7414 /*
7415 * the SIOCGIFCONF ioctl only knows about
7416 * IPv4 addresses, so don't try to tell
7417 * it about interfaces with IPv6-only
7418 * addresses. (Last parm 'isv6' is B_FALSE)
7419 */
7420
7421 ifr = (struct ifreq *)mp1->b_rptr;
7422
7423 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
7424 ill = ILL_START_WALK_V4(&ctx, ipst);
7425 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
7426 if (IS_UNDER_IPMP(ill))
7427 continue;
7428 for (ipif = ill->ill_ipif; ipif != NULL;
7429 ipif = ipif->ipif_next) {
7430 if (zoneid != ipif->ipif_zoneid &&
7431 ipif->ipif_zoneid != ALL_ZONES)
7432 continue;
7433 if ((uchar_t *)&ifr[1] > mp1->b_wptr) {
7434 if (iocp->ioc_cmd == O_SIOCGIFCONF) {
7435 /* old behaviour */
7436 rw_exit(&ipst->ips_ill_g_lock);
7437 return (EINVAL);
7438 } else {
7439 goto if_copydone;
7440 }
7441 }
7442 ipif_get_name(ipif, ifr->ifr_name,
7443 sizeof (ifr->ifr_name));
7444 sin = (sin_t *)&ifr->ifr_addr;
7445 *sin = sin_null;
7446 sin->sin_family = AF_INET;
7447 sin->sin_addr.s_addr = ipif->ipif_lcl_addr;
7448 ifr++;
7449 }
7450 }
7451 if_copydone:
7452 rw_exit(&ipst->ips_ill_g_lock);
7453 mp1->b_wptr = (uchar_t *)ifr;
7454
7455 if (STRUCT_BUF(ifc) != NULL) {
7456 STRUCT_FSET(ifc, ifc_len,
7457 (int)((uchar_t *)ifr - mp1->b_rptr));
7458 }
7459 return (0);
7460 }
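
/*
 * An illustrative (uncompiled) userland counterpart of the transparent
 * SIOCGIFCONF form handled above; the fixed array size is an assumption
 * for the sketch:
 *
 *	struct ifconf ifc;
 *	struct ifreq reqs[32];
 *
 *	ifc.ifc_len = sizeof (reqs);
 *	ifc.ifc_req = reqs;
 *	if (ioctl(s, SIOCGIFCONF, (char *)&ifc) < 0)
 *		return (-1);
 *	nifs = ifc.ifc_len / sizeof (struct ifreq);
 *	(the kernel shrinks ifc_len to the bytes actually filled in)
 */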
7461
/*
 * Get the interfaces that are using the address hosted on the interface
 * passed in as a source address.
 */
7466 /* ARGSUSED */
7467 int
7468 ip_sioctl_get_lifsrcof(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
7469 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
7470 {
7471 mblk_t *mp1;
7472 ill_t *ill, *ill_head;
7473 ipif_t *ipif, *orig_ipif;
7474 int numlifs = 0;
7475 size_t lifs_bufsize, lifsmaxlen;
7476 struct lifreq *lifr;
7477 struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
7478 uint_t ifindex;
7479 zoneid_t zoneid;
7480 boolean_t isv6 = B_FALSE;
7481 struct sockaddr_in *sin;
7482 struct sockaddr_in6 *sin6;
7483 STRUCT_HANDLE(lifsrcof, lifs);
7484 ip_stack_t *ipst;
7485
7486 ipst = CONNQ_TO_IPST(q);
7487
7488 ASSERT(q->q_next == NULL);
7489
7490 zoneid = Q_TO_CONN(q)->conn_zoneid;
7491
7492 /* Existence verified in ip_wput_nondata */
7493 mp1 = mp->b_cont->b_cont;
7494
7495 /*
7496 * Must be (better be!) continuation of a TRANSPARENT
7497 * IOCTL. We just copied in the lifsrcof structure.
7498 */
7499 STRUCT_SET_HANDLE(lifs, iocp->ioc_flag,
7500 (struct lifsrcof *)mp1->b_rptr);
7501
7502 if (MBLKL(mp1) != STRUCT_SIZE(lifs))
7503 return (EINVAL);
7504
7505 ifindex = STRUCT_FGET(lifs, lifs_ifindex);
7506 isv6 = (Q_TO_CONN(q))->conn_family == AF_INET6;
7507 ipif = ipif_lookup_on_ifindex(ifindex, isv6, zoneid, ipst);
7508 if (ipif == NULL) {
7509 ip1dbg(("ip_sioctl_get_lifsrcof: no ipif for ifindex %d\n",
7510 ifindex));
7511 return (ENXIO);
7512 }
7513
7514 /* Allocate a buffer to hold requested information */
7515 numlifs = ip_get_lifsrcofnum(ipif->ipif_ill);
7516 lifs_bufsize = numlifs * sizeof (struct lifreq);
7517 lifsmaxlen = STRUCT_FGET(lifs, lifs_maxlen);
7518 /* The actual size needed is always returned in lifs_len */
7519 STRUCT_FSET(lifs, lifs_len, lifs_bufsize);
7520
7521 /* If the amount we need is more than what is passed in, abort */
7522 if (lifs_bufsize > lifsmaxlen || lifs_bufsize == 0) {
7523 ipif_refrele(ipif);
7524 return (0);
7525 }
7526
7527 mp1 = mi_copyout_alloc(q, mp,
7528 STRUCT_FGETP(lifs, lifs_buf), lifs_bufsize, B_FALSE);
7529 if (mp1 == NULL) {
7530 ipif_refrele(ipif);
7531 return (ENOMEM);
7532 }
7533
7534 mp1->b_wptr = mp1->b_rptr + lifs_bufsize;
7535 bzero(mp1->b_rptr, lifs_bufsize);
7536
7537 lifr = (struct lifreq *)mp1->b_rptr;
7538
7539 ill = ill_head = ipif->ipif_ill;
7540 orig_ipif = ipif;
7541
7542 /* ill_g_usesrc_lock protects ill_usesrc_grp_next */
7543 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER);
7544 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
7545
7546 ill = ill->ill_usesrc_grp_next; /* start from next ill */
7547 for (; (ill != NULL) && (ill != ill_head);
7548 ill = ill->ill_usesrc_grp_next) {
7549
7550 if ((uchar_t *)&lifr[1] > mp1->b_wptr)
7551 break;
7552
7553 ipif = ill->ill_ipif;
7554 ipif_get_name(ipif, lifr->lifr_name, sizeof (lifr->lifr_name));
7555 if (ipif->ipif_isv6) {
7556 sin6 = (sin6_t *)&lifr->lifr_addr;
7557 *sin6 = sin6_null;
7558 sin6->sin6_family = AF_INET6;
7559 sin6->sin6_addr = ipif->ipif_v6lcl_addr;
7560 lifr->lifr_addrlen = ip_mask_to_plen_v6(
7561 &ipif->ipif_v6net_mask);
7562 } else {
7563 sin = (sin_t *)&lifr->lifr_addr;
7564 *sin = sin_null;
7565 sin->sin_family = AF_INET;
7566 sin->sin_addr.s_addr = ipif->ipif_lcl_addr;
7567 lifr->lifr_addrlen = ip_mask_to_plen(
7568 ipif->ipif_net_mask);
7569 }
7570 lifr++;
7571 }
7572 rw_exit(&ipst->ips_ill_g_lock);
7573 rw_exit(&ipst->ips_ill_g_usesrc_lock);
7574 ipif_refrele(orig_ipif);
7575 mp1->b_wptr = (uchar_t *)lifr;
7576 STRUCT_FSET(lifs, lifs_len, (int)((uchar_t *)lifr - mp1->b_rptr));
7577
7578 return (0);
7579 }
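
/*
 * A hedged sketch of a SIOCGLIFSRCOF caller; the two-pass sizing
 * convention (lifs_len always reports the size actually needed) follows
 * from the handler above, the rest is illustrative:
 *
 *	struct lifsrcof lifs;
 *
 *	(void) memset(&lifs, 0, sizeof (lifs));
 *	lifs.lifs_ifindex = ifindex;	(index of the usesrc target ill)
 *	lifs.lifs_maxlen = bufsize;
 *	lifs.lifs_buf = buf;
 *	if (ioctl(s, SIOCGLIFSRCOF, (char *)&lifs) < 0)
 *		return (-1);
 *	if (lifs.lifs_len > bufsize)
 *		(retry with a buffer of lifs.lifs_len bytes)
 */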
7580
7581 /* ARGSUSED */
7582 int
7583 ip_sioctl_get_lifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
7584 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
7585 {
7586 mblk_t *mp1;
7587 int list;
7588 ill_t *ill;
7589 ipif_t *ipif;
7590 int flags;
7591 int numlifs = 0;
7592 size_t lifc_bufsize;
7593 struct lifreq *lifr;
7594 sa_family_t family;
7595 struct sockaddr_in *sin;
7596 struct sockaddr_in6 *sin6;
7597 ill_walk_context_t ctx;
7598 struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
7599 int32_t lifclen;
7600 zoneid_t zoneid;
7601 STRUCT_HANDLE(lifconf, lifc);
7602 ip_stack_t *ipst = CONNQ_TO_IPST(q);
7603
7604 ip1dbg(("ip_sioctl_get_lifconf"));
7605
7606 ASSERT(q->q_next == NULL);
7607
7608 zoneid = Q_TO_CONN(q)->conn_zoneid;
7609
7610 /* Existence verified in ip_wput_nondata */
7611 mp1 = mp->b_cont->b_cont;
7612
7613 /*
7614 * An extended version of SIOCGIFCONF that takes an
7615 * additional address family and flags field.
	 * AF_UNSPEC retrieves both IPv4 and IPv6 interfaces.
	 * Unless LIFC_NOXMIT is specified, IPIF_NOXMIT
	 * interfaces are omitted.
	 * Similarly, IPIF_TEMPORARY interfaces are omitted
	 * unless LIFC_TEMPORARY is specified.
	 * If LIFC_EXTERNAL_SOURCE is specified, interfaces that
	 * are IPIF_NOXMIT, IPIF_NOLOCAL, PHYI_LOOPBACK,
	 * IPIF_DEPRECATED, or not IPIF_UP are omitted.
	 * LIFC_EXTERNAL_SOURCE has priority over LIFC_NOXMIT.
7625 */
7626 STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, NULL);
7627
7628 if ((mp1->b_wptr - mp1->b_rptr) != STRUCT_SIZE(lifc))
7629 return (EINVAL);
7630
7631 /*
7632 * Must be (better be!) continuation of a TRANSPARENT
7633 * IOCTL. We just copied in the lifconf structure.
7634 */
7635 STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, (struct lifconf *)mp1->b_rptr);
7636
7637 family = STRUCT_FGET(lifc, lifc_family);
7638 flags = STRUCT_FGET(lifc, lifc_flags);
7639
7640 switch (family) {
7641 case AF_UNSPEC:
7642 /*
7643 * walk all ILL's.
7644 */
7645 list = MAX_G_HEADS;
7646 break;
7647 case AF_INET:
7648 /*
7649 * walk only IPV4 ILL's.
7650 */
7651 list = IP_V4_G_HEAD;
7652 break;
7653 case AF_INET6:
7654 /*
7655 * walk only IPV6 ILL's.
7656 */
7657 list = IP_V6_G_HEAD;
7658 break;
7659 default:
7660 return (EAFNOSUPPORT);
7661 }
7662
7663 /*
7664 * Allocate a buffer to hold requested information.
7665 *
7666 * If lifc_len is larger than what is needed, we only
7667 * allocate what we will use.
7668 *
7669 * If lifc_len is smaller than what is needed, return
7670 * EINVAL.
7671 */
7672 numlifs = ip_get_numlifs(family, flags, zoneid, ipst);
7673 lifc_bufsize = numlifs * sizeof (struct lifreq);
7674 lifclen = STRUCT_FGET(lifc, lifc_len);
7675 if (lifc_bufsize > lifclen) {
7676 if (iocp->ioc_cmd == O_SIOCGLIFCONF)
7677 return (EINVAL);
7678 else
7679 lifc_bufsize = lifclen;
7680 }
7681
7682 mp1 = mi_copyout_alloc(q, mp,
7683 STRUCT_FGETP(lifc, lifc_buf), lifc_bufsize, B_FALSE);
7684 if (mp1 == NULL)
7685 return (ENOMEM);
7686
7687 mp1->b_wptr = mp1->b_rptr + lifc_bufsize;
7688 bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr);
7689
7690 lifr = (struct lifreq *)mp1->b_rptr;
7691
7692 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
7693 ill = ill_first(list, list, &ctx, ipst);
7694 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
7695 if (IS_UNDER_IPMP(ill) && !(flags & LIFC_UNDER_IPMP))
7696 continue;
7697
7698 for (ipif = ill->ill_ipif; ipif != NULL;
7699 ipif = ipif->ipif_next) {
7700 if ((ipif->ipif_flags & IPIF_NOXMIT) &&
7701 !(flags & LIFC_NOXMIT))
7702 continue;
7703
7704 if ((ipif->ipif_flags & IPIF_TEMPORARY) &&
7705 !(flags & LIFC_TEMPORARY))
7706 continue;
7707
7708 if (((ipif->ipif_flags &
7709 (IPIF_NOXMIT|IPIF_NOLOCAL|
7710 IPIF_DEPRECATED)) ||
7711 IS_LOOPBACK(ill) ||
7712 !(ipif->ipif_flags & IPIF_UP)) &&
7713 (flags & LIFC_EXTERNAL_SOURCE))
7714 continue;
7715
7716 if (zoneid != ipif->ipif_zoneid &&
7717 ipif->ipif_zoneid != ALL_ZONES &&
7718 (zoneid != GLOBAL_ZONEID ||
7719 !(flags & LIFC_ALLZONES)))
7720 continue;
7721
7722 if ((uchar_t *)&lifr[1] > mp1->b_wptr) {
7723 if (iocp->ioc_cmd == O_SIOCGLIFCONF) {
7724 rw_exit(&ipst->ips_ill_g_lock);
7725 return (EINVAL);
7726 } else {
7727 goto lif_copydone;
7728 }
7729 }
7730
7731 ipif_get_name(ipif, lifr->lifr_name,
7732 sizeof (lifr->lifr_name));
7733 lifr->lifr_type = ill->ill_type;
7734 if (ipif->ipif_isv6) {
7735 sin6 = (sin6_t *)&lifr->lifr_addr;
7736 *sin6 = sin6_null;
7737 sin6->sin6_family = AF_INET6;
7738 sin6->sin6_addr =
7739 ipif->ipif_v6lcl_addr;
7740 lifr->lifr_addrlen =
7741 ip_mask_to_plen_v6(
7742 &ipif->ipif_v6net_mask);
7743 } else {
7744 sin = (sin_t *)&lifr->lifr_addr;
7745 *sin = sin_null;
7746 sin->sin_family = AF_INET;
7747 sin->sin_addr.s_addr =
7748 ipif->ipif_lcl_addr;
7749 lifr->lifr_addrlen =
7750 ip_mask_to_plen(
7751 ipif->ipif_net_mask);
7752 }
7753 lifr++;
7754 }
7755 }
7756 lif_copydone:
7757 rw_exit(&ipst->ips_ill_g_lock);
7758
7759 mp1->b_wptr = (uchar_t *)lifr;
7760 if (STRUCT_BUF(lifc) != NULL) {
7761 STRUCT_FSET(lifc, lifc_len,
7762 (int)((uchar_t *)lifr - mp1->b_rptr));
7763 }
7764 return (0);
7765 }
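
/*
 * Tying the two handlers together, a hedged sketch of the usual
 * SIOCGLIFNUM + SIOCGLIFCONF pairing from userland (allocation and
 * names are illustrative; a careful caller also copes with interfaces
 * appearing between the two calls):
 *
 *	struct lifnum lifn;
 *	struct lifconf lifc;
 *	caddr_t buf;
 *
 *	(void) memset(&lifn, 0, sizeof (lifn));
 *	lifn.lifn_family = AF_UNSPEC;
 *	if (ioctl(s, SIOCGLIFNUM, (char *)&lifn) < 0)
 *		return (-1);
 *	buf = malloc(lifn.lifn_count * sizeof (struct lifreq));
 *	lifc.lifc_family = AF_UNSPEC;
 *	lifc.lifc_flags = 0;
 *	lifc.lifc_len = lifn.lifn_count * sizeof (struct lifreq);
 *	lifc.lifc_buf = buf;
 *	if (ioctl(s, SIOCGLIFCONF, (char *)&lifc) < 0)
 *		return (-1);
 *	(entries are struct lifreq's at lifc.lifc_buf; lifc_len is
 *	shrunk to the bytes actually filled in)
 */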
7766
7767 static void
7768 ip_sioctl_ip6addrpolicy(queue_t *q, mblk_t *mp)
7769 {
7770 ip6_asp_t *table;
7771 size_t table_size;
7772 mblk_t *data_mp;
7773 struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
7774 ip_stack_t *ipst;
7775
7776 if (q->q_next == NULL)
7777 ipst = CONNQ_TO_IPST(q);
7778 else
7779 ipst = ILLQ_TO_IPST(q);
7780
7781 /* These two ioctls are I_STR only */
7782 if (iocp->ioc_count == TRANSPARENT) {
7783 miocnak(q, mp, 0, EINVAL);
7784 return;
7785 }
7786
7787 data_mp = mp->b_cont;
7788 if (data_mp == NULL) {
7789 /* The user passed us a NULL argument */
7790 table = NULL;
7791 table_size = iocp->ioc_count;
7792 } else {
7793 /*
7794 * The user provided a table. The stream head
7795 * may have copied in the user data in chunks,
7796 * so make sure everything is pulled up
7797 * properly.
7798 */
7799 if (MBLKL(data_mp) < iocp->ioc_count) {
7800 mblk_t *new_data_mp;
7801 if ((new_data_mp = msgpullup(data_mp, -1)) ==
7802 NULL) {
7803 miocnak(q, mp, 0, ENOMEM);
7804 return;
7805 }
7806 freemsg(data_mp);
7807 data_mp = new_data_mp;
7808 mp->b_cont = data_mp;
7809 }
7810 table = (ip6_asp_t *)data_mp->b_rptr;
7811 table_size = iocp->ioc_count;
7812 }
7813
7814 switch (iocp->ioc_cmd) {
7815 case SIOCGIP6ADDRPOLICY:
7816 iocp->ioc_rval = ip6_asp_get(table, table_size, ipst);
7817 if (iocp->ioc_rval == -1)
7818 iocp->ioc_error = EINVAL;
7819 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4
7820 else if (table != NULL &&
7821 (iocp->ioc_flag & IOC_MODELS) == IOC_ILP32) {
7822 ip6_asp_t *src = table;
7823 ip6_asp32_t *dst = (void *)table;
7824 int count = table_size / sizeof (ip6_asp_t);
7825 int i;
7826
			/*
			 * We need to do an in-place shrink of the array
			 * to match the alignment attributes of the
			 * 32-bit ABI that will be looking at it.
			 */
7832 /* LINTED: logical expression always true: op "||" */
7833 ASSERT(sizeof (*src) > sizeof (*dst));
7834 for (i = 1; i < count; i++)
7835 bcopy(src + i, dst + i, sizeof (*dst));
7836 }
7837 #endif
7838 break;
7839
7840 case SIOCSIP6ADDRPOLICY:
7841 ASSERT(mp->b_prev == NULL);
7842 mp->b_prev = (void *)q;
7843 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4
7844 /*
7845 * We pass in the datamodel here so that the ip6_asp_replace()
7846 * routine can handle converting from 32-bit to native formats
7847 * where necessary.
7848 *
7849 * A better way to handle this might be to convert the inbound
7850 * data structure here, and hang it off a new 'mp'; thus the
7851 * ip6_asp_replace() logic would always be dealing with native
7852 * format data structures..
7853 *
7854 * (An even simpler way to handle these ioctls is to just
7855 * add a 32-bit trailing 'pad' field to the ip6_asp_t structure
7856 * and just recompile everything that depends on it.)
7857 */
7858 #endif
7859 ip6_asp_replace(mp, table, table_size, B_FALSE, ipst,
7860 iocp->ioc_flag & IOC_MODELS);
7861 return;
7862 }
7863
7864 DB_TYPE(mp) = (iocp->ioc_error == 0) ? M_IOCACK : M_IOCNAK;
7865 qreply(q, mp);
7866 }
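
/*
 * These two commands arrive from userland (e.g. from ipaddrsel(1M)) as
 * I_STR ioctls.  A hedged sketch of the get side, assuming the two-pass
 * sizing suggested by the NULL-table path above (where the required
 * size comes back via the ioctl return value):
 *
 *	struct strioctl si;
 *
 *	si.ic_cmd = SIOCGIP6ADDRPOLICY;
 *	si.ic_timout = -1;
 *	si.ic_len = 0;
 *	si.ic_dp = NULL;
 *	size = ioctl(fd, I_STR, &si);
 *	(then repeat with ic_len = size and ic_dp pointing at a buffer
 *	of that many bytes, returned as an ip6_asp_t array)
 */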
7867
7868 static void
7869 ip_sioctl_dstinfo(queue_t *q, mblk_t *mp)
7870 {
7871 mblk_t *data_mp;
7872 struct dstinforeq *dir;
7873 uint8_t *end, *cur;
7874 in6_addr_t *daddr, *saddr;
7875 ipaddr_t v4daddr;
7876 ire_t *ire;
7877 ipaddr_t v4setsrc;
7878 in6_addr_t v6setsrc;
7879 char *slabel, *dlabel;
7880 boolean_t isipv4;
7881 int match_ire;
7882 ill_t *dst_ill;
7883 struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
7884 conn_t *connp = Q_TO_CONN(q);
7885 zoneid_t zoneid = IPCL_ZONEID(connp);
7886 ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
7887 uint64_t ipif_flags;
7888
7889 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */
7890
7891 /*
7892 * This ioctl is I_STR only, and must have a
7893 * data mblk following the M_IOCTL mblk.
7894 */
7895 data_mp = mp->b_cont;
7896 if (iocp->ioc_count == TRANSPARENT || data_mp == NULL) {
7897 miocnak(q, mp, 0, EINVAL);
7898 return;
7899 }
7900
7901 if (MBLKL(data_mp) < iocp->ioc_count) {
7902 mblk_t *new_data_mp;
7903
7904 if ((new_data_mp = msgpullup(data_mp, -1)) == NULL) {
7905 miocnak(q, mp, 0, ENOMEM);
7906 return;
7907 }
7908 freemsg(data_mp);
7909 data_mp = new_data_mp;
7910 mp->b_cont = data_mp;
7911 }
7912 match_ire = MATCH_IRE_DSTONLY;
7913
7914 for (cur = data_mp->b_rptr, end = data_mp->b_wptr;
7915 end - cur >= sizeof (struct dstinforeq);
7916 cur += sizeof (struct dstinforeq)) {
7917 dir = (struct dstinforeq *)cur;
7918 daddr = &dir->dir_daddr;
7919 saddr = &dir->dir_saddr;
7920
7921 /*
7922 * ip_addr_scope_v6() and ip6_asp_lookup() handle
7923 * v4 mapped addresses; ire_ftable_lookup_v6()
7924 * and ip_select_source_v6() do not.
7925 */
7926 dir->dir_dscope = ip_addr_scope_v6(daddr);
7927 dlabel = ip6_asp_lookup(daddr, &dir->dir_precedence, ipst);
7928
7929 isipv4 = IN6_IS_ADDR_V4MAPPED(daddr);
7930 if (isipv4) {
7931 IN6_V4MAPPED_TO_IPADDR(daddr, v4daddr);
7932 v4setsrc = INADDR_ANY;
7933 ire = ire_route_recursive_v4(v4daddr, 0, NULL, zoneid,
7934 NULL, match_ire, IRR_ALLOCATE, 0, ipst, &v4setsrc,
7935 NULL, NULL);
7936 } else {
7937 v6setsrc = ipv6_all_zeros;
7938 ire = ire_route_recursive_v6(daddr, 0, NULL, zoneid,
7939 NULL, match_ire, IRR_ALLOCATE, 0, ipst, &v6setsrc,
7940 NULL, NULL);
7941 }
7942 ASSERT(ire != NULL);
7943 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
7944 ire_refrele(ire);
7945 dir->dir_dreachable = 0;
7946
7947 /* move on to next dst addr */
7948 continue;
7949 }
7950 dir->dir_dreachable = 1;
7951
7952 dst_ill = ire_nexthop_ill(ire);
7953 if (dst_ill == NULL) {
7954 ire_refrele(ire);
7955 continue;
7956 }
7957
7958 /* With ipmp we most likely look at the ipmp ill here */
7959 dir->dir_dmactype = dst_ill->ill_mactype;
7960
7961 if (isipv4) {
7962 ipaddr_t v4saddr;
7963
7964 if (ip_select_source_v4(dst_ill, v4setsrc, v4daddr,
7965 connp->conn_ixa->ixa_multicast_ifaddr, zoneid, ipst,
7966 &v4saddr, NULL, &ipif_flags) != 0) {
7967 v4saddr = INADDR_ANY;
7968 ipif_flags = 0;
7969 }
7970 IN6_IPADDR_TO_V4MAPPED(v4saddr, saddr);
7971 } else {
7972 if (ip_select_source_v6(dst_ill, &v6setsrc, daddr,
7973 zoneid, ipst, B_FALSE, IPV6_PREFER_SRC_DEFAULT,
7974 saddr, NULL, &ipif_flags) != 0) {
7975 *saddr = ipv6_all_zeros;
7976 ipif_flags = 0;
7977 }
7978 }
7979
7980 dir->dir_sscope = ip_addr_scope_v6(saddr);
7981 slabel = ip6_asp_lookup(saddr, NULL, ipst);
7982 dir->dir_labelmatch = ip6_asp_labelcmp(dlabel, slabel);
7983 dir->dir_sdeprecated = (ipif_flags & IPIF_DEPRECATED) ? 1 : 0;
7984 ire_refrele(ire);
7985 ill_refrele(dst_ill);
7986 }
7987 miocack(q, mp, iocp->ioc_count, 0);
7988 }
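
/*
 * SIOCGDSTINFO is likewise I_STR-only and takes an array of dstinforeq
 * structures with dir_daddr filled in; a hedged sketch of a caller
 * (NDSTS and candidates[] are illustrative):
 *
 *	struct dstinforeq dinfo[NDSTS];
 *	struct strioctl si;
 *
 *	(void) memset(dinfo, 0, sizeof (dinfo));
 *	for (i = 0; i < NDSTS; i++)
 *		dinfo[i].dir_daddr = candidates[i];	(v6 or v4-mapped)
 *	si.ic_cmd = SIOCGDSTINFO;
 *	si.ic_timout = -1;
 *	si.ic_len = sizeof (dinfo);
 *	si.ic_dp = (char *)dinfo;
 *	if (ioctl(fd, I_STR, &si) < 0)
 *		return (-1);
 *	(the kernel fills dir_dreachable, dir_saddr, dir_sscope, etc.,
 *	as computed in the loop above)
 */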
7989
7990 /*
7991 * Check if this is an address assigned to this machine.
7992 * Skips interfaces that are down by using ire checks.
7993 * Translates mapped addresses to v4 addresses and then
7994 * treats them as such, returning true if the v4 address
7995 * associated with this mapped address is configured.
7996 * Note: Applications will have to be careful what they do
7997 * with the response; use of mapped addresses limits
7998 * what can be done with the socket, especially with
7999 * respect to socket options and ioctls - neither IPv4
8000 * options nor IPv6 sticky options/ancillary data options
8001 * may be used.
8002 */
8003 /* ARGSUSED */
8004 int
8005 ip_sioctl_tmyaddr(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
8006 ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
8007 {
8008 struct sioc_addrreq *sia;
8009 sin_t *sin;
8010 ire_t *ire;
8011 mblk_t *mp1;
8012 zoneid_t zoneid;
8013 ip_stack_t *ipst;
8014
8015 ip1dbg(("ip_sioctl_tmyaddr"));
8016
8017 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */
8018 zoneid = Q_TO_CONN(q)->conn_zoneid;
8019 ipst = CONNQ_TO_IPST(q);
8020
8021 /* Existence verified in ip_wput_nondata */
8022 mp1 = mp->b_cont->b_cont;
8023 sia = (struct sioc_addrreq *)mp1->b_rptr;
8024 sin = (sin_t *)&sia->sa_addr;
8025 switch (sin->sin_family) {
8026 case AF_INET6: {
8027 sin6_t *sin6 = (sin6_t *)sin;
8028
8029 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
8030 ipaddr_t v4_addr;
8031
8032 IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr,
8033 v4_addr);
8034 ire = ire_ftable_lookup_v4(v4_addr, 0, 0,
8035 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, NULL,
8036 MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL);
8037 } else {
8038 in6_addr_t v6addr;
8039
8040 v6addr = sin6->sin6_addr;
8041 ire = ire_ftable_lookup_v6(&v6addr, 0, 0,
8042 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, NULL,
8043 MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL);
8044 }
8045 break;
8046 }
8047 case AF_INET: {
8048 ipaddr_t v4addr;
8049
8050 v4addr = sin->sin_addr.s_addr;
8051 ire = ire_ftable_lookup_v4(v4addr, 0, 0,
8052 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid,
8053 NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL);
8054 break;
8055 }
8056 default:
8057 return (EAFNOSUPPORT);
8058 }
8059 if (ire != NULL) {
8060 sia->sa_res = 1;
8061 ire_refrele(ire);
8062 } else {
8063 sia->sa_res = 0;
8064 }
8065 return (0);
8066 }
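
/*
 * A minimal userland sketch of SIOCTMYADDR; SIOCTONLINK (below) uses
 * the same calling convention and differs only in what sa_res reports:
 *
 *	struct sioc_addrreq ar;
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&ar.sa_addr;
 *
 *	(void) memset(&ar, 0, sizeof (ar));
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = inet_addr("192.0.2.1");
 *	if (ioctl(s, SIOCTMYADDR, (char *)&ar) < 0)
 *		return (-1);
 *	(ar.sa_res is nonzero iff the address is assigned to this node)
 */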
8067
8068 /*
 * Check if this is an address assigned on-link, i.e., a neighbor,
 * and make sure it's reachable from the current zone.
8071 * Returns true for my addresses as well.
8072 * Translates mapped addresses to v4 addresses and then
8073 * treats them as such, returning true if the v4 address
8074 * associated with this mapped address is configured.
8075 * Note: Applications will have to be careful what they do
8076 * with the response; use of mapped addresses limits
8077 * what can be done with the socket, especially with
8078 * respect to socket options and ioctls - neither IPv4
8079 * options nor IPv6 sticky options/ancillary data options
8080 * may be used.
8081 */
8082 /* ARGSUSED */
8083 int
8084 ip_sioctl_tonlink(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
8086 {
8087 struct sioc_addrreq *sia;
8088 sin_t *sin;
8089 mblk_t *mp1;
8090 ire_t *ire = NULL;
8091 zoneid_t zoneid;
8092 ip_stack_t *ipst;
8093
8094 ip1dbg(("ip_sioctl_tonlink"));
8095
8096 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */
8097 zoneid = Q_TO_CONN(q)->conn_zoneid;
8098 ipst = CONNQ_TO_IPST(q);
8099
8100 /* Existence verified in ip_wput_nondata */
8101 mp1 = mp->b_cont->b_cont;
8102 sia = (struct sioc_addrreq *)mp1->b_rptr;
8103 sin = (sin_t *)&sia->sa_addr;
8104
8105 /*
8106 * We check for IRE_ONLINK and exclude IRE_BROADCAST|IRE_MULTICAST
	 * to make sure we only look at on-link unicast addresses.
8108 */
8109 switch (sin->sin_family) {
8110 case AF_INET6: {
8111 sin6_t *sin6 = (sin6_t *)sin;
8112
8113 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
8114 ipaddr_t v4_addr;
8115
8116 IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr,
8117 v4_addr);
8118 if (!CLASSD(v4_addr)) {
8119 ire = ire_ftable_lookup_v4(v4_addr, 0, 0, 0,
8120 NULL, zoneid, NULL, MATCH_IRE_DSTONLY,
8121 0, ipst, NULL);
8122 }
8123 } else {
8124 in6_addr_t v6addr;
8125
8126 v6addr = sin6->sin6_addr;
8127 if (!IN6_IS_ADDR_MULTICAST(&v6addr)) {
8128 ire = ire_ftable_lookup_v6(&v6addr, 0, 0, 0,
8129 NULL, zoneid, NULL, MATCH_IRE_DSTONLY, 0,
8130 ipst, NULL);
8131 }
8132 }
8133 break;
8134 }
8135 case AF_INET: {
8136 ipaddr_t v4addr;
8137
8138 v4addr = sin->sin_addr.s_addr;
8139 if (!CLASSD(v4addr)) {
8140 ire = ire_ftable_lookup_v4(v4addr, 0, 0, 0, NULL,
8141 zoneid, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL);
8142 }
8143 break;
8144 }
8145 default:
8146 return (EAFNOSUPPORT);
8147 }
8148 sia->sa_res = 0;
8149 if (ire != NULL) {
8150 ASSERT(!(ire->ire_type & IRE_MULTICAST));
8151
8152 if ((ire->ire_type & IRE_ONLINK) &&
8153 !(ire->ire_type & IRE_BROADCAST))
8154 sia->sa_res = 1;
8155 ire_refrele(ire);
8156 }
8157 return (0);
8158 }
8159
8160 /*
 * TBD: implement when the kernel maintains a list of site prefixes.
8162 */
8163 /* ARGSUSED */
8164 int
8165 ip_sioctl_tmysite(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
8166 ip_ioctl_cmd_t *ipip, void *ifreq)
8167 {
8168 return (ENXIO);
8169 }
8170
8171 /* ARP IOCTLs. */
8172 /* ARGSUSED */
8173 int
8174 ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
8175 ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
8176 {
8177 int err;
8178 ipaddr_t ipaddr;
8179 struct iocblk *iocp;
8180 conn_t *connp;
8181 struct arpreq *ar;
8182 struct xarpreq *xar;
8183 int arp_flags, flags, alength;
8184 uchar_t *lladdr;
8185 ip_stack_t *ipst;
8186 ill_t *ill = ipif->ipif_ill;
8187 ill_t *proxy_ill = NULL;
8188 ipmp_arpent_t *entp = NULL;
8189 boolean_t proxyarp = B_FALSE;
8190 boolean_t if_arp_ioctl = B_FALSE;
8191 ncec_t *ncec = NULL;
8192 nce_t *nce;
8193
8194 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL);
8195 connp = Q_TO_CONN(q);
8196 ipst = connp->conn_netstack->netstack_ip;
8197 iocp = (struct iocblk *)mp->b_rptr;
8198
8199 if (ipip->ipi_cmd_type == XARP_CMD) {
8200 /* We have a chain - M_IOCTL-->MI_COPY_MBLK-->XARPREQ_MBLK */
8201 xar = (struct xarpreq *)mp->b_cont->b_cont->b_rptr;
8202 ar = NULL;
8203
8204 arp_flags = xar->xarp_flags;
8205 lladdr = (uchar_t *)LLADDR(&xar->xarp_ha);
8206 if_arp_ioctl = (xar->xarp_ha.sdl_nlen != 0);
		/*
		 * Validate the user's link-layer address length against
		 * the ill's, and check the name and address length limits.
		 */
8211 alength = ill->ill_phys_addr_length;
8212 if (ipip->ipi_cmd == SIOCSXARP) {
8213 if (alength != xar->xarp_ha.sdl_alen ||
8214 (alength + xar->xarp_ha.sdl_nlen >
8215 sizeof (xar->xarp_ha.sdl_data)))
8216 return (EINVAL);
8217 }
8218 } else {
8219 /* We have a chain - M_IOCTL-->MI_COPY_MBLK-->ARPREQ_MBLK */
8220 ar = (struct arpreq *)mp->b_cont->b_cont->b_rptr;
8221 xar = NULL;
8222
8223 arp_flags = ar->arp_flags;
8224 lladdr = (uchar_t *)ar->arp_ha.sa_data;
8225 /*
8226 * Theoretically, the sa_family could tell us what link
8227 * layer type this operation is trying to deal with. By
8228 * common usage AF_UNSPEC means ethernet. We'll assume
8229 * any attempt to use the SIOC?ARP ioctls is for ethernet,
8230 * for now. Our new SIOC*XARP ioctls can be used more
8231 * generally.
8232 *
		 * If the underlying media happens to have a non-6-byte
		 * address, the arp module will fail set/get, but the
		 * delete operation will succeed.
8236 */
8237 alength = 6;
8238 if ((ipip->ipi_cmd != SIOCDARP) &&
8239 (alength != ill->ill_phys_addr_length)) {
8240 return (EINVAL);
8241 }
8242 }
8243
8244 /* Translate ATF* flags to NCE* flags */
8245 flags = 0;
8246 if (arp_flags & ATF_AUTHORITY)
8247 flags |= NCE_F_AUTHORITY;
8248 if (arp_flags & ATF_PERM)
8249 flags |= NCE_F_NONUD; /* not subject to aging */
8250 if (arp_flags & ATF_PUBL)
8251 flags |= NCE_F_PUBLISH;
8252
8253 /*
8254 * IPMP ARP special handling:
8255 *
8256 * 1. Since ARP mappings must appear consistent across the group,
8257 * prohibit changing ARP mappings on the underlying interfaces.
8258 *
8259 * 2. Since ARP mappings for IPMP data addresses are maintained by
8260 * IP itself, prohibit changing them.
8261 *
8262 * 3. For proxy ARP, use a functioning hardware address in the group,
8263 * provided one exists. If one doesn't, just add the entry as-is;
8264 * ipmp_illgrp_refresh_arpent() will refresh it if things change.
8265 */
8266 if (IS_UNDER_IPMP(ill)) {
8267 if (ipip->ipi_cmd != SIOCGARP && ipip->ipi_cmd != SIOCGXARP)
8268 return (EPERM);
8269 }
8270 if (IS_IPMP(ill)) {
8271 ipmp_illgrp_t *illg = ill->ill_grp;
8272
8273 switch (ipip->ipi_cmd) {
8274 case SIOCSARP:
8275 case SIOCSXARP:
8276 proxy_ill = ipmp_illgrp_find_ill(illg, lladdr, alength);
8277 if (proxy_ill != NULL) {
8278 proxyarp = B_TRUE;
8279 if (!ipmp_ill_is_active(proxy_ill))
8280 proxy_ill = ipmp_illgrp_next_ill(illg);
8281 if (proxy_ill != NULL)
8282 lladdr = proxy_ill->ill_phys_addr;
8283 }
8284 /* FALLTHRU */
8285 }
8286 }
8287
8288 ipaddr = sin->sin_addr.s_addr;
8289 /*
8290 * don't match across illgrp per case (1) and (2).
8291 * XXX use IS_IPMP(ill) like ndp_sioc_update?
8292 */
8293 nce = nce_lookup_v4(ill, &ipaddr);
8294 if (nce != NULL)
8295 ncec = nce->nce_common;
8296
8297 switch (iocp->ioc_cmd) {
8298 case SIOCDARP:
8299 case SIOCDXARP: {
8300 /*
8301 * Delete the NCE if any.
8302 */
8303 if (ncec == NULL) {
8304 iocp->ioc_error = ENXIO;
8305 break;
8306 }
8307 /* Don't allow changes to arp mappings of local addresses. */
8308 if (NCE_MYADDR(ncec)) {
8309 nce_refrele(nce);
8310 return (ENOTSUP);
8311 }
8312 iocp->ioc_error = 0;
8313
8314 /*
8315 * Delete the nce_common which has ncec_ill set to ipmp_ill.
8316 * This will delete all the nce entries on the under_ills.
8317 */
8318 ncec_delete(ncec);
8319 /*
8320 * Once the NCE has been deleted, then the ire_dep* consistency
8321 * mechanism will find any IRE which depended on the now
8322 * condemned NCE (as part of sending packets).
8323 * That mechanism handles redirects by deleting redirects
8324 * that refer to UNREACHABLE nces.
8325 */
8326 break;
8327 }
8328 case SIOCGARP:
8329 case SIOCGXARP:
8330 if (ncec != NULL) {
8331 lladdr = ncec->ncec_lladdr;
8332 flags = ncec->ncec_flags;
8333 iocp->ioc_error = 0;
8334 ip_sioctl_garp_reply(mp, ncec->ncec_ill, lladdr, flags);
8335 } else {
8336 iocp->ioc_error = ENXIO;
8337 }
8338 break;
8339 case SIOCSARP:
8340 case SIOCSXARP:
8341 /* Don't allow changes to arp mappings of local addresses. */
8342 if (ncec != NULL && NCE_MYADDR(ncec)) {
8343 nce_refrele(nce);
8344 return (ENOTSUP);
8345 }
8346
8347 /* static arp entries will undergo NUD if ATF_PERM is not set */
8348 flags |= NCE_F_STATIC;
8349 if (!if_arp_ioctl) {
8350 ip_nce_lookup_and_update(&ipaddr, NULL, ipst,
8351 lladdr, alength, flags);
8352 } else {
8353 ipif_t *ipif = ipif_get_next_ipif(NULL, ill);
8354 if (ipif != NULL) {
8355 ip_nce_lookup_and_update(&ipaddr, ipif, ipst,
8356 lladdr, alength, flags);
8357 ipif_refrele(ipif);
8358 }
8359 }
8360 if (nce != NULL) {
8361 nce_refrele(nce);
8362 nce = NULL;
8363 }
8364 /*
8365 * NCE_F_STATIC entries will be added in state ND_REACHABLE
8366 * by nce_add_common()
8367 */
8368 err = nce_lookup_then_add_v4(ill, lladdr,
8369 ill->ill_phys_addr_length, &ipaddr, flags, ND_UNCHANGED,
8370 &nce);
8371 if (err == EEXIST) {
8372 ncec = nce->nce_common;
8373 mutex_enter(&ncec->ncec_lock);
8374 ncec->ncec_state = ND_REACHABLE;
8375 ncec->ncec_flags = flags;
8376 nce_update(ncec, ND_UNCHANGED, lladdr);
8377 mutex_exit(&ncec->ncec_lock);
8378 err = 0;
8379 }
8380 if (nce != NULL) {
8381 nce_refrele(nce);
8382 nce = NULL;
8383 }
8384 if (IS_IPMP(ill) && err == 0) {
8385 entp = ipmp_illgrp_create_arpent(ill->ill_grp,
8386 proxyarp, ipaddr, lladdr, ill->ill_phys_addr_length,
8387 flags);
8388 if (entp == NULL || (proxyarp && proxy_ill == NULL)) {
8389 iocp->ioc_error = (entp == NULL ? ENOMEM : 0);
8390 break;
8391 }
8392 }
8393 iocp->ioc_error = err;
8394 }
8395
8396 if (nce != NULL) {
8397 nce_refrele(nce);
8398 }
8399
8400 /*
8401 * If we created an IPMP ARP entry, mark that we've notified ARP.
8402 */
8403 if (entp != NULL)
8404 ipmp_illgrp_mark_arpent(ill->ill_grp, entp);
8405
8406 return (iocp->ioc_error);
8407 }
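
/*
 * For reference, the classic userland form of the SIOCSARP request
 * handled above (a sketch; the MAC address source and error handling
 * are illustrative):
 *
 *	struct arpreq ar;
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&ar.arp_pa;
 *
 *	(void) memset(&ar, 0, sizeof (ar));
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = inet_addr("192.0.2.1");
 *	ar.arp_ha.sa_family = AF_UNSPEC;	(ethernet, by convention)
 *	bcopy(mac, ar.arp_ha.sa_data, 6);
 *	ar.arp_flags = ATF_PERM;	(becomes NCE_F_NONUD above)
 *	if (ioctl(s, SIOCSARP, (caddr_t)&ar) < 0)
 *		return (-1);
 */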
8408
8409 /*
8410 * Parse an [x]arpreq structure coming down SIOC[GSD][X]ARP ioctls, identify
 * the associated sin, then refhold and return the associated ipif via `ci'.
8412 */
8413 int
8414 ip_extract_arpreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip,
8415 cmd_info_t *ci)
8416 {
8417 mblk_t *mp1;
8418 sin_t *sin;
8419 conn_t *connp;
8420 ipif_t *ipif;
8421 ire_t *ire = NULL;
8422 ill_t *ill = NULL;
8423 boolean_t exists;
8424 ip_stack_t *ipst;
8425 struct arpreq *ar;
8426 struct xarpreq *xar;
8427 struct sockaddr_dl *sdl;
8428
8429 /* ioctl comes down on a conn */
8430 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL);
8431 connp = Q_TO_CONN(q);
8432 if (connp->conn_family == AF_INET6)
8433 return (ENXIO);
8434
8435 ipst = connp->conn_netstack->netstack_ip;
8436
8437 /* Verified in ip_wput_nondata */
8438 mp1 = mp->b_cont->b_cont;
8439
8440 if (ipip->ipi_cmd_type == XARP_CMD) {
8441 ASSERT(MBLKL(mp1) >= sizeof (struct xarpreq));
8442 xar = (struct xarpreq *)mp1->b_rptr;
8443 sin = (sin_t *)&xar->xarp_pa;
8444 sdl = &xar->xarp_ha;
8445
8446 if (sdl->sdl_family != AF_LINK || sin->sin_family != AF_INET)
8447 return (ENXIO);
8448 if (sdl->sdl_nlen >= LIFNAMSIZ)
8449 return (EINVAL);
8450 } else {
8451 ASSERT(ipip->ipi_cmd_type == ARP_CMD);
8452 ASSERT(MBLKL(mp1) >= sizeof (struct arpreq));
8453 ar = (struct arpreq *)mp1->b_rptr;
8454 sin = (sin_t *)&ar->arp_pa;
8455 }
8456
8457 if (ipip->ipi_cmd_type == XARP_CMD && sdl->sdl_nlen != 0) {
8458 ipif = ipif_lookup_on_name(sdl->sdl_data, sdl->sdl_nlen,
8459 B_FALSE, &exists, B_FALSE, ALL_ZONES, ipst);
8460 if (ipif == NULL)
8461 return (ENXIO);
8462 if (ipif->ipif_id != 0) {
8463 ipif_refrele(ipif);
8464 return (ENXIO);
8465 }
8466 } else {
8467 /*
8468 * Either an SIOC[DGS]ARP or an SIOC[DGS]XARP with an sdl_nlen
8469 * of 0: use the IP address to find the ipif. If the IP
8470 * address is an IPMP test address, ire_ftable_lookup() will
8471 * find the wrong ill, so we first do an ipif_lookup_addr().
8472 */
8473 ipif = ipif_lookup_addr(sin->sin_addr.s_addr, NULL, ALL_ZONES,
8474 ipst);
8475 if (ipif == NULL) {
8476 ire = ire_ftable_lookup_v4(sin->sin_addr.s_addr,
8477 0, 0, IRE_IF_RESOLVER, NULL, ALL_ZONES,
8478 NULL, MATCH_IRE_TYPE, 0, ipst, NULL);
8479 if (ire == NULL || ((ill = ire->ire_ill) == NULL)) {
8480 if (ire != NULL)
8481 ire_refrele(ire);
8482 return (ENXIO);
8483 }
8484 ASSERT(ire != NULL && ill != NULL);
8485 ipif = ill->ill_ipif;
8486 ipif_refhold(ipif);
8487 ire_refrele(ire);
8488 }
8489 }
8490
8491 if (ipif->ipif_ill->ill_net_type != IRE_IF_RESOLVER) {
8492 ipif_refrele(ipif);
8493 return (ENXIO);
8494 }
8495
8496 ci->ci_sin = sin;
8497 ci->ci_ipif = ipif;
8498 return (0);
8499 }
8500
8501 /*
8502 * Link or unlink the illgrp on IPMP meta-interface `ill' depending on the
8503 * value of `ioccmd'. While an illgrp is linked to an ipmp_grp_t, it is
8504 * accessible from that ipmp_grp_t, which means SIOCSLIFGROUPNAME can look it
8505 * up and thus an ill can join that illgrp.
8506 *
8507 * We use I_PLINK/I_PUNLINK to do the link/unlink operations rather than
8508 * open()/close() primarily because close() is not allowed to fail or block
8509 * forever. On the other hand, I_PUNLINK *can* fail, and there's no reason
8510 * why anyone should ever need to I_PUNLINK an in-use IPMP stream. To ensure
 * symmetric behavior (e.g., doing an I_PLINK after an I_PUNLINK undoes the
8512 * I_PUNLINK) we defer linking to I_PLINK. Separately, we also fail attempts
8513 * to I_LINK since I_UNLINK is optional and we'd end up in an inconsistent
8514 * state if I_UNLINK didn't occur.
8515 *
8516 * Note that for each plumb/unplumb operation, we may end up here more than
8517 * once because of the way ifconfig works. However, it's OK to link the same
8518 * illgrp more than once, or unlink an illgrp that's already unlinked.
8519 */
8520 static int
8521 ip_sioctl_plink_ipmp(ill_t *ill, int ioccmd)
8522 {
8523 int err;
8524 ip_stack_t *ipst = ill->ill_ipst;
8525
8526 ASSERT(IS_IPMP(ill));
8527 ASSERT(IAM_WRITER_ILL(ill));
8528
8529 switch (ioccmd) {
8530 case I_LINK:
8531 return (ENOTSUP);
8532
8533 case I_PLINK:
8534 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
8535 ipmp_illgrp_link_grp(ill->ill_grp, ill->ill_phyint->phyint_grp);
8536 rw_exit(&ipst->ips_ipmp_lock);
8537 break;
8538
8539 case I_PUNLINK:
8540 /*
8541 * Require all UP ipifs be brought down prior to unlinking the
8542 * illgrp so any associated IREs (and other state) is torched.
8543 */
8544 if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0)
8545 return (EBUSY);
8546
8547 /*
8548 * NOTE: We hold ipmp_lock across the unlink to prevent a race
8549 * with an SIOCSLIFGROUPNAME request from an ill trying to
8550 * join this group. Specifically: ills trying to join grab
8551 * ipmp_lock and bump a "pending join" counter checked by
8552 * ipmp_illgrp_unlink_grp(). During the unlink no new pending
8553 * joins can occur (since we have ipmp_lock). Once we drop
8554 * ipmp_lock, subsequent SIOCSLIFGROUPNAME requests will not
8555 * find the illgrp (since we unlinked it) and will return
8556 * EAFNOSUPPORT. This will then take them back through the
8557 * IPMP meta-interface plumbing logic in ifconfig, and thus
8558 * back through I_PLINK above.
8559 */
8560 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
8561 err = ipmp_illgrp_unlink_grp(ill->ill_grp);
8562 rw_exit(&ipst->ips_ipmp_lock);
8563 return (err);
8564 default:
8565 break;
8566 }
8567 return (0);
8568 }
8569
8570 /*
8571 * Do I_PLINK/I_LINK or I_PUNLINK/I_UNLINK with consistency checks and also
8572 * atomically set/clear the muxids. Also complete the ioctl by acking or
8573 * naking it. Note that the code is structured such that the link type,
8574 * whether it's persistent or not, is treated equally. ifconfig(1M) and
8575 * its clones use the persistent link, while pppd(1M) and perhaps many
8576 * other daemons may use non-persistent link. When combined with some
8577 * ill_t states, linking and unlinking lower streams may be used as
8578 * indicators of dynamic re-plumbing events [see PSARC/1999/348].
8579 */
8580 /* ARGSUSED */
8581 void
8582 ip_sioctl_plink(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
8583 {
8584 mblk_t *mp1;
8585 struct linkblk *li;
8586 int ioccmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd;
8587 int err = 0;
8588
8589 ASSERT(ioccmd == I_PLINK || ioccmd == I_PUNLINK ||
8590 ioccmd == I_LINK || ioccmd == I_UNLINK);
8591
8592 mp1 = mp->b_cont; /* This is the linkblk info */
8593 li = (struct linkblk *)mp1->b_rptr;
8594
8595 err = ip_sioctl_plink_ipmod(ipsq, q, mp, ioccmd, li);
8596 if (err == EINPROGRESS)
8597 return;
8598 if (err == 0)
8599 miocack(q, mp, 0, 0);
8600 else
8601 miocnak(q, mp, 0, err);
8602
8603 /* Conn was refheld in ip_sioctl_copyin_setup */
8604 if (CONN_Q(q)) {
8605 CONN_DEC_IOCTLREF(Q_TO_CONN(q));
8606 CONN_OPER_PENDING_DONE(Q_TO_CONN(q));
8607 }
8608 }
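
/*
 * For orientation, the operation that lands here is the standard
 * STREAMS persistent-link sequence issued by ifconfig(1M) and friends;
 * a hedged sketch (descriptor names are illustrative):
 *
 *	int muxid;
 *
 *	muxid = ioctl(ip_fd, I_PLINK, dev_fd);	(link driver under IP)
 *	if (muxid < 0)
 *		return (-1);
 *	...
 *	(void) ioctl(ip_fd, I_PUNLINK, muxid);	(undo it at unplumb)
 */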
8609
8610 /*
8611 * Process I_{P}LINK and I_{P}UNLINK requests named by `ioccmd' and pointed to
8612 * by `mp' and `li' for the IP module stream (if li->q_bot is in fact an IP
8613 * module stream).
8614 * Returns zero on success, EINPROGRESS if the operation is still pending, or
8615 * an error code on failure.
8616 */
8617 static int
8618 ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd,
8619 struct linkblk *li)
8620 {
8621 int err = 0;
8622 ill_t *ill;
8623 queue_t *ipwq, *dwq;
8624 const char *name;
8625 struct qinit *qinfo;
8626 boolean_t islink = (ioccmd == I_PLINK || ioccmd == I_LINK);
8627 boolean_t entered_ipsq = B_FALSE;
8628 boolean_t is_ip = B_FALSE;
8629 arl_t *arl;
8630
8631 /*
8632 * Walk the lower stream to verify it's the IP module stream.
8633 * The IP module is identified by its name, wput function,
8634 * and non-NULL q_next. STREAMS ensures that the lower stream
8635 * (li->l_qbot) will not vanish until this ioctl completes.
8636 */
8637 for (ipwq = li->l_qbot; ipwq != NULL; ipwq = ipwq->q_next) {
8638 qinfo = ipwq->q_qinfo;
8639 name = qinfo->qi_minfo->mi_idname;
8640 if (name != NULL && strcmp(name, ip_mod_info.mi_idname) == 0 &&
8641 qinfo->qi_putp != (pfi_t)ip_lwput && ipwq->q_next != NULL) {
8642 is_ip = B_TRUE;
8643 break;
8644 }
8645 if (name != NULL && strcmp(name, arp_mod_info.mi_idname) == 0 &&
8646 qinfo->qi_putp != (pfi_t)ip_lwput && ipwq->q_next != NULL) {
8647 break;
8648 }
8649 }
8650
	/*
	 * If this is neither an IP nor an ARP module stream, bail.
	 */
8654 if (ipwq == NULL)
8655 return (0);
8656
8657 if (!is_ip) {
8658 arl = (arl_t *)ipwq->q_ptr;
8659 ill = arl_to_ill(arl);
8660 if (ill == NULL)
8661 return (0);
8662 } else {
8663 ill = ipwq->q_ptr;
8664 }
8665 ASSERT(ill != NULL);
8666
8667 if (ipsq == NULL) {
8668 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink,
8669 NEW_OP, B_FALSE);
8670 if (ipsq == NULL) {
8671 if (!is_ip)
8672 ill_refrele(ill);
8673 return (EINPROGRESS);
8674 }
8675 entered_ipsq = B_TRUE;
8676 }
8677 ASSERT(IAM_WRITER_ILL(ill));
8678 mutex_enter(&ill->ill_lock);
8679 if (!is_ip) {
8680 if (islink && ill->ill_muxid == 0) {
8681 /*
8682 * Plumbing has to be done with IP plumbed first, arp
8683 * second, but here we have arp being plumbed first.
8684 */
8685 mutex_exit(&ill->ill_lock);
8686 if (entered_ipsq)
8687 ipsq_exit(ipsq);
8688 ill_refrele(ill);
8689 return (EINVAL);
8690 }
8691 }
8692 mutex_exit(&ill->ill_lock);
8693 if (!is_ip) {
8694 arl->arl_muxid = islink ? li->l_index : 0;
8695 ill_refrele(ill);
8696 goto done;
8697 }
8698
8699 if (IS_IPMP(ill) && (err = ip_sioctl_plink_ipmp(ill, ioccmd)) != 0)
8700 goto done;
8701
8702 /*
8703 * As part of I_{P}LINKing, stash the number of downstream modules and
8704 * the read queue of the module immediately below IP in the ill.
8705 * These are used during the capability negotiation below.
8706 */
8707 ill->ill_lmod_rq = NULL;
8708 ill->ill_lmod_cnt = 0;
8709 if (islink && ((dwq = ipwq->q_next) != NULL)) {
8710 ill->ill_lmod_rq = RD(dwq);
8711 for (; dwq != NULL; dwq = dwq->q_next)
8712 ill->ill_lmod_cnt++;
8713 }
8714
8715 ill->ill_muxid = islink ? li->l_index : 0;
8716
8717 /*
8718 * Mark the ipsq busy until the capability operations initiated below
8719 * complete. The PLINK/UNLINK ioctl itself completes when our caller
8720 * returns, but the capability operation may complete asynchronously
8721 * much later.
8722 */
8723 ipsq_current_start(ipsq, ill->ill_ipif, ioccmd);
8724 /*
8725 * If there's at least one up ipif on this ill, then we're bound to
8726 * the underlying driver via DLPI. In that case, renegotiate
8727 * capabilities to account for any possible change in modules
8728 * interposed between IP and the driver.
8729 */
8730 if (ill->ill_ipif_up_count > 0) {
8731 if (islink)
8732 ill_capability_probe(ill);
8733 else
8734 ill_capability_reset(ill, B_FALSE);
8735 }
8736 ipsq_current_finish(ipsq);
8737 done:
8738 if (entered_ipsq)
8739 ipsq_exit(ipsq);
8740
8741 return (err);
8742 }
8743
8744 /*
8745 * Search the ioctl command in the ioctl tables and return a pointer
8746 * to the ioctl command information. The ioctl command tables are
8747 * static and fully populated at compile time.
8748 */
8749 ip_ioctl_cmd_t *
8750 ip_sioctl_lookup(int ioc_cmd)
8751 {
8752 int index;
8753 ip_ioctl_cmd_t *ipip;
8754 ip_ioctl_cmd_t *ipip_end;
8755
8756 if (ioc_cmd == IPI_DONTCARE)
8757 return (NULL);
8758
8759 /*
8760 * Do a 2 step search. First search the indexed table
8761 * based on the least significant byte of the ioctl cmd.
8762 * If we don't find a match, then search the misc table
8763 * serially.
8764 */
8765 index = ioc_cmd & 0xFF;
8766 if (index < ip_ndx_ioctl_count) {
8767 ipip = &ip_ndx_ioctl_table[index];
8768 if (ipip->ipi_cmd == ioc_cmd) {
8769 /* Found a match in the ndx table */
8770 return (ipip);
8771 }
8772 }
8773
8774 /* Search the misc table */
8775 ipip_end = &ip_misc_ioctl_table[ip_misc_ioctl_count];
8776 for (ipip = ip_misc_ioctl_table; ipip < ipip_end; ipip++) {
8777 if (ipip->ipi_cmd == ioc_cmd)
8778 /* Found a match in the misc table */
8779 return (ipip);
8780 }
8781
8782 return (NULL);
8783 }
8784
/*
 * Helper function for ip_sioctl_getsetprop(); performs sanity checks on
 * the ioctl message and its embedded mod_ioc_prop_t.
 */
8788 static boolean_t
8789 getset_ioctl_checks(mblk_t *mp)
8790 {
8791 struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
8792 mblk_t *mp1 = mp->b_cont;
8793 mod_ioc_prop_t *pioc;
8794 uint_t flags;
8795 uint_t pioc_size;
8796
8797 /* do sanity checks on various arguments */
8798 if (mp1 == NULL || iocp->ioc_count == 0 ||
8799 iocp->ioc_count == TRANSPARENT) {
8800 return (B_FALSE);
8801 }
8802 if (msgdsize(mp1) < iocp->ioc_count) {
8803 if (!pullupmsg(mp1, iocp->ioc_count))
8804 return (B_FALSE);
8805 }
8806
8807 pioc = (mod_ioc_prop_t *)mp1->b_rptr;
8808
8809 /* sanity checks on mpr_valsize */
8810 pioc_size = sizeof (mod_ioc_prop_t);
8811 if (pioc->mpr_valsize != 0)
8812 pioc_size += pioc->mpr_valsize - 1;
8813
8814 if (iocp->ioc_count != pioc_size)
8815 return (B_FALSE);
8816
8817 flags = pioc->mpr_flags;
8818 if (iocp->ioc_cmd == SIOCSETPROP) {
		/*
		 * One can either reset the value to its default, change
		 * the current value, or append/remove a value from a
		 * multi-valued property.
		 */
8824 if ((flags & MOD_PROP_DEFAULT) != MOD_PROP_DEFAULT &&
8825 flags != MOD_PROP_ACTIVE &&
8826 flags != (MOD_PROP_ACTIVE|MOD_PROP_APPEND) &&
8827 flags != (MOD_PROP_ACTIVE|MOD_PROP_REMOVE))
8828 return (B_FALSE);
8829 } else {
8830 ASSERT(iocp->ioc_cmd == SIOCGETPROP);
8831
8832 /*
8833 * One can retrieve only one kind of property information
8834 * at a time.
8835 */
8836 if ((flags & MOD_PROP_ACTIVE) != MOD_PROP_ACTIVE &&
8837 (flags & MOD_PROP_DEFAULT) != MOD_PROP_DEFAULT &&
8838 (flags & MOD_PROP_POSSIBLE) != MOD_PROP_POSSIBLE &&
8839 (flags & MOD_PROP_PERM) != MOD_PROP_PERM)
8840 return (B_FALSE);
8841 }
8842
8843 return (B_TRUE);
8844 }
8845
/*
 * Process the SIOC{SET|GET}PROP ioctls.
 */
8849 /* ARGSUSED */
8850 static void
8851 ip_sioctl_getsetprop(queue_t *q, mblk_t *mp)
8852 {
8853 struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
8854 mblk_t *mp1 = mp->b_cont;
8855 mod_ioc_prop_t *pioc;
8856 mod_prop_info_t *ptbl = NULL, *pinfo = NULL;
8857 ip_stack_t *ipst;
8858 icmp_stack_t *is;
8859 tcp_stack_t *tcps;
8860 sctp_stack_t *sctps;
8861 udp_stack_t *us;
8862 netstack_t *stack;
8863 void *cbarg;
8864 cred_t *cr;
8865 boolean_t set;
8866 int err;
8867
8868 ASSERT(q->q_next == NULL);
8869 ASSERT(CONN_Q(q));
8870
8871 if (!getset_ioctl_checks(mp)) {
8872 miocnak(q, mp, 0, EINVAL);
8873 return;
8874 }
8875 ipst = CONNQ_TO_IPST(q);
8876 stack = ipst->ips_netstack;
8877 pioc = (mod_ioc_prop_t *)mp1->b_rptr;
8878
8879 switch (pioc->mpr_proto) {
8880 case MOD_PROTO_IP:
8881 case MOD_PROTO_IPV4:
8882 case MOD_PROTO_IPV6:
8883 ptbl = ipst->ips_propinfo_tbl;
8884 cbarg = ipst;
8885 break;
8886 case MOD_PROTO_RAWIP:
8887 is = stack->netstack_icmp;
8888 ptbl = is->is_propinfo_tbl;
8889 cbarg = is;
8890 break;
8891 case MOD_PROTO_TCP:
8892 tcps = stack->netstack_tcp;
8893 ptbl = tcps->tcps_propinfo_tbl;
8894 cbarg = tcps;
8895 break;
8896 case MOD_PROTO_UDP:
8897 us = stack->netstack_udp;
8898 ptbl = us->us_propinfo_tbl;
8899 cbarg = us;
8900 break;
8901 case MOD_PROTO_SCTP:
8902 sctps = stack->netstack_sctp;
8903 ptbl = sctps->sctps_propinfo_tbl;
8904 cbarg = sctps;
8905 break;
8906 default:
8907 miocnak(q, mp, 0, EINVAL);
8908 return;
8909 }
8910
8911 /* search for given property in respective protocol propinfo table */
8912 for (pinfo = ptbl; pinfo->mpi_name != NULL; pinfo++) {
8913 if (strcmp(pinfo->mpi_name, pioc->mpr_name) == 0 &&
8914 pinfo->mpi_proto == pioc->mpr_proto)
8915 break;
8916 }
8917 if (pinfo->mpi_name == NULL) {
8918 miocnak(q, mp, 0, ENOENT);
8919 return;
8920 }
8921
8922 set = (iocp->ioc_cmd == SIOCSETPROP) ? B_TRUE : B_FALSE;
8923 if (set && pinfo->mpi_setf != NULL) {
8924 cr = msg_getcred(mp, NULL);
8925 if (cr == NULL)
8926 cr = iocp->ioc_cr;
8927 err = pinfo->mpi_setf(cbarg, cr, pinfo, pioc->mpr_ifname,
8928 pioc->mpr_val, pioc->mpr_flags);
8929 } else if (!set && pinfo->mpi_getf != NULL) {
8930 err = pinfo->mpi_getf(cbarg, pinfo, pioc->mpr_ifname,
8931 pioc->mpr_val, pioc->mpr_valsize, pioc->mpr_flags);
8932 } else {
8933 err = EPERM;
8934 }
8935
8936 if (err != 0) {
8937 miocnak(q, mp, 0, err);
8938 } else {
8939 if (set)
8940 miocack(q, mp, 0, 0);
8941 else /* For get, we need to return back the data */
8942 miocack(q, mp, iocp->ioc_count, 0);
8943 }
8944 }
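
/*
 * A hedged sketch of the I_STR form these handlers expect, using only
 * the mod_ioc_prop_t fields referenced above.  Note the ioc_count
 * convention enforced by getset_ioctl_checks(): structure size plus
 * mpr_valsize, minus the one-byte mpr_val placeholder.  The property
 * name is shown purely for illustration:
 *
 *	size_t iolen = sizeof (mod_ioc_prop_t) + valsize - 1;
 *	mod_ioc_prop_t *pioc = calloc(1, iolen);
 *	struct strioctl si;
 *
 *	pioc->mpr_flags = MOD_PROP_ACTIVE;
 *	pioc->mpr_proto = MOD_PROTO_TCP;
 *	(void) strlcpy(pioc->mpr_name, "smallest_anon_port",
 *	    sizeof (pioc->mpr_name));
 *	pioc->mpr_valsize = valsize;
 *	si.ic_cmd = SIOCGETPROP;
 *	si.ic_timout = -1;
 *	si.ic_len = iolen;
 *	si.ic_dp = (char *)pioc;
 *	if (ioctl(fd, I_STR, &si) < 0)
 *		return (-1);
 *	(the current value comes back as a string in pioc->mpr_val)
 */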
8945
/*
 * Process the legacy ND_GET and ND_SET ioctls just for {ip|ip6}_forwarding,
 * as several routing daemons have unfortunately come to rely on these
 * 'unpublished' but well-known ioctls.
 */
8951 /* ARGSUSED */
8952 static void
8953 ip_process_legacy_nddprop(queue_t *q, mblk_t *mp)
8954 {
8955 struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
8956 mblk_t *mp1 = mp->b_cont;
8957 char *pname, *pval, *buf;
8958 uint_t bufsize, proto;
8959 mod_prop_info_t *ptbl = NULL, *pinfo = NULL;
8960 ip_stack_t *ipst;
8961 int err = 0;
8962
8963 ASSERT(CONN_Q(q));
8964 ipst = CONNQ_TO_IPST(q);
8965
8966 if (iocp->ioc_count == 0 || mp1 == NULL) {
8967 miocnak(q, mp, 0, EINVAL);
8968 return;
8969 }
8970
8971 mp1->b_datap->db_lim[-1] = '\0'; /* Force null termination */
8972 pval = buf = pname = (char *)mp1->b_rptr;
8973 bufsize = MBLKL(mp1);
8974
8975 if (strcmp(pname, "ip_forwarding") == 0) {
8976 pname = "forwarding";
8977 proto = MOD_PROTO_IPV4;
8978 } else if (strcmp(pname, "ip6_forwarding") == 0) {
8979 pname = "forwarding";
8980 proto = MOD_PROTO_IPV6;
8981 } else {
8982 miocnak(q, mp, 0, EINVAL);
8983 return;
8984 }
8985
8986 ptbl = ipst->ips_propinfo_tbl;
8987 for (pinfo = ptbl; pinfo->mpi_name != NULL; pinfo++) {
8988 if (strcmp(pinfo->mpi_name, pname) == 0 &&
8989 pinfo->mpi_proto == proto)
8990 break;
8991 }
8992
8993 ASSERT(pinfo->mpi_name != NULL);
8994
8995 switch (iocp->ioc_cmd) {
8996 case ND_GET:
8997 if ((err = pinfo->mpi_getf(ipst, pinfo, NULL, buf, bufsize,
8998 0)) == 0) {
8999 miocack(q, mp, iocp->ioc_count, 0);
9000 return;
9001 }
9002 break;
9003 case ND_SET:
		/*
		 * The buffer holds the property name and value in the
		 * following format:
		 * <property name>'\0'<property value>'\0'; extract them.
		 */
9009 while (*pval++)
9010 noop;
9011
9012 if (!*pval || pval >= (char *)mp1->b_wptr) {
9013 err = EINVAL;
9014 } else if ((err = pinfo->mpi_setf(ipst, NULL, pinfo, NULL,
9015 pval, 0)) == 0) {
9016 miocack(q, mp, 0, 0);
9017 return;
9018 }
9019 break;
9020 default:
9021 err = EINVAL;
9022 break;
9023 }
9024 miocnak(q, mp, 0, err);
9025 }
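
/*
 * The wire format is the single NUL-delimited buffer that ndd(1M)
 * historically used; a hedged sketch of an ND_SET against /dev/ip,
 * matching the <name>'\0'<value>'\0' layout parsed above (sizeof (buf)
 * counts both NULs: the embedded one and the string terminator):
 *
 *	char buf[] = "ip_forwarding\0" "1";
 *	struct strioctl si;
 *
 *	si.ic_cmd = ND_SET;
 *	si.ic_timout = -1;
 *	si.ic_len = sizeof (buf);
 *	si.ic_dp = buf;
 *	if (ioctl(ip_fd, I_STR, &si) < 0)
 *		return (-1);
 */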
9026
9027 /*
9028 * Wrapper function for resuming deferred ioctl processing
9029 * Used for SIOCGDSTINFO, SIOCGIP6ADDRPOLICY, SIOCGMSFILTER,
9030 * SIOCSMSFILTER, SIOCGIPMSFILTER, and SIOCSIPMSFILTER currently.
9031 */
9032 /* ARGSUSED */
9033 void
9034 ip_sioctl_copyin_resume(ipsq_t *dummy_ipsq, queue_t *q, mblk_t *mp,
9035 void *dummy_arg)
9036 {
9037 ip_sioctl_copyin_setup(q, mp);
9038 }
9039
9040 /*
9041 * ip_sioctl_copyin_setup is called by ip_wput_nondata with any M_IOCTL message
9042 * that arrives. Most of the IOCTLs are "socket" IOCTLs which we handle
9043 * in either I_STR or TRANSPARENT form, using the mi_copy facility.
9044 * We establish here the size of the block to be copied in. mi_copyin
 * arranges for this to happen, and processing continues in ip_wput_nondata
 * with
9046 * an M_IOCDATA message.
9047 */
9048 void
9049 ip_sioctl_copyin_setup(queue_t *q, mblk_t *mp)
9050 {
9051 int copyin_size;
9052 struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
9053 ip_ioctl_cmd_t *ipip;
9054 cred_t *cr;
9055 ip_stack_t *ipst;
9056
9057 if (CONN_Q(q))
9058 ipst = CONNQ_TO_IPST(q);
9059 else
9060 ipst = ILLQ_TO_IPST(q);
9061
9062 ipip = ip_sioctl_lookup(iocp->ioc_cmd);
9063 if (ipip == NULL) {
9064 /*
9065 * The ioctl is not one we understand or own.
		 * Pass it along to be processed downstream,
9067 * if this is a module instance of IP, else nak
9068 * the ioctl.
9069 */
9070 if (q->q_next == NULL) {
9071 goto nak;
9072 } else {
9073 putnext(q, mp);
9074 return;
9075 }
9076 }
9077
9078 /*
9079 * If this is deferred, then we will do all the checks when we
9080 * come back.
9081 */
9082 if ((iocp->ioc_cmd == SIOCGDSTINFO ||
9083 iocp->ioc_cmd == SIOCGIP6ADDRPOLICY) && !ip6_asp_can_lookup(ipst)) {
9084 ip6_asp_pending_op(q, mp, ip_sioctl_copyin_resume);
9085 return;
9086 }
9087
9088 /*
9089 * Only allow a very small subset of IP ioctls on this stream if
9090 * IP is a module and not a driver. Allowing ioctls to be processed
9091 * in this case may cause assert failures or data corruption.
	 * Typically G[L]IFFLAGS and SLIFNAME/IF_UNITSEL are among the few
	 * ioctls allowed on an IP module stream, after which this stream
9094 * normally becomes a multiplexor (at which time the stream head
9095 * will fail all ioctls).
9096 */
9097 if ((q->q_next != NULL) && !(ipip->ipi_flags & IPI_MODOK)) {
9098 goto nak;
9099 }
9100
9101 /* Make sure we have ioctl data to process. */
9102 if (mp->b_cont == NULL && !(ipip->ipi_flags & IPI_NULL_BCONT))
9103 goto nak;
9104
9105 /*
9106 * Prefer dblk credential over ioctl credential; some synthesized
9107 * ioctls have kcred set because there's no way to crhold()
	 * a credential in some contexts.  (ioc_cr is not crfree()d by
9109 * the framework; the caller of ioctl needs to hold the reference
9110 * for the duration of the call).
9111 */
9112 cr = msg_getcred(mp, NULL);
9113 if (cr == NULL)
9114 cr = iocp->ioc_cr;
9115
9116 /* Make sure normal users don't send down privileged ioctls */
9117 if ((ipip->ipi_flags & IPI_PRIV) &&
9118 (cr != NULL) && secpolicy_ip_config(cr, B_TRUE) != 0) {
9119 /* We checked the privilege earlier but log it here */
9120 miocnak(q, mp, 0, secpolicy_ip_config(cr, B_FALSE));
9121 return;
9122 }
9123
9124 /*
9125 * The ioctl command tables can only encode fixed length
9126 * ioctl data. If the length is variable, the table will
9127 * encode the length as zero. Such special cases are handled
9128 * below in the switch.
9129 */
9130 if (ipip->ipi_copyin_size != 0) {
9131 mi_copyin(q, mp, NULL, ipip->ipi_copyin_size);
9132 return;
9133 }
9134
9135 switch (iocp->ioc_cmd) {
9136 case O_SIOCGIFCONF:
9137 case SIOCGIFCONF:
9138 /*
9139 * This IOCTL is hilarious. See comments in
9140 * ip_sioctl_get_ifconf for the story.
9141 */
9142 if (iocp->ioc_count == TRANSPARENT)
9143 copyin_size = SIZEOF_STRUCT(ifconf,
9144 iocp->ioc_flag);
9145 else
9146 copyin_size = iocp->ioc_count;
9147 mi_copyin(q, mp, NULL, copyin_size);
9148 return;
9149
9150 case O_SIOCGLIFCONF:
9151 case SIOCGLIFCONF:
9152 copyin_size = SIZEOF_STRUCT(lifconf, iocp->ioc_flag);
9153 mi_copyin(q, mp, NULL, copyin_size);
9154 return;
9155
9156 case SIOCGLIFSRCOF:
9157 copyin_size = SIZEOF_STRUCT(lifsrcof, iocp->ioc_flag);
9158 mi_copyin(q, mp, NULL, copyin_size);
9159 return;
9160
9161 case SIOCGIP6ADDRPOLICY:
9162 ip_sioctl_ip6addrpolicy(q, mp);
9163 ip6_asp_table_refrele(ipst);
9164 return;
9165
9166 case SIOCSIP6ADDRPOLICY:
9167 ip_sioctl_ip6addrpolicy(q, mp);
9168 return;
9169
9170 case SIOCGDSTINFO:
9171 ip_sioctl_dstinfo(q, mp);
9172 ip6_asp_table_refrele(ipst);
9173 return;
9174
9175 case ND_SET:
9176 case ND_GET:
9177 ip_process_legacy_nddprop(q, mp);
9178 return;
9179
9180 case SIOCSETPROP:
9181 case SIOCGETPROP:
9182 ip_sioctl_getsetprop(q, mp);
9183 return;
9184
9185 case I_PLINK:
9186 case I_PUNLINK:
9187 case I_LINK:
9188 case I_UNLINK:
9189 /*
9190 * We treat non-persistent link similarly as the persistent
9191 * link case, in terms of plumbing/unplumbing, as well as
9192 * dynamic re-plumbing events indicator. See comments
9193 * in ip_sioctl_plink() for more.
9194 *
9195 * Request can be enqueued in the 'ipsq' while waiting
9196 * to become exclusive. So bump up the conn ref.
9197 */
9198 if (CONN_Q(q)) {
9199 CONN_INC_REF(Q_TO_CONN(q));
9200 CONN_INC_IOCTLREF(Q_TO_CONN(q))
9201 }
9202 ip_sioctl_plink(NULL, q, mp, NULL);
9203 return;
9204
9205 case IP_IOCTL:
9206 ip_wput_ioctl(q, mp);
9207 return;
9208
9209 case SIOCILB:
9210 /* The ioctl length varies depending on the ILB command. */
9211 copyin_size = iocp->ioc_count;
9212 if (copyin_size < sizeof (ilb_cmd_t))
9213 goto nak;
9214 mi_copyin(q, mp, NULL, copyin_size);
9215 return;
9216
9217 default:
		cmn_err(CE_PANIC, "unknown ioctl command 0x%x", iocp->ioc_cmd);
9219 }
9220 nak:
9221 if (mp->b_cont != NULL) {
9222 freemsg(mp->b_cont);
9223 mp->b_cont = NULL;
9224 }
9225 iocp->ioc_error = EINVAL;
9226 mp->b_datap->db_type = M_IOCNAK;
9227 iocp->ioc_count = 0;
9228 qreply(q, mp);
9229 }
9230
9231 static void
9232 ip_sioctl_garp_reply(mblk_t *mp, ill_t *ill, void *hwaddr, int flags)
9233 {
9234 struct arpreq *ar;
9235 struct xarpreq *xar;
9236 mblk_t *tmp;
9237 struct iocblk *iocp;
	boolean_t x_arp_ioctl = B_FALSE;
9239 int *flagsp;
9240 char *storage = NULL;
9241
9242 ASSERT(ill != NULL);
9243
9244 iocp = (struct iocblk *)mp->b_rptr;
9245 ASSERT(iocp->ioc_cmd == SIOCGXARP || iocp->ioc_cmd == SIOCGARP);
9246
9247 tmp = (mp->b_cont)->b_cont; /* xarpreq/arpreq */
9248 if ((iocp->ioc_cmd == SIOCGXARP) ||
9249 (iocp->ioc_cmd == SIOCSXARP)) {
9250 x_arp_ioctl = B_TRUE;
9251 xar = (struct xarpreq *)tmp->b_rptr;
9252 flagsp = &xar->xarp_flags;
9253 storage = xar->xarp_ha.sdl_data;
9254 } else {
9255 ar = (struct arpreq *)tmp->b_rptr;
9256 flagsp = &ar->arp_flags;
9257 storage = ar->arp_ha.sa_data;
9258 }
9259
9260 /*
9261 * We're done if this is not an SIOCG{X}ARP
9262 */
9263 if (x_arp_ioctl) {
9264 storage += ill_xarp_info(&xar->xarp_ha, ill);
9265 if ((ill->ill_phys_addr_length + ill->ill_name_length) >
9266 sizeof (xar->xarp_ha.sdl_data)) {
9267 iocp->ioc_error = EINVAL;
9268 return;
9269 }
9270 }
9271 *flagsp = ATF_INUSE;
9272 /*
9273 * If /sbin/arp told us we are the authority using the "permanent"
9274 * flag, or if this is one of my addresses print "permanent"
9275 * in the /sbin/arp output.
9276 */
9277 if ((flags & NCE_F_MYADDR) || (flags & NCE_F_AUTHORITY))
9278 *flagsp |= ATF_AUTHORITY;
9279 if (flags & NCE_F_NONUD)
9280 *flagsp |= ATF_PERM; /* not subject to aging */
9281 if (flags & NCE_F_PUBLISH)
9282 *flagsp |= ATF_PUBL;
9283 if (hwaddr != NULL) {
9284 *flagsp |= ATF_COM;
9285 bcopy((char *)hwaddr, storage, ill->ill_phys_addr_length);
9286 }
9287 }
9288
9289 /*
9290 * Create a new logical interface. If ipif_id is zero (i.e. not a logical
9291 * interface) create the next available logical interface for this
9292 * physical interface.
9293 * If ipif is NULL (i.e. the lookup didn't find one) attempt to create an
9294 * ipif with the specified name.
9295 *
9296 * If the address family is not AF_UNSPEC then set the address as well.
9297 *
9298 * If ip_sioctl_addr returns EINPROGRESS then the ioctl (the copyout)
9299 * is completed when the DL_BIND_ACK arrive in ip_rput_dlpi_writer.
9300 *
9301 * Executed as a writer on the ill.
9302 * So no lock is needed to traverse the ipif chain, or examine the
9303 * phyint flags.
9304 */
9305 /* ARGSUSED */
9306 int
9307 ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
9308 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq)
9309 {
9310 mblk_t *mp1;
9311 struct lifreq *lifr;
9312 boolean_t isv6;
9313 boolean_t exists;
9314 char *name;
9315 char *endp;
9316 char *cp;
9317 int namelen;
9318 ipif_t *ipif;
9319 long id;
9320 ipsq_t *ipsq;
9321 ill_t *ill;
9322 sin_t *sin;
9323 int err = 0;
9324 boolean_t found_sep = B_FALSE;
9325 conn_t *connp;
9326 zoneid_t zoneid;
9327 ip_stack_t *ipst = CONNQ_TO_IPST(q);
9328
9329 ASSERT(q->q_next == NULL);
9330 ip1dbg(("ip_sioctl_addif\n"));
9331 /* Existence of mp1 has been checked in ip_wput_nondata */
9332 mp1 = mp->b_cont->b_cont;
9333 /*
9334 * Null terminate the string to protect against buffer
9335 * overrun. String was generated by user code and may not
9336 * be trusted.
9337 */
9338 lifr = (struct lifreq *)mp1->b_rptr;
9339 lifr->lifr_name[LIFNAMSIZ - 1] = '\0';
9340 name = lifr->lifr_name;
9341 ASSERT(CONN_Q(q));
9342 connp = Q_TO_CONN(q);
9343 isv6 = (connp->conn_family == AF_INET6);
9344 zoneid = connp->conn_zoneid;
9345 namelen = mi_strlen(name);
9346 if (namelen == 0)
9347 return (EINVAL);
9348
9349 exists = B_FALSE;
9350 if ((namelen + 1 == sizeof (ipif_loopback_name)) &&
9351 (mi_strcmp(name, ipif_loopback_name) == 0)) {
9352 /*
9353 * Allow creating lo0 using SIOCLIFADDIF.
9354 * can't be any other writer thread. So can pass null below
9355 * for the last 4 args to ipif_lookup_name.
9356 */
9357 ipif = ipif_lookup_on_name(lifr->lifr_name, namelen, B_TRUE,
9358 &exists, isv6, zoneid, ipst);
		/* Allocation failed; prevent any further action */
9360 if (ipif == NULL) {
9361 return (ENOBUFS);
9362 } else if (!exists) {
9363 /* We created the ipif now and as writer */
9364 ipif_refrele(ipif);
9365 return (0);
9366 } else {
9367 ill = ipif->ipif_ill;
9368 ill_refhold(ill);
9369 ipif_refrele(ipif);
9370 }
9371 } else {
9372 /* Look for a colon in the name. */
9373 endp = &name[namelen];
9374 for (cp = endp; --cp > name; ) {
9375 if (*cp == IPIF_SEPARATOR_CHAR) {
9376 found_sep = B_TRUE;
9377 /*
9378 * Reject any non-decimal aliases for plumbing
9379 * of logical interfaces. Aliases with leading
9380 * zeroes are also rejected as they introduce
9381 * ambiguity in the naming of the interfaces.
9382 * Comparing with "0" takes care of all such
9383 * cases.
9384 */
				if (strncmp("0", cp + 1, 1) == 0)
9386 return (EINVAL);
9387
9388 if (ddi_strtol(cp+1, &endp, 10, &id) != 0 ||
9389 id <= 0 || *endp != '\0') {
9390 return (EINVAL);
9391 }
9392 *cp = '\0';
9393 break;
9394 }
9395 }
9396 ill = ill_lookup_on_name(name, B_FALSE, isv6, NULL, ipst);
9397 if (found_sep)
9398 *cp = IPIF_SEPARATOR_CHAR;
9399 if (ill == NULL)
9400 return (ENXIO);
9401 }
9402
9403 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_process_ioctl, NEW_OP,
9404 B_TRUE);
9405
9406 /*
9407 * Release the refhold due to the lookup, now that we are excl
9408 * or we are just returning
9409 */
9410 ill_refrele(ill);
9411
9412 if (ipsq == NULL)
9413 return (EINPROGRESS);
9414
9415 /* We are now exclusive on the IPSQ */
9416 ASSERT(IAM_WRITER_ILL(ill));
9417
9418 if (found_sep) {
9419 /* Now see if there is an IPIF with this unit number. */
9420 for (ipif = ill->ill_ipif; ipif != NULL;
9421 ipif = ipif->ipif_next) {
9422 if (ipif->ipif_id == id) {
9423 err = EEXIST;
9424 goto done;
9425 }
9426 }
9427 }
9428
9429 /*
9430 * We use IRE_LOCAL for lo0:1 etc. for "receive only" use
9431 * of lo0. Plumbing for lo0:0 happens in ipif_lookup_on_name()
9432 * instead.
9433 */
9434 if ((ipif = ipif_allocate(ill, found_sep ? id : -1, IRE_LOCAL,
9435 B_TRUE, B_TRUE, &err)) == NULL) {
9436 goto done;
9437 }
9438
9439 /* Return created name with ioctl */
9440 (void) sprintf(lifr->lifr_name, "%s%c%d", ill->ill_name,
9441 IPIF_SEPARATOR_CHAR, ipif->ipif_id);
9442 ip1dbg(("created %s\n", lifr->lifr_name));
9443
9444 /* Set address */
9445 sin = (sin_t *)&lifr->lifr_addr;
9446 if (sin->sin_family != AF_UNSPEC) {
9447 err = ip_sioctl_addr(ipif, sin, q, mp,
9448 &ip_ndx_ioctl_table[SIOCLIFADDR_NDX], lifr);
9449 }
9450
9451 done:
9452 ipsq_exit(ipsq);
9453 return (err);
9454 }
9455
9456 /*
9457 * Remove an existing logical interface. If ipif_id is zero (i.e. not a logical
9458 * interface) delete it based on the IP address (on this physical interface).
9459 * Otherwise delete it based on the ipif_id.
9460 * Also, special handling to allow a removeif of lo0.
9461 */
9462 /* ARGSUSED */
9463 int
9464 ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
9465 ip_ioctl_cmd_t *ipip, void *dummy_if_req)
9466 {
9467 conn_t *connp;
9468 ill_t *ill = ipif->ipif_ill;
9469 boolean_t success;
9470 ip_stack_t *ipst;
9471
9472 ipst = CONNQ_TO_IPST(q);
9473
9474 ASSERT(q->q_next == NULL);
9475 ip1dbg(("ip_sioctl_remove_if(%s:%u %p)\n",
9476 ill->ill_name, ipif->ipif_id, (void *)ipif));
9477 ASSERT(IAM_WRITER_IPIF(ipif));
9478
9479 connp = Q_TO_CONN(q);
9480 /*
9481 * Special case for unplumbing lo0 (the loopback physical interface).
9482 * If unplumbing lo0, the incoming address structure has been
9483 * initialized to all zeros. When unplumbing lo0, all its logical
9484 * interfaces must be removed too.
9485 *
	 * Note that this function may be called to remove a specific
	 * loopback logical interface (e.g., lo0:1).  But in that case
	 * ipif->ipif_id != 0, so the code path for that case is the
	 * same as for any other interface (meaning it skips the code
	 * directly below).
9491 */
9492 if (ipif->ipif_id == 0 && ill->ill_net_type == IRE_LOOPBACK) {
9493 if (sin->sin_family == AF_UNSPEC &&
9494 (IN6_IS_ADDR_UNSPECIFIED(&((sin6_t *)sin)->sin6_addr))) {
9495 /*
9496 * Mark it condemned. No new ref. will be made to ill.
9497 */
9498 mutex_enter(&ill->ill_lock);
9499 ill->ill_state_flags |= ILL_CONDEMNED;
9500 for (ipif = ill->ill_ipif; ipif != NULL;
9501 ipif = ipif->ipif_next) {
9502 ipif->ipif_state_flags |= IPIF_CONDEMNED;
9503 }
9504 mutex_exit(&ill->ill_lock);
9505
9506 ipif = ill->ill_ipif;
9507 /* unplumb the loopback interface */
9508 ill_delete(ill);
9509 mutex_enter(&connp->conn_lock);
9510 mutex_enter(&ill->ill_lock);
9511
			/* Are any references to this ill active? */
9513 if (ill_is_freeable(ill)) {
9514 mutex_exit(&ill->ill_lock);
9515 mutex_exit(&connp->conn_lock);
9516 ill_delete_tail(ill);
9517 mi_free(ill);
9518 return (0);
9519 }
9520 success = ipsq_pending_mp_add(connp, ipif,
9521 CONNP_TO_WQ(connp), mp, ILL_FREE);
9522 mutex_exit(&connp->conn_lock);
9523 mutex_exit(&ill->ill_lock);
9524 if (success)
9525 return (EINPROGRESS);
9526 else
9527 return (EINTR);
9528 }
9529 }
9530
9531 if (ipif->ipif_id == 0) {
9532 ipsq_t *ipsq;
9533
9534 /* Find based on address */
9535 if (ipif->ipif_isv6) {
9536 sin6_t *sin6;
9537
9538 if (sin->sin_family != AF_INET6)
9539 return (EAFNOSUPPORT);
9540
9541 sin6 = (sin6_t *)sin;
9542 /* We are a writer, so we should be able to lookup */
9543 ipif = ipif_lookup_addr_exact_v6(&sin6->sin6_addr, ill,
9544 ipst);
9545 } else {
9546 if (sin->sin_family != AF_INET)
9547 return (EAFNOSUPPORT);
9548
9549 /* We are a writer, so we should be able to lookup */
9550 ipif = ipif_lookup_addr_exact(sin->sin_addr.s_addr, ill,
9551 ipst);
9552 }
9553 if (ipif == NULL) {
9554 return (EADDRNOTAVAIL);
9555 }
9556
9557 /*
9558 * It is possible for a user to send an SIOCLIFREMOVEIF with
9559 * lifr_name of the physical interface but with an ip address
9560 * lifr_addr of a logical interface plumbed over it.
9561 * So update ipx_current_ipif now that ipif points to the
9562 * correct one.
9563 */
9564 ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq;
9565 ipsq->ipsq_xop->ipx_current_ipif = ipif;
9566
9567 /* This is a writer */
9568 ipif_refrele(ipif);
9569 }
9570
9571 /*
9572 * Can not delete instance zero since it is tied to the ill.
9573 */
9574 if (ipif->ipif_id == 0)
9575 return (EBUSY);
9576
9577 mutex_enter(&ill->ill_lock);
9578 ipif->ipif_state_flags |= IPIF_CONDEMNED;
9579 mutex_exit(&ill->ill_lock);
9580
9581 ipif_free(ipif);
9582
9583 mutex_enter(&connp->conn_lock);
9584 mutex_enter(&ill->ill_lock);
9585
	/* Are any references to this ipif active? */
9587 if (ipif_is_freeable(ipif)) {
9588 mutex_exit(&ill->ill_lock);
9589 mutex_exit(&connp->conn_lock);
9590 ipif_non_duplicate(ipif);
9591 (void) ipif_down_tail(ipif);
9592 ipif_free_tail(ipif); /* frees ipif */
9593 return (0);
9594 }
9595 success = ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), mp,
9596 IPIF_FREE);
9597 mutex_exit(&ill->ill_lock);
9598 mutex_exit(&connp->conn_lock);
9599 if (success)
9600 return (EINPROGRESS);
9601 else
9602 return (EINTR);
9603 }
9604
9605 /*
9606 * Restart the removeif ioctl. The refcnt has gone down to 0.
9607 * The ipif is already condemned. So can't find it thru lookups.
9608 */
9609 /* ARGSUSED */
9610 int
9611 ip_sioctl_removeif_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q,
9612 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_if_req)
9613 {
9614 ill_t *ill = ipif->ipif_ill;
9615
9616 ASSERT(IAM_WRITER_IPIF(ipif));
9617 ASSERT(ipif->ipif_state_flags & IPIF_CONDEMNED);
9618
9619 ip1dbg(("ip_sioctl_removeif_restart(%s:%u %p)\n",
9620 ill->ill_name, ipif->ipif_id, (void *)ipif));
9621
9622 if (ipif->ipif_id == 0 && ill->ill_net_type == IRE_LOOPBACK) {
9623 ASSERT(ill->ill_state_flags & ILL_CONDEMNED);
9624 ill_delete_tail(ill);
9625 mi_free(ill);
9626 return (0);
9627 }
9628
9629 ipif_non_duplicate(ipif);
9630 (void) ipif_down_tail(ipif);
9631 ipif_free_tail(ipif);
9632
9633 return (0);
9634 }
9635
9636 /*
9637 * Set the local interface address using the given prefix and ill_token.
9638 */
9639 /* ARGSUSED */
9640 int
9641 ip_sioctl_prefix(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
9642 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq)
9643 {
9644 int err;
9645 in6_addr_t v6addr;
9646 sin6_t *sin6;
9647 ill_t *ill;
9648 int i;
9649
9650 ip1dbg(("ip_sioctl_prefix(%s:%u %p)\n",
9651 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
9652
9653 ASSERT(IAM_WRITER_IPIF(ipif));
9654
9655 if (!ipif->ipif_isv6)
9656 return (EINVAL);
9657
9658 if (sin->sin_family != AF_INET6)
9659 return (EAFNOSUPPORT);
9660
9661 sin6 = (sin6_t *)sin;
9662 v6addr = sin6->sin6_addr;
9663 ill = ipif->ipif_ill;
9664
9665 if (IN6_IS_ADDR_UNSPECIFIED(&v6addr) ||
9666 IN6_IS_ADDR_UNSPECIFIED(&ill->ill_token))
9667 return (EADDRNOTAVAIL);
9668
9669 for (i = 0; i < 4; i++)
9670 sin6->sin6_addr.s6_addr32[i] |= ill->ill_token.s6_addr32[i];
9671
9672 err = ip_sioctl_addr(ipif, sin, q, mp,
9673 &ip_ndx_ioctl_table[SIOCLIFADDR_NDX], dummy_ifreq);
9674 return (err);
9675 }
9676
9677 /*
9678 * Restart entry point to restart the address set operation after the
9679 * refcounts have dropped to zero.
9680 */
9681 /* ARGSUSED */
9682 int
9683 ip_sioctl_prefix_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
9684 ip_ioctl_cmd_t *ipip, void *ifreq)
9685 {
9686 ip1dbg(("ip_sioctl_prefix_restart(%s:%u %p)\n",
9687 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
9688 return (ip_sioctl_addr_restart(ipif, sin, q, mp, ipip, ifreq));
9689 }
9690
9691 /*
9692 * Set the local interface address.
9693 * Allow an address of all zero when the interface is down.
9694 */
9695 /* ARGSUSED */
9696 int
9697 ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
9698 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq)
9699 {
9700 int err = 0;
9701 in6_addr_t v6addr;
9702 boolean_t need_up = B_FALSE;
9703 ill_t *ill;
9704 int i;
9705
9706 ip1dbg(("ip_sioctl_addr(%s:%u %p)\n",
9707 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
9708
9709 ASSERT(IAM_WRITER_IPIF(ipif));
9710
9711 ill = ipif->ipif_ill;
9712 if (ipif->ipif_isv6) {
9713 sin6_t *sin6;
9714 phyint_t *phyi;
9715
9716 if (sin->sin_family != AF_INET6)
9717 return (EAFNOSUPPORT);
9718
9719 sin6 = (sin6_t *)sin;
9720 v6addr = sin6->sin6_addr;
9721 phyi = ill->ill_phyint;
9722
9723 /*
9724 * Enforce that true multicast interfaces have a link-local
9725 * address for logical unit 0.
9726 *
9727 * However for those ipif's for which link-local address was
9728 * not created by default, also allow setting :: as the address.
9729 * This scenario would arise, when we delete an address on ipif
9730 * with logical unit 0, we would want to set :: as the address.
9731 */
9732 if (ipif->ipif_id == 0 &&
9733 (ill->ill_flags & ILLF_MULTICAST) &&
9734 !(ipif->ipif_flags & (IPIF_POINTOPOINT)) &&
9735 !(phyi->phyint_flags & (PHYI_LOOPBACK)) &&
9736 !IN6_IS_ADDR_LINKLOCAL(&v6addr)) {
9737
9738 /*
9739 * if default link-local was not created by kernel for
9740 * this ill, allow setting :: as the address on ipif:0.
9741 */
9742 if (ill->ill_flags & ILLF_NOLINKLOCAL) {
9743 if (!IN6_IS_ADDR_UNSPECIFIED(&v6addr))
9744 return (EADDRNOTAVAIL);
9745 } else {
9746 return (EADDRNOTAVAIL);
9747 }
9748 }
9749
9750 /*
9751 * up interfaces shouldn't have the unspecified address
9752 * unless they also have the IPIF_NOLOCAL flags set and
9753 * have a subnet assigned.
9754 */
9755 if ((ipif->ipif_flags & IPIF_UP) &&
9756 IN6_IS_ADDR_UNSPECIFIED(&v6addr) &&
9757 (!(ipif->ipif_flags & IPIF_NOLOCAL) ||
9758 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) {
9759 return (EADDRNOTAVAIL);
9760 }
9761
9762 if (!ip_local_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask))
9763 return (EADDRNOTAVAIL);
9764 } else {
9765 ipaddr_t addr;
9766
9767 if (sin->sin_family != AF_INET)
9768 return (EAFNOSUPPORT);
9769
9770 addr = sin->sin_addr.s_addr;
9771
9772 /* Allow INADDR_ANY as the local address. */
9773 if (addr != INADDR_ANY &&
9774 !ip_addr_ok_v4(addr, ipif->ipif_net_mask))
9775 return (EADDRNOTAVAIL);
9776
9777 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
9778 }
9779 /*
9780 * verify that the address being configured is permitted by the
9781 * ill_allowed_ips[] for the interface.
9782 */
9783 if (ill->ill_allowed_ips_cnt > 0) {
9784 for (i = 0; i < ill->ill_allowed_ips_cnt; i++) {
9785 if (IN6_ARE_ADDR_EQUAL(&ill->ill_allowed_ips[i],
9786 &v6addr))
9787 break;
9788 }
9789 if (i == ill->ill_allowed_ips_cnt) {
9790 pr_addr_dbg("!allowed addr %s\n", AF_INET6, &v6addr);
9791 return (EPERM);
9792 }
9793 }
9794 /*
	 * Even if there is no change, we redo things just to rerun
	 * ipif_set_default.
9797 */
9798 if (ipif->ipif_flags & IPIF_UP) {
9799 /*
9800 * Setting a new local address, make sure
9801 * we have net and subnet bcast ire's for
9802 * the old address if we need them.
9803 */
9804 /*
9805 * If the interface is already marked up,
9806 * we call ipif_down which will take care
9807 * of ditching any IREs that have been set
9808 * up based on the old interface address.
9809 */
9810 err = ipif_logical_down(ipif, q, mp);
9811 if (err == EINPROGRESS)
9812 return (err);
9813 (void) ipif_down_tail(ipif);
		need_up = B_TRUE;
9815 }
9816
9817 err = ip_sioctl_addr_tail(ipif, sin, q, mp, need_up);
9818 return (err);
9819 }
9820
9821 int
9822 ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
9823 boolean_t need_up)
9824 {
9825 in6_addr_t v6addr;
9826 in6_addr_t ov6addr;
9827 ipaddr_t addr;
9828 sin6_t *sin6;
9829 int sinlen;
9830 int err = 0;
9831 ill_t *ill = ipif->ipif_ill;
9832 boolean_t need_dl_down;
9833 boolean_t need_arp_down;
9834 struct iocblk *iocp;
9835
9836 iocp = (mp != NULL) ? (struct iocblk *)mp->b_rptr : NULL;
9837
9838 ip1dbg(("ip_sioctl_addr_tail(%s:%u %p)\n",
9839 ill->ill_name, ipif->ipif_id, (void *)ipif));
9840 ASSERT(IAM_WRITER_IPIF(ipif));
9841
9842 /* Must cancel any pending timer before taking the ill_lock */
9843 if (ipif->ipif_recovery_id != 0)
9844 (void) untimeout(ipif->ipif_recovery_id);
9845 ipif->ipif_recovery_id = 0;
9846
9847 if (ipif->ipif_isv6) {
9848 sin6 = (sin6_t *)sin;
9849 v6addr = sin6->sin6_addr;
9850 sinlen = sizeof (struct sockaddr_in6);
9851 } else {
9852 addr = sin->sin_addr.s_addr;
9853 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
9854 sinlen = sizeof (struct sockaddr_in);
9855 }
9856 mutex_enter(&ill->ill_lock);
9857 ov6addr = ipif->ipif_v6lcl_addr;
9858 ipif->ipif_v6lcl_addr = v6addr;
9859 sctp_update_ipif_addr(ipif, ov6addr);
9860 ipif->ipif_addr_ready = 0;
9861
9862 ip_rts_newaddrmsg(RTM_CHGADDR, 0, ipif, RTSQ_DEFAULT);
9863
9864 /*
9865 * If the interface was previously marked as a duplicate, then since
9866 * we've now got a "new" address, it should no longer be considered a
9867 * duplicate -- even if the "new" address is the same as the old one.
9868 * Note that if all ipifs are down, we may have a pending ARP down
9869 * event to handle. This is because we want to recover from duplicates
9870 * and thus delay tearing down ARP until the duplicates have been
9871 * removed or disabled.
9872 */
9873 need_dl_down = need_arp_down = B_FALSE;
9874 if (ipif->ipif_flags & IPIF_DUPLICATE) {
9875 need_arp_down = !need_up;
9876 ipif->ipif_flags &= ~IPIF_DUPLICATE;
9877 if (--ill->ill_ipif_dup_count == 0 && !need_up &&
9878 ill->ill_ipif_up_count == 0 && ill->ill_dl_up) {
9879 need_dl_down = B_TRUE;
9880 }
9881 }
9882
9883 ipif_set_default(ipif);
9884
9885 /*
9886 * If we've just manually set the IPv6 link-local address (0th ipif),
9887 * tag the ill so that future updates to the interface ID don't result
9888 * in this address getting automatically reconfigured from under the
9889 * administrator.
9890 */
9891 if (ipif->ipif_isv6 && ipif->ipif_id == 0) {
9892 if (iocp == NULL || (iocp->ioc_cmd == SIOCSLIFADDR &&
9893 !IN6_IS_ADDR_UNSPECIFIED(&v6addr)))
9894 ill->ill_manual_linklocal = 1;
9895 }
9896
9897 /*
9898 * When publishing an interface address change event, we only notify
9899 * the event listeners of the new address. It is assumed that if they
9900 * actively care about the addresses assigned that they will have
9901 * already discovered the previous address assigned (if there was one.)
9902 *
9903 * Don't attach nic event message for SIOCLIFADDIF ioctl.
9904 */
9905 if (iocp != NULL && iocp->ioc_cmd != SIOCLIFADDIF) {
9906 ill_nic_event_dispatch(ill, MAP_IPIF_ID(ipif->ipif_id),
9907 NE_ADDRESS_CHANGE, sin, sinlen);
9908 }
9909
9910 mutex_exit(&ill->ill_lock);
9911
9912 if (need_up) {
9913 /*
9914 * Now bring the interface back up. If this
9915 * is the only IPIF for the ILL, ipif_up
9916 * will have to re-bind to the device, so
9917 * we may get back EINPROGRESS, in which
9918 * case, this IOCTL will get completed in
9919 * ip_rput_dlpi when we see the DL_BIND_ACK.
9920 */
9921 err = ipif_up(ipif, q, mp);
9922 } else {
9923 /* Perhaps ilgs should use this ill */
9924 update_conn_ill(NULL, ill->ill_ipst);
9925 }
9926
9927 if (need_dl_down)
9928 ill_dl_down(ill);
9929
9930 if (need_arp_down && !ill->ill_isv6)
9931 (void) ipif_arp_down(ipif);
9932
9933 /*
9934 * The default multicast interface might have changed (for
9935 * instance if the IPv6 scope of the address changed)
9936 */
9937 ire_increment_multicast_generation(ill->ill_ipst, ill->ill_isv6);
9938
9939 return (err);
9940 }
9941
9942 /*
9943 * Restart entry point to restart the address set operation after the
9944 * refcounts have dropped to zero.
9945 */
9946 /* ARGSUSED */
9947 int
9948 ip_sioctl_addr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
9949 ip_ioctl_cmd_t *ipip, void *ifreq)
9950 {
9951 ip1dbg(("ip_sioctl_addr_restart(%s:%u %p)\n",
9952 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
9953 ASSERT(IAM_WRITER_IPIF(ipif));
9954 (void) ipif_down_tail(ipif);
9955 return (ip_sioctl_addr_tail(ipif, sin, q, mp, B_TRUE));
9956 }
9957
9958 /* ARGSUSED */
9959 int
9960 ip_sioctl_get_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
9961 ip_ioctl_cmd_t *ipip, void *if_req)
9962 {
9963 sin6_t *sin6 = (struct sockaddr_in6 *)sin;
9964 struct lifreq *lifr = (struct lifreq *)if_req;
9965
9966 ip1dbg(("ip_sioctl_get_addr(%s:%u %p)\n",
9967 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
9968 /*
9969 * The net mask and address can't change since we have a
9970 * reference to the ipif. So no lock is necessary.
9971 */
9972 if (ipif->ipif_isv6) {
9973 *sin6 = sin6_null;
9974 sin6->sin6_family = AF_INET6;
9975 sin6->sin6_addr = ipif->ipif_v6lcl_addr;
9976 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
9977 lifr->lifr_addrlen =
9978 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask);
9979 } else {
9980 *sin = sin_null;
9981 sin->sin_family = AF_INET;
9982 sin->sin_addr.s_addr = ipif->ipif_lcl_addr;
9983 if (ipip->ipi_cmd_type == LIF_CMD) {
9984 lifr->lifr_addrlen =
9985 ip_mask_to_plen(ipif->ipif_net_mask);
9986 }
9987 }
9988 return (0);
9989 }
9990
9991 /*
9992 * Set the destination address for a pt-pt interface.
9993 */
9994 /* ARGSUSED */
9995 int
9996 ip_sioctl_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
9997 ip_ioctl_cmd_t *ipip, void *if_req)
9998 {
9999 int err = 0;
10000 in6_addr_t v6addr;
10001 boolean_t need_up = B_FALSE;
10002
10003 ip1dbg(("ip_sioctl_dstaddr(%s:%u %p)\n",
10004 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10005 ASSERT(IAM_WRITER_IPIF(ipif));
10006
10007 if (ipif->ipif_isv6) {
10008 sin6_t *sin6;
10009
10010 if (sin->sin_family != AF_INET6)
10011 return (EAFNOSUPPORT);
10012
10013 sin6 = (sin6_t *)sin;
10014 v6addr = sin6->sin6_addr;
10015
10016 if (!ip_remote_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask))
10017 return (EADDRNOTAVAIL);
10018 } else {
10019 ipaddr_t addr;
10020
10021 if (sin->sin_family != AF_INET)
10022 return (EAFNOSUPPORT);
10023
10024 addr = sin->sin_addr.s_addr;
10025 if (addr != INADDR_ANY &&
10026 !ip_addr_ok_v4(addr, ipif->ipif_net_mask)) {
10027 return (EADDRNOTAVAIL);
10028 }
10029
10030 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
10031 }
10032
10033 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6pp_dst_addr, &v6addr))
10034 return (0); /* No change */
10035
10036 if (ipif->ipif_flags & IPIF_UP) {
10037 /*
10038 * If the interface is already marked up,
10039 * we call ipif_down which will take care
10040 * of ditching any IREs that have been set
10041 * up based on the old pp dst address.
10042 */
10043 err = ipif_logical_down(ipif, q, mp);
10044 if (err == EINPROGRESS)
10045 return (err);
10046 (void) ipif_down_tail(ipif);
10047 need_up = B_TRUE;
10048 }
10049 /*
10050 * could return EINPROGRESS. If so ioctl will complete in
10051 * ip_rput_dlpi_writer
10052 */
10053 err = ip_sioctl_dstaddr_tail(ipif, sin, q, mp, need_up);
10054 return (err);
10055 }
10056
10057 static int
10058 ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10059 boolean_t need_up)
10060 {
10061 in6_addr_t v6addr;
10062 ill_t *ill = ipif->ipif_ill;
10063 int err = 0;
10064 boolean_t need_dl_down;
10065 boolean_t need_arp_down;
10066
10067 ip1dbg(("ip_sioctl_dstaddr_tail(%s:%u %p)\n", ill->ill_name,
10068 ipif->ipif_id, (void *)ipif));
10069
10070 /* Must cancel any pending timer before taking the ill_lock */
10071 if (ipif->ipif_recovery_id != 0)
10072 (void) untimeout(ipif->ipif_recovery_id);
10073 ipif->ipif_recovery_id = 0;
10074
10075 if (ipif->ipif_isv6) {
10076 sin6_t *sin6;
10077
10078 sin6 = (sin6_t *)sin;
10079 v6addr = sin6->sin6_addr;
10080 } else {
10081 ipaddr_t addr;
10082
10083 addr = sin->sin_addr.s_addr;
10084 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
10085 }
10086 mutex_enter(&ill->ill_lock);
10087 /* Set point to point destination address. */
10088 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
10089 /*
10090 * Allow this as a means of creating logical
10091 * pt-pt interfaces on top of e.g. an Ethernet.
10092 * XXX Undocumented HACK for testing.
10093 * pt-pt interfaces are created with NUD disabled.
10094 */
10095 ipif->ipif_flags |= IPIF_POINTOPOINT;
10096 ipif->ipif_flags &= ~IPIF_BROADCAST;
10097 if (ipif->ipif_isv6)
10098 ill->ill_flags |= ILLF_NONUD;
10099 }
10100
10101 /*
10102 * If the interface was previously marked as a duplicate, then since
10103 * we've now got a "new" address, it should no longer be considered a
10104 * duplicate -- even if the "new" address is the same as the old one.
10105 * Note that if all ipifs are down, we may have a pending ARP down
10106 * event to handle.
10107 */
10108 need_dl_down = need_arp_down = B_FALSE;
10109 if (ipif->ipif_flags & IPIF_DUPLICATE) {
10110 need_arp_down = !need_up;
10111 ipif->ipif_flags &= ~IPIF_DUPLICATE;
10112 if (--ill->ill_ipif_dup_count == 0 && !need_up &&
10113 ill->ill_ipif_up_count == 0 && ill->ill_dl_up) {
10114 need_dl_down = B_TRUE;
10115 }
10116 }
10117
10118 /*
10119 * If we've just manually set the IPv6 destination link-local address
10120 * (0th ipif), tag the ill so that future updates to the destination
10121 * interface ID (as can happen with interfaces over IP tunnels) don't
10122 * result in this address getting automatically reconfigured from
10123 * under the administrator.
10124 */
10125 if (ipif->ipif_isv6 && ipif->ipif_id == 0)
10126 ill->ill_manual_dst_linklocal = 1;
10127
10128 /* Set the new address. */
10129 ipif->ipif_v6pp_dst_addr = v6addr;
10130 /* Make sure subnet tracks pp_dst */
10131 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr;
10132 mutex_exit(&ill->ill_lock);
10133
10134 if (need_up) {
10135 /*
10136 * Now bring the interface back up. If this
10137 * is the only IPIF for the ILL, ipif_up
10138 * will have to re-bind to the device, so
10139 * we may get back EINPROGRESS, in which
10140 * case, this IOCTL will get completed in
10141 * ip_rput_dlpi when we see the DL_BIND_ACK.
10142 */
10143 err = ipif_up(ipif, q, mp);
10144 }
10145
10146 if (need_dl_down)
10147 ill_dl_down(ill);
10148 if (need_arp_down && !ipif->ipif_isv6)
10149 (void) ipif_arp_down(ipif);
10150
10151 return (err);
10152 }
10153
10154 /*
10155 * Restart entry point to restart the dstaddress set operation after the
10156 * refcounts have dropped to zero.
10157 */
10158 /* ARGSUSED */
10159 int
10160 ip_sioctl_dstaddr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10161 ip_ioctl_cmd_t *ipip, void *ifreq)
10162 {
10163 ip1dbg(("ip_sioctl_dstaddr_restart(%s:%u %p)\n",
10164 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10165 (void) ipif_down_tail(ipif);
10166 return (ip_sioctl_dstaddr_tail(ipif, sin, q, mp, B_TRUE));
10167 }
10168
10169 /* ARGSUSED */
10170 int
10171 ip_sioctl_get_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10172 ip_ioctl_cmd_t *ipip, void *if_req)
10173 {
10174 sin6_t *sin6 = (struct sockaddr_in6 *)sin;
10175
10176 ip1dbg(("ip_sioctl_get_dstaddr(%s:%u %p)\n",
10177 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10178 /*
10179 * Get point to point destination address. The addresses can't
10180 * change since we hold a reference to the ipif.
10181 */
10182 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0)
10183 return (EADDRNOTAVAIL);
10184
10185 if (ipif->ipif_isv6) {
10186 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
10187 *sin6 = sin6_null;
10188 sin6->sin6_family = AF_INET6;
10189 sin6->sin6_addr = ipif->ipif_v6pp_dst_addr;
10190 } else {
10191 *sin = sin_null;
10192 sin->sin_family = AF_INET;
10193 sin->sin_addr.s_addr = ipif->ipif_pp_dst_addr;
10194 }
10195 return (0);
10196 }
10197
10198 /*
10199 * Check which flags will change by the given flags being set
10200 * silently ignore flags which userland is not allowed to control.
10201 * (Because these flags may change between SIOCGLIFFLAGS and
10202 * SIOCSLIFFLAGS, and that's outside of userland's control,
10203 * we need to silently ignore them rather than fail.)
10204 */
10205 static void
10206 ip_sioctl_flags_onoff(ipif_t *ipif, uint64_t flags, uint64_t *onp,
10207 uint64_t *offp)
10208 {
10209 ill_t *ill = ipif->ipif_ill;
10210 phyint_t *phyi = ill->ill_phyint;
10211 uint64_t cantchange_flags, intf_flags;
10212 uint64_t turn_on, turn_off;
10213
10214 intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags;
10215 cantchange_flags = IFF_CANTCHANGE;
10216 if (IS_IPMP(ill))
10217 cantchange_flags |= IFF_IPMP_CANTCHANGE;
10218 turn_on = (flags ^ intf_flags) & ~cantchange_flags;
10219 turn_off = intf_flags & turn_on;
10220 turn_on ^= turn_off;
10221 *onp = turn_on;
10222 *offp = turn_off;
10223 }
10224
10225 /*
10226 * Set interface flags. Many flags require special handling (e.g.,
10227 * bringing the interface down); see below for details.
10228 *
 * NOTE : We really don't enforce that ipif_id zero should be used
 *	  for setting any flags other than IFF_LOGINT_FLAGS.  This
 *	  is because applications generally do a SIOCGLIFFLAGS,
 *	  OR in the new flags (that affect the logical) and do a
 *	  SIOCSLIFFLAGS.  Thus, "flags" below could contain bits other
 *	  than IFF_LOGINT_FLAGS.  One could check whether "turn_on"
 *	  (the flags that will be turned on) is correct with respect
 *	  to ipif_id 0.  For backward compatibility reasons, it is
 *	  not done.
10237 */
10238 /* ARGSUSED */
10239 int
10240 ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10241 ip_ioctl_cmd_t *ipip, void *if_req)
10242 {
10243 uint64_t turn_on;
10244 uint64_t turn_off;
10245 int err = 0;
10246 phyint_t *phyi;
10247 ill_t *ill;
10248 conn_t *connp;
10249 uint64_t intf_flags;
10250 boolean_t phyint_flags_modified = B_FALSE;
10251 uint64_t flags;
10252 struct ifreq *ifr;
10253 struct lifreq *lifr;
10254 boolean_t set_linklocal = B_FALSE;
10255
10256 ip1dbg(("ip_sioctl_flags(%s:%u %p)\n",
10257 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10258
10259 ASSERT(IAM_WRITER_IPIF(ipif));
10260
10261 ill = ipif->ipif_ill;
10262 phyi = ill->ill_phyint;
10263
10264 if (ipip->ipi_cmd_type == IF_CMD) {
10265 ifr = (struct ifreq *)if_req;
10266 flags = (uint64_t)(ifr->ifr_flags & 0x0000ffff);
10267 } else {
10268 lifr = (struct lifreq *)if_req;
10269 flags = lifr->lifr_flags;
10270 }
10271
10272 intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags;
10273
10274 /*
10275 * Have the flags been set correctly until now?
10276 */
10277 ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0);
10278 ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0);
10279 ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0);
10280 /*
10281 * Compare the new flags to the old, and partition
10282 * into those coming on and those going off.
10283 * For the 16 bit command keep the bits above bit 16 unchanged.
10284 */
10285 if (ipip->ipi_cmd == SIOCSIFFLAGS)
10286 flags |= intf_flags & ~0xFFFF;
10287
10288 /*
10289 * Explicitly fail attempts to change flags that are always invalid on
10290 * an IPMP meta-interface.
10291 */
10292 if (IS_IPMP(ill) && ((flags ^ intf_flags) & IFF_IPMP_INVALID))
10293 return (EINVAL);
10294
10295 ip_sioctl_flags_onoff(ipif, flags, &turn_on, &turn_off);
10296 if ((turn_on|turn_off) == 0)
10297 return (0); /* No change */
10298
10299 /*
10300 * All test addresses must be IFF_DEPRECATED (to ensure source address
10301 * selection avoids them) -- so force IFF_DEPRECATED on, and do not
10302 * allow it to be turned off.
10303 */
10304 if ((turn_off & (IFF_DEPRECATED|IFF_NOFAILOVER)) == IFF_DEPRECATED &&
10305 (turn_on|intf_flags) & IFF_NOFAILOVER)
10306 return (EINVAL);
10307
10308 if ((connp = Q_TO_CONN(q)) == NULL)
10309 return (EINVAL);
10310
10311 /*
10312 * Only vrrp control socket is allowed to change IFF_UP and
10313 * IFF_NOACCEPT flags when IFF_VRRP is set.
10314 */
10315 if ((intf_flags & IFF_VRRP) && ((turn_off | turn_on) & IFF_UP)) {
10316 if (!connp->conn_isvrrp)
10317 return (EINVAL);
10318 }
10319
10320 /*
10321 * The IFF_NOACCEPT flag can only be set on an IFF_VRRP IP address by
10322 * VRRP control socket.
10323 */
10324 if ((turn_off | turn_on) & IFF_NOACCEPT) {
10325 if (!connp->conn_isvrrp || !(intf_flags & IFF_VRRP))
10326 return (EINVAL);
10327 }
10328
10329 if (turn_on & IFF_NOFAILOVER) {
10330 turn_on |= IFF_DEPRECATED;
10331 flags |= IFF_DEPRECATED;
10332 }
10333
10334 /*
10335 * On underlying interfaces, only allow applications to manage test
10336 * addresses -- otherwise, they may get confused when the address
10337 * moves as part of being brought up. Likewise, prevent an
10338 * application-managed test address from being converted to a data
10339 * address. To prevent migration of administratively up addresses in
10340 * the kernel, we don't allow them to be converted either.
10341 */
10342 if (IS_UNDER_IPMP(ill)) {
10343 const uint64_t appflags = IFF_DHCPRUNNING | IFF_ADDRCONF;
10344
10345 if ((turn_on & appflags) && !(flags & IFF_NOFAILOVER))
10346 return (EINVAL);
10347
10348 if ((turn_off & IFF_NOFAILOVER) &&
10349 (flags & (appflags | IFF_UP | IFF_DUPLICATE)))
10350 return (EINVAL);
10351 }
10352
10353 /*
10354 * Only allow IFF_TEMPORARY flag to be set on
10355 * IPv6 interfaces.
10356 */
10357 if ((turn_on & IFF_TEMPORARY) && !(ipif->ipif_isv6))
10358 return (EINVAL);
10359
10360 /*
10361 * cannot turn off IFF_NOXMIT on VNI interfaces.
10362 */
10363 if ((turn_off & IFF_NOXMIT) && IS_VNI(ipif->ipif_ill))
10364 return (EINVAL);
10365
10366 /*
10367 * Don't allow the IFF_ROUTER flag to be turned on on loopback
10368 * interfaces. It makes no sense in that context.
10369 */
10370 if ((turn_on & IFF_ROUTER) && (phyi->phyint_flags & PHYI_LOOPBACK))
10371 return (EINVAL);
10372
10373 /*
10374 * For IPv6 ipif_id 0, don't allow the interface to be up without
10375 * a link local address if IFF_NOLOCAL or IFF_ANYCAST are not set.
10376 * If the link local address isn't set, and can be set, it will get
10377 * set later on in this function.
10378 */
10379 if (ipif->ipif_id == 0 && ipif->ipif_isv6 &&
10380 (flags & IFF_UP) && !(flags & (IFF_NOLOCAL|IFF_ANYCAST)) &&
10381 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) {
10382 if (ipif_cant_setlinklocal(ipif))
10383 return (EINVAL);
10384 set_linklocal = B_TRUE;
10385 }
10386
10387 /*
10388 * If we modify physical interface flags, we'll potentially need to
10389 * send up two routing socket messages for the changes (one for the
10390 * IPv4 ill, and another for the IPv6 ill). Note that here.
10391 */
10392 if ((turn_on|turn_off) & IFF_PHYINT_FLAGS)
10393 phyint_flags_modified = B_TRUE;
10394
10395 /*
10396 * All functioning PHYI_STANDBY interfaces start life PHYI_INACTIVE
10397 * (otherwise, we'd immediately use them, defeating standby). Also,
10398 * since PHYI_INACTIVE has a separate meaning when PHYI_STANDBY is not
10399 * set, don't allow PHYI_STANDBY to be set if PHYI_INACTIVE is already
10400 * set, and clear PHYI_INACTIVE if PHYI_STANDBY is being cleared. We
10401 * also don't allow PHYI_STANDBY if VNI is enabled since its semantics
10402 * will not be honored.
10403 */
10404 if (turn_on & PHYI_STANDBY) {
10405 /*
10406 * No need to grab ill_g_usesrc_lock here; see the
10407 * synchronization notes in ip.c.
10408 */
10409 if (ill->ill_usesrc_grp_next != NULL ||
10410 intf_flags & PHYI_INACTIVE)
10411 return (EINVAL);
10412 if (!(flags & PHYI_FAILED)) {
10413 flags |= PHYI_INACTIVE;
10414 turn_on |= PHYI_INACTIVE;
10415 }
10416 }
10417
10418 if (turn_off & PHYI_STANDBY) {
10419 flags &= ~PHYI_INACTIVE;
10420 turn_off |= PHYI_INACTIVE;
10421 }
10422
10423 /*
10424 * PHYI_FAILED and PHYI_INACTIVE are mutually exclusive; fail if both
10425 * would end up on.
10426 */
10427 if ((flags & (PHYI_FAILED | PHYI_INACTIVE)) ==
10428 (PHYI_FAILED | PHYI_INACTIVE))
10429 return (EINVAL);
10430
10431 /*
10432 * If ILLF_ROUTER changes, we need to change the ip forwarding
10433 * status of the interface.
10434 */
10435 if ((turn_on | turn_off) & ILLF_ROUTER) {
10436 err = ill_forward_set(ill, ((turn_on & ILLF_ROUTER) != 0));
10437 if (err != 0)
10438 return (err);
10439 }
10440
10441 /*
10442 * If the interface is not UP and we are not going to
10443 * bring it UP, record the flags and return. When the
10444 * interface comes UP later, the right actions will be
10445 * taken.
10446 */
10447 if (!(ipif->ipif_flags & IPIF_UP) &&
10448 !(turn_on & IPIF_UP)) {
10449 /* Record new flags in their respective places. */
10450 mutex_enter(&ill->ill_lock);
10451 mutex_enter(&ill->ill_phyint->phyint_lock);
10452 ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS);
10453 ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS);
10454 ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS);
10455 ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS);
10456 phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS);
10457 phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS);
10458 mutex_exit(&ill->ill_lock);
10459 mutex_exit(&ill->ill_phyint->phyint_lock);
10460
10461 /*
10462 * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the
10463 * same to the kernel: if any of them has been set by
10464 * userland, the interface cannot be used for data traffic.
10465 */
10466 if ((turn_on|turn_off) &
10467 (PHYI_FAILED | PHYI_INACTIVE | PHYI_OFFLINE)) {
10468 ASSERT(!IS_IPMP(ill));
10469 /*
10470 * It's possible the ill is part of an "anonymous"
10471 * IPMP group rather than a real group. In that case,
10472 * there are no other interfaces in the group and thus
10473 * no need to call ipmp_phyint_refresh_active().
10474 */
10475 if (IS_UNDER_IPMP(ill))
10476 ipmp_phyint_refresh_active(phyi);
10477 }
10478
10479 if (phyint_flags_modified) {
10480 if (phyi->phyint_illv4 != NULL) {
10481 ip_rts_ifmsg(phyi->phyint_illv4->
10482 ill_ipif, RTSQ_DEFAULT);
10483 }
10484 if (phyi->phyint_illv6 != NULL) {
10485 ip_rts_ifmsg(phyi->phyint_illv6->
10486 ill_ipif, RTSQ_DEFAULT);
10487 }
10488 }
10489 /* The default multicast interface might have changed */
10490 ire_increment_multicast_generation(ill->ill_ipst,
10491 ill->ill_isv6);
10492
10493 return (0);
	} else if (set_linklocal) {
		mutex_enter(&ill->ill_lock);
		ipif->ipif_state_flags |= IPIF_SET_LINKLOCAL;
		mutex_exit(&ill->ill_lock);
	}
10500
10501 /*
10502 * Disallow IPv6 interfaces coming up that have the unspecified address,
10503 * or point-to-point interfaces with an unspecified destination. We do
10504 * allow the address to be unspecified for IPIF_NOLOCAL interfaces that
10505 * have a subnet assigned, which is how in.ndpd currently manages its
10506 * onlink prefix list when no addresses are configured with those
10507 * prefixes.
10508 */
10509 if (ipif->ipif_isv6 &&
10510 ((IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) &&
	    ((!(ipif->ipif_flags & IPIF_NOLOCAL) &&
	    !(turn_on & IPIF_NOLOCAL)) ||
10512 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) ||
10513 ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
10514 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6pp_dst_addr)))) {
10515 return (EINVAL);
10516 }
10517
10518 /*
10519 * Prevent IPv4 point-to-point interfaces with a 0.0.0.0 destination
10520 * from being brought up.
10521 */
10522 if (!ipif->ipif_isv6 &&
10523 ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
10524 ipif->ipif_pp_dst_addr == INADDR_ANY)) {
10525 return (EINVAL);
10526 }
10527
10528 /*
10529 * If we are going to change one or more of the flags that are
10530 * IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, IPIF_NOLOCAL, ILLF_NOARP,
10531 * ILLF_NONUD, IPIF_PRIVATE, IPIF_ANYCAST, IPIF_PREFERRED, and
	 * IPIF_NOFAILOVER, we will take special action.  This is
	 * done by bringing the ipif down, changing the flags and
	 * bringing it back up again.  For IPIF_NOFAILOVER, the act of
	 * bringing it back up will trigger the address to be moved.
10536 *
10537 * If we are going to change IFF_NOACCEPT, we need to bring
10538 * all the ipifs down then bring them up again. The act of
10539 * bringing all the ipifs back up will trigger the local
10540 * ires being recreated with "no_accept" set/cleared.
10541 *
10542 * Note that ILLF_NOACCEPT is always set separately from the
10543 * other flags.
10544 */
10545 if ((turn_on|turn_off) &
10546 (IPIF_UP|IPIF_DEPRECATED|IPIF_NOXMIT|IPIF_NOLOCAL|ILLF_NOARP|
10547 ILLF_NONUD|IPIF_PRIVATE|IPIF_ANYCAST|IPIF_PREFERRED|
10548 IPIF_NOFAILOVER)) {
10549 /*
10550 * ipif_down() will ire_delete bcast ire's for the subnet,
10551 * while the ire_identical_ref tracks the case of IRE_BROADCAST
10552 * entries shared between multiple ipifs on the same subnet.
10553 */
10554 if (((ipif->ipif_flags | turn_on) & IPIF_UP) &&
10555 !(turn_off & IPIF_UP)) {
10556 if (ipif->ipif_flags & IPIF_UP)
10557 ill->ill_logical_down = 1;
10558 turn_on &= ~IPIF_UP;
10559 }
10560 err = ipif_down(ipif, q, mp);
10561 ip1dbg(("ipif_down returns %d err ", err));
10562 if (err == EINPROGRESS)
10563 return (err);
10564 (void) ipif_down_tail(ipif);
10565 } else if ((turn_on|turn_off) & ILLF_NOACCEPT) {
10566 /*
10567 * If we can quiesce the ill, then continue. If not, then
10568 * ip_sioctl_flags_tail() will be called from
10569 * ipif_ill_refrele_tail().
10570 */
10571 ill_down_ipifs(ill, B_TRUE);
10572
10573 mutex_enter(&connp->conn_lock);
10574 mutex_enter(&ill->ill_lock);
10575 if (!ill_is_quiescent(ill)) {
10576 boolean_t success;
10577
10578 success = ipsq_pending_mp_add(connp, ill->ill_ipif,
10579 q, mp, ILL_DOWN);
10580 mutex_exit(&ill->ill_lock);
10581 mutex_exit(&connp->conn_lock);
10582 return (success ? EINPROGRESS : EINTR);
10583 }
10584 mutex_exit(&ill->ill_lock);
10585 mutex_exit(&connp->conn_lock);
10586 }
10587 return (ip_sioctl_flags_tail(ipif, flags, q, mp));
10588 }
10589
10590 static int
10591 ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp)
10592 {
10593 ill_t *ill;
10594 phyint_t *phyi;
10595 uint64_t turn_on, turn_off;
10596 boolean_t phyint_flags_modified = B_FALSE;
10597 int err = 0;
10598 boolean_t set_linklocal = B_FALSE;
10599
10600 ip1dbg(("ip_sioctl_flags_tail(%s:%u)\n",
10601 ipif->ipif_ill->ill_name, ipif->ipif_id));
10602
10603 ASSERT(IAM_WRITER_IPIF(ipif));
10604
10605 ill = ipif->ipif_ill;
10606 phyi = ill->ill_phyint;
10607
10608 ip_sioctl_flags_onoff(ipif, flags, &turn_on, &turn_off);
10609
10610 /*
10611 * IFF_UP is handled separately.
10612 */
10613 turn_on &= ~IFF_UP;
10614 turn_off &= ~IFF_UP;
10615
10616 if ((turn_on|turn_off) & IFF_PHYINT_FLAGS)
10617 phyint_flags_modified = B_TRUE;
10618
10619 /*
10620 * Now we change the flags. Track current value of
10621 * other flags in their respective places.
10622 */
10623 mutex_enter(&ill->ill_lock);
10624 mutex_enter(&phyi->phyint_lock);
10625 ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS);
10626 ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS);
10627 ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS);
10628 ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS);
10629 phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS);
10630 phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS);
10631 if (ipif->ipif_state_flags & IPIF_SET_LINKLOCAL) {
10632 set_linklocal = B_TRUE;
10633 ipif->ipif_state_flags &= ~IPIF_SET_LINKLOCAL;
10634 }
10635
10636 mutex_exit(&ill->ill_lock);
10637 mutex_exit(&phyi->phyint_lock);
10638
10639 if (set_linklocal)
10640 (void) ipif_setlinklocal(ipif);
10641
10642 /*
10643 * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the same to
10644 * the kernel: if any of them has been set by userland, the interface
10645 * cannot be used for data traffic.
10646 */
10647 if ((turn_on|turn_off) & (PHYI_FAILED | PHYI_INACTIVE | PHYI_OFFLINE)) {
10648 ASSERT(!IS_IPMP(ill));
10649 /*
10650 * It's possible the ill is part of an "anonymous" IPMP group
10651 * rather than a real group. In that case, there are no other
10652 * interfaces in the group and thus no need for us to call
10653 * ipmp_phyint_refresh_active().
10654 */
10655 if (IS_UNDER_IPMP(ill))
10656 ipmp_phyint_refresh_active(phyi);
10657 }
10658
10659 if ((turn_on|turn_off) & ILLF_NOACCEPT) {
10660 /*
10661 * If the ILLF_NOACCEPT flag is changed, bring up all the
10662 * ipifs that were brought down.
10663 *
		 * The routing socket messages are sent as a result of
		 * ill_up_ipifs(), which also updates SCTP's IPIF list.
10667 */
10668 err = ill_up_ipifs(ill, q, mp);
10669 } else if ((flags & IFF_UP) && !(ipif->ipif_flags & IPIF_UP)) {
10670 /*
		 * XXX ipif_up really does not know whether a phyint flag
		 * was modified or not, so it sends up only one routing
		 * socket message.  Since we don't bring up the interface
		 * and set PHYI_ flags simultaneously, it should be okay.
10676 */
10677 err = ipif_up(ipif, q, mp);
10678 } else {
10679 /*
10680 * Make sure routing socket sees all changes to the flags.
10681 * ipif_up_done* handles this when we use ipif_up.
10682 */
10683 if (phyint_flags_modified) {
10684 if (phyi->phyint_illv4 != NULL) {
10685 ip_rts_ifmsg(phyi->phyint_illv4->
10686 ill_ipif, RTSQ_DEFAULT);
10687 }
10688 if (phyi->phyint_illv6 != NULL) {
10689 ip_rts_ifmsg(phyi->phyint_illv6->
10690 ill_ipif, RTSQ_DEFAULT);
10691 }
10692 } else {
10693 ip_rts_ifmsg(ipif, RTSQ_DEFAULT);
10694 }
10695 /*
10696 * Update the flags in SCTP's IPIF list, ipif_up() will do
10697 * this in need_up case.
10698 */
10699 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE);
10700 }
10701
10702 /* The default multicast interface might have changed */
10703 ire_increment_multicast_generation(ill->ill_ipst, ill->ill_isv6);
10704 return (err);
10705 }
10706
10707 /*
10708 * Restart the flags operation now that the refcounts have dropped to zero.
10709 */
10710 /* ARGSUSED */
10711 int
10712 ip_sioctl_flags_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10713 ip_ioctl_cmd_t *ipip, void *if_req)
10714 {
10715 uint64_t flags;
10716 struct ifreq *ifr = if_req;
10717 struct lifreq *lifr = if_req;
10718 uint64_t turn_on, turn_off;
10719
10720 ip1dbg(("ip_sioctl_flags_restart(%s:%u %p)\n",
10721 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10722
10723 if (ipip->ipi_cmd_type == IF_CMD) {
10724 /* cast to uint16_t prevents unwanted sign extension */
10725 flags = (uint16_t)ifr->ifr_flags;
10726 } else {
10727 flags = lifr->lifr_flags;
10728 }
10729
10730 /*
10731 * If this function call is a result of the ILLF_NOACCEPT flag
10732 * change, do not call ipif_down_tail(). See ip_sioctl_flags().
10733 */
10734 ip_sioctl_flags_onoff(ipif, flags, &turn_on, &turn_off);
10735 if (!((turn_on|turn_off) & ILLF_NOACCEPT))
10736 (void) ipif_down_tail(ipif);
10737
10738 return (ip_sioctl_flags_tail(ipif, flags, q, mp));
10739 }
10740
10741 /*
10742 * Can operate on either a module or a driver queue.
10743 */
10744 /* ARGSUSED */
10745 int
10746 ip_sioctl_get_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10747 ip_ioctl_cmd_t *ipip, void *if_req)
10748 {
10749 /*
10750 * Has the flags been set correctly till now ?
10751 */
10752 ill_t *ill = ipif->ipif_ill;
10753 phyint_t *phyi = ill->ill_phyint;
10754
10755 ip1dbg(("ip_sioctl_get_flags(%s:%u %p)\n",
10756 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10757 ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0);
10758 ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0);
10759 ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0);
10760
10761 /*
10762 * Need a lock since some flags can be set even when there are
10763 * references to the ipif.
10764 */
10765 mutex_enter(&ill->ill_lock);
10766 if (ipip->ipi_cmd_type == IF_CMD) {
10767 struct ifreq *ifr = (struct ifreq *)if_req;
10768
10769 /* Get interface flags (low 16 only). */
10770 ifr->ifr_flags = ((ipif->ipif_flags |
10771 ill->ill_flags | phyi->phyint_flags) & 0xffff);
10772 } else {
10773 struct lifreq *lifr = (struct lifreq *)if_req;
10774
10775 /* Get interface flags. */
10776 lifr->lifr_flags = ipif->ipif_flags |
10777 ill->ill_flags | phyi->phyint_flags;
10778 }
10779 mutex_exit(&ill->ill_lock);
10780 return (0);
10781 }
10782
10783 /*
10784 * We allow the MTU to be set on an ILL, but not have it be different
10785 * for different IPIFs since we don't actually send packets on IPIFs.
10786 */
10787 /* ARGSUSED */
10788 int
10789 ip_sioctl_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10790 ip_ioctl_cmd_t *ipip, void *if_req)
10791 {
10792 int mtu;
10793 int ip_min_mtu;
10794 struct ifreq *ifr;
10795 struct lifreq *lifr;
10796 ill_t *ill;
10797
10798 ip1dbg(("ip_sioctl_mtu(%s:%u %p)\n", ipif->ipif_ill->ill_name,
10799 ipif->ipif_id, (void *)ipif));
10800 if (ipip->ipi_cmd_type == IF_CMD) {
10801 ifr = (struct ifreq *)if_req;
10802 mtu = ifr->ifr_metric;
10803 } else {
10804 lifr = (struct lifreq *)if_req;
10805 mtu = lifr->lifr_mtu;
10806 }
10807 /* Only allow for logical unit zero i.e. not on "bge0:17" */
10808 if (ipif->ipif_id != 0)
10809 return (EINVAL);
10810
10811 ill = ipif->ipif_ill;
10812 if (ipif->ipif_isv6)
10813 ip_min_mtu = IPV6_MIN_MTU;
10814 else
10815 ip_min_mtu = IP_MIN_MTU;
10816
10817 mutex_enter(&ill->ill_lock);
10818 if (mtu > ill->ill_max_frag || mtu < ip_min_mtu) {
10819 mutex_exit(&ill->ill_lock);
10820 return (EINVAL);
10821 }
10822 /* Avoid increasing ill_mc_mtu */
10823 if (ill->ill_mc_mtu > mtu)
10824 ill->ill_mc_mtu = mtu;
10825
10826 /*
10827 * The dce and fragmentation code can handle changes to ill_mtu
10828 * concurrent with sending/fragmenting packets.
10829 */
10830 ill->ill_mtu = mtu;
10831 ill->ill_flags |= ILLF_FIXEDMTU;
10832 mutex_exit(&ill->ill_lock);
10833
10834 /*
10835 * Make sure all dce_generation checks find out
10836 * that ill_mtu/ill_mc_mtu has changed.
10837 */
10838 dce_increment_all_generations(ill->ill_isv6, ill->ill_ipst);
10839
10840 /*
10841 * Refresh IPMP meta-interface MTU if necessary.
10842 */
10843 if (IS_UNDER_IPMP(ill))
10844 ipmp_illgrp_refresh_mtu(ill->ill_grp);
10845
10846 /* Update the MTU in SCTP's list */
10847 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE);
10848 return (0);
10849 }
10850
10851 /* Get interface MTU. */
10852 /* ARGSUSED */
10853 int
10854 ip_sioctl_get_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10855 ip_ioctl_cmd_t *ipip, void *if_req)
10856 {
10857 struct ifreq *ifr;
10858 struct lifreq *lifr;
10859
10860 ip1dbg(("ip_sioctl_get_mtu(%s:%u %p)\n",
10861 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10862
10863 /*
10864 * We allow a get on any logical interface even though the set
10865 * can only be done on logical unit 0.
10866 */
10867 if (ipip->ipi_cmd_type == IF_CMD) {
10868 ifr = (struct ifreq *)if_req;
10869 ifr->ifr_metric = ipif->ipif_ill->ill_mtu;
10870 } else {
10871 lifr = (struct lifreq *)if_req;
10872 lifr->lifr_mtu = ipif->ipif_ill->ill_mtu;
10873 }
10874 return (0);
10875 }
10876
10877 /* Set interface broadcast address. */
10878 /* ARGSUSED2 */
10879 int
10880 ip_sioctl_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10881 ip_ioctl_cmd_t *ipip, void *if_req)
10882 {
10883 ipaddr_t addr;
10884 ire_t *ire;
10885 ill_t *ill = ipif->ipif_ill;
10886 ip_stack_t *ipst = ill->ill_ipst;
10887
10888 ip1dbg(("ip_sioctl_brdaddr(%s:%u)\n", ill->ill_name,
10889 ipif->ipif_id));
10890
10891 ASSERT(IAM_WRITER_IPIF(ipif));
10892 if (!(ipif->ipif_flags & IPIF_BROADCAST))
10893 return (EADDRNOTAVAIL);
10894
10895 ASSERT(!(ipif->ipif_isv6)); /* No IPv6 broadcast */
10896
10897 if (sin->sin_family != AF_INET)
10898 return (EAFNOSUPPORT);
10899
10900 addr = sin->sin_addr.s_addr;
10901
10902 if (ipif->ipif_flags & IPIF_UP) {
10903 /*
10904 * If we are already up, make sure the new
10905 * broadcast address makes sense. If it does,
10906 * there should be an IRE for it already.
10907 */
10908 ire = ire_ftable_lookup_v4(addr, 0, 0, IRE_BROADCAST,
10909 ill, ipif->ipif_zoneid, NULL,
10910 (MATCH_IRE_ILL | MATCH_IRE_TYPE), 0, ipst, NULL);
10911 if (ire == NULL) {
10912 return (EINVAL);
10913 } else {
10914 ire_refrele(ire);
10915 }
10916 }
10917 /*
10918 * Changing the broadcast addr for this ipif. Since the IRE_BROADCAST
10919 * needs to already exist we never need to change the set of
10920 * IRE_BROADCASTs when we are UP.
10921 */
10922 if (addr != ipif->ipif_brd_addr)
10923 IN6_IPADDR_TO_V4MAPPED(addr, &ipif->ipif_v6brd_addr);
10924
10925 return (0);
10926 }
10927
10928 /* Get interface broadcast address. */
10929 /* ARGSUSED */
10930 int
10931 ip_sioctl_get_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10932 ip_ioctl_cmd_t *ipip, void *if_req)
10933 {
10934 ip1dbg(("ip_sioctl_get_brdaddr(%s:%u %p)\n",
10935 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10936 if (!(ipif->ipif_flags & IPIF_BROADCAST))
10937 return (EADDRNOTAVAIL);
10938
10939 /* IPIF_BROADCAST not possible with IPv6 */
10940 ASSERT(!ipif->ipif_isv6);
10941 *sin = sin_null;
10942 sin->sin_family = AF_INET;
10943 sin->sin_addr.s_addr = ipif->ipif_brd_addr;
10944 return (0);
10945 }
10946
10947 /*
10948 * This routine is called to handle the SIOCS*IFNETMASK IOCTL.
10949 */
10950 /* ARGSUSED */
10951 int
10952 ip_sioctl_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10953 ip_ioctl_cmd_t *ipip, void *if_req)
10954 {
10955 int err = 0;
10956 in6_addr_t v6mask;
10957
10958 ip1dbg(("ip_sioctl_netmask(%s:%u %p)\n",
10959 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10960
10961 ASSERT(IAM_WRITER_IPIF(ipif));
10962
10963 if (ipif->ipif_isv6) {
10964 sin6_t *sin6;
10965
10966 if (sin->sin_family != AF_INET6)
10967 return (EAFNOSUPPORT);
10968
10969 sin6 = (sin6_t *)sin;
10970 v6mask = sin6->sin6_addr;
10971 } else {
10972 ipaddr_t mask;
10973
10974 if (sin->sin_family != AF_INET)
10975 return (EAFNOSUPPORT);
10976
10977 mask = sin->sin_addr.s_addr;
10978 if (!ip_contiguous_mask(ntohl(mask)))
10979 return (ENOTSUP);
10980 V4MASK_TO_V6(mask, v6mask);
10981 }
10982
10983 /*
10984 * No big deal if the interface isn't already up, or the mask
10985 * isn't really changing, or this is pt-pt.
10986 */
10987 if (!(ipif->ipif_flags & IPIF_UP) ||
10988 IN6_ARE_ADDR_EQUAL(&v6mask, &ipif->ipif_v6net_mask) ||
10989 (ipif->ipif_flags & IPIF_POINTOPOINT)) {
10990 ipif->ipif_v6net_mask = v6mask;
10991 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
10992 V6_MASK_COPY(ipif->ipif_v6lcl_addr,
10993 ipif->ipif_v6net_mask,
10994 ipif->ipif_v6subnet);
10995 }
10996 return (0);
10997 }
10998 /*
10999 * Make sure we have valid net and subnet broadcast ire's
11000 * for the old netmask, if needed by other logical interfaces.
11001 */
11002 err = ipif_logical_down(ipif, q, mp);
11003 if (err == EINPROGRESS)
11004 return (err);
11005 (void) ipif_down_tail(ipif);
11006 err = ip_sioctl_netmask_tail(ipif, sin, q, mp);
11007 return (err);
11008 }
11009
11010 static int
11011 ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp)
11012 {
11013 in6_addr_t v6mask;
11014 int err = 0;
11015
11016 ip1dbg(("ip_sioctl_netmask_tail(%s:%u %p)\n",
11017 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11018
11019 if (ipif->ipif_isv6) {
11020 sin6_t *sin6;
11021
11022 sin6 = (sin6_t *)sin;
11023 v6mask = sin6->sin6_addr;
11024 } else {
11025 ipaddr_t mask;
11026
11027 mask = sin->sin_addr.s_addr;
11028 V4MASK_TO_V6(mask, v6mask);
11029 }
11030
11031 ipif->ipif_v6net_mask = v6mask;
11032 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
11033 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
11034 ipif->ipif_v6subnet);
11035 }
11036 err = ipif_up(ipif, q, mp);
11037
11038 if (err == 0 || err == EINPROGRESS) {
11039 /*
11040 * The interface must be DL_BOUND if this packet has to
11041 * go out on the wire. Since we only go through a logical
11042 * down and are bound with the driver during an internal
11043 * down/up that is satisfied.
11044 */
11045 if (!ipif->ipif_isv6 && ipif->ipif_ill->ill_wq != NULL) {
11046 /* Potentially broadcast an address mask reply. */
11047 ipif_mask_reply(ipif);
11048 }
11049 }
11050 return (err);
11051 }
11052
11053 /* ARGSUSED */
11054 int
11055 ip_sioctl_netmask_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11056 ip_ioctl_cmd_t *ipip, void *if_req)
11057 {
11058 ip1dbg(("ip_sioctl_netmask_restart(%s:%u %p)\n",
11059 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11060 (void) ipif_down_tail(ipif);
11061 return (ip_sioctl_netmask_tail(ipif, sin, q, mp));
11062 }
11063
11064 /* Get interface net mask. */
11065 /* ARGSUSED */
11066 int
11067 ip_sioctl_get_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11068 ip_ioctl_cmd_t *ipip, void *if_req)
11069 {
11070 struct lifreq *lifr = (struct lifreq *)if_req;
11071 struct sockaddr_in6 *sin6 = (sin6_t *)sin;
11072
11073 ip1dbg(("ip_sioctl_get_netmask(%s:%u %p)\n",
11074 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11075
11076 /*
11077 * net mask can't change since we have a reference to the ipif.
11078 */
11079 if (ipif->ipif_isv6) {
11080 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
11081 *sin6 = sin6_null;
11082 sin6->sin6_family = AF_INET6;
11083 sin6->sin6_addr = ipif->ipif_v6net_mask;
11084 lifr->lifr_addrlen =
11085 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask);
11086 } else {
11087 *sin = sin_null;
11088 sin->sin_family = AF_INET;
11089 sin->sin_addr.s_addr = ipif->ipif_net_mask;
11090 if (ipip->ipi_cmd_type == LIF_CMD) {
11091 lifr->lifr_addrlen =
11092 ip_mask_to_plen(ipif->ipif_net_mask);
11093 }
11094 }
11095 return (0);
11096 }
11097
11098 /* ARGSUSED */
11099 int
11100 ip_sioctl_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11101 ip_ioctl_cmd_t *ipip, void *if_req)
11102 {
11103 ip1dbg(("ip_sioctl_metric(%s:%u %p)\n",
11104 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11105
11106 /*
11107 * Since no applications should ever be setting metrics on underlying
11108 * interfaces, we explicitly fail to smoke 'em out.
11109 */
11110 if (IS_UNDER_IPMP(ipif->ipif_ill))
11111 return (EINVAL);
11112
11113 /*
11114 * Set interface metric. We don't use this for
11115 * anything but we keep track of it in case it is
11116 * important to routing applications or such.
11117 */
11118 if (ipip->ipi_cmd_type == IF_CMD) {
11119 struct ifreq *ifr;
11120
11121 ifr = (struct ifreq *)if_req;
11122 ipif->ipif_ill->ill_metric = ifr->ifr_metric;
11123 } else {
11124 struct lifreq *lifr;
11125
11126 lifr = (struct lifreq *)if_req;
11127 ipif->ipif_ill->ill_metric = lifr->lifr_metric;
11128 }
11129 return (0);
11130 }
11131
11132 /* ARGSUSED */
11133 int
11134 ip_sioctl_get_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11135 ip_ioctl_cmd_t *ipip, void *if_req)
11136 {
11137 /* Get interface metric. */
11138 ip1dbg(("ip_sioctl_get_metric(%s:%u %p)\n",
11139 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11140
11141 if (ipip->ipi_cmd_type == IF_CMD) {
11142 struct ifreq *ifr;
11143
11144 ifr = (struct ifreq *)if_req;
11145 ifr->ifr_metric = ipif->ipif_ill->ill_metric;
11146 } else {
11147 struct lifreq *lifr;
11148
11149 lifr = (struct lifreq *)if_req;
11150 lifr->lifr_metric = ipif->ipif_ill->ill_metric;
11151 }
11152
11153 return (0);
11154 }
11155
11156 /* ARGSUSED */
11157 int
11158 ip_sioctl_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11159 ip_ioctl_cmd_t *ipip, void *if_req)
11160 {
11161 int arp_muxid;
11162
11163 ip1dbg(("ip_sioctl_muxid(%s:%u %p)\n",
11164 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11165 /*
11166 * Set the muxid returned from I_PLINK.
11167 */
11168 if (ipip->ipi_cmd_type == IF_CMD) {
11169 struct ifreq *ifr = (struct ifreq *)if_req;
11170
11171 ipif->ipif_ill->ill_muxid = ifr->ifr_ip_muxid;
11172 arp_muxid = ifr->ifr_arp_muxid;
11173 } else {
11174 struct lifreq *lifr = (struct lifreq *)if_req;
11175
11176 ipif->ipif_ill->ill_muxid = lifr->lifr_ip_muxid;
11177 arp_muxid = lifr->lifr_arp_muxid;
11178 }
11179 arl_set_muxid(ipif->ipif_ill, arp_muxid);
11180 return (0);
11181 }
11182
11183 /* ARGSUSED */
11184 int
11185 ip_sioctl_get_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11186 ip_ioctl_cmd_t *ipip, void *if_req)
11187 {
11188 int arp_muxid = 0;
11189
11190 ip1dbg(("ip_sioctl_get_muxid(%s:%u %p)\n",
11191 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11192 /*
11193 * Get the muxid saved in ill for I_PUNLINK.
11194 */
11195 arp_muxid = arl_get_muxid(ipif->ipif_ill);
11196 if (ipip->ipi_cmd_type == IF_CMD) {
11197 struct ifreq *ifr = (struct ifreq *)if_req;
11198
11199 ifr->ifr_ip_muxid = ipif->ipif_ill->ill_muxid;
11200 ifr->ifr_arp_muxid = arp_muxid;
11201 } else {
11202 struct lifreq *lifr = (struct lifreq *)if_req;
11203
11204 lifr->lifr_ip_muxid = ipif->ipif_ill->ill_muxid;
11205 lifr->lifr_arp_muxid = arp_muxid;
11206 }
11207 return (0);
11208 }
11209
11210 /*
11211 * Set the subnet prefix. Does not modify the broadcast address.
11212 */
11213 /* ARGSUSED */
11214 int
11215 ip_sioctl_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11216 ip_ioctl_cmd_t *ipip, void *if_req)
11217 {
11218 int err = 0;
11219 in6_addr_t v6addr;
11220 in6_addr_t v6mask;
11221 boolean_t need_up = B_FALSE;
11222 int addrlen;
11223
11224 ip1dbg(("ip_sioctl_subnet(%s:%u %p)\n",
11225 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11226
11227 ASSERT(IAM_WRITER_IPIF(ipif));
11228 addrlen = ((struct lifreq *)if_req)->lifr_addrlen;
11229
11230 if (ipif->ipif_isv6) {
11231 sin6_t *sin6;
11232
11233 if (sin->sin_family != AF_INET6)
11234 return (EAFNOSUPPORT);
11235
11236 sin6 = (sin6_t *)sin;
11237 v6addr = sin6->sin6_addr;
11238 if (!ip_remote_addr_ok_v6(&v6addr, &ipv6_all_ones))
11239 return (EADDRNOTAVAIL);
11240 } else {
11241 ipaddr_t addr;
11242
11243 if (sin->sin_family != AF_INET)
11244 return (EAFNOSUPPORT);
11245
11246 addr = sin->sin_addr.s_addr;
11247 if (!ip_addr_ok_v4(addr, 0xFFFFFFFF))
11248 return (EADDRNOTAVAIL);
11249 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
11250 /* Add 96 bits */
11251 addrlen += IPV6_ABITS - IP_ABITS;
11252 }
11253
11254 if (ip_plen_to_mask_v6(addrlen, &v6mask) == NULL)
11255 return (EINVAL);
11256
	/* Check whether any bits in the address are set past the mask */
11258 if (!V6_MASK_EQ(v6addr, v6mask, v6addr))
11259 return (EINVAL);
11260
11261 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6subnet, &v6addr) &&
11262 IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6net_mask, &v6mask))
11263 return (0); /* No change */
11264
11265 if (ipif->ipif_flags & IPIF_UP) {
11266 /*
11267 * If the interface is already marked up,
11268 * we call ipif_down which will take care
11269 * of ditching any IREs that have been set
11270 * up based on the old interface address.
11271 */
11272 err = ipif_logical_down(ipif, q, mp);
11273 if (err == EINPROGRESS)
11274 return (err);
11275 (void) ipif_down_tail(ipif);
11276 need_up = B_TRUE;
11277 }
11278
11279 err = ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, need_up);
11280 return (err);
11281 }
11282
11283 static int
11284 ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t v6addr, in6_addr_t v6mask,
11285 queue_t *q, mblk_t *mp, boolean_t need_up)
11286 {
11287 ill_t *ill = ipif->ipif_ill;
11288 int err = 0;
11289
11290 ip1dbg(("ip_sioctl_subnet_tail(%s:%u %p)\n",
11291 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11292
11293 /* Set the new address. */
11294 mutex_enter(&ill->ill_lock);
11295 ipif->ipif_v6net_mask = v6mask;
11296 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
11297 V6_MASK_COPY(v6addr, ipif->ipif_v6net_mask,
11298 ipif->ipif_v6subnet);
11299 }
11300 mutex_exit(&ill->ill_lock);
11301
11302 if (need_up) {
11303 /*
11304 * Now bring the interface back up. If this
11305 * is the only IPIF for the ILL, ipif_up
11306 * will have to re-bind to the device, so
11307 * we may get back EINPROGRESS, in which
11308 * case, this IOCTL will get completed in
11309 * ip_rput_dlpi when we see the DL_BIND_ACK.
11310 */
11311 err = ipif_up(ipif, q, mp);
11312 if (err == EINPROGRESS)
11313 return (err);
11314 }
11315 return (err);
11316 }
11317
11318 /* ARGSUSED */
11319 int
11320 ip_sioctl_subnet_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11321 ip_ioctl_cmd_t *ipip, void *if_req)
11322 {
11323 int addrlen;
11324 in6_addr_t v6addr;
11325 in6_addr_t v6mask;
11326 struct lifreq *lifr = (struct lifreq *)if_req;
11327
11328 ip1dbg(("ip_sioctl_subnet_restart(%s:%u %p)\n",
11329 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11330 (void) ipif_down_tail(ipif);
11331
11332 addrlen = lifr->lifr_addrlen;
11333 if (ipif->ipif_isv6) {
11334 sin6_t *sin6;
11335
11336 sin6 = (sin6_t *)sin;
11337 v6addr = sin6->sin6_addr;
11338 } else {
11339 ipaddr_t addr;
11340
11341 addr = sin->sin_addr.s_addr;
11342 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
11343 addrlen += IPV6_ABITS - IP_ABITS;
11344 }
11345 (void) ip_plen_to_mask_v6(addrlen, &v6mask);
11346
11347 return (ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, B_TRUE));
11348 }
11349
11350 /* ARGSUSED */
11351 int
11352 ip_sioctl_get_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11353 ip_ioctl_cmd_t *ipip, void *if_req)
11354 {
11355 struct lifreq *lifr = (struct lifreq *)if_req;
11356 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sin;
11357
11358 ip1dbg(("ip_sioctl_get_subnet(%s:%u %p)\n",
11359 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11360 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
11361
11362 if (ipif->ipif_isv6) {
11363 *sin6 = sin6_null;
11364 sin6->sin6_family = AF_INET6;
11365 sin6->sin6_addr = ipif->ipif_v6subnet;
11366 lifr->lifr_addrlen =
11367 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask);
11368 } else {
11369 *sin = sin_null;
11370 sin->sin_family = AF_INET;
11371 sin->sin_addr.s_addr = ipif->ipif_subnet;
11372 lifr->lifr_addrlen = ip_mask_to_plen(ipif->ipif_net_mask);
11373 }
11374 return (0);
11375 }
11376
11377 /*
11378 * Set the IPv6 address token.
11379 */
11380 /* ARGSUSED */
11381 int
11382 ip_sioctl_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11383 ip_ioctl_cmd_t *ipi, void *if_req)
11384 {
11385 ill_t *ill = ipif->ipif_ill;
11386 int err;
11387 in6_addr_t v6addr;
11388 in6_addr_t v6mask;
11389 boolean_t need_up = B_FALSE;
11390 int i;
11391 sin6_t *sin6 = (sin6_t *)sin;
11392 struct lifreq *lifr = (struct lifreq *)if_req;
11393 int addrlen;
11394
11395 ip1dbg(("ip_sioctl_token(%s:%u %p)\n",
11396 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11397 ASSERT(IAM_WRITER_IPIF(ipif));
11398
11399 addrlen = lifr->lifr_addrlen;
	/* Only allowed on logical unit zero, i.e. not on "le0:17" */
11401 if (ipif->ipif_id != 0)
11402 return (EINVAL);
11403
11404 if (!ipif->ipif_isv6)
11405 return (EINVAL);
11406
11407 if (addrlen > IPV6_ABITS)
11408 return (EINVAL);
11409
11410 v6addr = sin6->sin6_addr;
11411
11412 /*
11413 * The length of the token is the length from the end. To get
11414 * the proper mask for this, compute the mask of the bits not
11415 * in the token; ie. the prefix, and then xor to get the mask.
11416 */
11417 if (ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask) == NULL)
11418 return (EINVAL);
11419 for (i = 0; i < 4; i++) {
11420 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff;
11421 }
11422
11423 if (V6_MASK_EQ(v6addr, v6mask, ill->ill_token) &&
11424 ill->ill_token_length == addrlen)
11425 return (0); /* No change */
11426
11427 if (ipif->ipif_flags & IPIF_UP) {
11428 err = ipif_logical_down(ipif, q, mp);
11429 if (err == EINPROGRESS)
11430 return (err);
11431 (void) ipif_down_tail(ipif);
11432 need_up = B_TRUE;
11433 }
11434 err = ip_sioctl_token_tail(ipif, sin6, addrlen, q, mp, need_up);
11435 return (err);
11436 }
11437
11438 static int
11439 ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen, queue_t *q,
11440 mblk_t *mp, boolean_t need_up)
11441 {
11442 in6_addr_t v6addr;
11443 in6_addr_t v6mask;
11444 ill_t *ill = ipif->ipif_ill;
11445 int i;
11446 int err = 0;
11447
11448 ip1dbg(("ip_sioctl_token_tail(%s:%u %p)\n",
11449 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11450 v6addr = sin6->sin6_addr;
11451 /*
11452 * The length of the token is the length from the end. To get
11453 * the proper mask for this, compute the mask of the bits not
11454 * in the token; ie. the prefix, and then xor to get the mask.
11455 */
11456 (void) ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask);
11457 for (i = 0; i < 4; i++)
11458 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff;
11459
11460 mutex_enter(&ill->ill_lock);
11461 V6_MASK_COPY(v6addr, v6mask, ill->ill_token);
11462 ill->ill_token_length = addrlen;
11463 ill->ill_manual_token = 1;
11464
11465 /* Reconfigure the link-local address based on this new token */
11466 ipif_setlinklocal(ill->ill_ipif);
11467
11468 mutex_exit(&ill->ill_lock);
11469
11470 if (need_up) {
11471 /*
11472 * Now bring the interface back up. If this
11473 * is the only IPIF for the ILL, ipif_up
11474 * will have to re-bind to the device, so
11475 * we may get back EINPROGRESS, in which
11476 * case, this IOCTL will get completed in
11477 * ip_rput_dlpi when we see the DL_BIND_ACK.
11478 */
11479 err = ipif_up(ipif, q, mp);
11480 if (err == EINPROGRESS)
11481 return (err);
11482 }
11483 return (err);
11484 }
11485
11486 /* ARGSUSED */
11487 int
11488 ip_sioctl_get_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11489 ip_ioctl_cmd_t *ipi, void *if_req)
11490 {
11491 ill_t *ill;
11492 sin6_t *sin6 = (sin6_t *)sin;
11493 struct lifreq *lifr = (struct lifreq *)if_req;
11494
11495 ip1dbg(("ip_sioctl_get_token(%s:%u %p)\n",
11496 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11497 if (ipif->ipif_id != 0)
11498 return (EINVAL);
11499
11500 ill = ipif->ipif_ill;
11501 if (!ill->ill_isv6)
11502 return (ENXIO);
11503
11504 *sin6 = sin6_null;
11505 sin6->sin6_family = AF_INET6;
11506 ASSERT(!IN6_IS_ADDR_V4MAPPED(&ill->ill_token));
11507 sin6->sin6_addr = ill->ill_token;
11508 lifr->lifr_addrlen = ill->ill_token_length;
11509 return (0);
11510 }
11511
11512 /*
11513 * Set (hardware) link specific information that might override
11514 * what was acquired through the DL_INFO_ACK.
11515 */
11516 /* ARGSUSED */
11517 int
11518 ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11519 ip_ioctl_cmd_t *ipi, void *if_req)
11520 {
11521 ill_t *ill = ipif->ipif_ill;
11522 int ip_min_mtu;
11523 struct lifreq *lifr = (struct lifreq *)if_req;
11524 lif_ifinfo_req_t *lir;
11525
11526 ip1dbg(("ip_sioctl_lnkinfo(%s:%u %p)\n",
11527 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11528 lir = &lifr->lifr_ifinfo;
11529 ASSERT(IAM_WRITER_IPIF(ipif));
11530
	/* Only allowed on logical unit zero, i.e. not on "bge0:17" */
11532 if (ipif->ipif_id != 0)
11533 return (EINVAL);
11534
11535 /* Set interface MTU. */
11536 if (ipif->ipif_isv6)
11537 ip_min_mtu = IPV6_MIN_MTU;
11538 else
11539 ip_min_mtu = IP_MIN_MTU;
11540
11541 /*
11542 * Verify values before we set anything. Allow zero to
11543 * mean unspecified.
11544 *
11545 * XXX We should be able to set the user-defined lir_mtu to some value
11546 * that is greater than ill_current_frag but less than ill_max_frag- the
11547 * ill_max_frag value tells us the max MTU that can be handled by the
11548 * datalink, whereas the ill_current_frag is dynamically computed for
11549 * some link-types like tunnels, based on the tunnel PMTU. However,
11550 * since there is currently no way of distinguishing between
11551 * administratively fixed link mtu values (e.g., those set via
11552 * /sbin/dladm) and dynamically discovered MTUs (e.g., those discovered
11553 * for tunnels) we conservatively choose the ill_current_frag as the
11554 * upper-bound.
11555 */
11556 if (lir->lir_maxmtu != 0 &&
11557 (lir->lir_maxmtu > ill->ill_current_frag ||
11558 lir->lir_maxmtu < ip_min_mtu))
11559 return (EINVAL);
11560 if (lir->lir_reachtime != 0 &&
11561 lir->lir_reachtime > ND_MAX_REACHTIME)
11562 return (EINVAL);
11563 if (lir->lir_reachretrans != 0 &&
11564 lir->lir_reachretrans > ND_MAX_REACHRETRANSTIME)
11565 return (EINVAL);
11566
11567 mutex_enter(&ill->ill_lock);
11568 /*
11569 * The dce and fragmentation code can handle changes to ill_mtu
11570 * concurrent with sending/fragmenting packets.
11571 */
11572 if (lir->lir_maxmtu != 0)
11573 ill->ill_user_mtu = lir->lir_maxmtu;
11574
11575 if (lir->lir_reachtime != 0)
11576 ill->ill_reachable_time = lir->lir_reachtime;
11577
11578 if (lir->lir_reachretrans != 0)
11579 ill->ill_reachable_retrans_time = lir->lir_reachretrans;
11580
11581 ill->ill_max_hops = lir->lir_maxhops;
11582 ill->ill_max_buf = ND_MAX_Q;
11583 if (!(ill->ill_flags & ILLF_FIXEDMTU) && ill->ill_user_mtu != 0) {
11584 /*
11585 * ill_mtu is the actual interface MTU, obtained as the min
11586 * of user-configured mtu and the value announced by the
11587 * driver (via DL_NOTE_SDU_SIZE/DL_INFO_ACK). Note that since
11588 * we have already made the choice of requiring
11589 * ill_user_mtu < ill_current_frag by the time we get here,
11590 * the ill_mtu effectively gets assigned to the ill_user_mtu
11591 * here.
11592 */
11593 ill->ill_mtu = MIN(ill->ill_current_frag, ill->ill_user_mtu);
11594 ill->ill_mc_mtu = MIN(ill->ill_mc_mtu, ill->ill_user_mtu);
11595 }
11596 mutex_exit(&ill->ill_lock);
11597
11598 /*
11599 * Make sure all dce_generation checks find out
11600 * that ill_mtu/ill_mc_mtu has changed.
11601 */
11602 if (!(ill->ill_flags & ILLF_FIXEDMTU) && (lir->lir_maxmtu != 0))
11603 dce_increment_all_generations(ill->ill_isv6, ill->ill_ipst);
11604
11605 /*
11606 * Refresh IPMP meta-interface MTU if necessary.
11607 */
11608 if (IS_UNDER_IPMP(ill))
11609 ipmp_illgrp_refresh_mtu(ill->ill_grp);
11610
11611 return (0);
11612 }
11613
11614 /* ARGSUSED */
11615 int
11616 ip_sioctl_get_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11617 ip_ioctl_cmd_t *ipi, void *if_req)
11618 {
11619 struct lif_ifinfo_req *lir;
11620 ill_t *ill = ipif->ipif_ill;
11621
11622 ip1dbg(("ip_sioctl_get_lnkinfo(%s:%u %p)\n",
11623 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11624 if (ipif->ipif_id != 0)
11625 return (EINVAL);
11626
11627 lir = &((struct lifreq *)if_req)->lifr_ifinfo;
11628 lir->lir_maxhops = ill->ill_max_hops;
11629 lir->lir_reachtime = ill->ill_reachable_time;
11630 lir->lir_reachretrans = ill->ill_reachable_retrans_time;
11631 lir->lir_maxmtu = ill->ill_mtu;
11632
11633 return (0);
11634 }
11635
11636 /*
11637 * Return best guess as to the subnet mask for the specified address.
11638 * Based on the subnet masks for all the configured interfaces.
11639 *
11640 * We end up returning a zero mask in the case of default, multicast or
11641 * experimental.
11642 */
11643 static ipaddr_t
11644 ip_subnet_mask(ipaddr_t addr, ipif_t **ipifp, ip_stack_t *ipst)
11645 {
11646 ipaddr_t net_mask;
11647 ill_t *ill;
11648 ipif_t *ipif;
11649 ill_walk_context_t ctx;
11650 ipif_t *fallback_ipif = NULL;
11651
11652 net_mask = ip_net_mask(addr);
11653 if (net_mask == 0) {
11654 *ipifp = NULL;
11655 return (0);
11656 }
11657
	/*
	 * Check to see if this is maybe a local subnet route; note that
	 * this function only applies to IPv4 interfaces.
	 */
11660 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
11661 ill = ILL_START_WALK_V4(&ctx, ipst);
11662 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
11663 mutex_enter(&ill->ill_lock);
11664 for (ipif = ill->ill_ipif; ipif != NULL;
11665 ipif = ipif->ipif_next) {
11666 if (IPIF_IS_CONDEMNED(ipif))
11667 continue;
11668 if (!(ipif->ipif_flags & IPIF_UP))
11669 continue;
11670 if ((ipif->ipif_subnet & net_mask) ==
11671 (addr & net_mask)) {
11672 /*
11673 * Don't trust pt-pt interfaces if there are
11674 * other interfaces.
11675 */
11676 if (ipif->ipif_flags & IPIF_POINTOPOINT) {
11677 if (fallback_ipif == NULL) {
11678 ipif_refhold_locked(ipif);
11679 fallback_ipif = ipif;
11680 }
11681 continue;
11682 }
11683
11684 /*
11685 * Fine. Just assume the same net mask as the
11686 * directly attached subnet interface is using.
11687 */
11688 ipif_refhold_locked(ipif);
11689 mutex_exit(&ill->ill_lock);
11690 rw_exit(&ipst->ips_ill_g_lock);
11691 if (fallback_ipif != NULL)
11692 ipif_refrele(fallback_ipif);
11693 *ipifp = ipif;
11694 return (ipif->ipif_net_mask);
11695 }
11696 }
11697 mutex_exit(&ill->ill_lock);
11698 }
11699 rw_exit(&ipst->ips_ill_g_lock);
11700
11701 *ipifp = fallback_ipif;
11702 return ((fallback_ipif != NULL) ?
11703 fallback_ipif->ipif_net_mask : net_mask);
11704 }
11705
11706 /*
11707 * ip_sioctl_copyin_setup calls ip_wput_ioctl to process the IP_IOCTL ioctl.
11708 */
11709 static void
11710 ip_wput_ioctl(queue_t *q, mblk_t *mp)
11711 {
11712 IOCP iocp;
11713 ipft_t *ipft;
11714 ipllc_t *ipllc;
11715 mblk_t *mp1;
11716 cred_t *cr;
11717 int error = 0;
11718 conn_t *connp;
11719
11720 ip1dbg(("ip_wput_ioctl"));
11721 iocp = (IOCP)mp->b_rptr;
11722 mp1 = mp->b_cont;
11723 if (mp1 == NULL) {
11724 iocp->ioc_error = EINVAL;
11725 mp->b_datap->db_type = M_IOCNAK;
11726 iocp->ioc_count = 0;
11727 qreply(q, mp);
11728 return;
11729 }
11730
11731 /*
11732 * These IOCTLs provide various control capabilities to
11733 * upstream agents such as ULPs and processes. There
11734 * are currently two such IOCTLs implemented. They
11735 * are used by TCP to provide update information for
11736 * existing IREs and to forcibly delete an IRE for a
11737 * host that is not responding, thereby forcing an
11738 * attempt at a new route.
11739 */
11740 iocp->ioc_error = EINVAL;
11741 if (!pullupmsg(mp1, sizeof (ipllc->ipllc_cmd)))
11742 goto done;
11743
11744 ipllc = (ipllc_t *)mp1->b_rptr;
11745 for (ipft = ip_ioctl_ftbl; ipft->ipft_pfi; ipft++) {
11746 if (ipllc->ipllc_cmd == ipft->ipft_cmd)
11747 break;
11748 }
11749 /*
11750 * prefer credential from mblk over ioctl;
11751 * see ip_sioctl_copyin_setup
11752 */
11753 cr = msg_getcred(mp, NULL);
11754 if (cr == NULL)
11755 cr = iocp->ioc_cr;
11756
11757 /*
11758 * Refhold the conn in case the request gets queued up in some lookup
11759 */
11760 ASSERT(CONN_Q(q));
11761 connp = Q_TO_CONN(q);
11762 CONN_INC_REF(connp);
11763 CONN_INC_IOCTLREF(connp);
11764 if (ipft->ipft_pfi &&
11765 ((mp1->b_wptr - mp1->b_rptr) >= ipft->ipft_min_size ||
11766 pullupmsg(mp1, ipft->ipft_min_size))) {
11767 error = (*ipft->ipft_pfi)(q,
11768 (ipft->ipft_flags & IPFT_F_SELF_REPLY) ? mp : mp1, cr);
11769 }
11770 if (ipft->ipft_flags & IPFT_F_SELF_REPLY) {
11771 /*
11772 * CONN_OPER_PENDING_DONE happens in the function called
11773 * through ipft_pfi above.
11774 */
11775 return;
11776 }
11777
11778 CONN_DEC_IOCTLREF(connp);
11779 CONN_OPER_PENDING_DONE(connp);
11780 if (ipft->ipft_flags & IPFT_F_NO_REPLY) {
11781 freemsg(mp);
11782 return;
11783 }
11784 iocp->ioc_error = error;
11785
11786 done:
11787 mp->b_datap->db_type = M_IOCACK;
11788 if (iocp->ioc_error)
11789 iocp->ioc_count = 0;
11790 qreply(q, mp);
11791 }
11792
11793 /*
11794 * Assign a unique id for the ipif. This is used by sctp_addr.c
11795 * Note: remove if sctp_addr.c is redone to not shadow ill/ipif data structures.
11796 */
11797 static void
11798 ipif_assign_seqid(ipif_t *ipif)
11799 {
11800 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
11801
11802 ipif->ipif_seqid = atomic_add_64_nv(&ipst->ips_ipif_g_seqid, 1);
11803 }
11804
11805 /*
11806 * Clone the contents of `sipif' to `dipif'. Requires that both ipifs are
11807 * administratively down (i.e., no DAD), of the same type, and locked. Note
11808 * that the clone is complete -- including the seqid -- and the expectation is
11809 * that the caller will either free or overwrite `sipif' before it's unlocked.
11810 */
11811 static void
11812 ipif_clone(const ipif_t *sipif, ipif_t *dipif)
11813 {
11814 ASSERT(MUTEX_HELD(&sipif->ipif_ill->ill_lock));
11815 ASSERT(MUTEX_HELD(&dipif->ipif_ill->ill_lock));
11816 ASSERT(!(sipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE)));
11817 ASSERT(!(dipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE)));
11818 ASSERT(sipif->ipif_ire_type == dipif->ipif_ire_type);
11819
11820 dipif->ipif_flags = sipif->ipif_flags;
11821 dipif->ipif_zoneid = sipif->ipif_zoneid;
11822 dipif->ipif_v6subnet = sipif->ipif_v6subnet;
11823 dipif->ipif_v6lcl_addr = sipif->ipif_v6lcl_addr;
11824 dipif->ipif_v6net_mask = sipif->ipif_v6net_mask;
11825 dipif->ipif_v6brd_addr = sipif->ipif_v6brd_addr;
11826 dipif->ipif_v6pp_dst_addr = sipif->ipif_v6pp_dst_addr;
11827
11828 /*
11829 * As per the comment atop the function, we assume that these sipif
11830 * fields will be changed before sipif is unlocked.
11831 */
11832 dipif->ipif_seqid = sipif->ipif_seqid;
11833 dipif->ipif_state_flags = sipif->ipif_state_flags;
11834 }
11835
11836 /*
11837 * Transfer the contents of `sipif' to `dipif', and then free (if `virgipif'
11838 * is NULL) or overwrite `sipif' with `virgipif', which must be a virgin
11839 * (unreferenced) ipif. Also, if `sipif' is used by the current xop, then
11840 * transfer the xop to `dipif'. Requires that all ipifs are administratively
11841 * down (i.e., no DAD), of the same type, and unlocked.
11842 */
11843 static void
11844 ipif_transfer(ipif_t *sipif, ipif_t *dipif, ipif_t *virgipif)
11845 {
11846 ipsq_t *ipsq = sipif->ipif_ill->ill_phyint->phyint_ipsq;
11847 ipxop_t *ipx = ipsq->ipsq_xop;
11848
11849 ASSERT(sipif != dipif);
11850 ASSERT(sipif != virgipif);
11851
11852 /*
11853 * Grab all of the locks that protect the ipif in a defined order.
11854 */
11855 GRAB_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill);
11856
11857 ipif_clone(sipif, dipif);
11858 if (virgipif != NULL) {
11859 ipif_clone(virgipif, sipif);
11860 mi_free(virgipif);
11861 }
11862
11863 RELEASE_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill);
11864
11865 /*
11866 * Transfer ownership of the current xop, if necessary.
11867 */
11868 if (ipx->ipx_current_ipif == sipif) {
11869 ASSERT(ipx->ipx_pending_ipif == NULL);
11870 mutex_enter(&ipx->ipx_lock);
11871 ipx->ipx_current_ipif = dipif;
11872 mutex_exit(&ipx->ipx_lock);
11873 }
11874
11875 if (virgipif == NULL)
11876 mi_free(sipif);
11877 }
11878
11879 /*
11880 * checks if:
11881 * - <ill_name>:<ipif_id> is at most LIFNAMSIZ - 1 and
11882 * - logical interface is within the allowed range
11883 */
11884 static int
11885 is_lifname_valid(ill_t *ill, unsigned int ipif_id)
11886 {
	if (snprintf(NULL, 0, "%s:%u", ill->ill_name, ipif_id) >= LIFNAMSIZ)
11888 return (ENAMETOOLONG);
11889
11890 if (ipif_id >= ill->ill_ipst->ips_ip_addrs_per_if)
11891 return (ERANGE);
11892 return (0);
11893 }
11894
11895 /*
11896 * Insert the ipif, so that the list of ipifs on the ill will be sorted
11897 * with respect to ipif_id. Note that an ipif with an ipif_id of -1 will
11898 * be inserted into the first space available in the list. The value of
11899 * ipif_id will then be set to the appropriate value for its position.
11900 */
11901 static int
11902 ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock)
11903 {
11904 ill_t *ill;
11905 ipif_t *tipif;
11906 ipif_t **tipifp;
11907 int id, err;
11908 ip_stack_t *ipst;
11909
11910 ASSERT(ipif->ipif_ill->ill_net_type == IRE_LOOPBACK ||
11911 IAM_WRITER_IPIF(ipif));
11912
11913 ill = ipif->ipif_ill;
11914 ASSERT(ill != NULL);
11915 ipst = ill->ill_ipst;
11916
11917 /*
11918 * In the case of lo0:0 we already hold the ill_g_lock.
11919 * ill_lookup_on_name (acquires ill_g_lock) -> ipif_allocate ->
11920 * ipif_insert.
11921 */
11922 if (acquire_g_lock)
11923 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
11924 mutex_enter(&ill->ill_lock);
11925 id = ipif->ipif_id;
11926 tipifp = &(ill->ill_ipif);
11927 if (id == -1) { /* need to find a real id */
11928 id = 0;
11929 while ((tipif = *tipifp) != NULL) {
11930 ASSERT(tipif->ipif_id >= id);
11931 if (tipif->ipif_id != id)
11932 break; /* non-consecutive id */
11933 id++;
11934 tipifp = &(tipif->ipif_next);
11935 }
11936 if ((err = is_lifname_valid(ill, id)) != 0) {
11937 mutex_exit(&ill->ill_lock);
11938 if (acquire_g_lock)
11939 rw_exit(&ipst->ips_ill_g_lock);
11940 return (err);
11941 }
11942 ipif->ipif_id = id; /* assign new id */
11943 } else if ((err = is_lifname_valid(ill, id)) == 0) {
11944 /* we have a real id; insert ipif in the right place */
11945 while ((tipif = *tipifp) != NULL) {
11946 ASSERT(tipif->ipif_id != id);
11947 if (tipif->ipif_id > id)
11948 break; /* found correct location */
11949 tipifp = &(tipif->ipif_next);
11950 }
11951 } else {
11952 mutex_exit(&ill->ill_lock);
11953 if (acquire_g_lock)
11954 rw_exit(&ipst->ips_ill_g_lock);
11955 return (err);
11956 }
11957
11958 ASSERT(tipifp != &(ill->ill_ipif) || id == 0);
11959
11960 ipif->ipif_next = tipif;
11961 *tipifp = ipif;
11962 mutex_exit(&ill->ill_lock);
11963 if (acquire_g_lock)
11964 rw_exit(&ipst->ips_ill_g_lock);
11965
11966 return (0);
11967 }
11968
11969 static void
11970 ipif_remove(ipif_t *ipif)
11971 {
11972 ipif_t **ipifp;
11973 ill_t *ill = ipif->ipif_ill;
11974
11975 ASSERT(RW_WRITE_HELD(&ill->ill_ipst->ips_ill_g_lock));
11976
11977 mutex_enter(&ill->ill_lock);
11978 ipifp = &ill->ill_ipif;
11979 for (; *ipifp != NULL; ipifp = &ipifp[0]->ipif_next) {
11980 if (*ipifp == ipif) {
11981 *ipifp = ipif->ipif_next;
11982 break;
11983 }
11984 }
11985 mutex_exit(&ill->ill_lock);
11986 }
11987
11988 /*
11989 * Allocate and initialize a new interface control structure. (Always
11990 * called as writer.)
11991 * When ipif_allocate() is called from ip_ll_subnet_defaults, the ill
 * is not part of the global linked list of ills. ipif_seqid is unique
 * in the system, and to preserve that uniqueness it is assigned only
 * when the ill becomes part of the global list. At that point the ill
 * will have a name. If it doesn't get assigned here, it will get
 * assigned in ipif_set_values() as part of SIOCSLIFNAME processing.
 * Additionally, if we come here from ip_ll_subnet_defaults, we don't set
11998 * the interface flags or any other information from the DL_INFO_ACK for
11999 * DL_STYLE2 drivers (initialize == B_FALSE), since we won't have them at
12000 * this point. The flags etc. will be set in ip_ll_subnet_defaults when the
12001 * second DL_INFO_ACK comes in from the driver.
12002 */
12003 static ipif_t *
12004 ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize,
12005 boolean_t insert, int *errorp)
12006 {
12007 int err;
12008 ipif_t *ipif;
12009 ip_stack_t *ipst = ill->ill_ipst;
12010
12011 ip1dbg(("ipif_allocate(%s:%d ill %p)\n",
12012 ill->ill_name, id, (void *)ill));
12013 ASSERT(ire_type == IRE_LOOPBACK || IAM_WRITER_ILL(ill));
12014
12015 if (errorp != NULL)
12016 *errorp = 0;
12017
12018 if ((ipif = mi_alloc(sizeof (ipif_t), BPRI_MED)) == NULL) {
12019 if (errorp != NULL)
12020 *errorp = ENOMEM;
12021 return (NULL);
12022 }
12023 *ipif = ipif_zero; /* start clean */
12024
12025 ipif->ipif_ill = ill;
12026 ipif->ipif_id = id; /* could be -1 */
12027 /*
12028 * Inherit the zoneid from the ill; for the shared stack instance
12029 * this is always the global zone
12030 */
12031 ipif->ipif_zoneid = ill->ill_zoneid;
12032
12033 ipif->ipif_refcnt = 0;
12034
12035 if (insert) {
12036 if ((err = ipif_insert(ipif, ire_type != IRE_LOOPBACK)) != 0) {
12037 mi_free(ipif);
12038 if (errorp != NULL)
12039 *errorp = err;
12040 return (NULL);
12041 }
12042 /* -1 id should have been replaced by real id */
12043 id = ipif->ipif_id;
12044 ASSERT(id >= 0);
12045 }
12046
12047 if (ill->ill_name[0] != '\0')
12048 ipif_assign_seqid(ipif);
12049
12050 /*
12051 * If this is the zeroth ipif on the IPMP ill, create the illgrp
12052 * (which must not exist yet because the zeroth ipif is created once
 * per ill). However, do not link it to the ipmp_grp_t until
12054 * I_PLINK is called; see ip_sioctl_plink_ipmp() for details.
12055 */
12056 if (id == 0 && IS_IPMP(ill)) {
12057 if (ipmp_illgrp_create(ill) == NULL) {
12058 if (insert) {
12059 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
12060 ipif_remove(ipif);
12061 rw_exit(&ipst->ips_ill_g_lock);
12062 }
12063 mi_free(ipif);
12064 if (errorp != NULL)
12065 *errorp = ENOMEM;
12066 return (NULL);
12067 }
12068 }
12069
12070 /*
12071 * We grab ill_lock to protect the flag changes. The ipif is still
12072 * not up and can't be looked up until the ioctl completes and the
12073 * IPIF_CHANGING flag is cleared.
12074 */
12075 mutex_enter(&ill->ill_lock);
12076
12077 ipif->ipif_ire_type = ire_type;
12078
12079 if (ipif->ipif_isv6) {
12080 ill->ill_flags |= ILLF_IPV6;
12081 } else {
12082 ipaddr_t inaddr_any = INADDR_ANY;
12083
12084 ill->ill_flags |= ILLF_IPV4;
12085
12086 /* Keep the IN6_IS_ADDR_V4MAPPED assertions happy */
12087 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
12088 &ipif->ipif_v6lcl_addr);
12089 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
12090 &ipif->ipif_v6subnet);
12091 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
12092 &ipif->ipif_v6net_mask);
12093 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
12094 &ipif->ipif_v6brd_addr);
12095 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
12096 &ipif->ipif_v6pp_dst_addr);
12097 }
12098
12099 /*
12100 * Don't set the interface flags etc. now, will do it in
12101 * ip_ll_subnet_defaults.
12102 */
12103 if (!initialize)
12104 goto out;
12105
12106 /*
12107 * NOTE: The IPMP meta-interface is special-cased because it starts
12108 * with no underlying interfaces (and thus an unknown broadcast
12109 * address length), but all interfaces that can be placed into an IPMP
12110 * group are required to be broadcast-capable.
12111 */
12112 if (ill->ill_bcast_addr_length != 0 || IS_IPMP(ill)) {
12113 /*
12114 * Later detect lack of DLPI driver multicast capability by
12115 * catching DL_ENABMULTI_REQ errors in ip_rput_dlpi().
12116 */
12117 ill->ill_flags |= ILLF_MULTICAST;
12118 if (!ipif->ipif_isv6)
12119 ipif->ipif_flags |= IPIF_BROADCAST;
12120 } else {
12121 if (ill->ill_net_type != IRE_LOOPBACK) {
12122 if (ipif->ipif_isv6)
12123 /*
12124 * Note: xresolv interfaces will eventually need
12125 * NOARP set here as well, but that will require
12126 * those external resolvers to have some
12127 * knowledge of that flag and act appropriately.
12128 * Not to be changed at present.
12129 */
12130 ill->ill_flags |= ILLF_NONUD;
12131 else
12132 ill->ill_flags |= ILLF_NOARP;
12133 }
12134 if (ill->ill_phys_addr_length == 0) {
12135 if (IS_VNI(ill)) {
12136 ipif->ipif_flags |= IPIF_NOXMIT;
12137 } else {
12138 /* pt-pt supports multicast. */
12139 ill->ill_flags |= ILLF_MULTICAST;
12140 if (ill->ill_net_type != IRE_LOOPBACK)
12141 ipif->ipif_flags |= IPIF_POINTOPOINT;
12142 }
12143 }
12144 }
12145 out:
12146 mutex_exit(&ill->ill_lock);
12147 return (ipif);
12148 }
12149
12150 /*
12151 * Remove the neighbor cache entries associated with this logical
12152 * interface.
12153 */
12154 int
12155 ipif_arp_down(ipif_t *ipif)
12156 {
12157 ill_t *ill = ipif->ipif_ill;
12158 int err = 0;
12159
12160 ip1dbg(("ipif_arp_down(%s:%u)\n", ill->ill_name, ipif->ipif_id));
12161 ASSERT(IAM_WRITER_IPIF(ipif));
12162
12163 DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_down",
12164 ill_t *, ill, ipif_t *, ipif);
12165 ipif_nce_down(ipif);
12166
12167 /*
12168 * If this is the last ipif that is going down and there are no
12169 * duplicate addresses we may yet attempt to re-probe, then we need to
12170 * clean up ARP completely.
12171 */
12172 if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 &&
12173 !ill->ill_logical_down && ill->ill_net_type == IRE_IF_RESOLVER) {
12174 /*
12175 * If this was the last ipif on an IPMP interface, purge any
12176 * static ARP entries associated with it.
12177 */
12178 if (IS_IPMP(ill))
12179 ipmp_illgrp_refresh_arpent(ill->ill_grp);
12180
12181 /* UNBIND, DETACH */
12182 err = arp_ll_down(ill);
12183 }
12184
12185 return (err);
12186 }
12187
12188 /*
12189 * Get the resolver set up for a new IP address. (Always called as writer.)
12190 * Called both for IPv4 and IPv6 interfaces, though it only does some
12191 * basic DAD related initialization for IPv6. Honors ILLF_NOARP.
12192 *
12193 * The enumerated value res_act tunes the behavior:
12194 * * Res_act_initial: set up all the resolver structures for a new
12195 * IP address.
12196 * * Res_act_defend: tell ARP that it needs to send a single gratuitous
12197 * ARP message in defense of the address.
12198 * * Res_act_rebind: tell ARP to change the hardware address for an IP
12199 * address (and issue gratuitous ARPs). Used by ipmp_ill_bind_ipif().
12200 *
12201 * Returns zero on success, or an errno upon failure.
12202 */
12203 int
12204 ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act)
12205 {
12206 ill_t *ill = ipif->ipif_ill;
12207 int err;
12208 boolean_t was_dup;
12209
12210 ip1dbg(("ipif_resolver_up(%s:%u) flags 0x%x\n",
12211 ill->ill_name, ipif->ipif_id, (uint_t)ipif->ipif_flags));
12212 ASSERT(IAM_WRITER_IPIF(ipif));
12213
12214 was_dup = B_FALSE;
12215 if (res_act == Res_act_initial) {
12216 ipif->ipif_addr_ready = 0;
12217 /*
12218 * We're bringing an interface up here. There's no way that we
12219 * should need to shut down ARP now.
12220 */
12221 mutex_enter(&ill->ill_lock);
12222 if (ipif->ipif_flags & IPIF_DUPLICATE) {
12223 ipif->ipif_flags &= ~IPIF_DUPLICATE;
12224 ill->ill_ipif_dup_count--;
12225 was_dup = B_TRUE;
12226 }
12227 mutex_exit(&ill->ill_lock);
12228 }
12229 if (ipif->ipif_recovery_id != 0)
12230 (void) untimeout(ipif->ipif_recovery_id);
12231 ipif->ipif_recovery_id = 0;
12232 if (ill->ill_net_type != IRE_IF_RESOLVER) {
12233 ipif->ipif_addr_ready = 1;
12234 return (0);
12235 }
12236 /* NDP will set the ipif_addr_ready flag when it's ready */
12237 if (ill->ill_isv6)
12238 return (0);
12239
12240 err = ipif_arp_up(ipif, res_act, was_dup);
12241 return (err);
12242 }
12243
12244 /*
12245 * This routine restarts IPv4/IPv6 duplicate address detection (DAD)
12246 * when a link has just gone back up.
12247 */
12248 static void
12249 ipif_nce_start_dad(ipif_t *ipif)
12250 {
12251 ncec_t *ncec;
12252 ill_t *ill = ipif->ipif_ill;
12253 boolean_t isv6 = ill->ill_isv6;
12254
12255 if (isv6) {
12256 ncec = ncec_lookup_illgrp_v6(ipif->ipif_ill,
12257 &ipif->ipif_v6lcl_addr);
12258 } else {
12259 ipaddr_t v4addr;
12260
12261 if (ill->ill_net_type != IRE_IF_RESOLVER ||
12262 (ipif->ipif_flags & IPIF_UNNUMBERED) ||
12263 ipif->ipif_lcl_addr == INADDR_ANY) {
12264 /*
12265 * If we can't contact ARP for some reason,
12266 * that's not really a problem. Just send
12267 * out the routing socket notification that
12268 * DAD completion would have done, and continue.
12269 */
12270 ipif_mask_reply(ipif);
12271 ipif_up_notify(ipif);
12272 ipif->ipif_addr_ready = 1;
12273 return;
12274 }
12275
12276 IN6_V4MAPPED_TO_IPADDR(&ipif->ipif_v6lcl_addr, v4addr);
12277 ncec = ncec_lookup_illgrp_v4(ipif->ipif_ill, &v4addr);
12278 }
12279
12280 if (ncec == NULL) {
12281 ip1dbg(("couldn't find ncec for ipif %p leaving !ready\n",
12282 (void *)ipif));
12283 return;
12284 }
12285 if (!nce_restart_dad(ncec)) {
12286 /*
12287 * If we can't restart DAD for some reason, that's not really a
12288 * problem. Just send out the routing socket notification that
12289 * DAD completion would have done, and continue.
12290 */
12291 ipif_up_notify(ipif);
12292 ipif->ipif_addr_ready = 1;
12293 }
12294 ncec_refrele(ncec);
12295 }
12296
12297 /*
12298 * Restart duplicate address detection on all interfaces on the given ill.
12299 *
12300 * This is called when an interface transitions from down to up
12301 * (DL_NOTE_LINK_UP) or up to down (DL_NOTE_LINK_DOWN).
12302 *
12303 * Note that since the underlying physical link has transitioned, we must cause
12304 * at least one routing socket message to be sent here, either via DAD
12305 * completion or just by default on the first ipif. (If we don't do this, then
12306 * in.mpathd will see long delays when doing link-based failure recovery.)
12307 */
12308 void
12309 ill_restart_dad(ill_t *ill, boolean_t went_up)
12310 {
12311 ipif_t *ipif;
12312
12313 if (ill == NULL)
12314 return;
12315
12316 /*
12317 * If layer two doesn't support duplicate address detection, then just
12318 * send the routing socket message now and be done with it.
12319 */
12320 if (!ill->ill_isv6 && arp_no_defense) {
12321 ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT);
12322 return;
12323 }
12324
12325 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
12326 if (went_up) {
12327
12328 if (ipif->ipif_flags & IPIF_UP) {
12329 ipif_nce_start_dad(ipif);
12330 } else if (ipif->ipif_flags & IPIF_DUPLICATE) {
12331 /*
12332 * kick off the bring-up process now.
12333 */
12334 ipif_do_recovery(ipif);
12335 } else {
12336 /*
12337 * Unfortunately, the first ipif is "special"
12338 * and represents the underlying ill in the
12339 * routing socket messages. Thus, when this
12340 * one ipif is down, we must still notify so
12341 * that the user knows the IFF_RUNNING status
12342 * change. (If the first ipif is up, then
12343 * we'll handle eventual routing socket
12344 * notification via DAD completion.)
12345 */
12346 if (ipif == ill->ill_ipif) {
12347 ip_rts_ifmsg(ill->ill_ipif,
12348 RTSQ_DEFAULT);
12349 }
12350 }
12351 } else {
12352 /*
12353 * After link down, we'll need to send a new routing
12354 * message when the link comes back, so clear
12355 * ipif_addr_ready.
12356 */
12357 ipif->ipif_addr_ready = 0;
12358 }
12359 }
12360
12361 /*
12362 * If we've torn down links, then notify the user right away.
12363 */
12364 if (!went_up)
12365 ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT);
12366 }
12367
12368 static void
12369 ipsq_delete(ipsq_t *ipsq)
12370 {
12371 ipxop_t *ipx = ipsq->ipsq_xop;
12372
12373 ipsq->ipsq_ipst = NULL;
12374 ASSERT(ipsq->ipsq_phyint == NULL);
12375 ASSERT(ipsq->ipsq_xop != NULL);
12376 ASSERT(ipsq->ipsq_xopq_mphead == NULL && ipx->ipx_mphead == NULL);
12377 ASSERT(ipx->ipx_pending_mp == NULL);
12378 kmem_free(ipsq, sizeof (ipsq_t));
12379 }
12380
12381 static int
12382 ill_up_ipifs_on_ill(ill_t *ill, queue_t *q, mblk_t *mp)
12383 {
12384 int err = 0;
12385 ipif_t *ipif;
12386
12387 if (ill == NULL)
12388 return (0);
12389
12390 ASSERT(IAM_WRITER_ILL(ill));
12391 ill->ill_up_ipifs = B_TRUE;
12392 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
12393 if (ipif->ipif_was_up) {
12394 if (!(ipif->ipif_flags & IPIF_UP))
12395 err = ipif_up(ipif, q, mp);
12396 ipif->ipif_was_up = B_FALSE;
12397 if (err != 0) {
12398 ASSERT(err == EINPROGRESS);
12399 return (err);
12400 }
12401 }
12402 }
12403 ill->ill_up_ipifs = B_FALSE;
12404 return (0);
12405 }
12406
12407 /*
12408 * This function is called to bring up all the ipifs that were up before
12409 * bringing the ill down via ill_down_ipifs().
12410 */
12411 int
12412 ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp)
12413 {
12414 int err;
12415
12416 ASSERT(IAM_WRITER_ILL(ill));
12417
12418 if (ill->ill_replumbing) {
12419 ill->ill_replumbing = 0;
12420 /*
12421 * Send down REPLUMB_DONE notification followed by the
12422 * BIND_REQ on the arp stream.
12423 */
12424 if (!ill->ill_isv6)
12425 arp_send_replumb_conf(ill);
12426 }
12427 err = ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv4, q, mp);
12428 if (err != 0)
12429 return (err);
12430
12431 return (ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv6, q, mp));
12432 }
12433
12434 /*
12435 * Bring down any IPIF_UP ipifs on ill. If "logical" is B_TRUE, we bring
12436 * down the ipifs without sending DL_UNBIND_REQ to the driver.
12437 */
12438 static void
12439 ill_down_ipifs(ill_t *ill, boolean_t logical)
12440 {
12441 ipif_t *ipif;
12442
12443 ASSERT(IAM_WRITER_ILL(ill));
12444
12445 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
12446 /*
12447 * We go through the ipif_down logic even if the ipif
12448 * is already down, since routes can be added based
12449 * on down ipifs. Going through ipif_down once again
12450 * will delete any IREs created based on these routes.
12451 */
12452 if (ipif->ipif_flags & IPIF_UP)
12453 ipif->ipif_was_up = B_TRUE;
12454
12455 if (logical) {
12456 (void) ipif_logical_down(ipif, NULL, NULL);
12457 ipif_non_duplicate(ipif);
12458 (void) ipif_down_tail(ipif);
12459 } else {
12460 (void) ipif_down(ipif, NULL, NULL);
12461 }
12462 }
12463 }
12464
12465 /*
12466 * Redo source address selection. This makes IXAF_VERIFY_SOURCE take
12467 * a look again at valid source addresses.
12468 * This should be called each time after the set of source addresses has been
12469 * changed.
12470 */
12471 void
12472 ip_update_source_selection(ip_stack_t *ipst)
12473 {
12474 /* We skip past SRC_GENERATION_VERIFY */
12475 if (atomic_add_32_nv(&ipst->ips_src_generation, 1) ==
12476 SRC_GENERATION_VERIFY)
12477 atomic_add_32(&ipst->ips_src_generation, 1);
12478 }
12479
12480 /*
12481 * Finish the group join started in ip_sioctl_groupname().
12482 */
12483 /* ARGSUSED */
12484 static void
12485 ip_join_illgrps(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy)
12486 {
12487 ill_t *ill = q->q_ptr;
12488 phyint_t *phyi = ill->ill_phyint;
12489 ipmp_grp_t *grp = phyi->phyint_grp;
12490 ip_stack_t *ipst = ill->ill_ipst;
12491
12492 /* IS_UNDER_IPMP() won't work until ipmp_ill_join_illgrp() is called */
12493 ASSERT(!IS_IPMP(ill) && grp != NULL);
12494 ASSERT(IAM_WRITER_IPSQ(ipsq));
12495
12496 if (phyi->phyint_illv4 != NULL) {
12497 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
12498 VERIFY(grp->gr_pendv4-- > 0);
12499 rw_exit(&ipst->ips_ipmp_lock);
12500 ipmp_ill_join_illgrp(phyi->phyint_illv4, grp->gr_v4);
12501 }
12502 if (phyi->phyint_illv6 != NULL) {
12503 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
12504 VERIFY(grp->gr_pendv6-- > 0);
12505 rw_exit(&ipst->ips_ipmp_lock);
12506 ipmp_ill_join_illgrp(phyi->phyint_illv6, grp->gr_v6);
12507 }
12508 freemsg(mp);
12509 }
12510
12511 /*
12512 * Process an SIOCSLIFGROUPNAME request.
12513 */
12514 /* ARGSUSED */
12515 int
12516 ip_sioctl_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12517 ip_ioctl_cmd_t *ipip, void *ifreq)
12518 {
12519 struct lifreq *lifr = ifreq;
12520 ill_t *ill = ipif->ipif_ill;
12521 ip_stack_t *ipst = ill->ill_ipst;
12522 phyint_t *phyi = ill->ill_phyint;
12523 ipmp_grp_t *grp = phyi->phyint_grp;
12524 mblk_t *ipsq_mp;
12525 int err = 0;
12526
12527 /*
12528 * Note that phyint_grp can only change here, where we're exclusive.
12529 */
12530 ASSERT(IAM_WRITER_ILL(ill));
12531
12532 if (ipif->ipif_id != 0 || ill->ill_usesrc_grp_next != NULL ||
12533 (phyi->phyint_flags & PHYI_VIRTUAL))
12534 return (EINVAL);
12535
12536 lifr->lifr_groupname[LIFGRNAMSIZ - 1] = '\0';
12537
12538 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
12539
12540 /*
12541 * If the name hasn't changed, there's nothing to do.
12542 */
12543 if (grp != NULL && strcmp(grp->gr_name, lifr->lifr_groupname) == 0)
12544 goto unlock;
12545
12546 /*
12547 * Handle requests to rename an IPMP meta-interface.
12548 *
12549 * Note that creation of the IPMP meta-interface is handled in
12550 * userland through the standard plumbing sequence. As part of the
12551 * plumbing the IPMP meta-interface, its initial groupname is set to
12552 * the name of the interface (see ipif_set_values_tail()).
12553 */
12554 if (IS_IPMP(ill)) {
12555 err = ipmp_grp_rename(grp, lifr->lifr_groupname);
12556 goto unlock;
12557 }
12558
12559 /*
12560 * Handle requests to add or remove an IP interface from a group.
12561 */
12562 if (lifr->lifr_groupname[0] != '\0') { /* add */
12563 /*
12564 * Moves are handled by first removing the interface from
12565 * its existing group, and then adding it to another group.
12566 * So, fail if it's already in a group.
12567 */
12568 if (IS_UNDER_IPMP(ill)) {
12569 err = EALREADY;
12570 goto unlock;
12571 }
12572
12573 grp = ipmp_grp_lookup(lifr->lifr_groupname, ipst);
12574 if (grp == NULL) {
12575 err = ENOENT;
12576 goto unlock;
12577 }
12578
12579 /*
12580 * Check if the phyint and its ills are suitable for
12581 * inclusion into the group.
12582 */
12583 if ((err = ipmp_grp_vet_phyint(grp, phyi)) != 0)
12584 goto unlock;
12585
12586 /*
12587 * Checks pass; join the group, and enqueue the remaining
12588 * illgrp joins for when we've become part of the group xop
12589 * and are exclusive across its IPSQs. Since qwriter_ip()
12590 * requires an mblk_t to scribble on, and since `mp' will be
12591 * freed as part of completing the ioctl, allocate another.
12592 */
12593 if ((ipsq_mp = allocb(0, BPRI_MED)) == NULL) {
12594 err = ENOMEM;
12595 goto unlock;
12596 }
12597
12598 /*
12599 * Before we drop ipmp_lock, bump gr_pend* to ensure that the
12600 * IPMP meta-interface ills needed by `phyi' cannot go away
12601 * before ip_join_illgrps() is called back. See the comments
12602 * in ip_sioctl_plink_ipmp() for more.
12603 */
12604 if (phyi->phyint_illv4 != NULL)
12605 grp->gr_pendv4++;
12606 if (phyi->phyint_illv6 != NULL)
12607 grp->gr_pendv6++;
12608
12609 rw_exit(&ipst->ips_ipmp_lock);
12610
12611 ipmp_phyint_join_grp(phyi, grp);
12612 ill_refhold(ill);
12613 qwriter_ip(ill, ill->ill_rq, ipsq_mp, ip_join_illgrps,
12614 SWITCH_OP, B_FALSE);
12615 return (0);
12616 } else {
12617 /*
12618 * Request to remove the interface from a group. If the
12619 * interface is not in a group, this trivially succeeds.
12620 */
12621 rw_exit(&ipst->ips_ipmp_lock);
12622 if (IS_UNDER_IPMP(ill))
12623 ipmp_phyint_leave_grp(phyi);
12624 return (0);
12625 }
12626 unlock:
12627 rw_exit(&ipst->ips_ipmp_lock);
12628 return (err);
12629 }
12630
12631 /*
12632 * Process an SIOCGLIFBINDING request.
12633 */
12634 /* ARGSUSED */
12635 int
12636 ip_sioctl_get_binding(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12637 ip_ioctl_cmd_t *ipip, void *ifreq)
12638 {
12639 ill_t *ill;
12640 struct lifreq *lifr = ifreq;
12641 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
12642
12643 if (!IS_IPMP(ipif->ipif_ill))
12644 return (EINVAL);
12645
12646 rw_enter(&ipst->ips_ipmp_lock, RW_READER);
12647 if ((ill = ipif->ipif_bound_ill) == NULL)
12648 lifr->lifr_binding[0] = '\0';
12649 else
12650 (void) strlcpy(lifr->lifr_binding, ill->ill_name, LIFNAMSIZ);
12651 rw_exit(&ipst->ips_ipmp_lock);
12652 return (0);
12653 }
12654
12655 /*
12656 * Process an SIOCGLIFGROUPNAME request.
12657 */
12658 /* ARGSUSED */
12659 int
12660 ip_sioctl_get_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12661 ip_ioctl_cmd_t *ipip, void *ifreq)
12662 {
12663 ipmp_grp_t *grp;
12664 struct lifreq *lifr = ifreq;
12665 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
12666
12667 rw_enter(&ipst->ips_ipmp_lock, RW_READER);
12668 if ((grp = ipif->ipif_ill->ill_phyint->phyint_grp) == NULL)
12669 lifr->lifr_groupname[0] = '\0';
12670 else
12671 (void) strlcpy(lifr->lifr_groupname, grp->gr_name, LIFGRNAMSIZ);
12672 rw_exit(&ipst->ips_ipmp_lock);
12673 return (0);
12674 }
12675
12676 /*
12677 * Process an SIOCGLIFGROUPINFO request.
12678 */
12679 /* ARGSUSED */
12680 int
12681 ip_sioctl_groupinfo(ipif_t *dummy_ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12682 ip_ioctl_cmd_t *ipip, void *dummy)
12683 {
12684 ipmp_grp_t *grp;
12685 lifgroupinfo_t *lifgr;
12686 ip_stack_t *ipst = CONNQ_TO_IPST(q);
12687
12688 /* ip_wput_nondata() verified mp->b_cont->b_cont */
12689 lifgr = (lifgroupinfo_t *)mp->b_cont->b_cont->b_rptr;
12690 lifgr->gi_grname[LIFGRNAMSIZ - 1] = '\0';
12691
12692 rw_enter(&ipst->ips_ipmp_lock, RW_READER);
12693 if ((grp = ipmp_grp_lookup(lifgr->gi_grname, ipst)) == NULL) {
12694 rw_exit(&ipst->ips_ipmp_lock);
12695 return (ENOENT);
12696 }
12697 ipmp_grp_info(grp, lifgr);
12698 rw_exit(&ipst->ips_ipmp_lock);
12699 return (0);
12700 }
12701
12702 static void
12703 ill_dl_down(ill_t *ill)
12704 {
12705 DTRACE_PROBE2(ill__downup, char *, "ill_dl_down", ill_t *, ill);
12706
12707 /*
12708 * The ill is down; unbind but stay attached since we're still
12709 * associated with a PPA. If we have negotiated DLPI capabilites
12710 * with the data link service provider (IDS_OK) then reset them.
12711 * The interval between unbinding and rebinding is potentially
12712 * unbounded hence we cannot assume things will be the same.
12713 * The DLPI capabilities will be probed again when the data link
12714 * is brought up.
12715 */
12716 mblk_t *mp = ill->ill_unbind_mp;
12717
12718 ip1dbg(("ill_dl_down(%s)\n", ill->ill_name));
12719
12720 if (!ill->ill_replumbing) {
12721 /* Free all ilms for this ill */
12722 update_conn_ill(ill, ill->ill_ipst);
12723 } else {
12724 ill_leave_multicast(ill);
12725 }
12726
12727 ill->ill_unbind_mp = NULL;
12728 if (mp != NULL) {
12729 ip1dbg(("ill_dl_down: %s (%u) for %s\n",
12730 dl_primstr(*(int *)mp->b_rptr), *(int *)mp->b_rptr,
12731 ill->ill_name));
12732 mutex_enter(&ill->ill_lock);
12733 ill->ill_state_flags |= ILL_DL_UNBIND_IN_PROGRESS;
12734 mutex_exit(&ill->ill_lock);
12735 /*
12736 * ip_rput does not pass up normal (M_PROTO) DLPI messages
12737 * after ILL_CONDEMNED is set. So in the unplumb case, we call
12738 * ill_capability_dld_disable disable rightaway. If this is not
12739 * an unplumb operation then the disable happens on receipt of
12740 * the capab ack via ip_rput_dlpi_writer ->
12741 * ill_capability_ack_thr. In both cases the order of
12742 * the operations seen by DLD is capability disable followed
12743 * by DL_UNBIND. Also the DLD capability disable needs a
12744 * cv_wait'able context.
12745 */
12746 if (ill->ill_state_flags & ILL_CONDEMNED)
12747 ill_capability_dld_disable(ill);
12748 ill_capability_reset(ill, B_FALSE);
12749 ill_dlpi_send(ill, mp);
12750 }
12751 mutex_enter(&ill->ill_lock);
12752 ill->ill_dl_up = 0;
12753 ill_nic_event_dispatch(ill, 0, NE_DOWN, NULL, 0);
12754 mutex_exit(&ill->ill_lock);
12755 }
12756
12757 void
12758 ill_dlpi_dispatch(ill_t *ill, mblk_t *mp)
12759 {
12760 union DL_primitives *dlp;
12761 t_uscalar_t prim;
12762 boolean_t waitack = B_FALSE;
12763
12764 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
12765
12766 dlp = (union DL_primitives *)mp->b_rptr;
12767 prim = dlp->dl_primitive;
12768
12769 ip1dbg(("ill_dlpi_dispatch: sending %s (%u) to %s\n",
12770 dl_primstr(prim), prim, ill->ill_name));
12771
12772 switch (prim) {
12773 case DL_PHYS_ADDR_REQ:
12774 {
12775 dl_phys_addr_req_t *dlpap = (dl_phys_addr_req_t *)mp->b_rptr;
12776 ill->ill_phys_addr_pend = dlpap->dl_addr_type;
12777 break;
12778 }
12779 case DL_BIND_REQ:
12780 mutex_enter(&ill->ill_lock);
12781 ill->ill_state_flags &= ~ILL_DL_UNBIND_IN_PROGRESS;
12782 mutex_exit(&ill->ill_lock);
12783 break;
12784 }
12785
12786 /*
12787 * Except for the ACKs for the M_PCPROTO messages, all other ACKs
12788 * are dropped by ip_rput() if ILL_CONDEMNED is set. Therefore
12789 * we only wait for the ACK of the DL_UNBIND_REQ.
12790 */
12791 mutex_enter(&ill->ill_lock);
12792 if (!(ill->ill_state_flags & ILL_CONDEMNED) ||
12793 (prim == DL_UNBIND_REQ)) {
12794 ill->ill_dlpi_pending = prim;
12795 waitack = B_TRUE;
12796 }
12797
12798 mutex_exit(&ill->ill_lock);
12799 DTRACE_PROBE3(ill__dlpi, char *, "ill_dlpi_dispatch",
12800 char *, dl_primstr(prim), ill_t *, ill);
12801 putnext(ill->ill_wq, mp);
12802
12803 /*
12804 * There is no ack for DL_NOTIFY_CONF messages
12805 */
12806 if (waitack && prim == DL_NOTIFY_CONF)
12807 ill_dlpi_done(ill, prim);
12808 }
12809
12810 /*
12811 * Helper function for ill_dlpi_send().
12812 */
12813 /* ARGSUSED */
12814 static void
12815 ill_dlpi_send_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
12816 {
12817 ill_dlpi_send(q->q_ptr, mp);
12818 }
12819
12820 /*
12821 * Send a DLPI control message to the driver but make sure there
12822 * is only one outstanding message. Uses ill_dlpi_pending to tell
12823 * when it must queue. ip_rput_dlpi_writer calls ill_dlpi_done()
12824 * when an ACK or a NAK is received to process the next queued message.
12825 */
12826 void
12827 ill_dlpi_send(ill_t *ill, mblk_t *mp)
12828 {
12829 mblk_t **mpp;
12830
12831 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
12832
12833 /*
12834 * To ensure that any DLPI requests for current exclusive operation
12835 * are always completely sent before any DLPI messages for other
12836 * operations, require writer access before enqueuing.
12837 */
12838 if (!IAM_WRITER_ILL(ill)) {
12839 ill_refhold(ill);
12840 /* qwriter_ip() does the ill_refrele() */
12841 qwriter_ip(ill, ill->ill_wq, mp, ill_dlpi_send_writer,
12842 NEW_OP, B_TRUE);
12843 return;
12844 }
12845
12846 mutex_enter(&ill->ill_lock);
12847 if (ill->ill_dlpi_pending != DL_PRIM_INVAL) {
12848 /* Must queue message. Tail insertion */
12849 mpp = &ill->ill_dlpi_deferred;
12850 while (*mpp != NULL)
12851 mpp = &((*mpp)->b_next);
12852
12853 ip1dbg(("ill_dlpi_send: deferring request for %s "
12854 "while %s pending\n", ill->ill_name,
12855 dl_primstr(ill->ill_dlpi_pending)));
12856
12857 *mpp = mp;
12858 mutex_exit(&ill->ill_lock);
12859 return;
12860 }
12861 mutex_exit(&ill->ill_lock);
12862 ill_dlpi_dispatch(ill, mp);
12863 }
12864
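/*
 * Send a DLPI capability message to the driver, counting it in
 * ill_capab_pending_cnt so that ill_capability_done() can tell when all
 * outstanding capability exchanges have completed.
 */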
12865 void
12866 ill_capability_send(ill_t *ill, mblk_t *mp)
12867 {
12868 ill->ill_capab_pending_cnt++;
12869 ill_dlpi_send(ill, mp);
12870 }
12871
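/*
 * Complete a DL_CAPABILITY_REQ exchange: account for the received ack and,
 * once no more acks are pending and the capability state is IDCS_OK,
 * pre-allocate state for a later capability reset via
 * ill_capability_reset_alloc().
 */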
12872 void
12873 ill_capability_done(ill_t *ill)
12874 {
12875 ASSERT(ill->ill_capab_pending_cnt != 0);
12876
12877 ill_dlpi_done(ill, DL_CAPABILITY_REQ);
12878
12879 ill->ill_capab_pending_cnt--;
12880 if (ill->ill_capab_pending_cnt == 0 &&
12881 ill->ill_dlpi_capab_state == IDCS_OK)
12882 ill_capability_reset_alloc(ill);
12883 }
12884
12885 /*
12886 * Send all deferred DLPI messages without waiting for their ACKs.
12887 */
12888 void
12889 ill_dlpi_send_deferred(ill_t *ill)
12890 {
12891 mblk_t *mp, *nextmp;
12892
12893 /*
12894 * Clear ill_dlpi_pending so that the message is not queued in
12895 * ill_dlpi_send().
12896 */
12897 mutex_enter(&ill->ill_lock);
12898 ill->ill_dlpi_pending = DL_PRIM_INVAL;
12899 mp = ill->ill_dlpi_deferred;
12900 ill->ill_dlpi_deferred = NULL;
12901 mutex_exit(&ill->ill_lock);
12902
12903 for (; mp != NULL; mp = nextmp) {
12904 nextmp = mp->b_next;
12905 mp->b_next = NULL;
12906 ill_dlpi_send(ill, mp);
12907 }
12908 }
12909
12910 /*
12911 * Clear all the deferred DLPI messages. Called on receiving an M_ERROR
12912 * or M_HANGUP
12913 */
12914 static void
12915 ill_dlpi_clear_deferred(ill_t *ill)
12916 {
12917 mblk_t *mp, *nextmp;
12918
12919 mutex_enter(&ill->ill_lock);
12920 ill->ill_dlpi_pending = DL_PRIM_INVAL;
12921 mp = ill->ill_dlpi_deferred;
12922 ill->ill_dlpi_deferred = NULL;
12923 mutex_exit(&ill->ill_lock);
12924
12925 for (; mp != NULL; mp = nextmp) {
12926 nextmp = mp->b_next;
12927 inet_freemsg(mp);
12928 }
12929 }
12930
12931 /*
12932 * Check if the DLPI primitive `prim' is pending; print a warning if not.
12933 */
12934 boolean_t
12935 ill_dlpi_pending(ill_t *ill, t_uscalar_t prim)
12936 {
12937 t_uscalar_t pending;
12938
12939 mutex_enter(&ill->ill_lock);
12940 if (ill->ill_dlpi_pending == prim) {
12941 mutex_exit(&ill->ill_lock);
12942 return (B_TRUE);
12943 }
12944
12945 /*
12946 * During teardown, ill_dlpi_dispatch() will send DLPI requests
12947 * without waiting, so don't print any warnings in that case.
12948 */
12949 if (ill->ill_state_flags & ILL_CONDEMNED) {
12950 mutex_exit(&ill->ill_lock);
12951 return (B_FALSE);
12952 }
12953 pending = ill->ill_dlpi_pending;
12954 mutex_exit(&ill->ill_lock);
12955
12956 if (pending == DL_PRIM_INVAL) {
12957 (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE,
12958 "received unsolicited ack for %s on %s\n",
12959 dl_primstr(prim), ill->ill_name);
12960 } else {
12961 (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE,
12962 "received unexpected ack for %s on %s (expecting %s)\n",
12963 dl_primstr(prim), ill->ill_name, dl_primstr(pending));
12964 }
12965 return (B_FALSE);
12966 }
12967
12968 /*
12969 * Complete the current DLPI operation associated with `prim' on `ill' and
12970 * start the next queued DLPI operation (if any). If there are no queued DLPI
12971 * operations and the ill's current exclusive IPSQ operation has finished
12972 * (i.e., ipsq_current_finish() was called), then clear ipsq_current_ipif to
12973 * allow the next exclusive IPSQ operation to begin upon ipsq_exit(). See
12974 * the comments above ipsq_current_finish() for details.
12975 */
12976 void
12977 ill_dlpi_done(ill_t *ill, t_uscalar_t prim)
12978 {
12979 mblk_t *mp;
12980 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;
12981 ipxop_t *ipx = ipsq->ipsq_xop;
12982
12983 ASSERT(IAM_WRITER_IPSQ(ipsq));
12984 mutex_enter(&ill->ill_lock);
12985
12986 ASSERT(prim != DL_PRIM_INVAL);
12987 ASSERT(ill->ill_dlpi_pending == prim);
12988
12989 ip1dbg(("ill_dlpi_done: %s has completed %s (%u)\n", ill->ill_name,
12990 dl_primstr(ill->ill_dlpi_pending), ill->ill_dlpi_pending));
12991
12992 if ((mp = ill->ill_dlpi_deferred) == NULL) {
12993 ill->ill_dlpi_pending = DL_PRIM_INVAL;
12994 if (ipx->ipx_current_done) {
12995 mutex_enter(&ipx->ipx_lock);
12996 ipx->ipx_current_ipif = NULL;
12997 mutex_exit(&ipx->ipx_lock);
12998 }
12999 cv_signal(&ill->ill_cv);
13000 mutex_exit(&ill->ill_lock);
13001 return;
13002 }
13003
13004 ill->ill_dlpi_deferred = mp->b_next;
13005 mp->b_next = NULL;
13006 mutex_exit(&ill->ill_lock);
13007
13008 ill_dlpi_dispatch(ill, mp);
13009 }
13010
/*
 * Queue a (multicast) DLPI control message to be sent to the driver by
 * a later call to ill_dlpi_send_queued.
 * We queue them while holding a lock (ill_mcast_lock) to ensure that they
 * are queued in order i.e., to prevent a DL_DISABMULTI_REQ and a
 * DL_ENABMULTI_REQ for the same group from racing.
 * We send the DLPI control messages in order using ill_lock.
 * For IPMP we should be called on the cast_ill.
 */
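/*
 * A typical caller thus looks like (sketch):
 *
 *     rw_enter(&ill->ill_mcast_lock, RW_WRITER);
 *     ill_dlpi_queue(ill, mp);            (queued in order under the lock)
 *     rw_exit(&ill->ill_mcast_lock);
 *     ill_dlpi_send_queued(ill);          (drained outside the lock)
 */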
13020 void
13021 ill_dlpi_queue(ill_t *ill, mblk_t *mp)
13022 {
13023 mblk_t **mpp;
13024
13025 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
13026
13027 mutex_enter(&ill->ill_lock);
13028 /* Must queue message. Tail insertion */
13029 mpp = &ill->ill_dlpi_deferred;
13030 while (*mpp != NULL)
13031 mpp = &((*mpp)->b_next);
13032
13033 *mpp = mp;
13034 mutex_exit(&ill->ill_lock);
13035 }
13036
/*
 * Send the messages that were queued. Make sure there is only
 * one outstanding message. ip_rput_dlpi_writer calls ill_dlpi_done()
 * when an ACK or a NAK is received to process the next queued message.
 * For IPMP we are called on the upper ill, but we send what is queued
 * on the cast_ill.
 */
13044 void
13045 ill_dlpi_send_queued(ill_t *ill)
13046 {
13047 mblk_t *mp;
13048 union DL_primitives *dlp;
13049 t_uscalar_t prim;
13050 ill_t *release_ill = NULL;
13051
13052 if (IS_IPMP(ill)) {
13053 /* On the upper IPMP ill. */
13054 release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
13055 if (release_ill == NULL) {
13056 /* Avoid ever sending anything down to the ipmpstub */
13057 return;
13058 }
13059 ill = release_ill;
13060 }
13061 mutex_enter(&ill->ill_lock);
13062 while ((mp = ill->ill_dlpi_deferred) != NULL) {
13063 if (ill->ill_dlpi_pending != DL_PRIM_INVAL) {
13064 /* Can't send. Somebody else will send it */
13065 mutex_exit(&ill->ill_lock);
13066 goto done;
13067 }
13068 ill->ill_dlpi_deferred = mp->b_next;
13069 mp->b_next = NULL;
13070 if (!ill->ill_dl_up) {
13071 /*
13072 * Nobody there. All multicast addresses will be
13073 * re-joined when we get the DL_BIND_ACK bringing the
13074 * interface up.
13075 */
13076 freemsg(mp);
13077 continue;
13078 }
13079 dlp = (union DL_primitives *)mp->b_rptr;
13080 prim = dlp->dl_primitive;
13081
13082 if (!(ill->ill_state_flags & ILL_CONDEMNED) ||
13083 (prim == DL_UNBIND_REQ)) {
13084 ill->ill_dlpi_pending = prim;
13085 }
13086 mutex_exit(&ill->ill_lock);
13087
13088 DTRACE_PROBE3(ill__dlpi, char *, "ill_dlpi_send_queued",
13089 char *, dl_primstr(prim), ill_t *, ill);
13090 putnext(ill->ill_wq, mp);
13091 mutex_enter(&ill->ill_lock);
13092 }
13093 mutex_exit(&ill->ill_lock);
13094 done:
13095 if (release_ill != NULL)
13096 ill_refrele(release_ill);
13097 }
13098
/*
 * Queue an IP (IGMP/MLD) message to be sent by IP from
 * ill_mcast_send_queued.
 * We queue them while holding a lock (ill_mcast_lock) to ensure that they
 * are sent in order i.e., to prevent an IGMP leave and an IGMP join for
 * the same group from racing.
 * We send them in order using ill_lock.
 * For IPMP we are called on the upper ill, but we queue on the cast_ill.
 */
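/*
 * The expected calling pattern is therefore (sketch):
 *
 *     rw_enter(&ill->ill_mcast_lock, RW_WRITER);
 *     ill_mcast_queue(ill, mp);
 *     rw_exit(&ill->ill_mcast_lock);
 *     ill_mcast_send_queued(ill);
 */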
13108 void
13109 ill_mcast_queue(ill_t *ill, mblk_t *mp)
13110 {
13111 mblk_t **mpp;
13112 ill_t *release_ill = NULL;
13113
13114 ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock));
13115
13116 if (IS_IPMP(ill)) {
13117 /* On the upper IPMP ill. */
13118 release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
13119 if (release_ill == NULL) {
13120 /* Discard instead of queuing for the ipmp interface */
13121 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
13122 ip_drop_output("ipIfStatsOutDiscards - no cast_ill",
13123 mp, ill);
13124 freemsg(mp);
13125 return;
13126 }
13127 ill = release_ill;
13128 }
13129
13130 mutex_enter(&ill->ill_lock);
13131 /* Must queue message. Tail insertion */
13132 mpp = &ill->ill_mcast_deferred;
13133 while (*mpp != NULL)
13134 mpp = &((*mpp)->b_next);
13135
13136 *mpp = mp;
13137 mutex_exit(&ill->ill_lock);
13138 if (release_ill != NULL)
13139 ill_refrele(release_ill);
13140 }
13141
/*
 * Send the IP packets that were queued by ill_mcast_queue.
 * These are IGMP/MLD packets.
 *
 * For IPMP we are called on the upper ill, but we send what is queued
 * on the cast_ill.
 *
 * Request loopback of the report if we are acting as a multicast
 * router, so that the process-level routing daemon can hear it.
 * This will run multiple times for the same group if there are members
 * on the same group for multiple ipifs on the same ill. The
 * igmp_input/mld_input code will suppress the duplicates via the
 * loopback; thus we always loop back the membership report.
 *
 * We also need to make sure that this does not get load balanced
 * by IPMP. We do this by passing an ill to ip_output_simple.
 */
13159 void
13160 ill_mcast_send_queued(ill_t *ill)
13161 {
13162 mblk_t *mp;
13163 ip_xmit_attr_t ixas;
13164 ill_t *release_ill = NULL;
13165
13166 if (IS_IPMP(ill)) {
13167 /* On the upper IPMP ill. */
13168 release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
13169 if (release_ill == NULL) {
/*
 * We should have no messages on the ipmp interface,
 * and there is no point in trying to send them.
 */
13174 return;
13175 }
13176 ill = release_ill;
13177 }
13178 bzero(&ixas, sizeof (ixas));
13179 ixas.ixa_zoneid = ALL_ZONES;
13180 ixas.ixa_cred = kcred;
13181 ixas.ixa_cpid = NOPID;
13182 ixas.ixa_tsl = NULL;
/*
 * Here we set ixa_ifindex. For IPMP it will be the lower ill, which
 * makes ip_select_route pick the IRE_MULTICAST for the cast_ill.
 * That is necessary to handle IGMP/MLD snooping switches.
 */
13188 ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex;
13189 ixas.ixa_ipst = ill->ill_ipst;
13190
13191 mutex_enter(&ill->ill_lock);
13192 while ((mp = ill->ill_mcast_deferred) != NULL) {
13193 ill->ill_mcast_deferred = mp->b_next;
13194 mp->b_next = NULL;
13195 if (!ill->ill_dl_up) {
/*
 * Nobody there. Just drop the IP packets.
 * IGMP/MLD will resend later, if this is a replumb.
 */
13200 freemsg(mp);
13201 continue;
13202 }
13203 mutex_enter(&ill->ill_phyint->phyint_lock);
13204 if (IS_UNDER_IPMP(ill) && !ipmp_ill_is_active(ill)) {
13205 /*
13206 * When the ill is getting deactivated, we only want to
13207 * send the DLPI messages, so drop IGMP/MLD packets.
13208 * DLPI messages are handled by ill_dlpi_send_queued()
13209 */
13210 mutex_exit(&ill->ill_phyint->phyint_lock);
13211 freemsg(mp);
13212 continue;
13213 }
13214 mutex_exit(&ill->ill_phyint->phyint_lock);
13215 mutex_exit(&ill->ill_lock);
13216
13217 /* Check whether we are sending IPv4 or IPv6. */
13218 if (ill->ill_isv6) {
13219 ip6_t *ip6h = (ip6_t *)mp->b_rptr;
13220
13221 ixas.ixa_multicast_ttl = ip6h->ip6_hops;
13222 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6;
13223 } else {
13224 ipha_t *ipha = (ipha_t *)mp->b_rptr;
13225
13226 ixas.ixa_multicast_ttl = ipha->ipha_ttl;
13227 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
13228 ixas.ixa_flags &= ~IXAF_SET_ULP_CKSUM;
13229 }
13230 ixas.ixa_flags &= ~IXAF_VERIFY_SOURCE;
13231 ixas.ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_SOURCE;
13232 (void) ip_output_simple(mp, &ixas);
13233 ixa_cleanup(&ixas);
13234
13235 mutex_enter(&ill->ill_lock);
13236 }
13237 mutex_exit(&ill->ill_lock);
13238
13239 done:
13240 if (release_ill != NULL)
13241 ill_refrele(release_ill);
13242 }
13243
/*
 * Take down a specific interface, but don't lose any information about it.
 * (Always called as writer.)
 * This function goes through the down sequence even if the interface is
 * already down. There are 2 reasons.
 * a. Currently we permit interface routes that depend on down interfaces
 *    to be added. This behaviour itself is questionable. However it appears
 *    that both Solaris and 4.3 BSD have exhibited this behaviour for a long
 *    time. We go through the cleanup in order to remove these routes.
 * b. The bringup of the interface could fail in ill_dl_up i.e. we get a
 *    DL_ERROR_ACK in response to the DL_BIND request. The interface is
 *    down, but we need to clean up i.e. do ill_dl_down and
 *    ip_rput_dlpi_writer (DL_ERROR_ACK) -> ipif_down.
13257 *
13258 * IP-MT notes:
13259 *
13260 * Model of reference to interfaces.
13261 *
13262 * The following members in ipif_t track references to the ipif.
13263 * int ipif_refcnt; Active reference count
13264 *
13265 * The following members in ill_t track references to the ill.
13266 * int ill_refcnt; active refcnt
13267 * uint_t ill_ire_cnt; Number of ires referencing ill
13268 * uint_t ill_ncec_cnt; Number of ncecs referencing ill
13269 * uint_t ill_nce_cnt; Number of nces referencing ill
13270 * uint_t ill_ilm_cnt; Number of ilms referencing ill
13271 *
13272 * Reference to an ipif or ill can be obtained in any of the following ways.
13273 *
13274 * Through the lookup functions ipif_lookup_* / ill_lookup_* functions
13275 * Pointers to ipif / ill from other data structures viz ire and conn.
13276 * Implicit reference to the ipif / ill by holding a reference to the ire.
13277 *
13278 * The ipif/ill lookup functions return a reference held ipif / ill.
13279 * ipif_refcnt and ill_refcnt track the reference counts respectively.
13280 * This is a purely dynamic reference count associated with threads holding
13281 * references to the ipif / ill. Pointers from other structures do not
13282 * count towards this reference count.
13283 *
13284 * ill_ire_cnt is the number of ire's associated with the
13285 * ill. This is incremented whenever a new ire is created referencing the
13286 * ill. This is done atomically inside ire_add_v[46] where the ire is
13287 * actually added to the ire hash table. The count is decremented in
13288 * ire_inactive where the ire is destroyed.
13289 *
 * ill_ncec_cnt is the number of ncec's referencing the ill through ncec_ill.
 * This is incremented atomically in
 * ndp_add_v4()/ndp_add_v6() where the ncec is actually added to the
 * table. Similarly it is decremented in ncec_inactive() where the ncec
 * is destroyed.
13295 *
 * ill_nce_cnt is the number of nce's referencing the ill through nce_ill. This
 * is incremented atomically in nce_add() where the nce is actually added to
 * the ill_nce list. Similarly it is decremented in nce_inactive() where the
 * nce is destroyed.
13300 *
13301 * ill_ilm_cnt is the ilm's reference to the ill. It is incremented in
13302 * ilm_add() and decremented before the ilm is freed in ilm_delete().
13303 *
13304 * Flow of ioctls involving interface down/up
13305 *
13306 * The following is the sequence of an attempt to set some critical flags on an
13307 * up interface.
13308 * ip_sioctl_flags
13309 * ipif_down
13310 * wait for ipif to be quiescent
13311 * ipif_down_tail
13312 * ip_sioctl_flags_tail
13313 *
 * All set ioctls that involve a down/up sequence have a skeleton similar
 * to the above. All the *tail functions are called after the refcounts have
 * dropped to the appropriate values.
13317 *
13318 * SIOC ioctls during the IPIF_CHANGING interval.
13319 *
 * Threads handling SIOC set ioctls serialize on the ipsq, but this
 * is not done for SIOC get ioctls. Since a set ioctl can cause several
 * steps of internal changes to the state, some of which are visible in
 * ipif_flags (such as IFF_UP being cleared and later set), and we want
 * the set ioctl to be atomic relative to the get ioctls, the SIOC get code
13325 * will wait and restart ioctls if IPIF_CHANGING is set. The mblk is then
13326 * enqueued in the ipsq and the operation is restarted by ipsq_exit() when
13327 * the current exclusive operation completes. The IPIF_CHANGING check
13328 * and enqueue is atomic using the ill_lock and ipsq_lock. The
13329 * lookup is done holding the ill_lock. Hence the ill/ipif state flags can't
13330 * change while the ill_lock is held. Before dropping the ill_lock we acquire
13331 * the ipsq_lock and call ipsq_enq. This ensures that ipsq_exit can't finish
13332 * until we release the ipsq_lock, even though the ill/ipif state flags
13333 * can change after we drop the ill_lock.
13334 */
13335 int
13336 ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
13337 {
13338 ill_t *ill = ipif->ipif_ill;
13339 conn_t *connp;
13340 boolean_t success;
13341 boolean_t ipif_was_up = B_FALSE;
13342 ip_stack_t *ipst = ill->ill_ipst;
13343
13344 ASSERT(IAM_WRITER_IPIF(ipif));
13345
13346 ip1dbg(("ipif_down(%s:%u)\n", ill->ill_name, ipif->ipif_id));
13347
13348 DTRACE_PROBE3(ipif__downup, char *, "ipif_down",
13349 ill_t *, ill, ipif_t *, ipif);
13350
13351 if (ipif->ipif_flags & IPIF_UP) {
13352 mutex_enter(&ill->ill_lock);
13353 ipif->ipif_flags &= ~IPIF_UP;
13354 ASSERT(ill->ill_ipif_up_count > 0);
13355 --ill->ill_ipif_up_count;
13356 mutex_exit(&ill->ill_lock);
13357 ipif_was_up = B_TRUE;
13358 /* Update status in SCTP's list */
13359 sctp_update_ipif(ipif, SCTP_IPIF_DOWN);
13360 ill_nic_event_dispatch(ipif->ipif_ill,
13361 MAP_IPIF_ID(ipif->ipif_id), NE_LIF_DOWN, NULL, 0);
13362 }
13363
/*
 * Removal of the last ipif from an ill may result in a DL_UNBIND
 * being sent to the driver, and we must not send any data packets to
 * the driver after the DL_UNBIND_REQ. To ensure this, all the
 * ire and nce entries used in the data path will be cleaned
 * up, and we also set the ILL_DOWN_IN_PROGRESS bit to make
 * sure no new entries will be added until the ill is bound
 * again. The ILL_DOWN_IN_PROGRESS bit is turned off upon
 * receipt of a DL_BIND_ACK.
 */
13374 if (ill->ill_wq != NULL && !ill->ill_logical_down &&
13375 ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 &&
13376 ill->ill_dl_up) {
13377 ill->ill_state_flags |= ILL_DOWN_IN_PROGRESS;
13378 }
13379
13380 /*
13381 * Blow away memberships we established in ipif_multicast_up().
13382 */
13383 ipif_multicast_down(ipif);
13384
13385 /*
13386 * Remove from the mapping for __sin6_src_id. We insert only
13387 * when the address is not INADDR_ANY. As IPv4 addresses are
13388 * stored as mapped addresses, we need to check for mapped
13389 * INADDR_ANY also.
13390 */
13391 if (ipif_was_up && !IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) &&
13392 !IN6_IS_ADDR_V4MAPPED_ANY(&ipif->ipif_v6lcl_addr) &&
13393 !(ipif->ipif_flags & IPIF_NOLOCAL)) {
13394 int err;
13395
13396 err = ip_srcid_remove(&ipif->ipif_v6lcl_addr,
13397 ipif->ipif_zoneid, ipst);
13398 if (err != 0) {
13399 ip0dbg(("ipif_down: srcid_remove %d\n", err));
13400 }
13401 }
13402
13403 if (ipif_was_up) {
13404 /* only delete if we'd added ire's before */
13405 if (ipif->ipif_isv6)
13406 ipif_delete_ires_v6(ipif);
13407 else
13408 ipif_delete_ires_v4(ipif);
13409 }
13410
13411 if (ipif_was_up && ill->ill_ipif_up_count == 0) {
/*
 * Since the interface is now down, it may have just become
 * inactive. Note that this needs to be done even for an
 * ipif_logical_down(), or ARP entries will not get correctly
 * restored when the interface comes back up.
 */
13418 if (IS_UNDER_IPMP(ill))
13419 ipmp_ill_refresh_active(ill);
13420 }
13421
/*
 * Clean up any neighbor-discovery or ARP entries for this interface.
 * The ipif has to be quiesced, so we walk all the nce's and delete
 * those that point at the ipif->ipif_ill. At the same time, we also
 * update IPMP so that ipifs for data addresses are unbound. We don't
 * call ipif_arp_down to DL_UNBIND the ARP stream itself here, but
 * defer that to ipif_down_tail().
 */
13430 ipif_nce_down(ipif);
13431
13432 /*
13433 * If this is the last ipif on the ill, we also need to remove
13434 * any IREs with ire_ill set. Otherwise ipif_is_quiescent() will
13435 * never succeed.
13436 */
13437 if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0)
13438 ire_walk_ill(0, 0, ill_downi, ill, ill);
13439
13440 /*
13441 * Walk all CONNs that can have a reference on an ire for this
13442 * ipif (we actually walk all that now have stale references).
13443 */
13444 ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ipst);
13445
/*
 * If mp is NULL the caller will wait for the appropriate refcnt.
 * E.g., ip_sioctl_removeif -> ipif_free -> ipif_down
 * and ill_delete -> ipif_free -> ipif_down
 */
13451 if (mp == NULL) {
13452 ASSERT(q == NULL);
13453 return (0);
13454 }
13455
13456 if (CONN_Q(q)) {
13457 connp = Q_TO_CONN(q);
13458 mutex_enter(&connp->conn_lock);
13459 } else {
13460 connp = NULL;
13461 }
13462 mutex_enter(&ill->ill_lock);
/*
 * Are there any ire's pointing to this ipif that are still active?
 * If this is the last ipif going down, are there any ire's pointing
 * to this ill that are still active?
 */
13468 if (ipif_is_quiescent(ipif)) {
13469 mutex_exit(&ill->ill_lock);
13470 if (connp != NULL)
13471 mutex_exit(&connp->conn_lock);
13472 return (0);
13473 }
13474
13475 ip1dbg(("ipif_down: need to wait, adding pending mp %s ill %p",
13476 ill->ill_name, (void *)ill));
13477 /*
13478 * Enqueue the mp atomically in ipsq_pending_mp. When the refcount
13479 * drops down, the operation will be restarted by ipif_ill_refrele_tail
13480 * which in turn is called by the last refrele on the ipif/ill/ire.
13481 */
13482 success = ipsq_pending_mp_add(connp, ipif, q, mp, IPIF_DOWN);
13483 if (!success) {
13484 /* The conn is closing. So just return */
13485 ASSERT(connp != NULL);
13486 mutex_exit(&ill->ill_lock);
13487 mutex_exit(&connp->conn_lock);
13488 return (EINTR);
13489 }
13490
13491 mutex_exit(&ill->ill_lock);
13492 if (connp != NULL)
13493 mutex_exit(&connp->conn_lock);
13494 return (EINPROGRESS);
13495 }
13496
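/*
 * Complete the shutdown begun in ipif_down() once the ipif has become
 * quiescent: unbind from the driver if this was the last logical interface
 * up on the ill, take ARP down, and generate the routing socket messages.
 */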
13497 int
13498 ipif_down_tail(ipif_t *ipif)
13499 {
13500 ill_t *ill = ipif->ipif_ill;
13501 int err = 0;
13502
13503 DTRACE_PROBE3(ipif__downup, char *, "ipif_down_tail",
13504 ill_t *, ill, ipif_t *, ipif);
13505
/*
 * Skip any loopback interface (null wq).
 * If this is the last logical interface on the ill, have
 * ill_dl_down tell the driver we are gone (unbind).
 * Note that lun 0 can ipif_down even though
 * there are other logical units that are up.
 * This occurs e.g. when we change a "significant" IFF_ flag.
 */
13514 if (ill->ill_wq != NULL && !ill->ill_logical_down &&
13515 ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 &&
13516 ill->ill_dl_up) {
13517 ill_dl_down(ill);
13518 }
13519 if (!ipif->ipif_isv6)
13520 err = ipif_arp_down(ipif);
13521
13522 ill->ill_logical_down = 0;
13523
13524 ip_rts_ifmsg(ipif, RTSQ_DEFAULT);
13525 ip_rts_newaddrmsg(RTM_DELETE, 0, ipif, RTSQ_DEFAULT);
13526 return (err);
13527 }
13528
/*
 * Bring the interface logically down without bringing the physical interface
 * down, e.g. when the netmask is changed. This avoids long-lasting link
 * negotiations between an ethernet interface and certain switches.
 */
13534 static int
13535 ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
13536 {
13537 DTRACE_PROBE3(ipif__downup, char *, "ipif_logical_down",
13538 ill_t *, ipif->ipif_ill, ipif_t *, ipif);
13539
13540 /*
13541 * The ill_logical_down flag is a transient flag. It is set here
13542 * and is cleared once the down has completed in ipif_down_tail.
13543 * This flag does not indicate whether the ill stream is in the
13544 * DL_BOUND state with the driver. Instead this flag is used by
13545 * ipif_down_tail to determine whether to DL_UNBIND the stream with
13546 * the driver. The state of the ill stream i.e. whether it is
13547 * DL_BOUND with the driver or not is indicated by the ill_dl_up flag.
13548 */
13549 ipif->ipif_ill->ill_logical_down = 1;
13550 return (ipif_down(ipif, q, mp));
13551 }
13552
/*
 * Initiate deallocation of an ipif. Always called as writer. Called by
 * ill_delete or ip_sioctl_removeif.
 */
13557 static void
13558 ipif_free(ipif_t *ipif)
13559 {
13560 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
13561
13562 ASSERT(IAM_WRITER_IPIF(ipif));
13563
13564 if (ipif->ipif_recovery_id != 0)
13565 (void) untimeout(ipif->ipif_recovery_id);
13566 ipif->ipif_recovery_id = 0;
13567
13568 /*
13569 * Take down the interface. We can be called either from ill_delete
13570 * or from ip_sioctl_removeif.
13571 */
13572 (void) ipif_down(ipif, NULL, NULL);
13573
13574 /*
13575 * Now that the interface is down, there's no chance it can still
13576 * become a duplicate. Cancel any timer that may have been set while
13577 * tearing down.
13578 */
13579 if (ipif->ipif_recovery_id != 0)
13580 (void) untimeout(ipif->ipif_recovery_id);
13581 ipif->ipif_recovery_id = 0;
13582
13583 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
13584 /* Remove pointers to this ill in the multicast routing tables */
13585 reset_mrt_vif_ipif(ipif);
13586 /* If necessary, clear the cached source ipif rotor. */
13587 if (ipif->ipif_ill->ill_src_ipif == ipif)
13588 ipif->ipif_ill->ill_src_ipif = NULL;
13589 rw_exit(&ipst->ips_ill_g_lock);
13590 }
13591
13592 static void
13593 ipif_free_tail(ipif_t *ipif)
13594 {
13595 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
13596
13597 /*
13598 * Need to hold both ill_g_lock and ill_lock while
13599 * inserting or removing an ipif from the linked list
13600 * of ipifs hanging off the ill.
13601 */
13602 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
13603
13604 #ifdef DEBUG
13605 ipif_trace_cleanup(ipif);
13606 #endif
13607
13608 /* Ask SCTP to take it out of it list */
13609 sctp_update_ipif(ipif, SCTP_IPIF_REMOVE);
13610 ip_rts_newaddrmsg(RTM_FREEADDR, 0, ipif, RTSQ_DEFAULT);
13611
13612 /* Get it out of the ILL interface list. */
13613 ipif_remove(ipif);
13614 rw_exit(&ipst->ips_ill_g_lock);
13615
13616 ASSERT(!(ipif->ipif_flags & (IPIF_UP | IPIF_DUPLICATE)));
13617 ASSERT(ipif->ipif_recovery_id == 0);
13618 ASSERT(ipif->ipif_ire_local == NULL);
13619 ASSERT(ipif->ipif_ire_if == NULL);
13620
13621 /* Free the memory. */
13622 mi_free(ipif);
13623 }
13624
13625 /*
13626 * Sets `buf' to an ipif name of the form "ill_name:id", or "ill_name" if "id"
13627 * is zero.
13628 */
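/*
 * For example (sketch), given ill name "hme0" and ipif_id 2 this yields
 * "hme0:2", truncated as needed to fit in `len' bytes.
 */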
13629 void
13630 ipif_get_name(const ipif_t *ipif, char *buf, int len)
13631 {
13632 char lbuf[LIFNAMSIZ];
13633 char *name;
13634 size_t name_len;
13635
13636 buf[0] = '\0';
13637 name = ipif->ipif_ill->ill_name;
13638 name_len = ipif->ipif_ill->ill_name_length;
13639 if (ipif->ipif_id != 0) {
13640 (void) sprintf(lbuf, "%s%c%d", name, IPIF_SEPARATOR_CHAR,
13641 ipif->ipif_id);
13642 name = lbuf;
13643 name_len = mi_strlen(name) + 1;
13644 }
13645 len -= 1;
13646 buf[len] = '\0';
13647 len = MIN(len, name_len);
13648 bcopy(name, buf, len);
13649 }
13650
13651 /*
13652 * Sets `buf' to an ill name.
13653 */
13654 void
13655 ill_get_name(const ill_t *ill, char *buf, int len)
13656 {
13657 char *name;
13658 size_t name_len;
13659
13660 name = ill->ill_name;
13661 name_len = ill->ill_name_length;
13662 len -= 1;
13663 buf[len] = '\0';
13664 len = MIN(len, name_len);
13665 bcopy(name, buf, len);
13666 }
13667
13668 /*
13669 * Find an IPIF based on the name passed in. Names can be of the form <phys>
13670 * (e.g., le0) or <phys>:<#> (e.g., le0:1). When there is no colon, the
13671 * implied unit id is zero. <phys> must correspond to the name of an ILL.
13672 * (May be called as writer.)
13673 */
13674 static ipif_t *
13675 ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc,
13676 boolean_t *exists, boolean_t isv6, zoneid_t zoneid, ip_stack_t *ipst)
13677 {
13678 char *cp;
13679 char *endp;
13680 long id;
13681 ill_t *ill;
13682 ipif_t *ipif;
13683 uint_t ire_type;
13684 boolean_t did_alloc = B_FALSE;
13685 char last;
13686
/*
 * If the caller wants us to create the ipif, make sure we have a
 * valid zoneid
 */
13691 ASSERT(!do_alloc || zoneid != ALL_ZONES);
13692
13693 if (namelen == 0) {
13694 return (NULL);
13695 }
13696
13697 *exists = B_FALSE;
13698 /* Look for a colon in the name. */
13699 endp = &name[namelen];
13700 for (cp = endp; --cp > name; ) {
13701 if (*cp == IPIF_SEPARATOR_CHAR)
13702 break;
13703 }
13704
13705 if (*cp == IPIF_SEPARATOR_CHAR) {
/*
 * Reject any non-decimal aliases for logical
 * interfaces. Aliases with leading zeroes
 * are also rejected as they introduce ambiguity
 * in the naming of the interfaces.
 * In order to conform with existing semantics,
 * and to not break any programs/scripts relying
 * on that behaviour, if<0>:0 is considered to be
 * a valid interface.
 *
 * If the alias has two or more digits and the first
 * is zero, fail.
 */
13719 if (&cp[2] < endp && cp[1] == '0') {
13720 return (NULL);
13721 }
13722 }
13723
13724 if (cp <= name) {
13725 cp = endp;
13726 }
13727 last = *cp;
13728 *cp = '\0';
13729
/*
 * Look up the ILL, based on the portion of the name
 * before the separator. ill_lookup_on_name returns a held ill.
 * It sets did_alloc if it had to allocate a new ill, which we
 * use below to decide whether the ipif already existed.
 */
13736 ill = ill_lookup_on_name(name, do_alloc, isv6,
13737 &did_alloc, ipst);
13738 *cp = last;
13739 if (ill == NULL)
13740 return (NULL);
13741
13742 /* Establish the unit number in the name. */
13743 id = 0;
13744 if (cp < endp && *endp == '\0') {
13745 /* If there was a colon, the unit number follows. */
13746 cp++;
13747 if (ddi_strtol(cp, NULL, 0, &id) != 0) {
13748 ill_refrele(ill);
13749 return (NULL);
13750 }
13751 }
13752
13753 mutex_enter(&ill->ill_lock);
13754 /* Now see if there is an IPIF with this unit number. */
13755 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
13756 if (ipif->ipif_id == id) {
13757 if (zoneid != ALL_ZONES &&
13758 zoneid != ipif->ipif_zoneid &&
13759 ipif->ipif_zoneid != ALL_ZONES) {
13760 mutex_exit(&ill->ill_lock);
13761 ill_refrele(ill);
13762 return (NULL);
13763 }
13764 if (IPIF_CAN_LOOKUP(ipif)) {
13765 ipif_refhold_locked(ipif);
13766 mutex_exit(&ill->ill_lock);
13767 if (!did_alloc)
13768 *exists = B_TRUE;
13769 /*
13770 * Drop locks before calling ill_refrele
13771 * since it can potentially call into
13772 * ipif_ill_refrele_tail which can end up
13773 * in trying to acquire any lock.
13774 */
13775 ill_refrele(ill);
13776 return (ipif);
13777 }
13778 }
13779 }
13780
13781 if (!do_alloc) {
13782 mutex_exit(&ill->ill_lock);
13783 ill_refrele(ill);
13784 return (NULL);
13785 }
13786
/*
 * If none found, atomically allocate and return a new one.
 * Historically, we used IRE_LOOPBACK only for lun 0, and IRE_LOCAL
 * to support "receive only" use of lo0:1 etc. as is still done
 * below as an initial guess.
 * However, this is now likely to be overridden later in ipif_up_done()
 * when we know for sure what address has been configured on the
 * interface, since we might have more than one loopback interface
 * with a loopback address, e.g. in the case of zones, and all the
 * interfaces with loopback addresses need to be marked IRE_LOOPBACK.
 */
13798 if (ill->ill_net_type == IRE_LOOPBACK && id == 0)
13799 ire_type = IRE_LOOPBACK;
13800 else
13801 ire_type = IRE_LOCAL;
13802 ipif = ipif_allocate(ill, id, ire_type, B_TRUE, B_TRUE, NULL);
13803 if (ipif != NULL)
13804 ipif_refhold_locked(ipif);
13805 mutex_exit(&ill->ill_lock);
13806 ill_refrele(ill);
13807 return (ipif);
13808 }
13809
13810 /*
13811 * Variant of the above that queues the request on the ipsq when
13812 * IPIF_CHANGING is set.
13813 */
13814 static ipif_t *
13815 ipif_lookup_on_name_async(char *name, size_t namelen, boolean_t isv6,
13816 zoneid_t zoneid, queue_t *q, mblk_t *mp, ipsq_func_t func, int *error,
13817 ip_stack_t *ipst)
13818 {
13819 char *cp;
13820 char *endp;
13821 long id;
13822 ill_t *ill;
13823 ipif_t *ipif;
13824 boolean_t did_alloc = B_FALSE;
13825 ipsq_t *ipsq;
13826
13827 if (error != NULL)
13828 *error = 0;
13829
13830 if (namelen == 0) {
13831 if (error != NULL)
13832 *error = ENXIO;
13833 return (NULL);
13834 }
13835
13836 /* Look for a colon in the name. */
13837 endp = &name[namelen];
13838 for (cp = endp; --cp > name; ) {
13839 if (*cp == IPIF_SEPARATOR_CHAR)
13840 break;
13841 }
13842
13843 if (*cp == IPIF_SEPARATOR_CHAR) {
/*
 * Reject any non-decimal aliases for logical
 * interfaces. Aliases with leading zeroes
 * are also rejected as they introduce ambiguity
 * in the naming of the interfaces.
 * In order to conform with existing semantics,
 * and to not break any programs/scripts relying
 * on that behaviour, if<0>:0 is considered to be
 * a valid interface.
 *
 * If the alias has two or more digits and the first
 * is zero, fail.
 */
13857 if (&cp[2] < endp && cp[1] == '0') {
13858 if (error != NULL)
13859 *error = EINVAL;
13860 return (NULL);
13861 }
13862 }
13863
13864 if (cp <= name) {
13865 cp = endp;
13866 } else {
13867 *cp = '\0';
13868 }
13869
/*
 * Look up the ILL, based on the portion of the name
 * before the separator. ill_lookup_on_name returns a held ill.
 * It sets did_alloc if it had to allocate a new ill, which tells
 * us whether the ill already existed.
 */
13876 ill = ill_lookup_on_name(name, B_FALSE, isv6, &did_alloc, ipst);
13877 if (cp != endp)
13878 *cp = IPIF_SEPARATOR_CHAR;
13879 if (ill == NULL)
13880 return (NULL);
13881
13882 /* Establish the unit number in the name. */
13883 id = 0;
13884 if (cp < endp && *endp == '\0') {
13885 /* If there was a colon, the unit number follows. */
13886 cp++;
13887 if (ddi_strtol(cp, NULL, 0, &id) != 0) {
13888 ill_refrele(ill);
13889 if (error != NULL)
13890 *error = ENXIO;
13891 return (NULL);
13892 }
13893 }
13894
13895 GRAB_CONN_LOCK(q);
13896 mutex_enter(&ill->ill_lock);
13897 /* Now see if there is an IPIF with this unit number. */
13898 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
13899 if (ipif->ipif_id == id) {
13900 if (zoneid != ALL_ZONES &&
13901 zoneid != ipif->ipif_zoneid &&
13902 ipif->ipif_zoneid != ALL_ZONES) {
13903 mutex_exit(&ill->ill_lock);
13904 RELEASE_CONN_LOCK(q);
13905 ill_refrele(ill);
13906 if (error != NULL)
13907 *error = ENXIO;
13908 return (NULL);
13909 }
13910
13911 if (!(IPIF_IS_CHANGING(ipif) ||
13912 IPIF_IS_CONDEMNED(ipif)) ||
13913 IAM_WRITER_IPIF(ipif)) {
13914 ipif_refhold_locked(ipif);
13915 mutex_exit(&ill->ill_lock);
13916 /*
13917 * Drop locks before calling ill_refrele
13918 * since it can potentially call into
13919 * ipif_ill_refrele_tail which can end up
13920 * in trying to acquire any lock.
13921 */
13922 RELEASE_CONN_LOCK(q);
13923 ill_refrele(ill);
13924 return (ipif);
13925 } else if (q != NULL && !IPIF_IS_CONDEMNED(ipif)) {
13926 ipsq = ill->ill_phyint->phyint_ipsq;
13927 mutex_enter(&ipsq->ipsq_lock);
13928 mutex_enter(&ipsq->ipsq_xop->ipx_lock);
13929 mutex_exit(&ill->ill_lock);
13930 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill);
13931 mutex_exit(&ipsq->ipsq_xop->ipx_lock);
13932 mutex_exit(&ipsq->ipsq_lock);
13933 RELEASE_CONN_LOCK(q);
13934 ill_refrele(ill);
13935 if (error != NULL)
13936 *error = EINPROGRESS;
13937 return (NULL);
13938 }
13939 }
13940 }
13941 RELEASE_CONN_LOCK(q);
13942 mutex_exit(&ill->ill_lock);
13943 ill_refrele(ill);
13944 if (error != NULL)
13945 *error = ENXIO;
13946 return (NULL);
13947 }
13948
13949 /*
13950 * This routine is called whenever a new address comes up on an ipif. If
13951 * we are configured to respond to address mask requests, then we are supposed
13952 * to broadcast an address mask reply at this time. This routine is also
13953 * called if we are already up, but a netmask change is made. This is legal
13954 * but might not make the system manager very popular. (May be called
13955 * as writer.)
13956 */
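/*
 * The reply built below is laid out as (see REPLY_LEN):
 *
 *     [ IPv4 header | ICMP header (ICMP_ADDRESS_MASK_REPLY) | 4-byte mask ]
 *
 * sourced from ipif_lcl_addr and sent to ipif_brd_addr.
 */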
13957 void
13958 ipif_mask_reply(ipif_t *ipif)
13959 {
13960 icmph_t *icmph;
13961 ipha_t *ipha;
13962 mblk_t *mp;
13963 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
13964 ip_xmit_attr_t ixas;
13965
13966 #define REPLY_LEN (sizeof (icmp_ipha) + sizeof (icmph_t) + IP_ADDR_LEN)
13967
13968 if (!ipst->ips_ip_respond_to_address_mask_broadcast)
13969 return;
13970
13971 /* ICMP mask reply is IPv4 only */
13972 ASSERT(!ipif->ipif_isv6);
13973 /* ICMP mask reply is not for a loopback interface */
13974 ASSERT(ipif->ipif_ill->ill_wq != NULL);
13975
13976 if (ipif->ipif_lcl_addr == INADDR_ANY)
13977 return;
13978
13979 mp = allocb(REPLY_LEN, BPRI_HI);
13980 if (mp == NULL)
13981 return;
13982 mp->b_wptr = mp->b_rptr + REPLY_LEN;
13983
13984 ipha = (ipha_t *)mp->b_rptr;
13985 bzero(ipha, REPLY_LEN);
13986 *ipha = icmp_ipha;
13987 ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl;
13988 ipha->ipha_src = ipif->ipif_lcl_addr;
13989 ipha->ipha_dst = ipif->ipif_brd_addr;
13990 ipha->ipha_length = htons(REPLY_LEN);
13991 ipha->ipha_ident = 0;
13992
13993 icmph = (icmph_t *)&ipha[1];
13994 icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY;
13995 bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN);
13996 icmph->icmph_checksum = IP_CSUM(mp, sizeof (ipha_t), 0);
13997
13998 bzero(&ixas, sizeof (ixas));
13999 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
14000 ixas.ixa_zoneid = ALL_ZONES;
14001 ixas.ixa_ifindex = 0;
14002 ixas.ixa_ipst = ipst;
14003 ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
14004 (void) ip_output_simple(mp, &ixas);
14005 ixa_cleanup(&ixas);
14006 #undef REPLY_LEN
14007 }
14008
14009 /*
14010 * Join the ipif specific multicast groups.
14011 * Must be called after a mapping has been set up in the resolver. (Always
14012 * called as writer.)
14013 */
14014 void
14015 ipif_multicast_up(ipif_t *ipif)
14016 {
14017 int err;
14018 ill_t *ill;
14019 ilm_t *ilm;
14020
14021 ASSERT(IAM_WRITER_IPIF(ipif));
14022
14023 ill = ipif->ipif_ill;
14024
14025 ip1dbg(("ipif_multicast_up\n"));
14026 if (!(ill->ill_flags & ILLF_MULTICAST) ||
14027 ipif->ipif_allhosts_ilm != NULL)
14028 return;
14029
14030 if (ipif->ipif_isv6) {
14031 in6_addr_t v6allmc = ipv6_all_hosts_mcast;
14032 in6_addr_t v6solmc = ipv6_solicited_node_mcast;
14033
14034 v6solmc.s6_addr32[3] |= ipif->ipif_v6lcl_addr.s6_addr32[3];
14035
14036 if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr))
14037 return;
14038
14039 ip1dbg(("ipif_multicast_up - addmulti\n"));
14040
14041 /*
14042 * Join the all hosts multicast address. We skip this for
14043 * underlying IPMP interfaces since they should be invisible.
14044 */
14045 if (!IS_UNDER_IPMP(ill)) {
14046 ilm = ip_addmulti(&v6allmc, ill, ipif->ipif_zoneid,
14047 &err);
14048 if (ilm == NULL) {
14049 ASSERT(err != 0);
14050 ip0dbg(("ipif_multicast_up: "
14051 "all_hosts_mcast failed %d\n", err));
14052 return;
14053 }
14054 ipif->ipif_allhosts_ilm = ilm;
14055 }
14056
14057 /*
14058 * Enable multicast for the solicited node multicast address.
14059 * If IPMP we need to put the membership on the upper ill.
14060 */
14061 if (!(ipif->ipif_flags & IPIF_NOLOCAL)) {
14062 ill_t *mcast_ill = NULL;
14063 boolean_t need_refrele;
14064
14065 if (IS_UNDER_IPMP(ill) &&
14066 (mcast_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) {
14067 need_refrele = B_TRUE;
14068 } else {
14069 mcast_ill = ill;
14070 need_refrele = B_FALSE;
14071 }
14072
14073 ilm = ip_addmulti(&v6solmc, mcast_ill,
14074 ipif->ipif_zoneid, &err);
14075 if (need_refrele)
14076 ill_refrele(mcast_ill);
14077
14078 if (ilm == NULL) {
14079 ASSERT(err != 0);
14080 ip0dbg(("ipif_multicast_up: solicited MC"
14081 " failed %d\n", err));
14082 if ((ilm = ipif->ipif_allhosts_ilm) != NULL) {
14083 ipif->ipif_allhosts_ilm = NULL;
14084 (void) ip_delmulti(ilm);
14085 }
14086 return;
14087 }
14088 ipif->ipif_solmulti_ilm = ilm;
14089 }
14090 } else {
14091 in6_addr_t v6group;
14092
14093 if (ipif->ipif_lcl_addr == INADDR_ANY || IS_UNDER_IPMP(ill))
14094 return;
14095
14096 /* Join the all hosts multicast address */
14097 ip1dbg(("ipif_multicast_up - addmulti\n"));
14098 IN6_IPADDR_TO_V4MAPPED(htonl(INADDR_ALLHOSTS_GROUP), &v6group);
14099
14100 ilm = ip_addmulti(&v6group, ill, ipif->ipif_zoneid, &err);
14101 if (ilm == NULL) {
14102 ASSERT(err != 0);
14103 ip0dbg(("ipif_multicast_up: failed %d\n", err));
14104 return;
14105 }
14106 ipif->ipif_allhosts_ilm = ilm;
14107 }
14108 }
14109
14110 /*
14111 * Blow away any multicast groups that we joined in ipif_multicast_up().
14112 * (ilms from explicit memberships are handled in conn_update_ill.)
14113 */
14114 void
14115 ipif_multicast_down(ipif_t *ipif)
14116 {
14117 ASSERT(IAM_WRITER_IPIF(ipif));
14118
14119 ip1dbg(("ipif_multicast_down\n"));
14120
14121 if (ipif->ipif_allhosts_ilm != NULL) {
14122 (void) ip_delmulti(ipif->ipif_allhosts_ilm);
14123 ipif->ipif_allhosts_ilm = NULL;
14124 }
14125 if (ipif->ipif_solmulti_ilm != NULL) {
14126 (void) ip_delmulti(ipif->ipif_solmulti_ilm);
14127 ipif->ipif_solmulti_ilm = NULL;
14128 }
14129 }
14130
14131 /*
14132 * Used when an interface comes up to recreate any extra routes on this
14133 * interface.
14134 */
14135 int
14136 ill_recover_saved_ire(ill_t *ill)
14137 {
14138 mblk_t *mp;
14139 ip_stack_t *ipst = ill->ill_ipst;
14140
14141 ip1dbg(("ill_recover_saved_ire(%s)", ill->ill_name));
14142
14143 mutex_enter(&ill->ill_saved_ire_lock);
14144 for (mp = ill->ill_saved_ire_mp; mp != NULL; mp = mp->b_cont) {
14145 ire_t *ire, *nire;
14146 ifrt_t *ifrt;
14147
14148 ifrt = (ifrt_t *)mp->b_rptr;
14149 /*
14150 * Create a copy of the IRE with the saved address and netmask.
14151 */
14152 if (ill->ill_isv6) {
14153 ire = ire_create_v6(
14154 &ifrt->ifrt_v6addr,
14155 &ifrt->ifrt_v6mask,
14156 &ifrt->ifrt_v6gateway_addr,
14157 ifrt->ifrt_type,
14158 ill,
14159 ifrt->ifrt_zoneid,
14160 ifrt->ifrt_flags,
14161 NULL,
14162 ipst);
14163 } else {
14164 ire = ire_create(
14165 (uint8_t *)&ifrt->ifrt_addr,
14166 (uint8_t *)&ifrt->ifrt_mask,
14167 (uint8_t *)&ifrt->ifrt_gateway_addr,
14168 ifrt->ifrt_type,
14169 ill,
14170 ifrt->ifrt_zoneid,
14171 ifrt->ifrt_flags,
14172 NULL,
14173 ipst);
14174 }
14175 if (ire == NULL) {
14176 mutex_exit(&ill->ill_saved_ire_lock);
14177 return (ENOMEM);
14178 }
14179
14180 if (ifrt->ifrt_flags & RTF_SETSRC) {
14181 if (ill->ill_isv6) {
14182 ire->ire_setsrc_addr_v6 =
14183 ifrt->ifrt_v6setsrc_addr;
14184 } else {
14185 ire->ire_setsrc_addr = ifrt->ifrt_setsrc_addr;
14186 }
14187 }
14188
/*
 * Some software (for example, GateD and Sun Cluster) attempts
 * to create (what amount to) IRE_PREFIX routes with the
 * loopback address as the gateway. This is primarily done to
 * set up prefixes with the RTF_REJECT flag set (for example,
 * when generating aggregate routes.)
 *
 * If the IRE type (as defined by ill->ill_net_type) is
 * IRE_LOOPBACK, then we map the request into an
 * IRE_IF_NORESOLVER.
 */
14200 if (ill->ill_net_type == IRE_LOOPBACK)
14201 ire->ire_type = IRE_IF_NORESOLVER;
14202
/*
 * The ire is held by ire_add and will be refrele'd
 * towards the end of ipif_up_done.
 */
14207 nire = ire_add(ire);
/*
 * Check if it was a duplicate entry. This handles
 * the case of two racing route adds for the same route.
 */
14212 if (nire == NULL) {
14213 ip1dbg(("ill_recover_saved_ire: FAILED\n"));
14214 } else if (nire != ire) {
14215 ip1dbg(("ill_recover_saved_ire: duplicate ire %p\n",
14216 (void *)nire));
14217 ire_delete(nire);
14218 } else {
14219 ip1dbg(("ill_recover_saved_ire: added ire %p\n",
14220 (void *)nire));
14221 }
14222 if (nire != NULL)
14223 ire_refrele(nire);
14224 }
14225 mutex_exit(&ill->ill_saved_ire_lock);
14226 return (0);
14227 }
14228
14229 /*
14230 * Used to set the netmask and broadcast address to default values when the
14231 * interface is brought up. (Always called as writer.)
14232 */
14233 static void
14234 ipif_set_default(ipif_t *ipif)
14235 {
14236 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
14237
14238 if (!ipif->ipif_isv6) {
14239 /*
14240 * Interface holds an IPv4 address. Default
14241 * mask is the natural netmask.
14242 */
14243 if (!ipif->ipif_net_mask) {
14244 ipaddr_t v4mask;
14245
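			/*
			 * ip_net_mask() returns the natural (classful)
			 * mask for the address, e.g. 255.0.0.0 for a
			 * class A address.
			 */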
14246 v4mask = ip_net_mask(ipif->ipif_lcl_addr);
14247 V4MASK_TO_V6(v4mask, ipif->ipif_v6net_mask);
14248 }
14249 if (ipif->ipif_flags & IPIF_POINTOPOINT) {
14250 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */
14251 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr;
14252 } else {
14253 V6_MASK_COPY(ipif->ipif_v6lcl_addr,
14254 ipif->ipif_v6net_mask, ipif->ipif_v6subnet);
14255 }
/*
 * NOTE: SunOS 4.X does this even if the broadcast address
 * has already been set, thus we do the same here.
 */
14260 if (ipif->ipif_flags & IPIF_BROADCAST) {
14261 ipaddr_t v4addr;
14262
14263 v4addr = ipif->ipif_subnet | ~ipif->ipif_net_mask;
14264 IN6_IPADDR_TO_V4MAPPED(v4addr, &ipif->ipif_v6brd_addr);
14265 }
14266 } else {
14267 /*
14268 * Interface holds an IPv6-only address. Default
14269 * mask is all-ones.
14270 */
14271 if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6net_mask))
14272 ipif->ipif_v6net_mask = ipv6_all_ones;
14273 if (ipif->ipif_flags & IPIF_POINTOPOINT) {
14274 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */
14275 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr;
14276 } else {
14277 V6_MASK_COPY(ipif->ipif_v6lcl_addr,
14278 ipif->ipif_v6net_mask, ipif->ipif_v6subnet);
14279 }
14280 }
14281 }
14282
14283 /*
14284 * Return 0 if this address can be used as local address without causing
14285 * duplicate address problems. Otherwise, return EADDRNOTAVAIL if the address
14286 * is already up on a different ill, and EADDRINUSE if it's up on the same ill.
14287 * Note that the same IPv6 link-local address is allowed as long as the ills
14288 * are not on the same link.
14289 */
14290 int
14291 ip_addr_availability_check(ipif_t *new_ipif)
14292 {
14293 in6_addr_t our_v6addr;
14294 ill_t *ill;
14295 ipif_t *ipif;
14296 ill_walk_context_t ctx;
14297 ip_stack_t *ipst = new_ipif->ipif_ill->ill_ipst;
14298
14299 ASSERT(IAM_WRITER_IPIF(new_ipif));
14300 ASSERT(MUTEX_HELD(&ipst->ips_ip_addr_avail_lock));
14301 ASSERT(RW_READ_HELD(&ipst->ips_ill_g_lock));
14302
14303 new_ipif->ipif_flags &= ~IPIF_UNNUMBERED;
14304 if (IN6_IS_ADDR_UNSPECIFIED(&new_ipif->ipif_v6lcl_addr) ||
14305 IN6_IS_ADDR_V4MAPPED_ANY(&new_ipif->ipif_v6lcl_addr))
14306 return (0);
14307
14308 our_v6addr = new_ipif->ipif_v6lcl_addr;
14309
14310 if (new_ipif->ipif_isv6)
14311 ill = ILL_START_WALK_V6(&ctx, ipst);
14312 else
14313 ill = ILL_START_WALK_V4(&ctx, ipst);
14314
14315 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
14316 for (ipif = ill->ill_ipif; ipif != NULL;
14317 ipif = ipif->ipif_next) {
14318 if ((ipif == new_ipif) ||
14319 !(ipif->ipif_flags & IPIF_UP) ||
14320 (ipif->ipif_flags & IPIF_UNNUMBERED) ||
14321 !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
14322 &our_v6addr))
14323 continue;
14324
14325 if (new_ipif->ipif_flags & IPIF_POINTOPOINT)
14326 new_ipif->ipif_flags |= IPIF_UNNUMBERED;
14327 else if (ipif->ipif_flags & IPIF_POINTOPOINT)
14328 ipif->ipif_flags |= IPIF_UNNUMBERED;
14329 else if ((IN6_IS_ADDR_LINKLOCAL(&our_v6addr) ||
14330 IN6_IS_ADDR_SITELOCAL(&our_v6addr)) &&
14331 !IS_ON_SAME_LAN(ill, new_ipif->ipif_ill))
14332 continue;
14333 else if (new_ipif->ipif_zoneid != ipif->ipif_zoneid &&
14334 ipif->ipif_zoneid != ALL_ZONES && IS_LOOPBACK(ill))
14335 continue;
14336 else if (new_ipif->ipif_ill == ill)
14337 return (EADDRINUSE);
14338 else
14339 return (EADDRNOTAVAIL);
14340 }
14341 }
14342
14343 return (0);
14344 }
14345
14346 /*
14347 * Bring up an ipif: bring up arp/ndp, bring up the DLPI stream, and add
14348 * IREs for the ipif.
14349 * When the routine returns EINPROGRESS then mp has been consumed and
14350 * the ioctl will be acked from ip_rput_dlpi.
14351 */
14352 int
14353 ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp)
14354 {
14355 ill_t *ill = ipif->ipif_ill;
14356 boolean_t isv6 = ipif->ipif_isv6;
14357 int err = 0;
14358 boolean_t success;
14359 uint_t ipif_orig_id;
14360 ip_stack_t *ipst = ill->ill_ipst;
14361
14362 ASSERT(IAM_WRITER_IPIF(ipif));
14363
14364 ip1dbg(("ipif_up(%s:%u)\n", ill->ill_name, ipif->ipif_id));
14365 DTRACE_PROBE3(ipif__downup, char *, "ipif_up",
14366 ill_t *, ill, ipif_t *, ipif);
14367
14368 /* Shouldn't get here if it is already up. */
14369 if (ipif->ipif_flags & IPIF_UP)
14370 return (EALREADY);
14371
/*
 * If this is a request to bring up a data address on an interface
 * under IPMP, then move the address to its IPMP meta-interface and
 * try to bring it up. One complication is that the zeroth ipif for
 * an ill is special, in that every ill always has one, and that code
 * throughout IP dereferences ill->ill_ipif without holding any locks.
 */
14379 if (IS_UNDER_IPMP(ill) && ipmp_ipif_is_dataaddr(ipif) &&
14380 (!ipif->ipif_isv6 || !V6_IPIF_LINKLOCAL(ipif))) {
14381 ipif_t *stubipif = NULL, *moveipif = NULL;
14382 ill_t *ipmp_ill = ipmp_illgrp_ipmp_ill(ill->ill_grp);
14383
14384 /*
14385 * The ipif being brought up should be quiesced. If it's not,
14386 * something has gone amiss and we need to bail out. (If it's
14387 * quiesced, we know it will remain so via IPIF_CONDEMNED.)
14388 */
14389 mutex_enter(&ill->ill_lock);
14390 if (!ipif_is_quiescent(ipif)) {
14391 mutex_exit(&ill->ill_lock);
14392 return (EINVAL);
14393 }
14394 mutex_exit(&ill->ill_lock);
14395
14396 /*
14397 * If we're going to need to allocate ipifs, do it prior
14398 * to starting the move (and grabbing locks).
14399 */
14400 if (ipif->ipif_id == 0) {
14401 if ((moveipif = ipif_allocate(ill, 0, IRE_LOCAL, B_TRUE,
14402 B_FALSE, &err)) == NULL) {
14403 return (err);
14404 }
14405 if ((stubipif = ipif_allocate(ill, 0, IRE_LOCAL, B_TRUE,
14406 B_FALSE, &err)) == NULL) {
14407 mi_free(moveipif);
14408 return (err);
14409 }
14410 }
14411
14412 /*
14413 * Grab or transfer the ipif to move. During the move, keep
14414 * ill_g_lock held to prevent any ill walker threads from
14415 * seeing things in an inconsistent state.
14416 */
14417 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
14418 if (ipif->ipif_id != 0) {
14419 ipif_remove(ipif);
14420 } else {
14421 ipif_transfer(ipif, moveipif, stubipif);
14422 ipif = moveipif;
14423 }
14424
14425 /*
14426 * Place the ipif on the IPMP ill. If the zeroth ipif on
14427 * the IPMP ill is a stub (0.0.0.0 down address) then we
14428 * replace that one. Otherwise, pick the next available slot.
14429 */
14430 ipif->ipif_ill = ipmp_ill;
14431 ipif_orig_id = ipif->ipif_id;
14432
14433 if (ipmp_ipif_is_stubaddr(ipmp_ill->ill_ipif)) {
14434 ipif_transfer(ipif, ipmp_ill->ill_ipif, NULL);
14435 ipif = ipmp_ill->ill_ipif;
14436 } else {
14437 ipif->ipif_id = -1;
14438 if ((err = ipif_insert(ipif, B_FALSE)) != 0) {
14439 /*
14440 * No more available ipif_id's -- put it back
14441 * on the original ill and fail the operation.
14442 * Since we're writer on the ill, we can be
14443 * sure our old slot is still available.
14444 */
14445 ipif->ipif_id = ipif_orig_id;
14446 ipif->ipif_ill = ill;
14447 if (ipif_orig_id == 0) {
14448 ipif_transfer(ipif, ill->ill_ipif,
14449 NULL);
14450 } else {
14451 VERIFY(ipif_insert(ipif, B_FALSE) == 0);
14452 }
14453 rw_exit(&ipst->ips_ill_g_lock);
14454 return (err);
14455 }
14456 }
14457 rw_exit(&ipst->ips_ill_g_lock);
14458
14459 /*
14460 * Tell SCTP that the ipif has moved. Note that even if we
14461 * had to allocate a new ipif, the original sequence id was
14462 * preserved and therefore SCTP won't know.
14463 */
14464 sctp_move_ipif(ipif, ill, ipmp_ill);
14465
14466 /*
14467 * If the ipif being brought up was on slot zero, then we
14468 * first need to bring up the placeholder we stuck there. In
14469 * ip_rput_dlpi_writer(), arp_bringup_done(), or the recursive
14470 * call to ipif_up() itself, if we successfully bring up the
14471 * placeholder, we'll check ill_move_ipif and bring it up too.
14472 */
14473 if (ipif_orig_id == 0) {
14474 ASSERT(ill->ill_move_ipif == NULL);
14475 ill->ill_move_ipif = ipif;
14476 if ((err = ipif_up(ill->ill_ipif, q, mp)) == 0)
14477 ASSERT(ill->ill_move_ipif == NULL);
14478 if (err != EINPROGRESS)
14479 ill->ill_move_ipif = NULL;
14480 return (err);
14481 }
14482
14483 /*
14484 * Bring it up on the IPMP ill.
14485 */
14486 return (ipif_up(ipif, q, mp));
14487 }
14488
14489 /* Skip arp/ndp for any loopback interface. */
14490 if (ill->ill_wq != NULL) {
14491 conn_t *connp = CONN_Q(q) ? Q_TO_CONN(q) : NULL;
14492 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;
14493
14494 if (!ill->ill_dl_up) {
/*
 * ill_dl_up is not yet set, i.e. we have yet to
 * DL_BIND with the driver and this is the first
 * logical interface on the ill to become "up".
 * Tell the driver to get going (via DL_BIND_REQ).
 * Note that changing "significant" IFF_ flags,
 * address/netmask etc. causes a down/up dance, but
 * does not cause an unbind (DL_UNBIND) with the driver
 */
14504 return (ill_dl_up(ill, ipif, mp, q));
14505 }
14506
/*
 * ipif_resolver_up may end up needing to bind/attach
 * the ARP stream, which in turn necessitates a
 * DLPI message exchange with the driver. ioctls are
 * serialized and so we cannot send more than one
 * interface up message at a time. If ipif_resolver_up
 * does need to wait for the DLPI handshake for the ARP stream,
 * we get EINPROGRESS and we will complete in arp_bringup_done.
 */
14516
14517 ASSERT(connp != NULL || !CONN_Q(q));
14518 if (connp != NULL)
14519 mutex_enter(&connp->conn_lock);
14520 mutex_enter(&ill->ill_lock);
14521 success = ipsq_pending_mp_add(connp, ipif, q, mp, 0);
14522 mutex_exit(&ill->ill_lock);
14523 if (connp != NULL)
14524 mutex_exit(&connp->conn_lock);
14525 if (!success)
14526 return (EINTR);
14527
14528 /*
14529 * Crank up IPv6 neighbor discovery. Unlike ARP, this should
14530 * complete when ipif_ndp_up returns.
14531 */
14532 err = ipif_resolver_up(ipif, Res_act_initial);
14533 if (err == EINPROGRESS) {
14534 /* We will complete it in arp_bringup_done() */
14535 return (err);
14536 }
14537
14538 if (isv6 && err == 0)
14539 err = ipif_ndp_up(ipif, B_TRUE);
14540
14541 ASSERT(err != EINPROGRESS);
14542 mp = ipsq_pending_mp_get(ipsq, &connp);
14543 ASSERT(mp != NULL);
14544 if (err != 0)
14545 return (err);
14546 } else {
14547 /*
14548 * Interfaces without underlying hardware don't do duplicate
14549 * address detection.
14550 */
14551 ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
14552 ipif->ipif_addr_ready = 1;
14553 err = ill_add_ires(ill);
14554 /* allocation failure? */
14555 if (err != 0)
14556 return (err);
14557 }
14558
14559 err = (isv6 ? ipif_up_done_v6(ipif) : ipif_up_done(ipif));
14560 if (err == 0 && ill->ill_move_ipif != NULL) {
14561 ipif = ill->ill_move_ipif;
14562 ill->ill_move_ipif = NULL;
14563 return (ipif_up(ipif, q, mp));
14564 }
14565 return (err);
14566 }
14567
/*
 * Add any IREs tied to the ill. For now this is just an IRE_MULTICAST.
 * The identical set of IREs needs to be removed in ill_delete_ires().
 */
14572 int
14573 ill_add_ires(ill_t *ill)
14574 {
14575 ire_t *ire;
14576 in6_addr_t dummy6 = {(uint32_t)V6_MCAST, 0, 0, 1};
14577 in_addr_t dummy4 = htonl(INADDR_ALLHOSTS_GROUP);
14578
14579 if (ill->ill_ire_multicast != NULL)
14580 return (0);
14581
/*
 * Provide a dummy ire_addr for creating the ire.
 */
14585 if (ill->ill_isv6) {
14586 ire = ire_create_v6(&dummy6, 0, 0, IRE_MULTICAST, ill,
14587 ALL_ZONES, RTF_UP, NULL, ill->ill_ipst);
14588 } else {
14589 ire = ire_create((uchar_t *)&dummy4, 0, 0, IRE_MULTICAST, ill,
14590 ALL_ZONES, RTF_UP, NULL, ill->ill_ipst);
14591 }
14592 if (ire == NULL)
14593 return (ENOMEM);
14594
14595 ill->ill_ire_multicast = ire;
14596 return (0);
14597 }
14598
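/*
 * Remove the IREs added by ill_add_ires(); currently just the
 * IRE_MULTICAST in ill_ire_multicast.
 */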
14599 void
14600 ill_delete_ires(ill_t *ill)
14601 {
14602 if (ill->ill_ire_multicast != NULL) {
14603 /*
14604 * BIND/ATTACH completed; Release the ref for ill_ire_multicast
14605 * which was taken without any th_tracing enabled.
14606 * We also mark it as condemned (note that it was never added)
14607 * so that caching conn's can move off of it.
14608 */
14609 ire_make_condemned(ill->ill_ire_multicast);
14610 ire_refrele_notr(ill->ill_ire_multicast);
14611 ill->ill_ire_multicast = NULL;
14612 }
14613 }
14614
14615 /*
14616 * Perform a bind for the physical device.
 * When the routine returns EINPROGRESS, mp has been consumed and
 * the ioctl will be acked from ip_rput_dlpi.
 * Also allocate an unbind message and save it until ipif_down.
14620 */
14621 static int
14622 ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q)
14623 {
14624 mblk_t *bind_mp = NULL;
14625 mblk_t *unbind_mp = NULL;
14626 conn_t *connp;
14627 boolean_t success;
14628 int err;
14629
14630 DTRACE_PROBE2(ill__downup, char *, "ill_dl_up", ill_t *, ill);
14631
14632 ip1dbg(("ill_dl_up(%s)\n", ill->ill_name));
14633 ASSERT(IAM_WRITER_ILL(ill));
14634 ASSERT(mp != NULL);
14635
14636 /*
14637 * Make sure we have an IRE_MULTICAST in case we immediately
14638 * start receiving packets.
14639 */
14640 err = ill_add_ires(ill);
14641 if (err != 0)
14642 goto bad;
14643
14644 bind_mp = ip_dlpi_alloc(sizeof (dl_bind_req_t) + sizeof (long),
14645 DL_BIND_REQ);
14646 if (bind_mp == NULL)
14647 goto bad;
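	/*
	 * Bind to the ill's SAP (e.g., ETHERTYPE_IP for IP over Ethernet
	 * media) and request connectionless service (DL_CLDLS); the driver
	 * answers with a DL_BIND_ACK or DL_ERROR_ACK, handled in
	 * ip_rput_dlpi().
	 */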
14648 ((dl_bind_req_t *)bind_mp->b_rptr)->dl_sap = ill->ill_sap;
14649 ((dl_bind_req_t *)bind_mp->b_rptr)->dl_service_mode = DL_CLDLS;
14650
14651 /*
14652 * ill_unbind_mp would be non-null if the following sequence had
14653 * happened:
14654 * - send DL_BIND_REQ to driver, wait for response
14655 * - multiple ioctls that need to bring the ipif up are encountered,
14656 * but they cannot enter the ipsq due to the outstanding DL_BIND_REQ.
14657 * These ioctls will then be enqueued on the ipsq
14658 * - a DL_ERROR_ACK is returned for the DL_BIND_REQ
14659 * At this point, the pending ioctls in the ipsq will be drained, and
14660 * since ill->ill_dl_up was not set, ill_dl_up would be invoked with
14661 * a non-null ill->ill_unbind_mp
14662 */
14663 if (ill->ill_unbind_mp == NULL) {
14664 unbind_mp = ip_dlpi_alloc(sizeof (dl_unbind_req_t),
14665 DL_UNBIND_REQ);
14666 if (unbind_mp == NULL)
14667 goto bad;
14668 }
14669 /*
14670 * Record state needed to complete this operation when the
14671 * DL_BIND_ACK shows up. Also remember the pre-allocated mblks.
14672 */
14673 connp = CONN_Q(q) ? Q_TO_CONN(q) : NULL;
14674 ASSERT(connp != NULL || !CONN_Q(q));
14675 GRAB_CONN_LOCK(q);
14676 mutex_enter(&ipif->ipif_ill->ill_lock);
14677 success = ipsq_pending_mp_add(connp, ipif, q, mp, 0);
14678 mutex_exit(&ipif->ipif_ill->ill_lock);
14679 RELEASE_CONN_LOCK(q);
14680 if (!success)
14681 goto bad;
14682
14683 /*
14684 * Save the unbind message for ill_dl_down(); it will be consumed when
14685 * the interface goes down.
14686 */
14687 if (ill->ill_unbind_mp == NULL)
14688 ill->ill_unbind_mp = unbind_mp;
14689
14690 ill_dlpi_send(ill, bind_mp);
14691 /* Send down link-layer capabilities probe if not already done. */
14692 ill_capability_probe(ill);
14693
14694 /*
14695 * Sysid used to rely on the fact that netboots set domainname
14696 * and the like. Now that miniroot boots aren't strictly netboots
	 * and miniroot network configuration is driven from userland,
	 * these things still need to be set. This situation can be detected
14699 * by comparing the interface being configured here to the one
14700 * dhcifname was set to reference by the boot loader. Once sysid is
14701 * converted to use dhcp_ipc_getinfo() this call can go away.
14702 */
14703 if ((ipif->ipif_flags & IPIF_DHCPRUNNING) &&
14704 (strcmp(ill->ill_name, dhcifname) == 0) &&
14705 (strlen(srpc_domain) == 0)) {
14706 if (dhcpinit() != 0)
14707 cmn_err(CE_WARN, "no cached dhcp response");
14708 }
14709
14710 /*
14711 * This operation will complete in ip_rput_dlpi with either
14712 * a DL_BIND_ACK or DL_ERROR_ACK.
14713 */
14714 return (EINPROGRESS);
14715 bad:
14716 ip1dbg(("ill_dl_up(%s) FAILED\n", ill->ill_name));
14717
14718 freemsg(bind_mp);
14719 freemsg(unbind_mp);
14720 return (ENOMEM);
14721 }
14722
/* Add room for TCP + IP headers (IP_SIMPLE_HDR_LENGTH plus 20 for TCP) */
14724 uint_t ip_loopback_mtuplus = IP_LOOPBACK_MTU + IP_SIMPLE_HDR_LENGTH + 20;
14725
14726 /*
 * DLPI and ARP are up.
14728 * Create all the IREs associated with an interface. Bring up multicast.
14729 * Set the interface flag and finish other initialization
14730 * that potentially had to be deferred to after DL_BIND_ACK.
14731 */
14732 int
14733 ipif_up_done(ipif_t *ipif)
14734 {
14735 ill_t *ill = ipif->ipif_ill;
14736 int err = 0;
14737 boolean_t loopback = B_FALSE;
14738 boolean_t update_src_selection = B_TRUE;
14739 ipif_t *tmp_ipif;
14740
14741 ip1dbg(("ipif_up_done(%s:%u)\n",
14742 ipif->ipif_ill->ill_name, ipif->ipif_id));
14743 DTRACE_PROBE3(ipif__downup, char *, "ipif_up_done",
14744 ill_t *, ill, ipif_t *, ipif);
14745
14746 /* Check if this is a loopback interface */
14747 if (ipif->ipif_ill->ill_wq == NULL)
14748 loopback = B_TRUE;
14749
14750 ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock));
14751
14752 /*
14753 * If all other interfaces for this ill are down or DEPRECATED,
14754 * or otherwise unsuitable for source address selection,
14755 * reset the src generation numbers to make sure source
14756 * address selection gets to take this new ipif into account.
14757 * No need to hold ill_lock while traversing the ipif list since
	 * we are the writer.
14759 */
14760 for (tmp_ipif = ill->ill_ipif; tmp_ipif;
14761 tmp_ipif = tmp_ipif->ipif_next) {
14762 if (((tmp_ipif->ipif_flags &
14763 (IPIF_NOXMIT|IPIF_ANYCAST|IPIF_NOLOCAL|IPIF_DEPRECATED)) ||
14764 !(tmp_ipif->ipif_flags & IPIF_UP)) ||
14765 (tmp_ipif == ipif))
14766 continue;
		/* first usable pre-existing interface */
14768 update_src_selection = B_FALSE;
14769 break;
14770 }
14771 if (update_src_selection)
14772 ip_update_source_selection(ill->ill_ipst);
14773
14774 if (IS_LOOPBACK(ill) || ill->ill_net_type == IRE_IF_NORESOLVER) {
14775 nce_t *loop_nce = NULL;
14776 uint16_t flags = (NCE_F_MYADDR | NCE_F_AUTHORITY | NCE_F_NONUD);
14777
14778 /*
14779 * lo0:1 and subsequent ipifs were marked IRE_LOCAL in
14780 * ipif_lookup_on_name(), but in the case of zones we can have
14781 * several loopback addresses on lo0. So all the interfaces with
14782 * loopback addresses need to be marked IRE_LOOPBACK.
14783 */
14784 if (V4_PART_OF_V6(ipif->ipif_v6lcl_addr) ==
14785 htonl(INADDR_LOOPBACK))
14786 ipif->ipif_ire_type = IRE_LOOPBACK;
14787 else
14788 ipif->ipif_ire_type = IRE_LOCAL;
14789 if (ill->ill_net_type != IRE_LOOPBACK)
14790 flags |= NCE_F_PUBLISH;
14791
14792 /* add unicast nce for the local addr */
14793 err = nce_lookup_then_add_v4(ill, NULL,
14794 ill->ill_phys_addr_length, &ipif->ipif_lcl_addr, flags,
14795 ND_REACHABLE, &loop_nce);
14796 /* A shared-IP zone sees EEXIST for lo0:N */
14797 if (err == 0 || err == EEXIST) {
14798 ipif->ipif_added_nce = 1;
14799 loop_nce->nce_ipif_cnt++;
14800 nce_refrele(loop_nce);
14801 err = 0;
14802 } else {
14803 ASSERT(loop_nce == NULL);
14804 return (err);
14805 }
14806 }
14807
14808 /* Create all the IREs associated with this interface */
14809 err = ipif_add_ires_v4(ipif, loopback);
14810 if (err != 0) {
14811 /*
14812 * see comments about return value from
14813 * ip_addr_availability_check() in ipif_add_ires_v4().
14814 */
14815 if (err != EADDRINUSE) {
14816 (void) ipif_arp_down(ipif);
14817 } else {
14818 /*
14819 * Make IPMP aware of the deleted ipif so that
14820 * the needed ipmp cleanup (e.g., of ipif_bound_ill)
14821 * can be completed. Note that we do not want to
14822 * destroy the nce that was created on the ipmp_ill
14823 * for the active copy of the duplicate address in
14824 * use.
14825 */
14826 if (IS_IPMP(ill))
14827 ipmp_illgrp_del_ipif(ill->ill_grp, ipif);
14828 err = EADDRNOTAVAIL;
14829 }
14830 return (err);
14831 }
14832
14833 if (ill->ill_ipif_up_count == 1 && !loopback) {
14834 /* Recover any additional IREs entries for this ill */
14835 (void) ill_recover_saved_ire(ill);
14836 }
14837
14838 if (ill->ill_need_recover_multicast) {
14839 /*
14840 * Need to recover all multicast memberships in the driver.
14841 * This had to be deferred until we had attached. The same
14842 * code exists in ipif_up_done_v6() to recover IPv6
14843 * memberships.
14844 *
14845 * Note that it would be preferable to unconditionally do the
14846 * ill_recover_multicast() in ill_dl_up(), but we cannot do
14847 * that since ill_join_allmulti() depends on ill_dl_up being
14848 * set, and it is not set until we receive a DL_BIND_ACK after
14849 * having called ill_dl_up().
14850 */
14851 ill_recover_multicast(ill);
14852 }
14853
14854 if (ill->ill_ipif_up_count == 1) {
14855 /*
14856 * Since the interface is now up, it may now be active.
14857 */
14858 if (IS_UNDER_IPMP(ill))
14859 ipmp_ill_refresh_active(ill);
14860
14861 /*
14862 * If this is an IPMP interface, we may now be able to
14863 * establish ARP entries.
14864 */
14865 if (IS_IPMP(ill))
14866 ipmp_illgrp_refresh_arpent(ill->ill_grp);
14867 }
14868
14869 /* Join the allhosts multicast address */
14870 ipif_multicast_up(ipif);
14871
14872 if (!loopback && !update_src_selection &&
14873 !(ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED)))
14874 ip_update_source_selection(ill->ill_ipst);
14875
14876 if (!loopback && ipif->ipif_addr_ready) {
14877 /* Broadcast an address mask reply. */
14878 ipif_mask_reply(ipif);
14879 }
14880 /* Perhaps ilgs should use this ill */
14881 update_conn_ill(NULL, ill->ill_ipst);
14882
14883 /*
14884 * This had to be deferred until we had bound. Tell routing sockets and
14885 * others that this interface is up if it looks like the address has
14886 * been validated. Otherwise, if it isn't ready yet, wait for
14887 * duplicate address detection to do its thing.
14888 */
14889 if (ipif->ipif_addr_ready)
14890 ipif_up_notify(ipif);
14891 return (0);
14892 }
14893
14894 /*
14895 * Add the IREs associated with the ipif.
14896 * Those MUST be explicitly removed in ipif_delete_ires_v4.
14897 */
14898 static int
14899 ipif_add_ires_v4(ipif_t *ipif, boolean_t loopback)
14900 {
14901 ill_t *ill = ipif->ipif_ill;
14902 ip_stack_t *ipst = ill->ill_ipst;
14903 ire_t *ire_array[20];
14904 ire_t **irep = ire_array;
14905 ire_t **irep1;
14906 ipaddr_t net_mask = 0;
14907 ipaddr_t subnet_mask, route_mask;
14908 int err;
14909 ire_t *ire_local = NULL; /* LOCAL or LOOPBACK */
14910 ire_t *ire_if = NULL;
14911 uchar_t *gw;
14912
14913 if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
14914 !(ipif->ipif_flags & IPIF_NOLOCAL)) {
14915 /*
14916 * If we're on a labeled system then make sure that zone-
14917 * private addresses have proper remote host database entries.
14918 */
14919 if (is_system_labeled() &&
14920 ipif->ipif_ire_type != IRE_LOOPBACK &&
14921 !tsol_check_interface_address(ipif))
14922 return (EINVAL);
14923
14924 /* Register the source address for __sin6_src_id */
14925 err = ip_srcid_insert(&ipif->ipif_v6lcl_addr,
14926 ipif->ipif_zoneid, ipst);
14927 if (err != 0) {
14928 ip0dbg(("ipif_add_ires: srcid_insert %d\n", err));
14929 return (err);
14930 }
14931
14932 if (loopback)
14933 gw = (uchar_t *)&ipif->ipif_lcl_addr;
14934 else
14935 gw = NULL;
14936
14937 /* If the interface address is set, create the local IRE. */
14938 ire_local = ire_create(
14939 (uchar_t *)&ipif->ipif_lcl_addr, /* dest address */
14940 (uchar_t *)&ip_g_all_ones, /* mask */
14941 gw, /* gateway */
14942 ipif->ipif_ire_type, /* LOCAL or LOOPBACK */
14943 ipif->ipif_ill,
14944 ipif->ipif_zoneid,
14945 ((ipif->ipif_flags & IPIF_PRIVATE) ?
14946 RTF_PRIVATE : 0) | RTF_KERNEL,
14947 NULL,
14948 ipst);
14949 ip1dbg(("ipif_add_ires: 0x%p creating IRE %p type 0x%x"
14950 " for 0x%x\n", (void *)ipif, (void *)ire_local,
14951 ipif->ipif_ire_type,
14952 ntohl(ipif->ipif_lcl_addr)));
14953 if (ire_local == NULL) {
14954 ip1dbg(("ipif_up_done: NULL ire_local\n"));
14955 err = ENOMEM;
14956 goto bad;
14957 }
14958 } else {
14959 ip1dbg((
14960 "ipif_add_ires: not creating IRE %d for 0x%x: flags 0x%x\n",
14961 ipif->ipif_ire_type,
14962 ntohl(ipif->ipif_lcl_addr),
14963 (uint_t)ipif->ipif_flags));
14964 }
14965 if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
14966 !(ipif->ipif_flags & IPIF_NOLOCAL)) {
14967 net_mask = ip_net_mask(ipif->ipif_lcl_addr);
14968 } else {
14969 net_mask = htonl(IN_CLASSA_NET); /* fallback */
14970 }
14971
14972 subnet_mask = ipif->ipif_net_mask;
14973
14974 /*
14975 * If mask was not specified, use natural netmask of
14976 * interface address. Also, store this mask back into the
14977 * ipif struct.
14978 */
14979 if (subnet_mask == 0) {
14980 subnet_mask = net_mask;
14981 V4MASK_TO_V6(subnet_mask, ipif->ipif_v6net_mask);
14982 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
14983 ipif->ipif_v6subnet);
14984 }
14985
14986 /* Set up the IRE_IF_RESOLVER or IRE_IF_NORESOLVER, as appropriate. */
14987 if (!loopback && !(ipif->ipif_flags & IPIF_NOXMIT) &&
14988 ipif->ipif_subnet != INADDR_ANY) {
14989 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */
14990
14991 if (ipif->ipif_flags & IPIF_POINTOPOINT) {
14992 route_mask = IP_HOST_MASK;
14993 } else {
14994 route_mask = subnet_mask;
14995 }
14996
14997 ip1dbg(("ipif_add_ires: ipif 0x%p ill 0x%p "
14998 "creating if IRE ill_net_type 0x%x for 0x%x\n",
14999 (void *)ipif, (void *)ill, ill->ill_net_type,
15000 ntohl(ipif->ipif_subnet)));
15001 ire_if = ire_create(
15002 (uchar_t *)&ipif->ipif_subnet,
15003 (uchar_t *)&route_mask,
15004 (uchar_t *)&ipif->ipif_lcl_addr,
15005 ill->ill_net_type,
15006 ill,
15007 ipif->ipif_zoneid,
15008 ((ipif->ipif_flags & IPIF_PRIVATE) ?
15009 RTF_PRIVATE: 0) | RTF_KERNEL,
15010 NULL,
15011 ipst);
15012 if (ire_if == NULL) {
15013 ip1dbg(("ipif_up_done: NULL ire_if\n"));
15014 err = ENOMEM;
15015 goto bad;
15016 }
15017 }
15018
15019 /*
15020 * Create any necessary broadcast IREs.
15021 */
15022 if ((ipif->ipif_flags & IPIF_BROADCAST) &&
15023 !(ipif->ipif_flags & IPIF_NOXMIT))
15024 irep = ipif_create_bcast_ires(ipif, irep);
15025
15026 /* If an earlier ire_create failed, get out now */
15027 for (irep1 = irep; irep1 > ire_array; ) {
15028 irep1--;
15029 if (*irep1 == NULL) {
15030 ip1dbg(("ipif_up_done: NULL ire found in ire_array\n"));
15031 err = ENOMEM;
15032 goto bad;
15033 }
15034 }
15035
15036 /*
15037 * Need to atomically check for IP address availability under
15038 * ip_addr_avail_lock. ill_g_lock is held as reader to ensure no new
15039 * ills or new ipifs can be added while we are checking availability.
15040 */
15041 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
15042 mutex_enter(&ipst->ips_ip_addr_avail_lock);
15043 /* Mark it up, and increment counters. */
15044 ipif->ipif_flags |= IPIF_UP;
15045 ill->ill_ipif_up_count++;
15046 err = ip_addr_availability_check(ipif);
15047 mutex_exit(&ipst->ips_ip_addr_avail_lock);
15048 rw_exit(&ipst->ips_ill_g_lock);
15049
15050 if (err != 0) {
15051 /*
15052 * Our address may already be up on the same ill. In this case,
15053 * the ARP entry for our ipif replaced the one for the other
15054 * ipif. So we don't want to delete it (otherwise the other ipif
15055 * would be unable to send packets).
15056 * ip_addr_availability_check() identifies this case for us and
		 * returns EADDRINUSE; the caller should turn it into
		 * EADDRNOTAVAIL, which is the expected error code.
15059 */
15060 ill->ill_ipif_up_count--;
15061 ipif->ipif_flags &= ~IPIF_UP;
15062 goto bad;
15063 }
15064
15065 /*
15066 * Add in all newly created IREs. ire_create_bcast() has
15067 * already checked for duplicates of the IRE_BROADCAST type.
15068 * We add the IRE_INTERFACE before the IRE_LOCAL to ensure
15069 * that lookups find the IRE_LOCAL even if the IRE_INTERFACE is
15070 * a /32 route.
15071 */
15072 if (ire_if != NULL) {
15073 ire_if = ire_add(ire_if);
15074 if (ire_if == NULL) {
15075 err = ENOMEM;
15076 goto bad2;
15077 }
15078 #ifdef DEBUG
15079 ire_refhold_notr(ire_if);
15080 ire_refrele(ire_if);
15081 #endif
15082 }
15083 if (ire_local != NULL) {
15084 ire_local = ire_add(ire_local);
15085 if (ire_local == NULL) {
15086 err = ENOMEM;
15087 goto bad2;
15088 }
15089 #ifdef DEBUG
15090 ire_refhold_notr(ire_local);
15091 ire_refrele(ire_local);
15092 #endif
15093 }
15094 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
15095 if (ire_local != NULL)
15096 ipif->ipif_ire_local = ire_local;
15097 if (ire_if != NULL)
15098 ipif->ipif_ire_if = ire_if;
15099 rw_exit(&ipst->ips_ill_g_lock);
15100 ire_local = NULL;
15101 ire_if = NULL;
15102
15103 /*
15104 * We first add all of them, and if that succeeds we refrele the
15105 * bunch. That enables us to delete all of them should any of the
15106 * ire_adds fail.
15107 */
15108 for (irep1 = irep; irep1 > ire_array; ) {
15109 irep1--;
15110 ASSERT(!MUTEX_HELD(&((*irep1)->ire_ill->ill_lock)));
15111 *irep1 = ire_add(*irep1);
15112 if (*irep1 == NULL) {
15113 err = ENOMEM;
15114 goto bad2;
15115 }
15116 }
15117
15118 for (irep1 = irep; irep1 > ire_array; ) {
15119 irep1--;
15120 /* refheld by ire_add. */
15121 if (*irep1 != NULL) {
15122 ire_refrele(*irep1);
15123 *irep1 = NULL;
15124 }
15125 }
15126
15127 if (!loopback) {
15128 /*
15129 * If the broadcast address has been set, make sure it makes
15130 * sense based on the interface address.
15131 * Only match on ill since we are sharing broadcast addresses.
15132 */
15133 if ((ipif->ipif_brd_addr != INADDR_ANY) &&
15134 (ipif->ipif_flags & IPIF_BROADCAST)) {
15135 ire_t *ire;
15136
15137 ire = ire_ftable_lookup_v4(ipif->ipif_brd_addr, 0, 0,
15138 IRE_BROADCAST, ipif->ipif_ill, ALL_ZONES, NULL,
15139 (MATCH_IRE_TYPE | MATCH_IRE_ILL), 0, ipst, NULL);
15140
15141 if (ire == NULL) {
15142 /*
15143 * If there isn't a matching broadcast IRE,
15144 * revert to the default for this netmask.
15145 */
15146 ipif->ipif_v6brd_addr = ipv6_all_zeros;
15147 mutex_enter(&ipif->ipif_ill->ill_lock);
15148 ipif_set_default(ipif);
15149 mutex_exit(&ipif->ipif_ill->ill_lock);
15150 } else {
15151 ire_refrele(ire);
15152 }
15153 }
15154
15155 }
15156 return (0);
15157
15158 bad2:
15159 ill->ill_ipif_up_count--;
15160 ipif->ipif_flags &= ~IPIF_UP;
15161
15162 bad:
15163 ip1dbg(("ipif_add_ires: FAILED \n"));
15164 if (ire_local != NULL)
15165 ire_delete(ire_local);
15166 if (ire_if != NULL)
15167 ire_delete(ire_if);
15168
15169 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
15170 ire_local = ipif->ipif_ire_local;
15171 ipif->ipif_ire_local = NULL;
15172 ire_if = ipif->ipif_ire_if;
15173 ipif->ipif_ire_if = NULL;
15174 rw_exit(&ipst->ips_ill_g_lock);
15175 if (ire_local != NULL) {
15176 ire_delete(ire_local);
15177 ire_refrele_notr(ire_local);
15178 }
15179 if (ire_if != NULL) {
15180 ire_delete(ire_if);
15181 ire_refrele_notr(ire_if);
15182 }
15183
15184 while (irep > ire_array) {
15185 irep--;
15186 if (*irep != NULL) {
15187 ire_delete(*irep);
15188 }
15189 }
15190 (void) ip_srcid_remove(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid, ipst);
15191
15192 return (err);
15193 }
15194
15195 /* Remove all the IREs created by ipif_add_ires_v4 */
15196 void
15197 ipif_delete_ires_v4(ipif_t *ipif)
15198 {
15199 ill_t *ill = ipif->ipif_ill;
15200 ip_stack_t *ipst = ill->ill_ipst;
15201 ire_t *ire;
15202
15203 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
15204 ire = ipif->ipif_ire_local;
15205 ipif->ipif_ire_local = NULL;
15206 rw_exit(&ipst->ips_ill_g_lock);
15207 if (ire != NULL) {
15208 /*
		 * Move the count to the ipif so we don't lose it due to
15210 * a down/up dance.
15211 */
15212 atomic_add_32(&ipif->ipif_ib_pkt_count, ire->ire_ib_pkt_count);
15213
15214 ire_delete(ire);
15215 ire_refrele_notr(ire);
15216 }
15217 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
15218 ire = ipif->ipif_ire_if;
15219 ipif->ipif_ire_if = NULL;
15220 rw_exit(&ipst->ips_ill_g_lock);
15221 if (ire != NULL) {
15222 ire_delete(ire);
15223 ire_refrele_notr(ire);
15224 }
15225
15226 /*
15227 * Delete the broadcast IREs.
15228 */
15229 if ((ipif->ipif_flags & IPIF_BROADCAST) &&
15230 !(ipif->ipif_flags & IPIF_NOXMIT))
15231 ipif_delete_bcast_ires(ipif);
15232 }
15233
15234 /*
 * Checks for availability of a usable source address (if there is one) when
15236 * destination ILL has the ill_usesrc_ifindex pointing to another ILL. Note
15237 * this selection is done regardless of the destination.
15238 */
15239 boolean_t
15240 ipif_zone_avail(uint_t ifindex, boolean_t isv6, zoneid_t zoneid,
15241 ip_stack_t *ipst)
15242 {
15243 ipif_t *ipif = NULL;
15244 ill_t *uill;
15245
15246 ASSERT(ifindex != 0);
15247
15248 uill = ill_lookup_on_ifindex(ifindex, isv6, ipst);
15249 if (uill == NULL)
15250 return (B_FALSE);
15251
15252 mutex_enter(&uill->ill_lock);
15253 for (ipif = uill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
15254 if (IPIF_IS_CONDEMNED(ipif))
15255 continue;
15256 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST))
15257 continue;
15258 if (!(ipif->ipif_flags & IPIF_UP))
15259 continue;
15260 if (ipif->ipif_zoneid != zoneid)
15261 continue;
15262 if (isv6 ? IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) :
15263 ipif->ipif_lcl_addr == INADDR_ANY)
15264 continue;
15265 mutex_exit(&uill->ill_lock);
15266 ill_refrele(uill);
15267 return (B_TRUE);
15268 }
15269 mutex_exit(&uill->ill_lock);
15270 ill_refrele(uill);
15271 return (B_FALSE);
15272 }
15273
15274 /*
15275 * Find an ipif with a good local address on the ill+zoneid.
15276 */
15277 ipif_t *
15278 ipif_good_addr(ill_t *ill, zoneid_t zoneid)
15279 {
15280 ipif_t *ipif;
15281
15282 mutex_enter(&ill->ill_lock);
15283 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
15284 if (IPIF_IS_CONDEMNED(ipif))
15285 continue;
15286 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST))
15287 continue;
15288 if (!(ipif->ipif_flags & IPIF_UP))
15289 continue;
15290 if (ipif->ipif_zoneid != zoneid &&
15291 ipif->ipif_zoneid != ALL_ZONES && zoneid != ALL_ZONES)
15292 continue;
15293 if (ill->ill_isv6 ?
15294 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) :
15295 ipif->ipif_lcl_addr == INADDR_ANY)
15296 continue;
15297 ipif_refhold_locked(ipif);
15298 mutex_exit(&ill->ill_lock);
15299 return (ipif);
15300 }
15301 mutex_exit(&ill->ill_lock);
15302 return (NULL);
15303 }
15304
15305 /*
15306 * IP source address type, sorted from worst to best. For a given type,
15307 * always prefer IP addresses on the same subnet. All-zones addresses are
15308 * suboptimal because they pose problems with unlabeled destinations.
15309 */
15310 typedef enum {
15311 IPIF_NONE,
15312 IPIF_DIFFNET_DEPRECATED, /* deprecated and different subnet */
15313 IPIF_SAMENET_DEPRECATED, /* deprecated and same subnet */
15314 IPIF_DIFFNET_ALLZONES, /* allzones and different subnet */
15315 IPIF_SAMENET_ALLZONES, /* allzones and same subnet */
15316 IPIF_DIFFNET, /* normal and different subnet */
15317 IPIF_SAMENET, /* normal and same subnet */
15318 IPIF_LOCALADDR /* local loopback */
15319 } ipif_type_t;
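
/*
 * For example, given two candidate addresses of type IPIF_SAMENET_ALLZONES
 * and IPIF_SAMENET, the IPIF_SAMENET one wins, since it ranks higher in the
 * enumeration above; IPIF_LOCALADDR (the destination is one of our own
 * addresses) always wins outright.
 */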
15320
15321 /*
15322 * Pick the optimal ipif on `ill' for sending to destination `dst' from zone
15323 * `zoneid'. We rate usable ipifs from low -> high as per the ipif_type_t
15324 * enumeration, and return the highest-rated ipif. If there's a tie, we pick
15325 * the first one, unless IPMP is used in which case we round-robin among them;
15326 * see below for more.
15327 *
 * Returns NULL only when there is no suitable (valid) source address
 * on the ill.
15330 */
15331 ipif_t *
15332 ipif_select_source_v4(ill_t *ill, ipaddr_t dst, zoneid_t zoneid,
15333 boolean_t allow_usesrc, boolean_t *notreadyp)
15334 {
15335 ill_t *usill = NULL;
15336 ill_t *ipmp_ill = NULL;
15337 ipif_t *start_ipif, *next_ipif, *ipif, *best_ipif;
15338 ipif_type_t type, best_type;
15339 tsol_tpc_t *src_rhtp, *dst_rhtp;
15340 ip_stack_t *ipst = ill->ill_ipst;
15341 boolean_t samenet;
15342
15343 if (ill->ill_usesrc_ifindex != 0 && allow_usesrc) {
15344 usill = ill_lookup_on_ifindex(ill->ill_usesrc_ifindex,
15345 B_FALSE, ipst);
15346 if (usill != NULL)
15347 ill = usill; /* Select source from usesrc ILL */
15348 else
15349 return (NULL);
15350 }
15351
15352 /*
15353 * Test addresses should never be used for source address selection,
15354 * so if we were passed one, switch to the IPMP meta-interface.
15355 */
15356 if (IS_UNDER_IPMP(ill)) {
15357 if ((ipmp_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL)
15358 ill = ipmp_ill; /* Select source from IPMP ill */
15359 else
15360 return (NULL);
15361 }
15362
15363 /*
15364 * If we're dealing with an unlabeled destination on a labeled system,
15365 * make sure that we ignore source addresses that are incompatible with
15366 * the destination's default label. That destination's default label
15367 * must dominate the minimum label on the source address.
15368 */
15369 dst_rhtp = NULL;
15370 if (is_system_labeled()) {
15371 dst_rhtp = find_tpc(&dst, IPV4_VERSION, B_FALSE);
15372 if (dst_rhtp == NULL)
15373 return (NULL);
15374 if (dst_rhtp->tpc_tp.host_type != UNLABELED) {
15375 TPC_RELE(dst_rhtp);
15376 dst_rhtp = NULL;
15377 }
15378 }
15379
15380 /*
15381 * Hold the ill_g_lock as reader. This makes sure that no ipif/ill
15382 * can be deleted. But an ipif/ill can get CONDEMNED any time.
15383 * After selecting the right ipif, under ill_lock make sure ipif is
15384 * not condemned, and increment refcnt. If ipif is CONDEMNED,
15385 * we retry. Inside the loop we still need to check for CONDEMNED,
15386 * but not under a lock.
15387 */
15388 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
15389 retry:
15390 /*
15391 * For source address selection, we treat the ipif list as circular
15392 * and continue until we get back to where we started. This allows
15393 * IPMP to vary source address selection (which improves inbound load
15394 * spreading) by caching its last ending point and starting from
15395 * there. NOTE: we don't have to worry about ill_src_ipif changing
15396 * ills since that can't happen on the IPMP ill.
15397 */
15398 start_ipif = ill->ill_ipif;
15399 if (IS_IPMP(ill) && ill->ill_src_ipif != NULL)
15400 start_ipif = ill->ill_src_ipif;
15401
15402 ipif = start_ipif;
15403 best_ipif = NULL;
15404 best_type = IPIF_NONE;
15405 do {
15406 if ((next_ipif = ipif->ipif_next) == NULL)
15407 next_ipif = ill->ill_ipif;
15408
15409 if (IPIF_IS_CONDEMNED(ipif))
15410 continue;
15411 /* Always skip NOLOCAL and ANYCAST interfaces */
15412 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST))
15413 continue;
15414 /* Always skip NOACCEPT interfaces */
15415 if (ipif->ipif_ill->ill_flags & ILLF_NOACCEPT)
15416 continue;
15417 if (!(ipif->ipif_flags & IPIF_UP))
15418 continue;
15419
15420 if (!ipif->ipif_addr_ready) {
15421 if (notreadyp != NULL)
15422 *notreadyp = B_TRUE;
15423 continue;
15424 }
15425
15426 if (zoneid != ALL_ZONES &&
15427 ipif->ipif_zoneid != zoneid &&
15428 ipif->ipif_zoneid != ALL_ZONES)
15429 continue;
15430
15431 /*
15432 * Interfaces with 0.0.0.0 address are allowed to be UP, but
15433 * are not valid as source addresses.
15434 */
15435 if (ipif->ipif_lcl_addr == INADDR_ANY)
15436 continue;
15437
15438 /*
15439 * Check compatibility of local address for destination's
15440 * default label if we're on a labeled system. Incompatible
15441 * addresses can't be used at all.
15442 */
15443 if (dst_rhtp != NULL) {
15444 boolean_t incompat;
15445
15446 src_rhtp = find_tpc(&ipif->ipif_lcl_addr,
15447 IPV4_VERSION, B_FALSE);
15448 if (src_rhtp == NULL)
15449 continue;
15450 incompat = src_rhtp->tpc_tp.host_type != SUN_CIPSO ||
15451 src_rhtp->tpc_tp.tp_doi !=
15452 dst_rhtp->tpc_tp.tp_doi ||
15453 (!_blinrange(&dst_rhtp->tpc_tp.tp_def_label,
15454 &src_rhtp->tpc_tp.tp_sl_range_cipso) &&
15455 !blinlset(&dst_rhtp->tpc_tp.tp_def_label,
15456 src_rhtp->tpc_tp.tp_sl_set_cipso));
15457 TPC_RELE(src_rhtp);
15458 if (incompat)
15459 continue;
15460 }
15461
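		/* Is `dst' on this ipif's subnet? */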
15462 samenet = ((ipif->ipif_net_mask & dst) == ipif->ipif_subnet);
15463
15464 if (ipif->ipif_lcl_addr == dst) {
15465 type = IPIF_LOCALADDR;
15466 } else if (ipif->ipif_flags & IPIF_DEPRECATED) {
15467 type = samenet ? IPIF_SAMENET_DEPRECATED :
15468 IPIF_DIFFNET_DEPRECATED;
15469 } else if (ipif->ipif_zoneid == ALL_ZONES) {
15470 type = samenet ? IPIF_SAMENET_ALLZONES :
15471 IPIF_DIFFNET_ALLZONES;
15472 } else {
15473 type = samenet ? IPIF_SAMENET : IPIF_DIFFNET;
15474 }
15475
15476 if (type > best_type) {
15477 best_type = type;
15478 best_ipif = ipif;
15479 if (best_type == IPIF_LOCALADDR)
15480 break; /* can't get better */
15481 }
15482 } while ((ipif = next_ipif) != start_ipif);
15483
15484 if ((ipif = best_ipif) != NULL) {
15485 mutex_enter(&ipif->ipif_ill->ill_lock);
15486 if (IPIF_IS_CONDEMNED(ipif)) {
15487 mutex_exit(&ipif->ipif_ill->ill_lock);
15488 goto retry;
15489 }
15490 ipif_refhold_locked(ipif);
15491
15492 /*
15493 * For IPMP, update the source ipif rotor to the next ipif,
15494 * provided we can look it up. (We must not use it if it's
15495 * IPIF_CONDEMNED since we may have grabbed ill_g_lock after
15496 * ipif_free() checked ill_src_ipif.)
15497 */
15498 if (IS_IPMP(ill) && ipif != NULL) {
15499 next_ipif = ipif->ipif_next;
15500 if (next_ipif != NULL && !IPIF_IS_CONDEMNED(next_ipif))
15501 ill->ill_src_ipif = next_ipif;
15502 else
15503 ill->ill_src_ipif = NULL;
15504 }
15505 mutex_exit(&ipif->ipif_ill->ill_lock);
15506 }
15507
15508 rw_exit(&ipst->ips_ill_g_lock);
15509 if (usill != NULL)
15510 ill_refrele(usill);
15511 if (ipmp_ill != NULL)
15512 ill_refrele(ipmp_ill);
15513 if (dst_rhtp != NULL)
15514 TPC_RELE(dst_rhtp);
15515
15516 #ifdef DEBUG
15517 if (ipif == NULL) {
15518 char buf1[INET6_ADDRSTRLEN];
15519
15520 ip1dbg(("ipif_select_source_v4(%s, %s) -> NULL\n",
15521 ill->ill_name,
15522 inet_ntop(AF_INET, &dst, buf1, sizeof (buf1))));
15523 } else {
15524 char buf1[INET6_ADDRSTRLEN];
15525 char buf2[INET6_ADDRSTRLEN];
15526
15527 ip1dbg(("ipif_select_source_v4(%s, %s) -> %s\n",
15528 ipif->ipif_ill->ill_name,
15529 inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)),
15530 inet_ntop(AF_INET, &ipif->ipif_lcl_addr,
15531 buf2, sizeof (buf2))));
15532 }
15533 #endif /* DEBUG */
15534 return (ipif);
15535 }
15536
15537 /*
15538 * Pick a source address based on the destination ill and an optional setsrc
15539 * address.
 * The result is stored in srcp. If generation is set, then put the source
 * generation number there before we look for the source address (to avoid
 * missing changes in the set of source addresses).
 * If flagsp is set, then use it to pass back ipif_flags.
15544 *
15545 * If the caller wants to cache the returned source address and detect when
15546 * that might be stale, the caller should pass in a generation argument,
15547 * which the caller can later compare against ips_src_generation
15548 *
15549 * The precedence order for selecting an IPv4 source address is:
15550 * - RTF_SETSRC on the offlink ire always wins.
 * - If usesrc is set, swap the ill to be the usesrc one.
15552 * - If IPMP is used on the ill, select a random address from the most
15553 * preferred ones below:
15554 * 1. If onlink destination, same subnet and not deprecated, not ALL_ZONES
15555 * 2. Not deprecated, not ALL_ZONES
15556 * 3. If onlink destination, same subnet and not deprecated, ALL_ZONES
15557 * 4. Not deprecated, ALL_ZONES
15558 * 5. If onlink destination, same subnet and deprecated
15559 * 6. Deprecated.
15560 *
15561 * We have lower preference for ALL_ZONES IP addresses,
15562 * as they pose problems with unlabeled destinations.
15563 *
15564 * Note that when multiple IP addresses match e.g., #1 we pick
15565 * the first one if IPMP is not in use. With IPMP we randomize.
15566 */
15567 int
15568 ip_select_source_v4(ill_t *ill, ipaddr_t setsrc, ipaddr_t dst,
15569 ipaddr_t multicast_ifaddr,
15570 zoneid_t zoneid, ip_stack_t *ipst, ipaddr_t *srcp,
15571 uint32_t *generation, uint64_t *flagsp)
15572 {
15573 ipif_t *ipif;
15574 boolean_t notready = B_FALSE; /* Set if !ipif_addr_ready found */
15575
15576 if (flagsp != NULL)
15577 *flagsp = 0;
15578
15579 /*
15580 * Need to grab the generation number before we check to
15581 * avoid a race with a change to the set of local addresses.
15582 * No lock needed since the thread which updates the set of local
15583 * addresses use ipif/ill locks and exit those (hence a store memory
15584 * barrier) before doing the atomic increase of ips_src_generation.
15585 */
15586 if (generation != NULL) {
15587 *generation = ipst->ips_src_generation;
15588 }
15589
15590 if (CLASSD(dst) && multicast_ifaddr != INADDR_ANY) {
15591 *srcp = multicast_ifaddr;
15592 return (0);
15593 }
15594
15595 /* Was RTF_SETSRC set on the first IRE in the recursive lookup? */
15596 if (setsrc != INADDR_ANY) {
15597 *srcp = setsrc;
15598 return (0);
15599 }
	ipif = ipif_select_source_v4(ill, dst, zoneid, B_TRUE, &notready);
15601 if (ipif == NULL) {
15602 if (notready)
15603 return (ENETDOWN);
15604 else
15605 return (EADDRNOTAVAIL);
15606 }
15607 *srcp = ipif->ipif_lcl_addr;
15608 if (flagsp != NULL)
15609 *flagsp = ipif->ipif_flags;
15610 ipif_refrele(ipif);
15611 return (0);
15612 }
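
/*
 * A minimal usage sketch for ip_select_source_v4() (assuming the caller
 * holds a reference on `ill'), with no RTF_SETSRC or multicast override:
 *
 *	ipaddr_t src;
 *	int error = ip_select_source_v4(ill, INADDR_ANY, dst, INADDR_ANY,
 *	    zoneid, ipst, &src, NULL, NULL);
 *
 * ENETDOWN means a candidate address exists but has not yet passed
 * duplicate address detection; EADDRNOTAVAIL means no usable address
 * was found.
 */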
15613
15614 /* ARGSUSED */
15615 int
15616 if_unitsel_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
15617 ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
15618 {
15619 /*
15620 * ill_phyint_reinit merged the v4 and v6 into a single
15621 * ipsq. We might not have been able to complete the
15622 * operation in ipif_set_values, if we could not become
15623 * exclusive. If so restart it here.
15624 */
15625 return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q));
15626 }
15627
15628 /*
15629 * Can operate on either a module or a driver queue.
15630 * Returns an error if not a module queue.
15631 */
15632 /* ARGSUSED */
15633 int
15634 if_unitsel(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
15635 ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
15636 {
15637 queue_t *q1 = q;
15638 char *cp;
15639 char interf_name[LIFNAMSIZ];
15640 uint_t ppa = *(uint_t *)mp->b_cont->b_cont->b_rptr;
15641
15642 if (q->q_next == NULL) {
15643 ip1dbg((
15644 "if_unitsel: IF_UNITSEL: no q_next\n"));
15645 return (EINVAL);
15646 }
15647
15648 if (((ill_t *)(q->q_ptr))->ill_name[0] != '\0')
15649 return (EALREADY);
15650
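	/* Walk to the driver at the bottom of the stream to get its name. */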
15651 do {
15652 q1 = q1->q_next;
15653 } while (q1->q_next);
15654 cp = q1->q_qinfo->qi_minfo->mi_idname;
15655 (void) sprintf(interf_name, "%s%d", cp, ppa);
15656
15657 /*
15658 * Here we are not going to delay the ioack until after
15659 * ACKs from DL_ATTACH_REQ/DL_BIND_REQ. So no need to save the
15660 * original ioctl message before sending the requests.
15661 */
15662 return (ipif_set_values(q, mp, interf_name, &ppa));
15663 }
15664
15665 /* ARGSUSED */
15666 int
15667 ip_sioctl_sifname(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
15668 ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
15669 {
15670 return (ENXIO);
15671 }
15672
15673 /*
15674 * Create any IRE_BROADCAST entries for `ipif', and store those entries in
 * `irep'. Returns a pointer to the next free `irep' entry.
15676 * A mirror exists in ipif_delete_bcast_ires().
15677 *
15678 * The management of any "extra" or seemingly duplicate IRE_BROADCASTs is
15679 * done in ire_add.
15680 */
15681 static ire_t **
15682 ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep)
15683 {
15684 ipaddr_t addr;
15685 ipaddr_t netmask = ip_net_mask(ipif->ipif_lcl_addr);
15686 ipaddr_t subnetmask = ipif->ipif_net_mask;
15687 ill_t *ill = ipif->ipif_ill;
15688 zoneid_t zoneid = ipif->ipif_zoneid;
15689
15690 ip1dbg(("ipif_create_bcast_ires: creating broadcast IREs\n"));
15691
15692 ASSERT(ipif->ipif_flags & IPIF_BROADCAST);
15693 ASSERT(!(ipif->ipif_flags & IPIF_NOXMIT));
15694
15695 if (ipif->ipif_lcl_addr == INADDR_ANY ||
15696 (ipif->ipif_flags & IPIF_NOLOCAL))
15697 netmask = htonl(IN_CLASSA_NET); /* fallback */
15698
15699 irep = ire_create_bcast(ill, 0, zoneid, irep);
15700 irep = ire_create_bcast(ill, INADDR_BROADCAST, zoneid, irep);
15701
15702 /*
15703 * For backward compatibility, we create net broadcast IREs based on
15704 * the old "IP address class system", since some old machines only
	 * respond to these class-derived net broadcasts. However, we must not
15706 * create these net broadcast IREs if the subnetmask is shorter than
15707 * the IP address class based derived netmask. Otherwise, we may
15708 * create a net broadcast address which is the same as an IP address
15709 * on the subnet -- and then TCP will refuse to talk to that address.
15710 */
15711 if (netmask < subnetmask) {
15712 addr = netmask & ipif->ipif_subnet;
15713 irep = ire_create_bcast(ill, addr, zoneid, irep);
15714 irep = ire_create_bcast(ill, ~netmask | addr, zoneid, irep);
15715 }
15716
15717 /*
15718 * Don't create IRE_BROADCAST IREs for the interface if the subnetmask
15719 * is 0xFFFFFFFF, as an IRE_LOCAL for that interface is already
15720 * created. Creating these broadcast IREs will only create confusion
15721 * as `addr' will be the same as the IP address.
15722 */
15723 if (subnetmask != 0xFFFFFFFF) {
15724 addr = ipif->ipif_subnet;
15725 irep = ire_create_bcast(ill, addr, zoneid, irep);
15726 irep = ire_create_bcast(ill, ~subnetmask | addr, zoneid, irep);
15727 }
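
	/*
	 * For example, for 10.1.2.3/24 (a class A address subnetted to /24),
	 * the above creates broadcast IREs for 0.0.0.0, 255.255.255.255, the
	 * class-derived 10.0.0.0 and 10.255.255.255, and the subnet's
	 * 10.1.2.0 and 10.1.2.255.
	 */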
15728
15729 return (irep);
15730 }
15731
15732 /*
15733 * Mirror of ipif_create_bcast_ires()
15734 */
15735 static void
15736 ipif_delete_bcast_ires(ipif_t *ipif)
15737 {
15738 ipaddr_t addr;
15739 ipaddr_t netmask = ip_net_mask(ipif->ipif_lcl_addr);
15740 ipaddr_t subnetmask = ipif->ipif_net_mask;
15741 ill_t *ill = ipif->ipif_ill;
15742 zoneid_t zoneid = ipif->ipif_zoneid;
15743 ire_t *ire;
15744
15745 ASSERT(ipif->ipif_flags & IPIF_BROADCAST);
15746 ASSERT(!(ipif->ipif_flags & IPIF_NOXMIT));
15747
15748 if (ipif->ipif_lcl_addr == INADDR_ANY ||
15749 (ipif->ipif_flags & IPIF_NOLOCAL))
15750 netmask = htonl(IN_CLASSA_NET); /* fallback */
15751
15752 ire = ire_lookup_bcast(ill, 0, zoneid);
15753 ASSERT(ire != NULL);
15754 ire_delete(ire); ire_refrele(ire);
15755 ire = ire_lookup_bcast(ill, INADDR_BROADCAST, zoneid);
15756 ASSERT(ire != NULL);
15757 ire_delete(ire); ire_refrele(ire);
15758
15759 /*
15760 * For backward compatibility, we create net broadcast IREs based on
15761 * the old "IP address class system", since some old machines only
	 * respond to these class-derived net broadcasts. However, we must not
15763 * create these net broadcast IREs if the subnetmask is shorter than
15764 * the IP address class based derived netmask. Otherwise, we may
15765 * create a net broadcast address which is the same as an IP address
15766 * on the subnet -- and then TCP will refuse to talk to that address.
15767 */
15768 if (netmask < subnetmask) {
15769 addr = netmask & ipif->ipif_subnet;
15770 ire = ire_lookup_bcast(ill, addr, zoneid);
15771 ASSERT(ire != NULL);
15772 ire_delete(ire); ire_refrele(ire);
15773 ire = ire_lookup_bcast(ill, ~netmask | addr, zoneid);
15774 ASSERT(ire != NULL);
15775 ire_delete(ire); ire_refrele(ire);
15776 }
15777
15778 /*
15779 * Don't create IRE_BROADCAST IREs for the interface if the subnetmask
15780 * is 0xFFFFFFFF, as an IRE_LOCAL for that interface is already
15781 * created. Creating these broadcast IREs will only create confusion
15782 * as `addr' will be the same as the IP address.
15783 */
15784 if (subnetmask != 0xFFFFFFFF) {
15785 addr = ipif->ipif_subnet;
15786 ire = ire_lookup_bcast(ill, addr, zoneid);
15787 ASSERT(ire != NULL);
15788 ire_delete(ire); ire_refrele(ire);
15789 ire = ire_lookup_bcast(ill, ~subnetmask | addr, zoneid);
15790 ASSERT(ire != NULL);
15791 ire_delete(ire); ire_refrele(ire);
15792 }
15793 }
15794
15795 /*
 * Extract the flags (including IFF_CANTCHANGE flags such as IFF_IPV*)
 * from lifr_flags and the name from lifr_name.
15798 * Set IFF_IPV* and ill_isv6 prior to doing the lookup
15799 * since ipif_lookup_on_name uses the _isv6 flags when matching.
15800 * Returns EINPROGRESS when mp has been consumed by queueing it on
15801 * ipx_pending_mp and the ioctl will complete in ip_rput.
15802 *
15803 * Can operate on either a module or a driver queue.
15804 * Returns an error if not a module queue.
15805 */
15806 /* ARGSUSED */
15807 int
15808 ip_sioctl_slifname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
15809 ip_ioctl_cmd_t *ipip, void *if_req)
15810 {
15811 ill_t *ill = q->q_ptr;
15812 phyint_t *phyi;
15813 ip_stack_t *ipst;
15814 struct lifreq *lifr = if_req;
15815 uint64_t new_flags;
15816
15817 ASSERT(ipif != NULL);
15818 ip1dbg(("ip_sioctl_slifname %s\n", lifr->lifr_name));
15819
15820 if (q->q_next == NULL) {
15821 ip1dbg(("if_sioctl_slifname: SIOCSLIFNAME: no q_next\n"));
15822 return (EINVAL);
15823 }
15824
15825 /*
15826 * If we are not writer on 'q' then this interface exists already
15827 * and previous lookups (ip_extract_lifreq()) found this ipif --
15828 * so return EALREADY.
15829 */
15830 if (ill != ipif->ipif_ill)
15831 return (EALREADY);
15832
15833 if (ill->ill_name[0] != '\0')
15834 return (EALREADY);
15835
15836 /*
15837 * If there's another ill already with the requested name, ensure
15838 * that it's of the same type. Otherwise, ill_phyint_reinit() will
15839 * fuse together two unrelated ills, which will cause chaos.
15840 */
15841 ipst = ill->ill_ipst;
15842 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
15843 lifr->lifr_name, NULL);
15844 if (phyi != NULL) {
15845 ill_t *ill_mate = phyi->phyint_illv4;
15846
15847 if (ill_mate == NULL)
15848 ill_mate = phyi->phyint_illv6;
15849 ASSERT(ill_mate != NULL);
15850
15851 if (ill_mate->ill_media->ip_m_mac_type !=
15852 ill->ill_media->ip_m_mac_type) {
15853 ip1dbg(("if_sioctl_slifname: SIOCSLIFNAME: attempt to "
15854 "use the same ill name on differing media\n"));
15855 return (EINVAL);
15856 }
15857 }
15858
15859 /*
15860 * We start off as IFF_IPV4 in ipif_allocate and become
15861 * IFF_IPV4 or IFF_IPV6 here depending on lifr_flags value.
15862 * The only flags that we read from user space are IFF_IPV4,
15863 * IFF_IPV6, and IFF_BROADCAST.
15864 *
	 * This ill has not been inserted into the global list, so we are
	 * still single-threaded and don't need any lock.
	 *
	 * Sanity check the flags.
15869 */
15870
15871 if ((lifr->lifr_flags & IFF_BROADCAST) &&
15872 ((lifr->lifr_flags & IFF_IPV6) ||
15873 (!ill->ill_needs_attach && ill->ill_bcast_addr_length == 0))) {
15874 ip1dbg(("ip_sioctl_slifname: link not broadcast capable "
15875 "or IPv6 i.e., no broadcast \n"));
15876 return (EINVAL);
15877 }
15878
15879 new_flags =
15880 lifr->lifr_flags & (IFF_IPV6|IFF_IPV4|IFF_BROADCAST);
15881
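	/*
	 * new_flags xors to zero against (IFF_IPV6|IFF_IPV4) only when both
	 * address-family flags are set and IFF_BROADCAST is clear; the
	 * broadcast-plus-IPv6 combination was already rejected above.
	 */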
15882 if ((new_flags ^ (IFF_IPV6|IFF_IPV4)) == 0) {
15883 ip1dbg(("ip_sioctl_slifname: flags must be exactly one of "
15884 "IFF_IPV4 or IFF_IPV6\n"));
15885 return (EINVAL);
15886 }
15887
15888 /*
15889 * We always start off as IPv4, so only need to check for IPv6.
15890 */
15891 if ((new_flags & IFF_IPV6) != 0) {
15892 ill->ill_flags |= ILLF_IPV6;
15893 ill->ill_flags &= ~ILLF_IPV4;
15894
15895 if (lifr->lifr_flags & IFF_NOLINKLOCAL)
15896 ill->ill_flags |= ILLF_NOLINKLOCAL;
15897 }
15898
15899 if ((new_flags & IFF_BROADCAST) != 0)
15900 ipif->ipif_flags |= IPIF_BROADCAST;
15901 else
15902 ipif->ipif_flags &= ~IPIF_BROADCAST;
15903
15904 /* We started off as V4. */
15905 if (ill->ill_flags & ILLF_IPV6) {
15906 ill->ill_phyint->phyint_illv6 = ill;
15907 ill->ill_phyint->phyint_illv4 = NULL;
15908 }
15909
15910 return (ipif_set_values(q, mp, lifr->lifr_name, &lifr->lifr_ppa));
15911 }
15912
15913 /* ARGSUSED */
15914 int
15915 ip_sioctl_slifname_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
15916 ip_ioctl_cmd_t *ipip, void *if_req)
15917 {
15918 /*
15919 * ill_phyint_reinit merged the v4 and v6 into a single
15920 * ipsq. We might not have been able to complete the
15921 * slifname in ipif_set_values, if we could not become
15922 * exclusive. If so restart it here
15923 */
15924 return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q));
15925 }
15926
15927 /*
15928 * Return a pointer to the ipif which matches the index, IP version type and
15929 * zoneid.
15930 */
15931 ipif_t *
15932 ipif_lookup_on_ifindex(uint_t index, boolean_t isv6, zoneid_t zoneid,
15933 ip_stack_t *ipst)
15934 {
15935 ill_t *ill;
15936 ipif_t *ipif = NULL;
15937
15938 ill = ill_lookup_on_ifindex(index, isv6, ipst);
15939 if (ill != NULL) {
15940 mutex_enter(&ill->ill_lock);
15941 for (ipif = ill->ill_ipif; ipif != NULL;
15942 ipif = ipif->ipif_next) {
15943 if (!IPIF_IS_CONDEMNED(ipif) && (zoneid == ALL_ZONES ||
15944 zoneid == ipif->ipif_zoneid ||
15945 ipif->ipif_zoneid == ALL_ZONES)) {
15946 ipif_refhold_locked(ipif);
15947 break;
15948 }
15949 }
15950 mutex_exit(&ill->ill_lock);
15951 ill_refrele(ill);
15952 }
15953 return (ipif);
15954 }
15955
15956 /*
15957 * Change an existing physical interface's index. If the new index
15958 * is acceptable we update the index and the phyint_list_avl_by_index tree.
15959 * Finally, we update other systems which may have a dependence on the
15960 * index value.
15961 */
15962 /* ARGSUSED */
15963 int
15964 ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
15965 ip_ioctl_cmd_t *ipip, void *ifreq)
15966 {
15967 ill_t *ill;
15968 phyint_t *phyi;
15969 struct ifreq *ifr = (struct ifreq *)ifreq;
15970 struct lifreq *lifr = (struct lifreq *)ifreq;
15971 uint_t old_index, index;
15972 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
15973 avl_index_t where;
15974
15975 if (ipip->ipi_cmd_type == IF_CMD)
15976 index = ifr->ifr_index;
15977 else
15978 index = lifr->lifr_index;
15979
15980 /*
15981 * Only allow on physical interface. Also, index zero is illegal.
15982 */
15983 ill = ipif->ipif_ill;
15984 phyi = ill->ill_phyint;
15985 if (ipif->ipif_id != 0 || index == 0 || index > IF_INDEX_MAX) {
15986 return (EINVAL);
15987 }
15988
15989 /* If the index is not changing, no work to do */
15990 if (phyi->phyint_ifindex == index)
15991 return (0);
15992
15993 /*
15994 * Use phyint_exists() to determine if the new interface index
15995 * is already in use. If the index is unused then we need to
15996 * change the phyint's position in the phyint_list_avl_by_index
15997 * tree. If we do not do this, subsequent lookups (using the new
15998 * index value) will not find the phyint.
15999 */
16000 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
16001 if (phyint_exists(index, ipst)) {
16002 rw_exit(&ipst->ips_ill_g_lock);
16003 return (EEXIST);
16004 }
16005
16006 /*
	 * The new index is unused. Set it in the phyint. However, we must
	 * not forget to trigger the NE_IFINDEX_CHANGE event before the
	 * ifindex changes; the event must be bound to the old ifindex value.
16010 */
16011 ill_nic_event_dispatch(ill, 0, NE_IFINDEX_CHANGE,
16012 &index, sizeof (index));
16013
16014 old_index = phyi->phyint_ifindex;
16015 phyi->phyint_ifindex = index;
16016
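	/*
	 * Reposition the phyint in the by-index AVL tree: remove it, use
	 * avl_find() purely to compute the insertion point for the new
	 * index (the lookup itself is expected to fail), then reinsert.
	 */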
16017 avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, phyi);
16018 (void) avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
16019 &index, &where);
16020 avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
16021 phyi, where);
16022 rw_exit(&ipst->ips_ill_g_lock);
16023
16024 /* Update SCTP's ILL list */
16025 sctp_ill_reindex(ill, old_index);
16026
16027 /* Send the routing sockets message */
16028 ip_rts_ifmsg(ipif, RTSQ_DEFAULT);
16029 if (ILL_OTHER(ill))
16030 ip_rts_ifmsg(ILL_OTHER(ill)->ill_ipif, RTSQ_DEFAULT);
16031
16032 /* Perhaps ilgs should use this ill */
16033 update_conn_ill(NULL, ill->ill_ipst);
16034 return (0);
16035 }
16036
16037 /* ARGSUSED */
16038 int
16039 ip_sioctl_get_lifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
16040 ip_ioctl_cmd_t *ipip, void *ifreq)
16041 {
16042 struct ifreq *ifr = (struct ifreq *)ifreq;
16043 struct lifreq *lifr = (struct lifreq *)ifreq;
16044
16045 ip1dbg(("ip_sioctl_get_lifindex(%s:%u %p)\n",
16046 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
16047 /* Get the interface index */
16048 if (ipip->ipi_cmd_type == IF_CMD) {
16049 ifr->ifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex;
16050 } else {
16051 lifr->lifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex;
16052 }
16053 return (0);
16054 }
16055
16056 /* ARGSUSED */
16057 int
16058 ip_sioctl_get_lifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
16059 ip_ioctl_cmd_t *ipip, void *ifreq)
16060 {
16061 struct lifreq *lifr = (struct lifreq *)ifreq;
16062
16063 ip1dbg(("ip_sioctl_get_lifzone(%s:%u %p)\n",
16064 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
16065 /* Get the interface zone */
16066 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
16067 lifr->lifr_zoneid = ipif->ipif_zoneid;
16068 return (0);
16069 }
16070
16071 /*
16072 * Set the zoneid of an interface.
16073 */
16074 /* ARGSUSED */
16075 int
16076 ip_sioctl_slifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
16077 ip_ioctl_cmd_t *ipip, void *ifreq)
16078 {
16079 struct lifreq *lifr = (struct lifreq *)ifreq;
16080 int err = 0;
16081 boolean_t need_up = B_FALSE;
16082 zone_t *zptr;
16083 zone_status_t status;
16084 zoneid_t zoneid;
16085
16086 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
16087 if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES) {
16088 if (!is_system_labeled())
16089 return (ENOTSUP);
16090 zoneid = GLOBAL_ZONEID;
16091 }
16092
16093 /* cannot assign instance zero to a non-global zone */
16094 if (ipif->ipif_id == 0 && zoneid != GLOBAL_ZONEID)
16095 return (ENOTSUP);
16096
16097 /*
16098 * Cannot assign to a zone that doesn't exist or is shutting down. In
16099 * the event of a race with the zone shutdown processing, since IP
16100 * serializes this ioctl and SIOCGLIFCONF/SIOCLIFREMOVEIF, we know the
16101 * interface will be cleaned up even if the zone is shut down
16102 * immediately after the status check. If the interface can't be brought
16103 * down right away, and the zone is shut down before the restart
16104 * function is called, we resolve the possible races by rechecking the
16105 * zone status in the restart function.
16106 */
16107 if ((zptr = zone_find_by_id(zoneid)) == NULL)
16108 return (EINVAL);
16109 status = zone_status_get(zptr);
16110 zone_rele(zptr);
16111
16112 if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING)
16113 return (EINVAL);
16114
16115 if (ipif->ipif_flags & IPIF_UP) {
16116 /*
16117 * If the interface is already marked up,
16118 * we call ipif_down which will take care
16119 * of ditching any IREs that have been set
16120 * up based on the old interface address.
16121 */
16122 err = ipif_logical_down(ipif, q, mp);
16123 if (err == EINPROGRESS)
16124 return (err);
16125 (void) ipif_down_tail(ipif);
16126 need_up = B_TRUE;
16127 }
16128
16129 err = ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp, need_up);
16130 return (err);
16131 }
16132
16133 static int
16134 ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid,
16135 queue_t *q, mblk_t *mp, boolean_t need_up)
16136 {
16137 int err = 0;
16138 ip_stack_t *ipst;
16139
16140 ip1dbg(("ip_sioctl_zoneid_tail(%s:%u %p)\n",
16141 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
16142
16143 if (CONN_Q(q))
16144 ipst = CONNQ_TO_IPST(q);
16145 else
16146 ipst = ILLQ_TO_IPST(q);
16147
16148 /*
16149 * For exclusive stacks we don't allow a different zoneid than
16150 * global.
16151 */
16152 if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID &&
16153 zoneid != GLOBAL_ZONEID)
16154 return (EINVAL);
16155
16156 /* Set the new zone id. */
16157 ipif->ipif_zoneid = zoneid;
16158
16159 /* Update sctp list */
16160 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE);
16161
16162 /* The default multicast interface might have changed */
16163 ire_increment_multicast_generation(ipst, ipif->ipif_ill->ill_isv6);
16164
16165 if (need_up) {
16166 /*
16167 * Now bring the interface back up. If this
16168 * is the only IPIF for the ILL, ipif_up
16169 * will have to re-bind to the device, so
16170 * we may get back EINPROGRESS, in which
16171 * case, this IOCTL will get completed in
16172 * ip_rput_dlpi when we see the DL_BIND_ACK.
16173 */
16174 err = ipif_up(ipif, q, mp);
16175 }
16176 return (err);
16177 }
16178
16179 /* ARGSUSED */
16180 int
16181 ip_sioctl_slifzone_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
16182 ip_ioctl_cmd_t *ipip, void *if_req)
16183 {
16184 struct lifreq *lifr = (struct lifreq *)if_req;
16185 zoneid_t zoneid;
16186 zone_t *zptr;
16187 zone_status_t status;
16188
16189 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
16190 if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES)
16191 zoneid = GLOBAL_ZONEID;
16192
16193 ip1dbg(("ip_sioctl_slifzone_restart(%s:%u %p)\n",
16194 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
16195
16196 /*
16197 * We recheck the zone status to resolve the following race condition:
16198 * 1) process sends SIOCSLIFZONE to put hme0:1 in zone "myzone";
16199 * 2) hme0:1 is up and can't be brought down right away;
16200 * ip_sioctl_slifzone() returns EINPROGRESS and the request is queued;
16201 * 3) zone "myzone" is halted; the zone status switches to
16202 * 'shutting_down' and the zones framework sends SIOCGLIFCONF to list
16203 * the interfaces to remove - hme0:1 is not returned because it's not
16204 * yet in "myzone", so it won't be removed;
16205 * 4) the restart function for SIOCSLIFZONE is called; without the
16206 * status check here, we would have hme0:1 in "myzone" after it's been
16207 * destroyed.
16208 * Note that if the status check fails, we need to bring the interface
16209 * back to its state prior to ip_sioctl_slifzone(), hence the call to
16210 * ipif_up_done[_v6]().
16211 */
16212 status = ZONE_IS_UNINITIALIZED;
16213 if ((zptr = zone_find_by_id(zoneid)) != NULL) {
16214 status = zone_status_get(zptr);
16215 zone_rele(zptr);
16216 }
16217 if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING) {
16218 if (ipif->ipif_isv6) {
16219 (void) ipif_up_done_v6(ipif);
16220 } else {
16221 (void) ipif_up_done(ipif);
16222 }
16223 return (EINVAL);
16224 }
16225
16226 (void) ipif_down_tail(ipif);
16227
16228 return (ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp,
16229 B_TRUE));
16230 }
16231
16232 /*
16233 * Return the number of addresses on `ill' with one or more of the values
16234 * in `set' set and all of the values in `clear' clear.
16235 */
16236 static uint_t
16237 ill_flagaddr_cnt(const ill_t *ill, uint64_t set, uint64_t clear)
16238 {
16239 ipif_t *ipif;
16240 uint_t cnt = 0;
16241
16242 ASSERT(IAM_WRITER_ILL(ill));
16243
16244 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
16245 if ((ipif->ipif_flags & set) && !(ipif->ipif_flags & clear))
16246 cnt++;
16247
16248 return (cnt);
16249 }
16250
16251 /*
16252 * Return the number of migratable addresses on `ill' that are under
16253 * application control.
16254 */
16255 uint_t
16256 ill_appaddr_cnt(const ill_t *ill)
16257 {
16258 return (ill_flagaddr_cnt(ill, IPIF_DHCPRUNNING | IPIF_ADDRCONF,
16259 IPIF_NOFAILOVER));
16260 }
16261
16262 /*
16263 * Return the number of point-to-point addresses on `ill'.
16264 */
16265 uint_t
16266 ill_ptpaddr_cnt(const ill_t *ill)
16267 {
16268 return (ill_flagaddr_cnt(ill, IPIF_POINTOPOINT, 0));
16269 }
16270
16271 /* ARGSUSED */
16272 int
16273 ip_sioctl_get_lifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
16274 ip_ioctl_cmd_t *ipip, void *ifreq)
16275 {
16276 struct lifreq *lifr = ifreq;
16277
16278 ASSERT(q->q_next == NULL);
16279 ASSERT(CONN_Q(q));
16280
16281 ip1dbg(("ip_sioctl_get_lifusesrc(%s:%u %p)\n",
16282 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
16283 lifr->lifr_index = ipif->ipif_ill->ill_usesrc_ifindex;
16284 ip1dbg(("ip_sioctl_get_lifusesrc:lifr_index = %d\n", lifr->lifr_index));
16285
16286 return (0);
16287 }
16288
16289 /* Find the previous ILL in this usesrc group */
16290 static ill_t *
16291 ill_prev_usesrc(ill_t *uill)
16292 {
16293 ill_t *ill;
16294
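	/* The list is circular, so walking forward finds the predecessor. */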
16295 for (ill = uill->ill_usesrc_grp_next;
16296 ASSERT(ill), ill->ill_usesrc_grp_next != uill;
16297 ill = ill->ill_usesrc_grp_next)
16298 /* do nothing */;
16299 return (ill);
16300 }
16301
16302 /*
16303 * Release all members of the usesrc group. This routine is called
16304 * from ill_delete when the interface being unplumbed is the
16305 * group head.
16306 *
 * This silently clears the usesrc that ifconfig set up.
 * An alternative would be to keep that ifindex and drop packets on the floor,
 * since no source address can be selected.
 * Even if we keep the current semantics, we don't need a lock and a linked
 * list: we can walk all the ills, checking whether they have an
 * ill_usesrc_ifindex matching the one being removed. The issue is how we
 * return the usesrc users (SIOCGLIFSRCOF): we want to be able to find the
 * ills which have an ill_usesrc_ifindex matching a target ill. We could also
 * do that with an ill walk, but the walker would need to insert in the ioctl
 * response.
16316 */
16317 static void
16318 ill_disband_usesrc_group(ill_t *uill)
16319 {
16320 ill_t *next_ill, *tmp_ill;
16321 ip_stack_t *ipst = uill->ill_ipst;
16322
16323 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_usesrc_lock));
16324 next_ill = uill->ill_usesrc_grp_next;
16325
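	/* Unlink each client in turn until we wrap back around to the head. */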
16326 do {
16327 ASSERT(next_ill != NULL);
16328 tmp_ill = next_ill->ill_usesrc_grp_next;
16329 ASSERT(tmp_ill != NULL);
16330 next_ill->ill_usesrc_grp_next = NULL;
16331 next_ill->ill_usesrc_ifindex = 0;
16332 next_ill = tmp_ill;
16333 } while (next_ill->ill_usesrc_ifindex != 0);
16334 uill->ill_usesrc_grp_next = NULL;
16335 }
16336
16337 /*
16338 * Remove the client usesrc ILL from the list and relink to a new list
16339 */
16340 int
16341 ill_relink_usesrc_ills(ill_t *ucill, ill_t *uill, uint_t ifindex)
16342 {
16343 ill_t *ill, *tmp_ill;
16344 ip_stack_t *ipst = ucill->ill_ipst;
16345
16346 ASSERT((ucill != NULL) && (ucill->ill_usesrc_grp_next != NULL) &&
16347 (uill != NULL) && RW_WRITE_HELD(&ipst->ips_ill_g_usesrc_lock));
16348
16349 /*
	 * Check that the usesrc client ILL passed in is not already in
	 * use as a usesrc ILL (i.e., one whose source addresses are in
	 * use), and that the usesrc ILL is not already in use as a
	 * usesrc client ILL; if either holds, fail.
16354 */
16355 if ((ucill->ill_usesrc_ifindex == 0) ||
16356 (uill->ill_usesrc_ifindex != 0)) {
16357 return (-1);
16358 }
16359
16360 ill = ill_prev_usesrc(ucill);
16361 ASSERT(ill->ill_usesrc_grp_next != NULL);
16362
16363 /* Remove from the current list */
16364 if (ill->ill_usesrc_grp_next->ill_usesrc_grp_next == ill) {
16365 /* Only two elements in the list */
16366 ASSERT(ill->ill_usesrc_ifindex == 0);
16367 ill->ill_usesrc_grp_next = NULL;
16368 } else {
16369 ill->ill_usesrc_grp_next = ucill->ill_usesrc_grp_next;
16370 }
16371
16372 if (ifindex == 0) {
16373 ucill->ill_usesrc_ifindex = 0;
16374 ucill->ill_usesrc_grp_next = NULL;
16375 return (0);
16376 }
16377
16378 ucill->ill_usesrc_ifindex = ifindex;
16379 tmp_ill = uill->ill_usesrc_grp_next;
16380 uill->ill_usesrc_grp_next = ucill;
16381 ucill->ill_usesrc_grp_next =
16382 (tmp_ill != NULL) ? tmp_ill : uill;
16383 return (0);
16384 }
16385
16386 /*
 * Set the ill_usesrc_ifindex and ill_usesrc_grp_next fields. See
 * synchronization notes in ip.c for locking details.
16389 */
16390 /* ARGSUSED */
16391 int
16392 ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
16393 ip_ioctl_cmd_t *ipip, void *ifreq)
16394 {
16395 struct lifreq *lifr = (struct lifreq *)ifreq;
16396 boolean_t isv6 = B_FALSE, reset_flg = B_FALSE;
16397 ill_t *usesrc_ill, *usesrc_cli_ill = ipif->ipif_ill;
16398 int err = 0, ret;
16399 uint_t ifindex;
16400 ipsq_t *ipsq = NULL;
16401 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
16402
16403 ASSERT(IAM_WRITER_IPIF(ipif));
16404 ASSERT(q->q_next == NULL);
16405 ASSERT(CONN_Q(q));
16406
16407 isv6 = (Q_TO_CONN(q))->conn_family == AF_INET6;
16408
16409 ifindex = lifr->lifr_index;
16410 if (ifindex == 0) {
16411 if (usesrc_cli_ill->ill_usesrc_grp_next == NULL) {
			/* not a usesrc group interface; nothing to reset */
16413 return (0);
16414 }
16415 ifindex = usesrc_cli_ill->ill_usesrc_ifindex;
16416 /* valid reset request */
16417 reset_flg = B_TRUE;
16418 }
16419
16420 usesrc_ill = ill_lookup_on_ifindex(ifindex, isv6, ipst);
16421 if (usesrc_ill == NULL)
16422 return (ENXIO);
16423 if (usesrc_ill == ipif->ipif_ill) {
16424 ill_refrele(usesrc_ill);
16425 return (EINVAL);
16426 }
16427
16428 ipsq = ipsq_try_enter(NULL, usesrc_ill, q, mp, ip_process_ioctl,
16429 NEW_OP, B_TRUE);
16430 if (ipsq == NULL) {
16431 err = EINPROGRESS;
16432 /* Operation enqueued on the ipsq of the usesrc ILL */
16433 goto done;
16434 }
16435
16436 /* USESRC isn't currently supported with IPMP */
16437 if (IS_IPMP(usesrc_ill) || IS_UNDER_IPMP(usesrc_ill)) {
16438 err = ENOTSUP;
16439 goto done;
16440 }
16441
16442 /*
16443 * USESRC isn't compatible with the STANDBY flag. (STANDBY is only
16444 * used by IPMP underlying interfaces, but someone might think it's
16445 * more general and try to use it independently with VNI.)
16446 */
16447 if (usesrc_ill->ill_phyint->phyint_flags & PHYI_STANDBY) {
16448 err = ENOTSUP;
16449 goto done;
16450 }
16451
16452 /*
	 * If the client is already in use as a usesrc ILL, or the usesrc
	 * ILL is already a client, then return EINVAL.
16455 */
16456 if (IS_USESRC_ILL(usesrc_cli_ill) || IS_USESRC_CLI_ILL(usesrc_ill)) {
16457 err = EINVAL;
16458 goto done;
16459 }
16460
16461 /*
16462 * If the ill_usesrc_ifindex field is already set to what it needs to
16463 * be then this is a duplicate operation.
16464 */
16465 if (!reset_flg && usesrc_cli_ill->ill_usesrc_ifindex == ifindex) {
16466 err = 0;
16467 goto done;
16468 }
16469
16470 ip1dbg(("ip_sioctl_slifusesrc: usesrc_cli_ill %s, usesrc_ill %s,"
16471 " v6 = %d", usesrc_cli_ill->ill_name, usesrc_ill->ill_name,
16472 usesrc_ill->ill_isv6));
16473
16474 /*
16475 * ill_g_usesrc_lock global lock protects the ill_usesrc_grp_next
16476 * and the ill_usesrc_ifindex fields
16477 */
16478 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER);
16479
16480 if (reset_flg) {
16481 ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill, 0);
16482 if (ret != 0) {
16483 err = EINVAL;
16484 }
16485 rw_exit(&ipst->ips_ill_g_usesrc_lock);
16486 goto done;
16487 }
16488
16489 /*
16490 * Four possibilities to consider:
16491 * 1. Both usesrc_ill and usesrc_cli_ill are not part of any usesrc grp
16492 * 2. usesrc_ill is part of a group but usesrc_cli_ill isn't
16493 * 3. usesrc_cli_ill is part of a group but usesrc_ill isn't
16494 * 4. Both are part of their respective usesrc groups
16495 */
16496 if ((usesrc_ill->ill_usesrc_grp_next == NULL) &&
16497 (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) {
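		/* Case 1: neither ill is in a group; form a new group of two. */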
16498 ASSERT(usesrc_ill->ill_usesrc_ifindex == 0);
16499 usesrc_cli_ill->ill_usesrc_ifindex = ifindex;
16500 usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill;
16501 usesrc_cli_ill->ill_usesrc_grp_next = usesrc_ill;
16502 } else if ((usesrc_ill->ill_usesrc_grp_next != NULL) &&
16503 (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) {
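		/* Case 2: add the client to the usesrc ill's existing group. */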
16504 usesrc_cli_ill->ill_usesrc_ifindex = ifindex;
16505 /* Insert at head of list */
16506 usesrc_cli_ill->ill_usesrc_grp_next =
16507 usesrc_ill->ill_usesrc_grp_next;
16508 usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill;
16509 } else {
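		/* Cases 3 and 4: the client is already in a group; relink it. */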
16510 ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill,
16511 ifindex);
16512 if (ret != 0)
16513 err = EINVAL;
16514 }
16515 rw_exit(&ipst->ips_ill_g_usesrc_lock);
16516
16517 done:
16518 if (ipsq != NULL)
16519 ipsq_exit(ipsq);
16520 /* The refrele on the lifr_name ipif is done by ip_process_ioctl */
16521 ill_refrele(usesrc_ill);
16522
16523 /* Let conn_ixa caching know that source address selection changed */
16524 ip_update_source_selection(ipst);
16525
16526 return (err);
16527 }
16528
16529 /* ARGSUSED */
16530 int
16531 ip_sioctl_get_dadstate(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
16532 ip_ioctl_cmd_t *ipip, void *if_req)
16533 {
16534 struct lifreq *lifr = (struct lifreq *)if_req;
16535 ill_t *ill = ipif->ipif_ill;
16536
16537 /*
16538 * Need a lock since IFF_UP can be set even when there are
16539 * references to the ipif.
16540 */
16541 mutex_enter(&ill->ill_lock);
16542 if ((ipif->ipif_flags & IPIF_UP) && ipif->ipif_addr_ready == 0)
16543 lifr->lifr_dadstate = DAD_IN_PROGRESS;
16544 else
16545 lifr->lifr_dadstate = DAD_DONE;
16546 mutex_exit(&ill->ill_lock);
16547 return (0);
16548 }
16549
16550 /*
16551 * comparison function used by avl.
16552 */
16553 static int
16554 ill_phyint_compare_index(const void *index_ptr, const void *phyip)
16555 {
16556
16557 uint_t index;
16558
16559 ASSERT(phyip != NULL && index_ptr != NULL);
16560
16561 index = *((uint_t *)index_ptr);
16562 /*
16563 * let the phyint with the lowest index be on top.
16564 */
16565 if (((phyint_t *)phyip)->phyint_ifindex < index)
16566 return (1);
16567 if (((phyint_t *)phyip)->phyint_ifindex > index)
16568 return (-1);
16569 return (0);
16570 }
16571
16572 /*
16573 * comparison function used by avl.
16574 */
16575 static int
16576 ill_phyint_compare_name(const void *name_ptr, const void *phyip)
16577 {
16578 ill_t *ill;
16579 int res = 0;
16580
16581 ASSERT(phyip != NULL && name_ptr != NULL);
16582
16583 if (((phyint_t *)phyip)->phyint_illv4)
16584 ill = ((phyint_t *)phyip)->phyint_illv4;
16585 else
16586 ill = ((phyint_t *)phyip)->phyint_illv6;
16587 ASSERT(ill != NULL);
16588
16589 res = strcmp(ill->ill_name, (char *)name_ptr);
16590 if (res > 0)
16591 return (1);
16592 else if (res < 0)
16593 return (-1);
16594 return (0);
16595 }
16596
16597 /*
16598 * This function is called on the unplumb path via ill_glist_delete() when
16599 * there are no ills left on the phyint and thus the phyint can be freed.
16600 */
16601 static void
16602 phyint_free(phyint_t *phyi)
16603 {
16604 ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
16605
16606 ASSERT(phyi->phyint_illv4 == NULL && phyi->phyint_illv6 == NULL);
16607
16608 /*
16609 * If this phyint was an IPMP meta-interface, blow away the group.
16610 * This is safe to do because all of the illgrps have already been
16611 * removed by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find us.
16612 * If we're cleaning up as a result of failed initialization,
16613 * phyint_grp may be NULL.
16614 */
16615 if ((phyi->phyint_flags & PHYI_IPMP) && (phyi->phyint_grp != NULL)) {
16616 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
16617 ipmp_grp_destroy(phyi->phyint_grp);
16618 phyi->phyint_grp = NULL;
16619 rw_exit(&ipst->ips_ipmp_lock);
16620 }
16621
16622 /*
16623 * If this interface was under IPMP, take it out of the group.
16624 */
16625 if (phyi->phyint_grp != NULL)
16626 ipmp_phyint_leave_grp(phyi);
16627
16628 /*
16629 * Delete the phyint and disassociate its ipsq. The ipsq itself
16630 * will be freed in ipsq_exit().
16631 */
16632 phyi->phyint_ipsq->ipsq_phyint = NULL;
16633 phyi->phyint_name[0] = '\0';
16634
16635 mi_free(phyi);
16636 }
16637
16638 /*
 * Attach the ill to the phyint structure, which can be shared by both
 * the IPv4 and IPv6 ills. ill_init allocates a phyint just to hold flags.
 * This function is called from ipif_set_values and ill_lookup_on_name
 * (for loopback), where we know the name of the ill. We look up the ill
 * and, if one is already present with that name, use its phyint.
 * Otherwise we reuse the one allocated by ill_init.
16645 */
16646 static void
16647 ill_phyint_reinit(ill_t *ill)
16648 {
16649 boolean_t isv6 = ill->ill_isv6;
16650 phyint_t *phyi_old;
16651 phyint_t *phyi;
16652 avl_index_t where = 0;
16653 ill_t *ill_other = NULL;
16654 ip_stack_t *ipst = ill->ill_ipst;
16655
16656 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
16657
16658 phyi_old = ill->ill_phyint;
16659 ASSERT(isv6 || (phyi_old->phyint_illv4 == ill &&
16660 phyi_old->phyint_illv6 == NULL));
16661 ASSERT(!isv6 || (phyi_old->phyint_illv6 == ill &&
16662 phyi_old->phyint_illv4 == NULL));
16663 ASSERT(phyi_old->phyint_ifindex == 0);
16664
16665 /*
16666 * Now that our ill has a name, set it in the phyint.
16667 */
16668 (void) strlcpy(ill->ill_phyint->phyint_name, ill->ill_name, LIFNAMSIZ);
16669
16670 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
16671 ill->ill_name, &where);
16672
16673 /*
16674 * 1. We grabbed the ill_g_lock before inserting this ill into
16675 * the global list of ills. So no other thread could have located
16676 * this ill and hence the ipsq of this ill is guaranteed to be empty.
16677 * 2. Now locate the other protocol instance of this ill.
16678 * 3. Now grab both ill locks in the right order, and the phyint lock of
16679 * the new ipsq. Holding ill locks + ill_g_lock ensures that the ipsq
16680 * of neither ill can change.
16681 * 4. Merge the phyint and thus the ipsq as well of this ill onto the
16682 * other ill.
16683 * 5. Release all locks.
16684 */
16685
16686 /*
16687 * Look for IPv4 if we are initializing IPv6 or look for IPv6 if
16688 * we are initializing IPv4.
16689 */
16690 if (phyi != NULL) {
16691 ill_other = (isv6) ? phyi->phyint_illv4 : phyi->phyint_illv6;
16692 ASSERT(ill_other->ill_phyint != NULL);
16693 ASSERT((isv6 && !ill_other->ill_isv6) ||
16694 (!isv6 && ill_other->ill_isv6));
16695 GRAB_ILL_LOCKS(ill, ill_other);
16696 /*
16697 * We are potentially throwing away phyint_flags which
16698 * could be different from the one that we obtain from
16699 * ill_other->ill_phyint. But it is okay as we are assuming
16700 * that the state maintained within IP is correct.
16701 */
16702 mutex_enter(&phyi->phyint_lock);
16703 if (isv6) {
16704 ASSERT(phyi->phyint_illv6 == NULL);
16705 phyi->phyint_illv6 = ill;
16706 } else {
16707 ASSERT(phyi->phyint_illv4 == NULL);
16708 phyi->phyint_illv4 = ill;
16709 }
16710
16711 /*
16712 * Delete the old phyint and make its ipsq eligible
16713 * to be freed in ipsq_exit().
16714 */
16715 phyi_old->phyint_illv4 = NULL;
16716 phyi_old->phyint_illv6 = NULL;
16717 phyi_old->phyint_ipsq->ipsq_phyint = NULL;
16718 phyi_old->phyint_name[0] = '\0';
16719 mi_free(phyi_old);
16720 } else {
16721 mutex_enter(&ill->ill_lock);
16722 /*
16723 * We don't need to acquire any lock, since
16724 * the ill is not yet visible globally and we
16725 * have not yet released the ill_g_lock.
16726 */
16727 phyi = phyi_old;
16728 mutex_enter(&phyi->phyint_lock);
16729 /* XXX We need a recovery strategy here. */
16730 if (!phyint_assign_ifindex(phyi, ipst))
16731 cmn_err(CE_PANIC, "phyint_assign_ifindex() failed");
16732
16733 avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
16734 (void *)phyi, where);
16735
16736 (void) avl_find(&ipst->ips_phyint_g_list->
16737 phyint_list_avl_by_index,
16738 &phyi->phyint_ifindex, &where);
16739 avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
16740 (void *)phyi, where);
16741 }
16742
16743 /*
	 * Reassigning ill_phyint automatically reassigns the ipsq as well.
	 * The pending mp is not affected because it is kept on a per-ill
	 * basis.
16746 */
16747 ill->ill_phyint = phyi;
16748
16749 /*
	 * Now that the phyint's ifindex has been assigned, complete the
	 * remaining index-dependent initialization.
16752 */
16753 ill->ill_ip_mib->ipIfStatsIfIndex = ill->ill_phyint->phyint_ifindex;
16754 if (ill->ill_isv6) {
16755 ill->ill_icmp6_mib->ipv6IfIcmpIfIndex =
16756 ill->ill_phyint->phyint_ifindex;
16757 ill->ill_mcast_type = ipst->ips_mld_max_version;
16758 } else {
16759 ill->ill_mcast_type = ipst->ips_igmp_max_version;
16760 }
16761
16762 /*
16763 * Generate an event within the hooks framework to indicate that
16764 * a new interface has just been added to IP. For this event to
16765 * be generated, the network interface must, at least, have an
16766 * ifindex assigned to it. (We don't generate the event for
16767 * loopback since ill_lookup_on_name() has its own NE_PLUMB event.)
16768 *
16769 * This needs to be run inside the ill_g_lock perimeter to ensure
16770 * that the ordering of delivered events to listeners matches the
16771 * order of them in the kernel.
16772 */
16773 if (!IS_LOOPBACK(ill)) {
16774 ill_nic_event_dispatch(ill, 0, NE_PLUMB, ill->ill_name,
16775 ill->ill_name_length);
16776 }
16777 RELEASE_ILL_LOCKS(ill, ill_other);
16778 mutex_exit(&phyi->phyint_lock);
16779 }
16780
16781 /*
16782 * Notify any downstream modules of the name of this interface.
16783 * An M_IOCTL is used even though we don't expect a successful reply.
16784 * Any reply message from the driver (presumably an M_IOCNAK) will
16785 * eventually get discarded somewhere upstream. The message format is
16786 * simply an SIOCSLIFNAME ioctl just as might be sent from ifconfig
16787 * to IP.
16788 */
16789 static void
16790 ip_ifname_notify(ill_t *ill, queue_t *q)
16791 {
16792 mblk_t *mp1, *mp2;
16793 struct iocblk *iocp;
16794 struct lifreq *lifr;
16795
16796 mp1 = mkiocb(SIOCSLIFNAME);
16797 if (mp1 == NULL)
16798 return;
16799 mp2 = allocb(sizeof (struct lifreq), BPRI_HI);
16800 if (mp2 == NULL) {
16801 freeb(mp1);
16802 return;
16803 }
16804
16805 mp1->b_cont = mp2;
16806 iocp = (struct iocblk *)mp1->b_rptr;
16807 iocp->ioc_count = sizeof (struct lifreq);
16808
16809 lifr = (struct lifreq *)mp2->b_rptr;
16810 mp2->b_wptr += sizeof (struct lifreq);
16811 bzero(lifr, sizeof (struct lifreq));
16812
16813 (void) strncpy(lifr->lifr_name, ill->ill_name, LIFNAMSIZ);
16814 lifr->lifr_ppa = ill->ill_ppa;
16815 lifr->lifr_flags = (ill->ill_flags & (ILLF_IPV4|ILLF_IPV6));
16816
16817 DTRACE_PROBE3(ill__dlpi, char *, "ip_ifname_notify",
16818 char *, "SIOCSLIFNAME", ill_t *, ill);
16819 putnext(q, mp1);
16820 }
16821
16822 static int
16823 ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q)
16824 {
16825 int err;
16826 ip_stack_t *ipst = ill->ill_ipst;
16827 phyint_t *phyi = ill->ill_phyint;
16828
16829 /*
16830 * Now that ill_name is set, the configuration for the IPMP
16831 * meta-interface can be performed.
16832 */
16833 if (IS_IPMP(ill)) {
16834 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
16835 /*
16836 * If phyi->phyint_grp is NULL, then this is the first IPMP
16837 * meta-interface and we need to create the IPMP group.
16838 */
16839 if (phyi->phyint_grp == NULL) {
16840 /*
16841 * If someone has renamed another IPMP group to have
16842 * the same name as our interface, bail.
16843 */
16844 if (ipmp_grp_lookup(ill->ill_name, ipst) != NULL) {
16845 rw_exit(&ipst->ips_ipmp_lock);
16846 return (EEXIST);
16847 }
16848 phyi->phyint_grp = ipmp_grp_create(ill->ill_name, phyi);
16849 if (phyi->phyint_grp == NULL) {
16850 rw_exit(&ipst->ips_ipmp_lock);
16851 return (ENOMEM);
16852 }
16853 }
16854 rw_exit(&ipst->ips_ipmp_lock);
16855 }
16856
16857 /* Tell downstream modules where they are. */
16858 ip_ifname_notify(ill, q);
16859
16860 /*
16861 * ill_dl_phys returns EINPROGRESS in the usual case.
16862 * Error cases are ENOMEM ...
16863 */
16864 err = ill_dl_phys(ill, ipif, mp, q);
16865
16866 if (ill->ill_isv6) {
16867 mutex_enter(&ipst->ips_mld_slowtimeout_lock);
16868 if (ipst->ips_mld_slowtimeout_id == 0) {
16869 ipst->ips_mld_slowtimeout_id = timeout(mld_slowtimo,
16870 (void *)ipst,
16871 MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
16872 }
16873 mutex_exit(&ipst->ips_mld_slowtimeout_lock);
16874 } else {
16875 mutex_enter(&ipst->ips_igmp_slowtimeout_lock);
16876 if (ipst->ips_igmp_slowtimeout_id == 0) {
16877 ipst->ips_igmp_slowtimeout_id = timeout(igmp_slowtimo,
16878 (void *)ipst,
16879 MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
16880 }
16881 mutex_exit(&ipst->ips_igmp_slowtimeout_lock);
16882 }
16883
16884 return (err);
16885 }
16886
16887 /*
16888 * Common routine for ppa and ifname setting. Should be called exclusive.
16889 *
16890 * Returns EINPROGRESS when mp has been consumed by queueing it on
16891 * ipx_pending_mp and the ioctl will complete in ip_rput.
16892 *
 * NOTE: If ppa is UINT_MAX, we assign the next valid ppa and return
 * the new name and new ppa in lifr_name and lifr_ppa respectively.
 * For SIOCSLIFNAME, we pass these values back to userland.
16896 */
16897 static int
16898 ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr)
16899 {
16900 ill_t *ill;
16901 ipif_t *ipif;
16902 ipsq_t *ipsq;
16903 char *ppa_ptr;
16904 char *old_ptr;
16905 char old_char;
16906 int error;
16907 ip_stack_t *ipst;
16908
16909 ip1dbg(("ipif_set_values: interface %s\n", interf_name));
16910 ASSERT(q->q_next != NULL);
16911 ASSERT(interf_name != NULL);
16912
16913 ill = (ill_t *)q->q_ptr;
16914 ipst = ill->ill_ipst;
16915
16916 ASSERT(ill->ill_ipst != NULL);
16917 ASSERT(ill->ill_name[0] == '\0');
16918 ASSERT(IAM_WRITER_ILL(ill));
16919 ASSERT((mi_strlen(interf_name) + 1) <= LIFNAMSIZ);
16920 ASSERT(ill->ill_ppa == UINT_MAX);
16921
16922 ill->ill_defend_start = ill->ill_defend_count = 0;
16923 /* The ppa is sent down by ifconfig or is chosen */
16924 if ((ppa_ptr = ill_get_ppa_ptr(interf_name)) == NULL) {
16925 return (EINVAL);
16926 }
16927
16928 /*
	 * Make sure the ppa passed in is the same as the ppa in the name.
	 * This check is not made when ppa == UINT_MAX; in that case the
	 * ppa in the name could be anything. The system will choose a ppa
	 * and update new_ppa_ptr and interf_name to contain the chosen ppa.
16933 */
16934 if (*new_ppa_ptr != UINT_MAX) {
16935 /* stoi changes the pointer */
16936 old_ptr = ppa_ptr;
16937 /*
16938 * ifconfig passed in 0 for the ppa for DLPI 1 style devices
16939 * (they don't have an externally visible ppa). We assign one
16940 * here so that we can manage the interface. Note that in
16941 * the past this value was always 0 for DLPI 1 drivers.
16942 */
16943 if (*new_ppa_ptr == 0)
16944 *new_ppa_ptr = stoi(&old_ptr);
16945 else if (*new_ppa_ptr != (uint_t)stoi(&old_ptr))
16946 return (EINVAL);
16947 }
16948 /*
	 * Terminate the string before the ppa, saving the character at
	 * that location so it can be restored afterwards.
16951 */
16952 old_char = ppa_ptr[0];
16953 ppa_ptr[0] = '\0';
16954
16955 ill->ill_ppa = *new_ppa_ptr;
16956 /*
16957 * Finish as much work now as possible before calling ill_glist_insert
16958 * which makes the ill globally visible and also merges it with the
16959 * other protocol instance of this phyint. The remaining work is
16960 * done after entering the ipsq which may happen sometime later.
16961 */
16962 ipif = ill->ill_ipif;
16963
16964 /* We didn't do this when we allocated ipif in ip_ll_subnet_defaults */
16965 ipif_assign_seqid(ipif);
16966
16967 if (!(ill->ill_flags & (ILLF_IPV4|ILLF_IPV6)))
16968 ill->ill_flags |= ILLF_IPV4;
16969
16970 ASSERT(ipif->ipif_next == NULL); /* Only one ipif on ill */
16971 ASSERT((ipif->ipif_flags & IPIF_UP) == 0);
16972
16973 if (ill->ill_flags & ILLF_IPV6) {
16974
16975 ill->ill_isv6 = B_TRUE;
16976 ill_set_inputfn(ill);
16977 if (ill->ill_rq != NULL) {
16978 ill->ill_rq->q_qinfo = &iprinitv6;
16979 }
16980
16981 /* Keep the !IN6_IS_ADDR_V4MAPPED assertions happy */
16982 ipif->ipif_v6lcl_addr = ipv6_all_zeros;
16983 ipif->ipif_v6subnet = ipv6_all_zeros;
16984 ipif->ipif_v6net_mask = ipv6_all_zeros;
16985 ipif->ipif_v6brd_addr = ipv6_all_zeros;
16986 ipif->ipif_v6pp_dst_addr = ipv6_all_zeros;
16987 ill->ill_reachable_retrans_time = ND_RETRANS_TIMER;
16988 /*
		 * Point-to-point or non-multicast-capable
16990 * interfaces won't do NUD unless explicitly
16991 * configured to do so.
16992 */
16993 if (ipif->ipif_flags & IPIF_POINTOPOINT ||
16994 !(ill->ill_flags & ILLF_MULTICAST)) {
16995 ill->ill_flags |= ILLF_NONUD;
16996 }
16997 /* Make sure IPv4 specific flag is not set on IPv6 if */
16998 if (ill->ill_flags & ILLF_NOARP) {
16999 /*
17000 * Note: xresolv interfaces will eventually need
17001 * NOARP set here as well, but that will require
17002 * those external resolvers to have some
17003 * knowledge of that flag and act appropriately.
17004 * Not to be changed at present.
17005 */
17006 ill->ill_flags &= ~ILLF_NOARP;
17007 }
17008 /*
17009 * Set the ILLF_ROUTER flag according to the global
17010 * IPv6 forwarding policy.
17011 */
17012 if (ipst->ips_ipv6_forwarding != 0)
17013 ill->ill_flags |= ILLF_ROUTER;
17014 } else if (ill->ill_flags & ILLF_IPV4) {
17015 ill->ill_isv6 = B_FALSE;
17016 ill_set_inputfn(ill);
17017 ill->ill_reachable_retrans_time = ARP_RETRANS_TIMER;
17018 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6lcl_addr);
17019 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6subnet);
17020 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6net_mask);
17021 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6brd_addr);
17022 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6pp_dst_addr);
17023 /*
17024 * Set the ILLF_ROUTER flag according to the global
17025 * IPv4 forwarding policy.
17026 */
17027 if (ipst->ips_ip_forwarding != 0)
17028 ill->ill_flags |= ILLF_ROUTER;
17029 }
17030
17031 ASSERT(ill->ill_phyint != NULL);
17032
17033 /*
17034 * The ipIfStatsIfindex and ipv6IfIcmpIfIndex assignments will
17035 * be completed in ill_glist_insert -> ill_phyint_reinit
17036 */
17037 if (!ill_allocate_mibs(ill))
17038 return (ENOMEM);
17039
17040 /*
17041 * Pick a default sap until we get the DL_INFO_ACK back from
17042 * the driver.
17043 */
17044 ill->ill_sap = (ill->ill_isv6) ? ill->ill_media->ip_m_ipv6sap :
17045 ill->ill_media->ip_m_ipv4sap;
17046
17047 ill->ill_ifname_pending = 1;
17048 ill->ill_ifname_pending_err = 0;
17049
17050 /*
17051 * When the first ipif comes up in ipif_up_done(), multicast groups
17052 * that were joined while this ill was not bound to the DLPI link need
17053 * to be recovered by ill_recover_multicast().
17054 */
17055 ill->ill_need_recover_multicast = 1;
17056
17057 ill_refhold(ill);
17058 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
17059 if ((error = ill_glist_insert(ill, interf_name,
17060 (ill->ill_flags & ILLF_IPV6) == ILLF_IPV6)) > 0) {
17061 ill->ill_ppa = UINT_MAX;
17062 ill->ill_name[0] = '\0';
17063 /*
17064 * undo null termination done above.
17065 */
17066 ppa_ptr[0] = old_char;
17067 rw_exit(&ipst->ips_ill_g_lock);
17068 ill_refrele(ill);
17069 return (error);
17070 }
17071
17072 ASSERT(ill->ill_name_length <= LIFNAMSIZ);
17073
17074 /*
	 * When we return, the buffer pointed to by interf_name should
	 * contain the same name as ill_name.
	 * If a ppa was chosen by the system (the ppa passed in was
	 * UINT_MAX), the buffer pointed to by new_ppa_ptr would not contain
	 * the right ppa, so copy the full name and update the ppa pointer.
	 * When the ppa passed in != UINT_MAX, all values are correct; just
	 * undo the null termination, which saves a bcopy.
17082 */
17083 if (*new_ppa_ptr == UINT_MAX) {
17084 bcopy(ill->ill_name, interf_name, ill->ill_name_length);
17085 *new_ppa_ptr = ill->ill_ppa;
17086 } else {
17087 /*
17088 * undo null termination done above.
17089 */
17090 ppa_ptr[0] = old_char;
17091 }
17092
17093 /* Let SCTP know about this ILL */
17094 sctp_update_ill(ill, SCTP_ILL_INSERT);
17095
17096 /*
17097 * ill_glist_insert has made the ill visible globally, and
17098 * ill_phyint_reinit could have changed the ipsq. At this point,
17099 * we need to hold the ips_ill_g_lock across the call to enter the
17100 * ipsq to enforce atomicity and prevent reordering. In the event
17101 * the ipsq has changed, and if the new ipsq is currently busy,
17102 * we need to make sure that this half-completed ioctl is ahead of
17103 * any subsequent ioctl. We achieve this by not dropping the
17104 * ips_ill_g_lock which prevents any ill lookup itself thereby
17105 * ensuring that new ioctls can't start.
17106 */
17107 ipsq = ipsq_try_enter_internal(ill, q, mp, ip_reprocess_ioctl, NEW_OP,
17108 B_TRUE);
17109
17110 rw_exit(&ipst->ips_ill_g_lock);
17111 ill_refrele(ill);
17112 if (ipsq == NULL)
17113 return (EINPROGRESS);
17114
17115 /*
17116 * If ill_phyint_reinit() changed our ipsq, then start on the new ipsq.
17117 */
17118 if (ipsq->ipsq_xop->ipx_current_ipif == NULL)
17119 ipsq_current_start(ipsq, ipif, SIOCSLIFNAME);
17120 else
17121 ASSERT(ipsq->ipsq_xop->ipx_current_ipif == ipif);
17122
17123 error = ipif_set_values_tail(ill, ipif, mp, q);
17124 ipsq_exit(ipsq);
17125 if (error != 0 && error != EINPROGRESS) {
17126 /*
17127 * restore previous values
17128 */
17129 ill->ill_isv6 = B_FALSE;
17130 ill_set_inputfn(ill);
17131 }
17132 return (error);
17133 }
17134
17135 void
17136 ipif_init(ip_stack_t *ipst)
17137 {
17138 int i;
17139
17140 for (i = 0; i < MAX_G_HEADS; i++) {
17141 ipst->ips_ill_g_heads[i].ill_g_list_head =
17142 (ill_if_t *)&ipst->ips_ill_g_heads[i];
17143 ipst->ips_ill_g_heads[i].ill_g_list_tail =
17144 (ill_if_t *)&ipst->ips_ill_g_heads[i];
17145 }
17146
17147 avl_create(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
17148 ill_phyint_compare_index,
17149 sizeof (phyint_t),
17150 offsetof(struct phyint, phyint_avl_by_index));
17151 avl_create(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
17152 ill_phyint_compare_name,
17153 sizeof (phyint_t),
17154 offsetof(struct phyint, phyint_avl_by_name));
17155 }
17156
17157 /*
17158 * Save enough information so that we can recreate the IRE if
17159 * the interface goes down and then up.
17160 */
17161 void
17162 ill_save_ire(ill_t *ill, ire_t *ire)
17163 {
17164 mblk_t *save_mp;
17165
17166 save_mp = allocb(sizeof (ifrt_t), BPRI_MED);
17167 if (save_mp != NULL) {
17168 ifrt_t *ifrt;
17169
17170 save_mp->b_wptr += sizeof (ifrt_t);
17171 ifrt = (ifrt_t *)save_mp->b_rptr;
17172 bzero(ifrt, sizeof (ifrt_t));
17173 ifrt->ifrt_type = ire->ire_type;
17174 if (ire->ire_ipversion == IPV4_VERSION) {
17175 ASSERT(!ill->ill_isv6);
17176 ifrt->ifrt_addr = ire->ire_addr;
17177 ifrt->ifrt_gateway_addr = ire->ire_gateway_addr;
17178 ifrt->ifrt_setsrc_addr = ire->ire_setsrc_addr;
17179 ifrt->ifrt_mask = ire->ire_mask;
17180 } else {
17181 ASSERT(ill->ill_isv6);
17182 ifrt->ifrt_v6addr = ire->ire_addr_v6;
17183 /* ire_gateway_addr_v6 can change due to RTM_CHANGE */
17184 mutex_enter(&ire->ire_lock);
17185 ifrt->ifrt_v6gateway_addr = ire->ire_gateway_addr_v6;
17186 mutex_exit(&ire->ire_lock);
17187 ifrt->ifrt_v6setsrc_addr = ire->ire_setsrc_addr_v6;
17188 ifrt->ifrt_v6mask = ire->ire_mask_v6;
17189 }
17190 ifrt->ifrt_flags = ire->ire_flags;
17191 ifrt->ifrt_zoneid = ire->ire_zoneid;
17192 mutex_enter(&ill->ill_saved_ire_lock);
17193 save_mp->b_cont = ill->ill_saved_ire_mp;
17194 ill->ill_saved_ire_mp = save_mp;
17195 ill->ill_saved_ire_cnt++;
17196 mutex_exit(&ill->ill_saved_ire_lock);
17197 }
17198 }
17199
17200 /*
17201 * Remove one entry from ill_saved_ire_mp.
17202 */
17203 void
17204 ill_remove_saved_ire(ill_t *ill, ire_t *ire)
17205 {
17206 mblk_t **mpp;
17207 mblk_t *mp;
17208 ifrt_t *ifrt;
17209
17210 /* Remove from ill_saved_ire_mp list if it is there */
17211 mutex_enter(&ill->ill_saved_ire_lock);
17212 for (mpp = &ill->ill_saved_ire_mp; *mpp != NULL;
17213 mpp = &(*mpp)->b_cont) {
17214 in6_addr_t gw_addr_v6;
17215
17216 /*
17217 * On a given ill, the tuple of address, gateway, mask,
17218 * ire_type, and zoneid is unique for each saved IRE.
17219 */
17220 mp = *mpp;
17221 ifrt = (ifrt_t *)mp->b_rptr;
17222 /* ire_gateway_addr_v6 can change - need lock */
17223 mutex_enter(&ire->ire_lock);
17224 gw_addr_v6 = ire->ire_gateway_addr_v6;
17225 mutex_exit(&ire->ire_lock);
17226
17227 if (ifrt->ifrt_zoneid != ire->ire_zoneid ||
17228 ifrt->ifrt_type != ire->ire_type)
17229 continue;
17230
17231 if (ill->ill_isv6 ?
17232 (IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6addr,
17233 &ire->ire_addr_v6) &&
17234 IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6gateway_addr,
17235 &gw_addr_v6) &&
17236 IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6mask,
17237 &ire->ire_mask_v6)) :
17238 (ifrt->ifrt_addr == ire->ire_addr &&
17239 ifrt->ifrt_gateway_addr == ire->ire_gateway_addr &&
17240 ifrt->ifrt_mask == ire->ire_mask)) {
17241 *mpp = mp->b_cont;
17242 ill->ill_saved_ire_cnt--;
17243 freeb(mp);
17244 break;
17245 }
17246 }
17247 mutex_exit(&ill->ill_saved_ire_lock);
17248 }
17249
17250 /*
17251 * IP multirouting broadcast routes handling
17252 * Append CGTP broadcast IREs to regular ones created
17253 * at ifconfig time.
17254 * The usage is a route add <cgtp_bc> <nic_bc> -multirt i.e., both
17255 * the destination and the gateway are broadcast addresses.
17256 * The caller has verified that the destination is an IRE_BROADCAST and that
17257 * RTF_MULTIRT was set. Here if the gateway is a broadcast address, then
17258 * we create a MULTIRT IRE_BROADCAST.
17259 * Note that the IRE_HOST created by ire_rt_add doesn't get found by anything
17260 * since the IRE_BROADCAST takes precedence; ire_add_v4 does head insertion.
17261 */
17262 static void
17263 ip_cgtp_bcast_add(ire_t *ire, ip_stack_t *ipst)
17264 {
17265 ire_t *ire_prim;
17266
17267 ASSERT(ire != NULL);
17268
17269 ire_prim = ire_ftable_lookup_v4(ire->ire_gateway_addr, 0, 0,
17270 IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst,
17271 NULL);
17272 if (ire_prim != NULL) {
17273 /*
17274 * We are in the special case of broadcasts for
17275 * CGTP. We add an IRE_BROADCAST that holds
17276 * the RTF_MULTIRT flag, the destination
17277 * address and the low level
17278 * info of ire_prim. In other words, CGTP
17279 * broadcast is added to the redundant ipif.
17280 */
17281 ill_t *ill_prim;
17282 ire_t *bcast_ire;
17283
17284 ill_prim = ire_prim->ire_ill;
17285
17286 ip2dbg(("ip_cgtp_filter_bcast_add: ire_prim %p, ill_prim %p\n",
17287 (void *)ire_prim, (void *)ill_prim));
17288
17289 bcast_ire = ire_create(
17290 (uchar_t *)&ire->ire_addr,
17291 (uchar_t *)&ip_g_all_ones,
17292 (uchar_t *)&ire->ire_gateway_addr,
17293 IRE_BROADCAST,
17294 ill_prim,
17295 GLOBAL_ZONEID, /* CGTP is only for the global zone */
17296 ire->ire_flags | RTF_KERNEL,
17297 NULL,
17298 ipst);
17299
17300 /*
17301 * Here we assume that ire_add does head insertion so that
17302 * the added IRE_BROADCAST comes before the existing IRE_HOST.
17303 */
17304 if (bcast_ire != NULL) {
17305 if (ire->ire_flags & RTF_SETSRC) {
17306 bcast_ire->ire_setsrc_addr =
17307 ire->ire_setsrc_addr;
17308 }
17309 bcast_ire = ire_add(bcast_ire);
17310 if (bcast_ire != NULL) {
17311 ip2dbg(("ip_cgtp_filter_bcast_add: "
17312 "added bcast_ire %p\n",
17313 (void *)bcast_ire));
17314
17315 ill_save_ire(ill_prim, bcast_ire);
17316 ire_refrele(bcast_ire);
17317 }
17318 }
17319 ire_refrele(ire_prim);
17320 }
17321 }
17322
17323 /*
17324 * IP multirouting broadcast routes handling
17325 * Remove the broadcast ire.
17326 * The usage is a route delete <cgtp_bc> <nic_bc> -multirt i.e., both
17327 * the destination and the gateway are broadcast addresses.
17328 * The caller has only verified that RTF_MULTIRT was set. We check
17329 * that the destination is broadcast and that the gateway is a broadcast
17330 * address, and if so delete the IRE added by ip_cgtp_bcast_add().
17331 */
17332 static void
17333 ip_cgtp_bcast_delete(ire_t *ire, ip_stack_t *ipst)
17334 {
17335 ASSERT(ire != NULL);
17336
17337 if (ip_type_v4(ire->ire_addr, ipst) == IRE_BROADCAST) {
17338 ire_t *ire_prim;
17339
17340 ire_prim = ire_ftable_lookup_v4(ire->ire_gateway_addr, 0, 0,
17341 IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0,
17342 ipst, NULL);
17343 if (ire_prim != NULL) {
17344 ill_t *ill_prim;
17345 ire_t *bcast_ire;
17346
17347 ill_prim = ire_prim->ire_ill;
17348
17349 ip2dbg(("ip_cgtp_filter_bcast_delete: "
17350 "ire_prim %p, ill_prim %p\n",
17351 (void *)ire_prim, (void *)ill_prim));
17352
17353 bcast_ire = ire_ftable_lookup_v4(ire->ire_addr, 0,
17354 ire->ire_gateway_addr, IRE_BROADCAST,
17355 ill_prim, ALL_ZONES, NULL,
17356 MATCH_IRE_TYPE | MATCH_IRE_GW | MATCH_IRE_ILL |
17357 MATCH_IRE_MASK, 0, ipst, NULL);
17358
17359 if (bcast_ire != NULL) {
17360 ip2dbg(("ip_cgtp_filter_bcast_delete: "
17361 "looked up bcast_ire %p\n",
17362 (void *)bcast_ire));
17363 ill_remove_saved_ire(bcast_ire->ire_ill,
17364 bcast_ire);
17365 ire_delete(bcast_ire);
17366 ire_refrele(bcast_ire);
17367 }
17368 ire_refrele(ire_prim);
17369 }
17370 }
17371 }
17372
17373 /*
17374 * Derive an interface id from the link layer address.
17375 * Knows about IEEE 802 and IEEE EUI-64 mappings.
17376 */
17377 static void
17378 ip_ether_v6intfid(ill_t *ill, in6_addr_t *v6addr)
17379 {
17380 char *addr;
17381
17382 /*
17383 * Note that some IPv6 interfaces get plumbed over links that claim to
17384 * be DL_ETHER, but don't actually have Ethernet MAC addresses (e.g.
17385 * PPP links). The ETHERADDRL check here ensures that we only set the
17386 * interface ID on IPv6 interfaces above links that actually have real
17387 * Ethernet addresses.
17388 */
17389 if (ill->ill_phys_addr_length == ETHERADDRL) {
17390 /* Form EUI-64 like address */
17391 addr = (char *)&v6addr->s6_addr32[2];
17392 bcopy(ill->ill_phys_addr, addr, 3);
17393 addr[0] ^= 0x2; /* Toggle Universal/Local bit */
17394 addr[3] = (char)0xff;
17395 addr[4] = (char)0xfe;
17396 bcopy(ill->ill_phys_addr + 3, addr + 5, 3);
17397 }
17398 }
17399
17400 /* ARGSUSED */
17401 static void
17402 ip_nodef_v6intfid(ill_t *ill, in6_addr_t *v6addr)
17403 {
17404 }
17405
17406 typedef struct ipmp_ifcookie {
17407 uint32_t ic_hostid;
17408 char ic_ifname[LIFNAMSIZ];
17409 char ic_zonename[ZONENAME_MAX];
17410 } ipmp_ifcookie_t;
17411
17412 /*
17413 * Construct a pseudo-random interface ID for the IPMP interface that's both
17414 * predictable and (almost) guaranteed to be unique.
17415 */
17416 static void
17417 ip_ipmp_v6intfid(ill_t *ill, in6_addr_t *v6addr)
17418 {
17419 zone_t *zp;
17420 uint8_t *addr;
17421 uchar_t hash[16];
17422 ulong_t hostid;
17423 MD5_CTX ctx;
17424 ipmp_ifcookie_t ic = { 0 };
17425
17426 ASSERT(IS_IPMP(ill));
17427
17428 (void) ddi_strtoul(hw_serial, NULL, 10, &hostid);
17429 ic.ic_hostid = htonl((uint32_t)hostid);
17430
17431 (void) strlcpy(ic.ic_ifname, ill->ill_name, LIFNAMSIZ);
17432
17433 if ((zp = zone_find_by_id(ill->ill_zoneid)) != NULL) {
17434 (void) strlcpy(ic.ic_zonename, zp->zone_name, ZONENAME_MAX);
17435 zone_rele(zp);
17436 }
17437
17438 MD5Init(&ctx);
17439 MD5Update(&ctx, &ic, sizeof (ic));
17440 MD5Final(hash, &ctx);
17441
17442 /*
	 * Map the hash to an interface ID per the basic approach in RFC 3041.
17444 */
17445 addr = &v6addr->s6_addr8[8];
17446 bcopy(hash + 8, addr, sizeof (uint64_t));
	addr[0] &= ~0x2;	/* clear universal/local bit: locally administered */
17448 }
17449
17450 /*
17451 * Map the multicast in6_addr_t in m_ip6addr to the physaddr for ethernet.
17452 */
17453 static void
17454 ip_ether_v6_mapping(ill_t *ill, uchar_t *m_ip6addr, uchar_t *m_physaddr)
17455 {
17456 phyint_t *phyi = ill->ill_phyint;
17457
17458 /*
17459 * Check PHYI_MULTI_BCAST and length of physical
17460 * address to determine if we use the mapping or the
17461 * broadcast address.
17462 */
17463 if ((phyi->phyint_flags & PHYI_MULTI_BCAST) != 0 ||
17464 ill->ill_phys_addr_length != ETHERADDRL) {
17465 ip_mbcast_mapping(ill, m_ip6addr, m_physaddr);
17466 return;
17467 }
17468 m_physaddr[0] = 0x33;
17469 m_physaddr[1] = 0x33;
17470 m_physaddr[2] = m_ip6addr[12];
17471 m_physaddr[3] = m_ip6addr[13];
17472 m_physaddr[4] = m_ip6addr[14];
17473 m_physaddr[5] = m_ip6addr[15];
17474 }
17475
17476 /*
17477 * Map the multicast ipaddr_t in m_ipaddr to the physaddr for ethernet.
17478 */
17479 static void
17480 ip_ether_v4_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr)
17481 {
17482 phyint_t *phyi = ill->ill_phyint;
17483
17484 /*
17485 * Check PHYI_MULTI_BCAST and length of physical
17486 * address to determine if we use the mapping or the
17487 * broadcast address.
17488 */
17489 if ((phyi->phyint_flags & PHYI_MULTI_BCAST) != 0 ||
17490 ill->ill_phys_addr_length != ETHERADDRL) {
17491 ip_mbcast_mapping(ill, m_ipaddr, m_physaddr);
17492 return;
17493 }
17494 m_physaddr[0] = 0x01;
17495 m_physaddr[1] = 0x00;
17496 m_physaddr[2] = 0x5e;
17497 m_physaddr[3] = m_ipaddr[1] & 0x7f;
17498 m_physaddr[4] = m_ipaddr[2];
17499 m_physaddr[5] = m_ipaddr[3];
17500 }
17501
17502 /* ARGSUSED */
17503 static void
17504 ip_mbcast_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr)
17505 {
17506 /*
	 * Used for the MULTI_BCAST case and other cases where we want to
	 * use the link-layer broadcast address for multicast.
17509 */
17510 uint8_t *bphys_addr;
17511 dl_unitdata_req_t *dlur;
17512
17513 dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr;
17514 if (ill->ill_sap_length < 0) {
17515 bphys_addr = (uchar_t *)dlur +
17516 dlur->dl_dest_addr_offset;
17517 } else {
17518 bphys_addr = (uchar_t *)dlur +
17519 dlur->dl_dest_addr_offset + ill->ill_sap_length;
17520 }
17521
17522 bcopy(bphys_addr, m_physaddr, ill->ill_phys_addr_length);
17523 }
17524
17525 /*
17526 * Derive IPoIB interface id from the link layer address.
17527 */
17528 static void
17529 ip_ib_v6intfid(ill_t *ill, in6_addr_t *v6addr)
17530 {
17531 char *addr;
17532
17533 ASSERT(ill->ill_phys_addr_length == 20);
17534 addr = (char *)&v6addr->s6_addr32[2];
17535 bcopy(ill->ill_phys_addr + 12, addr, 8);
17536 /*
17537 * In IBA 1.1 timeframe, some vendors erroneously set the u/l bit
17538 * in the globally assigned EUI-64 GUID to 1, in violation of IEEE
17539 * rules. In these cases, the IBA considers these GUIDs to be in
17540 * "Modified EUI-64" format, and thus toggling the u/l bit is not
17541 * required; vendors are required not to assign global EUI-64's
17542 * that differ only in u/l bit values, thus guaranteeing uniqueness
17543 * of the interface identifier. Whether the GUID is in modified
17544 * or proper EUI-64 format, the ipv6 identifier must have the u/l
17545 * bit set to 1.
17546 */
17547 addr[0] |= 2; /* Set Universal/Local bit to 1 */
17548 }
17549
17550 /*
17551 * Map the multicast ipaddr_t in m_ipaddr to the physaddr for InfiniBand.
17552 * Note on mapping from multicast IP addresses to IPoIB multicast link
17553 * addresses. IPoIB multicast link addresses are based on IBA link addresses.
17554 * The format of an IPoIB multicast address is:
17555 *
17556 * 4 byte QPN Scope Sign. Pkey
17557 * +--------------------------------------------+
17558 * | 00FFFFFF | FF | 1X | X01B | Pkey | GroupID |
17559 * +--------------------------------------------+
17560 *
17561 * The Scope and Pkey components are properties of the IBA port and
17562 * network interface. They can be ascertained from the broadcast address.
17563 * The Sign. part is the signature, and is 401B for IPv4 and 601B for IPv6.
17564 */
17565 static void
17566 ip_ib_v4_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr)
17567 {
17568 static uint8_t ipv4_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff,
17569 0xff, 0x10, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
17570 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
17571 uint8_t *bphys_addr;
17572 dl_unitdata_req_t *dlur;
17573
17574 bcopy(ipv4_g_phys_ibmulti_addr, m_physaddr, ill->ill_phys_addr_length);
17575
17576 /*
	 * RFC 4391: the IPv4 MGID is 28 bits long.
17578 */
17579 m_physaddr[16] = m_ipaddr[0] & 0x0f;
17580 m_physaddr[17] = m_ipaddr[1];
17581 m_physaddr[18] = m_ipaddr[2];
	m_physaddr[19] = m_ipaddr[3];

17585 dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr;
17586 if (ill->ill_sap_length < 0) {
17587 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset;
17588 } else {
17589 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset +
17590 ill->ill_sap_length;
17591 }
17592 /*
17593 * Now fill in the IBA scope/Pkey values from the broadcast address.
17594 */
17595 m_physaddr[5] = bphys_addr[5];
17596 m_physaddr[8] = bphys_addr[8];
17597 m_physaddr[9] = bphys_addr[9];
17598 }
17599
17600 static void
17601 ip_ib_v6_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr)
17602 {
	static uint8_t ipv6_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff,
17604 0xff, 0x10, 0x60, 0x1b, 0x00, 0x00, 0x00, 0x00,
17605 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
17606 uint8_t *bphys_addr;
17607 dl_unitdata_req_t *dlur;
17608
	bcopy(ipv6_g_phys_ibmulti_addr, m_physaddr, ill->ill_phys_addr_length);
17610
17611 /*
	 * RFC 4391: the IPv6 MGID is 80 bits long.
17613 */
17614 bcopy(&m_ipaddr[6], &m_physaddr[10], 10);
17615
17616 dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr;
17617 if (ill->ill_sap_length < 0) {
17618 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset;
17619 } else {
17620 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset +
17621 ill->ill_sap_length;
17622 }
17623 /*
17624 * Now fill in the IBA scope/Pkey values from the broadcast address.
17625 */
17626 m_physaddr[5] = bphys_addr[5];
17627 m_physaddr[8] = bphys_addr[8];
17628 m_physaddr[9] = bphys_addr[9];
17629 }
17630
17631 /*
17632 * Derive IPv6 interface id from an IPv4 link-layer address (e.g. from an IPv4
 * tunnel). The IPv4 address is simply placed in the lower 4 bytes of the
17634 * IPv6 interface id. This is a suggested mechanism described in section 3.7
17635 * of RFC4213.
17636 */
17637 static void
17638 ip_ipv4_genv6intfid(ill_t *ill, uint8_t *physaddr, in6_addr_t *v6addr)
17639 {
17640 ASSERT(ill->ill_phys_addr_length == sizeof (ipaddr_t));
17641 v6addr->s6_addr32[2] = 0;
17642 bcopy(physaddr, &v6addr->s6_addr32[3], sizeof (ipaddr_t));
17643 }
17644
17645 /*
17646 * Derive IPv6 interface id from an IPv6 link-layer address (e.g. from an IPv6
17647 * tunnel). The lower 8 bytes of the IPv6 address simply become the interface
17648 * id.
17649 */
17650 static void
17651 ip_ipv6_genv6intfid(ill_t *ill, uint8_t *physaddr, in6_addr_t *v6addr)
17652 {
17653 in6_addr_t *v6lladdr = (in6_addr_t *)physaddr;
17654
17655 ASSERT(ill->ill_phys_addr_length == sizeof (in6_addr_t));
17656 bcopy(&v6lladdr->s6_addr32[2], &v6addr->s6_addr32[2], 8);
17657 }
17658
17659 static void
17660 ip_ipv6_v6intfid(ill_t *ill, in6_addr_t *v6addr)
17661 {
17662 ip_ipv6_genv6intfid(ill, ill->ill_phys_addr, v6addr);
17663 }
17664
17665 static void
17666 ip_ipv6_v6destintfid(ill_t *ill, in6_addr_t *v6addr)
17667 {
17668 ip_ipv6_genv6intfid(ill, ill->ill_dest_addr, v6addr);
17669 }
17670
17671 static void
17672 ip_ipv4_v6intfid(ill_t *ill, in6_addr_t *v6addr)
17673 {
17674 ip_ipv4_genv6intfid(ill, ill->ill_phys_addr, v6addr);
17675 }
17676
17677 static void
17678 ip_ipv4_v6destintfid(ill_t *ill, in6_addr_t *v6addr)
17679 {
17680 ip_ipv4_genv6intfid(ill, ill->ill_dest_addr, v6addr);
17681 }
17682
17683 /*
 * Look up an ill and verify that the zoneid has an ipif on that ill.
 * Returns a held ill, or NULL.
17686 */
17687 ill_t *
17688 ill_lookup_on_ifindex_zoneid(uint_t index, zoneid_t zoneid, boolean_t isv6,
17689 ip_stack_t *ipst)
17690 {
17691 ill_t *ill;
17692 ipif_t *ipif;
17693
17694 ill = ill_lookup_on_ifindex(index, isv6, ipst);
17695 if (ill == NULL)
17696 return (NULL);
17697
17698 mutex_enter(&ill->ill_lock);
17699 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
17700 if (IPIF_IS_CONDEMNED(ipif))
17701 continue;
17702 if (zoneid != ALL_ZONES && ipif->ipif_zoneid != zoneid &&
17703 ipif->ipif_zoneid != ALL_ZONES)
17704 continue;
17705
17706 mutex_exit(&ill->ill_lock);
17707 return (ill);
17708 }
17709 mutex_exit(&ill->ill_lock);
17710 ill_refrele(ill);
17711 return (NULL);
17712 }
17713
17714 /*
 * Return a pointer to an ipif_t given a combination of (ill_idx, ipif_id).
 * If a pointer to an ipif_t is returned, then the caller will need to do
 * an ipif_refrele().
17718 */
17719 ipif_t *
17720 ipif_getby_indexes(uint_t ifindex, uint_t lifidx, boolean_t isv6,
17721 ip_stack_t *ipst)
17722 {
17723 ipif_t *ipif;
17724 ill_t *ill;
17725
17726 ill = ill_lookup_on_ifindex(ifindex, isv6, ipst);
17727 if (ill == NULL)
17728 return (NULL);
17729
17730 mutex_enter(&ill->ill_lock);
17731 if (ill->ill_state_flags & ILL_CONDEMNED) {
17732 mutex_exit(&ill->ill_lock);
17733 ill_refrele(ill);
17734 return (NULL);
17735 }
17736
17737 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
17738 if (!IPIF_CAN_LOOKUP(ipif))
17739 continue;
17740 if (lifidx == ipif->ipif_id) {
17741 ipif_refhold_locked(ipif);
17742 break;
17743 }
17744 }
17745
17746 mutex_exit(&ill->ill_lock);
17747 ill_refrele(ill);
17748 return (ipif);
17749 }
17750
17751 /*
 * Set ill_inputfn based on the current known state.
17753 * This needs to be called when any of the factors taken into
17754 * account changes.
17755 */
17756 void
17757 ill_set_inputfn(ill_t *ill)
17758 {
17759 ip_stack_t *ipst = ill->ill_ipst;
17760
17761 if (ill->ill_isv6) {
17762 if (is_system_labeled())
17763 ill->ill_inputfn = ill_input_full_v6;
17764 else
17765 ill->ill_inputfn = ill_input_short_v6;
17766 } else {
17767 if (is_system_labeled())
17768 ill->ill_inputfn = ill_input_full_v4;
17769 else if (ill->ill_dhcpinit != 0)
17770 ill->ill_inputfn = ill_input_full_v4;
17771 else if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_RSVP].connf_head
17772 != NULL)
17773 ill->ill_inputfn = ill_input_full_v4;
17774 else if (ipst->ips_ip_cgtp_filter &&
17775 ipst->ips_ip_cgtp_filter_ops != NULL)
17776 ill->ill_inputfn = ill_input_full_v4;
17777 else
17778 ill->ill_inputfn = ill_input_short_v4;
17779 }
17780 }
17781
17782 /*
17783 * Re-evaluate ill_inputfn for all the IPv4 ills.
 * Used when RSVP or CGTP comes or goes.
17785 */
17786 void
17787 ill_set_inputfn_all(ip_stack_t *ipst)
17788 {
17789 ill_walk_context_t ctx;
17790 ill_t *ill;
17791
17792 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
17793 ill = ILL_START_WALK_V4(&ctx, ipst);
17794 for (; ill != NULL; ill = ill_next(&ctx, ill))
17795 ill_set_inputfn(ill);
17796
17797 rw_exit(&ipst->ips_ill_g_lock);
17798 }
17799
17800 /*
17801 * Set the physical address information for `ill' to the contents of the
17802 * dl_notify_ind_t pointed to by `mp'. Must be called as writer, and will be
17803 * asynchronous if `ill' cannot immediately be quiesced -- in which case
17804 * EINPROGRESS will be returned.
17805 */
17806 int
17807 ill_set_phys_addr(ill_t *ill, mblk_t *mp)
17808 {
17809 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;
17810 dl_notify_ind_t *dlindp = (dl_notify_ind_t *)mp->b_rptr;
17811
17812 ASSERT(IAM_WRITER_IPSQ(ipsq));
17813
17814 if (dlindp->dl_data != DL_IPV6_LINK_LAYER_ADDR &&
17815 dlindp->dl_data != DL_CURR_DEST_ADDR &&
17816 dlindp->dl_data != DL_CURR_PHYS_ADDR) {
17817 /* Changing DL_IPV6_TOKEN is not yet supported */
17818 return (0);
17819 }
17820
17821 /*
17822 * We need to store up to two copies of `mp' in `ill'. Due to the
17823 * design of ipsq_pending_mp_add(), we can't pass them as separate
17824 * arguments to ill_set_phys_addr_tail(). Instead, chain them
17825 * together here, then pull 'em apart in ill_set_phys_addr_tail().
17826 */
17827 if ((mp = copyb(mp)) == NULL || (mp->b_cont = copyb(mp)) == NULL) {
17828 freemsg(mp);
17829 return (ENOMEM);
17830 }
17831
17832 ipsq_current_start(ipsq, ill->ill_ipif, 0);
17833
17834 /*
17835 * Since we'll only do a logical down, we can't rely on ipif_down
17836 * to turn on ILL_DOWN_IN_PROGRESS, or for the DL_BIND_ACK to reset
17837 * ILL_DOWN_IN_PROGRESS. We instead manage this separately for this
17838 * case, to quiesce ire's and nce's for ill_is_quiescent.
17839 */
17840 mutex_enter(&ill->ill_lock);
17841 ill->ill_state_flags |= ILL_DOWN_IN_PROGRESS;
17842 /* no more ire/nce addition allowed */
17843 mutex_exit(&ill->ill_lock);
17844
17845 /*
17846 * If we can quiesce the ill, then set the address. If not, then
17847 * ill_set_phys_addr_tail() will be called from ipif_ill_refrele_tail().
17848 */
17849 ill_down_ipifs(ill, B_TRUE);
17850 mutex_enter(&ill->ill_lock);
17851 if (!ill_is_quiescent(ill)) {
17852 /* call cannot fail since `conn_t *' argument is NULL */
17853 (void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
17854 mp, ILL_DOWN);
17855 mutex_exit(&ill->ill_lock);
17856 return (EINPROGRESS);
17857 }
17858 mutex_exit(&ill->ill_lock);
17859
17860 ill_set_phys_addr_tail(ipsq, ill->ill_rq, mp, NULL);
17861 return (0);
17862 }
17863
17864 /*
17865 * When the allowed-ips link property is set on the datalink, IP receives a
17866 * DL_NOTE_ALLOWED_IPS notification that is processed in ill_set_allowed_ips()
17867 * to initialize the ill_allowed_ips[] array in the ill_t. This array is then
17868 * used to vet addresses passed to ip_sioctl_addr() and to ensure that the
17869 * only IP addresses configured on the ill_t are those in the ill_allowed_ips[]
17870 * array.
17871 */
17872 void
17873 ill_set_allowed_ips(ill_t *ill, mblk_t *mp)
17874 {
17875 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;
17876 dl_notify_ind_t *dlip = (dl_notify_ind_t *)mp->b_rptr;
17877 mac_protect_t *mrp;
17878 int i;
17879
17880 ASSERT(IAM_WRITER_IPSQ(ipsq));
17881 mrp = (mac_protect_t *)&dlip[1];
17882
17883 if (mrp->mp_ipaddrcnt == 0) { /* reset allowed-ips */
17884 kmem_free(ill->ill_allowed_ips,
17885 ill->ill_allowed_ips_cnt * sizeof (in6_addr_t));
17886 ill->ill_allowed_ips_cnt = 0;
17887 ill->ill_allowed_ips = NULL;
17888 mutex_enter(&ill->ill_phyint->phyint_lock);
17889 ill->ill_phyint->phyint_flags &= ~PHYI_L3PROTECT;
17890 mutex_exit(&ill->ill_phyint->phyint_lock);
17891 return;
17892 }
17893
17894 if (ill->ill_allowed_ips != NULL) {
17895 kmem_free(ill->ill_allowed_ips,
17896 ill->ill_allowed_ips_cnt * sizeof (in6_addr_t));
17897 }
17898 ill->ill_allowed_ips_cnt = mrp->mp_ipaddrcnt;
17899 ill->ill_allowed_ips = kmem_alloc(
17900 ill->ill_allowed_ips_cnt * sizeof (in6_addr_t), KM_SLEEP);
17901 for (i = 0; i < mrp->mp_ipaddrcnt; i++)
17902 ill->ill_allowed_ips[i] = mrp->mp_ipaddrs[i].ip_addr;
17903
17904 mutex_enter(&ill->ill_phyint->phyint_lock);
17905 ill->ill_phyint->phyint_flags |= PHYI_L3PROTECT;
17906 mutex_exit(&ill->ill_phyint->phyint_lock);
17907 }
17908
17909 /*
17910 * Once the ill associated with `q' has quiesced, set its physical address
17911 * information to the values in `addrmp'. Note that two copies of `addrmp'
17912 * are passed (linked by b_cont), since we sometimes need to save two distinct
17913 * copies in the ill_t, and our context doesn't permit sleeping or allocation
17914 * failure (we'll free the other copy if it's not needed). Since the ill_t
17915 * is quiesced, we know any stale nce's with the old address information have
17916 * already been removed, so we don't need to call nce_flush().
17917 */
17918 /* ARGSUSED */
17919 static void
17920 ill_set_phys_addr_tail(ipsq_t *ipsq, queue_t *q, mblk_t *addrmp, void *dummy)
17921 {
17922 ill_t *ill = q->q_ptr;
17923 mblk_t *addrmp2 = unlinkb(addrmp);
17924 dl_notify_ind_t *dlindp = (dl_notify_ind_t *)addrmp->b_rptr;
17925 uint_t addrlen, addroff;
17926 int status;
17927
17928 ASSERT(IAM_WRITER_IPSQ(ipsq));
17929
17930 addroff = dlindp->dl_addr_offset;
17931 addrlen = dlindp->dl_addr_length - ABS(ill->ill_sap_length);
17932
17933 switch (dlindp->dl_data) {
17934 case DL_IPV6_LINK_LAYER_ADDR:
17935 ill_set_ndmp(ill, addrmp, addroff, addrlen);
17936 freemsg(addrmp2);
17937 break;
17938
17939 case DL_CURR_DEST_ADDR:
17940 freemsg(ill->ill_dest_addr_mp);
17941 ill->ill_dest_addr = addrmp->b_rptr + addroff;
17942 ill->ill_dest_addr_mp = addrmp;
17943 if (ill->ill_isv6) {
17944 ill_setdesttoken(ill);
17945 ipif_setdestlinklocal(ill->ill_ipif);
17946 }
17947 freemsg(addrmp2);
17948 break;
17949
17950 case DL_CURR_PHYS_ADDR:
17951 freemsg(ill->ill_phys_addr_mp);
17952 ill->ill_phys_addr = addrmp->b_rptr + addroff;
17953 ill->ill_phys_addr_mp = addrmp;
17954 ill->ill_phys_addr_length = addrlen;
17955 if (ill->ill_isv6)
17956 ill_set_ndmp(ill, addrmp2, addroff, addrlen);
17957 else
17958 freemsg(addrmp2);
17959 if (ill->ill_isv6) {
17960 ill_setdefaulttoken(ill);
17961 ipif_setlinklocal(ill->ill_ipif);
17962 }
17963 break;
17964 default:
17965 ASSERT(0);
17966 }
17967
17968 /*
17969 * reset ILL_DOWN_IN_PROGRESS so that we can successfully add ires
17970 * as we bring the ipifs up again.
17971 */
17972 mutex_enter(&ill->ill_lock);
17973 ill->ill_state_flags &= ~ILL_DOWN_IN_PROGRESS;
17974 mutex_exit(&ill->ill_lock);
17975 /*
17976 * If there are ipifs to bring up, ill_up_ipifs() will return
17977 * EINPROGRESS, and ipsq_current_finish() will be called by
17978 * ip_rput_dlpi_writer() or arp_bringup_done() when the last ipif is
17979 * brought up.
17980 */
17981 status = ill_up_ipifs(ill, q, addrmp);
17982 if (status != EINPROGRESS)
17983 ipsq_current_finish(ipsq);
17984 }
17985
17986 /*
17987 * Helper routine for setting the ill_nd_lla fields.
17988 */
17989 void
17990 ill_set_ndmp(ill_t *ill, mblk_t *ndmp, uint_t addroff, uint_t addrlen)
17991 {
17992 freemsg(ill->ill_nd_lla_mp);
17993 ill->ill_nd_lla = ndmp->b_rptr + addroff;
17994 ill->ill_nd_lla_mp = ndmp;
17995 ill->ill_nd_lla_len = addrlen;
17996 }
17997
17998 /*
17999 * Replumb the ill.
18000 */
18001 int
18002 ill_replumb(ill_t *ill, mblk_t *mp)
18003 {
18004 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;
18005
18006 ASSERT(IAM_WRITER_IPSQ(ipsq));
18007
18008 ipsq_current_start(ipsq, ill->ill_ipif, 0);
18009
18010 /*
18011 * If we can quiesce the ill, then continue. If not, then
18012 * ill_replumb_tail() will be called from ipif_ill_refrele_tail().
18013 */
18014 ill_down_ipifs(ill, B_FALSE);
18015
18016 mutex_enter(&ill->ill_lock);
18017 if (!ill_is_quiescent(ill)) {
18018 /* call cannot fail since `conn_t *' argument is NULL */
18019 (void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
18020 mp, ILL_DOWN);
18021 mutex_exit(&ill->ill_lock);
18022 return (EINPROGRESS);
18023 }
18024 mutex_exit(&ill->ill_lock);
18025
18026 ill_replumb_tail(ipsq, ill->ill_rq, mp, NULL);
18027 return (0);
18028 }
18029
18030 /* ARGSUSED */
18031 static void
18032 ill_replumb_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy)
18033 {
18034 ill_t *ill = q->q_ptr;
18035 int err;
18036 conn_t *connp = NULL;
18037
18038 ASSERT(IAM_WRITER_IPSQ(ipsq));
18039 freemsg(ill->ill_replumb_mp);
18040 ill->ill_replumb_mp = copyb(mp);
18041
18042 if (ill->ill_replumb_mp == NULL) {
18043 /* out of memory */
18044 ipsq_current_finish(ipsq);
18045 return;
18046 }
18047
18048 mutex_enter(&ill->ill_lock);
18049 ill->ill_up_ipifs = ipsq_pending_mp_add(NULL, ill->ill_ipif,
18050 ill->ill_rq, ill->ill_replumb_mp, 0);
18051 mutex_exit(&ill->ill_lock);
18052
18053 if (!ill->ill_up_ipifs) {
18054 /* already closing */
18055 ipsq_current_finish(ipsq);
18056 return;
18057 }
18058 ill->ill_replumbing = 1;
18059 err = ill_down_ipifs_tail(ill);
18060
18061 /*
	 * We successfully quiesced and brought down the interface; now send
	 * the DL_NOTE_REPLUMB_DONE message down to the driver, reusing the
	 * DL_NOTE_REPLUMB message.
18065 */
18066 mp = mexchange(NULL, mp, sizeof (dl_notify_conf_t), M_PROTO,
18067 DL_NOTIFY_CONF);
18068 ASSERT(mp != NULL);
18069 ((dl_notify_conf_t *)mp->b_rptr)->dl_notification =
18070 DL_NOTE_REPLUMB_DONE;
18071 ill_dlpi_send(ill, mp);
18072
18073 /*
18074 * For IPv4, we would usually get EINPROGRESS because the ETHERTYPE_ARP
18075 * streams have to be unbound. When all the DLPI exchanges are done,
18076 * ipsq_current_finish() will be called by arp_bringup_done(). The
18077 * remainder of ipif bringup via ill_up_ipifs() will also be done in
18078 * arp_bringup_done().
18079 */
18080 ASSERT(ill->ill_replumb_mp != NULL);
18081 if (err == EINPROGRESS)
18082 return;
18083 else
18084 ill->ill_replumb_mp = ipsq_pending_mp_get(ipsq, &connp);
18085 ASSERT(connp == NULL);
18086 if (err == 0 && ill->ill_replumb_mp != NULL &&
18087 ill_up_ipifs(ill, q, ill->ill_replumb_mp) == EINPROGRESS) {
18088 return;
18089 }
18090 ipsq_current_finish(ipsq);
18091 }
18092
18093 /*
18094 * Issue ioctl `cmd' on `lh'; caller provides the initial payload in `buf'
18095 * which is `bufsize' bytes. On success, zero is returned and `buf' updated
18096 * as per the ioctl. On failure, an errno is returned.
18097 */
18098 static int
18099 ip_ioctl(ldi_handle_t lh, int cmd, void *buf, uint_t bufsize, cred_t *cr)
18100 {
18101 int rval;
18102 struct strioctl iocb;
18103
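	/*
	 * Build an I_STR ioctl; FKIOCTL tells the stream head that `buf'
	 * is a kernel buffer, so no copyin/copyout is performed.
	 */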
18104 iocb.ic_cmd = cmd;
18105 iocb.ic_timout = 15;
18106 iocb.ic_len = bufsize;
18107 iocb.ic_dp = buf;
18108
18109 return (ldi_ioctl(lh, I_STR, (intptr_t)&iocb, FKIOCTL, cr, &rval));
18110 }
18111
/*
 * Issue an SIOCGLIFCONF for address family `af' and store the result in
 * `lifcp', whose dynamically-allocated lifc_buf will be `bufsizep' bytes on
 * success. The caller is responsible for freeing lifc_buf with kmem_free()
 * using that size.
 */
18116 static int
18117 ip_lifconf_ioctl(ldi_handle_t lh, int af, struct lifconf *lifcp,
18118 uint_t *bufsizep, cred_t *cr)
18119 {
18120 int err;
18121 struct lifnum lifn;
18122
18123 bzero(&lifn, sizeof (lifn));
18124 lifn.lifn_family = af;
18125 lifn.lifn_flags = LIFC_UNDER_IPMP;
18126
18127 if ((err = ip_ioctl(lh, SIOCGLIFNUM, &lifn, sizeof (lifn), cr)) != 0)
18128 return (err);
18129
18130 /*
18131 * Pad the interface count to account for additional interfaces that
18132 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF.
18133 */
18134 lifn.lifn_count += 4;
18135 bzero(lifcp, sizeof (*lifcp));
18136 lifcp->lifc_flags = LIFC_UNDER_IPMP;
18137 lifcp->lifc_family = af;
18138 lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq);
18139 lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP);
18140
18141 err = ip_ioctl(lh, SIOCGLIFCONF, lifcp, sizeof (*lifcp), cr);
18142 if (err != 0) {
18143 kmem_free(lifcp->lifc_buf, *bufsizep);
18144 return (err);
18145 }
18146
18147 return (0);
18148 }
18149
18150 /*
18151 * Helper for ip_interface_cleanup() that removes the loopback interface.
18152 */
18153 static void
18154 ip_loopback_removeif(ldi_handle_t lh, boolean_t isv6, cred_t *cr)
18155 {
18156 int err;
18157 struct lifreq lifr;
18158
18159 bzero(&lifr, sizeof (lifr));
18160 (void) strcpy(lifr.lifr_name, ipif_loopback_name);
18161
18162 /*
18163 * Attempt to remove the interface. It may legitimately not exist
18164 * (e.g. the zone administrator unplumbed it), so ignore ENXIO.
18165 */
18166 err = ip_ioctl(lh, SIOCLIFREMOVEIF, &lifr, sizeof (lifr), cr);
18167 if (err != 0 && err != ENXIO) {
18168 ip0dbg(("ip_loopback_removeif: IP%s SIOCLIFREMOVEIF failed: "
18169 "error %d\n", isv6 ? "v6" : "v4", err));
18170 }
18171 }
18172
18173 /*
18174 * Helper for ip_interface_cleanup() that ensures no IP interfaces are in IPMP
18175 * groups and that IPMP data addresses are down. These conditions must be met
18176 * so that IPMP interfaces can be I_PUNLINK'd, as per ip_sioctl_plink_ipmp().
18177 */
18178 static void
18179 ip_ipmp_cleanup(ldi_handle_t lh, boolean_t isv6, cred_t *cr)
18180 {
18181 int af = isv6 ? AF_INET6 : AF_INET;
18182 int i, nifs;
18183 int err;
18184 uint_t bufsize;
18185 uint_t lifrsize = sizeof (struct lifreq);
18186 struct lifconf lifc;
18187 struct lifreq *lifrp;
18188
18189 if ((err = ip_lifconf_ioctl(lh, af, &lifc, &bufsize, cr)) != 0) {
		cmn_err(CE_WARN, "ip_ipmp_cleanup: cannot get interface list "
		    "(error %d); any IPMP interfaces cannot be shut down", err);
18192 return;
18193 }
18194
18195 nifs = lifc.lifc_len / lifrsize;
18196 for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) {
18197 err = ip_ioctl(lh, SIOCGLIFFLAGS, lifrp, lifrsize, cr);
18198 if (err != 0) {
18199 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot get "
18200 "flags: error %d", lifrp->lifr_name, err);
18201 continue;
18202 }
18203
18204 if (lifrp->lifr_flags & IFF_IPMP) {
18205 if ((lifrp->lifr_flags & (IFF_UP|IFF_DUPLICATE)) == 0)
18206 continue;
18207
18208 lifrp->lifr_flags &= ~IFF_UP;
18209 err = ip_ioctl(lh, SIOCSLIFFLAGS, lifrp, lifrsize, cr);
			if (err != 0) {
				cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot "
				    "bring down (error %d); IPMP interface may "
				    "not be shut down", lifrp->lifr_name, err);
			}
18215
18216 /*
18217 * Check if IFF_DUPLICATE is still set -- and if so,
18218 * reset the address to clear it.
18219 */
18220 err = ip_ioctl(lh, SIOCGLIFFLAGS, lifrp, lifrsize, cr);
18221 if (err != 0 || !(lifrp->lifr_flags & IFF_DUPLICATE))
18222 continue;
18223
			err = ip_ioctl(lh, SIOCGLIFADDR, lifrp, lifrsize, cr);
			if (err != 0 || (err = ip_ioctl(lh, SIOCSLIFADDR,
			    lifrp, lifrsize, cr)) != 0) {
				cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot "
				    "reset DAD (error %d); IPMP interface may "
				    "not be shut down", lifrp->lifr_name, err);
			}
18231 continue;
18232 }
18233
		if (strchr(lifrp->lifr_name, IPIF_SEPARATOR_CHAR) == NULL) {
18235 lifrp->lifr_groupname[0] = '\0';
18236 if ((err = ip_ioctl(lh, SIOCSLIFGROUPNAME, lifrp,
18237 lifrsize, cr)) != 0) {
18238 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot "
18239 "leave IPMP group (error %d); associated "
18240 "IPMP interface may not be shutdown",
18241 lifrp->lifr_name, err);
18242 continue;
18243 }
18244 }
18245 }
18246
18247 kmem_free(lifc.lifc_buf, bufsize);
18248 }
18249
18250 #define UDPDEV "/devices/pseudo/udp@0:udp"
18251 #define UDP6DEV "/devices/pseudo/udp6@0:udp6"
18252
18253 /*
18254 * Remove the loopback interfaces and prep the IPMP interfaces to be torn down.
18255 * Non-loopback interfaces are either I_LINK'd or I_PLINK'd; the former go away
18256 * when the user-level processes in the zone are killed and the latter are
18257 * cleaned up by str_stack_shutdown().
18258 */
18259 void
18260 ip_interface_cleanup(ip_stack_t *ipst)
18261 {
18262 ldi_handle_t lh;
18263 ldi_ident_t li;
18264 cred_t *cr;
18265 int err;
18266 int i;
18267 char *devs[] = { UDP6DEV, UDPDEV };
18268 netstackid_t stackid = ipst->ips_netstack->netstack_stackid;
18269
18270 if ((err = ldi_ident_from_major(ddi_name_to_major("ip"), &li)) != 0) {
18271 cmn_err(CE_WARN, "ip_interface_cleanup: cannot get ldi ident:"
18272 " error %d", err);
18273 return;
18274 }
18275
18276 cr = zone_get_kcred(netstackid_to_zoneid(stackid));
18277 ASSERT(cr != NULL);
18278
18279 /*
18280 * NOTE: loop executes exactly twice and is hardcoded to know that the
18281 * first iteration is IPv6. (Unrolling yields repetitious code, hence
18282 * the loop.)
18283 */
18284 for (i = 0; i < 2; i++) {
18285 err = ldi_open_by_name(devs[i], FREAD|FWRITE, cr, &lh, li);
18286 if (err != 0) {
18287 cmn_err(CE_WARN, "ip_interface_cleanup: cannot open %s:"
18288 " error %d", devs[i], err);
18289 continue;
18290 }
18291
18292 ip_loopback_removeif(lh, i == 0, cr);
18293 ip_ipmp_cleanup(lh, i == 0, cr);
18294
18295 (void) ldi_close(lh, FREAD|FWRITE, cr);
18296 }
18297
18298 ldi_ident_release(li);
18299 crfree(cr);
18300 }
18301
/*
 * This needs to be kept in sync with the nic_event_t definition.
 */
18305 static const char *
18306 ill_hook_event2str(nic_event_t event)
18307 {
18308 switch (event) {
18309 case NE_PLUMB:
18310 return ("PLUMB");
18311 case NE_UNPLUMB:
18312 return ("UNPLUMB");
18313 case NE_UP:
18314 return ("UP");
18315 case NE_DOWN:
18316 return ("DOWN");
18317 case NE_ADDRESS_CHANGE:
18318 return ("ADDRESS_CHANGE");
18319 case NE_LIF_UP:
18320 return ("LIF_UP");
18321 case NE_LIF_DOWN:
18322 return ("LIF_DOWN");
18323 case NE_IFINDEX_CHANGE:
18324 return ("IFINDEX_CHANGE");
18325 default:
18326 return ("UNKNOWN");
18327 }
18328 }
18329
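/*
 * Dispatch a nic event (with optional payload `data') for `ill' to the hook
 * framework via the nic event taskq. Allocations use KM_NOSLEEP, so under
 * memory pressure the event is dropped with a debug message.
 */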
18330 void
18331 ill_nic_event_dispatch(ill_t *ill, lif_if_t lif, nic_event_t event,
18332 nic_event_data_t data, size_t datalen)
18333 {
18334 ip_stack_t *ipst = ill->ill_ipst;
18335 hook_nic_event_int_t *info;
18336 const char *str = NULL;
18337
18338 /* create a new nic event info */
18339 if ((info = kmem_alloc(sizeof (*info), KM_NOSLEEP)) == NULL)
18340 goto fail;
18341
18342 info->hnei_event.hne_nic = ill->ill_phyint->phyint_ifindex;
18343 info->hnei_event.hne_lif = lif;
18344 info->hnei_event.hne_event = event;
18345 info->hnei_event.hne_protocol = ill->ill_isv6 ?
18346 ipst->ips_ipv6_net_data : ipst->ips_ipv4_net_data;
18347 info->hnei_event.hne_data = NULL;
18348 info->hnei_event.hne_datalen = 0;
18349 info->hnei_stackid = ipst->ips_netstack->netstack_stackid;
18350
18351 if (data != NULL && datalen != 0) {
18352 info->hnei_event.hne_data = kmem_alloc(datalen, KM_NOSLEEP);
18353 if (info->hnei_event.hne_data == NULL)
18354 goto fail;
18355 bcopy(data, info->hnei_event.hne_data, datalen);
18356 info->hnei_event.hne_datalen = datalen;
18357 }
18358
18359 if (ddi_taskq_dispatch(eventq_queue_nic, ip_ne_queue_func, info,
18360 DDI_NOSLEEP) == DDI_SUCCESS)
18361 return;
18362
18363 fail:
18364 if (info != NULL) {
18365 if (info->hnei_event.hne_data != NULL) {
18366 kmem_free(info->hnei_event.hne_data,
18367 info->hnei_event.hne_datalen);
18368 }
		kmem_free(info, sizeof (*info));
18370 }
18371 str = ill_hook_event2str(event);
18372 ip2dbg(("ill_nic_event_dispatch: could not dispatch %s nic event "
18373 "information for %s (ENOMEM)\n", str, ill->ill_name));
18374 }
18375
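/*
 * Tail of IPv4 address bring-up: publish a unicast nce for the ipif's local
 * address, first binding the ipif to an ill in the IPMP case. On an initial
 * bring-up the nce undergoes DAD; on an IPMP rebind we instead send
 * unsolicited ARP announcements (NCE_F_UNSOL_ADV).
 */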
18376 static int
18377 ipif_arp_up_done_tail(ipif_t *ipif, enum ip_resolver_action res_act)
18378 {
18379 int err = 0;
18380 const in_addr_t *addr = NULL;
18381 nce_t *nce = NULL;
18382 ill_t *ill = ipif->ipif_ill;
18383 ill_t *bound_ill;
18384 boolean_t added_ipif = B_FALSE;
18385 uint16_t state;
18386 uint16_t flags;
18387
18388 DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_up_done_tail",
18389 ill_t *, ill, ipif_t *, ipif);
18390 if (ipif->ipif_lcl_addr != INADDR_ANY) {
18391 addr = &ipif->ipif_lcl_addr;
18392 }
18393
18394 if ((ipif->ipif_flags & IPIF_UNNUMBERED) || addr == NULL) {
18395 if (res_act != Res_act_initial)
18396 return (EINVAL);
18397 }
18398
18399 if (addr != NULL) {
18400 ipmp_illgrp_t *illg = ill->ill_grp;
18401
18402 /* add unicast nce for the local addr */
18403
18404 if (IS_IPMP(ill)) {
18405 /*
18406 * If we're here via ipif_up(), then the ipif
18407 * won't be bound yet -- add it to the group,
18408 * which will bind it if possible. (We would
18409 * add it in ipif_up(), but deleting on failure
18410 * there is gruesome.) If we're here via
18411 * ipmp_ill_bind_ipif(), then the ipif has
18412 * already been added to the group and we
18413 * just need to use the binding.
18414 */
18415 if ((bound_ill = ipmp_ipif_bound_ill(ipif)) == NULL) {
18416 bound_ill = ipmp_illgrp_add_ipif(illg, ipif);
18417 if (bound_ill == NULL) {
18418 /*
18419 * We couldn't bind the ipif to an ill
18420 * yet, so we have nothing to publish.
18421 * Mark the address as ready and return.
18422 */
18423 ipif->ipif_addr_ready = 1;
18424 return (0);
18425 }
18426 added_ipif = B_TRUE;
18427 }
18428 } else {
18429 bound_ill = ill;
18430 }
18431
18432 flags = (NCE_F_MYADDR | NCE_F_PUBLISH | NCE_F_AUTHORITY |
18433 NCE_F_NONUD);
18434 /*
18435 * If this is an initial bring-up (or the ipif was never
18436 * completely brought up), do DAD. Otherwise, we're here
18437 * because IPMP has rebound an address to this ill: send
18438 * unsolicited advertisements (ARP announcements) to
18439 * inform others.
18440 */
18441 if (res_act == Res_act_initial || !ipif->ipif_addr_ready) {
18442 state = ND_UNCHANGED; /* compute in nce_add_common() */
18443 } else {
18444 state = ND_REACHABLE;
18445 flags |= NCE_F_UNSOL_ADV;
18446 }
18447
18448 retry:
18449 err = nce_lookup_then_add_v4(ill,
18450 bound_ill->ill_phys_addr, bound_ill->ill_phys_addr_length,
18451 addr, flags, state, &nce);
18452
18453 /*
18454 * note that we may encounter EEXIST if we are moving
18455 * the nce as a result of a rebind operation.
18456 */
18457 switch (err) {
18458 case 0:
18459 ipif->ipif_added_nce = 1;
18460 nce->nce_ipif_cnt++;
18461 break;
18462 case EEXIST:
			ip1dbg(("ipif_arp_up_done_tail: NCE already exists "
			    "for %s\n", ill->ill_name));
18465 if (!NCE_MYADDR(nce->nce_common)) {
18466 /*
18467 * A leftover nce from before this address
18468 * existed
18469 */
18470 ncec_delete(nce->nce_common);
18471 nce_refrele(nce);
18472 nce = NULL;
18473 goto retry;
18474 }
18475 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
18476 nce_refrele(nce);
18477 nce = NULL;
				ip1dbg(("ipif_arp_up_done_tail: NCE already "
				    "exists for %s:%u\n", ill->ill_name,
				    ipif->ipif_id));
18481 goto arp_up_done;
18482 }
18483 /*
18484 * Duplicate local addresses are permissible for
18485 * IPIF_POINTOPOINT interfaces which will get marked
18486 * IPIF_UNNUMBERED later in
18487 * ip_addr_availability_check().
18488 *
18489 * The nce_ipif_cnt field tracks the number of
18490 * ipifs that have nce_addr as their local address.
18491 */
18492 ipif->ipif_addr_ready = 1;
18493 ipif->ipif_added_nce = 1;
18494 nce->nce_ipif_cnt++;
18495 err = 0;
18496 break;
18497 default:
18498 ASSERT(nce == NULL);
18499 goto arp_up_done;
18500 }
18501 if (arp_no_defense) {
18502 if ((ipif->ipif_flags & IPIF_UP) &&
18503 !ipif->ipif_addr_ready)
18504 ipif_up_notify(ipif);
18505 ipif->ipif_addr_ready = 1;
18506 }
18507 } else {
18508 /* zero address. nothing to publish */
18509 ipif->ipif_addr_ready = 1;
18510 }
18511 if (nce != NULL)
18512 nce_refrele(nce);
18513 arp_up_done:
18514 if (added_ipif && err != 0)
18515 ipmp_illgrp_del_ipif(ill->ill_grp, ipif);
18516 return (err);
18517 }
18518
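/*
 * Start ARP bring-up for `ipif'. If this is the first interface coming up
 * on the ill, kick off the DL_ATTACH/DL_BIND exchange via arp_ll_up(); if
 * that returns EINPROGRESS, the bring-up completes later in
 * arp_bringup_done(). Otherwise, finish synchronously via
 * ipif_arp_up_done_tail().
 */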
18519 int
18520 ipif_arp_up(ipif_t *ipif, enum ip_resolver_action res_act, boolean_t was_dup)
18521 {
18522 int err = 0;
18523 ill_t *ill = ipif->ipif_ill;
18524 boolean_t first_interface, wait_for_dlpi = B_FALSE;
18525
18526 DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_up",
18527 ill_t *, ill, ipif_t *, ipif);
18528
	/*
	 * We need to bring up ARP or set up mcast mapping only
	 * when the first interface is coming UP.
	 */
18533 first_interface = (ill->ill_ipif_up_count == 0 &&
18534 ill->ill_ipif_dup_count == 0 && !was_dup);
18535
18536 if (res_act == Res_act_initial && first_interface) {
18537 /*
18538 * Send ATTACH + BIND
18539 */
18540 err = arp_ll_up(ill);
18541 if (err != EINPROGRESS && err != 0)
18542 return (err);
18543
		/*
		 * Add NCE for local address. Start DAD.
		 * We'll wait to hear that DAD has finished
		 * before using the interface.
		 */
18549 if (err == EINPROGRESS)
18550 wait_for_dlpi = B_TRUE;
18551 }
18552
18553 if (!wait_for_dlpi)
18554 (void) ipif_arp_up_done_tail(ipif, res_act);
18555
18556 return (!wait_for_dlpi ? 0 : EINPROGRESS);
18557 }
18558
18559 /*
18560 * Finish processing of "arp_up" after all the DLPI message
18561 * exchanges have completed between arp and the driver.
18562 */
18563 void
18564 arp_bringup_done(ill_t *ill, int err)
18565 {
18566 mblk_t *mp1;
18567 ipif_t *ipif;
18568 conn_t *connp = NULL;
18569 ipsq_t *ipsq;
18570 queue_t *q;
18571
18572 ip1dbg(("arp_bringup_done(%s)\n", ill->ill_name));
18573
18574 ASSERT(IAM_WRITER_ILL(ill));
18575
18576 ipsq = ill->ill_phyint->phyint_ipsq;
18577 ipif = ipsq->ipsq_xop->ipx_pending_ipif;
18578 mp1 = ipsq_pending_mp_get(ipsq, &connp);
18579 ASSERT(!((mp1 != NULL) ^ (ipif != NULL)));
18580 if (mp1 == NULL) /* bringup was aborted by the user */
18581 return;
18582
18583 /*
18584 * If an IOCTL is waiting on this (ipsq_current_ioctl != 0), then we
18585 * must have an associated conn_t. Otherwise, we're bringing this
18586 * interface back up as part of handling an asynchronous event (e.g.,
18587 * physical address change).
18588 */
18589 if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
18590 ASSERT(connp != NULL);
18591 q = CONNP_TO_WQ(connp);
18592 } else {
18593 ASSERT(connp == NULL);
18594 q = ill->ill_rq;
18595 }
18596 if (err == 0) {
18597 if (ipif->ipif_isv6) {
18598 if ((err = ipif_up_done_v6(ipif)) != 0)
18599 ip0dbg(("arp_bringup_done: init failed\n"));
18600 } else {
18601 err = ipif_arp_up_done_tail(ipif, Res_act_initial);
18602 if (err != 0 ||
18603 (err = ipif_up_done(ipif)) != 0) {
18604 ip0dbg(("arp_bringup_done: "
18605 "init failed err %x\n", err));
18606 (void) ipif_arp_down(ipif);
			}
		}
18610 } else {
18611 ip0dbg(("arp_bringup_done: DL_BIND_REQ failed\n"));
18612 }
18613
18614 if ((err == 0) && (ill->ill_up_ipifs)) {
18615 err = ill_up_ipifs(ill, q, mp1);
18616 if (err == EINPROGRESS)
18617 return;
18618 }
18619
18620 /*
18621 * If we have a moved ipif to bring up, and everything has succeeded
18622 * to this point, bring it up on the IPMP ill. Otherwise, leave it
18623 * down -- the admin can try to bring it up by hand if need be.
18624 */
18625 if (ill->ill_move_ipif != NULL) {
18626 ipif = ill->ill_move_ipif;
18627 ip1dbg(("bringing up ipif %p on ill %s\n", (void *)ipif,
18628 ipif->ipif_ill->ill_name));
18629 ill->ill_move_ipif = NULL;
18630 if (err == 0) {
18631 err = ipif_up(ipif, q, mp1);
18632 if (err == EINPROGRESS)
18633 return;
18634 }
18635 }
18636
18637 /*
18638 * The operation must complete without EINPROGRESS since
18639 * ipsq_pending_mp_get() has removed the mblk from ipsq_pending_mp.
18640 * Otherwise, the operation will be stuck forever in the ipsq.
18641 */
18642 ASSERT(err != EINPROGRESS);
18643 if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
18644 DTRACE_PROBE4(ipif__ioctl, char *, "arp_bringup_done finish",
18645 int, ipsq->ipsq_xop->ipx_current_ioctl,
18646 ill_t *, ill, ipif_t *, ipif);
18647 ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq);
18648 } else {
18649 ipsq_current_finish(ipsq);
18650 }
18651 }
18652
18653 /*
18654 * Finish processing of arp replumb after all the DLPI message
18655 * exchanges have completed between arp and the driver.
18656 */
18657 void
18658 arp_replumb_done(ill_t *ill, int err)
18659 {
18660 mblk_t *mp1;
18661 ipif_t *ipif;
18662 conn_t *connp = NULL;
18663 ipsq_t *ipsq;
18664 queue_t *q;
18665
18666 ASSERT(IAM_WRITER_ILL(ill));
18667
18668 ipsq = ill->ill_phyint->phyint_ipsq;
18669 ipif = ipsq->ipsq_xop->ipx_pending_ipif;
18670 mp1 = ipsq_pending_mp_get(ipsq, &connp);
18671 ASSERT(!((mp1 != NULL) ^ (ipif != NULL)));
18672 if (mp1 == NULL) {
18673 ip0dbg(("arp_replumb_done: bringup aborted ioctl %x\n",
18674 ipsq->ipsq_xop->ipx_current_ioctl));
18675 /* bringup was aborted by the user */
18676 return;
18677 }
18678 /*
18679 * If an IOCTL is waiting on this (ipsq_current_ioctl != 0), then we
18680 * must have an associated conn_t. Otherwise, we're bringing this
18681 * interface back up as part of handling an asynchronous event (e.g.,
18682 * physical address change).
18683 */
18684 if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
18685 ASSERT(connp != NULL);
18686 q = CONNP_TO_WQ(connp);
18687 } else {
18688 ASSERT(connp == NULL);
18689 q = ill->ill_rq;
18690 }
18691 if ((err == 0) && (ill->ill_up_ipifs)) {
18692 err = ill_up_ipifs(ill, q, mp1);
18693 if (err == EINPROGRESS)
18694 return;
18695 }
18696 /*
18697 * The operation must complete without EINPROGRESS since
18698 * ipsq_pending_mp_get() has removed the mblk from ipsq_pending_mp.
18699 * Otherwise, the operation will be stuck forever in the ipsq.
18700 */
18701 ASSERT(err != EINPROGRESS);
18702 if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
18703 DTRACE_PROBE4(ipif__ioctl, char *,
18704 "arp_replumb_done finish",
18705 int, ipsq->ipsq_xop->ipx_current_ioctl,
18706 ill_t *, ill, ipif_t *, ipif);
18707 ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq);
18708 } else {
18709 ipsq_current_finish(ipsq);
18710 }
18711 }
18712
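/*
 * Notify interested parties (routing sockets, SCTP, and the hook framework)
 * that `ipif' has come up.
 */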
18713 void
18714 ipif_up_notify(ipif_t *ipif)
18715 {
18716 ip_rts_ifmsg(ipif, RTSQ_DEFAULT);
18717 ip_rts_newaddrmsg(RTM_ADD, 0, ipif, RTSQ_DEFAULT);
18718 sctp_update_ipif(ipif, SCTP_IPIF_UP);
18719 ill_nic_event_dispatch(ipif->ipif_ill, MAP_IPIF_ID(ipif->ipif_id),
18720 NE_LIF_UP, NULL, 0);
18721 }
18722
/*
 * ILB ioctls use cv_wait (e.g., when deleting a rule or adding a server),
 * so this assumes the calling context is cv_wait-able. Hence it shouldn't
 * be used on TPI endpoints with STREAMS modules pushed above; this is
 * assured by not having the IPI_MODOK flag for the ioctl. IP also ensures
 * the ILB ioctl never ends up on an ipsq; otherwise we may end up
 * processing the ioctl while unwinding from the ipsq, and that could be a
 * thread from the bottom.
 */
18731 /* ARGSUSED */
18732 int
18733 ip_sioctl_ilb_cmd(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
18734 ip_ioctl_cmd_t *ipip, void *arg)
18735 {
18736 mblk_t *cmd_mp = mp->b_cont->b_cont;
18737 ilb_cmd_t command = *((ilb_cmd_t *)cmd_mp->b_rptr);
18738 int ret = 0;
18739 int i;
18740 size_t size;
18741 ip_stack_t *ipst;
18742 zoneid_t zoneid;
18743 ilb_stack_t *ilbs;
18744
18745 ipst = CONNQ_TO_IPST(q);
18746 ilbs = ipst->ips_netstack->netstack_ilb;
18747 zoneid = Q_TO_CONN(q)->conn_zoneid;
18748
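	/*
	 * Each command below first validates its fixed-size header against
	 * MBLKL(cmd_mp); commands that carry a variable-length array also
	 * verify that the array exactly fills the remainder of the message.
	 */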
18749 switch (command) {
18750 case ILB_CREATE_RULE: {
18751 ilb_rule_cmd_t *cmd = (ilb_rule_cmd_t *)cmd_mp->b_rptr;
18752
18753 if (MBLKL(cmd_mp) != sizeof (ilb_rule_cmd_t)) {
18754 ret = EINVAL;
18755 break;
18756 }
18757
18758 ret = ilb_rule_add(ilbs, zoneid, cmd);
18759 break;
18760 }
18761 case ILB_DESTROY_RULE:
18762 case ILB_ENABLE_RULE:
18763 case ILB_DISABLE_RULE: {
18764 ilb_name_cmd_t *cmd = (ilb_name_cmd_t *)cmd_mp->b_rptr;
18765
18766 if (MBLKL(cmd_mp) != sizeof (ilb_name_cmd_t)) {
18767 ret = EINVAL;
18768 break;
18769 }
18770
18771 if (cmd->flags & ILB_RULE_ALLRULES) {
18772 if (command == ILB_DESTROY_RULE) {
18773 ilb_rule_del_all(ilbs, zoneid);
18774 break;
18775 } else if (command == ILB_ENABLE_RULE) {
18776 ilb_rule_enable_all(ilbs, zoneid);
18777 break;
18778 } else if (command == ILB_DISABLE_RULE) {
18779 ilb_rule_disable_all(ilbs, zoneid);
18780 break;
18781 }
18782 } else {
18783 if (command == ILB_DESTROY_RULE) {
18784 ret = ilb_rule_del(ilbs, zoneid, cmd->name);
18785 } else if (command == ILB_ENABLE_RULE) {
18786 ret = ilb_rule_enable(ilbs, zoneid, cmd->name,
18787 NULL);
18788 } else if (command == ILB_DISABLE_RULE) {
18789 ret = ilb_rule_disable(ilbs, zoneid, cmd->name,
18790 NULL);
18791 }
18792 }
18793 break;
18794 }
18795 case ILB_NUM_RULES: {
18796 ilb_num_rules_cmd_t *cmd;
18797
18798 if (MBLKL(cmd_mp) != sizeof (ilb_num_rules_cmd_t)) {
18799 ret = EINVAL;
18800 break;
18801 }
18802 cmd = (ilb_num_rules_cmd_t *)cmd_mp->b_rptr;
18803 ilb_get_num_rules(ilbs, zoneid, &(cmd->num));
18804 break;
18805 }
18806 case ILB_RULE_NAMES: {
18807 ilb_rule_names_cmd_t *cmd;
18808
18809 cmd = (ilb_rule_names_cmd_t *)cmd_mp->b_rptr;
18810 if (MBLKL(cmd_mp) < sizeof (ilb_rule_names_cmd_t) ||
18811 cmd->num_names == 0) {
18812 ret = EINVAL;
18813 break;
18814 }
18815 size = cmd->num_names * ILB_RULE_NAMESZ;
18816 if (cmd_mp->b_rptr + offsetof(ilb_rule_names_cmd_t, buf) +
18817 size != cmd_mp->b_wptr) {
18818 ret = EINVAL;
18819 break;
18820 }
18821 ilb_get_rulenames(ilbs, zoneid, &cmd->num_names, cmd->buf);
18822 break;
18823 }
18824 case ILB_NUM_SERVERS: {
18825 ilb_num_servers_cmd_t *cmd;
18826
18827 if (MBLKL(cmd_mp) != sizeof (ilb_num_servers_cmd_t)) {
18828 ret = EINVAL;
18829 break;
18830 }
18831 cmd = (ilb_num_servers_cmd_t *)cmd_mp->b_rptr;
18832 ret = ilb_get_num_servers(ilbs, zoneid, cmd->name,
18833 &(cmd->num));
18834 break;
18835 }
18836 case ILB_LIST_RULE: {
18837 ilb_rule_cmd_t *cmd = (ilb_rule_cmd_t *)cmd_mp->b_rptr;
18838
18839 if (MBLKL(cmd_mp) != sizeof (ilb_rule_cmd_t)) {
18840 ret = EINVAL;
18841 break;
18842 }
18843 ret = ilb_rule_list(ilbs, zoneid, cmd);
18844 break;
18845 }
18846 case ILB_LIST_SERVERS: {
18847 ilb_servers_info_cmd_t *cmd;
18848
18849 cmd = (ilb_servers_info_cmd_t *)cmd_mp->b_rptr;
18850 if (MBLKL(cmd_mp) < sizeof (ilb_servers_info_cmd_t) ||
18851 cmd->num_servers == 0) {
18852 ret = EINVAL;
18853 break;
18854 }
18855 size = cmd->num_servers * sizeof (ilb_server_info_t);
18856 if (cmd_mp->b_rptr + offsetof(ilb_servers_info_cmd_t, servers) +
18857 size != cmd_mp->b_wptr) {
18858 ret = EINVAL;
18859 break;
18860 }
18861
18862 ret = ilb_get_servers(ilbs, zoneid, cmd->name, cmd->servers,
18863 &cmd->num_servers);
18864 break;
18865 }
18866 case ILB_ADD_SERVERS: {
18867 ilb_servers_info_cmd_t *cmd;
18868 ilb_rule_t *rule;
18869
18870 cmd = (ilb_servers_info_cmd_t *)cmd_mp->b_rptr;
18871 if (MBLKL(cmd_mp) < sizeof (ilb_servers_info_cmd_t)) {
18872 ret = EINVAL;
18873 break;
18874 }
18875 size = cmd->num_servers * sizeof (ilb_server_info_t);
18876 if (cmd_mp->b_rptr + offsetof(ilb_servers_info_cmd_t, servers) +
18877 size != cmd_mp->b_wptr) {
18878 ret = EINVAL;
18879 break;
18880 }
18881 rule = ilb_find_rule(ilbs, zoneid, cmd->name, &ret);
18882 if (rule == NULL) {
18883 ASSERT(ret != 0);
18884 break;
18885 }
18886 for (i = 0; i < cmd->num_servers; i++) {
18887 ilb_server_info_t *s;
18888
18889 s = &cmd->servers[i];
18890 s->err = ilb_server_add(ilbs, rule, s);
18891 }
18892 ILB_RULE_REFRELE(rule);
18893 break;
18894 }
18895 case ILB_DEL_SERVERS:
18896 case ILB_ENABLE_SERVERS:
18897 case ILB_DISABLE_SERVERS: {
18898 ilb_servers_cmd_t *cmd;
18899 ilb_rule_t *rule;
18900 int (*f)();
18901
18902 cmd = (ilb_servers_cmd_t *)cmd_mp->b_rptr;
18903 if (MBLKL(cmd_mp) < sizeof (ilb_servers_cmd_t)) {
18904 ret = EINVAL;
18905 break;
18906 }
18907 size = cmd->num_servers * sizeof (ilb_server_arg_t);
18908 if (cmd_mp->b_rptr + offsetof(ilb_servers_cmd_t, servers) +
18909 size != cmd_mp->b_wptr) {
18910 ret = EINVAL;
18911 break;
18912 }
18913
18914 if (command == ILB_DEL_SERVERS)
18915 f = ilb_server_del;
18916 else if (command == ILB_ENABLE_SERVERS)
18917 f = ilb_server_enable;
18918 else if (command == ILB_DISABLE_SERVERS)
18919 f = ilb_server_disable;
18920
18921 rule = ilb_find_rule(ilbs, zoneid, cmd->name, &ret);
18922 if (rule == NULL) {
18923 ASSERT(ret != 0);
18924 break;
18925 }
18926
18927 for (i = 0; i < cmd->num_servers; i++) {
18928 ilb_server_arg_t *s;
18929
18930 s = &cmd->servers[i];
18931 s->err = f(ilbs, zoneid, NULL, rule, &s->addr);
18932 }
18933 ILB_RULE_REFRELE(rule);
18934 break;
18935 }
18936 case ILB_LIST_NAT_TABLE: {
18937 ilb_list_nat_cmd_t *cmd;
18938
18939 cmd = (ilb_list_nat_cmd_t *)cmd_mp->b_rptr;
18940 if (MBLKL(cmd_mp) < sizeof (ilb_list_nat_cmd_t)) {
18941 ret = EINVAL;
18942 break;
18943 }
18944 size = cmd->num_nat * sizeof (ilb_nat_entry_t);
18945 if (cmd_mp->b_rptr + offsetof(ilb_list_nat_cmd_t, entries) +
18946 size != cmd_mp->b_wptr) {
18947 ret = EINVAL;
18948 break;
18949 }
18950
18951 ret = ilb_list_nat(ilbs, zoneid, cmd->entries, &cmd->num_nat,
18952 &cmd->flags);
18953 break;
18954 }
18955 case ILB_LIST_STICKY_TABLE: {
18956 ilb_list_sticky_cmd_t *cmd;
18957
18958 cmd = (ilb_list_sticky_cmd_t *)cmd_mp->b_rptr;
18959 if (MBLKL(cmd_mp) < sizeof (ilb_list_sticky_cmd_t)) {
18960 ret = EINVAL;
18961 break;
18962 }
18963 size = cmd->num_sticky * sizeof (ilb_sticky_entry_t);
18964 if (cmd_mp->b_rptr + offsetof(ilb_list_sticky_cmd_t, entries) +
18965 size != cmd_mp->b_wptr) {
18966 ret = EINVAL;
18967 break;
18968 }
18969
18970 ret = ilb_list_sticky(ilbs, zoneid, cmd->entries,
18971 &cmd->num_sticky, &cmd->flags);
18972 break;
18973 }
	default:
		ret = EINVAL;
		break;
	}
	return (ret);
18980 }
18981
18982 /* Remove all cache entries for this logical interface */
18983 void
18984 ipif_nce_down(ipif_t *ipif)
18985 {
18986 ill_t *ill = ipif->ipif_ill;
18987 nce_t *nce;
18988
18989 DTRACE_PROBE3(ipif__downup, char *, "ipif_nce_down",
18990 ill_t *, ill, ipif_t *, ipif);
18991 if (ipif->ipif_added_nce) {
18992 if (ipif->ipif_isv6)
18993 nce = nce_lookup_v6(ill, &ipif->ipif_v6lcl_addr);
18994 else
18995 nce = nce_lookup_v4(ill, &ipif->ipif_lcl_addr);
18996 if (nce != NULL) {
18997 if (--nce->nce_ipif_cnt == 0)
18998 ncec_delete(nce->nce_common);
18999 ipif->ipif_added_nce = 0;
19000 nce_refrele(nce);
19001 } else {
19002 /*
19003 * nce may already be NULL because it was already
19004 * flushed, e.g., due to a call to nce_flush
19005 */
19006 ipif->ipif_added_nce = 0;
19007 }
19008 }
19009 /*
19010 * Make IPMP aware of the deleted data address.
19011 */
19012 if (IS_IPMP(ill))
19013 ipmp_illgrp_del_ipif(ill->ill_grp, ipif);
19014
19015 /*
19016 * Remove all other nces dependent on this ill when the last ipif
19017 * is going away.
19018 */
19019 if (ill->ill_ipif_up_count == 0) {
19020 ncec_walk(ill, (pfi_t)ncec_delete_per_ill,
19021 (uchar_t *)ill, ill->ill_ipst);
19022 if (IS_UNDER_IPMP(ill))
19023 nce_flush(ill, B_TRUE);
19024 }
19025 }
19026
/*
 * Find the first interface that uses usill for its source address.
 */
19030 ill_t *
19031 ill_lookup_usesrc(ill_t *usill)
19032 {
19033 ip_stack_t *ipst = usill->ill_ipst;
19034 ill_t *ill;
19035
19036 ASSERT(usill != NULL);
19037
19038 /* ill_g_usesrc_lock protects ill_usesrc_grp_next */
19039 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER);
19040 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
19041 for (ill = usill->ill_usesrc_grp_next; ill != NULL && ill != usill;
19042 ill = ill->ill_usesrc_grp_next) {
19043 if (!IS_UNDER_IPMP(ill) && (ill->ill_flags & ILLF_MULTICAST) &&
19044 !ILL_IS_CONDEMNED(ill)) {
19045 ill_refhold(ill);
19046 break;
19047 }
19048 }
19049 rw_exit(&ipst->ips_ill_g_lock);
19050 rw_exit(&ipst->ips_ill_g_usesrc_lock);
19051 return (ill);
19052 }
19053
19054 /*
19055 * This comment applies to both ip_sioctl_get_ifhwaddr and
19056 * ip_sioctl_get_lifhwaddr as the basic function of these two functions
19057 * is the same.
19058 *
19059 * The goal here is to find an IP interface that corresponds to the name
19060 * provided by the caller in the ifreq/lifreq structure held in the mblk_t
19061 * chain and to fill out a sockaddr/sockaddr_storage structure with the
19062 * mac address.
19063 *
19064 * The SIOCGIFHWADDR/SIOCGLIFHWADDR ioctl may return an error for a number
19065 * of different reasons:
19066 * ENXIO - the device name is not known to IP.
19067 * EADDRNOTAVAIL - the device has no hardware address. This is indicated
19068 * by ill_phys_addr not pointing to an actual address.
 * EPFNOSUPPORT - this will indicate that a request is being made for a
 * mac address that will not fit in the data structure supplied (struct
 * sockaddr).
 */
19074 /* ARGSUSED */
19075 int
19076 ip_sioctl_get_ifhwaddr(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
19077 ip_ioctl_cmd_t *ipip, void *if_req)
19078 {
19079 struct sockaddr *sock;
19080 struct ifreq *ifr;
19081 mblk_t *mp1;
19082 ill_t *ill;
19083
19084 ASSERT(ipif != NULL);
19085 ill = ipif->ipif_ill;
19086
19087 if (ill->ill_phys_addr == NULL) {
19088 return (EADDRNOTAVAIL);
19089 }
19090 if (ill->ill_phys_addr_length > sizeof (sock->sa_data)) {
19091 return (EPFNOSUPPORT);
19092 }
19093
	ip1dbg(("ip_sioctl_get_ifhwaddr(%s)\n", ill->ill_name));
19095
19096 /* Existence of mp1 has been checked in ip_wput_nondata */
19097 mp1 = mp->b_cont->b_cont;
19098 ifr = (struct ifreq *)mp1->b_rptr;
19099
19100 sock = &ifr->ifr_addr;
19101 /*
19102 * The "family" field in the returned structure is set to a value
19103 * that represents the type of device to which the address belongs.
	 * The value returned may differ from that on Linux but it will still
19105 * represent the correct symbol on Solaris.
19106 */
19107 sock->sa_family = arp_hw_type(ill->ill_mactype);
19108 bcopy(ill->ill_phys_addr, &sock->sa_data, ill->ill_phys_addr_length);
19109
19110 return (0);
19111 }
19112
/*
 * The expectation of applications using SIOCGIFHWADDR is that data will
 * be returned in the sa_data field of the sockaddr structure. With
 * SIOCGLIFHWADDR, we're breaking new ground as there is no Linux
 * equivalent. In light of this, struct sockaddr_dl is used as it
 * offers more space for address storage in sdl_data.
 */
19120 /* ARGSUSED */
19121 int
19122 ip_sioctl_get_lifhwaddr(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
19123 ip_ioctl_cmd_t *ipip, void *if_req)
19124 {
19125 struct sockaddr_dl *sock;
19126 struct lifreq *lifr;
19127 mblk_t *mp1;
19128 ill_t *ill;
19129
19130 ASSERT(ipif != NULL);
19131 ill = ipif->ipif_ill;
19132
19133 if (ill->ill_phys_addr == NULL) {
19134 return (EADDRNOTAVAIL);
19135 }
19136 if (ill->ill_phys_addr_length > sizeof (sock->sdl_data)) {
19137 return (EPFNOSUPPORT);
19138 }
19139
19140 ip1dbg(("ip_sioctl_get_lifhwaddr(%s)\n", ill->ill_name));
19141
19142 /* Existence of mp1 has been checked in ip_wput_nondata */
19143 mp1 = mp->b_cont->b_cont;
19144 lifr = (struct lifreq *)mp1->b_rptr;
19145
	/*
	 * struct sockaddr_dl is used here (rather than struct sockaddr,
	 * whose sa_data may be too small for some link-layer addresses);
	 * its sdl_* fields fully describe the address to the caller.
	 */
19152 lifr->lifr_type = ill->ill_type;
19153 sock = (struct sockaddr_dl *)&lifr->lifr_addr;
19154 sock->sdl_family = AF_LINK;
19155 sock->sdl_index = ill->ill_phyint->phyint_ifindex;
19156 sock->sdl_type = ill->ill_mactype;
19157 sock->sdl_nlen = 0;
19158 sock->sdl_slen = 0;
19159 sock->sdl_alen = ill->ill_phys_addr_length;
19160 bcopy(ill->ill_phys_addr, sock->sdl_data, ill->ill_phys_addr_length);
19161
19162 return (0);
19163 }