1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 1990 Mentat Inc.
25 * Copyright (c) 2011 Joyent, Inc. All rights reserved.
26 */
27
28 #include <sys/types.h>
29 #include <sys/stream.h>
30 #include <sys/dlpi.h>
31 #include <sys/stropts.h>
32 #include <sys/sysmacros.h>
33 #include <sys/strsubr.h>
34 #include <sys/strlog.h>
35 #include <sys/strsun.h>
36 #include <sys/zone.h>
37 #define _SUN_TPI_VERSION 2
38 #include <sys/tihdr.h>
39 #include <sys/xti_inet.h>
40 #include <sys/ddi.h>
41 #include <sys/suntpi.h>
42 #include <sys/cmn_err.h>
43 #include <sys/debug.h>
44 #include <sys/kobj.h>
45 #include <sys/modctl.h>
46 #include <sys/atomic.h>
47 #include <sys/policy.h>
48 #include <sys/priv.h>
49 #include <sys/taskq.h>
50
51 #include <sys/systm.h>
52 #include <sys/param.h>
53 #include <sys/kmem.h>
54 #include <sys/sdt.h>
55 #include <sys/socket.h>
56 #include <sys/vtrace.h>
57 #include <sys/isa_defs.h>
58 #include <sys/mac.h>
59 #include <net/if.h>
60 #include <net/if_arp.h>
61 #include <net/route.h>
62 #include <sys/sockio.h>
63 #include <netinet/in.h>
64 #include <net/if_dl.h>
65
66 #include <inet/common.h>
67 #include <inet/mi.h>
68 #include <inet/mib2.h>
69 #include <inet/nd.h>
70 #include <inet/arp.h>
71 #include <inet/snmpcom.h>
72 #include <inet/optcom.h>
73 #include <inet/kstatcom.h>
74
75 #include <netinet/igmp_var.h>
76 #include <netinet/ip6.h>
77 #include <netinet/icmp6.h>
78 #include <netinet/sctp.h>
79
80 #include <inet/ip.h>
81 #include <inet/ip_impl.h>
82 #include <inet/ip6.h>
83 #include <inet/ip6_asp.h>
84 #include <inet/tcp.h>
85 #include <inet/tcp_impl.h>
86 #include <inet/ip_multi.h>
87 #include <inet/ip_if.h>
88 #include <inet/ip_ire.h>
89 #include <inet/ip_ftable.h>
90 #include <inet/ip_rts.h>
91 #include <inet/ip_ndp.h>
92 #include <inet/ip_listutils.h>
93 #include <netinet/igmp.h>
94 #include <netinet/ip_mroute.h>
95 #include <inet/ipp_common.h>
96
97 #include <net/pfkeyv2.h>
98 #include <inet/sadb.h>
99 #include <inet/ipsec_impl.h>
100 #include <inet/iptun/iptun_impl.h>
101 #include <inet/ipdrop.h>
102 #include <inet/ip_netinfo.h>
103 #include <inet/ilb_ip.h>
104
105 #include <sys/ethernet.h>
106 #include <net/if_types.h>
107 #include <sys/cpuvar.h>
108
109 #include <ipp/ipp.h>
110 #include <ipp/ipp_impl.h>
111 #include <ipp/ipgpc/ipgpc.h>
112
113 #include <sys/pattr.h>
114 #include <inet/dccp/dccp_ip.h>
115 #include <inet/dccp/dccp_impl.h>
116 #include <inet/ipclassifier.h>
117 #include <inet/sctp_ip.h>
118 #include <inet/sctp/sctp_impl.h>
119 #include <inet/udp_impl.h>
120 #include <inet/rawip_impl.h>
121 #include <inet/rts_impl.h>
122
123 #include <sys/tsol/label.h>
124 #include <sys/tsol/tnet.h>
125
126 #include <sys/squeue_impl.h>
127 #include <inet/ip_arp.h>
128
129 #include <sys/clock_impl.h> /* For LBOLT_FASTPATH{,64} */
130
131 /*
132 * Values for squeue switch:
133 * IP_SQUEUE_ENTER_NODRAIN: SQ_NODRAIN
134 * IP_SQUEUE_ENTER: SQ_PROCESS
135 * IP_SQUEUE_FILL: SQ_FILL
136 */
int ip_squeue_enter = IP_SQUEUE_ENTER;	/* Settable in /etc/system */
138
139 int ip_squeue_flag;
140
141 /*
 * Settable in /etc/system
143 */
144 int ip_poll_normal_ms = 100;
145 int ip_poll_normal_ticks = 0;
146 int ip_modclose_ackwait_ms = 3000;
147
148 /*
149 * It would be nice to have these present only in DEBUG systems, but the
150 * current design of the global symbol checking logic requires them to be
151 * unconditionally present.
152 */
153 uint_t ip_thread_data; /* TSD key for debug support */
154 krwlock_t ip_thread_rwlock;
155 list_t ip_thread_list;
156
157 /*
158 * Structure to represent a linked list of msgblks. Used by ip_snmp_ functions.
159 */
160
161 struct listptr_s {
162 mblk_t *lp_head; /* pointer to the head of the list */
163 mblk_t *lp_tail; /* pointer to the tail of the list */
164 };
165
166 typedef struct listptr_s listptr_t;
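
/*
 * A minimal sketch (for illustration only; not compiled into this file)
 * of how such a list is grown by appending a single mblk at the tail.
 * The helper name listptr_append is hypothetical; the ip_snmp_ functions
 * do the equivalent linking inline.
 *
 *	static void
 *	listptr_append(listptr_t *lp, mblk_t *mp)
 *	{
 *		if (lp->lp_head == NULL)
 *			lp->lp_head = mp;
 *		else
 *			lp->lp_tail->b_cont = mp;
 *		lp->lp_tail = mp;
 *	}
 */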
167
168 /*
169 * This is used by ip_snmp_get_mib2_ip_route_media and
170 * ip_snmp_get_mib2_ip6_route_media to carry the lists of return data.
171 */
172 typedef struct iproutedata_s {
173 uint_t ird_idx;
174 uint_t ird_flags; /* see below */
175 listptr_t ird_route; /* ipRouteEntryTable */
176 listptr_t ird_netmedia; /* ipNetToMediaEntryTable */
177 listptr_t ird_attrs; /* ipRouteAttributeTable */
178 } iproutedata_t;
179
180 /* Include ire_testhidden and IRE_IF_CLONE routes */
181 #define IRD_REPORT_ALL 0x01
182
183 /*
184 * Cluster specific hooks. These should be NULL when booted as a non-cluster
185 */
186
187 /*
188 * Hook functions to enable cluster networking
189 * On non-clustered systems these vectors must always be NULL.
190 *
191 * Hook function to Check ip specified ip address is a shared ip address
192 * in the cluster
193 *
194 */
195 int (*cl_inet_isclusterwide)(netstackid_t stack_id, uint8_t protocol,
196 sa_family_t addr_family, uint8_t *laddrp, void *args) = NULL;
197
198 /*
199 * Hook function to generate cluster wide ip fragment identifier
200 */
201 uint32_t (*cl_inet_ipident)(netstackid_t stack_id, uint8_t protocol,
202 sa_family_t addr_family, uint8_t *laddrp, uint8_t *faddrp,
203 void *args) = NULL;
204
205 /*
206 * Hook function to generate cluster wide SPI.
207 */
208 void (*cl_inet_getspi)(netstackid_t, uint8_t, uint8_t *, size_t,
209 void *) = NULL;
210
211 /*
 * Hook function to verify whether the SPI is already utilized.
213 */
214
215 int (*cl_inet_checkspi)(netstackid_t, uint8_t, uint32_t, void *) = NULL;
216
217 /*
218 * Hook function to delete the SPI from the cluster wide repository.
219 */
220
221 void (*cl_inet_deletespi)(netstackid_t, uint8_t, uint32_t, void *) = NULL;
222
223 /*
 * Hook function to inform the cluster when a packet is received on an IDLE SA.
225 */
226
227 void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t,
228 in6_addr_t, in6_addr_t, void *) = NULL;
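
/*
 * A hedged sketch of how such a hook vector is invoked (the protocol and
 * arguments shown are illustrative, not taken from a real caller).
 * Callers must always check the vector for NULL first, since it is only
 * populated when cluster software is loaded:
 *
 *	if (cl_inet_isclusterwide != NULL &&
 *	    (*cl_inet_isclusterwide)(stack_id, IPPROTO_TCP, AF_INET,
 *	    laddrp, NULL) != 0) {
 *		... the address is shared across the cluster ...
 *	}
 */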
229
230 /*
231 * Synchronization notes:
232 *
233 * IP is a fully D_MP STREAMS module/driver. Thus it does not depend on any
234 * MT level protection given by STREAMS. IP uses a combination of its own
235 * internal serialization mechanism and standard Solaris locking techniques.
236 * The internal serialization is per phyint. This is used to serialize
237 * plumbing operations, IPMP operations, most set ioctls, etc.
238 *
239 * Plumbing is a long sequence of operations involving message
240 * exchanges between IP, ARP and device drivers. Many set ioctls are typically
241 * involved in plumbing operations. A natural model is to serialize these
242 * ioctls one per ill. For example plumbing of hme0 and qfe0 can go on in
243 * parallel without any interference. But various set ioctls on hme0 are best
244 * serialized, along with IPMP operations and processing of DLPI control
245 * messages received from drivers on a per phyint basis. This serialization is
246 * provided by the ipsq_t and primitives operating on this. Details can
247 * be found in ip_if.c above the core primitives operating on ipsq_t.
248 *
249 * Lookups of an ipif or ill by a thread return a refheld ipif / ill.
 * Similarly, lookup of an ire by a thread also returns a refheld ire.
251 * In addition ipif's and ill's referenced by the ire are also indirectly
252 * refheld. Thus no ipif or ill can vanish as long as an ipif is refheld
253 * directly or indirectly. For example an SIOCSLIFADDR ioctl that changes the
254 * address of an ipif has to go through the ipsq_t. This ensures that only
 * one such exclusive operation proceeds at any time on the ipif. It then
 * waits for all refcnts associated with this ipif to come down to zero.
 * The address is changed only after the ipif has been quiesced. Then the
 * ipif is brought up again.
259 * More details are described above the comment in ip_sioctl_flags.
260 *
 * Packet processing is based mostly on IREs and is fully multi-threaded
262 * using standard Solaris MT techniques.
263 *
264 * There are explicit locks in IP to handle:
265 * - The ip_g_head list maintained by mi_open_link() and friends.
266 *
267 * - The reassembly data structures (one lock per hash bucket)
268 *
269 * - conn_lock is meant to protect conn_t fields. The fields actually
270 * protected by conn_lock are documented in the conn_t definition.
271 *
272 * - ire_lock to protect some of the fields of the ire, IRE tables
273 * (one lock per hash bucket). Refer to ip_ire.c for details.
274 *
275 * - ndp_g_lock and ncec_lock for protecting NCEs.
276 *
277 * - ill_lock protects fields of the ill and ipif. Details in ip.h
278 *
279 * - ill_g_lock: This is a global reader/writer lock. Protects the following
280 * * The AVL tree based global multi list of all ills.
281 * * The linked list of all ipifs of an ill
282 * * The <ipsq-xop> mapping
283 * * <ill-phyint> association
284 * Insertion/deletion of an ill in the system, insertion/deletion of an ipif
285 * into an ill, changing the <ipsq-xop> mapping of an ill, changing the
286 * <ill-phyint> assoc of an ill will all have to hold the ill_g_lock as
287 * writer for the actual duration of the insertion/deletion/change.
288 *
289 * - ill_lock: This is a per ill mutex.
290 * It protects some members of the ill_t struct; see ip.h for details.
291 * It also protects the <ill-phyint> assoc.
292 * It also protects the list of ipifs hanging off the ill.
293 *
294 * - ipsq_lock: This is a per ipsq_t mutex lock.
295 * This protects some members of the ipsq_t struct; see ip.h for details.
296 * It also protects the <ipsq-ipxop> mapping
297 *
298 * - ipx_lock: This is a per ipxop_t mutex lock.
299 * This protects some members of the ipxop_t struct; see ip.h for details.
300 *
301 * - phyint_lock: This is a per phyint mutex lock. Protects just the
302 * phyint_flags
303 *
304 * - ip_addr_avail_lock: This is used to ensure the uniqueness of IP addresses.
 * This lock is held in ipif_up_done so that marking the ipif IPIF_UP and
 * the uniqueness check are done atomically.
307 *
308 * - ill_g_usesrc_lock: This readers/writer lock protects the usesrc
309 * group list linked by ill_usesrc_grp_next. It also protects the
310 * ill_usesrc_ifindex field. It is taken as a writer when a member of the
311 * group is being added or deleted. This lock is taken as a reader when
 * walking the list/group (e.g. to get the number of members in a usesrc
 * group). Note, it is only necessary to take this lock if the
 * ill_usesrc_grp_next field is changing state, i.e. from NULL to
 * non-NULL or vice-versa. For
315 * example, it is not necessary to take this lock in the initial portion
316 * of ip_sioctl_slifusesrc or at all in ip_sioctl_flags since these
317 * operations are executed exclusively and that ensures that the "usesrc
318 * group state" cannot change. The "usesrc group state" change can happen
319 * only in the latter part of ip_sioctl_slifusesrc and in ill_delete.
320 *
 * Changing <ill-phyint>, <ipsq-xop> associations:
322 *
323 * To change the <ill-phyint> association, the ill_g_lock must be held
324 * as writer, and the ill_locks of both the v4 and v6 instance of the ill
325 * must be held.
326 *
327 * To change the <ipsq-xop> association, the ill_g_lock must be held as
328 * writer, the ipsq_lock must be held, and one must be writer on the ipsq.
329 * This is only done when ills are added or removed from IPMP groups.
330 *
331 * To add or delete an ipif from the list of ipifs hanging off the ill,
332 * ill_g_lock (writer) and ill_lock must be held and the thread must be
333 * a writer on the associated ipsq.
334 *
335 * To add or delete an ill to the system, the ill_g_lock must be held as
336 * writer and the thread must be a writer on the associated ipsq.
337 *
 * To add an ilm to or delete an ilm from an ill, the ill_lock must be
 * held and the thread must be a writer on the associated ipsq.
340 *
341 * Lock hierarchy
342 *
343 * Some lock hierarchy scenarios are listed below.
344 *
345 * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock -> ipx_lock
346 * ill_g_lock -> ill_lock(s) -> phyint_lock
347 * ill_g_lock -> ndp_g_lock -> ill_lock -> ncec_lock
348 * ill_g_lock -> ip_addr_avail_lock
349 * conn_lock -> irb_lock -> ill_lock -> ire_lock
350 * ill_g_lock -> ip_g_nd_lock
351 * ill_g_lock -> ips_ipmp_lock -> ill_lock -> nce_lock
352 * ill_g_lock -> ndp_g_lock -> ill_lock -> ncec_lock -> nce_lock
353 * arl_lock -> ill_lock
354 * ips_ire_dep_lock -> irb_lock
355 *
 * When more than one ill lock needs to be held, the ill locks are sorted
 * by address and acquired starting from the highest-addressed lock
 * downward.
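 *
 * A minimal sketch of that ordering for two ill locks (the variable
 * names are illustrative, not code from this file):
 *
 *	if (&ill1->ill_lock > &ill2->ill_lock) {
 *		mutex_enter(&ill1->ill_lock);
 *		mutex_enter(&ill2->ill_lock);
 *	} else {
 *		mutex_enter(&ill2->ill_lock);
 *		mutex_enter(&ill1->ill_lock);
 *	}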
359 *
360 * Multicast scenarios
361 * ips_ill_g_lock -> ill_mcast_lock
362 * conn_ilg_lock -> ips_ill_g_lock -> ill_lock
363 * ill_mcast_serializer -> ill_mcast_lock -> ips_ipmp_lock -> ill_lock
364 * ill_mcast_serializer -> ill_mcast_lock -> connf_lock -> conn_lock
365 * ill_mcast_serializer -> ill_mcast_lock -> conn_ilg_lock
366 * ill_mcast_serializer -> ill_mcast_lock -> ips_igmp_timer_lock
367 *
368 * IPsec scenarios
369 *
370 * ipsa_lock -> ill_g_lock -> ill_lock
371 * ill_g_usesrc_lock -> ill_g_lock -> ill_lock
372 *
373 * Trusted Solaris scenarios
374 *
375 * igsa_lock -> gcgrp_rwlock -> gcgrp_lock
376 * igsa_lock -> gcdb_lock
377 * gcgrp_rwlock -> ire_lock
378 * gcgrp_rwlock -> gcdb_lock
379 *
380 * squeue(sq_lock), flow related (ft_lock, fe_lock) locking
381 *
382 * cpu_lock --> ill_lock --> sqset_lock --> sq_lock
383 * sq_lock -> conn_lock -> QLOCK(q)
384 * ill_lock -> ft_lock -> fe_lock
385 *
386 * Routing/forwarding table locking notes:
387 *
388 * Lock acquisition order: Radix tree lock, irb_lock.
389 * Requirements:
390 * i. Walker must not hold any locks during the walker callback.
 * ii. Walker must not see a truncated tree during the walk because of any
 *    node deletion.
 * iii. Existing code assumes ire_bucket is valid if it is non-null and is used
394 * in many places in the code to walk the irb list. Thus even if all the
395 * ires in a bucket have been deleted, we still can't free the radix node
396 * until the ires have actually been inactive'd (freed).
397 *
398 * Tree traversal - Need to hold the global tree lock in read mode.
 * Before dropping the global tree lock, increment the ire_refcnt to
 * ensure that the radix node can't be deleted.
401 *
402 * Tree add - Need to hold the global tree lock in write mode to add a
403 * radix node. To prevent the node from being deleted, increment the
404 * irb_refcnt, after the node is added to the tree. The ire itself is
405 * added later while holding the irb_lock, but not the tree lock.
406 *
407 * Tree delete - Need to hold the global tree lock and irb_lock in write mode.
408 * All associated ires must be inactive (i.e. freed), and irb_refcnt
409 * must be zero.
410 *
411 * Walker - Increment irb_refcnt before calling the walker callback. Hold the
412 * global tree lock (read mode) for traversal.
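 *
 * A hedged sketch of the walker pattern (irb_refhold()/irb_refrele() are
 * the refcnt primitives from ip_ire.c; tree-lock handling is elided):
 *
 *	irb_refhold(irb);	pin the bucket so the radix node stays
 *	(*func)(ire, arg);	walker callback, no locks held
 *	irb_refrele(irb);	may clean up ires deleted during the walk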
413 *
 * IRE dependencies - In some cases we hold ips_ire_dep_lock across
 * ire_refrele, hence we will acquire irb_lock while holding
 * ips_ire_dep_lock.
416 *
 * IPsec notes:
418 *
 * IP interacts with the IPsec code (AH/ESP) by storing IPsec attributes
 * in the ip_xmit_attr_t and ip_recv_attr_t. For outbound datagrams, the
 * ip_xmit_attr_t has the information used by the IPsec code for applying
 * the right level of protection. The information initialized by IP in the
 * ip_xmit_attr_t is determined by the per-socket policy or the global
 * policy in the system. For inbound datagrams, the ip_recv_attr_t starts
 * out with nothing in it. It gets filled with the right information if
 * the packet goes through the AH/ESP code, which happens if the incoming
 * packet is secure. The information initialized by AH/ESP is later used
 * by IP (during fanouts to the ULP) to see whether the policy
 * requirements of the per-socket policy or global policy are met.
432 *
 * For fully connected sockets, i.e. when dst and src [addr, port] are
 * known, conn_policy_cached is set, indicating that policy has been
 * cached. conn_in_enforce_policy may or may not be set depending on
 * whether there is a global policy match or a per-socket policy match.
 * Policy inheriting happens in ip_policy_set once the destination is
 * known. Once the right policy is set on the conn_t, policy cannot change
 * for this socket. This makes life simpler for TCP (UDP ?) where
 * re-transmissions go out with the same policy. For symmetry, policy is
 * also cached for fully connected UDP sockets. Thus if policy is cached,
 * it also implies that policy is latched, i.e. policy cannot change on
 * these sockets. As we have the right policy on the conn, we don't have
 * to look up global policy for every outbound and inbound datagram, which
 * serves as an optimization. Note that a global policy change does not
 * affect fully connected sockets if they have policy. If fully connected
 * sockets do not have any policy associated with them, a global policy
 * change may affect them.
449 *
450 * IP Flow control notes:
451 * ---------------------
452 * Non-TCP streams are flow controlled by IP. The way this is accomplished
453 * differs when ILL_CAPAB_DLD_DIRECT is enabled for that IP instance. When
454 * ILL_DIRECT_CAPABLE(ill) is TRUE, IP can do direct function calls into
455 * GLDv3. Otherwise packets are sent down to lower layers using STREAMS
456 * functions.
457 *
458 * Per Tx ring udp flow control:
459 * This is applicable only when ILL_CAPAB_DLD_DIRECT capability is set in
460 * the ill (i.e. ILL_DIRECT_CAPABLE(ill) is true).
461 *
 * The underlying link can expose multiple Tx rings to the GLDv3 mac layer.
 * To achieve best performance, outgoing traffic needs to be fanned out
 * among these Tx rings. mac_tx() is called (via str_mdata_fastpath_put())
 * to send traffic out of the NIC and it takes a fanout hint. UDP
 * connections pass the address of the connp as the fanout hint to
 * mac_tx(). Under flow controlled conditions, mac_tx() returns a non-NULL
 * cookie (ip_mac_tx_cookie_t). This cookie identifies the specific Tx
 * ring that is blocked. The cookie is used to hash into an entry in the
 * idl_tx_list[] array. Each idl_tx_list_t points to a set of drain lists
 * (idl_t's) that store the blocked UDP connp's. The drain list is not a
 * single list but a configurable number of lists.
473 *
474 * The diagram below shows idl_tx_list_t's and their drain_lists. ip_stack_t
475 * has an array of idl_tx_list_t. The size of the array is TX_FANOUT_SIZE
476 * which is equal to 128. This array in turn contains a pointer to idl_t[],
477 * the ip drain list. The idl_t[] array size is MIN(max_ncpus, 8). The drain
478 * list will point to the list of connp's that are flow controlled.
479 *
480 * --------------- ------- ------- -------
481 * |->|drain_list[0]|-->|connp|-->|connp|-->|connp|-->
482 * | --------------- ------- ------- -------
483 * | --------------- ------- ------- -------
484 * |->|drain_list[1]|-->|connp|-->|connp|-->|connp|-->
485 * ---------------- | --------------- ------- ------- -------
486 * |idl_tx_list[0]|->| --------------- ------- ------- -------
487 * ---------------- |->|drain_list[2]|-->|connp|-->|connp|-->|connp|-->
488 * | --------------- ------- ------- -------
489 * . . . . .
490 * | --------------- ------- ------- -------
491 * |->|drain_list[n]|-->|connp|-->|connp|-->|connp|-->
492 * --------------- ------- ------- -------
493 * --------------- ------- ------- -------
494 * |->|drain_list[0]|-->|connp|-->|connp|-->|connp|-->
495 * | --------------- ------- ------- -------
496 * | --------------- ------- ------- -------
497 * ---------------- |->|drain_list[1]|-->|connp|-->|connp|-->|connp|-->
498 * |idl_tx_list[1]|->| --------------- ------- ------- -------
499 * ---------------- | . . . .
500 * | --------------- ------- ------- -------
501 * |->|drain_list[n]|-->|connp|-->|connp|-->|connp|-->
502 * --------------- ------- ------- -------
503 * .....
504 * ----------------
505 * |idl_tx_list[n]|-> ...
506 * ----------------
507 *
508 * When mac_tx() returns a cookie, the cookie is hashed into an index into
509 * ips_idl_tx_list[], and conn_drain_insert() is called with the idl_tx_list
510 * to insert the conn onto. conn_drain_insert() asserts flow control for the
511 * sockets via su_txq_full() (non-STREAMS) or QFULL on conn_wq (STREAMS).
512 * Further, conn_blocked is set to indicate that the conn is blocked.
513 *
514 * GLDv3 calls ill_flow_enable() when flow control is relieved. The cookie
515 * passed in the call to ill_flow_enable() identifies the blocked Tx ring and
516 * is again hashed to locate the appropriate idl_tx_list, which is then
517 * drained via conn_walk_drain(). conn_walk_drain() goes through each conn in
518 * the drain list and calls conn_drain_remove() to clear flow control (via
519 * calling su_txq_full() or clearing QFULL), and remove the conn from the
520 * drain list.
521 *
522 * Note that the drain list is not a single list but a (configurable) array of
523 * lists (8 elements by default). Synchronization between drain insertion and
524 * flow control wakeup is handled by using idl_txl->txl_lock, and only
525 * conn_drain_insert() and conn_drain_remove() manipulate the drain list.
526 *
527 * Flow control via STREAMS is used when ILL_DIRECT_CAPABLE() returns FALSE.
528 * On the send side, if the packet cannot be sent down to the driver by IP
529 * (canput() fails), ip_xmit() drops the packet and returns EWOULDBLOCK to the
530 * caller, who may then invoke ixa_check_drain_insert() to insert the conn on
531 * the 0'th drain list. When ip_wsrv() runs on the ill_wq because flow
532 * control has been relieved, the blocked conns in the 0'th drain list are
533 * drained as in the non-STREAMS case.
534 *
535 * In both the STREAMS and non-STREAMS cases, the sockfs upcall to set QFULL
536 * is done when the conn is inserted into the drain list (conn_drain_insert())
 * and cleared when the conn is removed from it (conn_drain_remove()).
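 *
 * A hedged sketch of the insertion path described above (the hash shown
 * is an illustration, not necessarily the exact kernel computation):
 *
 *	idl_tx_list_t *txl;
 *
 *	txl = &ipst->ips_idl_tx_list[cookie % TX_FANOUT_SIZE];
 *	conn_drain_insert(connp, txl);	connp is now flow controlled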
538 *
539 * IPQOS notes:
540 *
541 * IPQoS Policies are applied to packets using IPPF (IP Policy framework)
542 * and IPQoS modules. IPPF includes hooks in IP at different control points
543 * (callout positions) which direct packets to IPQoS modules for policy
544 * processing. Policies, if present, are global.
545 *
546 * The callout positions are located in the following paths:
547 * o local_in (packets destined for this host)
 * o local_out (packets originating from this host)
 * o fwd_in (packets forwarded by this machine - inbound)
 * o fwd_out (packets forwarded by this machine - outbound)
551 * Hooks at these callout points can be enabled/disabled using the ndd variable
552 * ip_policy_mask (a bit mask with the 4 LSB indicating the callout positions).
553 * By default all the callout positions are enabled.
554 *
555 * Outbound (local_out)
556 * Hooks are placed in ire_send_wire_v4 and ire_send_wire_v6.
557 *
558 * Inbound (local_in)
559 * Hooks are placed in ip_fanout_v4 and ip_fanout_v6.
560 *
561 * Forwarding (in and out)
562 * Hooks are placed in ire_recv_forward_v4/v6.
563 *
564 * IP Policy Framework processing (IPPF processing)
565 * Policy processing for a packet is initiated by ip_process, which ascertains
566 * that the classifier (ipgpc) is loaded and configured, failing which the
 * packet resumes normal processing in IP. If the classifier is present,
 * the packet is acted upon by one or more IPQoS modules (action
 * instances), per the filters configured in ipgpc, and resumes normal IP
 * processing thereafter. An action instance can drop a packet in the
 * course of its processing.
571 *
572 * Zones notes:
573 *
574 * The partitioning rules for networking are as follows:
575 * 1) Packets coming from a zone must have a source address belonging to that
576 * zone.
577 * 2) Packets coming from a zone can only be sent on a physical interface on
578 * which the zone has an IP address.
579 * 3) Between two zones on the same machine, packet delivery is only allowed if
580 * there's a matching route for the destination and zone in the forwarding
581 * table.
582 * 4) The TCP and UDP port spaces are per-zone; that is, two processes in
583 * different zones can bind to the same port with the wildcard address
584 * (INADDR_ANY).
585 *
586 * The granularity of interface partitioning is at the logical interface level.
587 * Therefore, every zone has its own IP addresses, and incoming packets can be
588 * attributed to a zone unambiguously. A logical interface is placed into a zone
589 * using the SIOCSLIFZONE ioctl; this sets the ipif_zoneid field in the ipif_t
590 * structure. Rule (1) is implemented by modifying the source address selection
591 * algorithm so that the list of eligible addresses is filtered based on the
592 * sending process zone.
593 *
594 * The Internet Routing Entries (IREs) are either exclusive to a zone or shared
595 * across all zones, depending on their type. Here is the break-up:
596 *
597 * IRE type Shared/exclusive
598 * -------- ----------------
599 * IRE_BROADCAST Exclusive
600 * IRE_DEFAULT (default routes) Shared (*)
601 * IRE_LOCAL Exclusive (x)
602 * IRE_LOOPBACK Exclusive
603 * IRE_PREFIX (net routes) Shared (*)
604 * IRE_IF_NORESOLVER (interface routes) Exclusive
605 * IRE_IF_RESOLVER (interface routes) Exclusive
606 * IRE_IF_CLONE (interface routes) Exclusive
607 * IRE_HOST (host routes) Shared (*)
608 *
609 * (*) A zone can only use a default or off-subnet route if the gateway is
610 * directly reachable from the zone, that is, if the gateway's address matches
611 * one of the zone's logical interfaces.
612 *
 * (x) IRE_LOCAL is handled a bit differently.
 * When ip_restrict_interzone_loopback is set (the default),
 * ire_route_recursive restricts loopback using an IRE_LOCAL between zones
 * to the case when L2 would have conceptually looped the packet back,
 * i.e. the loopback which is required since neither Ethernet drivers nor
 * Ethernet hardware loops them back. This is the case when the normal
 * routes (ignoring IREs with different zoneids) would send out the packet
 * on the same ill as the ill with which the IRE_LOCAL is associated.
621 *
622 * Multiple zones can share a common broadcast address; typically all zones
623 * share the 255.255.255.255 address. Incoming as well as locally originated
624 * broadcast packets must be dispatched to all the zones on the broadcast
625 * network. For directed broadcasts (e.g. 10.16.72.255) this is not trivial
626 * since some zones may not be on the 10.16.72/24 network. To handle this, each
627 * zone has its own set of IRE_BROADCAST entries; then, broadcast packets are
628 * sent to every zone that has an IRE_BROADCAST entry for the destination
629 * address on the input ill, see ip_input_broadcast().
630 *
631 * Applications in different zones can join the same multicast group address.
632 * The same logic applies for multicast as for broadcast. ip_input_multicast
633 * dispatches packets to all zones that have members on the physical interface.
634 */
635
636 /*
637 * Squeue Fanout flags:
638 * 0: No fanout.
639 * 1: Fanout across all squeues
640 */
641 boolean_t ip_squeue_fanout = 0;
642
643 /*
644 * Maximum dups allowed per packet.
645 */
646 uint_t ip_max_frag_dups = 10;
647
648 static int ip_open(queue_t *q, dev_t *devp, int flag, int sflag,
649 cred_t *credp, boolean_t isv6);
650 static mblk_t *ip_xmit_attach_llhdr(mblk_t *, nce_t *);
651
652 static boolean_t icmp_inbound_verify_v4(mblk_t *, icmph_t *, ip_recv_attr_t *);
653 static void icmp_inbound_too_big_v4(icmph_t *, ip_recv_attr_t *);
654 static void icmp_inbound_error_fanout_v4(mblk_t *, icmph_t *,
655 ip_recv_attr_t *);
656 static void icmp_options_update(ipha_t *);
657 static void icmp_param_problem(mblk_t *, uint8_t, ip_recv_attr_t *);
658 static void icmp_pkt(mblk_t *, void *, size_t, ip_recv_attr_t *);
659 static mblk_t *icmp_pkt_err_ok(mblk_t *, ip_recv_attr_t *);
660 static void icmp_redirect_v4(mblk_t *mp, ipha_t *, icmph_t *,
661 ip_recv_attr_t *);
662 static void icmp_send_redirect(mblk_t *, ipaddr_t, ip_recv_attr_t *);
663 static void icmp_send_reply_v4(mblk_t *, ipha_t *, icmph_t *,
664 ip_recv_attr_t *);
665
666 mblk_t *ip_dlpi_alloc(size_t, t_uscalar_t);
667 char *ip_dot_addr(ipaddr_t, char *);
668 mblk_t *ip_carve_mp(mblk_t **, ssize_t);
669 int ip_close(queue_t *, int);
670 static char *ip_dot_saddr(uchar_t *, char *);
671 static void ip_lrput(queue_t *, mblk_t *);
672 ipaddr_t ip_net_mask(ipaddr_t);
673 char *ip_nv_lookup(nv_t *, int);
674 void ip_rput(queue_t *, mblk_t *);
675 static void ip_rput_dlpi_writer(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp,
676 void *dummy_arg);
677 int ip_snmp_get(queue_t *, mblk_t *, int, boolean_t);
678 static mblk_t *ip_snmp_get_mib2_ip(queue_t *, mblk_t *,
679 mib2_ipIfStatsEntry_t *, ip_stack_t *, boolean_t);
680 static mblk_t *ip_snmp_get_mib2_ip_traffic_stats(queue_t *, mblk_t *,
681 ip_stack_t *, boolean_t);
682 static mblk_t *ip_snmp_get_mib2_ip6(queue_t *, mblk_t *, ip_stack_t *,
683 boolean_t);
684 static mblk_t *ip_snmp_get_mib2_icmp(queue_t *, mblk_t *, ip_stack_t *ipst);
685 static mblk_t *ip_snmp_get_mib2_icmp6(queue_t *, mblk_t *, ip_stack_t *ipst);
686 static mblk_t *ip_snmp_get_mib2_igmp(queue_t *, mblk_t *, ip_stack_t *ipst);
687 static mblk_t *ip_snmp_get_mib2_multi(queue_t *, mblk_t *, ip_stack_t *ipst);
688 static mblk_t *ip_snmp_get_mib2_ip_addr(queue_t *, mblk_t *,
689 ip_stack_t *ipst, boolean_t);
690 static mblk_t *ip_snmp_get_mib2_ip6_addr(queue_t *, mblk_t *,
691 ip_stack_t *ipst, boolean_t);
692 static mblk_t *ip_snmp_get_mib2_ip_group_src(queue_t *, mblk_t *,
693 ip_stack_t *ipst);
694 static mblk_t *ip_snmp_get_mib2_ip6_group_src(queue_t *, mblk_t *,
695 ip_stack_t *ipst);
696 static mblk_t *ip_snmp_get_mib2_ip_group_mem(queue_t *, mblk_t *,
697 ip_stack_t *ipst);
698 static mblk_t *ip_snmp_get_mib2_ip6_group_mem(queue_t *, mblk_t *,
699 ip_stack_t *ipst);
700 static mblk_t *ip_snmp_get_mib2_virt_multi(queue_t *, mblk_t *,
701 ip_stack_t *ipst);
702 static mblk_t *ip_snmp_get_mib2_multi_rtable(queue_t *, mblk_t *,
703 ip_stack_t *ipst);
704 static mblk_t *ip_snmp_get_mib2_ip_route_media(queue_t *, mblk_t *, int,
705 ip_stack_t *ipst);
706 static mblk_t *ip_snmp_get_mib2_ip6_route_media(queue_t *, mblk_t *, int,
707 ip_stack_t *ipst);
708 static void ip_snmp_get2_v4(ire_t *, iproutedata_t *);
709 static void ip_snmp_get2_v6_route(ire_t *, iproutedata_t *);
710 static int ip_snmp_get2_v4_media(ncec_t *, iproutedata_t *);
711 static int ip_snmp_get2_v6_media(ncec_t *, iproutedata_t *);
712 int ip_snmp_set(queue_t *, int, int, uchar_t *, int);
713
714 static mblk_t *ip_fragment_copyhdr(uchar_t *, int, int, ip_stack_t *,
715 mblk_t *);
716
717 static void conn_drain_init(ip_stack_t *);
718 static void conn_drain_fini(ip_stack_t *);
719 static void conn_drain(conn_t *connp, boolean_t closing);
720
721 static void conn_walk_drain(ip_stack_t *, idl_tx_list_t *);
722 static void conn_walk_sctp(pfv_t, void *, zoneid_t, netstack_t *);
723
724 static void *ip_stack_init(netstackid_t stackid, netstack_t *ns);
725 static void ip_stack_shutdown(netstackid_t stackid, void *arg);
726 static void ip_stack_fini(netstackid_t stackid, void *arg);
727
728 static int ip_multirt_apply_membership(int (*fn)(conn_t *, boolean_t,
729 const in6_addr_t *, ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *),
730 ire_t *, conn_t *, boolean_t, const in6_addr_t *, mcast_record_t,
731 const in6_addr_t *);
732
733 static int ip_squeue_switch(int);
734
735 static void *ip_kstat_init(netstackid_t, ip_stack_t *);
736 static void ip_kstat_fini(netstackid_t, kstat_t *);
737 static int ip_kstat_update(kstat_t *kp, int rw);
738 static void *icmp_kstat_init(netstackid_t);
739 static void icmp_kstat_fini(netstackid_t, kstat_t *);
740 static int icmp_kstat_update(kstat_t *kp, int rw);
741 static void *ip_kstat2_init(netstackid_t, ip_stat_t *);
742 static void ip_kstat2_fini(netstackid_t, kstat_t *);
743
744 static void ipobs_init(ip_stack_t *);
745 static void ipobs_fini(ip_stack_t *);
746
747 static int ip_tp_cpu_update(cpu_setup_t, int, void *);
748
749 ipaddr_t ip_g_all_ones = IP_HOST_MASK;
750
751 static long ip_rput_pullups;
752 int dohwcksum = 1; /* use h/w cksum if supported by the hardware */
753
754 vmem_t *ip_minor_arena_sa; /* for minor nos. from INET_MIN_DEV+2 thru 2^^18-1 */
755 vmem_t *ip_minor_arena_la; /* for minor nos. from 2^^18 thru 2^^32-1 */
756
757 int ip_debug;
758
759 /*
760 * Multirouting/CGTP stuff
761 */
762 int ip_cgtp_filter_rev = CGTP_FILTER_REV; /* CGTP hooks version */
763
764 /*
765 * IP tunables related declarations. Definitions are in ip_tunables.c
766 */
767 extern mod_prop_info_t ip_propinfo_tbl[];
768 extern int ip_propinfo_count;
769
770 /*
771 * Table of IP ioctls encoding the various properties of the ioctl and
772 * indexed based on the last byte of the ioctl command. Occasionally there
773 * is a clash, and there is more than 1 ioctl with the same last byte.
774 * In such a case 1 ioctl is encoded in the ndx table and the remaining
775 * ioctls are encoded in the misc table. An entry in the ndx table is
776 * retrieved by indexing on the last byte of the ioctl command and comparing
777 * the ioctl command with the value in the ndx table. In the event of a
778 * mismatch the misc table is then searched sequentially for the desired
779 * ioctl command.
780 *
781 * Entry: <command> <copyin_size> <flags> <cmd_type> <function> <restart_func>
782 */
783 ip_ioctl_cmd_t ip_ndx_ioctl_table[] = {
784 /* 000 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
785 /* 001 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
786 /* 002 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
787 /* 003 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
788 /* 004 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
789 /* 005 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
790 /* 006 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
791 /* 007 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
792 /* 008 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
793 /* 009 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
794
795 /* 010 */ { SIOCADDRT, sizeof (struct rtentry), IPI_PRIV,
796 MISC_CMD, ip_siocaddrt, NULL },
797 /* 011 */ { SIOCDELRT, sizeof (struct rtentry), IPI_PRIV,
798 MISC_CMD, ip_siocdelrt, NULL },
799
800 /* 012 */ { SIOCSIFADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
801 IF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart },
802 /* 013 */ { SIOCGIFADDR, sizeof (struct ifreq), IPI_GET_CMD,
803 IF_CMD, ip_sioctl_get_addr, NULL },
804
805 /* 014 */ { SIOCSIFDSTADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
806 IF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart },
807 /* 015 */ { SIOCGIFDSTADDR, sizeof (struct ifreq),
808 IPI_GET_CMD, IF_CMD, ip_sioctl_get_dstaddr, NULL },
809
810 /* 016 */ { SIOCSIFFLAGS, sizeof (struct ifreq),
811 IPI_PRIV | IPI_WR,
812 IF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart },
813 /* 017 */ { SIOCGIFFLAGS, sizeof (struct ifreq),
814 IPI_MODOK | IPI_GET_CMD,
815 IF_CMD, ip_sioctl_get_flags, NULL },
816
817 /* 018 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
818 /* 019 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
819
820 /* copyin size cannot be coded for SIOCGIFCONF */
821 /* 020 */ { O_SIOCGIFCONF, 0, IPI_GET_CMD,
822 MISC_CMD, ip_sioctl_get_ifconf, NULL },
823
824 /* 021 */ { SIOCSIFMTU, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
825 IF_CMD, ip_sioctl_mtu, NULL },
826 /* 022 */ { SIOCGIFMTU, sizeof (struct ifreq), IPI_GET_CMD,
827 IF_CMD, ip_sioctl_get_mtu, NULL },
828 /* 023 */ { SIOCGIFBRDADDR, sizeof (struct ifreq),
829 IPI_GET_CMD, IF_CMD, ip_sioctl_get_brdaddr, NULL },
830 /* 024 */ { SIOCSIFBRDADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
831 IF_CMD, ip_sioctl_brdaddr, NULL },
832 /* 025 */ { SIOCGIFNETMASK, sizeof (struct ifreq),
833 IPI_GET_CMD, IF_CMD, ip_sioctl_get_netmask, NULL },
834 /* 026 */ { SIOCSIFNETMASK, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
835 IF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart },
836 /* 027 */ { SIOCGIFMETRIC, sizeof (struct ifreq),
837 IPI_GET_CMD, IF_CMD, ip_sioctl_get_metric, NULL },
838 /* 028 */ { SIOCSIFMETRIC, sizeof (struct ifreq), IPI_PRIV,
839 IF_CMD, ip_sioctl_metric, NULL },
840 /* 029 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
841
842 /* See 166-168 below for extended SIOC*XARP ioctls */
843 /* 030 */ { SIOCSARP, sizeof (struct arpreq), IPI_PRIV | IPI_WR,
844 ARP_CMD, ip_sioctl_arp, NULL },
845 /* 031 */ { SIOCGARP, sizeof (struct arpreq), IPI_GET_CMD,
846 ARP_CMD, ip_sioctl_arp, NULL },
847 /* 032 */ { SIOCDARP, sizeof (struct arpreq), IPI_PRIV | IPI_WR,
848 ARP_CMD, ip_sioctl_arp, NULL },
849
850 /* 033 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
851 /* 034 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
852 /* 035 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
853 /* 036 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
854 /* 037 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
855 /* 038 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
856 /* 039 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
857 /* 040 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
858 /* 041 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
859 /* 042 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
860 /* 043 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
861 /* 044 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
862 /* 045 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
863 /* 046 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
864 /* 047 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
865 /* 048 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
866 /* 049 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
867 /* 050 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
868 /* 051 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
869 /* 052 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
870 /* 053 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
871
872 /* 054 */ { IF_UNITSEL, sizeof (int), IPI_PRIV | IPI_WR | IPI_MODOK,
873 MISC_CMD, if_unitsel, if_unitsel_restart },
874
875 /* 055 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
876 /* 056 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
877 /* 057 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
878 /* 058 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
879 /* 059 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
880 /* 060 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
881 /* 061 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
882 /* 062 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
883 /* 063 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
884 /* 064 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
885 /* 065 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
886 /* 066 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
887 /* 067 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
888 /* 068 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
889 /* 069 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
890 /* 070 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
891 /* 071 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
892 /* 072 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
893
894 /* 073 */ { SIOCSIFNAME, sizeof (struct ifreq),
895 IPI_PRIV | IPI_WR | IPI_MODOK,
896 IF_CMD, ip_sioctl_sifname, NULL },
897
898 /* 074 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
899 /* 075 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
900 /* 076 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
901 /* 077 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
902 /* 078 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
903 /* 079 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
904 /* 080 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
905 /* 081 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
906 /* 082 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
907 /* 083 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
908 /* 084 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
909 /* 085 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
910 /* 086 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
911
912 /* 087 */ { SIOCGIFNUM, sizeof (int), IPI_GET_CMD,
913 MISC_CMD, ip_sioctl_get_ifnum, NULL },
914 /* 088 */ { SIOCGIFMUXID, sizeof (struct ifreq), IPI_GET_CMD,
915 IF_CMD, ip_sioctl_get_muxid, NULL },
916 /* 089 */ { SIOCSIFMUXID, sizeof (struct ifreq),
917 IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_muxid, NULL },
918
919 /* Both if and lif variants share same func */
920 /* 090 */ { SIOCGIFINDEX, sizeof (struct ifreq), IPI_GET_CMD,
921 IF_CMD, ip_sioctl_get_lifindex, NULL },
922 /* Both if and lif variants share same func */
923 /* 091 */ { SIOCSIFINDEX, sizeof (struct ifreq),
924 IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_slifindex, NULL },
925
926 /* copyin size cannot be coded for SIOCGIFCONF */
927 /* 092 */ { SIOCGIFCONF, 0, IPI_GET_CMD,
928 MISC_CMD, ip_sioctl_get_ifconf, NULL },
929 /* 093 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
930 /* 094 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
931 /* 095 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
932 /* 096 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
933 /* 097 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
934 /* 098 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
935 /* 099 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
936 /* 100 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
937 /* 101 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
938 /* 102 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
939 /* 103 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
940 /* 104 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
941 /* 105 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
942 /* 106 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
943 /* 107 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
944 /* 108 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
945 /* 109 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
946
947 /* 110 */ { SIOCLIFREMOVEIF, sizeof (struct lifreq),
948 IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_removeif,
949 ip_sioctl_removeif_restart },
950 /* 111 */ { SIOCLIFADDIF, sizeof (struct lifreq),
951 IPI_GET_CMD | IPI_PRIV | IPI_WR,
952 LIF_CMD, ip_sioctl_addif, NULL },
953 #define SIOCLIFADDR_NDX 112
954 /* 112 */ { SIOCSLIFADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
955 LIF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart },
956 /* 113 */ { SIOCGLIFADDR, sizeof (struct lifreq),
957 IPI_GET_CMD, LIF_CMD, ip_sioctl_get_addr, NULL },
958 /* 114 */ { SIOCSLIFDSTADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
959 LIF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart },
960 /* 115 */ { SIOCGLIFDSTADDR, sizeof (struct lifreq),
961 IPI_GET_CMD, LIF_CMD, ip_sioctl_get_dstaddr, NULL },
962 /* 116 */ { SIOCSLIFFLAGS, sizeof (struct lifreq),
963 IPI_PRIV | IPI_WR,
964 LIF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart },
965 /* 117 */ { SIOCGLIFFLAGS, sizeof (struct lifreq),
966 IPI_GET_CMD | IPI_MODOK,
967 LIF_CMD, ip_sioctl_get_flags, NULL },
968
969 /* 118 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
970 /* 119 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
971
972 /* 120 */ { O_SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD,
973 ip_sioctl_get_lifconf, NULL },
974 /* 121 */ { SIOCSLIFMTU, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
975 LIF_CMD, ip_sioctl_mtu, NULL },
976 /* 122 */ { SIOCGLIFMTU, sizeof (struct lifreq), IPI_GET_CMD,
977 LIF_CMD, ip_sioctl_get_mtu, NULL },
978 /* 123 */ { SIOCGLIFBRDADDR, sizeof (struct lifreq),
979 IPI_GET_CMD, LIF_CMD, ip_sioctl_get_brdaddr, NULL },
980 /* 124 */ { SIOCSLIFBRDADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
981 LIF_CMD, ip_sioctl_brdaddr, NULL },
982 /* 125 */ { SIOCGLIFNETMASK, sizeof (struct lifreq),
983 IPI_GET_CMD, LIF_CMD, ip_sioctl_get_netmask, NULL },
984 /* 126 */ { SIOCSLIFNETMASK, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
985 LIF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart },
986 /* 127 */ { SIOCGLIFMETRIC, sizeof (struct lifreq),
987 IPI_GET_CMD, LIF_CMD, ip_sioctl_get_metric, NULL },
988 /* 128 */ { SIOCSLIFMETRIC, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
989 LIF_CMD, ip_sioctl_metric, NULL },
990 /* 129 */ { SIOCSLIFNAME, sizeof (struct lifreq),
991 IPI_PRIV | IPI_WR | IPI_MODOK,
992 LIF_CMD, ip_sioctl_slifname,
993 ip_sioctl_slifname_restart },
994
995 /* 130 */ { SIOCGLIFNUM, sizeof (struct lifnum), IPI_GET_CMD,
996 MISC_CMD, ip_sioctl_get_lifnum, NULL },
997 /* 131 */ { SIOCGLIFMUXID, sizeof (struct lifreq),
998 IPI_GET_CMD, LIF_CMD, ip_sioctl_get_muxid, NULL },
999 /* 132 */ { SIOCSLIFMUXID, sizeof (struct lifreq),
1000 IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_muxid, NULL },
1001 /* 133 */ { SIOCGLIFINDEX, sizeof (struct lifreq),
1002 IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lifindex, 0 },
1003 /* 134 */ { SIOCSLIFINDEX, sizeof (struct lifreq),
1004 IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_slifindex, 0 },
1005 /* 135 */ { SIOCSLIFTOKEN, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
1006 LIF_CMD, ip_sioctl_token, NULL },
1007 /* 136 */ { SIOCGLIFTOKEN, sizeof (struct lifreq),
1008 IPI_GET_CMD, LIF_CMD, ip_sioctl_get_token, NULL },
1009 /* 137 */ { SIOCSLIFSUBNET, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
1010 LIF_CMD, ip_sioctl_subnet, ip_sioctl_subnet_restart },
1011 /* 138 */ { SIOCGLIFSUBNET, sizeof (struct lifreq),
1012 IPI_GET_CMD, LIF_CMD, ip_sioctl_get_subnet, NULL },
1013 /* 139 */ { SIOCSLIFLNKINFO, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
1014 LIF_CMD, ip_sioctl_lnkinfo, NULL },
1015
1016 /* 140 */ { SIOCGLIFLNKINFO, sizeof (struct lifreq),
1017 IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lnkinfo, NULL },
1018 /* 141 */ { SIOCLIFDELND, sizeof (struct lifreq), IPI_PRIV,
1019 LIF_CMD, ip_siocdelndp_v6, NULL },
1020 /* 142 */ { SIOCLIFGETND, sizeof (struct lifreq), IPI_GET_CMD,
1021 LIF_CMD, ip_siocqueryndp_v6, NULL },
1022 /* 143 */ { SIOCLIFSETND, sizeof (struct lifreq), IPI_PRIV,
1023 LIF_CMD, ip_siocsetndp_v6, NULL },
1024 /* 144 */ { SIOCTMYADDR, sizeof (struct sioc_addrreq), IPI_GET_CMD,
1025 MISC_CMD, ip_sioctl_tmyaddr, NULL },
1026 /* 145 */ { SIOCTONLINK, sizeof (struct sioc_addrreq), IPI_GET_CMD,
1027 MISC_CMD, ip_sioctl_tonlink, NULL },
1028 /* 146 */ { SIOCTMYSITE, sizeof (struct sioc_addrreq), 0,
1029 MISC_CMD, ip_sioctl_tmysite, NULL },
1030 /* 147 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1031 /* 148 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
	/* IPsec ioctls are handled in ip_sioctl_copyin_setup itself */
1033 /* 149 */ { SIOCFIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
1034 /* 150 */ { SIOCSIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
1035 /* 151 */ { SIOCDIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
1036 /* 152 */ { SIOCLIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
1037
1038 /* 153 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1039
1040 /* 154 */ { SIOCGLIFBINDING, sizeof (struct lifreq), IPI_GET_CMD,
1041 LIF_CMD, ip_sioctl_get_binding, NULL },
1042 /* 155 */ { SIOCSLIFGROUPNAME, sizeof (struct lifreq),
1043 IPI_PRIV | IPI_WR,
1044 LIF_CMD, ip_sioctl_groupname, ip_sioctl_groupname },
1045 /* 156 */ { SIOCGLIFGROUPNAME, sizeof (struct lifreq),
1046 IPI_GET_CMD, LIF_CMD, ip_sioctl_get_groupname, NULL },
1047 /* 157 */ { SIOCGLIFGROUPINFO, sizeof (lifgroupinfo_t),
1048 IPI_GET_CMD, MISC_CMD, ip_sioctl_groupinfo, NULL },
1049
1050 /* Leave 158-160 unused; used to be SIOC*IFARP ioctls */
1051 /* 158 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1052 /* 159 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1053 /* 160 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1054
1055 /* 161 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1056
1057 /* These are handled in ip_sioctl_copyin_setup itself */
1058 /* 162 */ { SIOCGIP6ADDRPOLICY, 0, IPI_NULL_BCONT,
1059 MISC_CMD, NULL, NULL },
1060 /* 163 */ { SIOCSIP6ADDRPOLICY, 0, IPI_PRIV | IPI_NULL_BCONT,
1061 MISC_CMD, NULL, NULL },
1062 /* 164 */ { SIOCGDSTINFO, 0, IPI_GET_CMD, MISC_CMD, NULL, NULL },
1063
1064 /* 165 */ { SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD,
1065 ip_sioctl_get_lifconf, NULL },
1066
1067 /* 166 */ { SIOCSXARP, sizeof (struct xarpreq), IPI_PRIV | IPI_WR,
1068 XARP_CMD, ip_sioctl_arp, NULL },
1069 /* 167 */ { SIOCGXARP, sizeof (struct xarpreq), IPI_GET_CMD,
1070 XARP_CMD, ip_sioctl_arp, NULL },
1071 /* 168 */ { SIOCDXARP, sizeof (struct xarpreq), IPI_PRIV | IPI_WR,
1072 XARP_CMD, ip_sioctl_arp, NULL },
1073
1074 /* SIOCPOPSOCKFS is not handled by IP */
1075 /* 169 */ { IPI_DONTCARE /* SIOCPOPSOCKFS */, 0, 0, 0, NULL, NULL },
1076
1077 /* 170 */ { SIOCGLIFZONE, sizeof (struct lifreq),
1078 IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lifzone, NULL },
1079 /* 171 */ { SIOCSLIFZONE, sizeof (struct lifreq),
1080 IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_slifzone,
1081 ip_sioctl_slifzone_restart },
1082 /* 172-174 are SCTP ioctls and not handled by IP */
1083 /* 172 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1084 /* 173 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1085 /* 174 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1086 /* 175 */ { SIOCGLIFUSESRC, sizeof (struct lifreq),
1087 IPI_GET_CMD, LIF_CMD,
1088 ip_sioctl_get_lifusesrc, 0 },
1089 /* 176 */ { SIOCSLIFUSESRC, sizeof (struct lifreq),
1090 IPI_PRIV | IPI_WR,
1091 LIF_CMD, ip_sioctl_slifusesrc,
1092 NULL },
1093 /* 177 */ { SIOCGLIFSRCOF, 0, IPI_GET_CMD, MISC_CMD,
1094 ip_sioctl_get_lifsrcof, NULL },
1095 /* 178 */ { SIOCGMSFILTER, sizeof (struct group_filter), IPI_GET_CMD,
1096 MSFILT_CMD, ip_sioctl_msfilter, NULL },
1097 /* 179 */ { SIOCSMSFILTER, sizeof (struct group_filter), 0,
1098 MSFILT_CMD, ip_sioctl_msfilter, NULL },
1099 /* 180 */ { SIOCGIPMSFILTER, sizeof (struct ip_msfilter), IPI_GET_CMD,
1100 MSFILT_CMD, ip_sioctl_msfilter, NULL },
1101 /* 181 */ { SIOCSIPMSFILTER, sizeof (struct ip_msfilter), 0,
1102 MSFILT_CMD, ip_sioctl_msfilter, NULL },
1103 /* 182 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1104 /* SIOCSENABLESDP is handled by SDP */
1105 /* 183 */ { IPI_DONTCARE /* SIOCSENABLESDP */, 0, 0, 0, NULL, NULL },
1106 /* 184 */ { IPI_DONTCARE /* SIOCSQPTR */, 0, 0, 0, NULL, NULL },
1107 /* 185 */ { SIOCGIFHWADDR, sizeof (struct ifreq), IPI_GET_CMD,
1108 IF_CMD, ip_sioctl_get_ifhwaddr, NULL },
1109 /* 186 */ { IPI_DONTCARE /* SIOCGSTAMP */, 0, 0, 0, NULL, NULL },
1110 /* 187 */ { SIOCILB, 0, IPI_PRIV | IPI_GET_CMD, MISC_CMD,
1111 ip_sioctl_ilb_cmd, NULL },
1112 /* 188 */ { SIOCGETPROP, 0, IPI_GET_CMD, 0, NULL, NULL },
1113 /* 189 */ { SIOCSETPROP, 0, IPI_PRIV | IPI_WR, 0, NULL, NULL},
1114 /* 190 */ { SIOCGLIFDADSTATE, sizeof (struct lifreq),
1115 IPI_GET_CMD, LIF_CMD, ip_sioctl_get_dadstate, NULL },
1116 /* 191 */ { SIOCSLIFPREFIX, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
1117 LIF_CMD, ip_sioctl_prefix, ip_sioctl_prefix_restart },
1118 /* 192 */ { SIOCGLIFHWADDR, sizeof (struct lifreq), IPI_GET_CMD,
1119 LIF_CMD, ip_sioctl_get_lifhwaddr, NULL }
1120 };
1121
1122 int ip_ndx_ioctl_count = sizeof (ip_ndx_ioctl_table) / sizeof (ip_ioctl_cmd_t);
1123
1124 ip_ioctl_cmd_t ip_misc_ioctl_table[] = {
1125 { I_LINK, 0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
1126 { I_UNLINK, 0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
1127 { I_PLINK, 0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
1128 { I_PUNLINK, 0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
1129 { ND_GET, 0, 0, 0, NULL, NULL },
1130 { ND_SET, 0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
1131 { IP_IOCTL, 0, 0, 0, NULL, NULL },
1132 { SIOCGETVIFCNT, sizeof (struct sioc_vif_req), IPI_GET_CMD,
1133 MISC_CMD, mrt_ioctl},
1134 { SIOCGETSGCNT, sizeof (struct sioc_sg_req), IPI_GET_CMD,
1135 MISC_CMD, mrt_ioctl},
1136 { SIOCGETLSGCNT, sizeof (struct sioc_lsg_req), IPI_GET_CMD,
1137 MISC_CMD, mrt_ioctl}
1138 };
1139
1140 int ip_misc_ioctl_count =
1141 sizeof (ip_misc_ioctl_table) / sizeof (ip_ioctl_cmd_t);
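
/*
 * A hedged sketch of the two-stage lookup described above the ndx table
 * (illustrative only; the function name is hypothetical and the real
 * lookup lives in ip_if.c):
 *
 *	ip_ioctl_cmd_t *
 *	ip_ioctl_lookup_sketch(int ioc_cmd)
 *	{
 *		int ndx = ioc_cmd & 0xff;
 *
 *		if (ndx < ip_ndx_ioctl_count &&
 *		    ip_ndx_ioctl_table[ndx].ipi_cmd == ioc_cmd)
 *			return (&ip_ndx_ioctl_table[ndx]);
 *
 *		for (ndx = 0; ndx < ip_misc_ioctl_count; ndx++) {
 *			if (ip_misc_ioctl_table[ndx].ipi_cmd == ioc_cmd)
 *				return (&ip_misc_ioctl_table[ndx]);
 *		}
 *		return (NULL);
 *	}
 */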
1142
1143 int conn_drain_nthreads; /* Number of drainers reqd. */
1144 /* Settable in /etc/system */
1145 /* Defined in ip_ire.c */
1146 extern uint32_t ip_ire_max_bucket_cnt, ip6_ire_max_bucket_cnt;
1147 extern uint32_t ip_ire_min_bucket_cnt, ip6_ire_min_bucket_cnt;
1148 extern uint32_t ip_ire_mem_ratio, ip_ire_cpu_ratio;
1149
1150 static nv_t ire_nv_arr[] = {
1151 { IRE_BROADCAST, "BROADCAST" },
1152 { IRE_LOCAL, "LOCAL" },
1153 { IRE_LOOPBACK, "LOOPBACK" },
1154 { IRE_DEFAULT, "DEFAULT" },
1155 { IRE_PREFIX, "PREFIX" },
1156 { IRE_IF_NORESOLVER, "IF_NORESOL" },
1157 { IRE_IF_RESOLVER, "IF_RESOLV" },
1158 { IRE_IF_CLONE, "IF_CLONE" },
1159 { IRE_HOST, "HOST" },
1160 { IRE_MULTICAST, "MULTICAST" },
1161 { IRE_NOROUTE, "NOROUTE" },
1162 { 0 }
1163 };
1164
1165 nv_t *ire_nv_tbl = ire_nv_arr;
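
/*
 * Example use of the name table above (a sketch; an IRE type missing
 * from ire_nv_arr makes ip_nv_lookup() return NULL):
 *
 *	char *name = ip_nv_lookup(ire_nv_tbl, (int)ire->ire_type);
 *
 *	ip1dbg(("ire type: %s\n", name == NULL ? "unknown" : name));
 */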
1166
1167 /* Simple ICMP IP Header Template */
1168 static ipha_t icmp_ipha = {
1169 IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
1170 };
1171
1172 struct module_info ip_mod_info = {
1173 IP_MOD_ID, IP_MOD_NAME, IP_MOD_MINPSZ, IP_MOD_MAXPSZ, IP_MOD_HIWAT,
1174 IP_MOD_LOWAT
1175 };
1176
1177 /*
 * Duplicate static symbols within a module confuse mdb, so we avoid the
 * problem by making the symbols here distinct from those in udp.c.
1180 */
1181
1182 /*
1183 * Entry points for IP as a device and as a module.
1184 * We have separate open functions for the /dev/ip and /dev/ip6 devices.
1185 */
1186 static struct qinit iprinitv4 = {
1187 (pfi_t)ip_rput, NULL, ip_openv4, ip_close, NULL,
1188 &ip_mod_info
1189 };
1190
1191 struct qinit iprinitv6 = {
1192 (pfi_t)ip_rput_v6, NULL, ip_openv6, ip_close, NULL,
1193 &ip_mod_info
1194 };
1195
1196 static struct qinit ipwinit = {
1197 (pfi_t)ip_wput_nondata, (pfi_t)ip_wsrv, NULL, NULL, NULL,
1198 &ip_mod_info
1199 };
1200
1201 static struct qinit iplrinit = {
1202 (pfi_t)ip_lrput, NULL, ip_openv4, ip_close, NULL,
1203 &ip_mod_info
1204 };
1205
1206 static struct qinit iplwinit = {
1207 (pfi_t)ip_lwput, NULL, NULL, NULL, NULL,
1208 &ip_mod_info
1209 };
1210
1211 /* For AF_INET aka /dev/ip */
1212 struct streamtab ipinfov4 = {
1213 &iprinitv4, &ipwinit, &iplrinit, &iplwinit
1214 };
1215
1216 /* For AF_INET6 aka /dev/ip6 */
1217 struct streamtab ipinfov6 = {
1218 &iprinitv6, &ipwinit, &iplrinit, &iplwinit
1219 };
1220
1221 #ifdef DEBUG
1222 boolean_t skip_sctp_cksum = B_FALSE;
1223 #endif
1224
1225 /*
1226 * Generate an ICMP fragmentation needed message.
 * When called from the ip_output side, a minimal ip_recv_attr_t needs to
 * be constructed by the caller.
1229 */
1230 void
1231 icmp_frag_needed(mblk_t *mp, int mtu, ip_recv_attr_t *ira)
1232 {
1233 icmph_t icmph;
1234 ip_stack_t *ipst = ira->ira_ill->ill_ipst;
1235
1236 mp = icmp_pkt_err_ok(mp, ira);
1237 if (mp == NULL)
1238 return;
1239
1240 bzero(&icmph, sizeof (icmph_t));
1241 icmph.icmph_type = ICMP_DEST_UNREACHABLE;
1242 icmph.icmph_code = ICMP_FRAGMENTATION_NEEDED;
1243 icmph.icmph_du_mtu = htons((uint16_t)mtu);
1244 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutFragNeeded);
1245 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDestUnreachs);
1246
1247 icmp_pkt(mp, &icmph, sizeof (icmph_t), ira);
1248 }
1249
1250 /*
1251 * icmp_inbound_v4 deals with ICMP messages that are handled by IP.
1252 * If the ICMP message is consumed by IP, i.e., it should not be delivered
1253 * to any IPPROTO_ICMP raw sockets, then it returns NULL.
 * Likewise, if the ICMP error is malformed (too short, etc.), then it
1255 * returns NULL. The caller uses this to determine whether or not to send
1256 * to raw sockets.
1257 *
1258 * All error messages are passed to the matching transport stream.
1259 *
1260 * The following cases are handled by icmp_inbound:
 * 1) It needs to send a reply back and possibly deliver it
 * to the "interested" upper clients.
1263 * 2) Return the mblk so that the caller can pass it to the RAW socket clients.
1264 * 3) It needs to change some values in IP only.
 * 4) It needs to change some values in IP and upper layers, e.g. TCP,
1266 * by delivering an error to the upper layers.
1267 *
 * We handle the above four cases in the context of IPsec in the
 * following way:
1270 *
1271 * 1) Send the reply back in the same way as the request came in.
1272 * If it came in encrypted, it goes out encrypted. If it came in
1273 * clear, it goes out in clear. Thus, this will prevent chosen
 * plaintext attacks.
1275 * 2) The client may or may not expect things to come in secure.
1276 * If it comes in secure, the policy constraints are checked
1277 * before delivering it to the upper layers. If it comes in
1278 * clear, ipsec_inbound_accept_clear will decide whether to
1279 * accept this in clear or not. In both the cases, if the returned
1280 * message (IP header + 8 bytes) that caused the icmp message has
1281 * AH/ESP headers, it is sent up to AH/ESP for validation before
1282 * sending up. If there are only 8 bytes of returned message, then
1283 * upper client will not be notified.
 * 3) Check with global policy to see whether it matches the constraints.
1285 * But this will be done only if icmp_accept_messages_in_clear is
1286 * zero.
1287 * 4) If we need to change both in IP and ULP, then the decision taken
1288 * while affecting the values in IP and while delivering up to TCP
1289 * should be the same.
1290 *
1291 * There are two cases.
1292 *
1293 * a) If we reject data at the IP layer (ipsec_check_global_policy()
1294 * failed), we will not deliver it to the ULP, even though they
1295 * are *willing* to accept in *clear*. This is fine as our global
 *    disposition to icmp messages asks us to reject the datagram.
1297 *
1298 * b) If we accept data at the IP layer (ipsec_check_global_policy()
 *    succeeded or icmp_accept_messages_in_clear is 1), but are not able
 *    to deliver it to the ULP (policy failed), we can run into
 *    consistency problems. The cases known at this time are
 *    ICMP_DESTINATION_UNREACHABLE messages with the following code
 *    values:
1304 *
1305 * - ICMP_FRAGMENTATION_NEEDED : IP adapts to the new value
 *	and the upper layer rejects it. Then the communication will
 *	come to a stop. This is solved by making similar decisions
1308 * at both levels. Currently, when we are unable to deliver
1309 * to the Upper Layer (due to policy failures) while IP has
1310 * adjusted dce_pmtu, the next outbound datagram would
1311 * generate a local ICMP_FRAGMENTATION_NEEDED message - which
 *	will be with the right level of protection. Thus the right
 *	value will be communicated even if we could not communicate
 *	it when the ICMP error first arrived from the wire. But this
1315 * assumes there would be at least one outbound datagram after
1316 * IP has adjusted its dce_pmtu value. To make things
1317 * simpler, we accept in clear after the validation of
1318 * AH/ESP headers.
1319 *
1320 * - Other ICMP ERRORS : We may not be able to deliver it to the
1321 * upper layer depending on the level of protection the upper
1322 * layer expects and the disposition in ipsec_inbound_accept_clear().
1323 * ipsec_inbound_accept_clear() decides whether a given ICMP error
1324 * should be accepted in clear when the Upper layer expects secure.
1325 * Thus the communication may get aborted by some bad ICMP
1326 * packets.
1327 */
1328 mblk_t *
1329 icmp_inbound_v4(mblk_t *mp, ip_recv_attr_t *ira)
1330 {
1331 icmph_t *icmph;
1332 ipha_t *ipha; /* Outer header */
1333 int ip_hdr_length; /* Outer header length */
1334 boolean_t interested;
1335 ipif_t *ipif;
1336 uint32_t ts;
1337 uint32_t *tsp;
1338 timestruc_t now;
1339 ill_t *ill = ira->ira_ill;
1340 ip_stack_t *ipst = ill->ill_ipst;
1341 zoneid_t zoneid = ira->ira_zoneid;
1342 int len_needed;
1343 mblk_t *mp_ret = NULL;
1344
1345 ipha = (ipha_t *)mp->b_rptr;
1346
1347 BUMP_MIB(&ipst->ips_icmp_mib, icmpInMsgs);
1348
1349 ip_hdr_length = ira->ira_ip_hdr_length;
1350 if ((mp->b_wptr - mp->b_rptr) < (ip_hdr_length + ICMPH_SIZE)) {
1351 if (ira->ira_pktlen < (ip_hdr_length + ICMPH_SIZE)) {
1352 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
1353 ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
1354 freemsg(mp);
1355 return (NULL);
1356 }
1357 /* Last chance to get real. */
1358 ipha = ip_pullup(mp, ip_hdr_length + ICMPH_SIZE, ira);
1359 if (ipha == NULL) {
1360 BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors);
1361 freemsg(mp);
1362 return (NULL);
1363 }
1364 }
1365
1366 /* The IP header will always be a multiple of four bytes */
1367 icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
1368 ip2dbg(("icmp_inbound_v4: type %d code %d\n", icmph->icmph_type,
1369 icmph->icmph_code));
1370
1371 /*
1372 * We will set "interested" to "true" if we should pass a copy to
1373 * the transport or if we handle the packet locally.
1374 */
1375 interested = B_FALSE;
1376 switch (icmph->icmph_type) {
1377 case ICMP_ECHO_REPLY:
1378 BUMP_MIB(&ipst->ips_icmp_mib, icmpInEchoReps);
1379 break;
1380 case ICMP_DEST_UNREACHABLE:
1381 if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED)
1382 BUMP_MIB(&ipst->ips_icmp_mib, icmpInFragNeeded);
1383 interested = B_TRUE; /* Pass up to transport */
1384 BUMP_MIB(&ipst->ips_icmp_mib, icmpInDestUnreachs);
1385 break;
1386 case ICMP_SOURCE_QUENCH:
1387 interested = B_TRUE; /* Pass up to transport */
1388 BUMP_MIB(&ipst->ips_icmp_mib, icmpInSrcQuenchs);
1389 break;
1390 case ICMP_REDIRECT:
1391 if (!ipst->ips_ip_ignore_redirect)
1392 interested = B_TRUE;
1393 BUMP_MIB(&ipst->ips_icmp_mib, icmpInRedirects);
1394 break;
1395 case ICMP_ECHO_REQUEST:
1396 /*
1397 * Whether to respond to echo requests that come in as IP
1398 * broadcasts or as IP multicast is subject to debate
1399 * (what isn't?). We aim to please, you pick it.
1400 * Default is do it.
1401 */
1402 if (ira->ira_flags & IRAF_MULTICAST) {
1403 /* multicast: respond based on tunable */
1404 interested = ipst->ips_ip_g_resp_to_echo_mcast;
1405 } else if (ira->ira_flags & IRAF_BROADCAST) {
1406 /* broadcast: respond based on tunable */
1407 interested = ipst->ips_ip_g_resp_to_echo_bcast;
1408 } else {
1409 /* unicast: always respond */
1410 interested = B_TRUE;
1411 }
1412 BUMP_MIB(&ipst->ips_icmp_mib, icmpInEchos);
1413 if (!interested) {
1414 /* We never pass these to RAW sockets */
1415 freemsg(mp);
1416 return (NULL);
1417 }
1418
1419 /* Check db_ref to make sure we can modify the packet. */
1420 if (mp->b_datap->db_ref > 1) {
1421 mblk_t *mp1;
1422
1423 mp1 = copymsg(mp);
1424 freemsg(mp);
1425 if (!mp1) {
1426 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
1427 return (NULL);
1428 }
1429 mp = mp1;
1430 ipha = (ipha_t *)mp->b_rptr;
1431 icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
1432 }
1433 icmph->icmph_type = ICMP_ECHO_REPLY;
1434 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutEchoReps);
1435 icmp_send_reply_v4(mp, ipha, icmph, ira);
1436 return (NULL);
1437
1438 case ICMP_ROUTER_ADVERTISEMENT:
1439 case ICMP_ROUTER_SOLICITATION:
1440 break;
1441 case ICMP_TIME_EXCEEDED:
1442 interested = B_TRUE; /* Pass up to transport */
1443 BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimeExcds);
1444 break;
1445 case ICMP_PARAM_PROBLEM:
1446 interested = B_TRUE; /* Pass up to transport */
1447 BUMP_MIB(&ipst->ips_icmp_mib, icmpInParmProbs);
1448 break;
1449 case ICMP_TIME_STAMP_REQUEST:
1450 /* Response to Time Stamp Requests is local policy. */
1451 if (ipst->ips_ip_g_resp_to_timestamp) {
1452 if (ira->ira_flags & IRAF_MULTIBROADCAST)
1453 interested =
1454 ipst->ips_ip_g_resp_to_timestamp_bcast;
1455 else
1456 interested = B_TRUE;
1457 }
1458 if (!interested) {
1459 /* We never pass these to RAW sockets */
1460 freemsg(mp);
1461 return (NULL);
1462 }
1463
1464 /* Make sure we have enough of the packet */
1465 len_needed = ip_hdr_length + ICMPH_SIZE +
1466 3 * sizeof (uint32_t);
1467
1468 if (mp->b_wptr - mp->b_rptr < len_needed) {
1469 ipha = ip_pullup(mp, len_needed, ira);
1470 if (ipha == NULL) {
1471 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1472 ip_drop_input("ipIfStatsInDiscards - ip_pullup",
1473 mp, ill);
1474 freemsg(mp);
1475 return (NULL);
1476 }
1477 /* Refresh following the pullup. */
1478 icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
1479 }
1480 BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimestamps);
1481 /* Check db_ref to make sure we can modify the packet. */
1482 if (mp->b_datap->db_ref > 1) {
1483 mblk_t *mp1;
1484
1485 mp1 = copymsg(mp);
1486 freemsg(mp);
1487 if (!mp1) {
1488 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
1489 return (NULL);
1490 }
1491 mp = mp1;
1492 ipha = (ipha_t *)mp->b_rptr;
1493 icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
1494 }
1495 icmph->icmph_type = ICMP_TIME_STAMP_REPLY;
1496 tsp = (uint32_t *)&icmph[1];
1497 tsp++; /* Skip past 'originate time' */
1498 /* Compute # of milliseconds since midnight */
1499 gethrestime(&now);
1500 ts = (now.tv_sec % (24 * 60 * 60)) * 1000 +
1501 now.tv_nsec / (NANOSEC / MILLISEC);
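		/*
		 * For example, at 01:02:03.004 UTC gethrestime() yields
		 * tv_sec % (24 * 60 * 60) == 3723 and tv_nsec == 4000000,
		 * so ts == 3723 * 1000 + 4 == 3723004 ms since midnight.
		 */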
1502 *tsp++ = htonl(ts); /* Lay in 'receive time' */
1503 *tsp++ = htonl(ts); /* Lay in 'send time' */
1504 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutTimestampReps);
1505 icmp_send_reply_v4(mp, ipha, icmph, ira);
1506 return (NULL);
1507
1508 case ICMP_TIME_STAMP_REPLY:
1509 BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimestampReps);
1510 break;
1511 case ICMP_INFO_REQUEST:
1512 /* Per RFC 1122 3.2.2.7, ignore this. */
1513 case ICMP_INFO_REPLY:
1514 break;
1515 case ICMP_ADDRESS_MASK_REQUEST:
1516 if (ira->ira_flags & IRAF_MULTIBROADCAST) {
1517 interested =
1518 ipst->ips_ip_respond_to_address_mask_broadcast;
1519 } else {
1520 interested = B_TRUE;
1521 }
1522 if (!interested) {
1523 /* We never pass these to RAW sockets */
1524 freemsg(mp);
1525 return (NULL);
1526 }
1527 len_needed = ip_hdr_length + ICMPH_SIZE + IP_ADDR_LEN;
1528 if (mp->b_wptr - mp->b_rptr < len_needed) {
1529 ipha = ip_pullup(mp, len_needed, ira);
1530 if (ipha == NULL) {
1531 BUMP_MIB(ill->ill_ip_mib,
1532 ipIfStatsInTruncatedPkts);
1533 ip_drop_input("ipIfStatsInTruncatedPkts", mp,
1534 ill);
1535 freemsg(mp);
1536 return (NULL);
1537 }
1538 /* Refresh following the pullup. */
1539 icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
1540 }
1541 BUMP_MIB(&ipst->ips_icmp_mib, icmpInAddrMasks);
1542 /* Check db_ref to make sure we can modify the packet. */
1543 if (mp->b_datap->db_ref > 1) {
1544 mblk_t *mp1;
1545
1546 mp1 = copymsg(mp);
1547 freemsg(mp);
1548 if (!mp1) {
1549 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
1550 return (NULL);
1551 }
1552 mp = mp1;
1553 ipha = (ipha_t *)mp->b_rptr;
1554 icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
1555 }
1556 /*
 * The ipif whose mask we return must match the source
1558 * address of the mask reply. For unicast we have a specific
1559 * ipif. For multicast/broadcast we only handle onlink
1560 * senders, and use the source address to pick an ipif.
1561 */
1562 ipif = ipif_lookup_addr(ipha->ipha_dst, ill, zoneid, ipst);
1563 if (ipif == NULL) {
1564 /* Broadcast or multicast */
1565 ipif = ipif_lookup_remote(ill, ipha->ipha_src, zoneid);
1566 if (ipif == NULL) {
1567 freemsg(mp);
1568 return (NULL);
1569 }
1570 }
1571 icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY;
1572 bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN);
1573 ipif_refrele(ipif);
1574 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutAddrMaskReps);
1575 icmp_send_reply_v4(mp, ipha, icmph, ira);
1576 return (NULL);
1577
1578 case ICMP_ADDRESS_MASK_REPLY:
1579 BUMP_MIB(&ipst->ips_icmp_mib, icmpInAddrMaskReps);
1580 break;
1581 default:
1582 interested = B_TRUE; /* Pass up to transport */
1583 BUMP_MIB(&ipst->ips_icmp_mib, icmpInUnknowns);
1584 break;
1585 }
1586 /*
 * See if there is an ICMP client, so we can avoid an extra
 * copymsg/freemsg if there isn't one.
1589 */
1590 if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_ICMP].connf_head != NULL) {
1591 /* If there is an ICMP client and we want one too, copy it. */
1592
1593 if (!interested) {
1594 /* Caller will deliver to RAW sockets */
1595 return (mp);
1596 }
1597 mp_ret = copymsg(mp);
1598 if (mp_ret == NULL) {
1599 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1600 ip_drop_input("ipIfStatsInDiscards - copymsg", mp, ill);
1601 }
1602 } else if (!interested) {
1603 /* Neither we nor raw sockets are interested. Drop packet now */
1604 freemsg(mp);
1605 return (NULL);
1606 }
1607
1608 /*
1609 * ICMP error or redirect packet. Make sure we have enough of
1610 * the header and that db_ref == 1 since we might end up modifying
1611 * the packet.
1612 */
1613 if (mp->b_cont != NULL) {
1614 if (ip_pullup(mp, -1, ira) == NULL) {
1615 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1616 ip_drop_input("ipIfStatsInDiscards - ip_pullup",
1617 mp, ill);
1618 freemsg(mp);
1619 return (mp_ret);
1620 }
1621 }
1622
1623 if (mp->b_datap->db_ref > 1) {
1624 mblk_t *mp1;
1625
1626 mp1 = copymsg(mp);
1627 if (mp1 == NULL) {
1628 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1629 ip_drop_input("ipIfStatsInDiscards - copymsg", mp, ill);
1630 freemsg(mp);
1631 return (mp_ret);
1632 }
1633 freemsg(mp);
1634 mp = mp1;
1635 }
1636
1637 /*
 * In case mp has changed, verify the message before any further
 * processing.
1640 */
1641 ipha = (ipha_t *)mp->b_rptr;
1642 icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
1643 if (!icmp_inbound_verify_v4(mp, icmph, ira)) {
1644 freemsg(mp);
1645 return (mp_ret);
1646 }
1647
1648 switch (icmph->icmph_type) {
1649 case ICMP_REDIRECT:
1650 icmp_redirect_v4(mp, ipha, icmph, ira);
1651 break;
1652 case ICMP_DEST_UNREACHABLE:
1653 if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED) {
/* Update the DCE and adjust the MTU in the ICMP header if needed */
1655 icmp_inbound_too_big_v4(icmph, ira);
1656 }
1657 /* FALLTHRU */
1658 default:
1659 icmp_inbound_error_fanout_v4(mp, icmph, ira);
1660 break;
1661 }
1662 return (mp_ret);
1663 }
1664
1665 /*
1666 * Send an ICMP echo, timestamp or address mask reply.
1667 * The caller has already updated the payload part of the packet.
1668 * We handle the ICMP checksum, IP source address selection and feed
1669 * the packet into ip_output_simple.
1670 */
1671 static void
1672 icmp_send_reply_v4(mblk_t *mp, ipha_t *ipha, icmph_t *icmph,
1673 ip_recv_attr_t *ira)
1674 {
1675 uint_t ip_hdr_length = ira->ira_ip_hdr_length;
1676 ill_t *ill = ira->ira_ill;
1677 ip_stack_t *ipst = ill->ill_ipst;
1678 ip_xmit_attr_t ixas;
1679
1680 /* Send out an ICMP packet */
1681 icmph->icmph_checksum = 0;
1682 icmph->icmph_checksum = IP_CSUM(mp, ip_hdr_length, 0);
1683 /* Reset time to live. */
1684 ipha->ipha_ttl = ipst->ips_ip_def_ttl;
1685 {
1686 /* Swap source and destination addresses */
1687 ipaddr_t tmp;
1688
1689 tmp = ipha->ipha_src;
1690 ipha->ipha_src = ipha->ipha_dst;
1691 ipha->ipha_dst = tmp;
1692 }
1693 ipha->ipha_ident = 0;
1694 if (!IS_SIMPLE_IPH(ipha))
1695 icmp_options_update(ipha);
1696
1697 bzero(&ixas, sizeof (ixas));
1698 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
1699 ixas.ixa_zoneid = ira->ira_zoneid;
1700 ixas.ixa_cred = kcred;
1701 ixas.ixa_cpid = NOPID;
1702 ixas.ixa_tsl = ira->ira_tsl; /* Behave as a multi-level responder */
1703 ixas.ixa_ifindex = 0;
1704 ixas.ixa_ipst = ipst;
1705 ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1706
1707 if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) {
1708 /*
1709 * This packet should go out the same way as it
 * came in, i.e., in clear, independent of the IPsec policy
1711 * for transmitting packets.
1712 */
1713 ixas.ixa_flags |= IXAF_NO_IPSEC;
1714 } else {
1715 if (!ipsec_in_to_out(ira, &ixas, mp, ipha, NULL)) {
1716 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1717 /* Note: mp already consumed and ip_drop_packet done */
1718 return;
1719 }
1720 }
1721 if (ira->ira_flags & IRAF_MULTIBROADCAST) {
1722 /*
 * Not one of our addresses (IRE_LOCALs), thus we let
1724 * ip_output_simple pick the source.
1725 */
1726 ipha->ipha_src = INADDR_ANY;
1727 ixas.ixa_flags |= IXAF_SET_SOURCE;
1728 }
1729 /* Should we send with DF and use dce_pmtu? */
1730 if (ipst->ips_ipv4_icmp_return_pmtu) {
1731 ixas.ixa_flags |= IXAF_PMTU_DISCOVERY;
1732 ipha->ipha_fragment_offset_and_flags |= IPH_DF_HTONS;
1733 }
1734
1735 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutMsgs);
1736
1737 (void) ip_output_simple(mp, &ixas);
1738 ixa_cleanup(&ixas);
1739 }
1740
1741 /*
 * Verify the ICMP message, whether an ICMP error or a redirect packet.
 * The caller should have fully pulled up the message. If it's a redirect
 * packet, only basic checks on the IP header will be done; otherwise,
 * verify the packet by looking at the included ULP header.
1746 *
1747 * Called before icmp_inbound_error_fanout_v4 is called.
1748 */
1749 static boolean_t
1750 icmp_inbound_verify_v4(mblk_t *mp, icmph_t *icmph, ip_recv_attr_t *ira)
1751 {
1752 ill_t *ill = ira->ira_ill;
1753 int hdr_length;
1754 ip_stack_t *ipst = ira->ira_ill->ill_ipst;
1755 conn_t *connp;
1756 ipha_t *ipha; /* Inner IP header */
1757
1758 ipha = (ipha_t *)&icmph[1];
1759 if ((uchar_t *)ipha + IP_SIMPLE_HDR_LENGTH > mp->b_wptr)
1760 goto truncated;
1761
1762 hdr_length = IPH_HDR_LENGTH(ipha);
1763
1764 if ((IPH_HDR_VERSION(ipha) != IPV4_VERSION))
1765 goto discard_pkt;
1766
1767 if (hdr_length < sizeof (ipha_t))
1768 goto truncated;
1769
1770 if ((uchar_t *)ipha + hdr_length > mp->b_wptr)
1771 goto truncated;
1772
1773 /*
1774 * Stop here for ICMP_REDIRECT.
1775 */
1776 if (icmph->icmph_type == ICMP_REDIRECT)
1777 return (B_TRUE);
1778
1779 /*
1780 * ICMP errors only.
1781 */
1782 switch (ipha->ipha_protocol) {
1783 case IPPROTO_UDP:
1784 /*
1785 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
1786 * transport header.
1787 */
1788 if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
1789 mp->b_wptr)
1790 goto truncated;
1791 break;
1792 case IPPROTO_TCP: {
1793 tcpha_t *tcpha;
1794
1795 /*
1796 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
1797 * transport header.
1798 */
1799 if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
1800 mp->b_wptr)
1801 goto truncated;
1802
1803 tcpha = (tcpha_t *)((uchar_t *)ipha + hdr_length);
1804 connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcpha, TCPS_LISTEN,
1805 ipst);
1806 if (connp == NULL)
1807 goto discard_pkt;
1808
1809 if ((connp->conn_verifyicmp != NULL) &&
1810 !connp->conn_verifyicmp(connp, tcpha, icmph, NULL, ira)) {
1811 CONN_DEC_REF(connp);
1812 goto discard_pkt;
1813 }
1814 CONN_DEC_REF(connp);
1815 break;
1816 }
1817 case IPPROTO_SCTP:
1818 /*
1819 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
1820 * transport header.
1821 */
1822 if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
1823 mp->b_wptr)
1824 goto truncated;
1825 break;
1826 case IPPROTO_ESP:
1827 case IPPROTO_AH:
1828 break;
1829 case IPPROTO_ENCAP:
1830 if ((uchar_t *)ipha + hdr_length + sizeof (ipha_t) >
1831 mp->b_wptr)
1832 goto truncated;
1833 break;
1834 default:
1835 break;
1836 }
1837
1838 return (B_TRUE);
1839
1840 discard_pkt:
1841 /* Bogus ICMP error. */
1842 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1843 return (B_FALSE);
1844
1845 truncated:
/* We pulled up everything already. Must be truncated */
1847 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
1848 ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
1849 return (B_FALSE);
1850 }
1851
1852 /* Table from RFC 1191 */
1853 static int icmp_frag_size_table[] =
1854 { 32000, 17914, 8166, 4352, 2002, 1496, 1006, 508, 296, 68 };
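
/*
 * For example, an error that reports a zero MTU (or one below
 * ips_ip_pmtu_min) for a 1500 byte packet makes the loop in
 * icmp_inbound_too_big_v4() walk this table and, provided the old
 * MTU estimate was larger than 1500, pick 1496: the first plateau
 * strictly smaller than the packet length.
 */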
1855
1856 /*
1857 * Process received ICMP Packet too big.
1858 * Just handles the DCE create/update, including using the above table of
1859 * PMTU guesses. The caller is responsible for validating the packet before
 * passing it in and also for fanning out the ICMP error to any matching
 * transport conns. Assumes the message has been fully pulled up and verified.
1862 *
1863 * Before getting here, the caller has called icmp_inbound_verify_v4()
 * which should have checked with the ULP to prevent undoing the changes
 * we're going to make to the DCE. For example, TCP might have verified that
 * the packet which generated the error is in the send window.
1867 *
 * In some cases this function modifies the MTU in the ICMP header of the
 * packet; the caller should pass the packet to the matching ULP after this
 * returns.
1870 */
1871 static void
1872 icmp_inbound_too_big_v4(icmph_t *icmph, ip_recv_attr_t *ira)
1873 {
1874 dce_t *dce;
1875 int old_mtu;
1876 int mtu, orig_mtu;
1877 ipaddr_t dst;
1878 boolean_t disable_pmtud;
1879 ill_t *ill = ira->ira_ill;
1880 ip_stack_t *ipst = ill->ill_ipst;
1881 uint_t hdr_length;
1882 ipha_t *ipha;
1883
1884 /* Caller already pulled up everything. */
1885 ipha = (ipha_t *)&icmph[1];
1886 ASSERT(icmph->icmph_type == ICMP_DEST_UNREACHABLE &&
1887 icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED);
1888 ASSERT(ill != NULL);
1889
1890 hdr_length = IPH_HDR_LENGTH(ipha);
1891
1892 /*
1893 * We handle path MTU for source routed packets since the DCE
1894 * is looked up using the final destination.
1895 */
1896 dst = ip_get_dst(ipha);
1897
1898 dce = dce_lookup_and_add_v4(dst, ipst);
1899 if (dce == NULL) {
1900 /* Couldn't add a unique one - ENOMEM */
1901 ip1dbg(("icmp_inbound_too_big_v4: no dce for 0x%x\n",
1902 ntohl(dst)));
1903 return;
1904 }
1905
1906 /* Check for MTU discovery advice as described in RFC 1191 */
1907 mtu = ntohs(icmph->icmph_du_mtu);
1908 orig_mtu = mtu;
1909 disable_pmtud = B_FALSE;
1910
1911 mutex_enter(&dce->dce_lock);
1912 if (dce->dce_flags & DCEF_PMTU)
1913 old_mtu = dce->dce_pmtu;
1914 else
1915 old_mtu = ill->ill_mtu;
1916
1917 if (icmph->icmph_du_zero != 0 || mtu < ipst->ips_ip_pmtu_min) {
1918 uint32_t length;
1919 int i;
1920
1921 /*
1922 * Use the table from RFC 1191 to figure out
1923 * the next "plateau" based on the length in
1924 * the original IP packet.
1925 */
1926 length = ntohs(ipha->ipha_length);
1927 DTRACE_PROBE2(ip4__pmtu__guess, dce_t *, dce,
1928 uint32_t, length);
1929 if (old_mtu <= length &&
1930 old_mtu >= length - hdr_length) {
1931 /*
1932 * Handle broken BSD 4.2 systems that
1933 * return the wrong ipha_length in ICMP
1934 * errors.
1935 */
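			/*
			 * For example, if we sent a 1500 byte packet
			 * (old_mtu == 1500) and the sender echoed back
			 * ipha_length == 1520 by including its own 20 byte
			 * header, then 1500 lies within [1520 - 20, 1520]
			 * and we trim the length back to 1500 before the
			 * table lookup below, which then yields 1496.
			 */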
1936 ip1dbg(("Wrong mtu: sent %d, dce %d\n",
1937 length, old_mtu));
1938 length -= hdr_length;
1939 }
1940 for (i = 0; i < A_CNT(icmp_frag_size_table); i++) {
1941 if (length > icmp_frag_size_table[i])
1942 break;
1943 }
1944 if (i == A_CNT(icmp_frag_size_table)) {
1945 /* Smaller than IP_MIN_MTU! */
1946 ip1dbg(("Too big for packet size %d\n",
1947 length));
1948 disable_pmtud = B_TRUE;
1949 mtu = ipst->ips_ip_pmtu_min;
1950 } else {
1951 mtu = icmp_frag_size_table[i];
1952 ip1dbg(("Calculated mtu %d, packet size %d, "
1953 "before %d\n", mtu, length, old_mtu));
1954 if (mtu < ipst->ips_ip_pmtu_min) {
1955 mtu = ipst->ips_ip_pmtu_min;
1956 disable_pmtud = B_TRUE;
1957 }
1958 }
1959 }
1960 if (disable_pmtud)
1961 dce->dce_flags |= DCEF_TOO_SMALL_PMTU;
1962 else
1963 dce->dce_flags &= ~DCEF_TOO_SMALL_PMTU;
1964
1965 dce->dce_pmtu = MIN(old_mtu, mtu);
1966 /* Prepare to send the new max frag size for the ULP. */
1967 icmph->icmph_du_zero = 0;
1968 icmph->icmph_du_mtu = htons((uint16_t)dce->dce_pmtu);
1969 DTRACE_PROBE4(ip4__pmtu__change, icmph_t *, icmph, dce_t *,
1970 dce, int, orig_mtu, int, mtu);
1971
1972 /* We now have a PMTU for sure */
1973 dce->dce_flags |= DCEF_PMTU;
1974 dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
1975 mutex_exit(&dce->dce_lock);
1976 /*
1977 * After dropping the lock the new value is visible to everyone.
1978 * Then we bump the generation number so any cached values reinspect
1979 * the dce_t.
1980 */
1981 dce_increment_generation(dce);
1982 dce_refrele(dce);
1983 }
1984
1985 /*
1986 * If the packet in error is Self-Encapsulated, icmp_inbound_error_fanout_v4
1987 * calls this function.
1988 */
1989 static mblk_t *
1990 icmp_inbound_self_encap_error_v4(mblk_t *mp, ipha_t *ipha, ipha_t *in_ipha)
1991 {
1992 int length;
1993
1994 ASSERT(mp->b_datap->db_type == M_DATA);
1995
1996 /* icmp_inbound_v4 has already pulled up the whole error packet */
1997 ASSERT(mp->b_cont == NULL);
1998
1999 /*
2000 * The length that we want to overlay is the inner header
2001 * and what follows it.
2002 */
2003 length = msgdsize(mp) - ((uchar_t *)in_ipha - mp->b_rptr);
2004
2005 /*
2006 * Overlay the inner header and whatever follows it over the
2007 * outer header.
2008 */
2009 bcopy((uchar_t *)in_ipha, (uchar_t *)ipha, length);
2010
2011 /* Adjust for what we removed */
2012 mp->b_wptr -= (uchar_t *)in_ipha - (uchar_t *)ipha;
2013 return (mp);
2014 }
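
/*
 * Illustration of the overlay performed above (the IP and ICMP headers
 * of the error message itself precede this and are unchanged):
 *
 *	before:	| ipha | in_ipha | ULP header and data |
 *	after:	| in_ipha | ULP header and data |
 *
 * i.e. one level of IPPROTO_ENCAP encapsulation is stripped so that the
 * fanout code can treat the inner packet as the packet in error.
 */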
2015
2016 /*
2017 * Try to pass the ICMP message upstream in case the ULP cares.
2018 *
2019 * If the packet that caused the ICMP error is secure, we send
2020 * it to AH/ESP to make sure that the attached packet has a
2021 * valid association. ipha in the code below points to the
2022 * IP header of the packet that caused the error.
2023 *
2024 * For IPsec cases, we let the next-layer-up (which has access to
2025 * cached policy on the conn_t, or can query the SPD directly)
2026 * subtract out any IPsec overhead if they must. We therefore make no
2027 * adjustments here for IPsec overhead.
2028 *
 * An IFN (ICMP fragmentation needed message) could have been generated
 * locally or by some router.
2030 *
2031 * LOCAL : ire_send_wire (before calling ipsec_out_process) can call
 *	   icmp_frag_needed/icmp_pkt2big_v6 to generate a local IFN.
2033 * This happens because IP adjusted its value of MTU on an
 *	   earlier IFN message and could not tell the upper layer
 *	   the new adjusted value of the MTU, e.g., because the packet
 *	   was encrypted or there was not enough information to fanout
 *	   to upper layers. Thus on the next outbound datagram, ire_send_wire
2038 * generates the IFN, where IPsec processing has *not* been
2039 * done.
2040 *
 * Note that we retain ixa_fragsize across IPsec, thus once
 * we have picked ixa_fragsize and entered ipsec_out_process we do
 * not change the fragsize even if the path MTU changes before
2044 * we reach ip_output_post_ipsec.
2045 *
2046 * In the local case, IRAF_LOOPBACK will be set indicating
2047 * that IFN was generated locally.
2048 *
2049 * ROUTER : IFN could be secure or non-secure.
2050 *
2051 * * SECURE : We use the IPSEC_IN to fanout to AH/ESP if the
2052 * packet in error has AH/ESP headers to validate the AH/ESP
2053 * headers. AH/ESP will verify whether there is a valid SA or
2054 * not and send it back. We will fanout again if we have more
2055 * data in the packet.
2056 *
2057 * If the packet in error does not have AH/ESP, we handle it
2058 * like any other case.
2059 *
2060 * * NON_SECURE : If the packet in error has AH/ESP headers, we send it
2061 * up to AH/ESP for validation. AH/ESP will verify whether there is a
2062 * valid SA or not and send it back. We will fanout again if
2063 * we have more data in the packet.
2064 *
2065 * If the packet in error does not have AH/ESP, we handle it
2066 * like any other case.
2067 *
2068 * The caller must have called icmp_inbound_verify_v4.
2069 */
2070 static void
2071 icmp_inbound_error_fanout_v4(mblk_t *mp, icmph_t *icmph, ip_recv_attr_t *ira)
2072 {
2073 uint16_t *up; /* Pointer to ports in ULP header */
2074 uint32_t ports; /* reversed ports for fanout */
2075 ipha_t ripha; /* With reversed addresses */
2076 ipha_t *ipha; /* Inner IP header */
2077 uint_t hdr_length; /* Inner IP header length */
2078 tcpha_t *tcpha;
2079 conn_t *connp;
2080 ill_t *ill = ira->ira_ill;
2081 ip_stack_t *ipst = ill->ill_ipst;
2082 ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec;
2083 ill_t *rill = ira->ira_rill;
2084
2085 /* Caller already pulled up everything. */
2086 ipha = (ipha_t *)&icmph[1];
2087 ASSERT((uchar_t *)&ipha[1] <= mp->b_wptr);
2088 ASSERT(mp->b_cont == NULL);
2089
2090 hdr_length = IPH_HDR_LENGTH(ipha);
2091 ira->ira_protocol = ipha->ipha_protocol;
2092
2093 /*
2094 * We need a separate IP header with the source and destination
2095 * addresses reversed to do fanout/classification because the ipha in
2096 * the ICMP error is in the form we sent it out.
2097 */
2098 ripha.ipha_src = ipha->ipha_dst;
2099 ripha.ipha_dst = ipha->ipha_src;
2100 ripha.ipha_protocol = ipha->ipha_protocol;
2101 ripha.ipha_version_and_hdr_length = ipha->ipha_version_and_hdr_length;
2102
2103 ip2dbg(("icmp_inbound_error_v4: proto %d %x to %x: %d/%d\n",
2104 ripha.ipha_protocol, ntohl(ipha->ipha_src),
2105 ntohl(ipha->ipha_dst),
2106 icmph->icmph_type, icmph->icmph_code));
2107
2108 switch (ipha->ipha_protocol) {
2109 case IPPROTO_UDP:
2110 up = (uint16_t *)((uchar_t *)ipha + hdr_length);
2111
2112 /* Attempt to find a client stream based on port. */
2113 ip2dbg(("icmp_inbound_error_v4: UDP ports %d to %d\n",
2114 ntohs(up[0]), ntohs(up[1])));
2115
2116 /* Note that we send error to all matches. */
2117 ira->ira_flags |= IRAF_ICMP_ERROR;
2118 ip_fanout_udp_multi_v4(mp, &ripha, up[0], up[1], ira);
2119 ira->ira_flags &= ~IRAF_ICMP_ERROR;
2120 return;
2121
2122 case IPPROTO_TCP:
2123 /*
2124 * Find a TCP client stream for this packet.
2125 * Note that we do a reverse lookup since the header is
2126 * in the form we sent it out.
2127 */
2128 tcpha = (tcpha_t *)((uchar_t *)ipha + hdr_length);
2129 connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcpha, TCPS_LISTEN,
2130 ipst);
2131 if (connp == NULL)
2132 goto discard_pkt;
2133
2134 if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) ||
2135 (ira->ira_flags & IRAF_IPSEC_SECURE)) {
2136 mp = ipsec_check_inbound_policy(mp, connp,
2137 ipha, NULL, ira);
2138 if (mp == NULL) {
2139 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2140 /* Note that mp is NULL */
2141 ip_drop_input("ipIfStatsInDiscards", mp, ill);
2142 CONN_DEC_REF(connp);
2143 return;
2144 }
2145 }
2146
2147 ira->ira_flags |= IRAF_ICMP_ERROR;
2148 ira->ira_ill = ira->ira_rill = NULL;
2149 if (IPCL_IS_TCP(connp)) {
2150 SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
2151 connp->conn_recvicmp, connp, ira, SQ_FILL,
2152 SQTAG_TCP_INPUT_ICMP_ERR);
2153 } else {
2154 /* Not TCP; must be SOCK_RAW, IPPROTO_TCP */
2155 (connp->conn_recv)(connp, mp, NULL, ira);
2156 CONN_DEC_REF(connp);
2157 }
2158 ira->ira_ill = ill;
2159 ira->ira_rill = rill;
2160 ira->ira_flags &= ~IRAF_ICMP_ERROR;
2161 return;
2162
2163 case IPPROTO_SCTP:
2164 up = (uint16_t *)((uchar_t *)ipha + hdr_length);
2165 /* Find a SCTP client stream for this packet. */
2166 ((uint16_t *)&ports)[0] = up[1];
2167 ((uint16_t *)&ports)[1] = up[0];
2168
2169 ira->ira_flags |= IRAF_ICMP_ERROR;
2170 ip_fanout_sctp(mp, &ripha, NULL, ports, ira);
2171 ira->ira_flags &= ~IRAF_ICMP_ERROR;
2172 return;
2173
2174 case IPPROTO_ESP:
2175 case IPPROTO_AH:
2176 if (!ipsec_loaded(ipss)) {
2177 ip_proto_not_sup(mp, ira);
2178 return;
2179 }
2180
2181 if (ipha->ipha_protocol == IPPROTO_ESP)
2182 mp = ipsecesp_icmp_error(mp, ira);
2183 else
2184 mp = ipsecah_icmp_error(mp, ira);
2185 if (mp == NULL)
2186 return;
2187
2188 /* Just in case ipsec didn't preserve the NULL b_cont */
2189 if (mp->b_cont != NULL) {
2190 if (!pullupmsg(mp, -1))
2191 goto discard_pkt;
2192 }
2193
2194 /*
2195 * Note that ira_pktlen and ira_ip_hdr_length are no longer
2196 * correct, but we don't use them any more here.
2197 *
 * If successful, the mp has been modified to not include
2199 * the ESP/AH header so we can fanout to the ULP's icmp
2200 * error handler.
2201 */
2202 if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH)
2203 goto truncated;
2204
/* Verify the modified message before any further processing. */
2206 ipha = (ipha_t *)mp->b_rptr;
2207 hdr_length = IPH_HDR_LENGTH(ipha);
2208 icmph = (icmph_t *)&mp->b_rptr[hdr_length];
2209 if (!icmp_inbound_verify_v4(mp, icmph, ira)) {
2210 freemsg(mp);
2211 return;
2212 }
2213
2214 icmp_inbound_error_fanout_v4(mp, icmph, ira);
2215 return;
2216
2217 case IPPROTO_ENCAP: {
2218 /* Look for self-encapsulated packets that caused an error */
2219 ipha_t *in_ipha;
2220
2221 /*
 * The caller has verified that the length is at least
 * the size of an IP header.
2224 */
2225 ASSERT(hdr_length >= sizeof (ipha_t));
2226 /*
2227 * Check the sanity of the inner IP header like
2228 * we did for the outer header.
2229 */
2230 in_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length);
2231 if ((IPH_HDR_VERSION(in_ipha) != IPV4_VERSION)) {
2232 goto discard_pkt;
2233 }
2234 if (IPH_HDR_LENGTH(in_ipha) < sizeof (ipha_t)) {
2235 goto discard_pkt;
2236 }
2237 /* Check for Self-encapsulated tunnels */
2238 if (in_ipha->ipha_src == ipha->ipha_src &&
2239 in_ipha->ipha_dst == ipha->ipha_dst) {
2240
2241 mp = icmp_inbound_self_encap_error_v4(mp, ipha,
2242 in_ipha);
2243 if (mp == NULL)
2244 goto discard_pkt;
2245
2246 /*
2247 * Just in case self_encap didn't preserve the NULL
2248 * b_cont
2249 */
2250 if (mp->b_cont != NULL) {
2251 if (!pullupmsg(mp, -1))
2252 goto discard_pkt;
2253 }
2254 /*
2255 * Note that ira_pktlen and ira_ip_hdr_length are no
2256 * longer correct, but we don't use them any more here.
2257 */
2258 if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH)
2259 goto truncated;
2260
2261 /*
2262 * Verify the modified message before any further
 * processing.
2264 */
2265 ipha = (ipha_t *)mp->b_rptr;
2266 hdr_length = IPH_HDR_LENGTH(ipha);
2267 icmph = (icmph_t *)&mp->b_rptr[hdr_length];
2268 if (!icmp_inbound_verify_v4(mp, icmph, ira)) {
2269 freemsg(mp);
2270 return;
2271 }
2272
2273 /*
 * The packet in error is self-encapsulated, yet we
 * found it further encapsulated, which we could not
 * possibly have generated.
2277 */
2278 if (ipha->ipha_protocol == IPPROTO_ENCAP) {
2279 goto discard_pkt;
2280 }
2281 icmp_inbound_error_fanout_v4(mp, icmph, ira);
2282 return;
2283 }
/* Not self-encapsulated */
2285 /* FALLTHRU */
2286 }
2287 case IPPROTO_IPV6:
2288 if ((connp = ipcl_iptun_classify_v4(&ripha.ipha_src,
2289 &ripha.ipha_dst, ipst)) != NULL) {
2290 ira->ira_flags |= IRAF_ICMP_ERROR;
2291 connp->conn_recvicmp(connp, mp, NULL, ira);
2292 CONN_DEC_REF(connp);
2293 ira->ira_flags &= ~IRAF_ICMP_ERROR;
2294 return;
2295 }
2296 /*
 * No IP tunnel is interested; fall through and see
 * if a raw socket will want it.
2299 */
2300 /* FALLTHRU */
2301 default:
2302 ira->ira_flags |= IRAF_ICMP_ERROR;
2303 ip_fanout_proto_v4(mp, &ripha, ira);
2304 ira->ira_flags &= ~IRAF_ICMP_ERROR;
2305 return;
2306 }
2307 /* NOTREACHED */
2308 discard_pkt:
2309 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2310 ip1dbg(("icmp_inbound_error_fanout_v4: drop pkt\n"));
2311 ip_drop_input("ipIfStatsInDiscards", mp, ill);
2312 freemsg(mp);
2313 return;
2314
2315 truncated:
/* We pulled up everything already. Must be truncated */
2317 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
2318 ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
2319 freemsg(mp);
2320 }
2321
2322 /*
2323 * Common IP options parser.
2324 *
2325 * Setup routine: fill in *optp with options-parsing state, then
2326 * tail-call ipoptp_next to return the first option.
2327 */
2328 uint8_t
2329 ipoptp_first(ipoptp_t *optp, ipha_t *ipha)
2330 {
2331 uint32_t totallen; /* total length of all options */
2332
2333 totallen = ipha->ipha_version_and_hdr_length -
2334 (uint8_t)((IP_VERSION << 4) + IP_SIMPLE_HDR_LENGTH_IN_WORDS);
2335 totallen <<= 2;
2336 optp->ipoptp_next = (uint8_t *)(&ipha[1]);
2337 optp->ipoptp_end = optp->ipoptp_next + totallen;
2338 optp->ipoptp_flags = 0;
2339 return (ipoptp_next(optp));
2340 }
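
/*
 * For example, with ipha_version_and_hdr_length == 0x47 (IPv4, IHL of
 * seven words) the computation above yields 0x47 - 0x45 == 2 words,
 * i.e. totallen == 8 bytes of options after the 20 byte base header.
 */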
2341
2342 /* Like above but without an ipha_t */
2343 uint8_t
2344 ipoptp_first2(ipoptp_t *optp, uint32_t totallen, uint8_t *opt)
2345 {
2346 optp->ipoptp_next = opt;
2347 optp->ipoptp_end = optp->ipoptp_next + totallen;
2348 optp->ipoptp_flags = 0;
2349 return (ipoptp_next(optp));
2350 }
2351
2352 /*
2353 * Common IP options parser: extract next option.
2354 */
2355 uint8_t
2356 ipoptp_next(ipoptp_t *optp)
2357 {
2358 uint8_t *end = optp->ipoptp_end;
2359 uint8_t *cur = optp->ipoptp_next;
2360 uint8_t opt, len, pointer;
2361
2362 /*
2363 * If cur > end already, then the ipoptp_end or ipoptp_next pointer
2364 * has been corrupted.
2365 */
2366 ASSERT(cur <= end);
2367
2368 if (cur == end)
2369 return (IPOPT_EOL);
2370
2371 opt = cur[IPOPT_OPTVAL];
2372
2373 /*
2374 * Skip any NOP options.
2375 */
2376 while (opt == IPOPT_NOP) {
2377 cur++;
2378 if (cur == end)
2379 return (IPOPT_EOL);
2380 opt = cur[IPOPT_OPTVAL];
2381 }
2382
2383 if (opt == IPOPT_EOL)
2384 return (IPOPT_EOL);
2385
2386 /*
2387 * Option requiring a length.
2388 */
2389 if ((cur + 1) >= end) {
2390 optp->ipoptp_flags |= IPOPTP_ERROR;
2391 return (IPOPT_EOL);
2392 }
2393 len = cur[IPOPT_OLEN];
2394 if (len < 2) {
2395 optp->ipoptp_flags |= IPOPTP_ERROR;
2396 return (IPOPT_EOL);
2397 }
2398 optp->ipoptp_cur = cur;
2399 optp->ipoptp_len = len;
2400 optp->ipoptp_next = cur + len;
2401 if (cur + len > end) {
2402 optp->ipoptp_flags |= IPOPTP_ERROR;
2403 return (IPOPT_EOL);
2404 }
2405
2406 /*
2407 * For the options which require a pointer field, make sure
 * it's there, and make sure it points to either something
2409 * inside this option, or the end of the option.
2410 */
2411 switch (opt) {
2412 case IPOPT_RR:
2413 case IPOPT_TS:
2414 case IPOPT_LSRR:
2415 case IPOPT_SSRR:
2416 if (len <= IPOPT_OFFSET) {
2417 optp->ipoptp_flags |= IPOPTP_ERROR;
2418 return (opt);
2419 }
2420 pointer = cur[IPOPT_OFFSET];
2421 if (pointer - 1 > len) {
2422 optp->ipoptp_flags |= IPOPTP_ERROR;
2423 return (opt);
2424 }
2425 break;
2426 }
2427
2428 /*
2429 * Sanity check the pointer field based on the type of the
2430 * option.
2431 */
2432 switch (opt) {
2433 case IPOPT_RR:
2434 case IPOPT_SSRR:
2435 case IPOPT_LSRR:
2436 if (pointer < IPOPT_MINOFF_SR)
2437 optp->ipoptp_flags |= IPOPTP_ERROR;
2438 break;
2439 case IPOPT_TS:
2440 if (pointer < IPOPT_MINOFF_IT)
2441 optp->ipoptp_flags |= IPOPTP_ERROR;
2442 /*
2443 * Note that the Internet Timestamp option also
2444 * contains two four bit fields (the Overflow field,
2445 * and the Flag field), which follow the pointer
2446 * field. We don't need to check that these fields
2447 * fall within the length of the option because this
 * was implicitly done above. We've checked that the
2449 * pointer value is at least IPOPT_MINOFF_IT, and that
2450 * it falls within the option. Since IPOPT_MINOFF_IT >
2451 * IPOPT_POS_OV_FLG, we don't need the explicit check.
2452 */
2453 ASSERT(len > IPOPT_POS_OV_FLG);
2454 break;
2455 }
2456
2457 return (opt);
2458 }
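
/*
 * The canonical iteration idiom for this parser, as used by
 * icmp_options_update() and ip_opt_get_user() below:
 *
 *	ipoptp_t	opts;
 *	uint8_t		optval;
 *
 *	for (optval = ipoptp_first(&opts, ipha);
 *	    optval != IPOPT_EOL;
 *	    optval = ipoptp_next(&opts)) {
 *		opt = opts.ipoptp_cur;
 *		if (opts.ipoptp_flags & IPOPTP_ERROR)
 *			break;		(or otherwise handle the bad option)
 *		... inspect opt and opts.ipoptp_len ...
 *	}
 */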
2459
2460 /*
2461 * Use the outgoing IP header to create an IP_OPTIONS option the way
2462 * it was passed down from the application.
2463 *
2464 * This is compatible with BSD in that it returns
2465 * the reverse source route with the final destination
2466 * as the last entry. The first 4 bytes of the option
2467 * will contain the final destination.
2468 */
2469 int
2470 ip_opt_get_user(conn_t *connp, uchar_t *buf)
2471 {
2472 ipoptp_t opts;
2473 uchar_t *opt;
2474 uint8_t optval;
2475 uint8_t optlen;
2476 uint32_t len = 0;
2477 uchar_t *buf1 = buf;
2478 uint32_t totallen;
2479 ipaddr_t dst;
2480 ip_pkt_t *ipp = &connp->conn_xmit_ipp;
2481
2482 if (!(ipp->ipp_fields & IPPF_IPV4_OPTIONS))
2483 return (0);
2484
2485 totallen = ipp->ipp_ipv4_options_len;
2486 if (totallen & 0x3)
2487 return (0);
2488
2489 buf += IP_ADDR_LEN; /* Leave room for final destination */
2490 len += IP_ADDR_LEN;
2491 bzero(buf1, IP_ADDR_LEN);
2492
2493 dst = connp->conn_faddr_v4;
2494
2495 for (optval = ipoptp_first2(&opts, totallen, ipp->ipp_ipv4_options);
2496 optval != IPOPT_EOL;
2497 optval = ipoptp_next(&opts)) {
2498 int off;
2499
2500 opt = opts.ipoptp_cur;
2501 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
2502 break;
2503 }
2504 optlen = opts.ipoptp_len;
2505
2506 switch (optval) {
2507 case IPOPT_SSRR:
2508 case IPOPT_LSRR:
2509
2510 /*
 * Insert the destination as the first entry in the source
 * route and move down the existing entries one step.
 * The last entry gets placed at buf1.
2514 */
2515 buf[IPOPT_OPTVAL] = optval;
2516 buf[IPOPT_OLEN] = optlen;
2517 buf[IPOPT_OFFSET] = optlen;
2518
2519 off = optlen - IP_ADDR_LEN;
2520 if (off < 0) {
2521 /* No entries in source route */
2522 break;
2523 }
2524 /* Last entry in source route if not already set */
2525 if (dst == INADDR_ANY)
2526 bcopy(opt + off, buf1, IP_ADDR_LEN);
2527 off -= IP_ADDR_LEN;
2528
2529 while (off > 0) {
2530 bcopy(opt + off,
2531 buf + off + IP_ADDR_LEN,
2532 IP_ADDR_LEN);
2533 off -= IP_ADDR_LEN;
2534 }
2535 /* ipha_dst into first slot */
2536 bcopy(&dst, buf + off + IP_ADDR_LEN,
2537 IP_ADDR_LEN);
2538 buf += optlen;
2539 len += optlen;
2540 break;
2541
2542 default:
2543 bcopy(opt, buf, optlen);
2544 buf += optlen;
2545 len += optlen;
2546 break;
2547 }
2548 }
2550 /* Pad the resulting options */
2551 while (len & 0x3) {
2552 *buf++ = IPOPT_EOL;
2553 len++;
2554 }
2555 return (len);
2556 }
2557
2558 /*
2559 * Update any record route or timestamp options to include this host.
2560 * Reverse any source route option.
2561 * This routine assumes that the options are well formed i.e. that they
2562 * have already been checked.
2563 */
2564 static void
2565 icmp_options_update(ipha_t *ipha)
2566 {
2567 ipoptp_t opts;
2568 uchar_t *opt;
2569 uint8_t optval;
2570 ipaddr_t src; /* Our local address */
2571 ipaddr_t dst;
2572
2573 ip2dbg(("icmp_options_update\n"));
2574 src = ipha->ipha_src;
2575 dst = ipha->ipha_dst;
2576
2577 for (optval = ipoptp_first(&opts, ipha);
2578 optval != IPOPT_EOL;
2579 optval = ipoptp_next(&opts)) {
2580 ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
2581 opt = opts.ipoptp_cur;
2582 ip2dbg(("icmp_options_update: opt %d, len %d\n",
2583 optval, opts.ipoptp_len));
2584 switch (optval) {
2585 int off1, off2;
2586 case IPOPT_SSRR:
2587 case IPOPT_LSRR:
2588 /*
2589 * Reverse the source route. The first entry
2590 * should be the next to last one in the current
2591 * source route (the last entry is our address).
2592 * The last entry should be the final destination.
2593 */
2594 off1 = IPOPT_MINOFF_SR - 1;
2595 off2 = opt[IPOPT_OFFSET] - IP_ADDR_LEN - 1;
2596 if (off2 < 0) {
2597 /* No entries in source route */
2598 ip1dbg((
2599 "icmp_options_update: bad src route\n"));
2600 break;
2601 }
2602 bcopy((char *)opt + off2, &dst, IP_ADDR_LEN);
2603 bcopy(&ipha->ipha_dst, (char *)opt + off2, IP_ADDR_LEN);
2604 bcopy(&dst, &ipha->ipha_dst, IP_ADDR_LEN);
2605 off2 -= IP_ADDR_LEN;
2606
2607 while (off1 < off2) {
2608 bcopy((char *)opt + off1, &src, IP_ADDR_LEN);
2609 bcopy((char *)opt + off2, (char *)opt + off1,
2610 IP_ADDR_LEN);
2611 bcopy(&src, (char *)opt + off2, IP_ADDR_LEN);
2612 off1 += IP_ADDR_LEN;
2613 off2 -= IP_ADDR_LEN;
2614 }
2615 opt[IPOPT_OFFSET] = IPOPT_MINOFF_SR;
2616 break;
2617 }
2618 }
2619 }
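
/*
 * Worked example of the source route reversal above: suppose an echo
 * request arrived with recorded route {A, B, C} and, after
 * icmp_send_reply_v4() swapped the addresses, ipha_dst holds the
 * original sender S. The loop above then leaves ipha_dst == C (our
 * nearest hop back), the option {B, A, S} with S as the final
 * destination, and the offset reset to IPOPT_MINOFF_SR.
 */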
2620
2621 /*
2622 * Process received ICMP Redirect messages.
2623 * Assumes the caller has verified that the headers are in the pulled up mblk.
2624 * Consumes mp.
2625 */
2626 static void
2627 icmp_redirect_v4(mblk_t *mp, ipha_t *ipha, icmph_t *icmph, ip_recv_attr_t *ira)
2628 {
2629 ire_t *ire, *nire;
2630 ire_t *prev_ire;
2631 ipaddr_t src, dst, gateway;
2632 ip_stack_t *ipst = ira->ira_ill->ill_ipst;
2633 ipha_t *inner_ipha; /* Inner IP header */
2634
2635 /* Caller already pulled up everything. */
2636 inner_ipha = (ipha_t *)&icmph[1];
2637 src = ipha->ipha_src;
2638 dst = inner_ipha->ipha_dst;
2639 gateway = icmph->icmph_rd_gateway;
2640 /* Make sure the new gateway is reachable somehow. */
2641 ire = ire_ftable_lookup_v4(gateway, 0, 0, IRE_ONLINK, NULL,
2642 ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst, NULL);
2643 /*
2644 * Make sure we had a route for the dest in question and that
2645 * that route was pointing to the old gateway (the source of the
 * redirect packet).
2647 * We do longest match and then compare ire_gateway_addr below.
2648 */
2649 prev_ire = ire_ftable_lookup_v4(dst, 0, 0, 0, NULL, ALL_ZONES,
2650 NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL);
2651 /*
 * Check that:
 *	- the redirect was not from ourselves
 *	- the new gateway and the old gateway are directly reachable
2655 */
2656 if (prev_ire == NULL || ire == NULL ||
2657 (prev_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) ||
2658 (prev_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
2659 !(ire->ire_type & IRE_IF_ALL) ||
2660 prev_ire->ire_gateway_addr != src) {
2661 BUMP_MIB(&ipst->ips_icmp_mib, icmpInBadRedirects);
2662 ip_drop_input("icmpInBadRedirects - ire", mp, ira->ira_ill);
2663 freemsg(mp);
2664 if (ire != NULL)
2665 ire_refrele(ire);
2666 if (prev_ire != NULL)
2667 ire_refrele(prev_ire);
2668 return;
2669 }
2670
2671 ire_refrele(prev_ire);
2672 ire_refrele(ire);
2673
2674 /*
2675 * TODO: more precise handling for cases 0, 2, 3, the latter two
2676 * require TOS routing
2677 */
2678 switch (icmph->icmph_code) {
2679 case 0:
2680 case 1:
2681 /* TODO: TOS specificity for cases 2 and 3 */
2682 case 2:
2683 case 3:
2684 break;
2685 default:
2686 BUMP_MIB(&ipst->ips_icmp_mib, icmpInBadRedirects);
2687 ip_drop_input("icmpInBadRedirects - code", mp, ira->ira_ill);
2688 freemsg(mp);
2689 return;
2690 }
2691 /*
2692 * Create a Route Association. This will allow us to remember that
2693 * someone we believe told us to use the particular gateway.
2694 */
2695 ire = ire_create(
2696 (uchar_t *)&dst, /* dest addr */
2697 (uchar_t *)&ip_g_all_ones, /* mask */
2698 (uchar_t *)&gateway, /* gateway addr */
2699 IRE_HOST,
2700 NULL, /* ill */
2701 ALL_ZONES,
2702 (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST),
2703 NULL, /* tsol_gc_t */
2704 ipst);
2705
2706 if (ire == NULL) {
2707 freemsg(mp);
2708 return;
2709 }
2710 nire = ire_add(ire);
2711 /* Check if it was a duplicate entry */
2712 if (nire != NULL && nire != ire) {
2713 ASSERT(nire->ire_identical_ref > 1);
2714 ire_delete(nire);
2715 ire_refrele(nire);
2716 nire = NULL;
2717 }
2718 ire = nire;
2719 if (ire != NULL) {
2720 ire_refrele(ire); /* Held in ire_add */
2721
2722 /* tell routing sockets that we received a redirect */
2723 ip_rts_change(RTM_REDIRECT, dst, gateway, IP_HOST_MASK, 0, src,
2724 (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST), 0,
2725 (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_AUTHOR), ipst);
2726 }
2727
2728 /*
2729 * Delete any existing IRE_HOST type redirect ires for this destination.
2730 * This together with the added IRE has the effect of
2731 * modifying an existing redirect.
2732 */
2733 prev_ire = ire_ftable_lookup_v4(dst, 0, src, IRE_HOST, NULL,
2734 ALL_ZONES, NULL, (MATCH_IRE_GW | MATCH_IRE_TYPE), 0, ipst, NULL);
2735 if (prev_ire != NULL) {
if (prev_ire->ire_flags & RTF_DYNAMIC)
2737 ire_delete(prev_ire);
2738 ire_refrele(prev_ire);
2739 }
2740
2741 freemsg(mp);
2742 }
2743
2744 /*
2745 * Generate an ICMP parameter problem message.
 * When called from the ip_output side, a minimal ip_recv_attr_t needs to
 * be constructed by the caller.
2748 */
2749 static void
2750 icmp_param_problem(mblk_t *mp, uint8_t ptr, ip_recv_attr_t *ira)
2751 {
2752 icmph_t icmph;
2753 ip_stack_t *ipst = ira->ira_ill->ill_ipst;
2754
2755 mp = icmp_pkt_err_ok(mp, ira);
2756 if (mp == NULL)
2757 return;
2758
2759 bzero(&icmph, sizeof (icmph_t));
2760 icmph.icmph_type = ICMP_PARAM_PROBLEM;
2761 icmph.icmph_pp_ptr = ptr;
2762 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutParmProbs);
2763 icmp_pkt(mp, &icmph, sizeof (icmph_t), ira);
2764 }
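
/*
 * The pointer is the offset of the offending byte from the start of the
 * IP header. An option-parsing caller might, for instance, flag a bad
 * option with something like (illustrative only):
 *
 *	icmp_param_problem(mp, (uint8_t)((uchar_t *)opt - (uchar_t *)ipha),
 *	    ira);
 *
 * where ptr == 20 would point at the first option byte following a
 * simple 20 byte header.
 */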
2765
2766 /*
2767 * Build and ship an IPv4 ICMP message using the packet data in mp, and
2768 * the ICMP header pointed to by "stuff". (May be called as writer.)
2769 * Note: assumes that icmp_pkt_err_ok has been called to verify that
2770 * an icmp error packet can be sent.
2771 * Assigns an appropriate source address to the packet. If ipha_dst is
 * one of our addresses, use it as the source. Otherwise let ip_output_simple
2773 * pick the source address.
2774 */
2775 static void
2776 icmp_pkt(mblk_t *mp, void *stuff, size_t len, ip_recv_attr_t *ira)
2777 {
2778 ipaddr_t dst;
2779 icmph_t *icmph;
2780 ipha_t *ipha;
2781 uint_t len_needed;
2782 size_t msg_len;
2783 mblk_t *mp1;
2784 ipaddr_t src;
2785 ire_t *ire;
2786 ip_xmit_attr_t ixas;
2787 ip_stack_t *ipst = ira->ira_ill->ill_ipst;
2788
2789 ipha = (ipha_t *)mp->b_rptr;
2790
2791 bzero(&ixas, sizeof (ixas));
2792 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
2793 ixas.ixa_zoneid = ira->ira_zoneid;
2794 ixas.ixa_ifindex = 0;
2795 ixas.ixa_ipst = ipst;
2796 ixas.ixa_cred = kcred;
2797 ixas.ixa_cpid = NOPID;
2798 ixas.ixa_tsl = ira->ira_tsl; /* Behave as a multi-level responder */
2799 ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
2800
2801 if (ira->ira_flags & IRAF_IPSEC_SECURE) {
2802 /*
2803 * Apply IPsec based on how IPsec was applied to
2804 * the packet that had the error.
2805 *
2806 * If it was an outbound packet that caused the ICMP
2807 * error, then the caller will have setup the IRA
2808 * appropriately.
2809 */
2810 if (!ipsec_in_to_out(ira, &ixas, mp, ipha, NULL)) {
2811 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
2812 /* Note: mp already consumed and ip_drop_packet done */
2813 return;
2814 }
2815 } else {
2816 /*
2817 * This is in clear. The icmp message we are building
2818 * here should go out in clear, independent of our policy.
2819 */
2820 ixas.ixa_flags |= IXAF_NO_IPSEC;
2821 }
2822
2823 /* Remember our eventual destination */
2824 dst = ipha->ipha_src;
2825
2826 /*
2827 * If the packet was for one of our unicast addresses, make
2828 * sure we respond with that as the source. Otherwise
2829 * have ip_output_simple pick the source address.
2830 */
2831 ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, 0,
2832 (IRE_LOCAL|IRE_LOOPBACK), NULL, ira->ira_zoneid, NULL,
2833 MATCH_IRE_TYPE|MATCH_IRE_ZONEONLY, 0, ipst, NULL);
2834 if (ire != NULL) {
2835 ire_refrele(ire);
2836 src = ipha->ipha_dst;
2837 } else {
2838 src = INADDR_ANY;
2839 ixas.ixa_flags |= IXAF_SET_SOURCE;
2840 }
2841
2842 /*
 * Check if we can send back more than 8 bytes in addition to
 * the IP header. We try to send 64 bytes of data and the inner
 * header in the special cases of IPv4-encapsulated IPv4 or IPv6.
2846 */
2847 len_needed = IPH_HDR_LENGTH(ipha);
2848 if (ipha->ipha_protocol == IPPROTO_ENCAP ||
2849 ipha->ipha_protocol == IPPROTO_IPV6) {
2850 if (!pullupmsg(mp, -1)) {
2851 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
2852 ip_drop_output("ipIfStatsOutDiscards", mp, NULL);
2853 freemsg(mp);
2854 return;
2855 }
2856 ipha = (ipha_t *)mp->b_rptr;
2857
2858 if (ipha->ipha_protocol == IPPROTO_ENCAP) {
2859 len_needed += IPH_HDR_LENGTH(((uchar_t *)ipha +
2860 len_needed));
2861 } else {
2862 ip6_t *ip6h = (ip6_t *)((uchar_t *)ipha + len_needed);
2863
2864 ASSERT(ipha->ipha_protocol == IPPROTO_IPV6);
2865 len_needed += ip_hdr_length_v6(mp, ip6h);
2866 }
2867 }
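	/*
	 * ips_ip_icmp_return corresponds to the icmp_return_data_bytes
	 * tunable (normally 64), so the error typically carries the
	 * offending IP header plus 64 bytes of its payload rather than
	 * the bare 8 byte minimum that RFC 792 requires.
	 */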
2868 len_needed += ipst->ips_ip_icmp_return;
2869 msg_len = msgdsize(mp);
2870 if (msg_len > len_needed) {
2871 (void) adjmsg(mp, len_needed - msg_len);
2872 msg_len = len_needed;
2873 }
2874 mp1 = allocb(sizeof (icmp_ipha) + len, BPRI_MED);
2875 if (mp1 == NULL) {
2876 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutErrors);
2877 freemsg(mp);
2878 return;
2879 }
2880 mp1->b_cont = mp;
2881 mp = mp1;
2882
2883 /*
2884 * Set IXAF_TRUSTED_ICMP so we can let the ICMP messages this
2885 * node generates be accepted in peace by all on-host destinations.
2886 * If we do NOT assume that all on-host destinations trust
2887 * self-generated ICMP messages, then rework here, ip6.c, and spd.c.
2888 * (Look for IXAF_TRUSTED_ICMP).
2889 */
2890 ixas.ixa_flags |= IXAF_TRUSTED_ICMP;
2891
2892 ipha = (ipha_t *)mp->b_rptr;
2893 mp1->b_wptr = (uchar_t *)ipha + (sizeof (icmp_ipha) + len);
2894 *ipha = icmp_ipha;
2895 ipha->ipha_src = src;
2896 ipha->ipha_dst = dst;
2897 ipha->ipha_ttl = ipst->ips_ip_def_ttl;
2898 msg_len += sizeof (icmp_ipha) + len;
2899 if (msg_len > IP_MAXPACKET) {
2900 (void) adjmsg(mp, IP_MAXPACKET - msg_len);
2901 msg_len = IP_MAXPACKET;
2902 }
2903 ipha->ipha_length = htons((uint16_t)msg_len);
2904 icmph = (icmph_t *)&ipha[1];
2905 bcopy(stuff, icmph, len);
2906 icmph->icmph_checksum = 0;
2907 icmph->icmph_checksum = IP_CSUM(mp, (int32_t)sizeof (ipha_t), 0);
2908 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutMsgs);
2909
2910 (void) ip_output_simple(mp, &ixas);
2911 ixa_cleanup(&ixas);
2912 }
2913
2914 /*
2915 * Determine if an ICMP error packet can be sent given the rate limit.
2916 * The limit consists of an average frequency (icmp_pkt_err_interval measured
 * in milliseconds) and a burst size. A burst-size number of packets can
 * be sent arbitrarily closely spaced.
2919 * The state is tracked using two variables to implement an approximate
2920 * token bucket filter:
2921 * icmp_pkt_err_last - lbolt value when the last burst started
2922 * icmp_pkt_err_sent - number of packets sent in current burst
2923 */
2924 boolean_t
2925 icmp_err_rate_limit(ip_stack_t *ipst)
2926 {
2927 clock_t now = TICK_TO_MSEC(ddi_get_lbolt());
2928 uint_t refilled; /* Number of packets refilled in tbf since last */
2929 /* Guard against changes by loading into local variable */
2930 uint_t err_interval = ipst->ips_ip_icmp_err_interval;
2931
2932 if (err_interval == 0)
2933 return (B_FALSE);
2934
2935 if (ipst->ips_icmp_pkt_err_last > now) {
2936 /* 100HZ lbolt in ms for 32bit arch wraps every 49.7 days */
2937 ipst->ips_icmp_pkt_err_last = 0;
2938 ipst->ips_icmp_pkt_err_sent = 0;
2939 }
2940 /*
 * If we are in a burst, update the token bucket filter.
 * Update the "last" time to be close to "now" but make sure
 * we don't lose precision.
2944 */
2945 if (ipst->ips_icmp_pkt_err_sent != 0) {
2946 refilled = (now - ipst->ips_icmp_pkt_err_last)/err_interval;
2947 if (refilled > ipst->ips_icmp_pkt_err_sent) {
2948 ipst->ips_icmp_pkt_err_sent = 0;
2949 } else {
2950 ipst->ips_icmp_pkt_err_sent -= refilled;
2951 ipst->ips_icmp_pkt_err_last += refilled * err_interval;
2952 }
2953 }
2954 if (ipst->ips_icmp_pkt_err_sent == 0) {
2955 /* Start of new burst */
2956 ipst->ips_icmp_pkt_err_last = now;
2957 }
2958 if (ipst->ips_icmp_pkt_err_sent < ipst->ips_ip_icmp_err_burst) {
2959 ipst->ips_icmp_pkt_err_sent++;
2960 ip1dbg(("icmp_err_rate_limit: %d sent in burst\n",
2961 ipst->ips_icmp_pkt_err_sent));
2962 return (B_FALSE);
2963 }
2964 ip1dbg(("icmp_err_rate_limit: dropped\n"));
2965 return (B_TRUE);
2966 }
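
/*
 * Worked example: with ips_ip_icmp_err_interval set to 100 (ms) and
 * ips_ip_icmp_err_burst set to 10, an idle bucket lets 10 errors out
 * back to back; after that one token is refilled every 100 ms, i.e. a
 * sustained rate of at most 10 ICMP errors per second.
 */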
2967
2968 /*
2969 * Check if it is ok to send an IPv4 ICMP error packet in
2970 * response to the IPv4 packet in mp.
2971 * Free the message and return null if no
2972 * ICMP error packet should be sent.
2973 */
2974 static mblk_t *
2975 icmp_pkt_err_ok(mblk_t *mp, ip_recv_attr_t *ira)
2976 {
2977 ip_stack_t *ipst = ira->ira_ill->ill_ipst;
2978 icmph_t *icmph;
2979 ipha_t *ipha;
2980 uint_t len_needed;
2981
2982 if (!mp)
2983 return (NULL);
2984 ipha = (ipha_t *)mp->b_rptr;
2985 if (ip_csum_hdr(ipha)) {
2986 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInCksumErrs);
2987 ip_drop_input("ipIfStatsInCksumErrs", mp, NULL);
2988 freemsg(mp);
2989 return (NULL);
2990 }
2991 if (ip_type_v4(ipha->ipha_dst, ipst) == IRE_BROADCAST ||
2992 ip_type_v4(ipha->ipha_src, ipst) == IRE_BROADCAST ||
2993 CLASSD(ipha->ipha_dst) ||
2994 CLASSD(ipha->ipha_src) ||
2995 (ntohs(ipha->ipha_fragment_offset_and_flags) & IPH_OFFSET)) {
2996 /* Note: only errors to the fragment with offset 0 */
2997 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
2998 freemsg(mp);
2999 return (NULL);
3000 }
3001 if (ipha->ipha_protocol == IPPROTO_ICMP) {
3002 /*
3003 * Check the ICMP type. RFC 1122 sez: don't send ICMP
3004 * errors in response to any ICMP errors.
3005 */
3006 len_needed = IPH_HDR_LENGTH(ipha) + ICMPH_SIZE;
3007 if (mp->b_wptr - mp->b_rptr < len_needed) {
3008 if (!pullupmsg(mp, len_needed)) {
3009 BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors);
3010 freemsg(mp);
3011 return (NULL);
3012 }
3013 ipha = (ipha_t *)mp->b_rptr;
3014 }
3015 icmph = (icmph_t *)
3016 (&((char *)ipha)[IPH_HDR_LENGTH(ipha)]);
3017 switch (icmph->icmph_type) {
3018 case ICMP_DEST_UNREACHABLE:
3019 case ICMP_SOURCE_QUENCH:
3020 case ICMP_TIME_EXCEEDED:
3021 case ICMP_PARAM_PROBLEM:
3022 case ICMP_REDIRECT:
3023 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
3024 freemsg(mp);
3025 return (NULL);
3026 default:
3027 break;
3028 }
3029 }
3030 /*
3031 * If this is a labeled system, then check to see if we're allowed to
3032 * send a response to this particular sender. If not, then just drop.
3033 */
3034 if (is_system_labeled() && !tsol_can_reply_error(mp, ira)) {
3035 ip2dbg(("icmp_pkt_err_ok: can't respond to packet\n"));
3036 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
3037 freemsg(mp);
3038 return (NULL);
3039 }
3040 if (icmp_err_rate_limit(ipst)) {
3041 /*
3042 * Only send ICMP error packets every so often.
3043 * This should be done on a per port/source basis,
3044 * but for now this will suffice.
3045 */
3046 freemsg(mp);
3047 return (NULL);
3048 }
3049 return (mp);
3050 }
3051
3052 /*
3053 * Called when a packet was sent out the same link that it arrived on.
3054 * Check if it is ok to send a redirect and then send it.
3055 */
3056 void
3057 ip_send_potential_redirect_v4(mblk_t *mp, ipha_t *ipha, ire_t *ire,
3058 ip_recv_attr_t *ira)
3059 {
3060 ip_stack_t *ipst = ira->ira_ill->ill_ipst;
3061 ipaddr_t src, nhop;
3062 mblk_t *mp1;
3063 ire_t *nhop_ire;
3064
3065 /*
3066 * Check the source address to see if it originated
3067 * on the same logical subnet it is going back out on.
3068 * If so, we should be able to send it a redirect.
3069 * Avoid sending a redirect if the destination
3070 * is directly connected (i.e., we matched an IRE_ONLINK),
3071 * or if the packet was source routed out this interface.
3072 *
 * We avoid sending a redirect when the destination is directly
 * connected because multiple IP subnets may be configured on the
 * link, and the source may not be on the same subnet as the IP
 * destination even though they are on the same physical link.
3081 */
3082 if ((ire->ire_type & IRE_ONLINK) ||
3083 ip_source_routed(ipha, ipst))
3084 return;
3085
3086 nhop_ire = ire_nexthop(ire);
3087 if (nhop_ire == NULL)
3088 return;
3089
3090 nhop = nhop_ire->ire_addr;
3091
3092 if (nhop_ire->ire_type & IRE_IF_CLONE) {
3093 ire_t *ire2;
3094
3095 /* Follow ire_dep_parent to find non-clone IRE_INTERFACE */
3096 mutex_enter(&nhop_ire->ire_lock);
3097 ire2 = nhop_ire->ire_dep_parent;
3098 if (ire2 != NULL)
3099 ire_refhold(ire2);
3100 mutex_exit(&nhop_ire->ire_lock);
3101 ire_refrele(nhop_ire);
3102 nhop_ire = ire2;
3103 }
3104 if (nhop_ire == NULL)
3105 return;
3106
3107 ASSERT(!(nhop_ire->ire_type & IRE_IF_CLONE));
3108
3109 src = ipha->ipha_src;
3110
3111 /*
3112 * We look at the interface ire for the nexthop,
3113 * to see if ipha_src is in the same subnet
3114 * as the nexthop.
3115 */
3116 if ((src & nhop_ire->ire_mask) == (nhop & nhop_ire->ire_mask)) {
3117 /*
3118 * The source is directly connected.
3119 */
3120 mp1 = copymsg(mp);
3121 if (mp1 != NULL) {
3122 icmp_send_redirect(mp1, nhop, ira);
3123 }
3124 }
3125 ire_refrele(nhop_ire);
3126 }
3127
3128 /*
3129 * Generate an ICMP redirect message.
3130 */
3131 static void
3132 icmp_send_redirect(mblk_t *mp, ipaddr_t gateway, ip_recv_attr_t *ira)
3133 {
3134 icmph_t icmph;
3135 ip_stack_t *ipst = ira->ira_ill->ill_ipst;
3136
3137 mp = icmp_pkt_err_ok(mp, ira);
3138 if (mp == NULL)
3139 return;
3140
3141 bzero(&icmph, sizeof (icmph_t));
3142 icmph.icmph_type = ICMP_REDIRECT;
	icmph.icmph_code = 1;		/* Redirect for host */
3144 icmph.icmph_rd_gateway = gateway;
3145 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutRedirects);
3146 icmp_pkt(mp, &icmph, sizeof (icmph_t), ira);
3147 }
3148
3149 /*
3150 * Generate an ICMP time exceeded message.
3151 */
3152 void
3153 icmp_time_exceeded(mblk_t *mp, uint8_t code, ip_recv_attr_t *ira)
3154 {
3155 icmph_t icmph;
3156 ip_stack_t *ipst = ira->ira_ill->ill_ipst;
3157
3158 mp = icmp_pkt_err_ok(mp, ira);
3159 if (mp == NULL)
3160 return;
3161
3162 bzero(&icmph, sizeof (icmph_t));
3163 icmph.icmph_type = ICMP_TIME_EXCEEDED;
3164 icmph.icmph_code = code;
3165 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutTimeExcds);
3166 icmp_pkt(mp, &icmph, sizeof (icmph_t), ira);
3167 }
3168
3169 /*
3170 * Generate an ICMP unreachable message.
 * When called from the ip_output side, a minimal ip_recv_attr_t needs to
 * be constructed by the caller.
3173 */
3174 void
3175 icmp_unreachable(mblk_t *mp, uint8_t code, ip_recv_attr_t *ira)
3176 {
3177 icmph_t icmph;
3178 ip_stack_t *ipst = ira->ira_ill->ill_ipst;
3179
3180 mp = icmp_pkt_err_ok(mp, ira);
3181 if (mp == NULL)
3182 return;
3183
3184 bzero(&icmph, sizeof (icmph_t));
3185 icmph.icmph_type = ICMP_DEST_UNREACHABLE;
3186 icmph.icmph_code = code;
3187 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDestUnreachs);
3188 icmp_pkt(mp, &icmph, sizeof (icmph_t), ira);
3189 }
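
/*
 * Illustrative sketch, not compiled into the module: one way an
 * output-side caller could build the minimal ip_recv_attr_t mentioned
 * above before calling icmp_unreachable(). The function name
 * example_send_unreachable is hypothetical; the exact set of ira fields
 * a real caller must fill depends on its context.
 */
#ifdef notdef
static void
example_send_unreachable(mblk_t *mp, ill_t *ill, zoneid_t zoneid)
{
	ip_recv_attr_t	iras;	/* minimal, stack-allocated attributes */

	bzero(&iras, sizeof (iras));
	iras.ira_flags = IRAF_IS_IPV4;
	iras.ira_ill = iras.ira_rill = ill;
	iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
	iras.ira_rifindex = iras.ira_ruifindex;
	iras.ira_zoneid = zoneid;

	icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras);
}
#endif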
3190
3191 /*
 * Latch in the IPsec state for a stream, based on the policy in the
 * listener and the actions in the ip_recv_attr_t.
3194 * Called directly from TCP and SCTP.
3195 */
3196 boolean_t
3197 ip_ipsec_policy_inherit(conn_t *connp, conn_t *lconnp, ip_recv_attr_t *ira)
3198 {
3199 ASSERT(lconnp->conn_policy != NULL);
3200 ASSERT(connp->conn_policy == NULL);
3201
3202 IPPH_REFHOLD(lconnp->conn_policy);
3203 connp->conn_policy = lconnp->conn_policy;
3204
3205 if (ira->ira_ipsec_action != NULL) {
3206 if (connp->conn_latch == NULL) {
3207 connp->conn_latch = iplatch_create();
3208 if (connp->conn_latch == NULL)
3209 return (B_FALSE);
3210 }
3211 ipsec_latch_inbound(connp, ira);
3212 }
3213 return (B_TRUE);
3214 }
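
/*
 * Illustrative sketch, not compiled into the module: how a listening
 * transport might latch policy onto a freshly accepted conn. The name
 * example_inherit_policy is hypothetical; the real call sites live in
 * the TCP and SCTP connection-setup paths.
 */
#ifdef notdef
static boolean_t
example_inherit_policy(conn_t *econnp, conn_t *lconnp, ip_recv_attr_t *ira)
{
	/* Only inherit when the listener actually has policy */
	if (lconnp->conn_policy == NULL)
		return (B_TRUE);
	return (ip_ipsec_policy_inherit(econnp, lconnp, ira));
}
#endif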
3215
3216 /*
3217 * Verify whether or not the IP address is a valid local address.
3218 * Could be a unicast, including one for a down interface.
3219 * If allow_mcbc then a multicast or broadcast address is also
3220 * acceptable.
3221 *
 * In the case of a broadcast/multicast address, however, the upper
 * protocol is expected to reset the src address to zero when we return
 * IPVL_MCAST/IPVL_BCAST so that no packets are emitted with a
 * broadcast/multicast address as the source address (which would violate
 * the host requirements of RFC 1122).
3227 * The addresses valid for bind are:
3228 * (1) - INADDR_ANY (0)
3229 * (2) - IP address of an UP interface
3230 * (3) - IP address of a DOWN interface
3231 * (4) - valid local IP broadcast addresses. In this case
3232 * the conn will only receive packets destined to
3233 * the specified broadcast address.
3234 * (5) - a multicast address. In this case
3235 * the conn will only receive packets destined to
3236 * the specified multicast address. Note: the
3237 * application still has to issue an
3238 * IP_ADD_MEMBERSHIP socket option.
3239 *
3240 * In all the above cases, the bound address must be valid in the current zone.
3241 * When the address is loopback, multicast or broadcast, there might be many
3242 * matching IREs so bind has to look up based on the zone.
3243 */
3244 ip_laddr_t
3245 ip_laddr_verify_v4(ipaddr_t src_addr, zoneid_t zoneid,
3246 ip_stack_t *ipst, boolean_t allow_mcbc)
3247 {
3248 ire_t *src_ire;
3249
3250 ASSERT(src_addr != INADDR_ANY);
3251
3252 src_ire = ire_ftable_lookup_v4(src_addr, 0, 0, 0,
3253 NULL, zoneid, NULL, MATCH_IRE_ZONEONLY, 0, ipst, NULL);
3254
3255 /*
	 * If an address other than INADDR_ANY is requested,
	 * we verify that it is a valid address for bind.
	 * Note: the following code is in if-else-if form for
	 * readability rather than a single compound condition.
3260 */
3261 if (src_ire != NULL && (src_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK))) {
3262 /*
3263 * (2) Bind to address of local UP interface
3264 */
3265 ire_refrele(src_ire);
3266 return (IPVL_UNICAST_UP);
3267 } else if (src_ire != NULL && src_ire->ire_type & IRE_BROADCAST) {
3268 /*
3269 * (4) Bind to broadcast address
3270 */
3271 ire_refrele(src_ire);
3272 if (allow_mcbc)
3273 return (IPVL_BCAST);
3274 else
3275 return (IPVL_BAD);
3276 } else if (CLASSD(src_addr)) {
3277 /* (5) bind to multicast address. */
3278 if (src_ire != NULL)
3279 ire_refrele(src_ire);
3280
3281 if (allow_mcbc)
3282 return (IPVL_MCAST);
3283 else
3284 return (IPVL_BAD);
3285 } else {
3286 ipif_t *ipif;
3287
3288 /*
3289 * (3) Bind to address of local DOWN interface?
3290 * (ipif_lookup_addr() looks up all interfaces
3291 * but we do not get here for UP interfaces
3292 * - case (2) above)
3293 */
3294 if (src_ire != NULL)
3295 ire_refrele(src_ire);
3296
3297 ipif = ipif_lookup_addr(src_addr, NULL, zoneid, ipst);
3298 if (ipif == NULL)
3299 return (IPVL_BAD);
3300
3301 /* Not a useful source? */
3302 if (ipif->ipif_flags & (IPIF_NOLOCAL | IPIF_ANYCAST)) {
3303 ipif_refrele(ipif);
3304 return (IPVL_BAD);
3305 }
3306 ipif_refrele(ipif);
3307 return (IPVL_UNICAST_DOWN);
3308 }
3309 }
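
/*
 * Illustrative sketch, not compiled into the module: how a transport
 * might map the ip_laddr_t result when validating a bind address. The
 * function name example_bind_laddr_check and the error mapping are
 * hypothetical.
 */
#ifdef notdef
static int
example_bind_laddr_check(conn_t *connp, ipaddr_t v4src, ip_stack_t *ipst)
{
	ip_laddr_t	laddr_type;

	if (v4src == INADDR_ANY)
		return (0);	/* case (1): always acceptable */

	/* B_TRUE: this transport accepts multicast/broadcast binds */
	laddr_type = ip_laddr_verify_v4(v4src, IPCL_ZONEID(connp),
	    ipst, B_TRUE);
	switch (laddr_type) {
	case IPVL_UNICAST_UP:		/* case (2) */
	case IPVL_UNICAST_DOWN:		/* case (3) */
		return (0);
	case IPVL_BCAST:		/* case (4) */
	case IPVL_MCAST:		/* case (5) */
		/* Remember to zero the source when sending (RFC 1122) */
		return (0);
	default:
		return (EADDRNOTAVAIL);
	}
}
#endif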
3310
3311 /*
3312 * Insert in the bind fanout for IPv4 and IPv6.
3313 * The caller should already have used ip_laddr_verify_v*() before calling
3314 * this.
3315 */
3316 int
3317 ip_laddr_fanout_insert(conn_t *connp)
3318 {
3319 int error;
3320
3321 /*
3322 * Allow setting new policies. For example, disconnects result
3323 * in us being called. As we would have set conn_policy_cached
3324 * to B_TRUE before, we should set it to B_FALSE, so that policy
3325 * can change after the disconnect.
3326 */
3327 connp->conn_policy_cached = B_FALSE;
3328
3329 error = ipcl_bind_insert(connp);
3330 if (error != 0) {
3331 if (connp->conn_anon_port) {
3332 (void) tsol_mlp_anon(crgetzone(connp->conn_cred),
3333 connp->conn_mlp_type, connp->conn_proto,
3334 ntohs(connp->conn_lport), B_FALSE);
3335 }
3336 connp->conn_mlp_type = mlptSingle;
3337 }
3338 return (error);
3339 }
3340
3341 /*
3342 * Verify that both the source and destination addresses are valid. If
3343 * IPDF_VERIFY_DST is not set, then the destination address may be unreachable,
3344 * i.e. have no route to it. Protocols like TCP want to verify destination
3345 * reachability, while tunnels do not.
3346 *
3347 * Determine the route, the interface, and (optionally) the source address
3348 * to use to reach a given destination.
3349 * Note that we allow connect to broadcast and multicast addresses when
3350 * IPDF_ALLOW_MCBC is set.
 * first_hop and dst_addr are normally the same, but when source routing
 * is used they will differ; in that case the first_hop is what we use for
 * the routing lookup, while the dce and label checks are done on dst_addr.
3355 * If uinfo is set, then we fill in the best available information
3356 * we have for the destination. This is based on (in priority order) any
3357 * metrics and path MTU stored in a dce_t, route metrics, and finally the
3358 * ill_mtu/ill_mc_mtu.
3359 *
3360 * Tsol note: If we have a source route then dst_addr != firsthop. But we
3361 * always do the label check on dst_addr.
3362 */
3363 int
3364 ip_set_destination_v4(ipaddr_t *src_addrp, ipaddr_t dst_addr, ipaddr_t firsthop,
3365 ip_xmit_attr_t *ixa, iulp_t *uinfo, uint32_t flags, uint_t mac_mode)
3366 {
3367 ire_t *ire = NULL;
3368 int error = 0;
3369 ipaddr_t setsrc; /* RTF_SETSRC */
3370 zoneid_t zoneid = ixa->ixa_zoneid; /* Honors SO_ALLZONES */
3371 ip_stack_t *ipst = ixa->ixa_ipst;
3372 dce_t *dce;
3373 uint_t pmtu;
3374 uint_t generation;
3375 nce_t *nce;
3376 ill_t *ill = NULL;
3377 boolean_t multirt = B_FALSE;
3378
3379 ASSERT(ixa->ixa_flags & IXAF_IS_IPV4);
3380
3381 /*
3382 * We never send to zero; the ULPs map it to the loopback address.
	 * We can't allow it since we use zero to mean uninitialized in some
3384 * places.
3385 */
3386 ASSERT(dst_addr != INADDR_ANY);
3387
3388 if (is_system_labeled()) {
3389 ts_label_t *tsl = NULL;
3390
3391 error = tsol_check_dest(ixa->ixa_tsl, &dst_addr, IPV4_VERSION,
3392 mac_mode, (flags & IPDF_ZONE_IS_GLOBAL) != 0, &tsl);
3393 if (error != 0)
3394 return (error);
3395 if (tsl != NULL) {
3396 /* Update the label */
3397 ip_xmit_attr_replace_tsl(ixa, tsl);
3398 }
3399 }
3400
3401 setsrc = INADDR_ANY;
3402 /*
	 * Select a route; for IPMP interfaces, we would only select
3404 * a "hidden" route (i.e., going through a specific under_ill)
3405 * if ixa_ifindex has been specified.
3406 */
3407 ire = ip_select_route_v4(firsthop, *src_addrp, ixa,
3408 &generation, &setsrc, &error, &multirt);
3409 ASSERT(ire != NULL); /* IRE_NOROUTE if none found */
3410 if (error != 0)
3411 goto bad_addr;
3412
3413 /*
3414 * ire can't be a broadcast or multicast unless IPDF_ALLOW_MCBC is set.
	 * If IPDF_VERIFY_DST is set, the destination must be reachable;
	 * otherwise the destination needn't be reachable.
3417 *
3418 * If we match on a reject or black hole, then we've got a
3419 * local failure. May as well fail out the connect() attempt,
3420 * since it's never going to succeed.
3421 */
3422 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
3423 /*
3424 * If we're verifying destination reachability, we always want
3425 * to complain here.
3426 *
3427 * If we're not verifying destination reachability but the
3428 * destination has a route, we still want to fail on the
3429 * temporary address and broadcast address tests.
3430 *
		 * In both cases we let the code continue so that some
		 * reasonable information is returned to the caller. That
		 * enables the caller to use (and even cache) the IRE.
		 * conn_ip_output will use the generation mismatch path to
		 * check for the unreachable case, thereby avoiding any
		 * specific check in the main path.
3436 */
3437 ASSERT(generation == IRE_GENERATION_VERIFY);
3438 if (flags & IPDF_VERIFY_DST) {
3439 /*
3440 * Set errno but continue to set up ixa_ire to be
3441 * the RTF_REJECT|RTF_BLACKHOLE IRE.
3442 * That allows callers to use ip_output to get an
3443 * ICMP error back.
3444 */
3445 if (!(ire->ire_type & IRE_HOST))
3446 error = ENETUNREACH;
3447 else
3448 error = EHOSTUNREACH;
3449 }
3450 }
3451
3452 if ((ire->ire_type & (IRE_BROADCAST|IRE_MULTICAST)) &&
3453 !(flags & IPDF_ALLOW_MCBC)) {
3454 ire_refrele(ire);
3455 ire = ire_reject(ipst, B_FALSE);
3456 generation = IRE_GENERATION_VERIFY;
3457 error = ENETUNREACH;
3458 }
3459
3460 /* Cache things */
3461 if (ixa->ixa_ire != NULL)
3462 ire_refrele_notr(ixa->ixa_ire);
3463 #ifdef DEBUG
3464 ire_refhold_notr(ire);
3465 ire_refrele(ire);
3466 #endif
3467 ixa->ixa_ire = ire;
3468 ixa->ixa_ire_generation = generation;
3469
3470 /*
3471 * Ensure that ixa_dce is always set any time that ixa_ire is set,
3472 * since some callers will send a packet to conn_ip_output() even if
3473 * there's an error.
3474 */
3475 if (flags & IPDF_UNIQUE_DCE) {
3476 /* Fallback to the default dce if allocation fails */
3477 dce = dce_lookup_and_add_v4(dst_addr, ipst);
3478 if (dce != NULL)
3479 generation = dce->dce_generation;
3480 else
3481 dce = dce_lookup_v4(dst_addr, ipst, &generation);
3482 } else {
3483 dce = dce_lookup_v4(dst_addr, ipst, &generation);
3484 }
3485 ASSERT(dce != NULL);
3486 if (ixa->ixa_dce != NULL)
3487 dce_refrele_notr(ixa->ixa_dce);
3488 #ifdef DEBUG
3489 dce_refhold_notr(dce);
3490 dce_refrele(dce);
3491 #endif
3492 ixa->ixa_dce = dce;
3493 ixa->ixa_dce_generation = generation;
3494
3495 /*
3496 * For multicast with multirt we have a flag passed back from
3497 * ire_lookup_multi_ill_v4 since we don't have an IRE for each
3498 * possible multicast address.
3499 * We also need a flag for multicast since we can't check
3500 * whether RTF_MULTIRT is set in ixa_ire for multicast.
3501 */
3502 if (multirt) {
3503 ixa->ixa_postfragfn = ip_postfrag_multirt_v4;
3504 ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST;
3505 } else {
3506 ixa->ixa_postfragfn = ire->ire_postfragfn;
3507 ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST;
3508 }
3509 if (!(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
3510 /* Get an nce to cache. */
3511 nce = ire_to_nce(ire, firsthop, NULL);
3512 if (nce == NULL) {
3513 /* Allocation failure? */
3514 ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
3515 } else {
3516 if (ixa->ixa_nce != NULL)
3517 nce_refrele(ixa->ixa_nce);
3518 ixa->ixa_nce = nce;
3519 }
3520 }
3521
3522 /*
3523 * If the source address is a loopback address, the
3524 * destination had best be local or multicast.
3525 * If we are sending to an IRE_LOCAL using a loopback source then
3526 * it had better be the same zoneid.
3527 */
3528 if (*src_addrp == htonl(INADDR_LOOPBACK)) {
3529 if ((ire->ire_type & IRE_LOCAL) && ire->ire_zoneid != zoneid) {
3530 ire = NULL; /* Stored in ixa_ire */
3531 error = EADDRNOTAVAIL;
3532 goto bad_addr;
3533 }
3534 if (!(ire->ire_type & (IRE_LOOPBACK|IRE_LOCAL|IRE_MULTICAST))) {
3535 ire = NULL; /* Stored in ixa_ire */
3536 error = EADDRNOTAVAIL;
3537 goto bad_addr;
3538 }
3539 }
3540 if (ire->ire_type & IRE_BROADCAST) {
3541 /*
3542 * If the ULP didn't have a specified source, then we
3543 * make sure we reselect the source when sending
3544 * broadcasts out different interfaces.
3545 */
3546 if (flags & IPDF_SELECT_SRC)
3547 ixa->ixa_flags |= IXAF_SET_SOURCE;
3548 else
3549 ixa->ixa_flags &= ~IXAF_SET_SOURCE;
3550 }
3551
3552 /*
3553 * Does the caller want us to pick a source address?
3554 */
3555 if (flags & IPDF_SELECT_SRC) {
3556 ipaddr_t src_addr;
3557
3558 /*
		 * We use ire_nexthop_ill to avoid the under ipmp
		 * interface for source address selection. Note that for ipmp
		 * probe packets, ixa_ifindex would have been specified, and
		 * the ip_select_route() invocation would have picked an ire
		 * with ire_ill pointing at an under interface.
3564 */
3565 ill = ire_nexthop_ill(ire);
3566
3567 /* If unreachable we have no ill but need some source */
3568 if (ill == NULL) {
3569 src_addr = htonl(INADDR_LOOPBACK);
3570 /* Make sure we look for a better source address */
3571 generation = SRC_GENERATION_VERIFY;
3572 } else {
3573 error = ip_select_source_v4(ill, setsrc, dst_addr,
3574 ixa->ixa_multicast_ifaddr, zoneid,
3575 ipst, &src_addr, &generation, NULL);
3576 if (error != 0) {
3577 ire = NULL; /* Stored in ixa_ire */
3578 goto bad_addr;
3579 }
3580 }
3581
3582 /*
		 * We allow the source address to be down.
3584 * However, we check that we don't use the loopback address
3585 * as a source when sending out on the wire.
3586 */
3587 if ((src_addr == htonl(INADDR_LOOPBACK)) &&
3588 !(ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK|IRE_MULTICAST)) &&
3589 !(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
3590 ire = NULL; /* Stored in ixa_ire */
3591 error = EADDRNOTAVAIL;
3592 goto bad_addr;
3593 }
3594
3595 *src_addrp = src_addr;
3596 ixa->ixa_src_generation = generation;
3597 }
3598
3599 /*
3600 * Make sure we don't leave an unreachable ixa_nce in place
	 * since ip_select_route is used when we unplumb, i.e., remove
3602 * references on ixa_ire, ixa_nce, and ixa_dce.
3603 */
3604 nce = ixa->ixa_nce;
3605 if (nce != NULL && nce->nce_is_condemned) {
3606 nce_refrele(nce);
3607 ixa->ixa_nce = NULL;
3608 ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
3609 }
3610
3611 /*
3612 * The caller has set IXAF_PMTU_DISCOVERY if path MTU is desired.
3613 * However, we can't do it for IPv4 multicast or broadcast.
3614 */
3615 if (ire->ire_type & (IRE_BROADCAST|IRE_MULTICAST))
3616 ixa->ixa_flags &= ~IXAF_PMTU_DISCOVERY;
3617
3618 /*
	 * Set the initial value for the fragmentation limit. Either
	 * conn_ip_output or the ULP might update it when there are routing
	 * changes.
3621 * Handles a NULL ixa_ire->ire_ill or a NULL ixa_nce for RTF_REJECT.
3622 */
3623 pmtu = ip_get_pmtu(ixa);
3624 ixa->ixa_fragsize = pmtu;
3625 /* Make sure ixa_fragsize and ixa_pmtu remain identical */
3626 if (ixa->ixa_flags & IXAF_VERIFY_PMTU)
3627 ixa->ixa_pmtu = pmtu;
3628
3629 /*
	 * Extract information useful for some transports.
	 * First we look for DCE metrics. Then we fall back to the metrics
	 * in the route, using the offlink route if we have one.
3634 */
3635 if (uinfo != NULL) {
3636 bzero(uinfo, sizeof (*uinfo));
3637
3638 if (dce->dce_flags & DCEF_UINFO)
3639 *uinfo = dce->dce_uinfo;
3640
3641 rts_merge_metrics(uinfo, &ire->ire_metrics);
3642
3643 /* Allow ire_metrics to decrease the path MTU from above */
3644 if (uinfo->iulp_mtu == 0 || uinfo->iulp_mtu > pmtu)
3645 uinfo->iulp_mtu = pmtu;
3646
3647 uinfo->iulp_localnet = (ire->ire_type & IRE_ONLINK) != 0;
3648 uinfo->iulp_loopback = (ire->ire_type & IRE_LOOPBACK) != 0;
3649 uinfo->iulp_local = (ire->ire_type & IRE_LOCAL) != 0;
3650 }
3651
3652 if (ill != NULL)
3653 ill_refrele(ill);
3654
3655 return (error);
3656
3657 bad_addr:
3658 if (ire != NULL)
3659 ire_refrele(ire);
3660
3661 if (ill != NULL)
3662 ill_refrele(ill);
3663
3664 /*
3665 * Make sure we don't leave an unreachable ixa_nce in place
	 * since ip_select_route is used when we unplumb, i.e., remove
3667 * references on ixa_ire, ixa_nce, and ixa_dce.
3668 */
3669 nce = ixa->ixa_nce;
3670 if (nce != NULL && nce->nce_is_condemned) {
3671 nce_refrele(nce);
3672 ixa->ixa_nce = NULL;
3673 ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
3674 }
3675
3676 return (error);
3677 }
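
/*
 * Illustrative sketch, not compiled into the module: a typical
 * connect-time invocation, asking for both source selection and
 * destination verification. The function name example_connect_dest is
 * hypothetical; connection-oriented ULPs make a call of this shape with
 * their conn's ip_xmit_attr_t.
 */
#ifdef notdef
static int
example_connect_dest(ip_xmit_attr_t *ixa, ipaddr_t dst, ipaddr_t *srcp,
    iulp_t *uinfo)
{
	*srcp = INADDR_ANY;	/* let IP pick the source */

	/* No source route in this example, hence firsthop == dst */
	return (ip_set_destination_v4(srcp, dst, dst, ixa, uinfo,
	    IPDF_SELECT_SRC | IPDF_VERIFY_DST, CONN_MAC_DEFAULT));
}
#endif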
3678
3679
3680 /*
3681 * Get the base MTU for the case when path MTU discovery is not used.
3682 * Takes the MTU of the IRE into account.
3683 */
3684 uint_t
3685 ip_get_base_mtu(ill_t *ill, ire_t *ire)
3686 {
3687 uint_t mtu;
3688 uint_t iremtu = ire->ire_metrics.iulp_mtu;
3689
3690 if (ire->ire_type & (IRE_MULTICAST|IRE_BROADCAST))
3691 mtu = ill->ill_mc_mtu;
3692 else
3693 mtu = ill->ill_mtu;
3694
3695 if (iremtu != 0 && iremtu < mtu)
3696 mtu = iremtu;
3697
3698 return (mtu);
3699 }
3700
3701 /*
3702 * Get the PMTU for the attributes. Handles both IPv4 and IPv6.
3703 * Assumes that ixa_ire, dce, and nce have already been set up.
3704 *
3705 * The caller has set IXAF_PMTU_DISCOVERY if path MTU discovery is desired.
 * We avoid path MTU discovery if it is disabled with ndd.
 * Furthermore, if the path MTU is too small, then we don't set DF for IPv4.
3708 *
3709 * NOTE: We also used to turn it off for source routed packets. That
3710 * is no longer required since the dce is per final destination.
3711 */
3712 uint_t
3713 ip_get_pmtu(ip_xmit_attr_t *ixa)
3714 {
3715 ip_stack_t *ipst = ixa->ixa_ipst;
3716 dce_t *dce;
3717 nce_t *nce;
3718 ire_t *ire;
3719 uint_t pmtu;
3720
3721 ire = ixa->ixa_ire;
3722 dce = ixa->ixa_dce;
3723 nce = ixa->ixa_nce;
3724
3725 /*
3726 * If path MTU discovery has been turned off by ndd, then we ignore
3727 * any dce_pmtu and for IPv4 we will not set DF.
3728 */
3729 if (!ipst->ips_ip_path_mtu_discovery)
3730 ixa->ixa_flags &= ~IXAF_PMTU_DISCOVERY;
3731
3732 pmtu = IP_MAXPACKET;
3733 /*
	 * Decide whether IPv4 sets DF.
	 * For IPv6, "no DF" means to use the 1280-byte minimum MTU.
3736 */
3737 if (ixa->ixa_flags & IXAF_PMTU_DISCOVERY) {
3738 ixa->ixa_flags |= IXAF_PMTU_IPV4_DF;
3739 } else {
3740 ixa->ixa_flags &= ~IXAF_PMTU_IPV4_DF;
3741 if (!(ixa->ixa_flags & IXAF_IS_IPV4))
3742 pmtu = IPV6_MIN_MTU;
3743 }
3744
	/* Check if the PMTU is too old before we use it */
3746 if ((dce->dce_flags & DCEF_PMTU) &&
3747 TICK_TO_SEC(ddi_get_lbolt64()) - dce->dce_last_change_time >
3748 ipst->ips_ip_pathmtu_interval) {
3749 /*
		 * Older than ip_pathmtu_interval (20 minutes by default).
		 * Drop the path MTU information.
3751 */
3752 mutex_enter(&dce->dce_lock);
3753 dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU);
3754 dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
3755 mutex_exit(&dce->dce_lock);
3756 dce_increment_generation(dce);
3757 }
3758
3759 /* The metrics on the route can lower the path MTU */
3760 if (ire->ire_metrics.iulp_mtu != 0 &&
3761 ire->ire_metrics.iulp_mtu < pmtu)
3762 pmtu = ire->ire_metrics.iulp_mtu;
3763
3764 /*
3765 * If the path MTU is smaller than some minimum, we still use dce_pmtu
3766 * above (would be 576 for IPv4 and 1280 for IPv6), but we clear
3767 * IXAF_PMTU_IPV4_DF so that we avoid setting DF for IPv4.
3768 */
3769 if (ixa->ixa_flags & IXAF_PMTU_DISCOVERY) {
3770 if (dce->dce_flags & DCEF_PMTU) {
3771 if (dce->dce_pmtu < pmtu)
3772 pmtu = dce->dce_pmtu;
3773
3774 if (dce->dce_flags & DCEF_TOO_SMALL_PMTU) {
3775 ixa->ixa_flags |= IXAF_PMTU_TOO_SMALL;
3776 ixa->ixa_flags &= ~IXAF_PMTU_IPV4_DF;
3777 } else {
3778 ixa->ixa_flags &= ~IXAF_PMTU_TOO_SMALL;
3779 ixa->ixa_flags |= IXAF_PMTU_IPV4_DF;
3780 }
3781 } else {
3782 ixa->ixa_flags &= ~IXAF_PMTU_TOO_SMALL;
3783 ixa->ixa_flags |= IXAF_PMTU_IPV4_DF;
3784 }
3785 }
3786
3787 /*
	 * If we have an IRE_LOCAL we use the loopback mtu instead of
	 * the ill mtu for going out the wire, i.e., IRE_LOCAL gets the
	 * same mtu as IRE_LOOPBACK.
3791 */
3792 if (ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) {
3793 uint_t loopback_mtu;
3794
3795 loopback_mtu = (ire->ire_ipversion == IPV6_VERSION) ?
3796 ip_loopback_mtu_v6plus : ip_loopback_mtuplus;
3797
3798 if (loopback_mtu < pmtu)
3799 pmtu = loopback_mtu;
3800 } else if (nce != NULL) {
3801 /*
3802 * Make sure we don't exceed the interface MTU.
3803 * In the case of RTF_REJECT or RTF_BLACKHOLE we might not have
3804 * an ill. We'd use the above IP_MAXPACKET in that case just
3805 * to tell the transport something larger than zero.
3806 */
3807 if (ire->ire_type & (IRE_MULTICAST|IRE_BROADCAST)) {
3808 if (nce->nce_common->ncec_ill->ill_mc_mtu < pmtu)
3809 pmtu = nce->nce_common->ncec_ill->ill_mc_mtu;
3810 if (nce->nce_common->ncec_ill != nce->nce_ill &&
3811 nce->nce_ill->ill_mc_mtu < pmtu) {
3812 /*
3813 * for interfaces in an IPMP group, the mtu of
3814 * the nce_ill (under_ill) could be different
3815 * from the mtu of the ncec_ill, so we take the
3816 * min of the two.
3817 */
3818 pmtu = nce->nce_ill->ill_mc_mtu;
3819 }
3820 } else {
3821 if (nce->nce_common->ncec_ill->ill_mtu < pmtu)
3822 pmtu = nce->nce_common->ncec_ill->ill_mtu;
3823 if (nce->nce_common->ncec_ill != nce->nce_ill &&
3824 nce->nce_ill->ill_mtu < pmtu) {
3825 /*
3826 * for interfaces in an IPMP group, the mtu of
3827 * the nce_ill (under_ill) could be different
3828 * from the mtu of the ncec_ill, so we take the
3829 * min of the two.
3830 */
3831 pmtu = nce->nce_ill->ill_mtu;
3832 }
3833 }
3834 }
3835
3836 /*
3837 * Handle the IPV6_USE_MIN_MTU socket option or ancillary data.
3838 * Only applies to IPv6.
3839 */
3840 if (!(ixa->ixa_flags & IXAF_IS_IPV4)) {
3841 if (ixa->ixa_flags & IXAF_USE_MIN_MTU) {
3842 switch (ixa->ixa_use_min_mtu) {
3843 case IPV6_USE_MIN_MTU_MULTICAST:
3844 if (ire->ire_type & IRE_MULTICAST)
3845 pmtu = IPV6_MIN_MTU;
3846 break;
3847 case IPV6_USE_MIN_MTU_ALWAYS:
3848 pmtu = IPV6_MIN_MTU;
3849 break;
3850 case IPV6_USE_MIN_MTU_NEVER:
3851 break;
3852 }
3853 } else {
3854 /* Default is IPV6_USE_MIN_MTU_MULTICAST */
3855 if (ire->ire_type & IRE_MULTICAST)
3856 pmtu = IPV6_MIN_MTU;
3857 }
3858 }
3859
3860 /*
	 * After receiving an ICMPv6 "packet too big" message with a
	 * MTU < 1280, and for multirouted IPv6 packets, the IP layer
	 * will insert an 8-byte fragment header in every packet. We
	 * compensate for those cases by returning a smaller path MTU to
	 * the ULP.
	 *
	 * In the case of CGTP, ip_output will add a fragment header.
	 * Make sure there is room for it by telling a smaller number
	 * to the transport.
	 *
	 * When IXAF_IPV6_ADD_FRAGHDR is set we subtract the frag hdr here
	 * so that the ULPs consistently see an iulp_pmtu and ip_get_pmtu()
	 * result which is the size of the packets they can send.
3873 */
3874 if (!(ixa->ixa_flags & IXAF_IS_IPV4)) {
3875 if ((dce->dce_flags & DCEF_TOO_SMALL_PMTU) ||
3876 (ire->ire_flags & RTF_MULTIRT) ||
3877 (ixa->ixa_flags & IXAF_MULTIRT_MULTICAST)) {
3878 pmtu -= sizeof (ip6_frag_t);
3879 ixa->ixa_flags |= IXAF_IPV6_ADD_FRAGHDR;
3880 }
3881 }
3882
3883 return (pmtu);
3884 }
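
/*
 * Illustrative sketch, not compiled into the module: deriving a payload
 * limit from ip_get_pmtu(). The arithmetic assumes a plain IPv4 header
 * and a minimal TCP header; real transports also account for options.
 * The function name example_mss_from_pmtu is hypothetical.
 */
#ifdef notdef
static uint_t
example_mss_from_pmtu(ip_xmit_attr_t *ixa)
{
	uint_t	pmtu = ip_get_pmtu(ixa);

	/* 20 bytes of IPv4 header plus 20 bytes of TCP header */
	return (pmtu - IP_SIMPLE_HDR_LENGTH - TCP_MIN_HEADER_LENGTH);
}
#endif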
3885
3886 /*
3887 * Carve "len" bytes out of an mblk chain, consuming any we empty, and duping
3888 * the final piece where we don't. Return a pointer to the first mblk in the
3889 * result, and update the pointer to the next mblk to chew on. If anything
3890 * goes wrong (i.e., dupb fails), we waste everything in sight and return a
3891 * NULL pointer.
3892 */
3893 mblk_t *
3894 ip_carve_mp(mblk_t **mpp, ssize_t len)
3895 {
3896 mblk_t *mp0;
3897 mblk_t *mp1;
3898 mblk_t *mp2;
3899
3900 if (!len || !mpp || !(mp0 = *mpp))
3901 return (NULL);
3902 /* If we aren't going to consume the first mblk, we need a dup. */
3903 if (mp0->b_wptr - mp0->b_rptr > len) {
3904 mp1 = dupb(mp0);
3905 if (mp1) {
3906 /* Partition the data between the two mblks. */
3907 mp1->b_wptr = mp1->b_rptr + len;
3908 mp0->b_rptr = mp1->b_wptr;
3909 /*
			 * If, after the adjustments, the mblk that was not
			 * consumed is now unaligned, try to align it. If
			 * this fails, free all the messages and let the
			 * upper layer recover.
3913 */
3914 if (!OK_32PTR(mp0->b_rptr)) {
3915 if (!pullupmsg(mp0, -1)) {
3916 freemsg(mp0);
3917 freemsg(mp1);
3918 *mpp = NULL;
3919 return (NULL);
3920 }
3921 }
3922 }
3923 return (mp1);
3924 }
3925 /* Eat through as many mblks as we need to get len bytes. */
3926 len -= mp0->b_wptr - mp0->b_rptr;
3927 for (mp2 = mp1 = mp0; (mp2 = mp2->b_cont) != 0 && len; mp1 = mp2) {
3928 if (mp2->b_wptr - mp2->b_rptr > len) {
3929 /*
3930 * We won't consume the entire last mblk. Like
3931 * above, dup and partition it.
3932 */
3933 mp1->b_cont = dupb(mp2);
3934 mp1 = mp1->b_cont;
3935 if (!mp1) {
3936 /*
3937 * Trouble. Rather than go to a lot of
3938 * trouble to clean up, we free the messages.
3939 * This won't be any worse than losing it on
3940 * the wire.
3941 */
3942 freemsg(mp0);
3943 freemsg(mp2);
3944 *mpp = NULL;
3945 return (NULL);
3946 }
3947 mp1->b_wptr = mp1->b_rptr + len;
3948 mp2->b_rptr = mp1->b_wptr;
3949 /*
			 * If, after the adjustments, the mblk that was not
			 * consumed is now unaligned, try to align it. If
			 * this fails, free all the messages and let the
			 * upper layer recover.
3953 */
3954 if (!OK_32PTR(mp2->b_rptr)) {
3955 if (!pullupmsg(mp2, -1)) {
3956 freemsg(mp0);
3957 freemsg(mp2);
3958 *mpp = NULL;
3959 return (NULL);
3960 }
3961 }
3962 *mpp = mp2;
3963 return (mp0);
3964 }
3965 /* Decrement len by the amount we just got. */
3966 len -= mp2->b_wptr - mp2->b_rptr;
3967 }
3968 /*
	 * len should be reduced to zero now. If not, our caller has
	 * screwed up.
3971 */
3972 if (len) {
3973 /* Shouldn't happen! */
3974 freemsg(mp0);
3975 *mpp = NULL;
3976 return (NULL);
3977 }
3978 /*
3979 * We consumed up to exactly the end of an mblk. Detach the part
3980 * we are returning from the rest of the chain.
3981 */
3982 mp1->b_cont = NULL;
3983 *mpp = mp2;
3984 return (mp0);
3985 }
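
/*
 * Illustrative sketch, not compiled into the module: ip_carve_mp() used
 * in a fragmentation-style loop, peeling fixed-size pieces off a chain.
 * The caller must track how many bytes remain, since asking for more
 * than the chain holds frees everything. Names here are hypothetical.
 */
#ifdef notdef
static void
example_carve_all(mblk_t *mp, ssize_t total, ssize_t chunk)
{
	mblk_t	*piece;

	while (total > 0 && mp != NULL) {
		piece = ip_carve_mp(&mp, MIN(chunk, total));
		if (piece == NULL)
			break;	/* dupb failed; the chain was freed */
		total -= msgdsize(piece);
		freemsg(piece);	/* a real caller would transmit it */
	}
}
#endif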
3986
3987 /* The ill stream is being unplumbed. Called from ip_close */
3988 int
3989 ip_modclose(ill_t *ill)
3990 {
3991 boolean_t success;
3992 ipsq_t *ipsq;
3993 ipif_t *ipif;
3994 queue_t *q = ill->ill_rq;
3995 ip_stack_t *ipst = ill->ill_ipst;
3996 int i;
3997 arl_ill_common_t *ai = ill->ill_common;
3998
3999 /*
4000 * The punlink prior to this may have initiated a capability
4001 * negotiation. But ipsq_enter will block until that finishes or
4002 * times out.
4003 */
4004 success = ipsq_enter(ill, B_FALSE, NEW_OP);
4005
4006 /*
	 * Open/close/push/pop is guaranteed to be single threaded
	 * per stream by STREAMS. FS guarantees that all references
	 * from top are gone before close is called. So there can't
	 * be another close thread that has set CONDEMNED on this ill
	 * and caused ipsq_enter to return failure.
4012 */
4013 ASSERT(success);
4014 ipsq = ill->ill_phyint->phyint_ipsq;
4015
4016 /*
4017 * Mark it condemned. No new reference will be made to this ill.
4018 * Lookup functions will return an error. Threads that try to
4019 * increment the refcnt must check for ILL_CAN_LOOKUP. This ensures
4020 * that the refcnt will drop down to zero.
4021 */
4022 mutex_enter(&ill->ill_lock);
4023 ill->ill_state_flags |= ILL_CONDEMNED;
4024 for (ipif = ill->ill_ipif; ipif != NULL;
4025 ipif = ipif->ipif_next) {
4026 ipif->ipif_state_flags |= IPIF_CONDEMNED;
4027 }
4028 /*
4029 * Wake up anybody waiting to enter the ipsq. ipsq_enter
	 * returns an error if ILL_CONDEMNED is set.
4031 */
4032 cv_broadcast(&ill->ill_cv);
4033 mutex_exit(&ill->ill_lock);
4034
4035 /*
4036 * Send all the deferred DLPI messages downstream which came in
4037 * during the small window right before ipsq_enter(). We do this
4038 * without waiting for the ACKs because all the ACKs for M_PROTO
4039 * messages are ignored in ip_rput() when ILL_CONDEMNED is set.
4040 */
4041 ill_dlpi_send_deferred(ill);
4042
4043 /*
4044 * Shut down fragmentation reassembly.
4045 * ill_frag_timer won't start a timer again.
	 * Now cancel any existing timer.
4047 */
4048 (void) untimeout(ill->ill_frag_timer_id);
4049 (void) ill_frag_timeout(ill, 0);
4050
4051 /*
4052 * Call ill_delete to bring down the ipifs, ilms and ill on
4053 * this ill. Then wait for the refcnts to drop to zero.
4054 * ill_is_freeable checks whether the ill is really quiescent.
4055 * Then make sure that threads that are waiting to enter the
4056 * ipsq have seen the error returned by ipsq_enter and have
4057 * gone away. Then we call ill_delete_tail which does the
4058 * DL_UNBIND_REQ with the driver and then qprocsoff.
4059 */
4060 ill_delete(ill);
4061 mutex_enter(&ill->ill_lock);
4062 while (!ill_is_freeable(ill))
4063 cv_wait(&ill->ill_cv, &ill->ill_lock);
4064
4065 while (ill->ill_waiters)
4066 cv_wait(&ill->ill_cv, &ill->ill_lock);
4067
4068 mutex_exit(&ill->ill_lock);
4069
4070 /*
4071 * ill_delete_tail drops reference on ill_ipst, but we need to keep
4072 * it held until the end of the function since the cleanup
4073 * below needs to be able to use the ip_stack_t.
4074 */
4075 netstack_hold(ipst->ips_netstack);
4076
4077 /* qprocsoff is done via ill_delete_tail */
4078 ill_delete_tail(ill);
4079 /*
	 * Synchronously wait for the ARP stream to unbind. After this, we
	 * cannot get any data packets up from the driver.
4082 */
4083 arp_unbind_complete(ill);
4084 ASSERT(ill->ill_ipst == NULL);
4085
4086 /*
	 * Walk through all conns and qenable those that have queued data.
	 * Close synchronization needs this to be done to ensure that all
	 * upper layers blocked due to flow control to the closing device
	 * get unblocked.
4092 */
	ip1dbg(("ip_modclose: walking\n"));
4094 for (i = 0; i < TX_FANOUT_SIZE; i++) {
4095 conn_walk_drain(ipst, &ipst->ips_idl_tx_list[i]);
4096 }
4097
4098 /*
	 * ai can be null if this is an IPv6 ill, or if the IPv4
	 * stream is being torn down before ARP was plumbed (e.g.,
	 * /sbin/ifconfig plumbing a stream twice, and encountering
	 * an error).
4103 */
4104 if (ai != NULL) {
4105 ASSERT(!ill->ill_isv6);
4106 mutex_enter(&ai->ai_lock);
4107 ai->ai_ill = NULL;
4108 if (ai->ai_arl == NULL) {
4109 mutex_destroy(&ai->ai_lock);
4110 kmem_free(ai, sizeof (*ai));
4111 } else {
4112 cv_signal(&ai->ai_ill_unplumb_done);
4113 mutex_exit(&ai->ai_lock);
4114 }
4115 }
4116
4117 mutex_enter(&ipst->ips_ip_mi_lock);
4118 mi_close_unlink(&ipst->ips_ip_g_head, (IDP)ill);
4119 mutex_exit(&ipst->ips_ip_mi_lock);
4120
4121 /*
4122 * credp could be null if the open didn't succeed and ip_modopen
4123 * itself calls ip_close.
4124 */
4125 if (ill->ill_credp != NULL)
4126 crfree(ill->ill_credp);
4127
4128 mutex_destroy(&ill->ill_saved_ire_lock);
4129 mutex_destroy(&ill->ill_lock);
4130 rw_destroy(&ill->ill_mcast_lock);
4131 mutex_destroy(&ill->ill_mcast_serializer);
4132 list_destroy(&ill->ill_nce);
4133
4134 /*
4135 * Now we are done with the module close pieces that
4136 * need the netstack_t.
4137 */
4138 netstack_rele(ipst->ips_netstack);
4139
4140 mi_close_free((IDP)ill);
4141 q->q_ptr = WR(q)->q_ptr = NULL;
4142
4143 ipsq_exit(ipsq);
4144
4145 return (0);
4146 }
4147
4148 /*
4149 * This is called as part of close() for IP, UDP, ICMP, and RTS
4150 * in order to quiesce the conn.
4151 */
4152 void
4153 ip_quiesce_conn(conn_t *connp)
4154 {
4155 boolean_t drain_cleanup_reqd = B_FALSE;
4156 boolean_t conn_ioctl_cleanup_reqd = B_FALSE;
4157 boolean_t ilg_cleanup_reqd = B_FALSE;
4158 ip_stack_t *ipst;
4159
4160 ASSERT(!IPCL_IS_TCP(connp));
4161 ipst = connp->conn_netstack->netstack_ip;
4162
4163 /*
	 * Mark the conn as closing; this conn must not be inserted
	 * in the future into any list. E.g., conn_drain_insert() won't
	 * insert this conn into the conn_drain_list.
	 *
	 * conn_idl and conn_ilg cannot get set henceforth.
4169 */
4170 mutex_enter(&connp->conn_lock);
4171 ASSERT(!(connp->conn_state_flags & CONN_QUIESCED));
4172 connp->conn_state_flags |= CONN_CLOSING;
4173 if (connp->conn_idl != NULL)
4174 drain_cleanup_reqd = B_TRUE;
4175 if (connp->conn_oper_pending_ill != NULL)
4176 conn_ioctl_cleanup_reqd = B_TRUE;
4177 if (connp->conn_dhcpinit_ill != NULL) {
4178 ASSERT(connp->conn_dhcpinit_ill->ill_dhcpinit != 0);
4179 atomic_dec_32(&connp->conn_dhcpinit_ill->ill_dhcpinit);
4180 ill_set_inputfn(connp->conn_dhcpinit_ill);
4181 connp->conn_dhcpinit_ill = NULL;
4182 }
4183 if (connp->conn_ilg != NULL)
4184 ilg_cleanup_reqd = B_TRUE;
4185 mutex_exit(&connp->conn_lock);
4186
4187 if (conn_ioctl_cleanup_reqd)
4188 conn_ioctl_cleanup(connp);
4189
4190 if (is_system_labeled() && connp->conn_anon_port) {
4191 (void) tsol_mlp_anon(crgetzone(connp->conn_cred),
4192 connp->conn_mlp_type, connp->conn_proto,
4193 ntohs(connp->conn_lport), B_FALSE);
4194 connp->conn_anon_port = 0;
4195 }
4196 connp->conn_mlp_type = mlptSingle;
4197
4198 /*
	 * Remove this conn from any fanout list it is on,
	 * and then wait for any threads currently operating
	 * on this endpoint to finish.
4202 */
4203 ipcl_hash_remove(connp);
4204
4205 /*
4206 * Remove this conn from the drain list, and do any other cleanup that
4207 * may be required. (TCP conns are never flow controlled, and
4208 * conn_idl will be NULL.)
4209 */
4210 if (drain_cleanup_reqd && connp->conn_idl != NULL) {
4211 idl_t *idl = connp->conn_idl;
4212
4213 mutex_enter(&idl->idl_lock);
4214 conn_drain(connp, B_TRUE);
4215 mutex_exit(&idl->idl_lock);
4216 }
4217
4218 if (connp == ipst->ips_ip_g_mrouter)
4219 (void) ip_mrouter_done(ipst);
4220
4221 if (ilg_cleanup_reqd)
4222 ilg_delete_all(connp);
4223
4224 /*
	 * Now the conn refcnt can increase only through CONN_INC_REF_LOCKED.
	 * Callers from the write side can't be there now because close
	 * is in progress. The only other caller is ipcl_walk, which
	 * checks for the condemned flag.
4229 */
4230 mutex_enter(&connp->conn_lock);
4231 connp->conn_state_flags |= CONN_CONDEMNED;
4232 while (connp->conn_ref != 1)
4233 cv_wait(&connp->conn_cv, &connp->conn_lock);
4234 connp->conn_state_flags |= CONN_QUIESCED;
4235 mutex_exit(&connp->conn_lock);
4236 }
4237
4238 /* ARGSUSED */
4239 int
4240 ip_close(queue_t *q, int flags)
4241 {
4242 conn_t *connp;
4243
4244 /*
4245 * Call the appropriate delete routine depending on whether this is
4246 * a module or device.
4247 */
4248 if (WR(q)->q_next != NULL) {
4249 /* This is a module close */
4250 return (ip_modclose((ill_t *)q->q_ptr));
4251 }
4252
4253 connp = q->q_ptr;
4254 ip_quiesce_conn(connp);
4255
4256 qprocsoff(q);
4257
4258 /*
4259 * Now we are truly single threaded on this stream, and can
4260 * delete the things hanging off the connp, and finally the connp.
	 * We removed this connp from the fanout list; it cannot be
	 * accessed through the fanouts, and we already waited for the
4263 * conn_ref to drop to 0. We are already in close, so
4264 * there cannot be any other thread from the top. qprocsoff
4265 * has completed, and service has completed or won't run in
4266 * future.
4267 */
4268 ASSERT(connp->conn_ref == 1);
4269
4270 inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
4271
4272 connp->conn_ref--;
4273 ipcl_conn_destroy(connp);
4274
4275 q->q_ptr = WR(q)->q_ptr = NULL;
4276 return (0);
4277 }
4278
4279 /*
 * Wrapper around putnext() so that ip_rts_request can merely use
4281 * conn_recv.
4282 */
4283 /*ARGSUSED2*/
4284 static void
4285 ip_conn_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
4286 {
4287 conn_t *connp = (conn_t *)arg1;
4288
4289 putnext(connp->conn_rq, mp);
4290 }
4291
4292 /* Dummy in case ICMP error delivery is attempted to a /dev/ip instance */
4293 /* ARGSUSED */
4294 static void
4295 ip_conn_input_icmp(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
4296 {
4297 freemsg(mp);
4298 }
4299
4300 /*
4301 * Called when the module is about to be unloaded
4302 */
4303 void
4304 ip_ddi_destroy(void)
4305 {
4306 /* This needs to be called before destroying any transports. */
4307 mutex_enter(&cpu_lock);
4308 unregister_cpu_setup_func(ip_tp_cpu_update, NULL);
4309 mutex_exit(&cpu_lock);
4310
4311 tnet_fini();
4312
4313 icmp_ddi_g_destroy();
4314 rts_ddi_g_destroy();
4315 udp_ddi_g_destroy();
4316 dccp_ddi_g_destroy();
4317 sctp_ddi_g_destroy();
4318 tcp_ddi_g_destroy();
4319 ilb_ddi_g_destroy();
4320 dce_g_destroy();
4321 ipsec_policy_g_destroy();
4322 ipcl_g_destroy();
4323 ip_net_g_destroy();
4324 ip_ire_g_fini();
4325 inet_minor_destroy(ip_minor_arena_sa);
4326 #if defined(_LP64)
4327 inet_minor_destroy(ip_minor_arena_la);
4328 #endif
4329
4330 #ifdef DEBUG
4331 list_destroy(&ip_thread_list);
4332 rw_destroy(&ip_thread_rwlock);
4333 tsd_destroy(&ip_thread_data);
4334 #endif
4335
4336 netstack_unregister(NS_IP);
4337 }
4338
4339 /*
4340 * First step in cleanup.
4341 */
4342 /* ARGSUSED */
4343 static void
4344 ip_stack_shutdown(netstackid_t stackid, void *arg)
4345 {
4346 ip_stack_t *ipst = (ip_stack_t *)arg;
4347
4348 #ifdef NS_DEBUG
4349 printf("ip_stack_shutdown(%p, stack %d)\n", (void *)ipst, stackid);
4350 #endif
4351
4352 /*
4353 * Perform cleanup for special interfaces (loopback and IPMP).
4354 */
4355 ip_interface_cleanup(ipst);
4356
4357 /*
4358 * The *_hook_shutdown()s start the process of notifying any
 * consumers that things are going away... nothing is destroyed.
4360 */
4361 ipv4_hook_shutdown(ipst);
4362 ipv6_hook_shutdown(ipst);
4363 arp_hook_shutdown(ipst);
4364
4365 mutex_enter(&ipst->ips_capab_taskq_lock);
4366 ipst->ips_capab_taskq_quit = B_TRUE;
4367 cv_signal(&ipst->ips_capab_taskq_cv);
4368 mutex_exit(&ipst->ips_capab_taskq_lock);
4369 }
4370
4371 /*
4372 * Free the IP stack instance.
4373 */
4374 static void
4375 ip_stack_fini(netstackid_t stackid, void *arg)
4376 {
4377 ip_stack_t *ipst = (ip_stack_t *)arg;
4378 int ret;
4379
4380 #ifdef NS_DEBUG
4381 printf("ip_stack_fini(%p, stack %d)\n", (void *)ipst, stackid);
4382 #endif
4383 /*
4384 * At this point, all of the notifications that the events and
4385 * protocols are going away have been run, meaning that we can
4386 * now set about starting to clean things up.
4387 */
4388 ipobs_fini(ipst);
4389 ipv4_hook_destroy(ipst);
4390 ipv6_hook_destroy(ipst);
4391 arp_hook_destroy(ipst);
4392 ip_net_destroy(ipst);
4393
4394 ipmp_destroy(ipst);
4395
4396 ip_kstat_fini(stackid, ipst->ips_ip_mibkp);
4397 ipst->ips_ip_mibkp = NULL;
4398 icmp_kstat_fini(stackid, ipst->ips_icmp_mibkp);
4399 ipst->ips_icmp_mibkp = NULL;
4400 ip_kstat2_fini(stackid, ipst->ips_ip_kstat);
4401 ipst->ips_ip_kstat = NULL;
4402 bzero(&ipst->ips_ip_statistics, sizeof (ipst->ips_ip_statistics));
4403 ip6_kstat_fini(stackid, ipst->ips_ip6_kstat);
4404 ipst->ips_ip6_kstat = NULL;
4405 bzero(&ipst->ips_ip6_statistics, sizeof (ipst->ips_ip6_statistics));
4406
4407 kmem_free(ipst->ips_propinfo_tbl,
4408 ip_propinfo_count * sizeof (mod_prop_info_t));
4409 ipst->ips_propinfo_tbl = NULL;
4410
4411 dce_stack_destroy(ipst);
4412 ip_mrouter_stack_destroy(ipst);
4413
4414 ret = untimeout(ipst->ips_igmp_timeout_id);
4415 if (ret == -1) {
4416 ASSERT(ipst->ips_igmp_timeout_id == 0);
4417 } else {
4418 ASSERT(ipst->ips_igmp_timeout_id != 0);
4419 ipst->ips_igmp_timeout_id = 0;
4420 }
4421 ret = untimeout(ipst->ips_igmp_slowtimeout_id);
4422 if (ret == -1) {
4423 ASSERT(ipst->ips_igmp_slowtimeout_id == 0);
4424 } else {
4425 ASSERT(ipst->ips_igmp_slowtimeout_id != 0);
4426 ipst->ips_igmp_slowtimeout_id = 0;
4427 }
4428 ret = untimeout(ipst->ips_mld_timeout_id);
4429 if (ret == -1) {
4430 ASSERT(ipst->ips_mld_timeout_id == 0);
4431 } else {
4432 ASSERT(ipst->ips_mld_timeout_id != 0);
4433 ipst->ips_mld_timeout_id = 0;
4434 }
4435 ret = untimeout(ipst->ips_mld_slowtimeout_id);
4436 if (ret == -1) {
4437 ASSERT(ipst->ips_mld_slowtimeout_id == 0);
4438 } else {
4439 ASSERT(ipst->ips_mld_slowtimeout_id != 0);
4440 ipst->ips_mld_slowtimeout_id = 0;
4441 }
4442
4443 ip_ire_fini(ipst);
4444 ip6_asp_free(ipst);
4445 conn_drain_fini(ipst);
4446 ipcl_destroy(ipst);
4447
4448 mutex_destroy(&ipst->ips_ndp4->ndp_g_lock);
4449 mutex_destroy(&ipst->ips_ndp6->ndp_g_lock);
4450 kmem_free(ipst->ips_ndp4, sizeof (ndp_g_t));
4451 ipst->ips_ndp4 = NULL;
4452 kmem_free(ipst->ips_ndp6, sizeof (ndp_g_t));
4453 ipst->ips_ndp6 = NULL;
4454
4455 if (ipst->ips_loopback_ksp != NULL) {
4456 kstat_delete_netstack(ipst->ips_loopback_ksp, stackid);
4457 ipst->ips_loopback_ksp = NULL;
4458 }
4459
4460 mutex_destroy(&ipst->ips_capab_taskq_lock);
4461 cv_destroy(&ipst->ips_capab_taskq_cv);
4462
4463 rw_destroy(&ipst->ips_srcid_lock);
4464
4465 mutex_destroy(&ipst->ips_ip_mi_lock);
4466 rw_destroy(&ipst->ips_ill_g_usesrc_lock);
4467
4468 mutex_destroy(&ipst->ips_igmp_timer_lock);
4469 mutex_destroy(&ipst->ips_mld_timer_lock);
4470 mutex_destroy(&ipst->ips_igmp_slowtimeout_lock);
4471 mutex_destroy(&ipst->ips_mld_slowtimeout_lock);
4472 mutex_destroy(&ipst->ips_ip_addr_avail_lock);
4473 rw_destroy(&ipst->ips_ill_g_lock);
4474
4475 kmem_free(ipst->ips_phyint_g_list, sizeof (phyint_list_t));
4476 ipst->ips_phyint_g_list = NULL;
4477 kmem_free(ipst->ips_ill_g_heads, sizeof (ill_g_head_t) * MAX_G_HEADS);
4478 ipst->ips_ill_g_heads = NULL;
4479
4480 ldi_ident_release(ipst->ips_ldi_ident);
4481 kmem_free(ipst, sizeof (*ipst));
4482 }
4483
4484 /*
4485 * This function is called from the TSD destructor, and is used to debug
4486 * reference count issues in IP. See block comment in <inet/ip_if.h> for
4487 * details.
4488 */
4489 static void
4490 ip_thread_exit(void *phash)
4491 {
4492 th_hash_t *thh = phash;
4493
4494 rw_enter(&ip_thread_rwlock, RW_WRITER);
4495 list_remove(&ip_thread_list, thh);
4496 rw_exit(&ip_thread_rwlock);
4497 mod_hash_destroy_hash(thh->thh_hash);
4498 kmem_free(thh, sizeof (*thh));
4499 }
4500
4501 /*
4502 * Called when the IP kernel module is loaded into the kernel
4503 */
4504 void
4505 ip_ddi_init(void)
4506 {
4507 ip_squeue_flag = ip_squeue_switch(ip_squeue_enter);
4508
4509 /*
4510 * For IP and TCP the minor numbers should start from 2 since we have 4
4511 * initial devices: ip, ip6, tcp, tcp6.
4512 */
4513 /*
4514 * If this is a 64-bit kernel, then create two separate arenas -
 * one for TLI clients in the range of INET_MIN_DEV+2 through 2^18 - 1,
 * and the other for socket apps in the range 2^18 through 2^32 - 1.
4517 */
4518 ip_minor_arena_la = NULL;
4519 ip_minor_arena_sa = NULL;
4520 #if defined(_LP64)
4521 if ((ip_minor_arena_sa = inet_minor_create("ip_minor_arena_sa",
4522 INET_MIN_DEV + 2, MAXMIN32, KM_SLEEP)) == NULL) {
4523 cmn_err(CE_PANIC,
4524 "ip_ddi_init: ip_minor_arena_sa creation failed\n");
4525 }
4526 if ((ip_minor_arena_la = inet_minor_create("ip_minor_arena_la",
4527 MAXMIN32 + 1, MAXMIN64, KM_SLEEP)) == NULL) {
4528 cmn_err(CE_PANIC,
4529 "ip_ddi_init: ip_minor_arena_la creation failed\n");
4530 }
4531 #else
4532 if ((ip_minor_arena_sa = inet_minor_create("ip_minor_arena_sa",
4533 INET_MIN_DEV + 2, MAXMIN, KM_SLEEP)) == NULL) {
4534 cmn_err(CE_PANIC,
4535 "ip_ddi_init: ip_minor_arena_sa creation failed\n");
4536 }
4537 #endif
4538 ip_poll_normal_ticks = MSEC_TO_TICK_ROUNDUP(ip_poll_normal_ms);
4539
4540 ipcl_g_init();
4541 ip_ire_g_init();
4542 ip_net_g_init();
4543
4544 #ifdef DEBUG
4545 tsd_create(&ip_thread_data, ip_thread_exit);
4546 rw_init(&ip_thread_rwlock, NULL, RW_DEFAULT, NULL);
4547 list_create(&ip_thread_list, sizeof (th_hash_t),
4548 offsetof(th_hash_t, thh_link));
4549 #endif
4550 ipsec_policy_g_init();
4551 tcp_ddi_g_init();
4552 sctp_ddi_g_init();
4553 dccp_ddi_g_init();
4554 dce_g_init();
4555
4556 /*
4557 * We want to be informed each time a stack is created or
	 * destroyed in the kernel, so we can maintain the
	 * set of ip_stack_t's.
4560 */
4561 netstack_register(NS_IP, ip_stack_init, ip_stack_shutdown,
4562 ip_stack_fini);
4563
4564 tnet_init();
4565
4566 udp_ddi_g_init();
4567 rts_ddi_g_init();
4568 icmp_ddi_g_init();
4569 ilb_ddi_g_init();
4570
4571 /* This needs to be called after all transports are initialized. */
4572 mutex_enter(&cpu_lock);
4573 register_cpu_setup_func(ip_tp_cpu_update, NULL);
4574 mutex_exit(&cpu_lock);
4575 }
4576
4577 /*
4578 * Initialize the IP stack instance.
4579 */
4580 static void *
4581 ip_stack_init(netstackid_t stackid, netstack_t *ns)
4582 {
4583 ip_stack_t *ipst;
4584 size_t arrsz;
4585 major_t major;
4586
4587 #ifdef NS_DEBUG
4588 printf("ip_stack_init(stack %d)\n", stackid);
4589 #endif
4590
4591 ipst = (ip_stack_t *)kmem_zalloc(sizeof (*ipst), KM_SLEEP);
4592 ipst->ips_netstack = ns;
4593
4594 ipst->ips_ill_g_heads = kmem_zalloc(sizeof (ill_g_head_t) * MAX_G_HEADS,
4595 KM_SLEEP);
4596 ipst->ips_phyint_g_list = kmem_zalloc(sizeof (phyint_list_t),
4597 KM_SLEEP);
4598 ipst->ips_ndp4 = kmem_zalloc(sizeof (ndp_g_t), KM_SLEEP);
4599 ipst->ips_ndp6 = kmem_zalloc(sizeof (ndp_g_t), KM_SLEEP);
4600 mutex_init(&ipst->ips_ndp4->ndp_g_lock, NULL, MUTEX_DEFAULT, NULL);
4601 mutex_init(&ipst->ips_ndp6->ndp_g_lock, NULL, MUTEX_DEFAULT, NULL);
4602
4603 mutex_init(&ipst->ips_igmp_timer_lock, NULL, MUTEX_DEFAULT, NULL);
4604 ipst->ips_igmp_deferred_next = INFINITY;
4605 mutex_init(&ipst->ips_mld_timer_lock, NULL, MUTEX_DEFAULT, NULL);
4606 ipst->ips_mld_deferred_next = INFINITY;
4607 mutex_init(&ipst->ips_igmp_slowtimeout_lock, NULL, MUTEX_DEFAULT, NULL);
4608 mutex_init(&ipst->ips_mld_slowtimeout_lock, NULL, MUTEX_DEFAULT, NULL);
4609 mutex_init(&ipst->ips_ip_mi_lock, NULL, MUTEX_DEFAULT, NULL);
4610 mutex_init(&ipst->ips_ip_addr_avail_lock, NULL, MUTEX_DEFAULT, NULL);
4611 rw_init(&ipst->ips_ill_g_lock, NULL, RW_DEFAULT, NULL);
4612 rw_init(&ipst->ips_ill_g_usesrc_lock, NULL, RW_DEFAULT, NULL);
4613
4614 ipcl_init(ipst);
4615 ip_ire_init(ipst);
4616 ip6_asp_init(ipst);
4617 ipif_init(ipst);
4618 conn_drain_init(ipst);
4619 ip_mrouter_stack_init(ipst);
4620 dce_stack_init(ipst);
4621
4622 ipst->ips_ip_multirt_log_interval = 1000;
4623
4624 ipst->ips_ill_index = 1;
4625
4626 ipst->ips_saved_ip_forwarding = -1;
4627 ipst->ips_reg_vif_num = ALL_VIFS; /* Index to Register vif */
4628
4629 arrsz = ip_propinfo_count * sizeof (mod_prop_info_t);
4630 ipst->ips_propinfo_tbl = (mod_prop_info_t *)kmem_alloc(arrsz, KM_SLEEP);
4631 bcopy(ip_propinfo_tbl, ipst->ips_propinfo_tbl, arrsz);
4632
4633 ipst->ips_ip_mibkp = ip_kstat_init(stackid, ipst);
4634 ipst->ips_icmp_mibkp = icmp_kstat_init(stackid);
4635 ipst->ips_ip_kstat = ip_kstat2_init(stackid, &ipst->ips_ip_statistics);
4636 ipst->ips_ip6_kstat =
4637 ip6_kstat_init(stackid, &ipst->ips_ip6_statistics);
4638
4639 ipst->ips_ip_src_id = 1;
4640 rw_init(&ipst->ips_srcid_lock, NULL, RW_DEFAULT, NULL);
4641
4642 ipst->ips_src_generation = SRC_GENERATION_INITIAL;
4643
4644 ip_net_init(ipst, ns);
4645 ipv4_hook_init(ipst);
4646 ipv6_hook_init(ipst);
4647 arp_hook_init(ipst);
4648 ipmp_init(ipst);
4649 ipobs_init(ipst);
4650
4651 /*
4652 * Create the taskq dispatcher thread and initialize related stuff.
4653 */
4654 mutex_init(&ipst->ips_capab_taskq_lock, NULL, MUTEX_DEFAULT, NULL);
4655 cv_init(&ipst->ips_capab_taskq_cv, NULL, CV_DEFAULT, NULL);
4656 ipst->ips_capab_taskq_thread = thread_create(NULL, 0,
4657 ill_taskq_dispatch, ipst, 0, &p0, TS_RUN, minclsyspri);
4658
4659 major = mod_name_to_major(INET_NAME);
4660 (void) ldi_ident_from_major(major, &ipst->ips_ldi_ident);
4661 return (ipst);
4662 }
4663
4664 /*
4665 * Allocate and initialize a DLPI template of the specified length. (May be
4666 * called as writer.)
4667 */
4668 mblk_t *
4669 ip_dlpi_alloc(size_t len, t_uscalar_t prim)
4670 {
4671 mblk_t *mp;
4672
4673 mp = allocb(len, BPRI_MED);
4674 if (!mp)
4675 return (NULL);
4676
4677 /*
4678 * DLPIv2 says that DL_INFO_REQ and DL_TOKEN_REQ (the latter
4679 * of which we don't seem to use) are sent with M_PCPROTO, and
	 * that other DLPI primitives are M_PROTO.
4681 */
4682 if (prim == DL_INFO_REQ) {
4683 mp->b_datap->db_type = M_PCPROTO;
4684 } else {
4685 mp->b_datap->db_type = M_PROTO;
4686 }
4687
4688 mp->b_wptr = mp->b_rptr + len;
4689 bzero(mp->b_rptr, len);
4690 ((dl_unitdata_req_t *)mp->b_rptr)->dl_primitive = prim;
4691 return (mp);
4692 }
4693
4694 /*
4695 * Allocate and initialize a DLPI notification. (May be called as writer.)
4696 */
4697 mblk_t *
4698 ip_dlnotify_alloc(uint_t notification, uint_t data)
4699 {
4700 dl_notify_ind_t *notifyp;
4701 mblk_t *mp;
4702
4703 if ((mp = ip_dlpi_alloc(DL_NOTIFY_IND_SIZE, DL_NOTIFY_IND)) == NULL)
4704 return (NULL);
4705
4706 notifyp = (dl_notify_ind_t *)mp->b_rptr;
4707 notifyp->dl_notification = notification;
4708 notifyp->dl_data = data;
4709 return (mp);
4710 }
4711
4712 mblk_t *
4713 ip_dlnotify_alloc2(uint_t notification, uint_t data1, uint_t data2)
4714 {
4715 dl_notify_ind_t *notifyp;
4716 mblk_t *mp;
4717
4718 if ((mp = ip_dlpi_alloc(DL_NOTIFY_IND_SIZE, DL_NOTIFY_IND)) == NULL)
4719 return (NULL);
4720
4721 notifyp = (dl_notify_ind_t *)mp->b_rptr;
4722 notifyp->dl_notification = notification;
4723 notifyp->dl_data1 = data1;
4724 notifyp->dl_data2 = data2;
4725 return (mp);
4726 }
4727
4728 /*
4729 * Debug formatting routine. Returns a character string representation of the
 * addr in buf, of the form xxx.xxx.xxx.xxx. This routine takes the address
 * in the form of an ipaddr_t and formats it as zero-padded dotted decimal.
4732 *
4733 * Once the ndd table-printing interfaces are removed, this can be changed to
4734 * standard dotted-decimal form.
4735 */
4736 char *
4737 ip_dot_addr(ipaddr_t addr, char *buf)
4738 {
4739 uint8_t *ap = (uint8_t *)&addr;
4740
4741 (void) mi_sprintf(buf, "%03d.%03d.%03d.%03d",
4742 ap[0] & 0xFF, ap[1] & 0xFF, ap[2] & 0xFF, ap[3] & 0xFF);
4743 return (buf);
4744 }
4745
4746 /*
4747 * Write the given MAC address as a printable string in the usual colon-
4748 * separated format.
4749 */
4750 const char *
4751 mac_colon_addr(const uint8_t *addr, size_t alen, char *buf, size_t buflen)
4752 {
4753 char *bp;
4754
4755 if (alen == 0 || buflen < 4)
4756 return ("?");
4757 bp = buf;
4758 for (;;) {
4759 /*
4760 * If there are more MAC address bytes available, but we won't
4761 * have any room to print them, then add "..." to the string
4762 * instead. See below for the 'magic number' explanation.
4763 */
4764 if ((alen == 2 && buflen < 6) || (alen > 2 && buflen < 7)) {
4765 (void) strcpy(bp, "...");
4766 break;
4767 }
4768 (void) sprintf(bp, "%02x", *addr++);
4769 bp += 2;
4770 if (--alen == 0)
4771 break;
4772 *bp++ = ':';
4773 buflen -= 3;
4774 /*
4775 * At this point, based on the first 'if' statement above,
4776 * either alen == 1 and buflen >= 3, or alen > 1 and
4777 * buflen >= 4. The first case leaves room for the final "xx"
4778 * number and trailing NUL byte. The second leaves room for at
4779 * least "...". Thus the apparently 'magic' numbers chosen for
4780 * that statement.
4781 */
4782 }
4783 return (buf);
4784 }
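
/*
 * Illustrative sketch, not compiled into the module: formatting a
 * 6-byte Ethernet address with mac_colon_addr(). An 18-byte buffer is
 * enough for six "xx" groups, five colons, and the trailing NUL. The
 * function name example_print_mac is hypothetical.
 */
#ifdef notdef
static void
example_print_mac(const uint8_t *etheraddr, const ill_t *ill)
{
	char	buf[18];

	ip1dbg(("ill %s mac %s\n", ill->ill_name,
	    mac_colon_addr(etheraddr, ETHERADDRL, buf, sizeof (buf))));
}
#endif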
4785
4786 /*
 * Called when it is conceptually a ULP that would have sent the packet,
 * e.g., port unreachable and protocol unreachable. Check that the packet
4789 * would have passed the IPsec global policy before sending the error.
4790 *
4791 * Send an ICMP error after patching up the packet appropriately.
4792 * Uses ip_drop_input and bumps the appropriate MIB.
4793 */
4794 void
4795 ip_fanout_send_icmp_v4(mblk_t *mp, uint_t icmp_type, uint_t icmp_code,
4796 ip_recv_attr_t *ira)
4797 {
4798 ipha_t *ipha;
4799 boolean_t secure;
4800 ill_t *ill = ira->ira_ill;
4801 ip_stack_t *ipst = ill->ill_ipst;
4802 netstack_t *ns = ipst->ips_netstack;
4803 ipsec_stack_t *ipss = ns->netstack_ipsec;
4804
4805 secure = ira->ira_flags & IRAF_IPSEC_SECURE;
4806
4807 /*
4808 * We are generating an icmp error for some inbound packet.
4809 * Called from all ip_fanout_(udp, tcp, proto) functions.
4810 * Before we generate an error, check with global policy
4811 * to see whether this is allowed to enter the system. As
4812 * there is no "conn", we are checking with global policy.
4813 */
4814 ipha = (ipha_t *)mp->b_rptr;
4815 if (secure || ipss->ipsec_inbound_v4_policy_present) {
4816 mp = ipsec_check_global_policy(mp, NULL, ipha, NULL, ira, ns);
4817 if (mp == NULL)
4818 return;
4819 }
4820
4821 /* We never send errors for protocols that we do implement */
4822 if (ira->ira_protocol == IPPROTO_ICMP ||
4823 ira->ira_protocol == IPPROTO_IGMP) {
4824 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
4825 ip_drop_input("ip_fanout_send_icmp_v4", mp, ill);
4826 freemsg(mp);
4827 return;
4828 }
	/*
	 * We have to correct the checksum since the packet might have been
	 * fragmented, and the reassembly code in ip_rput does not restore
	 * the IP checksum.
	 */
4835 ipha->ipha_hdr_checksum = 0;
4836 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
4837
4838 switch (icmp_type) {
4839 case ICMP_DEST_UNREACHABLE:
4840 switch (icmp_code) {
4841 case ICMP_PROTOCOL_UNREACHABLE:
4842 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInUnknownProtos);
4843 ip_drop_input("ipIfStatsInUnknownProtos", mp, ill);
4844 break;
4845 case ICMP_PORT_UNREACHABLE:
4846 BUMP_MIB(ill->ill_ip_mib, udpIfStatsNoPorts);
4847 ip_drop_input("ipIfStatsNoPorts", mp, ill);
4848 break;
4849 }
4850
4851 icmp_unreachable(mp, icmp_code, ira);
4852 break;
4853 default:
4854 #ifdef DEBUG
4855 panic("ip_fanout_send_icmp_v4: wrong type");
4856 /*NOTREACHED*/
4857 #else
4858 freemsg(mp);
4859 break;
4860 #endif
4861 }
4862 }
4863
4864 /*
4865 * Used to send an ICMP error message when a packet is received for
4866 * a protocol that is not supported. The mblk passed as argument
4867 * is consumed by this function.
4868 */
4869 void
4870 ip_proto_not_sup(mblk_t *mp, ip_recv_attr_t *ira)
4871 {
4872 ipha_t *ipha;
4873
4874 ipha = (ipha_t *)mp->b_rptr;
4875 if (ira->ira_flags & IRAF_IS_IPV4) {
4876 ASSERT(IPH_HDR_VERSION(ipha) == IP_VERSION);
4877 ip_fanout_send_icmp_v4(mp, ICMP_DEST_UNREACHABLE,
4878 ICMP_PROTOCOL_UNREACHABLE, ira);
4879 } else {
4880 ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION);
4881 ip_fanout_send_icmp_v6(mp, ICMP6_PARAM_PROB,
4882 ICMP6_PARAMPROB_NEXTHEADER, ira);
4883 }
4884 }
4885
4886 /*
4887 * Deliver a rawip packet to the given conn, possibly applying ipsec policy.
4888 * Handles IPv4 and IPv6.
 * We are responsible for disposing of mp, such as by freemsg() or putnext().
4890 * Caller is responsible for dropping references to the conn.
4891 */
4892 void
4893 ip_fanout_proto_conn(conn_t *connp, mblk_t *mp, ipha_t *ipha, ip6_t *ip6h,
4894 ip_recv_attr_t *ira)
4895 {
4896 ill_t *ill = ira->ira_ill;
4897 ip_stack_t *ipst = ill->ill_ipst;
4898 ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec;
4899 boolean_t secure;
4900 uint_t protocol = ira->ira_protocol;
4901 iaflags_t iraflags = ira->ira_flags;
4902 queue_t *rq;
4903
4904 secure = iraflags & IRAF_IPSEC_SECURE;
4905
4906 rq = connp->conn_rq;
4907 if (IPCL_IS_NONSTR(connp) ? connp->conn_flow_cntrld : !canputnext(rq)) {
4908 switch (protocol) {
4909 case IPPROTO_ICMPV6:
4910 BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInOverflows);
4911 break;
4912 case IPPROTO_ICMP:
4913 BUMP_MIB(&ipst->ips_icmp_mib, icmpInOverflows);
4914 break;
4915 default:
4916 BUMP_MIB(ill->ill_ip_mib, rawipIfStatsInOverflows);
4917 break;
4918 }
4919 freemsg(mp);
4920 return;
4921 }
4922
4923 ASSERT(!(IPCL_IS_IPTUN(connp)));
4924
4925 if (((iraflags & IRAF_IS_IPV4) ?
4926 CONN_INBOUND_POLICY_PRESENT(connp, ipss) :
4927 CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss)) ||
4928 secure) {
4929 mp = ipsec_check_inbound_policy(mp, connp, ipha,
4930 ip6h, ira);
4931 if (mp == NULL) {
4932 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
4933 /* Note that mp is NULL */
4934 ip_drop_input("ipIfStatsInDiscards", mp, ill);
4935 return;
4936 }
4937 }
4938
4939 if (iraflags & IRAF_ICMP_ERROR) {
4940 (connp->conn_recvicmp)(connp, mp, NULL, ira);
4941 } else {
4942 ill_t *rill = ira->ira_rill;
4943
4944 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
4945 ira->ira_ill = ira->ira_rill = NULL;
4946 /* Send it upstream */
4947 (connp->conn_recv)(connp, mp, NULL, ira);
4948 ira->ira_ill = ill;
4949 ira->ira_rill = rill;
4950 }
4951 }
4952
4953 /*
4954 * Handle protocols with which IP is less intimate. There
4955 * can be more than one stream bound to a particular
4956 * protocol. When this is the case, normally each one gets a copy
4957 * of any incoming packets.
4958 *
4959 * IPsec NOTE :
4960 *
4961 * Don't allow a secure packet going up a non-secure connection.
4962 * We don't allow this because
4963 *
4964 * 1) Reply might go out in clear which will be dropped at
4965 * the sending side.
4966 * 2) If the reply goes out in clear it will give the
4967 * adversary enough information for getting the key in
4968 * most of the cases.
4969 *
4970 * Moreover getting a secure packet when we expect clear
4971 * implies that SA's were added without checking for
4972 * policy on both ends. This should not happen once ISAKMP
4973 * is used to negotiate SAs as SAs will be added only after
4974 * verifying the policy.
4975 *
4976 * Zones notes:
4977 * Earlier in ip_input on a system with multiple shared-IP zones we
4978 * duplicate the multicast and broadcast packets and send them up
4979 * with each explicit zoneid that exists on that ill.
4980 * This means that here we can match the zoneid with SO_ALLZONES being special.
4981 */
4982 void
4983 ip_fanout_proto_v4(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
4984 {
4985 mblk_t *mp1;
4986 ipaddr_t laddr;
4987 conn_t *connp, *first_connp, *next_connp;
4988 connf_t *connfp;
4989 ill_t *ill = ira->ira_ill;
4990 ip_stack_t *ipst = ill->ill_ipst;
4991
4992 laddr = ipha->ipha_dst;
4993
4994 connfp = &ipst->ips_ipcl_proto_fanout_v4[ira->ira_protocol];
4995 mutex_enter(&connfp->connf_lock);
4997 for (connp = connfp->connf_head; connp != NULL;
4998 connp = connp->conn_next) {
4999 /* Note: IPCL_PROTO_MATCH includes conn_wantpacket */
5000 if (IPCL_PROTO_MATCH(connp, ira, ipha) &&
5001 (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
5002 tsol_receive_local(mp, &laddr, IPV4_VERSION, ira, connp))) {
5003 break;
5004 }
5005 }
5006
5007 if (connp == NULL) {
5008 /*
5009 * No one bound to these addresses. Is
5010 * there a client that wants all
5011 * unclaimed datagrams?
5012 */
5013 mutex_exit(&connfp->connf_lock);
5014 ip_fanout_send_icmp_v4(mp, ICMP_DEST_UNREACHABLE,
5015 ICMP_PROTOCOL_UNREACHABLE, ira);
5016 return;
5017 }
5018
5019 ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_rq != NULL);
5020
5021 CONN_INC_REF(connp);
5022 first_connp = connp;
5023 connp = connp->conn_next;
5024
5025 for (;;) {
5026 while (connp != NULL) {
5027 /* Note: IPCL_PROTO_MATCH includes conn_wantpacket */
5028 if (IPCL_PROTO_MATCH(connp, ira, ipha) &&
5029 (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
5030 tsol_receive_local(mp, &laddr, IPV4_VERSION,
5031 ira, connp)))
5032 break;
5033 connp = connp->conn_next;
5034 }
5035
5036 if (connp == NULL) {
5037 /* No more interested clients */
5038 connp = first_connp;
5039 break;
5040 }
5041 if (((mp1 = dupmsg(mp)) == NULL) &&
5042 ((mp1 = copymsg(mp)) == NULL)) {
5043 /* Memory allocation failed */
5044 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
5045 ip_drop_input("ipIfStatsInDiscards", mp, ill);
5046 connp = first_connp;
5047 break;
5048 }
5049
5050 CONN_INC_REF(connp);
5051 mutex_exit(&connfp->connf_lock);
5052
5053 ip_fanout_proto_conn(connp, mp1, (ipha_t *)mp1->b_rptr, NULL,
5054 ira);
5055
5056 mutex_enter(&connfp->connf_lock);
5057 /* Follow the next pointer before releasing the conn. */
5058 next_connp = connp->conn_next;
5059 CONN_DEC_REF(connp);
5060 connp = next_connp;
5061 }
5062
5063 /* Last one. Send it upstream. */
5064 mutex_exit(&connfp->connf_lock);
5065
5066 ip_fanout_proto_conn(connp, mp, ipha, NULL, ira);
5067
5068 CONN_DEC_REF(connp);
5069 }
5070
5071 /*
 * If we have an IPsec NAT-Traversal packet, strip the zero-SPI or
 * pass it along to ESP if the SPI is non-zero. Returns the mblk if the mblk
 * is not consumed.
 *
 * One of three things can happen, all of which affect the passed-in mblk:
 *
 * 1.) The packet is stock UDP and gets its zero-SPI stripped. Return mblk.
5079 *
5080 * 2.) The packet is ESP-in-UDP, gets transformed into an equivalent
5081 * ESP packet, and is passed along to ESP for consumption. Return NULL.
5082 *
5083 * 3.) The packet is an ESP-in-UDP Keepalive. Drop it and return NULL.
5084 */
5085 mblk_t *
5086 zero_spi_check(mblk_t *mp, ip_recv_attr_t *ira)
5087 {
5088 int shift, plen, iph_len;
5089 ipha_t *ipha;
5090 udpha_t *udpha;
5091 uint32_t *spi;
5092 uint32_t esp_ports;
5093 uint8_t *orptr;
5094 ip_stack_t *ipst = ira->ira_ill->ill_ipst;
5095 ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec;
5096
5097 ipha = (ipha_t *)mp->b_rptr;
5098 iph_len = ira->ira_ip_hdr_length;
5099 plen = ira->ira_pktlen;
5100
5101 if (plen - iph_len - sizeof (udpha_t) < sizeof (uint32_t)) {
5102 /*
5103 * Most likely a keepalive for the benefit of an intervening
5104 * NAT. These aren't for us, per se, so drop it.
5105 *
		 * RFCs 3947/3948 don't say for sure what to do with 2-3
		 * byte packets (keepalives are 1 byte), but we'll drop them
		 * as well.
5109 */
5110 ip_drop_packet(mp, B_TRUE, ira->ira_ill,
5111 DROPPER(ipss, ipds_esp_nat_t_ka), &ipss->ipsec_dropper);
5112 return (NULL);
5113 }
5114
5115 if (MBLKL(mp) < iph_len + sizeof (udpha_t) + sizeof (*spi)) {
5116 /* might as well pull it all up - it might be ESP. */
5117 if (!pullupmsg(mp, -1)) {
5118 ip_drop_packet(mp, B_TRUE, ira->ira_ill,
5119 DROPPER(ipss, ipds_esp_nomem),
5120 &ipss->ipsec_dropper);
5121 return (NULL);
5122 }
5123
5124 ipha = (ipha_t *)mp->b_rptr;
5125 }
5126 spi = (uint32_t *)(mp->b_rptr + iph_len + sizeof (udpha_t));
5127 if (*spi == 0) {
5128 /* UDP packet - remove 0-spi. */
5129 shift = sizeof (uint32_t);
5130 } else {
5131 /* ESP-in-UDP packet - reduce to ESP. */
5132 ipha->ipha_protocol = IPPROTO_ESP;
5133 shift = sizeof (udpha_t);
5134 }
5135
5136 /* Fix IP header */
5137 ira->ira_pktlen = (plen - shift);
5138 ipha->ipha_length = htons(ira->ira_pktlen);
5139 ipha->ipha_hdr_checksum = 0;
5140
5141 orptr = mp->b_rptr;
5142 mp->b_rptr += shift;
5143
5144 udpha = (udpha_t *)(orptr + iph_len);
5145 if (*spi == 0) {
5146 ASSERT((uint8_t *)ipha == orptr);
5147 udpha->uha_length = htons(plen - shift - iph_len);
5148 iph_len += sizeof (udpha_t); /* For the call to ovbcopy(). */
5149 esp_ports = 0;
5150 } else {
5151 esp_ports = *((uint32_t *)udpha);
5152 ASSERT(esp_ports != 0);
5153 }
5154 ovbcopy(orptr, orptr + shift, iph_len);
	if (esp_ports != 0) {
		/* Punt up for ESP processing. */
5156 ipha = (ipha_t *)(orptr + shift);
5157
5158 ira->ira_flags |= IRAF_ESP_UDP_PORTS;
5159 ira->ira_esp_udp_ports = esp_ports;
5160 ip_fanout_v4(mp, ipha, ira);
5161 return (NULL);
5162 }
5163 return (mp);
5164 }
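
/*
 * Illustrative sketch (not compiled): the RFC 3948 demultiplexing rule
 * that zero_spi_check() above applies to a NAT-T UDP payload. The
 * classify_natt() helper and its flat-buffer interface are hypothetical;
 * only the keepalive / non-ESP-marker / ESP-SPI distinction mirrors the
 * code above.
 */
#ifdef notdef
typedef enum { NATT_KEEPALIVE, NATT_IKE, NATT_ESP } natt_class_t;

static natt_class_t
classify_natt(const uint8_t *udp_payload, size_t len)
{
	uint32_t spi;

	/* Fewer than 4 bytes of payload: a NAT keepalive; drop it. */
	if (len < sizeof (uint32_t))
		return (NATT_KEEPALIVE);

	bcopy(udp_payload, &spi, sizeof (spi));

	/* A zero SPI is the non-ESP marker; what follows is IKE. */
	if (spi == 0)
		return (NATT_IKE);

	/* Otherwise UDP-encapsulated ESP; the SPI leads the payload. */
	return (NATT_ESP);
}
#endif /* notdef */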
5165
5166 /*
5167 * Deliver a udp packet to the given conn, possibly applying ipsec policy.
5168 * Handles IPv4 and IPv6.
 * We are responsible for disposing of mp, such as by freemsg() or putnext().
5170 * Caller is responsible for dropping references to the conn.
5171 */
5172 void
5173 ip_fanout_udp_conn(conn_t *connp, mblk_t *mp, ipha_t *ipha, ip6_t *ip6h,
5174 ip_recv_attr_t *ira)
5175 {
5176 ill_t *ill = ira->ira_ill;
5177 ip_stack_t *ipst = ill->ill_ipst;
5178 ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec;
5179 boolean_t secure;
5180 iaflags_t iraflags = ira->ira_flags;
5181
5182 secure = iraflags & IRAF_IPSEC_SECURE;
5183
5184 if (IPCL_IS_NONSTR(connp) ? connp->conn_flow_cntrld :
5185 !canputnext(connp->conn_rq)) {
5186 BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows);
5187 freemsg(mp);
5188 return;
5189 }
5190
5191 if (((iraflags & IRAF_IS_IPV4) ?
5192 CONN_INBOUND_POLICY_PRESENT(connp, ipss) :
5193 CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss)) ||
5194 secure) {
5195 mp = ipsec_check_inbound_policy(mp, connp, ipha,
5196 ip6h, ira);
5197 if (mp == NULL) {
5198 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
5199 /* Note that mp is NULL */
5200 ip_drop_input("ipIfStatsInDiscards", mp, ill);
5201 return;
5202 }
5203 }
5204
5205 /*
	 * Since this code is not used for UDP unicast we don't need a
	 * NAT-T check; only ip_fanout_v4 has that check.
5208 */
5209 if (ira->ira_flags & IRAF_ICMP_ERROR) {
5210 (connp->conn_recvicmp)(connp, mp, NULL, ira);
5211 } else {
5212 ill_t *rill = ira->ira_rill;
5213
5214 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
5215 ira->ira_ill = ira->ira_rill = NULL;
5216 /* Send it upstream */
5217 (connp->conn_recv)(connp, mp, NULL, ira);
5218 ira->ira_ill = ill;
5219 ira->ira_rill = rill;
5220 }
5221 }
5222
5223 /*
5224 * Fanout for UDP packets that are multicast or broadcast, and ICMP errors.
5225 * (Unicast fanout is handled in ip_input_v4.)
5226 *
5227 * If SO_REUSEADDR is set all multicast and broadcast packets
5228 * will be delivered to all conns bound to the same port.
5229 *
5230 * If there is at least one matching AF_INET receiver, then we will
5231 * ignore any AF_INET6 receivers.
5232 * In the special case where an AF_INET socket binds to 0.0.0.0/<port> and an
5233 * AF_INET6 socket binds to ::/<port>, only the AF_INET socket receives the IPv4
5234 * packets.
5235 *
5236 * Zones notes:
5237 * Earlier in ip_input on a system with multiple shared-IP zones we
5238 * duplicate the multicast and broadcast packets and send them up
5239 * with each explicit zoneid that exists on that ill.
5240 * This means that here we can match the zoneid with SO_ALLZONES being special.
5241 */
5242 void
5243 ip_fanout_udp_multi_v4(mblk_t *mp, ipha_t *ipha, uint16_t lport, uint16_t fport,
5244 ip_recv_attr_t *ira)
5245 {
5246 ipaddr_t laddr;
5247 in6_addr_t v6faddr;
5248 conn_t *connp;
5249 connf_t *connfp;
5250 ipaddr_t faddr;
5251 ill_t *ill = ira->ira_ill;
5252 ip_stack_t *ipst = ill->ill_ipst;
5253
5254 ASSERT(ira->ira_flags & (IRAF_MULTIBROADCAST|IRAF_ICMP_ERROR));
5255
5256 laddr = ipha->ipha_dst;
5257 faddr = ipha->ipha_src;
5258
5259 connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
5260 mutex_enter(&connfp->connf_lock);
5261 connp = connfp->connf_head;
5262
5263 /*
	 * If SO_REUSEADDR has been set on the first matching conn, we
	 * send the packet to all clients that have joined the group
	 * and match the port.
5267 */
5268 while (connp != NULL) {
5269 if ((IPCL_UDP_MATCH(connp, lport, laddr, fport, faddr)) &&
5270 conn_wantpacket(connp, ira, ipha) &&
5271 (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
5272 tsol_receive_local(mp, &laddr, IPV4_VERSION, ira, connp)))
5273 break;
5274 connp = connp->conn_next;
5275 }
5276
5277 if (connp == NULL)
5278 goto notfound;
5279
5280 CONN_INC_REF(connp);
5281
5282 if (connp->conn_reuseaddr) {
5283 conn_t *first_connp = connp;
5284 conn_t *next_connp;
5285 mblk_t *mp1;
5286
5287 connp = connp->conn_next;
5288 for (;;) {
5289 while (connp != NULL) {
5290 if (IPCL_UDP_MATCH(connp, lport, laddr,
5291 fport, faddr) &&
5292 conn_wantpacket(connp, ira, ipha) &&
5293 (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
5294 tsol_receive_local(mp, &laddr, IPV4_VERSION,
5295 ira, connp)))
5296 break;
5297 connp = connp->conn_next;
5298 }
5299 if (connp == NULL) {
5300 /* No more interested clients */
5301 connp = first_connp;
5302 break;
5303 }
5304 if (((mp1 = dupmsg(mp)) == NULL) &&
5305 ((mp1 = copymsg(mp)) == NULL)) {
5306 /* Memory allocation failed */
5307 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
5308 ip_drop_input("ipIfStatsInDiscards", mp, ill);
5309 connp = first_connp;
5310 break;
5311 }
5312 CONN_INC_REF(connp);
5313 mutex_exit(&connfp->connf_lock);
5314
5315 IP_STAT(ipst, ip_udp_fanmb);
5316 ip_fanout_udp_conn(connp, mp1, (ipha_t *)mp1->b_rptr,
5317 NULL, ira);
5318 mutex_enter(&connfp->connf_lock);
5319 /* Follow the next pointer before releasing the conn */
5320 next_connp = connp->conn_next;
5321 CONN_DEC_REF(connp);
5322 connp = next_connp;
5323 }
5324 }
5325
5326 /* Last one. Send it upstream. */
5327 mutex_exit(&connfp->connf_lock);
5328 IP_STAT(ipst, ip_udp_fanmb);
5329 ip_fanout_udp_conn(connp, mp, ipha, NULL, ira);
5330 CONN_DEC_REF(connp);
5331 return;
5332
5333 notfound:
5334 mutex_exit(&connfp->connf_lock);
5335 /*
5336 * IPv6 endpoints bound to multicast IPv4-mapped addresses
5337 * have already been matched above, since they live in the IPv4
5338 * fanout tables. This implies we only need to
5339 * check for IPv6 in6addr_any endpoints here.
5340 * Thus we compare using ipv6_all_zeros instead of the destination
5341 * address, except for the multicast group membership lookup which
5342 * uses the IPv4 destination.
5343 */
5344 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6faddr);
5345 connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
5346 mutex_enter(&connfp->connf_lock);
5347 connp = connfp->connf_head;
5348 /*
5349 * IPv4 multicast packet being delivered to an AF_INET6
5350 * in6addr_any endpoint.
5351 * Need to check conn_wantpacket(). Note that we use conn_wantpacket()
5352 * and not conn_wantpacket_v6() since any multicast membership is
5353 * for an IPv4-mapped multicast address.
5354 */
5355 while (connp != NULL) {
5356 if (IPCL_UDP_MATCH_V6(connp, lport, ipv6_all_zeros,
5357 fport, v6faddr) &&
5358 conn_wantpacket(connp, ira, ipha) &&
5359 (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
5360 tsol_receive_local(mp, &laddr, IPV4_VERSION, ira, connp)))
5361 break;
5362 connp = connp->conn_next;
5363 }
5364
5365 if (connp == NULL) {
5366 /*
5367 * No one bound to this port. Is
5368 * there a client that wants all
5369 * unclaimed datagrams?
5370 */
5371 mutex_exit(&connfp->connf_lock);
5372
5373 if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_UDP].connf_head !=
5374 NULL) {
5375 ASSERT(ira->ira_protocol == IPPROTO_UDP);
5376 ip_fanout_proto_v4(mp, ipha, ira);
5377 } else {
5378 /*
5379 * We used to attempt to send an icmp error here, but
5380 * since this is known to be a multicast packet
5381 * and we don't send icmp errors in response to
5382 * multicast, just drop the packet and give up sooner.
5383 */
5384 BUMP_MIB(ill->ill_ip_mib, udpIfStatsNoPorts);
5385 freemsg(mp);
5386 }
5387 return;
5388 }
5389 ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_rq != NULL);
5390
5391 /*
	 * If SO_REUSEADDR has been set on the first matching conn, we
	 * send the packet to all clients that have joined the group
	 * and match the port.
5395 */
5396 if (connp->conn_reuseaddr) {
5397 conn_t *first_connp = connp;
5398 conn_t *next_connp;
5399 mblk_t *mp1;
5400
5401 CONN_INC_REF(connp);
5402 connp = connp->conn_next;
5403 for (;;) {
5404 while (connp != NULL) {
5405 if (IPCL_UDP_MATCH_V6(connp, lport,
5406 ipv6_all_zeros, fport, v6faddr) &&
5407 conn_wantpacket(connp, ira, ipha) &&
5408 (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
5409 tsol_receive_local(mp, &laddr, IPV4_VERSION,
5410 ira, connp)))
5411 break;
5412 connp = connp->conn_next;
5413 }
5414 if (connp == NULL) {
5415 /* No more interested clients */
5416 connp = first_connp;
5417 break;
5418 }
5419 if (((mp1 = dupmsg(mp)) == NULL) &&
5420 ((mp1 = copymsg(mp)) == NULL)) {
5421 /* Memory allocation failed */
5422 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
5423 ip_drop_input("ipIfStatsInDiscards", mp, ill);
5424 connp = first_connp;
5425 break;
5426 }
5427 CONN_INC_REF(connp);
5428 mutex_exit(&connfp->connf_lock);
5429
5430 IP_STAT(ipst, ip_udp_fanmb);
5431 ip_fanout_udp_conn(connp, mp1, (ipha_t *)mp1->b_rptr,
5432 NULL, ira);
5433 mutex_enter(&connfp->connf_lock);
5434 /* Follow the next pointer before releasing the conn */
5435 next_connp = connp->conn_next;
5436 CONN_DEC_REF(connp);
5437 connp = next_connp;
5438 }
5439 }
5440
5441 /* Last one. Send it upstream. */
5442 mutex_exit(&connfp->connf_lock);
5443 IP_STAT(ipst, ip_udp_fanmb);
5444 ip_fanout_udp_conn(connp, mp, ipha, NULL, ira);
5445 CONN_DEC_REF(connp);
5446 }
5447
5448 /*
5449 * Split an incoming packet's IPv4 options into the label and the other options.
5450 * If 'allocate' is set it does memory allocation for the ip_pkt_t, including
5451 * clearing out any leftover label or options.
5452 * Otherwise it just makes ipp point into the packet.
5453 *
5454 * Returns zero if ok; ENOMEM if the buffer couldn't be allocated.
5455 */
5456 int
5457 ip_find_hdr_v4(ipha_t *ipha, ip_pkt_t *ipp, boolean_t allocate)
5458 {
5459 uchar_t *opt;
5460 uint32_t totallen;
5461 uint32_t optval;
5462 uint32_t optlen;
5463
5464 ipp->ipp_fields |= IPPF_HOPLIMIT | IPPF_TCLASS | IPPF_ADDR;
5465 ipp->ipp_hoplimit = ipha->ipha_ttl;
5466 ipp->ipp_type_of_service = ipha->ipha_type_of_service;
5467 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &ipp->ipp_addr);
5468
5469 /*
	 * Get the length (in 4-byte words) of the IP header options.
5471 */
5472 totallen = ipha->ipha_version_and_hdr_length -
5473 (uint8_t)((IP_VERSION << 4) + IP_SIMPLE_HDR_LENGTH_IN_WORDS);
5474
5475 if (totallen == 0) {
5476 if (!allocate)
5477 return (0);
5478
5479 /* Clear out anything from a previous packet */
5480 if (ipp->ipp_fields & IPPF_IPV4_OPTIONS) {
5481 kmem_free(ipp->ipp_ipv4_options,
5482 ipp->ipp_ipv4_options_len);
5483 ipp->ipp_ipv4_options = NULL;
5484 ipp->ipp_ipv4_options_len = 0;
5485 ipp->ipp_fields &= ~IPPF_IPV4_OPTIONS;
5486 }
5487 if (ipp->ipp_fields & IPPF_LABEL_V4) {
5488 kmem_free(ipp->ipp_label_v4, ipp->ipp_label_len_v4);
5489 ipp->ipp_label_v4 = NULL;
5490 ipp->ipp_label_len_v4 = 0;
5491 ipp->ipp_fields &= ~IPPF_LABEL_V4;
5492 }
5493 return (0);
5494 }
5495
5496 totallen <<= 2;
5497 opt = (uchar_t *)&ipha[1];
5498 if (!is_system_labeled()) {
5499
5500 copyall:
5501 if (!allocate) {
5502 if (totallen != 0) {
5503 ipp->ipp_ipv4_options = opt;
5504 ipp->ipp_ipv4_options_len = totallen;
5505 ipp->ipp_fields |= IPPF_IPV4_OPTIONS;
5506 }
5507 return (0);
5508 }
		/* Just copy all of the options */
5510 if (ipp->ipp_fields & IPPF_IPV4_OPTIONS) {
5511 if (totallen == ipp->ipp_ipv4_options_len) {
5512 bcopy(opt, ipp->ipp_ipv4_options, totallen);
5513 return (0);
5514 }
5515 kmem_free(ipp->ipp_ipv4_options,
5516 ipp->ipp_ipv4_options_len);
5517 ipp->ipp_ipv4_options = NULL;
5518 ipp->ipp_ipv4_options_len = 0;
5519 ipp->ipp_fields &= ~IPPF_IPV4_OPTIONS;
5520 }
5521 if (totallen == 0)
5522 return (0);
5523
5524 ipp->ipp_ipv4_options = kmem_alloc(totallen, KM_NOSLEEP);
5525 if (ipp->ipp_ipv4_options == NULL)
5526 return (ENOMEM);
5527 ipp->ipp_ipv4_options_len = totallen;
5528 ipp->ipp_fields |= IPPF_IPV4_OPTIONS;
5529 bcopy(opt, ipp->ipp_ipv4_options, totallen);
5530 return (0);
5531 }
5532
5533 if (allocate && (ipp->ipp_fields & IPPF_LABEL_V4)) {
5534 kmem_free(ipp->ipp_label_v4, ipp->ipp_label_len_v4);
5535 ipp->ipp_label_v4 = NULL;
5536 ipp->ipp_label_len_v4 = 0;
5537 ipp->ipp_fields &= ~IPPF_LABEL_V4;
5538 }
5539
5540 /*
5541 * Search for CIPSO option.
5542 * We assume CIPSO is first in options if it is present.
 * If it isn't, then ipp_ipv4_options will not include the options
5544 * prior to the CIPSO option.
5545 */
5546 while (totallen != 0) {
5547 switch (optval = opt[IPOPT_OPTVAL]) {
5548 case IPOPT_EOL:
5549 return (0);
5550 case IPOPT_NOP:
5551 optlen = 1;
5552 break;
5553 default:
5554 if (totallen <= IPOPT_OLEN)
5555 return (EINVAL);
5556 optlen = opt[IPOPT_OLEN];
5557 if (optlen < 2)
5558 return (EINVAL);
5559 }
5560 if (optlen > totallen)
5561 return (EINVAL);
5562
5563 switch (optval) {
5564 case IPOPT_COMSEC:
5565 if (!allocate) {
5566 ipp->ipp_label_v4 = opt;
5567 ipp->ipp_label_len_v4 = optlen;
5568 ipp->ipp_fields |= IPPF_LABEL_V4;
5569 } else {
5570 ipp->ipp_label_v4 = kmem_alloc(optlen,
5571 KM_NOSLEEP);
5572 if (ipp->ipp_label_v4 == NULL)
5573 return (ENOMEM);
5574 ipp->ipp_label_len_v4 = optlen;
5575 ipp->ipp_fields |= IPPF_LABEL_V4;
5576 bcopy(opt, ipp->ipp_label_v4, optlen);
5577 }
5578 totallen -= optlen;
5579 opt += optlen;
5580
5581 /* Skip padding bytes until we get to a multiple of 4 */
5582 while ((totallen & 3) != 0 && opt[0] == IPOPT_NOP) {
5583 totallen--;
5584 opt++;
5585 }
5586 /* Remaining as ipp_ipv4_options */
5587 goto copyall;
5588 }
5589 totallen -= optlen;
5590 opt += optlen;
5591 }
5592 /* No CIPSO found; return everything as ipp_ipv4_options */
5593 totallen = ipha->ipha_version_and_hdr_length -
5594 (uint8_t)((IP_VERSION << 4) + IP_SIMPLE_HDR_LENGTH_IN_WORDS);
5595 totallen <<= 2;
5596 opt = (uchar_t *)&ipha[1];
5597 goto copyall;
5598 }
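
/*
 * Illustrative sketch (not compiled): the generic IPv4 option walk that
 * ip_find_hdr_v4() above performs while searching for the CIPSO option.
 * Each option is type/length/data, where the length byte covers the
 * entire option; IPOPT_EOL and IPOPT_NOP are single bytes. The
 * callback-based interface is hypothetical.
 */
#ifdef notdef
static int
walk_v4_options(uchar_t *opt, uint32_t totallen,
    void (*cb)(uint8_t optval, uchar_t *opt, uint32_t optlen))
{
	uint8_t optval;
	uint32_t optlen;

	while (totallen != 0) {
		optval = opt[IPOPT_OPTVAL];
		if (optval == IPOPT_EOL)
			return (0);
		if (optval == IPOPT_NOP) {
			optlen = 1;
		} else {
			if (totallen <= IPOPT_OLEN)
				return (EINVAL);
			optlen = opt[IPOPT_OLEN];
			if (optlen < 2)
				return (EINVAL);
		}
		if (optlen > totallen)
			return (EINVAL);
		cb(optval, opt, optlen);
		totallen -= optlen;
		opt += optlen;
	}
	return (0);
}
#endif /* notdef */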
5599
5600 /*
 * Efficient version of lookup for an IRE when we only
5602 * match the address.
5603 * For RTF_REJECT or BLACKHOLE we return IRE_NOROUTE.
5604 * Does not handle multicast addresses.
5605 */
5606 uint_t
5607 ip_type_v4(ipaddr_t addr, ip_stack_t *ipst)
5608 {
5609 ire_t *ire;
5610 uint_t result;
5611
5612 ire = ire_ftable_lookup_simple_v4(addr, 0, ipst, NULL);
5613 ASSERT(ire != NULL);
5614 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
5615 result = IRE_NOROUTE;
5616 else
5617 result = ire->ire_type;
5618 ire_refrele(ire);
5619 return (result);
5620 }
5621
5622 /*
 * Efficient version of lookup for an IRE when we only
5624 * match the address.
5625 * For RTF_REJECT or BLACKHOLE we return IRE_NOROUTE.
5626 * Does not handle multicast addresses.
5627 */
5628 uint_t
5629 ip_type_v6(const in6_addr_t *addr, ip_stack_t *ipst)
5630 {
5631 ire_t *ire;
5632 uint_t result;
5633
5634 ire = ire_ftable_lookup_simple_v6(addr, 0, ipst, NULL);
5635 ASSERT(ire != NULL);
5636 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
5637 result = IRE_NOROUTE;
5638 else
5639 result = ire->ire_type;
5640 ire_refrele(ire);
5641 return (result);
5642 }
5643
/* Nobody should be sending packets up this stream */
5648 static void
5649 ip_lrput(queue_t *q, mblk_t *mp)
5650 {
5651 switch (mp->b_datap->db_type) {
5652 case M_FLUSH:
5653 /* Turn around */
5654 if (*mp->b_rptr & FLUSHW) {
5655 *mp->b_rptr &= ~FLUSHR;
5656 qreply(q, mp);
5657 return;
5658 }
5659 break;
5660 }
5661 freemsg(mp);
5662 }
5663
5664 /* Nobody should be sending packets down this stream */
5665 /* ARGSUSED */
5666 void
5667 ip_lwput(queue_t *q, mblk_t *mp)
5668 {
5669 freemsg(mp);
5670 }
5671
5672 /*
5673 * Move the first hop in any source route to ipha_dst and remove that part of
5674 * the source route. Called by other protocols. Errors in option formatting
5675 * are ignored - will be handled by ip_output_options. Return the final
 * destination (either ipha_dst or the last entry in a source route).
5677 */
5678 ipaddr_t
5679 ip_massage_options(ipha_t *ipha, netstack_t *ns)
5680 {
5681 ipoptp_t opts;
5682 uchar_t *opt;
5683 uint8_t optval;
5684 uint8_t optlen;
5685 ipaddr_t dst;
5686 int i;
5687 ip_stack_t *ipst = ns->netstack_ip;
5688
5689 ip2dbg(("ip_massage_options\n"));
5690 dst = ipha->ipha_dst;
5691 for (optval = ipoptp_first(&opts, ipha);
5692 optval != IPOPT_EOL;
5693 optval = ipoptp_next(&opts)) {
5694 opt = opts.ipoptp_cur;
5695 switch (optval) {
5696 uint8_t off;
5697 case IPOPT_SSRR:
5698 case IPOPT_LSRR:
5699 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
5700 ip1dbg(("ip_massage_options: bad src route\n"));
5701 break;
5702 }
5703 optlen = opts.ipoptp_len;
5704 off = opt[IPOPT_OFFSET];
5705 off--;
5706 redo_srr:
5707 if (optlen < IP_ADDR_LEN ||
5708 off > optlen - IP_ADDR_LEN) {
5709 /* End of source route */
5710 ip1dbg(("ip_massage_options: end of SR\n"));
5711 break;
5712 }
5713 bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
5714 ip1dbg(("ip_massage_options: next hop 0x%x\n",
5715 ntohl(dst)));
5716 /*
5717 * Check if our address is present more than
5718 * once as consecutive hops in source route.
5719 * XXX verify per-interface ip_forwarding
5720 * for source route?
5721 */
5722 if (ip_type_v4(dst, ipst) == IRE_LOCAL) {
5723 off += IP_ADDR_LEN;
5724 goto redo_srr;
5725 }
5726 if (dst == htonl(INADDR_LOOPBACK)) {
5727 ip1dbg(("ip_massage_options: loopback addr in "
5728 "source route!\n"));
5729 break;
5730 }
5731 /*
5732 * Update ipha_dst to be the first hop and remove the
5733 * first hop from the source route (by overwriting
5734 * part of the option with NOP options).
5735 */
5736 ipha->ipha_dst = dst;
5737 /* Put the last entry in dst */
5738 off = ((optlen - IP_ADDR_LEN - 3) & ~(IP_ADDR_LEN-1)) +
5739 3;
5740 bcopy(&opt[off], &dst, IP_ADDR_LEN);
5741
5742 ip1dbg(("ip_massage_options: last hop 0x%x\n",
5743 ntohl(dst)));
5744 /* Move down and overwrite */
5745 opt[IP_ADDR_LEN] = opt[0];
5746 opt[IP_ADDR_LEN+1] = opt[IPOPT_OLEN] - IP_ADDR_LEN;
5747 opt[IP_ADDR_LEN+2] = opt[IPOPT_OFFSET];
5748 for (i = 0; i < IP_ADDR_LEN; i++)
5749 opt[i] = IPOPT_NOP;
5750 break;
5751 }
5752 }
5753 return (dst);
5754 }
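
/*
 * Worked example (not compiled) of the rewrite that ip_massage_options()
 * above performs on a two-entry LSRR; the addresses are hypothetical.
 */
#ifdef notdef
/* Before: first hop 10.0.0.1, final destination 10.0.0.2. */
static uchar_t lsrr_before[] = {
	IPOPT_LSRR, 11, 4,		/* type, length, pointer */
	10, 0, 0, 1,			/* first hop */
	10, 0, 0, 2			/* final destination */
};
/*
 * After: ipha_dst has been set to 10.0.0.1, the option header was slid
 * forward by IP_ADDR_LEN with the vacated bytes turned into NOPs, and
 * 10.0.0.2 is returned as the final destination.
 */
static uchar_t lsrr_after[] = {
	IPOPT_NOP, IPOPT_NOP, IPOPT_NOP, IPOPT_NOP,
	IPOPT_LSRR, 7, 4,		/* length shrunk by IP_ADDR_LEN */
	10, 0, 0, 2
};
#endif /* notdef */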
5755
5756 /*
5757 * Return the network mask
5758 * associated with the specified address.
5759 */
5760 ipaddr_t
5761 ip_net_mask(ipaddr_t addr)
5762 {
5763 uchar_t *up = (uchar_t *)&addr;
5764 ipaddr_t mask = 0;
5765 uchar_t *maskp = (uchar_t *)&mask;
5766
5767 #if defined(__i386) || defined(__amd64)
5768 #define TOTALLY_BRAIN_DAMAGED_C_COMPILER
5769 #endif
5770 #ifdef TOTALLY_BRAIN_DAMAGED_C_COMPILER
5771 maskp[0] = maskp[1] = maskp[2] = maskp[3] = 0;
5772 #endif
5773 if (CLASSD(addr)) {
5774 maskp[0] = 0xF0;
5775 return (mask);
5776 }
5777
	/* We assume a default netmask of /32 for Class E */
5779 if (CLASSE(addr))
5780 return (0xffffffffU);
5781
5782 if (addr == 0)
5783 return (0);
5784 maskp[0] = 0xFF;
5785 if ((up[0] & 0x80) == 0)
5786 return (mask);
5787
5788 maskp[1] = 0xFF;
5789 if ((up[0] & 0xC0) == 0x80)
5790 return (mask);
5791
5792 maskp[2] = 0xFF;
5793 if ((up[0] & 0xE0) == 0xC0)
5794 return (mask);
5795
5796 /* Otherwise return no mask */
5797 return ((ipaddr_t)0);
5798 }
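
/*
 * Illustrative sketch (not compiled) of the classful defaults that
 * ip_net_mask() above computes; addresses and masks are in network byte
 * order, and the example addresses are arbitrary.
 */
#ifdef notdef
static void
ip_net_mask_examples(void)
{
	ASSERT(ip_net_mask(htonl(0x0a010203)) == htonl(0xff000000));
						/* 10.1.2.3, class A */
	ASSERT(ip_net_mask(htonl(0xac100001)) == htonl(0xffff0000));
						/* 172.16.0.1, class B */
	ASSERT(ip_net_mask(htonl(0xc0000201)) == htonl(0xffffff00));
						/* 192.0.2.1, class C */
	ASSERT(ip_net_mask(htonl(0xe0000001)) == htonl(0xf0000000));
						/* 224.0.0.1, class D */
	ASSERT(ip_net_mask(htonl(0xf0000001)) == 0xffffffffU);
						/* 240.0.0.1, class E */
}
#endif /* notdef */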
5799
5800 /* Name/Value Table Lookup Routine */
5801 char *
5802 ip_nv_lookup(nv_t *nv, int value)
5803 {
5804 if (!nv)
5805 return (NULL);
5806 for (; nv->nv_name; nv++) {
5807 if (nv->nv_value == value)
5808 return (nv->nv_name);
5809 }
5810 return ("unknown");
5811 }
5812
5813 static int
5814 ip_wait_for_info_ack(ill_t *ill)
5815 {
5816 int err;
5817
5818 mutex_enter(&ill->ill_lock);
5819 while (ill->ill_state_flags & ILL_LL_SUBNET_PENDING) {
5820 /*
5821 * Return value of 0 indicates a pending signal.
5822 */
5823 err = cv_wait_sig(&ill->ill_cv, &ill->ill_lock);
5824 if (err == 0) {
5825 mutex_exit(&ill->ill_lock);
5826 return (EINTR);
5827 }
5828 }
5829 mutex_exit(&ill->ill_lock);
5830 /*
5831 * ip_rput_other could have set an error in ill_error on
5832 * receipt of M_ERROR.
5833 */
5834 return (ill->ill_error);
5835 }
5836
5837 /*
5838 * This is a module open, i.e. this is a control stream for access
5839 * to a DLPI device. We allocate an ill_t as the instance data in
5840 * this case.
5841 */
5842 static int
5843 ip_modopen(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
5844 {
5845 ill_t *ill;
5846 int err;
5847 zoneid_t zoneid;
5848 netstack_t *ns;
5849 ip_stack_t *ipst;
5850
5851 /*
5852 * Prevent unprivileged processes from pushing IP so that
5853 * they can't send raw IP.
5854 */
5855 if (secpolicy_net_rawaccess(credp) != 0)
5856 return (EPERM);
5857
5858 ns = netstack_find_by_cred(credp);
5859 ASSERT(ns != NULL);
5860 ipst = ns->netstack_ip;
5861 ASSERT(ipst != NULL);
5862
5863 /*
5864 * For exclusive stacks we set the zoneid to zero
5865 * to make IP operate as if in the global zone.
5866 */
5867 if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID)
5868 zoneid = GLOBAL_ZONEID;
5869 else
5870 zoneid = crgetzoneid(credp);
5871
5872 ill = (ill_t *)mi_open_alloc_sleep(sizeof (ill_t));
5873 q->q_ptr = WR(q)->q_ptr = ill;
5874 ill->ill_ipst = ipst;
5875 ill->ill_zoneid = zoneid;
5876
5877 /*
	 * ill_init initializes the ill fields and then sends down
	 * a DL_INFO_REQ after calling qprocson.
5880 */
5881 err = ill_init(q, ill);
5882
5883 if (err != 0) {
5884 mi_free(ill);
5885 netstack_rele(ipst->ips_netstack);
5886 q->q_ptr = NULL;
5887 WR(q)->q_ptr = NULL;
5888 return (err);
5889 }
5890
5891 /*
5892 * Wait for the DL_INFO_ACK if a DL_INFO_REQ was sent.
5893 *
	 * ill_init initializes the ipsq, marking this thread as a
	 * writer.
5896 */
5897 ipsq_exit(ill->ill_phyint->phyint_ipsq);
5898 err = ip_wait_for_info_ack(ill);
5899 if (err == 0)
5900 ill->ill_credp = credp;
5901 else
5902 goto fail;
5903
5904 crhold(credp);
5905
5906 mutex_enter(&ipst->ips_ip_mi_lock);
5907 err = mi_open_link(&ipst->ips_ip_g_head, (IDP)q->q_ptr, devp, flag,
5908 sflag, credp);
5909 mutex_exit(&ipst->ips_ip_mi_lock);
5910 fail:
5911 if (err) {
5912 (void) ip_close(q, 0);
5913 return (err);
5914 }
5915 return (0);
5916 }
5917
5918 /* For /dev/ip aka AF_INET open */
5919 int
5920 ip_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
5921 {
5922 return (ip_open(q, devp, flag, sflag, credp, B_FALSE));
5923 }
5924
5925 /* For /dev/ip6 aka AF_INET6 open */
5926 int
5927 ip_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
5928 {
5929 return (ip_open(q, devp, flag, sflag, credp, B_TRUE));
5930 }
5931
5932 /* IP open routine. */
5933 int
5934 ip_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
5935 boolean_t isv6)
5936 {
5937 conn_t *connp;
5938 major_t maj;
5939 zoneid_t zoneid;
5940 netstack_t *ns;
5941 ip_stack_t *ipst;
5942
5943 /* Allow reopen. */
5944 if (q->q_ptr != NULL)
5945 return (0);
5946
5947 if (sflag & MODOPEN) {
5948 /* This is a module open */
5949 return (ip_modopen(q, devp, flag, sflag, credp));
5950 }
5951
5952 if ((flag & ~(FKLYR)) == IP_HELPER_STR) {
5953 /*
		 * A non-STREAMS socket looking for a stream
		 * to access IP.
5956 */
5957 return (ip_helper_stream_setup(q, devp, flag, sflag,
5958 credp, isv6));
5959 }
5960
5961 ns = netstack_find_by_cred(credp);
5962 ASSERT(ns != NULL);
5963 ipst = ns->netstack_ip;
5964 ASSERT(ipst != NULL);
5965
5966 /*
5967 * For exclusive stacks we set the zoneid to zero
5968 * to make IP operate as if in the global zone.
5969 */
5970 if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID)
5971 zoneid = GLOBAL_ZONEID;
5972 else
5973 zoneid = crgetzoneid(credp);
5974
5975 /*
5976 * We are opening as a device. This is an IP client stream, and we
5977 * allocate an conn_t as the instance data.
5978 */
5979 connp = ipcl_conn_create(IPCL_IPCCONN, KM_SLEEP, ipst->ips_netstack);
5980
5981 /*
5982 * ipcl_conn_create did a netstack_hold. Undo the hold that was
5983 * done by netstack_find_by_cred()
5984 */
5985 netstack_rele(ipst->ips_netstack);
5986
5987 connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_ULP_CKSUM;
5988 /* conn_allzones can not be set this early, hence no IPCL_ZONEID */
5989 connp->conn_ixa->ixa_zoneid = zoneid;
5990 connp->conn_zoneid = zoneid;
5991
5992 connp->conn_rq = q;
5993 q->q_ptr = WR(q)->q_ptr = connp;
5994
5995 /* Minor tells us which /dev entry was opened */
5996 if (isv6) {
5997 connp->conn_family = AF_INET6;
5998 connp->conn_ipversion = IPV6_VERSION;
5999 connp->conn_ixa->ixa_flags &= ~IXAF_IS_IPV4;
6000 connp->conn_ixa->ixa_src_preferences = IPV6_PREFER_SRC_DEFAULT;
6001 } else {
6002 connp->conn_family = AF_INET;
6003 connp->conn_ipversion = IPV4_VERSION;
6004 connp->conn_ixa->ixa_flags |= IXAF_IS_IPV4;
6005 }
6006
6007 if ((ip_minor_arena_la != NULL) && (flag & SO_SOCKSTR) &&
6008 ((connp->conn_dev = inet_minor_alloc(ip_minor_arena_la)) != 0)) {
6009 connp->conn_minor_arena = ip_minor_arena_la;
6010 } else {
6011 /*
6012 * Either minor numbers in the large arena were exhausted
		 * or a non-socket application is doing the open.
6014 * Try to allocate from the small arena.
6015 */
6016 if ((connp->conn_dev =
6017 inet_minor_alloc(ip_minor_arena_sa)) == 0) {
6018 /* CONN_DEC_REF takes care of netstack_rele() */
6019 q->q_ptr = WR(q)->q_ptr = NULL;
6020 CONN_DEC_REF(connp);
6021 return (EBUSY);
6022 }
6023 connp->conn_minor_arena = ip_minor_arena_sa;
6024 }
6025
6026 maj = getemajor(*devp);
6027 *devp = makedevice(maj, (minor_t)connp->conn_dev);
6028
6029 /*
6030 * connp->conn_cred is crfree()ed in ipcl_conn_destroy()
6031 */
6032 connp->conn_cred = credp;
6033 connp->conn_cpid = curproc->p_pid;
6034 /* Cache things in ixa without an extra refhold */
6035 ASSERT(!(connp->conn_ixa->ixa_free_flags & IXA_FREE_CRED));
6036 connp->conn_ixa->ixa_cred = connp->conn_cred;
6037 connp->conn_ixa->ixa_cpid = connp->conn_cpid;
6038 if (is_system_labeled())
6039 connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred);
6040
6041 /*
6042 * Handle IP_IOC_RTS_REQUEST and other ioctls which use conn_recv
6043 */
6044 connp->conn_recv = ip_conn_input;
6045 connp->conn_recvicmp = ip_conn_input_icmp;
6046
6047 crhold(connp->conn_cred);
6048
6049 /*
6050 * If the caller has the process-wide flag set, then default to MAC
6051 * exempt mode. This allows read-down to unlabeled hosts.
6052 */
6053 if (getpflags(NET_MAC_AWARE, credp) != 0)
6054 connp->conn_mac_mode = CONN_MAC_AWARE;
6055
6056 connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID);
6057
6058 connp->conn_rq = q;
6059 connp->conn_wq = WR(q);
6060
6061 /* Non-zero default values */
6062 connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP;
6063
6064 /*
6065 * Make the conn globally visible to walkers
6066 */
6067 ASSERT(connp->conn_ref == 1);
6068 mutex_enter(&connp->conn_lock);
6069 connp->conn_state_flags &= ~CONN_INCIPIENT;
6070 mutex_exit(&connp->conn_lock);
6071
6072 qprocson(q);
6073
6074 return (0);
6075 }
6076
6077 /*
 * Set IPsec policy from an ipsec_req_t. If the req is not "zero" and is
 * valid, the policy is copied to the conn_t. If the req is "zero", the
 * policy is zeroed out. A "zero" policy has zero
 * ipsr_{ah,esp,self_encap}_req fields.
6082 * We keep only the latest setting of the policy and thus policy setting
6083 * is not incremental/cumulative.
6084 *
6085 * Requests to set policies with multiple alternative actions will
6086 * go through a different API.
6087 */
6088 int
6089 ipsec_set_req(cred_t *cr, conn_t *connp, ipsec_req_t *req)
6090 {
6091 uint_t ah_req = 0;
6092 uint_t esp_req = 0;
6093 uint_t se_req = 0;
6094 ipsec_act_t *actp = NULL;
6095 uint_t nact;
6096 ipsec_policy_head_t *ph;
6097 boolean_t is_pol_reset, is_pol_inserted = B_FALSE;
6098 int error = 0;
6099 netstack_t *ns = connp->conn_netstack;
6100 ip_stack_t *ipst = ns->netstack_ip;
6101 ipsec_stack_t *ipss = ns->netstack_ipsec;
6102
6103 #define REQ_MASK (IPSEC_PREF_REQUIRED|IPSEC_PREF_NEVER)
6104
6105 /*
6106 * The IP_SEC_OPT option does not allow variable length parameters,
6107 * hence a request cannot be NULL.
6108 */
6109 if (req == NULL)
6110 return (EINVAL);
6111
6112 ah_req = req->ipsr_ah_req;
6113 esp_req = req->ipsr_esp_req;
6114 se_req = req->ipsr_self_encap_req;
6115
6116 /* Don't allow setting self-encap without one or more of AH/ESP. */
6117 if (se_req != 0 && esp_req == 0 && ah_req == 0)
6118 return (EINVAL);
6119
6120 /*
	 * Are we dealing with a request to reset the policy (i.e.,
	 * zero requests)?
6123 */
6124 is_pol_reset = ((ah_req & REQ_MASK) == 0 &&
6125 (esp_req & REQ_MASK) == 0 &&
6126 (se_req & REQ_MASK) == 0);
6127
6128 if (!is_pol_reset) {
6129 /*
6130 * If we couldn't load IPsec, fail with "protocol
6131 * not supported".
6132 * IPsec may not have been loaded for a request with zero
6133 * policies, so we don't fail in this case.
6134 */
6135 mutex_enter(&ipss->ipsec_loader_lock);
6136 if (ipss->ipsec_loader_state != IPSEC_LOADER_SUCCEEDED) {
6137 mutex_exit(&ipss->ipsec_loader_lock);
6138 return (EPROTONOSUPPORT);
6139 }
6140 mutex_exit(&ipss->ipsec_loader_lock);
6141
6142 /*
6143 * Test for valid requests. Invalid algorithms
6144 * need to be tested by IPsec code because new
6145 * algorithms can be added dynamically.
6146 */
6147 if ((ah_req & ~(REQ_MASK|IPSEC_PREF_UNIQUE)) != 0 ||
6148 (esp_req & ~(REQ_MASK|IPSEC_PREF_UNIQUE)) != 0 ||
6149 (se_req & ~(REQ_MASK|IPSEC_PREF_UNIQUE)) != 0) {
6150 return (EINVAL);
6151 }
6152
6153 /*
6154 * Only privileged users can issue these
6155 * requests.
6156 */
6157 if (((ah_req & IPSEC_PREF_NEVER) ||
6158 (esp_req & IPSEC_PREF_NEVER) ||
6159 (se_req & IPSEC_PREF_NEVER)) &&
6160 secpolicy_ip_config(cr, B_FALSE) != 0) {
6161 return (EPERM);
6162 }
6163
6164 /*
6165 * The IPSEC_PREF_REQUIRED and IPSEC_PREF_NEVER
6166 * are mutually exclusive.
6167 */
6168 if (((ah_req & REQ_MASK) == REQ_MASK) ||
6169 ((esp_req & REQ_MASK) == REQ_MASK) ||
6170 ((se_req & REQ_MASK) == REQ_MASK)) {
6171 /* Both of them are set */
6172 return (EINVAL);
6173 }
6174 }
6175
6176 ASSERT(MUTEX_HELD(&connp->conn_lock));
6177
6178 /*
6179 * If we have already cached policies in conn_connect(), don't
6180 * let them change now. We cache policies for connections
6181 * whose src,dst [addr, port] is known.
6182 */
6183 if (connp->conn_policy_cached) {
6184 return (EINVAL);
6185 }
6186
6187 /*
	 * We have zero policies; reset the connection policy if it is
	 * already set. This will cause the connection to inherit the
	 * global policy, if any.
6191 */
6192 if (is_pol_reset) {
6193 if (connp->conn_policy != NULL) {
6194 IPPH_REFRELE(connp->conn_policy, ipst->ips_netstack);
6195 connp->conn_policy = NULL;
6196 }
6197 connp->conn_in_enforce_policy = B_FALSE;
6198 connp->conn_out_enforce_policy = B_FALSE;
6199 return (0);
6200 }
6201
6202 ph = connp->conn_policy = ipsec_polhead_split(connp->conn_policy,
6203 ipst->ips_netstack);
6204 if (ph == NULL)
6205 goto enomem;
6206
6207 ipsec_actvec_from_req(req, &actp, &nact, ipst->ips_netstack);
6208 if (actp == NULL)
6209 goto enomem;
6210
6211 /*
6212 * Always insert IPv4 policy entries, since they can also apply to
	 * IPv6 sockets being used in IPv4-compat mode.
6214 */
6215 if (!ipsec_polhead_insert(ph, actp, nact, IPSEC_AF_V4,
6216 IPSEC_TYPE_INBOUND, ns))
6217 goto enomem;
6218 is_pol_inserted = B_TRUE;
6219 if (!ipsec_polhead_insert(ph, actp, nact, IPSEC_AF_V4,
6220 IPSEC_TYPE_OUTBOUND, ns))
6221 goto enomem;
6222
6223 /*
	 * If we're looking at a v6 socket, also insert the v6-specific
6225 * entries.
6226 */
6227 if (connp->conn_family == AF_INET6) {
6228 if (!ipsec_polhead_insert(ph, actp, nact, IPSEC_AF_V6,
6229 IPSEC_TYPE_INBOUND, ns))
6230 goto enomem;
6231 if (!ipsec_polhead_insert(ph, actp, nact, IPSEC_AF_V6,
6232 IPSEC_TYPE_OUTBOUND, ns))
6233 goto enomem;
6234 }
6235
6236 ipsec_actvec_free(actp, nact);
6237
6238 /*
	 * If the requests need security, set enforce_policy.
	 * Even if the requests are IPSEC_PREF_NEVER, one should
	 * still set conn_out_enforce_policy so that ip_set_destination
	 * marks the ip_xmit_attr_t appropriately. This is needed so
	 * that, for connections whose policy we don't cache at connect
	 * time, we don't wrongly inherit global policy when it matches
	 * in ip_output_attach_policy. Similarly, we need to set
	 * conn_in_enforce_policy so that we don't verify policy
	 * wrongly.
6248 */
6249 if ((ah_req & REQ_MASK) != 0 ||
6250 (esp_req & REQ_MASK) != 0 ||
6251 (se_req & REQ_MASK) != 0) {
6252 connp->conn_in_enforce_policy = B_TRUE;
6253 connp->conn_out_enforce_policy = B_TRUE;
6254 }
6255
6256 return (error);
6257 #undef REQ_MASK
6258
6259 /*
6260 * Common memory-allocation-failure exit path.
6261 */
6262 enomem:
6263 if (actp != NULL)
6264 ipsec_actvec_free(actp, nact);
6265 if (is_pol_inserted)
6266 ipsec_polhead_flush(ph, ns);
6267 return (ENOMEM);
6268 }
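
/*
 * Illustrative sketch (not compiled) of the two kinds of requests that
 * ipsec_set_req() above distinguishes. The field names come from
 * ipsec_req_t; everything else here is hypothetical.
 */
#ifdef notdef
static void
ipsec_req_examples(void)
{
	ipsec_req_t req;

	/* Require ESP on this conn (passed in via IP_SEC_OPT). */
	bzero(&req, sizeof (req));
	req.ipsr_esp_req = IPSEC_PREF_REQUIRED;

	/*
	 * A "zero" request: ipsr_ah_req, ipsr_esp_req and
	 * ipsr_self_encap_req are all zero, so is_pol_reset is true and
	 * the conn falls back to global policy.
	 */
	bzero(&req, sizeof (req));
}
#endif /* notdef */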
6269
6270 /*
6271 * Set socket options for joining and leaving multicast groups.
6272 * Common to IPv4 and IPv6; inet6 indicates the type of socket.
 * The caller has already checked that the option name is consistent with
6274 * the address family of the socket.
6275 */
6276 int
6277 ip_opt_set_multicast_group(conn_t *connp, t_scalar_t name,
6278 uchar_t *invalp, boolean_t inet6, boolean_t checkonly)
6279 {
6280 int *i1 = (int *)invalp;
6281 int error = 0;
6282 ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
6283 struct ip_mreq *v4_mreqp;
6284 struct ipv6_mreq *v6_mreqp;
6285 struct group_req *greqp;
6286 ire_t *ire;
6287 boolean_t done = B_FALSE;
6288 ipaddr_t ifaddr;
6289 in6_addr_t v6group;
6290 uint_t ifindex;
6291 boolean_t mcast_opt = B_TRUE;
6292 mcast_record_t fmode;
6293 int (*optfn)(conn_t *, boolean_t, const in6_addr_t *,
6294 ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *);
6295
6296 switch (name) {
6297 case IP_ADD_MEMBERSHIP:
6298 case IPV6_JOIN_GROUP:
6299 mcast_opt = B_FALSE;
6300 /* FALLTHRU */
6301 case MCAST_JOIN_GROUP:
6302 fmode = MODE_IS_EXCLUDE;
6303 optfn = ip_opt_add_group;
6304 break;
6305
6306 case IP_DROP_MEMBERSHIP:
6307 case IPV6_LEAVE_GROUP:
6308 mcast_opt = B_FALSE;
6309 /* FALLTHRU */
6310 case MCAST_LEAVE_GROUP:
6311 fmode = MODE_IS_INCLUDE;
6312 optfn = ip_opt_delete_group;
6313 break;
6314 default:
6315 ASSERT(0);
6316 }
6317
6318 if (mcast_opt) {
6319 struct sockaddr_in *sin;
6320 struct sockaddr_in6 *sin6;
6321
6322 greqp = (struct group_req *)i1;
6323 if (greqp->gr_group.ss_family == AF_INET) {
6324 sin = (struct sockaddr_in *)&(greqp->gr_group);
6325 IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &v6group);
6326 } else {
6327 if (!inet6)
6328 return (EINVAL); /* Not on INET socket */
6329
6330 sin6 = (struct sockaddr_in6 *)&(greqp->gr_group);
6331 v6group = sin6->sin6_addr;
6332 }
6333 ifaddr = INADDR_ANY;
6334 ifindex = greqp->gr_interface;
6335 } else if (inet6) {
6336 v6_mreqp = (struct ipv6_mreq *)i1;
6337 v6group = v6_mreqp->ipv6mr_multiaddr;
6338 ifaddr = INADDR_ANY;
6339 ifindex = v6_mreqp->ipv6mr_interface;
6340 } else {
6341 v4_mreqp = (struct ip_mreq *)i1;
6342 IN6_INADDR_TO_V4MAPPED(&v4_mreqp->imr_multiaddr, &v6group);
6343 ifaddr = (ipaddr_t)v4_mreqp->imr_interface.s_addr;
6344 ifindex = 0;
6345 }
6346
6347 /*
6348 * In the multirouting case, we need to replicate
6349 * the request on all interfaces that will take part
6350 * in replication. We do so because multirouting is
	 * reflective, thus we will probably receive multicasts
	 * on those interfaces.
6353 * The ip_multirt_apply_membership() succeeds if
6354 * the operation succeeds on at least one interface.
6355 */
6356 if (IN6_IS_ADDR_V4MAPPED(&v6group)) {
6357 ipaddr_t group;
6358
6359 IN6_V4MAPPED_TO_IPADDR(&v6group, group);
6360
6361 ire = ire_ftable_lookup_v4(group, IP_HOST_MASK, 0,
6362 IRE_HOST | IRE_INTERFACE, NULL, ALL_ZONES, NULL,
6363 MATCH_IRE_MASK | MATCH_IRE_TYPE, 0, ipst, NULL);
6364 } else {
6365 ire = ire_ftable_lookup_v6(&v6group, &ipv6_all_ones, 0,
6366 IRE_HOST | IRE_INTERFACE, NULL, ALL_ZONES, NULL,
6367 MATCH_IRE_MASK | MATCH_IRE_TYPE, 0, ipst, NULL);
6368 }
6369 if (ire != NULL) {
6370 if (ire->ire_flags & RTF_MULTIRT) {
6371 error = ip_multirt_apply_membership(optfn, ire, connp,
6372 checkonly, &v6group, fmode, &ipv6_all_zeros);
6373 done = B_TRUE;
6374 }
6375 ire_refrele(ire);
6376 }
6377
6378 if (!done) {
6379 error = optfn(connp, checkonly, &v6group, ifaddr, ifindex,
6380 fmode, &ipv6_all_zeros);
6381 }
6382 return (error);
6383 }
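
/*
 * Illustrative user-level sketch (not compiled into the kernel) of a
 * protocol-independent join that reaches ip_opt_set_multicast_group()
 * via MCAST_JOIN_GROUP. The group address and interface index are
 * examples; error handling is omitted.
 */
#ifdef notdef
static void
mcast_join_example(void)
{
	int s = socket(AF_INET, SOCK_DGRAM, 0);
	struct group_req gr;
	struct sockaddr_in *sin = (struct sockaddr_in *)&gr.gr_group;

	bzero(&gr, sizeof (gr));
	gr.gr_interface = 2;		/* e.g. from if_nametoindex() */
	sin->sin_family = AF_INET;
	sin->sin_addr.s_addr = htonl(0xe0000101);	/* 224.0.1.1 */

	/* Joins with MODE_IS_EXCLUDE and an empty source list (see above) */
	(void) setsockopt(s, IPPROTO_IP, MCAST_JOIN_GROUP, &gr, sizeof (gr));
}
#endif /* notdef */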
6384
6385 /*
6386 * Set socket options for joining and leaving multicast groups
6387 * for specific sources.
6388 * Common to IPv4 and IPv6; inet6 indicates the type of socket.
 * The caller has already checked that the option name is consistent with
6390 * the address family of the socket.
6391 */
6392 int
6393 ip_opt_set_multicast_sources(conn_t *connp, t_scalar_t name,
6394 uchar_t *invalp, boolean_t inet6, boolean_t checkonly)
6395 {
6396 int *i1 = (int *)invalp;
6397 int error = 0;
6398 ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
6399 struct ip_mreq_source *imreqp;
6400 struct group_source_req *gsreqp;
6401 in6_addr_t v6group, v6src;
6402 uint32_t ifindex;
6403 ipaddr_t ifaddr;
6404 boolean_t mcast_opt = B_TRUE;
6405 mcast_record_t fmode;
6406 ire_t *ire;
6407 boolean_t done = B_FALSE;
6408 int (*optfn)(conn_t *, boolean_t, const in6_addr_t *,
6409 ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *);
6410
6411 switch (name) {
6412 case IP_BLOCK_SOURCE:
6413 mcast_opt = B_FALSE;
6414 /* FALLTHRU */
6415 case MCAST_BLOCK_SOURCE:
6416 fmode = MODE_IS_EXCLUDE;
6417 optfn = ip_opt_add_group;
6418 break;
6419
6420 case IP_UNBLOCK_SOURCE:
6421 mcast_opt = B_FALSE;
6422 /* FALLTHRU */
6423 case MCAST_UNBLOCK_SOURCE:
6424 fmode = MODE_IS_EXCLUDE;
6425 optfn = ip_opt_delete_group;
6426 break;
6427
6428 case IP_ADD_SOURCE_MEMBERSHIP:
6429 mcast_opt = B_FALSE;
6430 /* FALLTHRU */
6431 case MCAST_JOIN_SOURCE_GROUP:
6432 fmode = MODE_IS_INCLUDE;
6433 optfn = ip_opt_add_group;
6434 break;
6435
6436 case IP_DROP_SOURCE_MEMBERSHIP:
6437 mcast_opt = B_FALSE;
6438 /* FALLTHRU */
6439 case MCAST_LEAVE_SOURCE_GROUP:
6440 fmode = MODE_IS_INCLUDE;
6441 optfn = ip_opt_delete_group;
6442 break;
6443 default:
6444 ASSERT(0);
6445 }
6446
6447 if (mcast_opt) {
6448 gsreqp = (struct group_source_req *)i1;
6449 ifindex = gsreqp->gsr_interface;
6450 if (gsreqp->gsr_group.ss_family == AF_INET) {
6451 struct sockaddr_in *s;
6452 s = (struct sockaddr_in *)&gsreqp->gsr_group;
6453 IN6_INADDR_TO_V4MAPPED(&s->sin_addr, &v6group);
6454 s = (struct sockaddr_in *)&gsreqp->gsr_source;
6455 IN6_INADDR_TO_V4MAPPED(&s->sin_addr, &v6src);
6456 } else {
6457 struct sockaddr_in6 *s6;
6458
6459 if (!inet6)
6460 return (EINVAL); /* Not on INET socket */
6461
6462 s6 = (struct sockaddr_in6 *)&gsreqp->gsr_group;
6463 v6group = s6->sin6_addr;
6464 s6 = (struct sockaddr_in6 *)&gsreqp->gsr_source;
6465 v6src = s6->sin6_addr;
6466 }
6467 ifaddr = INADDR_ANY;
6468 } else {
6469 imreqp = (struct ip_mreq_source *)i1;
6470 IN6_INADDR_TO_V4MAPPED(&imreqp->imr_multiaddr, &v6group);
6471 IN6_INADDR_TO_V4MAPPED(&imreqp->imr_sourceaddr, &v6src);
6472 ifaddr = (ipaddr_t)imreqp->imr_interface.s_addr;
6473 ifindex = 0;
6474 }
6475
6476 /*
6477 * Handle src being mapped INADDR_ANY by changing it to unspecified.
6478 */
6479 if (IN6_IS_ADDR_V4MAPPED_ANY(&v6src))
6480 v6src = ipv6_all_zeros;
6481
6482 /*
6483 * In the multirouting case, we need to replicate
6484 * the request as noted in the mcast cases above.
6485 */
6486 if (IN6_IS_ADDR_V4MAPPED(&v6group)) {
6487 ipaddr_t group;
6488
6489 IN6_V4MAPPED_TO_IPADDR(&v6group, group);
6490
6491 ire = ire_ftable_lookup_v4(group, IP_HOST_MASK, 0,
6492 IRE_HOST | IRE_INTERFACE, NULL, ALL_ZONES, NULL,
6493 MATCH_IRE_MASK | MATCH_IRE_TYPE, 0, ipst, NULL);
6494 } else {
6495 ire = ire_ftable_lookup_v6(&v6group, &ipv6_all_ones, 0,
6496 IRE_HOST | IRE_INTERFACE, NULL, ALL_ZONES, NULL,
6497 MATCH_IRE_MASK | MATCH_IRE_TYPE, 0, ipst, NULL);
6498 }
6499 if (ire != NULL) {
6500 if (ire->ire_flags & RTF_MULTIRT) {
6501 error = ip_multirt_apply_membership(optfn, ire, connp,
6502 checkonly, &v6group, fmode, &v6src);
6503 done = B_TRUE;
6504 }
6505 ire_refrele(ire);
6506 }
6507 if (!done) {
6508 error = optfn(connp, checkonly, &v6group, ifaddr, ifindex,
6509 fmode, &v6src);
6510 }
6511 return (error);
6512 }
6513
6514 /*
 * Given a destination address and a pointer to where to put the information,
 * this routine fills in the mtuinfo.
6517 * The socket must be connected.
6518 * For sctp conn_faddr is the primary address.
6519 */
6520 int
6521 ip_fill_mtuinfo(conn_t *connp, ip_xmit_attr_t *ixa, struct ip6_mtuinfo *mtuinfo)
6522 {
6523 uint32_t pmtu = IP_MAXPACKET;
6524 uint_t scopeid;
6525
6526 if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6))
6527 return (-1);
6528
6529 /* In case we never sent or called ip_set_destination_v4/v6 */
6530 if (ixa->ixa_ire != NULL)
6531 pmtu = ip_get_pmtu(ixa);
6532
6533 if (ixa->ixa_flags & IXAF_SCOPEID_SET)
6534 scopeid = ixa->ixa_scopeid;
6535 else
6536 scopeid = 0;
6537
6538 bzero(mtuinfo, sizeof (*mtuinfo));
6539 mtuinfo->ip6m_addr.sin6_family = AF_INET6;
6540 mtuinfo->ip6m_addr.sin6_port = connp->conn_fport;
6541 mtuinfo->ip6m_addr.sin6_addr = connp->conn_faddr_v6;
6542 mtuinfo->ip6m_addr.sin6_scope_id = scopeid;
6543 mtuinfo->ip6m_mtu = pmtu;
6544
6545 return (sizeof (struct ip6_mtuinfo));
6546 }
6547
6548 /*
 * When the src multihoming setting is changed from weak to
 * [strong, preferred], ip_ire_rebind_walker is called to walk the list
 * of all ire_t entries
6551 * and identify routes that were created by user-applications in the
6552 * unbound state (i.e., without RTA_IFP), and for which an ire_ill is not
6553 * currently defined. These routes are then 'rebound', i.e., their ire_ill
6554 * is selected by finding an interface route for the gateway.
6555 */
6556 /* ARGSUSED */
6557 void
6558 ip_ire_rebind_walker(ire_t *ire, void *notused)
6559 {
6560 if (!ire->ire_unbound || ire->ire_ill != NULL)
6561 return;
6562 ire_rebind(ire);
6563 ire_delete(ire);
6564 }
6565
6566 /*
6567 * When the src multihoming is changed from [strong, preferred] to weak,
6568 * ip_ire_unbind_walker is called to walk the list of all ire_t entries, and
6569 * set any entries that were created by user-applications in the unbound state
6570 * (i.e., without RTA_IFP) back to having a NULL ire_ill.
6571 */
6572 /* ARGSUSED */
6573 void
6574 ip_ire_unbind_walker(ire_t *ire, void *notused)
6575 {
6576 ire_t *new_ire;
6577
6578 if (!ire->ire_unbound || ire->ire_ill == NULL)
6579 return;
6580 if (ire->ire_ipversion == IPV6_VERSION) {
6581 new_ire = ire_create_v6(&ire->ire_addr_v6, &ire->ire_mask_v6,
6582 &ire->ire_gateway_addr_v6, ire->ire_type, NULL,
6583 ire->ire_zoneid, ire->ire_flags, NULL, ire->ire_ipst);
6584 } else {
6585 new_ire = ire_create((uchar_t *)&ire->ire_addr,
6586 (uchar_t *)&ire->ire_mask,
6587 (uchar_t *)&ire->ire_gateway_addr, ire->ire_type, NULL,
6588 ire->ire_zoneid, ire->ire_flags, NULL, ire->ire_ipst);
6589 }
6590 if (new_ire == NULL)
6591 return;
6592 new_ire->ire_unbound = B_TRUE;
6593 /*
6594 * The bound ire must first be deleted so that we don't return
6595 * the existing one on the attempt to add the unbound new_ire.
6596 */
6597 ire_delete(ire);
6598 new_ire = ire_add(new_ire);
6599 if (new_ire != NULL)
6600 ire_refrele(new_ire);
6601 }
6602
6603 /*
6604 * When the settings of ip*_strict_src_multihoming tunables are changed,
6605 * all cached routes need to be recomputed. This recomputation needs to be
6606 * done when going from weaker to stronger modes so that the cached ire
6607 * for the connection does not violate the current ip*_strict_src_multihoming
6608 * setting. It also needs to be done when going from stronger to weaker modes,
6609 * so that we fall back to matching on the longest-matching-route (as opposed
6610 * to a shorter match that may have been selected in the strong mode
6611 * to satisfy src_multihoming settings).
6612 *
 * The cached ixa_ire entries for all conn_t entries are marked as
6614 * "verify" so that they will be recomputed for the next packet.
6615 */
6616 void
6617 conn_ire_revalidate(conn_t *connp, void *arg)
6618 {
6619 boolean_t isv6 = (boolean_t)arg;
6620
6621 if ((isv6 && connp->conn_ipversion != IPV6_VERSION) ||
6622 (!isv6 && connp->conn_ipversion != IPV4_VERSION))
6623 return;
6624 connp->conn_ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
6625 }
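
/*
 * Worked trace (hypothetical offsets) of the hole accounting performed
 * by ip_reassemble() below, for a 1200-byte datagram arriving as three
 * fragments out of order:
 *
 *   [0, 400)     MF=1  first piece queued; one trailing hole, so the
 *                      hole count becomes 1.
 *   [800, 1200)  MF=0  linked beyond the tail; the trailing hole closes
 *                      but the gap [400, 800) remains, so the count
 *                      stays at 1.
 *   [400, 800)   MF=1  fills the remaining gap exactly; the count drops
 *                      to 0 and the datagram is complete.
 */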
6626
6627 /*
 * Handles both IPv4 and IPv6 reassembly - doing the out-of-order cases.
6629 * When an ipf is passed here for the first time, if
6630 * we already have in-order fragments on the queue, we convert from the fast-
6631 * path reassembly scheme to the hard-case scheme. From then on, additional
6632 * fragments are reassembled here. We keep track of the start and end offsets
6633 * of each piece, and the number of holes in the chain. When the hole count
6634 * goes to zero, we are done!
6635 *
6636 * The ipf_count will be updated to account for any mblk(s) added (pointed to
6637 * by mp) or subtracted (freeb()ed dups), upon return the caller must update
6638 * ipfb_count and ill_frag_count by the difference of ipf_count before and
6639 * after the call to ip_reassemble().
6640 */
6641 int
6642 ip_reassemble(mblk_t *mp, ipf_t *ipf, uint_t start, boolean_t more, ill_t *ill,
6643 size_t msg_len)
6644 {
6645 uint_t end;
6646 mblk_t *next_mp;
6647 mblk_t *mp1;
6648 uint_t offset;
6649 boolean_t incr_dups = B_TRUE;
6650 boolean_t offset_zero_seen = B_FALSE;
6651 boolean_t pkt_boundary_checked = B_FALSE;
6652
6653 /* If start == 0 then ipf_nf_hdr_len has to be set. */
6654 ASSERT(start != 0 || ipf->ipf_nf_hdr_len != 0);
6655
6656 /* Add in byte count */
6657 ipf->ipf_count += msg_len;
6658 if (ipf->ipf_end) {
6659 /*
6660 * We were part way through in-order reassembly, but now there
6661 * is a hole. We walk through messages already queued, and
6662 * mark them for hard case reassembly. We know that up till
6663 * now they were in order starting from offset zero.
6664 */
6665 offset = 0;
6666 for (mp1 = ipf->ipf_mp->b_cont; mp1; mp1 = mp1->b_cont) {
6667 IP_REASS_SET_START(mp1, offset);
6668 if (offset == 0) {
6669 ASSERT(ipf->ipf_nf_hdr_len != 0);
6670 offset = -ipf->ipf_nf_hdr_len;
6671 }
6672 offset += mp1->b_wptr - mp1->b_rptr;
6673 IP_REASS_SET_END(mp1, offset);
6674 }
6675 /* One hole at the end. */
6676 ipf->ipf_hole_cnt = 1;
6677 /* Brand it as a hard case, forever. */
6678 ipf->ipf_end = 0;
6679 }
6680 /* Walk through all the new pieces. */
6681 do {
6682 end = start + (mp->b_wptr - mp->b_rptr);
6683 /*
		 * If start is 0, decrease 'end' only for the first mblk of
		 * the fragment. Otherwise 'end' can get the wrong value in
		 * the second pass of the loop if the first mblk is exactly
		 * the size of ipf_nf_hdr_len.
6688 */
6689 if (start == 0 && !offset_zero_seen) {
6690 /* First segment */
6691 ASSERT(ipf->ipf_nf_hdr_len != 0);
6692 end -= ipf->ipf_nf_hdr_len;
6693 offset_zero_seen = B_TRUE;
6694 }
6695 next_mp = mp->b_cont;
6696 /*
		 * We are checking to see if there is any interesting data
6698 * to process. If there isn't and the mblk isn't the
6699 * one which carries the unfragmentable header then we
6700 * drop it. It's possible to have just the unfragmentable
6701 * header come through without any data. That needs to be
6702 * saved.
6703 *
6704 * If the assert at the top of this function holds then the
6705 * term "ipf->ipf_nf_hdr_len != 0" isn't needed. This code
6706 * is infrequently traveled enough that the test is left in
6707 * to protect against future code changes which break that
6708 * invariant.
6709 */
6710 if (start == end && start != 0 && ipf->ipf_nf_hdr_len != 0) {
6711 /* Empty. Blast it. */
6712 IP_REASS_SET_START(mp, 0);
6713 IP_REASS_SET_END(mp, 0);
6714 /*
6715 * If the ipf points to the mblk we are about to free,
6716 * update ipf to point to the next mblk (or NULL
6717 * if none).
6718 */
6719 if (ipf->ipf_mp->b_cont == mp)
6720 ipf->ipf_mp->b_cont = next_mp;
6721 freeb(mp);
6722 continue;
6723 }
6724 mp->b_cont = NULL;
6725 IP_REASS_SET_START(mp, start);
6726 IP_REASS_SET_END(mp, end);
6727 if (!ipf->ipf_tail_mp) {
6728 ipf->ipf_tail_mp = mp;
6729 ipf->ipf_mp->b_cont = mp;
6730 if (start == 0 || !more) {
6731 ipf->ipf_hole_cnt = 1;
6732 /*
6733 * if the first fragment comes in more than one
6734 * mblk, this loop will be executed for each
6735 * mblk. Need to adjust hole count so exiting
6736 * this routine will leave hole count at 1.
6737 */
6738 if (next_mp)
6739 ipf->ipf_hole_cnt++;
6740 } else
6741 ipf->ipf_hole_cnt = 2;
6742 continue;
6743 } else if (ipf->ipf_last_frag_seen && !more &&
6744 !pkt_boundary_checked) {
6745 /*
6746 * We check datagram boundary only if this fragment
6747 * claims to be the last fragment and we have seen a
6748 * last fragment in the past too. We do this only
6749 * once for a given fragment.
6750 *
			 * start cannot be 0 here as fragments with start=0
			 * and MF=0 are handled as a complete packet. Such
			 * fragments should not reach here.
6754 */
6755
6756 if (start + msgdsize(mp) !=
6757 IP_REASS_END(ipf->ipf_tail_mp)) {
6758 /*
				 * We have two fragments both of which claim
				 * to be the last fragment but give conflicting
				 * information about the whole datagram size.
6762 * Something fishy is going on. Drop the
6763 * fragment and free up the reassembly list.
6764 */
6765 return (IP_REASS_FAILED);
6766 }
6767
6768 /*
6769 * We shouldn't come to this code block again for this
6770 * particular fragment.
6771 */
6772 pkt_boundary_checked = B_TRUE;
6773 }
6774
6775 /* New stuff at or beyond tail? */
6776 offset = IP_REASS_END(ipf->ipf_tail_mp);
6777 if (start >= offset) {
6778 if (ipf->ipf_last_frag_seen) {
6779 /* current fragment is beyond last fragment */
6780 return (IP_REASS_FAILED);
6781 }
6782 /* Link it on end. */
6783 ipf->ipf_tail_mp->b_cont = mp;
6784 ipf->ipf_tail_mp = mp;
6785 if (more) {
6786 if (start != offset)
6787 ipf->ipf_hole_cnt++;
6788 } else if (start == offset && next_mp == NULL)
6789 ipf->ipf_hole_cnt--;
6790 continue;
6791 }
6792 mp1 = ipf->ipf_mp->b_cont;
6793 offset = IP_REASS_START(mp1);
6794 /* New stuff at the front? */
6795 if (start < offset) {
6796 if (start == 0) {
6797 if (end >= offset) {
6798 /* Nailed the hole at the beginning. */
6799 ipf->ipf_hole_cnt--;
6800 }
6801 } else if (end < offset) {
6802 /*
6803 * A hole, stuff, and a hole where there used
6804 * to be just a hole.
6805 */
6806 ipf->ipf_hole_cnt++;
6807 }
6808 mp->b_cont = mp1;
6809 /* Check for overlap. */
6810 while (end > offset) {
6811 if (end < IP_REASS_END(mp1)) {
6812 mp->b_wptr -= end - offset;
6813 IP_REASS_SET_END(mp, offset);
6814 BUMP_MIB(ill->ill_ip_mib,
6815 ipIfStatsReasmPartDups);
6816 break;
6817 }
6818 /* Did we cover another hole? */
6819 if ((mp1->b_cont &&
6820 IP_REASS_END(mp1) !=
6821 IP_REASS_START(mp1->b_cont) &&
6822 end >= IP_REASS_START(mp1->b_cont)) ||
6823 (!ipf->ipf_last_frag_seen && !more)) {
6824 ipf->ipf_hole_cnt--;
6825 }
6826 /* Clip out mp1. */
6827 if ((mp->b_cont = mp1->b_cont) == NULL) {
6828 /*
6829 * After clipping out mp1, this guy
6830 * is now hanging off the end.
6831 */
6832 ipf->ipf_tail_mp = mp;
6833 }
6834 IP_REASS_SET_START(mp1, 0);
6835 IP_REASS_SET_END(mp1, 0);
6836 /* Subtract byte count */
6837 ipf->ipf_count -= mp1->b_datap->db_lim -
6838 mp1->b_datap->db_base;
6839 freeb(mp1);
6840 BUMP_MIB(ill->ill_ip_mib,
6841 ipIfStatsReasmPartDups);
6842 mp1 = mp->b_cont;
6843 if (!mp1)
6844 break;
6845 offset = IP_REASS_START(mp1);
6846 }
6847 ipf->ipf_mp->b_cont = mp;
6848 continue;
6849 }
6850 /*
6851 * The new piece starts somewhere after the start of the head
6852 * and before the end of the tail.
6853 */
6854 for (; mp1; mp1 = mp1->b_cont) {
6855 offset = IP_REASS_END(mp1);
6856 if (start < offset) {
6857 if (end <= offset) {
6858 /* Nothing new. */
6859 IP_REASS_SET_START(mp, 0);
6860 IP_REASS_SET_END(mp, 0);
6861 /* Subtract byte count */
6862 ipf->ipf_count -= mp->b_datap->db_lim -
6863 mp->b_datap->db_base;
6864 if (incr_dups) {
6865 ipf->ipf_num_dups++;
6866 incr_dups = B_FALSE;
6867 }
6868 freeb(mp);
6869 BUMP_MIB(ill->ill_ip_mib,
6870 ipIfStatsReasmDuplicates);
6871 break;
6872 }
6873 /*
6874 * Trim redundant stuff off beginning of new
6875 * piece.
6876 */
6877 IP_REASS_SET_START(mp, offset);
6878 mp->b_rptr += offset - start;
6879 BUMP_MIB(ill->ill_ip_mib,
6880 ipIfStatsReasmPartDups);
6881 start = offset;
6882 if (!mp1->b_cont) {
6883 /*
6884 * After trimming, this guy is now
6885 * hanging off the end.
6886 */
6887 mp1->b_cont = mp;
6888 ipf->ipf_tail_mp = mp;
6889 if (!more) {
6890 ipf->ipf_hole_cnt--;
6891 }
6892 break;
6893 }
6894 }
6895 if (start >= IP_REASS_START(mp1->b_cont))
6896 continue;
6897 /* Fill a hole */
6898 if (start > offset)
6899 ipf->ipf_hole_cnt++;
6900 mp->b_cont = mp1->b_cont;
6901 mp1->b_cont = mp;
6902 mp1 = mp->b_cont;
6903 offset = IP_REASS_START(mp1);
6904 if (end >= offset) {
6905 ipf->ipf_hole_cnt--;
6906 /* Check for overlap. */
6907 while (end > offset) {
6908 if (end < IP_REASS_END(mp1)) {
6909 mp->b_wptr -= end - offset;
6910 IP_REASS_SET_END(mp, offset);
6911 /*
6912 * TODO: we might bump
6913 * this counter twice if there
6914 * is overlap at both ends.
6915 */
6916 BUMP_MIB(ill->ill_ip_mib,
6917 ipIfStatsReasmPartDups);
6918 break;
6919 }
6920 /* Did we cover another hole? */
6921 if ((mp1->b_cont &&
6922 IP_REASS_END(mp1)
6923 != IP_REASS_START(mp1->b_cont) &&
6924 end >=
6925 IP_REASS_START(mp1->b_cont)) ||
6926 (!ipf->ipf_last_frag_seen &&
6927 !more)) {
6928 ipf->ipf_hole_cnt--;
6929 }
6930 /* Clip out mp1. */
6931 if ((mp->b_cont = mp1->b_cont) ==
6932 NULL) {
6933 /*
6934 * After clipping out mp1,
6935 * this guy is now hanging
6936 * off the end.
6937 */
6938 ipf->ipf_tail_mp = mp;
6939 }
6940 IP_REASS_SET_START(mp1, 0);
6941 IP_REASS_SET_END(mp1, 0);
6942 /* Subtract byte count */
6943 ipf->ipf_count -=
6944 mp1->b_datap->db_lim -
6945 mp1->b_datap->db_base;
6946 freeb(mp1);
6947 BUMP_MIB(ill->ill_ip_mib,
6948 ipIfStatsReasmPartDups);
6949 mp1 = mp->b_cont;
6950 if (!mp1)
6951 break;
6952 offset = IP_REASS_START(mp1);
6953 }
6954 }
6955 break;
6956 }
6957 } while (start = end, mp = next_mp);
6958
6959 /* The fragment just processed could be the last one; remember that */
6960 if (!more)
6961 ipf->ipf_last_frag_seen = B_TRUE;
6962
6963 /* Still got holes? */
6964 if (ipf->ipf_hole_cnt)
6965 return (IP_REASS_PARTIAL);
6966 /* Clean up overloaded fields to avoid upstream disasters. */
6967 for (mp1 = ipf->ipf_mp->b_cont; mp1; mp1 = mp1->b_cont) {
6968 IP_REASS_SET_START(mp1, 0);
6969 IP_REASS_SET_END(mp1, 0);
6970 }
6971 return (IP_REASS_COMPLETE);
6972 }
6973
6974 /*
6975 * Fragmentation reassembly. Each ILL has a hash table for
6976 * queuing packets undergoing reassembly for all IPIFs
6977 * associated with the ILL. The hash is based on the packet
6978 * IP ident field. The ILL frag hash table was allocated
6979 * as a timer block at the time the ILL was created. Whenever
6980 * there is anything on the reassembly queue, the timer will
6981 * be running. Returns the reassembled packet if reassembly completes.
6982 */
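/*
 * For reference, a minimal sketch of the lookup performed below: the
 * bucket is chosen by hashing the source address and IP ident field,
 *
 *	ipfb = &ill->ill_frag_hash_tbl[ILL_FRAG_HASH(src, ident)];
 *
 * and the ipf_t chains within a bucket are then matched on the full
 * (ident, src, dst, protocol) tuple.
 */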
6983 mblk_t *
6984 ip_input_fragment(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
6985 {
6986 uint32_t frag_offset_flags;
6987 mblk_t *t_mp;
6988 ipaddr_t dst;
6989 uint8_t proto = ipha->ipha_protocol;
6990 uint32_t sum_val;
6991 uint16_t sum_flags;
6992 ipf_t *ipf;
6993 ipf_t **ipfp;
6994 ipfb_t *ipfb;
6995 uint16_t ident;
6996 uint32_t offset;
6997 ipaddr_t src;
6998 uint_t hdr_length;
6999 uint32_t end;
7000 mblk_t *mp1;
7001 mblk_t *tail_mp;
7002 size_t count;
7003 size_t msg_len;
7004 uint8_t ecn_info = 0;
7005 uint32_t packet_size;
7006 boolean_t pruned = B_FALSE;
7007 ill_t *ill = ira->ira_ill;
7008 ip_stack_t *ipst = ill->ill_ipst;
7009
7010 /*
7011 * Drop the fragment as early as possible if we
7012 * don't have the resources to reassemble it.
7013 */
7014 if (ipst->ips_ip_reass_queue_bytes == 0) {
7015 freemsg(mp);
7016 return (NULL);
7017 }
7018
7019 /* Check the fragment offset and flags; return if this isn't a fragment */
7020 if ((frag_offset_flags = ntohs(ipha->ipha_fragment_offset_and_flags) &
7021 (IPH_MF | IPH_OFFSET)) == 0)
7022 return (mp);
7023
7024 /*
7025 * We utilize hardware computed checksum info only for UDP since
7026 * IP fragmentation is a normal occurrence for the protocol. In
7027 * addition, checksum offload support for IP fragments carrying
7028 * UDP payload is commonly implemented across network adapters.
7029 */
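/*
 * A sketch of the two offload forms consumed here: with HCK_FULLCKSUM
 * the value in DB_CKSUM16(mp) covers the whole payload, while with
 * HCK_PARTIALCKSUM it covers only the bytes from DB_CKSUMSTART(mp)
 * onward, which is why the partial case below subtracts the sum over
 * any extraneous leading bytes.
 */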
7030 ASSERT(ira->ira_rill != NULL);
7031 if (proto == IPPROTO_UDP && dohwcksum &&
7032 ILL_HCKSUM_CAPABLE(ira->ira_rill) &&
7033 (DB_CKSUMFLAGS(mp) & (HCK_FULLCKSUM | HCK_PARTIALCKSUM))) {
7034 mblk_t *mp1 = mp->b_cont;
7035 int32_t len;
7036
7037 /* Record checksum information from the packet */
7038 sum_val = (uint32_t)DB_CKSUM16(mp);
7039 sum_flags = DB_CKSUMFLAGS(mp);
7040
7041 /* IP payload offset from beginning of mblk */
7042 offset = ((uchar_t *)ipha + IPH_HDR_LENGTH(ipha)) - mp->b_rptr;
7043
7044 if ((sum_flags & HCK_PARTIALCKSUM) &&
7045 (mp1 == NULL || mp1->b_cont == NULL) &&
7046 offset >= DB_CKSUMSTART(mp) &&
7047 ((len = offset - DB_CKSUMSTART(mp)) & 1) == 0) {
7048 uint32_t adj;
7049 /*
7050 * Partial checksum has been calculated by hardware
7051 * and attached to the packet; in addition, any
7052 * prepended extraneous data is even byte aligned.
7053 * If any such data exists, we adjust the checksum;
7054 * this would also handle any postpended data.
7055 */
7056 IP_ADJCKSUM_PARTIAL(mp->b_rptr + DB_CKSUMSTART(mp),
7057 mp, mp1, len, adj);
7058
7059 /* One's complement subtract extraneous checksum */
7060 if (adj >= sum_val)
7061 sum_val = ~(adj - sum_val) & 0xFFFF;
7062 else
7063 sum_val -= adj;
7064 }
7065 } else {
7066 sum_val = 0;
7067 sum_flags = 0;
7068 }
7069
7070 /* Clear hardware checksumming flag */
7071 DB_CKSUMFLAGS(mp) = 0;
7072
7073 ident = ipha->ipha_ident;
7074 offset = (frag_offset_flags << 3) & 0xFFFF;
7075 src = ipha->ipha_src;
7076 dst = ipha->ipha_dst;
7077 hdr_length = IPH_HDR_LENGTH(ipha);
7078 end = ntohs(ipha->ipha_length) - hdr_length;
7079
7080 /* If end == 0 then we have a packet with no data, so just free it */
7081 if (end == 0) {
7082 freemsg(mp);
7083 return (NULL);
7084 }
7085
7086 /* Record the ECN field info. */
7087 ecn_info = (ipha->ipha_type_of_service & 0x3);
7088 if (offset != 0) {
7089 /*
7090 * If this isn't the first piece, strip the header, and
7091 * add the offset to the end value.
7092 */
7093 mp->b_rptr += hdr_length;
7094 end += offset;
7095 }
7096
7097 /* Handle vnic loopback of fragments */
7098 if (mp->b_datap->db_ref > 2)
7099 msg_len = 0;
7100 else
7101 msg_len = MBLKSIZE(mp);
7102
7103 tail_mp = mp;
7104 while (tail_mp->b_cont != NULL) {
7105 tail_mp = tail_mp->b_cont;
7106 if (tail_mp->b_datap->db_ref <= 2)
7107 msg_len += MBLKSIZE(tail_mp);
7108 }
7109
7110 /* If the reassembly list for this ILL will get too big, prune it */
7111 if ((msg_len + sizeof (*ipf) + ill->ill_frag_count) >=
7112 ipst->ips_ip_reass_queue_bytes) {
7113 DTRACE_PROBE3(ip_reass_queue_bytes, uint_t, msg_len,
7114 uint_t, ill->ill_frag_count,
7115 uint_t, ipst->ips_ip_reass_queue_bytes);
7116 ill_frag_prune(ill,
7117 (ipst->ips_ip_reass_queue_bytes < msg_len) ? 0 :
7118 (ipst->ips_ip_reass_queue_bytes - msg_len));
7119 pruned = B_TRUE;
7120 }
7121
7122 ipfb = &ill->ill_frag_hash_tbl[ILL_FRAG_HASH(src, ident)];
7123 mutex_enter(&ipfb->ipfb_lock);
7124
7125 ipfp = &ipfb->ipfb_ipf;
7126 /* Try to find an existing fragment queue for this packet. */
7127 for (;;) {
7128 ipf = ipfp[0];
7129 if (ipf != NULL) {
7130 /*
7131 * It has to match on ident, src/dst address and protocol.
7132 */
7133 if (ipf->ipf_ident == ident &&
7134 ipf->ipf_src == src &&
7135 ipf->ipf_dst == dst &&
7136 ipf->ipf_protocol == proto) {
7137 /*
7138 * If we have received too many
7139 * duplicate fragments for this packet,
7140 * free it.
7141 */
7142 if (ipf->ipf_num_dups > ip_max_frag_dups) {
7143 ill_frag_free_pkts(ill, ipfb, ipf, 1);
7144 freemsg(mp);
7145 mutex_exit(&ipfb->ipfb_lock);
7146 return (NULL);
7147 }
7148 /* Found it. */
7149 break;
7150 }
7151 ipfp = &ipf->ipf_hash_next;
7152 continue;
7153 }
7154
7155 /*
7156 * If we pruned the list, do we want to store this new
7157 * fragment? We apply an optimization here based on the
7158 * fact that most fragments will be received in order.
7159 * So if the offset of this incoming fragment is zero,
7160 * it is the first fragment of a new packet. We will
7161 * keep it. Otherwise drop the fragment, as we have
7162 * probably pruned the packet already (since the
7163 * packet cannot be found).
7164 */
7165 if (pruned && offset != 0) {
7166 mutex_exit(&ipfb->ipfb_lock);
7167 freemsg(mp);
7168 return (NULL);
7169 }
7170
7171 if (ipfb->ipfb_frag_pkts >= MAX_FRAG_PKTS(ipst)) {
7172 /*
7173 * Too many fragmented packets in this hash
7174 * bucket. Free the oldest.
7175 */
7176 ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf, 1);
7177 }
7178
7179 /* New guy. Allocate a frag message. */
7180 mp1 = allocb(sizeof (*ipf), BPRI_MED);
7181 if (mp1 == NULL) {
7182 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
7183 ip_drop_input("ipIfStatsInDiscards", mp, ill);
7184 freemsg(mp);
7185 reass_done:
7186 mutex_exit(&ipfb->ipfb_lock);
7187 return (NULL);
7188 }
7189
7190 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmReqds);
7191 mp1->b_cont = mp;
7192
7193 /* Initialize the fragment header. */
7194 ipf = (ipf_t *)mp1->b_rptr;
7195 ipf->ipf_mp = mp1;
7196 ipf->ipf_ptphn = ipfp;
7197 ipfp[0] = ipf;
7198 ipf->ipf_hash_next = NULL;
7199 ipf->ipf_ident = ident;
7200 ipf->ipf_protocol = proto;
7201 ipf->ipf_src = src;
7202 ipf->ipf_dst = dst;
7203 ipf->ipf_nf_hdr_len = 0;
7204 /* Record reassembly start time. */
7205 ipf->ipf_timestamp = gethrestime_sec();
7206 /* Record ipf generation and account for frag header */
7207 ipf->ipf_gen = ill->ill_ipf_gen++;
7208 ipf->ipf_count = MBLKSIZE(mp1);
7209 ipf->ipf_last_frag_seen = B_FALSE;
7210 ipf->ipf_ecn = ecn_info;
7211 ipf->ipf_num_dups = 0;
7212 ipfb->ipfb_frag_pkts++;
7213 ipf->ipf_checksum = 0;
7214 ipf->ipf_checksum_flags = 0;
7215
7216 /* Store checksum value in fragment header */
7217 if (sum_flags != 0) {
7218 sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
7219 sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
7220 ipf->ipf_checksum = sum_val;
7221 ipf->ipf_checksum_flags = sum_flags;
7222 }
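/*
 * Folding sketch: a 32-bit partial sum such as 0x2001f003 folds as
 *
 *	0xf003 + 0x2001 = 0x00011004	(first fold)
 *	0x1004 + 0x0001 = 0x00001005	(second fold)
 *
 * Two folds suffice because the first fold can carry at most one bit
 * into the upper 16 bits.
 */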
7223
7224 /*
7225 * We handle reassembly two ways. In the easy case,
7226 * where all the fragments show up in order, we do
7227 * minimal bookkeeping, and just clip new pieces on
7228 * the end. If we ever see a hole, then we go off
7229 * to ip_reassemble which has to mark the pieces and
7230 * keep track of the number of holes, etc. Obviously,
7231 * the point of having both mechanisms is so we can
7232 * handle the easy case as efficiently as possible.
7233 */
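/*
 * Easy-case invariant (sketch): with in-order arrival, ipf_end always
 * holds the offset at which the next fragment must start, so the fast
 * path for an existing ipf reduces to
 *
 *	if (offset && ipf->ipf_end == offset)
 *		append mp at ipf->ipf_tail_mp;
 *
 * (see below); anything else goes through ip_reassemble().
 */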
7234 if (offset == 0) {
7235 /* Easy case, in-order reassembly so far. */
7236 ipf->ipf_count += msg_len;
7237 ipf->ipf_tail_mp = tail_mp;
7238 /*
7239 * Keep track of next expected offset in
7240 * ipf_end.
7241 */
7242 ipf->ipf_end = end;
7243 ipf->ipf_nf_hdr_len = hdr_length;
7244 } else {
7245 /* Hard case, hole at the beginning. */
7246 ipf->ipf_tail_mp = NULL;
7247 /*
7248 * ipf_end == 0 means that we have given up
7249 * on easy reassembly.
7250 */
7251 ipf->ipf_end = 0;
7252
7253 /* Forget checksum offload from now on */
7254 ipf->ipf_checksum_flags = 0;
7255
7256 /*
7257 * ipf_hole_cnt is set by ip_reassemble.
7258 * ipf_count is updated by ip_reassemble.
7259 * No need to check the return value here,
7260 * as we don't expect reassembly to complete
7261 * or fail for the first fragment itself.
7262 */
7263 (void) ip_reassemble(mp, ipf,
7264 (frag_offset_flags & IPH_OFFSET) << 3,
7265 (frag_offset_flags & IPH_MF), ill, msg_len);
7266 }
7267 /* Update per ipfb and ill byte counts */
7268 ipfb->ipfb_count += ipf->ipf_count;
7269 ASSERT(ipfb->ipfb_count > 0); /* Wraparound */
7270 atomic_add_32(&ill->ill_frag_count, ipf->ipf_count);
7271 /* If the frag timer wasn't already going, start it. */
7272 mutex_enter(&ill->ill_lock);
7273 ill_frag_timer_start(ill);
7274 mutex_exit(&ill->ill_lock);
7275 goto reass_done;
7276 }
7277
7278 /*
7279 * If the packet's checksum flags have changed (it could be
7280 * coming up from an interface different from the previous
7281 * one, with possibly different checksum capability), then
7282 * forget about any stored checksum state. Otherwise add the
7283 * value to the existing one stored in the fragment header.
7284 */
7285 if (sum_flags != 0 && sum_flags == ipf->ipf_checksum_flags) {
7286 sum_val += ipf->ipf_checksum;
7287 sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
7288 sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
7289 ipf->ipf_checksum = sum_val;
7290 } else if (ipf->ipf_checksum_flags != 0) {
7291 /* Forget checksum offload from now on */
7292 ipf->ipf_checksum_flags = 0;
7293 }
7294
7295 /*
7296 * We have a new piece of a datagram which is already being
7297 * reassembled. Update the ECN info if all IP fragments
7298 * are ECN capable. If there is one which is not, clear
7299 * all the info. If there is at least one which has CE
7300 * code point, IP needs to report that up to transport.
7301 */
7302 if (ecn_info != IPH_ECN_NECT && ipf->ipf_ecn != IPH_ECN_NECT) {
7303 if (ecn_info == IPH_ECN_CE)
7304 ipf->ipf_ecn = IPH_ECN_CE;
7305 } else {
7306 ipf->ipf_ecn = IPH_ECN_NECT;
7307 }
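/*
 * ECN merge summary for the code above (sketch; ECT stands for either
 * ECT codepoint):
 *
 *	stored \ new:	NECT	ECT	CE
 *	NECT		NECT	NECT	NECT
 *	ECT		NECT	ECT	CE
 *	CE		NECT	CE	CE
 *
 * Any non-ECN-capable fragment clears the info; CE is sticky as long
 * as every fragment is ECN capable.
 */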
7308 if (offset && ipf->ipf_end == offset) {
7309 /* The new fragment fits at the end */
7310 ipf->ipf_tail_mp->b_cont = mp;
7311 /* Update the byte count */
7312 ipf->ipf_count += msg_len;
7313 /* Update per ipfb and ill byte counts */
7314 ipfb->ipfb_count += msg_len;
7315 ASSERT(ipfb->ipfb_count > 0); /* Wraparound */
7316 atomic_add_32(&ill->ill_frag_count, msg_len);
7317 if (frag_offset_flags & IPH_MF) {
7318 /* More to come. */
7319 ipf->ipf_end = end;
7320 ipf->ipf_tail_mp = tail_mp;
7321 goto reass_done;
7322 }
7323 } else {
7324 /* Go do the hard cases. */
7325 int ret;
7326
7327 if (offset == 0)
7328 ipf->ipf_nf_hdr_len = hdr_length;
7329
7330 /* Save current byte count */
7331 count = ipf->ipf_count;
7332 ret = ip_reassemble(mp, ipf,
7333 (frag_offset_flags & IPH_OFFSET) << 3,
7334 (frag_offset_flags & IPH_MF), ill, msg_len);
7335 /* Count of bytes added and subtracted (freeb()ed) */
7336 count = ipf->ipf_count - count;
7337 if (count) {
7338 /* Update per ipfb and ill byte counts */
7339 ipfb->ipfb_count += count;
7340 ASSERT(ipfb->ipfb_count > 0); /* Wraparound */
7341 atomic_add_32(&ill->ill_frag_count, count);
7342 }
7343 if (ret == IP_REASS_PARTIAL) {
7344 goto reass_done;
7345 } else if (ret == IP_REASS_FAILED) {
7346 /* Reassembly failed. Free up all resources */
7347 ill_frag_free_pkts(ill, ipfb, ipf, 1);
7348 for (t_mp = mp; t_mp != NULL; t_mp = t_mp->b_cont) {
7349 IP_REASS_SET_START(t_mp, 0);
7350 IP_REASS_SET_END(t_mp, 0);
7351 }
7352 freemsg(mp);
7353 goto reass_done;
7354 }
7355 /* We will reach here iff 'ret' is IP_REASS_COMPLETE */
7356 }
7357 /*
7358 * We have completed reassembly. Unhook the frag header from
7359 * the reassembly list.
7360 *
7361 * Before we free the frag header, record the ECN info
7362 * to report back to the transport.
7363 */
7364 ecn_info = ipf->ipf_ecn;
7365 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmOKs);
7366 ipfp = ipf->ipf_ptphn;
7367
7368 /* We need to supply these to caller */
7369 if ((sum_flags = ipf->ipf_checksum_flags) != 0)
7370 sum_val = ipf->ipf_checksum;
7371 else
7372 sum_val = 0;
7373
7374 mp1 = ipf->ipf_mp;
7375 count = ipf->ipf_count;
7376 ipf = ipf->ipf_hash_next;
7377 if (ipf != NULL)
7378 ipf->ipf_ptphn = ipfp;
7379 ipfp[0] = ipf;
7380 atomic_add_32(&ill->ill_frag_count, -count);
7381 ASSERT(ipfb->ipfb_count >= count);
7382 ipfb->ipfb_count -= count;
7383 ipfb->ipfb_frag_pkts--;
7384 mutex_exit(&ipfb->ipfb_lock);
7385 /* Ditch the frag header. */
7386 mp = mp1->b_cont;
7387
7388 freeb(mp1);
7389
7390 /* Restore original IP length in header. */
7391 packet_size = (uint32_t)msgdsize(mp);
7392 if (packet_size > IP_MAXPACKET) {
7393 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
7394 ip_drop_input("Reassembled packet too large", mp, ill);
7395 freemsg(mp);
7396 return (NULL);
7397 }
7398
7399 if (DB_REF(mp) > 1) {
7400 mblk_t *mp2 = copymsg(mp);
7401
7402 if (mp2 == NULL) {
7403 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
7404 ip_drop_input("ipIfStatsInDiscards", mp, ill);
7405 freemsg(mp);
7406 return (NULL);
7407 }
7408 freemsg(mp);
7409 mp = mp2;
7410 }
7411 ipha = (ipha_t *)mp->b_rptr;
7412
7413 ipha->ipha_length = htons((uint16_t)packet_size);
7414 /* We're now complete, zip the frag state */
7415 ipha->ipha_fragment_offset_and_flags = 0;
7416 /* Record the ECN info. */
7417 ipha->ipha_type_of_service &= 0xFC;
7418 ipha->ipha_type_of_service |= ecn_info;
7419
7420 /* Update the receive attributes */
7421 ira->ira_pktlen = packet_size;
7422 ira->ira_ip_hdr_length = IPH_HDR_LENGTH(ipha);
7423
7424 /* Reassembly is successful; set checksum information in packet */
7425 DB_CKSUM16(mp) = (uint16_t)sum_val;
7426 DB_CKSUMFLAGS(mp) = sum_flags;
7427 DB_CKSUMSTART(mp) = ira->ira_ip_hdr_length;
7428
7429 return (mp);
7430 }
7431
7432 /*
7433 * Pullup function that should be used for IP input in order to
7434 * ensure we do not lose the L2 source address; we need the L2 source
7435 * address for IP_RECVSLLA and for ndp_input.
7436 *
7437 * We return either NULL or b_rptr.
7438 */
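/*
 * Typical use, mirroring the callers below (sketch):
 *
 *	if (ip_pullup(mp, min_size, ira) == NULL) {
 *		(count the drop and free mp)
 *	}
 */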
7439 void *
7440 ip_pullup(mblk_t *mp, ssize_t len, ip_recv_attr_t *ira)
7441 {
7442 ill_t *ill = ira->ira_ill;
7443
7444 if (ip_rput_pullups++ == 0) {
7445 (void) mi_strlog(ill->ill_rq, 1, SL_ERROR|SL_TRACE,
7446 "ip_pullup: %s forced us to "
7447 " pullup pkt, hdr len %ld, hdr addr %p",
7448 ill->ill_name, len, (void *)mp->b_rptr);
7449 }
7450 if (!(ira->ira_flags & IRAF_L2SRC_SET))
7451 ip_setl2src(mp, ira, ira->ira_rill);
7452 ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
7453 if (!pullupmsg(mp, len))
7454 return (NULL);
7455 else
7456 return (mp->b_rptr);
7457 }
7458
7459 /*
7460 * Make sure ira_l2src has an address. If we don't have one, fill with zeros.
7461 * When called from the ULP, ira_rill will be NULL, hence the caller has to
7462 * pass in the ill.
7463 */
7464 /* ARGSUSED */
7465 void
7466 ip_setl2src(mblk_t *mp, ip_recv_attr_t *ira, ill_t *ill)
7467 {
7468 const uchar_t *addr;
7469 int alen;
7470
7471 if (ira->ira_flags & IRAF_L2SRC_SET)
7472 return;
7473
7474 ASSERT(ill != NULL);
7475 alen = ill->ill_phys_addr_length;
7476 ASSERT(alen <= sizeof (ira->ira_l2src));
7477 if (ira->ira_mhip != NULL &&
7478 (addr = ira->ira_mhip->mhi_saddr) != NULL) {
7479 bcopy(addr, ira->ira_l2src, alen);
7480 } else if ((ira->ira_flags & IRAF_L2SRC_LOOPBACK) &&
7481 (addr = ill->ill_phys_addr) != NULL) {
7482 bcopy(addr, ira->ira_l2src, alen);
7483 } else {
7484 bzero(ira->ira_l2src, alen);
7485 }
7486 ira->ira_flags |= IRAF_L2SRC_SET;
7487 }
7488
7489 /*
7490 * Check the IP header length and align the header.
7491 */
7492 mblk_t *
7493 ip_check_and_align_header(mblk_t *mp, uint_t min_size, ip_recv_attr_t *ira)
7494 {
7495 ill_t *ill = ira->ira_ill;
7496 ssize_t len;
7497
7498 len = MBLKL(mp);
7499
7500 if (!OK_32PTR(mp->b_rptr))
7501 IP_STAT(ill->ill_ipst, ip_notaligned);
7502 else
7503 IP_STAT(ill->ill_ipst, ip_recv_pullup);
7504
7505 /* Guard against bogus device drivers */
7506 if (len < 0) {
7507 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
7508 ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
7509 freemsg(mp);
7510 return (NULL);
7511 }
7512
7513 if (len == 0) {
7514 /* GLD sometimes sends up mblk with b_rptr == b_wptr! */
7515 mblk_t *mp1 = mp->b_cont;
7516
7517 if (!(ira->ira_flags & IRAF_L2SRC_SET))
7518 ip_setl2src(mp, ira, ira->ira_rill);
7519 ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
7520
7521 freeb(mp);
7522 mp = mp1;
7523 if (mp == NULL)
7524 return (NULL);
7525
7526 if (OK_32PTR(mp->b_rptr) && MBLKL(mp) >= min_size)
7527 return (mp);
7528 }
7529 if (ip_pullup(mp, min_size, ira) == NULL) {
7530 if (msgdsize(mp) < min_size) {
7531 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
7532 ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
7533 } else {
7534 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
7535 ip_drop_input("ipIfStatsInDiscards", mp, ill);
7536 }
7537 freemsg(mp);
7538 return (NULL);
7539 }
7540 return (mp);
7541 }
7542
7543 /*
7544 * Common code for IPv4 and IPv6 to check and pullup multi-mblks
7545 */
7546 mblk_t *
7547 ip_check_length(mblk_t *mp, uchar_t *rptr, ssize_t len, uint_t pkt_len,
7548 uint_t min_size, ip_recv_attr_t *ira)
7549 {
7550 ill_t *ill = ira->ira_ill;
7551
7552 /*
7553 * Make sure we have data length consistent
7554 * with the IP header.
7555 */
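/*
 * Example (a sketch, not taken from a real trace): Ethernet pads
 * frames to a 46-byte minimum payload, so a 40-byte TCP/IPv4 segment
 * arrives with 6 bytes of trailing pad. pkt_len (from ipha_length) is
 * authoritative, and the pad is dropped below either by pulling
 * b_wptr back or via adjmsg().
 */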
7556 if (mp->b_cont == NULL) {
7557 /* pkt_len is based on ipha_len, not the mblk length */
7558 if (pkt_len < min_size) {
7559 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
7560 ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
7561 freemsg(mp);
7562 return (NULL);
7563 }
7564 if (len < 0) {
7565 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
7566 ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
7567 freemsg(mp);
7568 return (NULL);
7569 }
7570 /* Drop any pad */
7571 mp->b_wptr = rptr + pkt_len;
7572 } else if ((len += msgdsize(mp->b_cont)) != 0) {
7573 ASSERT(pkt_len >= min_size);
7574 if (pkt_len < min_size) {
7575 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
7576 ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
7577 freemsg(mp);
7578 return (NULL);
7579 }
7580 if (len < 0) {
7581 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
7582 ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
7583 freemsg(mp);
7584 return (NULL);
7585 }
7586 /* Drop any pad */
7587 (void) adjmsg(mp, -len);
7588 /*
7589 * adjmsg may have freed an mblk from the chain, hence
7590 * invalidate any hw checksum here. This will force IP to
7591 * calculate the checksum in sw, but only for this packet.
7592 */
7593 DB_CKSUMFLAGS(mp) = 0;
7594 IP_STAT(ill->ill_ipst, ip_multimblk);
7595 }
7596 return (mp);
7597 }
7598
7599 /*
7600 * Check that the IPv4 opt_len is consistent with the packet and pullup
7601 * the options.
7602 */
7603 mblk_t *
7604 ip_check_optlen(mblk_t *mp, ipha_t *ipha, uint_t opt_len, uint_t pkt_len,
7605 ip_recv_attr_t *ira)
7606 {
7607 ill_t *ill = ira->ira_ill;
7608 ssize_t len;
7609
7610 /* Assume no IPv6 packets arrive over the IPv4 queue */
7611 if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) {
7612 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
7613 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInWrongIPVersion);
7614 ip_drop_input("IPvN packet on IPv4 ill", mp, ill);
7615 freemsg(mp);
7616 return (NULL);
7617 }
7618
7619 if (opt_len > (15 - IP_SIMPLE_HDR_LENGTH_IN_WORDS)) {
7620 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
7621 ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
7622 freemsg(mp);
7623 return (NULL);
7624 }
7625 /*
7626 * Recompute complete header length and make sure we
7627 * have access to all of it.
7628 */
7629 len = ((size_t)opt_len + IP_SIMPLE_HDR_LENGTH_IN_WORDS) << 2;
7630 if (len > (mp->b_wptr - mp->b_rptr)) {
7631 if (len > pkt_len) {
7632 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
7633 ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
7634 freemsg(mp);
7635 return (NULL);
7636 }
7637 if (ip_pullup(mp, len, ira) == NULL) {
7638 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
7639 ip_drop_input("ipIfStatsInDiscards", mp, ill);
7640 freemsg(mp);
7641 return (NULL);
7642 }
7643 }
7644 return (mp);
7645 }
7646
7647 /*
7648 * Returns a new ire, or the same ire, or NULL.
7649 * If a different IRE is returned, then it is held; the caller
7650 * needs to release it.
7651 * In no case is there any hold/release on the ire argument.
7652 */
7653 ire_t *
7654 ip_check_multihome(void *addr, ire_t *ire, ill_t *ill)
7655 {
7656 ire_t *new_ire;
7657 ill_t *ire_ill;
7658 uint_t ifindex;
7659 ip_stack_t *ipst = ill->ill_ipst;
7660 boolean_t strict_check = B_FALSE;
7661
7662 /*
7663 * IPMP common case: if IRE and ILL are in the same group, there's no
7664 * issue (e.g. packet received on an underlying interface matched an
7665 * IRE_LOCAL on its associated group interface).
7666 */
7667 ASSERT(ire->ire_ill != NULL);
7668 if (IS_IN_SAME_ILLGRP(ill, ire->ire_ill))
7669 return (ire);
7670
7671 /*
7672 * Do another ire lookup here, using the ingress ill, to see if the
7673 * interface is in a usesrc group.
7674 * As long as the ills belong to the same group, we don't consider
7675 * them to be arriving on the wrong interface. Thus, if the switch
7676 * is doing inbound load spreading, we won't drop packets when the
7677 * ip*_strict_dst_multihoming switch is on.
7678 * We also need to check for IPIF_UNNUMBERED point2point interfaces
7679 * where the local address may not be unique. In this case we were
7680 * at the mercy of the initial ire lookup and the IRE_LOCAL it
7681 * actually returned. The new lookup, which is more specific, should
7682 * only find the IRE_LOCAL associated with the ingress ill if one
7683 * exists.
7684 */
7685 if (ire->ire_ipversion == IPV4_VERSION) {
7686 if (ipst->ips_ip_strict_dst_multihoming)
7687 strict_check = B_TRUE;
7688 new_ire = ire_ftable_lookup_v4(*((ipaddr_t *)addr), 0, 0,
7689 IRE_LOCAL, ill, ALL_ZONES, NULL,
7690 (MATCH_IRE_TYPE|MATCH_IRE_ILL), 0, ipst, NULL);
7691 } else {
7692 ASSERT(!IN6_IS_ADDR_MULTICAST((in6_addr_t *)addr));
7693 if (ipst->ips_ipv6_strict_dst_multihoming)
7694 strict_check = B_TRUE;
7695 new_ire = ire_ftable_lookup_v6((in6_addr_t *)addr, NULL, NULL,
7696 IRE_LOCAL, ill, ALL_ZONES, NULL,
7697 (MATCH_IRE_TYPE|MATCH_IRE_ILL), 0, ipst, NULL);
7698 }
7699 /*
7700 * If the same ire that was returned in ip_input() is found then this
7701 * is an indication that usesrc groups are in use. The packet
7702 * arrived on a different ill in the group than the one associated with
7703 * the destination address. If a different ire was found then the same
7704 * IP address must be hosted on multiple ills. This is possible with
7705 * unnumbered point2point interfaces. We switch to use this new ire in
7706 * order to have accurate interface statistics.
7707 */
7708 if (new_ire != NULL) {
7709 /* A different IRE is returned held; the caller must ire_refrele it */
7710 if (new_ire != ire)
7711 return (new_ire);
7712 /* Unchanged */
7713 ire_refrele(new_ire);
7714 return (ire);
7715 }
7716
7717 /*
7718 * Chase pointers once and store locally.
7719 */
7720 ASSERT(ire->ire_ill != NULL);
7721 ire_ill = ire->ire_ill;
7722 ifindex = ill->ill_usesrc_ifindex;
7723
7724 /*
7725 * Check if it's a legal address on the 'usesrc' interface.
7726 * For IPMP data addresses the IRE_LOCAL is the upper, hence we
7727 * can just check phyint_ifindex.
7728 */
7729 if (ifindex != 0 && ifindex == ire_ill->ill_phyint->phyint_ifindex) {
7730 return (ire);
7731 }
7732
7733 /*
7734 * If the ip*_strict_dst_multihoming switch is on then we can
7735 * only accept this packet if the interface is marked as routing.
7736 */
7737 if (!(strict_check))
7738 return (ire);
7739
7740 if ((ill->ill_flags & ire->ire_ill->ill_flags & ILLF_ROUTER) != 0) {
7741 return (ire);
7742 }
7743 return (NULL);
7744 }
7745
7746 /*
7747 * This function is used to construct a mac_header_info_s from a
7748 * DL_UNITDATA_IND message.
7749 * The address fields in the mhi structure point into the message,
7750 * thus the caller can't use those fields after freeing the message.
7751 *
7752 * We determine whether the packet received is a non-unicast packet
7753 * and in doing so, determine whether or not it is broadcast vs multicast.
7754 * For it to be a broadcast packet, we must have the appropriate mblk_t
7755 * hanging off the ill_t. If this is either not present or doesn't match
7756 * the destination mac address in the DL_UNITDATA_IND, the packet is deemed
7757 * to be multicast. Thus on NICs that have no broadcast address (or no
7758 * capability for one, such as point-to-point links) a packet can never
7759 * be classified as broadcast.
7760 */
7761 void
7762 ip_dlur_to_mhi(ill_t *ill, mblk_t *mb, struct mac_header_info_s *mhip)
7763 {
7764 dl_unitdata_ind_t *ind = (dl_unitdata_ind_t *)mb->b_rptr;
7765 mblk_t *bmp;
7766 uint_t extra_offset;
7767
7768 bzero(mhip, sizeof (struct mac_header_info_s));
7769
7770 mhip->mhi_dsttype = MAC_ADDRTYPE_UNICAST;
7771
7772 if (ill->ill_sap_length < 0)
7773 extra_offset = 0;
7774 else
7775 extra_offset = ill->ill_sap_length;
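/*
 * Sketch of the DLPI address layouts handled above: a negative
 * ill_sap_length means the SAP trails the physical address
 * (<addr><sap>), so the address starts right at the offset; a
 * positive value means the SAP leads (<sap><addr>) and must be
 * skipped over.
 */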
7776
7777 mhip->mhi_daddr = (uchar_t *)ind + ind->dl_dest_addr_offset +
7778 extra_offset;
7779 mhip->mhi_saddr = (uchar_t *)ind + ind->dl_src_addr_offset +
7780 extra_offset;
7781
7782 if (!ind->dl_group_address)
7783 return;
7784
7785 /* Multicast or broadcast */
7786 mhip->mhi_dsttype = MAC_ADDRTYPE_MULTICAST;
7787
7788 if (ind->dl_dest_addr_offset > sizeof (*ind) &&
7789 ind->dl_dest_addr_offset + ind->dl_dest_addr_length < MBLKL(mb) &&
7790 (bmp = ill->ill_bcast_mp) != NULL) {
7791 dl_unitdata_req_t *dlur;
7792 uint8_t *bphys_addr;
7793
7794 dlur = (dl_unitdata_req_t *)bmp->b_rptr;
7795 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset +
7796 extra_offset;
7797
7798 if (bcmp(mhip->mhi_daddr, bphys_addr,
7799 ind->dl_dest_addr_length) == 0)
7800 mhip->mhi_dsttype = MAC_ADDRTYPE_BROADCAST;
7801 }
7802 }
7803
7804 /*
7805 * This function is used to construct a mac_header_info_s from an
7806 * M_DATA fastpath message from a DLPI driver.
7807 * The address fields in the mhi structure point into the message,
7808 * thus the caller can't use those fields after freeing the message.
7809 *
7810 * We determine whether the packet received is a non-unicast packet
7811 * and in doing so, determine whether or not it is broadcast vs multicast.
7812 * For it to be a broadcast packet, we must have the appropriate mblk_t
7813 * hanging off the ill_t. If this is either not present or doesn't match
7814 * the destination mac address in the frame, the packet is deemed
7815 * to be multicast. Thus on NICs that have no broadcast address (or no
7816 * capability for one, such as point-to-point links) a packet can never
7817 * be classified as broadcast.
7818 */
7819 void
7820 ip_mdata_to_mhi(ill_t *ill, mblk_t *mp, struct mac_header_info_s *mhip)
7821 {
7822 mblk_t *bmp;
7823 struct ether_header *pether;
7824
7825 bzero(mhip, sizeof (struct mac_header_info_s));
7826
7827 mhip->mhi_dsttype = MAC_ADDRTYPE_UNICAST;
7828
7829 pether = (struct ether_header *)((char *)mp->b_rptr
7830 - sizeof (struct ether_header));
7831
7832 /*
7833 * Make sure the interface is an ethernet type, since we don't
7834 * know the header format for anything but Ethernet. Also make
7835 * sure we are pointing correctly above db_base.
7836 */
7837 if (ill->ill_type != IFT_ETHER)
7838 return;
7839
7840 retry:
7841 if ((uchar_t *)pether < mp->b_datap->db_base)
7842 return;
7843
7844 /* VLAN tag? Each 802.1Q tag adds 4 bytes; back up and retry. */
7845 if (ill->ill_isv6) {
7846 if (pether->ether_type != htons(ETHERTYPE_IPV6)) {
7847 pether = (struct ether_header *)((char *)pether - 4);
7848 goto retry;
7849 }
7850 } else {
7851 if (pether->ether_type != htons(ETHERTYPE_IP)) {
7852 pether = (struct ether_header *)((char *)pether - 4);
7853 goto retry;
7854 }
7855 }
7856 mhip->mhi_daddr = (uchar_t *)&pether->ether_dhost;
7857 mhip->mhi_saddr = (uchar_t *)&pether->ether_shost;
7858
7859 if (!(mhip->mhi_daddr[0] & 0x01))
7860 return;
7861
7862 /* Multicast or broadcast */
7863 mhip->mhi_dsttype = MAC_ADDRTYPE_MULTICAST;
7864
7865 if ((bmp = ill->ill_bcast_mp) != NULL) {
7866 dl_unitdata_req_t *dlur;
7867 uint8_t *bphys_addr;
7868 uint_t addrlen;
7869
7870 dlur = (dl_unitdata_req_t *)bmp->b_rptr;
7871 addrlen = dlur->dl_dest_addr_length;
7872 if (ill->ill_sap_length < 0) {
7873 bphys_addr = (uchar_t *)dlur +
7874 dlur->dl_dest_addr_offset;
7875 addrlen += ill->ill_sap_length;
7876 } else {
7877 bphys_addr = (uchar_t *)dlur +
7878 dlur->dl_dest_addr_offset +
7879 ill->ill_sap_length;
7880 addrlen -= ill->ill_sap_length;
7881 }
7882 if (bcmp(mhip->mhi_daddr, bphys_addr, addrlen) == 0)
7883 mhip->mhi_dsttype = MAC_ADDRTYPE_BROADCAST;
7884 }
7885 }
7886
7887 /*
7888 * Handle anything but M_DATA messages.
7889 * We see DL_UNITDATA_IND messages, which are part
7890 * of the data path, as well as other messages from the driver.
7891 */
7892 void
7893 ip_rput_notdata(ill_t *ill, mblk_t *mp)
7894 {
7895 mblk_t *first_mp;
7896 struct iocblk *iocp;
7897 struct mac_header_info_s mhi;
7898
7899 switch (DB_TYPE(mp)) {
7900 case M_PROTO:
7901 case M_PCPROTO: {
7902 if (((dl_unitdata_ind_t *)mp->b_rptr)->dl_primitive !=
7903 DL_UNITDATA_IND) {
7904 /* Go handle anything other than data elsewhere. */
7905 ip_rput_dlpi(ill, mp);
7906 return;
7907 }
7908
7909 first_mp = mp;
7910 mp = first_mp->b_cont;
7911 first_mp->b_cont = NULL;
7912
7913 if (mp == NULL) {
7914 freeb(first_mp);
7915 return;
7916 }
7917 ip_dlur_to_mhi(ill, first_mp, &mhi);
7918 if (ill->ill_isv6)
7919 ip_input_v6(ill, NULL, mp, &mhi);
7920 else
7921 ip_input(ill, NULL, mp, &mhi);
7922
7923 /* Ditch the DLPI header. */
7924 freeb(first_mp);
7925 return;
7926 }
7927 case M_IOCACK:
7928 iocp = (struct iocblk *)mp->b_rptr;
7929 switch (iocp->ioc_cmd) {
7930 case DL_IOC_HDR_INFO:
7931 ill_fastpath_ack(ill, mp);
7932 return;
7933 default:
7934 putnext(ill->ill_rq, mp);
7935 return;
7936 }
7937 /* FALLTHRU */
7938 case M_ERROR:
7939 case M_HANGUP:
7940 mutex_enter(&ill->ill_lock);
7941 if (ill->ill_state_flags & ILL_CONDEMNED) {
7942 mutex_exit(&ill->ill_lock);
7943 freemsg(mp);
7944 return;
7945 }
7946 ill_refhold_locked(ill);
7947 mutex_exit(&ill->ill_lock);
7948 qwriter_ip(ill, ill->ill_rq, mp, ip_rput_other, CUR_OP,
7949 B_FALSE);
7950 return;
7951 case M_CTL:
7952 putnext(ill->ill_rq, mp);
7953 return;
7954 case M_IOCNAK:
7955 ip1dbg(("got iocnak "));
7956 iocp = (struct iocblk *)mp->b_rptr;
7957 switch (iocp->ioc_cmd) {
7958 case DL_IOC_HDR_INFO:
7959 ip_rput_other(NULL, ill->ill_rq, mp, NULL);
7960 return;
7961 default:
7962 break;
7963 }
7964 /* FALLTHRU */
7965 default:
7966 putnext(ill->ill_rq, mp);
7967 return;
7968 }
7969 }
7970
7971 /* Read side put procedure. Packets coming from the wire arrive here. */
7972 void
7973 ip_rput(queue_t *q, mblk_t *mp)
7974 {
7975 ill_t *ill;
7976 union DL_primitives *dl;
7977
7978 ill = (ill_t *)q->q_ptr;
7979
7980 if (ill->ill_state_flags & (ILL_CONDEMNED | ILL_LL_SUBNET_PENDING)) {
7981 /*
7982 * If things are opening or closing, only accept high-priority
7983 * DLPI messages. (On open ill->ill_ipif has not yet been
7984 * created; on close, things hanging off the ill may have been
7985 * freed already.)
7986 */
7987 dl = (union DL_primitives *)mp->b_rptr;
7988 if (DB_TYPE(mp) != M_PCPROTO ||
7989 dl->dl_primitive == DL_UNITDATA_IND) {
7990 inet_freemsg(mp);
7991 return;
7992 }
7993 }
7994 if (DB_TYPE(mp) == M_DATA) {
7995 struct mac_header_info_s mhi;
7996
7997 ip_mdata_to_mhi(ill, mp, &mhi);
7998 ip_input(ill, NULL, mp, &mhi);
7999 } else {
8000 ip_rput_notdata(ill, mp);
8001 }
8002 }
8003
8004 /*
8005 * Move the information to a copy.
8006 */
8007 mblk_t *
8008 ip_fix_dbref(mblk_t *mp, ip_recv_attr_t *ira)
8009 {
8010 mblk_t *mp1;
8011 ill_t *ill = ira->ira_ill;
8012 ip_stack_t *ipst = ill->ill_ipst;
8013
8014 IP_STAT(ipst, ip_db_ref);
8015
8016 /* Make sure we have ira_l2src before we lose the original mblk */
8017 if (!(ira->ira_flags & IRAF_L2SRC_SET))
8018 ip_setl2src(mp, ira, ira->ira_rill);
8019
8020 mp1 = copymsg(mp);
8021 if (mp1 == NULL) {
8022 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
8023 ip_drop_input("ipIfStatsInDiscards", mp, ill);
8024 freemsg(mp);
8025 return (NULL);
8026 }
8027 /* preserve the hardware checksum flags and data, if present */
8028 if (DB_CKSUMFLAGS(mp) != 0) {
8029 DB_CKSUMFLAGS(mp1) = DB_CKSUMFLAGS(mp);
8030 DB_CKSUMSTART(mp1) = DB_CKSUMSTART(mp);
8031 DB_CKSUMSTUFF(mp1) = DB_CKSUMSTUFF(mp);
8032 DB_CKSUMEND(mp1) = DB_CKSUMEND(mp);
8033 DB_CKSUM16(mp1) = DB_CKSUM16(mp);
8034 }
8035 freemsg(mp);
8036 return (mp1);
8037 }
8038
8039 static void
8040 ip_dlpi_error(ill_t *ill, t_uscalar_t prim, t_uscalar_t dl_err,
8041 t_uscalar_t err)
8042 {
8043 if (dl_err == DL_SYSERR) {
8044 (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE,
8045 "%s: %s failed: DL_SYSERR (errno %u)\n",
8046 ill->ill_name, dl_primstr(prim), err);
8047 return;
8048 }
8049
8050 (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE,
8051 "%s: %s failed: %s\n", ill->ill_name, dl_primstr(prim),
8052 dl_errstr(dl_err));
8053 }
8054
8055 /*
8056 * ip_rput_dlpi is called by ip_rput to handle all DLPI messages other
8057 * than DL_UNITDATA_IND messages. If we need to process this message
8058 * exclusively, we call qwriter_ip, in which case we also need to call
8059 * ill_refhold before that, since qwriter_ip does an ill_refrele.
8060 */
8061 void
8062 ip_rput_dlpi(ill_t *ill, mblk_t *mp)
8063 {
8064 dl_ok_ack_t *dloa = (dl_ok_ack_t *)mp->b_rptr;
8065 dl_error_ack_t *dlea = (dl_error_ack_t *)dloa;
8066 queue_t *q = ill->ill_rq;
8067 t_uscalar_t prim = dloa->dl_primitive;
8068 t_uscalar_t reqprim = DL_PRIM_INVAL;
8069
8070 DTRACE_PROBE3(ill__dlpi, char *, "ip_rput_dlpi",
8071 char *, dl_primstr(prim), ill_t *, ill);
8072 ip1dbg(("ip_rput_dlpi"));
8073
8074 /*
8075 * If we received an ACK but didn't send a request for it, then it
8076 * can't be part of any pending operation; discard up-front.
8077 */
8078 switch (prim) {
8079 case DL_ERROR_ACK:
8080 reqprim = dlea->dl_error_primitive;
8081 ip2dbg(("ip_rput_dlpi(%s): DL_ERROR_ACK for %s (0x%x): %s "
8082 "(0x%x), unix %u\n", ill->ill_name, dl_primstr(reqprim),
8083 reqprim, dl_errstr(dlea->dl_errno), dlea->dl_errno,
8084 dlea->dl_unix_errno));
8085 break;
8086 case DL_OK_ACK:
8087 reqprim = dloa->dl_correct_primitive;
8088 break;
8089 case DL_INFO_ACK:
8090 reqprim = DL_INFO_REQ;
8091 break;
8092 case DL_BIND_ACK:
8093 reqprim = DL_BIND_REQ;
8094 break;
8095 case DL_PHYS_ADDR_ACK:
8096 reqprim = DL_PHYS_ADDR_REQ;
8097 break;
8098 case DL_NOTIFY_ACK:
8099 reqprim = DL_NOTIFY_REQ;
8100 break;
8101 case DL_CAPABILITY_ACK:
8102 reqprim = DL_CAPABILITY_REQ;
8103 break;
8104 }
8105
8106 if (prim != DL_NOTIFY_IND) {
8107 if (reqprim == DL_PRIM_INVAL ||
8108 !ill_dlpi_pending(ill, reqprim)) {
8109 /* Not a DLPI message we support or expected */
8110 freemsg(mp);
8111 return;
8112 }
8113 ip1dbg(("ip_rput: received %s for %s\n", dl_primstr(prim),
8114 dl_primstr(reqprim)));
8115 }
8116
8117 switch (reqprim) {
8118 case DL_UNBIND_REQ:
8119 /*
8120 * NOTE: we mark the unbind as complete even if we got a
8121 * DL_ERROR_ACK, since there's not much else we can do.
8122 */
8123 mutex_enter(&ill->ill_lock);
8124 ill->ill_state_flags &= ~ILL_DL_UNBIND_IN_PROGRESS;
8125 cv_signal(&ill->ill_cv);
8126 mutex_exit(&ill->ill_lock);
8127 break;
8128
8129 case DL_ENABMULTI_REQ:
8130 if (prim == DL_OK_ACK) {
8131 if (ill->ill_dlpi_multicast_state == IDS_INPROGRESS)
8132 ill->ill_dlpi_multicast_state = IDS_OK;
8133 }
8134 break;
8135 }
8136
8137 /*
8138 * The message is one we're waiting for (or DL_NOTIFY_IND), but we
8139 * need to become writer to continue to process it. Because an
8140 * exclusive operation doesn't complete until replies to all queued
8141 * DLPI messages have been received, we know we're in the middle of an
8142 * exclusive operation and pass CUR_OP (except for DL_NOTIFY_IND).
8143 *
8144 * As required by qwriter_ip(), we refhold the ill; it will refrele.
8145 * Since this is on the ill stream we unconditionally bump up the
8146 * refcount without doing ILL_CAN_LOOKUP().
8147 */
8148 ill_refhold(ill);
8149 if (prim == DL_NOTIFY_IND)
8150 qwriter_ip(ill, q, mp, ip_rput_dlpi_writer, NEW_OP, B_FALSE);
8151 else
8152 qwriter_ip(ill, q, mp, ip_rput_dlpi_writer, CUR_OP, B_FALSE);
8153 }
8154
8155 /*
8156 * Handling of DLPI messages that require exclusive access to the ipsq.
8157 *
8158 * Need to do ipsq_pending_mp_get on ioctl completion, which could
8159 * happen here (along with mi_copy_done).
8160 */
8161 /* ARGSUSED */
8162 static void
8163 ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
8164 {
8165 dl_ok_ack_t *dloa = (dl_ok_ack_t *)mp->b_rptr;
8166 dl_error_ack_t *dlea = (dl_error_ack_t *)dloa;
8167 int err = 0;
8168 ill_t *ill = (ill_t *)q->q_ptr;
8169 ipif_t *ipif = NULL;
8170 mblk_t *mp1 = NULL;
8171 conn_t *connp = NULL;
8172 t_uscalar_t paddrreq;
8173 mblk_t *mp_hw;
8174 boolean_t success;
8175 boolean_t ioctl_aborted = B_FALSE;
8176 boolean_t log = B_TRUE;
8177
8178 DTRACE_PROBE3(ill__dlpi, char *, "ip_rput_dlpi_writer",
8179 char *, dl_primstr(dloa->dl_primitive), ill_t *, ill);
8180
8181 ip1dbg(("ip_rput_dlpi_writer .."));
8182 ASSERT(ipsq->ipsq_xop == ill->ill_phyint->phyint_ipsq->ipsq_xop);
8183 ASSERT(IAM_WRITER_ILL(ill));
8184
8185 ipif = ipsq->ipsq_xop->ipx_pending_ipif;
8186 /*
8187 * The current ioctl could have been aborted by the user and a new
8188 * ioctl to bring up another ill could have started. We could still
8189 * get a response from the driver later.
8190 */
8191 if (ipif != NULL && ipif->ipif_ill != ill)
8192 ioctl_aborted = B_TRUE;
8193
8194 switch (dloa->dl_primitive) {
8195 case DL_ERROR_ACK:
8196 ip1dbg(("ip_rput_dlpi_writer: got DL_ERROR_ACK for %s\n",
8197 dl_primstr(dlea->dl_error_primitive)));
8198
8199 DTRACE_PROBE3(ill__dlpi, char *, "ip_rput_dlpi_writer error",
8200 char *, dl_primstr(dlea->dl_error_primitive),
8201 ill_t *, ill);
8202
8203 switch (dlea->dl_error_primitive) {
8204 case DL_DISABMULTI_REQ:
8205 ill_dlpi_done(ill, dlea->dl_error_primitive);
8206 break;
8207 case DL_PROMISCON_REQ:
8208 case DL_PROMISCOFF_REQ:
8209 case DL_UNBIND_REQ:
8210 case DL_ATTACH_REQ:
8211 case DL_INFO_REQ:
8212 ill_dlpi_done(ill, dlea->dl_error_primitive);
8213 break;
8214 case DL_NOTIFY_REQ:
8215 ill_dlpi_done(ill, DL_NOTIFY_REQ);
8216 log = B_FALSE;
8217 break;
8218 case DL_PHYS_ADDR_REQ:
8219 /*
8220 * For IPv6 only, there are two additional
8221 * phys_addr_req's sent to the driver to get the
8222 * IPv6 token and lla. This allows IP to acquire
8223 * the hardware address format for a given interface
8224 * without having built in knowledge of the hardware
8225 * address. ill_phys_addr_pend keeps track of the last
8226 * DL_PAR sent so we know which response we are
8227 * dealing with. ill_dlpi_done will update
8228 * ill_phys_addr_pend when it sends the next req.
8229 * We don't complete the IOCTL until all three DL_PARs
8230 * have been attempted, so set *_len to 0 and break.
8231 */
8232 paddrreq = ill->ill_phys_addr_pend;
8233 ill_dlpi_done(ill, DL_PHYS_ADDR_REQ);
8234 if (paddrreq == DL_IPV6_TOKEN) {
8235 ill->ill_token_length = 0;
8236 log = B_FALSE;
8237 break;
8238 } else if (paddrreq == DL_IPV6_LINK_LAYER_ADDR) {
8239 ill->ill_nd_lla_len = 0;
8240 log = B_FALSE;
8241 break;
8242 }
8243 /*
8244 * Something went wrong with the DL_PHYS_ADDR_REQ.
8245 * We presumably have an IOCTL hanging out waiting
8246 * for completion. Find it and complete the IOCTL
8247 * with the error noted.
8248 * However, ill_dl_phys was called on an ill queue
8249 * (from SIOCSLIFNAME), thus conn_pending_ill is not
8250 * set. But the ioctl is known to be pending on ill_wq.
8251 */
8252 if (!ill->ill_ifname_pending)
8253 break;
8254 ill->ill_ifname_pending = 0;
8255 if (!ioctl_aborted)
8256 mp1 = ipsq_pending_mp_get(ipsq, &connp);
8257 if (mp1 != NULL) {
8258 /*
8259 * This operation (SIOCSLIFNAME) must have
8260 * happened on the ill. Assert there is no conn.
8261 */
8262 ASSERT(connp == NULL);
8263 q = ill->ill_wq;
8264 }
8265 break;
8266 case DL_BIND_REQ:
8267 ill_dlpi_done(ill, DL_BIND_REQ);
8268 if (ill->ill_ifname_pending)
8269 break;
8270 mutex_enter(&ill->ill_lock);
8271 ill->ill_state_flags &= ~ILL_DOWN_IN_PROGRESS;
8272 mutex_exit(&ill->ill_lock);
8273 /*
8274 * Something went wrong with the bind. We presumably
8275 * have an IOCTL hanging out waiting for completion.
8276 * Find it, take down the interface that was coming
8277 * up, and complete the IOCTL with the error noted.
8278 */
8279 if (!ioctl_aborted)
8280 mp1 = ipsq_pending_mp_get(ipsq, &connp);
8281 if (mp1 != NULL) {
8282 /*
8283 * This might be a result of a DL_NOTE_REPLUMB
8284 * notification. In that case, connp is NULL.
8285 */
8286 if (connp != NULL)
8287 q = CONNP_TO_WQ(connp);
8288
8289 (void) ipif_down(ipif, NULL, NULL);
8290 /* error is set below the switch */
8291 }
8292 break;
8293 case DL_ENABMULTI_REQ:
8294 ill_dlpi_done(ill, DL_ENABMULTI_REQ);
8295
8296 if (ill->ill_dlpi_multicast_state == IDS_INPROGRESS)
8297 ill->ill_dlpi_multicast_state = IDS_FAILED;
8298 if (ill->ill_dlpi_multicast_state == IDS_FAILED) {
8299
8300 printf("ip: joining multicasts failed (%d)"
8301 " on %s - will use link layer "
8302 "broadcasts for multicast\n",
8303 dlea->dl_errno, ill->ill_name);
8304
8305 /*
8306 * Set up for multi_bcast; we are the
8307 * writer, so ok to access ill->ill_ipif
8308 * without any lock.
8309 */
8310 mutex_enter(&ill->ill_phyint->phyint_lock);
8311 ill->ill_phyint->phyint_flags |=
8312 PHYI_MULTI_BCAST;
8313 mutex_exit(&ill->ill_phyint->phyint_lock);
8314
8315 }
8316 freemsg(mp); /* Don't want to pass this up */
8317 return;
8318 case DL_CAPABILITY_REQ:
8319 ip1dbg(("ip_rput_dlpi_writer: got DL_ERROR_ACK for "
8320 "DL_CAPABILITY REQ\n"));
8321 if (ill->ill_dlpi_capab_state == IDCS_PROBE_SENT)
8322 ill->ill_dlpi_capab_state = IDCS_FAILED;
8323 ill_capability_done(ill);
8324 freemsg(mp);
8325 return;
8326 }
8327 /*
8328 * Note the error for IOCTL completion (mp1 is set when
8329 * ready to complete ioctl). If ill_ifname_pending_err is
8330 * set, an error occurred during plumbing (ill_ifname_pending),
8331 * so we want to report that error.
8332 *
8333 * NOTE: there are two additional DL_PHYS_ADDR_REQ's
8334 * (DL_IPV6_TOKEN and DL_IPV6_LINK_LAYER_ADDR) that are
8335 * expected to get errack'd if the driver doesn't support
8336 * these flags (e.g. ethernet). log will be set to B_FALSE
8337 * if these error conditions are encountered.
8338 */
8339 if (mp1 != NULL) {
8340 if (ill->ill_ifname_pending_err != 0) {
8341 err = ill->ill_ifname_pending_err;
8342 ill->ill_ifname_pending_err = 0;
8343 } else {
8344 err = dlea->dl_unix_errno ?
8345 dlea->dl_unix_errno : ENXIO;
8346 }
8347 /*
8348 * If we're plumbing an interface and an error hasn't already
8349 * been saved, set ill_ifname_pending_err to the error passed
8350 * up. Ignore the error if log is B_FALSE (see comment above).
8351 */
8352 } else if (log && ill->ill_ifname_pending &&
8353 ill->ill_ifname_pending_err == 0) {
8354 ill->ill_ifname_pending_err = dlea->dl_unix_errno ?
8355 dlea->dl_unix_errno : ENXIO;
8356 }
8357
8358 if (log)
8359 ip_dlpi_error(ill, dlea->dl_error_primitive,
8360 dlea->dl_errno, dlea->dl_unix_errno);
8361 break;
8362 case DL_CAPABILITY_ACK:
8363 ill_capability_ack(ill, mp);
8364 /*
8365 * The message has been handed off to ill_capability_ack
8366 * and must not be freed below
8367 */
8368 mp = NULL;
8369 break;
8370
8371 case DL_INFO_ACK:
8372 /* Call a routine to handle this one. */
8373 ill_dlpi_done(ill, DL_INFO_REQ);
8374 ip_ll_subnet_defaults(ill, mp);
8375 ASSERT(!MUTEX_HELD(&ill->ill_phyint->phyint_ipsq->ipsq_lock));
8376 return;
8377 case DL_BIND_ACK:
8378 /*
8379 * We should have an IOCTL waiting on this unless
8380 * it was sent by ill_dl_phys, in which case just return.
8381 */
8382 ill_dlpi_done(ill, DL_BIND_REQ);
8383
8384 if (ill->ill_ifname_pending) {
8385 DTRACE_PROBE2(ip__rput__dlpi__ifname__pending,
8386 ill_t *, ill, mblk_t *, mp);
8387 break;
8388 }
8389 mutex_enter(&ill->ill_lock);
8390 ill->ill_dl_up = 1;
8391 ill->ill_state_flags &= ~ILL_DOWN_IN_PROGRESS;
8392 mutex_exit(&ill->ill_lock);
8393
8394 if (!ioctl_aborted)
8395 mp1 = ipsq_pending_mp_get(ipsq, &connp);
8396 if (mp1 == NULL) {
8397 DTRACE_PROBE1(ip__rput__dlpi__no__mblk, ill_t *, ill);
8398 break;
8399 }
8400 /*
8401 * mp1 was added by ill_dl_up(). If that is a result of
8402 * a DL_NOTE_REPLUMB notification, connp could be NULL.
8403 */
8404 if (connp != NULL)
8405 q = CONNP_TO_WQ(connp);
8406 /*
8407 * We are exclusive. So nothing can change even after
8408 * we get the pending mp.
8409 */
8410 ip1dbg(("ip_rput_dlpi: bind_ack %s\n", ill->ill_name));
8411 DTRACE_PROBE1(ip__rput__dlpi__bind__ack, ill_t *, ill);
8412 ill_nic_event_dispatch(ill, 0, NE_UP, NULL, 0);
8413
8414 /*
8415 * Now bring up the resolver; when that is complete, we'll
8416 * create IREs. Note that we intentionally mirror what
8417 * ipif_up() would have done, because we got here by way of
8418 * ill_dl_up(), which stopped ipif_up()'s processing.
8419 */
8420 if (ill->ill_isv6) {
8421 /*
8422 * v6 interfaces.
8423 * Unlike ARP, which has to do another bind
8424 * and attach, once we get here we are
8425 * done with NDP.
8426 */
8427 (void) ipif_resolver_up(ipif, Res_act_initial);
8428 if ((err = ipif_ndp_up(ipif, B_TRUE)) == 0)
8429 err = ipif_up_done_v6(ipif);
8430 } else if (ill->ill_net_type == IRE_IF_RESOLVER) {
8431 /*
8432 * ARP and other v4 external resolvers.
8433 * Leave the pending mblk intact so that
8434 * the ioctl completes in ip_rput().
8435 */
8436 if (connp != NULL)
8437 mutex_enter(&connp->conn_lock);
8438 mutex_enter(&ill->ill_lock);
8439 success = ipsq_pending_mp_add(connp, ipif, q, mp1, 0);
8440 mutex_exit(&ill->ill_lock);
8441 if (connp != NULL)
8442 mutex_exit(&connp->conn_lock);
8443 if (success) {
8444 err = ipif_resolver_up(ipif, Res_act_initial);
8445 if (err == EINPROGRESS) {
8446 freemsg(mp);
8447 return;
8448 }
8449 mp1 = ipsq_pending_mp_get(ipsq, &connp);
8450 } else {
8451 /* The conn has started closing */
8452 err = EINTR;
8453 }
8454 } else {
8455 /*
8456 * This one is complete. Reply to pending ioctl.
8457 */
8458 (void) ipif_resolver_up(ipif, Res_act_initial);
8459 err = ipif_up_done(ipif);
8460 }
8461
8462 if ((err == 0) && (ill->ill_up_ipifs)) {
8463 err = ill_up_ipifs(ill, q, mp1);
8464 if (err == EINPROGRESS) {
8465 freemsg(mp);
8466 return;
8467 }
8468 }
8469
8470 /*
8471 * If we have a moved ipif to bring up, and everything has
8472 * succeeded to this point, bring it up on the IPMP ill.
8473 * Otherwise, leave it down -- the admin can try to bring it
8474 * up by hand if need be.
8475 */
8476 if (ill->ill_move_ipif != NULL) {
8477 if (err != 0) {
8478 ill->ill_move_ipif = NULL;
8479 } else {
8480 ipif = ill->ill_move_ipif;
8481 ill->ill_move_ipif = NULL;
8482 err = ipif_up(ipif, q, mp1);
8483 if (err == EINPROGRESS) {
8484 freemsg(mp);
8485 return;
8486 }
8487 }
8488 }
8489 break;
8490
8491 case DL_NOTIFY_IND: {
8492 dl_notify_ind_t *notify = (dl_notify_ind_t *)mp->b_rptr;
8493 uint_t orig_mtu, orig_mc_mtu;
8494
8495 switch (notify->dl_notification) {
8496 case DL_NOTE_PHYS_ADDR:
8497 err = ill_set_phys_addr(ill, mp);
8498 break;
8499
8500 case DL_NOTE_REPLUMB:
8501 /*
8502 * Directly return after calling ill_replumb().
8503 * Note that we should not free mp as it is reused
8504 * in the ill_replumb() function.
8505 */
8506 err = ill_replumb(ill, mp);
8507 return;
8508
8509 case DL_NOTE_FASTPATH_FLUSH:
8510 nce_flush(ill, B_FALSE);
8511 break;
8512
8513 case DL_NOTE_SDU_SIZE:
8514 case DL_NOTE_SDU_SIZE2:
8515 /*
8516 * The dce and fragmentation code can cope with
8517 * this changing while packets are being sent.
8518 * When packets are sent ip_output will discover
8519 * a change.
8520 *
8521 * Change the MTU size of the interface.
8522 */
8523 mutex_enter(&ill->ill_lock);
8524 orig_mtu = ill->ill_mtu;
8525 orig_mc_mtu = ill->ill_mc_mtu;
8526 switch (notify->dl_notification) {
8527 case DL_NOTE_SDU_SIZE:
8528 ill->ill_current_frag =
8529 (uint_t)notify->dl_data;
8530 ill->ill_mc_mtu = (uint_t)notify->dl_data;
8531 break;
8532 case DL_NOTE_SDU_SIZE2:
8533 ill->ill_current_frag =
8534 (uint_t)notify->dl_data1;
8535 ill->ill_mc_mtu = (uint_t)notify->dl_data2;
8536 break;
8537 }
8538 if (ill->ill_current_frag > ill->ill_max_frag)
8539 ill->ill_max_frag = ill->ill_current_frag;
8540
8541 if (!(ill->ill_flags & ILLF_FIXEDMTU)) {
8542 ill->ill_mtu = ill->ill_current_frag;
8543
8544 /*
8545 * If ill_user_mtu was set (via
8546 * SIOCSLIFLNKINFO), clamp ill_mtu at it.
8547 */
8548 if (ill->ill_user_mtu != 0 &&
8549 ill->ill_user_mtu < ill->ill_mtu)
8550 ill->ill_mtu = ill->ill_user_mtu;
8551
8552 if (ill->ill_user_mtu != 0 &&
8553 ill->ill_user_mtu < ill->ill_mc_mtu)
8554 ill->ill_mc_mtu = ill->ill_user_mtu;
8555
8556 if (ill->ill_isv6) {
8557 if (ill->ill_mtu < IPV6_MIN_MTU)
8558 ill->ill_mtu = IPV6_MIN_MTU;
8559 if (ill->ill_mc_mtu < IPV6_MIN_MTU)
8560 ill->ill_mc_mtu = IPV6_MIN_MTU;
8561 } else {
8562 if (ill->ill_mtu < IP_MIN_MTU)
8563 ill->ill_mtu = IP_MIN_MTU;
8564 if (ill->ill_mc_mtu < IP_MIN_MTU)
8565 ill->ill_mc_mtu = IP_MIN_MTU;
8566 }
8567 } else if (ill->ill_mc_mtu > ill->ill_mtu) {
8568 ill->ill_mc_mtu = ill->ill_mtu;
8569 }
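/*
 * Net effect of the above (sketch): unless ILLF_FIXEDMTU is set,
 * ill_mtu tracks the driver's advertised SDU, capped by any
 * SIOCSLIFLNKINFO-configured ill_user_mtu and floored at the
 * protocol minimum (IP_MIN_MTU or IPV6_MIN_MTU); with a fixed
 * MTU, ill_mc_mtu is simply clamped to ill_mtu.
 */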
8570
8571 mutex_exit(&ill->ill_lock);
8572 /*
8573 * Make sure all dce_generation checks find out
8574 * that ill_mtu/ill_mc_mtu has changed.
8575 */
8576 if (orig_mtu != ill->ill_mtu ||
8577 orig_mc_mtu != ill->ill_mc_mtu) {
8578 dce_increment_all_generations(ill->ill_isv6,
8579 ill->ill_ipst);
8580 }
8581
8582 /*
8583 * Refresh IPMP meta-interface MTU if necessary.
8584 */
8585 if (IS_UNDER_IPMP(ill))
8586 ipmp_illgrp_refresh_mtu(ill->ill_grp);
8587 break;
8588
8589 case DL_NOTE_LINK_UP:
8590 case DL_NOTE_LINK_DOWN: {
8591 /*
8592 * We are writer. ill / phyint / ipsq assocs stable.
8593 * The RUNNING flag reflects the state of the link.
8594 */
8595 phyint_t *phyint = ill->ill_phyint;
8596 uint64_t new_phyint_flags;
8597 boolean_t changed = B_FALSE;
8598 boolean_t went_up;
8599
8600 went_up = notify->dl_notification == DL_NOTE_LINK_UP;
8601 mutex_enter(&phyint->phyint_lock);
8602
8603 new_phyint_flags = went_up ?
8604 phyint->phyint_flags | PHYI_RUNNING :
8605 phyint->phyint_flags & ~PHYI_RUNNING;
8606
8607 if (IS_IPMP(ill)) {
8608 new_phyint_flags = went_up ?
8609 new_phyint_flags & ~PHYI_FAILED :
8610 new_phyint_flags | PHYI_FAILED;
8611 }
8612
8613 if (new_phyint_flags != phyint->phyint_flags) {
8614 phyint->phyint_flags = new_phyint_flags;
8615 changed = B_TRUE;
8616 }
8617 mutex_exit(&phyint->phyint_lock);
8618 /*
8619 * ill_restart_dad handles the DAD restart and routing
8620 * socket notification logic.
8621 */
8622 if (changed) {
8623 ill_restart_dad(phyint->phyint_illv4, went_up);
8624 ill_restart_dad(phyint->phyint_illv6, went_up);
8625 }
8626 break;
8627 }
8628 case DL_NOTE_PROMISC_ON_PHYS: {
8629 phyint_t *phyint = ill->ill_phyint;
8630
8631 mutex_enter(&phyint->phyint_lock);
8632 phyint->phyint_flags |= PHYI_PROMISC;
8633 mutex_exit(&phyint->phyint_lock);
8634 break;
8635 }
8636 case DL_NOTE_PROMISC_OFF_PHYS: {
8637 phyint_t *phyint = ill->ill_phyint;
8638
8639 mutex_enter(&phyint->phyint_lock);
8640 phyint->phyint_flags &= ~PHYI_PROMISC;
8641 mutex_exit(&phyint->phyint_lock);
8642 break;
8643 }
8644 case DL_NOTE_CAPAB_RENEG:
8645 /*
8646 * Something changed on the driver side.
8647 * It wants us to renegotiate the capabilities
8648 * on this ill. One possible cause is the aggregation
8649 * interface under us where a port got added or
8650 * went away.
8651 *
8652 * If the capability negotiation is already done
8653 * or is in progress, reset the capabilities and
8654 * mark the ill's ill_capab_reneg to be B_TRUE,
8655 * so that when the ack comes back, we can start
8656 * the renegotiation process.
8657 *
8658 * Note that if ill_capab_reneg is already B_TRUE
8659 * (ill_dlpi_capab_state is IDS_UNKNOWN in this case),
8660 * the capability resetting request has been sent
8661 * and the renegotiation has not been started yet;
8662 * nothing needs to be done in this case.
8663 */
8664 ipsq_current_start(ipsq, ill->ill_ipif, 0);
8665 ill_capability_reset(ill, B_TRUE);
8666 ipsq_current_finish(ipsq);
8667 break;
8668
8669 case DL_NOTE_ALLOWED_IPS:
8670 ill_set_allowed_ips(ill, mp);
8671 break;
8672 default:
8673 ip0dbg(("ip_rput_dlpi_writer: unknown notification "
8674 "type 0x%x for DL_NOTIFY_IND\n",
8675 notify->dl_notification));
8676 break;
8677 }
8678
8679 /*
8680 * As this is an asynchronous operation, we
8681 * should not call ill_dlpi_done
8682 */
8683 break;
8684 }
8685 case DL_NOTIFY_ACK: {
8686 dl_notify_ack_t *noteack = (dl_notify_ack_t *)mp->b_rptr;
8687
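		/*
		 * Remember that the driver will tell us when the link
		 * state changes (DL_NOTE_LINK_UP/DL_NOTE_LINK_DOWN).
		 */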
8688 if (noteack->dl_notifications & DL_NOTE_LINK_UP)
8689 ill->ill_note_link = 1;
8690 ill_dlpi_done(ill, DL_NOTIFY_REQ);
8691 break;
8692 }
8693 case DL_PHYS_ADDR_ACK: {
8694 /*
8695 * As part of plumbing the interface via SIOCSLIFNAME,
8696 * ill_dl_phys() will queue a series of DL_PHYS_ADDR_REQs,
8697 * whose answers we receive here. As each answer is received,
8698 * we call ill_dlpi_done() to dispatch the next request as
8699 * we're processing the current one. Once all answers have
8700 * been received, we use ipsq_pending_mp_get() to dequeue the
8701 * outstanding IOCTL and reply to it. (Because ill_dl_phys()
8702 * is invoked from an ill queue, conn_oper_pending_ill is not
8703 * available, but we know the ioctl is pending on ill_wq.)
8704 */
8705 uint_t paddrlen, paddroff;
8706 uint8_t *addr;
8707
8708 paddrreq = ill->ill_phys_addr_pend;
8709 paddrlen = ((dl_phys_addr_ack_t *)mp->b_rptr)->dl_addr_length;
8710 paddroff = ((dl_phys_addr_ack_t *)mp->b_rptr)->dl_addr_offset;
8711 addr = mp->b_rptr + paddroff;
8712
8713 ill_dlpi_done(ill, DL_PHYS_ADDR_REQ);
8714 if (paddrreq == DL_IPV6_TOKEN) {
8715 /*
8716 * bcopy to low-order bits of ill_token
8717 *
8718 * XXX Temporary hack - currently, all known tokens
8719 * are 64 bits, so I'll cheat for the moment.
8720 */
8721 bcopy(addr, &ill->ill_token.s6_addr32[2], paddrlen);
8722 ill->ill_token_length = paddrlen;
8723 break;
8724 } else if (paddrreq == DL_IPV6_LINK_LAYER_ADDR) {
8725 ASSERT(ill->ill_nd_lla_mp == NULL);
8726 ill_set_ndmp(ill, mp, paddroff, paddrlen);
8727 mp = NULL;
8728 break;
8729 } else if (paddrreq == DL_CURR_DEST_ADDR) {
8730 ASSERT(ill->ill_dest_addr_mp == NULL);
8731 ill->ill_dest_addr_mp = mp;
8732 ill->ill_dest_addr = addr;
8733 mp = NULL;
8734 if (ill->ill_isv6) {
8735 ill_setdesttoken(ill);
8736 ipif_setdestlinklocal(ill->ill_ipif);
8737 }
8738 break;
8739 }
8740
8741 ASSERT(paddrreq == DL_CURR_PHYS_ADDR);
8742 ASSERT(ill->ill_phys_addr_mp == NULL);
8743 if (!ill->ill_ifname_pending)
8744 break;
8745 ill->ill_ifname_pending = 0;
8746 if (!ioctl_aborted)
8747 mp1 = ipsq_pending_mp_get(ipsq, &connp);
8748 if (mp1 != NULL) {
8749 ASSERT(connp == NULL);
8750 q = ill->ill_wq;
8751 }
8752 /*
		 * If any error acks were received during the plumbing
		 * sequence,
8754 * ill_ifname_pending_err will be set. Break out and send up
8755 * the error to the pending ioctl.
8756 */
8757 if (ill->ill_ifname_pending_err != 0) {
8758 err = ill->ill_ifname_pending_err;
8759 ill->ill_ifname_pending_err = 0;
8760 break;
8761 }
8762
8763 ill->ill_phys_addr_mp = mp;
8764 ill->ill_phys_addr = (paddrlen == 0 ? NULL : addr);
8765 mp = NULL;
8766
8767 /*
8768 * If paddrlen or ill_phys_addr_length is zero, the DLPI
8769 * provider doesn't support physical addresses. We check both
8770 * paddrlen and ill_phys_addr_length because sppp (PPP) does
		 * not have physical addresses, yet historically advertises a
		 * physical address length of 0 in its DL_INFO_ACK and 6 in
8773 * its DL_PHYS_ADDR_ACK.
8774 */
8775 if (paddrlen == 0 || ill->ill_phys_addr_length == 0) {
8776 ill->ill_phys_addr = NULL;
8777 } else if (paddrlen != ill->ill_phys_addr_length) {
8778 ip0dbg(("DL_PHYS_ADDR_ACK: got addrlen %d, expected %d",
8779 paddrlen, ill->ill_phys_addr_length));
8780 err = EINVAL;
8781 break;
8782 }
8783
8784 if (ill->ill_nd_lla_mp == NULL) {
8785 if ((mp_hw = copyb(ill->ill_phys_addr_mp)) == NULL) {
8786 err = ENOMEM;
8787 break;
8788 }
8789 ill_set_ndmp(ill, mp_hw, paddroff, paddrlen);
8790 }
8791
8792 if (ill->ill_isv6) {
8793 ill_setdefaulttoken(ill);
8794 ipif_setlinklocal(ill->ill_ipif);
8795 }
8796 break;
8797 }
8798 case DL_OK_ACK:
8799 ip2dbg(("DL_OK_ACK %s (0x%x)\n",
8800 dl_primstr((int)dloa->dl_correct_primitive),
8801 dloa->dl_correct_primitive));
8802 DTRACE_PROBE3(ill__dlpi, char *, "ip_rput_dlpi_writer ok",
8803 char *, dl_primstr(dloa->dl_correct_primitive),
8804 ill_t *, ill);
8805
		switch (dloa->dl_correct_primitive) {
		case DL_ENABMULTI_REQ:
		case DL_DISABMULTI_REQ:
		case DL_PROMISCON_REQ:
		case DL_PROMISCOFF_REQ:
		case DL_UNBIND_REQ:
		case DL_ATTACH_REQ:
			ill_dlpi_done(ill, dloa->dl_correct_primitive);
			break;
		}
8818 break;
8819 default:
8820 break;
8821 }
8822
8823 freemsg(mp);
8824 if (mp1 == NULL)
8825 return;
8826
8827 /*
8828 * The operation must complete without EINPROGRESS since
8829 * ipsq_pending_mp_get() has removed the mblk (mp1). Otherwise,
8830 * the operation will be stuck forever inside the IPSQ.
8831 */
8832 ASSERT(err != EINPROGRESS);
8833
8834 DTRACE_PROBE4(ipif__ioctl, char *, "ip_rput_dlpi_writer finish",
8835 int, ipsq->ipsq_xop->ipx_current_ioctl, ill_t *, ill,
8836 ipif_t *, NULL);
8837
8838 switch (ipsq->ipsq_xop->ipx_current_ioctl) {
8839 case 0:
8840 ipsq_current_finish(ipsq);
8841 break;
8842
8843 case SIOCSLIFNAME:
8844 case IF_UNITSEL: {
8845 ill_t *ill_other = ILL_OTHER(ill);
8846
8847 /*
8848 * If SIOCSLIFNAME or IF_UNITSEL is about to succeed, and the
8849 * ill has a peer which is in an IPMP group, then place ill
8850 * into the same group. One catch: although ifconfig plumbs
8851 * the appropriate IPMP meta-interface prior to plumbing this
8852 * ill, it is possible for multiple ifconfig applications to
8853 * race (or for another application to adjust plumbing), in
8854 * which case the IPMP meta-interface we need will be missing.
8855 * If so, kick the phyint out of the group.
8856 */
8857 if (err == 0 && ill_other != NULL && IS_UNDER_IPMP(ill_other)) {
8858 ipmp_grp_t *grp = ill->ill_phyint->phyint_grp;
8859 ipmp_illgrp_t *illg;
8860
8861 illg = ill->ill_isv6 ? grp->gr_v6 : grp->gr_v4;
8862 if (illg == NULL)
8863 ipmp_phyint_leave_grp(ill->ill_phyint);
8864 else
8865 ipmp_ill_join_illgrp(ill, illg);
8866 }
8867
8868 if (ipsq->ipsq_xop->ipx_current_ioctl == IF_UNITSEL)
8869 ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq);
8870 else
8871 ip_ioctl_finish(q, mp1, err, COPYOUT, ipsq);
8872 break;
8873 }
8874 case SIOCLIFADDIF:
8875 ip_ioctl_finish(q, mp1, err, COPYOUT, ipsq);
8876 break;
8877
8878 default:
8879 ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq);
8880 break;
8881 }
8882 }
8883
8884 /*
8885 * ip_rput_other is called by ip_rput to handle messages modifying the global
8886 * state in IP. If 'ipsq' is non-NULL, caller is writer on it.
8887 */
8888 /* ARGSUSED */
8889 void
8890 ip_rput_other(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
8891 {
8892 ill_t *ill = q->q_ptr;
8893 struct iocblk *iocp;
8894
8895 ip1dbg(("ip_rput_other "));
8896 if (ipsq != NULL) {
8897 ASSERT(IAM_WRITER_IPSQ(ipsq));
8898 ASSERT(ipsq->ipsq_xop ==
8899 ill->ill_phyint->phyint_ipsq->ipsq_xop);
8900 }
8901
8902 switch (mp->b_datap->db_type) {
8903 case M_ERROR:
8904 case M_HANGUP:
8905 /*
8906 * The device has a problem. We force the ILL down. It can
8907 * be brought up again manually using SIOCSIFFLAGS (via
8908 * ifconfig or equivalent).
8909 */
8910 ASSERT(ipsq != NULL);
8911 if (mp->b_rptr < mp->b_wptr)
8912 ill->ill_error = (int)(*mp->b_rptr & 0xFF);
8913 if (ill->ill_error == 0)
8914 ill->ill_error = ENXIO;
8915 if (!ill_down_start(q, mp))
8916 return;
8917 ipif_all_down_tail(ipsq, q, mp, NULL);
8918 break;
8919 case M_IOCNAK: {
8920 iocp = (struct iocblk *)mp->b_rptr;
8921
8922 ASSERT(iocp->ioc_cmd == DL_IOC_HDR_INFO);
8923 /*
8924 * If this was the first attempt, turn off the fastpath
8925 * probing.
8926 */
8927 mutex_enter(&ill->ill_lock);
8928 if (ill->ill_dlpi_fastpath_state == IDS_INPROGRESS) {
8929 ill->ill_dlpi_fastpath_state = IDS_FAILED;
8930 mutex_exit(&ill->ill_lock);
8931 /*
8932 * don't flush the nce_t entries: we use them
8933 * as an index to the ncec itself.
8934 */
8935 ip1dbg(("ip_rput: DLPI fastpath off on interface %s\n",
8936 ill->ill_name));
8937 } else {
8938 mutex_exit(&ill->ill_lock);
8939 }
8940 freemsg(mp);
8941 break;
8942 }
8943 default:
8944 ASSERT(0);
8945 break;
8946 }
8947 }
8948
8949 /*
 * Update any source route, record route, or timestamp options.
 * When it fails, it has consumed the message and BUMPed the MIB.
8952 */
8953 boolean_t
8954 ip_forward_options(mblk_t *mp, ipha_t *ipha, ill_t *dst_ill,
8955 ip_recv_attr_t *ira)
8956 {
8957 ipoptp_t opts;
8958 uchar_t *opt;
8959 uint8_t optval;
8960 uint8_t optlen;
8961 ipaddr_t dst;
8962 ipaddr_t ifaddr;
8963 uint32_t ts;
8964 timestruc_t now;
8965 ip_stack_t *ipst = ira->ira_ill->ill_ipst;
8966
8967 ip2dbg(("ip_forward_options\n"));
8968 dst = ipha->ipha_dst;
8969 for (optval = ipoptp_first(&opts, ipha);
8970 optval != IPOPT_EOL;
8971 optval = ipoptp_next(&opts)) {
8972 ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
8973 opt = opts.ipoptp_cur;
8974 optlen = opts.ipoptp_len;
8975 ip2dbg(("ip_forward_options: opt %d, len %d\n",
8976 optval, opts.ipoptp_len));
8977 switch (optval) {
8978 uint32_t off;
8979 case IPOPT_SSRR:
8980 case IPOPT_LSRR:
			/* Check if administratively disabled */
8982 if (!ipst->ips_ip_forward_src_routed) {
8983 BUMP_MIB(dst_ill->ill_ip_mib,
8984 ipIfStatsForwProhibits);
8985 ip_drop_input("ICMP_SOURCE_ROUTE_FAILED",
8986 mp, dst_ill);
8987 icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED,
8988 ira);
8989 return (B_FALSE);
8990 }
8991 if (ip_type_v4(dst, ipst) != IRE_LOCAL) {
8992 /*
8993 * Must be partial since ip_input_options
8994 * checked for strict.
8995 */
8996 break;
8997 }
8998 off = opt[IPOPT_OFFSET];
8999 off--;
9000 redo_srr:
9001 if (optlen < IP_ADDR_LEN ||
9002 off > optlen - IP_ADDR_LEN) {
9003 /* End of source route */
9004 ip1dbg((
9005 "ip_forward_options: end of SR\n"));
9006 break;
9007 }
9008 /* Pick a reasonable address on the outbound if */
9009 ASSERT(dst_ill != NULL);
9010 if (ip_select_source_v4(dst_ill, INADDR_ANY, dst,
9011 INADDR_ANY, ALL_ZONES, ipst, &ifaddr, NULL,
9012 NULL) != 0) {
9013 /* No source! Shouldn't happen */
9014 ifaddr = INADDR_ANY;
9015 }
9016 bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
9017 bcopy(&ifaddr, (char *)opt + off, IP_ADDR_LEN);
9018 ip1dbg(("ip_forward_options: next hop 0x%x\n",
9019 ntohl(dst)));
9020
9021 /*
9022 * Check if our address is present more than
9023 * once as consecutive hops in source route.
9024 */
9025 if (ip_type_v4(dst, ipst) == IRE_LOCAL) {
9026 off += IP_ADDR_LEN;
9027 opt[IPOPT_OFFSET] += IP_ADDR_LEN;
9028 goto redo_srr;
9029 }
9030 ipha->ipha_dst = dst;
9031 opt[IPOPT_OFFSET] += IP_ADDR_LEN;
9032 break;
9033 case IPOPT_RR:
9034 off = opt[IPOPT_OFFSET];
9035 off--;
9036 if (optlen < IP_ADDR_LEN ||
9037 off > optlen - IP_ADDR_LEN) {
9038 /* No more room - ignore */
9039 ip1dbg((
9040 "ip_forward_options: end of RR\n"));
9041 break;
9042 }
9043 /* Pick a reasonable address on the outbound if */
9044 ASSERT(dst_ill != NULL);
9045 if (ip_select_source_v4(dst_ill, INADDR_ANY, dst,
9046 INADDR_ANY, ALL_ZONES, ipst, &ifaddr, NULL,
9047 NULL) != 0) {
9048 /* No source! Shouldn't happen */
9049 ifaddr = INADDR_ANY;
9050 }
9051 bcopy(&ifaddr, (char *)opt + off, IP_ADDR_LEN);
9052 opt[IPOPT_OFFSET] += IP_ADDR_LEN;
9053 break;
9054 case IPOPT_TS:
9055 /* Insert timestamp if there is room */
9056 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
9057 case IPOPT_TS_TSONLY:
9058 off = IPOPT_TS_TIMELEN;
9059 break;
9060 case IPOPT_TS_PRESPEC:
9061 case IPOPT_TS_PRESPEC_RFC791:
9062 /* Verify that the address matched */
9063 off = opt[IPOPT_OFFSET] - 1;
9064 bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
9065 if (ip_type_v4(dst, ipst) != IRE_LOCAL) {
9066 /* Not for us */
9067 break;
9068 }
9069 /* FALLTHRU */
9070 case IPOPT_TS_TSANDADDR:
9071 off = IP_ADDR_LEN + IPOPT_TS_TIMELEN;
9072 break;
9073 default:
9074 /*
9075 * ip_*put_options should have already
9076 * dropped this packet.
9077 */
9078 cmn_err(CE_PANIC, "ip_forward_options: "
9079 "unknown IT - bug in ip_input_options?\n");
9080 return (B_TRUE); /* Keep "lint" happy */
9081 }
9082 if (opt[IPOPT_OFFSET] - 1 + off > optlen) {
9083 /* Increase overflow counter */
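				/*
				 * The 4-bit overflow count is kept in the
				 * high nibble of the flags octet: extract
				 * it, add one, and merge it back in above
				 * the 4 flag bits.
				 */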
9084 off = (opt[IPOPT_POS_OV_FLG] >> 4) + 1;
9085 opt[IPOPT_POS_OV_FLG] =
9086 (uint8_t)((opt[IPOPT_POS_OV_FLG] & 0x0F) |
9087 (off << 4));
9088 break;
9089 }
9090 off = opt[IPOPT_OFFSET] - 1;
9091 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
9092 case IPOPT_TS_PRESPEC:
9093 case IPOPT_TS_PRESPEC_RFC791:
9094 case IPOPT_TS_TSANDADDR:
9095 /* Pick a reasonable addr on the outbound if */
9096 ASSERT(dst_ill != NULL);
9097 if (ip_select_source_v4(dst_ill, INADDR_ANY,
9098 dst, INADDR_ANY, ALL_ZONES, ipst, &ifaddr,
9099 NULL, NULL) != 0) {
9100 /* No source! Shouldn't happen */
9101 ifaddr = INADDR_ANY;
9102 }
9103 bcopy(&ifaddr, (char *)opt + off, IP_ADDR_LEN);
9104 opt[IPOPT_OFFSET] += IP_ADDR_LEN;
9105 /* FALLTHRU */
9106 case IPOPT_TS_TSONLY:
9107 off = opt[IPOPT_OFFSET] - 1;
9108 /* Compute # of milliseconds since midnight */
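				/*
				 * For example, 01:00:00.500 UT yields
				 * 3600 * 1000 + 500 = 3600500.  RFC 791
				 * defines the timestamp as milliseconds
				 * since midnight UT, hence the reduction
				 * of tv_sec modulo 24 * 60 * 60.
				 */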
9109 gethrestime(&now);
9110 ts = (now.tv_sec % (24 * 60 * 60)) * 1000 +
9111 now.tv_nsec / (NANOSEC / MILLISEC);
9112 bcopy(&ts, (char *)opt + off, IPOPT_TS_TIMELEN);
9113 opt[IPOPT_OFFSET] += IPOPT_TS_TIMELEN;
9114 break;
9115 }
9116 break;
9117 }
9118 }
9119 return (B_TRUE);
9120 }
9121
9122 /*
9123 * Call ill_frag_timeout to do garbage collection. ill_frag_timeout
9124 * returns 'true' if there are still fragments left on the queue, in
9125 * which case we restart the timer.
9126 */
9127 void
9128 ill_frag_timer(void *arg)
9129 {
9130 ill_t *ill = (ill_t *)arg;
9131 boolean_t frag_pending;
9132 ip_stack_t *ipst = ill->ill_ipst;
9133 time_t timeout;
9134
9135 mutex_enter(&ill->ill_lock);
9136 ASSERT(!ill->ill_fragtimer_executing);
9137 if (ill->ill_state_flags & ILL_CONDEMNED) {
9138 ill->ill_frag_timer_id = 0;
9139 mutex_exit(&ill->ill_lock);
9140 return;
9141 }
9142 ill->ill_fragtimer_executing = 1;
9143 mutex_exit(&ill->ill_lock);
9144
9145 timeout = (ill->ill_isv6 ? ipst->ips_ipv6_reassembly_timeout :
9146 ipst->ips_ip_reassembly_timeout);
9147
9148 frag_pending = ill_frag_timeout(ill, timeout);
9149
9150 /*
	 * Restart the timer if we have fragments pending or if someone
9152 * wanted us to be scheduled again.
9153 */
9154 mutex_enter(&ill->ill_lock);
9155 ill->ill_fragtimer_executing = 0;
9156 ill->ill_frag_timer_id = 0;
9157 if (frag_pending || ill->ill_fragtimer_needrestart)
9158 ill_frag_timer_start(ill);
9159 mutex_exit(&ill->ill_lock);
9160 }
9161
9162 void
9163 ill_frag_timer_start(ill_t *ill)
9164 {
9165 ip_stack_t *ipst = ill->ill_ipst;
9166 clock_t timeo_ms;
9167
9168 ASSERT(MUTEX_HELD(&ill->ill_lock));
9169
9170 /* If the ill is closing or opening don't proceed */
9171 if (ill->ill_state_flags & ILL_CONDEMNED)
9172 return;
9173
9174 if (ill->ill_fragtimer_executing) {
9175 /*
		 * ill_frag_timer is currently executing. Just record the
		 * fact that we want the timer to be restarted.
9178 * ill_frag_timer will post a timeout before it returns,
9179 * ensuring it will be called again.
9180 */
9181 ill->ill_fragtimer_needrestart = 1;
9182 return;
9183 }
9184
9185 if (ill->ill_frag_timer_id == 0) {
9186 timeo_ms = (ill->ill_isv6 ? ipst->ips_ipv6_reassembly_timeout :
9187 ipst->ips_ip_reassembly_timeout) * SECONDS;
9188
9189 /*
9190 * The timer is neither running nor is the timeout handler
9191 * executing. Post a timeout so that ill_frag_timer will be
		 * called.
9193 */
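		/*
		 * Note that the timer fires at half the configured
		 * reassembly timeout, bounding how long an already
		 * expired fragment can linger before being reclaimed.
		 */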
9194 ill->ill_frag_timer_id = timeout(ill_frag_timer, ill,
9195 MSEC_TO_TICK(timeo_ms >> 1));
9196 ill->ill_fragtimer_needrestart = 0;
9197 }
9198 }
9199
9200 /*
9201 * Update any source route, record route or timestamp options.
9202 * Check that we are at end of strict source route.
9203 * The options have already been checked for sanity in ip_input_options().
9204 */
9205 boolean_t
9206 ip_input_local_options(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
9207 {
9208 ipoptp_t opts;
9209 uchar_t *opt;
9210 uint8_t optval;
9211 uint8_t optlen;
9212 ipaddr_t dst;
9213 ipaddr_t ifaddr;
9214 uint32_t ts;
9215 timestruc_t now;
9216 ill_t *ill = ira->ira_ill;
9217 ip_stack_t *ipst = ill->ill_ipst;
9218
9219 ip2dbg(("ip_input_local_options\n"));
9220
9221 for (optval = ipoptp_first(&opts, ipha);
9222 optval != IPOPT_EOL;
9223 optval = ipoptp_next(&opts)) {
9224 ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
9225 opt = opts.ipoptp_cur;
9226 optlen = opts.ipoptp_len;
9227 ip2dbg(("ip_input_local_options: opt %d, len %d\n",
9228 optval, optlen));
9229 switch (optval) {
9230 uint32_t off;
9231 case IPOPT_SSRR:
9232 case IPOPT_LSRR:
9233 off = opt[IPOPT_OFFSET];
9234 off--;
9235 if (optlen < IP_ADDR_LEN ||
9236 off > optlen - IP_ADDR_LEN) {
9237 /* End of source route */
9238 ip1dbg(("ip_input_local_options: end of SR\n"));
9239 break;
9240 }
9241 /*
9242 * This will only happen if two consecutive entries
			 * in the source route contain our address, or if
			 * it is a packet with a loose source route which
			 * reaches us before consuming the whole source route.
			 */
9247 ip1dbg(("ip_input_local_options: not end of SR\n"));
9248 if (optval == IPOPT_SSRR) {
9249 goto bad_src_route;
9250 }
9251 /*
			 * Hack: instead of dropping the packet, truncate the
9253 * source route to what has been used by filling the
9254 * rest with IPOPT_NOP.
9255 */
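			/*
			 * The bytes beyond the used portion become
			 * standalone IPOPT_NOP options, so the total
			 * header length is unchanged.
			 */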
9256 opt[IPOPT_OLEN] = (uint8_t)off;
9257 while (off < optlen) {
9258 opt[off++] = IPOPT_NOP;
9259 }
9260 break;
9261 case IPOPT_RR:
9262 off = opt[IPOPT_OFFSET];
9263 off--;
9264 if (optlen < IP_ADDR_LEN ||
9265 off > optlen - IP_ADDR_LEN) {
9266 /* No more room - ignore */
9267 ip1dbg((
9268 "ip_input_local_options: end of RR\n"));
9269 break;
9270 }
9271 /* Pick a reasonable address on the outbound if */
9272 if (ip_select_source_v4(ill, INADDR_ANY, ipha->ipha_dst,
9273 INADDR_ANY, ALL_ZONES, ipst, &ifaddr, NULL,
9274 NULL) != 0) {
9275 /* No source! Shouldn't happen */
9276 ifaddr = INADDR_ANY;
9277 }
9278 bcopy(&ifaddr, (char *)opt + off, IP_ADDR_LEN);
9279 opt[IPOPT_OFFSET] += IP_ADDR_LEN;
9280 break;
9281 case IPOPT_TS:
			/* Insert timestamp if there is room */
9283 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
9284 case IPOPT_TS_TSONLY:
9285 off = IPOPT_TS_TIMELEN;
9286 break;
9287 case IPOPT_TS_PRESPEC:
9288 case IPOPT_TS_PRESPEC_RFC791:
9289 /* Verify that the address matched */
9290 off = opt[IPOPT_OFFSET] - 1;
9291 bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
9292 if (ip_type_v4(dst, ipst) != IRE_LOCAL) {
9293 /* Not for us */
9294 break;
9295 }
9296 /* FALLTHRU */
9297 case IPOPT_TS_TSANDADDR:
9298 off = IP_ADDR_LEN + IPOPT_TS_TIMELEN;
9299 break;
9300 default:
9301 /*
9302 * ip_*put_options should have already
9303 * dropped this packet.
9304 */
9305 cmn_err(CE_PANIC, "ip_input_local_options: "
9306 "unknown IT - bug in ip_input_options?\n");
9307 return (B_TRUE); /* Keep "lint" happy */
9308 }
9309 if (opt[IPOPT_OFFSET] - 1 + off > optlen) {
9310 /* Increase overflow counter */
9311 off = (opt[IPOPT_POS_OV_FLG] >> 4) + 1;
9312 opt[IPOPT_POS_OV_FLG] =
9313 (uint8_t)((opt[IPOPT_POS_OV_FLG] & 0x0F) |
9314 (off << 4));
9315 break;
9316 }
9317 off = opt[IPOPT_OFFSET] - 1;
9318 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
9319 case IPOPT_TS_PRESPEC:
9320 case IPOPT_TS_PRESPEC_RFC791:
9321 case IPOPT_TS_TSANDADDR:
9322 /* Pick a reasonable addr on the outbound if */
9323 if (ip_select_source_v4(ill, INADDR_ANY,
9324 ipha->ipha_dst, INADDR_ANY, ALL_ZONES, ipst,
9325 &ifaddr, NULL, NULL) != 0) {
9326 /* No source! Shouldn't happen */
9327 ifaddr = INADDR_ANY;
9328 }
9329 bcopy(&ifaddr, (char *)opt + off, IP_ADDR_LEN);
9330 opt[IPOPT_OFFSET] += IP_ADDR_LEN;
9331 /* FALLTHRU */
9332 case IPOPT_TS_TSONLY:
9333 off = opt[IPOPT_OFFSET] - 1;
9334 /* Compute # of milliseconds since midnight */
9335 gethrestime(&now);
9336 ts = (now.tv_sec % (24 * 60 * 60)) * 1000 +
9337 now.tv_nsec / (NANOSEC / MILLISEC);
9338 bcopy(&ts, (char *)opt + off, IPOPT_TS_TIMELEN);
9339 opt[IPOPT_OFFSET] += IPOPT_TS_TIMELEN;
9340 break;
9341 }
9342 break;
9343 }
9344 }
9345 return (B_TRUE);
9346
9347 bad_src_route:
9348 /* make sure we clear any indication of a hardware checksum */
9349 DB_CKSUMFLAGS(mp) = 0;
9350 ip_drop_input("ICMP_SOURCE_ROUTE_FAILED", mp, ill);
9351 icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, ira);
9352 return (B_FALSE);
9353
9354 }
9355
9356 /*
9357 * Process IP options in an inbound packet. Always returns the nexthop.
 * Normally this is the passed-in nexthop, but if there is an option
 * that affects the nexthop (such as a source route), that one is returned.
 * Sets *errorp if there is an error, in which case an ICMP error has been
 * sent and mp freed.
9362 */
9363 ipaddr_t
9364 ip_input_options(ipha_t *ipha, ipaddr_t dst, mblk_t *mp,
9365 ip_recv_attr_t *ira, int *errorp)
9366 {
9367 ip_stack_t *ipst = ira->ira_ill->ill_ipst;
9368 ipoptp_t opts;
9369 uchar_t *opt;
9370 uint8_t optval;
9371 uint8_t optlen;
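	/*
	 * 'code' is the byte offset into the IP header of the offending
	 * octet; it becomes the pointer field of any ICMP parameter
	 * problem message sent from param_prob below.
	 */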
9372 intptr_t code = 0;
9373 ire_t *ire;
9374
9375 ip2dbg(("ip_input_options\n"));
9376 *errorp = 0;
9377 for (optval = ipoptp_first(&opts, ipha);
9378 optval != IPOPT_EOL;
9379 optval = ipoptp_next(&opts)) {
9380 opt = opts.ipoptp_cur;
9381 optlen = opts.ipoptp_len;
9382 ip2dbg(("ip_input_options: opt %d, len %d\n",
9383 optval, optlen));
9384 /*
9385 * Note: we need to verify the checksum before we
		 * modify anything; thus this routine only extracts the
		 * next-hop dst from any source route.
9388 */
9389 switch (optval) {
9390 uint32_t off;
9391 case IPOPT_SSRR:
9392 case IPOPT_LSRR:
9393 if (ip_type_v4(dst, ipst) != IRE_LOCAL) {
9394 if (optval == IPOPT_SSRR) {
9395 ip1dbg(("ip_input_options: not next"
9396 " strict source route 0x%x\n",
9397 ntohl(dst)));
9398 code = (char *)&ipha->ipha_dst -
9399 (char *)ipha;
9400 goto param_prob; /* RouterReq's */
9401 }
9402 ip2dbg(("ip_input_options: "
9403 "not next source route 0x%x\n",
9404 ntohl(dst)));
9405 break;
9406 }
9407
9408 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
9409 ip1dbg((
9410 "ip_input_options: bad option offset\n"));
9411 code = (char *)&opt[IPOPT_OLEN] -
9412 (char *)ipha;
9413 goto param_prob;
9414 }
9415 off = opt[IPOPT_OFFSET];
9416 off--;
9417 redo_srr:
9418 if (optlen < IP_ADDR_LEN ||
9419 off > optlen - IP_ADDR_LEN) {
9420 /* End of source route */
9421 ip1dbg(("ip_input_options: end of SR\n"));
9422 break;
9423 }
9424 bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
9425 ip1dbg(("ip_input_options: next hop 0x%x\n",
9426 ntohl(dst)));
9427
9428 /*
9429 * Check if our address is present more than
9430 * once as consecutive hops in source route.
9431 * XXX verify per-interface ip_forwarding
9432 * for source route?
9433 */
9434 if (ip_type_v4(dst, ipst) == IRE_LOCAL) {
9435 off += IP_ADDR_LEN;
9436 goto redo_srr;
9437 }
9438
9439 if (dst == htonl(INADDR_LOOPBACK)) {
9440 ip1dbg(("ip_input_options: loopback addr in "
9441 "source route!\n"));
9442 goto bad_src_route;
9443 }
9444 /*
9445 * For strict: verify that dst is directly
9446 * reachable.
9447 */
9448 if (optval == IPOPT_SSRR) {
9449 ire = ire_ftable_lookup_v4(dst, 0, 0,
9450 IRE_INTERFACE, NULL, ALL_ZONES,
9451 ira->ira_tsl,
9452 MATCH_IRE_TYPE | MATCH_IRE_SECATTR, 0, ipst,
9453 NULL);
9454 if (ire == NULL) {
9455 ip1dbg(("ip_input_options: SSRR not "
9456 "directly reachable: 0x%x\n",
9457 ntohl(dst)));
9458 goto bad_src_route;
9459 }
9460 ire_refrele(ire);
9461 }
9462 /*
9463 * Defer update of the offset and the record route
9464 * until the packet is forwarded.
9465 */
9466 break;
9467 case IPOPT_RR:
9468 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
9469 ip1dbg((
9470 "ip_input_options: bad option offset\n"));
9471 code = (char *)&opt[IPOPT_OLEN] -
9472 (char *)ipha;
9473 goto param_prob;
9474 }
9475 break;
9476 case IPOPT_TS:
9477 /*
9478 * Verify that length >= 5 and that there is either
9479 * room for another timestamp or that the overflow
9480 * counter is not maxed out.
9481 */
9482 code = (char *)&opt[IPOPT_OLEN] - (char *)ipha;
9483 if (optlen < IPOPT_MINLEN_IT) {
9484 goto param_prob;
9485 }
9486 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
9487 ip1dbg((
9488 "ip_input_options: bad option offset\n"));
9489 code = (char *)&opt[IPOPT_OFFSET] -
9490 (char *)ipha;
9491 goto param_prob;
9492 }
9493 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
9494 case IPOPT_TS_TSONLY:
9495 off = IPOPT_TS_TIMELEN;
9496 break;
9497 case IPOPT_TS_TSANDADDR:
9498 case IPOPT_TS_PRESPEC:
9499 case IPOPT_TS_PRESPEC_RFC791:
9500 off = IP_ADDR_LEN + IPOPT_TS_TIMELEN;
9501 break;
9502 default:
9503 code = (char *)&opt[IPOPT_POS_OV_FLG] -
9504 (char *)ipha;
9505 goto param_prob;
9506 }
9507 if (opt[IPOPT_OFFSET] - 1 + off > optlen &&
9508 (opt[IPOPT_POS_OV_FLG] & 0xF0) == 0xF0) {
9509 /*
9510 * No room and the overflow counter is 15
9511 * already.
9512 */
9513 goto param_prob;
9514 }
9515 break;
9516 }
9517 }
9518
9519 if ((opts.ipoptp_flags & IPOPTP_ERROR) == 0) {
9520 return (dst);
9521 }
9522
9523 ip1dbg(("ip_input_options: error processing IP options."));
9524 code = (char *)&opt[IPOPT_OFFSET] - (char *)ipha;
9525
9526 param_prob:
9527 /* make sure we clear any indication of a hardware checksum */
9528 DB_CKSUMFLAGS(mp) = 0;
9529 ip_drop_input("ICMP_PARAM_PROBLEM", mp, ira->ira_ill);
9530 icmp_param_problem(mp, (uint8_t)code, ira);
9531 *errorp = -1;
9532 return (dst);
9533
9534 bad_src_route:
9535 /* make sure we clear any indication of a hardware checksum */
9536 DB_CKSUMFLAGS(mp) = 0;
9537 ip_drop_input("ICMP_SOURCE_ROUTE_FAILED", mp, ira->ira_ill);
9538 icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, ira);
9539 *errorp = -1;
9540 return (dst);
9541 }
9542
9543 /*
9544 * IP & ICMP info in >=14 msg's ...
9545 * - ip fixed part (mib2_ip_t)
9546 * - icmp fixed part (mib2_icmp_t)
9547 * - ipAddrEntryTable (ip 20) all IPv4 ipifs
9548 * - ipRouteEntryTable (ip 21) all IPv4 IREs
9549 * - ipNetToMediaEntryTable (ip 22) all IPv4 Neighbor Cache entries
9550 * - ipRouteAttributeTable (ip 102) labeled routes
9551 * - ip multicast membership (ip_member_t)
9552 * - ip multicast source filtering (ip_grpsrc_t)
9553 * - igmp fixed part (struct igmpstat)
9554 * - multicast routing stats (struct mrtstat)
9555 * - multicast routing vifs (array of struct vifctl)
9556 * - multicast routing routes (array of struct mfcctl)
9557 * - ip6 fixed part (mib2_ipv6IfStatsEntry_t)
9558 * One per ill plus one generic
9559 * - icmp6 fixed part (mib2_ipv6IfIcmpEntry_t)
9560 * One per ill plus one generic
9561 * - ipv6RouteEntry all IPv6 IREs
9562 * - ipv6RouteAttributeTable (ip6 102) labeled routes
9563 * - ipv6NetToMediaEntry all IPv6 Neighbor Cache entries
9564 * - ipv6AddrEntry all IPv6 ipifs
9565 * - ipv6 multicast membership (ipv6_member_t)
9566 * - ipv6 multicast source filtering (ipv6_grpsrc_t)
9567 *
9568 * NOTE: original mpctl is copied for msg's 2..N, since its ctl part is
9569 * already filled in by the caller.
 * If legacy_req is true then MIB structures need to be truncated to their
 * legacy sizes before being returned.
 * A return value of 0 indicates that no messages were sent and the caller
 * should free mpctl.
9574 */
9575 int
9576 ip_snmp_get(queue_t *q, mblk_t *mpctl, int level, boolean_t legacy_req)
9577 {
9578 ip_stack_t *ipst;
9579 sctp_stack_t *sctps;
9580
9581 if (q->q_next != NULL) {
9582 ipst = ILLQ_TO_IPST(q);
9583 } else {
9584 ipst = CONNQ_TO_IPST(q);
9585 }
9586 ASSERT(ipst != NULL);
9587 sctps = ipst->ips_netstack->netstack_sctp;
9588
9589 if (mpctl == NULL || mpctl->b_cont == NULL) {
9590 return (0);
9591 }
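	/*
	 * Each snmp-get helper below sends the current control message
	 * upstream via qreply() and returns a fresh copy for the next
	 * table; a NULL return means the copy failed, so we stop.
	 */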
9592
9593 /*
9594 * For the purposes of the (broken) packet shell use
	 * To accommodate the (broken) packet shell's use of the level,
	 * we make sure MIB2_TCP/MIB2_UDP can be used
9597 * TBD: We could expand this and use it in netstat so that
9598 * the kernel doesn't have to produce large tables (connections,
9599 * routes, etc) when netstat only wants the statistics or a particular
9600 * table.
9601 */
9602 if (!(level == MIB2_TCP || level == MIB2_UDP)) {
9603 if ((mpctl = icmp_snmp_get(q, mpctl)) == NULL) {
9604 return (1);
9605 }
9606 }
9607
9608 if (level != MIB2_TCP) {
9609 if ((mpctl = udp_snmp_get(q, mpctl, legacy_req)) == NULL) {
9610 return (1);
9611 }
9612 }
9613
9614 if (level != MIB2_UDP) {
9615 if ((mpctl = tcp_snmp_get(q, mpctl, legacy_req)) == NULL) {
9616 return (1);
9617 }
9618 }
9619
9620 if ((mpctl = ip_snmp_get_mib2_ip_traffic_stats(q, mpctl,
9621 ipst, legacy_req)) == NULL) {
9622 return (1);
9623 }
9624
9625 if ((mpctl = ip_snmp_get_mib2_ip6(q, mpctl, ipst,
9626 legacy_req)) == NULL) {
9627 return (1);
9628 }
9629
9630 if ((mpctl = ip_snmp_get_mib2_icmp(q, mpctl, ipst)) == NULL) {
9631 return (1);
9632 }
9633
9634 if ((mpctl = ip_snmp_get_mib2_icmp6(q, mpctl, ipst)) == NULL) {
9635 return (1);
9636 }
9637
9638 if ((mpctl = ip_snmp_get_mib2_igmp(q, mpctl, ipst)) == NULL) {
9639 return (1);
9640 }
9641
9642 if ((mpctl = ip_snmp_get_mib2_multi(q, mpctl, ipst)) == NULL) {
9643 return (1);
9644 }
9645
9646 if ((mpctl = ip_snmp_get_mib2_ip_addr(q, mpctl, ipst,
9647 legacy_req)) == NULL) {
9648 return (1);
9649 }
9650
9651 if ((mpctl = ip_snmp_get_mib2_ip6_addr(q, mpctl, ipst,
9652 legacy_req)) == NULL) {
9653 return (1);
9654 }
9655
9656 if ((mpctl = ip_snmp_get_mib2_ip_group_mem(q, mpctl, ipst)) == NULL) {
9657 return (1);
9658 }
9659
9660 if ((mpctl = ip_snmp_get_mib2_ip6_group_mem(q, mpctl, ipst)) == NULL) {
9661 return (1);
9662 }
9663
9664 if ((mpctl = ip_snmp_get_mib2_ip_group_src(q, mpctl, ipst)) == NULL) {
9665 return (1);
9666 }
9667
9668 if ((mpctl = ip_snmp_get_mib2_ip6_group_src(q, mpctl, ipst)) == NULL) {
9669 return (1);
9670 }
9671
9672 if ((mpctl = ip_snmp_get_mib2_virt_multi(q, mpctl, ipst)) == NULL) {
9673 return (1);
9674 }
9675
9676 if ((mpctl = ip_snmp_get_mib2_multi_rtable(q, mpctl, ipst)) == NULL) {
9677 return (1);
9678 }
9679
9680 mpctl = ip_snmp_get_mib2_ip_route_media(q, mpctl, level, ipst);
9681 if (mpctl == NULL)
9682 return (1);
9683
9684 mpctl = ip_snmp_get_mib2_ip6_route_media(q, mpctl, level, ipst);
9685 if (mpctl == NULL)
9686 return (1);
9687
9688 if ((mpctl = sctp_snmp_get_mib2(q, mpctl, sctps)) == NULL) {
9689 return (1);
9690 }
9691 if ((mpctl = ip_snmp_get_mib2_ip_dce(q, mpctl, ipst)) == NULL) {
9692 return (1);
9693 }
9694 freemsg(mpctl);
9695 return (1);
9696 }
9697
9698 /* Get global (legacy) IPv4 statistics */
9699 static mblk_t *
9700 ip_snmp_get_mib2_ip(queue_t *q, mblk_t *mpctl, mib2_ipIfStatsEntry_t *ipmib,
9701 ip_stack_t *ipst, boolean_t legacy_req)
9702 {
9703 mib2_ip_t old_ip_mib;
9704 struct opthdr *optp;
9705 mblk_t *mp2ctl;
9706 mib2_ipAddrEntry_t mae;
9707
9708 /*
9709 * make a copy of the original message
9710 */
9711 mp2ctl = copymsg(mpctl);
9712
9713 /* fixed length IP structure... */
9714 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
9715 optp->level = MIB2_IP;
9716 optp->name = 0;
9717 SET_MIB(old_ip_mib.ipForwarding,
9718 (WE_ARE_FORWARDING(ipst) ? 1 : 2));
9719 SET_MIB(old_ip_mib.ipDefaultTTL,
9720 (uint32_t)ipst->ips_ip_def_ttl);
9721 SET_MIB(old_ip_mib.ipReasmTimeout,
9722 ipst->ips_ip_reassembly_timeout);
9723 SET_MIB(old_ip_mib.ipAddrEntrySize,
9724 (legacy_req) ? LEGACY_MIB_SIZE(&mae, mib2_ipAddrEntry_t) :
9725 sizeof (mib2_ipAddrEntry_t));
9726 SET_MIB(old_ip_mib.ipRouteEntrySize,
9727 sizeof (mib2_ipRouteEntry_t));
9728 SET_MIB(old_ip_mib.ipNetToMediaEntrySize,
9729 sizeof (mib2_ipNetToMediaEntry_t));
9730 SET_MIB(old_ip_mib.ipMemberEntrySize, sizeof (ip_member_t));
9731 SET_MIB(old_ip_mib.ipGroupSourceEntrySize, sizeof (ip_grpsrc_t));
9732 SET_MIB(old_ip_mib.ipRouteAttributeSize,
9733 sizeof (mib2_ipAttributeEntry_t));
9734 SET_MIB(old_ip_mib.transportMLPSize, sizeof (mib2_transportMLPEntry_t));
9735 SET_MIB(old_ip_mib.ipDestEntrySize, sizeof (dest_cache_entry_t));
9736
9737 /*
9738 * Grab the statistics from the new IP MIB
9739 */
9740 SET_MIB(old_ip_mib.ipInReceives,
9741 (uint32_t)ipmib->ipIfStatsHCInReceives);
9742 SET_MIB(old_ip_mib.ipInHdrErrors, ipmib->ipIfStatsInHdrErrors);
9743 SET_MIB(old_ip_mib.ipInAddrErrors, ipmib->ipIfStatsInAddrErrors);
9744 SET_MIB(old_ip_mib.ipForwDatagrams,
9745 (uint32_t)ipmib->ipIfStatsHCOutForwDatagrams);
9746 SET_MIB(old_ip_mib.ipInUnknownProtos,
9747 ipmib->ipIfStatsInUnknownProtos);
9748 SET_MIB(old_ip_mib.ipInDiscards, ipmib->ipIfStatsInDiscards);
9749 SET_MIB(old_ip_mib.ipInDelivers,
9750 (uint32_t)ipmib->ipIfStatsHCInDelivers);
9751 SET_MIB(old_ip_mib.ipOutRequests,
9752 (uint32_t)ipmib->ipIfStatsHCOutRequests);
9753 SET_MIB(old_ip_mib.ipOutDiscards, ipmib->ipIfStatsOutDiscards);
9754 SET_MIB(old_ip_mib.ipOutNoRoutes, ipmib->ipIfStatsOutNoRoutes);
9755 SET_MIB(old_ip_mib.ipReasmReqds, ipmib->ipIfStatsReasmReqds);
9756 SET_MIB(old_ip_mib.ipReasmOKs, ipmib->ipIfStatsReasmOKs);
9757 SET_MIB(old_ip_mib.ipReasmFails, ipmib->ipIfStatsReasmFails);
9758 SET_MIB(old_ip_mib.ipFragOKs, ipmib->ipIfStatsOutFragOKs);
9759 SET_MIB(old_ip_mib.ipFragFails, ipmib->ipIfStatsOutFragFails);
9760 SET_MIB(old_ip_mib.ipFragCreates, ipmib->ipIfStatsOutFragCreates);
9761
9762 /* ipRoutingDiscards is not being used */
9763 SET_MIB(old_ip_mib.ipRoutingDiscards, 0);
9764 SET_MIB(old_ip_mib.tcpInErrs, ipmib->tcpIfStatsInErrs);
9765 SET_MIB(old_ip_mib.udpNoPorts, ipmib->udpIfStatsNoPorts);
9766 SET_MIB(old_ip_mib.ipInCksumErrs, ipmib->ipIfStatsInCksumErrs);
9767 SET_MIB(old_ip_mib.ipReasmDuplicates,
9768 ipmib->ipIfStatsReasmDuplicates);
9769 SET_MIB(old_ip_mib.ipReasmPartDups, ipmib->ipIfStatsReasmPartDups);
9770 SET_MIB(old_ip_mib.ipForwProhibits, ipmib->ipIfStatsForwProhibits);
9771 SET_MIB(old_ip_mib.udpInCksumErrs, ipmib->udpIfStatsInCksumErrs);
9772 SET_MIB(old_ip_mib.udpInOverflows, ipmib->udpIfStatsInOverflows);
9773 SET_MIB(old_ip_mib.rawipInOverflows,
9774 ipmib->rawipIfStatsInOverflows);
9775
9776 SET_MIB(old_ip_mib.ipsecInSucceeded, ipmib->ipsecIfStatsInSucceeded);
9777 SET_MIB(old_ip_mib.ipsecInFailed, ipmib->ipsecIfStatsInFailed);
9778 SET_MIB(old_ip_mib.ipInIPv6, ipmib->ipIfStatsInWrongIPVersion);
9779 SET_MIB(old_ip_mib.ipOutIPv6, ipmib->ipIfStatsOutWrongIPVersion);
9780 SET_MIB(old_ip_mib.ipOutSwitchIPv6,
9781 ipmib->ipIfStatsOutSwitchIPVersion);
9782
9783 if (!snmp_append_data(mpctl->b_cont, (char *)&old_ip_mib,
9784 (int)sizeof (old_ip_mib))) {
9785 ip1dbg(("ip_snmp_get_mib2_ip: failed to allocate %u bytes\n",
9786 (uint_t)sizeof (old_ip_mib)));
9787 }
9788
9789 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
9790 ip3dbg(("ip_snmp_get_mib2_ip: level %d, name %d, len %d\n",
9791 (int)optp->level, (int)optp->name, (int)optp->len));
9792 qreply(q, mpctl);
9793 return (mp2ctl);
9794 }
9795
9796 /* Per interface IPv4 statistics */
9797 static mblk_t *
9798 ip_snmp_get_mib2_ip_traffic_stats(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst,
9799 boolean_t legacy_req)
9800 {
9801 struct opthdr *optp;
9802 mblk_t *mp2ctl;
9803 ill_t *ill;
9804 ill_walk_context_t ctx;
9805 mblk_t *mp_tail = NULL;
9806 mib2_ipIfStatsEntry_t global_ip_mib;
9807 mib2_ipAddrEntry_t mae;
9808
9809 /*
9810 * Make a copy of the original message
9811 */
9812 mp2ctl = copymsg(mpctl);
9813
9814 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
9815 optp->level = MIB2_IP;
9816 optp->name = MIB2_IP_TRAFFIC_STATS;
9817 /* Include "unknown interface" ip_mib */
9818 ipst->ips_ip_mib.ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv4;
9819 ipst->ips_ip_mib.ipIfStatsIfIndex =
9820 MIB2_UNKNOWN_INTERFACE; /* Flag to netstat */
9821 SET_MIB(ipst->ips_ip_mib.ipIfStatsForwarding,
9822 (ipst->ips_ip_forwarding ? 1 : 2));
9823 SET_MIB(ipst->ips_ip_mib.ipIfStatsDefaultTTL,
9824 (uint32_t)ipst->ips_ip_def_ttl);
9825 SET_MIB(ipst->ips_ip_mib.ipIfStatsEntrySize,
9826 sizeof (mib2_ipIfStatsEntry_t));
9827 SET_MIB(ipst->ips_ip_mib.ipIfStatsAddrEntrySize,
9828 sizeof (mib2_ipAddrEntry_t));
9829 SET_MIB(ipst->ips_ip_mib.ipIfStatsRouteEntrySize,
9830 sizeof (mib2_ipRouteEntry_t));
9831 SET_MIB(ipst->ips_ip_mib.ipIfStatsNetToMediaEntrySize,
9832 sizeof (mib2_ipNetToMediaEntry_t));
9833 SET_MIB(ipst->ips_ip_mib.ipIfStatsMemberEntrySize,
9834 sizeof (ip_member_t));
9835 SET_MIB(ipst->ips_ip_mib.ipIfStatsGroupSourceEntrySize,
9836 sizeof (ip_grpsrc_t));
9837
9838 bcopy(&ipst->ips_ip_mib, &global_ip_mib, sizeof (global_ip_mib));
9839
9840 if (legacy_req) {
9841 SET_MIB(global_ip_mib.ipIfStatsAddrEntrySize,
9842 LEGACY_MIB_SIZE(&mae, mib2_ipAddrEntry_t));
9843 }
9844
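	/*
	 * snmp_append_data2() tracks the tail of the data chain in
	 * mp_tail so that successive appends need not rewalk the chain.
	 */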
9845 if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
9846 (char *)&global_ip_mib, (int)sizeof (global_ip_mib))) {
9847 ip1dbg(("ip_snmp_get_mib2_ip_traffic_stats: "
9848 "failed to allocate %u bytes\n",
9849 (uint_t)sizeof (global_ip_mib)));
9850 }
9851
9852 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
9853 ill = ILL_START_WALK_V4(&ctx, ipst);
9854 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
9855 ill->ill_ip_mib->ipIfStatsIfIndex =
9856 ill->ill_phyint->phyint_ifindex;
9857 SET_MIB(ill->ill_ip_mib->ipIfStatsForwarding,
9858 (ipst->ips_ip_forwarding ? 1 : 2));
9859 SET_MIB(ill->ill_ip_mib->ipIfStatsDefaultTTL,
9860 (uint32_t)ipst->ips_ip_def_ttl);
9861
9862 ip_mib2_add_ip_stats(&global_ip_mib, ill->ill_ip_mib);
9863 if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
9864 (char *)ill->ill_ip_mib,
9865 (int)sizeof (*ill->ill_ip_mib))) {
9866 ip1dbg(("ip_snmp_get_mib2_ip_traffic_stats: "
9867 "failed to allocate %u bytes\n",
9868 (uint_t)sizeof (*ill->ill_ip_mib)));
9869 }
9870 }
9871 rw_exit(&ipst->ips_ill_g_lock);
9872
9873 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
9874 ip3dbg(("ip_snmp_get_mib2_ip_traffic_stats: "
9875 "level %d, name %d, len %d\n",
9876 (int)optp->level, (int)optp->name, (int)optp->len));
9877 qreply(q, mpctl);
9878
9879 if (mp2ctl == NULL)
9880 return (NULL);
9881
9882 return (ip_snmp_get_mib2_ip(q, mp2ctl, &global_ip_mib, ipst,
9883 legacy_req));
9884 }
9885
9886 /* Global IPv4 ICMP statistics */
9887 static mblk_t *
9888 ip_snmp_get_mib2_icmp(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
9889 {
9890 struct opthdr *optp;
9891 mblk_t *mp2ctl;
9892
9893 /*
9894 * Make a copy of the original message
9895 */
9896 mp2ctl = copymsg(mpctl);
9897
9898 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
9899 optp->level = MIB2_ICMP;
9900 optp->name = 0;
9901 if (!snmp_append_data(mpctl->b_cont, (char *)&ipst->ips_icmp_mib,
9902 (int)sizeof (ipst->ips_icmp_mib))) {
9903 ip1dbg(("ip_snmp_get_mib2_icmp: failed to allocate %u bytes\n",
9904 (uint_t)sizeof (ipst->ips_icmp_mib)));
9905 }
9906 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
9907 ip3dbg(("ip_snmp_get_mib2_icmp: level %d, name %d, len %d\n",
9908 (int)optp->level, (int)optp->name, (int)optp->len));
9909 qreply(q, mpctl);
9910 return (mp2ctl);
9911 }
9912
9913 /* Global IPv4 IGMP statistics */
9914 static mblk_t *
9915 ip_snmp_get_mib2_igmp(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
9916 {
9917 struct opthdr *optp;
9918 mblk_t *mp2ctl;
9919
9920 /*
9921 * make a copy of the original message
9922 */
9923 mp2ctl = copymsg(mpctl);
9924
9925 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
9926 optp->level = EXPER_IGMP;
9927 optp->name = 0;
9928 if (!snmp_append_data(mpctl->b_cont, (char *)&ipst->ips_igmpstat,
9929 (int)sizeof (ipst->ips_igmpstat))) {
9930 ip1dbg(("ip_snmp_get_mib2_igmp: failed to allocate %u bytes\n",
9931 (uint_t)sizeof (ipst->ips_igmpstat)));
9932 }
9933 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
9934 ip3dbg(("ip_snmp_get_mib2_igmp: level %d, name %d, len %d\n",
9935 (int)optp->level, (int)optp->name, (int)optp->len));
9936 qreply(q, mpctl);
9937 return (mp2ctl);
9938 }
9939
9940 /* Global IPv4 Multicast Routing statistics */
9941 static mblk_t *
9942 ip_snmp_get_mib2_multi(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
9943 {
9944 struct opthdr *optp;
9945 mblk_t *mp2ctl;
9946
9947 /*
9948 * make a copy of the original message
9949 */
9950 mp2ctl = copymsg(mpctl);
9951
9952 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
9953 optp->level = EXPER_DVMRP;
9954 optp->name = 0;
9955 if (!ip_mroute_stats(mpctl->b_cont, ipst)) {
9956 ip0dbg(("ip_mroute_stats: failed\n"));
9957 }
9958 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
9959 ip3dbg(("ip_snmp_get_mib2_multi: level %d, name %d, len %d\n",
9960 (int)optp->level, (int)optp->name, (int)optp->len));
9961 qreply(q, mpctl);
9962 return (mp2ctl);
9963 }
9964
9965 /* IPv4 address information */
9966 static mblk_t *
9967 ip_snmp_get_mib2_ip_addr(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst,
9968 boolean_t legacy_req)
9969 {
9970 struct opthdr *optp;
9971 mblk_t *mp2ctl;
9972 mblk_t *mp_tail = NULL;
9973 ill_t *ill;
9974 ipif_t *ipif;
9975 uint_t bitval;
9976 mib2_ipAddrEntry_t mae;
9977 size_t mae_size;
9978 zoneid_t zoneid;
9979 ill_walk_context_t ctx;
9980
9981 /*
9982 * make a copy of the original message
9983 */
9984 mp2ctl = copymsg(mpctl);
9985
9986 mae_size = (legacy_req) ? LEGACY_MIB_SIZE(&mae, mib2_ipAddrEntry_t) :
9987 sizeof (mib2_ipAddrEntry_t);
9988
9989 /* ipAddrEntryTable */
9990
9991 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
9992 optp->level = MIB2_IP;
9993 optp->name = MIB2_IP_ADDR;
9994 zoneid = Q_TO_CONN(q)->conn_zoneid;
9995
9996 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
9997 ill = ILL_START_WALK_V4(&ctx, ipst);
9998 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
9999 for (ipif = ill->ill_ipif; ipif != NULL;
10000 ipif = ipif->ipif_next) {
10001 if (ipif->ipif_zoneid != zoneid &&
10002 ipif->ipif_zoneid != ALL_ZONES)
10003 continue;
10004 /* Sum of count from dead IRE_LO* and our current */
10005 mae.ipAdEntInfo.ae_ibcnt = ipif->ipif_ib_pkt_count;
10006 if (ipif->ipif_ire_local != NULL) {
10007 mae.ipAdEntInfo.ae_ibcnt +=
10008 ipif->ipif_ire_local->ire_ib_pkt_count;
10009 }
10010 mae.ipAdEntInfo.ae_obcnt = 0;
10011 mae.ipAdEntInfo.ae_focnt = 0;
10012
10013 ipif_get_name(ipif, mae.ipAdEntIfIndex.o_bytes,
10014 OCTET_LENGTH);
10015 mae.ipAdEntIfIndex.o_length =
10016 mi_strlen(mae.ipAdEntIfIndex.o_bytes);
10017 mae.ipAdEntAddr = ipif->ipif_lcl_addr;
10018 mae.ipAdEntNetMask = ipif->ipif_net_mask;
10019 mae.ipAdEntInfo.ae_subnet = ipif->ipif_subnet;
10020 mae.ipAdEntInfo.ae_subnet_len =
10021 ip_mask_to_plen(ipif->ipif_net_mask);
10022 mae.ipAdEntInfo.ae_src_addr = ipif->ipif_lcl_addr;
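			/*
			 * RFC 1213 defines ipAdEntBcastAddr as the value
			 * of the least-significant bit of the broadcast
			 * address, so scan up from bit 0 for the first
			 * set bit (typically yielding 1).
			 */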
10023 for (bitval = 1;
10024 bitval &&
10025 !(bitval & ipif->ipif_brd_addr);
10026 bitval <<= 1)
10027 noop;
10028 mae.ipAdEntBcastAddr = bitval;
10029 mae.ipAdEntReasmMaxSize = IP_MAXPACKET;
10030 mae.ipAdEntInfo.ae_mtu = ipif->ipif_ill->ill_mtu;
10031 mae.ipAdEntInfo.ae_metric = ipif->ipif_ill->ill_metric;
10032 mae.ipAdEntInfo.ae_broadcast_addr =
10033 ipif->ipif_brd_addr;
10034 mae.ipAdEntInfo.ae_pp_dst_addr =
10035 ipif->ipif_pp_dst_addr;
10036 mae.ipAdEntInfo.ae_flags = ipif->ipif_flags |
10037 ill->ill_flags | ill->ill_phyint->phyint_flags;
10038 mae.ipAdEntRetransmitTime =
10039 ill->ill_reachable_retrans_time;
10040
10041 if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
10042 (char *)&mae, (int)mae_size)) {
10043 ip1dbg(("ip_snmp_get_mib2_ip_addr: failed to "
10044 "allocate %u bytes\n", (uint_t)mae_size));
10045 }
10046 }
10047 }
10048 rw_exit(&ipst->ips_ill_g_lock);
10049
10050 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
10051 ip3dbg(("ip_snmp_get_mib2_ip_addr: level %d, name %d, len %d\n",
10052 (int)optp->level, (int)optp->name, (int)optp->len));
10053 qreply(q, mpctl);
10054 return (mp2ctl);
10055 }
10056
10057 /* IPv6 address information */
10058 static mblk_t *
10059 ip_snmp_get_mib2_ip6_addr(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst,
10060 boolean_t legacy_req)
10061 {
10062 struct opthdr *optp;
10063 mblk_t *mp2ctl;
10064 mblk_t *mp_tail = NULL;
10065 ill_t *ill;
10066 ipif_t *ipif;
10067 mib2_ipv6AddrEntry_t mae6;
10068 size_t mae6_size;
10069 zoneid_t zoneid;
10070 ill_walk_context_t ctx;
10071
10072 /*
10073 * make a copy of the original message
10074 */
10075 mp2ctl = copymsg(mpctl);
10076
10077 mae6_size = (legacy_req) ?
10078 LEGACY_MIB_SIZE(&mae6, mib2_ipv6AddrEntry_t) :
10079 sizeof (mib2_ipv6AddrEntry_t);
10080
10081 /* ipv6AddrEntryTable */
10082
10083 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10084 optp->level = MIB2_IP6;
10085 optp->name = MIB2_IP6_ADDR;
10086 zoneid = Q_TO_CONN(q)->conn_zoneid;
10087
10088 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
10089 ill = ILL_START_WALK_V6(&ctx, ipst);
10090 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
10091 for (ipif = ill->ill_ipif; ipif != NULL;
10092 ipif = ipif->ipif_next) {
10093 if (ipif->ipif_zoneid != zoneid &&
10094 ipif->ipif_zoneid != ALL_ZONES)
10095 continue;
10096 /* Sum of count from dead IRE_LO* and our current */
10097 mae6.ipv6AddrInfo.ae_ibcnt = ipif->ipif_ib_pkt_count;
10098 if (ipif->ipif_ire_local != NULL) {
10099 mae6.ipv6AddrInfo.ae_ibcnt +=
10100 ipif->ipif_ire_local->ire_ib_pkt_count;
10101 }
10102 mae6.ipv6AddrInfo.ae_obcnt = 0;
10103 mae6.ipv6AddrInfo.ae_focnt = 0;
10104
10105 ipif_get_name(ipif, mae6.ipv6AddrIfIndex.o_bytes,
10106 OCTET_LENGTH);
10107 mae6.ipv6AddrIfIndex.o_length =
10108 mi_strlen(mae6.ipv6AddrIfIndex.o_bytes);
10109 mae6.ipv6AddrAddress = ipif->ipif_v6lcl_addr;
10110 mae6.ipv6AddrPfxLength =
10111 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask);
10112 mae6.ipv6AddrInfo.ae_subnet = ipif->ipif_v6subnet;
10113 mae6.ipv6AddrInfo.ae_subnet_len =
10114 mae6.ipv6AddrPfxLength;
10115 mae6.ipv6AddrInfo.ae_src_addr = ipif->ipif_v6lcl_addr;
10116
10117 /* Type: stateless(1), stateful(2), unknown(3) */
10118 if (ipif->ipif_flags & IPIF_ADDRCONF)
10119 mae6.ipv6AddrType = 1;
10120 else
10121 mae6.ipv6AddrType = 2;
10122 /* Anycast: true(1), false(2) */
10123 if (ipif->ipif_flags & IPIF_ANYCAST)
10124 mae6.ipv6AddrAnycastFlag = 1;
10125 else
10126 mae6.ipv6AddrAnycastFlag = 2;
10127
10128 /*
10129 * Address status: preferred(1), deprecated(2),
10130 * invalid(3), inaccessible(4), unknown(5)
10131 */
10132 if (ipif->ipif_flags & IPIF_NOLOCAL)
10133 mae6.ipv6AddrStatus = 3;
10134 else if (ipif->ipif_flags & IPIF_DEPRECATED)
10135 mae6.ipv6AddrStatus = 2;
10136 else
10137 mae6.ipv6AddrStatus = 1;
10138 mae6.ipv6AddrInfo.ae_mtu = ipif->ipif_ill->ill_mtu;
10139 mae6.ipv6AddrInfo.ae_metric =
10140 ipif->ipif_ill->ill_metric;
10141 mae6.ipv6AddrInfo.ae_pp_dst_addr =
10142 ipif->ipif_v6pp_dst_addr;
10143 mae6.ipv6AddrInfo.ae_flags = ipif->ipif_flags |
10144 ill->ill_flags | ill->ill_phyint->phyint_flags;
10145 mae6.ipv6AddrReasmMaxSize = IP_MAXPACKET;
10146 mae6.ipv6AddrIdentifier = ill->ill_token;
10147 mae6.ipv6AddrIdentifierLen = ill->ill_token_length;
10148 mae6.ipv6AddrReachableTime = ill->ill_reachable_time;
10149 mae6.ipv6AddrRetransmitTime =
10150 ill->ill_reachable_retrans_time;
10151 if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
10152 (char *)&mae6, (int)mae6_size)) {
10153 ip1dbg(("ip_snmp_get_mib2_ip6_addr: failed to "
10154 "allocate %u bytes\n",
10155 (uint_t)mae6_size));
10156 }
10157 }
10158 }
10159 rw_exit(&ipst->ips_ill_g_lock);
10160
10161 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
10162 ip3dbg(("ip_snmp_get_mib2_ip6_addr: level %d, name %d, len %d\n",
10163 (int)optp->level, (int)optp->name, (int)optp->len));
10164 qreply(q, mpctl);
10165 return (mp2ctl);
10166 }
10167
10168 /* IPv4 multicast group membership. */
10169 static mblk_t *
10170 ip_snmp_get_mib2_ip_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
10171 {
10172 struct opthdr *optp;
10173 mblk_t *mp2ctl;
10174 ill_t *ill;
10175 ipif_t *ipif;
10176 ilm_t *ilm;
10177 ip_member_t ipm;
10178 mblk_t *mp_tail = NULL;
10179 ill_walk_context_t ctx;
10180 zoneid_t zoneid;
10181
10182 /*
10183 * make a copy of the original message
10184 */
10185 mp2ctl = copymsg(mpctl);
10186 zoneid = Q_TO_CONN(q)->conn_zoneid;
10187
10188 /* ipGroupMember table */
10189 optp = (struct opthdr *)&mpctl->b_rptr[
10190 sizeof (struct T_optmgmt_ack)];
10191 optp->level = MIB2_IP;
10192 optp->name = EXPER_IP_GROUP_MEMBERSHIP;
10193
10194 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
10195 ill = ILL_START_WALK_V4(&ctx, ipst);
10196 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
10197 /* Make sure the ill isn't going away. */
10198 if (!ill_check_and_refhold(ill))
10199 continue;
10200 rw_exit(&ipst->ips_ill_g_lock);
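		/*
		 * With the ill held we can safely drop the global lock
		 * while walking its ilm list; the lock is re-taken below
		 * before advancing to the next ill.
		 */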
10201 rw_enter(&ill->ill_mcast_lock, RW_READER);
10202 for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
10203 if (ilm->ilm_zoneid != zoneid &&
10204 ilm->ilm_zoneid != ALL_ZONES)
10205 continue;
10206
10207 /* Is there an ipif for ilm_ifaddr? */
10208 for (ipif = ill->ill_ipif; ipif != NULL;
10209 ipif = ipif->ipif_next) {
10210 if (!IPIF_IS_CONDEMNED(ipif) &&
10211 ipif->ipif_lcl_addr == ilm->ilm_ifaddr &&
10212 ilm->ilm_ifaddr != INADDR_ANY)
10213 break;
10214 }
10215 if (ipif != NULL) {
10216 ipif_get_name(ipif,
10217 ipm.ipGroupMemberIfIndex.o_bytes,
10218 OCTET_LENGTH);
10219 } else {
10220 ill_get_name(ill,
10221 ipm.ipGroupMemberIfIndex.o_bytes,
10222 OCTET_LENGTH);
10223 }
10224 ipm.ipGroupMemberIfIndex.o_length =
10225 mi_strlen(ipm.ipGroupMemberIfIndex.o_bytes);
10226
10227 ipm.ipGroupMemberAddress = ilm->ilm_addr;
10228 ipm.ipGroupMemberRefCnt = ilm->ilm_refcnt;
10229 ipm.ipGroupMemberFilterMode = ilm->ilm_fmode;
10230 if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
10231 (char *)&ipm, (int)sizeof (ipm))) {
10232 ip1dbg(("ip_snmp_get_mib2_ip_group: "
10233 "failed to allocate %u bytes\n",
10234 (uint_t)sizeof (ipm)));
10235 }
10236 }
10237 rw_exit(&ill->ill_mcast_lock);
10238 ill_refrele(ill);
10239 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
10240 }
10241 rw_exit(&ipst->ips_ill_g_lock);
10242 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
10243 ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
10244 (int)optp->level, (int)optp->name, (int)optp->len));
10245 qreply(q, mpctl);
10246 return (mp2ctl);
10247 }
10248
10249 /* IPv6 multicast group membership. */
10250 static mblk_t *
10251 ip_snmp_get_mib2_ip6_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
10252 {
10253 struct opthdr *optp;
10254 mblk_t *mp2ctl;
10255 ill_t *ill;
10256 ilm_t *ilm;
10257 ipv6_member_t ipm6;
10258 mblk_t *mp_tail = NULL;
10259 ill_walk_context_t ctx;
10260 zoneid_t zoneid;
10261
10262 /*
10263 * make a copy of the original message
10264 */
10265 mp2ctl = copymsg(mpctl);
10266 zoneid = Q_TO_CONN(q)->conn_zoneid;
10267
10268 /* ip6GroupMember table */
10269 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10270 optp->level = MIB2_IP6;
10271 optp->name = EXPER_IP6_GROUP_MEMBERSHIP;
10272
10273 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
10274 ill = ILL_START_WALK_V6(&ctx, ipst);
10275 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
10276 /* Make sure the ill isn't going away. */
10277 if (!ill_check_and_refhold(ill))
10278 continue;
10279 rw_exit(&ipst->ips_ill_g_lock);
10280 /*
		 * Normally we don't have any members on interfaces that are
		 * under IPMP. We report them as a debugging aid.
10282 * We report them as a debugging aid.
10283 */
10284 rw_enter(&ill->ill_mcast_lock, RW_READER);
10285 ipm6.ipv6GroupMemberIfIndex = ill->ill_phyint->phyint_ifindex;
10286 for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
10287 if (ilm->ilm_zoneid != zoneid &&
10288 ilm->ilm_zoneid != ALL_ZONES)
10289 continue; /* not this zone */
10290 ipm6.ipv6GroupMemberAddress = ilm->ilm_v6addr;
10291 ipm6.ipv6GroupMemberRefCnt = ilm->ilm_refcnt;
10292 ipm6.ipv6GroupMemberFilterMode = ilm->ilm_fmode;
10293 if (!snmp_append_data2(mpctl->b_cont,
10294 &mp_tail,
10295 (char *)&ipm6, (int)sizeof (ipm6))) {
10296 ip1dbg(("ip_snmp_get_mib2_ip6_group: "
10297 "failed to allocate %u bytes\n",
10298 (uint_t)sizeof (ipm6)));
10299 }
10300 }
10301 rw_exit(&ill->ill_mcast_lock);
10302 ill_refrele(ill);
10303 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
10304 }
10305 rw_exit(&ipst->ips_ill_g_lock);
10306
10307 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
10308 ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
10309 (int)optp->level, (int)optp->name, (int)optp->len));
10310 qreply(q, mpctl);
10311 return (mp2ctl);
10312 }
10313
10314 /* IP multicast filtered sources */
10315 static mblk_t *
10316 ip_snmp_get_mib2_ip_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
10317 {
10318 struct opthdr *optp;
10319 mblk_t *mp2ctl;
10320 ill_t *ill;
10321 ipif_t *ipif;
10322 ilm_t *ilm;
10323 ip_grpsrc_t ips;
10324 mblk_t *mp_tail = NULL;
10325 ill_walk_context_t ctx;
10326 zoneid_t zoneid;
10327 int i;
10328 slist_t *sl;
10329
10330 /*
10331 * make a copy of the original message
10332 */
10333 mp2ctl = copymsg(mpctl);
10334 zoneid = Q_TO_CONN(q)->conn_zoneid;
10335
10336 /* ipGroupSource table */
10337 optp = (struct opthdr *)&mpctl->b_rptr[
10338 sizeof (struct T_optmgmt_ack)];
10339 optp->level = MIB2_IP;
10340 optp->name = EXPER_IP_GROUP_SOURCES;
10341
10342 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
10343 ill = ILL_START_WALK_V4(&ctx, ipst);
10344 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
10345 /* Make sure the ill isn't going away. */
10346 if (!ill_check_and_refhold(ill))
10347 continue;
10348 rw_exit(&ipst->ips_ill_g_lock);
10349 rw_enter(&ill->ill_mcast_lock, RW_READER);
10350 for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
10351 sl = ilm->ilm_filter;
10352 if (ilm->ilm_zoneid != zoneid &&
10353 ilm->ilm_zoneid != ALL_ZONES)
10354 continue;
10355 if (SLIST_IS_EMPTY(sl))
10356 continue;
10357
10358 /* Is there an ipif for ilm_ifaddr? */
10359 for (ipif = ill->ill_ipif; ipif != NULL;
10360 ipif = ipif->ipif_next) {
10361 if (!IPIF_IS_CONDEMNED(ipif) &&
10362 ipif->ipif_lcl_addr == ilm->ilm_ifaddr &&
10363 ilm->ilm_ifaddr != INADDR_ANY)
10364 break;
10365 }
10366 if (ipif != NULL) {
10367 ipif_get_name(ipif,
10368 ips.ipGroupSourceIfIndex.o_bytes,
10369 OCTET_LENGTH);
10370 } else {
10371 ill_get_name(ill,
10372 ips.ipGroupSourceIfIndex.o_bytes,
10373 OCTET_LENGTH);
10374 }
10375 ips.ipGroupSourceIfIndex.o_length =
10376 mi_strlen(ips.ipGroupSourceIfIndex.o_bytes);
10377
10378 ips.ipGroupSourceGroup = ilm->ilm_addr;
10379 for (i = 0; i < sl->sl_numsrc; i++) {
10380 if (!IN6_IS_ADDR_V4MAPPED(&sl->sl_addr[i]))
10381 continue;
10382 IN6_V4MAPPED_TO_IPADDR(&sl->sl_addr[i],
10383 ips.ipGroupSourceAddress);
10384 if (snmp_append_data2(mpctl->b_cont, &mp_tail,
10385 (char *)&ips, (int)sizeof (ips)) == 0) {
10386 ip1dbg(("ip_snmp_get_mib2_ip_group_src:"
10387 " failed to allocate %u bytes\n",
10388 (uint_t)sizeof (ips)));
10389 }
10390 }
10391 }
10392 rw_exit(&ill->ill_mcast_lock);
10393 ill_refrele(ill);
10394 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
10395 }
10396 rw_exit(&ipst->ips_ill_g_lock);
10397 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
10398 ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
10399 (int)optp->level, (int)optp->name, (int)optp->len));
10400 qreply(q, mpctl);
10401 return (mp2ctl);
10402 }
10403
10404 /* IPv6 multicast filtered sources. */
10405 static mblk_t *
10406 ip_snmp_get_mib2_ip6_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
10407 {
10408 struct opthdr *optp;
10409 mblk_t *mp2ctl;
10410 ill_t *ill;
10411 ilm_t *ilm;
10412 ipv6_grpsrc_t ips6;
10413 mblk_t *mp_tail = NULL;
10414 ill_walk_context_t ctx;
10415 zoneid_t zoneid;
10416 int i;
10417 slist_t *sl;
10418
10419 /*
10420 * make a copy of the original message
10421 */
10422 mp2ctl = copymsg(mpctl);
10423 zoneid = Q_TO_CONN(q)->conn_zoneid;
10424
10425 /* ip6GroupMember table */
10426 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10427 optp->level = MIB2_IP6;
10428 optp->name = EXPER_IP6_GROUP_SOURCES;
10429
10430 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
10431 ill = ILL_START_WALK_V6(&ctx, ipst);
10432 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
10433 /* Make sure the ill isn't going away. */
10434 if (!ill_check_and_refhold(ill))
10435 continue;
10436 rw_exit(&ipst->ips_ill_g_lock);
10437 /*
		 * Normally we don't have any members on interfaces that are
		 * under IPMP. We report them as a debugging aid.
10439 * We report them as a debugging aid.
10440 */
10441 rw_enter(&ill->ill_mcast_lock, RW_READER);
10442 ips6.ipv6GroupSourceIfIndex = ill->ill_phyint->phyint_ifindex;
10443 for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
10444 sl = ilm->ilm_filter;
10445 if (ilm->ilm_zoneid != zoneid &&
10446 ilm->ilm_zoneid != ALL_ZONES)
10447 continue;
10448 if (SLIST_IS_EMPTY(sl))
10449 continue;
10450 ips6.ipv6GroupSourceGroup = ilm->ilm_v6addr;
10451 for (i = 0; i < sl->sl_numsrc; i++) {
10452 ips6.ipv6GroupSourceAddress = sl->sl_addr[i];
10453 if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
10454 (char *)&ips6, (int)sizeof (ips6))) {
10455 ip1dbg(("ip_snmp_get_mib2_ip6_"
10456 "group_src: failed to allocate "
10457 "%u bytes\n",
10458 (uint_t)sizeof (ips6)));
10459 }
10460 }
10461 }
10462 rw_exit(&ill->ill_mcast_lock);
10463 ill_refrele(ill);
10464 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
10465 }
10466 rw_exit(&ipst->ips_ill_g_lock);
10467
10468 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get_mib2_ip6_group_src: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
10471 qreply(q, mpctl);
10472 return (mp2ctl);
10473 }
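
/*
 * The ill walks above all rely on the same lock/refhold dance so that
 * an ill cannot disappear while we examine it with ips_ill_g_lock
 * dropped; an illustrative sketch:
 *
 *	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
 *	for (ill = ILL_START_WALK_V6(&ctx, ipst); ill != NULL;
 *	    ill = ill_next(&ctx, ill)) {
 *		if (!ill_check_and_refhold(ill))
 *			continue;	(ill is going away; skip it)
 *		rw_exit(&ipst->ips_ill_g_lock);
 *		... examine the ill under ill_mcast_lock ...
 *		ill_refrele(ill);
 *		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
 *	}
 *	rw_exit(&ipst->ips_ill_g_lock);
 */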
10474
10475 /* Multicast routing virtual interface table. */
10476 static mblk_t *
10477 ip_snmp_get_mib2_virt_multi(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
10478 {
10479 struct opthdr *optp;
10480 mblk_t *mp2ctl;
10481
10482 /*
10483 * make a copy of the original message
10484 */
10485 mp2ctl = copymsg(mpctl);
10486
10487 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10488 optp->level = EXPER_DVMRP;
10489 optp->name = EXPER_DVMRP_VIF;
10490 if (!ip_mroute_vif(mpctl->b_cont, ipst)) {
10491 ip0dbg(("ip_mroute_vif: failed\n"));
10492 }
10493 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
10494 ip3dbg(("ip_snmp_get_mib2_virt_multi: level %d, name %d, len %d\n",
10495 (int)optp->level, (int)optp->name, (int)optp->len));
10496 qreply(q, mpctl);
10497 return (mp2ctl);
10498 }
10499
10500 /* Multicast routing table. */
10501 static mblk_t *
10502 ip_snmp_get_mib2_multi_rtable(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
10503 {
10504 struct opthdr *optp;
10505 mblk_t *mp2ctl;
10506
10507 /*
10508 * make a copy of the original message
10509 */
10510 mp2ctl = copymsg(mpctl);
10511
10512 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10513 optp->level = EXPER_DVMRP;
10514 optp->name = EXPER_DVMRP_MRT;
10515 if (!ip_mroute_mrt(mpctl->b_cont, ipst)) {
10516 ip0dbg(("ip_mroute_mrt: failed\n"));
10517 }
10518 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
10519 ip3dbg(("ip_snmp_get_mib2_multi_rtable: level %d, name %d, len %d\n",
10520 (int)optp->level, (int)optp->name, (int)optp->len));
10521 qreply(q, mpctl);
10522 return (mp2ctl);
10523 }
10524
10525 /*
10526 * Return ipRouteEntryTable, ipNetToMediaEntryTable, and ipRouteAttributeTable
10527 * in one IRE walk.
10528 */
10529 static mblk_t *
10530 ip_snmp_get_mib2_ip_route_media(queue_t *q, mblk_t *mpctl, int level,
10531 ip_stack_t *ipst)
10532 {
10533 struct opthdr *optp;
10534 mblk_t *mp2ctl; /* Returned */
10535 mblk_t *mp3ctl; /* nettomedia */
10536 mblk_t *mp4ctl; /* routeattrs */
10537 iproutedata_t ird;
10538 zoneid_t zoneid;
10539
	/*
	 * Make copies of the original message:
	 *	- mp2ctl is returned unchanged to the caller for further use
	 *	- mpctl is sent upstream as ipRouteEntryTable
	 *	- mp3ctl is sent upstream as ipNetToMediaEntryTable
	 *	- mp4ctl is sent upstream as ipRouteAttributeTable
	 */
10547 mp2ctl = copymsg(mpctl);
10548 mp3ctl = copymsg(mpctl);
10549 mp4ctl = copymsg(mpctl);
10550 if (mp3ctl == NULL || mp4ctl == NULL) {
10551 freemsg(mp4ctl);
10552 freemsg(mp3ctl);
10553 freemsg(mp2ctl);
10554 freemsg(mpctl);
10555 return (NULL);
10556 }
10557
10558 bzero(&ird, sizeof (ird));
10559
10560 ird.ird_route.lp_head = mpctl->b_cont;
10561 ird.ird_netmedia.lp_head = mp3ctl->b_cont;
10562 ird.ird_attrs.lp_head = mp4ctl->b_cont;
	/*
	 * If the level has been set to the special EXPER_IP_AND_ALL_IRES
	 * value, then also include ire_testhidden IREs and IRE_IF_CLONE.
	 * This is intended as a temporary solution until a proper MIB API
	 * with complete filtering/caller-opt-in is provided.
	 */
10569 if (level == EXPER_IP_AND_ALL_IRES)
10570 ird.ird_flags |= IRD_REPORT_ALL;
10571
10572 zoneid = Q_TO_CONN(q)->conn_zoneid;
10573 ire_walk_v4(ip_snmp_get2_v4, &ird, zoneid, ipst);
10574
10575 /* ipRouteEntryTable in mpctl */
10576 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10577 optp->level = MIB2_IP;
10578 optp->name = MIB2_IP_ROUTE;
10579 optp->len = msgdsize(ird.ird_route.lp_head);
10580 ip3dbg(("ip_snmp_get_mib2_ip_route_media: level %d, name %d, len %d\n",
10581 (int)optp->level, (int)optp->name, (int)optp->len));
10582 qreply(q, mpctl);
10583
10584 /* ipNetToMediaEntryTable in mp3ctl */
10585 ncec_walk(NULL, ip_snmp_get2_v4_media, &ird, ipst);
10586
10587 optp = (struct opthdr *)&mp3ctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10588 optp->level = MIB2_IP;
10589 optp->name = MIB2_IP_MEDIA;
10590 optp->len = msgdsize(ird.ird_netmedia.lp_head);
10591 ip3dbg(("ip_snmp_get_mib2_ip_route_media: level %d, name %d, len %d\n",
10592 (int)optp->level, (int)optp->name, (int)optp->len));
10593 qreply(q, mp3ctl);
10594
10595 /* ipRouteAttributeTable in mp4ctl */
10596 optp = (struct opthdr *)&mp4ctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10597 optp->level = MIB2_IP;
10598 optp->name = EXPER_IP_RTATTR;
10599 optp->len = msgdsize(ird.ird_attrs.lp_head);
10600 ip3dbg(("ip_snmp_get_mib2_ip_route_media: level %d, name %d, len %d\n",
10601 (int)optp->level, (int)optp->name, (int)optp->len));
10602 if (optp->len == 0)
10603 freemsg(mp4ctl);
10604 else
10605 qreply(q, mp4ctl);
10606
10607 return (mp2ctl);
10608 }
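
/*
 * Sketch of the single-walk/multiple-table technique used above: one
 * iproutedata_t carries a separate (lp_head, lp_tail) list per table,
 * and the walk callbacks append each record to the appropriate list:
 *
 *	bzero(&ird, sizeof (ird));
 *	ird.ird_route.lp_head = mpctl->b_cont;		(route entries)
 *	ird.ird_netmedia.lp_head = mp3ctl->b_cont;	(net-to-media)
 *	ird.ird_attrs.lp_head = mp4ctl->b_cont;		(route attributes)
 *	ire_walk_v4(ip_snmp_get2_v4, &ird, zoneid, ipst);
 *	ncec_walk(NULL, ip_snmp_get2_v4_media, &ird, ipst);
 */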
10609
10610 /*
10611 * Return ipv6RouteEntryTable and ipv6RouteAttributeTable in one IRE walk, and
10612 * ipv6NetToMediaEntryTable in an NDP walk.
10613 */
10614 static mblk_t *
10615 ip_snmp_get_mib2_ip6_route_media(queue_t *q, mblk_t *mpctl, int level,
10616 ip_stack_t *ipst)
10617 {
10618 struct opthdr *optp;
10619 mblk_t *mp2ctl; /* Returned */
10620 mblk_t *mp3ctl; /* nettomedia */
10621 mblk_t *mp4ctl; /* routeattrs */
10622 iproutedata_t ird;
10623 zoneid_t zoneid;
10624
	/*
	 * Make copies of the original message:
	 *	- mp2ctl is returned unchanged to the caller for further use
	 *	- mpctl is sent upstream as ipv6RouteEntryTable
	 *	- mp3ctl is sent upstream as ipv6NetToMediaEntryTable
	 *	- mp4ctl is sent upstream as ipv6RouteAttributeTable
	 */
10632 mp2ctl = copymsg(mpctl);
10633 mp3ctl = copymsg(mpctl);
10634 mp4ctl = copymsg(mpctl);
10635 if (mp3ctl == NULL || mp4ctl == NULL) {
10636 freemsg(mp4ctl);
10637 freemsg(mp3ctl);
10638 freemsg(mp2ctl);
10639 freemsg(mpctl);
10640 return (NULL);
10641 }
10642
10643 bzero(&ird, sizeof (ird));
10644
10645 ird.ird_route.lp_head = mpctl->b_cont;
10646 ird.ird_netmedia.lp_head = mp3ctl->b_cont;
10647 ird.ird_attrs.lp_head = mp4ctl->b_cont;
	/*
	 * If the level has been set to the special EXPER_IP_AND_ALL_IRES
	 * value, then also include ire_testhidden IREs and IRE_IF_CLONE.
	 * This is intended as a temporary solution until a proper MIB API
	 * with complete filtering/caller-opt-in is provided.
	 */
10654 if (level == EXPER_IP_AND_ALL_IRES)
10655 ird.ird_flags |= IRD_REPORT_ALL;
10656
10657 zoneid = Q_TO_CONN(q)->conn_zoneid;
10658 ire_walk_v6(ip_snmp_get2_v6_route, &ird, zoneid, ipst);
10659
10660 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10661 optp->level = MIB2_IP6;
10662 optp->name = MIB2_IP6_ROUTE;
10663 optp->len = msgdsize(ird.ird_route.lp_head);
10664 ip3dbg(("ip_snmp_get_mib2_ip6_route_media: level %d, name %d, len %d\n",
10665 (int)optp->level, (int)optp->name, (int)optp->len));
10666 qreply(q, mpctl);
10667
10668 /* ipv6NetToMediaEntryTable in mp3ctl */
10669 ncec_walk(NULL, ip_snmp_get2_v6_media, &ird, ipst);
10670
10671 optp = (struct opthdr *)&mp3ctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10672 optp->level = MIB2_IP6;
10673 optp->name = MIB2_IP6_MEDIA;
10674 optp->len = msgdsize(ird.ird_netmedia.lp_head);
10675 ip3dbg(("ip_snmp_get_mib2_ip6_route_media: level %d, name %d, len %d\n",
10676 (int)optp->level, (int)optp->name, (int)optp->len));
10677 qreply(q, mp3ctl);
10678
10679 /* ipv6RouteAttributeTable in mp4ctl */
10680 optp = (struct opthdr *)&mp4ctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10681 optp->level = MIB2_IP6;
10682 optp->name = EXPER_IP_RTATTR;
10683 optp->len = msgdsize(ird.ird_attrs.lp_head);
10684 ip3dbg(("ip_snmp_get_mib2_ip6_route_media: level %d, name %d, len %d\n",
10685 (int)optp->level, (int)optp->name, (int)optp->len));
10686 if (optp->len == 0)
10687 freemsg(mp4ctl);
10688 else
10689 qreply(q, mp4ctl);
10690
10691 return (mp2ctl);
10692 }
10693
10694 /*
10695 * IPv6 mib: One per ill
10696 */
10697 static mblk_t *
10698 ip_snmp_get_mib2_ip6(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst,
10699 boolean_t legacy_req)
10700 {
10701 struct opthdr *optp;
10702 mblk_t *mp2ctl;
10703 ill_t *ill;
10704 ill_walk_context_t ctx;
10705 mblk_t *mp_tail = NULL;
10706 mib2_ipv6AddrEntry_t mae6;
10707 mib2_ipIfStatsEntry_t *ise;
10708 size_t ise_size, iae_size;
10709
10710 /*
10711 * Make a copy of the original message
10712 */
10713 mp2ctl = copymsg(mpctl);
10714
10715 /* fixed length IPv6 structure ... */
10716
10717 if (legacy_req) {
10718 ise_size = LEGACY_MIB_SIZE(&ipst->ips_ip6_mib,
10719 mib2_ipIfStatsEntry_t);
10720 iae_size = LEGACY_MIB_SIZE(&mae6, mib2_ipv6AddrEntry_t);
10721 } else {
10722 ise_size = sizeof (mib2_ipIfStatsEntry_t);
10723 iae_size = sizeof (mib2_ipv6AddrEntry_t);
10724 }
10725
10726 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10727 optp->level = MIB2_IP6;
10728 optp->name = 0;
10729 /* Include "unknown interface" ip6_mib */
10730 ipst->ips_ip6_mib.ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv6;
10731 ipst->ips_ip6_mib.ipIfStatsIfIndex =
10732 MIB2_UNKNOWN_INTERFACE; /* Flag to netstat */
10733 SET_MIB(ipst->ips_ip6_mib.ipIfStatsForwarding,
10734 ipst->ips_ipv6_forwarding ? 1 : 2);
10735 SET_MIB(ipst->ips_ip6_mib.ipIfStatsDefaultHopLimit,
10736 ipst->ips_ipv6_def_hops);
10737 SET_MIB(ipst->ips_ip6_mib.ipIfStatsEntrySize,
10738 sizeof (mib2_ipIfStatsEntry_t));
10739 SET_MIB(ipst->ips_ip6_mib.ipIfStatsAddrEntrySize,
10740 sizeof (mib2_ipv6AddrEntry_t));
10741 SET_MIB(ipst->ips_ip6_mib.ipIfStatsRouteEntrySize,
10742 sizeof (mib2_ipv6RouteEntry_t));
10743 SET_MIB(ipst->ips_ip6_mib.ipIfStatsNetToMediaEntrySize,
10744 sizeof (mib2_ipv6NetToMediaEntry_t));
10745 SET_MIB(ipst->ips_ip6_mib.ipIfStatsMemberEntrySize,
10746 sizeof (ipv6_member_t));
10747 SET_MIB(ipst->ips_ip6_mib.ipIfStatsGroupSourceEntrySize,
10748 sizeof (ipv6_grpsrc_t));
10749
10750 /*
10751 * Synchronize 64- and 32-bit counters
10752 */
10753 SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsInReceives,
10754 ipIfStatsHCInReceives);
10755 SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsInDelivers,
10756 ipIfStatsHCInDelivers);
10757 SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsOutRequests,
10758 ipIfStatsHCOutRequests);
10759 SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsOutForwDatagrams,
10760 ipIfStatsHCOutForwDatagrams);
10761 SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsOutMcastPkts,
10762 ipIfStatsHCOutMcastPkts);
10763 SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsInMcastPkts,
10764 ipIfStatsHCInMcastPkts);
10765
10766 if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
10767 (char *)&ipst->ips_ip6_mib, (int)ise_size)) {
10768 ip1dbg(("ip_snmp_get_mib2_ip6: failed to allocate %u bytes\n",
10769 (uint_t)ise_size));
10770 } else if (legacy_req) {
10771 /* Adjust the EntrySize fields for legacy requests. */
10772 ise =
10773 (mib2_ipIfStatsEntry_t *)(mp_tail->b_wptr - (int)ise_size);
10774 SET_MIB(ise->ipIfStatsEntrySize, ise_size);
10775 SET_MIB(ise->ipIfStatsAddrEntrySize, iae_size);
10776 }
10777
10778 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
10779 ill = ILL_START_WALK_V6(&ctx, ipst);
10780 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
10781 ill->ill_ip_mib->ipIfStatsIfIndex =
10782 ill->ill_phyint->phyint_ifindex;
10783 SET_MIB(ill->ill_ip_mib->ipIfStatsForwarding,
10784 ipst->ips_ipv6_forwarding ? 1 : 2);
10785 SET_MIB(ill->ill_ip_mib->ipIfStatsDefaultHopLimit,
10786 ill->ill_max_hops);
10787
10788 /*
10789 * Synchronize 64- and 32-bit counters
10790 */
10791 SYNC32_MIB(ill->ill_ip_mib, ipIfStatsInReceives,
10792 ipIfStatsHCInReceives);
10793 SYNC32_MIB(ill->ill_ip_mib, ipIfStatsInDelivers,
10794 ipIfStatsHCInDelivers);
10795 SYNC32_MIB(ill->ill_ip_mib, ipIfStatsOutRequests,
10796 ipIfStatsHCOutRequests);
10797 SYNC32_MIB(ill->ill_ip_mib, ipIfStatsOutForwDatagrams,
10798 ipIfStatsHCOutForwDatagrams);
10799 SYNC32_MIB(ill->ill_ip_mib, ipIfStatsOutMcastPkts,
10800 ipIfStatsHCOutMcastPkts);
10801 SYNC32_MIB(ill->ill_ip_mib, ipIfStatsInMcastPkts,
10802 ipIfStatsHCInMcastPkts);
10803
10804 if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
10805 (char *)ill->ill_ip_mib, (int)ise_size)) {
10806 ip1dbg(("ip_snmp_get_mib2_ip6: failed to allocate "
10807 "%u bytes\n", (uint_t)ise_size));
10808 } else if (legacy_req) {
10809 /* Adjust the EntrySize fields for legacy requests. */
10810 ise = (mib2_ipIfStatsEntry_t *)(mp_tail->b_wptr -
10811 (int)ise_size);
10812 SET_MIB(ise->ipIfStatsEntrySize, ise_size);
10813 SET_MIB(ise->ipIfStatsAddrEntrySize, iae_size);
10814 }
10815 }
10816 rw_exit(&ipst->ips_ill_g_lock);
10817
10818 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
10819 ip3dbg(("ip_snmp_get_mib2_ip6: level %d, name %d, len %d\n",
10820 (int)optp->level, (int)optp->name, (int)optp->len));
10821 qreply(q, mpctl);
10822 return (mp2ctl);
10823 }
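
/*
 * For legacy_req consumers only the leading LEGACY_MIB_SIZE() bytes of
 * each record are appended, so the EntrySize fields inside the record
 * that was just appended must be rewritten to the truncated sizes; a
 * sketch of locating that record via the list tail:
 *
 *	ise = (mib2_ipIfStatsEntry_t *)(mp_tail->b_wptr - (int)ise_size);
 *	SET_MIB(ise->ipIfStatsEntrySize, ise_size);
 *	SET_MIB(ise->ipIfStatsAddrEntrySize, iae_size);
 */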
10824
10825 /*
10826 * ICMPv6 mib: One per ill
10827 */
10828 static mblk_t *
10829 ip_snmp_get_mib2_icmp6(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
10830 {
10831 struct opthdr *optp;
10832 mblk_t *mp2ctl;
10833 ill_t *ill;
10834 ill_walk_context_t ctx;
10835 mblk_t *mp_tail = NULL;
10836 /*
10837 * Make a copy of the original message
10838 */
10839 mp2ctl = copymsg(mpctl);
10840
10841 /* fixed length ICMPv6 structure ... */
10842
10843 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10844 optp->level = MIB2_ICMP6;
10845 optp->name = 0;
10846 /* Include "unknown interface" icmp6_mib */
10847 ipst->ips_icmp6_mib.ipv6IfIcmpIfIndex =
10848 MIB2_UNKNOWN_INTERFACE; /* netstat flag */
10849 ipst->ips_icmp6_mib.ipv6IfIcmpEntrySize =
10850 sizeof (mib2_ipv6IfIcmpEntry_t);
10851 if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
10852 (char *)&ipst->ips_icmp6_mib,
10853 (int)sizeof (ipst->ips_icmp6_mib))) {
10854 ip1dbg(("ip_snmp_get_mib2_icmp6: failed to allocate %u bytes\n",
10855 (uint_t)sizeof (ipst->ips_icmp6_mib)));
10856 }
10857
10858 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
10859 ill = ILL_START_WALK_V6(&ctx, ipst);
10860 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
10861 ill->ill_icmp6_mib->ipv6IfIcmpIfIndex =
10862 ill->ill_phyint->phyint_ifindex;
10863 if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
10864 (char *)ill->ill_icmp6_mib,
10865 (int)sizeof (*ill->ill_icmp6_mib))) {
10866 ip1dbg(("ip_snmp_get_mib2_icmp6: failed to allocate "
10867 "%u bytes\n",
10868 (uint_t)sizeof (*ill->ill_icmp6_mib)));
10869 }
10870 }
10871 rw_exit(&ipst->ips_ill_g_lock);
10872
10873 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
10874 ip3dbg(("ip_snmp_get_mib2_icmp6: level %d, name %d, len %d\n",
10875 (int)optp->level, (int)optp->name, (int)optp->len));
10876 qreply(q, mpctl);
10877 return (mp2ctl);
10878 }
10879
10880 /*
10881 * ire_walk routine to create both ipRouteEntryTable and
10882 * ipRouteAttributeTable in one IRE walk
10883 */
10884 static void
10885 ip_snmp_get2_v4(ire_t *ire, iproutedata_t *ird)
10886 {
10887 ill_t *ill;
10888 mib2_ipRouteEntry_t *re;
10889 mib2_ipAttributeEntry_t iaes;
10890 tsol_ire_gw_secattr_t *attrp;
10891 tsol_gc_t *gc = NULL;
10892 tsol_gcgrp_t *gcgrp = NULL;
10893 ip_stack_t *ipst = ire->ire_ipst;
10894
10895 ASSERT(ire->ire_ipversion == IPV4_VERSION);
10896
10897 if (!(ird->ird_flags & IRD_REPORT_ALL)) {
10898 if (ire->ire_testhidden)
10899 return;
10900 if (ire->ire_type & IRE_IF_CLONE)
10901 return;
10902 }
10903
10904 if ((re = kmem_zalloc(sizeof (*re), KM_NOSLEEP)) == NULL)
10905 return;
10906
10907 if ((attrp = ire->ire_gw_secattr) != NULL) {
10908 mutex_enter(&attrp->igsa_lock);
10909 if ((gc = attrp->igsa_gc) != NULL) {
10910 gcgrp = gc->gc_grp;
10911 ASSERT(gcgrp != NULL);
10912 rw_enter(&gcgrp->gcgrp_rwlock, RW_READER);
10913 }
10914 mutex_exit(&attrp->igsa_lock);
10915 }
10916 /*
10917 * Return all IRE types for route table... let caller pick and choose
10918 */
10919 re->ipRouteDest = ire->ire_addr;
10920 ill = ire->ire_ill;
10921 re->ipRouteIfIndex.o_length = 0;
10922 if (ill != NULL) {
10923 ill_get_name(ill, re->ipRouteIfIndex.o_bytes, OCTET_LENGTH);
10924 re->ipRouteIfIndex.o_length =
10925 mi_strlen(re->ipRouteIfIndex.o_bytes);
10926 }
10927 re->ipRouteMetric1 = -1;
10928 re->ipRouteMetric2 = -1;
10929 re->ipRouteMetric3 = -1;
10930 re->ipRouteMetric4 = -1;
10931
10932 re->ipRouteNextHop = ire->ire_gateway_addr;
10933 /* indirect(4), direct(3), or invalid(2) */
10934 if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE))
10935 re->ipRouteType = 2;
10936 else if (ire->ire_type & IRE_ONLINK)
10937 re->ipRouteType = 3;
10938 else
10939 re->ipRouteType = 4;
10940
10941 re->ipRouteProto = -1;
10942 re->ipRouteAge = gethrestime_sec() - ire->ire_create_time;
10943 re->ipRouteMask = ire->ire_mask;
10944 re->ipRouteMetric5 = -1;
10945 re->ipRouteInfo.re_max_frag = ire->ire_metrics.iulp_mtu;
10946 if (ire->ire_ill != NULL && re->ipRouteInfo.re_max_frag == 0)
10947 re->ipRouteInfo.re_max_frag = ire->ire_ill->ill_mtu;
10948
10949 re->ipRouteInfo.re_frag_flag = 0;
10950 re->ipRouteInfo.re_rtt = 0;
10951 re->ipRouteInfo.re_src_addr = 0;
10952 re->ipRouteInfo.re_ref = ire->ire_refcnt;
10953 re->ipRouteInfo.re_obpkt = ire->ire_ob_pkt_count;
10954 re->ipRouteInfo.re_ibpkt = ire->ire_ib_pkt_count;
10955 re->ipRouteInfo.re_flags = ire->ire_flags;
10956
10957 /* Add the IRE_IF_CLONE's counters to their parent IRE_INTERFACE */
10958 if (ire->ire_type & IRE_INTERFACE) {
10959 ire_t *child;
10960
10961 rw_enter(&ipst->ips_ire_dep_lock, RW_READER);
10962 child = ire->ire_dep_children;
10963 while (child != NULL) {
10964 re->ipRouteInfo.re_obpkt += child->ire_ob_pkt_count;
10965 re->ipRouteInfo.re_ibpkt += child->ire_ib_pkt_count;
10966 child = child->ire_dep_sib_next;
10967 }
10968 rw_exit(&ipst->ips_ire_dep_lock);
10969 }
10970
10971 if (ire->ire_flags & RTF_DYNAMIC) {
10972 re->ipRouteInfo.re_ire_type = IRE_HOST_REDIRECT;
10973 } else {
10974 re->ipRouteInfo.re_ire_type = ire->ire_type;
10975 }
10976
10977 if (!snmp_append_data2(ird->ird_route.lp_head, &ird->ird_route.lp_tail,
10978 (char *)re, (int)sizeof (*re))) {
10979 ip1dbg(("ip_snmp_get2_v4: failed to allocate %u bytes\n",
10980 (uint_t)sizeof (*re)));
10981 }
10982
10983 if (gc != NULL) {
10984 iaes.iae_routeidx = ird->ird_idx;
10985 iaes.iae_doi = gc->gc_db->gcdb_doi;
10986 iaes.iae_slrange = gc->gc_db->gcdb_slrange;
10987
10988 if (!snmp_append_data2(ird->ird_attrs.lp_head,
10989 &ird->ird_attrs.lp_tail, (char *)&iaes, sizeof (iaes))) {
10990 ip1dbg(("ip_snmp_get2_v4: failed to allocate %u "
10991 "bytes\n", (uint_t)sizeof (iaes)));
10992 }
10993 }
10994
10995 /* bump route index for next pass */
10996 ird->ird_idx++;
10997
10998 kmem_free(re, sizeof (*re));
10999 if (gcgrp != NULL)
11000 rw_exit(&gcgrp->gcgrp_rwlock);
11001 }
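
/*
 * Note on snmp_append_data2() as used above: it appends one record to
 * the mblk chain rooted at lp_head and caches the tail in lp_tail so
 * that each append is O(1).  A zero return means allocation failed and
 * the record was simply dropped; the table itself stays well formed:
 *
 *	if (!snmp_append_data2(ird->ird_route.lp_head,
 *	    &ird->ird_route.lp_tail, (char *)re, (int)sizeof (*re)))
 *		... record lost, walk continues ...
 */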
11002
/*
 * ire_walk routine to create both ipv6RouteEntryTable and
 * ipv6RouteAttributeTable in one IRE walk.
 */
11006 static void
11007 ip_snmp_get2_v6_route(ire_t *ire, iproutedata_t *ird)
11008 {
11009 ill_t *ill;
11010 mib2_ipv6RouteEntry_t *re;
11011 mib2_ipAttributeEntry_t iaes;
11012 tsol_ire_gw_secattr_t *attrp;
11013 tsol_gc_t *gc = NULL;
11014 tsol_gcgrp_t *gcgrp = NULL;
11015 ip_stack_t *ipst = ire->ire_ipst;
11016
11017 ASSERT(ire->ire_ipversion == IPV6_VERSION);
11018
11019 if (!(ird->ird_flags & IRD_REPORT_ALL)) {
11020 if (ire->ire_testhidden)
11021 return;
11022 if (ire->ire_type & IRE_IF_CLONE)
11023 return;
11024 }
11025
11026 if ((re = kmem_zalloc(sizeof (*re), KM_NOSLEEP)) == NULL)
11027 return;
11028
11029 if ((attrp = ire->ire_gw_secattr) != NULL) {
11030 mutex_enter(&attrp->igsa_lock);
11031 if ((gc = attrp->igsa_gc) != NULL) {
11032 gcgrp = gc->gc_grp;
11033 ASSERT(gcgrp != NULL);
11034 rw_enter(&gcgrp->gcgrp_rwlock, RW_READER);
11035 }
11036 mutex_exit(&attrp->igsa_lock);
11037 }
11038 /*
11039 * Return all IRE types for route table... let caller pick and choose
11040 */
11041 re->ipv6RouteDest = ire->ire_addr_v6;
11042 re->ipv6RoutePfxLength = ip_mask_to_plen_v6(&ire->ire_mask_v6);
11043 re->ipv6RouteIndex = 0; /* Unique when multiple with same dest/plen */
11044 re->ipv6RouteIfIndex.o_length = 0;
11045 ill = ire->ire_ill;
11046 if (ill != NULL) {
11047 ill_get_name(ill, re->ipv6RouteIfIndex.o_bytes, OCTET_LENGTH);
11048 re->ipv6RouteIfIndex.o_length =
11049 mi_strlen(re->ipv6RouteIfIndex.o_bytes);
11050 }
11051
11052 ASSERT(!(ire->ire_type & IRE_BROADCAST));
11053
11054 mutex_enter(&ire->ire_lock);
11055 re->ipv6RouteNextHop = ire->ire_gateway_addr_v6;
11056 mutex_exit(&ire->ire_lock);
11057
11058 /* remote(4), local(3), or discard(2) */
11059 if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE))
11060 re->ipv6RouteType = 2;
11061 else if (ire->ire_type & IRE_ONLINK)
11062 re->ipv6RouteType = 3;
11063 else
11064 re->ipv6RouteType = 4;
11065
11066 re->ipv6RouteProtocol = -1;
11067 re->ipv6RoutePolicy = 0;
11068 re->ipv6RouteAge = gethrestime_sec() - ire->ire_create_time;
11069 re->ipv6RouteNextHopRDI = 0;
11070 re->ipv6RouteWeight = 0;
11071 re->ipv6RouteMetric = 0;
11072 re->ipv6RouteInfo.re_max_frag = ire->ire_metrics.iulp_mtu;
11073 if (ire->ire_ill != NULL && re->ipv6RouteInfo.re_max_frag == 0)
11074 re->ipv6RouteInfo.re_max_frag = ire->ire_ill->ill_mtu;
11075
11076 re->ipv6RouteInfo.re_frag_flag = 0;
11077 re->ipv6RouteInfo.re_rtt = 0;
11078 re->ipv6RouteInfo.re_src_addr = ipv6_all_zeros;
11079 re->ipv6RouteInfo.re_obpkt = ire->ire_ob_pkt_count;
11080 re->ipv6RouteInfo.re_ibpkt = ire->ire_ib_pkt_count;
11081 re->ipv6RouteInfo.re_ref = ire->ire_refcnt;
11082 re->ipv6RouteInfo.re_flags = ire->ire_flags;
11083
11084 /* Add the IRE_IF_CLONE's counters to their parent IRE_INTERFACE */
11085 if (ire->ire_type & IRE_INTERFACE) {
11086 ire_t *child;
11087
11088 rw_enter(&ipst->ips_ire_dep_lock, RW_READER);
11089 child = ire->ire_dep_children;
11090 while (child != NULL) {
11091 re->ipv6RouteInfo.re_obpkt += child->ire_ob_pkt_count;
11092 re->ipv6RouteInfo.re_ibpkt += child->ire_ib_pkt_count;
11093 child = child->ire_dep_sib_next;
11094 }
11095 rw_exit(&ipst->ips_ire_dep_lock);
11096 }
11097 if (ire->ire_flags & RTF_DYNAMIC) {
11098 re->ipv6RouteInfo.re_ire_type = IRE_HOST_REDIRECT;
11099 } else {
11100 re->ipv6RouteInfo.re_ire_type = ire->ire_type;
11101 }
11102
11103 if (!snmp_append_data2(ird->ird_route.lp_head, &ird->ird_route.lp_tail,
11104 (char *)re, (int)sizeof (*re))) {
11105 ip1dbg(("ip_snmp_get2_v6: failed to allocate %u bytes\n",
11106 (uint_t)sizeof (*re)));
11107 }
11108
11109 if (gc != NULL) {
11110 iaes.iae_routeidx = ird->ird_idx;
11111 iaes.iae_doi = gc->gc_db->gcdb_doi;
11112 iaes.iae_slrange = gc->gc_db->gcdb_slrange;
11113
11114 if (!snmp_append_data2(ird->ird_attrs.lp_head,
11115 &ird->ird_attrs.lp_tail, (char *)&iaes, sizeof (iaes))) {
11116 ip1dbg(("ip_snmp_get2_v6: failed to allocate %u "
11117 "bytes\n", (uint_t)sizeof (iaes)));
11118 }
11119 }
11120
11121 /* bump route index for next pass */
11122 ird->ird_idx++;
11123
11124 kmem_free(re, sizeof (*re));
11125 if (gcgrp != NULL)
11126 rw_exit(&gcgrp->gcgrp_rwlock);
11127 }
11128
11129 /*
11130 * ncec_walk routine to create ipv6NetToMediaEntryTable
11131 */
11132 static int
11133 ip_snmp_get2_v6_media(ncec_t *ncec, iproutedata_t *ird)
11134 {
11135 ill_t *ill;
11136 mib2_ipv6NetToMediaEntry_t ntme;
11137
11138 ill = ncec->ncec_ill;
11139 /* skip arpce entries, and loopback ncec entries */
11140 if (ill->ill_isv6 == B_FALSE || ill->ill_net_type == IRE_LOOPBACK)
11141 return (0);
	/*
	 * Neighbor cache entry attached to an IRE with an on-link
	 * destination.  We report all IPMP groups on ncec_ill, which is
	 * normally the upper ill.
	 */
11147 ntme.ipv6NetToMediaIfIndex = ill->ill_phyint->phyint_ifindex;
11148 ntme.ipv6NetToMediaNetAddress = ncec->ncec_addr;
11149 ntme.ipv6NetToMediaPhysAddress.o_length = ill->ill_phys_addr_length;
11150 if (ncec->ncec_lladdr != NULL) {
11151 bcopy(ncec->ncec_lladdr, ntme.ipv6NetToMediaPhysAddress.o_bytes,
11152 ntme.ipv6NetToMediaPhysAddress.o_length);
11153 }
11154 /*
11155 * Note: Returns ND_* states. Should be:
11156 * reachable(1), stale(2), delay(3), probe(4),
11157 * invalid(5), unknown(6)
11158 */
11159 ntme.ipv6NetToMediaState = ncec->ncec_state;
11160 ntme.ipv6NetToMediaLastUpdated = 0;
11161
11162 /* other(1), dynamic(2), static(3), local(4) */
11163 if (NCE_MYADDR(ncec)) {
11164 ntme.ipv6NetToMediaType = 4;
11165 } else if (ncec->ncec_flags & NCE_F_PUBLISH) {
11166 ntme.ipv6NetToMediaType = 1; /* proxy */
11167 } else if (ncec->ncec_flags & NCE_F_STATIC) {
11168 ntme.ipv6NetToMediaType = 3;
11169 } else if (ncec->ncec_flags & (NCE_F_MCAST|NCE_F_BCAST)) {
11170 ntme.ipv6NetToMediaType = 1;
11171 } else {
11172 ntme.ipv6NetToMediaType = 2;
11173 }
11174
11175 if (!snmp_append_data2(ird->ird_netmedia.lp_head,
11176 &ird->ird_netmedia.lp_tail, (char *)&ntme, sizeof (ntme))) {
11177 ip1dbg(("ip_snmp_get2_v6_media: failed to allocate %u bytes\n",
11178 (uint_t)sizeof (ntme)));
11179 }
11180 return (0);
11181 }
11182
11183 int
11184 nce2ace(ncec_t *ncec)
11185 {
11186 int flags = 0;
11187
11188 if (NCE_ISREACHABLE(ncec))
11189 flags |= ACE_F_RESOLVED;
11190 if (ncec->ncec_flags & NCE_F_AUTHORITY)
11191 flags |= ACE_F_AUTHORITY;
11192 if (ncec->ncec_flags & NCE_F_PUBLISH)
11193 flags |= ACE_F_PUBLISH;
11194 if ((ncec->ncec_flags & NCE_F_NONUD) != 0)
11195 flags |= ACE_F_PERMANENT;
11196 if (NCE_MYADDR(ncec))
11197 flags |= (ACE_F_MYADDR | ACE_F_AUTHORITY);
11198 if (ncec->ncec_flags & NCE_F_UNVERIFIED)
11199 flags |= ACE_F_UNVERIFIED;
11202 if (ncec->ncec_flags & NCE_F_DELAYED)
11203 flags |= ACE_F_DELAYED;
11204 return (flags);
11205 }
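
/*
 * Usage sketch for nce2ace(): the returned ACE_F_* bits are placed in
 * the legacy ntm_flags field of an ipNetToMediaEntryTable record, e.g.
 *
 *	ntme.ipNetToMediaInfo.ntm_flags = nce2ace(ncec);
 */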
11206
11207 /*
11208 * ncec_walk routine to create ipNetToMediaEntryTable
11209 */
11210 static int
11211 ip_snmp_get2_v4_media(ncec_t *ncec, iproutedata_t *ird)
11212 {
11213 ill_t *ill;
11214 mib2_ipNetToMediaEntry_t ntme;
11215 const char *name = "unknown";
11216 ipaddr_t ncec_addr;
11217
11218 ill = ncec->ncec_ill;
11219 if (ill->ill_isv6 || (ncec->ncec_flags & NCE_F_BCAST) ||
11220 ill->ill_net_type == IRE_LOOPBACK)
11221 return (0);
11222
	/* We report all IPMP groups on ncec_ill, normally the upper ill. */
11224 name = ill->ill_name;
11225 /* Based on RFC 4293: other(1), inval(2), dyn(3), stat(4) */
11226 if (NCE_MYADDR(ncec)) {
11227 ntme.ipNetToMediaType = 4;
11228 } else if (ncec->ncec_flags & (NCE_F_MCAST|NCE_F_BCAST|NCE_F_PUBLISH)) {
11229 ntme.ipNetToMediaType = 1;
11230 } else {
11231 ntme.ipNetToMediaType = 3;
11232 }
11233 ntme.ipNetToMediaIfIndex.o_length = MIN(OCTET_LENGTH, strlen(name));
11234 bcopy(name, ntme.ipNetToMediaIfIndex.o_bytes,
11235 ntme.ipNetToMediaIfIndex.o_length);
11236
11237 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
11238 bcopy(&ncec_addr, &ntme.ipNetToMediaNetAddress, sizeof (ncec_addr));
11239
11240 ntme.ipNetToMediaInfo.ntm_mask.o_length = sizeof (ipaddr_t);
11241 ncec_addr = INADDR_BROADCAST;
11242 bcopy(&ncec_addr, ntme.ipNetToMediaInfo.ntm_mask.o_bytes,
11243 sizeof (ncec_addr));
11244 /*
11245 * map all the flags to the ACE counterpart.
11246 */
11247 ntme.ipNetToMediaInfo.ntm_flags = nce2ace(ncec);
11248
11249 ntme.ipNetToMediaPhysAddress.o_length =
11250 MIN(OCTET_LENGTH, ill->ill_phys_addr_length);
11251
11252 if (!NCE_ISREACHABLE(ncec))
11253 ntme.ipNetToMediaPhysAddress.o_length = 0;
11254 else {
11255 if (ncec->ncec_lladdr != NULL) {
11256 bcopy(ncec->ncec_lladdr,
11257 ntme.ipNetToMediaPhysAddress.o_bytes,
11258 ntme.ipNetToMediaPhysAddress.o_length);
11259 }
11260 }
11261
11262 if (!snmp_append_data2(ird->ird_netmedia.lp_head,
11263 &ird->ird_netmedia.lp_tail, (char *)&ntme, sizeof (ntme))) {
11264 ip1dbg(("ip_snmp_get2_v4_media: failed to allocate %u bytes\n",
11265 (uint_t)sizeof (ntme)));
11266 }
11267 return (0);
11268 }
11269
/*
 * Return 0 for an invalid set request, 1 otherwise (including non-TCP
 * requests).
 */
11273 /* ARGSUSED */
11274 int
11275 ip_snmp_set(queue_t *q, int level, int name, uchar_t *ptr, int len)
11276 {
11277 switch (level) {
11278 case MIB2_IP:
11279 case MIB2_ICMP:
11280 switch (name) {
11281 default:
11282 break;
11283 }
11284 return (1);
11285 default:
11286 return (1);
11287 }
11288 }
11289
/*
 * When there exist both a 64- and a 32-bit counter of a particular type
 * (e.g., InReceives), only the 64-bit counters are added.
 */
11294 void
11295 ip_mib2_add_ip_stats(mib2_ipIfStatsEntry_t *o1, mib2_ipIfStatsEntry_t *o2)
11296 {
11297 UPDATE_MIB(o1, ipIfStatsInHdrErrors, o2->ipIfStatsInHdrErrors);
11298 UPDATE_MIB(o1, ipIfStatsInTooBigErrors, o2->ipIfStatsInTooBigErrors);
11299 UPDATE_MIB(o1, ipIfStatsInNoRoutes, o2->ipIfStatsInNoRoutes);
11300 UPDATE_MIB(o1, ipIfStatsInAddrErrors, o2->ipIfStatsInAddrErrors);
11301 UPDATE_MIB(o1, ipIfStatsInUnknownProtos, o2->ipIfStatsInUnknownProtos);
11302 UPDATE_MIB(o1, ipIfStatsInTruncatedPkts, o2->ipIfStatsInTruncatedPkts);
11303 UPDATE_MIB(o1, ipIfStatsInDiscards, o2->ipIfStatsInDiscards);
11304 UPDATE_MIB(o1, ipIfStatsOutDiscards, o2->ipIfStatsOutDiscards);
11305 UPDATE_MIB(o1, ipIfStatsOutFragOKs, o2->ipIfStatsOutFragOKs);
11306 UPDATE_MIB(o1, ipIfStatsOutFragFails, o2->ipIfStatsOutFragFails);
11307 UPDATE_MIB(o1, ipIfStatsOutFragCreates, o2->ipIfStatsOutFragCreates);
11308 UPDATE_MIB(o1, ipIfStatsReasmReqds, o2->ipIfStatsReasmReqds);
11309 UPDATE_MIB(o1, ipIfStatsReasmOKs, o2->ipIfStatsReasmOKs);
11310 UPDATE_MIB(o1, ipIfStatsReasmFails, o2->ipIfStatsReasmFails);
11311 UPDATE_MIB(o1, ipIfStatsOutNoRoutes, o2->ipIfStatsOutNoRoutes);
11312 UPDATE_MIB(o1, ipIfStatsReasmDuplicates, o2->ipIfStatsReasmDuplicates);
11313 UPDATE_MIB(o1, ipIfStatsReasmPartDups, o2->ipIfStatsReasmPartDups);
11314 UPDATE_MIB(o1, ipIfStatsForwProhibits, o2->ipIfStatsForwProhibits);
11315 UPDATE_MIB(o1, udpInCksumErrs, o2->udpInCksumErrs);
11316 UPDATE_MIB(o1, udpInOverflows, o2->udpInOverflows);
11317 UPDATE_MIB(o1, rawipInOverflows, o2->rawipInOverflows);
11318 UPDATE_MIB(o1, ipIfStatsInWrongIPVersion,
11319 o2->ipIfStatsInWrongIPVersion);
	UPDATE_MIB(o1, ipIfStatsOutWrongIPVersion,
	    o2->ipIfStatsOutWrongIPVersion);
11322 UPDATE_MIB(o1, ipIfStatsOutSwitchIPVersion,
11323 o2->ipIfStatsOutSwitchIPVersion);
11324 UPDATE_MIB(o1, ipIfStatsHCInReceives, o2->ipIfStatsHCInReceives);
11325 UPDATE_MIB(o1, ipIfStatsHCInOctets, o2->ipIfStatsHCInOctets);
11326 UPDATE_MIB(o1, ipIfStatsHCInForwDatagrams,
11327 o2->ipIfStatsHCInForwDatagrams);
11328 UPDATE_MIB(o1, ipIfStatsHCInDelivers, o2->ipIfStatsHCInDelivers);
11329 UPDATE_MIB(o1, ipIfStatsHCOutRequests, o2->ipIfStatsHCOutRequests);
11330 UPDATE_MIB(o1, ipIfStatsHCOutForwDatagrams,
11331 o2->ipIfStatsHCOutForwDatagrams);
11332 UPDATE_MIB(o1, ipIfStatsOutFragReqds, o2->ipIfStatsOutFragReqds);
11333 UPDATE_MIB(o1, ipIfStatsHCOutTransmits, o2->ipIfStatsHCOutTransmits);
11334 UPDATE_MIB(o1, ipIfStatsHCOutOctets, o2->ipIfStatsHCOutOctets);
11335 UPDATE_MIB(o1, ipIfStatsHCInMcastPkts, o2->ipIfStatsHCInMcastPkts);
11336 UPDATE_MIB(o1, ipIfStatsHCInMcastOctets, o2->ipIfStatsHCInMcastOctets);
11337 UPDATE_MIB(o1, ipIfStatsHCOutMcastPkts, o2->ipIfStatsHCOutMcastPkts);
11338 UPDATE_MIB(o1, ipIfStatsHCOutMcastOctets,
11339 o2->ipIfStatsHCOutMcastOctets);
11340 UPDATE_MIB(o1, ipIfStatsHCInBcastPkts, o2->ipIfStatsHCInBcastPkts);
11341 UPDATE_MIB(o1, ipIfStatsHCOutBcastPkts, o2->ipIfStatsHCOutBcastPkts);
11342 UPDATE_MIB(o1, ipsecInSucceeded, o2->ipsecInSucceeded);
11343 UPDATE_MIB(o1, ipsecInFailed, o2->ipsecInFailed);
11344 UPDATE_MIB(o1, ipInCksumErrs, o2->ipInCksumErrs);
11345 UPDATE_MIB(o1, tcpInErrs, o2->tcpInErrs);
11346 UPDATE_MIB(o1, udpNoPorts, o2->udpNoPorts);
11347 }
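
/*
 * UPDATE_MIB(x, y, z) accumulates as x->y += z, so the routine above is
 * a plain field-by-field summation of o2 into o1.  Only the 64-bit
 * ("HC") member of each paired counter is added; a 32-bit view can be
 * re-derived afterwards with SYNC32_MIB(), as the per-ill report code
 * above does.  Illustrative aggregation, assuming a zeroed total:
 *
 *	mib2_ipIfStatsEntry_t total;
 *
 *	bzero(&total, sizeof (total));
 *	for each ill of interest:
 *		ip_mib2_add_ip_stats(&total, ill->ill_ip_mib);
 */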
11348
11349 void
11350 ip_mib2_add_icmp6_stats(mib2_ipv6IfIcmpEntry_t *o1, mib2_ipv6IfIcmpEntry_t *o2)
11351 {
11352 UPDATE_MIB(o1, ipv6IfIcmpInMsgs, o2->ipv6IfIcmpInMsgs);
11353 UPDATE_MIB(o1, ipv6IfIcmpInErrors, o2->ipv6IfIcmpInErrors);
11354 UPDATE_MIB(o1, ipv6IfIcmpInDestUnreachs, o2->ipv6IfIcmpInDestUnreachs);
11355 UPDATE_MIB(o1, ipv6IfIcmpInAdminProhibs, o2->ipv6IfIcmpInAdminProhibs);
11356 UPDATE_MIB(o1, ipv6IfIcmpInTimeExcds, o2->ipv6IfIcmpInTimeExcds);
11357 UPDATE_MIB(o1, ipv6IfIcmpInParmProblems, o2->ipv6IfIcmpInParmProblems);
11358 UPDATE_MIB(o1, ipv6IfIcmpInPktTooBigs, o2->ipv6IfIcmpInPktTooBigs);
11359 UPDATE_MIB(o1, ipv6IfIcmpInEchos, o2->ipv6IfIcmpInEchos);
11360 UPDATE_MIB(o1, ipv6IfIcmpInEchoReplies, o2->ipv6IfIcmpInEchoReplies);
11361 UPDATE_MIB(o1, ipv6IfIcmpInRouterSolicits,
11362 o2->ipv6IfIcmpInRouterSolicits);
11363 UPDATE_MIB(o1, ipv6IfIcmpInRouterAdvertisements,
11364 o2->ipv6IfIcmpInRouterAdvertisements);
11365 UPDATE_MIB(o1, ipv6IfIcmpInNeighborSolicits,
11366 o2->ipv6IfIcmpInNeighborSolicits);
11367 UPDATE_MIB(o1, ipv6IfIcmpInNeighborAdvertisements,
11368 o2->ipv6IfIcmpInNeighborAdvertisements);
11369 UPDATE_MIB(o1, ipv6IfIcmpInRedirects, o2->ipv6IfIcmpInRedirects);
11370 UPDATE_MIB(o1, ipv6IfIcmpInGroupMembQueries,
11371 o2->ipv6IfIcmpInGroupMembQueries);
11372 UPDATE_MIB(o1, ipv6IfIcmpInGroupMembResponses,
11373 o2->ipv6IfIcmpInGroupMembResponses);
11374 UPDATE_MIB(o1, ipv6IfIcmpInGroupMembReductions,
11375 o2->ipv6IfIcmpInGroupMembReductions);
11376 UPDATE_MIB(o1, ipv6IfIcmpOutMsgs, o2->ipv6IfIcmpOutMsgs);
11377 UPDATE_MIB(o1, ipv6IfIcmpOutErrors, o2->ipv6IfIcmpOutErrors);
11378 UPDATE_MIB(o1, ipv6IfIcmpOutDestUnreachs,
11379 o2->ipv6IfIcmpOutDestUnreachs);
11380 UPDATE_MIB(o1, ipv6IfIcmpOutAdminProhibs,
11381 o2->ipv6IfIcmpOutAdminProhibs);
11382 UPDATE_MIB(o1, ipv6IfIcmpOutTimeExcds, o2->ipv6IfIcmpOutTimeExcds);
11383 UPDATE_MIB(o1, ipv6IfIcmpOutParmProblems,
11384 o2->ipv6IfIcmpOutParmProblems);
11385 UPDATE_MIB(o1, ipv6IfIcmpOutPktTooBigs, o2->ipv6IfIcmpOutPktTooBigs);
11386 UPDATE_MIB(o1, ipv6IfIcmpOutEchos, o2->ipv6IfIcmpOutEchos);
11387 UPDATE_MIB(o1, ipv6IfIcmpOutEchoReplies, o2->ipv6IfIcmpOutEchoReplies);
11388 UPDATE_MIB(o1, ipv6IfIcmpOutRouterSolicits,
11389 o2->ipv6IfIcmpOutRouterSolicits);
11390 UPDATE_MIB(o1, ipv6IfIcmpOutRouterAdvertisements,
11391 o2->ipv6IfIcmpOutRouterAdvertisements);
11392 UPDATE_MIB(o1, ipv6IfIcmpOutNeighborSolicits,
11393 o2->ipv6IfIcmpOutNeighborSolicits);
11394 UPDATE_MIB(o1, ipv6IfIcmpOutNeighborAdvertisements,
11395 o2->ipv6IfIcmpOutNeighborAdvertisements);
11396 UPDATE_MIB(o1, ipv6IfIcmpOutRedirects, o2->ipv6IfIcmpOutRedirects);
11397 UPDATE_MIB(o1, ipv6IfIcmpOutGroupMembQueries,
11398 o2->ipv6IfIcmpOutGroupMembQueries);
11399 UPDATE_MIB(o1, ipv6IfIcmpOutGroupMembResponses,
11400 o2->ipv6IfIcmpOutGroupMembResponses);
11401 UPDATE_MIB(o1, ipv6IfIcmpOutGroupMembReductions,
11402 o2->ipv6IfIcmpOutGroupMembReductions);
11403 UPDATE_MIB(o1, ipv6IfIcmpInOverflows, o2->ipv6IfIcmpInOverflows);
11404 UPDATE_MIB(o1, ipv6IfIcmpBadHoplimit, o2->ipv6IfIcmpBadHoplimit);
11405 UPDATE_MIB(o1, ipv6IfIcmpInBadNeighborAdvertisements,
11406 o2->ipv6IfIcmpInBadNeighborAdvertisements);
11407 UPDATE_MIB(o1, ipv6IfIcmpInBadNeighborSolicitations,
11408 o2->ipv6IfIcmpInBadNeighborSolicitations);
11409 UPDATE_MIB(o1, ipv6IfIcmpInBadRedirects, o2->ipv6IfIcmpInBadRedirects);
11410 UPDATE_MIB(o1, ipv6IfIcmpInGroupMembTotal,
11411 o2->ipv6IfIcmpInGroupMembTotal);
11412 UPDATE_MIB(o1, ipv6IfIcmpInGroupMembBadQueries,
11413 o2->ipv6IfIcmpInGroupMembBadQueries);
11414 UPDATE_MIB(o1, ipv6IfIcmpInGroupMembBadReports,
11415 o2->ipv6IfIcmpInGroupMembBadReports);
11416 UPDATE_MIB(o1, ipv6IfIcmpInGroupMembOurReports,
11417 o2->ipv6IfIcmpInGroupMembOurReports);
11418 }
11419
11420 /*
11421 * Called before the options are updated to check if this packet will
11422 * be source routed from here.
11423 * This routine assumes that the options are well formed i.e. that they
11424 * have already been checked.
11425 */
11426 boolean_t
11427 ip_source_routed(ipha_t *ipha, ip_stack_t *ipst)
11428 {
11429 ipoptp_t opts;
11430 uchar_t *opt;
11431 uint8_t optval;
11432 uint8_t optlen;
11433 ipaddr_t dst;
11434
11435 if (IS_SIMPLE_IPH(ipha)) {
11436 ip2dbg(("not source routed\n"));
11437 return (B_FALSE);
11438 }
11439 dst = ipha->ipha_dst;
11440 for (optval = ipoptp_first(&opts, ipha);
11441 optval != IPOPT_EOL;
11442 optval = ipoptp_next(&opts)) {
11443 ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
11444 opt = opts.ipoptp_cur;
11445 optlen = opts.ipoptp_len;
11446 ip2dbg(("ip_source_routed: opt %d, len %d\n",
11447 optval, optlen));
11448 switch (optval) {
11449 uint32_t off;
11450 case IPOPT_SSRR:
11451 case IPOPT_LSRR:
			/*
			 * If dst is one of our addresses and there are
			 * entries left in the source route, return B_TRUE.
			 */
11456 if (ip_type_v4(dst, ipst) != IRE_LOCAL) {
11457 ip2dbg(("ip_source_routed: not next"
11458 " source route 0x%x\n",
11459 ntohl(dst)));
11460 return (B_FALSE);
11461 }
11462 off = opt[IPOPT_OFFSET];
11463 off--;
11464 if (optlen < IP_ADDR_LEN ||
11465 off > optlen - IP_ADDR_LEN) {
11466 /* End of source route */
11467 ip1dbg(("ip_source_routed: end of SR\n"));
11468 return (B_FALSE);
11469 }
11470 return (B_TRUE);
11471 }
11472 }
11473 ip2dbg(("not source routed\n"));
11474 return (B_FALSE);
11475 }
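
/*
 * The ipoptp_first()/ipoptp_next() iterator used above is the standard
 * way to walk an already-sanity-checked IPv4 option list; a minimal
 * sketch:
 *
 *	ipoptp_t opts;
 *	uint8_t optval;
 *
 *	for (optval = ipoptp_first(&opts, ipha); optval != IPOPT_EOL;
 *	    optval = ipoptp_next(&opts)) {
 *		uchar_t *opt = opts.ipoptp_cur;		(option bytes)
 *		uint8_t optlen = opts.ipoptp_len;	(incl. type/len)
 *		...
 *	}
 */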
11476
11477 /*
11478 * ip_unbind is called by the transports to remove a conn from
11479 * the fanout table.
11480 */
11481 void
11482 ip_unbind(conn_t *connp)
{
	ASSERT(!MUTEX_HELD(&connp->conn_lock));
11486
11487 if (is_system_labeled() && connp->conn_anon_port) {
11488 (void) tsol_mlp_anon(crgetzone(connp->conn_cred),
11489 connp->conn_mlp_type, connp->conn_proto,
11490 ntohs(connp->conn_lport), B_FALSE);
11491 connp->conn_anon_port = 0;
11492 }
11493 connp->conn_mlp_type = mlptSingle;
11494
11495 ipcl_hash_remove(connp);
11496 }
11497
/*
 * Used for deciding the MSS for the upper layer, so we need to check
 * the outbound policy values in the conn.
 */
11502 int
11503 conn_ipsec_length(conn_t *connp)
11504 {
11505 ipsec_latch_t *ipl;
11506
11507 ipl = connp->conn_latch;
11508 if (ipl == NULL)
11509 return (0);
11510
11511 if (connp->conn_ixa->ixa_ipsec_policy == NULL)
11512 return (0);
11513
11514 return (connp->conn_ixa->ixa_ipsec_policy->ipsp_act->ipa_ovhd);
11515 }
11516
11517 /*
11518 * Returns an estimate of the IPsec headers size. This is used if
11519 * we don't want to call into IPsec to get the exact size.
11520 */
11521 int
11522 ipsec_out_extra_length(ip_xmit_attr_t *ixa)
11523 {
11524 ipsec_action_t *a;
11525
11526 if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE))
11527 return (0);
11528
11529 a = ixa->ixa_ipsec_action;
11530 if (a == NULL) {
11531 ASSERT(ixa->ixa_ipsec_policy != NULL);
11532 a = ixa->ixa_ipsec_policy->ipsp_act;
11533 }
11534 ASSERT(a != NULL);
11535
11536 return (a->ipa_ovhd);
11537 }
11538
11539 /*
11540 * If there are any source route options, return the true final
11541 * destination. Otherwise, return the destination.
11542 */
11543 ipaddr_t
11544 ip_get_dst(ipha_t *ipha)
11545 {
11546 ipoptp_t opts;
11547 uchar_t *opt;
11548 uint8_t optval;
11549 uint8_t optlen;
11550 ipaddr_t dst;
11551 uint32_t off;
11552
11553 dst = ipha->ipha_dst;
11554
11555 if (IS_SIMPLE_IPH(ipha))
11556 return (dst);
11557
11558 for (optval = ipoptp_first(&opts, ipha);
11559 optval != IPOPT_EOL;
11560 optval = ipoptp_next(&opts)) {
11561 opt = opts.ipoptp_cur;
11562 optlen = opts.ipoptp_len;
11563 ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
11564 switch (optval) {
11565 case IPOPT_SSRR:
11566 case IPOPT_LSRR:
11567 off = opt[IPOPT_OFFSET];
11568 /*
11569 * If one of the conditions is true, it means
11570 * end of options and dst already has the right
11571 * value.
11572 */
11573 if (!(optlen < IP_ADDR_LEN || off > optlen - 3)) {
11574 off = optlen - IP_ADDR_LEN;
11575 bcopy(&opt[off], &dst, IP_ADDR_LEN);
11576 }
11577 return (dst);
11578 default:
11579 break;
11580 }
11581 }
11582
11583 return (dst);
11584 }
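
/*
 * Worked example for ip_get_dst() with hypothetical addresses: for an
 * LSRR option holding addresses A and B whose pointer has not yet run
 * off the end, the true final destination is the last address in the
 * option:
 *
 *	off = optlen - IP_ADDR_LEN;
 *	bcopy(&opt[off], &dst, IP_ADDR_LEN);	(dst becomes B)
 */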
11585
11586 /*
11587 * Outbound IP fragmentation routine.
11588 * Assumes the caller has checked whether or not fragmentation should
11589 * be allowed. Here we copy the DF bit from the header to all the generated
11590 * fragments.
11591 */
11592 int
11593 ip_fragment_v4(mblk_t *mp_orig, nce_t *nce, iaflags_t ixaflags,
11594 uint_t pkt_len, uint32_t max_frag, uint32_t xmit_hint, zoneid_t szone,
11595 zoneid_t nolzid, pfirepostfrag_t postfragfn, uintptr_t *ixa_cookie)
11596 {
11597 int i1;
11598 int hdr_len;
11599 mblk_t *hdr_mp;
11600 ipha_t *ipha;
11601 int ip_data_end;
11602 int len;
11603 mblk_t *mp = mp_orig;
11604 int offset;
11605 ill_t *ill = nce->nce_ill;
11606 ip_stack_t *ipst = ill->ill_ipst;
11607 mblk_t *carve_mp;
11608 uint32_t frag_flag;
11609 uint_t priority = mp->b_band;
11610 int error = 0;
11611
11612 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragReqds);
11613
11614 if (pkt_len != msgdsize(mp)) {
11615 ip0dbg(("Packet length mismatch: %d, %ld\n",
11616 pkt_len, msgdsize(mp)));
11617 freemsg(mp);
11618 return (EINVAL);
11619 }
11620
11621 if (max_frag == 0) {
11622 ip1dbg(("ip_fragment_v4: max_frag is zero. Dropping packet\n"));
11623 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
11624 ip_drop_output("FragFails: zero max_frag", mp, ill);
11625 freemsg(mp);
11626 return (EINVAL);
11627 }
11628
11629 ASSERT(MBLKL(mp) >= sizeof (ipha_t));
11630 ipha = (ipha_t *)mp->b_rptr;
11631 ASSERT(ntohs(ipha->ipha_length) == pkt_len);
11632 frag_flag = ntohs(ipha->ipha_fragment_offset_and_flags) & IPH_DF;
11633
11634 /*
11635 * Establish the starting offset. May not be zero if we are fragging
11636 * a fragment that is being forwarded.
11637 */
11638 offset = ntohs(ipha->ipha_fragment_offset_and_flags) & IPH_OFFSET;
11639
11640 /* TODO why is this test needed? */
11641 if (((max_frag - ntohs(ipha->ipha_length)) & ~7) < 8) {
11642 /* TODO: notify ulp somehow */
11643 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
11644 ip_drop_output("FragFails: bad starting offset", mp, ill);
11645 freemsg(mp);
11646 return (EINVAL);
11647 }
11648
11649 hdr_len = IPH_HDR_LENGTH(ipha);
11650 ipha->ipha_hdr_checksum = 0;
11651
11652 /*
11653 * Establish the number of bytes maximum per frag, after putting
11654 * in the header.
11655 */
11656 len = (max_frag - hdr_len) & ~7;
11657
11658 /* Get a copy of the header for the trailing frags */
11659 hdr_mp = ip_fragment_copyhdr((uchar_t *)ipha, hdr_len, offset, ipst,
11660 mp);
11661 if (hdr_mp == NULL) {
11662 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
11663 ip_drop_output("FragFails: no hdr_mp", mp, ill);
11664 freemsg(mp);
11665 return (ENOBUFS);
11666 }
11667
11668 /* Store the starting offset, with the MoreFrags flag. */
11669 i1 = offset | IPH_MF | frag_flag;
11670 ipha->ipha_fragment_offset_and_flags = htons((uint16_t)i1);
11671
11672 /* Establish the ending byte offset, based on the starting offset. */
11673 offset <<= 3;
11674 ip_data_end = offset + ntohs(ipha->ipha_length) - hdr_len;
11675
11676 /* Store the length of the first fragment in the IP header. */
11677 i1 = len + hdr_len;
11678 ASSERT(i1 <= IP_MAXPACKET);
11679 ipha->ipha_length = htons((uint16_t)i1);
11680
11681 /*
11682 * Compute the IP header checksum for the first frag. We have to
11683 * watch out that we stop at the end of the header.
11684 */
11685 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
11686
11687 /*
11688 * Now carve off the first frag. Note that this will include the
11689 * original IP header.
11690 */
11691 if (!(mp = ip_carve_mp(&mp_orig, i1))) {
11692 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
11693 ip_drop_output("FragFails: could not carve mp", mp_orig, ill);
11694 freeb(hdr_mp);
11695 freemsg(mp_orig);
11696 return (ENOBUFS);
11697 }
11698
11699 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragCreates);
11700
11701 error = postfragfn(mp, nce, ixaflags, i1, xmit_hint, szone, nolzid,
11702 ixa_cookie);
11703 if (error != 0 && error != EWOULDBLOCK) {
11704 /* No point in sending the other fragments */
11705 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
11706 ip_drop_output("FragFails: postfragfn failed", mp_orig, ill);
11707 freeb(hdr_mp);
11708 freemsg(mp_orig);
11709 return (error);
11710 }
11711
11712 /* No need to redo state machine in loop */
11713 ixaflags &= ~IXAF_REACH_CONF;
11714
11715 /* Advance the offset to the second frag starting point. */
11716 offset += len;
11717 /*
11718 * Update hdr_len from the copied header - there might be less options
11719 * in the later fragments.
11720 */
11721 hdr_len = IPH_HDR_LENGTH(hdr_mp->b_rptr);
11722 /* Loop until done. */
11723 for (;;) {
11724 uint16_t offset_and_flags;
11725 uint16_t ip_len;
11726
11727 if (ip_data_end - offset > len) {
11728 /*
11729 * Carve off the appropriate amount from the original
11730 * datagram.
11731 */
11732 if (!(carve_mp = ip_carve_mp(&mp_orig, len))) {
11733 mp = NULL;
11734 break;
11735 }
11736 /*
11737 * More frags after this one. Get another copy
11738 * of the header.
11739 */
11740 if (carve_mp->b_datap->db_ref == 1 &&
11741 hdr_mp->b_wptr - hdr_mp->b_rptr <
11742 carve_mp->b_rptr - carve_mp->b_datap->db_base) {
11743 /* Inline IP header */
11744 carve_mp->b_rptr -= hdr_mp->b_wptr -
11745 hdr_mp->b_rptr;
11746 bcopy(hdr_mp->b_rptr, carve_mp->b_rptr,
11747 hdr_mp->b_wptr - hdr_mp->b_rptr);
11748 mp = carve_mp;
11749 } else {
11750 if (!(mp = copyb(hdr_mp))) {
11751 freemsg(carve_mp);
11752 break;
11753 }
11754 /* Get priority marking, if any. */
11755 mp->b_band = priority;
11756 mp->b_cont = carve_mp;
11757 }
11758 ipha = (ipha_t *)mp->b_rptr;
11759 offset_and_flags = IPH_MF;
11760 } else {
11761 /*
11762 * Last frag. Consume the header. Set len to
11763 * the length of this last piece.
11764 */
11765 len = ip_data_end - offset;
11766
11767 /*
11768 * Carve off the appropriate amount from the original
11769 * datagram.
11770 */
11771 if (!(carve_mp = ip_carve_mp(&mp_orig, len))) {
11772 mp = NULL;
11773 break;
11774 }
11775 if (carve_mp->b_datap->db_ref == 1 &&
11776 hdr_mp->b_wptr - hdr_mp->b_rptr <
11777 carve_mp->b_rptr - carve_mp->b_datap->db_base) {
11778 /* Inline IP header */
11779 carve_mp->b_rptr -= hdr_mp->b_wptr -
11780 hdr_mp->b_rptr;
11781 bcopy(hdr_mp->b_rptr, carve_mp->b_rptr,
11782 hdr_mp->b_wptr - hdr_mp->b_rptr);
11783 mp = carve_mp;
11784 freeb(hdr_mp);
11785 hdr_mp = mp;
11786 } else {
11787 mp = hdr_mp;
11788 /* Get priority marking, if any. */
11789 mp->b_band = priority;
11790 mp->b_cont = carve_mp;
11791 }
11792 ipha = (ipha_t *)mp->b_rptr;
11793 /* A frag of a frag might have IPH_MF non-zero */
11794 offset_and_flags =
11795 ntohs(ipha->ipha_fragment_offset_and_flags) &
11796 IPH_MF;
11797 }
11798 offset_and_flags |= (uint16_t)(offset >> 3);
11799 offset_and_flags |= (uint16_t)frag_flag;
11800 /* Store the offset and flags in the IP header. */
11801 ipha->ipha_fragment_offset_and_flags = htons(offset_and_flags);
11802
11803 /* Store the length in the IP header. */
11804 ip_len = (uint16_t)(len + hdr_len);
11805 ipha->ipha_length = htons(ip_len);
11806
11807 /*
11808 * Set the IP header checksum. Note that mp is just
11809 * the header, so this is easy to pass to ip_csum.
11810 */
11811 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
11812
11813 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragCreates);
11814
11815 error = postfragfn(mp, nce, ixaflags, ip_len, xmit_hint, szone,
11816 nolzid, ixa_cookie);
11817 /* All done if we just consumed the hdr_mp. */
11818 if (mp == hdr_mp) {
11819 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragOKs);
11820 return (error);
11821 }
11822 if (error != 0 && error != EWOULDBLOCK) {
11823 DTRACE_PROBE2(ip__xmit__frag__fail, ill_t *, ill,
11824 mblk_t *, hdr_mp);
11825 /* No point in sending the other fragments */
11826 break;
11827 }
11828
11829 /* Otherwise, advance and loop. */
11830 offset += len;
11831 }
11832 /* Clean up following allocation failure. */
11833 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
11834 ip_drop_output("FragFails: loop ended", NULL, ill);
11835 if (mp != hdr_mp)
11836 freeb(hdr_mp);
11837 if (mp != mp_orig)
11838 freemsg(mp_orig);
11839 return (error);
11840 }
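
/*
 * Worked example of the fragmentation arithmetic above, assuming
 * max_frag = 1500, hdr_len = 20 (no options) and an unfragmented
 * original datagram:
 *
 *	len = (1500 - 20) & ~7 = 1480		(data bytes per fragment)
 *
 * A 4000-byte datagram (3980 data bytes) then yields fragments carrying
 * 1480, 1480 and 1020 data bytes at byte offsets 0, 1480 and 2960; the
 * header's offset field stores offset >> 3 (0, 185, 370) and IPH_MF is
 * set on every fragment except the last.
 */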
11841
11842 /*
11843 * Copy the header plus those options which have the copy bit set
11844 */
11845 static mblk_t *
11846 ip_fragment_copyhdr(uchar_t *rptr, int hdr_len, int offset, ip_stack_t *ipst,
11847 mblk_t *src)
11848 {
11849 mblk_t *mp;
11850 uchar_t *up;
11851
11852 /*
11853 * Quick check if we need to look for options without the copy bit
11854 * set
11855 */
11856 mp = allocb_tmpl(ipst->ips_ip_wroff_extra + hdr_len, src);
11857 if (!mp)
11858 return (mp);
11859 mp->b_rptr += ipst->ips_ip_wroff_extra;
11860 if (hdr_len == IP_SIMPLE_HDR_LENGTH || offset != 0) {
11861 bcopy(rptr, mp->b_rptr, hdr_len);
11862 mp->b_wptr += hdr_len + ipst->ips_ip_wroff_extra;
11863 return (mp);
11864 }
11865 up = mp->b_rptr;
11866 bcopy(rptr, up, IP_SIMPLE_HDR_LENGTH);
11867 up += IP_SIMPLE_HDR_LENGTH;
11868 rptr += IP_SIMPLE_HDR_LENGTH;
11869 hdr_len -= IP_SIMPLE_HDR_LENGTH;
11870 while (hdr_len > 0) {
11871 uint32_t optval;
11872 uint32_t optlen;
11873
11874 optval = *rptr;
11875 if (optval == IPOPT_EOL)
11876 break;
11877 if (optval == IPOPT_NOP)
11878 optlen = 1;
11879 else
11880 optlen = rptr[1];
11881 if (optval & IPOPT_COPY) {
11882 bcopy(rptr, up, optlen);
11883 up += optlen;
11884 }
11885 rptr += optlen;
11886 hdr_len -= optlen;
11887 }
	/*
	 * Pad the copied options out to the next 4-byte word boundary
	 * by filling with IPOPT_EOL.
	 */
11892 for (hdr_len = up - (mp->b_rptr + IP_SIMPLE_HDR_LENGTH);
11893 hdr_len & 0x3; hdr_len++)
11894 *up++ = IPOPT_EOL;
11895 mp->b_wptr = up;
11896 /* Update header length */
11897 mp->b_rptr[0] = (uint8_t)((IP_VERSION << 4) | ((up - mp->b_rptr) >> 2));
11898 return (mp);
11899 }
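
/*
 * Per RFC 791 only options with the copy bit (IPOPT_COPY, the high bit
 * of the option type) set are replicated into later fragments: LSRR and
 * SSRR are copied, record-route is not.  The trailing loop pads the
 * result to a 4-byte boundary with IPOPT_EOL and rewrites the IHL:
 *
 *	mp->b_rptr[0] = (uint8_t)((IP_VERSION << 4) |
 *	    ((up - mp->b_rptr) >> 2));
 */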
11900
11901 /*
11902 * Update any source route, record route, or timestamp options when
11903 * sending a packet back to ourselves.
11904 * Check that we are at end of strict source route.
11905 * The options have been sanity checked by ip_output_options().
11906 */
11907 void
11908 ip_output_local_options(ipha_t *ipha, ip_stack_t *ipst)
11909 {
11910 ipoptp_t opts;
11911 uchar_t *opt;
11912 uint8_t optval;
11913 uint8_t optlen;
11914 ipaddr_t dst;
11915 uint32_t ts;
11916 timestruc_t now;
11917
11918 for (optval = ipoptp_first(&opts, ipha);
11919 optval != IPOPT_EOL;
11920 optval = ipoptp_next(&opts)) {
11921 opt = opts.ipoptp_cur;
11922 optlen = opts.ipoptp_len;
11923 ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
11924 switch (optval) {
11925 uint32_t off;
11926 case IPOPT_SSRR:
11927 case IPOPT_LSRR:
11928 off = opt[IPOPT_OFFSET];
11929 off--;
11930 if (optlen < IP_ADDR_LEN ||
11931 off > optlen - IP_ADDR_LEN) {
11932 /* End of source route */
11933 break;
11934 }
			/*
			 * This will only happen if two consecutive entries
			 * in the source route contain our address, or if
			 * the packet has a loose source route that reaches
			 * us before the whole source route is consumed.
			 */
11941
11942 if (optval == IPOPT_SSRR) {
11943 return;
11944 }
11945 /*
11946 * Hack: instead of dropping the packet truncate the
11947 * source route to what has been used by filling the
11948 * rest with IPOPT_NOP.
11949 */
11950 opt[IPOPT_OLEN] = (uint8_t)off;
11951 while (off < optlen) {
11952 opt[off++] = IPOPT_NOP;
11953 }
11954 break;
11955 case IPOPT_RR:
11956 off = opt[IPOPT_OFFSET];
11957 off--;
11958 if (optlen < IP_ADDR_LEN ||
11959 off > optlen - IP_ADDR_LEN) {
11960 /* No more room - ignore */
11961 ip1dbg((
11962 "ip_output_local_options: end of RR\n"));
11963 break;
11964 }
11965 dst = htonl(INADDR_LOOPBACK);
11966 bcopy(&dst, (char *)opt + off, IP_ADDR_LEN);
11967 opt[IPOPT_OFFSET] += IP_ADDR_LEN;
11968 break;
11969 case IPOPT_TS:
			/* Insert timestamp if there is room */
11971 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
11972 case IPOPT_TS_TSONLY:
11973 off = IPOPT_TS_TIMELEN;
11974 break;
11975 case IPOPT_TS_PRESPEC:
11976 case IPOPT_TS_PRESPEC_RFC791:
11977 /* Verify that the address matched */
11978 off = opt[IPOPT_OFFSET] - 1;
11979 bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
11980 if (ip_type_v4(dst, ipst) != IRE_LOCAL) {
11981 /* Not for us */
11982 break;
11983 }
11984 /* FALLTHRU */
11985 case IPOPT_TS_TSANDADDR:
11986 off = IP_ADDR_LEN + IPOPT_TS_TIMELEN;
11987 break;
11988 default:
11989 /*
11990 * ip_*put_options should have already
11991 * dropped this packet.
11992 */
				cmn_err(CE_PANIC, "ip_output_local_options: "
				    "unknown TS type - bug in "
				    "ip_output_options?\n");
11995 return; /* Keep "lint" happy */
11996 }
11997 if (opt[IPOPT_OFFSET] - 1 + off > optlen) {
11998 /* Increase overflow counter */
11999 off = (opt[IPOPT_POS_OV_FLG] >> 4) + 1;
12000 opt[IPOPT_POS_OV_FLG] = (uint8_t)
12001 (opt[IPOPT_POS_OV_FLG] & 0x0F) |
12002 (off << 4);
12003 break;
12004 }
12005 off = opt[IPOPT_OFFSET] - 1;
12006 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
12007 case IPOPT_TS_PRESPEC:
12008 case IPOPT_TS_PRESPEC_RFC791:
12009 case IPOPT_TS_TSANDADDR:
12010 dst = htonl(INADDR_LOOPBACK);
12011 bcopy(&dst, (char *)opt + off, IP_ADDR_LEN);
12012 opt[IPOPT_OFFSET] += IP_ADDR_LEN;
12013 /* FALLTHRU */
12014 case IPOPT_TS_TSONLY:
12015 off = opt[IPOPT_OFFSET] - 1;
12016 /* Compute # of milliseconds since midnight */
12017 gethrestime(&now);
12018 ts = (now.tv_sec % (24 * 60 * 60)) * 1000 +
12019 now.tv_nsec / (NANOSEC / MILLISEC);
12020 bcopy(&ts, (char *)opt + off, IPOPT_TS_TIMELEN);
12021 opt[IPOPT_OFFSET] += IPOPT_TS_TIMELEN;
12022 break;
12023 }
12024 break;
12025 }
12026 }
12027 }
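
/*
 * The timestamp written above is milliseconds since midnight, per RFC
 * 791.  Worked example, assuming now.tv_sec lands one hour past
 * midnight with tv_nsec = 500000000:
 *
 *	ts = (3600 % (24 * 60 * 60)) * 1000 +
 *	    500000000 / (NANOSEC / MILLISEC)
 *	   = 3600000 + 500 = 3600500
 */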
12028
12029 /*
12030 * Prepend an M_DATA fastpath header, and if none present prepend a
12031 * DL_UNITDATA_REQ. Frees the mblk on failure.
12032 *
12033 * nce_dlur_mp and nce_fp_mp can not disappear once they have been set.
12034 * If there is a change to them, the nce will be deleted (condemned) and
12035 * a new nce_t will be created when packets are sent. Thus we need no locks
12036 * to access those fields.
12037 *
12038 * We preserve b_band to support IPQoS. If a DL_UNITDATA_REQ is prepended
12039 * we place b_band in dl_priority.dl_max.
12040 */
12041 static mblk_t *
12042 ip_xmit_attach_llhdr(mblk_t *mp, nce_t *nce)
12043 {
12044 uint_t hlen;
12045 mblk_t *mp1;
12046 uint_t priority;
12047 uchar_t *rptr;
12048
12049 rptr = mp->b_rptr;
12050
12051 ASSERT(DB_TYPE(mp) == M_DATA);
12052 priority = mp->b_band;
12053
12054 ASSERT(nce != NULL);
12055 if ((mp1 = nce->nce_fp_mp) != NULL) {
12056 hlen = MBLKL(mp1);
12057 /*
12058 * Check if we have enough room to prepend fastpath
12059 * header
12060 */
12061 if (hlen != 0 && (rptr - mp->b_datap->db_base) >= hlen) {
12062 rptr -= hlen;
12063 bcopy(mp1->b_rptr, rptr, hlen);
12064 /*
12065 * Set the b_rptr to the start of the link layer
12066 * header
12067 */
12068 mp->b_rptr = rptr;
12069 return (mp);
12070 }
12071 mp1 = copyb(mp1);
12072 if (mp1 == NULL) {
12073 ill_t *ill = nce->nce_ill;
12074
12075 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
12076 ip_drop_output("ipIfStatsOutDiscards", mp, ill);
12077 freemsg(mp);
12078 return (NULL);
12079 }
12080 mp1->b_band = priority;
12081 mp1->b_cont = mp;
12082 DB_CKSUMSTART(mp1) = DB_CKSUMSTART(mp);
12083 DB_CKSUMSTUFF(mp1) = DB_CKSUMSTUFF(mp);
12084 DB_CKSUMEND(mp1) = DB_CKSUMEND(mp);
12085 DB_CKSUMFLAGS(mp1) = DB_CKSUMFLAGS(mp);
12086 DB_LSOMSS(mp1) = DB_LSOMSS(mp);
12087 DTRACE_PROBE1(ip__xmit__copyb, (mblk_t *), mp1);
12088 /*
12089 * XXX disable ICK_VALID and compute checksum
12090 * here; can happen if nce_fp_mp changes and
12091 * it can't be copied now due to insufficient
12092 * space. (unlikely, fp mp can change, but it
12093 * does not increase in length)
12094 */
12095 return (mp1);
12096 }
12097 mp1 = copyb(nce->nce_dlur_mp);
12098
12099 if (mp1 == NULL) {
12100 ill_t *ill = nce->nce_ill;
12101
12102 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
12103 ip_drop_output("ipIfStatsOutDiscards", mp, ill);
12104 freemsg(mp);
12105 return (NULL);
12106 }
12107 mp1->b_cont = mp;
12108 if (priority != 0) {
12109 mp1->b_band = priority;
12110 ((dl_unitdata_req_t *)(mp1->b_rptr))->dl_priority.dl_max =
12111 priority;
12112 }
12113 return (mp1);
12115 }
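
/*
 * Fastpath sketch for ip_xmit_attach_llhdr(): when the link-layer
 * header cached in nce_fp_mp fits into the packet's existing headroom,
 * it is copied in place and no extra mblk is allocated:
 *
 *	hlen = MBLKL(mp1);
 *	if (hlen != 0 && (mp->b_rptr - mp->b_datap->db_base) >= hlen) {
 *		mp->b_rptr -= hlen;
 *		bcopy(mp1->b_rptr, mp->b_rptr, hlen);
 *	}
 */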
12116
12117 /*
12118 * Finish the outbound IPsec processing. This function is called from
12119 * ipsec_out_process() if the IPsec packet was processed
12120 * synchronously, or from {ah,esp}_kcf_callback_outbound() if it was processed
12121 * asynchronously.
12122 *
12123 * This is common to IPv4 and IPv6.
12124 */
12125 int
12126 ip_output_post_ipsec(mblk_t *mp, ip_xmit_attr_t *ixa)
12127 {
	iaflags_t	ixaflags = ixa->ixa_flags;
	uint_t		pktlen;

12132 /* AH/ESP don't update ixa_pktlen when they modify the packet */
12133 if (ixaflags & IXAF_IS_IPV4) {
12134 ipha_t *ipha = (ipha_t *)mp->b_rptr;
12135
12136 ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
12137 pktlen = ntohs(ipha->ipha_length);
12138 } else {
12139 ip6_t *ip6h = (ip6_t *)mp->b_rptr;
12140
12141 ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV6_VERSION);
12142 pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
12143 }
12144
12145 /*
12146 * We release any hard reference on the SAs here to make
12147 * sure the SAs can be garbage collected. ipsr_sa has a soft reference
12148 * on the SAs.
12149 * If in the future we want the hard latching of the SAs in the
12150 * ip_xmit_attr_t then we should remove this.
12151 */
12152 if (ixa->ixa_ipsec_esp_sa != NULL) {
12153 IPSA_REFRELE(ixa->ixa_ipsec_esp_sa);
12154 ixa->ixa_ipsec_esp_sa = NULL;
12155 }
12156 if (ixa->ixa_ipsec_ah_sa != NULL) {
12157 IPSA_REFRELE(ixa->ixa_ipsec_ah_sa);
12158 ixa->ixa_ipsec_ah_sa = NULL;
12159 }
12160
12161 /* Do we need to fragment? */
12162 if ((ixa->ixa_flags & IXAF_IPV6_ADD_FRAGHDR) ||
12163 pktlen > ixa->ixa_fragsize) {
12164 if (ixaflags & IXAF_IS_IPV4) {
12165 ASSERT(!(ixa->ixa_flags & IXAF_IPV6_ADD_FRAGHDR));
12166 /*
12167 * We check for the DF case in ipsec_out_process
12168 * hence this only handles the non-DF case.
12169 */
12170 return (ip_fragment_v4(mp, ixa->ixa_nce, ixa->ixa_flags,
12171 pktlen, ixa->ixa_fragsize,
12172 ixa->ixa_xmit_hint, ixa->ixa_zoneid,
12173 ixa->ixa_no_loop_zoneid, ixa->ixa_postfragfn,
12174 &ixa->ixa_cookie));
12175 } else {
12176 mp = ip_fraghdr_add_v6(mp, ixa->ixa_ident, ixa);
12177 if (mp == NULL) {
12178 /* MIB and ip_drop_output already done */
12179 return (ENOMEM);
12180 }
12181 pktlen += sizeof (ip6_frag_t);
12182 if (pktlen > ixa->ixa_fragsize) {
12183 return (ip_fragment_v6(mp, ixa->ixa_nce,
12184 ixa->ixa_flags, pktlen,
12185 ixa->ixa_fragsize, ixa->ixa_xmit_hint,
12186 ixa->ixa_zoneid, ixa->ixa_no_loop_zoneid,
12187 ixa->ixa_postfragfn, &ixa->ixa_cookie));
12188 }
12189 }
12190 }
12191 return ((ixa->ixa_postfragfn)(mp, ixa->ixa_nce, ixa->ixa_flags,
12192 pktlen, ixa->ixa_xmit_hint, ixa->ixa_zoneid,
12193 ixa->ixa_no_loop_zoneid, NULL));
12194 }
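
/*
 * Editorial worked example of the fragmentation decision above, using
 * illustrative numbers.  IPv4: pktlen 1600 with ixa_fragsize 1500 goes
 * through ip_fragment_v4(); pktlen 1400 goes straight to
 * ixa_postfragfn.  IPv6 with IXAF_IPV6_ADD_FRAGHDR set: a fragment
 * header of sizeof (ip6_frag_t) == 8 bytes is inserted first, and
 * ip_fragment_v6() is called only if the grown pktlen still exceeds
 * ixa_fragsize.
 */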
12195
12196 /*
12197 * Finish the inbound IPsec processing. This function is called from
12198 * the synchronous inbound IPsec processing path, or from
12199 * {ah,esp}_kcf_callback_inbound() if the packet was processed
12200 * asynchronously.
12201 *
12202 * This is common to IPv4 and IPv6.
12203 */
12204 void
12205 ip_input_post_ipsec(mblk_t *mp, ip_recv_attr_t *ira)
12206 {
12207 iaflags_t iraflags = ira->ira_flags;
12208
12209 /* Length might have changed */
12210 if (iraflags & IRAF_IS_IPV4) {
12211 ipha_t *ipha = (ipha_t *)mp->b_rptr;
12212
12213 ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
12214 ira->ira_pktlen = ntohs(ipha->ipha_length);
12215 ira->ira_ip_hdr_length = IPH_HDR_LENGTH(ipha);
12216 ira->ira_protocol = ipha->ipha_protocol;
12217
12218 ip_fanout_v4(mp, ipha, ira);
12219 } else {
12220 ip6_t *ip6h = (ip6_t *)mp->b_rptr;
12221 uint8_t *nexthdrp;
12222
12223 ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV6_VERSION);
12224 ira->ira_pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
12225 if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &ira->ira_ip_hdr_length,
12226 &nexthdrp)) {
12227 /* Malformed packet */
12228 BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
12229 ip_drop_input("ipIfStatsInDiscards", mp, ira->ira_ill);
12230 freemsg(mp);
12231 return;
12232 }
12233 ira->ira_protocol = *nexthdrp;
12234 ip_fanout_v6(mp, ip6h, ira);
12235 }
12236 }
12237
12238 /*
12239 * Select which AH & ESP SAs to use (if any) for the outbound packet.
12240 *
12241 * If this function returns B_TRUE, the requested SA's have been filled
12242 * into the ixa_ipsec_*_sa pointers.
12243 *
12244 * If the function returns B_FALSE, the packet has been "consumed", most
12245 * likely by an ACQUIRE sent up via PF_KEY to a key management daemon.
12246 *
12247 * The SA references created by the protocol-specific "select"
12248 * function will be released in ip_output_post_ipsec.
12249 */
12250 static boolean_t
12251 ipsec_out_select_sa(mblk_t *mp, ip_xmit_attr_t *ixa)
12252 {
12253 boolean_t need_ah_acquire = B_FALSE, need_esp_acquire = B_FALSE;
12254 ipsec_policy_t *pp;
12255 ipsec_action_t *ap;
12256
12257 ASSERT(ixa->ixa_flags & IXAF_IPSEC_SECURE);
12258 ASSERT((ixa->ixa_ipsec_policy != NULL) ||
12259 (ixa->ixa_ipsec_action != NULL));
12260
12261 ap = ixa->ixa_ipsec_action;
12262 if (ap == NULL) {
12263 pp = ixa->ixa_ipsec_policy;
12264 ASSERT(pp != NULL);
12265 ap = pp->ipsp_act;
12266 ASSERT(ap != NULL);
12267 }
12268
12269 /*
12270 * We have an action. Now, let's select SAs.
12271 * A side effect of setting ixa_ipsec_*_sa is that it will
12272 * be cached in the conn_t.
12273 */
12274 if (ap->ipa_want_esp) {
12275 if (ixa->ixa_ipsec_esp_sa == NULL) {
12276 need_esp_acquire = !ipsec_outbound_sa(mp, ixa,
12277 IPPROTO_ESP);
12278 }
12279 ASSERT(need_esp_acquire || ixa->ixa_ipsec_esp_sa != NULL);
12280 }
12281
12282 if (ap->ipa_want_ah) {
12283 if (ixa->ixa_ipsec_ah_sa == NULL) {
12284 need_ah_acquire = !ipsec_outbound_sa(mp, ixa,
12285 IPPROTO_AH);
12286 }
12287 ASSERT(need_ah_acquire || ixa->ixa_ipsec_ah_sa != NULL);
12288 /*
12289 * The ESP and AH processing order must be preserved when
12290 * both protocols are required (ESP is applied before AH
12291 * for an outbound packet). Hence, when both ESP and AH
12292 * are required and an AH ACQUIRE is needed, force an
12293 * ESP ACQUIRE as well.
12294 */
12295 if (ap->ipa_want_esp && need_ah_acquire)
12296 need_esp_acquire = B_TRUE;
12297 }
12298
12299 /*
12300 * Send an ACQUIRE (extended, regular, or both) if we need one.
12301 * Release SAs that got referenced, but will not be used until we
12302 * acquire _all_ of the SAs we need.
12303 */
12304 if (need_ah_acquire || need_esp_acquire) {
12305 if (ixa->ixa_ipsec_ah_sa != NULL) {
12306 IPSA_REFRELE(ixa->ixa_ipsec_ah_sa);
12307 ixa->ixa_ipsec_ah_sa = NULL;
12308 }
12309 if (ixa->ixa_ipsec_esp_sa != NULL) {
12310 IPSA_REFRELE(ixa->ixa_ipsec_esp_sa);
12311 ixa->ixa_ipsec_esp_sa = NULL;
12312 }
12313
12314 sadb_acquire(mp, ixa, need_ah_acquire, need_esp_acquire);
12315 return (B_FALSE);
12316 }
12317
12318 return (B_TRUE);
12319 }
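
/*
 * Editorial summary of the ipsec_out_select_sa() outcomes for an
 * action that wants both AH and ESP:
 *
 *	both SAs found		-> B_TRUE; ixa_ipsec_{ah,esp}_sa set
 *	AH SA missing		-> an ESP ACQUIRE is forced as well;
 *				   sadb_acquire() consumes mp; B_FALSE
 *	ESP SA missing		-> ESP ACQUIRE; mp consumed; B_FALSE
 *
 * Any SA reference taken before an ACQUIRE is released so that the
 * eventual key-management answer installs a consistent pair.
 */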
12320
12321 /*
12322 * Handle IPsec output processing.
12323 * This function is only entered once for a given packet.
12324 * We try to do things synchronously, but if we need to have user-level
12325 * set up SAs, or ESP or AH uses asynchronous kEF, then the operation
12326 * will be completed
12327 * - when the SAs are added in esp_add_sa_finish/ah_add_sa_finish
12328 * - when asynchronous ESP is done it will do AH
12329 *
12330 * In all cases we come back in ip_output_post_ipsec() to fragment and
12331 * send out the packet.
12332 */
12333 int
12334 ipsec_out_process(mblk_t *mp, ip_xmit_attr_t *ixa)
12335 {
12336 ill_t *ill = ixa->ixa_nce->nce_ill;
12337 ip_stack_t *ipst = ixa->ixa_ipst;
12338 ipsec_stack_t *ipss;
12339 ipsec_policy_t *pp;
12340 ipsec_action_t *ap;
12341
12342 ASSERT(ixa->ixa_flags & IXAF_IPSEC_SECURE);
12343
12344 ASSERT((ixa->ixa_ipsec_policy != NULL) ||
12345 (ixa->ixa_ipsec_action != NULL));
12346
12347 ipss = ipst->ips_netstack->netstack_ipsec;
12348 if (!ipsec_loaded(ipss)) {
12349 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
12350 ip_drop_packet(mp, B_TRUE, ill,
12351 DROPPER(ipss, ipds_ip_ipsec_not_loaded),
12352 &ipss->ipsec_dropper);
12353 return (ENOTSUP);
12354 }
12355
12356 ap = ixa->ixa_ipsec_action;
12357 if (ap == NULL) {
12358 pp = ixa->ixa_ipsec_policy;
12359 ASSERT(pp != NULL);
12360 ap = pp->ipsp_act;
12361 ASSERT(ap != NULL);
12362 }
12363
12364 /* Handle explicit drop action and bypass. */
12365 switch (ap->ipa_act.ipa_type) {
12366 case IPSEC_ACT_DISCARD:
12367 case IPSEC_ACT_REJECT:
12368 ip_drop_packet(mp, B_FALSE, ill,
12369 DROPPER(ipss, ipds_spd_explicit), &ipss->ipsec_spd_dropper);
12370 return (EHOSTUNREACH); /* IPsec policy failure */
12371 case IPSEC_ACT_BYPASS:
12372 return (ip_output_post_ipsec(mp, ixa));
12373 }
12374
12375 /*
12376 * The order of processing is to first insert an IP header if needed,
12377 * then the ESP header, and then the AH header.
12378 */
12379 if ((ixa->ixa_flags & IXAF_IS_IPV4) && ap->ipa_want_se) {
12380 /*
12381 * First get the outer IP header before sending
12382 * it to ESP.
12383 */
12384 ipha_t *oipha, *iipha;
12385 mblk_t *outer_mp, *inner_mp;
12386
12387 if ((outer_mp = allocb(sizeof (ipha_t), BPRI_HI)) == NULL) {
12388 (void) mi_strlog(ill->ill_rq, 0,
12389 SL_ERROR|SL_TRACE|SL_CONSOLE,
12390 "ipsec_out_process: "
12391 "Self-Encapsulation failed: Out of memory\n");
12392 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
12393 ip_drop_output("ipIfStatsOutDiscards", mp, ill);
12394 freemsg(mp);
12395 return (ENOBUFS);
12396 }
12397 inner_mp = mp;
12398 ASSERT(inner_mp->b_datap->db_type == M_DATA);
12399 oipha = (ipha_t *)outer_mp->b_rptr;
12400 iipha = (ipha_t *)inner_mp->b_rptr;
12401 *oipha = *iipha;
12402 outer_mp->b_wptr += sizeof (ipha_t);
12403 oipha->ipha_length = htons(ntohs(iipha->ipha_length) +
12404 sizeof (ipha_t));
12405 oipha->ipha_protocol = IPPROTO_ENCAP;
12406 oipha->ipha_version_and_hdr_length =
12407 IP_SIMPLE_HDR_VERSION;
12408 oipha->ipha_hdr_checksum = 0;
12409 oipha->ipha_hdr_checksum = ip_csum_hdr(oipha);
12410 outer_mp->b_cont = inner_mp;
12411 mp = outer_mp;
12412
12413 ixa->ixa_flags |= IXAF_IPSEC_TUNNEL;
12414 }
12415
12416 /* If we need to wait for an SA then we can't return any errno */
12417 if (((ap->ipa_want_ah && (ixa->ixa_ipsec_ah_sa == NULL)) ||
12418 (ap->ipa_want_esp && (ixa->ixa_ipsec_esp_sa == NULL))) &&
12419 !ipsec_out_select_sa(mp, ixa))
12420 return (0);
12421
12422 /*
12423 * By now, we know which SAs to use. Toss over to ESP & AH
12424 * to do the heavy lifting.
12425 */
12426 if (ap->ipa_want_esp) {
12427 ASSERT(ixa->ixa_ipsec_esp_sa != NULL);
12428
12429 mp = ixa->ixa_ipsec_esp_sa->ipsa_output_func(mp, ixa);
12430 if (mp == NULL) {
12431 /*
12432 * Either it failed or is pending. In the former case
12433 * ipIfStatsOutDiscards was increased.
12434 */
12435 return (0);
12436 }
12437 }
12438
12439 if (ap->ipa_want_ah) {
12440 ASSERT(ixa->ixa_ipsec_ah_sa != NULL);
12441
12442 mp = ixa->ixa_ipsec_ah_sa->ipsa_output_func(mp, ixa);
12443 if (mp == NULL) {
12444 /*
12445 * Either it failed or is pending. In the former case
12446 * ipIfStatsOutDiscards was increased.
12447 */
12448 return (0);
12449 }
12450 }
12451 /*
12452 * We are done with IPsec processing. Send it over
12453 * the wire.
12454 */
12455 return (ip_output_post_ipsec(mp, ixa));
12456 }
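
/*
 * Editorial worked example of the self-encapsulation step in
 * ipsec_out_process() above, with illustrative numbers: for an inner
 * datagram whose ipha_length is 1480, the prepended outer header adds
 * sizeof (ipha_t) == 20 bytes, so the outer ipha_length becomes 1500,
 * the outer protocol is IPPROTO_ENCAP (IP-in-IP), and the outer
 * header checksum is computed from scratch with ip_csum_hdr().
 */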
12457
12458 /*
12459 * ioctls that go through a down/up sequence may need to wait for the down
12460 * to complete. This involves waiting for the ire and ipif refcnts to go down
12461 * to zero. Subsequently the ioctl is restarted from ipif_ill_refrele_tail.
12462 */
12463 /* ARGSUSED */
12464 void
12465 ip_reprocess_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
12466 {
12467 struct iocblk *iocp;
12468 mblk_t *mp1;
12469 ip_ioctl_cmd_t *ipip;
12470 int err;
12471 sin_t *sin;
12472 struct lifreq *lifr;
12473 struct ifreq *ifr;
12474
12475 iocp = (struct iocblk *)mp->b_rptr;
12476 ASSERT(ipsq != NULL);
12477 /* Existence of mp1 verified in ip_wput_nondata */
12478 mp1 = mp->b_cont->b_cont;
12479 ipip = ip_sioctl_lookup(iocp->ioc_cmd);
12480 if (ipip->ipi_cmd == SIOCSLIFNAME || ipip->ipi_cmd == IF_UNITSEL) {
12481 /*
12482 * Special case where ipx_current_ipif is not set:
12483 * ill_phyint_reinit merged the v4 and v6 into a single ipsq.
12484 * We are here as we were not able to complete the operation in
12485 * ipif_set_values because we could not become exclusive on
12486 * the new ipsq.
12487 */
12488 ill_t *ill = q->q_ptr;
12489 ipsq_current_start(ipsq, ill->ill_ipif, ipip->ipi_cmd);
12490 }
12491 ASSERT(ipsq->ipsq_xop->ipx_current_ipif != NULL);
12492
12493 if (ipip->ipi_cmd_type == IF_CMD) {
12494 /* This is an old style SIOC[GS]IF* command */
12495 ifr = (struct ifreq *)mp1->b_rptr;
12496 sin = (sin_t *)&ifr->ifr_addr;
12497 } else if (ipip->ipi_cmd_type == LIF_CMD) {
12498 /* This is a new style SIOC[GS]LIF* command */
12499 lifr = (struct lifreq *)mp1->b_rptr;
12500 sin = (sin_t *)&lifr->lifr_addr;
12501 } else {
12502 sin = NULL;
12503 }
12504
12505 err = (*ipip->ipi_func_restart)(ipsq->ipsq_xop->ipx_current_ipif, sin,
12506 q, mp, ipip, mp1->b_rptr);
12507
12508 DTRACE_PROBE4(ipif__ioctl, char *, "ip_reprocess_ioctl finish",
12509 int, ipip->ipi_cmd,
12510 ill_t *, ipsq->ipsq_xop->ipx_current_ipif->ipif_ill,
12511 ipif_t *, ipsq->ipsq_xop->ipx_current_ipif);
12512
12513 ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), ipsq);
12514 }
12515
12516 /*
12517 * ioctl processing
12518 *
12519 * ioctl processing starts with ip_sioctl_copyin_setup(), which looks up
12520 * the ioctl command in the ioctl tables, determines the copyin data size
12521 * from the ipi_copyin_size field, and does an mi_copyin() of that size.
12522 *
12523 * ioctl processing then continues when the M_IOCDATA makes its way down to
12524 * ip_wput_nondata(). The ioctl is looked up again in the ioctl table, its
12525 * associated 'conn' is refheld till the end of the ioctl and the general
12526 * ioctl processing function ip_process_ioctl() is called to extract the
12527 * arguments and process the ioctl. To simplify extraction, ioctl commands
12528 * are "typed" based on the arguments they take (e.g., LIF_CMD which takes a
12529 * `struct lifreq'), and a common extract function (e.g., ip_extract_lifreq())
12530 * is used to extract the ioctl's arguments.
12531 *
12532 * ip_process_ioctl determines if the ioctl needs to be serialized, and if
12533 * so goes thru the serialization primitive ipsq_try_enter. Then the
12534 * appropriate function to handle the ioctl is called based on the entry in
12535 * the ioctl table. ioctl completion is encapsulated in ip_ioctl_finish
12536 * which also refreleases the 'conn' that was refheld at the start of the
12537 * ioctl. Finally ipsq_exit is called if needed to exit the ipsq.
12538 *
12539 * Many exclusive ioctls go thru an internal down/up sequence as part of
12540 * the operation. For example an attempt to change the IP address of an
12541 * ipif entails ipif_down, set address, ipif_up. Bringing down the interface
12542 * does all the cleanup such as deleting all ires that use this address.
12543 * Then we need to wait till all references to the interface go away.
12544 */
12545 void
12546 ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
12547 {
12548 struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
12549 ip_ioctl_cmd_t *ipip = arg;
12550 ip_extract_func_t *extract_funcp;
12551 cmd_info_t ci;
12552 int err;
12553 boolean_t entered_ipsq = B_FALSE;
12554
12555 ip3dbg(("ip_process_ioctl: ioctl %X\n", iocp->ioc_cmd));
12556
12557 if (ipip == NULL)
12558 ipip = ip_sioctl_lookup(iocp->ioc_cmd);
12559
12560 /*
12561 * SIOCLIFADDIF needs to go thru a special path since the
12562 * ill may not exist yet. This happens in the case of lo0
12563 * which is created using this ioctl.
12564 */
12565 if (ipip->ipi_cmd == SIOCLIFADDIF) {
12566 err = ip_sioctl_addif(NULL, NULL, q, mp, NULL, NULL);
12567 DTRACE_PROBE4(ipif__ioctl, char *, "ip_process_ioctl finish",
12568 int, ipip->ipi_cmd, ill_t *, NULL, ipif_t *, NULL);
12569 ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), NULL);
12570 return;
12571 }
12572
12573 ci.ci_ipif = NULL;
12574 switch (ipip->ipi_cmd_type) {
12575 case MISC_CMD:
12576 case MSFILT_CMD:
12577 /*
12578 * All MISC_CMD and MSFILT_CMD ioctls come in here -- e.g. SIOCGLIFCONF.
12579 */
12580 if (ipip->ipi_cmd == IF_UNITSEL) {
12581 /* ioctl comes down the ill */
12582 ci.ci_ipif = ((ill_t *)q->q_ptr)->ill_ipif;
12583 ipif_refhold(ci.ci_ipif);
12584 }
12585 err = 0;
12586 ci.ci_sin = NULL;
12587 ci.ci_sin6 = NULL;
12588 ci.ci_lifr = NULL;
12589 extract_funcp = NULL;
12590 break;
12591
12592 case IF_CMD:
12593 case LIF_CMD:
12594 extract_funcp = ip_extract_lifreq;
12595 break;
12596
12597 case ARP_CMD:
12598 case XARP_CMD:
12599 extract_funcp = ip_extract_arpreq;
12600 break;
12601
12602 default:
12603 ASSERT(0);
12604 }
12605
12606 if (extract_funcp != NULL) {
12607 err = (*extract_funcp)(q, mp, ipip, &ci);
12608 if (err != 0) {
12609 DTRACE_PROBE4(ipif__ioctl,
12610 char *, "ip_process_ioctl finish err",
12611 int, ipip->ipi_cmd, ill_t *, NULL, ipif_t *, NULL);
12612 ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), NULL);
12613 return;
12614 }
12615
12616 /*
12617 * All of the extraction functions return a refheld ipif.
12618 */
12619 ASSERT(ci.ci_ipif != NULL);
12620 }
12621
12622 if (!(ipip->ipi_flags & IPI_WR)) {
12623 /*
12624 * A return value of EINPROGRESS means the ioctl is
12625 * either queued and waiting for some reason or has
12626 * already completed.
12627 */
12628 err = (*ipip->ipi_func)(ci.ci_ipif, ci.ci_sin, q, mp, ipip,
12629 ci.ci_lifr);
12630 if (ci.ci_ipif != NULL) {
12631 DTRACE_PROBE4(ipif__ioctl,
12632 char *, "ip_process_ioctl finish RD",
12633 int, ipip->ipi_cmd, ill_t *, ci.ci_ipif->ipif_ill,
12634 ipif_t *, ci.ci_ipif);
12635 ipif_refrele(ci.ci_ipif);
12636 } else {
12637 DTRACE_PROBE4(ipif__ioctl,
12638 char *, "ip_process_ioctl finish RD",
12639 int, ipip->ipi_cmd, ill_t *, NULL, ipif_t *, NULL);
12640 }
12641 ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), NULL);
12642 return;
12643 }
12644
12645 ASSERT(ci.ci_ipif != NULL);
12646
12647 /*
12648 * If ipsq is non-NULL, we are already being called exclusively
12649 */
12650 ASSERT(ipsq == NULL || IAM_WRITER_IPSQ(ipsq));
12651 if (ipsq == NULL) {
12652 ipsq = ipsq_try_enter(ci.ci_ipif, NULL, q, mp, ip_process_ioctl,
12653 NEW_OP, B_TRUE);
12654 if (ipsq == NULL) {
12655 ipif_refrele(ci.ci_ipif);
12656 return;
12657 }
12658 entered_ipsq = B_TRUE;
12659 }
12660 /*
12661 * Release the ipif so that ipif_down and friends that wait for
12662 * references to go away are not misled about the current ipif_refcnt
12663 * values. We are writer so we can access the ipif even after releasing
12664 * the ipif.
12665 */
12666 ipif_refrele(ci.ci_ipif);
12667
12668 ipsq_current_start(ipsq, ci.ci_ipif, ipip->ipi_cmd);
12669
12670 /*
12671 * A return value of EINPROGRESS means the ioctl is
12672 * either queued and waiting for some reason or has
12673 * already completed.
12674 */
12675 err = (*ipip->ipi_func)(ci.ci_ipif, ci.ci_sin, q, mp, ipip, ci.ci_lifr);
12676
12677 DTRACE_PROBE4(ipif__ioctl, char *, "ip_process_ioctl finish WR",
12678 int, ipip->ipi_cmd,
12679 ill_t *, ci.ci_ipif == NULL ? NULL : ci.ci_ipif->ipif_ill,
12680 ipif_t *, ci.ci_ipif);
12681 ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), ipsq);
12682
12683 if (entered_ipsq)
12684 ipsq_exit(ipsq);
12685 }
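
/*
 * Editorial sketch of the exclusive-ioctl lifecycle implemented above,
 * using only names that appear in this file:
 *
 *	M_IOCTL -> ip_sioctl_copyin_setup -> mi_copyin
 *	M_IOCDATA -> ip_wput_nondata -> ip_process_ioctl
 *	    extract args (ip_extract_lifreq / ip_extract_arpreq)
 *	    ipsq_try_enter (become writer, or queue the mp and return)
 *	    ipsq_current_start -> (*ipi_func)() -> ip_ioctl_finish
 *
 * A down/up style ioctl may return EINPROGRESS from ipi_func and is
 * later restarted via ip_reprocess_ioctl() from ipif_ill_refrele_tail.
 */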
12686
12687 /*
12688 * Complete the ioctl. Typically ioctls use the mi package and need to
12689 * do mi_copyout/mi_copy_done.
12690 */
12691 void
12692 ip_ioctl_finish(queue_t *q, mblk_t *mp, int err, int mode, ipsq_t *ipsq)
12693 {
12694 conn_t *connp = NULL;
12695
12696 if (err == EINPROGRESS)
12697 return;
12698
12699 if (CONN_Q(q)) {
12700 connp = Q_TO_CONN(q);
12701 ASSERT(connp->conn_ref >= 2);
12702 }
12703
12704 switch (mode) {
12705 case COPYOUT:
12706 if (err == 0)
12707 mi_copyout(q, mp);
12708 else
12709 mi_copy_done(q, mp, err);
12710 break;
12711
12712 case NO_COPYOUT:
12713 mi_copy_done(q, mp, err);
12714 break;
12715
12716 default:
12717 ASSERT(mode == CONN_CLOSE); /* aborted through CONN_CLOSE */
12718 break;
12719 }
12720
12721 /*
12722 * The conn refhold and ioctlref placed on the conn at the start of the
12723 * ioctl are released here.
12724 */
12725 if (connp != NULL) {
12726 CONN_DEC_IOCTLREF(connp);
12727 CONN_OPER_PENDING_DONE(connp);
12728 }
12729
12730 if (ipsq != NULL)
12731 ipsq_current_finish(ipsq);
12732 }
12733
12734 /* Handles all non data messages */
12735 void
12736 ip_wput_nondata(queue_t *q, mblk_t *mp)
12737 {
12738 mblk_t *mp1;
12739 struct iocblk *iocp;
12740 ip_ioctl_cmd_t *ipip;
12741 conn_t *connp;
12742 cred_t *cr;
12743 char *proto_str;
12744
12745 if (CONN_Q(q))
12746 connp = Q_TO_CONN(q);
12747 else
12748 connp = NULL;
12749
12750 switch (DB_TYPE(mp)) {
12751 case M_IOCTL:
12752 /*
12753 * IOCTL processing begins in ip_sioctl_copyin_setup which
12754 * will arrange to copy in associated control structures.
12755 */
12756 ip_sioctl_copyin_setup(q, mp);
12757 return;
12758 case M_IOCDATA:
12759 /*
12760 * Ensure that this is associated with one of our trans-
12761 * parent ioctls. If it's not ours, discard it if we're
12762 * running as a driver, or pass it on if we're a module.
12763 */
12764 iocp = (struct iocblk *)mp->b_rptr;
12765 ipip = ip_sioctl_lookup(iocp->ioc_cmd);
12766 if (ipip == NULL) {
12767 if (q->q_next == NULL) {
12768 goto nak;
12769 } else {
12770 putnext(q, mp);
12771 }
12772 return;
12773 }
12774 if ((q->q_next != NULL) && !(ipip->ipi_flags & IPI_MODOK)) {
12775 /*
12776 * The ioctl is one we recognise, but is not consumed
12777 * The ioctl is one we recognise, but it is not consumed
12778 * by IP as a module and we are a module, so we drop it.
12779 goto nak;
12780 }
12781
12782 /* IOCTL continuation following copyin or copyout. */
12783 if (mi_copy_state(q, mp, NULL) == -1) {
12784 /*
12785 * The copy operation failed. mi_copy_state already
12786 * cleaned up, so we're out of here.
12787 */
12788 return;
12789 }
12790 /*
12791 * If we just completed a copy in, we become writer and
12792 * continue processing in ip_sioctl_copyin_done. If it
12793 * was a copy out, we call mi_copyout again. If there is
12794 * nothing more to copy out, it will complete the IOCTL.
12795 */
12796 if (MI_COPY_DIRECTION(mp) == MI_COPY_IN) {
12797 if (!(mp1 = mp->b_cont) || !(mp1 = mp1->b_cont)) {
12798 mi_copy_done(q, mp, EPROTO);
12799 return;
12800 }
12801 /*
12802 * Check for cases that need more copying. A return
12803 * value of 0 means a second copyin has been started,
12804 * so we return; a return value of 1 means no more
12805 * copying is needed, so we continue.
12806 */
12807 if (ipip->ipi_cmd_type == MSFILT_CMD &&
12808 MI_COPY_COUNT(mp) == 1) {
12809 if (ip_copyin_msfilter(q, mp) == 0)
12810 return;
12811 }
12812 /*
12813 * Refhold the conn, till the ioctl completes. This is
12814 * needed in case the ioctl ends up in the pending mp
12815 * list. Every mp in the ipx_pending_mp list must have
12816 * a refhold on the conn to resume processing. The
12817 * refhold is released when the ioctl completes
12818 * (whether normally or abnormally). An ioctlref is also
12819 * placed on the conn to prevent TCP from removing the
12820 * queue needed to send the ioctl reply back.
12821 * In all cases ip_ioctl_finish is called to finish
12822 * the ioctl and release the refholds.
12823 */
12824 if (connp != NULL) {
12825 /* This is not a reentry */
12826 CONN_INC_REF(connp);
12827 CONN_INC_IOCTLREF(connp);
12828 } else {
12829 if (!(ipip->ipi_flags & IPI_MODOK)) {
12830 mi_copy_done(q, mp, EINVAL);
12831 return;
12832 }
12833 }
12834
12835 ip_process_ioctl(NULL, q, mp, ipip);
12836
12837 } else {
12838 mi_copyout(q, mp);
12839 }
12840 return;
12841
12842 case M_IOCNAK:
12843 /*
12844 * The only way we could get here is if a resolver didn't like
12845 * an IOCTL we sent it. This shouldn't happen.
12846 */
12847 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
12848 "ip_wput_nondata: unexpected M_IOCNAK, ioc_cmd 0x%x",
12849 ((struct iocblk *)mp->b_rptr)->ioc_cmd);
12850 freemsg(mp);
12851 return;
12852 case M_IOCACK:
12853 /* /dev/ip shouldn't see this */
12854 goto nak;
12855 case M_FLUSH:
12856 if (*mp->b_rptr & FLUSHW)
12857 flushq(q, FLUSHALL);
12858 if (q->q_next) {
12859 putnext(q, mp);
12860 return;
12861 }
12862 if (*mp->b_rptr & FLUSHR) {
12863 *mp->b_rptr &= ~FLUSHW;
12864 qreply(q, mp);
12865 return;
12866 }
12867 freemsg(mp);
12868 return;
12869 case M_CTL:
12870 break;
12871 case M_PROTO:
12872 case M_PCPROTO:
12873 /*
12874 * The only PROTO messages we expect are SNMP-related.
12875 */
12876 switch (((union T_primitives *)mp->b_rptr)->type) {
12877 case T_SVR4_OPTMGMT_REQ:
12878 ip2dbg(("ip_wput_nondata: T_SVR4_OPTMGMT_REQ "
12879 "flags %x\n",
12880 ((struct T_optmgmt_req *)mp->b_rptr)->MGMT_flags));
12881
12882 if (connp == NULL) {
12883 proto_str = "T_SVR4_OPTMGMT_REQ";
12884 goto protonak;
12885 }
12886
12887 /*
12888 * All Solaris components should pass a db_credp
12889 * for this TPI message, hence we ASSERT.
12890 * But in case there is some other M_PROTO that looks
12891 * like a TPI message sent by some other kernel
12892 * component, we check and return an error.
12893 */
12894 cr = msg_getcred(mp, NULL);
12895 ASSERT(cr != NULL);
12896 if (cr == NULL) {
12897 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, EINVAL);
12898 if (mp != NULL)
12899 qreply(q, mp);
12900 return;
12901 }
12902
12903 if (!snmpcom_req(q, mp, ip_snmp_set, ip_snmp_get, cr)) {
12904 proto_str = "Bad SNMPCOM request?";
12905 goto protonak;
12906 }
12907 return;
12908 default:
12909 ip1dbg(("ip_wput_nondata: dropping M_PROTO prim %u\n",
12910 (int)*(uint_t *)mp->b_rptr));
12911 freemsg(mp);
12912 return;
12913 }
12914 default:
12915 break;
12916 }
12917 if (q->q_next) {
12918 putnext(q, mp);
12919 } else
12920 freemsg(mp);
12921 return;
12922
12923 nak:
12924 iocp = (struct iocblk *)mp->b_rptr;	/* may arrive here from M_IOCACK with iocp unset */
iocp->ioc_error = EINVAL;
12925 mp->b_datap->db_type = M_IOCNAK;
12926 iocp->ioc_count = 0;
12927 qreply(q, mp);
12928 return;
12929
12930 protonak:
12931 cmn_err(CE_NOTE, "IP doesn't process %s as a module", proto_str);
12932 if ((mp = mi_tpi_err_ack_alloc(mp, TPROTO, EINVAL)) != NULL)
12933 qreply(q, mp);
12934 }
12935
12936 /*
12937 * Process IP options in an outbound packet. Verify that the nexthop in a
12938 * strict source route is onlink.
12939 * Returns non-zero if something fails in which case an ICMP error has been
12940 * sent and mp freed.
12941 *
12942 * Assumes the ULP has called ip_massage_options to move nexthop into ipha_dst.
12943 */
12944 int
12945 ip_output_options(mblk_t *mp, ipha_t *ipha, ip_xmit_attr_t *ixa, ill_t *ill)
12946 {
12947 ipoptp_t opts;
12948 uchar_t *opt;
12949 uint8_t optval;
12950 uint8_t optlen;
12951 ipaddr_t dst;
12952 intptr_t code = 0;
12953 ire_t *ire;
12954 ip_stack_t *ipst = ixa->ixa_ipst;
12955 ip_recv_attr_t iras;
12956
12957 ip2dbg(("ip_output_options\n"));
12958
12959 dst = ipha->ipha_dst;
12960 for (optval = ipoptp_first(&opts, ipha);
12961 optval != IPOPT_EOL;
12962 optval = ipoptp_next(&opts)) {
12963 opt = opts.ipoptp_cur;
12964 optlen = opts.ipoptp_len;
12965 ip2dbg(("ip_output_options: opt %d, len %d\n",
12966 optval, optlen));
12967 switch (optval) {
12968 uint32_t off;
12969 case IPOPT_SSRR:
12970 case IPOPT_LSRR:
12971 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
12972 ip1dbg((
12973 "ip_output_options: bad option offset\n"));
12974 code = (char *)&opt[IPOPT_OLEN] -
12975 (char *)ipha;
12976 goto param_prob;
12977 }
12978 off = opt[IPOPT_OFFSET];
12979 ip1dbg(("ip_output_options: next hop 0x%x\n",
12980 ntohl(dst)));
12981 /*
12982 * For strict: verify that dst is directly
12983 * reachable.
12984 */
12985 if (optval == IPOPT_SSRR) {
12986 ire = ire_ftable_lookup_v4(dst, 0, 0,
12987 IRE_INTERFACE, NULL, ALL_ZONES,
12988 ixa->ixa_tsl,
12989 MATCH_IRE_TYPE | MATCH_IRE_SECATTR, 0, ipst,
12990 NULL);
12991 if (ire == NULL) {
12992 ip1dbg(("ip_output_options: SSRR not"
12993 " directly reachable: 0x%x\n",
12994 ntohl(dst)));
12995 goto bad_src_route;
12996 }
12997 ire_refrele(ire);
12998 }
12999 break;
13000 case IPOPT_RR:
13001 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
13002 ip1dbg((
13003 "ip_output_options: bad option offset\n"));
13004 code = (char *)&opt[IPOPT_OLEN] -
13005 (char *)ipha;
13006 goto param_prob;
13007 }
13008 break;
13009 case IPOPT_TS:
13010 /*
13011 * Verify that length >=5 and that there is either
13012 * room for another timestamp or that the overflow
13013 * counter is not maxed out.
13014 */
13015 code = (char *)&opt[IPOPT_OLEN] - (char *)ipha;
13016 if (optlen < IPOPT_MINLEN_IT) {
13017 goto param_prob;
13018 }
13019 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
13020 ip1dbg((
13021 "ip_output_options: bad option offset\n"));
13022 code = (char *)&opt[IPOPT_OFFSET] -
13023 (char *)ipha;
13024 goto param_prob;
13025 }
13026 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
13027 case IPOPT_TS_TSONLY:
13028 off = IPOPT_TS_TIMELEN;
13029 break;
13030 case IPOPT_TS_TSANDADDR:
13031 case IPOPT_TS_PRESPEC:
13032 case IPOPT_TS_PRESPEC_RFC791:
13033 off = IP_ADDR_LEN + IPOPT_TS_TIMELEN;
13034 break;
13035 default:
13036 code = (char *)&opt[IPOPT_POS_OV_FLG] -
13037 (char *)ipha;
13038 goto param_prob;
13039 }
13040 if (opt[IPOPT_OFFSET] - 1 + off > optlen &&
13041 (opt[IPOPT_POS_OV_FLG] & 0xF0) == 0xF0) {
13042 /*
13043 * No room and the overflow counter is 15
13044 * already.
13045 */
13046 goto param_prob;
13047 }
13048 break;
13049 }
13050 }
13051
13052 if ((opts.ipoptp_flags & IPOPTP_ERROR) == 0)
13053 return (0);
13054
13055 ip1dbg(("ip_output_options: error processing IP options.\n"));
13056 code = (char *)&opt[IPOPT_OFFSET] - (char *)ipha;
13057
13058 param_prob:
13059 bzero(&iras, sizeof (iras));
13060 iras.ira_ill = iras.ira_rill = ill;
13061 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
13062 iras.ira_rifindex = iras.ira_ruifindex;
13063 iras.ira_flags = IRAF_IS_IPV4;
13064
13065 ip_drop_output("ip_output_options", mp, ill);
13066 icmp_param_problem(mp, (uint8_t)code, &iras);
13067 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
13068 return (-1);
13069
13070 bad_src_route:
13071 bzero(&iras, sizeof (iras));
13072 iras.ira_ill = iras.ira_rill = ill;
13073 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
13074 iras.ira_rifindex = iras.ira_ruifindex;
13075 iras.ira_flags = IRAF_IS_IPV4;
13076
13077 ip_drop_input("ICMP_SOURCE_ROUTE_FAILED", mp, ill);
13078 icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, &iras);
13079 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
13080 return (-1);
13081 }
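
/*
 * Editorial worked example of the IPOPT_TS room check above, with
 * illustrative numbers: for IPOPT_TS_TSANDADDR,
 * off = IP_ADDR_LEN + IPOPT_TS_TIMELEN = 8.  With optlen 12 and a
 * 1-based opt[IPOPT_OFFSET] of 13, 13 - 1 + 8 = 20 > 12, so there is
 * no room for another entry; if the overflow nibble
 * (opt[IPOPT_POS_OV_FLG] & 0xF0) is already 0xF0 (count 15), the
 * packet draws an ICMP parameter problem.
 */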
13082
13083 /*
13084 * The maximum value of conn_drain_list_cnt is CONN_MAXDRAINCNT.
13085 * conn_drain_list_cnt can be changed by setting conn_drain_nthreads
13086 * thru /etc/system.
13087 */
13088 #define CONN_MAXDRAINCNT 64
13089
13090 static void
13091 conn_drain_init(ip_stack_t *ipst)
13092 {
13093 int i, j;
13094 idl_tx_list_t *itl_tx;
13095
13096 ipst->ips_conn_drain_list_cnt = conn_drain_nthreads;
13097
13098 if ((ipst->ips_conn_drain_list_cnt == 0) ||
13099 (ipst->ips_conn_drain_list_cnt > CONN_MAXDRAINCNT)) {
13100 /*
13101 * Default value of the number of drainers is the
13102 * number of CPUs, subject to a maximum of 8 drainers.
13103 */
13104 if (boot_max_ncpus != -1)
13105 ipst->ips_conn_drain_list_cnt = MIN(boot_max_ncpus, 8);
13106 else
13107 ipst->ips_conn_drain_list_cnt = MIN(max_ncpus, 8);
13108 }
13109
13110 ipst->ips_idl_tx_list =
13111 kmem_zalloc(TX_FANOUT_SIZE * sizeof (idl_tx_list_t), KM_SLEEP);
13112 for (i = 0; i < TX_FANOUT_SIZE; i++) {
13113 itl_tx = &ipst->ips_idl_tx_list[i];
13114 itl_tx->txl_drain_list =
13115 kmem_zalloc(ipst->ips_conn_drain_list_cnt *
13116 sizeof (idl_t), KM_SLEEP);
13117 mutex_init(&itl_tx->txl_lock, NULL, MUTEX_DEFAULT, NULL);
13118 for (j = 0; j < ipst->ips_conn_drain_list_cnt; j++) {
13119 mutex_init(&itl_tx->txl_drain_list[j].idl_lock, NULL,
13120 MUTEX_DEFAULT, NULL);
13121 itl_tx->txl_drain_list[j].idl_itl = itl_tx;
13122 }
13123 }
13124 }
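
/*
 * Editorial worked example of the sizing logic in conn_drain_init():
 * with conn_drain_nthreads left at its default of 0 and, say,
 * boot_max_ncpus == 16, ips_conn_drain_list_cnt becomes
 * MIN(16, 8) == 8; on a 4-CPU machine it becomes 4.  An /etc/system
 * setting in the range 1..CONN_MAXDRAINCNT (64) is used as-is.
 */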
13125
13126 static void
13127 conn_drain_fini(ip_stack_t *ipst)
13128 {
13129 int i;
13130 idl_tx_list_t *itl_tx;
13131
13132 for (i = 0; i < TX_FANOUT_SIZE; i++) {
13133 itl_tx = &ipst->ips_idl_tx_list[i];
13134 kmem_free(itl_tx->txl_drain_list,
13135 ipst->ips_conn_drain_list_cnt * sizeof (idl_t));
13136 }
13137 kmem_free(ipst->ips_idl_tx_list,
13138 TX_FANOUT_SIZE * sizeof (idl_tx_list_t));
13139 ipst->ips_idl_tx_list = NULL;
13140 }
13141
13142 /*
13143 * Flow control has blocked us from proceeding. Insert the given conn in one
13144 * of the conn drain lists. When flow control is unblocked, either ip_wsrv()
13145 * (STREAMS) or ill_flow_enable() (direct) will be called back, which in turn
13146 * will call conn_walk_drain(). See the flow control notes at the top of this
13147 * file for more details.
13148 */
13149 void
13150 conn_drain_insert(conn_t *connp, idl_tx_list_t *tx_list)
13151 {
13152 idl_t *idl = tx_list->txl_drain_list;
13153 uint_t index;
13154 ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
13155
13156 mutex_enter(&connp->conn_lock);
13157 if (connp->conn_state_flags & CONN_CLOSING) {
13158 /*
13159 * The conn is closing as a result of which CONN_CLOSING
13160 * is set. Return.
13161 */
13162 mutex_exit(&connp->conn_lock);
13163 return;
13164 } else if (connp->conn_idl == NULL) {
13165 /*
13166 * Assign the next drain list round robin. We don't use
13167 * a lock, and thus it may not be strictly round robin.
13168 * Atomicity of loads/stores is enough to make sure that
13169 * txl_drain_index is always within bounds.
13170 */
13171 index = tx_list->txl_drain_index;
13172 ASSERT(index < ipst->ips_conn_drain_list_cnt);
13173 connp->conn_idl = &tx_list->txl_drain_list[index];
13174 index++;
13175 if (index == ipst->ips_conn_drain_list_cnt)
13176 index = 0;
13177 tx_list->txl_drain_index = index;
13178 } else {
13179 ASSERT(connp->conn_idl->idl_itl == tx_list);
13180 }
13181 mutex_exit(&connp->conn_lock);
13182
13183 idl = connp->conn_idl;
13184 mutex_enter(&idl->idl_lock);
13185 if ((connp->conn_drain_prev != NULL) ||
13186 (connp->conn_state_flags & CONN_CLOSING)) {
13187 /*
13188 * The conn is either already in the drain list or closing.
13189 * (We needed to check for CONN_CLOSING again since close can
13190 * sneak in between dropping conn_lock and acquiring idl_lock.)
13191 */
13192 mutex_exit(&idl->idl_lock);
13193 return;
13194 }
13195
13196 /*
13197 * The conn is not in the drain list. Insert it at the
13198 * tail of the drain list. The drain list is circular
13199 * and doubly linked. idl_conn points to the 1st element
13200 * in the list.
13201 */
13202 if (idl->idl_conn == NULL) {
13203 idl->idl_conn = connp;
13204 connp->conn_drain_next = connp;
13205 connp->conn_drain_prev = connp;
13206 } else {
13207 conn_t *head = idl->idl_conn;
13208
13209 connp->conn_drain_next = head;
13210 connp->conn_drain_prev = head->conn_drain_prev;
13211 head->conn_drain_prev->conn_drain_next = connp;
13212 head->conn_drain_prev = connp;
13213 }
13214 /*
13215 * For non-STREAMS based sockets, assert flow control.
13216 */
13217 conn_setqfull(connp, NULL);
13218 mutex_exit(&idl->idl_lock);
13219 }
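
/*
 * Editorial illustration of the circular, doubly-linked insertion
 * above: with conns A and B already queued (idl_conn == A), inserting
 * C at the tail yields
 *
 *	A.next = B	B.next = C	C.next = A
 *	A.prev = C	B.prev = A	C.prev = B
 *
 * i.e. the newest conn is always idl_conn->conn_drain_prev.
 */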
13220
13221 static void
13222 conn_drain_remove(conn_t *connp)
13223 {
13224 idl_t *idl = connp->conn_idl;
13225
13226 if (idl != NULL) {
13227 /*
13228 * Remove ourself from the drain list.
13229 */
13230 if (connp->conn_drain_next == connp) {
13231 /* Singleton in the list */
13232 ASSERT(connp->conn_drain_prev == connp);
13233 idl->idl_conn = NULL;
13234 } else {
13235 connp->conn_drain_prev->conn_drain_next =
13236 connp->conn_drain_next;
13237 connp->conn_drain_next->conn_drain_prev =
13238 connp->conn_drain_prev;
13239 if (idl->idl_conn == connp)
13240 idl->idl_conn = connp->conn_drain_next;
13241 }
13242
13243 /*
13244 * NOTE: because conn_idl is associated with a specific drain
13245 * list which in turn is tied to the index the TX ring
13246 * (txl_cookie) hashes to, and because the TX ring can change
13247 * over the lifetime of the conn_t, we must clear conn_idl so
13248 * a subsequent conn_drain_insert() will set conn_idl again
13249 * based on the latest txl_cookie.
13250 */
13251 connp->conn_idl = NULL;
13252 }
13253 connp->conn_drain_next = NULL;
13254 connp->conn_drain_prev = NULL;
13255
13256 conn_clrqfull(connp, NULL);
13257 /*
13258 * For STREAMS based sockets, open up flow control.
13259 */
13260 if (!IPCL_IS_NONSTR(connp))
13261 enableok(connp->conn_wq);
13262 }
13263
13264 /*
13265 * This conn is closing, and we are called from ip_close. OR
13266 * this conn is draining because flow-control on the ill has been relieved.
13267 *
13268 * We also need to remove the conns on this idl from the list, and
13269 * inform the sockfs upcalls about the change in flow-control.
13270 */
13271 static void
13272 conn_drain(conn_t *connp, boolean_t closing)
13273 {
13274 idl_t *idl;
13275 conn_t *next_connp;
13276
13277 /*
13278 * connp->conn_idl is stable at this point, and no lock is needed
13279 * to check it. If we are called from ip_close, close has already
13280 * set CONN_CLOSING, thus freezing the value of conn_idl, and
13281 * called us only because conn_idl is non-null. If we are called thru
13282 * service, conn_idl could be null, but it cannot change because
13283 * service is single-threaded per queue, and there cannot be another
13284 * instance of service trying to call conn_drain_insert on this conn
13285 * now.
13286 */
13287 ASSERT(!closing || connp == NULL || connp->conn_idl != NULL);
13288
13289 /*
13290 * If the conn doesn't exist or is not on a drain list, bail.
13291 */
13292 if (connp == NULL || connp->conn_idl == NULL ||
13293 connp->conn_drain_prev == NULL) {
13294 return;
13295 }
13296
13297 idl = connp->conn_idl;
13298 ASSERT(MUTEX_HELD(&idl->idl_lock));
13299
13300 if (!closing) {
13301 next_connp = connp->conn_drain_next;
13302 while (next_connp != connp) {
13303 conn_t *delconnp = next_connp;
13304
13305 next_connp = next_connp->conn_drain_next;
13306 conn_drain_remove(delconnp);
13307 }
13308 ASSERT(connp->conn_drain_next == idl->idl_conn);
13309 }
13310 conn_drain_remove(connp);
13311 }
13312
13313 /*
13314 * Write service routine. Shared perimeter entry point.
13315 * The device queue's message count has fallen below the low water mark
13316 * and STREAMS has backenabled the ill_wq. Send a sockfs notification
13317 * about flow-control to each waiting conn.
13318 */
13319 void
13320 ip_wsrv(queue_t *q)
13321 {
13322 ill_t *ill;
13323
13324 ill = (ill_t *)q->q_ptr;
13325 if (ill->ill_state_flags == 0) {
13326 ip_stack_t *ipst = ill->ill_ipst;
13327
13328 /*
13329 * The device flow control has opened up.
13330 * Walk through the conn drain lists and drain the conns
13331 * on each list. This makes sense only
13332 * if the stream is fully plumbed and setup.
13333 * Hence the ill_state_flags check above.
13334 */
13335 ip1dbg(("ip_wsrv: walking\n"));
13336 conn_walk_drain(ipst, &ipst->ips_idl_tx_list[0]);
13337 enableok(ill->ill_wq);
13338 }
13339 }
13340
13341 /*
13342 * Callback to disable flow control in IP.
13343 *
13344 * This is a mac client callback added when the DLD_CAPAB_DIRECT capability
13345 * is enabled.
13346 *
13347 * When MAC_TX() is not able to send any more packets, dld sets its queue
13348 * to QFULL and enables STREAMS flow control. Later, when the underlying
13349 * driver is able to continue sending packets, it calls the
13350 * mac_tx_(ring_)update() function and wakes up the corresponding mac
13351 * worker threads, which in turn call this callback and disable flow control.
13352 */
13353 void
13354 ill_flow_enable(void *arg, ip_mac_tx_cookie_t cookie)
13355 {
13356 ill_t *ill = (ill_t *)arg;
13357 ip_stack_t *ipst = ill->ill_ipst;
13358 idl_tx_list_t *idl_txl;
13359
13360 idl_txl = &ipst->ips_idl_tx_list[IDLHASHINDEX(cookie)];
13361 mutex_enter(&idl_txl->txl_lock);
13362 /* add code to set a flag to indicate idl_txl is enabled */
13363 conn_walk_drain(ipst, idl_txl);
13364 mutex_exit(&idl_txl->txl_lock);
13365 }
13366
13367 /*
13368 * Flow control has been relieved and STREAMS has backenabled us; drain
13369 * all the conn lists on `tx_list'.
13370 */
13371 static void
13372 conn_walk_drain(ip_stack_t *ipst, idl_tx_list_t *tx_list)
13373 {
13374 int i;
13375 idl_t *idl;
13376
13377 IP_STAT(ipst, ip_conn_walk_drain);
13378
13379 for (i = 0; i < ipst->ips_conn_drain_list_cnt; i++) {
13380 idl = &tx_list->txl_drain_list[i];
13381 mutex_enter(&idl->idl_lock);
13382 conn_drain(idl->idl_conn, B_FALSE);
13383 mutex_exit(&idl->idl_lock);
13384 }
13385 }
13386
13387 /*
13388 * Determine if the ill and multicast aspects of the packet
13389 * "match" the conn.
13390 */
13391 boolean_t
13392 conn_wantpacket(conn_t *connp, ip_recv_attr_t *ira, ipha_t *ipha)
13393 {
13394 ill_t *ill = ira->ira_rill;
13395 zoneid_t zoneid = ira->ira_zoneid;
13396 uint_t in_ifindex;
13397 ipaddr_t dst, src;
13398
13399 dst = ipha->ipha_dst;
13400 src = ipha->ipha_src;
13401
13402 /*
13403 * conn_incoming_ifindex is set by IP_BOUND_IF which limits
13404 * unicast, broadcast and multicast reception to
13405 * conn_incoming_ifindex.
13406 * conn_wantpacket is called for unicast, broadcast and
13407 * multicast packets.
13408 */
13409 in_ifindex = connp->conn_incoming_ifindex;
13410
13411 /* mpathd can bind to the under IPMP interface, which we allow */
13412 if (in_ifindex != 0 && in_ifindex != ill->ill_phyint->phyint_ifindex) {
13413 if (!IS_UNDER_IPMP(ill))
13414 return (B_FALSE);
13415
13416 if (in_ifindex != ipmp_ill_get_ipmp_ifindex(ill))
13417 return (B_FALSE);
13418 }
13419
13420 if (!IPCL_ZONE_MATCH(connp, zoneid))
13421 return (B_FALSE);
13422
13423 if (!(ira->ira_flags & IRAF_MULTICAST))
13424 return (B_TRUE);
13425
13426 if (connp->conn_multi_router) {
13427 /* multicast packet and multicast router socket: send up */
13428 return (B_TRUE);
13429 }
13430
13431 if (ipha->ipha_protocol == IPPROTO_PIM ||
13432 ipha->ipha_protocol == IPPROTO_RSVP)
13433 return (B_TRUE);
13434
13435 return (conn_hasmembers_ill_withsrc_v4(connp, dst, src, ira->ira_ill));
13436 }
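
/*
 * Editorial example of the IP_BOUND_IF logic above: a conn bound to
 * ifindex 3 accepts packets arriving on the phyint with index 3.  A
 * packet arriving on any other index is rejected, unless the
 * receiving ill is an IPMP under-ill whose group (ipmp) ifindex is 3,
 * in which case it is still accepted (the in.mpathd special case
 * noted above).
 */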
13437
13438 void
13439 conn_setqfull(conn_t *connp, boolean_t *flow_stopped)
13440 {
13441 if (IPCL_IS_NONSTR(connp)) {
13442 (*connp->conn_upcalls->su_txq_full)
13443 (connp->conn_upper_handle, B_TRUE);
13444 if (flow_stopped != NULL)
13445 *flow_stopped = B_TRUE;
13446 } else {
13447 queue_t *q = connp->conn_wq;
13448
13449 ASSERT(q != NULL);
13450 if (!(q->q_flag & QFULL)) {
13451 mutex_enter(QLOCK(q));
13452 if (!(q->q_flag & QFULL)) {
13453 /* still need to set QFULL */
13454 q->q_flag |= QFULL;
13455 /* set flow_stopped to true under QLOCK */
13456 if (flow_stopped != NULL)
13457 *flow_stopped = B_TRUE;
13458 mutex_exit(QLOCK(q));
13459 } else {
13460 /* flow_stopped is left unchanged */
13461 mutex_exit(QLOCK(q));
13462 }
13463 }
13464 }
13465 }
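
/*
 * Editorial note on the locking pattern in conn_setqfull() and
 * conn_clrqfull(): QFULL is tested once without QLOCK as a cheap fast
 * path and retested under QLOCK before being changed (double-checked
 * locking).  *flow_stopped is only updated while QLOCK is held, so
 * the caller's snapshot stays consistent with the queue state.
 */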
13466
13467 void
13468 conn_clrqfull(conn_t *connp, boolean_t *flow_stopped)
13469 {
13470 if (IPCL_IS_NONSTR(connp)) {
13471 (*connp->conn_upcalls->su_txq_full)
13472 (connp->conn_upper_handle, B_FALSE);
13473 if (flow_stopped != NULL)
13474 *flow_stopped = B_FALSE;
13475 } else {
13476 queue_t *q = connp->conn_wq;
13477
13478 ASSERT(q != NULL);
13479 if (q->q_flag & QFULL) {
13480 mutex_enter(QLOCK(q));
13481 if (q->q_flag & QFULL) {
13482 q->q_flag &= ~QFULL;
13483 /* set flow_stopped to false under QLOCK */
13484 if (flow_stopped != NULL)
13485 *flow_stopped = B_FALSE;
13486 mutex_exit(QLOCK(q));
13487 if (q->q_flag & QWANTW)
13488 qbackenable(q, 0);
13489 } else {
13490 /* flow_stopped is left unchanged */
13491 mutex_exit(QLOCK(q));
13492 }
13493 }
13494 }
13495
13496 mutex_enter(&connp->conn_lock);
13497 connp->conn_blocked = B_FALSE;
13498 mutex_exit(&connp->conn_lock);
13499 }
13500
13501 /*
13502 * Return the length in bytes of the IPv4 headers (base header, label, and
13503 * other IP options) that will be needed based on the
13504 * ip_pkt_t structure passed by the caller.
13505 *
13506 * The returned length does not include the length of the upper level
13507 * protocol (ULP) header.
13508 * The caller needs to check that the length doesn't exceed the max for IPv4.
13509 */
13510 int
13511 ip_total_hdrs_len_v4(const ip_pkt_t *ipp)
13512 {
13513 int len;
13514
13515 len = IP_SIMPLE_HDR_LENGTH;
13516 if (ipp->ipp_fields & IPPF_LABEL_V4) {
13517 ASSERT(ipp->ipp_label_len_v4 != 0);
13518 /* We need to round up here */
13519 len += (ipp->ipp_label_len_v4 + 3) & ~3;
13520 }
13521
13522 if (ipp->ipp_fields & IPPF_IPV4_OPTIONS) {
13523 ASSERT(ipp->ipp_ipv4_options_len != 0);
13524 ASSERT((ipp->ipp_ipv4_options_len & 3) == 0);
13525 len += ipp->ipp_ipv4_options_len;
13526 }
13527 return (len);
13528 }
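
/*
 * Editorial worked example of the label rounding above: the
 * expression (len + 3) & ~3 rounds up to the next multiple of 4, so
 * ipp_label_len_v4 values of 9, 12 and 13 contribute 12, 12 and 16
 * bytes respectively.  IPv4 options must be 32-bit aligned, which is
 * why ipp_ipv4_options_len is asserted to already be a multiple of 4.
 */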
13529
13530 /*
13531 * All-purpose routine to build an IPv4 header with options based
13532 * on the abstract ip_pkt_t.
13533 *
13534 * The caller has to set the source and destination address as well as
13535 * ipha_length. The caller has to massage any source route and compensate
13536 * for the ULP pseudo-header checksum due to the source route.
13537 */
13538 void
13539 ip_build_hdrs_v4(uchar_t *buf, uint_t buf_len, const ip_pkt_t *ipp,
13540 uint8_t protocol)
13541 {
13542 ipha_t *ipha = (ipha_t *)buf;
13543 uint8_t *cp;
13544
13545 /* Initialize IPv4 header */
13546 ipha->ipha_type_of_service = ipp->ipp_type_of_service;
13547 ipha->ipha_length = 0; /* Caller will set later */
13548 ipha->ipha_ident = 0;
13549 ipha->ipha_fragment_offset_and_flags = 0;
13550 ipha->ipha_ttl = ipp->ipp_unicast_hops;
13551 ipha->ipha_protocol = protocol;
13552 ipha->ipha_hdr_checksum = 0;
13553
13554 if ((ipp->ipp_fields & IPPF_ADDR) &&
13555 IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
13556 ipha->ipha_src = ipp->ipp_addr_v4;
13557
13558 cp = (uint8_t *)&ipha[1];
13559 if (ipp->ipp_fields & IPPF_LABEL_V4) {
13560 ASSERT(ipp->ipp_label_len_v4 != 0);
13561 bcopy(ipp->ipp_label_v4, cp, ipp->ipp_label_len_v4);
13562 cp += ipp->ipp_label_len_v4;
13563 /* We need to round up here */
13564 while ((uintptr_t)cp & 0x3) {
13565 *cp++ = IPOPT_NOP;
13566 }
13567 }
13568
13569 if (ipp->ipp_fields & IPPF_IPV4_OPTIONS) {
13570 ASSERT(ipp->ipp_ipv4_options_len != 0);
13571 ASSERT((ipp->ipp_ipv4_options_len & 3) == 0);
13572 bcopy(ipp->ipp_ipv4_options, cp, ipp->ipp_ipv4_options_len);
13573 cp += ipp->ipp_ipv4_options_len;
13574 }
13575 ipha->ipha_version_and_hdr_length =
13576 (uint8_t)((IP_VERSION << 4) + buf_len / 4);
13577
13578 ASSERT((int)(cp - buf) == buf_len);
13579 }
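
/*
 * Editorial sketch (not compiled) of a hypothetical caller of
 * ip_build_hdrs_v4(), showing the division of labor described above;
 * ulp_len is a made-up name for the upper-level protocol length:
 *
 *	len = ip_total_hdrs_len_v4(ipp);
 *	ip_build_hdrs_v4(buf, len, ipp, IPPROTO_TCP);
 *	ipha = (ipha_t *)buf;
 *	ipha->ipha_src = src;
 *	ipha->ipha_dst = dst;
 *	ipha->ipha_length = htons(len + ulp_len);
 */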
13580
13581 /* Allocate the private structure */
13582 static int
13583 ip_priv_alloc(void **bufp)
13584 {
13585 void *buf;
13586
13587 if ((buf = kmem_alloc(sizeof (ip_priv_t), KM_NOSLEEP)) == NULL)
13588 return (ENOMEM);
13589
13590 *bufp = buf;
13591 return (0);
13592 }
13593
13594 /* Function to delete the private structure */
13595 void
13596 ip_priv_free(void *buf)
13597 {
13598 ASSERT(buf != NULL);
13599 kmem_free(buf, sizeof (ip_priv_t));
13600 }
13601
13602 /*
13603 * The entry point for IPPF processing.
13604 * If the classifier (IPGPC_CLASSIFY) is not loaded and configured, the
13605 * routine just returns.
13606 *
13607 * When called, ip_process generates an ipp_packet_t structure
13608 * which holds the state information for this packet and invokes
13609 * the classifier (via ipp_packet_process). The classification, depending on
13610 * configured filters, results in a list of actions for this packet. Invoking
13611 * an action may cause the packet to be dropped, in which case we return NULL.
13612 * proc indicates the callout position for
13613 * this packet and ill is the interface this packet arrived on or will leave
13614 * on (inbound and outbound resp.).
13615 *
13616 * We do the processing on the rill (mapped to the upper ill if ipmp), but
13617 * update the MIB on the ill corresponding to the destination IP address.
13618 */
13619 mblk_t *
13620 ip_process(ip_proc_t proc, mblk_t *mp, ill_t *rill, ill_t *ill)
13621 {
13622 ip_priv_t *priv;
13623 ipp_action_id_t aid;
13624 int rc = 0;
13625 ipp_packet_t *pp;
13626
13627 /* If the classifier is not loaded, return */
13628 if ((aid = ipp_action_lookup(IPGPC_CLASSIFY)) == IPP_ACTION_INVAL) {
13629 return (mp);
13630 }
13631
13632 ASSERT(mp != NULL);
13633
13634 /* Allocate the packet structure */
13635 rc = ipp_packet_alloc(&pp, "ip", aid);
13636 if (rc != 0)
13637 goto drop;
13638
13639 /* Allocate the private structure */
13640 rc = ip_priv_alloc((void **)&priv);
13641 if (rc != 0) {
13642 ipp_packet_free(pp);
13643 goto drop;
13644 }
13645 priv->proc = proc;
13646 priv->ill_index = ill_get_upper_ifindex(rill);
13647
13648 ipp_packet_set_private(pp, priv, ip_priv_free);
13649 ipp_packet_set_data(pp, mp);
13650
13651 /* Invoke the classifier */
13652 rc = ipp_packet_process(&pp);
13653 if (pp != NULL) {
13654 mp = ipp_packet_get_data(pp);
13655 ipp_packet_free(pp);
13656 if (rc != 0)
13657 goto drop;
13658 return (mp);
13659 } else {
13660 /* No mp to trace in ip_drop_input/ip_drop_output */
13661 mp = NULL;
13662 }
13663 drop:
13664 if (proc == IPP_LOCAL_IN || proc == IPP_FWD_IN) {
13665 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
13666 ip_drop_input("ip_process", mp, ill);
13667 } else {
13668 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
13669 ip_drop_output("ip_process", mp, ill);
13670 }
13671 freemsg(mp);
13672 return (NULL);
13673 }
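
/*
 * Editorial sketch of the ip_process() contract for a hypothetical
 * caller: a NULL return means the packet was consumed (dropped and
 * counted), so the caller simply gives up on it:
 *
 *	mp = ip_process(IPP_LOCAL_IN, mp, rill, ill);
 *	if (mp == NULL)
 *		return;
 */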
13674
13675 /*
13676 * Propagate a multicast group membership operation (add/drop) on
13677 * all the interfaces crossed by the related multirt routes.
13678 * The call is considered successful if the operation succeeds
13679 * on at least one interface.
13680 *
13681 * This assumes that a set of IRE_HOST/RTF_MULTIRT has been created for the
13682 * multicast addresses with the ire argument being the first one.
13683 * We walk the bucket to find all of them.
13684 *
13685 * Common to IPv4 and IPv6.
13686 */
13687 static int
13688 ip_multirt_apply_membership(int (*fn)(conn_t *, boolean_t,
13689 const in6_addr_t *, ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *),
13690 ire_t *ire, conn_t *connp, boolean_t checkonly, const in6_addr_t *v6group,
13691 mcast_record_t fmode, const in6_addr_t *v6src)
13692 {
13693 ire_t *ire_gw;
13694 irb_t *irb;
13695 int ifindex;
13696 int error = 0;
13697 int result;
13698 ip_stack_t *ipst = ire->ire_ipst;
13699 ipaddr_t group;
13700 boolean_t isv6;
13701 int match_flags;
13702
13703 if (IN6_IS_ADDR_V4MAPPED(v6group)) {
13704 IN6_V4MAPPED_TO_IPADDR(v6group, group);
13705 isv6 = B_FALSE;
13706 } else {
13707 isv6 = B_TRUE;
13708 }
13709
13710 irb = ire->ire_bucket;
13711 ASSERT(irb != NULL);
13712
13713 result = 0;
13714 irb_refhold(irb);
13715 for (; ire != NULL; ire = ire->ire_next) {
13716 if ((ire->ire_flags & RTF_MULTIRT) == 0)
13717 continue;
13718
13719 /* We handle -ifp routes by matching on the ill if set */
13720 match_flags = MATCH_IRE_TYPE;
13721 if (ire->ire_ill != NULL)
13722 match_flags |= MATCH_IRE_ILL;
13723
13724 if (isv6) {
13725 if (!IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, v6group))
13726 continue;
13727
13728 ire_gw = ire_ftable_lookup_v6(&ire->ire_gateway_addr_v6,
13729 0, 0, IRE_INTERFACE, ire->ire_ill, ALL_ZONES, NULL,
13730 match_flags, 0, ipst, NULL);
13731 } else {
13732 if (ire->ire_addr != group)
13733 continue;
13734
13735 ire_gw = ire_ftable_lookup_v4(ire->ire_gateway_addr,
13736 0, 0, IRE_INTERFACE, ire->ire_ill, ALL_ZONES, NULL,
13737 match_flags, 0, ipst, NULL);
13738 }
13739 /* No interface route exists for the gateway; skip this ire. */
13740 if (ire_gw == NULL)
13741 continue;
13742 if (ire_gw->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
13743 ire_refrele(ire_gw);
13744 continue;
13745 }
13746 ASSERT(ire_gw->ire_ill != NULL); /* IRE_INTERFACE */
13747 ifindex = ire_gw->ire_ill->ill_phyint->phyint_ifindex;
13748
13749 /*
13750 * The operation is considered a success if
13751 * it succeeds at least once on any one interface.
13752 */
13753 error = fn(connp, checkonly, v6group, INADDR_ANY, ifindex,
13754 fmode, v6src);
13755 if (error == 0)
13756 result = CGTP_MCAST_SUCCESS;
13757
13758 ire_refrele(ire_gw);
13759 }
13760 irb_refrele(irb);
13761 /*
13762 * Consider the call as successful if we succeeded on at least
13763 * one interface. Otherwise, return the last encountered error.
13764 */
13765 return (result == CGTP_MCAST_SUCCESS ? 0 : error);
13766 }
13767
13768 /*
13769 * Return the expected CGTP hooks version number.
13770 */
13771 int
13772 ip_cgtp_filter_supported(void)
13773 {
13774 return (ip_cgtp_filter_rev);
13775 }
13776
13777 /*
13778 * CGTP hooks can be registered by invoking this function.
13779 * Checks that the version number matches.
13780 */
13781 int
13782 ip_cgtp_filter_register(netstackid_t stackid, cgtp_filter_ops_t *ops)
13783 {
13784 netstack_t *ns;
13785 ip_stack_t *ipst;
13786
13787 if (ops->cfo_filter_rev != CGTP_FILTER_REV)
13788 return (ENOTSUP);
13789
13790 ns = netstack_find_by_stackid(stackid);
13791 if (ns == NULL)
13792 return (EINVAL);
13793 ipst = ns->netstack_ip;
13794 ASSERT(ipst != NULL);
13795
13796 if (ipst->ips_ip_cgtp_filter_ops != NULL) {
13797 netstack_rele(ns);
13798 return (EALREADY);
13799 }
13800
13801 ipst->ips_ip_cgtp_filter_ops = ops;
13802
13803 ill_set_inputfn_all(ipst);
13804
13805 netstack_rele(ns);
13806 return (0);
13807 }
13808
13809 /*
13810 * CGTP hooks can be unregistered by invoking this function.
13811 * Returns ENXIO if there was no registration.
13812 * Returns EBUSY if the ndd variable has not been turned off.
13813 */
13814 int
13815 ip_cgtp_filter_unregister(netstackid_t stackid)
13816 {
13817 netstack_t *ns;
13818 ip_stack_t *ipst;
13819
13820 ns = netstack_find_by_stackid(stackid);
13821 if (ns == NULL)
13822 return (EINVAL);
13823 ipst = ns->netstack_ip;
13824 ASSERT(ipst != NULL);
13825
13826 if (ipst->ips_ip_cgtp_filter) {
13827 netstack_rele(ns);
13828 return (EBUSY);
13829 }
13830
13831 if (ipst->ips_ip_cgtp_filter_ops == NULL) {
13832 netstack_rele(ns);
13833 return (ENXIO);
13834 }
13835 ipst->ips_ip_cgtp_filter_ops = NULL;
13836
13837 ill_set_inputfn_all(ipst);
13838
13839 netstack_rele(ns);
13840 return (0);
13841 }
13842
13843 /*
13844 * Check whether there is a CGTP filter registration.
13845 * Returns non-zero if there is a registration, otherwise returns zero.
13846 * Note: returns zero if bad stackid.
13847 */
13848 int
13849 ip_cgtp_filter_is_registered(netstackid_t stackid)
13850 {
13851 netstack_t *ns;
13852 ip_stack_t *ipst;
13853 int ret;
13854
13855 ns = netstack_find_by_stackid(stackid);
13856 if (ns == NULL)
13857 return (0);
13858 ipst = ns->netstack_ip;
13859 ASSERT(ipst != NULL);
13860
13861 if (ipst->ips_ip_cgtp_filter_ops != NULL)
13862 ret = 1;
13863 else
13864 ret = 0;
13865
13866 netstack_rele(ns);
13867 return (ret);
13868 }
13869
13870 static int
13871 ip_squeue_switch(int val)
13872 {
13873 int rval;
13874
13875 switch (val) {
13876 case IP_SQUEUE_ENTER_NODRAIN:
13877 rval = SQ_NODRAIN;
13878 break;
13879 case IP_SQUEUE_ENTER:
13880 rval = SQ_PROCESS;
13881 break;
13882 case IP_SQUEUE_FILL:
13883 default:
13884 rval = SQ_FILL;
13885 break;
13886 }
13887 return (rval);
13888 }
13889
13890 static void *
13891 ip_kstat2_init(netstackid_t stackid, ip_stat_t *ip_statisticsp)
13892 {
13893 kstat_t *ksp;
13894
13895 ip_stat_t template = {
13896 { "ip_udp_fannorm", KSTAT_DATA_UINT64 },
13897 { "ip_udp_fanmb", KSTAT_DATA_UINT64 },
13898 { "ip_recv_pullup", KSTAT_DATA_UINT64 },
13899 { "ip_db_ref", KSTAT_DATA_UINT64 },
13900 { "ip_notaligned", KSTAT_DATA_UINT64 },
13901 { "ip_multimblk", KSTAT_DATA_UINT64 },
13902 { "ip_opt", KSTAT_DATA_UINT64 },
13903 { "ipsec_proto_ahesp", KSTAT_DATA_UINT64 },
13904 { "ip_conn_flputbq", KSTAT_DATA_UINT64 },
13905 { "ip_conn_walk_drain", KSTAT_DATA_UINT64 },
13906 { "ip_out_sw_cksum", KSTAT_DATA_UINT64 },
13907 { "ip_out_sw_cksum_bytes", KSTAT_DATA_UINT64 },
13908 { "ip_in_sw_cksum", KSTAT_DATA_UINT64 },
13909 { "ip_ire_reclaim_calls", KSTAT_DATA_UINT64 },
13910 { "ip_ire_reclaim_deleted", KSTAT_DATA_UINT64 },
13911 { "ip_nce_reclaim_calls", KSTAT_DATA_UINT64 },
13912 { "ip_nce_reclaim_deleted", KSTAT_DATA_UINT64 },
13913 { "ip_dce_reclaim_calls", KSTAT_DATA_UINT64 },
13914 { "ip_dce_reclaim_deleted", KSTAT_DATA_UINT64 },
13915 { "ip_tcp_in_full_hw_cksum_err", KSTAT_DATA_UINT64 },
13916 { "ip_tcp_in_part_hw_cksum_err", KSTAT_DATA_UINT64 },
13917 { "ip_tcp_in_sw_cksum_err", KSTAT_DATA_UINT64 },
13918 { "ip_udp_in_full_hw_cksum_err", KSTAT_DATA_UINT64 },
13919 { "ip_udp_in_part_hw_cksum_err", KSTAT_DATA_UINT64 },
13920 { "ip_udp_in_sw_cksum_err", KSTAT_DATA_UINT64 },
13921 { "conn_in_recvdstaddr", KSTAT_DATA_UINT64 },
13922 { "conn_in_recvopts", KSTAT_DATA_UINT64 },
13923 { "conn_in_recvif", KSTAT_DATA_UINT64 },
13924 { "conn_in_recvslla", KSTAT_DATA_UINT64 },
13925 { "conn_in_recvucred", KSTAT_DATA_UINT64 },
13926 { "conn_in_recvttl", KSTAT_DATA_UINT64 },
13927 { "conn_in_recvhopopts", KSTAT_DATA_UINT64 },
13928 { "conn_in_recvhoplimit", KSTAT_DATA_UINT64 },
13929 { "conn_in_recvdstopts", KSTAT_DATA_UINT64 },
13930 { "conn_in_recvrthdrdstopts", KSTAT_DATA_UINT64 },
13931 { "conn_in_recvrthdr", KSTAT_DATA_UINT64 },
13932 { "conn_in_recvpktinfo", KSTAT_DATA_UINT64 },
13933 { "conn_in_recvtclass", KSTAT_DATA_UINT64 },
13934 { "conn_in_timestamp", KSTAT_DATA_UINT64 },
13935 };
13936
13937 ksp = kstat_create_netstack("ip", 0, "ipstat", "net",
13938 KSTAT_TYPE_NAMED, sizeof (template) / sizeof (kstat_named_t),
13939 KSTAT_FLAG_VIRTUAL, stackid);
13940
13941 if (ksp == NULL)
13942 return (NULL);
13943
13944 bcopy(&template, ip_statisticsp, sizeof (template));
13945 ksp->ks_data = (void *)ip_statisticsp;
13946 ksp->ks_private = (void *)(uintptr_t)stackid;
13947
13948 kstat_install(ksp);
13949 return (ksp);
13950 }
13951
13952 static void
13953 ip_kstat2_fini(netstackid_t stackid, kstat_t *ksp)
13954 {
13955 if (ksp != NULL) {
13956 ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private);
13957 kstat_delete_netstack(ksp, stackid);
13958 }
13959 }
13960
13961 static void *
13962 ip_kstat_init(netstackid_t stackid, ip_stack_t *ipst)
13963 {
13964 kstat_t *ksp;
13965
13966 ip_named_kstat_t template = {
13967 { "forwarding", KSTAT_DATA_UINT32, 0 },
13968 { "defaultTTL", KSTAT_DATA_UINT32, 0 },
13969 { "inReceives", KSTAT_DATA_UINT64, 0 },
13970 { "inHdrErrors", KSTAT_DATA_UINT32, 0 },
13971 { "inAddrErrors", KSTAT_DATA_UINT32, 0 },
13972 { "forwDatagrams", KSTAT_DATA_UINT64, 0 },
13973 { "inUnknownProtos", KSTAT_DATA_UINT32, 0 },
13974 { "inDiscards", KSTAT_DATA_UINT32, 0 },
13975 { "inDelivers", KSTAT_DATA_UINT64, 0 },
13976 { "outRequests", KSTAT_DATA_UINT64, 0 },
13977 { "outDiscards", KSTAT_DATA_UINT32, 0 },
13978 { "outNoRoutes", KSTAT_DATA_UINT32, 0 },
13979 { "reasmTimeout", KSTAT_DATA_UINT32, 0 },
13980 { "reasmReqds", KSTAT_DATA_UINT32, 0 },
13981 { "reasmOKs", KSTAT_DATA_UINT32, 0 },
13982 { "reasmFails", KSTAT_DATA_UINT32, 0 },
13983 { "fragOKs", KSTAT_DATA_UINT32, 0 },
13984 { "fragFails", KSTAT_DATA_UINT32, 0 },
13985 { "fragCreates", KSTAT_DATA_UINT32, 0 },
13986 { "addrEntrySize", KSTAT_DATA_INT32, 0 },
13987 { "routeEntrySize", KSTAT_DATA_INT32, 0 },
13988 { "netToMediaEntrySize", KSTAT_DATA_INT32, 0 },
13989 { "routingDiscards", KSTAT_DATA_UINT32, 0 },
13990 { "inErrs", KSTAT_DATA_UINT32, 0 },
13991 { "noPorts", KSTAT_DATA_UINT32, 0 },
13992 { "inCksumErrs", KSTAT_DATA_UINT32, 0 },
13993 { "reasmDuplicates", KSTAT_DATA_UINT32, 0 },
13994 { "reasmPartDups", KSTAT_DATA_UINT32, 0 },
13995 { "forwProhibits", KSTAT_DATA_UINT32, 0 },
13996 { "udpInCksumErrs", KSTAT_DATA_UINT32, 0 },
13997 { "udpInOverflows", KSTAT_DATA_UINT32, 0 },
13998 { "rawipInOverflows", KSTAT_DATA_UINT32, 0 },
13999 { "ipsecInSucceeded", KSTAT_DATA_UINT32, 0 },
14000 { "ipsecInFailed", KSTAT_DATA_INT32, 0 },
14001 { "memberEntrySize", KSTAT_DATA_INT32, 0 },
14002 { "inIPv6", KSTAT_DATA_UINT32, 0 },
14003 { "outIPv6", KSTAT_DATA_UINT32, 0 },
14004 { "outSwitchIPv6", KSTAT_DATA_UINT32, 0 },
14005 };
14006
14007 ksp = kstat_create_netstack("ip", 0, "ip", "mib2", KSTAT_TYPE_NAMED,
14008 NUM_OF_FIELDS(ip_named_kstat_t), 0, stackid);
14009 if (ksp == NULL || ksp->ks_data == NULL)
14010 return (NULL);
14011
	template.forwarding.value.ui32 = WE_ARE_FORWARDING(ipst) ? 1 : 2;
14013 template.defaultTTL.value.ui32 = (uint32_t)ipst->ips_ip_def_ttl;
14014 template.reasmTimeout.value.ui32 = ipst->ips_ip_reassembly_timeout;
14015 template.addrEntrySize.value.i32 = sizeof (mib2_ipAddrEntry_t);
14016 template.routeEntrySize.value.i32 = sizeof (mib2_ipRouteEntry_t);
14017
14018 template.netToMediaEntrySize.value.i32 =
14019 sizeof (mib2_ipNetToMediaEntry_t);
14020
14021 template.memberEntrySize.value.i32 = sizeof (ipv6_member_t);
14022
14023 bcopy(&template, ksp->ks_data, sizeof (template));
14024 ksp->ks_update = ip_kstat_update;
14025 ksp->ks_private = (void *)(uintptr_t)stackid;
14026
14027 kstat_install(ksp);
14028 return (ksp);
14029 }
14030
14031 static void
14032 ip_kstat_fini(netstackid_t stackid, kstat_t *ksp)
14033 {
14034 if (ksp != NULL) {
14035 ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private);
14036 kstat_delete_netstack(ksp, stackid);
14037 }
14038 }
14039
14040 static int
14041 ip_kstat_update(kstat_t *kp, int rw)
14042 {
14043 ip_named_kstat_t *ipkp;
14044 mib2_ipIfStatsEntry_t ipmib;
14045 ill_walk_context_t ctx;
14046 ill_t *ill;
	netstackid_t stackid = (netstackid_t)(uintptr_t)kp->ks_private;
14048 netstack_t *ns;
14049 ip_stack_t *ipst;
14050
14051 if (kp == NULL || kp->ks_data == NULL)
14052 return (EIO);
14053
14054 if (rw == KSTAT_WRITE)
14055 return (EACCES);
14056
14057 ns = netstack_find_by_stackid(stackid);
14058 if (ns == NULL)
14059 return (-1);
14060 ipst = ns->netstack_ip;
14061 if (ipst == NULL) {
14062 netstack_rele(ns);
14063 return (-1);
14064 }
14065 ipkp = (ip_named_kstat_t *)kp->ks_data;
14066
14067 bcopy(&ipst->ips_ip_mib, &ipmib, sizeof (ipmib));
14068 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
14069 ill = ILL_START_WALK_V4(&ctx, ipst);
14070 for (; ill != NULL; ill = ill_next(&ctx, ill))
14071 ip_mib2_add_ip_stats(&ipmib, ill->ill_ip_mib);
14072 rw_exit(&ipst->ips_ill_g_lock);
14073
14074 ipkp->forwarding.value.ui32 = ipmib.ipIfStatsForwarding;
14075 ipkp->defaultTTL.value.ui32 = ipmib.ipIfStatsDefaultTTL;
14076 ipkp->inReceives.value.ui64 = ipmib.ipIfStatsHCInReceives;
14077 ipkp->inHdrErrors.value.ui32 = ipmib.ipIfStatsInHdrErrors;
14078 ipkp->inAddrErrors.value.ui32 = ipmib.ipIfStatsInAddrErrors;
14079 ipkp->forwDatagrams.value.ui64 = ipmib.ipIfStatsHCOutForwDatagrams;
14080 ipkp->inUnknownProtos.value.ui32 = ipmib.ipIfStatsInUnknownProtos;
14081 ipkp->inDiscards.value.ui32 = ipmib.ipIfStatsInDiscards;
14082 ipkp->inDelivers.value.ui64 = ipmib.ipIfStatsHCInDelivers;
14083 ipkp->outRequests.value.ui64 = ipmib.ipIfStatsHCOutRequests;
14084 ipkp->outDiscards.value.ui32 = ipmib.ipIfStatsOutDiscards;
14085 ipkp->outNoRoutes.value.ui32 = ipmib.ipIfStatsOutNoRoutes;
14086 ipkp->reasmTimeout.value.ui32 = ipst->ips_ip_reassembly_timeout;
14087 ipkp->reasmReqds.value.ui32 = ipmib.ipIfStatsReasmReqds;
14088 ipkp->reasmOKs.value.ui32 = ipmib.ipIfStatsReasmOKs;
14089 ipkp->reasmFails.value.ui32 = ipmib.ipIfStatsReasmFails;
14090 ipkp->fragOKs.value.ui32 = ipmib.ipIfStatsOutFragOKs;
14091 ipkp->fragFails.value.ui32 = ipmib.ipIfStatsOutFragFails;
14092 ipkp->fragCreates.value.ui32 = ipmib.ipIfStatsOutFragCreates;
14093
14094 ipkp->routingDiscards.value.ui32 = 0;
14095 ipkp->inErrs.value.ui32 = ipmib.tcpIfStatsInErrs;
14096 ipkp->noPorts.value.ui32 = ipmib.udpIfStatsNoPorts;
14097 ipkp->inCksumErrs.value.ui32 = ipmib.ipIfStatsInCksumErrs;
14098 ipkp->reasmDuplicates.value.ui32 = ipmib.ipIfStatsReasmDuplicates;
14099 ipkp->reasmPartDups.value.ui32 = ipmib.ipIfStatsReasmPartDups;
14100 ipkp->forwProhibits.value.ui32 = ipmib.ipIfStatsForwProhibits;
14101 ipkp->udpInCksumErrs.value.ui32 = ipmib.udpIfStatsInCksumErrs;
14102 ipkp->udpInOverflows.value.ui32 = ipmib.udpIfStatsInOverflows;
14103 ipkp->rawipInOverflows.value.ui32 = ipmib.rawipIfStatsInOverflows;
14104 ipkp->ipsecInSucceeded.value.ui32 = ipmib.ipsecIfStatsInSucceeded;
14105 ipkp->ipsecInFailed.value.i32 = ipmib.ipsecIfStatsInFailed;
14106
14107 ipkp->inIPv6.value.ui32 = ipmib.ipIfStatsInWrongIPVersion;
14108 ipkp->outIPv6.value.ui32 = ipmib.ipIfStatsOutWrongIPVersion;
14109 ipkp->outSwitchIPv6.value.ui32 = ipmib.ipIfStatsOutSwitchIPVersion;
14110
14111 netstack_rele(ns);
14112
14113 return (0);
14114 }
14115
14116 static void *
14117 icmp_kstat_init(netstackid_t stackid)
14118 {
14119 kstat_t *ksp;
14120
14121 icmp_named_kstat_t template = {
14122 { "inMsgs", KSTAT_DATA_UINT32 },
14123 { "inErrors", KSTAT_DATA_UINT32 },
14124 { "inDestUnreachs", KSTAT_DATA_UINT32 },
14125 { "inTimeExcds", KSTAT_DATA_UINT32 },
14126 { "inParmProbs", KSTAT_DATA_UINT32 },
14127 { "inSrcQuenchs", KSTAT_DATA_UINT32 },
14128 { "inRedirects", KSTAT_DATA_UINT32 },
14129 { "inEchos", KSTAT_DATA_UINT32 },
14130 { "inEchoReps", KSTAT_DATA_UINT32 },
14131 { "inTimestamps", KSTAT_DATA_UINT32 },
14132 { "inTimestampReps", KSTAT_DATA_UINT32 },
14133 { "inAddrMasks", KSTAT_DATA_UINT32 },
14134 { "inAddrMaskReps", KSTAT_DATA_UINT32 },
14135 { "outMsgs", KSTAT_DATA_UINT32 },
14136 { "outErrors", KSTAT_DATA_UINT32 },
14137 { "outDestUnreachs", KSTAT_DATA_UINT32 },
14138 { "outTimeExcds", KSTAT_DATA_UINT32 },
14139 { "outParmProbs", KSTAT_DATA_UINT32 },
14140 { "outSrcQuenchs", KSTAT_DATA_UINT32 },
14141 { "outRedirects", KSTAT_DATA_UINT32 },
14142 { "outEchos", KSTAT_DATA_UINT32 },
14143 { "outEchoReps", KSTAT_DATA_UINT32 },
14144 { "outTimestamps", KSTAT_DATA_UINT32 },
14145 { "outTimestampReps", KSTAT_DATA_UINT32 },
14146 { "outAddrMasks", KSTAT_DATA_UINT32 },
14147 { "outAddrMaskReps", KSTAT_DATA_UINT32 },
14148 { "inChksumErrs", KSTAT_DATA_UINT32 },
14149 { "inUnknowns", KSTAT_DATA_UINT32 },
14150 { "inFragNeeded", KSTAT_DATA_UINT32 },
14151 { "outFragNeeded", KSTAT_DATA_UINT32 },
14152 { "outDrops", KSTAT_DATA_UINT32 },
14153 { "inOverFlows", KSTAT_DATA_UINT32 },
14154 { "inBadRedirects", KSTAT_DATA_UINT32 },
14155 };
14156
14157 ksp = kstat_create_netstack("ip", 0, "icmp", "mib2", KSTAT_TYPE_NAMED,
14158 NUM_OF_FIELDS(icmp_named_kstat_t), 0, stackid);
14159 if (ksp == NULL || ksp->ks_data == NULL)
14160 return (NULL);
14161
14162 bcopy(&template, ksp->ks_data, sizeof (template));
14163
14164 ksp->ks_update = icmp_kstat_update;
14165 ksp->ks_private = (void *)(uintptr_t)stackid;
14166
14167 kstat_install(ksp);
14168 return (ksp);
14169 }
14170
14171 static void
14172 icmp_kstat_fini(netstackid_t stackid, kstat_t *ksp)
14173 {
14174 if (ksp != NULL) {
14175 ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private);
14176 kstat_delete_netstack(ksp, stackid);
14177 }
14178 }
14179
14180 static int
14181 icmp_kstat_update(kstat_t *kp, int rw)
14182 {
14183 icmp_named_kstat_t *icmpkp;
	netstackid_t stackid = (netstackid_t)(uintptr_t)kp->ks_private;
14185 netstack_t *ns;
14186 ip_stack_t *ipst;
14187
14188 if ((kp == NULL) || (kp->ks_data == NULL))
14189 return (EIO);
14190
14191 if (rw == KSTAT_WRITE)
14192 return (EACCES);
14193
14194 ns = netstack_find_by_stackid(stackid);
14195 if (ns == NULL)
14196 return (-1);
14197 ipst = ns->netstack_ip;
14198 if (ipst == NULL) {
14199 netstack_rele(ns);
14200 return (-1);
14201 }
14202 icmpkp = (icmp_named_kstat_t *)kp->ks_data;
14203
14204 icmpkp->inMsgs.value.ui32 = ipst->ips_icmp_mib.icmpInMsgs;
14205 icmpkp->inErrors.value.ui32 = ipst->ips_icmp_mib.icmpInErrors;
14206 icmpkp->inDestUnreachs.value.ui32 =
14207 ipst->ips_icmp_mib.icmpInDestUnreachs;
14208 icmpkp->inTimeExcds.value.ui32 = ipst->ips_icmp_mib.icmpInTimeExcds;
14209 icmpkp->inParmProbs.value.ui32 = ipst->ips_icmp_mib.icmpInParmProbs;
14210 icmpkp->inSrcQuenchs.value.ui32 = ipst->ips_icmp_mib.icmpInSrcQuenchs;
14211 icmpkp->inRedirects.value.ui32 = ipst->ips_icmp_mib.icmpInRedirects;
14212 icmpkp->inEchos.value.ui32 = ipst->ips_icmp_mib.icmpInEchos;
14213 icmpkp->inEchoReps.value.ui32 = ipst->ips_icmp_mib.icmpInEchoReps;
14214 icmpkp->inTimestamps.value.ui32 = ipst->ips_icmp_mib.icmpInTimestamps;
14215 icmpkp->inTimestampReps.value.ui32 =
14216 ipst->ips_icmp_mib.icmpInTimestampReps;
14217 icmpkp->inAddrMasks.value.ui32 = ipst->ips_icmp_mib.icmpInAddrMasks;
14218 icmpkp->inAddrMaskReps.value.ui32 =
14219 ipst->ips_icmp_mib.icmpInAddrMaskReps;
14220 icmpkp->outMsgs.value.ui32 = ipst->ips_icmp_mib.icmpOutMsgs;
14221 icmpkp->outErrors.value.ui32 = ipst->ips_icmp_mib.icmpOutErrors;
14222 icmpkp->outDestUnreachs.value.ui32 =
14223 ipst->ips_icmp_mib.icmpOutDestUnreachs;
14224 icmpkp->outTimeExcds.value.ui32 = ipst->ips_icmp_mib.icmpOutTimeExcds;
14225 icmpkp->outParmProbs.value.ui32 = ipst->ips_icmp_mib.icmpOutParmProbs;
14226 icmpkp->outSrcQuenchs.value.ui32 =
14227 ipst->ips_icmp_mib.icmpOutSrcQuenchs;
14228 icmpkp->outRedirects.value.ui32 = ipst->ips_icmp_mib.icmpOutRedirects;
14229 icmpkp->outEchos.value.ui32 = ipst->ips_icmp_mib.icmpOutEchos;
14230 icmpkp->outEchoReps.value.ui32 = ipst->ips_icmp_mib.icmpOutEchoReps;
14231 icmpkp->outTimestamps.value.ui32 =
14232 ipst->ips_icmp_mib.icmpOutTimestamps;
14233 icmpkp->outTimestampReps.value.ui32 =
14234 ipst->ips_icmp_mib.icmpOutTimestampReps;
14235 icmpkp->outAddrMasks.value.ui32 =
14236 ipst->ips_icmp_mib.icmpOutAddrMasks;
14237 icmpkp->outAddrMaskReps.value.ui32 =
14238 ipst->ips_icmp_mib.icmpOutAddrMaskReps;
14239 icmpkp->inCksumErrs.value.ui32 = ipst->ips_icmp_mib.icmpInCksumErrs;
14240 icmpkp->inUnknowns.value.ui32 = ipst->ips_icmp_mib.icmpInUnknowns;
14241 icmpkp->inFragNeeded.value.ui32 = ipst->ips_icmp_mib.icmpInFragNeeded;
14242 icmpkp->outFragNeeded.value.ui32 =
14243 ipst->ips_icmp_mib.icmpOutFragNeeded;
14244 icmpkp->outDrops.value.ui32 = ipst->ips_icmp_mib.icmpOutDrops;
14245 icmpkp->inOverflows.value.ui32 = ipst->ips_icmp_mib.icmpInOverflows;
14246 icmpkp->inBadRedirects.value.ui32 =
14247 ipst->ips_icmp_mib.icmpInBadRedirects;
14248
14249 netstack_rele(ns);
14250 return (0);
14251 }
14252
14253 /*
14254 * This is the fanout function for raw socket opened for SCTP. Note
14255 * that it is called after SCTP checks that there is no socket which
14256 * wants a packet. Then before SCTP handles this out of the blue packet,
14257 * this function is called to see if there is any raw socket for SCTP.
14258 * If there is and it is bound to the correct address, the packet will
14259 * be sent to that socket. Note that only one raw socket can be bound to
14260 * a port. This is assured in ipcl_sctp_hash_insert();
14261 */
14262 void
14263 ip_fanout_sctp_raw(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h, uint32_t ports,
14264 ip_recv_attr_t *ira)
14265 {
14266 conn_t *connp;
14267 queue_t *rq;
14268 boolean_t secure;
14269 ill_t *ill = ira->ira_ill;
14270 ip_stack_t *ipst = ill->ill_ipst;
14271 ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec;
14272 sctp_stack_t *sctps = ipst->ips_netstack->netstack_sctp;
14273 iaflags_t iraflags = ira->ira_flags;
14274 ill_t *rill = ira->ira_rill;
14275
14276 secure = iraflags & IRAF_IPSEC_SECURE;
14277
14278 connp = ipcl_classify_raw(mp, IPPROTO_SCTP, ports, ipha, ip6h,
14279 ira, ipst);
14280 if (connp == NULL) {
14281 /*
14282 * Although raw sctp is not summed, OOB chunks must be.
14283 * Drop the packet here if the sctp checksum failed.
14284 */
14285 if (iraflags & IRAF_SCTP_CSUM_ERR) {
14286 SCTPS_BUMP_MIB(sctps, sctpChecksumError);
14287 freemsg(mp);
14288 return;
14289 }
14290 ira->ira_ill = ira->ira_rill = NULL;
14291 sctp_ootb_input(mp, ira, ipst);
14292 ira->ira_ill = ill;
14293 ira->ira_rill = rill;
14294 return;
14295 }
14296 rq = connp->conn_rq;
14297 if (IPCL_IS_NONSTR(connp) ? connp->conn_flow_cntrld : !canputnext(rq)) {
14298 CONN_DEC_REF(connp);
14299 BUMP_MIB(ill->ill_ip_mib, rawipIfStatsInOverflows);
14300 freemsg(mp);
14301 return;
14302 }
14303 if (((iraflags & IRAF_IS_IPV4) ?
14304 CONN_INBOUND_POLICY_PRESENT(connp, ipss) :
14305 CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss)) ||
14306 secure) {
14307 mp = ipsec_check_inbound_policy(mp, connp, ipha,
14308 ip6h, ira);
14309 if (mp == NULL) {
14310 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
14311 /* Note that mp is NULL */
14312 ip_drop_input("ipIfStatsInDiscards", mp, ill);
14313 CONN_DEC_REF(connp);
14314 return;
14315 }
14316 }
14317
14318 if (iraflags & IRAF_ICMP_ERROR) {
14319 (connp->conn_recvicmp)(connp, mp, NULL, ira);
	} else {
14323 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
14324 /* This is the SOCK_RAW, IPPROTO_SCTP case. */
14325 ira->ira_ill = ira->ira_rill = NULL;
14326 (connp->conn_recv)(connp, mp, NULL, ira);
14327 ira->ira_ill = ill;
14328 ira->ira_rill = rill;
14329 }
14330 CONN_DEC_REF(connp);
14331 }
14332
14333 /*
14334 * Free a packet that has the link-layer dl_unitdata_req_t or fast-path
14335 * header before the ip payload.
14336 */
14337 static void
14338 ip_xmit_flowctl_drop(ill_t *ill, mblk_t *mp, boolean_t is_fp_mp, int fp_mp_len)
14339 {
14340 int len = (mp->b_wptr - mp->b_rptr);
14341 mblk_t *ip_mp;
14342
14343 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
14344 if (is_fp_mp || len != fp_mp_len) {
14345 if (len > fp_mp_len) {
14346 /*
14347 * fastpath header and ip header in the first mblk
14348 */
14349 mp->b_rptr += fp_mp_len;
14350 } else {
14351 /*
14352 * ip_xmit_attach_llhdr had to prepend an mblk to
14353 * attach the fastpath header before ip header.
14354 */
14355 ip_mp = mp->b_cont;
14356 freeb(mp);
14357 mp = ip_mp;
14358 mp->b_rptr += (fp_mp_len - len);
14359 }
14360 } else {
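		/*
		 * The first mblk carries only the link-layer
		 * dl_unitdata_req_t header; the IP payload follows
		 * in b_cont.
		 */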
14361 ip_mp = mp->b_cont;
14362 freeb(mp);
14363 mp = ip_mp;
14364 }
14365 ip_drop_output("ipIfStatsOutDiscards - flow ctl", mp, ill);
14366 freemsg(mp);
14367 }
14368
14369 /*
14370 * Normal post fragmentation function.
14371 *
14372 * Send a packet using the passed in nce. This handles both IPv4 and IPv6
14373 * using the same state machine.
14374 *
 * We return an error on failure. In particular we return EWOULDBLOCK
 * when the driver flow controls. In that case this ensures that ip_wsrv runs
 * (currently by canputnext failure resulting in backenabling from GLD).
 * This allows the callers of conn_ip_output() to use EWOULDBLOCK as an
 * indication that they can flow control until ip_wsrv() tells them to restart.
14380 *
14381 * If the nce passed by caller is incomplete, this function
14382 * queues the packet and if necessary, sends ARP request and bails.
14383 * If the Neighbor Cache passed is fully resolved, we simply prepend
14384 * the link-layer header to the packet, do ipsec hw acceleration
14385 * work if necessary, and send the packet out on the wire.
14386 */
14387 /* ARGSUSED6 */
14388 int
14389 ip_xmit(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, uint_t pkt_len,
14390 uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid, uintptr_t *ixacookie)
14391 {
14392 queue_t *wq;
14393 ill_t *ill = nce->nce_ill;
14394 ip_stack_t *ipst = ill->ill_ipst;
14395 uint64_t delta;
14396 boolean_t isv6 = ill->ill_isv6;
14397 boolean_t fp_mp;
14398 ncec_t *ncec = nce->nce_common;
14399 int64_t now = LBOLT_FASTPATH64;
14400 boolean_t is_probe;
14401
14402 DTRACE_PROBE1(ip__xmit, nce_t *, nce);
14403
14404 ASSERT(mp != NULL);
14405 ASSERT(mp->b_datap->db_type == M_DATA);
14406 ASSERT(pkt_len == msgdsize(mp));
14407
14408 /*
14409 * If we have already been here and are coming back after ARP/ND.
14410 * the IXAF_NO_TRACE flag is set. We skip FW_HOOKS, DTRACE and ipobs
14411 * in that case since they have seen the packet when it came here
14412 * the first time.
14413 */
14414 if (ixaflags & IXAF_NO_TRACE)
14415 goto sendit;
14416
14417 if (ixaflags & IXAF_IS_IPV4) {
14418 ipha_t *ipha = (ipha_t *)mp->b_rptr;
14419
14420 ASSERT(!isv6);
14421 ASSERT(pkt_len == ntohs(((ipha_t *)mp->b_rptr)->ipha_length));
14422 if (HOOKS4_INTERESTED_PHYSICAL_OUT(ipst) &&
14423 !(ixaflags & IXAF_NO_PFHOOK)) {
14424 int error;
14425
14426 FW_HOOKS(ipst->ips_ip4_physical_out_event,
14427 ipst->ips_ipv4firewall_physical_out,
14428 NULL, ill, ipha, mp, mp, 0, ipst, error);
14429 DTRACE_PROBE1(ip4__physical__out__end,
14430 mblk_t *, mp);
14431 if (mp == NULL)
14432 return (error);
14433
14434 /* The length could have changed */
14435 pkt_len = msgdsize(mp);
14436 }
14437 if (ipst->ips_ip4_observe.he_interested) {
14438 /*
14439 * Note that for TX the zoneid is the sending
14440 * zone, whether or not MLP is in play.
14441 * Since the szone argument is the IP zoneid (i.e.,
14442 * zero for exclusive-IP zones) and ipobs wants
14443 * the system zoneid, we map it here.
14444 */
14445 szone = IP_REAL_ZONEID(szone, ipst);
14446
14447 /*
14448 * On the outbound path the destination zone will be
14449 * unknown as we're sending this packet out on the
14450 * wire.
14451 */
14452 ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone, ALL_ZONES,
14453 ill, ipst);
14454 }
14455 DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL,
14456 void_ip_t *, ipha, __dtrace_ipsr_ill_t *, ill,
14457 ipha_t *, ipha, ip6_t *, NULL, int, 0);
14458 } else {
14459 ip6_t *ip6h = (ip6_t *)mp->b_rptr;
14460
14461 ASSERT(isv6);
14462 ASSERT(pkt_len ==
14463 ntohs(((ip6_t *)mp->b_rptr)->ip6_plen) + IPV6_HDR_LEN);
14464 if (HOOKS6_INTERESTED_PHYSICAL_OUT(ipst) &&
14465 !(ixaflags & IXAF_NO_PFHOOK)) {
14466 int error;
14467
14468 FW_HOOKS6(ipst->ips_ip6_physical_out_event,
14469 ipst->ips_ipv6firewall_physical_out,
14470 NULL, ill, ip6h, mp, mp, 0, ipst, error);
14471 DTRACE_PROBE1(ip6__physical__out__end,
14472 mblk_t *, mp);
14473 if (mp == NULL)
14474 return (error);
14475
14476 /* The length could have changed */
14477 pkt_len = msgdsize(mp);
14478 }
14479 if (ipst->ips_ip6_observe.he_interested) {
14480 /* See above */
14481 szone = IP_REAL_ZONEID(szone, ipst);
14482
14483 ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone, ALL_ZONES,
14484 ill, ipst);
14485 }
14486 DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL,
14487 void_ip_t *, ip6h, __dtrace_ipsr_ill_t *, ill,
14488 ipha_t *, NULL, ip6_t *, ip6h, int, 0);
14489 }
14490
14491 sendit:
14492 /*
14493 * We check the state without a lock because the state can never
14494 * move "backwards" to initial or incomplete.
14495 */
14496 switch (ncec->ncec_state) {
14497 case ND_REACHABLE:
14498 case ND_STALE:
14499 case ND_DELAY:
14500 case ND_PROBE:
14501 mp = ip_xmit_attach_llhdr(mp, nce);
14502 if (mp == NULL) {
14503 /*
14504 * ip_xmit_attach_llhdr has increased
14505 * ipIfStatsOutDiscards and called ip_drop_output()
14506 */
14507 return (ENOBUFS);
14508 }
14509 /*
14510 * check if nce_fastpath completed and we tagged on a
14511 * copy of nce_fp_mp in ip_xmit_attach_llhdr().
14512 */
14513 fp_mp = (mp->b_datap->db_type == M_DATA);
14514
14515 if (fp_mp &&
14516 (ill->ill_capabilities & ILL_CAPAB_DLD_DIRECT)) {
14517 ill_dld_direct_t *idd;
14518
14519 idd = &ill->ill_dld_capab->idc_direct;
14520 /*
14521 * Send the packet directly to DLD, where it
14522 * may be queued depending on the availability
14523 * of transmit resources at the media layer.
14524 * Return value should be taken into
14525 * account and flow control the TCP.
14526 */
14527 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits);
14528 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets,
14529 pkt_len);
14530
14531 if (ixaflags & IXAF_NO_DEV_FLOW_CTL) {
14532 (void) idd->idd_tx_df(idd->idd_tx_dh, mp,
14533 (uintptr_t)xmit_hint, IP_DROP_ON_NO_DESC);
14534 } else {
14535 uintptr_t cookie;
14536
14537 if ((cookie = idd->idd_tx_df(idd->idd_tx_dh,
14538 mp, (uintptr_t)xmit_hint, 0)) != 0) {
14539 if (ixacookie != NULL)
14540 *ixacookie = cookie;
14541 return (EWOULDBLOCK);
14542 }
14543 }
14544 } else {
14545 wq = ill->ill_wq;
14546
14547 if (!(ixaflags & IXAF_NO_DEV_FLOW_CTL) &&
14548 !canputnext(wq)) {
14549 if (ixacookie != NULL)
14550 *ixacookie = 0;
14551 ip_xmit_flowctl_drop(ill, mp, fp_mp,
14552 nce->nce_fp_mp != NULL ?
14553 MBLKL(nce->nce_fp_mp) : 0);
14554 return (EWOULDBLOCK);
14555 }
14556 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits);
14557 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets,
14558 pkt_len);
14559 putnext(wq, mp);
14560 }
14561
14562 /*
		 * The rest of this function implements Neighbor Unreachability
		 * Detection (NUD). Determine if the ncec is eligible for NUD.
14565 */
14566 if (ncec->ncec_flags & NCE_F_NONUD)
14567 return (0);
14568
14569 ASSERT(ncec->ncec_state != ND_INCOMPLETE);
14570
14571 /*
14572 * Check for upper layer advice
14573 */
14574 if (ixaflags & IXAF_REACH_CONF) {
14575 timeout_id_t tid;
14576
14577 /*
14578 * It should be o.k. to check the state without
14579 * a lock here, at most we lose an advice.
14580 */
14581 ncec->ncec_last = TICK_TO_MSEC(now);
14582 if (ncec->ncec_state != ND_REACHABLE) {
14583 mutex_enter(&ncec->ncec_lock);
14584 ncec->ncec_state = ND_REACHABLE;
14585 tid = ncec->ncec_timeout_id;
14586 ncec->ncec_timeout_id = 0;
14587 mutex_exit(&ncec->ncec_lock);
14588 (void) untimeout(tid);
14589 if (ip_debug > 2) {
14590 /* ip1dbg */
14591 pr_addr_dbg("ip_xmit: state"
14592 " for %s changed to"
14593 " REACHABLE\n", AF_INET6,
14594 &ncec->ncec_addr);
14595 }
14596 }
14597 return (0);
14598 }
14599
14600 delta = TICK_TO_MSEC(now) - ncec->ncec_last;
14601 ip1dbg(("ip_xmit: delta = %" PRId64
14602 " ill_reachable_time = %d \n", delta,
14603 ill->ill_reachable_time));
14604 if (delta > (uint64_t)ill->ill_reachable_time) {
14605 mutex_enter(&ncec->ncec_lock);
14606 switch (ncec->ncec_state) {
14607 case ND_REACHABLE:
14608 ASSERT((ncec->ncec_flags & NCE_F_NONUD) == 0);
14609 /* FALLTHROUGH */
14610 case ND_STALE:
14611 /*
14612 * ND_REACHABLE is identical to
14613 * ND_STALE in this specific case. If
14614 * reachable time has expired for this
14615 * neighbor (delta is greater than
14616 * reachable time), conceptually, the
14617 * neighbor cache is no longer in
14618 * REACHABLE state, but already in
14619 * STALE state. So the correct
14620 * transition here is to ND_DELAY.
14621 */
14622 ncec->ncec_state = ND_DELAY;
14623 mutex_exit(&ncec->ncec_lock);
14624 nce_restart_timer(ncec,
14625 ipst->ips_delay_first_probe_time);
14626 if (ip_debug > 3) {
14627 /* ip2dbg */
14628 pr_addr_dbg("ip_xmit: state"
14629 " for %s changed to"
14630 " DELAY\n", AF_INET6,
14631 &ncec->ncec_addr);
14632 }
14633 break;
14634 case ND_DELAY:
14635 case ND_PROBE:
14636 mutex_exit(&ncec->ncec_lock);
14637 /* Timers have already started */
14638 break;
14639 case ND_UNREACHABLE:
14640 /*
14641 * nce_timer has detected that this ncec
14642 * is unreachable and initiated deleting
14643 * this ncec.
14644 * This is a harmless race where we found the
14645 * ncec before it was deleted and have
14646 * just sent out a packet using this
14647 * unreachable ncec.
14648 */
14649 mutex_exit(&ncec->ncec_lock);
14650 break;
14651 default:
14652 ASSERT(0);
14653 mutex_exit(&ncec->ncec_lock);
14654 }
14655 }
14656 return (0);
14657
14658 case ND_INCOMPLETE:
14659 /*
		 * The state could have changed since we didn't hold the lock,
		 * so re-verify it under the lock.
14662 */
14663 is_probe = ipmp_packet_is_probe(mp, nce->nce_ill);
14664 mutex_enter(&ncec->ncec_lock);
14665 if (NCE_ISREACHABLE(ncec)) {
14666 mutex_exit(&ncec->ncec_lock);
14667 goto sendit;
14668 }
14669 /* queue the packet */
14670 nce_queue_mp(ncec, mp, is_probe);
14671 mutex_exit(&ncec->ncec_lock);
14672 DTRACE_PROBE2(ip__xmit__incomplete,
14673 (ncec_t *), ncec, (mblk_t *), mp);
14674 return (0);
14675
14676 case ND_INITIAL:
14677 /*
14678 * State could have changed since we didn't hold the lock, so
14679 * re-verify state.
14680 */
14681 is_probe = ipmp_packet_is_probe(mp, nce->nce_ill);
14682 mutex_enter(&ncec->ncec_lock);
14683 if (NCE_ISREACHABLE(ncec)) {
14684 mutex_exit(&ncec->ncec_lock);
14685 goto sendit;
14686 }
14687 nce_queue_mp(ncec, mp, is_probe);
14688 if (ncec->ncec_state == ND_INITIAL) {
14689 ncec->ncec_state = ND_INCOMPLETE;
14690 mutex_exit(&ncec->ncec_lock);
14691 /*
14692 * figure out the source we want to use
14693 * and resolve it.
14694 */
14695 ip_ndp_resolve(ncec);
14696 } else {
14697 mutex_exit(&ncec->ncec_lock);
14698 }
14699 return (0);
14700
14701 case ND_UNREACHABLE:
14702 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
14703 ip_drop_output("ipIfStatsOutDiscards - ND_UNREACHABLE",
14704 mp, ill);
14705 freemsg(mp);
14706 return (0);
14707
14708 default:
14709 ASSERT(0);
14710 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
14711 ip_drop_output("ipIfStatsOutDiscards - ND_other",
14712 mp, ill);
14713 freemsg(mp);
14714 return (ENETUNREACH);
14715 }
14716 }
14717
14718 /*
14719 * Return B_TRUE if the buffers differ in length or content.
14720 * This is used for comparing extension header buffers.
 * Note that an extension header would be declared different
 * even if all that changed was the next header value in that header, i.e.,
 * what really changed is the next extension header.
14724 */
14725 boolean_t
14726 ip_cmpbuf(const void *abuf, uint_t alen, boolean_t b_valid, const void *bbuf,
14727 uint_t blen)
14728 {
14729 if (!b_valid)
14730 blen = 0;
14731
14732 if (alen != blen)
14733 return (B_TRUE);
14734 if (alen == 0)
14735 return (B_FALSE); /* Both zero length */
	return (bcmp(abuf, bbuf, alen) != 0);
14737 }
14738
14739 /*
14740 * Preallocate memory for ip_savebuf(). Returns B_TRUE if ok.
14741 * Return B_FALSE if memory allocation fails - don't change any state!
14742 */
14743 boolean_t
14744 ip_allocbuf(void **dstp, uint_t *dstlenp, boolean_t src_valid,
14745 const void *src, uint_t srclen)
14746 {
14747 void *dst;
14748
14749 if (!src_valid)
14750 srclen = 0;
14751
14752 ASSERT(*dstlenp == 0);
14753 if (src != NULL && srclen != 0) {
14754 dst = mi_alloc(srclen, BPRI_MED);
14755 if (dst == NULL)
14756 return (B_FALSE);
14757 } else {
14758 dst = NULL;
14759 }
14760 if (*dstp != NULL)
14761 mi_free(*dstp);
14762 *dstp = dst;
14763 *dstlenp = dst == NULL ? 0 : srclen;
14764 return (B_TRUE);
14765 }
14766
14767 /*
14768 * Replace what is in *dst, *dstlen with the source.
14769 * Assumes ip_allocbuf has already been called.
14770 */
14771 void
14772 ip_savebuf(void **dstp, uint_t *dstlenp, boolean_t src_valid,
14773 const void *src, uint_t srclen)
14774 {
14775 if (!src_valid)
14776 srclen = 0;
14777
14778 ASSERT(*dstlenp == srclen);
14779 if (src != NULL && srclen != 0)
14780 bcopy(src, *dstp, srclen);
14781 }
14782
14783 /*
14784 * Free the storage pointed to by the members of an ip_pkt_t.
14785 */
14786 void
14787 ip_pkt_free(ip_pkt_t *ipp)
14788 {
14789 uint_t fields = ipp->ipp_fields;
14790
14791 if (fields & IPPF_HOPOPTS) {
14792 kmem_free(ipp->ipp_hopopts, ipp->ipp_hopoptslen);
14793 ipp->ipp_hopopts = NULL;
14794 ipp->ipp_hopoptslen = 0;
14795 }
14796 if (fields & IPPF_RTHDRDSTOPTS) {
14797 kmem_free(ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen);
14798 ipp->ipp_rthdrdstopts = NULL;
14799 ipp->ipp_rthdrdstoptslen = 0;
14800 }
14801 if (fields & IPPF_DSTOPTS) {
14802 kmem_free(ipp->ipp_dstopts, ipp->ipp_dstoptslen);
14803 ipp->ipp_dstopts = NULL;
14804 ipp->ipp_dstoptslen = 0;
14805 }
14806 if (fields & IPPF_RTHDR) {
14807 kmem_free(ipp->ipp_rthdr, ipp->ipp_rthdrlen);
14808 ipp->ipp_rthdr = NULL;
14809 ipp->ipp_rthdrlen = 0;
14810 }
14811 if (fields & IPPF_IPV4_OPTIONS) {
14812 kmem_free(ipp->ipp_ipv4_options, ipp->ipp_ipv4_options_len);
14813 ipp->ipp_ipv4_options = NULL;
14814 ipp->ipp_ipv4_options_len = 0;
14815 }
14816 if (fields & IPPF_LABEL_V4) {
14817 kmem_free(ipp->ipp_label_v4, ipp->ipp_label_len_v4);
14818 ipp->ipp_label_v4 = NULL;
14819 ipp->ipp_label_len_v4 = 0;
14820 }
14821 if (fields & IPPF_LABEL_V6) {
14822 kmem_free(ipp->ipp_label_v6, ipp->ipp_label_len_v6);
14823 ipp->ipp_label_v6 = NULL;
14824 ipp->ipp_label_len_v6 = 0;
14825 }
14826 ipp->ipp_fields &= ~(IPPF_HOPOPTS | IPPF_RTHDRDSTOPTS | IPPF_DSTOPTS |
14827 IPPF_RTHDR | IPPF_IPV4_OPTIONS | IPPF_LABEL_V4 | IPPF_LABEL_V6);
14828 }
14829
14830 /*
14831 * Copy from src to dst and allocate as needed.
14832 * Returns zero or ENOMEM.
14833 *
14834 * The caller must initialize dst to zero.
14835 */
14836 int
14837 ip_pkt_copy(ip_pkt_t *src, ip_pkt_t *dst, int kmflag)
14838 {
14839 uint_t fields = src->ipp_fields;
14840
14841 /* Start with fields that don't require memory allocation */
14842 dst->ipp_fields = fields &
14843 ~(IPPF_HOPOPTS | IPPF_RTHDRDSTOPTS | IPPF_DSTOPTS |
14844 IPPF_RTHDR | IPPF_IPV4_OPTIONS | IPPF_LABEL_V4 | IPPF_LABEL_V6);
14845
14846 dst->ipp_addr = src->ipp_addr;
14847 dst->ipp_unicast_hops = src->ipp_unicast_hops;
14848 dst->ipp_hoplimit = src->ipp_hoplimit;
14849 dst->ipp_tclass = src->ipp_tclass;
14850 dst->ipp_type_of_service = src->ipp_type_of_service;
14851
14852 if (!(fields & (IPPF_HOPOPTS | IPPF_RTHDRDSTOPTS | IPPF_DSTOPTS |
14853 IPPF_RTHDR | IPPF_IPV4_OPTIONS | IPPF_LABEL_V4 | IPPF_LABEL_V6)))
14854 return (0);
14855
14856 if (fields & IPPF_HOPOPTS) {
14857 dst->ipp_hopopts = kmem_alloc(src->ipp_hopoptslen, kmflag);
14858 if (dst->ipp_hopopts == NULL) {
14859 ip_pkt_free(dst);
14860 return (ENOMEM);
14861 }
14862 dst->ipp_fields |= IPPF_HOPOPTS;
14863 bcopy(src->ipp_hopopts, dst->ipp_hopopts,
14864 src->ipp_hopoptslen);
14865 dst->ipp_hopoptslen = src->ipp_hopoptslen;
14866 }
14867 if (fields & IPPF_RTHDRDSTOPTS) {
14868 dst->ipp_rthdrdstopts = kmem_alloc(src->ipp_rthdrdstoptslen,
14869 kmflag);
14870 if (dst->ipp_rthdrdstopts == NULL) {
14871 ip_pkt_free(dst);
14872 return (ENOMEM);
14873 }
14874 dst->ipp_fields |= IPPF_RTHDRDSTOPTS;
14875 bcopy(src->ipp_rthdrdstopts, dst->ipp_rthdrdstopts,
14876 src->ipp_rthdrdstoptslen);
14877 dst->ipp_rthdrdstoptslen = src->ipp_rthdrdstoptslen;
14878 }
14879 if (fields & IPPF_DSTOPTS) {
14880 dst->ipp_dstopts = kmem_alloc(src->ipp_dstoptslen, kmflag);
14881 if (dst->ipp_dstopts == NULL) {
14882 ip_pkt_free(dst);
14883 return (ENOMEM);
14884 }
14885 dst->ipp_fields |= IPPF_DSTOPTS;
14886 bcopy(src->ipp_dstopts, dst->ipp_dstopts,
14887 src->ipp_dstoptslen);
14888 dst->ipp_dstoptslen = src->ipp_dstoptslen;
14889 }
14890 if (fields & IPPF_RTHDR) {
14891 dst->ipp_rthdr = kmem_alloc(src->ipp_rthdrlen, kmflag);
14892 if (dst->ipp_rthdr == NULL) {
14893 ip_pkt_free(dst);
14894 return (ENOMEM);
14895 }
14896 dst->ipp_fields |= IPPF_RTHDR;
14897 bcopy(src->ipp_rthdr, dst->ipp_rthdr,
14898 src->ipp_rthdrlen);
14899 dst->ipp_rthdrlen = src->ipp_rthdrlen;
14900 }
14901 if (fields & IPPF_IPV4_OPTIONS) {
14902 dst->ipp_ipv4_options = kmem_alloc(src->ipp_ipv4_options_len,
14903 kmflag);
14904 if (dst->ipp_ipv4_options == NULL) {
14905 ip_pkt_free(dst);
14906 return (ENOMEM);
14907 }
14908 dst->ipp_fields |= IPPF_IPV4_OPTIONS;
14909 bcopy(src->ipp_ipv4_options, dst->ipp_ipv4_options,
14910 src->ipp_ipv4_options_len);
14911 dst->ipp_ipv4_options_len = src->ipp_ipv4_options_len;
14912 }
14913 if (fields & IPPF_LABEL_V4) {
14914 dst->ipp_label_v4 = kmem_alloc(src->ipp_label_len_v4, kmflag);
14915 if (dst->ipp_label_v4 == NULL) {
14916 ip_pkt_free(dst);
14917 return (ENOMEM);
14918 }
14919 dst->ipp_fields |= IPPF_LABEL_V4;
14920 bcopy(src->ipp_label_v4, dst->ipp_label_v4,
14921 src->ipp_label_len_v4);
14922 dst->ipp_label_len_v4 = src->ipp_label_len_v4;
14923 }
14924 if (fields & IPPF_LABEL_V6) {
14925 dst->ipp_label_v6 = kmem_alloc(src->ipp_label_len_v6, kmflag);
14926 if (dst->ipp_label_v6 == NULL) {
14927 ip_pkt_free(dst);
14928 return (ENOMEM);
14929 }
14930 dst->ipp_fields |= IPPF_LABEL_V6;
14931 bcopy(src->ipp_label_v6, dst->ipp_label_v6,
14932 src->ipp_label_len_v6);
14933 dst->ipp_label_len_v6 = src->ipp_label_len_v6;
14934 }
14935 if (fields & IPPF_FRAGHDR) {
14936 dst->ipp_fraghdr = kmem_alloc(src->ipp_fraghdrlen, kmflag);
14937 if (dst->ipp_fraghdr == NULL) {
14938 ip_pkt_free(dst);
14939 return (ENOMEM);
14940 }
14941 dst->ipp_fields |= IPPF_FRAGHDR;
14942 bcopy(src->ipp_fraghdr, dst->ipp_fraghdr,
14943 src->ipp_fraghdrlen);
14944 dst->ipp_fraghdrlen = src->ipp_fraghdrlen;
14945 }
14946 return (0);
14947 }
14948
14949 /*
14950 * Returns INADDR_ANY if no source route
14951 */
14952 ipaddr_t
14953 ip_pkt_source_route_v4(const ip_pkt_t *ipp)
14954 {
14955 ipaddr_t nexthop = INADDR_ANY;
14956 ipoptp_t opts;
14957 uchar_t *opt;
14958 uint8_t optval;
14959 uint8_t optlen;
14960 uint32_t totallen;
14961
14962 if (!(ipp->ipp_fields & IPPF_IPV4_OPTIONS))
14963 return (INADDR_ANY);
14964
14965 totallen = ipp->ipp_ipv4_options_len;
14966 if (totallen & 0x3)
14967 return (INADDR_ANY);
14968
14969 for (optval = ipoptp_first2(&opts, totallen, ipp->ipp_ipv4_options);
14970 optval != IPOPT_EOL;
14971 optval = ipoptp_next(&opts)) {
14972 opt = opts.ipoptp_cur;
14973 switch (optval) {
14974 uint8_t off;
14975 case IPOPT_SSRR:
14976 case IPOPT_LSRR:
14977 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
14978 break;
14979 }
14980 optlen = opts.ipoptp_len;
14981 off = opt[IPOPT_OFFSET];
14982 off--;
14983 if (optlen < IP_ADDR_LEN ||
14984 off > optlen - IP_ADDR_LEN) {
14985 /* End of source route */
14986 break;
14987 }
14988 bcopy((char *)opt + off, &nexthop, IP_ADDR_LEN);
14989 if (nexthop == htonl(INADDR_LOOPBACK)) {
14990 /* Ignore */
14991 nexthop = INADDR_ANY;
14992 break;
14993 }
14994 break;
14995 }
14996 }
14997 return (nexthop);
14998 }
14999
15000 /*
15001 * Reverse a source route.
15002 */
15003 void
15004 ip_pkt_source_route_reverse_v4(ip_pkt_t *ipp)
15005 {
15006 ipaddr_t tmp;
15007 ipoptp_t opts;
15008 uchar_t *opt;
15009 uint8_t optval;
15010 uint32_t totallen;
15011
15012 if (!(ipp->ipp_fields & IPPF_IPV4_OPTIONS))
15013 return;
15014
15015 totallen = ipp->ipp_ipv4_options_len;
15016 if (totallen & 0x3)
15017 return;
15018
15019 for (optval = ipoptp_first2(&opts, totallen, ipp->ipp_ipv4_options);
15020 optval != IPOPT_EOL;
15021 optval = ipoptp_next(&opts)) {
15022 uint8_t off1, off2;
15023
15024 opt = opts.ipoptp_cur;
15025 switch (optval) {
15026 case IPOPT_SSRR:
15027 case IPOPT_LSRR:
15028 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
15029 break;
15030 }
15031 off1 = IPOPT_MINOFF_SR - 1;
15032 off2 = opt[IPOPT_OFFSET] - IP_ADDR_LEN - 1;
15033 while (off2 > off1) {
15034 bcopy(opt + off2, &tmp, IP_ADDR_LEN);
15035 bcopy(opt + off1, opt + off2, IP_ADDR_LEN);
15036 bcopy(&tmp, opt + off2, IP_ADDR_LEN);
15037 off2 -= IP_ADDR_LEN;
15038 off1 += IP_ADDR_LEN;
15039 }
15040 opt[IPOPT_OFFSET] = IPOPT_MINOFF_SR;
15041 break;
15042 }
15043 }
15044 }
15045
15046 /*
15047 * Returns NULL if no routing header
15048 */
15049 in6_addr_t *
15050 ip_pkt_source_route_v6(const ip_pkt_t *ipp)
15051 {
15052 in6_addr_t *nexthop = NULL;
15053 ip6_rthdr0_t *rthdr;
15054
15055 if (!(ipp->ipp_fields & IPPF_RTHDR))
15056 return (NULL);
15057
15058 rthdr = (ip6_rthdr0_t *)ipp->ipp_rthdr;
15059 if (rthdr->ip6r0_segleft == 0)
15060 return (NULL);
15061
15062 nexthop = (in6_addr_t *)((char *)rthdr + sizeof (*rthdr));
15063 return (nexthop);
15064 }
15065
15066 zoneid_t
15067 ip_get_zoneid_v4(ipaddr_t addr, mblk_t *mp, ip_recv_attr_t *ira,
15068 zoneid_t lookup_zoneid)
15069 {
15070 ip_stack_t *ipst = ira->ira_ill->ill_ipst;
15071 ire_t *ire;
15072 int ire_flags = MATCH_IRE_TYPE;
15073 zoneid_t zoneid = ALL_ZONES;
15074
15075 if (is_system_labeled() && !tsol_can_accept_raw(mp, ira, B_FALSE))
15076 return (ALL_ZONES);
15077
15078 if (lookup_zoneid != ALL_ZONES)
15079 ire_flags |= MATCH_IRE_ZONEONLY;
15080 ire = ire_ftable_lookup_v4(addr, NULL, NULL, IRE_LOCAL | IRE_LOOPBACK,
15081 NULL, lookup_zoneid, NULL, ire_flags, 0, ipst, NULL);
15082 if (ire != NULL) {
15083 zoneid = IP_REAL_ZONEID(ire->ire_zoneid, ipst);
15084 ire_refrele(ire);
15085 }
15086 return (zoneid);
15087 }
15088
15089 zoneid_t
15090 ip_get_zoneid_v6(in6_addr_t *addr, mblk_t *mp, const ill_t *ill,
15091 ip_recv_attr_t *ira, zoneid_t lookup_zoneid)
15092 {
15093 ip_stack_t *ipst = ira->ira_ill->ill_ipst;
15094 ire_t *ire;
15095 int ire_flags = MATCH_IRE_TYPE;
15096 zoneid_t zoneid = ALL_ZONES;
15097
15098 if (is_system_labeled() && !tsol_can_accept_raw(mp, ira, B_FALSE))
15099 return (ALL_ZONES);
15100
15101 if (IN6_IS_ADDR_LINKLOCAL(addr))
15102 ire_flags |= MATCH_IRE_ILL;
15103
15104 if (lookup_zoneid != ALL_ZONES)
15105 ire_flags |= MATCH_IRE_ZONEONLY;
15106 ire = ire_ftable_lookup_v6(addr, NULL, NULL, IRE_LOCAL | IRE_LOOPBACK,
15107 ill, lookup_zoneid, NULL, ire_flags, 0, ipst, NULL);
15108 if (ire != NULL) {
15109 zoneid = IP_REAL_ZONEID(ire->ire_zoneid, ipst);
15110 ire_refrele(ire);
15111 }
15112 return (zoneid);
15113 }
15114
15115 /*
15116 * IP obserability hook support functions.
15117 */
15118 static void
15119 ipobs_init(ip_stack_t *ipst)
15120 {
15121 netid_t id;
15122
15123 id = net_getnetidbynetstackid(ipst->ips_netstack->netstack_stackid);
15124
15125 ipst->ips_ip4_observe_pr = net_protocol_lookup(id, NHF_INET);
15126 VERIFY(ipst->ips_ip4_observe_pr != NULL);
15127
15128 ipst->ips_ip6_observe_pr = net_protocol_lookup(id, NHF_INET6);
15129 VERIFY(ipst->ips_ip6_observe_pr != NULL);
15130 }
15131
15132 static void
15133 ipobs_fini(ip_stack_t *ipst)
15134 {
15135
15136 VERIFY(net_protocol_release(ipst->ips_ip4_observe_pr) == 0);
15137 VERIFY(net_protocol_release(ipst->ips_ip6_observe_pr) == 0);
15138 }
15139
15140 /*
15141 * hook_pkt_observe_t is composed in network byte order so that the
15142 * entire mblk_t chain handed into hook_run can be used as-is.
15143 * The caveat is that use of the fields, such as the zone fields,
15144 * requires conversion into host byte order first.
15145 */
15146 void
15147 ipobs_hook(mblk_t *mp, int htype, zoneid_t zsrc, zoneid_t zdst,
15148 const ill_t *ill, ip_stack_t *ipst)
15149 {
15150 hook_pkt_observe_t *hdr;
15151 uint64_t grifindex;
15152 mblk_t *imp;
15153
15154 imp = allocb(sizeof (*hdr), BPRI_HI);
15155 if (imp == NULL)
15156 return;
15157
15158 hdr = (hook_pkt_observe_t *)imp->b_rptr;
15159 /*
15160 * b_wptr is set to make the apparent size of the data in the mblk_t
15161 * to exclude the pointers at the end of hook_pkt_observer_t.
15162 */
15163 imp->b_wptr = imp->b_rptr + sizeof (dl_ipnetinfo_t);
15164 imp->b_cont = mp;
15165
15166 ASSERT(DB_TYPE(mp) == M_DATA);
15167
15168 if (IS_UNDER_IPMP(ill))
15169 grifindex = ipmp_ill_get_ipmp_ifindex(ill);
15170 else
15171 grifindex = 0;
15172
15173 hdr->hpo_version = 1;
15174 hdr->hpo_htype = htons(htype);
15175 hdr->hpo_pktlen = htonl((ulong_t)msgdsize(mp));
15176 hdr->hpo_ifindex = htonl(ill->ill_phyint->phyint_ifindex);
15177 hdr->hpo_grifindex = htonl(grifindex);
15178 hdr->hpo_zsrc = htonl(zsrc);
15179 hdr->hpo_zdst = htonl(zdst);
15180 hdr->hpo_pkt = imp;
15181 hdr->hpo_ctx = ipst->ips_netstack;
15182
15183 if (ill->ill_isv6) {
15184 hdr->hpo_family = AF_INET6;
15185 (void) hook_run(ipst->ips_ipv6_net_data->netd_hooks,
15186 ipst->ips_ipv6observing, (hook_data_t)hdr);
15187 } else {
15188 hdr->hpo_family = AF_INET;
15189 (void) hook_run(ipst->ips_ipv4_net_data->netd_hooks,
15190 ipst->ips_ipv4observing, (hook_data_t)hdr);
15191 }
15192
15193 imp->b_cont = NULL;
15194 freemsg(imp);
15195 }
15196
15197 /*
15198 * Utility routine that checks if `v4srcp' is a valid address on underlying
15199 * interface `ill'. If `ipifp' is non-NULL, it's set to a held ipif
15200 * associated with `v4srcp' on success. NOTE: if this is not called from
15201 * inside the IPSQ (ill_g_lock is not held), `ill' may be removed from the
15202 * group during or after this lookup.
15203 */
15204 boolean_t
15205 ipif_lookup_testaddr_v4(ill_t *ill, const in_addr_t *v4srcp, ipif_t **ipifp)
15206 {
15207 ipif_t *ipif;
15208
15209 ipif = ipif_lookup_addr_exact(*v4srcp, ill, ill->ill_ipst);
15210 if (ipif != NULL) {
15211 if (ipifp != NULL)
15212 *ipifp = ipif;
15213 else
15214 ipif_refrele(ipif);
15215 return (B_TRUE);
15216 }
15217
15218 ip1dbg(("ipif_lookup_testaddr_v4: cannot find ipif for src %x\n",
15219 *v4srcp));
15220 return (B_FALSE);
15221 }
15222
15223 /*
15224 * Transport protocol call back function for CPU state change.
15225 */
15226 /* ARGSUSED */
15227 static int
15228 ip_tp_cpu_update(cpu_setup_t what, int id, void *arg)
15229 {
15230 processorid_t cpu_seqid;
15231 netstack_handle_t nh;
15232 netstack_t *ns;
15233
15234 ASSERT(MUTEX_HELD(&cpu_lock));
15235
15236 switch (what) {
15237 case CPU_CONFIG:
15238 case CPU_ON:
15239 case CPU_INIT:
15240 case CPU_CPUPART_IN:
15241 cpu_seqid = cpu[id]->cpu_seqid;
15242 netstack_next_init(&nh);
15243 while ((ns = netstack_next(&nh)) != NULL) {
15244 tcp_stack_cpu_add(ns->netstack_tcp, cpu_seqid);
15245 sctp_stack_cpu_add(ns->netstack_sctp, cpu_seqid);
15246 udp_stack_cpu_add(ns->netstack_udp, cpu_seqid);
15247 netstack_rele(ns);
15248 }
15249 netstack_next_fini(&nh);
15250 break;
15251 case CPU_UNCONFIG:
15252 case CPU_OFF:
15253 case CPU_CPUPART_OUT:
15254 /*
15255 * Nothing to do. We don't remove the per CPU stats from
15256 * the IP stack even when the CPU goes offline.
15257 */
15258 break;
15259 default:
15260 break;
15261 }
15262 return (0);
15263 }