1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25 /*
26 * IP PACKET CLASSIFIER
27 *
28 * The IP packet classifier provides mapping between IP packets and persistent
29 * connection state for connection-oriented protocols. It also provides
30 * interface for managing connection states.
31 *
32 * The connection state is kept in conn_t data structure and contains, among
33 * other things:
34 *
35 * o local/remote address and ports
36 * o Transport protocol
37 * o squeue for the connection (for TCP only)
38 * o reference counter
39 * o Connection state
40 * o hash table linkage
41 * o interface/ire information
42 * o credentials
43 * o ipsec policy
44 * o send and receive functions.
45 * o mutex lock.
46 *
47 * Connections use a reference counting scheme. They are freed when the
48 * reference counter drops to zero. A reference is incremented when connection
49 * is placed in a list or table, when incoming packet for the connection arrives
50 * and when connection is processed via squeue (squeue processing may be
51 * asynchronous and the reference protects the connection from being destroyed
52 * before its processing is finished).
53 *
54 * conn_recv is used to pass up packets to the ULP.
55 * For TCP conn_recv changes. It is tcp_input_listener_unbound initially for
56 * a listener, and changes to tcp_input_listener as the listener has picked a
57 * good squeue. For other cases it is set to tcp_input_data.
58 *
59 * conn_recvicmp is used to pass up ICMP errors to the ULP.
60 *
61 * Classifier uses several hash tables:
62 *
63 * ipcl_conn_fanout: contains all TCP connections in CONNECTED state
64 * ipcl_bind_fanout: contains all connections in BOUND state
65 * ipcl_proto_fanout: IPv4 protocol fanout
66 * ipcl_proto_fanout_v6: IPv6 protocol fanout
67 * ipcl_udp_fanout: contains all UDP connections
68 * ipcl_iptun_fanout: contains all IP tunnel connections
69 * ipcl_globalhash_fanout: contains all connections
 *	ipcl_dccp_conn_fanout:	contains all DCCP connections in CONNECTED state
71 * ipcl_dccp_bind_fanout: contains all DCCP connections in BOUND state
72 *
73 * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering)
74 * which need to view all existing connections.
75 *
76 * All tables are protected by per-bucket locks. When both per-bucket lock and
77 * connection lock need to be held, the per-bucket lock should be acquired
78 * first, followed by the connection lock.
79 *
80 * All functions doing search in one of these tables increment a reference
81 * counter on the connection found (if any). This reference should be dropped
82 * when the caller has finished processing the connection.
83 *
84 *
85 * INTERFACES:
86 * ===========
87 *
88 * Connection Lookup:
89 * ------------------
90 *
91 * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, ira, ip_stack)
92 * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, ira, ip_stack)
93 *
94 * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if
95 * it can't find any associated connection. If the connection is found, its
96 * reference counter is incremented.
97 *
98 * mp: mblock, containing packet header. The full header should fit
99 * into a single mblock. It should also contain at least full IP
100 * and TCP or UDP header.
101 *
102 * protocol: Either IPPROTO_TCP or IPPROTO_UDP.
103 *
104 * hdr_len: The size of IP header. It is used to find TCP or UDP header in
105 * the packet.
106 *
107 * ira->ira_zoneid: The zone in which the returned connection must be; the
108 * zoneid corresponding to the ire_zoneid on the IRE located for
109 * the packet's destination address.
110 *
111 * ira->ira_flags: Contains the IRAF_TX_MAC_EXEMPTABLE and
112 * IRAF_TX_SHARED_ADDR flags
113 *
114 * For TCP connections, the lookup order is as follows:
115 * 5-tuple {src, dst, protocol, local port, remote port}
116 * lookup in ipcl_conn_fanout table.
117 * 3-tuple {dst, remote port, protocol} lookup in
118 * ipcl_bind_fanout table.
119 *
120 * For UDP connections, a 5-tuple {src, dst, protocol, local port,
121 * remote port} lookup is done on ipcl_udp_fanout. Note that,
122 * these interfaces do not handle cases where a packets belongs
123 * to multiple UDP clients, which is handled in IP itself.
124 *
125 * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must
126 * determine which actual zone gets the segment. This is used only in a
127 * labeled environment. The matching rules are:
128 *
129 * - If it's not a multilevel port, then the label on the packet selects
130 * the zone. Unlabeled packets are delivered to the global zone.
131 *
132 * - If it's a multilevel port, then only the zone registered to receive
133 * packets on that port matches.
134 *
135 * Also, in a labeled environment, packet labels need to be checked. For fully
136 * bound TCP connections, we can assume that the packet label was checked
137 * during connection establishment, and doesn't need to be checked on each
138 * packet. For others, though, we need to check for strict equality or, for
139 * multilevel ports, membership in the range or set. This part currently does
140 * a tnrh lookup on each packet, but could be optimized to use cached results
141 * if that were necessary. (SCTP doesn't come through here, but if it did,
142 * we would apply the same rules as TCP.)
143 *
144 * An implication of the above is that fully-bound TCP sockets must always use
145 * distinct 4-tuples; they can't be discriminated by label alone.
146 *
147 * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
148 * as there's no connection set-up handshake and no shared state.
149 *
150 * Labels on looped-back packets within a single zone do not need to be
151 * checked, as all processes in the same zone have the same label.
152 *
153 * Finally, for unlabeled packets received by a labeled system, special rules
154 * apply. We consider only the MLP if there is one. Otherwise, we prefer a
155 * socket in the zone whose label matches the default label of the sender, if
156 * any. In any event, the receiving socket must have SO_MAC_EXEMPT set and the
157 * receiver's label must dominate the sender's default label.
158 *
159 * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcpha_t *, int, ip_stack);
160 * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
161 * ip_stack);
162 *
 *	Lookup routine to find an exact match for {src, dst, local port,
 *	remote port} for TCP connections in ipcl_conn_fanout. The address and
165 * ports are read from the IP and TCP header respectively.
166 *
167 * conn_t *ipcl_lookup_listener_v4(lport, laddr, protocol,
168 * zoneid, ip_stack);
169 * conn_t *ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex,
170 * zoneid, ip_stack);
171 *
172 * Lookup routine to find a listener with the tuple {lport, laddr,
173 * protocol} in the ipcl_bind_fanout table. For IPv6, an additional
174 * parameter interface index is also compared.
175 *
176 * void ipcl_walk(func, arg, ip_stack)
177 *
178 * Apply 'func' to every connection available. The 'func' is called as
179 * (*func)(connp, arg). The walk is non-atomic so connections may be
180 * created and destroyed during the walk. The CONN_CONDEMNED and
181 * CONN_INCIPIENT flags ensure that connections which are newly created
182 * or being destroyed are not selected by the walker.
183 *
184 * Table Updates
185 * -------------
186 *
187 * int ipcl_conn_insert(connp);
188 * int ipcl_conn_insert_v4(connp);
189 * int ipcl_conn_insert_v6(connp);
190 *
191 * Insert 'connp' in the ipcl_conn_fanout.
 *	Arguments :
193 * connp conn_t to be inserted
194 *
195 * Return value :
196 * 0 if connp was inserted
197 * EADDRINUSE if the connection with the same tuple
198 * already exists.
199 *
200 * int ipcl_bind_insert(connp);
201 * int ipcl_bind_insert_v4(connp);
202 * int ipcl_bind_insert_v6(connp);
203 *
204 * Insert 'connp' in ipcl_bind_fanout.
 *	Arguments :
206 * connp conn_t to be inserted
207 *
208 *
209 * void ipcl_hash_remove(connp);
210 *
211 * Removes the 'connp' from the connection fanout table.
212 *
213 * Connection Creation/Destruction
214 * -------------------------------
215 *
216 * conn_t *ipcl_conn_create(type, sleep, netstack_t *)
217 *
218 * Creates a new conn based on the type flag, inserts it into
219 * globalhash table.
220 *
221 * type: This flag determines the type of conn_t which needs to be
222 * created i.e., which kmem_cache it comes from.
223 * IPCL_TCPCONN indicates a TCP connection
224 * IPCL_SCTPCONN indicates a SCTP connection
225 * IPCL_UDPCONN indicates a UDP conn_t.
226 * IPCL_RAWIPCONN indicates a RAWIP/ICMP conn_t.
227 * IPCL_RTSCONN indicates a RTS conn_t.
228 * IPCL_DCCPCONN indicates a DCCP conn_t.
229 * IPCL_IPCCONN indicates all other connections.
230 *
231 * void ipcl_conn_destroy(connp)
232 *
233 * Destroys the connection state, removes it from the global
234 * connection hash table and frees its memory.
235 */
236
237 #include <sys/types.h>
238 #include <sys/stream.h>
239 #include <sys/stropts.h>
240 #include <sys/sysmacros.h>
241 #include <sys/strsubr.h>
242 #include <sys/strsun.h>
243 #define _SUN_TPI_VERSION 2
244 #include <sys/ddi.h>
245 #include <sys/cmn_err.h>
246 #include <sys/debug.h>
247
248 #include <sys/systm.h>
249 #include <sys/param.h>
250 #include <sys/kmem.h>
251 #include <sys/isa_defs.h>
252 #include <inet/common.h>
253 #include <netinet/ip6.h>
254 #include <netinet/icmp6.h>
255
256 #include <inet/ip.h>
257 #include <inet/ip_if.h>
258 #include <inet/ip_ire.h>
259 #include <inet/ip6.h>
260 #include <inet/ip_ndp.h>
261 #include <inet/ip_impl.h>
262 #include <inet/udp_impl.h>
263 #include <inet/dccp_impl.h>
264 #include <inet/sctp_ip.h>
265 #include <inet/sctp/sctp_impl.h>
266 #include <inet/rawip_impl.h>
267 #include <inet/rts_impl.h>
268 #include <inet/iptun/iptun_impl.h>
269
270 #include <sys/cpuvar.h>
271
272 #include <inet/ipclassifier.h>
273 #include <inet/tcp.h>
274 #include <inet/ipsec_impl.h>
275
276 #include <sys/tsol/tnet.h>
277 #include <sys/sockio.h>
278
279 /* Old value for compatibility. Setable in /etc/system */
280 uint_t tcp_conn_hash_size = 0;
281
282 /* New value. Zero means choose automatically. Setable in /etc/system */
283 uint_t ipcl_conn_hash_size = 0;
284 uint_t ipcl_conn_hash_memfactor = 8192;
285 uint_t ipcl_conn_hash_maxsize = 82500;
286
287 /* bind/udp fanout table size */
288 uint_t ipcl_bind_fanout_size = 512;
289 uint_t ipcl_udp_fanout_size = 16384;
290
291 /* Fanout table sizes for dccp */
292 uint_t ipcl_dccp_conn_fanout_size = 512;
293 uint_t ipcl_dccp_bind_fanout_size = 512;
294
295 /* Raw socket fanout size. Must be a power of 2. */
296 uint_t ipcl_raw_fanout_size = 256;
297
298 /*
299 * The IPCL_IPTUN_HASH() function works best with a prime table size. We
300 * expect that most large deployments would have hundreds of tunnels, and
301 * thousands in the extreme case.
302 */
303 uint_t ipcl_iptun_fanout_size = 6143;
304
305 /*
306 * Power of 2^N Primes useful for hashing for N of 0-28,
307 * these primes are the nearest prime <= 2^N - 2^(N-2).
308 */
309
310 #define P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067, \
311 6143, 12281, 24571, 49139, 98299, 196597, 393209, \
312 786431, 1572853, 3145721, 6291449, 12582893, 25165813, \
313 50331599, 100663291, 201326557, 0}
314
315 /*
316 * wrapper structure to ensure that conn and what follows it (tcp_t, etc)
317 * are aligned on cache lines.
318 */
319 typedef union itc_s {
320 conn_t itc_conn;
321 char itcu_filler[CACHE_ALIGN(conn_s)];
322 } itc_t;
323
324 struct kmem_cache *tcp_conn_cache;
325 struct kmem_cache *ip_conn_cache;
326 extern struct kmem_cache *sctp_conn_cache;
327 struct kmem_cache *udp_conn_cache;
328 struct kmem_cache *rawip_conn_cache;
329 struct kmem_cache *rts_conn_cache;
330 struct kmem_cache *dccp_conn_cache;
331
332 extern void tcp_timermp_free(tcp_t *);
333 extern mblk_t *tcp_timermp_alloc(int);
334
335 static int ip_conn_constructor(void *, void *, int);
336 static void ip_conn_destructor(void *, void *);
337
338 static int tcp_conn_constructor(void *, void *, int);
339 static void tcp_conn_destructor(void *, void *);
340
341 static int udp_conn_constructor(void *, void *, int);
342 static void udp_conn_destructor(void *, void *);
343
344 static int rawip_conn_constructor(void *, void *, int);
345 static void rawip_conn_destructor(void *, void *);
346
347 static int rts_conn_constructor(void *, void *, int);
348 static void rts_conn_destructor(void *, void *);
349
350 static int dccp_conn_constructor(void *, void *, int);
351 static void dccp_conn_destructor(void *, void *);
352
353 /*
354 * Global (for all stack instances) init routine
355 */
356 void
357 ipcl_g_init(void)
358 {
359 ip_conn_cache = kmem_cache_create("ip_conn_cache",
360 sizeof (conn_t), CACHE_ALIGN_SIZE,
361 ip_conn_constructor, ip_conn_destructor,
362 NULL, NULL, NULL, 0);
363
364 tcp_conn_cache = kmem_cache_create("tcp_conn_cache",
365 sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE,
366 tcp_conn_constructor, tcp_conn_destructor,
367 tcp_conn_reclaim, NULL, NULL, 0);
368
369 udp_conn_cache = kmem_cache_create("udp_conn_cache",
370 sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE,
371 udp_conn_constructor, udp_conn_destructor,
372 NULL, NULL, NULL, 0);
373
374 rawip_conn_cache = kmem_cache_create("rawip_conn_cache",
375 sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE,
376 rawip_conn_constructor, rawip_conn_destructor,
377 NULL, NULL, NULL, 0);
378
379 rts_conn_cache = kmem_cache_create("rts_conn_cache",
380 sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE,
381 rts_conn_constructor, rts_conn_destructor,
382 NULL, NULL, NULL, 0);
383
384 /* XXX:DCCP reclaim */
385 dccp_conn_cache = kmem_cache_create("dccp_conn_cache",
386 sizeof (itc_t) + sizeof (dccp_t), CACHE_ALIGN_SIZE,
387 dccp_conn_constructor, dccp_conn_destructor,
388 NULL, NULL, NULL, 0);
389 }
390
391 /*
392 * ipclassifier intialization routine, sets up hash tables.
393 */
394 void
395 ipcl_init(ip_stack_t *ipst)
396 {
397 int i;
398 int sizes[] = P2Ps();
399
400 /*
401 * Calculate size of conn fanout table from /etc/system settings
402 */
403 if (ipcl_conn_hash_size != 0) {
404 ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size;
405 } else if (tcp_conn_hash_size != 0) {
406 ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size;
407 } else {
408 extern pgcnt_t freemem;
409
410 ipst->ips_ipcl_conn_fanout_size =
411 (freemem * PAGESIZE) / ipcl_conn_hash_memfactor;
412
413 if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) {
414 ipst->ips_ipcl_conn_fanout_size =
415 ipcl_conn_hash_maxsize;
416 }
417 }
418
419 for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) {
420 if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) {
421 break;
422 }
423 }
424 if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) {
425 /* Out of range, use the 2^16 value */
426 ipst->ips_ipcl_conn_fanout_size = sizes[16];
427 }
428
429 /* Take values from /etc/system */
430 ipst->ips_ipcl_bind_fanout_size = ipcl_bind_fanout_size;
431 ipst->ips_ipcl_dccp_conn_fanout_size = ipcl_dccp_conn_fanout_size;
432 ipst->ips_ipcl_dccp_bind_fanout_size = ipcl_dccp_bind_fanout_size;
433 ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size;
434 ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size;
435 ipst->ips_ipcl_iptun_fanout_size = ipcl_iptun_fanout_size;
436
437 ASSERT(ipst->ips_ipcl_conn_fanout == NULL);
438
439 ipst->ips_ipcl_conn_fanout = kmem_zalloc(
440 ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP);
441
442 for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
443 mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL,
444 MUTEX_DEFAULT, NULL);
445 }
446
447 ipst->ips_ipcl_bind_fanout = kmem_zalloc(
448 ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP);
449
450 for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
451 mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL,
452 MUTEX_DEFAULT, NULL);
453 }
454
455 ipst->ips_ipcl_proto_fanout_v4 = kmem_zalloc(IPPROTO_MAX *
456 sizeof (connf_t), KM_SLEEP);
457 for (i = 0; i < IPPROTO_MAX; i++) {
458 mutex_init(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock, NULL,
459 MUTEX_DEFAULT, NULL);
460 }
461
462 ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX *
463 sizeof (connf_t), KM_SLEEP);
464 for (i = 0; i < IPPROTO_MAX; i++) {
465 mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL,
466 MUTEX_DEFAULT, NULL);
467 }
468
469 ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP);
470 mutex_init(&ipst->ips_rts_clients->connf_lock,
471 NULL, MUTEX_DEFAULT, NULL);
472
473 ipst->ips_ipcl_udp_fanout = kmem_zalloc(
474 ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP);
475 for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
476 mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL,
477 MUTEX_DEFAULT, NULL);
478 }
479
480 ipst->ips_ipcl_iptun_fanout = kmem_zalloc(
481 ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t), KM_SLEEP);
482 for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
483 mutex_init(&ipst->ips_ipcl_iptun_fanout[i].connf_lock, NULL,
484 MUTEX_DEFAULT, NULL);
485 }
486
487 ipst->ips_ipcl_raw_fanout = kmem_zalloc(
488 ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP);
489 for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
490 mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL,
491 MUTEX_DEFAULT, NULL);
492 }
493
494 ipst->ips_ipcl_globalhash_fanout = kmem_zalloc(
495 sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP);
496 for (i = 0; i < CONN_G_HASH_SIZE; i++) {
497 mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock,
498 NULL, MUTEX_DEFAULT, NULL);
499 }
500
501 ipst->ips_ipcl_dccp_conn_fanout = kmem_zalloc(
502 ipst->ips_ipcl_dccp_conn_fanout_size * sizeof (connf_t), KM_SLEEP);
503 for (i = 0; i < ipst->ips_ipcl_dccp_conn_fanout_size; i++) {
504 mutex_init(&ipst->ips_ipcl_dccp_conn_fanout[i].connf_lock, NULL,
505 MUTEX_DEFAULT, NULL);
506 }
507
508 ipst->ips_ipcl_dccp_bind_fanout = kmem_zalloc(
509 ipst->ips_ipcl_dccp_bind_fanout_size * sizeof (connf_t), KM_SLEEP);
510 for (i = 0; i < ipst->ips_ipcl_dccp_bind_fanout_size; i++) {
511 mutex_init(&ipst->ips_ipcl_dccp_bind_fanout[i].connf_lock, NULL,
512 MUTEX_DEFAULT, NULL);
513 }
514 }
515
516 void
517 ipcl_g_destroy(void)
518 {
519 kmem_cache_destroy(ip_conn_cache);
520 kmem_cache_destroy(tcp_conn_cache);
521 kmem_cache_destroy(udp_conn_cache);
522 kmem_cache_destroy(rawip_conn_cache);
523 kmem_cache_destroy(rts_conn_cache);
524 kmem_cache_destroy(dccp_conn_cache);
525 }
526
527 /*
528 * All user-level and kernel use of the stack must be gone
529 * by now.
530 */
531 void
532 ipcl_destroy(ip_stack_t *ipst)
533 {
534 int i;
535
536 for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
537 ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL);
538 mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock);
539 }
540 kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size *
541 sizeof (connf_t));
542 ipst->ips_ipcl_conn_fanout = NULL;
543
544 for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
545 ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL);
546 mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock);
547 }
548 kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size *
549 sizeof (connf_t));
550 ipst->ips_ipcl_bind_fanout = NULL;
551
552 for (i = 0; i < IPPROTO_MAX; i++) {
553 ASSERT(ipst->ips_ipcl_proto_fanout_v4[i].connf_head == NULL);
554 mutex_destroy(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock);
555 }
556 kmem_free(ipst->ips_ipcl_proto_fanout_v4,
557 IPPROTO_MAX * sizeof (connf_t));
558 ipst->ips_ipcl_proto_fanout_v4 = NULL;
559
560 for (i = 0; i < IPPROTO_MAX; i++) {
561 ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head == NULL);
562 mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock);
563 }
564 kmem_free(ipst->ips_ipcl_proto_fanout_v6,
565 IPPROTO_MAX * sizeof (connf_t));
566 ipst->ips_ipcl_proto_fanout_v6 = NULL;
567
568 for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
569 ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL);
570 mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock);
571 }
572 kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size *
573 sizeof (connf_t));
574 ipst->ips_ipcl_udp_fanout = NULL;
575
576 for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
577 ASSERT(ipst->ips_ipcl_iptun_fanout[i].connf_head == NULL);
578 mutex_destroy(&ipst->ips_ipcl_iptun_fanout[i].connf_lock);
579 }
580 kmem_free(ipst->ips_ipcl_iptun_fanout,
581 ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t));
582 ipst->ips_ipcl_iptun_fanout = NULL;
583
584 for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
585 ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL);
586 mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock);
587 }
588 kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size *
589 sizeof (connf_t));
590 ipst->ips_ipcl_raw_fanout = NULL;
591
592 for (i = 0; i < CONN_G_HASH_SIZE; i++) {
593 ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL);
594 mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
595 }
596 kmem_free(ipst->ips_ipcl_globalhash_fanout,
597 sizeof (connf_t) * CONN_G_HASH_SIZE);
598 ipst->ips_ipcl_globalhash_fanout = NULL;
599
600 for (i = 0; i < ipst->ips_ipcl_dccp_conn_fanout_size; i++) {
601 ASSERT(ipst->ips_ipcl_dccp_conn_fanout[i].connf_head == NULL);
602 mutex_destroy(&ipst->ips_ipcl_dccp_conn_fanout[i].connf_lock);
603 }
604 kmem_free(ipst->ips_ipcl_dccp_conn_fanout,
605 ipst->ips_ipcl_dccp_conn_fanout_size * sizeof (connf_t));
606 ipst->ips_ipcl_dccp_conn_fanout = NULL;
607
608 for (i = 0; i < ipst->ips_ipcl_dccp_bind_fanout_size; i++) {
609 ASSERT(ipst->ips_ipcl_dccp_bind_fanout[i].connf_head == NULL);
610 mutex_destroy(&ipst->ips_ipcl_dccp_bind_fanout[i].connf_lock);
611 }
612 kmem_free(ipst->ips_ipcl_dccp_bind_fanout,
613 ipst->ips_ipcl_dccp_bind_fanout_size * sizeof (connf_t));
614 ipst->ips_ipcl_dccp_bind_fanout = NULL;
615
616 ASSERT(ipst->ips_rts_clients->connf_head == NULL);
617 mutex_destroy(&ipst->ips_rts_clients->connf_lock);
618 kmem_free(ipst->ips_rts_clients, sizeof (connf_t));
619 ipst->ips_rts_clients = NULL;
620 }
621
622 /*
623 * conn creation routine. initialize the conn, sets the reference
624 * and inserts it in the global hash table.
625 */
626 conn_t *
627 ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns)
628 {
629 conn_t *connp;
630 struct kmem_cache *conn_cache;
631
632 switch (type) {
633 case IPCL_SCTPCONN:
634 if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL)
635 return (NULL);
636 sctp_conn_init(connp);
637 netstack_hold(ns);
638 connp->conn_netstack = ns;
639 connp->conn_ixa->ixa_ipst = ns->netstack_ip;
640 connp->conn_ixa->ixa_conn_id = (long)connp;
641 ipcl_globalhash_insert(connp);
642 return (connp);
643
644 case IPCL_TCPCONN:
645 conn_cache = tcp_conn_cache;
646 break;
647
648 case IPCL_UDPCONN:
649 conn_cache = udp_conn_cache;
650 break;
651
652 case IPCL_RAWIPCONN:
653 conn_cache = rawip_conn_cache;
654 break;
655
656 case IPCL_RTSCONN:
657 conn_cache = rts_conn_cache;
658 break;
659
660 case IPCL_IPCCONN:
661 conn_cache = ip_conn_cache;
662 break;
663
664 case IPCL_DCCPCONN:
665 conn_cache = dccp_conn_cache;
666 break;
667
668 default:
669 connp = NULL;
670 ASSERT(0);
671 }
672
673 if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL)
674 return (NULL);
675
676 connp->conn_ref = 1;
677 netstack_hold(ns);
678 connp->conn_netstack = ns;
679 connp->conn_ixa->ixa_ipst = ns->netstack_ip;
680 connp->conn_ixa->ixa_conn_id = (long)connp;
681 ipcl_globalhash_insert(connp);
682 return (connp);
683 }
684
685 void
686 ipcl_conn_destroy(conn_t *connp)
687 {
688 mblk_t *mp;
689 netstack_t *ns = connp->conn_netstack;
690
691 ASSERT(!MUTEX_HELD(&connp->conn_lock));
692 ASSERT(connp->conn_ref == 0);
693 ASSERT(connp->conn_ioctlref == 0);
694
695 DTRACE_PROBE1(conn__destroy, conn_t *, connp);
696
697 if (connp->conn_cred != NULL) {
698 crfree(connp->conn_cred);
699 connp->conn_cred = NULL;
700 /* ixa_cred done in ipcl_conn_cleanup below */
701 }
702
703 if (connp->conn_ht_iphc != NULL) {
704 kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated);
705 connp->conn_ht_iphc = NULL;
706 connp->conn_ht_iphc_allocated = 0;
707 connp->conn_ht_iphc_len = 0;
708 connp->conn_ht_ulp = NULL;
709 connp->conn_ht_ulp_len = 0;
710 }
711 ip_pkt_free(&connp->conn_xmit_ipp);
712
713 ipcl_globalhash_remove(connp);
714
715 if (connp->conn_latch != NULL) {
716 IPLATCH_REFRELE(connp->conn_latch);
717 connp->conn_latch = NULL;
718 }
719 if (connp->conn_latch_in_policy != NULL) {
720 IPPOL_REFRELE(connp->conn_latch_in_policy);
721 connp->conn_latch_in_policy = NULL;
722 }
723 if (connp->conn_latch_in_action != NULL) {
724 IPACT_REFRELE(connp->conn_latch_in_action);
725 connp->conn_latch_in_action = NULL;
726 }
727 if (connp->conn_policy != NULL) {
728 IPPH_REFRELE(connp->conn_policy, ns);
729 connp->conn_policy = NULL;
730 }
731
732 if (connp->conn_ipsec_opt_mp != NULL) {
733 freemsg(connp->conn_ipsec_opt_mp);
734 connp->conn_ipsec_opt_mp = NULL;
735 }
736
737 if (connp->conn_flags & IPCL_TCPCONN) {
738 tcp_t *tcp = connp->conn_tcp;
739
740 tcp_free(tcp);
741 mp = tcp->tcp_timercache;
742
743 tcp->tcp_tcps = NULL;
744
745 /*
746 * tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate
747 * the mblk.
748 */
749 if (tcp->tcp_rsrv_mp != NULL) {
750 freeb(tcp->tcp_rsrv_mp);
751 tcp->tcp_rsrv_mp = NULL;
752 mutex_destroy(&tcp->tcp_rsrv_mp_lock);
753 }
754
755 ipcl_conn_cleanup(connp);
756 connp->conn_flags = IPCL_TCPCONN;
757 if (ns != NULL) {
758 ASSERT(tcp->tcp_tcps == NULL);
759 connp->conn_netstack = NULL;
760 connp->conn_ixa->ixa_ipst = NULL;
761 netstack_rele(ns);
762 }
763
764 bzero(tcp, sizeof (tcp_t));
765
766 tcp->tcp_timercache = mp;
767 tcp->tcp_connp = connp;
768 kmem_cache_free(tcp_conn_cache, connp);
769 return;
770 }
771
772 if (connp->conn_flags & IPCL_SCTPCONN) {
773 ASSERT(ns != NULL);
774 sctp_free(connp);
775 return;
776 }
777
778 if (connp->conn_flags & IPCL_DCCPCONN) {
779 dccp_t *dccp = connp->conn_dccp;
780
781 cmn_err(CE_NOTE, "ipclassifier: conn_flags DCCP cache_free");
782
783 dccp_free(dccp);
784 mp = dccp->dccp_timercache;
785
786 dccp->dccp_dccps = NULL;
787
788 ipcl_conn_cleanup(connp);
789 connp->conn_flags = IPCL_DCCPCONN;
790 if (ns != NULL) {
791 ASSERT(dccp->dccps == NULL);
792 connp->conn_netstack = NULL;
793 connp->conn_ixa->ixa_ipst = NULL;
794 netstack_rele(ns);
795 }
796
797 bzero(dccp, sizeof (dccp_t));
798
799 dccp->dccp_timercache = mp;
800 dccp->dccp_connp = connp;
801 kmem_cache_free(dccp_conn_cache, connp);
802 return;
803 }
804
805 ipcl_conn_cleanup(connp);
806 if (ns != NULL) {
807 connp->conn_netstack = NULL;
808 connp->conn_ixa->ixa_ipst = NULL;
809 netstack_rele(ns);
810 }
811
812 /* leave conn_priv aka conn_udp, conn_icmp, etc in place. */
813 if (connp->conn_flags & IPCL_UDPCONN) {
814 connp->conn_flags = IPCL_UDPCONN;
815 kmem_cache_free(udp_conn_cache, connp);
816 } else if (connp->conn_flags & IPCL_RAWIPCONN) {
817 connp->conn_flags = IPCL_RAWIPCONN;
818 connp->conn_proto = IPPROTO_ICMP;
819 connp->conn_ixa->ixa_protocol = connp->conn_proto;
820 kmem_cache_free(rawip_conn_cache, connp);
821 } else if (connp->conn_flags & IPCL_RTSCONN) {
822 connp->conn_flags = IPCL_RTSCONN;
823 kmem_cache_free(rts_conn_cache, connp);
824 } else {
825 connp->conn_flags = IPCL_IPCCONN;
826 ASSERT(connp->conn_flags & IPCL_IPCCONN);
827 ASSERT(connp->conn_priv == NULL);
828 kmem_cache_free(ip_conn_cache, connp);
829 }
830 }
831
832 /*
833 * Running in cluster mode - deregister listener information
834 */
835 static void
836 ipcl_conn_unlisten(conn_t *connp)
837 {
838 ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0);
839 ASSERT(connp->conn_lport != 0);
840
841 if (cl_inet_unlisten != NULL) {
842 sa_family_t addr_family;
843 uint8_t *laddrp;
844
845 if (connp->conn_ipversion == IPV6_VERSION) {
846 addr_family = AF_INET6;
847 laddrp = (uint8_t *)&connp->conn_bound_addr_v6;
848 } else {
849 addr_family = AF_INET;
850 laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
851 }
852 (*cl_inet_unlisten)(connp->conn_netstack->netstack_stackid,
853 IPPROTO_TCP, addr_family, laddrp, connp->conn_lport, NULL);
854 }
855 connp->conn_flags &= ~IPCL_CL_LISTENER;
856 }
857
858 /*
859 * We set the IPCL_REMOVED flag (instead of clearing the flag indicating
860 * which table the conn belonged to). So for debugging we can see which hash
861 * table this connection was in.
862 */
863 #define IPCL_HASH_REMOVE(connp) { \
864 connf_t *connfp = (connp)->conn_fanout; \
865 ASSERT(!MUTEX_HELD(&((connp)->conn_lock))); \
866 if (connfp != NULL) { \
867 mutex_enter(&connfp->connf_lock); \
868 if ((connp)->conn_next != NULL) \
869 (connp)->conn_next->conn_prev = \
870 (connp)->conn_prev; \
871 if ((connp)->conn_prev != NULL) \
872 (connp)->conn_prev->conn_next = \
873 (connp)->conn_next; \
874 else \
875 connfp->connf_head = (connp)->conn_next; \
876 (connp)->conn_fanout = NULL; \
877 (connp)->conn_next = NULL; \
878 (connp)->conn_prev = NULL; \
879 (connp)->conn_flags |= IPCL_REMOVED; \
880 if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0) \
881 ipcl_conn_unlisten((connp)); \
882 CONN_DEC_REF((connp)); \
883 mutex_exit(&connfp->connf_lock); \
884 } \
885 }
886
887 void
888 ipcl_hash_remove(conn_t *connp)
889 {
890 uint8_t protocol = connp->conn_proto;
891
892 IPCL_HASH_REMOVE(connp);
893 if (protocol == IPPROTO_RSVP)
894 ill_set_inputfn_all(connp->conn_netstack->netstack_ip);
895 }
896
897 /*
898 * The whole purpose of this function is allow removal of
899 * a conn_t from the connected hash for timewait reclaim.
900 * This is essentially a TW reclaim fastpath where timewait
901 * collector checks under fanout lock (so no one else can
902 * get access to the conn_t) that refcnt is 2 i.e. one for
903 * TCP and one for the classifier hash list. If ref count
904 * is indeed 2, we can just remove the conn under lock and
905 * avoid cleaning up the conn under squeue. This gives us
906 * improved performance.
907 */
908 void
909 ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp)
910 {
911 ASSERT(MUTEX_HELD(&connfp->connf_lock));
912 ASSERT(MUTEX_HELD(&connp->conn_lock));
913 ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0);
914
915 if ((connp)->conn_next != NULL) {
916 (connp)->conn_next->conn_prev = (connp)->conn_prev;
917 }
918 if ((connp)->conn_prev != NULL) {
919 (connp)->conn_prev->conn_next = (connp)->conn_next;
920 } else {
921 connfp->connf_head = (connp)->conn_next;
922 }
923 (connp)->conn_fanout = NULL;
924 (connp)->conn_next = NULL;
925 (connp)->conn_prev = NULL;
926 (connp)->conn_flags |= IPCL_REMOVED;
927 ASSERT((connp)->conn_ref == 2);
928 (connp)->conn_ref--;
929 }
930
/*
 * Insert connp at the head of connfp's list and mark it IPCL_CONNECTED,
 * taking a reference on behalf of the hash list.  The conn must not be on
 * any list already, and the caller must hold connfp->connf_lock.
 */
#define	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) {		\
	ASSERT((connp)->conn_fanout == NULL);				\
	ASSERT((connp)->conn_next == NULL);				\
	ASSERT((connp)->conn_prev == NULL);				\
	if ((connfp)->connf_head != NULL) {				\
		(connfp)->connf_head->conn_prev = (connp);		\
		(connp)->conn_next = (connfp)->connf_head;		\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connfp)->connf_head = (connp);					\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_CONNECTED;						\
	CONN_INC_REF(connp);						\
}
945
/*
 * Unlocked wrapper around IPCL_HASH_INSERT_CONNECTED_LOCKED: first remove
 * connp from any list it is currently on, then insert it as connected
 * under connfp's lock.
 */
#define	IPCL_HASH_INSERT_CONNECTED(connfp, connp) {			\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);		\
	mutex_exit(&(connfp)->connf_lock);				\
}
952
/*
 * Insert connp as IPCL_BOUND.  The walk places it just before the first
 * entry whose local address is the v4 wildcard, keeping specific-address
 * binds ahead of INADDR_ANY binds — presumably so classification sees the
 * most specific bind first (TODO confirm against the lookup paths).
 */
#define	IPCL_HASH_INSERT_BOUND(connfp, connp) {				\
	conn_t *pconnp = NULL, *nconnp;					\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	nconnp = (connfp)->connf_head;					\
	while (nconnp != NULL &&					\
	    !_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6)) {		\
		pconnp = nconnp;					\
		nconnp = nconnp->conn_next;				\
	}								\
	if (pconnp != NULL) {						\
		pconnp->conn_next = (connp);				\
		(connp)->conn_prev = pconnp;				\
	} else {							\
		(connfp)->connf_head = (connp);				\
	}								\
	if (nconnp != NULL) {						\
		(connp)->conn_next = nconnp;				\
		nconnp->conn_prev = (connp);				\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF(connp);						\
	mutex_exit(&(connfp)->connf_lock);				\
}
979
/*
 * Insert a wildcard-bound connp as IPCL_BOUND.  A v4-mapped wildcard is
 * inserted just ahead of the first IPv6-unspecified entry in the same
 * zone; if no such entry exists the conn is appended at the tail (the
 * walk leaves `list' pointing at the last conn_next).
 */
#define	IPCL_HASH_INSERT_WILDCARD(connfp, connp) {			\
	conn_t **list, *prev, *next;					\
	boolean_t isv4mapped =						\
	    IN6_IS_ADDR_V4MAPPED(&(connp)->conn_laddr_v6);		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	list = &(connfp)->connf_head;					\
	prev = NULL;							\
	while ((next = *list) != NULL) {				\
		if (isv4mapped &&					\
		    IN6_IS_ADDR_UNSPECIFIED(&next->conn_laddr_v6) &&	\
		    connp->conn_zoneid == next->conn_zoneid) {		\
			(connp)->conn_next = next;			\
			if (prev != NULL)				\
				prev = next->conn_prev;			\
			next->conn_prev = (connp);			\
			break;						\
		}							\
		list = &next->conn_next;				\
		prev = next;						\
	}								\
	(connp)->conn_prev = prev;					\
	*list = (connp);						\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF((connp));						\
	mutex_exit(&(connfp)->connf_lock);				\
}
1009
/*
 * Function wrapper around IPCL_HASH_INSERT_WILDCARD for callers outside
 * this file that cannot use the macro directly.
 */
void
ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
{
	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
}
1015
1016 /*
1017 * Because the classifier is used to classify inbound packets, the destination
1018 * address is meant to be our local tunnel address (tunnel source), and the
1019 * source the remote tunnel address (tunnel destination).
1020 *
1021 * Note that conn_proto can't be used for fanout since the upper protocol
1022 * can be both 41 and 4 when IPv6 and IPv4 are over the same tunnel.
1023 */
1024 conn_t *
1025 ipcl_iptun_classify_v4(ipaddr_t *src, ipaddr_t *dst, ip_stack_t *ipst)
1026 {
1027 connf_t *connfp;
1028 conn_t *connp;
1029
1030 /* first look for IPv4 tunnel links */
1031 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst, *src)];
1032 mutex_enter(&connfp->connf_lock);
1033 for (connp = connfp->connf_head; connp != NULL;
1034 connp = connp->conn_next) {
1035 if (IPCL_IPTUN_MATCH(connp, *dst, *src))
1036 break;
1037 }
1038 if (connp != NULL)
1039 goto done;
1040
1041 mutex_exit(&connfp->connf_lock);
1042
1043 /* We didn't find an IPv4 tunnel, try a 6to4 tunnel */
1044 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst,
1045 INADDR_ANY)];
1046 mutex_enter(&connfp->connf_lock);
1047 for (connp = connfp->connf_head; connp != NULL;
1048 connp = connp->conn_next) {
1049 if (IPCL_IPTUN_MATCH(connp, *dst, INADDR_ANY))
1050 break;
1051 }
1052 done:
1053 if (connp != NULL)
1054 CONN_INC_REF(connp);
1055 mutex_exit(&connfp->connf_lock);
1056 return (connp);
1057 }
1058
1059 conn_t *
1060 ipcl_iptun_classify_v6(in6_addr_t *src, in6_addr_t *dst, ip_stack_t *ipst)
1061 {
1062 connf_t *connfp;
1063 conn_t *connp;
1064
1065 /* Look for an IPv6 tunnel link */
1066 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(dst, src)];
1067 mutex_enter(&connfp->connf_lock);
1068 for (connp = connfp->connf_head; connp != NULL;
1069 connp = connp->conn_next) {
1070 if (IPCL_IPTUN_MATCH_V6(connp, dst, src)) {
1071 CONN_INC_REF(connp);
1072 break;
1073 }
1074 }
1075 mutex_exit(&connfp->connf_lock);
1076 return (connp);
1077 }
1078
/*
 * This function is used only for inserting SCTP raw socket now.
 * This may change later.
 *
 * Note that only one raw socket can be bound to a port. The param
 * lport is in network byte order.
 *
 * Returns 0 on success, EADDRNOTAVAIL if another raw socket in the same
 * zone/family already holds an overlapping bind on lport.
 */
static int
ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
{
	connf_t	*connfp;
	conn_t	*oconnp;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];

	/* Check for existing raw socket already bound to the port. */
	mutex_enter(&connfp->connf_lock);
	for (oconnp = connfp->connf_head; oconnp != NULL;
	    oconnp = oconnp->conn_next) {
		/*
		 * Conflict if same port, zone and family, and the local
		 * addresses overlap: either side unspecified or
		 * v4-mapped-any, or both bound to the same address.
		 */
		if (oconnp->conn_lport == lport &&
		    oconnp->conn_zoneid == connp->conn_zoneid &&
		    oconnp->conn_family == connp->conn_family &&
		    ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
		    IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_laddr_v6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_laddr_v6)) ||
		    IN6_ARE_ADDR_EQUAL(&oconnp->conn_laddr_v6,
		    &connp->conn_laddr_v6))) {
			break;
		}
	}
	mutex_exit(&connfp->connf_lock);
	if (oconnp != NULL)
		return (EADDRNOTAVAIL);

	/*
	 * Insert based on how much of the address pair is specified:
	 * connected (both set), bound (local only), wildcard (neither).
	 */
	if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) ||
	    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
		if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
		    IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		} else {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		}
	} else {
		IPCL_HASH_INSERT_CONNECTED(connfp, connp);
	}
	return (0);
}
1128
1129 static int
1130 ipcl_iptun_hash_insert(conn_t *connp, ip_stack_t *ipst)
1131 {
1132 connf_t *connfp;
1133 conn_t *tconnp;
1134 ipaddr_t laddr = connp->conn_laddr_v4;
1135 ipaddr_t faddr = connp->conn_faddr_v4;
1136
1137 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(laddr, faddr)];
1138 mutex_enter(&connfp->connf_lock);
1139 for (tconnp = connfp->connf_head; tconnp != NULL;
1140 tconnp = tconnp->conn_next) {
1141 if (IPCL_IPTUN_MATCH(tconnp, laddr, faddr)) {
1142 /* A tunnel is already bound to these addresses. */
1143 mutex_exit(&connfp->connf_lock);
1144 return (EADDRINUSE);
1145 }
1146 }
1147 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1148 mutex_exit(&connfp->connf_lock);
1149 return (0);
1150 }
1151
1152 static int
1153 ipcl_iptun_hash_insert_v6(conn_t *connp, ip_stack_t *ipst)
1154 {
1155 connf_t *connfp;
1156 conn_t *tconnp;
1157 in6_addr_t *laddr = &connp->conn_laddr_v6;
1158 in6_addr_t *faddr = &connp->conn_faddr_v6;
1159
1160 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(laddr, faddr)];
1161 mutex_enter(&connfp->connf_lock);
1162 for (tconnp = connfp->connf_head; tconnp != NULL;
1163 tconnp = tconnp->conn_next) {
1164 if (IPCL_IPTUN_MATCH_V6(tconnp, laddr, faddr)) {
1165 /* A tunnel is already bound to these addresses. */
1166 mutex_exit(&connfp->connf_lock);
1167 return (EADDRINUSE);
1168 }
1169 }
1170 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1171 mutex_exit(&connfp->connf_lock);
1172 return (0);
1173 }
1174
1175 /*
1176 * Check for a MAC exemption conflict on a labeled system. Note that for
1177 * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the
1178 * transport layer. This check is for binding all other protocols.
1179 *
1180 * Returns true if there's a conflict.
1181 */
1182 static boolean_t
1183 check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst)
1184 {
1185 connf_t *connfp;
1186 conn_t *tconn;
1187
1188 connfp = &ipst->ips_ipcl_proto_fanout_v4[connp->conn_proto];
1189 mutex_enter(&connfp->connf_lock);
1190 for (tconn = connfp->connf_head; tconn != NULL;
1191 tconn = tconn->conn_next) {
1192 /* We don't allow v4 fallback for v6 raw socket */
1193 if (connp->conn_family != tconn->conn_family)
1194 continue;
1195 /* If neither is exempt, then there's no conflict */
1196 if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
1197 (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
1198 continue;
1199 /* We are only concerned about sockets for a different zone */
1200 if (connp->conn_zoneid == tconn->conn_zoneid)
1201 continue;
1202 /* If both are bound to different specific addrs, ok */
1203 if (connp->conn_laddr_v4 != INADDR_ANY &&
1204 tconn->conn_laddr_v4 != INADDR_ANY &&
1205 connp->conn_laddr_v4 != tconn->conn_laddr_v4)
1206 continue;
1207 /* These two conflict; fail */
1208 break;
1209 }
1210 mutex_exit(&connfp->connf_lock);
1211 return (tconn != NULL);
1212 }
1213
1214 static boolean_t
1215 check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst)
1216 {
1217 connf_t *connfp;
1218 conn_t *tconn;
1219
1220 connfp = &ipst->ips_ipcl_proto_fanout_v6[connp->conn_proto];
1221 mutex_enter(&connfp->connf_lock);
1222 for (tconn = connfp->connf_head; tconn != NULL;
1223 tconn = tconn->conn_next) {
1224 /* We don't allow v4 fallback for v6 raw socket */
1225 if (connp->conn_family != tconn->conn_family)
1226 continue;
1227 /* If neither is exempt, then there's no conflict */
1228 if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
1229 (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
1230 continue;
1231 /* We are only concerned about sockets for a different zone */
1232 if (connp->conn_zoneid == tconn->conn_zoneid)
1233 continue;
1234 /* If both are bound to different addrs, ok */
1235 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) &&
1236 !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_laddr_v6) &&
1237 !IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6,
1238 &tconn->conn_laddr_v6))
1239 continue;
1240 /* These two conflict; fail */
1241 break;
1242 }
1243 mutex_exit(&connfp->connf_lock);
1244 return (tconn != NULL);
1245 }
1246
1247 /*
1248 * (v4, v6) bind hash insertion routines
1249 * The caller has already setup the conn (conn_proto, conn_laddr_v6, conn_lport)
1250 */
1251
1252 int
1253 ipcl_bind_insert(conn_t *connp)
1254 {
1255 if (connp->conn_ipversion == IPV6_VERSION)
1256 return (ipcl_bind_insert_v6(connp));
1257 else
1258 return (ipcl_bind_insert_v4(connp));
1259 }
1260
/*
 * Insert a v4 conn into the bind-side fanout appropriate for its
 * protocol.  Returns 0 or an errno (EADDRINUSE on a labeled-system MAC
 * exemption conflict, or whatever ipcl_sctp_hash_insert() returns).
 */
int
ipcl_bind_insert_v4(conn_t *connp)
{
	connf_t	*connfp;
	int	ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t	protocol = connp->conn_proto;

	if (IPCL_IS_IPTUN(connp))
		return (ipcl_iptun_hash_insert(connp, ipst));

	switch (protocol) {
	default:
		/*
		 * Unknown protocols get the labeled-system conflict check
		 * and then share the UDP insertion path below; the UDP/
		 * protocol fanout choice is re-derived from `protocol'.
		 */
		if (is_system_labeled() &&
		    check_exempt_conflict_v4(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
		}

		/* Connected, bound or wildcard, by address specificity. */
		if (connp->conn_faddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (connp->conn_laddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		if (protocol == IPPROTO_RSVP)
			ill_set_inputfn_all(ipst);
		break;

	case IPPROTO_TCP:
		/* Insert it in the Bind Hash */
		ASSERT(connp->conn_zoneid != ALL_ZONES);
		connfp = &ipst->ips_ipcl_bind_fanout[
		    IPCL_BIND_HASH(lport, ipst)];
		if (connp->conn_laddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		/* Notify the cluster listener hook, if one is registered. */
		if (cl_inet_listen != NULL) {
			ASSERT(connp->conn_ipversion == IPV4_VERSION);
			connp->conn_flags |= IPCL_CL_LISTENER;
			(*cl_inet_listen)(
			    connp->conn_netstack->netstack_stackid,
			    IPPROTO_TCP, AF_INET,
			    (uint8_t *)&connp->conn_bound_addr_v4, lport, NULL);
		}
		break;

	case IPPROTO_SCTP:
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;

	case IPPROTO_DCCP:
		/* NOTE(review): leftover debug output? consider removing. */
		cmn_err(CE_NOTE, "ipclassifier.c: ipcl_bind_insert_v4");
		ASSERT(connp->conn_zoneid != ALL_ZONES);
		connfp = &ipst->ips_ipcl_dccp_bind_fanout[
		    IPCL_DCCP_BIND_HASH(lport, ipst)];
		if (connp->conn_laddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;
	}


	return (ret);
}
1338
/*
 * IPv6 counterpart of ipcl_bind_insert_v4().  Insert the conn into the
 * fanout appropriate for its protocol.  Returns 0 or an errno.
 */
int
ipcl_bind_insert_v6(conn_t *connp)
{
	connf_t	*connfp;
	int	ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t	protocol = connp->conn_proto;

	if (IPCL_IS_IPTUN(connp)) {
		return (ipcl_iptun_hash_insert_v6(connp, ipst));
	}

	switch (protocol) {
	default:
		/*
		 * Unknown protocols: labeled-system conflict check, then
		 * share the UDP insertion path below.
		 */
		if (is_system_labeled() &&
		    check_exempt_conflict_v6(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
		}

		/* Connected, bound or wildcard, by address specificity. */
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;

	case IPPROTO_TCP:
		/* Insert it in the Bind Hash */
		ASSERT(connp->conn_zoneid != ALL_ZONES);
		connfp = &ipst->ips_ipcl_bind_fanout[
		    IPCL_BIND_HASH(lport, ipst)];
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		/*
		 * Notify the cluster listener hook; the address family
		 * passed depends on the conn's actual IP version (an
		 * AF_INET6 socket may carry a v4 conn).
		 */
		if (cl_inet_listen != NULL) {
			sa_family_t	addr_family;
			uint8_t		*laddrp;

			if (connp->conn_ipversion == IPV6_VERSION) {
				addr_family = AF_INET6;
				laddrp =
				    (uint8_t *)&connp->conn_bound_addr_v6;
			} else {
				addr_family = AF_INET;
				laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
			}
			connp->conn_flags |= IPCL_CL_LISTENER;
			(*cl_inet_listen)(
			    connp->conn_netstack->netstack_stackid,
			    IPPROTO_TCP, addr_family, laddrp, lport, NULL);
		}
		break;

	case IPPROTO_SCTP:
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;

	case IPPROTO_DCCP:
		/* NOTE(review): leftover debug output? consider removing. */
		cmn_err(CE_NOTE, "ipclassifier.c: ipcl_bind_insert_v6");
		ASSERT(connp->conn_zoneid != ALL_ZONES);
		connfp = &ipst->ips_ipcl_dccp_bind_fanout[
		    IPCL_DCCP_BIND_HASH(lport, ipst)];
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;
	}

	return (ret);
}
1423
1424 /*
1425 * ipcl_conn_hash insertion routines.
1426 * The caller has already set conn_proto and the addresses/ports in the conn_t.
1427 */
1428
1429 int
1430 ipcl_conn_insert(conn_t *connp)
1431 {
1432 if (connp->conn_ipversion == IPV6_VERSION)
1433 return (ipcl_conn_insert_v6(connp));
1434 else
1435 return (ipcl_conn_insert_v4(connp));
1436 }
1437
/*
 * Insert a fully-specified v4 conn into the connection fanout for its
 * protocol.  Returns 0, or EADDRINUSE if the 4-tuple (within the zone)
 * is already present.
 */
int
ipcl_conn_insert_v4(conn_t *connp)
{
	connf_t	*connfp;
	conn_t	*tconnp;
	int	ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t	protocol = connp->conn_proto;

	if (IPCL_IS_IPTUN(connp))
		return (ipcl_iptun_hash_insert(connp, ipst));

	switch (protocol) {
	case IPPROTO_TCP:
		/*
		 * For TCP, we check whether the connection tuple already
		 * exists before allowing the connection to proceed. We
		 * also allow indexing on the zoneid. This is to allow
		 * multiple shared stack zones to have the same tcp
		 * connection tuple. In practice this only happens for
		 * INADDR_LOOPBACK as it's the only local address which
		 * doesn't have to be unique.
		 */
		connfp = &ipst->ips_ipcl_conn_fanout[
		    IPCL_CONN_HASH(connp->conn_faddr_v4,
		    connp->conn_ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (tconnp = connfp->connf_head; tconnp != NULL;
		    tconnp = tconnp->conn_next) {
			if (IPCL_CONN_MATCH(tconnp, connp->conn_proto,
			    connp->conn_faddr_v4, connp->conn_laddr_v4,
			    connp->conn_ports) &&
			    IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
				/* Already have a conn. bail out */
				mutex_exit(&connfp->connf_lock);
				return (EADDRINUSE);
			}
		}
		if (connp->conn_fanout != NULL) {
			/*
			 * Probably a XTI/TLI application trying to do a
			 * rebind. Let it happen.
			 *
			 * connfp's lock must be dropped first: the removal
			 * macro acquires the lock of the fanout the conn
			 * currently sits on, which may be this very one.
			 */
			mutex_exit(&connfp->connf_lock);
			IPCL_HASH_REMOVE(connp);
			mutex_enter(&connfp->connf_lock);
		}

		ASSERT(connp->conn_recv != NULL);
		ASSERT(connp->conn_recvicmp != NULL);

		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
		mutex_exit(&connfp->connf_lock);
		break;

	case IPPROTO_SCTP:
		/*
		 * The raw socket may have already been bound, remove it
		 * from the hash first.
		 */
		IPCL_HASH_REMOVE(connp);
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;

	case IPPROTO_DCCP:
		/* NOTE(review): leftover debug output? consider removing. */
		cmn_err(CE_NOTE, "ipclassifier.c: ipcl_conn_insert_v4");
		/* Same duplicate-tuple check as TCP, on the DCCP fanout. */
		connfp = &ipst->ips_ipcl_dccp_conn_fanout[IPCL_DCCP_CONN_HASH(
		    connp->conn_faddr_v4, connp->conn_ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (tconnp = connfp->connf_head; tconnp != NULL;
		    tconnp = tconnp->conn_next) {
			if (IPCL_CONN_MATCH(tconnp, connp->conn_proto,
			    connp->conn_faddr_v4, connp->conn_laddr_v4,
			    connp->conn_ports) &&
			    IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
				/* Already have a conn. bail out */
				mutex_exit(&connfp->connf_lock);
				return (EADDRINUSE);
			}
		}

		/* XXX:DCCP XTI/TLI application? */

		ASSERT(connp->conn_recv != NULL);
		ASSERT(connp->conn_recvicmp != NULL);

		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
		mutex_exit(&connfp->connf_lock);
		break;

	default:
		/*
		 * Check for conflicts among MAC exempt bindings. For
		 * transports with port numbers, this is done by the upper
		 * level per-transport binding logic. For all others, it's
		 * done here.
		 */
		if (is_system_labeled() &&
		    check_exempt_conflict_v4(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */

	case IPPROTO_UDP:
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
		}

		/* Connected, bound or wildcard, by address specificity. */
		if (connp->conn_faddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (connp->conn_laddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;
	}

	return (ret);
}
1561
/*
 * IPv6 counterpart of ipcl_conn_insert_v4().  In addition to the 5-tuple
 * and zone, a bound interface index (conn_bound_if) participates in the
 * duplicate check: an existing conn bound to a different interface does
 * not conflict.
 */
int
ipcl_conn_insert_v6(conn_t *connp)
{
	connf_t	*connfp;
	conn_t	*tconnp;
	int	ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t	protocol = connp->conn_proto;
	uint_t	ifindex = connp->conn_bound_if;

	if (IPCL_IS_IPTUN(connp))
		return (ipcl_iptun_hash_insert_v6(connp, ipst));

	switch (protocol) {
	case IPPROTO_TCP:

		/*
		 * For tcp, we check whether the connection tuple already
		 * exists before allowing the connection to proceed. We
		 * also allow indexing on the zoneid. This is to allow
		 * multiple shared stack zones to have the same tcp
		 * connection tuple. In practice this only happens for
		 * ipv6_loopback as it's the only local address which
		 * doesn't have to be unique.
		 */
		connfp = &ipst->ips_ipcl_conn_fanout[
		    IPCL_CONN_HASH_V6(connp->conn_faddr_v6, connp->conn_ports,
		    ipst)];
		mutex_enter(&connfp->connf_lock);
		for (tconnp = connfp->connf_head; tconnp != NULL;
		    tconnp = tconnp->conn_next) {
			/* NOTE: need to match zoneid. Bug in onnv-gate */
			if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_proto,
			    connp->conn_faddr_v6, connp->conn_laddr_v6,
			    connp->conn_ports) &&
			    (tconnp->conn_bound_if == 0 ||
			    tconnp->conn_bound_if == ifindex) &&
			    IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
				/* Already have a conn. bail out */
				mutex_exit(&connfp->connf_lock);
				return (EADDRINUSE);
			}
		}
		if (connp->conn_fanout != NULL) {
			/*
			 * Probably a XTI/TLI application trying to do a
			 * rebind. Let it happen.
			 *
			 * connfp's lock must be dropped first: the removal
			 * macro acquires the lock of the fanout the conn
			 * currently sits on, which may be this very one.
			 */
			mutex_exit(&connfp->connf_lock);
			IPCL_HASH_REMOVE(connp);
			mutex_enter(&connfp->connf_lock);
		}
		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
		mutex_exit(&connfp->connf_lock);
		break;

	case IPPROTO_SCTP:
		/* The raw socket may already be bound; unhash it first. */
		IPCL_HASH_REMOVE(connp);
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;

	case IPPROTO_DCCP:
		/* NOTE(review): leftover debug output? consider removing. */
		cmn_err(CE_NOTE, "ipclassifier.c: ipcl_conn_insert_v6");
		/* Same duplicate-tuple check as TCP, on the DCCP fanout. */
		connfp = &ipst->ips_ipcl_dccp_conn_fanout[
		    IPCL_DCCP_CONN_HASH_V6(connp->conn_faddr_v6,
		    connp->conn_ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (tconnp = connfp->connf_head; tconnp != NULL;
		    tconnp = tconnp->conn_next) {
			/* NOTE: need to match zoneid. Bug in onnv-gate */
			if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_proto,
			    connp->conn_faddr_v6, connp->conn_laddr_v6,
			    connp->conn_ports) &&
			    (tconnp->conn_bound_if == 0 ||
			    tconnp->conn_bound_if == ifindex) &&
			    IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
				/* Already have a conn. bail out */
				mutex_exit(&connfp->connf_lock);
				return (EADDRINUSE);
			}
		}

		/* XXX:DCCP XTI/TLI? */
		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
		mutex_exit(&connfp->connf_lock);
		break;

	default:
		/* MAC exemption conflict check for portless protocols. */
		if (is_system_labeled() &&
		    check_exempt_conflict_v6(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
		}

		/* Connected, bound or wildcard, by address specificity. */
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;
	}

	return (ret);
}
1675
1676 /*
1677 * v4 packet classifying function. looks up the fanout table to
1678 * find the conn, the packet belongs to. returns the conn with
1679 * the reference held, null otherwise.
1680 *
1681 * If zoneid is ALL_ZONES, then the search rules described in the "Connection
1682 * Lookup" comment block are applied. Labels are also checked as described
1683 * above. If the packet is from the inside (looped back), and is from the same
1684 * zone, then label checks are omitted.
1685 */
conn_t *
ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
    ip_recv_attr_t *ira, ip_stack_t *ipst)
{
	ipha_t	*ipha;
	connf_t	*connfp, *bind_connfp;
	uint16_t lport;
	uint16_t fport;
	uint32_t ports;
	conn_t	*connp;
	uint16_t  *up;
	zoneid_t zoneid = ira->ira_zoneid;

	ipha = (ipha_t *)mp->b_rptr;
	/*
	 * up points at the transport-header port pair (src, dst), in
	 * network order; ports reads both as one 32-bit word.
	 */
	up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET);

	switch (protocol) {
	case IPPROTO_TCP:
		/* First try a fully-bound (connected) TCP match. */
		ports = *(uint32_t *)up;
		connfp =
		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src,
		    ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_CONN_MATCH(connp, protocol,
			    ipha->ipha_src, ipha->ipha_dst, ports) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		if (connp != NULL) {
			/*
			 * We have a fully-bound TCP connection.
			 *
			 * For labeled systems, there's no need to check the
			 * label here. It's known to be good as we checked
			 * before allowing the connection to become bound.
			 */
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/* No connected match; fall back to the bind (listener) hash. */
		mutex_exit(&connfp->connf_lock);
		lport = up[1];
		bind_connfp =
		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
		mutex_enter(&bind_connfp->connf_lock);
		for (connp = bind_connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst,
			    lport) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		/*
		 * If the matching connection is SLP on a private address, then
		 * the label on the packet must match the local zone's label.
		 * Otherwise, it must be in the label range defined by tnrh.
		 * This is ensured by tsol_receive_local.
		 *
		 * Note that we don't check tsol_receive_local for
		 * the connected case.
		 */
		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__tcp,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			/* Have a listener at least */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&bind_connfp->connf_lock);
		break;

	case IPPROTO_UDP:
		lport = up[1];
		fport = up[0];
		/*
		 * NOTE(review): unlike the v6 UDP path, the MAC-exempt
		 * clause here does not also require IRAF_TX_SHARED_ADDR —
		 * confirm whether the asymmetry is intentional.
		 */
		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst,
			    fport, ipha->ipha_src) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE))))
				break;
		}

		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__udp,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/*
		 * We shouldn't come here for multicast/broadcast packets
		 */
		mutex_exit(&connfp->connf_lock);

		break;

	case IPPROTO_DCCP:
		ports = *(uint32_t *)up;

		/*
		 * Search for fully-bound connection.
		 */
		connfp = &ipst->ips_ipcl_dccp_conn_fanout[IPCL_DCCP_CONN_HASH(
		    ipha->ipha_src, ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			/* XXX:DCCP no zone check yet, unlike TCP above */
			if (IPCL_CONN_MATCH(connp, protocol,
			    ipha->ipha_src, ipha->ipha_dst, ports)) {
				/* NOTE(review): leftover debug output */
				cmn_err(CE_NOTE, "ipclassifier.c: fully bound connection found");
				break;
			}
		}

		if (connp != NULL) {
			/*
			 * We have a fully-bound DCCP connection.
			 */
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&connfp->connf_lock);
		lport = up[1];

		/*
		 * Fully-bound connection was not found, search for listener.
		 */
		bind_connfp = &ipst->ips_ipcl_dccp_bind_fanout[
		    IPCL_DCCP_BIND_HASH(lport, ipst)];
		mutex_enter(&bind_connfp->connf_lock);
		for (connp = bind_connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst,
			    lport) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		if (connp != NULL) {
			/* NOTE(review): leftover debug output */
			cmn_err(CE_NOTE, "ipclassifier.c: half-bound bind listener");
			/* Have a listener at least */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&bind_connfp->connf_lock);
		break;

	case IPPROTO_ENCAP:
	case IPPROTO_IPV6:
		/* Tunnel traffic is classified by the iptun fanout. */
		return (ipcl_iptun_classify_v4(&ipha->ipha_src,
		    &ipha->ipha_dst, ipst));
	}

	return (NULL);
}
1886
/*
 * v6 counterpart of ipcl_classify_v4(): look up the conn an inbound IPv6
 * packet belongs to.  Returns the conn with a reference held, or NULL.
 */
conn_t *
ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
    ip_recv_attr_t *ira, ip_stack_t *ipst)
{
	ip6_t	*ip6h;
	connf_t	*connfp, *bind_connfp;
	uint16_t lport;
	uint16_t fport;
	tcpha_t	*tcpha;
	uint32_t ports;
	conn_t	*connp;
	uint16_t *up;
	zoneid_t zoneid = ira->ira_zoneid;

	ip6h = (ip6_t *)mp->b_rptr;

	switch (protocol) {
	case IPPROTO_TCP:
		/* First try a fully-bound (connected) TCP match. */
		tcpha = (tcpha_t *)&mp->b_rptr[hdr_len];
		up = &tcpha->tha_lport;
		ports = *(uint32_t *)up;

		connfp =
		    &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src,
		    ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_CONN_MATCH_V6(connp, protocol,
			    ip6h->ip6_src, ip6h->ip6_dst, ports) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		if (connp != NULL) {
			/*
			 * We have a fully-bound TCP connection.
			 *
			 * For labeled systems, there's no need to check the
			 * label here. It's known to be good as we checked
			 * before allowing the connection to become bound.
			 */
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/* No connected match; fall back to the bind (listener) hash. */
		mutex_exit(&connfp->connf_lock);

		lport = up[1];
		bind_connfp =
		    &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
		mutex_enter(&bind_connfp->connf_lock);
		for (connp = bind_connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_BIND_MATCH_V6(connp, protocol,
			    ip6h->ip6_dst, lport) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		/* Label check for the listener case; see the v4 routine. */
		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__tcp6,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			/* Have a listener at least */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}

		mutex_exit(&bind_connfp->connf_lock);
		break;

	case IPPROTO_UDP:
		up = (uint16_t *)&mp->b_rptr[hdr_len];
		lport = up[1];
		fport = up[0];
		connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (connp = connfp->connf_head; connp != NULL;
		    connp = connp->conn_next) {
			if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst,
			    fport, ip6h->ip6_src) &&
			    (connp->conn_zoneid == zoneid ||
			    connp->conn_allzones ||
			    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
			    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
			    (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
				break;
		}

		if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
		    !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
		    ira, connp)) {
			DTRACE_PROBE3(tx__ip__log__info__classify__udp6,
			    char *, "connp(1) could not receive mp(2)",
			    conn_t *, connp, mblk_t *, mp);
			connp = NULL;
		}

		if (connp != NULL) {
			CONN_INC_REF(connp);
			mutex_exit(&connfp->connf_lock);
			return (connp);
		}

		/*
		 * We shouldn't come here for multicast/broadcast packets
		 */
		mutex_exit(&connfp->connf_lock);
		break;
	case IPPROTO_ENCAP:
	case IPPROTO_IPV6:
		/* Tunnel traffic is classified by the iptun fanout. */
		return (ipcl_iptun_classify_v6(&ip6h->ip6_src,
		    &ip6h->ip6_dst, ipst));
	}

	return (NULL);
}
2021
2022 /*
2023 * wrapper around ipcl_classify_(v4,v6) routines.
2024 */
2025 conn_t *
2026 ipcl_classify(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst)
2027 {
2028 if (ira->ira_flags & IRAF_IS_IPV4) {
2029 return (ipcl_classify_v4(mp, ira->ira_protocol,
2030 ira->ira_ip_hdr_length, ira, ipst));
2031 } else {
2032 return (ipcl_classify_v6(mp, ira->ira_protocol,
2033 ira->ira_ip_hdr_length, ira, ipst));
2034 }
2035 }
2036
2037 /*
2038 * Only used to classify SCTP RAW sockets
2039 */
conn_t *
ipcl_classify_raw(mblk_t *mp, uint8_t protocol, uint32_t ports,
    ipha_t *ipha, ip6_t *ip6h, ip_recv_attr_t *ira, ip_stack_t *ipst)
{
	connf_t		*connfp;
	conn_t		*connp;
	in_port_t	lport;
	int		ipversion;
	const void	*dst;
	zoneid_t	zoneid = ira->ira_zoneid;

	/* 'ports' packs {fport, lport}; the local port selects the bucket. */
	lport = ((uint16_t *)&ports)[1];
	if (ira->ira_flags & IRAF_IS_IPV4) {
		dst = (const void *)&ipha->ipha_dst;
		ipversion = IPV4_VERSION;
	} else {
		dst = (const void *)&ip6h->ip6_dst;
		ipversion = IPV6_VERSION;
	}

	/* First pass: search the bucket for this specific local port. */
	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		/* We don't allow v4 fallback for v6 raw socket. */
		if (ipversion != connp->conn_ipversion)
			continue;
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
			/* Connected socket: match the full quadruplet. */
			if (ipversion == IPV4_VERSION) {
				if (!IPCL_CONN_MATCH(connp, protocol,
				    ipha->ipha_src, ipha->ipha_dst, ports))
					continue;
			} else {
				if (!IPCL_CONN_MATCH_V6(connp, protocol,
				    ip6h->ip6_src, ip6h->ip6_dst, ports))
					continue;
			}
		} else {
			/* Unconnected: match local address and port only. */
			if (ipversion == IPV4_VERSION) {
				if (!IPCL_BIND_MATCH(connp, protocol,
				    ipha->ipha_dst, lport))
					continue;
			} else {
				if (!IPCL_BIND_MATCH_V6(connp, protocol,
				    ip6h->ip6_dst, lport))
					continue;
			}
		}

		/*
		 * Accept on an exact zone match, an all-zones conn, or a
		 * conn with non-default MAC mode receiving on a shared
		 * address when the sender is MAC-exemptable.
		 */
		if (connp->conn_zoneid == zoneid ||
		    connp->conn_allzones ||
		    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
		    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
		    (ira->ira_flags & IRAF_TX_SHARED_ADDR)))
			break;
	}

	/* On labeled systems, verify the candidate may receive this packet. */
	if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
	    !tsol_receive_local(mp, dst, ipversion, ira, connp)) {
		DTRACE_PROBE3(tx__ip__log__info__classify__rawip,
		    char *, "connp(1) could not receive mp(2)",
		    conn_t *, connp, mblk_t *, mp);
		connp = NULL;
	}

	if (connp != NULL)
		goto found;
	mutex_exit(&connfp->connf_lock);

	/* Try to look for a wildcard SCTP RAW socket match. */
	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)];
	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		/* We don't allow v4 fallback for v6 raw socket. */
		if (ipversion != connp->conn_ipversion)
			continue;
		if (!IPCL_ZONE_MATCH(connp, zoneid))
			continue;

		if (ipversion == IPV4_VERSION) {
			if (IPCL_RAW_MATCH(connp, protocol, ipha->ipha_dst))
				break;
		} else {
			if (IPCL_RAW_MATCH_V6(connp, protocol, ip6h->ip6_dst)) {
				break;
			}
		}
	}

	if (connp != NULL)
		goto found;

	mutex_exit(&connfp->connf_lock);
	return (NULL);

found:
	/* Return with a reference held; the caller does CONN_DEC_REF. */
	ASSERT(connp != NULL);
	CONN_INC_REF(connp);
	mutex_exit(&connfp->connf_lock);
	return (connp);
}
2143
/*
 * kmem cache constructor for a TCP conn_t.  The cache buffer (an itc_t)
 * holds the conn_t immediately followed by the tcp_t; both are zeroed
 * and cross-linked here.  Returns 0 on success or ENOMEM if the timer
 * mblk or the transmit attributes can't be allocated; on the latter
 * path the timer mblk is unwound by hand.
 */
/* ARGSUSED */
static int
tcp_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	tcp_t	*tcp = (tcp_t *)&itc[1];	/* tcp_t follows the conn_t */

	bzero(connp, sizeof (conn_t));
	bzero(tcp, sizeof (tcp_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&connp->conn_sq_cv, NULL, CV_DEFAULT, NULL);
	tcp->tcp_timercache = tcp_timermp_alloc(kmflags);
	if (tcp->tcp_timercache == NULL)
		return (ENOMEM);
	connp->conn_tcp = tcp;
	connp->conn_flags = IPCL_TCPCONN;
	connp->conn_proto = IPPROTO_TCP;
	tcp->tcp_connp = connp;
	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);

	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
	if (connp->conn_ixa == NULL) {
		/* Undo the successful timer mblk allocation above. */
		tcp_timermp_free(tcp);
		return (ENOMEM);
	}
	connp->conn_ixa->ixa_refcnt = 1;
	connp->conn_ixa->ixa_protocol = connp->conn_proto;
	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
	return (0);
}
2177
/*
 * kmem cache destructor for a TCP conn_t; releases everything set up by
 * tcp_conn_constructor (timer mblk, locks, cv's, transmit attributes).
 */
/* ARGSUSED */
static void
tcp_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	tcp_t	*tcp = (tcp_t *)&itc[1];

	ASSERT(connp->conn_flags & IPCL_TCPCONN);
	ASSERT(tcp->tcp_connp == connp);
	ASSERT(connp->conn_tcp == tcp);
	tcp_timermp_free(tcp);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	cv_destroy(&connp->conn_sq_cv);
	rw_destroy(&connp->conn_ilg_lock);

	/* Can be NULL if constructor failed */
	if (connp->conn_ixa != NULL) {
		/* Must be the sole reference with no cached routing state. */
		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
		ASSERT(connp->conn_ixa->ixa_ire == NULL);
		ASSERT(connp->conn_ixa->ixa_nce == NULL);
		ixa_refrele(connp->conn_ixa);
	}
}
2203
/*
 * kmem cache constructor for a bare IP conn_t (no transport state).
 * Returns 0 on success, ENOMEM if the transmit attributes can't be
 * allocated.
 */
/* ARGSUSED */
static int
ip_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;

	bzero(connp, sizeof (conn_t));
	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	connp->conn_flags = IPCL_IPCCONN;
	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);

	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
	if (connp->conn_ixa == NULL)
		return (ENOMEM);
	connp->conn_ixa->ixa_refcnt = 1;
	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
	return (0);
}
2224
/*
 * kmem cache destructor for a bare IP conn_t; counterpart of
 * ip_conn_constructor.
 */
/* ARGSUSED */
static void
ip_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;

	ASSERT(connp->conn_flags & IPCL_IPCCONN);
	ASSERT(connp->conn_priv == NULL);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	rw_destroy(&connp->conn_ilg_lock);

	/* Can be NULL if constructor failed */
	if (connp->conn_ixa != NULL) {
		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
		ASSERT(connp->conn_ixa->ixa_ire == NULL);
		ASSERT(connp->conn_ixa->ixa_nce == NULL);
		ixa_refrele(connp->conn_ixa);
	}
}
2246
/*
 * kmem cache constructor for a UDP conn_t.  The udp_t sits directly
 * after the conn_t in the itc_t buffer and the two are cross-linked.
 * Returns 0 on success, ENOMEM on allocation failure.
 */
/* ARGSUSED */
static int
udp_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	udp_t	*udp = (udp_t *)&itc[1];	/* udp_t follows the conn_t */

	bzero(connp, sizeof (conn_t));
	bzero(udp, sizeof (udp_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	connp->conn_udp = udp;
	connp->conn_flags = IPCL_UDPCONN;
	connp->conn_proto = IPPROTO_UDP;
	udp->udp_connp = connp;
	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
	if (connp->conn_ixa == NULL)
		return (ENOMEM);
	connp->conn_ixa->ixa_refcnt = 1;
	connp->conn_ixa->ixa_protocol = connp->conn_proto;
	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
	return (0);
}
2273
/*
 * kmem cache destructor for a UDP conn_t; counterpart of
 * udp_conn_constructor.
 */
/* ARGSUSED */
static void
udp_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	udp_t	*udp = (udp_t *)&itc[1];

	ASSERT(connp->conn_flags & IPCL_UDPCONN);
	ASSERT(udp->udp_connp == connp);
	ASSERT(connp->conn_udp == udp);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	rw_destroy(&connp->conn_ilg_lock);

	/* Can be NULL if constructor failed */
	if (connp->conn_ixa != NULL) {
		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
		ASSERT(connp->conn_ixa->ixa_ire == NULL);
		ASSERT(connp->conn_ixa->ixa_nce == NULL);
		ixa_refrele(connp->conn_ixa);
	}
}
2297
/*
 * kmem cache constructor for a raw IP (ICMP) conn_t.  The icmp_t sits
 * directly after the conn_t in the itc_t buffer.  Returns 0 on success,
 * ENOMEM on allocation failure.
 */
/* ARGSUSED */
static int
rawip_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	icmp_t	*icmp = (icmp_t *)&itc[1];	/* icmp_t follows the conn_t */

	bzero(connp, sizeof (conn_t));
	bzero(icmp, sizeof (icmp_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	connp->conn_icmp = icmp;
	connp->conn_flags = IPCL_RAWIPCONN;
	connp->conn_proto = IPPROTO_ICMP;
	icmp->icmp_connp = connp;
	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
	if (connp->conn_ixa == NULL)
		return (ENOMEM);
	connp->conn_ixa->ixa_refcnt = 1;
	connp->conn_ixa->ixa_protocol = connp->conn_proto;
	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
	return (0);
}
2324
/*
 * kmem cache destructor for a raw IP (ICMP) conn_t; counterpart of
 * rawip_conn_constructor.
 */
/* ARGSUSED */
static void
rawip_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	icmp_t	*icmp = (icmp_t *)&itc[1];

	ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
	ASSERT(icmp->icmp_connp == connp);
	ASSERT(connp->conn_icmp == icmp);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	rw_destroy(&connp->conn_ilg_lock);

	/* Can be NULL if constructor failed */
	if (connp->conn_ixa != NULL) {
		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
		ASSERT(connp->conn_ixa->ixa_ire == NULL);
		ASSERT(connp->conn_ixa->ixa_nce == NULL);
		ixa_refrele(connp->conn_ixa);
	}
}
2348
/*
 * kmem cache constructor for a routing socket (rts) conn_t.  The rts_t
 * sits directly after the conn_t in the itc_t buffer.  No conn_proto or
 * ixa_protocol is set; routing sockets don't carry an IP protocol.
 * Returns 0 on success, ENOMEM on allocation failure.
 */
/* ARGSUSED */
static int
rts_conn_constructor(void *buf, void *cdrarg, int kmflags)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	rts_t	*rts = (rts_t *)&itc[1];	/* rts_t follows the conn_t */

	bzero(connp, sizeof (conn_t));
	bzero(rts, sizeof (rts_t));

	mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
	connp->conn_rts = rts;
	connp->conn_flags = IPCL_RTSCONN;
	rts->rts_connp = connp;
	rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
	connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
	if (connp->conn_ixa == NULL)
		return (ENOMEM);
	connp->conn_ixa->ixa_refcnt = 1;
	connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
	return (0);
}
2373
/*
 * kmem cache destructor for a routing socket (rts) conn_t; counterpart
 * of rts_conn_constructor.
 */
/* ARGSUSED */
static void
rts_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	rts_t	*rts = (rts_t *)&itc[1];

	ASSERT(connp->conn_flags & IPCL_RTSCONN);
	ASSERT(rts->rts_connp == connp);
	ASSERT(connp->conn_rts == rts);
	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	rw_destroy(&connp->conn_ilg_lock);

	/* Can be NULL if constructor failed */
	if (connp->conn_ixa != NULL) {
		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
		ASSERT(connp->conn_ixa->ixa_ire == NULL);
		ASSERT(connp->conn_ixa->ixa_nce == NULL);
		ixa_refrele(connp->conn_ixa);
	}
}
2397
2398 /* ARGSUSED */
2399 static int
2400 dccp_conn_constructor(void *buf, void *cdrarg, int kmflags)
2401 {
2402 itc_t *itc = (itc_t *)buf;
2403 conn_t *connp = &itc->itc_conn;
2404 dccp_t *dccp = (dccp_t *)&itc[1];
2405
2406 bzero(connp, sizeof (conn_t));
2407 bzero(dccp, sizeof (dccp_t));
2408
2409 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2410 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2411 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2412
2413 dccp->dccp_timercache = dccp_timermp_alloc(kmflags);
2414 if (dccp->dccp_timercache == NULL) {
2415 return (ENOMEM);
2416 }
2417
2418 connp->conn_dccp = dccp;
2419 connp->conn_flags = IPCL_DCCPCONN;
2420 connp->conn_proto = IPPROTO_DCCP;
2421 dccp->dccp_connp = connp;
2422
2423 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2424 if (connp->conn_ixa == NULL) {
2425 return (NULL);
2426 }
2427
2428 connp->conn_ixa->ixa_refcnt = 1;
2429 connp->conn_ixa->ixa_protocol = connp->conn_proto;
2430 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2431
2432 return (0);
2433 }
2434
/*
 * kmem cache destructor for a DCCP conn_t; counterpart of
 * dccp_conn_constructor, mirroring tcp_conn_destructor.
 */
/* ARGSUSED */
static void
dccp_conn_destructor(void *buf, void *cdrarg)
{
	itc_t	*itc = (itc_t *)buf;
	conn_t	*connp = &itc->itc_conn;
	dccp_t	*dccp = (dccp_t *)&itc[1];

	ASSERT(connp->conn_flags & IPCL_DCCPCONN);
	ASSERT(dccp->dccp_connp == connp);
	ASSERT(connp->conn_dccp == dccp);

	dccp_timermp_free(dccp);

	mutex_destroy(&connp->conn_lock);
	cv_destroy(&connp->conn_cv);
	rw_destroy(&connp->conn_ilg_lock);

	/* Can be NULL if constructor failed */
	if (connp->conn_ixa != NULL) {
		ASSERT(connp->conn_ixa->ixa_refcnt == 1);
		ASSERT(connp->conn_ixa->ixa_ire == NULL);
		ASSERT(connp->conn_ixa->ixa_nce == NULL);

		ixa_refrele(connp->conn_ixa);
	}
}
2461
2462 /*
2463 * Called as part of ipcl_conn_destroy to assert and clear any pointers
2464 * in the conn_t.
2465 *
2466 * Below we list all the pointers in the conn_t as a documentation aid.
2467 * The ones that we can not ASSERT to be NULL are #ifdef'ed out.
2468 * If you add any pointers to the conn_t please add an ASSERT here
2469 * and #ifdef it out if it can't be actually asserted to be NULL.
2470 * In any case, we bzero most of the conn_t at the end of the function.
2471 */
void
ipcl_conn_cleanup(conn_t *connp)
{
	ip_xmit_attr_t	*ixa;

	/* All linkage and cached state must already be released. */
	ASSERT(connp->conn_latch == NULL);
	ASSERT(connp->conn_latch_in_policy == NULL);
	ASSERT(connp->conn_latch_in_action == NULL);
#ifdef notdef
	ASSERT(connp->conn_rq == NULL);
	ASSERT(connp->conn_wq == NULL);
#endif
	ASSERT(connp->conn_cred == NULL);
	ASSERT(connp->conn_g_fanout == NULL);
	ASSERT(connp->conn_g_next == NULL);
	ASSERT(connp->conn_g_prev == NULL);
	ASSERT(connp->conn_policy == NULL);
	ASSERT(connp->conn_fanout == NULL);
	ASSERT(connp->conn_next == NULL);
	ASSERT(connp->conn_prev == NULL);
	ASSERT(connp->conn_oper_pending_ill == NULL);
	ASSERT(connp->conn_ilg == NULL);
	ASSERT(connp->conn_drain_next == NULL);
	ASSERT(connp->conn_drain_prev == NULL);
#ifdef notdef
	/* conn_idl is not cleared when removed from idl list */
	ASSERT(connp->conn_idl == NULL);
#endif
	ASSERT(connp->conn_ipsec_opt_mp == NULL);
#ifdef notdef
	/* conn_netstack is cleared by the caller; needed by ixa_cleanup */
	ASSERT(connp->conn_netstack == NULL);
#endif

	ASSERT(connp->conn_helper_info == NULL);
	ASSERT(connp->conn_ixa != NULL);
	ixa = connp->conn_ixa;
	ASSERT(ixa->ixa_refcnt == 1);
	/* Need to preserve ixa_protocol */
	ixa_cleanup(ixa);
	ixa->ixa_flags = 0;

	/* Clear out the conn_t fields that are not preserved */
	bzero(&connp->conn_start_clr,
	    sizeof (conn_t) -
	    ((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp));
}
2519
2520 /*
2521 * All conns are inserted in a global multi-list for the benefit of
2522 * walkers. The walk is guaranteed to walk all open conns at the time
2523 * of the start of the walk exactly once. This property is needed to
2524 * achieve some cleanups during unplumb of interfaces. This is achieved
2525 * as follows.
2526 *
2527 * ipcl_conn_create and ipcl_conn_destroy are the only functions that
2528 * call the insert and delete functions below at creation and deletion
2529 * time respectively. The conn never moves or changes its position in this
2530 * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt
2531 * won't increase due to walkers, once the conn deletion has started. Note
2532 * that we can't remove the conn from the global list and then wait for
2533 * the refcnt to drop to zero, since walkers would then see a truncated
2534 * list. CONN_INCIPIENT ensures that walkers don't start looking at
2535 * conns until ip_open is ready to make them globally visible.
2536 * The global round robin multi-list locks are held only to get the
2537 * next member/insertion/deletion and contention should be negligible
2538 * if the multi-list is much greater than the number of cpus.
2539 */
void
ipcl_globalhash_insert(conn_t *connp)
{
	int	index;
	struct connf_s	*connfp;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;

	/*
	 * No need for atomic here. Approximate even distribution
	 * in the global lists is sufficient.
	 * (CONN_G_HASH_SIZE must be a power of two for this mask.)
	 */
	ipst->ips_conn_g_index++;
	index = ipst->ips_conn_g_index & (CONN_G_HASH_SIZE - 1);

	connp->conn_g_prev = NULL;
	/*
	 * Mark as INCIPIENT, so that walkers will ignore this
	 * for now, till ip_open is ready to make it visible globally.
	 */
	connp->conn_state_flags |= CONN_INCIPIENT;

	connfp = &ipst->ips_ipcl_globalhash_fanout[index];
	/* Insert at the head of the list */
	mutex_enter(&connfp->connf_lock);
	connp->conn_g_next = connfp->connf_head;
	if (connp->conn_g_next != NULL)
		connp->conn_g_next->conn_g_prev = connp;
	connfp->connf_head = connp;

	/* The fanout bucket this conn points to */
	connp->conn_g_fanout = connfp;

	mutex_exit(&connfp->connf_lock);
}
2574
2575 void
2576 ipcl_globalhash_remove(conn_t *connp)
2577 {
2578 struct connf_s *connfp;
2579
2580 /*
2581 * We were never inserted in the global multi list.
2582 * IPCL_NONE variety is never inserted in the global multilist
2583 * since it is presumed to not need any cleanup and is transient.
2584 */
2585 if (connp->conn_g_fanout == NULL)
2586 return;
2587
2588 connfp = connp->conn_g_fanout;
2589 mutex_enter(&connfp->connf_lock);
2590 if (connp->conn_g_prev != NULL)
2591 connp->conn_g_prev->conn_g_next = connp->conn_g_next;
2592 else
2593 connfp->connf_head = connp->conn_g_next;
2594 if (connp->conn_g_next != NULL)
2595 connp->conn_g_next->conn_g_prev = connp->conn_g_prev;
2596 mutex_exit(&connfp->connf_lock);
2597
2598 /* Better to stumble on a null pointer than to corrupt memory */
2599 connp->conn_g_next = NULL;
2600 connp->conn_g_prev = NULL;
2601 connp->conn_g_fanout = NULL;
2602 }
2603
2604 /*
 * Walk the list of all conn_t's in the system, calling the function
 * provided with the specified argument for each connection.
 * Applies to both IPv4 and IPv6.
2608 *
2609 * CONNs may hold pointers to ills (conn_dhcpinit_ill and
2610 * conn_oper_pending_ill). To guard against stale pointers
2611 * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is
2612 * unplumbed or removed. New conn_t's that are created while we are walking
2613 * may be missed by this walk, because they are not necessarily inserted
2614 * at the tail of the list. They are new conn_t's and thus don't have any
2615 * stale pointers. The CONN_CLOSING flag ensures that no new reference
2616 * is created to the struct that is going away.
2617 */
void
ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst)
{
	int	i;
	conn_t	*connp;
	conn_t	*prev_connp;

	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
		mutex_enter(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
		prev_connp = NULL;
		connp = ipst->ips_ipcl_globalhash_fanout[i].connf_head;
		while (connp != NULL) {
			mutex_enter(&connp->conn_lock);
			/* Skip conns going away or not yet globally visible. */
			if (connp->conn_state_flags &
			    (CONN_CONDEMNED | CONN_INCIPIENT)) {
				mutex_exit(&connp->conn_lock);
				connp = connp->conn_g_next;
				continue;
			}
			/*
			 * Hold the conn so it stays valid while the callback
			 * runs with the bucket lock dropped.
			 */
			CONN_INC_REF_LOCKED(connp);
			mutex_exit(&connp->conn_lock);
			mutex_exit(
			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
			(*func)(connp, arg);
			/*
			 * NOTE(review): the previous conn's reference is
			 * dropped here, outside the bucket lock — presumably
			 * because the final CONN_DEC_REF may need to take
			 * that lock to unlink the conn; confirm.
			 */
			if (prev_connp != NULL)
				CONN_DEC_REF(prev_connp);
			mutex_enter(
			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
			prev_connp = connp;
			connp = connp->conn_g_next;
		}
		mutex_exit(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
		if (prev_connp != NULL)
			CONN_DEC_REF(prev_connp);
	}
}
2654
2655 /*
2656 * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on
2657 * the {src, dst, lport, fport} quadruplet. Returns with conn reference
2658 * held; caller must call CONN_DEC_REF. Only checks for connected entries
2659 * (peer tcp in ESTABLISHED state).
2660 */
2661 conn_t *
2662 ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcpha_t *tcpha,
2663 ip_stack_t *ipst)
2664 {
2665 uint32_t ports;
2666 uint16_t *pports = (uint16_t *)&ports;
2667 connf_t *connfp;
2668 conn_t *tconnp;
2669 boolean_t zone_chk;
2670
2671 /*
2672 * If either the source of destination address is loopback, then
2673 * both endpoints must be in the same Zone. Otherwise, both of
2674 * the addresses are system-wide unique (tcp is in ESTABLISHED
2675 * state) and the endpoints may reside in different Zones.
2676 */
2677 zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) ||
2678 ipha->ipha_dst == htonl(INADDR_LOOPBACK));
2679
2680 pports[0] = tcpha->tha_fport;
2681 pports[1] = tcpha->tha_lport;
2682
2683 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2684 ports, ipst)];
2685
2686 mutex_enter(&connfp->connf_lock);
2687 for (tconnp = connfp->connf_head; tconnp != NULL;
2688 tconnp = tconnp->conn_next) {
2689
2690 if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2691 ipha->ipha_dst, ipha->ipha_src, ports) &&
2692 tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
2693 (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
2694
2695 ASSERT(tconnp != connp);
2696 CONN_INC_REF(tconnp);
2697 mutex_exit(&connfp->connf_lock);
2698 return (tconnp);
2699 }
2700 }
2701 mutex_exit(&connfp->connf_lock);
2702 return (NULL);
2703 }
2704
2705 /*
2706 * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on
2707 * the {src, dst, lport, fport} quadruplet. Returns with conn reference
2708 * held; caller must call CONN_DEC_REF. Only checks for connected entries
2709 * (peer tcp in ESTABLISHED state).
2710 */
conn_t *
ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcpha_t *tcpha,
    ip_stack_t *ipst)
{
	uint32_t	ports;
	uint16_t	*pports = (uint16_t *)&ports;
	connf_t		*connfp;
	conn_t		*tconnp;
	boolean_t	zone_chk;

	/*
	 * If either the source or destination address is loopback, then
	 * both endpoints must be in the same Zone. Otherwise, both of
	 * the addresses are system-wide unique (tcp is in ESTABLISHED
	 * state) and the endpoints may reside in different Zones. We
	 * don't do Zone check for link local address(es) because the
	 * current Zone implementation treats each link local address as
	 * being unique per system node, i.e. they belong to global Zone.
	 */
	zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) ||
	    IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst));

	/* Reversed port pair: peer's foreign port is our local port. */
	pports[0] = tcpha->tha_fport;
	pports[1] = tcpha->tha_lport;

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		/* We skip conn_bound_if check here as this is loopback tcp */
		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
		    tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
		    (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {

			/* Return with a reference; caller does CONN_DEC_REF */
			ASSERT(tconnp != connp);
			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}
2758
2759 /*
2760 * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2761 * Returns with conn reference held. Caller must call CONN_DEC_REF.
2762 * Only checks for connected entries i.e. no INADDR_ANY checks.
2763 */
2764 conn_t *
2765 ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcpha_t *tcpha, int min_state,
2766 ip_stack_t *ipst)
2767 {
2768 uint32_t ports;
2769 uint16_t *pports;
2770 connf_t *connfp;
2771 conn_t *tconnp;
2772
2773 pports = (uint16_t *)&ports;
2774 pports[0] = tcpha->tha_fport;
2775 pports[1] = tcpha->tha_lport;
2776
2777 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2778 ports, ipst)];
2779
2780 mutex_enter(&connfp->connf_lock);
2781 for (tconnp = connfp->connf_head; tconnp != NULL;
2782 tconnp = tconnp->conn_next) {
2783
2784 if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2785 ipha->ipha_dst, ipha->ipha_src, ports) &&
2786 tconnp->conn_tcp->tcp_state >= min_state) {
2787
2788 CONN_INC_REF(tconnp);
2789 mutex_exit(&connfp->connf_lock);
2790 return (tconnp);
2791 }
2792 }
2793 mutex_exit(&connfp->connf_lock);
2794 return (NULL);
2795 }
2796
2797 /*
2798 * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2799 * Returns with conn reference held. Caller must call CONN_DEC_REF.
2800 * Only checks for connected entries i.e. no INADDR_ANY checks.
2801 * Match on ifindex in addition to addresses.
2802 */
conn_t *
ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state,
    uint_t ifindex, ip_stack_t *ipst)
{
	tcp_t		*tcp;
	uint32_t	ports;
	uint16_t	*pports;
	connf_t		*connfp;
	conn_t		*tconnp;

	/* Build the reversed port pair. */
	pports = (uint16_t *)&ports;
	pports[0] = tcpha->tha_fport;
	pports[1] = tcpha->tha_lport;

	connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
	    ports, ipst)];

	mutex_enter(&connfp->connf_lock);
	for (tconnp = connfp->connf_head; tconnp != NULL;
	    tconnp = tconnp->conn_next) {

		tcp = tconnp->conn_tcp;
		/*
		 * Match the reversed quadruplet, the minimum TCP state,
		 * and the bound interface (0 means not bound to one).
		 */
		if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
		    ip6h->ip6_dst, ip6h->ip6_src, ports) &&
		    tcp->tcp_state >= min_state &&
		    (tconnp->conn_bound_if == 0 ||
		    tconnp->conn_bound_if == ifindex)) {

			/* Return with a reference; caller does CONN_DEC_REF */
			CONN_INC_REF(tconnp);
			mutex_exit(&connfp->connf_lock);
			return (tconnp);
		}
	}
	mutex_exit(&connfp->connf_lock);
	return (NULL);
}
2839
2840 /*
2841 * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate
2842 * a listener when changing state.
2843 */
2844 conn_t *
2845 ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid,
2846 ip_stack_t *ipst)
2847 {
2848 connf_t *bind_connfp;
2849 conn_t *connp;
2850 tcp_t *tcp;
2851
2852 /*
2853 * Avoid false matches for packets sent to an IP destination of
2854 * all zeros.
2855 */
2856 if (laddr == 0)
2857 return (NULL);
2858
2859 ASSERT(zoneid != ALL_ZONES);
2860
2861 bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2862 mutex_enter(&bind_connfp->connf_lock);
2863 for (connp = bind_connfp->connf_head; connp != NULL;
2864 connp = connp->conn_next) {
2865 tcp = connp->conn_tcp;
2866 if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) &&
2867 IPCL_ZONE_MATCH(connp, zoneid) &&
2868 (tcp->tcp_listener == NULL)) {
2869 CONN_INC_REF(connp);
2870 mutex_exit(&bind_connfp->connf_lock);
2871 return (connp);
2872 }
2873 }
2874 mutex_exit(&bind_connfp->connf_lock);
2875 return (NULL);
2876 }
2877
2878 /*
2879 * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate
2880 * a listener when changing state.
2881 */
conn_t *
ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex,
    zoneid_t zoneid, ip_stack_t *ipst)
{
	connf_t	*bind_connfp;
	conn_t	*connp = NULL;
	tcp_t	*tcp;

	/*
	 * Avoid false matches for packets sent to an IP destination of
	 * all zeros.
	 */
	if (IN6_IS_ADDR_UNSPECIFIED(laddr))
		return (NULL);

	ASSERT(zoneid != ALL_ZONES);

	bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
	mutex_enter(&bind_connfp->connf_lock);
	for (connp = bind_connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		tcp = connp->conn_tcp;
		/*
		 * Match address/port and zone; honor a bound interface
		 * (0 means unbound).  tcp_listener == NULL excludes
		 * eclipsed conns, leaving the actual listener.
		 */
		if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) &&
		    IPCL_ZONE_MATCH(connp, zoneid) &&
		    (connp->conn_bound_if == 0 ||
		    connp->conn_bound_if == ifindex) &&
		    tcp->tcp_listener == NULL) {
			/* Return with a reference; caller does CONN_DEC_REF */
			CONN_INC_REF(connp);
			mutex_exit(&bind_connfp->connf_lock);
			return (connp);
		}
	}
	mutex_exit(&bind_connfp->connf_lock);
	return (NULL);
}
2917
2918 /*
2919 * ipcl_get_next_conn
2920 * get the next entry in the conn global list
2921 * and put a reference on the next_conn.
2922 * decrement the reference on the current conn.
2923 *
2924 * This is an iterator based walker function that also provides for
2925 * some selection by the caller. It walks through the conn_hash bucket
2926 * searching for the next valid connp in the list, and selects connections
2927 * that are neither closed nor condemned. It also REFHOLDS the conn
2928 * thus ensuring that the conn exists when the caller uses the conn.
2929 */
conn_t *
ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags)
{
	conn_t	*next_connp;

	if (connfp == NULL)
		return (NULL);

	mutex_enter(&connfp->connf_lock);

	/* Start at the bucket head, or just past the current position. */
	next_connp = (connp == NULL) ?
	    connfp->connf_head : connp->conn_g_next;

	while (next_connp != NULL) {
		mutex_enter(&next_connp->conn_lock);
		if (!(next_connp->conn_flags & conn_flags) ||
		    (next_connp->conn_state_flags &
		    (CONN_CONDEMNED | CONN_INCIPIENT))) {
			/*
			 * This conn has been condemned or
			 * is closing, or the flags don't match
			 */
			mutex_exit(&next_connp->conn_lock);
			next_connp = next_connp->conn_g_next;
			continue;
		}
		/* Hold the conn so it remains valid for the caller. */
		CONN_INC_REF_LOCKED(next_connp);
		mutex_exit(&next_connp->conn_lock);
		break;
	}

	mutex_exit(&connfp->connf_lock);

	/* Drop the reference taken for the previously returned conn. */
	if (connp != NULL)
		CONN_DEC_REF(connp);

	return (next_connp);
}
2968
2969 #ifdef CONN_DEBUG
2970 /*
2971 * Trace of the last NBUF refhold/refrele
2972 */
/*
 * Record the caller's stack in the conn's circular trace buffer;
 * presumably invoked from the refhold path when CONN_DEBUG is set (see
 * the comment above).  Returns 1 so it can be embedded in the refhold
 * macros.  conn_lock must be held.
 */
int
conn_trace_ref(conn_t *connp)
{
	int	last;
	conn_trace_t	*ctb;

	ASSERT(MUTEX_HELD(&connp->conn_lock));
	/* Advance the circular buffer index, wrapping at CONN_TRACE_MAX. */
	last = connp->conn_trace_last;
	last++;
	if (last == CONN_TRACE_MAX)
		last = 0;

	ctb = &connp->conn_trace_buf[last];
	ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
	connp->conn_trace_last = last;
	return (1);
}
2990
/*
 * Record the caller's stack in the conn's circular trace buffer on the
 * refrele side; intentionally identical to conn_trace_ref so both ends
 * of a hold/release pair leave a stack entry.  Returns 1 for use in the
 * refrele macros.  conn_lock must be held.
 */
int
conn_untrace_ref(conn_t *connp)
{
	int	last;
	conn_trace_t	*ctb;

	ASSERT(MUTEX_HELD(&connp->conn_lock));
	/* Advance the circular buffer index, wrapping at CONN_TRACE_MAX. */
	last = connp->conn_trace_last;
	last++;
	if (last == CONN_TRACE_MAX)
		last = 0;

	ctb = &connp->conn_trace_buf[last];
	ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
	connp->conn_trace_last = last;
	return (1);
}
3008 #endif