Print this page
XXXX adding PID information to netstat output
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/common/inet/ip/ipclassifier.c
+++ new/usr/src/uts/common/inet/ip/ipclassifier.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 */
24 24
25 25 /*
26 26 * IP PACKET CLASSIFIER
27 27 *
28 28 * The IP packet classifier provides mapping between IP packets and persistent
29 29 * connection state for connection-oriented protocols. It also provides
30 30 * interface for managing connection states.
31 31 *
32 32 * The connection state is kept in conn_t data structure and contains, among
33 33 * other things:
34 34 *
35 35 * o local/remote address and ports
36 36 * o Transport protocol
37 37 * o squeue for the connection (for TCP only)
38 38 * o reference counter
39 39 * o Connection state
40 40 * o hash table linkage
41 41 * o interface/ire information
42 42 * o credentials
43 43 * o ipsec policy
44 44 * o send and receive functions.
45 45 * o mutex lock.
46 46 *
47 47 * Connections use a reference counting scheme. They are freed when the
48 48 * reference counter drops to zero. A reference is incremented when connection
49 49 * is placed in a list or table, when incoming packet for the connection arrives
50 50 * and when connection is processed via squeue (squeue processing may be
51 51 * asynchronous and the reference protects the connection from being destroyed
52 52 * before its processing is finished).
53 53 *
54 54 * conn_recv is used to pass up packets to the ULP.
55 55 * For TCP conn_recv changes. It is tcp_input_listener_unbound initially for
56 56 * a listener, and changes to tcp_input_listener as the listener has picked a
57 57 * good squeue. For other cases it is set to tcp_input_data.
58 58 *
59 59 * conn_recvicmp is used to pass up ICMP errors to the ULP.
60 60 *
61 61 * Classifier uses several hash tables:
62 62 *
63 63 * ipcl_conn_fanout: contains all TCP connections in CONNECTED state
64 64 * ipcl_bind_fanout: contains all connections in BOUND state
65 65 * ipcl_proto_fanout: IPv4 protocol fanout
66 66 * ipcl_proto_fanout_v6: IPv6 protocol fanout
67 67 * ipcl_udp_fanout: contains all UDP connections
68 68 * ipcl_iptun_fanout: contains all IP tunnel connections
69 69 * ipcl_globalhash_fanout: contains all connections
70 70 *
71 71 * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering)
72 72 * which need to view all existing connections.
73 73 *
74 74 * All tables are protected by per-bucket locks. When both per-bucket lock and
75 75 * connection lock need to be held, the per-bucket lock should be acquired
76 76 * first, followed by the connection lock.
77 77 *
78 78 * All functions doing search in one of these tables increment a reference
79 79 * counter on the connection found (if any). This reference should be dropped
80 80 * when the caller has finished processing the connection.
81 81 *
82 82 *
83 83 * INTERFACES:
84 84 * ===========
85 85 *
86 86 * Connection Lookup:
87 87 * ------------------
88 88 *
89 89 * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, ira, ip_stack)
90 90 * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, ira, ip_stack)
91 91 *
92 92 * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if
93 93 * it can't find any associated connection. If the connection is found, its
94 94 * reference counter is incremented.
95 95 *
96 96 * mp: mblock, containing packet header. The full header should fit
97 97 * into a single mblock. It should also contain at least full IP
98 98 * and TCP or UDP header.
99 99 *
100 100 * protocol: Either IPPROTO_TCP or IPPROTO_UDP.
101 101 *
102 102 * hdr_len: The size of IP header. It is used to find TCP or UDP header in
103 103 * the packet.
104 104 *
105 105 * ira->ira_zoneid: The zone in which the returned connection must be; the
106 106 * zoneid corresponding to the ire_zoneid on the IRE located for
107 107 * the packet's destination address.
108 108 *
109 109 * ira->ira_flags: Contains the IRAF_TX_MAC_EXEMPTABLE and
110 110 * IRAF_TX_SHARED_ADDR flags
111 111 *
112 112 * For TCP connections, the lookup order is as follows:
113 113 * 5-tuple {src, dst, protocol, local port, remote port}
114 114 * lookup in ipcl_conn_fanout table.
115 115 * 3-tuple {dst, remote port, protocol} lookup in
116 116 * ipcl_bind_fanout table.
117 117 *
118 118 * For UDP connections, a 5-tuple {src, dst, protocol, local port,
119 119 * remote port} lookup is done on ipcl_udp_fanout. Note that,
120 120 * these interfaces do not handle cases where a packets belongs
121 121 * to multiple UDP clients, which is handled in IP itself.
122 122 *
123 123 * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must
124 124 * determine which actual zone gets the segment. This is used only in a
125 125 * labeled environment. The matching rules are:
126 126 *
127 127 * - If it's not a multilevel port, then the label on the packet selects
128 128 * the zone. Unlabeled packets are delivered to the global zone.
129 129 *
130 130 * - If it's a multilevel port, then only the zone registered to receive
131 131 * packets on that port matches.
132 132 *
133 133 * Also, in a labeled environment, packet labels need to be checked. For fully
134 134 * bound TCP connections, we can assume that the packet label was checked
135 135 * during connection establishment, and doesn't need to be checked on each
136 136 * packet. For others, though, we need to check for strict equality or, for
137 137 * multilevel ports, membership in the range or set. This part currently does
138 138 * a tnrh lookup on each packet, but could be optimized to use cached results
139 139 * if that were necessary. (SCTP doesn't come through here, but if it did,
140 140 * we would apply the same rules as TCP.)
141 141 *
142 142 * An implication of the above is that fully-bound TCP sockets must always use
143 143 * distinct 4-tuples; they can't be discriminated by label alone.
144 144 *
145 145 * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
146 146 * as there's no connection set-up handshake and no shared state.
147 147 *
148 148 * Labels on looped-back packets within a single zone do not need to be
149 149 * checked, as all processes in the same zone have the same label.
150 150 *
151 151 * Finally, for unlabeled packets received by a labeled system, special rules
152 152 * apply. We consider only the MLP if there is one. Otherwise, we prefer a
153 153 * socket in the zone whose label matches the default label of the sender, if
154 154 * any. In any event, the receiving socket must have SO_MAC_EXEMPT set and the
155 155 * receiver's label must dominate the sender's default label.
156 156 *
157 157 * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcpha_t *, int, ip_stack);
158 158 * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
159 159 * ip_stack);
160 160 *
161 161 * Lookup routine to find an exact match for {src, dst, local port,
162 162 * remote port} for TCP connections in ipcl_conn_fanout. The address and
163 163 * ports are read from the IP and TCP header respectively.
164 164 *
165 165 * conn_t *ipcl_lookup_listener_v4(lport, laddr, protocol,
166 166 * zoneid, ip_stack);
167 167 * conn_t *ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex,
168 168 * zoneid, ip_stack);
169 169 *
170 170 * Lookup routine to find a listener with the tuple {lport, laddr,
171 171 * protocol} in the ipcl_bind_fanout table. For IPv6, an additional
172 172 * parameter interface index is also compared.
173 173 *
174 174 * void ipcl_walk(func, arg, ip_stack)
175 175 *
176 176 * Apply 'func' to every connection available. The 'func' is called as
177 177 * (*func)(connp, arg). The walk is non-atomic so connections may be
178 178 * created and destroyed during the walk. The CONN_CONDEMNED and
179 179 * CONN_INCIPIENT flags ensure that connections which are newly created
180 180 * or being destroyed are not selected by the walker.
181 181 *
182 182 * Table Updates
183 183 * -------------
184 184 *
185 185 * int ipcl_conn_insert(connp);
186 186 * int ipcl_conn_insert_v4(connp);
187 187 * int ipcl_conn_insert_v6(connp);
188 188 *
189 189 * Insert 'connp' in the ipcl_conn_fanout.
190 190 * Arguments :
191 191 * connp conn_t to be inserted
192 192 *
193 193 * Return value :
194 194 * 0 if connp was inserted
195 195 * EADDRINUSE if the connection with the same tuple
196 196 * already exists.
197 197 *
198 198 * int ipcl_bind_insert(connp);
199 199 * int ipcl_bind_insert_v4(connp);
200 200 * int ipcl_bind_insert_v6(connp);
201 201 *
202 202 * Insert 'connp' in ipcl_bind_fanout.
203 203 * Arguments :
204 204 * connp conn_t to be inserted
205 205 *
206 206 *
207 207 * void ipcl_hash_remove(connp);
208 208 *
209 209 * Removes the 'connp' from the connection fanout table.
210 210 *
211 211 * Connection Creation/Destruction
212 212 * -------------------------------
213 213 *
214 214 * conn_t *ipcl_conn_create(type, sleep, netstack_t *)
215 215 *
216 216 * Creates a new conn based on the type flag, inserts it into
217 217 * globalhash table.
218 218 *
219 219 * type: This flag determines the type of conn_t which needs to be
220 220 * created i.e., which kmem_cache it comes from.
221 221 * IPCL_TCPCONN indicates a TCP connection
222 222 * IPCL_SCTPCONN indicates a SCTP connection
223 223 * IPCL_UDPCONN indicates a UDP conn_t.
224 224 * IPCL_RAWIPCONN indicates a RAWIP/ICMP conn_t.
225 225 * IPCL_RTSCONN indicates a RTS conn_t.
226 226 * IPCL_IPCCONN indicates all other connections.
227 227 *
228 228 * void ipcl_conn_destroy(connp)
229 229 *
230 230 * Destroys the connection state, removes it from the global
231 231 * connection hash table and frees its memory.
232 232 */
233 233
234 234 #include <sys/types.h>
235 235 #include <sys/stream.h>
236 236 #include <sys/stropts.h>
237 237 #include <sys/sysmacros.h>
238 238 #include <sys/strsubr.h>
239 239 #include <sys/strsun.h>
240 240 #define _SUN_TPI_VERSION 2
241 241 #include <sys/ddi.h>
242 242 #include <sys/cmn_err.h>
243 243 #include <sys/debug.h>
244 244
245 245 #include <sys/systm.h>
246 246 #include <sys/param.h>
247 247 #include <sys/kmem.h>
248 248 #include <sys/isa_defs.h>
249 249 #include <inet/common.h>
250 250 #include <netinet/ip6.h>
251 251 #include <netinet/icmp6.h>
252 252
253 253 #include <inet/ip.h>
254 254 #include <inet/ip_if.h>
255 255 #include <inet/ip_ire.h>
256 256 #include <inet/ip6.h>
257 257 #include <inet/ip_ndp.h>
258 258 #include <inet/ip_impl.h>
259 259 #include <inet/udp_impl.h>
260 260 #include <inet/sctp_ip.h>
261 261 #include <inet/sctp/sctp_impl.h>
262 262 #include <inet/rawip_impl.h>
263 263 #include <inet/rts_impl.h>
264 264 #include <inet/iptun/iptun_impl.h>
265 265
266 266 #include <sys/cpuvar.h>
267 267
268 268 #include <inet/ipclassifier.h>
269 269 #include <inet/tcp.h>
270 270 #include <inet/ipsec_impl.h>
271 271
272 272 #include <sys/tsol/tnet.h>
273 273 #include <sys/sockio.h>
274 274
275 275 /* Old value for compatibility. Setable in /etc/system */
276 276 uint_t tcp_conn_hash_size = 0;
277 277
278 278 /* New value. Zero means choose automatically. Setable in /etc/system */
279 279 uint_t ipcl_conn_hash_size = 0;
280 280 uint_t ipcl_conn_hash_memfactor = 8192;
281 281 uint_t ipcl_conn_hash_maxsize = 82500;
282 282
283 283 /* bind/udp fanout table size */
284 284 uint_t ipcl_bind_fanout_size = 512;
285 285 uint_t ipcl_udp_fanout_size = 16384;
286 286
287 287 /* Raw socket fanout size. Must be a power of 2. */
288 288 uint_t ipcl_raw_fanout_size = 256;
289 289
290 290 /*
291 291 * The IPCL_IPTUN_HASH() function works best with a prime table size. We
292 292 * expect that most large deployments would have hundreds of tunnels, and
293 293 * thousands in the extreme case.
294 294 */
295 295 uint_t ipcl_iptun_fanout_size = 6143;
296 296
297 297 /*
298 298 * Power of 2^N Primes useful for hashing for N of 0-28,
299 299 * these primes are the nearest prime <= 2^N - 2^(N-2).
300 300 */
301 301
302 302 #define P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067, \
303 303 6143, 12281, 24571, 49139, 98299, 196597, 393209, \
304 304 786431, 1572853, 3145721, 6291449, 12582893, 25165813, \
305 305 50331599, 100663291, 201326557, 0}
306 306
307 307 /*
308 308 * wrapper structure to ensure that conn and what follows it (tcp_t, etc)
309 309 * are aligned on cache lines.
310 310 */
311 311 typedef union itc_s {
312 312 conn_t itc_conn;
313 313 char itcu_filler[CACHE_ALIGN(conn_s)];
314 314 } itc_t;
315 315
316 316 struct kmem_cache *tcp_conn_cache;
317 317 struct kmem_cache *ip_conn_cache;
318 318 extern struct kmem_cache *sctp_conn_cache;
319 319 struct kmem_cache *udp_conn_cache;
320 320 struct kmem_cache *rawip_conn_cache;
321 321 struct kmem_cache *rts_conn_cache;
322 322
323 323 extern void tcp_timermp_free(tcp_t *);
324 324 extern mblk_t *tcp_timermp_alloc(int);
325 325
326 326 static int ip_conn_constructor(void *, void *, int);
327 327 static void ip_conn_destructor(void *, void *);
328 328
329 329 static int tcp_conn_constructor(void *, void *, int);
330 330 static void tcp_conn_destructor(void *, void *);
331 331
332 332 static int udp_conn_constructor(void *, void *, int);
333 333 static void udp_conn_destructor(void *, void *);
334 334
335 335 static int rawip_conn_constructor(void *, void *, int);
336 336 static void rawip_conn_destructor(void *, void *);
337 337
338 338 static int rts_conn_constructor(void *, void *, int);
339 339 static void rts_conn_destructor(void *, void *);
340 340
341 341 /*
342 342 * Global (for all stack instances) init routine
343 343 */
344 344 void
345 345 ipcl_g_init(void)
346 346 {
347 347 ip_conn_cache = kmem_cache_create("ip_conn_cache",
348 348 sizeof (conn_t), CACHE_ALIGN_SIZE,
349 349 ip_conn_constructor, ip_conn_destructor,
350 350 NULL, NULL, NULL, 0);
351 351
352 352 tcp_conn_cache = kmem_cache_create("tcp_conn_cache",
353 353 sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE,
354 354 tcp_conn_constructor, tcp_conn_destructor,
355 355 tcp_conn_reclaim, NULL, NULL, 0);
356 356
357 357 udp_conn_cache = kmem_cache_create("udp_conn_cache",
358 358 sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE,
359 359 udp_conn_constructor, udp_conn_destructor,
360 360 NULL, NULL, NULL, 0);
361 361
362 362 rawip_conn_cache = kmem_cache_create("rawip_conn_cache",
363 363 sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE,
364 364 rawip_conn_constructor, rawip_conn_destructor,
365 365 NULL, NULL, NULL, 0);
366 366
367 367 rts_conn_cache = kmem_cache_create("rts_conn_cache",
368 368 sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE,
369 369 rts_conn_constructor, rts_conn_destructor,
370 370 NULL, NULL, NULL, 0);
371 371 }
372 372
373 373 /*
374 374 * ipclassifier intialization routine, sets up hash tables.
375 375 */
376 376 void
377 377 ipcl_init(ip_stack_t *ipst)
378 378 {
379 379 int i;
380 380 int sizes[] = P2Ps();
381 381
382 382 /*
383 383 * Calculate size of conn fanout table from /etc/system settings
384 384 */
385 385 if (ipcl_conn_hash_size != 0) {
386 386 ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size;
387 387 } else if (tcp_conn_hash_size != 0) {
388 388 ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size;
389 389 } else {
390 390 extern pgcnt_t freemem;
391 391
392 392 ipst->ips_ipcl_conn_fanout_size =
393 393 (freemem * PAGESIZE) / ipcl_conn_hash_memfactor;
394 394
395 395 if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) {
396 396 ipst->ips_ipcl_conn_fanout_size =
397 397 ipcl_conn_hash_maxsize;
398 398 }
399 399 }
400 400
401 401 for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) {
402 402 if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) {
403 403 break;
404 404 }
405 405 }
406 406 if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) {
407 407 /* Out of range, use the 2^16 value */
408 408 ipst->ips_ipcl_conn_fanout_size = sizes[16];
409 409 }
410 410
411 411 /* Take values from /etc/system */
412 412 ipst->ips_ipcl_bind_fanout_size = ipcl_bind_fanout_size;
413 413 ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size;
414 414 ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size;
415 415 ipst->ips_ipcl_iptun_fanout_size = ipcl_iptun_fanout_size;
416 416
417 417 ASSERT(ipst->ips_ipcl_conn_fanout == NULL);
418 418
419 419 ipst->ips_ipcl_conn_fanout = kmem_zalloc(
420 420 ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP);
421 421
422 422 for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
423 423 mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL,
424 424 MUTEX_DEFAULT, NULL);
425 425 }
426 426
427 427 ipst->ips_ipcl_bind_fanout = kmem_zalloc(
428 428 ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP);
429 429
430 430 for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
431 431 mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL,
432 432 MUTEX_DEFAULT, NULL);
433 433 }
434 434
435 435 ipst->ips_ipcl_proto_fanout_v4 = kmem_zalloc(IPPROTO_MAX *
436 436 sizeof (connf_t), KM_SLEEP);
437 437 for (i = 0; i < IPPROTO_MAX; i++) {
438 438 mutex_init(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock, NULL,
439 439 MUTEX_DEFAULT, NULL);
440 440 }
441 441
442 442 ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX *
443 443 sizeof (connf_t), KM_SLEEP);
444 444 for (i = 0; i < IPPROTO_MAX; i++) {
445 445 mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL,
446 446 MUTEX_DEFAULT, NULL);
447 447 }
448 448
449 449 ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP);
450 450 mutex_init(&ipst->ips_rts_clients->connf_lock,
451 451 NULL, MUTEX_DEFAULT, NULL);
452 452
453 453 ipst->ips_ipcl_udp_fanout = kmem_zalloc(
454 454 ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP);
455 455 for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
456 456 mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL,
457 457 MUTEX_DEFAULT, NULL);
458 458 }
459 459
460 460 ipst->ips_ipcl_iptun_fanout = kmem_zalloc(
461 461 ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t), KM_SLEEP);
462 462 for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
463 463 mutex_init(&ipst->ips_ipcl_iptun_fanout[i].connf_lock, NULL,
464 464 MUTEX_DEFAULT, NULL);
465 465 }
466 466
467 467 ipst->ips_ipcl_raw_fanout = kmem_zalloc(
468 468 ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP);
469 469 for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
470 470 mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL,
471 471 MUTEX_DEFAULT, NULL);
472 472 }
473 473
474 474 ipst->ips_ipcl_globalhash_fanout = kmem_zalloc(
475 475 sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP);
476 476 for (i = 0; i < CONN_G_HASH_SIZE; i++) {
477 477 mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock,
478 478 NULL, MUTEX_DEFAULT, NULL);
479 479 }
480 480 }
481 481
482 482 void
483 483 ipcl_g_destroy(void)
484 484 {
485 485 kmem_cache_destroy(ip_conn_cache);
486 486 kmem_cache_destroy(tcp_conn_cache);
487 487 kmem_cache_destroy(udp_conn_cache);
488 488 kmem_cache_destroy(rawip_conn_cache);
489 489 kmem_cache_destroy(rts_conn_cache);
490 490 }
491 491
492 492 /*
493 493 * All user-level and kernel use of the stack must be gone
494 494 * by now.
495 495 */
496 496 void
497 497 ipcl_destroy(ip_stack_t *ipst)
498 498 {
499 499 int i;
500 500
501 501 for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
502 502 ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL);
503 503 mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock);
504 504 }
505 505 kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size *
506 506 sizeof (connf_t));
507 507 ipst->ips_ipcl_conn_fanout = NULL;
508 508
509 509 for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
510 510 ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL);
511 511 mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock);
512 512 }
513 513 kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size *
514 514 sizeof (connf_t));
515 515 ipst->ips_ipcl_bind_fanout = NULL;
516 516
517 517 for (i = 0; i < IPPROTO_MAX; i++) {
518 518 ASSERT(ipst->ips_ipcl_proto_fanout_v4[i].connf_head == NULL);
519 519 mutex_destroy(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock);
520 520 }
521 521 kmem_free(ipst->ips_ipcl_proto_fanout_v4,
522 522 IPPROTO_MAX * sizeof (connf_t));
523 523 ipst->ips_ipcl_proto_fanout_v4 = NULL;
524 524
525 525 for (i = 0; i < IPPROTO_MAX; i++) {
526 526 ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head == NULL);
527 527 mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock);
528 528 }
529 529 kmem_free(ipst->ips_ipcl_proto_fanout_v6,
530 530 IPPROTO_MAX * sizeof (connf_t));
531 531 ipst->ips_ipcl_proto_fanout_v6 = NULL;
532 532
533 533 for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
534 534 ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL);
535 535 mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock);
536 536 }
537 537 kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size *
538 538 sizeof (connf_t));
539 539 ipst->ips_ipcl_udp_fanout = NULL;
540 540
541 541 for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
542 542 ASSERT(ipst->ips_ipcl_iptun_fanout[i].connf_head == NULL);
543 543 mutex_destroy(&ipst->ips_ipcl_iptun_fanout[i].connf_lock);
544 544 }
545 545 kmem_free(ipst->ips_ipcl_iptun_fanout,
546 546 ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t));
547 547 ipst->ips_ipcl_iptun_fanout = NULL;
548 548
549 549 for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
550 550 ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL);
551 551 mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock);
552 552 }
553 553 kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size *
554 554 sizeof (connf_t));
555 555 ipst->ips_ipcl_raw_fanout = NULL;
556 556
557 557 for (i = 0; i < CONN_G_HASH_SIZE; i++) {
558 558 ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL);
559 559 mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
560 560 }
561 561 kmem_free(ipst->ips_ipcl_globalhash_fanout,
562 562 sizeof (connf_t) * CONN_G_HASH_SIZE);
563 563 ipst->ips_ipcl_globalhash_fanout = NULL;
564 564
565 565 ASSERT(ipst->ips_rts_clients->connf_head == NULL);
566 566 mutex_destroy(&ipst->ips_rts_clients->connf_lock);
567 567 kmem_free(ipst->ips_rts_clients, sizeof (connf_t));
568 568 ipst->ips_rts_clients = NULL;
569 569 }
570 570
571 571 /*
572 572 * conn creation routine. initialize the conn, sets the reference
573 573 * and inserts it in the global hash table.
574 574 */
575 575 conn_t *
576 576 ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns)
577 577 {
578 578 conn_t *connp;
579 579 struct kmem_cache *conn_cache;
580 580
581 581 switch (type) {
582 582 case IPCL_SCTPCONN:
583 583 if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL)
584 584 return (NULL);
585 585 sctp_conn_init(connp);
586 586 netstack_hold(ns);
587 587 connp->conn_netstack = ns;
588 588 connp->conn_ixa->ixa_ipst = ns->netstack_ip;
589 589 connp->conn_ixa->ixa_conn_id = (long)connp;
590 590 ipcl_globalhash_insert(connp);
591 591 return (connp);
592 592
593 593 case IPCL_TCPCONN:
594 594 conn_cache = tcp_conn_cache;
595 595 break;
596 596
597 597 case IPCL_UDPCONN:
598 598 conn_cache = udp_conn_cache;
599 599 break;
600 600
601 601 case IPCL_RAWIPCONN:
602 602 conn_cache = rawip_conn_cache;
603 603 break;
604 604
605 605 case IPCL_RTSCONN:
606 606 conn_cache = rts_conn_cache;
607 607 break;
608 608
609 609 case IPCL_IPCCONN:
610 610 conn_cache = ip_conn_cache;
611 611 break;
612 612
613 613 default:
614 614 connp = NULL;
615 615 ASSERT(0);
616 616 }
617 617
618 618 if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL)
619 619 return (NULL);
620 620
621 621 connp->conn_ref = 1;
622 622 netstack_hold(ns);
623 623 connp->conn_netstack = ns;
624 624 connp->conn_ixa->ixa_ipst = ns->netstack_ip;
625 625 connp->conn_ixa->ixa_conn_id = (long)connp;
626 626 ipcl_globalhash_insert(connp);
627 627 return (connp);
628 628 }
629 629
630 630 void
631 631 ipcl_conn_destroy(conn_t *connp)
632 632 {
633 633 mblk_t *mp;
634 634 netstack_t *ns = connp->conn_netstack;
635 635
636 636 ASSERT(!MUTEX_HELD(&connp->conn_lock));
637 637 ASSERT(connp->conn_ref == 0);
638 638 ASSERT(connp->conn_ioctlref == 0);
639 639
640 640 DTRACE_PROBE1(conn__destroy, conn_t *, connp);
641 641
642 642 if (connp->conn_cred != NULL) {
643 643 crfree(connp->conn_cred);
644 644 connp->conn_cred = NULL;
645 645 /* ixa_cred done in ipcl_conn_cleanup below */
646 646 }
647 647
648 648 if (connp->conn_ht_iphc != NULL) {
649 649 kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated);
650 650 connp->conn_ht_iphc = NULL;
651 651 connp->conn_ht_iphc_allocated = 0;
652 652 connp->conn_ht_iphc_len = 0;
653 653 connp->conn_ht_ulp = NULL;
654 654 connp->conn_ht_ulp_len = 0;
655 655 }
656 656 ip_pkt_free(&connp->conn_xmit_ipp);
657 657
658 658 ipcl_globalhash_remove(connp);
659 659
660 660 if (connp->conn_latch != NULL) {
661 661 IPLATCH_REFRELE(connp->conn_latch);
662 662 connp->conn_latch = NULL;
663 663 }
664 664 if (connp->conn_latch_in_policy != NULL) {
665 665 IPPOL_REFRELE(connp->conn_latch_in_policy);
666 666 connp->conn_latch_in_policy = NULL;
667 667 }
668 668 if (connp->conn_latch_in_action != NULL) {
669 669 IPACT_REFRELE(connp->conn_latch_in_action);
670 670 connp->conn_latch_in_action = NULL;
671 671 }
672 672 if (connp->conn_policy != NULL) {
673 673 IPPH_REFRELE(connp->conn_policy, ns);
674 674 connp->conn_policy = NULL;
675 675 }
676 676
677 677 if (connp->conn_ipsec_opt_mp != NULL) {
678 678 freemsg(connp->conn_ipsec_opt_mp);
679 679 connp->conn_ipsec_opt_mp = NULL;
680 680 }
681 681
682 682 if (connp->conn_flags & IPCL_TCPCONN) {
683 683 tcp_t *tcp = connp->conn_tcp;
684 684
685 685 tcp_free(tcp);
686 686 mp = tcp->tcp_timercache;
687 687
688 688 tcp->tcp_tcps = NULL;
689 689
690 690 /*
691 691 * tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate
692 692 * the mblk.
693 693 */
694 694 if (tcp->tcp_rsrv_mp != NULL) {
695 695 freeb(tcp->tcp_rsrv_mp);
696 696 tcp->tcp_rsrv_mp = NULL;
697 697 mutex_destroy(&tcp->tcp_rsrv_mp_lock);
698 698 }
699 699
700 700 ipcl_conn_cleanup(connp);
701 701 connp->conn_flags = IPCL_TCPCONN;
702 702 if (ns != NULL) {
703 703 ASSERT(tcp->tcp_tcps == NULL);
704 704 connp->conn_netstack = NULL;
705 705 connp->conn_ixa->ixa_ipst = NULL;
706 706 netstack_rele(ns);
707 707 }
708 708
709 709 bzero(tcp, sizeof (tcp_t));
710 710
711 711 tcp->tcp_timercache = mp;
712 712 tcp->tcp_connp = connp;
713 713 kmem_cache_free(tcp_conn_cache, connp);
714 714 return;
715 715 }
716 716
717 717 if (connp->conn_flags & IPCL_SCTPCONN) {
718 718 ASSERT(ns != NULL);
719 719 sctp_free(connp);
720 720 return;
721 721 }
722 722
723 723 ipcl_conn_cleanup(connp);
724 724 if (ns != NULL) {
725 725 connp->conn_netstack = NULL;
726 726 connp->conn_ixa->ixa_ipst = NULL;
727 727 netstack_rele(ns);
728 728 }
729 729
730 730 /* leave conn_priv aka conn_udp, conn_icmp, etc in place. */
731 731 if (connp->conn_flags & IPCL_UDPCONN) {
732 732 connp->conn_flags = IPCL_UDPCONN;
733 733 kmem_cache_free(udp_conn_cache, connp);
734 734 } else if (connp->conn_flags & IPCL_RAWIPCONN) {
735 735 connp->conn_flags = IPCL_RAWIPCONN;
736 736 connp->conn_proto = IPPROTO_ICMP;
737 737 connp->conn_ixa->ixa_protocol = connp->conn_proto;
738 738 kmem_cache_free(rawip_conn_cache, connp);
739 739 } else if (connp->conn_flags & IPCL_RTSCONN) {
740 740 connp->conn_flags = IPCL_RTSCONN;
741 741 kmem_cache_free(rts_conn_cache, connp);
742 742 } else {
743 743 connp->conn_flags = IPCL_IPCCONN;
744 744 ASSERT(connp->conn_flags & IPCL_IPCCONN);
745 745 ASSERT(connp->conn_priv == NULL);
746 746 kmem_cache_free(ip_conn_cache, connp);
747 747 }
748 748 }
749 749
750 750 /*
751 751 * Running in cluster mode - deregister listener information
752 752 */
753 753 static void
754 754 ipcl_conn_unlisten(conn_t *connp)
755 755 {
756 756 ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0);
757 757 ASSERT(connp->conn_lport != 0);
758 758
759 759 if (cl_inet_unlisten != NULL) {
760 760 sa_family_t addr_family;
761 761 uint8_t *laddrp;
762 762
763 763 if (connp->conn_ipversion == IPV6_VERSION) {
764 764 addr_family = AF_INET6;
765 765 laddrp = (uint8_t *)&connp->conn_bound_addr_v6;
766 766 } else {
767 767 addr_family = AF_INET;
768 768 laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
769 769 }
770 770 (*cl_inet_unlisten)(connp->conn_netstack->netstack_stackid,
771 771 IPPROTO_TCP, addr_family, laddrp, connp->conn_lport, NULL);
772 772 }
773 773 connp->conn_flags &= ~IPCL_CL_LISTENER;
774 774 }
775 775
776 776 /*
777 777 * We set the IPCL_REMOVED flag (instead of clearing the flag indicating
778 778 * which table the conn belonged to). So for debugging we can see which hash
779 779 * table this connection was in.
780 780 */
781 781 #define IPCL_HASH_REMOVE(connp) { \
782 782 connf_t *connfp = (connp)->conn_fanout; \
783 783 ASSERT(!MUTEX_HELD(&((connp)->conn_lock))); \
784 784 if (connfp != NULL) { \
785 785 mutex_enter(&connfp->connf_lock); \
786 786 if ((connp)->conn_next != NULL) \
787 787 (connp)->conn_next->conn_prev = \
788 788 (connp)->conn_prev; \
789 789 if ((connp)->conn_prev != NULL) \
790 790 (connp)->conn_prev->conn_next = \
791 791 (connp)->conn_next; \
792 792 else \
793 793 connfp->connf_head = (connp)->conn_next; \
794 794 (connp)->conn_fanout = NULL; \
795 795 (connp)->conn_next = NULL; \
796 796 (connp)->conn_prev = NULL; \
797 797 (connp)->conn_flags |= IPCL_REMOVED; \
798 798 if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0) \
799 799 ipcl_conn_unlisten((connp)); \
800 800 CONN_DEC_REF((connp)); \
801 801 mutex_exit(&connfp->connf_lock); \
802 802 } \
803 803 }
804 804
805 805 void
806 806 ipcl_hash_remove(conn_t *connp)
807 807 {
808 808 uint8_t protocol = connp->conn_proto;
809 809
810 810 IPCL_HASH_REMOVE(connp);
811 811 if (protocol == IPPROTO_RSVP)
812 812 ill_set_inputfn_all(connp->conn_netstack->netstack_ip);
813 813 }
814 814
815 815 /*
816 816 * The whole purpose of this function is allow removal of
817 817 * a conn_t from the connected hash for timewait reclaim.
818 818 * This is essentially a TW reclaim fastpath where timewait
819 819 * collector checks under fanout lock (so no one else can
820 820 * get access to the conn_t) that refcnt is 2 i.e. one for
821 821 * TCP and one for the classifier hash list. If ref count
822 822 * is indeed 2, we can just remove the conn under lock and
823 823 * avoid cleaning up the conn under squeue. This gives us
824 824 * improved performance.
825 825 */
826 826 void
827 827 ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp)
828 828 {
829 829 ASSERT(MUTEX_HELD(&connfp->connf_lock));
830 830 ASSERT(MUTEX_HELD(&connp->conn_lock));
831 831 ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0);
832 832
833 833 if ((connp)->conn_next != NULL) {
834 834 (connp)->conn_next->conn_prev = (connp)->conn_prev;
835 835 }
836 836 if ((connp)->conn_prev != NULL) {
837 837 (connp)->conn_prev->conn_next = (connp)->conn_next;
838 838 } else {
839 839 connfp->connf_head = (connp)->conn_next;
840 840 }
841 841 (connp)->conn_fanout = NULL;
842 842 (connp)->conn_next = NULL;
843 843 (connp)->conn_prev = NULL;
844 844 (connp)->conn_flags |= IPCL_REMOVED;
845 845 ASSERT((connp)->conn_ref == 2);
846 846 (connp)->conn_ref--;
847 847 }
848 848
/*
 * Insert (connp) at the head of (connfp)'s list.  The caller must hold
 * connf_lock, and the conn must not currently be on any fanout list
 * (asserted).  Marks the conn IPCL_CONNECTED (clearing any stale
 * IPCL_REMOVED) and takes a reference on behalf of the hash list.
 */
#define	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) {		\
	ASSERT((connp)->conn_fanout == NULL);				\
	ASSERT((connp)->conn_next == NULL);				\
	ASSERT((connp)->conn_prev == NULL);				\
	if ((connfp)->connf_head != NULL) {				\
		(connfp)->connf_head->conn_prev = (connp);		\
		(connp)->conn_next = (connfp)->connf_head;		\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connfp)->connf_head = (connp);					\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_CONNECTED;						\
	CONN_INC_REF(connp);						\
}
863 863
/*
 * Unlocked form: first remove (connp) from whatever fanout it may be on,
 * then insert it at the head of (connfp) under connf_lock.
 */
#define	IPCL_HASH_INSERT_CONNECTED(connfp, connp) {			\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);		\
	mutex_exit(&(connfp)->connf_lock);				\
}
870 870
/*
 * Insert a conn bound to a specific local address.  The conn is placed
 * just in front of the first wildcard (INADDR_ANY local address) entry,
 * so that classification walking the list head-first finds specific
 * binds before wildcard ones.  Marks the conn IPCL_BOUND and takes a
 * reference on behalf of the hash list.
 */
#define	IPCL_HASH_INSERT_BOUND(connfp, connp) {				\
	conn_t *pconnp = NULL, *nconnp;					\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	nconnp = (connfp)->connf_head;					\
	while (nconnp != NULL &&					\
	    !_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6)) {		\
		pconnp = nconnp;					\
		nconnp = nconnp->conn_next;				\
	}								\
	if (pconnp != NULL) {						\
		pconnp->conn_next = (connp);				\
		(connp)->conn_prev = pconnp;				\
	} else {							\
		(connfp)->connf_head = (connp);				\
	}								\
	if (nconnp != NULL) {						\
		(connp)->conn_next = nconnp;				\
		nconnp->conn_prev = (connp);				\
	}								\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF(connp);						\
	mutex_exit(&(connfp)->connf_lock);				\
}
897 897
/*
 * Insert a wildcard-bound conn.  A V4-mapped wildcard is inserted in
 * front of the first IPv6-unspecified entry of the same zone (the
 * v4-mapped bind is the more specific of the two); otherwise the conn
 * is appended at the tail.  Marks the conn IPCL_BOUND and takes a
 * reference on behalf of the hash list.
 *
 * NOTE(review): the `prev = next->conn_prev' assignment below looks
 * redundant — by the list invariant prev should already equal
 * next->conn_prev at that point — but it is preserved as-is; confirm
 * before changing.
 */
#define	IPCL_HASH_INSERT_WILDCARD(connfp, connp) {			\
	conn_t **list, *prev, *next;					\
	boolean_t isv4mapped =						\
	    IN6_IS_ADDR_V4MAPPED(&(connp)->conn_laddr_v6);		\
	IPCL_HASH_REMOVE((connp));					\
	mutex_enter(&(connfp)->connf_lock);				\
	list = &(connfp)->connf_head;					\
	prev = NULL;							\
	while ((next = *list) != NULL) {				\
		if (isv4mapped &&					\
		    IN6_IS_ADDR_UNSPECIFIED(&next->conn_laddr_v6) &&	\
		    connp->conn_zoneid == next->conn_zoneid) {		\
			(connp)->conn_next = next;			\
			if (prev != NULL)				\
				prev = next->conn_prev;			\
			next->conn_prev = (connp);			\
			break;						\
		}							\
		list = &next->conn_next;				\
		prev = next;						\
	}								\
	(connp)->conn_prev = prev;					\
	*list = (connp);						\
	(connp)->conn_fanout = (connfp);				\
	(connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |	\
	    IPCL_BOUND;							\
	CONN_INC_REF((connp));						\
	mutex_exit(&(connfp)->connf_lock);				\
}
927 927
/*
 * Function form of IPCL_HASH_INSERT_WILDCARD — presumably provided so
 * code outside this file can use the wildcard insertion; the macro is
 * only visible here.
 */
void
ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
{
	IPCL_HASH_INSERT_WILDCARD(connfp, connp);
}
933 933
934 934 /*
935 935 * Because the classifier is used to classify inbound packets, the destination
936 936 * address is meant to be our local tunnel address (tunnel source), and the
937 937 * source the remote tunnel address (tunnel destination).
938 938 *
939 939 * Note that conn_proto can't be used for fanout since the upper protocol
940 940 * can be both 41 and 4 when IPv6 and IPv4 are over the same tunnel.
941 941 */
942 942 conn_t *
943 943 ipcl_iptun_classify_v4(ipaddr_t *src, ipaddr_t *dst, ip_stack_t *ipst)
944 944 {
945 945 connf_t *connfp;
946 946 conn_t *connp;
947 947
948 948 /* first look for IPv4 tunnel links */
949 949 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst, *src)];
950 950 mutex_enter(&connfp->connf_lock);
951 951 for (connp = connfp->connf_head; connp != NULL;
952 952 connp = connp->conn_next) {
953 953 if (IPCL_IPTUN_MATCH(connp, *dst, *src))
954 954 break;
955 955 }
956 956 if (connp != NULL)
957 957 goto done;
958 958
959 959 mutex_exit(&connfp->connf_lock);
960 960
961 961 /* We didn't find an IPv4 tunnel, try a 6to4 tunnel */
962 962 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst,
963 963 INADDR_ANY)];
964 964 mutex_enter(&connfp->connf_lock);
965 965 for (connp = connfp->connf_head; connp != NULL;
966 966 connp = connp->conn_next) {
967 967 if (IPCL_IPTUN_MATCH(connp, *dst, INADDR_ANY))
968 968 break;
969 969 }
970 970 done:
971 971 if (connp != NULL)
972 972 CONN_INC_REF(connp);
973 973 mutex_exit(&connfp->connf_lock);
974 974 return (connp);
975 975 }
976 976
977 977 conn_t *
978 978 ipcl_iptun_classify_v6(in6_addr_t *src, in6_addr_t *dst, ip_stack_t *ipst)
979 979 {
980 980 connf_t *connfp;
981 981 conn_t *connp;
982 982
983 983 /* Look for an IPv6 tunnel link */
984 984 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(dst, src)];
985 985 mutex_enter(&connfp->connf_lock);
986 986 for (connp = connfp->connf_head; connp != NULL;
987 987 connp = connp->conn_next) {
988 988 if (IPCL_IPTUN_MATCH_V6(connp, dst, src)) {
989 989 CONN_INC_REF(connp);
990 990 break;
991 991 }
992 992 }
993 993 mutex_exit(&connfp->connf_lock);
994 994 return (connp);
995 995 }
996 996
997 997 /*
998 998 * This function is used only for inserting SCTP raw socket now.
999 999 * This may change later.
1000 1000 *
1001 1001 * Note that only one raw socket can be bound to a port. The param
1002 1002 * lport is in network byte order.
1003 1003 */
1004 1004 static int
1005 1005 ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
1006 1006 {
1007 1007 connf_t *connfp;
1008 1008 conn_t *oconnp;
1009 1009 ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
1010 1010
1011 1011 connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
1012 1012
1013 1013 /* Check for existing raw socket already bound to the port. */
1014 1014 mutex_enter(&connfp->connf_lock);
1015 1015 for (oconnp = connfp->connf_head; oconnp != NULL;
1016 1016 oconnp = oconnp->conn_next) {
1017 1017 if (oconnp->conn_lport == lport &&
1018 1018 oconnp->conn_zoneid == connp->conn_zoneid &&
1019 1019 oconnp->conn_family == connp->conn_family &&
1020 1020 ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
1021 1021 IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_laddr_v6) ||
1022 1022 IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6) ||
1023 1023 IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_laddr_v6)) ||
1024 1024 IN6_ARE_ADDR_EQUAL(&oconnp->conn_laddr_v6,
1025 1025 &connp->conn_laddr_v6))) {
1026 1026 break;
1027 1027 }
1028 1028 }
1029 1029 mutex_exit(&connfp->connf_lock);
1030 1030 if (oconnp != NULL)
1031 1031 return (EADDRNOTAVAIL);
1032 1032
1033 1033 if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) ||
1034 1034 IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1035 1035 if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
1036 1036 IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) {
1037 1037 IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1038 1038 } else {
1039 1039 IPCL_HASH_INSERT_BOUND(connfp, connp);
1040 1040 }
1041 1041 } else {
1042 1042 IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1043 1043 }
1044 1044 return (0);
1045 1045 }
1046 1046
1047 1047 static int
1048 1048 ipcl_iptun_hash_insert(conn_t *connp, ip_stack_t *ipst)
1049 1049 {
1050 1050 connf_t *connfp;
1051 1051 conn_t *tconnp;
1052 1052 ipaddr_t laddr = connp->conn_laddr_v4;
1053 1053 ipaddr_t faddr = connp->conn_faddr_v4;
1054 1054
1055 1055 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(laddr, faddr)];
1056 1056 mutex_enter(&connfp->connf_lock);
1057 1057 for (tconnp = connfp->connf_head; tconnp != NULL;
1058 1058 tconnp = tconnp->conn_next) {
1059 1059 if (IPCL_IPTUN_MATCH(tconnp, laddr, faddr)) {
1060 1060 /* A tunnel is already bound to these addresses. */
1061 1061 mutex_exit(&connfp->connf_lock);
1062 1062 return (EADDRINUSE);
1063 1063 }
1064 1064 }
1065 1065 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1066 1066 mutex_exit(&connfp->connf_lock);
1067 1067 return (0);
1068 1068 }
1069 1069
1070 1070 static int
1071 1071 ipcl_iptun_hash_insert_v6(conn_t *connp, ip_stack_t *ipst)
1072 1072 {
1073 1073 connf_t *connfp;
1074 1074 conn_t *tconnp;
1075 1075 in6_addr_t *laddr = &connp->conn_laddr_v6;
1076 1076 in6_addr_t *faddr = &connp->conn_faddr_v6;
1077 1077
1078 1078 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(laddr, faddr)];
1079 1079 mutex_enter(&connfp->connf_lock);
1080 1080 for (tconnp = connfp->connf_head; tconnp != NULL;
1081 1081 tconnp = tconnp->conn_next) {
1082 1082 if (IPCL_IPTUN_MATCH_V6(tconnp, laddr, faddr)) {
1083 1083 /* A tunnel is already bound to these addresses. */
1084 1084 mutex_exit(&connfp->connf_lock);
1085 1085 return (EADDRINUSE);
1086 1086 }
1087 1087 }
1088 1088 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1089 1089 mutex_exit(&connfp->connf_lock);
1090 1090 return (0);
1091 1091 }
1092 1092
1093 1093 /*
1094 1094 * Check for a MAC exemption conflict on a labeled system. Note that for
1095 1095 * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the
1096 1096 * transport layer. This check is for binding all other protocols.
1097 1097 *
1098 1098 * Returns true if there's a conflict.
1099 1099 */
1100 1100 static boolean_t
1101 1101 check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst)
1102 1102 {
1103 1103 connf_t *connfp;
1104 1104 conn_t *tconn;
1105 1105
1106 1106 connfp = &ipst->ips_ipcl_proto_fanout_v4[connp->conn_proto];
1107 1107 mutex_enter(&connfp->connf_lock);
1108 1108 for (tconn = connfp->connf_head; tconn != NULL;
1109 1109 tconn = tconn->conn_next) {
1110 1110 /* We don't allow v4 fallback for v6 raw socket */
1111 1111 if (connp->conn_family != tconn->conn_family)
1112 1112 continue;
1113 1113 /* If neither is exempt, then there's no conflict */
1114 1114 if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
1115 1115 (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
1116 1116 continue;
1117 1117 /* We are only concerned about sockets for a different zone */
1118 1118 if (connp->conn_zoneid == tconn->conn_zoneid)
1119 1119 continue;
1120 1120 /* If both are bound to different specific addrs, ok */
1121 1121 if (connp->conn_laddr_v4 != INADDR_ANY &&
1122 1122 tconn->conn_laddr_v4 != INADDR_ANY &&
1123 1123 connp->conn_laddr_v4 != tconn->conn_laddr_v4)
1124 1124 continue;
1125 1125 /* These two conflict; fail */
1126 1126 break;
1127 1127 }
1128 1128 mutex_exit(&connfp->connf_lock);
1129 1129 return (tconn != NULL);
1130 1130 }
1131 1131
1132 1132 static boolean_t
1133 1133 check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst)
1134 1134 {
1135 1135 connf_t *connfp;
1136 1136 conn_t *tconn;
1137 1137
1138 1138 connfp = &ipst->ips_ipcl_proto_fanout_v6[connp->conn_proto];
1139 1139 mutex_enter(&connfp->connf_lock);
1140 1140 for (tconn = connfp->connf_head; tconn != NULL;
1141 1141 tconn = tconn->conn_next) {
1142 1142 /* We don't allow v4 fallback for v6 raw socket */
1143 1143 if (connp->conn_family != tconn->conn_family)
1144 1144 continue;
1145 1145 /* If neither is exempt, then there's no conflict */
1146 1146 if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
1147 1147 (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
1148 1148 continue;
1149 1149 /* We are only concerned about sockets for a different zone */
1150 1150 if (connp->conn_zoneid == tconn->conn_zoneid)
1151 1151 continue;
1152 1152 /* If both are bound to different addrs, ok */
1153 1153 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) &&
1154 1154 !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_laddr_v6) &&
1155 1155 !IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6,
1156 1156 &tconn->conn_laddr_v6))
1157 1157 continue;
1158 1158 /* These two conflict; fail */
1159 1159 break;
1160 1160 }
1161 1161 mutex_exit(&connfp->connf_lock);
1162 1162 return (tconn != NULL);
1163 1163 }
1164 1164
1165 1165 /*
1166 1166 * (v4, v6) bind hash insertion routines
1167 1167 * The caller has already setup the conn (conn_proto, conn_laddr_v6, conn_lport)
1168 1168 */
1169 1169
1170 1170 int
1171 1171 ipcl_bind_insert(conn_t *connp)
1172 1172 {
1173 1173 if (connp->conn_ipversion == IPV6_VERSION)
1174 1174 return (ipcl_bind_insert_v6(connp));
1175 1175 else
1176 1176 return (ipcl_bind_insert_v4(connp));
1177 1177 }
1178 1178
/*
 * Insert an IPv4-bound conn on the appropriate fanout:
 *  - tunnels go through ipcl_iptun_hash_insert() (connected insert),
 *  - TCP goes on the bind fanout (bound or wildcard, depending on the
 *    local address) with cluster-listener notification,
 *  - SCTP raw sockets go through ipcl_sctp_hash_insert(),
 *  - UDP and all other protocols go on the udp/proto fanouts as
 *    connected, bound or wildcard based on which addresses are set.
 * Returns 0 or an errno (e.g. EADDRINUSE on a labeled-system MAC
 * exemption conflict).
 */
int
ipcl_bind_insert_v4(conn_t *connp)
{
	connf_t	*connfp;
	int	ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t	protocol = connp->conn_proto;

	if (IPCL_IS_IPTUN(connp))
		return (ipcl_iptun_hash_insert(connp, ipst));

	switch (protocol) {
	default:
		/* Portless protocols check MAC exemption conflicts here. */
		if (is_system_labeled() &&
		    check_exempt_conflict_v4(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		/*
		 * Shared insert path for UDP and all "other" protocols;
		 * only the fanout table differs.
		 */
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
		}

		if (connp->conn_faddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (connp->conn_laddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		/* RSVP presence changes the ill input functions. */
		if (protocol == IPPROTO_RSVP)
			ill_set_inputfn_all(ipst);
		break;

	case IPPROTO_TCP:
		/* Insert it in the Bind Hash */
		ASSERT(connp->conn_zoneid != ALL_ZONES);
		connfp = &ipst->ips_ipcl_bind_fanout[
		    IPCL_BIND_HASH(lport, ipst)];
		if (connp->conn_laddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		/* Notify the cluster listener hook, if registered. */
		if (cl_inet_listen != NULL) {
			ASSERT(connp->conn_ipversion == IPV4_VERSION);
			connp->conn_flags |= IPCL_CL_LISTENER;
			(*cl_inet_listen)(
			    connp->conn_netstack->netstack_stackid,
			    IPPROTO_TCP, AF_INET,
			    (uint8_t *)&connp->conn_bound_addr_v4, lport, NULL);
		}
		break;

	case IPPROTO_SCTP:
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;
	}

	return (ret);
}
1243 1243
/*
 * IPv6 counterpart of ipcl_bind_insert_v4().  Same structure: tunnels,
 * then a switch on protocol with the default case falling through into
 * the UDP path.  For TCP, the cluster listener hook is called with the
 * address family matching the conn's actual IP version.
 * Returns 0 or an errno.
 */
int
ipcl_bind_insert_v6(conn_t *connp)
{
	connf_t	*connfp;
	int	ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t	protocol = connp->conn_proto;

	if (IPCL_IS_IPTUN(connp)) {
		return (ipcl_iptun_hash_insert_v6(connp, ipst));
	}

	switch (protocol) {
	default:
		/* Portless protocols check MAC exemption conflicts here. */
		if (is_system_labeled() &&
		    check_exempt_conflict_v6(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		/*
		 * Shared insert path for UDP and all "other" protocols;
		 * only the fanout table differs.
		 */
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
		}

		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;

	case IPPROTO_TCP:
		/* Insert it in the Bind Hash */
		ASSERT(connp->conn_zoneid != ALL_ZONES);
		connfp = &ipst->ips_ipcl_bind_fanout[
		    IPCL_BIND_HASH(lport, ipst)];
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		/* Notify the cluster listener hook, if registered. */
		if (cl_inet_listen != NULL) {
			sa_family_t	addr_family;
			uint8_t		*laddrp;

			if (connp->conn_ipversion == IPV6_VERSION) {
				addr_family = AF_INET6;
				laddrp =
				    (uint8_t *)&connp->conn_bound_addr_v6;
			} else {
				addr_family = AF_INET;
				laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
			}
			connp->conn_flags |= IPCL_CL_LISTENER;
			(*cl_inet_listen)(
			    connp->conn_netstack->netstack_stackid,
			    IPPROTO_TCP, addr_family, laddrp, lport, NULL);
		}
		break;

	case IPPROTO_SCTP:
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;
	}

	return (ret);
}
1316 1316
1317 1317 /*
1318 1318 * ipcl_conn_hash insertion routines.
1319 1319 * The caller has already set conn_proto and the addresses/ports in the conn_t.
1320 1320 */
1321 1321
1322 1322 int
1323 1323 ipcl_conn_insert(conn_t *connp)
1324 1324 {
1325 1325 if (connp->conn_ipversion == IPV6_VERSION)
1326 1326 return (ipcl_conn_insert_v6(connp));
1327 1327 else
1328 1328 return (ipcl_conn_insert_v4(connp));
1329 1329 }
1330 1330
/*
 * Insert a connected (4-tuple) IPv4 conn on the appropriate fanout.
 * For TCP the full tuple must be unique (per zone) on the conn fanout;
 * SCTP is rehashed through ipcl_sctp_hash_insert(); everything else
 * goes on the udp/proto fanouts.  Returns 0 or an errno.
 */
int
ipcl_conn_insert_v4(conn_t *connp)
{
	connf_t	*connfp;
	conn_t	*tconnp;
	int	ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t	protocol = connp->conn_proto;

	if (IPCL_IS_IPTUN(connp))
		return (ipcl_iptun_hash_insert(connp, ipst));

	switch (protocol) {
	case IPPROTO_TCP:
		/*
		 * For TCP, we check whether the connection tuple already
		 * exists before allowing the connection to proceed. We
		 * also allow indexing on the zoneid. This is to allow
		 * multiple shared stack zones to have the same tcp
		 * connection tuple. In practice this only happens for
		 * INADDR_LOOPBACK as it's the only local address which
		 * doesn't have to be unique.
		 */
		connfp = &ipst->ips_ipcl_conn_fanout[
		    IPCL_CONN_HASH(connp->conn_faddr_v4,
		    connp->conn_ports, ipst)];
		mutex_enter(&connfp->connf_lock);
		for (tconnp = connfp->connf_head; tconnp != NULL;
		    tconnp = tconnp->conn_next) {
			if (IPCL_CONN_MATCH(tconnp, connp->conn_proto,
			    connp->conn_faddr_v4, connp->conn_laddr_v4,
			    connp->conn_ports) &&
			    IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
				/* Already have a conn. bail out */
				mutex_exit(&connfp->connf_lock);
				return (EADDRINUSE);
			}
		}
		if (connp->conn_fanout != NULL) {
			/*
			 * Probably a XTI/TLI application trying to do a
			 * rebind. Let it happen.
			 *
			 * Drop our fanout lock first: IPCL_HASH_REMOVE
			 * takes the conn's *current* fanout lock, which
			 * may be this very one.
			 */
			mutex_exit(&connfp->connf_lock);
			IPCL_HASH_REMOVE(connp);
			mutex_enter(&connfp->connf_lock);
		}

		ASSERT(connp->conn_recv != NULL);
		ASSERT(connp->conn_recvicmp != NULL);

		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
		mutex_exit(&connfp->connf_lock);
		break;

	case IPPROTO_SCTP:
		/*
		 * The raw socket may have already been bound, remove it
		 * from the hash first.
		 */
		IPCL_HASH_REMOVE(connp);
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;

	default:
		/*
		 * Check for conflicts among MAC exempt bindings. For
		 * transports with port numbers, this is done by the upper
		 * level per-transport binding logic. For all others, it's
		 * done here.
		 */
		if (is_system_labeled() &&
		    check_exempt_conflict_v4(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */

	case IPPROTO_UDP:
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
		}

		if (connp->conn_faddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (connp->conn_laddr_v4 != INADDR_ANY) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;
	}

	return (ret);
}
1428 1428
/*
 * IPv6 counterpart of ipcl_conn_insert_v4().  The TCP duplicate-tuple
 * check additionally honors conn_bound_if: an existing conn only
 * conflicts if it is unbound or bound to the same interface index.
 * Returns 0 or an errno.
 */
int
ipcl_conn_insert_v6(conn_t *connp)
{
	connf_t	*connfp;
	conn_t	*tconnp;
	int	ret = 0;
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint16_t	lport = connp->conn_lport;
	uint8_t	protocol = connp->conn_proto;
	uint_t	ifindex = connp->conn_bound_if;

	if (IPCL_IS_IPTUN(connp))
		return (ipcl_iptun_hash_insert_v6(connp, ipst));

	switch (protocol) {
	case IPPROTO_TCP:

		/*
		 * For tcp, we check whether the connection tuple already
		 * exists before allowing the connection to proceed. We
		 * also allow indexing on the zoneid. This is to allow
		 * multiple shared stack zones to have the same tcp
		 * connection tuple. In practice this only happens for
		 * ipv6_loopback as it's the only local address which
		 * doesn't have to be unique.
		 */
		connfp = &ipst->ips_ipcl_conn_fanout[
		    IPCL_CONN_HASH_V6(connp->conn_faddr_v6, connp->conn_ports,
		    ipst)];
		mutex_enter(&connfp->connf_lock);
		for (tconnp = connfp->connf_head; tconnp != NULL;
		    tconnp = tconnp->conn_next) {
			/* NOTE: need to match zoneid. Bug in onnv-gate */
			if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_proto,
			    connp->conn_faddr_v6, connp->conn_laddr_v6,
			    connp->conn_ports) &&
			    (tconnp->conn_bound_if == 0 ||
			    tconnp->conn_bound_if == ifindex) &&
			    IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
				/* Already have a conn. bail out */
				mutex_exit(&connfp->connf_lock);
				return (EADDRINUSE);
			}
		}
		if (connp->conn_fanout != NULL) {
			/*
			 * Probably a XTI/TLI application trying to do a
			 * rebind. Let it happen.
			 *
			 * Drop our fanout lock first: IPCL_HASH_REMOVE
			 * takes the conn's *current* fanout lock, which
			 * may be this very one.
			 */
			mutex_exit(&connfp->connf_lock);
			IPCL_HASH_REMOVE(connp);
			mutex_enter(&connfp->connf_lock);
		}
		IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
		mutex_exit(&connfp->connf_lock);
		break;

	case IPPROTO_SCTP:
		/* Raw socket may already be bound; remove it first. */
		IPCL_HASH_REMOVE(connp);
		ret = ipcl_sctp_hash_insert(connp, lport);
		break;

	default:
		/* Portless protocols check MAC exemption conflicts here. */
		if (is_system_labeled() &&
		    check_exempt_conflict_v6(connp, ipst))
			return (EADDRINUSE);
		/* FALLTHROUGH */
	case IPPROTO_UDP:
		if (protocol == IPPROTO_UDP) {
			connfp = &ipst->ips_ipcl_udp_fanout[
			    IPCL_UDP_HASH(lport, ipst)];
		} else {
			connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
		}

		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
			IPCL_HASH_INSERT_CONNECTED(connfp, connp);
		} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
			IPCL_HASH_INSERT_BOUND(connfp, connp);
		} else {
			IPCL_HASH_INSERT_WILDCARD(connfp, connp);
		}
		break;
	}

	return (ret);
}
1516 1516
1517 1517 /*
1518 1518 * v4 packet classifying function. looks up the fanout table to
1519 1519 * find the conn, the packet belongs to. returns the conn with
1520 1520 * the reference held, null otherwise.
1521 1521 *
1522 1522 * If zoneid is ALL_ZONES, then the search rules described in the "Connection
1523 1523 * Lookup" comment block are applied. Labels are also checked as described
1524 1524 * above. If the packet is from the inside (looped back), and is from the same
1525 1525 * zone, then label checks are omitted.
1526 1526 */
1527 1527 conn_t *
1528 1528 ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
1529 1529 ip_recv_attr_t *ira, ip_stack_t *ipst)
1530 1530 {
1531 1531 ipha_t *ipha;
1532 1532 connf_t *connfp, *bind_connfp;
1533 1533 uint16_t lport;
1534 1534 uint16_t fport;
1535 1535 uint32_t ports;
1536 1536 conn_t *connp;
1537 1537 uint16_t *up;
1538 1538 zoneid_t zoneid = ira->ira_zoneid;
1539 1539
1540 1540 ipha = (ipha_t *)mp->b_rptr;
1541 1541 up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET);
1542 1542
1543 1543 switch (protocol) {
1544 1544 case IPPROTO_TCP:
1545 1545 ports = *(uint32_t *)up;
1546 1546 connfp =
1547 1547 &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src,
1548 1548 ports, ipst)];
1549 1549 mutex_enter(&connfp->connf_lock);
1550 1550 for (connp = connfp->connf_head; connp != NULL;
1551 1551 connp = connp->conn_next) {
1552 1552 if (IPCL_CONN_MATCH(connp, protocol,
1553 1553 ipha->ipha_src, ipha->ipha_dst, ports) &&
1554 1554 (connp->conn_zoneid == zoneid ||
1555 1555 connp->conn_allzones ||
1556 1556 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1557 1557 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1558 1558 (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1559 1559 break;
1560 1560 }
1561 1561
1562 1562 if (connp != NULL) {
1563 1563 /*
1564 1564 * We have a fully-bound TCP connection.
1565 1565 *
1566 1566 * For labeled systems, there's no need to check the
1567 1567 * label here. It's known to be good as we checked
1568 1568 * before allowing the connection to become bound.
1569 1569 */
1570 1570 CONN_INC_REF(connp);
1571 1571 mutex_exit(&connfp->connf_lock);
1572 1572 return (connp);
1573 1573 }
1574 1574
1575 1575 mutex_exit(&connfp->connf_lock);
1576 1576 lport = up[1];
1577 1577 bind_connfp =
1578 1578 &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
1579 1579 mutex_enter(&bind_connfp->connf_lock);
1580 1580 for (connp = bind_connfp->connf_head; connp != NULL;
1581 1581 connp = connp->conn_next) {
1582 1582 if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst,
1583 1583 lport) &&
1584 1584 (connp->conn_zoneid == zoneid ||
1585 1585 connp->conn_allzones ||
1586 1586 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1587 1587 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1588 1588 (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1589 1589 break;
1590 1590 }
1591 1591
1592 1592 /*
1593 1593 * If the matching connection is SLP on a private address, then
1594 1594 * the label on the packet must match the local zone's label.
1595 1595 * Otherwise, it must be in the label range defined by tnrh.
1596 1596 * This is ensured by tsol_receive_local.
1597 1597 *
1598 1598 * Note that we don't check tsol_receive_local for
1599 1599 * the connected case.
1600 1600 */
1601 1601 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1602 1602 !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
1603 1603 ira, connp)) {
1604 1604 DTRACE_PROBE3(tx__ip__log__info__classify__tcp,
1605 1605 char *, "connp(1) could not receive mp(2)",
1606 1606 conn_t *, connp, mblk_t *, mp);
1607 1607 connp = NULL;
1608 1608 }
1609 1609
1610 1610 if (connp != NULL) {
1611 1611 /* Have a listener at least */
1612 1612 CONN_INC_REF(connp);
1613 1613 mutex_exit(&bind_connfp->connf_lock);
1614 1614 return (connp);
1615 1615 }
1616 1616
1617 1617 mutex_exit(&bind_connfp->connf_lock);
1618 1618 break;
1619 1619
1620 1620 case IPPROTO_UDP:
1621 1621 lport = up[1];
1622 1622 fport = up[0];
1623 1623 connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
1624 1624 mutex_enter(&connfp->connf_lock);
1625 1625 for (connp = connfp->connf_head; connp != NULL;
1626 1626 connp = connp->conn_next) {
1627 1627 if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst,
1628 1628 fport, ipha->ipha_src) &&
1629 1629 (connp->conn_zoneid == zoneid ||
1630 1630 connp->conn_allzones ||
1631 1631 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1632 1632 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE))))
1633 1633 break;
1634 1634 }
1635 1635
1636 1636 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1637 1637 !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
1638 1638 ira, connp)) {
1639 1639 DTRACE_PROBE3(tx__ip__log__info__classify__udp,
1640 1640 char *, "connp(1) could not receive mp(2)",
1641 1641 conn_t *, connp, mblk_t *, mp);
1642 1642 connp = NULL;
1643 1643 }
1644 1644
1645 1645 if (connp != NULL) {
1646 1646 CONN_INC_REF(connp);
1647 1647 mutex_exit(&connfp->connf_lock);
1648 1648 return (connp);
1649 1649 }
1650 1650
1651 1651 /*
1652 1652 * We shouldn't come here for multicast/broadcast packets
1653 1653 */
1654 1654 mutex_exit(&connfp->connf_lock);
1655 1655
1656 1656 break;
1657 1657
1658 1658 case IPPROTO_ENCAP:
1659 1659 case IPPROTO_IPV6:
1660 1660 return (ipcl_iptun_classify_v4(&ipha->ipha_src,
1661 1661 &ipha->ipha_dst, ipst));
1662 1662 }
1663 1663
1664 1664 return (NULL);
1665 1665 }
1666 1666
1667 1667 conn_t *
1668 1668 ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
1669 1669 ip_recv_attr_t *ira, ip_stack_t *ipst)
1670 1670 {
1671 1671 ip6_t *ip6h;
1672 1672 connf_t *connfp, *bind_connfp;
1673 1673 uint16_t lport;
1674 1674 uint16_t fport;
1675 1675 tcpha_t *tcpha;
1676 1676 uint32_t ports;
1677 1677 conn_t *connp;
1678 1678 uint16_t *up;
1679 1679 zoneid_t zoneid = ira->ira_zoneid;
1680 1680
1681 1681 ip6h = (ip6_t *)mp->b_rptr;
1682 1682
1683 1683 switch (protocol) {
1684 1684 case IPPROTO_TCP:
1685 1685 tcpha = (tcpha_t *)&mp->b_rptr[hdr_len];
1686 1686 up = &tcpha->tha_lport;
1687 1687 ports = *(uint32_t *)up;
1688 1688
1689 1689 connfp =
1690 1690 &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src,
1691 1691 ports, ipst)];
1692 1692 mutex_enter(&connfp->connf_lock);
1693 1693 for (connp = connfp->connf_head; connp != NULL;
1694 1694 connp = connp->conn_next) {
1695 1695 if (IPCL_CONN_MATCH_V6(connp, protocol,
1696 1696 ip6h->ip6_src, ip6h->ip6_dst, ports) &&
1697 1697 (connp->conn_zoneid == zoneid ||
1698 1698 connp->conn_allzones ||
1699 1699 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1700 1700 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1701 1701 (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1702 1702 break;
1703 1703 }
1704 1704
1705 1705 if (connp != NULL) {
1706 1706 /*
1707 1707 * We have a fully-bound TCP connection.
1708 1708 *
1709 1709 * For labeled systems, there's no need to check the
1710 1710 * label here. It's known to be good as we checked
1711 1711 * before allowing the connection to become bound.
1712 1712 */
1713 1713 CONN_INC_REF(connp);
1714 1714 mutex_exit(&connfp->connf_lock);
1715 1715 return (connp);
1716 1716 }
1717 1717
1718 1718 mutex_exit(&connfp->connf_lock);
1719 1719
1720 1720 lport = up[1];
1721 1721 bind_connfp =
1722 1722 &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
1723 1723 mutex_enter(&bind_connfp->connf_lock);
1724 1724 for (connp = bind_connfp->connf_head; connp != NULL;
1725 1725 connp = connp->conn_next) {
1726 1726 if (IPCL_BIND_MATCH_V6(connp, protocol,
1727 1727 ip6h->ip6_dst, lport) &&
1728 1728 (connp->conn_zoneid == zoneid ||
1729 1729 connp->conn_allzones ||
1730 1730 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1731 1731 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1732 1732 (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1733 1733 break;
1734 1734 }
1735 1735
1736 1736 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1737 1737 !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
1738 1738 ira, connp)) {
1739 1739 DTRACE_PROBE3(tx__ip__log__info__classify__tcp6,
1740 1740 char *, "connp(1) could not receive mp(2)",
1741 1741 conn_t *, connp, mblk_t *, mp);
1742 1742 connp = NULL;
1743 1743 }
1744 1744
1745 1745 if (connp != NULL) {
1746 1746 			/* Have a listener at least */
1747 1747 CONN_INC_REF(connp);
1748 1748 mutex_exit(&bind_connfp->connf_lock);
1749 1749 return (connp);
1750 1750 }
1751 1751
1752 1752 mutex_exit(&bind_connfp->connf_lock);
1753 1753 break;
1754 1754
1755 1755 case IPPROTO_UDP:
1756 1756 up = (uint16_t *)&mp->b_rptr[hdr_len];
1757 1757 lport = up[1];
1758 1758 fport = up[0];
1759 1759 connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
1760 1760 mutex_enter(&connfp->connf_lock);
1761 1761 for (connp = connfp->connf_head; connp != NULL;
1762 1762 connp = connp->conn_next) {
1763 1763 if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst,
1764 1764 fport, ip6h->ip6_src) &&
1765 1765 (connp->conn_zoneid == zoneid ||
1766 1766 connp->conn_allzones ||
1767 1767 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1768 1768 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1769 1769 (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1770 1770 break;
1771 1771 }
1772 1772
1773 1773 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1774 1774 !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
1775 1775 ira, connp)) {
1776 1776 DTRACE_PROBE3(tx__ip__log__info__classify__udp6,
1777 1777 char *, "connp(1) could not receive mp(2)",
1778 1778 conn_t *, connp, mblk_t *, mp);
1779 1779 connp = NULL;
1780 1780 }
1781 1781
1782 1782 if (connp != NULL) {
1783 1783 CONN_INC_REF(connp);
1784 1784 mutex_exit(&connfp->connf_lock);
1785 1785 return (connp);
1786 1786 }
1787 1787
1788 1788 /*
1789 1789 * We shouldn't come here for multicast/broadcast packets
1790 1790 */
1791 1791 mutex_exit(&connfp->connf_lock);
1792 1792 break;
1793 1793 case IPPROTO_ENCAP:
1794 1794 case IPPROTO_IPV6:
1795 1795 return (ipcl_iptun_classify_v6(&ip6h->ip6_src,
1796 1796 &ip6h->ip6_dst, ipst));
1797 1797 }
1798 1798
1799 1799 return (NULL);
1800 1800 }
1801 1801
1802 1802 /*
1803 1803 * wrapper around ipcl_classify_(v4,v6) routines.
1804 1804 */
1805 1805 conn_t *
1806 1806 ipcl_classify(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst)
1807 1807 {
1808 1808 if (ira->ira_flags & IRAF_IS_IPV4) {
1809 1809 return (ipcl_classify_v4(mp, ira->ira_protocol,
1810 1810 ira->ira_ip_hdr_length, ira, ipst));
1811 1811 } else {
1812 1812 return (ipcl_classify_v6(mp, ira->ira_protocol,
1813 1813 ira->ira_ip_hdr_length, ira, ipst));
1814 1814 }
1815 1815 }
1816 1816
/*
 * Only used to classify SCTP RAW sockets.
 *
 * Two-pass lookup: first search the fanout bucket for the packet's
 * local port for either a fully-connected match (conn has a bound
 * foreign address) or a local-address/port bind match; if that fails,
 * fall back to the wildcard (port 0) bucket.  On success the conn is
 * returned with an extra reference held; caller must CONN_DEC_REF.
 */
conn_t *
ipcl_classify_raw(mblk_t *mp, uint8_t protocol, uint32_t ports,
    ipha_t *ipha, ip6_t *ip6h, ip_recv_attr_t *ira, ip_stack_t *ipst)
{
	connf_t		*connfp;
	conn_t		*connp;
	in_port_t	lport;
	int		ipversion;
	const void	*dst;
	zoneid_t	zoneid = ira->ira_zoneid;

	/* Second half of 'ports' is the local (destination) port. */
	lport = ((uint16_t *)&ports)[1];
	if (ira->ira_flags & IRAF_IS_IPV4) {
		dst = (const void *)&ipha->ipha_dst;
		ipversion = IPV4_VERSION;
	} else {
		dst = (const void *)&ip6h->ip6_dst;
		ipversion = IPV6_VERSION;
	}

	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		/* We don't allow v4 fallback for v6 raw socket. */
		if (ipversion != connp->conn_ipversion)
			continue;
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
			/* Foreign address bound: require a full 5-tuple match. */
			if (ipversion == IPV4_VERSION) {
				if (!IPCL_CONN_MATCH(connp, protocol,
				    ipha->ipha_src, ipha->ipha_dst, ports))
					continue;
			} else {
				if (!IPCL_CONN_MATCH_V6(connp, protocol,
				    ip6h->ip6_src, ip6h->ip6_dst, ports))
					continue;
			}
		} else {
			/* No foreign address: match on local address/port. */
			if (ipversion == IPV4_VERSION) {
				if (!IPCL_BIND_MATCH(connp, protocol,
				    ipha->ipha_dst, lport))
					continue;
			} else {
				if (!IPCL_BIND_MATCH_V6(connp, protocol,
				    ip6h->ip6_dst, lport))
					continue;
			}
		}

		/*
		 * Accept if the zones match, the conn is bound in all
		 * zones, or (labeled, shared-address case) the conn has a
		 * non-default MAC mode and the packet is MAC-exemptable.
		 */
		if (connp->conn_zoneid == zoneid ||
		    connp->conn_allzones ||
		    ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
		    (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
		    (ira->ira_flags & IRAF_TX_SHARED_ADDR)))
			break;
	}

	/* On labeled systems the candidate must also pass the label check. */
	if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
	    !tsol_receive_local(mp, dst, ipversion, ira, connp)) {
		DTRACE_PROBE3(tx__ip__log__info__classify__rawip,
		    char *, "connp(1) could not receive mp(2)",
		    conn_t *, connp, mblk_t *, mp);
		connp = NULL;
	}

	if (connp != NULL)
		goto found;
	mutex_exit(&connfp->connf_lock);

	/* Try to look for a wildcard SCTP RAW socket match. */
	connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)];
	mutex_enter(&connfp->connf_lock);
	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		/* We don't allow v4 fallback for v6 raw socket. */
		if (ipversion != connp->conn_ipversion)
			continue;
		if (!IPCL_ZONE_MATCH(connp, zoneid))
			continue;

		if (ipversion == IPV4_VERSION) {
			if (IPCL_RAW_MATCH(connp, protocol, ipha->ipha_dst))
				break;
		} else {
			if (IPCL_RAW_MATCH_V6(connp, protocol, ip6h->ip6_dst)) {
				break;
			}
		}
	}

	if (connp != NULL)
		goto found;

	mutex_exit(&connfp->connf_lock);
	return (NULL);

found:
	/* Take a reference before dropping the bucket lock. */
	ASSERT(connp != NULL);
	CONN_INC_REF(connp);
	mutex_exit(&connfp->connf_lock);
	return (connp);
}
1923 1923
1924 1924 /* ARGSUSED */
1925 1925 static int
1926 1926 tcp_conn_constructor(void *buf, void *cdrarg, int kmflags)
1927 1927 {
1928 1928 itc_t *itc = (itc_t *)buf;
1929 1929 conn_t *connp = &itc->itc_conn;
1930 1930 tcp_t *tcp = (tcp_t *)&itc[1];
1931 1931
1932 1932 bzero(connp, sizeof (conn_t));
1933 1933 bzero(tcp, sizeof (tcp_t));
1934 1934
1935 1935 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
1936 1936 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
1937 1937 cv_init(&connp->conn_sq_cv, NULL, CV_DEFAULT, NULL);
1938 1938 tcp->tcp_timercache = tcp_timermp_alloc(kmflags);
1939 1939 if (tcp->tcp_timercache == NULL)
1940 1940 return (ENOMEM);
1941 1941 connp->conn_tcp = tcp;
1942 1942 connp->conn_flags = IPCL_TCPCONN;
1943 1943 connp->conn_proto = IPPROTO_TCP;
1944 1944 tcp->tcp_connp = connp;
1945 1945 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
1946 1946
1947 1947 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
1948 1948 if (connp->conn_ixa == NULL) {
1949 1949 tcp_timermp_free(tcp);
1950 1950 return (ENOMEM);
1951 1951 }
1952 1952 connp->conn_ixa->ixa_refcnt = 1;
1953 1953 connp->conn_ixa->ixa_protocol = connp->conn_proto;
1954 1954 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
1955 1955 return (0);
1956 1956 }
1957 1957
1958 1958 /* ARGSUSED */
1959 1959 static void
1960 1960 tcp_conn_destructor(void *buf, void *cdrarg)
1961 1961 {
1962 1962 itc_t *itc = (itc_t *)buf;
1963 1963 conn_t *connp = &itc->itc_conn;
1964 1964 tcp_t *tcp = (tcp_t *)&itc[1];
1965 1965
1966 1966 ASSERT(connp->conn_flags & IPCL_TCPCONN);
1967 1967 ASSERT(tcp->tcp_connp == connp);
1968 1968 ASSERT(connp->conn_tcp == tcp);
1969 1969 tcp_timermp_free(tcp);
1970 1970 mutex_destroy(&connp->conn_lock);
1971 1971 cv_destroy(&connp->conn_cv);
1972 1972 cv_destroy(&connp->conn_sq_cv);
1973 1973 rw_destroy(&connp->conn_ilg_lock);
1974 1974
1975 1975 /* Can be NULL if constructor failed */
1976 1976 if (connp->conn_ixa != NULL) {
1977 1977 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
1978 1978 ASSERT(connp->conn_ixa->ixa_ire == NULL);
1979 1979 ASSERT(connp->conn_ixa->ixa_nce == NULL);
1980 1980 ixa_refrele(connp->conn_ixa);
1981 1981 }
1982 1982 }
1983 1983
1984 1984 /* ARGSUSED */
1985 1985 static int
1986 1986 ip_conn_constructor(void *buf, void *cdrarg, int kmflags)
1987 1987 {
1988 1988 itc_t *itc = (itc_t *)buf;
1989 1989 conn_t *connp = &itc->itc_conn;
1990 1990
1991 1991 bzero(connp, sizeof (conn_t));
1992 1992 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
1993 1993 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
1994 1994 connp->conn_flags = IPCL_IPCCONN;
1995 1995 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
1996 1996
1997 1997 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
1998 1998 if (connp->conn_ixa == NULL)
1999 1999 return (ENOMEM);
2000 2000 connp->conn_ixa->ixa_refcnt = 1;
2001 2001 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2002 2002 return (0);
2003 2003 }
2004 2004
2005 2005 /* ARGSUSED */
2006 2006 static void
2007 2007 ip_conn_destructor(void *buf, void *cdrarg)
2008 2008 {
2009 2009 itc_t *itc = (itc_t *)buf;
2010 2010 conn_t *connp = &itc->itc_conn;
2011 2011
2012 2012 ASSERT(connp->conn_flags & IPCL_IPCCONN);
2013 2013 ASSERT(connp->conn_priv == NULL);
2014 2014 mutex_destroy(&connp->conn_lock);
2015 2015 cv_destroy(&connp->conn_cv);
2016 2016 rw_destroy(&connp->conn_ilg_lock);
2017 2017
2018 2018 /* Can be NULL if constructor failed */
2019 2019 if (connp->conn_ixa != NULL) {
2020 2020 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2021 2021 ASSERT(connp->conn_ixa->ixa_ire == NULL);
2022 2022 ASSERT(connp->conn_ixa->ixa_nce == NULL);
2023 2023 ixa_refrele(connp->conn_ixa);
2024 2024 }
2025 2025 }
2026 2026
2027 2027 /* ARGSUSED */
2028 2028 static int
2029 2029 udp_conn_constructor(void *buf, void *cdrarg, int kmflags)
2030 2030 {
2031 2031 itc_t *itc = (itc_t *)buf;
2032 2032 conn_t *connp = &itc->itc_conn;
2033 2033 udp_t *udp = (udp_t *)&itc[1];
2034 2034
2035 2035 bzero(connp, sizeof (conn_t));
2036 2036 bzero(udp, sizeof (udp_t));
2037 2037
2038 2038 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2039 2039 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2040 2040 connp->conn_udp = udp;
2041 2041 connp->conn_flags = IPCL_UDPCONN;
2042 2042 connp->conn_proto = IPPROTO_UDP;
2043 2043 udp->udp_connp = connp;
2044 2044 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2045 2045 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2046 2046 if (connp->conn_ixa == NULL)
2047 2047 return (ENOMEM);
2048 2048 connp->conn_ixa->ixa_refcnt = 1;
2049 2049 connp->conn_ixa->ixa_protocol = connp->conn_proto;
2050 2050 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2051 2051 return (0);
2052 2052 }
2053 2053
2054 2054 /* ARGSUSED */
2055 2055 static void
2056 2056 udp_conn_destructor(void *buf, void *cdrarg)
2057 2057 {
2058 2058 itc_t *itc = (itc_t *)buf;
2059 2059 conn_t *connp = &itc->itc_conn;
2060 2060 udp_t *udp = (udp_t *)&itc[1];
2061 2061
2062 2062 ASSERT(connp->conn_flags & IPCL_UDPCONN);
2063 2063 ASSERT(udp->udp_connp == connp);
2064 2064 ASSERT(connp->conn_udp == udp);
2065 2065 mutex_destroy(&connp->conn_lock);
2066 2066 cv_destroy(&connp->conn_cv);
2067 2067 rw_destroy(&connp->conn_ilg_lock);
2068 2068
2069 2069 /* Can be NULL if constructor failed */
2070 2070 if (connp->conn_ixa != NULL) {
2071 2071 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2072 2072 ASSERT(connp->conn_ixa->ixa_ire == NULL);
2073 2073 ASSERT(connp->conn_ixa->ixa_nce == NULL);
2074 2074 ixa_refrele(connp->conn_ixa);
2075 2075 }
2076 2076 }
2077 2077
2078 2078 /* ARGSUSED */
2079 2079 static int
2080 2080 rawip_conn_constructor(void *buf, void *cdrarg, int kmflags)
2081 2081 {
2082 2082 itc_t *itc = (itc_t *)buf;
2083 2083 conn_t *connp = &itc->itc_conn;
2084 2084 icmp_t *icmp = (icmp_t *)&itc[1];
2085 2085
2086 2086 bzero(connp, sizeof (conn_t));
2087 2087 bzero(icmp, sizeof (icmp_t));
2088 2088
2089 2089 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2090 2090 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2091 2091 connp->conn_icmp = icmp;
2092 2092 connp->conn_flags = IPCL_RAWIPCONN;
2093 2093 connp->conn_proto = IPPROTO_ICMP;
2094 2094 icmp->icmp_connp = connp;
2095 2095 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2096 2096 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2097 2097 if (connp->conn_ixa == NULL)
2098 2098 return (ENOMEM);
2099 2099 connp->conn_ixa->ixa_refcnt = 1;
2100 2100 connp->conn_ixa->ixa_protocol = connp->conn_proto;
2101 2101 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2102 2102 return (0);
2103 2103 }
2104 2104
2105 2105 /* ARGSUSED */
2106 2106 static void
2107 2107 rawip_conn_destructor(void *buf, void *cdrarg)
2108 2108 {
2109 2109 itc_t *itc = (itc_t *)buf;
2110 2110 conn_t *connp = &itc->itc_conn;
2111 2111 icmp_t *icmp = (icmp_t *)&itc[1];
2112 2112
2113 2113 ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
2114 2114 ASSERT(icmp->icmp_connp == connp);
2115 2115 ASSERT(connp->conn_icmp == icmp);
2116 2116 mutex_destroy(&connp->conn_lock);
2117 2117 cv_destroy(&connp->conn_cv);
2118 2118 rw_destroy(&connp->conn_ilg_lock);
2119 2119
2120 2120 /* Can be NULL if constructor failed */
2121 2121 if (connp->conn_ixa != NULL) {
2122 2122 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2123 2123 ASSERT(connp->conn_ixa->ixa_ire == NULL);
2124 2124 ASSERT(connp->conn_ixa->ixa_nce == NULL);
2125 2125 ixa_refrele(connp->conn_ixa);
2126 2126 }
2127 2127 }
2128 2128
2129 2129 /* ARGSUSED */
2130 2130 static int
2131 2131 rts_conn_constructor(void *buf, void *cdrarg, int kmflags)
2132 2132 {
2133 2133 itc_t *itc = (itc_t *)buf;
2134 2134 conn_t *connp = &itc->itc_conn;
2135 2135 rts_t *rts = (rts_t *)&itc[1];
2136 2136
2137 2137 bzero(connp, sizeof (conn_t));
2138 2138 bzero(rts, sizeof (rts_t));
2139 2139
2140 2140 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2141 2141 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2142 2142 connp->conn_rts = rts;
2143 2143 connp->conn_flags = IPCL_RTSCONN;
2144 2144 rts->rts_connp = connp;
2145 2145 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2146 2146 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2147 2147 if (connp->conn_ixa == NULL)
2148 2148 return (ENOMEM);
2149 2149 connp->conn_ixa->ixa_refcnt = 1;
2150 2150 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2151 2151 return (0);
2152 2152 }
2153 2153
2154 2154 /* ARGSUSED */
2155 2155 static void
2156 2156 rts_conn_destructor(void *buf, void *cdrarg)
2157 2157 {
2158 2158 itc_t *itc = (itc_t *)buf;
2159 2159 conn_t *connp = &itc->itc_conn;
2160 2160 rts_t *rts = (rts_t *)&itc[1];
2161 2161
2162 2162 ASSERT(connp->conn_flags & IPCL_RTSCONN);
2163 2163 ASSERT(rts->rts_connp == connp);
2164 2164 ASSERT(connp->conn_rts == rts);
2165 2165 mutex_destroy(&connp->conn_lock);
2166 2166 cv_destroy(&connp->conn_cv);
2167 2167 rw_destroy(&connp->conn_ilg_lock);
2168 2168
2169 2169 /* Can be NULL if constructor failed */
2170 2170 if (connp->conn_ixa != NULL) {
2171 2171 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2172 2172 ASSERT(connp->conn_ixa->ixa_ire == NULL);
2173 2173 ASSERT(connp->conn_ixa->ixa_nce == NULL);
2174 2174 ixa_refrele(connp->conn_ixa);
2175 2175 }
2176 2176 }
2177 2177
/*
 * Called as part of ipcl_conn_destroy to assert and clear any pointers
 * in the conn_t.
 *
 * Below we list all the pointers in the conn_t as a documentation aid.
 * The ones that we can not ASSERT to be NULL are #ifdef'ed out.
 * If you add any pointers to the conn_t please add an ASSERT here
 * and #ifdef it out if it can't be actually asserted to be NULL.
 * In any case, we bzero most of the conn_t at the end of the function.
 */
void
ipcl_conn_cleanup(conn_t *connp)
{
	ip_xmit_attr_t	*ixa;

	ASSERT(connp->conn_latch == NULL);
	ASSERT(connp->conn_latch_in_policy == NULL);
	ASSERT(connp->conn_latch_in_action == NULL);
#ifdef notdef
	ASSERT(connp->conn_rq == NULL);
	ASSERT(connp->conn_wq == NULL);
#endif
	ASSERT(connp->conn_cred == NULL);
	ASSERT(connp->conn_g_fanout == NULL);
	ASSERT(connp->conn_g_next == NULL);
	ASSERT(connp->conn_g_prev == NULL);
	ASSERT(connp->conn_policy == NULL);
	ASSERT(connp->conn_fanout == NULL);
	ASSERT(connp->conn_next == NULL);
	ASSERT(connp->conn_prev == NULL);
	ASSERT(connp->conn_oper_pending_ill == NULL);
	ASSERT(connp->conn_ilg == NULL);
	ASSERT(connp->conn_drain_next == NULL);
	ASSERT(connp->conn_drain_prev == NULL);
#ifdef notdef
	/* conn_idl is not cleared when removed from idl list */
	ASSERT(connp->conn_idl == NULL);
#endif
	ASSERT(connp->conn_ipsec_opt_mp == NULL);
#ifdef notdef
	/* conn_netstack is cleared by the caller; needed by ixa_cleanup */
	ASSERT(connp->conn_netstack == NULL);
#endif

	ASSERT(connp->conn_helper_info == NULL);
	ASSERT(connp->conn_ixa != NULL);
	ixa = connp->conn_ixa;
	ASSERT(ixa->ixa_refcnt == 1);
	/* Need to preserve ixa_protocol */
	ixa_cleanup(ixa);
	ixa->ixa_flags = 0;

	/*
	 * Clear out the conn_t fields that are not preserved.
	 * Everything from the conn_start_clr marker field to the end of
	 * the struct is wiped; fields laid out before the marker (such
	 * as the locks initialized by the kmem constructor and the
	 * conn_ixa pointer used above) survive for cache reuse.
	 */
	bzero(&connp->conn_start_clr,
	    sizeof (conn_t) -
	    ((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp));
}
2235 2235
2236 2236 /*
2237 2237 * All conns are inserted in a global multi-list for the benefit of
2238 2238 * walkers. The walk is guaranteed to walk all open conns at the time
2239 2239 * of the start of the walk exactly once. This property is needed to
2240 2240 * achieve some cleanups during unplumb of interfaces. This is achieved
2241 2241 * as follows.
2242 2242 *
2243 2243 * ipcl_conn_create and ipcl_conn_destroy are the only functions that
2244 2244 * call the insert and delete functions below at creation and deletion
2245 2245 * time respectively. The conn never moves or changes its position in this
2246 2246 * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt
2247 2247 * won't increase due to walkers, once the conn deletion has started. Note
2248 2248 * that we can't remove the conn from the global list and then wait for
2249 2249 * the refcnt to drop to zero, since walkers would then see a truncated
2250 2250 * list. CONN_INCIPIENT ensures that walkers don't start looking at
2251 2251 * conns until ip_open is ready to make them globally visible.
2252 2252 * The global round robin multi-list locks are held only to get the
2253 2253 * next member/insertion/deletion and contention should be negligible
2254 2254 * if the multi-list is much greater than the number of cpus.
2255 2255 */
2256 2256 void
2257 2257 ipcl_globalhash_insert(conn_t *connp)
2258 2258 {
2259 2259 int index;
2260 2260 struct connf_s *connfp;
2261 2261 ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
2262 2262
2263 2263 /*
2264 2264 * No need for atomic here. Approximate even distribution
2265 2265 * in the global lists is sufficient.
2266 2266 */
2267 2267 ipst->ips_conn_g_index++;
2268 2268 index = ipst->ips_conn_g_index & (CONN_G_HASH_SIZE - 1);
2269 2269
2270 2270 connp->conn_g_prev = NULL;
2271 2271 /*
2272 2272 * Mark as INCIPIENT, so that walkers will ignore this
2273 2273 * for now, till ip_open is ready to make it visible globally.
2274 2274 */
2275 2275 connp->conn_state_flags |= CONN_INCIPIENT;
2276 2276
2277 2277 connfp = &ipst->ips_ipcl_globalhash_fanout[index];
2278 2278 /* Insert at the head of the list */
2279 2279 mutex_enter(&connfp->connf_lock);
2280 2280 connp->conn_g_next = connfp->connf_head;
2281 2281 if (connp->conn_g_next != NULL)
2282 2282 connp->conn_g_next->conn_g_prev = connp;
2283 2283 connfp->connf_head = connp;
2284 2284
2285 2285 /* The fanout bucket this conn points to */
2286 2286 connp->conn_g_fanout = connfp;
2287 2287
2288 2288 mutex_exit(&connfp->connf_lock);
2289 2289 }
2290 2290
2291 2291 void
2292 2292 ipcl_globalhash_remove(conn_t *connp)
2293 2293 {
2294 2294 struct connf_s *connfp;
2295 2295
2296 2296 /*
2297 2297 * We were never inserted in the global multi list.
2298 2298 * IPCL_NONE variety is never inserted in the global multilist
2299 2299 * since it is presumed to not need any cleanup and is transient.
2300 2300 */
2301 2301 if (connp->conn_g_fanout == NULL)
2302 2302 return;
2303 2303
2304 2304 connfp = connp->conn_g_fanout;
2305 2305 mutex_enter(&connfp->connf_lock);
2306 2306 if (connp->conn_g_prev != NULL)
2307 2307 connp->conn_g_prev->conn_g_next = connp->conn_g_next;
2308 2308 else
2309 2309 connfp->connf_head = connp->conn_g_next;
2310 2310 if (connp->conn_g_next != NULL)
2311 2311 connp->conn_g_next->conn_g_prev = connp->conn_g_prev;
2312 2312 mutex_exit(&connfp->connf_lock);
2313 2313
2314 2314 /* Better to stumble on a null pointer than to corrupt memory */
2315 2315 connp->conn_g_next = NULL;
2316 2316 connp->conn_g_prev = NULL;
2317 2317 connp->conn_g_fanout = NULL;
2318 2318 }
2319 2319
/*
 * Walk the list of all conn_t's in the system, calling the function provided
 * With the specified argument for each.
 * Applies to both IPv4 and IPv6.
 *
 * CONNs may hold pointers to ills (conn_dhcpinit_ill and
 * conn_oper_pending_ill). To guard against stale pointers
 * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is
 * unplumbed or removed. New conn_t's that are created while we are walking
 * may be missed by this walk, because they are not necessarily inserted
 * at the tail of the list. They are new conn_t's and thus don't have any
 * stale pointers. The CONN_CLOSING flag ensures that no new reference
 * is created to the struct that is going away.
 */
void
ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst)
{
	int	i;
	conn_t	*connp;
	conn_t	*prev_connp;

	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
		mutex_enter(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
		prev_connp = NULL;
		connp = ipst->ips_ipcl_globalhash_fanout[i].connf_head;
		while (connp != NULL) {
			mutex_enter(&connp->conn_lock);
			/* Skip conns being torn down or not yet visible. */
			if (connp->conn_state_flags &
			    (CONN_CONDEMNED | CONN_INCIPIENT)) {
				mutex_exit(&connp->conn_lock);
				connp = connp->conn_g_next;
				continue;
			}
			/*
			 * Take a reference so the conn -- and hence our
			 * position in the list -- stays valid while the
			 * bucket lock is dropped for the callback.
			 */
			CONN_INC_REF_LOCKED(connp);
			mutex_exit(&connp->conn_lock);
			mutex_exit(
			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
			(*func)(connp, arg);
			/*
			 * Release the previous conn only now: while we
			 * held its reference it anchored the position we
			 * advanced from.  The drop happens with no locks
			 * held, as it may trigger conn destruction.
			 */
			if (prev_connp != NULL)
				CONN_DEC_REF(prev_connp);
			mutex_enter(
			    &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
			prev_connp = connp;
			connp = connp->conn_g_next;
		}
		mutex_exit(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
		/* Drop the reference on the bucket's last visited conn. */
		if (prev_connp != NULL)
			CONN_DEC_REF(prev_connp);
	}
}
2370 2370
2371 2371 /*
2372 2372 * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on
2373 2373 * the {src, dst, lport, fport} quadruplet. Returns with conn reference
2374 2374 * held; caller must call CONN_DEC_REF. Only checks for connected entries
2375 2375 * (peer tcp in ESTABLISHED state).
2376 2376 */
2377 2377 conn_t *
2378 2378 ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcpha_t *tcpha,
2379 2379 ip_stack_t *ipst)
2380 2380 {
2381 2381 uint32_t ports;
2382 2382 uint16_t *pports = (uint16_t *)&ports;
2383 2383 connf_t *connfp;
2384 2384 conn_t *tconnp;
2385 2385 boolean_t zone_chk;
2386 2386
2387 2387 /*
2388 2388 * If either the source of destination address is loopback, then
2389 2389 * both endpoints must be in the same Zone. Otherwise, both of
2390 2390 * the addresses are system-wide unique (tcp is in ESTABLISHED
2391 2391 * state) and the endpoints may reside in different Zones.
2392 2392 */
2393 2393 zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) ||
2394 2394 ipha->ipha_dst == htonl(INADDR_LOOPBACK));
2395 2395
2396 2396 pports[0] = tcpha->tha_fport;
2397 2397 pports[1] = tcpha->tha_lport;
2398 2398
2399 2399 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2400 2400 ports, ipst)];
2401 2401
2402 2402 mutex_enter(&connfp->connf_lock);
2403 2403 for (tconnp = connfp->connf_head; tconnp != NULL;
2404 2404 tconnp = tconnp->conn_next) {
2405 2405
2406 2406 if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2407 2407 ipha->ipha_dst, ipha->ipha_src, ports) &&
2408 2408 tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
2409 2409 (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
2410 2410
2411 2411 ASSERT(tconnp != connp);
2412 2412 CONN_INC_REF(tconnp);
2413 2413 mutex_exit(&connfp->connf_lock);
2414 2414 return (tconnp);
2415 2415 }
2416 2416 }
2417 2417 mutex_exit(&connfp->connf_lock);
2418 2418 return (NULL);
2419 2419 }
2420 2420
2421 2421 /*
2422 2422 * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on
2423 2423 * the {src, dst, lport, fport} quadruplet. Returns with conn reference
2424 2424 * held; caller must call CONN_DEC_REF. Only checks for connected entries
2425 2425 * (peer tcp in ESTABLISHED state).
2426 2426 */
2427 2427 conn_t *
2428 2428 ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcpha_t *tcpha,
2429 2429 ip_stack_t *ipst)
2430 2430 {
2431 2431 uint32_t ports;
2432 2432 uint16_t *pports = (uint16_t *)&ports;
2433 2433 connf_t *connfp;
2434 2434 conn_t *tconnp;
2435 2435 boolean_t zone_chk;
2436 2436
2437 2437 /*
2438 2438 * If either the source of destination address is loopback, then
2439 2439 * both endpoints must be in the same Zone. Otherwise, both of
2440 2440 * the addresses are system-wide unique (tcp is in ESTABLISHED
2441 2441 * state) and the endpoints may reside in different Zones. We
2442 2442 * don't do Zone check for link local address(es) because the
2443 2443 * current Zone implementation treats each link local address as
2444 2444 * being unique per system node, i.e. they belong to global Zone.
2445 2445 */
2446 2446 zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) ||
2447 2447 IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst));
2448 2448
2449 2449 pports[0] = tcpha->tha_fport;
2450 2450 pports[1] = tcpha->tha_lport;
2451 2451
2452 2452 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
2453 2453 ports, ipst)];
2454 2454
2455 2455 mutex_enter(&connfp->connf_lock);
2456 2456 for (tconnp = connfp->connf_head; tconnp != NULL;
2457 2457 tconnp = tconnp->conn_next) {
2458 2458
2459 2459 /* We skip conn_bound_if check here as this is loopback tcp */
2460 2460 if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2461 2461 ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2462 2462 tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
2463 2463 (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
2464 2464
2465 2465 ASSERT(tconnp != connp);
2466 2466 CONN_INC_REF(tconnp);
2467 2467 mutex_exit(&connfp->connf_lock);
2468 2468 return (tconnp);
2469 2469 }
2470 2470 }
2471 2471 mutex_exit(&connfp->connf_lock);
2472 2472 return (NULL);
2473 2473 }
2474 2474
2475 2475 /*
2476 2476 * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2477 2477 * Returns with conn reference held. Caller must call CONN_DEC_REF.
2478 2478 * Only checks for connected entries i.e. no INADDR_ANY checks.
2479 2479 */
2480 2480 conn_t *
2481 2481 ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcpha_t *tcpha, int min_state,
2482 2482 ip_stack_t *ipst)
2483 2483 {
2484 2484 uint32_t ports;
2485 2485 uint16_t *pports;
2486 2486 connf_t *connfp;
2487 2487 conn_t *tconnp;
2488 2488
2489 2489 pports = (uint16_t *)&ports;
2490 2490 pports[0] = tcpha->tha_fport;
2491 2491 pports[1] = tcpha->tha_lport;
2492 2492
2493 2493 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2494 2494 ports, ipst)];
2495 2495
2496 2496 mutex_enter(&connfp->connf_lock);
2497 2497 for (tconnp = connfp->connf_head; tconnp != NULL;
2498 2498 tconnp = tconnp->conn_next) {
2499 2499
2500 2500 if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2501 2501 ipha->ipha_dst, ipha->ipha_src, ports) &&
2502 2502 tconnp->conn_tcp->tcp_state >= min_state) {
2503 2503
2504 2504 CONN_INC_REF(tconnp);
2505 2505 mutex_exit(&connfp->connf_lock);
2506 2506 return (tconnp);
2507 2507 }
2508 2508 }
2509 2509 mutex_exit(&connfp->connf_lock);
2510 2510 return (NULL);
2511 2511 }
2512 2512
2513 2513 /*
2514 2514 * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2515 2515 * Returns with conn reference held. Caller must call CONN_DEC_REF.
2516 2516 * Only checks for connected entries i.e. no INADDR_ANY checks.
2517 2517 * Match on ifindex in addition to addresses.
2518 2518 */
2519 2519 conn_t *
2520 2520 ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state,
2521 2521 uint_t ifindex, ip_stack_t *ipst)
2522 2522 {
2523 2523 tcp_t *tcp;
2524 2524 uint32_t ports;
2525 2525 uint16_t *pports;
2526 2526 connf_t *connfp;
2527 2527 conn_t *tconnp;
2528 2528
2529 2529 pports = (uint16_t *)&ports;
2530 2530 pports[0] = tcpha->tha_fport;
2531 2531 pports[1] = tcpha->tha_lport;
2532 2532
2533 2533 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
2534 2534 ports, ipst)];
2535 2535
2536 2536 mutex_enter(&connfp->connf_lock);
2537 2537 for (tconnp = connfp->connf_head; tconnp != NULL;
2538 2538 tconnp = tconnp->conn_next) {
2539 2539
2540 2540 tcp = tconnp->conn_tcp;
2541 2541 if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2542 2542 ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2543 2543 tcp->tcp_state >= min_state &&
2544 2544 (tconnp->conn_bound_if == 0 ||
2545 2545 tconnp->conn_bound_if == ifindex)) {
2546 2546
2547 2547 CONN_INC_REF(tconnp);
2548 2548 mutex_exit(&connfp->connf_lock);
2549 2549 return (tconnp);
2550 2550 }
2551 2551 }
2552 2552 mutex_exit(&connfp->connf_lock);
2553 2553 return (NULL);
2554 2554 }
2555 2555
2556 2556 /*
2557 2557 * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate
2558 2558 * a listener when changing state.
2559 2559 */
2560 2560 conn_t *
2561 2561 ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid,
2562 2562 ip_stack_t *ipst)
2563 2563 {
2564 2564 connf_t *bind_connfp;
2565 2565 conn_t *connp;
2566 2566 tcp_t *tcp;
2567 2567
2568 2568 /*
2569 2569 * Avoid false matches for packets sent to an IP destination of
2570 2570 * all zeros.
2571 2571 */
2572 2572 if (laddr == 0)
2573 2573 return (NULL);
2574 2574
2575 2575 ASSERT(zoneid != ALL_ZONES);
2576 2576
2577 2577 bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2578 2578 mutex_enter(&bind_connfp->connf_lock);
2579 2579 for (connp = bind_connfp->connf_head; connp != NULL;
2580 2580 connp = connp->conn_next) {
2581 2581 tcp = connp->conn_tcp;
2582 2582 if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) &&
2583 2583 IPCL_ZONE_MATCH(connp, zoneid) &&
2584 2584 (tcp->tcp_listener == NULL)) {
2585 2585 CONN_INC_REF(connp);
2586 2586 mutex_exit(&bind_connfp->connf_lock);
2587 2587 return (connp);
2588 2588 }
2589 2589 }
2590 2590 mutex_exit(&bind_connfp->connf_lock);
2591 2591 return (NULL);
2592 2592 }
2593 2593
2594 2594 /*
2595 2595 * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate
2596 2596 * a listener when changing state.
2597 2597 */
2598 2598 conn_t *
2599 2599 ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex,
2600 2600 zoneid_t zoneid, ip_stack_t *ipst)
2601 2601 {
2602 2602 connf_t *bind_connfp;
2603 2603 conn_t *connp = NULL;
2604 2604 tcp_t *tcp;
2605 2605
2606 2606 /*
2607 2607 * Avoid false matches for packets sent to an IP destination of
2608 2608 * all zeros.
2609 2609 */
2610 2610 if (IN6_IS_ADDR_UNSPECIFIED(laddr))
2611 2611 return (NULL);
2612 2612
2613 2613 ASSERT(zoneid != ALL_ZONES);
2614 2614
2615 2615 bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2616 2616 mutex_enter(&bind_connfp->connf_lock);
2617 2617 for (connp = bind_connfp->connf_head; connp != NULL;
2618 2618 connp = connp->conn_next) {
2619 2619 tcp = connp->conn_tcp;
2620 2620 if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) &&
2621 2621 IPCL_ZONE_MATCH(connp, zoneid) &&
2622 2622 (connp->conn_bound_if == 0 ||
2623 2623 connp->conn_bound_if == ifindex) &&
2624 2624 tcp->tcp_listener == NULL) {
2625 2625 CONN_INC_REF(connp);
2626 2626 mutex_exit(&bind_connfp->connf_lock);
2627 2627 return (connp);
2628 2628 }
2629 2629 }
2630 2630 mutex_exit(&bind_connfp->connf_lock);
2631 2631 return (NULL);
2632 2632 }
2633 2633
2634 2634 /*
2635 2635 * ipcl_get_next_conn
2636 2636 * get the next entry in the conn global list
2637 2637 * and put a reference on the next_conn.
2638 2638 * decrement the reference on the current conn.
2639 2639 *
2640 2640 * This is an iterator based walker function that also provides for
2641 2641 * some selection by the caller. It walks through the conn_hash bucket
2642 2642 * searching for the next valid connp in the list, and selects connections
2643 2643 * that are neither closed nor condemned. It also REFHOLDS the conn
2644 2644 * thus ensuring that the conn exists when the caller uses the conn.
2645 2645 */
conn_t *
ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags)
{
	conn_t	*next_connp;

	if (connfp == NULL)
		return (NULL);

	mutex_enter(&connfp->connf_lock);

	/* A NULL connp means start the walk from the head of the bucket. */
	next_connp = (connp == NULL) ?
	    connfp->connf_head : connp->conn_g_next;

	while (next_connp != NULL) {
		/*
		 * conn_lock is taken while connf_lock is still held so the
		 * flags can be examined and the reference taken atomically
		 * with respect to close/condemn.
		 */
		mutex_enter(&next_connp->conn_lock);
		if (!(next_connp->conn_flags & conn_flags) ||
		    (next_connp->conn_state_flags &
		    (CONN_CONDEMNED | CONN_INCIPIENT))) {
			/*
			 * This conn has been condemned or
			 * is closing, or the flags don't match
			 */
			mutex_exit(&next_connp->conn_lock);
			next_connp = next_connp->conn_g_next;
			continue;
		}
		/* Hold the conn so it remains valid after the locks drop. */
		CONN_INC_REF_LOCKED(next_connp);
		mutex_exit(&next_connp->conn_lock);
		break;
	}

	mutex_exit(&connfp->connf_lock);

	/* Drop the reference the caller held on the previous conn. */
	if (connp != NULL)
		CONN_DEC_REF(connp);

	return (next_connp);
}
2684 2684
2685 2685 #ifdef CONN_DEBUG
2686 2686 /*
2687 2687 * Trace of the last NBUF refhold/refrele
2688 2688 */
2689 2689 int
2690 2690 conn_trace_ref(conn_t *connp)
2691 2691 {
2692 2692 int last;
2693 2693 conn_trace_t *ctb;
2694 2694
2695 2695 ASSERT(MUTEX_HELD(&connp->conn_lock));
2696 2696 last = connp->conn_trace_last;
2697 2697 last++;
2698 2698 if (last == CONN_TRACE_MAX)
2699 2699 last = 0;
2700 2700
2701 2701 ctb = &connp->conn_trace_buf[last];
2702 2702 ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2703 2703 connp->conn_trace_last = last;
2704 2704 return (1);
2705 2705 }
2706 2706
2707 2707 int
2708 2708 conn_untrace_ref(conn_t *connp)
2709 2709 {
2710 2710 int last;
2711 2711 conn_trace_t *ctb;
2712 2712
2713 2713 ASSERT(MUTEX_HELD(&connp->conn_lock));
2714 2714 last = connp->conn_trace_last;
↓ open down ↓ |
2714 lines elided |
↑ open up ↑ |
2715 2715 last++;
2716 2716 if (last == CONN_TRACE_MAX)
2717 2717 last = 0;
2718 2718
2719 2719 ctb = &connp->conn_trace_buf[last];
2720 2720 ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2721 2721 connp->conn_trace_last = last;
2722 2722 return (1);
2723 2723 }
2724 2724 #endif
2725 +
/*
 * Return an mblk containing a conn_pid_info_t describing the connection's
 * controlling process(es); used to add PID information to netstat output.
 * On failure to allocate, returns NULL; otherwise always returns an mblk
 * (possibly one marked CONN_PID_INFO_NON with no PIDs).
 */
mblk_t *
conn_get_pid_mblk(conn_t *connp)
{
	mblk_t *mblk;
	conn_pid_info_t *cpi;

	/*
	 * If the connection is closing, it is not safe to make an upcall or
	 * access the stream associated with the connection.
	 *
	 * NOTE(review): conn_state_flags is read here without conn_lock —
	 * confirm that callers hold a reference and that a racing close
	 * cannot tear down conn_upper_handle/conn_rq between this check
	 * and the dereferences below.
	 */
	if (!(connp->conn_state_flags & CONN_CLOSING)) {
		if (connp->conn_upper_handle != NULL) {
			/* Non-STREAMS socket: delegate to the sockfs upcall. */
			return (*connp->conn_upcalls->su_get_sock_pid_mblk)
			    (connp->conn_upper_handle);
		} else if (!IPCL_IS_NONSTR(connp) && connp->conn_rq != NULL &&
		    connp->conn_rq->q_stream != NULL) {
			/* STREAMS endpoint: get PIDs from the stream head. */
			return (sh_get_pid_mblk(connp->conn_rq->q_stream));
		}
	}

	/* return an empty mblk */
	if ((mblk = allocb(sizeof (conn_pid_info_t), BPRI_HI)) == NULL)
		return (NULL);
	/* Reserve room for one conn_pid_info_t and initialize it empty. */
	mblk->b_wptr += sizeof (conn_pid_info_t);
	cpi = (conn_pid_info_t *)mblk->b_datap->db_base;
	cpi->cpi_contents = CONN_PID_INFO_NON;
	cpi->cpi_pids_cnt = 0;
	cpi->cpi_tot_size = sizeof (conn_pid_info_t);
	cpi->cpi_pids[0] = 0;
	return (mblk);
}
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX