1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 25 /* 26 * IP PACKET CLASSIFIER 27 * 28 * The IP packet classifier provides mapping between IP packets and persistent 29 * connection state for connection-oriented protocols. It also provides 30 * interface for managing connection states. 31 * 32 * The connection state is kept in conn_t data structure and contains, among 33 * other things: 34 * 35 * o local/remote address and ports 36 * o Transport protocol 37 * o squeue for the connection (for TCP only) 38 * o reference counter 39 * o Connection state 40 * o hash table linkage 41 * o interface/ire information 42 * o credentials 43 * o ipsec policy 44 * o send and receive functions. 45 * o mutex lock. 46 * 47 * Connections use a reference counting scheme. They are freed when the 48 * reference counter drops to zero. 
A reference is incremented when connection 49 * is placed in a list or table, when incoming packet for the connection arrives 50 * and when connection is processed via squeue (squeue processing may be 51 * asynchronous and the reference protects the connection from being destroyed 52 * before its processing is finished). 53 * 54 * conn_recv is used to pass up packets to the ULP. 55 * For TCP conn_recv changes. It is tcp_input_listener_unbound initially for 56 * a listener, and changes to tcp_input_listener as the listener has picked a 57 * good squeue. For other cases it is set to tcp_input_data. 58 * 59 * conn_recvicmp is used to pass up ICMP errors to the ULP. 60 * 61 * Classifier uses several hash tables: 62 * 63 * ipcl_conn_fanout: contains all TCP connections in CONNECTED state 64 * ipcl_bind_fanout: contains all connections in BOUND state 65 * ipcl_proto_fanout: IPv4 protocol fanout 66 * ipcl_proto_fanout_v6: IPv6 protocol fanout 67 * ipcl_udp_fanout: contains all UDP connections 68 * ipcl_iptun_fanout: contains all IP tunnel connections 69 * ipcl_globalhash_fanout: contains all connections 70 * 71 * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering) 72 * which need to view all existing connections. 73 * 74 * All tables are protected by per-bucket locks. When both per-bucket lock and 75 * connection lock need to be held, the per-bucket lock should be acquired 76 * first, followed by the connection lock. 77 * 78 * All functions doing search in one of these tables increment a reference 79 * counter on the connection found (if any). This reference should be dropped 80 * when the caller has finished processing the connection. 81 * 82 * 83 * INTERFACES: 84 * =========== 85 * 86 * Connection Lookup: 87 * ------------------ 88 * 89 * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, ira, ip_stack) 90 * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, ira, ip_stack) 91 * 92 * Finds connection for an incoming IPv4 or IPv6 packet. 
 *	Returns NULL if it can't find any associated connection. If the
 *	connection is found, its reference counter is incremented.
 *
 *	mp:	mblock, containing packet header. The full header should fit
 *		into a single mblock. It should also contain at least full IP
 *		and TCP or UDP header.
 *
 *	protocol: Either IPPROTO_TCP or IPPROTO_UDP.
 *
 *	hdr_len: The size of IP header. It is used to find TCP or UDP header in
 *		 the packet.
 *
 *	ira->ira_zoneid: The zone in which the returned connection must be; the
 *		zoneid corresponding to the ire_zoneid on the IRE located for
 *		the packet's destination address.
 *
 *	ira->ira_flags: Contains the IRAF_TX_MAC_EXEMPTABLE and
 *		IRAF_TX_SHARED_ADDR flags
 *
 *	For TCP connections, the lookup order is as follows:
 *		5-tuple {src, dst, protocol, local port, remote port}
 *			lookup in ipcl_conn_fanout table.
 *		3-tuple {dst, remote port, protocol} lookup in
 *			ipcl_bind_fanout table.
 *
 *	For UDP connections, a 5-tuple {src, dst, protocol, local port,
 *	remote port} lookup is done on ipcl_udp_fanout. Note that
 *	these interfaces do not handle cases where a packet belongs
 *	to multiple UDP clients, which is handled in IP itself.
 *
 * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must
 * determine which actual zone gets the segment.  This is used only in a
 * labeled environment.  The matching rules are:
 *
 *	- If it's not a multilevel port, then the label on the packet selects
 *	  the zone.  Unlabeled packets are delivered to the global zone.
 *
 *	- If it's a multilevel port, then only the zone registered to receive
 *	  packets on that port matches.
 *
 * Also, in a labeled environment, packet labels need to be checked.
 * For fully bound TCP connections, we can assume that the packet label was
 * checked during connection establishment, and doesn't need to be checked on
 * each packet.  For others, though, we need to check for strict equality or,
 * for multilevel ports, membership in the range or set.  This part currently
 * does a tnrh lookup on each packet, but could be optimized to use cached
 * results if that were necessary.  (SCTP doesn't come through here, but if it
 * did, we would apply the same rules as TCP.)
 *
 * An implication of the above is that fully-bound TCP sockets must always use
 * distinct 4-tuples; they can't be discriminated by label alone.
 *
 * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
 * as there's no connection set-up handshake and no shared state.
 *
 * Labels on looped-back packets within a single zone do not need to be
 * checked, as all processes in the same zone have the same label.
 *
 * Finally, for unlabeled packets received by a labeled system, special rules
 * apply.  We consider only the MLP if there is one.  Otherwise, we prefer a
 * socket in the zone whose label matches the default label of the sender, if
 * any.  In any event, the receiving socket must have SO_MAC_EXEMPT set and the
 * receiver's label must dominate the sender's default label.
 *
 * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcpha_t *, int, ip_stack);
 * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
 *					 ip_stack);
 *
 *	Lookup routine to find an exact match for {src, dst, local port,
 *	remote port} for TCP connections in ipcl_conn_fanout. The address and
 *	ports are read from the IP and TCP header respectively.
 *
 * conn_t	*ipcl_lookup_listener_v4(lport, laddr, protocol,
 *					 zoneid, ip_stack);
 * conn_t	*ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex,
 *					 zoneid, ip_stack);
 *
 *	Lookup routine to find a listener with the tuple {lport, laddr,
 *	protocol} in the ipcl_bind_fanout table. For IPv6, an additional
 *	parameter interface index is also compared.
 *
 * void ipcl_walk(func, arg, ip_stack)
 *
 *	Apply 'func' to every connection available. The 'func' is called as
 *	(*func)(connp, arg). The walk is non-atomic so connections may be
 *	created and destroyed during the walk. The CONN_CONDEMNED and
 *	CONN_INCIPIENT flags ensure that connections which are newly created
 *	or being destroyed are not selected by the walker.
 *
 * Table Updates
 * -------------
 *
 * int ipcl_conn_insert(connp);
 * int ipcl_conn_insert_v4(connp);
 * int ipcl_conn_insert_v6(connp);
 *
 *	Insert 'connp' in the ipcl_conn_fanout.
 *	Arguments :
 *		connp		conn_t to be inserted
 *
 *	Return value :
 *		0		if connp was inserted
 *		EADDRINUSE	if the connection with the same tuple
 *				already exists.
 *
 * int ipcl_bind_insert(connp);
 * int ipcl_bind_insert_v4(connp);
 * int ipcl_bind_insert_v6(connp);
 *
 *	Insert 'connp' in ipcl_bind_fanout.
 *	Arguments :
 *		connp		conn_t to be inserted
 *
 *
 * void ipcl_hash_remove(connp);
 *
 *	Removes the 'connp' from the connection fanout table.
 *
 * Connection Creation/Destruction
 * -------------------------------
 *
 * conn_t *ipcl_conn_create(type, sleep, netstack_t *)
 *
 *	Creates a new conn based on the type flag, inserts it into
 *	globalhash table.
 *
 *	type:	This flag determines the type of conn_t which needs to be
 *		created i.e., which kmem_cache it comes from.
221 * IPCL_TCPCONN indicates a TCP connection 222 * IPCL_SCTPCONN indicates a SCTP connection 223 * IPCL_UDPCONN indicates a UDP conn_t. 224 * IPCL_RAWIPCONN indicates a RAWIP/ICMP conn_t. 225 * IPCL_RTSCONN indicates a RTS conn_t. 226 * IPCL_IPCCONN indicates all other connections. 227 * 228 * void ipcl_conn_destroy(connp) 229 * 230 * Destroys the connection state, removes it from the global 231 * connection hash table and frees its memory. 232 */ 233 234 #include <sys/types.h> 235 #include <sys/stream.h> 236 #include <sys/stropts.h> 237 #include <sys/sysmacros.h> 238 #include <sys/strsubr.h> 239 #include <sys/strsun.h> 240 #define _SUN_TPI_VERSION 2 241 #include <sys/ddi.h> 242 #include <sys/cmn_err.h> 243 #include <sys/debug.h> 244 245 #include <sys/systm.h> 246 #include <sys/param.h> 247 #include <sys/kmem.h> 248 #include <sys/isa_defs.h> 249 #include <inet/common.h> 250 #include <netinet/ip6.h> 251 #include <netinet/icmp6.h> 252 253 #include <inet/ip.h> 254 #include <inet/ip_if.h> 255 #include <inet/ip_ire.h> 256 #include <inet/ip6.h> 257 #include <inet/ip_ndp.h> 258 #include <inet/ip_impl.h> 259 #include <inet/udp_impl.h> 260 #include <inet/sctp_ip.h> 261 #include <inet/sctp/sctp_impl.h> 262 #include <inet/rawip_impl.h> 263 #include <inet/rts_impl.h> 264 #include <inet/iptun/iptun_impl.h> 265 266 #include <sys/cpuvar.h> 267 268 #include <inet/ipclassifier.h> 269 #include <inet/tcp.h> 270 #include <inet/ipsec_impl.h> 271 272 #include <sys/tsol/tnet.h> 273 #include <sys/sockio.h> 274 275 /* Old value for compatibility. Setable in /etc/system */ 276 uint_t tcp_conn_hash_size = 0; 277 278 /* New value. Zero means choose automatically. Setable in /etc/system */ 279 uint_t ipcl_conn_hash_size = 0; 280 uint_t ipcl_conn_hash_memfactor = 8192; 281 uint_t ipcl_conn_hash_maxsize = 82500; 282 283 /* bind/udp fanout table size */ 284 uint_t ipcl_bind_fanout_size = 512; 285 uint_t ipcl_udp_fanout_size = 16384; 286 287 /* Raw socket fanout size. Must be a power of 2. 
*/ 288 uint_t ipcl_raw_fanout_size = 256; 289 290 /* 291 * The IPCL_IPTUN_HASH() function works best with a prime table size. We 292 * expect that most large deployments would have hundreds of tunnels, and 293 * thousands in the extreme case. 294 */ 295 uint_t ipcl_iptun_fanout_size = 6143; 296 297 /* 298 * Power of 2^N Primes useful for hashing for N of 0-28, 299 * these primes are the nearest prime <= 2^N - 2^(N-2). 300 */ 301 302 #define P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067, \ 303 6143, 12281, 24571, 49139, 98299, 196597, 393209, \ 304 786431, 1572853, 3145721, 6291449, 12582893, 25165813, \ 305 50331599, 100663291, 201326557, 0} 306 307 /* 308 * wrapper structure to ensure that conn and what follows it (tcp_t, etc) 309 * are aligned on cache lines. 310 */ 311 typedef union itc_s { 312 conn_t itc_conn; 313 char itcu_filler[CACHE_ALIGN(conn_s)]; 314 } itc_t; 315 316 struct kmem_cache *tcp_conn_cache; 317 struct kmem_cache *ip_conn_cache; 318 extern struct kmem_cache *sctp_conn_cache; 319 struct kmem_cache *udp_conn_cache; 320 struct kmem_cache *rawip_conn_cache; 321 struct kmem_cache *rts_conn_cache; 322 323 extern void tcp_timermp_free(tcp_t *); 324 extern mblk_t *tcp_timermp_alloc(int); 325 326 static int ip_conn_constructor(void *, void *, int); 327 static void ip_conn_destructor(void *, void *); 328 329 static int tcp_conn_constructor(void *, void *, int); 330 static void tcp_conn_destructor(void *, void *); 331 332 static int udp_conn_constructor(void *, void *, int); 333 static void udp_conn_destructor(void *, void *); 334 335 static int rawip_conn_constructor(void *, void *, int); 336 static void rawip_conn_destructor(void *, void *); 337 338 static int rts_conn_constructor(void *, void *, int); 339 static void rts_conn_destructor(void *, void *); 340 341 /* 342 * Global (for all stack instances) init routine 343 */ 344 void 345 ipcl_g_init(void) 346 { 347 ip_conn_cache = kmem_cache_create("ip_conn_cache", 348 sizeof (conn_t), 
CACHE_ALIGN_SIZE, 349 ip_conn_constructor, ip_conn_destructor, 350 NULL, NULL, NULL, 0); 351 352 tcp_conn_cache = kmem_cache_create("tcp_conn_cache", 353 sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE, 354 tcp_conn_constructor, tcp_conn_destructor, 355 tcp_conn_reclaim, NULL, NULL, 0); 356 357 udp_conn_cache = kmem_cache_create("udp_conn_cache", 358 sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE, 359 udp_conn_constructor, udp_conn_destructor, 360 NULL, NULL, NULL, 0); 361 362 rawip_conn_cache = kmem_cache_create("rawip_conn_cache", 363 sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE, 364 rawip_conn_constructor, rawip_conn_destructor, 365 NULL, NULL, NULL, 0); 366 367 rts_conn_cache = kmem_cache_create("rts_conn_cache", 368 sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE, 369 rts_conn_constructor, rts_conn_destructor, 370 NULL, NULL, NULL, 0); 371 } 372 373 /* 374 * ipclassifier intialization routine, sets up hash tables. 375 */ 376 void 377 ipcl_init(ip_stack_t *ipst) 378 { 379 int i; 380 int sizes[] = P2Ps(); 381 382 /* 383 * Calculate size of conn fanout table from /etc/system settings 384 */ 385 if (ipcl_conn_hash_size != 0) { 386 ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size; 387 } else if (tcp_conn_hash_size != 0) { 388 ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size; 389 } else { 390 extern pgcnt_t freemem; 391 392 ipst->ips_ipcl_conn_fanout_size = 393 (freemem * PAGESIZE) / ipcl_conn_hash_memfactor; 394 395 if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) { 396 ipst->ips_ipcl_conn_fanout_size = 397 ipcl_conn_hash_maxsize; 398 } 399 } 400 401 for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) { 402 if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) { 403 break; 404 } 405 } 406 if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) { 407 /* Out of range, use the 2^16 value */ 408 ipst->ips_ipcl_conn_fanout_size = sizes[16]; 409 } 410 411 /* Take values from /etc/system */ 412 ipst->ips_ipcl_bind_fanout_size = 
ipcl_bind_fanout_size; 413 ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size; 414 ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size; 415 ipst->ips_ipcl_iptun_fanout_size = ipcl_iptun_fanout_size; 416 417 ASSERT(ipst->ips_ipcl_conn_fanout == NULL); 418 419 ipst->ips_ipcl_conn_fanout = kmem_zalloc( 420 ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP); 421 422 for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) { 423 mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL, 424 MUTEX_DEFAULT, NULL); 425 } 426 427 ipst->ips_ipcl_bind_fanout = kmem_zalloc( 428 ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP); 429 430 for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) { 431 mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL, 432 MUTEX_DEFAULT, NULL); 433 } 434 435 ipst->ips_ipcl_proto_fanout_v4 = kmem_zalloc(IPPROTO_MAX * 436 sizeof (connf_t), KM_SLEEP); 437 for (i = 0; i < IPPROTO_MAX; i++) { 438 mutex_init(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock, NULL, 439 MUTEX_DEFAULT, NULL); 440 } 441 442 ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX * 443 sizeof (connf_t), KM_SLEEP); 444 for (i = 0; i < IPPROTO_MAX; i++) { 445 mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL, 446 MUTEX_DEFAULT, NULL); 447 } 448 449 ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP); 450 mutex_init(&ipst->ips_rts_clients->connf_lock, 451 NULL, MUTEX_DEFAULT, NULL); 452 453 ipst->ips_ipcl_udp_fanout = kmem_zalloc( 454 ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP); 455 for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) { 456 mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL, 457 MUTEX_DEFAULT, NULL); 458 } 459 460 ipst->ips_ipcl_iptun_fanout = kmem_zalloc( 461 ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t), KM_SLEEP); 462 for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) { 463 mutex_init(&ipst->ips_ipcl_iptun_fanout[i].connf_lock, NULL, 464 MUTEX_DEFAULT, NULL); 465 } 466 467 
ipst->ips_ipcl_raw_fanout = kmem_zalloc( 468 ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP); 469 for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) { 470 mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL, 471 MUTEX_DEFAULT, NULL); 472 } 473 474 ipst->ips_ipcl_globalhash_fanout = kmem_zalloc( 475 sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP); 476 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 477 mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock, 478 NULL, MUTEX_DEFAULT, NULL); 479 } 480 } 481 482 void 483 ipcl_g_destroy(void) 484 { 485 kmem_cache_destroy(ip_conn_cache); 486 kmem_cache_destroy(tcp_conn_cache); 487 kmem_cache_destroy(udp_conn_cache); 488 kmem_cache_destroy(rawip_conn_cache); 489 kmem_cache_destroy(rts_conn_cache); 490 } 491 492 /* 493 * All user-level and kernel use of the stack must be gone 494 * by now. 495 */ 496 void 497 ipcl_destroy(ip_stack_t *ipst) 498 { 499 int i; 500 501 for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) { 502 ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL); 503 mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock); 504 } 505 kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size * 506 sizeof (connf_t)); 507 ipst->ips_ipcl_conn_fanout = NULL; 508 509 for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) { 510 ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL); 511 mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock); 512 } 513 kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size * 514 sizeof (connf_t)); 515 ipst->ips_ipcl_bind_fanout = NULL; 516 517 for (i = 0; i < IPPROTO_MAX; i++) { 518 ASSERT(ipst->ips_ipcl_proto_fanout_v4[i].connf_head == NULL); 519 mutex_destroy(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock); 520 } 521 kmem_free(ipst->ips_ipcl_proto_fanout_v4, 522 IPPROTO_MAX * sizeof (connf_t)); 523 ipst->ips_ipcl_proto_fanout_v4 = NULL; 524 525 for (i = 0; i < IPPROTO_MAX; i++) { 526 ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head 
== NULL); 527 mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock); 528 } 529 kmem_free(ipst->ips_ipcl_proto_fanout_v6, 530 IPPROTO_MAX * sizeof (connf_t)); 531 ipst->ips_ipcl_proto_fanout_v6 = NULL; 532 533 for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) { 534 ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL); 535 mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock); 536 } 537 kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size * 538 sizeof (connf_t)); 539 ipst->ips_ipcl_udp_fanout = NULL; 540 541 for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) { 542 ASSERT(ipst->ips_ipcl_iptun_fanout[i].connf_head == NULL); 543 mutex_destroy(&ipst->ips_ipcl_iptun_fanout[i].connf_lock); 544 } 545 kmem_free(ipst->ips_ipcl_iptun_fanout, 546 ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t)); 547 ipst->ips_ipcl_iptun_fanout = NULL; 548 549 for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) { 550 ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL); 551 mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock); 552 } 553 kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size * 554 sizeof (connf_t)); 555 ipst->ips_ipcl_raw_fanout = NULL; 556 557 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 558 ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL); 559 mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 560 } 561 kmem_free(ipst->ips_ipcl_globalhash_fanout, 562 sizeof (connf_t) * CONN_G_HASH_SIZE); 563 ipst->ips_ipcl_globalhash_fanout = NULL; 564 565 ASSERT(ipst->ips_rts_clients->connf_head == NULL); 566 mutex_destroy(&ipst->ips_rts_clients->connf_lock); 567 kmem_free(ipst->ips_rts_clients, sizeof (connf_t)); 568 ipst->ips_rts_clients = NULL; 569 } 570 571 /* 572 * conn creation routine. initialize the conn, sets the reference 573 * and inserts it in the global hash table. 
574 */ 575 conn_t * 576 ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns) 577 { 578 conn_t *connp; 579 struct kmem_cache *conn_cache; 580 581 switch (type) { 582 case IPCL_SCTPCONN: 583 if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL) 584 return (NULL); 585 sctp_conn_init(connp); 586 netstack_hold(ns); 587 connp->conn_netstack = ns; 588 connp->conn_ixa->ixa_ipst = ns->netstack_ip; 589 connp->conn_ixa->ixa_conn_id = (long)connp; 590 ipcl_globalhash_insert(connp); 591 return (connp); 592 593 case IPCL_TCPCONN: 594 conn_cache = tcp_conn_cache; 595 break; 596 597 case IPCL_UDPCONN: 598 conn_cache = udp_conn_cache; 599 break; 600 601 case IPCL_RAWIPCONN: 602 conn_cache = rawip_conn_cache; 603 break; 604 605 case IPCL_RTSCONN: 606 conn_cache = rts_conn_cache; 607 break; 608 609 case IPCL_IPCCONN: 610 conn_cache = ip_conn_cache; 611 break; 612 613 default: 614 connp = NULL; 615 ASSERT(0); 616 } 617 618 if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL) 619 return (NULL); 620 621 connp->conn_ref = 1; 622 netstack_hold(ns); 623 connp->conn_netstack = ns; 624 connp->conn_ixa->ixa_ipst = ns->netstack_ip; 625 connp->conn_ixa->ixa_conn_id = (long)connp; 626 ipcl_globalhash_insert(connp); 627 return (connp); 628 } 629 630 void 631 ipcl_conn_destroy(conn_t *connp) 632 { 633 mblk_t *mp; 634 netstack_t *ns = connp->conn_netstack; 635 636 ASSERT(!MUTEX_HELD(&connp->conn_lock)); 637 ASSERT(connp->conn_ref == 0); 638 ASSERT(connp->conn_ioctlref == 0); 639 640 DTRACE_PROBE1(conn__destroy, conn_t *, connp); 641 642 if (connp->conn_cred != NULL) { 643 crfree(connp->conn_cred); 644 connp->conn_cred = NULL; 645 /* ixa_cred done in ipcl_conn_cleanup below */ 646 } 647 648 if (connp->conn_ht_iphc != NULL) { 649 kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated); 650 connp->conn_ht_iphc = NULL; 651 connp->conn_ht_iphc_allocated = 0; 652 connp->conn_ht_iphc_len = 0; 653 connp->conn_ht_ulp = NULL; 654 connp->conn_ht_ulp_len = 0; 655 } 656 
ip_pkt_free(&connp->conn_xmit_ipp); 657 658 ipcl_globalhash_remove(connp); 659 660 if (connp->conn_latch != NULL) { 661 IPLATCH_REFRELE(connp->conn_latch); 662 connp->conn_latch = NULL; 663 } 664 if (connp->conn_latch_in_policy != NULL) { 665 IPPOL_REFRELE(connp->conn_latch_in_policy); 666 connp->conn_latch_in_policy = NULL; 667 } 668 if (connp->conn_latch_in_action != NULL) { 669 IPACT_REFRELE(connp->conn_latch_in_action); 670 connp->conn_latch_in_action = NULL; 671 } 672 if (connp->conn_policy != NULL) { 673 IPPH_REFRELE(connp->conn_policy, ns); 674 connp->conn_policy = NULL; 675 } 676 677 if (connp->conn_ipsec_opt_mp != NULL) { 678 freemsg(connp->conn_ipsec_opt_mp); 679 connp->conn_ipsec_opt_mp = NULL; 680 } 681 682 if (connp->conn_flags & IPCL_TCPCONN) { 683 tcp_t *tcp = connp->conn_tcp; 684 685 tcp_free(tcp); 686 mp = tcp->tcp_timercache; 687 688 tcp->tcp_tcps = NULL; 689 690 /* 691 * tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate 692 * the mblk. 693 */ 694 if (tcp->tcp_rsrv_mp != NULL) { 695 freeb(tcp->tcp_rsrv_mp); 696 tcp->tcp_rsrv_mp = NULL; 697 mutex_destroy(&tcp->tcp_rsrv_mp_lock); 698 } 699 700 ipcl_conn_cleanup(connp); 701 connp->conn_flags = IPCL_TCPCONN; 702 if (ns != NULL) { 703 ASSERT(tcp->tcp_tcps == NULL); 704 connp->conn_netstack = NULL; 705 connp->conn_ixa->ixa_ipst = NULL; 706 netstack_rele(ns); 707 } 708 709 bzero(tcp, sizeof (tcp_t)); 710 711 tcp->tcp_timercache = mp; 712 tcp->tcp_connp = connp; 713 kmem_cache_free(tcp_conn_cache, connp); 714 return; 715 } 716 717 if (connp->conn_flags & IPCL_SCTPCONN) { 718 ASSERT(ns != NULL); 719 sctp_free(connp); 720 return; 721 } 722 723 ipcl_conn_cleanup(connp); 724 if (ns != NULL) { 725 connp->conn_netstack = NULL; 726 connp->conn_ixa->ixa_ipst = NULL; 727 netstack_rele(ns); 728 } 729 730 /* leave conn_priv aka conn_udp, conn_icmp, etc in place. 
*/ 731 if (connp->conn_flags & IPCL_UDPCONN) { 732 connp->conn_flags = IPCL_UDPCONN; 733 kmem_cache_free(udp_conn_cache, connp); 734 } else if (connp->conn_flags & IPCL_RAWIPCONN) { 735 connp->conn_flags = IPCL_RAWIPCONN; 736 connp->conn_proto = IPPROTO_ICMP; 737 connp->conn_ixa->ixa_protocol = connp->conn_proto; 738 kmem_cache_free(rawip_conn_cache, connp); 739 } else if (connp->conn_flags & IPCL_RTSCONN) { 740 connp->conn_flags = IPCL_RTSCONN; 741 kmem_cache_free(rts_conn_cache, connp); 742 } else { 743 connp->conn_flags = IPCL_IPCCONN; 744 ASSERT(connp->conn_flags & IPCL_IPCCONN); 745 ASSERT(connp->conn_priv == NULL); 746 kmem_cache_free(ip_conn_cache, connp); 747 } 748 } 749 750 /* 751 * Running in cluster mode - deregister listener information 752 */ 753 static void 754 ipcl_conn_unlisten(conn_t *connp) 755 { 756 ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0); 757 ASSERT(connp->conn_lport != 0); 758 759 if (cl_inet_unlisten != NULL) { 760 sa_family_t addr_family; 761 uint8_t *laddrp; 762 763 if (connp->conn_ipversion == IPV6_VERSION) { 764 addr_family = AF_INET6; 765 laddrp = (uint8_t *)&connp->conn_bound_addr_v6; 766 } else { 767 addr_family = AF_INET; 768 laddrp = (uint8_t *)&connp->conn_bound_addr_v4; 769 } 770 (*cl_inet_unlisten)(connp->conn_netstack->netstack_stackid, 771 IPPROTO_TCP, addr_family, laddrp, connp->conn_lport, NULL); 772 } 773 connp->conn_flags &= ~IPCL_CL_LISTENER; 774 } 775 776 /* 777 * We set the IPCL_REMOVED flag (instead of clearing the flag indicating 778 * which table the conn belonged to). So for debugging we can see which hash 779 * table this connection was in. 
780 */ 781 #define IPCL_HASH_REMOVE(connp) { \ 782 connf_t *connfp = (connp)->conn_fanout; \ 783 ASSERT(!MUTEX_HELD(&((connp)->conn_lock))); \ 784 if (connfp != NULL) { \ 785 mutex_enter(&connfp->connf_lock); \ 786 if ((connp)->conn_next != NULL) \ 787 (connp)->conn_next->conn_prev = \ 788 (connp)->conn_prev; \ 789 if ((connp)->conn_prev != NULL) \ 790 (connp)->conn_prev->conn_next = \ 791 (connp)->conn_next; \ 792 else \ 793 connfp->connf_head = (connp)->conn_next; \ 794 (connp)->conn_fanout = NULL; \ 795 (connp)->conn_next = NULL; \ 796 (connp)->conn_prev = NULL; \ 797 (connp)->conn_flags |= IPCL_REMOVED; \ 798 if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0) \ 799 ipcl_conn_unlisten((connp)); \ 800 CONN_DEC_REF((connp)); \ 801 mutex_exit(&connfp->connf_lock); \ 802 } \ 803 } 804 805 void 806 ipcl_hash_remove(conn_t *connp) 807 { 808 uint8_t protocol = connp->conn_proto; 809 810 IPCL_HASH_REMOVE(connp); 811 if (protocol == IPPROTO_RSVP) 812 ill_set_inputfn_all(connp->conn_netstack->netstack_ip); 813 } 814 815 /* 816 * The whole purpose of this function is allow removal of 817 * a conn_t from the connected hash for timewait reclaim. 818 * This is essentially a TW reclaim fastpath where timewait 819 * collector checks under fanout lock (so no one else can 820 * get access to the conn_t) that refcnt is 2 i.e. one for 821 * TCP and one for the classifier hash list. If ref count 822 * is indeed 2, we can just remove the conn under lock and 823 * avoid cleaning up the conn under squeue. This gives us 824 * improved performance. 
825 */ 826 void 827 ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp) 828 { 829 ASSERT(MUTEX_HELD(&connfp->connf_lock)); 830 ASSERT(MUTEX_HELD(&connp->conn_lock)); 831 ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0); 832 833 if ((connp)->conn_next != NULL) { 834 (connp)->conn_next->conn_prev = (connp)->conn_prev; 835 } 836 if ((connp)->conn_prev != NULL) { 837 (connp)->conn_prev->conn_next = (connp)->conn_next; 838 } else { 839 connfp->connf_head = (connp)->conn_next; 840 } 841 (connp)->conn_fanout = NULL; 842 (connp)->conn_next = NULL; 843 (connp)->conn_prev = NULL; 844 (connp)->conn_flags |= IPCL_REMOVED; 845 ASSERT((connp)->conn_ref == 2); 846 (connp)->conn_ref--; 847 } 848 849 #define IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) { \ 850 ASSERT((connp)->conn_fanout == NULL); \ 851 ASSERT((connp)->conn_next == NULL); \ 852 ASSERT((connp)->conn_prev == NULL); \ 853 if ((connfp)->connf_head != NULL) { \ 854 (connfp)->connf_head->conn_prev = (connp); \ 855 (connp)->conn_next = (connfp)->connf_head; \ 856 } \ 857 (connp)->conn_fanout = (connfp); \ 858 (connfp)->connf_head = (connp); \ 859 (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ 860 IPCL_CONNECTED; \ 861 CONN_INC_REF(connp); \ 862 } 863 864 #define IPCL_HASH_INSERT_CONNECTED(connfp, connp) { \ 865 IPCL_HASH_REMOVE((connp)); \ 866 mutex_enter(&(connfp)->connf_lock); \ 867 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); \ 868 mutex_exit(&(connfp)->connf_lock); \ 869 } 870 871 #define IPCL_HASH_INSERT_BOUND(connfp, connp) { \ 872 conn_t *pconnp = NULL, *nconnp; \ 873 IPCL_HASH_REMOVE((connp)); \ 874 mutex_enter(&(connfp)->connf_lock); \ 875 nconnp = (connfp)->connf_head; \ 876 while (nconnp != NULL && \ 877 !_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6)) { \ 878 pconnp = nconnp; \ 879 nconnp = nconnp->conn_next; \ 880 } \ 881 if (pconnp != NULL) { \ 882 pconnp->conn_next = (connp); \ 883 (connp)->conn_prev = pconnp; \ 884 } else { \ 885 (connfp)->connf_head = (connp); \ 886 } \ 887 if 
(nconnp != NULL) { \ 888 (connp)->conn_next = nconnp; \ 889 nconnp->conn_prev = (connp); \ 890 } \ 891 (connp)->conn_fanout = (connfp); \ 892 (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ 893 IPCL_BOUND; \ 894 CONN_INC_REF(connp); \ 895 mutex_exit(&(connfp)->connf_lock); \ 896 } 897 898 #define IPCL_HASH_INSERT_WILDCARD(connfp, connp) { \ 899 conn_t **list, *prev, *next; \ 900 boolean_t isv4mapped = \ 901 IN6_IS_ADDR_V4MAPPED(&(connp)->conn_laddr_v6); \ 902 IPCL_HASH_REMOVE((connp)); \ 903 mutex_enter(&(connfp)->connf_lock); \ 904 list = &(connfp)->connf_head; \ 905 prev = NULL; \ 906 while ((next = *list) != NULL) { \ 907 if (isv4mapped && \ 908 IN6_IS_ADDR_UNSPECIFIED(&next->conn_laddr_v6) && \ 909 connp->conn_zoneid == next->conn_zoneid) { \ 910 (connp)->conn_next = next; \ 911 if (prev != NULL) \ 912 prev = next->conn_prev; \ 913 next->conn_prev = (connp); \ 914 break; \ 915 } \ 916 list = &next->conn_next; \ 917 prev = next; \ 918 } \ 919 (connp)->conn_prev = prev; \ 920 *list = (connp); \ 921 (connp)->conn_fanout = (connfp); \ 922 (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ 923 IPCL_BOUND; \ 924 CONN_INC_REF((connp)); \ 925 mutex_exit(&(connfp)->connf_lock); \ 926 } 927 928 void 929 ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp) 930 { 931 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 932 } 933 934 /* 935 * Because the classifier is used to classify inbound packets, the destination 936 * address is meant to be our local tunnel address (tunnel source), and the 937 * source the remote tunnel address (tunnel destination). 938 * 939 * Note that conn_proto can't be used for fanout since the upper protocol 940 * can be both 41 and 4 when IPv6 and IPv4 are over the same tunnel. 
941 */ 942 conn_t * 943 ipcl_iptun_classify_v4(ipaddr_t *src, ipaddr_t *dst, ip_stack_t *ipst) 944 { 945 connf_t *connfp; 946 conn_t *connp; 947 948 /* first look for IPv4 tunnel links */ 949 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst, *src)]; 950 mutex_enter(&connfp->connf_lock); 951 for (connp = connfp->connf_head; connp != NULL; 952 connp = connp->conn_next) { 953 if (IPCL_IPTUN_MATCH(connp, *dst, *src)) 954 break; 955 } 956 if (connp != NULL) 957 goto done; 958 959 mutex_exit(&connfp->connf_lock); 960 961 /* We didn't find an IPv4 tunnel, try a 6to4 tunnel */ 962 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst, 963 INADDR_ANY)]; 964 mutex_enter(&connfp->connf_lock); 965 for (connp = connfp->connf_head; connp != NULL; 966 connp = connp->conn_next) { 967 if (IPCL_IPTUN_MATCH(connp, *dst, INADDR_ANY)) 968 break; 969 } 970 done: 971 if (connp != NULL) 972 CONN_INC_REF(connp); 973 mutex_exit(&connfp->connf_lock); 974 return (connp); 975 } 976 977 conn_t * 978 ipcl_iptun_classify_v6(in6_addr_t *src, in6_addr_t *dst, ip_stack_t *ipst) 979 { 980 connf_t *connfp; 981 conn_t *connp; 982 983 /* Look for an IPv6 tunnel link */ 984 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(dst, src)]; 985 mutex_enter(&connfp->connf_lock); 986 for (connp = connfp->connf_head; connp != NULL; 987 connp = connp->conn_next) { 988 if (IPCL_IPTUN_MATCH_V6(connp, dst, src)) { 989 CONN_INC_REF(connp); 990 break; 991 } 992 } 993 mutex_exit(&connfp->connf_lock); 994 return (connp); 995 } 996 997 /* 998 * This function is used only for inserting SCTP raw socket now. 999 * This may change later. 1000 * 1001 * Note that only one raw socket can be bound to a port. The param 1002 * lport is in network byte order. 
1003 */ 1004 static int 1005 ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport) 1006 { 1007 connf_t *connfp; 1008 conn_t *oconnp; 1009 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1010 1011 connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)]; 1012 1013 /* Check for existing raw socket already bound to the port. */ 1014 mutex_enter(&connfp->connf_lock); 1015 for (oconnp = connfp->connf_head; oconnp != NULL; 1016 oconnp = oconnp->conn_next) { 1017 if (oconnp->conn_lport == lport && 1018 oconnp->conn_zoneid == connp->conn_zoneid && 1019 oconnp->conn_family == connp->conn_family && 1020 ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) || 1021 IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_laddr_v6) || 1022 IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6) || 1023 IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_laddr_v6)) || 1024 IN6_ARE_ADDR_EQUAL(&oconnp->conn_laddr_v6, 1025 &connp->conn_laddr_v6))) { 1026 break; 1027 } 1028 } 1029 mutex_exit(&connfp->connf_lock); 1030 if (oconnp != NULL) 1031 return (EADDRNOTAVAIL); 1032 1033 if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) || 1034 IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { 1035 if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) || 1036 IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) { 1037 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1038 } else { 1039 IPCL_HASH_INSERT_BOUND(connfp, connp); 1040 } 1041 } else { 1042 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1043 } 1044 return (0); 1045 } 1046 1047 static int 1048 ipcl_iptun_hash_insert(conn_t *connp, ip_stack_t *ipst) 1049 { 1050 connf_t *connfp; 1051 conn_t *tconnp; 1052 ipaddr_t laddr = connp->conn_laddr_v4; 1053 ipaddr_t faddr = connp->conn_faddr_v4; 1054 1055 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(laddr, faddr)]; 1056 mutex_enter(&connfp->connf_lock); 1057 for (tconnp = connfp->connf_head; tconnp != NULL; 1058 tconnp = tconnp->conn_next) { 1059 if (IPCL_IPTUN_MATCH(tconnp, laddr, faddr)) { 1060 /* A tunnel is already bound to 
these addresses. */ 1061 mutex_exit(&connfp->connf_lock); 1062 return (EADDRINUSE); 1063 } 1064 } 1065 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); 1066 mutex_exit(&connfp->connf_lock); 1067 return (0); 1068 } 1069 1070 static int 1071 ipcl_iptun_hash_insert_v6(conn_t *connp, ip_stack_t *ipst) 1072 { 1073 connf_t *connfp; 1074 conn_t *tconnp; 1075 in6_addr_t *laddr = &connp->conn_laddr_v6; 1076 in6_addr_t *faddr = &connp->conn_faddr_v6; 1077 1078 connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(laddr, faddr)]; 1079 mutex_enter(&connfp->connf_lock); 1080 for (tconnp = connfp->connf_head; tconnp != NULL; 1081 tconnp = tconnp->conn_next) { 1082 if (IPCL_IPTUN_MATCH_V6(tconnp, laddr, faddr)) { 1083 /* A tunnel is already bound to these addresses. */ 1084 mutex_exit(&connfp->connf_lock); 1085 return (EADDRINUSE); 1086 } 1087 } 1088 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); 1089 mutex_exit(&connfp->connf_lock); 1090 return (0); 1091 } 1092 1093 /* 1094 * Check for a MAC exemption conflict on a labeled system. Note that for 1095 * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the 1096 * transport layer. This check is for binding all other protocols. 1097 * 1098 * Returns true if there's a conflict. 
1099 */ 1100 static boolean_t 1101 check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst) 1102 { 1103 connf_t *connfp; 1104 conn_t *tconn; 1105 1106 connfp = &ipst->ips_ipcl_proto_fanout_v4[connp->conn_proto]; 1107 mutex_enter(&connfp->connf_lock); 1108 for (tconn = connfp->connf_head; tconn != NULL; 1109 tconn = tconn->conn_next) { 1110 /* We don't allow v4 fallback for v6 raw socket */ 1111 if (connp->conn_family != tconn->conn_family) 1112 continue; 1113 /* If neither is exempt, then there's no conflict */ 1114 if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) && 1115 (tconn->conn_mac_mode == CONN_MAC_DEFAULT)) 1116 continue; 1117 /* We are only concerned about sockets for a different zone */ 1118 if (connp->conn_zoneid == tconn->conn_zoneid) 1119 continue; 1120 /* If both are bound to different specific addrs, ok */ 1121 if (connp->conn_laddr_v4 != INADDR_ANY && 1122 tconn->conn_laddr_v4 != INADDR_ANY && 1123 connp->conn_laddr_v4 != tconn->conn_laddr_v4) 1124 continue; 1125 /* These two conflict; fail */ 1126 break; 1127 } 1128 mutex_exit(&connfp->connf_lock); 1129 return (tconn != NULL); 1130 } 1131 1132 static boolean_t 1133 check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst) 1134 { 1135 connf_t *connfp; 1136 conn_t *tconn; 1137 1138 connfp = &ipst->ips_ipcl_proto_fanout_v6[connp->conn_proto]; 1139 mutex_enter(&connfp->connf_lock); 1140 for (tconn = connfp->connf_head; tconn != NULL; 1141 tconn = tconn->conn_next) { 1142 /* We don't allow v4 fallback for v6 raw socket */ 1143 if (connp->conn_family != tconn->conn_family) 1144 continue; 1145 /* If neither is exempt, then there's no conflict */ 1146 if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) && 1147 (tconn->conn_mac_mode == CONN_MAC_DEFAULT)) 1148 continue; 1149 /* We are only concerned about sockets for a different zone */ 1150 if (connp->conn_zoneid == tconn->conn_zoneid) 1151 continue; 1152 /* If both are bound to different addrs, ok */ 1153 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) && 
1154 !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_laddr_v6) && 1155 !IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6, 1156 &tconn->conn_laddr_v6)) 1157 continue; 1158 /* These two conflict; fail */ 1159 break; 1160 } 1161 mutex_exit(&connfp->connf_lock); 1162 return (tconn != NULL); 1163 } 1164 1165 /* 1166 * (v4, v6) bind hash insertion routines 1167 * The caller has already setup the conn (conn_proto, conn_laddr_v6, conn_lport) 1168 */ 1169 1170 int 1171 ipcl_bind_insert(conn_t *connp) 1172 { 1173 if (connp->conn_ipversion == IPV6_VERSION) 1174 return (ipcl_bind_insert_v6(connp)); 1175 else 1176 return (ipcl_bind_insert_v4(connp)); 1177 } 1178 1179 int 1180 ipcl_bind_insert_v4(conn_t *connp) 1181 { 1182 connf_t *connfp; 1183 int ret = 0; 1184 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1185 uint16_t lport = connp->conn_lport; 1186 uint8_t protocol = connp->conn_proto; 1187 1188 if (IPCL_IS_IPTUN(connp)) 1189 return (ipcl_iptun_hash_insert(connp, ipst)); 1190 1191 switch (protocol) { 1192 default: 1193 if (is_system_labeled() && 1194 check_exempt_conflict_v4(connp, ipst)) 1195 return (EADDRINUSE); 1196 /* FALLTHROUGH */ 1197 case IPPROTO_UDP: 1198 if (protocol == IPPROTO_UDP) { 1199 connfp = &ipst->ips_ipcl_udp_fanout[ 1200 IPCL_UDP_HASH(lport, ipst)]; 1201 } else { 1202 connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol]; 1203 } 1204 1205 if (connp->conn_faddr_v4 != INADDR_ANY) { 1206 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1207 } else if (connp->conn_laddr_v4 != INADDR_ANY) { 1208 IPCL_HASH_INSERT_BOUND(connfp, connp); 1209 } else { 1210 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1211 } 1212 if (protocol == IPPROTO_RSVP) 1213 ill_set_inputfn_all(ipst); 1214 break; 1215 1216 case IPPROTO_TCP: 1217 /* Insert it in the Bind Hash */ 1218 ASSERT(connp->conn_zoneid != ALL_ZONES); 1219 connfp = &ipst->ips_ipcl_bind_fanout[ 1220 IPCL_BIND_HASH(lport, ipst)]; 1221 if (connp->conn_laddr_v4 != INADDR_ANY) { 1222 IPCL_HASH_INSERT_BOUND(connfp, connp); 1223 } else { 1224 
IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1225 } 1226 if (cl_inet_listen != NULL) { 1227 ASSERT(connp->conn_ipversion == IPV4_VERSION); 1228 connp->conn_flags |= IPCL_CL_LISTENER; 1229 (*cl_inet_listen)( 1230 connp->conn_netstack->netstack_stackid, 1231 IPPROTO_TCP, AF_INET, 1232 (uint8_t *)&connp->conn_bound_addr_v4, lport, NULL); 1233 } 1234 break; 1235 1236 case IPPROTO_SCTP: 1237 ret = ipcl_sctp_hash_insert(connp, lport); 1238 break; 1239 } 1240 1241 return (ret); 1242 } 1243 1244 int 1245 ipcl_bind_insert_v6(conn_t *connp) 1246 { 1247 connf_t *connfp; 1248 int ret = 0; 1249 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1250 uint16_t lport = connp->conn_lport; 1251 uint8_t protocol = connp->conn_proto; 1252 1253 if (IPCL_IS_IPTUN(connp)) { 1254 return (ipcl_iptun_hash_insert_v6(connp, ipst)); 1255 } 1256 1257 switch (protocol) { 1258 default: 1259 if (is_system_labeled() && 1260 check_exempt_conflict_v6(connp, ipst)) 1261 return (EADDRINUSE); 1262 /* FALLTHROUGH */ 1263 case IPPROTO_UDP: 1264 if (protocol == IPPROTO_UDP) { 1265 connfp = &ipst->ips_ipcl_udp_fanout[ 1266 IPCL_UDP_HASH(lport, ipst)]; 1267 } else { 1268 connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol]; 1269 } 1270 1271 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) { 1272 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1273 } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { 1274 IPCL_HASH_INSERT_BOUND(connfp, connp); 1275 } else { 1276 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1277 } 1278 break; 1279 1280 case IPPROTO_TCP: 1281 /* Insert it in the Bind Hash */ 1282 ASSERT(connp->conn_zoneid != ALL_ZONES); 1283 connfp = &ipst->ips_ipcl_bind_fanout[ 1284 IPCL_BIND_HASH(lport, ipst)]; 1285 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { 1286 IPCL_HASH_INSERT_BOUND(connfp, connp); 1287 } else { 1288 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1289 } 1290 if (cl_inet_listen != NULL) { 1291 sa_family_t addr_family; 1292 uint8_t *laddrp; 1293 1294 if (connp->conn_ipversion == 
IPV6_VERSION) { 1295 addr_family = AF_INET6; 1296 laddrp = 1297 (uint8_t *)&connp->conn_bound_addr_v6; 1298 } else { 1299 addr_family = AF_INET; 1300 laddrp = (uint8_t *)&connp->conn_bound_addr_v4; 1301 } 1302 connp->conn_flags |= IPCL_CL_LISTENER; 1303 (*cl_inet_listen)( 1304 connp->conn_netstack->netstack_stackid, 1305 IPPROTO_TCP, addr_family, laddrp, lport, NULL); 1306 } 1307 break; 1308 1309 case IPPROTO_SCTP: 1310 ret = ipcl_sctp_hash_insert(connp, lport); 1311 break; 1312 } 1313 1314 return (ret); 1315 } 1316 1317 /* 1318 * ipcl_conn_hash insertion routines. 1319 * The caller has already set conn_proto and the addresses/ports in the conn_t. 1320 */ 1321 1322 int 1323 ipcl_conn_insert(conn_t *connp) 1324 { 1325 if (connp->conn_ipversion == IPV6_VERSION) 1326 return (ipcl_conn_insert_v6(connp)); 1327 else 1328 return (ipcl_conn_insert_v4(connp)); 1329 } 1330 1331 int 1332 ipcl_conn_insert_v4(conn_t *connp) 1333 { 1334 connf_t *connfp; 1335 conn_t *tconnp; 1336 int ret = 0; 1337 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1338 uint16_t lport = connp->conn_lport; 1339 uint8_t protocol = connp->conn_proto; 1340 1341 if (IPCL_IS_IPTUN(connp)) 1342 return (ipcl_iptun_hash_insert(connp, ipst)); 1343 1344 switch (protocol) { 1345 case IPPROTO_TCP: 1346 /* 1347 * For TCP, we check whether the connection tuple already 1348 * exists before allowing the connection to proceed. We 1349 * also allow indexing on the zoneid. This is to allow 1350 * multiple shared stack zones to have the same tcp 1351 * connection tuple. In practice this only happens for 1352 * INADDR_LOOPBACK as it's the only local address which 1353 * doesn't have to be unique. 
1354 */ 1355 connfp = &ipst->ips_ipcl_conn_fanout[ 1356 IPCL_CONN_HASH(connp->conn_faddr_v4, 1357 connp->conn_ports, ipst)]; 1358 mutex_enter(&connfp->connf_lock); 1359 for (tconnp = connfp->connf_head; tconnp != NULL; 1360 tconnp = tconnp->conn_next) { 1361 if (IPCL_CONN_MATCH(tconnp, connp->conn_proto, 1362 connp->conn_faddr_v4, connp->conn_laddr_v4, 1363 connp->conn_ports) && 1364 IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) { 1365 /* Already have a conn. bail out */ 1366 mutex_exit(&connfp->connf_lock); 1367 return (EADDRINUSE); 1368 } 1369 } 1370 if (connp->conn_fanout != NULL) { 1371 /* 1372 * Probably a XTI/TLI application trying to do a 1373 * rebind. Let it happen. 1374 */ 1375 mutex_exit(&connfp->connf_lock); 1376 IPCL_HASH_REMOVE(connp); 1377 mutex_enter(&connfp->connf_lock); 1378 } 1379 1380 ASSERT(connp->conn_recv != NULL); 1381 ASSERT(connp->conn_recvicmp != NULL); 1382 1383 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); 1384 mutex_exit(&connfp->connf_lock); 1385 break; 1386 1387 case IPPROTO_SCTP: 1388 /* 1389 * The raw socket may have already been bound, remove it 1390 * from the hash first. 1391 */ 1392 IPCL_HASH_REMOVE(connp); 1393 ret = ipcl_sctp_hash_insert(connp, lport); 1394 break; 1395 1396 default: 1397 /* 1398 * Check for conflicts among MAC exempt bindings. For 1399 * transports with port numbers, this is done by the upper 1400 * level per-transport binding logic. For all others, it's 1401 * done here. 
1402 */ 1403 if (is_system_labeled() && 1404 check_exempt_conflict_v4(connp, ipst)) 1405 return (EADDRINUSE); 1406 /* FALLTHROUGH */ 1407 1408 case IPPROTO_UDP: 1409 if (protocol == IPPROTO_UDP) { 1410 connfp = &ipst->ips_ipcl_udp_fanout[ 1411 IPCL_UDP_HASH(lport, ipst)]; 1412 } else { 1413 connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol]; 1414 } 1415 1416 if (connp->conn_faddr_v4 != INADDR_ANY) { 1417 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1418 } else if (connp->conn_laddr_v4 != INADDR_ANY) { 1419 IPCL_HASH_INSERT_BOUND(connfp, connp); 1420 } else { 1421 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1422 } 1423 break; 1424 } 1425 1426 return (ret); 1427 } 1428 1429 int 1430 ipcl_conn_insert_v6(conn_t *connp) 1431 { 1432 connf_t *connfp; 1433 conn_t *tconnp; 1434 int ret = 0; 1435 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 1436 uint16_t lport = connp->conn_lport; 1437 uint8_t protocol = connp->conn_proto; 1438 uint_t ifindex = connp->conn_bound_if; 1439 1440 if (IPCL_IS_IPTUN(connp)) 1441 return (ipcl_iptun_hash_insert_v6(connp, ipst)); 1442 1443 switch (protocol) { 1444 case IPPROTO_TCP: 1445 1446 /* 1447 * For tcp, we check whether the connection tuple already 1448 * exists before allowing the connection to proceed. We 1449 * also allow indexing on the zoneid. This is to allow 1450 * multiple shared stack zones to have the same tcp 1451 * connection tuple. In practice this only happens for 1452 * ipv6_loopback as it's the only local address which 1453 * doesn't have to be unique. 1454 */ 1455 connfp = &ipst->ips_ipcl_conn_fanout[ 1456 IPCL_CONN_HASH_V6(connp->conn_faddr_v6, connp->conn_ports, 1457 ipst)]; 1458 mutex_enter(&connfp->connf_lock); 1459 for (tconnp = connfp->connf_head; tconnp != NULL; 1460 tconnp = tconnp->conn_next) { 1461 /* NOTE: need to match zoneid. 
Bug in onnv-gate */ 1462 if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_proto, 1463 connp->conn_faddr_v6, connp->conn_laddr_v6, 1464 connp->conn_ports) && 1465 (tconnp->conn_bound_if == 0 || 1466 tconnp->conn_bound_if == ifindex) && 1467 IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) { 1468 /* Already have a conn. bail out */ 1469 mutex_exit(&connfp->connf_lock); 1470 return (EADDRINUSE); 1471 } 1472 } 1473 if (connp->conn_fanout != NULL) { 1474 /* 1475 * Probably a XTI/TLI application trying to do a 1476 * rebind. Let it happen. 1477 */ 1478 mutex_exit(&connfp->connf_lock); 1479 IPCL_HASH_REMOVE(connp); 1480 mutex_enter(&connfp->connf_lock); 1481 } 1482 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); 1483 mutex_exit(&connfp->connf_lock); 1484 break; 1485 1486 case IPPROTO_SCTP: 1487 IPCL_HASH_REMOVE(connp); 1488 ret = ipcl_sctp_hash_insert(connp, lport); 1489 break; 1490 1491 default: 1492 if (is_system_labeled() && 1493 check_exempt_conflict_v6(connp, ipst)) 1494 return (EADDRINUSE); 1495 /* FALLTHROUGH */ 1496 case IPPROTO_UDP: 1497 if (protocol == IPPROTO_UDP) { 1498 connfp = &ipst->ips_ipcl_udp_fanout[ 1499 IPCL_UDP_HASH(lport, ipst)]; 1500 } else { 1501 connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol]; 1502 } 1503 1504 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) { 1505 IPCL_HASH_INSERT_CONNECTED(connfp, connp); 1506 } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { 1507 IPCL_HASH_INSERT_BOUND(connfp, connp); 1508 } else { 1509 IPCL_HASH_INSERT_WILDCARD(connfp, connp); 1510 } 1511 break; 1512 } 1513 1514 return (ret); 1515 } 1516 1517 /* 1518 * v4 packet classifying function. looks up the fanout table to 1519 * find the conn, the packet belongs to. returns the conn with 1520 * the reference held, null otherwise. 1521 * 1522 * If zoneid is ALL_ZONES, then the search rules described in the "Connection 1523 * Lookup" comment block are applied. Labels are also checked as described 1524 * above. 
If the packet is from the inside (looped back), and is from the same 1525 * zone, then label checks are omitted. 1526 */ 1527 conn_t * 1528 ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, 1529 ip_recv_attr_t *ira, ip_stack_t *ipst) 1530 { 1531 ipha_t *ipha; 1532 connf_t *connfp, *bind_connfp; 1533 uint16_t lport; 1534 uint16_t fport; 1535 uint32_t ports; 1536 conn_t *connp; 1537 uint16_t *up; 1538 zoneid_t zoneid = ira->ira_zoneid; 1539 1540 ipha = (ipha_t *)mp->b_rptr; 1541 up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET); 1542 1543 switch (protocol) { 1544 case IPPROTO_TCP: 1545 ports = *(uint32_t *)up; 1546 connfp = 1547 &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src, 1548 ports, ipst)]; 1549 mutex_enter(&connfp->connf_lock); 1550 for (connp = connfp->connf_head; connp != NULL; 1551 connp = connp->conn_next) { 1552 if (IPCL_CONN_MATCH(connp, protocol, 1553 ipha->ipha_src, ipha->ipha_dst, ports) && 1554 (connp->conn_zoneid == zoneid || 1555 connp->conn_allzones || 1556 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && 1557 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) && 1558 (ira->ira_flags & IRAF_TX_SHARED_ADDR)))) 1559 break; 1560 } 1561 1562 if (connp != NULL) { 1563 /* 1564 * We have a fully-bound TCP connection. 1565 * 1566 * For labeled systems, there's no need to check the 1567 * label here. It's known to be good as we checked 1568 * before allowing the connection to become bound. 
1569 */ 1570 CONN_INC_REF(connp); 1571 mutex_exit(&connfp->connf_lock); 1572 return (connp); 1573 } 1574 1575 mutex_exit(&connfp->connf_lock); 1576 lport = up[1]; 1577 bind_connfp = 1578 &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)]; 1579 mutex_enter(&bind_connfp->connf_lock); 1580 for (connp = bind_connfp->connf_head; connp != NULL; 1581 connp = connp->conn_next) { 1582 if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst, 1583 lport) && 1584 (connp->conn_zoneid == zoneid || 1585 connp->conn_allzones || 1586 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && 1587 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) && 1588 (ira->ira_flags & IRAF_TX_SHARED_ADDR)))) 1589 break; 1590 } 1591 1592 /* 1593 * If the matching connection is SLP on a private address, then 1594 * the label on the packet must match the local zone's label. 1595 * Otherwise, it must be in the label range defined by tnrh. 1596 * This is ensured by tsol_receive_local. 1597 * 1598 * Note that we don't check tsol_receive_local for 1599 * the connected case. 
1600 */ 1601 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) && 1602 !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION, 1603 ira, connp)) { 1604 DTRACE_PROBE3(tx__ip__log__info__classify__tcp, 1605 char *, "connp(1) could not receive mp(2)", 1606 conn_t *, connp, mblk_t *, mp); 1607 connp = NULL; 1608 } 1609 1610 if (connp != NULL) { 1611 /* Have a listener at least */ 1612 CONN_INC_REF(connp); 1613 mutex_exit(&bind_connfp->connf_lock); 1614 return (connp); 1615 } 1616 1617 mutex_exit(&bind_connfp->connf_lock); 1618 break; 1619 1620 case IPPROTO_UDP: 1621 lport = up[1]; 1622 fport = up[0]; 1623 connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)]; 1624 mutex_enter(&connfp->connf_lock); 1625 for (connp = connfp->connf_head; connp != NULL; 1626 connp = connp->conn_next) { 1627 if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst, 1628 fport, ipha->ipha_src) && 1629 (connp->conn_zoneid == zoneid || 1630 connp->conn_allzones || 1631 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && 1632 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE)))) 1633 break; 1634 } 1635 1636 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) && 1637 !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION, 1638 ira, connp)) { 1639 DTRACE_PROBE3(tx__ip__log__info__classify__udp, 1640 char *, "connp(1) could not receive mp(2)", 1641 conn_t *, connp, mblk_t *, mp); 1642 connp = NULL; 1643 } 1644 1645 if (connp != NULL) { 1646 CONN_INC_REF(connp); 1647 mutex_exit(&connfp->connf_lock); 1648 return (connp); 1649 } 1650 1651 /* 1652 * We shouldn't come here for multicast/broadcast packets 1653 */ 1654 mutex_exit(&connfp->connf_lock); 1655 1656 break; 1657 1658 case IPPROTO_ENCAP: 1659 case IPPROTO_IPV6: 1660 return (ipcl_iptun_classify_v4(&ipha->ipha_src, 1661 &ipha->ipha_dst, ipst)); 1662 } 1663 1664 return (NULL); 1665 } 1666 1667 conn_t * 1668 ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, 1669 ip_recv_attr_t *ira, ip_stack_t *ipst) 1670 { 1671 ip6_t *ip6h; 1672 
connf_t *connfp, *bind_connfp; 1673 uint16_t lport; 1674 uint16_t fport; 1675 tcpha_t *tcpha; 1676 uint32_t ports; 1677 conn_t *connp; 1678 uint16_t *up; 1679 zoneid_t zoneid = ira->ira_zoneid; 1680 1681 ip6h = (ip6_t *)mp->b_rptr; 1682 1683 switch (protocol) { 1684 case IPPROTO_TCP: 1685 tcpha = (tcpha_t *)&mp->b_rptr[hdr_len]; 1686 up = &tcpha->tha_lport; 1687 ports = *(uint32_t *)up; 1688 1689 connfp = 1690 &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src, 1691 ports, ipst)]; 1692 mutex_enter(&connfp->connf_lock); 1693 for (connp = connfp->connf_head; connp != NULL; 1694 connp = connp->conn_next) { 1695 if (IPCL_CONN_MATCH_V6(connp, protocol, 1696 ip6h->ip6_src, ip6h->ip6_dst, ports) && 1697 (connp->conn_zoneid == zoneid || 1698 connp->conn_allzones || 1699 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && 1700 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) && 1701 (ira->ira_flags & IRAF_TX_SHARED_ADDR)))) 1702 break; 1703 } 1704 1705 if (connp != NULL) { 1706 /* 1707 * We have a fully-bound TCP connection. 1708 * 1709 * For labeled systems, there's no need to check the 1710 * label here. It's known to be good as we checked 1711 * before allowing the connection to become bound. 
1712 */ 1713 CONN_INC_REF(connp); 1714 mutex_exit(&connfp->connf_lock); 1715 return (connp); 1716 } 1717 1718 mutex_exit(&connfp->connf_lock); 1719 1720 lport = up[1]; 1721 bind_connfp = 1722 &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)]; 1723 mutex_enter(&bind_connfp->connf_lock); 1724 for (connp = bind_connfp->connf_head; connp != NULL; 1725 connp = connp->conn_next) { 1726 if (IPCL_BIND_MATCH_V6(connp, protocol, 1727 ip6h->ip6_dst, lport) && 1728 (connp->conn_zoneid == zoneid || 1729 connp->conn_allzones || 1730 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && 1731 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) && 1732 (ira->ira_flags & IRAF_TX_SHARED_ADDR)))) 1733 break; 1734 } 1735 1736 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) && 1737 !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION, 1738 ira, connp)) { 1739 DTRACE_PROBE3(tx__ip__log__info__classify__tcp6, 1740 char *, "connp(1) could not receive mp(2)", 1741 conn_t *, connp, mblk_t *, mp); 1742 connp = NULL; 1743 } 1744 1745 if (connp != NULL) { 1746 /* Have a listner at least */ 1747 CONN_INC_REF(connp); 1748 mutex_exit(&bind_connfp->connf_lock); 1749 return (connp); 1750 } 1751 1752 mutex_exit(&bind_connfp->connf_lock); 1753 break; 1754 1755 case IPPROTO_UDP: 1756 up = (uint16_t *)&mp->b_rptr[hdr_len]; 1757 lport = up[1]; 1758 fport = up[0]; 1759 connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)]; 1760 mutex_enter(&connfp->connf_lock); 1761 for (connp = connfp->connf_head; connp != NULL; 1762 connp = connp->conn_next) { 1763 if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst, 1764 fport, ip6h->ip6_src) && 1765 (connp->conn_zoneid == zoneid || 1766 connp->conn_allzones || 1767 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && 1768 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) && 1769 (ira->ira_flags & IRAF_TX_SHARED_ADDR)))) 1770 break; 1771 } 1772 1773 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) && 1774 !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION, 
1775 ira, connp)) { 1776 DTRACE_PROBE3(tx__ip__log__info__classify__udp6, 1777 char *, "connp(1) could not receive mp(2)", 1778 conn_t *, connp, mblk_t *, mp); 1779 connp = NULL; 1780 } 1781 1782 if (connp != NULL) { 1783 CONN_INC_REF(connp); 1784 mutex_exit(&connfp->connf_lock); 1785 return (connp); 1786 } 1787 1788 /* 1789 * We shouldn't come here for multicast/broadcast packets 1790 */ 1791 mutex_exit(&connfp->connf_lock); 1792 break; 1793 case IPPROTO_ENCAP: 1794 case IPPROTO_IPV6: 1795 return (ipcl_iptun_classify_v6(&ip6h->ip6_src, 1796 &ip6h->ip6_dst, ipst)); 1797 } 1798 1799 return (NULL); 1800 } 1801 1802 /* 1803 * wrapper around ipcl_classify_(v4,v6) routines. 1804 */ 1805 conn_t * 1806 ipcl_classify(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst) 1807 { 1808 if (ira->ira_flags & IRAF_IS_IPV4) { 1809 return (ipcl_classify_v4(mp, ira->ira_protocol, 1810 ira->ira_ip_hdr_length, ira, ipst)); 1811 } else { 1812 return (ipcl_classify_v6(mp, ira->ira_protocol, 1813 ira->ira_ip_hdr_length, ira, ipst)); 1814 } 1815 } 1816 1817 /* 1818 * Only used to classify SCTP RAW sockets 1819 */ 1820 conn_t * 1821 ipcl_classify_raw(mblk_t *mp, uint8_t protocol, uint32_t ports, 1822 ipha_t *ipha, ip6_t *ip6h, ip_recv_attr_t *ira, ip_stack_t *ipst) 1823 { 1824 connf_t *connfp; 1825 conn_t *connp; 1826 in_port_t lport; 1827 int ipversion; 1828 const void *dst; 1829 zoneid_t zoneid = ira->ira_zoneid; 1830 1831 lport = ((uint16_t *)&ports)[1]; 1832 if (ira->ira_flags & IRAF_IS_IPV4) { 1833 dst = (const void *)&ipha->ipha_dst; 1834 ipversion = IPV4_VERSION; 1835 } else { 1836 dst = (const void *)&ip6h->ip6_dst; 1837 ipversion = IPV6_VERSION; 1838 } 1839 1840 connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)]; 1841 mutex_enter(&connfp->connf_lock); 1842 for (connp = connfp->connf_head; connp != NULL; 1843 connp = connp->conn_next) { 1844 /* We don't allow v4 fallback for v6 raw socket. 
*/ 1845 if (ipversion != connp->conn_ipversion) 1846 continue; 1847 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) && 1848 !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { 1849 if (ipversion == IPV4_VERSION) { 1850 if (!IPCL_CONN_MATCH(connp, protocol, 1851 ipha->ipha_src, ipha->ipha_dst, ports)) 1852 continue; 1853 } else { 1854 if (!IPCL_CONN_MATCH_V6(connp, protocol, 1855 ip6h->ip6_src, ip6h->ip6_dst, ports)) 1856 continue; 1857 } 1858 } else { 1859 if (ipversion == IPV4_VERSION) { 1860 if (!IPCL_BIND_MATCH(connp, protocol, 1861 ipha->ipha_dst, lport)) 1862 continue; 1863 } else { 1864 if (!IPCL_BIND_MATCH_V6(connp, protocol, 1865 ip6h->ip6_dst, lport)) 1866 continue; 1867 } 1868 } 1869 1870 if (connp->conn_zoneid == zoneid || 1871 connp->conn_allzones || 1872 ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && 1873 (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) && 1874 (ira->ira_flags & IRAF_TX_SHARED_ADDR))) 1875 break; 1876 } 1877 1878 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) && 1879 !tsol_receive_local(mp, dst, ipversion, ira, connp)) { 1880 DTRACE_PROBE3(tx__ip__log__info__classify__rawip, 1881 char *, "connp(1) could not receive mp(2)", 1882 conn_t *, connp, mblk_t *, mp); 1883 connp = NULL; 1884 } 1885 1886 if (connp != NULL) 1887 goto found; 1888 mutex_exit(&connfp->connf_lock); 1889 1890 /* Try to look for a wildcard SCTP RAW socket match. */ 1891 connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)]; 1892 mutex_enter(&connfp->connf_lock); 1893 for (connp = connfp->connf_head; connp != NULL; 1894 connp = connp->conn_next) { 1895 /* We don't allow v4 fallback for v6 raw socket. 
*/ 1896 if (ipversion != connp->conn_ipversion) 1897 continue; 1898 if (!IPCL_ZONE_MATCH(connp, zoneid)) 1899 continue; 1900 1901 if (ipversion == IPV4_VERSION) { 1902 if (IPCL_RAW_MATCH(connp, protocol, ipha->ipha_dst)) 1903 break; 1904 } else { 1905 if (IPCL_RAW_MATCH_V6(connp, protocol, ip6h->ip6_dst)) { 1906 break; 1907 } 1908 } 1909 } 1910 1911 if (connp != NULL) 1912 goto found; 1913 1914 mutex_exit(&connfp->connf_lock); 1915 return (NULL); 1916 1917 found: 1918 ASSERT(connp != NULL); 1919 CONN_INC_REF(connp); 1920 mutex_exit(&connfp->connf_lock); 1921 return (connp); 1922 } 1923 1924 /* ARGSUSED */ 1925 static int 1926 tcp_conn_constructor(void *buf, void *cdrarg, int kmflags) 1927 { 1928 itc_t *itc = (itc_t *)buf; 1929 conn_t *connp = &itc->itc_conn; 1930 tcp_t *tcp = (tcp_t *)&itc[1]; 1931 1932 bzero(connp, sizeof (conn_t)); 1933 bzero(tcp, sizeof (tcp_t)); 1934 1935 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); 1936 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); 1937 cv_init(&connp->conn_sq_cv, NULL, CV_DEFAULT, NULL); 1938 tcp->tcp_timercache = tcp_timermp_alloc(kmflags); 1939 if (tcp->tcp_timercache == NULL) 1940 return (ENOMEM); 1941 connp->conn_tcp = tcp; 1942 connp->conn_flags = IPCL_TCPCONN; 1943 connp->conn_proto = IPPROTO_TCP; 1944 tcp->tcp_connp = connp; 1945 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL); 1946 1947 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags); 1948 if (connp->conn_ixa == NULL) { 1949 tcp_timermp_free(tcp); 1950 return (ENOMEM); 1951 } 1952 connp->conn_ixa->ixa_refcnt = 1; 1953 connp->conn_ixa->ixa_protocol = connp->conn_proto; 1954 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp); 1955 return (0); 1956 } 1957 1958 /* ARGSUSED */ 1959 static void 1960 tcp_conn_destructor(void *buf, void *cdrarg) 1961 { 1962 itc_t *itc = (itc_t *)buf; 1963 conn_t *connp = &itc->itc_conn; 1964 tcp_t *tcp = (tcp_t *)&itc[1]; 1965 1966 ASSERT(connp->conn_flags & IPCL_TCPCONN); 1967 
ASSERT(tcp->tcp_connp == connp); 1968 ASSERT(connp->conn_tcp == tcp); 1969 tcp_timermp_free(tcp); 1970 mutex_destroy(&connp->conn_lock); 1971 cv_destroy(&connp->conn_cv); 1972 cv_destroy(&connp->conn_sq_cv); 1973 rw_destroy(&connp->conn_ilg_lock); 1974 1975 /* Can be NULL if constructor failed */ 1976 if (connp->conn_ixa != NULL) { 1977 ASSERT(connp->conn_ixa->ixa_refcnt == 1); 1978 ASSERT(connp->conn_ixa->ixa_ire == NULL); 1979 ASSERT(connp->conn_ixa->ixa_nce == NULL); 1980 ixa_refrele(connp->conn_ixa); 1981 } 1982 } 1983 1984 /* ARGSUSED */ 1985 static int 1986 ip_conn_constructor(void *buf, void *cdrarg, int kmflags) 1987 { 1988 itc_t *itc = (itc_t *)buf; 1989 conn_t *connp = &itc->itc_conn; 1990 1991 bzero(connp, sizeof (conn_t)); 1992 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); 1993 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); 1994 connp->conn_flags = IPCL_IPCCONN; 1995 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL); 1996 1997 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags); 1998 if (connp->conn_ixa == NULL) 1999 return (ENOMEM); 2000 connp->conn_ixa->ixa_refcnt = 1; 2001 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp); 2002 return (0); 2003 } 2004 2005 /* ARGSUSED */ 2006 static void 2007 ip_conn_destructor(void *buf, void *cdrarg) 2008 { 2009 itc_t *itc = (itc_t *)buf; 2010 conn_t *connp = &itc->itc_conn; 2011 2012 ASSERT(connp->conn_flags & IPCL_IPCCONN); 2013 ASSERT(connp->conn_priv == NULL); 2014 mutex_destroy(&connp->conn_lock); 2015 cv_destroy(&connp->conn_cv); 2016 rw_destroy(&connp->conn_ilg_lock); 2017 2018 /* Can be NULL if constructor failed */ 2019 if (connp->conn_ixa != NULL) { 2020 ASSERT(connp->conn_ixa->ixa_refcnt == 1); 2021 ASSERT(connp->conn_ixa->ixa_ire == NULL); 2022 ASSERT(connp->conn_ixa->ixa_nce == NULL); 2023 ixa_refrele(connp->conn_ixa); 2024 } 2025 } 2026 2027 /* ARGSUSED */ 2028 static int 2029 udp_conn_constructor(void *buf, void *cdrarg, int kmflags) 2030 { 2031 itc_t *itc = 
(itc_t *)buf; 2032 conn_t *connp = &itc->itc_conn; 2033 udp_t *udp = (udp_t *)&itc[1]; 2034 2035 bzero(connp, sizeof (conn_t)); 2036 bzero(udp, sizeof (udp_t)); 2037 2038 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); 2039 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); 2040 connp->conn_udp = udp; 2041 connp->conn_flags = IPCL_UDPCONN; 2042 connp->conn_proto = IPPROTO_UDP; 2043 udp->udp_connp = connp; 2044 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL); 2045 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags); 2046 if (connp->conn_ixa == NULL) 2047 return (ENOMEM); 2048 connp->conn_ixa->ixa_refcnt = 1; 2049 connp->conn_ixa->ixa_protocol = connp->conn_proto; 2050 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp); 2051 return (0); 2052 } 2053 2054 /* ARGSUSED */ 2055 static void 2056 udp_conn_destructor(void *buf, void *cdrarg) 2057 { 2058 itc_t *itc = (itc_t *)buf; 2059 conn_t *connp = &itc->itc_conn; 2060 udp_t *udp = (udp_t *)&itc[1]; 2061 2062 ASSERT(connp->conn_flags & IPCL_UDPCONN); 2063 ASSERT(udp->udp_connp == connp); 2064 ASSERT(connp->conn_udp == udp); 2065 mutex_destroy(&connp->conn_lock); 2066 cv_destroy(&connp->conn_cv); 2067 rw_destroy(&connp->conn_ilg_lock); 2068 2069 /* Can be NULL if constructor failed */ 2070 if (connp->conn_ixa != NULL) { 2071 ASSERT(connp->conn_ixa->ixa_refcnt == 1); 2072 ASSERT(connp->conn_ixa->ixa_ire == NULL); 2073 ASSERT(connp->conn_ixa->ixa_nce == NULL); 2074 ixa_refrele(connp->conn_ixa); 2075 } 2076 } 2077 2078 /* ARGSUSED */ 2079 static int 2080 rawip_conn_constructor(void *buf, void *cdrarg, int kmflags) 2081 { 2082 itc_t *itc = (itc_t *)buf; 2083 conn_t *connp = &itc->itc_conn; 2084 icmp_t *icmp = (icmp_t *)&itc[1]; 2085 2086 bzero(connp, sizeof (conn_t)); 2087 bzero(icmp, sizeof (icmp_t)); 2088 2089 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); 2090 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); 2091 connp->conn_icmp = icmp; 2092 connp->conn_flags = IPCL_RAWIPCONN; 
2093 connp->conn_proto = IPPROTO_ICMP; 2094 icmp->icmp_connp = connp; 2095 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL); 2096 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags); 2097 if (connp->conn_ixa == NULL) 2098 return (ENOMEM); 2099 connp->conn_ixa->ixa_refcnt = 1; 2100 connp->conn_ixa->ixa_protocol = connp->conn_proto; 2101 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp); 2102 return (0); 2103 } 2104 2105 /* ARGSUSED */ 2106 static void 2107 rawip_conn_destructor(void *buf, void *cdrarg) 2108 { 2109 itc_t *itc = (itc_t *)buf; 2110 conn_t *connp = &itc->itc_conn; 2111 icmp_t *icmp = (icmp_t *)&itc[1]; 2112 2113 ASSERT(connp->conn_flags & IPCL_RAWIPCONN); 2114 ASSERT(icmp->icmp_connp == connp); 2115 ASSERT(connp->conn_icmp == icmp); 2116 mutex_destroy(&connp->conn_lock); 2117 cv_destroy(&connp->conn_cv); 2118 rw_destroy(&connp->conn_ilg_lock); 2119 2120 /* Can be NULL if constructor failed */ 2121 if (connp->conn_ixa != NULL) { 2122 ASSERT(connp->conn_ixa->ixa_refcnt == 1); 2123 ASSERT(connp->conn_ixa->ixa_ire == NULL); 2124 ASSERT(connp->conn_ixa->ixa_nce == NULL); 2125 ixa_refrele(connp->conn_ixa); 2126 } 2127 } 2128 2129 /* ARGSUSED */ 2130 static int 2131 rts_conn_constructor(void *buf, void *cdrarg, int kmflags) 2132 { 2133 itc_t *itc = (itc_t *)buf; 2134 conn_t *connp = &itc->itc_conn; 2135 rts_t *rts = (rts_t *)&itc[1]; 2136 2137 bzero(connp, sizeof (conn_t)); 2138 bzero(rts, sizeof (rts_t)); 2139 2140 mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); 2141 cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); 2142 connp->conn_rts = rts; 2143 connp->conn_flags = IPCL_RTSCONN; 2144 rts->rts_connp = connp; 2145 rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL); 2146 connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags); 2147 if (connp->conn_ixa == NULL) 2148 return (ENOMEM); 2149 connp->conn_ixa->ixa_refcnt = 1; 2150 connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp); 2151 return (0); 2152 } 2153 
2154 /* ARGSUSED */ 2155 static void 2156 rts_conn_destructor(void *buf, void *cdrarg) 2157 { 2158 itc_t *itc = (itc_t *)buf; 2159 conn_t *connp = &itc->itc_conn; 2160 rts_t *rts = (rts_t *)&itc[1]; 2161 2162 ASSERT(connp->conn_flags & IPCL_RTSCONN); 2163 ASSERT(rts->rts_connp == connp); 2164 ASSERT(connp->conn_rts == rts); 2165 mutex_destroy(&connp->conn_lock); 2166 cv_destroy(&connp->conn_cv); 2167 rw_destroy(&connp->conn_ilg_lock); 2168 2169 /* Can be NULL if constructor failed */ 2170 if (connp->conn_ixa != NULL) { 2171 ASSERT(connp->conn_ixa->ixa_refcnt == 1); 2172 ASSERT(connp->conn_ixa->ixa_ire == NULL); 2173 ASSERT(connp->conn_ixa->ixa_nce == NULL); 2174 ixa_refrele(connp->conn_ixa); 2175 } 2176 } 2177 2178 /* 2179 * Called as part of ipcl_conn_destroy to assert and clear any pointers 2180 * in the conn_t. 2181 * 2182 * Below we list all the pointers in the conn_t as a documentation aid. 2183 * The ones that we can not ASSERT to be NULL are #ifdef'ed out. 2184 * If you add any pointers to the conn_t please add an ASSERT here 2185 * and #ifdef it out if it can't be actually asserted to be NULL. 2186 * In any case, we bzero most of the conn_t at the end of the function. 
2187 */ 2188 void 2189 ipcl_conn_cleanup(conn_t *connp) 2190 { 2191 ip_xmit_attr_t *ixa; 2192 2193 ASSERT(connp->conn_latch == NULL); 2194 ASSERT(connp->conn_latch_in_policy == NULL); 2195 ASSERT(connp->conn_latch_in_action == NULL); 2196 #ifdef notdef 2197 ASSERT(connp->conn_rq == NULL); 2198 ASSERT(connp->conn_wq == NULL); 2199 #endif 2200 ASSERT(connp->conn_cred == NULL); 2201 ASSERT(connp->conn_g_fanout == NULL); 2202 ASSERT(connp->conn_g_next == NULL); 2203 ASSERT(connp->conn_g_prev == NULL); 2204 ASSERT(connp->conn_policy == NULL); 2205 ASSERT(connp->conn_fanout == NULL); 2206 ASSERT(connp->conn_next == NULL); 2207 ASSERT(connp->conn_prev == NULL); 2208 ASSERT(connp->conn_oper_pending_ill == NULL); 2209 ASSERT(connp->conn_ilg == NULL); 2210 ASSERT(connp->conn_drain_next == NULL); 2211 ASSERT(connp->conn_drain_prev == NULL); 2212 #ifdef notdef 2213 /* conn_idl is not cleared when removed from idl list */ 2214 ASSERT(connp->conn_idl == NULL); 2215 #endif 2216 ASSERT(connp->conn_ipsec_opt_mp == NULL); 2217 #ifdef notdef 2218 /* conn_netstack is cleared by the caller; needed by ixa_cleanup */ 2219 ASSERT(connp->conn_netstack == NULL); 2220 #endif 2221 2222 ASSERT(connp->conn_helper_info == NULL); 2223 ASSERT(connp->conn_ixa != NULL); 2224 ixa = connp->conn_ixa; 2225 ASSERT(ixa->ixa_refcnt == 1); 2226 /* Need to preserve ixa_protocol */ 2227 ixa_cleanup(ixa); 2228 ixa->ixa_flags = 0; 2229 2230 /* Clear out the conn_t fields that are not preserved */ 2231 bzero(&connp->conn_start_clr, 2232 sizeof (conn_t) - 2233 ((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp)); 2234 } 2235 2236 /* 2237 * All conns are inserted in a global multi-list for the benefit of 2238 * walkers. The walk is guaranteed to walk all open conns at the time 2239 * of the start of the walk exactly once. This property is needed to 2240 * achieve some cleanups during unplumb of interfaces. This is achieved 2241 * as follows. 
2242 * 2243 * ipcl_conn_create and ipcl_conn_destroy are the only functions that 2244 * call the insert and delete functions below at creation and deletion 2245 * time respectively. The conn never moves or changes its position in this 2246 * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt 2247 * won't increase due to walkers, once the conn deletion has started. Note 2248 * that we can't remove the conn from the global list and then wait for 2249 * the refcnt to drop to zero, since walkers would then see a truncated 2250 * list. CONN_INCIPIENT ensures that walkers don't start looking at 2251 * conns until ip_open is ready to make them globally visible. 2252 * The global round robin multi-list locks are held only to get the 2253 * next member/insertion/deletion and contention should be negligible 2254 * if the multi-list is much greater than the number of cpus. 2255 */ 2256 void 2257 ipcl_globalhash_insert(conn_t *connp) 2258 { 2259 int index; 2260 struct connf_s *connfp; 2261 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 2262 2263 /* 2264 * No need for atomic here. Approximate even distribution 2265 * in the global lists is sufficient. 2266 */ 2267 ipst->ips_conn_g_index++; 2268 index = ipst->ips_conn_g_index & (CONN_G_HASH_SIZE - 1); 2269 2270 connp->conn_g_prev = NULL; 2271 /* 2272 * Mark as INCIPIENT, so that walkers will ignore this 2273 * for now, till ip_open is ready to make it visible globally. 
2274 */ 2275 connp->conn_state_flags |= CONN_INCIPIENT; 2276 2277 connfp = &ipst->ips_ipcl_globalhash_fanout[index]; 2278 /* Insert at the head of the list */ 2279 mutex_enter(&connfp->connf_lock); 2280 connp->conn_g_next = connfp->connf_head; 2281 if (connp->conn_g_next != NULL) 2282 connp->conn_g_next->conn_g_prev = connp; 2283 connfp->connf_head = connp; 2284 2285 /* The fanout bucket this conn points to */ 2286 connp->conn_g_fanout = connfp; 2287 2288 mutex_exit(&connfp->connf_lock); 2289 } 2290 2291 void 2292 ipcl_globalhash_remove(conn_t *connp) 2293 { 2294 struct connf_s *connfp; 2295 2296 /* 2297 * We were never inserted in the global multi list. 2298 * IPCL_NONE variety is never inserted in the global multilist 2299 * since it is presumed to not need any cleanup and is transient. 2300 */ 2301 if (connp->conn_g_fanout == NULL) 2302 return; 2303 2304 connfp = connp->conn_g_fanout; 2305 mutex_enter(&connfp->connf_lock); 2306 if (connp->conn_g_prev != NULL) 2307 connp->conn_g_prev->conn_g_next = connp->conn_g_next; 2308 else 2309 connfp->connf_head = connp->conn_g_next; 2310 if (connp->conn_g_next != NULL) 2311 connp->conn_g_next->conn_g_prev = connp->conn_g_prev; 2312 mutex_exit(&connfp->connf_lock); 2313 2314 /* Better to stumble on a null pointer than to corrupt memory */ 2315 connp->conn_g_next = NULL; 2316 connp->conn_g_prev = NULL; 2317 connp->conn_g_fanout = NULL; 2318 } 2319 2320 /* 2321 * Walk the list of all conn_t's in the system, calling the function provided 2322 * With the specified argument for each. 2323 * Applies to both IPv4 and IPv6. 2324 * 2325 * CONNs may hold pointers to ills (conn_dhcpinit_ill and 2326 * conn_oper_pending_ill). To guard against stale pointers 2327 * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is 2328 * unplumbed or removed. New conn_t's that are created while we are walking 2329 * may be missed by this walk, because they are not necessarily inserted 2330 * at the tail of the list. 
They are new conn_t's and thus don't have any 2331 * stale pointers. The CONN_CLOSING flag ensures that no new reference 2332 * is created to the struct that is going away. 2333 */ 2334 void 2335 ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst) 2336 { 2337 int i; 2338 conn_t *connp; 2339 conn_t *prev_connp; 2340 2341 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 2342 mutex_enter(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 2343 prev_connp = NULL; 2344 connp = ipst->ips_ipcl_globalhash_fanout[i].connf_head; 2345 while (connp != NULL) { 2346 mutex_enter(&connp->conn_lock); 2347 if (connp->conn_state_flags & 2348 (CONN_CONDEMNED | CONN_INCIPIENT)) { 2349 mutex_exit(&connp->conn_lock); 2350 connp = connp->conn_g_next; 2351 continue; 2352 } 2353 CONN_INC_REF_LOCKED(connp); 2354 mutex_exit(&connp->conn_lock); 2355 mutex_exit( 2356 &ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 2357 (*func)(connp, arg); 2358 if (prev_connp != NULL) 2359 CONN_DEC_REF(prev_connp); 2360 mutex_enter( 2361 &ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 2362 prev_connp = connp; 2363 connp = connp->conn_g_next; 2364 } 2365 mutex_exit(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock); 2366 if (prev_connp != NULL) 2367 CONN_DEC_REF(prev_connp); 2368 } 2369 } 2370 2371 /* 2372 * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on 2373 * the {src, dst, lport, fport} quadruplet. Returns with conn reference 2374 * held; caller must call CONN_DEC_REF. Only checks for connected entries 2375 * (peer tcp in ESTABLISHED state). 2376 */ 2377 conn_t * 2378 ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcpha_t *tcpha, 2379 ip_stack_t *ipst) 2380 { 2381 uint32_t ports; 2382 uint16_t *pports = (uint16_t *)&ports; 2383 connf_t *connfp; 2384 conn_t *tconnp; 2385 boolean_t zone_chk; 2386 2387 /* 2388 * If either the source of destination address is loopback, then 2389 * both endpoints must be in the same Zone. 
Otherwise, both of 2390 * the addresses are system-wide unique (tcp is in ESTABLISHED 2391 * state) and the endpoints may reside in different Zones. 2392 */ 2393 zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) || 2394 ipha->ipha_dst == htonl(INADDR_LOOPBACK)); 2395 2396 pports[0] = tcpha->tha_fport; 2397 pports[1] = tcpha->tha_lport; 2398 2399 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst, 2400 ports, ipst)]; 2401 2402 mutex_enter(&connfp->connf_lock); 2403 for (tconnp = connfp->connf_head; tconnp != NULL; 2404 tconnp = tconnp->conn_next) { 2405 2406 if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP, 2407 ipha->ipha_dst, ipha->ipha_src, ports) && 2408 tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED && 2409 (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) { 2410 2411 ASSERT(tconnp != connp); 2412 CONN_INC_REF(tconnp); 2413 mutex_exit(&connfp->connf_lock); 2414 return (tconnp); 2415 } 2416 } 2417 mutex_exit(&connfp->connf_lock); 2418 return (NULL); 2419 } 2420 2421 /* 2422 * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on 2423 * the {src, dst, lport, fport} quadruplet. Returns with conn reference 2424 * held; caller must call CONN_DEC_REF. Only checks for connected entries 2425 * (peer tcp in ESTABLISHED state). 2426 */ 2427 conn_t * 2428 ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcpha_t *tcpha, 2429 ip_stack_t *ipst) 2430 { 2431 uint32_t ports; 2432 uint16_t *pports = (uint16_t *)&ports; 2433 connf_t *connfp; 2434 conn_t *tconnp; 2435 boolean_t zone_chk; 2436 2437 /* 2438 * If either the source of destination address is loopback, then 2439 * both endpoints must be in the same Zone. Otherwise, both of 2440 * the addresses are system-wide unique (tcp is in ESTABLISHED 2441 * state) and the endpoints may reside in different Zones. 
We 2442 * don't do Zone check for link local address(es) because the 2443 * current Zone implementation treats each link local address as 2444 * being unique per system node, i.e. they belong to global Zone. 2445 */ 2446 zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) || 2447 IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst)); 2448 2449 pports[0] = tcpha->tha_fport; 2450 pports[1] = tcpha->tha_lport; 2451 2452 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst, 2453 ports, ipst)]; 2454 2455 mutex_enter(&connfp->connf_lock); 2456 for (tconnp = connfp->connf_head; tconnp != NULL; 2457 tconnp = tconnp->conn_next) { 2458 2459 /* We skip conn_bound_if check here as this is loopback tcp */ 2460 if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP, 2461 ip6h->ip6_dst, ip6h->ip6_src, ports) && 2462 tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED && 2463 (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) { 2464 2465 ASSERT(tconnp != connp); 2466 CONN_INC_REF(tconnp); 2467 mutex_exit(&connfp->connf_lock); 2468 return (tconnp); 2469 } 2470 } 2471 mutex_exit(&connfp->connf_lock); 2472 return (NULL); 2473 } 2474 2475 /* 2476 * Find an exact {src, dst, lport, fport} match for a bounced datagram. 2477 * Returns with conn reference held. Caller must call CONN_DEC_REF. 2478 * Only checks for connected entries i.e. no INADDR_ANY checks. 
2479 */ 2480 conn_t * 2481 ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcpha_t *tcpha, int min_state, 2482 ip_stack_t *ipst) 2483 { 2484 uint32_t ports; 2485 uint16_t *pports; 2486 connf_t *connfp; 2487 conn_t *tconnp; 2488 2489 pports = (uint16_t *)&ports; 2490 pports[0] = tcpha->tha_fport; 2491 pports[1] = tcpha->tha_lport; 2492 2493 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst, 2494 ports, ipst)]; 2495 2496 mutex_enter(&connfp->connf_lock); 2497 for (tconnp = connfp->connf_head; tconnp != NULL; 2498 tconnp = tconnp->conn_next) { 2499 2500 if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP, 2501 ipha->ipha_dst, ipha->ipha_src, ports) && 2502 tconnp->conn_tcp->tcp_state >= min_state) { 2503 2504 CONN_INC_REF(tconnp); 2505 mutex_exit(&connfp->connf_lock); 2506 return (tconnp); 2507 } 2508 } 2509 mutex_exit(&connfp->connf_lock); 2510 return (NULL); 2511 } 2512 2513 /* 2514 * Find an exact {src, dst, lport, fport} match for a bounced datagram. 2515 * Returns with conn reference held. Caller must call CONN_DEC_REF. 2516 * Only checks for connected entries i.e. no INADDR_ANY checks. 2517 * Match on ifindex in addition to addresses. 
2518 */ 2519 conn_t * 2520 ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state, 2521 uint_t ifindex, ip_stack_t *ipst) 2522 { 2523 tcp_t *tcp; 2524 uint32_t ports; 2525 uint16_t *pports; 2526 connf_t *connfp; 2527 conn_t *tconnp; 2528 2529 pports = (uint16_t *)&ports; 2530 pports[0] = tcpha->tha_fport; 2531 pports[1] = tcpha->tha_lport; 2532 2533 connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst, 2534 ports, ipst)]; 2535 2536 mutex_enter(&connfp->connf_lock); 2537 for (tconnp = connfp->connf_head; tconnp != NULL; 2538 tconnp = tconnp->conn_next) { 2539 2540 tcp = tconnp->conn_tcp; 2541 if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP, 2542 ip6h->ip6_dst, ip6h->ip6_src, ports) && 2543 tcp->tcp_state >= min_state && 2544 (tconnp->conn_bound_if == 0 || 2545 tconnp->conn_bound_if == ifindex)) { 2546 2547 CONN_INC_REF(tconnp); 2548 mutex_exit(&connfp->connf_lock); 2549 return (tconnp); 2550 } 2551 } 2552 mutex_exit(&connfp->connf_lock); 2553 return (NULL); 2554 } 2555 2556 /* 2557 * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate 2558 * a listener when changing state. 2559 */ 2560 conn_t * 2561 ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid, 2562 ip_stack_t *ipst) 2563 { 2564 connf_t *bind_connfp; 2565 conn_t *connp; 2566 tcp_t *tcp; 2567 2568 /* 2569 * Avoid false matches for packets sent to an IP destination of 2570 * all zeros. 
2571 */ 2572 if (laddr == 0) 2573 return (NULL); 2574 2575 ASSERT(zoneid != ALL_ZONES); 2576 2577 bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)]; 2578 mutex_enter(&bind_connfp->connf_lock); 2579 for (connp = bind_connfp->connf_head; connp != NULL; 2580 connp = connp->conn_next) { 2581 tcp = connp->conn_tcp; 2582 if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) && 2583 IPCL_ZONE_MATCH(connp, zoneid) && 2584 (tcp->tcp_listener == NULL)) { 2585 CONN_INC_REF(connp); 2586 mutex_exit(&bind_connfp->connf_lock); 2587 return (connp); 2588 } 2589 } 2590 mutex_exit(&bind_connfp->connf_lock); 2591 return (NULL); 2592 } 2593 2594 /* 2595 * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate 2596 * a listener when changing state. 2597 */ 2598 conn_t * 2599 ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex, 2600 zoneid_t zoneid, ip_stack_t *ipst) 2601 { 2602 connf_t *bind_connfp; 2603 conn_t *connp = NULL; 2604 tcp_t *tcp; 2605 2606 /* 2607 * Avoid false matches for packets sent to an IP destination of 2608 * all zeros. 2609 */ 2610 if (IN6_IS_ADDR_UNSPECIFIED(laddr)) 2611 return (NULL); 2612 2613 ASSERT(zoneid != ALL_ZONES); 2614 2615 bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)]; 2616 mutex_enter(&bind_connfp->connf_lock); 2617 for (connp = bind_connfp->connf_head; connp != NULL; 2618 connp = connp->conn_next) { 2619 tcp = connp->conn_tcp; 2620 if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) && 2621 IPCL_ZONE_MATCH(connp, zoneid) && 2622 (connp->conn_bound_if == 0 || 2623 connp->conn_bound_if == ifindex) && 2624 tcp->tcp_listener == NULL) { 2625 CONN_INC_REF(connp); 2626 mutex_exit(&bind_connfp->connf_lock); 2627 return (connp); 2628 } 2629 } 2630 mutex_exit(&bind_connfp->connf_lock); 2631 return (NULL); 2632 } 2633 2634 /* 2635 * ipcl_get_next_conn 2636 * get the next entry in the conn global list 2637 * and put a reference on the next_conn. 
2638 * decrement the reference on the current conn. 2639 * 2640 * This is an iterator based walker function that also provides for 2641 * some selection by the caller. It walks through the conn_hash bucket 2642 * searching for the next valid connp in the list, and selects connections 2643 * that are neither closed nor condemned. It also REFHOLDS the conn 2644 * thus ensuring that the conn exists when the caller uses the conn. 2645 */ 2646 conn_t * 2647 ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags) 2648 { 2649 conn_t *next_connp; 2650 2651 if (connfp == NULL) 2652 return (NULL); 2653 2654 mutex_enter(&connfp->connf_lock); 2655 2656 next_connp = (connp == NULL) ? 2657 connfp->connf_head : connp->conn_g_next; 2658 2659 while (next_connp != NULL) { 2660 mutex_enter(&next_connp->conn_lock); 2661 if (!(next_connp->conn_flags & conn_flags) || 2662 (next_connp->conn_state_flags & 2663 (CONN_CONDEMNED | CONN_INCIPIENT))) { 2664 /* 2665 * This conn has been condemned or 2666 * is closing, or the flags don't match 2667 */ 2668 mutex_exit(&next_connp->conn_lock); 2669 next_connp = next_connp->conn_g_next; 2670 continue; 2671 } 2672 CONN_INC_REF_LOCKED(next_connp); 2673 mutex_exit(&next_connp->conn_lock); 2674 break; 2675 } 2676 2677 mutex_exit(&connfp->connf_lock); 2678 2679 if (connp != NULL) 2680 CONN_DEC_REF(connp); 2681 2682 return (next_connp); 2683 } 2684 2685 #ifdef CONN_DEBUG 2686 /* 2687 * Trace of the last NBUF refhold/refrele 2688 */ 2689 int 2690 conn_trace_ref(conn_t *connp) 2691 { 2692 int last; 2693 conn_trace_t *ctb; 2694 2695 ASSERT(MUTEX_HELD(&connp->conn_lock)); 2696 last = connp->conn_trace_last; 2697 last++; 2698 if (last == CONN_TRACE_MAX) 2699 last = 0; 2700 2701 ctb = &connp->conn_trace_buf[last]; 2702 ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH); 2703 connp->conn_trace_last = last; 2704 return (1); 2705 } 2706 2707 int 2708 conn_untrace_ref(conn_t *connp) 2709 { 2710 int last; 2711 conn_trace_t *ctb; 2712 2713 
ASSERT(MUTEX_HELD(&connp->conn_lock)); 2714 last = connp->conn_trace_last; 2715 last++; 2716 if (last == CONN_TRACE_MAX) 2717 last = 0; 2718 2719 ctb = &connp->conn_trace_buf[last]; 2720 ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH); 2721 connp->conn_trace_last = last; 2722 return (1); 2723 } 2724 #endif 2725 2726 mblk_t * 2727 conn_get_pid_mblk(conn_t *connp) 2728 { 2729 mblk_t *mblk; 2730 conn_pid_info_t *cpi; 2731 2732 if (connp->conn_upper_handle != NULL) { 2733 return (*connp->conn_upcalls->su_get_sock_pid_mblk) 2734 (connp->conn_upper_handle); 2735 } else if (!IPCL_IS_NONSTR(connp) && connp->conn_rq != NULL && 2736 connp->conn_rq->q_stream != NULL) { 2737 return (sh_get_pid_mblk(connp->conn_rq->q_stream)); 2738 } 2739 2740 /* return an empty mblk */ 2741 if ((mblk = allocb(sizeof (conn_pid_info_t), BPRI_HI)) == NULL) 2742 return (NULL); 2743 mblk->b_wptr += sizeof (conn_pid_info_t); 2744 cpi = (conn_pid_info_t *)mblk->b_datap->db_base; 2745 cpi->cpi_magic = CONN_PID_INFO_MGC; 2746 cpi->cpi_contents = CONN_PID_INFO_NON; 2747 cpi->cpi_pids_cnt = 0; 2748 cpi->cpi_tot_size = sizeof (conn_pid_info_t); 2749 cpi->cpi_pids[0] = 0; 2750 return (mblk); 2751 }