1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 
  25 /*
  26  * IP PACKET CLASSIFIER
  27  *
  28  * The IP packet classifier provides mapping between IP packets and persistent
  29  * connection state for connection-oriented protocols. It also provides
  30  * interface for managing connection states.
  31  *
  32  * The connection state is kept in conn_t data structure and contains, among
  33  * other things:
  34  *
  35  *      o local/remote address and ports
  36  *      o Transport protocol
  37  *      o squeue for the connection (for TCP only)
  38  *      o reference counter
  39  *      o Connection state
  40  *      o hash table linkage
  41  *      o interface/ire information
  42  *      o credentials
  43  *      o ipsec policy
  44  *      o send and receive functions.
  45  *      o mutex lock.
  46  *
  47  * Connections use a reference counting scheme. They are freed when the
  48  * reference counter drops to zero. A reference is incremented when connection
  49  * is placed in a list or table, when incoming packet for the connection arrives
  50  * and when connection is processed via squeue (squeue processing may be
  51  * asynchronous and the reference protects the connection from being destroyed
  52  * before its processing is finished).
  53  *
  54  * conn_recv is used to pass up packets to the ULP.
  55  * For TCP conn_recv changes. It is tcp_input_listener_unbound initially for
  56  * a listener, and changes to tcp_input_listener as the listener has picked a
  57  * good squeue. For other cases it is set to tcp_input_data.
  58  *
  59  * conn_recvicmp is used to pass up ICMP errors to the ULP.
  60  *
  61  * Classifier uses several hash tables:
  62  *
  63  *      ipcl_conn_fanout:       contains all TCP connections in CONNECTED state
  64  *      ipcl_bind_fanout:       contains all connections in BOUND state
  65  *      ipcl_proto_fanout_v4:   IPv4 protocol fanout
  66  *      ipcl_proto_fanout_v6:   IPv6 protocol fanout
  67  *      ipcl_udp_fanout:        contains all UDP connections
  68  *      ipcl_iptun_fanout:      contains all IP tunnel connections
  69  *      ipcl_globalhash_fanout: contains all connections
  70  *      ipcl_dccp_conn_fanout:  contains all DCCP connections in CONNECTED state
  71  *      ipcl_dccp_bind_fanout:  contains all DCCP connections in BOUND state
  72  *
  73  * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering)
  74  * which need to view all existing connections.
  75  *
  76  * All tables are protected by per-bucket locks. When both per-bucket lock and
  77  * connection lock need to be held, the per-bucket lock should be acquired
  78  * first, followed by the connection lock.
  79  *
  80  * All functions doing search in one of these tables increment a reference
  81  * counter on the connection found (if any). This reference should be dropped
  82  * when the caller has finished processing the connection.
  83  *
  84  *
  85  * INTERFACES:
  86  * ===========
  87  *
  88  * Connection Lookup:
  89  * ------------------
  90  *
  91  * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, ira, ip_stack)
  92  * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, ira, ip_stack)
  93  *
  94  * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if
  95  * it can't find any associated connection. If the connection is found, its
  96  * reference counter is incremented.
  97  *
  98  *      mp:     mblock, containing packet header. The full header should fit
  99  *              into a single mblock. It should also contain at least full IP
 100  *              and TCP or UDP header.
 101  *
 102  *      protocol: Either IPPROTO_TCP or IPPROTO_UDP.
 103  *
 104  *      hdr_len: The size of IP header. It is used to find TCP or UDP header in
 105  *               the packet.
 106  *
 107  *      ira->ira_zoneid: The zone in which the returned connection must be; the
 108  *              zoneid corresponding to the ire_zoneid on the IRE located for
 109  *              the packet's destination address.
 110  *
 111  *      ira->ira_flags: Contains the IRAF_TX_MAC_EXEMPTABLE and
 112  *              IRAF_TX_SHARED_ADDR flags
 113  *
 114  *      For TCP connections, the lookup order is as follows:
 115  *              5-tuple {src, dst, protocol, local port, remote port}
 116  *                      lookup in ipcl_conn_fanout table.
 117  *              3-tuple {dst, remote port, protocol} lookup in
 118  *                      ipcl_bind_fanout table.
 119  *
 120  *      For UDP connections, a 5-tuple {src, dst, protocol, local port,
 121  *      remote port} lookup is done on ipcl_udp_fanout. Note that,
 122  *      these interfaces do not handle cases where a packet belongs
 123  *      to multiple UDP clients, which is handled in IP itself.
 124  *
 125  * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must
 126  * determine which actual zone gets the segment.  This is used only in a
 127  * labeled environment.  The matching rules are:
 128  *
 129  *      - If it's not a multilevel port, then the label on the packet selects
 130  *        the zone.  Unlabeled packets are delivered to the global zone.
 131  *
 132  *      - If it's a multilevel port, then only the zone registered to receive
 133  *        packets on that port matches.
 134  *
 135  * Also, in a labeled environment, packet labels need to be checked.  For fully
 136  * bound TCP connections, we can assume that the packet label was checked
 137  * during connection establishment, and doesn't need to be checked on each
 138  * packet.  For others, though, we need to check for strict equality or, for
 139  * multilevel ports, membership in the range or set.  This part currently does
 140  * a tnrh lookup on each packet, but could be optimized to use cached results
 141  * if that were necessary.  (SCTP doesn't come through here, but if it did,
 142  * we would apply the same rules as TCP.)
 143  *
 144  * An implication of the above is that fully-bound TCP sockets must always use
 145  * distinct 4-tuples; they can't be discriminated by label alone.
 146  *
 147  * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
 148  * as there's no connection set-up handshake and no shared state.
 149  *
 150  * Labels on looped-back packets within a single zone do not need to be
 151  * checked, as all processes in the same zone have the same label.
 152  *
 153  * Finally, for unlabeled packets received by a labeled system, special rules
 154  * apply.  We consider only the MLP if there is one.  Otherwise, we prefer a
 155  * socket in the zone whose label matches the default label of the sender, if
 156  * any.  In any event, the receiving socket must have SO_MAC_EXEMPT set and the
 157  * receiver's label must dominate the sender's default label.
 158  *
 159  * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcpha_t *, int, ip_stack);
 160  * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
 161  *                                       ip_stack);
 162  *
 163  *      Lookup routine to find an exact match for {src, dst, local port,
 164  *      remote port} for TCP connections in ipcl_conn_fanout. The address and
 165  *      ports are read from the IP and TCP header respectively.
 166  *
 167  * conn_t       *ipcl_lookup_listener_v4(lport, laddr, protocol,
 168  *                                       zoneid, ip_stack);
 169  * conn_t       *ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex,
 170  *                                       zoneid, ip_stack);
 171  *
 172  *      Lookup routine to find a listener with the tuple {lport, laddr,
 173  *      protocol} in the ipcl_bind_fanout table. For IPv6, an additional
 174  *      parameter interface index is also compared.
 175  *
 176  * void ipcl_walk(func, arg, ip_stack)
 177  *
 178  *      Apply 'func' to every connection available. The 'func' is called as
 179  *      (*func)(connp, arg). The walk is non-atomic so connections may be
 180  *      created and destroyed during the walk. The CONN_CONDEMNED and
 181  *      CONN_INCIPIENT flags ensure that connections which are newly created
 182  *      or being destroyed are not selected by the walker.
 183  *
 184  * Table Updates
 185  * -------------
 186  *
 187  * int ipcl_conn_insert(connp);
 188  * int ipcl_conn_insert_v4(connp);
 189  * int ipcl_conn_insert_v6(connp);
 190  *
 191  *      Insert 'connp' in the ipcl_conn_fanout.
 192  *      Arguments :
 193  *              connp           conn_t to be inserted
 194  *
 195  *      Return value :
 196  *              0               if connp was inserted
 197  *              EADDRINUSE      if the connection with the same tuple
 198  *                              already exists.
 199  *
 200  * int ipcl_bind_insert(connp);
 201  * int ipcl_bind_insert_v4(connp);
 202  * int ipcl_bind_insert_v6(connp);
 203  *
 204  *      Insert 'connp' in ipcl_bind_fanout.
 205  *      Arguments :
 206  *              connp           conn_t to be inserted
 207  *
 208  *
 209  * void ipcl_hash_remove(connp);
 210  *
 211  *      Removes the 'connp' from the connection fanout table.
 212  *
 213  * Connection Creation/Destruction
 214  * -------------------------------
 215  *
 216  * conn_t *ipcl_conn_create(type, sleep, netstack_t *)
 217  *
 218  *      Creates a new conn based on the type flag, inserts it into
 219  *      globalhash table.
 220  *
 221  *      type:   This flag determines the type of conn_t which needs to be
 222  *              created i.e., which kmem_cache it comes from.
 223  *              IPCL_TCPCONN    indicates a TCP connection
 224  *              IPCL_SCTPCONN   indicates a SCTP connection
 225  *              IPCL_UDPCONN    indicates a UDP conn_t.
 226  *              IPCL_RAWIPCONN  indicates a RAWIP/ICMP conn_t.
 227  *              IPCL_RTSCONN    indicates a RTS conn_t.
 228  *              IPCL_DCCPCONN   indicates a DCCP conn_t.
 229  *              IPCL_IPCCONN    indicates all other connections.
 230  *
 231  * void ipcl_conn_destroy(connp)
 232  *
 233  *      Destroys the connection state, removes it from the global
 234  *      connection hash table and frees its memory.
 235  */
 236 
 237 #include <sys/types.h>
 238 #include <sys/stream.h>
 239 #include <sys/stropts.h>
 240 #include <sys/sysmacros.h>
 241 #include <sys/strsubr.h>
 242 #include <sys/strsun.h>
 243 #define _SUN_TPI_VERSION 2
 244 #include <sys/ddi.h>
 245 #include <sys/cmn_err.h>
 246 #include <sys/debug.h>
 247 
 248 #include <sys/systm.h>
 249 #include <sys/param.h>
 250 #include <sys/kmem.h>
 251 #include <sys/isa_defs.h>
 252 #include <inet/common.h>
 253 #include <netinet/ip6.h>
 254 #include <netinet/icmp6.h>
 255 
 256 #include <inet/ip.h>
 257 #include <inet/ip_if.h>
 258 #include <inet/ip_ire.h>
 259 #include <inet/ip6.h>
 260 #include <inet/ip_ndp.h>
 261 #include <inet/ip_impl.h>
 262 #include <inet/udp_impl.h>
 263 #include <inet/dccp_impl.h>
 264 #include <inet/sctp_ip.h>
 265 #include <inet/sctp/sctp_impl.h>
 266 #include <inet/rawip_impl.h>
 267 #include <inet/rts_impl.h>
 268 #include <inet/iptun/iptun_impl.h>
 269 
 270 #include <sys/cpuvar.h>
 271 
 272 #include <inet/ipclassifier.h>
 273 #include <inet/tcp.h>
 274 #include <inet/ipsec_impl.h>
 275 
 276 #include <sys/tsol/tnet.h>
 277 #include <sys/sockio.h>
 278 
 279 /* Old value for compatibility. Setable in /etc/system */
 280 uint_t tcp_conn_hash_size = 0;
 281 
 282 /* New value. Zero means choose automatically.  Setable in /etc/system */
 283 uint_t ipcl_conn_hash_size = 0;
 284 uint_t ipcl_conn_hash_memfactor = 8192;
 285 uint_t ipcl_conn_hash_maxsize = 82500;
 286 
 287 /* bind/udp fanout table size */
 288 uint_t ipcl_bind_fanout_size = 512;
 289 uint_t ipcl_udp_fanout_size = 16384;
 290 
 291 /* Fanout table sizes for dccp */
 292 uint_t ipcl_dccp_conn_fanout_size = 512;
 293 uint_t ipcl_dccp_bind_fanout_size = 512;
 294 
 295 /* Raw socket fanout size.  Must be a power of 2. */
 296 uint_t ipcl_raw_fanout_size = 256;
 297 
 298 /*
 299  * The IPCL_IPTUN_HASH() function works best with a prime table size.  We
 300  * expect that most large deployments would have hundreds of tunnels, and
 301  * thousands in the extreme case.
 302  */
 303 uint_t ipcl_iptun_fanout_size = 6143;
 304 
 305 /*
 306  * Power of 2^N Primes useful for hashing for N of 0-28,
 307  * these primes are the nearest prime <= 2^N - 2^(N-2).
 308  */
 309 
 310 #define P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067,  \
 311                 6143, 12281, 24571, 49139, 98299, 196597, 393209,       \
 312                 786431, 1572853, 3145721, 6291449, 12582893, 25165813,  \
 313                 50331599, 100663291, 201326557, 0}
 314 
 315 /*
 316  * wrapper structure to ensure that conn and what follows it (tcp_t, etc)
 317  * are aligned on cache lines.
 318  */
 319 typedef union itc_s {
 320         conn_t  itc_conn;
 321         char    itcu_filler[CACHE_ALIGN(conn_s)];
 322 } itc_t;
 323 
 324 struct kmem_cache  *tcp_conn_cache;
 325 struct kmem_cache  *ip_conn_cache;
 326 extern struct kmem_cache  *sctp_conn_cache;
 327 struct kmem_cache  *udp_conn_cache;
 328 struct kmem_cache  *rawip_conn_cache;
 329 struct kmem_cache  *rts_conn_cache;
 330 struct kmem_cache  *dccp_conn_cache;
 331 
 332 extern void     tcp_timermp_free(tcp_t *);
 333 extern mblk_t   *tcp_timermp_alloc(int);
 334 
 335 static int      ip_conn_constructor(void *, void *, int);
 336 static void     ip_conn_destructor(void *, void *);
 337 
 338 static int      tcp_conn_constructor(void *, void *, int);
 339 static void     tcp_conn_destructor(void *, void *);
 340 
 341 static int      udp_conn_constructor(void *, void *, int);
 342 static void     udp_conn_destructor(void *, void *);
 343 
 344 static int      rawip_conn_constructor(void *, void *, int);
 345 static void     rawip_conn_destructor(void *, void *);
 346 
 347 static int      rts_conn_constructor(void *, void *, int);
 348 static void     rts_conn_destructor(void *, void *);
 349 
 350 static int      dccp_conn_constructor(void *, void *, int);
 351 static void     dccp_conn_destructor(void *, void *);
 352 
 353 /*
 354  * Global (for all stack instances) init routine
 355  */
 356 void
 357 ipcl_g_init(void)
 358 {
 359         ip_conn_cache = kmem_cache_create("ip_conn_cache",
 360             sizeof (conn_t), CACHE_ALIGN_SIZE,
 361             ip_conn_constructor, ip_conn_destructor,
 362             NULL, NULL, NULL, 0);
 363 
 364         tcp_conn_cache = kmem_cache_create("tcp_conn_cache",
 365             sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE,
 366             tcp_conn_constructor, tcp_conn_destructor,
 367             tcp_conn_reclaim, NULL, NULL, 0);
 368 
 369         udp_conn_cache = kmem_cache_create("udp_conn_cache",
 370             sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE,
 371             udp_conn_constructor, udp_conn_destructor,
 372             NULL, NULL, NULL, 0);
 373 
 374         rawip_conn_cache = kmem_cache_create("rawip_conn_cache",
 375             sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE,
 376             rawip_conn_constructor, rawip_conn_destructor,
 377             NULL, NULL, NULL, 0);
 378 
 379         rts_conn_cache = kmem_cache_create("rts_conn_cache",
 380             sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE,
 381             rts_conn_constructor, rts_conn_destructor,
 382             NULL, NULL, NULL, 0);
 383 
 384         /* XXX:DCCP reclaim */
 385         dccp_conn_cache = kmem_cache_create("dccp_conn_cache",
 386             sizeof (itc_t) + sizeof (dccp_t), CACHE_ALIGN_SIZE,
 387             dccp_conn_constructor, dccp_conn_destructor,
 388             NULL, NULL, NULL, 0);
 389 }
 390 
 391 /*
 392  * ipclassifier intialization routine, sets up hash tables.
 393  */
 394 void
 395 ipcl_init(ip_stack_t *ipst)
 396 {
 397         int i;
 398         int sizes[] = P2Ps();
 399 
 400         /*
 401          * Calculate size of conn fanout table from /etc/system settings
 402          */
 403         if (ipcl_conn_hash_size != 0) {
 404                 ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size;
 405         } else if (tcp_conn_hash_size != 0) {
 406                 ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size;
 407         } else {
 408                 extern pgcnt_t freemem;
 409 
 410                 ipst->ips_ipcl_conn_fanout_size =
 411                     (freemem * PAGESIZE) / ipcl_conn_hash_memfactor;
 412 
 413                 if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) {
 414                         ipst->ips_ipcl_conn_fanout_size =
 415                             ipcl_conn_hash_maxsize;
 416                 }
 417         }
 418 
 419         for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) {
 420                 if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) {
 421                         break;
 422                 }
 423         }
 424         if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) {
 425                 /* Out of range, use the 2^16 value */
 426                 ipst->ips_ipcl_conn_fanout_size = sizes[16];
 427         }
 428 
 429         /* Take values from /etc/system */
 430         ipst->ips_ipcl_bind_fanout_size = ipcl_bind_fanout_size;
 431         ipst->ips_ipcl_dccp_conn_fanout_size = ipcl_dccp_conn_fanout_size;
 432         ipst->ips_ipcl_dccp_bind_fanout_size = ipcl_dccp_bind_fanout_size;
 433         ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size;
 434         ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size;
 435         ipst->ips_ipcl_iptun_fanout_size = ipcl_iptun_fanout_size;
 436 
 437         ASSERT(ipst->ips_ipcl_conn_fanout == NULL);
 438 
 439         ipst->ips_ipcl_conn_fanout = kmem_zalloc(
 440             ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP);
 441 
 442         for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
 443                 mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL,
 444                     MUTEX_DEFAULT, NULL);
 445         }
 446 
 447         ipst->ips_ipcl_bind_fanout = kmem_zalloc(
 448             ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP);
 449 
 450         for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
 451                 mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL,
 452                     MUTEX_DEFAULT, NULL);
 453         }
 454 
 455         ipst->ips_ipcl_proto_fanout_v4 = kmem_zalloc(IPPROTO_MAX *
 456             sizeof (connf_t), KM_SLEEP);
 457         for (i = 0; i < IPPROTO_MAX; i++) {
 458                 mutex_init(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock, NULL,
 459                     MUTEX_DEFAULT, NULL);
 460         }
 461 
 462         ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX *
 463             sizeof (connf_t), KM_SLEEP);
 464         for (i = 0; i < IPPROTO_MAX; i++) {
 465                 mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL,
 466                     MUTEX_DEFAULT, NULL);
 467         }
 468 
 469         ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP);
 470         mutex_init(&ipst->ips_rts_clients->connf_lock,
 471             NULL, MUTEX_DEFAULT, NULL);
 472 
 473         ipst->ips_ipcl_udp_fanout = kmem_zalloc(
 474             ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP);
 475         for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
 476                 mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL,
 477                     MUTEX_DEFAULT, NULL);
 478         }
 479 
 480         ipst->ips_ipcl_iptun_fanout = kmem_zalloc(
 481             ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t), KM_SLEEP);
 482         for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
 483                 mutex_init(&ipst->ips_ipcl_iptun_fanout[i].connf_lock, NULL,
 484                     MUTEX_DEFAULT, NULL);
 485         }
 486 
 487         ipst->ips_ipcl_raw_fanout = kmem_zalloc(
 488             ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP);
 489         for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
 490                 mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL,
 491                     MUTEX_DEFAULT, NULL);
 492         }
 493 
 494         ipst->ips_ipcl_globalhash_fanout = kmem_zalloc(
 495             sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP);
 496         for (i = 0; i < CONN_G_HASH_SIZE; i++) {
 497                 mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock,
 498                     NULL, MUTEX_DEFAULT, NULL);
 499         }
 500 
 501         ipst->ips_ipcl_dccp_conn_fanout = kmem_zalloc(
 502             ipst->ips_ipcl_dccp_conn_fanout_size * sizeof (connf_t), KM_SLEEP);
 503         for (i = 0; i < ipst->ips_ipcl_dccp_conn_fanout_size; i++) {
 504                 mutex_init(&ipst->ips_ipcl_dccp_conn_fanout[i].connf_lock, NULL,
 505                     MUTEX_DEFAULT, NULL);
 506         }
 507 
 508         ipst->ips_ipcl_dccp_bind_fanout = kmem_zalloc(
 509             ipst->ips_ipcl_dccp_bind_fanout_size * sizeof (connf_t), KM_SLEEP);
 510         for (i = 0; i < ipst->ips_ipcl_dccp_bind_fanout_size; i++) {
 511                 mutex_init(&ipst->ips_ipcl_dccp_bind_fanout[i].connf_lock, NULL,
 512                     MUTEX_DEFAULT, NULL);
 513         }
 514 }
 515 
 516 void
 517 ipcl_g_destroy(void)
 518 {
 519         kmem_cache_destroy(ip_conn_cache);
 520         kmem_cache_destroy(tcp_conn_cache);
 521         kmem_cache_destroy(udp_conn_cache);
 522         kmem_cache_destroy(rawip_conn_cache);
 523         kmem_cache_destroy(rts_conn_cache);
 524         kmem_cache_destroy(dccp_conn_cache);
 525 }
 526 
 527 /*
 528  * All user-level and kernel use of the stack must be gone
 529  * by now.
 530  */
 531 void
 532 ipcl_destroy(ip_stack_t *ipst)
 533 {
 534         int i;
 535 
 536         for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
 537                 ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL);
 538                 mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock);
 539         }
 540         kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size *
 541             sizeof (connf_t));
 542         ipst->ips_ipcl_conn_fanout = NULL;
 543 
 544         for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
 545                 ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL);
 546                 mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock);
 547         }
 548         kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size *
 549             sizeof (connf_t));
 550         ipst->ips_ipcl_bind_fanout = NULL;
 551 
 552         for (i = 0; i < IPPROTO_MAX; i++) {
 553                 ASSERT(ipst->ips_ipcl_proto_fanout_v4[i].connf_head == NULL);
 554                 mutex_destroy(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock);
 555         }
 556         kmem_free(ipst->ips_ipcl_proto_fanout_v4,
 557             IPPROTO_MAX * sizeof (connf_t));
 558         ipst->ips_ipcl_proto_fanout_v4 = NULL;
 559 
 560         for (i = 0; i < IPPROTO_MAX; i++) {
 561                 ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head == NULL);
 562                 mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock);
 563         }
 564         kmem_free(ipst->ips_ipcl_proto_fanout_v6,
 565             IPPROTO_MAX * sizeof (connf_t));
 566         ipst->ips_ipcl_proto_fanout_v6 = NULL;
 567 
 568         for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
 569                 ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL);
 570                 mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock);
 571         }
 572         kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size *
 573             sizeof (connf_t));
 574         ipst->ips_ipcl_udp_fanout = NULL;
 575 
 576         for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
 577                 ASSERT(ipst->ips_ipcl_iptun_fanout[i].connf_head == NULL);
 578                 mutex_destroy(&ipst->ips_ipcl_iptun_fanout[i].connf_lock);
 579         }
 580         kmem_free(ipst->ips_ipcl_iptun_fanout,
 581             ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t));
 582         ipst->ips_ipcl_iptun_fanout = NULL;
 583 
 584         for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
 585                 ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL);
 586                 mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock);
 587         }
 588         kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size *
 589             sizeof (connf_t));
 590         ipst->ips_ipcl_raw_fanout = NULL;
 591 
 592         for (i = 0; i < CONN_G_HASH_SIZE; i++) {
 593                 ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL);
 594                 mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
 595         }
 596         kmem_free(ipst->ips_ipcl_globalhash_fanout,
 597             sizeof (connf_t) * CONN_G_HASH_SIZE);
 598         ipst->ips_ipcl_globalhash_fanout = NULL;
 599 
 600         for (i = 0; i < ipst->ips_ipcl_dccp_conn_fanout_size; i++) {
 601                 ASSERT(ipst->ips_ipcl_dccp_conn_fanout[i].connf_head == NULL);
 602                 mutex_destroy(&ipst->ips_ipcl_dccp_conn_fanout[i].connf_lock);
 603         }
 604         kmem_free(ipst->ips_ipcl_dccp_conn_fanout,
 605             ipst->ips_ipcl_dccp_conn_fanout_size * sizeof (connf_t));
 606         ipst->ips_ipcl_dccp_conn_fanout = NULL;
 607 
 608         for (i = 0; i < ipst->ips_ipcl_dccp_bind_fanout_size; i++) {
 609                 ASSERT(ipst->ips_ipcl_dccp_bind_fanout[i].connf_head == NULL);
 610                 mutex_destroy(&ipst->ips_ipcl_dccp_bind_fanout[i].connf_lock);
 611         }
 612         kmem_free(ipst->ips_ipcl_dccp_bind_fanout,
 613             ipst->ips_ipcl_dccp_bind_fanout_size * sizeof (connf_t));
 614         ipst->ips_ipcl_dccp_bind_fanout = NULL;
 615 
 616         ASSERT(ipst->ips_rts_clients->connf_head == NULL);
 617         mutex_destroy(&ipst->ips_rts_clients->connf_lock);
 618         kmem_free(ipst->ips_rts_clients, sizeof (connf_t));
 619         ipst->ips_rts_clients = NULL;
 620 }
 621 
 622 /*
 623  * conn creation routine. initialize the conn, sets the reference
 624  * and inserts it in the global hash table.
 625  */
 626 conn_t *
 627 ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns)
 628 {
 629         conn_t  *connp;
 630         struct kmem_cache *conn_cache;
 631 
 632         switch (type) {
 633         case IPCL_SCTPCONN:
 634                 if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL)
 635                         return (NULL);
 636                 sctp_conn_init(connp);
 637                 netstack_hold(ns);
 638                 connp->conn_netstack = ns;
 639                 connp->conn_ixa->ixa_ipst = ns->netstack_ip;
 640                 connp->conn_ixa->ixa_conn_id = (long)connp;
 641                 ipcl_globalhash_insert(connp);
 642                 return (connp);
 643 
 644         case IPCL_TCPCONN:
 645                 conn_cache = tcp_conn_cache;
 646                 break;
 647 
 648         case IPCL_UDPCONN:
 649                 conn_cache = udp_conn_cache;
 650                 break;
 651 
 652         case IPCL_RAWIPCONN:
 653                 conn_cache = rawip_conn_cache;
 654                 break;
 655 
 656         case IPCL_RTSCONN:
 657                 conn_cache = rts_conn_cache;
 658                 break;
 659 
 660         case IPCL_IPCCONN:
 661                 conn_cache = ip_conn_cache;
 662                 break;
 663 
 664         case IPCL_DCCPCONN:
 665                 conn_cache = dccp_conn_cache;
 666                 break;
 667 
 668         default:
 669                 connp = NULL;
 670                 ASSERT(0);
 671         }
 672 
 673         if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL)
 674                 return (NULL);
 675 
 676         connp->conn_ref = 1;
 677         netstack_hold(ns);
 678         connp->conn_netstack = ns;
 679         connp->conn_ixa->ixa_ipst = ns->netstack_ip;
 680         connp->conn_ixa->ixa_conn_id = (long)connp;
 681         ipcl_globalhash_insert(connp);
 682         return (connp);
 683 }
 684 
 685 void
 686 ipcl_conn_destroy(conn_t *connp)
 687 {
 688         mblk_t  *mp;
 689         netstack_t      *ns = connp->conn_netstack;
 690 
 691         ASSERT(!MUTEX_HELD(&connp->conn_lock));
 692         ASSERT(connp->conn_ref == 0);
 693         ASSERT(connp->conn_ioctlref == 0);
 694 
 695         DTRACE_PROBE1(conn__destroy, conn_t *, connp);
 696 
 697         if (connp->conn_cred != NULL) {
 698                 crfree(connp->conn_cred);
 699                 connp->conn_cred = NULL;
 700                 /* ixa_cred done in ipcl_conn_cleanup below */
 701         }
 702 
 703         if (connp->conn_ht_iphc != NULL) {
 704                 kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated);
 705                 connp->conn_ht_iphc = NULL;
 706                 connp->conn_ht_iphc_allocated = 0;
 707                 connp->conn_ht_iphc_len = 0;
 708                 connp->conn_ht_ulp = NULL;
 709                 connp->conn_ht_ulp_len = 0;
 710         }
 711         ip_pkt_free(&connp->conn_xmit_ipp);
 712 
 713         ipcl_globalhash_remove(connp);
 714 
 715         if (connp->conn_latch != NULL) {
 716                 IPLATCH_REFRELE(connp->conn_latch);
 717                 connp->conn_latch = NULL;
 718         }
 719         if (connp->conn_latch_in_policy != NULL) {
 720                 IPPOL_REFRELE(connp->conn_latch_in_policy);
 721                 connp->conn_latch_in_policy = NULL;
 722         }
 723         if (connp->conn_latch_in_action != NULL) {
 724                 IPACT_REFRELE(connp->conn_latch_in_action);
 725                 connp->conn_latch_in_action = NULL;
 726         }
 727         if (connp->conn_policy != NULL) {
 728                 IPPH_REFRELE(connp->conn_policy, ns);
 729                 connp->conn_policy = NULL;
 730         }
 731 
 732         if (connp->conn_ipsec_opt_mp != NULL) {
 733                 freemsg(connp->conn_ipsec_opt_mp);
 734                 connp->conn_ipsec_opt_mp = NULL;
 735         }
 736 
 737         if (connp->conn_flags & IPCL_TCPCONN) {
 738                 tcp_t *tcp = connp->conn_tcp;
 739 
 740                 tcp_free(tcp);
 741                 mp = tcp->tcp_timercache;
 742 
 743                 tcp->tcp_tcps = NULL;
 744 
 745                 /*
 746                  * tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate
 747                  * the mblk.
 748                  */
 749                 if (tcp->tcp_rsrv_mp != NULL) {
 750                         freeb(tcp->tcp_rsrv_mp);
 751                         tcp->tcp_rsrv_mp = NULL;
 752                         mutex_destroy(&tcp->tcp_rsrv_mp_lock);
 753                 }
 754 
 755                 ipcl_conn_cleanup(connp);
 756                 connp->conn_flags = IPCL_TCPCONN;
 757                 if (ns != NULL) {
 758                         ASSERT(tcp->tcp_tcps == NULL);
 759                         connp->conn_netstack = NULL;
 760                         connp->conn_ixa->ixa_ipst = NULL;
 761                         netstack_rele(ns);
 762                 }
 763 
 764                 bzero(tcp, sizeof (tcp_t));
 765 
 766                 tcp->tcp_timercache = mp;
 767                 tcp->tcp_connp = connp;
 768                 kmem_cache_free(tcp_conn_cache, connp);
 769                 return;
 770         }
 771 
 772         if (connp->conn_flags & IPCL_SCTPCONN) {
 773                 ASSERT(ns != NULL);
 774                 sctp_free(connp);
 775                 return;
 776         }
 777 
 778         if (connp->conn_flags & IPCL_DCCPCONN) {
 779                 dccp_t  *dccp = connp->conn_dccp;
 780 
 781                 cmn_err(CE_NOTE, "ipclassifier: conn_flags DCCP cache_free");
 782 
 783                 dccp_free(dccp);
 784                 mp = dccp->dccp_timercache;
 785 
 786                 dccp->dccp_dccps = NULL;
 787 
 788                 ipcl_conn_cleanup(connp);
 789                 connp->conn_flags = IPCL_DCCPCONN;
 790                 if (ns != NULL) {
 791                         ASSERT(dccp->dccps == NULL);
 792                         connp->conn_netstack = NULL;
 793                         connp->conn_ixa->ixa_ipst = NULL;
 794                         netstack_rele(ns);
 795                 }
 796 
 797                 bzero(dccp, sizeof (dccp_t));
 798 
 799                 dccp->dccp_timercache = mp;
 800                 dccp->dccp_connp = connp;
 801                 kmem_cache_free(dccp_conn_cache, connp);
 802                 return;
 803         }
 804 
 805         ipcl_conn_cleanup(connp);
 806         if (ns != NULL) {
 807                 connp->conn_netstack = NULL;
 808                 connp->conn_ixa->ixa_ipst = NULL;
 809                 netstack_rele(ns);
 810         }
 811 
 812         /* leave conn_priv aka conn_udp, conn_icmp, etc in place. */
 813         if (connp->conn_flags & IPCL_UDPCONN) {
 814                 connp->conn_flags = IPCL_UDPCONN;
 815                 kmem_cache_free(udp_conn_cache, connp);
 816         } else if (connp->conn_flags & IPCL_RAWIPCONN) {
 817                 connp->conn_flags = IPCL_RAWIPCONN;
 818                 connp->conn_proto = IPPROTO_ICMP;
 819                 connp->conn_ixa->ixa_protocol = connp->conn_proto;
 820                 kmem_cache_free(rawip_conn_cache, connp);
 821         } else if (connp->conn_flags & IPCL_RTSCONN) {
 822                 connp->conn_flags = IPCL_RTSCONN;
 823                 kmem_cache_free(rts_conn_cache, connp);
 824         } else {
 825                 connp->conn_flags = IPCL_IPCCONN;
 826                 ASSERT(connp->conn_flags & IPCL_IPCCONN);
 827                 ASSERT(connp->conn_priv == NULL);
 828                 kmem_cache_free(ip_conn_cache, connp);
 829         }
 830 }
 831 
 832 /*
 833  * Running in cluster mode - deregister listener information
 834  */
 835 static void
 836 ipcl_conn_unlisten(conn_t *connp)
 837 {
 838         ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0);
 839         ASSERT(connp->conn_lport != 0);
 840 
 841         if (cl_inet_unlisten != NULL) {
 842                 sa_family_t     addr_family;
 843                 uint8_t         *laddrp;
 844 
 845                 if (connp->conn_ipversion == IPV6_VERSION) {
 846                         addr_family = AF_INET6;
 847                         laddrp = (uint8_t *)&connp->conn_bound_addr_v6;
 848                 } else {
 849                         addr_family = AF_INET;
 850                         laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
 851                 }
 852                 (*cl_inet_unlisten)(connp->conn_netstack->netstack_stackid,
 853                     IPPROTO_TCP, addr_family, laddrp, connp->conn_lport, NULL);
 854         }
 855         connp->conn_flags &= ~IPCL_CL_LISTENER;
 856 }
 857 
/*
 * Unlink connp from the fanout bucket recorded in its conn_fanout field.
 * Nothing happens when conn_fanout is NULL.  The caller must not hold
 * conn_lock (asserted).
 *
 * With the bucket's connf_lock held, the conn is spliced out of the doubly
 * linked list (updating connf_head when it was the first entry), its
 * conn_fanout/conn_next/conn_prev fields are cleared, a cluster listener
 * registration is withdrawn via ipcl_conn_unlisten() if IPCL_CL_LISTENER is
 * set, and one reference is dropped with CONN_DEC_REF (the insert macros
 * below take one with CONN_INC_REF).
 *
 * We set the IPCL_REMOVED flag (instead of clearing the flag indicating
 * which table the conn belonged to). So for debugging we can see which hash
 * table this connection was in.
 *
 * Note that the macro declares a local named connfp inside its own block.
 */
#define IPCL_HASH_REMOVE(connp) {                                       \
        connf_t *connfp = (connp)->conn_fanout;                              \
        ASSERT(!MUTEX_HELD(&((connp)->conn_lock)));                      \
        if (connfp != NULL) {                                           \
                mutex_enter(&connfp->connf_lock);                        \
                if ((connp)->conn_next != NULL)                              \
                        (connp)->conn_next->conn_prev =                   \
                            (connp)->conn_prev;                              \
                if ((connp)->conn_prev != NULL)                              \
                        (connp)->conn_prev->conn_next =                   \
                            (connp)->conn_next;                              \
                else                                                    \
                        connfp->connf_head = (connp)->conn_next;  \
                (connp)->conn_fanout = NULL;                         \
                (connp)->conn_next = NULL;                           \
                (connp)->conn_prev = NULL;                           \
                (connp)->conn_flags |= IPCL_REMOVED;                 \
                if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0)       \
                        ipcl_conn_unlisten((connp));                    \
                CONN_DEC_REF((connp));                                  \
                mutex_exit(&connfp->connf_lock);                 \
        }                                                               \
}
 886 
 887 void
 888 ipcl_hash_remove(conn_t *connp)
 889 {
 890         uint8_t         protocol = connp->conn_proto;
 891 
 892         IPCL_HASH_REMOVE(connp);
 893         if (protocol == IPPROTO_RSVP)
 894                 ill_set_inputfn_all(connp->conn_netstack->netstack_ip);
 895 }
 896 
 897 /*
 898  * The whole purpose of this function is allow removal of
 899  * a conn_t from the connected hash for timewait reclaim.
 900  * This is essentially a TW reclaim fastpath where timewait
 901  * collector checks under fanout lock (so no one else can
 902  * get access to the conn_t) that refcnt is 2 i.e. one for
 903  * TCP and one for the classifier hash list. If ref count
 904  * is indeed 2, we can just remove the conn under lock and
 905  * avoid cleaning up the conn under squeue. This gives us
 906  * improved performance.
 907  */
 908 void
 909 ipcl_hash_remove_locked(conn_t *connp, connf_t  *connfp)
 910 {
 911         ASSERT(MUTEX_HELD(&connfp->connf_lock));
 912         ASSERT(MUTEX_HELD(&connp->conn_lock));
 913         ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0);
 914 
 915         if ((connp)->conn_next != NULL) {
 916                 (connp)->conn_next->conn_prev = (connp)->conn_prev;
 917         }
 918         if ((connp)->conn_prev != NULL) {
 919                 (connp)->conn_prev->conn_next = (connp)->conn_next;
 920         } else {
 921                 connfp->connf_head = (connp)->conn_next;
 922         }
 923         (connp)->conn_fanout = NULL;
 924         (connp)->conn_next = NULL;
 925         (connp)->conn_prev = NULL;
 926         (connp)->conn_flags |= IPCL_REMOVED;
 927         ASSERT((connp)->conn_ref == 2);
 928         (connp)->conn_ref--;
 929 }
 930 
/*
 * Push connp onto the head of the fanout bucket connfp.
 *
 * The macro takes no lock itself; its name and its use in
 * IPCL_HASH_INSERT_CONNECTED indicate that the caller holds
 * connfp->connf_lock.  The conn must not be on any list (conn_fanout,
 * conn_next and conn_prev are asserted NULL).  On return the conn is
 * flagged IPCL_CONNECTED, IPCL_REMOVED is cleared, and a reference has
 * been taken with CONN_INC_REF.
 */
#define IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) {              \
        ASSERT((connp)->conn_fanout == NULL);                                \
        ASSERT((connp)->conn_next == NULL);                          \
        ASSERT((connp)->conn_prev == NULL);                          \
        if ((connfp)->connf_head != NULL) {                          \
                (connfp)->connf_head->conn_prev = (connp);                \
                (connp)->conn_next = (connfp)->connf_head;                \
        }                                                               \
        (connp)->conn_fanout = (connfp);                             \
        (connfp)->connf_head = (connp);                                      \
        (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \
            IPCL_CONNECTED;                                             \
        CONN_INC_REF(connp);                                            \
}
 945 
/*
 * Move connp to the head of the fanout bucket connfp as a connected entry:
 * first drop any existing hash linkage with IPCL_HASH_REMOVE, then insert
 * with IPCL_HASH_INSERT_CONNECTED_LOCKED while holding connfp->connf_lock.
 * The two steps are not atomic with respect to each other; the conn is on
 * no list in between.
 */
#define IPCL_HASH_INSERT_CONNECTED(connfp, connp) {                     \
        IPCL_HASH_REMOVE((connp));                                      \
        mutex_enter(&(connfp)->connf_lock);                              \
        IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);               \
        mutex_exit(&(connfp)->connf_lock);                               \
}
 952 
/*
 * Insert connp into the fanout bucket connfp as an entry bound to a
 * specific local address.
 *
 * Any existing hash linkage is dropped first (IPCL_HASH_REMOVE).  Then,
 * with connfp->connf_lock held, the list is walked from the head until the
 * first entry for which _IPCL_V4_MATCH_ANY(conn_laddr_v6) is true; connp is
 * linked in directly ahead of that entry, or at the tail when no such entry
 * exists.  Entries that fail the _IPCL_V4_MATCH_ANY test therefore stay in
 * front of those that pass it.  The conn is flagged IPCL_BOUND,
 * IPCL_REMOVED is cleared, and a reference is taken with CONN_INC_REF.
 *
 * The macro declares locals named pconnp and nconnp in its own block.
 */
#define IPCL_HASH_INSERT_BOUND(connfp, connp) {                         \
        conn_t *pconnp = NULL, *nconnp;                                 \
        IPCL_HASH_REMOVE((connp));                                      \
        mutex_enter(&(connfp)->connf_lock);                              \
        nconnp = (connfp)->connf_head;                                       \
        while (nconnp != NULL &&                                        \
            !_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6)) {            \
                pconnp = nconnp;                                        \
                nconnp = nconnp->conn_next;                          \
        }                                                               \
        if (pconnp != NULL) {                                           \
                pconnp->conn_next = (connp);                         \
                (connp)->conn_prev = pconnp;                         \
        } else {                                                        \
                (connfp)->connf_head = (connp);                              \
        }                                                               \
        if (nconnp != NULL) {                                           \
                (connp)->conn_next = nconnp;                         \
                nconnp->conn_prev = (connp);                         \
        }                                                               \
        (connp)->conn_fanout = (connfp);                             \
        (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \
            IPCL_BOUND;                                                 \
        CONN_INC_REF(connp);                                            \
        mutex_exit(&(connfp)->connf_lock);                               \
}
 979 
 980 #define IPCL_HASH_INSERT_WILDCARD(connfp, connp) {                      \
 981         conn_t **list, *prev, *next;                                    \
 982         boolean_t isv4mapped =                                          \
 983             IN6_IS_ADDR_V4MAPPED(&(connp)->conn_laddr_v6);               \
 984         IPCL_HASH_REMOVE((connp));                                      \
 985         mutex_enter(&(connfp)->connf_lock);                              \
 986         list = &(connfp)->connf_head;                                    \
 987         prev = NULL;                                                    \
 988         while ((next = *list) != NULL) {                                \
 989                 if (isv4mapped &&                                       \
 990                     IN6_IS_ADDR_UNSPECIFIED(&next->conn_laddr_v6) &&     \
 991                     connp->conn_zoneid == next->conn_zoneid) {            \
 992                         (connp)->conn_next = next;                   \
 993                         if (prev != NULL)                               \
 994                                 prev = next->conn_prev;                      \
 995                         next->conn_prev = (connp);                   \
 996                         break;                                          \
 997                 }                                                       \
 998                 list = &next->conn_next;                         \
 999                 prev = next;                                            \
1000         }                                                               \
1001         (connp)->conn_prev = prev;                                   \
1002         *list = (connp);                                                \
1003         (connp)->conn_fanout = (connfp);                             \
1004         (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \
1005             IPCL_BOUND;                                                 \
1006         CONN_INC_REF((connp));                                          \
1007         mutex_exit(&(connfp)->connf_lock);                               \
1008 }
1009 
/*
 * Function wrapper that expands IPCL_HASH_INSERT_WILDCARD for the given
 * fanout bucket and conn, making the wildcard insertion callable as an
 * ordinary (non-static) function.
 */
void
ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
{
        IPCL_HASH_INSERT_WILDCARD(connfp, connp);
}
1015 
1016 /*
1017  * Because the classifier is used to classify inbound packets, the destination
1018  * address is meant to be our local tunnel address (tunnel source), and the
1019  * source the remote tunnel address (tunnel destination).
1020  *
1021  * Note that conn_proto can't be used for fanout since the upper protocol
1022  * can be both 41 and 4 when IPv6 and IPv4 are over the same tunnel.
1023  */
1024 conn_t *
1025 ipcl_iptun_classify_v4(ipaddr_t *src, ipaddr_t *dst, ip_stack_t *ipst)
1026 {
1027         connf_t *connfp;
1028         conn_t  *connp;
1029 
1030         /* first look for IPv4 tunnel links */
1031         connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst, *src)];
1032         mutex_enter(&connfp->connf_lock);
1033         for (connp = connfp->connf_head; connp != NULL;
1034             connp = connp->conn_next) {
1035                 if (IPCL_IPTUN_MATCH(connp, *dst, *src))
1036                         break;
1037         }
1038         if (connp != NULL)
1039                 goto done;
1040 
1041         mutex_exit(&connfp->connf_lock);
1042 
1043         /* We didn't find an IPv4 tunnel, try a 6to4 tunnel */
1044         connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst,
1045             INADDR_ANY)];
1046         mutex_enter(&connfp->connf_lock);
1047         for (connp = connfp->connf_head; connp != NULL;
1048             connp = connp->conn_next) {
1049                 if (IPCL_IPTUN_MATCH(connp, *dst, INADDR_ANY))
1050                         break;
1051         }
1052 done:
1053         if (connp != NULL)
1054                 CONN_INC_REF(connp);
1055         mutex_exit(&connfp->connf_lock);
1056         return (connp);
1057 }
1058 
1059 conn_t *
1060 ipcl_iptun_classify_v6(in6_addr_t *src, in6_addr_t *dst, ip_stack_t *ipst)
1061 {
1062         connf_t *connfp;
1063         conn_t  *connp;
1064 
1065         /* Look for an IPv6 tunnel link */
1066         connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(dst, src)];
1067         mutex_enter(&connfp->connf_lock);
1068         for (connp = connfp->connf_head; connp != NULL;
1069             connp = connp->conn_next) {
1070                 if (IPCL_IPTUN_MATCH_V6(connp, dst, src)) {
1071                         CONN_INC_REF(connp);
1072                         break;
1073                 }
1074         }
1075         mutex_exit(&connfp->connf_lock);
1076         return (connp);
1077 }
1078 
1079 /*
1080  * This function is used only for inserting SCTP raw socket now.
1081  * This may change later.
1082  *
1083  * Note that only one raw socket can be bound to a port.  The param
1084  * lport is in network byte order.
1085  */
1086 static int
1087 ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
1088 {
1089         connf_t *connfp;
1090         conn_t  *oconnp;
1091         ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
1092 
1093         connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
1094 
1095         /* Check for existing raw socket already bound to the port. */
1096         mutex_enter(&connfp->connf_lock);
1097         for (oconnp = connfp->connf_head; oconnp != NULL;
1098             oconnp = oconnp->conn_next) {
1099                 if (oconnp->conn_lport == lport &&
1100                     oconnp->conn_zoneid == connp->conn_zoneid &&
1101                     oconnp->conn_family == connp->conn_family &&
1102                     ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
1103                     IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_laddr_v6) ||
1104                     IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6) ||
1105                     IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_laddr_v6)) ||
1106                     IN6_ARE_ADDR_EQUAL(&oconnp->conn_laddr_v6,
1107                     &connp->conn_laddr_v6))) {
1108                         break;
1109                 }
1110         }
1111         mutex_exit(&connfp->connf_lock);
1112         if (oconnp != NULL)
1113                 return (EADDRNOTAVAIL);
1114 
1115         if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) ||
1116             IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1117                 if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
1118                     IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) {
1119                         IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1120                 } else {
1121                         IPCL_HASH_INSERT_BOUND(connfp, connp);
1122                 }
1123         } else {
1124                 IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1125         }
1126         return (0);
1127 }
1128 
1129 static int
1130 ipcl_iptun_hash_insert(conn_t *connp, ip_stack_t *ipst)
1131 {
1132         connf_t *connfp;
1133         conn_t  *tconnp;
1134         ipaddr_t laddr = connp->conn_laddr_v4;
1135         ipaddr_t faddr = connp->conn_faddr_v4;
1136 
1137         connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(laddr, faddr)];
1138         mutex_enter(&connfp->connf_lock);
1139         for (tconnp = connfp->connf_head; tconnp != NULL;
1140             tconnp = tconnp->conn_next) {
1141                 if (IPCL_IPTUN_MATCH(tconnp, laddr, faddr)) {
1142                         /* A tunnel is already bound to these addresses. */
1143                         mutex_exit(&connfp->connf_lock);
1144                         return (EADDRINUSE);
1145                 }
1146         }
1147         IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1148         mutex_exit(&connfp->connf_lock);
1149         return (0);
1150 }
1151 
1152 static int
1153 ipcl_iptun_hash_insert_v6(conn_t *connp, ip_stack_t *ipst)
1154 {
1155         connf_t *connfp;
1156         conn_t  *tconnp;
1157         in6_addr_t *laddr = &connp->conn_laddr_v6;
1158         in6_addr_t *faddr = &connp->conn_faddr_v6;
1159 
1160         connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(laddr, faddr)];
1161         mutex_enter(&connfp->connf_lock);
1162         for (tconnp = connfp->connf_head; tconnp != NULL;
1163             tconnp = tconnp->conn_next) {
1164                 if (IPCL_IPTUN_MATCH_V6(tconnp, laddr, faddr)) {
1165                         /* A tunnel is already bound to these addresses. */
1166                         mutex_exit(&connfp->connf_lock);
1167                         return (EADDRINUSE);
1168                 }
1169         }
1170         IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1171         mutex_exit(&connfp->connf_lock);
1172         return (0);
1173 }
1174 
1175 /*
1176  * Check for a MAC exemption conflict on a labeled system.  Note that for
1177  * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the
1178  * transport layer.  This check is for binding all other protocols.
1179  *
1180  * Returns true if there's a conflict.
1181  */
1182 static boolean_t
1183 check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst)
1184 {
1185         connf_t *connfp;
1186         conn_t *tconn;
1187 
1188         connfp = &ipst->ips_ipcl_proto_fanout_v4[connp->conn_proto];
1189         mutex_enter(&connfp->connf_lock);
1190         for (tconn = connfp->connf_head; tconn != NULL;
1191             tconn = tconn->conn_next) {
1192                 /* We don't allow v4 fallback for v6 raw socket */
1193                 if (connp->conn_family != tconn->conn_family)
1194                         continue;
1195                 /* If neither is exempt, then there's no conflict */
1196                 if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
1197                     (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
1198                         continue;
1199                 /* We are only concerned about sockets for a different zone */
1200                 if (connp->conn_zoneid == tconn->conn_zoneid)
1201                         continue;
1202                 /* If both are bound to different specific addrs, ok */
1203                 if (connp->conn_laddr_v4 != INADDR_ANY &&
1204                     tconn->conn_laddr_v4 != INADDR_ANY &&
1205                     connp->conn_laddr_v4 != tconn->conn_laddr_v4)
1206                         continue;
1207                 /* These two conflict; fail */
1208                 break;
1209         }
1210         mutex_exit(&connfp->connf_lock);
1211         return (tconn != NULL);
1212 }
1213 
1214 static boolean_t
1215 check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst)
1216 {
1217         connf_t *connfp;
1218         conn_t *tconn;
1219 
1220         connfp = &ipst->ips_ipcl_proto_fanout_v6[connp->conn_proto];
1221         mutex_enter(&connfp->connf_lock);
1222         for (tconn = connfp->connf_head; tconn != NULL;
1223             tconn = tconn->conn_next) {
1224                 /* We don't allow v4 fallback for v6 raw socket */
1225                 if (connp->conn_family != tconn->conn_family)
1226                         continue;
1227                 /* If neither is exempt, then there's no conflict */
1228                 if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
1229                     (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
1230                         continue;
1231                 /* We are only concerned about sockets for a different zone */
1232                 if (connp->conn_zoneid == tconn->conn_zoneid)
1233                         continue;
1234                 /* If both are bound to different addrs, ok */
1235                 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) &&
1236                     !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_laddr_v6) &&
1237                     !IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6,
1238                     &tconn->conn_laddr_v6))
1239                         continue;
1240                 /* These two conflict; fail */
1241                 break;
1242         }
1243         mutex_exit(&connfp->connf_lock);
1244         return (tconn != NULL);
1245 }
1246 
1247 /*
1248  * (v4, v6) bind hash insertion routines
1249  * The caller has already setup the conn (conn_proto, conn_laddr_v6, conn_lport)
1250  */
1251 
1252 int
1253 ipcl_bind_insert(conn_t *connp)
1254 {
1255         if (connp->conn_ipversion == IPV6_VERSION)
1256                 return (ipcl_bind_insert_v6(connp));
1257         else
1258                 return (ipcl_bind_insert_v4(connp));
1259 }
1260 
1261 int
1262 ipcl_bind_insert_v4(conn_t *connp)
1263 {
1264         connf_t *connfp;
1265         int     ret = 0;
1266         ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
1267         uint16_t        lport = connp->conn_lport;
1268         uint8_t         protocol = connp->conn_proto;
1269 
1270         if (IPCL_IS_IPTUN(connp))
1271                 return (ipcl_iptun_hash_insert(connp, ipst));
1272 
1273         switch (protocol) {
1274         default:
1275                 if (is_system_labeled() &&
1276                     check_exempt_conflict_v4(connp, ipst))
1277                         return (EADDRINUSE);
1278                 /* FALLTHROUGH */
1279         case IPPROTO_UDP:
1280                 if (protocol == IPPROTO_UDP) {
1281                         connfp = &ipst->ips_ipcl_udp_fanout[
1282                             IPCL_UDP_HASH(lport, ipst)];
1283                 } else {
1284                         connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
1285                 }
1286 
1287                 if (connp->conn_faddr_v4 != INADDR_ANY) {
1288                         IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1289                 } else if (connp->conn_laddr_v4 != INADDR_ANY) {
1290                         IPCL_HASH_INSERT_BOUND(connfp, connp);
1291                 } else {
1292                         IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1293                 }
1294                 if (protocol == IPPROTO_RSVP)
1295                         ill_set_inputfn_all(ipst);
1296                 break;
1297 
1298         case IPPROTO_TCP:
1299                 /* Insert it in the Bind Hash */
1300                 ASSERT(connp->conn_zoneid != ALL_ZONES);
1301                 connfp = &ipst->ips_ipcl_bind_fanout[
1302                     IPCL_BIND_HASH(lport, ipst)];
1303                 if (connp->conn_laddr_v4 != INADDR_ANY) {
1304                         IPCL_HASH_INSERT_BOUND(connfp, connp);
1305                 } else {
1306                         IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1307                 }
1308                 if (cl_inet_listen != NULL) {
1309                         ASSERT(connp->conn_ipversion == IPV4_VERSION);
1310                         connp->conn_flags |= IPCL_CL_LISTENER;
1311                         (*cl_inet_listen)(
1312                             connp->conn_netstack->netstack_stackid,
1313                             IPPROTO_TCP, AF_INET,
1314                             (uint8_t *)&connp->conn_bound_addr_v4, lport, NULL);
1315                 }
1316                 break;
1317 
1318         case IPPROTO_SCTP:
1319                 ret = ipcl_sctp_hash_insert(connp, lport);
1320                 break;
1321 
1322         case IPPROTO_DCCP:
1323                 cmn_err(CE_NOTE, "ipclassifier.c: ipcl_bind_insert_v4");
1324                 ASSERT(connp->conn_zoneid != ALL_ZONES);
1325                 connfp = &ipst->ips_ipcl_dccp_bind_fanout[
1326                     IPCL_DCCP_BIND_HASH(lport, ipst)];
1327                 if (connp->conn_laddr_v4 != INADDR_ANY) {
1328                         IPCL_HASH_INSERT_BOUND(connfp, connp);
1329                 } else {
1330                         IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1331                 }
1332                 break;
1333         }
1334 
1335 
1336         return (ret);
1337 }
1338 
1339 int
1340 ipcl_bind_insert_v6(conn_t *connp)
1341 {
1342         connf_t         *connfp;
1343         int             ret = 0;
1344         ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
1345         uint16_t        lport = connp->conn_lport;
1346         uint8_t         protocol = connp->conn_proto;
1347 
1348         if (IPCL_IS_IPTUN(connp)) {
1349                 return (ipcl_iptun_hash_insert_v6(connp, ipst));
1350         }
1351 
1352         switch (protocol) {
1353         default:
1354                 if (is_system_labeled() &&
1355                     check_exempt_conflict_v6(connp, ipst))
1356                         return (EADDRINUSE);
1357                 /* FALLTHROUGH */
1358         case IPPROTO_UDP:
1359                 if (protocol == IPPROTO_UDP) {
1360                         connfp = &ipst->ips_ipcl_udp_fanout[
1361                             IPCL_UDP_HASH(lport, ipst)];
1362                 } else {
1363                         connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
1364                 }
1365 
1366                 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
1367                         IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1368                 } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
1369                         IPCL_HASH_INSERT_BOUND(connfp, connp);
1370                 } else {
1371                         IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1372                 }
1373                 break;
1374 
1375         case IPPROTO_TCP:
1376                 /* Insert it in the Bind Hash */
1377                 ASSERT(connp->conn_zoneid != ALL_ZONES);
1378                 connfp = &ipst->ips_ipcl_bind_fanout[
1379                     IPCL_BIND_HASH(lport, ipst)];
1380                 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
1381                         IPCL_HASH_INSERT_BOUND(connfp, connp);
1382                 } else {
1383                         IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1384                 }
1385                 if (cl_inet_listen != NULL) {
1386                         sa_family_t     addr_family;
1387                         uint8_t         *laddrp;
1388 
1389                         if (connp->conn_ipversion == IPV6_VERSION) {
1390                                 addr_family = AF_INET6;
1391                                 laddrp =
1392                                     (uint8_t *)&connp->conn_bound_addr_v6;
1393                         } else {
1394                                 addr_family = AF_INET;
1395                                 laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
1396                         }
1397                         connp->conn_flags |= IPCL_CL_LISTENER;
1398                         (*cl_inet_listen)(
1399                             connp->conn_netstack->netstack_stackid,
1400                             IPPROTO_TCP, addr_family, laddrp, lport, NULL);
1401                 }
1402                 break;
1403 
1404         case IPPROTO_SCTP:
1405                 ret = ipcl_sctp_hash_insert(connp, lport);
1406                 break;
1407 
1408         case IPPROTO_DCCP:
1409                 cmn_err(CE_NOTE, "ipclassifier.c: ipcl_bind_insert_v6");
1410                 ASSERT(connp->conn_zoneid != ALL_ZONES);
1411                 connfp = &ipst->ips_ipcl_dccp_bind_fanout[
1412                     IPCL_DCCP_BIND_HASH(lport, ipst)];
1413                 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
1414                         IPCL_HASH_INSERT_BOUND(connfp, connp);
1415                 } else {
1416                         IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1417                 }
1418                 break;
1419         }
1420 
1421         return (ret);
1422 }
1423 
1424 /*
1425  * ipcl_conn_hash insertion routines.
1426  * The caller has already set conn_proto and the addresses/ports in the conn_t.
1427  */
1428 
1429 int
1430 ipcl_conn_insert(conn_t *connp)
1431 {
1432         if (connp->conn_ipversion == IPV6_VERSION)
1433                 return (ipcl_conn_insert_v6(connp));
1434         else
1435                 return (ipcl_conn_insert_v4(connp));
1436 }
1437 
1438 int
1439 ipcl_conn_insert_v4(conn_t *connp)
1440 {
1441         connf_t         *connfp;
1442         conn_t          *tconnp;
1443         int             ret = 0;
1444         ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
1445         uint16_t        lport = connp->conn_lport;
1446         uint8_t         protocol = connp->conn_proto;
1447 
1448         if (IPCL_IS_IPTUN(connp))
1449                 return (ipcl_iptun_hash_insert(connp, ipst));
1450 
1451         switch (protocol) {
1452         case IPPROTO_TCP:
1453                 /*
1454                  * For TCP, we check whether the connection tuple already
1455                  * exists before allowing the connection to proceed.  We
1456                  * also allow indexing on the zoneid. This is to allow
1457                  * multiple shared stack zones to have the same tcp
1458                  * connection tuple. In practice this only happens for
1459                  * INADDR_LOOPBACK as it's the only local address which
1460                  * doesn't have to be unique.
1461                  */
1462                 connfp = &ipst->ips_ipcl_conn_fanout[
1463                     IPCL_CONN_HASH(connp->conn_faddr_v4,
1464                     connp->conn_ports, ipst)];
1465                 mutex_enter(&connfp->connf_lock);
1466                 for (tconnp = connfp->connf_head; tconnp != NULL;
1467                     tconnp = tconnp->conn_next) {
1468                         if (IPCL_CONN_MATCH(tconnp, connp->conn_proto,
1469                             connp->conn_faddr_v4, connp->conn_laddr_v4,
1470                             connp->conn_ports) &&
1471                             IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
1472                                 /* Already have a conn. bail out */
1473                                 mutex_exit(&connfp->connf_lock);
1474                                 return (EADDRINUSE);
1475                         }
1476                 }
1477                 if (connp->conn_fanout != NULL) {
1478                         /*
1479                          * Probably a XTI/TLI application trying to do a
1480                          * rebind. Let it happen.
1481                          */
1482                         mutex_exit(&connfp->connf_lock);
1483                         IPCL_HASH_REMOVE(connp);
1484                         mutex_enter(&connfp->connf_lock);
1485                 }
1486 
1487                 ASSERT(connp->conn_recv != NULL);
1488                 ASSERT(connp->conn_recvicmp != NULL);
1489 
1490                 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1491                 mutex_exit(&connfp->connf_lock);
1492                 break;
1493 
1494         case IPPROTO_SCTP:
1495                 /*
1496                  * The raw socket may have already been bound, remove it
1497                  * from the hash first.
1498                  */
1499                 IPCL_HASH_REMOVE(connp);
1500                 ret = ipcl_sctp_hash_insert(connp, lport);
1501                 break;
1502 
1503         case IPPROTO_DCCP:
1504                 cmn_err(CE_NOTE, "ipclassifier.c: ipcl_conn_insert_v4");
1505                 connfp = &ipst->ips_ipcl_dccp_conn_fanout[IPCL_DCCP_CONN_HASH(
1506                     connp->conn_faddr_v4, connp->conn_ports, ipst)];
1507                 mutex_enter(&connfp->connf_lock);
1508                 for (tconnp = connfp->connf_head; tconnp != NULL;
1509                     tconnp = tconnp->conn_next) {
1510                         if (IPCL_CONN_MATCH(tconnp, connp->conn_proto,
1511                             connp->conn_faddr_v4, connp->conn_laddr_v4,
1512                             connp->conn_ports) &&
1513                             IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
1514                                 /* Already have a conn. bail out */
1515                                 mutex_exit(&connfp->connf_lock);
1516                                 return (EADDRINUSE);
1517                         }
1518                 }
1519 
1520                 /* XXX:DCCP XTI/TLI application? */
1521 
1522                 ASSERT(connp->conn_recv != NULL);
1523                 ASSERT(connp->conn_recvicmp != NULL);
1524 
1525                 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1526                 mutex_exit(&connfp->connf_lock);
1527                 break;
1528 
1529         default:
1530                 /*
1531                  * Check for conflicts among MAC exempt bindings.  For
1532                  * transports with port numbers, this is done by the upper
1533                  * level per-transport binding logic.  For all others, it's
1534                  * done here.
1535                  */
1536                 if (is_system_labeled() &&
1537                     check_exempt_conflict_v4(connp, ipst))
1538                         return (EADDRINUSE);
1539                 /* FALLTHROUGH */
1540 
1541         case IPPROTO_UDP:
1542                 if (protocol == IPPROTO_UDP) {
1543                         connfp = &ipst->ips_ipcl_udp_fanout[
1544                             IPCL_UDP_HASH(lport, ipst)];
1545                 } else {
1546                         connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
1547                 }
1548 
1549                 if (connp->conn_faddr_v4 != INADDR_ANY) {
1550                         IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1551                 } else if (connp->conn_laddr_v4 != INADDR_ANY) {
1552                         IPCL_HASH_INSERT_BOUND(connfp, connp);
1553                 } else {
1554                         IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1555                 }
1556                 break;
1557         }
1558 
1559         return (ret);
1560 }
1561 
1562 int
1563 ipcl_conn_insert_v6(conn_t *connp)
1564 {
1565         connf_t         *connfp;
1566         conn_t          *tconnp;
1567         int             ret = 0;
1568         ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
1569         uint16_t        lport = connp->conn_lport;
1570         uint8_t         protocol = connp->conn_proto;
1571         uint_t          ifindex = connp->conn_bound_if;
1572 
1573         if (IPCL_IS_IPTUN(connp))
1574                 return (ipcl_iptun_hash_insert_v6(connp, ipst));
1575 
1576         switch (protocol) {
1577         case IPPROTO_TCP:
1578 
1579                 /*
1580                  * For tcp, we check whether the connection tuple already
1581                  * exists before allowing the connection to proceed.  We
1582                  * also allow indexing on the zoneid. This is to allow
1583                  * multiple shared stack zones to have the same tcp
1584                  * connection tuple. In practice this only happens for
1585                  * ipv6_loopback as it's the only local address which
1586                  * doesn't have to be unique.
1587                  */
1588                 connfp = &ipst->ips_ipcl_conn_fanout[
1589                     IPCL_CONN_HASH_V6(connp->conn_faddr_v6, connp->conn_ports,
1590                     ipst)];
1591                 mutex_enter(&connfp->connf_lock);
1592                 for (tconnp = connfp->connf_head; tconnp != NULL;
1593                     tconnp = tconnp->conn_next) {
1594                         /* NOTE: need to match zoneid. Bug in onnv-gate */
1595                         if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_proto,
1596                             connp->conn_faddr_v6, connp->conn_laddr_v6,
1597                             connp->conn_ports) &&
1598                             (tconnp->conn_bound_if == 0 ||
1599                             tconnp->conn_bound_if == ifindex) &&
1600                             IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
1601                                 /* Already have a conn. bail out */
1602                                 mutex_exit(&connfp->connf_lock);
1603                                 return (EADDRINUSE);
1604                         }
1605                 }
1606                 if (connp->conn_fanout != NULL) {
1607                         /*
1608                          * Probably a XTI/TLI application trying to do a
1609                          * rebind. Let it happen.
1610                          */
1611                         mutex_exit(&connfp->connf_lock);
1612                         IPCL_HASH_REMOVE(connp);
1613                         mutex_enter(&connfp->connf_lock);
1614                 }
1615                 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1616                 mutex_exit(&connfp->connf_lock);
1617                 break;
1618 
1619         case IPPROTO_SCTP:
1620                 IPCL_HASH_REMOVE(connp);
1621                 ret = ipcl_sctp_hash_insert(connp, lport);
1622                 break;
1623 
1624         case IPPROTO_DCCP:
1625                 cmn_err(CE_NOTE, "ipclassifier.c: ipcl_conn_insert_v6");
1626                 connfp = &ipst->ips_ipcl_dccp_conn_fanout[
1627                     IPCL_DCCP_CONN_HASH_V6(connp->conn_faddr_v6,
1628                     connp->conn_ports, ipst)];
1629                 mutex_enter(&connfp->connf_lock);
1630                 for (tconnp = connfp->connf_head; tconnp != NULL;
1631                     tconnp = tconnp->conn_next) {
1632                         /* NOTE: need to match zoneid. Bug in onnv-gate */
1633                         if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_proto,
1634                             connp->conn_faddr_v6, connp->conn_laddr_v6,
1635                             connp->conn_ports) &&
1636                             (tconnp->conn_bound_if == 0 ||
1637                             tconnp->conn_bound_if == ifindex) &&
1638                             IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
1639                                 /* Already have a conn. bail out */
1640                                 mutex_exit(&connfp->connf_lock);
1641                                 return (EADDRINUSE);
1642                         }
1643                 }
1644 
1645                 /* XXX:DCCP XTI/TLI? */
1646                 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1647                 mutex_exit(&connfp->connf_lock);
1648                 break;
1649 
1650         default:
1651                 if (is_system_labeled() &&
1652                     check_exempt_conflict_v6(connp, ipst))
1653                         return (EADDRINUSE);
1654                 /* FALLTHROUGH */
1655         case IPPROTO_UDP:
1656                 if (protocol == IPPROTO_UDP) {
1657                         connfp = &ipst->ips_ipcl_udp_fanout[
1658                             IPCL_UDP_HASH(lport, ipst)];
1659                 } else {
1660                         connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
1661                 }
1662 
1663                 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
1664                         IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1665                 } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
1666                         IPCL_HASH_INSERT_BOUND(connfp, connp);
1667                 } else {
1668                         IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1669                 }
1670                 break;
1671         }
1672 
1673         return (ret);
1674 }
1675 
1676 /*
1677  * v4 packet classifying function. looks up the fanout table to
1678  * find the conn, the packet belongs to. returns the conn with
1679  * the reference held, null otherwise.
1680  *
1681  * If zoneid is ALL_ZONES, then the search rules described in the "Connection
1682  * Lookup" comment block are applied.  Labels are also checked as described
1683  * above.  If the packet is from the inside (looped back), and is from the same
1684  * zone, then label checks are omitted.
1685  */
1686 conn_t *
1687 ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
1688     ip_recv_attr_t *ira, ip_stack_t *ipst)
1689 {
1690         ipha_t  *ipha;
1691         connf_t *connfp, *bind_connfp;
1692         uint16_t lport;
1693         uint16_t fport;
1694         uint32_t ports;
1695         conn_t  *connp;
1696         uint16_t  *up;
1697         zoneid_t        zoneid = ira->ira_zoneid;
1698 
1699         ipha = (ipha_t *)mp->b_rptr;
1700         up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET);
1701 
1702         switch (protocol) {
1703         case IPPROTO_TCP:
1704                 ports = *(uint32_t *)up;
1705                 connfp =
1706                     &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src,
1707                     ports, ipst)];
1708                 mutex_enter(&connfp->connf_lock);
1709                 for (connp = connfp->connf_head; connp != NULL;
1710                     connp = connp->conn_next) {
1711                         if (IPCL_CONN_MATCH(connp, protocol,
1712                             ipha->ipha_src, ipha->ipha_dst, ports) &&
1713                             (connp->conn_zoneid == zoneid ||
1714                             connp->conn_allzones ||
1715                             ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1716                             (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1717                             (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1718                                 break;
1719                 }
1720 
1721                 if (connp != NULL) {
1722                         /*
1723                          * We have a fully-bound TCP connection.
1724                          *
1725                          * For labeled systems, there's no need to check the
1726                          * label here.  It's known to be good as we checked
1727                          * before allowing the connection to become bound.
1728                          */
1729                         CONN_INC_REF(connp);
1730                         mutex_exit(&connfp->connf_lock);
1731                         return (connp);
1732                 }
1733 
1734                 mutex_exit(&connfp->connf_lock);
1735                 lport = up[1];
1736                 bind_connfp =
1737                     &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
1738                 mutex_enter(&bind_connfp->connf_lock);
1739                 for (connp = bind_connfp->connf_head; connp != NULL;
1740                     connp = connp->conn_next) {
1741                         if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst,
1742                             lport) &&
1743                             (connp->conn_zoneid == zoneid ||
1744                             connp->conn_allzones ||
1745                             ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1746                             (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1747                             (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1748                                 break;
1749                 }
1750 
1751                 /*
1752                  * If the matching connection is SLP on a private address, then
1753                  * the label on the packet must match the local zone's label.
1754                  * Otherwise, it must be in the label range defined by tnrh.
1755                  * This is ensured by tsol_receive_local.
1756                  *
1757                  * Note that we don't check tsol_receive_local for
1758                  * the connected case.
1759                  */
1760                 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1761                     !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
1762                     ira, connp)) {
1763                         DTRACE_PROBE3(tx__ip__log__info__classify__tcp,
1764                             char *, "connp(1) could not receive mp(2)",
1765                             conn_t *, connp, mblk_t *, mp);
1766                         connp = NULL;
1767                 }
1768 
1769                 if (connp != NULL) {
1770                         /* Have a listener at least */
1771                         CONN_INC_REF(connp);
1772                         mutex_exit(&bind_connfp->connf_lock);
1773                         return (connp);
1774                 }
1775 
1776                 mutex_exit(&bind_connfp->connf_lock);
1777                 break;
1778 
1779         case IPPROTO_UDP:
1780                 lport = up[1];
1781                 fport = up[0];
1782                 connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
1783                 mutex_enter(&connfp->connf_lock);
1784                 for (connp = connfp->connf_head; connp != NULL;
1785                     connp = connp->conn_next) {
1786                         if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst,
1787                             fport, ipha->ipha_src) &&
1788                             (connp->conn_zoneid == zoneid ||
1789                             connp->conn_allzones ||
1790                             ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1791                             (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE))))
1792                                 break;
1793                 }
1794 
1795                 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1796                     !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
1797                     ira, connp)) {
1798                         DTRACE_PROBE3(tx__ip__log__info__classify__udp,
1799                             char *, "connp(1) could not receive mp(2)",
1800                             conn_t *, connp, mblk_t *, mp);
1801                         connp = NULL;
1802                 }
1803 
1804                 if (connp != NULL) {
1805                         CONN_INC_REF(connp);
1806                         mutex_exit(&connfp->connf_lock);
1807                         return (connp);
1808                 }
1809 
1810                 /*
1811                  * We shouldn't come here for multicast/broadcast packets
1812                  */
1813                 mutex_exit(&connfp->connf_lock);
1814 
1815                 break;
1816 
1817         case IPPROTO_DCCP:
1818                 ports = *(uint32_t *)up;
1819 
1820                 /*
1821                  * Search for fully-bound connection.
1822                  */
1823                 connfp = &ipst->ips_ipcl_dccp_conn_fanout[IPCL_DCCP_CONN_HASH(
1824                     ipha->ipha_src, ports, ipst)];
1825                 mutex_enter(&connfp->connf_lock);
1826                 for (connp = connfp->connf_head; connp != NULL;
1827                     connp = connp->conn_next) {
1828                         /* XXX:DCCP */
1829                         if (IPCL_CONN_MATCH(connp, protocol,
1830                             ipha->ipha_src, ipha->ipha_dst, ports)) {
1831                                 /* XXX */
1832                                 cmn_err(CE_NOTE, "ipclassifier.c: fully bound connection found");
1833                                 break;
1834                         }
1835                 }
1836 
1837                 if (connp != NULL) {
1838                         /*
1839                          * We have a fully-bound DCCP connection.
1840                          */
1841                         CONN_INC_REF(connp);
1842                         mutex_exit(&connfp->connf_lock);
1843                         return (connp);
1844                 }
1845 
1846                 mutex_exit(&connfp->connf_lock);
1847                 lport = up[1];
1848 
1849                 /*
1850                  * Fully-bound connection was not found, search for listener.
1851                  */
1852                 bind_connfp = &ipst->ips_ipcl_dccp_bind_fanout[
1853                     IPCL_DCCP_BIND_HASH(lport, ipst)];
1854                 mutex_enter(&bind_connfp->connf_lock);
1855                 for (connp = bind_connfp->connf_head; connp != NULL;
1856                     connp = connp->conn_next) {
1857                         if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst,
1858                             lport) &&
1859                             (connp->conn_zoneid == zoneid ||
1860                             connp->conn_allzones ||
1861                             ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1862                             (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1863                             (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1864                                 break;
1865                 }
1866 
1867                 if (connp != NULL) {
1868                         cmn_err(CE_NOTE, "ipclassifier.c: half-bound bind listener");
1869                         /* Have a listener at least */
1870                         CONN_INC_REF(connp);
1871                         mutex_exit(&bind_connfp->connf_lock);
1872                         return (connp);
1873                 }
1874 
1875                 mutex_exit(&bind_connfp->connf_lock);
1876                 break;
1877 
1878         case IPPROTO_ENCAP:
1879         case IPPROTO_IPV6:
1880                 return (ipcl_iptun_classify_v4(&ipha->ipha_src,
1881                     &ipha->ipha_dst, ipst));
1882         }
1883 
1884         return (NULL);
1885 }
1886 
1887 conn_t *
1888 ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
1889     ip_recv_attr_t *ira, ip_stack_t *ipst)
1890 {
1891         ip6_t           *ip6h;
1892         connf_t         *connfp, *bind_connfp;
1893         uint16_t        lport;
1894         uint16_t        fport;
1895         tcpha_t         *tcpha;
1896         uint32_t        ports;
1897         conn_t          *connp;
1898         uint16_t        *up;
1899         zoneid_t        zoneid = ira->ira_zoneid;
1900 
1901         ip6h = (ip6_t *)mp->b_rptr;
1902 
1903         switch (protocol) {
1904         case IPPROTO_TCP:
1905                 tcpha = (tcpha_t *)&mp->b_rptr[hdr_len];
1906                 up = &tcpha->tha_lport;
1907                 ports = *(uint32_t *)up;
1908 
1909                 connfp =
1910                     &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src,
1911                     ports, ipst)];
1912                 mutex_enter(&connfp->connf_lock);
1913                 for (connp = connfp->connf_head; connp != NULL;
1914                     connp = connp->conn_next) {
1915                         if (IPCL_CONN_MATCH_V6(connp, protocol,
1916                             ip6h->ip6_src, ip6h->ip6_dst, ports) &&
1917                             (connp->conn_zoneid == zoneid ||
1918                             connp->conn_allzones ||
1919                             ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1920                             (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1921                             (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1922                                 break;
1923                 }
1924 
1925                 if (connp != NULL) {
1926                         /*
1927                          * We have a fully-bound TCP connection.
1928                          *
1929                          * For labeled systems, there's no need to check the
1930                          * label here.  It's known to be good as we checked
1931                          * before allowing the connection to become bound.
1932                          */
1933                         CONN_INC_REF(connp);
1934                         mutex_exit(&connfp->connf_lock);
1935                         return (connp);
1936                 }
1937 
1938                 mutex_exit(&connfp->connf_lock);
1939 
1940                 lport = up[1];
1941                 bind_connfp =
1942                     &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
1943                 mutex_enter(&bind_connfp->connf_lock);
1944                 for (connp = bind_connfp->connf_head; connp != NULL;
1945                     connp = connp->conn_next) {
1946                         if (IPCL_BIND_MATCH_V6(connp, protocol,
1947                             ip6h->ip6_dst, lport) &&
1948                             (connp->conn_zoneid == zoneid ||
1949                             connp->conn_allzones ||
1950                             ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1951                             (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1952                             (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1953                                 break;
1954                 }
1955 
1956                 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1957                     !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
1958                     ira, connp)) {
1959                         DTRACE_PROBE3(tx__ip__log__info__classify__tcp6,
1960                             char *, "connp(1) could not receive mp(2)",
1961                             conn_t *, connp, mblk_t *, mp);
1962                         connp = NULL;
1963                 }
1964 
1965                 if (connp != NULL) {
1966                         /* Have a listner at least */
1967                         CONN_INC_REF(connp);
1968                         mutex_exit(&bind_connfp->connf_lock);
1969                         return (connp);
1970                 }
1971 
1972                 mutex_exit(&bind_connfp->connf_lock);
1973                 break;
1974 
1975         case IPPROTO_UDP:
1976                 up = (uint16_t *)&mp->b_rptr[hdr_len];
1977                 lport = up[1];
1978                 fport = up[0];
1979                 connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
1980                 mutex_enter(&connfp->connf_lock);
1981                 for (connp = connfp->connf_head; connp != NULL;
1982                     connp = connp->conn_next) {
1983                         if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst,
1984                             fport, ip6h->ip6_src) &&
1985                             (connp->conn_zoneid == zoneid ||
1986                             connp->conn_allzones ||
1987                             ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1988                             (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1989                             (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1990                                 break;
1991                 }
1992 
1993                 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1994                     !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
1995                     ira, connp)) {
1996                         DTRACE_PROBE3(tx__ip__log__info__classify__udp6,
1997                             char *, "connp(1) could not receive mp(2)",
1998                             conn_t *, connp, mblk_t *, mp);
1999                         connp = NULL;
2000                 }
2001 
2002                 if (connp != NULL) {
2003                         CONN_INC_REF(connp);
2004                         mutex_exit(&connfp->connf_lock);
2005                         return (connp);
2006                 }
2007 
2008                 /*
2009                  * We shouldn't come here for multicast/broadcast packets
2010                  */
2011                 mutex_exit(&connfp->connf_lock);
2012                 break;
2013         case IPPROTO_ENCAP:
2014         case IPPROTO_IPV6:
2015                 return (ipcl_iptun_classify_v6(&ip6h->ip6_src,
2016                     &ip6h->ip6_dst, ipst));
2017         }
2018 
2019         return (NULL);
2020 }
2021 
2022 /*
2023  * wrapper around ipcl_classify_(v4,v6) routines.
2024  */
2025 conn_t *
2026 ipcl_classify(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst)
2027 {
2028         if (ira->ira_flags & IRAF_IS_IPV4) {
2029                 return (ipcl_classify_v4(mp, ira->ira_protocol,
2030                     ira->ira_ip_hdr_length, ira, ipst));
2031         } else {
2032                 return (ipcl_classify_v6(mp, ira->ira_protocol,
2033                     ira->ira_ip_hdr_length, ira, ipst));
2034         }
2035 }
2036 
2037 /*
2038  * Only used to classify SCTP RAW sockets
2039  */
2040 conn_t *
2041 ipcl_classify_raw(mblk_t *mp, uint8_t protocol, uint32_t ports,
2042     ipha_t *ipha, ip6_t *ip6h, ip_recv_attr_t *ira, ip_stack_t *ipst)
2043 {
2044         connf_t         *connfp;
2045         conn_t          *connp;
2046         in_port_t       lport;
2047         int             ipversion;
2048         const void      *dst;
2049         zoneid_t        zoneid = ira->ira_zoneid;
2050 
2051         lport = ((uint16_t *)&ports)[1];
2052         if (ira->ira_flags & IRAF_IS_IPV4) {
2053                 dst = (const void *)&ipha->ipha_dst;
2054                 ipversion = IPV4_VERSION;
2055         } else {
2056                 dst = (const void *)&ip6h->ip6_dst;
2057                 ipversion = IPV6_VERSION;
2058         }
2059 
2060         connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
2061         mutex_enter(&connfp->connf_lock);
2062         for (connp = connfp->connf_head; connp != NULL;
2063             connp = connp->conn_next) {
2064                 /* We don't allow v4 fallback for v6 raw socket. */
2065                 if (ipversion != connp->conn_ipversion)
2066                         continue;
2067                 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
2068                     !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
2069                         if (ipversion == IPV4_VERSION) {
2070                                 if (!IPCL_CONN_MATCH(connp, protocol,
2071                                     ipha->ipha_src, ipha->ipha_dst, ports))
2072                                         continue;
2073                         } else {
2074                                 if (!IPCL_CONN_MATCH_V6(connp, protocol,
2075                                     ip6h->ip6_src, ip6h->ip6_dst, ports))
2076                                         continue;
2077                         }
2078                 } else {
2079                         if (ipversion == IPV4_VERSION) {
2080                                 if (!IPCL_BIND_MATCH(connp, protocol,
2081                                     ipha->ipha_dst, lport))
2082                                         continue;
2083                         } else {
2084                                 if (!IPCL_BIND_MATCH_V6(connp, protocol,
2085                                     ip6h->ip6_dst, lport))
2086                                         continue;
2087                         }
2088                 }
2089 
2090                 if (connp->conn_zoneid == zoneid ||
2091                     connp->conn_allzones ||
2092                     ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
2093                     (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
2094                     (ira->ira_flags & IRAF_TX_SHARED_ADDR)))
2095                         break;
2096         }
2097 
2098         if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
2099             !tsol_receive_local(mp, dst, ipversion, ira, connp)) {
2100                 DTRACE_PROBE3(tx__ip__log__info__classify__rawip,
2101                     char *, "connp(1) could not receive mp(2)",
2102                     conn_t *, connp, mblk_t *, mp);
2103                 connp = NULL;
2104         }
2105 
2106         if (connp != NULL)
2107                 goto found;
2108         mutex_exit(&connfp->connf_lock);
2109 
2110         /* Try to look for a wildcard SCTP RAW socket match. */
2111         connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)];
2112         mutex_enter(&connfp->connf_lock);
2113         for (connp = connfp->connf_head; connp != NULL;
2114             connp = connp->conn_next) {
2115                 /* We don't allow v4 fallback for v6 raw socket. */
2116                 if (ipversion != connp->conn_ipversion)
2117                         continue;
2118                 if (!IPCL_ZONE_MATCH(connp, zoneid))
2119                         continue;
2120 
2121                 if (ipversion == IPV4_VERSION) {
2122                         if (IPCL_RAW_MATCH(connp, protocol, ipha->ipha_dst))
2123                                 break;
2124                 } else {
2125                         if (IPCL_RAW_MATCH_V6(connp, protocol, ip6h->ip6_dst)) {
2126                                 break;
2127                         }
2128                 }
2129         }
2130 
2131         if (connp != NULL)
2132                 goto found;
2133 
2134         mutex_exit(&connfp->connf_lock);
2135         return (NULL);
2136 
2137 found:
2138         ASSERT(connp != NULL);
2139         CONN_INC_REF(connp);
2140         mutex_exit(&connfp->connf_lock);
2141         return (connp);
2142 }
2143 
2144 /* ARGSUSED */
2145 static int
2146 tcp_conn_constructor(void *buf, void *cdrarg, int kmflags)
2147 {
2148         itc_t   *itc = (itc_t *)buf;
2149         conn_t  *connp = &itc->itc_conn;
2150         tcp_t   *tcp = (tcp_t *)&itc[1];
2151 
2152         bzero(connp, sizeof (conn_t));
2153         bzero(tcp, sizeof (tcp_t));
2154 
2155         mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2156         cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2157         cv_init(&connp->conn_sq_cv, NULL, CV_DEFAULT, NULL);
2158         tcp->tcp_timercache = tcp_timermp_alloc(kmflags);
2159         if (tcp->tcp_timercache == NULL)
2160                 return (ENOMEM);
2161         connp->conn_tcp = tcp;
2162         connp->conn_flags = IPCL_TCPCONN;
2163         connp->conn_proto = IPPROTO_TCP;
2164         tcp->tcp_connp = connp;
2165         rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2166 
2167         connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2168         if (connp->conn_ixa == NULL) {
2169                 tcp_timermp_free(tcp);
2170                 return (ENOMEM);
2171         }
2172         connp->conn_ixa->ixa_refcnt = 1;
2173         connp->conn_ixa->ixa_protocol = connp->conn_proto;
2174         connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2175         return (0);
2176 }
2177 
2178 /* ARGSUSED */
2179 static void
2180 tcp_conn_destructor(void *buf, void *cdrarg)
2181 {
2182         itc_t   *itc = (itc_t *)buf;
2183         conn_t  *connp = &itc->itc_conn;
2184         tcp_t   *tcp = (tcp_t *)&itc[1];
2185 
2186         ASSERT(connp->conn_flags & IPCL_TCPCONN);
2187         ASSERT(tcp->tcp_connp == connp);
2188         ASSERT(connp->conn_tcp == tcp);
2189         tcp_timermp_free(tcp);
2190         mutex_destroy(&connp->conn_lock);
2191         cv_destroy(&connp->conn_cv);
2192         cv_destroy(&connp->conn_sq_cv);
2193         rw_destroy(&connp->conn_ilg_lock);
2194 
2195         /* Can be NULL if constructor failed */
2196         if (connp->conn_ixa != NULL) {
2197                 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2198                 ASSERT(connp->conn_ixa->ixa_ire == NULL);
2199                 ASSERT(connp->conn_ixa->ixa_nce == NULL);
2200                 ixa_refrele(connp->conn_ixa);
2201         }
2202 }
2203 
2204 /* ARGSUSED */
2205 static int
2206 ip_conn_constructor(void *buf, void *cdrarg, int kmflags)
2207 {
2208         itc_t   *itc = (itc_t *)buf;
2209         conn_t  *connp = &itc->itc_conn;
2210 
2211         bzero(connp, sizeof (conn_t));
2212         mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2213         cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2214         connp->conn_flags = IPCL_IPCCONN;
2215         rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2216 
2217         connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2218         if (connp->conn_ixa == NULL)
2219                 return (ENOMEM);
2220         connp->conn_ixa->ixa_refcnt = 1;
2221         connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2222         return (0);
2223 }
2224 
2225 /* ARGSUSED */
2226 static void
2227 ip_conn_destructor(void *buf, void *cdrarg)
2228 {
2229         itc_t   *itc = (itc_t *)buf;
2230         conn_t  *connp = &itc->itc_conn;
2231 
2232         ASSERT(connp->conn_flags & IPCL_IPCCONN);
2233         ASSERT(connp->conn_priv == NULL);
2234         mutex_destroy(&connp->conn_lock);
2235         cv_destroy(&connp->conn_cv);
2236         rw_destroy(&connp->conn_ilg_lock);
2237 
2238         /* Can be NULL if constructor failed */
2239         if (connp->conn_ixa != NULL) {
2240                 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2241                 ASSERT(connp->conn_ixa->ixa_ire == NULL);
2242                 ASSERT(connp->conn_ixa->ixa_nce == NULL);
2243                 ixa_refrele(connp->conn_ixa);
2244         }
2245 }
2246 
2247 /* ARGSUSED */
2248 static int
2249 udp_conn_constructor(void *buf, void *cdrarg, int kmflags)
2250 {
2251         itc_t   *itc = (itc_t *)buf;
2252         conn_t  *connp = &itc->itc_conn;
2253         udp_t   *udp = (udp_t *)&itc[1];
2254 
2255         bzero(connp, sizeof (conn_t));
2256         bzero(udp, sizeof (udp_t));
2257 
2258         mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2259         cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2260         connp->conn_udp = udp;
2261         connp->conn_flags = IPCL_UDPCONN;
2262         connp->conn_proto = IPPROTO_UDP;
2263         udp->udp_connp = connp;
2264         rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2265         connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2266         if (connp->conn_ixa == NULL)
2267                 return (ENOMEM);
2268         connp->conn_ixa->ixa_refcnt = 1;
2269         connp->conn_ixa->ixa_protocol = connp->conn_proto;
2270         connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2271         return (0);
2272 }
2273 
2274 /* ARGSUSED */
2275 static void
2276 udp_conn_destructor(void *buf, void *cdrarg)
2277 {
2278         itc_t   *itc = (itc_t *)buf;
2279         conn_t  *connp = &itc->itc_conn;
2280         udp_t   *udp = (udp_t *)&itc[1];
2281 
2282         ASSERT(connp->conn_flags & IPCL_UDPCONN);
2283         ASSERT(udp->udp_connp == connp);
2284         ASSERT(connp->conn_udp == udp);
2285         mutex_destroy(&connp->conn_lock);
2286         cv_destroy(&connp->conn_cv);
2287         rw_destroy(&connp->conn_ilg_lock);
2288 
2289         /* Can be NULL if constructor failed */
2290         if (connp->conn_ixa != NULL) {
2291                 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2292                 ASSERT(connp->conn_ixa->ixa_ire == NULL);
2293                 ASSERT(connp->conn_ixa->ixa_nce == NULL);
2294                 ixa_refrele(connp->conn_ixa);
2295         }
2296 }
2297 
2298 /* ARGSUSED */
2299 static int
2300 rawip_conn_constructor(void *buf, void *cdrarg, int kmflags)
2301 {
2302         itc_t   *itc = (itc_t *)buf;
2303         conn_t  *connp = &itc->itc_conn;
2304         icmp_t  *icmp = (icmp_t *)&itc[1];
2305 
2306         bzero(connp, sizeof (conn_t));
2307         bzero(icmp, sizeof (icmp_t));
2308 
2309         mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2310         cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2311         connp->conn_icmp = icmp;
2312         connp->conn_flags = IPCL_RAWIPCONN;
2313         connp->conn_proto = IPPROTO_ICMP;
2314         icmp->icmp_connp = connp;
2315         rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2316         connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2317         if (connp->conn_ixa == NULL)
2318                 return (ENOMEM);
2319         connp->conn_ixa->ixa_refcnt = 1;
2320         connp->conn_ixa->ixa_protocol = connp->conn_proto;
2321         connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2322         return (0);
2323 }
2324 
2325 /* ARGSUSED */
2326 static void
2327 rawip_conn_destructor(void *buf, void *cdrarg)
2328 {
2329         itc_t   *itc = (itc_t *)buf;
2330         conn_t  *connp = &itc->itc_conn;
2331         icmp_t  *icmp = (icmp_t *)&itc[1];
2332 
2333         ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
2334         ASSERT(icmp->icmp_connp == connp);
2335         ASSERT(connp->conn_icmp == icmp);
2336         mutex_destroy(&connp->conn_lock);
2337         cv_destroy(&connp->conn_cv);
2338         rw_destroy(&connp->conn_ilg_lock);
2339 
2340         /* Can be NULL if constructor failed */
2341         if (connp->conn_ixa != NULL) {
2342                 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2343                 ASSERT(connp->conn_ixa->ixa_ire == NULL);
2344                 ASSERT(connp->conn_ixa->ixa_nce == NULL);
2345                 ixa_refrele(connp->conn_ixa);
2346         }
2347 }
2348 
2349 /* ARGSUSED */
2350 static int
2351 rts_conn_constructor(void *buf, void *cdrarg, int kmflags)
2352 {
2353         itc_t   *itc = (itc_t *)buf;
2354         conn_t  *connp = &itc->itc_conn;
2355         rts_t   *rts = (rts_t *)&itc[1];
2356 
2357         bzero(connp, sizeof (conn_t));
2358         bzero(rts, sizeof (rts_t));
2359 
2360         mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2361         cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2362         connp->conn_rts = rts;
2363         connp->conn_flags = IPCL_RTSCONN;
2364         rts->rts_connp = connp;
2365         rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2366         connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2367         if (connp->conn_ixa == NULL)
2368                 return (ENOMEM);
2369         connp->conn_ixa->ixa_refcnt = 1;
2370         connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2371         return (0);
2372 }
2373 
2374 /* ARGSUSED */
2375 static void
2376 rts_conn_destructor(void *buf, void *cdrarg)
2377 {
2378         itc_t   *itc = (itc_t *)buf;
2379         conn_t  *connp = &itc->itc_conn;
2380         rts_t   *rts = (rts_t *)&itc[1];
2381 
2382         ASSERT(connp->conn_flags & IPCL_RTSCONN);
2383         ASSERT(rts->rts_connp == connp);
2384         ASSERT(connp->conn_rts == rts);
2385         mutex_destroy(&connp->conn_lock);
2386         cv_destroy(&connp->conn_cv);
2387         rw_destroy(&connp->conn_ilg_lock);
2388 
2389         /* Can be NULL if constructor failed */
2390         if (connp->conn_ixa != NULL) {
2391                 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2392                 ASSERT(connp->conn_ixa->ixa_ire == NULL);
2393                 ASSERT(connp->conn_ixa->ixa_nce == NULL);
2394                 ixa_refrele(connp->conn_ixa);
2395         }
2396 }
2397 
2398 /* ARGSUSED */
2399 static int
2400 dccp_conn_constructor(void *buf, void *cdrarg, int kmflags)
2401 {
2402         itc_t   *itc = (itc_t *)buf;
2403         conn_t  *connp = &itc->itc_conn;
2404         dccp_t  *dccp = (dccp_t *)&itc[1];
2405 
2406         bzero(connp, sizeof (conn_t));
2407         bzero(dccp, sizeof (dccp_t));
2408 
2409         mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2410         cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2411         rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2412 
2413         dccp->dccp_timercache = dccp_timermp_alloc(kmflags);
2414         if (dccp->dccp_timercache == NULL) {
2415                 return (ENOMEM);
2416         }
2417 
2418         connp->conn_dccp = dccp;
2419         connp->conn_flags = IPCL_DCCPCONN;
2420         connp->conn_proto = IPPROTO_DCCP;
2421         dccp->dccp_connp = connp;
2422 
2423         connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2424         if (connp->conn_ixa == NULL) {
2425                 return (NULL);
2426         }
2427 
2428         connp->conn_ixa->ixa_refcnt = 1;
2429         connp->conn_ixa->ixa_protocol = connp->conn_proto;
2430         connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2431 
2432         return (0);
2433 }
2434 
2435 /* ARGSUSED */
2436 static void
2437 dccp_conn_destructor(void *buf, void *cdrarg)
2438 {
2439         itc_t   *itc = (itc_t *)buf;
2440         conn_t  *connp = &itc->itc_conn;
2441         dccp_t  *dccp = (dccp_t *)&itc[1];
2442 
2443         ASSERT(connp->conn_flags & IPCL_DCCPCONN);
2444         ASSERT(dccp->dccp_connp == connp);
2445         ASSERT(connp->conn_dccp == dccp);
2446 
2447         dccp_timermp_free(dccp);
2448 
2449         mutex_destroy(&connp->conn_lock);
2450         cv_destroy(&connp->conn_cv);
2451         rw_destroy(&connp->conn_ilg_lock);
2452 
2453         if (connp->conn_ixa != NULL) {
2454                 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2455                 ASSERT(connp->conn_ixa->ixa_ire == NULL);
2456                 ASSERT(connp->conn_ixa->ixa_nce == NULL);
2457 
2458                 ixa_refrele(connp->conn_ixa);
2459         }
2460 }
2461 
2462 /*
2463  * Called as part of ipcl_conn_destroy to assert and clear any pointers
2464  * in the conn_t.
2465  *
2466  * Below we list all the pointers in the conn_t as a documentation aid.
2467  * The ones that we can not ASSERT to be NULL are #ifdef'ed out.
2468  * If you add any pointers to the conn_t please add an ASSERT here
2469  * and #ifdef it out if it can't be actually asserted to be NULL.
2470  * In any case, we bzero most of the conn_t at the end of the function.
2471  */
2472 void
2473 ipcl_conn_cleanup(conn_t *connp)
2474 {
2475         ip_xmit_attr_t  *ixa;
2476 
2477         ASSERT(connp->conn_latch == NULL);
2478         ASSERT(connp->conn_latch_in_policy == NULL);
2479         ASSERT(connp->conn_latch_in_action == NULL);
2480 #ifdef notdef
2481         ASSERT(connp->conn_rq == NULL);
2482         ASSERT(connp->conn_wq == NULL);
2483 #endif
2484         ASSERT(connp->conn_cred == NULL);
2485         ASSERT(connp->conn_g_fanout == NULL);
2486         ASSERT(connp->conn_g_next == NULL);
2487         ASSERT(connp->conn_g_prev == NULL);
2488         ASSERT(connp->conn_policy == NULL);
2489         ASSERT(connp->conn_fanout == NULL);
2490         ASSERT(connp->conn_next == NULL);
2491         ASSERT(connp->conn_prev == NULL);
2492         ASSERT(connp->conn_oper_pending_ill == NULL);
2493         ASSERT(connp->conn_ilg == NULL);
2494         ASSERT(connp->conn_drain_next == NULL);
2495         ASSERT(connp->conn_drain_prev == NULL);
2496 #ifdef notdef
2497         /* conn_idl is not cleared when removed from idl list */
2498         ASSERT(connp->conn_idl == NULL);
2499 #endif
2500         ASSERT(connp->conn_ipsec_opt_mp == NULL);
2501 #ifdef notdef
2502         /* conn_netstack is cleared by the caller; needed by ixa_cleanup */
2503         ASSERT(connp->conn_netstack == NULL);
2504 #endif
2505 
2506         ASSERT(connp->conn_helper_info == NULL);
2507         ASSERT(connp->conn_ixa != NULL);
2508         ixa = connp->conn_ixa;
2509         ASSERT(ixa->ixa_refcnt == 1);
2510         /* Need to preserve ixa_protocol */
2511         ixa_cleanup(ixa);
2512         ixa->ixa_flags = 0;
2513 
2514         /* Clear out the conn_t fields that are not preserved */
2515         bzero(&connp->conn_start_clr,
2516             sizeof (conn_t) -
2517             ((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp));
2518 }
2519 
2520 /*
2521  * All conns are inserted in a global multi-list for the benefit of
2522  * walkers. The walk is guaranteed to walk all open conns at the time
2523  * of the start of the walk exactly once. This property is needed to
2524  * achieve some cleanups during unplumb of interfaces. This is achieved
2525  * as follows.
2526  *
2527  * ipcl_conn_create and ipcl_conn_destroy are the only functions that
2528  * call the insert and delete functions below at creation and deletion
2529  * time respectively. The conn never moves or changes its position in this
2530  * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt
2531  * won't increase due to walkers, once the conn deletion has started. Note
2532  * that we can't remove the conn from the global list and then wait for
2533  * the refcnt to drop to zero, since walkers would then see a truncated
2534  * list. CONN_INCIPIENT ensures that walkers don't start looking at
2535  * conns until ip_open is ready to make them globally visible.
2536  * The global round robin multi-list locks are held only to get the
2537  * next member/insertion/deletion and contention should be negligible
2538  * if the multi-list is much greater than the number of cpus.
2539  */
2540 void
2541 ipcl_globalhash_insert(conn_t *connp)
2542 {
2543         int     index;
2544         struct connf_s  *connfp;
2545         ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
2546 
2547         /*
2548          * No need for atomic here. Approximate even distribution
2549          * in the global lists is sufficient.
2550          */
2551         ipst->ips_conn_g_index++;
2552         index = ipst->ips_conn_g_index & (CONN_G_HASH_SIZE - 1);
2553 
2554         connp->conn_g_prev = NULL;
2555         /*
2556          * Mark as INCIPIENT, so that walkers will ignore this
2557          * for now, till ip_open is ready to make it visible globally.
2558          */
2559         connp->conn_state_flags |= CONN_INCIPIENT;
2560 
2561         connfp = &ipst->ips_ipcl_globalhash_fanout[index];
2562         /* Insert at the head of the list */
2563         mutex_enter(&connfp->connf_lock);
2564         connp->conn_g_next = connfp->connf_head;
2565         if (connp->conn_g_next != NULL)
2566                 connp->conn_g_next->conn_g_prev = connp;
2567         connfp->connf_head = connp;
2568 
2569         /* The fanout bucket this conn points to */
2570         connp->conn_g_fanout = connfp;
2571 
2572         mutex_exit(&connfp->connf_lock);
2573 }
2574 
2575 void
2576 ipcl_globalhash_remove(conn_t *connp)
2577 {
2578         struct connf_s  *connfp;
2579 
2580         /*
2581          * We were never inserted in the global multi list.
2582          * IPCL_NONE variety is never inserted in the global multilist
2583          * since it is presumed to not need any cleanup and is transient.
2584          */
2585         if (connp->conn_g_fanout == NULL)
2586                 return;
2587 
2588         connfp = connp->conn_g_fanout;
2589         mutex_enter(&connfp->connf_lock);
2590         if (connp->conn_g_prev != NULL)
2591                 connp->conn_g_prev->conn_g_next = connp->conn_g_next;
2592         else
2593                 connfp->connf_head = connp->conn_g_next;
2594         if (connp->conn_g_next != NULL)
2595                 connp->conn_g_next->conn_g_prev = connp->conn_g_prev;
2596         mutex_exit(&connfp->connf_lock);
2597 
2598         /* Better to stumble on a null pointer than to corrupt memory */
2599         connp->conn_g_next = NULL;
2600         connp->conn_g_prev = NULL;
2601         connp->conn_g_fanout = NULL;
2602 }
2603 
2604 /*
2605  * Walk the list of all conn_t's in the system, calling the function provided
2606  * With the specified argument for each.
2607  * Applies to both IPv4 and IPv6.
2608  *
2609  * CONNs may hold pointers to ills (conn_dhcpinit_ill and
2610  * conn_oper_pending_ill). To guard against stale pointers
2611  * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is
2612  * unplumbed or removed. New conn_t's that are created while we are walking
2613  * may be missed by this walk, because they are not necessarily inserted
2614  * at the tail of the list. They are new conn_t's and thus don't have any
2615  * stale pointers. The CONN_CLOSING flag ensures that no new reference
2616  * is created to the struct that is going away.
2617  */
2618 void
2619 ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst)
2620 {
2621         int     i;
2622         conn_t  *connp;
2623         conn_t  *prev_connp;
2624 
2625         for (i = 0; i < CONN_G_HASH_SIZE; i++) {
2626                 mutex_enter(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2627                 prev_connp = NULL;
2628                 connp = ipst->ips_ipcl_globalhash_fanout[i].connf_head;
2629                 while (connp != NULL) {
2630                         mutex_enter(&connp->conn_lock);
2631                         if (connp->conn_state_flags &
2632                             (CONN_CONDEMNED | CONN_INCIPIENT)) {
2633                                 mutex_exit(&connp->conn_lock);
2634                                 connp = connp->conn_g_next;
2635                                 continue;
2636                         }
2637                         CONN_INC_REF_LOCKED(connp);
2638                         mutex_exit(&connp->conn_lock);
2639                         mutex_exit(
2640                             &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2641                         (*func)(connp, arg);
2642                         if (prev_connp != NULL)
2643                                 CONN_DEC_REF(prev_connp);
2644                         mutex_enter(
2645                             &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2646                         prev_connp = connp;
2647                         connp = connp->conn_g_next;
2648                 }
2649                 mutex_exit(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2650                 if (prev_connp != NULL)
2651                         CONN_DEC_REF(prev_connp);
2652         }
2653 }
2654 
2655 /*
2656  * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on
2657  * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
2658  * held; caller must call CONN_DEC_REF.  Only checks for connected entries
2659  * (peer tcp in ESTABLISHED state).
2660  */
2661 conn_t *
2662 ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcpha_t *tcpha,
2663     ip_stack_t *ipst)
2664 {
2665         uint32_t ports;
2666         uint16_t *pports = (uint16_t *)&ports;
2667         connf_t *connfp;
2668         conn_t  *tconnp;
2669         boolean_t zone_chk;
2670 
2671         /*
2672          * If either the source of destination address is loopback, then
2673          * both endpoints must be in the same Zone.  Otherwise, both of
2674          * the addresses are system-wide unique (tcp is in ESTABLISHED
2675          * state) and the endpoints may reside in different Zones.
2676          */
2677         zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) ||
2678             ipha->ipha_dst == htonl(INADDR_LOOPBACK));
2679 
2680         pports[0] = tcpha->tha_fport;
2681         pports[1] = tcpha->tha_lport;
2682 
2683         connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2684             ports, ipst)];
2685 
2686         mutex_enter(&connfp->connf_lock);
2687         for (tconnp = connfp->connf_head; tconnp != NULL;
2688             tconnp = tconnp->conn_next) {
2689 
2690                 if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2691                     ipha->ipha_dst, ipha->ipha_src, ports) &&
2692                     tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
2693                     (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
2694 
2695                         ASSERT(tconnp != connp);
2696                         CONN_INC_REF(tconnp);
2697                         mutex_exit(&connfp->connf_lock);
2698                         return (tconnp);
2699                 }
2700         }
2701         mutex_exit(&connfp->connf_lock);
2702         return (NULL);
2703 }
2704 
2705 /*
2706  * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on
2707  * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
2708  * held; caller must call CONN_DEC_REF.  Only checks for connected entries
2709  * (peer tcp in ESTABLISHED state).
2710  */
2711 conn_t *
2712 ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcpha_t *tcpha,
2713     ip_stack_t *ipst)
2714 {
2715         uint32_t ports;
2716         uint16_t *pports = (uint16_t *)&ports;
2717         connf_t *connfp;
2718         conn_t  *tconnp;
2719         boolean_t zone_chk;
2720 
2721         /*
2722          * If either the source of destination address is loopback, then
2723          * both endpoints must be in the same Zone.  Otherwise, both of
2724          * the addresses are system-wide unique (tcp is in ESTABLISHED
2725          * state) and the endpoints may reside in different Zones.  We
2726          * don't do Zone check for link local address(es) because the
2727          * current Zone implementation treats each link local address as
2728          * being unique per system node, i.e. they belong to global Zone.
2729          */
2730         zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) ||
2731             IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst));
2732 
2733         pports[0] = tcpha->tha_fport;
2734         pports[1] = tcpha->tha_lport;
2735 
2736         connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
2737             ports, ipst)];
2738 
2739         mutex_enter(&connfp->connf_lock);
2740         for (tconnp = connfp->connf_head; tconnp != NULL;
2741             tconnp = tconnp->conn_next) {
2742 
2743                 /* We skip conn_bound_if check here as this is loopback tcp */
2744                 if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2745                     ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2746                     tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
2747                     (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
2748 
2749                         ASSERT(tconnp != connp);
2750                         CONN_INC_REF(tconnp);
2751                         mutex_exit(&connfp->connf_lock);
2752                         return (tconnp);
2753                 }
2754         }
2755         mutex_exit(&connfp->connf_lock);
2756         return (NULL);
2757 }
2758 
2759 /*
2760  * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2761  * Returns with conn reference held. Caller must call CONN_DEC_REF.
2762  * Only checks for connected entries i.e. no INADDR_ANY checks.
2763  */
2764 conn_t *
2765 ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcpha_t *tcpha, int min_state,
2766     ip_stack_t *ipst)
2767 {
2768         uint32_t ports;
2769         uint16_t *pports;
2770         connf_t *connfp;
2771         conn_t  *tconnp;
2772 
2773         pports = (uint16_t *)&ports;
2774         pports[0] = tcpha->tha_fport;
2775         pports[1] = tcpha->tha_lport;
2776 
2777         connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2778             ports, ipst)];
2779 
2780         mutex_enter(&connfp->connf_lock);
2781         for (tconnp = connfp->connf_head; tconnp != NULL;
2782             tconnp = tconnp->conn_next) {
2783 
2784                 if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2785                     ipha->ipha_dst, ipha->ipha_src, ports) &&
2786                     tconnp->conn_tcp->tcp_state >= min_state) {
2787 
2788                         CONN_INC_REF(tconnp);
2789                         mutex_exit(&connfp->connf_lock);
2790                         return (tconnp);
2791                 }
2792         }
2793         mutex_exit(&connfp->connf_lock);
2794         return (NULL);
2795 }
2796 
2797 /*
2798  * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2799  * Returns with conn reference held. Caller must call CONN_DEC_REF.
2800  * Only checks for connected entries i.e. no INADDR_ANY checks.
2801  * Match on ifindex in addition to addresses.
2802  */
2803 conn_t *
2804 ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state,
2805     uint_t ifindex, ip_stack_t *ipst)
2806 {
2807         tcp_t   *tcp;
2808         uint32_t ports;
2809         uint16_t *pports;
2810         connf_t *connfp;
2811         conn_t  *tconnp;
2812 
2813         pports = (uint16_t *)&ports;
2814         pports[0] = tcpha->tha_fport;
2815         pports[1] = tcpha->tha_lport;
2816 
2817         connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
2818             ports, ipst)];
2819 
2820         mutex_enter(&connfp->connf_lock);
2821         for (tconnp = connfp->connf_head; tconnp != NULL;
2822             tconnp = tconnp->conn_next) {
2823 
2824                 tcp = tconnp->conn_tcp;
2825                 if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2826                     ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2827                     tcp->tcp_state >= min_state &&
2828                     (tconnp->conn_bound_if == 0 ||
2829                     tconnp->conn_bound_if == ifindex)) {
2830 
2831                         CONN_INC_REF(tconnp);
2832                         mutex_exit(&connfp->connf_lock);
2833                         return (tconnp);
2834                 }
2835         }
2836         mutex_exit(&connfp->connf_lock);
2837         return (NULL);
2838 }
2839 
2840 /*
2841  * Same as ipcl_tcp_lookup_reversed_ipv4.
2842  */
2843 conn_t *
2844 ipcl_dccp_lookup_reversed_ipv4(ipha_t *ipha, dccpha_t *dccpha, int min_state,
2845     ip_stack_t *ipst)
2846 {
2847         conn_t          *tconnp;
2848         connf_t         *connfp;
2849         uint16_t        *pports;
2850         uint32_t        ports;
2851 
2852         pports = (uint16_t *)&ports;
2853         pports[0] = dccpha->dha_fport;
2854         pports[1] = dccpha->dha_lport;
2855 
2856         connfp = &ipst->ips_ipcl_dccp_conn_fanout[IPCL_DCCP_CONN_HASH(
2857             ipha->ipha_dst, ports, ipst)];
2858 
2859         mutex_enter(&connfp->connf_lock);
2860         for (tconnp = connfp->connf_head; tconnp != NULL;
2861             tconnp = tconnp->conn_next) {
2862                 if (IPCL_CONN_MATCH(tconnp, IPPROTO_DCCP,
2863                     ipha->ipha_dst, ipha->ipha_src, ports) &&
2864                     tconnp->conn_dccp->dccp_state >= min_state) {
2865                         CONN_INC_REF(tconnp);
2866                         mutex_exit(&connfp->connf_lock);
2867                         return (tconnp);
2868                 }
2869         }
2870         mutex_exit(&connfp->connf_lock);
2871 
2872         return (NULL);
2873 }
2874 
2875 /*
2876  * Same as ipcl_tcp_lookup_reversed_ipv6.
2877  */
2878 conn_t *
2879 ipcl_dccp_lookup_reversed_ipv6(ip6_t *ip6h, dccpha_t *dccpha, int min_state,
2880     uint_t ifindex, ip_stack_t *ipst)
2881 {
2882         conn_t          *tconnp;
2883         tcp_t           *tcp;
2884         connf_t         *connfp;
2885         uint32_t         ports;
2886         uint16_t         *pports;
2887 
2888         pports = (uint16_t *)&ports;
2889         pports[0] = dccpha->dha_fport;
2890         pports[1] = dccpha->dha_lport;
2891 /*
2892         connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
2893             ports, ipst)];
2894 
2895         mutex_enter(&connfp->connf_lock);
2896         for (tconnp = connfp->connf_head; tconnp != NULL;
2897             tconnp = tconnp->conn_next) {
2898 
2899                 tcp = tconnp->conn_tcp;
2900                 if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2901                     ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2902                     tcp->tcp_state >= min_state &&
2903                     (tconnp->conn_bound_if == 0 ||
2904                     tconnp->conn_bound_if == ifindex)) {
2905 
2906                         CONN_INC_REF(tconnp);
2907                         mutex_exit(&connfp->connf_lock);
2908                         return (tconnp);
2909                 }
2910         }
2911         mutex_exit(&connfp->connf_lock);
2912 */
2913         return (NULL);
2914 }
2915 
2916 /*
2917  * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate
2918  * a listener when changing state.
2919  */
2920 conn_t *
2921 ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid,
2922     ip_stack_t *ipst)
2923 {
2924         connf_t         *bind_connfp;
2925         conn_t          *connp;
2926         tcp_t           *tcp;
2927 
2928         /*
2929          * Avoid false matches for packets sent to an IP destination of
2930          * all zeros.
2931          */
2932         if (laddr == 0)
2933                 return (NULL);
2934 
2935         ASSERT(zoneid != ALL_ZONES);
2936 
2937         bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2938         mutex_enter(&bind_connfp->connf_lock);
2939         for (connp = bind_connfp->connf_head; connp != NULL;
2940             connp = connp->conn_next) {
2941                 tcp = connp->conn_tcp;
2942                 if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) &&
2943                     IPCL_ZONE_MATCH(connp, zoneid) &&
2944                     (tcp->tcp_listener == NULL)) {
2945                         CONN_INC_REF(connp);
2946                         mutex_exit(&bind_connfp->connf_lock);
2947                         return (connp);
2948                 }
2949         }
2950         mutex_exit(&bind_connfp->connf_lock);
2951         return (NULL);
2952 }
2953 
2954 /*
2955  * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate
2956  * a listener when changing state.
2957  */
2958 conn_t *
2959 ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex,
2960     zoneid_t zoneid, ip_stack_t *ipst)
2961 {
2962         connf_t         *bind_connfp;
2963         conn_t          *connp = NULL;
2964         tcp_t           *tcp;
2965 
2966         /*
2967          * Avoid false matches for packets sent to an IP destination of
2968          * all zeros.
2969          */
2970         if (IN6_IS_ADDR_UNSPECIFIED(laddr))
2971                 return (NULL);
2972 
2973         ASSERT(zoneid != ALL_ZONES);
2974 
2975         bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2976         mutex_enter(&bind_connfp->connf_lock);
2977         for (connp = bind_connfp->connf_head; connp != NULL;
2978             connp = connp->conn_next) {
2979                 tcp = connp->conn_tcp;
2980                 if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) &&
2981                     IPCL_ZONE_MATCH(connp, zoneid) &&
2982                     (connp->conn_bound_if == 0 ||
2983                     connp->conn_bound_if == ifindex) &&
2984                     tcp->tcp_listener == NULL) {
2985                         CONN_INC_REF(connp);
2986                         mutex_exit(&bind_connfp->connf_lock);
2987                         return (connp);
2988                 }
2989         }
2990         mutex_exit(&bind_connfp->connf_lock);
2991         return (NULL);
2992 }
2993 
2994 /*
2995  * ipcl_get_next_conn
2996  *      get the next entry in the conn global list
2997  *      and put a reference on the next_conn.
2998  *      decrement the reference on the current conn.
2999  *
3000  * This is an iterator based walker function that also provides for
3001  * some selection by the caller. It walks through the conn_hash bucket
3002  * searching for the next valid connp in the list, and selects connections
3003  * that are neither closed nor condemned. It also REFHOLDS the conn
3004  * thus ensuring that the conn exists when the caller uses the conn.
3005  */
3006 conn_t *
3007 ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags)
3008 {
3009         conn_t  *next_connp;
3010 
3011         if (connfp == NULL)
3012                 return (NULL);
3013 
3014         mutex_enter(&connfp->connf_lock);
3015 
3016         next_connp = (connp == NULL) ?
3017             connfp->connf_head : connp->conn_g_next;
3018 
3019         while (next_connp != NULL) {
3020                 mutex_enter(&next_connp->conn_lock);
3021                 if (!(next_connp->conn_flags & conn_flags) ||
3022                     (next_connp->conn_state_flags &
3023                     (CONN_CONDEMNED | CONN_INCIPIENT))) {
3024                         /*
3025                          * This conn has been condemned or
3026                          * is closing, or the flags don't match
3027                          */
3028                         mutex_exit(&next_connp->conn_lock);
3029                         next_connp = next_connp->conn_g_next;
3030                         continue;
3031                 }
3032                 CONN_INC_REF_LOCKED(next_connp);
3033                 mutex_exit(&next_connp->conn_lock);
3034                 break;
3035         }
3036 
3037         mutex_exit(&connfp->connf_lock);
3038 
3039         if (connp != NULL)
3040                 CONN_DEC_REF(connp);
3041 
3042         return (next_connp);
3043 }
3044 
3045 #ifdef CONN_DEBUG
3046 /*
3047  * Trace of the last NBUF refhold/refrele
3048  */
3049 int
3050 conn_trace_ref(conn_t *connp)
3051 {
3052         int     last;
3053         conn_trace_t    *ctb;
3054 
3055         ASSERT(MUTEX_HELD(&connp->conn_lock));
3056         last = connp->conn_trace_last;
3057         last++;
3058         if (last == CONN_TRACE_MAX)
3059                 last = 0;
3060 
3061         ctb = &connp->conn_trace_buf[last];
3062         ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
3063         connp->conn_trace_last = last;
3064         return (1);
3065 }
3066 
3067 int
3068 conn_untrace_ref(conn_t *connp)
3069 {
3070         int     last;
3071         conn_trace_t    *ctb;
3072 
3073         ASSERT(MUTEX_HELD(&connp->conn_lock));
3074         last = connp->conn_trace_last;
3075         last++;
3076         if (last == CONN_TRACE_MAX)
3077                 last = 0;
3078 
3079         ctb = &connp->conn_trace_buf[last];
3080         ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
3081         connp->conn_trace_last = last;
3082         return (1);
3083 }
3084 #endif