1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 
  25 /*
  26  * IP PACKET CLASSIFIER
  27  *
  28  * The IP packet classifier provides mapping between IP packets and persistent
  29  * connection state for connection-oriented protocols. It also provides
  30  * interface for managing connection states.
  31  *
  32  * The connection state is kept in conn_t data structure and contains, among
  33  * other things:
  34  *
  35  *      o local/remote address and ports
  36  *      o Transport protocol
  37  *      o squeue for the connection (for TCP only)
  38  *      o reference counter
  39  *      o Connection state
  40  *      o hash table linkage
  41  *      o interface/ire information
  42  *      o credentials
  43  *      o ipsec policy
  44  *      o send and receive functions.
  45  *      o mutex lock.
  46  *
  47  * Connections use a reference counting scheme. They are freed when the
  48  * reference counter drops to zero. A reference is incremented when connection
  49  * is placed in a list or table, when incoming packet for the connection arrives
  50  * and when connection is processed via squeue (squeue processing may be
  51  * asynchronous and the reference protects the connection from being destroyed
  52  * before its processing is finished).
  53  *
  54  * conn_recv is used to pass up packets to the ULP.
  55  * For TCP conn_recv changes. It is tcp_input_listener_unbound initially for
  56  * a listener, and changes to tcp_input_listener as the listener has picked a
  57  * good squeue. For other cases it is set to tcp_input_data.
  58  *
  59  * conn_recvicmp is used to pass up ICMP errors to the ULP.
  60  *
  61  * Classifier uses several hash tables:
  62  *
  63  *      ipcl_conn_fanout:       contains all TCP connections in CONNECTED state
  64  *      ipcl_bind_fanout:       contains all connections in BOUND state
  65  *      ipcl_proto_fanout:      IPv4 protocol fanout
  66  *      ipcl_proto_fanout_v6:   IPv6 protocol fanout
  67  *      ipcl_udp_fanout:        contains all UDP connections
  68  *      ipcl_iptun_fanout:      contains all IP tunnel connections
  69  *      ipcl_globalhash_fanout: contains all connections
  70  *
  71  * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering)
  72  * which need to view all existing connections.
  73  *
  74  * All tables are protected by per-bucket locks. When both per-bucket lock and
  75  * connection lock need to be held, the per-bucket lock should be acquired
  76  * first, followed by the connection lock.
  77  *
  78  * All functions doing search in one of these tables increment a reference
  79  * counter on the connection found (if any). This reference should be dropped
  80  * when the caller has finished processing the connection.
  81  *
  82  *
  83  * INTERFACES:
  84  * ===========
  85  *
  86  * Connection Lookup:
  87  * ------------------
  88  *
  89  * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, ira, ip_stack)
  90  * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, ira, ip_stack)
  91  *
  92  * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if
  93  * it can't find any associated connection. If the connection is found, its
  94  * reference counter is incremented.
  95  *
  96  *      mp:     mblock, containing packet header. The full header should fit
  97  *              into a single mblock. It should also contain at least full IP
  98  *              and TCP or UDP header.
  99  *
 100  *      protocol: Either IPPROTO_TCP or IPPROTO_UDP.
 101  *
 102  *      hdr_len: The size of IP header. It is used to find TCP or UDP header in
 103  *               the packet.
 104  *
 105  *      ira->ira_zoneid: The zone in which the returned connection must be; the
 106  *              zoneid corresponding to the ire_zoneid on the IRE located for
 107  *              the packet's destination address.
 108  *
 109  *      ira->ira_flags: Contains the IRAF_TX_MAC_EXEMPTABLE and
 110  *              IRAF_TX_SHARED_ADDR flags
 111  *
 112  *      For TCP connections, the lookup order is as follows:
 113  *              5-tuple {src, dst, protocol, local port, remote port}
 114  *                      lookup in ipcl_conn_fanout table.
 115  *              3-tuple {dst, remote port, protocol} lookup in
 116  *                      ipcl_bind_fanout table.
 117  *
 118  *      For UDP connections, a 5-tuple {src, dst, protocol, local port,
 119  *      remote port} lookup is done on ipcl_udp_fanout. Note that,
 *      these interfaces do not handle cases where a packet belongs
 121  *      to multiple UDP clients, which is handled in IP itself.
 122  *
 123  * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must
 124  * determine which actual zone gets the segment.  This is used only in a
 125  * labeled environment.  The matching rules are:
 126  *
 127  *      - If it's not a multilevel port, then the label on the packet selects
 128  *        the zone.  Unlabeled packets are delivered to the global zone.
 129  *
 130  *      - If it's a multilevel port, then only the zone registered to receive
 131  *        packets on that port matches.
 132  *
 133  * Also, in a labeled environment, packet labels need to be checked.  For fully
 134  * bound TCP connections, we can assume that the packet label was checked
 135  * during connection establishment, and doesn't need to be checked on each
 136  * packet.  For others, though, we need to check for strict equality or, for
 137  * multilevel ports, membership in the range or set.  This part currently does
 138  * a tnrh lookup on each packet, but could be optimized to use cached results
 139  * if that were necessary.  (SCTP doesn't come through here, but if it did,
 140  * we would apply the same rules as TCP.)
 141  *
 142  * An implication of the above is that fully-bound TCP sockets must always use
 143  * distinct 4-tuples; they can't be discriminated by label alone.
 144  *
 145  * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
 146  * as there's no connection set-up handshake and no shared state.
 147  *
 148  * Labels on looped-back packets within a single zone do not need to be
 149  * checked, as all processes in the same zone have the same label.
 150  *
 151  * Finally, for unlabeled packets received by a labeled system, special rules
 152  * apply.  We consider only the MLP if there is one.  Otherwise, we prefer a
 153  * socket in the zone whose label matches the default label of the sender, if
 154  * any.  In any event, the receiving socket must have SO_MAC_EXEMPT set and the
 155  * receiver's label must dominate the sender's default label.
 156  *
 157  * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcpha_t *, int, ip_stack);
 158  * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
 159  *                                       ip_stack);
 160  *
 *      Lookup routine to find an exact match for {src, dst, local port,
 *      remote port} for TCP connections in ipcl_conn_fanout. The address and
 163  *      ports are read from the IP and TCP header respectively.
 164  *
 165  * conn_t       *ipcl_lookup_listener_v4(lport, laddr, protocol,
 166  *                                       zoneid, ip_stack);
 167  * conn_t       *ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex,
 168  *                                       zoneid, ip_stack);
 169  *
 170  *      Lookup routine to find a listener with the tuple {lport, laddr,
 171  *      protocol} in the ipcl_bind_fanout table. For IPv6, an additional
 172  *      parameter interface index is also compared.
 173  *
 174  * void ipcl_walk(func, arg, ip_stack)
 175  *
 176  *      Apply 'func' to every connection available. The 'func' is called as
 177  *      (*func)(connp, arg). The walk is non-atomic so connections may be
 178  *      created and destroyed during the walk. The CONN_CONDEMNED and
 179  *      CONN_INCIPIENT flags ensure that connections which are newly created
 180  *      or being destroyed are not selected by the walker.
 181  *
 182  * Table Updates
 183  * -------------
 184  *
 185  * int ipcl_conn_insert(connp);
 186  * int ipcl_conn_insert_v4(connp);
 187  * int ipcl_conn_insert_v6(connp);
 188  *
 189  *      Insert 'connp' in the ipcl_conn_fanout.
 *      Arguments :
 191  *              connp           conn_t to be inserted
 192  *
 193  *      Return value :
 194  *              0               if connp was inserted
 195  *              EADDRINUSE      if the connection with the same tuple
 196  *                              already exists.
 197  *
 198  * int ipcl_bind_insert(connp);
 199  * int ipcl_bind_insert_v4(connp);
 200  * int ipcl_bind_insert_v6(connp);
 201  *
 202  *      Insert 'connp' in ipcl_bind_fanout.
 *      Arguments :
 204  *              connp           conn_t to be inserted
 205  *
 206  *
 207  * void ipcl_hash_remove(connp);
 208  *
 209  *      Removes the 'connp' from the connection fanout table.
 210  *
 211  * Connection Creation/Destruction
 212  * -------------------------------
 213  *
 214  * conn_t *ipcl_conn_create(type, sleep, netstack_t *)
 215  *
 216  *      Creates a new conn based on the type flag, inserts it into
 217  *      globalhash table.
 218  *
 219  *      type:   This flag determines the type of conn_t which needs to be
 220  *              created i.e., which kmem_cache it comes from.
 221  *              IPCL_TCPCONN    indicates a TCP connection
 222  *              IPCL_SCTPCONN   indicates a SCTP connection
 223  *              IPCL_UDPCONN    indicates a UDP conn_t.
 224  *              IPCL_RAWIPCONN  indicates a RAWIP/ICMP conn_t.
 225  *              IPCL_RTSCONN    indicates a RTS conn_t.
 226  *              IPCL_DCCPCONN   indicates a DCCP conn_t.
 227  *              IPCL_IPCCONN    indicates all other connections.
 228  *
 229  * void ipcl_conn_destroy(connp)
 230  *
 231  *      Destroys the connection state, removes it from the global
 232  *      connection hash table and frees its memory.
 233  */
 234 
 235 #include <sys/types.h>
 236 #include <sys/stream.h>
 237 #include <sys/stropts.h>
 238 #include <sys/sysmacros.h>
 239 #include <sys/strsubr.h>
 240 #include <sys/strsun.h>
 241 #define _SUN_TPI_VERSION 2
 242 #include <sys/ddi.h>
 243 #include <sys/cmn_err.h>
 244 #include <sys/debug.h>
 245 
 246 #include <sys/systm.h>
 247 #include <sys/param.h>
 248 #include <sys/kmem.h>
 249 #include <sys/isa_defs.h>
 250 #include <inet/common.h>
 251 #include <netinet/ip6.h>
 252 #include <netinet/icmp6.h>
 253 
 254 #include <inet/ip.h>
 255 #include <inet/ip_if.h>
 256 #include <inet/ip_ire.h>
 257 #include <inet/ip6.h>
 258 #include <inet/ip_ndp.h>
 259 #include <inet/ip_impl.h>
 260 #include <inet/udp_impl.h>
 261 #include <inet/dccp/dccp_impl.h>
 262 #include <inet/sctp_ip.h>
 263 #include <inet/sctp/sctp_impl.h>
 264 #include <inet/rawip_impl.h>
 265 #include <inet/rts_impl.h>
 266 #include <inet/iptun/iptun_impl.h>
 267 
 268 #include <sys/cpuvar.h>
 269 
 270 #include <inet/ipclassifier.h>
 271 #include <inet/tcp.h>
 272 #include <inet/ipsec_impl.h>
 273 
 274 #include <sys/tsol/tnet.h>
 275 #include <sys/sockio.h>
 276 
 277 /* Old value for compatibility. Setable in /etc/system */
 278 uint_t tcp_conn_hash_size = 0;
 279 
 280 /* New value. Zero means choose automatically.  Setable in /etc/system */
 281 uint_t ipcl_conn_hash_size = 0;
 282 uint_t ipcl_conn_hash_memfactor = 8192;
 283 uint_t ipcl_conn_hash_maxsize = 82500;
 284 
 285 /* bind/dccp/udp fanout table size */
 286 uint_t ipcl_bind_fanout_size = 512;
 287 uint_t ipcl_dccp_fanout_size = 512;
 288 uint_t ipcl_udp_fanout_size = 16384;
 289 
 290 /* Raw socket fanout size.  Must be a power of 2. */
 291 uint_t ipcl_raw_fanout_size = 256;
 292 
 293 /*
 294  * The IPCL_IPTUN_HASH() function works best with a prime table size.  We
 295  * expect that most large deployments would have hundreds of tunnels, and
 296  * thousands in the extreme case.
 297  */
 298 uint_t ipcl_iptun_fanout_size = 6143;
 299 
 300 /*
 301  * Power of 2^N Primes useful for hashing for N of 0-28,
 302  * these primes are the nearest prime <= 2^N - 2^(N-2).
 303  */
 304 
 305 #define P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067,  \
 306                 6143, 12281, 24571, 49139, 98299, 196597, 393209,       \
 307                 786431, 1572853, 3145721, 6291449, 12582893, 25165813,  \
 308                 50331599, 100663291, 201326557, 0}
 309 
 310 /*
 311  * wrapper structure to ensure that conn and what follows it (tcp_t, etc)
 312  * are aligned on cache lines.
 313  */
 314 typedef union itc_s {
 315         conn_t  itc_conn;
 316         char    itcu_filler[CACHE_ALIGN(conn_s)];
 317 } itc_t;
 318 
 319 struct kmem_cache  *tcp_conn_cache;
 320 struct kmem_cache  *ip_conn_cache;
 321 extern struct kmem_cache  *sctp_conn_cache;
 322 struct kmem_cache  *udp_conn_cache;
 323 struct kmem_cache  *rawip_conn_cache;
 324 struct kmem_cache  *rts_conn_cache;
 325 struct kmem_cache  *dccp_conn_cache;
 326 
 327 extern void     tcp_timermp_free(tcp_t *);
 328 extern mblk_t   *tcp_timermp_alloc(int);
 329 
 330 static int      ip_conn_constructor(void *, void *, int);
 331 static void     ip_conn_destructor(void *, void *);
 332 
 333 static int      tcp_conn_constructor(void *, void *, int);
 334 static void     tcp_conn_destructor(void *, void *);
 335 
 336 static int      udp_conn_constructor(void *, void *, int);
 337 static void     udp_conn_destructor(void *, void *);
 338 
 339 static int      rawip_conn_constructor(void *, void *, int);
 340 static void     rawip_conn_destructor(void *, void *);
 341 
 342 static int      rts_conn_constructor(void *, void *, int);
 343 static void     rts_conn_destructor(void *, void *);
 344 
 345 static int      dccp_conn_constructor(void *, void *, int);
 346 static void     dccp_conn_destructor(void *, void *);
 347 
 348 /*
 349  * Global (for all stack instances) init routine
 350  */
 351 void
 352 ipcl_g_init(void)
 353 {
 354         ip_conn_cache = kmem_cache_create("ip_conn_cache",
 355             sizeof (conn_t), CACHE_ALIGN_SIZE,
 356             ip_conn_constructor, ip_conn_destructor,
 357             NULL, NULL, NULL, 0);
 358 
 359         tcp_conn_cache = kmem_cache_create("tcp_conn_cache",
 360             sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE,
 361             tcp_conn_constructor, tcp_conn_destructor,
 362             tcp_conn_reclaim, NULL, NULL, 0);
 363 
 364         udp_conn_cache = kmem_cache_create("udp_conn_cache",
 365             sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE,
 366             udp_conn_constructor, udp_conn_destructor,
 367             NULL, NULL, NULL, 0);
 368 
 369         rawip_conn_cache = kmem_cache_create("rawip_conn_cache",
 370             sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE,
 371             rawip_conn_constructor, rawip_conn_destructor,
 372             NULL, NULL, NULL, 0);
 373 
 374         rts_conn_cache = kmem_cache_create("rts_conn_cache",
 375             sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE,
 376             rts_conn_constructor, rts_conn_destructor,
 377             NULL, NULL, NULL, 0);
 378 
 379         /* XXX:DCCP reclaim */
 380         dccp_conn_cache = kmem_cache_create("dccp_conn_cache",
 381             sizeof (itc_t) + sizeof (dccp_t), CACHE_ALIGN_SIZE,
 382             dccp_conn_constructor, dccp_conn_destructor,
 383             NULL, NULL, NULL, 0);
 384 }
 385 
 386 /*
 387  * ipclassifier intialization routine, sets up hash tables.
 388  */
 389 void
 390 ipcl_init(ip_stack_t *ipst)
 391 {
 392         int i;
 393         int sizes[] = P2Ps();
 394 
 395         /*
 396          * Calculate size of conn fanout table from /etc/system settings
 397          */
 398         if (ipcl_conn_hash_size != 0) {
 399                 ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size;
 400         } else if (tcp_conn_hash_size != 0) {
 401                 ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size;
 402         } else {
 403                 extern pgcnt_t freemem;
 404 
 405                 ipst->ips_ipcl_conn_fanout_size =
 406                     (freemem * PAGESIZE) / ipcl_conn_hash_memfactor;
 407 
 408                 if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) {
 409                         ipst->ips_ipcl_conn_fanout_size =
 410                             ipcl_conn_hash_maxsize;
 411                 }
 412         }
 413 
 414         for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) {
 415                 if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) {
 416                         break;
 417                 }
 418         }
 419         if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) {
 420                 /* Out of range, use the 2^16 value */
 421                 ipst->ips_ipcl_conn_fanout_size = sizes[16];
 422         }
 423 
 424         /* Take values from /etc/system */
 425         ipst->ips_ipcl_bind_fanout_size = ipcl_bind_fanout_size;
 426         ipst->ips_ipcl_dccp_fanout_size = ipcl_dccp_fanout_size;
 427         ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size;
 428         ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size;
 429         ipst->ips_ipcl_iptun_fanout_size = ipcl_iptun_fanout_size;
 430 
 431         ASSERT(ipst->ips_ipcl_conn_fanout == NULL);
 432 
 433         ipst->ips_ipcl_conn_fanout = kmem_zalloc(
 434             ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP);
 435 
 436         for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
 437                 mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL,
 438                     MUTEX_DEFAULT, NULL);
 439         }
 440 
 441         ipst->ips_ipcl_bind_fanout = kmem_zalloc(
 442             ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP);
 443 
 444         for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
 445                 mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL,
 446                     MUTEX_DEFAULT, NULL);
 447         }
 448 
 449         ipst->ips_ipcl_proto_fanout_v4 = kmem_zalloc(IPPROTO_MAX *
 450             sizeof (connf_t), KM_SLEEP);
 451         for (i = 0; i < IPPROTO_MAX; i++) {
 452                 mutex_init(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock, NULL,
 453                     MUTEX_DEFAULT, NULL);
 454         }
 455 
 456         ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX *
 457             sizeof (connf_t), KM_SLEEP);
 458         for (i = 0; i < IPPROTO_MAX; i++) {
 459                 mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL,
 460                     MUTEX_DEFAULT, NULL);
 461         }
 462 
 463         ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP);
 464         mutex_init(&ipst->ips_rts_clients->connf_lock,
 465             NULL, MUTEX_DEFAULT, NULL);
 466 
 467         ipst->ips_ipcl_udp_fanout = kmem_zalloc(
 468             ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP);
 469         for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
 470                 mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL,
 471                     MUTEX_DEFAULT, NULL);
 472         }
 473 
 474         ipst->ips_ipcl_iptun_fanout = kmem_zalloc(
 475             ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t), KM_SLEEP);
 476         for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
 477                 mutex_init(&ipst->ips_ipcl_iptun_fanout[i].connf_lock, NULL,
 478                     MUTEX_DEFAULT, NULL);
 479         }
 480 
 481         ipst->ips_ipcl_raw_fanout = kmem_zalloc(
 482             ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP);
 483         for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
 484                 mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL,
 485                     MUTEX_DEFAULT, NULL);
 486         }
 487 
 488         ipst->ips_ipcl_globalhash_fanout = kmem_zalloc(
 489             sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP);
 490         for (i = 0; i < CONN_G_HASH_SIZE; i++) {
 491                 mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock,
 492                     NULL, MUTEX_DEFAULT, NULL);
 493         }
 494 
 495         ipst->ips_ipcl_dccp_fanout = kmem_zalloc(
 496             ipst->ips_ipcl_dccp_fanout_size * sizeof (connf_t), KM_SLEEP);
 497         for (i = 0; i < ipst->ips_ipcl_dccp_fanout_size; i++) {
 498                 mutex_init(&ipst->ips_ipcl_dccp_fanout[i].connf_lock, NULL,
 499                     MUTEX_DEFAULT, NULL);
 500         }
 501 }
 502 
 503 void
 504 ipcl_g_destroy(void)
 505 {
 506         kmem_cache_destroy(ip_conn_cache);
 507         kmem_cache_destroy(tcp_conn_cache);
 508         kmem_cache_destroy(udp_conn_cache);
 509         kmem_cache_destroy(rawip_conn_cache);
 510         kmem_cache_destroy(rts_conn_cache);
 511         kmem_cache_destroy(dccp_conn_cache);
 512 }
 513 
 514 /*
 515  * All user-level and kernel use of the stack must be gone
 516  * by now.
 517  */
 518 void
 519 ipcl_destroy(ip_stack_t *ipst)
 520 {
 521         int i;
 522 
 523         for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
 524                 ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL);
 525                 mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock);
 526         }
 527         kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size *
 528             sizeof (connf_t));
 529         ipst->ips_ipcl_conn_fanout = NULL;
 530 
 531         for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
 532                 ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL);
 533                 mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock);
 534         }
 535         kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size *
 536             sizeof (connf_t));
 537         ipst->ips_ipcl_bind_fanout = NULL;
 538 
 539         for (i = 0; i < IPPROTO_MAX; i++) {
 540                 ASSERT(ipst->ips_ipcl_proto_fanout_v4[i].connf_head == NULL);
 541                 mutex_destroy(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock);
 542         }
 543         kmem_free(ipst->ips_ipcl_proto_fanout_v4,
 544             IPPROTO_MAX * sizeof (connf_t));
 545         ipst->ips_ipcl_proto_fanout_v4 = NULL;
 546 
 547         for (i = 0; i < IPPROTO_MAX; i++) {
 548                 ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head == NULL);
 549                 mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock);
 550         }
 551         kmem_free(ipst->ips_ipcl_proto_fanout_v6,
 552             IPPROTO_MAX * sizeof (connf_t));
 553         ipst->ips_ipcl_proto_fanout_v6 = NULL;
 554 
 555         for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
 556                 ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL);
 557                 mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock);
 558         }
 559         kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size *
 560             sizeof (connf_t));
 561         ipst->ips_ipcl_udp_fanout = NULL;
 562 
 563         for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
 564                 ASSERT(ipst->ips_ipcl_iptun_fanout[i].connf_head == NULL);
 565                 mutex_destroy(&ipst->ips_ipcl_iptun_fanout[i].connf_lock);
 566         }
 567         kmem_free(ipst->ips_ipcl_iptun_fanout,
 568             ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t));
 569         ipst->ips_ipcl_iptun_fanout = NULL;
 570 
 571         for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
 572                 ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL);
 573                 mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock);
 574         }
 575         kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size *
 576             sizeof (connf_t));
 577         ipst->ips_ipcl_raw_fanout = NULL;
 578 
 579         for (i = 0; i < CONN_G_HASH_SIZE; i++) {
 580                 ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL);
 581                 mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
 582         }
 583         kmem_free(ipst->ips_ipcl_globalhash_fanout,
 584             sizeof (connf_t) * CONN_G_HASH_SIZE);
 585         ipst->ips_ipcl_globalhash_fanout = NULL;
 586 
 587         for (i = 0; i < ipst->ips_ipcl_dccp_fanout_size; i++) {
 588                 ASSERT(ipst->ips_ipcl_dccp_fanout[i].connf_head == NULL);
 589                 mutex_destroy(&ipst->ips_ipcl_dccp_fanout[i].connf_lock);
 590         }
 591         kmem_free(ipst->ips_ipcl_dccp_fanout, ipst->ips_ipcl_dccp_fanout_size *
 592             sizeof (connf_t));
 593         ipst->ips_ipcl_dccp_fanout = NULL;
 594 
 595         ASSERT(ipst->ips_rts_clients->connf_head == NULL);
 596         mutex_destroy(&ipst->ips_rts_clients->connf_lock);
 597         kmem_free(ipst->ips_rts_clients, sizeof (connf_t));
 598         ipst->ips_rts_clients = NULL;
 599 }
 600 
 601 /*
 602  * conn creation routine. initialize the conn, sets the reference
 603  * and inserts it in the global hash table.
 604  */
 605 conn_t *
 606 ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns)
 607 {
 608         conn_t  *connp;
 609         struct kmem_cache *conn_cache;
 610 
 611         switch (type) {
 612         case IPCL_SCTPCONN:
 613                 if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL)
 614                         return (NULL);
 615                 sctp_conn_init(connp);
 616                 netstack_hold(ns);
 617                 connp->conn_netstack = ns;
 618                 connp->conn_ixa->ixa_ipst = ns->netstack_ip;
 619                 connp->conn_ixa->ixa_conn_id = (long)connp;
 620                 ipcl_globalhash_insert(connp);
 621                 return (connp);
 622 
 623         case IPCL_TCPCONN:
 624                 conn_cache = tcp_conn_cache;
 625                 break;
 626 
 627         case IPCL_UDPCONN:
 628                 conn_cache = udp_conn_cache;
 629                 break;
 630 
 631         case IPCL_RAWIPCONN:
 632                 conn_cache = rawip_conn_cache;
 633                 break;
 634 
 635         case IPCL_RTSCONN:
 636                 conn_cache = rts_conn_cache;
 637                 break;
 638 
 639         case IPCL_IPCCONN:
 640                 conn_cache = ip_conn_cache;
 641                 break;
 642 
 643         case IPCL_DCCPCONN:
 644                 conn_cache = dccp_conn_cache;
 645                 break;
 646 
 647         default:
 648                 connp = NULL;
 649                 ASSERT(0);
 650         }
 651 
 652         if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL)
 653                 return (NULL);
 654 
 655         connp->conn_ref = 1;
 656         netstack_hold(ns);
 657         connp->conn_netstack = ns;
 658         connp->conn_ixa->ixa_ipst = ns->netstack_ip;
 659         connp->conn_ixa->ixa_conn_id = (long)connp;
 660         ipcl_globalhash_insert(connp);
 661         return (connp);
 662 }
 663 
 664 void
 665 ipcl_conn_destroy(conn_t *connp)
 666 {
 667         mblk_t  *mp;
 668         netstack_t      *ns = connp->conn_netstack;
 669 
 670         ASSERT(!MUTEX_HELD(&connp->conn_lock));
 671         ASSERT(connp->conn_ref == 0);
 672         ASSERT(connp->conn_ioctlref == 0);
 673 
 674         DTRACE_PROBE1(conn__destroy, conn_t *, connp);
 675 
 676         if (connp->conn_cred != NULL) {
 677                 crfree(connp->conn_cred);
 678                 connp->conn_cred = NULL;
 679                 /* ixa_cred done in ipcl_conn_cleanup below */
 680         }
 681 
 682         if (connp->conn_ht_iphc != NULL) {
 683                 kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated);
 684                 connp->conn_ht_iphc = NULL;
 685                 connp->conn_ht_iphc_allocated = 0;
 686                 connp->conn_ht_iphc_len = 0;
 687                 connp->conn_ht_ulp = NULL;
 688                 connp->conn_ht_ulp_len = 0;
 689         }
 690         ip_pkt_free(&connp->conn_xmit_ipp);
 691 
 692         ipcl_globalhash_remove(connp);
 693 
 694         if (connp->conn_latch != NULL) {
 695                 IPLATCH_REFRELE(connp->conn_latch);
 696                 connp->conn_latch = NULL;
 697         }
 698         if (connp->conn_latch_in_policy != NULL) {
 699                 IPPOL_REFRELE(connp->conn_latch_in_policy);
 700                 connp->conn_latch_in_policy = NULL;
 701         }
 702         if (connp->conn_latch_in_action != NULL) {
 703                 IPACT_REFRELE(connp->conn_latch_in_action);
 704                 connp->conn_latch_in_action = NULL;
 705         }
 706         if (connp->conn_policy != NULL) {
 707                 IPPH_REFRELE(connp->conn_policy, ns);
 708                 connp->conn_policy = NULL;
 709         }
 710 
 711         if (connp->conn_ipsec_opt_mp != NULL) {
 712                 freemsg(connp->conn_ipsec_opt_mp);
 713                 connp->conn_ipsec_opt_mp = NULL;
 714         }
 715 
 716         if (connp->conn_flags & IPCL_TCPCONN) {
 717                 tcp_t *tcp = connp->conn_tcp;
 718 
 719                 tcp_free(tcp);
 720                 mp = tcp->tcp_timercache;
 721 
 722                 tcp->tcp_tcps = NULL;
 723 
 724                 /*
 725                  * tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate
 726                  * the mblk.
 727                  */
 728                 if (tcp->tcp_rsrv_mp != NULL) {
 729                         freeb(tcp->tcp_rsrv_mp);
 730                         tcp->tcp_rsrv_mp = NULL;
 731                         mutex_destroy(&tcp->tcp_rsrv_mp_lock);
 732                 }
 733 
 734                 ipcl_conn_cleanup(connp);
 735                 connp->conn_flags = IPCL_TCPCONN;
 736                 if (ns != NULL) {
 737                         ASSERT(tcp->tcp_tcps == NULL);
 738                         connp->conn_netstack = NULL;
 739                         connp->conn_ixa->ixa_ipst = NULL;
 740                         netstack_rele(ns);
 741                 }
 742 
 743                 bzero(tcp, sizeof (tcp_t));
 744 
 745                 tcp->tcp_timercache = mp;
 746                 tcp->tcp_connp = connp;
 747                 kmem_cache_free(tcp_conn_cache, connp);
 748                 return;
 749         }
 750 
 751         if (connp->conn_flags & IPCL_SCTPCONN) {
 752                 ASSERT(ns != NULL);
 753                 sctp_free(connp);
 754                 return;
 755         }
 756 
 757         if (connp->conn_flags & IPCL_DCCPCONN) {
 758                 dccp_t  *dccp = connp->conn_dccp;
 759 
 760                 cmn_err(CE_NOTE, "ipclassifier: conn_flags DCCP cache_free");
 761 
 762                 /* XXX:DCCP */
 763                 /* Crash bug here: udp_conn_cache and dccp_conn_cache */
 764 /*
 765                 ipcl_conn_cleanup(connp);
 766                 connp->conn_flags = IPCL_DCCPCONN;
 767                 bzero(dccp, sizeof (dccp_t));
 768                 dccp->dccp_connp = connp;
 769                 kmem_cache_free(dccp_conn_cache, connp);
 770                 return;
 771 */
 772         }
 773 
 774         ipcl_conn_cleanup(connp);
 775         if (ns != NULL) {
 776                 connp->conn_netstack = NULL;
 777                 connp->conn_ixa->ixa_ipst = NULL;
 778                 netstack_rele(ns);
 779         }
 780 
 781         /* leave conn_priv aka conn_udp, conn_icmp, etc in place. */
 782         if (connp->conn_flags & IPCL_UDPCONN) {
 783                 connp->conn_flags = IPCL_UDPCONN;
 784                 kmem_cache_free(udp_conn_cache, connp);
 785         } else if (connp->conn_flags & IPCL_RAWIPCONN) {
 786                 connp->conn_flags = IPCL_RAWIPCONN;
 787                 connp->conn_proto = IPPROTO_ICMP;
 788                 connp->conn_ixa->ixa_protocol = connp->conn_proto;
 789                 kmem_cache_free(rawip_conn_cache, connp);
 790         } else if (connp->conn_flags & IPCL_RTSCONN) {
 791                 connp->conn_flags = IPCL_RTSCONN;
 792                 kmem_cache_free(rts_conn_cache, connp);
 793         } else {
 794                 connp->conn_flags = IPCL_IPCCONN;
 795                 ASSERT(connp->conn_flags & IPCL_IPCCONN);
 796                 ASSERT(connp->conn_priv == NULL);
 797                 kmem_cache_free(ip_conn_cache, connp);
 798         }
 799 }
 800 
 801 /*
 802  * Running in cluster mode - deregister listener information
 803  */
 804 static void
 805 ipcl_conn_unlisten(conn_t *connp)
 806 {
 807         ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0);
 808         ASSERT(connp->conn_lport != 0);
 809 
 810         if (cl_inet_unlisten != NULL) {
 811                 sa_family_t     addr_family;
 812                 uint8_t         *laddrp;
 813 
 814                 if (connp->conn_ipversion == IPV6_VERSION) {
 815                         addr_family = AF_INET6;
 816                         laddrp = (uint8_t *)&connp->conn_bound_addr_v6;
 817                 } else {
 818                         addr_family = AF_INET;
 819                         laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
 820                 }
 821                 (*cl_inet_unlisten)(connp->conn_netstack->netstack_stackid,
 822                     IPPROTO_TCP, addr_family, laddrp, connp->conn_lport, NULL);
 823         }
 824         connp->conn_flags &= ~IPCL_CL_LISTENER;
 825 }
 826 
 827 /*
 828  * We set the IPCL_REMOVED flag (instead of clearing the flag indicating
 829  * which table the conn belonged to). So for debugging we can see which hash
 830  * table this connection was in.
 831  */
 832 #define IPCL_HASH_REMOVE(connp) {                                       \
 833         connf_t *connfp = (connp)->conn_fanout;                              \
 834         ASSERT(!MUTEX_HELD(&((connp)->conn_lock)));                      \
 835         if (connfp != NULL) {                                           \
 836                 mutex_enter(&connfp->connf_lock);                        \
 837                 if ((connp)->conn_next != NULL)                              \
 838                         (connp)->conn_next->conn_prev =                   \
 839                             (connp)->conn_prev;                              \
 840                 if ((connp)->conn_prev != NULL)                              \
 841                         (connp)->conn_prev->conn_next =                   \
 842                             (connp)->conn_next;                              \
 843                 else                                                    \
 844                         connfp->connf_head = (connp)->conn_next;  \
 845                 (connp)->conn_fanout = NULL;                         \
 846                 (connp)->conn_next = NULL;                           \
 847                 (connp)->conn_prev = NULL;                           \
 848                 (connp)->conn_flags |= IPCL_REMOVED;                 \
 849                 if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0)       \
 850                         ipcl_conn_unlisten((connp));                    \
 851                 CONN_DEC_REF((connp));                                  \
 852                 mutex_exit(&connfp->connf_lock);                 \
 853         }                                                               \
 854 }
 855 
 856 void
 857 ipcl_hash_remove(conn_t *connp)
 858 {
 859         uint8_t         protocol = connp->conn_proto;
 860 
 861         IPCL_HASH_REMOVE(connp);
 862         if (protocol == IPPROTO_RSVP)
 863                 ill_set_inputfn_all(connp->conn_netstack->netstack_ip);
 864 }
 865 
 866 /*
 867  * The whole purpose of this function is allow removal of
 868  * a conn_t from the connected hash for timewait reclaim.
 869  * This is essentially a TW reclaim fastpath where timewait
 870  * collector checks under fanout lock (so no one else can
 871  * get access to the conn_t) that refcnt is 2 i.e. one for
 872  * TCP and one for the classifier hash list. If ref count
 873  * is indeed 2, we can just remove the conn under lock and
 874  * avoid cleaning up the conn under squeue. This gives us
 875  * improved performance.
 876  */
 877 void
 878 ipcl_hash_remove_locked(conn_t *connp, connf_t  *connfp)
 879 {
 880         ASSERT(MUTEX_HELD(&connfp->connf_lock));
 881         ASSERT(MUTEX_HELD(&connp->conn_lock));
 882         ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0);
 883 
 884         if ((connp)->conn_next != NULL) {
 885                 (connp)->conn_next->conn_prev = (connp)->conn_prev;
 886         }
 887         if ((connp)->conn_prev != NULL) {
 888                 (connp)->conn_prev->conn_next = (connp)->conn_next;
 889         } else {
 890                 connfp->connf_head = (connp)->conn_next;
 891         }
 892         (connp)->conn_fanout = NULL;
 893         (connp)->conn_next = NULL;
 894         (connp)->conn_prev = NULL;
 895         (connp)->conn_flags |= IPCL_REMOVED;
 896         ASSERT((connp)->conn_ref == 2);
 897         (connp)->conn_ref--;
 898 }
 899 
 900 #define IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) {              \
 901         ASSERT((connp)->conn_fanout == NULL);                                \
 902         ASSERT((connp)->conn_next == NULL);                          \
 903         ASSERT((connp)->conn_prev == NULL);                          \
 904         if ((connfp)->connf_head != NULL) {                          \
 905                 (connfp)->connf_head->conn_prev = (connp);                \
 906                 (connp)->conn_next = (connfp)->connf_head;                \
 907         }                                                               \
 908         (connp)->conn_fanout = (connfp);                             \
 909         (connfp)->connf_head = (connp);                                      \
 910         (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \
 911             IPCL_CONNECTED;                                             \
 912         CONN_INC_REF(connp);                                            \
 913 }
 914 
 915 #define IPCL_HASH_INSERT_CONNECTED(connfp, connp) {                     \
 916         IPCL_HASH_REMOVE((connp));                                      \
 917         mutex_enter(&(connfp)->connf_lock);                              \
 918         IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);               \
 919         mutex_exit(&(connfp)->connf_lock);                               \
 920 }
 921 
 922 #define IPCL_HASH_INSERT_BOUND(connfp, connp) {                         \
 923         conn_t *pconnp = NULL, *nconnp;                                 \
 924         IPCL_HASH_REMOVE((connp));                                      \
 925         mutex_enter(&(connfp)->connf_lock);                              \
 926         nconnp = (connfp)->connf_head;                                       \
 927         while (nconnp != NULL &&                                        \
 928             !_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6)) {            \
 929                 pconnp = nconnp;                                        \
 930                 nconnp = nconnp->conn_next;                          \
 931         }                                                               \
 932         if (pconnp != NULL) {                                           \
 933                 pconnp->conn_next = (connp);                         \
 934                 (connp)->conn_prev = pconnp;                         \
 935         } else {                                                        \
 936                 (connfp)->connf_head = (connp);                              \
 937         }                                                               \
 938         if (nconnp != NULL) {                                           \
 939                 (connp)->conn_next = nconnp;                         \
 940                 nconnp->conn_prev = (connp);                         \
 941         }                                                               \
 942         (connp)->conn_fanout = (connfp);                             \
 943         (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \
 944             IPCL_BOUND;                                                 \
 945         CONN_INC_REF(connp);                                            \
 946         mutex_exit(&(connfp)->connf_lock);                               \
 947 }
 948 
 949 #define IPCL_HASH_INSERT_WILDCARD(connfp, connp) {                      \
 950         conn_t **list, *prev, *next;                                    \
 951         boolean_t isv4mapped =                                          \
 952             IN6_IS_ADDR_V4MAPPED(&(connp)->conn_laddr_v6);               \
 953         IPCL_HASH_REMOVE((connp));                                      \
 954         mutex_enter(&(connfp)->connf_lock);                              \
 955         list = &(connfp)->connf_head;                                    \
 956         prev = NULL;                                                    \
 957         while ((next = *list) != NULL) {                                \
 958                 if (isv4mapped &&                                       \
 959                     IN6_IS_ADDR_UNSPECIFIED(&next->conn_laddr_v6) &&     \
 960                     connp->conn_zoneid == next->conn_zoneid) {            \
 961                         (connp)->conn_next = next;                   \
 962                         if (prev != NULL)                               \
 963                                 prev = next->conn_prev;                      \
 964                         next->conn_prev = (connp);                   \
 965                         break;                                          \
 966                 }                                                       \
 967                 list = &next->conn_next;                         \
 968                 prev = next;                                            \
 969         }                                                               \
 970         (connp)->conn_prev = prev;                                   \
 971         *list = (connp);                                                \
 972         (connp)->conn_fanout = (connfp);                             \
 973         (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \
 974             IPCL_BOUND;                                                 \
 975         CONN_INC_REF((connp));                                          \
 976         mutex_exit(&(connfp)->connf_lock);                               \
 977 }
 978 
/*
 * Function wrapper around the IPCL_HASH_INSERT_WILDCARD macro: inserts
 * connp into the bucket connfp as a wildcard entry.
 */
void
ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
{
        IPCL_HASH_INSERT_WILDCARD(connfp, connp);
}
 984 
 985 /*
 986  * Because the classifier is used to classify inbound packets, the destination
 987  * address is meant to be our local tunnel address (tunnel source), and the
 988  * source the remote tunnel address (tunnel destination).
 989  *
 990  * Note that conn_proto can't be used for fanout since the upper protocol
 991  * can be both 41 and 4 when IPv6 and IPv4 are over the same tunnel.
 992  */
 993 conn_t *
 994 ipcl_iptun_classify_v4(ipaddr_t *src, ipaddr_t *dst, ip_stack_t *ipst)
 995 {
 996         connf_t *connfp;
 997         conn_t  *connp;
 998 
 999         /* first look for IPv4 tunnel links */
1000         connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst, *src)];
1001         mutex_enter(&connfp->connf_lock);
1002         for (connp = connfp->connf_head; connp != NULL;
1003             connp = connp->conn_next) {
1004                 if (IPCL_IPTUN_MATCH(connp, *dst, *src))
1005                         break;
1006         }
1007         if (connp != NULL)
1008                 goto done;
1009 
1010         mutex_exit(&connfp->connf_lock);
1011 
1012         /* We didn't find an IPv4 tunnel, try a 6to4 tunnel */
1013         connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst,
1014             INADDR_ANY)];
1015         mutex_enter(&connfp->connf_lock);
1016         for (connp = connfp->connf_head; connp != NULL;
1017             connp = connp->conn_next) {
1018                 if (IPCL_IPTUN_MATCH(connp, *dst, INADDR_ANY))
1019                         break;
1020         }
1021 done:
1022         if (connp != NULL)
1023                 CONN_INC_REF(connp);
1024         mutex_exit(&connfp->connf_lock);
1025         return (connp);
1026 }
1027 
1028 conn_t *
1029 ipcl_iptun_classify_v6(in6_addr_t *src, in6_addr_t *dst, ip_stack_t *ipst)
1030 {
1031         connf_t *connfp;
1032         conn_t  *connp;
1033 
1034         /* Look for an IPv6 tunnel link */
1035         connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(dst, src)];
1036         mutex_enter(&connfp->connf_lock);
1037         for (connp = connfp->connf_head; connp != NULL;
1038             connp = connp->conn_next) {
1039                 if (IPCL_IPTUN_MATCH_V6(connp, dst, src)) {
1040                         CONN_INC_REF(connp);
1041                         break;
1042                 }
1043         }
1044         mutex_exit(&connfp->connf_lock);
1045         return (connp);
1046 }
1047 
1048 /*
1049  * This function is used only for inserting SCTP raw socket now.
1050  * This may change later.
1051  *
1052  * Note that only one raw socket can be bound to a port.  The param
1053  * lport is in network byte order.
1054  */
1055 static int
1056 ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
1057 {
1058         connf_t *connfp;
1059         conn_t  *oconnp;
1060         ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
1061 
1062         connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
1063 
1064         /* Check for existing raw socket already bound to the port. */
1065         mutex_enter(&connfp->connf_lock);
1066         for (oconnp = connfp->connf_head; oconnp != NULL;
1067             oconnp = oconnp->conn_next) {
1068                 if (oconnp->conn_lport == lport &&
1069                     oconnp->conn_zoneid == connp->conn_zoneid &&
1070                     oconnp->conn_family == connp->conn_family &&
1071                     ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
1072                     IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_laddr_v6) ||
1073                     IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6) ||
1074                     IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_laddr_v6)) ||
1075                     IN6_ARE_ADDR_EQUAL(&oconnp->conn_laddr_v6,
1076                     &connp->conn_laddr_v6))) {
1077                         break;
1078                 }
1079         }
1080         mutex_exit(&connfp->connf_lock);
1081         if (oconnp != NULL)
1082                 return (EADDRNOTAVAIL);
1083 
1084         if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) ||
1085             IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1086                 if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
1087                     IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) {
1088                         IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1089                 } else {
1090                         IPCL_HASH_INSERT_BOUND(connfp, connp);
1091                 }
1092         } else {
1093                 IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1094         }
1095         return (0);
1096 }
1097 
1098 static int
1099 ipcl_iptun_hash_insert(conn_t *connp, ip_stack_t *ipst)
1100 {
1101         connf_t *connfp;
1102         conn_t  *tconnp;
1103         ipaddr_t laddr = connp->conn_laddr_v4;
1104         ipaddr_t faddr = connp->conn_faddr_v4;
1105 
1106         connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(laddr, faddr)];
1107         mutex_enter(&connfp->connf_lock);
1108         for (tconnp = connfp->connf_head; tconnp != NULL;
1109             tconnp = tconnp->conn_next) {
1110                 if (IPCL_IPTUN_MATCH(tconnp, laddr, faddr)) {
1111                         /* A tunnel is already bound to these addresses. */
1112                         mutex_exit(&connfp->connf_lock);
1113                         return (EADDRINUSE);
1114                 }
1115         }
1116         IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1117         mutex_exit(&connfp->connf_lock);
1118         return (0);
1119 }
1120 
1121 static int
1122 ipcl_iptun_hash_insert_v6(conn_t *connp, ip_stack_t *ipst)
1123 {
1124         connf_t *connfp;
1125         conn_t  *tconnp;
1126         in6_addr_t *laddr = &connp->conn_laddr_v6;
1127         in6_addr_t *faddr = &connp->conn_faddr_v6;
1128 
1129         connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(laddr, faddr)];
1130         mutex_enter(&connfp->connf_lock);
1131         for (tconnp = connfp->connf_head; tconnp != NULL;
1132             tconnp = tconnp->conn_next) {
1133                 if (IPCL_IPTUN_MATCH_V6(tconnp, laddr, faddr)) {
1134                         /* A tunnel is already bound to these addresses. */
1135                         mutex_exit(&connfp->connf_lock);
1136                         return (EADDRINUSE);
1137                 }
1138         }
1139         IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1140         mutex_exit(&connfp->connf_lock);
1141         return (0);
1142 }
1143 
1144 /*
1145  * Check for a MAC exemption conflict on a labeled system.  Note that for
1146  * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the
1147  * transport layer.  This check is for binding all other protocols.
1148  *
1149  * Returns true if there's a conflict.
1150  */
1151 static boolean_t
1152 check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst)
1153 {
1154         connf_t *connfp;
1155         conn_t *tconn;
1156 
1157         connfp = &ipst->ips_ipcl_proto_fanout_v4[connp->conn_proto];
1158         mutex_enter(&connfp->connf_lock);
1159         for (tconn = connfp->connf_head; tconn != NULL;
1160             tconn = tconn->conn_next) {
1161                 /* We don't allow v4 fallback for v6 raw socket */
1162                 if (connp->conn_family != tconn->conn_family)
1163                         continue;
1164                 /* If neither is exempt, then there's no conflict */
1165                 if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
1166                     (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
1167                         continue;
1168                 /* We are only concerned about sockets for a different zone */
1169                 if (connp->conn_zoneid == tconn->conn_zoneid)
1170                         continue;
1171                 /* If both are bound to different specific addrs, ok */
1172                 if (connp->conn_laddr_v4 != INADDR_ANY &&
1173                     tconn->conn_laddr_v4 != INADDR_ANY &&
1174                     connp->conn_laddr_v4 != tconn->conn_laddr_v4)
1175                         continue;
1176                 /* These two conflict; fail */
1177                 break;
1178         }
1179         mutex_exit(&connfp->connf_lock);
1180         return (tconn != NULL);
1181 }
1182 
1183 static boolean_t
1184 check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst)
1185 {
1186         connf_t *connfp;
1187         conn_t *tconn;
1188 
1189         connfp = &ipst->ips_ipcl_proto_fanout_v6[connp->conn_proto];
1190         mutex_enter(&connfp->connf_lock);
1191         for (tconn = connfp->connf_head; tconn != NULL;
1192             tconn = tconn->conn_next) {
1193                 /* We don't allow v4 fallback for v6 raw socket */
1194                 if (connp->conn_family != tconn->conn_family)
1195                         continue;
1196                 /* If neither is exempt, then there's no conflict */
1197                 if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
1198                     (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
1199                         continue;
1200                 /* We are only concerned about sockets for a different zone */
1201                 if (connp->conn_zoneid == tconn->conn_zoneid)
1202                         continue;
1203                 /* If both are bound to different addrs, ok */
1204                 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) &&
1205                     !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_laddr_v6) &&
1206                     !IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6,
1207                     &tconn->conn_laddr_v6))
1208                         continue;
1209                 /* These two conflict; fail */
1210                 break;
1211         }
1212         mutex_exit(&connfp->connf_lock);
1213         return (tconn != NULL);
1214 }
1215 
1216 /*
1217  * (v4, v6) bind hash insertion routines
1218  * The caller has already setup the conn (conn_proto, conn_laddr_v6, conn_lport)
1219  */
1220 
1221 int
1222 ipcl_bind_insert(conn_t *connp)
1223 {
1224         if (connp->conn_ipversion == IPV6_VERSION)
1225                 return (ipcl_bind_insert_v6(connp));
1226         else
1227                 return (ipcl_bind_insert_v4(connp));
1228 }
1229 
1230 int
1231 ipcl_bind_insert_v4(conn_t *connp)
1232 {
1233         connf_t *connfp;
1234         int     ret = 0;
1235         ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
1236         uint16_t        lport = connp->conn_lport;
1237         uint8_t         protocol = connp->conn_proto;
1238 
1239         if (IPCL_IS_IPTUN(connp))
1240                 return (ipcl_iptun_hash_insert(connp, ipst));
1241 
1242         switch (protocol) {
1243         default:
1244                 if (is_system_labeled() &&
1245                     check_exempt_conflict_v4(connp, ipst))
1246                         return (EADDRINUSE);
1247                 /* FALLTHROUGH */
1248         case IPPROTO_UDP:
1249                 if (protocol == IPPROTO_UDP) {
1250                         connfp = &ipst->ips_ipcl_udp_fanout[
1251                             IPCL_UDP_HASH(lport, ipst)];
1252                 } else {
1253                         connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
1254                 }
1255 
1256                 if (connp->conn_faddr_v4 != INADDR_ANY) {
1257                         IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1258                 } else if (connp->conn_laddr_v4 != INADDR_ANY) {
1259                         IPCL_HASH_INSERT_BOUND(connfp, connp);
1260                 } else {
1261                         IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1262                 }
1263                 if (protocol == IPPROTO_RSVP)
1264                         ill_set_inputfn_all(ipst);
1265                 break;
1266 
1267         case IPPROTO_TCP:
1268                 /* Insert it in the Bind Hash */
1269                 ASSERT(connp->conn_zoneid != ALL_ZONES);
1270                 connfp = &ipst->ips_ipcl_bind_fanout[
1271                     IPCL_BIND_HASH(lport, ipst)];
1272                 if (connp->conn_laddr_v4 != INADDR_ANY) {
1273                         IPCL_HASH_INSERT_BOUND(connfp, connp);
1274                 } else {
1275                         IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1276                 }
1277                 if (cl_inet_listen != NULL) {
1278                         ASSERT(connp->conn_ipversion == IPV4_VERSION);
1279                         connp->conn_flags |= IPCL_CL_LISTENER;
1280                         (*cl_inet_listen)(
1281                             connp->conn_netstack->netstack_stackid,
1282                             IPPROTO_TCP, AF_INET,
1283                             (uint8_t *)&connp->conn_bound_addr_v4, lport, NULL);
1284                 }
1285                 break;
1286 
1287         case IPPROTO_SCTP:
1288                 ret = ipcl_sctp_hash_insert(connp, lport);
1289                 break;
1290 
1291         case IPPROTO_DCCP:
1292                 cmn_err(CE_NOTE, "ipcl_bind_insert_v4");
1293                 ASSERT(connp->conn_zoneid != ALL_ZONES);
1294                 connfp = &ipst->ips_ipcl_dccp_fanout[
1295                     IPCL_DCCP_HASH(lport, ipst)];
1296                 if (connp->conn_laddr_v4 != INADDR_ANY) {
1297                         IPCL_HASH_INSERT_BOUND(connfp, connp);
1298                 } else {
1299                         IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1300                 }
1301                 /* XXX:DCCP */
1302                 break;
1303         }
1304 
1305 
1306         return (ret);
1307 }
1308 
1309 int
1310 ipcl_bind_insert_v6(conn_t *connp)
1311 {
1312         connf_t         *connfp;
1313         int             ret = 0;
1314         ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
1315         uint16_t        lport = connp->conn_lport;
1316         uint8_t         protocol = connp->conn_proto;
1317 
1318         if (IPCL_IS_IPTUN(connp)) {
1319                 return (ipcl_iptun_hash_insert_v6(connp, ipst));
1320         }
1321 
1322         switch (protocol) {
1323         default:
1324                 if (is_system_labeled() &&
1325                     check_exempt_conflict_v6(connp, ipst))
1326                         return (EADDRINUSE);
1327                 /* FALLTHROUGH */
1328         case IPPROTO_UDP:
1329                 if (protocol == IPPROTO_UDP) {
1330                         connfp = &ipst->ips_ipcl_udp_fanout[
1331                             IPCL_UDP_HASH(lport, ipst)];
1332                 } else {
1333                         connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
1334                 }
1335 
1336                 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
1337                         IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1338                 } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
1339                         IPCL_HASH_INSERT_BOUND(connfp, connp);
1340                 } else {
1341                         IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1342                 }
1343                 break;
1344 
1345         case IPPROTO_TCP:
1346                 /* Insert it in the Bind Hash */
1347                 ASSERT(connp->conn_zoneid != ALL_ZONES);
1348                 connfp = &ipst->ips_ipcl_bind_fanout[
1349                     IPCL_BIND_HASH(lport, ipst)];
1350                 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
1351                         IPCL_HASH_INSERT_BOUND(connfp, connp);
1352                 } else {
1353                         IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1354                 }
1355                 if (cl_inet_listen != NULL) {
1356                         sa_family_t     addr_family;
1357                         uint8_t         *laddrp;
1358 
1359                         if (connp->conn_ipversion == IPV6_VERSION) {
1360                                 addr_family = AF_INET6;
1361                                 laddrp =
1362                                     (uint8_t *)&connp->conn_bound_addr_v6;
1363                         } else {
1364                                 addr_family = AF_INET;
1365                                 laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
1366                         }
1367                         connp->conn_flags |= IPCL_CL_LISTENER;
1368                         (*cl_inet_listen)(
1369                             connp->conn_netstack->netstack_stackid,
1370                             IPPROTO_TCP, addr_family, laddrp, lport, NULL);
1371                 }
1372                 break;
1373 
1374         case IPPROTO_SCTP:
1375                 ret = ipcl_sctp_hash_insert(connp, lport);
1376                 break;
1377 
1378         case IPPROTO_DCCP:
1379                 /* XXX:DCCP */
1380                 break;
1381         }
1382 
1383         return (ret);
1384 }
1385 
1386 /*
1387  * ipcl_conn_hash insertion routines.
1388  * The caller has already set conn_proto and the addresses/ports in the conn_t.
1389  */
1390 
1391 int
1392 ipcl_conn_insert(conn_t *connp)
1393 {
1394         if (connp->conn_ipversion == IPV6_VERSION)
1395                 return (ipcl_conn_insert_v6(connp));
1396         else
1397                 return (ipcl_conn_insert_v4(connp));
1398 }
1399 
1400 int
1401 ipcl_conn_insert_v4(conn_t *connp)
1402 {
1403         connf_t         *connfp;
1404         conn_t          *tconnp;
1405         int             ret = 0;
1406         ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
1407         uint16_t        lport = connp->conn_lport;
1408         uint8_t         protocol = connp->conn_proto;
1409 
1410         if (IPCL_IS_IPTUN(connp))
1411                 return (ipcl_iptun_hash_insert(connp, ipst));
1412 
1413         switch (protocol) {
1414         case IPPROTO_TCP:
1415                 /*
1416                  * For TCP, we check whether the connection tuple already
1417                  * exists before allowing the connection to proceed.  We
1418                  * also allow indexing on the zoneid. This is to allow
1419                  * multiple shared stack zones to have the same tcp
1420                  * connection tuple. In practice this only happens for
1421                  * INADDR_LOOPBACK as it's the only local address which
1422                  * doesn't have to be unique.
1423                  */
1424                 connfp = &ipst->ips_ipcl_conn_fanout[
1425                     IPCL_CONN_HASH(connp->conn_faddr_v4,
1426                     connp->conn_ports, ipst)];
1427                 mutex_enter(&connfp->connf_lock);
1428                 for (tconnp = connfp->connf_head; tconnp != NULL;
1429                     tconnp = tconnp->conn_next) {
1430                         if (IPCL_CONN_MATCH(tconnp, connp->conn_proto,
1431                             connp->conn_faddr_v4, connp->conn_laddr_v4,
1432                             connp->conn_ports) &&
1433                             IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
1434                                 /* Already have a conn. bail out */
1435                                 mutex_exit(&connfp->connf_lock);
1436                                 return (EADDRINUSE);
1437                         }
1438                 }
1439                 if (connp->conn_fanout != NULL) {
1440                         /*
1441                          * Probably a XTI/TLI application trying to do a
1442                          * rebind. Let it happen.
1443                          */
1444                         mutex_exit(&connfp->connf_lock);
1445                         IPCL_HASH_REMOVE(connp);
1446                         mutex_enter(&connfp->connf_lock);
1447                 }
1448 
1449                 ASSERT(connp->conn_recv != NULL);
1450                 ASSERT(connp->conn_recvicmp != NULL);
1451 
1452                 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1453                 mutex_exit(&connfp->connf_lock);
1454                 break;
1455 
1456         case IPPROTO_SCTP:
1457                 /*
1458                  * The raw socket may have already been bound, remove it
1459                  * from the hash first.
1460                  */
1461                 IPCL_HASH_REMOVE(connp);
1462                 ret = ipcl_sctp_hash_insert(connp, lport);
1463                 break;
1464 
1465         case IPPROTO_DCCP:
1466                 cmn_err(CE_NOTE, "insert v4");
1467 
1468                 connfp = &ipst->ips_ipcl_conn_fanout[
1469                     IPCL_CONN_HASH(connp->conn_faddr_v4,
1470                     connp->conn_ports, ipst)];
1471                 mutex_enter(&connfp->connf_lock);
1472                 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1473                 mutex_exit(&connfp->connf_lock);
1474                 /* XXX:DCCP */
1475                 break;
1476 
1477         default:
1478                 /*
1479                  * Check for conflicts among MAC exempt bindings.  For
1480                  * transports with port numbers, this is done by the upper
1481                  * level per-transport binding logic.  For all others, it's
1482                  * done here.
1483                  */
1484                 if (is_system_labeled() &&
1485                     check_exempt_conflict_v4(connp, ipst))
1486                         return (EADDRINUSE);
1487                 /* FALLTHROUGH */
1488 
1489         case IPPROTO_UDP:
1490                 if (protocol == IPPROTO_UDP) {
1491                         connfp = &ipst->ips_ipcl_udp_fanout[
1492                             IPCL_UDP_HASH(lport, ipst)];
1493                 } else {
1494                         connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
1495                 }
1496 
1497                 if (connp->conn_faddr_v4 != INADDR_ANY) {
1498                         IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1499                 } else if (connp->conn_laddr_v4 != INADDR_ANY) {
1500                         IPCL_HASH_INSERT_BOUND(connfp, connp);
1501                 } else {
1502                         IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1503                 }
1504                 break;
1505         }
1506 
1507         return (ret);
1508 }
1509 
1510 int
1511 ipcl_conn_insert_v6(conn_t *connp)
1512 {
1513         connf_t         *connfp;
1514         conn_t          *tconnp;
1515         int             ret = 0;
1516         ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
1517         uint16_t        lport = connp->conn_lport;
1518         uint8_t         protocol = connp->conn_proto;
1519         uint_t          ifindex = connp->conn_bound_if;
1520 
1521         if (IPCL_IS_IPTUN(connp))
1522                 return (ipcl_iptun_hash_insert_v6(connp, ipst));
1523 
1524         switch (protocol) {
1525         case IPPROTO_TCP:
1526 
1527                 /*
1528                  * For tcp, we check whether the connection tuple already
1529                  * exists before allowing the connection to proceed.  We
1530                  * also allow indexing on the zoneid. This is to allow
1531                  * multiple shared stack zones to have the same tcp
1532                  * connection tuple. In practice this only happens for
1533                  * ipv6_loopback as it's the only local address which
1534                  * doesn't have to be unique.
1535                  */
1536                 connfp = &ipst->ips_ipcl_conn_fanout[
1537                     IPCL_CONN_HASH_V6(connp->conn_faddr_v6, connp->conn_ports,
1538                     ipst)];
1539                 mutex_enter(&connfp->connf_lock);
1540                 for (tconnp = connfp->connf_head; tconnp != NULL;
1541                     tconnp = tconnp->conn_next) {
1542                         /* NOTE: need to match zoneid. Bug in onnv-gate */
1543                         if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_proto,
1544                             connp->conn_faddr_v6, connp->conn_laddr_v6,
1545                             connp->conn_ports) &&
1546                             (tconnp->conn_bound_if == 0 ||
1547                             tconnp->conn_bound_if == ifindex) &&
1548                             IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
1549                                 /* Already have a conn. bail out */
1550                                 mutex_exit(&connfp->connf_lock);
1551                                 return (EADDRINUSE);
1552                         }
1553                 }
1554                 if (connp->conn_fanout != NULL) {
1555                         /*
1556                          * Probably a XTI/TLI application trying to do a
1557                          * rebind. Let it happen.
1558                          */
1559                         mutex_exit(&connfp->connf_lock);
1560                         IPCL_HASH_REMOVE(connp);
1561                         mutex_enter(&connfp->connf_lock);
1562                 }
1563                 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1564                 mutex_exit(&connfp->connf_lock);
1565                 break;
1566 
1567         case IPPROTO_SCTP:
1568                 IPCL_HASH_REMOVE(connp);
1569                 ret = ipcl_sctp_hash_insert(connp, lport);
1570                 break;
1571 
1572         case IPPROTO_DCCP:
1573                 /* XXX:DCCP */
1574                 break;
1575 
1576         default:
1577                 if (is_system_labeled() &&
1578                     check_exempt_conflict_v6(connp, ipst))
1579                         return (EADDRINUSE);
1580                 /* FALLTHROUGH */
1581         case IPPROTO_UDP:
1582                 if (protocol == IPPROTO_UDP) {
1583                         connfp = &ipst->ips_ipcl_udp_fanout[
1584                             IPCL_UDP_HASH(lport, ipst)];
1585                 } else {
1586                         connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
1587                 }
1588 
1589                 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
1590                         IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1591                 } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
1592                         IPCL_HASH_INSERT_BOUND(connfp, connp);
1593                 } else {
1594                         IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1595                 }
1596                 break;
1597         }
1598 
1599         return (ret);
1600 }
1601 
1602 /*
1603  * v4 packet classifying function. looks up the fanout table to
1604  * find the conn, the packet belongs to. returns the conn with
1605  * the reference held, null otherwise.
1606  *
1607  * If zoneid is ALL_ZONES, then the search rules described in the "Connection
1608  * Lookup" comment block are applied.  Labels are also checked as described
1609  * above.  If the packet is from the inside (looped back), and is from the same
1610  * zone, then label checks are omitted.
1611  */
1612 conn_t *
1613 ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
1614     ip_recv_attr_t *ira, ip_stack_t *ipst)
1615 {
1616         ipha_t  *ipha;
1617         connf_t *connfp, *bind_connfp;
1618         uint16_t lport;
1619         uint16_t fport;
1620         uint32_t ports;
1621         conn_t  *connp;
1622         uint16_t  *up;
1623         zoneid_t        zoneid = ira->ira_zoneid;
1624 
1625         ipha = (ipha_t *)mp->b_rptr;
1626         up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET);
1627 
1628         switch (protocol) {
1629         case IPPROTO_TCP:
1630                 ports = *(uint32_t *)up;
1631                 connfp =
1632                     &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src,
1633                     ports, ipst)];
1634                 mutex_enter(&connfp->connf_lock);
1635                 for (connp = connfp->connf_head; connp != NULL;
1636                     connp = connp->conn_next) {
1637                         if (IPCL_CONN_MATCH(connp, protocol,
1638                             ipha->ipha_src, ipha->ipha_dst, ports) &&
1639                             (connp->conn_zoneid == zoneid ||
1640                             connp->conn_allzones ||
1641                             ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1642                             (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1643                             (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1644                                 break;
1645                 }
1646 
1647                 if (connp != NULL) {
1648                         /*
1649                          * We have a fully-bound TCP connection.
1650                          *
1651                          * For labeled systems, there's no need to check the
1652                          * label here.  It's known to be good as we checked
1653                          * before allowing the connection to become bound.
1654                          */
1655                         CONN_INC_REF(connp);
1656                         mutex_exit(&connfp->connf_lock);
1657                         return (connp);
1658                 }
1659 
1660                 mutex_exit(&connfp->connf_lock);
1661                 lport = up[1];
1662                 bind_connfp =
1663                     &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
1664                 mutex_enter(&bind_connfp->connf_lock);
1665                 for (connp = bind_connfp->connf_head; connp != NULL;
1666                     connp = connp->conn_next) {
1667                         if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst,
1668                             lport) &&
1669                             (connp->conn_zoneid == zoneid ||
1670                             connp->conn_allzones ||
1671                             ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1672                             (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1673                             (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1674                                 break;
1675                 }
1676 
1677                 /*
1678                  * If the matching connection is SLP on a private address, then
1679                  * the label on the packet must match the local zone's label.
1680                  * Otherwise, it must be in the label range defined by tnrh.
1681                  * This is ensured by tsol_receive_local.
1682                  *
1683                  * Note that we don't check tsol_receive_local for
1684                  * the connected case.
1685                  */
1686                 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1687                     !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
1688                     ira, connp)) {
1689                         DTRACE_PROBE3(tx__ip__log__info__classify__tcp,
1690                             char *, "connp(1) could not receive mp(2)",
1691                             conn_t *, connp, mblk_t *, mp);
1692                         connp = NULL;
1693                 }
1694 
1695                 if (connp != NULL) {
1696                         /* Have a listener at least */
1697                         CONN_INC_REF(connp);
1698                         mutex_exit(&bind_connfp->connf_lock);
1699                         return (connp);
1700                 }
1701 
1702                 mutex_exit(&bind_connfp->connf_lock);
1703                 break;
1704 
1705         case IPPROTO_UDP:
1706                 lport = up[1];
1707                 fport = up[0];
1708                 connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
1709                 mutex_enter(&connfp->connf_lock);
1710                 for (connp = connfp->connf_head; connp != NULL;
1711                     connp = connp->conn_next) {
1712                         if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst,
1713                             fport, ipha->ipha_src) &&
1714                             (connp->conn_zoneid == zoneid ||
1715                             connp->conn_allzones ||
1716                             ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1717                             (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE))))
1718                                 break;
1719                 }
1720 
1721                 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1722                     !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
1723                     ira, connp)) {
1724                         DTRACE_PROBE3(tx__ip__log__info__classify__udp,
1725                             char *, "connp(1) could not receive mp(2)",
1726                             conn_t *, connp, mblk_t *, mp);
1727                         connp = NULL;
1728                 }
1729 
1730                 if (connp != NULL) {
1731                         CONN_INC_REF(connp);
1732                         mutex_exit(&connfp->connf_lock);
1733                         return (connp);
1734                 }
1735 
1736                 /*
1737                  * We shouldn't come here for multicast/broadcast packets
1738                  */
1739                 mutex_exit(&connfp->connf_lock);
1740 
1741                 break;
1742 
1743         case IPPROTO_DCCP:
1744                 fport = up[0];
1745                 lport = up[1];
1746                 connfp = &ipst->ips_ipcl_dccp_fanout[IPCL_DCCP_HASH(
1747                     lport, ipst)];
1748                 mutex_enter(&connfp->connf_lock);
1749                 for (connp = connfp->connf_head; connp != NULL;
1750                     connp = connp->conn_next) {
1751                         cmn_err(CE_NOTE, "connfp found");
1752                         /* XXX:DCCP */
1753                         if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst,
1754                             fport, ipha->ipha_src)) {
1755                                 break;
1756                         }
1757                 }
1758 
1759                 if (connp != NULL) {
1760                         CONN_INC_REF(connp);
1761                         mutex_exit(&connfp->connf_lock);
1762                         return (connp);
1763                 }
1764 
1765                 mutex_exit(&connfp->connf_lock);
1766                 break;
1767 
1768         case IPPROTO_ENCAP:
1769         case IPPROTO_IPV6:
1770                 return (ipcl_iptun_classify_v4(&ipha->ipha_src,
1771                     &ipha->ipha_dst, ipst));
1772         }
1773 
1774         return (NULL);
1775 }
1776 
1777 conn_t *
1778 ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
1779     ip_recv_attr_t *ira, ip_stack_t *ipst)
1780 {
1781         ip6_t           *ip6h;
1782         connf_t         *connfp, *bind_connfp;
1783         uint16_t        lport;
1784         uint16_t        fport;
1785         tcpha_t         *tcpha;
1786         uint32_t        ports;
1787         conn_t          *connp;
1788         uint16_t        *up;
1789         zoneid_t        zoneid = ira->ira_zoneid;
1790 
1791         ip6h = (ip6_t *)mp->b_rptr;
1792 
1793         switch (protocol) {
1794         case IPPROTO_TCP:
1795                 tcpha = (tcpha_t *)&mp->b_rptr[hdr_len];
1796                 up = &tcpha->tha_lport;
1797                 ports = *(uint32_t *)up;
1798 
1799                 connfp =
1800                     &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src,
1801                     ports, ipst)];
1802                 mutex_enter(&connfp->connf_lock);
1803                 for (connp = connfp->connf_head; connp != NULL;
1804                     connp = connp->conn_next) {
1805                         if (IPCL_CONN_MATCH_V6(connp, protocol,
1806                             ip6h->ip6_src, ip6h->ip6_dst, ports) &&
1807                             (connp->conn_zoneid == zoneid ||
1808                             connp->conn_allzones ||
1809                             ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1810                             (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1811                             (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1812                                 break;
1813                 }
1814 
1815                 if (connp != NULL) {
1816                         /*
1817                          * We have a fully-bound TCP connection.
1818                          *
1819                          * For labeled systems, there's no need to check the
1820                          * label here.  It's known to be good as we checked
1821                          * before allowing the connection to become bound.
1822                          */
1823                         CONN_INC_REF(connp);
1824                         mutex_exit(&connfp->connf_lock);
1825                         return (connp);
1826                 }
1827 
1828                 mutex_exit(&connfp->connf_lock);
1829 
1830                 lport = up[1];
1831                 bind_connfp =
1832                     &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
1833                 mutex_enter(&bind_connfp->connf_lock);
1834                 for (connp = bind_connfp->connf_head; connp != NULL;
1835                     connp = connp->conn_next) {
1836                         if (IPCL_BIND_MATCH_V6(connp, protocol,
1837                             ip6h->ip6_dst, lport) &&
1838                             (connp->conn_zoneid == zoneid ||
1839                             connp->conn_allzones ||
1840                             ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1841                             (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1842                             (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1843                                 break;
1844                 }
1845 
1846                 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1847                     !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
1848                     ira, connp)) {
1849                         DTRACE_PROBE3(tx__ip__log__info__classify__tcp6,
1850                             char *, "connp(1) could not receive mp(2)",
1851                             conn_t *, connp, mblk_t *, mp);
1852                         connp = NULL;
1853                 }
1854 
1855                 if (connp != NULL) {
1856                         /* Have a listner at least */
1857                         CONN_INC_REF(connp);
1858                         mutex_exit(&bind_connfp->connf_lock);
1859                         return (connp);
1860                 }
1861 
1862                 mutex_exit(&bind_connfp->connf_lock);
1863                 break;
1864 
1865         case IPPROTO_UDP:
1866                 up = (uint16_t *)&mp->b_rptr[hdr_len];
1867                 lport = up[1];
1868                 fport = up[0];
1869                 connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
1870                 mutex_enter(&connfp->connf_lock);
1871                 for (connp = connfp->connf_head; connp != NULL;
1872                     connp = connp->conn_next) {
1873                         if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst,
1874                             fport, ip6h->ip6_src) &&
1875                             (connp->conn_zoneid == zoneid ||
1876                             connp->conn_allzones ||
1877                             ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1878                             (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1879                             (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1880                                 break;
1881                 }
1882 
1883                 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1884                     !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
1885                     ira, connp)) {
1886                         DTRACE_PROBE3(tx__ip__log__info__classify__udp6,
1887                             char *, "connp(1) could not receive mp(2)",
1888                             conn_t *, connp, mblk_t *, mp);
1889                         connp = NULL;
1890                 }
1891 
1892                 if (connp != NULL) {
1893                         CONN_INC_REF(connp);
1894                         mutex_exit(&connfp->connf_lock);
1895                         return (connp);
1896                 }
1897 
1898                 /*
1899                  * We shouldn't come here for multicast/broadcast packets
1900                  */
1901                 mutex_exit(&connfp->connf_lock);
1902                 break;
1903         case IPPROTO_ENCAP:
1904         case IPPROTO_IPV6:
1905                 return (ipcl_iptun_classify_v6(&ip6h->ip6_src,
1906                     &ip6h->ip6_dst, ipst));
1907         }
1908 
1909         return (NULL);
1910 }
1911 
1912 /*
1913  * wrapper around ipcl_classify_(v4,v6) routines.
1914  */
1915 conn_t *
1916 ipcl_classify(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst)
1917 {
1918         if (ira->ira_flags & IRAF_IS_IPV4) {
1919                 return (ipcl_classify_v4(mp, ira->ira_protocol,
1920                     ira->ira_ip_hdr_length, ira, ipst));
1921         } else {
1922                 return (ipcl_classify_v6(mp, ira->ira_protocol,
1923                     ira->ira_ip_hdr_length, ira, ipst));
1924         }
1925 }
1926 
1927 /*
1928  * Only used to classify SCTP RAW sockets
1929  */
1930 conn_t *
1931 ipcl_classify_raw(mblk_t *mp, uint8_t protocol, uint32_t ports,
1932     ipha_t *ipha, ip6_t *ip6h, ip_recv_attr_t *ira, ip_stack_t *ipst)
1933 {
1934         connf_t         *connfp;
1935         conn_t          *connp;
1936         in_port_t       lport;
1937         int             ipversion;
1938         const void      *dst;
1939         zoneid_t        zoneid = ira->ira_zoneid;
1940 
1941         lport = ((uint16_t *)&ports)[1];
1942         if (ira->ira_flags & IRAF_IS_IPV4) {
1943                 dst = (const void *)&ipha->ipha_dst;
1944                 ipversion = IPV4_VERSION;
1945         } else {
1946                 dst = (const void *)&ip6h->ip6_dst;
1947                 ipversion = IPV6_VERSION;
1948         }
1949 
1950         connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
1951         mutex_enter(&connfp->connf_lock);
1952         for (connp = connfp->connf_head; connp != NULL;
1953             connp = connp->conn_next) {
1954                 /* We don't allow v4 fallback for v6 raw socket. */
1955                 if (ipversion != connp->conn_ipversion)
1956                         continue;
1957                 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1958                     !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1959                         if (ipversion == IPV4_VERSION) {
1960                                 if (!IPCL_CONN_MATCH(connp, protocol,
1961                                     ipha->ipha_src, ipha->ipha_dst, ports))
1962                                         continue;
1963                         } else {
1964                                 if (!IPCL_CONN_MATCH_V6(connp, protocol,
1965                                     ip6h->ip6_src, ip6h->ip6_dst, ports))
1966                                         continue;
1967                         }
1968                 } else {
1969                         if (ipversion == IPV4_VERSION) {
1970                                 if (!IPCL_BIND_MATCH(connp, protocol,
1971                                     ipha->ipha_dst, lport))
1972                                         continue;
1973                         } else {
1974                                 if (!IPCL_BIND_MATCH_V6(connp, protocol,
1975                                     ip6h->ip6_dst, lport))
1976                                         continue;
1977                         }
1978                 }
1979 
1980                 if (connp->conn_zoneid == zoneid ||
1981                     connp->conn_allzones ||
1982                     ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1983                     (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1984                     (ira->ira_flags & IRAF_TX_SHARED_ADDR)))
1985                         break;
1986         }
1987 
1988         if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1989             !tsol_receive_local(mp, dst, ipversion, ira, connp)) {
1990                 DTRACE_PROBE3(tx__ip__log__info__classify__rawip,
1991                     char *, "connp(1) could not receive mp(2)",
1992                     conn_t *, connp, mblk_t *, mp);
1993                 connp = NULL;
1994         }
1995 
1996         if (connp != NULL)
1997                 goto found;
1998         mutex_exit(&connfp->connf_lock);
1999 
2000         /* Try to look for a wildcard SCTP RAW socket match. */
2001         connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)];
2002         mutex_enter(&connfp->connf_lock);
2003         for (connp = connfp->connf_head; connp != NULL;
2004             connp = connp->conn_next) {
2005                 /* We don't allow v4 fallback for v6 raw socket. */
2006                 if (ipversion != connp->conn_ipversion)
2007                         continue;
2008                 if (!IPCL_ZONE_MATCH(connp, zoneid))
2009                         continue;
2010 
2011                 if (ipversion == IPV4_VERSION) {
2012                         if (IPCL_RAW_MATCH(connp, protocol, ipha->ipha_dst))
2013                                 break;
2014                 } else {
2015                         if (IPCL_RAW_MATCH_V6(connp, protocol, ip6h->ip6_dst)) {
2016                                 break;
2017                         }
2018                 }
2019         }
2020 
2021         if (connp != NULL)
2022                 goto found;
2023 
2024         mutex_exit(&connfp->connf_lock);
2025         return (NULL);
2026 
2027 found:
2028         ASSERT(connp != NULL);
2029         CONN_INC_REF(connp);
2030         mutex_exit(&connfp->connf_lock);
2031         return (connp);
2032 }
2033 
2034 /* ARGSUSED */
2035 static int
2036 tcp_conn_constructor(void *buf, void *cdrarg, int kmflags)
2037 {
2038         itc_t   *itc = (itc_t *)buf;
2039         conn_t  *connp = &itc->itc_conn;
2040         tcp_t   *tcp = (tcp_t *)&itc[1];
2041 
2042         bzero(connp, sizeof (conn_t));
2043         bzero(tcp, sizeof (tcp_t));
2044 
2045         mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2046         cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2047         cv_init(&connp->conn_sq_cv, NULL, CV_DEFAULT, NULL);
2048         tcp->tcp_timercache = tcp_timermp_alloc(kmflags);
2049         if (tcp->tcp_timercache == NULL)
2050                 return (ENOMEM);
2051         connp->conn_tcp = tcp;
2052         connp->conn_flags = IPCL_TCPCONN;
2053         connp->conn_proto = IPPROTO_TCP;
2054         tcp->tcp_connp = connp;
2055         rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2056 
2057         connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2058         if (connp->conn_ixa == NULL) {
2059                 tcp_timermp_free(tcp);
2060                 return (ENOMEM);
2061         }
2062         connp->conn_ixa->ixa_refcnt = 1;
2063         connp->conn_ixa->ixa_protocol = connp->conn_proto;
2064         connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2065         return (0);
2066 }
2067 
2068 /* ARGSUSED */
2069 static void
2070 tcp_conn_destructor(void *buf, void *cdrarg)
2071 {
2072         itc_t   *itc = (itc_t *)buf;
2073         conn_t  *connp = &itc->itc_conn;
2074         tcp_t   *tcp = (tcp_t *)&itc[1];
2075 
2076         ASSERT(connp->conn_flags & IPCL_TCPCONN);
2077         ASSERT(tcp->tcp_connp == connp);
2078         ASSERT(connp->conn_tcp == tcp);
2079         tcp_timermp_free(tcp);
2080         mutex_destroy(&connp->conn_lock);
2081         cv_destroy(&connp->conn_cv);
2082         cv_destroy(&connp->conn_sq_cv);
2083         rw_destroy(&connp->conn_ilg_lock);
2084 
2085         /* Can be NULL if constructor failed */
2086         if (connp->conn_ixa != NULL) {
2087                 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2088                 ASSERT(connp->conn_ixa->ixa_ire == NULL);
2089                 ASSERT(connp->conn_ixa->ixa_nce == NULL);
2090                 ixa_refrele(connp->conn_ixa);
2091         }
2092 }
2093 
2094 /* ARGSUSED */
2095 static int
2096 ip_conn_constructor(void *buf, void *cdrarg, int kmflags)
2097 {
2098         itc_t   *itc = (itc_t *)buf;
2099         conn_t  *connp = &itc->itc_conn;
2100 
2101         bzero(connp, sizeof (conn_t));
2102         mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2103         cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2104         connp->conn_flags = IPCL_IPCCONN;
2105         rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2106 
2107         connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2108         if (connp->conn_ixa == NULL)
2109                 return (ENOMEM);
2110         connp->conn_ixa->ixa_refcnt = 1;
2111         connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2112         return (0);
2113 }
2114 
2115 /* ARGSUSED */
2116 static void
2117 ip_conn_destructor(void *buf, void *cdrarg)
2118 {
2119         itc_t   *itc = (itc_t *)buf;
2120         conn_t  *connp = &itc->itc_conn;
2121 
2122         ASSERT(connp->conn_flags & IPCL_IPCCONN);
2123         ASSERT(connp->conn_priv == NULL);
2124         mutex_destroy(&connp->conn_lock);
2125         cv_destroy(&connp->conn_cv);
2126         rw_destroy(&connp->conn_ilg_lock);
2127 
2128         /* Can be NULL if constructor failed */
2129         if (connp->conn_ixa != NULL) {
2130                 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2131                 ASSERT(connp->conn_ixa->ixa_ire == NULL);
2132                 ASSERT(connp->conn_ixa->ixa_nce == NULL);
2133                 ixa_refrele(connp->conn_ixa);
2134         }
2135 }
2136 
2137 /* ARGSUSED */
2138 static int
2139 udp_conn_constructor(void *buf, void *cdrarg, int kmflags)
2140 {
2141         itc_t   *itc = (itc_t *)buf;
2142         conn_t  *connp = &itc->itc_conn;
2143         udp_t   *udp = (udp_t *)&itc[1];
2144 
2145         bzero(connp, sizeof (conn_t));
2146         bzero(udp, sizeof (udp_t));
2147 
2148         mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2149         cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2150         connp->conn_udp = udp;
2151         connp->conn_flags = IPCL_UDPCONN;
2152         connp->conn_proto = IPPROTO_UDP;
2153         udp->udp_connp = connp;
2154         rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2155         connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2156         if (connp->conn_ixa == NULL)
2157                 return (ENOMEM);
2158         connp->conn_ixa->ixa_refcnt = 1;
2159         connp->conn_ixa->ixa_protocol = connp->conn_proto;
2160         connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2161         return (0);
2162 }
2163 
2164 /* ARGSUSED */
2165 static void
2166 udp_conn_destructor(void *buf, void *cdrarg)
2167 {
2168         itc_t   *itc = (itc_t *)buf;
2169         conn_t  *connp = &itc->itc_conn;
2170         udp_t   *udp = (udp_t *)&itc[1];
2171 
2172         ASSERT(connp->conn_flags & IPCL_UDPCONN);
2173         ASSERT(udp->udp_connp == connp);
2174         ASSERT(connp->conn_udp == udp);
2175         mutex_destroy(&connp->conn_lock);
2176         cv_destroy(&connp->conn_cv);
2177         rw_destroy(&connp->conn_ilg_lock);
2178 
2179         /* Can be NULL if constructor failed */
2180         if (connp->conn_ixa != NULL) {
2181                 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2182                 ASSERT(connp->conn_ixa->ixa_ire == NULL);
2183                 ASSERT(connp->conn_ixa->ixa_nce == NULL);
2184                 ixa_refrele(connp->conn_ixa);
2185         }
2186 }
2187 
2188 /* ARGSUSED */
2189 static int
2190 rawip_conn_constructor(void *buf, void *cdrarg, int kmflags)
2191 {
2192         itc_t   *itc = (itc_t *)buf;
2193         conn_t  *connp = &itc->itc_conn;
2194         icmp_t  *icmp = (icmp_t *)&itc[1];
2195 
2196         bzero(connp, sizeof (conn_t));
2197         bzero(icmp, sizeof (icmp_t));
2198 
2199         mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2200         cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2201         connp->conn_icmp = icmp;
2202         connp->conn_flags = IPCL_RAWIPCONN;
2203         connp->conn_proto = IPPROTO_ICMP;
2204         icmp->icmp_connp = connp;
2205         rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2206         connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2207         if (connp->conn_ixa == NULL)
2208                 return (ENOMEM);
2209         connp->conn_ixa->ixa_refcnt = 1;
2210         connp->conn_ixa->ixa_protocol = connp->conn_proto;
2211         connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2212         return (0);
2213 }
2214 
2215 /* ARGSUSED */
2216 static void
2217 rawip_conn_destructor(void *buf, void *cdrarg)
2218 {
2219         itc_t   *itc = (itc_t *)buf;
2220         conn_t  *connp = &itc->itc_conn;
2221         icmp_t  *icmp = (icmp_t *)&itc[1];
2222 
2223         ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
2224         ASSERT(icmp->icmp_connp == connp);
2225         ASSERT(connp->conn_icmp == icmp);
2226         mutex_destroy(&connp->conn_lock);
2227         cv_destroy(&connp->conn_cv);
2228         rw_destroy(&connp->conn_ilg_lock);
2229 
2230         /* Can be NULL if constructor failed */
2231         if (connp->conn_ixa != NULL) {
2232                 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2233                 ASSERT(connp->conn_ixa->ixa_ire == NULL);
2234                 ASSERT(connp->conn_ixa->ixa_nce == NULL);
2235                 ixa_refrele(connp->conn_ixa);
2236         }
2237 }
2238 
2239 /* ARGSUSED */
2240 static int
2241 rts_conn_constructor(void *buf, void *cdrarg, int kmflags)
2242 {
2243         itc_t   *itc = (itc_t *)buf;
2244         conn_t  *connp = &itc->itc_conn;
2245         rts_t   *rts = (rts_t *)&itc[1];
2246 
2247         bzero(connp, sizeof (conn_t));
2248         bzero(rts, sizeof (rts_t));
2249 
2250         mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2251         cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2252         connp->conn_rts = rts;
2253         connp->conn_flags = IPCL_RTSCONN;
2254         rts->rts_connp = connp;
2255         rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2256         connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2257         if (connp->conn_ixa == NULL)
2258                 return (ENOMEM);
2259         connp->conn_ixa->ixa_refcnt = 1;
2260         connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2261         return (0);
2262 }
2263 
2264 /* ARGSUSED */
2265 static void
2266 rts_conn_destructor(void *buf, void *cdrarg)
2267 {
2268         itc_t   *itc = (itc_t *)buf;
2269         conn_t  *connp = &itc->itc_conn;
2270         rts_t   *rts = (rts_t *)&itc[1];
2271 
2272         ASSERT(connp->conn_flags & IPCL_RTSCONN);
2273         ASSERT(rts->rts_connp == connp);
2274         ASSERT(connp->conn_rts == rts);
2275         mutex_destroy(&connp->conn_lock);
2276         cv_destroy(&connp->conn_cv);
2277         rw_destroy(&connp->conn_ilg_lock);
2278 
2279         /* Can be NULL if constructor failed */
2280         if (connp->conn_ixa != NULL) {
2281                 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2282                 ASSERT(connp->conn_ixa->ixa_ire == NULL);
2283                 ASSERT(connp->conn_ixa->ixa_nce == NULL);
2284                 ixa_refrele(connp->conn_ixa);
2285         }
2286 }
2287 
2288 /* ARGSUSED */
2289 static int
2290 dccp_conn_constructor(void *buf, void *cdrarg, int kmflags)
2291 {
2292         itc_t   *itc = (itc_t *)buf;
2293         conn_t  *connp = &itc->itc_conn;
2294         dccp_t  *dccp = (dccp_t *)&itc[1];
2295 
2296         bzero(connp, sizeof (conn_t));
2297         bzero(dccp, sizeof (dccp_t));
2298 
2299         mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2300         cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2301         rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2302 
2303         connp->conn_dccp = dccp;
2304         connp->conn_flags = IPCL_DCCPCONN;
2305         connp->conn_proto = IPPROTO_DCCP;
2306         dccp->dccp_connp = connp;
2307         connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2308         if (connp->conn_ixa == NULL)
2309                 return (NULL);
2310         connp->conn_ixa->ixa_refcnt = 1;
2311         connp->conn_ixa->ixa_protocol = connp->conn_proto;
2312         connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2313 
2314         return (0);
2315 }
2316 
2317 /* ARGSUSED */
2318 static void
2319 dccp_conn_destructor(void *buf, void *cdrarg)
2320 {
2321         itc_t   *itc = (itc_t *)buf;
2322         conn_t  *connp = &itc->itc_conn;
2323         dccp_t  *dccp = (dccp_t *)&itc[1];
2324 
2325         ASSERT(connp->conn_flags & IPCL_DCCPCONN);
2326         ASSERT(dccp->dccp_connp == connp);
2327         ASSERT(connp->conn_dccp == dccp);
2328 
2329         mutex_destroy(&connp->conn_lock);
2330         cv_destroy(&connp->conn_cv);
2331         rw_destroy(&connp->conn_ilg_lock);
2332 
2333         if (connp->conn_ixa != NULL) {
2334                 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2335                 ASSERT(connp->conn_ixa->ixa_ire == NULL);
2336                 ASSERT(connp->conn_ixa->ixa_nce == NULL);
2337 
2338                 ixa_refrele(connp->conn_ixa);
2339         }
2340 }
2341 
2342 /*
2343  * Called as part of ipcl_conn_destroy to assert and clear any pointers
2344  * in the conn_t.
2345  *
2346  * Below we list all the pointers in the conn_t as a documentation aid.
2347  * The ones that we can not ASSERT to be NULL are #ifdef'ed out.
2348  * If you add any pointers to the conn_t please add an ASSERT here
2349  * and #ifdef it out if it can't be actually asserted to be NULL.
2350  * In any case, we bzero most of the conn_t at the end of the function.
2351  */
2352 void
2353 ipcl_conn_cleanup(conn_t *connp)
2354 {
2355         ip_xmit_attr_t  *ixa;
2356 
2357         ASSERT(connp->conn_latch == NULL);
2358         ASSERT(connp->conn_latch_in_policy == NULL);
2359         ASSERT(connp->conn_latch_in_action == NULL);
2360 #ifdef notdef
2361         ASSERT(connp->conn_rq == NULL);
2362         ASSERT(connp->conn_wq == NULL);
2363 #endif
2364         ASSERT(connp->conn_cred == NULL);
2365         ASSERT(connp->conn_g_fanout == NULL);
2366         ASSERT(connp->conn_g_next == NULL);
2367         ASSERT(connp->conn_g_prev == NULL);
2368         ASSERT(connp->conn_policy == NULL);
2369         ASSERT(connp->conn_fanout == NULL);
2370         ASSERT(connp->conn_next == NULL);
2371         ASSERT(connp->conn_prev == NULL);
2372         ASSERT(connp->conn_oper_pending_ill == NULL);
2373         ASSERT(connp->conn_ilg == NULL);
2374         ASSERT(connp->conn_drain_next == NULL);
2375         ASSERT(connp->conn_drain_prev == NULL);
2376 #ifdef notdef
2377         /* conn_idl is not cleared when removed from idl list */
2378         ASSERT(connp->conn_idl == NULL);
2379 #endif
2380         ASSERT(connp->conn_ipsec_opt_mp == NULL);
2381 #ifdef notdef
2382         /* conn_netstack is cleared by the caller; needed by ixa_cleanup */
2383         ASSERT(connp->conn_netstack == NULL);
2384 #endif
2385 
2386         ASSERT(connp->conn_helper_info == NULL);
2387         ASSERT(connp->conn_ixa != NULL);
2388         ixa = connp->conn_ixa;
2389         ASSERT(ixa->ixa_refcnt == 1);
2390         /* Need to preserve ixa_protocol */
2391         ixa_cleanup(ixa);
2392         ixa->ixa_flags = 0;
2393 
2394         /* Clear out the conn_t fields that are not preserved */
2395         bzero(&connp->conn_start_clr,
2396             sizeof (conn_t) -
2397             ((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp));
2398 }
2399 
2400 /*
2401  * All conns are inserted in a global multi-list for the benefit of
2402  * walkers. The walk is guaranteed to walk all open conns at the time
2403  * of the start of the walk exactly once. This property is needed to
2404  * achieve some cleanups during unplumb of interfaces. This is achieved
2405  * as follows.
2406  *
2407  * ipcl_conn_create and ipcl_conn_destroy are the only functions that
2408  * call the insert and delete functions below at creation and deletion
2409  * time respectively. The conn never moves or changes its position in this
2410  * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt
2411  * won't increase due to walkers, once the conn deletion has started. Note
2412  * that we can't remove the conn from the global list and then wait for
2413  * the refcnt to drop to zero, since walkers would then see a truncated
2414  * list. CONN_INCIPIENT ensures that walkers don't start looking at
2415  * conns until ip_open is ready to make them globally visible.
2416  * The global round robin multi-list locks are held only to get the
2417  * next member/insertion/deletion and contention should be negligible
2418  * if the multi-list is much greater than the number of cpus.
2419  */
2420 void
2421 ipcl_globalhash_insert(conn_t *connp)
2422 {
2423         int     index;
2424         struct connf_s  *connfp;
2425         ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
2426 
2427         /*
2428          * No need for atomic here. Approximate even distribution
2429          * in the global lists is sufficient.
2430          */
2431         ipst->ips_conn_g_index++;
2432         index = ipst->ips_conn_g_index & (CONN_G_HASH_SIZE - 1);
2433 
2434         connp->conn_g_prev = NULL;
2435         /*
2436          * Mark as INCIPIENT, so that walkers will ignore this
2437          * for now, till ip_open is ready to make it visible globally.
2438          */
2439         connp->conn_state_flags |= CONN_INCIPIENT;
2440 
2441         connfp = &ipst->ips_ipcl_globalhash_fanout[index];
2442         /* Insert at the head of the list */
2443         mutex_enter(&connfp->connf_lock);
2444         connp->conn_g_next = connfp->connf_head;
2445         if (connp->conn_g_next != NULL)
2446                 connp->conn_g_next->conn_g_prev = connp;
2447         connfp->connf_head = connp;
2448 
2449         /* The fanout bucket this conn points to */
2450         connp->conn_g_fanout = connfp;
2451 
2452         mutex_exit(&connfp->connf_lock);
2453 }
2454 
2455 void
2456 ipcl_globalhash_remove(conn_t *connp)
2457 {
2458         struct connf_s  *connfp;
2459 
2460         /*
2461          * We were never inserted in the global multi list.
2462          * IPCL_NONE variety is never inserted in the global multilist
2463          * since it is presumed to not need any cleanup and is transient.
2464          */
2465         if (connp->conn_g_fanout == NULL)
2466                 return;
2467 
2468         connfp = connp->conn_g_fanout;
2469         mutex_enter(&connfp->connf_lock);
2470         if (connp->conn_g_prev != NULL)
2471                 connp->conn_g_prev->conn_g_next = connp->conn_g_next;
2472         else
2473                 connfp->connf_head = connp->conn_g_next;
2474         if (connp->conn_g_next != NULL)
2475                 connp->conn_g_next->conn_g_prev = connp->conn_g_prev;
2476         mutex_exit(&connfp->connf_lock);
2477 
2478         /* Better to stumble on a null pointer than to corrupt memory */
2479         connp->conn_g_next = NULL;
2480         connp->conn_g_prev = NULL;
2481         connp->conn_g_fanout = NULL;
2482 }
2483 
2484 /*
2485  * Walk the list of all conn_t's in the system, calling the function provided
2486  * With the specified argument for each.
2487  * Applies to both IPv4 and IPv6.
2488  *
2489  * CONNs may hold pointers to ills (conn_dhcpinit_ill and
2490  * conn_oper_pending_ill). To guard against stale pointers
2491  * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is
2492  * unplumbed or removed. New conn_t's that are created while we are walking
2493  * may be missed by this walk, because they are not necessarily inserted
2494  * at the tail of the list. They are new conn_t's and thus don't have any
2495  * stale pointers. The CONN_CLOSING flag ensures that no new reference
2496  * is created to the struct that is going away.
2497  */
2498 void
2499 ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst)
2500 {
2501         int     i;
2502         conn_t  *connp;
2503         conn_t  *prev_connp;
2504 
2505         for (i = 0; i < CONN_G_HASH_SIZE; i++) {
2506                 mutex_enter(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2507                 prev_connp = NULL;
2508                 connp = ipst->ips_ipcl_globalhash_fanout[i].connf_head;
2509                 while (connp != NULL) {
2510                         mutex_enter(&connp->conn_lock);
2511                         if (connp->conn_state_flags &
2512                             (CONN_CONDEMNED | CONN_INCIPIENT)) {
2513                                 mutex_exit(&connp->conn_lock);
2514                                 connp = connp->conn_g_next;
2515                                 continue;
2516                         }
2517                         CONN_INC_REF_LOCKED(connp);
2518                         mutex_exit(&connp->conn_lock);
2519                         mutex_exit(
2520                             &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2521                         (*func)(connp, arg);
2522                         if (prev_connp != NULL)
2523                                 CONN_DEC_REF(prev_connp);
2524                         mutex_enter(
2525                             &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2526                         prev_connp = connp;
2527                         connp = connp->conn_g_next;
2528                 }
2529                 mutex_exit(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2530                 if (prev_connp != NULL)
2531                         CONN_DEC_REF(prev_connp);
2532         }
2533 }
2534 
2535 /*
2536  * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on
2537  * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
2538  * held; caller must call CONN_DEC_REF.  Only checks for connected entries
2539  * (peer tcp in ESTABLISHED state).
2540  */
2541 conn_t *
2542 ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcpha_t *tcpha,
2543     ip_stack_t *ipst)
2544 {
2545         uint32_t ports;
2546         uint16_t *pports = (uint16_t *)&ports;
2547         connf_t *connfp;
2548         conn_t  *tconnp;
2549         boolean_t zone_chk;
2550 
2551         /*
2552          * If either the source of destination address is loopback, then
2553          * both endpoints must be in the same Zone.  Otherwise, both of
2554          * the addresses are system-wide unique (tcp is in ESTABLISHED
2555          * state) and the endpoints may reside in different Zones.
2556          */
2557         zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) ||
2558             ipha->ipha_dst == htonl(INADDR_LOOPBACK));
2559 
2560         pports[0] = tcpha->tha_fport;
2561         pports[1] = tcpha->tha_lport;
2562 
2563         connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2564             ports, ipst)];
2565 
2566         mutex_enter(&connfp->connf_lock);
2567         for (tconnp = connfp->connf_head; tconnp != NULL;
2568             tconnp = tconnp->conn_next) {
2569 
2570                 if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2571                     ipha->ipha_dst, ipha->ipha_src, ports) &&
2572                     tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
2573                     (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
2574 
2575                         ASSERT(tconnp != connp);
2576                         CONN_INC_REF(tconnp);
2577                         mutex_exit(&connfp->connf_lock);
2578                         return (tconnp);
2579                 }
2580         }
2581         mutex_exit(&connfp->connf_lock);
2582         return (NULL);
2583 }
2584 
2585 /*
2586  * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on
2587  * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
2588  * held; caller must call CONN_DEC_REF.  Only checks for connected entries
2589  * (peer tcp in ESTABLISHED state).
2590  */
2591 conn_t *
2592 ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcpha_t *tcpha,
2593     ip_stack_t *ipst)
2594 {
2595         uint32_t ports;
2596         uint16_t *pports = (uint16_t *)&ports;
2597         connf_t *connfp;
2598         conn_t  *tconnp;
2599         boolean_t zone_chk;
2600 
2601         /*
2602          * If either the source of destination address is loopback, then
2603          * both endpoints must be in the same Zone.  Otherwise, both of
2604          * the addresses are system-wide unique (tcp is in ESTABLISHED
2605          * state) and the endpoints may reside in different Zones.  We
2606          * don't do Zone check for link local address(es) because the
2607          * current Zone implementation treats each link local address as
2608          * being unique per system node, i.e. they belong to global Zone.
2609          */
2610         zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) ||
2611             IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst));
2612 
2613         pports[0] = tcpha->tha_fport;
2614         pports[1] = tcpha->tha_lport;
2615 
2616         connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
2617             ports, ipst)];
2618 
2619         mutex_enter(&connfp->connf_lock);
2620         for (tconnp = connfp->connf_head; tconnp != NULL;
2621             tconnp = tconnp->conn_next) {
2622 
2623                 /* We skip conn_bound_if check here as this is loopback tcp */
2624                 if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2625                     ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2626                     tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
2627                     (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
2628 
2629                         ASSERT(tconnp != connp);
2630                         CONN_INC_REF(tconnp);
2631                         mutex_exit(&connfp->connf_lock);
2632                         return (tconnp);
2633                 }
2634         }
2635         mutex_exit(&connfp->connf_lock);
2636         return (NULL);
2637 }
2638 
2639 /*
2640  * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2641  * Returns with conn reference held. Caller must call CONN_DEC_REF.
2642  * Only checks for connected entries i.e. no INADDR_ANY checks.
2643  */
2644 conn_t *
2645 ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcpha_t *tcpha, int min_state,
2646     ip_stack_t *ipst)
2647 {
2648         uint32_t ports;
2649         uint16_t *pports;
2650         connf_t *connfp;
2651         conn_t  *tconnp;
2652 
2653         pports = (uint16_t *)&ports;
2654         pports[0] = tcpha->tha_fport;
2655         pports[1] = tcpha->tha_lport;
2656 
2657         connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2658             ports, ipst)];
2659 
2660         mutex_enter(&connfp->connf_lock);
2661         for (tconnp = connfp->connf_head; tconnp != NULL;
2662             tconnp = tconnp->conn_next) {
2663 
2664                 if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2665                     ipha->ipha_dst, ipha->ipha_src, ports) &&
2666                     tconnp->conn_tcp->tcp_state >= min_state) {
2667 
2668                         CONN_INC_REF(tconnp);
2669                         mutex_exit(&connfp->connf_lock);
2670                         return (tconnp);
2671                 }
2672         }
2673         mutex_exit(&connfp->connf_lock);
2674         return (NULL);
2675 }
2676 
2677 /*
2678  * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2679  * Returns with conn reference held. Caller must call CONN_DEC_REF.
2680  * Only checks for connected entries i.e. no INADDR_ANY checks.
2681  * Match on ifindex in addition to addresses.
2682  */
2683 conn_t *
2684 ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state,
2685     uint_t ifindex, ip_stack_t *ipst)
2686 {
2687         tcp_t   *tcp;
2688         uint32_t ports;
2689         uint16_t *pports;
2690         connf_t *connfp;
2691         conn_t  *tconnp;
2692 
2693         pports = (uint16_t *)&ports;
2694         pports[0] = tcpha->tha_fport;
2695         pports[1] = tcpha->tha_lport;
2696 
2697         connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
2698             ports, ipst)];
2699 
2700         mutex_enter(&connfp->connf_lock);
2701         for (tconnp = connfp->connf_head; tconnp != NULL;
2702             tconnp = tconnp->conn_next) {
2703 
2704                 tcp = tconnp->conn_tcp;
2705                 if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2706                     ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2707                     tcp->tcp_state >= min_state &&
2708                     (tconnp->conn_bound_if == 0 ||
2709                     tconnp->conn_bound_if == ifindex)) {
2710 
2711                         CONN_INC_REF(tconnp);
2712                         mutex_exit(&connfp->connf_lock);
2713                         return (tconnp);
2714                 }
2715         }
2716         mutex_exit(&connfp->connf_lock);
2717         return (NULL);
2718 }
2719 
2720 /*
2721  * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate
2722  * a listener when changing state.
2723  */
2724 conn_t *
2725 ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid,
2726     ip_stack_t *ipst)
2727 {
2728         connf_t         *bind_connfp;
2729         conn_t          *connp;
2730         tcp_t           *tcp;
2731 
2732         /*
2733          * Avoid false matches for packets sent to an IP destination of
2734          * all zeros.
2735          */
2736         if (laddr == 0)
2737                 return (NULL);
2738 
2739         ASSERT(zoneid != ALL_ZONES);
2740 
2741         bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2742         mutex_enter(&bind_connfp->connf_lock);
2743         for (connp = bind_connfp->connf_head; connp != NULL;
2744             connp = connp->conn_next) {
2745                 tcp = connp->conn_tcp;
2746                 if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) &&
2747                     IPCL_ZONE_MATCH(connp, zoneid) &&
2748                     (tcp->tcp_listener == NULL)) {
2749                         CONN_INC_REF(connp);
2750                         mutex_exit(&bind_connfp->connf_lock);
2751                         return (connp);
2752                 }
2753         }
2754         mutex_exit(&bind_connfp->connf_lock);
2755         return (NULL);
2756 }
2757 
2758 /*
2759  * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate
2760  * a listener when changing state.
2761  */
2762 conn_t *
2763 ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex,
2764     zoneid_t zoneid, ip_stack_t *ipst)
2765 {
2766         connf_t         *bind_connfp;
2767         conn_t          *connp = NULL;
2768         tcp_t           *tcp;
2769 
2770         /*
2771          * Avoid false matches for packets sent to an IP destination of
2772          * all zeros.
2773          */
2774         if (IN6_IS_ADDR_UNSPECIFIED(laddr))
2775                 return (NULL);
2776 
2777         ASSERT(zoneid != ALL_ZONES);
2778 
2779         bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2780         mutex_enter(&bind_connfp->connf_lock);
2781         for (connp = bind_connfp->connf_head; connp != NULL;
2782             connp = connp->conn_next) {
2783                 tcp = connp->conn_tcp;
2784                 if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) &&
2785                     IPCL_ZONE_MATCH(connp, zoneid) &&
2786                     (connp->conn_bound_if == 0 ||
2787                     connp->conn_bound_if == ifindex) &&
2788                     tcp->tcp_listener == NULL) {
2789                         CONN_INC_REF(connp);
2790                         mutex_exit(&bind_connfp->connf_lock);
2791                         return (connp);
2792                 }
2793         }
2794         mutex_exit(&bind_connfp->connf_lock);
2795         return (NULL);
2796 }
2797 
2798 /*
2799  * ipcl_get_next_conn
2800  *      get the next entry in the conn global list
2801  *      and put a reference on the next_conn.
2802  *      decrement the reference on the current conn.
2803  *
2804  * This is an iterator based walker function that also provides for
2805  * some selection by the caller. It walks through the conn_hash bucket
2806  * searching for the next valid connp in the list, and selects connections
2807  * that are neither closed nor condemned. It also REFHOLDS the conn
2808  * thus ensuring that the conn exists when the caller uses the conn.
2809  */
2810 conn_t *
2811 ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags)
2812 {
2813         conn_t  *next_connp;
2814 
2815         if (connfp == NULL)
2816                 return (NULL);
2817 
2818         mutex_enter(&connfp->connf_lock);
2819 
2820         next_connp = (connp == NULL) ?
2821             connfp->connf_head : connp->conn_g_next;
2822 
2823         while (next_connp != NULL) {
2824                 mutex_enter(&next_connp->conn_lock);
2825                 if (!(next_connp->conn_flags & conn_flags) ||
2826                     (next_connp->conn_state_flags &
2827                     (CONN_CONDEMNED | CONN_INCIPIENT))) {
2828                         /*
2829                          * This conn has been condemned or
2830                          * is closing, or the flags don't match
2831                          */
2832                         mutex_exit(&next_connp->conn_lock);
2833                         next_connp = next_connp->conn_g_next;
2834                         continue;
2835                 }
2836                 CONN_INC_REF_LOCKED(next_connp);
2837                 mutex_exit(&next_connp->conn_lock);
2838                 break;
2839         }
2840 
2841         mutex_exit(&connfp->connf_lock);
2842 
2843         if (connp != NULL)
2844                 CONN_DEC_REF(connp);
2845 
2846         return (next_connp);
2847 }
2848 
2849 #ifdef CONN_DEBUG
2850 /*
2851  * Trace of the last NBUF refhold/refrele
2852  */
2853 int
2854 conn_trace_ref(conn_t *connp)
2855 {
2856         int     last;
2857         conn_trace_t    *ctb;
2858 
2859         ASSERT(MUTEX_HELD(&connp->conn_lock));
2860         last = connp->conn_trace_last;
2861         last++;
2862         if (last == CONN_TRACE_MAX)
2863                 last = 0;
2864 
2865         ctb = &connp->conn_trace_buf[last];
2866         ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2867         connp->conn_trace_last = last;
2868         return (1);
2869 }
2870 
2871 int
2872 conn_untrace_ref(conn_t *connp)
2873 {
2874         int     last;
2875         conn_trace_t    *ctb;
2876 
2877         ASSERT(MUTEX_HELD(&connp->conn_lock));
2878         last = connp->conn_trace_last;
2879         last++;
2880         if (last == CONN_TRACE_MAX)
2881                 last = 0;
2882 
2883         ctb = &connp->conn_trace_buf[last];
2884         ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2885         connp->conn_trace_last = last;
2886         return (1);
2887 }
2888 #endif