1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 
  25 /*
  26  * IP PACKET CLASSIFIER
  27  *
  28  * The IP packet classifier provides mapping between IP packets and persistent
  29  * connection state for connection-oriented protocols. It also provides
  30  * interface for managing connection states.
  31  *
  32  * The connection state is kept in conn_t data structure and contains, among
  33  * other things:
  34  *
  35  *      o local/remote address and ports
  36  *      o Transport protocol
  37  *      o squeue for the connection (for TCP only)
  38  *      o reference counter
  39  *      o Connection state
  40  *      o hash table linkage
  41  *      o interface/ire information
  42  *      o credentials
  43  *      o ipsec policy
  44  *      o send and receive functions.
  45  *      o mutex lock.
  46  *
  47  * Connections use a reference counting scheme. They are freed when the
  48  * reference counter drops to zero. A reference is incremented when connection
  49  * is placed in a list or table, when incoming packet for the connection arrives
  50  * and when connection is processed via squeue (squeue processing may be
  51  * asynchronous and the reference protects the connection from being destroyed
  52  * before its processing is finished).
  53  *
  54  * conn_recv is used to pass up packets to the ULP.
  55  * For TCP conn_recv changes. It is tcp_input_listener_unbound initially for
  56  * a listener, and changes to tcp_input_listener as the listener has picked a
  57  * good squeue. For other cases it is set to tcp_input_data.
  58  *
  59  * conn_recvicmp is used to pass up ICMP errors to the ULP.
  60  *
  61  * Classifier uses several hash tables:
  62  *
  63  *      ipcl_conn_fanout:       contains all TCP connections in CONNECTED state
  64  *      ipcl_bind_fanout:       contains all connections in BOUND state
  65  *      ipcl_proto_fanout_v4:   IPv4 protocol fanout
  66  *      ipcl_proto_fanout_v6:   IPv6 protocol fanout
  67  *      ipcl_udp_fanout:        contains all UDP connections
  68  *      ipcl_iptun_fanout:      contains all IP tunnel connections
  69  *      ipcl_globalhash_fanout: contains all connections
  70  *
  71  * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering)
  72  * which need to view all existing connections.
  73  *
  74  * All tables are protected by per-bucket locks. When both per-bucket lock and
  75  * connection lock need to be held, the per-bucket lock should be acquired
  76  * first, followed by the connection lock.
  77  *
  78  * All functions doing search in one of these tables increment a reference
  79  * counter on the connection found (if any). This reference should be dropped
  80  * when the caller has finished processing the connection.
  81  *
  82  *
  83  * INTERFACES:
  84  * ===========
  85  *
  86  * Connection Lookup:
  87  * ------------------
  88  *
  89  * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, ira, ip_stack)
  90  * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, ira, ip_stack)
  91  *
  92  * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if
  93  * it can't find any associated connection. If the connection is found, its
  94  * reference counter is incremented.
  95  *
  96  *      mp:     mblock, containing packet header. The full header should fit
  97  *              into a single mblock. It should also contain at least full IP
  98  *              and TCP or UDP header.
  99  *
 100  *      protocol: Either IPPROTO_TCP or IPPROTO_UDP.
 101  *
 102  *      hdr_len: The size of IP header. It is used to find TCP or UDP header in
 103  *               the packet.
 104  *
 105  *      ira->ira_zoneid: The zone in which the returned connection must be; the
 106  *              zoneid corresponding to the ire_zoneid on the IRE located for
 107  *              the packet's destination address.
 108  *
 109  *      ira->ira_flags: Contains the IRAF_TX_MAC_EXEMPTABLE and
 110  *              IRAF_TX_SHARED_ADDR flags
 111  *
 112  *      For TCP connections, the lookup order is as follows:
 113  *              5-tuple {src, dst, protocol, local port, remote port}
 114  *                      lookup in ipcl_conn_fanout table.
 115  *              3-tuple {dst, remote port, protocol} lookup in
 116  *                      ipcl_bind_fanout table.
 117  *
 118  *      For UDP connections, a 5-tuple {src, dst, protocol, local port,
 119  *      remote port} lookup is done on ipcl_udp_fanout. Note that,
 120  *      these interfaces do not handle cases where a packet belongs
 121  *      to multiple UDP clients, which is handled in IP itself.
 122  *
 123  * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must
 124  * determine which actual zone gets the segment.  This is used only in a
 125  * labeled environment.  The matching rules are:
 126  *
 127  *      - If it's not a multilevel port, then the label on the packet selects
 128  *        the zone.  Unlabeled packets are delivered to the global zone.
 129  *
 130  *      - If it's a multilevel port, then only the zone registered to receive
 131  *        packets on that port matches.
 132  *
 133  * Also, in a labeled environment, packet labels need to be checked.  For fully
 134  * bound TCP connections, we can assume that the packet label was checked
 135  * during connection establishment, and doesn't need to be checked on each
 136  * packet.  For others, though, we need to check for strict equality or, for
 137  * multilevel ports, membership in the range or set.  This part currently does
 138  * a tnrh lookup on each packet, but could be optimized to use cached results
 139  * if that were necessary.  (SCTP doesn't come through here, but if it did,
 140  * we would apply the same rules as TCP.)
 141  *
 142  * An implication of the above is that fully-bound TCP sockets must always use
 143  * distinct 4-tuples; they can't be discriminated by label alone.
 144  *
 145  * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
 146  * as there's no connection set-up handshake and no shared state.
 147  *
 148  * Labels on looped-back packets within a single zone do not need to be
 149  * checked, as all processes in the same zone have the same label.
 150  *
 151  * Finally, for unlabeled packets received by a labeled system, special rules
 152  * apply.  We consider only the MLP if there is one.  Otherwise, we prefer a
 153  * socket in the zone whose label matches the default label of the sender, if
 154  * any.  In any event, the receiving socket must have SO_MAC_EXEMPT set and the
 155  * receiver's label must dominate the sender's default label.
 156  *
 157  * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcpha_t *, int, ip_stack);
 158  * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
 159  *                                       ip_stack);
 160  *
 161  *      Lookup routine to find an exact match for {src, dst, local port,
 162  *      remote port} for TCP connections in ipcl_conn_fanout. The address and
 163  *      ports are read from the IP and TCP header respectively.
 164  *
 165  * conn_t       *ipcl_lookup_listener_v4(lport, laddr, protocol,
 166  *                                       zoneid, ip_stack);
 167  * conn_t       *ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex,
 168  *                                       zoneid, ip_stack);
 169  *
 170  *      Lookup routine to find a listener with the tuple {lport, laddr,
 171  *      protocol} in the ipcl_bind_fanout table. For IPv6, an additional
 172  *      parameter interface index is also compared.
 173  *
 174  * void ipcl_walk(func, arg, ip_stack)
 175  *
 176  *      Apply 'func' to every connection available. The 'func' is called as
 177  *      (*func)(connp, arg). The walk is non-atomic so connections may be
 178  *      created and destroyed during the walk. The CONN_CONDEMNED and
 179  *      CONN_INCIPIENT flags ensure that connections which are newly created
 180  *      or being destroyed are not selected by the walker.
 181  *
 182  * Table Updates
 183  * -------------
 184  *
 185  * int ipcl_conn_insert(connp);
 186  * int ipcl_conn_insert_v4(connp);
 187  * int ipcl_conn_insert_v6(connp);
 188  *
 189  *      Insert 'connp' in the ipcl_conn_fanout.
 190  *      Arguments :
 191  *              connp           conn_t to be inserted
 192  *
 193  *      Return value :
 194  *              0               if connp was inserted
 195  *              EADDRINUSE      if the connection with the same tuple
 196  *                              already exists.
 197  *
 198  * int ipcl_bind_insert(connp);
 199  * int ipcl_bind_insert_v4(connp);
 200  * int ipcl_bind_insert_v6(connp);
 201  *
 202  *      Insert 'connp' in ipcl_bind_fanout.
 203  *      Arguments :
 204  *              connp           conn_t to be inserted
 205  *
 206  *
 207  * void ipcl_hash_remove(connp);
 208  *
 209  *      Removes the 'connp' from the connection fanout table.
 210  *
 211  * Connection Creation/Destruction
 212  * -------------------------------
 213  *
 214  * conn_t *ipcl_conn_create(type, sleep, netstack_t *)
 215  *
 216  *      Creates a new conn based on the type flag, inserts it into
 217  *      globalhash table.
 218  *
 219  *      type:   This flag determines the type of conn_t which needs to be
 220  *              created i.e., which kmem_cache it comes from.
 221  *              IPCL_TCPCONN    indicates a TCP connection
 222  *              IPCL_SCTPCONN   indicates a SCTP connection
 223  *              IPCL_UDPCONN    indicates a UDP conn_t.
 224  *              IPCL_RAWIPCONN  indicates a RAWIP/ICMP conn_t.
 225  *              IPCL_RTSCONN    indicates a RTS conn_t.
 226  *              IPCL_IPCCONN    indicates all other connections.
 227  *
 228  * void ipcl_conn_destroy(connp)
 229  *
 230  *      Destroys the connection state, removes it from the global
 231  *      connection hash table and frees its memory.
 232  */
 233 
 234 #include <sys/types.h>
 235 #include <sys/stream.h>
 236 #include <sys/stropts.h>
 237 #include <sys/sysmacros.h>
 238 #include <sys/strsubr.h>
 239 #include <sys/strsun.h>
 240 #define _SUN_TPI_VERSION 2
 241 #include <sys/ddi.h>
 242 #include <sys/cmn_err.h>
 243 #include <sys/debug.h>
 244 
 245 #include <sys/systm.h>
 246 #include <sys/param.h>
 247 #include <sys/kmem.h>
 248 #include <sys/isa_defs.h>
 249 #include <inet/common.h>
 250 #include <netinet/ip6.h>
 251 #include <netinet/icmp6.h>
 252 
 253 #include <inet/ip.h>
 254 #include <inet/ip_if.h>
 255 #include <inet/ip_ire.h>
 256 #include <inet/ip6.h>
 257 #include <inet/ip_ndp.h>
 258 #include <inet/ip_impl.h>
 259 #include <inet/udp_impl.h>
 260 #include <inet/sctp_ip.h>
 261 #include <inet/sctp/sctp_impl.h>
 262 #include <inet/rawip_impl.h>
 263 #include <inet/rts_impl.h>
 264 #include <inet/iptun/iptun_impl.h>
 265 
 266 #include <sys/cpuvar.h>
 267 
 268 #include <inet/ipclassifier.h>
 269 #include <inet/tcp.h>
 270 #include <inet/ipsec_impl.h>
 271 
 272 #include <sys/tsol/tnet.h>
 273 #include <sys/sockio.h>
 274 
 275 /* Old value for compatibility. Settable in /etc/system */
     /* Only consulted by ipcl_init() when ipcl_conn_hash_size is zero. */
 276 uint_t tcp_conn_hash_size = 0;
 277 
 278 /* New value. Zero means choose automatically.  Settable in /etc/system */
 279 uint_t ipcl_conn_hash_size = 0;
     /*
      * Automatic sizing in ipcl_init(): the requested bucket count is
      * (freemem * PAGESIZE) / ipcl_conn_hash_memfactor, limited to
      * ipcl_conn_hash_maxsize, and then mapped onto an entry of the
      * P2Ps() prime table.
      */
 280 uint_t ipcl_conn_hash_memfactor = 8192;
 281 uint_t ipcl_conn_hash_maxsize = 82500;
 282 
 283 /* bind/udp fanout table size; copied into the per-stack state by ipcl_init() */
 284 uint_t ipcl_bind_fanout_size = 512;
 285 uint_t ipcl_udp_fanout_size = 16384;
 286 
 287 /* Raw socket fanout size.  Must be a power of 2. */
 288 uint_t ipcl_raw_fanout_size = 256;
 289 
 290 /*
 291  * The IPCL_IPTUN_HASH() function works best with a prime table size.  We
 292  * expect that most large deployments would have hundreds of tunnels, and
 293  * thousands in the extreme case.
 294  */
 295 uint_t ipcl_iptun_fanout_size = 6143;
 296 
 297 /*
 298  * Power of 2^N Primes useful for hashing for N of 0-28,
 299  * these primes are the nearest prime <= 2^N - 2^(N-2).
      *
      * The table is indexed by N.  The entries for N < 3 and the final
      * (terminating) entry are 0.  ipcl_init() starts its search at index 9
      * and treats a selected value of 0 as "out of range", in which case it
      * falls back to the entry at index 16.
 300  */
 301 
 302 #define P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067,  \
 303                 6143, 12281, 24571, 49139, 98299, 196597, 393209,       \
 304                 786431, 1572853, 3145721, 6291449, 12582893, 25165813,  \
 305                 50331599, 100663291, 201326557, 0}
 306 
 307 /*
 308  * wrapper structure to ensure that conn and what follows it (tcp_t, etc)
 309  * are aligned on cache lines.
      *
      * The per-protocol caches created in ipcl_g_init() size their buffers as
      * sizeof (itc_t) plus the size of the protocol structure (tcp_t, udp_t,
      * icmp_t, rts_t).
 310  */
 311 typedef union itc_s {
 312         conn_t  itc_conn;       /* the connection state itself */
     /* filler sized by CACHE_ALIGN(), which is defined outside this file */
 313         char    itcu_filler[CACHE_ALIGN(conn_s)];
 314 } itc_t;
 315 
     /*
      * kmem caches from which conn_t structures are allocated, one per
      * connection flavor.  All but sctp_conn_cache are created in
      * ipcl_g_init() and destroyed in ipcl_g_destroy(); sctp_conn_cache is
      * only declared here and is defined elsewhere.
      */
 316 struct kmem_cache  *tcp_conn_cache;
 317 struct kmem_cache  *ip_conn_cache;
 318 extern struct kmem_cache  *sctp_conn_cache;
 319 struct kmem_cache  *udp_conn_cache;
 320 struct kmem_cache  *rawip_conn_cache;
 321 struct kmem_cache  *rts_conn_cache;
 322 
     /* TCP timer mblk helpers, defined outside this file. */
 323 extern void     tcp_timermp_free(tcp_t *);
 324 extern mblk_t   *tcp_timermp_alloc(int);
 325 
     /*
      * Constructor/destructor pairs handed to kmem_cache_create() in
      * ipcl_g_init(), one pair per cache.
      */
 326 static int      ip_conn_constructor(void *, void *, int);
 327 static void     ip_conn_destructor(void *, void *);
 328 
 329 static int      tcp_conn_constructor(void *, void *, int);
 330 static void     tcp_conn_destructor(void *, void *);
 331 
 332 static int      udp_conn_constructor(void *, void *, int);
 333 static void     udp_conn_destructor(void *, void *);
 334 
 335 static int      rawip_conn_constructor(void *, void *, int);
 336 static void     rawip_conn_destructor(void *, void *);
 337 
 338 static int      rts_conn_constructor(void *, void *, int);
 339 static void     rts_conn_destructor(void *, void *);
 340 
 341 /*
 342  * Global (for all stack instances) init routine
 343  */
 344 void
 345 ipcl_g_init(void)
 346 {
 347         ip_conn_cache = kmem_cache_create("ip_conn_cache",
 348             sizeof (conn_t), CACHE_ALIGN_SIZE,
 349             ip_conn_constructor, ip_conn_destructor,
 350             NULL, NULL, NULL, 0);
 351 
 352         tcp_conn_cache = kmem_cache_create("tcp_conn_cache",
 353             sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE,
 354             tcp_conn_constructor, tcp_conn_destructor,
 355             tcp_conn_reclaim, NULL, NULL, 0);
 356 
 357         udp_conn_cache = kmem_cache_create("udp_conn_cache",
 358             sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE,
 359             udp_conn_constructor, udp_conn_destructor,
 360             NULL, NULL, NULL, 0);
 361 
 362         rawip_conn_cache = kmem_cache_create("rawip_conn_cache",
 363             sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE,
 364             rawip_conn_constructor, rawip_conn_destructor,
 365             NULL, NULL, NULL, 0);
 366 
 367         rts_conn_cache = kmem_cache_create("rts_conn_cache",
 368             sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE,
 369             rts_conn_constructor, rts_conn_destructor,
 370             NULL, NULL, NULL, 0);
 371 }
 372 
 373 /*
 374  * ipclassifier intialization routine, sets up hash tables.
 375  */
 376 void
 377 ipcl_init(ip_stack_t *ipst)
 378 {
 379         int i;
 380         int sizes[] = P2Ps();
 381 
 382         /*
 383          * Calculate size of conn fanout table from /etc/system settings
 384          */
 385         if (ipcl_conn_hash_size != 0) {
 386                 ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size;
 387         } else if (tcp_conn_hash_size != 0) {
 388                 ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size;
 389         } else {
 390                 extern pgcnt_t freemem;
 391 
 392                 ipst->ips_ipcl_conn_fanout_size =
 393                     (freemem * PAGESIZE) / ipcl_conn_hash_memfactor;
 394 
 395                 if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) {
 396                         ipst->ips_ipcl_conn_fanout_size =
 397                             ipcl_conn_hash_maxsize;
 398                 }
 399         }
 400 
 401         for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) {
 402                 if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) {
 403                         break;
 404                 }
 405         }
 406         if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) {
 407                 /* Out of range, use the 2^16 value */
 408                 ipst->ips_ipcl_conn_fanout_size = sizes[16];
 409         }
 410 
 411         /* Take values from /etc/system */
 412         ipst->ips_ipcl_bind_fanout_size = ipcl_bind_fanout_size;
 413         ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size;
 414         ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size;
 415         ipst->ips_ipcl_iptun_fanout_size = ipcl_iptun_fanout_size;
 416 
 417         ASSERT(ipst->ips_ipcl_conn_fanout == NULL);
 418 
 419         ipst->ips_ipcl_conn_fanout = kmem_zalloc(
 420             ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP);
 421 
 422         for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
 423                 mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL,
 424                     MUTEX_DEFAULT, NULL);
 425         }
 426 
 427         ipst->ips_ipcl_bind_fanout = kmem_zalloc(
 428             ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP);
 429 
 430         for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
 431                 mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL,
 432                     MUTEX_DEFAULT, NULL);
 433         }
 434 
 435         ipst->ips_ipcl_proto_fanout_v4 = kmem_zalloc(IPPROTO_MAX *
 436             sizeof (connf_t), KM_SLEEP);
 437         for (i = 0; i < IPPROTO_MAX; i++) {
 438                 mutex_init(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock, NULL,
 439                     MUTEX_DEFAULT, NULL);
 440         }
 441 
 442         ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX *
 443             sizeof (connf_t), KM_SLEEP);
 444         for (i = 0; i < IPPROTO_MAX; i++) {
 445                 mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL,
 446                     MUTEX_DEFAULT, NULL);
 447         }
 448 
 449         ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP);
 450         mutex_init(&ipst->ips_rts_clients->connf_lock,
 451             NULL, MUTEX_DEFAULT, NULL);
 452 
 453         ipst->ips_ipcl_udp_fanout = kmem_zalloc(
 454             ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP);
 455         for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
 456                 mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL,
 457                     MUTEX_DEFAULT, NULL);
 458         }
 459 
 460         ipst->ips_ipcl_iptun_fanout = kmem_zalloc(
 461             ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t), KM_SLEEP);
 462         for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
 463                 mutex_init(&ipst->ips_ipcl_iptun_fanout[i].connf_lock, NULL,
 464                     MUTEX_DEFAULT, NULL);
 465         }
 466 
 467         ipst->ips_ipcl_raw_fanout = kmem_zalloc(
 468             ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP);
 469         for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
 470                 mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL,
 471                     MUTEX_DEFAULT, NULL);
 472         }
 473 
 474         ipst->ips_ipcl_globalhash_fanout = kmem_zalloc(
 475             sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP);
 476         for (i = 0; i < CONN_G_HASH_SIZE; i++) {
 477                 mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock,
 478                     NULL, MUTEX_DEFAULT, NULL);
 479         }
 480 }
 481 
 482 void
 483 ipcl_g_destroy(void)
 484 {
 485         kmem_cache_destroy(ip_conn_cache);
 486         kmem_cache_destroy(tcp_conn_cache);
 487         kmem_cache_destroy(udp_conn_cache);
 488         kmem_cache_destroy(rawip_conn_cache);
 489         kmem_cache_destroy(rts_conn_cache);
 490 }
 491 
 492 /*
 493  * All user-level and kernel use of the stack must be gone
 494  * by now.
 495  */
 496 void
 497 ipcl_destroy(ip_stack_t *ipst)
 498 {
 499         int i;
 500 
 501         for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
 502                 ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL);
 503                 mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock);
 504         }
 505         kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size *
 506             sizeof (connf_t));
 507         ipst->ips_ipcl_conn_fanout = NULL;
 508 
 509         for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
 510                 ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL);
 511                 mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock);
 512         }
 513         kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size *
 514             sizeof (connf_t));
 515         ipst->ips_ipcl_bind_fanout = NULL;
 516 
 517         for (i = 0; i < IPPROTO_MAX; i++) {
 518                 ASSERT(ipst->ips_ipcl_proto_fanout_v4[i].connf_head == NULL);
 519                 mutex_destroy(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock);
 520         }
 521         kmem_free(ipst->ips_ipcl_proto_fanout_v4,
 522             IPPROTO_MAX * sizeof (connf_t));
 523         ipst->ips_ipcl_proto_fanout_v4 = NULL;
 524 
 525         for (i = 0; i < IPPROTO_MAX; i++) {
 526                 ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head == NULL);
 527                 mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock);
 528         }
 529         kmem_free(ipst->ips_ipcl_proto_fanout_v6,
 530             IPPROTO_MAX * sizeof (connf_t));
 531         ipst->ips_ipcl_proto_fanout_v6 = NULL;
 532 
 533         for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
 534                 ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL);
 535                 mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock);
 536         }
 537         kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size *
 538             sizeof (connf_t));
 539         ipst->ips_ipcl_udp_fanout = NULL;
 540 
 541         for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
 542                 ASSERT(ipst->ips_ipcl_iptun_fanout[i].connf_head == NULL);
 543                 mutex_destroy(&ipst->ips_ipcl_iptun_fanout[i].connf_lock);
 544         }
 545         kmem_free(ipst->ips_ipcl_iptun_fanout,
 546             ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t));
 547         ipst->ips_ipcl_iptun_fanout = NULL;
 548 
 549         for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
 550                 ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL);
 551                 mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock);
 552         }
 553         kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size *
 554             sizeof (connf_t));
 555         ipst->ips_ipcl_raw_fanout = NULL;
 556 
 557         for (i = 0; i < CONN_G_HASH_SIZE; i++) {
 558                 ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL);
 559                 mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
 560         }
 561         kmem_free(ipst->ips_ipcl_globalhash_fanout,
 562             sizeof (connf_t) * CONN_G_HASH_SIZE);
 563         ipst->ips_ipcl_globalhash_fanout = NULL;
 564 
 565         ASSERT(ipst->ips_rts_clients->connf_head == NULL);
 566         mutex_destroy(&ipst->ips_rts_clients->connf_lock);
 567         kmem_free(ipst->ips_rts_clients, sizeof (connf_t));
 568         ipst->ips_rts_clients = NULL;
 569 }
 570 
 571 /*
 572  * conn creation routine. initialize the conn, sets the reference
 573  * and inserts it in the global hash table.
 574  */
 575 conn_t *
 576 ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns)
 577 {
 578         conn_t  *connp;
 579         struct kmem_cache *conn_cache;
 580 
 581         switch (type) {
 582         case IPCL_SCTPCONN:
 583                 if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL)
 584                         return (NULL);
 585                 sctp_conn_init(connp);
 586                 netstack_hold(ns);
 587                 connp->conn_netstack = ns;
 588                 connp->conn_ixa->ixa_ipst = ns->netstack_ip;
 589                 connp->conn_ixa->ixa_conn_id = (long)connp;
 590                 ipcl_globalhash_insert(connp);
 591                 return (connp);
 592 
 593         case IPCL_TCPCONN:
 594                 conn_cache = tcp_conn_cache;
 595                 break;
 596 
 597         case IPCL_UDPCONN:
 598                 conn_cache = udp_conn_cache;
 599                 break;
 600 
 601         case IPCL_RAWIPCONN:
 602                 conn_cache = rawip_conn_cache;
 603                 break;
 604 
 605         case IPCL_RTSCONN:
 606                 conn_cache = rts_conn_cache;
 607                 break;
 608 
 609         case IPCL_IPCCONN:
 610                 conn_cache = ip_conn_cache;
 611                 break;
 612 
 613         default:
 614                 connp = NULL;
 615                 ASSERT(0);
 616         }
 617 
 618         if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL)
 619                 return (NULL);
 620 
 621         connp->conn_ref = 1;
 622         netstack_hold(ns);
 623         connp->conn_netstack = ns;
 624         connp->conn_ixa->ixa_ipst = ns->netstack_ip;
 625         connp->conn_ixa->ixa_conn_id = (long)connp;
 626         ipcl_globalhash_insert(connp);
 627         return (connp);
 628 }
 629 
     /*
      * Final teardown of a conn_t.  The caller must not hold conn_lock and
      * both conn_ref and conn_ioctlref must already be zero (asserted below).
      *
      * Releases the credential, the header template, the transmit ip_pkt
      * state and the IPsec latch/policy references, removes the conn from the
      * global hash and drops the netstack hold.  The conn_t is then returned
      * to the kmem cache that matches its type flag, with conn_flags reset to
      * just that type flag; SCTP conns are handed to sctp_free() instead.
      */
 630 void
 631 ipcl_conn_destroy(conn_t *connp)
 632 {
 633         mblk_t  *mp;
 634         netstack_t      *ns = connp->conn_netstack;
 635 
 636         ASSERT(!MUTEX_HELD(&connp->conn_lock));
 637         ASSERT(connp->conn_ref == 0);
 638         ASSERT(connp->conn_ioctlref == 0);
 639 
 640         DTRACE_PROBE1(conn__destroy, conn_t *, connp);
 641 
 642         if (connp->conn_cred != NULL) {
 643                 crfree(connp->conn_cred);
 644                 connp->conn_cred = NULL;
 645                 /* ixa_cred done in ipcl_conn_cleanup below */
 646         }
 647 
             /* Free the header template and reset everything describing it. */
 648         if (connp->conn_ht_iphc != NULL) {
 649                 kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated);
 650                 connp->conn_ht_iphc = NULL;
 651                 connp->conn_ht_iphc_allocated = 0;
 652                 connp->conn_ht_iphc_len = 0;
 653                 connp->conn_ht_ulp = NULL;
 654                 connp->conn_ht_ulp_len = 0;
 655         }
 656         ip_pkt_free(&connp->conn_xmit_ipp);
 657 
 658         ipcl_globalhash_remove(connp);
 659 
             /* Drop whatever IPsec references the conn still holds. */
 660         if (connp->conn_latch != NULL) {
 661                 IPLATCH_REFRELE(connp->conn_latch);
 662                 connp->conn_latch = NULL;
 663         }
 664         if (connp->conn_latch_in_policy != NULL) {
 665                 IPPOL_REFRELE(connp->conn_latch_in_policy);
 666                 connp->conn_latch_in_policy = NULL;
 667         }
 668         if (connp->conn_latch_in_action != NULL) {
 669                 IPACT_REFRELE(connp->conn_latch_in_action);
 670                 connp->conn_latch_in_action = NULL;
 671         }
 672         if (connp->conn_policy != NULL) {
 673                 IPPH_REFRELE(connp->conn_policy, ns);
 674                 connp->conn_policy = NULL;
 675         }
 676 
 677         if (connp->conn_ipsec_opt_mp != NULL) {
 678                 freemsg(connp->conn_ipsec_opt_mp);
 679                 connp->conn_ipsec_opt_mp = NULL;
 680         }
 681 
 682         if (connp->conn_flags & IPCL_TCPCONN) {
 683                 tcp_t *tcp = connp->conn_tcp;
 684 
 685                 tcp_free(tcp);
                     /*
                      * Remember the timer cache so it can be put back after
                      * the tcp_t is zeroed further down.
                      */
 686                 mp = tcp->tcp_timercache;
 687 
 688                 tcp->tcp_tcps = NULL;
 689 
 690                 /*
 691                  * tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate
 692                  * the mblk.
 693                  */
 694                 if (tcp->tcp_rsrv_mp != NULL) {
 695                         freeb(tcp->tcp_rsrv_mp);
 696                         tcp->tcp_rsrv_mp = NULL;
 697                         mutex_destroy(&tcp->tcp_rsrv_mp_lock);
 698                 }
 699 
 700                 ipcl_conn_cleanup(connp);
 701                 connp->conn_flags = IPCL_TCPCONN;
 702                 if (ns != NULL) {
 703                         ASSERT(tcp->tcp_tcps == NULL);
 704                         connp->conn_netstack = NULL;
 705                         connp->conn_ixa->ixa_ipst = NULL;
 706                         netstack_rele(ns);
 707                 }
 708 
                     /*
                      * Wipe the tcp_t, then restore the only two fields that
                      * are kept while the object sits in the cache.
                      */
 709                 bzero(tcp, sizeof (tcp_t));
 710 
 711                 tcp->tcp_timercache = mp;
 712                 tcp->tcp_connp = connp;
 713                 kmem_cache_free(tcp_conn_cache, connp);
 714                 return;
 715         }
 716 
 717         if (connp->conn_flags & IPCL_SCTPCONN) {
 718                 ASSERT(ns != NULL);
                     /* Nothing else is done here for SCTP after sctp_free(). */
 719                 sctp_free(connp);
 720                 return;
 721         }
 722 
 723         ipcl_conn_cleanup(connp);
 724         if (ns != NULL) {
 725                 connp->conn_netstack = NULL;
 726                 connp->conn_ixa->ixa_ipst = NULL;
 727                 netstack_rele(ns);
 728         }
 729 
 730         /* leave conn_priv aka conn_udp, conn_icmp, etc in place. */
 731         if (connp->conn_flags & IPCL_UDPCONN) {
 732                 connp->conn_flags = IPCL_UDPCONN;
 733                 kmem_cache_free(udp_conn_cache, connp);
 734         } else if (connp->conn_flags & IPCL_RAWIPCONN) {
 735                 connp->conn_flags = IPCL_RAWIPCONN;
                     /*
                      * The protocol is reset to ICMP before the conn goes
                      * back to the cache; presumably this matches the state
                      * the rawip constructor sets up - TODO confirm.
                      */
 736                 connp->conn_proto = IPPROTO_ICMP;
 737                 connp->conn_ixa->ixa_protocol = connp->conn_proto;
 738                 kmem_cache_free(rawip_conn_cache, connp);
 739         } else if (connp->conn_flags & IPCL_RTSCONN) {
 740                 connp->conn_flags = IPCL_RTSCONN;
 741                 kmem_cache_free(rts_conn_cache, connp);
 742         } else {
 743                 connp->conn_flags = IPCL_IPCCONN;
                     /*
                      * NOTE(review): this assertion follows the assignment
                      * above and therefore can never fire; if it is meant to
                      * validate the conn type it would have to precede it.
                      */
 744                 ASSERT(connp->conn_flags & IPCL_IPCCONN);
 745                 ASSERT(connp->conn_priv == NULL);
 746                 kmem_cache_free(ip_conn_cache, connp);
 747         }
 748 }
 749 
 750 /*
 751  * Running in cluster mode - deregister listener information
 752  */
 753 static void
 754 ipcl_conn_unlisten(conn_t *connp)
 755 {
 756         ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0);
 757         ASSERT(connp->conn_lport != 0);
 758 
 759         if (cl_inet_unlisten != NULL) {
 760                 sa_family_t     addr_family;
 761                 uint8_t         *laddrp;
 762 
 763                 if (connp->conn_ipversion == IPV6_VERSION) {
 764                         addr_family = AF_INET6;
 765                         laddrp = (uint8_t *)&connp->conn_bound_addr_v6;
 766                 } else {
 767                         addr_family = AF_INET;
 768                         laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
 769                 }
 770                 (*cl_inet_unlisten)(connp->conn_netstack->netstack_stackid,
 771                     IPPROTO_TCP, addr_family, laddrp, connp->conn_lport, NULL);
 772         }
 773         connp->conn_flags &= ~IPCL_CL_LISTENER;
 774 }
 775 
 776 /*
 777  * We set the IPCL_REMOVED flag (instead of clearing the flag indicating
 778  * which table the conn belonged to). So for debugging we can see which hash
 779  * table this connection was in.
 780  */
 781 #define IPCL_HASH_REMOVE(connp) {                                       \
 782         connf_t *connfp = (connp)->conn_fanout;                              \
 783         ASSERT(!MUTEX_HELD(&((connp)->conn_lock)));                      \
 784         if (connfp != NULL) {                                           \
 785                 mutex_enter(&connfp->connf_lock);                        \
 786                 if ((connp)->conn_next != NULL)                              \
 787                         (connp)->conn_next->conn_prev =                   \
 788                             (connp)->conn_prev;                              \
 789                 if ((connp)->conn_prev != NULL)                              \
 790                         (connp)->conn_prev->conn_next =                   \
 791                             (connp)->conn_next;                              \
 792                 else                                                    \
 793                         connfp->connf_head = (connp)->conn_next;  \
 794                 (connp)->conn_fanout = NULL;                         \
 795                 (connp)->conn_next = NULL;                           \
 796                 (connp)->conn_prev = NULL;                           \
 797                 (connp)->conn_flags |= IPCL_REMOVED;                 \
 798                 if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0)       \
 799                         ipcl_conn_unlisten((connp));                    \
 800                 CONN_DEC_REF((connp));                                  \
 801                 mutex_exit(&connfp->connf_lock);                 \
 802         }                                                               \
 803 }
 804 
 805 void
 806 ipcl_hash_remove(conn_t *connp)
 807 {
 808         uint8_t         protocol = connp->conn_proto;
 809 
 810         IPCL_HASH_REMOVE(connp);
 811         if (protocol == IPPROTO_RSVP)
 812                 ill_set_inputfn_all(connp->conn_netstack->netstack_ip);
 813 }
 814 
 815 /*
 816  * The whole purpose of this function is allow removal of
 817  * a conn_t from the connected hash for timewait reclaim.
 818  * This is essentially a TW reclaim fastpath where timewait
 819  * collector checks under fanout lock (so no one else can
 820  * get access to the conn_t) that refcnt is 2 i.e. one for
 821  * TCP and one for the classifier hash list. If ref count
 822  * is indeed 2, we can just remove the conn under lock and
 823  * avoid cleaning up the conn under squeue. This gives us
 824  * improved performance.
 825  */
 826 void
 827 ipcl_hash_remove_locked(conn_t *connp, connf_t  *connfp)
 828 {
 829         ASSERT(MUTEX_HELD(&connfp->connf_lock));
 830         ASSERT(MUTEX_HELD(&connp->conn_lock));
 831         ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0);
 832 
 833         if ((connp)->conn_next != NULL) {
 834                 (connp)->conn_next->conn_prev = (connp)->conn_prev;
 835         }
 836         if ((connp)->conn_prev != NULL) {
 837                 (connp)->conn_prev->conn_next = (connp)->conn_next;
 838         } else {
 839                 connfp->connf_head = (connp)->conn_next;
 840         }
 841         (connp)->conn_fanout = NULL;
 842         (connp)->conn_next = NULL;
 843         (connp)->conn_prev = NULL;
 844         (connp)->conn_flags |= IPCL_REMOVED;
 845         ASSERT((connp)->conn_ref == 2);
 846         (connp)->conn_ref--;
 847 }
 848 
 849 #define IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) {              \
 850         ASSERT((connp)->conn_fanout == NULL);                                \
 851         ASSERT((connp)->conn_next == NULL);                          \
 852         ASSERT((connp)->conn_prev == NULL);                          \
 853         if ((connfp)->connf_head != NULL) {                          \
 854                 (connfp)->connf_head->conn_prev = (connp);                \
 855                 (connp)->conn_next = (connfp)->connf_head;                \
 856         }                                                               \
 857         (connp)->conn_fanout = (connfp);                             \
 858         (connfp)->connf_head = (connp);                                      \
 859         (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \
 860             IPCL_CONNECTED;                                             \
 861         CONN_INC_REF(connp);                                            \
 862 }
 863 
 864 #define IPCL_HASH_INSERT_CONNECTED(connfp, connp) {                     \
 865         IPCL_HASH_REMOVE((connp));                                      \
 866         mutex_enter(&(connfp)->connf_lock);                              \
 867         IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);               \
 868         mutex_exit(&(connfp)->connf_lock);                               \
 869 }
 870 
 871 #define IPCL_HASH_INSERT_BOUND(connfp, connp) {                         \
 872         conn_t *pconnp = NULL, *nconnp;                                 \
 873         IPCL_HASH_REMOVE((connp));                                      \
 874         mutex_enter(&(connfp)->connf_lock);                              \
 875         nconnp = (connfp)->connf_head;                                       \
 876         while (nconnp != NULL &&                                        \
 877             !_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6)) {            \
 878                 pconnp = nconnp;                                        \
 879                 nconnp = nconnp->conn_next;                          \
 880         }                                                               \
 881         if (pconnp != NULL) {                                           \
 882                 pconnp->conn_next = (connp);                         \
 883                 (connp)->conn_prev = pconnp;                         \
 884         } else {                                                        \
 885                 (connfp)->connf_head = (connp);                              \
 886         }                                                               \
 887         if (nconnp != NULL) {                                           \
 888                 (connp)->conn_next = nconnp;                         \
 889                 nconnp->conn_prev = (connp);                         \
 890         }                                                               \
 891         (connp)->conn_fanout = (connfp);                             \
 892         (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \
 893             IPCL_BOUND;                                                 \
 894         CONN_INC_REF(connp);                                            \
 895         mutex_exit(&(connfp)->connf_lock);                               \
 896 }
 897 
 898 #define IPCL_HASH_INSERT_WILDCARD(connfp, connp) {                      \
 899         conn_t **list, *prev, *next;                                    \
 900         boolean_t isv4mapped =                                          \
 901             IN6_IS_ADDR_V4MAPPED(&(connp)->conn_laddr_v6);               \
 902         IPCL_HASH_REMOVE((connp));                                      \
 903         mutex_enter(&(connfp)->connf_lock);                              \
 904         list = &(connfp)->connf_head;                                    \
 905         prev = NULL;                                                    \
 906         while ((next = *list) != NULL) {                                \
 907                 if (isv4mapped &&                                       \
 908                     IN6_IS_ADDR_UNSPECIFIED(&next->conn_laddr_v6) &&     \
 909                     connp->conn_zoneid == next->conn_zoneid) {            \
 910                         (connp)->conn_next = next;                   \
 911                         if (prev != NULL)                               \
 912                                 prev = next->conn_prev;                      \
 913                         next->conn_prev = (connp);                   \
 914                         break;                                          \
 915                 }                                                       \
 916                 list = &next->conn_next;                         \
 917                 prev = next;                                            \
 918         }                                                               \
 919         (connp)->conn_prev = prev;                                   \
 920         *list = (connp);                                                \
 921         (connp)->conn_fanout = (connfp);                             \
 922         (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \
 923             IPCL_BOUND;                                                 \
 924         CONN_INC_REF((connp));                                          \
 925         mutex_exit(&(connfp)->connf_lock);                               \
 926 }
 927 
 928 void
 929 ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
 930 {
 931         IPCL_HASH_INSERT_WILDCARD(connfp, connp);
 932 }
 933 
 934 /*
 935  * Because the classifier is used to classify inbound packets, the destination
 936  * address is meant to be our local tunnel address (tunnel source), and the
 937  * source the remote tunnel address (tunnel destination).
 938  *
 939  * Note that conn_proto can't be used for fanout since the upper protocol
 940  * can be both 41 and 4 when IPv6 and IPv4 are over the same tunnel.
 941  */
 942 conn_t *
 943 ipcl_iptun_classify_v4(ipaddr_t *src, ipaddr_t *dst, ip_stack_t *ipst)
 944 {
 945         connf_t *connfp;
 946         conn_t  *connp;
 947 
 948         /* first look for IPv4 tunnel links */
 949         connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst, *src)];
 950         mutex_enter(&connfp->connf_lock);
 951         for (connp = connfp->connf_head; connp != NULL;
 952             connp = connp->conn_next) {
 953                 if (IPCL_IPTUN_MATCH(connp, *dst, *src))
 954                         break;
 955         }
 956         if (connp != NULL)
 957                 goto done;
 958 
 959         mutex_exit(&connfp->connf_lock);
 960 
 961         /* We didn't find an IPv4 tunnel, try a 6to4 tunnel */
 962         connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst,
 963             INADDR_ANY)];
 964         mutex_enter(&connfp->connf_lock);
 965         for (connp = connfp->connf_head; connp != NULL;
 966             connp = connp->conn_next) {
 967                 if (IPCL_IPTUN_MATCH(connp, *dst, INADDR_ANY))
 968                         break;
 969         }
 970 done:
 971         if (connp != NULL)
 972                 CONN_INC_REF(connp);
 973         mutex_exit(&connfp->connf_lock);
 974         return (connp);
 975 }
 976 
 977 conn_t *
 978 ipcl_iptun_classify_v6(in6_addr_t *src, in6_addr_t *dst, ip_stack_t *ipst)
 979 {
 980         connf_t *connfp;
 981         conn_t  *connp;
 982 
 983         /* Look for an IPv6 tunnel link */
 984         connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(dst, src)];
 985         mutex_enter(&connfp->connf_lock);
 986         for (connp = connfp->connf_head; connp != NULL;
 987             connp = connp->conn_next) {
 988                 if (IPCL_IPTUN_MATCH_V6(connp, dst, src)) {
 989                         CONN_INC_REF(connp);
 990                         break;
 991                 }
 992         }
 993         mutex_exit(&connfp->connf_lock);
 994         return (connp);
 995 }
 996 
 997 /*
 998  * This function is used only for inserting SCTP raw socket now.
 999  * This may change later.
1000  *
1001  * Note that only one raw socket can be bound to a port.  The param
1002  * lport is in network byte order.
1003  */
1004 static int
1005 ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
1006 {
1007         connf_t *connfp;
1008         conn_t  *oconnp;
1009         ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
1010 
1011         connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
1012 
1013         /* Check for existing raw socket already bound to the port. */
1014         mutex_enter(&connfp->connf_lock);
1015         for (oconnp = connfp->connf_head; oconnp != NULL;
1016             oconnp = oconnp->conn_next) {
1017                 if (oconnp->conn_lport == lport &&
1018                     oconnp->conn_zoneid == connp->conn_zoneid &&
1019                     oconnp->conn_family == connp->conn_family &&
1020                     ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
1021                     IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_laddr_v6) ||
1022                     IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6) ||
1023                     IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_laddr_v6)) ||
1024                     IN6_ARE_ADDR_EQUAL(&oconnp->conn_laddr_v6,
1025                     &connp->conn_laddr_v6))) {
1026                         break;
1027                 }
1028         }
1029         mutex_exit(&connfp->connf_lock);
1030         if (oconnp != NULL)
1031                 return (EADDRNOTAVAIL);
1032 
1033         if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) ||
1034             IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1035                 if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
1036                     IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) {
1037                         IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1038                 } else {
1039                         IPCL_HASH_INSERT_BOUND(connfp, connp);
1040                 }
1041         } else {
1042                 IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1043         }
1044         return (0);
1045 }
1046 
1047 static int
1048 ipcl_iptun_hash_insert(conn_t *connp, ip_stack_t *ipst)
1049 {
1050         connf_t *connfp;
1051         conn_t  *tconnp;
1052         ipaddr_t laddr = connp->conn_laddr_v4;
1053         ipaddr_t faddr = connp->conn_faddr_v4;
1054 
1055         connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(laddr, faddr)];
1056         mutex_enter(&connfp->connf_lock);
1057         for (tconnp = connfp->connf_head; tconnp != NULL;
1058             tconnp = tconnp->conn_next) {
1059                 if (IPCL_IPTUN_MATCH(tconnp, laddr, faddr)) {
1060                         /* A tunnel is already bound to these addresses. */
1061                         mutex_exit(&connfp->connf_lock);
1062                         return (EADDRINUSE);
1063                 }
1064         }
1065         IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1066         mutex_exit(&connfp->connf_lock);
1067         return (0);
1068 }
1069 
1070 static int
1071 ipcl_iptun_hash_insert_v6(conn_t *connp, ip_stack_t *ipst)
1072 {
1073         connf_t *connfp;
1074         conn_t  *tconnp;
1075         in6_addr_t *laddr = &connp->conn_laddr_v6;
1076         in6_addr_t *faddr = &connp->conn_faddr_v6;
1077 
1078         connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(laddr, faddr)];
1079         mutex_enter(&connfp->connf_lock);
1080         for (tconnp = connfp->connf_head; tconnp != NULL;
1081             tconnp = tconnp->conn_next) {
1082                 if (IPCL_IPTUN_MATCH_V6(tconnp, laddr, faddr)) {
1083                         /* A tunnel is already bound to these addresses. */
1084                         mutex_exit(&connfp->connf_lock);
1085                         return (EADDRINUSE);
1086                 }
1087         }
1088         IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1089         mutex_exit(&connfp->connf_lock);
1090         return (0);
1091 }
1092 
1093 /*
1094  * Check for a MAC exemption conflict on a labeled system.  Note that for
1095  * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the
1096  * transport layer.  This check is for binding all other protocols.
1097  *
1098  * Returns true if there's a conflict.
1099  */
1100 static boolean_t
1101 check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst)
1102 {
1103         connf_t *connfp;
1104         conn_t *tconn;
1105 
1106         connfp = &ipst->ips_ipcl_proto_fanout_v4[connp->conn_proto];
1107         mutex_enter(&connfp->connf_lock);
1108         for (tconn = connfp->connf_head; tconn != NULL;
1109             tconn = tconn->conn_next) {
1110                 /* We don't allow v4 fallback for v6 raw socket */
1111                 if (connp->conn_family != tconn->conn_family)
1112                         continue;
1113                 /* If neither is exempt, then there's no conflict */
1114                 if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
1115                     (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
1116                         continue;
1117                 /* We are only concerned about sockets for a different zone */
1118                 if (connp->conn_zoneid == tconn->conn_zoneid)
1119                         continue;
1120                 /* If both are bound to different specific addrs, ok */
1121                 if (connp->conn_laddr_v4 != INADDR_ANY &&
1122                     tconn->conn_laddr_v4 != INADDR_ANY &&
1123                     connp->conn_laddr_v4 != tconn->conn_laddr_v4)
1124                         continue;
1125                 /* These two conflict; fail */
1126                 break;
1127         }
1128         mutex_exit(&connfp->connf_lock);
1129         return (tconn != NULL);
1130 }
1131 
1132 static boolean_t
1133 check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst)
1134 {
1135         connf_t *connfp;
1136         conn_t *tconn;
1137 
1138         connfp = &ipst->ips_ipcl_proto_fanout_v6[connp->conn_proto];
1139         mutex_enter(&connfp->connf_lock);
1140         for (tconn = connfp->connf_head; tconn != NULL;
1141             tconn = tconn->conn_next) {
1142                 /* We don't allow v4 fallback for v6 raw socket */
1143                 if (connp->conn_family != tconn->conn_family)
1144                         continue;
1145                 /* If neither is exempt, then there's no conflict */
1146                 if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
1147                     (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
1148                         continue;
1149                 /* We are only concerned about sockets for a different zone */
1150                 if (connp->conn_zoneid == tconn->conn_zoneid)
1151                         continue;
1152                 /* If both are bound to different addrs, ok */
1153                 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) &&
1154                     !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_laddr_v6) &&
1155                     !IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6,
1156                     &tconn->conn_laddr_v6))
1157                         continue;
1158                 /* These two conflict; fail */
1159                 break;
1160         }
1161         mutex_exit(&connfp->connf_lock);
1162         return (tconn != NULL);
1163 }
1164 
1165 /*
1166  * (v4, v6) bind hash insertion routines
1167  * The caller has already setup the conn (conn_proto, conn_laddr_v6, conn_lport)
1168  */
1169 
1170 int
1171 ipcl_bind_insert(conn_t *connp)
1172 {
1173         if (connp->conn_ipversion == IPV6_VERSION)
1174                 return (ipcl_bind_insert_v6(connp));
1175         else
1176                 return (ipcl_bind_insert_v4(connp));
1177 }
1178 
1179 int
1180 ipcl_bind_insert_v4(conn_t *connp)
1181 {
1182         connf_t *connfp;
1183         int     ret = 0;
1184         ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
1185         uint16_t        lport = connp->conn_lport;
1186         uint8_t         protocol = connp->conn_proto;
1187 
1188         if (IPCL_IS_IPTUN(connp))
1189                 return (ipcl_iptun_hash_insert(connp, ipst));
1190 
1191         switch (protocol) {
1192         default:
1193                 if (is_system_labeled() &&
1194                     check_exempt_conflict_v4(connp, ipst))
1195                         return (EADDRINUSE);
1196                 /* FALLTHROUGH */
1197         case IPPROTO_UDP:
1198                 if (protocol == IPPROTO_UDP) {
1199                         connfp = &ipst->ips_ipcl_udp_fanout[
1200                             IPCL_UDP_HASH(lport, ipst)];
1201                 } else {
1202                         connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
1203                 }
1204 
1205                 if (connp->conn_faddr_v4 != INADDR_ANY) {
1206                         IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1207                 } else if (connp->conn_laddr_v4 != INADDR_ANY) {
1208                         IPCL_HASH_INSERT_BOUND(connfp, connp);
1209                 } else {
1210                         IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1211                 }
1212                 if (protocol == IPPROTO_RSVP)
1213                         ill_set_inputfn_all(ipst);
1214                 break;
1215 
1216         case IPPROTO_TCP:
1217                 /* Insert it in the Bind Hash */
1218                 ASSERT(connp->conn_zoneid != ALL_ZONES);
1219                 connfp = &ipst->ips_ipcl_bind_fanout[
1220                     IPCL_BIND_HASH(lport, ipst)];
1221                 if (connp->conn_laddr_v4 != INADDR_ANY) {
1222                         IPCL_HASH_INSERT_BOUND(connfp, connp);
1223                 } else {
1224                         IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1225                 }
1226                 if (cl_inet_listen != NULL) {
1227                         ASSERT(connp->conn_ipversion == IPV4_VERSION);
1228                         connp->conn_flags |= IPCL_CL_LISTENER;
1229                         (*cl_inet_listen)(
1230                             connp->conn_netstack->netstack_stackid,
1231                             IPPROTO_TCP, AF_INET,
1232                             (uint8_t *)&connp->conn_bound_addr_v4, lport, NULL);
1233                 }
1234                 break;
1235 
1236         case IPPROTO_SCTP:
1237                 ret = ipcl_sctp_hash_insert(connp, lport);
1238                 break;
1239         }
1240 
1241         return (ret);
1242 }
1243 
1244 int
1245 ipcl_bind_insert_v6(conn_t *connp)
1246 {
1247         connf_t         *connfp;
1248         int             ret = 0;
1249         ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
1250         uint16_t        lport = connp->conn_lport;
1251         uint8_t         protocol = connp->conn_proto;
1252 
1253         if (IPCL_IS_IPTUN(connp)) {
1254                 return (ipcl_iptun_hash_insert_v6(connp, ipst));
1255         }
1256 
1257         switch (protocol) {
1258         default:
1259                 if (is_system_labeled() &&
1260                     check_exempt_conflict_v6(connp, ipst))
1261                         return (EADDRINUSE);
1262                 /* FALLTHROUGH */
1263         case IPPROTO_UDP:
1264                 if (protocol == IPPROTO_UDP) {
1265                         connfp = &ipst->ips_ipcl_udp_fanout[
1266                             IPCL_UDP_HASH(lport, ipst)];
1267                 } else {
1268                         connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
1269                 }
1270 
1271                 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
1272                         IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1273                 } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
1274                         IPCL_HASH_INSERT_BOUND(connfp, connp);
1275                 } else {
1276                         IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1277                 }
1278                 break;
1279 
1280         case IPPROTO_TCP:
1281                 /* Insert it in the Bind Hash */
1282                 ASSERT(connp->conn_zoneid != ALL_ZONES);
1283                 connfp = &ipst->ips_ipcl_bind_fanout[
1284                     IPCL_BIND_HASH(lport, ipst)];
1285                 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
1286                         IPCL_HASH_INSERT_BOUND(connfp, connp);
1287                 } else {
1288                         IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1289                 }
1290                 if (cl_inet_listen != NULL) {
1291                         sa_family_t     addr_family;
1292                         uint8_t         *laddrp;
1293 
1294                         if (connp->conn_ipversion == IPV6_VERSION) {
1295                                 addr_family = AF_INET6;
1296                                 laddrp =
1297                                     (uint8_t *)&connp->conn_bound_addr_v6;
1298                         } else {
1299                                 addr_family = AF_INET;
1300                                 laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
1301                         }
1302                         connp->conn_flags |= IPCL_CL_LISTENER;
1303                         (*cl_inet_listen)(
1304                             connp->conn_netstack->netstack_stackid,
1305                             IPPROTO_TCP, addr_family, laddrp, lport, NULL);
1306                 }
1307                 break;
1308 
1309         case IPPROTO_SCTP:
1310                 ret = ipcl_sctp_hash_insert(connp, lport);
1311                 break;
1312         }
1313 
1314         return (ret);
1315 }
1316 
1317 /*
1318  * ipcl_conn_hash insertion routines.
1319  * The caller has already set conn_proto and the addresses/ports in the conn_t.
1320  */
1321 
1322 int
1323 ipcl_conn_insert(conn_t *connp)
1324 {
1325         if (connp->conn_ipversion == IPV6_VERSION)
1326                 return (ipcl_conn_insert_v6(connp));
1327         else
1328                 return (ipcl_conn_insert_v4(connp));
1329 }
1330 
1331 int
1332 ipcl_conn_insert_v4(conn_t *connp)
1333 {
1334         connf_t         *connfp;
1335         conn_t          *tconnp;
1336         int             ret = 0;
1337         ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
1338         uint16_t        lport = connp->conn_lport;
1339         uint8_t         protocol = connp->conn_proto;
1340 
1341         if (IPCL_IS_IPTUN(connp))
1342                 return (ipcl_iptun_hash_insert(connp, ipst));
1343 
1344         switch (protocol) {
1345         case IPPROTO_TCP:
1346                 /*
1347                  * For TCP, we check whether the connection tuple already
1348                  * exists before allowing the connection to proceed.  We
1349                  * also allow indexing on the zoneid. This is to allow
1350                  * multiple shared stack zones to have the same tcp
1351                  * connection tuple. In practice this only happens for
1352                  * INADDR_LOOPBACK as it's the only local address which
1353                  * doesn't have to be unique.
1354                  */
1355                 connfp = &ipst->ips_ipcl_conn_fanout[
1356                     IPCL_CONN_HASH(connp->conn_faddr_v4,
1357                     connp->conn_ports, ipst)];
1358                 mutex_enter(&connfp->connf_lock);
1359                 for (tconnp = connfp->connf_head; tconnp != NULL;
1360                     tconnp = tconnp->conn_next) {
1361                         if (IPCL_CONN_MATCH(tconnp, connp->conn_proto,
1362                             connp->conn_faddr_v4, connp->conn_laddr_v4,
1363                             connp->conn_ports) &&
1364                             IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
1365                                 /* Already have a conn. bail out */
1366                                 mutex_exit(&connfp->connf_lock);
1367                                 return (EADDRINUSE);
1368                         }
1369                 }
1370                 if (connp->conn_fanout != NULL) {
1371                         /*
1372                          * Probably a XTI/TLI application trying to do a
1373                          * rebind. Let it happen.
1374                          */
1375                         mutex_exit(&connfp->connf_lock);
1376                         IPCL_HASH_REMOVE(connp);
1377                         mutex_enter(&connfp->connf_lock);
1378                 }
1379 
1380                 ASSERT(connp->conn_recv != NULL);
1381                 ASSERT(connp->conn_recvicmp != NULL);
1382 
1383                 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1384                 mutex_exit(&connfp->connf_lock);
1385                 break;
1386 
1387         case IPPROTO_SCTP:
1388                 /*
1389                  * The raw socket may have already been bound, remove it
1390                  * from the hash first.
1391                  */
1392                 IPCL_HASH_REMOVE(connp);
1393                 ret = ipcl_sctp_hash_insert(connp, lport);
1394                 break;
1395 
1396         default:
1397                 /*
1398                  * Check for conflicts among MAC exempt bindings.  For
1399                  * transports with port numbers, this is done by the upper
1400                  * level per-transport binding logic.  For all others, it's
1401                  * done here.
1402                  */
1403                 if (is_system_labeled() &&
1404                     check_exempt_conflict_v4(connp, ipst))
1405                         return (EADDRINUSE);
1406                 /* FALLTHROUGH */
1407 
1408         case IPPROTO_UDP:
1409                 if (protocol == IPPROTO_UDP) {
1410                         connfp = &ipst->ips_ipcl_udp_fanout[
1411                             IPCL_UDP_HASH(lport, ipst)];
1412                 } else {
1413                         connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
1414                 }
1415 
1416                 if (connp->conn_faddr_v4 != INADDR_ANY) {
1417                         IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1418                 } else if (connp->conn_laddr_v4 != INADDR_ANY) {
1419                         IPCL_HASH_INSERT_BOUND(connfp, connp);
1420                 } else {
1421                         IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1422                 }
1423                 break;
1424         }
1425 
1426         return (ret);
1427 }
1428 
1429 int
1430 ipcl_conn_insert_v6(conn_t *connp)
1431 {
1432         connf_t         *connfp;
1433         conn_t          *tconnp;
1434         int             ret = 0;
1435         ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
1436         uint16_t        lport = connp->conn_lport;
1437         uint8_t         protocol = connp->conn_proto;
1438         uint_t          ifindex = connp->conn_bound_if;
1439 
1440         if (IPCL_IS_IPTUN(connp))
1441                 return (ipcl_iptun_hash_insert_v6(connp, ipst));
1442 
1443         switch (protocol) {
1444         case IPPROTO_TCP:
1445 
1446                 /*
1447                  * For tcp, we check whether the connection tuple already
1448                  * exists before allowing the connection to proceed.  We
1449                  * also allow indexing on the zoneid. This is to allow
1450                  * multiple shared stack zones to have the same tcp
1451                  * connection tuple. In practice this only happens for
1452                  * ipv6_loopback as it's the only local address which
1453                  * doesn't have to be unique.
1454                  */
1455                 connfp = &ipst->ips_ipcl_conn_fanout[
1456                     IPCL_CONN_HASH_V6(connp->conn_faddr_v6, connp->conn_ports,
1457                     ipst)];
1458                 mutex_enter(&connfp->connf_lock);
1459                 for (tconnp = connfp->connf_head; tconnp != NULL;
1460                     tconnp = tconnp->conn_next) {
1461                         /* NOTE: need to match zoneid. Bug in onnv-gate */
1462                         if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_proto,
1463                             connp->conn_faddr_v6, connp->conn_laddr_v6,
1464                             connp->conn_ports) &&
1465                             (tconnp->conn_bound_if == 0 ||
1466                             tconnp->conn_bound_if == ifindex) &&
1467                             IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
1468                                 /* Already have a conn. bail out */
1469                                 mutex_exit(&connfp->connf_lock);
1470                                 return (EADDRINUSE);
1471                         }
1472                 }
1473                 if (connp->conn_fanout != NULL) {
1474                         /*
1475                          * Probably a XTI/TLI application trying to do a
1476                          * rebind. Let it happen.
1477                          */
1478                         mutex_exit(&connfp->connf_lock);
1479                         IPCL_HASH_REMOVE(connp);
1480                         mutex_enter(&connfp->connf_lock);
1481                 }
1482                 IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1483                 mutex_exit(&connfp->connf_lock);
1484                 break;
1485 
1486         case IPPROTO_SCTP:
1487                 IPCL_HASH_REMOVE(connp);
1488                 ret = ipcl_sctp_hash_insert(connp, lport);
1489                 break;
1490 
1491         default:
1492                 if (is_system_labeled() &&
1493                     check_exempt_conflict_v6(connp, ipst))
1494                         return (EADDRINUSE);
1495                 /* FALLTHROUGH */
1496         case IPPROTO_UDP:
1497                 if (protocol == IPPROTO_UDP) {
1498                         connfp = &ipst->ips_ipcl_udp_fanout[
1499                             IPCL_UDP_HASH(lport, ipst)];
1500                 } else {
1501                         connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
1502                 }
1503 
1504                 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
1505                         IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1506                 } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
1507                         IPCL_HASH_INSERT_BOUND(connfp, connp);
1508                 } else {
1509                         IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1510                 }
1511                 break;
1512         }
1513 
1514         return (ret);
1515 }
1516 
1517 /*
1518  * v4 packet classifying function. looks up the fanout table to
1519  * find the conn, the packet belongs to. returns the conn with
1520  * the reference held, null otherwise.
1521  *
1522  * If zoneid is ALL_ZONES, then the search rules described in the "Connection
1523  * Lookup" comment block are applied.  Labels are also checked as described
1524  * above.  If the packet is from the inside (looped back), and is from the same
1525  * zone, then label checks are omitted.
1526  */
1527 conn_t *
1528 ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
1529     ip_recv_attr_t *ira, ip_stack_t *ipst)
1530 {
1531         ipha_t  *ipha;
1532         connf_t *connfp, *bind_connfp;
1533         uint16_t lport;
1534         uint16_t fport;
1535         uint32_t ports;
1536         conn_t  *connp;
1537         uint16_t  *up;
1538         zoneid_t        zoneid = ira->ira_zoneid;
1539 
1540         ipha = (ipha_t *)mp->b_rptr;
1541         up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET);
1542 
1543         switch (protocol) {
1544         case IPPROTO_TCP:
1545                 ports = *(uint32_t *)up;
1546                 connfp =
1547                     &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src,
1548                     ports, ipst)];
1549                 mutex_enter(&connfp->connf_lock);
1550                 for (connp = connfp->connf_head; connp != NULL;
1551                     connp = connp->conn_next) {
1552                         if (IPCL_CONN_MATCH(connp, protocol,
1553                             ipha->ipha_src, ipha->ipha_dst, ports) &&
1554                             (connp->conn_zoneid == zoneid ||
1555                             connp->conn_allzones ||
1556                             ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1557                             (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1558                             (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1559                                 break;
1560                 }
1561 
1562                 if (connp != NULL) {
1563                         /*
1564                          * We have a fully-bound TCP connection.
1565                          *
1566                          * For labeled systems, there's no need to check the
1567                          * label here.  It's known to be good as we checked
1568                          * before allowing the connection to become bound.
1569                          */
1570                         CONN_INC_REF(connp);
1571                         mutex_exit(&connfp->connf_lock);
1572                         return (connp);
1573                 }
1574 
1575                 mutex_exit(&connfp->connf_lock);
1576                 lport = up[1];
1577                 bind_connfp =
1578                     &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
1579                 mutex_enter(&bind_connfp->connf_lock);
1580                 for (connp = bind_connfp->connf_head; connp != NULL;
1581                     connp = connp->conn_next) {
1582                         if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst,
1583                             lport) &&
1584                             (connp->conn_zoneid == zoneid ||
1585                             connp->conn_allzones ||
1586                             ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1587                             (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1588                             (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1589                                 break;
1590                 }
1591 
1592                 /*
1593                  * If the matching connection is SLP on a private address, then
1594                  * the label on the packet must match the local zone's label.
1595                  * Otherwise, it must be in the label range defined by tnrh.
1596                  * This is ensured by tsol_receive_local.
1597                  *
1598                  * Note that we don't check tsol_receive_local for
1599                  * the connected case.
1600                  */
1601                 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1602                     !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
1603                     ira, connp)) {
1604                         DTRACE_PROBE3(tx__ip__log__info__classify__tcp,
1605                             char *, "connp(1) could not receive mp(2)",
1606                             conn_t *, connp, mblk_t *, mp);
1607                         connp = NULL;
1608                 }
1609 
1610                 if (connp != NULL) {
1611                         /* Have a listener at least */
1612                         CONN_INC_REF(connp);
1613                         mutex_exit(&bind_connfp->connf_lock);
1614                         return (connp);
1615                 }
1616 
1617                 mutex_exit(&bind_connfp->connf_lock);
1618                 break;
1619 
1620         case IPPROTO_UDP:
1621                 lport = up[1];
1622                 fport = up[0];
1623                 connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
1624                 mutex_enter(&connfp->connf_lock);
1625                 for (connp = connfp->connf_head; connp != NULL;
1626                     connp = connp->conn_next) {
1627                         if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst,
1628                             fport, ipha->ipha_src) &&
1629                             (connp->conn_zoneid == zoneid ||
1630                             connp->conn_allzones ||
1631                             ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1632                             (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE))))
1633                                 break;
1634                 }
1635 
1636                 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1637                     !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
1638                     ira, connp)) {
1639                         DTRACE_PROBE3(tx__ip__log__info__classify__udp,
1640                             char *, "connp(1) could not receive mp(2)",
1641                             conn_t *, connp, mblk_t *, mp);
1642                         connp = NULL;
1643                 }
1644 
1645                 if (connp != NULL) {
1646                         CONN_INC_REF(connp);
1647                         mutex_exit(&connfp->connf_lock);
1648                         return (connp);
1649                 }
1650 
1651                 /*
1652                  * We shouldn't come here for multicast/broadcast packets
1653                  */
1654                 mutex_exit(&connfp->connf_lock);
1655 
1656                 break;
1657 
1658         case IPPROTO_ENCAP:
1659         case IPPROTO_IPV6:
1660                 return (ipcl_iptun_classify_v4(&ipha->ipha_src,
1661                     &ipha->ipha_dst, ipst));
1662         }
1663 
1664         return (NULL);
1665 }
1666 
1667 conn_t *
1668 ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
1669     ip_recv_attr_t *ira, ip_stack_t *ipst)
1670 {
1671         ip6_t           *ip6h;
1672         connf_t         *connfp, *bind_connfp;
1673         uint16_t        lport;
1674         uint16_t        fport;
1675         tcpha_t         *tcpha;
1676         uint32_t        ports;
1677         conn_t          *connp;
1678         uint16_t        *up;
1679         zoneid_t        zoneid = ira->ira_zoneid;
1680 
1681         ip6h = (ip6_t *)mp->b_rptr;
1682 
1683         switch (protocol) {
1684         case IPPROTO_TCP:
1685                 tcpha = (tcpha_t *)&mp->b_rptr[hdr_len];
1686                 up = &tcpha->tha_lport;
1687                 ports = *(uint32_t *)up;
1688 
1689                 connfp =
1690                     &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src,
1691                     ports, ipst)];
1692                 mutex_enter(&connfp->connf_lock);
1693                 for (connp = connfp->connf_head; connp != NULL;
1694                     connp = connp->conn_next) {
1695                         if (IPCL_CONN_MATCH_V6(connp, protocol,
1696                             ip6h->ip6_src, ip6h->ip6_dst, ports) &&
1697                             (connp->conn_zoneid == zoneid ||
1698                             connp->conn_allzones ||
1699                             ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1700                             (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1701                             (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1702                                 break;
1703                 }
1704 
1705                 if (connp != NULL) {
1706                         /*
1707                          * We have a fully-bound TCP connection.
1708                          *
1709                          * For labeled systems, there's no need to check the
1710                          * label here.  It's known to be good as we checked
1711                          * before allowing the connection to become bound.
1712                          */
1713                         CONN_INC_REF(connp);
1714                         mutex_exit(&connfp->connf_lock);
1715                         return (connp);
1716                 }
1717 
1718                 mutex_exit(&connfp->connf_lock);
1719 
1720                 lport = up[1];
1721                 bind_connfp =
1722                     &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
1723                 mutex_enter(&bind_connfp->connf_lock);
1724                 for (connp = bind_connfp->connf_head; connp != NULL;
1725                     connp = connp->conn_next) {
1726                         if (IPCL_BIND_MATCH_V6(connp, protocol,
1727                             ip6h->ip6_dst, lport) &&
1728                             (connp->conn_zoneid == zoneid ||
1729                             connp->conn_allzones ||
1730                             ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1731                             (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1732                             (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1733                                 break;
1734                 }
1735 
1736                 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1737                     !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
1738                     ira, connp)) {
1739                         DTRACE_PROBE3(tx__ip__log__info__classify__tcp6,
1740                             char *, "connp(1) could not receive mp(2)",
1741                             conn_t *, connp, mblk_t *, mp);
1742                         connp = NULL;
1743                 }
1744 
1745                 if (connp != NULL) {
1746                         /* Have a listner at least */
1747                         CONN_INC_REF(connp);
1748                         mutex_exit(&bind_connfp->connf_lock);
1749                         return (connp);
1750                 }
1751 
1752                 mutex_exit(&bind_connfp->connf_lock);
1753                 break;
1754 
1755         case IPPROTO_UDP:
1756                 up = (uint16_t *)&mp->b_rptr[hdr_len];
1757                 lport = up[1];
1758                 fport = up[0];
1759                 connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
1760                 mutex_enter(&connfp->connf_lock);
1761                 for (connp = connfp->connf_head; connp != NULL;
1762                     connp = connp->conn_next) {
1763                         if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst,
1764                             fport, ip6h->ip6_src) &&
1765                             (connp->conn_zoneid == zoneid ||
1766                             connp->conn_allzones ||
1767                             ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1768                             (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1769                             (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1770                                 break;
1771                 }
1772 
1773                 if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1774                     !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
1775                     ira, connp)) {
1776                         DTRACE_PROBE3(tx__ip__log__info__classify__udp6,
1777                             char *, "connp(1) could not receive mp(2)",
1778                             conn_t *, connp, mblk_t *, mp);
1779                         connp = NULL;
1780                 }
1781 
1782                 if (connp != NULL) {
1783                         CONN_INC_REF(connp);
1784                         mutex_exit(&connfp->connf_lock);
1785                         return (connp);
1786                 }
1787 
1788                 /*
1789                  * We shouldn't come here for multicast/broadcast packets
1790                  */
1791                 mutex_exit(&connfp->connf_lock);
1792                 break;
1793         case IPPROTO_ENCAP:
1794         case IPPROTO_IPV6:
1795                 return (ipcl_iptun_classify_v6(&ip6h->ip6_src,
1796                     &ip6h->ip6_dst, ipst));
1797         }
1798 
1799         return (NULL);
1800 }
1801 
1802 /*
1803  * wrapper around ipcl_classify_(v4,v6) routines.
1804  */
1805 conn_t *
1806 ipcl_classify(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst)
1807 {
1808         if (ira->ira_flags & IRAF_IS_IPV4) {
1809                 return (ipcl_classify_v4(mp, ira->ira_protocol,
1810                     ira->ira_ip_hdr_length, ira, ipst));
1811         } else {
1812                 return (ipcl_classify_v6(mp, ira->ira_protocol,
1813                     ira->ira_ip_hdr_length, ira, ipst));
1814         }
1815 }
1816 
1817 /*
1818  * Only used to classify SCTP RAW sockets
1819  */
1820 conn_t *
1821 ipcl_classify_raw(mblk_t *mp, uint8_t protocol, uint32_t ports,
1822     ipha_t *ipha, ip6_t *ip6h, ip_recv_attr_t *ira, ip_stack_t *ipst)
1823 {
1824         connf_t         *connfp;
1825         conn_t          *connp;
1826         in_port_t       lport;
1827         int             ipversion;
1828         const void      *dst;
1829         zoneid_t        zoneid = ira->ira_zoneid;
1830 
1831         lport = ((uint16_t *)&ports)[1];
1832         if (ira->ira_flags & IRAF_IS_IPV4) {
1833                 dst = (const void *)&ipha->ipha_dst;
1834                 ipversion = IPV4_VERSION;
1835         } else {
1836                 dst = (const void *)&ip6h->ip6_dst;
1837                 ipversion = IPV6_VERSION;
1838         }
1839 
1840         connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
1841         mutex_enter(&connfp->connf_lock);
1842         for (connp = connfp->connf_head; connp != NULL;
1843             connp = connp->conn_next) {
1844                 /* We don't allow v4 fallback for v6 raw socket. */
1845                 if (ipversion != connp->conn_ipversion)
1846                         continue;
1847                 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1848                     !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1849                         if (ipversion == IPV4_VERSION) {
1850                                 if (!IPCL_CONN_MATCH(connp, protocol,
1851                                     ipha->ipha_src, ipha->ipha_dst, ports))
1852                                         continue;
1853                         } else {
1854                                 if (!IPCL_CONN_MATCH_V6(connp, protocol,
1855                                     ip6h->ip6_src, ip6h->ip6_dst, ports))
1856                                         continue;
1857                         }
1858                 } else {
1859                         if (ipversion == IPV4_VERSION) {
1860                                 if (!IPCL_BIND_MATCH(connp, protocol,
1861                                     ipha->ipha_dst, lport))
1862                                         continue;
1863                         } else {
1864                                 if (!IPCL_BIND_MATCH_V6(connp, protocol,
1865                                     ip6h->ip6_dst, lport))
1866                                         continue;
1867                         }
1868                 }
1869 
1870                 if (connp->conn_zoneid == zoneid ||
1871                     connp->conn_allzones ||
1872                     ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1873                     (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1874                     (ira->ira_flags & IRAF_TX_SHARED_ADDR)))
1875                         break;
1876         }
1877 
1878         if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1879             !tsol_receive_local(mp, dst, ipversion, ira, connp)) {
1880                 DTRACE_PROBE3(tx__ip__log__info__classify__rawip,
1881                     char *, "connp(1) could not receive mp(2)",
1882                     conn_t *, connp, mblk_t *, mp);
1883                 connp = NULL;
1884         }
1885 
1886         if (connp != NULL)
1887                 goto found;
1888         mutex_exit(&connfp->connf_lock);
1889 
1890         /* Try to look for a wildcard SCTP RAW socket match. */
1891         connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)];
1892         mutex_enter(&connfp->connf_lock);
1893         for (connp = connfp->connf_head; connp != NULL;
1894             connp = connp->conn_next) {
1895                 /* We don't allow v4 fallback for v6 raw socket. */
1896                 if (ipversion != connp->conn_ipversion)
1897                         continue;
1898                 if (!IPCL_ZONE_MATCH(connp, zoneid))
1899                         continue;
1900 
1901                 if (ipversion == IPV4_VERSION) {
1902                         if (IPCL_RAW_MATCH(connp, protocol, ipha->ipha_dst))
1903                                 break;
1904                 } else {
1905                         if (IPCL_RAW_MATCH_V6(connp, protocol, ip6h->ip6_dst)) {
1906                                 break;
1907                         }
1908                 }
1909         }
1910 
1911         if (connp != NULL)
1912                 goto found;
1913 
1914         mutex_exit(&connfp->connf_lock);
1915         return (NULL);
1916 
1917 found:
1918         ASSERT(connp != NULL);
1919         CONN_INC_REF(connp);
1920         mutex_exit(&connfp->connf_lock);
1921         return (connp);
1922 }
1923 
1924 /* ARGSUSED */
1925 static int
1926 tcp_conn_constructor(void *buf, void *cdrarg, int kmflags)
1927 {
1928         itc_t   *itc = (itc_t *)buf;
1929         conn_t  *connp = &itc->itc_conn;
1930         tcp_t   *tcp = (tcp_t *)&itc[1];
1931 
1932         bzero(connp, sizeof (conn_t));
1933         bzero(tcp, sizeof (tcp_t));
1934 
1935         mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
1936         cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
1937         cv_init(&connp->conn_sq_cv, NULL, CV_DEFAULT, NULL);
1938         tcp->tcp_timercache = tcp_timermp_alloc(kmflags);
1939         if (tcp->tcp_timercache == NULL)
1940                 return (ENOMEM);
1941         connp->conn_tcp = tcp;
1942         connp->conn_flags = IPCL_TCPCONN;
1943         connp->conn_proto = IPPROTO_TCP;
1944         tcp->tcp_connp = connp;
1945         rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
1946 
1947         connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
1948         if (connp->conn_ixa == NULL) {
1949                 tcp_timermp_free(tcp);
1950                 return (ENOMEM);
1951         }
1952         connp->conn_ixa->ixa_refcnt = 1;
1953         connp->conn_ixa->ixa_protocol = connp->conn_proto;
1954         connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
1955         return (0);
1956 }
1957 
1958 /* ARGSUSED */
1959 static void
1960 tcp_conn_destructor(void *buf, void *cdrarg)
1961 {
1962         itc_t   *itc = (itc_t *)buf;
1963         conn_t  *connp = &itc->itc_conn;
1964         tcp_t   *tcp = (tcp_t *)&itc[1];
1965 
1966         ASSERT(connp->conn_flags & IPCL_TCPCONN);
1967         ASSERT(tcp->tcp_connp == connp);
1968         ASSERT(connp->conn_tcp == tcp);
1969         tcp_timermp_free(tcp);
1970         mutex_destroy(&connp->conn_lock);
1971         cv_destroy(&connp->conn_cv);
1972         cv_destroy(&connp->conn_sq_cv);
1973         rw_destroy(&connp->conn_ilg_lock);
1974 
1975         /* Can be NULL if constructor failed */
1976         if (connp->conn_ixa != NULL) {
1977                 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
1978                 ASSERT(connp->conn_ixa->ixa_ire == NULL);
1979                 ASSERT(connp->conn_ixa->ixa_nce == NULL);
1980                 ixa_refrele(connp->conn_ixa);
1981         }
1982 }
1983 
1984 /* ARGSUSED */
1985 static int
1986 ip_conn_constructor(void *buf, void *cdrarg, int kmflags)
1987 {
1988         itc_t   *itc = (itc_t *)buf;
1989         conn_t  *connp = &itc->itc_conn;
1990 
1991         bzero(connp, sizeof (conn_t));
1992         mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
1993         cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
1994         connp->conn_flags = IPCL_IPCCONN;
1995         rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
1996 
1997         connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
1998         if (connp->conn_ixa == NULL)
1999                 return (ENOMEM);
2000         connp->conn_ixa->ixa_refcnt = 1;
2001         connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2002         return (0);
2003 }
2004 
2005 /* ARGSUSED */
2006 static void
2007 ip_conn_destructor(void *buf, void *cdrarg)
2008 {
2009         itc_t   *itc = (itc_t *)buf;
2010         conn_t  *connp = &itc->itc_conn;
2011 
2012         ASSERT(connp->conn_flags & IPCL_IPCCONN);
2013         ASSERT(connp->conn_priv == NULL);
2014         mutex_destroy(&connp->conn_lock);
2015         cv_destroy(&connp->conn_cv);
2016         rw_destroy(&connp->conn_ilg_lock);
2017 
2018         /* Can be NULL if constructor failed */
2019         if (connp->conn_ixa != NULL) {
2020                 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2021                 ASSERT(connp->conn_ixa->ixa_ire == NULL);
2022                 ASSERT(connp->conn_ixa->ixa_nce == NULL);
2023                 ixa_refrele(connp->conn_ixa);
2024         }
2025 }
2026 
2027 /* ARGSUSED */
2028 static int
2029 udp_conn_constructor(void *buf, void *cdrarg, int kmflags)
2030 {
2031         itc_t   *itc = (itc_t *)buf;
2032         conn_t  *connp = &itc->itc_conn;
2033         udp_t   *udp = (udp_t *)&itc[1];
2034 
2035         bzero(connp, sizeof (conn_t));
2036         bzero(udp, sizeof (udp_t));
2037 
2038         mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2039         cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2040         connp->conn_udp = udp;
2041         connp->conn_flags = IPCL_UDPCONN;
2042         connp->conn_proto = IPPROTO_UDP;
2043         udp->udp_connp = connp;
2044         rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2045         connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2046         if (connp->conn_ixa == NULL)
2047                 return (ENOMEM);
2048         connp->conn_ixa->ixa_refcnt = 1;
2049         connp->conn_ixa->ixa_protocol = connp->conn_proto;
2050         connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2051         return (0);
2052 }
2053 
2054 /* ARGSUSED */
2055 static void
2056 udp_conn_destructor(void *buf, void *cdrarg)
2057 {
2058         itc_t   *itc = (itc_t *)buf;
2059         conn_t  *connp = &itc->itc_conn;
2060         udp_t   *udp = (udp_t *)&itc[1];
2061 
2062         ASSERT(connp->conn_flags & IPCL_UDPCONN);
2063         ASSERT(udp->udp_connp == connp);
2064         ASSERT(connp->conn_udp == udp);
2065         mutex_destroy(&connp->conn_lock);
2066         cv_destroy(&connp->conn_cv);
2067         rw_destroy(&connp->conn_ilg_lock);
2068 
2069         /* Can be NULL if constructor failed */
2070         if (connp->conn_ixa != NULL) {
2071                 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2072                 ASSERT(connp->conn_ixa->ixa_ire == NULL);
2073                 ASSERT(connp->conn_ixa->ixa_nce == NULL);
2074                 ixa_refrele(connp->conn_ixa);
2075         }
2076 }
2077 
2078 /* ARGSUSED */
2079 static int
2080 rawip_conn_constructor(void *buf, void *cdrarg, int kmflags)
2081 {
2082         itc_t   *itc = (itc_t *)buf;
2083         conn_t  *connp = &itc->itc_conn;
2084         icmp_t  *icmp = (icmp_t *)&itc[1];
2085 
2086         bzero(connp, sizeof (conn_t));
2087         bzero(icmp, sizeof (icmp_t));
2088 
2089         mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2090         cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2091         connp->conn_icmp = icmp;
2092         connp->conn_flags = IPCL_RAWIPCONN;
2093         connp->conn_proto = IPPROTO_ICMP;
2094         icmp->icmp_connp = connp;
2095         rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2096         connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2097         if (connp->conn_ixa == NULL)
2098                 return (ENOMEM);
2099         connp->conn_ixa->ixa_refcnt = 1;
2100         connp->conn_ixa->ixa_protocol = connp->conn_proto;
2101         connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2102         return (0);
2103 }
2104 
2105 /* ARGSUSED */
2106 static void
2107 rawip_conn_destructor(void *buf, void *cdrarg)
2108 {
2109         itc_t   *itc = (itc_t *)buf;
2110         conn_t  *connp = &itc->itc_conn;
2111         icmp_t  *icmp = (icmp_t *)&itc[1];
2112 
2113         ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
2114         ASSERT(icmp->icmp_connp == connp);
2115         ASSERT(connp->conn_icmp == icmp);
2116         mutex_destroy(&connp->conn_lock);
2117         cv_destroy(&connp->conn_cv);
2118         rw_destroy(&connp->conn_ilg_lock);
2119 
2120         /* Can be NULL if constructor failed */
2121         if (connp->conn_ixa != NULL) {
2122                 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2123                 ASSERT(connp->conn_ixa->ixa_ire == NULL);
2124                 ASSERT(connp->conn_ixa->ixa_nce == NULL);
2125                 ixa_refrele(connp->conn_ixa);
2126         }
2127 }
2128 
2129 /* ARGSUSED */
2130 static int
2131 rts_conn_constructor(void *buf, void *cdrarg, int kmflags)
2132 {
2133         itc_t   *itc = (itc_t *)buf;
2134         conn_t  *connp = &itc->itc_conn;
2135         rts_t   *rts = (rts_t *)&itc[1];
2136 
2137         bzero(connp, sizeof (conn_t));
2138         bzero(rts, sizeof (rts_t));
2139 
2140         mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2141         cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2142         connp->conn_rts = rts;
2143         connp->conn_flags = IPCL_RTSCONN;
2144         rts->rts_connp = connp;
2145         rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2146         connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2147         if (connp->conn_ixa == NULL)
2148                 return (ENOMEM);
2149         connp->conn_ixa->ixa_refcnt = 1;
2150         connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2151         return (0);
2152 }
2153 
2154 /* ARGSUSED */
2155 static void
2156 rts_conn_destructor(void *buf, void *cdrarg)
2157 {
2158         itc_t   *itc = (itc_t *)buf;
2159         conn_t  *connp = &itc->itc_conn;
2160         rts_t   *rts = (rts_t *)&itc[1];
2161 
2162         ASSERT(connp->conn_flags & IPCL_RTSCONN);
2163         ASSERT(rts->rts_connp == connp);
2164         ASSERT(connp->conn_rts == rts);
2165         mutex_destroy(&connp->conn_lock);
2166         cv_destroy(&connp->conn_cv);
2167         rw_destroy(&connp->conn_ilg_lock);
2168 
2169         /* Can be NULL if constructor failed */
2170         if (connp->conn_ixa != NULL) {
2171                 ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2172                 ASSERT(connp->conn_ixa->ixa_ire == NULL);
2173                 ASSERT(connp->conn_ixa->ixa_nce == NULL);
2174                 ixa_refrele(connp->conn_ixa);
2175         }
2176 }
2177 
2178 /*
2179  * Called as part of ipcl_conn_destroy to assert and clear any pointers
2180  * in the conn_t.
2181  *
2182  * Below we list all the pointers in the conn_t as a documentation aid.
2183  * The ones that we can not ASSERT to be NULL are #ifdef'ed out.
2184  * If you add any pointers to the conn_t please add an ASSERT here
2185  * and #ifdef it out if it can't be actually asserted to be NULL.
2186  * In any case, we bzero most of the conn_t at the end of the function.
2187  */
2188 void
2189 ipcl_conn_cleanup(conn_t *connp)
2190 {
2191         ip_xmit_attr_t  *ixa;
2192 
2193         ASSERT(connp->conn_latch == NULL);
2194         ASSERT(connp->conn_latch_in_policy == NULL);
2195         ASSERT(connp->conn_latch_in_action == NULL);
2196 #ifdef notdef
2197         ASSERT(connp->conn_rq == NULL);
2198         ASSERT(connp->conn_wq == NULL);
2199 #endif
2200         ASSERT(connp->conn_cred == NULL);
2201         ASSERT(connp->conn_g_fanout == NULL);
2202         ASSERT(connp->conn_g_next == NULL);
2203         ASSERT(connp->conn_g_prev == NULL);
2204         ASSERT(connp->conn_policy == NULL);
2205         ASSERT(connp->conn_fanout == NULL);
2206         ASSERT(connp->conn_next == NULL);
2207         ASSERT(connp->conn_prev == NULL);
2208         ASSERT(connp->conn_oper_pending_ill == NULL);
2209         ASSERT(connp->conn_ilg == NULL);
2210         ASSERT(connp->conn_drain_next == NULL);
2211         ASSERT(connp->conn_drain_prev == NULL);
2212 #ifdef notdef
2213         /* conn_idl is not cleared when removed from idl list */
2214         ASSERT(connp->conn_idl == NULL);
2215 #endif
2216         ASSERT(connp->conn_ipsec_opt_mp == NULL);
2217 #ifdef notdef
2218         /* conn_netstack is cleared by the caller; needed by ixa_cleanup */
2219         ASSERT(connp->conn_netstack == NULL);
2220 #endif
2221 
2222         ASSERT(connp->conn_helper_info == NULL);
2223         ASSERT(connp->conn_ixa != NULL);
2224         ixa = connp->conn_ixa;
2225         ASSERT(ixa->ixa_refcnt == 1);
2226         /* Need to preserve ixa_protocol */
2227         ixa_cleanup(ixa);
2228         ixa->ixa_flags = 0;
2229 
2230         /* Clear out the conn_t fields that are not preserved */
2231         bzero(&connp->conn_start_clr,
2232             sizeof (conn_t) -
2233             ((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp));
2234 }
2235 
2236 /*
2237  * All conns are inserted in a global multi-list for the benefit of
2238  * walkers. The walk is guaranteed to walk all open conns at the time
2239  * of the start of the walk exactly once. This property is needed to
2240  * achieve some cleanups during unplumb of interfaces. This is achieved
2241  * as follows.
2242  *
2243  * ipcl_conn_create and ipcl_conn_destroy are the only functions that
2244  * call the insert and delete functions below at creation and deletion
2245  * time respectively. The conn never moves or changes its position in this
2246  * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt
2247  * won't increase due to walkers, once the conn deletion has started. Note
2248  * that we can't remove the conn from the global list and then wait for
2249  * the refcnt to drop to zero, since walkers would then see a truncated
2250  * list. CONN_INCIPIENT ensures that walkers don't start looking at
2251  * conns until ip_open is ready to make them globally visible.
2252  * The global round robin multi-list locks are held only to get the
2253  * next member/insertion/deletion and contention should be negligible
2254  * if the multi-list is much greater than the number of cpus.
2255  */
2256 void
2257 ipcl_globalhash_insert(conn_t *connp)
2258 {
2259         int     index;
2260         struct connf_s  *connfp;
2261         ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
2262 
2263         /*
2264          * No need for atomic here. Approximate even distribution
2265          * in the global lists is sufficient.
2266          */
2267         ipst->ips_conn_g_index++;
2268         index = ipst->ips_conn_g_index & (CONN_G_HASH_SIZE - 1);
2269 
2270         connp->conn_g_prev = NULL;
2271         /*
2272          * Mark as INCIPIENT, so that walkers will ignore this
2273          * for now, till ip_open is ready to make it visible globally.
2274          */
2275         connp->conn_state_flags |= CONN_INCIPIENT;
2276 
2277         connfp = &ipst->ips_ipcl_globalhash_fanout[index];
2278         /* Insert at the head of the list */
2279         mutex_enter(&connfp->connf_lock);
2280         connp->conn_g_next = connfp->connf_head;
2281         if (connp->conn_g_next != NULL)
2282                 connp->conn_g_next->conn_g_prev = connp;
2283         connfp->connf_head = connp;
2284 
2285         /* The fanout bucket this conn points to */
2286         connp->conn_g_fanout = connfp;
2287 
2288         mutex_exit(&connfp->connf_lock);
2289 }
2290 
2291 void
2292 ipcl_globalhash_remove(conn_t *connp)
2293 {
2294         struct connf_s  *connfp;
2295 
2296         /*
2297          * We were never inserted in the global multi list.
2298          * IPCL_NONE variety is never inserted in the global multilist
2299          * since it is presumed to not need any cleanup and is transient.
2300          */
2301         if (connp->conn_g_fanout == NULL)
2302                 return;
2303 
2304         connfp = connp->conn_g_fanout;
2305         mutex_enter(&connfp->connf_lock);
2306         if (connp->conn_g_prev != NULL)
2307                 connp->conn_g_prev->conn_g_next = connp->conn_g_next;
2308         else
2309                 connfp->connf_head = connp->conn_g_next;
2310         if (connp->conn_g_next != NULL)
2311                 connp->conn_g_next->conn_g_prev = connp->conn_g_prev;
2312         mutex_exit(&connfp->connf_lock);
2313 
2314         /* Better to stumble on a null pointer than to corrupt memory */
2315         connp->conn_g_next = NULL;
2316         connp->conn_g_prev = NULL;
2317         connp->conn_g_fanout = NULL;
2318 }
2319 
2320 /*
2321  * Walk the list of all conn_t's in the system, calling the function provided
2322  * With the specified argument for each.
2323  * Applies to both IPv4 and IPv6.
2324  *
2325  * CONNs may hold pointers to ills (conn_dhcpinit_ill and
2326  * conn_oper_pending_ill). To guard against stale pointers
2327  * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is
2328  * unplumbed or removed. New conn_t's that are created while we are walking
2329  * may be missed by this walk, because they are not necessarily inserted
2330  * at the tail of the list. They are new conn_t's and thus don't have any
2331  * stale pointers. The CONN_CLOSING flag ensures that no new reference
2332  * is created to the struct that is going away.
2333  */
2334 void
2335 ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst)
2336 {
2337         int     i;
2338         conn_t  *connp;
2339         conn_t  *prev_connp;
2340 
2341         for (i = 0; i < CONN_G_HASH_SIZE; i++) {
2342                 mutex_enter(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2343                 prev_connp = NULL;
2344                 connp = ipst->ips_ipcl_globalhash_fanout[i].connf_head;
2345                 while (connp != NULL) {
2346                         mutex_enter(&connp->conn_lock);
2347                         if (connp->conn_state_flags &
2348                             (CONN_CONDEMNED | CONN_INCIPIENT)) {
2349                                 mutex_exit(&connp->conn_lock);
2350                                 connp = connp->conn_g_next;
2351                                 continue;
2352                         }
2353                         CONN_INC_REF_LOCKED(connp);
2354                         mutex_exit(&connp->conn_lock);
2355                         mutex_exit(
2356                             &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2357                         (*func)(connp, arg);
2358                         if (prev_connp != NULL)
2359                                 CONN_DEC_REF(prev_connp);
2360                         mutex_enter(
2361                             &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2362                         prev_connp = connp;
2363                         connp = connp->conn_g_next;
2364                 }
2365                 mutex_exit(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2366                 if (prev_connp != NULL)
2367                         CONN_DEC_REF(prev_connp);
2368         }
2369 }
2370 
2371 /*
2372  * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on
2373  * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
2374  * held; caller must call CONN_DEC_REF.  Only checks for connected entries
2375  * (peer tcp in ESTABLISHED state).
2376  */
2377 conn_t *
2378 ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcpha_t *tcpha,
2379     ip_stack_t *ipst)
2380 {
2381         uint32_t ports;
2382         uint16_t *pports = (uint16_t *)&ports;
2383         connf_t *connfp;
2384         conn_t  *tconnp;
2385         boolean_t zone_chk;
2386 
2387         /*
2388          * If either the source of destination address is loopback, then
2389          * both endpoints must be in the same Zone.  Otherwise, both of
2390          * the addresses are system-wide unique (tcp is in ESTABLISHED
2391          * state) and the endpoints may reside in different Zones.
2392          */
2393         zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) ||
2394             ipha->ipha_dst == htonl(INADDR_LOOPBACK));
2395 
2396         pports[0] = tcpha->tha_fport;
2397         pports[1] = tcpha->tha_lport;
2398 
2399         connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2400             ports, ipst)];
2401 
2402         mutex_enter(&connfp->connf_lock);
2403         for (tconnp = connfp->connf_head; tconnp != NULL;
2404             tconnp = tconnp->conn_next) {
2405 
2406                 if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2407                     ipha->ipha_dst, ipha->ipha_src, ports) &&
2408                     tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
2409                     (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
2410 
2411                         ASSERT(tconnp != connp);
2412                         CONN_INC_REF(tconnp);
2413                         mutex_exit(&connfp->connf_lock);
2414                         return (tconnp);
2415                 }
2416         }
2417         mutex_exit(&connfp->connf_lock);
2418         return (NULL);
2419 }
2420 
2421 /*
2422  * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on
2423  * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
2424  * held; caller must call CONN_DEC_REF.  Only checks for connected entries
2425  * (peer tcp in ESTABLISHED state).
2426  */
2427 conn_t *
2428 ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcpha_t *tcpha,
2429     ip_stack_t *ipst)
2430 {
2431         uint32_t ports;
2432         uint16_t *pports = (uint16_t *)&ports;
2433         connf_t *connfp;
2434         conn_t  *tconnp;
2435         boolean_t zone_chk;
2436 
2437         /*
2438          * If either the source of destination address is loopback, then
2439          * both endpoints must be in the same Zone.  Otherwise, both of
2440          * the addresses are system-wide unique (tcp is in ESTABLISHED
2441          * state) and the endpoints may reside in different Zones.  We
2442          * don't do Zone check for link local address(es) because the
2443          * current Zone implementation treats each link local address as
2444          * being unique per system node, i.e. they belong to global Zone.
2445          */
2446         zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) ||
2447             IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst));
2448 
2449         pports[0] = tcpha->tha_fport;
2450         pports[1] = tcpha->tha_lport;
2451 
2452         connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
2453             ports, ipst)];
2454 
2455         mutex_enter(&connfp->connf_lock);
2456         for (tconnp = connfp->connf_head; tconnp != NULL;
2457             tconnp = tconnp->conn_next) {
2458 
2459                 /* We skip conn_bound_if check here as this is loopback tcp */
2460                 if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2461                     ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2462                     tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
2463                     (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
2464 
2465                         ASSERT(tconnp != connp);
2466                         CONN_INC_REF(tconnp);
2467                         mutex_exit(&connfp->connf_lock);
2468                         return (tconnp);
2469                 }
2470         }
2471         mutex_exit(&connfp->connf_lock);
2472         return (NULL);
2473 }
2474 
2475 /*
2476  * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2477  * Returns with conn reference held. Caller must call CONN_DEC_REF.
2478  * Only checks for connected entries i.e. no INADDR_ANY checks.
2479  */
2480 conn_t *
2481 ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcpha_t *tcpha, int min_state,
2482     ip_stack_t *ipst)
2483 {
2484         uint32_t ports;
2485         uint16_t *pports;
2486         connf_t *connfp;
2487         conn_t  *tconnp;
2488 
2489         pports = (uint16_t *)&ports;
2490         pports[0] = tcpha->tha_fport;
2491         pports[1] = tcpha->tha_lport;
2492 
2493         connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2494             ports, ipst)];
2495 
2496         mutex_enter(&connfp->connf_lock);
2497         for (tconnp = connfp->connf_head; tconnp != NULL;
2498             tconnp = tconnp->conn_next) {
2499 
2500                 if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2501                     ipha->ipha_dst, ipha->ipha_src, ports) &&
2502                     tconnp->conn_tcp->tcp_state >= min_state) {
2503 
2504                         CONN_INC_REF(tconnp);
2505                         mutex_exit(&connfp->connf_lock);
2506                         return (tconnp);
2507                 }
2508         }
2509         mutex_exit(&connfp->connf_lock);
2510         return (NULL);
2511 }
2512 
2513 /*
2514  * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2515  * Returns with conn reference held. Caller must call CONN_DEC_REF.
2516  * Only checks for connected entries i.e. no INADDR_ANY checks.
2517  * Match on ifindex in addition to addresses.
2518  */
2519 conn_t *
2520 ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state,
2521     uint_t ifindex, ip_stack_t *ipst)
2522 {
2523         tcp_t   *tcp;
2524         uint32_t ports;
2525         uint16_t *pports;
2526         connf_t *connfp;
2527         conn_t  *tconnp;
2528 
2529         pports = (uint16_t *)&ports;
2530         pports[0] = tcpha->tha_fport;
2531         pports[1] = tcpha->tha_lport;
2532 
2533         connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
2534             ports, ipst)];
2535 
2536         mutex_enter(&connfp->connf_lock);
2537         for (tconnp = connfp->connf_head; tconnp != NULL;
2538             tconnp = tconnp->conn_next) {
2539 
2540                 tcp = tconnp->conn_tcp;
2541                 if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2542                     ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2543                     tcp->tcp_state >= min_state &&
2544                     (tconnp->conn_bound_if == 0 ||
2545                     tconnp->conn_bound_if == ifindex)) {
2546 
2547                         CONN_INC_REF(tconnp);
2548                         mutex_exit(&connfp->connf_lock);
2549                         return (tconnp);
2550                 }
2551         }
2552         mutex_exit(&connfp->connf_lock);
2553         return (NULL);
2554 }
2555 
2556 /*
2557  * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate
2558  * a listener when changing state.
2559  */
2560 conn_t *
2561 ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid,
2562     ip_stack_t *ipst)
2563 {
2564         connf_t         *bind_connfp;
2565         conn_t          *connp;
2566         tcp_t           *tcp;
2567 
2568         /*
2569          * Avoid false matches for packets sent to an IP destination of
2570          * all zeros.
2571          */
2572         if (laddr == 0)
2573                 return (NULL);
2574 
2575         ASSERT(zoneid != ALL_ZONES);
2576 
2577         bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2578         mutex_enter(&bind_connfp->connf_lock);
2579         for (connp = bind_connfp->connf_head; connp != NULL;
2580             connp = connp->conn_next) {
2581                 tcp = connp->conn_tcp;
2582                 if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) &&
2583                     IPCL_ZONE_MATCH(connp, zoneid) &&
2584                     (tcp->tcp_listener == NULL)) {
2585                         CONN_INC_REF(connp);
2586                         mutex_exit(&bind_connfp->connf_lock);
2587                         return (connp);
2588                 }
2589         }
2590         mutex_exit(&bind_connfp->connf_lock);
2591         return (NULL);
2592 }
2593 
2594 /*
2595  * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate
2596  * a listener when changing state.
2597  */
2598 conn_t *
2599 ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex,
2600     zoneid_t zoneid, ip_stack_t *ipst)
2601 {
2602         connf_t         *bind_connfp;
2603         conn_t          *connp = NULL;
2604         tcp_t           *tcp;
2605 
2606         /*
2607          * Avoid false matches for packets sent to an IP destination of
2608          * all zeros.
2609          */
2610         if (IN6_IS_ADDR_UNSPECIFIED(laddr))
2611                 return (NULL);
2612 
2613         ASSERT(zoneid != ALL_ZONES);
2614 
2615         bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2616         mutex_enter(&bind_connfp->connf_lock);
2617         for (connp = bind_connfp->connf_head; connp != NULL;
2618             connp = connp->conn_next) {
2619                 tcp = connp->conn_tcp;
2620                 if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) &&
2621                     IPCL_ZONE_MATCH(connp, zoneid) &&
2622                     (connp->conn_bound_if == 0 ||
2623                     connp->conn_bound_if == ifindex) &&
2624                     tcp->tcp_listener == NULL) {
2625                         CONN_INC_REF(connp);
2626                         mutex_exit(&bind_connfp->connf_lock);
2627                         return (connp);
2628                 }
2629         }
2630         mutex_exit(&bind_connfp->connf_lock);
2631         return (NULL);
2632 }
2633 
2634 /*
2635  * ipcl_get_next_conn
2636  *      get the next entry in the conn global list
2637  *      and put a reference on the next_conn.
2638  *      decrement the reference on the current conn.
2639  *
2640  * This is an iterator based walker function that also provides for
2641  * some selection by the caller. It walks through the conn_hash bucket
2642  * searching for the next valid connp in the list, and selects connections
2643  * that are neither closed nor condemned. It also REFHOLDS the conn
2644  * thus ensuring that the conn exists when the caller uses the conn.
2645  */
2646 conn_t *
2647 ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags)
2648 {
2649         conn_t  *next_connp;
2650 
2651         if (connfp == NULL)
2652                 return (NULL);
2653 
2654         mutex_enter(&connfp->connf_lock);
2655 
2656         next_connp = (connp == NULL) ?
2657             connfp->connf_head : connp->conn_g_next;
2658 
2659         while (next_connp != NULL) {
2660                 mutex_enter(&next_connp->conn_lock);
2661                 if (!(next_connp->conn_flags & conn_flags) ||
2662                     (next_connp->conn_state_flags &
2663                     (CONN_CONDEMNED | CONN_INCIPIENT))) {
2664                         /*
2665                          * This conn has been condemned or
2666                          * is closing, or the flags don't match
2667                          */
2668                         mutex_exit(&next_connp->conn_lock);
2669                         next_connp = next_connp->conn_g_next;
2670                         continue;
2671                 }
2672                 CONN_INC_REF_LOCKED(next_connp);
2673                 mutex_exit(&next_connp->conn_lock);
2674                 break;
2675         }
2676 
2677         mutex_exit(&connfp->connf_lock);
2678 
2679         if (connp != NULL)
2680                 CONN_DEC_REF(connp);
2681 
2682         return (next_connp);
2683 }
2684 
2685 #ifdef CONN_DEBUG
2686 /*
2687  * Trace of the last NBUF refhold/refrele
2688  */
2689 int
2690 conn_trace_ref(conn_t *connp)
2691 {
2692         int     last;
2693         conn_trace_t    *ctb;
2694 
2695         ASSERT(MUTEX_HELD(&connp->conn_lock));
2696         last = connp->conn_trace_last;
2697         last++;
2698         if (last == CONN_TRACE_MAX)
2699                 last = 0;
2700 
2701         ctb = &connp->conn_trace_buf[last];
2702         ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2703         connp->conn_trace_last = last;
2704         return (1);
2705 }
2706 
2707 int
2708 conn_untrace_ref(conn_t *connp)
2709 {
2710         int     last;
2711         conn_trace_t    *ctb;
2712 
2713         ASSERT(MUTEX_HELD(&connp->conn_lock));
2714         last = connp->conn_trace_last;
2715         last++;
2716         if (last == CONN_TRACE_MAX)
2717                 last = 0;
2718 
2719         ctb = &connp->conn_trace_buf[last];
2720         ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2721         connp->conn_trace_last = last;
2722         return (1);
2723 }
2724 #endif