Print this page
    
XXXX adding PID information to netstat output
    
      
        | Split | 
	Close | 
      
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/common/inet/ip/ipclassifier.c
          +++ new/usr/src/uts/common/inet/ip/ipclassifier.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   */
  24   24  
  25   25  /*
  26   26   * IP PACKET CLASSIFIER
  27   27   *
  28   28   * The IP packet classifier provides mapping between IP packets and persistent
  29   29   * connection state for connection-oriented protocols. It also provides
  30   30   * interface for managing connection states.
  31   31   *
  32   32   * The connection state is kept in conn_t data structure and contains, among
  33   33   * other things:
  34   34   *
  35   35   *      o local/remote address and ports
  36   36   *      o Transport protocol
  37   37   *      o squeue for the connection (for TCP only)
  38   38   *      o reference counter
  39   39   *      o Connection state
  40   40   *      o hash table linkage
  41   41   *      o interface/ire information
  42   42   *      o credentials
  43   43   *      o ipsec policy
  44   44   *      o send and receive functions.
  45   45   *      o mutex lock.
  46   46   *
  47   47   * Connections use a reference counting scheme. They are freed when the
  48   48   * reference counter drops to zero. A reference is incremented when connection
  49   49   * is placed in a list or table, when incoming packet for the connection arrives
  50   50   * and when connection is processed via squeue (squeue processing may be
  51   51   * asynchronous and the reference protects the connection from being destroyed
  52   52   * before its processing is finished).
  53   53   *
  54   54   * conn_recv is used to pass up packets to the ULP.
  55   55   * For TCP conn_recv changes. It is tcp_input_listener_unbound initially for
  56   56   * a listener, and changes to tcp_input_listener as the listener has picked a
  57   57   * good squeue. For other cases it is set to tcp_input_data.
  58   58   *
  59   59   * conn_recvicmp is used to pass up ICMP errors to the ULP.
  60   60   *
  61   61   * Classifier uses several hash tables:
  62   62   *
  63   63   *      ipcl_conn_fanout:       contains all TCP connections in CONNECTED state
  64   64   *      ipcl_bind_fanout:       contains all connections in BOUND state
  65   65   *      ipcl_proto_fanout:      IPv4 protocol fanout
  66   66   *      ipcl_proto_fanout_v6:   IPv6 protocol fanout
  67   67   *      ipcl_udp_fanout:        contains all UDP connections
  68   68   *      ipcl_iptun_fanout:      contains all IP tunnel connections
  69   69   *      ipcl_globalhash_fanout: contains all connections
  70   70   *
  71   71   * The ipcl_globalhash_fanout is used for any walkers (like snmp and Clustering)
  72   72   * which need to view all existing connections.
  73   73   *
  74   74   * All tables are protected by per-bucket locks. When both per-bucket lock and
  75   75   * connection lock need to be held, the per-bucket lock should be acquired
  76   76   * first, followed by the connection lock.
  77   77   *
  78   78   * All functions doing search in one of these tables increment a reference
  79   79   * counter on the connection found (if any). This reference should be dropped
  80   80   * when the caller has finished processing the connection.
  81   81   *
  82   82   *
  83   83   * INTERFACES:
  84   84   * ===========
  85   85   *
  86   86   * Connection Lookup:
  87   87   * ------------------
  88   88   *
  89   89   * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, ira, ip_stack)
  90   90   * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, ira, ip_stack)
  91   91   *
  92   92   * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if
  93   93   * it can't find any associated connection. If the connection is found, its
  94   94   * reference counter is incremented.
  95   95   *
  96   96   *      mp:     mblock, containing packet header. The full header should fit
  97   97   *              into a single mblock. It should also contain at least full IP
  98   98   *              and TCP or UDP header.
  99   99   *
 100  100   *      protocol: Either IPPROTO_TCP or IPPROTO_UDP.
 101  101   *
 102  102   *      hdr_len: The size of IP header. It is used to find TCP or UDP header in
 103  103   *               the packet.
 104  104   *
 105  105   *      ira->ira_zoneid: The zone in which the returned connection must be; the
 106  106   *              zoneid corresponding to the ire_zoneid on the IRE located for
 107  107   *              the packet's destination address.
 108  108   *
 109  109   *      ira->ira_flags: Contains the IRAF_TX_MAC_EXEMPTABLE and
 110  110   *              IRAF_TX_SHARED_ADDR flags
 111  111   *
 112  112   *      For TCP connections, the lookup order is as follows:
 113  113   *              5-tuple {src, dst, protocol, local port, remote port}
 114  114   *                      lookup in ipcl_conn_fanout table.
 115  115   *              3-tuple {dst, remote port, protocol} lookup in
 116  116   *                      ipcl_bind_fanout table.
 117  117   *
  118  118   *      For UDP connections, a 5-tuple {src, dst, protocol, local port,
  119  119   *      remote port} lookup is done on ipcl_udp_fanout. Note that
  120  120   *      these interfaces do not handle the case where a packet belongs
  121  121   *      to multiple UDP clients; that case is handled in IP itself.
 122  122   *
 123  123   * If the destination IRE is ALL_ZONES (indicated by zoneid), then we must
 124  124   * determine which actual zone gets the segment.  This is used only in a
 125  125   * labeled environment.  The matching rules are:
 126  126   *
 127  127   *      - If it's not a multilevel port, then the label on the packet selects
 128  128   *        the zone.  Unlabeled packets are delivered to the global zone.
 129  129   *
 130  130   *      - If it's a multilevel port, then only the zone registered to receive
 131  131   *        packets on that port matches.
 132  132   *
 133  133   * Also, in a labeled environment, packet labels need to be checked.  For fully
 134  134   * bound TCP connections, we can assume that the packet label was checked
 135  135   * during connection establishment, and doesn't need to be checked on each
 136  136   * packet.  For others, though, we need to check for strict equality or, for
 137  137   * multilevel ports, membership in the range or set.  This part currently does
 138  138   * a tnrh lookup on each packet, but could be optimized to use cached results
 139  139   * if that were necessary.  (SCTP doesn't come through here, but if it did,
 140  140   * we would apply the same rules as TCP.)
 141  141   *
 142  142   * An implication of the above is that fully-bound TCP sockets must always use
 143  143   * distinct 4-tuples; they can't be discriminated by label alone.
 144  144   *
 145  145   * Note that we cannot trust labels on packets sent to fully-bound UDP sockets,
 146  146   * as there's no connection set-up handshake and no shared state.
 147  147   *
 148  148   * Labels on looped-back packets within a single zone do not need to be
 149  149   * checked, as all processes in the same zone have the same label.
 150  150   *
 151  151   * Finally, for unlabeled packets received by a labeled system, special rules
 152  152   * apply.  We consider only the MLP if there is one.  Otherwise, we prefer a
 153  153   * socket in the zone whose label matches the default label of the sender, if
 154  154   * any.  In any event, the receiving socket must have SO_MAC_EXEMPT set and the
 155  155   * receiver's label must dominate the sender's default label.
 156  156   *
 157  157   * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcpha_t *, int, ip_stack);
 158  158   * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t,
 159  159   *                                       ip_stack);
 160  160   *
  161  161   *      Lookup routine to find an exact match for {src, dst, local port,
  162  162   *      remote port} for TCP connections in ipcl_conn_fanout. The addresses
  163  163   *      and ports are read from the IP and TCP headers respectively.
 164  164   *
 165  165   * conn_t       *ipcl_lookup_listener_v4(lport, laddr, protocol,
 166  166   *                                       zoneid, ip_stack);
 167  167   * conn_t       *ipcl_lookup_listener_v6(lport, laddr, protocol, ifindex,
 168  168   *                                       zoneid, ip_stack);
 169  169   *
 170  170   *      Lookup routine to find a listener with the tuple {lport, laddr,
 171  171   *      protocol} in the ipcl_bind_fanout table. For IPv6, an additional
 172  172   *      parameter interface index is also compared.
 173  173   *
 174  174   * void ipcl_walk(func, arg, ip_stack)
 175  175   *
 176  176   *      Apply 'func' to every connection available. The 'func' is called as
 177  177   *      (*func)(connp, arg). The walk is non-atomic so connections may be
 178  178   *      created and destroyed during the walk. The CONN_CONDEMNED and
 179  179   *      CONN_INCIPIENT flags ensure that connections which are newly created
 180  180   *      or being destroyed are not selected by the walker.
 181  181   *
 182  182   * Table Updates
 183  183   * -------------
 184  184   *
 185  185   * int ipcl_conn_insert(connp);
 186  186   * int ipcl_conn_insert_v4(connp);
 187  187   * int ipcl_conn_insert_v6(connp);
 188  188   *
 189  189   *      Insert 'connp' in the ipcl_conn_fanout.
  190  190   *      Arguments :
 191  191   *              connp           conn_t to be inserted
 192  192   *
 193  193   *      Return value :
 194  194   *              0               if connp was inserted
 195  195   *              EADDRINUSE      if the connection with the same tuple
 196  196   *                              already exists.
 197  197   *
 198  198   * int ipcl_bind_insert(connp);
 199  199   * int ipcl_bind_insert_v4(connp);
 200  200   * int ipcl_bind_insert_v6(connp);
 201  201   *
 202  202   *      Insert 'connp' in ipcl_bind_fanout.
  203  203   *      Arguments :
 204  204   *              connp           conn_t to be inserted
 205  205   *
 206  206   *
 207  207   * void ipcl_hash_remove(connp);
 208  208   *
 209  209   *      Removes the 'connp' from the connection fanout table.
 210  210   *
 211  211   * Connection Creation/Destruction
 212  212   * -------------------------------
 213  213   *
 214  214   * conn_t *ipcl_conn_create(type, sleep, netstack_t *)
 215  215   *
 216  216   *      Creates a new conn based on the type flag, inserts it into
 217  217   *      globalhash table.
 218  218   *
 219  219   *      type:   This flag determines the type of conn_t which needs to be
 220  220   *              created i.e., which kmem_cache it comes from.
 221  221   *              IPCL_TCPCONN    indicates a TCP connection
 222  222   *              IPCL_SCTPCONN   indicates a SCTP connection
 223  223   *              IPCL_UDPCONN    indicates a UDP conn_t.
 224  224   *              IPCL_RAWIPCONN  indicates a RAWIP/ICMP conn_t.
 225  225   *              IPCL_RTSCONN    indicates a RTS conn_t.
 226  226   *              IPCL_IPCCONN    indicates all other connections.
 227  227   *
 228  228   * void ipcl_conn_destroy(connp)
 229  229   *
 230  230   *      Destroys the connection state, removes it from the global
 231  231   *      connection hash table and frees its memory.
 232  232   */
 233  233  
 234  234  #include <sys/types.h>
 235  235  #include <sys/stream.h>
 236  236  #include <sys/stropts.h>
 237  237  #include <sys/sysmacros.h>
 238  238  #include <sys/strsubr.h>
 239  239  #include <sys/strsun.h>
 240  240  #define _SUN_TPI_VERSION 2
 241  241  #include <sys/ddi.h>
 242  242  #include <sys/cmn_err.h>
 243  243  #include <sys/debug.h>
 244  244  
 245  245  #include <sys/systm.h>
 246  246  #include <sys/param.h>
 247  247  #include <sys/kmem.h>
 248  248  #include <sys/isa_defs.h>
 249  249  #include <inet/common.h>
 250  250  #include <netinet/ip6.h>
 251  251  #include <netinet/icmp6.h>
 252  252  
 253  253  #include <inet/ip.h>
 254  254  #include <inet/ip_if.h>
 255  255  #include <inet/ip_ire.h>
 256  256  #include <inet/ip6.h>
 257  257  #include <inet/ip_ndp.h>
 258  258  #include <inet/ip_impl.h>
 259  259  #include <inet/udp_impl.h>
 260  260  #include <inet/sctp_ip.h>
 261  261  #include <inet/sctp/sctp_impl.h>
 262  262  #include <inet/rawip_impl.h>
 263  263  #include <inet/rts_impl.h>
 264  264  #include <inet/iptun/iptun_impl.h>
 265  265  
 266  266  #include <sys/cpuvar.h>
 267  267  
 268  268  #include <inet/ipclassifier.h>
 269  269  #include <inet/tcp.h>
 270  270  #include <inet/ipsec_impl.h>
 271  271  
 272  272  #include <sys/tsol/tnet.h>
 273  273  #include <sys/sockio.h>
 274  274  
  275  275  /*
             * Old value for compatibility. Settable in /etc/system.
             * ipcl_init() uses it for the conn fanout size only when
             * ipcl_conn_hash_size is 0.
             */
  276  276  uint_t tcp_conn_hash_size = 0;
  277  277  
  278  278  /*
             * New value. Zero means choose automatically.  Settable in /etc/system.
             *
             * When both ipcl_conn_hash_size and tcp_conn_hash_size are 0, ipcl_init()
             * derives the conn fanout size from free memory as
             * (freemem * PAGESIZE) / ipcl_conn_hash_memfactor, limited to
             * ipcl_conn_hash_maxsize.  Whatever value is chosen is then rounded to
             * one of the primes listed in P2Ps().
             */
  279  279  uint_t ipcl_conn_hash_size = 0;
  280  280  uint_t ipcl_conn_hash_memfactor = 8192;
  281  281  uint_t ipcl_conn_hash_maxsize = 82500;
  282  282  
  283  283  /*
             * bind/udp fanout table sizes (number of connf_t buckets); ipcl_init()
             * copies them into the ip_stack_t unchanged.
             */
  284  284  uint_t ipcl_bind_fanout_size = 512;
  285  285  uint_t ipcl_udp_fanout_size = 16384;
  286  286  
  287  287  /* Raw socket fanout size.  Must be a power of 2. */
  288  288  uint_t ipcl_raw_fanout_size = 256;
  289  289  
  290  290  /*
  291  291   * The IPCL_IPTUN_HASH() function works best with a prime table size.  We
  292  292   * expect that most large deployments would have hundreds of tunnels, and
  293  293   * thousands in the extreme case.
  294  294   */
  295  295  uint_t ipcl_iptun_fanout_size = 6143;
 296  296  
  297  297  /*
  298  298   * Power of 2^N Primes useful for hashing for N of 0-28,
  299  299   * these primes are the nearest prime <= 2^N - 2^(N-2).
             *
             * The initializer is indexed by N: element N is the prime for 2^N.
             * Elements 0-2 are 0 (no prime is listed for them) and a final 0 follows
             * the N = 28 entry.  ipcl_init() starts its search at element 9 and treats
             * a 0 element as "out of range", falling back to element 16.
  300  300   */
  301  301  
  302  302  #define P2Ps() {0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067,  \
  303  303                  6143, 12281, 24571, 49139, 98299, 196597, 393209,       \
  304  304                  786431, 1572853, 3145721, 6291449, 12582893, 25165813,  \
  305  305                  50331599, 100663291, 201326557, 0}
 306  306  
  307  307  /*
  308  308   * wrapper structure to ensure that conn and what follows it (tcp_t, etc)
  309  309   * are aligned on cache lines.
             *
             * The union is as large as its filler member, which is sized with
             * CACHE_ALIGN(conn_s).  ipcl_g_init() creates the TCP, UDP, raw IP and RTS
             * caches with an object size of sizeof (itc_t) plus the size of the
             * protocol structure, so the protocol structure is laid out directly
             * after this wrapper in the same cache object.
  310  310   */
  311  311  typedef union itc_s {
  312  312          conn_t  itc_conn;       /* the connection state itself */
  313  313          char    itcu_filler[CACHE_ALIGN(conn_s)];      /* padding only */
  314  314  } itc_t;
  315  315  
            /*
             * kmem caches from which conn_t structures are allocated.  The five
             * caches defined here are created in ipcl_g_init() and destroyed in
             * ipcl_g_destroy(); sctp_conn_cache is only declared here and is defined
             * elsewhere.  ipcl_conn_create() picks the cache from the conn type.
             */
  316  316  struct kmem_cache  *tcp_conn_cache;
  317  317  struct kmem_cache  *ip_conn_cache;
  318  318  extern struct kmem_cache  *sctp_conn_cache;
  319  319  struct kmem_cache  *udp_conn_cache;
  320  320  struct kmem_cache  *rawip_conn_cache;
  321  321  struct kmem_cache  *rts_conn_cache;
  322  322  
            /* TCP timer mblk helpers; defined outside this file. */
  323  323  extern void     tcp_timermp_free(tcp_t *);
  324  324  extern mblk_t   *tcp_timermp_alloc(int);
  325  325  
            /*
             * Constructor/destructor callbacks handed to kmem_cache_create() in
             * ipcl_g_init(), one pair per cache defined above.
             */
  326  326  static int      ip_conn_constructor(void *, void *, int);
  327  327  static void     ip_conn_destructor(void *, void *);
  328  328  
  329  329  static int      tcp_conn_constructor(void *, void *, int);
  330  330  static void     tcp_conn_destructor(void *, void *);
  331  331  
  332  332  static int      udp_conn_constructor(void *, void *, int);
  333  333  static void     udp_conn_destructor(void *, void *);
  334  334  
  335  335  static int      rawip_conn_constructor(void *, void *, int);
  336  336  static void     rawip_conn_destructor(void *, void *);
  337  337  
  338  338  static int      rts_conn_constructor(void *, void *, int);
  339  339  static void     rts_conn_destructor(void *, void *);
 340  340  
 341  341  /*
 342  342   * Global (for all stack instances) init routine
 343  343   */
 344  344  void
 345  345  ipcl_g_init(void)
 346  346  {
 347  347          ip_conn_cache = kmem_cache_create("ip_conn_cache",
 348  348              sizeof (conn_t), CACHE_ALIGN_SIZE,
 349  349              ip_conn_constructor, ip_conn_destructor,
 350  350              NULL, NULL, NULL, 0);
 351  351  
 352  352          tcp_conn_cache = kmem_cache_create("tcp_conn_cache",
 353  353              sizeof (itc_t) + sizeof (tcp_t), CACHE_ALIGN_SIZE,
 354  354              tcp_conn_constructor, tcp_conn_destructor,
 355  355              tcp_conn_reclaim, NULL, NULL, 0);
 356  356  
 357  357          udp_conn_cache = kmem_cache_create("udp_conn_cache",
 358  358              sizeof (itc_t) + sizeof (udp_t), CACHE_ALIGN_SIZE,
 359  359              udp_conn_constructor, udp_conn_destructor,
 360  360              NULL, NULL, NULL, 0);
 361  361  
 362  362          rawip_conn_cache = kmem_cache_create("rawip_conn_cache",
 363  363              sizeof (itc_t) + sizeof (icmp_t), CACHE_ALIGN_SIZE,
 364  364              rawip_conn_constructor, rawip_conn_destructor,
 365  365              NULL, NULL, NULL, 0);
 366  366  
 367  367          rts_conn_cache = kmem_cache_create("rts_conn_cache",
 368  368              sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE,
 369  369              rts_conn_constructor, rts_conn_destructor,
 370  370              NULL, NULL, NULL, 0);
 371  371  }
 372  372  
 373  373  /*
 374  374   * ipclassifier intialization routine, sets up hash tables.
 375  375   */
 376  376  void
 377  377  ipcl_init(ip_stack_t *ipst)
 378  378  {
 379  379          int i;
 380  380          int sizes[] = P2Ps();
 381  381  
 382  382          /*
 383  383           * Calculate size of conn fanout table from /etc/system settings
 384  384           */
 385  385          if (ipcl_conn_hash_size != 0) {
 386  386                  ipst->ips_ipcl_conn_fanout_size = ipcl_conn_hash_size;
 387  387          } else if (tcp_conn_hash_size != 0) {
 388  388                  ipst->ips_ipcl_conn_fanout_size = tcp_conn_hash_size;
 389  389          } else {
 390  390                  extern pgcnt_t freemem;
 391  391  
 392  392                  ipst->ips_ipcl_conn_fanout_size =
 393  393                      (freemem * PAGESIZE) / ipcl_conn_hash_memfactor;
 394  394  
 395  395                  if (ipst->ips_ipcl_conn_fanout_size > ipcl_conn_hash_maxsize) {
 396  396                          ipst->ips_ipcl_conn_fanout_size =
 397  397                              ipcl_conn_hash_maxsize;
 398  398                  }
 399  399          }
 400  400  
 401  401          for (i = 9; i < sizeof (sizes) / sizeof (*sizes) - 1; i++) {
 402  402                  if (sizes[i] >= ipst->ips_ipcl_conn_fanout_size) {
 403  403                          break;
 404  404                  }
 405  405          }
 406  406          if ((ipst->ips_ipcl_conn_fanout_size = sizes[i]) == 0) {
 407  407                  /* Out of range, use the 2^16 value */
 408  408                  ipst->ips_ipcl_conn_fanout_size = sizes[16];
 409  409          }
 410  410  
 411  411          /* Take values from /etc/system */
 412  412          ipst->ips_ipcl_bind_fanout_size = ipcl_bind_fanout_size;
 413  413          ipst->ips_ipcl_udp_fanout_size = ipcl_udp_fanout_size;
 414  414          ipst->ips_ipcl_raw_fanout_size = ipcl_raw_fanout_size;
 415  415          ipst->ips_ipcl_iptun_fanout_size = ipcl_iptun_fanout_size;
 416  416  
 417  417          ASSERT(ipst->ips_ipcl_conn_fanout == NULL);
 418  418  
 419  419          ipst->ips_ipcl_conn_fanout = kmem_zalloc(
 420  420              ipst->ips_ipcl_conn_fanout_size * sizeof (connf_t), KM_SLEEP);
 421  421  
 422  422          for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
 423  423                  mutex_init(&ipst->ips_ipcl_conn_fanout[i].connf_lock, NULL,
 424  424                      MUTEX_DEFAULT, NULL);
 425  425          }
 426  426  
 427  427          ipst->ips_ipcl_bind_fanout = kmem_zalloc(
 428  428              ipst->ips_ipcl_bind_fanout_size * sizeof (connf_t), KM_SLEEP);
 429  429  
 430  430          for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
 431  431                  mutex_init(&ipst->ips_ipcl_bind_fanout[i].connf_lock, NULL,
 432  432                      MUTEX_DEFAULT, NULL);
 433  433          }
 434  434  
 435  435          ipst->ips_ipcl_proto_fanout_v4 = kmem_zalloc(IPPROTO_MAX *
 436  436              sizeof (connf_t), KM_SLEEP);
 437  437          for (i = 0; i < IPPROTO_MAX; i++) {
 438  438                  mutex_init(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock, NULL,
 439  439                      MUTEX_DEFAULT, NULL);
 440  440          }
 441  441  
 442  442          ipst->ips_ipcl_proto_fanout_v6 = kmem_zalloc(IPPROTO_MAX *
 443  443              sizeof (connf_t), KM_SLEEP);
 444  444          for (i = 0; i < IPPROTO_MAX; i++) {
 445  445                  mutex_init(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock, NULL,
 446  446                      MUTEX_DEFAULT, NULL);
 447  447          }
 448  448  
 449  449          ipst->ips_rts_clients = kmem_zalloc(sizeof (connf_t), KM_SLEEP);
 450  450          mutex_init(&ipst->ips_rts_clients->connf_lock,
 451  451              NULL, MUTEX_DEFAULT, NULL);
 452  452  
 453  453          ipst->ips_ipcl_udp_fanout = kmem_zalloc(
 454  454              ipst->ips_ipcl_udp_fanout_size * sizeof (connf_t), KM_SLEEP);
 455  455          for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
 456  456                  mutex_init(&ipst->ips_ipcl_udp_fanout[i].connf_lock, NULL,
 457  457                      MUTEX_DEFAULT, NULL);
 458  458          }
 459  459  
 460  460          ipst->ips_ipcl_iptun_fanout = kmem_zalloc(
 461  461              ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t), KM_SLEEP);
 462  462          for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
 463  463                  mutex_init(&ipst->ips_ipcl_iptun_fanout[i].connf_lock, NULL,
 464  464                      MUTEX_DEFAULT, NULL);
 465  465          }
 466  466  
 467  467          ipst->ips_ipcl_raw_fanout = kmem_zalloc(
 468  468              ipst->ips_ipcl_raw_fanout_size * sizeof (connf_t), KM_SLEEP);
 469  469          for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
 470  470                  mutex_init(&ipst->ips_ipcl_raw_fanout[i].connf_lock, NULL,
 471  471                      MUTEX_DEFAULT, NULL);
 472  472          }
 473  473  
 474  474          ipst->ips_ipcl_globalhash_fanout = kmem_zalloc(
 475  475              sizeof (connf_t) * CONN_G_HASH_SIZE, KM_SLEEP);
 476  476          for (i = 0; i < CONN_G_HASH_SIZE; i++) {
 477  477                  mutex_init(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock,
 478  478                      NULL, MUTEX_DEFAULT, NULL);
 479  479          }
 480  480  }
 481  481  
 482  482  void
 483  483  ipcl_g_destroy(void)
 484  484  {
 485  485          kmem_cache_destroy(ip_conn_cache);
 486  486          kmem_cache_destroy(tcp_conn_cache);
 487  487          kmem_cache_destroy(udp_conn_cache);
 488  488          kmem_cache_destroy(rawip_conn_cache);
 489  489          kmem_cache_destroy(rts_conn_cache);
 490  490  }
 491  491  
 492  492  /*
 493  493   * All user-level and kernel use of the stack must be gone
 494  494   * by now.
 495  495   */
 496  496  void
 497  497  ipcl_destroy(ip_stack_t *ipst)
 498  498  {
 499  499          int i;
 500  500  
 501  501          for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) {
 502  502                  ASSERT(ipst->ips_ipcl_conn_fanout[i].connf_head == NULL);
 503  503                  mutex_destroy(&ipst->ips_ipcl_conn_fanout[i].connf_lock);
 504  504          }
 505  505          kmem_free(ipst->ips_ipcl_conn_fanout, ipst->ips_ipcl_conn_fanout_size *
 506  506              sizeof (connf_t));
 507  507          ipst->ips_ipcl_conn_fanout = NULL;
 508  508  
 509  509          for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) {
 510  510                  ASSERT(ipst->ips_ipcl_bind_fanout[i].connf_head == NULL);
 511  511                  mutex_destroy(&ipst->ips_ipcl_bind_fanout[i].connf_lock);
 512  512          }
 513  513          kmem_free(ipst->ips_ipcl_bind_fanout, ipst->ips_ipcl_bind_fanout_size *
 514  514              sizeof (connf_t));
 515  515          ipst->ips_ipcl_bind_fanout = NULL;
 516  516  
 517  517          for (i = 0; i < IPPROTO_MAX; i++) {
 518  518                  ASSERT(ipst->ips_ipcl_proto_fanout_v4[i].connf_head == NULL);
 519  519                  mutex_destroy(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock);
 520  520          }
 521  521          kmem_free(ipst->ips_ipcl_proto_fanout_v4,
 522  522              IPPROTO_MAX * sizeof (connf_t));
 523  523          ipst->ips_ipcl_proto_fanout_v4 = NULL;
 524  524  
 525  525          for (i = 0; i < IPPROTO_MAX; i++) {
 526  526                  ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head == NULL);
 527  527                  mutex_destroy(&ipst->ips_ipcl_proto_fanout_v6[i].connf_lock);
 528  528          }
 529  529          kmem_free(ipst->ips_ipcl_proto_fanout_v6,
 530  530              IPPROTO_MAX * sizeof (connf_t));
 531  531          ipst->ips_ipcl_proto_fanout_v6 = NULL;
 532  532  
 533  533          for (i = 0; i < ipst->ips_ipcl_udp_fanout_size; i++) {
 534  534                  ASSERT(ipst->ips_ipcl_udp_fanout[i].connf_head == NULL);
 535  535                  mutex_destroy(&ipst->ips_ipcl_udp_fanout[i].connf_lock);
 536  536          }
 537  537          kmem_free(ipst->ips_ipcl_udp_fanout, ipst->ips_ipcl_udp_fanout_size *
 538  538              sizeof (connf_t));
 539  539          ipst->ips_ipcl_udp_fanout = NULL;
 540  540  
 541  541          for (i = 0; i < ipst->ips_ipcl_iptun_fanout_size; i++) {
 542  542                  ASSERT(ipst->ips_ipcl_iptun_fanout[i].connf_head == NULL);
 543  543                  mutex_destroy(&ipst->ips_ipcl_iptun_fanout[i].connf_lock);
 544  544          }
 545  545          kmem_free(ipst->ips_ipcl_iptun_fanout,
 546  546              ipst->ips_ipcl_iptun_fanout_size * sizeof (connf_t));
 547  547          ipst->ips_ipcl_iptun_fanout = NULL;
 548  548  
 549  549          for (i = 0; i < ipst->ips_ipcl_raw_fanout_size; i++) {
 550  550                  ASSERT(ipst->ips_ipcl_raw_fanout[i].connf_head == NULL);
 551  551                  mutex_destroy(&ipst->ips_ipcl_raw_fanout[i].connf_lock);
 552  552          }
 553  553          kmem_free(ipst->ips_ipcl_raw_fanout, ipst->ips_ipcl_raw_fanout_size *
 554  554              sizeof (connf_t));
 555  555          ipst->ips_ipcl_raw_fanout = NULL;
 556  556  
 557  557          for (i = 0; i < CONN_G_HASH_SIZE; i++) {
 558  558                  ASSERT(ipst->ips_ipcl_globalhash_fanout[i].connf_head == NULL);
 559  559                  mutex_destroy(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
 560  560          }
 561  561          kmem_free(ipst->ips_ipcl_globalhash_fanout,
 562  562              sizeof (connf_t) * CONN_G_HASH_SIZE);
 563  563          ipst->ips_ipcl_globalhash_fanout = NULL;
 564  564  
 565  565          ASSERT(ipst->ips_rts_clients->connf_head == NULL);
 566  566          mutex_destroy(&ipst->ips_rts_clients->connf_lock);
 567  567          kmem_free(ipst->ips_rts_clients, sizeof (connf_t));
 568  568          ipst->ips_rts_clients = NULL;
 569  569  }
 570  570  
 571  571  /*
 572  572   * conn creation routine. initialize the conn, sets the reference
 573  573   * and inserts it in the global hash table.
 574  574   */
 575  575  conn_t *
 576  576  ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns)
 577  577  {
 578  578          conn_t  *connp;
 579  579          struct kmem_cache *conn_cache;
 580  580  
 581  581          switch (type) {
 582  582          case IPCL_SCTPCONN:
 583  583                  if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL)
 584  584                          return (NULL);
 585  585                  sctp_conn_init(connp);
 586  586                  netstack_hold(ns);
 587  587                  connp->conn_netstack = ns;
 588  588                  connp->conn_ixa->ixa_ipst = ns->netstack_ip;
 589  589                  connp->conn_ixa->ixa_conn_id = (long)connp;
 590  590                  ipcl_globalhash_insert(connp);
 591  591                  return (connp);
 592  592  
 593  593          case IPCL_TCPCONN:
 594  594                  conn_cache = tcp_conn_cache;
 595  595                  break;
 596  596  
 597  597          case IPCL_UDPCONN:
 598  598                  conn_cache = udp_conn_cache;
 599  599                  break;
 600  600  
 601  601          case IPCL_RAWIPCONN:
 602  602                  conn_cache = rawip_conn_cache;
 603  603                  break;
 604  604  
 605  605          case IPCL_RTSCONN:
 606  606                  conn_cache = rts_conn_cache;
 607  607                  break;
 608  608  
 609  609          case IPCL_IPCCONN:
 610  610                  conn_cache = ip_conn_cache;
 611  611                  break;
 612  612  
 613  613          default:
 614  614                  connp = NULL;
 615  615                  ASSERT(0);
 616  616          }
 617  617  
 618  618          if ((connp = kmem_cache_alloc(conn_cache, sleep)) == NULL)
 619  619                  return (NULL);
 620  620  
 621  621          connp->conn_ref = 1;
 622  622          netstack_hold(ns);
 623  623          connp->conn_netstack = ns;
 624  624          connp->conn_ixa->ixa_ipst = ns->netstack_ip;
 625  625          connp->conn_ixa->ixa_conn_id = (long)connp;
 626  626          ipcl_globalhash_insert(connp);
 627  627          return (connp);
 628  628  }
 629  629  
 630  630  void
 631  631  ipcl_conn_destroy(conn_t *connp)
 632  632  {
 633  633          mblk_t  *mp;
 634  634          netstack_t      *ns = connp->conn_netstack;
 635  635  
 636  636          ASSERT(!MUTEX_HELD(&connp->conn_lock));
 637  637          ASSERT(connp->conn_ref == 0);
 638  638          ASSERT(connp->conn_ioctlref == 0);
 639  639  
 640  640          DTRACE_PROBE1(conn__destroy, conn_t *, connp);
 641  641  
 642  642          if (connp->conn_cred != NULL) {
 643  643                  crfree(connp->conn_cred);
 644  644                  connp->conn_cred = NULL;
 645  645                  /* ixa_cred done in ipcl_conn_cleanup below */
 646  646          }
 647  647  
 648  648          if (connp->conn_ht_iphc != NULL) {
 649  649                  kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated);
 650  650                  connp->conn_ht_iphc = NULL;
 651  651                  connp->conn_ht_iphc_allocated = 0;
 652  652                  connp->conn_ht_iphc_len = 0;
 653  653                  connp->conn_ht_ulp = NULL;
 654  654                  connp->conn_ht_ulp_len = 0;
 655  655          }
 656  656          ip_pkt_free(&connp->conn_xmit_ipp);
 657  657  
 658  658          ipcl_globalhash_remove(connp);
 659  659  
 660  660          if (connp->conn_latch != NULL) {
 661  661                  IPLATCH_REFRELE(connp->conn_latch);
 662  662                  connp->conn_latch = NULL;
 663  663          }
 664  664          if (connp->conn_latch_in_policy != NULL) {
 665  665                  IPPOL_REFRELE(connp->conn_latch_in_policy);
 666  666                  connp->conn_latch_in_policy = NULL;
 667  667          }
 668  668          if (connp->conn_latch_in_action != NULL) {
 669  669                  IPACT_REFRELE(connp->conn_latch_in_action);
 670  670                  connp->conn_latch_in_action = NULL;
 671  671          }
 672  672          if (connp->conn_policy != NULL) {
 673  673                  IPPH_REFRELE(connp->conn_policy, ns);
 674  674                  connp->conn_policy = NULL;
 675  675          }
 676  676  
 677  677          if (connp->conn_ipsec_opt_mp != NULL) {
 678  678                  freemsg(connp->conn_ipsec_opt_mp);
 679  679                  connp->conn_ipsec_opt_mp = NULL;
 680  680          }
 681  681  
 682  682          if (connp->conn_flags & IPCL_TCPCONN) {
 683  683                  tcp_t *tcp = connp->conn_tcp;
 684  684  
 685  685                  tcp_free(tcp);
 686  686                  mp = tcp->tcp_timercache;
 687  687  
 688  688                  tcp->tcp_tcps = NULL;
 689  689  
 690  690                  /*
 691  691                   * tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate
 692  692                   * the mblk.
 693  693                   */
 694  694                  if (tcp->tcp_rsrv_mp != NULL) {
 695  695                          freeb(tcp->tcp_rsrv_mp);
 696  696                          tcp->tcp_rsrv_mp = NULL;
 697  697                          mutex_destroy(&tcp->tcp_rsrv_mp_lock);
 698  698                  }
 699  699  
 700  700                  ipcl_conn_cleanup(connp);
 701  701                  connp->conn_flags = IPCL_TCPCONN;
 702  702                  if (ns != NULL) {
 703  703                          ASSERT(tcp->tcp_tcps == NULL);
 704  704                          connp->conn_netstack = NULL;
 705  705                          connp->conn_ixa->ixa_ipst = NULL;
 706  706                          netstack_rele(ns);
 707  707                  }
 708  708  
 709  709                  bzero(tcp, sizeof (tcp_t));
 710  710  
 711  711                  tcp->tcp_timercache = mp;
 712  712                  tcp->tcp_connp = connp;
 713  713                  kmem_cache_free(tcp_conn_cache, connp);
 714  714                  return;
 715  715          }
 716  716  
 717  717          if (connp->conn_flags & IPCL_SCTPCONN) {
 718  718                  ASSERT(ns != NULL);
 719  719                  sctp_free(connp);
 720  720                  return;
 721  721          }
 722  722  
 723  723          ipcl_conn_cleanup(connp);
 724  724          if (ns != NULL) {
 725  725                  connp->conn_netstack = NULL;
 726  726                  connp->conn_ixa->ixa_ipst = NULL;
 727  727                  netstack_rele(ns);
 728  728          }
 729  729  
 730  730          /* leave conn_priv aka conn_udp, conn_icmp, etc in place. */
 731  731          if (connp->conn_flags & IPCL_UDPCONN) {
 732  732                  connp->conn_flags = IPCL_UDPCONN;
 733  733                  kmem_cache_free(udp_conn_cache, connp);
 734  734          } else if (connp->conn_flags & IPCL_RAWIPCONN) {
 735  735                  connp->conn_flags = IPCL_RAWIPCONN;
 736  736                  connp->conn_proto = IPPROTO_ICMP;
 737  737                  connp->conn_ixa->ixa_protocol = connp->conn_proto;
 738  738                  kmem_cache_free(rawip_conn_cache, connp);
 739  739          } else if (connp->conn_flags & IPCL_RTSCONN) {
 740  740                  connp->conn_flags = IPCL_RTSCONN;
 741  741                  kmem_cache_free(rts_conn_cache, connp);
 742  742          } else {
 743  743                  connp->conn_flags = IPCL_IPCCONN;
 744  744                  ASSERT(connp->conn_flags & IPCL_IPCCONN);
 745  745                  ASSERT(connp->conn_priv == NULL);
 746  746                  kmem_cache_free(ip_conn_cache, connp);
 747  747          }
 748  748  }
 749  749  
 750  750  /*
 751  751   * Running in cluster mode - deregister listener information
 752  752   */
 753  753  static void
 754  754  ipcl_conn_unlisten(conn_t *connp)
 755  755  {
 756  756          ASSERT((connp->conn_flags & IPCL_CL_LISTENER) != 0);
 757  757          ASSERT(connp->conn_lport != 0);
 758  758  
 759  759          if (cl_inet_unlisten != NULL) {
 760  760                  sa_family_t     addr_family;
 761  761                  uint8_t         *laddrp;
 762  762  
 763  763                  if (connp->conn_ipversion == IPV6_VERSION) {
 764  764                          addr_family = AF_INET6;
 765  765                          laddrp = (uint8_t *)&connp->conn_bound_addr_v6;
 766  766                  } else {
 767  767                          addr_family = AF_INET;
 768  768                          laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
 769  769                  }
 770  770                  (*cl_inet_unlisten)(connp->conn_netstack->netstack_stackid,
 771  771                      IPPROTO_TCP, addr_family, laddrp, connp->conn_lport, NULL);
 772  772          }
 773  773          connp->conn_flags &= ~IPCL_CL_LISTENER;
 774  774  }
 775  775  
 776  776  /*
 777  777   * We set the IPCL_REMOVED flag (instead of clearing the flag indicating
 778  778   * which table the conn belonged to). So for debugging we can see which hash
 779  779   * table this connection was in.
 780  780   */
 781  781  #define IPCL_HASH_REMOVE(connp) {                                       \
 782  782          connf_t *connfp = (connp)->conn_fanout;                         \
 783  783          ASSERT(!MUTEX_HELD(&((connp)->conn_lock)));                     \
 784  784          if (connfp != NULL) {                                           \
 785  785                  mutex_enter(&connfp->connf_lock);                       \
 786  786                  if ((connp)->conn_next != NULL)                         \
 787  787                          (connp)->conn_next->conn_prev =                 \
 788  788                              (connp)->conn_prev;                         \
 789  789                  if ((connp)->conn_prev != NULL)                         \
 790  790                          (connp)->conn_prev->conn_next =                 \
 791  791                              (connp)->conn_next;                         \
 792  792                  else                                                    \
 793  793                          connfp->connf_head = (connp)->conn_next;        \
 794  794                  (connp)->conn_fanout = NULL;                            \
 795  795                  (connp)->conn_next = NULL;                              \
 796  796                  (connp)->conn_prev = NULL;                              \
 797  797                  (connp)->conn_flags |= IPCL_REMOVED;                    \
 798  798                  if (((connp)->conn_flags & IPCL_CL_LISTENER) != 0)      \
 799  799                          ipcl_conn_unlisten((connp));                    \
 800  800                  CONN_DEC_REF((connp));                                  \
 801  801                  mutex_exit(&connfp->connf_lock);                        \
 802  802          }                                                               \
 803  803  }
 804  804  
 805  805  void
 806  806  ipcl_hash_remove(conn_t *connp)
 807  807  {
 808  808          uint8_t         protocol = connp->conn_proto;
 809  809  
 810  810          IPCL_HASH_REMOVE(connp);
 811  811          if (protocol == IPPROTO_RSVP)
 812  812                  ill_set_inputfn_all(connp->conn_netstack->netstack_ip);
 813  813  }
 814  814  
 815  815  /*
 816  816   * The whole purpose of this function is allow removal of
 817  817   * a conn_t from the connected hash for timewait reclaim.
 818  818   * This is essentially a TW reclaim fastpath where timewait
 819  819   * collector checks under fanout lock (so no one else can
 820  820   * get access to the conn_t) that refcnt is 2 i.e. one for
 821  821   * TCP and one for the classifier hash list. If ref count
 822  822   * is indeed 2, we can just remove the conn under lock and
 823  823   * avoid cleaning up the conn under squeue. This gives us
 824  824   * improved performance.
 825  825   */
 826  826  void
 827  827  ipcl_hash_remove_locked(conn_t *connp, connf_t  *connfp)
 828  828  {
 829  829          ASSERT(MUTEX_HELD(&connfp->connf_lock));
 830  830          ASSERT(MUTEX_HELD(&connp->conn_lock));
 831  831          ASSERT((connp->conn_flags & IPCL_CL_LISTENER) == 0);
 832  832  
 833  833          if ((connp)->conn_next != NULL) {
 834  834                  (connp)->conn_next->conn_prev = (connp)->conn_prev;
 835  835          }
 836  836          if ((connp)->conn_prev != NULL) {
 837  837                  (connp)->conn_prev->conn_next = (connp)->conn_next;
 838  838          } else {
 839  839                  connfp->connf_head = (connp)->conn_next;
 840  840          }
 841  841          (connp)->conn_fanout = NULL;
 842  842          (connp)->conn_next = NULL;
 843  843          (connp)->conn_prev = NULL;
 844  844          (connp)->conn_flags |= IPCL_REMOVED;
 845  845          ASSERT((connp)->conn_ref == 2);
 846  846          (connp)->conn_ref--;
 847  847  }
 848  848  
 849  849  #define IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp) {              \
 850  850          ASSERT((connp)->conn_fanout == NULL);                           \
 851  851          ASSERT((connp)->conn_next == NULL);                             \
 852  852          ASSERT((connp)->conn_prev == NULL);                             \
 853  853          if ((connfp)->connf_head != NULL) {                             \
 854  854                  (connfp)->connf_head->conn_prev = (connp);              \
 855  855                  (connp)->conn_next = (connfp)->connf_head;              \
 856  856          }                                                               \
 857  857          (connp)->conn_fanout = (connfp);                                \
 858  858          (connfp)->connf_head = (connp);                                 \
 859  859          (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |   \
 860  860              IPCL_CONNECTED;                                             \
 861  861          CONN_INC_REF(connp);                                            \
 862  862  }
 863  863  
 864  864  #define IPCL_HASH_INSERT_CONNECTED(connfp, connp) {                     \
 865  865          IPCL_HASH_REMOVE((connp));                                      \
 866  866          mutex_enter(&(connfp)->connf_lock);                             \
 867  867          IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);               \
 868  868          mutex_exit(&(connfp)->connf_lock);                              \
 869  869  }
 870  870  
 871  871  #define IPCL_HASH_INSERT_BOUND(connfp, connp) {                         \
 872  872          conn_t *pconnp = NULL, *nconnp;                                 \
 873  873          IPCL_HASH_REMOVE((connp));                                      \
 874  874          mutex_enter(&(connfp)->connf_lock);                             \
 875  875          nconnp = (connfp)->connf_head;                                  \
 876  876          while (nconnp != NULL &&                                        \
 877  877              !_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6)) {               \
 878  878                  pconnp = nconnp;                                        \
 879  879                  nconnp = nconnp->conn_next;                             \
 880  880          }                                                               \
 881  881          if (pconnp != NULL) {                                           \
 882  882                  pconnp->conn_next = (connp);                            \
 883  883                  (connp)->conn_prev = pconnp;                            \
 884  884          } else {                                                        \
 885  885                  (connfp)->connf_head = (connp);                         \
 886  886          }                                                               \
 887  887          if (nconnp != NULL) {                                           \
 888  888                  (connp)->conn_next = nconnp;                            \
 889  889                  nconnp->conn_prev = (connp);                            \
 890  890          }                                                               \
 891  891          (connp)->conn_fanout = (connfp);                                \
 892  892          (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |   \
 893  893              IPCL_BOUND;                                                 \
 894  894          CONN_INC_REF(connp);                                            \
 895  895          mutex_exit(&(connfp)->connf_lock);                              \
 896  896  }
 897  897  
 898  898  #define IPCL_HASH_INSERT_WILDCARD(connfp, connp) {                      \
 899  899          conn_t **list, *prev, *next;                                    \
 900  900          boolean_t isv4mapped =                                          \
 901  901              IN6_IS_ADDR_V4MAPPED(&(connp)->conn_laddr_v6);              \
 902  902          IPCL_HASH_REMOVE((connp));                                      \
 903  903          mutex_enter(&(connfp)->connf_lock);                             \
 904  904          list = &(connfp)->connf_head;                                   \
 905  905          prev = NULL;                                                    \
 906  906          while ((next = *list) != NULL) {                                \
 907  907                  if (isv4mapped &&                                       \
 908  908                      IN6_IS_ADDR_UNSPECIFIED(&next->conn_laddr_v6) &&    \
 909  909                      connp->conn_zoneid == next->conn_zoneid) {          \
 910  910                          (connp)->conn_next = next;                      \
 911  911                          if (prev != NULL)                               \
 912  912                                  prev = next->conn_prev;                 \
 913  913                          next->conn_prev = (connp);                      \
 914  914                          break;                                          \
 915  915                  }                                                       \
 916  916                  list = &next->conn_next;                                \
 917  917                  prev = next;                                            \
 918  918          }                                                               \
 919  919          (connp)->conn_prev = prev;                                      \
 920  920          *list = (connp);                                                \
 921  921          (connp)->conn_fanout = (connfp);                                \
 922  922          (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) |   \
 923  923              IPCL_BOUND;                                                 \
 924  924          CONN_INC_REF((connp));                                          \
 925  925          mutex_exit(&(connfp)->connf_lock);                              \
 926  926  }
 927  927  
 928  928  void
 929  929  ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
 930  930  {
 931  931          IPCL_HASH_INSERT_WILDCARD(connfp, connp);
 932  932  }
 933  933  
 934  934  /*
 935  935   * Because the classifier is used to classify inbound packets, the destination
 936  936   * address is meant to be our local tunnel address (tunnel source), and the
 937  937   * source the remote tunnel address (tunnel destination).
 938  938   *
 939  939   * Note that conn_proto can't be used for fanout since the upper protocol
 940  940   * can be both 41 and 4 when IPv6 and IPv4 are over the same tunnel.
 941  941   */
 942  942  conn_t *
 943  943  ipcl_iptun_classify_v4(ipaddr_t *src, ipaddr_t *dst, ip_stack_t *ipst)
 944  944  {
 945  945          connf_t *connfp;
 946  946          conn_t  *connp;
 947  947  
 948  948          /* first look for IPv4 tunnel links */
 949  949          connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst, *src)];
 950  950          mutex_enter(&connfp->connf_lock);
 951  951          for (connp = connfp->connf_head; connp != NULL;
 952  952              connp = connp->conn_next) {
 953  953                  if (IPCL_IPTUN_MATCH(connp, *dst, *src))
 954  954                          break;
 955  955          }
 956  956          if (connp != NULL)
 957  957                  goto done;
 958  958  
 959  959          mutex_exit(&connfp->connf_lock);
 960  960  
 961  961          /* We didn't find an IPv4 tunnel, try a 6to4 tunnel */
 962  962          connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(*dst,
 963  963              INADDR_ANY)];
 964  964          mutex_enter(&connfp->connf_lock);
 965  965          for (connp = connfp->connf_head; connp != NULL;
 966  966              connp = connp->conn_next) {
 967  967                  if (IPCL_IPTUN_MATCH(connp, *dst, INADDR_ANY))
 968  968                          break;
 969  969          }
 970  970  done:
 971  971          if (connp != NULL)
 972  972                  CONN_INC_REF(connp);
 973  973          mutex_exit(&connfp->connf_lock);
 974  974          return (connp);
 975  975  }
 976  976  
 977  977  conn_t *
 978  978  ipcl_iptun_classify_v6(in6_addr_t *src, in6_addr_t *dst, ip_stack_t *ipst)
 979  979  {
 980  980          connf_t *connfp;
 981  981          conn_t  *connp;
 982  982  
 983  983          /* Look for an IPv6 tunnel link */
 984  984          connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(dst, src)];
 985  985          mutex_enter(&connfp->connf_lock);
 986  986          for (connp = connfp->connf_head; connp != NULL;
 987  987              connp = connp->conn_next) {
 988  988                  if (IPCL_IPTUN_MATCH_V6(connp, dst, src)) {
 989  989                          CONN_INC_REF(connp);
 990  990                          break;
 991  991                  }
 992  992          }
 993  993          mutex_exit(&connfp->connf_lock);
 994  994          return (connp);
 995  995  }
 996  996  
 997  997  /*
 998  998   * This function is used only for inserting SCTP raw socket now.
 999  999   * This may change later.
1000 1000   *
1001 1001   * Note that only one raw socket can be bound to a port.  The param
1002 1002   * lport is in network byte order.
1003 1003   */
1004 1004  static int
1005 1005  ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
1006 1006  {
1007 1007          connf_t *connfp;
1008 1008          conn_t  *oconnp;
1009 1009          ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
1010 1010  
1011 1011          connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
1012 1012  
1013 1013          /* Check for existing raw socket already bound to the port. */
1014 1014          mutex_enter(&connfp->connf_lock);
1015 1015          for (oconnp = connfp->connf_head; oconnp != NULL;
1016 1016              oconnp = oconnp->conn_next) {
1017 1017                  if (oconnp->conn_lport == lport &&
1018 1018                      oconnp->conn_zoneid == connp->conn_zoneid &&
1019 1019                      oconnp->conn_family == connp->conn_family &&
1020 1020                      ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
1021 1021                      IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_laddr_v6) ||
1022 1022                      IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6) ||
1023 1023                      IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_laddr_v6)) ||
1024 1024                      IN6_ARE_ADDR_EQUAL(&oconnp->conn_laddr_v6,
1025 1025                      &connp->conn_laddr_v6))) {
1026 1026                          break;
1027 1027                  }
1028 1028          }
1029 1029          mutex_exit(&connfp->connf_lock);
1030 1030          if (oconnp != NULL)
1031 1031                  return (EADDRNOTAVAIL);
1032 1032  
1033 1033          if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) ||
1034 1034              IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1035 1035                  if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
1036 1036                      IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) {
1037 1037                          IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1038 1038                  } else {
1039 1039                          IPCL_HASH_INSERT_BOUND(connfp, connp);
1040 1040                  }
1041 1041          } else {
1042 1042                  IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1043 1043          }
1044 1044          return (0);
1045 1045  }
1046 1046  
1047 1047  static int
1048 1048  ipcl_iptun_hash_insert(conn_t *connp, ip_stack_t *ipst)
1049 1049  {
1050 1050          connf_t *connfp;
1051 1051          conn_t  *tconnp;
1052 1052          ipaddr_t laddr = connp->conn_laddr_v4;
1053 1053          ipaddr_t faddr = connp->conn_faddr_v4;
1054 1054  
1055 1055          connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(laddr, faddr)];
1056 1056          mutex_enter(&connfp->connf_lock);
1057 1057          for (tconnp = connfp->connf_head; tconnp != NULL;
1058 1058              tconnp = tconnp->conn_next) {
1059 1059                  if (IPCL_IPTUN_MATCH(tconnp, laddr, faddr)) {
1060 1060                          /* A tunnel is already bound to these addresses. */
1061 1061                          mutex_exit(&connfp->connf_lock);
1062 1062                          return (EADDRINUSE);
1063 1063                  }
1064 1064          }
1065 1065          IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1066 1066          mutex_exit(&connfp->connf_lock);
1067 1067          return (0);
1068 1068  }
1069 1069  
1070 1070  static int
1071 1071  ipcl_iptun_hash_insert_v6(conn_t *connp, ip_stack_t *ipst)
1072 1072  {
1073 1073          connf_t *connfp;
1074 1074          conn_t  *tconnp;
1075 1075          in6_addr_t *laddr = &connp->conn_laddr_v6;
1076 1076          in6_addr_t *faddr = &connp->conn_faddr_v6;
1077 1077  
1078 1078          connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(laddr, faddr)];
1079 1079          mutex_enter(&connfp->connf_lock);
1080 1080          for (tconnp = connfp->connf_head; tconnp != NULL;
1081 1081              tconnp = tconnp->conn_next) {
1082 1082                  if (IPCL_IPTUN_MATCH_V6(tconnp, laddr, faddr)) {
1083 1083                          /* A tunnel is already bound to these addresses. */
1084 1084                          mutex_exit(&connfp->connf_lock);
1085 1085                          return (EADDRINUSE);
1086 1086                  }
1087 1087          }
1088 1088          IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1089 1089          mutex_exit(&connfp->connf_lock);
1090 1090          return (0);
1091 1091  }
1092 1092  
1093 1093  /*
1094 1094   * Check for a MAC exemption conflict on a labeled system.  Note that for
1095 1095   * protocols that use port numbers (UDP, TCP, SCTP), we do this check up in the
1096 1096   * transport layer.  This check is for binding all other protocols.
1097 1097   *
1098 1098   * Returns true if there's a conflict.
1099 1099   */
1100 1100  static boolean_t
1101 1101  check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst)
1102 1102  {
1103 1103          connf_t *connfp;
1104 1104          conn_t *tconn;
1105 1105  
1106 1106          connfp = &ipst->ips_ipcl_proto_fanout_v4[connp->conn_proto];
1107 1107          mutex_enter(&connfp->connf_lock);
1108 1108          for (tconn = connfp->connf_head; tconn != NULL;
1109 1109              tconn = tconn->conn_next) {
1110 1110                  /* We don't allow v4 fallback for v6 raw socket */
1111 1111                  if (connp->conn_family != tconn->conn_family)
1112 1112                          continue;
1113 1113                  /* If neither is exempt, then there's no conflict */
1114 1114                  if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
1115 1115                      (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
1116 1116                          continue;
1117 1117                  /* We are only concerned about sockets for a different zone */
1118 1118                  if (connp->conn_zoneid == tconn->conn_zoneid)
1119 1119                          continue;
1120 1120                  /* If both are bound to different specific addrs, ok */
1121 1121                  if (connp->conn_laddr_v4 != INADDR_ANY &&
1122 1122                      tconn->conn_laddr_v4 != INADDR_ANY &&
1123 1123                      connp->conn_laddr_v4 != tconn->conn_laddr_v4)
1124 1124                          continue;
1125 1125                  /* These two conflict; fail */
1126 1126                  break;
1127 1127          }
1128 1128          mutex_exit(&connfp->connf_lock);
1129 1129          return (tconn != NULL);
1130 1130  }
1131 1131  
1132 1132  static boolean_t
1133 1133  check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst)
1134 1134  {
1135 1135          connf_t *connfp;
1136 1136          conn_t *tconn;
1137 1137  
1138 1138          connfp = &ipst->ips_ipcl_proto_fanout_v6[connp->conn_proto];
1139 1139          mutex_enter(&connfp->connf_lock);
1140 1140          for (tconn = connfp->connf_head; tconn != NULL;
1141 1141              tconn = tconn->conn_next) {
1142 1142                  /* We don't allow v4 fallback for v6 raw socket */
1143 1143                  if (connp->conn_family != tconn->conn_family)
1144 1144                          continue;
1145 1145                  /* If neither is exempt, then there's no conflict */
1146 1146                  if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) &&
1147 1147                      (tconn->conn_mac_mode == CONN_MAC_DEFAULT))
1148 1148                          continue;
1149 1149                  /* We are only concerned about sockets for a different zone */
1150 1150                  if (connp->conn_zoneid == tconn->conn_zoneid)
1151 1151                          continue;
1152 1152                  /* If both are bound to different addrs, ok */
1153 1153                  if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) &&
1154 1154                      !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_laddr_v6) &&
1155 1155                      !IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6,
1156 1156                      &tconn->conn_laddr_v6))
1157 1157                          continue;
1158 1158                  /* These two conflict; fail */
1159 1159                  break;
1160 1160          }
1161 1161          mutex_exit(&connfp->connf_lock);
1162 1162          return (tconn != NULL);
1163 1163  }
1164 1164  
1165 1165  /*
1166 1166   * (v4, v6) bind hash insertion routines
1167 1167   * The caller has already setup the conn (conn_proto, conn_laddr_v6, conn_lport)
1168 1168   */
1169 1169  
1170 1170  int
1171 1171  ipcl_bind_insert(conn_t *connp)
1172 1172  {
1173 1173          if (connp->conn_ipversion == IPV6_VERSION)
1174 1174                  return (ipcl_bind_insert_v6(connp));
1175 1175          else
1176 1176                  return (ipcl_bind_insert_v4(connp));
1177 1177  }
1178 1178  
1179 1179  int
1180 1180  ipcl_bind_insert_v4(conn_t *connp)
1181 1181  {
1182 1182          connf_t *connfp;
1183 1183          int     ret = 0;
1184 1184          ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
1185 1185          uint16_t        lport = connp->conn_lport;
1186 1186          uint8_t         protocol = connp->conn_proto;
1187 1187  
1188 1188          if (IPCL_IS_IPTUN(connp))
1189 1189                  return (ipcl_iptun_hash_insert(connp, ipst));
1190 1190  
1191 1191          switch (protocol) {
1192 1192          default:
1193 1193                  if (is_system_labeled() &&
1194 1194                      check_exempt_conflict_v4(connp, ipst))
1195 1195                          return (EADDRINUSE);
1196 1196                  /* FALLTHROUGH */
1197 1197          case IPPROTO_UDP:
1198 1198                  if (protocol == IPPROTO_UDP) {
1199 1199                          connfp = &ipst->ips_ipcl_udp_fanout[
1200 1200                              IPCL_UDP_HASH(lport, ipst)];
1201 1201                  } else {
1202 1202                          connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
1203 1203                  }
1204 1204  
1205 1205                  if (connp->conn_faddr_v4 != INADDR_ANY) {
1206 1206                          IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1207 1207                  } else if (connp->conn_laddr_v4 != INADDR_ANY) {
1208 1208                          IPCL_HASH_INSERT_BOUND(connfp, connp);
1209 1209                  } else {
1210 1210                          IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1211 1211                  }
1212 1212                  if (protocol == IPPROTO_RSVP)
1213 1213                          ill_set_inputfn_all(ipst);
1214 1214                  break;
1215 1215  
1216 1216          case IPPROTO_TCP:
1217 1217                  /* Insert it in the Bind Hash */
1218 1218                  ASSERT(connp->conn_zoneid != ALL_ZONES);
1219 1219                  connfp = &ipst->ips_ipcl_bind_fanout[
1220 1220                      IPCL_BIND_HASH(lport, ipst)];
1221 1221                  if (connp->conn_laddr_v4 != INADDR_ANY) {
1222 1222                          IPCL_HASH_INSERT_BOUND(connfp, connp);
1223 1223                  } else {
1224 1224                          IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1225 1225                  }
1226 1226                  if (cl_inet_listen != NULL) {
1227 1227                          ASSERT(connp->conn_ipversion == IPV4_VERSION);
1228 1228                          connp->conn_flags |= IPCL_CL_LISTENER;
1229 1229                          (*cl_inet_listen)(
1230 1230                              connp->conn_netstack->netstack_stackid,
1231 1231                              IPPROTO_TCP, AF_INET,
1232 1232                              (uint8_t *)&connp->conn_bound_addr_v4, lport, NULL);
1233 1233                  }
1234 1234                  break;
1235 1235  
1236 1236          case IPPROTO_SCTP:
1237 1237                  ret = ipcl_sctp_hash_insert(connp, lport);
1238 1238                  break;
1239 1239          }
1240 1240  
1241 1241          return (ret);
1242 1242  }
1243 1243  
           /*
            * Insert connp into the IPv6 classifier fanout chosen by conn_proto.
            *
            *   - IP tunnel conns (IPCL_IS_IPTUN) are passed straight to
            *     ipcl_iptun_hash_insert_v6() and its result is returned.
            *   - TCP goes into ips_ipcl_bind_fanout, hashed on the local port.
            *   - UDP goes into ips_ipcl_udp_fanout, hashed on the local port.
            *   - SCTP is delegated to ipcl_sctp_hash_insert().
            *   - Every other protocol goes into ips_ipcl_proto_fanout_v6, indexed
            *     by protocol number, after check_exempt_conflict_v6() has been
            *     consulted on labeled systems.
            *
            * Returns 0, EADDRINUSE when check_exempt_conflict_v6() reports a
            * conflict, or whatever the tunnel/SCTP insertion routine returns.
            */
1244 1244  int
1245 1245  ipcl_bind_insert_v6(conn_t *connp)
1246 1246  {
1247 1247          connf_t         *connfp;
1248 1248          int             ret = 0;
1249 1249          ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
1250 1250          uint16_t        lport = connp->conn_lport;
1251 1251          uint8_t         protocol = connp->conn_proto;
1252 1252  
1253 1253          if (IPCL_IS_IPTUN(connp)) {
1254 1254                  return (ipcl_iptun_hash_insert_v6(connp, ipst));
1255 1255          }
1256 1256  
1257 1257          switch (protocol) {
1258 1258          default:
                           /* Only checked when the system is labeled. */
1259 1259                  if (is_system_labeled() &&
1260 1260                      check_exempt_conflict_v6(connp, ipst))
1261 1261                          return (EADDRINUSE);
1262 1262                  /* FALLTHROUGH */
1263 1263          case IPPROTO_UDP:
                           /*
                            * This arm is shared with the default case above, so
                            * re-test the protocol to choose between the UDP fanout
                            * and the per-protocol fanout.
                            */
1264 1264                  if (protocol == IPPROTO_UDP) {
1265 1265                          connfp = &ipst->ips_ipcl_udp_fanout[
1266 1266                              IPCL_UDP_HASH(lport, ipst)];
1267 1267                  } else {
1268 1268                          connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
1269 1269                  }
1270 1270  
                           /*
                            * Choose the insert variant from which addresses are set:
                            * a foreign address selects CONNECTED, a local address
                            * alone selects BOUND, neither selects WILDCARD.
                            */
1271 1271                  if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
1272 1272                          IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1273 1273                  } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
1274 1274                          IPCL_HASH_INSERT_BOUND(connfp, connp);
1275 1275                  } else {
1276 1276                          IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1277 1277                  }
1278 1278                  break;
1279 1279  
1280 1280          case IPPROTO_TCP:
1281 1281                  /* Insert it in the Bind Hash */
1282 1282                  ASSERT(connp->conn_zoneid != ALL_ZONES);
1283 1283                  connfp = &ipst->ips_ipcl_bind_fanout[
1284 1284                      IPCL_BIND_HASH(lport, ipst)];
1285 1285                  if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
1286 1286                          IPCL_HASH_INSERT_BOUND(connfp, connp);
1287 1287                  } else {
1288 1288                          IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1289 1289                  }
                           /*
                            * NOTE(review): cl_inet_listen looks like an optional
                            * clustering callback -- confirm.  When it is set, the
                            * conn is flagged IPCL_CL_LISTENER and the callback is
                            * given the stack id, address family, bound address and
                            * local port.  The address family follows conn_ipversion
                            * because a conn handled here may still be IPv4.
                            */
1290 1290                  if (cl_inet_listen != NULL) {
1291 1291                          sa_family_t     addr_family;
1292 1292                          uint8_t         *laddrp;
1293 1293  
1294 1294                          if (connp->conn_ipversion == IPV6_VERSION) {
1295 1295                                  addr_family = AF_INET6;
1296 1296                                  laddrp =
1297 1297                                      (uint8_t *)&connp->conn_bound_addr_v6;
1298 1298                          } else {
1299 1299                                  addr_family = AF_INET;
1300 1300                                  laddrp = (uint8_t *)&connp->conn_bound_addr_v4;
1301 1301                          }
1302 1302                          connp->conn_flags |= IPCL_CL_LISTENER;
1303 1303                          (*cl_inet_listen)(
1304 1304                              connp->conn_netstack->netstack_stackid,
1305 1305                              IPPROTO_TCP, addr_family, laddrp, lport, NULL);
1306 1306                  }
1307 1307                  break;
1308 1308  
1309 1309          case IPPROTO_SCTP:
1310 1310                  ret = ipcl_sctp_hash_insert(connp, lport);
1311 1311                  break;
1312 1312          }
1313 1313  
1314 1314          return (ret);
1315 1315  }
1316 1316  
1317 1317  /*
1318 1318   * ipcl_conn_hash insertion routines.
1319 1319   * The caller has already set conn_proto and the addresses/ports in the conn_t.
1320 1320   */
1321 1321  
           /*
            * Version-independent entry point: dispatches on conn_ipversion to
            * ipcl_conn_insert_v6() or ipcl_conn_insert_v4() and returns that
            * routine's result unchanged.
            */
1322 1322  int
1323 1323  ipcl_conn_insert(conn_t *connp)
1324 1324  {
1325 1325          if (connp->conn_ipversion == IPV6_VERSION)
1326 1326                  return (ipcl_conn_insert_v6(connp));
1327 1327          else
1328 1328                  return (ipcl_conn_insert_v4(connp));
1329 1329  }
1330 1330  
           /*
            * Insert an IPv4 conn_t into the classifier fanout chosen by conn_proto.
            *
            * TCP conns go into ips_ipcl_conn_fanout (hashed on foreign address and
            * ports) after the bucket has been scanned for an existing conn with the
            * same tuple and a matching zone.  SCTP conns are removed from their
            * current hash and handed to ipcl_sctp_hash_insert().  UDP uses
            * ips_ipcl_udp_fanout and all remaining protocols use
            * ips_ipcl_proto_fanout_v4.  IP tunnel conns are passed to
            * ipcl_iptun_hash_insert() instead.
            *
            * Returns 0 once inserted, EADDRINUSE for a duplicate TCP tuple or when
            * check_exempt_conflict_v4() reports a conflict, or the result of the
            * tunnel/SCTP insertion routine.
            */
1331 1331  int
1332 1332  ipcl_conn_insert_v4(conn_t *connp)
1333 1333  {
1334 1334          connf_t         *connfp;
1335 1335          conn_t          *tconnp;
1336 1336          int             ret = 0;
1337 1337          ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
1338 1338          uint16_t        lport = connp->conn_lport;
1339 1339          uint8_t         protocol = connp->conn_proto;
1340 1340  
1341 1341          if (IPCL_IS_IPTUN(connp))
1342 1342                  return (ipcl_iptun_hash_insert(connp, ipst));
1343 1343  
1344 1344          switch (protocol) {
1345 1345          case IPPROTO_TCP:
1346 1346                  /*
1347 1347                   * For TCP, we check whether the connection tuple already
1348 1348                   * exists before allowing the connection to proceed.  We
1349 1349                   * also allow indexing on the zoneid. This is to allow
1350 1350                   * multiple shared stack zones to have the same tcp
1351 1351                   * connection tuple. In practice this only happens for
1352 1352                   * INADDR_LOOPBACK as it's the only local address which
1353 1353                   * doesn't have to be unique.
1354 1354                   */
1355 1355                  connfp = &ipst->ips_ipcl_conn_fanout[
1356 1356                      IPCL_CONN_HASH(connp->conn_faddr_v4,
1357 1357                      connp->conn_ports, ipst)];
                           /* The duplicate scan runs with the bucket lock held. */
1358 1358                  mutex_enter(&connfp->connf_lock);
1359 1359                  for (tconnp = connfp->connf_head; tconnp != NULL;
1360 1360                      tconnp = tconnp->conn_next) {
1361 1361                          if (IPCL_CONN_MATCH(tconnp, connp->conn_proto,
1362 1362                              connp->conn_faddr_v4, connp->conn_laddr_v4,
1363 1363                              connp->conn_ports) &&
1364 1364                              IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
1365 1365                                  /* Already have a conn. bail out */
1366 1366                                  mutex_exit(&connfp->connf_lock);
1367 1367                                  return (EADDRINUSE);
1368 1368                          }
1369 1369                  }
                           /* A non-NULL conn_fanout means connp is already hashed. */
1370 1370                  if (connp->conn_fanout != NULL) {
1371 1371                          /*
1372 1372                           * Probably a XTI/TLI application trying to do a
1373 1373                           * rebind. Let it happen.
1374 1374                           */
                                   /*
                                    * NOTE(review): the bucket lock is dropped around
                                    * IPCL_HASH_REMOVE(), presumably because the macro
                                    * takes a fanout lock itself -- verify against the
                                    * macro.  As a result the duplicate scan above is
                                    * not atomic with the insert below; confirm that
                                    * callers serialize inserts for this tuple.
                                    */
1375 1375                          mutex_exit(&connfp->connf_lock);
1376 1376                          IPCL_HASH_REMOVE(connp);
1377 1377                          mutex_enter(&connfp->connf_lock);
1378 1378                  }
1379 1379  
1380 1380                  ASSERT(connp->conn_recv != NULL);
1381 1381                  ASSERT(connp->conn_recvicmp != NULL);
1382 1382  
                           /* _LOCKED variant: connf_lock is already held here. */
1383 1383                  IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1384 1384                  mutex_exit(&connfp->connf_lock);
1385 1385                  break;
1386 1386  
1387 1387          case IPPROTO_SCTP:
1388 1388                  /*
1389 1389                   * The raw socket may have already been bound, remove it
1390 1390                   * from the hash first.
1391 1391                   */
1392 1392                  IPCL_HASH_REMOVE(connp);
1393 1393                  ret = ipcl_sctp_hash_insert(connp, lport);
1394 1394                  break;
1395 1395  
1396 1396          default:
1397 1397                  /*
1398 1398                   * Check for conflicts among MAC exempt bindings.  For
1399 1399                   * transports with port numbers, this is done by the upper
1400 1400                   * level per-transport binding logic.  For all others, it's
1401 1401                   * done here.
1402 1402                   */
1403 1403                  if (is_system_labeled() &&
1404 1404                      check_exempt_conflict_v4(connp, ipst))
1405 1405                          return (EADDRINUSE);
1406 1406                  /* FALLTHROUGH */
1407 1407  
1408 1408          case IPPROTO_UDP:
                           /*
                            * Shared with the default case: UDP hashes on the local
                            * port, everything else indexes by protocol number.
                            */
1409 1409                  if (protocol == IPPROTO_UDP) {
1410 1410                          connfp = &ipst->ips_ipcl_udp_fanout[
1411 1411                              IPCL_UDP_HASH(lport, ipst)];
1412 1412                  } else {
1413 1413                          connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol];
1414 1414                  }
1415 1415  
                           /*
                            * A foreign address selects CONNECTED, a local address
                            * alone selects BOUND, neither selects WILDCARD.
                            */
1416 1416                  if (connp->conn_faddr_v4 != INADDR_ANY) {
1417 1417                          IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1418 1418                  } else if (connp->conn_laddr_v4 != INADDR_ANY) {
1419 1419                          IPCL_HASH_INSERT_BOUND(connfp, connp);
1420 1420                  } else {
1421 1421                          IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1422 1422                  }
1423 1423                  break;
1424 1424          }
1425 1425  
1426 1426          return (ret);
1427 1427  }
1428 1428  
           /*
            * Insert an IPv6 conn_t into the classifier fanout chosen by conn_proto.
            *
            * TCP conns go into ips_ipcl_conn_fanout (hashed on foreign address and
            * ports) after the bucket has been scanned for an existing conn with the
            * same tuple, a compatible bound interface and a matching zone.  SCTP
            * conns are removed from their current hash and handed to
            * ipcl_sctp_hash_insert().  UDP uses ips_ipcl_udp_fanout and all remaining
            * protocols use ips_ipcl_proto_fanout_v6.  IP tunnel conns are passed to
            * ipcl_iptun_hash_insert_v6() instead.
            *
            * Returns 0 once inserted, EADDRINUSE for a duplicate TCP tuple or when
            * check_exempt_conflict_v6() reports a conflict, or the result of the
            * tunnel/SCTP insertion routine.
            */
1429 1429  int
1430 1430  ipcl_conn_insert_v6(conn_t *connp)
1431 1431  {
1432 1432          connf_t         *connfp;
1433 1433          conn_t          *tconnp;
1434 1434          int             ret = 0;
1435 1435          ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
1436 1436          uint16_t        lport = connp->conn_lport;
1437 1437          uint8_t         protocol = connp->conn_proto;
1438 1438          uint_t          ifindex = connp->conn_bound_if;
1439 1439  
1440 1440          if (IPCL_IS_IPTUN(connp))
1441 1441                  return (ipcl_iptun_hash_insert_v6(connp, ipst));
1442 1442  
1443 1443          switch (protocol) {
1444 1444          case IPPROTO_TCP:
1445 1445  
1446 1446                  /*
1447 1447                   * For tcp, we check whether the connection tuple already
1448 1448                   * exists before allowing the connection to proceed.  We
1449 1449                   * also allow indexing on the zoneid. This is to allow
1450 1450                   * multiple shared stack zones to have the same tcp
1451 1451                   * connection tuple. In practice this only happens for
1452 1452                   * ipv6_loopback as it's the only local address which
1453 1453                   * doesn't have to be unique.
1454 1454                   */
1455 1455                  connfp = &ipst->ips_ipcl_conn_fanout[
1456 1456                      IPCL_CONN_HASH_V6(connp->conn_faddr_v6, connp->conn_ports,
1457 1457                      ipst)];
                           /* The duplicate scan runs with the bucket lock held. */
1458 1458                  mutex_enter(&connfp->connf_lock);
1459 1459                  for (tconnp = connfp->connf_head; tconnp != NULL;
1460 1460                      tconnp = tconnp->conn_next) {
1461 1461                          /* NOTE: need to match zoneid. Bug in onnv-gate */
                                   /*
                                    * An existing conn only counts as a duplicate if
                                    * it is not bound to an interface or is bound to
                                    * the same interface index as connp.
                                    */
1462 1462                          if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_proto,
1463 1463                              connp->conn_faddr_v6, connp->conn_laddr_v6,
1464 1464                              connp->conn_ports) &&
1465 1465                              (tconnp->conn_bound_if == 0 ||
1466 1466                              tconnp->conn_bound_if == ifindex) &&
1467 1467                              IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) {
1468 1468                                  /* Already have a conn. bail out */
1469 1469                                  mutex_exit(&connfp->connf_lock);
1470 1470                                  return (EADDRINUSE);
1471 1471                          }
1472 1472                  }
                           /* A non-NULL conn_fanout means connp is already hashed. */
1473 1473                  if (connp->conn_fanout != NULL) {
1474 1474                          /*
1475 1475                           * Probably a XTI/TLI application trying to do a
1476 1476                           * rebind. Let it happen.
1477 1477                           */
                                   /*
                                    * NOTE(review): the bucket lock is dropped around
                                    * IPCL_HASH_REMOVE(), presumably because the macro
                                    * takes a fanout lock itself -- verify against the
                                    * macro.  As a result the duplicate scan above is
                                    * not atomic with the insert below; confirm that
                                    * callers serialize inserts for this tuple.
                                    */
1478 1478                          mutex_exit(&connfp->connf_lock);
1479 1479                          IPCL_HASH_REMOVE(connp);
1480 1480                          mutex_enter(&connfp->connf_lock);
1481 1481                  }
                           /*
                            * NOTE(review): ipcl_conn_insert_v4() asserts that
                            * conn_recv and conn_recvicmp are non-NULL before this
                            * insert; there are no such ASSERTs here -- confirm
                            * whether the difference is intentional.
                            */
1482 1482                  IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
1483 1483                  mutex_exit(&connfp->connf_lock);
1484 1484                  break;
1485 1485  
1486 1486          case IPPROTO_SCTP:
                           /* Drop any existing hash entry before re-inserting. */
1487 1487                  IPCL_HASH_REMOVE(connp);
1488 1488                  ret = ipcl_sctp_hash_insert(connp, lport);
1489 1489                  break;
1490 1490  
1491 1491          default:
                           /* Only checked when the system is labeled. */
1492 1492                  if (is_system_labeled() &&
1493 1493                      check_exempt_conflict_v6(connp, ipst))
1494 1494                          return (EADDRINUSE);
1495 1495                  /* FALLTHROUGH */
1496 1496          case IPPROTO_UDP:
                           /*
                            * Shared with the default case: UDP hashes on the local
                            * port, everything else indexes by protocol number.
                            */
1497 1497                  if (protocol == IPPROTO_UDP) {
1498 1498                          connfp = &ipst->ips_ipcl_udp_fanout[
1499 1499                              IPCL_UDP_HASH(lport, ipst)];
1500 1500                  } else {
1501 1501                          connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol];
1502 1502                  }
1503 1503  
                           /*
                            * A foreign address selects CONNECTED, a local address
                            * alone selects BOUND, neither selects WILDCARD.
                            */
1504 1504                  if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
1505 1505                          IPCL_HASH_INSERT_CONNECTED(connfp, connp);
1506 1506                  } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
1507 1507                          IPCL_HASH_INSERT_BOUND(connfp, connp);
1508 1508                  } else {
1509 1509                          IPCL_HASH_INSERT_WILDCARD(connfp, connp);
1510 1510                  }
1511 1511                  break;
1512 1512          }
1513 1513  
1514 1514          return (ret);
1515 1515  }
1516 1516  
1517 1517  /*
1518 1518   * v4 packet classifying function. looks up the fanout table to
1519 1519   * find the conn, the packet belongs to. returns the conn with
1520 1520   * the reference held, null otherwise.
1521 1521   *
1522 1522   * If zoneid is ALL_ZONES, then the search rules described in the "Connection
1523 1523   * Lookup" comment block are applied.  Labels are also checked as described
1524 1524   * above.  If the packet is from the inside (looped back), and is from the same
1525 1525   * zone, then label checks are omitted.
            *
            * Arguments:
            *   mp        packet; b_rptr is read as the start of the IPv4 header.
            *   protocol  selects the lookup: TCP, UDP, or IPPROTO_ENCAP/IPPROTO_IPV6
            *             (tunnels).  Any other value returns NULL.
            *   hdr_len   byte offset from the IPv4 header to the transport header.
            *   ira       receive attributes; supplies ira_zoneid and ira_flags.
            *   ipst      IP stack instance whose fanout tables are searched.
            *
            * For TCP the connected fanout is searched first and the bind fanout
            * second; only a bind-fanout hit is subjected to tsol_receive_local().
1526 1526   */
1527 1527  conn_t *
1528 1528  ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
1529 1529      ip_recv_attr_t *ira, ip_stack_t *ipst)
1530 1530  {
1531 1531          ipha_t  *ipha;
1532 1532          connf_t *connfp, *bind_connfp;
1533 1533          uint16_t lport;
1534 1534          uint16_t fport;
1535 1535          uint32_t ports;
1536 1536          conn_t  *connp;
1537 1537          uint16_t  *up;
1538 1538          zoneid_t        zoneid = ira->ira_zoneid;
1539 1539  
1540 1540          ipha = (ipha_t *)mp->b_rptr;
                   /*
                    * up points at the transport ports: up[0] is used below as the
                    * foreign (sender's) port and up[1] as the local port.
                    */
1541 1541          up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET);
1542 1542  
1543 1543          switch (protocol) {
1544 1544          case IPPROTO_TCP:
                           /* Both 16-bit ports read as a single 32-bit value. */
1545 1545                  ports = *(uint32_t *)up;
1546 1546                  connfp =
1547 1547                      &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_src,
1548 1548                      ports, ipst)];
1549 1549                  mutex_enter(&connfp->connf_lock);
                           /*
                            * A conn is eligible if it is in the packet's zone, is an
                            * all-zones conn, or has a non-default MAC mode while the
                            * packet is flagged both MAC-exemptable and shared-address.
                            */
1550 1550                  for (connp = connfp->connf_head; connp != NULL;
1551 1551                      connp = connp->conn_next) {
1552 1552                          if (IPCL_CONN_MATCH(connp, protocol,
1553 1553                              ipha->ipha_src, ipha->ipha_dst, ports) &&
1554 1554                              (connp->conn_zoneid == zoneid ||
1555 1555                              connp->conn_allzones ||
1556 1556                              ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1557 1557                              (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1558 1558                              (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1559 1559                                  break;
1560 1560                  }
1561 1561  
1562 1562                  if (connp != NULL) {
1563 1563                          /*
1564 1564                           * We have a fully-bound TCP connection.
1565 1565                           *
1566 1566                           * For labeled systems, there's no need to check the
1567 1567                           * label here.  It's known to be good as we checked
1568 1568                           * before allowing the connection to become bound.
1569 1569                           */
                                   /* Take the reference before dropping the lock. */
1570 1570                          CONN_INC_REF(connp);
1571 1571                          mutex_exit(&connfp->connf_lock);
1572 1572                          return (connp);
1573 1573                  }
1574 1574  
1575 1575                  mutex_exit(&connfp->connf_lock);
                           /* No connected match: fall back to the bind fanout. */
1576 1576                  lport = up[1];
1577 1577                  bind_connfp =
1578 1578                      &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
1579 1579                  mutex_enter(&bind_connfp->connf_lock);
1580 1580                  for (connp = bind_connfp->connf_head; connp != NULL;
1581 1581                      connp = connp->conn_next) {
1582 1582                          if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst,
1583 1583                              lport) &&
1584 1584                              (connp->conn_zoneid == zoneid ||
1585 1585                              connp->conn_allzones ||
1586 1586                              ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1587 1587                              (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1588 1588                              (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1589 1589                                  break;
1590 1590                  }
1591 1591  
1592 1592                  /*
1593 1593                   * If the matching connection is SLP on a private address, then
1594 1594                   * the label on the packet must match the local zone's label.
1595 1595                   * Otherwise, it must be in the label range defined by tnrh.
1596 1596                   * This is ensured by tsol_receive_local.
1597 1597                   *
1598 1598                   * Note that we don't check tsol_receive_local for
1599 1599                   * the connected case.
1600 1600                   */
                           /* The bind bucket lock is still held across this check. */
1601 1601                  if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1602 1602                      !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
1603 1603                      ira, connp)) {
1604 1604                          DTRACE_PROBE3(tx__ip__log__info__classify__tcp,
1605 1605                              char *, "connp(1) could not receive mp(2)",
1606 1606                              conn_t *, connp, mblk_t *, mp);
1607 1607                          connp = NULL;
1608 1608                  }
1609 1609  
1610 1610                  if (connp != NULL) {
1611 1611                          /* Have a listener at least */
1612 1612                          CONN_INC_REF(connp);
1613 1613                          mutex_exit(&bind_connfp->connf_lock);
1614 1614                          return (connp);
1615 1615                  }
1616 1616  
1617 1617                  mutex_exit(&bind_connfp->connf_lock);
1618 1618                  break;
1619 1619  
1620 1620          case IPPROTO_UDP:
1621 1621                  lport = up[1];
1622 1622                  fport = up[0];
1623 1623                  connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
1624 1624                  mutex_enter(&connfp->connf_lock);
                           /*
                            * NOTE(review): unlike the TCP lookups above and the UDP
                            * lookup in ipcl_classify_v6(), this zone test does not
                            * also require IRAF_TX_SHARED_ADDR -- confirm that the
                            * difference is intentional.
                            */
1625 1625                  for (connp = connfp->connf_head; connp != NULL;
1626 1626                      connp = connp->conn_next) {
1627 1627                          if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst,
1628 1628                              fport, ipha->ipha_src) &&
1629 1629                              (connp->conn_zoneid == zoneid ||
1630 1630                              connp->conn_allzones ||
1631 1631                              ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1632 1632                              (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE))))
1633 1633                                  break;
1634 1634                  }
1635 1635  
                           /* On labeled systems a match must also pass the label check. */
1636 1636                  if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1637 1637                      !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION,
1638 1638                      ira, connp)) {
1639 1639                          DTRACE_PROBE3(tx__ip__log__info__classify__udp,
1640 1640                              char *, "connp(1) could not receive mp(2)",
1641 1641                              conn_t *, connp, mblk_t *, mp);
1642 1642                          connp = NULL;
1643 1643                  }
1644 1644  
1645 1645                  if (connp != NULL) {
                                   /* Take the reference before dropping the lock. */
1646 1646                          CONN_INC_REF(connp);
1647 1647                          mutex_exit(&connfp->connf_lock);
1648 1648                          return (connp);
1649 1649                  }
1650 1650  
1651 1651                  /*
1652 1652                   * We shouldn't come here for multicast/broadcast packets
1653 1653                   */
1654 1654                  mutex_exit(&connfp->connf_lock);
1655 1655  
1656 1656                  break;
1657 1657  
1658 1658          case IPPROTO_ENCAP:
1659 1659          case IPPROTO_IPV6:
                           /* Tunnel protocols are matched on the address pair only. */
1660 1660                  return (ipcl_iptun_classify_v4(&ipha->ipha_src,
1661 1661                      &ipha->ipha_dst, ipst));
1662 1662          }
1663 1663  
                   /* No match, or a protocol this routine does not classify. */
1664 1664          return (NULL);
1665 1665  }
1666 1666  
           /*
            * v6 packet classifying function; the IPv6 counterpart of
            * ipcl_classify_v4().  Searches the fanout tables for the conn that the
            * packet belongs to and returns it with a reference taken via
            * CONN_INC_REF(), or NULL when nothing matches.
            *
            * Arguments:
            *   mp        packet; b_rptr is read as the start of the IPv6 header.
            *   protocol  selects the lookup: TCP, UDP, or IPPROTO_ENCAP/IPPROTO_IPV6
            *             (tunnels).  Any other value returns NULL.
            *   hdr_len   byte offset from b_rptr to the transport header.
            *   ira       receive attributes; supplies ira_zoneid and ira_flags.
            *   ipst      IP stack instance whose fanout tables are searched.
            *
            * For TCP the connected fanout is searched first and the bind fanout
            * second; only a bind-fanout hit is subjected to tsol_receive_local().
            */
1667 1667  conn_t *
1668 1668  ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len,
1669 1669      ip_recv_attr_t *ira, ip_stack_t *ipst)
1670 1670  {
1671 1671          ip6_t           *ip6h;
1672 1672          connf_t         *connfp, *bind_connfp;
1673 1673          uint16_t        lport;
1674 1674          uint16_t        fport;
1675 1675          tcpha_t         *tcpha;
1676 1676          uint32_t        ports;
1677 1677          conn_t          *connp;
1678 1678          uint16_t        *up;
1679 1679          zoneid_t        zoneid = ira->ira_zoneid;
1680 1680  
1681 1681          ip6h = (ip6_t *)mp->b_rptr;
1682 1682  
1683 1683          switch (protocol) {
1684 1684          case IPPROTO_TCP:
1685 1685                  tcpha = (tcpha_t *)&mp->b_rptr[hdr_len];
                           /*
                            * Start at tha_lport and read both 16-bit ports as a single
                            * 32-bit value; up[1] is used further down as the local
                            * port for the bind lookup.
                            */
1686 1686                  up = &tcpha->tha_lport;
1687 1687                  ports = *(uint32_t *)up;
1688 1688  
1689 1689                  connfp =
1690 1690                      &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_src,
1691 1691                      ports, ipst)];
1692 1692                  mutex_enter(&connfp->connf_lock);
                           /*
                            * A conn is eligible if it is in the packet's zone, is an
                            * all-zones conn, or has a non-default MAC mode while the
                            * packet is flagged both MAC-exemptable and shared-address.
                            */
1693 1693                  for (connp = connfp->connf_head; connp != NULL;
1694 1694                      connp = connp->conn_next) {
1695 1695                          if (IPCL_CONN_MATCH_V6(connp, protocol,
1696 1696                              ip6h->ip6_src, ip6h->ip6_dst, ports) &&
1697 1697                              (connp->conn_zoneid == zoneid ||
1698 1698                              connp->conn_allzones ||
1699 1699                              ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1700 1700                              (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1701 1701                              (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1702 1702                                  break;
1703 1703                  }
1704 1704  
1705 1705                  if (connp != NULL) {
1706 1706                          /*
1707 1707                           * We have a fully-bound TCP connection.
1708 1708                           *
1709 1709                           * For labeled systems, there's no need to check the
1710 1710                           * label here.  It's known to be good as we checked
1711 1711                           * before allowing the connection to become bound.
1712 1712                           */
                                   /* Take the reference before dropping the lock. */
1713 1713                          CONN_INC_REF(connp);
1714 1714                          mutex_exit(&connfp->connf_lock);
1715 1715                          return (connp);
1716 1716                  }
1717 1717  
1718 1718                  mutex_exit(&connfp->connf_lock);
1719 1719  
                           /* No connected match: fall back to the bind fanout. */
1720 1720                  lport = up[1];
1721 1721                  bind_connfp =
1722 1722                      &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
1723 1723                  mutex_enter(&bind_connfp->connf_lock);
1724 1724                  for (connp = bind_connfp->connf_head; connp != NULL;
1725 1725                      connp = connp->conn_next) {
1726 1726                          if (IPCL_BIND_MATCH_V6(connp, protocol,
1727 1727                              ip6h->ip6_dst, lport) &&
1728 1728                              (connp->conn_zoneid == zoneid ||
1729 1729                              connp->conn_allzones ||
1730 1730                              ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1731 1731                              (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1732 1732                              (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1733 1733                                  break;
1734 1734                  }
1735 1735  
                           /*
                            * On labeled systems a bind-fanout match must also pass
                            * tsol_receive_local(); the bind bucket lock is still held
                            * across this check.
                            */
1736 1736                  if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1737 1737                      !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
1738 1738                      ira, connp)) {
1739 1739                          DTRACE_PROBE3(tx__ip__log__info__classify__tcp6,
1740 1740                              char *, "connp(1) could not receive mp(2)",
1741 1741                              conn_t *, connp, mblk_t *, mp);
1742 1742                          connp = NULL;
1743 1743                  }
1744 1744  
1745 1745                  if (connp != NULL) {
1746 1746                          /* Have a listener at least */
1747 1747                          CONN_INC_REF(connp);
1748 1748                          mutex_exit(&bind_connfp->connf_lock);
1749 1749                          return (connp);
1750 1750                  }
1751 1751  
1752 1752                  mutex_exit(&bind_connfp->connf_lock);
1753 1753                  break;
1754 1754  
1755 1755          case IPPROTO_UDP:
                           /* up[0] is the foreign (sender's) port, up[1] the local port. */
1756 1756                  up = (uint16_t *)&mp->b_rptr[hdr_len];
1757 1757                  lport = up[1];
1758 1758                  fport = up[0];
1759 1759                  connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
1760 1760                  mutex_enter(&connfp->connf_lock);
1761 1761                  for (connp = connfp->connf_head; connp != NULL;
1762 1762                      connp = connp->conn_next) {
1763 1763                          if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst,
1764 1764                              fport, ip6h->ip6_src) &&
1765 1765                              (connp->conn_zoneid == zoneid ||
1766 1766                              connp->conn_allzones ||
1767 1767                              ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1768 1768                              (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1769 1769                              (ira->ira_flags & IRAF_TX_SHARED_ADDR))))
1770 1770                                  break;
1771 1771                  }
1772 1772  
                           /* On labeled systems a match must also pass the label check. */
1773 1773                  if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1774 1774                      !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION,
1775 1775                      ira, connp)) {
1776 1776                          DTRACE_PROBE3(tx__ip__log__info__classify__udp6,
1777 1777                              char *, "connp(1) could not receive mp(2)",
1778 1778                              conn_t *, connp, mblk_t *, mp);
1779 1779                          connp = NULL;
1780 1780                  }
1781 1781  
1782 1782                  if (connp != NULL) {
                                   /* Take the reference before dropping the lock. */
1783 1783                          CONN_INC_REF(connp);
1784 1784                          mutex_exit(&connfp->connf_lock);
1785 1785                          return (connp);
1786 1786                  }
1787 1787  
1788 1788                  /*
1789 1789                   * We shouldn't come here for multicast/broadcast packets
1790 1790                   */
1791 1791                  mutex_exit(&connfp->connf_lock);
1792 1792                  break;
1793 1793          case IPPROTO_ENCAP:
1794 1794          case IPPROTO_IPV6:
                           /* Tunnel protocols are matched on the address pair only. */
1795 1795                  return (ipcl_iptun_classify_v6(&ip6h->ip6_src,
1796 1796                      &ip6h->ip6_dst, ipst));
1797 1797          }
1798 1798  
                   /* No match, or a protocol this routine does not classify. */
1799 1799          return (NULL);
1800 1800  }
1801 1801  
1802 1802  /*
1803 1803   * wrapper around ipcl_classify_(v4,v6) routines.
            *
            * Chooses the IPv4 or IPv6 classifier from the IRAF_IS_IPV4 bit in
            * ira_flags and passes along ira_protocol and ira_ip_hdr_length from the
            * receive attributes.  Returns whatever the selected classifier returns.
1804 1804   */
1805 1805  conn_t *
1806 1806  ipcl_classify(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst)
1807 1807  {
1808 1808          if (ira->ira_flags & IRAF_IS_IPV4) {
1809 1809                  return (ipcl_classify_v4(mp, ira->ira_protocol,
1810 1810                      ira->ira_ip_hdr_length, ira, ipst));
1811 1811          } else {
1812 1812                  return (ipcl_classify_v6(mp, ira->ira_protocol,
1813 1813                      ira->ira_ip_hdr_length, ira, ipst));
1814 1814          }
1815 1815  }
1816 1816  
1817 1817  /*
1818 1818   * Only used to classify SCTP RAW sockets
            *
            * The lookup makes two passes over ips_ipcl_raw_fanout:
            *   1. the bucket for the packet's local port, where conns with a foreign
            *      address need a full connection match and all others a bind match,
            *      followed by the zone test and, on labeled systems,
            *      tsol_receive_local();
            *   2. the bucket for port 0, matched with IPCL_ZONE_MATCH and
            *      IPCL_RAW_MATCH{,_V6}.  This pass does not call
            *      tsol_receive_local().
            *
            * Only one of ipha/ip6h is dereferenced, chosen by IRAF_IS_IPV4 in
            * ira_flags.  Returns the matching conn with a reference taken via
            * CONN_INC_REF(), or NULL.
1819 1819   */
1820 1820  conn_t *
1821 1821  ipcl_classify_raw(mblk_t *mp, uint8_t protocol, uint32_t ports,
1822 1822      ipha_t *ipha, ip6_t *ip6h, ip_recv_attr_t *ira, ip_stack_t *ipst)
1823 1823  {
1824 1824          connf_t         *connfp;
1825 1825          conn_t          *connp;
1826 1826          in_port_t       lport;
1827 1827          int             ipversion;
1828 1828          const void      *dst;
1829 1829          zoneid_t        zoneid = ira->ira_zoneid;
1830 1830  
                   /*
                    * The local port is the second 16-bit half of ports, as laid out
                    * in memory; it is passed through ntohs() before hashing below.
                    */
1831 1831          lport = ((uint16_t *)&ports)[1];
1832 1832          if (ira->ira_flags & IRAF_IS_IPV4) {
1833 1833                  dst = (const void *)&ipha->ipha_dst;
1834 1834                  ipversion = IPV4_VERSION;
1835 1835          } else {
1836 1836                  dst = (const void *)&ip6h->ip6_dst;
1837 1837                  ipversion = IPV6_VERSION;
1838 1838          }
1839 1839  
                   /* Pass 1: bucket for the packet's local port. */
1840 1840          connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)];
1841 1841          mutex_enter(&connfp->connf_lock);
1842 1842          for (connp = connfp->connf_head; connp != NULL;
1843 1843              connp = connp->conn_next) {
1844 1844                  /* We don't allow v4 fallback for v6 raw socket. */
1845 1845                  if (ipversion != connp->conn_ipversion)
1846 1846                          continue;
                           /*
                            * A foreign address that is neither unspecified nor the
                            * v4-mapped any address means the conn is connected, so
                            * require a full connection match; otherwise a bind match
                            * on destination address and local port is enough.
                            */
1847 1847                  if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
1848 1848                      !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
1849 1849                          if (ipversion == IPV4_VERSION) {
1850 1850                                  if (!IPCL_CONN_MATCH(connp, protocol,
1851 1851                                      ipha->ipha_src, ipha->ipha_dst, ports))
1852 1852                                          continue;
1853 1853                          } else {
1854 1854                                  if (!IPCL_CONN_MATCH_V6(connp, protocol,
1855 1855                                      ip6h->ip6_src, ip6h->ip6_dst, ports))
1856 1856                                          continue;
1857 1857                          }
1858 1858                  } else {
1859 1859                          if (ipversion == IPV4_VERSION) {
1860 1860                                  if (!IPCL_BIND_MATCH(connp, protocol,
1861 1861                                      ipha->ipha_dst, lport))
1862 1862                                          continue;
1863 1863                          } else {
1864 1864                                  if (!IPCL_BIND_MATCH_V6(connp, protocol,
1865 1865                                      ip6h->ip6_dst, lport))
1866 1866                                          continue;
1867 1867                          }
1868 1868                  }
1869 1869  
                           /*
                            * Address/port matched; accept if the conn is in the
                            * packet's zone, is an all-zones conn, or has a non-default
                            * MAC mode while the packet is flagged both MAC-exemptable
                            * and shared-address.
                            */
1870 1870                  if (connp->conn_zoneid == zoneid ||
1871 1871                      connp->conn_allzones ||
1872 1872                      ((connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
1873 1873                      (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) &&
1874 1874                      (ira->ira_flags & IRAF_TX_SHARED_ADDR)))
1875 1875                          break;
1876 1876          }
1877 1877  
                   /* On labeled systems a match must also pass the label check. */
1878 1878          if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) &&
1879 1879              !tsol_receive_local(mp, dst, ipversion, ira, connp)) {
1880 1880                  DTRACE_PROBE3(tx__ip__log__info__classify__rawip,
1881 1881                      char *, "connp(1) could not receive mp(2)",
1882 1882                      conn_t *, connp, mblk_t *, mp);
1883 1883                  connp = NULL;
1884 1884          }
1885 1885  
                   /* found: expects connfp->connf_lock to still be held. */
1886 1886          if (connp != NULL)
1887 1887                  goto found;
1888 1888          mutex_exit(&connfp->connf_lock);
1889 1889  
1890 1890          /* Try to look for a wildcard SCTP RAW socket match. */
                   /* Pass 2: bucket for port 0. */
1891 1891          connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)];
1892 1892          mutex_enter(&connfp->connf_lock);
1893 1893          for (connp = connfp->connf_head; connp != NULL;
1894 1894              connp = connp->conn_next) {
1895 1895                  /* We don't allow v4 fallback for v6 raw socket. */
1896 1896                  if (ipversion != connp->conn_ipversion)
1897 1897                          continue;
1898 1898                  if (!IPCL_ZONE_MATCH(connp, zoneid))
1899 1899                          continue;
1900 1900  
1901 1901                  if (ipversion == IPV4_VERSION) {
1902 1902                          if (IPCL_RAW_MATCH(connp, protocol, ipha->ipha_dst))
1903 1903                                  break;
1904 1904                  } else {
1905 1905                          if (IPCL_RAW_MATCH_V6(connp, protocol, ip6h->ip6_dst)) {
1906 1906                                  break;
1907 1907                          }
1908 1908                  }
1909 1909          }
1910 1910  
1911 1911          if (connp != NULL)
1912 1912                  goto found;
1913 1913  
1914 1914          mutex_exit(&connfp->connf_lock);
1915 1915          return (NULL);
1916 1916  
1917 1917  found:
                   /*
                    * Reached from either pass with connfp pointing at the bucket
                    * that produced the match and its lock held; take the reference
                    * before releasing the lock.
                    */
1918 1918          ASSERT(connp != NULL);
1919 1919          CONN_INC_REF(connp);
1920 1920          mutex_exit(&connfp->connf_lock);
1921 1921          return (connp);
1922 1922  }
1923 1923  
1924 1924  /* ARGSUSED */
1925 1925  static int
1926 1926  tcp_conn_constructor(void *buf, void *cdrarg, int kmflags)
1927 1927  {
1928 1928          itc_t   *itc = (itc_t *)buf;
1929 1929          conn_t  *connp = &itc->itc_conn;
1930 1930          tcp_t   *tcp = (tcp_t *)&itc[1];
1931 1931  
1932 1932          bzero(connp, sizeof (conn_t));
1933 1933          bzero(tcp, sizeof (tcp_t));
1934 1934  
1935 1935          mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
1936 1936          cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
1937 1937          cv_init(&connp->conn_sq_cv, NULL, CV_DEFAULT, NULL);
1938 1938          tcp->tcp_timercache = tcp_timermp_alloc(kmflags);
1939 1939          if (tcp->tcp_timercache == NULL)
1940 1940                  return (ENOMEM);
1941 1941          connp->conn_tcp = tcp;
1942 1942          connp->conn_flags = IPCL_TCPCONN;
1943 1943          connp->conn_proto = IPPROTO_TCP;
1944 1944          tcp->tcp_connp = connp;
1945 1945          rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
1946 1946  
1947 1947          connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
1948 1948          if (connp->conn_ixa == NULL) {
1949 1949                  tcp_timermp_free(tcp);
1950 1950                  return (ENOMEM);
1951 1951          }
1952 1952          connp->conn_ixa->ixa_refcnt = 1;
1953 1953          connp->conn_ixa->ixa_protocol = connp->conn_proto;
1954 1954          connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
1955 1955          return (0);
1956 1956  }
1957 1957  
1958 1958  /* ARGSUSED */
1959 1959  static void
1960 1960  tcp_conn_destructor(void *buf, void *cdrarg)
1961 1961  {
1962 1962          itc_t   *itc = (itc_t *)buf;
1963 1963          conn_t  *connp = &itc->itc_conn;
1964 1964          tcp_t   *tcp = (tcp_t *)&itc[1];
1965 1965  
1966 1966          ASSERT(connp->conn_flags & IPCL_TCPCONN);
1967 1967          ASSERT(tcp->tcp_connp == connp);
1968 1968          ASSERT(connp->conn_tcp == tcp);
1969 1969          tcp_timermp_free(tcp);
1970 1970          mutex_destroy(&connp->conn_lock);
1971 1971          cv_destroy(&connp->conn_cv);
1972 1972          cv_destroy(&connp->conn_sq_cv);
1973 1973          rw_destroy(&connp->conn_ilg_lock);
1974 1974  
1975 1975          /* Can be NULL if constructor failed */
1976 1976          if (connp->conn_ixa != NULL) {
1977 1977                  ASSERT(connp->conn_ixa->ixa_refcnt == 1);
1978 1978                  ASSERT(connp->conn_ixa->ixa_ire == NULL);
1979 1979                  ASSERT(connp->conn_ixa->ixa_nce == NULL);
1980 1980                  ixa_refrele(connp->conn_ixa);
1981 1981          }
1982 1982  }
1983 1983  
1984 1984  /* ARGSUSED */
1985 1985  static int
1986 1986  ip_conn_constructor(void *buf, void *cdrarg, int kmflags)
1987 1987  {
1988 1988          itc_t   *itc = (itc_t *)buf;
1989 1989          conn_t  *connp = &itc->itc_conn;
1990 1990  
1991 1991          bzero(connp, sizeof (conn_t));
1992 1992          mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
1993 1993          cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
1994 1994          connp->conn_flags = IPCL_IPCCONN;
1995 1995          rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
1996 1996  
1997 1997          connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
1998 1998          if (connp->conn_ixa == NULL)
1999 1999                  return (ENOMEM);
2000 2000          connp->conn_ixa->ixa_refcnt = 1;
2001 2001          connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2002 2002          return (0);
2003 2003  }
2004 2004  
2005 2005  /* ARGSUSED */
2006 2006  static void
2007 2007  ip_conn_destructor(void *buf, void *cdrarg)
2008 2008  {
2009 2009          itc_t   *itc = (itc_t *)buf;
2010 2010          conn_t  *connp = &itc->itc_conn;
2011 2011  
2012 2012          ASSERT(connp->conn_flags & IPCL_IPCCONN);
2013 2013          ASSERT(connp->conn_priv == NULL);
2014 2014          mutex_destroy(&connp->conn_lock);
2015 2015          cv_destroy(&connp->conn_cv);
2016 2016          rw_destroy(&connp->conn_ilg_lock);
2017 2017  
2018 2018          /* Can be NULL if constructor failed */
2019 2019          if (connp->conn_ixa != NULL) {
2020 2020                  ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2021 2021                  ASSERT(connp->conn_ixa->ixa_ire == NULL);
2022 2022                  ASSERT(connp->conn_ixa->ixa_nce == NULL);
2023 2023                  ixa_refrele(connp->conn_ixa);
2024 2024          }
2025 2025  }
2026 2026  
2027 2027  /* ARGSUSED */
2028 2028  static int
2029 2029  udp_conn_constructor(void *buf, void *cdrarg, int kmflags)
2030 2030  {
2031 2031          itc_t   *itc = (itc_t *)buf;
2032 2032          conn_t  *connp = &itc->itc_conn;
2033 2033          udp_t   *udp = (udp_t *)&itc[1];
2034 2034  
2035 2035          bzero(connp, sizeof (conn_t));
2036 2036          bzero(udp, sizeof (udp_t));
2037 2037  
2038 2038          mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2039 2039          cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2040 2040          connp->conn_udp = udp;
2041 2041          connp->conn_flags = IPCL_UDPCONN;
2042 2042          connp->conn_proto = IPPROTO_UDP;
2043 2043          udp->udp_connp = connp;
2044 2044          rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2045 2045          connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2046 2046          if (connp->conn_ixa == NULL)
2047 2047                  return (ENOMEM);
2048 2048          connp->conn_ixa->ixa_refcnt = 1;
2049 2049          connp->conn_ixa->ixa_protocol = connp->conn_proto;
2050 2050          connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2051 2051          return (0);
2052 2052  }
2053 2053  
2054 2054  /* ARGSUSED */
2055 2055  static void
2056 2056  udp_conn_destructor(void *buf, void *cdrarg)
2057 2057  {
2058 2058          itc_t   *itc = (itc_t *)buf;
2059 2059          conn_t  *connp = &itc->itc_conn;
2060 2060          udp_t   *udp = (udp_t *)&itc[1];
2061 2061  
2062 2062          ASSERT(connp->conn_flags & IPCL_UDPCONN);
2063 2063          ASSERT(udp->udp_connp == connp);
2064 2064          ASSERT(connp->conn_udp == udp);
2065 2065          mutex_destroy(&connp->conn_lock);
2066 2066          cv_destroy(&connp->conn_cv);
2067 2067          rw_destroy(&connp->conn_ilg_lock);
2068 2068  
2069 2069          /* Can be NULL if constructor failed */
2070 2070          if (connp->conn_ixa != NULL) {
2071 2071                  ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2072 2072                  ASSERT(connp->conn_ixa->ixa_ire == NULL);
2073 2073                  ASSERT(connp->conn_ixa->ixa_nce == NULL);
2074 2074                  ixa_refrele(connp->conn_ixa);
2075 2075          }
2076 2076  }
2077 2077  
2078 2078  /* ARGSUSED */
2079 2079  static int
2080 2080  rawip_conn_constructor(void *buf, void *cdrarg, int kmflags)
2081 2081  {
2082 2082          itc_t   *itc = (itc_t *)buf;
2083 2083          conn_t  *connp = &itc->itc_conn;
2084 2084          icmp_t  *icmp = (icmp_t *)&itc[1];
2085 2085  
2086 2086          bzero(connp, sizeof (conn_t));
2087 2087          bzero(icmp, sizeof (icmp_t));
2088 2088  
2089 2089          mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2090 2090          cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2091 2091          connp->conn_icmp = icmp;
2092 2092          connp->conn_flags = IPCL_RAWIPCONN;
2093 2093          connp->conn_proto = IPPROTO_ICMP;
2094 2094          icmp->icmp_connp = connp;
2095 2095          rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2096 2096          connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2097 2097          if (connp->conn_ixa == NULL)
2098 2098                  return (ENOMEM);
2099 2099          connp->conn_ixa->ixa_refcnt = 1;
2100 2100          connp->conn_ixa->ixa_protocol = connp->conn_proto;
2101 2101          connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2102 2102          return (0);
2103 2103  }
2104 2104  
2105 2105  /* ARGSUSED */
2106 2106  static void
2107 2107  rawip_conn_destructor(void *buf, void *cdrarg)
2108 2108  {
2109 2109          itc_t   *itc = (itc_t *)buf;
2110 2110          conn_t  *connp = &itc->itc_conn;
2111 2111          icmp_t  *icmp = (icmp_t *)&itc[1];
2112 2112  
2113 2113          ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
2114 2114          ASSERT(icmp->icmp_connp == connp);
2115 2115          ASSERT(connp->conn_icmp == icmp);
2116 2116          mutex_destroy(&connp->conn_lock);
2117 2117          cv_destroy(&connp->conn_cv);
2118 2118          rw_destroy(&connp->conn_ilg_lock);
2119 2119  
2120 2120          /* Can be NULL if constructor failed */
2121 2121          if (connp->conn_ixa != NULL) {
2122 2122                  ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2123 2123                  ASSERT(connp->conn_ixa->ixa_ire == NULL);
2124 2124                  ASSERT(connp->conn_ixa->ixa_nce == NULL);
2125 2125                  ixa_refrele(connp->conn_ixa);
2126 2126          }
2127 2127  }
2128 2128  
2129 2129  /* ARGSUSED */
2130 2130  static int
2131 2131  rts_conn_constructor(void *buf, void *cdrarg, int kmflags)
2132 2132  {
2133 2133          itc_t   *itc = (itc_t *)buf;
2134 2134          conn_t  *connp = &itc->itc_conn;
2135 2135          rts_t   *rts = (rts_t *)&itc[1];
2136 2136  
2137 2137          bzero(connp, sizeof (conn_t));
2138 2138          bzero(rts, sizeof (rts_t));
2139 2139  
2140 2140          mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
2141 2141          cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
2142 2142          connp->conn_rts = rts;
2143 2143          connp->conn_flags = IPCL_RTSCONN;
2144 2144          rts->rts_connp = connp;
2145 2145          rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
2146 2146          connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
2147 2147          if (connp->conn_ixa == NULL)
2148 2148                  return (ENOMEM);
2149 2149          connp->conn_ixa->ixa_refcnt = 1;
2150 2150          connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp);
2151 2151          return (0);
2152 2152  }
2153 2153  
2154 2154  /* ARGSUSED */
2155 2155  static void
2156 2156  rts_conn_destructor(void *buf, void *cdrarg)
2157 2157  {
2158 2158          itc_t   *itc = (itc_t *)buf;
2159 2159          conn_t  *connp = &itc->itc_conn;
2160 2160          rts_t   *rts = (rts_t *)&itc[1];
2161 2161  
2162 2162          ASSERT(connp->conn_flags & IPCL_RTSCONN);
2163 2163          ASSERT(rts->rts_connp == connp);
2164 2164          ASSERT(connp->conn_rts == rts);
2165 2165          mutex_destroy(&connp->conn_lock);
2166 2166          cv_destroy(&connp->conn_cv);
2167 2167          rw_destroy(&connp->conn_ilg_lock);
2168 2168  
2169 2169          /* Can be NULL if constructor failed */
2170 2170          if (connp->conn_ixa != NULL) {
2171 2171                  ASSERT(connp->conn_ixa->ixa_refcnt == 1);
2172 2172                  ASSERT(connp->conn_ixa->ixa_ire == NULL);
2173 2173                  ASSERT(connp->conn_ixa->ixa_nce == NULL);
2174 2174                  ixa_refrele(connp->conn_ixa);
2175 2175          }
2176 2176  }
2177 2177  
2178 2178  /*
2179 2179   * Called as part of ipcl_conn_destroy to assert and clear any pointers
2180 2180   * in the conn_t.
2181 2181   *
2182 2182   * Below we list all the pointers in the conn_t as a documentation aid.
2183 2183   * The ones that we can not ASSERT to be NULL are #ifdef'ed out.
2184 2184   * If you add any pointers to the conn_t please add an ASSERT here
2185 2185   * and #ifdef it out if it can't be actually asserted to be NULL.
2186 2186   * In any case, we bzero most of the conn_t at the end of the function.
2187 2187   */
2188 2188  void
2189 2189  ipcl_conn_cleanup(conn_t *connp)
2190 2190  {
2191 2191          ip_xmit_attr_t  *ixa;
2192 2192  
2193 2193          ASSERT(connp->conn_latch == NULL);
2194 2194          ASSERT(connp->conn_latch_in_policy == NULL);
2195 2195          ASSERT(connp->conn_latch_in_action == NULL);
2196 2196  #ifdef notdef
2197 2197          ASSERT(connp->conn_rq == NULL);
2198 2198          ASSERT(connp->conn_wq == NULL);
2199 2199  #endif
2200 2200          ASSERT(connp->conn_cred == NULL);
2201 2201          ASSERT(connp->conn_g_fanout == NULL);
2202 2202          ASSERT(connp->conn_g_next == NULL);
2203 2203          ASSERT(connp->conn_g_prev == NULL);
2204 2204          ASSERT(connp->conn_policy == NULL);
2205 2205          ASSERT(connp->conn_fanout == NULL);
2206 2206          ASSERT(connp->conn_next == NULL);
2207 2207          ASSERT(connp->conn_prev == NULL);
2208 2208          ASSERT(connp->conn_oper_pending_ill == NULL);
2209 2209          ASSERT(connp->conn_ilg == NULL);
2210 2210          ASSERT(connp->conn_drain_next == NULL);
2211 2211          ASSERT(connp->conn_drain_prev == NULL);
2212 2212  #ifdef notdef
2213 2213          /* conn_idl is not cleared when removed from idl list */
2214 2214          ASSERT(connp->conn_idl == NULL);
2215 2215  #endif
2216 2216          ASSERT(connp->conn_ipsec_opt_mp == NULL);
2217 2217  #ifdef notdef
2218 2218          /* conn_netstack is cleared by the caller; needed by ixa_cleanup */
2219 2219          ASSERT(connp->conn_netstack == NULL);
2220 2220  #endif
2221 2221  
2222 2222          ASSERT(connp->conn_helper_info == NULL);
2223 2223          ASSERT(connp->conn_ixa != NULL);
2224 2224          ixa = connp->conn_ixa;
2225 2225          ASSERT(ixa->ixa_refcnt == 1);
2226 2226          /* Need to preserve ixa_protocol */
2227 2227          ixa_cleanup(ixa);
2228 2228          ixa->ixa_flags = 0;
2229 2229  
2230 2230          /* Clear out the conn_t fields that are not preserved */
2231 2231          bzero(&connp->conn_start_clr,
2232 2232              sizeof (conn_t) -
2233 2233              ((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp));
2234 2234  }
2235 2235  
2236 2236  /*
2237 2237   * All conns are inserted in a global multi-list for the benefit of
2238 2238   * walkers. The walk is guaranteed to walk all open conns at the time
2239 2239   * of the start of the walk exactly once. This property is needed to
2240 2240   * achieve some cleanups during unplumb of interfaces. This is achieved
2241 2241   * as follows.
2242 2242   *
2243 2243   * ipcl_conn_create and ipcl_conn_destroy are the only functions that
2244 2244   * call the insert and delete functions below at creation and deletion
2245 2245   * time respectively. The conn never moves or changes its position in this
2246 2246   * multi-list during its lifetime. CONN_CONDEMNED ensures that the refcnt
2247 2247   * won't increase due to walkers, once the conn deletion has started. Note
2248 2248   * that we can't remove the conn from the global list and then wait for
2249 2249   * the refcnt to drop to zero, since walkers would then see a truncated
2250 2250   * list. CONN_INCIPIENT ensures that walkers don't start looking at
2251 2251   * conns until ip_open is ready to make them globally visible.
2252 2252   * The global round robin multi-list locks are held only to get the
2253 2253   * next member/insertion/deletion and contention should be negligible
2254 2254   * if the multi-list is much greater than the number of cpus.
2255 2255   */
2256 2256  void
2257 2257  ipcl_globalhash_insert(conn_t *connp)
2258 2258  {
2259 2259          int     index;
2260 2260          struct connf_s  *connfp;
2261 2261          ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
2262 2262  
2263 2263          /*
2264 2264           * No need for atomic here. Approximate even distribution
2265 2265           * in the global lists is sufficient.
2266 2266           */
2267 2267          ipst->ips_conn_g_index++;
2268 2268          index = ipst->ips_conn_g_index & (CONN_G_HASH_SIZE - 1);
2269 2269  
2270 2270          connp->conn_g_prev = NULL;
2271 2271          /*
2272 2272           * Mark as INCIPIENT, so that walkers will ignore this
2273 2273           * for now, till ip_open is ready to make it visible globally.
2274 2274           */
2275 2275          connp->conn_state_flags |= CONN_INCIPIENT;
2276 2276  
2277 2277          connfp = &ipst->ips_ipcl_globalhash_fanout[index];
2278 2278          /* Insert at the head of the list */
2279 2279          mutex_enter(&connfp->connf_lock);
2280 2280          connp->conn_g_next = connfp->connf_head;
2281 2281          if (connp->conn_g_next != NULL)
2282 2282                  connp->conn_g_next->conn_g_prev = connp;
2283 2283          connfp->connf_head = connp;
2284 2284  
2285 2285          /* The fanout bucket this conn points to */
2286 2286          connp->conn_g_fanout = connfp;
2287 2287  
2288 2288          mutex_exit(&connfp->connf_lock);
2289 2289  }
2290 2290  
2291 2291  void
2292 2292  ipcl_globalhash_remove(conn_t *connp)
2293 2293  {
2294 2294          struct connf_s  *connfp;
2295 2295  
2296 2296          /*
2297 2297           * We were never inserted in the global multi list.
2298 2298           * IPCL_NONE variety is never inserted in the global multilist
2299 2299           * since it is presumed to not need any cleanup and is transient.
2300 2300           */
2301 2301          if (connp->conn_g_fanout == NULL)
2302 2302                  return;
2303 2303  
2304 2304          connfp = connp->conn_g_fanout;
2305 2305          mutex_enter(&connfp->connf_lock);
2306 2306          if (connp->conn_g_prev != NULL)
2307 2307                  connp->conn_g_prev->conn_g_next = connp->conn_g_next;
2308 2308          else
2309 2309                  connfp->connf_head = connp->conn_g_next;
2310 2310          if (connp->conn_g_next != NULL)
2311 2311                  connp->conn_g_next->conn_g_prev = connp->conn_g_prev;
2312 2312          mutex_exit(&connfp->connf_lock);
2313 2313  
2314 2314          /* Better to stumble on a null pointer than to corrupt memory */
2315 2315          connp->conn_g_next = NULL;
2316 2316          connp->conn_g_prev = NULL;
2317 2317          connp->conn_g_fanout = NULL;
2318 2318  }
2319 2319  
2320 2320  /*
2321 2321   * Walk the list of all conn_t's in the system, calling the function provided
2322 2322   * With the specified argument for each.
2323 2323   * Applies to both IPv4 and IPv6.
2324 2324   *
2325 2325   * CONNs may hold pointers to ills (conn_dhcpinit_ill and
2326 2326   * conn_oper_pending_ill). To guard against stale pointers
2327 2327   * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is
2328 2328   * unplumbed or removed. New conn_t's that are created while we are walking
2329 2329   * may be missed by this walk, because they are not necessarily inserted
2330 2330   * at the tail of the list. They are new conn_t's and thus don't have any
2331 2331   * stale pointers. The CONN_CLOSING flag ensures that no new reference
2332 2332   * is created to the struct that is going away.
2333 2333   */
2334 2334  void
2335 2335  ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst)
2336 2336  {
2337 2337          int     i;
2338 2338          conn_t  *connp;
2339 2339          conn_t  *prev_connp;
2340 2340  
2341 2341          for (i = 0; i < CONN_G_HASH_SIZE; i++) {
2342 2342                  mutex_enter(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2343 2343                  prev_connp = NULL;
2344 2344                  connp = ipst->ips_ipcl_globalhash_fanout[i].connf_head;
2345 2345                  while (connp != NULL) {
2346 2346                          mutex_enter(&connp->conn_lock);
2347 2347                          if (connp->conn_state_flags &
2348 2348                              (CONN_CONDEMNED | CONN_INCIPIENT)) {
2349 2349                                  mutex_exit(&connp->conn_lock);
2350 2350                                  connp = connp->conn_g_next;
2351 2351                                  continue;
2352 2352                          }
2353 2353                          CONN_INC_REF_LOCKED(connp);
2354 2354                          mutex_exit(&connp->conn_lock);
2355 2355                          mutex_exit(
2356 2356                              &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2357 2357                          (*func)(connp, arg);
2358 2358                          if (prev_connp != NULL)
2359 2359                                  CONN_DEC_REF(prev_connp);
2360 2360                          mutex_enter(
2361 2361                              &ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2362 2362                          prev_connp = connp;
2363 2363                          connp = connp->conn_g_next;
2364 2364                  }
2365 2365                  mutex_exit(&ipst->ips_ipcl_globalhash_fanout[i].connf_lock);
2366 2366                  if (prev_connp != NULL)
2367 2367                          CONN_DEC_REF(prev_connp);
2368 2368          }
2369 2369  }
2370 2370  
2371 2371  /*
2372 2372   * Search for a peer TCP/IPv4 loopback conn by doing a reverse lookup on
2373 2373   * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
2374 2374   * held; caller must call CONN_DEC_REF.  Only checks for connected entries
2375 2375   * (peer tcp in ESTABLISHED state).
2376 2376   */
2377 2377  conn_t *
2378 2378  ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcpha_t *tcpha,
2379 2379      ip_stack_t *ipst)
2380 2380  {
2381 2381          uint32_t ports;
2382 2382          uint16_t *pports = (uint16_t *)&ports;
2383 2383          connf_t *connfp;
2384 2384          conn_t  *tconnp;
2385 2385          boolean_t zone_chk;
2386 2386  
2387 2387          /*
2388 2388           * If either the source of destination address is loopback, then
2389 2389           * both endpoints must be in the same Zone.  Otherwise, both of
2390 2390           * the addresses are system-wide unique (tcp is in ESTABLISHED
2391 2391           * state) and the endpoints may reside in different Zones.
2392 2392           */
2393 2393          zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) ||
2394 2394              ipha->ipha_dst == htonl(INADDR_LOOPBACK));
2395 2395  
2396 2396          pports[0] = tcpha->tha_fport;
2397 2397          pports[1] = tcpha->tha_lport;
2398 2398  
2399 2399          connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2400 2400              ports, ipst)];
2401 2401  
2402 2402          mutex_enter(&connfp->connf_lock);
2403 2403          for (tconnp = connfp->connf_head; tconnp != NULL;
2404 2404              tconnp = tconnp->conn_next) {
2405 2405  
2406 2406                  if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2407 2407                      ipha->ipha_dst, ipha->ipha_src, ports) &&
2408 2408                      tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
2409 2409                      (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
2410 2410  
2411 2411                          ASSERT(tconnp != connp);
2412 2412                          CONN_INC_REF(tconnp);
2413 2413                          mutex_exit(&connfp->connf_lock);
2414 2414                          return (tconnp);
2415 2415                  }
2416 2416          }
2417 2417          mutex_exit(&connfp->connf_lock);
2418 2418          return (NULL);
2419 2419  }
2420 2420  
2421 2421  /*
2422 2422   * Search for a peer TCP/IPv6 loopback conn by doing a reverse lookup on
2423 2423   * the {src, dst, lport, fport} quadruplet.  Returns with conn reference
2424 2424   * held; caller must call CONN_DEC_REF.  Only checks for connected entries
2425 2425   * (peer tcp in ESTABLISHED state).
2426 2426   */
2427 2427  conn_t *
2428 2428  ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcpha_t *tcpha,
2429 2429      ip_stack_t *ipst)
2430 2430  {
2431 2431          uint32_t ports;
2432 2432          uint16_t *pports = (uint16_t *)&ports;
2433 2433          connf_t *connfp;
2434 2434          conn_t  *tconnp;
2435 2435          boolean_t zone_chk;
2436 2436  
2437 2437          /*
2438 2438           * If either the source of destination address is loopback, then
2439 2439           * both endpoints must be in the same Zone.  Otherwise, both of
2440 2440           * the addresses are system-wide unique (tcp is in ESTABLISHED
2441 2441           * state) and the endpoints may reside in different Zones.  We
2442 2442           * don't do Zone check for link local address(es) because the
2443 2443           * current Zone implementation treats each link local address as
2444 2444           * being unique per system node, i.e. they belong to global Zone.
2445 2445           */
2446 2446          zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) ||
2447 2447              IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst));
2448 2448  
2449 2449          pports[0] = tcpha->tha_fport;
2450 2450          pports[1] = tcpha->tha_lport;
2451 2451  
2452 2452          connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
2453 2453              ports, ipst)];
2454 2454  
2455 2455          mutex_enter(&connfp->connf_lock);
2456 2456          for (tconnp = connfp->connf_head; tconnp != NULL;
2457 2457              tconnp = tconnp->conn_next) {
2458 2458  
2459 2459                  /* We skip conn_bound_if check here as this is loopback tcp */
2460 2460                  if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2461 2461                      ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2462 2462                      tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED &&
2463 2463                      (!zone_chk || tconnp->conn_zoneid == connp->conn_zoneid)) {
2464 2464  
2465 2465                          ASSERT(tconnp != connp);
2466 2466                          CONN_INC_REF(tconnp);
2467 2467                          mutex_exit(&connfp->connf_lock);
2468 2468                          return (tconnp);
2469 2469                  }
2470 2470          }
2471 2471          mutex_exit(&connfp->connf_lock);
2472 2472          return (NULL);
2473 2473  }
2474 2474  
2475 2475  /*
2476 2476   * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2477 2477   * Returns with conn reference held. Caller must call CONN_DEC_REF.
2478 2478   * Only checks for connected entries i.e. no INADDR_ANY checks.
2479 2479   */
2480 2480  conn_t *
2481 2481  ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcpha_t *tcpha, int min_state,
2482 2482      ip_stack_t *ipst)
2483 2483  {
2484 2484          uint32_t ports;
2485 2485          uint16_t *pports;
2486 2486          connf_t *connfp;
2487 2487          conn_t  *tconnp;
2488 2488  
2489 2489          pports = (uint16_t *)&ports;
2490 2490          pports[0] = tcpha->tha_fport;
2491 2491          pports[1] = tcpha->tha_lport;
2492 2492  
2493 2493          connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst,
2494 2494              ports, ipst)];
2495 2495  
2496 2496          mutex_enter(&connfp->connf_lock);
2497 2497          for (tconnp = connfp->connf_head; tconnp != NULL;
2498 2498              tconnp = tconnp->conn_next) {
2499 2499  
2500 2500                  if (IPCL_CONN_MATCH(tconnp, IPPROTO_TCP,
2501 2501                      ipha->ipha_dst, ipha->ipha_src, ports) &&
2502 2502                      tconnp->conn_tcp->tcp_state >= min_state) {
2503 2503  
2504 2504                          CONN_INC_REF(tconnp);
2505 2505                          mutex_exit(&connfp->connf_lock);
2506 2506                          return (tconnp);
2507 2507                  }
2508 2508          }
2509 2509          mutex_exit(&connfp->connf_lock);
2510 2510          return (NULL);
2511 2511  }
2512 2512  
2513 2513  /*
2514 2514   * Find an exact {src, dst, lport, fport} match for a bounced datagram.
2515 2515   * Returns with conn reference held. Caller must call CONN_DEC_REF.
2516 2516   * Only checks for connected entries i.e. no INADDR_ANY checks.
2517 2517   * Match on ifindex in addition to addresses.
2518 2518   */
2519 2519  conn_t *
2520 2520  ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state,
2521 2521      uint_t ifindex, ip_stack_t *ipst)
2522 2522  {
2523 2523          tcp_t   *tcp;
2524 2524          uint32_t ports;
2525 2525          uint16_t *pports;
2526 2526          connf_t *connfp;
2527 2527          conn_t  *tconnp;
2528 2528  
2529 2529          pports = (uint16_t *)&ports;
2530 2530          pports[0] = tcpha->tha_fport;
2531 2531          pports[1] = tcpha->tha_lport;
2532 2532  
2533 2533          connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst,
2534 2534              ports, ipst)];
2535 2535  
2536 2536          mutex_enter(&connfp->connf_lock);
2537 2537          for (tconnp = connfp->connf_head; tconnp != NULL;
2538 2538              tconnp = tconnp->conn_next) {
2539 2539  
2540 2540                  tcp = tconnp->conn_tcp;
2541 2541                  if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP,
2542 2542                      ip6h->ip6_dst, ip6h->ip6_src, ports) &&
2543 2543                      tcp->tcp_state >= min_state &&
2544 2544                      (tconnp->conn_bound_if == 0 ||
2545 2545                      tconnp->conn_bound_if == ifindex)) {
2546 2546  
2547 2547                          CONN_INC_REF(tconnp);
2548 2548                          mutex_exit(&connfp->connf_lock);
2549 2549                          return (tconnp);
2550 2550                  }
2551 2551          }
2552 2552          mutex_exit(&connfp->connf_lock);
2553 2553          return (NULL);
2554 2554  }
2555 2555  
2556 2556  /*
2557 2557   * Finds a TCP/IPv4 listening connection; called by tcp_disconnect to locate
2558 2558   * a listener when changing state.
2559 2559   */
2560 2560  conn_t *
2561 2561  ipcl_lookup_listener_v4(uint16_t lport, ipaddr_t laddr, zoneid_t zoneid,
2562 2562      ip_stack_t *ipst)
2563 2563  {
2564 2564          connf_t         *bind_connfp;
2565 2565          conn_t          *connp;
2566 2566          tcp_t           *tcp;
2567 2567  
2568 2568          /*
2569 2569           * Avoid false matches for packets sent to an IP destination of
2570 2570           * all zeros.
2571 2571           */
2572 2572          if (laddr == 0)
2573 2573                  return (NULL);
2574 2574  
2575 2575          ASSERT(zoneid != ALL_ZONES);
2576 2576  
2577 2577          bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2578 2578          mutex_enter(&bind_connfp->connf_lock);
2579 2579          for (connp = bind_connfp->connf_head; connp != NULL;
2580 2580              connp = connp->conn_next) {
2581 2581                  tcp = connp->conn_tcp;
2582 2582                  if (IPCL_BIND_MATCH(connp, IPPROTO_TCP, laddr, lport) &&
2583 2583                      IPCL_ZONE_MATCH(connp, zoneid) &&
2584 2584                      (tcp->tcp_listener == NULL)) {
2585 2585                          CONN_INC_REF(connp);
2586 2586                          mutex_exit(&bind_connfp->connf_lock);
2587 2587                          return (connp);
2588 2588                  }
2589 2589          }
2590 2590          mutex_exit(&bind_connfp->connf_lock);
2591 2591          return (NULL);
2592 2592  }
2593 2593  
2594 2594  /*
2595 2595   * Finds a TCP/IPv6 listening connection; called by tcp_disconnect to locate
2596 2596   * a listener when changing state.
2597 2597   */
2598 2598  conn_t *
2599 2599  ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex,
2600 2600      zoneid_t zoneid, ip_stack_t *ipst)
2601 2601  {
2602 2602          connf_t         *bind_connfp;
2603 2603          conn_t          *connp = NULL;
2604 2604          tcp_t           *tcp;
2605 2605  
2606 2606          /*
2607 2607           * Avoid false matches for packets sent to an IP destination of
2608 2608           * all zeros.
2609 2609           */
2610 2610          if (IN6_IS_ADDR_UNSPECIFIED(laddr))
2611 2611                  return (NULL);
2612 2612  
2613 2613          ASSERT(zoneid != ALL_ZONES);
2614 2614  
2615 2615          bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)];
2616 2616          mutex_enter(&bind_connfp->connf_lock);
2617 2617          for (connp = bind_connfp->connf_head; connp != NULL;
2618 2618              connp = connp->conn_next) {
2619 2619                  tcp = connp->conn_tcp;
2620 2620                  if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) &&
2621 2621                      IPCL_ZONE_MATCH(connp, zoneid) &&
2622 2622                      (connp->conn_bound_if == 0 ||
2623 2623                      connp->conn_bound_if == ifindex) &&
2624 2624                      tcp->tcp_listener == NULL) {
2625 2625                          CONN_INC_REF(connp);
2626 2626                          mutex_exit(&bind_connfp->connf_lock);
2627 2627                          return (connp);
2628 2628                  }
2629 2629          }
2630 2630          mutex_exit(&bind_connfp->connf_lock);
2631 2631          return (NULL);
2632 2632  }
2633 2633  
2634 2634  /*
2635 2635   * ipcl_get_next_conn
2636 2636   *      get the next entry in the conn global list
2637 2637   *      and put a reference on the next_conn.
2638 2638   *      decrement the reference on the current conn.
2639 2639   *
2640 2640   * This is an iterator based walker function that also provides for
2641 2641   * some selection by the caller. It walks through the conn_hash bucket
2642 2642   * searching for the next valid connp in the list, and selects connections
2643 2643   * that are neither closed nor condemned. It also REFHOLDS the conn
2644 2644   * thus ensuring that the conn exists when the caller uses the conn.
2645 2645   */
2646 2646  conn_t *
2647 2647  ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags)
2648 2648  {
2649 2649          conn_t  *next_connp;
2650 2650  
2651 2651          if (connfp == NULL)
2652 2652                  return (NULL);
2653 2653  
2654 2654          mutex_enter(&connfp->connf_lock);
2655 2655  
2656 2656          next_connp = (connp == NULL) ?
2657 2657              connfp->connf_head : connp->conn_g_next;
2658 2658  
2659 2659          while (next_connp != NULL) {
2660 2660                  mutex_enter(&next_connp->conn_lock);
2661 2661                  if (!(next_connp->conn_flags & conn_flags) ||
2662 2662                      (next_connp->conn_state_flags &
2663 2663                      (CONN_CONDEMNED | CONN_INCIPIENT))) {
2664 2664                          /*
2665 2665                           * This conn has been condemned or
2666 2666                           * is closing, or the flags don't match
2667 2667                           */
2668 2668                          mutex_exit(&next_connp->conn_lock);
2669 2669                          next_connp = next_connp->conn_g_next;
2670 2670                          continue;
2671 2671                  }
2672 2672                  CONN_INC_REF_LOCKED(next_connp);
2673 2673                  mutex_exit(&next_connp->conn_lock);
2674 2674                  break;
2675 2675          }
2676 2676  
2677 2677          mutex_exit(&connfp->connf_lock);
2678 2678  
2679 2679          if (connp != NULL)
2680 2680                  CONN_DEC_REF(connp);
2681 2681  
2682 2682          return (next_connp);
2683 2683  }
2684 2684  
2685 2685  #ifdef CONN_DEBUG
2686 2686  /*
2687 2687   * Trace of the last NBUF refhold/refrele
2688 2688   */
2689 2689  int
2690 2690  conn_trace_ref(conn_t *connp)
2691 2691  {
2692 2692          int     last;
2693 2693          conn_trace_t    *ctb;
2694 2694  
2695 2695          ASSERT(MUTEX_HELD(&connp->conn_lock));
2696 2696          last = connp->conn_trace_last;
2697 2697          last++;
2698 2698          if (last == CONN_TRACE_MAX)
2699 2699                  last = 0;
2700 2700  
2701 2701          ctb = &connp->conn_trace_buf[last];
2702 2702          ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2703 2703          connp->conn_trace_last = last;
2704 2704          return (1);
2705 2705  }
2706 2706  
2707 2707  int
2708 2708  conn_untrace_ref(conn_t *connp)
2709 2709  {
2710 2710          int     last;
2711 2711          conn_trace_t    *ctb;
2712 2712  
2713 2713          ASSERT(MUTEX_HELD(&connp->conn_lock));
2714 2714          last = connp->conn_trace_last;
  
    | 
      ↓ open down ↓ | 
    2714 lines elided | 
    
      ↑ open up ↑ | 
  
2715 2715          last++;
2716 2716          if (last == CONN_TRACE_MAX)
2717 2717                  last = 0;
2718 2718  
2719 2719          ctb = &connp->conn_trace_buf[last];
2720 2720          ctb->ctb_depth = getpcstack(ctb->ctb_stack, CONN_STACK_DEPTH);
2721 2721          connp->conn_trace_last = last;
2722 2722          return (1);
2723 2723  }
2724 2724  #endif
     2725 +
     2726 +mblk_t *
     2727 +conn_get_pid_mblk(conn_t *connp)
     2728 +{
     2729 +        mblk_t *mblk;
     2730 +        conn_pid_info_t *cpi;
     2731 +
     2732 +        if (connp->conn_upper_handle != NULL) {
     2733 +                return (*connp->conn_upcalls->su_get_sock_pid_mblk)
     2734 +                    (connp->conn_upper_handle);
     2735 +        } else if (!IPCL_IS_NONSTR(connp) && connp->conn_rq != NULL &&
     2736 +            connp->conn_rq->q_stream != NULL) {
     2737 +                return (sh_get_pid_mblk(connp->conn_rq->q_stream));
     2738 +        }
     2739 +
     2740 +        /* return an empty mblk */
     2741 +        if ((mblk = allocb(sizeof (conn_pid_info_t), BPRI_HI)) == NULL)
     2742 +                return (NULL);
     2743 +        mblk->b_wptr += sizeof (conn_pid_info_t);
     2744 +        cpi = (conn_pid_info_t *)mblk->b_datap->db_base;
     2745 +        cpi->cpi_magic = CONN_PID_INFO_MGC;
     2746 +        cpi->cpi_contents = CONN_PID_INFO_NON;
     2747 +        cpi->cpi_pids_cnt = 0;
     2748 +        cpi->cpi_tot_size = sizeof (conn_pid_info_t);
     2749 +        cpi->cpi_pids[0] = 0;
     2750 +        return (mblk);
     2751 +}
    
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX