dccp: starting module template
--- old/usr/src/uts/common/inet/ip/ip.c
+++ new/usr/src/uts/common/inet/ip/ip.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
24 24 * Copyright (c) 1990 Mentat Inc.
25 25 * Copyright (c) 2011 Joyent, Inc. All rights reserved.
26 26 */
27 27
28 28 #include <sys/types.h>
29 29 #include <sys/stream.h>
30 30 #include <sys/dlpi.h>
31 31 #include <sys/stropts.h>
32 32 #include <sys/sysmacros.h>
33 33 #include <sys/strsubr.h>
34 34 #include <sys/strlog.h>
35 35 #include <sys/strsun.h>
36 36 #include <sys/zone.h>
37 37 #define _SUN_TPI_VERSION 2
38 38 #include <sys/tihdr.h>
39 39 #include <sys/xti_inet.h>
40 40 #include <sys/ddi.h>
41 41 #include <sys/suntpi.h>
42 42 #include <sys/cmn_err.h>
43 43 #include <sys/debug.h>
44 44 #include <sys/kobj.h>
45 45 #include <sys/modctl.h>
46 46 #include <sys/atomic.h>
47 47 #include <sys/policy.h>
48 48 #include <sys/priv.h>
49 49 #include <sys/taskq.h>
50 50
51 51 #include <sys/systm.h>
52 52 #include <sys/param.h>
53 53 #include <sys/kmem.h>
54 54 #include <sys/sdt.h>
55 55 #include <sys/socket.h>
56 56 #include <sys/vtrace.h>
57 57 #include <sys/isa_defs.h>
58 58 #include <sys/mac.h>
59 59 #include <net/if.h>
60 60 #include <net/if_arp.h>
61 61 #include <net/route.h>
62 62 #include <sys/sockio.h>
63 63 #include <netinet/in.h>
64 64 #include <net/if_dl.h>
65 65
66 66 #include <inet/common.h>
67 67 #include <inet/mi.h>
68 68 #include <inet/mib2.h>
69 69 #include <inet/nd.h>
70 70 #include <inet/arp.h>
71 71 #include <inet/snmpcom.h>
72 72 #include <inet/optcom.h>
73 73 #include <inet/kstatcom.h>
74 74
75 75 #include <netinet/igmp_var.h>
76 76 #include <netinet/ip6.h>
77 77 #include <netinet/icmp6.h>
78 78 #include <netinet/sctp.h>
79 79
80 80 #include <inet/ip.h>
81 81 #include <inet/ip_impl.h>
82 82 #include <inet/ip6.h>
83 83 #include <inet/ip6_asp.h>
84 84 #include <inet/tcp.h>
85 85 #include <inet/tcp_impl.h>
86 86 #include <inet/ip_multi.h>
87 87 #include <inet/ip_if.h>
88 88 #include <inet/ip_ire.h>
89 89 #include <inet/ip_ftable.h>
90 90 #include <inet/ip_rts.h>
91 91 #include <inet/ip_ndp.h>
92 92 #include <inet/ip_listutils.h>
93 93 #include <netinet/igmp.h>
94 94 #include <netinet/ip_mroute.h>
95 95 #include <inet/ipp_common.h>
96 96
97 97 #include <net/pfkeyv2.h>
98 98 #include <inet/sadb.h>
99 99 #include <inet/ipsec_impl.h>
100 100 #include <inet/iptun/iptun_impl.h>
101 101 #include <inet/ipdrop.h>
102 102 #include <inet/ip_netinfo.h>
103 103 #include <inet/ilb_ip.h>
104 104
105 105 #include <sys/ethernet.h>
106 106 #include <net/if_types.h>
107 107 #include <sys/cpuvar.h>
108 108
109 109 #include <ipp/ipp.h>
110 110 #include <ipp/ipp_impl.h>
111 111 #include <ipp/ipgpc/ipgpc.h>
112 112
113 113 #include <sys/pattr.h>
114 +#include <inet/dccp.h>
115 +#include <inet/dccp_impl.h>
116 +#include <inet/dccp_ip.h>
114 117 #include <inet/ipclassifier.h>
115 118 #include <inet/sctp_ip.h>
116 119 #include <inet/sctp/sctp_impl.h>
117 120 #include <inet/udp_impl.h>
118 121 #include <inet/rawip_impl.h>
119 122 #include <inet/rts_impl.h>
120 123
121 124 #include <sys/tsol/label.h>
122 125 #include <sys/tsol/tnet.h>
123 126
124 127 #include <sys/squeue_impl.h>
125 128 #include <inet/ip_arp.h>
126 129
127 130 #include <sys/clock_impl.h> /* For LBOLT_FASTPATH{,64} */
128 131
129 132 /*
130 133 * Values for squeue switch:
131 134 * IP_SQUEUE_ENTER_NODRAIN: SQ_NODRAIN
132 135 * IP_SQUEUE_ENTER: SQ_PROCESS
133 136 * IP_SQUEUE_FILL: SQ_FILL
134 137 */
 135 138 int ip_squeue_enter = IP_SQUEUE_ENTER;	/* Settable in /etc/system */
136 139
137 140 int ip_squeue_flag;
138 141
139 142 /*
 140 143 * Settable in /etc/system
141 144 */
142 145 int ip_poll_normal_ms = 100;
143 146 int ip_poll_normal_ticks = 0;
144 147 int ip_modclose_ackwait_ms = 3000;
145 148
146 149 /*
147 150 * It would be nice to have these present only in DEBUG systems, but the
148 151 * current design of the global symbol checking logic requires them to be
149 152 * unconditionally present.
150 153 */
151 154 uint_t ip_thread_data; /* TSD key for debug support */
152 155 krwlock_t ip_thread_rwlock;
153 156 list_t ip_thread_list;
154 157
155 158 /*
156 159 * Structure to represent a linked list of msgblks. Used by ip_snmp_ functions.
157 160 */
158 161
159 162 struct listptr_s {
160 163 mblk_t *lp_head; /* pointer to the head of the list */
161 164 mblk_t *lp_tail; /* pointer to the tail of the list */
162 165 };
163 166
164 167 typedef struct listptr_s listptr_t;
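A minimal sketch of how a msgblk would be appended to such a list, assuming the entries are chained through b_cont (the convention the SNMP reply code uses); the helper name listptr_append is hypothetical:

static void
listptr_append(listptr_t *lpp, mblk_t *mp)
{
	if (lpp->lp_head == NULL)
		lpp->lp_head = mp;		/* empty list: mp becomes the head */
	else
		lpp->lp_tail->b_cont = mp;	/* chain mp after the current tail */
	lpp->lp_tail = mp;			/* mp is the new tail */
}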
165 168
166 169 /*
167 170 * This is used by ip_snmp_get_mib2_ip_route_media and
168 171 * ip_snmp_get_mib2_ip6_route_media to carry the lists of return data.
169 172 */
170 173 typedef struct iproutedata_s {
171 174 uint_t ird_idx;
172 175 uint_t ird_flags; /* see below */
173 176 listptr_t ird_route; /* ipRouteEntryTable */
174 177 listptr_t ird_netmedia; /* ipNetToMediaEntryTable */
175 178 listptr_t ird_attrs; /* ipRouteAttributeTable */
176 179 } iproutedata_t;
177 180
178 181 /* Include ire_testhidden and IRE_IF_CLONE routes */
179 182 #define IRD_REPORT_ALL 0x01
180 183
181 184 /*
 182 185 * Cluster-specific hooks. These should be NULL when booted as a non-cluster system.
183 186 */
184 187
185 188 /*
186 189 * Hook functions to enable cluster networking
187 190 * On non-clustered systems these vectors must always be NULL.
188 191 *
 189 192 * Hook function to check whether a specified IP address is a shared IP
 190 193 * address in the cluster.
191 194 *
192 195 */
193 196 int (*cl_inet_isclusterwide)(netstackid_t stack_id, uint8_t protocol,
194 197 sa_family_t addr_family, uint8_t *laddrp, void *args) = NULL;
195 198
196 199 /*
197 200 * Hook function to generate cluster wide ip fragment identifier
198 201 */
199 202 uint32_t (*cl_inet_ipident)(netstackid_t stack_id, uint8_t protocol,
200 203 sa_family_t addr_family, uint8_t *laddrp, uint8_t *faddrp,
201 204 void *args) = NULL;
202 205
203 206 /*
204 207 * Hook function to generate cluster wide SPI.
205 208 */
206 209 void (*cl_inet_getspi)(netstackid_t, uint8_t, uint8_t *, size_t,
207 210 void *) = NULL;
208 211
209 212 /*
 210 213 * Hook function to verify whether the SPI is already utilized.
211 214 */
212 215
213 216 int (*cl_inet_checkspi)(netstackid_t, uint8_t, uint32_t, void *) = NULL;
214 217
215 218 /*
216 219 * Hook function to delete the SPI from the cluster wide repository.
217 220 */
218 221
219 222 void (*cl_inet_deletespi)(netstackid_t, uint8_t, uint32_t, void *) = NULL;
220 223
221 224 /*
 222 225 * Hook function to inform the cluster when a packet is received on an IDLE SA
223 226 */
224 227
225 228 void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t,
226 229 in6_addr_t, in6_addr_t, void *) = NULL;
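A minimal sketch of how these vectors are intended to be consumed, based only on the declarations above: a caller must treat a NULL vector as "not clustered". The wrapper name is hypothetical:

static boolean_t
ip_laddr_is_clusterwide(netstackid_t stack_id, uint8_t protocol,
    uint8_t *laddrp)
{
	/* On non-clustered systems the vector is NULL: nothing is shared. */
	if (cl_inet_isclusterwide == NULL)
		return (B_FALSE);
	return (cl_inet_isclusterwide(stack_id, protocol, AF_INET,
	    laddrp, NULL) != 0);
}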
227 230
228 231 /*
229 232 * Synchronization notes:
230 233 *
231 234 * IP is a fully D_MP STREAMS module/driver. Thus it does not depend on any
232 235 * MT level protection given by STREAMS. IP uses a combination of its own
233 236 * internal serialization mechanism and standard Solaris locking techniques.
234 237 * The internal serialization is per phyint. This is used to serialize
235 238 * plumbing operations, IPMP operations, most set ioctls, etc.
236 239 *
237 240 * Plumbing is a long sequence of operations involving message
238 241 * exchanges between IP, ARP and device drivers. Many set ioctls are typically
239 242 * involved in plumbing operations. A natural model is to serialize these
240 243 * ioctls one per ill. For example plumbing of hme0 and qfe0 can go on in
241 244 * parallel without any interference. But various set ioctls on hme0 are best
242 245 * serialized, along with IPMP operations and processing of DLPI control
243 246 * messages received from drivers on a per phyint basis. This serialization is
244 247 * provided by the ipsq_t and primitives operating on this. Details can
245 248 * be found in ip_if.c above the core primitives operating on ipsq_t.
246 249 *
247 250 * Lookups of an ipif or ill by a thread return a refheld ipif / ill.
 248 251 * Similarly, lookup of an ire by a thread also returns a refheld ire.
249 252 * In addition ipif's and ill's referenced by the ire are also indirectly
250 253 * refheld. Thus no ipif or ill can vanish as long as an ipif is refheld
251 254 * directly or indirectly. For example an SIOCSLIFADDR ioctl that changes the
252 255 * address of an ipif has to go through the ipsq_t. This ensures that only
253 256 * one such exclusive operation proceeds at any time on the ipif. It then
 254 257 * waits for all refcnts associated with this ipif to come down to zero.
 255 258 * The address is changed
256 259 * only after the ipif has been quiesced. Then the ipif is brought up again.
257 260 * More details are described above the comment in ip_sioctl_flags.
258 261 *
 259 262 * Packet processing is based mostly on IREs and is fully multi-threaded
260 263 * using standard Solaris MT techniques.
261 264 *
262 265 * There are explicit locks in IP to handle:
263 266 * - The ip_g_head list maintained by mi_open_link() and friends.
264 267 *
265 268 * - The reassembly data structures (one lock per hash bucket)
266 269 *
267 270 * - conn_lock is meant to protect conn_t fields. The fields actually
268 271 * protected by conn_lock are documented in the conn_t definition.
269 272 *
270 273 * - ire_lock to protect some of the fields of the ire, IRE tables
271 274 * (one lock per hash bucket). Refer to ip_ire.c for details.
272 275 *
273 276 * - ndp_g_lock and ncec_lock for protecting NCEs.
274 277 *
275 278 * - ill_lock protects fields of the ill and ipif. Details in ip.h
276 279 *
277 280 * - ill_g_lock: This is a global reader/writer lock. Protects the following
278 281 * * The AVL tree based global multi list of all ills.
279 282 * * The linked list of all ipifs of an ill
280 283 * * The <ipsq-xop> mapping
281 284 * * <ill-phyint> association
282 285 * Insertion/deletion of an ill in the system, insertion/deletion of an ipif
283 286 * into an ill, changing the <ipsq-xop> mapping of an ill, changing the
284 287 * <ill-phyint> assoc of an ill will all have to hold the ill_g_lock as
285 288 * writer for the actual duration of the insertion/deletion/change.
286 289 *
287 290 * - ill_lock: This is a per ill mutex.
288 291 * It protects some members of the ill_t struct; see ip.h for details.
289 292 * It also protects the <ill-phyint> assoc.
290 293 * It also protects the list of ipifs hanging off the ill.
291 294 *
292 295 * - ipsq_lock: This is a per ipsq_t mutex lock.
293 296 * This protects some members of the ipsq_t struct; see ip.h for details.
294 297 * It also protects the <ipsq-ipxop> mapping
295 298 *
296 299 * - ipx_lock: This is a per ipxop_t mutex lock.
297 300 * This protects some members of the ipxop_t struct; see ip.h for details.
298 301 *
299 302 * - phyint_lock: This is a per phyint mutex lock. Protects just the
300 303 * phyint_flags
301 304 *
302 305 * - ip_addr_avail_lock: This is used to ensure the uniqueness of IP addresses.
 303 306 * This lock is held in ipif_up_done so that marking the ipif IPIF_UP and
 304 307 * the uniqueness check are done atomically.
305 308 *
306 309 * - ill_g_usesrc_lock: This readers/writer lock protects the usesrc
307 310 * group list linked by ill_usesrc_grp_next. It also protects the
308 311 * ill_usesrc_ifindex field. It is taken as a writer when a member of the
309 312 * group is being added or deleted. This lock is taken as a reader when
 310 313 * walking the list/group (e.g. to get the number of members in a usesrc group).
311 314 * Note, it is only necessary to take this lock if the ill_usesrc_grp_next
 312 315 * field is changing state, i.e. from NULL to non-NULL or vice-versa. For
313 316 * example, it is not necessary to take this lock in the initial portion
314 317 * of ip_sioctl_slifusesrc or at all in ip_sioctl_flags since these
315 318 * operations are executed exclusively and that ensures that the "usesrc
316 319 * group state" cannot change. The "usesrc group state" change can happen
317 320 * only in the latter part of ip_sioctl_slifusesrc and in ill_delete.
318 321 *
 319 322 * Changing <ill-phyint>, <ipsq-xop> associations:
320 323 *
321 324 * To change the <ill-phyint> association, the ill_g_lock must be held
322 325 * as writer, and the ill_locks of both the v4 and v6 instance of the ill
323 326 * must be held.
324 327 *
325 328 * To change the <ipsq-xop> association, the ill_g_lock must be held as
326 329 * writer, the ipsq_lock must be held, and one must be writer on the ipsq.
327 330 * This is only done when ills are added or removed from IPMP groups.
328 331 *
329 332 * To add or delete an ipif from the list of ipifs hanging off the ill,
330 333 * ill_g_lock (writer) and ill_lock must be held and the thread must be
331 334 * a writer on the associated ipsq.
332 335 *
333 336 * To add or delete an ill to the system, the ill_g_lock must be held as
334 337 * writer and the thread must be a writer on the associated ipsq.
335 338 *
336 339 * To add or delete an ilm to an ill, the ill_lock must be held and the thread
337 340 * must be a writer on the associated ipsq.
338 341 *
339 342 * Lock hierarchy
340 343 *
341 344 * Some lock hierarchy scenarios are listed below.
342 345 *
343 346 * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock -> ipx_lock
344 347 * ill_g_lock -> ill_lock(s) -> phyint_lock
345 348 * ill_g_lock -> ndp_g_lock -> ill_lock -> ncec_lock
346 349 * ill_g_lock -> ip_addr_avail_lock
347 350 * conn_lock -> irb_lock -> ill_lock -> ire_lock
348 351 * ill_g_lock -> ip_g_nd_lock
349 352 * ill_g_lock -> ips_ipmp_lock -> ill_lock -> nce_lock
350 353 * ill_g_lock -> ndp_g_lock -> ill_lock -> ncec_lock -> nce_lock
351 354 * arl_lock -> ill_lock
352 355 * ips_ire_dep_lock -> irb_lock
353 356 *
 354 357 * When more than one ill lock needs to be held, the ill locks are sorted
 355 358 * by address and acquired starting from the highest-addressed lock
 356 359 * downward, as in the sketch below.
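For the two-lock case, the address-ordered acquisition reads roughly as follows (a sketch; the helper name is hypothetical, and any consistent total order on lock addresses is sufficient to avoid deadlock):

static void
ill_lock_two(ill_t *a, ill_t *b)
{
	/* Assumes a != b; take the higher-addressed ill_lock first. */
	if ((uintptr_t)&a->ill_lock > (uintptr_t)&b->ill_lock) {
		mutex_enter(&a->ill_lock);
		mutex_enter(&b->ill_lock);
	} else {
		mutex_enter(&b->ill_lock);
		mutex_enter(&a->ill_lock);
	}
}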
357 360 *
358 361 * Multicast scenarios
359 362 * ips_ill_g_lock -> ill_mcast_lock
360 363 * conn_ilg_lock -> ips_ill_g_lock -> ill_lock
361 364 * ill_mcast_serializer -> ill_mcast_lock -> ips_ipmp_lock -> ill_lock
362 365 * ill_mcast_serializer -> ill_mcast_lock -> connf_lock -> conn_lock
363 366 * ill_mcast_serializer -> ill_mcast_lock -> conn_ilg_lock
364 367 * ill_mcast_serializer -> ill_mcast_lock -> ips_igmp_timer_lock
365 368 *
366 369 * IPsec scenarios
367 370 *
368 371 * ipsa_lock -> ill_g_lock -> ill_lock
369 372 * ill_g_usesrc_lock -> ill_g_lock -> ill_lock
370 373 *
371 374 * Trusted Solaris scenarios
372 375 *
373 376 * igsa_lock -> gcgrp_rwlock -> gcgrp_lock
374 377 * igsa_lock -> gcdb_lock
375 378 * gcgrp_rwlock -> ire_lock
376 379 * gcgrp_rwlock -> gcdb_lock
377 380 *
378 381 * squeue(sq_lock), flow related (ft_lock, fe_lock) locking
379 382 *
380 383 * cpu_lock --> ill_lock --> sqset_lock --> sq_lock
381 384 * sq_lock -> conn_lock -> QLOCK(q)
382 385 * ill_lock -> ft_lock -> fe_lock
383 386 *
384 387 * Routing/forwarding table locking notes:
385 388 *
386 389 * Lock acquisition order: Radix tree lock, irb_lock.
387 390 * Requirements:
388 391 * i. Walker must not hold any locks during the walker callback.
 389 392 * ii. Walker must not see a truncated tree during the walk because of any node
390 393 * deletion.
 391 394 * iii. Existing code assumes ire_bucket is valid if it is non-null and is used
392 395 * in many places in the code to walk the irb list. Thus even if all the
393 396 * ires in a bucket have been deleted, we still can't free the radix node
394 397 * until the ires have actually been inactive'd (freed).
395 398 *
396 399 * Tree traversal - Need to hold the global tree lock in read mode.
 397 400 * Before dropping the global tree lock, need to increment the ire_refcnt
398 401 * to ensure that the radix node can't be deleted.
399 402 *
400 403 * Tree add - Need to hold the global tree lock in write mode to add a
401 404 * radix node. To prevent the node from being deleted, increment the
402 405 * irb_refcnt, after the node is added to the tree. The ire itself is
403 406 * added later while holding the irb_lock, but not the tree lock.
404 407 *
405 408 * Tree delete - Need to hold the global tree lock and irb_lock in write mode.
406 409 * All associated ires must be inactive (i.e. freed), and irb_refcnt
407 410 * must be zero.
408 411 *
409 412 * Walker - Increment irb_refcnt before calling the walker callback. Hold the
410 413 * global tree lock (read mode) for traversal.
411 414 *
412 415 * IRE dependencies - In some cases we hold ips_ire_dep_lock across ire_refrele
413 416 * hence we will acquire irb_lock while holding ips_ire_dep_lock.
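In outline, the walker step above reduces to the following sketch; irb_refhold()/irb_refrele() stand in for whatever increments and decrements irb_refcnt (hypothetical names here), and per requirement (i) the callback runs with the reference held but no locks:

static void
irb_walk_bucket(irb_t *irb, void (*cb)(irb_t *, void *), void *arg)
{
	irb_refhold(irb);	/* irb_refcnt++ keeps the radix node alive */
	cb(irb, arg);		/* no locks may be held across the callback */
	irb_refrele(irb);	/* dropping the last ref may free the bucket */
}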
414 417 *
 415 418 * IPsec notes:
416 419 *
417 420 * IP interacts with the IPsec code (AH/ESP) by storing IPsec attributes
 418 421 * in the ip_xmit_attr_t and ip_recv_attr_t. For outbound datagrams, the
419 422 * ip_xmit_attr_t has the
420 423 * information used by the IPsec code for applying the right level of
421 424 * protection. The information initialized by IP in the ip_xmit_attr_t
422 425 * is determined by the per-socket policy or global policy in the system.
423 426 * For inbound datagrams, the ip_recv_attr_t
424 427 * starts out with nothing in it. It gets filled
425 428 * with the right information if it goes through the AH/ESP code, which
426 429 * happens if the incoming packet is secure. The information initialized
 427 430 * by AH/ESP is later used by IP (during fanouts to ULP) to see whether
 428 431 * the policy requirements of per-socket policy or global policy
 429 432 * are met.
430 433 *
 431 434 * For fully connected sockets, i.e. dst, src [addr, port] are known,
432 435 * conn_policy_cached is set indicating that policy has been cached.
433 436 * conn_in_enforce_policy may or may not be set depending on whether
434 437 * there is a global policy match or per-socket policy match.
 435 438 * Policy inheritance happens in ip_policy_set once the destination is known.
436 439 * Once the right policy is set on the conn_t, policy cannot change for
437 440 * this socket. This makes life simpler for TCP (UDP ?) where
438 441 * re-transmissions go out with the same policy. For symmetry, policy
439 442 * is cached for fully connected UDP sockets also. Thus if policy is cached,
 440 443 * it also implies that policy is latched, i.e. policy cannot change
 441 444 * on these sockets. As we have the right policy on the conn, we don't
 442 445 * have to look up global policy for every outbound and inbound datagram,
 443 446 * which serves as an optimization. Note that a global policy change
 444 447 * does not affect fully connected sockets if they have policy. If fully
 445 448 * connected sockets did not have any policy associated with them, a
 446 449 * global policy change may affect them.
447 450 *
448 451 * IP Flow control notes:
449 452 * ---------------------
450 453 * Non-TCP streams are flow controlled by IP. The way this is accomplished
451 454 * differs when ILL_CAPAB_DLD_DIRECT is enabled for that IP instance. When
452 455 * ILL_DIRECT_CAPABLE(ill) is TRUE, IP can do direct function calls into
453 456 * GLDv3. Otherwise packets are sent down to lower layers using STREAMS
454 457 * functions.
455 458 *
456 459 * Per Tx ring udp flow control:
457 460 * This is applicable only when ILL_CAPAB_DLD_DIRECT capability is set in
458 461 * the ill (i.e. ILL_DIRECT_CAPABLE(ill) is true).
459 462 *
460 463 * The underlying link can expose multiple Tx rings to the GLDv3 mac layer.
 461 464 * To achieve best performance, outgoing traffic needs to be fanned out among
 462 465 * these Tx rings. mac_tx() is called (via str_mdata_fastpath_put()) to send
463 466 * traffic out of the NIC and it takes a fanout hint. UDP connections pass
 464 467 * the address of connp as the fanout hint to mac_tx(). Under flow-controlled
 465 468 * conditions, mac_tx() returns a non-NULL cookie (ip_mac_tx_cookie_t). This
 466 469 * cookie points to a specific Tx ring that is blocked. The cookie is used to
 467 470 * hash into an idl_tx_list[] entry in the idl_tx_list[] array. Each idl_tx_list_t
 468 471 * points to drain lists (idl_t's). These drain lists store the blocked UDP
469 472 * connp's. The drain list is not a single list but a configurable number of
470 473 * lists.
471 474 *
472 475 * The diagram below shows idl_tx_list_t's and their drain_lists. ip_stack_t
473 476 * has an array of idl_tx_list_t. The size of the array is TX_FANOUT_SIZE
474 477 * which is equal to 128. This array in turn contains a pointer to idl_t[],
475 478 * the ip drain list. The idl_t[] array size is MIN(max_ncpus, 8). The drain
476 479 * list will point to the list of connp's that are flow controlled.
477 480 *
478 481 * --------------- ------- ------- -------
479 482 * |->|drain_list[0]|-->|connp|-->|connp|-->|connp|-->
480 483 * | --------------- ------- ------- -------
481 484 * | --------------- ------- ------- -------
482 485 * |->|drain_list[1]|-->|connp|-->|connp|-->|connp|-->
483 486 * ---------------- | --------------- ------- ------- -------
484 487 * |idl_tx_list[0]|->| --------------- ------- ------- -------
485 488 * ---------------- |->|drain_list[2]|-->|connp|-->|connp|-->|connp|-->
486 489 * | --------------- ------- ------- -------
487 490 * . . . . .
488 491 * | --------------- ------- ------- -------
489 492 * |->|drain_list[n]|-->|connp|-->|connp|-->|connp|-->
490 493 * --------------- ------- ------- -------
491 494 * --------------- ------- ------- -------
492 495 * |->|drain_list[0]|-->|connp|-->|connp|-->|connp|-->
493 496 * | --------------- ------- ------- -------
494 497 * | --------------- ------- ------- -------
495 498 * ---------------- |->|drain_list[1]|-->|connp|-->|connp|-->|connp|-->
496 499 * |idl_tx_list[1]|->| --------------- ------- ------- -------
497 500 * ---------------- | . . . .
498 501 * | --------------- ------- ------- -------
499 502 * |->|drain_list[n]|-->|connp|-->|connp|-->|connp|-->
500 503 * --------------- ------- ------- -------
501 504 * .....
502 505 * ----------------
503 506 * |idl_tx_list[n]|-> ...
504 507 * ----------------
505 508 *
506 509 * When mac_tx() returns a cookie, the cookie is hashed into an index into
507 510 * ips_idl_tx_list[], and conn_drain_insert() is called with the idl_tx_list
508 511 * to insert the conn onto. conn_drain_insert() asserts flow control for the
509 512 * sockets via su_txq_full() (non-STREAMS) or QFULL on conn_wq (STREAMS).
510 513 * Further, conn_blocked is set to indicate that the conn is blocked.
511 514 *
512 515 * GLDv3 calls ill_flow_enable() when flow control is relieved. The cookie
513 516 * passed in the call to ill_flow_enable() identifies the blocked Tx ring and
514 517 * is again hashed to locate the appropriate idl_tx_list, which is then
515 518 * drained via conn_walk_drain(). conn_walk_drain() goes through each conn in
516 519 * the drain list and calls conn_drain_remove() to clear flow control (via
517 520 * calling su_txq_full() or clearing QFULL), and remove the conn from the
518 521 * drain list.
519 522 *
520 523 * Note that the drain list is not a single list but a (configurable) array of
521 524 * lists (8 elements by default). Synchronization between drain insertion and
522 525 * flow control wakeup is handled by using idl_txl->txl_lock, and only
523 526 * conn_drain_insert() and conn_drain_remove() manipulate the drain list.
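The cookie-to-drain-list mapping described above amounts to something like the following sketch, assuming a simple modulo hash (the comment does not spell out the actual hash function):

static idl_tx_list_t *
cookie_to_idl_tx_list(ip_stack_t *ipst, ip_mac_tx_cookie_t cookie)
{
	/* Hash the blocked Tx ring's cookie into one of the drain lists. */
	return (&ipst->ips_idl_tx_list[(uintptr_t)cookie % TX_FANOUT_SIZE]);
}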
524 527 *
525 528 * Flow control via STREAMS is used when ILL_DIRECT_CAPABLE() returns FALSE.
526 529 * On the send side, if the packet cannot be sent down to the driver by IP
527 530 * (canput() fails), ip_xmit() drops the packet and returns EWOULDBLOCK to the
528 531 * caller, who may then invoke ixa_check_drain_insert() to insert the conn on
529 532 * the 0'th drain list. When ip_wsrv() runs on the ill_wq because flow
530 533 * control has been relieved, the blocked conns in the 0'th drain list are
531 534 * drained as in the non-STREAMS case.
532 535 *
533 536 * In both the STREAMS and non-STREAMS cases, the sockfs upcall to set QFULL
534 537 * is done when the conn is inserted into the drain list (conn_drain_insert())
 535 538 * and cleared when the conn is removed from it (conn_drain_remove()).
536 539 *
537 540 * IPQOS notes:
538 541 *
539 542 * IPQoS Policies are applied to packets using IPPF (IP Policy framework)
540 543 * and IPQoS modules. IPPF includes hooks in IP at different control points
541 544 * (callout positions) which direct packets to IPQoS modules for policy
542 545 * processing. Policies, if present, are global.
543 546 *
544 547 * The callout positions are located in the following paths:
545 548 * o local_in (packets destined for this host)
 546 549 * o local_out (packets originating from this host)
547 550 * o fwd_in (packets forwarded by this m/c - inbound)
548 551 * o fwd_out (packets forwarded by this m/c - outbound)
549 552 * Hooks at these callout points can be enabled/disabled using the ndd variable
550 553 * ip_policy_mask (a bit mask with the 4 LSB indicating the callout positions).
551 554 * By default all the callout positions are enabled.
552 555 *
553 556 * Outbound (local_out)
554 557 * Hooks are placed in ire_send_wire_v4 and ire_send_wire_v6.
555 558 *
556 559 * Inbound (local_in)
557 560 * Hooks are placed in ip_fanout_v4 and ip_fanout_v6.
558 561 *
559 562 * Forwarding (in and out)
560 563 * Hooks are placed in ire_recv_forward_v4/v6.
561 564 *
562 565 * IP Policy Framework processing (IPPF processing)
563 566 * Policy processing for a packet is initiated by ip_process, which ascertains
564 567 * that the classifier (ipgpc) is loaded and configured, failing which the
 565 568 * packet resumes normal processing in IP. If the classifier is present, the
566 569 * packet is acted upon by one or more IPQoS modules (action instances), per
567 570 * filters configured in ipgpc and resumes normal IP processing thereafter.
 568 571 * An action instance can drop a packet in the course of its processing.
569 572 *
570 573 * Zones notes:
571 574 *
572 575 * The partitioning rules for networking are as follows:
573 576 * 1) Packets coming from a zone must have a source address belonging to that
574 577 * zone.
575 578 * 2) Packets coming from a zone can only be sent on a physical interface on
576 579 * which the zone has an IP address.
577 580 * 3) Between two zones on the same machine, packet delivery is only allowed if
578 581 * there's a matching route for the destination and zone in the forwarding
579 582 * table.
580 583 * 4) The TCP and UDP port spaces are per-zone; that is, two processes in
581 584 * different zones can bind to the same port with the wildcard address
582 585 * (INADDR_ANY).
583 586 *
584 587 * The granularity of interface partitioning is at the logical interface level.
585 588 * Therefore, every zone has its own IP addresses, and incoming packets can be
586 589 * attributed to a zone unambiguously. A logical interface is placed into a zone
587 590 * using the SIOCSLIFZONE ioctl; this sets the ipif_zoneid field in the ipif_t
588 591 * structure. Rule (1) is implemented by modifying the source address selection
589 592 * algorithm so that the list of eligible addresses is filtered based on the
590 593 * sending process zone.
591 594 *
592 595 * The Internet Routing Entries (IREs) are either exclusive to a zone or shared
593 596 * across all zones, depending on their type. Here is the break-up:
594 597 *
595 598 * IRE type Shared/exclusive
596 599 * -------- ----------------
597 600 * IRE_BROADCAST Exclusive
598 601 * IRE_DEFAULT (default routes) Shared (*)
599 602 * IRE_LOCAL Exclusive (x)
600 603 * IRE_LOOPBACK Exclusive
601 604 * IRE_PREFIX (net routes) Shared (*)
602 605 * IRE_IF_NORESOLVER (interface routes) Exclusive
603 606 * IRE_IF_RESOLVER (interface routes) Exclusive
604 607 * IRE_IF_CLONE (interface routes) Exclusive
605 608 * IRE_HOST (host routes) Shared (*)
606 609 *
607 610 * (*) A zone can only use a default or off-subnet route if the gateway is
608 611 * directly reachable from the zone, that is, if the gateway's address matches
609 612 * one of the zone's logical interfaces.
610 613 *
611 614 * (x) IRE_LOCAL are handled a bit differently.
612 615 * When ip_restrict_interzone_loopback is set (the default),
613 616 * ire_route_recursive restricts loopback using an IRE_LOCAL
 614 617 * between zones to the case when L2 would have conceptually looped the packet
615 618 * back, i.e. the loopback which is required since neither Ethernet drivers
616 619 * nor Ethernet hardware loops them back. This is the case when the normal
617 620 * routes (ignoring IREs with different zoneids) would send out the packet on
 618 621 * the same ill as the ill with which the IRE_LOCAL is associated.
619 622 *
620 623 * Multiple zones can share a common broadcast address; typically all zones
621 624 * share the 255.255.255.255 address. Incoming as well as locally originated
622 625 * broadcast packets must be dispatched to all the zones on the broadcast
623 626 * network. For directed broadcasts (e.g. 10.16.72.255) this is not trivial
624 627 * since some zones may not be on the 10.16.72/24 network. To handle this, each
625 628 * zone has its own set of IRE_BROADCAST entries; then, broadcast packets are
626 629 * sent to every zone that has an IRE_BROADCAST entry for the destination
627 630 * address on the input ill, see ip_input_broadcast().
628 631 *
629 632 * Applications in different zones can join the same multicast group address.
630 633 * The same logic applies for multicast as for broadcast. ip_input_multicast
631 634 * dispatches packets to all zones that have members on the physical interface.
632 635 */
633 636
634 637 /*
635 638 * Squeue Fanout flags:
636 639 * 0: No fanout.
637 640 * 1: Fanout across all squeues
638 641 */
639 642 boolean_t ip_squeue_fanout = 0;
640 643
641 644 /*
642 645 * Maximum dups allowed per packet.
643 646 */
644 647 uint_t ip_max_frag_dups = 10;
645 648
646 649 static int ip_open(queue_t *q, dev_t *devp, int flag, int sflag,
647 650 cred_t *credp, boolean_t isv6);
648 651 static mblk_t *ip_xmit_attach_llhdr(mblk_t *, nce_t *);
649 652
650 653 static boolean_t icmp_inbound_verify_v4(mblk_t *, icmph_t *, ip_recv_attr_t *);
651 654 static void icmp_inbound_too_big_v4(icmph_t *, ip_recv_attr_t *);
652 655 static void icmp_inbound_error_fanout_v4(mblk_t *, icmph_t *,
653 656 ip_recv_attr_t *);
654 657 static void icmp_options_update(ipha_t *);
655 658 static void icmp_param_problem(mblk_t *, uint8_t, ip_recv_attr_t *);
656 659 static void icmp_pkt(mblk_t *, void *, size_t, ip_recv_attr_t *);
657 660 static mblk_t *icmp_pkt_err_ok(mblk_t *, ip_recv_attr_t *);
658 661 static void icmp_redirect_v4(mblk_t *mp, ipha_t *, icmph_t *,
659 662 ip_recv_attr_t *);
660 663 static void icmp_send_redirect(mblk_t *, ipaddr_t, ip_recv_attr_t *);
661 664 static void icmp_send_reply_v4(mblk_t *, ipha_t *, icmph_t *,
662 665 ip_recv_attr_t *);
663 666
664 667 mblk_t *ip_dlpi_alloc(size_t, t_uscalar_t);
665 668 char *ip_dot_addr(ipaddr_t, char *);
666 669 mblk_t *ip_carve_mp(mblk_t **, ssize_t);
667 670 int ip_close(queue_t *, int);
668 671 static char *ip_dot_saddr(uchar_t *, char *);
669 672 static void ip_lrput(queue_t *, mblk_t *);
670 673 ipaddr_t ip_net_mask(ipaddr_t);
671 674 char *ip_nv_lookup(nv_t *, int);
672 675 void ip_rput(queue_t *, mblk_t *);
673 676 static void ip_rput_dlpi_writer(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp,
674 677 void *dummy_arg);
675 678 int ip_snmp_get(queue_t *, mblk_t *, int, boolean_t);
676 679 static mblk_t *ip_snmp_get_mib2_ip(queue_t *, mblk_t *,
677 680 mib2_ipIfStatsEntry_t *, ip_stack_t *, boolean_t);
678 681 static mblk_t *ip_snmp_get_mib2_ip_traffic_stats(queue_t *, mblk_t *,
679 682 ip_stack_t *, boolean_t);
680 683 static mblk_t *ip_snmp_get_mib2_ip6(queue_t *, mblk_t *, ip_stack_t *,
681 684 boolean_t);
682 685 static mblk_t *ip_snmp_get_mib2_icmp(queue_t *, mblk_t *, ip_stack_t *ipst);
683 686 static mblk_t *ip_snmp_get_mib2_icmp6(queue_t *, mblk_t *, ip_stack_t *ipst);
684 687 static mblk_t *ip_snmp_get_mib2_igmp(queue_t *, mblk_t *, ip_stack_t *ipst);
685 688 static mblk_t *ip_snmp_get_mib2_multi(queue_t *, mblk_t *, ip_stack_t *ipst);
686 689 static mblk_t *ip_snmp_get_mib2_ip_addr(queue_t *, mblk_t *,
687 690 ip_stack_t *ipst, boolean_t);
688 691 static mblk_t *ip_snmp_get_mib2_ip6_addr(queue_t *, mblk_t *,
689 692 ip_stack_t *ipst, boolean_t);
690 693 static mblk_t *ip_snmp_get_mib2_ip_group_src(queue_t *, mblk_t *,
691 694 ip_stack_t *ipst);
692 695 static mblk_t *ip_snmp_get_mib2_ip6_group_src(queue_t *, mblk_t *,
693 696 ip_stack_t *ipst);
694 697 static mblk_t *ip_snmp_get_mib2_ip_group_mem(queue_t *, mblk_t *,
695 698 ip_stack_t *ipst);
696 699 static mblk_t *ip_snmp_get_mib2_ip6_group_mem(queue_t *, mblk_t *,
697 700 ip_stack_t *ipst);
698 701 static mblk_t *ip_snmp_get_mib2_virt_multi(queue_t *, mblk_t *,
699 702 ip_stack_t *ipst);
700 703 static mblk_t *ip_snmp_get_mib2_multi_rtable(queue_t *, mblk_t *,
701 704 ip_stack_t *ipst);
702 705 static mblk_t *ip_snmp_get_mib2_ip_route_media(queue_t *, mblk_t *, int,
703 706 ip_stack_t *ipst);
704 707 static mblk_t *ip_snmp_get_mib2_ip6_route_media(queue_t *, mblk_t *, int,
705 708 ip_stack_t *ipst);
706 709 static void ip_snmp_get2_v4(ire_t *, iproutedata_t *);
707 710 static void ip_snmp_get2_v6_route(ire_t *, iproutedata_t *);
708 711 static int ip_snmp_get2_v4_media(ncec_t *, iproutedata_t *);
709 712 static int ip_snmp_get2_v6_media(ncec_t *, iproutedata_t *);
710 713 int ip_snmp_set(queue_t *, int, int, uchar_t *, int);
711 714
712 715 static mblk_t *ip_fragment_copyhdr(uchar_t *, int, int, ip_stack_t *,
713 716 mblk_t *);
714 717
715 718 static void conn_drain_init(ip_stack_t *);
716 719 static void conn_drain_fini(ip_stack_t *);
717 720 static void conn_drain(conn_t *connp, boolean_t closing);
718 721
719 722 static void conn_walk_drain(ip_stack_t *, idl_tx_list_t *);
720 723 static void conn_walk_sctp(pfv_t, void *, zoneid_t, netstack_t *);
721 724
722 725 static void *ip_stack_init(netstackid_t stackid, netstack_t *ns);
723 726 static void ip_stack_shutdown(netstackid_t stackid, void *arg);
724 727 static void ip_stack_fini(netstackid_t stackid, void *arg);
725 728
726 729 static int ip_multirt_apply_membership(int (*fn)(conn_t *, boolean_t,
727 730 const in6_addr_t *, ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *),
728 731 ire_t *, conn_t *, boolean_t, const in6_addr_t *, mcast_record_t,
729 732 const in6_addr_t *);
730 733
731 734 static int ip_squeue_switch(int);
732 735
733 736 static void *ip_kstat_init(netstackid_t, ip_stack_t *);
734 737 static void ip_kstat_fini(netstackid_t, kstat_t *);
735 738 static int ip_kstat_update(kstat_t *kp, int rw);
736 739 static void *icmp_kstat_init(netstackid_t);
737 740 static void icmp_kstat_fini(netstackid_t, kstat_t *);
738 741 static int icmp_kstat_update(kstat_t *kp, int rw);
739 742 static void *ip_kstat2_init(netstackid_t, ip_stat_t *);
740 743 static void ip_kstat2_fini(netstackid_t, kstat_t *);
741 744
742 745 static void ipobs_init(ip_stack_t *);
743 746 static void ipobs_fini(ip_stack_t *);
744 747
745 748 static int ip_tp_cpu_update(cpu_setup_t, int, void *);
746 749
747 750 ipaddr_t ip_g_all_ones = IP_HOST_MASK;
748 751
749 752 static long ip_rput_pullups;
750 753 int dohwcksum = 1; /* use h/w cksum if supported by the hardware */
751 754
752 755 vmem_t *ip_minor_arena_sa; /* for minor nos. from INET_MIN_DEV+2 thru 2^^18-1 */
753 756 vmem_t *ip_minor_arena_la; /* for minor nos. from 2^^18 thru 2^^32-1 */
754 757
755 758 int ip_debug;
756 759
757 760 /*
758 761 * Multirouting/CGTP stuff
759 762 */
760 763 int ip_cgtp_filter_rev = CGTP_FILTER_REV; /* CGTP hooks version */
761 764
762 765 /*
763 766 * IP tunables related declarations. Definitions are in ip_tunables.c
764 767 */
765 768 extern mod_prop_info_t ip_propinfo_tbl[];
766 769 extern int ip_propinfo_count;
767 770
768 771 /*
769 772 * Table of IP ioctls encoding the various properties of the ioctl and
770 773 * indexed based on the last byte of the ioctl command. Occasionally there
771 774 * is a clash, and there is more than 1 ioctl with the same last byte.
772 775 * In such a case 1 ioctl is encoded in the ndx table and the remaining
773 776 * ioctls are encoded in the misc table. An entry in the ndx table is
774 777 * retrieved by indexing on the last byte of the ioctl command and comparing
775 778 * the ioctl command with the value in the ndx table. In the event of a
776 779 * mismatch the misc table is then searched sequentially for the desired
 777 780 * ioctl command (a sketch of this lookup appears after the misc table below).
778 781 *
779 782 * Entry: <command> <copyin_size> <flags> <cmd_type> <function> <restart_func>
780 783 */
781 784 ip_ioctl_cmd_t ip_ndx_ioctl_table[] = {
782 785 /* 000 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
783 786 /* 001 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
784 787 /* 002 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
785 788 /* 003 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
786 789 /* 004 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
787 790 /* 005 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
788 791 /* 006 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
789 792 /* 007 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
790 793 /* 008 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
791 794 /* 009 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
792 795
793 796 /* 010 */ { SIOCADDRT, sizeof (struct rtentry), IPI_PRIV,
794 797 MISC_CMD, ip_siocaddrt, NULL },
795 798 /* 011 */ { SIOCDELRT, sizeof (struct rtentry), IPI_PRIV,
796 799 MISC_CMD, ip_siocdelrt, NULL },
797 800
798 801 /* 012 */ { SIOCSIFADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
799 802 IF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart },
800 803 /* 013 */ { SIOCGIFADDR, sizeof (struct ifreq), IPI_GET_CMD,
801 804 IF_CMD, ip_sioctl_get_addr, NULL },
802 805
803 806 /* 014 */ { SIOCSIFDSTADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
804 807 IF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart },
805 808 /* 015 */ { SIOCGIFDSTADDR, sizeof (struct ifreq),
806 809 IPI_GET_CMD, IF_CMD, ip_sioctl_get_dstaddr, NULL },
807 810
808 811 /* 016 */ { SIOCSIFFLAGS, sizeof (struct ifreq),
809 812 IPI_PRIV | IPI_WR,
810 813 IF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart },
811 814 /* 017 */ { SIOCGIFFLAGS, sizeof (struct ifreq),
812 815 IPI_MODOK | IPI_GET_CMD,
813 816 IF_CMD, ip_sioctl_get_flags, NULL },
814 817
815 818 /* 018 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
816 819 /* 019 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
817 820
818 821 /* copyin size cannot be coded for SIOCGIFCONF */
819 822 /* 020 */ { O_SIOCGIFCONF, 0, IPI_GET_CMD,
820 823 MISC_CMD, ip_sioctl_get_ifconf, NULL },
821 824
822 825 /* 021 */ { SIOCSIFMTU, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
823 826 IF_CMD, ip_sioctl_mtu, NULL },
824 827 /* 022 */ { SIOCGIFMTU, sizeof (struct ifreq), IPI_GET_CMD,
825 828 IF_CMD, ip_sioctl_get_mtu, NULL },
826 829 /* 023 */ { SIOCGIFBRDADDR, sizeof (struct ifreq),
827 830 IPI_GET_CMD, IF_CMD, ip_sioctl_get_brdaddr, NULL },
828 831 /* 024 */ { SIOCSIFBRDADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
829 832 IF_CMD, ip_sioctl_brdaddr, NULL },
830 833 /* 025 */ { SIOCGIFNETMASK, sizeof (struct ifreq),
831 834 IPI_GET_CMD, IF_CMD, ip_sioctl_get_netmask, NULL },
832 835 /* 026 */ { SIOCSIFNETMASK, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
833 836 IF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart },
834 837 /* 027 */ { SIOCGIFMETRIC, sizeof (struct ifreq),
835 838 IPI_GET_CMD, IF_CMD, ip_sioctl_get_metric, NULL },
836 839 /* 028 */ { SIOCSIFMETRIC, sizeof (struct ifreq), IPI_PRIV,
837 840 IF_CMD, ip_sioctl_metric, NULL },
838 841 /* 029 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
839 842
840 843 /* See 166-168 below for extended SIOC*XARP ioctls */
841 844 /* 030 */ { SIOCSARP, sizeof (struct arpreq), IPI_PRIV | IPI_WR,
842 845 ARP_CMD, ip_sioctl_arp, NULL },
843 846 /* 031 */ { SIOCGARP, sizeof (struct arpreq), IPI_GET_CMD,
844 847 ARP_CMD, ip_sioctl_arp, NULL },
845 848 /* 032 */ { SIOCDARP, sizeof (struct arpreq), IPI_PRIV | IPI_WR,
846 849 ARP_CMD, ip_sioctl_arp, NULL },
847 850
848 851 /* 033 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
849 852 /* 034 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
850 853 /* 035 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
851 854 /* 036 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
852 855 /* 037 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
853 856 /* 038 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
854 857 /* 039 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
855 858 /* 040 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
856 859 /* 041 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
857 860 /* 042 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
858 861 /* 043 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
859 862 /* 044 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
860 863 /* 045 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
861 864 /* 046 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
862 865 /* 047 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
863 866 /* 048 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
864 867 /* 049 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
865 868 /* 050 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
866 869 /* 051 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
867 870 /* 052 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
868 871 /* 053 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
869 872
870 873 /* 054 */ { IF_UNITSEL, sizeof (int), IPI_PRIV | IPI_WR | IPI_MODOK,
871 874 MISC_CMD, if_unitsel, if_unitsel_restart },
872 875
873 876 /* 055 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
874 877 /* 056 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
875 878 /* 057 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
876 879 /* 058 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
877 880 /* 059 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
878 881 /* 060 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
879 882 /* 061 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
880 883 /* 062 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
881 884 /* 063 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
882 885 /* 064 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
883 886 /* 065 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
884 887 /* 066 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
885 888 /* 067 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
886 889 /* 068 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
887 890 /* 069 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
888 891 /* 070 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
889 892 /* 071 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
890 893 /* 072 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
891 894
892 895 /* 073 */ { SIOCSIFNAME, sizeof (struct ifreq),
893 896 IPI_PRIV | IPI_WR | IPI_MODOK,
894 897 IF_CMD, ip_sioctl_sifname, NULL },
895 898
896 899 /* 074 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
897 900 /* 075 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
898 901 /* 076 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
899 902 /* 077 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
900 903 /* 078 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
901 904 /* 079 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
902 905 /* 080 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
903 906 /* 081 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
904 907 /* 082 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
905 908 /* 083 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
906 909 /* 084 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
907 910 /* 085 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
908 911 /* 086 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
909 912
910 913 /* 087 */ { SIOCGIFNUM, sizeof (int), IPI_GET_CMD,
911 914 MISC_CMD, ip_sioctl_get_ifnum, NULL },
912 915 /* 088 */ { SIOCGIFMUXID, sizeof (struct ifreq), IPI_GET_CMD,
913 916 IF_CMD, ip_sioctl_get_muxid, NULL },
914 917 /* 089 */ { SIOCSIFMUXID, sizeof (struct ifreq),
915 918 IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_muxid, NULL },
916 919
917 920 /* Both if and lif variants share same func */
918 921 /* 090 */ { SIOCGIFINDEX, sizeof (struct ifreq), IPI_GET_CMD,
919 922 IF_CMD, ip_sioctl_get_lifindex, NULL },
920 923 /* Both if and lif variants share same func */
921 924 /* 091 */ { SIOCSIFINDEX, sizeof (struct ifreq),
922 925 IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_slifindex, NULL },
923 926
924 927 /* copyin size cannot be coded for SIOCGIFCONF */
925 928 /* 092 */ { SIOCGIFCONF, 0, IPI_GET_CMD,
926 929 MISC_CMD, ip_sioctl_get_ifconf, NULL },
927 930 /* 093 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
928 931 /* 094 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
929 932 /* 095 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
930 933 /* 096 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
931 934 /* 097 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
932 935 /* 098 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
933 936 /* 099 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
934 937 /* 100 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
935 938 /* 101 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
936 939 /* 102 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
937 940 /* 103 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
938 941 /* 104 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
939 942 /* 105 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
940 943 /* 106 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
941 944 /* 107 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
942 945 /* 108 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
943 946 /* 109 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
944 947
945 948 /* 110 */ { SIOCLIFREMOVEIF, sizeof (struct lifreq),
946 949 IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_removeif,
947 950 ip_sioctl_removeif_restart },
948 951 /* 111 */ { SIOCLIFADDIF, sizeof (struct lifreq),
949 952 IPI_GET_CMD | IPI_PRIV | IPI_WR,
950 953 LIF_CMD, ip_sioctl_addif, NULL },
951 954 #define SIOCLIFADDR_NDX 112
952 955 /* 112 */ { SIOCSLIFADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
953 956 LIF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart },
954 957 /* 113 */ { SIOCGLIFADDR, sizeof (struct lifreq),
955 958 IPI_GET_CMD, LIF_CMD, ip_sioctl_get_addr, NULL },
956 959 /* 114 */ { SIOCSLIFDSTADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
957 960 LIF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart },
958 961 /* 115 */ { SIOCGLIFDSTADDR, sizeof (struct lifreq),
959 962 IPI_GET_CMD, LIF_CMD, ip_sioctl_get_dstaddr, NULL },
960 963 /* 116 */ { SIOCSLIFFLAGS, sizeof (struct lifreq),
961 964 IPI_PRIV | IPI_WR,
962 965 LIF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart },
963 966 /* 117 */ { SIOCGLIFFLAGS, sizeof (struct lifreq),
964 967 IPI_GET_CMD | IPI_MODOK,
965 968 LIF_CMD, ip_sioctl_get_flags, NULL },
966 969
967 970 /* 118 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
968 971 /* 119 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
969 972
970 973 /* 120 */ { O_SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD,
971 974 ip_sioctl_get_lifconf, NULL },
972 975 /* 121 */ { SIOCSLIFMTU, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
973 976 LIF_CMD, ip_sioctl_mtu, NULL },
974 977 /* 122 */ { SIOCGLIFMTU, sizeof (struct lifreq), IPI_GET_CMD,
975 978 LIF_CMD, ip_sioctl_get_mtu, NULL },
976 979 /* 123 */ { SIOCGLIFBRDADDR, sizeof (struct lifreq),
977 980 IPI_GET_CMD, LIF_CMD, ip_sioctl_get_brdaddr, NULL },
978 981 /* 124 */ { SIOCSLIFBRDADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
979 982 LIF_CMD, ip_sioctl_brdaddr, NULL },
980 983 /* 125 */ { SIOCGLIFNETMASK, sizeof (struct lifreq),
981 984 IPI_GET_CMD, LIF_CMD, ip_sioctl_get_netmask, NULL },
982 985 /* 126 */ { SIOCSLIFNETMASK, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
983 986 LIF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart },
984 987 /* 127 */ { SIOCGLIFMETRIC, sizeof (struct lifreq),
985 988 IPI_GET_CMD, LIF_CMD, ip_sioctl_get_metric, NULL },
986 989 /* 128 */ { SIOCSLIFMETRIC, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
987 990 LIF_CMD, ip_sioctl_metric, NULL },
988 991 /* 129 */ { SIOCSLIFNAME, sizeof (struct lifreq),
989 992 IPI_PRIV | IPI_WR | IPI_MODOK,
990 993 LIF_CMD, ip_sioctl_slifname,
991 994 ip_sioctl_slifname_restart },
992 995
993 996 /* 130 */ { SIOCGLIFNUM, sizeof (struct lifnum), IPI_GET_CMD,
994 997 MISC_CMD, ip_sioctl_get_lifnum, NULL },
995 998 /* 131 */ { SIOCGLIFMUXID, sizeof (struct lifreq),
996 999 IPI_GET_CMD, LIF_CMD, ip_sioctl_get_muxid, NULL },
997 1000 /* 132 */ { SIOCSLIFMUXID, sizeof (struct lifreq),
998 1001 IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_muxid, NULL },
999 1002 /* 133 */ { SIOCGLIFINDEX, sizeof (struct lifreq),
1000 1003 IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lifindex, 0 },
1001 1004 /* 134 */ { SIOCSLIFINDEX, sizeof (struct lifreq),
1002 1005 IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_slifindex, 0 },
1003 1006 /* 135 */ { SIOCSLIFTOKEN, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
1004 1007 LIF_CMD, ip_sioctl_token, NULL },
1005 1008 /* 136 */ { SIOCGLIFTOKEN, sizeof (struct lifreq),
1006 1009 IPI_GET_CMD, LIF_CMD, ip_sioctl_get_token, NULL },
1007 1010 /* 137 */ { SIOCSLIFSUBNET, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
1008 1011 LIF_CMD, ip_sioctl_subnet, ip_sioctl_subnet_restart },
1009 1012 /* 138 */ { SIOCGLIFSUBNET, sizeof (struct lifreq),
1010 1013 IPI_GET_CMD, LIF_CMD, ip_sioctl_get_subnet, NULL },
1011 1014 /* 139 */ { SIOCSLIFLNKINFO, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
1012 1015 LIF_CMD, ip_sioctl_lnkinfo, NULL },
1013 1016
1014 1017 /* 140 */ { SIOCGLIFLNKINFO, sizeof (struct lifreq),
1015 1018 IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lnkinfo, NULL },
1016 1019 /* 141 */ { SIOCLIFDELND, sizeof (struct lifreq), IPI_PRIV,
1017 1020 LIF_CMD, ip_siocdelndp_v6, NULL },
1018 1021 /* 142 */ { SIOCLIFGETND, sizeof (struct lifreq), IPI_GET_CMD,
1019 1022 LIF_CMD, ip_siocqueryndp_v6, NULL },
1020 1023 /* 143 */ { SIOCLIFSETND, sizeof (struct lifreq), IPI_PRIV,
1021 1024 LIF_CMD, ip_siocsetndp_v6, NULL },
1022 1025 /* 144 */ { SIOCTMYADDR, sizeof (struct sioc_addrreq), IPI_GET_CMD,
1023 1026 MISC_CMD, ip_sioctl_tmyaddr, NULL },
1024 1027 /* 145 */ { SIOCTONLINK, sizeof (struct sioc_addrreq), IPI_GET_CMD,
1025 1028 MISC_CMD, ip_sioctl_tonlink, NULL },
1026 1029 /* 146 */ { SIOCTMYSITE, sizeof (struct sioc_addrreq), 0,
1027 1030 MISC_CMD, ip_sioctl_tmysite, NULL },
1028 1031 /* 147 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1029 1032 /* 148 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1030 1033 /* IPSECioctls handled in ip_sioctl_copyin_setup itself */
1031 1034 /* 149 */ { SIOCFIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
1032 1035 /* 150 */ { SIOCSIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
1033 1036 /* 151 */ { SIOCDIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
1034 1037 /* 152 */ { SIOCLIPSECONFIG, 0, IPI_PRIV, MISC_CMD, NULL, NULL },
1035 1038
1036 1039 /* 153 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1037 1040
1038 1041 /* 154 */ { SIOCGLIFBINDING, sizeof (struct lifreq), IPI_GET_CMD,
1039 1042 LIF_CMD, ip_sioctl_get_binding, NULL },
1040 1043 /* 155 */ { SIOCSLIFGROUPNAME, sizeof (struct lifreq),
1041 1044 IPI_PRIV | IPI_WR,
1042 1045 LIF_CMD, ip_sioctl_groupname, ip_sioctl_groupname },
1043 1046 /* 156 */ { SIOCGLIFGROUPNAME, sizeof (struct lifreq),
1044 1047 IPI_GET_CMD, LIF_CMD, ip_sioctl_get_groupname, NULL },
1045 1048 /* 157 */ { SIOCGLIFGROUPINFO, sizeof (lifgroupinfo_t),
1046 1049 IPI_GET_CMD, MISC_CMD, ip_sioctl_groupinfo, NULL },
1047 1050
1048 1051 /* Leave 158-160 unused; used to be SIOC*IFARP ioctls */
1049 1052 /* 158 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1050 1053 /* 159 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1051 1054 /* 160 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1052 1055
1053 1056 /* 161 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1054 1057
1055 1058 /* These are handled in ip_sioctl_copyin_setup itself */
1056 1059 /* 162 */ { SIOCGIP6ADDRPOLICY, 0, IPI_NULL_BCONT,
1057 1060 MISC_CMD, NULL, NULL },
1058 1061 /* 163 */ { SIOCSIP6ADDRPOLICY, 0, IPI_PRIV | IPI_NULL_BCONT,
1059 1062 MISC_CMD, NULL, NULL },
1060 1063 /* 164 */ { SIOCGDSTINFO, 0, IPI_GET_CMD, MISC_CMD, NULL, NULL },
1061 1064
1062 1065 /* 165 */ { SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD,
1063 1066 ip_sioctl_get_lifconf, NULL },
1064 1067
1065 1068 /* 166 */ { SIOCSXARP, sizeof (struct xarpreq), IPI_PRIV | IPI_WR,
1066 1069 XARP_CMD, ip_sioctl_arp, NULL },
1067 1070 /* 167 */ { SIOCGXARP, sizeof (struct xarpreq), IPI_GET_CMD,
1068 1071 XARP_CMD, ip_sioctl_arp, NULL },
1069 1072 /* 168 */ { SIOCDXARP, sizeof (struct xarpreq), IPI_PRIV | IPI_WR,
1070 1073 XARP_CMD, ip_sioctl_arp, NULL },
1071 1074
1072 1075 /* SIOCPOPSOCKFS is not handled by IP */
1073 1076 /* 169 */ { IPI_DONTCARE /* SIOCPOPSOCKFS */, 0, 0, 0, NULL, NULL },
1074 1077
1075 1078 /* 170 */ { SIOCGLIFZONE, sizeof (struct lifreq),
1076 1079 IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lifzone, NULL },
1077 1080 /* 171 */ { SIOCSLIFZONE, sizeof (struct lifreq),
1078 1081 IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_slifzone,
1079 1082 ip_sioctl_slifzone_restart },
1080 1083 /* 172-174 are SCTP ioctls and not handled by IP */
1081 1084 /* 172 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1082 1085 /* 173 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1083 1086 /* 174 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1084 1087 /* 175 */ { SIOCGLIFUSESRC, sizeof (struct lifreq),
1085 1088 IPI_GET_CMD, LIF_CMD,
1086 1089 ip_sioctl_get_lifusesrc, 0 },
1087 1090 /* 176 */ { SIOCSLIFUSESRC, sizeof (struct lifreq),
1088 1091 IPI_PRIV | IPI_WR,
1089 1092 LIF_CMD, ip_sioctl_slifusesrc,
1090 1093 NULL },
1091 1094 /* 177 */ { SIOCGLIFSRCOF, 0, IPI_GET_CMD, MISC_CMD,
1092 1095 ip_sioctl_get_lifsrcof, NULL },
1093 1096 /* 178 */ { SIOCGMSFILTER, sizeof (struct group_filter), IPI_GET_CMD,
1094 1097 MSFILT_CMD, ip_sioctl_msfilter, NULL },
1095 1098 /* 179 */ { SIOCSMSFILTER, sizeof (struct group_filter), 0,
1096 1099 MSFILT_CMD, ip_sioctl_msfilter, NULL },
1097 1100 /* 180 */ { SIOCGIPMSFILTER, sizeof (struct ip_msfilter), IPI_GET_CMD,
1098 1101 MSFILT_CMD, ip_sioctl_msfilter, NULL },
1099 1102 /* 181 */ { SIOCSIPMSFILTER, sizeof (struct ip_msfilter), 0,
1100 1103 MSFILT_CMD, ip_sioctl_msfilter, NULL },
1101 1104 /* 182 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1102 1105 /* SIOCSENABLESDP is handled by SDP */
1103 1106 /* 183 */ { IPI_DONTCARE /* SIOCSENABLESDP */, 0, 0, 0, NULL, NULL },
1104 1107 /* 184 */ { IPI_DONTCARE /* SIOCSQPTR */, 0, 0, 0, NULL, NULL },
1105 1108 /* 185 */ { SIOCGIFHWADDR, sizeof (struct ifreq), IPI_GET_CMD,
1106 1109 IF_CMD, ip_sioctl_get_ifhwaddr, NULL },
1107 1110 /* 186 */ { IPI_DONTCARE /* SIOCGSTAMP */, 0, 0, 0, NULL, NULL },
1108 1111 /* 187 */ { SIOCILB, 0, IPI_PRIV | IPI_GET_CMD, MISC_CMD,
1109 1112 ip_sioctl_ilb_cmd, NULL },
1110 1113 /* 188 */ { SIOCGETPROP, 0, IPI_GET_CMD, 0, NULL, NULL },
1111 1114 /* 189 */ { SIOCSETPROP, 0, IPI_PRIV | IPI_WR, 0, NULL, NULL},
1112 1115 /* 190 */ { SIOCGLIFDADSTATE, sizeof (struct lifreq),
1113 1116 IPI_GET_CMD, LIF_CMD, ip_sioctl_get_dadstate, NULL },
1114 1117 /* 191 */ { SIOCSLIFPREFIX, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
1115 1118 LIF_CMD, ip_sioctl_prefix, ip_sioctl_prefix_restart },
1116 1119 /* 192 */ { SIOCGLIFHWADDR, sizeof (struct lifreq), IPI_GET_CMD,
1117 1120 LIF_CMD, ip_sioctl_get_lifhwaddr, NULL }
1118 1121 };
1119 1122
1120 1123 int ip_ndx_ioctl_count = sizeof (ip_ndx_ioctl_table) / sizeof (ip_ioctl_cmd_t);
1121 1124
1122 1125 ip_ioctl_cmd_t ip_misc_ioctl_table[] = {
1123 1126 { I_LINK, 0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
1124 1127 { I_UNLINK, 0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
1125 1128 { I_PLINK, 0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
1126 1129 { I_PUNLINK, 0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
1127 1130 { ND_GET, 0, 0, 0, NULL, NULL },
1128 1131 { ND_SET, 0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
1129 1132 { IP_IOCTL, 0, 0, 0, NULL, NULL },
1130 1133 { SIOCGETVIFCNT, sizeof (struct sioc_vif_req), IPI_GET_CMD,
1131 1134 MISC_CMD, mrt_ioctl},
1132 1135 { SIOCGETSGCNT, sizeof (struct sioc_sg_req), IPI_GET_CMD,
1133 1136 MISC_CMD, mrt_ioctl},
1134 1137 { SIOCGETLSGCNT, sizeof (struct sioc_lsg_req), IPI_GET_CMD,
1135 1138 MISC_CMD, mrt_ioctl}
1136 1139 };
1137 1140
1138 1141 int ip_misc_ioctl_count =
1139 1142 sizeof (ip_misc_ioctl_table) / sizeof (ip_ioctl_cmd_t);
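The two-step lookup described in the comment above ip_ndx_ioctl_table, in sketch form: index the ndx table by the command's last byte, verify the entry matches, and otherwise scan the misc table sequentially. The function name is hypothetical, and the comparison assumes the ipi_cmd member holds the command value:

static ip_ioctl_cmd_t *
ip_ioctl_lookup(int cmd)
{
	int i, ndx = cmd & 0xff;

	/* Fast path: direct index by the last byte of the ioctl command. */
	if (ndx < ip_ndx_ioctl_count && ip_ndx_ioctl_table[ndx].ipi_cmd == cmd)
		return (&ip_ndx_ioctl_table[ndx]);

	/* Clash or miss: fall back to a sequential scan of the misc table. */
	for (i = 0; i < ip_misc_ioctl_count; i++) {
		if (ip_misc_ioctl_table[i].ipi_cmd == cmd)
			return (&ip_misc_ioctl_table[i]);
	}
	return (NULL);
}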
1140 1143
1141 1144 int conn_drain_nthreads; /* Number of drainers reqd. */
1142 1145 /* Settable in /etc/system */
1143 1146 /* Defined in ip_ire.c */
1144 1147 extern uint32_t ip_ire_max_bucket_cnt, ip6_ire_max_bucket_cnt;
1145 1148 extern uint32_t ip_ire_min_bucket_cnt, ip6_ire_min_bucket_cnt;
1146 1149 extern uint32_t ip_ire_mem_ratio, ip_ire_cpu_ratio;
1147 1150
1148 1151 static nv_t ire_nv_arr[] = {
1149 1152 { IRE_BROADCAST, "BROADCAST" },
1150 1153 { IRE_LOCAL, "LOCAL" },
1151 1154 { IRE_LOOPBACK, "LOOPBACK" },
1152 1155 { IRE_DEFAULT, "DEFAULT" },
1153 1156 { IRE_PREFIX, "PREFIX" },
1154 1157 { IRE_IF_NORESOLVER, "IF_NORESOL" },
1155 1158 { IRE_IF_RESOLVER, "IF_RESOLV" },
1156 1159 { IRE_IF_CLONE, "IF_CLONE" },
1157 1160 { IRE_HOST, "HOST" },
1158 1161 { IRE_MULTICAST, "MULTICAST" },
1159 1162 { IRE_NOROUTE, "NOROUTE" },
1160 1163 { 0 }
1161 1164 };
1162 1165
1163 1166 nv_t *ire_nv_tbl = ire_nv_arr;
1164 1167
1165 1168 /* Simple ICMP IP Header Template */
1166 1169 static ipha_t icmp_ipha = {
1167 1170 IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
1168 1171 };
1169 1172
1170 1173 struct module_info ip_mod_info = {
1171 1174 IP_MOD_ID, IP_MOD_NAME, IP_MOD_MINPSZ, IP_MOD_MAXPSZ, IP_MOD_HIWAT,
1172 1175 IP_MOD_LOWAT
1173 1176 };
1174 1177
1175 1178 /*
1176 1179  * Duplicate static symbols within a module confuse mdb, so we avoid the
1177 1180 * problem by making the symbols here distinct from those in udp.c.
1178 1181 */
1179 1182
1180 1183 /*
1181 1184 * Entry points for IP as a device and as a module.
1182 1185 * We have separate open functions for the /dev/ip and /dev/ip6 devices.
1183 1186 */
1184 1187 static struct qinit iprinitv4 = {
1185 1188 (pfi_t)ip_rput, NULL, ip_openv4, ip_close, NULL,
1186 1189 &ip_mod_info
1187 1190 };
1188 1191
1189 1192 struct qinit iprinitv6 = {
1190 1193 (pfi_t)ip_rput_v6, NULL, ip_openv6, ip_close, NULL,
1191 1194 &ip_mod_info
1192 1195 };
1193 1196
1194 1197 static struct qinit ipwinit = {
1195 1198 (pfi_t)ip_wput_nondata, (pfi_t)ip_wsrv, NULL, NULL, NULL,
1196 1199 &ip_mod_info
1197 1200 };
1198 1201
1199 1202 static struct qinit iplrinit = {
1200 1203 (pfi_t)ip_lrput, NULL, ip_openv4, ip_close, NULL,
1201 1204 &ip_mod_info
1202 1205 };
1203 1206
1204 1207 static struct qinit iplwinit = {
1205 1208 (pfi_t)ip_lwput, NULL, NULL, NULL, NULL,
1206 1209 &ip_mod_info
1207 1210 };
1208 1211
1209 1212 /* For AF_INET aka /dev/ip */
1210 1213 struct streamtab ipinfov4 = {
1211 1214 &iprinitv4, &ipwinit, &iplrinit, &iplwinit
1212 1215 };
1213 1216
1214 1217 /* For AF_INET6 aka /dev/ip6 */
1215 1218 struct streamtab ipinfov6 = {
1216 1219 &iprinitv6, &ipwinit, &iplrinit, &iplwinit
1217 1220 };
1218 1221
1219 1222 #ifdef DEBUG
1220 1223 boolean_t skip_sctp_cksum = B_FALSE;
1221 1224 #endif
1222 1225
1223 1226 /*
1224 1227 * Generate an ICMP fragmentation needed message.
1225 1228 * When called from ip_output side a minimal ip_recv_attr_t needs to be
1226 1229 * constructed by the caller.
1227 1230 */
1228 1231 void
1229 1232 icmp_frag_needed(mblk_t *mp, int mtu, ip_recv_attr_t *ira)
1230 1233 {
1231 1234 icmph_t icmph;
1232 1235 ip_stack_t *ipst = ira->ira_ill->ill_ipst;
1233 1236
1234 1237 mp = icmp_pkt_err_ok(mp, ira);
1235 1238 if (mp == NULL)
1236 1239 return;
1237 1240
1238 1241 bzero(&icmph, sizeof (icmph_t));
1239 1242 icmph.icmph_type = ICMP_DEST_UNREACHABLE;
1240 1243 icmph.icmph_code = ICMP_FRAGMENTATION_NEEDED;
1241 1244 icmph.icmph_du_mtu = htons((uint16_t)mtu);
1242 1245 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutFragNeeded);
1243 1246 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDestUnreachs);
1244 1247
1245 1248 icmp_pkt(mp, &icmph, sizeof (icmph_t), ira);
1246 1249 }
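/*
 * Sketch of the ip_output-side calling convention described above. The
 * attribute setup is illustrative rather than copied from a real caller;
 * this path only requires the few fields that icmp_pkt_err_ok()/icmp_pkt()
 * and this function actually read (notably ira_ill):
 *
 *	ip_recv_attr_t	iras;
 *
 *	bzero(&iras, sizeof (iras));
 *	iras.ira_ill = iras.ira_rill = ill;
 *	iras.ira_zoneid = ALL_ZONES;
 *	icmp_frag_needed(mp, new_mtu, &iras);
 */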
1247 1250
1248 1251 /*
1249 1252 * icmp_inbound_v4 deals with ICMP messages that are handled by IP.
1250 1253 * If the ICMP message is consumed by IP, i.e., it should not be delivered
1251 1254 * to any IPPROTO_ICMP raw sockets, then it returns NULL.
1252 1255  * Likewise, if the ICMP error is malformed (too short, etc.), then it
1253 1256 * returns NULL. The caller uses this to determine whether or not to send
1254 1257 * to raw sockets.
1255 1258 *
1256 1259 * All error messages are passed to the matching transport stream.
1257 1260 *
1258 1261  * The following cases are handled by icmp_inbound_v4:
1259 1262  * 1) It needs to send a reply back and possibly deliver it
1260 1263  *    to the "interested" upper clients.
1261 1264  * 2) Return the mblk so that the caller can pass it to the RAW socket clients.
1262 1265  * 3) It needs to change some values in IP only.
1263 1266  * 4) It needs to change some values in IP and upper layers e.g. TCP
1264 1267  *    by delivering an error to the upper layers.
1265 1268  *
1266 1269  * We handle the above four cases in the context of IPsec in the
1267 1270 * following way :
1268 1271 *
1269 1272 * 1) Send the reply back in the same way as the request came in.
1270 1273 * If it came in encrypted, it goes out encrypted. If it came in
1271 1274 * clear, it goes out in clear. Thus, this will prevent chosen
1272 1275  * plaintext attacks.
1273 1276 * 2) The client may or may not expect things to come in secure.
1274 1277 * If it comes in secure, the policy constraints are checked
1275 1278 * before delivering it to the upper layers. If it comes in
1276 1279 * clear, ipsec_inbound_accept_clear will decide whether to
1277 1280 * accept this in clear or not. In both the cases, if the returned
1278 1281 * message (IP header + 8 bytes) that caused the icmp message has
1279 1282 * AH/ESP headers, it is sent up to AH/ESP for validation before
1280 1283 * sending up. If there are only 8 bytes of returned message, then
1281 1284 * upper client will not be notified.
1282 1285  * 3) Check with global policy to see whether it matches the constraints.
1283 1286 * But this will be done only if icmp_accept_messages_in_clear is
1284 1287 * zero.
1285 1288 * 4) If we need to change both in IP and ULP, then the decision taken
1286 1289 * while affecting the values in IP and while delivering up to TCP
1287 1290 * should be the same.
1288 1291 *
1289 1292 * There are two cases.
1290 1293 *
1291 1294 * a) If we reject data at the IP layer (ipsec_check_global_policy()
1292 1295 * failed), we will not deliver it to the ULP, even though they
1293 1296 * are *willing* to accept in *clear*. This is fine as our global
1294 1297  * disposition to icmp messages asks us to reject the datagram.
1295 1298 *
1296 1299 * b) If we accept data at the IP layer (ipsec_check_global_policy()
1297 1300  * succeeded or icmp_accept_messages_in_clear is 1), but are not able
1298 1301 * to deliver it to ULP (policy failed), it can lead to
1299 1302 * consistency problems. The cases known at this time are
1300 1303  * ICMP_DESTINATION_UNREACHABLE messages with the following code
1301 1304 * values :
1302 1305 *
1303 1306 * - ICMP_FRAGMENTATION_NEEDED : IP adapts to the new value
1304 1307 * and Upper layer rejects. Then the communication will
1305 1308 * come to a stop. This is solved by making similar decisions
1306 1309 * at both levels. Currently, when we are unable to deliver
1307 1310 * to the Upper Layer (due to policy failures) while IP has
1308 1311 * adjusted dce_pmtu, the next outbound datagram would
1309 1312 * generate a local ICMP_FRAGMENTATION_NEEDED message - which
1310 1313 * will be with the right level of protection. Thus the right
1311 1314 * value will be communicated even if we are not able to
1312 1315  * communicate it when we first get the error from the wire. But this
1313 1316 * assumes there would be at least one outbound datagram after
1314 1317 * IP has adjusted its dce_pmtu value. To make things
1315 1318 * simpler, we accept in clear after the validation of
1316 1319 * AH/ESP headers.
1317 1320 *
1318 1321 * - Other ICMP ERRORS : We may not be able to deliver it to the
1319 1322 * upper layer depending on the level of protection the upper
1320 1323 * layer expects and the disposition in ipsec_inbound_accept_clear().
1321 1324 * ipsec_inbound_accept_clear() decides whether a given ICMP error
1322 1325 * should be accepted in clear when the Upper layer expects secure.
1323 1326 * Thus the communication may get aborted by some bad ICMP
1324 1327 * packets.
1325 1328 */
1326 1329 mblk_t *
1327 1330 icmp_inbound_v4(mblk_t *mp, ip_recv_attr_t *ira)
1328 1331 {
1329 1332 icmph_t *icmph;
1330 1333 ipha_t *ipha; /* Outer header */
1331 1334 int ip_hdr_length; /* Outer header length */
1332 1335 boolean_t interested;
1333 1336 ipif_t *ipif;
1334 1337 uint32_t ts;
1335 1338 uint32_t *tsp;
1336 1339 timestruc_t now;
1337 1340 ill_t *ill = ira->ira_ill;
1338 1341 ip_stack_t *ipst = ill->ill_ipst;
1339 1342 zoneid_t zoneid = ira->ira_zoneid;
1340 1343 int len_needed;
1341 1344 mblk_t *mp_ret = NULL;
1342 1345
1343 1346 ipha = (ipha_t *)mp->b_rptr;
1344 1347
1345 1348 BUMP_MIB(&ipst->ips_icmp_mib, icmpInMsgs);
1346 1349
1347 1350 ip_hdr_length = ira->ira_ip_hdr_length;
1348 1351 if ((mp->b_wptr - mp->b_rptr) < (ip_hdr_length + ICMPH_SIZE)) {
1349 1352 if (ira->ira_pktlen < (ip_hdr_length + ICMPH_SIZE)) {
1350 1353 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
1351 1354 ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
1352 1355 freemsg(mp);
1353 1356 return (NULL);
1354 1357 }
1355 1358 /* Last chance to get real. */
1356 1359 ipha = ip_pullup(mp, ip_hdr_length + ICMPH_SIZE, ira);
1357 1360 if (ipha == NULL) {
1358 1361 BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors);
1359 1362 freemsg(mp);
1360 1363 return (NULL);
1361 1364 }
1362 1365 }
1363 1366
1364 1367 /* The IP header will always be a multiple of four bytes */
1365 1368 icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
1366 1369 ip2dbg(("icmp_inbound_v4: type %d code %d\n", icmph->icmph_type,
1367 1370 icmph->icmph_code));
1368 1371
1369 1372 /*
1370 1373 * We will set "interested" to "true" if we should pass a copy to
1371 1374 * the transport or if we handle the packet locally.
1372 1375 */
1373 1376 interested = B_FALSE;
1374 1377 switch (icmph->icmph_type) {
1375 1378 case ICMP_ECHO_REPLY:
1376 1379 BUMP_MIB(&ipst->ips_icmp_mib, icmpInEchoReps);
1377 1380 break;
1378 1381 case ICMP_DEST_UNREACHABLE:
1379 1382 if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED)
1380 1383 BUMP_MIB(&ipst->ips_icmp_mib, icmpInFragNeeded);
1381 1384 interested = B_TRUE; /* Pass up to transport */
1382 1385 BUMP_MIB(&ipst->ips_icmp_mib, icmpInDestUnreachs);
1383 1386 break;
1384 1387 case ICMP_SOURCE_QUENCH:
1385 1388 interested = B_TRUE; /* Pass up to transport */
1386 1389 BUMP_MIB(&ipst->ips_icmp_mib, icmpInSrcQuenchs);
1387 1390 break;
1388 1391 case ICMP_REDIRECT:
1389 1392 if (!ipst->ips_ip_ignore_redirect)
1390 1393 interested = B_TRUE;
1391 1394 BUMP_MIB(&ipst->ips_icmp_mib, icmpInRedirects);
1392 1395 break;
1393 1396 case ICMP_ECHO_REQUEST:
1394 1397 /*
1395 1398 * Whether to respond to echo requests that come in as IP
1396 1399 * broadcasts or as IP multicast is subject to debate
1397 1400 * (what isn't?). We aim to please, you pick it.
1398 1401 * Default is do it.
1399 1402 */
1400 1403 if (ira->ira_flags & IRAF_MULTICAST) {
1401 1404 /* multicast: respond based on tunable */
1402 1405 interested = ipst->ips_ip_g_resp_to_echo_mcast;
1403 1406 } else if (ira->ira_flags & IRAF_BROADCAST) {
1404 1407 /* broadcast: respond based on tunable */
1405 1408 interested = ipst->ips_ip_g_resp_to_echo_bcast;
1406 1409 } else {
1407 1410 /* unicast: always respond */
1408 1411 interested = B_TRUE;
1409 1412 }
1410 1413 BUMP_MIB(&ipst->ips_icmp_mib, icmpInEchos);
1411 1414 if (!interested) {
1412 1415 /* We never pass these to RAW sockets */
1413 1416 freemsg(mp);
1414 1417 return (NULL);
1415 1418 }
1416 1419
1417 1420 /* Check db_ref to make sure we can modify the packet. */
1418 1421 if (mp->b_datap->db_ref > 1) {
1419 1422 mblk_t *mp1;
1420 1423
1421 1424 mp1 = copymsg(mp);
1422 1425 freemsg(mp);
1423 1426 if (!mp1) {
1424 1427 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
1425 1428 return (NULL);
1426 1429 }
1427 1430 mp = mp1;
1428 1431 ipha = (ipha_t *)mp->b_rptr;
1429 1432 icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
1430 1433 }
1431 1434 icmph->icmph_type = ICMP_ECHO_REPLY;
1432 1435 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutEchoReps);
1433 1436 icmp_send_reply_v4(mp, ipha, icmph, ira);
1434 1437 return (NULL);
1435 1438
1436 1439 case ICMP_ROUTER_ADVERTISEMENT:
1437 1440 case ICMP_ROUTER_SOLICITATION:
1438 1441 break;
1439 1442 case ICMP_TIME_EXCEEDED:
1440 1443 interested = B_TRUE; /* Pass up to transport */
1441 1444 BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimeExcds);
1442 1445 break;
1443 1446 case ICMP_PARAM_PROBLEM:
1444 1447 interested = B_TRUE; /* Pass up to transport */
1445 1448 BUMP_MIB(&ipst->ips_icmp_mib, icmpInParmProbs);
1446 1449 break;
1447 1450 case ICMP_TIME_STAMP_REQUEST:
1448 1451 /* Response to Time Stamp Requests is local policy. */
1449 1452 if (ipst->ips_ip_g_resp_to_timestamp) {
1450 1453 if (ira->ira_flags & IRAF_MULTIBROADCAST)
1451 1454 interested =
1452 1455 ipst->ips_ip_g_resp_to_timestamp_bcast;
1453 1456 else
1454 1457 interested = B_TRUE;
1455 1458 }
1456 1459 if (!interested) {
1457 1460 /* We never pass these to RAW sockets */
1458 1461 freemsg(mp);
1459 1462 return (NULL);
1460 1463 }
1461 1464
1462 1465 /* Make sure we have enough of the packet */
1463 1466 len_needed = ip_hdr_length + ICMPH_SIZE +
1464 1467 3 * sizeof (uint32_t);
1465 1468
1466 1469 if (mp->b_wptr - mp->b_rptr < len_needed) {
1467 1470 ipha = ip_pullup(mp, len_needed, ira);
1468 1471 if (ipha == NULL) {
1469 1472 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1470 1473 ip_drop_input("ipIfStatsInDiscards - ip_pullup",
1471 1474 mp, ill);
1472 1475 freemsg(mp);
1473 1476 return (NULL);
1474 1477 }
1475 1478 /* Refresh following the pullup. */
1476 1479 icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
1477 1480 }
1478 1481 BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimestamps);
1479 1482 /* Check db_ref to make sure we can modify the packet. */
1480 1483 if (mp->b_datap->db_ref > 1) {
1481 1484 mblk_t *mp1;
1482 1485
1483 1486 mp1 = copymsg(mp);
1484 1487 freemsg(mp);
1485 1488 if (!mp1) {
1486 1489 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
1487 1490 return (NULL);
1488 1491 }
1489 1492 mp = mp1;
1490 1493 ipha = (ipha_t *)mp->b_rptr;
1491 1494 icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
1492 1495 }
1493 1496 icmph->icmph_type = ICMP_TIME_STAMP_REPLY;
1494 1497 tsp = (uint32_t *)&icmph[1];
1495 1498 tsp++; /* Skip past 'originate time' */
1496 1499 /* Compute # of milliseconds since midnight */
1497 1500 gethrestime(&now);
1498 1501 ts = (now.tv_sec % (24 * 60 * 60)) * 1000 +
1499 1502 now.tv_nsec / (NANOSEC / MILLISEC);
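		/* E.g. 01:00:00.250 UTC gives (3600 * 1000) + 250 = 3600250. */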
1500 1503 *tsp++ = htonl(ts); /* Lay in 'receive time' */
1501 1504 *tsp++ = htonl(ts); /* Lay in 'send time' */
1502 1505 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutTimestampReps);
1503 1506 icmp_send_reply_v4(mp, ipha, icmph, ira);
1504 1507 return (NULL);
1505 1508
1506 1509 case ICMP_TIME_STAMP_REPLY:
1507 1510 BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimestampReps);
1508 1511 break;
1509 1512 case ICMP_INFO_REQUEST:
1510 1513 /* Per RFC 1122 3.2.2.7, ignore this. */
1511 1514 case ICMP_INFO_REPLY:
1512 1515 break;
1513 1516 case ICMP_ADDRESS_MASK_REQUEST:
1514 1517 if (ira->ira_flags & IRAF_MULTIBROADCAST) {
1515 1518 interested =
1516 1519 ipst->ips_ip_respond_to_address_mask_broadcast;
1517 1520 } else {
1518 1521 interested = B_TRUE;
1519 1522 }
1520 1523 if (!interested) {
1521 1524 /* We never pass these to RAW sockets */
1522 1525 freemsg(mp);
1523 1526 return (NULL);
1524 1527 }
1525 1528 len_needed = ip_hdr_length + ICMPH_SIZE + IP_ADDR_LEN;
1526 1529 if (mp->b_wptr - mp->b_rptr < len_needed) {
1527 1530 ipha = ip_pullup(mp, len_needed, ira);
1528 1531 if (ipha == NULL) {
1529 1532 BUMP_MIB(ill->ill_ip_mib,
1530 1533 ipIfStatsInTruncatedPkts);
1531 1534 ip_drop_input("ipIfStatsInTruncatedPkts", mp,
1532 1535 ill);
1533 1536 freemsg(mp);
1534 1537 return (NULL);
1535 1538 }
1536 1539 /* Refresh following the pullup. */
1537 1540 icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
1538 1541 }
1539 1542 BUMP_MIB(&ipst->ips_icmp_mib, icmpInAddrMasks);
1540 1543 /* Check db_ref to make sure we can modify the packet. */
1541 1544 if (mp->b_datap->db_ref > 1) {
1542 1545 mblk_t *mp1;
1543 1546
1544 1547 mp1 = copymsg(mp);
1545 1548 freemsg(mp);
1546 1549 if (!mp1) {
1547 1550 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
1548 1551 return (NULL);
1549 1552 }
1550 1553 mp = mp1;
1551 1554 ipha = (ipha_t *)mp->b_rptr;
1552 1555 icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
1553 1556 }
1554 1557 /*
1555 1558		 * We need the ipif whose mask we report to match the source
1556 1559 * address of the mask reply. For unicast we have a specific
1557 1560 * ipif. For multicast/broadcast we only handle onlink
1558 1561 * senders, and use the source address to pick an ipif.
1559 1562 */
1560 1563 ipif = ipif_lookup_addr(ipha->ipha_dst, ill, zoneid, ipst);
1561 1564 if (ipif == NULL) {
1562 1565 /* Broadcast or multicast */
1563 1566 ipif = ipif_lookup_remote(ill, ipha->ipha_src, zoneid);
1564 1567 if (ipif == NULL) {
1565 1568 freemsg(mp);
1566 1569 return (NULL);
1567 1570 }
1568 1571 }
1569 1572 icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY;
1570 1573 bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN);
1571 1574 ipif_refrele(ipif);
1572 1575 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutAddrMaskReps);
1573 1576 icmp_send_reply_v4(mp, ipha, icmph, ira);
1574 1577 return (NULL);
1575 1578
1576 1579 case ICMP_ADDRESS_MASK_REPLY:
1577 1580 BUMP_MIB(&ipst->ips_icmp_mib, icmpInAddrMaskReps);
1578 1581 break;
1579 1582 default:
1580 1583 interested = B_TRUE; /* Pass up to transport */
1581 1584 BUMP_MIB(&ipst->ips_icmp_mib, icmpInUnknowns);
1582 1585 break;
1583 1586 }
1584 1587 /*
1585 1588	 * See if there is an ICMP client; this lets us avoid an extra
1586 1589	 * copymsg/freemsg if there isn't one.
1587 1590 */
1588 1591 if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_ICMP].connf_head != NULL) {
1589 1592 /* If there is an ICMP client and we want one too, copy it. */
1590 1593
1591 1594 if (!interested) {
1592 1595 /* Caller will deliver to RAW sockets */
1593 1596 return (mp);
1594 1597 }
1595 1598 mp_ret = copymsg(mp);
1596 1599 if (mp_ret == NULL) {
1597 1600 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1598 1601 ip_drop_input("ipIfStatsInDiscards - copymsg", mp, ill);
1599 1602 }
1600 1603 } else if (!interested) {
1601 1604 /* Neither we nor raw sockets are interested. Drop packet now */
1602 1605 freemsg(mp);
1603 1606 return (NULL);
1604 1607 }
1605 1608
1606 1609 /*
1607 1610 * ICMP error or redirect packet. Make sure we have enough of
1608 1611 * the header and that db_ref == 1 since we might end up modifying
1609 1612 * the packet.
1610 1613 */
1611 1614 if (mp->b_cont != NULL) {
1612 1615 if (ip_pullup(mp, -1, ira) == NULL) {
1613 1616 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1614 1617 ip_drop_input("ipIfStatsInDiscards - ip_pullup",
1615 1618 mp, ill);
1616 1619 freemsg(mp);
1617 1620 return (mp_ret);
1618 1621 }
1619 1622 }
1620 1623
1621 1624 if (mp->b_datap->db_ref > 1) {
1622 1625 mblk_t *mp1;
1623 1626
1624 1627 mp1 = copymsg(mp);
1625 1628 if (mp1 == NULL) {
1626 1629 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1627 1630 ip_drop_input("ipIfStatsInDiscards - copymsg", mp, ill);
1628 1631 freemsg(mp);
1629 1632 return (mp_ret);
1630 1633 }
1631 1634 freemsg(mp);
1632 1635 mp = mp1;
1633 1636 }
1634 1637
1635 1638 /*
1636 1639 * In case mp has changed, verify the message before any further
1637 1640	 * processing.
1638 1641 */
1639 1642 ipha = (ipha_t *)mp->b_rptr;
1640 1643 icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
1641 1644 if (!icmp_inbound_verify_v4(mp, icmph, ira)) {
1642 1645 freemsg(mp);
1643 1646 return (mp_ret);
1644 1647 }
1645 1648
1646 1649 switch (icmph->icmph_type) {
1647 1650 case ICMP_REDIRECT:
1648 1651 icmp_redirect_v4(mp, ipha, icmph, ira);
1649 1652 break;
1650 1653 case ICMP_DEST_UNREACHABLE:
1651 1654 if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED) {
1652 1655			/* Update DCE and adjust MTU in the icmp header if needed */
1653 1656 icmp_inbound_too_big_v4(icmph, ira);
1654 1657 }
1655 1658 /* FALLTHRU */
1656 1659 default:
1657 1660 icmp_inbound_error_fanout_v4(mp, icmph, ira);
1658 1661 break;
1659 1662 }
1660 1663 return (mp_ret);
1661 1664 }
1662 1665
1663 1666 /*
1664 1667 * Send an ICMP echo, timestamp or address mask reply.
1665 1668 * The caller has already updated the payload part of the packet.
1666 1669 * We handle the ICMP checksum, IP source address selection and feed
1667 1670 * the packet into ip_output_simple.
1668 1671 */
1669 1672 static void
1670 1673 icmp_send_reply_v4(mblk_t *mp, ipha_t *ipha, icmph_t *icmph,
1671 1674 ip_recv_attr_t *ira)
1672 1675 {
1673 1676 uint_t ip_hdr_length = ira->ira_ip_hdr_length;
1674 1677 ill_t *ill = ira->ira_ill;
1675 1678 ip_stack_t *ipst = ill->ill_ipst;
1676 1679 ip_xmit_attr_t ixas;
1677 1680
1678 1681 /* Send out an ICMP packet */
1679 1682 icmph->icmph_checksum = 0;
1680 1683 icmph->icmph_checksum = IP_CSUM(mp, ip_hdr_length, 0);
1681 1684 /* Reset time to live. */
1682 1685 ipha->ipha_ttl = ipst->ips_ip_def_ttl;
1683 1686 {
1684 1687 /* Swap source and destination addresses */
1685 1688 ipaddr_t tmp;
1686 1689
1687 1690 tmp = ipha->ipha_src;
1688 1691 ipha->ipha_src = ipha->ipha_dst;
1689 1692 ipha->ipha_dst = tmp;
1690 1693 }
1691 1694 ipha->ipha_ident = 0;
1692 1695 if (!IS_SIMPLE_IPH(ipha))
1693 1696 icmp_options_update(ipha);
1694 1697
1695 1698 bzero(&ixas, sizeof (ixas));
1696 1699 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
1697 1700 ixas.ixa_zoneid = ira->ira_zoneid;
1698 1701 ixas.ixa_cred = kcred;
1699 1702 ixas.ixa_cpid = NOPID;
1700 1703 ixas.ixa_tsl = ira->ira_tsl; /* Behave as a multi-level responder */
1701 1704 ixas.ixa_ifindex = 0;
1702 1705 ixas.ixa_ipst = ipst;
1703 1706 ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1704 1707
1705 1708 if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) {
1706 1709 /*
1707 1710 * This packet should go out the same way as it
1708 1711		 * came in, i.e., in clear, independent of the IPsec policy
1709 1712 * for transmitting packets.
1710 1713 */
1711 1714 ixas.ixa_flags |= IXAF_NO_IPSEC;
1712 1715 } else {
1713 1716 if (!ipsec_in_to_out(ira, &ixas, mp, ipha, NULL)) {
1714 1717 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1715 1718 /* Note: mp already consumed and ip_drop_packet done */
1716 1719 return;
1717 1720 }
1718 1721 }
1719 1722 if (ira->ira_flags & IRAF_MULTIBROADCAST) {
1720 1723 /*
1721 1724		 * Not one of our addresses (IRE_LOCALs), thus we let
1722 1725 * ip_output_simple pick the source.
1723 1726 */
1724 1727 ipha->ipha_src = INADDR_ANY;
1725 1728 ixas.ixa_flags |= IXAF_SET_SOURCE;
1726 1729 }
1727 1730 /* Should we send with DF and use dce_pmtu? */
1728 1731 if (ipst->ips_ipv4_icmp_return_pmtu) {
1729 1732 ixas.ixa_flags |= IXAF_PMTU_DISCOVERY;
1730 1733 ipha->ipha_fragment_offset_and_flags |= IPH_DF_HTONS;
1731 1734 }
1732 1735
1733 1736 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutMsgs);
1734 1737
1735 1738 (void) ip_output_simple(mp, &ixas);
1736 1739 ixa_cleanup(&ixas);
1737 1740 }
1738 1741
1739 1742 /*
1740 1743  * Verify the ICMP message, for either an ICMP error or a redirect packet.
1741 1744 * The caller should have fully pulled up the message. If it's a redirect
1742 1745 * packet, only basic checks on IP header will be done; otherwise, verify
1743 1746 * the packet by looking at the included ULP header.
1744 1747 *
1745 1748 * Called before icmp_inbound_error_fanout_v4 is called.
1746 1749 */
1747 1750 static boolean_t
1748 1751 icmp_inbound_verify_v4(mblk_t *mp, icmph_t *icmph, ip_recv_attr_t *ira)
1749 1752 {
1750 1753 ill_t *ill = ira->ira_ill;
1751 1754 int hdr_length;
1752 1755 ip_stack_t *ipst = ira->ira_ill->ill_ipst;
1753 1756 conn_t *connp;
1754 1757 ipha_t *ipha; /* Inner IP header */
1755 1758
1756 1759 ipha = (ipha_t *)&icmph[1];
1757 1760 if ((uchar_t *)ipha + IP_SIMPLE_HDR_LENGTH > mp->b_wptr)
1758 1761 goto truncated;
1759 1762
1760 1763 hdr_length = IPH_HDR_LENGTH(ipha);
1761 1764
1762 1765 if ((IPH_HDR_VERSION(ipha) != IPV4_VERSION))
1763 1766 goto discard_pkt;
1764 1767
1765 1768 if (hdr_length < sizeof (ipha_t))
1766 1769 goto truncated;
1767 1770
1768 1771 if ((uchar_t *)ipha + hdr_length > mp->b_wptr)
1769 1772 goto truncated;
1770 1773
1771 1774 /*
1772 1775 * Stop here for ICMP_REDIRECT.
1773 1776 */
1774 1777 if (icmph->icmph_type == ICMP_REDIRECT)
1775 1778 return (B_TRUE);
1776 1779
1777 1780 /*
1778 1781 * ICMP errors only.
1779 1782 */
1780 1783 switch (ipha->ipha_protocol) {
1781 1784 case IPPROTO_UDP:
1782 1785 /*
1783 1786 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
1784 1787 * transport header.
1785 1788 */
1786 1789 if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
1787 1790 mp->b_wptr)
1788 1791 goto truncated;
1789 1792 break;
1790 1793 case IPPROTO_TCP: {
1791 1794 tcpha_t *tcpha;
1792 1795
1793 1796 /*
1794 1797 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
1795 1798 * transport header.
1796 1799 */
1797 1800 if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
1798 1801 mp->b_wptr)
1799 1802 goto truncated;
1800 1803
1801 1804 tcpha = (tcpha_t *)((uchar_t *)ipha + hdr_length);
1802 1805 connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcpha, TCPS_LISTEN,
1803 1806 ipst);
1804 1807 if (connp == NULL)
1805 1808 goto discard_pkt;
1806 1809
1807 1810 if ((connp->conn_verifyicmp != NULL) &&
1808 1811 !connp->conn_verifyicmp(connp, tcpha, icmph, NULL, ira)) {
1809 1812 CONN_DEC_REF(connp);
1810 1813 goto discard_pkt;
1811 1814 }
1812 1815 CONN_DEC_REF(connp);
1813 1816 break;
1814 1817 }
1815 1818 case IPPROTO_SCTP:
1816 1819 /*
1817 1820 * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
1818 1821 * transport header.
1819 1822 */
1820 1823 if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
1821 1824 mp->b_wptr)
1822 1825 goto truncated;
1823 1826 break;
1824 1827 case IPPROTO_ESP:
1825 1828 case IPPROTO_AH:
1826 1829 break;
1827 1830 case IPPROTO_ENCAP:
1828 1831 if ((uchar_t *)ipha + hdr_length + sizeof (ipha_t) >
1829 1832 mp->b_wptr)
1830 1833 goto truncated;
1831 1834 break;
1832 1835 default:
1833 1836 break;
1834 1837 }
1835 1838
1836 1839 return (B_TRUE);
1837 1840
1838 1841 discard_pkt:
1839 1842 /* Bogus ICMP error. */
1840 1843 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1841 1844 return (B_FALSE);
1842 1845
1843 1846 truncated:
1844 1847	/* We pulled up everything already. Must be truncated */
1845 1848 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
1846 1849 ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
1847 1850 return (B_FALSE);
1848 1851 }
1849 1852
1850 1853 /* Table from RFC 1191 */
1851 1854 static int icmp_frag_size_table[] =
1852 1855 { 32000, 17914, 8166, 4352, 2002, 1496, 1006, 508, 296, 68 };
1853 1856
1854 1857 /*
1855 1858  * Process a received ICMP Packet Too Big (fragmentation needed) message.
1856 1859 * Just handles the DCE create/update, including using the above table of
1857 1860 * PMTU guesses. The caller is responsible for validating the packet before
1858 1861 * passing it in and also to fanout the ICMP error to any matching transport
1859 1862 * conns. Assumes the message has been fully pulled up and verified.
1860 1863 *
1861 1864  * Before getting here, the caller has called icmp_inbound_verify_v4(),
1862 1865  * which should have checked with the ULP so that we do not undo changes
1863 1866  * we are about to make to the DCE. For example, TCP might have verified
1864 1867  * that the packet which generated the error is in its send window.
1865 1868  *
1866 1869  * In some cases this routine updates the MTU in the ICMP header of the
1867 1870  * packet; the caller should pass it to the matching ULP after this returns.
1868 1871 */
1869 1872 static void
1870 1873 icmp_inbound_too_big_v4(icmph_t *icmph, ip_recv_attr_t *ira)
1871 1874 {
1872 1875 dce_t *dce;
1873 1876 int old_mtu;
1874 1877 int mtu, orig_mtu;
1875 1878 ipaddr_t dst;
1876 1879 boolean_t disable_pmtud;
1877 1880 ill_t *ill = ira->ira_ill;
1878 1881 ip_stack_t *ipst = ill->ill_ipst;
1879 1882 uint_t hdr_length;
1880 1883 ipha_t *ipha;
1881 1884
1882 1885 /* Caller already pulled up everything. */
1883 1886 ipha = (ipha_t *)&icmph[1];
1884 1887 ASSERT(icmph->icmph_type == ICMP_DEST_UNREACHABLE &&
1885 1888 icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED);
1886 1889 ASSERT(ill != NULL);
1887 1890
1888 1891 hdr_length = IPH_HDR_LENGTH(ipha);
1889 1892
1890 1893 /*
1891 1894 * We handle path MTU for source routed packets since the DCE
1892 1895 * is looked up using the final destination.
1893 1896 */
1894 1897 dst = ip_get_dst(ipha);
1895 1898
1896 1899 dce = dce_lookup_and_add_v4(dst, ipst);
1897 1900 if (dce == NULL) {
1898 1901 /* Couldn't add a unique one - ENOMEM */
1899 1902 ip1dbg(("icmp_inbound_too_big_v4: no dce for 0x%x\n",
1900 1903 ntohl(dst)));
1901 1904 return;
1902 1905 }
1903 1906
1904 1907 /* Check for MTU discovery advice as described in RFC 1191 */
1905 1908 mtu = ntohs(icmph->icmph_du_mtu);
1906 1909 orig_mtu = mtu;
1907 1910 disable_pmtud = B_FALSE;
1908 1911
1909 1912 mutex_enter(&dce->dce_lock);
1910 1913 if (dce->dce_flags & DCEF_PMTU)
1911 1914 old_mtu = dce->dce_pmtu;
1912 1915 else
1913 1916 old_mtu = ill->ill_mtu;
1914 1917
1915 1918 if (icmph->icmph_du_zero != 0 || mtu < ipst->ips_ip_pmtu_min) {
1916 1919 uint32_t length;
1917 1920 int i;
1918 1921
1919 1922 /*
1920 1923 * Use the table from RFC 1191 to figure out
1921 1924 * the next "plateau" based on the length in
1922 1925 * the original IP packet.
1923 1926 */
1924 1927 length = ntohs(ipha->ipha_length);
1925 1928 DTRACE_PROBE2(ip4__pmtu__guess, dce_t *, dce,
1926 1929 uint32_t, length);
1927 1930 if (old_mtu <= length &&
1928 1931 old_mtu >= length - hdr_length) {
1929 1932 /*
1930 1933 * Handle broken BSD 4.2 systems that
1931 1934 * return the wrong ipha_length in ICMP
1932 1935 * errors.
1933 1936 */
1934 1937 ip1dbg(("Wrong mtu: sent %d, dce %d\n",
1935 1938 length, old_mtu));
1936 1939 length -= hdr_length;
1937 1940 }
1938 1941 for (i = 0; i < A_CNT(icmp_frag_size_table); i++) {
1939 1942 if (length > icmp_frag_size_table[i])
1940 1943 break;
1941 1944 }
1942 1945 if (i == A_CNT(icmp_frag_size_table)) {
1943 1946 /* Smaller than IP_MIN_MTU! */
1944 1947 ip1dbg(("Too big for packet size %d\n",
1945 1948 length));
1946 1949 disable_pmtud = B_TRUE;
1947 1950 mtu = ipst->ips_ip_pmtu_min;
1948 1951 } else {
1949 1952 mtu = icmp_frag_size_table[i];
1950 1953 ip1dbg(("Calculated mtu %d, packet size %d, "
1951 1954 "before %d\n", mtu, length, old_mtu));
1952 1955 if (mtu < ipst->ips_ip_pmtu_min) {
1953 1956 mtu = ipst->ips_ip_pmtu_min;
1954 1957 disable_pmtud = B_TRUE;
1955 1958 }
1956 1959 }
1957 1960 }
1958 1961 if (disable_pmtud)
1959 1962 dce->dce_flags |= DCEF_TOO_SMALL_PMTU;
1960 1963 else
1961 1964 dce->dce_flags &= ~DCEF_TOO_SMALL_PMTU;
1962 1965
1963 1966 dce->dce_pmtu = MIN(old_mtu, mtu);
1964 1967 /* Prepare to send the new max frag size for the ULP. */
1965 1968 icmph->icmph_du_zero = 0;
1966 1969 icmph->icmph_du_mtu = htons((uint16_t)dce->dce_pmtu);
1967 1970 DTRACE_PROBE4(ip4__pmtu__change, icmph_t *, icmph, dce_t *,
1968 1971 dce, int, orig_mtu, int, mtu);
1969 1972
1970 1973 /* We now have a PMTU for sure */
1971 1974 dce->dce_flags |= DCEF_PMTU;
1972 1975 dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
1973 1976 mutex_exit(&dce->dce_lock);
1974 1977 /*
1975 1978 * After dropping the lock the new value is visible to everyone.
1976 1979 * Then we bump the generation number so any cached values reinspect
1977 1980 * the dce_t.
1978 1981 */
1979 1982 dce_increment_generation(dce);
1980 1983 dce_refrele(dce);
1981 1984 }
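/*
 * Worked example of the plateau search above: a router that zeroes the
 * MTU field while reporting on a 1500 byte datagram makes us scan the
 * RFC 1191 table until 1500 > 1496, so the new guess becomes 1496; a
 * later report on a 1496 byte packet would step down to 1006, and so on
 * until we reach ip_pmtu_min and set DCEF_TOO_SMALL_PMTU for the path.
 */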
1982 1985
1983 1986 /*
1984 1987 * If the packet in error is Self-Encapsulated, icmp_inbound_error_fanout_v4
1985 1988 * calls this function.
1986 1989 */
1987 1990 static mblk_t *
1988 1991 icmp_inbound_self_encap_error_v4(mblk_t *mp, ipha_t *ipha, ipha_t *in_ipha)
1989 1992 {
1990 1993 int length;
1991 1994
1992 1995 ASSERT(mp->b_datap->db_type == M_DATA);
1993 1996
1994 1997 /* icmp_inbound_v4 has already pulled up the whole error packet */
1995 1998 ASSERT(mp->b_cont == NULL);
1996 1999
1997 2000 /*
1998 2001 * The length that we want to overlay is the inner header
1999 2002 * and what follows it.
2000 2003 */
2001 2004 length = msgdsize(mp) - ((uchar_t *)in_ipha - mp->b_rptr);
2002 2005
2003 2006 /*
2004 2007 * Overlay the inner header and whatever follows it over the
2005 2008 * outer header.
2006 2009 */
2007 2010 bcopy((uchar_t *)in_ipha, (uchar_t *)ipha, length);
2008 2011
2009 2012 /* Adjust for what we removed */
2010 2013 mp->b_wptr -= (uchar_t *)in_ipha - (uchar_t *)ipha;
2011 2014 return (mp);
2012 2015 }
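/*
 * By example: an error payload of [outer IP | inner IP | ULP data] whose
 * outer and inner src/dst match is rewritten in place to
 * [inner IP | ULP data], so the subsequent fanout sees the packet just
 * as the ULP originally sent it.
 */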
2013 2016
2014 2017 /*
2015 2018 * Try to pass the ICMP message upstream in case the ULP cares.
2016 2019 *
2017 2020 * If the packet that caused the ICMP error is secure, we send
2018 2021 * it to AH/ESP to make sure that the attached packet has a
2019 2022 * valid association. ipha in the code below points to the
2020 2023 * IP header of the packet that caused the error.
2021 2024 *
2022 2025 * For IPsec cases, we let the next-layer-up (which has access to
2023 2026 * cached policy on the conn_t, or can query the SPD directly)
2024 2027 * subtract out any IPsec overhead if they must. We therefore make no
2025 2028 * adjustments here for IPsec overhead.
2026 2029 *
2027 2030 * IFN could have been generated locally or by some router.
2028 2031 *
2029 2032 * LOCAL : ire_send_wire (before calling ipsec_out_process) can call
2030 2033  * icmp_frag_needed/icmp_pkt2big_v6 to generate a local IFN.
2031 2034 * This happens because IP adjusted its value of MTU on an
2032 2035  * earlier IFN message and could not tell the upper layer the
2033 2036  * new adjusted value of MTU, e.g., the packet was encrypted
2034 2037  * or there was not enough information to fanout to upper
2035 2038  * layers. Thus on the next outbound datagram, ire_send_wire
2036 2039 * generates the IFN, where IPsec processing has *not* been
2037 2040 * done.
2038 2041 *
2039 2042  * Note that we retain ixa_fragsize across IPsec; thus once
2040 2043  * we have picked ixa_fragsize and entered ipsec_out_process we do
2041 2044  * not change the fragsize even if the path MTU changes before
2042 2045 * we reach ip_output_post_ipsec.
2043 2046 *
2044 2047 * In the local case, IRAF_LOOPBACK will be set indicating
2045 2048 * that IFN was generated locally.
2046 2049 *
2047 2050 * ROUTER : IFN could be secure or non-secure.
2048 2051 *
2049 2052  * * SECURE : If the packet in error has AH/ESP headers, we use
2050 2053  * the IPSEC_IN to fanout to AH/ESP so that those headers can
2051 2054  * be validated. AH/ESP will verify whether there is a valid SA
2052 2055  * or not and send it back. We will fanout again if we have more
2053 2056  * data in the packet.
2054 2057 *
2055 2058 * If the packet in error does not have AH/ESP, we handle it
2056 2059 * like any other case.
2057 2060 *
2058 2061 * * NON_SECURE : If the packet in error has AH/ESP headers, we send it
2059 2062 * up to AH/ESP for validation. AH/ESP will verify whether there is a
2060 2063 * valid SA or not and send it back. We will fanout again if
2061 2064 * we have more data in the packet.
2062 2065 *
2063 2066 * If the packet in error does not have AH/ESP, we handle it
2064 2067 * like any other case.
2065 2068 *
2066 2069 * The caller must have called icmp_inbound_verify_v4.
2067 2070 */
2068 2071 static void
2069 2072 icmp_inbound_error_fanout_v4(mblk_t *mp, icmph_t *icmph, ip_recv_attr_t *ira)
2070 2073 {
2071 2074 uint16_t *up; /* Pointer to ports in ULP header */
2072 2075 uint32_t ports; /* reversed ports for fanout */
2073 2076 ipha_t ripha; /* With reversed addresses */
2074 2077 ipha_t *ipha; /* Inner IP header */
2075 2078 uint_t hdr_length; /* Inner IP header length */
2076 2079 tcpha_t *tcpha;
2077 2080 conn_t *connp;
2078 2081 ill_t *ill = ira->ira_ill;
2079 2082 ip_stack_t *ipst = ill->ill_ipst;
2080 2083 ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec;
2081 2084 ill_t *rill = ira->ira_rill;
2082 2085
2083 2086 /* Caller already pulled up everything. */
2084 2087 ipha = (ipha_t *)&icmph[1];
2085 2088 ASSERT((uchar_t *)&ipha[1] <= mp->b_wptr);
2086 2089 ASSERT(mp->b_cont == NULL);
2087 2090
2088 2091 hdr_length = IPH_HDR_LENGTH(ipha);
2089 2092 ira->ira_protocol = ipha->ipha_protocol;
2090 2093
2091 2094 /*
2092 2095 * We need a separate IP header with the source and destination
2093 2096 * addresses reversed to do fanout/classification because the ipha in
2094 2097 * the ICMP error is in the form we sent it out.
2095 2098 */
2096 2099 ripha.ipha_src = ipha->ipha_dst;
2097 2100 ripha.ipha_dst = ipha->ipha_src;
2098 2101 ripha.ipha_protocol = ipha->ipha_protocol;
2099 2102 ripha.ipha_version_and_hdr_length = ipha->ipha_version_and_hdr_length;
2100 2103
2101 2104 ip2dbg(("icmp_inbound_error_v4: proto %d %x to %x: %d/%d\n",
2102 2105 ripha.ipha_protocol, ntohl(ipha->ipha_src),
2103 2106 ntohl(ipha->ipha_dst),
2104 2107 icmph->icmph_type, icmph->icmph_code));
2105 2108
2106 2109 switch (ipha->ipha_protocol) {
2107 2110 case IPPROTO_UDP:
2108 2111 up = (uint16_t *)((uchar_t *)ipha + hdr_length);
2109 2112
2110 2113 /* Attempt to find a client stream based on port. */
2111 2114 ip2dbg(("icmp_inbound_error_v4: UDP ports %d to %d\n",
2112 2115 ntohs(up[0]), ntohs(up[1])));
2113 2116
2114 2117 /* Note that we send error to all matches. */
2115 2118 ira->ira_flags |= IRAF_ICMP_ERROR;
2116 2119 ip_fanout_udp_multi_v4(mp, &ripha, up[0], up[1], ira);
2117 2120 ira->ira_flags &= ~IRAF_ICMP_ERROR;
2118 2121 return;
2119 2122
2120 2123 case IPPROTO_TCP:
2121 2124 /*
2122 2125 * Find a TCP client stream for this packet.
2123 2126 * Note that we do a reverse lookup since the header is
2124 2127 * in the form we sent it out.
2125 2128 */
2126 2129 tcpha = (tcpha_t *)((uchar_t *)ipha + hdr_length);
2127 2130 connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcpha, TCPS_LISTEN,
2128 2131 ipst);
2129 2132 if (connp == NULL)
2130 2133 goto discard_pkt;
2131 2134
2132 2135 if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) ||
2133 2136 (ira->ira_flags & IRAF_IPSEC_SECURE)) {
2134 2137 mp = ipsec_check_inbound_policy(mp, connp,
2135 2138 ipha, NULL, ira);
2136 2139 if (mp == NULL) {
2137 2140 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2138 2141 /* Note that mp is NULL */
2139 2142 ip_drop_input("ipIfStatsInDiscards", mp, ill);
2140 2143 CONN_DEC_REF(connp);
2141 2144 return;
2142 2145 }
2143 2146 }
2144 2147
2145 2148 ira->ira_flags |= IRAF_ICMP_ERROR;
2146 2149 ira->ira_ill = ira->ira_rill = NULL;
2147 2150 if (IPCL_IS_TCP(connp)) {
2148 2151 SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
2149 2152 connp->conn_recvicmp, connp, ira, SQ_FILL,
2150 2153 SQTAG_TCP_INPUT_ICMP_ERR);
2151 2154 } else {
2152 2155 /* Not TCP; must be SOCK_RAW, IPPROTO_TCP */
2153 2156 (connp->conn_recv)(connp, mp, NULL, ira);
2154 2157 CONN_DEC_REF(connp);
2155 2158 }
2156 2159 ira->ira_ill = ill;
2157 2160 ira->ira_rill = rill;
2158 2161 ira->ira_flags &= ~IRAF_ICMP_ERROR;
2159 2162 return;
2160 2163
2161 2164 case IPPROTO_SCTP:
2162 2165 up = (uint16_t *)((uchar_t *)ipha + hdr_length);
2163 2166 /* Find a SCTP client stream for this packet. */
2164 2167 ((uint16_t *)&ports)[0] = up[1];
2165 2168 ((uint16_t *)&ports)[1] = up[0];
2166 2169
2167 2170 ira->ira_flags |= IRAF_ICMP_ERROR;
2168 2171 ip_fanout_sctp(mp, &ripha, NULL, ports, ira);
2169 2172 ira->ira_flags &= ~IRAF_ICMP_ERROR;
2170 2173 return;
2171 2174
2172 2175 case IPPROTO_ESP:
2173 2176 case IPPROTO_AH:
2174 2177 if (!ipsec_loaded(ipss)) {
2175 2178 ip_proto_not_sup(mp, ira);
2176 2179 return;
2177 2180 }
2178 2181
2179 2182 if (ipha->ipha_protocol == IPPROTO_ESP)
2180 2183 mp = ipsecesp_icmp_error(mp, ira);
2181 2184 else
2182 2185 mp = ipsecah_icmp_error(mp, ira);
2183 2186 if (mp == NULL)
2184 2187 return;
2185 2188
2186 2189 /* Just in case ipsec didn't preserve the NULL b_cont */
2187 2190 if (mp->b_cont != NULL) {
2188 2191 if (!pullupmsg(mp, -1))
2189 2192 goto discard_pkt;
2190 2193 }
2191 2194
2192 2195 /*
2193 2196 * Note that ira_pktlen and ira_ip_hdr_length are no longer
2194 2197 * correct, but we don't use them any more here.
2195 2198 *
2196 2199		 * If successful, the mp has been modified to not include
2197 2200 * the ESP/AH header so we can fanout to the ULP's icmp
2198 2201 * error handler.
2199 2202 */
2200 2203 if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH)
2201 2204 goto truncated;
2202 2205
2203 2206		/* Verify the modified message before any further processing. */
2204 2207 ipha = (ipha_t *)mp->b_rptr;
2205 2208 hdr_length = IPH_HDR_LENGTH(ipha);
2206 2209 icmph = (icmph_t *)&mp->b_rptr[hdr_length];
2207 2210 if (!icmp_inbound_verify_v4(mp, icmph, ira)) {
2208 2211 freemsg(mp);
2209 2212 return;
2210 2213 }
2211 2214
2212 2215 icmp_inbound_error_fanout_v4(mp, icmph, ira);
2213 2216 return;
2214 2217
2215 2218 case IPPROTO_ENCAP: {
2216 2219 /* Look for self-encapsulated packets that caused an error */
2217 2220 ipha_t *in_ipha;
2218 2221
2219 2222 /*
2220 2223		 * Caller has verified that the length is at least
2221 2224		 * the size of the IP header.
2222 2225 */
2223 2226 ASSERT(hdr_length >= sizeof (ipha_t));
2224 2227 /*
2225 2228 * Check the sanity of the inner IP header like
2226 2229 * we did for the outer header.
2227 2230 */
2228 2231 in_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length);
2229 2232 if ((IPH_HDR_VERSION(in_ipha) != IPV4_VERSION)) {
2230 2233 goto discard_pkt;
2231 2234 }
2232 2235 if (IPH_HDR_LENGTH(in_ipha) < sizeof (ipha_t)) {
2233 2236 goto discard_pkt;
2234 2237 }
2235 2238 /* Check for Self-encapsulated tunnels */
2236 2239 if (in_ipha->ipha_src == ipha->ipha_src &&
2237 2240 in_ipha->ipha_dst == ipha->ipha_dst) {
2238 2241
2239 2242 mp = icmp_inbound_self_encap_error_v4(mp, ipha,
2240 2243 in_ipha);
2241 2244 if (mp == NULL)
2242 2245 goto discard_pkt;
2243 2246
2244 2247 /*
2245 2248 * Just in case self_encap didn't preserve the NULL
2246 2249 * b_cont
2247 2250 */
2248 2251 if (mp->b_cont != NULL) {
2249 2252 if (!pullupmsg(mp, -1))
2250 2253 goto discard_pkt;
2251 2254 }
2252 2255 /*
2253 2256 * Note that ira_pktlen and ira_ip_hdr_length are no
2254 2257 * longer correct, but we don't use them any more here.
2255 2258 */
2256 2259 if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH)
2257 2260 goto truncated;
2258 2261
2259 2262 /*
2260 2263 * Verify the modified message before any further
2261 2264			 * processing.
2262 2265 */
2263 2266 ipha = (ipha_t *)mp->b_rptr;
2264 2267 hdr_length = IPH_HDR_LENGTH(ipha);
2265 2268 icmph = (icmph_t *)&mp->b_rptr[hdr_length];
2266 2269 if (!icmp_inbound_verify_v4(mp, icmph, ira)) {
2267 2270 freemsg(mp);
2268 2271 return;
2269 2272 }
2270 2273
2271 2274 /*
2272 2275			 * The packet in error is self-encapsulated, yet
2273 2276			 * we find it further encapsulated, which we could
2274 2277			 * not possibly have generated.
2275 2278 */
2276 2279 if (ipha->ipha_protocol == IPPROTO_ENCAP) {
2277 2280 goto discard_pkt;
2278 2281 }
2279 2282 icmp_inbound_error_fanout_v4(mp, icmph, ira);
2280 2283 return;
2281 2284 }
2282 2285		/* Not self-encapsulated */
2283 2286 /* FALLTHRU */
2284 2287 }
2285 2288 case IPPROTO_IPV6:
2286 2289 if ((connp = ipcl_iptun_classify_v4(&ripha.ipha_src,
2287 2290 &ripha.ipha_dst, ipst)) != NULL) {
2288 2291 ira->ira_flags |= IRAF_ICMP_ERROR;
2289 2292 connp->conn_recvicmp(connp, mp, NULL, ira);
2290 2293 CONN_DEC_REF(connp);
2291 2294 ira->ira_flags &= ~IRAF_ICMP_ERROR;
2292 2295 return;
2293 2296 }
2294 2297 /*
2295 2298		 * No IP tunnel is interested; fall through and see
2296 2299 * if a raw socket will want it.
2297 2300 */
2298 2301 /* FALLTHRU */
2299 2302 default:
2300 2303 ira->ira_flags |= IRAF_ICMP_ERROR;
2301 2304 ip_fanout_proto_v4(mp, &ripha, ira);
2302 2305 ira->ira_flags &= ~IRAF_ICMP_ERROR;
2303 2306 return;
2304 2307 }
2305 2308 /* NOTREACHED */
2306 2309 discard_pkt:
2307 2310 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2308 2311 ip1dbg(("icmp_inbound_error_fanout_v4: drop pkt\n"));
2309 2312 ip_drop_input("ipIfStatsInDiscards", mp, ill);
2310 2313 freemsg(mp);
2311 2314 return;
2312 2315
2313 2316 truncated:
2314 2317	/* We pulled up everything already. Must be truncated */
2315 2318 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
2316 2319 ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
2317 2320 freemsg(mp);
2318 2321 }
2319 2322
2320 2323 /*
2321 2324 * Common IP options parser.
2322 2325 *
2323 2326 * Setup routine: fill in *optp with options-parsing state, then
2324 2327 * tail-call ipoptp_next to return the first option.
2325 2328 */
2326 2329 uint8_t
2327 2330 ipoptp_first(ipoptp_t *optp, ipha_t *ipha)
2328 2331 {
2329 2332 uint32_t totallen; /* total length of all options */
2330 2333
2331 2334 totallen = ipha->ipha_version_and_hdr_length -
2332 2335 (uint8_t)((IP_VERSION << 4) + IP_SIMPLE_HDR_LENGTH_IN_WORDS);
2333 2336 totallen <<= 2;
2334 2337 optp->ipoptp_next = (uint8_t *)(&ipha[1]);
2335 2338 optp->ipoptp_end = optp->ipoptp_next + totallen;
2336 2339 optp->ipoptp_flags = 0;
2337 2340 return (ipoptp_next(optp));
2338 2341 }
2339 2342
2340 2343 /* Like above but without an ipha_t */
2341 2344 uint8_t
2342 2345 ipoptp_first2(ipoptp_t *optp, uint32_t totallen, uint8_t *opt)
2343 2346 {
2344 2347 optp->ipoptp_next = opt;
2345 2348 optp->ipoptp_end = optp->ipoptp_next + totallen;
2346 2349 optp->ipoptp_flags = 0;
2347 2350 return (ipoptp_next(optp));
2348 2351 }
2349 2352
2350 2353 /*
2351 2354 * Common IP options parser: extract next option.
2352 2355 */
2353 2356 uint8_t
2354 2357 ipoptp_next(ipoptp_t *optp)
2355 2358 {
2356 2359 uint8_t *end = optp->ipoptp_end;
2357 2360 uint8_t *cur = optp->ipoptp_next;
2358 2361 uint8_t opt, len, pointer;
2359 2362
2360 2363 /*
2361 2364 * If cur > end already, then the ipoptp_end or ipoptp_next pointer
2362 2365 * has been corrupted.
2363 2366 */
2364 2367 ASSERT(cur <= end);
2365 2368
2366 2369 if (cur == end)
2367 2370 return (IPOPT_EOL);
2368 2371
2369 2372 opt = cur[IPOPT_OPTVAL];
2370 2373
2371 2374 /*
2372 2375 * Skip any NOP options.
2373 2376 */
2374 2377 while (opt == IPOPT_NOP) {
2375 2378 cur++;
2376 2379 if (cur == end)
2377 2380 return (IPOPT_EOL);
2378 2381 opt = cur[IPOPT_OPTVAL];
2379 2382 }
2380 2383
2381 2384 if (opt == IPOPT_EOL)
2382 2385 return (IPOPT_EOL);
2383 2386
2384 2387 /*
2385 2388 * Option requiring a length.
2386 2389 */
2387 2390 if ((cur + 1) >= end) {
2388 2391 optp->ipoptp_flags |= IPOPTP_ERROR;
2389 2392 return (IPOPT_EOL);
2390 2393 }
2391 2394 len = cur[IPOPT_OLEN];
2392 2395 if (len < 2) {
2393 2396 optp->ipoptp_flags |= IPOPTP_ERROR;
2394 2397 return (IPOPT_EOL);
2395 2398 }
2396 2399 optp->ipoptp_cur = cur;
2397 2400 optp->ipoptp_len = len;
2398 2401 optp->ipoptp_next = cur + len;
2399 2402 if (cur + len > end) {
2400 2403 optp->ipoptp_flags |= IPOPTP_ERROR;
2401 2404 return (IPOPT_EOL);
2402 2405 }
2403 2406
2404 2407 /*
2405 2408 * For the options which require a pointer field, make sure
2406 2409  * it's there, and make sure it points to either something
2407 2410 * inside this option, or the end of the option.
2408 2411 */
2409 2412 switch (opt) {
2410 2413 case IPOPT_RR:
2411 2414 case IPOPT_TS:
2412 2415 case IPOPT_LSRR:
2413 2416 case IPOPT_SSRR:
2414 2417 if (len <= IPOPT_OFFSET) {
2415 2418 optp->ipoptp_flags |= IPOPTP_ERROR;
2416 2419 return (opt);
2417 2420 }
2418 2421 pointer = cur[IPOPT_OFFSET];
2419 2422 if (pointer - 1 > len) {
2420 2423 optp->ipoptp_flags |= IPOPTP_ERROR;
2421 2424 return (opt);
2422 2425 }
2423 2426 break;
2424 2427 }
2425 2428
2426 2429 /*
2427 2430 * Sanity check the pointer field based on the type of the
2428 2431 * option.
2429 2432 */
2430 2433 switch (opt) {
2431 2434 case IPOPT_RR:
2432 2435 case IPOPT_SSRR:
2433 2436 case IPOPT_LSRR:
2434 2437 if (pointer < IPOPT_MINOFF_SR)
2435 2438 optp->ipoptp_flags |= IPOPTP_ERROR;
2436 2439 break;
2437 2440 case IPOPT_TS:
2438 2441 if (pointer < IPOPT_MINOFF_IT)
2439 2442 optp->ipoptp_flags |= IPOPTP_ERROR;
2440 2443 /*
2441 2444 * Note that the Internet Timestamp option also
2442 2445	 * contains two four-bit fields (the Overflow field,
2443 2446 * and the Flag field), which follow the pointer
2444 2447 * field. We don't need to check that these fields
2445 2448 * fall within the length of the option because this
2446 2449	 * was implicitly done above. We've checked that the
2447 2450 * pointer value is at least IPOPT_MINOFF_IT, and that
2448 2451 * it falls within the option. Since IPOPT_MINOFF_IT >
2449 2452 * IPOPT_POS_OV_FLG, we don't need the explicit check.
2450 2453 */
2451 2454 ASSERT(len > IPOPT_POS_OV_FLG);
2452 2455 break;
2453 2456 }
2454 2457
2455 2458 return (opt);
2456 2459 }
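/*
 * The canonical iteration pattern, as used by the consumers later in
 * this file (ip_opt_get_user() and icmp_options_update()):
 *
 *	for (optval = ipoptp_first(&opts, ipha);
 *	    optval != IPOPT_EOL;
 *	    optval = ipoptp_next(&opts)) {
 *		if (opts.ipoptp_flags & IPOPTP_ERROR)
 *			break;
 *		... inspect opts.ipoptp_cur and opts.ipoptp_len ...
 *	}
 */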
2457 2460
2458 2461 /*
2459 2462 * Use the outgoing IP header to create an IP_OPTIONS option the way
2460 2463 * it was passed down from the application.
2461 2464 *
2462 2465 * This is compatible with BSD in that it returns
2463 2466 * the reverse source route with the final destination
2464 2467 * as the last entry. The first 4 bytes of the option
2465 2468 * will contain the final destination.
2466 2469 */
2467 2470 int
2468 2471 ip_opt_get_user(conn_t *connp, uchar_t *buf)
2469 2472 {
2470 2473 ipoptp_t opts;
2471 2474 uchar_t *opt;
2472 2475 uint8_t optval;
2473 2476 uint8_t optlen;
2474 2477 uint32_t len = 0;
2475 2478 uchar_t *buf1 = buf;
2476 2479 uint32_t totallen;
2477 2480 ipaddr_t dst;
2478 2481 ip_pkt_t *ipp = &connp->conn_xmit_ipp;
2479 2482
2480 2483 if (!(ipp->ipp_fields & IPPF_IPV4_OPTIONS))
2481 2484 return (0);
2482 2485
2483 2486 totallen = ipp->ipp_ipv4_options_len;
2484 2487 if (totallen & 0x3)
2485 2488 return (0);
2486 2489
2487 2490 buf += IP_ADDR_LEN; /* Leave room for final destination */
2488 2491 len += IP_ADDR_LEN;
2489 2492 bzero(buf1, IP_ADDR_LEN);
2490 2493
2491 2494 dst = connp->conn_faddr_v4;
2492 2495
2493 2496 for (optval = ipoptp_first2(&opts, totallen, ipp->ipp_ipv4_options);
2494 2497 optval != IPOPT_EOL;
2495 2498 optval = ipoptp_next(&opts)) {
2496 2499 int off;
2497 2500
2498 2501 opt = opts.ipoptp_cur;
2499 2502 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
2500 2503 break;
2501 2504 }
2502 2505 optlen = opts.ipoptp_len;
2503 2506
2504 2507 switch (optval) {
2505 2508 case IPOPT_SSRR:
2506 2509 case IPOPT_LSRR:
2507 2510
2508 2511 /*
2509 2512 * Insert destination as the first entry in the source
2510 2513			 * route and move down the entries one step.
2511 2514 * The last entry gets placed at buf1.
2512 2515 */
2513 2516 buf[IPOPT_OPTVAL] = optval;
2514 2517 buf[IPOPT_OLEN] = optlen;
2515 2518 buf[IPOPT_OFFSET] = optlen;
2516 2519
2517 2520 off = optlen - IP_ADDR_LEN;
2518 2521 if (off < 0) {
2519 2522 /* No entries in source route */
2520 2523 break;
2521 2524 }
2522 2525 /* Last entry in source route if not already set */
2523 2526 if (dst == INADDR_ANY)
2524 2527 bcopy(opt + off, buf1, IP_ADDR_LEN);
2525 2528 off -= IP_ADDR_LEN;
2526 2529
2527 2530 while (off > 0) {
2528 2531 bcopy(opt + off,
2529 2532 buf + off + IP_ADDR_LEN,
2530 2533 IP_ADDR_LEN);
2531 2534 off -= IP_ADDR_LEN;
2532 2535 }
2533 2536 /* ipha_dst into first slot */
2534 2537 bcopy(&dst, buf + off + IP_ADDR_LEN,
2535 2538 IP_ADDR_LEN);
2536 2539 buf += optlen;
2537 2540 len += optlen;
2538 2541 break;
2539 2542
2540 2543 default:
2541 2544 bcopy(opt, buf, optlen);
2542 2545 buf += optlen;
2543 2546 len += optlen;
2544 2547 break;
2545 2548 }
2546 2549 }
2547 2550 done:
2548 2551 /* Pad the resulting options */
2549 2552 while (len & 0x3) {
2550 2553 *buf++ = IPOPT_EOL;
2551 2554 len++;
2552 2555 }
2553 2556 return (len);
2554 2557 }
2555 2558
2556 2559 /*
2557 2560 * Update any record route or timestamp options to include this host.
2558 2561 * Reverse any source route option.
2559 2562 * This routine assumes that the options are well formed i.e. that they
2560 2563 * have already been checked.
2561 2564 */
2562 2565 static void
2563 2566 icmp_options_update(ipha_t *ipha)
2564 2567 {
2565 2568 ipoptp_t opts;
2566 2569 uchar_t *opt;
2567 2570 uint8_t optval;
2568 2571 ipaddr_t src; /* Our local address */
2569 2572 ipaddr_t dst;
2570 2573
2571 2574 ip2dbg(("icmp_options_update\n"));
2572 2575 src = ipha->ipha_src;
2573 2576 dst = ipha->ipha_dst;
2574 2577
2575 2578 for (optval = ipoptp_first(&opts, ipha);
2576 2579 optval != IPOPT_EOL;
2577 2580 optval = ipoptp_next(&opts)) {
2578 2581 ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
2579 2582 opt = opts.ipoptp_cur;
2580 2583 ip2dbg(("icmp_options_update: opt %d, len %d\n",
2581 2584 optval, opts.ipoptp_len));
2582 2585 switch (optval) {
2583 2586 int off1, off2;
2584 2587 case IPOPT_SSRR:
2585 2588 case IPOPT_LSRR:
2586 2589 /*
2587 2590 * Reverse the source route. The first entry
2588 2591 * should be the next to last one in the current
2589 2592 * source route (the last entry is our address).
2590 2593 * The last entry should be the final destination.
2591 2594 */
2592 2595 off1 = IPOPT_MINOFF_SR - 1;
2593 2596 off2 = opt[IPOPT_OFFSET] - IP_ADDR_LEN - 1;
2594 2597 if (off2 < 0) {
2595 2598 /* No entries in source route */
2596 2599 ip1dbg((
2597 2600 "icmp_options_update: bad src route\n"));
2598 2601 break;
2599 2602 }
2600 2603 bcopy((char *)opt + off2, &dst, IP_ADDR_LEN);
2601 2604 bcopy(&ipha->ipha_dst, (char *)opt + off2, IP_ADDR_LEN);
2602 2605 bcopy(&dst, &ipha->ipha_dst, IP_ADDR_LEN);
2603 2606 off2 -= IP_ADDR_LEN;
2604 2607
2605 2608 while (off1 < off2) {
2606 2609 bcopy((char *)opt + off1, &src, IP_ADDR_LEN);
2607 2610 bcopy((char *)opt + off2, (char *)opt + off1,
2608 2611 IP_ADDR_LEN);
2609 2612 bcopy(&src, (char *)opt + off2, IP_ADDR_LEN);
2610 2613 off1 += IP_ADDR_LEN;
2611 2614 off2 -= IP_ADDR_LEN;
2612 2615 }
2613 2616 opt[IPOPT_OFFSET] = IPOPT_MINOFF_SR;
2614 2617 break;
2615 2618 }
2616 2619 }
2617 2620 }
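/*
 * Worked example of the reversal above (a sketch, assuming a fully
 * traversed route): if the request recorded gateways G1, G2, G3 and
 * icmp_send_reply_v4() has already swapped ipha_src/ipha_dst so that
 * ipha_dst holds the reply's final destination D, then on return
 * ipha_dst = G3 (the first hop back), the option entries read G2, G1, D,
 * and the pointer is reset to IPOPT_MINOFF_SR.
 */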
2618 2621
2619 2622 /*
2620 2623 * Process received ICMP Redirect messages.
2621 2624 * Assumes the caller has verified that the headers are in the pulled up mblk.
2622 2625 * Consumes mp.
2623 2626 */
2624 2627 static void
2625 2628 icmp_redirect_v4(mblk_t *mp, ipha_t *ipha, icmph_t *icmph, ip_recv_attr_t *ira)
2626 2629 {
2627 2630 ire_t *ire, *nire;
2628 2631 ire_t *prev_ire;
2629 2632 ipaddr_t src, dst, gateway;
2630 2633 ip_stack_t *ipst = ira->ira_ill->ill_ipst;
2631 2634 ipha_t *inner_ipha; /* Inner IP header */
2632 2635
2633 2636 /* Caller already pulled up everything. */
2634 2637 inner_ipha = (ipha_t *)&icmph[1];
2635 2638 src = ipha->ipha_src;
2636 2639 dst = inner_ipha->ipha_dst;
2637 2640 gateway = icmph->icmph_rd_gateway;
2638 2641 /* Make sure the new gateway is reachable somehow. */
2639 2642 ire = ire_ftable_lookup_v4(gateway, 0, 0, IRE_ONLINK, NULL,
2640 2643 ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst, NULL);
2641 2644 /*
2642 2645 * Make sure we had a route for the dest in question and that
2643 2646 * that route was pointing to the old gateway (the source of the
2644 2647	 * redirect packet).
2645 2648 * We do longest match and then compare ire_gateway_addr below.
2646 2649 */
2647 2650 prev_ire = ire_ftable_lookup_v4(dst, 0, 0, 0, NULL, ALL_ZONES,
2648 2651 NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL);
2649 2652 /*
2650 2653 * Check that
2651 2654 * the redirect was not from ourselves
2652 2655 * the new gateway and the old gateway are directly reachable
2653 2656 */
2654 2657 if (prev_ire == NULL || ire == NULL ||
2655 2658 (prev_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) ||
2656 2659 (prev_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
2657 2660 !(ire->ire_type & IRE_IF_ALL) ||
2658 2661 prev_ire->ire_gateway_addr != src) {
2659 2662 BUMP_MIB(&ipst->ips_icmp_mib, icmpInBadRedirects);
2660 2663 ip_drop_input("icmpInBadRedirects - ire", mp, ira->ira_ill);
2661 2664 freemsg(mp);
2662 2665 if (ire != NULL)
2663 2666 ire_refrele(ire);
2664 2667 if (prev_ire != NULL)
2665 2668 ire_refrele(prev_ire);
2666 2669 return;
2667 2670 }
2668 2671
2669 2672 ire_refrele(prev_ire);
2670 2673 ire_refrele(ire);
2671 2674
2672 2675 /*
2673 2676	 * TODO: more precise handling for cases 0, 2, 3; the latter two
2674 2677 * require TOS routing
2675 2678 */
2676 2679 switch (icmph->icmph_code) {
2677 2680 case 0:
2678 2681 case 1:
2679 2682 /* TODO: TOS specificity for cases 2 and 3 */
2680 2683 case 2:
2681 2684 case 3:
2682 2685 break;
2683 2686 default:
2684 2687 BUMP_MIB(&ipst->ips_icmp_mib, icmpInBadRedirects);
2685 2688 ip_drop_input("icmpInBadRedirects - code", mp, ira->ira_ill);
2686 2689 freemsg(mp);
2687 2690 return;
2688 2691 }
2689 2692 /*
2690 2693 * Create a Route Association. This will allow us to remember that
2691 2694 * someone we believe told us to use the particular gateway.
2692 2695 */
2693 2696 ire = ire_create(
2694 2697 (uchar_t *)&dst, /* dest addr */
2695 2698 (uchar_t *)&ip_g_all_ones, /* mask */
2696 2699 (uchar_t *)&gateway, /* gateway addr */
2697 2700 IRE_HOST,
2698 2701 NULL, /* ill */
2699 2702 ALL_ZONES,
2700 2703 (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST),
2701 2704 NULL, /* tsol_gc_t */
2702 2705 ipst);
2703 2706
2704 2707 if (ire == NULL) {
2705 2708 freemsg(mp);
2706 2709 return;
2707 2710 }
2708 2711 nire = ire_add(ire);
2709 2712 /* Check if it was a duplicate entry */
2710 2713 if (nire != NULL && nire != ire) {
2711 2714 ASSERT(nire->ire_identical_ref > 1);
2712 2715 ire_delete(nire);
2713 2716 ire_refrele(nire);
2714 2717 nire = NULL;
2715 2718 }
2716 2719 ire = nire;
2717 2720 if (ire != NULL) {
2718 2721 ire_refrele(ire); /* Held in ire_add */
2719 2722
2720 2723 /* tell routing sockets that we received a redirect */
2721 2724 ip_rts_change(RTM_REDIRECT, dst, gateway, IP_HOST_MASK, 0, src,
2722 2725 (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST), 0,
2723 2726 (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_AUTHOR), ipst);
2724 2727 }
2725 2728
2726 2729 /*
2727 2730 * Delete any existing IRE_HOST type redirect ires for this destination.
2728 2731 * This together with the added IRE has the effect of
2729 2732 * modifying an existing redirect.
2730 2733 */
2731 2734 prev_ire = ire_ftable_lookup_v4(dst, 0, src, IRE_HOST, NULL,
2732 2735 ALL_ZONES, NULL, (MATCH_IRE_GW | MATCH_IRE_TYPE), 0, ipst, NULL);
2733 2736 if (prev_ire != NULL) {
2734 2737 if (prev_ire->ire_flags & RTF_DYNAMIC)
2735 2738 ire_delete(prev_ire);
2736 2739 ire_refrele(prev_ire);
2737 2740 }
2738 2741
2739 2742 freemsg(mp);
2740 2743 }
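
/*
 * A minimal user-space sketch of the acceptance tests above; the helper
 * name and boolean parameters are hypothetical, and only the final
 * comparison mirrors the kernel logic: a redirect is believed only when
 * we currently route dst via the ICMP sender and the proposed gateway
 * sits on an attached link.
 */
#include <stdbool.h>
#include <stdint.h>

typedef uint32_t v4addr_t;	/* IPv4 address, network byte order */

static bool
redirect_plausible(v4addr_t icmp_src, v4addr_t cur_gw_for_dst,
    bool new_gw_is_onlink, bool route_is_local_or_reject)
{
	if (route_is_local_or_reject)
		return (false);	/* never redirect local/reject routes */
	if (!new_gw_is_onlink)
		return (false);	/* new gateway must be directly reachable */
	return (cur_gw_for_dst == icmp_src); /* old gateway sent the redirect */
}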
2741 2744
2742 2745 /*
2743 2746 * Generate an ICMP parameter problem message.
2744 2747 * When called from ip_output side a minimal ip_recv_attr_t needs to be
2745 2748 * constructed by the caller.
2746 2749 */
2747 2750 static void
2748 2751 icmp_param_problem(mblk_t *mp, uint8_t ptr, ip_recv_attr_t *ira)
2749 2752 {
2750 2753 icmph_t icmph;
2751 2754 ip_stack_t *ipst = ira->ira_ill->ill_ipst;
2752 2755
2753 2756 mp = icmp_pkt_err_ok(mp, ira);
2754 2757 if (mp == NULL)
2755 2758 return;
2756 2759
2757 2760 bzero(&icmph, sizeof (icmph_t));
2758 2761 icmph.icmph_type = ICMP_PARAM_PROBLEM;
2759 2762 icmph.icmph_pp_ptr = ptr;
2760 2763 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutParmProbs);
2761 2764 icmp_pkt(mp, &icmph, sizeof (icmph_t), ira);
2762 2765 }
2763 2766
2764 2767 /*
2765 2768 * Build and ship an IPv4 ICMP message using the packet data in mp, and
2766 2769 * the ICMP header pointed to by "stuff". (May be called as writer.)
2767 2770 * Note: assumes that icmp_pkt_err_ok has been called to verify that
2768 2771 * an icmp error packet can be sent.
2769 2772 * Assigns an appropriate source address to the packet. If ipha_dst is
2770 2773 * one of our addresses use it for source. Otherwise let ip_output_simple
2771 2774 * pick the source address.
2772 2775 */
2773 2776 static void
2774 2777 icmp_pkt(mblk_t *mp, void *stuff, size_t len, ip_recv_attr_t *ira)
2775 2778 {
2776 2779 ipaddr_t dst;
2777 2780 icmph_t *icmph;
2778 2781 ipha_t *ipha;
2779 2782 uint_t len_needed;
2780 2783 size_t msg_len;
2781 2784 mblk_t *mp1;
2782 2785 ipaddr_t src;
2783 2786 ire_t *ire;
2784 2787 ip_xmit_attr_t ixas;
2785 2788 ip_stack_t *ipst = ira->ira_ill->ill_ipst;
2786 2789
2787 2790 ipha = (ipha_t *)mp->b_rptr;
2788 2791
2789 2792 bzero(&ixas, sizeof (ixas));
2790 2793 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
2791 2794 ixas.ixa_zoneid = ira->ira_zoneid;
2792 2795 ixas.ixa_ifindex = 0;
2793 2796 ixas.ixa_ipst = ipst;
2794 2797 ixas.ixa_cred = kcred;
2795 2798 ixas.ixa_cpid = NOPID;
2796 2799 ixas.ixa_tsl = ira->ira_tsl; /* Behave as a multi-level responder */
2797 2800 ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
2798 2801
2799 2802 if (ira->ira_flags & IRAF_IPSEC_SECURE) {
2800 2803 /*
2801 2804 * Apply IPsec based on how IPsec was applied to
2802 2805 * the packet that had the error.
2803 2806 *
2804 2807 * If it was an outbound packet that caused the ICMP
2805 2808 * error, then the caller will have setup the IRA
2806 2809 * appropriately.
2807 2810 */
2808 2811 if (!ipsec_in_to_out(ira, &ixas, mp, ipha, NULL)) {
2809 2812 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
2810 2813 /* Note: mp already consumed and ip_drop_packet done */
2811 2814 return;
2812 2815 }
2813 2816 } else {
2814 2817 /*
2815 2818 * This is in clear. The icmp message we are building
2816 2819 * here should go out in clear, independent of our policy.
2817 2820 */
2818 2821 ixas.ixa_flags |= IXAF_NO_IPSEC;
2819 2822 }
2820 2823
2821 2824 /* Remember our eventual destination */
2822 2825 dst = ipha->ipha_src;
2823 2826
2824 2827 /*
2825 2828 * If the packet was for one of our unicast addresses, make
2826 2829 * sure we respond with that as the source. Otherwise
2827 2830 * have ip_output_simple pick the source address.
2828 2831 */
2829 2832 ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, 0,
2830 2833 (IRE_LOCAL|IRE_LOOPBACK), NULL, ira->ira_zoneid, NULL,
2831 2834 MATCH_IRE_TYPE|MATCH_IRE_ZONEONLY, 0, ipst, NULL);
2832 2835 if (ire != NULL) {
2833 2836 ire_refrele(ire);
2834 2837 src = ipha->ipha_dst;
2835 2838 } else {
2836 2839 src = INADDR_ANY;
2837 2840 ixas.ixa_flags |= IXAF_SET_SOURCE;
2838 2841 }
2839 2842
2840 2843 /*
2841 2844 * Check if we can send back more than 8 bytes in addition to
2842 2845 * the IP header. We try to send 64 bytes of data and the internal
2843 2846 * header in the special cases of ipv4 encapsulated ipv4 or ipv6.
2844 2847 */
2845 2848 len_needed = IPH_HDR_LENGTH(ipha);
2846 2849 if (ipha->ipha_protocol == IPPROTO_ENCAP ||
2847 2850 ipha->ipha_protocol == IPPROTO_IPV6) {
2848 2851 if (!pullupmsg(mp, -1)) {
2849 2852 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
2850 2853 ip_drop_output("ipIfStatsOutDiscards", mp, NULL);
2851 2854 freemsg(mp);
2852 2855 return;
2853 2856 }
2854 2857 ipha = (ipha_t *)mp->b_rptr;
2855 2858
2856 2859 if (ipha->ipha_protocol == IPPROTO_ENCAP) {
2857 2860 len_needed += IPH_HDR_LENGTH(((uchar_t *)ipha +
2858 2861 len_needed));
2859 2862 } else {
2860 2863 ip6_t *ip6h = (ip6_t *)((uchar_t *)ipha + len_needed);
2861 2864
2862 2865 ASSERT(ipha->ipha_protocol == IPPROTO_IPV6);
2863 2866 len_needed += ip_hdr_length_v6(mp, ip6h);
2864 2867 }
2865 2868 }
2866 2869 len_needed += ipst->ips_ip_icmp_return;
2867 2870 msg_len = msgdsize(mp);
2868 2871 if (msg_len > len_needed) {
2869 2872 (void) adjmsg(mp, len_needed - msg_len);
2870 2873 msg_len = len_needed;
2871 2874 }
2872 2875 mp1 = allocb(sizeof (icmp_ipha) + len, BPRI_MED);
2873 2876 if (mp1 == NULL) {
2874 2877 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutErrors);
2875 2878 freemsg(mp);
2876 2879 return;
2877 2880 }
2878 2881 mp1->b_cont = mp;
2879 2882 mp = mp1;
2880 2883
2881 2884 /*
2882 2885 * Set IXAF_TRUSTED_ICMP so we can let the ICMP messages this
2883 2886 * node generates be accepted in peace by all on-host destinations.
2884 2887 * If we do NOT assume that all on-host destinations trust
2885 2888 * self-generated ICMP messages, then rework here, ip6.c, and spd.c.
2886 2889 * (Look for IXAF_TRUSTED_ICMP).
2887 2890 */
2888 2891 ixas.ixa_flags |= IXAF_TRUSTED_ICMP;
2889 2892
2890 2893 ipha = (ipha_t *)mp->b_rptr;
2891 2894 mp1->b_wptr = (uchar_t *)ipha + (sizeof (icmp_ipha) + len);
2892 2895 *ipha = icmp_ipha;
2893 2896 ipha->ipha_src = src;
2894 2897 ipha->ipha_dst = dst;
2895 2898 ipha->ipha_ttl = ipst->ips_ip_def_ttl;
2896 2899 msg_len += sizeof (icmp_ipha) + len;
2897 2900 if (msg_len > IP_MAXPACKET) {
2898 2901 (void) adjmsg(mp, IP_MAXPACKET - msg_len);
2899 2902 msg_len = IP_MAXPACKET;
2900 2903 }
2901 2904 ipha->ipha_length = htons((uint16_t)msg_len);
2902 2905 icmph = (icmph_t *)&ipha[1];
2903 2906 bcopy(stuff, icmph, len);
2904 2907 icmph->icmph_checksum = 0;
2905 2908 icmph->icmph_checksum = IP_CSUM(mp, (int32_t)sizeof (ipha_t), 0);
2906 2909 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutMsgs);
2907 2910
2908 2911 (void) ip_output_simple(mp, &ixas);
2909 2912 ixa_cleanup(&ixas);
2910 2913 }
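
/*
 * A sketch of the RFC 1071 Internet checksum that the IP_CSUM call above
 * computes over the assembled ICMP message, written for a flat buffer
 * rather than an mblk chain (function name is hypothetical).
 */
#include <stddef.h>
#include <stdint.h>

static uint16_t
rfc1071_cksum(const void *buf, size_t len)
{
	const uint8_t *p = buf;
	uint32_t sum = 0;

	while (len > 1) {
		sum += ((uint32_t)p[0] << 8) | p[1];	/* 16-bit words */
		p += 2;
		len -= 2;
	}
	if (len == 1)
		sum += (uint32_t)p[0] << 8;	/* odd byte, zero padded */
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);	/* fold carries */
	return ((uint16_t)~sum);	/* one's complement of the sum */
}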
2911 2914
2912 2915 /*
2913 2916 * Determine if an ICMP error packet can be sent given the rate limit.
2914 2917 * The limit consists of an average frequency (icmp_pkt_err_interval measured
2915 2918 * in milliseconds) and a burst size. Up to burst-size packets can
2916 2919 * be sent arbitrarily closely spaced.
2917 2920 * The state is tracked using two variables to implement an approximate
2918 2921 * token bucket filter:
2919 2922 * icmp_pkt_err_last - lbolt value when the last burst started
2920 2923 * icmp_pkt_err_sent - number of packets sent in current burst
2921 2924 */
2922 2925 boolean_t
2923 2926 icmp_err_rate_limit(ip_stack_t *ipst)
2924 2927 {
2925 2928 clock_t now = TICK_TO_MSEC(ddi_get_lbolt());
2926 2929 uint_t refilled; /* Number of packets refilled in tbf since last */
2927 2930 /* Guard against changes by loading into local variable */
2928 2931 uint_t err_interval = ipst->ips_ip_icmp_err_interval;
2929 2932
2930 2933 if (err_interval == 0)
2931 2934 return (B_FALSE);
2932 2935
2933 2936 if (ipst->ips_icmp_pkt_err_last > now) {
2934 2937 /* 100Hz lbolt in ms on a 32-bit arch wraps every 49.7 days */
2935 2938 ipst->ips_icmp_pkt_err_last = 0;
2936 2939 ipst->ips_icmp_pkt_err_sent = 0;
2937 2940 }
2938 2941 /*
2939 2942 * If we are in a burst update the token bucket filter.
2940 2943 * Update the "last" time to be close to "now" but make sure
2941 2944 * we don't loose precision.
2942 2945 */
2943 2946 if (ipst->ips_icmp_pkt_err_sent != 0) {
2944 2947 refilled = (now - ipst->ips_icmp_pkt_err_last)/err_interval;
2945 2948 if (refilled > ipst->ips_icmp_pkt_err_sent) {
2946 2949 ipst->ips_icmp_pkt_err_sent = 0;
2947 2950 } else {
2948 2951 ipst->ips_icmp_pkt_err_sent -= refilled;
2949 2952 ipst->ips_icmp_pkt_err_last += refilled * err_interval;
2950 2953 }
2951 2954 }
2952 2955 if (ipst->ips_icmp_pkt_err_sent == 0) {
2953 2956 /* Start of new burst */
2954 2957 ipst->ips_icmp_pkt_err_last = now;
2955 2958 }
2956 2959 if (ipst->ips_icmp_pkt_err_sent < ipst->ips_ip_icmp_err_burst) {
2957 2960 ipst->ips_icmp_pkt_err_sent++;
2958 2961 ip1dbg(("icmp_err_rate_limit: %d sent in burst\n",
2959 2962 ipst->ips_icmp_pkt_err_sent));
2960 2963 return (B_FALSE);
2961 2964 }
2962 2965 ip1dbg(("icmp_err_rate_limit: dropped\n"));
2963 2966 return (B_TRUE);
2964 2967 }
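
/*
 * A user-space sketch of the token bucket above (type and function names
 * are hypothetical; lbolt wraparound handling is elided).  Charged
 * packets drain back at one per interval_ms, and up to "burst" packets
 * may be sent back to back.
 */
#include <stdbool.h>
#include <stdint.h>

typedef struct {
	int64_t		last_ms;	/* when the current burst started */
	uint32_t	sent;		/* packets charged in current burst */
} tbf_t;

static bool
tbf_rate_limited(tbf_t *t, int64_t now_ms, uint32_t interval_ms,
    uint32_t burst)
{
	uint32_t refilled;

	if (interval_ms == 0)
		return (false);			/* rate limiting disabled */
	if (t->sent != 0) {
		refilled = (uint32_t)((now_ms - t->last_ms) / interval_ms);
		if (refilled > t->sent) {
			t->sent = 0;
		} else {
			t->sent -= refilled;
			t->last_ms += (int64_t)refilled * interval_ms;
		}
	}
	if (t->sent == 0)
		t->last_ms = now_ms;		/* start of a new burst */
	if (t->sent < burst) {
		t->sent++;
		return (false);			/* ok to send */
	}
	return (true);				/* bucket empty: drop */
}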
2965 2968
2966 2969 /*
2967 2970 * Check if it is ok to send an IPv4 ICMP error packet in
2968 2971 * response to the IPv4 packet in mp.
2969 2972 * Free the message and return null if no
2970 2973 * ICMP error packet should be sent.
2971 2974 */
2972 2975 static mblk_t *
2973 2976 icmp_pkt_err_ok(mblk_t *mp, ip_recv_attr_t *ira)
2974 2977 {
2975 2978 ip_stack_t *ipst = ira->ira_ill->ill_ipst;
2976 2979 icmph_t *icmph;
2977 2980 ipha_t *ipha;
2978 2981 uint_t len_needed;
2979 2982
2980 2983 if (!mp)
2981 2984 return (NULL);
2982 2985 ipha = (ipha_t *)mp->b_rptr;
2983 2986 if (ip_csum_hdr(ipha)) {
2984 2987 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInCksumErrs);
2985 2988 ip_drop_input("ipIfStatsInCksumErrs", mp, NULL);
2986 2989 freemsg(mp);
2987 2990 return (NULL);
2988 2991 }
2989 2992 if (ip_type_v4(ipha->ipha_dst, ipst) == IRE_BROADCAST ||
2990 2993 ip_type_v4(ipha->ipha_src, ipst) == IRE_BROADCAST ||
2991 2994 CLASSD(ipha->ipha_dst) ||
2992 2995 CLASSD(ipha->ipha_src) ||
2993 2996 (ntohs(ipha->ipha_fragment_offset_and_flags) & IPH_OFFSET)) {
2994 2997 /* Note: only errors to the fragment with offset 0 */
2995 2998 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
2996 2999 freemsg(mp);
2997 3000 return (NULL);
2998 3001 }
2999 3002 if (ipha->ipha_protocol == IPPROTO_ICMP) {
3000 3003 /*
3001 3004 * Check the ICMP type. RFC 1122 sez: don't send ICMP
3002 3005 * errors in response to any ICMP errors.
3003 3006 */
3004 3007 len_needed = IPH_HDR_LENGTH(ipha) + ICMPH_SIZE;
3005 3008 if (mp->b_wptr - mp->b_rptr < len_needed) {
3006 3009 if (!pullupmsg(mp, len_needed)) {
3007 3010 BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors);
3008 3011 freemsg(mp);
3009 3012 return (NULL);
3010 3013 }
3011 3014 ipha = (ipha_t *)mp->b_rptr;
3012 3015 }
3013 3016 icmph = (icmph_t *)
3014 3017 (&((char *)ipha)[IPH_HDR_LENGTH(ipha)]);
3015 3018 switch (icmph->icmph_type) {
3016 3019 case ICMP_DEST_UNREACHABLE:
3017 3020 case ICMP_SOURCE_QUENCH:
3018 3021 case ICMP_TIME_EXCEEDED:
3019 3022 case ICMP_PARAM_PROBLEM:
3020 3023 case ICMP_REDIRECT:
3021 3024 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
3022 3025 freemsg(mp);
3023 3026 return (NULL);
3024 3027 default:
3025 3028 break;
3026 3029 }
3027 3030 }
3028 3031 /*
3029 3032 * If this is a labeled system, then check to see if we're allowed to
3030 3033 * send a response to this particular sender. If not, then just drop.
3031 3034 */
3032 3035 if (is_system_labeled() && !tsol_can_reply_error(mp, ira)) {
3033 3036 ip2dbg(("icmp_pkt_err_ok: can't respond to packet\n"));
3034 3037 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
3035 3038 freemsg(mp);
3036 3039 return (NULL);
3037 3040 }
3038 3041 if (icmp_err_rate_limit(ipst)) {
3039 3042 /*
3040 3043 * Only send ICMP error packets every so often.
3041 3044 * This should be done on a per port/source basis,
3042 3045 * but for now this will suffice.
3043 3046 */
3044 3047 freemsg(mp);
3045 3048 return (NULL);
3046 3049 }
3047 3050 return (mp);
3048 3051 }
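
/*
 * A compact restatement (hypothetical helper) of the RFC 1122 test in
 * the switch above: ICMP errors are never generated about packets that
 * are themselves ICMP errors.  The values are the standard ICMP type
 * assignments.
 */
#include <stdbool.h>
#include <stdint.h>

static bool
icmp_type_is_error(uint8_t type)
{
	switch (type) {
	case 3:		/* destination unreachable */
	case 4:		/* source quench */
	case 5:		/* redirect */
	case 11:	/* time exceeded */
	case 12:	/* parameter problem */
		return (true);
	default:
		return (false);
	}
}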
3049 3052
3050 3053 /*
3051 3054 * Called when a packet was sent out the same link that it arrived on.
3052 3055 * Check if it is ok to send a redirect and then send it.
3053 3056 */
3054 3057 void
3055 3058 ip_send_potential_redirect_v4(mblk_t *mp, ipha_t *ipha, ire_t *ire,
3056 3059 ip_recv_attr_t *ira)
3057 3060 {
3058 3061 ip_stack_t *ipst = ira->ira_ill->ill_ipst;
3059 3062 ipaddr_t src, nhop;
3060 3063 mblk_t *mp1;
3061 3064 ire_t *nhop_ire;
3062 3065
3063 3066 /*
3064 3067 * Check the source address to see if it originated
3065 3068 * on the same logical subnet it is going back out on.
3066 3069 * If so, we should be able to send it a redirect.
3067 3070 * Avoid sending a redirect if the destination
3068 3071 * is directly connected (i.e., we matched an IRE_ONLINK),
3069 3072 * or if the packet was source routed out this interface.
3070 3073 *
3071 3074 * We avoid sending a redirect if the
3072 3075 * destination is directly connected
3073 3076 * because it is possible that multiple
3074 3077 * IP subnets may have been configured on
3075 3078 * the link, and the source may not
3076 3079 * be on the same subnet as the IP destination,
3077 3080 * even though they are on the same
3078 3081 * physical link.
3079 3082 */
3080 3083 if ((ire->ire_type & IRE_ONLINK) ||
3081 3084 ip_source_routed(ipha, ipst))
3082 3085 return;
3083 3086
3084 3087 nhop_ire = ire_nexthop(ire);
3085 3088 if (nhop_ire == NULL)
3086 3089 return;
3087 3090
3088 3091 nhop = nhop_ire->ire_addr;
3089 3092
3090 3093 if (nhop_ire->ire_type & IRE_IF_CLONE) {
3091 3094 ire_t *ire2;
3092 3095
3093 3096 /* Follow ire_dep_parent to find non-clone IRE_INTERFACE */
3094 3097 mutex_enter(&nhop_ire->ire_lock);
3095 3098 ire2 = nhop_ire->ire_dep_parent;
3096 3099 if (ire2 != NULL)
3097 3100 ire_refhold(ire2);
3098 3101 mutex_exit(&nhop_ire->ire_lock);
3099 3102 ire_refrele(nhop_ire);
3100 3103 nhop_ire = ire2;
3101 3104 }
3102 3105 if (nhop_ire == NULL)
3103 3106 return;
3104 3107
3105 3108 ASSERT(!(nhop_ire->ire_type & IRE_IF_CLONE));
3106 3109
3107 3110 src = ipha->ipha_src;
3108 3111
3109 3112 /*
3110 3113 * We look at the interface ire for the nexthop,
3111 3114 * to see if ipha_src is in the same subnet
3112 3115 * as the nexthop.
3113 3116 */
3114 3117 if ((src & nhop_ire->ire_mask) == (nhop & nhop_ire->ire_mask)) {
3115 3118 /*
3116 3119 * The source is directly connected.
3117 3120 */
3118 3121 mp1 = copymsg(mp);
3119 3122 if (mp1 != NULL) {
3120 3123 icmp_send_redirect(mp1, nhop, ira);
3121 3124 }
3122 3125 }
3123 3126 ire_refrele(nhop_ire);
3124 3127 }
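
/*
 * The subnet test above is masked-prefix equality.  A worked example
 * with hypothetical addresses: 192.168.1.10 and 192.168.1.1 under mask
 * 255.255.255.0 compare equal, so the sender is on-link and a redirect
 * is worth sending.
 */
#include <arpa/inet.h>
#include <stdbool.h>

static bool
same_subnet(in_addr_t a, in_addr_t b, in_addr_t mask)
{
	return ((a & mask) == (b & mask));
}

/*
 * same_subnet(inet_addr("192.168.1.10"), inet_addr("192.168.1.1"),
 *     inet_addr("255.255.255.0")) returns true.
 */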
3125 3128
3126 3129 /*
3127 3130 * Generate an ICMP redirect message.
3128 3131 */
3129 3132 static void
3130 3133 icmp_send_redirect(mblk_t *mp, ipaddr_t gateway, ip_recv_attr_t *ira)
3131 3134 {
3132 3135 icmph_t icmph;
3133 3136 ip_stack_t *ipst = ira->ira_ill->ill_ipst;
3134 3137
3135 3138 mp = icmp_pkt_err_ok(mp, ira);
3136 3139 if (mp == NULL)
3137 3140 return;
3138 3141
3139 3142 bzero(&icmph, sizeof (icmph_t));
3140 3143 icmph.icmph_type = ICMP_REDIRECT;
3141 3144 icmph.icmph_code = 1; /* redirect datagrams for the host */
3142 3145 icmph.icmph_rd_gateway = gateway;
3143 3146 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutRedirects);
3144 3147 icmp_pkt(mp, &icmph, sizeof (icmph_t), ira);
3145 3148 }
3146 3149
3147 3150 /*
3148 3151 * Generate an ICMP time exceeded message.
3149 3152 */
3150 3153 void
3151 3154 icmp_time_exceeded(mblk_t *mp, uint8_t code, ip_recv_attr_t *ira)
3152 3155 {
3153 3156 icmph_t icmph;
3154 3157 ip_stack_t *ipst = ira->ira_ill->ill_ipst;
3155 3158
3156 3159 mp = icmp_pkt_err_ok(mp, ira);
3157 3160 if (mp == NULL)
3158 3161 return;
3159 3162
3160 3163 bzero(&icmph, sizeof (icmph_t));
3161 3164 icmph.icmph_type = ICMP_TIME_EXCEEDED;
3162 3165 icmph.icmph_code = code;
3163 3166 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutTimeExcds);
3164 3167 icmp_pkt(mp, &icmph, sizeof (icmph_t), ira);
3165 3168 }
3166 3169
3167 3170 /*
3168 3171 * Generate an ICMP unreachable message.
3169 3172 * When called from ip_output side a minimal ip_recv_attr_t needs to be
3170 3173 * constructed by the caller.
3171 3174 */
3172 3175 void
3173 3176 icmp_unreachable(mblk_t *mp, uint8_t code, ip_recv_attr_t *ira)
3174 3177 {
3175 3178 icmph_t icmph;
3176 3179 ip_stack_t *ipst = ira->ira_ill->ill_ipst;
3177 3180
3178 3181 mp = icmp_pkt_err_ok(mp, ira);
3179 3182 if (mp == NULL)
3180 3183 return;
3181 3184
3182 3185 bzero(&icmph, sizeof (icmph_t));
3183 3186 icmph.icmph_type = ICMP_DEST_UNREACHABLE;
3184 3187 icmph.icmph_code = code;
3185 3188 BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDestUnreachs);
3186 3189 icmp_pkt(mp, &icmph, sizeof (icmph_t), ira);
3187 3190 }
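
/*
 * The three generators above differ only in the type/code pair and the
 * MIB counter they bump.  For reference, a sketch of the fixed 8-byte
 * ICMP header they populate (field names here are hypothetical; illumos
 * uses icmph_t):
 */
#include <stdint.h>

typedef struct {
	uint8_t		type;	/* e.g. 3 unreachable, 5 redirect, 11 time exceeded */
	uint8_t		code;	/* subcode, e.g. 1 for host unreachable */
	uint16_t	cksum;	/* RFC 1071 checksum over the whole message */
	uint32_t	rest;	/* type specific: param pointer, gateway, or unused */
} icmp_hdr_t;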
3188 3191
3189 3192 /*
3190 3193 * Latch in the IPsec state for a stream based on the policy in the listener
3191 3194 * and the actions in the ip_recv_attr_t.
3192 3195 * Called directly from TCP and SCTP.
3193 3196 */
3194 3197 boolean_t
3195 3198 ip_ipsec_policy_inherit(conn_t *connp, conn_t *lconnp, ip_recv_attr_t *ira)
3196 3199 {
3197 3200 ASSERT(lconnp->conn_policy != NULL);
3198 3201 ASSERT(connp->conn_policy == NULL);
3199 3202
3200 3203 IPPH_REFHOLD(lconnp->conn_policy);
3201 3204 connp->conn_policy = lconnp->conn_policy;
3202 3205
3203 3206 if (ira->ira_ipsec_action != NULL) {
3204 3207 if (connp->conn_latch == NULL) {
3205 3208 connp->conn_latch = iplatch_create();
3206 3209 if (connp->conn_latch == NULL)
3207 3210 return (B_FALSE);
3208 3211 }
3209 3212 ipsec_latch_inbound(connp, ira);
3210 3213 }
3211 3214 return (B_TRUE);
3212 3215 }
3213 3216
3214 3217 /*
3215 3218 * Verify whether or not the IP address is a valid local address.
3216 3219 * Could be a unicast, including one for a down interface.
3217 3220 * If allow_mcbc then a multicast or broadcast address is also
3218 3221 * acceptable.
3219 3222 *
3220 3223 * In the case of a broadcast/multicast address, however, the
3221 3224 * upper protocol is expected to reset the src address
3222 3225 * to zero when we return IPVL_MCAST/IPVL_BCAST so that
3223 3226 * no packets are emitted with a broadcast/multicast address as the
3224 3227 * source address (which would violate Host Requirements RFC 1122).
3225 3228 * The addresses valid for bind are:
3226 3229 * (1) - INADDR_ANY (0)
3227 3230 * (2) - IP address of an UP interface
3228 3231 * (3) - IP address of a DOWN interface
3229 3232 * (4) - valid local IP broadcast addresses. In this case
3230 3233 * the conn will only receive packets destined to
3231 3234 * the specified broadcast address.
3232 3235 * (5) - a multicast address. In this case
3233 3236 * the conn will only receive packets destined to
3234 3237 * the specified multicast address. Note: the
3235 3238 * application still has to issue an
3236 3239 * IP_ADD_MEMBERSHIP socket option.
3237 3240 *
3238 3241 * In all the above cases, the bound address must be valid in the current zone.
3239 3242 * When the address is loopback, multicast or broadcast, there might be many
3240 3243 * matching IREs so bind has to look up based on the zone.
3241 3244 */
3242 3245 ip_laddr_t
3243 3246 ip_laddr_verify_v4(ipaddr_t src_addr, zoneid_t zoneid,
3244 3247 ip_stack_t *ipst, boolean_t allow_mcbc)
3245 3248 {
3246 3249 ire_t *src_ire;
3247 3250
3248 3251 ASSERT(src_addr != INADDR_ANY);
3249 3252
3250 3253 src_ire = ire_ftable_lookup_v4(src_addr, 0, 0, 0,
3251 3254 NULL, zoneid, NULL, MATCH_IRE_ZONEONLY, 0, ipst, NULL);
3252 3255
3253 3256 /*
3254 3257 * If an address other than in6addr_any is requested,
3255 3258 * we verify that it is a valid address for bind
3256 3259 * Note: the following code is in if-else-if form for
3257 3260 * readability rather than one compound condition check.
3258 3261 */
3259 3262 if (src_ire != NULL && (src_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK))) {
3260 3263 /*
3261 3264 * (2) Bind to address of local UP interface
3262 3265 */
3263 3266 ire_refrele(src_ire);
3264 3267 return (IPVL_UNICAST_UP);
3265 3268 } else if (src_ire != NULL && src_ire->ire_type & IRE_BROADCAST) {
3266 3269 /*
3267 3270 * (4) Bind to broadcast address
3268 3271 */
3269 3272 ire_refrele(src_ire);
3270 3273 if (allow_mcbc)
3271 3274 return (IPVL_BCAST);
3272 3275 else
3273 3276 return (IPVL_BAD);
3274 3277 } else if (CLASSD(src_addr)) {
3275 3278 /* (5) bind to multicast address. */
3276 3279 if (src_ire != NULL)
3277 3280 ire_refrele(src_ire);
3278 3281
3279 3282 if (allow_mcbc)
3280 3283 return (IPVL_MCAST);
3281 3284 else
3282 3285 return (IPVL_BAD);
3283 3286 } else {
3284 3287 ipif_t *ipif;
3285 3288
3286 3289 /*
3287 3290 * (3) Bind to address of local DOWN interface?
3288 3291 * (ipif_lookup_addr() looks up all interfaces
3289 3292 * but we do not get here for UP interfaces
3290 3293 * - case (2) above)
3291 3294 */
3292 3295 if (src_ire != NULL)
3293 3296 ire_refrele(src_ire);
3294 3297
3295 3298 ipif = ipif_lookup_addr(src_addr, NULL, zoneid, ipst);
3296 3299 if (ipif == NULL)
3297 3300 return (IPVL_BAD);
3298 3301
3299 3302 /* Not a useful source? */
3300 3303 if (ipif->ipif_flags & (IPIF_NOLOCAL | IPIF_ANYCAST)) {
3301 3304 ipif_refrele(ipif);
3302 3305 return (IPVL_BAD);
3303 3306 }
3304 3307 ipif_refrele(ipif);
3305 3308 return (IPVL_UNICAST_DOWN);
3306 3309 }
3307 3310 }
3308 3311
3309 3312 /*
3310 3313 * Insert in the bind fanout for IPv4 and IPv6.
3311 3314 * The caller should already have used ip_laddr_verify_v*() before calling
3312 3315 * this.
3313 3316 */
3314 3317 int
3315 3318 ip_laddr_fanout_insert(conn_t *connp)
3316 3319 {
3317 3320 int error;
3318 3321
3319 3322 /*
3320 3323 * Allow setting new policies. For example, disconnects result
3321 3324 * in us being called. As we would have set conn_policy_cached
3322 3325 * to B_TRUE before, we should set it to B_FALSE, so that policy
3323 3326 * can change after the disconnect.
3324 3327 */
3325 3328 connp->conn_policy_cached = B_FALSE;
3326 3329
3327 3330 error = ipcl_bind_insert(connp);
3328 3331 if (error != 0) {
3329 3332 if (connp->conn_anon_port) {
3330 3333 (void) tsol_mlp_anon(crgetzone(connp->conn_cred),
3331 3334 connp->conn_mlp_type, connp->conn_proto,
3332 3335 ntohs(connp->conn_lport), B_FALSE);
3333 3336 }
3334 3337 connp->conn_mlp_type = mlptSingle;
3335 3338 }
3336 3339 return (error);
3337 3340 }
3338 3341
3339 3342 /*
3340 3343 * Verify that both the source and destination addresses are valid. If
3341 3344 * IPDF_VERIFY_DST is not set, then the destination address may be unreachable,
3342 3345 * i.e. have no route to it. Protocols like TCP want to verify destination
3343 3346 * reachability, while tunnels do not.
3344 3347 *
3345 3348 * Determine the route, the interface, and (optionally) the source address
3346 3349 * to use to reach a given destination.
3347 3350 * Note that we allow connect to broadcast and multicast addresses when
3348 3351 * IPDF_ALLOW_MCBC is set.
3349 3352 * first_hop and dst_addr are normally the same, but with source routing
3350 3353 * they will differ; in that case the first_hop is what we'll use for the
3351 3354 * routing lookup but the dce and label checks will be done on dst_addr.
3352 3355 *
3353 3356 * If uinfo is set, then we fill in the best available information
3354 3357 * we have for the destination. This is based on (in priority order) any
3355 3358 * metrics and path MTU stored in a dce_t, route metrics, and finally the
3356 3359 * ill_mtu/ill_mc_mtu.
3357 3360 *
3358 3361 * Tsol note: If we have a source route then dst_addr != firsthop. But we
3359 3362 * always do the label check on dst_addr.
3360 3363 */
3361 3364 int
3362 3365 ip_set_destination_v4(ipaddr_t *src_addrp, ipaddr_t dst_addr, ipaddr_t firsthop,
3363 3366 ip_xmit_attr_t *ixa, iulp_t *uinfo, uint32_t flags, uint_t mac_mode)
3364 3367 {
3365 3368 ire_t *ire = NULL;
3366 3369 int error = 0;
3367 3370 ipaddr_t setsrc; /* RTF_SETSRC */
3368 3371 zoneid_t zoneid = ixa->ixa_zoneid; /* Honors SO_ALLZONES */
3369 3372 ip_stack_t *ipst = ixa->ixa_ipst;
3370 3373 dce_t *dce;
3371 3374 uint_t pmtu;
3372 3375 uint_t generation;
3373 3376 nce_t *nce;
3374 3377 ill_t *ill = NULL;
3375 3378 boolean_t multirt = B_FALSE;
3376 3379
3377 3380 ASSERT(ixa->ixa_flags & IXAF_IS_IPV4);
3378 3381
3379 3382 /*
3380 3383 * We never send to zero; the ULPs map it to the loopback address.
3381 3384 * We can't allow it since we use zero to mean uninitialized in some
3382 3385 * places.
3383 3386 */
3384 3387 ASSERT(dst_addr != INADDR_ANY);
3385 3388
3386 3389 if (is_system_labeled()) {
3387 3390 ts_label_t *tsl = NULL;
3388 3391
3389 3392 error = tsol_check_dest(ixa->ixa_tsl, &dst_addr, IPV4_VERSION,
3390 3393 mac_mode, (flags & IPDF_ZONE_IS_GLOBAL) != 0, &tsl);
3391 3394 if (error != 0)
3392 3395 return (error);
3393 3396 if (tsl != NULL) {
3394 3397 /* Update the label */
3395 3398 ip_xmit_attr_replace_tsl(ixa, tsl);
3396 3399 }
3397 3400 }
3398 3401
3399 3402 setsrc = INADDR_ANY;
3400 3403 /*
3401 3404 * Select a route; for IPMP interfaces, we would only select
3402 3405 * a "hidden" route (i.e., going through a specific under_ill)
3403 3406 * if ixa_ifindex has been specified.
3404 3407 */
3405 3408 ire = ip_select_route_v4(firsthop, *src_addrp, ixa,
3406 3409 &generation, &setsrc, &error, &multirt);
3407 3410 ASSERT(ire != NULL); /* IRE_NOROUTE if none found */
3408 3411 if (error != 0)
3409 3412 goto bad_addr;
3410 3413
3411 3414 /*
3412 3415 * ire can't be a broadcast or multicast unless IPDF_ALLOW_MCBC is set.
3413 3416 * If IPDF_VERIFY_DST is set, the destination must be reachable;
3414 3417 * otherwise the destination needn't be reachable.
3415 3418 *
3416 3419 * If we match on a reject or black hole, then we've got a
3417 3420 * local failure. May as well fail out the connect() attempt,
3418 3421 * since it's never going to succeed.
3419 3422 */
3420 3423 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
3421 3424 /*
3422 3425 * If we're verifying destination reachability, we always want
3423 3426 * to complain here.
3424 3427 *
3425 3428 * If we're not verifying destination reachability but the
3426 3429 * destination has a route, we still want to fail on the
3427 3430 * temporary address and broadcast address tests.
3428 3431 *
3429 3432 * In both cases we let the code continue so that some reasonable
3430 3433 * information is returned to the caller. That enables the
3431 3434 * caller to use (and even cache) the IRE. conn_ip_output will
3432 3435 * use the generation mismatch path to check for the unreachable
3433 3436 * case thereby avoiding any specific check in the main path.
3434 3437 */
3435 3438 ASSERT(generation == IRE_GENERATION_VERIFY);
3436 3439 if (flags & IPDF_VERIFY_DST) {
3437 3440 /*
3438 3441 * Set errno but continue to set up ixa_ire to be
3439 3442 * the RTF_REJECT|RTF_BLACKHOLE IRE.
3440 3443 * That allows callers to use ip_output to get an
3441 3444 * ICMP error back.
3442 3445 */
3443 3446 if (!(ire->ire_type & IRE_HOST))
3444 3447 error = ENETUNREACH;
3445 3448 else
3446 3449 error = EHOSTUNREACH;
3447 3450 }
3448 3451 }
3449 3452
3450 3453 if ((ire->ire_type & (IRE_BROADCAST|IRE_MULTICAST)) &&
3451 3454 !(flags & IPDF_ALLOW_MCBC)) {
3452 3455 ire_refrele(ire);
3453 3456 ire = ire_reject(ipst, B_FALSE);
3454 3457 generation = IRE_GENERATION_VERIFY;
3455 3458 error = ENETUNREACH;
3456 3459 }
3457 3460
3458 3461 /* Cache things */
3459 3462 if (ixa->ixa_ire != NULL)
3460 3463 ire_refrele_notr(ixa->ixa_ire);
3461 3464 #ifdef DEBUG
3462 3465 ire_refhold_notr(ire);
3463 3466 ire_refrele(ire);
3464 3467 #endif
3465 3468 ixa->ixa_ire = ire;
3466 3469 ixa->ixa_ire_generation = generation;
3467 3470
3468 3471 /*
3469 3472 * Ensure that ixa_dce is always set any time that ixa_ire is set,
3470 3473 * since some callers will send a packet to conn_ip_output() even if
3471 3474 * there's an error.
3472 3475 */
3473 3476 if (flags & IPDF_UNIQUE_DCE) {
3474 3477 /* Fallback to the default dce if allocation fails */
3475 3478 dce = dce_lookup_and_add_v4(dst_addr, ipst);
3476 3479 if (dce != NULL)
3477 3480 generation = dce->dce_generation;
3478 3481 else
3479 3482 dce = dce_lookup_v4(dst_addr, ipst, &generation);
3480 3483 } else {
3481 3484 dce = dce_lookup_v4(dst_addr, ipst, &generation);
3482 3485 }
3483 3486 ASSERT(dce != NULL);
3484 3487 if (ixa->ixa_dce != NULL)
3485 3488 dce_refrele_notr(ixa->ixa_dce);
3486 3489 #ifdef DEBUG
3487 3490 dce_refhold_notr(dce);
3488 3491 dce_refrele(dce);
3489 3492 #endif
3490 3493 ixa->ixa_dce = dce;
3491 3494 ixa->ixa_dce_generation = generation;
3492 3495
3493 3496 /*
3494 3497 * For multicast with multirt we have a flag passed back from
3495 3498 * ire_lookup_multi_ill_v4 since we don't have an IRE for each
3496 3499 * possible multicast address.
3497 3500 * We also need a flag for multicast since we can't check
3498 3501 * whether RTF_MULTIRT is set in ixa_ire for multicast.
3499 3502 */
3500 3503 if (multirt) {
3501 3504 ixa->ixa_postfragfn = ip_postfrag_multirt_v4;
3502 3505 ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST;
3503 3506 } else {
3504 3507 ixa->ixa_postfragfn = ire->ire_postfragfn;
3505 3508 ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST;
3506 3509 }
3507 3510 if (!(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
3508 3511 /* Get an nce to cache. */
3509 3512 nce = ire_to_nce(ire, firsthop, NULL);
3510 3513 if (nce == NULL) {
3511 3514 /* Allocation failure? */
3512 3515 ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
3513 3516 } else {
3514 3517 if (ixa->ixa_nce != NULL)
3515 3518 nce_refrele(ixa->ixa_nce);
3516 3519 ixa->ixa_nce = nce;
3517 3520 }
3518 3521 }
3519 3522
3520 3523 /*
3521 3524 * If the source address is a loopback address, the
3522 3525 * destination had best be local or multicast.
3523 3526 * If we are sending to an IRE_LOCAL using a loopback source then
3524 3527 * it had better be the same zoneid.
3525 3528 */
3526 3529 if (*src_addrp == htonl(INADDR_LOOPBACK)) {
3527 3530 if ((ire->ire_type & IRE_LOCAL) && ire->ire_zoneid != zoneid) {
3528 3531 ire = NULL; /* Stored in ixa_ire */
3529 3532 error = EADDRNOTAVAIL;
3530 3533 goto bad_addr;
3531 3534 }
3532 3535 if (!(ire->ire_type & (IRE_LOOPBACK|IRE_LOCAL|IRE_MULTICAST))) {
3533 3536 ire = NULL; /* Stored in ixa_ire */
3534 3537 error = EADDRNOTAVAIL;
3535 3538 goto bad_addr;
3536 3539 }
3537 3540 }
3538 3541 if (ire->ire_type & IRE_BROADCAST) {
3539 3542 /*
3540 3543 * If the ULP didn't have a specified source, then we
3541 3544 * make sure we reselect the source when sending
3542 3545 * broadcasts out different interfaces.
3543 3546 */
3544 3547 if (flags & IPDF_SELECT_SRC)
3545 3548 ixa->ixa_flags |= IXAF_SET_SOURCE;
3546 3549 else
3547 3550 ixa->ixa_flags &= ~IXAF_SET_SOURCE;
3548 3551 }
3549 3552
3550 3553 /*
3551 3554 * Does the caller want us to pick a source address?
3552 3555 */
3553 3556 if (flags & IPDF_SELECT_SRC) {
3554 3557 ipaddr_t src_addr;
3555 3558
3556 3559 /*
3557 3560 * We use ire_nexthop_ill to avoid the under IPMP
3558 3561 * interface for source address selection. Note that for ipmp
3559 3562 * probe packets, ixa_ifindex would have been specified, and
3560 3563 * the ip_select_route() invocation would have picked an ire
3561 3564 * with ire_ill pointing at an under interface.
3562 3565 */
3563 3566 ill = ire_nexthop_ill(ire);
3564 3567
3565 3568 /* If unreachable we have no ill but need some source */
3566 3569 if (ill == NULL) {
3567 3570 src_addr = htonl(INADDR_LOOPBACK);
3568 3571 /* Make sure we look for a better source address */
3569 3572 generation = SRC_GENERATION_VERIFY;
3570 3573 } else {
3571 3574 error = ip_select_source_v4(ill, setsrc, dst_addr,
3572 3575 ixa->ixa_multicast_ifaddr, zoneid,
3573 3576 ipst, &src_addr, &generation, NULL);
3574 3577 if (error != 0) {
3575 3578 ire = NULL; /* Stored in ixa_ire */
3576 3579 goto bad_addr;
3577 3580 }
3578 3581 }
3579 3582
3580 3583 /*
3581 3584 * We allow the source address to be down.
3582 3585 * However, we check that we don't use the loopback address
3583 3586 * as a source when sending out on the wire.
3584 3587 */
3585 3588 if ((src_addr == htonl(INADDR_LOOPBACK)) &&
3586 3589 !(ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK|IRE_MULTICAST)) &&
3587 3590 !(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
3588 3591 ire = NULL; /* Stored in ixa_ire */
3589 3592 error = EADDRNOTAVAIL;
3590 3593 goto bad_addr;
3591 3594 }
3592 3595
3593 3596 *src_addrp = src_addr;
3594 3597 ixa->ixa_src_generation = generation;
3595 3598 }
3596 3599
3597 3600 /*
3598 3601 * Make sure we don't leave an unreachable ixa_nce in place
3599 3602 * since ip_select_route is used when we unplumb i.e., remove
3600 3603 * references on ixa_ire, ixa_nce, and ixa_dce.
3601 3604 */
3602 3605 nce = ixa->ixa_nce;
3603 3606 if (nce != NULL && nce->nce_is_condemned) {
3604 3607 nce_refrele(nce);
3605 3608 ixa->ixa_nce = NULL;
3606 3609 ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
3607 3610 }
3608 3611
3609 3612 /*
3610 3613 * The caller has set IXAF_PMTU_DISCOVERY if path MTU is desired.
3611 3614 * However, we can't do it for IPv4 multicast or broadcast.
3612 3615 */
3613 3616 if (ire->ire_type & (IRE_BROADCAST|IRE_MULTICAST))
3614 3617 ixa->ixa_flags &= ~IXAF_PMTU_DISCOVERY;
3615 3618
3616 3619 /*
3617 3620 * Set initial value for fragmentation limit. Either conn_ip_output
3618 3621 * or the ULP might update it when there are routing changes.
3619 3622 * Handles a NULL ixa_ire->ire_ill or a NULL ixa_nce for RTF_REJECT.
3620 3623 */
3621 3624 pmtu = ip_get_pmtu(ixa);
3622 3625 ixa->ixa_fragsize = pmtu;
3623 3626 /* Make sure ixa_fragsize and ixa_pmtu remain identical */
3624 3627 if (ixa->ixa_flags & IXAF_VERIFY_PMTU)
3625 3628 ixa->ixa_pmtu = pmtu;
3626 3629
3627 3630 /*
3628 3631 * Extract information useful for some transports.
3629 3632 * First we look for DCE metrics. Then we take what we have in
3630 3633 * the metrics in the route, where the offlink route is used if we
3631 3634 * have one.
3632 3635 */
3633 3636 if (uinfo != NULL) {
3634 3637 bzero(uinfo, sizeof (*uinfo));
3635 3638
3636 3639 if (dce->dce_flags & DCEF_UINFO)
3637 3640 *uinfo = dce->dce_uinfo;
3638 3641
3639 3642 rts_merge_metrics(uinfo, &ire->ire_metrics);
3640 3643
3641 3644 /* Allow ire_metrics to decrease the path MTU from above */
3642 3645 if (uinfo->iulp_mtu == 0 || uinfo->iulp_mtu > pmtu)
3643 3646 uinfo->iulp_mtu = pmtu;
3644 3647
3645 3648 uinfo->iulp_localnet = (ire->ire_type & IRE_ONLINK) != 0;
3646 3649 uinfo->iulp_loopback = (ire->ire_type & IRE_LOOPBACK) != 0;
3647 3650 uinfo->iulp_local = (ire->ire_type & IRE_LOCAL) != 0;
3648 3651 }
3649 3652
3650 3653 if (ill != NULL)
3651 3654 ill_refrele(ill);
3652 3655
3653 3656 return (error);
3654 3657
3655 3658 bad_addr:
3656 3659 if (ire != NULL)
3657 3660 ire_refrele(ire);
3658 3661
3659 3662 if (ill != NULL)
3660 3663 ill_refrele(ill);
3661 3664
3662 3665 /*
3663 3666 * Make sure we don't leave an unreachable ixa_nce in place
3664 3667 * since ip_select_route is used when we unplumb i.e., remove
3665 3668 * references on ixa_ire, ixa_nce, and ixa_dce.
3666 3669 */
3667 3670 nce = ixa->ixa_nce;
3668 3671 if (nce != NULL && nce->nce_is_condemned) {
3669 3672 nce_refrele(nce);
3670 3673 ixa->ixa_nce = NULL;
3671 3674 ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
3672 3675 }
3673 3676
3674 3677 return (error);
3675 3678 }
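
/*
 * A compact restatement (hypothetical helper, booleans standing in for
 * the IRE type checks) of the loopback source rule enforced above:
 * 127.0.0.1 may only be used as a source when the destination stays on
 * this host or is multicast.
 */
#include <arpa/inet.h>
#include <stdbool.h>

static bool
loopback_src_ok(in_addr_t src, bool dst_is_local_or_loopback,
    bool dst_is_multicast)
{
	if (src != htonl(INADDR_LOOPBACK))
		return (true);		/* rule only applies to 127.0.0.1 */
	return (dst_is_local_or_loopback || dst_is_multicast);
}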
3676 3679
3677 3680
3678 3681 /*
3679 3682 * Get the base MTU for the case when path MTU discovery is not used.
3680 3683 * Takes the MTU of the IRE into account.
3681 3684 */
3682 3685 uint_t
3683 3686 ip_get_base_mtu(ill_t *ill, ire_t *ire)
3684 3687 {
3685 3688 uint_t mtu;
3686 3689 uint_t iremtu = ire->ire_metrics.iulp_mtu;
3687 3690
3688 3691 if (ire->ire_type & (IRE_MULTICAST|IRE_BROADCAST))
3689 3692 mtu = ill->ill_mc_mtu;
3690 3693 else
3691 3694 mtu = ill->ill_mtu;
3692 3695
3693 3696 if (iremtu != 0 && iremtu < mtu)
3694 3697 mtu = iremtu;
3695 3698
3696 3699 return (mtu);
3697 3700 }
3698 3701
3699 3702 /*
3700 3703 * Get the PMTU for the attributes. Handles both IPv4 and IPv6.
3701 3704 * Assumes that ixa_ire, dce, and nce have already been set up.
3702 3705 *
3703 3706 * The caller has set IXAF_PMTU_DISCOVERY if path MTU discovery is desired.
3704 3707 * We avoid path MTU discovery if it is disabled with ndd.
3705 3708 * Furthermore, if the path MTU is too small, then we don't set DF for IPv4.
3706 3709 *
3707 3710 * NOTE: We also used to turn it off for source routed packets. That
3708 3711 * is no longer required since the dce is per final destination.
3709 3712 */
3710 3713 uint_t
3711 3714 ip_get_pmtu(ip_xmit_attr_t *ixa)
3712 3715 {
3713 3716 ip_stack_t *ipst = ixa->ixa_ipst;
3714 3717 dce_t *dce;
3715 3718 nce_t *nce;
3716 3719 ire_t *ire;
3717 3720 uint_t pmtu;
3718 3721
3719 3722 ire = ixa->ixa_ire;
3720 3723 dce = ixa->ixa_dce;
3721 3724 nce = ixa->ixa_nce;
3722 3725
3723 3726 /*
3724 3727 * If path MTU discovery has been turned off by ndd, then we ignore
3725 3728 * any dce_pmtu and for IPv4 we will not set DF.
3726 3729 */
3727 3730 if (!ipst->ips_ip_path_mtu_discovery)
3728 3731 ixa->ixa_flags &= ~IXAF_PMTU_DISCOVERY;
3729 3732
3730 3733 pmtu = IP_MAXPACKET;
3731 3734 /*
3732 3735 * Decide whether IPv4 sets DF.
3733 3736 * For IPv6 "no DF" means to use the 1280 MTU.
3734 3737 */
3735 3738 if (ixa->ixa_flags & IXAF_PMTU_DISCOVERY) {
3736 3739 ixa->ixa_flags |= IXAF_PMTU_IPV4_DF;
3737 3740 } else {
3738 3741 ixa->ixa_flags &= ~IXAF_PMTU_IPV4_DF;
3739 3742 if (!(ixa->ixa_flags & IXAF_IS_IPV4))
3740 3743 pmtu = IPV6_MIN_MTU;
3741 3744 }
3742 3745
3743 3746 /* Check if the PMTU is too old before we use it */
3744 3747 if ((dce->dce_flags & DCEF_PMTU) &&
3745 3748 TICK_TO_SEC(ddi_get_lbolt64()) - dce->dce_last_change_time >
3746 3749 ipst->ips_ip_pathmtu_interval) {
3747 3750 /*
3748 3751 * Older than 20 minutes. Drop the path MTU information.
3749 3752 */
3750 3753 mutex_enter(&dce->dce_lock);
3751 3754 dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU);
3752 3755 dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
3753 3756 mutex_exit(&dce->dce_lock);
3754 3757 dce_increment_generation(dce);
3755 3758 }
3756 3759
3757 3760 /* The metrics on the route can lower the path MTU */
3758 3761 if (ire->ire_metrics.iulp_mtu != 0 &&
3759 3762 ire->ire_metrics.iulp_mtu < pmtu)
3760 3763 pmtu = ire->ire_metrics.iulp_mtu;
3761 3764
3762 3765 /*
3763 3766 * If the path MTU is smaller than some minimum, we still use dce_pmtu
3764 3767 * above (would be 576 for IPv4 and 1280 for IPv6), but we clear
3765 3768 * IXAF_PMTU_IPV4_DF so that we avoid setting DF for IPv4.
3766 3769 */
3767 3770 if (ixa->ixa_flags & IXAF_PMTU_DISCOVERY) {
3768 3771 if (dce->dce_flags & DCEF_PMTU) {
3769 3772 if (dce->dce_pmtu < pmtu)
3770 3773 pmtu = dce->dce_pmtu;
3771 3774
3772 3775 if (dce->dce_flags & DCEF_TOO_SMALL_PMTU) {
3773 3776 ixa->ixa_flags |= IXAF_PMTU_TOO_SMALL;
3774 3777 ixa->ixa_flags &= ~IXAF_PMTU_IPV4_DF;
3775 3778 } else {
3776 3779 ixa->ixa_flags &= ~IXAF_PMTU_TOO_SMALL;
3777 3780 ixa->ixa_flags |= IXAF_PMTU_IPV4_DF;
3778 3781 }
3779 3782 } else {
3780 3783 ixa->ixa_flags &= ~IXAF_PMTU_TOO_SMALL;
3781 3784 ixa->ixa_flags |= IXAF_PMTU_IPV4_DF;
3782 3785 }
3783 3786 }
3784 3787
3785 3788 /*
3786 3789 * If we have an IRE_LOCAL we use the loopback mtu instead of
3787 3790 * the ill for going out the wire i.e., IRE_LOCAL gets the same
3788 3791 * mtu as IRE_LOOPBACK.
3789 3792 */
3790 3793 if (ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) {
3791 3794 uint_t loopback_mtu;
3792 3795
3793 3796 loopback_mtu = (ire->ire_ipversion == IPV6_VERSION) ?
3794 3797 ip_loopback_mtu_v6plus : ip_loopback_mtuplus;
3795 3798
3796 3799 if (loopback_mtu < pmtu)
3797 3800 pmtu = loopback_mtu;
3798 3801 } else if (nce != NULL) {
3799 3802 /*
3800 3803 * Make sure we don't exceed the interface MTU.
3801 3804 * In the case of RTF_REJECT or RTF_BLACKHOLE we might not have
3802 3805 * an ill. We'd use the above IP_MAXPACKET in that case just
3803 3806 * to tell the transport something larger than zero.
3804 3807 */
3805 3808 if (ire->ire_type & (IRE_MULTICAST|IRE_BROADCAST)) {
3806 3809 if (nce->nce_common->ncec_ill->ill_mc_mtu < pmtu)
3807 3810 pmtu = nce->nce_common->ncec_ill->ill_mc_mtu;
3808 3811 if (nce->nce_common->ncec_ill != nce->nce_ill &&
3809 3812 nce->nce_ill->ill_mc_mtu < pmtu) {
3810 3813 /*
3811 3814 * for interfaces in an IPMP group, the mtu of
3812 3815 * the nce_ill (under_ill) could be different
3813 3816 * from the mtu of the ncec_ill, so we take the
3814 3817 * min of the two.
3815 3818 */
3816 3819 pmtu = nce->nce_ill->ill_mc_mtu;
3817 3820 }
3818 3821 } else {
3819 3822 if (nce->nce_common->ncec_ill->ill_mtu < pmtu)
3820 3823 pmtu = nce->nce_common->ncec_ill->ill_mtu;
3821 3824 if (nce->nce_common->ncec_ill != nce->nce_ill &&
3822 3825 nce->nce_ill->ill_mtu < pmtu) {
3823 3826 /*
3824 3827 * for interfaces in an IPMP group, the mtu of
3825 3828 * the nce_ill (under_ill) could be different
3826 3829 * from the mtu of the ncec_ill, so we take the
3827 3830 * min of the two.
3828 3831 */
3829 3832 pmtu = nce->nce_ill->ill_mtu;
3830 3833 }
3831 3834 }
3832 3835 }
3833 3836
3834 3837 /*
3835 3838 * Handle the IPV6_USE_MIN_MTU socket option or ancillary data.
3836 3839 * Only applies to IPv6.
3837 3840 */
3838 3841 if (!(ixa->ixa_flags & IXAF_IS_IPV4)) {
3839 3842 if (ixa->ixa_flags & IXAF_USE_MIN_MTU) {
3840 3843 switch (ixa->ixa_use_min_mtu) {
3841 3844 case IPV6_USE_MIN_MTU_MULTICAST:
3842 3845 if (ire->ire_type & IRE_MULTICAST)
3843 3846 pmtu = IPV6_MIN_MTU;
3844 3847 break;
3845 3848 case IPV6_USE_MIN_MTU_ALWAYS:
3846 3849 pmtu = IPV6_MIN_MTU;
3847 3850 break;
3848 3851 case IPV6_USE_MIN_MTU_NEVER:
3849 3852 break;
3850 3853 }
3851 3854 } else {
3852 3855 /* Default is IPV6_USE_MIN_MTU_MULTICAST */
3853 3856 if (ire->ire_type & IRE_MULTICAST)
3854 3857 pmtu = IPV6_MIN_MTU;
3855 3858 }
3856 3859 }
3857 3860
3858 3861 /*
3859 3862 * After receiving an ICMPv6 "packet too big" message with a
3860 3863 * MTU < 1280, and for multirouted IPv6 packets, the IP layer
3861 3864 * will insert an 8-byte fragment header in every packet. We compensate
3862 3865 * for those cases by returning a smaller path MTU to the ULP.
3863 3866 *
3864 3867 * In the case of CGTP then ip_output will add a fragment header.
3865 3868 * Make sure there is room for it by telling a smaller number
3866 3869 * to the transport.
3867 3870 *
3868 3871 * When IXAF_IPV6_ADD_FRAGHDR is set we subtract the frag hdr here
3869 3872 * so the ULPs consistently see an iulp_pmtu and ip_get_pmtu()
3870 3873 * value which is the size of the packets they can send.
3871 3874 */
3872 3875 if (!(ixa->ixa_flags & IXAF_IS_IPV4)) {
3873 3876 if ((dce->dce_flags & DCEF_TOO_SMALL_PMTU) ||
3874 3877 (ire->ire_flags & RTF_MULTIRT) ||
3875 3878 (ixa->ixa_flags & IXAF_MULTIRT_MULTICAST)) {
3876 3879 pmtu -= sizeof (ip6_frag_t);
3877 3880 ixa->ixa_flags |= IXAF_IPV6_ADD_FRAGHDR;
3878 3881 }
3879 3882 }
3880 3883
3881 3884 return (pmtu);
3882 3885 }
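
/*
 * A minimal sketch of the application-level knob handled above,
 * assuming an RFC 3542 stack that defines IPV6_USE_MIN_MTU: -1 is the
 * default (minimum MTU for multicast destinations only), 0 never uses
 * it, 1 always uses the 1280-byte minimum.  Error handling elided.
 */
#include <netinet/in.h>
#include <sys/socket.h>

static void
use_min_mtu_always(int s)
{
	int v = 1;	/* always send with the IPv6 minimum MTU (1280) */

	(void) setsockopt(s, IPPROTO_IPV6, IPV6_USE_MIN_MTU, &v, sizeof (v));
}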
3883 3886
3884 3887 /*
3885 3888 * Carve "len" bytes out of an mblk chain, consuming any we empty, and duping
3886 3889 * the final piece where we don't. Return a pointer to the first mblk in the
3887 3890 * result, and update the pointer to the next mblk to chew on. If anything
3888 3891 * goes wrong (i.e., dupb fails), we waste everything in sight and return a
3889 3892 * NULL pointer.
3890 3893 */
3891 3894 mblk_t *
3892 3895 ip_carve_mp(mblk_t **mpp, ssize_t len)
3893 3896 {
3894 3897 mblk_t *mp0;
3895 3898 mblk_t *mp1;
3896 3899 mblk_t *mp2;
3897 3900
3898 3901 if (!len || !mpp || !(mp0 = *mpp))
3899 3902 return (NULL);
3900 3903 /* If we aren't going to consume the first mblk, we need a dup. */
3901 3904 if (mp0->b_wptr - mp0->b_rptr > len) {
3902 3905 mp1 = dupb(mp0);
3903 3906 if (mp1) {
3904 3907 /* Partition the data between the two mblks. */
3905 3908 mp1->b_wptr = mp1->b_rptr + len;
3906 3909 mp0->b_rptr = mp1->b_wptr;
3907 3910 /*
3908 3911 * after adjustments if mblk not consumed is now
3909 3912 * unaligned, try to align it. If this fails free
3910 3913 * all messages and let upper layer recover.
3911 3914 */
3912 3915 if (!OK_32PTR(mp0->b_rptr)) {
3913 3916 if (!pullupmsg(mp0, -1)) {
3914 3917 freemsg(mp0);
3915 3918 freemsg(mp1);
3916 3919 *mpp = NULL;
3917 3920 return (NULL);
3918 3921 }
3919 3922 }
3920 3923 }
3921 3924 return (mp1);
3922 3925 }
3923 3926 /* Eat through as many mblks as we need to get len bytes. */
3924 3927 len -= mp0->b_wptr - mp0->b_rptr;
3925 3928 for (mp2 = mp1 = mp0; (mp2 = mp2->b_cont) != 0 && len; mp1 = mp2) {
3926 3929 if (mp2->b_wptr - mp2->b_rptr > len) {
3927 3930 /*
3928 3931 * We won't consume the entire last mblk. Like
3929 3932 * above, dup and partition it.
3930 3933 */
3931 3934 mp1->b_cont = dupb(mp2);
3932 3935 mp1 = mp1->b_cont;
3933 3936 if (!mp1) {
3934 3937 /*
3935 3938 * Trouble. Rather than go to a lot of
3936 3939 * trouble to clean up, we free the messages.
3937 3940 * This won't be any worse than losing it on
3938 3941 * the wire.
3939 3942 */
3940 3943 freemsg(mp0);
3941 3944 freemsg(mp2);
3942 3945 *mpp = NULL;
3943 3946 return (NULL);
3944 3947 }
3945 3948 mp1->b_wptr = mp1->b_rptr + len;
3946 3949 mp2->b_rptr = mp1->b_wptr;
3947 3950 /*
3948 3951 * after adjustments if mblk not consumed is now
3949 3952 * unaligned, try to align it. If this fails free
3950 3953 * all messages and let upper layer recover.
3951 3954 */
3952 3955 if (!OK_32PTR(mp2->b_rptr)) {
3953 3956 if (!pullupmsg(mp2, -1)) {
3954 3957 freemsg(mp0);
3955 3958 freemsg(mp2);
3956 3959 *mpp = NULL;
3957 3960 return (NULL);
3958 3961 }
3959 3962 }
3960 3963 *mpp = mp2;
3961 3964 return (mp0);
3962 3965 }
3963 3966 /* Decrement len by the amount we just got. */
3964 3967 len -= mp2->b_wptr - mp2->b_rptr;
3965 3968 }
3966 3969 /*
3967 3970 * len should be reduced to zero now. If not our caller has
3968 3971 * screwed up.
3969 3972 */
3970 3973 if (len) {
3971 3974 /* Shouldn't happen! */
3972 3975 freemsg(mp0);
3973 3976 *mpp = NULL;
3974 3977 return (NULL);
3975 3978 }
3976 3979 /*
3977 3980 * We consumed up to exactly the end of an mblk. Detach the part
3978 3981 * we are returning from the rest of the chain.
3979 3982 */
3980 3983 mp1->b_cont = NULL;
3981 3984 *mpp = mp2;
3982 3985 return (mp0);
3983 3986 }
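
/*
 * A user-space sketch of the carving strategy above on a simple
 * singly-linked buffer chain (buf_t and buf_consume are hypothetical):
 * fully drained nodes are skipped, and a node that is only partially
 * consumed has its read pointer advanced, which is the split that
 * ip_carve_mp performs with dupb().  Dup failure and alignment fixups
 * are elided.
 */
#include <stddef.h>
#include <stdint.h>

typedef struct buf {
	uint8_t		*rptr;	/* first valid byte */
	uint8_t		*wptr;	/* one past the last valid byte */
	struct buf	*next;
} buf_t;

static size_t
buf_consume(buf_t **chain, size_t len)
{
	size_t taken = 0;
	buf_t *b = *chain;

	while (b != NULL && taken < len) {
		size_t avail = (size_t)(b->wptr - b->rptr);
		size_t want = len - taken;

		if (avail > want) {
			b->rptr += want;	/* partial node: split here */
			taken += want;
			break;
		}
		taken += avail;			/* whole node consumed */
		b = b->next;
	}
	*chain = b;				/* caller resumes here */
	return (taken);
}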
3984 3987
3985 3988 /* The ill stream is being unplumbed. Called from ip_close */
3986 3989 int
3987 3990 ip_modclose(ill_t *ill)
3988 3991 {
3989 3992 boolean_t success;
3990 3993 ipsq_t *ipsq;
3991 3994 ipif_t *ipif;
3992 3995 queue_t *q = ill->ill_rq;
3993 3996 ip_stack_t *ipst = ill->ill_ipst;
3994 3997 int i;
3995 3998 arl_ill_common_t *ai = ill->ill_common;
3996 3999
3997 4000 /*
3998 4001 * The punlink prior to this may have initiated a capability
3999 4002 * negotiation. But ipsq_enter will block until that finishes or
4000 4003 * times out.
4001 4004 */
4002 4005 success = ipsq_enter(ill, B_FALSE, NEW_OP);
4003 4006
4004 4007 /*
4005 4008 * Open/close/push/pop is guaranteed to be single threaded
4006 4009 * per stream by STREAMS. FS guarantees that all references
4007 4010 * from top are gone before close is called. So there can't
4008 4011 * be another close thread that has set CONDEMNED on this ill.
4009 4012 * and cause ipsq_enter to return failure.
4010 4013 */
4011 4014 ASSERT(success);
4012 4015 ipsq = ill->ill_phyint->phyint_ipsq;
4013 4016
4014 4017 /*
4015 4018 * Mark it condemned. No new reference will be made to this ill.
4016 4019 * Lookup functions will return an error. Threads that try to
4017 4020 * increment the refcnt must check for ILL_CAN_LOOKUP. This ensures
4018 4021 * that the refcnt will drop down to zero.
4019 4022 */
4020 4023 mutex_enter(&ill->ill_lock);
4021 4024 ill->ill_state_flags |= ILL_CONDEMNED;
4022 4025 for (ipif = ill->ill_ipif; ipif != NULL;
4023 4026 ipif = ipif->ipif_next) {
4024 4027 ipif->ipif_state_flags |= IPIF_CONDEMNED;
4025 4028 }
4026 4029 /*
4027 4030 * Wake up anybody waiting to enter the ipsq. ipsq_enter
4028 4031 * returns error if ILL_CONDEMNED is set
4029 4032 */
4030 4033 cv_broadcast(&ill->ill_cv);
4031 4034 mutex_exit(&ill->ill_lock);
4032 4035
4033 4036 /*
4034 4037 * Send all the deferred DLPI messages downstream which came in
4035 4038 * during the small window right before ipsq_enter(). We do this
4036 4039 * without waiting for the ACKs because all the ACKs for M_PROTO
4037 4040 * messages are ignored in ip_rput() when ILL_CONDEMNED is set.
4038 4041 */
4039 4042 ill_dlpi_send_deferred(ill);
4040 4043
4041 4044 /*
4042 4045 * Shut down fragmentation reassembly.
4043 4046 * ill_frag_timer won't start a timer again.
4044 4047 * Now cancel any existing timer
4045 4048 */
4046 4049 (void) untimeout(ill->ill_frag_timer_id);
4047 4050 (void) ill_frag_timeout(ill, 0);
4048 4051
4049 4052 /*
4050 4053 * Call ill_delete to bring down the ipifs, ilms and ill on
4051 4054 * this ill. Then wait for the refcnts to drop to zero.
4052 4055 * ill_is_freeable checks whether the ill is really quiescent.
4053 4056 * Then make sure that threads that are waiting to enter the
4054 4057 * ipsq have seen the error returned by ipsq_enter and have
4055 4058 * gone away. Then we call ill_delete_tail which does the
4056 4059 * DL_UNBIND_REQ with the driver and then qprocsoff.
4057 4060 */
4058 4061 ill_delete(ill);
4059 4062 mutex_enter(&ill->ill_lock);
4060 4063 while (!ill_is_freeable(ill))
4061 4064 cv_wait(&ill->ill_cv, &ill->ill_lock);
4062 4065
4063 4066 while (ill->ill_waiters)
4064 4067 cv_wait(&ill->ill_cv, &ill->ill_lock);
4065 4068
4066 4069 mutex_exit(&ill->ill_lock);
4067 4070
4068 4071 /*
4069 4072 * ill_delete_tail drops reference on ill_ipst, but we need to keep
4070 4073 * it held until the end of the function since the cleanup
4071 4074 * below needs to be able to use the ip_stack_t.
4072 4075 */
4073 4076 netstack_hold(ipst->ips_netstack);
4074 4077
4075 4078 /* qprocsoff is done via ill_delete_tail */
4076 4079 ill_delete_tail(ill);
4077 4080 /*
4078 4081 * synchronously wait for arp stream to unbind. After this, we
4079 4082 * cannot get any data packets up from the driver.
4080 4083 */
4081 4084 arp_unbind_complete(ill);
4082 4085 ASSERT(ill->ill_ipst == NULL);
4083 4086
4084 4087 /*
4085 4088 * Walk through all conns and qenable those that have queued data.
4086 4089 * Close synchronization needs this to
4087 4090 * be done to ensure that all upper layers blocked
4088 4091 * due to flow control to the closing device
4089 4092 * get unblocked.
4090 4093 */
4091 4094 ip1dbg(("ip_wsrv: walking\n"));
4092 4095 for (i = 0; i < TX_FANOUT_SIZE; i++) {
4093 4096 conn_walk_drain(ipst, &ipst->ips_idl_tx_list[i]);
4094 4097 }
4095 4098
4096 4099 /*
4097 4100 * ai can be null if this is an IPv6 ill, or if the IPv4
4098 4101 * stream is being torn down before ARP was plumbed (e.g.,
4099 4102 * /sbin/ifconfig plumbing a stream twice, and encountering
4100 4103 * an error).
4101 4104 */
4102 4105 if (ai != NULL) {
4103 4106 ASSERT(!ill->ill_isv6);
4104 4107 mutex_enter(&ai->ai_lock);
4105 4108 ai->ai_ill = NULL;
4106 4109 if (ai->ai_arl == NULL) {
4107 4110 mutex_destroy(&ai->ai_lock);
4108 4111 kmem_free(ai, sizeof (*ai));
4109 4112 } else {
4110 4113 cv_signal(&ai->ai_ill_unplumb_done);
4111 4114 mutex_exit(&ai->ai_lock);
4112 4115 }
4113 4116 }
4114 4117
4115 4118 mutex_enter(&ipst->ips_ip_mi_lock);
4116 4119 mi_close_unlink(&ipst->ips_ip_g_head, (IDP)ill);
4117 4120 mutex_exit(&ipst->ips_ip_mi_lock);
4118 4121
4119 4122 /*
4120 4123 * credp could be null if the open didn't succeed and ip_modopen
4121 4124 * itself calls ip_close.
4122 4125 */
4123 4126 if (ill->ill_credp != NULL)
4124 4127 crfree(ill->ill_credp);
4125 4128
4126 4129 mutex_destroy(&ill->ill_saved_ire_lock);
4127 4130 mutex_destroy(&ill->ill_lock);
4128 4131 rw_destroy(&ill->ill_mcast_lock);
4129 4132 mutex_destroy(&ill->ill_mcast_serializer);
4130 4133 list_destroy(&ill->ill_nce);
4131 4134
4132 4135 /*
4133 4136 * Now we are done with the module close pieces that
4134 4137 * need the netstack_t.
4135 4138 */
4136 4139 netstack_rele(ipst->ips_netstack);
4137 4140
4138 4141 mi_close_free((IDP)ill);
4139 4142 q->q_ptr = WR(q)->q_ptr = NULL;
4140 4143
4141 4144 ipsq_exit(ipsq);
4142 4145
4143 4146 return (0);
4144 4147 }
4145 4148
4146 4149 /*
4147 4150 * This is called as part of close() for IP, UDP, ICMP, and RTS
4148 4151 * in order to quiesce the conn.
4149 4152 */
4150 4153 void
4151 4154 ip_quiesce_conn(conn_t *connp)
4152 4155 {
4153 4156 boolean_t drain_cleanup_reqd = B_FALSE;
4154 4157 boolean_t conn_ioctl_cleanup_reqd = B_FALSE;
4155 4158 boolean_t ilg_cleanup_reqd = B_FALSE;
4156 4159 ip_stack_t *ipst;
4157 4160
4158 4161 ASSERT(!IPCL_IS_TCP(connp));
4159 4162 ipst = connp->conn_netstack->netstack_ip;
4160 4163
4161 4164 /*
4162 4165 * Mark the conn as closing, and this conn must not be
4163 4166 * inserted in future into any list; e.g., conn_drain_insert()
4164 4167 * won't insert this conn into the conn_drain_list.
4165 4168 *
4166 4169 * conn_idl, and conn_ilg cannot get set henceforth.
4167 4170 */
4168 4171 mutex_enter(&connp->conn_lock);
4169 4172 ASSERT(!(connp->conn_state_flags & CONN_QUIESCED));
4170 4173 connp->conn_state_flags |= CONN_CLOSING;
4171 4174 if (connp->conn_idl != NULL)
4172 4175 drain_cleanup_reqd = B_TRUE;
4173 4176 if (connp->conn_oper_pending_ill != NULL)
4174 4177 conn_ioctl_cleanup_reqd = B_TRUE;
4175 4178 if (connp->conn_dhcpinit_ill != NULL) {
4176 4179 ASSERT(connp->conn_dhcpinit_ill->ill_dhcpinit != 0);
4177 4180 atomic_dec_32(&connp->conn_dhcpinit_ill->ill_dhcpinit);
4178 4181 ill_set_inputfn(connp->conn_dhcpinit_ill);
4179 4182 connp->conn_dhcpinit_ill = NULL;
4180 4183 }
4181 4184 if (connp->conn_ilg != NULL)
4182 4185 ilg_cleanup_reqd = B_TRUE;
4183 4186 mutex_exit(&connp->conn_lock);
4184 4187
4185 4188 if (conn_ioctl_cleanup_reqd)
4186 4189 conn_ioctl_cleanup(connp);
4187 4190
4188 4191 if (is_system_labeled() && connp->conn_anon_port) {
4189 4192 (void) tsol_mlp_anon(crgetzone(connp->conn_cred),
4190 4193 connp->conn_mlp_type, connp->conn_proto,
4191 4194 ntohs(connp->conn_lport), B_FALSE);
4192 4195 connp->conn_anon_port = 0;
4193 4196 }
4194 4197 connp->conn_mlp_type = mlptSingle;
4195 4198
4196 4199 /*
4197 4200 * Remove this conn from any fanout list it is on.
4198 4201 * and then wait for any threads currently operating
4199 4202 * on this endpoint to finish
4200 4203 */
4201 4204 ipcl_hash_remove(connp);
4202 4205
4203 4206 /*
4204 4207 * Remove this conn from the drain list, and do any other cleanup that
4205 4208 * may be required. (TCP conns are never flow controlled, and
4206 4209 * conn_idl will be NULL.)
4207 4210 */
4208 4211 if (drain_cleanup_reqd && connp->conn_idl != NULL) {
4209 4212 idl_t *idl = connp->conn_idl;
4210 4213
4211 4214 mutex_enter(&idl->idl_lock);
4212 4215 conn_drain(connp, B_TRUE);
4213 4216 mutex_exit(&idl->idl_lock);
4214 4217 }
4215 4218
4216 4219 if (connp == ipst->ips_ip_g_mrouter)
4217 4220 (void) ip_mrouter_done(ipst);
4218 4221
4219 4222 if (ilg_cleanup_reqd)
4220 4223 ilg_delete_all(connp);
4221 4224
4222 4225 /*
4223 4226 * Now conn refcnt can increase only thru CONN_INC_REF_LOCKED.
4224 4227 * Callers from the write side can't be there now because close
4225 4228 * is in progress. The only other caller is ipcl_walk
4226 4229 * which checks for the condemned flag.
4227 4230 */
4228 4231 mutex_enter(&connp->conn_lock);
4229 4232 connp->conn_state_flags |= CONN_CONDEMNED;
4230 4233 while (connp->conn_ref != 1)
4231 4234 cv_wait(&connp->conn_cv, &connp->conn_lock);
4232 4235 connp->conn_state_flags |= CONN_QUIESCED;
4233 4236 mutex_exit(&connp->conn_lock);
4234 4237 }
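
/*
 * A pthreads sketch (hypothetical obj_t, standing in for conn_t) of the
 * quiesce pattern used above: condemn the object so no new references
 * are taken, then sleep on a condition variable until the count drains
 * to the caller's own reference.  Releasers would signal o->cv.
 */
#include <pthread.h>

typedef struct {
	pthread_mutex_t	lock;
	pthread_cond_t	cv;
	int		refcnt;
	int		condemned;
} obj_t;

static void
obj_quiesce(obj_t *o)
{
	(void) pthread_mutex_lock(&o->lock);
	o->condemned = 1;			/* lookups now fail */
	while (o->refcnt != 1)			/* wait out other holders */
		(void) pthread_cond_wait(&o->cv, &o->lock);
	(void) pthread_mutex_unlock(&o->lock);
}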
4235 4238
4236 4239 /* ARGSUSED */
4237 4240 int
4238 4241 ip_close(queue_t *q, int flags)
4239 4242 {
4240 4243 conn_t *connp;
4241 4244
4242 4245 /*
4243 4246 * Call the appropriate delete routine depending on whether this is
4244 4247 * a module or device.
4245 4248 */
4246 4249 if (WR(q)->q_next != NULL) {
4247 4250 /* This is a module close */
4248 4251 return (ip_modclose((ill_t *)q->q_ptr));
4249 4252 }
4250 4253
4251 4254 connp = q->q_ptr;
4252 4255 ip_quiesce_conn(connp);
4253 4256
4254 4257 qprocsoff(q);
4255 4258
4256 4259 /*
4257 4260 * Now we are truly single threaded on this stream, and can
4258 4261 * delete the things hanging off the connp, and finally the connp.
4259 4262 * We removed this connp from the fanout list, it cannot be
4260 4263 * accessed thru the fanouts, and we already waited for the
4261 4264 * conn_ref to drop to 0. We are already in close, so
4262 4265 * there cannot be any other thread from the top. qprocsoff
4263 4266 * has completed, and service has completed or won't run in
4264 4267 * future.
4265 4268 */
4266 4269 ASSERT(connp->conn_ref == 1);
4267 4270
4268 4271 inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
4269 4272
4270 4273 connp->conn_ref--;
4271 4274 ipcl_conn_destroy(connp);
4272 4275
4273 4276 q->q_ptr = WR(q)->q_ptr = NULL;
4274 4277 return (0);
4275 4278 }
4276 4279
4277 4280 /*
4278 4281  * Wrapper around putnext() so that ip_rts_request can merely use
4279 4282 * conn_recv.
4280 4283 */
4281 4284 /*ARGSUSED2*/
4282 4285 static void
4283 4286 ip_conn_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
4284 4287 {
4285 4288 conn_t *connp = (conn_t *)arg1;
4286 4289
4287 4290 putnext(connp->conn_rq, mp);
4288 4291 }
4289 4292
4290 4293 /* Dummy in case ICMP error delivery is attempted to a /dev/ip instance */
4291 4294 /* ARGSUSED */
4292 4295 static void
4293 4296 ip_conn_input_icmp(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
4294 4297 {
4295 4298 freemsg(mp);
4296 4299 }
4297 4300
4298 4301 /*
4299 4302 * Called when the module is about to be unloaded
4300 4303 */
4301 4304 void
4302 4305 ip_ddi_destroy(void)
4303 4306 {
4304 4307 /* This needs to be called before destroying any transports. */
4305 4308 mutex_enter(&cpu_lock);
4306 4309 unregister_cpu_setup_func(ip_tp_cpu_update, NULL);
4307 4310 mutex_exit(&cpu_lock);
4308 4311
4309 4312 tnet_fini();
4310 4313
4311 4314 icmp_ddi_g_destroy();
4312 4315 rts_ddi_g_destroy();
4313 4316 udp_ddi_g_destroy();
4317 + dccp_ddi_g_destroy();
4314 4318 sctp_ddi_g_destroy();
4315 4319 tcp_ddi_g_destroy();
4316 4320 ilb_ddi_g_destroy();
4317 4321 dce_g_destroy();
4318 4322 ipsec_policy_g_destroy();
4319 4323 ipcl_g_destroy();
4320 4324 ip_net_g_destroy();
4321 4325 ip_ire_g_fini();
4322 4326 inet_minor_destroy(ip_minor_arena_sa);
4323 4327 #if defined(_LP64)
4324 4328 inet_minor_destroy(ip_minor_arena_la);
4325 4329 #endif
4326 4330
4327 4331 #ifdef DEBUG
4328 4332 list_destroy(&ip_thread_list);
4329 4333 rw_destroy(&ip_thread_rwlock);
4330 4334 tsd_destroy(&ip_thread_data);
4331 4335 #endif
4332 4336
4333 4337 netstack_unregister(NS_IP);
4334 4338 }
4335 4339
4336 4340 /*
4337 4341 * First step in cleanup.
4338 4342 */
4339 4343 /* ARGSUSED */
4340 4344 static void
4341 4345 ip_stack_shutdown(netstackid_t stackid, void *arg)
4342 4346 {
4343 4347 ip_stack_t *ipst = (ip_stack_t *)arg;
4344 4348
4345 4349 #ifdef NS_DEBUG
4346 4350 printf("ip_stack_shutdown(%p, stack %d)\n", (void *)ipst, stackid);
4347 4351 #endif
4348 4352
4349 4353 /*
4350 4354 * Perform cleanup for special interfaces (loopback and IPMP).
4351 4355 */
4352 4356 ip_interface_cleanup(ipst);
4353 4357
4354 4358 /*
4355 4359 * The *_hook_shutdown()s start the process of notifying any
4356 4360 	 * consumers that things are going away; nothing is destroyed yet.
4357 4361 */
4358 4362 ipv4_hook_shutdown(ipst);
4359 4363 ipv6_hook_shutdown(ipst);
4360 4364 arp_hook_shutdown(ipst);
4361 4365
4362 4366 mutex_enter(&ipst->ips_capab_taskq_lock);
4363 4367 ipst->ips_capab_taskq_quit = B_TRUE;
4364 4368 cv_signal(&ipst->ips_capab_taskq_cv);
4365 4369 mutex_exit(&ipst->ips_capab_taskq_lock);
4366 4370 }
4367 4371
4368 4372 /*
4369 4373 * Free the IP stack instance.
4370 4374 */
4371 4375 static void
4372 4376 ip_stack_fini(netstackid_t stackid, void *arg)
4373 4377 {
4374 4378 ip_stack_t *ipst = (ip_stack_t *)arg;
4375 4379 int ret;
4376 4380
4377 4381 #ifdef NS_DEBUG
4378 4382 printf("ip_stack_fini(%p, stack %d)\n", (void *)ipst, stackid);
4379 4383 #endif
4380 4384 /*
4381 4385 * At this point, all of the notifications that the events and
4382 4386 * protocols are going away have been run, meaning that we can
4383 4387 * now set about starting to clean things up.
4384 4388 */
4385 4389 ipobs_fini(ipst);
4386 4390 ipv4_hook_destroy(ipst);
4387 4391 ipv6_hook_destroy(ipst);
4388 4392 arp_hook_destroy(ipst);
4389 4393 ip_net_destroy(ipst);
4390 4394
4391 4395 ipmp_destroy(ipst);
4392 4396
4393 4397 ip_kstat_fini(stackid, ipst->ips_ip_mibkp);
4394 4398 ipst->ips_ip_mibkp = NULL;
4395 4399 icmp_kstat_fini(stackid, ipst->ips_icmp_mibkp);
4396 4400 ipst->ips_icmp_mibkp = NULL;
4397 4401 ip_kstat2_fini(stackid, ipst->ips_ip_kstat);
4398 4402 ipst->ips_ip_kstat = NULL;
4399 4403 bzero(&ipst->ips_ip_statistics, sizeof (ipst->ips_ip_statistics));
4400 4404 ip6_kstat_fini(stackid, ipst->ips_ip6_kstat);
4401 4405 ipst->ips_ip6_kstat = NULL;
4402 4406 bzero(&ipst->ips_ip6_statistics, sizeof (ipst->ips_ip6_statistics));
4403 4407
4404 4408 kmem_free(ipst->ips_propinfo_tbl,
4405 4409 ip_propinfo_count * sizeof (mod_prop_info_t));
4406 4410 ipst->ips_propinfo_tbl = NULL;
4407 4411
4408 4412 dce_stack_destroy(ipst);
4409 4413 ip_mrouter_stack_destroy(ipst);
4410 4414
4411 4415 ret = untimeout(ipst->ips_igmp_timeout_id);
4412 4416 if (ret == -1) {
4413 4417 ASSERT(ipst->ips_igmp_timeout_id == 0);
4414 4418 } else {
4415 4419 ASSERT(ipst->ips_igmp_timeout_id != 0);
4416 4420 ipst->ips_igmp_timeout_id = 0;
4417 4421 }
4418 4422 ret = untimeout(ipst->ips_igmp_slowtimeout_id);
4419 4423 if (ret == -1) {
4420 4424 ASSERT(ipst->ips_igmp_slowtimeout_id == 0);
4421 4425 } else {
4422 4426 ASSERT(ipst->ips_igmp_slowtimeout_id != 0);
4423 4427 ipst->ips_igmp_slowtimeout_id = 0;
4424 4428 }
4425 4429 ret = untimeout(ipst->ips_mld_timeout_id);
4426 4430 if (ret == -1) {
4427 4431 ASSERT(ipst->ips_mld_timeout_id == 0);
4428 4432 } else {
4429 4433 ASSERT(ipst->ips_mld_timeout_id != 0);
4430 4434 ipst->ips_mld_timeout_id = 0;
4431 4435 }
4432 4436 ret = untimeout(ipst->ips_mld_slowtimeout_id);
4433 4437 if (ret == -1) {
4434 4438 ASSERT(ipst->ips_mld_slowtimeout_id == 0);
4435 4439 } else {
4436 4440 ASSERT(ipst->ips_mld_slowtimeout_id != 0);
4437 4441 ipst->ips_mld_slowtimeout_id = 0;
4438 4442 }
4439 4443
4440 4444 ip_ire_fini(ipst);
4441 4445 ip6_asp_free(ipst);
4442 4446 conn_drain_fini(ipst);
4443 4447 ipcl_destroy(ipst);
4444 4448
4445 4449 mutex_destroy(&ipst->ips_ndp4->ndp_g_lock);
4446 4450 mutex_destroy(&ipst->ips_ndp6->ndp_g_lock);
4447 4451 kmem_free(ipst->ips_ndp4, sizeof (ndp_g_t));
4448 4452 ipst->ips_ndp4 = NULL;
4449 4453 kmem_free(ipst->ips_ndp6, sizeof (ndp_g_t));
4450 4454 ipst->ips_ndp6 = NULL;
4451 4455
4452 4456 if (ipst->ips_loopback_ksp != NULL) {
4453 4457 kstat_delete_netstack(ipst->ips_loopback_ksp, stackid);
4454 4458 ipst->ips_loopback_ksp = NULL;
4455 4459 }
4456 4460
4457 4461 mutex_destroy(&ipst->ips_capab_taskq_lock);
4458 4462 cv_destroy(&ipst->ips_capab_taskq_cv);
4459 4463
4460 4464 rw_destroy(&ipst->ips_srcid_lock);
4461 4465
4462 4466 mutex_destroy(&ipst->ips_ip_mi_lock);
4463 4467 rw_destroy(&ipst->ips_ill_g_usesrc_lock);
4464 4468
4465 4469 mutex_destroy(&ipst->ips_igmp_timer_lock);
4466 4470 mutex_destroy(&ipst->ips_mld_timer_lock);
4467 4471 mutex_destroy(&ipst->ips_igmp_slowtimeout_lock);
4468 4472 mutex_destroy(&ipst->ips_mld_slowtimeout_lock);
4469 4473 mutex_destroy(&ipst->ips_ip_addr_avail_lock);
4470 4474 rw_destroy(&ipst->ips_ill_g_lock);
4471 4475
4472 4476 kmem_free(ipst->ips_phyint_g_list, sizeof (phyint_list_t));
4473 4477 ipst->ips_phyint_g_list = NULL;
4474 4478 kmem_free(ipst->ips_ill_g_heads, sizeof (ill_g_head_t) * MAX_G_HEADS);
4475 4479 ipst->ips_ill_g_heads = NULL;
4476 4480
4477 4481 ldi_ident_release(ipst->ips_ldi_ident);
4478 4482 kmem_free(ipst, sizeof (*ipst));
4479 4483 }
4480 4484
4481 4485 /*
4482 4486 * This function is called from the TSD destructor, and is used to debug
4483 4487 * reference count issues in IP. See block comment in <inet/ip_if.h> for
4484 4488 * details.
4485 4489 */
4486 4490 static void
4487 4491 ip_thread_exit(void *phash)
4488 4492 {
4489 4493 th_hash_t *thh = phash;
4490 4494
4491 4495 rw_enter(&ip_thread_rwlock, RW_WRITER);
4492 4496 list_remove(&ip_thread_list, thh);
4493 4497 rw_exit(&ip_thread_rwlock);
4494 4498 mod_hash_destroy_hash(thh->thh_hash);
4495 4499 kmem_free(thh, sizeof (*thh));
4496 4500 }
4497 4501
4498 4502 /*
4499 4503 * Called when the IP kernel module is loaded into the kernel
4500 4504 */
4501 4505 void
4502 4506 ip_ddi_init(void)
4503 4507 {
4504 4508 ip_squeue_flag = ip_squeue_switch(ip_squeue_enter);
4505 4509
4506 4510 /*
4507 4511 * For IP and TCP the minor numbers should start from 2 since we have 4
4508 4512 * initial devices: ip, ip6, tcp, tcp6.
4509 4513 */
4510 4514 /*
4511 4515 * If this is a 64-bit kernel, then create two separate arenas -
4512 4516 	 * one for TLIs in the range of INET_MIN_DEV+2 through 2^18-1, and the
4513 4517 	 * other for socket apps in the range 2^18 through 2^32-1.
4514 4518 */
4515 4519 ip_minor_arena_la = NULL;
4516 4520 ip_minor_arena_sa = NULL;
4517 4521 #if defined(_LP64)
4518 4522 if ((ip_minor_arena_sa = inet_minor_create("ip_minor_arena_sa",
4519 4523 INET_MIN_DEV + 2, MAXMIN32, KM_SLEEP)) == NULL) {
4520 4524 cmn_err(CE_PANIC,
4521 4525 "ip_ddi_init: ip_minor_arena_sa creation failed\n");
4522 4526 }
4523 4527 if ((ip_minor_arena_la = inet_minor_create("ip_minor_arena_la",
4524 4528 MAXMIN32 + 1, MAXMIN64, KM_SLEEP)) == NULL) {
4525 4529 cmn_err(CE_PANIC,
4526 4530 "ip_ddi_init: ip_minor_arena_la creation failed\n");
4527 4531 }
4528 4532 #else
4529 4533 if ((ip_minor_arena_sa = inet_minor_create("ip_minor_arena_sa",
4530 4534 INET_MIN_DEV + 2, MAXMIN, KM_SLEEP)) == NULL) {
4531 4535 cmn_err(CE_PANIC,
4532 4536 "ip_ddi_init: ip_minor_arena_sa creation failed\n");
4533 4537 }
4534 4538 #endif
4535 4539 ip_poll_normal_ticks = MSEC_TO_TICK_ROUNDUP(ip_poll_normal_ms);
4536 4540
4537 4541 ipcl_g_init();
4538 4542 ip_ire_g_init();
4539 4543 ip_net_g_init();
4540 4544
4541 4545 #ifdef DEBUG
4542 4546 tsd_create(&ip_thread_data, ip_thread_exit);
4543 4547 rw_init(&ip_thread_rwlock, NULL, RW_DEFAULT, NULL);
4544 4548 list_create(&ip_thread_list, sizeof (th_hash_t),
4545 4549 offsetof(th_hash_t, thh_link));
4546 4550 #endif
4547 4551 ipsec_policy_g_init();
4548 4552 tcp_ddi_g_init();
4549 4553 sctp_ddi_g_init();
4554 + dccp_ddi_g_init();
4550 4555 dce_g_init();
4551 4556
4552 4557 /*
4553 4558 * We want to be informed each time a stack is created or
4554 4559 * destroyed in the kernel, so we can maintain the
4555 4560 	 * set of ip_stack_t's.
4556 4561 */
4557 4562 netstack_register(NS_IP, ip_stack_init, ip_stack_shutdown,
4558 4563 ip_stack_fini);
4559 4564
4560 4565 tnet_init();
4561 4566
4562 4567 udp_ddi_g_init();
4563 4568 rts_ddi_g_init();
4564 4569 icmp_ddi_g_init();
4565 4570 ilb_ddi_g_init();
4566 4571
4567 4572 /* This needs to be called after all transports are initialized. */
4568 4573 mutex_enter(&cpu_lock);
4569 4574 register_cpu_setup_func(ip_tp_cpu_update, NULL);
4570 4575 mutex_exit(&cpu_lock);
4571 4576 }
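
The two new calls in this change, dccp_ddi_g_init() above and dccp_ddi_g_destroy() in ip_ddi_destroy(), hook the DCCP module template into the same global init/fini sequence the other transports use. Their bodies are not part of this file; one plausible skeleton, modeled on how the other transports tie into the netstack framework — the NS_DCCP constant and the dccp_stack_* callbacks are assumptions for illustration only:

/* sketch only: NS_DCCP and the dccp_stack_* callbacks are assumed names */
void
dccp_ddi_g_init(void)
{
	/*
	 * Register with the netstack framework so a per-stack DCCP
	 * instance is created and destroyed alongside each IP instance.
	 */
	netstack_register(NS_DCCP, dccp_stack_init, dccp_stack_shutdown,
	    dccp_stack_fini);
}

void
dccp_ddi_g_destroy(void)
{
	/* All per-stack instances have been torn down by this point. */
	netstack_unregister(NS_DCCP);
}
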
4572 4577
4573 4578 /*
4574 4579 * Initialize the IP stack instance.
4575 4580 */
4576 4581 static void *
4577 4582 ip_stack_init(netstackid_t stackid, netstack_t *ns)
4578 4583 {
4579 4584 ip_stack_t *ipst;
4580 4585 size_t arrsz;
4581 4586 major_t major;
4582 4587
4583 4588 #ifdef NS_DEBUG
4584 4589 printf("ip_stack_init(stack %d)\n", stackid);
4585 4590 #endif
4586 4591
4587 4592 ipst = (ip_stack_t *)kmem_zalloc(sizeof (*ipst), KM_SLEEP);
4588 4593 ipst->ips_netstack = ns;
4589 4594
4590 4595 ipst->ips_ill_g_heads = kmem_zalloc(sizeof (ill_g_head_t) * MAX_G_HEADS,
4591 4596 KM_SLEEP);
4592 4597 ipst->ips_phyint_g_list = kmem_zalloc(sizeof (phyint_list_t),
4593 4598 KM_SLEEP);
4594 4599 ipst->ips_ndp4 = kmem_zalloc(sizeof (ndp_g_t), KM_SLEEP);
4595 4600 ipst->ips_ndp6 = kmem_zalloc(sizeof (ndp_g_t), KM_SLEEP);
4596 4601 mutex_init(&ipst->ips_ndp4->ndp_g_lock, NULL, MUTEX_DEFAULT, NULL);
4597 4602 mutex_init(&ipst->ips_ndp6->ndp_g_lock, NULL, MUTEX_DEFAULT, NULL);
4598 4603
4599 4604 mutex_init(&ipst->ips_igmp_timer_lock, NULL, MUTEX_DEFAULT, NULL);
4600 4605 ipst->ips_igmp_deferred_next = INFINITY;
4601 4606 mutex_init(&ipst->ips_mld_timer_lock, NULL, MUTEX_DEFAULT, NULL);
4602 4607 ipst->ips_mld_deferred_next = INFINITY;
4603 4608 mutex_init(&ipst->ips_igmp_slowtimeout_lock, NULL, MUTEX_DEFAULT, NULL);
4604 4609 mutex_init(&ipst->ips_mld_slowtimeout_lock, NULL, MUTEX_DEFAULT, NULL);
4605 4610 mutex_init(&ipst->ips_ip_mi_lock, NULL, MUTEX_DEFAULT, NULL);
4606 4611 mutex_init(&ipst->ips_ip_addr_avail_lock, NULL, MUTEX_DEFAULT, NULL);
4607 4612 rw_init(&ipst->ips_ill_g_lock, NULL, RW_DEFAULT, NULL);
4608 4613 rw_init(&ipst->ips_ill_g_usesrc_lock, NULL, RW_DEFAULT, NULL);
4609 4614
4610 4615 ipcl_init(ipst);
4611 4616 ip_ire_init(ipst);
4612 4617 ip6_asp_init(ipst);
4613 4618 ipif_init(ipst);
4614 4619 conn_drain_init(ipst);
4615 4620 ip_mrouter_stack_init(ipst);
4616 4621 dce_stack_init(ipst);
4617 4622
4618 4623 ipst->ips_ip_multirt_log_interval = 1000;
4619 4624
4620 4625 ipst->ips_ill_index = 1;
4621 4626
4622 4627 ipst->ips_saved_ip_forwarding = -1;
4623 4628 ipst->ips_reg_vif_num = ALL_VIFS; /* Index to Register vif */
4624 4629
4625 4630 arrsz = ip_propinfo_count * sizeof (mod_prop_info_t);
4626 4631 ipst->ips_propinfo_tbl = (mod_prop_info_t *)kmem_alloc(arrsz, KM_SLEEP);
4627 4632 bcopy(ip_propinfo_tbl, ipst->ips_propinfo_tbl, arrsz);
4628 4633
4629 4634 ipst->ips_ip_mibkp = ip_kstat_init(stackid, ipst);
4630 4635 ipst->ips_icmp_mibkp = icmp_kstat_init(stackid);
4631 4636 ipst->ips_ip_kstat = ip_kstat2_init(stackid, &ipst->ips_ip_statistics);
4632 4637 ipst->ips_ip6_kstat =
4633 4638 ip6_kstat_init(stackid, &ipst->ips_ip6_statistics);
4634 4639
4635 4640 ipst->ips_ip_src_id = 1;
4636 4641 rw_init(&ipst->ips_srcid_lock, NULL, RW_DEFAULT, NULL);
4637 4642
4638 4643 ipst->ips_src_generation = SRC_GENERATION_INITIAL;
4639 4644
4640 4645 ip_net_init(ipst, ns);
4641 4646 ipv4_hook_init(ipst);
4642 4647 ipv6_hook_init(ipst);
4643 4648 arp_hook_init(ipst);
4644 4649 ipmp_init(ipst);
4645 4650 ipobs_init(ipst);
4646 4651
4647 4652 /*
4648 4653 * Create the taskq dispatcher thread and initialize related stuff.
4649 4654 */
4650 4655 mutex_init(&ipst->ips_capab_taskq_lock, NULL, MUTEX_DEFAULT, NULL);
4651 4656 cv_init(&ipst->ips_capab_taskq_cv, NULL, CV_DEFAULT, NULL);
4652 4657 ipst->ips_capab_taskq_thread = thread_create(NULL, 0,
4653 4658 ill_taskq_dispatch, ipst, 0, &p0, TS_RUN, minclsyspri);
4654 4659
4655 4660 major = mod_name_to_major(INET_NAME);
4656 4661 (void) ldi_ident_from_major(major, &ipst->ips_ldi_ident);
4657 4662 return (ipst);
4658 4663 }
4659 4664
4660 4665 /*
4661 4666 * Allocate and initialize a DLPI template of the specified length. (May be
4662 4667 * called as writer.)
4663 4668 */
4664 4669 mblk_t *
4665 4670 ip_dlpi_alloc(size_t len, t_uscalar_t prim)
4666 4671 {
4667 4672 mblk_t *mp;
4668 4673
4669 4674 mp = allocb(len, BPRI_MED);
4670 4675 if (!mp)
4671 4676 return (NULL);
4672 4677
4673 4678 /*
4674 4679 * DLPIv2 says that DL_INFO_REQ and DL_TOKEN_REQ (the latter
4675 4680 * of which we don't seem to use) are sent with M_PCPROTO, and
4676 4681  * that other DLPI messages are M_PROTO.
4677 4682 */
4678 4683 if (prim == DL_INFO_REQ) {
4679 4684 mp->b_datap->db_type = M_PCPROTO;
4680 4685 } else {
4681 4686 mp->b_datap->db_type = M_PROTO;
4682 4687 }
4683 4688
4684 4689 mp->b_wptr = mp->b_rptr + len;
4685 4690 bzero(mp->b_rptr, len);
4686 4691 ((dl_unitdata_req_t *)mp->b_rptr)->dl_primitive = prim;
4687 4692 return (mp);
4688 4693 }
4689 4694
4690 4695 /*
4691 4696 * Allocate and initialize a DLPI notification. (May be called as writer.)
4692 4697 */
4693 4698 mblk_t *
4694 4699 ip_dlnotify_alloc(uint_t notification, uint_t data)
4695 4700 {
4696 4701 dl_notify_ind_t *notifyp;
4697 4702 mblk_t *mp;
4698 4703
4699 4704 if ((mp = ip_dlpi_alloc(DL_NOTIFY_IND_SIZE, DL_NOTIFY_IND)) == NULL)
4700 4705 return (NULL);
4701 4706
4702 4707 notifyp = (dl_notify_ind_t *)mp->b_rptr;
4703 4708 notifyp->dl_notification = notification;
4704 4709 notifyp->dl_data = data;
4705 4710 return (mp);
4706 4711 }
4707 4712
4708 4713 mblk_t *
4709 4714 ip_dlnotify_alloc2(uint_t notification, uint_t data1, uint_t data2)
4710 4715 {
4711 4716 dl_notify_ind_t *notifyp;
4712 4717 mblk_t *mp;
4713 4718
4714 4719 if ((mp = ip_dlpi_alloc(DL_NOTIFY_IND_SIZE, DL_NOTIFY_IND)) == NULL)
4715 4720 return (NULL);
4716 4721
4717 4722 notifyp = (dl_notify_ind_t *)mp->b_rptr;
4718 4723 notifyp->dl_notification = notification;
4719 4724 notifyp->dl_data1 = data1;
4720 4725 notifyp->dl_data2 = data2;
4721 4726 return (mp);
4722 4727 }
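
A brief usage sketch for the allocators above. DL_NOTE_SPEED is a standard DLPI notification whose dl_data field carries the link speed in kilobits per second; the surrounding context here is assumed:

	mblk_t	*mp;

	/* Fabricate a DL_NOTE_SPEED indication for a 1 Gb/s link. */
	if ((mp = ip_dlnotify_alloc(DL_NOTE_SPEED, 1000000)) == NULL)
		return;		/* allocation failed */

	/* mp is an M_PROTO mblk holding a dl_notify_ind_t, ready to send. */
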
4723 4728
4724 4729 /*
4725 4730 * Debug formatting routine. Returns a character string representation of the
4726 4731 * addr in buf, of the form xxx.xxx.xxx.xxx. This routine takes the address
4727 4732  * in the form of an ipaddr_t and formats it directly with mi_sprintf.
4728 4733 *
4729 4734 * Once the ndd table-printing interfaces are removed, this can be changed to
4730 4735 * standard dotted-decimal form.
4731 4736 */
4732 4737 char *
4733 4738 ip_dot_addr(ipaddr_t addr, char *buf)
4734 4739 {
4735 4740 uint8_t *ap = (uint8_t *)&addr;
4736 4741
4737 4742 (void) mi_sprintf(buf, "%03d.%03d.%03d.%03d",
4738 4743 ap[0] & 0xFF, ap[1] & 0xFF, ap[2] & 0xFF, ap[3] & 0xFF);
4739 4744 return (buf);
4740 4745 }
4741 4746
4742 4747 /*
4743 4748 * Write the given MAC address as a printable string in the usual colon-
4744 4749 * separated format.
4745 4750 */
4746 4751 const char *
4747 4752 mac_colon_addr(const uint8_t *addr, size_t alen, char *buf, size_t buflen)
4748 4753 {
4749 4754 char *bp;
4750 4755
4751 4756 if (alen == 0 || buflen < 4)
4752 4757 return ("?");
4753 4758 bp = buf;
4754 4759 for (;;) {
4755 4760 /*
4756 4761 * If there are more MAC address bytes available, but we won't
4757 4762 * have any room to print them, then add "..." to the string
4758 4763 * instead. See below for the 'magic number' explanation.
4759 4764 */
4760 4765 if ((alen == 2 && buflen < 6) || (alen > 2 && buflen < 7)) {
4761 4766 (void) strcpy(bp, "...");
4762 4767 break;
4763 4768 }
4764 4769 (void) sprintf(bp, "%02x", *addr++);
4765 4770 bp += 2;
4766 4771 if (--alen == 0)
4767 4772 break;
4768 4773 *bp++ = ':';
4769 4774 buflen -= 3;
4770 4775 /*
4771 4776 * At this point, based on the first 'if' statement above,
4772 4777 * either alen == 1 and buflen >= 3, or alen > 1 and
4773 4778 * buflen >= 4. The first case leaves room for the final "xx"
4774 4779 * number and trailing NUL byte. The second leaves room for at
4775 4780 * least "...". Thus the apparently 'magic' numbers chosen for
4776 4781 * that statement.
4777 4782 */
4778 4783 }
4779 4784 return (buf);
4780 4785 }
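
A small demonstration of the buffer sizing the 'magic number' comments describe, treating mac_colon_addr() as an ordinary linkable function (the address and sizes are illustrative):

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint8_t	mac[6] = { 0x00, 0x14, 0x4f, 0x01, 0x02, 0x03 };
	char	buf[18];	/* six "xx:" groups; the last ':' becomes NUL */
	char	small[9];	/* only enough room for "00:..." */

	/* prints 00:14:4f:01:02:03 */
	(void) printf("%s\n",
	    mac_colon_addr(mac, sizeof (mac), buf, sizeof (buf)));
	/* prints 00:... since the remaining bytes no longer fit */
	(void) printf("%s\n",
	    mac_colon_addr(mac, sizeof (mac), small, sizeof (small)));
	return (0);
}
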
4781 4786
4782 4787 /*
4783 4788  * Called when it is conceptually a ULP that would have sent the packet,
4784 4789  * e.g., port unreachable and protocol unreachable.
4785 4790 * would have passed the IPsec global policy before sending the error.
4786 4791 *
4787 4792 * Send an ICMP error after patching up the packet appropriately.
4788 4793 * Uses ip_drop_input and bumps the appropriate MIB.
4789 4794 */
4790 4795 void
4791 4796 ip_fanout_send_icmp_v4(mblk_t *mp, uint_t icmp_type, uint_t icmp_code,
4792 4797 ip_recv_attr_t *ira)
4793 4798 {
4794 4799 ipha_t *ipha;
4795 4800 boolean_t secure;
4796 4801 ill_t *ill = ira->ira_ill;
4797 4802 ip_stack_t *ipst = ill->ill_ipst;
4798 4803 netstack_t *ns = ipst->ips_netstack;
4799 4804 ipsec_stack_t *ipss = ns->netstack_ipsec;
4800 4805
4801 4806 secure = ira->ira_flags & IRAF_IPSEC_SECURE;
4802 4807
4803 4808 /*
4804 4809 * We are generating an icmp error for some inbound packet.
4805 4810 * Called from all ip_fanout_(udp, tcp, proto) functions.
4806 4811 * Before we generate an error, check with global policy
4807 4812 * to see whether this is allowed to enter the system. As
4808 4813 * there is no "conn", we are checking with global policy.
4809 4814 */
4810 4815 ipha = (ipha_t *)mp->b_rptr;
4811 4816 if (secure || ipss->ipsec_inbound_v4_policy_present) {
4812 4817 mp = ipsec_check_global_policy(mp, NULL, ipha, NULL, ira, ns);
4813 4818 if (mp == NULL)
4814 4819 return;
4815 4820 }
4816 4821
4817 4822 /* We never send errors for protocols that we do implement */
4818 4823 if (ira->ira_protocol == IPPROTO_ICMP ||
4819 4824 ira->ira_protocol == IPPROTO_IGMP) {
4820 4825 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
4821 4826 ip_drop_input("ip_fanout_send_icmp_v4", mp, ill);
4822 4827 freemsg(mp);
4823 4828 return;
4824 4829 }
4825 4830 /*
4826 4831 	 * Have to correct the checksum, since the packet
4827 4832 	 * might have been fragmented and the reassembly
4828 4833 	 * code in ip_rput does not restore the IP header
4829 4834 	 * checksum.
4830 4835 */
4831 4836 ipha->ipha_hdr_checksum = 0;
4832 4837 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
4833 4838
4834 4839 switch (icmp_type) {
4835 4840 case ICMP_DEST_UNREACHABLE:
4836 4841 switch (icmp_code) {
4837 4842 case ICMP_PROTOCOL_UNREACHABLE:
4838 4843 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInUnknownProtos);
4839 4844 ip_drop_input("ipIfStatsInUnknownProtos", mp, ill);
4840 4845 break;
4841 4846 case ICMP_PORT_UNREACHABLE:
4842 4847 BUMP_MIB(ill->ill_ip_mib, udpIfStatsNoPorts);
4843 4848 ip_drop_input("ipIfStatsNoPorts", mp, ill);
4844 4849 break;
4845 4850 }
4846 4851
4847 4852 icmp_unreachable(mp, icmp_code, ira);
4848 4853 break;
4849 4854 default:
4850 4855 #ifdef DEBUG
4851 4856 panic("ip_fanout_send_icmp_v4: wrong type");
4852 4857 /*NOTREACHED*/
4853 4858 #else
4854 4859 freemsg(mp);
4855 4860 break;
4856 4861 #endif
4857 4862 }
4858 4863 }
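
The zeroing and recomputation above is the usual RFC 1071 one's-complement header checksum. A standalone sketch of what ip_csum_hdr() computes; ipv4_hdr_csum is an illustrative name:

#include <stdint.h>
#include <stddef.h>

/*
 * One's-complement sum over an IPv4 header of 'len' bytes (always a
 * multiple of 4); the checksum field must be zeroed first, exactly as
 * the code above does before recomputing.
 */
static uint16_t
ipv4_hdr_csum(const void *hdr, size_t len)
{
	const uint16_t *p = hdr;
	uint32_t sum = 0;

	for (; len >= 2; len -= 2)
		sum += *p++;
	while (sum >> 16)			/* fold carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return ((uint16_t)~sum);
}
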
4859 4864
4860 4865 /*
4861 4866 * Used to send an ICMP error message when a packet is received for
4862 4867 * a protocol that is not supported. The mblk passed as argument
4863 4868 * is consumed by this function.
4864 4869 */
4865 4870 void
4866 4871 ip_proto_not_sup(mblk_t *mp, ip_recv_attr_t *ira)
4867 4872 {
4868 4873 ipha_t *ipha;
4869 4874
4870 4875 ipha = (ipha_t *)mp->b_rptr;
4871 4876 if (ira->ira_flags & IRAF_IS_IPV4) {
4872 4877 ASSERT(IPH_HDR_VERSION(ipha) == IP_VERSION);
4873 4878 ip_fanout_send_icmp_v4(mp, ICMP_DEST_UNREACHABLE,
4874 4879 ICMP_PROTOCOL_UNREACHABLE, ira);
4875 4880 } else {
4876 4881 ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION);
4877 4882 ip_fanout_send_icmp_v6(mp, ICMP6_PARAM_PROB,
4878 4883 ICMP6_PARAMPROB_NEXTHEADER, ira);
4879 4884 }
4880 4885 }
4881 4886
4882 4887 /*
4883 4888 * Deliver a rawip packet to the given conn, possibly applying ipsec policy.
4884 4889 * Handles IPv4 and IPv6.
4885 4890  * We are responsible for disposing of mp, such as by freemsg() or putnext().
4886 4891 * Caller is responsible for dropping references to the conn.
4887 4892 */
4888 4893 void
4889 4894 ip_fanout_proto_conn(conn_t *connp, mblk_t *mp, ipha_t *ipha, ip6_t *ip6h,
4890 4895 ip_recv_attr_t *ira)
4891 4896 {
4892 4897 ill_t *ill = ira->ira_ill;
4893 4898 ip_stack_t *ipst = ill->ill_ipst;
4894 4899 ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec;
4895 4900 boolean_t secure;
4896 4901 uint_t protocol = ira->ira_protocol;
4897 4902 iaflags_t iraflags = ira->ira_flags;
4898 4903 queue_t *rq;
4899 4904
4900 4905 secure = iraflags & IRAF_IPSEC_SECURE;
4901 4906
4902 4907 rq = connp->conn_rq;
4903 4908 if (IPCL_IS_NONSTR(connp) ? connp->conn_flow_cntrld : !canputnext(rq)) {
4904 4909 switch (protocol) {
4905 4910 case IPPROTO_ICMPV6:
4906 4911 BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInOverflows);
4907 4912 break;
4908 4913 case IPPROTO_ICMP:
4909 4914 BUMP_MIB(&ipst->ips_icmp_mib, icmpInOverflows);
4910 4915 break;
4911 4916 default:
4912 4917 BUMP_MIB(ill->ill_ip_mib, rawipIfStatsInOverflows);
4913 4918 break;
4914 4919 }
4915 4920 freemsg(mp);
4916 4921 return;
4917 4922 }
4918 4923
4919 4924 ASSERT(!(IPCL_IS_IPTUN(connp)));
4920 4925
4921 4926 if (((iraflags & IRAF_IS_IPV4) ?
4922 4927 CONN_INBOUND_POLICY_PRESENT(connp, ipss) :
4923 4928 CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss)) ||
4924 4929 secure) {
4925 4930 mp = ipsec_check_inbound_policy(mp, connp, ipha,
4926 4931 ip6h, ira);
4927 4932 if (mp == NULL) {
4928 4933 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
4929 4934 /* Note that mp is NULL */
4930 4935 ip_drop_input("ipIfStatsInDiscards", mp, ill);
4931 4936 return;
4932 4937 }
4933 4938 }
4934 4939
4935 4940 if (iraflags & IRAF_ICMP_ERROR) {
4936 4941 (connp->conn_recvicmp)(connp, mp, NULL, ira);
4937 4942 } else {
4938 4943 ill_t *rill = ira->ira_rill;
4939 4944
4940 4945 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
4941 4946 ira->ira_ill = ira->ira_rill = NULL;
4942 4947 /* Send it upstream */
4943 4948 (connp->conn_recv)(connp, mp, NULL, ira);
4944 4949 ira->ira_ill = ill;
4945 4950 ira->ira_rill = rill;
4946 4951 }
4947 4952 }
4948 4953
4949 4954 /*
4950 4955 * Handle protocols with which IP is less intimate. There
4951 4956 * can be more than one stream bound to a particular
4952 4957 * protocol. When this is the case, normally each one gets a copy
4953 4958 * of any incoming packets.
4954 4959 *
4955 4960 * IPsec NOTE :
4956 4961 *
4957 4962 * Don't allow a secure packet going up a non-secure connection.
4958 4963 * We don't allow this because
4959 4964 *
4960 4965 	 * 1) The reply might go out in the clear, and would be
4961 4966 	 *    dropped at the sending side.
4962 4967 	 * 2) If the reply does go out in the clear, it gives the
4963 4968 	 *    adversary enough information to recover the key in
4964 4969 	 *    most cases.
4965 4970 *
4966 4971 	 * Moreover, getting a secure packet when we expect clear
4967 4972 	 * text implies that SAs were added without checking the
4968 4973 	 * policy on both ends. This should not happen once ISAKMP
4969 4974 	 * is used to negotiate SAs, since SAs are added only after
4970 4975 	 * the policy has been verified.
4971 4976 *
4972 4977 * Zones notes:
4973 4978 * Earlier in ip_input on a system with multiple shared-IP zones we
4974 4979 * duplicate the multicast and broadcast packets and send them up
4975 4980 * with each explicit zoneid that exists on that ill.
4976 4981 * This means that here we can match the zoneid with SO_ALLZONES being special.
4977 4982 */
4978 4983 void
4979 4984 ip_fanout_proto_v4(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
4980 4985 {
4981 4986 mblk_t *mp1;
4982 4987 ipaddr_t laddr;
4983 4988 conn_t *connp, *first_connp, *next_connp;
4984 4989 connf_t *connfp;
4985 4990 ill_t *ill = ira->ira_ill;
4986 4991 ip_stack_t *ipst = ill->ill_ipst;
4987 4992
4988 4993 laddr = ipha->ipha_dst;
4989 4994
4990 4995 connfp = &ipst->ips_ipcl_proto_fanout_v4[ira->ira_protocol];
4991 4996 mutex_enter(&connfp->connf_lock);
4992 4997 connp = connfp->connf_head;
4993 4998 for (connp = connfp->connf_head; connp != NULL;
4994 4999 connp = connp->conn_next) {
4995 5000 /* Note: IPCL_PROTO_MATCH includes conn_wantpacket */
4996 5001 if (IPCL_PROTO_MATCH(connp, ira, ipha) &&
4997 5002 (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
4998 5003 tsol_receive_local(mp, &laddr, IPV4_VERSION, ira, connp))) {
4999 5004 break;
5000 5005 }
5001 5006 }
5002 5007
5003 5008 if (connp == NULL) {
5004 5009 /*
5005 5010 * No one bound to these addresses. Is
5006 5011 * there a client that wants all
5007 5012 * unclaimed datagrams?
5008 5013 */
5009 5014 mutex_exit(&connfp->connf_lock);
5010 5015 ip_fanout_send_icmp_v4(mp, ICMP_DEST_UNREACHABLE,
5011 5016 ICMP_PROTOCOL_UNREACHABLE, ira);
5012 5017 return;
5013 5018 }
5014 5019
5015 5020 ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_rq != NULL);
5016 5021
5017 5022 CONN_INC_REF(connp);
5018 5023 first_connp = connp;
5019 5024 connp = connp->conn_next;
5020 5025
5021 5026 for (;;) {
5022 5027 while (connp != NULL) {
5023 5028 /* Note: IPCL_PROTO_MATCH includes conn_wantpacket */
5024 5029 if (IPCL_PROTO_MATCH(connp, ira, ipha) &&
5025 5030 (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
5026 5031 tsol_receive_local(mp, &laddr, IPV4_VERSION,
5027 5032 ira, connp)))
5028 5033 break;
5029 5034 connp = connp->conn_next;
5030 5035 }
5031 5036
5032 5037 if (connp == NULL) {
5033 5038 /* No more interested clients */
5034 5039 connp = first_connp;
5035 5040 break;
5036 5041 }
5037 5042 if (((mp1 = dupmsg(mp)) == NULL) &&
5038 5043 ((mp1 = copymsg(mp)) == NULL)) {
5039 5044 /* Memory allocation failed */
5040 5045 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
5041 5046 ip_drop_input("ipIfStatsInDiscards", mp, ill);
5042 5047 connp = first_connp;
5043 5048 break;
5044 5049 }
5045 5050
5046 5051 CONN_INC_REF(connp);
5047 5052 mutex_exit(&connfp->connf_lock);
5048 5053
5049 5054 ip_fanout_proto_conn(connp, mp1, (ipha_t *)mp1->b_rptr, NULL,
5050 5055 ira);
5051 5056
5052 5057 mutex_enter(&connfp->connf_lock);
5053 5058 /* Follow the next pointer before releasing the conn. */
5054 5059 next_connp = connp->conn_next;
5055 5060 CONN_DEC_REF(connp);
5056 5061 connp = next_connp;
5057 5062 }
5058 5063
5059 5064 /* Last one. Send it upstream. */
5060 5065 mutex_exit(&connfp->connf_lock);
5061 5066
5062 5067 ip_fanout_proto_conn(connp, mp, ipha, NULL, ira);
5063 5068
5064 5069 CONN_DEC_REF(connp);
5065 5070 }
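
The loop above is a common fanout shape: remember the first match, hand a dupmsg()/copymsg() duplicate to each later match as it is found, and let the first match consume the original so the single-listener case never copies. A compact userland model of that shape; listener_t, msg_t, msg_dup and deliver are illustrative:

#include <stdlib.h>
#include <string.h>

typedef struct listener {
	struct listener	*next;
	int		proto;		/* match key */
} listener_t;

typedef struct msg {
	char	*buf;
	size_t	len;
} msg_t;

static msg_t *
msg_dup(const msg_t *m)
{
	msg_t *d;

	if ((d = malloc(sizeof (*d))) == NULL)
		return (NULL);
	if ((d->buf = malloc(m->len)) == NULL) {
		free(d);
		return (NULL);
	}
	(void) memcpy(d->buf, m->buf, m->len);
	d->len = m->len;
	return (d);
}

static void
fanout(listener_t *head, msg_t *m, int proto,
    void (*deliver)(listener_t *, msg_t *))
{
	listener_t *lp, *first = NULL;
	msg_t *d;

	for (lp = head; lp != NULL; lp = lp->next) {
		if (lp->proto != proto)
			continue;
		if (first == NULL) {		/* defer the first match */
			first = lp;
			continue;
		}
		if ((d = msg_dup(m)) == NULL)
			break;			/* no memory: stop copying */
		deliver(lp, d);			/* later matches get copies */
	}
	if (first != NULL)
		deliver(first, m);	/* first match consumes the original */
	/* with no match at all, the caller disposes of m (ICMP error above) */
}
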
5066 5071
5067 5072 /*
5068 5073  * If we have an IPsec NAT-Traversal packet, strip the zero-SPI or
5069 5074  * pass it along to ESP if the SPI is non-zero. Returns the mblk
5070 5075  * if it is not consumed.
5071 5076 *
5072 5077 * One of three things can happen, all of which affect the passed-in mblk:
5073 5078 *
5074 5079  * 1.) The packet is stock UDP and gets its zero-SPI stripped. Return the mblk.
5075 5080 *
5076 5081 * 2.) The packet is ESP-in-UDP, gets transformed into an equivalent
5077 5082 * ESP packet, and is passed along to ESP for consumption. Return NULL.
5078 5083 *
5079 5084 * 3.) The packet is an ESP-in-UDP Keepalive. Drop it and return NULL.
5080 5085 */
5081 5086 mblk_t *
5082 5087 zero_spi_check(mblk_t *mp, ip_recv_attr_t *ira)
5083 5088 {
5084 5089 int shift, plen, iph_len;
5085 5090 ipha_t *ipha;
5086 5091 udpha_t *udpha;
5087 5092 uint32_t *spi;
5088 5093 uint32_t esp_ports;
5089 5094 uint8_t *orptr;
5090 5095 ip_stack_t *ipst = ira->ira_ill->ill_ipst;
5091 5096 ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec;
5092 5097
5093 5098 ipha = (ipha_t *)mp->b_rptr;
5094 5099 iph_len = ira->ira_ip_hdr_length;
5095 5100 plen = ira->ira_pktlen;
5096 5101
5097 5102 if (plen - iph_len - sizeof (udpha_t) < sizeof (uint32_t)) {
5098 5103 /*
5099 5104 * Most likely a keepalive for the benefit of an intervening
5100 5105 * NAT. These aren't for us, per se, so drop it.
5101 5106 *
5102 5107 * RFC 3947/8 doesn't say for sure what to do for 2-3
5103 5108 * byte packets (keepalives are 1-byte), but we'll drop them
5104 5109 * also.
5105 5110 */
5106 5111 ip_drop_packet(mp, B_TRUE, ira->ira_ill,
5107 5112 DROPPER(ipss, ipds_esp_nat_t_ka), &ipss->ipsec_dropper);
5108 5113 return (NULL);
5109 5114 }
5110 5115
5111 5116 if (MBLKL(mp) < iph_len + sizeof (udpha_t) + sizeof (*spi)) {
5112 5117 /* might as well pull it all up - it might be ESP. */
5113 5118 if (!pullupmsg(mp, -1)) {
5114 5119 ip_drop_packet(mp, B_TRUE, ira->ira_ill,
5115 5120 DROPPER(ipss, ipds_esp_nomem),
5116 5121 &ipss->ipsec_dropper);
5117 5122 return (NULL);
5118 5123 }
5119 5124
5120 5125 ipha = (ipha_t *)mp->b_rptr;
5121 5126 }
5122 5127 spi = (uint32_t *)(mp->b_rptr + iph_len + sizeof (udpha_t));
5123 5128 if (*spi == 0) {
5124 5129 /* UDP packet - remove 0-spi. */
5125 5130 shift = sizeof (uint32_t);
5126 5131 } else {
5127 5132 /* ESP-in-UDP packet - reduce to ESP. */
5128 5133 ipha->ipha_protocol = IPPROTO_ESP;
5129 5134 shift = sizeof (udpha_t);
5130 5135 }
5131 5136
5132 5137 /* Fix IP header */
5133 5138 ira->ira_pktlen = (plen - shift);
5134 5139 ipha->ipha_length = htons(ira->ira_pktlen);
5135 5140 ipha->ipha_hdr_checksum = 0;
5136 5141
5137 5142 orptr = mp->b_rptr;
5138 5143 mp->b_rptr += shift;
5139 5144
5140 5145 udpha = (udpha_t *)(orptr + iph_len);
5141 5146 if (*spi == 0) {
5142 5147 ASSERT((uint8_t *)ipha == orptr);
5143 5148 udpha->uha_length = htons(plen - shift - iph_len);
5144 5149 iph_len += sizeof (udpha_t); /* For the call to ovbcopy(). */
5145 5150 esp_ports = 0;
5146 5151 } else {
5147 5152 esp_ports = *((uint32_t *)udpha);
5148 5153 ASSERT(esp_ports != 0);
5149 5154 }
5150 5155 ovbcopy(orptr, orptr + shift, iph_len);
5151 5156 if (esp_ports != 0) /* Punt up for ESP processing. */ {
5152 5157 ipha = (ipha_t *)(orptr + shift);
5153 5158
5154 5159 ira->ira_flags |= IRAF_ESP_UDP_PORTS;
5155 5160 ira->ira_esp_udp_ports = esp_ports;
5156 5161 ip_fanout_v4(mp, ipha, ira);
5157 5162 return (NULL);
5158 5163 }
5159 5164 return (mp);
5160 5165 }
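
The three-way split described in the block comment is the UDP-encapsulation demultiplexing rule from RFC 3948: payloads shorter than four bytes are NAT keepalives, a four-byte zero marker means non-ESP traffic, and anything else is ESP with its SPI in the first four bytes. The classification step in isolation; natt_class_t and natt_classify are illustrative names:

#include <stdint.h>
#include <string.h>

typedef enum {
	NATT_KEEPALIVE,	/* too short; only refreshes the NAT binding */
	NATT_NON_ESP,	/* zero marker: strip it, treat as plain UDP */
	NATT_ESP	/* nonzero SPI: hand the payload to ESP */
} natt_class_t;

static natt_class_t
natt_classify(const uint8_t *udp_payload, size_t len)
{
	uint32_t spi;

	if (len < sizeof (spi))
		return (NATT_KEEPALIVE);
	(void) memcpy(&spi, udp_payload, sizeof (spi));
	return (spi == 0 ? NATT_NON_ESP : NATT_ESP);
}
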
5161 5166
5162 5167 /*
5163 5168 * Deliver a udp packet to the given conn, possibly applying ipsec policy.
5164 5169 * Handles IPv4 and IPv6.
5165 5170  * We are responsible for disposing of mp, such as by freemsg() or putnext().
5166 5171 * Caller is responsible for dropping references to the conn.
5167 5172 */
5168 5173 void
5169 5174 ip_fanout_udp_conn(conn_t *connp, mblk_t *mp, ipha_t *ipha, ip6_t *ip6h,
5170 5175 ip_recv_attr_t *ira)
5171 5176 {
5172 5177 ill_t *ill = ira->ira_ill;
5173 5178 ip_stack_t *ipst = ill->ill_ipst;
5174 5179 ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec;
5175 5180 boolean_t secure;
5176 5181 iaflags_t iraflags = ira->ira_flags;
5177 5182
5178 5183 secure = iraflags & IRAF_IPSEC_SECURE;
5179 5184
5180 5185 if (IPCL_IS_NONSTR(connp) ? connp->conn_flow_cntrld :
5181 5186 !canputnext(connp->conn_rq)) {
5182 5187 BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows);
5183 5188 freemsg(mp);
5184 5189 return;
5185 5190 }
5186 5191
5187 5192 if (((iraflags & IRAF_IS_IPV4) ?
5188 5193 CONN_INBOUND_POLICY_PRESENT(connp, ipss) :
5189 5194 CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss)) ||
5190 5195 secure) {
5191 5196 mp = ipsec_check_inbound_policy(mp, connp, ipha,
5192 5197 ip6h, ira);
5193 5198 if (mp == NULL) {
5194 5199 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
5195 5200 /* Note that mp is NULL */
5196 5201 ip_drop_input("ipIfStatsInDiscards", mp, ill);
5197 5202 return;
5198 5203 }
5199 5204 }
5200 5205
5201 5206 /*
5202 5207 * Since this code is not used for UDP unicast we don't need a NAT_T
5203 5208 * check. Only ip_fanout_v4 has that check.
5204 5209 */
5205 5210 if (ira->ira_flags & IRAF_ICMP_ERROR) {
5206 5211 (connp->conn_recvicmp)(connp, mp, NULL, ira);
5207 5212 } else {
5208 5213 ill_t *rill = ira->ira_rill;
5209 5214
5210 5215 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
5211 5216 ira->ira_ill = ira->ira_rill = NULL;
5212 5217 /* Send it upstream */
5213 5218 (connp->conn_recv)(connp, mp, NULL, ira);
5214 5219 ira->ira_ill = ill;
5215 5220 ira->ira_rill = rill;
5216 5221 }
5217 5222 }
5218 5223
5219 5224 /*
5220 5225 * Fanout for UDP packets that are multicast or broadcast, and ICMP errors.
5221 5226 * (Unicast fanout is handled in ip_input_v4.)
5222 5227 *
5223 5228 * If SO_REUSEADDR is set all multicast and broadcast packets
5224 5229 * will be delivered to all conns bound to the same port.
5225 5230 *
5226 5231 * If there is at least one matching AF_INET receiver, then we will
5227 5232 * ignore any AF_INET6 receivers.
5228 5233 * In the special case where an AF_INET socket binds to 0.0.0.0/<port> and an
5229 5234 * AF_INET6 socket binds to ::/<port>, only the AF_INET socket receives the IPv4
5230 5235 * packets.
5231 5236 *
5232 5237 * Zones notes:
5233 5238 * Earlier in ip_input on a system with multiple shared-IP zones we
5234 5239 * duplicate the multicast and broadcast packets and send them up
5235 5240 * with each explicit zoneid that exists on that ill.
5236 5241 * This means that here we can match the zoneid with SO_ALLZONES being special.
5237 5242 */
5238 5243 void
5239 5244 ip_fanout_udp_multi_v4(mblk_t *mp, ipha_t *ipha, uint16_t lport, uint16_t fport,
5240 5245 ip_recv_attr_t *ira)
5241 5246 {
5242 5247 ipaddr_t laddr;
5243 5248 in6_addr_t v6faddr;
5244 5249 conn_t *connp;
5245 5250 connf_t *connfp;
5246 5251 ipaddr_t faddr;
5247 5252 ill_t *ill = ira->ira_ill;
5248 5253 ip_stack_t *ipst = ill->ill_ipst;
5249 5254
5250 5255 ASSERT(ira->ira_flags & (IRAF_MULTIBROADCAST|IRAF_ICMP_ERROR));
5251 5256
5252 5257 laddr = ipha->ipha_dst;
5253 5258 faddr = ipha->ipha_src;
5254 5259
5255 5260 connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
5256 5261 mutex_enter(&connfp->connf_lock);
5257 5262 connp = connfp->connf_head;
5258 5263
5259 5264 /*
5260 5265 	 * If SO_REUSEADDR has been set on the first matching conn,
5261 5266 	 * we send the packet to all clients that have joined the
5262 5267 	 * group and match the port.
5263 5268 */
5264 5269 while (connp != NULL) {
5265 5270 if ((IPCL_UDP_MATCH(connp, lport, laddr, fport, faddr)) &&
5266 5271 conn_wantpacket(connp, ira, ipha) &&
5267 5272 (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
5268 5273 tsol_receive_local(mp, &laddr, IPV4_VERSION, ira, connp)))
5269 5274 break;
5270 5275 connp = connp->conn_next;
5271 5276 }
5272 5277
5273 5278 if (connp == NULL)
5274 5279 goto notfound;
5275 5280
5276 5281 CONN_INC_REF(connp);
5277 5282
5278 5283 if (connp->conn_reuseaddr) {
5279 5284 conn_t *first_connp = connp;
5280 5285 conn_t *next_connp;
5281 5286 mblk_t *mp1;
5282 5287
5283 5288 connp = connp->conn_next;
5284 5289 for (;;) {
5285 5290 while (connp != NULL) {
5286 5291 if (IPCL_UDP_MATCH(connp, lport, laddr,
5287 5292 fport, faddr) &&
5288 5293 conn_wantpacket(connp, ira, ipha) &&
5289 5294 (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
5290 5295 tsol_receive_local(mp, &laddr, IPV4_VERSION,
5291 5296 ira, connp)))
5292 5297 break;
5293 5298 connp = connp->conn_next;
5294 5299 }
5295 5300 if (connp == NULL) {
5296 5301 /* No more interested clients */
5297 5302 connp = first_connp;
5298 5303 break;
5299 5304 }
5300 5305 if (((mp1 = dupmsg(mp)) == NULL) &&
5301 5306 ((mp1 = copymsg(mp)) == NULL)) {
5302 5307 /* Memory allocation failed */
5303 5308 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
5304 5309 ip_drop_input("ipIfStatsInDiscards", mp, ill);
5305 5310 connp = first_connp;
5306 5311 break;
5307 5312 }
5308 5313 CONN_INC_REF(connp);
5309 5314 mutex_exit(&connfp->connf_lock);
5310 5315
5311 5316 IP_STAT(ipst, ip_udp_fanmb);
5312 5317 ip_fanout_udp_conn(connp, mp1, (ipha_t *)mp1->b_rptr,
5313 5318 NULL, ira);
5314 5319 mutex_enter(&connfp->connf_lock);
5315 5320 /* Follow the next pointer before releasing the conn */
5316 5321 next_connp = connp->conn_next;
5317 5322 CONN_DEC_REF(connp);
5318 5323 connp = next_connp;
5319 5324 }
5320 5325 }
5321 5326
5322 5327 /* Last one. Send it upstream. */
5323 5328 mutex_exit(&connfp->connf_lock);
5324 5329 IP_STAT(ipst, ip_udp_fanmb);
5325 5330 ip_fanout_udp_conn(connp, mp, ipha, NULL, ira);
5326 5331 CONN_DEC_REF(connp);
5327 5332 return;
5328 5333
5329 5334 notfound:
5330 5335 mutex_exit(&connfp->connf_lock);
5331 5336 /*
5332 5337 * IPv6 endpoints bound to multicast IPv4-mapped addresses
5333 5338 * have already been matched above, since they live in the IPv4
5334 5339 * fanout tables. This implies we only need to
5335 5340 * check for IPv6 in6addr_any endpoints here.
5336 5341 * Thus we compare using ipv6_all_zeros instead of the destination
5337 5342 * address, except for the multicast group membership lookup which
5338 5343 * uses the IPv4 destination.
5339 5344 */
5340 5345 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6faddr);
5341 5346 connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
5342 5347 mutex_enter(&connfp->connf_lock);
5343 5348 connp = connfp->connf_head;
5344 5349 /*
5345 5350 * IPv4 multicast packet being delivered to an AF_INET6
5346 5351 * in6addr_any endpoint.
5347 5352 * Need to check conn_wantpacket(). Note that we use conn_wantpacket()
5348 5353 * and not conn_wantpacket_v6() since any multicast membership is
5349 5354 * for an IPv4-mapped multicast address.
5350 5355 */
5351 5356 while (connp != NULL) {
5352 5357 if (IPCL_UDP_MATCH_V6(connp, lport, ipv6_all_zeros,
5353 5358 fport, v6faddr) &&
5354 5359 conn_wantpacket(connp, ira, ipha) &&
5355 5360 (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
5356 5361 tsol_receive_local(mp, &laddr, IPV4_VERSION, ira, connp)))
5357 5362 break;
5358 5363 connp = connp->conn_next;
5359 5364 }
5360 5365
5361 5366 if (connp == NULL) {
5362 5367 /*
5363 5368 * No one bound to this port. Is
5364 5369 * there a client that wants all
5365 5370 * unclaimed datagrams?
5366 5371 */
5367 5372 mutex_exit(&connfp->connf_lock);
5368 5373
5369 5374 if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_UDP].connf_head !=
5370 5375 NULL) {
5371 5376 ASSERT(ira->ira_protocol == IPPROTO_UDP);
5372 5377 ip_fanout_proto_v4(mp, ipha, ira);
5373 5378 } else {
5374 5379 /*
5375 5380 * We used to attempt to send an icmp error here, but
5376 5381 * since this is known to be a multicast packet
5377 5382 * and we don't send icmp errors in response to
5378 5383 * multicast, just drop the packet and give up sooner.
5379 5384 */
5380 5385 BUMP_MIB(ill->ill_ip_mib, udpIfStatsNoPorts);
5381 5386 freemsg(mp);
5382 5387 }
5383 5388 return;
5384 5389 }
5385 5390 ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_rq != NULL);
5386 5391
5387 5392 /*
5388 5393 	 * If SO_REUSEADDR has been set on the first matching conn,
5389 5394 	 * we send the packet to all clients that have joined the
5390 5395 	 * group and match the port.
5391 5396 */
5392 5397 if (connp->conn_reuseaddr) {
5393 5398 conn_t *first_connp = connp;
5394 5399 conn_t *next_connp;
5395 5400 mblk_t *mp1;
5396 5401
5397 5402 CONN_INC_REF(connp);
5398 5403 connp = connp->conn_next;
5399 5404 for (;;) {
5400 5405 while (connp != NULL) {
5401 5406 if (IPCL_UDP_MATCH_V6(connp, lport,
5402 5407 ipv6_all_zeros, fport, v6faddr) &&
5403 5408 conn_wantpacket(connp, ira, ipha) &&
5404 5409 (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
5405 5410 tsol_receive_local(mp, &laddr, IPV4_VERSION,
5406 5411 ira, connp)))
5407 5412 break;
5408 5413 connp = connp->conn_next;
5409 5414 }
5410 5415 if (connp == NULL) {
5411 5416 /* No more interested clients */
5412 5417 connp = first_connp;
5413 5418 break;
5414 5419 }
5415 5420 if (((mp1 = dupmsg(mp)) == NULL) &&
5416 5421 ((mp1 = copymsg(mp)) == NULL)) {
5417 5422 /* Memory allocation failed */
5418 5423 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
5419 5424 ip_drop_input("ipIfStatsInDiscards", mp, ill);
5420 5425 connp = first_connp;
5421 5426 break;
5422 5427 }
5423 5428 CONN_INC_REF(connp);
5424 5429 mutex_exit(&connfp->connf_lock);
5425 5430
5426 5431 IP_STAT(ipst, ip_udp_fanmb);
5427 5432 ip_fanout_udp_conn(connp, mp1, (ipha_t *)mp1->b_rptr,
5428 5433 NULL, ira);
5429 5434 mutex_enter(&connfp->connf_lock);
5430 5435 /* Follow the next pointer before releasing the conn */
5431 5436 next_connp = connp->conn_next;
5432 5437 CONN_DEC_REF(connp);
5433 5438 connp = next_connp;
5434 5439 }
5435 5440 }
5436 5441
5437 5442 /* Last one. Send it upstream. */
5438 5443 mutex_exit(&connfp->connf_lock);
5439 5444 IP_STAT(ipst, ip_udp_fanmb);
5440 5445 ip_fanout_udp_conn(connp, mp, ipha, NULL, ira);
5441 5446 CONN_DEC_REF(connp);
5442 5447 }
5443 5448
5444 5449 /*
5445 5450 * Split an incoming packet's IPv4 options into the label and the other options.
5446 5451 * If 'allocate' is set it does memory allocation for the ip_pkt_t, including
5447 5452 * clearing out any leftover label or options.
5448 5453 * Otherwise it just makes ipp point into the packet.
5449 5454 *
5450 5455 * Returns zero if ok; ENOMEM if the buffer couldn't be allocated.
5451 5456 */
5452 5457 int
5453 5458 ip_find_hdr_v4(ipha_t *ipha, ip_pkt_t *ipp, boolean_t allocate)
5454 5459 {
5455 5460 uchar_t *opt;
5456 5461 uint32_t totallen;
5457 5462 uint32_t optval;
5458 5463 uint32_t optlen;
5459 5464
5460 5465 ipp->ipp_fields |= IPPF_HOPLIMIT | IPPF_TCLASS | IPPF_ADDR;
5461 5466 ipp->ipp_hoplimit = ipha->ipha_ttl;
5462 5467 ipp->ipp_type_of_service = ipha->ipha_type_of_service;
5463 5468 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &ipp->ipp_addr);
5464 5469
5465 5470 /*
5466 5471 	 * Get the length (in 32-bit words) of the IP header options.
5467 5472 */
5468 5473 totallen = ipha->ipha_version_and_hdr_length -
5469 5474 (uint8_t)((IP_VERSION << 4) + IP_SIMPLE_HDR_LENGTH_IN_WORDS);
5470 5475
5471 5476 if (totallen == 0) {
5472 5477 if (!allocate)
5473 5478 return (0);
5474 5479
5475 5480 /* Clear out anything from a previous packet */
5476 5481 if (ipp->ipp_fields & IPPF_IPV4_OPTIONS) {
5477 5482 kmem_free(ipp->ipp_ipv4_options,
5478 5483 ipp->ipp_ipv4_options_len);
5479 5484 ipp->ipp_ipv4_options = NULL;
5480 5485 ipp->ipp_ipv4_options_len = 0;
5481 5486 ipp->ipp_fields &= ~IPPF_IPV4_OPTIONS;
5482 5487 }
5483 5488 if (ipp->ipp_fields & IPPF_LABEL_V4) {
5484 5489 kmem_free(ipp->ipp_label_v4, ipp->ipp_label_len_v4);
5485 5490 ipp->ipp_label_v4 = NULL;
5486 5491 ipp->ipp_label_len_v4 = 0;
5487 5492 ipp->ipp_fields &= ~IPPF_LABEL_V4;
5488 5493 }
5489 5494 return (0);
5490 5495 }
5491 5496
5492 5497 totallen <<= 2;
5493 5498 opt = (uchar_t *)&ipha[1];
5494 5499 if (!is_system_labeled()) {
5495 5500
5496 5501 copyall:
5497 5502 if (!allocate) {
5498 5503 if (totallen != 0) {
5499 5504 ipp->ipp_ipv4_options = opt;
5500 5505 ipp->ipp_ipv4_options_len = totallen;
5501 5506 ipp->ipp_fields |= IPPF_IPV4_OPTIONS;
5502 5507 }
5503 5508 return (0);
5504 5509 }
5505 5510 		/* Just copy all of the options */
5506 5511 if (ipp->ipp_fields & IPPF_IPV4_OPTIONS) {
5507 5512 if (totallen == ipp->ipp_ipv4_options_len) {
5508 5513 bcopy(opt, ipp->ipp_ipv4_options, totallen);
5509 5514 return (0);
5510 5515 }
5511 5516 kmem_free(ipp->ipp_ipv4_options,
5512 5517 ipp->ipp_ipv4_options_len);
5513 5518 ipp->ipp_ipv4_options = NULL;
5514 5519 ipp->ipp_ipv4_options_len = 0;
5515 5520 ipp->ipp_fields &= ~IPPF_IPV4_OPTIONS;
5516 5521 }
5517 5522 if (totallen == 0)
5518 5523 return (0);
5519 5524
5520 5525 ipp->ipp_ipv4_options = kmem_alloc(totallen, KM_NOSLEEP);
5521 5526 if (ipp->ipp_ipv4_options == NULL)
5522 5527 return (ENOMEM);
5523 5528 ipp->ipp_ipv4_options_len = totallen;
5524 5529 ipp->ipp_fields |= IPPF_IPV4_OPTIONS;
5525 5530 bcopy(opt, ipp->ipp_ipv4_options, totallen);
5526 5531 return (0);
5527 5532 }
5528 5533
5529 5534 if (allocate && (ipp->ipp_fields & IPPF_LABEL_V4)) {
5530 5535 kmem_free(ipp->ipp_label_v4, ipp->ipp_label_len_v4);
5531 5536 ipp->ipp_label_v4 = NULL;
5532 5537 ipp->ipp_label_len_v4 = 0;
5533 5538 ipp->ipp_fields &= ~IPPF_LABEL_V4;
5534 5539 }
5535 5540
5536 5541 /*
5537 5542 * Search for CIPSO option.
5538 5543 * We assume CIPSO is first in options if it is present.
5539 5544 	 * If it isn't, then ipp_ipv4_options will not include the options
5540 5545 * prior to the CIPSO option.
5541 5546 */
5542 5547 while (totallen != 0) {
5543 5548 switch (optval = opt[IPOPT_OPTVAL]) {
5544 5549 case IPOPT_EOL:
5545 5550 return (0);
5546 5551 case IPOPT_NOP:
5547 5552 optlen = 1;
5548 5553 break;
5549 5554 default:
5550 5555 if (totallen <= IPOPT_OLEN)
5551 5556 return (EINVAL);
5552 5557 optlen = opt[IPOPT_OLEN];
5553 5558 if (optlen < 2)
5554 5559 return (EINVAL);
5555 5560 }
5556 5561 if (optlen > totallen)
5557 5562 return (EINVAL);
5558 5563
5559 5564 switch (optval) {
5560 5565 case IPOPT_COMSEC:
5561 5566 if (!allocate) {
5562 5567 ipp->ipp_label_v4 = opt;
5563 5568 ipp->ipp_label_len_v4 = optlen;
5564 5569 ipp->ipp_fields |= IPPF_LABEL_V4;
5565 5570 } else {
5566 5571 ipp->ipp_label_v4 = kmem_alloc(optlen,
5567 5572 KM_NOSLEEP);
5568 5573 if (ipp->ipp_label_v4 == NULL)
5569 5574 return (ENOMEM);
5570 5575 ipp->ipp_label_len_v4 = optlen;
5571 5576 ipp->ipp_fields |= IPPF_LABEL_V4;
5572 5577 bcopy(opt, ipp->ipp_label_v4, optlen);
5573 5578 }
5574 5579 totallen -= optlen;
5575 5580 opt += optlen;
5576 5581
5577 5582 /* Skip padding bytes until we get to a multiple of 4 */
5578 5583 while ((totallen & 3) != 0 && opt[0] == IPOPT_NOP) {
5579 5584 totallen--;
5580 5585 opt++;
5581 5586 }
5582 5587 /* Remaining as ipp_ipv4_options */
5583 5588 goto copyall;
5584 5589 }
5585 5590 totallen -= optlen;
5586 5591 opt += optlen;
5587 5592 }
5588 5593 /* No CIPSO found; return everything as ipp_ipv4_options */
5589 5594 totallen = ipha->ipha_version_and_hdr_length -
5590 5595 (uint8_t)((IP_VERSION << 4) + IP_SIMPLE_HDR_LENGTH_IN_WORDS);
5591 5596 totallen <<= 2;
5592 5597 opt = (uchar_t *)&ipha[1];
5593 5598 goto copyall;
5594 5599 }
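
The totallen arithmetic at the top of the function leans on the layout of ipha_version_and_hdr_length: version in the high nibble, header length in 32-bit words in the low nibble. A worked instance of the same computation:

	uint8_t		vhl = 0x4f;	/* IPv4, IHL = 15 words = 60 bytes */
	uint32_t	totallen;

	/* (IP_VERSION << 4) + IP_SIMPLE_HDR_LENGTH_IN_WORDS == 0x45 */
	totallen = vhl - ((4 << 4) + 5);	/* 0x4f - 0x45 = 10 words */
	totallen <<= 2;				/* 40 bytes of options */
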
5595 5600
5596 5601 /*
5597 5602 * Efficient versions of lookup for an IRE when we only
5598 5603 * match the address.
5599 5604 * For RTF_REJECT or BLACKHOLE we return IRE_NOROUTE.
5600 5605 * Does not handle multicast addresses.
5601 5606 */
5602 5607 uint_t
5603 5608 ip_type_v4(ipaddr_t addr, ip_stack_t *ipst)
5604 5609 {
5605 5610 ire_t *ire;
5606 5611 uint_t result;
5607 5612
5608 5613 ire = ire_ftable_lookup_simple_v4(addr, 0, ipst, NULL);
5609 5614 ASSERT(ire != NULL);
5610 5615 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
5611 5616 result = IRE_NOROUTE;
5612 5617 else
5613 5618 result = ire->ire_type;
5614 5619 ire_refrele(ire);
5615 5620 return (result);
5616 5621 }
5617 5622
5618 5623 /*
5619 5624 * Efficient versions of lookup for an IRE when we only
5620 5625 * match the address.
5621 5626 * For RTF_REJECT or BLACKHOLE we return IRE_NOROUTE.
5622 5627 * Does not handle multicast addresses.
5623 5628 */
5624 5629 uint_t
5625 5630 ip_type_v6(const in6_addr_t *addr, ip_stack_t *ipst)
5626 5631 {
5627 5632 ire_t *ire;
5628 5633 uint_t result;
5629 5634
5630 5635 ire = ire_ftable_lookup_simple_v6(addr, 0, ipst, NULL);
5631 5636 ASSERT(ire != NULL);
5632 5637 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
5633 5638 result = IRE_NOROUTE;
5634 5639 else
5635 5640 result = ire->ire_type;
5636 5641 ire_refrele(ire);
5637 5642 return (result);
5638 5643 }
5639 5644
5640 5645 /*
5641 5646 * Nobody should be sending
5642 5647 * packets up this stream
5643 5648 */
5644 5649 static void
5645 5650 ip_lrput(queue_t *q, mblk_t *mp)
5646 5651 {
5647 5652 switch (mp->b_datap->db_type) {
5648 5653 case M_FLUSH:
5649 5654 /* Turn around */
5650 5655 if (*mp->b_rptr & FLUSHW) {
5651 5656 *mp->b_rptr &= ~FLUSHR;
5652 5657 qreply(q, mp);
5653 5658 return;
5654 5659 }
5655 5660 break;
5656 5661 }
5657 5662 freemsg(mp);
5658 5663 }
5659 5664
5660 5665 /* Nobody should be sending packets down this stream */
5661 5666 /* ARGSUSED */
5662 5667 void
5663 5668 ip_lwput(queue_t *q, mblk_t *mp)
5664 5669 {
5665 5670 freemsg(mp);
5666 5671 }
5667 5672
5668 5673 /*
5669 5674 * Move the first hop in any source route to ipha_dst and remove that part of
5670 5675 * the source route. Called by other protocols. Errors in option formatting
5671 5676 * are ignored - will be handled by ip_output_options. Return the final
5672 5677 * destination (either ipha_dst or the last entry in a source route.)
5673 5678 */
5674 5679 ipaddr_t
5675 5680 ip_massage_options(ipha_t *ipha, netstack_t *ns)
5676 5681 {
5677 5682 ipoptp_t opts;
5678 5683 uchar_t *opt;
5679 5684 uint8_t optval;
5680 5685 uint8_t optlen;
5681 5686 ipaddr_t dst;
5682 5687 int i;
5683 5688 ip_stack_t *ipst = ns->netstack_ip;
5684 5689
5685 5690 ip2dbg(("ip_massage_options\n"));
5686 5691 dst = ipha->ipha_dst;
5687 5692 for (optval = ipoptp_first(&opts, ipha);
5688 5693 optval != IPOPT_EOL;
5689 5694 optval = ipoptp_next(&opts)) {
5690 5695 opt = opts.ipoptp_cur;
5691 5696 switch (optval) {
5692 5697 uint8_t off;
5693 5698 case IPOPT_SSRR:
5694 5699 case IPOPT_LSRR:
5695 5700 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
5696 5701 ip1dbg(("ip_massage_options: bad src route\n"));
5697 5702 break;
5698 5703 }
5699 5704 optlen = opts.ipoptp_len;
5700 5705 off = opt[IPOPT_OFFSET];
5701 5706 off--;
5702 5707 redo_srr:
5703 5708 if (optlen < IP_ADDR_LEN ||
5704 5709 off > optlen - IP_ADDR_LEN) {
5705 5710 /* End of source route */
5706 5711 ip1dbg(("ip_massage_options: end of SR\n"));
5707 5712 break;
5708 5713 }
5709 5714 bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
5710 5715 ip1dbg(("ip_massage_options: next hop 0x%x\n",
5711 5716 ntohl(dst)));
5712 5717 /*
5713 5718 * Check if our address is present more than
5714 5719 * once as consecutive hops in source route.
5715 5720 * XXX verify per-interface ip_forwarding
5716 5721 * for source route?
5717 5722 */
5718 5723 if (ip_type_v4(dst, ipst) == IRE_LOCAL) {
5719 5724 off += IP_ADDR_LEN;
5720 5725 goto redo_srr;
5721 5726 }
5722 5727 if (dst == htonl(INADDR_LOOPBACK)) {
5723 5728 ip1dbg(("ip_massage_options: loopback addr in "
5724 5729 "source route!\n"));
5725 5730 break;
5726 5731 }
5727 5732 /*
5728 5733 * Update ipha_dst to be the first hop and remove the
5729 5734 * first hop from the source route (by overwriting
5730 5735 * part of the option with NOP options).
5731 5736 */
5732 5737 ipha->ipha_dst = dst;
5733 5738 /* Put the last entry in dst */
5734 5739 off = ((optlen - IP_ADDR_LEN - 3) & ~(IP_ADDR_LEN-1)) +
5735 5740 3;
5736 5741 bcopy(&opt[off], &dst, IP_ADDR_LEN);
5737 5742
5738 5743 ip1dbg(("ip_massage_options: last hop 0x%x\n",
5739 5744 ntohl(dst)));
5740 5745 /* Move down and overwrite */
5741 5746 opt[IP_ADDR_LEN] = opt[0];
5742 5747 opt[IP_ADDR_LEN+1] = opt[IPOPT_OLEN] - IP_ADDR_LEN;
5743 5748 opt[IP_ADDR_LEN+2] = opt[IPOPT_OFFSET];
5744 5749 for (i = 0; i < IP_ADDR_LEN; i++)
5745 5750 opt[i] = IPOPT_NOP;
5746 5751 break;
5747 5752 }
5748 5753 }
5749 5754 return (dst);
5750 5755 }
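
A worked example of the source-route rewrite, for an LSRR option carrying two hops A and B (all byte values illustrative):

/*
 * before:  opt[] = { 0x83, 11, 4,  A,A,A,A,  B,B,B,B }
 *                    type len ptr  first hop final hop
 *
 * after:   ipha_dst = A			(first hop promoted)
 *          opt[]    = { NOP, NOP, NOP, NOP,
 *                       0x83, 7, 4,  B,B,B,B }
 *          returns    B			(final destination)
 *
 * The option header is slid down over the consumed hop and the four
 * freed bytes are overwritten with IPOPT_NOP.
 */
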
5751 5756
5752 5757 /*
5753 5758 * Return the network mask
5754 5759 * associated with the specified address.
5755 5760 */
5756 5761 ipaddr_t
5757 5762 ip_net_mask(ipaddr_t addr)
5758 5763 {
5759 5764 uchar_t *up = (uchar_t *)&addr;
5760 5765 ipaddr_t mask = 0;
5761 5766 uchar_t *maskp = (uchar_t *)&mask;
5762 5767
5763 5768 #if defined(__i386) || defined(__amd64)
5764 5769 #define TOTALLY_BRAIN_DAMAGED_C_COMPILER
5765 5770 #endif
5766 5771 #ifdef TOTALLY_BRAIN_DAMAGED_C_COMPILER
5767 5772 maskp[0] = maskp[1] = maskp[2] = maskp[3] = 0;
5768 5773 #endif
5769 5774 if (CLASSD(addr)) {
5770 5775 maskp[0] = 0xF0;
5771 5776 return (mask);
5772 5777 }
5773 5778
5774 5779 /* We assume Class E default netmask to be 32 */
5775 5780 if (CLASSE(addr))
5776 5781 return (0xffffffffU);
5777 5782
5778 5783 if (addr == 0)
5779 5784 return (0);
5780 5785 maskp[0] = 0xFF;
5781 5786 if ((up[0] & 0x80) == 0)
5782 5787 return (mask);
5783 5788
5784 5789 maskp[1] = 0xFF;
5785 5790 if ((up[0] & 0xC0) == 0x80)
5786 5791 return (mask);
5787 5792
5788 5793 maskp[2] = 0xFF;
5789 5794 if ((up[0] & 0xE0) == 0xC0)
5790 5795 return (mask);
5791 5796
5792 5797 /* Otherwise return no mask */
5793 5798 return ((ipaddr_t)0);
5794 5799 }
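
For reference, the classful rules above yield the following (a worked table, not part of the source):

/*
 *	10.1.2.3	leading bit 0	 -> 255.0.0.0	  (class A)
 *	172.16.0.1	leading bits 10	 -> 255.255.0.0	  (class B)
 *	192.168.1.1	leading bits 110 -> 255.255.255.0 (class C)
 *	224.0.0.1	class D		 -> 240.0.0.0
 *	248.0.0.1	class E		 -> 255.255.255.255
 *	0.0.0.0				 -> 0.0.0.0
 */
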
5795 5800
5796 5801 /* Name/Value Table Lookup Routine */
5797 5802 char *
5798 5803 ip_nv_lookup(nv_t *nv, int value)
5799 5804 {
5800 5805 if (!nv)
5801 5806 return (NULL);
5802 5807 for (; nv->nv_name; nv++) {
5803 5808 if (nv->nv_value == value)
5804 5809 return (nv->nv_name);
5805 5810 }
5806 5811 return ("unknown");
5807 5812 }
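
Illustrative use, with the customary NULL-name entry terminating the table (the table contents here are made up):

	static nv_t states[] = {
		{ .nv_value = 1, .nv_name = "up" },
		{ .nv_value = 2, .nv_name = "down" },
		{ .nv_name = NULL }		/* terminator */
	};

	/* prints "down"; an unmatched value would print "unknown" */
	(void) printf("%s\n", ip_nv_lookup(states, 2));
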
5808 5813
5809 5814 static int
5810 5815 ip_wait_for_info_ack(ill_t *ill)
5811 5816 {
5812 5817 int err;
5813 5818
5814 5819 mutex_enter(&ill->ill_lock);
5815 5820 while (ill->ill_state_flags & ILL_LL_SUBNET_PENDING) {
5816 5821 /*
5817 5822 * Return value of 0 indicates a pending signal.
5818 5823 */
5819 5824 err = cv_wait_sig(&ill->ill_cv, &ill->ill_lock);
5820 5825 if (err == 0) {
5821 5826 mutex_exit(&ill->ill_lock);
5822 5827 return (EINTR);
5823 5828 }
5824 5829 }
5825 5830 mutex_exit(&ill->ill_lock);
5826 5831 /*
5827 5832 * ip_rput_other could have set an error in ill_error on
5828 5833 * receipt of M_ERROR.
5829 5834 */
5830 5835 return (ill->ill_error);
5831 5836 }
5832 5837
5833 5838 /*
5834 5839 * This is a module open, i.e. this is a control stream for access
5835 5840 * to a DLPI device. We allocate an ill_t as the instance data in
5836 5841 * this case.
5837 5842 */
5838 5843 static int
5839 5844 ip_modopen(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
5840 5845 {
5841 5846 ill_t *ill;
5842 5847 int err;
5843 5848 zoneid_t zoneid;
5844 5849 netstack_t *ns;
5845 5850 ip_stack_t *ipst;
5846 5851
5847 5852 /*
5848 5853 * Prevent unprivileged processes from pushing IP so that
5849 5854 * they can't send raw IP.
5850 5855 */
5851 5856 if (secpolicy_net_rawaccess(credp) != 0)
5852 5857 return (EPERM);
5853 5858
5854 5859 ns = netstack_find_by_cred(credp);
5855 5860 ASSERT(ns != NULL);
5856 5861 ipst = ns->netstack_ip;
5857 5862 ASSERT(ipst != NULL);
5858 5863
5859 5864 /*
5860 5865 * For exclusive stacks we set the zoneid to zero
5861 5866 * to make IP operate as if in the global zone.
5862 5867 */
5863 5868 if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID)
5864 5869 zoneid = GLOBAL_ZONEID;
5865 5870 else
5866 5871 zoneid = crgetzoneid(credp);
5867 5872
5868 5873 ill = (ill_t *)mi_open_alloc_sleep(sizeof (ill_t));
5869 5874 q->q_ptr = WR(q)->q_ptr = ill;
5870 5875 ill->ill_ipst = ipst;
5871 5876 ill->ill_zoneid = zoneid;
5872 5877
5873 5878 /*
5874 5879 	 * ill_init initializes the ill fields and then sends
5875 5880 	 * down a DL_INFO_REQ after calling qprocson.
5876 5881 */
5877 5882 err = ill_init(q, ill);
5878 5883
5879 5884 if (err != 0) {
5880 5885 mi_free(ill);
5881 5886 netstack_rele(ipst->ips_netstack);
5882 5887 q->q_ptr = NULL;
5883 5888 WR(q)->q_ptr = NULL;
5884 5889 return (err);
5885 5890 }
5886 5891
5887 5892 /*
5888 5893 * Wait for the DL_INFO_ACK if a DL_INFO_REQ was sent.
5889 5894 *
5890 5895 	 * ill_init initializes the ipsq, marking this thread as
5891 5896 	 * writer.
5892 5897 */
5893 5898 ipsq_exit(ill->ill_phyint->phyint_ipsq);
5894 5899 err = ip_wait_for_info_ack(ill);
5895 5900 if (err == 0)
5896 5901 ill->ill_credp = credp;
5897 5902 else
5898 5903 goto fail;
5899 5904
5900 5905 crhold(credp);
5901 5906
5902 5907 mutex_enter(&ipst->ips_ip_mi_lock);
5903 5908 err = mi_open_link(&ipst->ips_ip_g_head, (IDP)q->q_ptr, devp, flag,
5904 5909 sflag, credp);
5905 5910 mutex_exit(&ipst->ips_ip_mi_lock);
5906 5911 fail:
5907 5912 if (err) {
5908 5913 (void) ip_close(q, 0);
5909 5914 return (err);
5910 5915 }
5911 5916 return (0);
5912 5917 }
5913 5918
5914 5919 /* For /dev/ip aka AF_INET open */
5915 5920 int
5916 5921 ip_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
5917 5922 {
5918 5923 return (ip_open(q, devp, flag, sflag, credp, B_FALSE));
5919 5924 }
5920 5925
5921 5926 /* For /dev/ip6 aka AF_INET6 open */
5922 5927 int
5923 5928 ip_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
5924 5929 {
5925 5930 return (ip_open(q, devp, flag, sflag, credp, B_TRUE));
5926 5931 }
5927 5932
5928 5933 /* IP open routine. */
5929 5934 int
5930 5935 ip_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
5931 5936 boolean_t isv6)
5932 5937 {
5933 5938 conn_t *connp;
5934 5939 major_t maj;
5935 5940 zoneid_t zoneid;
5936 5941 netstack_t *ns;
5937 5942 ip_stack_t *ipst;
5938 5943
5939 5944 /* Allow reopen. */
5940 5945 if (q->q_ptr != NULL)
5941 5946 return (0);
5942 5947
5943 5948 if (sflag & MODOPEN) {
5944 5949 /* This is a module open */
5945 5950 return (ip_modopen(q, devp, flag, sflag, credp));
5946 5951 }
5947 5952
5948 5953 if ((flag & ~(FKLYR)) == IP_HELPER_STR) {
5949 5954 /*
5950 5955 		 * Non-streams-based socket looking for a stream
5951 5956 * to access IP
5952 5957 */
5953 5958 return (ip_helper_stream_setup(q, devp, flag, sflag,
5954 5959 credp, isv6));
5955 5960 }
5956 5961
5957 5962 ns = netstack_find_by_cred(credp);
5958 5963 ASSERT(ns != NULL);
5959 5964 ipst = ns->netstack_ip;
5960 5965 ASSERT(ipst != NULL);
5961 5966
5962 5967 /*
5963 5968 * For exclusive stacks we set the zoneid to zero
5964 5969 * to make IP operate as if in the global zone.
5965 5970 */
5966 5971 if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID)
5967 5972 zoneid = GLOBAL_ZONEID;
5968 5973 else
5969 5974 zoneid = crgetzoneid(credp);
5970 5975
5971 5976 /*
5972 5977 * We are opening as a device. This is an IP client stream, and we
5973 5978 	 * allocate a conn_t as the instance data.
5974 5979 */
5975 5980 connp = ipcl_conn_create(IPCL_IPCCONN, KM_SLEEP, ipst->ips_netstack);
5976 5981
5977 5982 /*
5978 5983 * ipcl_conn_create did a netstack_hold. Undo the hold that was
5979 5984 * done by netstack_find_by_cred()
5980 5985 */
5981 5986 netstack_rele(ipst->ips_netstack);
5982 5987
5983 5988 connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_ULP_CKSUM;
5984 5989 	/* conn_allzones cannot be set this early, hence no IPCL_ZONEID */
5985 5990 connp->conn_ixa->ixa_zoneid = zoneid;
5986 5991 connp->conn_zoneid = zoneid;
5987 5992
5988 5993 connp->conn_rq = q;
5989 5994 q->q_ptr = WR(q)->q_ptr = connp;
5990 5995
5991 5996 /* Minor tells us which /dev entry was opened */
5992 5997 if (isv6) {
5993 5998 connp->conn_family = AF_INET6;
5994 5999 connp->conn_ipversion = IPV6_VERSION;
5995 6000 connp->conn_ixa->ixa_flags &= ~IXAF_IS_IPV4;
5996 6001 connp->conn_ixa->ixa_src_preferences = IPV6_PREFER_SRC_DEFAULT;
5997 6002 } else {
5998 6003 connp->conn_family = AF_INET;
5999 6004 connp->conn_ipversion = IPV4_VERSION;
6000 6005 connp->conn_ixa->ixa_flags |= IXAF_IS_IPV4;
6001 6006 }
6002 6007
6003 6008 if ((ip_minor_arena_la != NULL) && (flag & SO_SOCKSTR) &&
6004 6009 ((connp->conn_dev = inet_minor_alloc(ip_minor_arena_la)) != 0)) {
6005 6010 connp->conn_minor_arena = ip_minor_arena_la;
6006 6011 } else {
6007 6012 /*
6008 6013 * Either minor numbers in the large arena were exhausted
6009 6014 		 * or a non-socket application is doing the open.
6010 6015 * Try to allocate from the small arena.
6011 6016 */
6012 6017 if ((connp->conn_dev =
6013 6018 inet_minor_alloc(ip_minor_arena_sa)) == 0) {
6014 6019 /* CONN_DEC_REF takes care of netstack_rele() */
6015 6020 q->q_ptr = WR(q)->q_ptr = NULL;
6016 6021 CONN_DEC_REF(connp);
6017 6022 return (EBUSY);
6018 6023 }
6019 6024 connp->conn_minor_arena = ip_minor_arena_sa;
6020 6025 }
6021 6026
6022 6027 maj = getemajor(*devp);
6023 6028 *devp = makedevice(maj, (minor_t)connp->conn_dev);
6024 6029
6025 6030 /*
6026 6031 * connp->conn_cred is crfree()ed in ipcl_conn_destroy()
6027 6032 */
6028 6033 connp->conn_cred = credp;
6029 6034 connp->conn_cpid = curproc->p_pid;
6030 6035 /* Cache things in ixa without an extra refhold */
6031 6036 ASSERT(!(connp->conn_ixa->ixa_free_flags & IXA_FREE_CRED));
6032 6037 connp->conn_ixa->ixa_cred = connp->conn_cred;
6033 6038 connp->conn_ixa->ixa_cpid = connp->conn_cpid;
6034 6039 if (is_system_labeled())
6035 6040 connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred);
6036 6041
6037 6042 /*
6038 6043 * Handle IP_IOC_RTS_REQUEST and other ioctls which use conn_recv
6039 6044 */
6040 6045 connp->conn_recv = ip_conn_input;
6041 6046 connp->conn_recvicmp = ip_conn_input_icmp;
6042 6047
6043 6048 crhold(connp->conn_cred);
6044 6049
6045 6050 /*
6046 6051 * If the caller has the process-wide flag set, then default to MAC
6047 6052 * exempt mode. This allows read-down to unlabeled hosts.
6048 6053 */
6049 6054 if (getpflags(NET_MAC_AWARE, credp) != 0)
6050 6055 connp->conn_mac_mode = CONN_MAC_AWARE;
6051 6056
6052 6057 connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID);
6053 6058
6054 6059 connp->conn_rq = q;
6055 6060 connp->conn_wq = WR(q);
6056 6061
6057 6062 /* Non-zero default values */
6058 6063 connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP;
6059 6064
6060 6065 /*
6061 6066 * Make the conn globally visible to walkers
6062 6067 */
6063 6068 ASSERT(connp->conn_ref == 1);
6064 6069 mutex_enter(&connp->conn_lock);
6065 6070 connp->conn_state_flags &= ~CONN_INCIPIENT;
6066 6071 mutex_exit(&connp->conn_lock);
6067 6072
6068 6073 qprocson(q);
6069 6074
6070 6075 return (0);
6071 6076 }
6072 6077
6073 6078 /*
6074 6079 * Set IPsec policy from an ipsec_req_t. If the req is not "zero" and valid,
6075 6080 * all of them are copied to the conn_t. If the req is "zero", the policy is
6076 6081  * zeroed out. A "zero" policy has zero ipsr_{ah,esp,self_encap}_req
6077 6082 * fields.
6078 6083 * We keep only the latest setting of the policy and thus policy setting
6079 6084 * is not incremental/cumulative.
6080 6085 *
6081 6086 * Requests to set policies with multiple alternative actions will
6082 6087 * go through a different API.
6083 6088 */
6084 6089 int
6085 6090 ipsec_set_req(cred_t *cr, conn_t *connp, ipsec_req_t *req)
6086 6091 {
6087 6092 uint_t ah_req = 0;
6088 6093 uint_t esp_req = 0;
6089 6094 uint_t se_req = 0;
6090 6095 ipsec_act_t *actp = NULL;
6091 6096 uint_t nact;
6092 6097 ipsec_policy_head_t *ph;
6093 6098 boolean_t is_pol_reset, is_pol_inserted = B_FALSE;
6094 6099 int error = 0;
6095 6100 netstack_t *ns = connp->conn_netstack;
6096 6101 ip_stack_t *ipst = ns->netstack_ip;
6097 6102 ipsec_stack_t *ipss = ns->netstack_ipsec;
6098 6103
6099 6104 #define REQ_MASK (IPSEC_PREF_REQUIRED|IPSEC_PREF_NEVER)
6100 6105
6101 6106 /*
6102 6107 * The IP_SEC_OPT option does not allow variable length parameters,
6103 6108 * hence a request cannot be NULL.
6104 6109 */
6105 6110 if (req == NULL)
6106 6111 return (EINVAL);
6107 6112
6108 6113 ah_req = req->ipsr_ah_req;
6109 6114 esp_req = req->ipsr_esp_req;
6110 6115 se_req = req->ipsr_self_encap_req;
6111 6116
6112 6117 /* Don't allow setting self-encap without one or more of AH/ESP. */
6113 6118 if (se_req != 0 && esp_req == 0 && ah_req == 0)
6114 6119 return (EINVAL);
6115 6120
6116 6121 /*
6117 6122 * Are we dealing with a request to reset the policy (i.e.
6118 6123 	 * zero requests)?
6119 6124 */
6120 6125 is_pol_reset = ((ah_req & REQ_MASK) == 0 &&
6121 6126 (esp_req & REQ_MASK) == 0 &&
6122 6127 (se_req & REQ_MASK) == 0);
6123 6128
6124 6129 if (!is_pol_reset) {
6125 6130 /*
6126 6131 * If we couldn't load IPsec, fail with "protocol
6127 6132 * not supported".
6128 6133 * IPsec may not have been loaded for a request with zero
6129 6134 * policies, so we don't fail in this case.
6130 6135 */
6131 6136 mutex_enter(&ipss->ipsec_loader_lock);
6132 6137 if (ipss->ipsec_loader_state != IPSEC_LOADER_SUCCEEDED) {
6133 6138 mutex_exit(&ipss->ipsec_loader_lock);
6134 6139 return (EPROTONOSUPPORT);
6135 6140 }
6136 6141 mutex_exit(&ipss->ipsec_loader_lock);
6137 6142
6138 6143 /*
6139 6144 * Test for valid requests. Invalid algorithms
6140 6145 * need to be tested by IPsec code because new
6141 6146 * algorithms can be added dynamically.
6142 6147 */
6143 6148 if ((ah_req & ~(REQ_MASK|IPSEC_PREF_UNIQUE)) != 0 ||
6144 6149 (esp_req & ~(REQ_MASK|IPSEC_PREF_UNIQUE)) != 0 ||
6145 6150 (se_req & ~(REQ_MASK|IPSEC_PREF_UNIQUE)) != 0) {
6146 6151 return (EINVAL);
6147 6152 }
6148 6153
6149 6154 /*
6150 6155 * Only privileged users can issue these
6151 6156 * requests.
6152 6157 */
6153 6158 if (((ah_req & IPSEC_PREF_NEVER) ||
6154 6159 (esp_req & IPSEC_PREF_NEVER) ||
6155 6160 (se_req & IPSEC_PREF_NEVER)) &&
6156 6161 secpolicy_ip_config(cr, B_FALSE) != 0) {
6157 6162 return (EPERM);
6158 6163 }
6159 6164
6160 6165 /*
6161 6166 * The IPSEC_PREF_REQUIRED and IPSEC_PREF_NEVER
6162 6167 * are mutually exclusive.
6163 6168 */
6164 6169 if (((ah_req & REQ_MASK) == REQ_MASK) ||
6165 6170 ((esp_req & REQ_MASK) == REQ_MASK) ||
6166 6171 ((se_req & REQ_MASK) == REQ_MASK)) {
6167 6172 /* Both of them are set */
6168 6173 return (EINVAL);
6169 6174 }
6170 6175 }
6171 6176
6172 6177 ASSERT(MUTEX_HELD(&connp->conn_lock));
6173 6178
6174 6179 /*
6175 6180 * If we have already cached policies in conn_connect(), don't
6176 6181 * let them change now. We cache policies for connections
6177 6182 * whose src,dst [addr, port] is known.
6178 6183 */
6179 6184 if (connp->conn_policy_cached) {
6180 6185 return (EINVAL);
6181 6186 }
6182 6187
6183 6188 /*
6184 6189 	 * We have zero policies; reset the connection policy if already
6185 6190 * set. This will cause the connection to inherit the
6186 6191 * global policy, if any.
6187 6192 */
6188 6193 if (is_pol_reset) {
6189 6194 if (connp->conn_policy != NULL) {
6190 6195 IPPH_REFRELE(connp->conn_policy, ipst->ips_netstack);
6191 6196 connp->conn_policy = NULL;
6192 6197 }
6193 6198 connp->conn_in_enforce_policy = B_FALSE;
6194 6199 connp->conn_out_enforce_policy = B_FALSE;
6195 6200 return (0);
6196 6201 }
6197 6202
6198 6203 ph = connp->conn_policy = ipsec_polhead_split(connp->conn_policy,
6199 6204 ipst->ips_netstack);
6200 6205 if (ph == NULL)
6201 6206 goto enomem;
6202 6207
6203 6208 ipsec_actvec_from_req(req, &actp, &nact, ipst->ips_netstack);
6204 6209 if (actp == NULL)
6205 6210 goto enomem;
6206 6211
6207 6212 /*
6208 6213 * Always insert IPv4 policy entries, since they can also apply to
6209 6214 * ipv6 sockets being used in ipv4-compat mode.
6210 6215 */
6211 6216 if (!ipsec_polhead_insert(ph, actp, nact, IPSEC_AF_V4,
6212 6217 IPSEC_TYPE_INBOUND, ns))
6213 6218 goto enomem;
6214 6219 is_pol_inserted = B_TRUE;
6215 6220 if (!ipsec_polhead_insert(ph, actp, nact, IPSEC_AF_V4,
6216 6221 IPSEC_TYPE_OUTBOUND, ns))
6217 6222 goto enomem;
6218 6223
6219 6224 /*
6220 6225 	 * If we're looking at a v6 socket, also insert the v6-specific
6221 6226 * entries.
6222 6227 */
6223 6228 if (connp->conn_family == AF_INET6) {
6224 6229 if (!ipsec_polhead_insert(ph, actp, nact, IPSEC_AF_V6,
6225 6230 IPSEC_TYPE_INBOUND, ns))
6226 6231 goto enomem;
6227 6232 if (!ipsec_polhead_insert(ph, actp, nact, IPSEC_AF_V6,
6228 6233 IPSEC_TYPE_OUTBOUND, ns))
6229 6234 goto enomem;
6230 6235 }
6231 6236
6232 6237 ipsec_actvec_free(actp, nact);
6233 6238
6234 6239 /*
6235 6240 * If the requests need security, set enforce_policy.
6236 6241 * If the requests are IPSEC_PREF_NEVER, one should
6237 6242 * still set conn_out_enforce_policy so that ip_set_destination
6238 6243 	 * marks the ip_xmit_attr_t appropriately. This is needed so that
6239 6244 * for connections that we don't cache policy in at connect time,
6240 6245 * if global policy matches in ip_output_attach_policy, we
6241 6246 * don't wrongly inherit global policy. Similarly, we need
6242 6247 * to set conn_in_enforce_policy also so that we don't verify
6243 6248 * policy wrongly.
6244 6249 */
6245 6250 if ((ah_req & REQ_MASK) != 0 ||
6246 6251 (esp_req & REQ_MASK) != 0 ||
6247 6252 (se_req & REQ_MASK) != 0) {
6248 6253 connp->conn_in_enforce_policy = B_TRUE;
6249 6254 connp->conn_out_enforce_policy = B_TRUE;
6250 6255 }
6251 6256
6252 6257 return (error);
6253 6258 #undef REQ_MASK
6254 6259
6255 6260 /*
6256 6261 * Common memory-allocation-failure exit path.
6257 6262 */
6258 6263 enomem:
6259 6264 if (actp != NULL)
6260 6265 ipsec_actvec_free(actp, nact);
6261 6266 if (is_pol_inserted)
6262 6267 ipsec_polhead_flush(ph, ns);
6263 6268 return (ENOMEM);
6264 6269 }
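For context, ipsec_set_req() is reached when an application sets per-socket IPsec policy through the IP_SEC_OPT socket option with an ipsec_req_t (see ipsec(7P)). A hedged userland sketch, assuming a Solaris/illumos libc and leaving the algorithm fields at their defaults:

    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <string.h>

    /*
     * Require ESP on this socket. An all-zero ipsec_req_t would instead
     * reset the per-socket policy, falling back to global policy, as the
     * "zero" case above describes.
     */
    int
    require_esp(int fd)
    {
    	ipsec_req_t req;

    	(void) memset(&req, 0, sizeof (req));
    	req.ipsr_esp_req = IPSEC_PREF_REQUIRED;

    	return (setsockopt(fd, IPPROTO_IP, IP_SEC_OPT,
    	    &req, sizeof (req)));
    }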
6265 6270
6266 6271 /*
6267 6272 * Set socket options for joining and leaving multicast groups.
6268 6273 * Common to IPv4 and IPv6; inet6 indicates the type of socket.
6269 6274  * The caller has already checked that the option name is consistent with
6270 6275 * the address family of the socket.
6271 6276 */
6272 6277 int
6273 6278 ip_opt_set_multicast_group(conn_t *connp, t_scalar_t name,
6274 6279 uchar_t *invalp, boolean_t inet6, boolean_t checkonly)
6275 6280 {
6276 6281 int *i1 = (int *)invalp;
6277 6282 int error = 0;
6278 6283 ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
6279 6284 struct ip_mreq *v4_mreqp;
6280 6285 struct ipv6_mreq *v6_mreqp;
6281 6286 struct group_req *greqp;
6282 6287 ire_t *ire;
6283 6288 boolean_t done = B_FALSE;
6284 6289 ipaddr_t ifaddr;
6285 6290 in6_addr_t v6group;
6286 6291 uint_t ifindex;
6287 6292 boolean_t mcast_opt = B_TRUE;
6288 6293 mcast_record_t fmode;
6289 6294 int (*optfn)(conn_t *, boolean_t, const in6_addr_t *,
6290 6295 ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *);
6291 6296
6292 6297 switch (name) {
6293 6298 case IP_ADD_MEMBERSHIP:
6294 6299 case IPV6_JOIN_GROUP:
6295 6300 mcast_opt = B_FALSE;
6296 6301 /* FALLTHRU */
6297 6302 case MCAST_JOIN_GROUP:
6298 6303 fmode = MODE_IS_EXCLUDE;
6299 6304 optfn = ip_opt_add_group;
6300 6305 break;
6301 6306
6302 6307 case IP_DROP_MEMBERSHIP:
6303 6308 case IPV6_LEAVE_GROUP:
6304 6309 mcast_opt = B_FALSE;
6305 6310 /* FALLTHRU */
6306 6311 case MCAST_LEAVE_GROUP:
6307 6312 fmode = MODE_IS_INCLUDE;
6308 6313 optfn = ip_opt_delete_group;
6309 6314 break;
6310 6315 default:
6311 6316 ASSERT(0);
6312 6317 }
6313 6318
6314 6319 if (mcast_opt) {
6315 6320 struct sockaddr_in *sin;
6316 6321 struct sockaddr_in6 *sin6;
6317 6322
6318 6323 greqp = (struct group_req *)i1;
6319 6324 if (greqp->gr_group.ss_family == AF_INET) {
6320 6325 sin = (struct sockaddr_in *)&(greqp->gr_group);
6321 6326 IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &v6group);
6322 6327 } else {
6323 6328 if (!inet6)
6324 6329 return (EINVAL); /* Not on INET socket */
6325 6330
6326 6331 sin6 = (struct sockaddr_in6 *)&(greqp->gr_group);
6327 6332 v6group = sin6->sin6_addr;
6328 6333 }
6329 6334 ifaddr = INADDR_ANY;
6330 6335 ifindex = greqp->gr_interface;
6331 6336 } else if (inet6) {
6332 6337 v6_mreqp = (struct ipv6_mreq *)i1;
6333 6338 v6group = v6_mreqp->ipv6mr_multiaddr;
6334 6339 ifaddr = INADDR_ANY;
6335 6340 ifindex = v6_mreqp->ipv6mr_interface;
6336 6341 } else {
6337 6342 v4_mreqp = (struct ip_mreq *)i1;
6338 6343 IN6_INADDR_TO_V4MAPPED(&v4_mreqp->imr_multiaddr, &v6group);
6339 6344 ifaddr = (ipaddr_t)v4_mreqp->imr_interface.s_addr;
6340 6345 ifindex = 0;
6341 6346 }
6342 6347
6343 6348 /*
6344 6349 * In the multirouting case, we need to replicate
6345 6350 * the request on all interfaces that will take part
6346 6351 * in replication. We do so because multirouting is
6347 6352 	 * reflective, thus we will probably receive
6348 6353 	 * multicasts on those interfaces.
6349 6354 * The ip_multirt_apply_membership() succeeds if
6350 6355 * the operation succeeds on at least one interface.
6351 6356 */
6352 6357 if (IN6_IS_ADDR_V4MAPPED(&v6group)) {
6353 6358 ipaddr_t group;
6354 6359
6355 6360 IN6_V4MAPPED_TO_IPADDR(&v6group, group);
6356 6361
6357 6362 ire = ire_ftable_lookup_v4(group, IP_HOST_MASK, 0,
6358 6363 IRE_HOST | IRE_INTERFACE, NULL, ALL_ZONES, NULL,
6359 6364 MATCH_IRE_MASK | MATCH_IRE_TYPE, 0, ipst, NULL);
6360 6365 } else {
6361 6366 ire = ire_ftable_lookup_v6(&v6group, &ipv6_all_ones, 0,
6362 6367 IRE_HOST | IRE_INTERFACE, NULL, ALL_ZONES, NULL,
6363 6368 MATCH_IRE_MASK | MATCH_IRE_TYPE, 0, ipst, NULL);
6364 6369 }
6365 6370 if (ire != NULL) {
6366 6371 if (ire->ire_flags & RTF_MULTIRT) {
6367 6372 error = ip_multirt_apply_membership(optfn, ire, connp,
6368 6373 checkonly, &v6group, fmode, &ipv6_all_zeros);
6369 6374 done = B_TRUE;
6370 6375 }
6371 6376 ire_refrele(ire);
6372 6377 }
6373 6378
6374 6379 if (!done) {
6375 6380 error = optfn(connp, checkonly, &v6group, ifaddr, ifindex,
6376 6381 fmode, &ipv6_all_zeros);
6377 6382 }
6378 6383 return (error);
6379 6384 }
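The option names dispatched above are the application-visible join/leave interface. For illustration, a minimal sketch of the protocol-independent form (struct group_req, RFC 3678), which arrives here with mcast_opt set; the interface name is a placeholder:

    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <arpa/inet.h>
    #include <net/if.h>
    #include <string.h>

    /* Join 224.1.2.3 on the named interface (index 0 would mean "any"). */
    int
    join_group(int fd, const char *ifname)
    {
    	struct group_req greq;
    	struct sockaddr_in *sin;

    	(void) memset(&greq, 0, sizeof (greq));
    	greq.gr_interface = if_nametoindex(ifname);
    	sin = (struct sockaddr_in *)&greq.gr_group;
    	sin->sin_family = AF_INET;
    	(void) inet_pton(AF_INET, "224.1.2.3", &sin->sin_addr);

    	return (setsockopt(fd, IPPROTO_IP, MCAST_JOIN_GROUP,
    	    &greq, sizeof (greq)));
    }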
6380 6385
6381 6386 /*
6382 6387 * Set socket options for joining and leaving multicast groups
6383 6388 * for specific sources.
6384 6389 * Common to IPv4 and IPv6; inet6 indicates the type of socket.
6385 6390  * The caller has already checked that the option name is consistent with
6386 6391 * the address family of the socket.
6387 6392 */
6388 6393 int
6389 6394 ip_opt_set_multicast_sources(conn_t *connp, t_scalar_t name,
6390 6395 uchar_t *invalp, boolean_t inet6, boolean_t checkonly)
6391 6396 {
6392 6397 int *i1 = (int *)invalp;
6393 6398 int error = 0;
6394 6399 ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
6395 6400 struct ip_mreq_source *imreqp;
6396 6401 struct group_source_req *gsreqp;
6397 6402 in6_addr_t v6group, v6src;
6398 6403 uint32_t ifindex;
6399 6404 ipaddr_t ifaddr;
6400 6405 boolean_t mcast_opt = B_TRUE;
6401 6406 mcast_record_t fmode;
6402 6407 ire_t *ire;
6403 6408 boolean_t done = B_FALSE;
6404 6409 int (*optfn)(conn_t *, boolean_t, const in6_addr_t *,
6405 6410 ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *);
6406 6411
6407 6412 switch (name) {
6408 6413 case IP_BLOCK_SOURCE:
6409 6414 mcast_opt = B_FALSE;
6410 6415 /* FALLTHRU */
6411 6416 case MCAST_BLOCK_SOURCE:
6412 6417 fmode = MODE_IS_EXCLUDE;
6413 6418 optfn = ip_opt_add_group;
6414 6419 break;
6415 6420
6416 6421 case IP_UNBLOCK_SOURCE:
6417 6422 mcast_opt = B_FALSE;
6418 6423 /* FALLTHRU */
6419 6424 case MCAST_UNBLOCK_SOURCE:
6420 6425 fmode = MODE_IS_EXCLUDE;
6421 6426 optfn = ip_opt_delete_group;
6422 6427 break;
6423 6428
6424 6429 case IP_ADD_SOURCE_MEMBERSHIP:
6425 6430 mcast_opt = B_FALSE;
6426 6431 /* FALLTHRU */
6427 6432 case MCAST_JOIN_SOURCE_GROUP:
6428 6433 fmode = MODE_IS_INCLUDE;
6429 6434 optfn = ip_opt_add_group;
6430 6435 break;
6431 6436
6432 6437 case IP_DROP_SOURCE_MEMBERSHIP:
6433 6438 mcast_opt = B_FALSE;
6434 6439 /* FALLTHRU */
6435 6440 case MCAST_LEAVE_SOURCE_GROUP:
6436 6441 fmode = MODE_IS_INCLUDE;
6437 6442 optfn = ip_opt_delete_group;
6438 6443 break;
6439 6444 default:
6440 6445 ASSERT(0);
6441 6446 }
6442 6447
6443 6448 if (mcast_opt) {
6444 6449 gsreqp = (struct group_source_req *)i1;
6445 6450 ifindex = gsreqp->gsr_interface;
6446 6451 if (gsreqp->gsr_group.ss_family == AF_INET) {
6447 6452 struct sockaddr_in *s;
6448 6453 s = (struct sockaddr_in *)&gsreqp->gsr_group;
6449 6454 IN6_INADDR_TO_V4MAPPED(&s->sin_addr, &v6group);
6450 6455 s = (struct sockaddr_in *)&gsreqp->gsr_source;
6451 6456 IN6_INADDR_TO_V4MAPPED(&s->sin_addr, &v6src);
6452 6457 } else {
6453 6458 struct sockaddr_in6 *s6;
6454 6459
6455 6460 if (!inet6)
6456 6461 return (EINVAL); /* Not on INET socket */
6457 6462
6458 6463 s6 = (struct sockaddr_in6 *)&gsreqp->gsr_group;
6459 6464 v6group = s6->sin6_addr;
6460 6465 s6 = (struct sockaddr_in6 *)&gsreqp->gsr_source;
6461 6466 v6src = s6->sin6_addr;
6462 6467 }
6463 6468 ifaddr = INADDR_ANY;
6464 6469 } else {
6465 6470 imreqp = (struct ip_mreq_source *)i1;
6466 6471 IN6_INADDR_TO_V4MAPPED(&imreqp->imr_multiaddr, &v6group);
6467 6472 IN6_INADDR_TO_V4MAPPED(&imreqp->imr_sourceaddr, &v6src);
6468 6473 ifaddr = (ipaddr_t)imreqp->imr_interface.s_addr;
6469 6474 ifindex = 0;
6470 6475 }
6471 6476
6472 6477 /*
6473 6478 * Handle src being mapped INADDR_ANY by changing it to unspecified.
6474 6479 */
6475 6480 if (IN6_IS_ADDR_V4MAPPED_ANY(&v6src))
6476 6481 v6src = ipv6_all_zeros;
6477 6482
6478 6483 /*
6479 6484 * In the multirouting case, we need to replicate
6480 6485 * the request as noted in the mcast cases above.
6481 6486 */
6482 6487 if (IN6_IS_ADDR_V4MAPPED(&v6group)) {
6483 6488 ipaddr_t group;
6484 6489
6485 6490 IN6_V4MAPPED_TO_IPADDR(&v6group, group);
6486 6491
6487 6492 ire = ire_ftable_lookup_v4(group, IP_HOST_MASK, 0,
6488 6493 IRE_HOST | IRE_INTERFACE, NULL, ALL_ZONES, NULL,
6489 6494 MATCH_IRE_MASK | MATCH_IRE_TYPE, 0, ipst, NULL);
6490 6495 } else {
6491 6496 ire = ire_ftable_lookup_v6(&v6group, &ipv6_all_ones, 0,
6492 6497 IRE_HOST | IRE_INTERFACE, NULL, ALL_ZONES, NULL,
6493 6498 MATCH_IRE_MASK | MATCH_IRE_TYPE, 0, ipst, NULL);
6494 6499 }
6495 6500 if (ire != NULL) {
6496 6501 if (ire->ire_flags & RTF_MULTIRT) {
6497 6502 error = ip_multirt_apply_membership(optfn, ire, connp,
6498 6503 checkonly, &v6group, fmode, &v6src);
6499 6504 done = B_TRUE;
6500 6505 }
6501 6506 ire_refrele(ire);
6502 6507 }
6503 6508 if (!done) {
6504 6509 error = optfn(connp, checkonly, &v6group, ifaddr, ifindex,
6505 6510 fmode, &v6src);
6506 6511 }
6507 6512 return (error);
6508 6513 }
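Similarly, the source-filtered options above correspond to the RFC 3678 struct group_source_req interface. A hedged sketch of a source-specific join (the addresses are documentation examples):

    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <arpa/inet.h>
    #include <net/if.h>
    #include <string.h>

    /* Receive 232.1.1.1 only from source 192.0.2.1 on ifname. */
    int
    join_source_group(int fd, const char *ifname)
    {
    	struct group_source_req gsreq;
    	struct sockaddr_in *sin;

    	(void) memset(&gsreq, 0, sizeof (gsreq));
    	gsreq.gsr_interface = if_nametoindex(ifname);

    	sin = (struct sockaddr_in *)&gsreq.gsr_group;
    	sin->sin_family = AF_INET;
    	(void) inet_pton(AF_INET, "232.1.1.1", &sin->sin_addr);

    	sin = (struct sockaddr_in *)&gsreq.gsr_source;
    	sin->sin_family = AF_INET;
    	(void) inet_pton(AF_INET, "192.0.2.1", &sin->sin_addr);

    	return (setsockopt(fd, IPPROTO_IP, MCAST_JOIN_SOURCE_GROUP,
    	    &gsreq, sizeof (gsreq)));
    }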
6509 6514
6510 6515 /*
6511 6516  * Given a destination address and a pointer to where to put the information,
6512 6517 * this routine fills in the mtuinfo.
6513 6518 * The socket must be connected.
6514 6519 * For sctp conn_faddr is the primary address.
6515 6520 */
6516 6521 int
6517 6522 ip_fill_mtuinfo(conn_t *connp, ip_xmit_attr_t *ixa, struct ip6_mtuinfo *mtuinfo)
6518 6523 {
6519 6524 uint32_t pmtu = IP_MAXPACKET;
6520 6525 uint_t scopeid;
6521 6526
6522 6527 if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6))
6523 6528 return (-1);
6524 6529
6525 6530 /* In case we never sent or called ip_set_destination_v4/v6 */
6526 6531 if (ixa->ixa_ire != NULL)
6527 6532 pmtu = ip_get_pmtu(ixa);
6528 6533
6529 6534 if (ixa->ixa_flags & IXAF_SCOPEID_SET)
6530 6535 scopeid = ixa->ixa_scopeid;
6531 6536 else
6532 6537 scopeid = 0;
6533 6538
6534 6539 bzero(mtuinfo, sizeof (*mtuinfo));
6535 6540 mtuinfo->ip6m_addr.sin6_family = AF_INET6;
6536 6541 mtuinfo->ip6m_addr.sin6_port = connp->conn_fport;
6537 6542 mtuinfo->ip6m_addr.sin6_addr = connp->conn_faddr_v6;
6538 6543 mtuinfo->ip6m_addr.sin6_scope_id = scopeid;
6539 6544 mtuinfo->ip6m_mtu = pmtu;
6540 6545
6541 6546 return (sizeof (struct ip6_mtuinfo));
6542 6547 }
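One consumer of this information is the RFC 3542 IPV6_PATHMTU socket option. A hedged sketch of the reader side, assuming a connected IPv6 socket (the option fails otherwise, matching the unspecified-address check above):

    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <stdio.h>

    /* Print the current path MTU for a connected IPv6 socket. */
    int
    print_path_mtu(int fd)
    {
    	struct ip6_mtuinfo mtuinfo;
    	socklen_t len = sizeof (mtuinfo);

    	if (getsockopt(fd, IPPROTO_IPV6, IPV6_PATHMTU,
    	    &mtuinfo, &len) != 0)
    		return (-1);

    	printf("path MTU: %u\n", mtuinfo.ip6m_mtu);
    	return (0);
    }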
6543 6548
6544 6549 /*
6545 6550 * When the src multihoming is changed from weak to [strong, preferred]
6546 6551 * ip_ire_rebind_walker is called to walk the list of all ire_t entries
6547 6552 * and identify routes that were created by user-applications in the
6548 6553 * unbound state (i.e., without RTA_IFP), and for which an ire_ill is not
6549 6554 * currently defined. These routes are then 'rebound', i.e., their ire_ill
6550 6555 * is selected by finding an interface route for the gateway.
6551 6556 */
6552 6557 /* ARGSUSED */
6553 6558 void
6554 6559 ip_ire_rebind_walker(ire_t *ire, void *notused)
6555 6560 {
6556 6561 if (!ire->ire_unbound || ire->ire_ill != NULL)
6557 6562 return;
6558 6563 ire_rebind(ire);
6559 6564 ire_delete(ire);
6560 6565 }
6561 6566
6562 6567 /*
6563 6568 * When the src multihoming is changed from [strong, preferred] to weak,
6564 6569 * ip_ire_unbind_walker is called to walk the list of all ire_t entries, and
6565 6570 * set any entries that were created by user-applications in the unbound state
6566 6571 * (i.e., without RTA_IFP) back to having a NULL ire_ill.
6567 6572 */
6568 6573 /* ARGSUSED */
6569 6574 void
6570 6575 ip_ire_unbind_walker(ire_t *ire, void *notused)
6571 6576 {
6572 6577 ire_t *new_ire;
6573 6578
6574 6579 if (!ire->ire_unbound || ire->ire_ill == NULL)
6575 6580 return;
6576 6581 if (ire->ire_ipversion == IPV6_VERSION) {
6577 6582 new_ire = ire_create_v6(&ire->ire_addr_v6, &ire->ire_mask_v6,
6578 6583 &ire->ire_gateway_addr_v6, ire->ire_type, NULL,
6579 6584 ire->ire_zoneid, ire->ire_flags, NULL, ire->ire_ipst);
6580 6585 } else {
6581 6586 new_ire = ire_create((uchar_t *)&ire->ire_addr,
6582 6587 (uchar_t *)&ire->ire_mask,
6583 6588 (uchar_t *)&ire->ire_gateway_addr, ire->ire_type, NULL,
6584 6589 ire->ire_zoneid, ire->ire_flags, NULL, ire->ire_ipst);
6585 6590 }
6586 6591 if (new_ire == NULL)
6587 6592 return;
6588 6593 new_ire->ire_unbound = B_TRUE;
6589 6594 /*
6590 6595 * The bound ire must first be deleted so that we don't return
6591 6596 * the existing one on the attempt to add the unbound new_ire.
6592 6597 */
6593 6598 ire_delete(ire);
6594 6599 new_ire = ire_add(new_ire);
6595 6600 if (new_ire != NULL)
6596 6601 ire_refrele(new_ire);
6597 6602 }
6598 6603
6599 6604 /*
6600 6605 * When the settings of ip*_strict_src_multihoming tunables are changed,
6601 6606 * all cached routes need to be recomputed. This recomputation needs to be
6602 6607 * done when going from weaker to stronger modes so that the cached ire
6603 6608 * for the connection does not violate the current ip*_strict_src_multihoming
6604 6609 * setting. It also needs to be done when going from stronger to weaker modes,
6605 6610 * so that we fall back to matching on the longest-matching-route (as opposed
6606 6611 * to a shorter match that may have been selected in the strong mode
6607 6612 * to satisfy src_multihoming settings).
6608 6613 *
6609 6614  * The cached ixa_ire entries for all conn_t entries are marked as
6610 6615 * "verify" so that they will be recomputed for the next packet.
6611 6616 */
6612 6617 void
6613 6618 conn_ire_revalidate(conn_t *connp, void *arg)
6614 6619 {
6615 6620 boolean_t isv6 = (boolean_t)arg;
6616 6621
6617 6622 if ((isv6 && connp->conn_ipversion != IPV6_VERSION) ||
6618 6623 (!isv6 && connp->conn_ipversion != IPV4_VERSION))
6619 6624 return;
6620 6625 connp->conn_ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
6621 6626 }
6622 6627
6623 6628 /*
6624 6629  * Handles both IPv4 and IPv6 reassembly, doing the out-of-order cases.
6625 6630  * When an ipf is passed here for the first time, if
6626 6631 * we already have in-order fragments on the queue, we convert from the fast-
6627 6632 * path reassembly scheme to the hard-case scheme. From then on, additional
6628 6633 * fragments are reassembled here. We keep track of the start and end offsets
6629 6634 * of each piece, and the number of holes in the chain. When the hole count
6630 6635 * goes to zero, we are done!
6631 6636 *
6632 6637 * The ipf_count will be updated to account for any mblk(s) added (pointed to
6633 6638  * by mp) or subtracted (freeb()ed dups); upon return the caller must update
6634 6639 * ipfb_count and ill_frag_count by the difference of ipf_count before and
6635 6640 * after the call to ip_reassemble().
6636 6641 */
6637 6642 int
6638 6643 ip_reassemble(mblk_t *mp, ipf_t *ipf, uint_t start, boolean_t more, ill_t *ill,
6639 6644 size_t msg_len)
6640 6645 {
6641 6646 uint_t end;
6642 6647 mblk_t *next_mp;
6643 6648 mblk_t *mp1;
6644 6649 uint_t offset;
6645 6650 boolean_t incr_dups = B_TRUE;
6646 6651 boolean_t offset_zero_seen = B_FALSE;
6647 6652 boolean_t pkt_boundary_checked = B_FALSE;
6648 6653
6649 6654 /* If start == 0 then ipf_nf_hdr_len has to be set. */
6650 6655 ASSERT(start != 0 || ipf->ipf_nf_hdr_len != 0);
6651 6656
6652 6657 /* Add in byte count */
6653 6658 ipf->ipf_count += msg_len;
6654 6659 if (ipf->ipf_end) {
6655 6660 /*
6656 6661 * We were part way through in-order reassembly, but now there
6657 6662 * is a hole. We walk through messages already queued, and
6658 6663 * mark them for hard case reassembly. We know that up till
6659 6664 * now they were in order starting from offset zero.
6660 6665 */
6661 6666 offset = 0;
6662 6667 for (mp1 = ipf->ipf_mp->b_cont; mp1; mp1 = mp1->b_cont) {
6663 6668 IP_REASS_SET_START(mp1, offset);
6664 6669 if (offset == 0) {
6665 6670 ASSERT(ipf->ipf_nf_hdr_len != 0);
6666 6671 offset = -ipf->ipf_nf_hdr_len;
6667 6672 }
6668 6673 offset += mp1->b_wptr - mp1->b_rptr;
6669 6674 IP_REASS_SET_END(mp1, offset);
6670 6675 }
6671 6676 /* One hole at the end. */
6672 6677 ipf->ipf_hole_cnt = 1;
6673 6678 /* Brand it as a hard case, forever. */
6674 6679 ipf->ipf_end = 0;
6675 6680 }
6676 6681 /* Walk through all the new pieces. */
6677 6682 do {
6678 6683 end = start + (mp->b_wptr - mp->b_rptr);
6679 6684 /*
6680 6685 * If start is 0, decrease 'end' only for the first mblk of
6681 6686 		 * the fragment. Otherwise 'end' can get the wrong value in the
6682 6687 * second pass of the loop if first mblk is exactly the
6683 6688 * size of ipf_nf_hdr_len.
6684 6689 */
6685 6690 if (start == 0 && !offset_zero_seen) {
6686 6691 /* First segment */
6687 6692 ASSERT(ipf->ipf_nf_hdr_len != 0);
6688 6693 end -= ipf->ipf_nf_hdr_len;
6689 6694 offset_zero_seen = B_TRUE;
6690 6695 }
6691 6696 next_mp = mp->b_cont;
6692 6697 /*
6693 6698 		 * We are checking to see if there is any interesting data
6694 6699 * to process. If there isn't and the mblk isn't the
6695 6700 * one which carries the unfragmentable header then we
6696 6701 * drop it. It's possible to have just the unfragmentable
6697 6702 * header come through without any data. That needs to be
6698 6703 * saved.
6699 6704 *
6700 6705 * If the assert at the top of this function holds then the
6701 6706 * term "ipf->ipf_nf_hdr_len != 0" isn't needed. This code
6702 6707 * is infrequently traveled enough that the test is left in
6703 6708 * to protect against future code changes which break that
6704 6709 * invariant.
6705 6710 */
6706 6711 if (start == end && start != 0 && ipf->ipf_nf_hdr_len != 0) {
6707 6712 /* Empty. Blast it. */
6708 6713 IP_REASS_SET_START(mp, 0);
6709 6714 IP_REASS_SET_END(mp, 0);
6710 6715 /*
6711 6716 * If the ipf points to the mblk we are about to free,
6712 6717 * update ipf to point to the next mblk (or NULL
6713 6718 * if none).
6714 6719 */
6715 6720 if (ipf->ipf_mp->b_cont == mp)
6716 6721 ipf->ipf_mp->b_cont = next_mp;
6717 6722 freeb(mp);
6718 6723 continue;
6719 6724 }
6720 6725 mp->b_cont = NULL;
6721 6726 IP_REASS_SET_START(mp, start);
6722 6727 IP_REASS_SET_END(mp, end);
6723 6728 if (!ipf->ipf_tail_mp) {
6724 6729 ipf->ipf_tail_mp = mp;
6725 6730 ipf->ipf_mp->b_cont = mp;
6726 6731 if (start == 0 || !more) {
6727 6732 ipf->ipf_hole_cnt = 1;
6728 6733 /*
6729 6734 				 * If the first fragment comes in more than one
6730 6735 * mblk, this loop will be executed for each
6731 6736 * mblk. Need to adjust hole count so exiting
6732 6737 * this routine will leave hole count at 1.
6733 6738 */
6734 6739 if (next_mp)
6735 6740 ipf->ipf_hole_cnt++;
6736 6741 } else
6737 6742 ipf->ipf_hole_cnt = 2;
6738 6743 continue;
6739 6744 } else if (ipf->ipf_last_frag_seen && !more &&
6740 6745 !pkt_boundary_checked) {
6741 6746 /*
6742 6747 * We check datagram boundary only if this fragment
6743 6748 * claims to be the last fragment and we have seen a
6744 6749 * last fragment in the past too. We do this only
6745 6750 * once for a given fragment.
6746 6751 *
6747 6752 * start cannot be 0 here as fragments with start=0
6748 6753 * and MF=0 gets handled as a complete packet. These
6749 6754 * fragments should not reach here.
6750 6755 */
6751 6756
6752 6757 if (start + msgdsize(mp) !=
6753 6758 IP_REASS_END(ipf->ipf_tail_mp)) {
6754 6759 /*
6755 6760 * We have two fragments both of which claim
6756 6761 				 * to be the last fragment but give conflicting
6757 6762 * information about the whole datagram size.
6758 6763 * Something fishy is going on. Drop the
6759 6764 * fragment and free up the reassembly list.
6760 6765 */
6761 6766 return (IP_REASS_FAILED);
6762 6767 }
6763 6768
6764 6769 /*
6765 6770 * We shouldn't come to this code block again for this
6766 6771 * particular fragment.
6767 6772 */
6768 6773 pkt_boundary_checked = B_TRUE;
6769 6774 }
6770 6775
6771 6776 /* New stuff at or beyond tail? */
6772 6777 offset = IP_REASS_END(ipf->ipf_tail_mp);
6773 6778 if (start >= offset) {
6774 6779 if (ipf->ipf_last_frag_seen) {
6775 6780 /* current fragment is beyond last fragment */
6776 6781 return (IP_REASS_FAILED);
6777 6782 }
6778 6783 /* Link it on end. */
6779 6784 ipf->ipf_tail_mp->b_cont = mp;
6780 6785 ipf->ipf_tail_mp = mp;
6781 6786 if (more) {
6782 6787 if (start != offset)
6783 6788 ipf->ipf_hole_cnt++;
6784 6789 } else if (start == offset && next_mp == NULL)
6785 6790 ipf->ipf_hole_cnt--;
6786 6791 continue;
6787 6792 }
6788 6793 mp1 = ipf->ipf_mp->b_cont;
6789 6794 offset = IP_REASS_START(mp1);
6790 6795 /* New stuff at the front? */
6791 6796 if (start < offset) {
6792 6797 if (start == 0) {
6793 6798 if (end >= offset) {
6794 6799 					/* Nailed the hole at the beginning. */
6795 6800 ipf->ipf_hole_cnt--;
6796 6801 }
6797 6802 } else if (end < offset) {
6798 6803 /*
6799 6804 * A hole, stuff, and a hole where there used
6800 6805 * to be just a hole.
6801 6806 */
6802 6807 ipf->ipf_hole_cnt++;
6803 6808 }
6804 6809 mp->b_cont = mp1;
6805 6810 /* Check for overlap. */
6806 6811 while (end > offset) {
6807 6812 if (end < IP_REASS_END(mp1)) {
6808 6813 mp->b_wptr -= end - offset;
6809 6814 IP_REASS_SET_END(mp, offset);
6810 6815 BUMP_MIB(ill->ill_ip_mib,
6811 6816 ipIfStatsReasmPartDups);
6812 6817 break;
6813 6818 }
6814 6819 /* Did we cover another hole? */
6815 6820 if ((mp1->b_cont &&
6816 6821 IP_REASS_END(mp1) !=
6817 6822 IP_REASS_START(mp1->b_cont) &&
6818 6823 end >= IP_REASS_START(mp1->b_cont)) ||
6819 6824 (!ipf->ipf_last_frag_seen && !more)) {
6820 6825 ipf->ipf_hole_cnt--;
6821 6826 }
6822 6827 /* Clip out mp1. */
6823 6828 if ((mp->b_cont = mp1->b_cont) == NULL) {
6824 6829 /*
6825 6830 * After clipping out mp1, this guy
6826 6831 * is now hanging off the end.
6827 6832 */
6828 6833 ipf->ipf_tail_mp = mp;
6829 6834 }
6830 6835 IP_REASS_SET_START(mp1, 0);
6831 6836 IP_REASS_SET_END(mp1, 0);
6832 6837 /* Subtract byte count */
6833 6838 ipf->ipf_count -= mp1->b_datap->db_lim -
6834 6839 mp1->b_datap->db_base;
6835 6840 freeb(mp1);
6836 6841 BUMP_MIB(ill->ill_ip_mib,
6837 6842 ipIfStatsReasmPartDups);
6838 6843 mp1 = mp->b_cont;
6839 6844 if (!mp1)
6840 6845 break;
6841 6846 offset = IP_REASS_START(mp1);
6842 6847 }
6843 6848 ipf->ipf_mp->b_cont = mp;
6844 6849 continue;
6845 6850 }
6846 6851 /*
6847 6852 		 * The new piece starts somewhere between the start of the head
6848 6853 		 * and the end of the tail.
6849 6854 */
6850 6855 for (; mp1; mp1 = mp1->b_cont) {
6851 6856 offset = IP_REASS_END(mp1);
6852 6857 if (start < offset) {
6853 6858 if (end <= offset) {
6854 6859 /* Nothing new. */
6855 6860 IP_REASS_SET_START(mp, 0);
6856 6861 IP_REASS_SET_END(mp, 0);
6857 6862 /* Subtract byte count */
6858 6863 ipf->ipf_count -= mp->b_datap->db_lim -
6859 6864 mp->b_datap->db_base;
6860 6865 if (incr_dups) {
6861 6866 ipf->ipf_num_dups++;
6862 6867 incr_dups = B_FALSE;
6863 6868 }
6864 6869 freeb(mp);
6865 6870 BUMP_MIB(ill->ill_ip_mib,
6866 6871 ipIfStatsReasmDuplicates);
6867 6872 break;
6868 6873 }
6869 6874 /*
6870 6875 * Trim redundant stuff off beginning of new
6871 6876 * piece.
6872 6877 */
6873 6878 IP_REASS_SET_START(mp, offset);
6874 6879 mp->b_rptr += offset - start;
6875 6880 BUMP_MIB(ill->ill_ip_mib,
6876 6881 ipIfStatsReasmPartDups);
6877 6882 start = offset;
6878 6883 if (!mp1->b_cont) {
6879 6884 /*
6880 6885 * After trimming, this guy is now
6881 6886 * hanging off the end.
6882 6887 */
6883 6888 mp1->b_cont = mp;
6884 6889 ipf->ipf_tail_mp = mp;
6885 6890 if (!more) {
6886 6891 ipf->ipf_hole_cnt--;
6887 6892 }
6888 6893 break;
6889 6894 }
6890 6895 }
6891 6896 if (start >= IP_REASS_START(mp1->b_cont))
6892 6897 continue;
6893 6898 /* Fill a hole */
6894 6899 if (start > offset)
6895 6900 ipf->ipf_hole_cnt++;
6896 6901 mp->b_cont = mp1->b_cont;
6897 6902 mp1->b_cont = mp;
6898 6903 mp1 = mp->b_cont;
6899 6904 offset = IP_REASS_START(mp1);
6900 6905 if (end >= offset) {
6901 6906 ipf->ipf_hole_cnt--;
6902 6907 /* Check for overlap. */
6903 6908 while (end > offset) {
6904 6909 if (end < IP_REASS_END(mp1)) {
6905 6910 mp->b_wptr -= end - offset;
6906 6911 IP_REASS_SET_END(mp, offset);
6907 6912 /*
6908 6913 * TODO we might bump
6909 6914 * this up twice if there is
6910 6915 * overlap at both ends.
6911 6916 */
6912 6917 BUMP_MIB(ill->ill_ip_mib,
6913 6918 ipIfStatsReasmPartDups);
6914 6919 break;
6915 6920 }
6916 6921 /* Did we cover another hole? */
6917 6922 if ((mp1->b_cont &&
6918 6923 IP_REASS_END(mp1)
6919 6924 != IP_REASS_START(mp1->b_cont) &&
6920 6925 end >=
6921 6926 IP_REASS_START(mp1->b_cont)) ||
6922 6927 (!ipf->ipf_last_frag_seen &&
6923 6928 !more)) {
6924 6929 ipf->ipf_hole_cnt--;
6925 6930 }
6926 6931 /* Clip out mp1. */
6927 6932 if ((mp->b_cont = mp1->b_cont) ==
6928 6933 NULL) {
6929 6934 /*
6930 6935 * After clipping out mp1,
6931 6936 * this guy is now hanging
6932 6937 * off the end.
6933 6938 */
6934 6939 ipf->ipf_tail_mp = mp;
6935 6940 }
6936 6941 IP_REASS_SET_START(mp1, 0);
6937 6942 IP_REASS_SET_END(mp1, 0);
6938 6943 /* Subtract byte count */
6939 6944 ipf->ipf_count -=
6940 6945 mp1->b_datap->db_lim -
6941 6946 mp1->b_datap->db_base;
6942 6947 freeb(mp1);
6943 6948 BUMP_MIB(ill->ill_ip_mib,
6944 6949 ipIfStatsReasmPartDups);
6945 6950 mp1 = mp->b_cont;
6946 6951 if (!mp1)
6947 6952 break;
6948 6953 offset = IP_REASS_START(mp1);
6949 6954 }
6950 6955 }
6951 6956 break;
6952 6957 }
6953 6958 } while (start = end, mp = next_mp);
6954 6959
6955 6960 /* Fragment just processed could be the last one. Remember this fact */
6956 6961 if (!more)
6957 6962 ipf->ipf_last_frag_seen = B_TRUE;
6958 6963
6959 6964 /* Still got holes? */
6960 6965 if (ipf->ipf_hole_cnt)
6961 6966 return (IP_REASS_PARTIAL);
6962 6967 /* Clean up overloaded fields to avoid upstream disasters. */
6963 6968 for (mp1 = ipf->ipf_mp->b_cont; mp1; mp1 = mp1->b_cont) {
6964 6969 IP_REASS_SET_START(mp1, 0);
6965 6970 IP_REASS_SET_END(mp1, 0);
6966 6971 }
6967 6972 return (IP_REASS_COMPLETE);
6968 6973 }
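The hole-count bookkeeping above is incremental, which obscures the invariant it maintains. A simplified batch-form model (not the kernel's algorithm): with fragments sorted by start offset and trimmed of overlap, the hole count is the number of gaps between consecutive pieces, plus one trailing hole until a fragment with MF clear has been seen; reassembly completes when the count reaches zero. A self-contained sketch under those assumptions:

    #include <stdio.h>
    #include <stdbool.h>

    typedef struct {
    	unsigned	start;
    	unsigned	end;	/* exclusive */
    } frag_t;

    /*
     * frags[] must be sorted by start and non-overlapping; last_seen says
     * whether the final (MF == 0) fragment has arrived. A result of 0
     * corresponds to IP_REASS_COMPLETE above.
     */
    static unsigned
    hole_count(const frag_t *frags, unsigned n, bool last_seen)
    {
    	unsigned holes = 0, expect = 0;

    	for (unsigned i = 0; i < n; i++) {
    		if (frags[i].start > expect)
    			holes++;	/* gap before this piece */
    		expect = frags[i].end;
    	}
    	if (!last_seen)
    		holes++;		/* hole at the end */
    	return (holes);
    }

    int
    main(void)
    {
    	frag_t f[] = { { 0, 1480 }, { 2960, 4000 } };

    	printf("%u\n", hole_count(f, 2, true));	/* 1: middle gap */
    	f[1].start = 1480;			/* fill the gap */
    	printf("%u\n", hole_count(f, 2, true));	/* 0: complete */
    	return (0);
    }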
6969 6974
6970 6975 /*
6971 6976 * Fragmentation reassembly. Each ILL has a hash table for
6972 6977 * queuing packets undergoing reassembly for all IPIFs
6973 6978 * associated with the ILL. The hash is based on the packet
6974 6979 * IP ident field. The ILL frag hash table was allocated
6975 6980 * as a timer block at the time the ILL was created. Whenever
6976 6981 * there is anything on the reassembly queue, the timer will
6977 6982 * be running. Returns the reassembled packet if reassembly completes.
6978 6983 */
6979 6984 mblk_t *
6980 6985 ip_input_fragment(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
6981 6986 {
6982 6987 uint32_t frag_offset_flags;
6983 6988 mblk_t *t_mp;
6984 6989 ipaddr_t dst;
6985 6990 uint8_t proto = ipha->ipha_protocol;
6986 6991 uint32_t sum_val;
6987 6992 uint16_t sum_flags;
6988 6993 ipf_t *ipf;
6989 6994 ipf_t **ipfp;
6990 6995 ipfb_t *ipfb;
6991 6996 uint16_t ident;
6992 6997 uint32_t offset;
6993 6998 ipaddr_t src;
6994 6999 uint_t hdr_length;
6995 7000 uint32_t end;
6996 7001 mblk_t *mp1;
6997 7002 mblk_t *tail_mp;
6998 7003 size_t count;
6999 7004 size_t msg_len;
7000 7005 uint8_t ecn_info = 0;
7001 7006 uint32_t packet_size;
7002 7007 boolean_t pruned = B_FALSE;
7003 7008 ill_t *ill = ira->ira_ill;
7004 7009 ip_stack_t *ipst = ill->ill_ipst;
7005 7010
7006 7011 /*
7007 7012 	 * Drop the fragment as early as possible if
7008 7013 	 * we don't have the resources to reassemble.
7009 7014 */
7010 7015 if (ipst->ips_ip_reass_queue_bytes == 0) {
7011 7016 freemsg(mp);
7012 7017 return (NULL);
7013 7018 }
7014 7019
7015 7020 /* Check for fragmentation offset; return if there's none */
7016 7021 if ((frag_offset_flags = ntohs(ipha->ipha_fragment_offset_and_flags) &
7017 7022 (IPH_MF | IPH_OFFSET)) == 0)
7018 7023 return (mp);
7019 7024
7020 7025 /*
7021 7026 * We utilize hardware computed checksum info only for UDP since
7022 7027 * IP fragmentation is a normal occurrence for the protocol. In
7023 7028 * addition, checksum offload support for IP fragments carrying
7024 7029 * UDP payload is commonly implemented across network adapters.
7025 7030 */
7026 7031 ASSERT(ira->ira_rill != NULL);
7027 7032 if (proto == IPPROTO_UDP && dohwcksum &&
7028 7033 ILL_HCKSUM_CAPABLE(ira->ira_rill) &&
7029 7034 (DB_CKSUMFLAGS(mp) & (HCK_FULLCKSUM | HCK_PARTIALCKSUM))) {
7030 7035 mblk_t *mp1 = mp->b_cont;
7031 7036 int32_t len;
7032 7037
7033 7038 /* Record checksum information from the packet */
7034 7039 sum_val = (uint32_t)DB_CKSUM16(mp);
7035 7040 sum_flags = DB_CKSUMFLAGS(mp);
7036 7041
7037 7042 /* IP payload offset from beginning of mblk */
7038 7043 offset = ((uchar_t *)ipha + IPH_HDR_LENGTH(ipha)) - mp->b_rptr;
7039 7044
7040 7045 if ((sum_flags & HCK_PARTIALCKSUM) &&
7041 7046 (mp1 == NULL || mp1->b_cont == NULL) &&
7042 7047 offset >= DB_CKSUMSTART(mp) &&
7043 7048 ((len = offset - DB_CKSUMSTART(mp)) & 1) == 0) {
7044 7049 uint32_t adj;
7045 7050 /*
7046 7051 * Partial checksum has been calculated by hardware
7047 7052 * and attached to the packet; in addition, any
7048 7053 * prepended extraneous data is even byte aligned.
7049 7054 * If any such data exists, we adjust the checksum;
7050 7055 * this would also handle any postpended data.
7051 7056 */
7052 7057 IP_ADJCKSUM_PARTIAL(mp->b_rptr + DB_CKSUMSTART(mp),
7053 7058 mp, mp1, len, adj);
7054 7059
7055 7060 /* One's complement subtract extraneous checksum */
7056 7061 if (adj >= sum_val)
7057 7062 sum_val = ~(adj - sum_val) & 0xFFFF;
7058 7063 else
7059 7064 sum_val -= adj;
7060 7065 }
7061 7066 } else {
7062 7067 sum_val = 0;
7063 7068 sum_flags = 0;
7064 7069 }
7065 7070
7066 7071 /* Clear hardware checksumming flag */
7067 7072 DB_CKSUMFLAGS(mp) = 0;
7068 7073
7069 7074 ident = ipha->ipha_ident;
7070 7075 offset = (frag_offset_flags << 3) & 0xFFFF;
7071 7076 src = ipha->ipha_src;
7072 7077 dst = ipha->ipha_dst;
7073 7078 hdr_length = IPH_HDR_LENGTH(ipha);
7074 7079 end = ntohs(ipha->ipha_length) - hdr_length;
7075 7080
7076 7081 /* If end == 0 then we have a packet with no data, so just free it */
7077 7082 if (end == 0) {
7078 7083 freemsg(mp);
7079 7084 return (NULL);
7080 7085 }
7081 7086
7082 7087 /* Record the ECN field info. */
7083 7088 ecn_info = (ipha->ipha_type_of_service & 0x3);
7084 7089 if (offset != 0) {
7085 7090 /*
7086 7091 * If this isn't the first piece, strip the header, and
7087 7092 * add the offset to the end value.
7088 7093 */
7089 7094 mp->b_rptr += hdr_length;
7090 7095 end += offset;
7091 7096 }
7092 7097
7093 7098 /* Handle vnic loopback of fragments */
7094 7099 if (mp->b_datap->db_ref > 2)
7095 7100 msg_len = 0;
7096 7101 else
7097 7102 msg_len = MBLKSIZE(mp);
7098 7103
7099 7104 tail_mp = mp;
7100 7105 while (tail_mp->b_cont != NULL) {
7101 7106 tail_mp = tail_mp->b_cont;
7102 7107 if (tail_mp->b_datap->db_ref <= 2)
7103 7108 msg_len += MBLKSIZE(tail_mp);
7104 7109 }
7105 7110
7106 7111 /* If the reassembly list for this ILL will get too big, prune it */
7107 7112 if ((msg_len + sizeof (*ipf) + ill->ill_frag_count) >=
7108 7113 ipst->ips_ip_reass_queue_bytes) {
7109 7114 DTRACE_PROBE3(ip_reass_queue_bytes, uint_t, msg_len,
7110 7115 uint_t, ill->ill_frag_count,
7111 7116 uint_t, ipst->ips_ip_reass_queue_bytes);
7112 7117 ill_frag_prune(ill,
7113 7118 (ipst->ips_ip_reass_queue_bytes < msg_len) ? 0 :
7114 7119 (ipst->ips_ip_reass_queue_bytes - msg_len));
7115 7120 pruned = B_TRUE;
7116 7121 }
7117 7122
7118 7123 ipfb = &ill->ill_frag_hash_tbl[ILL_FRAG_HASH(src, ident)];
7119 7124 mutex_enter(&ipfb->ipfb_lock);
7120 7125
7121 7126 ipfp = &ipfb->ipfb_ipf;
7122 7127 /* Try to find an existing fragment queue for this packet. */
7123 7128 for (;;) {
7124 7129 ipf = ipfp[0];
7125 7130 if (ipf != NULL) {
7126 7131 /*
7127 7132 * It has to match on ident and src/dst address.
7128 7133 */
7129 7134 if (ipf->ipf_ident == ident &&
7130 7135 ipf->ipf_src == src &&
7131 7136 ipf->ipf_dst == dst &&
7132 7137 ipf->ipf_protocol == proto) {
7133 7138 /*
7134 7139 * If we have received too many
7135 7140 				 * duplicate fragments for this packet,
7136 7141 * free it.
7137 7142 */
7138 7143 if (ipf->ipf_num_dups > ip_max_frag_dups) {
7139 7144 ill_frag_free_pkts(ill, ipfb, ipf, 1);
7140 7145 freemsg(mp);
7141 7146 mutex_exit(&ipfb->ipfb_lock);
7142 7147 return (NULL);
7143 7148 }
7144 7149 /* Found it. */
7145 7150 break;
7146 7151 }
7147 7152 ipfp = &ipf->ipf_hash_next;
7148 7153 continue;
7149 7154 }
7150 7155
7151 7156 /*
7152 7157 * If we pruned the list, do we want to store this new
7153 7158 		 * fragment? We apply an optimization here based on the
7154 7159 * fact that most fragments will be received in order.
7155 7160 * So if the offset of this incoming fragment is zero,
7156 7161 * it is the first fragment of a new packet. We will
7157 7162 * keep it. Otherwise drop the fragment, as we have
7158 7163 * probably pruned the packet already (since the
7159 7164 * packet cannot be found).
7160 7165 */
7161 7166 if (pruned && offset != 0) {
7162 7167 mutex_exit(&ipfb->ipfb_lock);
7163 7168 freemsg(mp);
7164 7169 return (NULL);
7165 7170 }
7166 7171
7167 7172 if (ipfb->ipfb_frag_pkts >= MAX_FRAG_PKTS(ipst)) {
7168 7173 /*
7169 7174 * Too many fragmented packets in this hash
7170 7175 * bucket. Free the oldest.
7171 7176 */
7172 7177 ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf, 1);
7173 7178 }
7174 7179
7175 7180 /* New guy. Allocate a frag message. */
7176 7181 mp1 = allocb(sizeof (*ipf), BPRI_MED);
7177 7182 if (mp1 == NULL) {
7178 7183 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
7179 7184 ip_drop_input("ipIfStatsInDiscards", mp, ill);
7180 7185 freemsg(mp);
7181 7186 reass_done:
7182 7187 mutex_exit(&ipfb->ipfb_lock);
7183 7188 return (NULL);
7184 7189 }
7185 7190
7186 7191 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmReqds);
7187 7192 mp1->b_cont = mp;
7188 7193
7189 7194 /* Initialize the fragment header. */
7190 7195 ipf = (ipf_t *)mp1->b_rptr;
7191 7196 ipf->ipf_mp = mp1;
7192 7197 ipf->ipf_ptphn = ipfp;
7193 7198 ipfp[0] = ipf;
7194 7199 ipf->ipf_hash_next = NULL;
7195 7200 ipf->ipf_ident = ident;
7196 7201 ipf->ipf_protocol = proto;
7197 7202 ipf->ipf_src = src;
7198 7203 ipf->ipf_dst = dst;
7199 7204 ipf->ipf_nf_hdr_len = 0;
7200 7205 /* Record reassembly start time. */
7201 7206 ipf->ipf_timestamp = gethrestime_sec();
7202 7207 /* Record ipf generation and account for frag header */
7203 7208 ipf->ipf_gen = ill->ill_ipf_gen++;
7204 7209 ipf->ipf_count = MBLKSIZE(mp1);
7205 7210 ipf->ipf_last_frag_seen = B_FALSE;
7206 7211 ipf->ipf_ecn = ecn_info;
7207 7212 ipf->ipf_num_dups = 0;
7208 7213 ipfb->ipfb_frag_pkts++;
7209 7214 ipf->ipf_checksum = 0;
7210 7215 ipf->ipf_checksum_flags = 0;
7211 7216
7212 7217 /* Store checksum value in fragment header */
7213 7218 if (sum_flags != 0) {
7214 7219 sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
7215 7220 sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
7216 7221 ipf->ipf_checksum = sum_val;
7217 7222 ipf->ipf_checksum_flags = sum_flags;
7218 7223 }
7219 7224
7220 7225 /*
7221 7226 * We handle reassembly two ways. In the easy case,
7222 7227 * where all the fragments show up in order, we do
7223 7228 * minimal bookkeeping, and just clip new pieces on
7224 7229 * the end. If we ever see a hole, then we go off
7225 7230 * to ip_reassemble which has to mark the pieces and
7226 7231 * keep track of the number of holes, etc. Obviously,
7227 7232 * the point of having both mechanisms is so we can
7228 7233 * handle the easy case as efficiently as possible.
7229 7234 */
7230 7235 if (offset == 0) {
7231 7236 /* Easy case, in-order reassembly so far. */
7232 7237 ipf->ipf_count += msg_len;
7233 7238 ipf->ipf_tail_mp = tail_mp;
7234 7239 /*
7235 7240 * Keep track of next expected offset in
7236 7241 * ipf_end.
7237 7242 */
7238 7243 ipf->ipf_end = end;
7239 7244 ipf->ipf_nf_hdr_len = hdr_length;
7240 7245 } else {
7241 7246 /* Hard case, hole at the beginning. */
7242 7247 ipf->ipf_tail_mp = NULL;
7243 7248 /*
7244 7249 * ipf_end == 0 means that we have given up
7245 7250 * on easy reassembly.
7246 7251 */
7247 7252 ipf->ipf_end = 0;
7248 7253
7249 7254 /* Forget checksum offload from now on */
7250 7255 ipf->ipf_checksum_flags = 0;
7251 7256
7252 7257 /*
7253 7258 * ipf_hole_cnt is set by ip_reassemble.
7254 7259 * ipf_count is updated by ip_reassemble.
7255 7260 * No need to check for return value here
7256 7261 * as we don't expect reassembly to complete
7257 7262 * or fail for the first fragment itself.
7258 7263 */
7259 7264 (void) ip_reassemble(mp, ipf,
7260 7265 (frag_offset_flags & IPH_OFFSET) << 3,
7261 7266 (frag_offset_flags & IPH_MF), ill, msg_len);
7262 7267 }
7263 7268 /* Update per ipfb and ill byte counts */
7264 7269 ipfb->ipfb_count += ipf->ipf_count;
7265 7270 ASSERT(ipfb->ipfb_count > 0); /* Wraparound */
7266 7271 atomic_add_32(&ill->ill_frag_count, ipf->ipf_count);
7267 7272 /* If the frag timer wasn't already going, start it. */
7268 7273 mutex_enter(&ill->ill_lock);
7269 7274 ill_frag_timer_start(ill);
7270 7275 mutex_exit(&ill->ill_lock);
7271 7276 goto reass_done;
7272 7277 }
7273 7278
7274 7279 /*
7275 7280 	 * If the packet's flags have changed (it could be coming up
7276 7281 	 * from a different interface than before, and therefore with
7277 7282 	 * possibly different checksum capability), then forget about
7278 7283 * any stored checksum states. Otherwise add the value to
7279 7284 * the existing one stored in the fragment header.
7280 7285 */
7281 7286 if (sum_flags != 0 && sum_flags == ipf->ipf_checksum_flags) {
7282 7287 sum_val += ipf->ipf_checksum;
7283 7288 sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
7284 7289 sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
7285 7290 ipf->ipf_checksum = sum_val;
7286 7291 } else if (ipf->ipf_checksum_flags != 0) {
7287 7292 /* Forget checksum offload from now on */
7288 7293 ipf->ipf_checksum_flags = 0;
7289 7294 }
7290 7295
7291 7296 /*
7292 7297 * We have a new piece of a datagram which is already being
7293 7298 * reassembled. Update the ECN info if all IP fragments
7294 7299 * are ECN capable. If there is one which is not, clear
7295 7300 	 * all the info. If there is at least one which has the CE
7296 7301 * code point, IP needs to report that up to transport.
7297 7302 */
7298 7303 if (ecn_info != IPH_ECN_NECT && ipf->ipf_ecn != IPH_ECN_NECT) {
7299 7304 if (ecn_info == IPH_ECN_CE)
7300 7305 ipf->ipf_ecn = IPH_ECN_CE;
7301 7306 } else {
7302 7307 ipf->ipf_ecn = IPH_ECN_NECT;
7303 7308 }
7304 7309 if (offset && ipf->ipf_end == offset) {
7305 7310 /* The new fragment fits at the end */
7306 7311 ipf->ipf_tail_mp->b_cont = mp;
7307 7312 /* Update the byte count */
7308 7313 ipf->ipf_count += msg_len;
7309 7314 /* Update per ipfb and ill byte counts */
7310 7315 ipfb->ipfb_count += msg_len;
7311 7316 ASSERT(ipfb->ipfb_count > 0); /* Wraparound */
7312 7317 atomic_add_32(&ill->ill_frag_count, msg_len);
7313 7318 if (frag_offset_flags & IPH_MF) {
7314 7319 /* More to come. */
7315 7320 ipf->ipf_end = end;
7316 7321 ipf->ipf_tail_mp = tail_mp;
7317 7322 goto reass_done;
7318 7323 }
7319 7324 } else {
7320 7325 /* Go do the hard cases. */
7321 7326 int ret;
7322 7327
7323 7328 if (offset == 0)
7324 7329 ipf->ipf_nf_hdr_len = hdr_length;
7325 7330
7326 7331 /* Save current byte count */
7327 7332 count = ipf->ipf_count;
7328 7333 ret = ip_reassemble(mp, ipf,
7329 7334 (frag_offset_flags & IPH_OFFSET) << 3,
7330 7335 (frag_offset_flags & IPH_MF), ill, msg_len);
7331 7336 /* Count of bytes added and subtracted (freeb()ed) */
7332 7337 count = ipf->ipf_count - count;
7333 7338 if (count) {
7334 7339 /* Update per ipfb and ill byte counts */
7335 7340 ipfb->ipfb_count += count;
7336 7341 ASSERT(ipfb->ipfb_count > 0); /* Wraparound */
7337 7342 atomic_add_32(&ill->ill_frag_count, count);
7338 7343 }
7339 7344 if (ret == IP_REASS_PARTIAL) {
7340 7345 goto reass_done;
7341 7346 } else if (ret == IP_REASS_FAILED) {
7342 7347 /* Reassembly failed. Free up all resources */
7343 7348 ill_frag_free_pkts(ill, ipfb, ipf, 1);
7344 7349 for (t_mp = mp; t_mp != NULL; t_mp = t_mp->b_cont) {
7345 7350 IP_REASS_SET_START(t_mp, 0);
7346 7351 IP_REASS_SET_END(t_mp, 0);
7347 7352 }
7348 7353 freemsg(mp);
7349 7354 goto reass_done;
7350 7355 }
7351 7356 /* We will reach here iff 'ret' is IP_REASS_COMPLETE */
7352 7357 }
7353 7358 /*
7354 7359 * We have completed reassembly. Unhook the frag header from
7355 7360 * the reassembly list.
7356 7361 *
7357 7362 * Before we free the frag header, record the ECN info
7358 7363 * to report back to the transport.
7359 7364 */
7360 7365 ecn_info = ipf->ipf_ecn;
7361 7366 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmOKs);
7362 7367 ipfp = ipf->ipf_ptphn;
7363 7368
7364 7369 /* We need to supply these to caller */
7365 7370 if ((sum_flags = ipf->ipf_checksum_flags) != 0)
7366 7371 sum_val = ipf->ipf_checksum;
7367 7372 else
7368 7373 sum_val = 0;
7369 7374
7370 7375 mp1 = ipf->ipf_mp;
7371 7376 count = ipf->ipf_count;
7372 7377 ipf = ipf->ipf_hash_next;
7373 7378 if (ipf != NULL)
7374 7379 ipf->ipf_ptphn = ipfp;
7375 7380 ipfp[0] = ipf;
7376 7381 atomic_add_32(&ill->ill_frag_count, -count);
7377 7382 ASSERT(ipfb->ipfb_count >= count);
7378 7383 ipfb->ipfb_count -= count;
7379 7384 ipfb->ipfb_frag_pkts--;
7380 7385 mutex_exit(&ipfb->ipfb_lock);
7381 7386 /* Ditch the frag header. */
7382 7387 mp = mp1->b_cont;
7383 7388
7384 7389 freeb(mp1);
7385 7390
7386 7391 /* Restore original IP length in header. */
7387 7392 packet_size = (uint32_t)msgdsize(mp);
7388 7393 if (packet_size > IP_MAXPACKET) {
7389 7394 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
7390 7395 ip_drop_input("Reassembled packet too large", mp, ill);
7391 7396 freemsg(mp);
7392 7397 return (NULL);
7393 7398 }
7394 7399
7395 7400 if (DB_REF(mp) > 1) {
7396 7401 mblk_t *mp2 = copymsg(mp);
7397 7402
7398 7403 if (mp2 == NULL) {
7399 7404 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
7400 7405 ip_drop_input("ipIfStatsInDiscards", mp, ill);
7401 7406 freemsg(mp);
7402 7407 return (NULL);
7403 7408 }
7404 7409 freemsg(mp);
7405 7410 mp = mp2;
7406 7411 }
7407 7412 ipha = (ipha_t *)mp->b_rptr;
7408 7413
7409 7414 ipha->ipha_length = htons((uint16_t)packet_size);
7410 7415 /* We're now complete, zip the frag state */
7411 7416 ipha->ipha_fragment_offset_and_flags = 0;
7412 7417 /* Record the ECN info. */
7413 7418 ipha->ipha_type_of_service &= 0xFC;
7414 7419 ipha->ipha_type_of_service |= ecn_info;
7415 7420
7416 7421 /* Update the receive attributes */
7417 7422 ira->ira_pktlen = packet_size;
7418 7423 ira->ira_ip_hdr_length = IPH_HDR_LENGTH(ipha);
7419 7424
7420 7425 /* Reassembly is successful; set checksum information in packet */
7421 7426 DB_CKSUM16(mp) = (uint16_t)sum_val;
7422 7427 DB_CKSUMFLAGS(mp) = sum_flags;
7423 7428 DB_CKSUMSTART(mp) = ira->ira_ip_hdr_length;
7424 7429
7425 7430 return (mp);
7426 7431 }
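The sum_val manipulation in this function combines per-fragment hardware checksums by one's-complement addition, folding 32-bit carries back into 16 bits with the repeated (sum & 0xFFFF) + (sum >> 16) step. A small standalone demo of that folding (the values are arbitrary):

    #include <stdio.h>
    #include <stdint.h>

    /* Fold a 32-bit sum to 16 bits; two rounds suffice, as above. */
    static uint16_t
    csum_fold(uint32_t sum)
    {
    	sum = (sum & 0xFFFF) + (sum >> 16);
    	sum = (sum & 0xFFFF) + (sum >> 16);
    	return ((uint16_t)sum);
    }

    int
    main(void)
    {
    	/* Partial sums as a NIC might report for two fragments. */
    	uint32_t frag1 = 0xFFF0, frag2 = 0x0025;

    	/*
    	 * One's-complement addition is associative and commutative, so
    	 * per-fragment partial checksums can simply be added and folded,
    	 * independent of arrival order.
    	 */
    	printf("combined: 0x%04x\n", csum_fold(frag1 + frag2));
    	return (0);
    }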
7427 7432
7428 7433 /*
7429 7434 * Pullup function that should be used for IP input in order to
7430 7435  * ensure we do not lose the L2 source address; we need the L2 source
7431 7436 * address for IP_RECVSLLA and for ndp_input.
7432 7437 *
7433 7438 * We return either NULL or b_rptr.
7434 7439 */
7435 7440 void *
7436 7441 ip_pullup(mblk_t *mp, ssize_t len, ip_recv_attr_t *ira)
7437 7442 {
7438 7443 ill_t *ill = ira->ira_ill;
7439 7444
7440 7445 if (ip_rput_pullups++ == 0) {
7441 7446 (void) mi_strlog(ill->ill_rq, 1, SL_ERROR|SL_TRACE,
7442 7447 "ip_pullup: %s forced us to "
7443 7448 		    "ip_pullup: %s forced us to "
7444 7449 		    "pullup pkt, hdr len %ld, hdr addr %p",
7444 7449 ill->ill_name, len, (void *)mp->b_rptr);
7445 7450 }
7446 7451 if (!(ira->ira_flags & IRAF_L2SRC_SET))
7447 7452 ip_setl2src(mp, ira, ira->ira_rill);
7448 7453 ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
7449 7454 if (!pullupmsg(mp, len))
7450 7455 return (NULL);
7451 7456 else
7452 7457 return (mp->b_rptr);
7453 7458 }
7454 7459
7455 7460 /*
7456 7461  * Make sure ira_l2src has an address. If we don't have one, fill it with zeros.
7457 7462 * When called from the ULP ira_rill will be NULL hence the caller has to
7458 7463 * pass in the ill.
7459 7464 */
7460 7465 /* ARGSUSED */
7461 7466 void
7462 7467 ip_setl2src(mblk_t *mp, ip_recv_attr_t *ira, ill_t *ill)
7463 7468 {
7464 7469 const uchar_t *addr;
7465 7470 int alen;
7466 7471
7467 7472 if (ira->ira_flags & IRAF_L2SRC_SET)
7468 7473 return;
7469 7474
7470 7475 ASSERT(ill != NULL);
7471 7476 alen = ill->ill_phys_addr_length;
7472 7477 ASSERT(alen <= sizeof (ira->ira_l2src));
7473 7478 if (ira->ira_mhip != NULL &&
7474 7479 (addr = ira->ira_mhip->mhi_saddr) != NULL) {
7475 7480 bcopy(addr, ira->ira_l2src, alen);
7476 7481 } else if ((ira->ira_flags & IRAF_L2SRC_LOOPBACK) &&
7477 7482 (addr = ill->ill_phys_addr) != NULL) {
7478 7483 bcopy(addr, ira->ira_l2src, alen);
7479 7484 } else {
7480 7485 bzero(ira->ira_l2src, alen);
7481 7486 }
7482 7487 ira->ira_flags |= IRAF_L2SRC_SET;
7483 7488 }
7484 7489
7485 7490 /*
7486 7491  * Check the IP header length and align it.
7487 7492 */
7488 7493 mblk_t *
7489 7494 ip_check_and_align_header(mblk_t *mp, uint_t min_size, ip_recv_attr_t *ira)
7490 7495 {
7491 7496 ill_t *ill = ira->ira_ill;
7492 7497 ssize_t len;
7493 7498
7494 7499 len = MBLKL(mp);
7495 7500
7496 7501 if (!OK_32PTR(mp->b_rptr))
7497 7502 IP_STAT(ill->ill_ipst, ip_notaligned);
7498 7503 else
7499 7504 IP_STAT(ill->ill_ipst, ip_recv_pullup);
7500 7505
7501 7506 /* Guard against bogus device drivers */
7502 7507 if (len < 0) {
7503 7508 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
7504 7509 ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
7505 7510 freemsg(mp);
7506 7511 return (NULL);
7507 7512 }
7508 7513
7509 7514 if (len == 0) {
7510 7515 /* GLD sometimes sends up mblk with b_rptr == b_wptr! */
7511 7516 mblk_t *mp1 = mp->b_cont;
7512 7517
7513 7518 if (!(ira->ira_flags & IRAF_L2SRC_SET))
7514 7519 ip_setl2src(mp, ira, ira->ira_rill);
7515 7520 ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
7516 7521
7517 7522 freeb(mp);
7518 7523 mp = mp1;
7519 7524 if (mp == NULL)
7520 7525 return (NULL);
7521 7526
7522 7527 if (OK_32PTR(mp->b_rptr) && MBLKL(mp) >= min_size)
7523 7528 return (mp);
7524 7529 }
7525 7530 if (ip_pullup(mp, min_size, ira) == NULL) {
7526 7531 if (msgdsize(mp) < min_size) {
7527 7532 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
7528 7533 ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
7529 7534 } else {
7530 7535 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
7531 7536 ip_drop_input("ipIfStatsInDiscards", mp, ill);
7532 7537 }
7533 7538 freemsg(mp);
7534 7539 return (NULL);
7535 7540 }
7536 7541 return (mp);
7537 7542 }
7538 7543
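/*
 * Illustrative sketch, not part of this webrev: the alignment test that
 * OK_32PTR performs -- a pointer is usable for 32-bit loads when its low
 * two bits are clear. The macro name mirrors the kernel's but is defined
 * locally for this example.
 */
#include <stdio.h>
#include <stdint.h>

#define	OK_32PTR(p)	(((uintptr_t)(p) & 0x3) == 0)

int
main(void)
{
	uint32_t words[2];		/* guaranteed 32-bit aligned */
	char *p = (char *)words;

	(void) printf("p   aligned: %d\n", OK_32PTR(p));	/* 1 */
	(void) printf("p+1 aligned: %d\n", OK_32PTR(p + 1));	/* 0 */
	return (0);
}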
7539 7544 /*
7540 7545 * Common code for IPv4 and IPv6 to check and pullup multi-mblks
7541 7546 */
7542 7547 mblk_t *
7543 7548 ip_check_length(mblk_t *mp, uchar_t *rptr, ssize_t len, uint_t pkt_len,
7544 7549 uint_t min_size, ip_recv_attr_t *ira)
7545 7550 {
7546 7551 ill_t *ill = ira->ira_ill;
7547 7552
7548 7553 /*
7549 7554 * Make sure we have data length consistent
7550 7555 * with the IP header.
7551 7556 */
7552 7557 if (mp->b_cont == NULL) {
7553 7558 /* pkt_len is based on ipha_len, not the mblk length */
7554 7559 if (pkt_len < min_size) {
7555 7560 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
7556 7561 ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
7557 7562 freemsg(mp);
7558 7563 return (NULL);
7559 7564 }
7560 7565 if (len < 0) {
7561 7566 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
7562 7567 ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
7563 7568 freemsg(mp);
7564 7569 return (NULL);
7565 7570 }
7566 7571 /* Drop any pad */
7567 7572 mp->b_wptr = rptr + pkt_len;
7568 7573 } else if ((len += msgdsize(mp->b_cont)) != 0) {
7569 7574 ASSERT(pkt_len >= min_size);
7570 7575 if (pkt_len < min_size) {
7571 7576 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
7572 7577 ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
7573 7578 freemsg(mp);
7574 7579 return (NULL);
7575 7580 }
7576 7581 if (len < 0) {
7577 7582 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
7578 7583 ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
7579 7584 freemsg(mp);
7580 7585 return (NULL);
7581 7586 }
7582 7587 /* Drop any pad */
7583 7588 (void) adjmsg(mp, -len);
7584 7589 /*
7585 7590 * adjmsg may have freed an mblk from the chain, hence
7586 7591 * invalidate any hw checksum here. This will force IP to
7587 7592 * calculate the checksum in sw, but only for this packet.
7588 7593 */
7589 7594 DB_CKSUMFLAGS(mp) = 0;
7590 7595 IP_STAT(ill->ill_ipst, ip_multimblk);
7591 7596 }
7592 7597 return (mp);
7593 7598 }
7594 7599
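/*
 * Illustrative sketch, not part of this webrev: how trailing link-layer
 * pad is detected and trimmed. The driver may hand up more bytes than
 * ipha_length claims (e.g. Ethernet minimum-frame padding); the excess
 * is the amount adjmsg(mp, -len) removes above. Names are local to the
 * example.
 */
#include <stdio.h>

int
main(void)
{
	size_t chain_bytes = 60;	/* bytes actually received */
	size_t pkt_len = 46;		/* ipha_length from the header */

	if (chain_bytes > pkt_len) {
		size_t pad = chain_bytes - pkt_len;

		chain_bytes -= pad;	/* the adjmsg(mp, -pad) step */
		(void) printf("trimmed %zu pad bytes, %zu remain\n",
		    pad, chain_bytes);
	}
	return (0);
}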
7595 7600 /*
7596 7601 * Check that the IPv4 opt_len is consistent with the packet and pullup
7597 7602 * the options.
7598 7603 */
7599 7604 mblk_t *
7600 7605 ip_check_optlen(mblk_t *mp, ipha_t *ipha, uint_t opt_len, uint_t pkt_len,
7601 7606 ip_recv_attr_t *ira)
7602 7607 {
7603 7608 ill_t *ill = ira->ira_ill;
7604 7609 ssize_t len;
7605 7610
7606 7611 /* Assume no IPv6 packets arrive over the IPv4 queue */
7607 7612 if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) {
7608 7613 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
7609 7614 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInWrongIPVersion);
7610 7615 ip_drop_input("IPvN packet on IPv4 ill", mp, ill);
7611 7616 freemsg(mp);
7612 7617 return (NULL);
7613 7618 }
7614 7619
7615 7620 if (opt_len > (15 - IP_SIMPLE_HDR_LENGTH_IN_WORDS)) {
7616 7621 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
7617 7622 ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
7618 7623 freemsg(mp);
7619 7624 return (NULL);
7620 7625 }
7621 7626 /*
7622 7627 * Recompute complete header length and make sure we
7623 7628 * have access to all of it.
7624 7629 */
7625 7630 len = ((size_t)opt_len + IP_SIMPLE_HDR_LENGTH_IN_WORDS) << 2;
7626 7631 if (len > (mp->b_wptr - mp->b_rptr)) {
7627 7632 if (len > pkt_len) {
7628 7633 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
7629 7634 ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
7630 7635 freemsg(mp);
7631 7636 return (NULL);
7632 7637 }
7633 7638 if (ip_pullup(mp, len, ira) == NULL) {
7634 7639 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
7635 7640 ip_drop_input("ipIfStatsInDiscards", mp, ill);
7636 7641 freemsg(mp);
7637 7642 return (NULL);
7638 7643 }
7639 7644 }
7640 7645 return (mp);
7641 7646 }
7642 7647
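/*
 * Illustrative sketch, not part of this webrev: the IPv4 header-length
 * arithmetic used above. IHL is a 4-bit word count, so it caps at 15
 * words; 5 words (IP_SIMPLE_HDR_LENGTH_IN_WORDS) are the fixed header,
 * leaving at most 10 words (40 bytes) of options. Names are local to
 * the example.
 */
#include <stdio.h>

int
main(void)
{
	unsigned opt_len = 3;		/* option words beyond the base */

	if (opt_len > 15 - 5) {
		(void) printf("bad header\n");
		return (1);
	}
	(void) printf("header is %u bytes\n", (opt_len + 5) << 2);	/* 32 */
	return (0);
}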
7643 7648 /*
7644 7649 * Returns a new ire, or the same ire, or NULL.
7645 7650 * If a different IRE is returned, then it is held; the caller
7646 7651 * needs to release it.
7647 7652 * In no case is there any hold/release on the ire argument.
7648 7653 */
7649 7654 ire_t *
7650 7655 ip_check_multihome(void *addr, ire_t *ire, ill_t *ill)
7651 7656 {
7652 7657 ire_t *new_ire;
7653 7658 ill_t *ire_ill;
7654 7659 uint_t ifindex;
7655 7660 ip_stack_t *ipst = ill->ill_ipst;
7656 7661 boolean_t strict_check = B_FALSE;
7657 7662
7658 7663 /*
7659 7664 * IPMP common case: if IRE and ILL are in the same group, there's no
7660 7665 * issue (e.g. packet received on an underlying interface matched an
7661 7666 * IRE_LOCAL on its associated group interface).
7662 7667 */
7663 7668 ASSERT(ire->ire_ill != NULL);
7664 7669 if (IS_IN_SAME_ILLGRP(ill, ire->ire_ill))
7665 7670 return (ire);
7666 7671
7667 7672 /*
7668 7673 * Do another ire lookup here, using the ingress ill, to see if the
7669 7674 * interface is in a usesrc group.
7670 7675 * As long as the ills belong to the same group, we don't consider
7671 7676 * them to be arriving on the wrong interface. Thus, if the switch
7672 7677 * is doing inbound load spreading, we won't drop packets when the
7673 7678 * ip*_strict_dst_multihoming switch is on.
7674 7679 * We also need to check for IPIF_UNNUMBERED point2point interfaces
7675 7680 * where the local address may not be unique. In this case we were
7676 7681 * at the mercy of the initial ire lookup and the IRE_LOCAL it
7677 7682 * actually returned. The new lookup, which is more specific, should
7678 7683 * only find the IRE_LOCAL associated with the ingress ill if one
7679 7684 * exists.
7680 7685 */
7681 7686 if (ire->ire_ipversion == IPV4_VERSION) {
7682 7687 if (ipst->ips_ip_strict_dst_multihoming)
7683 7688 strict_check = B_TRUE;
7684 7689 new_ire = ire_ftable_lookup_v4(*((ipaddr_t *)addr), 0, 0,
7685 7690 IRE_LOCAL, ill, ALL_ZONES, NULL,
7686 7691 (MATCH_IRE_TYPE|MATCH_IRE_ILL), 0, ipst, NULL);
7687 7692 } else {
7688 7693 ASSERT(!IN6_IS_ADDR_MULTICAST((in6_addr_t *)addr));
7689 7694 if (ipst->ips_ipv6_strict_dst_multihoming)
7690 7695 strict_check = B_TRUE;
7691 7696 new_ire = ire_ftable_lookup_v6((in6_addr_t *)addr, NULL, NULL,
7692 7697 IRE_LOCAL, ill, ALL_ZONES, NULL,
7693 7698 (MATCH_IRE_TYPE|MATCH_IRE_ILL), 0, ipst, NULL);
7694 7699 }
7695 7700 /*
7696 7701 * If the same ire that was returned in ip_input() is found then this
7697 7702 * is an indication that usesrc groups are in use. The packet
7698 7703 * arrived on a different ill in the group than the one associated with
7699 7704 * the destination address. If a different ire was found then the same
7700 7705 * IP address must be hosted on multiple ills. This is possible with
7701 7706 * unnumbered point2point interfaces. We switch to use this new ire in
7702 7707 * order to have accurate interface statistics.
7703 7708 */
7704 7709 if (new_ire != NULL) {
7705 7710 		/* A different ire is returned held; the caller must release it */
7706 7711 if (new_ire != ire)
7707 7712 return (new_ire);
7708 7713 /* Unchanged */
7709 7714 ire_refrele(new_ire);
7710 7715 return (ire);
7711 7716 }
7712 7717
7713 7718 /*
7714 7719 * Chase pointers once and store locally.
7715 7720 */
7716 7721 ASSERT(ire->ire_ill != NULL);
7717 7722 ire_ill = ire->ire_ill;
7718 7723 ifindex = ill->ill_usesrc_ifindex;
7719 7724
7720 7725 /*
7721 7726 * Check if it's a legal address on the 'usesrc' interface.
7722 7727 * For IPMP data addresses the IRE_LOCAL is the upper, hence we
7723 7728 * can just check phyint_ifindex.
7724 7729 */
7725 7730 if (ifindex != 0 && ifindex == ire_ill->ill_phyint->phyint_ifindex) {
7726 7731 return (ire);
7727 7732 }
7728 7733
7729 7734 /*
7730 7735 * If the ip*_strict_dst_multihoming switch is on then we can
7731 7736 * only accept this packet if the interface is marked as routing.
7732 7737 */
7733 7738 if (!(strict_check))
7734 7739 return (ire);
7735 7740
7736 7741 if ((ill->ill_flags & ire->ire_ill->ill_flags & ILLF_ROUTER) != 0) {
7737 7742 return (ire);
7738 7743 }
7739 7744 return (NULL);
7740 7745 }
7741 7746
7742 7747 /*
7743 7748 * This function is used to construct a mac_header_info_s from a
7744 7749 * DL_UNITDATA_IND message.
7745 7750  * The address fields in the mhi structure point into the message,
7746 7751 * thus the caller can't use those fields after freeing the message.
7747 7752 *
7748 7753 * We determine whether the packet received is a non-unicast packet
7749 7754 * and in doing so, determine whether or not it is broadcast vs multicast.
7750 7755 * For it to be a broadcast packet, we must have the appropriate mblk_t
7751 7756 * hanging off the ill_t. If this is either not present or doesn't match
7752 7757 * the destination mac address in the DL_UNITDATA_IND, the packet is deemed
7753 7758 * to be multicast. Thus NICs that have no broadcast address (or no
7754 7759  * capability for one, such as point-to-point links) never have a
7755 7760  * packet reported as broadcast.
7756 7761 */
7757 7762 void
7758 7763 ip_dlur_to_mhi(ill_t *ill, mblk_t *mb, struct mac_header_info_s *mhip)
7759 7764 {
7760 7765 dl_unitdata_ind_t *ind = (dl_unitdata_ind_t *)mb->b_rptr;
7761 7766 mblk_t *bmp;
7762 7767 uint_t extra_offset;
7763 7768
7764 7769 bzero(mhip, sizeof (struct mac_header_info_s));
7765 7770
7766 7771 mhip->mhi_dsttype = MAC_ADDRTYPE_UNICAST;
7767 7772
7768 7773 if (ill->ill_sap_length < 0)
7769 7774 extra_offset = 0;
7770 7775 else
7771 7776 extra_offset = ill->ill_sap_length;
7772 7777
7773 7778 mhip->mhi_daddr = (uchar_t *)ind + ind->dl_dest_addr_offset +
7774 7779 extra_offset;
7775 7780 mhip->mhi_saddr = (uchar_t *)ind + ind->dl_src_addr_offset +
7776 7781 extra_offset;
7777 7782
7778 7783 if (!ind->dl_group_address)
7779 7784 return;
7780 7785
7781 7786 /* Multicast or broadcast */
7782 7787 mhip->mhi_dsttype = MAC_ADDRTYPE_MULTICAST;
7783 7788
7784 7789 if (ind->dl_dest_addr_offset > sizeof (*ind) &&
7785 7790 ind->dl_dest_addr_offset + ind->dl_dest_addr_length < MBLKL(mb) &&
7786 7791 (bmp = ill->ill_bcast_mp) != NULL) {
7787 7792 dl_unitdata_req_t *dlur;
7788 7793 uint8_t *bphys_addr;
7789 7794
7790 7795 dlur = (dl_unitdata_req_t *)bmp->b_rptr;
7791 7796 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset +
7792 7797 extra_offset;
7793 7798
7794 7799 if (bcmp(mhip->mhi_daddr, bphys_addr,
7795 7800 ind->dl_dest_addr_length) == 0)
7796 7801 mhip->mhi_dsttype = MAC_ADDRTYPE_BROADCAST;
7797 7802 }
7798 7803 }
7799 7804
7800 7805 /*
7801 7806 * This function is used to construct a mac_header_info_s from a
7802 7807 * M_DATA fastpath message from a DLPI driver.
7803 7808  * The address fields in the mhi structure point into the message,
7804 7809 * thus the caller can't use those fields after freeing the message.
7805 7810 *
7806 7811 * We determine whether the packet received is a non-unicast packet
7807 7812 * and in doing so, determine whether or not it is broadcast vs multicast.
7808 7813 * For it to be a broadcast packet, we must have the appropriate mblk_t
7809 7814 * hanging off the ill_t. If this is either not present or doesn't match
7810 7815  * the destination address in the MAC header, the packet is deemed
7811 7816 * to be multicast. Thus NICs that have no broadcast address (or no
7812 7817  * capability for one, such as point-to-point links) never have a
7813 7818  * packet reported as broadcast.
7814 7819 */
7815 7820 void
7816 7821 ip_mdata_to_mhi(ill_t *ill, mblk_t *mp, struct mac_header_info_s *mhip)
7817 7822 {
7818 7823 mblk_t *bmp;
7819 7824 struct ether_header *pether;
7820 7825
7821 7826 bzero(mhip, sizeof (struct mac_header_info_s));
7822 7827
7823 7828 mhip->mhi_dsttype = MAC_ADDRTYPE_UNICAST;
7824 7829
7825 7830 pether = (struct ether_header *)((char *)mp->b_rptr
7826 7831 - sizeof (struct ether_header));
7827 7832
7828 7833 /*
7829 7834 * Make sure the interface is an ethernet type, since we don't
7830 7835 * know the header format for anything but Ethernet. Also make
7831 7836 * sure we are pointing correctly above db_base.
7832 7837 */
7833 7838 if (ill->ill_type != IFT_ETHER)
7834 7839 return;
7835 7840
7836 7841 retry:
7837 7842 if ((uchar_t *)pether < mp->b_datap->db_base)
7838 7843 return;
7839 7844
7840 7845 /* Is there a VLAN tag? */
7841 7846 if (ill->ill_isv6) {
7842 7847 if (pether->ether_type != htons(ETHERTYPE_IPV6)) {
7843 7848 pether = (struct ether_header *)((char *)pether - 4);
7844 7849 goto retry;
7845 7850 }
7846 7851 } else {
7847 7852 if (pether->ether_type != htons(ETHERTYPE_IP)) {
7848 7853 pether = (struct ether_header *)((char *)pether - 4);
7849 7854 goto retry;
7850 7855 }
7851 7856 }
7852 7857 mhip->mhi_daddr = (uchar_t *)&pether->ether_dhost;
7853 7858 mhip->mhi_saddr = (uchar_t *)&pether->ether_shost;
7854 7859
7855 7860 if (!(mhip->mhi_daddr[0] & 0x01))
7856 7861 return;
7857 7862
7858 7863 /* Multicast or broadcast */
7859 7864 mhip->mhi_dsttype = MAC_ADDRTYPE_MULTICAST;
7860 7865
7861 7866 if ((bmp = ill->ill_bcast_mp) != NULL) {
7862 7867 dl_unitdata_req_t *dlur;
7863 7868 uint8_t *bphys_addr;
7864 7869 uint_t addrlen;
7865 7870
7866 7871 dlur = (dl_unitdata_req_t *)bmp->b_rptr;
7867 7872 addrlen = dlur->dl_dest_addr_length;
7868 7873 if (ill->ill_sap_length < 0) {
7869 7874 bphys_addr = (uchar_t *)dlur +
7870 7875 dlur->dl_dest_addr_offset;
7871 7876 addrlen += ill->ill_sap_length;
7872 7877 } else {
7873 7878 bphys_addr = (uchar_t *)dlur +
7874 7879 dlur->dl_dest_addr_offset +
7875 7880 ill->ill_sap_length;
7876 7881 addrlen -= ill->ill_sap_length;
7877 7882 }
7878 7883 if (bcmp(mhip->mhi_daddr, bphys_addr, addrlen) == 0)
7879 7884 mhip->mhi_dsttype = MAC_ADDRTYPE_BROADCAST;
7880 7885 }
7881 7886 }
7882 7887
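/*
 * Illustrative sketch, not part of this webrev: the destination-address
 * classification done above. The I/G bit (bit 0 of the first octet)
 * marks a group address; among group addresses, only an exact match
 * with the link broadcast address is reported as broadcast. Names are
 * local to the example.
 */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

static const char *
classify(const uint8_t daddr[6], const uint8_t bcast[6])
{
	if (!(daddr[0] & 0x01))
		return ("unicast");
	if (memcmp(daddr, bcast, 6) == 0)
		return ("broadcast");
	return ("multicast");
}

int
main(void)
{
	uint8_t bcast[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
	uint8_t mcast[6] = { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x01 };

	(void) printf("%s\n", classify(mcast, bcast));	/* multicast */
	(void) printf("%s\n", classify(bcast, bcast));	/* broadcast */
	return (0);
}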
7883 7888 /*
7884 7889  * Handle anything but M_DATA messages.
7885 7890  * We see the DL_UNITDATA_IND messages that are part
7886 7891  * of the data path, and also the other messages from the driver.
7887 7892 */
7888 7893 void
7889 7894 ip_rput_notdata(ill_t *ill, mblk_t *mp)
7890 7895 {
7891 7896 mblk_t *first_mp;
7892 7897 struct iocblk *iocp;
7893 7898 struct mac_header_info_s mhi;
7894 7899
7895 7900 switch (DB_TYPE(mp)) {
7896 7901 case M_PROTO:
7897 7902 case M_PCPROTO: {
7898 7903 if (((dl_unitdata_ind_t *)mp->b_rptr)->dl_primitive !=
7899 7904 DL_UNITDATA_IND) {
7900 7905 /* Go handle anything other than data elsewhere. */
7901 7906 ip_rput_dlpi(ill, mp);
7902 7907 return;
7903 7908 }
7904 7909
7905 7910 first_mp = mp;
7906 7911 mp = first_mp->b_cont;
7907 7912 first_mp->b_cont = NULL;
7908 7913
7909 7914 if (mp == NULL) {
7910 7915 freeb(first_mp);
7911 7916 return;
7912 7917 }
7913 7918 ip_dlur_to_mhi(ill, first_mp, &mhi);
7914 7919 if (ill->ill_isv6)
7915 7920 ip_input_v6(ill, NULL, mp, &mhi);
7916 7921 else
7917 7922 ip_input(ill, NULL, mp, &mhi);
7918 7923
7919 7924 /* Ditch the DLPI header. */
7920 7925 freeb(first_mp);
7921 7926 return;
7922 7927 }
7923 7928 case M_IOCACK:
7924 7929 iocp = (struct iocblk *)mp->b_rptr;
7925 7930 switch (iocp->ioc_cmd) {
7926 7931 case DL_IOC_HDR_INFO:
7927 7932 ill_fastpath_ack(ill, mp);
7928 7933 return;
7929 7934 default:
7930 7935 putnext(ill->ill_rq, mp);
7931 7936 return;
7932 7937 }
7933 7938 /* FALLTHRU */
7934 7939 case M_ERROR:
7935 7940 case M_HANGUP:
7936 7941 mutex_enter(&ill->ill_lock);
7937 7942 if (ill->ill_state_flags & ILL_CONDEMNED) {
7938 7943 mutex_exit(&ill->ill_lock);
7939 7944 freemsg(mp);
7940 7945 return;
7941 7946 }
7942 7947 ill_refhold_locked(ill);
7943 7948 mutex_exit(&ill->ill_lock);
7944 7949 qwriter_ip(ill, ill->ill_rq, mp, ip_rput_other, CUR_OP,
7945 7950 B_FALSE);
7946 7951 return;
7947 7952 case M_CTL:
7948 7953 putnext(ill->ill_rq, mp);
7949 7954 return;
7950 7955 case M_IOCNAK:
7951 7956 ip1dbg(("got iocnak "));
7952 7957 iocp = (struct iocblk *)mp->b_rptr;
7953 7958 switch (iocp->ioc_cmd) {
7954 7959 case DL_IOC_HDR_INFO:
7955 7960 ip_rput_other(NULL, ill->ill_rq, mp, NULL);
7956 7961 return;
7957 7962 default:
7958 7963 break;
7959 7964 }
7960 7965 /* FALLTHRU */
7961 7966 default:
7962 7967 putnext(ill->ill_rq, mp);
7963 7968 return;
7964 7969 }
7965 7970 }
7966 7971
7967 7972 /* Read side put procedure. Packets coming from the wire arrive here. */
7968 7973 void
7969 7974 ip_rput(queue_t *q, mblk_t *mp)
7970 7975 {
7971 7976 ill_t *ill;
7972 7977 union DL_primitives *dl;
7973 7978
7974 7979 ill = (ill_t *)q->q_ptr;
7975 7980
7976 7981 if (ill->ill_state_flags & (ILL_CONDEMNED | ILL_LL_SUBNET_PENDING)) {
7977 7982 /*
7978 7983 * If things are opening or closing, only accept high-priority
7979 7984 * DLPI messages. (On open ill->ill_ipif has not yet been
7980 7985 * created; on close, things hanging off the ill may have been
7981 7986 * freed already.)
7982 7987 */
7983 7988 dl = (union DL_primitives *)mp->b_rptr;
7984 7989 if (DB_TYPE(mp) != M_PCPROTO ||
7985 7990 dl->dl_primitive == DL_UNITDATA_IND) {
7986 7991 inet_freemsg(mp);
7987 7992 return;
7988 7993 }
7989 7994 }
7990 7995 if (DB_TYPE(mp) == M_DATA) {
7991 7996 struct mac_header_info_s mhi;
7992 7997
7993 7998 ip_mdata_to_mhi(ill, mp, &mhi);
7994 7999 ip_input(ill, NULL, mp, &mhi);
7995 8000 } else {
7996 8001 ip_rput_notdata(ill, mp);
7997 8002 }
7998 8003 }
7999 8004
8000 8005 /*
8001 8006  * The mblk is shared; move the information to a private copy.
8002 8007 */
8003 8008 mblk_t *
8004 8009 ip_fix_dbref(mblk_t *mp, ip_recv_attr_t *ira)
8005 8010 {
8006 8011 mblk_t *mp1;
8007 8012 ill_t *ill = ira->ira_ill;
8008 8013 ip_stack_t *ipst = ill->ill_ipst;
8009 8014
8010 8015 IP_STAT(ipst, ip_db_ref);
8011 8016
8012 8017 	/* Make sure we have ira_l2src before we lose the original mblk */
8013 8018 if (!(ira->ira_flags & IRAF_L2SRC_SET))
8014 8019 ip_setl2src(mp, ira, ira->ira_rill);
8015 8020
8016 8021 mp1 = copymsg(mp);
8017 8022 if (mp1 == NULL) {
8018 8023 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
8019 8024 ip_drop_input("ipIfStatsInDiscards", mp, ill);
8020 8025 freemsg(mp);
8021 8026 return (NULL);
8022 8027 }
8023 8028 /* preserve the hardware checksum flags and data, if present */
8024 8029 if (DB_CKSUMFLAGS(mp) != 0) {
8025 8030 DB_CKSUMFLAGS(mp1) = DB_CKSUMFLAGS(mp);
8026 8031 DB_CKSUMSTART(mp1) = DB_CKSUMSTART(mp);
8027 8032 DB_CKSUMSTUFF(mp1) = DB_CKSUMSTUFF(mp);
8028 8033 DB_CKSUMEND(mp1) = DB_CKSUMEND(mp);
8029 8034 DB_CKSUM16(mp1) = DB_CKSUM16(mp);
8030 8035 }
8031 8036 freemsg(mp);
8032 8037 return (mp1);
8033 8038 }
8034 8039
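/*
 * Illustrative sketch, not part of this webrev: the copy-on-shared rule
 * applied above. When a buffer's reference count shows other holders
 * (DB_REF(mp) > 1 in the kernel), it must not be modified in place; a
 * private copy is made and the original released. Names are local to
 * the example.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct blk {
	int refcnt;
	unsigned char data[64];
};

/* Return a privately owned block, copying if anyone else holds a ref. */
static struct blk *
make_private(struct blk *bp)
{
	struct blk *cp;

	if (bp->refcnt == 1)
		return (bp);		/* already exclusive */
	cp = malloc(sizeof (*cp));
	if (cp == NULL)
		return (NULL);		/* caller drops the packet */
	(void) memcpy(cp->data, bp->data, sizeof (cp->data));
	cp->refcnt = 1;
	bp->refcnt--;			/* release our hold on the shared blk */
	return (cp);
}

int
main(void)
{
	struct blk shared = { 2, "payload" };
	struct blk *mine = make_private(&shared);

	if (mine != NULL && mine != &shared) {
		(void) printf("copied: %s\n", (char *)mine->data);
		free(mine);
	}
	return (0);
}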
8035 8040 static void
8036 8041 ip_dlpi_error(ill_t *ill, t_uscalar_t prim, t_uscalar_t dl_err,
8037 8042 t_uscalar_t err)
8038 8043 {
8039 8044 if (dl_err == DL_SYSERR) {
8040 8045 (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE,
8041 8046 "%s: %s failed: DL_SYSERR (errno %u)\n",
8042 8047 ill->ill_name, dl_primstr(prim), err);
8043 8048 return;
8044 8049 }
8045 8050
8046 8051 (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE,
8047 8052 "%s: %s failed: %s\n", ill->ill_name, dl_primstr(prim),
8048 8053 dl_errstr(dl_err));
8049 8054 }
8050 8055
8051 8056 /*
8052 8057 * ip_rput_dlpi is called by ip_rput to handle all DLPI messages other
8053 8058 * than DL_UNITDATA_IND messages. If we need to process this message
8054 8059 * exclusively, we call qwriter_ip, in which case we also need to call
8055 8060 * ill_refhold before that, since qwriter_ip does an ill_refrele.
8056 8061 */
8057 8062 void
8058 8063 ip_rput_dlpi(ill_t *ill, mblk_t *mp)
8059 8064 {
8060 8065 dl_ok_ack_t *dloa = (dl_ok_ack_t *)mp->b_rptr;
8061 8066 dl_error_ack_t *dlea = (dl_error_ack_t *)dloa;
8062 8067 queue_t *q = ill->ill_rq;
8063 8068 t_uscalar_t prim = dloa->dl_primitive;
8064 8069 t_uscalar_t reqprim = DL_PRIM_INVAL;
8065 8070
8066 8071 DTRACE_PROBE3(ill__dlpi, char *, "ip_rput_dlpi",
8067 8072 char *, dl_primstr(prim), ill_t *, ill);
8068 8073 ip1dbg(("ip_rput_dlpi"));
8069 8074
8070 8075 /*
8071 8076 * If we received an ACK but didn't send a request for it, then it
8072 8077 * can't be part of any pending operation; discard up-front.
8073 8078 */
8074 8079 switch (prim) {
8075 8080 case DL_ERROR_ACK:
8076 8081 reqprim = dlea->dl_error_primitive;
8077 8082 ip2dbg(("ip_rput_dlpi(%s): DL_ERROR_ACK for %s (0x%x): %s "
8078 8083 "(0x%x), unix %u\n", ill->ill_name, dl_primstr(reqprim),
8079 8084 reqprim, dl_errstr(dlea->dl_errno), dlea->dl_errno,
8080 8085 dlea->dl_unix_errno));
8081 8086 break;
8082 8087 case DL_OK_ACK:
8083 8088 reqprim = dloa->dl_correct_primitive;
8084 8089 break;
8085 8090 case DL_INFO_ACK:
8086 8091 reqprim = DL_INFO_REQ;
8087 8092 break;
8088 8093 case DL_BIND_ACK:
8089 8094 reqprim = DL_BIND_REQ;
8090 8095 break;
8091 8096 case DL_PHYS_ADDR_ACK:
8092 8097 reqprim = DL_PHYS_ADDR_REQ;
8093 8098 break;
8094 8099 case DL_NOTIFY_ACK:
8095 8100 reqprim = DL_NOTIFY_REQ;
8096 8101 break;
8097 8102 case DL_CAPABILITY_ACK:
8098 8103 reqprim = DL_CAPABILITY_REQ;
8099 8104 break;
8100 8105 }
8101 8106
8102 8107 if (prim != DL_NOTIFY_IND) {
8103 8108 if (reqprim == DL_PRIM_INVAL ||
8104 8109 !ill_dlpi_pending(ill, reqprim)) {
8105 8110 /* Not a DLPI message we support or expected */
8106 8111 freemsg(mp);
8107 8112 return;
8108 8113 }
8109 8114 ip1dbg(("ip_rput: received %s for %s\n", dl_primstr(prim),
8110 8115 dl_primstr(reqprim)));
8111 8116 }
8112 8117
8113 8118 switch (reqprim) {
8114 8119 case DL_UNBIND_REQ:
8115 8120 /*
8116 8121 * NOTE: we mark the unbind as complete even if we got a
8117 8122 * DL_ERROR_ACK, since there's not much else we can do.
8118 8123 */
8119 8124 mutex_enter(&ill->ill_lock);
8120 8125 ill->ill_state_flags &= ~ILL_DL_UNBIND_IN_PROGRESS;
8121 8126 cv_signal(&ill->ill_cv);
8122 8127 mutex_exit(&ill->ill_lock);
8123 8128 break;
8124 8129
8125 8130 case DL_ENABMULTI_REQ:
8126 8131 if (prim == DL_OK_ACK) {
8127 8132 if (ill->ill_dlpi_multicast_state == IDS_INPROGRESS)
8128 8133 ill->ill_dlpi_multicast_state = IDS_OK;
8129 8134 }
8130 8135 break;
8131 8136 }
8132 8137
8133 8138 /*
8134 8139 * The message is one we're waiting for (or DL_NOTIFY_IND), but we
8135 8140 * need to become writer to continue to process it. Because an
8136 8141 * exclusive operation doesn't complete until replies to all queued
8137 8142 * DLPI messages have been received, we know we're in the middle of an
8138 8143 * exclusive operation and pass CUR_OP (except for DL_NOTIFY_IND).
8139 8144 *
8140 8145 * As required by qwriter_ip(), we refhold the ill; it will refrele.
8141 8146 * Since this is on the ill stream we unconditionally bump up the
8142 8147 * refcount without doing ILL_CAN_LOOKUP().
8143 8148 */
8144 8149 ill_refhold(ill);
8145 8150 if (prim == DL_NOTIFY_IND)
8146 8151 qwriter_ip(ill, q, mp, ip_rput_dlpi_writer, NEW_OP, B_FALSE);
8147 8152 else
8148 8153 qwriter_ip(ill, q, mp, ip_rput_dlpi_writer, CUR_OP, B_FALSE);
8149 8154 }
8150 8155
8151 8156 /*
8152 8157 * Handling of DLPI messages that require exclusive access to the ipsq.
8153 8158 *
8154 8159  * We need to do ipsq_pending_mp_get on ioctl completion, which could
8155 8160  * happen here (along with mi_copy_done).
8156 8161 */
8157 8162 /* ARGSUSED */
8158 8163 static void
8159 8164 ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
8160 8165 {
8161 8166 dl_ok_ack_t *dloa = (dl_ok_ack_t *)mp->b_rptr;
8162 8167 dl_error_ack_t *dlea = (dl_error_ack_t *)dloa;
8163 8168 int err = 0;
8164 8169 ill_t *ill = (ill_t *)q->q_ptr;
8165 8170 ipif_t *ipif = NULL;
8166 8171 mblk_t *mp1 = NULL;
8167 8172 conn_t *connp = NULL;
8168 8173 t_uscalar_t paddrreq;
8169 8174 mblk_t *mp_hw;
8170 8175 boolean_t success;
8171 8176 boolean_t ioctl_aborted = B_FALSE;
8172 8177 boolean_t log = B_TRUE;
8173 8178
8174 8179 DTRACE_PROBE3(ill__dlpi, char *, "ip_rput_dlpi_writer",
8175 8180 char *, dl_primstr(dloa->dl_primitive), ill_t *, ill);
8176 8181
8177 8182 ip1dbg(("ip_rput_dlpi_writer .."));
8178 8183 ASSERT(ipsq->ipsq_xop == ill->ill_phyint->phyint_ipsq->ipsq_xop);
8179 8184 ASSERT(IAM_WRITER_ILL(ill));
8180 8185
8181 8186 ipif = ipsq->ipsq_xop->ipx_pending_ipif;
8182 8187 /*
8183 8188 * The current ioctl could have been aborted by the user and a new
8184 8189 * ioctl to bring up another ill could have started. We could still
8185 8190 * get a response from the driver later.
8186 8191 */
8187 8192 if (ipif != NULL && ipif->ipif_ill != ill)
8188 8193 ioctl_aborted = B_TRUE;
8189 8194
8190 8195 switch (dloa->dl_primitive) {
8191 8196 case DL_ERROR_ACK:
8192 8197 ip1dbg(("ip_rput_dlpi_writer: got DL_ERROR_ACK for %s\n",
8193 8198 dl_primstr(dlea->dl_error_primitive)));
8194 8199
8195 8200 DTRACE_PROBE3(ill__dlpi, char *, "ip_rput_dlpi_writer error",
8196 8201 char *, dl_primstr(dlea->dl_error_primitive),
8197 8202 ill_t *, ill);
8198 8203
8199 8204 switch (dlea->dl_error_primitive) {
8200 8205 case DL_DISABMULTI_REQ:
8201 8206 ill_dlpi_done(ill, dlea->dl_error_primitive);
8202 8207 break;
8203 8208 case DL_PROMISCON_REQ:
8204 8209 case DL_PROMISCOFF_REQ:
8205 8210 case DL_UNBIND_REQ:
8206 8211 case DL_ATTACH_REQ:
8207 8212 case DL_INFO_REQ:
8208 8213 ill_dlpi_done(ill, dlea->dl_error_primitive);
8209 8214 break;
8210 8215 case DL_NOTIFY_REQ:
8211 8216 ill_dlpi_done(ill, DL_NOTIFY_REQ);
8212 8217 log = B_FALSE;
8213 8218 break;
8214 8219 case DL_PHYS_ADDR_REQ:
8215 8220 /*
8216 8221 * For IPv6 only, there are two additional
8217 8222 * phys_addr_req's sent to the driver to get the
8218 8223 * IPv6 token and lla. This allows IP to acquire
8219 8224 * the hardware address format for a given interface
8220 8225 * without having built in knowledge of the hardware
8221 8226 * address. ill_phys_addr_pend keeps track of the last
8222 8227 * DL_PAR sent so we know which response we are
8223 8228 * dealing with. ill_dlpi_done will update
8224 8229 * ill_phys_addr_pend when it sends the next req.
8225 8230 * We don't complete the IOCTL until all three DL_PARs
8226 8231 * have been attempted, so set *_len to 0 and break.
8227 8232 */
8228 8233 paddrreq = ill->ill_phys_addr_pend;
8229 8234 ill_dlpi_done(ill, DL_PHYS_ADDR_REQ);
8230 8235 if (paddrreq == DL_IPV6_TOKEN) {
8231 8236 ill->ill_token_length = 0;
8232 8237 log = B_FALSE;
8233 8238 break;
8234 8239 } else if (paddrreq == DL_IPV6_LINK_LAYER_ADDR) {
8235 8240 ill->ill_nd_lla_len = 0;
8236 8241 log = B_FALSE;
8237 8242 break;
8238 8243 }
8239 8244 /*
8240 8245 * Something went wrong with the DL_PHYS_ADDR_REQ.
8241 8246 * We presumably have an IOCTL hanging out waiting
8242 8247 * for completion. Find it and complete the IOCTL
8243 8248 * with the error noted.
8244 8249 * However, ill_dl_phys was called on an ill queue
8245 8250 * (from SIOCSLIFNAME), thus conn_pending_ill is not
8246 8251 * set. But the ioctl is known to be pending on ill_wq.
8247 8252 */
8248 8253 if (!ill->ill_ifname_pending)
8249 8254 break;
8250 8255 ill->ill_ifname_pending = 0;
8251 8256 if (!ioctl_aborted)
8252 8257 mp1 = ipsq_pending_mp_get(ipsq, &connp);
8253 8258 if (mp1 != NULL) {
8254 8259 /*
8255 8260 * This operation (SIOCSLIFNAME) must have
8256 8261 * happened on the ill. Assert there is no conn
8257 8262 */
8258 8263 ASSERT(connp == NULL);
8259 8264 q = ill->ill_wq;
8260 8265 }
8261 8266 break;
8262 8267 case DL_BIND_REQ:
8263 8268 ill_dlpi_done(ill, DL_BIND_REQ);
8264 8269 if (ill->ill_ifname_pending)
8265 8270 break;
8266 8271 mutex_enter(&ill->ill_lock);
8267 8272 ill->ill_state_flags &= ~ILL_DOWN_IN_PROGRESS;
8268 8273 mutex_exit(&ill->ill_lock);
8269 8274 /*
8270 8275 * Something went wrong with the bind. We presumably
8271 8276 * have an IOCTL hanging out waiting for completion.
8272 8277 * Find it, take down the interface that was coming
8273 8278 * up, and complete the IOCTL with the error noted.
8274 8279 */
8275 8280 if (!ioctl_aborted)
8276 8281 mp1 = ipsq_pending_mp_get(ipsq, &connp);
8277 8282 if (mp1 != NULL) {
8278 8283 /*
8279 8284 * This might be a result of a DL_NOTE_REPLUMB
8280 8285 * notification. In that case, connp is NULL.
8281 8286 */
8282 8287 if (connp != NULL)
8283 8288 q = CONNP_TO_WQ(connp);
8284 8289
8285 8290 (void) ipif_down(ipif, NULL, NULL);
8286 8291 /* error is set below the switch */
8287 8292 }
8288 8293 break;
8289 8294 case DL_ENABMULTI_REQ:
8290 8295 ill_dlpi_done(ill, DL_ENABMULTI_REQ);
8291 8296
8292 8297 if (ill->ill_dlpi_multicast_state == IDS_INPROGRESS)
8293 8298 ill->ill_dlpi_multicast_state = IDS_FAILED;
8294 8299 if (ill->ill_dlpi_multicast_state == IDS_FAILED) {
8295 8300
8296 8301 printf("ip: joining multicasts failed (%d)"
8297 8302 " on %s - will use link layer "
8298 8303 "broadcasts for multicast\n",
8299 8304 dlea->dl_errno, ill->ill_name);
8300 8305
8301 8306 /*
8302 8307 * Set up for multi_bcast; We are the
8303 8308 * writer, so ok to access ill->ill_ipif
8304 8309 * without any lock.
8305 8310 */
8306 8311 mutex_enter(&ill->ill_phyint->phyint_lock);
8307 8312 ill->ill_phyint->phyint_flags |=
8308 8313 PHYI_MULTI_BCAST;
8309 8314 mutex_exit(&ill->ill_phyint->phyint_lock);
8310 8315
8311 8316 }
8312 8317 freemsg(mp); /* Don't want to pass this up */
8313 8318 return;
8314 8319 case DL_CAPABILITY_REQ:
8315 8320 ip1dbg(("ip_rput_dlpi_writer: got DL_ERROR_ACK for "
8316 8321 "DL_CAPABILITY REQ\n"));
8317 8322 if (ill->ill_dlpi_capab_state == IDCS_PROBE_SENT)
8318 8323 ill->ill_dlpi_capab_state = IDCS_FAILED;
8319 8324 ill_capability_done(ill);
8320 8325 freemsg(mp);
8321 8326 return;
8322 8327 }
8323 8328 /*
8324 8329 * Note the error for IOCTL completion (mp1 is set when
8325 8330 * ready to complete ioctl). If ill_ifname_pending_err is
8326 8331 		 * set, an error occurred during plumbing (ill_ifname_pending),
8327 8332 * so we want to report that error.
8328 8333 *
8329 8334 		 * NOTE: there are two additional DL_PHYS_ADDR_REQ's
8330 8335 * (DL_IPV6_TOKEN and DL_IPV6_LINK_LAYER_ADDR) that are
8331 8336 * expected to get errack'd if the driver doesn't support
8332 8337 * these flags (e.g. ethernet). log will be set to B_FALSE
8333 8338 * if these error conditions are encountered.
8334 8339 */
8335 8340 if (mp1 != NULL) {
8336 8341 if (ill->ill_ifname_pending_err != 0) {
8337 8342 err = ill->ill_ifname_pending_err;
8338 8343 ill->ill_ifname_pending_err = 0;
8339 8344 } else {
8340 8345 err = dlea->dl_unix_errno ?
8341 8346 dlea->dl_unix_errno : ENXIO;
8342 8347 }
8343 8348 /*
8344 8349 * If we're plumbing an interface and an error hasn't already
8345 8350 * been saved, set ill_ifname_pending_err to the error passed
8346 8351 * up. Ignore the error if log is B_FALSE (see comment above).
8347 8352 */
8348 8353 } else if (log && ill->ill_ifname_pending &&
8349 8354 ill->ill_ifname_pending_err == 0) {
8350 8355 ill->ill_ifname_pending_err = dlea->dl_unix_errno ?
8351 8356 dlea->dl_unix_errno : ENXIO;
8352 8357 }
8353 8358
8354 8359 if (log)
8355 8360 ip_dlpi_error(ill, dlea->dl_error_primitive,
8356 8361 dlea->dl_errno, dlea->dl_unix_errno);
8357 8362 break;
8358 8363 case DL_CAPABILITY_ACK:
8359 8364 ill_capability_ack(ill, mp);
8360 8365 /*
8361 8366 * The message has been handed off to ill_capability_ack
8362 8367 * and must not be freed below
8363 8368 */
8364 8369 mp = NULL;
8365 8370 break;
8366 8371
8367 8372 case DL_INFO_ACK:
8368 8373 /* Call a routine to handle this one. */
8369 8374 ill_dlpi_done(ill, DL_INFO_REQ);
8370 8375 ip_ll_subnet_defaults(ill, mp);
8371 8376 ASSERT(!MUTEX_HELD(&ill->ill_phyint->phyint_ipsq->ipsq_lock));
8372 8377 return;
8373 8378 case DL_BIND_ACK:
8374 8379 /*
8375 8380 * We should have an IOCTL waiting on this unless
8376 8381 * sent by ill_dl_phys, in which case just return
8377 8382 */
8378 8383 ill_dlpi_done(ill, DL_BIND_REQ);
8379 8384
8380 8385 if (ill->ill_ifname_pending) {
8381 8386 DTRACE_PROBE2(ip__rput__dlpi__ifname__pending,
8382 8387 ill_t *, ill, mblk_t *, mp);
8383 8388 break;
8384 8389 }
8385 8390 mutex_enter(&ill->ill_lock);
8386 8391 ill->ill_dl_up = 1;
8387 8392 ill->ill_state_flags &= ~ILL_DOWN_IN_PROGRESS;
8388 8393 mutex_exit(&ill->ill_lock);
8389 8394
8390 8395 if (!ioctl_aborted)
8391 8396 mp1 = ipsq_pending_mp_get(ipsq, &connp);
8392 8397 if (mp1 == NULL) {
8393 8398 DTRACE_PROBE1(ip__rput__dlpi__no__mblk, ill_t *, ill);
8394 8399 break;
8395 8400 }
8396 8401 /*
8397 8402 * mp1 was added by ill_dl_up(). if that is a result of
8398 8403 * a DL_NOTE_REPLUMB notification, connp could be NULL.
8399 8404 */
8400 8405 if (connp != NULL)
8401 8406 q = CONNP_TO_WQ(connp);
8402 8407 /*
8403 8408 * We are exclusive. So nothing can change even after
8404 8409 * we get the pending mp.
8405 8410 */
8406 8411 ip1dbg(("ip_rput_dlpi: bind_ack %s\n", ill->ill_name));
8407 8412 DTRACE_PROBE1(ip__rput__dlpi__bind__ack, ill_t *, ill);
8408 8413 ill_nic_event_dispatch(ill, 0, NE_UP, NULL, 0);
8409 8414
8410 8415 /*
8411 8416 * Now bring up the resolver; when that is complete, we'll
8412 8417 * create IREs. Note that we intentionally mirror what
8413 8418 * ipif_up() would have done, because we got here by way of
8414 8419 * ill_dl_up(), which stopped ipif_up()'s processing.
8415 8420 */
8416 8421 if (ill->ill_isv6) {
8417 8422 /*
8418 8423 * v6 interfaces.
8419 8424 * Unlike ARP which has to do another bind
8420 8425 * and attach, once we get here we are
8421 8426 * done with NDP
8422 8427 */
8423 8428 (void) ipif_resolver_up(ipif, Res_act_initial);
8424 8429 if ((err = ipif_ndp_up(ipif, B_TRUE)) == 0)
8425 8430 err = ipif_up_done_v6(ipif);
8426 8431 } else if (ill->ill_net_type == IRE_IF_RESOLVER) {
8427 8432 /*
8428 8433 * ARP and other v4 external resolvers.
8429 8434 * Leave the pending mblk intact so that
8430 8435 * the ioctl completes in ip_rput().
8431 8436 */
8432 8437 if (connp != NULL)
8433 8438 mutex_enter(&connp->conn_lock);
8434 8439 mutex_enter(&ill->ill_lock);
8435 8440 success = ipsq_pending_mp_add(connp, ipif, q, mp1, 0);
8436 8441 mutex_exit(&ill->ill_lock);
8437 8442 if (connp != NULL)
8438 8443 mutex_exit(&connp->conn_lock);
8439 8444 if (success) {
8440 8445 err = ipif_resolver_up(ipif, Res_act_initial);
8441 8446 if (err == EINPROGRESS) {
8442 8447 freemsg(mp);
8443 8448 return;
8444 8449 }
8445 8450 mp1 = ipsq_pending_mp_get(ipsq, &connp);
8446 8451 } else {
8447 8452 /* The conn has started closing */
8448 8453 err = EINTR;
8449 8454 }
8450 8455 } else {
8451 8456 /*
8452 8457 * This one is complete. Reply to pending ioctl.
8453 8458 */
8454 8459 (void) ipif_resolver_up(ipif, Res_act_initial);
8455 8460 err = ipif_up_done(ipif);
8456 8461 }
8457 8462
8458 8463 if ((err == 0) && (ill->ill_up_ipifs)) {
8459 8464 err = ill_up_ipifs(ill, q, mp1);
8460 8465 if (err == EINPROGRESS) {
8461 8466 freemsg(mp);
8462 8467 return;
8463 8468 }
8464 8469 }
8465 8470
8466 8471 /*
8467 8472 * If we have a moved ipif to bring up, and everything has
8468 8473 * succeeded to this point, bring it up on the IPMP ill.
8469 8474 * Otherwise, leave it down -- the admin can try to bring it
8470 8475 * up by hand if need be.
8471 8476 */
8472 8477 if (ill->ill_move_ipif != NULL) {
8473 8478 if (err != 0) {
8474 8479 ill->ill_move_ipif = NULL;
8475 8480 } else {
8476 8481 ipif = ill->ill_move_ipif;
8477 8482 ill->ill_move_ipif = NULL;
8478 8483 err = ipif_up(ipif, q, mp1);
8479 8484 if (err == EINPROGRESS) {
8480 8485 freemsg(mp);
8481 8486 return;
8482 8487 }
8483 8488 }
8484 8489 }
8485 8490 break;
8486 8491
8487 8492 case DL_NOTIFY_IND: {
8488 8493 dl_notify_ind_t *notify = (dl_notify_ind_t *)mp->b_rptr;
8489 8494 uint_t orig_mtu, orig_mc_mtu;
8490 8495
8491 8496 switch (notify->dl_notification) {
8492 8497 case DL_NOTE_PHYS_ADDR:
8493 8498 err = ill_set_phys_addr(ill, mp);
8494 8499 break;
8495 8500
8496 8501 case DL_NOTE_REPLUMB:
8497 8502 /*
8498 8503 * Directly return after calling ill_replumb().
8499 8504 * Note that we should not free mp as it is reused
8500 8505 * in the ill_replumb() function.
8501 8506 */
8502 8507 err = ill_replumb(ill, mp);
8503 8508 return;
8504 8509
8505 8510 case DL_NOTE_FASTPATH_FLUSH:
8506 8511 nce_flush(ill, B_FALSE);
8507 8512 break;
8508 8513
8509 8514 case DL_NOTE_SDU_SIZE:
8510 8515 case DL_NOTE_SDU_SIZE2:
8511 8516 /*
8512 8517 * The dce and fragmentation code can cope with
8513 8518 * this changing while packets are being sent.
8514 8519 * When packets are sent ip_output will discover
8515 8520 * a change.
8516 8521 *
8517 8522 * Change the MTU size of the interface.
8518 8523 */
8519 8524 mutex_enter(&ill->ill_lock);
8520 8525 orig_mtu = ill->ill_mtu;
8521 8526 orig_mc_mtu = ill->ill_mc_mtu;
8522 8527 switch (notify->dl_notification) {
8523 8528 case DL_NOTE_SDU_SIZE:
8524 8529 ill->ill_current_frag =
8525 8530 (uint_t)notify->dl_data;
8526 8531 ill->ill_mc_mtu = (uint_t)notify->dl_data;
8527 8532 break;
8528 8533 case DL_NOTE_SDU_SIZE2:
8529 8534 ill->ill_current_frag =
8530 8535 (uint_t)notify->dl_data1;
8531 8536 ill->ill_mc_mtu = (uint_t)notify->dl_data2;
8532 8537 break;
8533 8538 }
8534 8539 if (ill->ill_current_frag > ill->ill_max_frag)
8535 8540 ill->ill_max_frag = ill->ill_current_frag;
8536 8541
8537 8542 if (!(ill->ill_flags & ILLF_FIXEDMTU)) {
8538 8543 ill->ill_mtu = ill->ill_current_frag;
8539 8544
8540 8545 /*
8541 8546 * If ill_user_mtu was set (via
8542 8547 * SIOCSLIFLNKINFO), clamp ill_mtu at it.
8543 8548 */
8544 8549 if (ill->ill_user_mtu != 0 &&
8545 8550 ill->ill_user_mtu < ill->ill_mtu)
8546 8551 ill->ill_mtu = ill->ill_user_mtu;
8547 8552
8548 8553 if (ill->ill_user_mtu != 0 &&
8549 8554 ill->ill_user_mtu < ill->ill_mc_mtu)
8550 8555 ill->ill_mc_mtu = ill->ill_user_mtu;
8551 8556
8552 8557 if (ill->ill_isv6) {
8553 8558 if (ill->ill_mtu < IPV6_MIN_MTU)
8554 8559 ill->ill_mtu = IPV6_MIN_MTU;
8555 8560 if (ill->ill_mc_mtu < IPV6_MIN_MTU)
8556 8561 ill->ill_mc_mtu = IPV6_MIN_MTU;
8557 8562 } else {
8558 8563 if (ill->ill_mtu < IP_MIN_MTU)
8559 8564 ill->ill_mtu = IP_MIN_MTU;
8560 8565 if (ill->ill_mc_mtu < IP_MIN_MTU)
8561 8566 ill->ill_mc_mtu = IP_MIN_MTU;
8562 8567 }
8563 8568 } else if (ill->ill_mc_mtu > ill->ill_mtu) {
8564 8569 ill->ill_mc_mtu = ill->ill_mtu;
8565 8570 }
8566 8571
8567 8572 mutex_exit(&ill->ill_lock);
8568 8573 /*
8569 8574 * Make sure all dce_generation checks find out
8570 8575 * that ill_mtu/ill_mc_mtu has changed.
8571 8576 */
8572 8577 if (orig_mtu != ill->ill_mtu ||
8573 8578 orig_mc_mtu != ill->ill_mc_mtu) {
8574 8579 dce_increment_all_generations(ill->ill_isv6,
8575 8580 ill->ill_ipst);
8576 8581 }
8577 8582
8578 8583 /*
8579 8584 * Refresh IPMP meta-interface MTU if necessary.
8580 8585 */
8581 8586 if (IS_UNDER_IPMP(ill))
8582 8587 ipmp_illgrp_refresh_mtu(ill->ill_grp);
8583 8588 break;
8584 8589
8585 8590 case DL_NOTE_LINK_UP:
8586 8591 case DL_NOTE_LINK_DOWN: {
8587 8592 /*
8588 8593 * We are writer. ill / phyint / ipsq assocs stable.
8589 8594 * The RUNNING flag reflects the state of the link.
8590 8595 */
8591 8596 phyint_t *phyint = ill->ill_phyint;
8592 8597 uint64_t new_phyint_flags;
8593 8598 boolean_t changed = B_FALSE;
8594 8599 boolean_t went_up;
8595 8600
8596 8601 went_up = notify->dl_notification == DL_NOTE_LINK_UP;
8597 8602 mutex_enter(&phyint->phyint_lock);
8598 8603
8599 8604 new_phyint_flags = went_up ?
8600 8605 phyint->phyint_flags | PHYI_RUNNING :
8601 8606 phyint->phyint_flags & ~PHYI_RUNNING;
8602 8607
8603 8608 if (IS_IPMP(ill)) {
8604 8609 new_phyint_flags = went_up ?
8605 8610 new_phyint_flags & ~PHYI_FAILED :
8606 8611 new_phyint_flags | PHYI_FAILED;
8607 8612 }
8608 8613
8609 8614 if (new_phyint_flags != phyint->phyint_flags) {
8610 8615 phyint->phyint_flags = new_phyint_flags;
8611 8616 changed = B_TRUE;
8612 8617 }
8613 8618 mutex_exit(&phyint->phyint_lock);
8614 8619 /*
8615 8620 * ill_restart_dad handles the DAD restart and routing
8616 8621 * socket notification logic.
8617 8622 */
8618 8623 if (changed) {
8619 8624 ill_restart_dad(phyint->phyint_illv4, went_up);
8620 8625 ill_restart_dad(phyint->phyint_illv6, went_up);
8621 8626 }
8622 8627 break;
8623 8628 }
8624 8629 case DL_NOTE_PROMISC_ON_PHYS: {
8625 8630 phyint_t *phyint = ill->ill_phyint;
8626 8631
8627 8632 mutex_enter(&phyint->phyint_lock);
8628 8633 phyint->phyint_flags |= PHYI_PROMISC;
8629 8634 mutex_exit(&phyint->phyint_lock);
8630 8635 break;
8631 8636 }
8632 8637 case DL_NOTE_PROMISC_OFF_PHYS: {
8633 8638 phyint_t *phyint = ill->ill_phyint;
8634 8639
8635 8640 mutex_enter(&phyint->phyint_lock);
8636 8641 phyint->phyint_flags &= ~PHYI_PROMISC;
8637 8642 mutex_exit(&phyint->phyint_lock);
8638 8643 break;
8639 8644 }
8640 8645 case DL_NOTE_CAPAB_RENEG:
8641 8646 /*
8642 8647 * Something changed on the driver side.
8643 8648 * It wants us to renegotiate the capabilities
8644 8649 * on this ill. One possible cause is the aggregation
8645 8650 * interface under us where a port got added or
8646 8651 * went away.
8647 8652 *
8648 8653 * If the capability negotiation is already done
8649 8654 * or is in progress, reset the capabilities and
8650 8655 * mark the ill's ill_capab_reneg to be B_TRUE,
8651 8656 * so that when the ack comes back, we can start
8652 8657 * the renegotiation process.
8653 8658 *
8654 8659 * Note that if ill_capab_reneg is already B_TRUE
8655 8660 * (ill_dlpi_capab_state is IDS_UNKNOWN in this case),
8656 8661 * the capability resetting request has been sent
8657 8662 * and the renegotiation has not been started yet;
8658 8663 * nothing needs to be done in this case.
8659 8664 */
8660 8665 ipsq_current_start(ipsq, ill->ill_ipif, 0);
8661 8666 ill_capability_reset(ill, B_TRUE);
8662 8667 ipsq_current_finish(ipsq);
8663 8668 break;
8664 8669
8665 8670 case DL_NOTE_ALLOWED_IPS:
8666 8671 ill_set_allowed_ips(ill, mp);
8667 8672 break;
8668 8673 default:
8669 8674 ip0dbg(("ip_rput_dlpi_writer: unknown notification "
8670 8675 "type 0x%x for DL_NOTIFY_IND\n",
8671 8676 notify->dl_notification));
8672 8677 break;
8673 8678 }
8674 8679
8675 8680 /*
8676 8681 * As this is an asynchronous operation, we
8677 8682 * should not call ill_dlpi_done
8678 8683 */
8679 8684 break;
8680 8685 }
8681 8686 case DL_NOTIFY_ACK: {
8682 8687 dl_notify_ack_t *noteack = (dl_notify_ack_t *)mp->b_rptr;
8683 8688
8684 8689 if (noteack->dl_notifications & DL_NOTE_LINK_UP)
8685 8690 ill->ill_note_link = 1;
8686 8691 ill_dlpi_done(ill, DL_NOTIFY_REQ);
8687 8692 break;
8688 8693 }
8689 8694 case DL_PHYS_ADDR_ACK: {
8690 8695 /*
8691 8696 * As part of plumbing the interface via SIOCSLIFNAME,
8692 8697 * ill_dl_phys() will queue a series of DL_PHYS_ADDR_REQs,
8693 8698 * whose answers we receive here. As each answer is received,
8694 8699 * we call ill_dlpi_done() to dispatch the next request as
8695 8700 * we're processing the current one. Once all answers have
8696 8701 * been received, we use ipsq_pending_mp_get() to dequeue the
8697 8702 * outstanding IOCTL and reply to it. (Because ill_dl_phys()
8698 8703 * is invoked from an ill queue, conn_oper_pending_ill is not
8699 8704 * available, but we know the ioctl is pending on ill_wq.)
8700 8705 */
8701 8706 uint_t paddrlen, paddroff;
8702 8707 uint8_t *addr;
8703 8708
8704 8709 paddrreq = ill->ill_phys_addr_pend;
8705 8710 paddrlen = ((dl_phys_addr_ack_t *)mp->b_rptr)->dl_addr_length;
8706 8711 paddroff = ((dl_phys_addr_ack_t *)mp->b_rptr)->dl_addr_offset;
8707 8712 addr = mp->b_rptr + paddroff;
8708 8713
8709 8714 ill_dlpi_done(ill, DL_PHYS_ADDR_REQ);
8710 8715 if (paddrreq == DL_IPV6_TOKEN) {
8711 8716 /*
8712 8717 * bcopy to low-order bits of ill_token
8713 8718 *
8714 8719 * XXX Temporary hack - currently, all known tokens
8715 8720 * are 64 bits, so I'll cheat for the moment.
8716 8721 */
8717 8722 bcopy(addr, &ill->ill_token.s6_addr32[2], paddrlen);
8718 8723 ill->ill_token_length = paddrlen;
8719 8724 break;
8720 8725 } else if (paddrreq == DL_IPV6_LINK_LAYER_ADDR) {
8721 8726 ASSERT(ill->ill_nd_lla_mp == NULL);
8722 8727 ill_set_ndmp(ill, mp, paddroff, paddrlen);
8723 8728 mp = NULL;
8724 8729 break;
8725 8730 } else if (paddrreq == DL_CURR_DEST_ADDR) {
8726 8731 ASSERT(ill->ill_dest_addr_mp == NULL);
8727 8732 ill->ill_dest_addr_mp = mp;
8728 8733 ill->ill_dest_addr = addr;
8729 8734 mp = NULL;
8730 8735 if (ill->ill_isv6) {
8731 8736 ill_setdesttoken(ill);
8732 8737 ipif_setdestlinklocal(ill->ill_ipif);
8733 8738 }
8734 8739 break;
8735 8740 }
8736 8741
8737 8742 ASSERT(paddrreq == DL_CURR_PHYS_ADDR);
8738 8743 ASSERT(ill->ill_phys_addr_mp == NULL);
8739 8744 if (!ill->ill_ifname_pending)
8740 8745 break;
8741 8746 ill->ill_ifname_pending = 0;
8742 8747 if (!ioctl_aborted)
8743 8748 mp1 = ipsq_pending_mp_get(ipsq, &connp);
8744 8749 if (mp1 != NULL) {
8745 8750 ASSERT(connp == NULL);
8746 8751 q = ill->ill_wq;
8747 8752 }
8748 8753 /*
8749 8754 * If any error acks received during the plumbing sequence,
8750 8755 * ill_ifname_pending_err will be set. Break out and send up
8751 8756 * the error to the pending ioctl.
8752 8757 */
8753 8758 if (ill->ill_ifname_pending_err != 0) {
8754 8759 err = ill->ill_ifname_pending_err;
8755 8760 ill->ill_ifname_pending_err = 0;
8756 8761 break;
8757 8762 }
8758 8763
8759 8764 ill->ill_phys_addr_mp = mp;
8760 8765 ill->ill_phys_addr = (paddrlen == 0 ? NULL : addr);
8761 8766 mp = NULL;
8762 8767
8763 8768 /*
8764 8769 * If paddrlen or ill_phys_addr_length is zero, the DLPI
8765 8770 * provider doesn't support physical addresses. We check both
8766 8771 * paddrlen and ill_phys_addr_length because sppp (PPP) does
8767 8772 		 * not have physical addresses, but historically advertises a
8768 8773 * physical address length of 0 in its DL_INFO_ACK, but 6 in
8769 8774 * its DL_PHYS_ADDR_ACK.
8770 8775 */
8771 8776 if (paddrlen == 0 || ill->ill_phys_addr_length == 0) {
8772 8777 ill->ill_phys_addr = NULL;
8773 8778 } else if (paddrlen != ill->ill_phys_addr_length) {
8774 8779 ip0dbg(("DL_PHYS_ADDR_ACK: got addrlen %d, expected %d",
8775 8780 paddrlen, ill->ill_phys_addr_length));
8776 8781 err = EINVAL;
8777 8782 break;
8778 8783 }
8779 8784
8780 8785 if (ill->ill_nd_lla_mp == NULL) {
8781 8786 if ((mp_hw = copyb(ill->ill_phys_addr_mp)) == NULL) {
8782 8787 err = ENOMEM;
8783 8788 break;
8784 8789 }
8785 8790 ill_set_ndmp(ill, mp_hw, paddroff, paddrlen);
8786 8791 }
8787 8792
8788 8793 if (ill->ill_isv6) {
8789 8794 ill_setdefaulttoken(ill);
8790 8795 ipif_setlinklocal(ill->ill_ipif);
8791 8796 }
8792 8797 break;
8793 8798 }
8794 8799 case DL_OK_ACK:
8795 8800 ip2dbg(("DL_OK_ACK %s (0x%x)\n",
8796 8801 dl_primstr((int)dloa->dl_correct_primitive),
8797 8802 dloa->dl_correct_primitive));
8798 8803 DTRACE_PROBE3(ill__dlpi, char *, "ip_rput_dlpi_writer ok",
8799 8804 char *, dl_primstr(dloa->dl_correct_primitive),
8800 8805 ill_t *, ill);
8801 8806
8802 8807 switch (dloa->dl_correct_primitive) {
8803 8808 case DL_ENABMULTI_REQ:
8804 8809 case DL_DISABMULTI_REQ:
8805 8810 ill_dlpi_done(ill, dloa->dl_correct_primitive);
8806 8811 break;
8807 8812 case DL_PROMISCON_REQ:
8808 8813 case DL_PROMISCOFF_REQ:
8809 8814 case DL_UNBIND_REQ:
8810 8815 case DL_ATTACH_REQ:
8811 8816 ill_dlpi_done(ill, dloa->dl_correct_primitive);
8812 8817 break;
8813 8818 }
8814 8819 break;
8815 8820 default:
8816 8821 break;
8817 8822 }
8818 8823
8819 8824 freemsg(mp);
8820 8825 if (mp1 == NULL)
8821 8826 return;
8822 8827
8823 8828 /*
8824 8829 * The operation must complete without EINPROGRESS since
8825 8830 * ipsq_pending_mp_get() has removed the mblk (mp1). Otherwise,
8826 8831 * the operation will be stuck forever inside the IPSQ.
8827 8832 */
8828 8833 ASSERT(err != EINPROGRESS);
8829 8834
8830 8835 DTRACE_PROBE4(ipif__ioctl, char *, "ip_rput_dlpi_writer finish",
8831 8836 int, ipsq->ipsq_xop->ipx_current_ioctl, ill_t *, ill,
8832 8837 ipif_t *, NULL);
8833 8838
8834 8839 switch (ipsq->ipsq_xop->ipx_current_ioctl) {
8835 8840 case 0:
8836 8841 ipsq_current_finish(ipsq);
8837 8842 break;
8838 8843
8839 8844 case SIOCSLIFNAME:
8840 8845 case IF_UNITSEL: {
8841 8846 ill_t *ill_other = ILL_OTHER(ill);
8842 8847
8843 8848 /*
8844 8849 * If SIOCSLIFNAME or IF_UNITSEL is about to succeed, and the
8845 8850 * ill has a peer which is in an IPMP group, then place ill
8846 8851 * into the same group. One catch: although ifconfig plumbs
8847 8852 * the appropriate IPMP meta-interface prior to plumbing this
8848 8853 * ill, it is possible for multiple ifconfig applications to
8849 8854 * race (or for another application to adjust plumbing), in
8850 8855 * which case the IPMP meta-interface we need will be missing.
8851 8856 * If so, kick the phyint out of the group.
8852 8857 */
8853 8858 if (err == 0 && ill_other != NULL && IS_UNDER_IPMP(ill_other)) {
8854 8859 ipmp_grp_t *grp = ill->ill_phyint->phyint_grp;
8855 8860 ipmp_illgrp_t *illg;
8856 8861
8857 8862 illg = ill->ill_isv6 ? grp->gr_v6 : grp->gr_v4;
8858 8863 if (illg == NULL)
8859 8864 ipmp_phyint_leave_grp(ill->ill_phyint);
8860 8865 else
8861 8866 ipmp_ill_join_illgrp(ill, illg);
8862 8867 }
8863 8868
8864 8869 if (ipsq->ipsq_xop->ipx_current_ioctl == IF_UNITSEL)
8865 8870 ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq);
8866 8871 else
8867 8872 ip_ioctl_finish(q, mp1, err, COPYOUT, ipsq);
8868 8873 break;
8869 8874 }
8870 8875 case SIOCLIFADDIF:
8871 8876 ip_ioctl_finish(q, mp1, err, COPYOUT, ipsq);
8872 8877 break;
8873 8878
8874 8879 default:
8875 8880 ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq);
8876 8881 break;
8877 8882 }
8878 8883 }
8879 8884
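/*
 * Illustrative sketch, not part of this webrev: the MTU clamping order
 * applied in the DL_NOTE_SDU_SIZE case above -- start from what the
 * driver reports, clamp at any administratively set value, then enforce
 * the protocol floor (1280 for IPv6, per RFC 8200). The function and
 * constant below are local stand-ins for the kernel logic.
 */
#include <stdio.h>

#define	IPV6_MIN_MTU	1280

static unsigned
clamp_mtu(unsigned driver_sdu, unsigned user_mtu)
{
	unsigned mtu = driver_sdu;

	if (user_mtu != 0 && user_mtu < mtu)
		mtu = user_mtu;		/* SIOCSLIFLNKINFO-style cap */
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;	/* protocol floor */
	return (mtu);
}

int
main(void)
{
	(void) printf("%u\n", clamp_mtu(9000, 1500));	/* 1500 */
	(void) printf("%u\n", clamp_mtu(1000, 0));	/* 1280 */
	return (0);
}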
8880 8885 /*
8881 8886 * ip_rput_other is called by ip_rput to handle messages modifying the global
8882 8887 * state in IP. If 'ipsq' is non-NULL, caller is writer on it.
8883 8888 */
8884 8889 /* ARGSUSED */
8885 8890 void
8886 8891 ip_rput_other(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
8887 8892 {
8888 8893 ill_t *ill = q->q_ptr;
8889 8894 struct iocblk *iocp;
8890 8895
8891 8896 ip1dbg(("ip_rput_other "));
8892 8897 if (ipsq != NULL) {
8893 8898 ASSERT(IAM_WRITER_IPSQ(ipsq));
8894 8899 ASSERT(ipsq->ipsq_xop ==
8895 8900 ill->ill_phyint->phyint_ipsq->ipsq_xop);
8896 8901 }
8897 8902
8898 8903 switch (mp->b_datap->db_type) {
8899 8904 case M_ERROR:
8900 8905 case M_HANGUP:
8901 8906 /*
8902 8907 * The device has a problem. We force the ILL down. It can
8903 8908 * be brought up again manually using SIOCSIFFLAGS (via
8904 8909 * ifconfig or equivalent).
8905 8910 */
8906 8911 ASSERT(ipsq != NULL);
8907 8912 if (mp->b_rptr < mp->b_wptr)
8908 8913 ill->ill_error = (int)(*mp->b_rptr & 0xFF);
8909 8914 if (ill->ill_error == 0)
8910 8915 ill->ill_error = ENXIO;
8911 8916 if (!ill_down_start(q, mp))
8912 8917 return;
8913 8918 ipif_all_down_tail(ipsq, q, mp, NULL);
8914 8919 break;
8915 8920 case M_IOCNAK: {
8916 8921 iocp = (struct iocblk *)mp->b_rptr;
8917 8922
8918 8923 ASSERT(iocp->ioc_cmd == DL_IOC_HDR_INFO);
8919 8924 /*
8920 8925 * If this was the first attempt, turn off the fastpath
8921 8926 * probing.
8922 8927 */
8923 8928 mutex_enter(&ill->ill_lock);
8924 8929 if (ill->ill_dlpi_fastpath_state == IDS_INPROGRESS) {
8925 8930 ill->ill_dlpi_fastpath_state = IDS_FAILED;
8926 8931 mutex_exit(&ill->ill_lock);
8927 8932 /*
8928 8933 * don't flush the nce_t entries: we use them
8929 8934 * as an index to the ncec itself.
8930 8935 */
8931 8936 ip1dbg(("ip_rput: DLPI fastpath off on interface %s\n",
8932 8937 ill->ill_name));
8933 8938 } else {
8934 8939 mutex_exit(&ill->ill_lock);
8935 8940 }
8936 8941 freemsg(mp);
8937 8942 break;
8938 8943 }
8939 8944 default:
8940 8945 ASSERT(0);
8941 8946 break;
8942 8947 }
8943 8948 }
8944 8949
8945 8950 /*
8946 8951 * Update any source route, record route or timestamp options
8947 8952 * When it fails it has consumed the message and BUMPed the MIB.
8948 8953 */
8949 8954 boolean_t
8950 8955 ip_forward_options(mblk_t *mp, ipha_t *ipha, ill_t *dst_ill,
8951 8956 ip_recv_attr_t *ira)
8952 8957 {
8953 8958 ipoptp_t opts;
8954 8959 uchar_t *opt;
8955 8960 uint8_t optval;
8956 8961 uint8_t optlen;
8957 8962 ipaddr_t dst;
8958 8963 ipaddr_t ifaddr;
8959 8964 uint32_t ts;
8960 8965 timestruc_t now;
8961 8966 ip_stack_t *ipst = ira->ira_ill->ill_ipst;
8962 8967
8963 8968 ip2dbg(("ip_forward_options\n"));
8964 8969 dst = ipha->ipha_dst;
8965 8970 for (optval = ipoptp_first(&opts, ipha);
8966 8971 optval != IPOPT_EOL;
8967 8972 optval = ipoptp_next(&opts)) {
8968 8973 ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
8969 8974 opt = opts.ipoptp_cur;
8970 8975 optlen = opts.ipoptp_len;
8971 8976 ip2dbg(("ip_forward_options: opt %d, len %d\n",
8972 8977 optval, opts.ipoptp_len));
8973 8978 switch (optval) {
8974 8979 uint32_t off;
8975 8980 case IPOPT_SSRR:
8976 8981 case IPOPT_LSRR:
8977 8982 			/* Check if administratively disabled */
8978 8983 if (!ipst->ips_ip_forward_src_routed) {
8979 8984 BUMP_MIB(dst_ill->ill_ip_mib,
8980 8985 ipIfStatsForwProhibits);
8981 8986 ip_drop_input("ICMP_SOURCE_ROUTE_FAILED",
8982 8987 mp, dst_ill);
8983 8988 icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED,
8984 8989 ira);
8985 8990 return (B_FALSE);
8986 8991 }
8987 8992 if (ip_type_v4(dst, ipst) != IRE_LOCAL) {
8988 8993 /*
8989 8994 * Must be partial since ip_input_options
8990 8995 * checked for strict.
8991 8996 */
8992 8997 break;
8993 8998 }
8994 8999 off = opt[IPOPT_OFFSET];
8995 9000 off--;
8996 9001 redo_srr:
8997 9002 if (optlen < IP_ADDR_LEN ||
8998 9003 off > optlen - IP_ADDR_LEN) {
8999 9004 /* End of source route */
9000 9005 ip1dbg((
9001 9006 "ip_forward_options: end of SR\n"));
9002 9007 break;
9003 9008 }
9004 9009 /* Pick a reasonable address on the outbound if */
9005 9010 ASSERT(dst_ill != NULL);
9006 9011 if (ip_select_source_v4(dst_ill, INADDR_ANY, dst,
9007 9012 INADDR_ANY, ALL_ZONES, ipst, &ifaddr, NULL,
9008 9013 NULL) != 0) {
9009 9014 /* No source! Shouldn't happen */
9010 9015 ifaddr = INADDR_ANY;
9011 9016 }
9012 9017 bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
9013 9018 bcopy(&ifaddr, (char *)opt + off, IP_ADDR_LEN);
9014 9019 ip1dbg(("ip_forward_options: next hop 0x%x\n",
9015 9020 ntohl(dst)));
9016 9021
9017 9022 /*
9018 9023 * Check if our address is present more than
9019 9024 * once as consecutive hops in source route.
9020 9025 */
9021 9026 if (ip_type_v4(dst, ipst) == IRE_LOCAL) {
9022 9027 off += IP_ADDR_LEN;
9023 9028 opt[IPOPT_OFFSET] += IP_ADDR_LEN;
9024 9029 goto redo_srr;
9025 9030 }
9026 9031 ipha->ipha_dst = dst;
9027 9032 opt[IPOPT_OFFSET] += IP_ADDR_LEN;
9028 9033 break;
9029 9034 case IPOPT_RR:
9030 9035 off = opt[IPOPT_OFFSET];
9031 9036 off--;
9032 9037 if (optlen < IP_ADDR_LEN ||
9033 9038 off > optlen - IP_ADDR_LEN) {
9034 9039 /* No more room - ignore */
9035 9040 ip1dbg((
9036 9041 "ip_forward_options: end of RR\n"));
9037 9042 break;
9038 9043 }
9039 9044 /* Pick a reasonable address on the outbound if */
9040 9045 ASSERT(dst_ill != NULL);
9041 9046 if (ip_select_source_v4(dst_ill, INADDR_ANY, dst,
9042 9047 INADDR_ANY, ALL_ZONES, ipst, &ifaddr, NULL,
9043 9048 NULL) != 0) {
9044 9049 /* No source! Shouldn't happen */
9045 9050 ifaddr = INADDR_ANY;
9046 9051 }
9047 9052 bcopy(&ifaddr, (char *)opt + off, IP_ADDR_LEN);
9048 9053 opt[IPOPT_OFFSET] += IP_ADDR_LEN;
9049 9054 break;
9050 9055 case IPOPT_TS:
9051 9056 /* Insert timestamp if there is room */
9052 9057 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
9053 9058 case IPOPT_TS_TSONLY:
9054 9059 off = IPOPT_TS_TIMELEN;
9055 9060 break;
9056 9061 case IPOPT_TS_PRESPEC:
9057 9062 case IPOPT_TS_PRESPEC_RFC791:
9058 9063 /* Verify that the address matched */
9059 9064 off = opt[IPOPT_OFFSET] - 1;
9060 9065 bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
9061 9066 if (ip_type_v4(dst, ipst) != IRE_LOCAL) {
9062 9067 /* Not for us */
9063 9068 break;
9064 9069 }
9065 9070 /* FALLTHRU */
9066 9071 case IPOPT_TS_TSANDADDR:
9067 9072 off = IP_ADDR_LEN + IPOPT_TS_TIMELEN;
9068 9073 break;
9069 9074 default:
9070 9075 /*
9071 9076 * ip_*put_options should have already
9072 9077 * dropped this packet.
9073 9078 */
9074 9079 cmn_err(CE_PANIC, "ip_forward_options: "
9075 9080 "unknown IT - bug in ip_input_options?\n");
9076 9081 return (B_TRUE); /* Keep "lint" happy */
9077 9082 }
9078 9083 if (opt[IPOPT_OFFSET] - 1 + off > optlen) {
9079 9084 /* Increase overflow counter */
9080 9085 off = (opt[IPOPT_POS_OV_FLG] >> 4) + 1;
9081 9086 opt[IPOPT_POS_OV_FLG] =
9082 9087 (uint8_t)((opt[IPOPT_POS_OV_FLG] & 0x0F) |
9083 9088 (off << 4));
9084 9089 break;
9085 9090 }
9086 9091 off = opt[IPOPT_OFFSET] - 1;
9087 9092 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
9088 9093 case IPOPT_TS_PRESPEC:
9089 9094 case IPOPT_TS_PRESPEC_RFC791:
9090 9095 case IPOPT_TS_TSANDADDR:
9091 9096 /* Pick a reasonable addr on the outbound if */
9092 9097 ASSERT(dst_ill != NULL);
9093 9098 if (ip_select_source_v4(dst_ill, INADDR_ANY,
9094 9099 dst, INADDR_ANY, ALL_ZONES, ipst, &ifaddr,
9095 9100 NULL, NULL) != 0) {
9096 9101 /* No source! Shouldn't happen */
9097 9102 ifaddr = INADDR_ANY;
9098 9103 }
9099 9104 bcopy(&ifaddr, (char *)opt + off, IP_ADDR_LEN);
9100 9105 opt[IPOPT_OFFSET] += IP_ADDR_LEN;
9101 9106 /* FALLTHRU */
9102 9107 case IPOPT_TS_TSONLY:
9103 9108 off = opt[IPOPT_OFFSET] - 1;
9104 9109 /* Compute # of milliseconds since midnight */
9105 9110 gethrestime(&now);
9106 9111 ts = (now.tv_sec % (24 * 60 * 60)) * 1000 +
9107 9112 now.tv_nsec / (NANOSEC / MILLISEC);
9108 9113 bcopy(&ts, (char *)opt + off, IPOPT_TS_TIMELEN);
9109 9114 opt[IPOPT_OFFSET] += IPOPT_TS_TIMELEN;
9110 9115 break;
9111 9116 }
9112 9117 break;
9113 9118 }
9114 9119 }
9115 9120 return (B_TRUE);
9116 9121 }
9117 9122
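The IPOPT_TS_TSONLY arm above stores the number of milliseconds since midnight UT, the encoding RFC 791 specifies for the timestamp option. A minimal userland sketch of the same arithmetic, assuming POSIX clock_gettime() as a stand-in for the kernel's gethrestime():

#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec now;
	unsigned long ts;

	/* userland stand-in for the kernel's gethrestime() */
	(void) clock_gettime(CLOCK_REALTIME, &now);

	/* seconds into the UT day scaled to ms, plus the ms fraction */
	ts = (now.tv_sec % (24 * 60 * 60)) * 1000 +
	    now.tv_nsec / (1000000000L / 1000L);

	printf("ms since midnight UT: %lu\n", ts);
	return (0);
}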
9118 9123 /*
9119 9124 * Call ill_frag_timeout to do garbage collection. ill_frag_timeout
9120 9125 * returns 'true' if there are still fragments left on the queue, in
9121 9126 * which case we restart the timer.
9122 9127 */
9123 9128 void
9124 9129 ill_frag_timer(void *arg)
9125 9130 {
9126 9131 ill_t *ill = (ill_t *)arg;
9127 9132 boolean_t frag_pending;
9128 9133 ip_stack_t *ipst = ill->ill_ipst;
9129 9134 time_t timeout;
9130 9135
9131 9136 mutex_enter(&ill->ill_lock);
9132 9137 ASSERT(!ill->ill_fragtimer_executing);
9133 9138 if (ill->ill_state_flags & ILL_CONDEMNED) {
9134 9139 ill->ill_frag_timer_id = 0;
9135 9140 mutex_exit(&ill->ill_lock);
9136 9141 return;
9137 9142 }
9138 9143 ill->ill_fragtimer_executing = 1;
9139 9144 mutex_exit(&ill->ill_lock);
9140 9145
9141 9146 timeout = (ill->ill_isv6 ? ipst->ips_ipv6_reassembly_timeout :
9142 9147 ipst->ips_ip_reassembly_timeout);
9143 9148
9144 9149 frag_pending = ill_frag_timeout(ill, timeout);
9145 9150
9146 9151 /*
9147 9152 * Restart the timer, if we have fragments pending or if someone
9148 9153 * wanted us to be scheduled again.
9149 9154 */
9150 9155 mutex_enter(&ill->ill_lock);
9151 9156 ill->ill_fragtimer_executing = 0;
9152 9157 ill->ill_frag_timer_id = 0;
9153 9158 if (frag_pending || ill->ill_fragtimer_needrestart)
9154 9159 ill_frag_timer_start(ill);
9155 9160 mutex_exit(&ill->ill_lock);
9156 9161 }
9157 9162
9158 9163 void
9159 9164 ill_frag_timer_start(ill_t *ill)
9160 9165 {
9161 9166 ip_stack_t *ipst = ill->ill_ipst;
9162 9167 clock_t timeo_ms;
9163 9168
9164 9169 ASSERT(MUTEX_HELD(&ill->ill_lock));
9165 9170
9166 9171 /* If the ill is closing or opening don't proceed */
9167 9172 if (ill->ill_state_flags & ILL_CONDEMNED)
9168 9173 return;
9169 9174
9170 9175 if (ill->ill_fragtimer_executing) {
9171 9176 /*
9172 9177 * ill_frag_timer is currently executing. Just record
9173 9178 * the fact that we want the timer to be restarted.
9174 9179 * ill_frag_timer will post a timeout before it returns,
9175 9180 * ensuring it will be called again.
9176 9181 */
9177 9182 ill->ill_fragtimer_needrestart = 1;
9178 9183 return;
9179 9184 }
9180 9185
9181 9186 if (ill->ill_frag_timer_id == 0) {
9182 9187 timeo_ms = (ill->ill_isv6 ? ipst->ips_ipv6_reassembly_timeout :
9183 9188 ipst->ips_ip_reassembly_timeout) * SECONDS;
9184 9189
9185 9190 /*
9186 9191 * The timer is neither running nor is the timeout handler
9187 9192 * executing. Post a timeout so that ill_frag_timer will be
9188 9193 * called
9189 9194 */
9190 9195 ill->ill_frag_timer_id = timeout(ill_frag_timer, ill,
9191 9196 MSEC_TO_TICK(timeo_ms >> 1));
9192 9197 ill->ill_fragtimer_needrestart = 0;
9193 9198 }
9194 9199 }
9195 9200
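The ill_fragtimer_executing / ill_fragtimer_needrestart pair above implements a common handshake for re-arming a one-shot timer whose handler may already be running: the starter only records a restart request, and the handler re-arms under the lock before returning. A minimal pthread sketch of the same protocol (userland stand-ins for timeout() and ill_frag_timeout(); the names here are illustrative, not illumos APIs):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static bool timer_armed;	/* analogue of ill_frag_timer_id != 0 */
static bool executing;		/* analogue of ill_fragtimer_executing */
static bool needrestart;	/* analogue of ill_fragtimer_needrestart */

static void timer_start(void);	/* caller must hold lock */

/* The timeout handler: drops the lock while doing the real work. */
static void *
timer_handler(void *arg)
{
	bool still_pending;

	(void) arg;
	pthread_mutex_lock(&lock);
	executing = true;
	pthread_mutex_unlock(&lock);

	still_pending = false;		/* stand-in for ill_frag_timeout() */

	pthread_mutex_lock(&lock);
	executing = false;
	timer_armed = false;
	if (still_pending || needrestart)
		timer_start();		/* re-arm while holding the lock */
	pthread_mutex_unlock(&lock);
	return (NULL);
}

static void
timer_start(void)
{
	pthread_t t;

	if (executing) {
		/* handler will re-arm before returning; just note it */
		needrestart = true;
		return;
	}
	if (!timer_armed) {
		timer_armed = true;
		needrestart = false;
		(void) pthread_create(&t, NULL, timer_handler, NULL);
		(void) pthread_detach(t);
	}
}

int main(void)
{
	struct timespec ts = { 0, 50 * 1000 * 1000 };

	pthread_mutex_lock(&lock);
	timer_start();
	pthread_mutex_unlock(&lock);
	(void) nanosleep(&ts, NULL);	/* let the detached handler run */
	printf("armed=%d executing=%d\n", timer_armed, executing);
	return (0);
}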
9196 9201 /*
9197 9202 * Update any source route, record route or timestamp options.
9198 9203 * Check that we are at end of strict source route.
9199 9204 * The options have already been checked for sanity in ip_input_options().
9200 9205 */
9201 9206 boolean_t
9202 9207 ip_input_local_options(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
9203 9208 {
9204 9209 ipoptp_t opts;
9205 9210 uchar_t *opt;
9206 9211 uint8_t optval;
9207 9212 uint8_t optlen;
9208 9213 ipaddr_t dst;
9209 9214 ipaddr_t ifaddr;
9210 9215 uint32_t ts;
9211 9216 timestruc_t now;
9212 9217 ill_t *ill = ira->ira_ill;
9213 9218 ip_stack_t *ipst = ill->ill_ipst;
9214 9219
9215 9220 ip2dbg(("ip_input_local_options\n"));
9216 9221
9217 9222 for (optval = ipoptp_first(&opts, ipha);
9218 9223 optval != IPOPT_EOL;
9219 9224 optval = ipoptp_next(&opts)) {
9220 9225 ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
9221 9226 opt = opts.ipoptp_cur;
9222 9227 optlen = opts.ipoptp_len;
9223 9228 ip2dbg(("ip_input_local_options: opt %d, len %d\n",
9224 9229 optval, optlen));
9225 9230 switch (optval) {
9226 9231 uint32_t off;
9227 9232 case IPOPT_SSRR:
9228 9233 case IPOPT_LSRR:
9229 9234 off = opt[IPOPT_OFFSET];
9230 9235 off--;
9231 9236 if (optlen < IP_ADDR_LEN ||
9232 9237 off > optlen - IP_ADDR_LEN) {
9233 9238 /* End of source route */
9234 9239 ip1dbg(("ip_input_local_options: end of SR\n"));
9235 9240 break;
9236 9241 }
9237 9242 /*
9238 9243 * This will only happen if two consecutive entries
9239 9244 * in the source route contain our address or if
9240 9245 * it is a packet with a loose source route which
9241 9246 * reaches us before consuming the whole source route
9242 9247 */
9243 9248 ip1dbg(("ip_input_local_options: not end of SR\n"));
9244 9249 if (optval == IPOPT_SSRR) {
9245 9250 goto bad_src_route;
9246 9251 }
9247 9252 /*
9248 9253 * Hack: instead of dropping the packet truncate the
9249 9254 * source route to what has been used by filling the
9250 9255 * rest with IPOPT_NOP.
9251 9256 */
9252 9257 opt[IPOPT_OLEN] = (uint8_t)off;
9253 9258 while (off < optlen) {
9254 9259 opt[off++] = IPOPT_NOP;
9255 9260 }
9256 9261 break;
9257 9262 case IPOPT_RR:
9258 9263 off = opt[IPOPT_OFFSET];
9259 9264 off--;
9260 9265 if (optlen < IP_ADDR_LEN ||
9261 9266 off > optlen - IP_ADDR_LEN) {
9262 9267 /* No more room - ignore */
9263 9268 ip1dbg((
9264 9269 "ip_input_local_options: end of RR\n"));
9265 9270 break;
9266 9271 }
9267 9272 /* Pick a reasonable address on the outbound if */
9268 9273 if (ip_select_source_v4(ill, INADDR_ANY, ipha->ipha_dst,
9269 9274 INADDR_ANY, ALL_ZONES, ipst, &ifaddr, NULL,
9270 9275 NULL) != 0) {
9271 9276 /* No source! Shouldn't happen */
9272 9277 ifaddr = INADDR_ANY;
9273 9278 }
9274 9279 bcopy(&ifaddr, (char *)opt + off, IP_ADDR_LEN);
9275 9280 opt[IPOPT_OFFSET] += IP_ADDR_LEN;
9276 9281 break;
9277 9282 case IPOPT_TS:
9278 9283 /* Insert timestamp if there is room */
9279 9284 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
9280 9285 case IPOPT_TS_TSONLY:
9281 9286 off = IPOPT_TS_TIMELEN;
9282 9287 break;
9283 9288 case IPOPT_TS_PRESPEC:
9284 9289 case IPOPT_TS_PRESPEC_RFC791:
9285 9290 /* Verify that the address matched */
9286 9291 off = opt[IPOPT_OFFSET] - 1;
9287 9292 bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
9288 9293 if (ip_type_v4(dst, ipst) != IRE_LOCAL) {
9289 9294 /* Not for us */
9290 9295 break;
9291 9296 }
9292 9297 /* FALLTHRU */
9293 9298 case IPOPT_TS_TSANDADDR:
9294 9299 off = IP_ADDR_LEN + IPOPT_TS_TIMELEN;
9295 9300 break;
9296 9301 default:
9297 9302 /*
9298 9303 * ip_*put_options should have already
9299 9304 * dropped this packet.
9300 9305 */
9301 9306 cmn_err(CE_PANIC, "ip_input_local_options: "
9302 9307 "unknown IT - bug in ip_input_options?\n");
9303 9308 return (B_TRUE); /* Keep "lint" happy */
9304 9309 }
9305 9310 if (opt[IPOPT_OFFSET] - 1 + off > optlen) {
9306 9311 /* Increase overflow counter */
9307 9312 off = (opt[IPOPT_POS_OV_FLG] >> 4) + 1;
9308 9313 opt[IPOPT_POS_OV_FLG] =
9309 9314 (uint8_t)((opt[IPOPT_POS_OV_FLG] & 0x0F) |
9310 9315 (off << 4));
9311 9316 break;
9312 9317 }
9313 9318 off = opt[IPOPT_OFFSET] - 1;
9314 9319 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
9315 9320 case IPOPT_TS_PRESPEC:
9316 9321 case IPOPT_TS_PRESPEC_RFC791:
9317 9322 case IPOPT_TS_TSANDADDR:
9318 9323 /* Pick a reasonable addr on the outbound if */
9319 9324 if (ip_select_source_v4(ill, INADDR_ANY,
9320 9325 ipha->ipha_dst, INADDR_ANY, ALL_ZONES, ipst,
9321 9326 &ifaddr, NULL, NULL) != 0) {
9322 9327 /* No source! Shouldn't happen */
9323 9328 ifaddr = INADDR_ANY;
9324 9329 }
9325 9330 bcopy(&ifaddr, (char *)opt + off, IP_ADDR_LEN);
9326 9331 opt[IPOPT_OFFSET] += IP_ADDR_LEN;
9327 9332 /* FALLTHRU */
9328 9333 case IPOPT_TS_TSONLY:
9329 9334 off = opt[IPOPT_OFFSET] - 1;
9330 9335 /* Compute # of milliseconds since midnight */
9331 9336 gethrestime(&now);
9332 9337 ts = (now.tv_sec % (24 * 60 * 60)) * 1000 +
9333 9338 now.tv_nsec / (NANOSEC / MILLISEC);
9334 9339 bcopy(&ts, (char *)opt + off, IPOPT_TS_TIMELEN);
9335 9340 opt[IPOPT_OFFSET] += IPOPT_TS_TIMELEN;
9336 9341 break;
9337 9342 }
9338 9343 break;
9339 9344 }
9340 9345 }
9341 9346 return (B_TRUE);
9342 9347
9343 9348 bad_src_route:
9344 9349 /* make sure we clear any indication of a hardware checksum */
9345 9350 DB_CKSUMFLAGS(mp) = 0;
9346 9351 ip_drop_input("ICMP_SOURCE_ROUTE_FAILED", mp, ill);
9347 9352 icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, ira);
9348 9353 return (B_FALSE);
9349 9354
9350 9355 }
9351 9356
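The NOP-fill hack above shortens a partially consumed loose source route in place rather than dropping the packet. A standalone sketch of the byte manipulation, using a hypothetical 11-byte LSRR buffer:

#include <stdio.h>

#define	IPOPT_NOP	0x01	/* RFC 791 no-operation option */
#define	IPOPT_OLEN	1	/* offset of the length byte */

int main(void)
{
	/* hypothetical LSRR: type 0x83, len 11, ptr 8, two 4-byte hops */
	unsigned char opt[11] = {
		0x83, 11, 8, 192, 168, 1, 1, 10, 0, 0, 1
	};
	unsigned int off = 7;	/* bytes of the option actually consumed */
	unsigned int i;

	opt[IPOPT_OLEN] = (unsigned char)off;	/* shrink the option... */
	while (off < sizeof (opt))
		opt[off++] = IPOPT_NOP;		/* ...and NOP-fill the rest */

	for (i = 0; i < sizeof (opt); i++)
		printf("%02x ", opt[i]);
	printf("\n");
	return (0);
}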
9352 9357 /*
9353 9358 * Process IP options in an inbound packet. Always returns the nexthop.
9354 9359 * Normally this is the passed in nexthop, but if there is an option
9355 9360 * that affects the nexthop (such as a source route) that will be returned.
9356 9361 * Sets *errorp if there is an error, in which case an ICMP error has been sent
9357 9362 * and mp freed.
9358 9363 */
9359 9364 ipaddr_t
9360 9365 ip_input_options(ipha_t *ipha, ipaddr_t dst, mblk_t *mp,
9361 9366 ip_recv_attr_t *ira, int *errorp)
9362 9367 {
9363 9368 ip_stack_t *ipst = ira->ira_ill->ill_ipst;
9364 9369 ipoptp_t opts;
9365 9370 uchar_t *opt;
9366 9371 uint8_t optval;
9367 9372 uint8_t optlen;
9368 9373 intptr_t code = 0;
9369 9374 ire_t *ire;
9370 9375
9371 9376 ip2dbg(("ip_input_options\n"));
9372 9377 *errorp = 0;
9373 9378 for (optval = ipoptp_first(&opts, ipha);
9374 9379 optval != IPOPT_EOL;
9375 9380 optval = ipoptp_next(&opts)) {
9376 9381 opt = opts.ipoptp_cur;
9377 9382 optlen = opts.ipoptp_len;
9378 9383 ip2dbg(("ip_input_options: opt %d, len %d\n",
9379 9384 optval, optlen));
9380 9385 /*
9381 9386 * Note: we need to verify the checksum before we
9382 9387 * modify anything; thus this routine only extracts the next
9383 9388 * hop dst from any source route.
9384 9389 */
9385 9390 switch (optval) {
9386 9391 uint32_t off;
9387 9392 case IPOPT_SSRR:
9388 9393 case IPOPT_LSRR:
9389 9394 if (ip_type_v4(dst, ipst) != IRE_LOCAL) {
9390 9395 if (optval == IPOPT_SSRR) {
9391 9396 ip1dbg(("ip_input_options: not next"
9392 9397 " strict source route 0x%x\n",
9393 9398 ntohl(dst)));
9394 9399 code = (char *)&ipha->ipha_dst -
9395 9400 (char *)ipha;
9396 9401 goto param_prob; /* RouterReq's */
9397 9402 }
9398 9403 ip2dbg(("ip_input_options: "
9399 9404 "not next source route 0x%x\n",
9400 9405 ntohl(dst)));
9401 9406 break;
9402 9407 }
9403 9408
9404 9409 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
9405 9410 ip1dbg((
9406 9411 "ip_input_options: bad option offset\n"));
9407 9412 code = (char *)&opt[IPOPT_OLEN] -
9408 9413 (char *)ipha;
9409 9414 goto param_prob;
9410 9415 }
9411 9416 off = opt[IPOPT_OFFSET];
9412 9417 off--;
9413 9418 redo_srr:
9414 9419 if (optlen < IP_ADDR_LEN ||
9415 9420 off > optlen - IP_ADDR_LEN) {
9416 9421 /* End of source route */
9417 9422 ip1dbg(("ip_input_options: end of SR\n"));
9418 9423 break;
9419 9424 }
9420 9425 bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
9421 9426 ip1dbg(("ip_input_options: next hop 0x%x\n",
9422 9427 ntohl(dst)));
9423 9428
9424 9429 /*
9425 9430 * Check if our address is present more than
9426 9431 * once as consecutive hops in source route.
9427 9432 * XXX verify per-interface ip_forwarding
9428 9433 * for source route?
9429 9434 */
9430 9435 if (ip_type_v4(dst, ipst) == IRE_LOCAL) {
9431 9436 off += IP_ADDR_LEN;
9432 9437 goto redo_srr;
9433 9438 }
9434 9439
9435 9440 if (dst == htonl(INADDR_LOOPBACK)) {
9436 9441 ip1dbg(("ip_input_options: loopback addr in "
9437 9442 "source route!\n"));
9438 9443 goto bad_src_route;
9439 9444 }
9440 9445 /*
9441 9446 * For strict: verify that dst is directly
9442 9447 * reachable.
9443 9448 */
9444 9449 if (optval == IPOPT_SSRR) {
9445 9450 ire = ire_ftable_lookup_v4(dst, 0, 0,
9446 9451 IRE_INTERFACE, NULL, ALL_ZONES,
9447 9452 ira->ira_tsl,
9448 9453 MATCH_IRE_TYPE | MATCH_IRE_SECATTR, 0, ipst,
9449 9454 NULL);
9450 9455 if (ire == NULL) {
9451 9456 ip1dbg(("ip_input_options: SSRR not "
9452 9457 "directly reachable: 0x%x\n",
9453 9458 ntohl(dst)));
9454 9459 goto bad_src_route;
9455 9460 }
9456 9461 ire_refrele(ire);
9457 9462 }
9458 9463 /*
9459 9464 * Defer update of the offset and the record route
9460 9465 * until the packet is forwarded.
9461 9466 */
9462 9467 break;
9463 9468 case IPOPT_RR:
9464 9469 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
9465 9470 ip1dbg((
9466 9471 "ip_input_options: bad option offset\n"));
9467 9472 code = (char *)&opt[IPOPT_OLEN] -
9468 9473 (char *)ipha;
9469 9474 goto param_prob;
9470 9475 }
9471 9476 break;
9472 9477 case IPOPT_TS:
9473 9478 /*
9474 9479 * Verify that length >= 5 and that there is either
9475 9480 * room for another timestamp or that the overflow
9476 9481 * counter is not maxed out.
9477 9482 */
9478 9483 code = (char *)&opt[IPOPT_OLEN] - (char *)ipha;
9479 9484 if (optlen < IPOPT_MINLEN_IT) {
9480 9485 goto param_prob;
9481 9486 }
9482 9487 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
9483 9488 ip1dbg((
9484 9489 "ip_input_options: bad option offset\n"));
9485 9490 code = (char *)&opt[IPOPT_OFFSET] -
9486 9491 (char *)ipha;
9487 9492 goto param_prob;
9488 9493 }
9489 9494 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
9490 9495 case IPOPT_TS_TSONLY:
9491 9496 off = IPOPT_TS_TIMELEN;
9492 9497 break;
9493 9498 case IPOPT_TS_TSANDADDR:
9494 9499 case IPOPT_TS_PRESPEC:
9495 9500 case IPOPT_TS_PRESPEC_RFC791:
9496 9501 off = IP_ADDR_LEN + IPOPT_TS_TIMELEN;
9497 9502 break;
9498 9503 default:
9499 9504 code = (char *)&opt[IPOPT_POS_OV_FLG] -
9500 9505 (char *)ipha;
9501 9506 goto param_prob;
9502 9507 }
9503 9508 if (opt[IPOPT_OFFSET] - 1 + off > optlen &&
9504 9509 (opt[IPOPT_POS_OV_FLG] & 0xF0) == 0xF0) {
9505 9510 /*
9506 9511 * No room and the overflow counter is 15
9507 9512 * already.
9508 9513 */
9509 9514 goto param_prob;
9510 9515 }
9511 9516 break;
9512 9517 }
9513 9518 }
9514 9519
9515 9520 if ((opts.ipoptp_flags & IPOPTP_ERROR) == 0) {
9516 9521 return (dst);
9517 9522 }
9518 9523
9519 9524 ip1dbg(("ip_input_options: error processing IP options."));
9520 9525 code = (char *)&opt[IPOPT_OFFSET] - (char *)ipha;
9521 9526
9522 9527 param_prob:
9523 9528 /* make sure we clear any indication of a hardware checksum */
9524 9529 DB_CKSUMFLAGS(mp) = 0;
9525 9530 ip_drop_input("ICMP_PARAM_PROBLEM", mp, ira->ira_ill);
9526 9531 icmp_param_problem(mp, (uint8_t)code, ira);
9527 9532 *errorp = -1;
9528 9533 return (dst);
9529 9534
9530 9535 bad_src_route:
9531 9536 /* make sure we clear any indication of a hardware checksum */
9532 9537 DB_CKSUMFLAGS(mp) = 0;
9533 9538 ip_drop_input("ICMP_SOURCE_ROUTE_FAILED", mp, ira->ira_ill);
9534 9539 icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, ira);
9535 9540 *errorp = -1;
9536 9541 return (dst);
9537 9542 }
9538 9543
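When a timestamp no longer fits, the forwarding and local-delivery paths above increment the 4-bit overflow counter in the upper nibble of the pointer/overflow byte, and ip_input_options() rejects the packet once that counter is already 15. A small sketch of the nibble arithmetic, with a made-up starting value:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint8_t pos_ov_flg = 0x31;	/* overflow = 3, flag = 1 */
	uint8_t ov;

	if ((pos_ov_flg & 0xF0) == 0xF0) {
		/* counter maxed at 15: the kernel sends a param problem */
		puts("overflow counter full");
		return (1);
	}
	/* no room for another stamp: bump the counter, keep the flag */
	ov = (uint8_t)((pos_ov_flg >> 4) + 1);
	pos_ov_flg = (uint8_t)((pos_ov_flg & 0x0F) | (ov << 4));
	printf("new pointer/overflow byte: 0x%02x\n", pos_ov_flg);
	return (0);
}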
9539 9544 /*
9540 9545 * IP & ICMP info in >=14 msgs ...
9541 9546 * - ip fixed part (mib2_ip_t)
9542 9547 * - icmp fixed part (mib2_icmp_t)
9543 9548 * - ipAddrEntryTable (ip 20) all IPv4 ipifs
9544 9549 * - ipRouteEntryTable (ip 21) all IPv4 IREs
9545 9550 * - ipNetToMediaEntryTable (ip 22) all IPv4 Neighbor Cache entries
9546 9551 * - ipRouteAttributeTable (ip 102) labeled routes
9547 9552 * - ip multicast membership (ip_member_t)
9548 9553 * - ip multicast source filtering (ip_grpsrc_t)
9549 9554 * - igmp fixed part (struct igmpstat)
9550 9555 * - multicast routing stats (struct mrtstat)
9551 9556 * - multicast routing vifs (array of struct vifctl)
9552 9557 * - multicast routing routes (array of struct mfcctl)
9553 9558 * - ip6 fixed part (mib2_ipv6IfStatsEntry_t)
9554 9559 * One per ill plus one generic
9555 9560 * - icmp6 fixed part (mib2_ipv6IfIcmpEntry_t)
9556 9561 * One per ill plus one generic
9557 9562 * - ipv6RouteEntry all IPv6 IREs
9558 9563 * - ipv6RouteAttributeTable (ip6 102) labeled routes
9559 9564 * - ipv6NetToMediaEntry all IPv6 Neighbor Cache entries
9560 9565 * - ipv6AddrEntry all IPv6 ipifs
9561 9566 * - ipv6 multicast membership (ipv6_member_t)
9562 9567 * - ipv6 multicast source filtering (ipv6_grpsrc_t)
9563 9568 *
9564 9569 * NOTE: original mpctl is copied for msgs 2..N, since its ctl part is
9565 9570 * already filled in by the caller.
9566 9571 * If legacy_req is true then MIB structures need to be truncated to their
9567 9572 * legacy sizes before being returned.
9568 9573 * Return value of 0 indicates that no messages were sent and caller
9569 9574 * should free mpctl.
9570 9575 */
9571 9576 int
9572 9577 ip_snmp_get(queue_t *q, mblk_t *mpctl, int level, boolean_t legacy_req)
9573 9578 {
9574 9579 ip_stack_t *ipst;
9575 9580 sctp_stack_t *sctps;
9576 9581
9577 9582 if (q->q_next != NULL) {
9578 9583 ipst = ILLQ_TO_IPST(q);
9579 9584 } else {
9580 9585 ipst = CONNQ_TO_IPST(q);
9581 9586 }
9582 9587 ASSERT(ipst != NULL);
9583 9588 sctps = ipst->ips_netstack->netstack_sctp;
9584 9589
9585 9590 if (mpctl == NULL || mpctl->b_cont == NULL) {
9586 9591 return (0);
9587 9592 }
9588 9593
9589 9594 /*
9590 9595 * For the purposes of the (broken) packet shell's use
9591 9596 * of the level, we make sure MIB2_TCP/MIB2_UDP can be used
9592 9597 * to make TCP and UDP appear first in the list of mib items.
9593 9598 * TBD: We could expand this and use it in netstat so that
9594 9599 * the kernel doesn't have to produce large tables (connections,
9595 9600 * routes, etc) when netstat only wants the statistics or a particular
9596 9601 * table.
9597 9602 */
9598 9603 if (!(level == MIB2_TCP || level == MIB2_UDP)) {
9599 9604 if ((mpctl = icmp_snmp_get(q, mpctl)) == NULL) {
9600 9605 return (1);
9601 9606 }
9602 9607 }
9603 9608
9604 9609 if (level != MIB2_TCP) {
9605 9610 if ((mpctl = udp_snmp_get(q, mpctl, legacy_req)) == NULL) {
9606 9611 return (1);
9607 9612 }
9608 9613 }
9609 9614
9610 9615 if (level != MIB2_UDP) {
9611 9616 if ((mpctl = tcp_snmp_get(q, mpctl, legacy_req)) == NULL) {
9612 9617 return (1);
9613 9618 }
9614 9619 }
9615 9620
9616 9621 if ((mpctl = ip_snmp_get_mib2_ip_traffic_stats(q, mpctl,
9617 9622 ipst, legacy_req)) == NULL) {
9618 9623 return (1);
9619 9624 }
9620 9625
9621 9626 if ((mpctl = ip_snmp_get_mib2_ip6(q, mpctl, ipst,
9622 9627 legacy_req)) == NULL) {
9623 9628 return (1);
9624 9629 }
9625 9630
9626 9631 if ((mpctl = ip_snmp_get_mib2_icmp(q, mpctl, ipst)) == NULL) {
9627 9632 return (1);
9628 9633 }
9629 9634
9630 9635 if ((mpctl = ip_snmp_get_mib2_icmp6(q, mpctl, ipst)) == NULL) {
9631 9636 return (1);
9632 9637 }
9633 9638
9634 9639 if ((mpctl = ip_snmp_get_mib2_igmp(q, mpctl, ipst)) == NULL) {
9635 9640 return (1);
9636 9641 }
9637 9642
9638 9643 if ((mpctl = ip_snmp_get_mib2_multi(q, mpctl, ipst)) == NULL) {
9639 9644 return (1);
9640 9645 }
9641 9646
9642 9647 if ((mpctl = ip_snmp_get_mib2_ip_addr(q, mpctl, ipst,
9643 9648 legacy_req)) == NULL) {
9644 9649 return (1);
9645 9650 }
9646 9651
9647 9652 if ((mpctl = ip_snmp_get_mib2_ip6_addr(q, mpctl, ipst,
9648 9653 legacy_req)) == NULL) {
9649 9654 return (1);
9650 9655 }
9651 9656
9652 9657 if ((mpctl = ip_snmp_get_mib2_ip_group_mem(q, mpctl, ipst)) == NULL) {
9653 9658 return (1);
9654 9659 }
9655 9660
9656 9661 if ((mpctl = ip_snmp_get_mib2_ip6_group_mem(q, mpctl, ipst)) == NULL) {
9657 9662 return (1);
9658 9663 }
9659 9664
9660 9665 if ((mpctl = ip_snmp_get_mib2_ip_group_src(q, mpctl, ipst)) == NULL) {
9661 9666 return (1);
9662 9667 }
9663 9668
9664 9669 if ((mpctl = ip_snmp_get_mib2_ip6_group_src(q, mpctl, ipst)) == NULL) {
9665 9670 return (1);
9666 9671 }
9667 9672
9668 9673 if ((mpctl = ip_snmp_get_mib2_virt_multi(q, mpctl, ipst)) == NULL) {
9669 9674 return (1);
9670 9675 }
9671 9676
9672 9677 if ((mpctl = ip_snmp_get_mib2_multi_rtable(q, mpctl, ipst)) == NULL) {
9673 9678 return (1);
9674 9679 }
9675 9680
9676 9681 mpctl = ip_snmp_get_mib2_ip_route_media(q, mpctl, level, ipst);
9677 9682 if (mpctl == NULL)
9678 9683 return (1);
9679 9684
9680 9685 mpctl = ip_snmp_get_mib2_ip6_route_media(q, mpctl, level, ipst);
9681 9686 if (mpctl == NULL)
9682 9687 return (1);
9683 9688
9684 9689 if ((mpctl = sctp_snmp_get_mib2(q, mpctl, sctps)) == NULL) {
9685 9690 return (1);
9686 9691 }
9692 +
9687 9693 if ((mpctl = ip_snmp_get_mib2_ip_dce(q, mpctl, ipst)) == NULL) {
9688 9694 return (1);
9689 9695 }
9696 +
9697 + if ((mpctl = dccp_snmp_get(q, mpctl, legacy_req)) == NULL) {
9698 + return (1);
9699 + }
9700 +
9690 9701 freemsg(mpctl);
9691 9702 return (1);
9692 9703 }
9693 9704
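ip_snmp_get() chains its table producers: each ip_snmp_get_mib2_*() helper qreply()s the message it was handed and returns a copymsg() copy for the next helper, with NULL meaning allocation failure, and the newly added dccp_snmp_get() slots into the same convention. A userland sketch of that ownership discipline, assuming toy stand-ins for copymsg()/qreply()/freemsg():

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct msg {
	char data[64];
} msg_t;

/* userland stand-in for copymsg() */
static msg_t *
copymsg_stub(const msg_t *m)
{
	msg_t *n = malloc(sizeof (*n));

	if (n != NULL)
		(void) memcpy(n, m, sizeof (*n));
	return (n);
}

/*
 * One table producer: "replies" with the message it was handed and
 * returns a fresh copy for the next producer, NULL on failure.
 */
static msg_t *
stage(msg_t *m, const char *name)
{
	msg_t *next = copymsg_stub(m);

	printf("qreply: %s table\n", name);	/* stands in for qreply() */
	free(m);				/* input is consumed */
	return (next);
}

int main(void)
{
	msg_t *m = calloc(1, sizeof (*m));

	if (m == NULL)
		return (1);
	if ((m = stage(m, "ip")) == NULL)
		return (1);
	if ((m = stage(m, "icmp")) == NULL)
		return (1);
	if ((m = stage(m, "dccp")) == NULL)
		return (1);
	free(m);		/* last copy unused, like freemsg(mpctl) */
	return (0);
}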
9694 9705 /* Get global (legacy) IPv4 statistics */
9695 9706 static mblk_t *
9696 9707 ip_snmp_get_mib2_ip(queue_t *q, mblk_t *mpctl, mib2_ipIfStatsEntry_t *ipmib,
9697 9708 ip_stack_t *ipst, boolean_t legacy_req)
9698 9709 {
9699 9710 mib2_ip_t old_ip_mib;
9700 9711 struct opthdr *optp;
9701 9712 mblk_t *mp2ctl;
9702 9713 mib2_ipAddrEntry_t mae;
9703 9714
9704 9715 /*
9705 9716 * make a copy of the original message
9706 9717 */
9707 9718 mp2ctl = copymsg(mpctl);
9708 9719
9709 9720 /* fixed length IP structure... */
9710 9721 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
9711 9722 optp->level = MIB2_IP;
9712 9723 optp->name = 0;
9713 9724 SET_MIB(old_ip_mib.ipForwarding,
9714 9725 (WE_ARE_FORWARDING(ipst) ? 1 : 2));
9715 9726 SET_MIB(old_ip_mib.ipDefaultTTL,
9716 9727 (uint32_t)ipst->ips_ip_def_ttl);
9717 9728 SET_MIB(old_ip_mib.ipReasmTimeout,
9718 9729 ipst->ips_ip_reassembly_timeout);
9719 9730 SET_MIB(old_ip_mib.ipAddrEntrySize,
9720 9731 (legacy_req) ? LEGACY_MIB_SIZE(&mae, mib2_ipAddrEntry_t) :
9721 9732 sizeof (mib2_ipAddrEntry_t));
9722 9733 SET_MIB(old_ip_mib.ipRouteEntrySize,
9723 9734 sizeof (mib2_ipRouteEntry_t));
9724 9735 SET_MIB(old_ip_mib.ipNetToMediaEntrySize,
9725 9736 sizeof (mib2_ipNetToMediaEntry_t));
9726 9737 SET_MIB(old_ip_mib.ipMemberEntrySize, sizeof (ip_member_t));
9727 9738 SET_MIB(old_ip_mib.ipGroupSourceEntrySize, sizeof (ip_grpsrc_t));
9728 9739 SET_MIB(old_ip_mib.ipRouteAttributeSize,
9729 9740 sizeof (mib2_ipAttributeEntry_t));
9730 9741 SET_MIB(old_ip_mib.transportMLPSize, sizeof (mib2_transportMLPEntry_t));
9731 9742 SET_MIB(old_ip_mib.ipDestEntrySize, sizeof (dest_cache_entry_t));
9732 9743
9733 9744 /*
9734 9745 * Grab the statistics from the new IP MIB
9735 9746 */
9736 9747 SET_MIB(old_ip_mib.ipInReceives,
9737 9748 (uint32_t)ipmib->ipIfStatsHCInReceives);
9738 9749 SET_MIB(old_ip_mib.ipInHdrErrors, ipmib->ipIfStatsInHdrErrors);
9739 9750 SET_MIB(old_ip_mib.ipInAddrErrors, ipmib->ipIfStatsInAddrErrors);
9740 9751 SET_MIB(old_ip_mib.ipForwDatagrams,
9741 9752 (uint32_t)ipmib->ipIfStatsHCOutForwDatagrams);
9742 9753 SET_MIB(old_ip_mib.ipInUnknownProtos,
9743 9754 ipmib->ipIfStatsInUnknownProtos);
9744 9755 SET_MIB(old_ip_mib.ipInDiscards, ipmib->ipIfStatsInDiscards);
9745 9756 SET_MIB(old_ip_mib.ipInDelivers,
9746 9757 (uint32_t)ipmib->ipIfStatsHCInDelivers);
9747 9758 SET_MIB(old_ip_mib.ipOutRequests,
9748 9759 (uint32_t)ipmib->ipIfStatsHCOutRequests);
9749 9760 SET_MIB(old_ip_mib.ipOutDiscards, ipmib->ipIfStatsOutDiscards);
9750 9761 SET_MIB(old_ip_mib.ipOutNoRoutes, ipmib->ipIfStatsOutNoRoutes);
9751 9762 SET_MIB(old_ip_mib.ipReasmReqds, ipmib->ipIfStatsReasmReqds);
9752 9763 SET_MIB(old_ip_mib.ipReasmOKs, ipmib->ipIfStatsReasmOKs);
9753 9764 SET_MIB(old_ip_mib.ipReasmFails, ipmib->ipIfStatsReasmFails);
9754 9765 SET_MIB(old_ip_mib.ipFragOKs, ipmib->ipIfStatsOutFragOKs);
9755 9766 SET_MIB(old_ip_mib.ipFragFails, ipmib->ipIfStatsOutFragFails);
9756 9767 SET_MIB(old_ip_mib.ipFragCreates, ipmib->ipIfStatsOutFragCreates);
9757 9768
9758 9769 /* ipRoutingDiscards is not being used */
9759 9770 SET_MIB(old_ip_mib.ipRoutingDiscards, 0);
9760 9771 SET_MIB(old_ip_mib.tcpInErrs, ipmib->tcpIfStatsInErrs);
9761 9772 SET_MIB(old_ip_mib.udpNoPorts, ipmib->udpIfStatsNoPorts);
9762 9773 SET_MIB(old_ip_mib.ipInCksumErrs, ipmib->ipIfStatsInCksumErrs);
9763 9774 SET_MIB(old_ip_mib.ipReasmDuplicates,
9764 9775 ipmib->ipIfStatsReasmDuplicates);
9765 9776 SET_MIB(old_ip_mib.ipReasmPartDups, ipmib->ipIfStatsReasmPartDups);
9766 9777 SET_MIB(old_ip_mib.ipForwProhibits, ipmib->ipIfStatsForwProhibits);
9767 9778 SET_MIB(old_ip_mib.udpInCksumErrs, ipmib->udpIfStatsInCksumErrs);
9768 9779 SET_MIB(old_ip_mib.udpInOverflows, ipmib->udpIfStatsInOverflows);
9769 9780 SET_MIB(old_ip_mib.rawipInOverflows,
9770 9781 ipmib->rawipIfStatsInOverflows);
9771 9782
9772 9783 SET_MIB(old_ip_mib.ipsecInSucceeded, ipmib->ipsecIfStatsInSucceeded);
9773 9784 SET_MIB(old_ip_mib.ipsecInFailed, ipmib->ipsecIfStatsInFailed);
9774 9785 SET_MIB(old_ip_mib.ipInIPv6, ipmib->ipIfStatsInWrongIPVersion);
9775 9786 SET_MIB(old_ip_mib.ipOutIPv6, ipmib->ipIfStatsOutWrongIPVersion);
9776 9787 SET_MIB(old_ip_mib.ipOutSwitchIPv6,
9777 9788 ipmib->ipIfStatsOutSwitchIPVersion);
9778 9789
9779 9790 if (!snmp_append_data(mpctl->b_cont, (char *)&old_ip_mib,
9780 9791 (int)sizeof (old_ip_mib))) {
9781 9792 ip1dbg(("ip_snmp_get_mib2_ip: failed to allocate %u bytes\n",
9782 9793 (uint_t)sizeof (old_ip_mib)));
9783 9794 }
9784 9795
9785 9796 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
9786 9797 ip3dbg(("ip_snmp_get_mib2_ip: level %d, name %d, len %d\n",
9787 9798 (int)optp->level, (int)optp->name, (int)optp->len));
9788 9799 qreply(q, mpctl);
9789 9800 return (mp2ctl);
9790 9801 }
9791 9802
9792 9803 /* Per interface IPv4 statistics */
9793 9804 static mblk_t *
9794 9805 ip_snmp_get_mib2_ip_traffic_stats(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst,
9795 9806 boolean_t legacy_req)
9796 9807 {
9797 9808 struct opthdr *optp;
9798 9809 mblk_t *mp2ctl;
9799 9810 ill_t *ill;
9800 9811 ill_walk_context_t ctx;
9801 9812 mblk_t *mp_tail = NULL;
9802 9813 mib2_ipIfStatsEntry_t global_ip_mib;
9803 9814 mib2_ipAddrEntry_t mae;
9804 9815
9805 9816 /*
9806 9817 * Make a copy of the original message
9807 9818 */
9808 9819 mp2ctl = copymsg(mpctl);
9809 9820
9810 9821 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
9811 9822 optp->level = MIB2_IP;
9812 9823 optp->name = MIB2_IP_TRAFFIC_STATS;
9813 9824 /* Include "unknown interface" ip_mib */
9814 9825 ipst->ips_ip_mib.ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv4;
9815 9826 ipst->ips_ip_mib.ipIfStatsIfIndex =
9816 9827 MIB2_UNKNOWN_INTERFACE; /* Flag to netstat */
9817 9828 SET_MIB(ipst->ips_ip_mib.ipIfStatsForwarding,
9818 9829 (ipst->ips_ip_forwarding ? 1 : 2));
9819 9830 SET_MIB(ipst->ips_ip_mib.ipIfStatsDefaultTTL,
9820 9831 (uint32_t)ipst->ips_ip_def_ttl);
9821 9832 SET_MIB(ipst->ips_ip_mib.ipIfStatsEntrySize,
9822 9833 sizeof (mib2_ipIfStatsEntry_t));
9823 9834 SET_MIB(ipst->ips_ip_mib.ipIfStatsAddrEntrySize,
9824 9835 sizeof (mib2_ipAddrEntry_t));
9825 9836 SET_MIB(ipst->ips_ip_mib.ipIfStatsRouteEntrySize,
9826 9837 sizeof (mib2_ipRouteEntry_t));
9827 9838 SET_MIB(ipst->ips_ip_mib.ipIfStatsNetToMediaEntrySize,
9828 9839 sizeof (mib2_ipNetToMediaEntry_t));
9829 9840 SET_MIB(ipst->ips_ip_mib.ipIfStatsMemberEntrySize,
9830 9841 sizeof (ip_member_t));
9831 9842 SET_MIB(ipst->ips_ip_mib.ipIfStatsGroupSourceEntrySize,
9832 9843 sizeof (ip_grpsrc_t));
9833 9844
9834 9845 bcopy(&ipst->ips_ip_mib, &global_ip_mib, sizeof (global_ip_mib));
9835 9846
9836 9847 if (legacy_req) {
9837 9848 SET_MIB(global_ip_mib.ipIfStatsAddrEntrySize,
9838 9849 LEGACY_MIB_SIZE(&mae, mib2_ipAddrEntry_t));
9839 9850 }
9840 9851
9841 9852 if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
9842 9853 (char *)&global_ip_mib, (int)sizeof (global_ip_mib))) {
9843 9854 ip1dbg(("ip_snmp_get_mib2_ip_traffic_stats: "
9844 9855 "failed to allocate %u bytes\n",
9845 9856 (uint_t)sizeof (global_ip_mib)));
9846 9857 }
9847 9858
9848 9859 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
9849 9860 ill = ILL_START_WALK_V4(&ctx, ipst);
9850 9861 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
9851 9862 ill->ill_ip_mib->ipIfStatsIfIndex =
9852 9863 ill->ill_phyint->phyint_ifindex;
9853 9864 SET_MIB(ill->ill_ip_mib->ipIfStatsForwarding,
9854 9865 (ipst->ips_ip_forwarding ? 1 : 2));
9855 9866 SET_MIB(ill->ill_ip_mib->ipIfStatsDefaultTTL,
9856 9867 (uint32_t)ipst->ips_ip_def_ttl);
9857 9868
9858 9869 ip_mib2_add_ip_stats(&global_ip_mib, ill->ill_ip_mib);
9859 9870 if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
9860 9871 (char *)ill->ill_ip_mib,
9861 9872 (int)sizeof (*ill->ill_ip_mib))) {
9862 9873 ip1dbg(("ip_snmp_get_mib2_ip_traffic_stats: "
9863 9874 "failed to allocate %u bytes\n",
9864 9875 (uint_t)sizeof (*ill->ill_ip_mib)));
9865 9876 }
9866 9877 }
9867 9878 rw_exit(&ipst->ips_ill_g_lock);
9868 9879
9869 9880 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
9870 9881 ip3dbg(("ip_snmp_get_mib2_ip_traffic_stats: "
9871 9882 "level %d, name %d, len %d\n",
9872 9883 (int)optp->level, (int)optp->name, (int)optp->len));
9873 9884 qreply(q, mpctl);
9874 9885
9875 9886 if (mp2ctl == NULL)
9876 9887 return (NULL);
9877 9888
9878 9889 return (ip_snmp_get_mib2_ip(q, mp2ctl, &global_ip_mib, ipst,
9879 9890 legacy_req));
9880 9891 }
9881 9892
9882 9893 /* Global IPv4 ICMP statistics */
9883 9894 static mblk_t *
9884 9895 ip_snmp_get_mib2_icmp(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
9885 9896 {
9886 9897 struct opthdr *optp;
9887 9898 mblk_t *mp2ctl;
9888 9899
9889 9900 /*
9890 9901 * Make a copy of the original message
9891 9902 */
9892 9903 mp2ctl = copymsg(mpctl);
9893 9904
9894 9905 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
9895 9906 optp->level = MIB2_ICMP;
9896 9907 optp->name = 0;
9897 9908 if (!snmp_append_data(mpctl->b_cont, (char *)&ipst->ips_icmp_mib,
9898 9909 (int)sizeof (ipst->ips_icmp_mib))) {
9899 9910 ip1dbg(("ip_snmp_get_mib2_icmp: failed to allocate %u bytes\n",
9900 9911 (uint_t)sizeof (ipst->ips_icmp_mib)));
9901 9912 }
9902 9913 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
9903 9914 ip3dbg(("ip_snmp_get_mib2_icmp: level %d, name %d, len %d\n",
9904 9915 (int)optp->level, (int)optp->name, (int)optp->len));
9905 9916 qreply(q, mpctl);
9906 9917 return (mp2ctl);
9907 9918 }
9908 9919
9909 9920 /* Global IPv4 IGMP statistics */
9910 9921 static mblk_t *
9911 9922 ip_snmp_get_mib2_igmp(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
9912 9923 {
9913 9924 struct opthdr *optp;
9914 9925 mblk_t *mp2ctl;
9915 9926
9916 9927 /*
9917 9928 * make a copy of the original message
9918 9929 */
9919 9930 mp2ctl = copymsg(mpctl);
9920 9931
9921 9932 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
9922 9933 optp->level = EXPER_IGMP;
9923 9934 optp->name = 0;
9924 9935 if (!snmp_append_data(mpctl->b_cont, (char *)&ipst->ips_igmpstat,
9925 9936 (int)sizeof (ipst->ips_igmpstat))) {
9926 9937 ip1dbg(("ip_snmp_get_mib2_igmp: failed to allocate %u bytes\n",
9927 9938 (uint_t)sizeof (ipst->ips_igmpstat)));
9928 9939 }
9929 9940 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
9930 9941 ip3dbg(("ip_snmp_get_mib2_igmp: level %d, name %d, len %d\n",
9931 9942 (int)optp->level, (int)optp->name, (int)optp->len));
9932 9943 qreply(q, mpctl);
9933 9944 return (mp2ctl);
9934 9945 }
9935 9946
9936 9947 /* Global IPv4 Multicast Routing statistics */
9937 9948 static mblk_t *
9938 9949 ip_snmp_get_mib2_multi(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
9939 9950 {
9940 9951 struct opthdr *optp;
9941 9952 mblk_t *mp2ctl;
9942 9953
9943 9954 /*
9944 9955 * make a copy of the original message
9945 9956 */
9946 9957 mp2ctl = copymsg(mpctl);
9947 9958
9948 9959 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
9949 9960 optp->level = EXPER_DVMRP;
9950 9961 optp->name = 0;
9951 9962 if (!ip_mroute_stats(mpctl->b_cont, ipst)) {
9952 9963 ip0dbg(("ip_mroute_stats: failed\n"));
9953 9964 }
9954 9965 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
9955 9966 ip3dbg(("ip_snmp_get_mib2_multi: level %d, name %d, len %d\n",
9956 9967 (int)optp->level, (int)optp->name, (int)optp->len));
9957 9968 qreply(q, mpctl);
9958 9969 return (mp2ctl);
9959 9970 }
9960 9971
9961 9972 /* IPv4 address information */
9962 9973 static mblk_t *
9963 9974 ip_snmp_get_mib2_ip_addr(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst,
9964 9975 boolean_t legacy_req)
9965 9976 {
9966 9977 struct opthdr *optp;
9967 9978 mblk_t *mp2ctl;
9968 9979 mblk_t *mp_tail = NULL;
9969 9980 ill_t *ill;
9970 9981 ipif_t *ipif;
9971 9982 uint_t bitval;
9972 9983 mib2_ipAddrEntry_t mae;
9973 9984 size_t mae_size;
9974 9985 zoneid_t zoneid;
9975 9986 ill_walk_context_t ctx;
9976 9987
9977 9988 /*
9978 9989 * make a copy of the original message
9979 9990 */
9980 9991 mp2ctl = copymsg(mpctl);
9981 9992
9982 9993 mae_size = (legacy_req) ? LEGACY_MIB_SIZE(&mae, mib2_ipAddrEntry_t) :
9983 9994 sizeof (mib2_ipAddrEntry_t);
9984 9995
9985 9996 /* ipAddrEntryTable */
9986 9997
9987 9998 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
9988 9999 optp->level = MIB2_IP;
9989 10000 optp->name = MIB2_IP_ADDR;
9990 10001 zoneid = Q_TO_CONN(q)->conn_zoneid;
9991 10002
9992 10003 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
9993 10004 ill = ILL_START_WALK_V4(&ctx, ipst);
9994 10005 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
9995 10006 for (ipif = ill->ill_ipif; ipif != NULL;
9996 10007 ipif = ipif->ipif_next) {
9997 10008 if (ipif->ipif_zoneid != zoneid &&
9998 10009 ipif->ipif_zoneid != ALL_ZONES)
9999 10010 continue;
10000 10011 /* Sum of count from dead IRE_LO* and our current */
10001 10012 mae.ipAdEntInfo.ae_ibcnt = ipif->ipif_ib_pkt_count;
10002 10013 if (ipif->ipif_ire_local != NULL) {
10003 10014 mae.ipAdEntInfo.ae_ibcnt +=
10004 10015 ipif->ipif_ire_local->ire_ib_pkt_count;
10005 10016 }
10006 10017 mae.ipAdEntInfo.ae_obcnt = 0;
10007 10018 mae.ipAdEntInfo.ae_focnt = 0;
10008 10019
10009 10020 ipif_get_name(ipif, mae.ipAdEntIfIndex.o_bytes,
10010 10021 OCTET_LENGTH);
10011 10022 mae.ipAdEntIfIndex.o_length =
10012 10023 mi_strlen(mae.ipAdEntIfIndex.o_bytes);
10013 10024 mae.ipAdEntAddr = ipif->ipif_lcl_addr;
10014 10025 mae.ipAdEntNetMask = ipif->ipif_net_mask;
10015 10026 mae.ipAdEntInfo.ae_subnet = ipif->ipif_subnet;
10016 10027 mae.ipAdEntInfo.ae_subnet_len =
10017 10028 ip_mask_to_plen(ipif->ipif_net_mask);
10018 10029 mae.ipAdEntInfo.ae_src_addr = ipif->ipif_lcl_addr;
10019 10030 for (bitval = 1;
10020 10031 bitval &&
10021 10032 !(bitval & ipif->ipif_brd_addr);
10022 10033 bitval <<= 1)
10023 10034 noop;
10024 10035 mae.ipAdEntBcastAddr = bitval;
10025 10036 mae.ipAdEntReasmMaxSize = IP_MAXPACKET;
10026 10037 mae.ipAdEntInfo.ae_mtu = ipif->ipif_ill->ill_mtu;
10027 10038 mae.ipAdEntInfo.ae_metric = ipif->ipif_ill->ill_metric;
10028 10039 mae.ipAdEntInfo.ae_broadcast_addr =
10029 10040 ipif->ipif_brd_addr;
10030 10041 mae.ipAdEntInfo.ae_pp_dst_addr =
10031 10042 ipif->ipif_pp_dst_addr;
10032 10043 mae.ipAdEntInfo.ae_flags = ipif->ipif_flags |
10033 10044 ill->ill_flags | ill->ill_phyint->phyint_flags;
10034 10045 mae.ipAdEntRetransmitTime =
10035 10046 ill->ill_reachable_retrans_time;
10036 10047
10037 10048 if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
10038 10049 (char *)&mae, (int)mae_size)) {
10039 10050 ip1dbg(("ip_snmp_get_mib2_ip_addr: failed to "
10040 10051 "allocate %u bytes\n", (uint_t)mae_size));
10041 10052 }
10042 10053 }
10043 10054 }
10044 10055 rw_exit(&ipst->ips_ill_g_lock);
10045 10056
10046 10057 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
10047 10058 ip3dbg(("ip_snmp_get_mib2_ip_addr: level %d, name %d, len %d\n",
10048 10059 (int)optp->level, (int)optp->name, (int)optp->len));
10049 10060 qreply(q, mpctl);
10050 10061 return (mp2ctl);
10051 10062 }
10052 10063
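The bitval loop above computes ipAdEntBcastAddr by scanning for the least-significant set bit of the broadcast address, since MIB-II (RFC 1213) encodes the broadcast form as the value of that low-order bit (1 for the customary all-ones broadcast). A standalone sketch of the scan, with a hypothetical address:

#include <stdio.h>
#include <inttypes.h>

int main(void)
{
	uint32_t brd = 0xc0a801ffUL;	/* hypothetical 192.168.1.255 */
	uint32_t bitval;

	/* walk upward until a set bit is found; 0 if brd is 0 */
	for (bitval = 1; bitval && !(bitval & brd); bitval <<= 1)
		;
	printf("ipAdEntBcastAddr = %" PRIu32 "\n", bitval);	/* 1 here */
	return (0);
}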
10053 10064 /* IPv6 address information */
10054 10065 static mblk_t *
10055 10066 ip_snmp_get_mib2_ip6_addr(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst,
10056 10067 boolean_t legacy_req)
10057 10068 {
10058 10069 struct opthdr *optp;
10059 10070 mblk_t *mp2ctl;
10060 10071 mblk_t *mp_tail = NULL;
10061 10072 ill_t *ill;
10062 10073 ipif_t *ipif;
10063 10074 mib2_ipv6AddrEntry_t mae6;
10064 10075 size_t mae6_size;
10065 10076 zoneid_t zoneid;
10066 10077 ill_walk_context_t ctx;
10067 10078
10068 10079 /*
10069 10080 * make a copy of the original message
10070 10081 */
10071 10082 mp2ctl = copymsg(mpctl);
10072 10083
10073 10084 mae6_size = (legacy_req) ?
10074 10085 LEGACY_MIB_SIZE(&mae6, mib2_ipv6AddrEntry_t) :
10075 10086 sizeof (mib2_ipv6AddrEntry_t);
10076 10087
10077 10088 /* ipv6AddrEntryTable */
10078 10089
10079 10090 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10080 10091 optp->level = MIB2_IP6;
10081 10092 optp->name = MIB2_IP6_ADDR;
10082 10093 zoneid = Q_TO_CONN(q)->conn_zoneid;
10083 10094
10084 10095 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
10085 10096 ill = ILL_START_WALK_V6(&ctx, ipst);
10086 10097 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
10087 10098 for (ipif = ill->ill_ipif; ipif != NULL;
10088 10099 ipif = ipif->ipif_next) {
10089 10100 if (ipif->ipif_zoneid != zoneid &&
10090 10101 ipif->ipif_zoneid != ALL_ZONES)
10091 10102 continue;
10092 10103 /* Sum of count from dead IRE_LO* and our current */
10093 10104 mae6.ipv6AddrInfo.ae_ibcnt = ipif->ipif_ib_pkt_count;
10094 10105 if (ipif->ipif_ire_local != NULL) {
10095 10106 mae6.ipv6AddrInfo.ae_ibcnt +=
10096 10107 ipif->ipif_ire_local->ire_ib_pkt_count;
10097 10108 }
10098 10109 mae6.ipv6AddrInfo.ae_obcnt = 0;
10099 10110 mae6.ipv6AddrInfo.ae_focnt = 0;
10100 10111
10101 10112 ipif_get_name(ipif, mae6.ipv6AddrIfIndex.o_bytes,
10102 10113 OCTET_LENGTH);
10103 10114 mae6.ipv6AddrIfIndex.o_length =
10104 10115 mi_strlen(mae6.ipv6AddrIfIndex.o_bytes);
10105 10116 mae6.ipv6AddrAddress = ipif->ipif_v6lcl_addr;
10106 10117 mae6.ipv6AddrPfxLength =
10107 10118 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask);
10108 10119 mae6.ipv6AddrInfo.ae_subnet = ipif->ipif_v6subnet;
10109 10120 mae6.ipv6AddrInfo.ae_subnet_len =
10110 10121 mae6.ipv6AddrPfxLength;
10111 10122 mae6.ipv6AddrInfo.ae_src_addr = ipif->ipif_v6lcl_addr;
10112 10123
10113 10124 /* Type: stateless(1), stateful(2), unknown(3) */
10114 10125 if (ipif->ipif_flags & IPIF_ADDRCONF)
10115 10126 mae6.ipv6AddrType = 1;
10116 10127 else
10117 10128 mae6.ipv6AddrType = 2;
10118 10129 /* Anycast: true(1), false(2) */
10119 10130 if (ipif->ipif_flags & IPIF_ANYCAST)
10120 10131 mae6.ipv6AddrAnycastFlag = 1;
10121 10132 else
10122 10133 mae6.ipv6AddrAnycastFlag = 2;
10123 10134
10124 10135 /*
10125 10136 * Address status: preferred(1), deprecated(2),
10126 10137 * invalid(3), inaccessible(4), unknown(5)
10127 10138 */
10128 10139 if (ipif->ipif_flags & IPIF_NOLOCAL)
10129 10140 mae6.ipv6AddrStatus = 3;
10130 10141 else if (ipif->ipif_flags & IPIF_DEPRECATED)
10131 10142 mae6.ipv6AddrStatus = 2;
10132 10143 else
10133 10144 mae6.ipv6AddrStatus = 1;
10134 10145 mae6.ipv6AddrInfo.ae_mtu = ipif->ipif_ill->ill_mtu;
10135 10146 mae6.ipv6AddrInfo.ae_metric =
10136 10147 ipif->ipif_ill->ill_metric;
10137 10148 mae6.ipv6AddrInfo.ae_pp_dst_addr =
10138 10149 ipif->ipif_v6pp_dst_addr;
10139 10150 mae6.ipv6AddrInfo.ae_flags = ipif->ipif_flags |
10140 10151 ill->ill_flags | ill->ill_phyint->phyint_flags;
10141 10152 mae6.ipv6AddrReasmMaxSize = IP_MAXPACKET;
10142 10153 mae6.ipv6AddrIdentifier = ill->ill_token;
10143 10154 mae6.ipv6AddrIdentifierLen = ill->ill_token_length;
10144 10155 mae6.ipv6AddrReachableTime = ill->ill_reachable_time;
10145 10156 mae6.ipv6AddrRetransmitTime =
10146 10157 ill->ill_reachable_retrans_time;
10147 10158 if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
10148 10159 (char *)&mae6, (int)mae6_size)) {
10149 10160 ip1dbg(("ip_snmp_get_mib2_ip6_addr: failed to "
10150 10161 "allocate %u bytes\n",
10151 10162 (uint_t)mae6_size));
10152 10163 }
10153 10164 }
10154 10165 }
10155 10166 rw_exit(&ipst->ips_ill_g_lock);
10156 10167
10157 10168 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
10158 10169 ip3dbg(("ip_snmp_get_mib2_ip6_addr: level %d, name %d, len %d\n",
10159 10170 (int)optp->level, (int)optp->name, (int)optp->len));
10160 10171 qreply(q, mpctl);
10161 10172 return (mp2ctl);
10162 10173 }
10163 10174
10164 10175 /* IPv4 multicast group membership. */
10165 10176 static mblk_t *
10166 10177 ip_snmp_get_mib2_ip_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
10167 10178 {
10168 10179 struct opthdr *optp;
10169 10180 mblk_t *mp2ctl;
10170 10181 ill_t *ill;
10171 10182 ipif_t *ipif;
10172 10183 ilm_t *ilm;
10173 10184 ip_member_t ipm;
10174 10185 mblk_t *mp_tail = NULL;
10175 10186 ill_walk_context_t ctx;
10176 10187 zoneid_t zoneid;
10177 10188
10178 10189 /*
10179 10190 * make a copy of the original message
10180 10191 */
10181 10192 mp2ctl = copymsg(mpctl);
10182 10193 zoneid = Q_TO_CONN(q)->conn_zoneid;
10183 10194
10184 10195 /* ipGroupMember table */
10185 10196 optp = (struct opthdr *)&mpctl->b_rptr[
10186 10197 sizeof (struct T_optmgmt_ack)];
10187 10198 optp->level = MIB2_IP;
10188 10199 optp->name = EXPER_IP_GROUP_MEMBERSHIP;
10189 10200
10190 10201 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
10191 10202 ill = ILL_START_WALK_V4(&ctx, ipst);
10192 10203 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
10193 10204 /* Make sure the ill isn't going away. */
10194 10205 if (!ill_check_and_refhold(ill))
10195 10206 continue;
10196 10207 rw_exit(&ipst->ips_ill_g_lock);
10197 10208 rw_enter(&ill->ill_mcast_lock, RW_READER);
10198 10209 for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
10199 10210 if (ilm->ilm_zoneid != zoneid &&
10200 10211 ilm->ilm_zoneid != ALL_ZONES)
10201 10212 continue;
10202 10213
10203 10214 /* Is there an ipif for ilm_ifaddr? */
10204 10215 for (ipif = ill->ill_ipif; ipif != NULL;
10205 10216 ipif = ipif->ipif_next) {
10206 10217 if (!IPIF_IS_CONDEMNED(ipif) &&
10207 10218 ipif->ipif_lcl_addr == ilm->ilm_ifaddr &&
10208 10219 ilm->ilm_ifaddr != INADDR_ANY)
10209 10220 break;
10210 10221 }
10211 10222 if (ipif != NULL) {
10212 10223 ipif_get_name(ipif,
10213 10224 ipm.ipGroupMemberIfIndex.o_bytes,
10214 10225 OCTET_LENGTH);
10215 10226 } else {
10216 10227 ill_get_name(ill,
10217 10228 ipm.ipGroupMemberIfIndex.o_bytes,
10218 10229 OCTET_LENGTH);
10219 10230 }
10220 10231 ipm.ipGroupMemberIfIndex.o_length =
10221 10232 mi_strlen(ipm.ipGroupMemberIfIndex.o_bytes);
10222 10233
10223 10234 ipm.ipGroupMemberAddress = ilm->ilm_addr;
10224 10235 ipm.ipGroupMemberRefCnt = ilm->ilm_refcnt;
10225 10236 ipm.ipGroupMemberFilterMode = ilm->ilm_fmode;
10226 10237 if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
10227 10238 (char *)&ipm, (int)sizeof (ipm))) {
10228 10239 ip1dbg(("ip_snmp_get_mib2_ip_group: "
10229 10240 "failed to allocate %u bytes\n",
10230 10241 (uint_t)sizeof (ipm)));
10231 10242 }
10232 10243 }
10233 10244 rw_exit(&ill->ill_mcast_lock);
10234 10245 ill_refrele(ill);
10235 10246 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
10236 10247 }
10237 10248 rw_exit(&ipst->ips_ill_g_lock);
10238 10249 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
10239 10250 ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
10240 10251 (int)optp->level, (int)optp->name, (int)optp->len));
10241 10252 qreply(q, mpctl);
10242 10253 return (mp2ctl);
10243 10254 }
10244 10255
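The walk above follows a careful locking pattern: hold ips_ill_g_lock only long enough to take a reference on the current ill, drop it while examining that ill under its own ill_mcast_lock, then reacquire it to advance. A simplified pthread sketch of the refhold-and-drop idea (static toy list; the real code additionally relies on a walk context to survive concurrent deletions):

#include <pthread.h>
#include <stdio.h>

typedef struct node {
	struct node *next;
	int refcnt;
	int value;
} node_t;

static pthread_rwlock_t list_lock = PTHREAD_RWLOCK_INITIALIZER;
static pthread_mutex_t ref_lock = PTHREAD_MUTEX_INITIALIZER;

static node_t c = { NULL, 1, 3 };
static node_t b = { &c, 1, 2 };
static node_t a = { &b, 1, 1 };
static node_t *list_head = &a;

static void
refhold(node_t *n)
{
	pthread_mutex_lock(&ref_lock);
	n->refcnt++;
	pthread_mutex_unlock(&ref_lock);
}

static void
refrele(node_t *n)
{
	pthread_mutex_lock(&ref_lock);
	n->refcnt--;
	pthread_mutex_unlock(&ref_lock);
}

int main(void)
{
	node_t *n;

	pthread_rwlock_rdlock(&list_lock);
	for (n = list_head; n != NULL; n = n->next) {
		refhold(n);		/* pin n so it can't disappear */
		pthread_rwlock_unlock(&list_lock);

		printf("visiting %d\n", n->value);	/* slow work here */

		refrele(n);
		pthread_rwlock_rdlock(&list_lock);
		/*
		 * NB: the kernel advances via ill_next() and a walk
		 * context, which copes with concurrent deletions; this
		 * sketch assumes the static list never changes.
		 */
	}
	pthread_rwlock_unlock(&list_lock);
	return (0);
}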
10245 10256 /* IPv6 multicast group membership. */
10246 10257 static mblk_t *
10247 10258 ip_snmp_get_mib2_ip6_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
10248 10259 {
10249 10260 struct opthdr *optp;
10250 10261 mblk_t *mp2ctl;
10251 10262 ill_t *ill;
10252 10263 ilm_t *ilm;
10253 10264 ipv6_member_t ipm6;
10254 10265 mblk_t *mp_tail = NULL;
10255 10266 ill_walk_context_t ctx;
10256 10267 zoneid_t zoneid;
10257 10268
10258 10269 /*
10259 10270 * make a copy of the original message
10260 10271 */
10261 10272 mp2ctl = copymsg(mpctl);
10262 10273 zoneid = Q_TO_CONN(q)->conn_zoneid;
10263 10274
10264 10275 /* ip6GroupMember table */
10265 10276 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10266 10277 optp->level = MIB2_IP6;
10267 10278 optp->name = EXPER_IP6_GROUP_MEMBERSHIP;
10268 10279
10269 10280 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
10270 10281 ill = ILL_START_WALK_V6(&ctx, ipst);
10271 10282 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
10272 10283 /* Make sure the ill isn't going away. */
10273 10284 if (!ill_check_and_refhold(ill))
10274 10285 continue;
10275 10286 rw_exit(&ipst->ips_ill_g_lock);
10276 10287 /*
10277 10288 * Normally we don't have any members on underlying IPMP interfaces.
10278 10289 * We report them as a debugging aid.
10279 10290 */
10280 10291 rw_enter(&ill->ill_mcast_lock, RW_READER);
10281 10292 ipm6.ipv6GroupMemberIfIndex = ill->ill_phyint->phyint_ifindex;
10282 10293 for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
10283 10294 if (ilm->ilm_zoneid != zoneid &&
10284 10295 ilm->ilm_zoneid != ALL_ZONES)
10285 10296 continue; /* not this zone */
10286 10297 ipm6.ipv6GroupMemberAddress = ilm->ilm_v6addr;
10287 10298 ipm6.ipv6GroupMemberRefCnt = ilm->ilm_refcnt;
10288 10299 ipm6.ipv6GroupMemberFilterMode = ilm->ilm_fmode;
10289 10300 if (!snmp_append_data2(mpctl->b_cont,
10290 10301 &mp_tail,
10291 10302 (char *)&ipm6, (int)sizeof (ipm6))) {
10292 10303 ip1dbg(("ip_snmp_get_mib2_ip6_group: "
10293 10304 "failed to allocate %u bytes\n",
10294 10305 (uint_t)sizeof (ipm6)));
10295 10306 }
10296 10307 }
10297 10308 rw_exit(&ill->ill_mcast_lock);
10298 10309 ill_refrele(ill);
10299 10310 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
10300 10311 }
10301 10312 rw_exit(&ipst->ips_ill_g_lock);
10302 10313
10303 10314 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
10304 10315 ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
10305 10316 (int)optp->level, (int)optp->name, (int)optp->len));
10306 10317 qreply(q, mpctl);
10307 10318 return (mp2ctl);
10308 10319 }
10309 10320
10310 10321 /* IP multicast filtered sources */
10311 10322 static mblk_t *
10312 10323 ip_snmp_get_mib2_ip_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
10313 10324 {
10314 10325 struct opthdr *optp;
10315 10326 mblk_t *mp2ctl;
10316 10327 ill_t *ill;
10317 10328 ipif_t *ipif;
10318 10329 ilm_t *ilm;
10319 10330 ip_grpsrc_t ips;
10320 10331 mblk_t *mp_tail = NULL;
10321 10332 ill_walk_context_t ctx;
10322 10333 zoneid_t zoneid;
10323 10334 int i;
10324 10335 slist_t *sl;
10325 10336
10326 10337 /*
10327 10338 * make a copy of the original message
10328 10339 */
10329 10340 mp2ctl = copymsg(mpctl);
10330 10341 zoneid = Q_TO_CONN(q)->conn_zoneid;
10331 10342
10332 10343 /* ipGroupSource table */
10333 10344 optp = (struct opthdr *)&mpctl->b_rptr[
10334 10345 sizeof (struct T_optmgmt_ack)];
10335 10346 optp->level = MIB2_IP;
10336 10347 optp->name = EXPER_IP_GROUP_SOURCES;
10337 10348
10338 10349 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
10339 10350 ill = ILL_START_WALK_V4(&ctx, ipst);
10340 10351 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
10341 10352 /* Make sure the ill isn't going away. */
10342 10353 if (!ill_check_and_refhold(ill))
10343 10354 continue;
10344 10355 rw_exit(&ipst->ips_ill_g_lock);
10345 10356 rw_enter(&ill->ill_mcast_lock, RW_READER);
10346 10357 for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
10347 10358 sl = ilm->ilm_filter;
10348 10359 if (ilm->ilm_zoneid != zoneid &&
10349 10360 ilm->ilm_zoneid != ALL_ZONES)
10350 10361 continue;
10351 10362 if (SLIST_IS_EMPTY(sl))
10352 10363 continue;
10353 10364
10354 10365 /* Is there an ipif for ilm_ifaddr? */
10355 10366 for (ipif = ill->ill_ipif; ipif != NULL;
10356 10367 ipif = ipif->ipif_next) {
10357 10368 if (!IPIF_IS_CONDEMNED(ipif) &&
10358 10369 ipif->ipif_lcl_addr == ilm->ilm_ifaddr &&
10359 10370 ilm->ilm_ifaddr != INADDR_ANY)
10360 10371 break;
10361 10372 }
10362 10373 if (ipif != NULL) {
10363 10374 ipif_get_name(ipif,
10364 10375 ips.ipGroupSourceIfIndex.o_bytes,
10365 10376 OCTET_LENGTH);
10366 10377 } else {
10367 10378 ill_get_name(ill,
10368 10379 ips.ipGroupSourceIfIndex.o_bytes,
10369 10380 OCTET_LENGTH);
10370 10381 }
10371 10382 ips.ipGroupSourceIfIndex.o_length =
10372 10383 mi_strlen(ips.ipGroupSourceIfIndex.o_bytes);
10373 10384
10374 10385 ips.ipGroupSourceGroup = ilm->ilm_addr;
10375 10386 for (i = 0; i < sl->sl_numsrc; i++) {
10376 10387 if (!IN6_IS_ADDR_V4MAPPED(&sl->sl_addr[i]))
10377 10388 continue;
10378 10389 IN6_V4MAPPED_TO_IPADDR(&sl->sl_addr[i],
10379 10390 ips.ipGroupSourceAddress);
10380 10391 if (snmp_append_data2(mpctl->b_cont, &mp_tail,
10381 10392 (char *)&ips, (int)sizeof (ips)) == 0) {
10382 10393 ip1dbg(("ip_snmp_get_mib2_ip_group_src:"
10383 10394 " failed to allocate %u bytes\n",
10384 10395 (uint_t)sizeof (ips)));
10385 10396 }
10386 10397 }
10387 10398 }
10388 10399 rw_exit(&ill->ill_mcast_lock);
10389 10400 ill_refrele(ill);
10390 10401 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
10391 10402 }
10392 10403 rw_exit(&ipst->ips_ill_g_lock);
10393 10404 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
10394 10405 ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
10395 10406 (int)optp->level, (int)optp->name, (int)optp->len));
10396 10407 qreply(q, mpctl);
10397 10408 return (mp2ctl);
10398 10409 }
10399 10410
10400 10411 /* IPv6 multicast filtered sources. */
10401 10412 static mblk_t *
10402 10413 ip_snmp_get_mib2_ip6_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
10403 10414 {
10404 10415 struct opthdr *optp;
10405 10416 mblk_t *mp2ctl;
10406 10417 ill_t *ill;
10407 10418 ilm_t *ilm;
10408 10419 ipv6_grpsrc_t ips6;
10409 10420 mblk_t *mp_tail = NULL;
10410 10421 ill_walk_context_t ctx;
10411 10422 zoneid_t zoneid;
10412 10423 int i;
10413 10424 slist_t *sl;
10414 10425
10415 10426 /*
10416 10427 * make a copy of the original message
10417 10428 */
10418 10429 mp2ctl = copymsg(mpctl);
10419 10430 zoneid = Q_TO_CONN(q)->conn_zoneid;
10420 10431
10421 10432 /* ip6GroupMember table */
10422 10433 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10423 10434 optp->level = MIB2_IP6;
10424 10435 optp->name = EXPER_IP6_GROUP_SOURCES;
10425 10436
10426 10437 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
10427 10438 ill = ILL_START_WALK_V6(&ctx, ipst);
10428 10439 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
10429 10440 /* Make sure the ill isn't going away. */
10430 10441 if (!ill_check_and_refhold(ill))
10431 10442 continue;
10432 10443 rw_exit(&ipst->ips_ill_g_lock);
10433 10444 /*
10434 10445 * Normally we don't have any members on underlying IPMP interfaces.
10435 10446 * We report them as a debugging aid.
10436 10447 */
10437 10448 rw_enter(&ill->ill_mcast_lock, RW_READER);
10438 10449 ips6.ipv6GroupSourceIfIndex = ill->ill_phyint->phyint_ifindex;
10439 10450 for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
10440 10451 sl = ilm->ilm_filter;
10441 10452 if (ilm->ilm_zoneid != zoneid &&
10442 10453 ilm->ilm_zoneid != ALL_ZONES)
10443 10454 continue;
10444 10455 if (SLIST_IS_EMPTY(sl))
10445 10456 continue;
10446 10457 ips6.ipv6GroupSourceGroup = ilm->ilm_v6addr;
10447 10458 for (i = 0; i < sl->sl_numsrc; i++) {
10448 10459 ips6.ipv6GroupSourceAddress = sl->sl_addr[i];
10449 10460 if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
10450 10461 (char *)&ips6, (int)sizeof (ips6))) {
10451 10462 ip1dbg(("ip_snmp_get_mib2_ip6_"
10452 10463 "group_src: failed to allocate "
10453 10464 "%u bytes\n",
10454 10465 (uint_t)sizeof (ips6)));
10455 10466 }
10456 10467 }
10457 10468 }
10458 10469 rw_exit(&ill->ill_mcast_lock);
10459 10470 ill_refrele(ill);
10460 10471 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
10461 10472 }
10462 10473 rw_exit(&ipst->ips_ill_g_lock);
10463 10474
10464 10475 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
10465 10476 ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
10466 10477 (int)optp->level, (int)optp->name, (int)optp->len));
10467 10478 qreply(q, mpctl);
10468 10479 return (mp2ctl);
10469 10480 }
10470 10481
10471 10482 /* Multicast routing virtual interface table. */
10472 10483 static mblk_t *
10473 10484 ip_snmp_get_mib2_virt_multi(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
10474 10485 {
10475 10486 struct opthdr *optp;
10476 10487 mblk_t *mp2ctl;
10477 10488
10478 10489 /*
10479 10490 * make a copy of the original message
10480 10491 */
10481 10492 mp2ctl = copymsg(mpctl);
10482 10493
10483 10494 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10484 10495 optp->level = EXPER_DVMRP;
10485 10496 optp->name = EXPER_DVMRP_VIF;
10486 10497 if (!ip_mroute_vif(mpctl->b_cont, ipst)) {
10487 10498 ip0dbg(("ip_mroute_vif: failed\n"));
10488 10499 }
10489 10500 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
10490 10501 ip3dbg(("ip_snmp_get_mib2_virt_multi: level %d, name %d, len %d\n",
10491 10502 (int)optp->level, (int)optp->name, (int)optp->len));
10492 10503 qreply(q, mpctl);
10493 10504 return (mp2ctl);
10494 10505 }
10495 10506
10496 10507 /* Multicast routing table. */
10497 10508 static mblk_t *
10498 10509 ip_snmp_get_mib2_multi_rtable(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
10499 10510 {
10500 10511 struct opthdr *optp;
10501 10512 mblk_t *mp2ctl;
10502 10513
10503 10514 /*
10504 10515 * make a copy of the original message
10505 10516 */
10506 10517 mp2ctl = copymsg(mpctl);
10507 10518
10508 10519 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10509 10520 optp->level = EXPER_DVMRP;
10510 10521 optp->name = EXPER_DVMRP_MRT;
10511 10522 if (!ip_mroute_mrt(mpctl->b_cont, ipst)) {
10512 10523 ip0dbg(("ip_mroute_mrt: failed\n"));
10513 10524 }
10514 10525 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
10515 10526 ip3dbg(("ip_snmp_get_mib2_multi_rtable: level %d, name %d, len %d\n",
10516 10527 (int)optp->level, (int)optp->name, (int)optp->len));
10517 10528 qreply(q, mpctl);
10518 10529 return (mp2ctl);
10519 10530 }
10520 10531
10521 10532 /*
10522 10533 * Return ipRouteEntryTable, ipNetToMediaEntryTable, and ipRouteAttributeTable
10523 10534 * in one IRE walk.
10524 10535 */
10525 10536 static mblk_t *
10526 10537 ip_snmp_get_mib2_ip_route_media(queue_t *q, mblk_t *mpctl, int level,
10527 10538 ip_stack_t *ipst)
10528 10539 {
10529 10540 struct opthdr *optp;
10530 10541 mblk_t *mp2ctl; /* Returned */
10531 10542 mblk_t *mp3ctl; /* nettomedia */
10532 10543 mblk_t *mp4ctl; /* routeattrs */
10533 10544 iproutedata_t ird;
10534 10545 zoneid_t zoneid;
10535 10546
10536 10547 /*
10537 10548 * make copies of the original message
10538 10549 * - mp2ctl is returned unchanged to the caller for his use
10539 10550 * - mpctl is sent upstream as ipRouteEntryTable
10540 10551 * - mp3ctl is sent upstream as ipNetToMediaEntryTable
10541 10552 * - mp4ctl is sent upstream as ipRouteAttributeTable
10542 10553 */
10543 10554 mp2ctl = copymsg(mpctl);
10544 10555 mp3ctl = copymsg(mpctl);
10545 10556 mp4ctl = copymsg(mpctl);
10546 10557 if (mp3ctl == NULL || mp4ctl == NULL) {
10547 10558 freemsg(mp4ctl);
10548 10559 freemsg(mp3ctl);
10549 10560 freemsg(mp2ctl);
10550 10561 freemsg(mpctl);
10551 10562 return (NULL);
10552 10563 }
10553 10564
10554 10565 bzero(&ird, sizeof (ird));
10555 10566
10556 10567 ird.ird_route.lp_head = mpctl->b_cont;
10557 10568 ird.ird_netmedia.lp_head = mp3ctl->b_cont;
10558 10569 ird.ird_attrs.lp_head = mp4ctl->b_cont;
10559 10570 /*
10560 10571 * If the level has been set to the special EXPER_IP_AND_ALL_IRES value,
10561 10572 * then also include ire_testhidden IREs and IRE_IF_CLONE. This is
10562 10573 * intended as a temporary solution until a proper MIB API is provided
10563 10574 * that supports complete filtering/caller-opt-in.
10564 10575 */
10565 10576 if (level == EXPER_IP_AND_ALL_IRES)
10566 10577 ird.ird_flags |= IRD_REPORT_ALL;
10567 10578
10568 10579 zoneid = Q_TO_CONN(q)->conn_zoneid;
10569 10580 ire_walk_v4(ip_snmp_get2_v4, &ird, zoneid, ipst);
10570 10581
10571 10582 /* ipRouteEntryTable in mpctl */
10572 10583 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10573 10584 optp->level = MIB2_IP;
10574 10585 optp->name = MIB2_IP_ROUTE;
10575 10586 optp->len = msgdsize(ird.ird_route.lp_head);
10576 10587 ip3dbg(("ip_snmp_get_mib2_ip_route_media: level %d, name %d, len %d\n",
10577 10588 (int)optp->level, (int)optp->name, (int)optp->len));
10578 10589 qreply(q, mpctl);
10579 10590
10580 10591 /* ipNetToMediaEntryTable in mp3ctl */
10581 10592 ncec_walk(NULL, ip_snmp_get2_v4_media, &ird, ipst);
10582 10593
10583 10594 optp = (struct opthdr *)&mp3ctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10584 10595 optp->level = MIB2_IP;
10585 10596 optp->name = MIB2_IP_MEDIA;
10586 10597 optp->len = msgdsize(ird.ird_netmedia.lp_head);
10587 10598 ip3dbg(("ip_snmp_get_mib2_ip_route_media: level %d, name %d, len %d\n",
10588 10599 (int)optp->level, (int)optp->name, (int)optp->len));
10589 10600 qreply(q, mp3ctl);
10590 10601
10591 10602 /* ipRouteAttributeTable in mp4ctl */
10592 10603 optp = (struct opthdr *)&mp4ctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10593 10604 optp->level = MIB2_IP;
10594 10605 optp->name = EXPER_IP_RTATTR;
10595 10606 optp->len = msgdsize(ird.ird_attrs.lp_head);
10596 10607 ip3dbg(("ip_snmp_get_mib2_ip_route_media: level %d, name %d, len %d\n",
10597 10608 (int)optp->level, (int)optp->name, (int)optp->len));
10598 10609 if (optp->len == 0)
10599 10610 freemsg(mp4ctl);
10600 10611 else
10601 10612 qreply(q, mp4ctl);
10602 10613
10603 10614 return (mp2ctl);
10604 10615 }
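/*
 * Editor's note: a hedged restatement of the copy-then-cleanup pattern used
 * above.  Three copies of the control message are taken up front; if either
 * of the two checked copies fails, everything (including the original) is
 * freed and NULL is returned, so the caller never sees a half-built reply.
 * The free routine is assumed, like freemsg(), to accept NULL as a no-op.
 * sketch_copy/sketch_free are hypothetical stand-ins for copymsg/freemsg.
 */
static void *
sketch_triple_copy(void *orig, void *(*sketch_copy)(void *),
    void (*sketch_free)(void *))
{
	void *c2 = sketch_copy(orig);	/* returned to the caller */
	void *c3 = sketch_copy(orig);	/* second table */
	void *c4 = sketch_copy(orig);	/* third table */

	if (c3 == NULL || c4 == NULL) {
		sketch_free(c4);
		sketch_free(c3);
		sketch_free(c2);
		sketch_free(orig);
		return (NULL);
	}
	/* ... fill the three tables and qreply() each of them ... */
	return (c2);
}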
10605 10616
10606 10617 /*
10607 10618 * Return ipv6RouteEntryTable and ipv6RouteAttributeTable in one IRE walk, and
10608 10619 * ipv6NetToMediaEntryTable in an NDP walk.
10609 10620 */
10610 10621 static mblk_t *
10611 10622 ip_snmp_get_mib2_ip6_route_media(queue_t *q, mblk_t *mpctl, int level,
10612 10623 ip_stack_t *ipst)
10613 10624 {
10614 10625 struct opthdr *optp;
10615 10626 mblk_t *mp2ctl; /* Returned */
10616 10627 mblk_t *mp3ctl; /* nettomedia */
10617 10628 mblk_t *mp4ctl; /* routeattrs */
10618 10629 iproutedata_t ird;
10619 10630 zoneid_t zoneid;
10620 10631
10621 10632 /*
10622 10633 * make copies of the original message
10623 10634 	 * - mp2ctl is returned unchanged to the caller for later use
10624 10635 * - mpctl is sent upstream as ipv6RouteEntryTable
10625 10636 * - mp3ctl is sent upstream as ipv6NetToMediaEntryTable
10626 10637 * - mp4ctl is sent upstream as ipv6RouteAttributeTable
10627 10638 */
10628 10639 mp2ctl = copymsg(mpctl);
10629 10640 mp3ctl = copymsg(mpctl);
10630 10641 mp4ctl = copymsg(mpctl);
10631 10642 if (mp3ctl == NULL || mp4ctl == NULL) {
10632 10643 freemsg(mp4ctl);
10633 10644 freemsg(mp3ctl);
10634 10645 freemsg(mp2ctl);
10635 10646 freemsg(mpctl);
10636 10647 return (NULL);
10637 10648 }
10638 10649
10639 10650 bzero(&ird, sizeof (ird));
10640 10651
10641 10652 ird.ird_route.lp_head = mpctl->b_cont;
10642 10653 ird.ird_netmedia.lp_head = mp3ctl->b_cont;
10643 10654 ird.ird_attrs.lp_head = mp4ctl->b_cont;
10644 10655 /*
10645 10656 	 * If the level has been set to the special EXPER_IP_AND_ALL_IRES
10646 10657 	 * value, then also include ire_testhidden IREs and IRE_IF_CLONE.
10647 10658 	 * This is intended as a temporary solution until a proper MIB API
10648 10659 	 * is provided that allows complete filtering/caller-opt-in.
10649 10660 */
10650 10661 if (level == EXPER_IP_AND_ALL_IRES)
10651 10662 ird.ird_flags |= IRD_REPORT_ALL;
10652 10663
10653 10664 zoneid = Q_TO_CONN(q)->conn_zoneid;
10654 10665 ire_walk_v6(ip_snmp_get2_v6_route, &ird, zoneid, ipst);
10655 10666
10656 10667 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10657 10668 optp->level = MIB2_IP6;
10658 10669 optp->name = MIB2_IP6_ROUTE;
10659 10670 optp->len = msgdsize(ird.ird_route.lp_head);
10660 10671 ip3dbg(("ip_snmp_get_mib2_ip6_route_media: level %d, name %d, len %d\n",
10661 10672 (int)optp->level, (int)optp->name, (int)optp->len));
10662 10673 qreply(q, mpctl);
10663 10674
10664 10675 /* ipv6NetToMediaEntryTable in mp3ctl */
10665 10676 ncec_walk(NULL, ip_snmp_get2_v6_media, &ird, ipst);
10666 10677
10667 10678 optp = (struct opthdr *)&mp3ctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10668 10679 optp->level = MIB2_IP6;
10669 10680 optp->name = MIB2_IP6_MEDIA;
10670 10681 optp->len = msgdsize(ird.ird_netmedia.lp_head);
10671 10682 ip3dbg(("ip_snmp_get_mib2_ip6_route_media: level %d, name %d, len %d\n",
10672 10683 (int)optp->level, (int)optp->name, (int)optp->len));
10673 10684 qreply(q, mp3ctl);
10674 10685
10675 10686 /* ipv6RouteAttributeTable in mp4ctl */
10676 10687 optp = (struct opthdr *)&mp4ctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10677 10688 optp->level = MIB2_IP6;
10678 10689 optp->name = EXPER_IP_RTATTR;
10679 10690 optp->len = msgdsize(ird.ird_attrs.lp_head);
10680 10691 ip3dbg(("ip_snmp_get_mib2_ip6_route_media: level %d, name %d, len %d\n",
10681 10692 (int)optp->level, (int)optp->name, (int)optp->len));
10682 10693 if (optp->len == 0)
10683 10694 freemsg(mp4ctl);
10684 10695 else
10685 10696 qreply(q, mp4ctl);
10686 10697
10687 10698 return (mp2ctl);
10688 10699 }
10689 10700
10690 10701 /*
10691 10702 * IPv6 mib: One per ill
10692 10703 */
10693 10704 static mblk_t *
10694 10705 ip_snmp_get_mib2_ip6(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst,
10695 10706 boolean_t legacy_req)
10696 10707 {
10697 10708 struct opthdr *optp;
10698 10709 mblk_t *mp2ctl;
10699 10710 ill_t *ill;
10700 10711 ill_walk_context_t ctx;
10701 10712 mblk_t *mp_tail = NULL;
10702 10713 mib2_ipv6AddrEntry_t mae6;
10703 10714 mib2_ipIfStatsEntry_t *ise;
10704 10715 size_t ise_size, iae_size;
10705 10716
10706 10717 /*
10707 10718 * Make a copy of the original message
10708 10719 */
10709 10720 mp2ctl = copymsg(mpctl);
10710 10721
10711 10722 /* fixed length IPv6 structure ... */
10712 10723
10713 10724 if (legacy_req) {
10714 10725 ise_size = LEGACY_MIB_SIZE(&ipst->ips_ip6_mib,
10715 10726 mib2_ipIfStatsEntry_t);
10716 10727 iae_size = LEGACY_MIB_SIZE(&mae6, mib2_ipv6AddrEntry_t);
10717 10728 } else {
10718 10729 ise_size = sizeof (mib2_ipIfStatsEntry_t);
10719 10730 iae_size = sizeof (mib2_ipv6AddrEntry_t);
10720 10731 }
10721 10732
10722 10733 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10723 10734 optp->level = MIB2_IP6;
10724 10735 optp->name = 0;
10725 10736 /* Include "unknown interface" ip6_mib */
10726 10737 ipst->ips_ip6_mib.ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv6;
10727 10738 ipst->ips_ip6_mib.ipIfStatsIfIndex =
10728 10739 MIB2_UNKNOWN_INTERFACE; /* Flag to netstat */
10729 10740 SET_MIB(ipst->ips_ip6_mib.ipIfStatsForwarding,
10730 10741 ipst->ips_ipv6_forwarding ? 1 : 2);
10731 10742 SET_MIB(ipst->ips_ip6_mib.ipIfStatsDefaultHopLimit,
10732 10743 ipst->ips_ipv6_def_hops);
10733 10744 SET_MIB(ipst->ips_ip6_mib.ipIfStatsEntrySize,
10734 10745 sizeof (mib2_ipIfStatsEntry_t));
10735 10746 SET_MIB(ipst->ips_ip6_mib.ipIfStatsAddrEntrySize,
10736 10747 sizeof (mib2_ipv6AddrEntry_t));
10737 10748 SET_MIB(ipst->ips_ip6_mib.ipIfStatsRouteEntrySize,
10738 10749 sizeof (mib2_ipv6RouteEntry_t));
10739 10750 SET_MIB(ipst->ips_ip6_mib.ipIfStatsNetToMediaEntrySize,
10740 10751 sizeof (mib2_ipv6NetToMediaEntry_t));
10741 10752 SET_MIB(ipst->ips_ip6_mib.ipIfStatsMemberEntrySize,
10742 10753 sizeof (ipv6_member_t));
10743 10754 SET_MIB(ipst->ips_ip6_mib.ipIfStatsGroupSourceEntrySize,
10744 10755 sizeof (ipv6_grpsrc_t));
10745 10756
10746 10757 /*
10747 10758 * Synchronize 64- and 32-bit counters
10748 10759 */
10749 10760 SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsInReceives,
10750 10761 ipIfStatsHCInReceives);
10751 10762 SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsInDelivers,
10752 10763 ipIfStatsHCInDelivers);
10753 10764 SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsOutRequests,
10754 10765 ipIfStatsHCOutRequests);
10755 10766 SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsOutForwDatagrams,
10756 10767 ipIfStatsHCOutForwDatagrams);
10757 10768 SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsOutMcastPkts,
10758 10769 ipIfStatsHCOutMcastPkts);
10759 10770 SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsInMcastPkts,
10760 10771 ipIfStatsHCInMcastPkts);
10761 10772
10762 10773 if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
10763 10774 (char *)&ipst->ips_ip6_mib, (int)ise_size)) {
10764 10775 ip1dbg(("ip_snmp_get_mib2_ip6: failed to allocate %u bytes\n",
10765 10776 (uint_t)ise_size));
10766 10777 } else if (legacy_req) {
10767 10778 /* Adjust the EntrySize fields for legacy requests. */
10768 10779 ise =
10769 10780 (mib2_ipIfStatsEntry_t *)(mp_tail->b_wptr - (int)ise_size);
10770 10781 SET_MIB(ise->ipIfStatsEntrySize, ise_size);
10771 10782 SET_MIB(ise->ipIfStatsAddrEntrySize, iae_size);
10772 10783 }
10773 10784
10774 10785 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
10775 10786 ill = ILL_START_WALK_V6(&ctx, ipst);
10776 10787 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
10777 10788 ill->ill_ip_mib->ipIfStatsIfIndex =
10778 10789 ill->ill_phyint->phyint_ifindex;
10779 10790 SET_MIB(ill->ill_ip_mib->ipIfStatsForwarding,
10780 10791 ipst->ips_ipv6_forwarding ? 1 : 2);
10781 10792 SET_MIB(ill->ill_ip_mib->ipIfStatsDefaultHopLimit,
10782 10793 ill->ill_max_hops);
10783 10794
10784 10795 /*
10785 10796 * Synchronize 64- and 32-bit counters
10786 10797 */
10787 10798 SYNC32_MIB(ill->ill_ip_mib, ipIfStatsInReceives,
10788 10799 ipIfStatsHCInReceives);
10789 10800 SYNC32_MIB(ill->ill_ip_mib, ipIfStatsInDelivers,
10790 10801 ipIfStatsHCInDelivers);
10791 10802 SYNC32_MIB(ill->ill_ip_mib, ipIfStatsOutRequests,
10792 10803 ipIfStatsHCOutRequests);
10793 10804 SYNC32_MIB(ill->ill_ip_mib, ipIfStatsOutForwDatagrams,
10794 10805 ipIfStatsHCOutForwDatagrams);
10795 10806 SYNC32_MIB(ill->ill_ip_mib, ipIfStatsOutMcastPkts,
10796 10807 ipIfStatsHCOutMcastPkts);
10797 10808 SYNC32_MIB(ill->ill_ip_mib, ipIfStatsInMcastPkts,
10798 10809 ipIfStatsHCInMcastPkts);
10799 10810
10800 10811 if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
10801 10812 (char *)ill->ill_ip_mib, (int)ise_size)) {
10802 10813 ip1dbg(("ip_snmp_get_mib2_ip6: failed to allocate "
10803 10814 "%u bytes\n", (uint_t)ise_size));
10804 10815 } else if (legacy_req) {
10805 10816 /* Adjust the EntrySize fields for legacy requests. */
10806 10817 ise = (mib2_ipIfStatsEntry_t *)(mp_tail->b_wptr -
10807 10818 (int)ise_size);
10808 10819 SET_MIB(ise->ipIfStatsEntrySize, ise_size);
10809 10820 SET_MIB(ise->ipIfStatsAddrEntrySize, iae_size);
10810 10821 }
10811 10822 }
10812 10823 rw_exit(&ipst->ips_ill_g_lock);
10813 10824
10814 10825 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
10815 10826 ip3dbg(("ip_snmp_get_mib2_ip6: level %d, name %d, len %d\n",
10816 10827 (int)optp->level, (int)optp->name, (int)optp->len));
10817 10828 qreply(q, mpctl);
10818 10829 return (mp2ctl);
10819 10830 }
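/*
 * Editor's note: SYNC32_MIB copies each 64-bit "HC" (high capacity) counter
 * into its legacy 32-bit twin just before the MIB is exported, so 32-bit
 * consumers see a truncated but self-consistent snapshot.  A minimal sketch
 * of the assumed shape of such a macro (the real definition lives in the
 * MIB headers, not in this file):
 */
#define	SKETCH_SYNC32_MIB(mib, field32, field64) \
	((mib)->field32 = (uint32_t)((mib)->field64))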
10820 10831
10821 10832 /*
10822 10833 * ICMPv6 mib: One per ill
10823 10834 */
10824 10835 static mblk_t *
10825 10836 ip_snmp_get_mib2_icmp6(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
10826 10837 {
10827 10838 struct opthdr *optp;
10828 10839 mblk_t *mp2ctl;
10829 10840 ill_t *ill;
10830 10841 ill_walk_context_t ctx;
10831 10842 mblk_t *mp_tail = NULL;
10832 10843 /*
10833 10844 * Make a copy of the original message
10834 10845 */
10835 10846 mp2ctl = copymsg(mpctl);
10836 10847
10837 10848 /* fixed length ICMPv6 structure ... */
10838 10849
10839 10850 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10840 10851 optp->level = MIB2_ICMP6;
10841 10852 optp->name = 0;
10842 10853 /* Include "unknown interface" icmp6_mib */
10843 10854 ipst->ips_icmp6_mib.ipv6IfIcmpIfIndex =
10844 10855 MIB2_UNKNOWN_INTERFACE; /* netstat flag */
10845 10856 ipst->ips_icmp6_mib.ipv6IfIcmpEntrySize =
10846 10857 sizeof (mib2_ipv6IfIcmpEntry_t);
10847 10858 if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
10848 10859 (char *)&ipst->ips_icmp6_mib,
10849 10860 (int)sizeof (ipst->ips_icmp6_mib))) {
10850 10861 ip1dbg(("ip_snmp_get_mib2_icmp6: failed to allocate %u bytes\n",
10851 10862 (uint_t)sizeof (ipst->ips_icmp6_mib)));
10852 10863 }
10853 10864
10854 10865 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
10855 10866 ill = ILL_START_WALK_V6(&ctx, ipst);
10856 10867 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
10857 10868 ill->ill_icmp6_mib->ipv6IfIcmpIfIndex =
10858 10869 ill->ill_phyint->phyint_ifindex;
10859 10870 if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
10860 10871 (char *)ill->ill_icmp6_mib,
10861 10872 (int)sizeof (*ill->ill_icmp6_mib))) {
10862 10873 ip1dbg(("ip_snmp_get_mib2_icmp6: failed to allocate "
10863 10874 "%u bytes\n",
10864 10875 (uint_t)sizeof (*ill->ill_icmp6_mib)));
10865 10876 }
10866 10877 }
10867 10878 rw_exit(&ipst->ips_ill_g_lock);
10868 10879
10869 10880 optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
10870 10881 ip3dbg(("ip_snmp_get_mib2_icmp6: level %d, name %d, len %d\n",
10871 10882 (int)optp->level, (int)optp->name, (int)optp->len));
10872 10883 qreply(q, mpctl);
10873 10884 return (mp2ctl);
10874 10885 }
10875 10886
10876 10887 /*
10877 10888 * ire_walk routine to create both ipRouteEntryTable and
10878 10889 * ipRouteAttributeTable in one IRE walk
10879 10890 */
10880 10891 static void
10881 10892 ip_snmp_get2_v4(ire_t *ire, iproutedata_t *ird)
10882 10893 {
10883 10894 ill_t *ill;
10884 10895 mib2_ipRouteEntry_t *re;
10885 10896 mib2_ipAttributeEntry_t iaes;
10886 10897 tsol_ire_gw_secattr_t *attrp;
10887 10898 tsol_gc_t *gc = NULL;
10888 10899 tsol_gcgrp_t *gcgrp = NULL;
10889 10900 ip_stack_t *ipst = ire->ire_ipst;
10890 10901
10891 10902 ASSERT(ire->ire_ipversion == IPV4_VERSION);
10892 10903
10893 10904 if (!(ird->ird_flags & IRD_REPORT_ALL)) {
10894 10905 if (ire->ire_testhidden)
10895 10906 return;
10896 10907 if (ire->ire_type & IRE_IF_CLONE)
10897 10908 return;
10898 10909 }
10899 10910
10900 10911 if ((re = kmem_zalloc(sizeof (*re), KM_NOSLEEP)) == NULL)
10901 10912 return;
10902 10913
10903 10914 if ((attrp = ire->ire_gw_secattr) != NULL) {
10904 10915 mutex_enter(&attrp->igsa_lock);
10905 10916 if ((gc = attrp->igsa_gc) != NULL) {
10906 10917 gcgrp = gc->gc_grp;
10907 10918 ASSERT(gcgrp != NULL);
10908 10919 rw_enter(&gcgrp->gcgrp_rwlock, RW_READER);
10909 10920 }
10910 10921 mutex_exit(&attrp->igsa_lock);
10911 10922 }
10912 10923 /*
10913 10924 * Return all IRE types for route table... let caller pick and choose
10914 10925 */
10915 10926 re->ipRouteDest = ire->ire_addr;
10916 10927 ill = ire->ire_ill;
10917 10928 re->ipRouteIfIndex.o_length = 0;
10918 10929 if (ill != NULL) {
10919 10930 ill_get_name(ill, re->ipRouteIfIndex.o_bytes, OCTET_LENGTH);
10920 10931 re->ipRouteIfIndex.o_length =
10921 10932 mi_strlen(re->ipRouteIfIndex.o_bytes);
10922 10933 }
10923 10934 re->ipRouteMetric1 = -1;
10924 10935 re->ipRouteMetric2 = -1;
10925 10936 re->ipRouteMetric3 = -1;
10926 10937 re->ipRouteMetric4 = -1;
10927 10938
10928 10939 re->ipRouteNextHop = ire->ire_gateway_addr;
10929 10940 /* indirect(4), direct(3), or invalid(2) */
10930 10941 if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE))
10931 10942 re->ipRouteType = 2;
10932 10943 else if (ire->ire_type & IRE_ONLINK)
10933 10944 re->ipRouteType = 3;
10934 10945 else
10935 10946 re->ipRouteType = 4;
10936 10947
10937 10948 re->ipRouteProto = -1;
10938 10949 re->ipRouteAge = gethrestime_sec() - ire->ire_create_time;
10939 10950 re->ipRouteMask = ire->ire_mask;
10940 10951 re->ipRouteMetric5 = -1;
10941 10952 re->ipRouteInfo.re_max_frag = ire->ire_metrics.iulp_mtu;
10942 10953 if (ire->ire_ill != NULL && re->ipRouteInfo.re_max_frag == 0)
10943 10954 re->ipRouteInfo.re_max_frag = ire->ire_ill->ill_mtu;
10944 10955
10945 10956 re->ipRouteInfo.re_frag_flag = 0;
10946 10957 re->ipRouteInfo.re_rtt = 0;
10947 10958 re->ipRouteInfo.re_src_addr = 0;
10948 10959 re->ipRouteInfo.re_ref = ire->ire_refcnt;
10949 10960 re->ipRouteInfo.re_obpkt = ire->ire_ob_pkt_count;
10950 10961 re->ipRouteInfo.re_ibpkt = ire->ire_ib_pkt_count;
10951 10962 re->ipRouteInfo.re_flags = ire->ire_flags;
10952 10963
10953 10964 /* Add the IRE_IF_CLONE's counters to their parent IRE_INTERFACE */
10954 10965 if (ire->ire_type & IRE_INTERFACE) {
10955 10966 ire_t *child;
10956 10967
10957 10968 rw_enter(&ipst->ips_ire_dep_lock, RW_READER);
10958 10969 child = ire->ire_dep_children;
10959 10970 while (child != NULL) {
10960 10971 re->ipRouteInfo.re_obpkt += child->ire_ob_pkt_count;
10961 10972 re->ipRouteInfo.re_ibpkt += child->ire_ib_pkt_count;
10962 10973 child = child->ire_dep_sib_next;
10963 10974 }
10964 10975 rw_exit(&ipst->ips_ire_dep_lock);
10965 10976 }
10966 10977
10967 10978 if (ire->ire_flags & RTF_DYNAMIC) {
10968 10979 re->ipRouteInfo.re_ire_type = IRE_HOST_REDIRECT;
10969 10980 } else {
10970 10981 re->ipRouteInfo.re_ire_type = ire->ire_type;
10971 10982 }
10972 10983
10973 10984 if (!snmp_append_data2(ird->ird_route.lp_head, &ird->ird_route.lp_tail,
10974 10985 (char *)re, (int)sizeof (*re))) {
10975 10986 ip1dbg(("ip_snmp_get2_v4: failed to allocate %u bytes\n",
10976 10987 (uint_t)sizeof (*re)));
10977 10988 }
10978 10989
10979 10990 if (gc != NULL) {
10980 10991 iaes.iae_routeidx = ird->ird_idx;
10981 10992 iaes.iae_doi = gc->gc_db->gcdb_doi;
10982 10993 iaes.iae_slrange = gc->gc_db->gcdb_slrange;
10983 10994
10984 10995 if (!snmp_append_data2(ird->ird_attrs.lp_head,
10985 10996 &ird->ird_attrs.lp_tail, (char *)&iaes, sizeof (iaes))) {
10986 10997 ip1dbg(("ip_snmp_get2_v4: failed to allocate %u "
10987 10998 "bytes\n", (uint_t)sizeof (iaes)));
10988 10999 }
10989 11000 }
10990 11001
10991 11002 /* bump route index for next pass */
10992 11003 ird->ird_idx++;
10993 11004
10994 11005 kmem_free(re, sizeof (*re));
10995 11006 if (gcgrp != NULL)
10996 11007 rw_exit(&gcgrp->gcgrp_rwlock);
10997 11008 }
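/*
 * Editor's note: the ipRouteType assignment above is a three-way decision
 * ladder.  A hedged standalone restatement; the numeric codes are the MIB
 * values cited in the comment (indirect(4), direct(3), invalid(2)) and the
 * two predicates stand in for the RTF_REJECT|RTF_BLACKHOLE and IRE_ONLINK
 * tests:
 */
static int
sketch_route_type(int reject_or_blackhole, int onlink)
{
	if (reject_or_blackhole)
		return (2);	/* invalid */
	if (onlink)
		return (3);	/* direct */
	return (4);		/* indirect, i.e. via a gateway */
}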
10998 11009
10999 11010 /*
11000 11011 * ire_walk routine to create ipv6RouteEntryTable and ipRouteEntryTable.
11001 11012 */
11002 11013 static void
11003 11014 ip_snmp_get2_v6_route(ire_t *ire, iproutedata_t *ird)
11004 11015 {
11005 11016 ill_t *ill;
11006 11017 mib2_ipv6RouteEntry_t *re;
11007 11018 mib2_ipAttributeEntry_t iaes;
11008 11019 tsol_ire_gw_secattr_t *attrp;
11009 11020 tsol_gc_t *gc = NULL;
11010 11021 tsol_gcgrp_t *gcgrp = NULL;
11011 11022 ip_stack_t *ipst = ire->ire_ipst;
11012 11023
11013 11024 ASSERT(ire->ire_ipversion == IPV6_VERSION);
11014 11025
11015 11026 if (!(ird->ird_flags & IRD_REPORT_ALL)) {
11016 11027 if (ire->ire_testhidden)
11017 11028 return;
11018 11029 if (ire->ire_type & IRE_IF_CLONE)
11019 11030 return;
11020 11031 }
11021 11032
11022 11033 if ((re = kmem_zalloc(sizeof (*re), KM_NOSLEEP)) == NULL)
11023 11034 return;
11024 11035
11025 11036 if ((attrp = ire->ire_gw_secattr) != NULL) {
11026 11037 mutex_enter(&attrp->igsa_lock);
11027 11038 if ((gc = attrp->igsa_gc) != NULL) {
11028 11039 gcgrp = gc->gc_grp;
11029 11040 ASSERT(gcgrp != NULL);
11030 11041 rw_enter(&gcgrp->gcgrp_rwlock, RW_READER);
11031 11042 }
11032 11043 mutex_exit(&attrp->igsa_lock);
11033 11044 }
11034 11045 /*
11035 11046 * Return all IRE types for route table... let caller pick and choose
11036 11047 */
11037 11048 re->ipv6RouteDest = ire->ire_addr_v6;
11038 11049 re->ipv6RoutePfxLength = ip_mask_to_plen_v6(&ire->ire_mask_v6);
11039 11050 re->ipv6RouteIndex = 0; /* Unique when multiple with same dest/plen */
11040 11051 re->ipv6RouteIfIndex.o_length = 0;
11041 11052 ill = ire->ire_ill;
11042 11053 if (ill != NULL) {
11043 11054 ill_get_name(ill, re->ipv6RouteIfIndex.o_bytes, OCTET_LENGTH);
11044 11055 re->ipv6RouteIfIndex.o_length =
11045 11056 mi_strlen(re->ipv6RouteIfIndex.o_bytes);
11046 11057 }
11047 11058
11048 11059 ASSERT(!(ire->ire_type & IRE_BROADCAST));
11049 11060
11050 11061 mutex_enter(&ire->ire_lock);
11051 11062 re->ipv6RouteNextHop = ire->ire_gateway_addr_v6;
11052 11063 mutex_exit(&ire->ire_lock);
11053 11064
11054 11065 /* remote(4), local(3), or discard(2) */
11055 11066 if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE))
11056 11067 re->ipv6RouteType = 2;
11057 11068 else if (ire->ire_type & IRE_ONLINK)
11058 11069 re->ipv6RouteType = 3;
11059 11070 else
11060 11071 re->ipv6RouteType = 4;
11061 11072
11062 11073 re->ipv6RouteProtocol = -1;
11063 11074 re->ipv6RoutePolicy = 0;
11064 11075 re->ipv6RouteAge = gethrestime_sec() - ire->ire_create_time;
11065 11076 re->ipv6RouteNextHopRDI = 0;
11066 11077 re->ipv6RouteWeight = 0;
11067 11078 re->ipv6RouteMetric = 0;
11068 11079 re->ipv6RouteInfo.re_max_frag = ire->ire_metrics.iulp_mtu;
11069 11080 if (ire->ire_ill != NULL && re->ipv6RouteInfo.re_max_frag == 0)
11070 11081 re->ipv6RouteInfo.re_max_frag = ire->ire_ill->ill_mtu;
11071 11082
11072 11083 re->ipv6RouteInfo.re_frag_flag = 0;
11073 11084 re->ipv6RouteInfo.re_rtt = 0;
11074 11085 re->ipv6RouteInfo.re_src_addr = ipv6_all_zeros;
11075 11086 re->ipv6RouteInfo.re_obpkt = ire->ire_ob_pkt_count;
11076 11087 re->ipv6RouteInfo.re_ibpkt = ire->ire_ib_pkt_count;
11077 11088 re->ipv6RouteInfo.re_ref = ire->ire_refcnt;
11078 11089 re->ipv6RouteInfo.re_flags = ire->ire_flags;
11079 11090
11080 11091 /* Add the IRE_IF_CLONE's counters to their parent IRE_INTERFACE */
11081 11092 if (ire->ire_type & IRE_INTERFACE) {
11082 11093 ire_t *child;
11083 11094
11084 11095 rw_enter(&ipst->ips_ire_dep_lock, RW_READER);
11085 11096 child = ire->ire_dep_children;
11086 11097 while (child != NULL) {
11087 11098 re->ipv6RouteInfo.re_obpkt += child->ire_ob_pkt_count;
11088 11099 re->ipv6RouteInfo.re_ibpkt += child->ire_ib_pkt_count;
11089 11100 child = child->ire_dep_sib_next;
11090 11101 }
11091 11102 rw_exit(&ipst->ips_ire_dep_lock);
11092 11103 }
11093 11104 if (ire->ire_flags & RTF_DYNAMIC) {
11094 11105 re->ipv6RouteInfo.re_ire_type = IRE_HOST_REDIRECT;
11095 11106 } else {
11096 11107 re->ipv6RouteInfo.re_ire_type = ire->ire_type;
11097 11108 }
11098 11109
11099 11110 if (!snmp_append_data2(ird->ird_route.lp_head, &ird->ird_route.lp_tail,
11100 11111 (char *)re, (int)sizeof (*re))) {
11101 11112 ip1dbg(("ip_snmp_get2_v6: failed to allocate %u bytes\n",
11102 11113 (uint_t)sizeof (*re)));
11103 11114 }
11104 11115
11105 11116 if (gc != NULL) {
11106 11117 iaes.iae_routeidx = ird->ird_idx;
11107 11118 iaes.iae_doi = gc->gc_db->gcdb_doi;
11108 11119 iaes.iae_slrange = gc->gc_db->gcdb_slrange;
11109 11120
11110 11121 if (!snmp_append_data2(ird->ird_attrs.lp_head,
11111 11122 &ird->ird_attrs.lp_tail, (char *)&iaes, sizeof (iaes))) {
11112 11123 ip1dbg(("ip_snmp_get2_v6: failed to allocate %u "
11113 11124 "bytes\n", (uint_t)sizeof (iaes)));
11114 11125 }
11115 11126 }
11116 11127
11117 11128 /* bump route index for next pass */
11118 11129 ird->ird_idx++;
11119 11130
11120 11131 kmem_free(re, sizeof (*re));
11121 11132 if (gcgrp != NULL)
11122 11133 rw_exit(&gcgrp->gcgrp_rwlock);
11123 11134 }
11124 11135
11125 11136 /*
11126 11137 * ncec_walk routine to create ipv6NetToMediaEntryTable
11127 11138 */
11128 11139 static int
11129 11140 ip_snmp_get2_v6_media(ncec_t *ncec, iproutedata_t *ird)
11130 11141 {
11131 11142 ill_t *ill;
11132 11143 mib2_ipv6NetToMediaEntry_t ntme;
11133 11144
11134 11145 ill = ncec->ncec_ill;
11135 11146 /* skip arpce entries, and loopback ncec entries */
11136 11147 if (ill->ill_isv6 == B_FALSE || ill->ill_net_type == IRE_LOOPBACK)
11137 11148 return (0);
11138 11149 /*
11139 11150 * Neighbor cache entry attached to IRE with on-link
11140 11151 * destination.
11141 11152 	 * We report all IPMP groups on ncec_ill, which is normally the upper ill.
11142 11153 */
11143 11154 ntme.ipv6NetToMediaIfIndex = ill->ill_phyint->phyint_ifindex;
11144 11155 ntme.ipv6NetToMediaNetAddress = ncec->ncec_addr;
11145 11156 ntme.ipv6NetToMediaPhysAddress.o_length = ill->ill_phys_addr_length;
11146 11157 if (ncec->ncec_lladdr != NULL) {
11147 11158 bcopy(ncec->ncec_lladdr, ntme.ipv6NetToMediaPhysAddress.o_bytes,
11148 11159 ntme.ipv6NetToMediaPhysAddress.o_length);
11149 11160 }
11150 11161 /*
11151 11162 * Note: Returns ND_* states. Should be:
11152 11163 * reachable(1), stale(2), delay(3), probe(4),
11153 11164 * invalid(5), unknown(6)
11154 11165 */
11155 11166 ntme.ipv6NetToMediaState = ncec->ncec_state;
11156 11167 ntme.ipv6NetToMediaLastUpdated = 0;
11157 11168
11158 11169 /* other(1), dynamic(2), static(3), local(4) */
11159 11170 if (NCE_MYADDR(ncec)) {
11160 11171 ntme.ipv6NetToMediaType = 4;
11161 11172 } else if (ncec->ncec_flags & NCE_F_PUBLISH) {
11162 11173 ntme.ipv6NetToMediaType = 1; /* proxy */
11163 11174 } else if (ncec->ncec_flags & NCE_F_STATIC) {
11164 11175 ntme.ipv6NetToMediaType = 3;
11165 11176 } else if (ncec->ncec_flags & (NCE_F_MCAST|NCE_F_BCAST)) {
11166 11177 ntme.ipv6NetToMediaType = 1;
11167 11178 } else {
11168 11179 ntme.ipv6NetToMediaType = 2;
11169 11180 }
11170 11181
11171 11182 if (!snmp_append_data2(ird->ird_netmedia.lp_head,
11172 11183 &ird->ird_netmedia.lp_tail, (char *)&ntme, sizeof (ntme))) {
11173 11184 ip1dbg(("ip_snmp_get2_v6_media: failed to allocate %u bytes\n",
11174 11185 (uint_t)sizeof (ntme)));
11175 11186 }
11176 11187 return (0);
11177 11188 }
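/*
 * Editor's note: as the comment above concedes, ipv6NetToMediaState is
 * exported as a raw ND_* state rather than the MIB enumeration it should
 * carry.  A hedged sketch of the translation a consumer (or a future fix)
 * would need; the sketch enum is a placeholder for the kernel's ND_* values:
 */
enum sketch_nd_state {
	SK_ND_REACHABLE, SK_ND_STALE, SK_ND_DELAY, SK_ND_PROBE, SK_ND_OTHER
};

static int
sketch_nd_to_mib_state(enum sketch_nd_state s)
{
	switch (s) {
	case SK_ND_REACHABLE:	return (1);	/* reachable */
	case SK_ND_STALE:	return (2);	/* stale */
	case SK_ND_DELAY:	return (3);	/* delay */
	case SK_ND_PROBE:	return (4);	/* probe */
	default:		return (6);	/* unknown */
	}
}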
11178 11189
11179 11190 int
11180 11191 nce2ace(ncec_t *ncec)
11181 11192 {
11182 11193 int flags = 0;
11183 11194
11184 11195 if (NCE_ISREACHABLE(ncec))
11185 11196 flags |= ACE_F_RESOLVED;
11186 11197 if (ncec->ncec_flags & NCE_F_AUTHORITY)
11187 11198 flags |= ACE_F_AUTHORITY;
11188 11199 if (ncec->ncec_flags & NCE_F_PUBLISH)
11189 11200 flags |= ACE_F_PUBLISH;
11190 11201 if ((ncec->ncec_flags & NCE_F_NONUD) != 0)
11191 11202 flags |= ACE_F_PERMANENT;
11192 11203 if (NCE_MYADDR(ncec))
11193 11204 flags |= (ACE_F_MYADDR | ACE_F_AUTHORITY);
11194 11205 if (ncec->ncec_flags & NCE_F_UNVERIFIED)
11195 11206 flags |= ACE_F_UNVERIFIED;
11198 11209 if (ncec->ncec_flags & NCE_F_DELAYED)
11199 11210 flags |= ACE_F_DELAYED;
11200 11211 return (flags);
11201 11212 }
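/*
 * Editor's note: nce2ace() is a pure translation from NCE_F_* bits to the
 * legacy ARP ACE_F_* bits (note the duplicated NCE_F_AUTHORITY test removed
 * above); it could equally be written as a table walk.  A hedged sketch
 * with a caller-supplied map (bit values are illustrative, not the
 * kernel's):
 */
struct sketch_flagmap {
	unsigned int	nce_bit;
	unsigned int	ace_bit;
};

static unsigned int
sketch_nce2ace(unsigned int nce_flags, const struct sketch_flagmap *map,
    int nmap)
{
	unsigned int ace = 0;
	int i;

	for (i = 0; i < nmap; i++) {
		if (nce_flags & map[i].nce_bit)
			ace |= map[i].ace_bit;
	}
	return (ace);
}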
11202 11213
11203 11214 /*
11204 11215 * ncec_walk routine to create ipNetToMediaEntryTable
11205 11216 */
11206 11217 static int
11207 11218 ip_snmp_get2_v4_media(ncec_t *ncec, iproutedata_t *ird)
11208 11219 {
11209 11220 ill_t *ill;
11210 11221 mib2_ipNetToMediaEntry_t ntme;
11211 11222 const char *name = "unknown";
11212 11223 ipaddr_t ncec_addr;
11213 11224
11214 11225 ill = ncec->ncec_ill;
11215 11226 if (ill->ill_isv6 || (ncec->ncec_flags & NCE_F_BCAST) ||
11216 11227 ill->ill_net_type == IRE_LOOPBACK)
11217 11228 return (0);
11218 11229
11219 11230 /* We report all IPMP groups on ncec_ill which is normally the upper. */
11220 11231 name = ill->ill_name;
11221 11232 /* Based on RFC 4293: other(1), inval(2), dyn(3), stat(4) */
11222 11233 if (NCE_MYADDR(ncec)) {
11223 11234 ntme.ipNetToMediaType = 4;
11224 11235 } else if (ncec->ncec_flags & (NCE_F_MCAST|NCE_F_BCAST|NCE_F_PUBLISH)) {
11225 11236 ntme.ipNetToMediaType = 1;
11226 11237 } else {
11227 11238 ntme.ipNetToMediaType = 3;
11228 11239 }
11229 11240 ntme.ipNetToMediaIfIndex.o_length = MIN(OCTET_LENGTH, strlen(name));
11230 11241 bcopy(name, ntme.ipNetToMediaIfIndex.o_bytes,
11231 11242 ntme.ipNetToMediaIfIndex.o_length);
11232 11243
11233 11244 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
11234 11245 bcopy(&ncec_addr, &ntme.ipNetToMediaNetAddress, sizeof (ncec_addr));
11235 11246
11236 11247 ntme.ipNetToMediaInfo.ntm_mask.o_length = sizeof (ipaddr_t);
11237 11248 ncec_addr = INADDR_BROADCAST;
11238 11249 bcopy(&ncec_addr, ntme.ipNetToMediaInfo.ntm_mask.o_bytes,
11239 11250 sizeof (ncec_addr));
11240 11251 /*
11241 11252 * map all the flags to the ACE counterpart.
11242 11253 */
11243 11254 ntme.ipNetToMediaInfo.ntm_flags = nce2ace(ncec);
11244 11255
11245 11256 ntme.ipNetToMediaPhysAddress.o_length =
11246 11257 MIN(OCTET_LENGTH, ill->ill_phys_addr_length);
11247 11258
11248 11259 if (!NCE_ISREACHABLE(ncec))
11249 11260 ntme.ipNetToMediaPhysAddress.o_length = 0;
11250 11261 else {
11251 11262 if (ncec->ncec_lladdr != NULL) {
11252 11263 bcopy(ncec->ncec_lladdr,
11253 11264 ntme.ipNetToMediaPhysAddress.o_bytes,
11254 11265 ntme.ipNetToMediaPhysAddress.o_length);
11255 11266 }
11256 11267 }
11257 11268
11258 11269 if (!snmp_append_data2(ird->ird_netmedia.lp_head,
11259 11270 &ird->ird_netmedia.lp_tail, (char *)&ntme, sizeof (ntme))) {
11260 11271 ip1dbg(("ip_snmp_get2_v4_media: failed to allocate %u bytes\n",
11261 11272 (uint_t)sizeof (ntme)));
11262 11273 }
11263 11274 return (0);
11264 11275 }
11265 11276
11266 11277 /*
11267 11278 	 * Return 0 if this is an invalid set request, 1 otherwise (including non-TCP requests)
11268 11279 */
11269 11280 /* ARGSUSED */
11270 11281 int
11271 11282 ip_snmp_set(queue_t *q, int level, int name, uchar_t *ptr, int len)
11272 11283 {
11273 11284 switch (level) {
11274 11285 case MIB2_IP:
11275 11286 case MIB2_ICMP:
11276 11287 switch (name) {
11277 11288 default:
11278 11289 break;
11279 11290 }
11280 11291 return (1);
11281 11292 default:
11282 11293 return (1);
11283 11294 }
11284 11295 }
11285 11296
11286 11297 /*
11287 11298 	 * When both a 64-bit and a 32-bit counter of a particular type exist
11288 11299 	 * (e.g., InReceives), only the 64-bit counter is added.
11289 11300 */
11290 11301 void
11291 11302 ip_mib2_add_ip_stats(mib2_ipIfStatsEntry_t *o1, mib2_ipIfStatsEntry_t *o2)
11292 11303 {
11293 11304 UPDATE_MIB(o1, ipIfStatsInHdrErrors, o2->ipIfStatsInHdrErrors);
11294 11305 UPDATE_MIB(o1, ipIfStatsInTooBigErrors, o2->ipIfStatsInTooBigErrors);
11295 11306 UPDATE_MIB(o1, ipIfStatsInNoRoutes, o2->ipIfStatsInNoRoutes);
11296 11307 UPDATE_MIB(o1, ipIfStatsInAddrErrors, o2->ipIfStatsInAddrErrors);
11297 11308 UPDATE_MIB(o1, ipIfStatsInUnknownProtos, o2->ipIfStatsInUnknownProtos);
11298 11309 UPDATE_MIB(o1, ipIfStatsInTruncatedPkts, o2->ipIfStatsInTruncatedPkts);
11299 11310 UPDATE_MIB(o1, ipIfStatsInDiscards, o2->ipIfStatsInDiscards);
11300 11311 UPDATE_MIB(o1, ipIfStatsOutDiscards, o2->ipIfStatsOutDiscards);
11301 11312 UPDATE_MIB(o1, ipIfStatsOutFragOKs, o2->ipIfStatsOutFragOKs);
11302 11313 UPDATE_MIB(o1, ipIfStatsOutFragFails, o2->ipIfStatsOutFragFails);
11303 11314 UPDATE_MIB(o1, ipIfStatsOutFragCreates, o2->ipIfStatsOutFragCreates);
11304 11315 UPDATE_MIB(o1, ipIfStatsReasmReqds, o2->ipIfStatsReasmReqds);
11305 11316 UPDATE_MIB(o1, ipIfStatsReasmOKs, o2->ipIfStatsReasmOKs);
11306 11317 UPDATE_MIB(o1, ipIfStatsReasmFails, o2->ipIfStatsReasmFails);
11307 11318 UPDATE_MIB(o1, ipIfStatsOutNoRoutes, o2->ipIfStatsOutNoRoutes);
11308 11319 UPDATE_MIB(o1, ipIfStatsReasmDuplicates, o2->ipIfStatsReasmDuplicates);
11309 11320 UPDATE_MIB(o1, ipIfStatsReasmPartDups, o2->ipIfStatsReasmPartDups);
11310 11321 UPDATE_MIB(o1, ipIfStatsForwProhibits, o2->ipIfStatsForwProhibits);
11311 11322 UPDATE_MIB(o1, udpInCksumErrs, o2->udpInCksumErrs);
11312 11323 UPDATE_MIB(o1, udpInOverflows, o2->udpInOverflows);
11313 11324 UPDATE_MIB(o1, rawipInOverflows, o2->rawipInOverflows);
11314 11325 UPDATE_MIB(o1, ipIfStatsInWrongIPVersion,
11315 11326 o2->ipIfStatsInWrongIPVersion);
11316 11327 UPDATE_MIB(o1, ipIfStatsOutWrongIPVersion,
11317 11328 	    o2->ipIfStatsOutWrongIPVersion);
11318 11329 UPDATE_MIB(o1, ipIfStatsOutSwitchIPVersion,
11319 11330 o2->ipIfStatsOutSwitchIPVersion);
11320 11331 UPDATE_MIB(o1, ipIfStatsHCInReceives, o2->ipIfStatsHCInReceives);
11321 11332 UPDATE_MIB(o1, ipIfStatsHCInOctets, o2->ipIfStatsHCInOctets);
11322 11333 UPDATE_MIB(o1, ipIfStatsHCInForwDatagrams,
11323 11334 o2->ipIfStatsHCInForwDatagrams);
11324 11335 UPDATE_MIB(o1, ipIfStatsHCInDelivers, o2->ipIfStatsHCInDelivers);
11325 11336 UPDATE_MIB(o1, ipIfStatsHCOutRequests, o2->ipIfStatsHCOutRequests);
11326 11337 UPDATE_MIB(o1, ipIfStatsHCOutForwDatagrams,
11327 11338 o2->ipIfStatsHCOutForwDatagrams);
11328 11339 UPDATE_MIB(o1, ipIfStatsOutFragReqds, o2->ipIfStatsOutFragReqds);
11329 11340 UPDATE_MIB(o1, ipIfStatsHCOutTransmits, o2->ipIfStatsHCOutTransmits);
11330 11341 UPDATE_MIB(o1, ipIfStatsHCOutOctets, o2->ipIfStatsHCOutOctets);
11331 11342 UPDATE_MIB(o1, ipIfStatsHCInMcastPkts, o2->ipIfStatsHCInMcastPkts);
11332 11343 UPDATE_MIB(o1, ipIfStatsHCInMcastOctets, o2->ipIfStatsHCInMcastOctets);
11333 11344 UPDATE_MIB(o1, ipIfStatsHCOutMcastPkts, o2->ipIfStatsHCOutMcastPkts);
11334 11345 UPDATE_MIB(o1, ipIfStatsHCOutMcastOctets,
11335 11346 o2->ipIfStatsHCOutMcastOctets);
11336 11347 UPDATE_MIB(o1, ipIfStatsHCInBcastPkts, o2->ipIfStatsHCInBcastPkts);
11337 11348 UPDATE_MIB(o1, ipIfStatsHCOutBcastPkts, o2->ipIfStatsHCOutBcastPkts);
11338 11349 UPDATE_MIB(o1, ipsecInSucceeded, o2->ipsecInSucceeded);
11339 11350 UPDATE_MIB(o1, ipsecInFailed, o2->ipsecInFailed);
11340 11351 UPDATE_MIB(o1, ipInCksumErrs, o2->ipInCksumErrs);
11341 11352 UPDATE_MIB(o1, tcpInErrs, o2->tcpInErrs);
11342 11353 UPDATE_MIB(o1, udpNoPorts, o2->udpNoPorts);
11343 11354 }
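/*
 * Editor's note: UPDATE_MIB is an accumulate, not an assignment - each call
 * above adds one of o2's counters into the matching counter in o1.  A
 * minimal sketch of the assumed shape (the real macro is defined in the
 * MIB headers):
 */
#define	SKETCH_UPDATE_MIB(tgt, field, delta)	((tgt)->field += (delta))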
11344 11355
11345 11356 void
11346 11357 ip_mib2_add_icmp6_stats(mib2_ipv6IfIcmpEntry_t *o1, mib2_ipv6IfIcmpEntry_t *o2)
11347 11358 {
11348 11359 UPDATE_MIB(o1, ipv6IfIcmpInMsgs, o2->ipv6IfIcmpInMsgs);
11349 11360 UPDATE_MIB(o1, ipv6IfIcmpInErrors, o2->ipv6IfIcmpInErrors);
11350 11361 UPDATE_MIB(o1, ipv6IfIcmpInDestUnreachs, o2->ipv6IfIcmpInDestUnreachs);
11351 11362 UPDATE_MIB(o1, ipv6IfIcmpInAdminProhibs, o2->ipv6IfIcmpInAdminProhibs);
11352 11363 UPDATE_MIB(o1, ipv6IfIcmpInTimeExcds, o2->ipv6IfIcmpInTimeExcds);
11353 11364 UPDATE_MIB(o1, ipv6IfIcmpInParmProblems, o2->ipv6IfIcmpInParmProblems);
11354 11365 UPDATE_MIB(o1, ipv6IfIcmpInPktTooBigs, o2->ipv6IfIcmpInPktTooBigs);
11355 11366 UPDATE_MIB(o1, ipv6IfIcmpInEchos, o2->ipv6IfIcmpInEchos);
11356 11367 UPDATE_MIB(o1, ipv6IfIcmpInEchoReplies, o2->ipv6IfIcmpInEchoReplies);
11357 11368 UPDATE_MIB(o1, ipv6IfIcmpInRouterSolicits,
11358 11369 o2->ipv6IfIcmpInRouterSolicits);
11359 11370 UPDATE_MIB(o1, ipv6IfIcmpInRouterAdvertisements,
11360 11371 o2->ipv6IfIcmpInRouterAdvertisements);
11361 11372 UPDATE_MIB(o1, ipv6IfIcmpInNeighborSolicits,
11362 11373 o2->ipv6IfIcmpInNeighborSolicits);
11363 11374 UPDATE_MIB(o1, ipv6IfIcmpInNeighborAdvertisements,
11364 11375 o2->ipv6IfIcmpInNeighborAdvertisements);
11365 11376 UPDATE_MIB(o1, ipv6IfIcmpInRedirects, o2->ipv6IfIcmpInRedirects);
11366 11377 UPDATE_MIB(o1, ipv6IfIcmpInGroupMembQueries,
11367 11378 o2->ipv6IfIcmpInGroupMembQueries);
11368 11379 UPDATE_MIB(o1, ipv6IfIcmpInGroupMembResponses,
11369 11380 o2->ipv6IfIcmpInGroupMembResponses);
11370 11381 UPDATE_MIB(o1, ipv6IfIcmpInGroupMembReductions,
11371 11382 o2->ipv6IfIcmpInGroupMembReductions);
11372 11383 UPDATE_MIB(o1, ipv6IfIcmpOutMsgs, o2->ipv6IfIcmpOutMsgs);
11373 11384 UPDATE_MIB(o1, ipv6IfIcmpOutErrors, o2->ipv6IfIcmpOutErrors);
11374 11385 UPDATE_MIB(o1, ipv6IfIcmpOutDestUnreachs,
11375 11386 o2->ipv6IfIcmpOutDestUnreachs);
11376 11387 UPDATE_MIB(o1, ipv6IfIcmpOutAdminProhibs,
11377 11388 o2->ipv6IfIcmpOutAdminProhibs);
11378 11389 UPDATE_MIB(o1, ipv6IfIcmpOutTimeExcds, o2->ipv6IfIcmpOutTimeExcds);
11379 11390 UPDATE_MIB(o1, ipv6IfIcmpOutParmProblems,
11380 11391 o2->ipv6IfIcmpOutParmProblems);
11381 11392 UPDATE_MIB(o1, ipv6IfIcmpOutPktTooBigs, o2->ipv6IfIcmpOutPktTooBigs);
11382 11393 UPDATE_MIB(o1, ipv6IfIcmpOutEchos, o2->ipv6IfIcmpOutEchos);
11383 11394 UPDATE_MIB(o1, ipv6IfIcmpOutEchoReplies, o2->ipv6IfIcmpOutEchoReplies);
11384 11395 UPDATE_MIB(o1, ipv6IfIcmpOutRouterSolicits,
11385 11396 o2->ipv6IfIcmpOutRouterSolicits);
11386 11397 UPDATE_MIB(o1, ipv6IfIcmpOutRouterAdvertisements,
11387 11398 o2->ipv6IfIcmpOutRouterAdvertisements);
11388 11399 UPDATE_MIB(o1, ipv6IfIcmpOutNeighborSolicits,
11389 11400 o2->ipv6IfIcmpOutNeighborSolicits);
11390 11401 UPDATE_MIB(o1, ipv6IfIcmpOutNeighborAdvertisements,
11391 11402 o2->ipv6IfIcmpOutNeighborAdvertisements);
11392 11403 UPDATE_MIB(o1, ipv6IfIcmpOutRedirects, o2->ipv6IfIcmpOutRedirects);
11393 11404 UPDATE_MIB(o1, ipv6IfIcmpOutGroupMembQueries,
11394 11405 o2->ipv6IfIcmpOutGroupMembQueries);
11395 11406 UPDATE_MIB(o1, ipv6IfIcmpOutGroupMembResponses,
11396 11407 o2->ipv6IfIcmpOutGroupMembResponses);
11397 11408 UPDATE_MIB(o1, ipv6IfIcmpOutGroupMembReductions,
11398 11409 o2->ipv6IfIcmpOutGroupMembReductions);
11399 11410 UPDATE_MIB(o1, ipv6IfIcmpInOverflows, o2->ipv6IfIcmpInOverflows);
11400 11411 UPDATE_MIB(o1, ipv6IfIcmpBadHoplimit, o2->ipv6IfIcmpBadHoplimit);
11401 11412 UPDATE_MIB(o1, ipv6IfIcmpInBadNeighborAdvertisements,
11402 11413 o2->ipv6IfIcmpInBadNeighborAdvertisements);
11403 11414 UPDATE_MIB(o1, ipv6IfIcmpInBadNeighborSolicitations,
11404 11415 o2->ipv6IfIcmpInBadNeighborSolicitations);
11405 11416 UPDATE_MIB(o1, ipv6IfIcmpInBadRedirects, o2->ipv6IfIcmpInBadRedirects);
11406 11417 UPDATE_MIB(o1, ipv6IfIcmpInGroupMembTotal,
11407 11418 o2->ipv6IfIcmpInGroupMembTotal);
11408 11419 UPDATE_MIB(o1, ipv6IfIcmpInGroupMembBadQueries,
11409 11420 o2->ipv6IfIcmpInGroupMembBadQueries);
11410 11421 UPDATE_MIB(o1, ipv6IfIcmpInGroupMembBadReports,
11411 11422 o2->ipv6IfIcmpInGroupMembBadReports);
11412 11423 UPDATE_MIB(o1, ipv6IfIcmpInGroupMembOurReports,
11413 11424 o2->ipv6IfIcmpInGroupMembOurReports);
11414 11425 }
11415 11426
11416 11427 /*
11417 11428 * Called before the options are updated to check if this packet will
11418 11429 * be source routed from here.
11419 11430 * This routine assumes that the options are well formed i.e. that they
11420 11431 * have already been checked.
11421 11432 */
11422 11433 boolean_t
11423 11434 ip_source_routed(ipha_t *ipha, ip_stack_t *ipst)
11424 11435 {
11425 11436 ipoptp_t opts;
11426 11437 uchar_t *opt;
11427 11438 uint8_t optval;
11428 11439 uint8_t optlen;
11429 11440 ipaddr_t dst;
11430 11441
11431 11442 if (IS_SIMPLE_IPH(ipha)) {
11432 11443 ip2dbg(("not source routed\n"));
11433 11444 return (B_FALSE);
11434 11445 }
11435 11446 dst = ipha->ipha_dst;
11436 11447 for (optval = ipoptp_first(&opts, ipha);
11437 11448 optval != IPOPT_EOL;
11438 11449 optval = ipoptp_next(&opts)) {
11439 11450 ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
11440 11451 opt = opts.ipoptp_cur;
11441 11452 optlen = opts.ipoptp_len;
11442 11453 ip2dbg(("ip_source_routed: opt %d, len %d\n",
11443 11454 optval, optlen));
11444 11455 switch (optval) {
11445 11456 uint32_t off;
11446 11457 case IPOPT_SSRR:
11447 11458 case IPOPT_LSRR:
11448 11459 /*
11449 11460 * If dst is one of our addresses and there are some
11450 11461 * entries left in the source route return (true).
11451 11462 */
11452 11463 if (ip_type_v4(dst, ipst) != IRE_LOCAL) {
11453 11464 ip2dbg(("ip_source_routed: not next"
11454 11465 " source route 0x%x\n",
11455 11466 ntohl(dst)));
11456 11467 return (B_FALSE);
11457 11468 }
11458 11469 off = opt[IPOPT_OFFSET];
11459 11470 off--;
11460 11471 if (optlen < IP_ADDR_LEN ||
11461 11472 off > optlen - IP_ADDR_LEN) {
11462 11473 /* End of source route */
11463 11474 ip1dbg(("ip_source_routed: end of SR\n"));
11464 11475 return (B_FALSE);
11465 11476 }
11466 11477 return (B_TRUE);
11467 11478 }
11468 11479 }
11469 11480 ip2dbg(("not source routed\n"));
11470 11481 return (B_FALSE);
11471 11482 }
11472 11483
11473 11484 /*
11474 11485 * ip_unbind is called by the transports to remove a conn from
11475 11486 * the fanout table.
11476 11487 */
11477 11488 void
11478 11489 ip_unbind(conn_t *connp)
11479 11490 {
11480 11491
11481 11492 ASSERT(!MUTEX_HELD(&connp->conn_lock));
11482 11493
11483 11494 if (is_system_labeled() && connp->conn_anon_port) {
11484 11495 (void) tsol_mlp_anon(crgetzone(connp->conn_cred),
11485 11496 connp->conn_mlp_type, connp->conn_proto,
11486 11497 ntohs(connp->conn_lport), B_FALSE);
11487 11498 connp->conn_anon_port = 0;
11488 11499 }
11489 11500 connp->conn_mlp_type = mlptSingle;
11490 11501
11491 11502 ipcl_hash_remove(connp);
11492 11503 }
11493 11504
11494 11505 /*
11495 11506 	 * Used for deciding the MSS for the upper layer. Thus
11496 11507 * we need to check the outbound policy values in the conn.
11497 11508 */
11498 11509 int
11499 11510 conn_ipsec_length(conn_t *connp)
11500 11511 {
11501 11512 ipsec_latch_t *ipl;
11502 11513
11503 11514 ipl = connp->conn_latch;
11504 11515 if (ipl == NULL)
11505 11516 return (0);
11506 11517
11507 11518 if (connp->conn_ixa->ixa_ipsec_policy == NULL)
11508 11519 return (0);
11509 11520
11510 11521 return (connp->conn_ixa->ixa_ipsec_policy->ipsp_act->ipa_ovhd);
11511 11522 }
11512 11523
11513 11524 /*
11514 11525 * Returns an estimate of the IPsec headers size. This is used if
11515 11526 * we don't want to call into IPsec to get the exact size.
11516 11527 */
11517 11528 int
11518 11529 ipsec_out_extra_length(ip_xmit_attr_t *ixa)
11519 11530 {
11520 11531 ipsec_action_t *a;
11521 11532
11522 11533 if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE))
11523 11534 return (0);
11524 11535
11525 11536 a = ixa->ixa_ipsec_action;
11526 11537 if (a == NULL) {
11527 11538 ASSERT(ixa->ixa_ipsec_policy != NULL);
11528 11539 a = ixa->ixa_ipsec_policy->ipsp_act;
11529 11540 }
11530 11541 ASSERT(a != NULL);
11531 11542
11532 11543 return (a->ipa_ovhd);
11533 11544 }
11534 11545
11535 11546 /*
11536 11547 * If there are any source route options, return the true final
11537 11548 * destination. Otherwise, return the destination.
11538 11549 */
11539 11550 ipaddr_t
11540 11551 ip_get_dst(ipha_t *ipha)
11541 11552 {
11542 11553 ipoptp_t opts;
11543 11554 uchar_t *opt;
11544 11555 uint8_t optval;
11545 11556 uint8_t optlen;
11546 11557 ipaddr_t dst;
11547 11558 uint32_t off;
11548 11559
11549 11560 dst = ipha->ipha_dst;
11550 11561
11551 11562 if (IS_SIMPLE_IPH(ipha))
11552 11563 return (dst);
11553 11564
11554 11565 for (optval = ipoptp_first(&opts, ipha);
11555 11566 optval != IPOPT_EOL;
11556 11567 optval = ipoptp_next(&opts)) {
11557 11568 opt = opts.ipoptp_cur;
11558 11569 optlen = opts.ipoptp_len;
11559 11570 ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
11560 11571 switch (optval) {
11561 11572 case IPOPT_SSRR:
11562 11573 case IPOPT_LSRR:
11563 11574 off = opt[IPOPT_OFFSET];
11564 11575 /*
11565 11576 * If one of the conditions is true, it means
11566 11577 * end of options and dst already has the right
11567 11578 * value.
11568 11579 */
11569 11580 if (!(optlen < IP_ADDR_LEN || off > optlen - 3)) {
11570 11581 off = optlen - IP_ADDR_LEN;
11571 11582 bcopy(&opt[off], &dst, IP_ADDR_LEN);
11572 11583 }
11573 11584 return (dst);
11574 11585 default:
11575 11586 break;
11576 11587 }
11577 11588 }
11578 11589
11579 11590 return (dst);
11580 11591 }
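/*
 * Editor's note: for an LSRR/SSRR option that has not been fully consumed,
 * the true final destination is the last address slot in the option, which
 * is what the bcopy above fetches.  A hedged standalone sketch over a raw
 * option buffer (opt[0] = type, opt[1] = length, opt[2] = 1-based pointer,
 * addresses follow), mirroring the bounds test used in ip_get_dst():
 */
#include <stdint.h>
#include <string.h>

#define	SK_IP_ADDR_LEN	4

static uint32_t
sketch_srr_final_dst(const unsigned char *opt, uint32_t cur_dst)
{
	unsigned char optlen = opt[1];
	unsigned char off = opt[2];
	uint32_t dst = cur_dst;

	/* If the pointer already ran past the last slot, cur_dst is final. */
	if (!(optlen < SK_IP_ADDR_LEN || off > optlen - 3))
		(void) memcpy(&dst, &opt[optlen - SK_IP_ADDR_LEN],
		    SK_IP_ADDR_LEN);
	return (dst);
}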
11581 11592
11582 11593 /*
11583 11594 * Outbound IP fragmentation routine.
11584 11595 * Assumes the caller has checked whether or not fragmentation should
11585 11596 * be allowed. Here we copy the DF bit from the header to all the generated
11586 11597 * fragments.
11587 11598 */
11588 11599 int
11589 11600 ip_fragment_v4(mblk_t *mp_orig, nce_t *nce, iaflags_t ixaflags,
11590 11601 uint_t pkt_len, uint32_t max_frag, uint32_t xmit_hint, zoneid_t szone,
11591 11602 zoneid_t nolzid, pfirepostfrag_t postfragfn, uintptr_t *ixa_cookie)
11592 11603 {
11593 11604 int i1;
11594 11605 int hdr_len;
11595 11606 mblk_t *hdr_mp;
11596 11607 ipha_t *ipha;
11597 11608 int ip_data_end;
11598 11609 int len;
11599 11610 mblk_t *mp = mp_orig;
11600 11611 int offset;
11601 11612 ill_t *ill = nce->nce_ill;
11602 11613 ip_stack_t *ipst = ill->ill_ipst;
11603 11614 mblk_t *carve_mp;
11604 11615 uint32_t frag_flag;
11605 11616 uint_t priority = mp->b_band;
11606 11617 int error = 0;
11607 11618
11608 11619 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragReqds);
11609 11620
11610 11621 if (pkt_len != msgdsize(mp)) {
11611 11622 ip0dbg(("Packet length mismatch: %d, %ld\n",
11612 11623 pkt_len, msgdsize(mp)));
11613 11624 freemsg(mp);
11614 11625 return (EINVAL);
11615 11626 }
11616 11627
11617 11628 if (max_frag == 0) {
11618 11629 ip1dbg(("ip_fragment_v4: max_frag is zero. Dropping packet\n"));
11619 11630 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
11620 11631 ip_drop_output("FragFails: zero max_frag", mp, ill);
11621 11632 freemsg(mp);
11622 11633 return (EINVAL);
11623 11634 }
11624 11635
11625 11636 ASSERT(MBLKL(mp) >= sizeof (ipha_t));
11626 11637 ipha = (ipha_t *)mp->b_rptr;
11627 11638 ASSERT(ntohs(ipha->ipha_length) == pkt_len);
11628 11639 frag_flag = ntohs(ipha->ipha_fragment_offset_and_flags) & IPH_DF;
11629 11640
11630 11641 /*
11631 11642 * Establish the starting offset. May not be zero if we are fragging
11632 11643 * a fragment that is being forwarded.
11633 11644 */
11634 11645 offset = ntohs(ipha->ipha_fragment_offset_and_flags) & IPH_OFFSET;
11635 11646
11636 11647 /* TODO why is this test needed? */
11637 11648 if (((max_frag - ntohs(ipha->ipha_length)) & ~7) < 8) {
11638 11649 /* TODO: notify ulp somehow */
11639 11650 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
11640 11651 ip_drop_output("FragFails: bad starting offset", mp, ill);
11641 11652 freemsg(mp);
11642 11653 return (EINVAL);
11643 11654 }
11644 11655
11645 11656 hdr_len = IPH_HDR_LENGTH(ipha);
11646 11657 ipha->ipha_hdr_checksum = 0;
11647 11658
11648 11659 /*
11649 11660 * Establish the number of bytes maximum per frag, after putting
11650 11661 * in the header.
11651 11662 */
11652 11663 len = (max_frag - hdr_len) & ~7;
11653 11664
11654 11665 /* Get a copy of the header for the trailing frags */
11655 11666 hdr_mp = ip_fragment_copyhdr((uchar_t *)ipha, hdr_len, offset, ipst,
11656 11667 mp);
11657 11668 if (hdr_mp == NULL) {
11658 11669 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
11659 11670 ip_drop_output("FragFails: no hdr_mp", mp, ill);
11660 11671 freemsg(mp);
11661 11672 return (ENOBUFS);
11662 11673 }
11663 11674
11664 11675 /* Store the starting offset, with the MoreFrags flag. */
11665 11676 i1 = offset | IPH_MF | frag_flag;
11666 11677 ipha->ipha_fragment_offset_and_flags = htons((uint16_t)i1);
11667 11678
11668 11679 /* Establish the ending byte offset, based on the starting offset. */
11669 11680 offset <<= 3;
11670 11681 ip_data_end = offset + ntohs(ipha->ipha_length) - hdr_len;
11671 11682
11672 11683 /* Store the length of the first fragment in the IP header. */
11673 11684 i1 = len + hdr_len;
11674 11685 ASSERT(i1 <= IP_MAXPACKET);
11675 11686 ipha->ipha_length = htons((uint16_t)i1);
11676 11687
11677 11688 /*
11678 11689 * Compute the IP header checksum for the first frag. We have to
11679 11690 * watch out that we stop at the end of the header.
11680 11691 */
11681 11692 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
11682 11693
11683 11694 /*
11684 11695 * Now carve off the first frag. Note that this will include the
11685 11696 * original IP header.
11686 11697 */
11687 11698 if (!(mp = ip_carve_mp(&mp_orig, i1))) {
11688 11699 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
11689 11700 ip_drop_output("FragFails: could not carve mp", mp_orig, ill);
11690 11701 freeb(hdr_mp);
11691 11702 freemsg(mp_orig);
11692 11703 return (ENOBUFS);
11693 11704 }
11694 11705
11695 11706 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragCreates);
11696 11707
11697 11708 error = postfragfn(mp, nce, ixaflags, i1, xmit_hint, szone, nolzid,
11698 11709 ixa_cookie);
11699 11710 if (error != 0 && error != EWOULDBLOCK) {
11700 11711 /* No point in sending the other fragments */
11701 11712 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
11702 11713 ip_drop_output("FragFails: postfragfn failed", mp_orig, ill);
11703 11714 freeb(hdr_mp);
11704 11715 freemsg(mp_orig);
11705 11716 return (error);
11706 11717 }
11707 11718
11708 11719 /* No need to redo state machine in loop */
11709 11720 ixaflags &= ~IXAF_REACH_CONF;
11710 11721
11711 11722 /* Advance the offset to the second frag starting point. */
11712 11723 offset += len;
11713 11724 /*
11714 11725 	 * Update hdr_len from the copied header - there might be fewer options
11715 11726 * in the later fragments.
11716 11727 */
11717 11728 hdr_len = IPH_HDR_LENGTH(hdr_mp->b_rptr);
11718 11729 /* Loop until done. */
11719 11730 for (;;) {
11720 11731 uint16_t offset_and_flags;
11721 11732 uint16_t ip_len;
11722 11733
11723 11734 if (ip_data_end - offset > len) {
11724 11735 /*
11725 11736 * Carve off the appropriate amount from the original
11726 11737 * datagram.
11727 11738 */
11728 11739 if (!(carve_mp = ip_carve_mp(&mp_orig, len))) {
11729 11740 mp = NULL;
11730 11741 break;
11731 11742 }
11732 11743 /*
11733 11744 * More frags after this one. Get another copy
11734 11745 * of the header.
11735 11746 */
11736 11747 if (carve_mp->b_datap->db_ref == 1 &&
11737 11748 hdr_mp->b_wptr - hdr_mp->b_rptr <
11738 11749 carve_mp->b_rptr - carve_mp->b_datap->db_base) {
11739 11750 /* Inline IP header */
11740 11751 carve_mp->b_rptr -= hdr_mp->b_wptr -
11741 11752 hdr_mp->b_rptr;
11742 11753 bcopy(hdr_mp->b_rptr, carve_mp->b_rptr,
11743 11754 hdr_mp->b_wptr - hdr_mp->b_rptr);
11744 11755 mp = carve_mp;
11745 11756 } else {
11746 11757 if (!(mp = copyb(hdr_mp))) {
11747 11758 freemsg(carve_mp);
11748 11759 break;
11749 11760 }
11750 11761 /* Get priority marking, if any. */
11751 11762 mp->b_band = priority;
11752 11763 mp->b_cont = carve_mp;
11753 11764 }
11754 11765 ipha = (ipha_t *)mp->b_rptr;
11755 11766 offset_and_flags = IPH_MF;
11756 11767 } else {
11757 11768 /*
11758 11769 * Last frag. Consume the header. Set len to
11759 11770 * the length of this last piece.
11760 11771 */
11761 11772 len = ip_data_end - offset;
11762 11773
11763 11774 /*
11764 11775 * Carve off the appropriate amount from the original
11765 11776 * datagram.
11766 11777 */
11767 11778 if (!(carve_mp = ip_carve_mp(&mp_orig, len))) {
11768 11779 mp = NULL;
11769 11780 break;
11770 11781 }
11771 11782 if (carve_mp->b_datap->db_ref == 1 &&
11772 11783 hdr_mp->b_wptr - hdr_mp->b_rptr <
11773 11784 carve_mp->b_rptr - carve_mp->b_datap->db_base) {
11774 11785 /* Inline IP header */
11775 11786 carve_mp->b_rptr -= hdr_mp->b_wptr -
11776 11787 hdr_mp->b_rptr;
11777 11788 bcopy(hdr_mp->b_rptr, carve_mp->b_rptr,
11778 11789 hdr_mp->b_wptr - hdr_mp->b_rptr);
11779 11790 mp = carve_mp;
11780 11791 freeb(hdr_mp);
11781 11792 hdr_mp = mp;
11782 11793 } else {
11783 11794 mp = hdr_mp;
11784 11795 /* Get priority marking, if any. */
11785 11796 mp->b_band = priority;
11786 11797 mp->b_cont = carve_mp;
11787 11798 }
11788 11799 ipha = (ipha_t *)mp->b_rptr;
11789 11800 /* A frag of a frag might have IPH_MF non-zero */
11790 11801 offset_and_flags =
11791 11802 ntohs(ipha->ipha_fragment_offset_and_flags) &
11792 11803 IPH_MF;
11793 11804 }
11794 11805 offset_and_flags |= (uint16_t)(offset >> 3);
11795 11806 offset_and_flags |= (uint16_t)frag_flag;
11796 11807 /* Store the offset and flags in the IP header. */
11797 11808 ipha->ipha_fragment_offset_and_flags = htons(offset_and_flags);
11798 11809
11799 11810 /* Store the length in the IP header. */
11800 11811 ip_len = (uint16_t)(len + hdr_len);
11801 11812 ipha->ipha_length = htons(ip_len);
11802 11813
11803 11814 /*
11804 11815 * Set the IP header checksum. Note that mp is just
11805 11816 * the header, so this is easy to pass to ip_csum.
11806 11817 */
11807 11818 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
11808 11819
11809 11820 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragCreates);
11810 11821
11811 11822 error = postfragfn(mp, nce, ixaflags, ip_len, xmit_hint, szone,
11812 11823 nolzid, ixa_cookie);
11813 11824 /* All done if we just consumed the hdr_mp. */
11814 11825 if (mp == hdr_mp) {
11815 11826 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragOKs);
11816 11827 return (error);
11817 11828 }
11818 11829 if (error != 0 && error != EWOULDBLOCK) {
11819 11830 DTRACE_PROBE2(ip__xmit__frag__fail, ill_t *, ill,
11820 11831 mblk_t *, hdr_mp);
11821 11832 /* No point in sending the other fragments */
11822 11833 break;
11823 11834 }
11824 11835
11825 11836 /* Otherwise, advance and loop. */
11826 11837 offset += len;
11827 11838 }
11828 11839 /* Clean up following allocation failure. */
11829 11840 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
11830 11841 ip_drop_output("FragFails: loop ended", NULL, ill);
11831 11842 if (mp != hdr_mp)
11832 11843 freeb(hdr_mp);
11833 11844 if (mp != mp_orig)
11834 11845 freemsg(mp_orig);
11835 11846 return (error);
11836 11847 }
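/*
 * Editor's note: the per-fragment payload size above is (max_frag - hdr_len)
 * rounded down to a multiple of 8, because the IP fragment-offset field
 * counts 8-byte units.  A hedged sketch of how a fragment's offset/flags
 * word is assembled; the MF/DF bit positions are the standard IPv4 ones,
 * stated here as assumptions rather than taken from this file's headers:
 */
#include <stdint.h>

#define	SK_IPH_DF	0x4000	/* don't fragment - copied into every frag */
#define	SK_IPH_MF	0x2000	/* more fragments follow */

static uint16_t
sketch_frag_field(uint32_t byte_off, int more, int df)
{
	uint16_t f = (uint16_t)(byte_off >> 3);	/* offset in 8-byte units */

	if (more)
		f |= SK_IPH_MF;
	if (df)
		f |= SK_IPH_DF;
	return (f);	/* caller stores htons(f) in the header */
}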
11837 11848
11838 11849 /*
11839 11850 * Copy the header plus those options which have the copy bit set
11840 11851 */
11841 11852 static mblk_t *
11842 11853 ip_fragment_copyhdr(uchar_t *rptr, int hdr_len, int offset, ip_stack_t *ipst,
11843 11854 mblk_t *src)
11844 11855 {
11845 11856 mblk_t *mp;
11846 11857 uchar_t *up;
11847 11858
11848 11859 /*
11849 11860 * Quick check if we need to look for options without the copy bit
11850 11861 * set
11851 11862 */
11852 11863 mp = allocb_tmpl(ipst->ips_ip_wroff_extra + hdr_len, src);
11853 11864 if (!mp)
11854 11865 return (mp);
11855 11866 mp->b_rptr += ipst->ips_ip_wroff_extra;
11856 11867 if (hdr_len == IP_SIMPLE_HDR_LENGTH || offset != 0) {
11857 11868 bcopy(rptr, mp->b_rptr, hdr_len);
11858 11869 mp->b_wptr += hdr_len + ipst->ips_ip_wroff_extra;
11859 11870 return (mp);
11860 11871 }
11861 11872 up = mp->b_rptr;
11862 11873 bcopy(rptr, up, IP_SIMPLE_HDR_LENGTH);
11863 11874 up += IP_SIMPLE_HDR_LENGTH;
11864 11875 rptr += IP_SIMPLE_HDR_LENGTH;
11865 11876 hdr_len -= IP_SIMPLE_HDR_LENGTH;
11866 11877 while (hdr_len > 0) {
11867 11878 uint32_t optval;
11868 11879 uint32_t optlen;
11869 11880
11870 11881 optval = *rptr;
11871 11882 if (optval == IPOPT_EOL)
11872 11883 break;
11873 11884 if (optval == IPOPT_NOP)
11874 11885 optlen = 1;
11875 11886 else
11876 11887 optlen = rptr[1];
11877 11888 if (optval & IPOPT_COPY) {
11878 11889 bcopy(rptr, up, optlen);
11879 11890 up += optlen;
11880 11891 }
11881 11892 rptr += optlen;
11882 11893 hdr_len -= optlen;
11883 11894 }
11884 11895 /*
11885 11896 * Make sure that we drop an even number of words by filling
11886 11897 * with EOL to the next word boundary.
11887 11898 */
11888 11899 for (hdr_len = up - (mp->b_rptr + IP_SIMPLE_HDR_LENGTH);
11889 11900 hdr_len & 0x3; hdr_len++)
11890 11901 *up++ = IPOPT_EOL;
11891 11902 mp->b_wptr = up;
11892 11903 /* Update header length */
11893 11904 mp->b_rptr[0] = (uint8_t)((IP_VERSION << 4) | ((up - mp->b_rptr) >> 2));
11894 11905 return (mp);
11895 11906 }
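/*
 * Editor's note: only options whose type byte has the copied flag (the high
 * bit, 0x80 in standard IPv4) set are replicated into trailing fragments,
 * and the result is padded with IPOPT_EOL out to a 4-byte boundary, exactly
 * as the loop above does.  A hedged one-line predicate:
 */
static int
sketch_opt_copied(unsigned char opt_type)
{
	return ((opt_type & 0x80) != 0);	/* IPOPT_COPY */
}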
11896 11907
11897 11908 /*
11898 11909 * Update any source route, record route, or timestamp options when
11899 11910 * sending a packet back to ourselves.
11900 11911 * Check that we are at end of strict source route.
11901 11912 * The options have been sanity checked by ip_output_options().
11902 11913 */
11903 11914 void
11904 11915 ip_output_local_options(ipha_t *ipha, ip_stack_t *ipst)
11905 11916 {
11906 11917 ipoptp_t opts;
11907 11918 uchar_t *opt;
11908 11919 uint8_t optval;
11909 11920 uint8_t optlen;
11910 11921 ipaddr_t dst;
11911 11922 uint32_t ts;
11912 11923 timestruc_t now;
11913 11924
11914 11925 for (optval = ipoptp_first(&opts, ipha);
11915 11926 optval != IPOPT_EOL;
11916 11927 optval = ipoptp_next(&opts)) {
11917 11928 opt = opts.ipoptp_cur;
11918 11929 optlen = opts.ipoptp_len;
11919 11930 ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
11920 11931 switch (optval) {
11921 11932 uint32_t off;
11922 11933 case IPOPT_SSRR:
11923 11934 case IPOPT_LSRR:
11924 11935 off = opt[IPOPT_OFFSET];
11925 11936 off--;
11926 11937 if (optlen < IP_ADDR_LEN ||
11927 11938 off > optlen - IP_ADDR_LEN) {
11928 11939 /* End of source route */
11929 11940 break;
11930 11941 }
11931 11942 /*
11932 11943 			 * in the source route contain our address, or if
11933 11944 			 * it is a packet with a loose source route that
11934 11945 			 * reaches us before the whole source route has been consumed.
11935 11946 * reaches us before consuming the whole source route
11936 11947 */
11937 11948
11938 11949 if (optval == IPOPT_SSRR) {
11939 11950 return;
11940 11951 }
11941 11952 /*
11942 11953 * Hack: instead of dropping the packet truncate the
11943 11954 * source route to what has been used by filling the
11944 11955 * rest with IPOPT_NOP.
11945 11956 */
11946 11957 opt[IPOPT_OLEN] = (uint8_t)off;
11947 11958 while (off < optlen) {
11948 11959 opt[off++] = IPOPT_NOP;
11949 11960 }
11950 11961 break;
11951 11962 case IPOPT_RR:
11952 11963 off = opt[IPOPT_OFFSET];
11953 11964 off--;
11954 11965 if (optlen < IP_ADDR_LEN ||
11955 11966 off > optlen - IP_ADDR_LEN) {
11956 11967 /* No more room - ignore */
11957 11968 ip1dbg((
11958 11969 "ip_output_local_options: end of RR\n"));
11959 11970 break;
11960 11971 }
11961 11972 dst = htonl(INADDR_LOOPBACK);
11962 11973 bcopy(&dst, (char *)opt + off, IP_ADDR_LEN);
11963 11974 opt[IPOPT_OFFSET] += IP_ADDR_LEN;
11964 11975 break;
11965 11976 case IPOPT_TS:
11966 11977 			/* Insert timestamp if there is room */
11967 11978 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
11968 11979 case IPOPT_TS_TSONLY:
11969 11980 off = IPOPT_TS_TIMELEN;
11970 11981 break;
11971 11982 case IPOPT_TS_PRESPEC:
11972 11983 case IPOPT_TS_PRESPEC_RFC791:
11973 11984 /* Verify that the address matched */
11974 11985 off = opt[IPOPT_OFFSET] - 1;
11975 11986 bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
11976 11987 if (ip_type_v4(dst, ipst) != IRE_LOCAL) {
11977 11988 /* Not for us */
11978 11989 break;
11979 11990 }
11980 11991 /* FALLTHRU */
11981 11992 case IPOPT_TS_TSANDADDR:
11982 11993 off = IP_ADDR_LEN + IPOPT_TS_TIMELEN;
11983 11994 break;
11984 11995 default:
11985 11996 /*
11986 11997 * ip_*put_options should have already
11987 11998 * dropped this packet.
11988 11999 */
11989 12000 cmn_err(CE_PANIC, "ip_output_local_options: "
11990 12001 "unknown IT - bug in ip_output_options?\n");
11991 12002 return; /* Keep "lint" happy */
11992 12003 }
11993 12004 if (opt[IPOPT_OFFSET] - 1 + off > optlen) {
11994 12005 /* Increase overflow counter */
11995 12006 off = (opt[IPOPT_POS_OV_FLG] >> 4) + 1;
11996 12007 opt[IPOPT_POS_OV_FLG] = (uint8_t)
11997 12008 (opt[IPOPT_POS_OV_FLG] & 0x0F) |
11998 12009 (off << 4);
11999 12010 break;
12000 12011 }
12001 12012 off = opt[IPOPT_OFFSET] - 1;
12002 12013 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
12003 12014 case IPOPT_TS_PRESPEC:
12004 12015 case IPOPT_TS_PRESPEC_RFC791:
12005 12016 case IPOPT_TS_TSANDADDR:
12006 12017 dst = htonl(INADDR_LOOPBACK);
12007 12018 bcopy(&dst, (char *)opt + off, IP_ADDR_LEN);
12008 12019 opt[IPOPT_OFFSET] += IP_ADDR_LEN;
12009 12020 /* FALLTHRU */
12010 12021 case IPOPT_TS_TSONLY:
12011 12022 off = opt[IPOPT_OFFSET] - 1;
12012 12023 /* Compute # of milliseconds since midnight */
12013 12024 gethrestime(&now);
12014 12025 ts = (now.tv_sec % (24 * 60 * 60)) * 1000 +
12015 12026 now.tv_nsec / (NANOSEC / MILLISEC);
12016 12027 bcopy(&ts, (char *)opt + off, IPOPT_TS_TIMELEN);
12017 12028 opt[IPOPT_OFFSET] += IPOPT_TS_TIMELEN;
12018 12029 break;
12019 12030 }
12020 12031 break;
12021 12032 }
12022 12033 }
12023 12034 }
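/*
 * Editorial sketch, not part of the original file: the IPOPT_TS_TSONLY
 * case above stores the RFC 791 timestamp, a 32-bit count of
 * milliseconds since midnight UT. A self-contained userland analogue
 * of the same arithmetic, with clock_gettime() standing in for
 * gethrestime():
 */
#if 0	/* illustrative only */
#include <time.h>
#include <stdint.h>

static uint32_t
ms_since_midnight(void)
{
	struct timespec now;

	(void) clock_gettime(CLOCK_REALTIME, &now);
	/* seconds into the current UTC day, in ms, plus the ns remainder */
	return ((uint32_t)((now.tv_sec % (24 * 60 * 60)) * 1000 +
	    now.tv_nsec / 1000000));
}
#endif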
12024 12035
12025 12036 /*
12026 12037 * Prepend an M_DATA fastpath header, and if none present prepend a
12027 12038 * DL_UNITDATA_REQ. Frees the mblk on failure.
12028 12039 *
12029 12040 * nce_dlur_mp and nce_fp_mp can not disappear once they have been set.
12030 12041 * If there is a change to them, the nce will be deleted (condemned) and
12031 12042 * a new nce_t will be created when packets are sent. Thus we need no locks
12032 12043 * to access those fields.
12033 12044 *
12034 12045 * We preserve b_band to support IPQoS. If a DL_UNITDATA_REQ is prepended
12035 12046 * we place b_band in dl_priority.dl_max.
12036 12047 */
12037 12048 static mblk_t *
12038 12049 ip_xmit_attach_llhdr(mblk_t *mp, nce_t *nce)
12039 12050 {
12040 12051 uint_t hlen;
12041 12052 mblk_t *mp1;
12042 12053 uint_t priority;
12043 12054 uchar_t *rptr;
12044 12055
12045 12056 rptr = mp->b_rptr;
12046 12057
12047 12058 ASSERT(DB_TYPE(mp) == M_DATA);
12048 12059 priority = mp->b_band;
12049 12060
12050 12061 ASSERT(nce != NULL);
12051 12062 if ((mp1 = nce->nce_fp_mp) != NULL) {
12052 12063 hlen = MBLKL(mp1);
12053 12064 /*
12054 12065 * Check if we have enough room to prepend fastpath
12055 12066 * header
12056 12067 */
12057 12068 if (hlen != 0 && (rptr - mp->b_datap->db_base) >= hlen) {
12058 12069 rptr -= hlen;
12059 12070 bcopy(mp1->b_rptr, rptr, hlen);
12060 12071 /*
12061 12072 * Set the b_rptr to the start of the link layer
12062 12073 * header
12063 12074 */
12064 12075 mp->b_rptr = rptr;
12065 12076 return (mp);
12066 12077 }
12067 12078 mp1 = copyb(mp1);
12068 12079 if (mp1 == NULL) {
12069 12080 ill_t *ill = nce->nce_ill;
12070 12081
12071 12082 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
12072 12083 ip_drop_output("ipIfStatsOutDiscards", mp, ill);
12073 12084 freemsg(mp);
12074 12085 return (NULL);
12075 12086 }
12076 12087 mp1->b_band = priority;
12077 12088 mp1->b_cont = mp;
12078 12089 DB_CKSUMSTART(mp1) = DB_CKSUMSTART(mp);
12079 12090 DB_CKSUMSTUFF(mp1) = DB_CKSUMSTUFF(mp);
12080 12091 DB_CKSUMEND(mp1) = DB_CKSUMEND(mp);
12081 12092 DB_CKSUMFLAGS(mp1) = DB_CKSUMFLAGS(mp);
12082 12093 DB_LSOMSS(mp1) = DB_LSOMSS(mp);
12083 12094 DTRACE_PROBE1(ip__xmit__copyb, (mblk_t *), mp1);
12084 12095 /*
12085 12096 * XXX disable ICK_VALID and compute checksum
12086 12097 * here; can happen if nce_fp_mp changes and
12087 12098 * it can't be copied now due to insufficient
12088 12099 * space. (unlikely, fp mp can change, but it
12089 12100 * does not increase in length)
12090 12101 */
12091 12102 return (mp1);
12092 12103 }
12093 12104 mp1 = copyb(nce->nce_dlur_mp);
12094 12105
12095 12106 if (mp1 == NULL) {
12096 12107 ill_t *ill = nce->nce_ill;
12097 12108
12098 12109 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
12099 12110 ip_drop_output("ipIfStatsOutDiscards", mp, ill);
12100 12111 freemsg(mp);
12101 12112 return (NULL);
12102 12113 }
12103 12114 mp1->b_cont = mp;
12104 12115 if (priority != 0) {
12105 12116 mp1->b_band = priority;
12106 12117 ((dl_unitdata_req_t *)(mp1->b_rptr))->dl_priority.dl_max =
12107 12118 priority;
12108 12119 }
12109 12120 return (mp1);
12110 12121 #undef rptr
12111 12122 }
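/*
 * Editorial sketch, not part of the original file: the fastpath case
 * above hinges on mblk "headroom" -- the gap between the data block
 * base and b_rptr. The test reduces to the hypothetical helper below;
 * when it fails, a copy of the header mblk is linked in front via
 * b_cont instead.
 */
#if 0	/* illustrative only */
static boolean_t
mblk_has_headroom(const mblk_t *mp, uint_t hlen)
{
	/* room to slide b_rptr back by hlen bytes within the dblk */
	return ((uint_t)(mp->b_rptr - mp->b_datap->db_base) >= hlen);
}
#endif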
12112 12123
12113 12124 /*
12114 12125 * Finish the outbound IPsec processing. This function is called from
12115 12126 * ipsec_out_process() if the IPsec packet was processed
12116 12127 * synchronously, or from {ah,esp}_kcf_callback_outbound() if it was processed
12117 12128 * asynchronously.
12118 12129 *
12119 12130 * This is common to IPv4 and IPv6.
12120 12131 */
12121 12132 int
12122 12133 ip_output_post_ipsec(mblk_t *mp, ip_xmit_attr_t *ixa)
12123 12134 {
12124 12135 iaflags_t ixaflags = ixa->ixa_flags;
12125 12136 uint_t pktlen;
12126 12137
12127 12138
12128 12139 /* AH/ESP don't update ixa_pktlen when they modify the packet */
12129 12140 if (ixaflags & IXAF_IS_IPV4) {
12130 12141 ipha_t *ipha = (ipha_t *)mp->b_rptr;
12131 12142
12132 12143 ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
12133 12144 pktlen = ntohs(ipha->ipha_length);
12134 12145 } else {
12135 12146 ip6_t *ip6h = (ip6_t *)mp->b_rptr;
12136 12147
12137 12148 ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV6_VERSION);
12138 12149 pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
12139 12150 }
12140 12151
12141 12152 /*
12142 12153 * We release any hard reference on the SAs here to make
12143 12154 * sure the SAs can be garbage collected. ipsr_sa has a soft reference
12144 12155 * on the SAs.
12145 12156 * If in the future we want the hard latching of the SAs in the
12146 12157 * ip_xmit_attr_t then we should remove this.
12147 12158 */
12148 12159 if (ixa->ixa_ipsec_esp_sa != NULL) {
12149 12160 IPSA_REFRELE(ixa->ixa_ipsec_esp_sa);
12150 12161 ixa->ixa_ipsec_esp_sa = NULL;
12151 12162 }
12152 12163 if (ixa->ixa_ipsec_ah_sa != NULL) {
12153 12164 IPSA_REFRELE(ixa->ixa_ipsec_ah_sa);
12154 12165 ixa->ixa_ipsec_ah_sa = NULL;
12155 12166 }
12156 12167
12157 12168 /* Do we need to fragment? */
12158 12169 if ((ixa->ixa_flags & IXAF_IPV6_ADD_FRAGHDR) ||
12159 12170 pktlen > ixa->ixa_fragsize) {
12160 12171 if (ixaflags & IXAF_IS_IPV4) {
12161 12172 ASSERT(!(ixa->ixa_flags & IXAF_IPV6_ADD_FRAGHDR));
12162 12173 /*
12163 12174 * We check for the DF case in ipsec_out_process
12164 12175 * hence this only handles the non-DF case.
12165 12176 */
12166 12177 return (ip_fragment_v4(mp, ixa->ixa_nce, ixa->ixa_flags,
12167 12178 pktlen, ixa->ixa_fragsize,
12168 12179 ixa->ixa_xmit_hint, ixa->ixa_zoneid,
12169 12180 ixa->ixa_no_loop_zoneid, ixa->ixa_postfragfn,
12170 12181 &ixa->ixa_cookie));
12171 12182 } else {
12172 12183 mp = ip_fraghdr_add_v6(mp, ixa->ixa_ident, ixa);
12173 12184 if (mp == NULL) {
12174 12185 /* MIB and ip_drop_output already done */
12175 12186 return (ENOMEM);
12176 12187 }
12177 12188 pktlen += sizeof (ip6_frag_t);
12178 12189 if (pktlen > ixa->ixa_fragsize) {
12179 12190 return (ip_fragment_v6(mp, ixa->ixa_nce,
12180 12191 ixa->ixa_flags, pktlen,
12181 12192 ixa->ixa_fragsize, ixa->ixa_xmit_hint,
12182 12193 ixa->ixa_zoneid, ixa->ixa_no_loop_zoneid,
12183 12194 ixa->ixa_postfragfn, &ixa->ixa_cookie));
12184 12195 }
12185 12196 }
12186 12197 }
12187 12198 return ((ixa->ixa_postfragfn)(mp, ixa->ixa_nce, ixa->ixa_flags,
12188 12199 pktlen, ixa->ixa_xmit_hint, ixa->ixa_zoneid,
12189 12200 ixa->ixa_no_loop_zoneid, NULL));
12190 12201 }
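/*
 * Editorial sketch, not part of the original file, reusing the locals
 * of the function above: the post-IPsec fragmentation decision
 * condenses to the predicate below. IPv6 first grows the packet by the
 * 8-byte fragment header (sizeof (ip6_frag_t)) and only then re-checks
 * against the path MTU in ixa_fragsize.
 */
#if 0	/* illustrative only */
	boolean_t need_frag = (ixa->ixa_flags & IXAF_IPV6_ADD_FRAGHDR) ||
	    pktlen > ixa->ixa_fragsize;
#endif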
12191 12202
12192 12203 /*
12193 12204  * Finish the inbound IPsec processing. This function is called from the
12194 12205  * AH/ESP inbound processing code if the IPsec packet was processed
12195 12206  * synchronously, or from the kEF callbacks if it was processed
12196 12207  * asynchronously.
12197 12208 *
12198 12209 * This is common to IPv4 and IPv6.
12199 12210 */
12200 12211 void
12201 12212 ip_input_post_ipsec(mblk_t *mp, ip_recv_attr_t *ira)
12202 12213 {
12203 12214 iaflags_t iraflags = ira->ira_flags;
12204 12215
12205 12216 /* Length might have changed */
12206 12217 if (iraflags & IRAF_IS_IPV4) {
12207 12218 ipha_t *ipha = (ipha_t *)mp->b_rptr;
12208 12219
12209 12220 ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
12210 12221 ira->ira_pktlen = ntohs(ipha->ipha_length);
12211 12222 ira->ira_ip_hdr_length = IPH_HDR_LENGTH(ipha);
12212 12223 ira->ira_protocol = ipha->ipha_protocol;
12213 12224
12214 12225 ip_fanout_v4(mp, ipha, ira);
12215 12226 } else {
12216 12227 ip6_t *ip6h = (ip6_t *)mp->b_rptr;
12217 12228 uint8_t *nexthdrp;
12218 12229
12219 12230 ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV6_VERSION);
12220 12231 ira->ira_pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
12221 12232 if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &ira->ira_ip_hdr_length,
12222 12233 &nexthdrp)) {
12223 12234 /* Malformed packet */
12224 12235 BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
12225 12236 ip_drop_input("ipIfStatsInDiscards", mp, ira->ira_ill);
12226 12237 freemsg(mp);
12227 12238 return;
12228 12239 }
12229 12240 ira->ira_protocol = *nexthdrp;
12230 12241 ip_fanout_v6(mp, ip6h, ira);
12231 12242 }
12232 12243 }
12233 12244
12234 12245 /*
12235 12246 * Select which AH & ESP SA's to use (if any) for the outbound packet.
12236 12247 *
12237 12248 * If this function returns B_TRUE, the requested SA's have been filled
12238 12249 * into the ixa_ipsec_*_sa pointers.
12239 12250 *
12240 12251 * If the function returns B_FALSE, the packet has been "consumed", most
12241 12252 * likely by an ACQUIRE sent up via PF_KEY to a key management daemon.
12242 12253 *
12243 12254 * The SA references created by the protocol-specific "select"
12244 12255 * function will be released in ip_output_post_ipsec.
12245 12256 */
12246 12257 static boolean_t
12247 12258 ipsec_out_select_sa(mblk_t *mp, ip_xmit_attr_t *ixa)
12248 12259 {
12249 12260 boolean_t need_ah_acquire = B_FALSE, need_esp_acquire = B_FALSE;
12250 12261 ipsec_policy_t *pp;
12251 12262 ipsec_action_t *ap;
12252 12263
12253 12264 ASSERT(ixa->ixa_flags & IXAF_IPSEC_SECURE);
12254 12265 ASSERT((ixa->ixa_ipsec_policy != NULL) ||
12255 12266 (ixa->ixa_ipsec_action != NULL));
12256 12267
12257 12268 ap = ixa->ixa_ipsec_action;
12258 12269 if (ap == NULL) {
12259 12270 pp = ixa->ixa_ipsec_policy;
12260 12271 ASSERT(pp != NULL);
12261 12272 ap = pp->ipsp_act;
12262 12273 ASSERT(ap != NULL);
12263 12274 }
12264 12275
12265 12276 /*
12266 12277 	 * We have an action. Now, let's select SA's.
12267 12278 * A side effect of setting ixa_ipsec_*_sa is that it will
12268 12279 * be cached in the conn_t.
12269 12280 */
12270 12281 if (ap->ipa_want_esp) {
12271 12282 if (ixa->ixa_ipsec_esp_sa == NULL) {
12272 12283 need_esp_acquire = !ipsec_outbound_sa(mp, ixa,
12273 12284 IPPROTO_ESP);
12274 12285 }
12275 12286 ASSERT(need_esp_acquire || ixa->ixa_ipsec_esp_sa != NULL);
12276 12287 }
12277 12288
12278 12289 if (ap->ipa_want_ah) {
12279 12290 if (ixa->ixa_ipsec_ah_sa == NULL) {
12280 12291 need_ah_acquire = !ipsec_outbound_sa(mp, ixa,
12281 12292 IPPROTO_AH);
12282 12293 }
12283 12294 ASSERT(need_ah_acquire || ixa->ixa_ipsec_ah_sa != NULL);
12284 12295 /*
12285 12296 * The ESP and AH processing order needs to be preserved
12286 12297 * when both protocols are required (ESP should be applied
12287 12298 * before AH for an outbound packet). Force an ESP ACQUIRE
12288 12299 * when both ESP and AH are required, and an AH ACQUIRE
12289 12300 * is needed.
12290 12301 */
12291 12302 if (ap->ipa_want_esp && need_ah_acquire)
12292 12303 need_esp_acquire = B_TRUE;
12293 12304 }
12294 12305
12295 12306 /*
12296 12307 * Send an ACQUIRE (extended, regular, or both) if we need one.
12297 12308 * Release SAs that got referenced, but will not be used until we
12298 12309 * acquire _all_ of the SAs we need.
12299 12310 */
12300 12311 if (need_ah_acquire || need_esp_acquire) {
12301 12312 if (ixa->ixa_ipsec_ah_sa != NULL) {
12302 12313 IPSA_REFRELE(ixa->ixa_ipsec_ah_sa);
12303 12314 ixa->ixa_ipsec_ah_sa = NULL;
12304 12315 }
12305 12316 if (ixa->ixa_ipsec_esp_sa != NULL) {
12306 12317 IPSA_REFRELE(ixa->ixa_ipsec_esp_sa);
12307 12318 ixa->ixa_ipsec_esp_sa = NULL;
12308 12319 }
12309 12320
12310 12321 sadb_acquire(mp, ixa, need_ah_acquire, need_esp_acquire);
12311 12322 return (B_FALSE);
12312 12323 }
12313 12324
12314 12325 return (B_TRUE);
12315 12326 }
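/*
 * Editorial sketch, not part of the original file: the ACQUIRE step
 * above is all-or-nothing. One worked case: the action wants both ESP
 * and AH, the ESP SA is found but the AH SA is not. Then
 * need_ah_acquire forces need_esp_acquire, the ESP reference just
 * taken is released, and sadb_acquire() queues the packet until key
 * management has supplied *both* SAs.
 */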
12316 12327
12317 12328 /*
12318 12329 * Handle IPsec output processing.
12319 12330 * This function is only entered once for a given packet.
12320 12331 * We try to do things synchronously, but if we need to have user-level
12321 12332 * set up SAs, or ESP or AH uses asynchronous kEF, then the operation
12322 12333 * will be completed
12323 12334 * - when the SAs are added in esp_add_sa_finish/ah_add_sa_finish
12324 12335 * - when asynchronous ESP is done it will do AH
12325 12336 *
12326 12337 * In all cases we come back in ip_output_post_ipsec() to fragment and
12327 12338 * send out the packet.
12328 12339 */
12329 12340 int
12330 12341 ipsec_out_process(mblk_t *mp, ip_xmit_attr_t *ixa)
12331 12342 {
12332 12343 ill_t *ill = ixa->ixa_nce->nce_ill;
12333 12344 ip_stack_t *ipst = ixa->ixa_ipst;
12334 12345 ipsec_stack_t *ipss;
12335 12346 ipsec_policy_t *pp;
12336 12347 ipsec_action_t *ap;
12337 12348
12338 12349 ASSERT(ixa->ixa_flags & IXAF_IPSEC_SECURE);
12339 12350
12340 12351 ASSERT((ixa->ixa_ipsec_policy != NULL) ||
12341 12352 (ixa->ixa_ipsec_action != NULL));
12342 12353
12343 12354 ipss = ipst->ips_netstack->netstack_ipsec;
12344 12355 if (!ipsec_loaded(ipss)) {
12345 12356 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
12346 12357 ip_drop_packet(mp, B_TRUE, ill,
12347 12358 DROPPER(ipss, ipds_ip_ipsec_not_loaded),
12348 12359 &ipss->ipsec_dropper);
12349 12360 return (ENOTSUP);
12350 12361 }
12351 12362
12352 12363 ap = ixa->ixa_ipsec_action;
12353 12364 if (ap == NULL) {
12354 12365 pp = ixa->ixa_ipsec_policy;
12355 12366 ASSERT(pp != NULL);
12356 12367 ap = pp->ipsp_act;
12357 12368 ASSERT(ap != NULL);
12358 12369 }
12359 12370
12360 12371 /* Handle explicit drop action and bypass. */
12361 12372 switch (ap->ipa_act.ipa_type) {
12362 12373 case IPSEC_ACT_DISCARD:
12363 12374 case IPSEC_ACT_REJECT:
12364 12375 ip_drop_packet(mp, B_FALSE, ill,
12365 12376 DROPPER(ipss, ipds_spd_explicit), &ipss->ipsec_spd_dropper);
12366 12377 return (EHOSTUNREACH); /* IPsec policy failure */
12367 12378 case IPSEC_ACT_BYPASS:
12368 12379 return (ip_output_post_ipsec(mp, ixa));
12369 12380 }
12370 12381
12371 12382 /*
12372 12383 	 * The order of processing is to first insert an IP header if needed,
12373 12384 	 * then the ESP header, and then the AH header.
12374 12385 */
12375 12386 if ((ixa->ixa_flags & IXAF_IS_IPV4) && ap->ipa_want_se) {
12376 12387 /*
12377 12388 * First get the outer IP header before sending
12378 12389 * it to ESP.
12379 12390 */
12380 12391 ipha_t *oipha, *iipha;
12381 12392 mblk_t *outer_mp, *inner_mp;
12382 12393
12383 12394 if ((outer_mp = allocb(sizeof (ipha_t), BPRI_HI)) == NULL) {
12384 12395 (void) mi_strlog(ill->ill_rq, 0,
12385 12396 SL_ERROR|SL_TRACE|SL_CONSOLE,
12386 12397 "ipsec_out_process: "
12387 12398 "Self-Encapsulation failed: Out of memory\n");
12388 12399 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
12389 12400 ip_drop_output("ipIfStatsOutDiscards", mp, ill);
12390 12401 freemsg(mp);
12391 12402 return (ENOBUFS);
12392 12403 }
12393 12404 inner_mp = mp;
12394 12405 ASSERT(inner_mp->b_datap->db_type == M_DATA);
12395 12406 oipha = (ipha_t *)outer_mp->b_rptr;
12396 12407 iipha = (ipha_t *)inner_mp->b_rptr;
12397 12408 *oipha = *iipha;
12398 12409 outer_mp->b_wptr += sizeof (ipha_t);
12399 12410 oipha->ipha_length = htons(ntohs(iipha->ipha_length) +
12400 12411 sizeof (ipha_t));
12401 12412 oipha->ipha_protocol = IPPROTO_ENCAP;
12402 12413 oipha->ipha_version_and_hdr_length =
12403 12414 IP_SIMPLE_HDR_VERSION;
12404 12415 oipha->ipha_hdr_checksum = 0;
12405 12416 oipha->ipha_hdr_checksum = ip_csum_hdr(oipha);
12406 12417 outer_mp->b_cont = inner_mp;
12407 12418 mp = outer_mp;
12408 12419
12409 12420 ixa->ixa_flags |= IXAF_IPSEC_TUNNEL;
12410 12421 }
12411 12422
12412 12423 /* If we need to wait for a SA then we can't return any errno */
12413 12424 if (((ap->ipa_want_ah && (ixa->ixa_ipsec_ah_sa == NULL)) ||
12414 12425 (ap->ipa_want_esp && (ixa->ixa_ipsec_esp_sa == NULL))) &&
12415 12426 !ipsec_out_select_sa(mp, ixa))
12416 12427 return (0);
12417 12428
12418 12429 /*
12419 12430 * By now, we know what SA's to use. Toss over to ESP & AH
12420 12431 * to do the heavy lifting.
12421 12432 */
12422 12433 if (ap->ipa_want_esp) {
12423 12434 ASSERT(ixa->ixa_ipsec_esp_sa != NULL);
12424 12435
12425 12436 mp = ixa->ixa_ipsec_esp_sa->ipsa_output_func(mp, ixa);
12426 12437 if (mp == NULL) {
12427 12438 /*
12428 12439 * Either it failed or is pending. In the former case
12429 12440 			 * ipIfStatsOutDiscards was increased.
12430 12441 */
12431 12442 return (0);
12432 12443 }
12433 12444 }
12434 12445
12435 12446 if (ap->ipa_want_ah) {
12436 12447 ASSERT(ixa->ixa_ipsec_ah_sa != NULL);
12437 12448
12438 12449 mp = ixa->ixa_ipsec_ah_sa->ipsa_output_func(mp, ixa);
12439 12450 if (mp == NULL) {
12440 12451 /*
12441 12452 * Either it failed or is pending. In the former case
12442 12453 			 * ipIfStatsOutDiscards was increased.
12443 12454 */
12444 12455 return (0);
12445 12456 }
12446 12457 }
12447 12458 /*
12448 12459 * We are done with IPsec processing. Send it over
12449 12460 * the wire.
12450 12461 */
12451 12462 return (ip_output_post_ipsec(mp, ixa));
12452 12463 }
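/*
 * Editorial sketch, not part of the original file: worked example of
 * the self-encapsulation arithmetic above. A 1480-byte inner datagram
 * gains exactly one outer IPv4 header, and the outer protocol field
 * advertises IP-in-IP:
 *
 *	outer ipha_length   = 1480 + sizeof (ipha_t) (20) = 1500
 *	outer ipha_protocol = IPPROTO_ENCAP (4)
 */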
12453 12464
12454 12465 /*
12455 12466 * ioctls that go through a down/up sequence may need to wait for the down
12456 12467 * to complete. This involves waiting for the ire and ipif refcnts to go down
12457 12468 * to zero. Subsequently the ioctl is restarted from ipif_ill_refrele_tail.
12458 12469 */
12459 12470 /* ARGSUSED */
12460 12471 void
12461 12472 ip_reprocess_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
12462 12473 {
12463 12474 struct iocblk *iocp;
12464 12475 mblk_t *mp1;
12465 12476 ip_ioctl_cmd_t *ipip;
12466 12477 int err;
12467 12478 sin_t *sin;
12468 12479 struct lifreq *lifr;
12469 12480 struct ifreq *ifr;
12470 12481
12471 12482 iocp = (struct iocblk *)mp->b_rptr;
12472 12483 ASSERT(ipsq != NULL);
12473 12484 /* Existence of mp1 verified in ip_wput_nondata */
12474 12485 mp1 = mp->b_cont->b_cont;
12475 12486 ipip = ip_sioctl_lookup(iocp->ioc_cmd);
12476 12487 if (ipip->ipi_cmd == SIOCSLIFNAME || ipip->ipi_cmd == IF_UNITSEL) {
12477 12488 /*
12478 12489 * Special case where ipx_current_ipif is not set:
12479 12490 * ill_phyint_reinit merged the v4 and v6 into a single ipsq.
12480 12491 		 * We are here as we were not able to complete the operation in
12481 12492 * ipif_set_values because we could not become exclusive on
12482 12493 * the new ipsq.
12483 12494 */
12484 12495 ill_t *ill = q->q_ptr;
12485 12496 ipsq_current_start(ipsq, ill->ill_ipif, ipip->ipi_cmd);
12486 12497 }
12487 12498 ASSERT(ipsq->ipsq_xop->ipx_current_ipif != NULL);
12488 12499
12489 12500 if (ipip->ipi_cmd_type == IF_CMD) {
12490 12501 		/* This is an old style SIOC[GS]IF* command */
12491 12502 ifr = (struct ifreq *)mp1->b_rptr;
12492 12503 sin = (sin_t *)&ifr->ifr_addr;
12493 12504 } else if (ipip->ipi_cmd_type == LIF_CMD) {
12494 12505 		/* This is a new style SIOC[GS]LIF* command */
12495 12506 lifr = (struct lifreq *)mp1->b_rptr;
12496 12507 sin = (sin_t *)&lifr->lifr_addr;
12497 12508 } else {
12498 12509 sin = NULL;
12499 12510 }
12500 12511
12501 12512 err = (*ipip->ipi_func_restart)(ipsq->ipsq_xop->ipx_current_ipif, sin,
12502 12513 q, mp, ipip, mp1->b_rptr);
12503 12514
12504 12515 DTRACE_PROBE4(ipif__ioctl, char *, "ip_reprocess_ioctl finish",
12505 12516 int, ipip->ipi_cmd,
12506 12517 ill_t *, ipsq->ipsq_xop->ipx_current_ipif->ipif_ill,
12507 12518 ipif_t *, ipsq->ipsq_xop->ipx_current_ipif);
12508 12519
12509 12520 ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), ipsq);
12510 12521 }
12511 12522
12512 12523 /*
12513 12524 * ioctl processing
12514 12525 *
12515 12526 * ioctl processing starts with ip_sioctl_copyin_setup(), which looks up
12516 12527 * the ioctl command in the ioctl tables, determines the copyin data size
12517 12528 * from the ipi_copyin_size field, and does an mi_copyin() of that size.
12518 12529 *
12519 12530 * ioctl processing then continues when the M_IOCDATA makes its way down to
12520 12531 * ip_wput_nondata(). The ioctl is looked up again in the ioctl table, its
12521 12532 * associated 'conn' is refheld till the end of the ioctl and the general
12522 12533 * ioctl processing function ip_process_ioctl() is called to extract the
12523 12534 * arguments and process the ioctl. To simplify extraction, ioctl commands
12524 12535 * are "typed" based on the arguments they take (e.g., LIF_CMD which takes a
12525 12536 * `struct lifreq'), and a common extract function (e.g., ip_extract_lifreq())
12526 12537 * is used to extract the ioctl's arguments.
12527 12538 *
12528 12539 * ip_process_ioctl determines if the ioctl needs to be serialized, and if
12529 12540 * so goes thru the serialization primitive ipsq_try_enter. Then the
12530 12541 * appropriate function to handle the ioctl is called based on the entry in
12531 12542 * the ioctl table. ioctl completion is encapsulated in ip_ioctl_finish
12532 12543 * which also refreleases the 'conn' that was refheld at the start of the
12533 12544 * ioctl. Finally ipsq_exit is called if needed to exit the ipsq.
12534 12545 *
12535 12546 * Many exclusive ioctls go thru an internal down up sequence as part of
12536 12547 * the operation. For example an attempt to change the IP address of an
12537 12548 * ipif entails ipif_down, set address, ipif_up. Bringing down the interface
12538 12549 * does all the cleanup such as deleting all ires that use this address.
12539 12550 * Then we need to wait till all references to the interface go away.
12540 12551 */
12541 12552 void
12542 12553 ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
12543 12554 {
12544 12555 struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
12545 12556 ip_ioctl_cmd_t *ipip = arg;
12546 12557 ip_extract_func_t *extract_funcp;
12547 12558 cmd_info_t ci;
12548 12559 int err;
12549 12560 boolean_t entered_ipsq = B_FALSE;
12550 12561
12551 12562 ip3dbg(("ip_process_ioctl: ioctl %X\n", iocp->ioc_cmd));
12552 12563
12553 12564 if (ipip == NULL)
12554 12565 ipip = ip_sioctl_lookup(iocp->ioc_cmd);
12555 12566
12556 12567 /*
12557 12568 * SIOCLIFADDIF needs to go thru a special path since the
12558 12569 * ill may not exist yet. This happens in the case of lo0
12559 12570 * which is created using this ioctl.
12560 12571 */
12561 12572 if (ipip->ipi_cmd == SIOCLIFADDIF) {
12562 12573 err = ip_sioctl_addif(NULL, NULL, q, mp, NULL, NULL);
12563 12574 DTRACE_PROBE4(ipif__ioctl, char *, "ip_process_ioctl finish",
12564 12575 int, ipip->ipi_cmd, ill_t *, NULL, ipif_t *, NULL);
12565 12576 ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), NULL);
12566 12577 return;
12567 12578 }
12568 12579
12569 12580 ci.ci_ipif = NULL;
12570 12581 switch (ipip->ipi_cmd_type) {
12571 12582 case MISC_CMD:
12572 12583 case MSFILT_CMD:
12573 12584 /*
12574 12585 * All MISC_CMD ioctls come in here -- e.g. SIOCGLIFCONF.
12575 12586 */
12576 12587 if (ipip->ipi_cmd == IF_UNITSEL) {
12577 12588 /* ioctl comes down the ill */
12578 12589 ci.ci_ipif = ((ill_t *)q->q_ptr)->ill_ipif;
12579 12590 ipif_refhold(ci.ci_ipif);
12580 12591 }
12581 12592 err = 0;
12582 12593 ci.ci_sin = NULL;
12583 12594 ci.ci_sin6 = NULL;
12584 12595 ci.ci_lifr = NULL;
12585 12596 extract_funcp = NULL;
12586 12597 break;
12587 12598
12588 12599 case IF_CMD:
12589 12600 case LIF_CMD:
12590 12601 extract_funcp = ip_extract_lifreq;
12591 12602 break;
12592 12603
12593 12604 case ARP_CMD:
12594 12605 case XARP_CMD:
12595 12606 extract_funcp = ip_extract_arpreq;
12596 12607 break;
12597 12608
12598 12609 default:
12599 12610 ASSERT(0);
12600 12611 }
12601 12612
12602 12613 if (extract_funcp != NULL) {
12603 12614 err = (*extract_funcp)(q, mp, ipip, &ci);
12604 12615 if (err != 0) {
12605 12616 DTRACE_PROBE4(ipif__ioctl,
12606 12617 char *, "ip_process_ioctl finish err",
12607 12618 int, ipip->ipi_cmd, ill_t *, NULL, ipif_t *, NULL);
12608 12619 ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), NULL);
12609 12620 return;
12610 12621 }
12611 12622
12612 12623 /*
12613 12624 * All of the extraction functions return a refheld ipif.
12614 12625 */
12615 12626 ASSERT(ci.ci_ipif != NULL);
12616 12627 }
12617 12628
12618 12629 if (!(ipip->ipi_flags & IPI_WR)) {
12619 12630 /*
12620 12631 * A return value of EINPROGRESS means the ioctl is
12621 12632 * either queued and waiting for some reason or has
12622 12633 * already completed.
12623 12634 */
12624 12635 err = (*ipip->ipi_func)(ci.ci_ipif, ci.ci_sin, q, mp, ipip,
12625 12636 ci.ci_lifr);
12626 12637 if (ci.ci_ipif != NULL) {
12627 12638 DTRACE_PROBE4(ipif__ioctl,
12628 12639 char *, "ip_process_ioctl finish RD",
12629 12640 int, ipip->ipi_cmd, ill_t *, ci.ci_ipif->ipif_ill,
12630 12641 ipif_t *, ci.ci_ipif);
12631 12642 ipif_refrele(ci.ci_ipif);
12632 12643 } else {
12633 12644 DTRACE_PROBE4(ipif__ioctl,
12634 12645 char *, "ip_process_ioctl finish RD",
12635 12646 int, ipip->ipi_cmd, ill_t *, NULL, ipif_t *, NULL);
12636 12647 }
12637 12648 ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), NULL);
12638 12649 return;
12639 12650 }
12640 12651
12641 12652 ASSERT(ci.ci_ipif != NULL);
12642 12653
12643 12654 /*
12644 12655 * If ipsq is non-NULL, we are already being called exclusively
12645 12656 */
12646 12657 ASSERT(ipsq == NULL || IAM_WRITER_IPSQ(ipsq));
12647 12658 if (ipsq == NULL) {
12648 12659 ipsq = ipsq_try_enter(ci.ci_ipif, NULL, q, mp, ip_process_ioctl,
12649 12660 NEW_OP, B_TRUE);
12650 12661 if (ipsq == NULL) {
12651 12662 ipif_refrele(ci.ci_ipif);
12652 12663 return;
12653 12664 }
12654 12665 entered_ipsq = B_TRUE;
12655 12666 }
12656 12667 /*
12657 12668 * Release the ipif so that ipif_down and friends that wait for
12658 12669 * references to go away are not misled about the current ipif_refcnt
12659 12670 * values. We are writer so we can access the ipif even after releasing
12660 12671 * the ipif.
12661 12672 */
12662 12673 ipif_refrele(ci.ci_ipif);
12663 12674
12664 12675 ipsq_current_start(ipsq, ci.ci_ipif, ipip->ipi_cmd);
12665 12676
12666 12677 /*
12667 12678 * A return value of EINPROGRESS means the ioctl is
12668 12679 * either queued and waiting for some reason or has
12669 12680 * already completed.
12670 12681 */
12671 12682 err = (*ipip->ipi_func)(ci.ci_ipif, ci.ci_sin, q, mp, ipip, ci.ci_lifr);
12672 12683
12673 12684 DTRACE_PROBE4(ipif__ioctl, char *, "ip_process_ioctl finish WR",
12674 12685 int, ipip->ipi_cmd,
12675 12686 ill_t *, ci.ci_ipif == NULL ? NULL : ci.ci_ipif->ipif_ill,
12676 12687 ipif_t *, ci.ci_ipif);
12677 12688 ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), ipsq);
12678 12689
12679 12690 if (entered_ipsq)
12680 12691 ipsq_exit(ipsq);
12681 12692 }
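/*
 * Editorial sketch, not part of the original file: the userland view
 * of one LIF_CMD ioctl serviced by the path above, where s is an
 * AF_INET socket descriptor. get_lif_addr() is a hypothetical helper;
 * SIOCGLIFADDR and struct lifreq are the real interfaces, and the
 * copyin/copyout of *lifr is driven by the code above.
 */
#if 0	/* illustrative only */
#include <sys/sockio.h>
#include <sys/socket.h>
#include <net/if.h>
#include <string.h>
#include <unistd.h>

static int
get_lif_addr(int s, const char *name, struct lifreq *lifr)
{
	(void) memset(lifr, 0, sizeof (*lifr));
	(void) strlcpy(lifr->lifr_name, name, sizeof (lifr->lifr_name));
	return (ioctl(s, SIOCGLIFADDR, lifr));
}
#endif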
12682 12693
12683 12694 /*
12684 12695 * Complete the ioctl. Typically ioctls use the mi package and need to
12685 12696 * do mi_copyout/mi_copy_done.
12686 12697 */
12687 12698 void
12688 12699 ip_ioctl_finish(queue_t *q, mblk_t *mp, int err, int mode, ipsq_t *ipsq)
12689 12700 {
12690 12701 conn_t *connp = NULL;
12691 12702
12692 12703 if (err == EINPROGRESS)
12693 12704 return;
12694 12705
12695 12706 if (CONN_Q(q)) {
12696 12707 connp = Q_TO_CONN(q);
12697 12708 ASSERT(connp->conn_ref >= 2);
12698 12709 }
12699 12710
12700 12711 switch (mode) {
12701 12712 case COPYOUT:
12702 12713 if (err == 0)
12703 12714 mi_copyout(q, mp);
12704 12715 else
12705 12716 mi_copy_done(q, mp, err);
12706 12717 break;
12707 12718
12708 12719 case NO_COPYOUT:
12709 12720 mi_copy_done(q, mp, err);
12710 12721 break;
12711 12722
12712 12723 default:
12713 12724 ASSERT(mode == CONN_CLOSE); /* aborted through CONN_CLOSE */
12714 12725 break;
12715 12726 }
12716 12727
12717 12728 /*
12718 12729 * The conn refhold and ioctlref placed on the conn at the start of the
12719 12730 * ioctl are released here.
12720 12731 */
12721 12732 if (connp != NULL) {
12722 12733 CONN_DEC_IOCTLREF(connp);
12723 12734 CONN_OPER_PENDING_DONE(connp);
12724 12735 }
12725 12736
12726 12737 if (ipsq != NULL)
12727 12738 ipsq_current_finish(ipsq);
12728 12739 }
12729 12740
12730 12741 /* Handles all non data messages */
12731 12742 void
12732 12743 ip_wput_nondata(queue_t *q, mblk_t *mp)
12733 12744 {
12734 12745 mblk_t *mp1;
12735 12746 struct iocblk *iocp;
12736 12747 ip_ioctl_cmd_t *ipip;
12737 12748 conn_t *connp;
12738 12749 cred_t *cr;
12739 12750 char *proto_str;
12740 12751
12741 12752 if (CONN_Q(q))
12742 12753 connp = Q_TO_CONN(q);
12743 12754 else
12744 12755 connp = NULL;
12745 12756
12746 12757 switch (DB_TYPE(mp)) {
12747 12758 case M_IOCTL:
12748 12759 /*
12749 12760 * IOCTL processing begins in ip_sioctl_copyin_setup which
12750 12761 * will arrange to copy in associated control structures.
12751 12762 */
12752 12763 ip_sioctl_copyin_setup(q, mp);
12753 12764 return;
12754 12765 case M_IOCDATA:
12755 12766 /*
12756 12767 * Ensure that this is associated with one of our trans-
12757 12768 * parent ioctls. If it's not ours, discard it if we're
12758 12769 * running as a driver, or pass it on if we're a module.
12759 12770 */
12760 12771 iocp = (struct iocblk *)mp->b_rptr;
12761 12772 ipip = ip_sioctl_lookup(iocp->ioc_cmd);
12762 12773 if (ipip == NULL) {
12763 12774 if (q->q_next == NULL) {
12764 12775 goto nak;
12765 12776 } else {
12766 12777 putnext(q, mp);
12767 12778 }
12768 12779 return;
12769 12780 }
12770 12781 if ((q->q_next != NULL) && !(ipip->ipi_flags & IPI_MODOK)) {
12771 12782 /*
12772 12783 * The ioctl is one we recognise, but is not consumed
12773 12784 * by IP as a module and we are a module, so we drop
12774 12785 */
12775 12786 goto nak;
12776 12787 }
12777 12788
12778 12789 /* IOCTL continuation following copyin or copyout. */
12779 12790 if (mi_copy_state(q, mp, NULL) == -1) {
12780 12791 /*
12781 12792 * The copy operation failed. mi_copy_state already
12782 12793 * cleaned up, so we're out of here.
12783 12794 */
12784 12795 return;
12785 12796 }
12786 12797 /*
12787 12798 * If we just completed a copy in, we become writer and
12788 12799 * continue processing in ip_sioctl_copyin_done. If it
12789 12800 * was a copy out, we call mi_copyout again. If there is
12790 12801 * nothing more to copy out, it will complete the IOCTL.
12791 12802 */
12792 12803 if (MI_COPY_DIRECTION(mp) == MI_COPY_IN) {
12793 12804 if (!(mp1 = mp->b_cont) || !(mp1 = mp1->b_cont)) {
12794 12805 mi_copy_done(q, mp, EPROTO);
12795 12806 return;
12796 12807 }
12797 12808 /*
12798 12809 * Check for cases that need more copying. A return
12799 12810 * value of 0 means a second copyin has been started,
12800 12811 * so we return; a return value of 1 means no more
12801 12812 * copying is needed, so we continue.
12802 12813 */
12803 12814 if (ipip->ipi_cmd_type == MSFILT_CMD &&
12804 12815 MI_COPY_COUNT(mp) == 1) {
12805 12816 if (ip_copyin_msfilter(q, mp) == 0)
12806 12817 return;
12807 12818 }
12808 12819 /*
12809 12820 * Refhold the conn, till the ioctl completes. This is
12810 12821 * needed in case the ioctl ends up in the pending mp
12811 12822 * list. Every mp in the ipx_pending_mp list must have
12812 12823 * a refhold on the conn to resume processing. The
12813 12824 * refhold is released when the ioctl completes
12814 12825 * (whether normally or abnormally). An ioctlref is also
12815 12826 * placed on the conn to prevent TCP from removing the
12816 12827 * queue needed to send the ioctl reply back.
12817 12828 * In all cases ip_ioctl_finish is called to finish
12818 12829 * the ioctl and release the refholds.
12819 12830 */
12820 12831 if (connp != NULL) {
12821 12832 /* This is not a reentry */
12822 12833 CONN_INC_REF(connp);
12823 12834 CONN_INC_IOCTLREF(connp);
12824 12835 } else {
12825 12836 if (!(ipip->ipi_flags & IPI_MODOK)) {
12826 12837 mi_copy_done(q, mp, EINVAL);
12827 12838 return;
12828 12839 }
12829 12840 }
12830 12841
12831 12842 ip_process_ioctl(NULL, q, mp, ipip);
12832 12843
12833 12844 } else {
12834 12845 mi_copyout(q, mp);
12835 12846 }
12836 12847 return;
12837 12848
12838 12849 case M_IOCNAK:
12839 12850 /*
12840 12851 * The only way we could get here is if a resolver didn't like
12841 12852 * an IOCTL we sent it. This shouldn't happen.
12842 12853 */
12843 12854 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
12844 12855 "ip_wput_nondata: unexpected M_IOCNAK, ioc_cmd 0x%x",
12845 12856 ((struct iocblk *)mp->b_rptr)->ioc_cmd);
12846 12857 freemsg(mp);
12847 12858 return;
12848 12859 case M_IOCACK:
12849 12860 /* /dev/ip shouldn't see this */
12850 12861 goto nak;
12851 12862 case M_FLUSH:
12852 12863 if (*mp->b_rptr & FLUSHW)
12853 12864 flushq(q, FLUSHALL);
12854 12865 if (q->q_next) {
12855 12866 putnext(q, mp);
12856 12867 return;
12857 12868 }
12858 12869 if (*mp->b_rptr & FLUSHR) {
12859 12870 *mp->b_rptr &= ~FLUSHW;
12860 12871 qreply(q, mp);
12861 12872 return;
12862 12873 }
12863 12874 freemsg(mp);
12864 12875 return;
12865 12876 case M_CTL:
12866 12877 break;
12867 12878 case M_PROTO:
12868 12879 case M_PCPROTO:
12869 12880 /*
12870 12881 * The only PROTO messages we expect are SNMP-related.
12871 12882 */
12872 12883 switch (((union T_primitives *)mp->b_rptr)->type) {
12873 12884 case T_SVR4_OPTMGMT_REQ:
12874 12885 ip2dbg(("ip_wput_nondata: T_SVR4_OPTMGMT_REQ "
12875 12886 "flags %x\n",
12876 12887 ((struct T_optmgmt_req *)mp->b_rptr)->MGMT_flags));
12877 12888
12878 12889 if (connp == NULL) {
12879 12890 proto_str = "T_SVR4_OPTMGMT_REQ";
12880 12891 goto protonak;
12881 12892 }
12882 12893
12883 12894 /*
12884 12895 * All Solaris components should pass a db_credp
12885 12896 * for this TPI message, hence we ASSERT.
12886 12897 * But in case there is some other M_PROTO that looks
12887 12898 * like a TPI message sent by some other kernel
12888 12899 * component, we check and return an error.
12889 12900 */
12890 12901 cr = msg_getcred(mp, NULL);
12891 12902 ASSERT(cr != NULL);
12892 12903 if (cr == NULL) {
12893 12904 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, EINVAL);
12894 12905 if (mp != NULL)
12895 12906 qreply(q, mp);
12896 12907 return;
12897 12908 }
12898 12909
12899 12910 if (!snmpcom_req(q, mp, ip_snmp_set, ip_snmp_get, cr)) {
12900 12911 proto_str = "Bad SNMPCOM request?";
12901 12912 goto protonak;
12902 12913 }
12903 12914 return;
12904 12915 default:
12905 12916 ip1dbg(("ip_wput_nondata: dropping M_PROTO prim %u\n",
12906 12917 (int)*(uint_t *)mp->b_rptr));
12907 12918 freemsg(mp);
12908 12919 return;
12909 12920 }
12910 12921 default:
12911 12922 break;
12912 12923 }
12913 12924 if (q->q_next) {
12914 12925 putnext(q, mp);
12915 12926 } else
12916 12927 freemsg(mp);
12917 12928 return;
12918 12929
12919 12930 nak:
12920 12931 iocp->ioc_error = EINVAL;
12921 12932 mp->b_datap->db_type = M_IOCNAK;
12922 12933 iocp->ioc_count = 0;
12923 12934 qreply(q, mp);
12924 12935 return;
12925 12936
12926 12937 protonak:
12927 12938 cmn_err(CE_NOTE, "IP doesn't process %s as a module", proto_str);
12928 12939 if ((mp = mi_tpi_err_ack_alloc(mp, TPROTO, EINVAL)) != NULL)
12929 12940 qreply(q, mp);
12930 12941 }
12931 12942
12932 12943 /*
12933 12944 * Process IP options in an outbound packet. Verify that the nexthop in a
12934 12945 * strict source route is onlink.
12935 12946 * Returns non-zero if something fails in which case an ICMP error has been
12936 12947 * sent and mp freed.
12937 12948 *
12938 12949 * Assumes the ULP has called ip_massage_options to move nexthop into ipha_dst.
12939 12950 */
12940 12951 int
12941 12952 ip_output_options(mblk_t *mp, ipha_t *ipha, ip_xmit_attr_t *ixa, ill_t *ill)
12942 12953 {
12943 12954 ipoptp_t opts;
12944 12955 uchar_t *opt;
12945 12956 uint8_t optval;
12946 12957 uint8_t optlen;
12947 12958 ipaddr_t dst;
12948 12959 intptr_t code = 0;
12949 12960 ire_t *ire;
12950 12961 ip_stack_t *ipst = ixa->ixa_ipst;
12951 12962 ip_recv_attr_t iras;
12952 12963
12953 12964 ip2dbg(("ip_output_options\n"));
12954 12965
12955 12966 dst = ipha->ipha_dst;
12956 12967 for (optval = ipoptp_first(&opts, ipha);
12957 12968 optval != IPOPT_EOL;
12958 12969 optval = ipoptp_next(&opts)) {
12959 12970 opt = opts.ipoptp_cur;
12960 12971 optlen = opts.ipoptp_len;
12961 12972 ip2dbg(("ip_output_options: opt %d, len %d\n",
12962 12973 optval, optlen));
12963 12974 switch (optval) {
12964 12975 uint32_t off;
12965 12976 case IPOPT_SSRR:
12966 12977 case IPOPT_LSRR:
12967 12978 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
12968 12979 ip1dbg((
12969 12980 "ip_output_options: bad option offset\n"));
12970 12981 code = (char *)&opt[IPOPT_OLEN] -
12971 12982 (char *)ipha;
12972 12983 goto param_prob;
12973 12984 }
12974 12985 off = opt[IPOPT_OFFSET];
12975 12986 ip1dbg(("ip_output_options: next hop 0x%x\n",
12976 12987 ntohl(dst)));
12977 12988 /*
12978 12989 * For strict: verify that dst is directly
12979 12990 * reachable.
12980 12991 */
12981 12992 if (optval == IPOPT_SSRR) {
12982 12993 ire = ire_ftable_lookup_v4(dst, 0, 0,
12983 12994 IRE_INTERFACE, NULL, ALL_ZONES,
12984 12995 ixa->ixa_tsl,
12985 12996 MATCH_IRE_TYPE | MATCH_IRE_SECATTR, 0, ipst,
12986 12997 NULL);
12987 12998 if (ire == NULL) {
12988 12999 ip1dbg(("ip_output_options: SSRR not"
12989 13000 " directly reachable: 0x%x\n",
12990 13001 ntohl(dst)));
12991 13002 goto bad_src_route;
12992 13003 }
12993 13004 ire_refrele(ire);
12994 13005 }
12995 13006 break;
12996 13007 case IPOPT_RR:
12997 13008 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
12998 13009 ip1dbg((
12999 13010 "ip_output_options: bad option offset\n"));
13000 13011 code = (char *)&opt[IPOPT_OLEN] -
13001 13012 (char *)ipha;
13002 13013 goto param_prob;
13003 13014 }
13004 13015 break;
13005 13016 case IPOPT_TS:
13006 13017 /*
13007 13018 * Verify that length >=5 and that there is either
13008 13019 * room for another timestamp or that the overflow
13009 13020 * counter is not maxed out.
13010 13021 */
13011 13022 code = (char *)&opt[IPOPT_OLEN] - (char *)ipha;
13012 13023 if (optlen < IPOPT_MINLEN_IT) {
13013 13024 goto param_prob;
13014 13025 }
13015 13026 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
13016 13027 ip1dbg((
13017 13028 "ip_output_options: bad option offset\n"));
13018 13029 code = (char *)&opt[IPOPT_OFFSET] -
13019 13030 (char *)ipha;
13020 13031 goto param_prob;
13021 13032 }
13022 13033 switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
13023 13034 case IPOPT_TS_TSONLY:
13024 13035 off = IPOPT_TS_TIMELEN;
13025 13036 break;
13026 13037 case IPOPT_TS_TSANDADDR:
13027 13038 case IPOPT_TS_PRESPEC:
13028 13039 case IPOPT_TS_PRESPEC_RFC791:
13029 13040 off = IP_ADDR_LEN + IPOPT_TS_TIMELEN;
13030 13041 break;
13031 13042 default:
13032 13043 code = (char *)&opt[IPOPT_POS_OV_FLG] -
13033 13044 (char *)ipha;
13034 13045 goto param_prob;
13035 13046 }
13036 13047 if (opt[IPOPT_OFFSET] - 1 + off > optlen &&
13037 13048 (opt[IPOPT_POS_OV_FLG] & 0xF0) == 0xF0) {
13038 13049 /*
13039 13050 * No room and the overflow counter is 15
13040 13051 * already.
13041 13052 */
13042 13053 goto param_prob;
13043 13054 }
13044 13055 break;
13045 13056 }
13046 13057 }
13047 13058
13048 13059 if ((opts.ipoptp_flags & IPOPTP_ERROR) == 0)
13049 13060 return (0);
13050 13061
13051 13062 ip1dbg(("ip_output_options: error processing IP options."));
13052 13063 code = (char *)&opt[IPOPT_OFFSET] - (char *)ipha;
13053 13064
13054 13065 param_prob:
13055 13066 bzero(&iras, sizeof (iras));
13056 13067 iras.ira_ill = iras.ira_rill = ill;
13057 13068 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
13058 13069 iras.ira_rifindex = iras.ira_ruifindex;
13059 13070 iras.ira_flags = IRAF_IS_IPV4;
13060 13071
13061 13072 ip_drop_output("ip_output_options", mp, ill);
13062 13073 icmp_param_problem(mp, (uint8_t)code, &iras);
13063 13074 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
13064 13075 return (-1);
13065 13076
13066 13077 bad_src_route:
13067 13078 bzero(&iras, sizeof (iras));
13068 13079 iras.ira_ill = iras.ira_rill = ill;
13069 13080 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
13070 13081 iras.ira_rifindex = iras.ira_ruifindex;
13071 13082 iras.ira_flags = IRAF_IS_IPV4;
13072 13083
13073 13084 ip_drop_input("ICMP_SOURCE_ROUTE_FAILED", mp, ill);
13074 13085 icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, &iras);
13075 13086 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
13076 13087 return (-1);
13077 13088 }
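/*
 * Editorial sketch, not part of the original file: the IPOPT_TS check
 * above packs the overflow counter into the high nibble of the
 * pointer/overflow/flags byte. A worked example of the saturation
 * test:
 *
 *	opt[IPOPT_POS_OV_FLG] == 0xF3  =>  flags = 3 (PRESPEC),
 *	overflow = 0xF (15, the maximum), so a full option is a
 *	parameter problem rather than another overflow increment.
 */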
13078 13089
13079 13090 /*
13080 13091 * The maximum value of conn_drain_list_cnt is CONN_MAXDRAINCNT.
13081 13092 * conn_drain_list_cnt can be changed by setting conn_drain_nthreads
13082 13093 * thru /etc/system.
13083 13094 */
13084 13095 #define CONN_MAXDRAINCNT 64
13085 13096
13086 13097 static void
13087 13098 conn_drain_init(ip_stack_t *ipst)
13088 13099 {
13089 13100 int i, j;
13090 13101 idl_tx_list_t *itl_tx;
13091 13102
13092 13103 ipst->ips_conn_drain_list_cnt = conn_drain_nthreads;
13093 13104
13094 13105 if ((ipst->ips_conn_drain_list_cnt == 0) ||
13095 13106 (ipst->ips_conn_drain_list_cnt > CONN_MAXDRAINCNT)) {
13096 13107 /*
13097 13108 * Default value of the number of drainers is the
13098 13109 		 * number of cpus, subject to a maximum of 8 drainers.
13099 13110 */
13100 13111 if (boot_max_ncpus != -1)
13101 13112 ipst->ips_conn_drain_list_cnt = MIN(boot_max_ncpus, 8);
13102 13113 else
13103 13114 ipst->ips_conn_drain_list_cnt = MIN(max_ncpus, 8);
13104 13115 }
13105 13116
13106 13117 ipst->ips_idl_tx_list =
13107 13118 kmem_zalloc(TX_FANOUT_SIZE * sizeof (idl_tx_list_t), KM_SLEEP);
13108 13119 for (i = 0; i < TX_FANOUT_SIZE; i++) {
13109 13120 itl_tx = &ipst->ips_idl_tx_list[i];
13110 13121 itl_tx->txl_drain_list =
13111 13122 kmem_zalloc(ipst->ips_conn_drain_list_cnt *
13112 13123 sizeof (idl_t), KM_SLEEP);
13113 13124 mutex_init(&itl_tx->txl_lock, NULL, MUTEX_DEFAULT, NULL);
13114 13125 for (j = 0; j < ipst->ips_conn_drain_list_cnt; j++) {
13115 13126 mutex_init(&itl_tx->txl_drain_list[j].idl_lock, NULL,
13116 13127 MUTEX_DEFAULT, NULL);
13117 13128 itl_tx->txl_drain_list[j].idl_itl = itl_tx;
13118 13129 }
13119 13130 }
13120 13131 }
13121 13132
13122 13133 static void
13123 13134 conn_drain_fini(ip_stack_t *ipst)
13124 13135 {
13125 13136 int i;
13126 13137 idl_tx_list_t *itl_tx;
13127 13138
13128 13139 for (i = 0; i < TX_FANOUT_SIZE; i++) {
13129 13140 itl_tx = &ipst->ips_idl_tx_list[i];
13130 13141 kmem_free(itl_tx->txl_drain_list,
13131 13142 ipst->ips_conn_drain_list_cnt * sizeof (idl_t));
13132 13143 }
13133 13144 kmem_free(ipst->ips_idl_tx_list,
13134 13145 TX_FANOUT_SIZE * sizeof (idl_tx_list_t));
13135 13146 ipst->ips_idl_tx_list = NULL;
13136 13147 }
13137 13148
13138 13149 /*
13139 13150 * Flow control has blocked us from proceeding. Insert the given conn in one
13140 13151 * of the conn drain lists. When flow control is unblocked, either ip_wsrv()
13141 13152 * (STREAMS) or ill_flow_enable() (direct) will be called back, which in turn
13142 13153 * will call conn_walk_drain(). See the flow control notes at the top of this
13143 13154 * file for more details.
13144 13155 */
13145 13156 void
13146 13157 conn_drain_insert(conn_t *connp, idl_tx_list_t *tx_list)
13147 13158 {
13148 13159 idl_t *idl = tx_list->txl_drain_list;
13149 13160 uint_t index;
13150 13161 ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
13151 13162
13152 13163 mutex_enter(&connp->conn_lock);
13153 13164 if (connp->conn_state_flags & CONN_CLOSING) {
13154 13165 /*
13155 13166 * The conn is closing as a result of which CONN_CLOSING
13156 13167 * is set. Return.
13157 13168 */
13158 13169 mutex_exit(&connp->conn_lock);
13159 13170 return;
13160 13171 } else if (connp->conn_idl == NULL) {
13161 13172 /*
13162 13173 		 * Assign the next drain list round robin. We don't use
13163 13174 		 * a lock, and thus it may not be strictly round robin.
13164 13175 		 * Atomicity of loads/stores is enough to make sure that
13165 13176 		 * txl_drain_index is always within bounds.
13166 13177 */
13167 13178 index = tx_list->txl_drain_index;
13168 13179 ASSERT(index < ipst->ips_conn_drain_list_cnt);
13169 13180 connp->conn_idl = &tx_list->txl_drain_list[index];
13170 13181 index++;
13171 13182 if (index == ipst->ips_conn_drain_list_cnt)
13172 13183 index = 0;
13173 13184 tx_list->txl_drain_index = index;
13174 13185 } else {
13175 13186 ASSERT(connp->conn_idl->idl_itl == tx_list);
13176 13187 }
13177 13188 mutex_exit(&connp->conn_lock);
13178 13189
13179 13190 idl = connp->conn_idl;
13180 13191 mutex_enter(&idl->idl_lock);
13181 13192 if ((connp->conn_drain_prev != NULL) ||
13182 13193 (connp->conn_state_flags & CONN_CLOSING)) {
13183 13194 /*
13184 13195 * The conn is either already in the drain list or closing.
13185 13196 * (We needed to check for CONN_CLOSING again since close can
13186 13197 * sneak in between dropping conn_lock and acquiring idl_lock.)
13187 13198 */
13188 13199 mutex_exit(&idl->idl_lock);
13189 13200 return;
13190 13201 }
13191 13202
13192 13203 /*
13193 13204 * The conn is not in the drain list. Insert it at the
13194 13205 * tail of the drain list. The drain list is circular
13195 13206 * and doubly linked. idl_conn points to the 1st element
13196 13207 * in the list.
13197 13208 */
13198 13209 if (idl->idl_conn == NULL) {
13199 13210 idl->idl_conn = connp;
13200 13211 connp->conn_drain_next = connp;
13201 13212 connp->conn_drain_prev = connp;
13202 13213 } else {
13203 13214 conn_t *head = idl->idl_conn;
13204 13215
13205 13216 connp->conn_drain_next = head;
13206 13217 connp->conn_drain_prev = head->conn_drain_prev;
13207 13218 head->conn_drain_prev->conn_drain_next = connp;
13208 13219 head->conn_drain_prev = connp;
13209 13220 }
13210 13221 /*
13211 13222 * For non streams based sockets assert flow control.
13212 13223 */
13213 13224 conn_setqfull(connp, NULL);
13214 13225 mutex_exit(&idl->idl_lock);
13215 13226 }
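/*
 * Editorial sketch, not part of the original file: the tail insertion
 * above is the classic circular doubly-linked list idiom. A minimal
 * standalone rendition with a hypothetical node type:
 */
#if 0	/* illustrative only */
typedef struct node {
	struct node *next;
	struct node *prev;
} node_t;

static void
list_insert_tail(node_t **headp, node_t *np)
{
	if (*headp == NULL) {			/* empty: self-linked ring */
		np->next = np->prev = np;
		*headp = np;
	} else {				/* splice in before the head */
		node_t *head = *headp;

		np->next = head;
		np->prev = head->prev;
		head->prev->next = np;
		head->prev = np;
	}
}
#endif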
13216 13227
13217 13228 static void
13218 13229 conn_drain_remove(conn_t *connp)
13219 13230 {
13220 13231 idl_t *idl = connp->conn_idl;
13221 13232
13222 13233 if (idl != NULL) {
13223 13234 /*
13224 13235 * Remove ourself from the drain list.
13225 13236 */
13226 13237 if (connp->conn_drain_next == connp) {
13227 13238 /* Singleton in the list */
13228 13239 ASSERT(connp->conn_drain_prev == connp);
13229 13240 idl->idl_conn = NULL;
13230 13241 } else {
13231 13242 connp->conn_drain_prev->conn_drain_next =
13232 13243 connp->conn_drain_next;
13233 13244 connp->conn_drain_next->conn_drain_prev =
13234 13245 connp->conn_drain_prev;
13235 13246 if (idl->idl_conn == connp)
13236 13247 idl->idl_conn = connp->conn_drain_next;
13237 13248 }
13238 13249
13239 13250 /*
13240 13251 * NOTE: because conn_idl is associated with a specific drain
13241 13252 * list which in turn is tied to the index the TX ring
13242 13253 * (txl_cookie) hashes to, and because the TX ring can change
13243 13254 * over the lifetime of the conn_t, we must clear conn_idl so
13244 13255 * a subsequent conn_drain_insert() will set conn_idl again
13245 13256 * based on the latest txl_cookie.
13246 13257 */
13247 13258 connp->conn_idl = NULL;
13248 13259 }
13249 13260 connp->conn_drain_next = NULL;
13250 13261 connp->conn_drain_prev = NULL;
13251 13262
13252 13263 conn_clrqfull(connp, NULL);
13253 13264 /*
13254 13265 * For streams based sockets open up flow control.
13255 13266 */
13256 13267 if (!IPCL_IS_NONSTR(connp))
13257 13268 enableok(connp->conn_wq);
13258 13269 }
13259 13270
13260 13271 /*
13261 13272 * This conn is closing, and we are called from ip_close. OR
13262 13273 * this conn is draining because flow-control on the ill has been relieved.
13263 13274 *
13264 13275  * In either case we need to remove the conn's on this idl from the list and
13265 13276 * inform the sockfs upcalls about the change in flow-control.
13266 13277 */
13267 13278 static void
13268 13279 conn_drain(conn_t *connp, boolean_t closing)
13269 13280 {
13270 13281 idl_t *idl;
13271 13282 conn_t *next_connp;
13272 13283
13273 13284 /*
13274 13285 * connp->conn_idl is stable at this point, and no lock is needed
13275 13286 * to check it. If we are called from ip_close, close has already
13276 13287 * set CONN_CLOSING, thus freezing the value of conn_idl, and
13277 13288 * called us only because conn_idl is non-null. If we are called thru
13278 13289 * service, conn_idl could be null, but it cannot change because
13279 13290 * service is single-threaded per queue, and there cannot be another
13280 13291 * instance of service trying to call conn_drain_insert on this conn
13281 13292 * now.
13282 13293 */
13283 13294 ASSERT(!closing || connp == NULL || connp->conn_idl != NULL);
13284 13295
13285 13296 /*
13286 13297 * If the conn doesn't exist or is not on a drain list, bail.
13287 13298 */
13288 13299 if (connp == NULL || connp->conn_idl == NULL ||
13289 13300 connp->conn_drain_prev == NULL) {
13290 13301 return;
13291 13302 }
13292 13303
13293 13304 idl = connp->conn_idl;
13294 13305 ASSERT(MUTEX_HELD(&idl->idl_lock));
13295 13306
13296 13307 if (!closing) {
13297 13308 next_connp = connp->conn_drain_next;
13298 13309 while (next_connp != connp) {
13299 13310 conn_t *delconnp = next_connp;
13300 13311
13301 13312 next_connp = next_connp->conn_drain_next;
13302 13313 conn_drain_remove(delconnp);
13303 13314 }
13304 13315 ASSERT(connp->conn_drain_next == idl->idl_conn);
13305 13316 }
13306 13317 conn_drain_remove(connp);
13307 13318 }
13308 13319
13309 13320 /*
13310 13321 * Write service routine. Shared perimeter entry point.
13311 13322  * The device queue's message count has fallen below the low water mark and
13311 13322  * STREAMS has backenabled the ill_wq. Send sockfs notification about
13312 13323  * flow-control on each waiting conn.
13314 13325 */
13315 13326 void
13316 13327 ip_wsrv(queue_t *q)
13317 13328 {
13318 13329 ill_t *ill;
13319 13330
13320 13331 ill = (ill_t *)q->q_ptr;
13321 13332 if (ill->ill_state_flags == 0) {
13322 13333 ip_stack_t *ipst = ill->ill_ipst;
13323 13334
13324 13335 /*
13325 13336 * The device flow control has opened up.
13326 13337 * Walk through conn drain lists and qenable the
13327 13338 * first conn in each list. This makes sense only
13328 13339 * if the stream is fully plumbed and setup.
13329 13340 * Hence the ill_state_flags check above.
13330 13341 */
13331 13342 ip1dbg(("ip_wsrv: walking\n"));
13332 13343 conn_walk_drain(ipst, &ipst->ips_idl_tx_list[0]);
13333 13344 enableok(ill->ill_wq);
13334 13345 }
13335 13346 }
13336 13347
13337 13348 /*
13338 13349 * Callback to disable flow control in IP.
13339 13350 *
13340 13351 * This is a mac client callback added when the DLD_CAPAB_DIRECT capability
13341 13352 * is enabled.
13342 13353 *
13343 13354 * When MAC_TX() is not able to send any more packets, dld sets its queue
13344 13355  * to QFULL and enables STREAMS flow control. Later, when the underlying
13345 13356  * driver is able to continue to send packets, it calls the
13346 13357  * mac_tx_(ring_)update() function and wakes up the corresponding mac worker
13347 13358  * threads, which in turn call this callback function and disable flow control.
13348 13359 */
13349 13360 void
13350 13361 ill_flow_enable(void *arg, ip_mac_tx_cookie_t cookie)
13351 13362 {
13352 13363 ill_t *ill = (ill_t *)arg;
13353 13364 ip_stack_t *ipst = ill->ill_ipst;
13354 13365 idl_tx_list_t *idl_txl;
13355 13366
13356 13367 idl_txl = &ipst->ips_idl_tx_list[IDLHASHINDEX(cookie)];
13357 13368 mutex_enter(&idl_txl->txl_lock);
13358 13369 	/* add code to set a flag to indicate idl_txl is enabled */
13359 13370 conn_walk_drain(ipst, idl_txl);
13360 13371 mutex_exit(&idl_txl->txl_lock);
13361 13372 }
13362 13373
13363 13374 /*
13364 13375 * Flow control has been relieved and STREAMS has backenabled us; drain
13365 13376 * all the conn lists on `tx_list'.
13366 13377 */
13367 13378 static void
13368 13379 conn_walk_drain(ip_stack_t *ipst, idl_tx_list_t *tx_list)
13369 13380 {
13370 13381 int i;
13371 13382 idl_t *idl;
13372 13383
13373 13384 IP_STAT(ipst, ip_conn_walk_drain);
13374 13385
13375 13386 for (i = 0; i < ipst->ips_conn_drain_list_cnt; i++) {
13376 13387 idl = &tx_list->txl_drain_list[i];
13377 13388 mutex_enter(&idl->idl_lock);
13378 13389 conn_drain(idl->idl_conn, B_FALSE);
13379 13390 mutex_exit(&idl->idl_lock);
13380 13391 }
13381 13392 }
13382 13393
13383 13394 /*
13384 13395  * Determine if the ill and multicast aspects of the packet
13385 13396  * "match" the conn.
13386 13397 */
13387 13398 boolean_t
13388 13399 conn_wantpacket(conn_t *connp, ip_recv_attr_t *ira, ipha_t *ipha)
13389 13400 {
13390 13401 ill_t *ill = ira->ira_rill;
13391 13402 zoneid_t zoneid = ira->ira_zoneid;
13392 13403 uint_t in_ifindex;
13393 13404 ipaddr_t dst, src;
13394 13405
13395 13406 dst = ipha->ipha_dst;
13396 13407 src = ipha->ipha_src;
13397 13408
13398 13409 /*
13399 13410 * conn_incoming_ifindex is set by IP_BOUND_IF which limits
13400 13411 * unicast, broadcast and multicast reception to
13401 13412 * conn_incoming_ifindex.
13402 13413 * conn_wantpacket is called for unicast, broadcast and
13403 13414 * multicast packets.
13404 13415 */
13405 13416 in_ifindex = connp->conn_incoming_ifindex;
13406 13417
13407 13418 /* mpathd can bind to the under IPMP interface, which we allow */
13408 13419 if (in_ifindex != 0 && in_ifindex != ill->ill_phyint->phyint_ifindex) {
13409 13420 if (!IS_UNDER_IPMP(ill))
13410 13421 return (B_FALSE);
13411 13422
13412 13423 if (in_ifindex != ipmp_ill_get_ipmp_ifindex(ill))
13413 13424 return (B_FALSE);
13414 13425 }
13415 13426
13416 13427 if (!IPCL_ZONE_MATCH(connp, zoneid))
13417 13428 return (B_FALSE);
13418 13429
13419 13430 if (!(ira->ira_flags & IRAF_MULTICAST))
13420 13431 return (B_TRUE);
13421 13432
13422 13433 if (connp->conn_multi_router) {
13423 13434 /* multicast packet and multicast router socket: send up */
13424 13435 return (B_TRUE);
13425 13436 }
13426 13437
13427 13438 if (ipha->ipha_protocol == IPPROTO_PIM ||
13428 13439 ipha->ipha_protocol == IPPROTO_RSVP)
13429 13440 return (B_TRUE);
13430 13441
13431 13442 return (conn_hasmembers_ill_withsrc_v4(connp, dst, src, ira->ira_ill));
13432 13443 }
13433 13444
13434 13445 void
13435 13446 conn_setqfull(conn_t *connp, boolean_t *flow_stopped)
13436 13447 {
13437 13448 if (IPCL_IS_NONSTR(connp)) {
13438 13449 (*connp->conn_upcalls->su_txq_full)
13439 13450 (connp->conn_upper_handle, B_TRUE);
13440 13451 if (flow_stopped != NULL)
13441 13452 *flow_stopped = B_TRUE;
13442 13453 } else {
13443 13454 queue_t *q = connp->conn_wq;
13444 13455
13445 13456 ASSERT(q != NULL);
13446 13457 if (!(q->q_flag & QFULL)) {
13447 13458 mutex_enter(QLOCK(q));
13448 13459 if (!(q->q_flag & QFULL)) {
13449 13460 /* still need to set QFULL */
13450 13461 q->q_flag |= QFULL;
13451 13462 /* set flow_stopped to true under QLOCK */
13452 13463 if (flow_stopped != NULL)
13453 13464 *flow_stopped = B_TRUE;
13454 13465 mutex_exit(QLOCK(q));
13455 13466 } else {
13456 13467 /* flow_stopped is left unchanged */
13457 13468 mutex_exit(QLOCK(q));
13458 13469 }
13459 13470 }
13460 13471 }
13461 13472 }
13462 13473
13463 13474 void
13464 13475 conn_clrqfull(conn_t *connp, boolean_t *flow_stopped)
13465 13476 {
13466 13477 if (IPCL_IS_NONSTR(connp)) {
13467 13478 (*connp->conn_upcalls->su_txq_full)
13468 13479 (connp->conn_upper_handle, B_FALSE);
13469 13480 if (flow_stopped != NULL)
13470 13481 *flow_stopped = B_FALSE;
13471 13482 } else {
13472 13483 queue_t *q = connp->conn_wq;
13473 13484
13474 13485 ASSERT(q != NULL);
13475 13486 if (q->q_flag & QFULL) {
13476 13487 mutex_enter(QLOCK(q));
13477 13488 if (q->q_flag & QFULL) {
13478 13489 q->q_flag &= ~QFULL;
13479 13490 /* set flow_stopped to false under QLOCK */
13480 13491 if (flow_stopped != NULL)
13481 13492 *flow_stopped = B_FALSE;
13482 13493 mutex_exit(QLOCK(q));
13483 13494 if (q->q_flag & QWANTW)
13484 13495 qbackenable(q, 0);
13485 13496 } else {
13486 13497 /* flow_stopped is left unchanged */
13487 13498 mutex_exit(QLOCK(q));
13488 13499 }
13489 13500 }
13490 13501 }
13491 13502
13492 13503 mutex_enter(&connp->conn_lock);
13493 13504 connp->conn_blocked = B_FALSE;
13494 13505 mutex_exit(&connp->conn_lock);
13495 13506 }
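/*
 * Editorial sketch, not part of the original file: conn_setqfull() and
 * conn_clrqfull() above both use the double-checked locking idiom --
 * test q_flag without QLOCK to skip the common case cheaply, then
 * re-test under QLOCK before mutating, since the flag can change in
 * between. The set side condenses to:
 */
#if 0	/* illustrative only */
	if (!(q->q_flag & QFULL)) {		/* cheap, unlocked peek */
		mutex_enter(QLOCK(q));
		if (!(q->q_flag & QFULL))	/* authoritative re-check */
			q->q_flag |= QFULL;
		mutex_exit(QLOCK(q));
	}
#endif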
13496 13507
13497 13508 /*
13498 13509 * Return the length in bytes of the IPv4 headers (base header, label, and
13499 13510 * other IP options) that will be needed based on the
13500 13511 * ip_pkt_t structure passed by the caller.
13501 13512 *
13502 13513 * The returned length does not include the length of the upper level
13503 13514 * protocol (ULP) header.
13504 13515 * The caller needs to check that the length doesn't exceed the max for IPv4.
13505 13516 */
13506 13517 int
13507 13518 ip_total_hdrs_len_v4(const ip_pkt_t *ipp)
13508 13519 {
13509 13520 int len;
13510 13521
13511 13522 len = IP_SIMPLE_HDR_LENGTH;
13512 13523 if (ipp->ipp_fields & IPPF_LABEL_V4) {
13513 13524 ASSERT(ipp->ipp_label_len_v4 != 0);
13514 13525 /* We need to round up here */
13515 13526 len += (ipp->ipp_label_len_v4 + 3) & ~3;
13516 13527 }
13517 13528
13518 13529 if (ipp->ipp_fields & IPPF_IPV4_OPTIONS) {
13519 13530 ASSERT(ipp->ipp_ipv4_options_len != 0);
13520 13531 ASSERT((ipp->ipp_ipv4_options_len & 3) == 0);
13521 13532 len += ipp->ipp_ipv4_options_len;
13522 13533 }
13523 13534 return (len);
13524 13535 }
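
/*
 * Worked example of the rounding above: a 9-byte IPv4 label pads to the
 * next 4-byte boundary, (9 + 3) & ~3 == 12, so the total is 20 (base
 * header) + 12 = 32 bytes before any other IPv4 options are added.
 */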
13525 13536
13526 13537 /*
13527 13538 * All-purpose routine to build an IPv4 header with options based
13528 13539 * on the abstract ip_pkt_t.
13529 13540 *
13530 13541  * The caller has to set the source and destination address as well as
13531 13542  * ipha_length. The caller must also massage any source route and compensate
13532 13543  * for the ULP pseudo-header checksum due to the source route.
13533 13544 */
13534 13545 void
13535 13546 ip_build_hdrs_v4(uchar_t *buf, uint_t buf_len, const ip_pkt_t *ipp,
13536 13547 uint8_t protocol)
13537 13548 {
13538 13549 ipha_t *ipha = (ipha_t *)buf;
13539 13550 uint8_t *cp;
13540 13551
13541 13552 /* Initialize IPv4 header */
13542 13553 ipha->ipha_type_of_service = ipp->ipp_type_of_service;
13543 13554 ipha->ipha_length = 0; /* Caller will set later */
13544 13555 ipha->ipha_ident = 0;
13545 13556 ipha->ipha_fragment_offset_and_flags = 0;
13546 13557 ipha->ipha_ttl = ipp->ipp_unicast_hops;
13547 13558 ipha->ipha_protocol = protocol;
13548 13559 ipha->ipha_hdr_checksum = 0;
13549 13560
13550 13561 if ((ipp->ipp_fields & IPPF_ADDR) &&
13551 13562 IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
13552 13563 ipha->ipha_src = ipp->ipp_addr_v4;
13553 13564
13554 13565 cp = (uint8_t *)&ipha[1];
13555 13566 if (ipp->ipp_fields & IPPF_LABEL_V4) {
13556 13567 ASSERT(ipp->ipp_label_len_v4 != 0);
13557 13568 bcopy(ipp->ipp_label_v4, cp, ipp->ipp_label_len_v4);
13558 13569 cp += ipp->ipp_label_len_v4;
13559 13570 /* We need to round up here */
13560 13571 while ((uintptr_t)cp & 0x3) {
13561 13572 *cp++ = IPOPT_NOP;
13562 13573 }
13563 13574 }
13564 13575
13565 13576 if (ipp->ipp_fields & IPPF_IPV4_OPTIONS) {
13566 13577 ASSERT(ipp->ipp_ipv4_options_len != 0);
13567 13578 ASSERT((ipp->ipp_ipv4_options_len & 3) == 0);
13568 13579 bcopy(ipp->ipp_ipv4_options, cp, ipp->ipp_ipv4_options_len);
13569 13580 cp += ipp->ipp_ipv4_options_len;
13570 13581 }
13571 13582 ipha->ipha_version_and_hdr_length =
13572 13583 (uint8_t)((IP_VERSION << 4) + buf_len / 4);
13573 13584
13574 13585 ASSERT((int)(cp - buf) == buf_len);
13575 13586 }
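
/*
 * Illustrative sketch (hypothetical helper): size the header with
 * ip_total_hdrs_len_v4() and fill it in with ip_build_hdrs_v4().  Per
 * the comment above, the caller still owns ipha_src, ipha_dst and
 * ipha_length.
 */
#ifdef EXAMPLE_ONLY
static mblk_t *
example_build_v4(const ip_pkt_t *ipp, uint8_t proto)
{
	uint_t hdrlen = ip_total_hdrs_len_v4(ipp);
	mblk_t *mp = allocb(hdrlen, BPRI_MED);

	if (mp == NULL)
		return (NULL);
	mp->b_wptr = mp->b_rptr + hdrlen;
	ip_build_hdrs_v4(mp->b_rptr, hdrlen, ipp, proto);
	/* ipha_src, ipha_dst and ipha_length are set by the caller. */
	return (mp);
}
#endif /* EXAMPLE_ONLY */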
13576 13587
13577 13588 /* Allocate the private structure */
13578 13589 static int
13579 13590 ip_priv_alloc(void **bufp)
13580 13591 {
13581 13592 void *buf;
13582 13593
13583 13594 if ((buf = kmem_alloc(sizeof (ip_priv_t), KM_NOSLEEP)) == NULL)
13584 13595 return (ENOMEM);
13585 13596
13586 13597 *bufp = buf;
13587 13598 return (0);
13588 13599 }
13589 13600
13590 13601 /* Function to delete the private structure */
13591 13602 void
13592 13603 ip_priv_free(void *buf)
13593 13604 {
13594 13605 ASSERT(buf != NULL);
13595 13606 kmem_free(buf, sizeof (ip_priv_t));
13596 13607 }
13597 13608
13598 13609 /*
13599 13610 * The entry point for IPPF processing.
13600 13611 * If the classifier (IPGPC_CLASSIFY) is not loaded and configured, the
13601 13612 * routine just returns.
13602 13613 *
13603 13614  * When called, ip_process generates an ipp_packet_t structure
13604 13615  * which holds the state information for this packet and invokes
13605 13616  * the classifier (via ipp_packet_process). The classification, depending on
13606 13617  * configured filters, results in a list of actions for this packet. Invoking
13607 13618  * an action may cause the packet to be dropped, in which case we return NULL.
13608 13619  * proc indicates the callout position for this packet and ill is the
13609 13620  * interface this packet arrived on or will leave on (inbound and outbound
13610 13621  * respectively).
13611 13622  *
13612 13623  * We do the processing on the rill (mapped to the upper ill if ipmp), but
13613 13624  * update the MIB counters on the ill corresponding to the destination IP address.
13614 13625 */
13615 13626 mblk_t *
13616 13627 ip_process(ip_proc_t proc, mblk_t *mp, ill_t *rill, ill_t *ill)
13617 13628 {
13618 13629 ip_priv_t *priv;
13619 13630 ipp_action_id_t aid;
13620 13631 int rc = 0;
13621 13632 ipp_packet_t *pp;
13622 13633
13623 13634 /* If the classifier is not loaded, return */
13624 13635 if ((aid = ipp_action_lookup(IPGPC_CLASSIFY)) == IPP_ACTION_INVAL) {
13625 13636 return (mp);
13626 13637 }
13627 13638
13628 13639 ASSERT(mp != NULL);
13629 13640
13630 13641 /* Allocate the packet structure */
13631 13642 rc = ipp_packet_alloc(&pp, "ip", aid);
13632 13643 if (rc != 0)
13633 13644 goto drop;
13634 13645
13635 13646 /* Allocate the private structure */
13636 13647 rc = ip_priv_alloc((void **)&priv);
13637 13648 if (rc != 0) {
13638 13649 ipp_packet_free(pp);
13639 13650 goto drop;
13640 13651 }
13641 13652 priv->proc = proc;
13642 13653 priv->ill_index = ill_get_upper_ifindex(rill);
13643 13654
13644 13655 ipp_packet_set_private(pp, priv, ip_priv_free);
13645 13656 ipp_packet_set_data(pp, mp);
13646 13657
13647 13658 /* Invoke the classifier */
13648 13659 rc = ipp_packet_process(&pp);
13649 13660 if (pp != NULL) {
13650 13661 mp = ipp_packet_get_data(pp);
13651 13662 ipp_packet_free(pp);
13652 13663 if (rc != 0)
13653 13664 goto drop;
13654 13665 return (mp);
13655 13666 } else {
13656 13667 /* No mp to trace in ip_drop_input/ip_drop_output */
13657 13668 mp = NULL;
13658 13669 }
13659 13670 drop:
13660 13671 if (proc == IPP_LOCAL_IN || proc == IPP_FWD_IN) {
13661 13672 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
13662 13673 ip_drop_input("ip_process", mp, ill);
13663 13674 } else {
13664 13675 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
13665 13676 ip_drop_output("ip_process", mp, ill);
13666 13677 }
13667 13678 freemsg(mp);
13668 13679 return (NULL);
13669 13680 }
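
/*
 * Illustrative sketch (hypothetical call site): a receive-path caller
 * invokes ip_process() at its callout position and must treat a NULL
 * return as "packet already counted, dropped and freed".
 */
#ifdef EXAMPLE_ONLY
	mp = ip_process(IPP_LOCAL_IN, mp, ira->ira_rill, ira->ira_ill);
	if (mp == NULL)
		return;
#endif /* EXAMPLE_ONLY */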
13670 13681
13671 13682 /*
13672 13683 * Propagate a multicast group membership operation (add/drop) on
13673 13684 * all the interfaces crossed by the related multirt routes.
13674 13685 * The call is considered successful if the operation succeeds
13675 13686 * on at least one interface.
13676 13687 *
13677 13688 * This assumes that a set of IRE_HOST/RTF_MULTIRT has been created for the
13678 13689 * multicast addresses with the ire argument being the first one.
13679 13690  * We walk the bucket to find all of those.
13680 13691 *
13681 13692 * Common to IPv4 and IPv6.
13682 13693 */
13683 13694 static int
13684 13695 ip_multirt_apply_membership(int (*fn)(conn_t *, boolean_t,
13685 13696 const in6_addr_t *, ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *),
13686 13697 ire_t *ire, conn_t *connp, boolean_t checkonly, const in6_addr_t *v6group,
13687 13698 mcast_record_t fmode, const in6_addr_t *v6src)
13688 13699 {
13689 13700 ire_t *ire_gw;
13690 13701 irb_t *irb;
13691 13702 int ifindex;
13692 13703 int error = 0;
13693 13704 int result;
13694 13705 ip_stack_t *ipst = ire->ire_ipst;
13695 13706 ipaddr_t group;
13696 13707 boolean_t isv6;
13697 13708 int match_flags;
13698 13709
13699 13710 if (IN6_IS_ADDR_V4MAPPED(v6group)) {
13700 13711 IN6_V4MAPPED_TO_IPADDR(v6group, group);
13701 13712 isv6 = B_FALSE;
13702 13713 } else {
13703 13714 isv6 = B_TRUE;
13704 13715 }
13705 13716
13706 13717 irb = ire->ire_bucket;
13707 13718 ASSERT(irb != NULL);
13708 13719
13709 13720 result = 0;
13710 13721 irb_refhold(irb);
13711 13722 for (; ire != NULL; ire = ire->ire_next) {
13712 13723 if ((ire->ire_flags & RTF_MULTIRT) == 0)
13713 13724 continue;
13714 13725
13715 13726 /* We handle -ifp routes by matching on the ill if set */
13716 13727 match_flags = MATCH_IRE_TYPE;
13717 13728 if (ire->ire_ill != NULL)
13718 13729 match_flags |= MATCH_IRE_ILL;
13719 13730
13720 13731 if (isv6) {
13721 13732 if (!IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, v6group))
13722 13733 continue;
13723 13734
13724 13735 ire_gw = ire_ftable_lookup_v6(&ire->ire_gateway_addr_v6,
13725 13736 0, 0, IRE_INTERFACE, ire->ire_ill, ALL_ZONES, NULL,
13726 13737 match_flags, 0, ipst, NULL);
13727 13738 } else {
13728 13739 if (ire->ire_addr != group)
13729 13740 continue;
13730 13741
13731 13742 ire_gw = ire_ftable_lookup_v4(ire->ire_gateway_addr,
13732 13743 0, 0, IRE_INTERFACE, ire->ire_ill, ALL_ZONES, NULL,
13733 13744 match_flags, 0, ipst, NULL);
13734 13745 }
13735 13746 /* No interface route exists for the gateway; skip this ire. */
13736 13747 if (ire_gw == NULL)
13737 13748 continue;
13738 13749 if (ire_gw->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
13739 13750 ire_refrele(ire_gw);
13740 13751 continue;
13741 13752 }
13742 13753 ASSERT(ire_gw->ire_ill != NULL); /* IRE_INTERFACE */
13743 13754 ifindex = ire_gw->ire_ill->ill_phyint->phyint_ifindex;
13744 13755
13745 13756 /*
13746 13757 * The operation is considered a success if
13747 13758 * it succeeds at least once on any one interface.
13748 13759 */
13749 13760 error = fn(connp, checkonly, v6group, INADDR_ANY, ifindex,
13750 13761 fmode, v6src);
13751 13762 if (error == 0)
13752 13763 result = CGTP_MCAST_SUCCESS;
13753 13764
13754 13765 ire_refrele(ire_gw);
13755 13766 }
13756 13767 irb_refrele(irb);
13757 13768 /*
13758 13769 * Consider the call as successful if we succeeded on at least
13759 13770 * one interface. Otherwise, return the last encountered error.
13760 13771 */
13761 13772 return (result == CGTP_MCAST_SUCCESS ? 0 : error);
13762 13773 }
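
/*
 * Illustrative sketch (hypothetical callback): the fn argument above is
 * invoked once per resolved multirt interface with this signature; real
 * callers pass the multicast group add/delete primitives.
 */
#ifdef EXAMPLE_ONLY
static int
example_membership_fn(conn_t *connp, boolean_t checkonly,
    const in6_addr_t *v6group, ipaddr_t ifaddr, uint_t ifindex,
    mcast_record_t fmode, const in6_addr_t *v6src)
{
	/* Would join or leave v6group on the ill named by ifindex. */
	return (0);	/* 0 == success on this interface */
}
#endif /* EXAMPLE_ONLY */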
13763 13774
13764 13775 /*
13765 13776 * Return the expected CGTP hooks version number.
13766 13777 */
13767 13778 int
13768 13779 ip_cgtp_filter_supported(void)
13769 13780 {
13770 13781 return (ip_cgtp_filter_rev);
13771 13782 }
13772 13783
13773 13784 /*
13774 13785 * CGTP hooks can be registered by invoking this function.
13775 13786 * Checks that the version number matches.
13776 13787 */
13777 13788 int
13778 13789 ip_cgtp_filter_register(netstackid_t stackid, cgtp_filter_ops_t *ops)
13779 13790 {
13780 13791 netstack_t *ns;
13781 13792 ip_stack_t *ipst;
13782 13793
13783 13794 if (ops->cfo_filter_rev != CGTP_FILTER_REV)
13784 13795 return (ENOTSUP);
13785 13796
13786 13797 ns = netstack_find_by_stackid(stackid);
13787 13798 if (ns == NULL)
13788 13799 return (EINVAL);
13789 13800 ipst = ns->netstack_ip;
13790 13801 ASSERT(ipst != NULL);
13791 13802
13792 13803 if (ipst->ips_ip_cgtp_filter_ops != NULL) {
13793 13804 netstack_rele(ns);
13794 13805 return (EALREADY);
13795 13806 }
13796 13807
13797 13808 ipst->ips_ip_cgtp_filter_ops = ops;
13798 13809
13799 13810 ill_set_inputfn_all(ipst);
13800 13811
13801 13812 netstack_rele(ns);
13802 13813 return (0);
13803 13814 }
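
/*
 * Illustrative sketch (hypothetical module): registering CGTP hooks.
 * Only cfo_filter_rev is shown; the remaining cgtp_filter_ops_t
 * callbacks are elided here.
 */
#ifdef EXAMPLE_ONLY
static cgtp_filter_ops_t example_cgtp_ops = {
	CGTP_FILTER_REV,	/* cfo_filter_rev must match exactly */
	/* ... filter callbacks ... */
};

static int
example_cgtp_attach(netstackid_t stackid)
{
	/* ENOTSUP: revision mismatch; EALREADY: already registered. */
	return (ip_cgtp_filter_register(stackid, &example_cgtp_ops));
}
#endif /* EXAMPLE_ONLY */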
13804 13815
13805 13816 /*
13806 13817 * CGTP hooks can be unregistered by invoking this function.
13807 13818 * Returns ENXIO if there was no registration.
13808 13819 * Returns EBUSY if the ndd variable has not been turned off.
13809 13820 */
13810 13821 int
13811 13822 ip_cgtp_filter_unregister(netstackid_t stackid)
13812 13823 {
13813 13824 netstack_t *ns;
13814 13825 ip_stack_t *ipst;
13815 13826
13816 13827 ns = netstack_find_by_stackid(stackid);
13817 13828 if (ns == NULL)
13818 13829 return (EINVAL);
13819 13830 ipst = ns->netstack_ip;
13820 13831 ASSERT(ipst != NULL);
13821 13832
13822 13833 if (ipst->ips_ip_cgtp_filter) {
13823 13834 netstack_rele(ns);
13824 13835 return (EBUSY);
13825 13836 }
13826 13837
13827 13838 if (ipst->ips_ip_cgtp_filter_ops == NULL) {
13828 13839 netstack_rele(ns);
13829 13840 return (ENXIO);
13830 13841 }
13831 13842 ipst->ips_ip_cgtp_filter_ops = NULL;
13832 13843
13833 13844 ill_set_inputfn_all(ipst);
13834 13845
13835 13846 netstack_rele(ns);
13836 13847 return (0);
13837 13848 }
13838 13849
13839 13850 /*
13840 13851 * Check whether there is a CGTP filter registration.
13841 13852 * Returns non-zero if there is a registration, otherwise returns zero.
13842 13853 * Note: returns zero if bad stackid.
13843 13854 */
13844 13855 int
13845 13856 ip_cgtp_filter_is_registered(netstackid_t stackid)
13846 13857 {
13847 13858 netstack_t *ns;
13848 13859 ip_stack_t *ipst;
13849 13860 int ret;
13850 13861
13851 13862 ns = netstack_find_by_stackid(stackid);
13852 13863 if (ns == NULL)
13853 13864 return (0);
13854 13865 ipst = ns->netstack_ip;
13855 13866 ASSERT(ipst != NULL);
13856 13867
13857 13868 if (ipst->ips_ip_cgtp_filter_ops != NULL)
13858 13869 ret = 1;
13859 13870 else
13860 13871 ret = 0;
13861 13872
13862 13873 netstack_rele(ns);
13863 13874 return (ret);
13864 13875 }
13865 13876
13866 13877 static int
13867 13878 ip_squeue_switch(int val)
13868 13879 {
13869 13880 int rval;
13870 13881
13871 13882 switch (val) {
13872 13883 case IP_SQUEUE_ENTER_NODRAIN:
13873 13884 rval = SQ_NODRAIN;
13874 13885 break;
13875 13886 case IP_SQUEUE_ENTER:
13876 13887 rval = SQ_PROCESS;
13877 13888 break;
13878 13889 case IP_SQUEUE_FILL:
13879 13890 default:
13880 13891 rval = SQ_FILL;
13881 13892 break;
13882 13893 }
13883 13894 return (rval);
13884 13895 }
13885 13896
13886 13897 static void *
13887 13898 ip_kstat2_init(netstackid_t stackid, ip_stat_t *ip_statisticsp)
13888 13899 {
13889 13900 kstat_t *ksp;
13890 13901
13891 13902 ip_stat_t template = {
13892 13903 { "ip_udp_fannorm", KSTAT_DATA_UINT64 },
13893 13904 { "ip_udp_fanmb", KSTAT_DATA_UINT64 },
13894 13905 { "ip_recv_pullup", KSTAT_DATA_UINT64 },
13895 13906 { "ip_db_ref", KSTAT_DATA_UINT64 },
13896 13907 { "ip_notaligned", KSTAT_DATA_UINT64 },
13897 13908 { "ip_multimblk", KSTAT_DATA_UINT64 },
13898 13909 { "ip_opt", KSTAT_DATA_UINT64 },
13899 13910 { "ipsec_proto_ahesp", KSTAT_DATA_UINT64 },
13900 13911 { "ip_conn_flputbq", KSTAT_DATA_UINT64 },
13901 13912 { "ip_conn_walk_drain", KSTAT_DATA_UINT64 },
13902 13913 { "ip_out_sw_cksum", KSTAT_DATA_UINT64 },
13903 13914 { "ip_out_sw_cksum_bytes", KSTAT_DATA_UINT64 },
13904 13915 { "ip_in_sw_cksum", KSTAT_DATA_UINT64 },
13905 13916 { "ip_ire_reclaim_calls", KSTAT_DATA_UINT64 },
13906 13917 { "ip_ire_reclaim_deleted", KSTAT_DATA_UINT64 },
13907 13918 { "ip_nce_reclaim_calls", KSTAT_DATA_UINT64 },
13908 13919 { "ip_nce_reclaim_deleted", KSTAT_DATA_UINT64 },
13909 13920 { "ip_dce_reclaim_calls", KSTAT_DATA_UINT64 },
13910 13921 { "ip_dce_reclaim_deleted", KSTAT_DATA_UINT64 },
13911 13922 { "ip_tcp_in_full_hw_cksum_err", KSTAT_DATA_UINT64 },
13912 13923 { "ip_tcp_in_part_hw_cksum_err", KSTAT_DATA_UINT64 },
13913 13924 { "ip_tcp_in_sw_cksum_err", KSTAT_DATA_UINT64 },
13914 13925 { "ip_udp_in_full_hw_cksum_err", KSTAT_DATA_UINT64 },
13915 13926 { "ip_udp_in_part_hw_cksum_err", KSTAT_DATA_UINT64 },
13916 13927 { "ip_udp_in_sw_cksum_err", KSTAT_DATA_UINT64 },
13917 13928 { "conn_in_recvdstaddr", KSTAT_DATA_UINT64 },
13918 13929 { "conn_in_recvopts", KSTAT_DATA_UINT64 },
13919 13930 { "conn_in_recvif", KSTAT_DATA_UINT64 },
13920 13931 { "conn_in_recvslla", KSTAT_DATA_UINT64 },
13921 13932 { "conn_in_recvucred", KSTAT_DATA_UINT64 },
13922 13933 { "conn_in_recvttl", KSTAT_DATA_UINT64 },
13923 13934 { "conn_in_recvhopopts", KSTAT_DATA_UINT64 },
13924 13935 { "conn_in_recvhoplimit", KSTAT_DATA_UINT64 },
13925 13936 { "conn_in_recvdstopts", KSTAT_DATA_UINT64 },
13926 13937 { "conn_in_recvrthdrdstopts", KSTAT_DATA_UINT64 },
13927 13938 { "conn_in_recvrthdr", KSTAT_DATA_UINT64 },
13928 13939 { "conn_in_recvpktinfo", KSTAT_DATA_UINT64 },
13929 13940 { "conn_in_recvtclass", KSTAT_DATA_UINT64 },
13930 13941 { "conn_in_timestamp", KSTAT_DATA_UINT64 },
13931 13942 };
13932 13943
13933 13944 ksp = kstat_create_netstack("ip", 0, "ipstat", "net",
13934 13945 KSTAT_TYPE_NAMED, sizeof (template) / sizeof (kstat_named_t),
13935 13946 KSTAT_FLAG_VIRTUAL, stackid);
13936 13947
13937 13948 if (ksp == NULL)
13938 13949 return (NULL);
13939 13950
13940 13951 bcopy(&template, ip_statisticsp, sizeof (template));
13941 13952 ksp->ks_data = (void *)ip_statisticsp;
13942 13953 ksp->ks_private = (void *)(uintptr_t)stackid;
13943 13954
13944 13955 kstat_install(ksp);
13945 13956 return (ksp);
13946 13957 }
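
/*
 * Illustrative sketch (userland, not part of this file): the virtual
 * "ip:0:ipstat" kstat created above can be read with libkstat; the
 * wrapper function is hypothetical, the libkstat calls are real.
 */
#ifdef EXAMPLE_ONLY
#include <kstat.h>

uint64_t
example_read_ip_opt(void)
{
	kstat_ctl_t *kc = kstat_open();
	kstat_t *ksp;
	kstat_named_t *kn;
	uint64_t val = 0;

	if (kc == NULL)
		return (0);
	if ((ksp = kstat_lookup(kc, "ip", 0, "ipstat")) != NULL &&
	    kstat_read(kc, ksp, NULL) != -1 &&
	    (kn = kstat_data_lookup(ksp, "ip_opt")) != NULL)
		val = kn->value.ui64;
	(void) kstat_close(kc);
	return (val);
}
#endif /* EXAMPLE_ONLY */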
13947 13958
13948 13959 static void
13949 13960 ip_kstat2_fini(netstackid_t stackid, kstat_t *ksp)
13950 13961 {
13951 13962 if (ksp != NULL) {
13952 13963 ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private);
13953 13964 kstat_delete_netstack(ksp, stackid);
13954 13965 }
13955 13966 }
13956 13967
13957 13968 static void *
13958 13969 ip_kstat_init(netstackid_t stackid, ip_stack_t *ipst)
13959 13970 {
13960 13971 kstat_t *ksp;
13961 13972
13962 13973 ip_named_kstat_t template = {
13963 13974 { "forwarding", KSTAT_DATA_UINT32, 0 },
13964 13975 { "defaultTTL", KSTAT_DATA_UINT32, 0 },
13965 13976 { "inReceives", KSTAT_DATA_UINT64, 0 },
13966 13977 { "inHdrErrors", KSTAT_DATA_UINT32, 0 },
13967 13978 { "inAddrErrors", KSTAT_DATA_UINT32, 0 },
13968 13979 { "forwDatagrams", KSTAT_DATA_UINT64, 0 },
13969 13980 { "inUnknownProtos", KSTAT_DATA_UINT32, 0 },
13970 13981 { "inDiscards", KSTAT_DATA_UINT32, 0 },
13971 13982 { "inDelivers", KSTAT_DATA_UINT64, 0 },
13972 13983 { "outRequests", KSTAT_DATA_UINT64, 0 },
13973 13984 { "outDiscards", KSTAT_DATA_UINT32, 0 },
13974 13985 { "outNoRoutes", KSTAT_DATA_UINT32, 0 },
13975 13986 { "reasmTimeout", KSTAT_DATA_UINT32, 0 },
13976 13987 { "reasmReqds", KSTAT_DATA_UINT32, 0 },
13977 13988 { "reasmOKs", KSTAT_DATA_UINT32, 0 },
13978 13989 { "reasmFails", KSTAT_DATA_UINT32, 0 },
13979 13990 { "fragOKs", KSTAT_DATA_UINT32, 0 },
13980 13991 { "fragFails", KSTAT_DATA_UINT32, 0 },
13981 13992 { "fragCreates", KSTAT_DATA_UINT32, 0 },
13982 13993 { "addrEntrySize", KSTAT_DATA_INT32, 0 },
13983 13994 { "routeEntrySize", KSTAT_DATA_INT32, 0 },
13984 13995 { "netToMediaEntrySize", KSTAT_DATA_INT32, 0 },
13985 13996 { "routingDiscards", KSTAT_DATA_UINT32, 0 },
13986 13997 { "inErrs", KSTAT_DATA_UINT32, 0 },
13987 13998 { "noPorts", KSTAT_DATA_UINT32, 0 },
13988 13999 { "inCksumErrs", KSTAT_DATA_UINT32, 0 },
13989 14000 { "reasmDuplicates", KSTAT_DATA_UINT32, 0 },
13990 14001 { "reasmPartDups", KSTAT_DATA_UINT32, 0 },
13991 14002 { "forwProhibits", KSTAT_DATA_UINT32, 0 },
13992 14003 { "udpInCksumErrs", KSTAT_DATA_UINT32, 0 },
13993 14004 { "udpInOverflows", KSTAT_DATA_UINT32, 0 },
13994 14005 { "rawipInOverflows", KSTAT_DATA_UINT32, 0 },
13995 14006 { "ipsecInSucceeded", KSTAT_DATA_UINT32, 0 },
13996 14007 { "ipsecInFailed", KSTAT_DATA_INT32, 0 },
13997 14008 { "memberEntrySize", KSTAT_DATA_INT32, 0 },
13998 14009 { "inIPv6", KSTAT_DATA_UINT32, 0 },
13999 14010 { "outIPv6", KSTAT_DATA_UINT32, 0 },
14000 14011 { "outSwitchIPv6", KSTAT_DATA_UINT32, 0 },
14001 14012 };
14002 14013
14003 14014 ksp = kstat_create_netstack("ip", 0, "ip", "mib2", KSTAT_TYPE_NAMED,
14004 14015 NUM_OF_FIELDS(ip_named_kstat_t), 0, stackid);
14005 14016 if (ksp == NULL || ksp->ks_data == NULL)
14006 14017 return (NULL);
14007 14018
14008 14019 template.forwarding.value.ui32 = WE_ARE_FORWARDING(ipst) ? 1:2;
14009 14020 template.defaultTTL.value.ui32 = (uint32_t)ipst->ips_ip_def_ttl;
14010 14021 template.reasmTimeout.value.ui32 = ipst->ips_ip_reassembly_timeout;
14011 14022 template.addrEntrySize.value.i32 = sizeof (mib2_ipAddrEntry_t);
14012 14023 template.routeEntrySize.value.i32 = sizeof (mib2_ipRouteEntry_t);
14013 14024
14014 14025 template.netToMediaEntrySize.value.i32 =
14015 14026 sizeof (mib2_ipNetToMediaEntry_t);
14016 14027
14017 14028 template.memberEntrySize.value.i32 = sizeof (ipv6_member_t);
14018 14029
14019 14030 bcopy(&template, ksp->ks_data, sizeof (template));
14020 14031 ksp->ks_update = ip_kstat_update;
14021 14032 ksp->ks_private = (void *)(uintptr_t)stackid;
14022 14033
14023 14034 kstat_install(ksp);
14024 14035 return (ksp);
14025 14036 }
14026 14037
14027 14038 static void
14028 14039 ip_kstat_fini(netstackid_t stackid, kstat_t *ksp)
14029 14040 {
14030 14041 if (ksp != NULL) {
14031 14042 ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private);
14032 14043 kstat_delete_netstack(ksp, stackid);
14033 14044 }
14034 14045 }
14035 14046
14036 14047 static int
14037 14048 ip_kstat_update(kstat_t *kp, int rw)
14038 14049 {
14039 14050 ip_named_kstat_t *ipkp;
14040 14051 mib2_ipIfStatsEntry_t ipmib;
14041 14052 ill_walk_context_t ctx;
14042 14053 ill_t *ill;
14043 14054 netstackid_t stackid = (zoneid_t)(uintptr_t)kp->ks_private;
14044 14055 netstack_t *ns;
14045 14056 ip_stack_t *ipst;
14046 14057
14047 14058 if (kp == NULL || kp->ks_data == NULL)
14048 14059 return (EIO);
14049 14060
14050 14061 if (rw == KSTAT_WRITE)
14051 14062 return (EACCES);
14052 14063
14053 14064 ns = netstack_find_by_stackid(stackid);
14054 14065 if (ns == NULL)
14055 14066 return (-1);
14056 14067 ipst = ns->netstack_ip;
14057 14068 if (ipst == NULL) {
14058 14069 netstack_rele(ns);
14059 14070 return (-1);
14060 14071 }
14061 14072 ipkp = (ip_named_kstat_t *)kp->ks_data;
14062 14073
14063 14074 bcopy(&ipst->ips_ip_mib, &ipmib, sizeof (ipmib));
14064 14075 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
14065 14076 ill = ILL_START_WALK_V4(&ctx, ipst);
14066 14077 for (; ill != NULL; ill = ill_next(&ctx, ill))
14067 14078 ip_mib2_add_ip_stats(&ipmib, ill->ill_ip_mib);
14068 14079 rw_exit(&ipst->ips_ill_g_lock);
14069 14080
14070 14081 ipkp->forwarding.value.ui32 = ipmib.ipIfStatsForwarding;
14071 14082 ipkp->defaultTTL.value.ui32 = ipmib.ipIfStatsDefaultTTL;
14072 14083 ipkp->inReceives.value.ui64 = ipmib.ipIfStatsHCInReceives;
14073 14084 ipkp->inHdrErrors.value.ui32 = ipmib.ipIfStatsInHdrErrors;
14074 14085 ipkp->inAddrErrors.value.ui32 = ipmib.ipIfStatsInAddrErrors;
14075 14086 ipkp->forwDatagrams.value.ui64 = ipmib.ipIfStatsHCOutForwDatagrams;
14076 14087 ipkp->inUnknownProtos.value.ui32 = ipmib.ipIfStatsInUnknownProtos;
14077 14088 ipkp->inDiscards.value.ui32 = ipmib.ipIfStatsInDiscards;
14078 14089 ipkp->inDelivers.value.ui64 = ipmib.ipIfStatsHCInDelivers;
14079 14090 ipkp->outRequests.value.ui64 = ipmib.ipIfStatsHCOutRequests;
14080 14091 ipkp->outDiscards.value.ui32 = ipmib.ipIfStatsOutDiscards;
14081 14092 ipkp->outNoRoutes.value.ui32 = ipmib.ipIfStatsOutNoRoutes;
14082 14093 ipkp->reasmTimeout.value.ui32 = ipst->ips_ip_reassembly_timeout;
14083 14094 ipkp->reasmReqds.value.ui32 = ipmib.ipIfStatsReasmReqds;
14084 14095 ipkp->reasmOKs.value.ui32 = ipmib.ipIfStatsReasmOKs;
14085 14096 ipkp->reasmFails.value.ui32 = ipmib.ipIfStatsReasmFails;
14086 14097 ipkp->fragOKs.value.ui32 = ipmib.ipIfStatsOutFragOKs;
14087 14098 ipkp->fragFails.value.ui32 = ipmib.ipIfStatsOutFragFails;
14088 14099 ipkp->fragCreates.value.ui32 = ipmib.ipIfStatsOutFragCreates;
14089 14100
14090 14101 ipkp->routingDiscards.value.ui32 = 0;
14091 14102 ipkp->inErrs.value.ui32 = ipmib.tcpIfStatsInErrs;
14092 14103 ipkp->noPorts.value.ui32 = ipmib.udpIfStatsNoPorts;
14093 14104 ipkp->inCksumErrs.value.ui32 = ipmib.ipIfStatsInCksumErrs;
14094 14105 ipkp->reasmDuplicates.value.ui32 = ipmib.ipIfStatsReasmDuplicates;
14095 14106 ipkp->reasmPartDups.value.ui32 = ipmib.ipIfStatsReasmPartDups;
14096 14107 ipkp->forwProhibits.value.ui32 = ipmib.ipIfStatsForwProhibits;
14097 14108 ipkp->udpInCksumErrs.value.ui32 = ipmib.udpIfStatsInCksumErrs;
14098 14109 ipkp->udpInOverflows.value.ui32 = ipmib.udpIfStatsInOverflows;
14099 14110 ipkp->rawipInOverflows.value.ui32 = ipmib.rawipIfStatsInOverflows;
14100 14111 ipkp->ipsecInSucceeded.value.ui32 = ipmib.ipsecIfStatsInSucceeded;
14101 14112 ipkp->ipsecInFailed.value.i32 = ipmib.ipsecIfStatsInFailed;
14102 14113
14103 14114 ipkp->inIPv6.value.ui32 = ipmib.ipIfStatsInWrongIPVersion;
14104 14115 ipkp->outIPv6.value.ui32 = ipmib.ipIfStatsOutWrongIPVersion;
14105 14116 ipkp->outSwitchIPv6.value.ui32 = ipmib.ipIfStatsOutSwitchIPVersion;
14106 14117
14107 14118 netstack_rele(ns);
14108 14119
14109 14120 return (0);
14110 14121 }
14111 14122
14112 14123 static void *
14113 14124 icmp_kstat_init(netstackid_t stackid)
14114 14125 {
14115 14126 kstat_t *ksp;
14116 14127
14117 14128 icmp_named_kstat_t template = {
14118 14129 { "inMsgs", KSTAT_DATA_UINT32 },
14119 14130 { "inErrors", KSTAT_DATA_UINT32 },
14120 14131 { "inDestUnreachs", KSTAT_DATA_UINT32 },
14121 14132 { "inTimeExcds", KSTAT_DATA_UINT32 },
14122 14133 { "inParmProbs", KSTAT_DATA_UINT32 },
14123 14134 { "inSrcQuenchs", KSTAT_DATA_UINT32 },
14124 14135 { "inRedirects", KSTAT_DATA_UINT32 },
14125 14136 { "inEchos", KSTAT_DATA_UINT32 },
14126 14137 { "inEchoReps", KSTAT_DATA_UINT32 },
14127 14138 { "inTimestamps", KSTAT_DATA_UINT32 },
14128 14139 { "inTimestampReps", KSTAT_DATA_UINT32 },
14129 14140 { "inAddrMasks", KSTAT_DATA_UINT32 },
14130 14141 { "inAddrMaskReps", KSTAT_DATA_UINT32 },
14131 14142 { "outMsgs", KSTAT_DATA_UINT32 },
14132 14143 { "outErrors", KSTAT_DATA_UINT32 },
14133 14144 { "outDestUnreachs", KSTAT_DATA_UINT32 },
14134 14145 { "outTimeExcds", KSTAT_DATA_UINT32 },
14135 14146 { "outParmProbs", KSTAT_DATA_UINT32 },
14136 14147 { "outSrcQuenchs", KSTAT_DATA_UINT32 },
14137 14148 { "outRedirects", KSTAT_DATA_UINT32 },
14138 14149 { "outEchos", KSTAT_DATA_UINT32 },
14139 14150 { "outEchoReps", KSTAT_DATA_UINT32 },
14140 14151 { "outTimestamps", KSTAT_DATA_UINT32 },
14141 14152 { "outTimestampReps", KSTAT_DATA_UINT32 },
14142 14153 { "outAddrMasks", KSTAT_DATA_UINT32 },
14143 14154 { "outAddrMaskReps", KSTAT_DATA_UINT32 },
14144 14155 { "inChksumErrs", KSTAT_DATA_UINT32 },
14145 14156 { "inUnknowns", KSTAT_DATA_UINT32 },
14146 14157 { "inFragNeeded", KSTAT_DATA_UINT32 },
14147 14158 { "outFragNeeded", KSTAT_DATA_UINT32 },
14148 14159 { "outDrops", KSTAT_DATA_UINT32 },
14149 14160 { "inOverFlows", KSTAT_DATA_UINT32 },
14150 14161 { "inBadRedirects", KSTAT_DATA_UINT32 },
14151 14162 };
14152 14163
14153 14164 ksp = kstat_create_netstack("ip", 0, "icmp", "mib2", KSTAT_TYPE_NAMED,
14154 14165 NUM_OF_FIELDS(icmp_named_kstat_t), 0, stackid);
14155 14166 if (ksp == NULL || ksp->ks_data == NULL)
14156 14167 return (NULL);
14157 14168
14158 14169 bcopy(&template, ksp->ks_data, sizeof (template));
14159 14170
14160 14171 ksp->ks_update = icmp_kstat_update;
14161 14172 ksp->ks_private = (void *)(uintptr_t)stackid;
14162 14173
14163 14174 kstat_install(ksp);
14164 14175 return (ksp);
14165 14176 }
14166 14177
14167 14178 static void
14168 14179 icmp_kstat_fini(netstackid_t stackid, kstat_t *ksp)
14169 14180 {
14170 14181 if (ksp != NULL) {
14171 14182 ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private);
14172 14183 kstat_delete_netstack(ksp, stackid);
14173 14184 }
14174 14185 }
14175 14186
14176 14187 static int
14177 14188 icmp_kstat_update(kstat_t *kp, int rw)
14178 14189 {
14179 14190 icmp_named_kstat_t *icmpkp;
14180 14191 netstackid_t stackid = (zoneid_t)(uintptr_t)kp->ks_private;
14181 14192 netstack_t *ns;
14182 14193 ip_stack_t *ipst;
14183 14194
14184 14195 if ((kp == NULL) || (kp->ks_data == NULL))
14185 14196 return (EIO);
14186 14197
14187 14198 if (rw == KSTAT_WRITE)
14188 14199 return (EACCES);
14189 14200
14190 14201 ns = netstack_find_by_stackid(stackid);
14191 14202 if (ns == NULL)
14192 14203 return (-1);
14193 14204 ipst = ns->netstack_ip;
14194 14205 if (ipst == NULL) {
14195 14206 netstack_rele(ns);
14196 14207 return (-1);
14197 14208 }
14198 14209 icmpkp = (icmp_named_kstat_t *)kp->ks_data;
14199 14210
14200 14211 icmpkp->inMsgs.value.ui32 = ipst->ips_icmp_mib.icmpInMsgs;
14201 14212 icmpkp->inErrors.value.ui32 = ipst->ips_icmp_mib.icmpInErrors;
14202 14213 icmpkp->inDestUnreachs.value.ui32 =
14203 14214 ipst->ips_icmp_mib.icmpInDestUnreachs;
14204 14215 icmpkp->inTimeExcds.value.ui32 = ipst->ips_icmp_mib.icmpInTimeExcds;
14205 14216 icmpkp->inParmProbs.value.ui32 = ipst->ips_icmp_mib.icmpInParmProbs;
14206 14217 icmpkp->inSrcQuenchs.value.ui32 = ipst->ips_icmp_mib.icmpInSrcQuenchs;
14207 14218 icmpkp->inRedirects.value.ui32 = ipst->ips_icmp_mib.icmpInRedirects;
14208 14219 icmpkp->inEchos.value.ui32 = ipst->ips_icmp_mib.icmpInEchos;
14209 14220 icmpkp->inEchoReps.value.ui32 = ipst->ips_icmp_mib.icmpInEchoReps;
14210 14221 icmpkp->inTimestamps.value.ui32 = ipst->ips_icmp_mib.icmpInTimestamps;
14211 14222 icmpkp->inTimestampReps.value.ui32 =
14212 14223 ipst->ips_icmp_mib.icmpInTimestampReps;
14213 14224 icmpkp->inAddrMasks.value.ui32 = ipst->ips_icmp_mib.icmpInAddrMasks;
14214 14225 icmpkp->inAddrMaskReps.value.ui32 =
14215 14226 ipst->ips_icmp_mib.icmpInAddrMaskReps;
14216 14227 icmpkp->outMsgs.value.ui32 = ipst->ips_icmp_mib.icmpOutMsgs;
14217 14228 icmpkp->outErrors.value.ui32 = ipst->ips_icmp_mib.icmpOutErrors;
14218 14229 icmpkp->outDestUnreachs.value.ui32 =
14219 14230 ipst->ips_icmp_mib.icmpOutDestUnreachs;
14220 14231 icmpkp->outTimeExcds.value.ui32 = ipst->ips_icmp_mib.icmpOutTimeExcds;
14221 14232 icmpkp->outParmProbs.value.ui32 = ipst->ips_icmp_mib.icmpOutParmProbs;
14222 14233 icmpkp->outSrcQuenchs.value.ui32 =
14223 14234 ipst->ips_icmp_mib.icmpOutSrcQuenchs;
14224 14235 icmpkp->outRedirects.value.ui32 = ipst->ips_icmp_mib.icmpOutRedirects;
14225 14236 icmpkp->outEchos.value.ui32 = ipst->ips_icmp_mib.icmpOutEchos;
14226 14237 icmpkp->outEchoReps.value.ui32 = ipst->ips_icmp_mib.icmpOutEchoReps;
14227 14238 icmpkp->outTimestamps.value.ui32 =
14228 14239 ipst->ips_icmp_mib.icmpOutTimestamps;
14229 14240 icmpkp->outTimestampReps.value.ui32 =
14230 14241 ipst->ips_icmp_mib.icmpOutTimestampReps;
14231 14242 icmpkp->outAddrMasks.value.ui32 =
14232 14243 ipst->ips_icmp_mib.icmpOutAddrMasks;
14233 14244 icmpkp->outAddrMaskReps.value.ui32 =
14234 14245 ipst->ips_icmp_mib.icmpOutAddrMaskReps;
14235 14246 icmpkp->inCksumErrs.value.ui32 = ipst->ips_icmp_mib.icmpInCksumErrs;
14236 14247 icmpkp->inUnknowns.value.ui32 = ipst->ips_icmp_mib.icmpInUnknowns;
14237 14248 icmpkp->inFragNeeded.value.ui32 = ipst->ips_icmp_mib.icmpInFragNeeded;
14238 14249 icmpkp->outFragNeeded.value.ui32 =
14239 14250 ipst->ips_icmp_mib.icmpOutFragNeeded;
14240 14251 icmpkp->outDrops.value.ui32 = ipst->ips_icmp_mib.icmpOutDrops;
14241 14252 icmpkp->inOverflows.value.ui32 = ipst->ips_icmp_mib.icmpInOverflows;
14242 14253 icmpkp->inBadRedirects.value.ui32 =
14243 14254 ipst->ips_icmp_mib.icmpInBadRedirects;
14244 14255
14245 14256 netstack_rele(ns);
14246 14257 return (0);
14247 14258 }
14248 14259
14249 14260 /*
14250 14261  * This is the fanout function for a raw socket opened for SCTP. Note
14251 14262  * that it is called after SCTP checks that there is no socket which
14252 14263  * wants the packet. Then, before SCTP handles this out-of-the-blue packet,
14253 14264  * this function is called to see if there is any raw socket for SCTP.
14254 14265  * If there is and it is bound to the correct address, the packet will
14255 14266  * be sent to that socket. Note that only one raw socket can be bound to
14256 14267  * a port. This is assured in ipcl_sctp_hash_insert().
14257 14268 */
14258 14269 void
14259 14270 ip_fanout_sctp_raw(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h, uint32_t ports,
14260 14271 ip_recv_attr_t *ira)
14261 14272 {
14262 14273 conn_t *connp;
14263 14274 queue_t *rq;
14264 14275 boolean_t secure;
14265 14276 ill_t *ill = ira->ira_ill;
14266 14277 ip_stack_t *ipst = ill->ill_ipst;
14267 14278 ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec;
14268 14279 sctp_stack_t *sctps = ipst->ips_netstack->netstack_sctp;
14269 14280 iaflags_t iraflags = ira->ira_flags;
14270 14281 ill_t *rill = ira->ira_rill;
14271 14282
14272 14283 secure = iraflags & IRAF_IPSEC_SECURE;
14273 14284
14274 14285 connp = ipcl_classify_raw(mp, IPPROTO_SCTP, ports, ipha, ip6h,
14275 14286 ira, ipst);
14276 14287 if (connp == NULL) {
14277 14288 /*
14278 14289 		 * Although raw sctp is not checksummed, OOB chunks must be.
14279 14290 * Drop the packet here if the sctp checksum failed.
14280 14291 */
14281 14292 if (iraflags & IRAF_SCTP_CSUM_ERR) {
14282 14293 SCTPS_BUMP_MIB(sctps, sctpChecksumError);
14283 14294 freemsg(mp);
14284 14295 return;
14285 14296 }
14286 14297 ira->ira_ill = ira->ira_rill = NULL;
14287 14298 sctp_ootb_input(mp, ira, ipst);
14288 14299 ira->ira_ill = ill;
14289 14300 ira->ira_rill = rill;
14290 14301 return;
14291 14302 }
14292 14303 rq = connp->conn_rq;
14293 14304 if (IPCL_IS_NONSTR(connp) ? connp->conn_flow_cntrld : !canputnext(rq)) {
14294 14305 CONN_DEC_REF(connp);
14295 14306 BUMP_MIB(ill->ill_ip_mib, rawipIfStatsInOverflows);
14296 14307 freemsg(mp);
14297 14308 return;
14298 14309 }
14299 14310 if (((iraflags & IRAF_IS_IPV4) ?
14300 14311 CONN_INBOUND_POLICY_PRESENT(connp, ipss) :
14301 14312 CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss)) ||
14302 14313 secure) {
14303 14314 mp = ipsec_check_inbound_policy(mp, connp, ipha,
14304 14315 ip6h, ira);
14305 14316 if (mp == NULL) {
14306 14317 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
14307 14318 /* Note that mp is NULL */
14308 14319 ip_drop_input("ipIfStatsInDiscards", mp, ill);
14309 14320 CONN_DEC_REF(connp);
14310 14321 return;
14311 14322 }
14312 14323 }
14313 14324
14314 14325 if (iraflags & IRAF_ICMP_ERROR) {
14315 14326 (connp->conn_recvicmp)(connp, mp, NULL, ira);
14316 14327 } else {
14317 14328 ill_t *rill = ira->ira_rill;
14318 14329
14319 14330 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
14320 14331 /* This is the SOCK_RAW, IPPROTO_SCTP case. */
14321 14332 ira->ira_ill = ira->ira_rill = NULL;
14322 14333 (connp->conn_recv)(connp, mp, NULL, ira);
14323 14334 ira->ira_ill = ill;
14324 14335 ira->ira_rill = rill;
14325 14336 }
14326 14337 CONN_DEC_REF(connp);
14327 14338 }
14328 14339
14329 14340 /*
14330 14341 * Free a packet that has the link-layer dl_unitdata_req_t or fast-path
14331 14342 * header before the ip payload.
14332 14343 */
14333 14344 static void
14334 14345 ip_xmit_flowctl_drop(ill_t *ill, mblk_t *mp, boolean_t is_fp_mp, int fp_mp_len)
14335 14346 {
14336 14347 int len = (mp->b_wptr - mp->b_rptr);
14337 14348 mblk_t *ip_mp;
14338 14349
14339 14350 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
14340 14351 if (is_fp_mp || len != fp_mp_len) {
14341 14352 if (len > fp_mp_len) {
14342 14353 /*
14343 14354 * fastpath header and ip header in the first mblk
14344 14355 */
14345 14356 mp->b_rptr += fp_mp_len;
14346 14357 } else {
14347 14358 /*
14348 14359 			 * ip_xmit_attach_llhdr had to prepend an mblk to
14349 14360 			 * attach the fastpath header before the ip header.
14350 14361 */
14351 14362 ip_mp = mp->b_cont;
14352 14363 freeb(mp);
14353 14364 mp = ip_mp;
14354 14365 mp->b_rptr += (fp_mp_len - len);
14355 14366 }
14356 14367 } else {
14357 14368 ip_mp = mp->b_cont;
14358 14369 freeb(mp);
14359 14370 mp = ip_mp;
14360 14371 }
14361 14372 ip_drop_output("ipIfStatsOutDiscards - flow ctl", mp, ill);
14362 14373 freemsg(mp);
14363 14374 }
14364 14375
14365 14376 /*
14366 14377 * Normal post fragmentation function.
14367 14378 *
14368 14379 * Send a packet using the passed in nce. This handles both IPv4 and IPv6
14369 14380 * using the same state machine.
14370 14381 *
14371 14382  * We return an error on failure. In particular we return EWOULDBLOCK
14372 14383  * when the driver flow controls. In that case this ensures that ip_wsrv runs
14373 14384  * (currently by canputnext failure resulting in backenabling from GLD.)
14374 14385  * This allows the callers of conn_ip_output() to use EWOULDBLOCK as an
14375 14386  * indication that they can flow control until ip_wsrv() tells them to restart.
14376 14387 *
14377 14388  * If the nce passed by the caller is incomplete, this function
14378 14389  * queues the packet and, if necessary, sends an ARP request and bails.
14379 14390  * If the Neighbor Cache entry passed is fully resolved, we simply prepend
14380 14391 * the link-layer header to the packet, do ipsec hw acceleration
14381 14392 * work if necessary, and send the packet out on the wire.
14382 14393 */
14383 14394 /* ARGSUSED6 */
14384 14395 int
14385 14396 ip_xmit(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, uint_t pkt_len,
14386 14397 uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid, uintptr_t *ixacookie)
14387 14398 {
14388 14399 queue_t *wq;
14389 14400 ill_t *ill = nce->nce_ill;
14390 14401 ip_stack_t *ipst = ill->ill_ipst;
14391 14402 uint64_t delta;
14392 14403 boolean_t isv6 = ill->ill_isv6;
14393 14404 boolean_t fp_mp;
14394 14405 ncec_t *ncec = nce->nce_common;
14395 14406 int64_t now = LBOLT_FASTPATH64;
14396 14407 boolean_t is_probe;
14397 14408
14398 14409 DTRACE_PROBE1(ip__xmit, nce_t *, nce);
14399 14410
14400 14411 ASSERT(mp != NULL);
14401 14412 ASSERT(mp->b_datap->db_type == M_DATA);
14402 14413 ASSERT(pkt_len == msgdsize(mp));
14403 14414
14404 14415 /*
14405 14416 	 * If we have already been here and are coming back after ARP/ND,
14406 14417 	 * the IXAF_NO_TRACE flag is set. We skip FW_HOOKS, DTRACE and ipobs
14407 14418 	 * in that case since they have already seen the packet when it came
14408 14419 	 * here the first time.
14409 14420 */
14410 14421 if (ixaflags & IXAF_NO_TRACE)
14411 14422 goto sendit;
14412 14423
14413 14424 if (ixaflags & IXAF_IS_IPV4) {
14414 14425 ipha_t *ipha = (ipha_t *)mp->b_rptr;
14415 14426
14416 14427 ASSERT(!isv6);
14417 14428 ASSERT(pkt_len == ntohs(((ipha_t *)mp->b_rptr)->ipha_length));
14418 14429 if (HOOKS4_INTERESTED_PHYSICAL_OUT(ipst) &&
14419 14430 !(ixaflags & IXAF_NO_PFHOOK)) {
14420 14431 int error;
14421 14432
14422 14433 FW_HOOKS(ipst->ips_ip4_physical_out_event,
14423 14434 ipst->ips_ipv4firewall_physical_out,
14424 14435 NULL, ill, ipha, mp, mp, 0, ipst, error);
14425 14436 DTRACE_PROBE1(ip4__physical__out__end,
14426 14437 mblk_t *, mp);
14427 14438 if (mp == NULL)
14428 14439 return (error);
14429 14440
14430 14441 /* The length could have changed */
14431 14442 pkt_len = msgdsize(mp);
14432 14443 }
14433 14444 if (ipst->ips_ip4_observe.he_interested) {
14434 14445 /*
14435 14446 * Note that for TX the zoneid is the sending
14436 14447 * zone, whether or not MLP is in play.
14437 14448 * Since the szone argument is the IP zoneid (i.e.,
14438 14449 * zero for exclusive-IP zones) and ipobs wants
14439 14450 * the system zoneid, we map it here.
14440 14451 */
14441 14452 szone = IP_REAL_ZONEID(szone, ipst);
14442 14453
14443 14454 /*
14444 14455 * On the outbound path the destination zone will be
14445 14456 * unknown as we're sending this packet out on the
14446 14457 * wire.
14447 14458 */
14448 14459 ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone, ALL_ZONES,
14449 14460 ill, ipst);
14450 14461 }
14451 14462 DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL,
14452 14463 void_ip_t *, ipha, __dtrace_ipsr_ill_t *, ill,
14453 14464 ipha_t *, ipha, ip6_t *, NULL, int, 0);
14454 14465 } else {
14455 14466 ip6_t *ip6h = (ip6_t *)mp->b_rptr;
14456 14467
14457 14468 ASSERT(isv6);
14458 14469 ASSERT(pkt_len ==
14459 14470 ntohs(((ip6_t *)mp->b_rptr)->ip6_plen) + IPV6_HDR_LEN);
14460 14471 if (HOOKS6_INTERESTED_PHYSICAL_OUT(ipst) &&
14461 14472 !(ixaflags & IXAF_NO_PFHOOK)) {
14462 14473 int error;
14463 14474
14464 14475 FW_HOOKS6(ipst->ips_ip6_physical_out_event,
14465 14476 ipst->ips_ipv6firewall_physical_out,
14466 14477 NULL, ill, ip6h, mp, mp, 0, ipst, error);
14467 14478 DTRACE_PROBE1(ip6__physical__out__end,
14468 14479 mblk_t *, mp);
14469 14480 if (mp == NULL)
14470 14481 return (error);
14471 14482
14472 14483 /* The length could have changed */
14473 14484 pkt_len = msgdsize(mp);
14474 14485 }
14475 14486 if (ipst->ips_ip6_observe.he_interested) {
14476 14487 /* See above */
14477 14488 szone = IP_REAL_ZONEID(szone, ipst);
14478 14489
14479 14490 ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone, ALL_ZONES,
14480 14491 ill, ipst);
14481 14492 }
14482 14493 DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL,
14483 14494 void_ip_t *, ip6h, __dtrace_ipsr_ill_t *, ill,
14484 14495 ipha_t *, NULL, ip6_t *, ip6h, int, 0);
14485 14496 }
14486 14497
14487 14498 sendit:
14488 14499 /*
14489 14500 * We check the state without a lock because the state can never
14490 14501 * move "backwards" to initial or incomplete.
14491 14502 */
14492 14503 switch (ncec->ncec_state) {
14493 14504 case ND_REACHABLE:
14494 14505 case ND_STALE:
14495 14506 case ND_DELAY:
14496 14507 case ND_PROBE:
14497 14508 mp = ip_xmit_attach_llhdr(mp, nce);
14498 14509 if (mp == NULL) {
14499 14510 /*
14500 14511 * ip_xmit_attach_llhdr has increased
14501 14512 * ipIfStatsOutDiscards and called ip_drop_output()
14502 14513 */
14503 14514 return (ENOBUFS);
14504 14515 }
14505 14516 /*
14506 14517 * check if nce_fastpath completed and we tagged on a
14507 14518 * copy of nce_fp_mp in ip_xmit_attach_llhdr().
14508 14519 */
14509 14520 fp_mp = (mp->b_datap->db_type == M_DATA);
14510 14521
14511 14522 if (fp_mp &&
14512 14523 (ill->ill_capabilities & ILL_CAPAB_DLD_DIRECT)) {
14513 14524 ill_dld_direct_t *idd;
14514 14525
14515 14526 idd = &ill->ill_dld_capab->idc_direct;
14516 14527 /*
14517 14528 * Send the packet directly to DLD, where it
14518 14529 * may be queued depending on the availability
14519 14530 * of transmit resources at the media layer.
14520 14531 			 * The return value should be taken into
14521 14532 			 * account so the caller can flow control TCP.
14522 14533 */
14523 14534 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits);
14524 14535 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets,
14525 14536 pkt_len);
14526 14537
14527 14538 if (ixaflags & IXAF_NO_DEV_FLOW_CTL) {
14528 14539 (void) idd->idd_tx_df(idd->idd_tx_dh, mp,
14529 14540 (uintptr_t)xmit_hint, IP_DROP_ON_NO_DESC);
14530 14541 } else {
14531 14542 uintptr_t cookie;
14532 14543
14533 14544 if ((cookie = idd->idd_tx_df(idd->idd_tx_dh,
14534 14545 mp, (uintptr_t)xmit_hint, 0)) != 0) {
14535 14546 if (ixacookie != NULL)
14536 14547 *ixacookie = cookie;
14537 14548 return (EWOULDBLOCK);
14538 14549 }
14539 14550 }
14540 14551 } else {
14541 14552 wq = ill->ill_wq;
14542 14553
14543 14554 if (!(ixaflags & IXAF_NO_DEV_FLOW_CTL) &&
14544 14555 !canputnext(wq)) {
14545 14556 if (ixacookie != NULL)
14546 14557 *ixacookie = 0;
14547 14558 ip_xmit_flowctl_drop(ill, mp, fp_mp,
14548 14559 nce->nce_fp_mp != NULL ?
14549 14560 MBLKL(nce->nce_fp_mp) : 0);
14550 14561 return (EWOULDBLOCK);
14551 14562 }
14552 14563 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits);
14553 14564 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets,
14554 14565 pkt_len);
14555 14566 putnext(wq, mp);
14556 14567 }
14557 14568
14558 14569 /*
14559 14570 * The rest of this function implements Neighbor Unreachability
14560 14571 		 * Detection. Determine if the ncec is eligible for NUD.
14561 14572 */
14562 14573 if (ncec->ncec_flags & NCE_F_NONUD)
14563 14574 return (0);
14564 14575
14565 14576 ASSERT(ncec->ncec_state != ND_INCOMPLETE);
14566 14577
14567 14578 /*
14568 14579 * Check for upper layer advice
14569 14580 */
14570 14581 if (ixaflags & IXAF_REACH_CONF) {
14571 14582 timeout_id_t tid;
14572 14583
14573 14584 /*
14574 14585 * It should be o.k. to check the state without
14575 14586 * a lock here, at most we lose an advice.
14576 14587 */
14577 14588 ncec->ncec_last = TICK_TO_MSEC(now);
14578 14589 if (ncec->ncec_state != ND_REACHABLE) {
14579 14590 mutex_enter(&ncec->ncec_lock);
14580 14591 ncec->ncec_state = ND_REACHABLE;
14581 14592 tid = ncec->ncec_timeout_id;
14582 14593 ncec->ncec_timeout_id = 0;
14583 14594 mutex_exit(&ncec->ncec_lock);
14584 14595 (void) untimeout(tid);
14585 14596 if (ip_debug > 2) {
14586 14597 /* ip1dbg */
14587 14598 pr_addr_dbg("ip_xmit: state"
14588 14599 " for %s changed to"
14589 14600 " REACHABLE\n", AF_INET6,
14590 14601 &ncec->ncec_addr);
14591 14602 }
14592 14603 }
14593 14604 return (0);
14594 14605 }
14595 14606
14596 14607 delta = TICK_TO_MSEC(now) - ncec->ncec_last;
14597 14608 ip1dbg(("ip_xmit: delta = %" PRId64
14598 14609 " ill_reachable_time = %d \n", delta,
14599 14610 ill->ill_reachable_time));
14600 14611 if (delta > (uint64_t)ill->ill_reachable_time) {
14601 14612 mutex_enter(&ncec->ncec_lock);
14602 14613 switch (ncec->ncec_state) {
14603 14614 case ND_REACHABLE:
14604 14615 ASSERT((ncec->ncec_flags & NCE_F_NONUD) == 0);
14605 14616 /* FALLTHROUGH */
14606 14617 case ND_STALE:
14607 14618 /*
14608 14619 * ND_REACHABLE is identical to
14609 14620 * ND_STALE in this specific case. If
14610 14621 * reachable time has expired for this
14611 14622 * neighbor (delta is greater than
14612 14623 * reachable time), conceptually, the
14613 14624 * neighbor cache is no longer in
14614 14625 * REACHABLE state, but already in
14615 14626 * STALE state. So the correct
14616 14627 * transition here is to ND_DELAY.
14617 14628 */
14618 14629 ncec->ncec_state = ND_DELAY;
14619 14630 mutex_exit(&ncec->ncec_lock);
14620 14631 nce_restart_timer(ncec,
14621 14632 ipst->ips_delay_first_probe_time);
14622 14633 if (ip_debug > 3) {
14623 14634 /* ip2dbg */
14624 14635 pr_addr_dbg("ip_xmit: state"
14625 14636 " for %s changed to"
14626 14637 " DELAY\n", AF_INET6,
14627 14638 &ncec->ncec_addr);
14628 14639 }
14629 14640 break;
14630 14641 case ND_DELAY:
14631 14642 case ND_PROBE:
14632 14643 mutex_exit(&ncec->ncec_lock);
14633 14644 /* Timers have already started */
14634 14645 break;
14635 14646 case ND_UNREACHABLE:
14636 14647 /*
14637 14648 * nce_timer has detected that this ncec
14638 14649 * is unreachable and initiated deleting
14639 14650 * this ncec.
14640 14651 * This is a harmless race where we found the
14641 14652 * ncec before it was deleted and have
14642 14653 * just sent out a packet using this
14643 14654 * unreachable ncec.
14644 14655 */
14645 14656 mutex_exit(&ncec->ncec_lock);
14646 14657 break;
14647 14658 default:
14648 14659 ASSERT(0);
14649 14660 mutex_exit(&ncec->ncec_lock);
14650 14661 }
14651 14662 }
14652 14663 return (0);
14653 14664
14654 14665 case ND_INCOMPLETE:
14655 14666 /*
14656 14667 		 * The state could have changed since we didn't hold the lock.
14657 14668 * Re-verify state under lock.
14658 14669 */
14659 14670 is_probe = ipmp_packet_is_probe(mp, nce->nce_ill);
14660 14671 mutex_enter(&ncec->ncec_lock);
14661 14672 if (NCE_ISREACHABLE(ncec)) {
14662 14673 mutex_exit(&ncec->ncec_lock);
14663 14674 goto sendit;
14664 14675 }
14665 14676 /* queue the packet */
14666 14677 nce_queue_mp(ncec, mp, is_probe);
14667 14678 mutex_exit(&ncec->ncec_lock);
14668 14679 DTRACE_PROBE2(ip__xmit__incomplete,
14669 14680 (ncec_t *), ncec, (mblk_t *), mp);
14670 14681 return (0);
14671 14682
14672 14683 case ND_INITIAL:
14673 14684 /*
14674 14685 * State could have changed since we didn't hold the lock, so
14675 14686 * re-verify state.
14676 14687 */
14677 14688 is_probe = ipmp_packet_is_probe(mp, nce->nce_ill);
14678 14689 mutex_enter(&ncec->ncec_lock);
14679 14690 if (NCE_ISREACHABLE(ncec)) {
14680 14691 mutex_exit(&ncec->ncec_lock);
14681 14692 goto sendit;
14682 14693 }
14683 14694 nce_queue_mp(ncec, mp, is_probe);
14684 14695 if (ncec->ncec_state == ND_INITIAL) {
14685 14696 ncec->ncec_state = ND_INCOMPLETE;
14686 14697 mutex_exit(&ncec->ncec_lock);
14687 14698 /*
14688 14699 * figure out the source we want to use
14689 14700 * and resolve it.
14690 14701 */
14691 14702 ip_ndp_resolve(ncec);
14692 14703 } else {
14693 14704 mutex_exit(&ncec->ncec_lock);
14694 14705 }
14695 14706 return (0);
14696 14707
14697 14708 case ND_UNREACHABLE:
14698 14709 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
14699 14710 ip_drop_output("ipIfStatsOutDiscards - ND_UNREACHABLE",
14700 14711 mp, ill);
14701 14712 freemsg(mp);
14702 14713 return (0);
14703 14714
14704 14715 default:
14705 14716 ASSERT(0);
14706 14717 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
14707 14718 ip_drop_output("ipIfStatsOutDiscards - ND_other",
14708 14719 mp, ill);
14709 14720 freemsg(mp);
14710 14721 return (ENETUNREACH);
14711 14722 }
14712 14723 }
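
/*
 * Illustrative sketch (hypothetical caller): the EWOULDBLOCK contract
 * documented above ip_xmit().  Everything but ip_xmit() and
 * conn_setqfull() is assumed context.
 */
#ifdef EXAMPLE_ONLY
	error = ip_xmit(mp, nce, ixaflags, pkt_len, xmit_hint, szone,
	    nolzid, &ixacookie);
	if (error == EWOULDBLOCK) {
		/* Driver flow controlled; stop sending until ip_wsrv. */
		conn_setqfull(connp, &flow_stopped);
	}
#endif /* EXAMPLE_ONLY */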
14713 14724
14714 14725 /*
14715 14726 * Return B_TRUE if the buffers differ in length or content.
14716 14727 * This is used for comparing extension header buffers.
14717 14728 * Note that an extension header would be declared different
14718 14729 * even if all that changed was the next header value in that header i.e.
14719 14730 * what really changed is the next extension header.
14720 14731 */
14721 14732 boolean_t
14722 14733 ip_cmpbuf(const void *abuf, uint_t alen, boolean_t b_valid, const void *bbuf,
14723 14734 uint_t blen)
14724 14735 {
14725 14736 if (!b_valid)
14726 14737 blen = 0;
14727 14738
14728 14739 if (alen != blen)
14729 14740 return (B_TRUE);
14730 14741 if (alen == 0)
14731 14742 return (B_FALSE); /* Both zero length */
14732 14743 return (bcmp(abuf, bbuf, alen));
14733 14744 }
14734 14745
14735 14746 /*
14736 14747 * Preallocate memory for ip_savebuf(). Returns B_TRUE if ok.
14737 14748  * Returns B_FALSE if memory allocation fails - don't change any state!
14738 14749 */
14739 14750 boolean_t
14740 14751 ip_allocbuf(void **dstp, uint_t *dstlenp, boolean_t src_valid,
14741 14752 const void *src, uint_t srclen)
14742 14753 {
14743 14754 void *dst;
14744 14755
14745 14756 if (!src_valid)
14746 14757 srclen = 0;
14747 14758
14748 14759 ASSERT(*dstlenp == 0);
14749 14760 if (src != NULL && srclen != 0) {
14750 14761 dst = mi_alloc(srclen, BPRI_MED);
14751 14762 if (dst == NULL)
14752 14763 return (B_FALSE);
14753 14764 } else {
14754 14765 dst = NULL;
14755 14766 }
14756 14767 if (*dstp != NULL)
14757 14768 mi_free(*dstp);
14758 14769 *dstp = dst;
14759 14770 *dstlenp = dst == NULL ? 0 : srclen;
14760 14771 return (B_TRUE);
14761 14772 }
14762 14773
14763 14774 /*
14764 14775 * Replace what is in *dst, *dstlen with the source.
14765 14776 * Assumes ip_allocbuf has already been called.
14766 14777 */
14767 14778 void
14768 14779 ip_savebuf(void **dstp, uint_t *dstlenp, boolean_t src_valid,
14769 14780 const void *src, uint_t srclen)
14770 14781 {
14771 14782 if (!src_valid)
14772 14783 srclen = 0;
14773 14784
14774 14785 ASSERT(*dstlenp == srclen);
14775 14786 if (src != NULL && srclen != 0)
14776 14787 bcopy(src, *dstp, srclen);
14777 14788 }
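
/*
 * Illustrative sketch (hypothetical caller): the intended two-phase use
 * of the helpers above.  Compare first, preallocate (which can fail
 * without changing state), then commit with ip_savebuf(), which cannot
 * fail.
 */
#ifdef EXAMPLE_ONLY
	if (ip_cmpbuf(old, oldlen, new_valid, new, newlen)) {
		oldlen = 0;	/* ip_allocbuf() asserts *dstlenp == 0 */
		if (!ip_allocbuf(&old, &oldlen, new_valid, new, newlen))
			return (ENOMEM);	/* nothing was changed */
		ip_savebuf(&old, &oldlen, new_valid, new, newlen);
	}
#endif /* EXAMPLE_ONLY */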
14778 14789
14779 14790 /*
14780 14791 * Free the storage pointed to by the members of an ip_pkt_t.
14781 14792 */
14782 14793 void
14783 14794 ip_pkt_free(ip_pkt_t *ipp)
14784 14795 {
14785 14796 uint_t fields = ipp->ipp_fields;
14786 14797
14787 14798 if (fields & IPPF_HOPOPTS) {
14788 14799 kmem_free(ipp->ipp_hopopts, ipp->ipp_hopoptslen);
14789 14800 ipp->ipp_hopopts = NULL;
14790 14801 ipp->ipp_hopoptslen = 0;
14791 14802 }
14792 14803 if (fields & IPPF_RTHDRDSTOPTS) {
14793 14804 kmem_free(ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen);
14794 14805 ipp->ipp_rthdrdstopts = NULL;
14795 14806 ipp->ipp_rthdrdstoptslen = 0;
14796 14807 }
14797 14808 if (fields & IPPF_DSTOPTS) {
14798 14809 kmem_free(ipp->ipp_dstopts, ipp->ipp_dstoptslen);
14799 14810 ipp->ipp_dstopts = NULL;
14800 14811 ipp->ipp_dstoptslen = 0;
14801 14812 }
14802 14813 if (fields & IPPF_RTHDR) {
14803 14814 kmem_free(ipp->ipp_rthdr, ipp->ipp_rthdrlen);
14804 14815 ipp->ipp_rthdr = NULL;
14805 14816 ipp->ipp_rthdrlen = 0;
14806 14817 }
14807 14818 if (fields & IPPF_IPV4_OPTIONS) {
14808 14819 kmem_free(ipp->ipp_ipv4_options, ipp->ipp_ipv4_options_len);
14809 14820 ipp->ipp_ipv4_options = NULL;
14810 14821 ipp->ipp_ipv4_options_len = 0;
14811 14822 }
14812 14823 if (fields & IPPF_LABEL_V4) {
14813 14824 kmem_free(ipp->ipp_label_v4, ipp->ipp_label_len_v4);
14814 14825 ipp->ipp_label_v4 = NULL;
14815 14826 ipp->ipp_label_len_v4 = 0;
14816 14827 }
14817 14828 if (fields & IPPF_LABEL_V6) {
14818 14829 kmem_free(ipp->ipp_label_v6, ipp->ipp_label_len_v6);
14819 14830 ipp->ipp_label_v6 = NULL;
14820 14831 ipp->ipp_label_len_v6 = 0;
14821 14832 }
14822 14833 ipp->ipp_fields &= ~(IPPF_HOPOPTS | IPPF_RTHDRDSTOPTS | IPPF_DSTOPTS |
14823 14834 IPPF_RTHDR | IPPF_IPV4_OPTIONS | IPPF_LABEL_V4 | IPPF_LABEL_V6);
14824 14835 }
14825 14836
14826 14837 /*
14827 14838 * Copy from src to dst and allocate as needed.
14828 14839 * Returns zero or ENOMEM.
14829 14840 *
14830 14841 * The caller must initialize dst to zero.
14831 14842 */
14832 14843 int
14833 14844 ip_pkt_copy(ip_pkt_t *src, ip_pkt_t *dst, int kmflag)
14834 14845 {
14835 14846 uint_t fields = src->ipp_fields;
14836 14847
14837 14848 /* Start with fields that don't require memory allocation */
14838 14849 dst->ipp_fields = fields &
14839 14850 ~(IPPF_HOPOPTS | IPPF_RTHDRDSTOPTS | IPPF_DSTOPTS |
14840 14851 IPPF_RTHDR | IPPF_IPV4_OPTIONS | IPPF_LABEL_V4 | IPPF_LABEL_V6);
14841 14852
14842 14853 dst->ipp_addr = src->ipp_addr;
14843 14854 dst->ipp_unicast_hops = src->ipp_unicast_hops;
14844 14855 dst->ipp_hoplimit = src->ipp_hoplimit;
14845 14856 dst->ipp_tclass = src->ipp_tclass;
14846 14857 dst->ipp_type_of_service = src->ipp_type_of_service;
14847 14858
14848 14859 if (!(fields & (IPPF_HOPOPTS | IPPF_RTHDRDSTOPTS | IPPF_DSTOPTS |
14849 14860 IPPF_RTHDR | IPPF_IPV4_OPTIONS | IPPF_LABEL_V4 | IPPF_LABEL_V6)))
14850 14861 return (0);
14851 14862
14852 14863 if (fields & IPPF_HOPOPTS) {
14853 14864 dst->ipp_hopopts = kmem_alloc(src->ipp_hopoptslen, kmflag);
14854 14865 if (dst->ipp_hopopts == NULL) {
14855 14866 ip_pkt_free(dst);
14856 14867 return (ENOMEM);
14857 14868 }
14858 14869 dst->ipp_fields |= IPPF_HOPOPTS;
14859 14870 bcopy(src->ipp_hopopts, dst->ipp_hopopts,
14860 14871 src->ipp_hopoptslen);
14861 14872 dst->ipp_hopoptslen = src->ipp_hopoptslen;
14862 14873 }
14863 14874 if (fields & IPPF_RTHDRDSTOPTS) {
14864 14875 dst->ipp_rthdrdstopts = kmem_alloc(src->ipp_rthdrdstoptslen,
14865 14876 kmflag);
14866 14877 if (dst->ipp_rthdrdstopts == NULL) {
14867 14878 ip_pkt_free(dst);
14868 14879 return (ENOMEM);
14869 14880 }
14870 14881 dst->ipp_fields |= IPPF_RTHDRDSTOPTS;
14871 14882 bcopy(src->ipp_rthdrdstopts, dst->ipp_rthdrdstopts,
14872 14883 src->ipp_rthdrdstoptslen);
14873 14884 dst->ipp_rthdrdstoptslen = src->ipp_rthdrdstoptslen;
14874 14885 }
14875 14886 if (fields & IPPF_DSTOPTS) {
14876 14887 dst->ipp_dstopts = kmem_alloc(src->ipp_dstoptslen, kmflag);
14877 14888 if (dst->ipp_dstopts == NULL) {
14878 14889 ip_pkt_free(dst);
14879 14890 return (ENOMEM);
14880 14891 }
14881 14892 dst->ipp_fields |= IPPF_DSTOPTS;
14882 14893 bcopy(src->ipp_dstopts, dst->ipp_dstopts,
14883 14894 src->ipp_dstoptslen);
14884 14895 dst->ipp_dstoptslen = src->ipp_dstoptslen;
14885 14896 }
14886 14897 if (fields & IPPF_RTHDR) {
14887 14898 dst->ipp_rthdr = kmem_alloc(src->ipp_rthdrlen, kmflag);
14888 14899 if (dst->ipp_rthdr == NULL) {
14889 14900 ip_pkt_free(dst);
14890 14901 return (ENOMEM);
14891 14902 }
14892 14903 dst->ipp_fields |= IPPF_RTHDR;
14893 14904 bcopy(src->ipp_rthdr, dst->ipp_rthdr,
14894 14905 src->ipp_rthdrlen);
14895 14906 dst->ipp_rthdrlen = src->ipp_rthdrlen;
14896 14907 }
14897 14908 if (fields & IPPF_IPV4_OPTIONS) {
14898 14909 dst->ipp_ipv4_options = kmem_alloc(src->ipp_ipv4_options_len,
14899 14910 kmflag);
14900 14911 if (dst->ipp_ipv4_options == NULL) {
14901 14912 ip_pkt_free(dst);
14902 14913 return (ENOMEM);
14903 14914 }
14904 14915 dst->ipp_fields |= IPPF_IPV4_OPTIONS;
14905 14916 bcopy(src->ipp_ipv4_options, dst->ipp_ipv4_options,
14906 14917 src->ipp_ipv4_options_len);
14907 14918 dst->ipp_ipv4_options_len = src->ipp_ipv4_options_len;
14908 14919 }
14909 14920 if (fields & IPPF_LABEL_V4) {
14910 14921 dst->ipp_label_v4 = kmem_alloc(src->ipp_label_len_v4, kmflag);
14911 14922 if (dst->ipp_label_v4 == NULL) {
14912 14923 ip_pkt_free(dst);
14913 14924 return (ENOMEM);
14914 14925 }
14915 14926 dst->ipp_fields |= IPPF_LABEL_V4;
14916 14927 bcopy(src->ipp_label_v4, dst->ipp_label_v4,
14917 14928 src->ipp_label_len_v4);
14918 14929 dst->ipp_label_len_v4 = src->ipp_label_len_v4;
14919 14930 }
14920 14931 if (fields & IPPF_LABEL_V6) {
14921 14932 dst->ipp_label_v6 = kmem_alloc(src->ipp_label_len_v6, kmflag);
14922 14933 if (dst->ipp_label_v6 == NULL) {
14923 14934 ip_pkt_free(dst);
14924 14935 return (ENOMEM);
14925 14936 }
14926 14937 dst->ipp_fields |= IPPF_LABEL_V6;
14927 14938 bcopy(src->ipp_label_v6, dst->ipp_label_v6,
14928 14939 src->ipp_label_len_v6);
14929 14940 dst->ipp_label_len_v6 = src->ipp_label_len_v6;
14930 14941 }
14931 14942 if (fields & IPPF_FRAGHDR) {
14932 14943 dst->ipp_fraghdr = kmem_alloc(src->ipp_fraghdrlen, kmflag);
14933 14944 if (dst->ipp_fraghdr == NULL) {
14934 14945 ip_pkt_free(dst);
14935 14946 return (ENOMEM);
14936 14947 }
14937 14948 dst->ipp_fields |= IPPF_FRAGHDR;
14938 14949 bcopy(src->ipp_fraghdr, dst->ipp_fraghdr,
14939 14950 src->ipp_fraghdrlen);
14940 14951 dst->ipp_fraghdrlen = src->ipp_fraghdrlen;
14941 14952 }
14942 14953 return (0);
14943 14954 }
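
/*
 * Illustrative sketch (hypothetical caller): dst must start zeroed, and
 * a failed copy is cleaned up by ip_pkt_copy() itself via ip_pkt_free(),
 * so the caller frees only on its own teardown path.
 */
#ifdef EXAMPLE_ONLY
	ip_pkt_t dst;

	bzero(&dst, sizeof (dst));
	if (ip_pkt_copy(&src, &dst, KM_NOSLEEP) != 0)
		return (ENOMEM);	/* dst was freed internally */
	/* ... use dst ... */
	ip_pkt_free(&dst);
#endif /* EXAMPLE_ONLY */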
14944 14955
14945 14956 /*
14946 14957 * Returns INADDR_ANY if no source route
14947 14958 */
14948 14959 ipaddr_t
14949 14960 ip_pkt_source_route_v4(const ip_pkt_t *ipp)
14950 14961 {
14951 14962 ipaddr_t nexthop = INADDR_ANY;
14952 14963 ipoptp_t opts;
14953 14964 uchar_t *opt;
14954 14965 uint8_t optval;
14955 14966 uint8_t optlen;
14956 14967 uint32_t totallen;
14957 14968
14958 14969 if (!(ipp->ipp_fields & IPPF_IPV4_OPTIONS))
14959 14970 return (INADDR_ANY);
14960 14971
14961 14972 totallen = ipp->ipp_ipv4_options_len;
14962 14973 if (totallen & 0x3)
14963 14974 return (INADDR_ANY);
14964 14975
14965 14976 for (optval = ipoptp_first2(&opts, totallen, ipp->ipp_ipv4_options);
14966 14977 optval != IPOPT_EOL;
14967 14978 optval = ipoptp_next(&opts)) {
14968 14979 opt = opts.ipoptp_cur;
14969 14980 switch (optval) {
14970 14981 uint8_t off;
14971 14982 case IPOPT_SSRR:
14972 14983 case IPOPT_LSRR:
14973 14984 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
14974 14985 break;
14975 14986 }
14976 14987 optlen = opts.ipoptp_len;
14977 14988 off = opt[IPOPT_OFFSET];
14978 14989 off--;
14979 14990 if (optlen < IP_ADDR_LEN ||
14980 14991 off > optlen - IP_ADDR_LEN) {
14981 14992 /* End of source route */
14982 14993 break;
14983 14994 }
14984 14995 bcopy((char *)opt + off, &nexthop, IP_ADDR_LEN);
14985 14996 if (nexthop == htonl(INADDR_LOOPBACK)) {
14986 14997 /* Ignore */
14987 14998 nexthop = INADDR_ANY;
14988 14999 break;
14989 15000 }
14990 15001 break;
14991 15002 }
14992 15003 }
14993 15004 return (nexthop);
14994 15005 }
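/*
 * Editor's note (worked example, not part of this change): for an LSRR
 * option carrying two addresses A and B, the option bytes are
 *
 *	opt[0] = IPOPT_LSRR (0x83)	opt[1] = len (11)
 *	opt[2] = offset, initially IPOPT_MINOFF_SR (4)
 *	opt[3..6] = A			opt[7..10] = B
 *
 * With off = opt[IPOPT_OFFSET] - 1 = 3, the function above copies A out
 * as the next hop.  Once the offset has advanced past the last address,
 * the bounds check treats it as end-of-route and INADDR_ANY is returned.
 * Note that the whole option buffer must be padded to a multiple of four
 * bytes or the (totallen & 0x3) check rejects it.
 */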
14995 15006
14996 15007 /*
14997 15008 * Reverse a source route.
14998 15009 */
14999 15010 void
15000 15011 ip_pkt_source_route_reverse_v4(ip_pkt_t *ipp)
15001 15012 {
15002 15013 ipaddr_t tmp;
15003 15014 ipoptp_t opts;
15004 15015 uchar_t *opt;
15005 15016 uint8_t optval;
15006 15017 uint32_t totallen;
15007 15018
15008 15019 if (!(ipp->ipp_fields & IPPF_IPV4_OPTIONS))
15009 15020 return;
15010 15021
15011 15022 totallen = ipp->ipp_ipv4_options_len;
15012 15023 if (totallen & 0x3)
15013 15024 return;
15014 15025
15015 15026 for (optval = ipoptp_first2(&opts, totallen, ipp->ipp_ipv4_options);
15016 15027 optval != IPOPT_EOL;
15017 15028 optval = ipoptp_next(&opts)) {
15018 15029 uint8_t off1, off2;
15019 15030
15020 15031 opt = opts.ipoptp_cur;
15021 15032 switch (optval) {
15022 15033 case IPOPT_SSRR:
15023 15034 case IPOPT_LSRR:
15024 15035 if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
15025 15036 break;
15026 15037 }
15027 15038 off1 = IPOPT_MINOFF_SR - 1;
15028 15039 off2 = opt[IPOPT_OFFSET] - IP_ADDR_LEN - 1;
15029 15040 while (off2 > off1) {
15030 15041 bcopy(opt + off2, &tmp, IP_ADDR_LEN);
15031 15042 bcopy(opt + off1, opt + off2, IP_ADDR_LEN);
15032 15043 				bcopy(&tmp, opt + off1, IP_ADDR_LEN);
15033 15044 off2 -= IP_ADDR_LEN;
15034 15045 off1 += IP_ADDR_LEN;
15035 15046 }
15036 15047 opt[IPOPT_OFFSET] = IPOPT_MINOFF_SR;
15037 15048 break;
15038 15049 }
15039 15050 }
15040 15051 }
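/*
 * Editor's note (worked example, not part of this change): given a fully
 * traversed recorded route of three addresses, the swap loop above
 * exchanges 4-byte addresses inward from both ends through tmp:
 *
 *	before:  A B C	  opt[IPOPT_OFFSET] = 16 (past the last address)
 *	after:   C B A	  opt[IPOPT_OFFSET] = IPOPT_MINOFF_SR (4)
 *
 * leaving an option that routes a reply back through the same hops in
 * reverse order, starting again from the first slot.
 */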
15041 15052
15042 15053 /*
15043 15054 * Returns NULL if no routing header
15044 15055 */
15045 15056 in6_addr_t *
15046 15057 ip_pkt_source_route_v6(const ip_pkt_t *ipp)
15047 15058 {
15048 15059 in6_addr_t *nexthop = NULL;
15049 15060 ip6_rthdr0_t *rthdr;
15050 15061
15051 15062 if (!(ipp->ipp_fields & IPPF_RTHDR))
15052 15063 return (NULL);
15053 15064
15054 15065 rthdr = (ip6_rthdr0_t *)ipp->ipp_rthdr;
15055 15066 if (rthdr->ip6r0_segleft == 0)
15056 15067 return (NULL);
15057 15068
15058 15069 nexthop = (in6_addr_t *)((char *)rthdr + sizeof (*rthdr));
15059 15070 return (nexthop);
15060 15071 }
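/*
 * Editor's note (illustrative sketch, not part of this change): a type-0
 * routing header is the 8-byte ip6_rthdr0_t immediately followed by the
 * route's 16-byte addresses, so with ip6r0_segleft > 0 the next hop is
 * simply the first in6_addr_t after the fixed header:
 *
 *	in6_addr_t *nh;
 *
 *	if ((nh = ip_pkt_source_route_v6(ipp)) != NULL)
 *		(... send toward *nh rather than the final destination ...)
 */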
15061 15072
15062 15073 zoneid_t
15063 15074 ip_get_zoneid_v4(ipaddr_t addr, mblk_t *mp, ip_recv_attr_t *ira,
15064 15075 zoneid_t lookup_zoneid)
15065 15076 {
15066 15077 ip_stack_t *ipst = ira->ira_ill->ill_ipst;
15067 15078 ire_t *ire;
15068 15079 int ire_flags = MATCH_IRE_TYPE;
15069 15080 zoneid_t zoneid = ALL_ZONES;
15070 15081
15071 15082 if (is_system_labeled() && !tsol_can_accept_raw(mp, ira, B_FALSE))
15072 15083 return (ALL_ZONES);
15073 15084
15074 15085 if (lookup_zoneid != ALL_ZONES)
15075 15086 ire_flags |= MATCH_IRE_ZONEONLY;
15076 15087 ire = ire_ftable_lookup_v4(addr, NULL, NULL, IRE_LOCAL | IRE_LOOPBACK,
15077 15088 NULL, lookup_zoneid, NULL, ire_flags, 0, ipst, NULL);
15078 15089 if (ire != NULL) {
15079 15090 zoneid = IP_REAL_ZONEID(ire->ire_zoneid, ipst);
15080 15091 ire_refrele(ire);
15081 15092 }
15082 15093 return (zoneid);
15083 15094 }
15084 15095
15085 15096 zoneid_t
15086 15097 ip_get_zoneid_v6(in6_addr_t *addr, mblk_t *mp, const ill_t *ill,
15087 15098 ip_recv_attr_t *ira, zoneid_t lookup_zoneid)
15088 15099 {
15089 15100 ip_stack_t *ipst = ira->ira_ill->ill_ipst;
15090 15101 ire_t *ire;
15091 15102 int ire_flags = MATCH_IRE_TYPE;
15092 15103 zoneid_t zoneid = ALL_ZONES;
15093 15104
15094 15105 if (is_system_labeled() && !tsol_can_accept_raw(mp, ira, B_FALSE))
15095 15106 return (ALL_ZONES);
15096 15107
15097 15108 if (IN6_IS_ADDR_LINKLOCAL(addr))
15098 15109 ire_flags |= MATCH_IRE_ILL;
15099 15110
15100 15111 if (lookup_zoneid != ALL_ZONES)
15101 15112 ire_flags |= MATCH_IRE_ZONEONLY;
15102 15113 ire = ire_ftable_lookup_v6(addr, NULL, NULL, IRE_LOCAL | IRE_LOOPBACK,
15103 15114 ill, lookup_zoneid, NULL, ire_flags, 0, ipst, NULL);
15104 15115 if (ire != NULL) {
15105 15116 zoneid = IP_REAL_ZONEID(ire->ire_zoneid, ipst);
15106 15117 ire_refrele(ire);
15107 15118 }
15108 15119 return (zoneid);
15109 15120 }
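/*
 * Editor's note (illustrative sketch, not part of this change): both
 * lookups above map a local address to its owning zone by finding the
 * matching IRE_LOCAL/IRE_LOOPBACK entry.  A receive-path caller holding
 * an ip_recv_attr_t might use it as follows; ipha pointing at the
 * packet's IPv4 header is an assumption of the example:
 *
 *	zoneid_t zid;
 *
 *	zid = ip_get_zoneid_v4(ipha->ipha_dst, mp, ira, ALL_ZONES);
 *	if (zid == ALL_ZONES)
 *		(... address not resolvable to a single zone ...)
 */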
15110 15121
15111 15122 /*
15112 15123  * IP observability hook support functions.
15113 15124 */
15114 15125 static void
15115 15126 ipobs_init(ip_stack_t *ipst)
15116 15127 {
15117 15128 netid_t id;
15118 15129
15119 15130 id = net_getnetidbynetstackid(ipst->ips_netstack->netstack_stackid);
15120 15131
15121 15132 ipst->ips_ip4_observe_pr = net_protocol_lookup(id, NHF_INET);
15122 15133 VERIFY(ipst->ips_ip4_observe_pr != NULL);
15123 15134
15124 15135 ipst->ips_ip6_observe_pr = net_protocol_lookup(id, NHF_INET6);
15125 15136 VERIFY(ipst->ips_ip6_observe_pr != NULL);
15126 15137 }
15127 15138
15128 15139 static void
15129 15140 ipobs_fini(ip_stack_t *ipst)
15130 15141 {
15131 15142
15132 15143 VERIFY(net_protocol_release(ipst->ips_ip4_observe_pr) == 0);
15133 15144 VERIFY(net_protocol_release(ipst->ips_ip6_observe_pr) == 0);
15134 15145 }
15135 15146
15136 15147 /*
15137 15148 * hook_pkt_observe_t is composed in network byte order so that the
15138 15149 * entire mblk_t chain handed into hook_run can be used as-is.
15139 15150 * The caveat is that use of the fields, such as the zone fields,
15140 15151 * requires conversion into host byte order first.
15141 15152 */
15142 15153 void
15143 15154 ipobs_hook(mblk_t *mp, int htype, zoneid_t zsrc, zoneid_t zdst,
15144 15155 const ill_t *ill, ip_stack_t *ipst)
15145 15156 {
15146 15157 hook_pkt_observe_t *hdr;
15147 15158 uint64_t grifindex;
15148 15159 mblk_t *imp;
15149 15160
15150 15161 imp = allocb(sizeof (*hdr), BPRI_HI);
15151 15162 if (imp == NULL)
15152 15163 return;
15153 15164
15154 15165 hdr = (hook_pkt_observe_t *)imp->b_rptr;
15155 15166 /*
15156 15167 	 * b_wptr is set so that the apparent size of the data in the
15157 15168 	 * mblk_t excludes the pointers at the end of hook_pkt_observe_t.
15158 15169 */
15159 15170 imp->b_wptr = imp->b_rptr + sizeof (dl_ipnetinfo_t);
15160 15171 imp->b_cont = mp;
15161 15172
15162 15173 ASSERT(DB_TYPE(mp) == M_DATA);
15163 15174
15164 15175 if (IS_UNDER_IPMP(ill))
15165 15176 grifindex = ipmp_ill_get_ipmp_ifindex(ill);
15166 15177 else
15167 15178 grifindex = 0;
15168 15179
15169 15180 hdr->hpo_version = 1;
15170 15181 hdr->hpo_htype = htons(htype);
15171 15182 hdr->hpo_pktlen = htonl((ulong_t)msgdsize(mp));
15172 15183 hdr->hpo_ifindex = htonl(ill->ill_phyint->phyint_ifindex);
15173 15184 hdr->hpo_grifindex = htonl(grifindex);
15174 15185 hdr->hpo_zsrc = htonl(zsrc);
15175 15186 hdr->hpo_zdst = htonl(zdst);
15176 15187 hdr->hpo_pkt = imp;
15177 15188 hdr->hpo_ctx = ipst->ips_netstack;
15178 15189
15179 15190 if (ill->ill_isv6) {
15180 15191 hdr->hpo_family = AF_INET6;
15181 15192 (void) hook_run(ipst->ips_ipv6_net_data->netd_hooks,
15182 15193 ipst->ips_ipv6observing, (hook_data_t)hdr);
15183 15194 } else {
15184 15195 hdr->hpo_family = AF_INET;
15185 15196 (void) hook_run(ipst->ips_ipv4_net_data->netd_hooks,
15186 15197 ipst->ips_ipv4observing, (hook_data_t)hdr);
15187 15198 }
15188 15199
15189 15200 imp->b_cont = NULL;
15190 15201 freemsg(imp);
15191 15202 }
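/*
 * Editor's note (illustrative sketch, not part of this change): since
 * hook_pkt_observe_t travels in network byte order, a consumer's hook
 * function converts fields on use.  The callback name is hypothetical;
 * the signature follows the standard hook_func_t shape:
 *
 *	static int
 *	my_observe(hook_event_token_t tok, hook_data_t data, void *arg)
 *	{
 *		hook_pkt_observe_t *hdr = (hook_pkt_observe_t *)data;
 *		uint32_t pktlen = ntohl(hdr->hpo_pktlen);
 *		zoneid_t zsrc = ntohl(hdr->hpo_zsrc);
 *
 *		(... inspect hdr->hpo_pkt, pktlen, zsrc ...)
 *		return (0);
 *	}
 */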
15192 15203
15193 15204 /*
15194 15205 * Utility routine that checks if `v4srcp' is a valid address on underlying
15195 15206 * interface `ill'. If `ipifp' is non-NULL, it's set to a held ipif
15196 15207 * associated with `v4srcp' on success. NOTE: if this is not called from
15197 15208 * inside the IPSQ (ill_g_lock is not held), `ill' may be removed from the
15198 15209 * group during or after this lookup.
15199 15210 */
15200 15211 boolean_t
15201 15212 ipif_lookup_testaddr_v4(ill_t *ill, const in_addr_t *v4srcp, ipif_t **ipifp)
15202 15213 {
15203 15214 ipif_t *ipif;
15204 15215
15205 15216 ipif = ipif_lookup_addr_exact(*v4srcp, ill, ill->ill_ipst);
15206 15217 if (ipif != NULL) {
15207 15218 if (ipifp != NULL)
15208 15219 *ipifp = ipif;
15209 15220 else
15210 15221 ipif_refrele(ipif);
15211 15222 return (B_TRUE);
15212 15223 }
15213 15224
15214 15225 ip1dbg(("ipif_lookup_testaddr_v4: cannot find ipif for src %x\n",
15215 15226 *v4srcp));
15216 15227 return (B_FALSE);
15217 15228 }
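/*
 * Editor's note (illustrative sketch, not part of this change): on
 * success with a non-NULL ipifp, the caller owns a hold on the ipif and
 * must release it when done:
 *
 *	ipif_t *ipif;
 *
 *	if (ipif_lookup_testaddr_v4(ill, &testaddr, &ipif)) {
 *		(... use ipif ...)
 *		ipif_refrele(ipif);
 *	}
 */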
15218 15229
15219 15230 /*
15220 15231 * Transport protocol call back function for CPU state change.
15221 15232 */
15222 15233 /* ARGSUSED */
15223 15234 static int
15224 15235 ip_tp_cpu_update(cpu_setup_t what, int id, void *arg)
15225 15236 {
15226 15237 processorid_t cpu_seqid;
15227 15238 netstack_handle_t nh;
15228 15239 netstack_t *ns;
15229 15240
15229 15240
15230 15241 ASSERT(MUTEX_HELD(&cpu_lock));
15231 15242
15232 15243 switch (what) {
15233 15244 case CPU_CONFIG:
15234 15245 case CPU_ON:
15235 15246 case CPU_INIT:
15236 15247 case CPU_CPUPART_IN:
15237 15248 cpu_seqid = cpu[id]->cpu_seqid;
15238 15249 netstack_next_init(&nh);
15239 15250 while ((ns = netstack_next(&nh)) != NULL) {
15251 + dccp_stack_cpu_add(ns->netstack_dccp, cpu_seqid);
15240 15252 tcp_stack_cpu_add(ns->netstack_tcp, cpu_seqid);
15241 15253 sctp_stack_cpu_add(ns->netstack_sctp, cpu_seqid);
15242 15254 udp_stack_cpu_add(ns->netstack_udp, cpu_seqid);
15243 15255 netstack_rele(ns);
15244 15256 }
15245 15257 netstack_next_fini(&nh);
15246 15258 break;
15247 15259 case CPU_UNCONFIG:
15248 15260 case CPU_OFF:
15249 15261 case CPU_CPUPART_OUT:
15250 15262 /*
15251 15263 * Nothing to do. We don't remove the per CPU stats from
15252 15264 * the IP stack even when the CPU goes offline.
15253 15265 */
15254 15266 break;
15255 15267 default:
15256 15268 break;
15257 15269 }
15258 15270 return (0);
15259 15271 }
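/*
 * Editor's note (illustrative sketch, not part of this change): a
 * callback like the one above is hooked up through the standard
 * register_cpu_setup_func() interface, which requires cpu_lock to be
 * held, matching the ASSERT at the top of ip_tp_cpu_update():
 *
 *	mutex_enter(&cpu_lock);
 *	register_cpu_setup_func(ip_tp_cpu_update, NULL);
 *	mutex_exit(&cpu_lock);
 */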