illumos Wdiff usr/src/uts/common/inet/ip/ip.c

Print this page

13175 Add support for IP_RECVTOS
13182 CMSG_ macros should have man pages
Change-ID: I784aa36cfd3c17e3cccbf1fd329fa7e69b663ef9

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/inet/ip/ip.c
          +++ new/usr/src/uts/common/inet/ip/ip.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]

↓ open down ↓

17 lines elided

↑ open up ↑

  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   * Copyright (c) 1990 Mentat Inc.
  25   25   * Copyright (c) 2017 OmniTI Computer Consulting, Inc. All rights reserved.
  26   26   * Copyright (c) 2016 by Delphix. All rights reserved.
  27   27   * Copyright (c) 2019 Joyent, Inc. All rights reserved.
       28 + * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
  28   29   */
  29   30  
  30   31  #include <sys/types.h>
  31   32  #include <sys/stream.h>
  32   33  #include <sys/dlpi.h>
  33   34  #include <sys/stropts.h>
  34   35  #include <sys/sysmacros.h>
  35   36  #include <sys/strsubr.h>
  36   37  #include <sys/strlog.h>
  37   38  #include <sys/strsun.h>

  38   39  #include <sys/zone.h>
  39   40  #define _SUN_TPI_VERSION 2
  40   41  #include <sys/tihdr.h>
  41   42  #include <sys/xti_inet.h>
  42   43  #include <sys/ddi.h>
  43   44  #include <sys/suntpi.h>
  44   45  #include <sys/cmn_err.h>
  45   46  #include <sys/debug.h>
  46   47  #include <sys/kobj.h>
  47   48  #include <sys/modctl.h>
  48   49  #include <sys/atomic.h>
  49   50  #include <sys/policy.h>
  50   51  #include <sys/priv.h>
  51   52  #include <sys/taskq.h>
  52   53  
  53   54  #include <sys/systm.h>
  54   55  #include <sys/param.h>
  55   56  #include <sys/kmem.h>
  56   57  #include <sys/sdt.h>
  57   58  #include <sys/socket.h>
  58   59  #include <sys/vtrace.h>
  59   60  #include <sys/isa_defs.h>
  60   61  #include <sys/mac.h>
  61   62  #include <net/if.h>
  62   63  #include <net/if_arp.h>
  63   64  #include <net/route.h>
  64   65  #include <sys/sockio.h>
  65   66  #include <netinet/in.h>
  66   67  #include <net/if_dl.h>
  67   68  
  68   69  #include <inet/common.h>
  69   70  #include <inet/mi.h>
  70   71  #include <inet/mib2.h>
  71   72  #include <inet/nd.h>
  72   73  #include <inet/arp.h>
  73   74  #include <inet/snmpcom.h>
  74   75  #include <inet/optcom.h>
  75   76  #include <inet/kstatcom.h>
  76   77  
  77   78  #include <netinet/igmp_var.h>
  78   79  #include <netinet/ip6.h>
  79   80  #include <netinet/icmp6.h>
  80   81  #include <netinet/sctp.h>
  81   82  
  82   83  #include <inet/ip.h>
  83   84  #include <inet/ip_impl.h>
  84   85  #include <inet/ip6.h>
  85   86  #include <inet/ip6_asp.h>
  86   87  #include <inet/tcp.h>
  87   88  #include <inet/tcp_impl.h>
  88   89  #include <inet/ip_multi.h>
  89   90  #include <inet/ip_if.h>
  90   91  #include <inet/ip_ire.h>
  91   92  #include <inet/ip_ftable.h>
  92   93  #include <inet/ip_rts.h>
  93   94  #include <inet/ip_ndp.h>
  94   95  #include <inet/ip_listutils.h>
  95   96  #include <netinet/igmp.h>
  96   97  #include <netinet/ip_mroute.h>
  97   98  #include <inet/ipp_common.h>
  98   99  #include <inet/cc.h>
  99  100  
 100  101  #include <net/pfkeyv2.h>
 101  102  #include <inet/sadb.h>
 102  103  #include <inet/ipsec_impl.h>
 103  104  #include <inet/iptun/iptun_impl.h>
 104  105  #include <inet/ipdrop.h>
 105  106  #include <inet/ip_netinfo.h>
 106  107  #include <inet/ilb_ip.h>
 107  108  
 108  109  #include <sys/ethernet.h>
 109  110  #include <net/if_types.h>
 110  111  #include <sys/cpuvar.h>
 111  112  
 112  113  #include <ipp/ipp.h>
 113  114  #include <ipp/ipp_impl.h>
 114  115  #include <ipp/ipgpc/ipgpc.h>
 115  116  
 116  117  #include <sys/pattr.h>
 117  118  #include <inet/ipclassifier.h>
 118  119  #include <inet/sctp_ip.h>
 119  120  #include <inet/sctp/sctp_impl.h>
 120  121  #include <inet/udp_impl.h>
 121  122  #include <inet/rawip_impl.h>
 122  123  #include <inet/rts_impl.h>
 123  124  
 124  125  #include <sys/tsol/label.h>
 125  126  #include <sys/tsol/tnet.h>
 126  127  
 127  128  #include <sys/squeue_impl.h>
 128  129  #include <inet/ip_arp.h>
 129  130  
 130  131  #include <sys/clock_impl.h>     /* For LBOLT_FASTPATH{,64} */
 131  132  
 132  133  /*
 133  134   * Values for squeue switch:
 134  135   * IP_SQUEUE_ENTER_NODRAIN: SQ_NODRAIN
 135  136   * IP_SQUEUE_ENTER: SQ_PROCESS
 136  137   * IP_SQUEUE_FILL: SQ_FILL
 137  138   */
 138  139  int ip_squeue_enter = IP_SQUEUE_ENTER;  /* Settable in /etc/system */
 139  140  
 140  141  int ip_squeue_flag;
 141  142  
 142  143  /*
 143  144   * Settable in /etc/system
 144  145   */
 145  146  int ip_poll_normal_ms = 100;
 146  147  int ip_poll_normal_ticks = 0;
 147  148  int ip_modclose_ackwait_ms = 3000;
 148  149  
 149  150  /*
 150  151   * It would be nice to have these present only in DEBUG systems, but the
 151  152   * current design of the global symbol checking logic requires them to be
 152  153   * unconditionally present.
 153  154   */
 154  155  uint_t ip_thread_data;                  /* TSD key for debug support */
 155  156  krwlock_t ip_thread_rwlock;
 156  157  list_t  ip_thread_list;
 157  158  
 158  159  /*
 159  160   * Head/tail pair for a linked list of mblk_t message blocks. Used by ip_snmp_ functions.
 160  161   */
 161  162  
 162  163  struct listptr_s {
 163  164          mblk_t  *lp_head;       /* first mblk_t in the list */
 164  165          mblk_t  *lp_tail;       /* last mblk_t in the list */
 165  166  };
 166  167  
 167  168  typedef struct listptr_s listptr_t;
 168  169  
 169  170  /*
 170  171   * This is used by ip_snmp_get_mib2_ip_route_media and
 171  172   * ip_snmp_get_mib2_ip6_route_media to carry the lists of return data.
 172  173   */
 173  174  typedef struct iproutedata_s {
 174  175          uint_t          ird_idx;
 175  176          uint_t          ird_flags;      /* IRD_* flags; see below */
 176  177          listptr_t       ird_route;      /* ipRouteEntryTable */
 177  178          listptr_t       ird_netmedia;   /* ipNetToMediaEntryTable */
 178  179          listptr_t       ird_attrs;      /* ipRouteAttributeTable */
 179  180  } iproutedata_t;
 180  181  
 181  182  /* Include ire_testhidden and IRE_IF_CLONE routes */
 182  183  #define IRD_REPORT_ALL  0x01
 183  184  
 184  185  /*
 185  186   * Cluster-specific hooks. These must be NULL on a non-clustered system.
 186  187   */
 187  188  
 188  189  /*
 189  190   * Hook functions to enable cluster networking.
 190  191   * On non-clustered systems these vectors must always be NULL.
 191  192   *
 192  193   * Hook function to check whether the specified IP address is a shared IP
 193  194   * address in the cluster.
 194  195   *
 195  196   */
 196  197  int (*cl_inet_isclusterwide)(netstackid_t stack_id, uint8_t protocol,
 197  198      sa_family_t addr_family, uint8_t *laddrp, void *args) = NULL;
 198  199  
 199  200  /*
 200  201   * Hook function to generate cluster wide ip fragment identifier
 201  202   */
 202  203  uint32_t (*cl_inet_ipident)(netstackid_t stack_id, uint8_t protocol,
 203  204      sa_family_t addr_family, uint8_t *laddrp, uint8_t *faddrp,
 204  205      void *args) = NULL;
 205  206  
 206  207  /*
 207  208   * Hook function to generate cluster wide SPI.
 208  209   */
 209  210  void (*cl_inet_getspi)(netstackid_t, uint8_t, uint8_t *, size_t,
 210  211      void *) = NULL;
 211  212  
 212  213  /*
 213  214   * Hook function to verify if the SPI is already utilized.
 214  215   */
 215  216  
 216  217  int (*cl_inet_checkspi)(netstackid_t, uint8_t, uint32_t, void *) = NULL;
 217  218  
 218  219  /*
 219  220   * Hook function to delete the SPI from the cluster wide repository.
 220  221   */
 221  222  
 222  223  void (*cl_inet_deletespi)(netstackid_t, uint8_t, uint32_t, void *) = NULL;
 223  224  
 224  225  /*
 225  226   * Hook function to inform the cluster when packet received on an IDLE SA
 226  227   */
 227  228  
 228  229  void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t,
 229  230      in6_addr_t, in6_addr_t, void *) = NULL;
 230  231  
 231  232  /*
 232  233   * Synchronization notes:
 233  234   *
 234  235   * IP is a fully D_MP STREAMS module/driver. Thus it does not depend on any
 235  236   * MT level protection given by STREAMS. IP uses a combination of its own
 236  237   * internal serialization mechanism and standard Solaris locking techniques.
 237  238   * The internal serialization is per phyint.  This is used to serialize
 238  239   * plumbing operations, IPMP operations, most set ioctls, etc.
 239  240   *
 240  241   * Plumbing is a long sequence of operations involving message
 241  242   * exchanges between IP, ARP and device drivers. Many set ioctls are typically
 242  243   * involved in plumbing operations. A natural model is to serialize these
 243  244   * ioctls one per ill. For example plumbing of hme0 and qfe0 can go on in
 244  245   * parallel without any interference. But various set ioctls on hme0 are best
 245  246   * serialized, along with IPMP operations and processing of DLPI control
 246  247   * messages received from drivers on a per phyint basis. This serialization is
 247  248   * provided by the ipsq_t and primitives operating on this. Details can
 248  249   * be found in ip_if.c above the core primitives operating on ipsq_t.
 249  250   *
 250  251   * Lookups of an ipif or ill by a thread return a refheld ipif / ill.
 251  252   * Similarly, lookup of an ire by a thread also returns a refheld ire.
 252  253   * In addition ipif's and ill's referenced by the ire are also indirectly
 253  254   * refheld. Thus no ipif or ill can vanish as long as an ipif is refheld
 254  255   * directly or indirectly. For example an SIOCSLIFADDR ioctl that changes the
 255  256   * address of an ipif has to go through the ipsq_t. This ensures that only
 256  257   * one such exclusive operation proceeds at any time on the ipif. It then
 257  258   * waits for all refcnts
 258  259   * associated with this ipif to come down to zero. The address is changed
 259  260   * only after the ipif has been quiesced. Then the ipif is brought up again.
 260  261   * More details are described above the comment in ip_sioctl_flags.
 261  262   *
 262  263   * Packet processing is based mostly on IREs and are fully multi-threaded
 263  264   * using standard Solaris MT techniques.
 264  265   *
 265  266   * There are explicit locks in IP to handle:
 266  267   * - The ip_g_head list maintained by mi_open_link() and friends.
 267  268   *
 268  269   * - The reassembly data structures (one lock per hash bucket)
 269  270   *
 270  271   * - conn_lock is meant to protect conn_t fields. The fields actually
 271  272   *   protected by conn_lock are documented in the conn_t definition.
 272  273   *
 273  274   * - ire_lock to protect some of the fields of the ire, IRE tables
 274  275   *   (one lock per hash bucket). Refer to ip_ire.c for details.
 275  276   *
 276  277   * - ndp_g_lock and ncec_lock for protecting NCEs.
 277  278   *
 278  279   * - ill_lock protects fields of the ill and ipif. Details in ip.h
 279  280   *
 280  281   * - ill_g_lock: This is a global reader/writer lock. Protects the following
 281  282   *      * The AVL tree based global multi list of all ills.
 282  283   *      * The linked list of all ipifs of an ill
 283  284   *      * The <ipsq-xop> mapping
 284  285   *      * <ill-phyint> association
 285  286   *   Insertion/deletion of an ill in the system, insertion/deletion of an ipif
 286  287   *   into an ill, changing the <ipsq-xop> mapping of an ill, changing the
 287  288   *   <ill-phyint> assoc of an ill will all have to hold the ill_g_lock as
 288  289   *   writer for the actual duration of the insertion/deletion/change.
 289  290   *
 290  291   * - ill_lock:  This is a per ill mutex.
 291  292   *   It protects some members of the ill_t struct; see ip.h for details.
 292  293   *   It also protects the <ill-phyint> assoc.
 293  294   *   It also protects the list of ipifs hanging off the ill.
 294  295   *
 295  296   * - ipsq_lock: This is a per ipsq_t mutex lock.
 296  297   *   This protects some members of the ipsq_t struct; see ip.h for details.
 297  298   *   It also protects the <ipsq-ipxop> mapping
 298  299   *
 299  300   * - ipx_lock: This is a per ipxop_t mutex lock.
 300  301   *   This protects some members of the ipxop_t struct; see ip.h for details.
 301  302   *
 302  303   * - phyint_lock: This is a per phyint mutex lock. Protects just the
 303  304   *   phyint_flags
 304  305   *
 305  306   * - ip_addr_avail_lock: This is used to ensure the uniqueness of IP addresses.
 306  307   *   This lock is held in ipif_up_done and the ipif is marked IPIF_UP and the
 307  308   *   uniqueness check also done atomically.
 308  309   *
 309  310   * - ill_g_usesrc_lock: This readers/writer lock protects the usesrc
 310  311   *   group list linked by ill_usesrc_grp_next. It also protects the
 311  312   *   ill_usesrc_ifindex field. It is taken as a writer when a member of the
 312  313   *   group is being added or deleted.  This lock is taken as a reader when
 313  314   *   walking the list/group(eg: to get the number of members in a usesrc group).
 314  315   *   Note, it is only necessary to take this lock if the ill_usesrc_grp_next
 315  316   *   field is changing state i.e from NULL to non-NULL or vice-versa. For
 316  317   *   example, it is not necessary to take this lock in the initial portion
 317  318   *   of ip_sioctl_slifusesrc or at all in ip_sioctl_flags since these
 318  319   *   operations are executed exclusively and that ensures that the "usesrc
 319  320   *   group state" cannot change. The "usesrc group state" change can happen
 320  321   *   only in the latter part of ip_sioctl_slifusesrc and in ill_delete.
 321  322   *
 322  323   * Changing <ill-phyint>, <ipsq-xop> associations:
 323  324   *
 324  325   * To change the <ill-phyint> association, the ill_g_lock must be held
 325  326   * as writer, and the ill_locks of both the v4 and v6 instance of the ill
 326  327   * must be held.
 327  328   *
 328  329   * To change the <ipsq-xop> association, the ill_g_lock must be held as
 329  330   * writer, the ipsq_lock must be held, and one must be writer on the ipsq.
 330  331   * This is only done when ills are added or removed from IPMP groups.
 331  332   *
 332  333   * To add or delete an ipif from the list of ipifs hanging off the ill,
 333  334   * ill_g_lock (writer) and ill_lock must be held and the thread must be
 334  335   * a writer on the associated ipsq.
 335  336   *
 336  337   * To add or delete an ill to the system, the ill_g_lock must be held as
 337  338   * writer and the thread must be a writer on the associated ipsq.
 338  339   *
 339  340   * To add or delete an ilm to an ill, the ill_lock must be held and the thread
 340  341   * must be a writer on the associated ipsq.
 341  342   *
 342  343   * Lock hierarchy
 343  344   *
 344  345   * Some lock hierarchy scenarios are listed below.
 345  346   *
 346  347   * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock -> ipx_lock
 347  348   * ill_g_lock -> ill_lock(s) -> phyint_lock
 348  349   * ill_g_lock -> ndp_g_lock -> ill_lock -> ncec_lock
 349  350   * ill_g_lock -> ip_addr_avail_lock
 350  351   * conn_lock -> irb_lock -> ill_lock -> ire_lock
 351  352   * ill_g_lock -> ip_g_nd_lock
 352  353   * ill_g_lock -> ips_ipmp_lock -> ill_lock -> nce_lock
 353  354   * ill_g_lock -> ndp_g_lock -> ill_lock -> ncec_lock -> nce_lock
 354  355   * arl_lock -> ill_lock
 355  356   * ips_ire_dep_lock -> irb_lock
 356  357   *
 357  358   * When more than 1 ill lock is needed to be held, all ill lock addresses
 358  359   * are sorted on address and locked starting from highest addressed lock
 359  360   * downward.
 360  361   *
 361  362   * Multicast scenarios
 362  363   * ips_ill_g_lock -> ill_mcast_lock
 363  364   * conn_ilg_lock -> ips_ill_g_lock -> ill_lock
 364  365   * ill_mcast_serializer -> ill_mcast_lock -> ips_ipmp_lock -> ill_lock
 365  366   * ill_mcast_serializer -> ill_mcast_lock -> connf_lock -> conn_lock
 366  367   * ill_mcast_serializer -> ill_mcast_lock -> conn_ilg_lock
 367  368   * ill_mcast_serializer -> ill_mcast_lock -> ips_igmp_timer_lock
 368  369   *
 369  370   * IPsec scenarios
 370  371   *
 371  372   * ipsa_lock -> ill_g_lock -> ill_lock
 372  373   * ill_g_usesrc_lock -> ill_g_lock -> ill_lock
 373  374   *
 374  375   * Trusted Solaris scenarios
 375  376   *
 376  377   * igsa_lock -> gcgrp_rwlock -> gcgrp_lock
 377  378   * igsa_lock -> gcdb_lock
 378  379   * gcgrp_rwlock -> ire_lock
 379  380   * gcgrp_rwlock -> gcdb_lock
 380  381   *
 381  382   * squeue(sq_lock), flow related (ft_lock, fe_lock) locking
 382  383   *
 383  384   * cpu_lock --> ill_lock --> sqset_lock --> sq_lock
 384  385   * sq_lock -> conn_lock -> QLOCK(q)
 385  386   * ill_lock -> ft_lock -> fe_lock
 386  387   *
 387  388   * Routing/forwarding table locking notes:
 388  389   *
 389  390   * Lock acquisition order: Radix tree lock, irb_lock.
 390  391   * Requirements:
 391  392   * i.  Walker must not hold any locks during the walker callback.
 392  393   * ii  Walker must not see a truncated tree during the walk because of any node
 393  394   *     deletion.
 394  395   * iii Existing code assumes ire_bucket is valid if it is non-null and is used
 395  396   *     in many places in the code to walk the irb list. Thus even if all the
 396  397   *     ires in a bucket have been deleted, we still can't free the radix node
 397  398   *     until the ires have actually been inactive'd (freed).
 398  399   *
 399  400   * Tree traversal - Need to hold the global tree lock in read mode.
 400  401   * Before dropping the global tree lock, need to either increment the ire_refcnt
 401  402   * to ensure that the radix node can't be deleted.
 402  403   *
 403  404   * Tree add - Need to hold the global tree lock in write mode to add a
 404  405   * radix node. To prevent the node from being deleted, increment the
 405  406   * irb_refcnt, after the node is added to the tree. The ire itself is
 406  407   * added later while holding the irb_lock, but not the tree lock.
 407  408   *
 408  409   * Tree delete - Need to hold the global tree lock and irb_lock in write mode.
 409  410   * All associated ires must be inactive (i.e. freed), and irb_refcnt
 410  411   * must be zero.
 411  412   *
 412  413   * Walker - Increment irb_refcnt before calling the walker callback. Hold the
 413  414   * global tree lock (read mode) for traversal.
 414  415   *
 415  416   * IRE dependencies - In some cases we hold ips_ire_dep_lock across ire_refrele
 416  417   * hence we will acquire irb_lock while holding ips_ire_dep_lock.
 417  418   *
 418  419   * IPsec notes :
 419  420   *
 420  421   * IP interacts with the IPsec code (AH/ESP) by storing IPsec attributes
 421  422   * in the ip_xmit_attr_t and ip_recv_attr_t. For outbound datagrams, the
 422  423   * ip_xmit_attr_t has the
 423  424   * information used by the IPsec code for applying the right level of
 424  425   * protection. The information initialized by IP in the ip_xmit_attr_t
 425  426   * is determined by the per-socket policy or global policy in the system.
 426  427   * For inbound datagrams, the ip_recv_attr_t
 427  428   * starts out with nothing in it. It gets filled
 428  429   * with the right information if it goes through the AH/ESP code, which
 429  430   * happens if the incoming packet is secure. The information initialized
 430  431   * by AH/ESP, is later used by IP (during fanouts to ULP) to see whether
 431  432   * the policy requirements needed by per-socket policy or global policy
 432  433   * are met or not.
 433  434   *
 434  435   * For fully connected sockets i.e dst, src [addr, port] is known,
 435  436   * conn_policy_cached is set indicating that policy has been cached.
 436  437   * conn_in_enforce_policy may or may not be set depending on whether
 437  438   * there is a global policy match or per-socket policy match.
 438  439   * Policy inheriting happens in ip_policy_set once the destination is known.
 439  440   * Once the right policy is set on the conn_t, policy cannot change for
 440  441   * this socket. This makes life simpler for TCP (UDP ?) where
 441  442   * re-transmissions go out with the same policy. For symmetry, policy
 442  443   * is cached for fully connected UDP sockets also. Thus if policy is cached,
 443  444   * it also implies that policy is latched i.e policy cannot change
 444  445   * on these sockets. As we have the right policy on the conn, we don't
 445  446   * have to lookup global policy for every outbound and inbound datagram
 446  447   * and thus serving as an optimization. Note that a global policy change
 447  448   * does not affect fully connected sockets if they have policy. If fully
 448  449   * connected sockets did not have any policy associated with it, global
 449  450   * policy change may affect them.
 450  451   *
 451  452   * IP Flow control notes:
 452  453   * ---------------------
 453  454   * Non-TCP streams are flow controlled by IP. The way this is accomplished
 454  455   * differs when ILL_CAPAB_DLD_DIRECT is enabled for that IP instance. When
 455  456   * ILL_DIRECT_CAPABLE(ill) is TRUE, IP can do direct function calls into
 456  457   * GLDv3. Otherwise packets are sent down to lower layers using STREAMS
 457  458   * functions.
 458  459   *
 459  460   * Per Tx ring udp flow control:
 460  461   * This is applicable only when ILL_CAPAB_DLD_DIRECT capability is set in
 461  462   * the ill (i.e. ILL_DIRECT_CAPABLE(ill) is true).
 462  463   *
 463  464   * The underlying link can expose multiple Tx rings to the GLDv3 mac layer.
 464  465   * To achieve best performance, outgoing traffic needs to be fanned out among
 465  466   * these Tx rings. mac_tx() is called (via str_mdata_fastpath_put()) to send
 466  467   * traffic out of the NIC and it takes a fanout hint. UDP connections pass
 467  468   * the address of connp as fanout hint to mac_tx(). Under flow controlled
 468  469   * condition, mac_tx() returns a non-NULL cookie (ip_mac_tx_cookie_t). This
 469  470   * cookie points to a specific Tx ring that is blocked. The cookie is used to
 470  471   * hash into an entry in the idl_tx_list[] array. Each idl_tx_list_t
 471  472   * points to drain_lists (idl_t's). These drain lists will store the blocked UDP
 472  473   * connp's. The drain list is not a single list but a configurable number of
 473  474   * lists.
 474  475   *
 475  476   * The diagram below shows idl_tx_list_t's and their drain_lists. ip_stack_t
 476  477   * has an array of idl_tx_list_t. The size of the array is TX_FANOUT_SIZE
 477  478   * which is equal to 128. This array in turn contains a pointer to idl_t[],
 478  479   * the ip drain list. The idl_t[] array size is MIN(max_ncpus, 8). The drain
 479  480   * list will point to the list of connp's that are flow controlled.
 480  481   *
 481  482   *                      ---------------   -------   -------   -------
 482  483   *                   |->|drain_list[0]|-->|connp|-->|connp|-->|connp|-->
 483  484   *                   |  ---------------   -------   -------   -------
 484  485   *                   |  ---------------   -------   -------   -------
 485  486   *                   |->|drain_list[1]|-->|connp|-->|connp|-->|connp|-->
 486  487   * ----------------  |  ---------------   -------   -------   -------
 487  488   * |idl_tx_list[0]|->|  ---------------   -------   -------   -------
 488  489   * ----------------  |->|drain_list[2]|-->|connp|-->|connp|-->|connp|-->
 489  490   *                   |  ---------------   -------   -------   -------
 490  491   *                   .        .              .         .         .
 491  492   *                   |  ---------------   -------   -------   -------
 492  493   *                   |->|drain_list[n]|-->|connp|-->|connp|-->|connp|-->
 493  494   *                      ---------------   -------   -------   -------
 494  495   *                      ---------------   -------   -------   -------
 495  496   *                   |->|drain_list[0]|-->|connp|-->|connp|-->|connp|-->
 496  497   *                   |  ---------------   -------   -------   -------
 497  498   *                   |  ---------------   -------   -------   -------
 498  499   * ----------------  |->|drain_list[1]|-->|connp|-->|connp|-->|connp|-->
 499  500   * |idl_tx_list[1]|->|  ---------------   -------   -------   -------
 500  501   * ----------------  |        .              .         .         .
 501  502   *                   |  ---------------   -------   -------   -------
 502  503   *                   |->|drain_list[n]|-->|connp|-->|connp|-->|connp|-->
 503  504   *                      ---------------   -------   -------   -------
 504  505   *     .....
 505  506   * ----------------
 506  507   * |idl_tx_list[n]|-> ...
 507  508   * ----------------
 508  509   *
 509  510   * When mac_tx() returns a cookie, the cookie is hashed into an index into
 510  511   * ips_idl_tx_list[], and conn_drain_insert() is called with the idl_tx_list
 511  512   * to insert the conn onto.  conn_drain_insert() asserts flow control for the
 512  513   * sockets via su_txq_full() (non-STREAMS) or QFULL on conn_wq (STREAMS).
 513  514   * Further, conn_blocked is set to indicate that the conn is blocked.
 514  515   *
 515  516   * GLDv3 calls ill_flow_enable() when flow control is relieved.  The cookie
 516  517   * passed in the call to ill_flow_enable() identifies the blocked Tx ring and
 517  518   * is again hashed to locate the appropriate idl_tx_list, which is then
 518  519   * drained via conn_walk_drain().  conn_walk_drain() goes through each conn in
 519  520   * the drain list and calls conn_drain_remove() to clear flow control (via
 520  521   * calling su_txq_full() or clearing QFULL), and remove the conn from the
 521  522   * drain list.
 522  523   *
 523  524   * Note that the drain list is not a single list but a (configurable) array of
 524  525   * lists (8 elements by default).  Synchronization between drain insertion and
 525  526   * flow control wakeup is handled by using idl_txl->txl_lock, and only
 526  527   * conn_drain_insert() and conn_drain_remove() manipulate the drain list.
 527  528   *
 528  529   * Flow control via STREAMS is used when ILL_DIRECT_CAPABLE() returns FALSE.
 529  530   * On the send side, if the packet cannot be sent down to the driver by IP
 530  531   * (canput() fails), ip_xmit() drops the packet and returns EWOULDBLOCK to the
 531  532   * caller, who may then invoke ixa_check_drain_insert() to insert the conn on
 532  533   * the 0'th drain list.  When ip_wsrv() runs on the ill_wq because flow
 533  534   * control has been relieved, the blocked conns in the 0'th drain list are
 534  535   * drained as in the non-STREAMS case.
 535  536   *
 536  537   * In both the STREAMS and non-STREAMS cases, the sockfs upcall to set QFULL
 537  538   * is done when the conn is inserted into the drain list (conn_drain_insert())
 538  539   * and cleared when the conn is removed from it (conn_drain_remove()).
 539  540   *
 540  541   * IPQOS notes:
 541  542   *
 542  543   * IPQoS Policies are applied to packets using IPPF (IP Policy framework)
 543  544   * and IPQoS modules. IPPF includes hooks in IP at different control points
 544  545   * (callout positions) which direct packets to IPQoS modules for policy
 545  546   * processing. Policies, if present, are global.
 546  547   *
 547  548   * The callout positions are located in the following paths:
 548  549   *              o local_in (packets destined for this host)
 549  550   *              o local_out (packets originating from this host)
 550  551   *              o fwd_in  (packets forwarded by this m/c - inbound)
 551  552   *              o fwd_out (packets forwarded by this m/c - outbound)
 552  553   * Hooks at these callout points can be enabled/disabled using the ndd variable
 553  554   * ip_policy_mask (a bit mask with the 4 LSB indicating the callout positions).
 554  555   * By default all the callout positions are enabled.
 555  556   *
 556  557   * Outbound (local_out)
 557  558   * Hooks are placed in ire_send_wire_v4 and ire_send_wire_v6.
 558  559   *
 559  560   * Inbound (local_in)
 560  561   * Hooks are placed in ip_fanout_v4 and ip_fanout_v6.
 561  562   *
 562  563   * Forwarding (in and out)
 563  564   * Hooks are placed in ire_recv_forward_v4/v6.
 564  565   *
 565  566   * IP Policy Framework processing (IPPF processing)
 566  567   * Policy processing for a packet is initiated by ip_process, which ascertains
 567  568   * that the classifier (ipgpc) is loaded and configured, failing which the
 568  569   * packet resumes normal processing in IP. If the classifier is present, the
 569  570   * packet is acted upon by one or more IPQoS modules (action instances), per
 570  571   * filters configured in ipgpc and resumes normal IP processing thereafter.
 571  572   * An action instance can drop a packet in course of its processing.
 572  573   *
 573  574   * Zones notes:
 574  575   *
 575  576   * The partitioning rules for networking are as follows:
 576  577   * 1) Packets coming from a zone must have a source address belonging to that
 577  578   * zone.
 578  579   * 2) Packets coming from a zone can only be sent on a physical interface on
 579  580   * which the zone has an IP address.
 580  581   * 3) Between two zones on the same machine, packet delivery is only allowed if
 581  582   * there's a matching route for the destination and zone in the forwarding
 582  583   * table.
 583  584   * 4) The TCP and UDP port spaces are per-zone; that is, two processes in
 584  585   * different zones can bind to the same port with the wildcard address
 585  586   * (INADDR_ANY).
 586  587   *
 587  588   * The granularity of interface partitioning is at the logical interface level.
 588  589   * Therefore, every zone has its own IP addresses, and incoming packets can be
 589  590   * attributed to a zone unambiguously. A logical interface is placed into a zone
 590  591   * using the SIOCSLIFZONE ioctl; this sets the ipif_zoneid field in the ipif_t
 591  592   * structure. Rule (1) is implemented by modifying the source address selection
 592  593   * algorithm so that the list of eligible addresses is filtered based on the
 593  594   * sending process zone.
 594  595   *
 595  596   * The Internet Routing Entries (IREs) are either exclusive to a zone or shared
 596  597   * across all zones, depending on their type. Here is the break-up:
 597  598   *
 598  599   * IRE type                             Shared/exclusive
 599  600   * --------                             ----------------
 600  601   * IRE_BROADCAST                        Exclusive
 601  602   * IRE_DEFAULT (default routes)         Shared (*)
 602  603   * IRE_LOCAL                            Exclusive (x)
 603  604   * IRE_LOOPBACK                         Exclusive
 604  605   * IRE_PREFIX (net routes)              Shared (*)
 605  606   * IRE_IF_NORESOLVER (interface routes) Exclusive
 606  607   * IRE_IF_RESOLVER (interface routes)   Exclusive
 607  608   * IRE_IF_CLONE (interface routes)      Exclusive
 608  609   * IRE_HOST (host routes)               Shared (*)
 609  610   *
 610  611   * (*) A zone can only use a default or off-subnet route if the gateway is
 611  612   * directly reachable from the zone, that is, if the gateway's address matches
 612  613   * one of the zone's logical interfaces.
 613  614   *
 614  615   * (x) IRE_LOCAL are handled a bit differently.
 615  616   * When ip_restrict_interzone_loopback is set (the default),
 616  617   * ire_route_recursive restricts loopback using an IRE_LOCAL
 617  618   * between zones to the case when L2 would have conceptually looped the packet
 618  619   * back, i.e. the loopback which is required since neither Ethernet drivers
 619  620   * nor Ethernet hardware loops them back. This is the case when the normal
 620  621   * routes (ignoring IREs with different zoneids) would send out the packet on
 621  622   * the same ill as the ill with which the IRE_LOCAL is associated.
 622  623   *
 623  624   * Multiple zones can share a common broadcast address; typically all zones
 624  625   * share the 255.255.255.255 address. Incoming as well as locally originated
 625  626   * broadcast packets must be dispatched to all the zones on the broadcast
 626  627   * network. For directed broadcasts (e.g. 10.16.72.255) this is not trivial
 627  628   * since some zones may not be on the 10.16.72/24 network. To handle this, each
 628  629   * zone has its own set of IRE_BROADCAST entries; then, broadcast packets are
 629  630   * sent to every zone that has an IRE_BROADCAST entry for the destination
 630  631   * address on the input ill, see ip_input_broadcast().
 631  632   *
 632  633   * Applications in different zones can join the same multicast group address.
 633  634   * The same logic applies for multicast as for broadcast. ip_input_multicast
 634  635   * dispatches packets to all zones that have members on the physical interface.
 635  636   */
 636  637  
 637  638  /*
 638  639   * Squeue fanout setting for ip_squeue_fanout (initialized to 0 below):
 639  640   *      0: No fanout.
 640  641   *      1: Fanout across all squeues.
 641  642   */
 642  643  boolean_t       ip_squeue_fanout = 0;
 643  644  
 644  645  /*
 645  646   * Maximum dups (presumably duplicate fragments) allowed per packet; set to 10.
 646  647   */
 647  648  uint_t ip_max_frag_dups = 10;
 648  649  
 649  650  static int      ip_open(queue_t *q, dev_t *devp, int flag, int sflag,
 650  651                      cred_t *credp, boolean_t isv6);
 651  652  static mblk_t   *ip_xmit_attach_llhdr(mblk_t *, nce_t *);
 652  653  
 653  654  static boolean_t icmp_inbound_verify_v4(mblk_t *, icmph_t *, ip_recv_attr_t *);
 654  655  static void     icmp_inbound_too_big_v4(icmph_t *, ip_recv_attr_t *);
 655  656  static void     icmp_inbound_error_fanout_v4(mblk_t *, icmph_t *,
 656  657      ip_recv_attr_t *);
 657  658  static void     icmp_options_update(ipha_t *);
 658  659  static void     icmp_param_problem(mblk_t *, uint8_t,  ip_recv_attr_t *);
 659  660  static void     icmp_pkt(mblk_t *, void *, size_t, ip_recv_attr_t *);
 660  661  static mblk_t   *icmp_pkt_err_ok(mblk_t *, ip_recv_attr_t *);
 661  662  static void     icmp_redirect_v4(mblk_t *mp, ipha_t *, icmph_t *,
 662  663      ip_recv_attr_t *);
 663  664  static void     icmp_send_redirect(mblk_t *, ipaddr_t, ip_recv_attr_t *);
 664  665  static void     icmp_send_reply_v4(mblk_t *, ipha_t *, icmph_t *,
 665  666      ip_recv_attr_t *);
 666  667  
 667  668  mblk_t          *ip_dlpi_alloc(size_t, t_uscalar_t);
 668  669  char            *ip_dot_addr(ipaddr_t, char *);
 669  670  mblk_t          *ip_carve_mp(mblk_t **, ssize_t);
 670  671  static char     *ip_dot_saddr(uchar_t *, char *);
 671  672  static int      ip_lrput(queue_t *, mblk_t *);
 672  673  ipaddr_t        ip_net_mask(ipaddr_t);
 673  674  char            *ip_nv_lookup(nv_t *, int);
 674  675  int             ip_rput(queue_t *, mblk_t *);
 675  676  static void     ip_rput_dlpi_writer(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp,
 676  677                      void *dummy_arg);
 677  678  int             ip_snmp_get(queue_t *, mblk_t *, int, boolean_t);
 678  679  static mblk_t   *ip_snmp_get_mib2_ip(queue_t *, mblk_t *,
 679  680                      mib2_ipIfStatsEntry_t *, ip_stack_t *, boolean_t);
 680  681  static mblk_t   *ip_snmp_get_mib2_ip_traffic_stats(queue_t *, mblk_t *,
 681  682                      ip_stack_t *, boolean_t);
 682  683  static mblk_t   *ip_snmp_get_mib2_ip6(queue_t *, mblk_t *, ip_stack_t *,
 683  684                      boolean_t);
 684  685  static mblk_t   *ip_snmp_get_mib2_icmp(queue_t *, mblk_t *, ip_stack_t *ipst);
 685  686  static mblk_t   *ip_snmp_get_mib2_icmp6(queue_t *, mblk_t *, ip_stack_t *ipst);
 686  687  static mblk_t   *ip_snmp_get_mib2_igmp(queue_t *, mblk_t *, ip_stack_t *ipst);
 687  688  static mblk_t   *ip_snmp_get_mib2_multi(queue_t *, mblk_t *, ip_stack_t *ipst);
 688  689  static mblk_t   *ip_snmp_get_mib2_ip_addr(queue_t *, mblk_t *,
 689  690                      ip_stack_t *ipst, boolean_t);
 690  691  static mblk_t   *ip_snmp_get_mib2_ip6_addr(queue_t *, mblk_t *,
 691  692                      ip_stack_t *ipst, boolean_t);
 692  693  static mblk_t   *ip_snmp_get_mib2_ip_group_src(queue_t *, mblk_t *,
 693  694                      ip_stack_t *ipst);
 694  695  static mblk_t   *ip_snmp_get_mib2_ip6_group_src(queue_t *, mblk_t *,
 695  696                      ip_stack_t *ipst);
 696  697  static mblk_t   *ip_snmp_get_mib2_ip_group_mem(queue_t *, mblk_t *,
 697  698                      ip_stack_t *ipst);
 698  699  static mblk_t   *ip_snmp_get_mib2_ip6_group_mem(queue_t *, mblk_t *,
 699  700                      ip_stack_t *ipst);
 700  701  static mblk_t   *ip_snmp_get_mib2_virt_multi(queue_t *, mblk_t *,
 701  702                      ip_stack_t *ipst);
 702  703  static mblk_t   *ip_snmp_get_mib2_multi_rtable(queue_t *, mblk_t *,
 703  704                      ip_stack_t *ipst);
 704  705  static mblk_t   *ip_snmp_get_mib2_ip_route_media(queue_t *, mblk_t *, int,
 705  706                      ip_stack_t *ipst);
 706  707  static mblk_t   *ip_snmp_get_mib2_ip6_route_media(queue_t *, mblk_t *, int,
 707  708                      ip_stack_t *ipst);
 708  709  static void     ip_snmp_get2_v4(ire_t *, iproutedata_t *);
 709  710  static void     ip_snmp_get2_v6_route(ire_t *, iproutedata_t *);
 710  711  static void     ip_snmp_get2_v4_media(ncec_t *, void *);
 711  712  static void     ip_snmp_get2_v6_media(ncec_t *, void *);
 712  713  int             ip_snmp_set(queue_t *, int, int, uchar_t *, int);
 713  714  
 714  715  static mblk_t   *ip_fragment_copyhdr(uchar_t *, int, int, ip_stack_t *,
 715  716                      mblk_t *);
 716  717  
 717  718  static void     conn_drain_init(ip_stack_t *);
 718  719  static void     conn_drain_fini(ip_stack_t *);
 719  720  static void     conn_drain(conn_t *connp, boolean_t closing);
 720  721  
 721  722  static void     conn_walk_drain(ip_stack_t *, idl_tx_list_t *);
 722  723  static void     conn_walk_sctp(pfv_t, void *, zoneid_t, netstack_t *);
 723  724  
 724  725  static void     *ip_stack_init(netstackid_t stackid, netstack_t *ns);
 725  726  static void     ip_stack_shutdown(netstackid_t stackid, void *arg);
 726  727  static void     ip_stack_fini(netstackid_t stackid, void *arg);
 727  728  
 728  729  static int      ip_multirt_apply_membership(int (*fn)(conn_t *, boolean_t,
 729  730      const in6_addr_t *, ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *),
 730  731      ire_t *, conn_t *, boolean_t, const in6_addr_t *,  mcast_record_t,
 731  732      const in6_addr_t *);
 732  733  
 733  734  static int      ip_squeue_switch(int);
 734  735  
 735  736  static void     *ip_kstat_init(netstackid_t, ip_stack_t *);
 736  737  static void     ip_kstat_fini(netstackid_t, kstat_t *);
 737  738  static int      ip_kstat_update(kstat_t *kp, int rw);
 738  739  static void     *icmp_kstat_init(netstackid_t);
 739  740  static void     icmp_kstat_fini(netstackid_t, kstat_t *);
 740  741  static int      icmp_kstat_update(kstat_t *kp, int rw);
 741  742  static void     *ip_kstat2_init(netstackid_t, ip_stat_t *);
 742  743  static void     ip_kstat2_fini(netstackid_t, kstat_t *);
 743  744  
 744  745  static void     ipobs_init(ip_stack_t *);
 745  746  static void     ipobs_fini(ip_stack_t *);
 746  747  
 747  748  static int      ip_tp_cpu_update(cpu_setup_t, int, void *);
 748  749  
 749  750  ipaddr_t        ip_g_all_ones = IP_HOST_MASK;
 750  751  
 751  752  static long ip_rput_pullups;
 752  753  int     dohwcksum = 1;  /* use h/w cksum if supported by the hardware */
 753  754  
 754  755  vmem_t *ip_minor_arena_sa; /* for minor nos. from INET_MIN_DEV+2 thru 2^^18-1 */
 755  756  vmem_t *ip_minor_arena_la; /* for minor nos. from 2^^18 thru 2^^32-1 */
 756  757  
 757  758  int     ip_debug;
 758  759  
 759  760  /*
 760  761   * Multirouting/CGTP stuff
 761  762   */
 762  763  int     ip_cgtp_filter_rev = CGTP_FILTER_REV;   /* CGTP hooks version */
 763  764  
 764  765  /*
 765  766   * IP tunables related declarations. Definitions are in ip_tunables.c
 766  767   */
 767  768  extern mod_prop_info_t ip_propinfo_tbl[];
 768  769  extern int ip_propinfo_count;
 769  770  
 770  771  /*
 771  772   * Table of IP ioctls encoding the various properties of the ioctl and
 772  773   * indexed based on the last byte of the ioctl command. Occasionally there
 773  774   * is a clash, and there is more than 1 ioctl with the same last byte.
 774  775   * In such a case 1 ioctl is encoded in the ndx table and the remaining
 775  776   * ioctls are encoded in the misc table. An entry in the ndx table is
 776  777   * retrieved by indexing on the last byte of the ioctl command and comparing
 777  778   * the ioctl command with the value in the ndx table. In the event of a
 778  779   * mismatch the misc table is then searched sequentially for the desired
 779  780   * ioctl command.
 780  781   *
 781  782   * Entry: <command> <copyin_size> <flags> <cmd_type> <function> <restart_func>
 782  783   */
 783  784  ip_ioctl_cmd_t ip_ndx_ioctl_table[] = {
 784  785          /* 000 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 785  786          /* 001 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 786  787          /* 002 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 787  788          /* 003 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 788  789          /* 004 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 789  790          /* 005 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 790  791          /* 006 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 791  792          /* 007 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 792  793          /* 008 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 793  794          /* 009 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 794  795  
 795  796          /* 010 */ { SIOCADDRT,  sizeof (struct rtentry), IPI_PRIV,
 796  797                          MISC_CMD, ip_siocaddrt, NULL },
 797  798          /* 011 */ { SIOCDELRT,  sizeof (struct rtentry), IPI_PRIV,
 798  799                          MISC_CMD, ip_siocdelrt, NULL },
 799  800  
 800  801          /* 012 */ { SIOCSIFADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
 801  802                          IF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart },
 802  803          /* 013 */ { SIOCGIFADDR, sizeof (struct ifreq), IPI_GET_CMD,
 803  804                          IF_CMD, ip_sioctl_get_addr, NULL },
 804  805  
 805  806          /* 014 */ { SIOCSIFDSTADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
 806  807                          IF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart },
 807  808          /* 015 */ { SIOCGIFDSTADDR, sizeof (struct ifreq),
 808  809                          IPI_GET_CMD, IF_CMD, ip_sioctl_get_dstaddr, NULL },
 809  810  
 810  811          /* 016 */ { SIOCSIFFLAGS, sizeof (struct ifreq),
 811  812                          IPI_PRIV | IPI_WR,
 812  813                          IF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart },
 813  814          /* 017 */ { SIOCGIFFLAGS, sizeof (struct ifreq),
 814  815                          IPI_MODOK | IPI_GET_CMD,
 815  816                          IF_CMD, ip_sioctl_get_flags, NULL },
 816  817  
 817  818          /* 018 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 818  819          /* 019 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 819  820  
 820  821          /* copyin size cannot be coded for SIOCGIFCONF */
 821  822          /* 020 */ { O_SIOCGIFCONF, 0, IPI_GET_CMD,
 822  823                          MISC_CMD, ip_sioctl_get_ifconf, NULL },
 823  824  
 824  825          /* 021 */ { SIOCSIFMTU, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
 825  826                          IF_CMD, ip_sioctl_mtu, NULL },
 826  827          /* 022 */ { SIOCGIFMTU, sizeof (struct ifreq), IPI_GET_CMD,
 827  828                          IF_CMD, ip_sioctl_get_mtu, NULL },
 828  829          /* 023 */ { SIOCGIFBRDADDR, sizeof (struct ifreq),
 829  830                          IPI_GET_CMD, IF_CMD, ip_sioctl_get_brdaddr, NULL },
 830  831          /* 024 */ { SIOCSIFBRDADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
 831  832                          IF_CMD, ip_sioctl_brdaddr, NULL },
 832  833          /* 025 */ { SIOCGIFNETMASK, sizeof (struct ifreq),
 833  834                          IPI_GET_CMD, IF_CMD, ip_sioctl_get_netmask, NULL },
 834  835          /* 026 */ { SIOCSIFNETMASK, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
 835  836                          IF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart },
 836  837          /* 027 */ { SIOCGIFMETRIC, sizeof (struct ifreq),
 837  838                          IPI_GET_CMD, IF_CMD, ip_sioctl_get_metric, NULL },
 838  839          /* 028 */ { SIOCSIFMETRIC, sizeof (struct ifreq), IPI_PRIV,
 839  840                          IF_CMD, ip_sioctl_metric, NULL },
 840  841          /* 029 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 841  842  
 842  843          /* See 166-168 below for extended SIOC*XARP ioctls */
 843  844          /* 030 */ { SIOCSARP, sizeof (struct arpreq), IPI_PRIV | IPI_WR,
 844  845                          ARP_CMD, ip_sioctl_arp, NULL },
 845  846          /* 031 */ { SIOCGARP, sizeof (struct arpreq), IPI_GET_CMD,
 846  847                          ARP_CMD, ip_sioctl_arp, NULL },
 847  848          /* 032 */ { SIOCDARP, sizeof (struct arpreq), IPI_PRIV | IPI_WR,
 848  849                          ARP_CMD, ip_sioctl_arp, NULL },
 849  850  
 850  851          /* 033 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 851  852          /* 034 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 852  853          /* 035 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 853  854          /* 036 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 854  855          /* 037 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 855  856          /* 038 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 856  857          /* 039 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 857  858          /* 040 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 858  859          /* 041 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 859  860          /* 042 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 860  861          /* 043 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 861  862          /* 044 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 862  863          /* 045 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 863  864          /* 046 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 864  865          /* 047 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 865  866          /* 048 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 866  867          /* 049 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 867  868          /* 050 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 868  869          /* 051 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 869  870          /* 052 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 870  871          /* 053 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 871  872  
 872  873          /* 054 */ { IF_UNITSEL, sizeof (int), IPI_PRIV | IPI_WR | IPI_MODOK,
 873  874                          MISC_CMD, if_unitsel, if_unitsel_restart },
 874  875  
 875  876          /* 055 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 876  877          /* 056 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 877  878          /* 057 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 878  879          /* 058 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 879  880          /* 059 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 880  881          /* 060 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 881  882          /* 061 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 882  883          /* 062 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 883  884          /* 063 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 884  885          /* 064 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 885  886          /* 065 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 886  887          /* 066 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 887  888          /* 067 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 888  889          /* 068 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 889  890          /* 069 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 890  891          /* 070 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 891  892          /* 071 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 892  893          /* 072 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 893  894  
 894  895          /* 073 */ { SIOCSIFNAME, sizeof (struct ifreq),
 895  896                          IPI_PRIV | IPI_WR | IPI_MODOK,
 896  897                          IF_CMD, ip_sioctl_sifname, NULL },
 897  898  
 898  899          /* 074 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 899  900          /* 075 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 900  901          /* 076 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 901  902          /* 077 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 902  903          /* 078 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 903  904          /* 079 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 904  905          /* 080 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 905  906          /* 081 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 906  907          /* 082 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 907  908          /* 083 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 908  909          /* 084 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 909  910          /* 085 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 910  911          /* 086 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 911  912  
 912  913          /* 087 */ { SIOCGIFNUM, sizeof (int), IPI_GET_CMD,
 913  914                          MISC_CMD, ip_sioctl_get_ifnum, NULL },
 914  915          /* 088 */ { SIOCGIFMUXID, sizeof (struct ifreq), IPI_GET_CMD,
 915  916                          IF_CMD, ip_sioctl_get_muxid, NULL },
 916  917          /* 089 */ { SIOCSIFMUXID, sizeof (struct ifreq),
 917  918                          IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_muxid, NULL },
 918  919  
 919  920          /* Both if and lif variants share same func */
 920  921          /* 090 */ { SIOCGIFINDEX, sizeof (struct ifreq), IPI_GET_CMD,
 921  922                          IF_CMD, ip_sioctl_get_lifindex, NULL },
 922  923          /* Both if and lif variants share same func */
 923  924          /* 091 */ { SIOCSIFINDEX, sizeof (struct ifreq),
 924  925                          IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_slifindex, NULL },
 925  926  
 926  927          /* copyin size cannot be coded for SIOCGIFCONF */
 927  928          /* 092 */ { SIOCGIFCONF, 0, IPI_GET_CMD,
 928  929                          MISC_CMD, ip_sioctl_get_ifconf, NULL },
 929  930          /* 093 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 930  931          /* 094 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 931  932          /* 095 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 932  933          /* 096 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 933  934          /* 097 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 934  935          /* 098 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 935  936          /* 099 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 936  937          /* 100 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 937  938          /* 101 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 938  939          /* 102 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 939  940          /* 103 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 940  941          /* 104 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 941  942          /* 105 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 942  943          /* 106 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 943  944          /* 107 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 944  945          /* 108 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 945  946          /* 109 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 946  947  
 947  948          /* 110 */ { SIOCLIFREMOVEIF, sizeof (struct lifreq),
 948  949                          IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_removeif,
 949  950                          ip_sioctl_removeif_restart },
 950  951          /* 111 */ { SIOCLIFADDIF, sizeof (struct lifreq),
 951  952                          IPI_GET_CMD | IPI_PRIV | IPI_WR,
 952  953                          LIF_CMD, ip_sioctl_addif, NULL },
 953  954  #define SIOCLIFADDR_NDX 112
 954  955          /* 112 */ { SIOCSLIFADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
 955  956                          LIF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart },
 956  957          /* 113 */ { SIOCGLIFADDR, sizeof (struct lifreq),
 957  958                          IPI_GET_CMD, LIF_CMD, ip_sioctl_get_addr, NULL },
 958  959          /* 114 */ { SIOCSLIFDSTADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
 959  960                          LIF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart },
 960  961          /* 115 */ { SIOCGLIFDSTADDR, sizeof (struct lifreq),
 961  962                          IPI_GET_CMD, LIF_CMD, ip_sioctl_get_dstaddr, NULL },
 962  963          /* 116 */ { SIOCSLIFFLAGS, sizeof (struct lifreq),
 963  964                          IPI_PRIV | IPI_WR,
 964  965                          LIF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart },
 965  966          /* 117 */ { SIOCGLIFFLAGS, sizeof (struct lifreq),
 966  967                          IPI_GET_CMD | IPI_MODOK,
 967  968                          LIF_CMD, ip_sioctl_get_flags, NULL },
 968  969  
 969  970          /* 118 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 970  971          /* 119 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 971  972  
 972  973          /* 120 */ { O_SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD,
 973  974                          ip_sioctl_get_lifconf, NULL },
 974  975          /* 121 */ { SIOCSLIFMTU, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
 975  976                          LIF_CMD, ip_sioctl_mtu, NULL },
 976  977          /* 122 */ { SIOCGLIFMTU, sizeof (struct lifreq), IPI_GET_CMD,
 977  978                          LIF_CMD, ip_sioctl_get_mtu, NULL },
 978  979          /* 123 */ { SIOCGLIFBRDADDR, sizeof (struct lifreq),
 979  980                          IPI_GET_CMD, LIF_CMD, ip_sioctl_get_brdaddr, NULL },
 980  981          /* 124 */ { SIOCSLIFBRDADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
 981  982                          LIF_CMD, ip_sioctl_brdaddr, NULL },
 982  983          /* 125 */ { SIOCGLIFNETMASK, sizeof (struct lifreq),
 983  984                          IPI_GET_CMD, LIF_CMD, ip_sioctl_get_netmask, NULL },
 984  985          /* 126 */ { SIOCSLIFNETMASK, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
 985  986                          LIF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart },
 986  987          /* 127 */ { SIOCGLIFMETRIC, sizeof (struct lifreq),
 987  988                          IPI_GET_CMD, LIF_CMD, ip_sioctl_get_metric, NULL },
 988  989          /* 128 */ { SIOCSLIFMETRIC, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
 989  990                          LIF_CMD, ip_sioctl_metric, NULL },
 990  991          /* 129 */ { SIOCSLIFNAME, sizeof (struct lifreq),
 991  992                          IPI_PRIV | IPI_WR | IPI_MODOK,
 992  993                          LIF_CMD, ip_sioctl_slifname,
 993  994                          ip_sioctl_slifname_restart },
 994  995  
 995  996          /* 130 */ { SIOCGLIFNUM, sizeof (struct lifnum), IPI_GET_CMD,
 996  997                          MISC_CMD, ip_sioctl_get_lifnum, NULL },
 997  998          /* 131 */ { SIOCGLIFMUXID, sizeof (struct lifreq),
 998  999                          IPI_GET_CMD, LIF_CMD, ip_sioctl_get_muxid, NULL },
 999 1000          /* 132 */ { SIOCSLIFMUXID, sizeof (struct lifreq),
1000 1001                          IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_muxid, NULL },
1001 1002          /* 133 */ { SIOCGLIFINDEX, sizeof (struct lifreq),
1002 1003                          IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lifindex, 0 },
1003 1004          /* 134 */ { SIOCSLIFINDEX, sizeof (struct lifreq),
1004 1005                          IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_slifindex, 0 },
1005 1006          /* 135 */ { SIOCSLIFTOKEN, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
1006 1007                          LIF_CMD, ip_sioctl_token, NULL },
1007 1008          /* 136 */ { SIOCGLIFTOKEN, sizeof (struct lifreq),
1008 1009                          IPI_GET_CMD, LIF_CMD, ip_sioctl_get_token, NULL },
1009 1010          /* 137 */ { SIOCSLIFSUBNET, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
1010 1011                          LIF_CMD, ip_sioctl_subnet, ip_sioctl_subnet_restart },
1011 1012          /* 138 */ { SIOCGLIFSUBNET, sizeof (struct lifreq),
1012 1013                          IPI_GET_CMD, LIF_CMD, ip_sioctl_get_subnet, NULL },
1013 1014          /* 139 */ { SIOCSLIFLNKINFO, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
1014 1015                          LIF_CMD, ip_sioctl_lnkinfo, NULL },
1015 1016  
1016 1017          /* 140 */ { SIOCGLIFLNKINFO, sizeof (struct lifreq),
1017 1018                          IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lnkinfo, NULL },
1018 1019          /* 141 */ { SIOCLIFDELND, sizeof (struct lifreq), IPI_PRIV,
1019 1020                          LIF_CMD, ip_siocdelndp_v6, NULL },
1020 1021          /* 142 */ { SIOCLIFGETND, sizeof (struct lifreq), IPI_GET_CMD,
1021 1022                          LIF_CMD, ip_siocqueryndp_v6, NULL },
1022 1023          /* 143 */ { SIOCLIFSETND, sizeof (struct lifreq), IPI_PRIV,
1023 1024                          LIF_CMD, ip_siocsetndp_v6, NULL },
1024 1025          /* 144 */ { SIOCTMYADDR, sizeof (struct sioc_addrreq), IPI_GET_CMD,
1025 1026                          MISC_CMD, ip_sioctl_tmyaddr, NULL },
1026 1027          /* 145 */ { SIOCTONLINK, sizeof (struct sioc_addrreq), IPI_GET_CMD,
1027 1028                          MISC_CMD, ip_sioctl_tonlink, NULL },
1028 1029          /* 146 */ { SIOCTMYSITE, sizeof (struct sioc_addrreq), 0,
1029 1030                          MISC_CMD, ip_sioctl_tmysite, NULL },
1030 1031          /* 147 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1031 1032          /* 148 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1032 1033  
1033 1034          /* Old *IPSECONFIG ioctls are now deprecated, now see spdsock.c */
1034 1035          /* 149 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1035 1036          /* 150 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1036 1037          /* 151 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1037 1038          /* 152 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1038 1039  
1039 1040          /* 153 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1040 1041  
1041 1042          /* 154 */ { SIOCGLIFBINDING, sizeof (struct lifreq), IPI_GET_CMD,
1042 1043                          LIF_CMD, ip_sioctl_get_binding, NULL },
1043 1044          /* 155 */ { SIOCSLIFGROUPNAME, sizeof (struct lifreq),
1044 1045                          IPI_PRIV | IPI_WR,
1045 1046                          LIF_CMD, ip_sioctl_groupname, ip_sioctl_groupname },
1046 1047          /* 156 */ { SIOCGLIFGROUPNAME, sizeof (struct lifreq),
1047 1048                          IPI_GET_CMD, LIF_CMD, ip_sioctl_get_groupname, NULL },
1048 1049          /* 157 */ { SIOCGLIFGROUPINFO, sizeof (lifgroupinfo_t),
1049 1050                          IPI_GET_CMD, MISC_CMD, ip_sioctl_groupinfo, NULL },
1050 1051  
1051 1052          /* Leave 158-160 unused; used to be SIOC*IFARP ioctls */
1052 1053          /* 158 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1053 1054          /* 159 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1054 1055          /* 160 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1055 1056  
1056 1057          /* 161 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1057 1058  
1058 1059          /* These are handled in ip_sioctl_copyin_setup itself */
1059 1060          /* 162 */ { SIOCGIP6ADDRPOLICY, 0, IPI_NULL_BCONT,
1060 1061                          MISC_CMD, NULL, NULL },
1061 1062          /* 163 */ { SIOCSIP6ADDRPOLICY, 0, IPI_PRIV | IPI_NULL_BCONT,
1062 1063                          MISC_CMD, NULL, NULL },
1063 1064          /* 164 */ { SIOCGDSTINFO, 0, IPI_GET_CMD, MISC_CMD, NULL, NULL },
1064 1065  
1065 1066          /* 165 */ { SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD,
1066 1067                          ip_sioctl_get_lifconf, NULL },
1067 1068  
1068 1069          /* 166 */ { SIOCSXARP, sizeof (struct xarpreq), IPI_PRIV | IPI_WR,
1069 1070                          XARP_CMD, ip_sioctl_arp, NULL },
1070 1071          /* 167 */ { SIOCGXARP, sizeof (struct xarpreq), IPI_GET_CMD,
1071 1072                          XARP_CMD, ip_sioctl_arp, NULL },
1072 1073          /* 168 */ { SIOCDXARP, sizeof (struct xarpreq), IPI_PRIV | IPI_WR,
1073 1074                          XARP_CMD, ip_sioctl_arp, NULL },
1074 1075  
1075 1076          /* SIOCPOPSOCKFS is not handled by IP */
1076 1077          /* 169 */ { IPI_DONTCARE /* SIOCPOPSOCKFS */, 0, 0, 0, NULL, NULL },
1077 1078  
1078 1079          /* 170 */ { SIOCGLIFZONE, sizeof (struct lifreq),
1079 1080                          IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lifzone, NULL },
1080 1081          /* 171 */ { SIOCSLIFZONE, sizeof (struct lifreq),
1081 1082                          IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_slifzone,
1082 1083                          ip_sioctl_slifzone_restart },
1083 1084          /* 172-174 are SCTP ioctls and not handled by IP */
1084 1085          /* 172 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1085 1086          /* 173 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1086 1087          /* 174 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1087 1088          /* 175 */ { SIOCGLIFUSESRC, sizeof (struct lifreq),
1088 1089                          IPI_GET_CMD, LIF_CMD,
1089 1090                          ip_sioctl_get_lifusesrc, 0 },
1090 1091          /* 176 */ { SIOCSLIFUSESRC, sizeof (struct lifreq),
1091 1092                          IPI_PRIV | IPI_WR,
1092 1093                          LIF_CMD, ip_sioctl_slifusesrc,
1093 1094                          NULL },
1094 1095          /* 177 */ { SIOCGLIFSRCOF, 0, IPI_GET_CMD, MISC_CMD,
1095 1096                          ip_sioctl_get_lifsrcof, NULL },
1096 1097          /* 178 */ { SIOCGMSFILTER, sizeof (struct group_filter), IPI_GET_CMD,
1097 1098                          MSFILT_CMD, ip_sioctl_msfilter, NULL },
1098 1099          /* 179 */ { SIOCSMSFILTER, sizeof (struct group_filter), 0,
1099 1100                          MSFILT_CMD, ip_sioctl_msfilter, NULL },
1100 1101          /* 180 */ { SIOCGIPMSFILTER, sizeof (struct ip_msfilter), IPI_GET_CMD,
1101 1102                          MSFILT_CMD, ip_sioctl_msfilter, NULL },
1102 1103          /* 181 */ { SIOCSIPMSFILTER, sizeof (struct ip_msfilter), 0,
1103 1104                          MSFILT_CMD, ip_sioctl_msfilter, NULL },
1104 1105          /* 182 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1105 1106          /* SIOCSENABLESDP is handled by SDP */
1106 1107          /* 183 */ { IPI_DONTCARE /* SIOCSENABLESDP */, 0, 0, 0, NULL, NULL },
1107 1108          /* 184 */ { IPI_DONTCARE /* SIOCSQPTR */, 0, 0, 0, NULL, NULL },
1108 1109          /* 185 */ { SIOCGIFHWADDR, sizeof (struct ifreq), IPI_GET_CMD,
1109 1110                          IF_CMD, ip_sioctl_get_ifhwaddr, NULL },
1110 1111          /* 186 */ { IPI_DONTCARE /* SIOCGSTAMP */, 0, 0, 0, NULL, NULL },
1111 1112          /* 187 */ { SIOCILB, 0, IPI_PRIV | IPI_GET_CMD, MISC_CMD,
1112 1113                          ip_sioctl_ilb_cmd, NULL },
1113 1114          /* 188 */ { SIOCGETPROP, 0, IPI_GET_CMD, 0, NULL, NULL },
1114 1115          /* 189 */ { SIOCSETPROP, 0, IPI_PRIV | IPI_WR, 0, NULL, NULL},
1115 1116          /* 190 */ { SIOCGLIFDADSTATE, sizeof (struct lifreq),
1116 1117                          IPI_GET_CMD, LIF_CMD, ip_sioctl_get_dadstate, NULL },
1117 1118          /* 191 */ { SIOCSLIFPREFIX, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
1118 1119                          LIF_CMD, ip_sioctl_prefix, ip_sioctl_prefix_restart },
1119 1120          /* 192 */ { SIOCGLIFHWADDR, sizeof (struct lifreq), IPI_GET_CMD,
1120 1121                          LIF_CMD, ip_sioctl_get_lifhwaddr, NULL }
1121 1122  };
1122 1123  
1123 1124  int ip_ndx_ioctl_count = sizeof (ip_ndx_ioctl_table) / sizeof (ip_ioctl_cmd_t);
           /* ip_ndx_ioctl_count: number of entries in ip_ndx_ioctl_table. */
1124 1125  
1125 1126  ip_ioctl_cmd_t ip_misc_ioctl_table[] = {
                   /*
                    * Going by the entries in this file, each initializer lists in
                    * order: the ioctl command, the size of its argument structure,
                    * IPI_* flags, a *_CMD command type, and up to two function
                    * pointers.  See the ip_ioctl_cmd_t definition for the
                    * authoritative member names.
                    */
1126 1127          { I_LINK,       0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
1127 1128          { I_UNLINK,     0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
1128 1129          { I_PLINK,      0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
1129 1130          { I_PUNLINK,    0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
1130 1131          { ND_GET,       0, 0, 0, NULL, NULL },
1131 1132          { ND_SET,       0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
1132 1133          { IP_IOCTL,     0, 0, 0, NULL, NULL },
                   /*
                    * The three entries below give only five initializers; any
                    * remaining members are implicitly zero-initialized.
                    */
1133 1134          { SIOCGETVIFCNT, sizeof (struct sioc_vif_req), IPI_GET_CMD,
1134 1135                  MISC_CMD, mrt_ioctl},
1135 1136          { SIOCGETSGCNT, sizeof (struct sioc_sg_req), IPI_GET_CMD,
1136 1137                  MISC_CMD, mrt_ioctl},
1137 1138          { SIOCGETLSGCNT, sizeof (struct sioc_lsg_req), IPI_GET_CMD,
1138 1139                  MISC_CMD, mrt_ioctl}
1139 1140  };
1140 1141  
           /* Number of entries in ip_misc_ioctl_table. */
1141 1142  int ip_misc_ioctl_count =
1142 1143      sizeof (ip_misc_ioctl_table) / sizeof (ip_ioctl_cmd_t);
1143 1144  
1144 1145  int     conn_drain_nthreads;            /* Number of drainers required. */
1145 1146                                          /* Settable in /etc/system */
1146 1147  /* Defined in ip_ire.c */
1147 1148  extern uint32_t ip_ire_max_bucket_cnt, ip6_ire_max_bucket_cnt;
1148 1149  extern uint32_t ip_ire_min_bucket_cnt, ip6_ire_min_bucket_cnt;
1149 1150  extern uint32_t ip_ire_mem_ratio, ip_ire_cpu_ratio;
1150 1151  
1151 1152  static nv_t     ire_nv_arr[] = {
                   /*
                    * Pairs each IRE_* type constant with a short printable name.
                    * The list ends with an all-zero entry.
                    */
1152 1153          { IRE_BROADCAST, "BROADCAST" },
1153 1154          { IRE_LOCAL, "LOCAL" },
1154 1155          { IRE_LOOPBACK, "LOOPBACK" },
1155 1156          { IRE_DEFAULT, "DEFAULT" },
1156 1157          { IRE_PREFIX, "PREFIX" },
1157 1158          { IRE_IF_NORESOLVER, "IF_NORESOL" },
1158 1159          { IRE_IF_RESOLVER, "IF_RESOLV" },
1159 1160          { IRE_IF_CLONE, "IF_CLONE" },
1160 1161          { IRE_HOST, "HOST" },
1161 1162          { IRE_MULTICAST, "MULTICAST" },
1162 1163          { IRE_NOROUTE, "NOROUTE" },
1163 1164          { 0 }
1164 1165  };
1165 1166  
           /* Non-static pointer through which the static array above is reached. */
1166 1167  nv_t    *ire_nv_tbl = ire_nv_arr;
1167 1168  
1168 1169  /*
            * Simple ICMP IP Header Template.
            * The first member is IP_SIMPLE_HDR_VERSION and the seventh is
            * IPPROTO_ICMP (presumably ipha_protocol - confirm against ipha_t).
            * Members not listed in the initializer are implicitly zero.
            */
1169 1170  static ipha_t icmp_ipha = {
1170 1171          IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
1171 1172  };
1172 1173  
1173 1174  struct module_info ip_mod_info = {
                   /*
                    * Going by the IP_MOD_* macro names: module id, module name,
                    * minimum and maximum packet size, high and low water marks.
                    * Every qinit structure in this file points at this structure.
                    */
1174 1175          IP_MOD_ID, IP_MOD_NAME, IP_MOD_MINPSZ, IP_MOD_MAXPSZ, IP_MOD_HIWAT,
1175 1176          IP_MOD_LOWAT
1176 1177  };
1177 1178  
1178 1179  /*
1179 1180   * Duplicate static symbols within a module confuse mdb, so we avoid the
1180 1181   * problem by making the symbols here distinct from those in udp.c.
1181 1182   */
1182 1183  
1183 1184  /*
1184 1185   * Entry points for IP as a device and as a module.
1185 1186   * We have separate open functions for the /dev/ip and /dev/ip6 devices.
            *
            * iprinitv4 and iprinitv6 differ in their first entry (ip_rput vs.
            * ip_rput_v6) and in the open routine (ip_openv4 vs. ip_openv6); both
            * use ip_close.  ipwinit, iplrinit and iplwinit are referenced by both
            * ipinfov4 and ipinfov6.  Every qinit here points at ip_mod_info.
            * Note that iprinitv6 is the only one that is not declared static.
1186 1187   */
1187 1188  static struct qinit iprinitv4 = {
1188 1189          ip_rput, NULL, ip_openv4, ip_close, NULL, &ip_mod_info
1189 1190  };
1190 1191  
1191 1192  struct qinit iprinitv6 = {
1192 1193          ip_rput_v6, NULL, ip_openv6, ip_close, NULL, &ip_mod_info
1193 1194  };
1194 1195  
           /* The only qinit here with a second entry (ip_wsrv); no open/close. */
1195 1196  static struct qinit ipwinit = {
1196 1197          ip_wput_nondata, ip_wsrv, NULL, NULL, NULL, &ip_mod_info
1197 1198  };
1198 1199  
           /* Uses the IPv4 open routine; referenced by both streamtabs. */
1199 1200  static struct qinit iplrinit = {
1200 1201          ip_lrput, NULL, ip_openv4, ip_close, NULL, &ip_mod_info
1201 1202  };
1202 1203  
1203 1204  static struct qinit iplwinit = {
1204 1205          ip_lwput, NULL, NULL, NULL, NULL, &ip_mod_info
1205 1206  };
1206 1207  
1207 1208  /*
            * For AF_INET aka /dev/ip.
            * Differs from ipinfov6 only in its first member (iprinitv4).
            */
1208 1209  struct streamtab ipinfov4 = {
1209 1210          &iprinitv4, &ipwinit, &iplrinit, &iplwinit
1210 1211  };
1211 1212  
1212 1213  /*
            * For AF_INET6 aka /dev/ip6.
            * Differs from ipinfov4 only in its first member (iprinitv6).
            */
1213 1214  struct streamtab ipinfov6 = {
1214 1215          &iprinitv6, &ipwinit, &iplrinit, &iplwinit
1215 1216  };
1216 1217  
1217 1218  #ifdef  DEBUG
           /*
            * Only defined in DEBUG builds; starts out as B_FALSE.
            * NOTE(review): the name suggests that SCTP checksum handling is
            * skipped when this is set - confirm at its use sites.
            */
1218 1219  boolean_t skip_sctp_cksum = B_FALSE;
1219 1220  #endif
1220 1221  
1221 1222  /*
1222 1223   * Generate an ICMP fragmentation needed message.
1223 1224   * When called from ip_output side a minimal ip_recv_attr_t needs to be
1224 1225   * constructed by the caller.
            *
            * mp  - passed through icmp_pkt_err_ok(); nothing is sent when that
            *       returns NULL, otherwise the returned message is handed to
            *       icmp_pkt().
            * mtu - value placed in icmph_du_mtu; it is cast to uint16_t first, so
            *       only the low 16 bits are used.
            * ira - receive attributes; ira->ira_ill->ill_ipst selects the IP stack
            *       whose ICMP MIB counters are bumped.
1225 1226   */
1226 1227  void
1227 1228  icmp_frag_needed(mblk_t *mp, int mtu, ip_recv_attr_t *ira)
1228 1229  {
1229 1230          icmph_t icmph;
1230 1231          ip_stack_t      *ipst = ira->ira_ill->ill_ipst;
1231 1232  
                   /*
                    * NOTE(review): presumably icmp_pkt_err_ok() has already
                    * disposed of mp when it returns NULL - confirm.
                    */
1232 1233          mp = icmp_pkt_err_ok(mp, ira);
1233 1234          if (mp == NULL)
1234 1235                  return;
1235 1236  
                   /* Build the ICMP header on the stack, starting from all zeros. */
1236 1237          bzero(&icmph, sizeof (icmph_t));
1237 1238          icmph.icmph_type = ICMP_DEST_UNREACHABLE;
1238 1239          icmph.icmph_code = ICMP_FRAGMENTATION_NEEDED;
1239 1240          icmph.icmph_du_mtu = htons((uint16_t)mtu);
1240 1241          BUMP_MIB(&ipst->ips_icmp_mib, icmpOutFragNeeded);
1241 1242          BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDestUnreachs);
1242 1243  
1243 1244          icmp_pkt(mp, &icmph, sizeof (icmph_t), ira);
1244 1245  }
1245 1246  
1246 1247  /*
1247 1248   * icmp_inbound_v4 deals with ICMP messages that are handled by IP.
1248 1249   * If the ICMP message is consumed by IP, i.e., it should not be delivered
1249 1250   * to any IPPROTO_ICMP raw sockets, then it returns NULL.
1250 1251   * Likewise, if the ICMP error is malformed (too short, etc.), then it
1251 1252   * returns NULL. The caller uses this to determine whether or not to send
1252 1253   * to raw sockets.
1253 1254   *
1254 1255   * All error messages are passed to the matching transport stream.
1255 1256   *
1256 1257   * The following cases are handled by icmp_inbound_v4:
1257 1258   * 1) It needs to send a reply back and possibly delivering it
1258 1259   *    to the "interested" upper clients.
1259 1260   * 2) Return the mblk so that the caller can pass it to the RAW socket clients.
1260 1261   * 3) It needs to change some values in IP only.
1261 1262   * 4) It needs to change some values in IP and upper layers e.g TCP
1262 1263   *    by delivering an error to the upper layers.
1263 1264   *
1264 1265   * We handle the above four cases in the context of IPsec in the
1265 1266   * following way :
1266 1267   *
1267 1268   * 1) Send the reply back in the same way as the request came in.
1268 1269   *    If it came in encrypted, it goes out encrypted. If it came in
1269 1270   *    clear, it goes out in clear. Thus, this will prevent chosen
1270 1271   *    plain text attack.
1271 1272   * 2) The client may or may not expect things to come in secure.
1272 1273   *    If it comes in secure, the policy constraints are checked
1273 1274   *    before delivering it to the upper layers. If it comes in
1274 1275   *    clear, ipsec_inbound_accept_clear will decide whether to
1275 1276   *    accept this in clear or not. In both the cases, if the returned
1276 1277   *    message (IP header + 8 bytes) that caused the icmp message has
1277 1278   *    AH/ESP headers, it is sent up to AH/ESP for validation before
1278 1279   *    sending up. If there are only 8 bytes of returned message, then
1279 1280   *    upper client will not be notified.
1280 1281   * 3) Check with global policy to see whether it matches the constraints.
1281 1282   *    But this will be done only if icmp_accept_messages_in_clear is
1282 1283   *    zero.
1283 1284   * 4) If we need to change both in IP and ULP, then the decision taken
1284 1285   *    while affecting the values in IP and while delivering up to TCP
1285 1286   *    should be the same.
1286 1287   *
1287 1288   *      There are two cases.
1288 1289   *
1289 1290   *      a) If we reject data at the IP layer (ipsec_check_global_policy()
1290 1291   *         failed), we will not deliver it to the ULP, even though they
1291 1292   *         are *willing* to accept in *clear*. This is fine as our global
1292 1293   *         disposition to icmp messages asks us reject the datagram.
1293 1294   *
1294 1295   *      b) If we accept data at the IP layer (ipsec_check_global_policy()
1295 1296   *         succeeded or icmp_accept_messages_in_clear is 1), and not able
1296 1297   *         to deliver it to ULP (policy failed), it can lead to
1297 1298   *         consistency problems. The cases known at this time are
1298 1299   *         ICMP_DEST_UNREACHABLE messages with the following code
1299 1300   *         values :
1300 1301   *
1301 1302   *         - ICMP_FRAGMENTATION_NEEDED : IP adapts to the new value
1302 1303   *           and Upper layer rejects. Then the communication will
1303 1304   *           come to a stop. This is solved by making similar decisions
1304 1305   *           at both levels. Currently, when we are unable to deliver
1305 1306   *           to the Upper Layer (due to policy failures) while IP has
1306 1307   *           adjusted dce_pmtu, the next outbound datagram would
1307 1308   *           generate a local ICMP_FRAGMENTATION_NEEDED message - which
1308 1309   *           will be with the right level of protection. Thus the right
1309 1310   *           value will be communicated even if we are not able to
1310 1311   *           communicate when we get from the wire initially. But this
1311 1312   *           assumes there would be at least one outbound datagram after
1312 1313   *           IP has adjusted its dce_pmtu value. To make things
1313 1314   *           simpler, we accept in clear after the validation of
1314 1315   *           AH/ESP headers.
1315 1316   *
1316 1317   *         - Other ICMP ERRORS : We may not be able to deliver it to the
1317 1318   *           upper layer depending on the level of protection the upper
1318 1319   *           layer expects and the disposition in ipsec_inbound_accept_clear().
1319 1320   *           ipsec_inbound_accept_clear() decides whether a given ICMP error
1320 1321   *           should be accepted in clear when the Upper layer expects secure.
1321 1322   *           Thus the communication may get aborted by some bad ICMP
1322 1323   *           packets.
1323 1324   */
1324 1325  mblk_t *
1325 1326  icmp_inbound_v4(mblk_t *mp, ip_recv_attr_t *ira)
1326 1327  {
1327 1328          icmph_t         *icmph;
1328 1329          ipha_t          *ipha;          /* Outer header */
1329 1330          int             ip_hdr_length;  /* Outer header length */
1330 1331          boolean_t       interested;
1331 1332          ipif_t          *ipif;
1332 1333          uint32_t        ts;
1333 1334          uint32_t        *tsp;
1334 1335          timestruc_t     now;
1335 1336          ill_t           *ill = ira->ira_ill;
1336 1337          ip_stack_t      *ipst = ill->ill_ipst;
1337 1338          zoneid_t        zoneid = ira->ira_zoneid;
1338 1339          int             len_needed;
1339 1340          mblk_t          *mp_ret = NULL; /* Copy handed back to the caller */
1340 1341  
1341 1342          ipha = (ipha_t *)mp->b_rptr;
1342 1343  
1343 1344          BUMP_MIB(&ipst->ips_icmp_mib, icmpInMsgs);
1344 1345  
                   /*
                    * The IP header plus the fixed ICMP header must be in the first
                    * mblk. If the packet as a whole is too short, drop it;
                    * otherwise try to pull the bytes up.
                    */
1345 1346          ip_hdr_length = ira->ira_ip_hdr_length;
1346 1347          if ((mp->b_wptr - mp->b_rptr) < (ip_hdr_length + ICMPH_SIZE)) {
1347 1348                  if (ira->ira_pktlen < (ip_hdr_length + ICMPH_SIZE)) {
1348 1349                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
1349 1350                          ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
1350 1351                          freemsg(mp);
1351 1352                          return (NULL);
1352 1353                  }
1353 1354                  /* Last chance to get real. */
1354 1355                  ipha = ip_pullup(mp, ip_hdr_length + ICMPH_SIZE, ira);
1355 1356                  if (ipha == NULL) {
1356 1357                          BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors);
1357 1358                          freemsg(mp);
1358 1359                          return (NULL);
1359 1360                  }
1360 1361          }
1361 1362  
1362 1363          /* The IP header will always be a multiple of four bytes */
1363 1364          icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
1364 1365          ip2dbg(("icmp_inbound_v4: type %d code %d\n", icmph->icmph_type,
1365 1366              icmph->icmph_code));
1366 1367  
1367 1368          /*
1368 1369           * We will set "interested" to "true" if we should pass a copy to
1369 1370           * the transport or if we handle the packet locally.
1370 1371           */
1371 1372          interested = B_FALSE;
1372 1373          switch (icmph->icmph_type) {
1373 1374          case ICMP_ECHO_REPLY:
1374 1375                  BUMP_MIB(&ipst->ips_icmp_mib, icmpInEchoReps);
1375 1376                  break;
1376 1377          case ICMP_DEST_UNREACHABLE:
1377 1378                  if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED)
1378 1379                          BUMP_MIB(&ipst->ips_icmp_mib, icmpInFragNeeded);
1379 1380                  interested = B_TRUE;    /* Pass up to transport */
1380 1381                  BUMP_MIB(&ipst->ips_icmp_mib, icmpInDestUnreachs);
1381 1382                  break;
1382 1383          case ICMP_SOURCE_QUENCH:
1383 1384                  interested = B_TRUE;    /* Pass up to transport */
1384 1385                  BUMP_MIB(&ipst->ips_icmp_mib, icmpInSrcQuenchs);
1385 1386                  break;
1386 1387          case ICMP_REDIRECT:
                           /* Redirects are only acted upon when not ignored. */
1387 1388                  if (!ipst->ips_ip_ignore_redirect)
1388 1389                          interested = B_TRUE;
1389 1390                  BUMP_MIB(&ipst->ips_icmp_mib, icmpInRedirects);
1390 1391                  break;
1391 1392          case ICMP_ECHO_REQUEST:
1392 1393                  /*
1393 1394                   * Whether to respond to echo requests that come in as IP
1394 1395                   * broadcasts or as IP multicast is subject to debate
1395 1396                   * (what isn't?).  We aim to please, you pick it.
1396 1397                   * Default is do it.
1397 1398                   */
1398 1399                  if (ira->ira_flags & IRAF_MULTICAST) {
1399 1400                          /* multicast: respond based on tunable */
1400 1401                          interested = ipst->ips_ip_g_resp_to_echo_mcast;
1401 1402                  } else if (ira->ira_flags & IRAF_BROADCAST) {
1402 1403                          /* broadcast: respond based on tunable */
1403 1404                          interested = ipst->ips_ip_g_resp_to_echo_bcast;
1404 1405                  } else {
1405 1406                          /* unicast: always respond */
1406 1407                          interested = B_TRUE;
1407 1408                  }
1408 1409                  BUMP_MIB(&ipst->ips_icmp_mib, icmpInEchos);
1409 1410                  if (!interested) {
1410 1411                          /* We never pass these to RAW sockets */
1411 1412                          freemsg(mp);
1412 1413                          return (NULL);
1413 1414                  }
1414 1415  
1415 1416                  /* Check db_ref to make sure we can modify the packet. */
1416 1417                  if (mp->b_datap->db_ref > 1) {
1417 1418                          mblk_t  *mp1;
1418 1419  
                                   /* The original is freed whether or not the copy worked. */
1419 1420                          mp1 = copymsg(mp);
1420 1421                          freemsg(mp);
1421 1422                          if (!mp1) {
1422 1423                                  BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
1423 1424                                  return (NULL);
1424 1425                          }
1425 1426                          mp = mp1;
1426 1427                          ipha = (ipha_t *)mp->b_rptr;
1427 1428                          icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
1428 1429                  }
                           /* Turn the request into the reply in place and send it. */
1429 1430                  icmph->icmph_type = ICMP_ECHO_REPLY;
1430 1431                  BUMP_MIB(&ipst->ips_icmp_mib, icmpOutEchoReps);
1431 1432                  icmp_send_reply_v4(mp, ipha, icmph, ira);
1432 1433                  return (NULL);
1433 1434  
1434 1435          case ICMP_ROUTER_ADVERTISEMENT:
1435 1436          case ICMP_ROUTER_SOLICITATION:
1436 1437                  break;
1437 1438          case ICMP_TIME_EXCEEDED:
1438 1439                  interested = B_TRUE;    /* Pass up to transport */
1439 1440                  BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimeExcds);
1440 1441                  break;
1441 1442          case ICMP_PARAM_PROBLEM:
1442 1443                  interested = B_TRUE;    /* Pass up to transport */
1443 1444                  BUMP_MIB(&ipst->ips_icmp_mib, icmpInParmProbs);
1444 1445                  break;
1445 1446          case ICMP_TIME_STAMP_REQUEST:
1446 1447                  /* Response to Time Stamp Requests is local policy. */
1447 1448                  if (ipst->ips_ip_g_resp_to_timestamp) {
1448 1449                          if (ira->ira_flags & IRAF_MULTIBROADCAST)
1449 1450                                  interested =
1450 1451                                      ipst->ips_ip_g_resp_to_timestamp_bcast;
1451 1452                          else
1452 1453                                  interested = B_TRUE;
1453 1454                  }
1454 1455                  if (!interested) {
1455 1456                          /* We never pass these to RAW sockets */
1456 1457                          freemsg(mp);
1457 1458                          return (NULL);
1458 1459                  }
1459 1460  
1460 1461                  /* Make sure we have enough of the packet */
1461 1462                  len_needed = ip_hdr_length + ICMPH_SIZE +
1462 1463                      3 * sizeof (uint32_t);
1463 1464  
1464 1465                  if (mp->b_wptr - mp->b_rptr < len_needed) {
1465 1466                          ipha = ip_pullup(mp, len_needed, ira);
1466 1467                          if (ipha == NULL) {
1467 1468                                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1468 1469                                  ip_drop_input("ipIfStatsInDiscards - ip_pullup",
1469 1470                                      mp, ill);
1470 1471                                  freemsg(mp);
1471 1472                                  return (NULL);
1472 1473                          }
1473 1474                          /* Refresh following the pullup. */
1474 1475                          icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
1475 1476                  }
1476 1477                  BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimestamps);
1477 1478                  /* Check db_ref to make sure we can modify the packet. */
1478 1479                  if (mp->b_datap->db_ref > 1) {
1479 1480                          mblk_t  *mp1;
1480 1481  
1481 1482                          mp1 = copymsg(mp);
1482 1483                          freemsg(mp);
1483 1484                          if (!mp1) {
1484 1485                                  BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
1485 1486                                  return (NULL);
1486 1487                          }
1487 1488                          mp = mp1;
1488 1489                          ipha = (ipha_t *)mp->b_rptr;
1489 1490                          icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
1490 1491                  }
1491 1492                  icmph->icmph_type = ICMP_TIME_STAMP_REPLY;
                           /* tsp starts at the first 32-bit word after the ICMP header. */
1492 1493                  tsp = (uint32_t *)&icmph[1];
1493 1494                  tsp++;          /* Skip past 'originate time' */
1494 1495                  /* Compute # of milliseconds since midnight */
1495 1496                  gethrestime(&now);
1496 1497                  ts = (now.tv_sec % (24 * 60 * 60)) * 1000 +
1497 1498                      NSEC2MSEC(now.tv_nsec);
1498 1499                  *tsp++ = htonl(ts);     /* Lay in 'receive time' */
1499 1500                  *tsp++ = htonl(ts);     /* Lay in 'send time' */
1500 1501                  BUMP_MIB(&ipst->ips_icmp_mib, icmpOutTimestampReps);
1501 1502                  icmp_send_reply_v4(mp, ipha, icmph, ira);
1502 1503                  return (NULL);
1503 1504  
1504 1505          case ICMP_TIME_STAMP_REPLY:
1505 1506                  BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimestampReps);
1506 1507                  break;
1507 1508          case ICMP_INFO_REQUEST:
1508 1509                  /* Per RFC 1122 3.2.2.7, ignore this. */
1509 1510          case ICMP_INFO_REPLY:
1510 1511                  break;
1511 1512          case ICMP_ADDRESS_MASK_REQUEST:
1512 1513                  if (ira->ira_flags & IRAF_MULTIBROADCAST) {
1513 1514                          interested =
1514 1515                              ipst->ips_ip_respond_to_address_mask_broadcast;
1515 1516                  } else {
1516 1517                          interested = B_TRUE;
1517 1518                  }
1518 1519                  if (!interested) {
1519 1520                          /* We never pass these to RAW sockets */
1520 1521                          freemsg(mp);
1521 1522                          return (NULL);
1522 1523                  }
                           /* The mask (IP_ADDR_LEN bytes) follows the ICMP header. */
1523 1524                  len_needed = ip_hdr_length + ICMPH_SIZE + IP_ADDR_LEN;
1524 1525                  if (mp->b_wptr - mp->b_rptr < len_needed) {
1525 1526                          ipha = ip_pullup(mp, len_needed, ira);
1526 1527                          if (ipha == NULL) {
1527 1528                                  BUMP_MIB(ill->ill_ip_mib,
1528 1529                                      ipIfStatsInTruncatedPkts);
1529 1530                                  ip_drop_input("ipIfStatsInTruncatedPkts", mp,
1530 1531                                      ill);
1531 1532                                  freemsg(mp);
1532 1533                                  return (NULL);
1533 1534                          }
1534 1535                          /* Refresh following the pullup. */
1535 1536                          icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
1536 1537                  }
1537 1538                  BUMP_MIB(&ipst->ips_icmp_mib, icmpInAddrMasks);
1538 1539                  /* Check db_ref to make sure we can modify the packet. */
1539 1540                  if (mp->b_datap->db_ref > 1) {
1540 1541                          mblk_t  *mp1;
1541 1542  
1542 1543                          mp1 = copymsg(mp);
1543 1544                          freemsg(mp);
1544 1545                          if (!mp1) {
1545 1546                                  BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
1546 1547                                  return (NULL);
1547 1548                          }
1548 1549                          mp = mp1;
1549 1550                          ipha = (ipha_t *)mp->b_rptr;
1550 1551                          icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
1551 1552                  }
1552 1553                  /*
1553 1554                   * Need the ipif with the mask be the same as the source
1554 1555                   * address of the mask reply. For unicast we have a specific
1555 1556                   * ipif. For multicast/broadcast we only handle onlink
1556 1557                   * senders, and use the source address to pick an ipif.
1557 1558                   */
1558 1559                  ipif = ipif_lookup_addr(ipha->ipha_dst, ill, zoneid, ipst);
1559 1560                  if (ipif == NULL) {
1560 1561                          /* Broadcast or multicast */
1561 1562                          ipif = ipif_lookup_remote(ill, ipha->ipha_src, zoneid);
1562 1563                          if (ipif == NULL) {
1563 1564                                  freemsg(mp);
1564 1565                                  return (NULL);
1565 1566                          }
1566 1567                  }
1567 1568                  icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY;
1568 1569                  bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN);
                           /* Done with the ipif once its mask has been copied out. */
1569 1570                  ipif_refrele(ipif);
1570 1571                  BUMP_MIB(&ipst->ips_icmp_mib, icmpOutAddrMaskReps);
1571 1572                  icmp_send_reply_v4(mp, ipha, icmph, ira);
1572 1573                  return (NULL);
1573 1574  
1574 1575          case ICMP_ADDRESS_MASK_REPLY:
1575 1576                  BUMP_MIB(&ipst->ips_icmp_mib, icmpInAddrMaskReps);
1576 1577                  break;
1577 1578          default:
1578 1579                  interested = B_TRUE;    /* Pass up to transport */
1579 1580                  BUMP_MIB(&ipst->ips_icmp_mib, icmpInUnknowns);
1580 1581                  break;
1581 1582          }
1582 1583          /*
1583 1584           * See if there is an ICMP client to avoid an extra copymsg/freemsg
1584 1585           * if there isn't one.
1585 1586           */
1586 1587          if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_ICMP].connf_head != NULL) {
1587 1588                  /* If there is an ICMP client and we want one too, copy it. */
1588 1589  
1589 1590                  if (!interested) {
1590 1591                          /* Caller will deliver to RAW sockets */
1591 1592                          return (mp);
1592 1593                  }
                           /*
                            * If the copy fails, mp_ret stays NULL and we still go on to
                            * process mp below.
                            */
1593 1594                  mp_ret = copymsg(mp);
1594 1595                  if (mp_ret == NULL) {
1595 1596                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1596 1597                          ip_drop_input("ipIfStatsInDiscards - copymsg", mp, ill);
1597 1598                  }
1598 1599          } else if (!interested) {
1599 1600                  /* Neither we nor raw sockets are interested. Drop packet now */
1600 1601                  freemsg(mp);
1601 1602                  return (NULL);
1602 1603          }
1603 1604  
1604 1605          /*
1605 1606           * ICMP error or redirect packet. Make sure we have enough of
1606 1607           * the header and that db_ref == 1 since we might end up modifying
1607 1608           * the packet.
1608 1609           */
1609 1610          if (mp->b_cont != NULL) {
                           /*
                            * NOTE(review): a length of -1 presumably asks ip_pullup to
                            * pull up the entire message - confirm against ip_pullup.
                            */
1610 1611                  if (ip_pullup(mp, -1, ira) == NULL) {
1611 1612                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1612 1613                          ip_drop_input("ipIfStatsInDiscards - ip_pullup",
1613 1614                              mp, ill);
1614 1615                          freemsg(mp);
1615 1616                          return (mp_ret);
1616 1617                  }
1617 1618          }
1618 1619  
1619 1620          if (mp->b_datap->db_ref > 1) {
1620 1621                  mblk_t  *mp1;
1621 1622  
1622 1623                  mp1 = copymsg(mp);
1623 1624                  if (mp1 == NULL) {
1624 1625                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1625 1626                          ip_drop_input("ipIfStatsInDiscards - copymsg", mp, ill);
1626 1627                          freemsg(mp);
1627 1628                          return (mp_ret);
1628 1629                  }
1629 1630                  freemsg(mp);
1630 1631                  mp = mp1;
1631 1632          }
1632 1633  
1633 1634          /*
1634 1635           * In case mp has changed, verify the message before any further
1635 1636           * processing.
1636 1637           */
1637 1638          ipha = (ipha_t *)mp->b_rptr;
1638 1639          icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
1639 1640          if (!icmp_inbound_verify_v4(mp, icmph, ira)) {
1640 1641                  freemsg(mp);
1641 1642                  return (mp_ret);
1642 1643          }
1643 1644  
1644 1645          switch (icmph->icmph_type) {
1645 1646          case ICMP_REDIRECT:
1646 1647                  icmp_redirect_v4(mp, ipha, icmph, ira);
1647 1648                  break;
1648 1649          case ICMP_DEST_UNREACHABLE:
1649 1650                  if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED) {
1650 1651                          /* Update DCE and adjust MTU in icmp header if needed */
1651 1652                          icmp_inbound_too_big_v4(icmph, ira);
1652 1653                  }
1653 1654                  /* FALLTHROUGH */
1654 1655          default:
1655 1656                  icmp_inbound_error_fanout_v4(mp, icmph, ira);
1656 1657                  break;
1657 1658          }
                   /* NULL unless a copy was made for the ICMP client above. */
1658 1659          return (mp_ret);
1659 1660  }
1660 1661  
1661 1662  /*
1662 1663   * Send an ICMP echo, timestamp or address mask reply.
1663 1664   * The caller has already updated the payload part of the packet.
1664 1665   * We handle the ICMP checksum, IP source address selection and feed
1665 1666   * the packet into ip_output_simple.
            *
            * mp    - the reply message; handed to ip_output_simple() (or to
            *         ipsec_in_to_out() when the request arrived IPsec-protected).
            * ipha  - IP header of the reply; its TTL, addresses, ident and possibly
            *         the DF flag are rewritten here.
            * icmph - ICMP header of the reply; its checksum is recomputed here.
            * ira   - receive attributes of the request; supplies the header length,
            *         zone, label and the IRAF_* flags consulted below.
1666 1667   */
1667 1668  static void
1668 1669  icmp_send_reply_v4(mblk_t *mp, ipha_t *ipha, icmph_t *icmph,
1669 1670      ip_recv_attr_t *ira)
1670 1671  {
1671 1672          uint_t          ip_hdr_length = ira->ira_ip_hdr_length;
1672 1673          ill_t           *ill = ira->ira_ill;
1673 1674          ip_stack_t      *ipst = ill->ill_ipst;
1674 1675          ip_xmit_attr_t  ixas;
1675 1676  
1676 1677          /* Send out an ICMP packet */
                   /* The checksum field must be zero while the sum is computed. */
1677 1678          icmph->icmph_checksum = 0;
1678 1679          icmph->icmph_checksum = IP_CSUM(mp, ip_hdr_length, 0);
1679 1680          /* Reset time to live. */
1680 1681          ipha->ipha_ttl = ipst->ips_ip_def_ttl;
1681 1682          {
1682 1683                  /* Swap source and destination addresses */
1683 1684                  ipaddr_t tmp;
1684 1685  
1685 1686                  tmp = ipha->ipha_src;
1686 1687                  ipha->ipha_src = ipha->ipha_dst;
1687 1688                  ipha->ipha_dst = tmp;
1688 1689          }
1689 1690          ipha->ipha_ident = 0;
                   /* A header carrying IP options needs those options updated too. */
1690 1691          if (!IS_SIMPLE_IPH(ipha))
1691 1692                  icmp_options_update(ipha);
1692 1693  
                   /* Build the transmit attributes on the stack from the receive side. */
1693 1694          bzero(&ixas, sizeof (ixas));
1694 1695          ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
1695 1696          ixas.ixa_zoneid = ira->ira_zoneid;
1696 1697          ixas.ixa_cred = kcred;
1697 1698          ixas.ixa_cpid = NOPID;
1698 1699          ixas.ixa_tsl = ira->ira_tsl;    /* Behave as a multi-level responder */
1699 1700          ixas.ixa_ifindex = 0;
1700 1701          ixas.ixa_ipst = ipst;
1701 1702          ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1702 1703  
1703 1704          if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) {
1704 1705                  /*
1705 1706                   * This packet should go out the same way as it
1706 1707                   * came in i.e in clear, independent of the IPsec policy
1707 1708                   * for transmitting packets.
1708 1709                   */
1709 1710                  ixas.ixa_flags |= IXAF_NO_IPSEC;
1710 1711          } else {
1711 1712                  if (!ipsec_in_to_out(ira, &ixas, mp, ipha, NULL)) {
1712 1713                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1713 1714                          /* Note: mp already consumed and ip_drop_packet done */
                                   /*
                                    * NOTE(review): this path returns without calling
                                    * ixa_cleanup(&ixas); confirm that ipsec_in_to_out
                                    * leaves nothing in ixas to release on failure.
                                    */
1714 1715                          return;
1715 1716                  }
1716 1717          }
1717 1718          if (ira->ira_flags & IRAF_MULTIBROADCAST) {
1718 1719                  /*
1719 1720                   * Not one of our addresses (IRE_LOCALs), thus we let
1720 1721                   * ip_output_simple pick the source.
1721 1722                   */
1722 1723                  ipha->ipha_src = INADDR_ANY;
1723 1724                  ixas.ixa_flags |= IXAF_SET_SOURCE;
1724 1725          }
1725 1726          /* Should we send with DF and use dce_pmtu? */
1726 1727          if (ipst->ips_ipv4_icmp_return_pmtu) {
1727 1728                  ixas.ixa_flags |= IXAF_PMTU_DISCOVERY;
1728 1729                  ipha->ipha_fragment_offset_and_flags |= IPH_DF_HTONS;
1729 1730          }
1730 1731  
1731 1732          BUMP_MIB(&ipst->ips_icmp_mib, icmpOutMsgs);
1732 1733  
                   /* The result of ip_output_simple is deliberately ignored. */
1733 1734          (void) ip_output_simple(mp, &ixas);
1734 1735          ixa_cleanup(&ixas);
1735 1736  }
1736 1737  
1737 1738  /*
1738 1739   * Verify the ICMP messages for either for ICMP error or redirect packet.
1739 1740   * The caller should have fully pulled up the message. If it's a redirect
1740 1741   * packet, only basic checks on IP header will be done; otherwise, verify
1741 1742   * the packet by looking at the included ULP header.
1742 1743   *
1743 1744   * Called before icmp_inbound_error_fanout_v4 is called.
1744 1745   */
1745 1746  static boolean_t
1746 1747  icmp_inbound_verify_v4(mblk_t *mp, icmph_t *icmph, ip_recv_attr_t *ira)
1747 1748  {
1748 1749          ill_t           *ill = ira->ira_ill;
1749 1750          int             hdr_length;
1750 1751          ip_stack_t      *ipst = ira->ira_ill->ill_ipst;
1751 1752          conn_t          *connp;
1752 1753          ipha_t          *ipha;  /* Inner IP header */
1753 1754  
1754 1755          ipha = (ipha_t *)&icmph[1];
1755 1756          if ((uchar_t *)ipha + IP_SIMPLE_HDR_LENGTH > mp->b_wptr)
1756 1757                  goto truncated;
1757 1758  
1758 1759          hdr_length = IPH_HDR_LENGTH(ipha);
1759 1760  
1760 1761          if ((IPH_HDR_VERSION(ipha) != IPV4_VERSION))
1761 1762                  goto discard_pkt;
1762 1763  
1763 1764          if (hdr_length < sizeof (ipha_t))
1764 1765                  goto truncated;
1765 1766  
1766 1767          if ((uchar_t *)ipha + hdr_length > mp->b_wptr)
1767 1768                  goto truncated;
1768 1769  
1769 1770          /*
1770 1771           * Stop here for ICMP_REDIRECT.
1771 1772           */
1772 1773          if (icmph->icmph_type == ICMP_REDIRECT)
1773 1774                  return (B_TRUE);
1774 1775  
1775 1776          /*
1776 1777           * ICMP errors only.
1777 1778           */
1778 1779          switch (ipha->ipha_protocol) {
1779 1780          case IPPROTO_UDP:
1780 1781                  /*
1781 1782                   * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
1782 1783                   * transport header.
1783 1784                   */
1784 1785                  if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
1785 1786                      mp->b_wptr)
1786 1787                          goto truncated;
1787 1788                  break;
1788 1789          case IPPROTO_TCP: {
1789 1790                  tcpha_t         *tcpha;
1790 1791  
1791 1792                  /*
1792 1793                   * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
1793 1794                   * transport header.
1794 1795                   */
1795 1796                  if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
1796 1797                      mp->b_wptr)
1797 1798                          goto truncated;
1798 1799  
1799 1800                  tcpha = (tcpha_t *)((uchar_t *)ipha + hdr_length);
1800 1801                  connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcpha, TCPS_LISTEN,
1801 1802                      ipst);
1802 1803                  if (connp == NULL)
1803 1804                          goto discard_pkt;
1804 1805  
1805 1806                  if ((connp->conn_verifyicmp != NULL) &&
1806 1807                      !connp->conn_verifyicmp(connp, tcpha, icmph, NULL, ira)) {
1807 1808                          CONN_DEC_REF(connp);
1808 1809                          goto discard_pkt;
1809 1810                  }
1810 1811                  CONN_DEC_REF(connp);
1811 1812                  break;
1812 1813          }
1813 1814          case IPPROTO_SCTP:
1814 1815                  /*
1815 1816                   * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
1816 1817                   * transport header.
1817 1818                   */
1818 1819                  if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
1819 1820                      mp->b_wptr)
1820 1821                          goto truncated;
1821 1822                  break;
1822 1823          case IPPROTO_ESP:
1823 1824          case IPPROTO_AH:
1824 1825                  break;
1825 1826          case IPPROTO_ENCAP:
1826 1827                  if ((uchar_t *)ipha + hdr_length + sizeof (ipha_t) >
1827 1828                      mp->b_wptr)
1828 1829                          goto truncated;
1829 1830                  break;
1830 1831          default:
1831 1832                  break;
1832 1833          }
1833 1834  
1834 1835          return (B_TRUE);
1835 1836  
1836 1837  discard_pkt:
1837 1838          /* Bogus ICMP error. */
1838 1839          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1839 1840          return (B_FALSE);
1840 1841  
1841 1842  truncated:
1842 1843          /* We pulled up everthing already. Must be truncated */
1843 1844          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
1844 1845          ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
1845 1846          return (B_FALSE);
1846 1847  }
1847 1848  
/*
 * Path MTU "plateau" values from RFC 1191, listed in strictly descending
 * order. icmp_inbound_too_big_v4 scans this table from the top and stops at
 * the first entry smaller than the length of the packet that was too big.
 * The table is only ever read, so it is const.
 */
static const int icmp_frag_size_table[] =
{ 32000, 17914, 8166, 4352, 2002, 1496, 1006, 508, 296, 68 };
1851 1852  
/*
 * Process a received ICMP "fragmentation needed" (packet too big) error.
 *
 * This only handles the DCE create/update, including using the RFC 1191
 * plateau table (icmp_frag_size_table) to guess a PMTU when the one in the
 * message is unusable. The caller is responsible for validating the packet
 * before passing it in and also for fanning the ICMP error out to any
 * matching transport conns. Assumes the message has been fully pulled up
 * and verified.
 *
 * Before getting here, the caller has called icmp_inbound_verify_v4()
 * that should have verified with ULP to prevent undoing the changes we're
 * going to make to DCE. For example, TCP might have verified that the packet
 * which generated error is in the send window.
 *
 * icmph - ICMP header of the error; the IP header of the offending packet
 *         follows it directly. On return icmph_du_zero is cleared and
 *         icmph_du_mtu holds the PMTU now recorded in the DCE, which may
 *         differ from the value received. The caller should pass the
 *         message to the matching ULP after this returns.
 * ira   - receive attributes; ira_ill must be non-NULL.
 *
 * Returns nothing. If no DCE can be found or created, the function returns
 * without touching the ICMP header.
 */
static void
icmp_inbound_too_big_v4(icmph_t *icmph, ip_recv_attr_t *ira)
{
        dce_t           *dce;
        int             old_mtu;
        int             mtu, orig_mtu;
        ipaddr_t        dst;
        boolean_t       disable_pmtud;
        ill_t           *ill = ira->ira_ill;
        ip_stack_t      *ipst = ill->ill_ipst;
        uint_t          hdr_length;
        ipha_t          *ipha;

        /* Caller already pulled up everything. */
        ipha = (ipha_t *)&icmph[1];
        ASSERT(icmph->icmph_type == ICMP_DEST_UNREACHABLE &&
            icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED);
        ASSERT(ill != NULL);

        hdr_length = IPH_HDR_LENGTH(ipha);

        /*
         * We handle path MTU for source routed packets since the DCE
         * is looked up using the final destination.
         */
        dst = ip_get_dst(ipha);

        dce = dce_lookup_and_add_v4(dst, ipst);
        if (dce == NULL) {
                /* Couldn't add a unique one - ENOMEM */
                ip1dbg(("icmp_inbound_too_big_v4: no dce for 0x%x\n",
                    ntohl(dst)));
                return;
        }

        /* Check for MTU discovery advice as described in RFC 1191 */
        mtu = ntohs(icmph->icmph_du_mtu);
        orig_mtu = mtu;         /* kept only for the DTrace probe below */
        disable_pmtud = B_FALSE;

        mutex_enter(&dce->dce_lock);
        /* Baseline: the PMTU already recorded, else the interface MTU. */
        if (dce->dce_flags & DCEF_PMTU)
                old_mtu = dce->dce_pmtu;
        else
                old_mtu = ill->ill_mtu;

        /*
         * The MTU in the message is unusable if the field preceding it is
         * non-zero or the value is below the configured minimum; in that
         * case guess a value from the plateau table instead.
         */
        if (icmph->icmph_du_zero != 0 || mtu < ipst->ips_ip_pmtu_min) {
                uint32_t length;
                int     i;

                /*
                 * Use the table from RFC 1191 to figure out
                 * the next "plateau" based on the length in
                 * the original IP packet.
                 */
                length = ntohs(ipha->ipha_length);
                DTRACE_PROBE2(ip4__pmtu__guess, dce_t *, dce,
                    uint32_t, length);
                if (old_mtu <= length &&
                    old_mtu >= length - hdr_length) {
                        /*
                         * Handle broken BSD 4.2 systems that
                         * return the wrong ipha_length in ICMP
                         * errors.
                         */
                        ip1dbg(("Wrong mtu: sent %d, dce %d\n",
                            length, old_mtu));
                        length -= hdr_length;
                }
                /* Find the first (largest) plateau below the length. */
                for (i = 0; i < A_CNT(icmp_frag_size_table); i++) {
                        if (length > icmp_frag_size_table[i])
                                break;
                }
                if (i == A_CNT(icmp_frag_size_table)) {
                        /* Smaller than IP_MIN_MTU! */
                        ip1dbg(("Too big for packet size %d\n",
                            length));
                        disable_pmtud = B_TRUE;
                        mtu = ipst->ips_ip_pmtu_min;
                } else {
                        mtu = icmp_frag_size_table[i];
                        ip1dbg(("Calculated mtu %d, packet size %d, "
                            "before %d\n", mtu, length, old_mtu));
                        /* Clamp the guess to the configured minimum. */
                        if (mtu < ipst->ips_ip_pmtu_min) {
                                mtu = ipst->ips_ip_pmtu_min;
                                disable_pmtud = B_TRUE;
                        }
                }
        }
        if (disable_pmtud)
                dce->dce_flags |= DCEF_TOO_SMALL_PMTU;
        else
                dce->dce_flags &= ~DCEF_TOO_SMALL_PMTU;

        /* The recorded PMTU is never raised above the baseline here. */
        dce->dce_pmtu = MIN(old_mtu, mtu);
        /* Prepare to send the new max frag size for the ULP. */
        icmph->icmph_du_zero = 0;
        icmph->icmph_du_mtu =  htons((uint16_t)dce->dce_pmtu);
        DTRACE_PROBE4(ip4__pmtu__change, icmph_t *, icmph, dce_t *,
            dce, int, orig_mtu, int, mtu);

        /* We now have a PMTU for sure */
        dce->dce_flags |= DCEF_PMTU;
        dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
        mutex_exit(&dce->dce_lock);
        /*
         * After dropping the lock the new value is visible to everyone.
         * Then we bump the generation number so any cached values reinspect
         * the dce_t.
         */
        dce_increment_generation(dce);
        dce_refrele(dce);
}
1980 1981  
1981 1982  /*
1982 1983   * If the packet in error is Self-Encapsulated, icmp_inbound_error_fanout_v4
1983 1984   * calls this function.
1984 1985   */
1985 1986  static mblk_t *
1986 1987  icmp_inbound_self_encap_error_v4(mblk_t *mp, ipha_t *ipha, ipha_t *in_ipha)
1987 1988  {
1988 1989          int length;
1989 1990  
1990 1991          ASSERT(mp->b_datap->db_type == M_DATA);
1991 1992  
1992 1993          /* icmp_inbound_v4 has already pulled up the whole error packet */
1993 1994          ASSERT(mp->b_cont == NULL);
1994 1995  
1995 1996          /*
1996 1997           * The length that we want to overlay is the inner header
1997 1998           * and what follows it.
1998 1999           */
1999 2000          length = msgdsize(mp) - ((uchar_t *)in_ipha - mp->b_rptr);
2000 2001  
2001 2002          /*
2002 2003           * Overlay the inner header and whatever follows it over the
2003 2004           * outer header.
2004 2005           */
2005 2006          bcopy((uchar_t *)in_ipha, (uchar_t *)ipha, length);
2006 2007  
2007 2008          /* Adjust for what we removed */
2008 2009          mp->b_wptr -= (uchar_t *)in_ipha - (uchar_t *)ipha;
2009 2010          return (mp);
2010 2011  }
2011 2012  
/*
 * Try to pass the ICMP message upstream in case the ULP cares.
 *
 * mp    - the fully pulled-up ICMP error message (b_cont must be NULL)
 * icmph - the ICMP header inside mp; the IP header of the packet that
 *         caused the error follows it directly
 * ira   - receive attributes for mp
 *
 * If the packet that caused the ICMP error is secure, we send
 * it to AH/ESP to make sure that the attached packet has a
 * valid association. ipha in the code below points to the
 * IP header of the packet that caused the error.
 *
 * For IPsec cases, we let the next-layer-up (which has access to
 * cached policy on the conn_t, or can query the SPD directly)
 * subtract out any IPsec overhead if they must.  We therefore make no
 * adjustments here for IPsec overhead.
 *
 * IFN could have been generated locally or by some router.
 *
 * LOCAL : ire_send_wire (before calling ipsec_out_process) can call
 * icmp_frag_needed/icmp_pkt2big_v6 to generate a local IFN.
 *          This happens because IP adjusted its value of MTU on an
 *          earlier IFN message and could not tell the upper layer,
 *          the new adjusted value of MTU e.g. Packet was encrypted
 *          or there was not enough information to fanout to upper
 *          layers. Thus on the next outbound datagram, ire_send_wire
 *          generates the IFN, where IPsec processing has *not* been
 *          done.
 *
 *          Note that we retain ixa_fragsize across IPsec thus once
 *          we have picked ixa_fragsize and entered ipsec_out_process we do
 *          not change the fragsize even if the path MTU changes before
 *          we reach ip_output_post_ipsec.
 *
 *          In the local case, IRAF_LOOPBACK will be set indicating
 *          that IFN was generated locally.
 *
 * ROUTER : IFN could be secure or non-secure.
 *
 *          * SECURE : We use the IPSEC_IN to fanout to AH/ESP if the
 *            packet in error has AH/ESP headers to validate the AH/ESP
 *            headers. AH/ESP will verify whether there is a valid SA or
 *            not and send it back. We will fanout again if we have more
 *            data in the packet.
 *
 *            If the packet in error does not have AH/ESP, we handle it
 *            like any other case.
 *
 *          * NON_SECURE : If the packet in error has AH/ESP headers, we send it
 *            up to AH/ESP for validation. AH/ESP will verify whether there is a
 *            valid SA or not and send it back. We will fanout again if
 *            we have more data in the packet.
 *
 *            If the packet in error does not have AH/ESP, we handle it
 *            like any other case.
 *
 * The caller must have called icmp_inbound_verify_v4.
 *
 * Every path in this function either hands mp to another routine or frees
 * it; nothing is returned to the caller.
 */
static void
icmp_inbound_error_fanout_v4(mblk_t *mp, icmph_t *icmph, ip_recv_attr_t *ira)
{
        uint16_t        *up;    /* Pointer to ports in ULP header */
        uint32_t        ports;  /* reversed ports for fanout */
        ipha_t          ripha;  /* With reversed addresses */
        ipha_t          *ipha;  /* Inner IP header */
        uint_t          hdr_length; /* Inner IP header length */
        tcpha_t         *tcpha;
        conn_t          *connp;
        ill_t           *ill = ira->ira_ill;
        ip_stack_t      *ipst = ill->ill_ipst;
        ipsec_stack_t   *ipss = ipst->ips_netstack->netstack_ipsec;
        ill_t           *rill = ira->ira_rill;

        /* Caller already pulled up everything. */
        ipha = (ipha_t *)&icmph[1];
        ASSERT((uchar_t *)&ipha[1] <= mp->b_wptr);
        ASSERT(mp->b_cont == NULL);

        hdr_length = IPH_HDR_LENGTH(ipha);
        /* From here on ira describes the protocol of the packet in error. */
        ira->ira_protocol = ipha->ipha_protocol;

        /*
         * We need a separate IP header with the source and destination
         * addresses reversed to do fanout/classification because the ipha in
         * the ICMP error is in the form we sent it out.
         * Only the four fields set below are initialized in ripha.
         */
        ripha.ipha_src = ipha->ipha_dst;
        ripha.ipha_dst = ipha->ipha_src;
        ripha.ipha_protocol = ipha->ipha_protocol;
        ripha.ipha_version_and_hdr_length = ipha->ipha_version_and_hdr_length;

        ip2dbg(("icmp_inbound_error_v4: proto %d %x to %x: %d/%d\n",
            ripha.ipha_protocol, ntohl(ipha->ipha_src),
            ntohl(ipha->ipha_dst),
            icmph->icmph_type, icmph->icmph_code));

        /*
         * IRAF_ICMP_ERROR is set in ira_flags only for the duration of each
         * hand-off below and cleared again afterwards.
         */
        switch (ipha->ipha_protocol) {
        case IPPROTO_UDP:
                up = (uint16_t *)((uchar_t *)ipha + hdr_length);

                /* Attempt to find a client stream based on port. */
                ip2dbg(("icmp_inbound_error_v4: UDP ports %d to %d\n",
                    ntohs(up[0]), ntohs(up[1])));

                /* Note that we send error to all matches. */
                ira->ira_flags |= IRAF_ICMP_ERROR;
                ip_fanout_udp_multi_v4(mp, &ripha, up[0], up[1], ira);
                ira->ira_flags &= ~IRAF_ICMP_ERROR;
                return;

        case IPPROTO_TCP:
                /*
                 * Find a TCP client stream for this packet.
                 * Note that we do a reverse lookup since the header is
                 * in the form we sent it out.
                 */
                tcpha = (tcpha_t *)((uchar_t *)ipha + hdr_length);
                connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcpha, TCPS_LISTEN,
                    ipst);
                if (connp == NULL)
                        goto discard_pkt;

                /* Apply inbound IPsec policy; a NULL result means dropped. */
                if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) ||
                    (ira->ira_flags & IRAF_IPSEC_SECURE)) {
                        mp = ipsec_check_inbound_policy(mp, connp,
                            ipha, NULL, ira);
                        if (mp == NULL) {
                                BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
                                /* Note that mp is NULL */
                                ip_drop_input("ipIfStatsInDiscards", mp, ill);
                                CONN_DEC_REF(connp);
                                return;
                        }
                }

                ira->ira_flags |= IRAF_ICMP_ERROR;
                /*
                 * The ill pointers are cleared for the hand-off and put
                 * back afterwards from the saved ill/rill locals.
                 * NOTE(review): presumably so the receiver cannot keep
                 * using ill pointers owned by this thread - confirm.
                 */
                ira->ira_ill = ira->ira_rill = NULL;
                if (IPCL_IS_TCP(connp)) {
                        /*
                         * NOTE(review): no CONN_DEC_REF on this path;
                         * presumably the squeue path consumes the conn
                         * reference - confirm.
                         */
                        SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
                            connp->conn_recvicmp, connp, ira, SQ_FILL,
                            SQTAG_TCP_INPUT_ICMP_ERR);
                } else {
                        /* Not TCP; must be SOCK_RAW, IPPROTO_TCP */
                        (connp->conn_recv)(connp, mp, NULL, ira);
                        CONN_DEC_REF(connp);
                }
                ira->ira_ill = ill;
                ira->ira_rill = rill;
                ira->ira_flags &= ~IRAF_ICMP_ERROR;
                return;

        case IPPROTO_SCTP:
                up = (uint16_t *)((uchar_t *)ipha + hdr_length);
                /* Find a SCTP client stream for this packet. */
                /* Swap the two 16-bit ports to build the reversed pair. */
                ((uint16_t *)&ports)[0] = up[1];
                ((uint16_t *)&ports)[1] = up[0];

                ira->ira_flags |= IRAF_ICMP_ERROR;
                ip_fanout_sctp(mp, &ripha, NULL, ports, ira);
                ira->ira_flags &= ~IRAF_ICMP_ERROR;
                return;

        case IPPROTO_ESP:
        case IPPROTO_AH:
                if (!ipsec_loaded(ipss)) {
                        ip_proto_not_sup(mp, ira);
                        return;
                }

                /* Let ESP/AH process the error; NULL means mp is gone. */
                if (ipha->ipha_protocol == IPPROTO_ESP)
                        mp = ipsecesp_icmp_error(mp, ira);
                else
                        mp = ipsecah_icmp_error(mp, ira);
                if (mp == NULL)
                        return;

                /* Just in case ipsec didn't preserve the NULL b_cont */
                if (mp->b_cont != NULL) {
                        if (!pullupmsg(mp, -1))
                                goto discard_pkt;
                }

                /*
                 * Note that ira_pktlen and ira_ip_hdr_length are no longer
                 * correct, but we don't use them any more here.
                 *
                 * If successful, the mp has been modified to not include
                 * the ESP/AH header so we can fanout to the ULP's icmp
                 * error handler.
                 */
                if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH)
                        goto truncated;

                /* Verify the modified message before any further processes. */
                ipha = (ipha_t *)mp->b_rptr;
                hdr_length = IPH_HDR_LENGTH(ipha);
                icmph = (icmph_t *)&mp->b_rptr[hdr_length];
                if (!icmp_inbound_verify_v4(mp, icmph, ira)) {
                        freemsg(mp);
                        return;
                }

                /* Fan out again now that the ESP/AH header is gone. */
                icmp_inbound_error_fanout_v4(mp, icmph, ira);
                return;

        case IPPROTO_ENCAP: {
                /* Look for self-encapsulated packets that caused an error */
                ipha_t *in_ipha;

                /*
                 * Caller has verified that length has to be
                 * at least the size of IP header.
                 */
                ASSERT(hdr_length >= sizeof (ipha_t));
                /*
                 * Check the sanity of the inner IP header like
                 * we did for the outer header.
                 */
                in_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length);
                if ((IPH_HDR_VERSION(in_ipha) != IPV4_VERSION)) {
                        goto discard_pkt;
                }
                if (IPH_HDR_LENGTH(in_ipha) < sizeof (ipha_t)) {
                        goto discard_pkt;
                }
                /* Check for Self-encapsulated tunnels */
                if (in_ipha->ipha_src == ipha->ipha_src &&
                    in_ipha->ipha_dst == ipha->ipha_dst) {

                        /* Strip the outer of the two embedded headers. */
                        mp = icmp_inbound_self_encap_error_v4(mp, ipha,
                            in_ipha);
                        if (mp == NULL)
                                goto discard_pkt;

                        /*
                         * Just in case self_encap didn't preserve the NULL
                         * b_cont
                         */
                        if (mp->b_cont != NULL) {
                                if (!pullupmsg(mp, -1))
                                        goto discard_pkt;
                        }
                        /*
                         * Note that ira_pktlen and ira_ip_hdr_length are no
                         * longer correct, but we don't use them any more here.
                         */
                        if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH)
                                goto truncated;

                        /*
                         * Verify the modified message before any further
                         * processes.
                         */
                        ipha = (ipha_t *)mp->b_rptr;
                        hdr_length = IPH_HDR_LENGTH(ipha);
                        icmph = (icmph_t *)&mp->b_rptr[hdr_length];
                        if (!icmp_inbound_verify_v4(mp, icmph, ira)) {
                                freemsg(mp);
                                return;
                        }

                        /*
                         * The packet in error is self-encapsulated.
                         * And we are finding it further encapsulated
                         * which we could not have possibly generated.
                         */
                        if (ipha->ipha_protocol == IPPROTO_ENCAP) {
                                goto discard_pkt;
                        }
                        /* Fan out again on the de-encapsulated message. */
                        icmp_inbound_error_fanout_v4(mp, icmph, ira);
                        return;
                }
                /* No self-encapsulated */
        }
        /* FALLTHROUGH */
        case IPPROTO_IPV6:
                /* Offer the error to a matching IP tunnel, if there is one. */
                if ((connp = ipcl_iptun_classify_v4(&ripha.ipha_src,
                    &ripha.ipha_dst, ipst)) != NULL) {
                        ira->ira_flags |= IRAF_ICMP_ERROR;
                        connp->conn_recvicmp(connp, mp, NULL, ira);
                        CONN_DEC_REF(connp);
                        ira->ira_flags &= ~IRAF_ICMP_ERROR;
                        return;
                }
                /*
                 * No IP tunnel is interested, fallthrough and see
                 * if a raw socket will want it.
                 */
                /* FALLTHROUGH */
        default:
                ira->ira_flags |= IRAF_ICMP_ERROR;
                ip_fanout_proto_v4(mp, &ripha, ira);
                ira->ira_flags &= ~IRAF_ICMP_ERROR;
                return;
        }
        /* NOTREACHED */
discard_pkt:
        BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
        ip1dbg(("icmp_inbound_error_fanout_v4: drop pkt\n"));
        ip_drop_input("ipIfStatsInDiscards", mp, ill);
        freemsg(mp);
        return;

truncated:
        /* We pulled up everything already. Must be truncated */
        BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
        ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
        freemsg(mp);
}
2317 2318  
2318 2319  /*
2319 2320   * Common IP options parser.
2320 2321   *
2321 2322   * Setup routine: fill in *optp with options-parsing state, then
2322 2323   * tail-call ipoptp_next to return the first option.
2323 2324   */
2324 2325  uint8_t
2325 2326  ipoptp_first(ipoptp_t *optp, ipha_t *ipha)
2326 2327  {
2327 2328          uint32_t totallen; /* total length of all options */
2328 2329  
2329 2330          totallen = ipha->ipha_version_and_hdr_length -
2330 2331              (uint8_t)((IP_VERSION << 4) + IP_SIMPLE_HDR_LENGTH_IN_WORDS);
2331 2332          totallen <<= 2;
2332 2333          optp->ipoptp_next = (uint8_t *)(&ipha[1]);
2333 2334          optp->ipoptp_end = optp->ipoptp_next + totallen;
2334 2335          optp->ipoptp_flags = 0;
2335 2336          return (ipoptp_next(optp));
2336 2337  }
2337 2338  
2338 2339  /* Like above but without an ipha_t */
2339 2340  uint8_t
2340 2341  ipoptp_first2(ipoptp_t *optp, uint32_t totallen, uint8_t *opt)
2341 2342  {
2342 2343          optp->ipoptp_next = opt;
2343 2344          optp->ipoptp_end = optp->ipoptp_next + totallen;
2344 2345          optp->ipoptp_flags = 0;
2345 2346          return (ipoptp_next(optp));
2346 2347  }
2347 2348  
2348 2349  /*
2349 2350   * Common IP options parser: extract next option.
2350 2351   */
2351 2352  uint8_t
2352 2353  ipoptp_next(ipoptp_t *optp)
2353 2354  {
2354 2355          uint8_t *end = optp->ipoptp_end;
2355 2356          uint8_t *cur = optp->ipoptp_next;
2356 2357          uint8_t opt, len, pointer;
2357 2358  
2358 2359          /*
2359 2360           * If cur > end already, then the ipoptp_end or ipoptp_next pointer
2360 2361           * has been corrupted.
2361 2362           */
2362 2363          ASSERT(cur <= end);
2363 2364  
2364 2365          if (cur == end)
2365 2366                  return (IPOPT_EOL);
2366 2367  
2367 2368          opt = cur[IPOPT_OPTVAL];
2368 2369  
2369 2370          /*
2370 2371           * Skip any NOP options.
2371 2372           */
2372 2373          while (opt == IPOPT_NOP) {
2373 2374                  cur++;
2374 2375                  if (cur == end)
2375 2376                          return (IPOPT_EOL);
2376 2377                  opt = cur[IPOPT_OPTVAL];
2377 2378          }
2378 2379  
2379 2380          if (opt == IPOPT_EOL)
2380 2381                  return (IPOPT_EOL);
2381 2382  
2382 2383          /*
2383 2384           * Option requiring a length.
2384 2385           */
2385 2386          if ((cur + 1) >= end) {
2386 2387                  optp->ipoptp_flags |= IPOPTP_ERROR;
2387 2388                  return (IPOPT_EOL);
2388 2389          }
2389 2390          len = cur[IPOPT_OLEN];
2390 2391          if (len < 2) {
2391 2392                  optp->ipoptp_flags |= IPOPTP_ERROR;
2392 2393                  return (IPOPT_EOL);
2393 2394          }
2394 2395          optp->ipoptp_cur = cur;
2395 2396          optp->ipoptp_len = len;
2396 2397          optp->ipoptp_next = cur + len;
2397 2398          if (cur + len > end) {
2398 2399                  optp->ipoptp_flags |= IPOPTP_ERROR;
2399 2400                  return (IPOPT_EOL);
2400 2401          }
2401 2402  
2402 2403          /*
2403 2404           * For the options which require a pointer field, make sure
2404 2405           * its there, and make sure it points to either something
2405 2406           * inside this option, or the end of the option.
2406 2407           */
2407 2408          pointer = IPOPT_EOL;
2408 2409          switch (opt) {
2409 2410          case IPOPT_RR:
2410 2411          case IPOPT_TS:
2411 2412          case IPOPT_LSRR:
2412 2413          case IPOPT_SSRR:
2413 2414                  if (len <= IPOPT_OFFSET) {
2414 2415                          optp->ipoptp_flags |= IPOPTP_ERROR;
2415 2416                          return (opt);
2416 2417                  }
2417 2418                  pointer = cur[IPOPT_OFFSET];
2418 2419                  if (pointer - 1 > len) {
2419 2420                          optp->ipoptp_flags |= IPOPTP_ERROR;
2420 2421                          return (opt);
2421 2422                  }
2422 2423                  break;
2423 2424          }
2424 2425  
2425 2426          /*
2426 2427           * Sanity check the pointer field based on the type of the
2427 2428           * option.
2428 2429           */
2429 2430          switch (opt) {
2430 2431          case IPOPT_RR:
2431 2432          case IPOPT_SSRR:
2432 2433          case IPOPT_LSRR:
2433 2434                  if (pointer < IPOPT_MINOFF_SR)
2434 2435                          optp->ipoptp_flags |= IPOPTP_ERROR;
2435 2436                  break;
2436 2437          case IPOPT_TS:
2437 2438                  if (pointer < IPOPT_MINOFF_IT)
2438 2439                          optp->ipoptp_flags |= IPOPTP_ERROR;
2439 2440                  /*
2440 2441                   * Note that the Internet Timestamp option also
2441 2442                   * contains two four bit fields (the Overflow field,
2442 2443                   * and the Flag field), which follow the pointer
2443 2444                   * field.  We don't need to check that these fields
2444 2445                   * fall within the length of the option because this
2445 2446                   * was implicitely done above.  We've checked that the
2446 2447                   * pointer value is at least IPOPT_MINOFF_IT, and that
2447 2448                   * it falls within the option.  Since IPOPT_MINOFF_IT >
2448 2449                   * IPOPT_POS_OV_FLG, we don't need the explicit check.
2449 2450                   */
2450 2451                  ASSERT(len > IPOPT_POS_OV_FLG);
2451 2452                  break;
2452 2453          }
2453 2454  
2454 2455          return (opt);
2455 2456  }
2456 2457  
2457 2458  /*
2458 2459   * Use the outgoing IP header to create an IP_OPTIONS option the way
2459 2460   * it was passed down from the application.
2460 2461   *
2461 2462   * This is compatible with BSD in that it returns
2462 2463   * the reverse source route with the final destination
2463 2464   * as the last entry. The first 4 bytes of the option
2464 2465   * will contain the final destination.
2465 2466   */
2466 2467  int
2467 2468  ip_opt_get_user(conn_t *connp, uchar_t *buf)
2468 2469  {
2469 2470          ipoptp_t        opts;
2470 2471          uchar_t         *opt;
2471 2472          uint8_t         optval;
2472 2473          uint8_t         optlen;
2473 2474          uint32_t        len = 0;
2474 2475          uchar_t         *buf1 = buf;
2475 2476          uint32_t        totallen;
2476 2477          ipaddr_t        dst;
2477 2478          ip_pkt_t        *ipp = &connp->conn_xmit_ipp;
2478 2479  
2479 2480          if (!(ipp->ipp_fields & IPPF_IPV4_OPTIONS))
2480 2481                  return (0);
2481 2482  
2482 2483          totallen = ipp->ipp_ipv4_options_len;
2483 2484          if (totallen & 0x3)
2484 2485                  return (0);
2485 2486  
2486 2487          buf += IP_ADDR_LEN;     /* Leave room for final destination */
2487 2488          len += IP_ADDR_LEN;
2488 2489          bzero(buf1, IP_ADDR_LEN);
2489 2490  
2490 2491          dst = connp->conn_faddr_v4;
2491 2492  
2492 2493          for (optval = ipoptp_first2(&opts, totallen, ipp->ipp_ipv4_options);
2493 2494              optval != IPOPT_EOL;
2494 2495              optval = ipoptp_next(&opts)) {
2495 2496                  int     off;
2496 2497  
2497 2498                  opt = opts.ipoptp_cur;
2498 2499                  if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
2499 2500                          break;
2500 2501                  }
2501 2502                  optlen = opts.ipoptp_len;
2502 2503  
2503 2504                  switch (optval) {
2504 2505                  case IPOPT_SSRR:
2505 2506                  case IPOPT_LSRR:
2506 2507  
2507 2508                          /*
2508 2509                           * Insert destination as the first entry in the source
2509 2510                           * route and move down the entries on step.
2510 2511                           * The last entry gets placed at buf1.
2511 2512                           */
2512 2513                          buf[IPOPT_OPTVAL] = optval;
2513 2514                          buf[IPOPT_OLEN] = optlen;
2514 2515                          buf[IPOPT_OFFSET] = optlen;
2515 2516  
2516 2517                          off = optlen - IP_ADDR_LEN;
2517 2518                          if (off < 0) {
2518 2519                                  /* No entries in source route */
2519 2520                                  break;
2520 2521                          }
2521 2522                          /* Last entry in source route if not already set */
2522 2523                          if (dst == INADDR_ANY)
2523 2524                                  bcopy(opt + off, buf1, IP_ADDR_LEN);
2524 2525                          off -= IP_ADDR_LEN;
2525 2526  
2526 2527                          while (off > 0) {
2527 2528                                  bcopy(opt + off,
2528 2529                                      buf + off + IP_ADDR_LEN,
2529 2530                                      IP_ADDR_LEN);
2530 2531                                  off -= IP_ADDR_LEN;
2531 2532                          }
2532 2533                          /* ipha_dst into first slot */
2533 2534                          bcopy(&dst, buf + off + IP_ADDR_LEN,
2534 2535                              IP_ADDR_LEN);
2535 2536                          buf += optlen;
2536 2537                          len += optlen;
2537 2538                          break;
2538 2539  
2539 2540                  default:
2540 2541                          bcopy(opt, buf, optlen);
2541 2542                          buf += optlen;
2542 2543                          len += optlen;
2543 2544                          break;
2544 2545                  }
2545 2546          }
2546 2547  done:
2547 2548          /* Pad the resulting options */
2548 2549          while (len & 0x3) {
2549 2550                  *buf++ = IPOPT_EOL;
2550 2551                  len++;
2551 2552          }
2552 2553          return (len);
2553 2554  }
2554 2555  
2555 2556  /*
2556 2557   * Update any record route or timestamp options to include this host.
2557 2558   * Reverse any source route option.
2558 2559   * This routine assumes that the options are well formed i.e. that they
2559 2560   * have already been checked.
2560 2561   */
2561 2562  static void
2562 2563  icmp_options_update(ipha_t *ipha)
2563 2564  {
2564 2565          ipoptp_t        opts;
2565 2566          uchar_t         *opt;
2566 2567          uint8_t         optval;
2567 2568          ipaddr_t        src;            /* Our local address */
2568 2569          ipaddr_t        dst;
2569 2570  
2570 2571          ip2dbg(("icmp_options_update\n"));
2571 2572          src = ipha->ipha_src;
2572 2573          dst = ipha->ipha_dst;
2573 2574  
2574 2575          for (optval = ipoptp_first(&opts, ipha);
2575 2576              optval != IPOPT_EOL;
2576 2577              optval = ipoptp_next(&opts)) {
2577 2578                  ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
2578 2579                  opt = opts.ipoptp_cur;
2579 2580                  ip2dbg(("icmp_options_update: opt %d, len %d\n",
2580 2581                      optval, opts.ipoptp_len));
2581 2582                  switch (optval) {
2582 2583                          int off1, off2;
2583 2584                  case IPOPT_SSRR:
2584 2585                  case IPOPT_LSRR:
2585 2586                          /*
2586 2587                           * Reverse the source route.  The first entry
2587 2588                           * should be the next to last one in the current
2588 2589                           * source route (the last entry is our address).
2589 2590                           * The last entry should be the final destination.
2590 2591                           */
2591 2592                          off1 = IPOPT_MINOFF_SR - 1;
2592 2593                          off2 = opt[IPOPT_OFFSET] - IP_ADDR_LEN - 1;
2593 2594                          if (off2 < 0) {
2594 2595                                  /* No entries in source route */
2595 2596                                  ip1dbg((
2596 2597                                      "icmp_options_update: bad src route\n"));
2597 2598                                  break;
2598 2599                          }
2599 2600                          bcopy((char *)opt + off2, &dst, IP_ADDR_LEN);
2600 2601                          bcopy(&ipha->ipha_dst, (char *)opt + off2, IP_ADDR_LEN);
2601 2602                          bcopy(&dst, &ipha->ipha_dst, IP_ADDR_LEN);
2602 2603                          off2 -= IP_ADDR_LEN;
2603 2604  
2604 2605                          while (off1 < off2) {
2605 2606                                  bcopy((char *)opt + off1, &src, IP_ADDR_LEN);
2606 2607                                  bcopy((char *)opt + off2, (char *)opt + off1,
2607 2608                                      IP_ADDR_LEN);
2608 2609                                  bcopy(&src, (char *)opt + off2, IP_ADDR_LEN);
2609 2610                                  off1 += IP_ADDR_LEN;
2610 2611                                  off2 -= IP_ADDR_LEN;
2611 2612                          }
2612 2613                          opt[IPOPT_OFFSET] = IPOPT_MINOFF_SR;
2613 2614                          break;
2614 2615                  }
2615 2616          }
2616 2617  }
2617 2618  
2618 2619  /*
2619 2620   * Process received ICMP Redirect messages.
2620 2621   * Assumes the caller has verified that the headers are in the pulled up mblk.
2621 2622   * Consumes mp.
2622 2623   */
2623 2624  static void
2624 2625  icmp_redirect_v4(mblk_t *mp, ipha_t *ipha, icmph_t *icmph, ip_recv_attr_t *ira)
2625 2626  {
2626 2627          ire_t           *ire, *nire;
2627 2628          ire_t           *prev_ire;
2628 2629          ipaddr_t        src, dst, gateway;
2629 2630          ip_stack_t      *ipst = ira->ira_ill->ill_ipst;
2630 2631          ipha_t          *inner_ipha;    /* Inner IP header */
2631 2632  
2632 2633          /* Caller already pulled up everything. */
2633 2634          inner_ipha = (ipha_t *)&icmph[1];
2634 2635          src = ipha->ipha_src;
2635 2636          dst = inner_ipha->ipha_dst;
2636 2637          gateway = icmph->icmph_rd_gateway;
2637 2638          /* Make sure the new gateway is reachable somehow. */
2638 2639          ire = ire_ftable_lookup_v4(gateway, 0, 0, IRE_ONLINK, NULL,
2639 2640              ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst, NULL);
2640 2641          /*
2641 2642           * Make sure we had a route for the dest in question and that
2642 2643           * that route was pointing to the old gateway (the source of the
2643 2644           * redirect packet.)
2644 2645           * We do longest match and then compare ire_gateway_addr below.
2645 2646           */
2646 2647          prev_ire = ire_ftable_lookup_v4(dst, 0, 0, 0, NULL, ALL_ZONES,
2647 2648              NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL);
2648 2649          /*
2649 2650           * Check that
2650 2651           *      the redirect was not from ourselves
2651 2652           *      the new gateway and the old gateway are directly reachable
2652 2653           */
2653 2654          if (prev_ire == NULL || ire == NULL ||
2654 2655              (prev_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) ||
2655 2656              (prev_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
2656 2657              !(ire->ire_type & IRE_IF_ALL) ||
2657 2658              prev_ire->ire_gateway_addr != src) {
2658 2659                  BUMP_MIB(&ipst->ips_icmp_mib, icmpInBadRedirects);
2659 2660                  ip_drop_input("icmpInBadRedirects - ire", mp, ira->ira_ill);
2660 2661                  freemsg(mp);
2661 2662                  if (ire != NULL)
2662 2663                          ire_refrele(ire);
2663 2664                  if (prev_ire != NULL)
2664 2665                          ire_refrele(prev_ire);
2665 2666                  return;
2666 2667          }
2667 2668  
2668 2669          ire_refrele(prev_ire);
2669 2670          ire_refrele(ire);
2670 2671  
2671 2672          /*
2672 2673           * TODO: more precise handling for cases 0, 2, 3, the latter two
2673 2674           * require TOS routing
2674 2675           */
2675 2676          switch (icmph->icmph_code) {
2676 2677          case 0:
2677 2678          case 1:
2678 2679                  /* TODO: TOS specificity for cases 2 and 3 */
2679 2680          case 2:
2680 2681          case 3:
2681 2682                  break;
2682 2683          default:
2683 2684                  BUMP_MIB(&ipst->ips_icmp_mib, icmpInBadRedirects);
2684 2685                  ip_drop_input("icmpInBadRedirects - code", mp, ira->ira_ill);
2685 2686                  freemsg(mp);
2686 2687                  return;
2687 2688          }
2688 2689          /*
2689 2690           * Create a Route Association.  This will allow us to remember that
2690 2691           * someone we believe told us to use the particular gateway.
2691 2692           */
2692 2693          ire = ire_create(
2693 2694              (uchar_t *)&dst,                    /* dest addr */
2694 2695              (uchar_t *)&ip_g_all_ones,          /* mask */
2695 2696              (uchar_t *)&gateway,                /* gateway addr */
2696 2697              IRE_HOST,
2697 2698              NULL,                               /* ill */
2698 2699              ALL_ZONES,
2699 2700              (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST),
2700 2701              NULL,                               /* tsol_gc_t */
2701 2702              ipst);
2702 2703  
2703 2704          if (ire == NULL) {
2704 2705                  freemsg(mp);
2705 2706                  return;
2706 2707          }
2707 2708          nire = ire_add(ire);
2708 2709          /* Check if it was a duplicate entry */
2709 2710          if (nire != NULL && nire != ire) {
2710 2711                  ASSERT(nire->ire_identical_ref > 1);
2711 2712                  ire_delete(nire);
2712 2713                  ire_refrele(nire);
2713 2714                  nire = NULL;
2714 2715          }
2715 2716          ire = nire;
2716 2717          if (ire != NULL) {
2717 2718                  ire_refrele(ire);               /* Held in ire_add */
2718 2719  
2719 2720                  /* tell routing sockets that we received a redirect */
2720 2721                  ip_rts_change(RTM_REDIRECT, dst, gateway, IP_HOST_MASK, 0, src,
2721 2722                      (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST), 0,
2722 2723                      (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_AUTHOR), ipst);
2723 2724          }
2724 2725  
2725 2726          /*
2726 2727           * Delete any existing IRE_HOST type redirect ires for this destination.
2727 2728           * This together with the added IRE has the effect of
2728 2729           * modifying an existing redirect.
2729 2730           */
2730 2731          prev_ire = ire_ftable_lookup_v4(dst, 0, src, IRE_HOST, NULL,
2731 2732              ALL_ZONES, NULL, (MATCH_IRE_GW | MATCH_IRE_TYPE), 0, ipst, NULL);
2732 2733          if (prev_ire != NULL) {
2733 2734                  if (prev_ire ->ire_flags & RTF_DYNAMIC)
2734 2735                          ire_delete(prev_ire);
2735 2736                  ire_refrele(prev_ire);
2736 2737          }
2737 2738  
2738 2739          freemsg(mp);
2739 2740  }
2740 2741  
2741 2742  /*
2742 2743   * Generate an ICMP parameter problem message.
2743 2744   * When called from ip_output side a minimal ip_recv_attr_t needs to be
2744 2745   * constructed by the caller.
2745 2746   */
2746 2747  static void
2747 2748  icmp_param_problem(mblk_t *mp, uint8_t ptr, ip_recv_attr_t *ira)
2748 2749  {
2749 2750          icmph_t icmph;
2750 2751          ip_stack_t      *ipst = ira->ira_ill->ill_ipst;
2751 2752  
2752 2753          mp = icmp_pkt_err_ok(mp, ira);
2753 2754          if (mp == NULL)
2754 2755                  return;
2755 2756  
2756 2757          bzero(&icmph, sizeof (icmph_t));
2757 2758          icmph.icmph_type = ICMP_PARAM_PROBLEM;
2758 2759          icmph.icmph_pp_ptr = ptr;
2759 2760          BUMP_MIB(&ipst->ips_icmp_mib, icmpOutParmProbs);
2760 2761          icmp_pkt(mp, &icmph, sizeof (icmph_t), ira);
2761 2762  }
2762 2763  
2763 2764  /*
2764 2765   * Build and ship an IPv4 ICMP message using the packet data in mp, and
2765 2766   * the ICMP header pointed to by "stuff".  (May be called as writer.)
2766 2767   * Note: assumes that icmp_pkt_err_ok has been called to verify that
2767 2768   * an icmp error packet can be sent.
2768 2769   * Assigns an appropriate source address to the packet. If ipha_dst is
2769 2770   * one of our addresses use it for source. Otherwise let ip_output_simple
2770 2771   * pick the source address.
2771 2772   */
2772 2773  static void
2773 2774  icmp_pkt(mblk_t *mp, void *stuff, size_t len, ip_recv_attr_t *ira)
2774 2775  {
2775 2776          ipaddr_t dst;
2776 2777          icmph_t *icmph;
2777 2778          ipha_t  *ipha;
2778 2779          uint_t  len_needed;
2779 2780          size_t  msg_len;
2780 2781          mblk_t  *mp1;
2781 2782          ipaddr_t src;
2782 2783          ire_t   *ire;
2783 2784          ip_xmit_attr_t ixas;
2784 2785          ip_stack_t *ipst = ira->ira_ill->ill_ipst;
2785 2786  
2786 2787          ipha = (ipha_t *)mp->b_rptr;
2787 2788  
2788 2789          bzero(&ixas, sizeof (ixas));
2789 2790          ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
2790 2791          ixas.ixa_zoneid = ira->ira_zoneid;
2791 2792          ixas.ixa_ifindex = 0;
2792 2793          ixas.ixa_ipst = ipst;
2793 2794          ixas.ixa_cred = kcred;
2794 2795          ixas.ixa_cpid = NOPID;
2795 2796          ixas.ixa_tsl = ira->ira_tsl;    /* Behave as a multi-level responder */
2796 2797          ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
2797 2798  
2798 2799          if (ira->ira_flags & IRAF_IPSEC_SECURE) {
2799 2800                  /*
2800 2801                   * Apply IPsec based on how IPsec was applied to
2801 2802                   * the packet that had the error.
2802 2803                   *
2803 2804                   * If it was an outbound packet that caused the ICMP
2804 2805                   * error, then the caller will have setup the IRA
2805 2806                   * appropriately.
2806 2807                   */
2807 2808                  if (!ipsec_in_to_out(ira, &ixas, mp, ipha, NULL)) {
2808 2809                          BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
2809 2810                          /* Note: mp already consumed and ip_drop_packet done */
2810 2811                          return;
2811 2812                  }
2812 2813          } else {
2813 2814                  /*
2814 2815                   * This is in clear. The icmp message we are building
2815 2816                   * here should go out in clear, independent of our policy.
2816 2817                   */
2817 2818                  ixas.ixa_flags |= IXAF_NO_IPSEC;
2818 2819          }
2819 2820  
2820 2821          /* Remember our eventual destination */
2821 2822          dst = ipha->ipha_src;
2822 2823  
2823 2824          /*
2824 2825           * If the packet was for one of our unicast addresses, make
2825 2826           * sure we respond with that as the source. Otherwise
2826 2827           * have ip_output_simple pick the source address.
2827 2828           */
2828 2829          ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, 0,
2829 2830              (IRE_LOCAL|IRE_LOOPBACK), NULL, ira->ira_zoneid, NULL,
2830 2831              MATCH_IRE_TYPE|MATCH_IRE_ZONEONLY, 0, ipst, NULL);
2831 2832          if (ire != NULL) {
2832 2833                  ire_refrele(ire);
2833 2834                  src = ipha->ipha_dst;
2834 2835          } else {
2835 2836                  src = INADDR_ANY;
2836 2837                  ixas.ixa_flags |= IXAF_SET_SOURCE;
2837 2838          }
2838 2839  
2839 2840          /*
2840 2841           * Check if we can send back more then 8 bytes in addition to
2841 2842           * the IP header.  We try to send 64 bytes of data and the internal
2842 2843           * header in the special cases of ipv4 encapsulated ipv4 or ipv6.
2843 2844           */
2844 2845          len_needed = IPH_HDR_LENGTH(ipha);
2845 2846          if (ipha->ipha_protocol == IPPROTO_ENCAP ||
2846 2847              ipha->ipha_protocol == IPPROTO_IPV6) {
2847 2848                  if (!pullupmsg(mp, -1)) {
2848 2849                          BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
2849 2850                          ip_drop_output("ipIfStatsOutDiscards", mp, NULL);
2850 2851                          freemsg(mp);
2851 2852                          return;
2852 2853                  }
2853 2854                  ipha = (ipha_t *)mp->b_rptr;
2854 2855  
2855 2856                  if (ipha->ipha_protocol == IPPROTO_ENCAP) {
2856 2857                          len_needed += IPH_HDR_LENGTH(((uchar_t *)ipha +
2857 2858                              len_needed));
2858 2859                  } else {
2859 2860                          ip6_t *ip6h = (ip6_t *)((uchar_t *)ipha + len_needed);
2860 2861  
2861 2862                          ASSERT(ipha->ipha_protocol == IPPROTO_IPV6);
2862 2863                          len_needed += ip_hdr_length_v6(mp, ip6h);
2863 2864                  }
2864 2865          }
2865 2866          len_needed += ipst->ips_ip_icmp_return;
2866 2867          msg_len = msgdsize(mp);
2867 2868          if (msg_len > len_needed) {
2868 2869                  (void) adjmsg(mp, len_needed - msg_len);
2869 2870                  msg_len = len_needed;
2870 2871          }
2871 2872          mp1 = allocb(sizeof (icmp_ipha) + len, BPRI_MED);
2872 2873          if (mp1 == NULL) {
2873 2874                  BUMP_MIB(&ipst->ips_icmp_mib, icmpOutErrors);
2874 2875                  freemsg(mp);
2875 2876                  return;
2876 2877          }
2877 2878          mp1->b_cont = mp;
2878 2879          mp = mp1;
2879 2880  
2880 2881          /*
2881 2882           * Set IXAF_TRUSTED_ICMP so we can let the ICMP messages this
2882 2883           * node generates be accepted in peace by all on-host destinations.
2883 2884           * If we do NOT assume that all on-host destinations trust
2884 2885           * self-generated ICMP messages, then rework here, ip6.c, and spd.c.
2885 2886           * (Look for IXAF_TRUSTED_ICMP).
2886 2887           */
2887 2888          ixas.ixa_flags |= IXAF_TRUSTED_ICMP;
2888 2889  
2889 2890          ipha = (ipha_t *)mp->b_rptr;
2890 2891          mp1->b_wptr = (uchar_t *)ipha + (sizeof (icmp_ipha) + len);
2891 2892          *ipha = icmp_ipha;
2892 2893          ipha->ipha_src = src;
2893 2894          ipha->ipha_dst = dst;
2894 2895          ipha->ipha_ttl = ipst->ips_ip_def_ttl;
2895 2896          msg_len += sizeof (icmp_ipha) + len;
2896 2897          if (msg_len > IP_MAXPACKET) {
2897 2898                  (void) adjmsg(mp, IP_MAXPACKET - msg_len);
2898 2899                  msg_len = IP_MAXPACKET;
2899 2900          }
2900 2901          ipha->ipha_length = htons((uint16_t)msg_len);
2901 2902          icmph = (icmph_t *)&ipha[1];
2902 2903          bcopy(stuff, icmph, len);
2903 2904          icmph->icmph_checksum = 0;
2904 2905          icmph->icmph_checksum = IP_CSUM(mp, (int32_t)sizeof (ipha_t), 0);
2905 2906          BUMP_MIB(&ipst->ips_icmp_mib, icmpOutMsgs);
2906 2907  
2907 2908          (void) ip_output_simple(mp, &ixas);
2908 2909          ixa_cleanup(&ixas);
2909 2910  }
2910 2911  
2911 2912  /*
2912 2913   * Determine if an ICMP error packet can be sent given the rate limit.
2913 2914   * The limit consists of an average frequency (icmp_pkt_err_interval measured
2914 2915   * in milliseconds) and a burst size. Burst size number of packets can
2915 2916   * be sent arbitrarely closely spaced.
2916 2917   * The state is tracked using two variables to implement an approximate
2917 2918   * token bucket filter:
2918 2919   *      icmp_pkt_err_last - lbolt value when the last burst started
2919 2920   *      icmp_pkt_err_sent - number of packets sent in current burst
2920 2921   */
2921 2922  boolean_t
2922 2923  icmp_err_rate_limit(ip_stack_t *ipst)
2923 2924  {
2924 2925          clock_t now = TICK_TO_MSEC(ddi_get_lbolt());
2925 2926          uint_t refilled; /* Number of packets refilled in tbf since last */
2926 2927          /* Guard against changes by loading into local variable */
2927 2928          uint_t err_interval = ipst->ips_ip_icmp_err_interval;
2928 2929  
2929 2930          if (err_interval == 0)
2930 2931                  return (B_FALSE);
2931 2932  
2932 2933          if (ipst->ips_icmp_pkt_err_last > now) {
2933 2934                  /* 100HZ lbolt in ms for 32bit arch wraps every 49.7 days */
2934 2935                  ipst->ips_icmp_pkt_err_last = 0;
2935 2936                  ipst->ips_icmp_pkt_err_sent = 0;
2936 2937          }
2937 2938          /*
2938 2939           * If we are in a burst update the token bucket filter.
2939 2940           * Update the "last" time to be close to "now" but make sure
2940 2941           * we don't loose precision.
2941 2942           */
2942 2943          if (ipst->ips_icmp_pkt_err_sent != 0) {
2943 2944                  refilled = (now - ipst->ips_icmp_pkt_err_last)/err_interval;
2944 2945                  if (refilled > ipst->ips_icmp_pkt_err_sent) {
2945 2946                          ipst->ips_icmp_pkt_err_sent = 0;
2946 2947                  } else {
2947 2948                          ipst->ips_icmp_pkt_err_sent -= refilled;
2948 2949                          ipst->ips_icmp_pkt_err_last += refilled * err_interval;
2949 2950                  }
2950 2951          }
2951 2952          if (ipst->ips_icmp_pkt_err_sent == 0) {
2952 2953                  /* Start of new burst */
2953 2954                  ipst->ips_icmp_pkt_err_last = now;
2954 2955          }
2955 2956          if (ipst->ips_icmp_pkt_err_sent < ipst->ips_ip_icmp_err_burst) {
2956 2957                  ipst->ips_icmp_pkt_err_sent++;
2957 2958                  ip1dbg(("icmp_err_rate_limit: %d sent in burst\n",
2958 2959                      ipst->ips_icmp_pkt_err_sent));
2959 2960                  return (B_FALSE);
2960 2961          }
2961 2962          ip1dbg(("icmp_err_rate_limit: dropped\n"));
2962 2963          return (B_TRUE);
2963 2964  }
2964 2965  
2965 2966  /*
2966 2967   * Check if it is ok to send an IPv4 ICMP error packet in
2967 2968   * response to the IPv4 packet in mp.
2968 2969   * Free the message and return null if no
2969 2970   * ICMP error packet should be sent.
2970 2971   */
2971 2972  static mblk_t *
2972 2973  icmp_pkt_err_ok(mblk_t *mp, ip_recv_attr_t *ira)
2973 2974  {
2974 2975          ip_stack_t      *ipst = ira->ira_ill->ill_ipst;
2975 2976          icmph_t *icmph;
2976 2977          ipha_t  *ipha;
2977 2978          uint_t  len_needed;
2978 2979  
2979 2980          if (!mp)
2980 2981                  return (NULL);
2981 2982          ipha = (ipha_t *)mp->b_rptr;
2982 2983          if (ip_csum_hdr(ipha)) {
2983 2984                  BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInCksumErrs);
2984 2985                  ip_drop_input("ipIfStatsInCksumErrs", mp, NULL);
2985 2986                  freemsg(mp);
2986 2987                  return (NULL);
2987 2988          }
2988 2989          if (ip_type_v4(ipha->ipha_dst, ipst) == IRE_BROADCAST ||
2989 2990              ip_type_v4(ipha->ipha_src, ipst) == IRE_BROADCAST ||
2990 2991              CLASSD(ipha->ipha_dst) ||
2991 2992              CLASSD(ipha->ipha_src) ||
2992 2993              (ntohs(ipha->ipha_fragment_offset_and_flags) & IPH_OFFSET)) {
2993 2994                  /* Note: only errors to the fragment with offset 0 */
2994 2995                  BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
2995 2996                  freemsg(mp);
2996 2997                  return (NULL);
2997 2998          }
2998 2999          if (ipha->ipha_protocol == IPPROTO_ICMP) {
2999 3000                  /*
3000 3001                   * Check the ICMP type.  RFC 1122 sez:  don't send ICMP
3001 3002                   * errors in response to any ICMP errors.
3002 3003                   */
3003 3004                  len_needed = IPH_HDR_LENGTH(ipha) + ICMPH_SIZE;
3004 3005                  if (mp->b_wptr - mp->b_rptr < len_needed) {
3005 3006                          if (!pullupmsg(mp, len_needed)) {
3006 3007                                  BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors);
3007 3008                                  freemsg(mp);
3008 3009                                  return (NULL);
3009 3010                          }
3010 3011                          ipha = (ipha_t *)mp->b_rptr;
3011 3012                  }
3012 3013                  icmph = (icmph_t *)
3013 3014                      (&((char *)ipha)[IPH_HDR_LENGTH(ipha)]);
3014 3015                  switch (icmph->icmph_type) {
3015 3016                  case ICMP_DEST_UNREACHABLE:
3016 3017                  case ICMP_SOURCE_QUENCH:
3017 3018                  case ICMP_TIME_EXCEEDED:
3018 3019                  case ICMP_PARAM_PROBLEM:
3019 3020                  case ICMP_REDIRECT:
3020 3021                          BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
3021 3022                          freemsg(mp);
3022 3023                          return (NULL);
3023 3024                  default:
3024 3025                          break;
3025 3026                  }
3026 3027          }
3027 3028          /*
3028 3029           * If this is a labeled system, then check to see if we're allowed to
3029 3030           * send a response to this particular sender.  If not, then just drop.
3030 3031           */
3031 3032          if (is_system_labeled() && !tsol_can_reply_error(mp, ira)) {
3032 3033                  ip2dbg(("icmp_pkt_err_ok: can't respond to packet\n"));
3033 3034                  BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
3034 3035                  freemsg(mp);
3035 3036                  return (NULL);
3036 3037          }
3037 3038          if (icmp_err_rate_limit(ipst)) {
3038 3039                  /*
3039 3040                   * Only send ICMP error packets every so often.
3040 3041                   * This should be done on a per port/source basis,
3041 3042                   * but for now this will suffice.
3042 3043                   */
3043 3044                  freemsg(mp);
3044 3045                  return (NULL);
3045 3046          }
3046 3047          return (mp);
3047 3048  }
3048 3049  
3049 3050  /*
3050 3051   * Called when a packet was sent out the same link that it arrived on.
3051 3052   * Check if it is ok to send a redirect and then send it using a copy of mp.
3052 3053   */
3053 3054  void
3054 3055  ip_send_potential_redirect_v4(mblk_t *mp, ipha_t *ipha, ire_t *ire,
3055 3056      ip_recv_attr_t *ira)
3056 3057  {
3057 3058          ip_stack_t      *ipst = ira->ira_ill->ill_ipst;
3058 3059          ipaddr_t        src, nhop;
3059 3060          mblk_t          *mp1;
3060 3061          ire_t           *nhop_ire;
3061 3062  
3062 3063          /*
3063 3064           * Check the source address to see if it originated
3064 3065           * on the same logical subnet it is going back out on.
3065 3066           * If so, we should be able to send it a redirect.
3066 3067           * Avoid sending a redirect if the destination
3067 3068           * is directly connected (i.e., we matched an IRE_ONLINK),
3068 3069           * or if the packet was source routed out this interface.
3069 3070           *
3070 3071           * We avoid sending a redirect if the
3071 3072           * destination is directly connected
3072 3073           * because it is possible that multiple
3073 3074           * IP subnets may have been configured on
3074 3075           * the link, and the source may not
3075 3076           * be on the same subnet as ip destination,
3076 3077           * even though they are on the same
3077 3078           * physical link.
3078 3079           */
3079 3080          if ((ire->ire_type & IRE_ONLINK) ||
3080 3081              ip_source_routed(ipha, ipst))
3081 3082                  return;
3082 3083  
3083 3084          nhop_ire = ire_nexthop(ire);
3084 3085          if (nhop_ire == NULL)
3085 3086                  return;
3086 3087  
3087 3088          nhop = nhop_ire->ire_addr;
3088 3089  
3089 3090          if (nhop_ire->ire_type & IRE_IF_CLONE) {
3090 3091                  ire_t   *ire2;
3091 3092  
3092 3093                  /* Follow ire_dep_parent to find the non-clone IRE_INTERFACE; it may be NULL */
3093 3094                  mutex_enter(&nhop_ire->ire_lock);
3094 3095                  ire2 = nhop_ire->ire_dep_parent;
3095 3096                  if (ire2 != NULL)
3096 3097                          ire_refhold(ire2);
3097 3098                  mutex_exit(&nhop_ire->ire_lock);
3098 3099                  ire_refrele(nhop_ire);
3099 3100                  nhop_ire = ire2;
3100 3101          }
3101 3102          if (nhop_ire == NULL)
3102 3103                  return;
3103 3104  
3104 3105          ASSERT(!(nhop_ire->ire_type & IRE_IF_CLONE));
3105 3106  
3106 3107          src = ipha->ipha_src;
3107 3108  
3108 3109          /*
3109 3110           * We look at the interface ire for the nexthop,
3110 3111           * to see if ipha_src is in the same subnet
3111 3112           * as the nexthop.
3112 3113           */
3113 3114          if ((src & nhop_ire->ire_mask) == (nhop & nhop_ire->ire_mask)) {
3114 3115                  /*
3115 3116                   * The source is directly connected; send the redirect on a copy of mp.
3116 3117                   */
3117 3118                  mp1 = copymsg(mp);
3118 3119                  if (mp1 != NULL) {
3119 3120                          icmp_send_redirect(mp1, nhop, ira);
3120 3121                  }
3121 3122          }
3122 3123          ire_refrele(nhop_ire);
3123 3124  }
3124 3125  
3125 3126  /*
3126 3127   * Generate an ICMP redirect message for mp that advertises gateway.
3127 3128   */
3128 3129  static void
3129 3130  icmp_send_redirect(mblk_t *mp, ipaddr_t gateway, ip_recv_attr_t *ira)
3130 3131  {
3131 3132          icmph_t icmph;
3132 3133          ip_stack_t *ipst = ira->ira_ill->ill_ipst;
3133 3134  
3134 3135          mp = icmp_pkt_err_ok(mp, ira);
3135 3136          if (mp == NULL)
3136 3137                  return;
3137 3138  
3138 3139          bzero(&icmph, sizeof (icmph_t));
3139 3140          icmph.icmph_type = ICMP_REDIRECT;
3140 3141          icmph.icmph_code = 1;   /* NOTE(review): presumably the RFC 792 host-redirect code; confirm */
3141 3142          icmph.icmph_rd_gateway = gateway;
3142 3143          BUMP_MIB(&ipst->ips_icmp_mib, icmpOutRedirects);
3143 3144          icmp_pkt(mp, &icmph, sizeof (icmph_t), ira);
3144 3145  }
3145 3146  
3146 3147  /*
3147 3148   * Generate an ICMP time exceeded message for mp with the caller-supplied code.
3148 3149   */
3149 3150  void
3150 3151  icmp_time_exceeded(mblk_t *mp, uint8_t code, ip_recv_attr_t *ira)
3151 3152  {
3152 3153          icmph_t icmph;
3153 3154          ip_stack_t *ipst = ira->ira_ill->ill_ipst;
3154 3155  
3155 3156          mp = icmp_pkt_err_ok(mp, ira);
3156 3157          if (mp == NULL)
3157 3158                  return;
3158 3159  
3159 3160          bzero(&icmph, sizeof (icmph_t));
3160 3161          icmph.icmph_type = ICMP_TIME_EXCEEDED;
3161 3162          icmph.icmph_code = code;
3162 3163          BUMP_MIB(&ipst->ips_icmp_mib, icmpOutTimeExcds);
3163 3164          icmp_pkt(mp, &icmph, sizeof (icmph_t), ira);
3164 3165  }
3165 3166  
3166 3167  /*
3167 3168   * Generate an ICMP unreachable message for mp with the caller-supplied code.
3168 3169   * When called from ip_output side a minimal ip_recv_attr_t needs to be
3169 3170   * constructed by the caller.
3170 3171   */
3171 3172  void
3172 3173  icmp_unreachable(mblk_t *mp, uint8_t code, ip_recv_attr_t *ira)
3173 3174  {
3174 3175          icmph_t icmph;
3175 3176          ip_stack_t *ipst = ira->ira_ill->ill_ipst;
3176 3177  
3177 3178          mp = icmp_pkt_err_ok(mp, ira);
3178 3179          if (mp == NULL)
3179 3180                  return;
3180 3181  
3181 3182          bzero(&icmph, sizeof (icmph_t));
3182 3183          icmph.icmph_type = ICMP_DEST_UNREACHABLE;
3183 3184          icmph.icmph_code = code;
3184 3185          BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDestUnreachs);
3185 3186          icmp_pkt(mp, &icmph, sizeof (icmph_t), ira);
3186 3187  }
3187 3188  
3188 3189  /*
3189 3190   * Latch in the IPsec state for a stream based on the policy in the listener
3190 3191   * and the actions in the ip_recv_attr_t.
3191 3192   * Called directly from TCP and SCTP.  Returns B_FALSE only if iplatch_create() fails.
3192 3193   */
3193 3194  boolean_t
3194 3195  ip_ipsec_policy_inherit(conn_t *connp, conn_t *lconnp, ip_recv_attr_t *ira)
3195 3196  {
3196 3197          ASSERT(lconnp->conn_policy != NULL);
3197 3198          ASSERT(connp->conn_policy == NULL);
3198 3199  
3199 3200          IPPH_REFHOLD(lconnp->conn_policy);
3200 3201          connp->conn_policy = lconnp->conn_policy;
3201 3202  
3202 3203          if (ira->ira_ipsec_action != NULL) {
3203 3204                  if (connp->conn_latch == NULL) {
3204 3205                          connp->conn_latch = iplatch_create();
3205 3206                          if (connp->conn_latch == NULL)
3206 3207                                  return (B_FALSE);
3207 3208                  }
3208 3209                  ipsec_latch_inbound(connp, ira);
3209 3210          }
3210 3211          return (B_TRUE);
3211 3212  }
3212 3213  
3213 3214  /*
3214 3215   * Verify whether or not the IP address is a valid local address.
3215 3216   * Could be a unicast, including one for a down interface.
3216 3217   * If allow_mcbc then a multicast or broadcast address is also
3217 3218   * acceptable.
3218 3219   *
3219 3220   * In the case of a broadcast/multicast address, however, the
3220 3221   * upper protocol is expected to reset the src address
3221 3222   * to zero when we return IPVL_MCAST/IPVL_BCAST so that
3222 3223   * no packets are emitted with broadcast/multicast address as
3223 3224   * source address (that violates hosts requirements RFC 1122).
3224 3225   * The addresses valid for bind are:
3225 3226   *      (1) - INADDR_ANY (0)
3226 3227   *      (2) - IP address of an UP interface
3227 3228   *      (3) - IP address of a DOWN interface
3228 3229   *      (4) - valid local IP broadcast addresses. In this case
3229 3230   *      the conn will only receive packets destined to
3230 3231   *      the specified broadcast address.
3231 3232   *      (5) - a multicast address. In this case
3232 3233   *      the conn will only receive packets destined to
3233 3234   *      the specified multicast address. Note: the
3234 3235   *      application still has to issue an
3235 3236   *      IP_ADD_MEMBERSHIP socket option.
3236 3237   *
3237 3238   * In all the above cases, the bound address must be valid in the current zone.
3238 3239   * When the address is loopback, multicast or broadcast, there might be many
3239 3240   * matching IREs so bind has to look up based on the zone.
3240 3241   */
3241 3242  ip_laddr_t
3242 3243  ip_laddr_verify_v4(ipaddr_t src_addr, zoneid_t zoneid,
3243 3244      ip_stack_t *ipst, boolean_t allow_mcbc)
3244 3245  {
3245 3246          ire_t *src_ire;
3246 3247  
3247 3248          ASSERT(src_addr != INADDR_ANY);
3248 3249  
3249 3250          src_ire = ire_ftable_lookup_v4(src_addr, 0, 0, 0,
3250 3251              NULL, zoneid, NULL, MATCH_IRE_ZONEONLY, 0, ipst, NULL);
3251 3252  
3252 3253          /*
3253 3254           * If an address other than INADDR_ANY is requested,
3254 3255           * we verify that it is a valid address for bind.
3255 3256           * Note: Following code is in if-else-if form for
3256 3257           * readability compared to a condition check.
3257 3258           */
3258 3259          if (src_ire != NULL && (src_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK))) {
3259 3260                  /*
3260 3261                   * (2) Bind to address of local UP interface
3261 3262                   */
3262 3263                  ire_refrele(src_ire);
3263 3264                  return (IPVL_UNICAST_UP);
3264 3265          } else if (src_ire != NULL && src_ire->ire_type & IRE_BROADCAST) {
3265 3266                  /*
3266 3267                   * (4) Bind to broadcast address
3267 3268                   */
3268 3269                  ire_refrele(src_ire);
3269 3270                  if (allow_mcbc)
3270 3271                          return (IPVL_BCAST);
3271 3272                  else
3272 3273                          return (IPVL_BAD);
3273 3274          } else if (CLASSD(src_addr)) {
3274 3275                  /* (5) bind to multicast address. */
3275 3276                  if (src_ire != NULL)
3276 3277                          ire_refrele(src_ire);
3277 3278  
3278 3279                  if (allow_mcbc)
3279 3280                          return (IPVL_MCAST);
3280 3281                  else
3281 3282                          return (IPVL_BAD);
3282 3283          } else {
3283 3284                  ipif_t *ipif;
3284 3285  
3285 3286                  /*
3286 3287                   * (3) Bind to address of local DOWN interface?
3287 3288                   * (ipif_lookup_addr() looks up all interfaces
3288 3289                   * but we do not get here for UP interfaces
3289 3290                   * - case (2) above)
3290 3291                   */
3291 3292                  if (src_ire != NULL)
3292 3293                          ire_refrele(src_ire);
3293 3294  
3294 3295                  ipif = ipif_lookup_addr(src_addr, NULL, zoneid, ipst);
3295 3296                  if (ipif == NULL)
3296 3297                          return (IPVL_BAD);
3297 3298  
3298 3299                  /* Not a useful source? */
3299 3300                  if (ipif->ipif_flags & (IPIF_NOLOCAL | IPIF_ANYCAST)) {
3300 3301                          ipif_refrele(ipif);
3301 3302                          return (IPVL_BAD);
3302 3303                  }
3303 3304                  ipif_refrele(ipif);
3304 3305                  return (IPVL_UNICAST_DOWN);
3305 3306          }
3306 3307  }
3307 3308  
3308 3309  /*
3309 3310   * Insert in the bind fanout for IPv4 and IPv6.
3310 3311   * The caller should already have used ip_laddr_verify_v*() before calling
3311 3312   * this.  Returns the result of ipcl_bind_insert().
3312 3313   */
3313 3314  int
3314 3315  ip_laddr_fanout_insert(conn_t *connp)
3315 3316  {
3316 3317          int             error;
3317 3318  
3318 3319          /*
3319 3320           * Allow setting new policies. For example, disconnects result
3320 3321           * in us being called. As we would have set conn_policy_cached
3321 3322           * to B_TRUE before, we should set it to B_FALSE, so that policy
3322 3323           * can change after the disconnect.
3323 3324           */
3324 3325          connp->conn_policy_cached = B_FALSE;
3325 3326  
3326 3327          error = ipcl_bind_insert(connp);
3327 3328          if (error != 0) {
3328 3329                  if (connp->conn_anon_port) {
3329 3330                          (void) tsol_mlp_anon(crgetzone(connp->conn_cred),
3330 3331                              connp->conn_mlp_type, connp->conn_proto,
3331 3332                              ntohs(connp->conn_lport), B_FALSE);
3332 3333                  }
3333 3334                  connp->conn_mlp_type = mlptSingle;
3334 3335          }
3335 3336          return (error);
3336 3337  }
3337 3338  
3338 3339  /*
3339 3340   * Verify that both the source and destination addresses are valid. If
3340 3341   * IPDF_VERIFY_DST is not set, then the destination address may be unreachable,
3341 3342   * i.e. have no route to it.  Protocols like TCP want to verify destination
3342 3343   * reachability, while tunnels do not.
3343 3344   *
3344 3345   * Determine the route, the interface, and (optionally) the source address
3345 3346   * to use to reach a given destination.
3346 3347   * Note that we allow connect to broadcast and multicast addresses when
3347 3348   * IPDF_ALLOW_MCBC is set.
3348 3349   * first_hop and dst_addr are normally the same, but if source routing
3349 3350   * they will differ; in that case the first_hop is what we'll use for the
3350 3351   * routing lookup but the dce and label checks will be done on dst_addr.
3351 3352   *
3352 3353   * If uinfo is set, then we fill in the best available information
3353 3354   * we have for the destination. This is based on (in priority order) any
3354 3355   * metrics and path MTU stored in a dce_t, route metrics, and finally the
3355 3356   * ill_mtu/ill_mc_mtu.
3356 3357   *
3357 3358   * Tsol note: If we have a source route then dst_addr != firsthop. But we
3358 3359   * always do the label check on dst_addr.
3359 3360   */
3360 3361  int
3361 3362  ip_set_destination_v4(ipaddr_t *src_addrp, ipaddr_t dst_addr, ipaddr_t firsthop,
3362 3363      ip_xmit_attr_t *ixa, iulp_t *uinfo, uint32_t flags, uint_t mac_mode)
3363 3364  {
3364 3365          ire_t           *ire = NULL;
3365 3366          int             error = 0;
3366 3367          ipaddr_t        setsrc;                         /* RTF_SETSRC */
3367 3368          zoneid_t        zoneid = ixa->ixa_zoneid;       /* Honors SO_ALLZONES */
3368 3369          ip_stack_t      *ipst = ixa->ixa_ipst;
3369 3370          dce_t           *dce;
3370 3371          uint_t          pmtu;
3371 3372          uint_t          generation;
3372 3373          nce_t           *nce;
3373 3374          ill_t           *ill = NULL;
3374 3375          boolean_t       multirt = B_FALSE;
3375 3376  
3376 3377          ASSERT(ixa->ixa_flags & IXAF_IS_IPV4);
3377 3378  
3378 3379          /*
3379 3380           * We never send to zero; the ULPs map it to the loopback address.
3380 3381           * We can't allow it since we use zero to mean uninitialized in some
3381 3382           * places.
3382 3383           */
3383 3384          ASSERT(dst_addr != INADDR_ANY);
3384 3385  
3385 3386          if (is_system_labeled()) {
3386 3387                  ts_label_t *tsl = NULL;
3387 3388  
3388 3389                  error = tsol_check_dest(ixa->ixa_tsl, &dst_addr, IPV4_VERSION,
3389 3390                      mac_mode, (flags & IPDF_ZONE_IS_GLOBAL) != 0, &tsl);
3390 3391                  if (error != 0)
3391 3392                          return (error);
3392 3393                  if (tsl != NULL) {
3393 3394                          /* Update the label */
3394 3395                          ip_xmit_attr_replace_tsl(ixa, tsl);
3395 3396                  }
3396 3397          }
3397 3398  
3398 3399          setsrc = INADDR_ANY;
3399 3400          /*
3400 3401           * Select a route; For IPMP interfaces, we would only select
3401 3402           * a "hidden" route (i.e., going through a specific under_ill)
3402 3403           * if ixa_ifindex has been specified.
3403 3404           */
3404 3405          ire = ip_select_route_v4(firsthop, *src_addrp, ixa,
3405 3406              &generation, &setsrc, &error, &multirt);
3406 3407          ASSERT(ire != NULL);    /* IRE_NOROUTE if none found */
3407 3408          if (error != 0)
3408 3409                  goto bad_addr;
3409 3410  
3410 3411          /*
3411 3412           * ire can't be a broadcast or multicast unless IPDF_ALLOW_MCBC is set.
3412 3413           * If IPDF_VERIFY_DST is set, the destination must be reachable;
3413 3414           * Otherwise the destination needn't be reachable.
3414 3415           *
3415 3416           * If we match on a reject or black hole, then we've got a
3416 3417           * local failure.  May as well fail out the connect() attempt,
3417 3418           * since it's never going to succeed.
3418 3419           */
3419 3420          if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
3420 3421                  /*
3421 3422                   * If we're verifying destination reachability, we always want
3422 3423                   * to complain here.
3423 3424                   *
3424 3425                   * If we're not verifying destination reachability but the
3425 3426                   * destination has a route, we still want to fail on the
3426 3427                   * temporary address and broadcast address tests.
3427 3428                   *
3428 3429                   * In both cases we let the code continue so some reasonable
3429 3430                   * information is returned to the caller. That enables the
3430 3431                   * caller to use (and even cache) the IRE. conn_ip_output will
3431 3432                   * use the generation mismatch path to check for the unreachable
3432 3433                   * case thereby avoiding any specific check in the main path.
3433 3434                   */
3434 3435                  ASSERT(generation == IRE_GENERATION_VERIFY);
3435 3436                  if (flags & IPDF_VERIFY_DST) {
3436 3437                          /*
3437 3438                           * Set errno but continue to set up ixa_ire to be
3438 3439                           * the RTF_REJECT|RTF_BLACKHOLE IRE.
3439 3440                           * That allows callers to use ip_output to get an
3440 3441                           * ICMP error back.
3441 3442                           */
3442 3443                          if (!(ire->ire_type & IRE_HOST))
3443 3444                                  error = ENETUNREACH;
3444 3445                          else
3445 3446                                  error = EHOSTUNREACH;
3446 3447                  }
3447 3448          }
3448 3449  
3449 3450          if ((ire->ire_type & (IRE_BROADCAST|IRE_MULTICAST)) &&
3450 3451              !(flags & IPDF_ALLOW_MCBC)) {
3451 3452                  ire_refrele(ire);
3452 3453                  ire = ire_reject(ipst, B_FALSE);
3453 3454                  generation = IRE_GENERATION_VERIFY;
3454 3455                  error = ENETUNREACH;
3455 3456          }
3456 3457  
3457 3458          /* Cache things */
3458 3459          if (ixa->ixa_ire != NULL)
3459 3460                  ire_refrele_notr(ixa->ixa_ire);
3460 3461  #ifdef DEBUG
3461 3462          ire_refhold_notr(ire);
3462 3463          ire_refrele(ire);
3463 3464  #endif
3464 3465          ixa->ixa_ire = ire;
3465 3466          ixa->ixa_ire_generation = generation;
3466 3467  
3467 3468          /*
3468 3469           * Ensure that ixa_dce is always set any time that ixa_ire is set,
3469 3470           * since some callers will send a packet to conn_ip_output() even if
3470 3471           * there's an error.
3471 3472           */
3472 3473          if (flags & IPDF_UNIQUE_DCE) {
3473 3474                  /* Fallback to the default dce if allocation fails */
3474 3475                  dce = dce_lookup_and_add_v4(dst_addr, ipst);
3475 3476                  if (dce != NULL)
3476 3477                          generation = dce->dce_generation;
3477 3478                  else
3478 3479                          dce = dce_lookup_v4(dst_addr, ipst, &generation);
3479 3480          } else {
3480 3481                  dce = dce_lookup_v4(dst_addr, ipst, &generation);
3481 3482          }
3482 3483          ASSERT(dce != NULL);
3483 3484          if (ixa->ixa_dce != NULL)
3484 3485                  dce_refrele_notr(ixa->ixa_dce);
3485 3486  #ifdef DEBUG
3486 3487          dce_refhold_notr(dce);
3487 3488          dce_refrele(dce);
3488 3489  #endif
3489 3490          ixa->ixa_dce = dce;
3490 3491          ixa->ixa_dce_generation = generation;
3491 3492  
3492 3493          /*
3493 3494           * For multicast with multirt we have a flag passed back from
3494 3495           * ire_lookup_multi_ill_v4 since we don't have an IRE for each
3495 3496           * possible multicast address.
3496 3497           * We also need a flag for multicast since we can't check
3497 3498           * whether RTF_MULTIRT is set in ixa_ire for multicast.
3498 3499           */
3499 3500          if (multirt) {
3500 3501                  ixa->ixa_postfragfn = ip_postfrag_multirt_v4;
3501 3502                  ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST;
3502 3503          } else {
3503 3504                  ixa->ixa_postfragfn = ire->ire_postfragfn;
3504 3505                  ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST;
3505 3506          }
3506 3507          if (!(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
3507 3508                  /* Get an nce to cache. */
3508 3509                  nce = ire_to_nce(ire, firsthop, NULL);
3509 3510                  if (nce == NULL) {
3510 3511                          /* Allocation failure? */
3511 3512                          ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
3512 3513                  } else {
3513 3514                          if (ixa->ixa_nce != NULL)
3514 3515                                  nce_refrele(ixa->ixa_nce);
3515 3516                          ixa->ixa_nce = nce;
3516 3517                  }
3517 3518          }
3518 3519  
3519 3520          /*
3520 3521           * If the source address is a loopback address, the
3521 3522           * destination had best be local or multicast.
3522 3523           * If we are sending to an IRE_LOCAL using a loopback source then
3523 3524           * it had better be the same zoneid.
3524 3525           */
3525 3526          if (*src_addrp == htonl(INADDR_LOOPBACK)) {
3526 3527                  if ((ire->ire_type & IRE_LOCAL) && ire->ire_zoneid != zoneid) {
3527 3528                          ire = NULL;     /* Stored in ixa_ire */
3528 3529                          error = EADDRNOTAVAIL;
3529 3530                          goto bad_addr;
3530 3531                  }
3531 3532                  if (!(ire->ire_type & (IRE_LOOPBACK|IRE_LOCAL|IRE_MULTICAST))) {
3532 3533                          ire = NULL;     /* Stored in ixa_ire */
3533 3534                          error = EADDRNOTAVAIL;
3534 3535                          goto bad_addr;
3535 3536                  }
3536 3537          }
3537 3538          if (ire->ire_type & IRE_BROADCAST) {
3538 3539                  /*
3539 3540                   * If the ULP didn't have a specified source, then we
3540 3541                   * make sure we reselect the source when sending
3541 3542                   * broadcasts out different interfaces.
3542 3543                   */
3543 3544                  if (flags & IPDF_SELECT_SRC)
3544 3545                          ixa->ixa_flags |= IXAF_SET_SOURCE;
3545 3546                  else
3546 3547                          ixa->ixa_flags &= ~IXAF_SET_SOURCE;
3547 3548          }
3548 3549  
3549 3550          /*
3550 3551           * Does the caller want us to pick a source address?
3551 3552           */
3552 3553          if (flags & IPDF_SELECT_SRC) {
3553 3554                  ipaddr_t        src_addr;
3554 3555  
3555 3556                  /*
3556 3557                   * We use ire_nexthop_ill to avoid the under ipmp
3557 3558                   * interface for source address selection. Note that for ipmp
3558 3559                   * probe packets, ixa_ifindex would have been specified, and
3559 3560                   * the ip_select_route() invocation would have picked an ire
3560 3561                   * with ire_ill pointing at an under interface.
3561 3562                   */
3562 3563                  ill = ire_nexthop_ill(ire);
3563 3564  
3564 3565                  /* If unreachable we have no ill but need some source */
3565 3566                  if (ill == NULL) {
3566 3567                          src_addr = htonl(INADDR_LOOPBACK);
3567 3568                          /* Make sure we look for a better source address */
3568 3569                          generation = SRC_GENERATION_VERIFY;
3569 3570                  } else {
3570 3571                          error = ip_select_source_v4(ill, setsrc, dst_addr,
3571 3572                              ixa->ixa_multicast_ifaddr, zoneid,
3572 3573                              ipst, &src_addr, &generation, NULL);
3573 3574                          if (error != 0) {
3574 3575                                  ire = NULL;     /* Stored in ixa_ire */
3575 3576                                  goto bad_addr;
3576 3577                          }
3577 3578                  }
3578 3579  
3579 3580                  /*
3580 3581                   * We allow the source address to be down.
3581 3582                   * However, we check that we don't use the loopback address
3582 3583                   * as a source when sending out on the wire.
3583 3584                   */
3584 3585                  if ((src_addr == htonl(INADDR_LOOPBACK)) &&
3585 3586                      !(ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK|IRE_MULTICAST)) &&
3586 3587                      !(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
3587 3588                          ire = NULL;     /* Stored in ixa_ire */
3588 3589                          error = EADDRNOTAVAIL;
3589 3590                          goto bad_addr;
3590 3591                  }
3591 3592  
3592 3593                  *src_addrp = src_addr;
3593 3594                  ixa->ixa_src_generation = generation;
3594 3595          }
3595 3596  
3596 3597          /*
3597 3598           * Make sure we don't leave an unreachable ixa_nce in place
3598 3599           * since ip_select_route is used when we unplumb i.e., remove
3599 3600           * references on ixa_ire, ixa_nce, and ixa_dce.
3600 3601           */
3601 3602          nce = ixa->ixa_nce;
3602 3603          if (nce != NULL && nce->nce_is_condemned) {
3603 3604                  nce_refrele(nce);
3604 3605                  ixa->ixa_nce = NULL;
3605 3606                  ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
3606 3607          }
3607 3608  
3608 3609          /*
3609 3610           * The caller has set IXAF_PMTU_DISCOVERY if path MTU is desired.
3610 3611           * However, we can't do it for IPv4 multicast or broadcast.
3611 3612           */
3612 3613          if (ire->ire_type & (IRE_BROADCAST|IRE_MULTICAST))
3613 3614                  ixa->ixa_flags &= ~IXAF_PMTU_DISCOVERY;
3614 3615  
3615 3616          /*
3616 3617           * Set initial value for fragmentation limit. Either conn_ip_output
3617 3618           * or ULP might update it when there are routing changes.
3618 3619           * Handles a NULL ixa_ire->ire_ill or a NULL ixa_nce for RTF_REJECT.
3619 3620           */
3620 3621          pmtu = ip_get_pmtu(ixa);
3621 3622          ixa->ixa_fragsize = pmtu;
3622 3623          /* Make sure ixa_fragsize and ixa_pmtu remain identical */
3623 3624          if (ixa->ixa_flags & IXAF_VERIFY_PMTU)
3624 3625                  ixa->ixa_pmtu = pmtu;
3625 3626  
3626 3627          /*
3627 3628           * Extract information useful for some transports.
3628 3629           * First we look for DCE metrics. Then we take what we have in
3629 3630           * the metrics in the route, where the offlink is used if we have
3630 3631           * one.
3631 3632           */
3632 3633          if (uinfo != NULL) {
3633 3634                  bzero(uinfo, sizeof (*uinfo));
3634 3635  
3635 3636                  if (dce->dce_flags & DCEF_UINFO)
3636 3637                          *uinfo = dce->dce_uinfo;
3637 3638  
3638 3639                  rts_merge_metrics(uinfo, &ire->ire_metrics);
3639 3640  
3640 3641                  /* Allow ire_metrics to decrease the path MTU from above */
3641 3642                  if (uinfo->iulp_mtu == 0 || uinfo->iulp_mtu > pmtu)
3642 3643                          uinfo->iulp_mtu = pmtu;
3643 3644  
3644 3645                  uinfo->iulp_localnet = (ire->ire_type & IRE_ONLINK) != 0;
3645 3646                  uinfo->iulp_loopback = (ire->ire_type & IRE_LOOPBACK) != 0;
3646 3647                  uinfo->iulp_local = (ire->ire_type & IRE_LOCAL) != 0;
3647 3648          }
3648 3649  
3649 3650          if (ill != NULL)
3650 3651                  ill_refrele(ill);
3651 3652  
3652 3653          return (error);
3653 3654  
3654 3655  bad_addr:
3655 3656          if (ire != NULL)
3656 3657                  ire_refrele(ire);
3657 3658  
3658 3659          if (ill != NULL)
3659 3660                  ill_refrele(ill);
3660 3661  
3661 3662          /*
3662 3663           * Make sure we don't leave an unreachable ixa_nce in place
3663 3664           * since ip_select_route is used when we unplumb i.e., remove
3664 3665           * references on ixa_ire, ixa_nce, and ixa_dce.
3665 3666           */
3666 3667          nce = ixa->ixa_nce;
3667 3668          if (nce != NULL && nce->nce_is_condemned) {
3668 3669                  nce_refrele(nce);
3669 3670                  ixa->ixa_nce = NULL;
3670 3671                  ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
3671 3672          }
3672 3673  
3673 3674          return (error);
3674 3675  }
3675 3676  
3676 3677  
3677 3678  /*
3678 3679   * Get the base MTU for the case when path MTU discovery is not used.
3679 3680   * Takes the MTU of the IRE into account when it is non-zero and smaller.
3680 3681   */
3681 3682  uint_t
3682 3683  ip_get_base_mtu(ill_t *ill, ire_t *ire)
3683 3684  {
3684 3685          uint_t mtu;
3685 3686          uint_t iremtu = ire->ire_metrics.iulp_mtu;
3686 3687  
3687 3688          if (ire->ire_type & (IRE_MULTICAST|IRE_BROADCAST))
3688 3689                  mtu = ill->ill_mc_mtu;
3689 3690          else
3690 3691                  mtu = ill->ill_mtu;
3691 3692  
3692 3693          if (iremtu != 0 && iremtu < mtu)
3693 3694                  mtu = iremtu;
3694 3695  
3695 3696          return (mtu);
3696 3697  }
3697 3698  
3698 3699  /*
3699 3700   * Get the PMTU for the attributes. Handles both IPv4 and IPv6.
3700 3701   * Assumes that ixa_ire, dce, and nce have already been set up.
3701 3702   *
3702 3703   * The caller has set IXAF_PMTU_DISCOVERY if path MTU discovery is desired.
3703 3704   * We avoid path MTU discovery if it is disabled with ndd.
3704 3705   * Furthermore, if the path MTU is too small, then we don't set DF for IPv4.
3705 3706   *
3706 3707   * NOTE: We also used to turn it off for source routed packets. That
3707 3708   * is no longer required since the dce is per final destination.
3708 3709   */
3709 3710  uint_t
3710 3711  ip_get_pmtu(ip_xmit_attr_t *ixa)
3711 3712  {
3712 3713          ip_stack_t      *ipst = ixa->ixa_ipst;
3713 3714          dce_t           *dce;
3714 3715          nce_t           *nce;
3715 3716          ire_t           *ire;
3716 3717          uint_t          pmtu;
3717 3718  
3718 3719          ire = ixa->ixa_ire;
3719 3720          dce = ixa->ixa_dce;
3720 3721          nce = ixa->ixa_nce;
3721 3722  
3722 3723          /*
3723 3724           * If path MTU discovery has been turned off by ndd, then we ignore
3724 3725           * any dce_pmtu and for IPv4 we will not set DF.
3725 3726           */
3726 3727          if (!ipst->ips_ip_path_mtu_discovery)
3727 3728                  ixa->ixa_flags &= ~IXAF_PMTU_DISCOVERY;
3728 3729  
3729 3730          pmtu = IP_MAXPACKET;
3730 3731          /*
3731 3732           * Decide whether IPv4 sets DF
3732 3733           * For IPv6 "no DF" means to use the 1280 mtu
3733 3734           */
3734 3735          if (ixa->ixa_flags & IXAF_PMTU_DISCOVERY) {
3735 3736                  ixa->ixa_flags |= IXAF_PMTU_IPV4_DF;
3736 3737          } else {
3737 3738                  ixa->ixa_flags &= ~IXAF_PMTU_IPV4_DF;
3738 3739                  if (!(ixa->ixa_flags & IXAF_IS_IPV4))
3739 3740                          pmtu = IPV6_MIN_MTU;
3740 3741          }
3741 3742  
3742 3743          /* Check if the PMTU is too old before we use it */
3743 3744          if ((dce->dce_flags & DCEF_PMTU) &&
3744 3745              TICK_TO_SEC(ddi_get_lbolt64()) - dce->dce_last_change_time >
3745 3746              ipst->ips_ip_pathmtu_interval) {
3746 3747                  /*
3747 3748                   * Older than 20 minutes. Drop the path MTU information.
3748 3749                   */
3749 3750                  mutex_enter(&dce->dce_lock);
3750 3751                  dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU);
3751 3752                  dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
3752 3753                  mutex_exit(&dce->dce_lock);
3753 3754                  dce_increment_generation(dce);
3754 3755          }
3755 3756  
3756 3757          /* The metrics on the route can lower the path MTU */
3757 3758          if (ire->ire_metrics.iulp_mtu != 0 &&
3758 3759              ire->ire_metrics.iulp_mtu < pmtu)
3759 3760                  pmtu = ire->ire_metrics.iulp_mtu;
3760 3761  
3761 3762          /*
3762 3763           * If the path MTU is smaller than some minimum, we still use dce_pmtu
3763 3764           * above (would be 576 for IPv4 and 1280 for IPv6), but we clear
3764 3765           * IXAF_PMTU_IPV4_DF so that we avoid setting DF for IPv4.
3765 3766           */
3766 3767          if (ixa->ixa_flags & IXAF_PMTU_DISCOVERY) {
3767 3768                  if (dce->dce_flags & DCEF_PMTU) {
3768 3769                          if (dce->dce_pmtu < pmtu)
3769 3770                                  pmtu = dce->dce_pmtu;
3770 3771  
3771 3772                          if (dce->dce_flags & DCEF_TOO_SMALL_PMTU) {
3772 3773                                  ixa->ixa_flags |= IXAF_PMTU_TOO_SMALL;
3773 3774                                  ixa->ixa_flags &= ~IXAF_PMTU_IPV4_DF;
3774 3775                          } else {
3775 3776                                  ixa->ixa_flags &= ~IXAF_PMTU_TOO_SMALL;
3776 3777                                  ixa->ixa_flags |= IXAF_PMTU_IPV4_DF;
3777 3778                          }
3778 3779                  } else {
3779 3780                          ixa->ixa_flags &= ~IXAF_PMTU_TOO_SMALL;
3780 3781                          ixa->ixa_flags |= IXAF_PMTU_IPV4_DF;
3781 3782                  }
3782 3783          }
3783 3784  
3784 3785          /*
3785 3786           * If we have an IRE_LOCAL we use the loopback mtu instead of
3786 3787           * the ill for going out the wire i.e., IRE_LOCAL gets the same
3787 3788           * mtu as IRE_LOOPBACK.
3788 3789           */
3789 3790          if (ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) {
3790 3791                  uint_t loopback_mtu;
3791 3792  
3792 3793                  loopback_mtu = (ire->ire_ipversion == IPV6_VERSION) ?
3793 3794                      ip_loopback_mtu_v6plus : ip_loopback_mtuplus;
3794 3795  
3795 3796                  if (loopback_mtu < pmtu)
3796 3797                          pmtu = loopback_mtu;
3797 3798          } else if (nce != NULL) {
3798 3799                  /*
3799 3800                   * Make sure we don't exceed the interface MTU.
3800 3801                   * In the case of RTF_REJECT or RTF_BLACKHOLE we might not have
3801 3802                   * an ill. We'd use the above IP_MAXPACKET in that case just
3802 3803                   * to tell the transport something larger than zero.
3803 3804                   */
3804 3805                  if (ire->ire_type & (IRE_MULTICAST|IRE_BROADCAST)) {
3805 3806                          if (nce->nce_common->ncec_ill->ill_mc_mtu < pmtu)
3806 3807                                  pmtu = nce->nce_common->ncec_ill->ill_mc_mtu;
3807 3808                          if (nce->nce_common->ncec_ill != nce->nce_ill &&
3808 3809                              nce->nce_ill->ill_mc_mtu < pmtu) {
3809 3810                                  /*
3810 3811                                   * for interfaces in an IPMP group, the mtu of
3811 3812                                   * the nce_ill (under_ill) could be different
3812 3813                                   * from the mtu of the ncec_ill, so we take the
3813 3814                                   * min of the two.
3814 3815                                   */
3815 3816                                  pmtu = nce->nce_ill->ill_mc_mtu;
3816 3817                          }
3817 3818                  } else {
3818 3819                          if (nce->nce_common->ncec_ill->ill_mtu < pmtu)
3819 3820                                  pmtu = nce->nce_common->ncec_ill->ill_mtu;
3820 3821                          if (nce->nce_common->ncec_ill != nce->nce_ill &&
3821 3822                              nce->nce_ill->ill_mtu < pmtu) {
3822 3823                                  /*
3823 3824                                   * for interfaces in an IPMP group, the mtu of
3824 3825                                   * the nce_ill (under_ill) could be different
3825 3826                                   * from the mtu of the ncec_ill, so we take the
3826 3827                                   * min of the two.
3827 3828                                   */
3828 3829                                  pmtu = nce->nce_ill->ill_mtu;
3829 3830                          }
3830 3831                  }
3831 3832          }
3832 3833  
3833 3834          /*
3834 3835           * Handle the IPV6_USE_MIN_MTU socket option or ancillary data.
3835 3836           * Only applies to IPv6.
3836 3837           */
3837 3838          if (!(ixa->ixa_flags & IXAF_IS_IPV4)) {
3838 3839                  if (ixa->ixa_flags & IXAF_USE_MIN_MTU) {
3839 3840                          switch (ixa->ixa_use_min_mtu) {
3840 3841                          case IPV6_USE_MIN_MTU_MULTICAST:
3841 3842                                  if (ire->ire_type & IRE_MULTICAST)
3842 3843                                          pmtu = IPV6_MIN_MTU;
3843 3844                                  break;
3844 3845                          case IPV6_USE_MIN_MTU_ALWAYS:
3845 3846                                  pmtu = IPV6_MIN_MTU;
3846 3847                                  break;
3847 3848                          case IPV6_USE_MIN_MTU_NEVER:
3848 3849                                  break;
3849 3850                          }
3850 3851                  } else {
3851 3852                          /* Default is IPV6_USE_MIN_MTU_MULTICAST */
3852 3853                          if (ire->ire_type & IRE_MULTICAST)
3853 3854                                  pmtu = IPV6_MIN_MTU;
3854 3855                  }
3855 3856          }
3856 3857  
3857 3858          /*
3858 3859           * For multirouted IPv6 packets, the IP layer will insert an 8-byte
3859 3860           * fragment header in every packet. We compensate for those cases by
3860 3861           * returning a smaller path MTU to the ULP.
3861 3862           *
3862 3863           * In the case of CGTP then ip_output will add a fragment header.
3863 3864           * Make sure there is room for it by telling a smaller number
3864 3865           * to the transport.
3865 3866           *
3866 3867           * When setting IXAF_IPV6_ADD_FRAGHDR we subtract the frag hdr here
3867 3868           * so the ULPs consistently see an iulp_pmtu and ip_get_pmtu()
3868 3869           * which is the size of the packets it can send.
3869 3870           */
3870 3871          if (!(ixa->ixa_flags & IXAF_IS_IPV4)) {
3871 3872                  if ((ire->ire_flags & RTF_MULTIRT) ||
3872 3873                      (ixa->ixa_flags & IXAF_MULTIRT_MULTICAST)) {
3873 3874                          pmtu -= sizeof (ip6_frag_t);
3874 3875                          ixa->ixa_flags |= IXAF_IPV6_ADD_FRAGHDR;
3875 3876                  }
3876 3877          }
3877 3878  
3878 3879          return (pmtu);
3879 3880  }
3880 3881  
3881 3882  /*
3882 3883   * Carve "len" bytes out of an mblk chain, consuming any we empty, and duping
3883 3884   * the final piece where we don't.  Return a pointer to the first mblk in the
3884 3885   * result, and update the pointer to the next mblk to chew on.  If anything
3885 3886   * goes wrong (i.e., dupb fails), we waste everything in sight and return a
3886 3887   * NULL pointer.
3887 3888   */
3888 3889  mblk_t *
3889 3890  ip_carve_mp(mblk_t **mpp, ssize_t len)
3890 3891  {
3891 3892          mblk_t  *mp0;
3892 3893          mblk_t  *mp1;
3893 3894          mblk_t  *mp2;
3894 3895  
3895 3896          if (!len || !mpp || !(mp0 = *mpp))
3896 3897                  return (NULL);
3897 3898          /* If we aren't going to consume the first mblk, we need a dup. */
3898 3899          if (mp0->b_wptr - mp0->b_rptr > len) {
3899 3900                  mp1 = dupb(mp0);
3900 3901                  if (mp1) {
3901 3902                          /* Partition the data between the two mblks. */
3902 3903                          mp1->b_wptr = mp1->b_rptr + len;
3903 3904                          mp0->b_rptr = mp1->b_wptr;
3904 3905                          /*
3905 3906                           * after adjustments if mblk not consumed is now
3906 3907                           * unaligned, try to align it. If this fails free
3907 3908                           * all messages and let upper layer recover.
3908 3909                           */
3909 3910                          if (!OK_32PTR(mp0->b_rptr)) {
3910 3911                                  if (!pullupmsg(mp0, -1)) {
3911 3912                                          freemsg(mp0);
3912 3913                                          freemsg(mp1);
3913 3914                                          *mpp = NULL;
3914 3915                                          return (NULL);
3915 3916                                  }
3916 3917                          }
3917 3918                  }
3918 3919                  return (mp1);
3919 3920          }
3920 3921          /* Eat through as many mblks as we need to get len bytes. */
3921 3922          len -= mp0->b_wptr - mp0->b_rptr;
3922 3923          for (mp2 = mp1 = mp0; (mp2 = mp2->b_cont) != 0 && len; mp1 = mp2) {
3923 3924                  if (mp2->b_wptr - mp2->b_rptr > len) {
3924 3925                          /*
3925 3926                           * We won't consume the entire last mblk.  Like
3926 3927                           * above, dup and partition it.
3927 3928                           */
3928 3929                          mp1->b_cont = dupb(mp2);
3929 3930                          mp1 = mp1->b_cont;
3930 3931                          if (!mp1) {
3931 3932                                  /*
3932 3933                                   * Trouble.  Rather than go to a lot of
3933 3934                                   * trouble to clean up, we free the messages.
3934 3935                                   * This won't be any worse than losing it on
3935 3936                                   * the wire.
3936 3937                                   */
3937 3938                                  freemsg(mp0);
3938 3939                                  freemsg(mp2);
3939 3940                                  *mpp = NULL;
3940 3941                                  return (NULL);
3941 3942                          }
3942 3943                          mp1->b_wptr = mp1->b_rptr + len;
3943 3944                          mp2->b_rptr = mp1->b_wptr;
3944 3945                          /*
3945 3946                           * after adjustments if mblk not consumed is now
3946 3947                           * unaligned, try to align it. If this fails free
3947 3948                           * all messages and let upper layer recover.
3948 3949                           */
3949 3950                          if (!OK_32PTR(mp2->b_rptr)) {
3950 3951                                  if (!pullupmsg(mp2, -1)) {
3951 3952                                          freemsg(mp0);
3952 3953                                          freemsg(mp2);
3953 3954                                          *mpp = NULL;
3954 3955                                          return (NULL);
3955 3956                                  }
3956 3957                          }
3957 3958                          *mpp = mp2;
3958 3959                          return (mp0);
3959 3960                  }
3960 3961                  /* Decrement len by the amount we just got. */
3961 3962                  len -= mp2->b_wptr - mp2->b_rptr;
3962 3963          }
3963 3964          /*
3964 3965           * len should be reduced to zero now.  If not our caller has
3965 3966           * screwed up.
3966 3967           */
3967 3968          if (len) {
3968 3969                  /* Shouldn't happen! */
3969 3970                  freemsg(mp0);
3970 3971                  *mpp = NULL;
3971 3972                  return (NULL);
3972 3973          }
3973 3974          /*
3974 3975           * We consumed up to exactly the end of an mblk.  Detach the part
3975 3976           * we are returning from the rest of the chain.
3976 3977           */
3977 3978          mp1->b_cont = NULL;
3978 3979          *mpp = mp2;
3979 3980          return (mp0);
3980 3981  }
3981 3982  
3982 3983  /* The ill stream is being unplumbed. Called from ip_close */
3983 3984  int
3984 3985  ip_modclose(ill_t *ill)
3985 3986  {
3986 3987          boolean_t success;
3987 3988          ipsq_t  *ipsq;
3988 3989          ipif_t  *ipif;
3989 3990          queue_t *q = ill->ill_rq;
3990 3991          ip_stack_t      *ipst = ill->ill_ipst;
3991 3992          int     i;
3992 3993          arl_ill_common_t *ai = ill->ill_common;
3993 3994  
3994 3995          /*
3995 3996           * The punlink prior to this may have initiated a capability
3996 3997           * negotiation. But ipsq_enter will block until that finishes or
3997 3998           * times out.
3998 3999           */
3999 4000          success = ipsq_enter(ill, B_FALSE, NEW_OP);
4000 4001  
4001 4002          /*
4002 4003           * Open/close/push/pop is guaranteed to be single threaded
4003 4004           * per stream by STREAMS. FS guarantees that all references
4004 4005           * from top are gone before close is called. So there can't
4005 4006           * be another close thread that has set CONDEMNED on this ill
4006 4007           * and caused ipsq_enter to return failure.
4007 4008           */
4008 4009          ASSERT(success);
4009 4010          ipsq = ill->ill_phyint->phyint_ipsq;
4010 4011  
4011 4012          /*
4012 4013           * Mark it condemned. No new reference will be made to this ill.
4013 4014           * Lookup functions will return an error. Threads that try to
4014 4015           * increment the refcnt must check for ILL_CAN_LOOKUP. This ensures
4015 4016           * that the refcnt will drop down to zero.
4016 4017           */
4017 4018          mutex_enter(&ill->ill_lock);
4018 4019          ill->ill_state_flags |= ILL_CONDEMNED;
4019 4020          for (ipif = ill->ill_ipif; ipif != NULL;
4020 4021              ipif = ipif->ipif_next) {
4021 4022                  ipif->ipif_state_flags |= IPIF_CONDEMNED;
4022 4023          }
4023 4024          /*
4024 4025           * Wake up anybody waiting to enter the ipsq. ipsq_enter
4025 4026           * returns an error if ILL_CONDEMNED is set.
4026 4027           */
4027 4028          cv_broadcast(&ill->ill_cv);
4028 4029          mutex_exit(&ill->ill_lock);
4029 4030  
4030 4031          /*
4031 4032           * Send all the deferred DLPI messages downstream which came in
4032 4033           * during the small window right before ipsq_enter(). We do this
4033 4034           * without waiting for the ACKs because all the ACKs for M_PROTO
4034 4035           * messages are ignored in ip_rput() when ILL_CONDEMNED is set.
4035 4036           */
4036 4037          ill_dlpi_send_deferred(ill);
4037 4038  
4038 4039          /*
4039 4040           * Shut down fragmentation reassembly.
4040 4041           * ill_frag_timer won't start a timer again.
4041 4042           * Now cancel any existing timer
4042 4043           */
4043 4044          (void) untimeout(ill->ill_frag_timer_id);
4044 4045          (void) ill_frag_timeout(ill, 0);
4045 4046  
4046 4047          /*
4047 4048           * Call ill_delete to bring down the ipifs, ilms and ill on
4048 4049           * this ill. Then wait for the refcnts to drop to zero.
4049 4050           * ill_is_freeable checks whether the ill is really quiescent.
4050 4051           * Then make sure that threads that are waiting to enter the
4051 4052           * ipsq have seen the error returned by ipsq_enter and have
4052 4053           * gone away. Then we call ill_delete_tail which does the
4053 4054           * DL_UNBIND_REQ with the driver and then qprocsoff.
4054 4055           */
4055 4056          ill_delete(ill);
4056 4057          mutex_enter(&ill->ill_lock);
4057 4058          while (!ill_is_freeable(ill))
4058 4059                  cv_wait(&ill->ill_cv, &ill->ill_lock);
4059 4060  
4060 4061          while (ill->ill_waiters)
4061 4062                  cv_wait(&ill->ill_cv, &ill->ill_lock);
4062 4063  
4063 4064          mutex_exit(&ill->ill_lock);
4064 4065  
4065 4066          /*
4066 4067           * ill_delete_tail drops reference on ill_ipst, but we need to keep
4067 4068           * it held until the end of the function since the cleanup
4068 4069           * below needs to be able to use the ip_stack_t.
4069 4070           */
4070 4071          netstack_hold(ipst->ips_netstack);
4071 4072  
4072 4073          /* qprocsoff is done via ill_delete_tail */
4073 4074          ill_delete_tail(ill);
4074 4075          /*
4075 4076           * synchronously wait for arp stream to unbind. After this, we
4076 4077           * cannot get any data packets up from the driver.
4077 4078           */
4078 4079          arp_unbind_complete(ill);
4079 4080          ASSERT(ill->ill_ipst == NULL);
4080 4081  
4081 4082          /*
4082 4083           * Walk through all conns and qenable those that have queued data.
4083 4084           * Close synchronization needs this to
4084 4085           * be done to ensure that all upper layers blocked
4085 4086           * due to flow control to the closing device
4086 4087           * get unblocked.
4087 4088           */
4088 4089          ip1dbg(("ip_wsrv: walking\n"));
4089 4090          for (i = 0; i < TX_FANOUT_SIZE; i++) {
4090 4091                  conn_walk_drain(ipst, &ipst->ips_idl_tx_list[i]);
4091 4092          }
4092 4093  
4093 4094          /*
4094 4095           * ai can be null if this is an IPv6 ill, or if the IPv4
4095 4096           * stream is being torn down before ARP was plumbed (e.g.,
4096 4097           * /sbin/ifconfig plumbing a stream twice, and encountering
4097 4098           * an error).
4098 4099           */
4099 4100          if (ai != NULL) {
4100 4101                  ASSERT(!ill->ill_isv6);
4101 4102                  mutex_enter(&ai->ai_lock);
4102 4103                  ai->ai_ill = NULL;
4103 4104                  if (ai->ai_arl == NULL) {
4104 4105                          mutex_destroy(&ai->ai_lock);
4105 4106                          kmem_free(ai, sizeof (*ai));
4106 4107                  } else {
4107 4108                          cv_signal(&ai->ai_ill_unplumb_done);
4108 4109                          mutex_exit(&ai->ai_lock);
4109 4110                  }
4110 4111          }
4111 4112  
4112 4113          mutex_enter(&ipst->ips_ip_mi_lock);
4113 4114          mi_close_unlink(&ipst->ips_ip_g_head, (IDP)ill);
4114 4115          mutex_exit(&ipst->ips_ip_mi_lock);
4115 4116  
4116 4117          /*
4117 4118           * credp could be null if the open didn't succeed and ip_modopen
4118 4119           * itself calls ip_close.
4119 4120           */
4120 4121          if (ill->ill_credp != NULL)
4121 4122                  crfree(ill->ill_credp);
4122 4123  
4123 4124          mutex_destroy(&ill->ill_saved_ire_lock);
4124 4125          mutex_destroy(&ill->ill_lock);
4125 4126          rw_destroy(&ill->ill_mcast_lock);
4126 4127          mutex_destroy(&ill->ill_mcast_serializer);
4127 4128          list_destroy(&ill->ill_nce);
4128 4129  
4129 4130          /*
4130 4131           * Now we are done with the module close pieces that
4131 4132           * need the netstack_t.
4132 4133           */
4133 4134          netstack_rele(ipst->ips_netstack);
4134 4135  
4135 4136          mi_close_free((IDP)ill);
4136 4137          q->q_ptr = WR(q)->q_ptr = NULL;
4137 4138  
4138 4139          ipsq_exit(ipsq);
4139 4140  
4140 4141          return (0);
4141 4142  }
4142 4143  
4143 4144  /*
4144 4145   * This is called as part of close() for IP, UDP, ICMP, and RTS
4145 4146   * in order to quiesce the conn.
4146 4147   */
4147 4148  void
4148 4149  ip_quiesce_conn(conn_t *connp)
4149 4150  {
4150 4151          boolean_t       drain_cleanup_reqd = B_FALSE;
4151 4152          boolean_t       conn_ioctl_cleanup_reqd = B_FALSE;
4152 4153          boolean_t       ilg_cleanup_reqd = B_FALSE;
4153 4154          ip_stack_t      *ipst;
4154 4155  
4155 4156          ASSERT(!IPCL_IS_TCP(connp));
4156 4157          ipst = connp->conn_netstack->netstack_ip;
4157 4158  
4158 4159          /*
4159 4160           * Mark the conn as closing, and this conn must not be
4160 4161           * inserted in future into any list. Eg. conn_drain_insert(),
4161 4162           * won't insert this conn into the conn_drain_list.
4162 4163           *
4163 4164           * conn_idl, and conn_ilg cannot get set henceforth.
4164 4165           */
4165 4166          mutex_enter(&connp->conn_lock);
4166 4167          ASSERT(!(connp->conn_state_flags & CONN_QUIESCED));
4167 4168          connp->conn_state_flags |= CONN_CLOSING;
4168 4169          if (connp->conn_idl != NULL)
4169 4170                  drain_cleanup_reqd = B_TRUE;
4170 4171          if (connp->conn_oper_pending_ill != NULL)
4171 4172                  conn_ioctl_cleanup_reqd = B_TRUE;
4172 4173          if (connp->conn_dhcpinit_ill != NULL) {
4173 4174                  ASSERT(connp->conn_dhcpinit_ill->ill_dhcpinit != 0);
4174 4175                  atomic_dec_32(&connp->conn_dhcpinit_ill->ill_dhcpinit);
4175 4176                  ill_set_inputfn(connp->conn_dhcpinit_ill);
4176 4177                  connp->conn_dhcpinit_ill = NULL;
4177 4178          }
4178 4179          if (connp->conn_ilg != NULL)
4179 4180                  ilg_cleanup_reqd = B_TRUE;
4180 4181          mutex_exit(&connp->conn_lock);
4181 4182  
4182 4183          if (conn_ioctl_cleanup_reqd)
4183 4184                  conn_ioctl_cleanup(connp);
4184 4185  
4185 4186          if (is_system_labeled() && connp->conn_anon_port) {
4186 4187                  (void) tsol_mlp_anon(crgetzone(connp->conn_cred),
4187 4188                      connp->conn_mlp_type, connp->conn_proto,
4188 4189                      ntohs(connp->conn_lport), B_FALSE);
4189 4190                  connp->conn_anon_port = 0;
4190 4191          }
4191 4192          connp->conn_mlp_type = mlptSingle;
4192 4193  
4193 4194          /*
4194 4195           * Remove this conn from any fanout list it is on.
4195 4196           * and then wait for any threads currently operating
4196 4197           * on this endpoint to finish
4197 4198           */
4198 4199          ipcl_hash_remove(connp);
4199 4200  
4200 4201          /*
4201 4202           * Remove this conn from the drain list, and do any other cleanup that
4202 4203           * may be required.  (TCP conns are never flow controlled, and
4203 4204           * conn_idl will be NULL.)
4204 4205           */
4205 4206          if (drain_cleanup_reqd && connp->conn_idl != NULL) {
4206 4207                  idl_t *idl = connp->conn_idl;
4207 4208  
4208 4209                  mutex_enter(&idl->idl_lock);
4209 4210                  conn_drain(connp, B_TRUE);
4210 4211                  mutex_exit(&idl->idl_lock);
4211 4212          }
4212 4213  
4213 4214          if (connp == ipst->ips_ip_g_mrouter)
4214 4215                  (void) ip_mrouter_done(ipst);
4215 4216  
4216 4217          if (ilg_cleanup_reqd)
4217 4218                  ilg_delete_all(connp);
4218 4219  
4219 4220          /*
4220 4221           * Now conn refcnt can increase only thru CONN_INC_REF_LOCKED.
4221 4222           * callers from write side can't be there now because close
4222 4223           * is in progress. The only other caller is ipcl_walk
4223 4224           * which checks for the condemned flag.
4224 4225           */
4225 4226          mutex_enter(&connp->conn_lock);
4226 4227          connp->conn_state_flags |= CONN_CONDEMNED;
4227 4228          while (connp->conn_ref != 1)
4228 4229                  cv_wait(&connp->conn_cv, &connp->conn_lock);
4229 4230          connp->conn_state_flags |= CONN_QUIESCED;
4230 4231          mutex_exit(&connp->conn_lock);
4231 4232  }
4232 4233  
4233 4234  /* ARGSUSED */
4234 4235  int
4235 4236  ip_close(queue_t *q, int flags, cred_t *credp __unused)
4236 4237  {
4237 4238          conn_t          *connp;
4238 4239  
4239 4240          /*
4240 4241           * Call the appropriate delete routine depending on whether this is
4241 4242           * a module or device.
4242 4243           */
4243 4244          if (WR(q)->q_next != NULL) {
4244 4245                  /* This is a module close */
4245 4246                  return (ip_modclose((ill_t *)q->q_ptr));
4246 4247          }
4247 4248  
4248 4249          connp = q->q_ptr;
4249 4250          ip_quiesce_conn(connp);
4250 4251  
4251 4252          qprocsoff(q);
4252 4253  
4253 4254          /*
4254 4255           * Now we are truly single threaded on this stream, and can
4255 4256           * delete the things hanging off the connp, and finally the connp.
4256 4257           * We removed this connp from the fanout list, it cannot be
4257 4258           * accessed thru the fanouts, and we already waited for the
4258 4259           * conn_ref to drop to 1. We are already in close, so
4259 4260           * there cannot be any other thread from the top. qprocsoff
4260 4261           * has completed, and service has completed or won't run in
4261 4262           * future.
4262 4263           */
4263 4264          ASSERT(connp->conn_ref == 1);
4264 4265  
4265 4266          inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
4266 4267  
4267 4268          connp->conn_ref--;
4268 4269          ipcl_conn_destroy(connp);
4269 4270  
4270 4271          q->q_ptr = WR(q)->q_ptr = NULL;
4271 4272          return (0);
4272 4273  }
4273 4274  
4274 4275  /*
4275 4276   * Wrapper around putnext() so that ip_rts_request can merely use
4276 4277   * conn_recv.
4277 4278   */
4278 4279  /*ARGSUSED2*/
4279 4280  static void
4280 4281  ip_conn_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
4281 4282  {
4282 4283          conn_t *connp = (conn_t *)arg1;
4283 4284  
4284 4285          putnext(connp->conn_rq, mp);
4285 4286  }
4286 4287  
4287 4288  /* Dummy in case ICMP error delivery is attempted to a /dev/ip instance */
4288 4289  /* ARGSUSED */
4289 4290  static void
4290 4291  ip_conn_input_icmp(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
4291 4292  {
4292 4293          freemsg(mp);
4293 4294  }
4294 4295  
4295 4296  /*
4296 4297   * Called when the module is about to be unloaded
4297 4298   */
4298 4299  void
4299 4300  ip_ddi_destroy(void)
4300 4301  {
4301 4302          /* This needs to be called before destroying any transports. */
4302 4303          mutex_enter(&cpu_lock);
4303 4304          unregister_cpu_setup_func(ip_tp_cpu_update, NULL);
4304 4305          mutex_exit(&cpu_lock);
4305 4306  
4306 4307          tnet_fini();
4307 4308  
4308 4309          icmp_ddi_g_destroy();
4309 4310          rts_ddi_g_destroy();
4310 4311          udp_ddi_g_destroy();
4311 4312          sctp_ddi_g_destroy();
4312 4313          tcp_ddi_g_destroy();
4313 4314          ilb_ddi_g_destroy();
4314 4315          dce_g_destroy();
4315 4316          ipsec_policy_g_destroy();
4316 4317          ipcl_g_destroy();
4317 4318          ip_net_g_destroy();
4318 4319          ip_ire_g_fini();
4319 4320          inet_minor_destroy(ip_minor_arena_sa);
4320 4321  #if defined(_LP64)
4321 4322          inet_minor_destroy(ip_minor_arena_la);
4322 4323  #endif
4323 4324  
4324 4325  #ifdef DEBUG
4325 4326          list_destroy(&ip_thread_list);
4326 4327          rw_destroy(&ip_thread_rwlock);
4327 4328          tsd_destroy(&ip_thread_data);
4328 4329  #endif
4329 4330  
4330 4331          netstack_unregister(NS_IP);
4331 4332  }
4332 4333  
4333 4334  /*
4334 4335   * First step in cleanup.
4335 4336   */
4336 4337  /* ARGSUSED */
4337 4338  static void
4338 4339  ip_stack_shutdown(netstackid_t stackid, void *arg)
4339 4340  {
4340 4341          ip_stack_t *ipst = (ip_stack_t *)arg;
4341 4342          kt_did_t ktid;
4342 4343  
4343 4344  #ifdef NS_DEBUG
4344 4345          printf("ip_stack_shutdown(%p, stack %d)\n", (void *)ipst, stackid);
4345 4346  #endif
4346 4347  
4347 4348          /*
4348 4349           * Perform cleanup for special interfaces (loopback and IPMP).
4349 4350           */
4350 4351          ip_interface_cleanup(ipst);
4351 4352  
4352 4353          /*
4353 4354           * The *_hook_shutdown()s start the process of notifying any
4354 4355           * consumers that things are going away.... nothing is destroyed.
4355 4356           */
4356 4357          ipv4_hook_shutdown(ipst);
4357 4358          ipv6_hook_shutdown(ipst);
4358 4359          arp_hook_shutdown(ipst);
4359 4360  
4360 4361          mutex_enter(&ipst->ips_capab_taskq_lock);
4361 4362          ktid = ipst->ips_capab_taskq_thread->t_did;
4362 4363          ipst->ips_capab_taskq_quit = B_TRUE;
4363 4364          cv_signal(&ipst->ips_capab_taskq_cv);
4364 4365          mutex_exit(&ipst->ips_capab_taskq_lock);
4365 4366  
4366 4367          /*
4367 4368           * In rare occurrences, particularly on virtual hardware where CPUs can
4368 4369           * be de-scheduled, the thread that we just signaled will not run until
4369 4370           * after we have gotten through parts of ip_stack_fini. If that happens
4370 4371           * then we'll try to grab the ips_capab_taskq_lock as part of returning
4371 4372           * from cv_wait which no longer exists.
4372 4373           */
4373 4374          thread_join(ktid);
4374 4375  }
4375 4376  
4376 4377  /*
4377 4378   * Free the IP stack instance.
            *
            * This is the last of the three callbacks that ip_ddi_init() hands to
            * netstack_register() for NS_IP.  arg is treated as the ip_stack_t of
            * the stack being torn down (presumably the pointer that
            * ip_stack_init() returned -- confirm against the netstack framework).
            *
            * The per-stack resources are released here, finishing with the
            * ip_stack_t itself, so ipst must not be used once this returns.
4378 4379   */
4379 4380  static void
4380 4381  ip_stack_fini(netstackid_t stackid, void *arg)
4381 4382  {
4382 4383          ip_stack_t *ipst = (ip_stack_t *)arg;
4383 4384          int ret;
4384 4385  
4385 4386  #ifdef NS_DEBUG
4386 4387          printf("ip_stack_fini(%p, stack %d)\n", (void *)ipst, stackid);
4387 4388  #endif
4388 4389          /*
4389 4390           * At this point, all of the notifications that the events and
4390 4391           * protocols are going away have been run, meaning that we can
4391 4392           * now set about starting to clean things up.
4392 4393           */
4393 4394          ipobs_fini(ipst);
4394 4395          ipv4_hook_destroy(ipst);
4395 4396          ipv6_hook_destroy(ipst);
4396 4397          arp_hook_destroy(ipst);
4397 4398          ip_net_destroy(ipst);
4398 4399  
4399 4400          ipmp_destroy(ipst);
4400 4401  
                   /*
                    * Tear down the kstats and clear the stale pointers.  The
                    * statistics structures embedded in ipst are zeroed as well.
                    */
4401 4402          ip_kstat_fini(stackid, ipst->ips_ip_mibkp);
4402 4403          ipst->ips_ip_mibkp = NULL;
4403 4404          icmp_kstat_fini(stackid, ipst->ips_icmp_mibkp);
4404 4405          ipst->ips_icmp_mibkp = NULL;
4405 4406          ip_kstat2_fini(stackid, ipst->ips_ip_kstat);
4406 4407          ipst->ips_ip_kstat = NULL;
4407 4408          bzero(&ipst->ips_ip_statistics, sizeof (ipst->ips_ip_statistics));
4408 4409          ip6_kstat_fini(stackid, ipst->ips_ip6_kstat);
4409 4410          ipst->ips_ip6_kstat = NULL;
4410 4411          bzero(&ipst->ips_ip6_statistics, sizeof (ipst->ips_ip6_statistics));
4411 4412  
                   /*
                    * Same size computation as the allocation of this table in
                    * ip_stack_init().
                    */
4412 4413          kmem_free(ipst->ips_propinfo_tbl,
4413 4414              ip_propinfo_count * sizeof (mod_prop_info_t));
4414 4415          ipst->ips_propinfo_tbl = NULL;
4415 4416  
4416 4417          dce_stack_destroy(ipst);
4417 4418          ip_mrouter_stack_destroy(ipst);
4418 4419  
4419 4420          /*
4420 4421           * Quiesce all of our timers. Note we set the quiesce flags before we
4421 4422           * call untimeout. The slowtimers may actually kick off another instance
4422 4423           * of the non-slow timers.
4423 4424           */
4424 4425          mutex_enter(&ipst->ips_igmp_timer_lock);
4425 4426          ipst->ips_igmp_timer_quiesce = B_TRUE;
4426 4427          mutex_exit(&ipst->ips_igmp_timer_lock);
4427 4428  
4428 4429          mutex_enter(&ipst->ips_mld_timer_lock);
4429 4430          ipst->ips_mld_timer_quiesce = B_TRUE;
4430 4431          mutex_exit(&ipst->ips_mld_timer_lock);
4431 4432  
4432 4433          mutex_enter(&ipst->ips_igmp_slowtimeout_lock);
4433 4434          ipst->ips_igmp_slowtimeout_quiesce = B_TRUE;
4434 4435          mutex_exit(&ipst->ips_igmp_slowtimeout_lock);
4435 4436  
4436 4437          mutex_enter(&ipst->ips_mld_slowtimeout_lock);
4437 4438          ipst->ips_mld_slowtimeout_quiesce = B_TRUE;
4438 4439          mutex_exit(&ipst->ips_mld_slowtimeout_lock);
4439 4440  
                   /*
                    * Cancel the four timers.  The ASSERTs encode the expectation
                    * that untimeout() returns -1 only when no timeout id had been
                    * recorded; otherwise the recorded id is cleared.
                    */
4440 4441          ret = untimeout(ipst->ips_igmp_timeout_id);
4441 4442          if (ret == -1) {
4442 4443                  ASSERT(ipst->ips_igmp_timeout_id == 0);
4443 4444          } else {
4444 4445                  ASSERT(ipst->ips_igmp_timeout_id != 0);
4445 4446                  ipst->ips_igmp_timeout_id = 0;
4446 4447          }
4447 4448          ret = untimeout(ipst->ips_igmp_slowtimeout_id);
4448 4449          if (ret == -1) {
4449 4450                  ASSERT(ipst->ips_igmp_slowtimeout_id == 0);
4450 4451          } else {
4451 4452                  ASSERT(ipst->ips_igmp_slowtimeout_id != 0);
4452 4453                  ipst->ips_igmp_slowtimeout_id = 0;
4453 4454          }
4454 4455          ret = untimeout(ipst->ips_mld_timeout_id);
4455 4456          if (ret == -1) {
4456 4457                  ASSERT(ipst->ips_mld_timeout_id == 0);
4457 4458          } else {
4458 4459                  ASSERT(ipst->ips_mld_timeout_id != 0);
4459 4460                  ipst->ips_mld_timeout_id = 0;
4460 4461          }
4461 4462          ret = untimeout(ipst->ips_mld_slowtimeout_id);
4462 4463          if (ret == -1) {
4463 4464                  ASSERT(ipst->ips_mld_slowtimeout_id == 0);
4464 4465          } else {
4465 4466                  ASSERT(ipst->ips_mld_slowtimeout_id != 0);
4466 4467                  ipst->ips_mld_slowtimeout_id = 0;
4467 4468          }
4468 4469  
4469 4470          ip_ire_fini(ipst);
4470 4471          ip6_asp_free(ipst);
4471 4472          conn_drain_fini(ipst);
4472 4473          ipcl_destroy(ipst);
4473 4474  
                   /* Destroy the ndp locks before freeing the memory holding them. */
4474 4475          mutex_destroy(&ipst->ips_ndp4->ndp_g_lock);
4475 4476          mutex_destroy(&ipst->ips_ndp6->ndp_g_lock);
4476 4477          kmem_free(ipst->ips_ndp4, sizeof (ndp_g_t));
4477 4478          ipst->ips_ndp4 = NULL;
4478 4479          kmem_free(ipst->ips_ndp6, sizeof (ndp_g_t));
4479 4480          ipst->ips_ndp6 = NULL;
4480 4481  
4481 4482          if (ipst->ips_loopback_ksp != NULL) {
4482 4483                  kstat_delete_netstack(ipst->ips_loopback_ksp, stackid);
4483 4484                  ipst->ips_loopback_ksp = NULL;
4484 4485          }
4485 4486  
                   /*
                    * ip_stack_shutdown() joined the thread that waits on this
                    * lock/cv pair, which is what makes destroying them safe here.
                    */
4486 4487          mutex_destroy(&ipst->ips_capab_taskq_lock);
4487 4488          cv_destroy(&ipst->ips_capab_taskq_cv);
4488 4489  
4489 4490          rw_destroy(&ipst->ips_srcid_lock);
4490 4491  
4491 4492          mutex_destroy(&ipst->ips_ip_mi_lock);
4492 4493          rw_destroy(&ipst->ips_ill_g_usesrc_lock);
4493 4494  
4494 4495          mutex_destroy(&ipst->ips_igmp_timer_lock);
4495 4496          mutex_destroy(&ipst->ips_mld_timer_lock);
4496 4497          mutex_destroy(&ipst->ips_igmp_slowtimeout_lock);
4497 4498          mutex_destroy(&ipst->ips_mld_slowtimeout_lock);
4498 4499          mutex_destroy(&ipst->ips_ip_addr_avail_lock);
4499 4500          rw_destroy(&ipst->ips_ill_g_lock);
4500 4501  
4501 4502          kmem_free(ipst->ips_phyint_g_list, sizeof (phyint_list_t));
4502 4503          ipst->ips_phyint_g_list = NULL;
4503 4504          kmem_free(ipst->ips_ill_g_heads, sizeof (ill_g_head_t) * MAX_G_HEADS);
4504 4505          ipst->ips_ill_g_heads = NULL;
4505 4506  
                   /*
                    * NOTE(review): ip_stack_init() discards the return value of
                    * ldi_ident_from_major(), so ips_ldi_ident may never have been
                    * set -- confirm ldi_ident_release() tolerates that.
                    */
4506 4507          ldi_ident_release(ipst->ips_ldi_ident);
                   /* The stack structure itself goes last. */
4507 4508          kmem_free(ipst, sizeof (*ipst));
4508 4509  }
4509 4510  
4510 4511  /*
4511 4512   * This function is called from the TSD destructor, and is used to debug
4512 4513   * reference count issues in IP. See block comment in <inet/ip_if.h> for
4513 4514   * details.
4514 4515   */
4515 4516  static void
4516 4517  ip_thread_exit(void *phash)
4517 4518  {
4518 4519          th_hash_t *thh = phash;
4519 4520  
4520 4521          rw_enter(&ip_thread_rwlock, RW_WRITER);
4521 4522          list_remove(&ip_thread_list, thh);
4522 4523          rw_exit(&ip_thread_rwlock);
4523 4524          mod_hash_destroy_hash(thh->thh_hash);
4524 4525          kmem_free(thh, sizeof (*thh));
4525 4526  }
4526 4527  
4527 4528  /*
4528 4529   * Called when the IP kernel module is loaded into the kernel
            *
            * Creates the minor number arena(s), calls the global (*_g_init)
            * initializers, registers the per-stack init/shutdown/fini callbacks
            * with the netstack framework, and finally registers
            * ip_tp_cpu_update() as a CPU setup callback.
4529 4530   */
4530 4531  void
4531 4532  ip_ddi_init(void)
4532 4533  {
4533 4534          ip_squeue_flag = ip_squeue_switch(ip_squeue_enter);
4534 4535  
4535 4536          /*
4536 4537           * For IP and TCP the minor numbers should start from 2 since we have 4
4537 4538           * initial devices: ip, ip6, tcp, tcp6.
4538 4539           */
4539 4540          /*
4540 4541           * If this is a 64-bit kernel, then create two separate arenas -
4541 4542           * one for TLIs in the range of INET_MIN_DEV+2 through 2^^18-1, and the
4542 4543           * other for socket apps in the range 2^^18 through 2^^32-1.
4543 4544           */
                   /*
                    * NOTE(review): the calls below pass INET_MIN_DEV + 2 .. MAXMIN32
                    * for ip_minor_arena_sa and MAXMIN32 + 1 .. MAXMIN64 for
                    * ip_minor_arena_la; confirm these match the ranges quoted above.
                    * A NULL return is treated as fatal even though KM_SLEEP is
                    * passed.
                    */
4544 4545          ip_minor_arena_la = NULL;
4545 4546          ip_minor_arena_sa = NULL;
4546 4547  #if defined(_LP64)
4547 4548          if ((ip_minor_arena_sa = inet_minor_create("ip_minor_arena_sa",
4548 4549              INET_MIN_DEV + 2, MAXMIN32, KM_SLEEP)) == NULL) {
4549 4550                  cmn_err(CE_PANIC,
4550 4551                      "ip_ddi_init: ip_minor_arena_sa creation failed\n");
4551 4552          }
4552 4553          if ((ip_minor_arena_la = inet_minor_create("ip_minor_arena_la",
4553 4554              MAXMIN32 + 1, MAXMIN64, KM_SLEEP)) == NULL) {
4554 4555                  cmn_err(CE_PANIC,
4555 4556                      "ip_ddi_init: ip_minor_arena_la creation failed\n");
4556 4557          }
4557 4558  #else
                   /* Without _LP64 only one arena exists; ip_minor_arena_la stays NULL. */
4558 4559          if ((ip_minor_arena_sa = inet_minor_create("ip_minor_arena_sa",
4559 4560              INET_MIN_DEV + 2, MAXMIN, KM_SLEEP)) == NULL) {
4560 4561                  cmn_err(CE_PANIC,
4561 4562                      "ip_ddi_init: ip_minor_arena_sa creation failed\n");
4562 4563          }
4563 4564  #endif
4564 4565          ip_poll_normal_ticks = MSEC_TO_TICK_ROUNDUP(ip_poll_normal_ms);
4565 4566  
4566 4567          ipcl_g_init();
4567 4568          ip_ire_g_init();
4568 4569          ip_net_g_init();
4569 4570  
4570 4571  #ifdef DEBUG
                   /*
                    * State for the DEBUG-only per-thread tracking; ip_thread_exit()
                    * is installed as the TSD destructor.
                    */
4571 4572          tsd_create(&ip_thread_data, ip_thread_exit);
4572 4573          rw_init(&ip_thread_rwlock, NULL, RW_DEFAULT, NULL);
4573 4574          list_create(&ip_thread_list, sizeof (th_hash_t),
4574 4575              offsetof(th_hash_t, thh_link));
4575 4576  #endif
4576 4577          ipsec_policy_g_init();
4577 4578          tcp_ddi_g_init();
4578 4579          sctp_ddi_g_init();
4579 4580          dce_g_init();
4580 4581  
4581 4582          /*
4582 4583           * We want to be informed each time a stack is created or
4583 4584           * destroyed in the kernel, so we can maintain the
4584 4585           * set of ip_stack_t's.
4585 4586           */
4586 4587          netstack_register(NS_IP, ip_stack_init, ip_stack_shutdown,
4587 4588              ip_stack_fini);
4588 4589  
4589 4590          tnet_init();
4590 4591  
4591 4592          udp_ddi_g_init();
4592 4593          rts_ddi_g_init();
4593 4594          icmp_ddi_g_init();
4594 4595          ilb_ddi_g_init();
4595 4596  
4596 4597          /* This needs to be called after all transports are initialized. */
4597 4598          mutex_enter(&cpu_lock);
4598 4599          register_cpu_setup_func(ip_tp_cpu_update, NULL);
4599 4600          mutex_exit(&cpu_lock);
4600 4601  }
4601 4602  
4602 4603  /*
4603 4604   * Initialize the IP stack instance.
            *
            * Passed to netstack_register() by ip_ddi_init() as the first callback.
            * Allocates a zero-filled ip_stack_t for the given netstack, sets up
            * its locks, tables, kstats and hooks, starts the capability taskq
            * dispatcher thread, and returns the new ip_stack_t.  All allocations
            * use KM_SLEEP and no failure path exists in this function.
4604 4605   */
4605 4606  static void *
4606 4607  ip_stack_init(netstackid_t stackid, netstack_t *ns)
4607 4608  {
4608 4609          ip_stack_t      *ipst;
4609 4610          size_t          arrsz;
4610 4611          major_t         major;
4611 4612  
4612 4613  #ifdef NS_DEBUG
4613 4614          printf("ip_stack_init(stack %d)\n", stackid);
4614 4615  #endif
4615 4616  
                   /* kmem_zalloc: every field not set below starts out zeroed. */
4616 4617          ipst = (ip_stack_t *)kmem_zalloc(sizeof (*ipst), KM_SLEEP);
4617 4618          ipst->ips_netstack = ns;
4618 4619  
4619 4620          ipst->ips_ill_g_heads = kmem_zalloc(sizeof (ill_g_head_t) * MAX_G_HEADS,
4620 4621              KM_SLEEP);
4621 4622          ipst->ips_phyint_g_list = kmem_zalloc(sizeof (phyint_list_t),
4622 4623              KM_SLEEP);
4623 4624          ipst->ips_ndp4 = kmem_zalloc(sizeof (ndp_g_t), KM_SLEEP);
4624 4625          ipst->ips_ndp6 = kmem_zalloc(sizeof (ndp_g_t), KM_SLEEP);
4625 4626          mutex_init(&ipst->ips_ndp4->ndp_g_lock, NULL, MUTEX_DEFAULT, NULL);
4626 4627          mutex_init(&ipst->ips_ndp6->ndp_g_lock, NULL, MUTEX_DEFAULT, NULL);
4627 4628  
4628 4629          mutex_init(&ipst->ips_igmp_timer_lock, NULL, MUTEX_DEFAULT, NULL);
4629 4630          ipst->ips_igmp_deferred_next = INFINITY;
4630 4631          mutex_init(&ipst->ips_mld_timer_lock, NULL, MUTEX_DEFAULT, NULL);
4631 4632          ipst->ips_mld_deferred_next = INFINITY;
4632 4633          mutex_init(&ipst->ips_igmp_slowtimeout_lock, NULL, MUTEX_DEFAULT, NULL);
4633 4634          mutex_init(&ipst->ips_mld_slowtimeout_lock, NULL, MUTEX_DEFAULT, NULL);
4634 4635          mutex_init(&ipst->ips_ip_mi_lock, NULL, MUTEX_DEFAULT, NULL);
4635 4636          mutex_init(&ipst->ips_ip_addr_avail_lock, NULL, MUTEX_DEFAULT, NULL);
4636 4637          rw_init(&ipst->ips_ill_g_lock, NULL, RW_DEFAULT, NULL);
4637 4638          rw_init(&ipst->ips_ill_g_usesrc_lock, NULL, RW_DEFAULT, NULL);
4638 4639  
4639 4640          ipcl_init(ipst);
4640 4641          ip_ire_init(ipst);
4641 4642          ip6_asp_init(ipst);
4642 4643          ipif_init(ipst);
4643 4644          conn_drain_init(ipst);
4644 4645          ip_mrouter_stack_init(ipst);
4645 4646          dce_stack_init(ipst);
4646 4647  
4647 4648          ipst->ips_ip_multirt_log_interval = 1000;
4648 4649  
4649 4650          ipst->ips_ill_index = 1;
4650 4651  
4651 4652          ipst->ips_saved_ip_forwarding = -1;
4652 4653          ipst->ips_reg_vif_num = ALL_VIFS;       /* Index to Register vif */
4653 4654  
                   /*
                    * Each stack gets its own copy of the global ip_propinfo_tbl;
                    * ip_stack_fini() frees it with the same size computation.
                    */
4654 4655          arrsz = ip_propinfo_count * sizeof (mod_prop_info_t);
4655 4656          ipst->ips_propinfo_tbl = (mod_prop_info_t *)kmem_alloc(arrsz, KM_SLEEP);
4656 4657          bcopy(ip_propinfo_tbl, ipst->ips_propinfo_tbl, arrsz);
4657 4658  
4658 4659          ipst->ips_ip_mibkp = ip_kstat_init(stackid, ipst);
4659 4660          ipst->ips_icmp_mibkp = icmp_kstat_init(stackid);
4660 4661          ipst->ips_ip_kstat = ip_kstat2_init(stackid, &ipst->ips_ip_statistics);
4661 4662          ipst->ips_ip6_kstat =
4662 4663              ip6_kstat_init(stackid, &ipst->ips_ip6_statistics);
4663 4664  
4664 4665          ipst->ips_ip_src_id = 1;
4665 4666          rw_init(&ipst->ips_srcid_lock, NULL, RW_DEFAULT, NULL);
4666 4667  
4667 4668          ipst->ips_src_generation = SRC_GENERATION_INITIAL;
4668 4669  
4669 4670          ip_net_init(ipst, ns);
4670 4671          ipv4_hook_init(ipst);
4671 4672          ipv6_hook_init(ipst);
4672 4673          arp_hook_init(ipst);
4673 4674          ipmp_init(ipst);
4674 4675          ipobs_init(ipst);
4675 4676  
4676 4677          /*
4677 4678           * Create the taskq dispatcher thread and initialize related stuff.
                    * The lock and cv are initialized before the thread is created;
                    * ip_stack_shutdown() later signals the cv and joins the thread.
4678 4679           */
4679 4680          mutex_init(&ipst->ips_capab_taskq_lock, NULL, MUTEX_DEFAULT, NULL);
4680 4681          cv_init(&ipst->ips_capab_taskq_cv, NULL, CV_DEFAULT, NULL);
4681 4682          ipst->ips_capab_taskq_thread = thread_create(NULL, 0,
4682 4683              ill_taskq_dispatch, ipst, 0, &p0, TS_RUN, minclsyspri);
4683 4684  
                   /*
                    * NOTE(review): the result of ldi_ident_from_major() is discarded,
                    * so a failure here goes unnoticed -- confirm that is intended.
                    */
4684 4685          major = mod_name_to_major(INET_NAME);
4685 4686          (void) ldi_ident_from_major(major, &ipst->ips_ldi_ident);
4686 4687          return (ipst);
4687 4688  }
4688 4689  
4689 4690  /*
4690 4691   * Allocate and initialize a DLPI template of the specified length.  (May be
4691 4692   * called as writer.)
4692 4693   */
4693 4694  mblk_t *
4694 4695  ip_dlpi_alloc(size_t len, t_uscalar_t prim)
4695 4696  {
4696 4697          mblk_t  *mp;
4697 4698  
4698 4699          mp = allocb(len, BPRI_MED);
4699 4700          if (!mp)
4700 4701                  return (NULL);
4701 4702  
4702 4703          /*
4703 4704           * DLPIv2 says that DL_INFO_REQ and DL_TOKEN_REQ (the latter
4704 4705           * of which we don't seem to use) are sent with M_PCPROTO, and
4705 4706           * that other DLPI are M_PROTO.
4706 4707           */
4707 4708          if (prim == DL_INFO_REQ) {
4708 4709                  mp->b_datap->db_type = M_PCPROTO;
4709 4710          } else {
4710 4711                  mp->b_datap->db_type = M_PROTO;
4711 4712          }
4712 4713  
4713 4714          mp->b_wptr = mp->b_rptr + len;
4714 4715          bzero(mp->b_rptr, len);
4715 4716          ((dl_unitdata_req_t *)mp->b_rptr)->dl_primitive = prim;
4716 4717          return (mp);
4717 4718  }
4718 4719  
4719 4720  /*
4720 4721   * Allocate and initialize a DLPI notification.  (May be called as writer.)
4721 4722   */
4722 4723  mblk_t *
4723 4724  ip_dlnotify_alloc(uint_t notification, uint_t data)
4724 4725  {
4725 4726          dl_notify_ind_t *notifyp;
4726 4727          mblk_t          *mp;
4727 4728  
4728 4729          if ((mp = ip_dlpi_alloc(DL_NOTIFY_IND_SIZE, DL_NOTIFY_IND)) == NULL)
4729 4730                  return (NULL);
4730 4731  
4731 4732          notifyp = (dl_notify_ind_t *)mp->b_rptr;
4732 4733          notifyp->dl_notification = notification;
4733 4734          notifyp->dl_data = data;
4734 4735          return (mp);
4735 4736  }
4736 4737  
4737 4738  mblk_t *
4738 4739  ip_dlnotify_alloc2(uint_t notification, uint_t data1, uint_t data2)
4739 4740  {
4740 4741          dl_notify_ind_t *notifyp;
4741 4742          mblk_t          *mp;
4742 4743  
4743 4744          if ((mp = ip_dlpi_alloc(DL_NOTIFY_IND_SIZE, DL_NOTIFY_IND)) == NULL)
4744 4745                  return (NULL);
4745 4746  
4746 4747          notifyp = (dl_notify_ind_t *)mp->b_rptr;
4747 4748          notifyp->dl_notification = notification;
4748 4749          notifyp->dl_data1 = data1;
4749 4750          notifyp->dl_data2 = data2;
4750 4751          return (mp);
4751 4752  }
4752 4753  
4753 4754  /*
4754 4755   * Debug formatting routine.  Returns a character string representation of the
4755 4756   * addr in buf, of the form xxx.xxx.xxx.xxx.  This routine takes the address
4756 4757   * in the form of a ipaddr_t and calls ip_dot_saddr with a pointer.
4757 4758   *
4758 4759   * Once the ndd table-printing interfaces are removed, this can be changed to
4759 4760   * standard dotted-decimal form.
4760 4761   */
4761 4762  char *
4762 4763  ip_dot_addr(ipaddr_t addr, char *buf)
4763 4764  {
4764 4765          uint8_t *ap = (uint8_t *)&addr;
4765 4766  
4766 4767          (void) mi_sprintf(buf, "%03d.%03d.%03d.%03d",
4767 4768              ap[0] & 0xFF, ap[1] & 0xFF, ap[2] & 0xFF, ap[3] & 0xFF);
4768 4769          return (buf);
4769 4770  }
4770 4771  
/*
 * Render the `alen'-byte MAC address at `addr' into `buf' (capacity
 * `buflen') in the usual colon-separated form, two lowercase hex digits per
 * byte.
 *
 * If the whole address does not fit, as many leading bytes as possible are
 * printed and the string ends in "...".  When there is nothing to print
 * (alen == 0) or the buffer is smaller than 4 bytes, `buf' is left untouched
 * and the constant string "?" is returned; otherwise `buf' is returned.
 */
const char *
mac_colon_addr(const uint8_t *addr, size_t alen, char *buf, size_t buflen)
{
	char *out = buf;

	if (alen == 0 || buflen < 4)
		return ("?");

	while (alen != 0) {
		/*
		 * Printing a byte that is not the last one only makes sense
		 * if what follows it still fits: with exactly two bytes left
		 * we need "xx:xx" plus the NUL (6); with more than two we
		 * need "xx:" followed by at least "..." plus the NUL (7).
		 * If that room is missing, truncate here instead.  The last
		 * byte needs no check: the room for it ("xx" plus NUL) was
		 * established either by the entry test or by the previous
		 * pass through this check.
		 */
		if (alen >= 2) {
			size_t need = (alen == 2) ? 6 : 7;

			if (buflen < need) {
				(void) strcpy(out, "...");
				break;
			}
		}

		(void) sprintf(out, "%02x", *addr);
		addr++;
		out += 2;
		alen--;

		if (alen != 0) {
			/* More bytes follow; account for "xx:". */
			*out = ':';
			out++;
			buflen -= 3;
		}
	}
	return (buf);
}
4810 4811  
4811 4812  /*
4812 4813   * Called when it is conceptually a ULP that would sent the packet
4813 4814   * e.g., port unreachable and protocol unreachable. Check that the packet
4814 4815   * would have passed the IPsec global policy before sending the error.
4815 4816   *
4816 4817   * Send an ICMP error after patching up the packet appropriately.
4817 4818   * Uses ip_drop_input and bumps the appropriate MIB.
4818 4819   */
4819 4820  void
4820 4821  ip_fanout_send_icmp_v4(mblk_t *mp, uint_t icmp_type, uint_t icmp_code,
4821 4822      ip_recv_attr_t *ira)
4822 4823  {
4823 4824          ipha_t          *ipha;
4824 4825          boolean_t       secure;
4825 4826          ill_t           *ill = ira->ira_ill;
4826 4827          ip_stack_t      *ipst = ill->ill_ipst;
4827 4828          netstack_t      *ns = ipst->ips_netstack;
4828 4829          ipsec_stack_t   *ipss = ns->netstack_ipsec;
4829 4830  
4830 4831          secure = ira->ira_flags & IRAF_IPSEC_SECURE;
4831 4832  
4832 4833          /*
4833 4834           * We are generating an icmp error for some inbound packet.
4834 4835           * Called from all ip_fanout_(udp, tcp, proto) functions.
4835 4836           * Before we generate an error, check with global policy
4836 4837           * to see whether this is allowed to enter the system. As
4837 4838           * there is no "conn", we are checking with global policy.
4838 4839           */
4839 4840          ipha = (ipha_t *)mp->b_rptr;
4840 4841          if (secure || ipss->ipsec_inbound_v4_policy_present) {
4841 4842                  mp = ipsec_check_global_policy(mp, NULL, ipha, NULL, ira, ns);
4842 4843                  if (mp == NULL)
4843 4844                          return;
4844 4845          }
4845 4846  
4846 4847          /* We never send errors for protocols that we do implement */
4847 4848          if (ira->ira_protocol == IPPROTO_ICMP ||
4848 4849              ira->ira_protocol == IPPROTO_IGMP) {
4849 4850                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
4850 4851                  ip_drop_input("ip_fanout_send_icmp_v4", mp, ill);
4851 4852                  freemsg(mp);
4852 4853                  return;
4853 4854          }
4854 4855          /*
4855 4856           * Have to correct checksum since
4856 4857           * the packet might have been
4857 4858           * fragmented and the reassembly code in ip_rput
4858 4859           * does not restore the IP checksum.
4859 4860           */
4860 4861          ipha->ipha_hdr_checksum = 0;
4861 4862          ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
4862 4863  
4863 4864          switch (icmp_type) {
4864 4865          case ICMP_DEST_UNREACHABLE:
4865 4866                  switch (icmp_code) {
4866 4867                  case ICMP_PROTOCOL_UNREACHABLE:
4867 4868                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInUnknownProtos);
4868 4869                          ip_drop_input("ipIfStatsInUnknownProtos", mp, ill);
4869 4870                          break;
4870 4871                  case ICMP_PORT_UNREACHABLE:
4871 4872                          BUMP_MIB(ill->ill_ip_mib, udpIfStatsNoPorts);
4872 4873                          ip_drop_input("ipIfStatsNoPorts", mp, ill);
4873 4874                          break;
4874 4875                  }
4875 4876  
4876 4877                  icmp_unreachable(mp, icmp_code, ira);
4877 4878                  break;
4878 4879          default:
4879 4880  #ifdef DEBUG
4880 4881                  panic("ip_fanout_send_icmp_v4: wrong type");
4881 4882                  /*NOTREACHED*/
4882 4883  #else
4883 4884                  freemsg(mp);
4884 4885                  break;
4885 4886  #endif
4886 4887          }
4887 4888  }
4888 4889  
4889 4890  /*
4890 4891   * Used to send an ICMP error message when a packet is received for
4891 4892   * a protocol that is not supported. The mblk passed as argument
4892 4893   * is consumed by this function.
4893 4894   */
4894 4895  void
4895 4896  ip_proto_not_sup(mblk_t *mp, ip_recv_attr_t *ira)
4896 4897  {
4897 4898          ipha_t          *ipha;
4898 4899  
4899 4900          ipha = (ipha_t *)mp->b_rptr;
4900 4901          if (ira->ira_flags & IRAF_IS_IPV4) {
4901 4902                  ASSERT(IPH_HDR_VERSION(ipha) == IP_VERSION);
4902 4903                  ip_fanout_send_icmp_v4(mp, ICMP_DEST_UNREACHABLE,
4903 4904                      ICMP_PROTOCOL_UNREACHABLE, ira);
4904 4905          } else {
4905 4906                  ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION);
4906 4907                  ip_fanout_send_icmp_v6(mp, ICMP6_PARAM_PROB,
4907 4908                      ICMP6_PARAMPROB_NEXTHEADER, ira);
4908 4909          }
4909 4910  }
4910 4911  
4911 4912  /*
4912 4913   * Deliver a rawip packet to the given conn, possibly applying ipsec policy.
4913 4914   * Handles IPv4 and IPv6.
4914 4915   * We are responsible for disposing of mp, such as by freemsg() or putnext()
4915 4916   * Caller is responsible for dropping references to the conn.
4916 4917   */
4917 4918  void
4918 4919  ip_fanout_proto_conn(conn_t *connp, mblk_t *mp, ipha_t *ipha, ip6_t *ip6h,
4919 4920      ip_recv_attr_t *ira)
4920 4921  {
4921 4922          ill_t           *ill = ira->ira_ill;
4922 4923          ip_stack_t      *ipst = ill->ill_ipst;
4923 4924          ipsec_stack_t   *ipss = ipst->ips_netstack->netstack_ipsec;
4924 4925          boolean_t       secure;
4925 4926          uint_t          protocol = ira->ira_protocol;
4926 4927          iaflags_t       iraflags = ira->ira_flags;
4927 4928          queue_t         *rq;
4928 4929  
4929 4930          secure = iraflags & IRAF_IPSEC_SECURE;
4930 4931  
4931 4932          rq = connp->conn_rq;
4932 4933          if (IPCL_IS_NONSTR(connp) ? connp->conn_flow_cntrld : !canputnext(rq)) {
4933 4934                  switch (protocol) {
4934 4935                  case IPPROTO_ICMPV6:
4935 4936                          BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInOverflows);
4936 4937                          break;
4937 4938                  case IPPROTO_ICMP:
4938 4939                          BUMP_MIB(&ipst->ips_icmp_mib, icmpInOverflows);
4939 4940                          break;
4940 4941                  default:
4941 4942                          BUMP_MIB(ill->ill_ip_mib, rawipIfStatsInOverflows);
4942 4943                          break;
4943 4944                  }
4944 4945                  freemsg(mp);
4945 4946                  return;
4946 4947          }
4947 4948  
4948 4949          ASSERT(!(IPCL_IS_IPTUN(connp)));
4949 4950  
4950 4951          if (((iraflags & IRAF_IS_IPV4) ?
4951 4952              CONN_INBOUND_POLICY_PRESENT(connp, ipss) :
4952 4953              CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss)) ||
4953 4954              secure) {
4954 4955                  mp = ipsec_check_inbound_policy(mp, connp, ipha,
4955 4956                      ip6h, ira);
4956 4957                  if (mp == NULL) {
4957 4958                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
4958 4959                          /* Note that mp is NULL */
4959 4960                          ip_drop_input("ipIfStatsInDiscards", mp, ill);
4960 4961                          return;
4961 4962                  }
4962 4963          }
4963 4964  
4964 4965          if (iraflags & IRAF_ICMP_ERROR) {
4965 4966                  (connp->conn_recvicmp)(connp, mp, NULL, ira);
4966 4967          } else {
4967 4968                  ill_t *rill = ira->ira_rill;
4968 4969  
4969 4970                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
4970 4971                  ira->ira_ill = ira->ira_rill = NULL;
4971 4972                  /* Send it upstream */
4972 4973                  (connp->conn_recv)(connp, mp, NULL, ira);
4973 4974                  ira->ira_ill = ill;
4974 4975                  ira->ira_rill = rill;
4975 4976          }
4976 4977  }
4977 4978  
4978 4979  /*
4979 4980   * Handle protocols with which IP is less intimate.  There
4980 4981   * can be more than one stream bound to a particular
4981 4982   * protocol.  When this is the case, normally each one gets a copy
4982 4983   * of any incoming packets.
4983 4984   *
4984 4985   * IPsec NOTE :
4985 4986   *
4986 4987   * Don't allow a secure packet going up a non-secure connection.
4987 4988   * We don't allow this because
4988 4989   *
4989 4990   * 1) Reply might go out in clear which will be dropped at
4990 4991   *    the sending side.
4991 4992   * 2) If the reply goes out in clear it will give the
4992 4993   *    adversary enough information for getting the key in
4993 4994   *    most of the cases.
4994 4995   *
4995 4996   * Moreover getting a secure packet when we expect clear
4996 4997   * implies that SA's were added without checking for
4997 4998   * policy on both ends. This should not happen once ISAKMP
4998 4999   * is used to negotiate SAs as SAs will be added only after
4999 5000   * verifying the policy.
5000 5001   *
5001 5002   * Zones notes:
5002 5003   * Earlier in ip_input on a system with multiple shared-IP zones we
5003 5004   * duplicate the multicast and broadcast packets and send them up
5004 5005   * with each explicit zoneid that exists on that ill.
5005 5006   * This means that here we can match the zoneid with SO_ALLZONES being special.
5006 5007   */
5007 5008  void
5008 5009  ip_fanout_proto_v4(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
5009 5010  {
5010 5011          mblk_t          *mp1;
5011 5012          ipaddr_t        laddr;
5012 5013          conn_t          *connp, *first_connp, *next_connp;
5013 5014          connf_t         *connfp;
5014 5015          ill_t           *ill = ira->ira_ill;
5015 5016          ip_stack_t      *ipst = ill->ill_ipst;
5016 5017  
5017 5018          laddr = ipha->ipha_dst;
5018 5019  
5019 5020          connfp = &ipst->ips_ipcl_proto_fanout_v4[ira->ira_protocol];
5020 5021          mutex_enter(&connfp->connf_lock);
5021 5022          connp = connfp->connf_head;
5022 5023          for (connp = connfp->connf_head; connp != NULL;
5023 5024              connp = connp->conn_next) {
5024 5025                  /* Note: IPCL_PROTO_MATCH includes conn_wantpacket */
5025 5026                  if (IPCL_PROTO_MATCH(connp, ira, ipha) &&
5026 5027                      (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
5027 5028                      tsol_receive_local(mp, &laddr, IPV4_VERSION, ira, connp))) {
5028 5029                          break;
5029 5030                  }
5030 5031          }
5031 5032  
5032 5033          if (connp == NULL) {
5033 5034                  /*
5034 5035                   * No one bound to these addresses.  Is
5035 5036                   * there a client that wants all
5036 5037                   * unclaimed datagrams?
5037 5038                   */
5038 5039                  mutex_exit(&connfp->connf_lock);
5039 5040                  ip_fanout_send_icmp_v4(mp, ICMP_DEST_UNREACHABLE,
5040 5041                      ICMP_PROTOCOL_UNREACHABLE, ira);
5041 5042                  return;
5042 5043          }
5043 5044  
5044 5045          ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_rq != NULL);
5045 5046  
5046 5047          CONN_INC_REF(connp);
5047 5048          first_connp = connp;
5048 5049          connp = connp->conn_next;
5049 5050  
5050 5051          for (;;) {
5051 5052                  while (connp != NULL) {
5052 5053                          /* Note: IPCL_PROTO_MATCH includes conn_wantpacket */
5053 5054                          if (IPCL_PROTO_MATCH(connp, ira, ipha) &&
5054 5055                              (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
5055 5056                              tsol_receive_local(mp, &laddr, IPV4_VERSION,
5056 5057                              ira, connp)))
5057 5058                                  break;
5058 5059                          connp = connp->conn_next;
5059 5060                  }
5060 5061  
5061 5062                  if (connp == NULL) {
5062 5063                          /* No more interested clients */
5063 5064                          connp = first_connp;
5064 5065                          break;
5065 5066                  }
5066 5067                  if (((mp1 = dupmsg(mp)) == NULL) &&
5067 5068                      ((mp1 = copymsg(mp)) == NULL)) {
5068 5069                          /* Memory allocation failed */
5069 5070                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
5070 5071                          ip_drop_input("ipIfStatsInDiscards", mp, ill);
5071 5072                          connp = first_connp;
5072 5073                          break;
5073 5074                  }
5074 5075  
5075 5076                  CONN_INC_REF(connp);
5076 5077                  mutex_exit(&connfp->connf_lock);
5077 5078  
5078 5079                  ip_fanout_proto_conn(connp, mp1, (ipha_t *)mp1->b_rptr, NULL,
5079 5080                      ira);
5080 5081  
5081 5082                  mutex_enter(&connfp->connf_lock);
5082 5083                  /* Follow the next pointer before releasing the conn. */
5083 5084                  next_connp = connp->conn_next;
5084 5085                  CONN_DEC_REF(connp);
5085 5086                  connp = next_connp;
5086 5087          }
5087 5088  
5088 5089          /* Last one.  Send it upstream. */
5089 5090          mutex_exit(&connfp->connf_lock);
5090 5091  
5091 5092          ip_fanout_proto_conn(connp, mp, ipha, NULL, ira);
5092 5093  
5093 5094          CONN_DEC_REF(connp);
5094 5095  }
5095 5096  
5096 5097  /*
5097 5098   * If we have a IPsec NAT-Traversal packet, strip the zero-SPI or
5098 5099   * pass it along to ESP if the SPI is non-zero.  Returns the mblk if the mblk
5099 5100   * is not consumed.
5100 5101   *
5101 5102   * One of three things can happen, all of which affect the passed-in mblk:
5102 5103   *
5103 5104   * 1.) The packet is stock UDP and gets its zero-SPI stripped.  Return mblk..
5104 5105   *
5105 5106   * 2.) The packet is ESP-in-UDP, gets transformed into an equivalent
5106 5107   *     ESP packet, and is passed along to ESP for consumption.  Return NULL.
5107 5108   *
5108 5109   * 3.) The packet is an ESP-in-UDP Keepalive.  Drop it and return NULL.
5109 5110   */
5110 5111  mblk_t *
5111 5112  zero_spi_check(mblk_t *mp, ip_recv_attr_t *ira)
5112 5113  {
5113 5114          int shift, plen, iph_len;
5114 5115          ipha_t *ipha;
5115 5116          udpha_t *udpha;
5116 5117          uint32_t *spi;
5117 5118          uint32_t esp_ports;
5118 5119          uint8_t *orptr;
5119 5120          ip_stack_t      *ipst = ira->ira_ill->ill_ipst;
5120 5121          ipsec_stack_t   *ipss = ipst->ips_netstack->netstack_ipsec;
5121 5122  
5122 5123          ipha = (ipha_t *)mp->b_rptr;
5123 5124          iph_len = ira->ira_ip_hdr_length;
5124 5125          plen = ira->ira_pktlen;
5125 5126  
5126 5127          if (plen - iph_len - sizeof (udpha_t) < sizeof (uint32_t)) {
5127 5128                  /*
5128 5129                   * Most likely a keepalive for the benefit of an intervening
5129 5130                   * NAT.  These aren't for us, per se, so drop it.
5130 5131                   *
5131 5132                   * RFC 3947/8 doesn't say for sure what to do for 2-3
5132 5133                   * byte packets (keepalives are 1-byte), but we'll drop them
5133 5134                   * also.
5134 5135                   */
5135 5136                  ip_drop_packet(mp, B_TRUE, ira->ira_ill,
5136 5137                      DROPPER(ipss, ipds_esp_nat_t_ka), &ipss->ipsec_dropper);
5137 5138                  return (NULL);
5138 5139          }
5139 5140  
5140 5141          if (MBLKL(mp) < iph_len + sizeof (udpha_t) + sizeof (*spi)) {
5141 5142                  /* might as well pull it all up - it might be ESP. */
5142 5143                  if (!pullupmsg(mp, -1)) {
5143 5144                          ip_drop_packet(mp, B_TRUE, ira->ira_ill,
5144 5145                              DROPPER(ipss, ipds_esp_nomem),
5145 5146                              &ipss->ipsec_dropper);
5146 5147                          return (NULL);
5147 5148                  }
5148 5149  
5149 5150                  ipha = (ipha_t *)mp->b_rptr;
5150 5151          }
5151 5152          spi = (uint32_t *)(mp->b_rptr + iph_len + sizeof (udpha_t));
5152 5153          if (*spi == 0) {
5153 5154                  /* UDP packet - remove 0-spi. */
5154 5155                  shift = sizeof (uint32_t);
5155 5156          } else {
5156 5157                  /* ESP-in-UDP packet - reduce to ESP. */
5157 5158                  ipha->ipha_protocol = IPPROTO_ESP;
5158 5159                  shift = sizeof (udpha_t);
5159 5160          }
5160 5161  
5161 5162          /* Fix IP header */
5162 5163          ira->ira_pktlen = (plen - shift);
5163 5164          ipha->ipha_length = htons(ira->ira_pktlen);
5164 5165          ipha->ipha_hdr_checksum = 0;
5165 5166  
5166 5167          orptr = mp->b_rptr;
5167 5168          mp->b_rptr += shift;
5168 5169  
5169 5170          udpha = (udpha_t *)(orptr + iph_len);
5170 5171          if (*spi == 0) {
5171 5172                  ASSERT((uint8_t *)ipha == orptr);
5172 5173                  udpha->uha_length = htons(plen - shift - iph_len);
5173 5174                  iph_len += sizeof (udpha_t);    /* For the call to ovbcopy(). */
5174 5175                  esp_ports = 0;
5175 5176          } else {
5176 5177                  esp_ports = *((uint32_t *)udpha);
5177 5178                  ASSERT(esp_ports != 0);
5178 5179          }
5179 5180          ovbcopy(orptr, orptr + shift, iph_len);
5180 5181          if (esp_ports != 0) /* Punt up for ESP processing. */ {
5181 5182                  ipha = (ipha_t *)(orptr + shift);
5182 5183  
5183 5184                  ira->ira_flags |= IRAF_ESP_UDP_PORTS;
5184 5185                  ira->ira_esp_udp_ports = esp_ports;
5185 5186                  ip_fanout_v4(mp, ipha, ira);
5186 5187                  return (NULL);
5187 5188          }
5188 5189          return (mp);
5189 5190  }
5190 5191  
5191 5192  /*
5192 5193   * Deliver a udp packet to the given conn, possibly applying ipsec policy.
5193 5194   * Handles IPv4 and IPv6.
5194 5195   * We are responsible for disposing of mp, such as by freemsg() or putnext()
5195 5196   * Caller is responsible for dropping references to the conn.
5196 5197   */
5197 5198  void
5198 5199  ip_fanout_udp_conn(conn_t *connp, mblk_t *mp, ipha_t *ipha, ip6_t *ip6h,
5199 5200      ip_recv_attr_t *ira)
5200 5201  {
5201 5202          ill_t           *ill = ira->ira_ill;
5202 5203          ip_stack_t      *ipst = ill->ill_ipst;
5203 5204          ipsec_stack_t   *ipss = ipst->ips_netstack->netstack_ipsec;
5204 5205          boolean_t       secure;
5205 5206          iaflags_t       iraflags = ira->ira_flags;
5206 5207  
5207 5208          secure = iraflags & IRAF_IPSEC_SECURE;
5208 5209  
5209 5210          if (IPCL_IS_NONSTR(connp) ? connp->conn_flow_cntrld :
5210 5211              !canputnext(connp->conn_rq)) {
5211 5212                  BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows);
5212 5213                  freemsg(mp);
5213 5214                  return;
5214 5215          }
5215 5216  
5216 5217          if (((iraflags & IRAF_IS_IPV4) ?
5217 5218              CONN_INBOUND_POLICY_PRESENT(connp, ipss) :
5218 5219              CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss)) ||
5219 5220              secure) {
5220 5221                  mp = ipsec_check_inbound_policy(mp, connp, ipha,
5221 5222                      ip6h, ira);
5222 5223                  if (mp == NULL) {
5223 5224                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
5224 5225                          /* Note that mp is NULL */
5225 5226                          ip_drop_input("ipIfStatsInDiscards", mp, ill);
5226 5227                          return;
5227 5228                  }
5228 5229          }
5229 5230  
5230 5231          /*
5231 5232           * Since this code is not used for UDP unicast we don't need a NAT_T
5232 5233           * check. Only ip_fanout_v4 has that check.
5233 5234           */
5234 5235          if (ira->ira_flags & IRAF_ICMP_ERROR) {
5235 5236                  (connp->conn_recvicmp)(connp, mp, NULL, ira);
5236 5237          } else {
5237 5238                  ill_t *rill = ira->ira_rill;
5238 5239  
5239 5240                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
5240 5241                  ira->ira_ill = ira->ira_rill = NULL;
5241 5242                  /* Send it upstream */
5242 5243                  (connp->conn_recv)(connp, mp, NULL, ira);
5243 5244                  ira->ira_ill = ill;
5244 5245                  ira->ira_rill = rill;
5245 5246          }
5246 5247  }
5247 5248  
5248 5249  /*
5249 5250   * Fanout for UDP packets that are multicast or broadcast, and ICMP errors.
5250 5251   * (Unicast fanout is handled in ip_input_v4.)
5251 5252   *
5252 5253   * If SO_REUSEADDR is set all multicast and broadcast packets
5253 5254   * will be delivered to all conns bound to the same port.
5254 5255   *
5255 5256   * If there is at least one matching AF_INET receiver, then we will
5256 5257   * ignore any AF_INET6 receivers.
5257 5258   * In the special case where an AF_INET socket binds to 0.0.0.0/<port> and an
5258 5259   * AF_INET6 socket binds to ::/<port>, only the AF_INET socket receives the IPv4
5259 5260   * packets.
5260 5261   *
5261 5262   * Zones notes:
5262 5263   * Earlier in ip_input on a system with multiple shared-IP zones we
5263 5264   * duplicate the multicast and broadcast packets and send them up
5264 5265   * with each explicit zoneid that exists on that ill.
5265 5266   * This means that here we can match the zoneid with SO_ALLZONES being special.
5266 5267   */
5267 5268  void
5268 5269  ip_fanout_udp_multi_v4(mblk_t *mp, ipha_t *ipha, uint16_t lport, uint16_t fport,
5269 5270      ip_recv_attr_t *ira)
5270 5271  {
5271 5272          ipaddr_t        laddr;
5272 5273          in6_addr_t      v6faddr;
5273 5274          conn_t          *connp;
5274 5275          connf_t         *connfp;
5275 5276          ipaddr_t        faddr;
5276 5277          ill_t           *ill = ira->ira_ill;
5277 5278          ip_stack_t      *ipst = ill->ill_ipst;
5278 5279  
5279 5280          ASSERT(ira->ira_flags & (IRAF_MULTIBROADCAST|IRAF_ICMP_ERROR));
5280 5281  
5281 5282          laddr = ipha->ipha_dst;
5282 5283          faddr = ipha->ipha_src;
5283 5284  
5284 5285          connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
5285 5286          mutex_enter(&connfp->connf_lock);
5286 5287          connp = connfp->connf_head;
5287 5288  
5288 5289          /*
5289 5290           * If SO_REUSEADDR has been set on the first we send the
5290 5291           * packet to all clients that have joined the group and
5291 5292           * match the port.
5292 5293           */
5293 5294          while (connp != NULL) {
5294 5295                  if ((IPCL_UDP_MATCH(connp, lport, laddr, fport, faddr)) &&
5295 5296                      conn_wantpacket(connp, ira, ipha) &&
5296 5297                      (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
5297 5298                      tsol_receive_local(mp, &laddr, IPV4_VERSION, ira, connp)))
5298 5299                          break;
5299 5300                  connp = connp->conn_next;
5300 5301          }
5301 5302  
5302 5303          if (connp == NULL)
5303 5304                  goto notfound;
5304 5305  
5305 5306          CONN_INC_REF(connp);
5306 5307  
5307 5308          if (connp->conn_reuseaddr) {
5308 5309                  conn_t          *first_connp = connp;
5309 5310                  conn_t          *next_connp;
5310 5311                  mblk_t          *mp1;
5311 5312  
5312 5313                  connp = connp->conn_next;
5313 5314                  for (;;) {
5314 5315                          while (connp != NULL) {
5315 5316                                  if (IPCL_UDP_MATCH(connp, lport, laddr,
5316 5317                                      fport, faddr) &&
5317 5318                                      conn_wantpacket(connp, ira, ipha) &&
5318 5319                                      (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
5319 5320                                      tsol_receive_local(mp, &laddr, IPV4_VERSION,
5320 5321                                      ira, connp)))
5321 5322                                          break;
5322 5323                                  connp = connp->conn_next;
5323 5324                          }
5324 5325                          if (connp == NULL) {
5325 5326                                  /* No more interested clients */
5326 5327                                  connp = first_connp;
5327 5328                                  break;
5328 5329                          }
5329 5330                          if (((mp1 = dupmsg(mp)) == NULL) &&
5330 5331                              ((mp1 = copymsg(mp)) == NULL)) {
5331 5332                                  /* Memory allocation failed */
5332 5333                                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
5333 5334                                  ip_drop_input("ipIfStatsInDiscards", mp, ill);
5334 5335                                  connp = first_connp;
5335 5336                                  break;
5336 5337                          }
5337 5338                          CONN_INC_REF(connp);
5338 5339                          mutex_exit(&connfp->connf_lock);
5339 5340  
5340 5341                          IP_STAT(ipst, ip_udp_fanmb);
5341 5342                          ip_fanout_udp_conn(connp, mp1, (ipha_t *)mp1->b_rptr,
5342 5343                              NULL, ira);
5343 5344                          mutex_enter(&connfp->connf_lock);
5344 5345                          /* Follow the next pointer before releasing the conn */
5345 5346                          next_connp = connp->conn_next;
5346 5347                          CONN_DEC_REF(connp);
5347 5348                          connp = next_connp;
5348 5349                  }
5349 5350          }
5350 5351  
5351 5352          /* Last one.  Send it upstream. */
5352 5353          mutex_exit(&connfp->connf_lock);
5353 5354          IP_STAT(ipst, ip_udp_fanmb);
5354 5355          ip_fanout_udp_conn(connp, mp, ipha, NULL, ira);
5355 5356          CONN_DEC_REF(connp);
5356 5357          return;
5357 5358  
5358 5359  notfound:
5359 5360          mutex_exit(&connfp->connf_lock);
5360 5361          /*
5361 5362           * IPv6 endpoints bound to multicast IPv4-mapped addresses
5362 5363           * have already been matched above, since they live in the IPv4
5363 5364           * fanout tables. This implies we only need to
5364 5365           * check for IPv6 in6addr_any endpoints here.
5365 5366           * Thus we compare using ipv6_all_zeros instead of the destination
5366 5367           * address, except for the multicast group membership lookup which
5367 5368           * uses the IPv4 destination.
5368 5369           */
5369 5370          IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6faddr);
5370 5371          connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
5371 5372          mutex_enter(&connfp->connf_lock);
5372 5373          connp = connfp->connf_head;
5373 5374          /*
5374 5375           * IPv4 multicast packet being delivered to an AF_INET6
5375 5376           * in6addr_any endpoint.
5376 5377           * Need to check conn_wantpacket(). Note that we use conn_wantpacket()
5377 5378           * and not conn_wantpacket_v6() since any multicast membership is
5378 5379           * for an IPv4-mapped multicast address.
5379 5380           */
5380 5381          while (connp != NULL) {
5381 5382                  if (IPCL_UDP_MATCH_V6(connp, lport, ipv6_all_zeros,
5382 5383                      fport, v6faddr) &&
5383 5384                      conn_wantpacket(connp, ira, ipha) &&
5384 5385                      (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
5385 5386                      tsol_receive_local(mp, &laddr, IPV4_VERSION, ira, connp)))
5386 5387                          break;
5387 5388                  connp = connp->conn_next;
5388 5389          }
5389 5390  
5390 5391          if (connp == NULL) {
5391 5392                  /*
5392 5393                   * No one bound to this port.  Is
5393 5394                   * there a client that wants all
5394 5395                   * unclaimed datagrams?
5395 5396                   */
5396 5397                  mutex_exit(&connfp->connf_lock);
5397 5398  
5398 5399                  if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_UDP].connf_head !=
5399 5400                      NULL) {
5400 5401                          ASSERT(ira->ira_protocol == IPPROTO_UDP);
5401 5402                          ip_fanout_proto_v4(mp, ipha, ira);
5402 5403                  } else {
5403 5404                          /*
5404 5405                           * We used to attempt to send an icmp error here, but
5405 5406                           * since this is known to be a multicast packet
5406 5407                           * and we don't send icmp errors in response to
5407 5408                           * multicast, just drop the packet and give up sooner.
5408 5409                           */
5409 5410                          BUMP_MIB(ill->ill_ip_mib, udpIfStatsNoPorts);
5410 5411                          freemsg(mp);
5411 5412                  }
5412 5413                  return;
5413 5414          }
5414 5415          CONN_INC_REF(connp);
5415 5416          ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_rq != NULL);
5416 5417  
5417 5418          /*
5418 5419           * If SO_REUSEADDR has been set on the first we send the
5419 5420           * packet to all clients that have joined the group and
5420 5421           * match the port.
5421 5422           */
5422 5423          if (connp->conn_reuseaddr) {
5423 5424                  conn_t          *first_connp = connp;
5424 5425                  conn_t          *next_connp;
5425 5426                  mblk_t          *mp1;
5426 5427  
5427 5428                  connp = connp->conn_next;
5428 5429                  for (;;) {
5429 5430                          while (connp != NULL) {
5430 5431                                  if (IPCL_UDP_MATCH_V6(connp, lport,
5431 5432                                      ipv6_all_zeros, fport, v6faddr) &&
5432 5433                                      conn_wantpacket(connp, ira, ipha) &&
5433 5434                                      (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
5434 5435                                      tsol_receive_local(mp, &laddr, IPV4_VERSION,
5435 5436                                      ira, connp)))
5436 5437                                          break;
5437 5438                                  connp = connp->conn_next;
5438 5439                          }
5439 5440                          if (connp == NULL) {
5440 5441                                  /* No more interested clients */
5441 5442                                  connp = first_connp;
5442 5443                                  break;
5443 5444                          }
5444 5445                          if (((mp1 = dupmsg(mp)) == NULL) &&
5445 5446                              ((mp1 = copymsg(mp)) == NULL)) {
5446 5447                                  /* Memory allocation failed */
5447 5448                                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
5448 5449                                  ip_drop_input("ipIfStatsInDiscards", mp, ill);
5449 5450                                  connp = first_connp;
5450 5451                                  break;
5451 5452                          }
5452 5453                          CONN_INC_REF(connp);
5453 5454                          mutex_exit(&connfp->connf_lock);
5454 5455  
5455 5456                          IP_STAT(ipst, ip_udp_fanmb);
5456 5457                          ip_fanout_udp_conn(connp, mp1, (ipha_t *)mp1->b_rptr,
5457 5458                              NULL, ira);
5458 5459                          mutex_enter(&connfp->connf_lock);
5459 5460                          /* Follow the next pointer before releasing the conn */
5460 5461                          next_connp = connp->conn_next;
5461 5462                          CONN_DEC_REF(connp);
5462 5463                          connp = next_connp;
5463 5464                  }
5464 5465          }
5465 5466  
5466 5467          /* Last one.  Send it upstream. */
5467 5468          mutex_exit(&connfp->connf_lock);
5468 5469          IP_STAT(ipst, ip_udp_fanmb);
5469 5470          ip_fanout_udp_conn(connp, mp, ipha, NULL, ira);
5470 5471          CONN_DEC_REF(connp);
5471 5472  }
5472 5473  
5473 5474  /*
5474 5475   * Split an incoming packet's IPv4 options into the label and the other options.
5475 5476   * If 'allocate' is set it does memory allocation for the ip_pkt_t, including
5476 5477   * clearing out any leftover label or options.
5477 5478   * Otherwise it just makes ipp point into the packet.
5478 5479   *
5479 5480   * Returns zero if ok; ENOMEM if the buffer couldn't be allocated.
5480 5481   */
5481 5482  int
5482 5483  ip_find_hdr_v4(ipha_t *ipha, ip_pkt_t *ipp, boolean_t allocate)
5483 5484  {
5484 5485          uchar_t         *opt;
5485 5486          uint32_t        totallen;
5486 5487          uint32_t        optval;
5487 5488          uint32_t        optlen;
5488 5489  
5489 5490          ipp->ipp_fields |= IPPF_HOPLIMIT | IPPF_TCLASS | IPPF_ADDR;
5490 5491          ipp->ipp_hoplimit = ipha->ipha_ttl;
5491 5492          ipp->ipp_type_of_service = ipha->ipha_type_of_service;
5492 5493          IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &ipp->ipp_addr);
5493 5494  
5494 5495          /*
5495 5496           * Get length (in 4 byte octets) of IP header options.
5496 5497           */
5497 5498          totallen = ipha->ipha_version_and_hdr_length -
5498 5499              (uint8_t)((IP_VERSION << 4) + IP_SIMPLE_HDR_LENGTH_IN_WORDS);
5499 5500  
5500 5501          if (totallen == 0) {
5501 5502                  if (!allocate)
5502 5503                          return (0);
5503 5504  
5504 5505                  /* Clear out anything from a previous packet */
5505 5506                  if (ipp->ipp_fields & IPPF_IPV4_OPTIONS) {
5506 5507                          kmem_free(ipp->ipp_ipv4_options,
5507 5508                              ipp->ipp_ipv4_options_len);
5508 5509                          ipp->ipp_ipv4_options = NULL;
5509 5510                          ipp->ipp_ipv4_options_len = 0;
5510 5511                          ipp->ipp_fields &= ~IPPF_IPV4_OPTIONS;
5511 5512                  }
5512 5513                  if (ipp->ipp_fields & IPPF_LABEL_V4) {
5513 5514                          kmem_free(ipp->ipp_label_v4, ipp->ipp_label_len_v4);
5514 5515                          ipp->ipp_label_v4 = NULL;
5515 5516                          ipp->ipp_label_len_v4 = 0;
5516 5517                          ipp->ipp_fields &= ~IPPF_LABEL_V4;
5517 5518                  }
5518 5519                  return (0);
5519 5520          }
5520 5521  
5521 5522          totallen <<= 2;
5522 5523          opt = (uchar_t *)&ipha[1];
5523 5524          if (!is_system_labeled()) {
5524 5525  
5525 5526          copyall:
5526 5527                  if (!allocate) {
5527 5528                          if (totallen != 0) {
5528 5529                                  ipp->ipp_ipv4_options = opt;
5529 5530                                  ipp->ipp_ipv4_options_len = totallen;
5530 5531                                  ipp->ipp_fields |= IPPF_IPV4_OPTIONS;
5531 5532                          }
5532 5533                          return (0);
5533 5534                  }
5534 5535                  /* Just copy all of options */
5535 5536                  if (ipp->ipp_fields & IPPF_IPV4_OPTIONS) {
5536 5537                          if (totallen == ipp->ipp_ipv4_options_len) {
5537 5538                                  bcopy(opt, ipp->ipp_ipv4_options, totallen);
5538 5539                                  return (0);
5539 5540                          }
5540 5541                          kmem_free(ipp->ipp_ipv4_options,
5541 5542                              ipp->ipp_ipv4_options_len);
5542 5543                          ipp->ipp_ipv4_options = NULL;
5543 5544                          ipp->ipp_ipv4_options_len = 0;
5544 5545                          ipp->ipp_fields &= ~IPPF_IPV4_OPTIONS;
5545 5546                  }
5546 5547                  if (totallen == 0)
5547 5548                          return (0);
5548 5549  
5549 5550                  ipp->ipp_ipv4_options = kmem_alloc(totallen, KM_NOSLEEP);
5550 5551                  if (ipp->ipp_ipv4_options == NULL)
5551 5552                          return (ENOMEM);
5552 5553                  ipp->ipp_ipv4_options_len = totallen;
5553 5554                  ipp->ipp_fields |= IPPF_IPV4_OPTIONS;
5554 5555                  bcopy(opt, ipp->ipp_ipv4_options, totallen);
5555 5556                  return (0);
5556 5557          }
5557 5558  
5558 5559          if (allocate && (ipp->ipp_fields & IPPF_LABEL_V4)) {
5559 5560                  kmem_free(ipp->ipp_label_v4, ipp->ipp_label_len_v4);
5560 5561                  ipp->ipp_label_v4 = NULL;
5561 5562                  ipp->ipp_label_len_v4 = 0;
5562 5563                  ipp->ipp_fields &= ~IPPF_LABEL_V4;
5563 5564          }
5564 5565  
5565 5566          /*
5566 5567           * Search for CIPSO option.
5567 5568           * We assume CIPSO is first in options if it is present.
5568 5569           * If it isn't, then ipp_opt_ipv4_options will not include the options
5569 5570           * prior to the CIPSO option.
5570 5571           */
5571 5572          while (totallen != 0) {
5572 5573                  switch (optval = opt[IPOPT_OPTVAL]) {
5573 5574                  case IPOPT_EOL:
5574 5575                          return (0);
5575 5576                  case IPOPT_NOP:
5576 5577                          optlen = 1;
5577 5578                          break;
5578 5579                  default:
5579 5580                          if (totallen <= IPOPT_OLEN)
5580 5581                                  return (EINVAL);
5581 5582                          optlen = opt[IPOPT_OLEN];
5582 5583                          if (optlen < 2)
5583 5584                                  return (EINVAL);
5584 5585                  }
5585 5586                  if (optlen > totallen)
5586 5587                          return (EINVAL);
5587 5588  
5588 5589                  switch (optval) {
5589 5590                  case IPOPT_COMSEC:
5590 5591                          if (!allocate) {
5591 5592                                  ipp->ipp_label_v4 = opt;
5592 5593                                  ipp->ipp_label_len_v4 = optlen;
5593 5594                                  ipp->ipp_fields |= IPPF_LABEL_V4;
5594 5595                          } else {
5595 5596                                  ipp->ipp_label_v4 = kmem_alloc(optlen,
5596 5597                                      KM_NOSLEEP);
5597 5598                                  if (ipp->ipp_label_v4 == NULL)
5598 5599                                          return (ENOMEM);
5599 5600                                  ipp->ipp_label_len_v4 = optlen;
5600 5601                                  ipp->ipp_fields |= IPPF_LABEL_V4;
5601 5602                                  bcopy(opt, ipp->ipp_label_v4, optlen);
5602 5603                          }
5603 5604                          totallen -= optlen;
5604 5605                          opt += optlen;
5605 5606  
5606 5607                          /* Skip padding bytes until we get to a multiple of 4 */
5607 5608                          while ((totallen & 3) != 0 && opt[0] == IPOPT_NOP) {
5608 5609                                  totallen--;
5609 5610                                  opt++;
5610 5611                          }
5611 5612                          /* Remaining as ipp_ipv4_options */
5612 5613                          goto copyall;
5613 5614                  }
5614 5615                  totallen -= optlen;
5615 5616                  opt += optlen;
5616 5617          }
5617 5618          /* No CIPSO found; return everything as ipp_ipv4_options */
5618 5619          totallen = ipha->ipha_version_and_hdr_length -
5619 5620              (uint8_t)((IP_VERSION << 4) + IP_SIMPLE_HDR_LENGTH_IN_WORDS);
5620 5621          totallen <<= 2;
5621 5622          opt = (uchar_t *)&ipha[1];
5622 5623          goto copyall;
5623 5624  }
5624 5625  
5625 5626  /*
5626 5627   * Efficient versions of lookup for an IRE when we only
5627 5628   * match the address.
5628 5629   * For RTF_REJECT or BLACKHOLE we return IRE_NOROUTE.
5629 5630   * Does not handle multicast addresses.
5630 5631   */
5631 5632  uint_t
5632 5633  ip_type_v4(ipaddr_t addr, ip_stack_t *ipst)
5633 5634  {
5634 5635          ire_t *ire;
5635 5636          uint_t result;
5636 5637  
5637 5638          ire = ire_ftable_lookup_simple_v4(addr, 0, ipst, NULL);
5638 5639          ASSERT(ire != NULL);
5639 5640          if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
5640 5641                  result = IRE_NOROUTE;
5641 5642          else
5642 5643                  result = ire->ire_type;
5643 5644          ire_refrele(ire);
5644 5645          return (result);
5645 5646  }
5646 5647  
5647 5648  /*
5648 5649   * Efficient versions of lookup for an IRE when we only
5649 5650   * match the address.
5650 5651   * For RTF_REJECT or BLACKHOLE we return IRE_NOROUTE.
5651 5652   * Does not handle multicast addresses.
5652 5653   */
5653 5654  uint_t
5654 5655  ip_type_v6(const in6_addr_t *addr, ip_stack_t *ipst)
5655 5656  {
5656 5657          ire_t *ire;
5657 5658          uint_t result;
5658 5659  
5659 5660          ire = ire_ftable_lookup_simple_v6(addr, 0, ipst, NULL);
5660 5661          ASSERT(ire != NULL);
5661 5662          if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
5662 5663                  result = IRE_NOROUTE;
5663 5664          else
5664 5665                  result = ire->ire_type;
5665 5666          ire_refrele(ire);
5666 5667          return (result);
5667 5668  }
5668 5669  
5669 5670  /*
5670 5671   * Nobody should be sending
5671 5672   * packets up this stream
5672 5673   */
5673 5674  static int
5674 5675  ip_lrput(queue_t *q, mblk_t *mp)
5675 5676  {
5676 5677          switch (mp->b_datap->db_type) {
5677 5678          case M_FLUSH:
5678 5679                  /* Turn around */
5679 5680                  if (*mp->b_rptr & FLUSHW) {
5680 5681                          *mp->b_rptr &= ~FLUSHR;
5681 5682                          qreply(q, mp);
5682 5683                          return (0);
5683 5684                  }
5684 5685                  break;
5685 5686          }
5686 5687          freemsg(mp);
5687 5688          return (0);
5688 5689  }
5689 5690  
5690 5691  /* Nobody should be sending packets down this stream */
5691 5692  /* ARGSUSED */
5692 5693  int
5693 5694  ip_lwput(queue_t *q, mblk_t *mp)
5694 5695  {
5695 5696          freemsg(mp);
5696 5697          return (0);
5697 5698  }
5698 5699  
5699 5700  /*
5700 5701   * Move the first hop in any source route to ipha_dst and remove that part of
5701 5702   * the source route.  Called by other protocols.  Errors in option formatting
5702 5703   * are ignored - will be handled by ip_output_options. Return the final
5703 5704   * destination (either ipha_dst or the last entry in a source route.)
5704 5705   */
5705 5706  ipaddr_t
5706 5707  ip_massage_options(ipha_t *ipha, netstack_t *ns)
5707 5708  {
5708 5709          ipoptp_t        opts;
5709 5710          uchar_t         *opt;
5710 5711          uint8_t         optval;
5711 5712          uint8_t         optlen;
5712 5713          ipaddr_t        dst;
5713 5714          int             i;
5714 5715          ip_stack_t      *ipst = ns->netstack_ip;
5715 5716  
5716 5717          ip2dbg(("ip_massage_options\n"));
5717 5718          dst = ipha->ipha_dst;
5718 5719          for (optval = ipoptp_first(&opts, ipha);
5719 5720              optval != IPOPT_EOL;
5720 5721              optval = ipoptp_next(&opts)) {
5721 5722                  opt = opts.ipoptp_cur;
5722 5723                  switch (optval) {
5723 5724                          uint8_t off;
5724 5725                  case IPOPT_SSRR:
5725 5726                  case IPOPT_LSRR:
5726 5727                          if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
5727 5728                                  ip1dbg(("ip_massage_options: bad src route\n"));
5728 5729                                  break;
5729 5730                          }
5730 5731                          optlen = opts.ipoptp_len;
5731 5732                          off = opt[IPOPT_OFFSET];
5732 5733                          off--;
5733 5734                  redo_srr:
5734 5735                          if (optlen < IP_ADDR_LEN ||
5735 5736                              off > optlen - IP_ADDR_LEN) {
5736 5737                                  /* End of source route */
5737 5738                                  ip1dbg(("ip_massage_options: end of SR\n"));
5738 5739                                  break;
5739 5740                          }
5740 5741                          bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
5741 5742                          ip1dbg(("ip_massage_options: next hop 0x%x\n",
5742 5743                              ntohl(dst)));
5743 5744                          /*
5744 5745                           * Check if our address is present more than
5745 5746                           * once as consecutive hops in source route.
5746 5747                           * XXX verify per-interface ip_forwarding
5747 5748                           * for source route?
5748 5749                           */
5749 5750                          if (ip_type_v4(dst, ipst) == IRE_LOCAL) {
5750 5751                                  off += IP_ADDR_LEN;
5751 5752                                  goto redo_srr;
5752 5753                          }
5753 5754                          if (dst == htonl(INADDR_LOOPBACK)) {
5754 5755                                  ip1dbg(("ip_massage_options: loopback addr in "
5755 5756                                      "source route!\n"));
5756 5757                                  break;
5757 5758                          }
5758 5759                          /*
5759 5760                           * Update ipha_dst to be the first hop and remove the
5760 5761                           * first hop from the source route (by overwriting
5761 5762                           * part of the option with NOP options).
5762 5763                           */
5763 5764                          ipha->ipha_dst = dst;
5764 5765                          /* Put the last entry in dst */
5765 5766                          off = ((optlen - IP_ADDR_LEN - 3) & ~(IP_ADDR_LEN-1)) +
5766 5767                              3;
5767 5768                          bcopy(&opt[off], &dst, IP_ADDR_LEN);
5768 5769  
5769 5770                          ip1dbg(("ip_massage_options: last hop 0x%x\n",
5770 5771                              ntohl(dst)));
5771 5772                          /* Move down and overwrite */
5772 5773                          opt[IP_ADDR_LEN] = opt[0];
5773 5774                          opt[IP_ADDR_LEN+1] = opt[IPOPT_OLEN] - IP_ADDR_LEN;
5774 5775                          opt[IP_ADDR_LEN+2] = opt[IPOPT_OFFSET];
5775 5776                          for (i = 0; i < IP_ADDR_LEN; i++)
5776 5777                                  opt[i] = IPOPT_NOP;
5777 5778                          break;
5778 5779                  }
5779 5780          }
5780 5781          return (dst);
5781 5782  }
5782 5783  
5783 5784  /*
5784 5785   * Return the network mask
5785 5786   * associated with the specified address.
5786 5787   */
5787 5788  ipaddr_t
5788 5789  ip_net_mask(ipaddr_t addr)
5789 5790  {
5790 5791          uchar_t *up = (uchar_t *)&addr;
5791 5792          ipaddr_t mask = 0;
5792 5793          uchar_t *maskp = (uchar_t *)&mask;
5793 5794  
5794 5795  #if defined(__i386) || defined(__amd64)
5795 5796  #define TOTALLY_BRAIN_DAMAGED_C_COMPILER
5796 5797  #endif
5797 5798  #ifdef  TOTALLY_BRAIN_DAMAGED_C_COMPILER
5798 5799          maskp[0] = maskp[1] = maskp[2] = maskp[3] = 0;
5799 5800  #endif
5800 5801          if (CLASSD(addr)) {
5801 5802                  maskp[0] = 0xF0;
5802 5803                  return (mask);
5803 5804          }
5804 5805  
5805 5806          /* We assume Class E default netmask to be 32 */
5806 5807          if (CLASSE(addr))
5807 5808                  return (0xffffffffU);
5808 5809  
5809 5810          if (addr == 0)
5810 5811                  return (0);
5811 5812          maskp[0] = 0xFF;
5812 5813          if ((up[0] & 0x80) == 0)
5813 5814                  return (mask);
5814 5815  
5815 5816          maskp[1] = 0xFF;
5816 5817          if ((up[0] & 0xC0) == 0x80)
5817 5818                  return (mask);
5818 5819  
5819 5820          maskp[2] = 0xFF;
5820 5821          if ((up[0] & 0xE0) == 0xC0)
5821 5822                  return (mask);
5822 5823  
5823 5824          /* Otherwise return no mask */
5824 5825          return ((ipaddr_t)0);
5825 5826  }
5826 5827  
5827 5828  /* Name/Value Table Lookup Routine */
5828 5829  char *
5829 5830  ip_nv_lookup(nv_t *nv, int value)
5830 5831  {
5831 5832          if (!nv)
5832 5833                  return (NULL);
5833 5834          for (; nv->nv_name; nv++) {
5834 5835                  if (nv->nv_value == value)
5835 5836                          return (nv->nv_name);
5836 5837          }
5837 5838          return ("unknown");
5838 5839  }
5839 5840  
5840 5841  static int
5841 5842  ip_wait_for_info_ack(ill_t *ill)
5842 5843  {
5843 5844          int err;
5844 5845  
5845 5846          mutex_enter(&ill->ill_lock);
5846 5847          while (ill->ill_state_flags & ILL_LL_SUBNET_PENDING) {
5847 5848                  /*
5848 5849                   * Return value of 0 indicates a pending signal.
5849 5850                   */
5850 5851                  err = cv_wait_sig(&ill->ill_cv, &ill->ill_lock);
5851 5852                  if (err == 0) {
5852 5853                          mutex_exit(&ill->ill_lock);
5853 5854                          return (EINTR);
5854 5855                  }
5855 5856          }
5856 5857          mutex_exit(&ill->ill_lock);
5857 5858          /*
5858 5859           * ip_rput_other could have set an error  in ill_error on
5859 5860           * receipt of M_ERROR.
5860 5861           */
5861 5862          return (ill->ill_error);
5862 5863  }
5863 5864  
5864 5865  /*
5865 5866   * This is a module open, i.e. this is a control stream for access
5866 5867   * to a DLPI device.  We allocate an ill_t as the instance data in
5867 5868   * this case.
5868 5869   */
5869 5870  static int
5870 5871  ip_modopen(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
5871 5872  {
5872 5873          ill_t   *ill;
5873 5874          int     err;
5874 5875          zoneid_t zoneid;
5875 5876          netstack_t *ns;
5876 5877          ip_stack_t *ipst;
5877 5878  
5878 5879          /*
5879 5880           * Prevent unprivileged processes from pushing IP so that
5880 5881           * they can't send raw IP.
5881 5882           */
5882 5883          if (secpolicy_net_rawaccess(credp) != 0)
5883 5884                  return (EPERM);
5884 5885  
5885 5886          ns = netstack_find_by_cred(credp);
5886 5887          ASSERT(ns != NULL);
5887 5888          ipst = ns->netstack_ip;
5888 5889          ASSERT(ipst != NULL);
5889 5890  
5890 5891          /*
5891 5892           * For exclusive stacks we set the zoneid to zero
5892 5893           * to make IP operate as if in the global zone.
5893 5894           */
5894 5895          if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID)
5895 5896                  zoneid = GLOBAL_ZONEID;
5896 5897          else
5897 5898                  zoneid = crgetzoneid(credp);
5898 5899  
5899 5900          ill = (ill_t *)mi_open_alloc_sleep(sizeof (ill_t));
5900 5901          q->q_ptr = WR(q)->q_ptr = ill;
5901 5902          ill->ill_ipst = ipst;
5902 5903          ill->ill_zoneid = zoneid;
5903 5904  
5904 5905          /*
5905 5906           * ill_init initializes the ill fields and then sends down
5906 5907           * down a DL_INFO_REQ after calling qprocson.
5907 5908           */
5908 5909          err = ill_init(q, ill);
5909 5910  
5910 5911          if (err != 0) {
5911 5912                  mi_free(ill);
5912 5913                  netstack_rele(ipst->ips_netstack);
5913 5914                  q->q_ptr = NULL;
5914 5915                  WR(q)->q_ptr = NULL;
5915 5916                  return (err);
5916 5917          }
5917 5918  
5918 5919          /*
5919 5920           * Wait for the DL_INFO_ACK if a DL_INFO_REQ was sent.
5920 5921           *
5921 5922           * ill_init initializes the ipsq marking this thread as
5922 5923           * writer
5923 5924           */
5924 5925          ipsq_exit(ill->ill_phyint->phyint_ipsq);
5925 5926          err = ip_wait_for_info_ack(ill);
5926 5927          if (err == 0)
5927 5928                  ill->ill_credp = credp;
5928 5929          else
5929 5930                  goto fail;
5930 5931  
5931 5932          crhold(credp);
5932 5933  
5933 5934          mutex_enter(&ipst->ips_ip_mi_lock);
5934 5935          err = mi_open_link(&ipst->ips_ip_g_head, (IDP)q->q_ptr, devp, flag,
5935 5936              sflag, credp);
5936 5937          mutex_exit(&ipst->ips_ip_mi_lock);
5937 5938  fail:
5938 5939          if (err) {
5939 5940                  (void) ip_close(q, 0, credp);
5940 5941                  return (err);
5941 5942          }
5942 5943          return (0);
5943 5944  }
5944 5945  
5945 5946  /* For /dev/ip aka AF_INET open */
5946 5947  int
5947 5948  ip_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
5948 5949  {
5949 5950          return (ip_open(q, devp, flag, sflag, credp, B_FALSE));
5950 5951  }
5951 5952  
5952 5953  /* For /dev/ip6 aka AF_INET6 open */
5953 5954  int
5954 5955  ip_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
5955 5956  {
5956 5957          return (ip_open(q, devp, flag, sflag, credp, B_TRUE));
5957 5958  }
5958 5959  
5959 5960  /* IP open routine. */
5960 5961  int
5961 5962  ip_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
5962 5963      boolean_t isv6)
5963 5964  {
5964 5965          conn_t          *connp;
5965 5966          major_t         maj;
5966 5967          zoneid_t        zoneid;
5967 5968          netstack_t      *ns;
5968 5969          ip_stack_t      *ipst;
5969 5970  
5970 5971          /* Allow reopen. */
5971 5972          if (q->q_ptr != NULL)
5972 5973                  return (0);
5973 5974  
5974 5975          if (sflag & MODOPEN) {
5975 5976                  /* This is a module open */
5976 5977                  return (ip_modopen(q, devp, flag, sflag, credp));
5977 5978          }
5978 5979  
5979 5980          if ((flag & ~(FKLYR)) == IP_HELPER_STR) {
5980 5981                  /*
5981 5982                   * Non streams based socket looking for a stream
5982 5983                   * to access IP
5983 5984                   */
5984 5985                  return (ip_helper_stream_setup(q, devp, flag, sflag,
5985 5986                      credp, isv6));
5986 5987          }
5987 5988  
5988 5989          ns = netstack_find_by_cred(credp);
5989 5990          ASSERT(ns != NULL);
5990 5991          ipst = ns->netstack_ip;
5991 5992          ASSERT(ipst != NULL);
5992 5993  
5993 5994          /*
5994 5995           * For exclusive stacks we set the zoneid to zero
5995 5996           * to make IP operate as if in the global zone.
5996 5997           */
5997 5998          if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID)
5998 5999                  zoneid = GLOBAL_ZONEID;
5999 6000          else
6000 6001                  zoneid = crgetzoneid(credp);
6001 6002  
6002 6003          /*
6003 6004           * We are opening as a device. This is an IP client stream, and we
6004 6005           * allocate an conn_t as the instance data.
6005 6006           */
6006 6007          connp = ipcl_conn_create(IPCL_IPCCONN, KM_SLEEP, ipst->ips_netstack);
6007 6008  
6008 6009          /*
6009 6010           * ipcl_conn_create did a netstack_hold. Undo the hold that was
6010 6011           * done by netstack_find_by_cred()
6011 6012           */
6012 6013          netstack_rele(ipst->ips_netstack);
6013 6014  
6014 6015          connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_ULP_CKSUM;
6015 6016          /* conn_allzones can not be set this early, hence no IPCL_ZONEID */
6016 6017          connp->conn_ixa->ixa_zoneid = zoneid;
6017 6018          connp->conn_zoneid = zoneid;
6018 6019  
6019 6020          connp->conn_rq = q;
6020 6021          q->q_ptr = WR(q)->q_ptr = connp;
6021 6022  
6022 6023          /* Minor tells us which /dev entry was opened */
6023 6024          if (isv6) {
6024 6025                  connp->conn_family = AF_INET6;
6025 6026                  connp->conn_ipversion = IPV6_VERSION;
6026 6027                  connp->conn_ixa->ixa_flags &= ~IXAF_IS_IPV4;
6027 6028                  connp->conn_ixa->ixa_src_preferences = IPV6_PREFER_SRC_DEFAULT;
6028 6029          } else {
6029 6030                  connp->conn_family = AF_INET;
6030 6031                  connp->conn_ipversion = IPV4_VERSION;
6031 6032                  connp->conn_ixa->ixa_flags |= IXAF_IS_IPV4;
6032 6033          }
6033 6034  
6034 6035          if ((ip_minor_arena_la != NULL) && (flag & SO_SOCKSTR) &&
6035 6036              ((connp->conn_dev = inet_minor_alloc(ip_minor_arena_la)) != 0)) {
6036 6037                  connp->conn_minor_arena = ip_minor_arena_la;
6037 6038          } else {
6038 6039                  /*
6039 6040                   * Either minor numbers in the large arena were exhausted
6040 6041                   * or a non socket application is doing the open.
6041 6042                   * Try to allocate from the small arena.
6042 6043                   */
6043 6044                  if ((connp->conn_dev =
6044 6045                      inet_minor_alloc(ip_minor_arena_sa)) == 0) {
6045 6046                          /* CONN_DEC_REF takes care of netstack_rele() */
6046 6047                          q->q_ptr = WR(q)->q_ptr = NULL;
6047 6048                          CONN_DEC_REF(connp);
6048 6049                          return (EBUSY);
6049 6050                  }
6050 6051                  connp->conn_minor_arena = ip_minor_arena_sa;
6051 6052          }
6052 6053  
6053 6054          maj = getemajor(*devp);
6054 6055          *devp = makedevice(maj, (minor_t)connp->conn_dev);
6055 6056  
6056 6057          /*
6057 6058           * connp->conn_cred is crfree()ed in ipcl_conn_destroy()
6058 6059           */
6059 6060          connp->conn_cred = credp;
6060 6061          connp->conn_cpid = curproc->p_pid;
6061 6062          /* Cache things in ixa without an extra refhold */
6062 6063          ASSERT(!(connp->conn_ixa->ixa_free_flags & IXA_FREE_CRED));
6063 6064          connp->conn_ixa->ixa_cred = connp->conn_cred;
6064 6065          connp->conn_ixa->ixa_cpid = connp->conn_cpid;
6065 6066          if (is_system_labeled())
6066 6067                  connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred);
6067 6068  
6068 6069          /*
6069 6070           * Handle IP_IOC_RTS_REQUEST and other ioctls which use conn_recv
6070 6071           */
6071 6072          connp->conn_recv = ip_conn_input;
6072 6073          connp->conn_recvicmp = ip_conn_input_icmp;
6073 6074  
6074 6075          crhold(connp->conn_cred);
6075 6076  
6076 6077          /*
6077 6078           * If the caller has the process-wide flag set, then default to MAC
6078 6079           * exempt mode.  This allows read-down to unlabeled hosts.
6079 6080           */
6080 6081          if (getpflags(NET_MAC_AWARE, credp) != 0)
6081 6082                  connp->conn_mac_mode = CONN_MAC_AWARE;
6082 6083  
6083 6084          connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID);
6084 6085  
6085 6086          connp->conn_rq = q;
6086 6087          connp->conn_wq = WR(q);
6087 6088  
6088 6089          /* Non-zero default values */
6089 6090          connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP;
6090 6091  
6091 6092          /*
6092 6093           * Make the conn globally visible to walkers
6093 6094           */
6094 6095          ASSERT(connp->conn_ref == 1);
6095 6096          mutex_enter(&connp->conn_lock);
6096 6097          connp->conn_state_flags &= ~CONN_INCIPIENT;
6097 6098          mutex_exit(&connp->conn_lock);
6098 6099  
6099 6100          qprocson(q);
6100 6101  
6101 6102          return (0);
6102 6103  }
6103 6104  
6104 6105  /*
6105 6106   * Set IPsec policy from an ipsec_req_t. If the req is not "zero" and valid,
6106 6107   * all of them are copied to the conn_t. If the req is "zero", the policy is
6107 6108   * zeroed out. A "zero" policy has zero ipsr_{ah,req,self_encap}_req
6108 6109   * fields.
6109 6110   * We keep only the latest setting of the policy and thus policy setting
6110 6111   * is not incremental/cumulative.
6111 6112   *
6112 6113   * Requests to set policies with multiple alternative actions will
6113 6114   * go through a different API.
6114 6115   */
6115 6116  int
6116 6117  ipsec_set_req(cred_t *cr, conn_t *connp, ipsec_req_t *req)
6117 6118  {
6118 6119          uint_t ah_req = 0;
6119 6120          uint_t esp_req = 0;
6120 6121          uint_t se_req = 0;
6121 6122          ipsec_act_t *actp = NULL;
6122 6123          uint_t nact;
6123 6124          ipsec_policy_head_t *ph;
6124 6125          boolean_t is_pol_reset, is_pol_inserted = B_FALSE;
6125 6126          int error = 0;
6126 6127          netstack_t      *ns = connp->conn_netstack;
6127 6128          ip_stack_t      *ipst = ns->netstack_ip;
6128 6129          ipsec_stack_t   *ipss = ns->netstack_ipsec;
6129 6130  
6130 6131  #define REQ_MASK (IPSEC_PREF_REQUIRED|IPSEC_PREF_NEVER)
6131 6132  
6132 6133          /*
6133 6134           * The IP_SEC_OPT option does not allow variable length parameters,
6134 6135           * hence a request cannot be NULL.
6135 6136           */
6136 6137          if (req == NULL)
6137 6138                  return (EINVAL);
6138 6139  
6139 6140          ah_req = req->ipsr_ah_req;
6140 6141          esp_req = req->ipsr_esp_req;
6141 6142          se_req = req->ipsr_self_encap_req;
6142 6143  
6143 6144          /* Don't allow setting self-encap without one or more of AH/ESP. */
6144 6145          if (se_req != 0 && esp_req == 0 && ah_req == 0)
6145 6146                  return (EINVAL);
6146 6147  
6147 6148          /*
6148 6149           * Are we dealing with a request to reset the policy (i.e.
6149 6150           * zero requests).
6150 6151           */
6151 6152          is_pol_reset = ((ah_req & REQ_MASK) == 0 &&
6152 6153              (esp_req & REQ_MASK) == 0 &&
6153 6154              (se_req & REQ_MASK) == 0);
6154 6155  
6155 6156          if (!is_pol_reset) {
6156 6157                  /*
6157 6158                   * If we couldn't load IPsec, fail with "protocol
6158 6159                   * not supported".
6159 6160                   * IPsec may not have been loaded for a request with zero
6160 6161                   * policies, so we don't fail in this case.
6161 6162                   */
6162 6163                  mutex_enter(&ipss->ipsec_loader_lock);
6163 6164                  if (ipss->ipsec_loader_state != IPSEC_LOADER_SUCCEEDED) {
6164 6165                          mutex_exit(&ipss->ipsec_loader_lock);
6165 6166                          return (EPROTONOSUPPORT);
6166 6167                  }
6167 6168                  mutex_exit(&ipss->ipsec_loader_lock);
6168 6169  
6169 6170                  /*
6170 6171                   * Test for valid requests. Invalid algorithms
6171 6172                   * need to be tested by IPsec code because new
6172 6173                   * algorithms can be added dynamically.
6173 6174                   */
6174 6175                  if ((ah_req & ~(REQ_MASK|IPSEC_PREF_UNIQUE)) != 0 ||
6175 6176                      (esp_req & ~(REQ_MASK|IPSEC_PREF_UNIQUE)) != 0 ||
6176 6177                      (se_req & ~(REQ_MASK|IPSEC_PREF_UNIQUE)) != 0) {
6177 6178                          return (EINVAL);
6178 6179                  }
6179 6180  
6180 6181                  /*
6181 6182                   * Only privileged users can issue these
6182 6183                   * requests.
6183 6184                   */
6184 6185                  if (((ah_req & IPSEC_PREF_NEVER) ||
6185 6186                      (esp_req & IPSEC_PREF_NEVER) ||
6186 6187                      (se_req & IPSEC_PREF_NEVER)) &&
6187 6188                      secpolicy_ip_config(cr, B_FALSE) != 0) {
6188 6189                          return (EPERM);
6189 6190                  }
6190 6191  
6191 6192                  /*
6192 6193                   * The IPSEC_PREF_REQUIRED and IPSEC_PREF_NEVER
6193 6194                   * are mutually exclusive.
6194 6195                   */
6195 6196                  if (((ah_req & REQ_MASK) == REQ_MASK) ||
6196 6197                      ((esp_req & REQ_MASK) == REQ_MASK) ||
6197 6198                      ((se_req & REQ_MASK) == REQ_MASK)) {
6198 6199                          /* Both of them are set */
6199 6200                          return (EINVAL);
6200 6201                  }
6201 6202          }
6202 6203  
6203 6204          ASSERT(MUTEX_HELD(&connp->conn_lock));
6204 6205  
6205 6206          /*
6206 6207           * If we have already cached policies in conn_connect(), don't
6207 6208           * let them change now. We cache policies for connections
6208 6209           * whose src,dst [addr, port] is known.
6209 6210           */
6210 6211          if (connp->conn_policy_cached) {
6211 6212                  return (EINVAL);
6212 6213          }
6213 6214  
6214 6215          /*
6215 6216           * We have a zero policies, reset the connection policy if already
6216 6217           * set. This will cause the connection to inherit the
6217 6218           * global policy, if any.
6218 6219           */
6219 6220          if (is_pol_reset) {
6220 6221                  if (connp->conn_policy != NULL) {
6221 6222                          IPPH_REFRELE(connp->conn_policy, ipst->ips_netstack);
6222 6223                          connp->conn_policy = NULL;
6223 6224                  }
6224 6225                  connp->conn_in_enforce_policy = B_FALSE;
6225 6226                  connp->conn_out_enforce_policy = B_FALSE;
6226 6227                  return (0);
6227 6228          }
6228 6229  
6229 6230          ph = connp->conn_policy = ipsec_polhead_split(connp->conn_policy,
6230 6231              ipst->ips_netstack);
6231 6232          if (ph == NULL)
6232 6233                  goto enomem;
6233 6234  
6234 6235          ipsec_actvec_from_req(req, &actp, &nact, ipst->ips_netstack);
6235 6236          if (actp == NULL)
6236 6237                  goto enomem;
6237 6238  
6238 6239          /*
6239 6240           * Always insert IPv4 policy entries, since they can also apply to
6240 6241           * ipv6 sockets being used in ipv4-compat mode.
6241 6242           */
6242 6243          if (!ipsec_polhead_insert(ph, actp, nact, IPSEC_AF_V4,
6243 6244              IPSEC_TYPE_INBOUND, ns))
6244 6245                  goto enomem;
6245 6246          is_pol_inserted = B_TRUE;
6246 6247          if (!ipsec_polhead_insert(ph, actp, nact, IPSEC_AF_V4,
6247 6248              IPSEC_TYPE_OUTBOUND, ns))
6248 6249                  goto enomem;
6249 6250  
6250 6251          /*
6251 6252           * We're looking at a v6 socket, also insert the v6-specific
6252 6253           * entries.
6253 6254           */
6254 6255          if (connp->conn_family == AF_INET6) {
6255 6256                  if (!ipsec_polhead_insert(ph, actp, nact, IPSEC_AF_V6,
6256 6257                      IPSEC_TYPE_INBOUND, ns))
6257 6258                          goto enomem;
6258 6259                  if (!ipsec_polhead_insert(ph, actp, nact, IPSEC_AF_V6,
6259 6260                      IPSEC_TYPE_OUTBOUND, ns))
6260 6261                          goto enomem;
6261 6262          }
6262 6263  
6263 6264          ipsec_actvec_free(actp, nact);
6264 6265  
6265 6266          /*
6266 6267           * If the requests need security, set enforce_policy.
6267 6268           * If the requests are IPSEC_PREF_NEVER, one should
6268 6269           * still set conn_out_enforce_policy so that ip_set_destination
6269 6270           * marks the ip_xmit_attr_t appropriatly. This is needed so that
6270 6271           * for connections that we don't cache policy in at connect time,
6271 6272           * if global policy matches in ip_output_attach_policy, we
6272 6273           * don't wrongly inherit global policy. Similarly, we need
6273 6274           * to set conn_in_enforce_policy also so that we don't verify
6274 6275           * policy wrongly.
6275 6276           */
6276 6277          if ((ah_req & REQ_MASK) != 0 ||
6277 6278              (esp_req & REQ_MASK) != 0 ||
6278 6279              (se_req & REQ_MASK) != 0) {
6279 6280                  connp->conn_in_enforce_policy = B_TRUE;
6280 6281                  connp->conn_out_enforce_policy = B_TRUE;
6281 6282          }
6282 6283  
6283 6284          return (error);
6284 6285  #undef REQ_MASK
6285 6286  
6286 6287          /*
6287 6288           * Common memory-allocation-failure exit path.
6288 6289           */
6289 6290  enomem:
6290 6291          if (actp != NULL)
6291 6292                  ipsec_actvec_free(actp, nact);
6292 6293          if (is_pol_inserted)
6293 6294                  ipsec_polhead_flush(ph, ns);
6294 6295          return (ENOMEM);
6295 6296  }
6296 6297  
6297 6298  /*
6298 6299   * Set socket options for joining and leaving multicast groups.
6299 6300   * Common to IPv4 and IPv6; inet6 indicates the type of socket.
6300 6301   * The caller has already check that the option name is consistent with
6301 6302   * the address family of the socket.
6302 6303   */
6303 6304  int
6304 6305  ip_opt_set_multicast_group(conn_t *connp, t_scalar_t name,
6305 6306      uchar_t *invalp, boolean_t inet6, boolean_t checkonly)
6306 6307  {
6307 6308          int             *i1 = (int *)invalp;
6308 6309          int             error = 0;
6309 6310          ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
6310 6311          struct ip_mreq  *v4_mreqp;
6311 6312          struct ipv6_mreq *v6_mreqp;
6312 6313          struct group_req *greqp;
6313 6314          ire_t *ire;
6314 6315          boolean_t done = B_FALSE;
6315 6316          ipaddr_t ifaddr;
6316 6317          in6_addr_t v6group;
6317 6318          uint_t ifindex;
6318 6319          boolean_t mcast_opt = B_TRUE;
6319 6320          mcast_record_t fmode;
6320 6321          int (*optfn)(conn_t *, boolean_t, const in6_addr_t *,
6321 6322              ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *);
6322 6323  
6323 6324          switch (name) {
6324 6325          case IP_ADD_MEMBERSHIP:
6325 6326          case IPV6_JOIN_GROUP:
6326 6327                  mcast_opt = B_FALSE;
6327 6328                  /* FALLTHROUGH */
6328 6329          case MCAST_JOIN_GROUP:
6329 6330                  fmode = MODE_IS_EXCLUDE;
6330 6331                  optfn = ip_opt_add_group;
6331 6332                  break;
6332 6333  
6333 6334          case IP_DROP_MEMBERSHIP:
6334 6335          case IPV6_LEAVE_GROUP:
6335 6336                  mcast_opt = B_FALSE;
6336 6337                  /* FALLTHROUGH */
6337 6338          case MCAST_LEAVE_GROUP:
6338 6339                  fmode = MODE_IS_INCLUDE;
6339 6340                  optfn = ip_opt_delete_group;
6340 6341                  break;
6341 6342          default:
6342 6343                  /* Should not be reached. */
6343 6344                  fmode = MODE_IS_INCLUDE;
6344 6345                  optfn = NULL;
6345 6346                  ASSERT(0);
6346 6347          }
6347 6348  
6348 6349          if (mcast_opt) {
6349 6350                  struct sockaddr_in *sin;
6350 6351                  struct sockaddr_in6 *sin6;
6351 6352  
6352 6353                  greqp = (struct group_req *)i1;
6353 6354                  if (greqp->gr_group.ss_family == AF_INET) {
6354 6355                          sin = (struct sockaddr_in *)&(greqp->gr_group);
6355 6356                          IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &v6group);
6356 6357                  } else {
6357 6358                          if (!inet6)
6358 6359                                  return (EINVAL);        /* Not on INET socket */
6359 6360  
6360 6361                          sin6 = (struct sockaddr_in6 *)&(greqp->gr_group);
6361 6362                          v6group = sin6->sin6_addr;
6362 6363                  }
6363 6364                  ifaddr = INADDR_ANY;
6364 6365                  ifindex = greqp->gr_interface;
6365 6366          } else if (inet6) {
6366 6367                  v6_mreqp = (struct ipv6_mreq *)i1;
6367 6368                  v6group = v6_mreqp->ipv6mr_multiaddr;
6368 6369                  ifaddr = INADDR_ANY;
6369 6370                  ifindex = v6_mreqp->ipv6mr_interface;
6370 6371          } else {
6371 6372                  v4_mreqp = (struct ip_mreq *)i1;
6372 6373                  IN6_INADDR_TO_V4MAPPED(&v4_mreqp->imr_multiaddr, &v6group);
6373 6374                  ifaddr = (ipaddr_t)v4_mreqp->imr_interface.s_addr;
6374 6375                  ifindex = 0;
6375 6376          }
6376 6377  
6377 6378          /*
6378 6379           * In the multirouting case, we need to replicate
6379 6380           * the request on all interfaces that will take part
6380 6381           * in replication.  We do so because multirouting is
6381 6382           * reflective, thus we will probably receive multi-
6382 6383           * casts on those interfaces.
6383 6384           * The ip_multirt_apply_membership() succeeds if
6384 6385           * the operation succeeds on at least one interface.
6385 6386           */
6386 6387          if (IN6_IS_ADDR_V4MAPPED(&v6group)) {
6387 6388                  ipaddr_t group;
6388 6389  
6389 6390                  IN6_V4MAPPED_TO_IPADDR(&v6group, group);
6390 6391  
6391 6392                  ire = ire_ftable_lookup_v4(group, IP_HOST_MASK, 0,
6392 6393                      IRE_HOST | IRE_INTERFACE, NULL, ALL_ZONES, NULL,
6393 6394                      MATCH_IRE_MASK | MATCH_IRE_TYPE, 0, ipst, NULL);
6394 6395          } else {
6395 6396                  ire = ire_ftable_lookup_v6(&v6group, &ipv6_all_ones, 0,
6396 6397                      IRE_HOST | IRE_INTERFACE, NULL, ALL_ZONES, NULL,
6397 6398                      MATCH_IRE_MASK | MATCH_IRE_TYPE, 0, ipst, NULL);
6398 6399          }
6399 6400          if (ire != NULL) {
6400 6401                  if (ire->ire_flags & RTF_MULTIRT) {
6401 6402                          error = ip_multirt_apply_membership(optfn, ire, connp,
6402 6403                              checkonly, &v6group, fmode, &ipv6_all_zeros);
6403 6404                          done = B_TRUE;
6404 6405                  }
6405 6406                  ire_refrele(ire);
6406 6407          }
6407 6408  
6408 6409          if (!done) {
6409 6410                  error = optfn(connp, checkonly, &v6group, ifaddr, ifindex,
6410 6411                      fmode, &ipv6_all_zeros);
6411 6412          }
6412 6413          return (error);
6413 6414  }
6414 6415  
6415 6416  /*
6416 6417   * Set socket options for joining and leaving multicast groups
6417 6418   * for specific sources.
6418 6419   * Common to IPv4 and IPv6; inet6 indicates the type of socket.
6419 6420   * The caller has already checked that the option name is consistent with
6420 6421   * the address family of the socket.
6421 6422   */
6422 6423  int
6423 6424  ip_opt_set_multicast_sources(conn_t *connp, t_scalar_t name,
6424 6425      uchar_t *invalp, boolean_t inet6, boolean_t checkonly)
6425 6426  {
6426 6427          int             *i1 = (int *)invalp;
6427 6428          int             error = 0;
6428 6429          ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
6429 6430          struct ip_mreq_source *imreqp;
6430 6431          struct group_source_req *gsreqp;
6431 6432          in6_addr_t v6group, v6src;
6432 6433          uint32_t ifindex;
6433 6434          ipaddr_t ifaddr;
6434 6435          boolean_t mcast_opt = B_TRUE;
6435 6436          mcast_record_t fmode;
6436 6437          ire_t *ire;
6437 6438          boolean_t done = B_FALSE;
6438 6439          int (*optfn)(conn_t *, boolean_t, const in6_addr_t *,
6439 6440              ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *);
6440 6441  
6441 6442          switch (name) {
6442 6443          case IP_BLOCK_SOURCE:
6443 6444                  mcast_opt = B_FALSE;
6444 6445                  /* FALLTHROUGH */
6445 6446          case MCAST_BLOCK_SOURCE:
6446 6447                  fmode = MODE_IS_EXCLUDE;
6447 6448                  optfn = ip_opt_add_group;
6448 6449                  break;
6449 6450  
6450 6451          case IP_UNBLOCK_SOURCE:
6451 6452                  mcast_opt = B_FALSE;
6452 6453                  /* FALLTHROUGH */
6453 6454          case MCAST_UNBLOCK_SOURCE:
6454 6455                  fmode = MODE_IS_EXCLUDE;
6455 6456                  optfn = ip_opt_delete_group;
6456 6457                  break;
6457 6458  
6458 6459          case IP_ADD_SOURCE_MEMBERSHIP:
6459 6460                  mcast_opt = B_FALSE;
6460 6461                  /* FALLTHROUGH */
6461 6462          case MCAST_JOIN_SOURCE_GROUP:
6462 6463                  fmode = MODE_IS_INCLUDE;
6463 6464                  optfn = ip_opt_add_group;
6464 6465                  break;
6465 6466  
6466 6467          case IP_DROP_SOURCE_MEMBERSHIP:
6467 6468                  mcast_opt = B_FALSE;
6468 6469                  /* FALLTHROUGH */
6469 6470          case MCAST_LEAVE_SOURCE_GROUP:
6470 6471                  fmode = MODE_IS_INCLUDE;
6471 6472                  optfn = ip_opt_delete_group;
6472 6473                  break;
6473 6474          default:
6474 6475                  /* Should not be reached. */
6475 6476                  optfn = NULL;
6476 6477                  fmode = 0;
6477 6478                  ASSERT(0);
6478 6479          }
6479 6480  
6480 6481          if (mcast_opt) {
6481 6482                  gsreqp = (struct group_source_req *)i1;
6482 6483                  ifindex = gsreqp->gsr_interface;
6483 6484                  if (gsreqp->gsr_group.ss_family == AF_INET) {
6484 6485                          struct sockaddr_in *s;
6485 6486                          s = (struct sockaddr_in *)&gsreqp->gsr_group;
6486 6487                          IN6_INADDR_TO_V4MAPPED(&s->sin_addr, &v6group);
6487 6488                          s = (struct sockaddr_in *)&gsreqp->gsr_source;
6488 6489                          IN6_INADDR_TO_V4MAPPED(&s->sin_addr, &v6src);
6489 6490                  } else {
6490 6491                          struct sockaddr_in6 *s6;
6491 6492  
6492 6493                          if (!inet6)
6493 6494                                  return (EINVAL);        /* Not on INET socket */
6494 6495  
6495 6496                          s6 = (struct sockaddr_in6 *)&gsreqp->gsr_group;
6496 6497                          v6group = s6->sin6_addr;
6497 6498                          s6 = (struct sockaddr_in6 *)&gsreqp->gsr_source;
6498 6499                          v6src = s6->sin6_addr;
6499 6500                  }
6500 6501                  ifaddr = INADDR_ANY;
6501 6502          } else {
6502 6503                  imreqp = (struct ip_mreq_source *)i1;
6503 6504                  IN6_INADDR_TO_V4MAPPED(&imreqp->imr_multiaddr, &v6group);
6504 6505                  IN6_INADDR_TO_V4MAPPED(&imreqp->imr_sourceaddr, &v6src);
6505 6506                  ifaddr = (ipaddr_t)imreqp->imr_interface.s_addr;
6506 6507                  ifindex = 0;
6507 6508          }
6508 6509  
6509 6510          /*
6510 6511           * Handle src being mapped INADDR_ANY by changing it to unspecified.
6511 6512           */
6512 6513          if (IN6_IS_ADDR_V4MAPPED_ANY(&v6src))
6513 6514                  v6src = ipv6_all_zeros;
6514 6515  
6515 6516          /*
6516 6517           * In the multirouting case, we need to replicate
6517 6518           * the request as noted in the mcast cases above.
6518 6519           */
6519 6520          if (IN6_IS_ADDR_V4MAPPED(&v6group)) {
6520 6521                  ipaddr_t group;
6521 6522  
6522 6523                  IN6_V4MAPPED_TO_IPADDR(&v6group, group);
6523 6524  
6524 6525                  ire = ire_ftable_lookup_v4(group, IP_HOST_MASK, 0,
6525 6526                      IRE_HOST | IRE_INTERFACE, NULL, ALL_ZONES, NULL,
6526 6527                      MATCH_IRE_MASK | MATCH_IRE_TYPE, 0, ipst, NULL);
6527 6528          } else {
6528 6529                  ire = ire_ftable_lookup_v6(&v6group, &ipv6_all_ones, 0,
6529 6530                      IRE_HOST | IRE_INTERFACE, NULL, ALL_ZONES, NULL,
6530 6531                      MATCH_IRE_MASK | MATCH_IRE_TYPE, 0, ipst, NULL);
6531 6532          }
6532 6533          if (ire != NULL) {
6533 6534                  if (ire->ire_flags & RTF_MULTIRT) {
6534 6535                          error = ip_multirt_apply_membership(optfn, ire, connp,
6535 6536                              checkonly, &v6group, fmode, &v6src);
6536 6537                          done = B_TRUE;
6537 6538                  }
6538 6539                  ire_refrele(ire);
6539 6540          }
6540 6541          if (!done) {
6541 6542                  error = optfn(connp, checkonly, &v6group, ifaddr, ifindex,
6542 6543                      fmode, &v6src);
6543 6544          }
6544 6545          return (error);
6545 6546  }
6546 6547  
6547 6548  /*
6548 6549   * Given a destination address and a pointer to where to put the information
6549 6550   * this routine fills in the mtuinfo.
6550 6551   * The socket must be connected.
6551 6552   * For sctp conn_faddr is the primary address.
6552 6553   */
6553 6554  int
6554 6555  ip_fill_mtuinfo(conn_t *connp, ip_xmit_attr_t *ixa, struct ip6_mtuinfo *mtuinfo)
6555 6556  {
6556 6557          uint32_t        pmtu = IP_MAXPACKET;
6557 6558          uint_t          scopeid;
6558 6559  
6559 6560          if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6))
6560 6561                  return (-1);
6561 6562  
6562 6563          /* In case we never sent or called ip_set_destination_v4/v6 */
6563 6564          if (ixa->ixa_ire != NULL)
6564 6565                  pmtu = ip_get_pmtu(ixa);
6565 6566  
6566 6567          if (ixa->ixa_flags & IXAF_SCOPEID_SET)
6567 6568                  scopeid = ixa->ixa_scopeid;
6568 6569          else
6569 6570                  scopeid = 0;
6570 6571  
6571 6572          bzero(mtuinfo, sizeof (*mtuinfo));
6572 6573          mtuinfo->ip6m_addr.sin6_family = AF_INET6;
6573 6574          mtuinfo->ip6m_addr.sin6_port = connp->conn_fport;
6574 6575          mtuinfo->ip6m_addr.sin6_addr = connp->conn_faddr_v6;
6575 6576          mtuinfo->ip6m_addr.sin6_scope_id = scopeid;
6576 6577          mtuinfo->ip6m_mtu = pmtu;
6577 6578  
6578 6579          return (sizeof (struct ip6_mtuinfo));
6579 6580  }
6580 6581  
6581 6582  /*
6582 6583   * When the src multihoming is changed from weak to [strong, preferred]
6583 6584   * ip_ire_rebind_walker is called to walk the list of all ire_t entries
6584 6585   * and identify routes that were created by user-applications in the
6585 6586   * unbound state (i.e., without RTA_IFP), and for which an ire_ill is not
6586 6587   * currently defined. These routes are then 'rebound', i.e., their ire_ill
6587 6588   * is selected by finding an interface route for the gateway.
6588 6589   */
6589 6590  /* ARGSUSED */
6590 6591  void
6591 6592  ip_ire_rebind_walker(ire_t *ire, void *notused)
6592 6593  {
6593 6594          if (!ire->ire_unbound || ire->ire_ill != NULL)
6594 6595                  return;
6595 6596          ire_rebind(ire);
6596 6597          ire_delete(ire);
6597 6598  }
6598 6599  
6599 6600  /*
6600 6601   * When the src multihoming is changed from  [strong, preferred] to weak,
6601 6602   * ip_ire_unbind_walker is called to walk the list of all ire_t entries, and
6602 6603   * set any entries that were created by user-applications in the unbound state
6603 6604   * (i.e., without RTA_IFP) back to having a NULL ire_ill.
6604 6605   */
6605 6606  /* ARGSUSED */
6606 6607  void
6607 6608  ip_ire_unbind_walker(ire_t *ire, void *notused)
6608 6609  {
6609 6610          ire_t *new_ire;
6610 6611  
6611 6612          if (!ire->ire_unbound || ire->ire_ill == NULL)
6612 6613                  return;
6613 6614          if (ire->ire_ipversion == IPV6_VERSION) {
6614 6615                  new_ire = ire_create_v6(&ire->ire_addr_v6, &ire->ire_mask_v6,
6615 6616                      &ire->ire_gateway_addr_v6, ire->ire_type, NULL,
6616 6617                      ire->ire_zoneid, ire->ire_flags, NULL, ire->ire_ipst);
6617 6618          } else {
6618 6619                  new_ire = ire_create((uchar_t *)&ire->ire_addr,
6619 6620                      (uchar_t *)&ire->ire_mask,
6620 6621                      (uchar_t *)&ire->ire_gateway_addr, ire->ire_type, NULL,
6621 6622                      ire->ire_zoneid, ire->ire_flags, NULL, ire->ire_ipst);
6622 6623          }
6623 6624          if (new_ire == NULL)
6624 6625                  return;
6625 6626          new_ire->ire_unbound = B_TRUE;
6626 6627          /*
6627 6628           * The bound ire must first be deleted so that we don't return
6628 6629           * the existing one on the attempt to add the unbound new_ire.
6629 6630           */
6630 6631          ire_delete(ire);
6631 6632          new_ire = ire_add(new_ire);
6632 6633          if (new_ire != NULL)
6633 6634                  ire_refrele(new_ire);
6634 6635  }
6635 6636  
6636 6637  /*
6637 6638   * When the settings of ip*_strict_src_multihoming tunables are changed,
6638 6639   * all cached routes need to be recomputed. This recomputation needs to be
6639 6640   * done when going from weaker to stronger modes so that the cached ire
6640 6641   * for the connection does not violate the current ip*_strict_src_multihoming
6641 6642   * setting. It also needs to be done when going from stronger to weaker modes,
6642 6643   * so that we fall back to matching on the longest-matching-route (as opposed
6643 6644   * to a shorter match that may have been selected in the strong mode
6644 6645   * to satisfy src_multihoming settings).
6645 6646   *
6646 6647   * The cached ixa_ire entires for all conn_t entries are marked as
6647 6648   * "verify" so that they will be recomputed for the next packet.
6648 6649   */
6649 6650  void
6650 6651  conn_ire_revalidate(conn_t *connp, void *arg)
6651 6652  {
6652 6653          boolean_t isv6 = (boolean_t)arg;
6653 6654  
6654 6655          if ((isv6 && connp->conn_ipversion != IPV6_VERSION) ||
6655 6656              (!isv6 && connp->conn_ipversion != IPV4_VERSION))
6656 6657                  return;
6657 6658          connp->conn_ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
6658 6659  }
6659 6660  
6660 6661  /*
6661 6662   * Handles both IPv4 and IPv6 reassembly - doing the out-of-order cases.
6662 6663   * When an ipf is passed here for the first time, if
6663 6664   * we already have in-order fragments on the queue, we convert from the fast-
6664 6665   * path reassembly scheme to the hard-case scheme.  From then on, additional
6665 6666   * fragments are reassembled here.  We keep track of the start and end offsets
6666 6667   * of each piece, and the number of holes in the chain.  When the hole count
6667 6668   * goes to zero, we are done!
6668 6669   *
6669 6670   * The ipf_count will be updated to account for any mblk(s) added (pointed to
6670 6671   * by mp) or subtracted (freeb()ed dups); upon return the caller must update
6671 6672   * ipfb_count and ill_frag_count by the difference of ipf_count before and
6672 6673   * after the call to ip_reassemble().
            *
            * Arguments:
            *   mp       the new fragment; a chain of mblks linked through b_cont,
            *            each of which is inserted (or freed) individually.
            *   ipf      reassembly state for the datagram; the pieces collected
            *            so far hang off ipf->ipf_mp->b_cont.
            *   start    offset at which the data in mp begins.
            *   more     B_FALSE when this fragment claims to be the last one.
            *   ill      used here only to bump the reassembly MIB counters.
            *   msg_len  byte count added to ipf->ipf_count for this fragment.
            *
            * Returns IP_REASS_FAILED when the fragment conflicts with a previously
            * seen last fragment, IP_REASS_PARTIAL while holes remain, and
            * IP_REASS_COMPLETE once the hole count has dropped to zero.
6673 6674   */
6674 6675  int
6675 6676  ip_reassemble(mblk_t *mp, ipf_t *ipf, uint_t start, boolean_t more, ill_t *ill,
6676 6677      size_t msg_len)
6677 6678  {
6678 6679          uint_t  end;
6679 6680          mblk_t  *next_mp;
6680 6681          mblk_t  *mp1;
6681 6682          uint_t  offset;
6682 6683          boolean_t incr_dups = B_TRUE;
6683 6684          boolean_t offset_zero_seen = B_FALSE;
6684 6685          boolean_t pkt_boundary_checked = B_FALSE;
6685 6686  
6686 6687          /* If start == 0 then ipf_nf_hdr_len has to be set. */
6687 6688          ASSERT(start != 0 || ipf->ipf_nf_hdr_len != 0);
6688 6689  
6689 6690          /* Add in byte count */
6690 6691          ipf->ipf_count += msg_len;
6691 6692          if (ipf->ipf_end) {
6692 6693                  /*
6693 6694                   * We were part way through in-order reassembly, but now there
6694 6695                   * is a hole.  We walk through messages already queued, and
6695 6696                   * mark them for hard case reassembly.  We know that up till
6696 6697                   * now they were in order starting from offset zero.
6697 6698                   */
6698 6699                  offset = 0;
6699 6700                  for (mp1 = ipf->ipf_mp->b_cont; mp1; mp1 = mp1->b_cont) {
6700 6701                          IP_REASS_SET_START(mp1, offset);
                                   /*
                                    * The first mblk's length is reduced by
                                    * ipf_nf_hdr_len so that its end offset does not
                                    * count those header bytes.
                                    */
6701 6702                          if (offset == 0) {
6702 6703                                  ASSERT(ipf->ipf_nf_hdr_len != 0);
6703 6704                                  offset = -ipf->ipf_nf_hdr_len;
6704 6705                          }
6705 6706                          offset += mp1->b_wptr - mp1->b_rptr;
6706 6707                          IP_REASS_SET_END(mp1, offset);
6707 6708                  }
6708 6709                  /* One hole at the end. */
6709 6710                  ipf->ipf_hole_cnt = 1;
6710 6711                  /* Brand it as a hard case, forever. */
6711 6712                  ipf->ipf_end = 0;
6712 6713          }
6713 6714          /* Walk through all the new pieces. */
6714 6715          do {
6715 6716                  end = start + (mp->b_wptr - mp->b_rptr);
6716 6717                  /*
6717 6718                   * If start is 0, decrease 'end' only for the first mblk of
6718 6719                   * the fragment. Otherwise 'end' can get wrong value in the
6719 6720                   * second pass of the loop if first mblk is exactly the
6720 6721                   * size of ipf_nf_hdr_len.
6721 6722                   */
6722 6723                  if (start == 0 && !offset_zero_seen) {
6723 6724                          /* First segment */
6724 6725                          ASSERT(ipf->ipf_nf_hdr_len != 0);
6725 6726                          end -= ipf->ipf_nf_hdr_len;
6726 6727                          offset_zero_seen = B_TRUE;
6727 6728                  }
                           /* Saved now because mp->b_cont is overwritten below. */
6728 6729                  next_mp = mp->b_cont;
6729 6730                  /*
6730 6731                   * We are checking to see if there is any interesting data
6731 6732                   * to process.  If there isn't and the mblk isn't the
6732 6733                   * one which carries the unfragmentable header then we
6733 6734                   * drop it.  It's possible to have just the unfragmentable
6734 6735                   * header come through without any data.  That needs to be
6735 6736                   * saved.
6736 6737                   *
6737 6738                   * If the assert at the top of this function holds then the
6738 6739                   * term "ipf->ipf_nf_hdr_len != 0" isn't needed.  This code
6739 6740                   * is infrequently traveled enough that the test is left in
6740 6741                   * to protect against future code changes which break that
6741 6742                   * invariant.
6742 6743                   */
6743 6744                  if (start == end && start != 0 && ipf->ipf_nf_hdr_len != 0) {
6744 6745                          /* Empty.  Blast it. */
6745 6746                          IP_REASS_SET_START(mp, 0);
6746 6747                          IP_REASS_SET_END(mp, 0);
6747 6748                          /*
6748 6749                           * If the ipf points to the mblk we are about to free,
6749 6750                           * update ipf to point to the next mblk (or NULL
6750 6751                           * if none).
6751 6752                           */
6752 6753                          if (ipf->ipf_mp->b_cont == mp)
6753 6754                                  ipf->ipf_mp->b_cont = next_mp;
6754 6755                          freeb(mp);
6755 6756                          continue;
6756 6757                  }
6757 6758                  mp->b_cont = NULL;
6758 6759                  IP_REASS_SET_START(mp, start);
6759 6760                  IP_REASS_SET_END(mp, end);
                           /* No tail yet: this is the first piece put on the list. */
6760 6761                  if (!ipf->ipf_tail_mp) {
6761 6762                          ipf->ipf_tail_mp = mp;
6762 6763                          ipf->ipf_mp->b_cont = mp;
6763 6764                          if (start == 0 || !more) {
6764 6765                                  ipf->ipf_hole_cnt = 1;
6765 6766                                  /*
6766 6767                                   * if the first fragment comes in more than one
6767 6768                                   * mblk, this loop will be executed for each
6768 6769                                   * mblk. Need to adjust hole count so exiting
6769 6770                                   * this routine will leave hole count at 1.
6770 6771                                   */
6771 6772                                  if (next_mp)
6772 6773                                          ipf->ipf_hole_cnt++;
6773 6774                          } else
6774 6775                                  ipf->ipf_hole_cnt = 2;
6775 6776                          continue;
6776 6777                  } else if (ipf->ipf_last_frag_seen && !more &&
6777 6778                      !pkt_boundary_checked) {
6778 6779                          /*
6779 6780                           * We check datagram boundary only if this fragment
6780 6781                           * claims to be the last fragment and we have seen a
6781 6782                           * last fragment in the past too. We do this only
6782 6783                           * once for a given fragment.
6783 6784                           *
6784 6785                           * start cannot be 0 here as fragments with start=0
6785 6786                           * and MF=0 get handled as a complete packet. These
6786 6787                           * fragments should not reach here.
6787 6788                           */
6788 6789  
6789 6790                          if (start + msgdsize(mp) !=
6790 6791                              IP_REASS_END(ipf->ipf_tail_mp)) {
6791 6792                                  /*
6792 6793                                   * We have two fragments both of which claim
6793 6794                                   * to be the last fragment but give conflicting
6794 6795                                   * information about the whole datagram size.
6795 6796                                   * Something fishy is going on. Drop the
6796 6797                                   * fragment and free up the reassembly list.
6797 6798                                   */
6798 6799                                  return (IP_REASS_FAILED);
6799 6800                          }
6800 6801  
6801 6802                          /*
6802 6803                           * We shouldn't come to this code block again for this
6803 6804                           * particular fragment.
6804 6805                           */
6805 6806                          pkt_boundary_checked = B_TRUE;
6806 6807                  }
6807 6808  
6808 6809                  /* New stuff at or beyond tail? */
6809 6810                  offset = IP_REASS_END(ipf->ipf_tail_mp);
6810 6811                  if (start >= offset) {
6811 6812                          if (ipf->ipf_last_frag_seen) {
6812 6813                                  /* current fragment is beyond last fragment */
6813 6814                                  return (IP_REASS_FAILED);
6814 6815                          }
6815 6816                          /* Link it on end. */
6816 6817                          ipf->ipf_tail_mp->b_cont = mp;
6817 6818                          ipf->ipf_tail_mp = mp;
6818 6819                          if (more) {
6819 6820                                  if (start != offset)
6820 6821                                          ipf->ipf_hole_cnt++;
6821 6822                          } else if (start == offset && next_mp == NULL)
6822 6823                                          ipf->ipf_hole_cnt--;
6823 6824                          continue;
6824 6825                  }
6825 6826                  mp1 = ipf->ipf_mp->b_cont;
6826 6827                  offset = IP_REASS_START(mp1);
6827 6828                  /* New stuff at the front? */
6828 6829                  if (start < offset) {
6829 6830                          if (start == 0) {
6830 6831                                  if (end >= offset) {
6831 6832                                          /* Nailed the hole at the beginning. */
6832 6833                                          ipf->ipf_hole_cnt--;
6833 6834                                  }
6834 6835                          } else if (end < offset) {
6835 6836                                  /*
6836 6837                                   * A hole, stuff, and a hole where there used
6837 6838                                   * to be just a hole.
6838 6839                                   */
6839 6840                                  ipf->ipf_hole_cnt++;
6840 6841                          }
6841 6842                          mp->b_cont = mp1;
6842 6843                          /* Check for overlap. */
6843 6844                          while (end > offset) {
6844 6845                                  if (end < IP_REASS_END(mp1)) {
                                                   /* Partial overlap: trim our tail. */
6845 6846                                          mp->b_wptr -= end - offset;
6846 6847                                          IP_REASS_SET_END(mp, offset);
6847 6848                                          BUMP_MIB(ill->ill_ip_mib,
6848 6849                                              ipIfStatsReasmPartDups);
6849 6850                                          break;
6850 6851                                  }
6851 6852                                  /* Did we cover another hole? */
6852 6853                                  if ((mp1->b_cont &&
6853 6854                                      IP_REASS_END(mp1) !=
6854 6855                                      IP_REASS_START(mp1->b_cont) &&
6855 6856                                      end >= IP_REASS_START(mp1->b_cont)) ||
6856 6857                                      (!ipf->ipf_last_frag_seen && !more)) {
6857 6858                                          ipf->ipf_hole_cnt--;
6858 6859                                  }
6859 6860                                  /* Clip out mp1. */
6860 6861                                  if ((mp->b_cont = mp1->b_cont) == NULL) {
6861 6862                                          /*
6862 6863                                           * After clipping out mp1, this guy
6863 6864                                           * is now hanging off the end.
6864 6865                                           */
6865 6866                                          ipf->ipf_tail_mp = mp;
6866 6867                                  }
6867 6868                                  IP_REASS_SET_START(mp1, 0);
6868 6869                                  IP_REASS_SET_END(mp1, 0);
6869 6870                                  /* Subtract byte count */
6870 6871                                  ipf->ipf_count -= mp1->b_datap->db_lim -
6871 6872                                      mp1->b_datap->db_base;
6872 6873                                  freeb(mp1);
6873 6874                                  BUMP_MIB(ill->ill_ip_mib,
6874 6875                                      ipIfStatsReasmPartDups);
6875 6876                                  mp1 = mp->b_cont;
6876 6877                                  if (!mp1)
6877 6878                                          break;
6878 6879                                  offset = IP_REASS_START(mp1);
6879 6880                          }
                                   /* mp is the new head of the list of pieces. */
6880 6881                          ipf->ipf_mp->b_cont = mp;
6881 6882                          continue;
6882 6883                  }
6883 6884                  /*
6884 6885                   * The new piece starts somewhere between the start of the head
6885 6886                   * and before the end of the tail.
6886 6887                   */
6887 6888                  for (; mp1; mp1 = mp1->b_cont) {
6888 6889                          offset = IP_REASS_END(mp1);
6889 6890                          if (start < offset) {
6890 6891                                  if (end <= offset) {
6891 6892                                          /* Nothing new. */
6892 6893                                          IP_REASS_SET_START(mp, 0);
6893 6894                                          IP_REASS_SET_END(mp, 0);
6894 6895                                          /* Subtract byte count */
6895 6896                                          ipf->ipf_count -= mp->b_datap->db_lim -
6896 6897                                              mp->b_datap->db_base;
                                                   /* Count at most one dup per call. */
6897 6898                                          if (incr_dups) {
6898 6899                                                  ipf->ipf_num_dups++;
6899 6900                                                  incr_dups = B_FALSE;
6900 6901                                          }
6901 6902                                          freeb(mp);
6902 6903                                          BUMP_MIB(ill->ill_ip_mib,
6903 6904                                              ipIfStatsReasmDuplicates);
6904 6905                                          break;
6905 6906                                  }
6906 6907                                  /*
6907 6908                                   * Trim redundant stuff off beginning of new
6908 6909                                   * piece.
6909 6910                                   */
6910 6911                                  IP_REASS_SET_START(mp, offset);
6911 6912                                  mp->b_rptr += offset - start;
6912 6913                                  BUMP_MIB(ill->ill_ip_mib,
6913 6914                                      ipIfStatsReasmPartDups);
6914 6915                                  start = offset;
6915 6916                                  if (!mp1->b_cont) {
6916 6917                                          /*
6917 6918                                           * After trimming, this guy is now
6918 6919                                           * hanging off the end.
6919 6920                                           */
6920 6921                                          mp1->b_cont = mp;
6921 6922                                          ipf->ipf_tail_mp = mp;
6922 6923                                          if (!more) {
6923 6924                                                  ipf->ipf_hole_cnt--;
6924 6925                                          }
6925 6926                                          break;
6926 6927                                  }
6927 6928                          }
                                   /* Not before the next piece yet; keep walking. */
6928 6929                          if (start >= IP_REASS_START(mp1->b_cont))
6929 6930                                  continue;
6930 6931                          /* Fill a hole */
6931 6932                          if (start > offset)
6932 6933                                  ipf->ipf_hole_cnt++;
6933 6934                          mp->b_cont = mp1->b_cont;
6934 6935                          mp1->b_cont = mp;
6935 6936                          mp1 = mp->b_cont;
6936 6937                          offset = IP_REASS_START(mp1);
6937 6938                          if (end >= offset) {
6938 6939                                  ipf->ipf_hole_cnt--;
6939 6940                                  /* Check for overlap. */
6940 6941                                  while (end > offset) {
6941 6942                                          if (end < IP_REASS_END(mp1)) {
6942 6943                                                  mp->b_wptr -= end - offset;
6943 6944                                                  IP_REASS_SET_END(mp, offset);
6944 6945                                                  /*
6945 6946                                                   * TODO we might bump
6946 6947                                                   * this up twice if there is
6947 6948                                                   * overlap at both ends.
6948 6949                                                   */
6949 6950                                                  BUMP_MIB(ill->ill_ip_mib,
6950 6951                                                      ipIfStatsReasmPartDups);
6951 6952                                                  break;
6952 6953                                          }
6953 6954                                          /* Did we cover another hole? */
6954 6955                                          if ((mp1->b_cont &&
6955 6956                                              IP_REASS_END(mp1)
6956 6957                                              != IP_REASS_START(mp1->b_cont) &&
6957 6958                                              end >=
6958 6959                                              IP_REASS_START(mp1->b_cont)) ||
6959 6960                                              (!ipf->ipf_last_frag_seen &&
6960 6961                                              !more)) {
6961 6962                                                  ipf->ipf_hole_cnt--;
6962 6963                                          }
6963 6964                                          /* Clip out mp1. */
6964 6965                                          if ((mp->b_cont = mp1->b_cont) ==
6965 6966                                              NULL) {
6966 6967                                                  /*
6967 6968                                                   * After clipping out mp1,
6968 6969                                                   * this guy is now hanging
6969 6970                                                   * off the end.
6970 6971                                                   */
6971 6972                                                  ipf->ipf_tail_mp = mp;
6972 6973                                          }
6973 6974                                          IP_REASS_SET_START(mp1, 0);
6974 6975                                          IP_REASS_SET_END(mp1, 0);
6975 6976                                          /* Subtract byte count */
6976 6977                                          ipf->ipf_count -=
6977 6978                                              mp1->b_datap->db_lim -
6978 6979                                              mp1->b_datap->db_base;
6979 6980                                          freeb(mp1);
6980 6981                                          BUMP_MIB(ill->ill_ip_mib,
6981 6982                                              ipIfStatsReasmPartDups);
6982 6983                                          mp1 = mp->b_cont;
6983 6984                                          if (!mp1)
6984 6985                                                  break;
6985 6986                                          offset = IP_REASS_START(mp1);
6986 6987                                  }
6987 6988                          }
6988 6989                          break;
6989 6990                  }
                   /*
                    * Move on to the next mblk of this fragment, if any; it starts
                    * where the one just handled ended.
                    */
6990 6991          } while (start = end, mp = next_mp);
6991 6992  
6992 6993          /* Fragment just processed could be the last one. Remember this fact */
6993 6994          if (!more)
6994 6995                  ipf->ipf_last_frag_seen = B_TRUE;
6995 6996  
6996 6997          /* Still got holes? */
6997 6998          if (ipf->ipf_hole_cnt)
6998 6999                  return (IP_REASS_PARTIAL);
6999 7000          /* Clean up overloaded fields to avoid upstream disasters. */
7000 7001          for (mp1 = ipf->ipf_mp->b_cont; mp1; mp1 = mp1->b_cont) {
7001 7002                  IP_REASS_SET_START(mp1, 0);
7002 7003                  IP_REASS_SET_END(mp1, 0);
7003 7004          }
7004 7005          return (IP_REASS_COMPLETE);
7005 7006  }
7006 7007  
7007 7008  /*
7008 7009   * Fragmentation reassembly.  Each ILL has a hash table for
7009 7010   * queuing packets undergoing reassembly for all IPIFs
7010 7011   * associated with the ILL.  The hash is based on the packet
7011 7012   * source address and IP ident field.  The ILL frag hash table was allocated
7012 7013   * as a timer block at the time the ILL was created.  Whenever
7013 7014   * there is anything on the reassembly queue, the timer will
7014 7015   * be running.  Returns the reassembled packet if reassembly completes.
            * Returns mp itself if it does not carry a fragment, and NULL if mp was
            * queued for later reassembly or dropped (mp is consumed in both cases).
            * On completion ira_pktlen and ira_ip_hdr_length are updated in ira.
7015 7016   */
7016 7017  mblk_t *
7017 7018  ip_input_fragment(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
7018 7019  {
7019 7020          uint32_t        frag_offset_flags;
7020 7021          mblk_t          *t_mp;
7021 7022          ipaddr_t        dst;
7022 7023          uint8_t         proto = ipha->ipha_protocol;
7023 7024          uint32_t        sum_val;
7024 7025          uint16_t        sum_flags;
7025 7026          ipf_t           *ipf;
7026 7027          ipf_t           **ipfp;
7027 7028          ipfb_t          *ipfb;
7028 7029          uint16_t        ident;
7029 7030          uint32_t        offset;
7030 7031          ipaddr_t        src;
7031 7032          uint_t          hdr_length;
7032 7033          uint32_t        end;
7033 7034          mblk_t          *mp1;
7034 7035          mblk_t          *tail_mp;
7035 7036          size_t          count;
7036 7037          size_t          msg_len;
7037 7038          uint8_t         ecn_info = 0;
7038 7039          uint32_t        packet_size;
7039 7040          boolean_t       pruned = B_FALSE;
7040 7041          ill_t           *ill = ira->ira_ill;
7041 7042          ip_stack_t      *ipst = ill->ill_ipst;
7042 7043  
7043 7044          /*
7044 7045           * Drop the fragment as early as possible, if
7045 7046           * we don't have resource(s) to re-assemble.
7046 7047           */
7047 7048          if (ipst->ips_ip_reass_queue_bytes == 0) {
7048 7049                  freemsg(mp);
7049 7050                  return (NULL);
7050 7051          }
7051 7052  
7052 7053          /* Check for fragmentation offset; return if there's none */
7053 7054          if ((frag_offset_flags = ntohs(ipha->ipha_fragment_offset_and_flags) &
7054 7055              (IPH_MF | IPH_OFFSET)) == 0)
7055 7056                  return (mp);
7056 7057  
7057 7058          /*
7058 7059           * We utilize hardware computed checksum info only for UDP since
7059 7060           * IP fragmentation is a normal occurrence for the protocol.  In
7060 7061           * addition, checksum offload support for IP fragments carrying
7061 7062           * UDP payload is commonly implemented across network adapters.
7062 7063           */
7063 7064          ASSERT(ira->ira_rill != NULL);
7064 7065          if (proto == IPPROTO_UDP && dohwcksum &&
7065 7066              ILL_HCKSUM_CAPABLE(ira->ira_rill) &&
7066 7067              (DB_CKSUMFLAGS(mp) & (HCK_FULLCKSUM | HCK_PARTIALCKSUM))) {
7067 7068                  mblk_t *mp1 = mp->b_cont;
7068 7069                  int32_t len;
7069 7070  
7070 7071                  /* Record checksum information from the packet */
7071 7072                  sum_val = (uint32_t)DB_CKSUM16(mp);
7072 7073                  sum_flags = DB_CKSUMFLAGS(mp);
7073 7074  
7074 7075                  /* IP payload offset from beginning of mblk */
7075 7076                  offset = ((uchar_t *)ipha + IPH_HDR_LENGTH(ipha)) - mp->b_rptr;
7076 7077  
7077 7078                  if ((sum_flags & HCK_PARTIALCKSUM) &&
7078 7079                      (mp1 == NULL || mp1->b_cont == NULL) &&
7079 7080                      offset >= DB_CKSUMSTART(mp) &&
7080 7081                      ((len = offset - DB_CKSUMSTART(mp)) & 1) == 0) {
7081 7082                          uint32_t adj;
7082 7083                          /*
7083 7084                           * Partial checksum has been calculated by hardware
7084 7085                           * and attached to the packet; in addition, any
7085 7086                           * prepended extraneous data is even byte aligned.
7086 7087                           * If any such data exists, we adjust the checksum;
7087 7088                           * this would also handle any postpended data.
7088 7089                           */
7089 7090                          IP_ADJCKSUM_PARTIAL(mp->b_rptr + DB_CKSUMSTART(mp),
7090 7091                              mp, mp1, len, adj);
7091 7092  
7092 7093                          /* One's complement subtract extraneous checksum */
7093 7094                          if (adj >= sum_val)
7094 7095                                  sum_val = ~(adj - sum_val) & 0xFFFF;
7095 7096                          else
7096 7097                                  sum_val -= adj;
7097 7098                  }
7098 7099          } else {
7099 7100                  sum_val = 0;
7100 7101                  sum_flags = 0;
7101 7102          }
7102 7103  
7103 7104          /* Clear hardware checksumming flag */
7104 7105          DB_CKSUMFLAGS(mp) = 0;
7105 7106  
7106 7107          ident = ipha->ipha_ident;
7107 7108          offset = (frag_offset_flags << 3) & 0xFFFF;
7108 7109          src = ipha->ipha_src;
7109 7110          dst = ipha->ipha_dst;
7110 7111          hdr_length = IPH_HDR_LENGTH(ipha);
7111 7112          end = ntohs(ipha->ipha_length) - hdr_length;
7112 7113  
7113 7114          /* If end == 0 then we have a packet with no data, so just free it */
7114 7115          if (end == 0) {
7115 7116                  freemsg(mp);
7116 7117                  return (NULL);
7117 7118          }
7118 7119  
7119 7120          /* Record the ECN field info. */
7120 7121          ecn_info = (ipha->ipha_type_of_service & 0x3);
7121 7122          if (offset != 0) {
7122 7123                  /*
7123 7124                   * If this isn't the first piece, strip the header, and
7124 7125                   * add the offset to the end value.
7125 7126                   */
7126 7127                  mp->b_rptr += hdr_length;
7127 7128                  end += offset;
7128 7129          }
7129 7130  
7130 7131          /* Handle vnic loopback of fragments: mblks with db_ref > 2 are not counted */
7131 7132          if (mp->b_datap->db_ref > 2)
7132 7133                  msg_len = 0;
7133 7134          else
7134 7135                  msg_len = MBLKSIZE(mp);
7135 7136  
7136 7137          tail_mp = mp;
7137 7138          while (tail_mp->b_cont != NULL) {
7138 7139                  tail_mp = tail_mp->b_cont;
7139 7140                  if (tail_mp->b_datap->db_ref <= 2)
7140 7141                          msg_len += MBLKSIZE(tail_mp);
7141 7142          }
7142 7143  
7143 7144          /* If the reassembly list for this ILL will get too big, prune it */
7144 7145          if ((msg_len + sizeof (*ipf) + ill->ill_frag_count) >=
7145 7146              ipst->ips_ip_reass_queue_bytes) {
7146 7147                  DTRACE_PROBE3(ip_reass_queue_bytes, uint_t, msg_len,
7147 7148                      uint_t, ill->ill_frag_count,
7148 7149                      uint_t, ipst->ips_ip_reass_queue_bytes);
7149 7150                  ill_frag_prune(ill,
7150 7151                      (ipst->ips_ip_reass_queue_bytes < msg_len) ? 0 :
7151 7152                      (ipst->ips_ip_reass_queue_bytes - msg_len));
7152 7153                  pruned = B_TRUE;
7153 7154          }
7154 7155  
7155 7156          ipfb = &ill->ill_frag_hash_tbl[ILL_FRAG_HASH(src, ident)];
7156 7157          mutex_enter(&ipfb->ipfb_lock);
7157 7158  
7158 7159          ipfp = &ipfb->ipfb_ipf;
7159 7160          /* Try to find an existing fragment queue for this packet. */
7160 7161          for (;;) {
7161 7162                  ipf = ipfp[0];
7162 7163                  if (ipf != NULL) {
7163 7164                          /*
7164 7165                           * It has to match on ident, src/dst address and protocol.
7165 7166                           */
7166 7167                          if (ipf->ipf_ident == ident &&
7167 7168                              ipf->ipf_src == src &&
7168 7169                              ipf->ipf_dst == dst &&
7169 7170                              ipf->ipf_protocol == proto) {
7170 7171                                  /*
7171 7172                                   * If we have received too many
7172 7173                                   * duplicate fragments for this packet
7173 7174                                   * free it.
7174 7175                                   */
7175 7176                                  if (ipf->ipf_num_dups > ip_max_frag_dups) {
7176 7177                                          ill_frag_free_pkts(ill, ipfb, ipf, 1);
7177 7178                                          freemsg(mp);
7178 7179                                          mutex_exit(&ipfb->ipfb_lock);
7179 7180                                          return (NULL);
7180 7181                                  }
7181 7182                                  /* Found it. */
7182 7183                                  break;
7183 7184                          }
7184 7185                          ipfp = &ipf->ipf_hash_next;
7185 7186                          continue;
7186 7187                  }
7187 7188  
7188 7189                  /*
7189 7190                   * If we pruned the list, do we want to store this new
7190 7191                   * fragment? We apply an optimization here based on the
7191 7192                   * fact that most fragments will be received in order.
7192 7193                   * So if the offset of this incoming fragment is zero,
7193 7194                   * it is the first fragment of a new packet. We will
7194 7195                   * keep it.  Otherwise drop the fragment, as we have
7195 7196                   * probably pruned the packet already (since the
7196 7197                   * packet cannot be found).
7197 7198                   */
7198 7199                  if (pruned && offset != 0) {
7199 7200                          mutex_exit(&ipfb->ipfb_lock);
7200 7201                          freemsg(mp);
7201 7202                          return (NULL);
7202 7203                  }
7203 7204  
7204 7205                  if (ipfb->ipfb_frag_pkts >= MAX_FRAG_PKTS(ipst))  {
7205 7206                          /*
7206 7207                           * Too many fragmented packets in this hash
7207 7208                           * bucket. Free the oldest.
7208 7209                           */
7209 7210                          ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf, 1);
7210 7211                  }
7211 7212  
7212 7213                  /* New guy.  Allocate a frag message. */
7213 7214                  mp1 = allocb(sizeof (*ipf), BPRI_MED);
7214 7215                  if (mp1 == NULL) {
7215 7216                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
7216 7217                          ip_drop_input("ipIfStatsInDiscards", mp, ill);
7217 7218                          freemsg(mp);
7218 7219  reass_done:
7219 7220                          mutex_exit(&ipfb->ipfb_lock);
7220 7221                          return (NULL);
7221 7222                  }
7222 7223  
7223 7224                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmReqds);
7224 7225                  mp1->b_cont = mp;
7225 7226  
7226 7227                  /* Initialize the fragment header. */
7227 7228                  ipf = (ipf_t *)mp1->b_rptr;
7228 7229                  ipf->ipf_mp = mp1;
7229 7230                  ipf->ipf_ptphn = ipfp;
7230 7231                  ipfp[0] = ipf;
7231 7232                  ipf->ipf_hash_next = NULL;
7232 7233                  ipf->ipf_ident = ident;
7233 7234                  ipf->ipf_protocol = proto;
7234 7235                  ipf->ipf_src = src;
7235 7236                  ipf->ipf_dst = dst;
7236 7237                  ipf->ipf_nf_hdr_len = 0;
7237 7238                  /* Record reassembly start time. */
7238 7239                  ipf->ipf_timestamp = gethrestime_sec();
7239 7240                  /* Record ipf generation and account for frag header */
7240 7241                  ipf->ipf_gen = ill->ill_ipf_gen++;
7241 7242                  ipf->ipf_count = MBLKSIZE(mp1);
7242 7243                  ipf->ipf_last_frag_seen = B_FALSE;
7243 7244                  ipf->ipf_ecn = ecn_info;
7244 7245                  ipf->ipf_num_dups = 0;
7245 7246                  ipfb->ipfb_frag_pkts++;
7246 7247                  ipf->ipf_checksum = 0;
7247 7248                  ipf->ipf_checksum_flags = 0;
7248 7249  
7249 7250                  /* Store checksum value in fragment header */
7250 7251                  if (sum_flags != 0) {
7251 7252                          sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
7252 7253                          sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
7253 7254                          ipf->ipf_checksum = sum_val;
7254 7255                          ipf->ipf_checksum_flags = sum_flags;
7255 7256                  }
7256 7257  
7257 7258                  /*
7258 7259                   * We handle reassembly two ways.  In the easy case,
7259 7260                   * where all the fragments show up in order, we do
7260 7261                   * minimal bookkeeping, and just clip new pieces on
7261 7262                   * the end.  If we ever see a hole, then we go off
7262 7263                   * to ip_reassemble which has to mark the pieces and
7263 7264                   * keep track of the number of holes, etc.  Obviously,
7264 7265                   * the point of having both mechanisms is so we can
7265 7266                   * handle the easy case as efficiently as possible.
7266 7267                   */
7267 7268                  if (offset == 0) {
7268 7269                          /* Easy case, in-order reassembly so far. */
7269 7270                          ipf->ipf_count += msg_len;
7270 7271                          ipf->ipf_tail_mp = tail_mp;
7271 7272                          /*
7272 7273                           * Keep track of next expected offset in
7273 7274                           * ipf_end.
7274 7275                           */
7275 7276                          ipf->ipf_end = end;
7276 7277                          ipf->ipf_nf_hdr_len = hdr_length;
7277 7278                  } else {
7278 7279                          /* Hard case, hole at the beginning. */
7279 7280                          ipf->ipf_tail_mp = NULL;
7280 7281                          /*
7281 7282                           * ipf_end == 0 means that we have given up
7282 7283                           * on easy reassembly.
7283 7284                           */
7284 7285                          ipf->ipf_end = 0;
7285 7286  
7286 7287                          /* Forget checksum offload from now on */
7287 7288                          ipf->ipf_checksum_flags = 0;
7288 7289  
7289 7290                          /*
7290 7291                           * ipf_hole_cnt is set by ip_reassemble.
7291 7292                           * ipf_count is updated by ip_reassemble.
7292 7293                           * No need to check for return value here
7293 7294                           * as we don't expect reassembly to complete
7294 7295                           * or fail for the first fragment itself.
7295 7296                           */
7296 7297                          (void) ip_reassemble(mp, ipf,
7297 7298                              (frag_offset_flags & IPH_OFFSET) << 3,
7298 7299                              (frag_offset_flags & IPH_MF), ill, msg_len);
7299 7300                  }
7300 7301                  /* Update per ipfb and ill byte counts */
7301 7302                  ipfb->ipfb_count += ipf->ipf_count;
7302 7303                  ASSERT(ipfb->ipfb_count > 0);   /* Wraparound */
7303 7304                  atomic_add_32(&ill->ill_frag_count, ipf->ipf_count);
7304 7305                  /* If the frag timer wasn't already going, start it. */
7305 7306                  mutex_enter(&ill->ill_lock);
7306 7307                  ill_frag_timer_start(ill);
7307 7308                  mutex_exit(&ill->ill_lock);
7308 7309                  goto reass_done;
7309 7310          }
7310 7311  
7311 7312          /*
7312 7313           * If the packet's checksum flags have changed (it could be coming up
7313 7314           * from an interface different than the previous, therefore
7314 7315           * possibly different checksum capability), then forget about
7315 7316           * any stored checksum states.  Otherwise add the value to
7316 7317           * the existing one stored in the fragment header.
7317 7318           */
7318 7319          if (sum_flags != 0 && sum_flags == ipf->ipf_checksum_flags) {
7319 7320                  sum_val += ipf->ipf_checksum;
7320 7321                  sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
7321 7322                  sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
7322 7323                  ipf->ipf_checksum = sum_val;
7323 7324          } else if (ipf->ipf_checksum_flags != 0) {
7324 7325                  /* Forget checksum offload from now on */
7325 7326                  ipf->ipf_checksum_flags = 0;
7326 7327          }
7327 7328  
7328 7329          /*
7329 7330           * We have a new piece of a datagram which is already being
7330 7331           * reassembled.  Update the ECN info if all IP fragments
7331 7332           * are ECN capable.  If there is one which is not, clear
7332 7333           * all the info.  If there is at least one which has CE
7333 7334           * code point, IP needs to report that up to transport.
7334 7335           */
7335 7336          if (ecn_info != IPH_ECN_NECT && ipf->ipf_ecn != IPH_ECN_NECT) {
7336 7337                  if (ecn_info == IPH_ECN_CE)
7337 7338                          ipf->ipf_ecn = IPH_ECN_CE;
7338 7339          } else {
7339 7340                  ipf->ipf_ecn = IPH_ECN_NECT;
7340 7341          }
7341 7342          if (offset && ipf->ipf_end == offset) {
7342 7343                  /* The new fragment fits at the end */
7343 7344                  ipf->ipf_tail_mp->b_cont = mp;
7344 7345                  /* Update the byte count */
7345 7346                  ipf->ipf_count += msg_len;
7346 7347                  /* Update per ipfb and ill byte counts */
7347 7348                  ipfb->ipfb_count += msg_len;
7348 7349                  ASSERT(ipfb->ipfb_count > 0);   /* Wraparound */
7349 7350                  atomic_add_32(&ill->ill_frag_count, msg_len);
7350 7351                  if (frag_offset_flags & IPH_MF) {
7351 7352                          /* More to come. */
7352 7353                          ipf->ipf_end = end;
7353 7354                          ipf->ipf_tail_mp = tail_mp;
7354 7355                          goto reass_done;
7355 7356                  }
7356 7357          } else {
7357 7358                  /* Go do the hard cases. */
7358 7359                  int ret;
7359 7360  
7360 7361                  if (offset == 0)
7361 7362                          ipf->ipf_nf_hdr_len = hdr_length;
7362 7363  
7363 7364                  /* Save current byte count */
7364 7365                  count = ipf->ipf_count;
7365 7366                  ret = ip_reassemble(mp, ipf,
7366 7367                      (frag_offset_flags & IPH_OFFSET) << 3,
7367 7368                      (frag_offset_flags & IPH_MF), ill, msg_len);
7368 7369                  /* Count of bytes added and subtracted (freeb()ed) */
7369 7370                  count = ipf->ipf_count - count;
7370 7371                  if (count) {
7371 7372                          /* Update per ipfb and ill byte counts */
7372 7373                          ipfb->ipfb_count += count;
7373 7374                          ASSERT(ipfb->ipfb_count > 0); /* Wraparound */
7374 7375                          atomic_add_32(&ill->ill_frag_count, count);
7375 7376                  }
7376 7377                  if (ret == IP_REASS_PARTIAL) {
7377 7378                          goto reass_done;
7378 7379                  } else if (ret == IP_REASS_FAILED) {
7379 7380                          /* Reassembly failed. Free up all resources */
7380 7381                          ill_frag_free_pkts(ill, ipfb, ipf, 1);
7381 7382                          for (t_mp = mp; t_mp != NULL; t_mp = t_mp->b_cont) {
7382 7383                                  IP_REASS_SET_START(t_mp, 0);
7383 7384                                  IP_REASS_SET_END(t_mp, 0);
7384 7385                          }
7385 7386                          freemsg(mp);
7386 7387                          goto reass_done;
7387 7388                  }
7388 7389                  /* We will reach here iff 'ret' is IP_REASS_COMPLETE */
7389 7390          }
7390 7391          /*
7391 7392           * We have completed reassembly.  Unhook the frag header from
7392 7393           * the reassembly list.
7393 7394           *
7394 7395           * Before we free the frag header, record the ECN info
7395 7396           * to report back to the transport.
7396 7397           */
7397 7398          ecn_info = ipf->ipf_ecn;
7398 7399          BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmOKs);
7399 7400          ipfp = ipf->ipf_ptphn;
7400 7401  
7401 7402          /* We need to supply these to caller */
7402 7403          if ((sum_flags = ipf->ipf_checksum_flags) != 0)
7403 7404                  sum_val = ipf->ipf_checksum;
7404 7405          else
7405 7406                  sum_val = 0;
7406 7407  
7407 7408          mp1 = ipf->ipf_mp;
7408 7409          count = ipf->ipf_count;
7409 7410          ipf = ipf->ipf_hash_next;
7410 7411          if (ipf != NULL)
7411 7412                  ipf->ipf_ptphn = ipfp;
7412 7413          ipfp[0] = ipf;
7413 7414          atomic_add_32(&ill->ill_frag_count, -count);
7414 7415          ASSERT(ipfb->ipfb_count >= count);
7415 7416          ipfb->ipfb_count -= count;
7416 7417          ipfb->ipfb_frag_pkts--;
7417 7418          mutex_exit(&ipfb->ipfb_lock);
7418 7419          /* Ditch the frag header. */
7419 7420          mp = mp1->b_cont;
7420 7421  
7421 7422          freeb(mp1);
7422 7423  
7423 7424          /* Restore original IP length in header. */
7424 7425          packet_size = (uint32_t)msgdsize(mp);
7425 7426          if (packet_size > IP_MAXPACKET) {
7426 7427                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
7427 7428                  ip_drop_input("Reassembled packet too large", mp, ill);
7428 7429                  freemsg(mp);
7429 7430                  return (NULL);
7430 7431          }
7431 7432  
7432 7433          if (DB_REF(mp) > 1) {
7433 7434                  mblk_t *mp2 = copymsg(mp);
7434 7435  
7435 7436                  if (mp2 == NULL) {
7436 7437                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
7437 7438                          ip_drop_input("ipIfStatsInDiscards", mp, ill);
7438 7439                          freemsg(mp);
7439 7440                          return (NULL);
7440 7441                  }
7441 7442                  freemsg(mp);
7442 7443                  mp = mp2;
7443 7444          }
7444 7445          ipha = (ipha_t *)mp->b_rptr;
7445 7446  
7446 7447          ipha->ipha_length = htons((uint16_t)packet_size);
7447 7448          /* We're now complete, zip the frag state */
7448 7449          ipha->ipha_fragment_offset_and_flags = 0;
7449 7450          /* Record the ECN info. */
7450 7451          ipha->ipha_type_of_service &= 0xFC;
7451 7452          ipha->ipha_type_of_service |= ecn_info;
7452 7453  
7453 7454          /* Update the receive attributes */
7454 7455          ira->ira_pktlen = packet_size;
7455 7456          ira->ira_ip_hdr_length = IPH_HDR_LENGTH(ipha);
7456 7457  
7457 7458          /* Reassembly is successful; set checksum information in packet */
7458 7459          DB_CKSUM16(mp) = (uint16_t)sum_val;
7459 7460          DB_CKSUMFLAGS(mp) = sum_flags;
7460 7461          DB_CKSUMSTART(mp) = ira->ira_ip_hdr_length;
7461 7462  
7462 7463          return (mp);
7463 7464  }
7464 7465  
7465 7466  /*
7466 7467   * Pullup function that should be used for IP input in order to
7467 7468   * ensure we do not lose the L2 source address; we need the L2 source
7468 7469   * address for IP_RECVSLLA and for ndp_input.
7469 7470   *
7470 7471   * We return either NULL (pullupmsg() failed; mp is not freed here) or b_rptr.
            * The L2 source address is recorded in ira (via ip_setl2src) before the
            * pullup is attempted.
7471 7472   */
7472 7473  void *
7473 7474  ip_pullup(mblk_t *mp, ssize_t len, ip_recv_attr_t *ira)
7474 7475  {
7475 7476          ill_t           *ill = ira->ira_ill;
7476 7477  
7477 7478          if (ip_rput_pullups++ == 0) {
7478 7479                  (void) mi_strlog(ill->ill_rq, 1, SL_ERROR|SL_TRACE,
7479 7480                      "ip_pullup: %s forced us to "
7480 7481                      " pullup pkt, hdr len %ld, hdr addr %p",
7481 7482                      ill->ill_name, len, (void *)mp->b_rptr);
7482 7483          }
7483 7484          if (!(ira->ira_flags & IRAF_L2SRC_SET))
7484 7485                  ip_setl2src(mp, ira, ira->ira_rill);
7485 7486          ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
7486 7487          if (!pullupmsg(mp, len))
7487 7488                  return (NULL);
7488 7489          else
7489 7490                  return (mp->b_rptr);
7490 7491  }
7491 7492  
7492 7493  /*
7493 7494   * Make sure ira_l2src has an address. If we don't have one fill with zeros.
7494 7495   * When called from the ULP ira_rill will be NULL hence the caller has to
7495 7496   * pass in the ill.
            * The address comes from ira_mhip->mhi_saddr when available, else from
            * ill_phys_addr when IRAF_L2SRC_LOOPBACK is set.  IRAF_L2SRC_SET is set on
            * return; if it was already set on entry this is a no-op.
7496 7497   */
7497 7498  /* ARGSUSED */
7498 7499  void
7499 7500  ip_setl2src(mblk_t *mp, ip_recv_attr_t *ira, ill_t *ill)
7500 7501  {
7501 7502          const uchar_t *addr;
7502 7503          int alen;
7503 7504  
7504 7505          if (ira->ira_flags & IRAF_L2SRC_SET)
7505 7506                  return;
7506 7507  
7507 7508          ASSERT(ill != NULL);
7508 7509          alen = ill->ill_phys_addr_length;
7509 7510          ASSERT(alen <= sizeof (ira->ira_l2src));
7510 7511          if (ira->ira_mhip != NULL &&
7511 7512              (addr = ira->ira_mhip->mhi_saddr) != NULL) {
7512 7513                  bcopy(addr, ira->ira_l2src, alen);
7513 7514          } else if ((ira->ira_flags & IRAF_L2SRC_LOOPBACK) &&
7514 7515              (addr = ill->ill_phys_addr) != NULL) {
7515 7516                  bcopy(addr, ira->ira_l2src, alen);
7516 7517          } else {
7517 7518                  bzero(ira->ira_l2src, alen);
7518 7519          }
7519 7520          ira->ira_flags |= IRAF_L2SRC_SET;
7520 7521  }
7521 7522  
7522 7523  /*
7523 7524   * Check ip header length and align it.
            * Makes min_size bytes of header available at mp->b_rptr via ip_pullup().
            * A zero-length leading mblk is freed first and the next mblk becomes the
            * head; the pullup is skipped if that mblk is already 32-bit aligned and
            * holds at least min_size bytes.  Returns the (possibly new) head mblk, or
            * NULL if the message was dropped and freed or nothing remained.
7524 7525   */
7525 7526  mblk_t *
7526 7527  ip_check_and_align_header(mblk_t *mp, uint_t min_size, ip_recv_attr_t *ira)
7527 7528  {
7528 7529          ill_t   *ill = ira->ira_ill;
7529 7530          ssize_t len;
7530 7531  
7531 7532          len = MBLKL(mp);
7532 7533  
7533 7534          if (!OK_32PTR(mp->b_rptr))
7534 7535                  IP_STAT(ill->ill_ipst, ip_notaligned);
7535 7536          else
7536 7537                  IP_STAT(ill->ill_ipst, ip_recv_pullup);
7537 7538  
7538 7539          /* Guard against bogus device drivers */
7539 7540          if (len < 0) {
7540 7541                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
7541 7542                  ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
7542 7543                  freemsg(mp);
7543 7544                  return (NULL);
7544 7545          }
7545 7546  
7546 7547          if (len == 0) {
7547 7548                  /* GLD sometimes sends up mblk with b_rptr == b_wptr! */
7548 7549                  mblk_t *mp1 = mp->b_cont;
7549 7550  
7550 7551                  if (!(ira->ira_flags & IRAF_L2SRC_SET))
7551 7552                          ip_setl2src(mp, ira, ira->ira_rill);
7552 7553                  ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
7553 7554  
7554 7555                  freeb(mp);
7555 7556                  mp = mp1;
7556 7557                  if (mp == NULL)
7557 7558                          return (NULL);
7558 7559  
7559 7560                  if (OK_32PTR(mp->b_rptr) && MBLKL(mp) >= min_size)
7560 7561                          return (mp);
7561 7562          }
7562 7563          if (ip_pullup(mp, min_size, ira) == NULL) {
7563 7564                  if (msgdsize(mp) < min_size) {
7564 7565                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
7565 7566                          ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
7566 7567                  } else {
7567 7568                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
7568 7569                          ip_drop_input("ipIfStatsInDiscards", mp, ill);
7569 7570                  }
7570 7571                  freemsg(mp);
7571 7572                  return (NULL);
7572 7573          }
7573 7574          return (mp);
7574 7575  }
7575 7576  
7576 7577  /*
7577 7578   * Common code for IPv4 and IPv6 to check and pullup multi-mblks
            * Verifies that the data present is consistent with pkt_len and trims any
            * trailing pad.  A negative len (after adding the b_cont data, if any) is
            * treated as a truncated packet.  NOTE(review): len is presumably the
            * caller-computed first-mblk length minus pkt_len -- confirm at callers.
            * Returns mp, or NULL if the packet was dropped and freed.
7578 7579   */
7579 7580  mblk_t *
7580 7581  ip_check_length(mblk_t *mp, uchar_t *rptr, ssize_t len, uint_t pkt_len,
7581 7582      uint_t min_size, ip_recv_attr_t *ira)
7582 7583  {
7583 7584          ill_t   *ill = ira->ira_ill;
7584 7585  
7585 7586          /*
7586 7587           * Make sure we have data length consistent
7587 7588           * with the IP header.
7588 7589           */
7589 7590          if (mp->b_cont == NULL) {
7590 7591                  /* pkt_len is based on ipha_len, not the mblk length */
7591 7592                  if (pkt_len < min_size) {
7592 7593                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
7593 7594                          ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
7594 7595                          freemsg(mp);
7595 7596                          return (NULL);
7596 7597                  }
7597 7598                  if (len < 0) {
7598 7599                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
7599 7600                          ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
7600 7601                          freemsg(mp);
7601 7602                          return (NULL);
7602 7603                  }
7603 7604                  /* Drop any pad */
7604 7605                  mp->b_wptr = rptr + pkt_len;
7605 7606          } else if ((len += msgdsize(mp->b_cont)) != 0) {
7606 7607                  ASSERT(pkt_len >= min_size);
7607 7608                  if (pkt_len < min_size) {
7608 7609                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
7609 7610                          ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
7610 7611                          freemsg(mp);
7611 7612                          return (NULL);
7612 7613                  }
7613 7614                  if (len < 0) {
7614 7615                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
7615 7616                          ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
7616 7617                          freemsg(mp);
7617 7618                          return (NULL);
7618 7619                  }
7619 7620                  /* Drop any pad */
7620 7621                  (void) adjmsg(mp, -len);
7621 7622                  /*
7622 7623                   * adjmsg may have freed an mblk from the chain, hence
7623 7624                   * invalidate any hw checksum here. This will force IP to
7624 7625                   * calculate the checksum in sw, but only for this packet.
7625 7626                   */
7626 7627                  DB_CKSUMFLAGS(mp) = 0;
7627 7628                  IP_STAT(ill->ill_ipst, ip_multimblk);
7628 7629          }
7629 7630          return (mp);
7630 7631  }
7631 7632  
7632 7633  /*
7633 7634   * Check that the IPv4 opt_len is consistent with the packet and pullup
7634 7635   * the options.
7635 7636   */
7636 7637  mblk_t *
7637 7638  ip_check_optlen(mblk_t *mp, ipha_t *ipha, uint_t opt_len, uint_t pkt_len,
7638 7639      ip_recv_attr_t *ira)
7639 7640  {
7640 7641          ill_t   *ill = ira->ira_ill;
7641 7642          ssize_t len;
7642 7643  
7643 7644          /* Assume no IPv6 packets arrive over the IPv4 queue */
7644 7645          if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) {
7645 7646                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
7646 7647                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsInWrongIPVersion);
7647 7648                  ip_drop_input("IPvN packet on IPv4 ill", mp, ill);
7648 7649                  freemsg(mp);
7649 7650                  return (NULL);
7650 7651          }
7651 7652  
7652 7653          if (opt_len > (15 - IP_SIMPLE_HDR_LENGTH_IN_WORDS)) {
7653 7654                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
7654 7655                  ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
7655 7656                  freemsg(mp);
7656 7657                  return (NULL);
7657 7658          }
7658 7659          /*
7659 7660           * Recompute complete header length and make sure we
7660 7661           * have access to all of it.
7661 7662           */
7662 7663          len = ((size_t)opt_len + IP_SIMPLE_HDR_LENGTH_IN_WORDS) << 2;
7663 7664          if (len > (mp->b_wptr - mp->b_rptr)) {
7664 7665                  if (len > pkt_len) {
7665 7666                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
7666 7667                          ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
7667 7668                          freemsg(mp);
7668 7669                          return (NULL);
7669 7670                  }
7670 7671                  if (ip_pullup(mp, len, ira) == NULL) {
7671 7672                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
7672 7673                          ip_drop_input("ipIfStatsInDiscards", mp, ill);
7673 7674                          freemsg(mp);
7674 7675                          return (NULL);
7675 7676                  }
7676 7677          }
7677 7678          return (mp);
7678 7679  }
7679 7680  
7680 7681  /*
7681 7682   * Returns a new ire, or the same ire, or NULL.
7682 7683   * If a different IRE is returned, then it is held; the caller
7683 7684   * needs to release it.
7684 7685   * In no case is there any hold/release on the ire argument.
7685 7686   */
7686 7687  ire_t *
7687 7688  ip_check_multihome(void *addr, ire_t *ire, ill_t *ill)
7688 7689  {
7689 7690          ire_t           *new_ire;
7690 7691          ill_t           *ire_ill;
7691 7692          uint_t          ifindex;
7692 7693          ip_stack_t      *ipst = ill->ill_ipst;
7693 7694          boolean_t       strict_check = B_FALSE;
7694 7695  
7695 7696          /*
7696 7697           * IPMP common case: if IRE and ILL are in the same group, there's no
7697 7698           * issue (e.g. packet received on an underlying interface matched an
7698 7699           * IRE_LOCAL on its associated group interface).
7699 7700           */
7700 7701          ASSERT(ire->ire_ill != NULL);
7701 7702          if (IS_IN_SAME_ILLGRP(ill, ire->ire_ill))
7702 7703                  return (ire);
7703 7704  
7704 7705          /*
7705 7706           * Do another ire lookup here, using the ingress ill, to see if the
7706 7707           * interface is in a usesrc group.
7707 7708           * As long as the ills belong to the same group, we don't consider
7708 7709           * them to be arriving on the wrong interface. Thus, if the switch
7709 7710           * is doing inbound load spreading, we won't drop packets when the
7710 7711           * ip*_strict_dst_multihoming switch is on.
7711 7712           * We also need to check for IPIF_UNNUMBERED point2point interfaces
7712 7713           * where the local address may not be unique. In this case we were
7713 7714           * at the mercy of the initial ire lookup and the IRE_LOCAL it
7714 7715           * actually returned. The new lookup, which is more specific, should
7715 7716           * only find the IRE_LOCAL associated with the ingress ill if one
7716 7717           * exists.
7717 7718           */
7718 7719          if (ire->ire_ipversion == IPV4_VERSION) {
7719 7720                  if (ipst->ips_ip_strict_dst_multihoming)
7720 7721                          strict_check = B_TRUE;
7721 7722                  new_ire = ire_ftable_lookup_v4(*((ipaddr_t *)addr), 0, 0,
7722 7723                      IRE_LOCAL, ill, ALL_ZONES, NULL,
7723 7724                      (MATCH_IRE_TYPE|MATCH_IRE_ILL), 0, ipst, NULL);
7724 7725          } else {
7725 7726                  ASSERT(!IN6_IS_ADDR_MULTICAST((in6_addr_t *)addr));
7726 7727                  if (ipst->ips_ipv6_strict_dst_multihoming)
7727 7728                          strict_check = B_TRUE;
7728 7729                  new_ire = ire_ftable_lookup_v6((in6_addr_t *)addr, NULL, NULL,
7729 7730                      IRE_LOCAL, ill, ALL_ZONES, NULL,
7730 7731                      (MATCH_IRE_TYPE|MATCH_IRE_ILL), 0, ipst, NULL);
7731 7732          }
7732 7733          /*
7733 7734           * If the same ire that was returned in ip_input() is found then this
7734 7735           * is an indication that usesrc groups are in use. The packet
7735 7736           * arrived on a different ill in the group than the one associated with
7736 7737           * the destination address.  If a different ire was found then the same
7737 7738           * IP address must be hosted on multiple ills. This is possible with
7738 7739           * unnumbered point2point interfaces. We switch to use this new ire in
7739 7740           * order to have accurate interface statistics.
7740 7741           */
7741 7742          if (new_ire != NULL) {
7742 7743                  /* Note: held in one case but not the other? Caller handles */
7743 7744                  if (new_ire != ire)
7744 7745                          return (new_ire);
7745 7746                  /* Unchanged */
7746 7747                  ire_refrele(new_ire);
7747 7748                  return (ire);
7748 7749          }
7749 7750  
7750 7751          /*
7751 7752           * Chase pointers once and store locally.
7752 7753           */
7753 7754          ASSERT(ire->ire_ill != NULL);
7754 7755          ire_ill = ire->ire_ill;
7755 7756          ifindex = ill->ill_usesrc_ifindex;
7756 7757  
7757 7758          /*
7758 7759           * Check if it's a legal address on the 'usesrc' interface.
7759 7760           * For IPMP data addresses the IRE_LOCAL is the upper, hence we
7760 7761           * can just check phyint_ifindex.
7761 7762           */
7762 7763          if (ifindex != 0 && ifindex == ire_ill->ill_phyint->phyint_ifindex) {
7763 7764                  return (ire);
7764 7765          }
7765 7766  
7766 7767          /*
7767 7768           * If the ip*_strict_dst_multihoming switch is on then we can
7768 7769           * only accept this packet if the interface is marked as routing.
7769 7770           */
7770 7771          if (!(strict_check))
7771 7772                  return (ire);
7772 7773  
7773 7774          if ((ill->ill_flags & ire->ire_ill->ill_flags & ILLF_ROUTER) != 0) {
7774 7775                  return (ire);
7775 7776          }
7776 7777          return (NULL);
7777 7778  }
7778 7779  
7779 7780  /*
7780 7781   * This function is used to construct a mac_header_info_s from a
7781 7782   * DL_UNITDATA_IND message.
7782 7783   * The address fields in the mhi structure points into the message,
7783 7784   * thus the caller can't use those fields after freeing the message.
7784 7785   *
7785 7786   * We determine whether the packet received is a non-unicast packet
7786 7787   * and in doing so, determine whether or not it is broadcast vs multicast.
7787 7788   * For it to be a broadcast packet, we must have the appropriate mblk_t
7788 7789   * hanging off the ill_t.  If this is either not present or doesn't match
7789 7790   * the destination mac address in the DL_UNITDATA_IND, the packet is deemed
7790 7791   * to be multicast.  Thus NICs that have no broadcast address (or no
7791 7792   * capability for one, such as point to point links) cannot return as
7792 7793   * the packet being broadcast.
7793 7794   */
7794 7795  void
7795 7796  ip_dlur_to_mhi(ill_t *ill, mblk_t *mb, struct mac_header_info_s *mhip)
7796 7797  {
7797 7798          dl_unitdata_ind_t *ind = (dl_unitdata_ind_t *)mb->b_rptr;
7798 7799          mblk_t *bmp;
7799 7800          uint_t extra_offset;
7800 7801  
7801 7802          bzero(mhip, sizeof (struct mac_header_info_s));
7802 7803  
7803 7804          mhip->mhi_dsttype = MAC_ADDRTYPE_UNICAST;
7804 7805  
7805 7806          if (ill->ill_sap_length < 0)
7806 7807                  extra_offset = 0;
7807 7808          else
7808 7809                  extra_offset = ill->ill_sap_length;
7809 7810  
7810 7811          mhip->mhi_daddr = (uchar_t *)ind + ind->dl_dest_addr_offset +
7811 7812              extra_offset;
7812 7813          mhip->mhi_saddr = (uchar_t *)ind + ind->dl_src_addr_offset +
7813 7814              extra_offset;
7814 7815  
7815 7816          if (!ind->dl_group_address)
7816 7817                  return;
7817 7818  
7818 7819          /* Multicast or broadcast */
7819 7820          mhip->mhi_dsttype = MAC_ADDRTYPE_MULTICAST;
7820 7821  
7821 7822          if (ind->dl_dest_addr_offset > sizeof (*ind) &&
7822 7823              ind->dl_dest_addr_offset + ind->dl_dest_addr_length < MBLKL(mb) &&
7823 7824              (bmp = ill->ill_bcast_mp) != NULL) {
7824 7825                  dl_unitdata_req_t *dlur;
7825 7826                  uint8_t *bphys_addr;
7826 7827  
7827 7828                  dlur = (dl_unitdata_req_t *)bmp->b_rptr;
7828 7829                  bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset +
7829 7830                      extra_offset;
7830 7831  
7831 7832                  if (bcmp(mhip->mhi_daddr, bphys_addr,
7832 7833                      ind->dl_dest_addr_length) == 0)
7833 7834                          mhip->mhi_dsttype = MAC_ADDRTYPE_BROADCAST;
7834 7835          }
7835 7836  }
7836 7837  
7837 7838  /*
7838 7839   * This function is used to construct a mac_header_info_s from a
7839 7840   * M_DATA fastpath message from a DLPI driver.
7840 7841   * The address fields in the mhi structure points into the message,
7841 7842   * thus the caller can't use those fields after freeing the message.
7842 7843   *
7843 7844   * We determine whether the packet received is a non-unicast packet
7844 7845   * and in doing so, determine whether or not it is broadcast vs multicast.
7845 7846   * For it to be a broadcast packet, we must have the appropriate mblk_t
7846 7847   * hanging off the ill_t.  If this is either not present or doesn't match
7847 7848   * the destination mac address in the DL_UNITDATA_IND, the packet is deemed
7848 7849   * to be multicast.  Thus NICs that have no broadcast address (or no
7849 7850   * capability for one, such as point to point links) cannot return as
7850 7851   * the packet being broadcast.
7851 7852   */
7852 7853  void
7853 7854  ip_mdata_to_mhi(ill_t *ill, mblk_t *mp, struct mac_header_info_s *mhip)
7854 7855  {
7855 7856          mblk_t *bmp;
7856 7857          struct ether_header *pether;
7857 7858  
7858 7859          bzero(mhip, sizeof (struct mac_header_info_s));
7859 7860  
7860 7861          mhip->mhi_dsttype = MAC_ADDRTYPE_UNICAST;
7861 7862  
7862 7863          pether = (struct ether_header *)((char *)mp->b_rptr
7863 7864              - sizeof (struct ether_header));
7864 7865  
7865 7866          /*
7866 7867           * Make sure the interface is an ethernet type, since we don't
7867 7868           * know the header format for anything but Ethernet. Also make
7868 7869           * sure we are pointing correctly above db_base.
7869 7870           */
7870 7871          if (ill->ill_type != IFT_ETHER)
7871 7872                  return;
7872 7873  
7873 7874  retry:
7874 7875          if ((uchar_t *)pether < mp->b_datap->db_base)
7875 7876                  return;
7876 7877  
7877 7878          /* Is there a VLAN tag? */
7878 7879          if (ill->ill_isv6) {
7879 7880                  if (pether->ether_type != htons(ETHERTYPE_IPV6)) {
7880 7881                          pether = (struct ether_header *)((char *)pether - 4);
7881 7882                          goto retry;
7882 7883                  }
7883 7884          } else {
7884 7885                  if (pether->ether_type != htons(ETHERTYPE_IP)) {
7885 7886                          pether = (struct ether_header *)((char *)pether - 4);
7886 7887                          goto retry;
7887 7888                  }
7888 7889          }
7889 7890          mhip->mhi_daddr = (uchar_t *)&pether->ether_dhost;
7890 7891          mhip->mhi_saddr = (uchar_t *)&pether->ether_shost;
7891 7892  
7892 7893          if (!(mhip->mhi_daddr[0] & 0x01))
7893 7894                  return;
7894 7895  
7895 7896          /* Multicast or broadcast */
7896 7897          mhip->mhi_dsttype = MAC_ADDRTYPE_MULTICAST;
7897 7898  
7898 7899          if ((bmp = ill->ill_bcast_mp) != NULL) {
7899 7900                  dl_unitdata_req_t *dlur;
7900 7901                  uint8_t *bphys_addr;
7901 7902                  uint_t  addrlen;
7902 7903  
7903 7904                  dlur = (dl_unitdata_req_t *)bmp->b_rptr;
7904 7905                  addrlen = dlur->dl_dest_addr_length;
7905 7906                  if (ill->ill_sap_length < 0) {
7906 7907                          bphys_addr = (uchar_t *)dlur +
7907 7908                              dlur->dl_dest_addr_offset;
7908 7909                          addrlen += ill->ill_sap_length;
7909 7910                  } else {
7910 7911                          bphys_addr = (uchar_t *)dlur +
7911 7912                              dlur->dl_dest_addr_offset +
7912 7913                              ill->ill_sap_length;
7913 7914                          addrlen -= ill->ill_sap_length;
7914 7915                  }
7915 7916                  if (bcmp(mhip->mhi_daddr, bphys_addr, addrlen) == 0)
7916 7917                          mhip->mhi_dsttype = MAC_ADDRTYPE_BROADCAST;
7917 7918          }
7918 7919  }
7919 7920  
7920 7921  /*
7921 7922   * Handle anything but M_DATA messages
7922 7923   * We see the DL_UNITDATA_IND which are part
7923 7924   * of the data path, and also the other messages from the driver.
7924 7925   */
7925 7926  void
7926 7927  ip_rput_notdata(ill_t *ill, mblk_t *mp)
7927 7928  {
7928 7929          mblk_t          *first_mp;
7929 7930          struct iocblk   *iocp;
7930 7931          struct mac_header_info_s mhi;
7931 7932  
7932 7933          switch (DB_TYPE(mp)) {
7933 7934          case M_PROTO:
7934 7935          case M_PCPROTO: {
7935 7936                  if (((dl_unitdata_ind_t *)mp->b_rptr)->dl_primitive !=
7936 7937                      DL_UNITDATA_IND) {
7937 7938                          /* Go handle anything other than data elsewhere. */
7938 7939                          ip_rput_dlpi(ill, mp);
7939 7940                          return;
7940 7941                  }
7941 7942  
7942 7943                  first_mp = mp;
7943 7944                  mp = first_mp->b_cont;
7944 7945                  first_mp->b_cont = NULL;
7945 7946  
7946 7947                  if (mp == NULL) {
7947 7948                          freeb(first_mp);
7948 7949                          return;
7949 7950                  }
7950 7951                  ip_dlur_to_mhi(ill, first_mp, &mhi);
7951 7952                  if (ill->ill_isv6)
7952 7953                          ip_input_v6(ill, NULL, mp, &mhi);
7953 7954                  else
7954 7955                          ip_input(ill, NULL, mp, &mhi);
7955 7956  
7956 7957                  /* Ditch the DLPI header. */
7957 7958                  freeb(first_mp);
7958 7959                  return;
7959 7960          }
7960 7961          case M_IOCACK:
7961 7962                  iocp = (struct iocblk *)mp->b_rptr;
7962 7963                  switch (iocp->ioc_cmd) {
7963 7964                  case DL_IOC_HDR_INFO:
7964 7965                          ill_fastpath_ack(ill, mp);
7965 7966                          return;
7966 7967                  default:
7967 7968                          putnext(ill->ill_rq, mp);
7968 7969                          return;
7969 7970                  }
7970 7971                  /* FALLTHROUGH */
7971 7972          case M_ERROR:
7972 7973          case M_HANGUP:
7973 7974                  mutex_enter(&ill->ill_lock);
7974 7975                  if (ill->ill_state_flags & ILL_CONDEMNED) {
7975 7976                          mutex_exit(&ill->ill_lock);
7976 7977                          freemsg(mp);
7977 7978                          return;
7978 7979                  }
7979 7980                  ill_refhold_locked(ill);
7980 7981                  mutex_exit(&ill->ill_lock);
7981 7982                  qwriter_ip(ill, ill->ill_rq, mp, ip_rput_other, CUR_OP,
7982 7983                      B_FALSE);
7983 7984                  return;
7984 7985          case M_CTL:
7985 7986                  putnext(ill->ill_rq, mp);
7986 7987                  return;
7987 7988          case M_IOCNAK:
7988 7989                  ip1dbg(("got iocnak "));
7989 7990                  iocp = (struct iocblk *)mp->b_rptr;
7990 7991                  switch (iocp->ioc_cmd) {
7991 7992                  case DL_IOC_HDR_INFO:
7992 7993                          ip_rput_other(NULL, ill->ill_rq, mp, NULL);
7993 7994                          return;
7994 7995                  default:
7995 7996                          break;
7996 7997                  }
7997 7998                  /* FALLTHROUGH */
7998 7999          default:
7999 8000                  putnext(ill->ill_rq, mp);
8000 8001                  return;
8001 8002          }
8002 8003  }
8003 8004  
8004 8005  /* Read side put procedure.  Packets coming from the wire arrive here. */
8005 8006  int
8006 8007  ip_rput(queue_t *q, mblk_t *mp)
8007 8008  {
8008 8009          ill_t   *ill;
8009 8010          union DL_primitives *dl;
8010 8011  
8011 8012          ill = (ill_t *)q->q_ptr;
8012 8013  
8013 8014          if (ill->ill_state_flags & (ILL_CONDEMNED | ILL_LL_SUBNET_PENDING)) {
8014 8015                  /*
8015 8016                   * If things are opening or closing, only accept high-priority
8016 8017                   * DLPI messages.  (On open ill->ill_ipif has not yet been
8017 8018                   * created; on close, things hanging off the ill may have been
8018 8019                   * freed already.)
8019 8020                   */
8020 8021                  dl = (union DL_primitives *)mp->b_rptr;
8021 8022                  if (DB_TYPE(mp) != M_PCPROTO ||
8022 8023                      dl->dl_primitive == DL_UNITDATA_IND) {
8023 8024                          inet_freemsg(mp);
8024 8025                          return (0);
8025 8026                  }
8026 8027          }
8027 8028          if (DB_TYPE(mp) == M_DATA) {
8028 8029                  struct mac_header_info_s mhi;
8029 8030  
8030 8031                  ip_mdata_to_mhi(ill, mp, &mhi);
8031 8032                  ip_input(ill, NULL, mp, &mhi);
8032 8033          } else {
8033 8034                  ip_rput_notdata(ill, mp);
8034 8035          }
8035 8036          return (0);
8036 8037  }
8037 8038  
8038 8039  /*
8039 8040   * Move the information to a copy.
8040 8041   */
8041 8042  mblk_t *
8042 8043  ip_fix_dbref(mblk_t *mp, ip_recv_attr_t *ira)
8043 8044  {
8044 8045          mblk_t          *mp1;
8045 8046          ill_t           *ill = ira->ira_ill;
8046 8047          ip_stack_t      *ipst = ill->ill_ipst;
8047 8048  
8048 8049          IP_STAT(ipst, ip_db_ref);
8049 8050  
8050 8051          /* Make sure we have ira_l2src before we loose the original mblk */
8051 8052          if (!(ira->ira_flags & IRAF_L2SRC_SET))
8052 8053                  ip_setl2src(mp, ira, ira->ira_rill);
8053 8054  
8054 8055          mp1 = copymsg(mp);
8055 8056          if (mp1 == NULL) {
8056 8057                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
8057 8058                  ip_drop_input("ipIfStatsInDiscards", mp, ill);
8058 8059                  freemsg(mp);
8059 8060                  return (NULL);
8060 8061          }
8061 8062          /* preserve the hardware checksum flags and data, if present */
8062 8063          if (DB_CKSUMFLAGS(mp) != 0) {
8063 8064                  DB_CKSUMFLAGS(mp1) = DB_CKSUMFLAGS(mp);
8064 8065                  DB_CKSUMSTART(mp1) = DB_CKSUMSTART(mp);
8065 8066                  DB_CKSUMSTUFF(mp1) = DB_CKSUMSTUFF(mp);
8066 8067                  DB_CKSUMEND(mp1) = DB_CKSUMEND(mp);
8067 8068                  DB_CKSUM16(mp1) = DB_CKSUM16(mp);
8068 8069          }
8069 8070          freemsg(mp);
8070 8071          return (mp1);
8071 8072  }
8072 8073  
8073 8074  static void
8074 8075  ip_dlpi_error(ill_t *ill, t_uscalar_t prim, t_uscalar_t dl_err,
8075 8076      t_uscalar_t err)
8076 8077  {
8077 8078          if (dl_err == DL_SYSERR) {
8078 8079                  (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE,
8079 8080                      "%s: %s failed: DL_SYSERR (errno %u)\n",
8080 8081                      ill->ill_name, dl_primstr(prim), err);
8081 8082                  return;
8082 8083          }
8083 8084  
8084 8085          (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE,
8085 8086              "%s: %s failed: %s\n", ill->ill_name, dl_primstr(prim),
8086 8087              dl_errstr(dl_err));
8087 8088  }
8088 8089  
8089 8090  /*
8090 8091   * ip_rput_dlpi is called by ip_rput to handle all DLPI messages other
8091 8092   * than DL_UNITDATA_IND messages. If we need to process this message
8092 8093   * exclusively, we call qwriter_ip, in which case we also need to call
8093 8094   * ill_refhold before that, since qwriter_ip does an ill_refrele.
8094 8095   */
8095 8096  void
8096 8097  ip_rput_dlpi(ill_t *ill, mblk_t *mp)
8097 8098  {
8098 8099          dl_ok_ack_t     *dloa = (dl_ok_ack_t *)mp->b_rptr;
8099 8100          dl_error_ack_t  *dlea = (dl_error_ack_t *)dloa;
8100 8101          queue_t         *q = ill->ill_rq;
8101 8102          t_uscalar_t     prim = dloa->dl_primitive;
8102 8103          t_uscalar_t     reqprim = DL_PRIM_INVAL;
8103 8104  
8104 8105          DTRACE_PROBE3(ill__dlpi, char *, "ip_rput_dlpi",
8105 8106              char *, dl_primstr(prim), ill_t *, ill);
8106 8107          ip1dbg(("ip_rput_dlpi"));
8107 8108  
8108 8109          /*
8109 8110           * If we received an ACK but didn't send a request for it, then it
8110 8111           * can't be part of any pending operation; discard up-front.
8111 8112           */
8112 8113          switch (prim) {
8113 8114          case DL_ERROR_ACK:
8114 8115                  reqprim = dlea->dl_error_primitive;
8115 8116                  ip2dbg(("ip_rput_dlpi(%s): DL_ERROR_ACK for %s (0x%x): %s "
8116 8117                      "(0x%x), unix %u\n", ill->ill_name, dl_primstr(reqprim),
8117 8118                      reqprim, dl_errstr(dlea->dl_errno), dlea->dl_errno,
8118 8119                      dlea->dl_unix_errno));
8119 8120                  break;
8120 8121          case DL_OK_ACK:
8121 8122                  reqprim = dloa->dl_correct_primitive;
8122 8123                  break;
8123 8124          case DL_INFO_ACK:
8124 8125                  reqprim = DL_INFO_REQ;
8125 8126                  break;
8126 8127          case DL_BIND_ACK:
8127 8128                  reqprim = DL_BIND_REQ;
8128 8129                  break;
8129 8130          case DL_PHYS_ADDR_ACK:
8130 8131                  reqprim = DL_PHYS_ADDR_REQ;
8131 8132                  break;
8132 8133          case DL_NOTIFY_ACK:
8133 8134                  reqprim = DL_NOTIFY_REQ;
8134 8135                  break;
8135 8136          case DL_CAPABILITY_ACK:
8136 8137                  reqprim = DL_CAPABILITY_REQ;
8137 8138                  break;
8138 8139          }
8139 8140  
8140 8141          if (prim != DL_NOTIFY_IND) {
8141 8142                  if (reqprim == DL_PRIM_INVAL ||
8142 8143                      !ill_dlpi_pending(ill, reqprim)) {
8143 8144                          /* Not a DLPI message we support or expected */
8144 8145                          freemsg(mp);
8145 8146                          return;
8146 8147                  }
8147 8148                  ip1dbg(("ip_rput: received %s for %s\n", dl_primstr(prim),
8148 8149                      dl_primstr(reqprim)));
8149 8150          }
8150 8151  
8151 8152          switch (reqprim) {
8152 8153          case DL_UNBIND_REQ:
8153 8154                  /*
8154 8155                   * NOTE: we mark the unbind as complete even if we got a
8155 8156                   * DL_ERROR_ACK, since there's not much else we can do.
8156 8157                   */
8157 8158                  mutex_enter(&ill->ill_lock);
8158 8159                  ill->ill_state_flags &= ~ILL_DL_UNBIND_IN_PROGRESS;
8159 8160                  cv_signal(&ill->ill_cv);
8160 8161                  mutex_exit(&ill->ill_lock);
8161 8162                  break;
8162 8163  
8163 8164          case DL_ENABMULTI_REQ:
8164 8165                  if (prim == DL_OK_ACK) {
8165 8166                          if (ill->ill_dlpi_multicast_state == IDS_INPROGRESS)
8166 8167                                  ill->ill_dlpi_multicast_state = IDS_OK;
8167 8168                  }
8168 8169                  break;
8169 8170          }
8170 8171  
8171 8172          /*
8172 8173           * The message is one we're waiting for (or DL_NOTIFY_IND), but we
8173 8174           * need to become writer to continue to process it.  Because an
8174 8175           * exclusive operation doesn't complete until replies to all queued
8175 8176           * DLPI messages have been received, we know we're in the middle of an
8176 8177           * exclusive operation and pass CUR_OP (except for DL_NOTIFY_IND).
8177 8178           *
8178 8179           * As required by qwriter_ip(), we refhold the ill; it will refrele.
8179 8180           * Since this is on the ill stream we unconditionally bump up the
8180 8181           * refcount without doing ILL_CAN_LOOKUP().
8181 8182           */
8182 8183          ill_refhold(ill);
8183 8184          if (prim == DL_NOTIFY_IND)
8184 8185                  qwriter_ip(ill, q, mp, ip_rput_dlpi_writer, NEW_OP, B_FALSE);
8185 8186          else
8186 8187                  qwriter_ip(ill, q, mp, ip_rput_dlpi_writer, CUR_OP, B_FALSE);
8187 8188  }
8188 8189  
8189 8190  /*
8190 8191   * Handling of DLPI messages that require exclusive access to the ipsq.
8191 8192   *
8192 8193   * Need to do ipsq_pending_mp_get on ioctl completion, which could
8193 8194   * happen here. (along with mi_copy_done)
8194 8195   */
8195 8196  /* ARGSUSED */
8196 8197  static void
8197 8198  ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
8198 8199  {
8199 8200          dl_ok_ack_t     *dloa = (dl_ok_ack_t *)mp->b_rptr;
8200 8201          dl_error_ack_t  *dlea = (dl_error_ack_t *)dloa;
8201 8202          int             err = 0;
8202 8203          ill_t           *ill = (ill_t *)q->q_ptr;
8203 8204          ipif_t          *ipif = NULL;
8204 8205          mblk_t          *mp1 = NULL;
8205 8206          conn_t          *connp = NULL;
8206 8207          t_uscalar_t     paddrreq;
8207 8208          mblk_t          *mp_hw;
8208 8209          boolean_t       success;
8209 8210          boolean_t       ioctl_aborted = B_FALSE;
8210 8211          boolean_t       log = B_TRUE;
8211 8212  
8212 8213          DTRACE_PROBE3(ill__dlpi, char *, "ip_rput_dlpi_writer",
8213 8214              char *, dl_primstr(dloa->dl_primitive), ill_t *, ill);
8214 8215  
8215 8216          ip1dbg(("ip_rput_dlpi_writer .."));
8216 8217          ASSERT(ipsq->ipsq_xop == ill->ill_phyint->phyint_ipsq->ipsq_xop);
8217 8218          ASSERT(IAM_WRITER_ILL(ill));
8218 8219  
8219 8220          ipif = ipsq->ipsq_xop->ipx_pending_ipif;
8220 8221          /*
8221 8222           * The current ioctl could have been aborted by the user and a new
8222 8223           * ioctl to bring up another ill could have started. We could still
8223 8224           * get a response from the driver later.
8224 8225           */
8225 8226          if (ipif != NULL && ipif->ipif_ill != ill)
8226 8227                  ioctl_aborted = B_TRUE;
8227 8228  
8228 8229          switch (dloa->dl_primitive) {
8229 8230          case DL_ERROR_ACK:
8230 8231                  ip1dbg(("ip_rput_dlpi_writer: got DL_ERROR_ACK for %s\n",
8231 8232                      dl_primstr(dlea->dl_error_primitive)));
8232 8233  
8233 8234                  DTRACE_PROBE3(ill__dlpi, char *, "ip_rput_dlpi_writer error",
8234 8235                      char *, dl_primstr(dlea->dl_error_primitive),
8235 8236                      ill_t *, ill);
8236 8237  
8237 8238                  switch (dlea->dl_error_primitive) {
8238 8239                  case DL_DISABMULTI_REQ:
8239 8240                          ill_dlpi_done(ill, dlea->dl_error_primitive);
8240 8241                          break;
8241 8242                  case DL_PROMISCON_REQ:
8242 8243                  case DL_PROMISCOFF_REQ:
8243 8244                  case DL_UNBIND_REQ:
8244 8245                  case DL_ATTACH_REQ:
8245 8246                  case DL_INFO_REQ:
8246 8247                          ill_dlpi_done(ill, dlea->dl_error_primitive);
8247 8248                          break;
8248 8249                  case DL_NOTIFY_REQ:
8249 8250                          ill_dlpi_done(ill, DL_NOTIFY_REQ);
8250 8251                          log = B_FALSE;
8251 8252                          break;
8252 8253                  case DL_PHYS_ADDR_REQ:
8253 8254                          /*
8254 8255                           * For IPv6 only, there are two additional
8255 8256                           * phys_addr_req's sent to the driver to get the
8256 8257                           * IPv6 token and lla. This allows IP to acquire
8257 8258                           * the hardware address format for a given interface
8258 8259                           * without having built in knowledge of the hardware
8259 8260                           * address. ill_phys_addr_pend keeps track of the last
8260 8261                           * DL_PAR sent so we know which response we are
8261 8262                           * dealing with. ill_dlpi_done will update
8262 8263                           * ill_phys_addr_pend when it sends the next req.
8263 8264                           * We don't complete the IOCTL until all three DL_PARs
8264 8265                           * have been attempted, so set *_len to 0 and break.
8265 8266                           */
8266 8267                          paddrreq = ill->ill_phys_addr_pend;
8267 8268                          ill_dlpi_done(ill, DL_PHYS_ADDR_REQ);
8268 8269                          if (paddrreq == DL_IPV6_TOKEN) {
8269 8270                                  ill->ill_token_length = 0;
8270 8271                                  log = B_FALSE;
8271 8272                                  break;
8272 8273                          } else if (paddrreq == DL_IPV6_LINK_LAYER_ADDR) {
8273 8274                                  ill->ill_nd_lla_len = 0;
8274 8275                                  log = B_FALSE;
8275 8276                                  break;
8276 8277                          }
8277 8278                          /*
8278 8279                           * Something went wrong with the DL_PHYS_ADDR_REQ.
8279 8280                           * We presumably have an IOCTL hanging out waiting
8280 8281                           * for completion. Find it and complete the IOCTL
8281 8282                           * with the error noted.
8282 8283                           * However, ill_dl_phys was called on an ill queue
8283 8284                           * (from SIOCSLIFNAME), thus conn_pending_ill is not
8284 8285                           * set. But the ioctl is known to be pending on ill_wq.
8285 8286                           */
8286 8287                          if (!ill->ill_ifname_pending)
8287 8288                                  break;
8288 8289                          ill->ill_ifname_pending = 0;
8289 8290                          if (!ioctl_aborted)
8290 8291                                  mp1 = ipsq_pending_mp_get(ipsq, &connp);
8291 8292                          if (mp1 != NULL) {
8292 8293                                  /*
8293 8294                                   * This operation (SIOCSLIFNAME) must have
8294 8295                                   * happened on the ill. Assert there is no conn
8295 8296                                   */
8296 8297                                  ASSERT(connp == NULL);
8297 8298                                  q = ill->ill_wq;
8298 8299                          }
8299 8300                          break;
8300 8301                  case DL_BIND_REQ:
8301 8302                          ill_dlpi_done(ill, DL_BIND_REQ);
8302 8303                          if (ill->ill_ifname_pending)
8303 8304                                  break;
8304 8305                          mutex_enter(&ill->ill_lock);
8305 8306                          ill->ill_state_flags &= ~ILL_DOWN_IN_PROGRESS;
8306 8307                          mutex_exit(&ill->ill_lock);
8307 8308                          /*
8308 8309                           * Something went wrong with the bind.  We presumably
8309 8310                           * have an IOCTL hanging out waiting for completion.
8310 8311                           * Find it, take down the interface that was coming
8311 8312                           * up, and complete the IOCTL with the error noted.
8312 8313                           */
8313 8314                          if (!ioctl_aborted)
8314 8315                                  mp1 = ipsq_pending_mp_get(ipsq, &connp);
8315 8316                          if (mp1 != NULL) {
8316 8317                                  /*
8317 8318                                   * This might be a result of a DL_NOTE_REPLUMB
8318 8319                                   * notification. In that case, connp is NULL.
8319 8320                                   */
8320 8321                                  if (connp != NULL)
8321 8322                                          q = CONNP_TO_WQ(connp);
8322 8323  
8323 8324                                  (void) ipif_down(ipif, NULL, NULL);
8324 8325                                  /* error is set below the switch */
8325 8326                          }
8326 8327                          break;
8327 8328                  case DL_ENABMULTI_REQ:
8328 8329                          ill_dlpi_done(ill, DL_ENABMULTI_REQ);
8329 8330  
8330 8331                          if (ill->ill_dlpi_multicast_state == IDS_INPROGRESS)
8331 8332                                  ill->ill_dlpi_multicast_state = IDS_FAILED;
8332 8333                          if (ill->ill_dlpi_multicast_state == IDS_FAILED) {
8333 8334  
8334 8335                                  printf("ip: joining multicasts failed (%d)"
8335 8336                                      " on %s - will use link layer "
8336 8337                                      "broadcasts for multicast\n",
8337 8338                                      dlea->dl_errno, ill->ill_name);
8338 8339  
8339 8340                                  /*
8340 8341                                   * Set up for multi_bcast; We are the
8341 8342                                   * writer, so ok to access ill->ill_ipif
8342 8343                                   * without any lock.
8343 8344                                   */
8344 8345                                  mutex_enter(&ill->ill_phyint->phyint_lock);
8345 8346                                  ill->ill_phyint->phyint_flags |=
8346 8347                                      PHYI_MULTI_BCAST;
8347 8348                                  mutex_exit(&ill->ill_phyint->phyint_lock);
8348 8349  
8349 8350                          }
8350 8351                          freemsg(mp);    /* Don't want to pass this up */
8351 8352                          return;
8352 8353                  case DL_CAPABILITY_REQ:
8353 8354                          ip1dbg(("ip_rput_dlpi_writer: got DL_ERROR_ACK for "
8354 8355                              "DL_CAPABILITY REQ\n"));
8355 8356                          if (ill->ill_dlpi_capab_state == IDCS_PROBE_SENT)
8356 8357                                  ill->ill_dlpi_capab_state = IDCS_FAILED;
8357 8358                          ill_capability_done(ill);
8358 8359                          freemsg(mp);
8359 8360                          return;
8360 8361                  }
8361 8362                  /*
8362 8363                   * Note the error for IOCTL completion (mp1 is set when
8363 8364                   * ready to complete ioctl). If ill_ifname_pending_err is
8364 8365                   * set, an error occurred during plumbing (ill_ifname_pending),
8365 8366                   * so we want to report that error.
8366 8367                   *
8367 8368                   * NOTE: there are two additional DL_PHYS_ADDR_REQ's
8368 8369                   * (DL_IPV6_TOKEN and DL_IPV6_LINK_LAYER_ADDR) that are
8369 8370                   * expected to get errack'd if the driver doesn't support
8370 8371                   * these flags (e.g. ethernet). log will be set to B_FALSE
8371 8372                   * if these error conditions are encountered.
8372 8373                   */
8373 8374                  if (mp1 != NULL) {
8374 8375                          if (ill->ill_ifname_pending_err != 0)  {
8375 8376                                  err = ill->ill_ifname_pending_err;
8376 8377                                  ill->ill_ifname_pending_err = 0;
8377 8378                          } else {
8378 8379                                  err = dlea->dl_unix_errno ?
8379 8380                                      dlea->dl_unix_errno : ENXIO;
8380 8381                          }
8381 8382                  /*
8382 8383                   * If we're plumbing an interface and an error hasn't already
8383 8384                   * been saved, set ill_ifname_pending_err to the error passed
8384 8385                   * up. Ignore the error if log is B_FALSE (see comment above).
8385 8386                   */
8386 8387                  } else if (log && ill->ill_ifname_pending &&
8387 8388                      ill->ill_ifname_pending_err == 0) {
8388 8389                          ill->ill_ifname_pending_err = dlea->dl_unix_errno ?
8389 8390                              dlea->dl_unix_errno : ENXIO;
8390 8391                  }
8391 8392  
8392 8393                  if (log)
8393 8394                          ip_dlpi_error(ill, dlea->dl_error_primitive,
8394 8395                              dlea->dl_errno, dlea->dl_unix_errno);
8395 8396                  break;
8396 8397          case DL_CAPABILITY_ACK:
8397 8398                  ill_capability_ack(ill, mp);
8398 8399                  /*
8399 8400                   * The message has been handed off to ill_capability_ack
8400 8401                   * and must not be freed below
8401 8402                   */
8402 8403                  mp = NULL;
8403 8404                  break;
8404 8405  
8405 8406          case DL_INFO_ACK:
8406 8407                  /* Call a routine to handle this one. */
8407 8408                  ill_dlpi_done(ill, DL_INFO_REQ);
8408 8409                  ip_ll_subnet_defaults(ill, mp);
8409 8410                  ASSERT(!MUTEX_HELD(&ill->ill_phyint->phyint_ipsq->ipsq_lock));
8410 8411                  return;
8411 8412          case DL_BIND_ACK:
8412 8413                  /*
8413 8414                   * We should have an IOCTL waiting on this unless
8414 8415                   * sent by ill_dl_phys, in which case just return
8415 8416                   */
8416 8417                  ill_dlpi_done(ill, DL_BIND_REQ);
8417 8418  
8418 8419                  if (ill->ill_ifname_pending) {
8419 8420                          DTRACE_PROBE2(ip__rput__dlpi__ifname__pending,
8420 8421                              ill_t *, ill, mblk_t *, mp);
8421 8422                          break;
8422 8423                  }
8423 8424                  mutex_enter(&ill->ill_lock);
8424 8425                  ill->ill_dl_up = 1;
8425 8426                  ill->ill_state_flags &= ~ILL_DOWN_IN_PROGRESS;
8426 8427                  mutex_exit(&ill->ill_lock);
8427 8428  
8428 8429                  if (!ioctl_aborted)
8429 8430                          mp1 = ipsq_pending_mp_get(ipsq, &connp);
8430 8431                  if (mp1 == NULL) {
8431 8432                          DTRACE_PROBE1(ip__rput__dlpi__no__mblk, ill_t *, ill);
8432 8433                          break;
8433 8434                  }
8434 8435                  /*
8435 8436                   * mp1 was added by ill_dl_up(). If that is a result of
8436 8437                   * a DL_NOTE_REPLUMB notification, connp could be NULL.
8437 8438                   */
8438 8439                  if (connp != NULL)
8439 8440                          q = CONNP_TO_WQ(connp);
8440 8441                  /*
8441 8442                   * We are exclusive. So nothing can change even after
8442 8443                   * we get the pending mp.
8443 8444                   */
8444 8445                  ip1dbg(("ip_rput_dlpi: bind_ack %s\n", ill->ill_name));
8445 8446                  DTRACE_PROBE1(ip__rput__dlpi__bind__ack, ill_t *, ill);
8446 8447                  ill_nic_event_dispatch(ill, 0, NE_UP, NULL, 0);
8447 8448  
8448 8449                  /*
8449 8450                   * Now bring up the resolver; when that is complete, we'll
8450 8451                   * create IREs.  Note that we intentionally mirror what
8451 8452                   * ipif_up() would have done, because we got here by way of
8452 8453                   * ill_dl_up(), which stopped ipif_up()'s processing.
8453 8454                   */
8454 8455                  if (ill->ill_isv6) {
8455 8456                          /*
8456 8457                           * v6 interfaces.
8457 8458                           * Unlike ARP which has to do another bind
8458 8459                           * and attach, once we get here we are
8459 8460                           * done with NDP
8460 8461                           */
8461 8462                          (void) ipif_resolver_up(ipif, Res_act_initial);
8462 8463                          if ((err = ipif_ndp_up(ipif, B_TRUE)) == 0)
8463 8464                                  err = ipif_up_done_v6(ipif);
8464 8465                  } else if (ill->ill_net_type == IRE_IF_RESOLVER) {
8465 8466                          /*
8466 8467                           * ARP and other v4 external resolvers.
8467 8468                           * Leave the pending mblk intact so that
8468 8469                           * the ioctl completes in ip_rput().
8469 8470                           */
8470 8471                          if (connp != NULL)
8471 8472                                  mutex_enter(&connp->conn_lock);
8472 8473                          mutex_enter(&ill->ill_lock);
8473 8474                          success = ipsq_pending_mp_add(connp, ipif, q, mp1, 0);
8474 8475                          mutex_exit(&ill->ill_lock);
8475 8476                          if (connp != NULL)
8476 8477                                  mutex_exit(&connp->conn_lock);
8477 8478                          if (success) {
8478 8479                                  err = ipif_resolver_up(ipif, Res_act_initial);
8479 8480                                  if (err == EINPROGRESS) {
8480 8481                                          freemsg(mp);
8481 8482                                          return;
8482 8483                                  }
8483 8484                                  mp1 = ipsq_pending_mp_get(ipsq, &connp);
8484 8485                          } else {
8485 8486                                  /* The conn has started closing */
8486 8487                                  err = EINTR;
8487 8488                          }
8488 8489                  } else {
8489 8490                          /*
8490 8491                           * This one is complete. Reply to pending ioctl.
8491 8492                           */
8492 8493                          (void) ipif_resolver_up(ipif, Res_act_initial);
8493 8494                          err = ipif_up_done(ipif);
8494 8495                  }
8495 8496  
8496 8497                  if ((err == 0) && (ill->ill_up_ipifs)) {
8497 8498                          err = ill_up_ipifs(ill, q, mp1);
8498 8499                          if (err == EINPROGRESS) {
8499 8500                                  freemsg(mp);
8500 8501                                  return;
8501 8502                          }
8502 8503                  }
8503 8504  
8504 8505                  /*
8505 8506                   * If we have a moved ipif to bring up, and everything has
8506 8507                   * succeeded to this point, bring it up on the IPMP ill.
8507 8508                   * Otherwise, leave it down -- the admin can try to bring it
8508 8509                   * up by hand if need be.
8509 8510                   */
8510 8511                  if (ill->ill_move_ipif != NULL) {
8511 8512                          if (err != 0) {
8512 8513                                  ill->ill_move_ipif = NULL;
8513 8514                          } else {
8514 8515                                  ipif = ill->ill_move_ipif;
8515 8516                                  ill->ill_move_ipif = NULL;
8516 8517                                  err = ipif_up(ipif, q, mp1);
8517 8518                                  if (err == EINPROGRESS) {
8518 8519                                          freemsg(mp);
8519 8520                                          return;
8520 8521                                  }
8521 8522                          }
8522 8523                  }
8523 8524                  break;
8524 8525  
8525 8526          case DL_NOTIFY_IND: {
8526 8527                  dl_notify_ind_t *notify = (dl_notify_ind_t *)mp->b_rptr;
8527 8528                  uint_t orig_mtu, orig_mc_mtu;
8528 8529  
8529 8530                  switch (notify->dl_notification) {
8530 8531                  case DL_NOTE_PHYS_ADDR:
8531 8532                          err = ill_set_phys_addr(ill, mp);
8532 8533                          break;
8533 8534  
8534 8535                  case DL_NOTE_REPLUMB:
8535 8536                          /*
8536 8537                           * Directly return after calling ill_replumb().
8537 8538                           * Note that we should not free mp as it is reused
8538 8539                           * in the ill_replumb() function.
8539 8540                           */
8540 8541                          err = ill_replumb(ill, mp);
8541 8542                          return;
8542 8543  
8543 8544                  case DL_NOTE_FASTPATH_FLUSH:
8544 8545                          nce_flush(ill, B_FALSE);
8545 8546                          break;
8546 8547  
8547 8548                  case DL_NOTE_SDU_SIZE:
8548 8549                  case DL_NOTE_SDU_SIZE2:
8549 8550                          /*
8550 8551                           * The dce and fragmentation code can cope with
8551 8552                           * this changing while packets are being sent.
8552 8553                           * When packets are sent ip_output will discover
8553 8554                           * a change.
8554 8555                           *
8555 8556                           * Change the MTU size of the interface.
8556 8557                           */
8557 8558                          mutex_enter(&ill->ill_lock);
8558 8559                          orig_mtu = ill->ill_mtu;
8559 8560                          orig_mc_mtu = ill->ill_mc_mtu;
8560 8561                          switch (notify->dl_notification) {
8561 8562                          case DL_NOTE_SDU_SIZE:
8562 8563                                  ill->ill_current_frag =
8563 8564                                      (uint_t)notify->dl_data;
8564 8565                                  ill->ill_mc_mtu = (uint_t)notify->dl_data;
8565 8566                                  break;
8566 8567                          case DL_NOTE_SDU_SIZE2:
8567 8568                                  ill->ill_current_frag =
8568 8569                                      (uint_t)notify->dl_data1;
8569 8570                                  ill->ill_mc_mtu = (uint_t)notify->dl_data2;
8570 8571                                  break;
8571 8572                          }
8572 8573                          if (ill->ill_current_frag > ill->ill_max_frag)
8573 8574                                  ill->ill_max_frag = ill->ill_current_frag;
8574 8575  
8575 8576                          if (!(ill->ill_flags & ILLF_FIXEDMTU)) {
8576 8577                                  ill->ill_mtu = ill->ill_current_frag;
8577 8578  
8578 8579                                  /*
8579 8580                                   * If ill_user_mtu was set (via
8580 8581                                   * SIOCSLIFLNKINFO), clamp ill_mtu at it.
8581 8582                                   */
8582 8583                                  if (ill->ill_user_mtu != 0 &&
8583 8584                                      ill->ill_user_mtu < ill->ill_mtu)
8584 8585                                          ill->ill_mtu = ill->ill_user_mtu;
8585 8586  
8586 8587                                  if (ill->ill_user_mtu != 0 &&
8587 8588                                      ill->ill_user_mtu < ill->ill_mc_mtu)
8588 8589                                          ill->ill_mc_mtu = ill->ill_user_mtu;
8589 8590  
8590 8591                                  if (ill->ill_isv6) {
8591 8592                                          if (ill->ill_mtu < IPV6_MIN_MTU)
8592 8593                                                  ill->ill_mtu = IPV6_MIN_MTU;
8593 8594                                          if (ill->ill_mc_mtu < IPV6_MIN_MTU)
8594 8595                                                  ill->ill_mc_mtu = IPV6_MIN_MTU;
8595 8596                                  } else {
8596 8597                                          if (ill->ill_mtu < IP_MIN_MTU)
8597 8598                                                  ill->ill_mtu = IP_MIN_MTU;
8598 8599                                          if (ill->ill_mc_mtu < IP_MIN_MTU)
8599 8600                                                  ill->ill_mc_mtu = IP_MIN_MTU;
8600 8601                                  }
8601 8602                          } else if (ill->ill_mc_mtu > ill->ill_mtu) {
8602 8603                                  ill->ill_mc_mtu = ill->ill_mtu;
8603 8604                          }
8604 8605  
8605 8606                          mutex_exit(&ill->ill_lock);
8606 8607                          /*
8607 8608                           * Make sure all dce_generation checks find out
8608 8609                           * that ill_mtu/ill_mc_mtu has changed.
8609 8610                           */
8610 8611                          if (orig_mtu != ill->ill_mtu ||
8611 8612                              orig_mc_mtu != ill->ill_mc_mtu) {
8612 8613                                  dce_increment_all_generations(ill->ill_isv6,
8613 8614                                      ill->ill_ipst);
8614 8615                          }
8615 8616  
8616 8617                          /*
8617 8618                           * Refresh IPMP meta-interface MTU if necessary.
8618 8619                           */
8619 8620                          if (IS_UNDER_IPMP(ill))
8620 8621                                  ipmp_illgrp_refresh_mtu(ill->ill_grp);
8621 8622                          break;
8622 8623  
8623 8624                  case DL_NOTE_LINK_UP:
8624 8625                  case DL_NOTE_LINK_DOWN: {
8625 8626                          /*
8626 8627                           * We are writer. ill / phyint / ipsq assocs stable.
8627 8628                           * The RUNNING flag reflects the state of the link.
8628 8629                           */
8629 8630                          phyint_t *phyint = ill->ill_phyint;
8630 8631                          uint64_t new_phyint_flags;
8631 8632                          boolean_t changed = B_FALSE;
8632 8633                          boolean_t went_up;
8633 8634  
8634 8635                          went_up = notify->dl_notification == DL_NOTE_LINK_UP;
8635 8636                          mutex_enter(&phyint->phyint_lock);
8636 8637  
8637 8638                          new_phyint_flags = went_up ?
8638 8639                              phyint->phyint_flags | PHYI_RUNNING :
8639 8640                              phyint->phyint_flags & ~PHYI_RUNNING;
8640 8641  
8641 8642                          if (IS_IPMP(ill)) {
8642 8643                                  new_phyint_flags = went_up ?
8643 8644                                      new_phyint_flags & ~PHYI_FAILED :
8644 8645                                      new_phyint_flags | PHYI_FAILED;
8645 8646                          }
8646 8647  
8647 8648                          if (new_phyint_flags != phyint->phyint_flags) {
8648 8649                                  phyint->phyint_flags = new_phyint_flags;
8649 8650                                  changed = B_TRUE;
8650 8651                          }
8651 8652                          mutex_exit(&phyint->phyint_lock);
8652 8653                          /*
8653 8654                           * ill_restart_dad handles the DAD restart and routing
8654 8655                           * socket notification logic.
8655 8656                           */
8656 8657                          if (changed) {
8657 8658                                  ill_restart_dad(phyint->phyint_illv4, went_up);
8658 8659                                  ill_restart_dad(phyint->phyint_illv6, went_up);
8659 8660                          }
8660 8661                          break;
8661 8662                  }
8662 8663                  case DL_NOTE_PROMISC_ON_PHYS: {
8663 8664                          phyint_t *phyint = ill->ill_phyint;
8664 8665  
8665 8666                          mutex_enter(&phyint->phyint_lock);
8666 8667                          phyint->phyint_flags |= PHYI_PROMISC;
8667 8668                          mutex_exit(&phyint->phyint_lock);
8668 8669                          break;
8669 8670                  }
8670 8671                  case DL_NOTE_PROMISC_OFF_PHYS: {
8671 8672                          phyint_t *phyint = ill->ill_phyint;
8672 8673  
8673 8674                          mutex_enter(&phyint->phyint_lock);
8674 8675                          phyint->phyint_flags &= ~PHYI_PROMISC;
8675 8676                          mutex_exit(&phyint->phyint_lock);
8676 8677                          break;
8677 8678                  }
8678 8679                  case DL_NOTE_CAPAB_RENEG:
8679 8680                          /*
8680 8681                           * Something changed on the driver side.
8681 8682                           * It wants us to renegotiate the capabilities
8682 8683                           * on this ill. One possible cause is the aggregation
8683 8684                           * interface under us where a port got added or
8684 8685                           * went away.
8685 8686                           *
8686 8687                           * If the capability negotiation is already done
8687 8688                           * or is in progress, reset the capabilities and
8688 8689                           * mark the ill's ill_capab_reneg to be B_TRUE,
8689 8690                           * so that when the ack comes back, we can start
8690 8691                           * the renegotiation process.
8691 8692                           *
8692 8693                           * Note that if ill_capab_reneg is already B_TRUE
8693 8694                           * (ill_dlpi_capab_state is IDS_UNKNOWN in this case),
8694 8695                           * the capability resetting request has been sent
8695 8696                           * and the renegotiation has not been started yet;
8696 8697                           * nothing needs to be done in this case.
8697 8698                           */
8698 8699                          ipsq_current_start(ipsq, ill->ill_ipif, 0);
8699 8700                          ill_capability_reset(ill, B_TRUE);
8700 8701                          ipsq_current_finish(ipsq);
8701 8702                          break;
8702 8703  
8703 8704                  case DL_NOTE_ALLOWED_IPS:
8704 8705                          ill_set_allowed_ips(ill, mp);
8705 8706                          break;
8706 8707                  default:
8707 8708                          ip0dbg(("ip_rput_dlpi_writer: unknown notification "
8708 8709                              "type 0x%x for DL_NOTIFY_IND\n",
8709 8710                              notify->dl_notification));
8710 8711                          break;
8711 8712                  }
8712 8713  
8713 8714                  /*
8714 8715                   * As this is an asynchronous operation, we
8715 8716                   * should not call ill_dlpi_done
8716 8717                   */
8717 8718                  break;
8718 8719          }
8719 8720          case DL_NOTIFY_ACK: {
8720 8721                  dl_notify_ack_t *noteack = (dl_notify_ack_t *)mp->b_rptr;
8721 8722  
8722 8723                  if (noteack->dl_notifications & DL_NOTE_LINK_UP)
8723 8724                          ill->ill_note_link = 1;
8724 8725                  ill_dlpi_done(ill, DL_NOTIFY_REQ);
8725 8726                  break;
8726 8727          }
8727 8728          case DL_PHYS_ADDR_ACK: {
8728 8729                  /*
8729 8730                   * As part of plumbing the interface via SIOCSLIFNAME,
8730 8731                   * ill_dl_phys() will queue a series of DL_PHYS_ADDR_REQs,
8731 8732                   * whose answers we receive here.  As each answer is received,
8732 8733                   * we call ill_dlpi_done() to dispatch the next request as
8733 8734                   * we're processing the current one.  Once all answers have
8734 8735                   * been received, we use ipsq_pending_mp_get() to dequeue the
8735 8736                   * outstanding IOCTL and reply to it.  (Because ill_dl_phys()
8736 8737                   * is invoked from an ill queue, conn_oper_pending_ill is not
8737 8738                   * available, but we know the ioctl is pending on ill_wq.)
8738 8739                   */
8739 8740                  uint_t  paddrlen, paddroff;
8740 8741                  uint8_t *addr;
8741 8742  
8742 8743                  paddrreq = ill->ill_phys_addr_pend;
8743 8744                  paddrlen = ((dl_phys_addr_ack_t *)mp->b_rptr)->dl_addr_length;
8744 8745                  paddroff = ((dl_phys_addr_ack_t *)mp->b_rptr)->dl_addr_offset;
8745 8746                  addr = mp->b_rptr + paddroff;
8746 8747  
8747 8748                  ill_dlpi_done(ill, DL_PHYS_ADDR_REQ);
8748 8749                  if (paddrreq == DL_IPV6_TOKEN) {
8749 8750                          /*
8750 8751                           * bcopy to low-order bits of ill_token
8751 8752                           *
8752 8753                           * XXX Temporary hack - currently, all known tokens
8753 8754                           * are 64 bits, so I'll cheat for the moment.
8754 8755                           */
8755 8756                          bcopy(addr, &ill->ill_token.s6_addr32[2], paddrlen);
8756 8757                          ill->ill_token_length = paddrlen;
8757 8758                          break;
8758 8759                  } else if (paddrreq == DL_IPV6_LINK_LAYER_ADDR) {
8759 8760                          ASSERT(ill->ill_nd_lla_mp == NULL);
8760 8761                          ill_set_ndmp(ill, mp, paddroff, paddrlen);
8761 8762                          mp = NULL;
8762 8763                          break;
8763 8764                  } else if (paddrreq == DL_CURR_DEST_ADDR) {
8764 8765                          ASSERT(ill->ill_dest_addr_mp == NULL);
8765 8766                          ill->ill_dest_addr_mp = mp;
8766 8767                          ill->ill_dest_addr = addr;
8767 8768                          mp = NULL;
8768 8769                          if (ill->ill_isv6) {
8769 8770                                  ill_setdesttoken(ill);
8770 8771                                  ipif_setdestlinklocal(ill->ill_ipif);
8771 8772                          }
8772 8773                          break;
8773 8774                  }
8774 8775  
8775 8776                  ASSERT(paddrreq == DL_CURR_PHYS_ADDR);
8776 8777                  ASSERT(ill->ill_phys_addr_mp == NULL);
8777 8778                  if (!ill->ill_ifname_pending)
8778 8779                          break;
8779 8780                  ill->ill_ifname_pending = 0;
8780 8781                  if (!ioctl_aborted)
8781 8782                          mp1 = ipsq_pending_mp_get(ipsq, &connp);
8782 8783                  if (mp1 != NULL) {
8783 8784                          ASSERT(connp == NULL);
8784 8785                          q = ill->ill_wq;
8785 8786                  }
8786 8787                  /*
8787 8788                   * If any error acks received during the plumbing sequence,
8788 8789                   * ill_ifname_pending_err will be set. Break out and send up
8789 8790                   * the error to the pending ioctl.
8790 8791                   */
8791 8792                  if (ill->ill_ifname_pending_err != 0) {
8792 8793                          err = ill->ill_ifname_pending_err;
8793 8794                          ill->ill_ifname_pending_err = 0;
8794 8795                          break;
8795 8796                  }
8796 8797  
8797 8798                  ill->ill_phys_addr_mp = mp;
8798 8799                  ill->ill_phys_addr = (paddrlen == 0 ? NULL : addr);
8799 8800                  mp = NULL;
8800 8801  
8801 8802                  /*
8802 8803                   * If paddrlen or ill_phys_addr_length is zero, the DLPI
8803 8804                   * provider doesn't support physical addresses.  We check both
8804 8805                   * paddrlen and ill_phys_addr_length because sppp (PPP) does
8805 8806                   * not have physical addresses, but historically advertises a
8806 8807                   * physical address length of 0 in its DL_INFO_ACK, but 6 in
8807 8808                   * its DL_PHYS_ADDR_ACK.
8808 8809                   */
8809 8810                  if (paddrlen == 0 || ill->ill_phys_addr_length == 0) {
8810 8811                          ill->ill_phys_addr = NULL;
8811 8812                  } else if (paddrlen != ill->ill_phys_addr_length) {
8812 8813                          ip0dbg(("DL_PHYS_ADDR_ACK: got addrlen %d, expected %d",
8813 8814                              paddrlen, ill->ill_phys_addr_length));
8814 8815                          err = EINVAL;
8815 8816                          break;
8816 8817                  }
8817 8818  
8818 8819                  if (ill->ill_nd_lla_mp == NULL) {
8819 8820                          if ((mp_hw = copyb(ill->ill_phys_addr_mp)) == NULL) {
8820 8821                                  err = ENOMEM;
8821 8822                                  break;
8822 8823                          }
8823 8824                          ill_set_ndmp(ill, mp_hw, paddroff, paddrlen);
8824 8825                  }
8825 8826  
8826 8827                  if (ill->ill_isv6) {
8827 8828                          ill_setdefaulttoken(ill);
8828 8829                          ipif_setlinklocal(ill->ill_ipif);
8829 8830                  }
8830 8831                  break;
8831 8832          }
8832 8833          case DL_OK_ACK:
8833 8834                  ip2dbg(("DL_OK_ACK %s (0x%x)\n",
8834 8835                      dl_primstr((int)dloa->dl_correct_primitive),
8835 8836                      dloa->dl_correct_primitive));
8836 8837                  DTRACE_PROBE3(ill__dlpi, char *, "ip_rput_dlpi_writer ok",
8837 8838                      char *, dl_primstr(dloa->dl_correct_primitive),
8838 8839                      ill_t *, ill);
8839 8840  
8840 8841                  switch (dloa->dl_correct_primitive) {
8841 8842                  case DL_ENABMULTI_REQ:
8842 8843                  case DL_DISABMULTI_REQ:
8843 8844                          ill_dlpi_done(ill, dloa->dl_correct_primitive);
8844 8845                          break;
8845 8846                  case DL_PROMISCON_REQ:
8846 8847                  case DL_PROMISCOFF_REQ:
8847 8848                  case DL_UNBIND_REQ:
8848 8849                  case DL_ATTACH_REQ:
8849 8850                          ill_dlpi_done(ill, dloa->dl_correct_primitive);
8850 8851                          break;
8851 8852                  }
8852 8853                  break;
8853 8854          default:
8854 8855                  break;
8855 8856          }
8856 8857  
8857 8858          freemsg(mp);
8858 8859          if (mp1 == NULL)
8859 8860                  return;
8860 8861  
8861 8862          /*
8862 8863           * The operation must complete without EINPROGRESS since
8863 8864           * ipsq_pending_mp_get() has removed the mblk (mp1).  Otherwise,
8864 8865           * the operation will be stuck forever inside the IPSQ.
8865 8866           */
8866 8867          ASSERT(err != EINPROGRESS);
8867 8868  
8868 8869          DTRACE_PROBE4(ipif__ioctl, char *, "ip_rput_dlpi_writer finish",
8869 8870              int, ipsq->ipsq_xop->ipx_current_ioctl, ill_t *, ill,
8870 8871              ipif_t *, NULL);
8871 8872  
8872 8873          switch (ipsq->ipsq_xop->ipx_current_ioctl) {
8873 8874          case 0:
8874 8875                  ipsq_current_finish(ipsq);
8875 8876                  break;
8876 8877  
8877 8878          case SIOCSLIFNAME:
8878 8879          case IF_UNITSEL: {
8879 8880                  ill_t *ill_other = ILL_OTHER(ill);
8880 8881  
8881 8882                  /*
8882 8883                   * If SIOCSLIFNAME or IF_UNITSEL is about to succeed, and the
8883 8884                   * ill has a peer which is in an IPMP group, then place ill
8884 8885                   * into the same group.  One catch: although ifconfig plumbs
8885 8886                   * the appropriate IPMP meta-interface prior to plumbing this
8886 8887                   * ill, it is possible for multiple ifconfig applications to
8887 8888                   * race (or for another application to adjust plumbing), in
8888 8889                   * which case the IPMP meta-interface we need will be missing.
8889 8890                   * If so, kick the phyint out of the group.
8890 8891                   */
8891 8892                  if (err == 0 && ill_other != NULL && IS_UNDER_IPMP(ill_other)) {
8892 8893                          ipmp_grp_t      *grp = ill->ill_phyint->phyint_grp;
8893 8894                          ipmp_illgrp_t   *illg;
8894 8895  
8895 8896                          illg = ill->ill_isv6 ? grp->gr_v6 : grp->gr_v4;
8896 8897                          if (illg == NULL)
8897 8898                                  ipmp_phyint_leave_grp(ill->ill_phyint);
8898 8899                          else
8899 8900                                  ipmp_ill_join_illgrp(ill, illg);
8900 8901                  }
8901 8902  
8902 8903                  if (ipsq->ipsq_xop->ipx_current_ioctl == IF_UNITSEL)
8903 8904                          ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq);
8904 8905                  else
8905 8906                          ip_ioctl_finish(q, mp1, err, COPYOUT, ipsq);
8906 8907                  break;
8907 8908          }
8908 8909          case SIOCLIFADDIF:
8909 8910                  ip_ioctl_finish(q, mp1, err, COPYOUT, ipsq);
8910 8911                  break;
8911 8912  
8912 8913          default:
8913 8914                  ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq);
8914 8915                  break;
8915 8916          }
8916 8917  }
8917 8918  
8918 8919  /*
8919 8920   * ip_rput_other is called by ip_rput to handle messages modifying the global
8920 8921   * state in IP.  If 'ipsq' is non-NULL, caller is writer on it.
            *
            * The message type (mp->b_datap->db_type) selects the action:
            *
            *   M_ERROR, M_HANGUP
            *      ill_error is taken from the first byte of the message when the
            *      message has data, and defaults to ENXIO if it is still zero.
            *      ill_down_start() is then called; if it returns true,
            *      ipif_all_down_tail() is called with the same ipsq, q and mp.
            *      'ipsq' is asserted to be non-NULL for these types.
            *
            *   M_IOCNAK
            *      Asserted to be a reply to DL_IOC_HDR_INFO.  If the fastpath
            *      state is IDS_INPROGRESS it is changed to IDS_FAILED under
            *      ill_lock.  The message is freed here.
            *
            * Any other message type trips ASSERT(0).
8921 8922   */
8922 8923  /* ARGSUSED */
8923 8924  void
8924 8925  ip_rput_other(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
8925 8926  {
8926 8927          ill_t           *ill = q->q_ptr;
8927 8928          struct iocblk   *iocp;
8928 8929  
8929 8930          ip1dbg(("ip_rput_other "));
8930 8931          if (ipsq != NULL) {
                           /* The ipsq handed in must share its xop with this ill's ipsq. */
8931 8932                  ASSERT(IAM_WRITER_IPSQ(ipsq));
8932 8933                  ASSERT(ipsq->ipsq_xop ==
8933 8934                      ill->ill_phyint->phyint_ipsq->ipsq_xop);
8934 8935          }
8935 8936  
8936 8937          switch (mp->b_datap->db_type) {
8937 8938          case M_ERROR:
8938 8939          case M_HANGUP:
8939 8940                  /*
8940 8941                   * The device has a problem.  We force the ILL down.  It can
8941 8942                   * be brought up again manually using SIOCSIFFLAGS (via
8942 8943                   * ifconfig or equivalent).
8943 8944                   */
8944 8945                  ASSERT(ipsq != NULL);
                           /* Use the first data byte, if there is one, as the error. */
8945 8946                  if (mp->b_rptr < mp->b_wptr)
8946 8947                          ill->ill_error = (int)(*mp->b_rptr & 0xFF);
                           /* Never leave ill_error at zero on this path. */
8947 8948                  if (ill->ill_error == 0)
8948 8949                          ill->ill_error = ENXIO;
                           /*
                            * When ill_down_start() returns false nothing more is done
                            * here; in particular mp is not freed by this function.
                            */
8949 8950                  if (!ill_down_start(q, mp))
8950 8951                          return;
8951 8952                  ipif_all_down_tail(ipsq, q, mp, NULL);
8952 8953                  break;
8953 8954          case M_IOCNAK: {
8954 8955                  iocp = (struct iocblk *)mp->b_rptr;
8955 8956  
8956 8957                  ASSERT(iocp->ioc_cmd == DL_IOC_HDR_INFO);
8957 8958                  /*
8958 8959                   * If this was the first attempt, turn off the fastpath
8959 8960                   * probing.
8960 8961                   */
8961 8962                  mutex_enter(&ill->ill_lock);
8962 8963                  if (ill->ill_dlpi_fastpath_state == IDS_INPROGRESS) {
8963 8964                          ill->ill_dlpi_fastpath_state = IDS_FAILED;
8964 8965                          mutex_exit(&ill->ill_lock);
8965 8966                          /*
8966 8967                           * don't flush the nce_t entries: we use them
8967 8968                           * as an index to the ncec itself.
8968 8969                           */
8969 8970                          ip1dbg(("ip_rput: DLPI fastpath off on interface %s\n",
8970 8971                              ill->ill_name));
8971 8972                  } else {
8972 8973                          mutex_exit(&ill->ill_lock);
8973 8974                  }
8974 8975                  freemsg(mp);
8975 8976                  break;
8976 8977          }
8977 8978          default:
                           /* No other message type is expected to reach this routine. */
8978 8979                  ASSERT(0);
8979 8980                  break;
8980 8981          }
8981 8982  }
8982 8983  
8983 8984  /*
8984 8985   * Update any source route, record route or timestamp options
8985 8986   * When it fails it has consumed the message and BUMPed the MIB.
            *
            * Walks the IPv4 options of 'ipha' with ipoptp_first()/ipoptp_next() and
            * rewrites them in place.  'dst_ill' is the ill passed to
            * ip_select_source_v4() to choose the address recorded in the options.
            *
            * Returns B_TRUE once every option has been visited.  The only B_FALSE
            * return is for a source route option seen while
            * ips_ip_forward_src_routed is clear; on that path the packet has been
            * passed to ip_drop_input() and icmp_unreachable().
8986 8987   */
8987 8988  boolean_t
8988 8989  ip_forward_options(mblk_t *mp, ipha_t *ipha, ill_t *dst_ill,
8989 8990      ip_recv_attr_t *ira)
8990 8991  {
8991 8992          ipoptp_t        opts;
8992 8993          uchar_t         *opt;
8993 8994          uint8_t         optval;
8994 8995          uint8_t         optlen;
8995 8996          ipaddr_t        dst;
8996 8997          ipaddr_t        ifaddr;
8997 8998          uint32_t        ts;
8998 8999          timestruc_t     now;
8999 9000          ip_stack_t      *ipst = ira->ira_ill->ill_ipst;
9000 9001  
9001 9002          ip2dbg(("ip_forward_options\n"));
9002 9003          dst = ipha->ipha_dst;
9003 9004          opt = NULL;
9004 9005  
9005 9006          for (optval = ipoptp_first(&opts, ipha);
9006 9007              optval != IPOPT_EOL;
9007 9008              optval = ipoptp_next(&opts)) {
9008 9009                  ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
9009 9010                  opt = opts.ipoptp_cur;
9010 9011                  optlen = opts.ipoptp_len;
9011 9012                  ip2dbg(("ip_forward_options: opt %d, len %d\n",
9012 9013                      optval, opts.ipoptp_len));
9013 9014                  switch (optval) {
9014 9015                          uint32_t off;
9015 9016                  case IPOPT_SSRR:
9016 9017                  case IPOPT_LSRR:
9017 9018                          /* Check if administratively disabled */
9018 9019                          if (!ipst->ips_ip_forward_src_routed) {
9019 9020                                  BUMP_MIB(dst_ill->ill_ip_mib,
9020 9021                                      ipIfStatsForwProhibits);
9021 9022                                  ip_drop_input("ICMP_SOURCE_ROUTE_FAILED",
9022 9023                                      mp, dst_ill);
9023 9024                                  icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED,
9024 9025                                      ira);
9025 9026                                  return (B_FALSE);
9026 9027                          }
9027 9028                          if (ip_type_v4(dst, ipst) != IRE_LOCAL) {
9028 9029                                  /*
9029 9030                                   * Must be partial since ip_input_options
9030 9031                                   * checked for strict.
9031 9032                                   */
9032 9033                                  break;
9033 9034                          }
                                   /*
                                    * Turn the option's pointer byte into a zero-based
                                    * byte offset from the start of the option.
                                    */
9034 9035                          off = opt[IPOPT_OFFSET];
9035 9036                          off--;
9036 9037                  redo_srr:
9037 9038                          if (optlen < IP_ADDR_LEN ||
9038 9039                              off > optlen - IP_ADDR_LEN) {
9039 9040                                  /* End of source route */
9040 9041                                  ip1dbg((
9041 9042                                      "ip_forward_options: end of SR\n"));
9042 9043                                  break;
9043 9044                          }
9044 9045                          /* Pick a reasonable address on the outbound if */
9045 9046                          ASSERT(dst_ill != NULL);
9046 9047                          if (ip_select_source_v4(dst_ill, INADDR_ANY, dst,
9047 9048                              INADDR_ANY, ALL_ZONES, ipst, &ifaddr, NULL,
9048 9049                              NULL) != 0) {
9049 9050                                  /* No source! Shouldn't happen */
9050 9051                                  ifaddr = INADDR_ANY;
9051 9052                          }
                                   /*
                                    * Swap: the entry at 'off' becomes the next hop
                                    * and its slot is overwritten with ifaddr.
                                    */
9052 9053                          bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
9053 9054                          bcopy(&ifaddr, (char *)opt + off, IP_ADDR_LEN);
9054 9055                          ip1dbg(("ip_forward_options: next hop 0x%x\n",
9055 9056                              ntohl(dst)));
9056 9057  
9057 9058                          /*
9058 9059                           * Check if our address is present more than
9059 9060                           * once as consecutive hops in source route.
9060 9061                           */
9061 9062                          if (ip_type_v4(dst, ipst) == IRE_LOCAL) {
9062 9063                                  off += IP_ADDR_LEN;
9063 9064                                  opt[IPOPT_OFFSET] += IP_ADDR_LEN;
9064 9065                                  goto redo_srr;
9065 9066                          }
                                   /* Forward toward the hop just taken from the option. */
9066 9067                          ipha->ipha_dst = dst;
9067 9068                          opt[IPOPT_OFFSET] += IP_ADDR_LEN;
9068 9069                          break;
9069 9070                  case IPOPT_RR:
9070 9071                          off = opt[IPOPT_OFFSET];
9071 9072                          off--;
9072 9073                          if (optlen < IP_ADDR_LEN ||
9073 9074                              off > optlen - IP_ADDR_LEN) {
9074 9075                                  /* No more room - ignore */
9075 9076                                  ip1dbg((
9076 9077                                      "ip_forward_options: end of RR\n"));
9077 9078                                  break;
9078 9079                          }
9079 9080                          /* Pick a reasonable address on the outbound if */
9080 9081                          ASSERT(dst_ill != NULL);
9081 9082                          if (ip_select_source_v4(dst_ill, INADDR_ANY, dst,
9082 9083                              INADDR_ANY, ALL_ZONES, ipst, &ifaddr, NULL,
9083 9084                              NULL) != 0) {
9084 9085                                  /* No source! Shouldn't happen */
9085 9086                                  ifaddr = INADDR_ANY;
9086 9087                          }
9087 9088                          bcopy(&ifaddr, (char *)opt + off, IP_ADDR_LEN);
9088 9089                          opt[IPOPT_OFFSET] += IP_ADDR_LEN;
9089 9090                          break;
9090 9091                  case IPOPT_TS:
9091 9092                          off = 0;
9092 9093                          /* Insert timestamp if there is room */
                                   /*
                                    * In this first switch 'off' is set to the size of
                                    * the entry to be written; it feeds the room check
                                    * that follows the switch.
                                    */
9093 9094                          switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
9094 9095                          case IPOPT_TS_TSONLY:
9095 9096                                  off = IPOPT_TS_TIMELEN;
9096 9097                                  break;
9097 9098                          case IPOPT_TS_PRESPEC:
9098 9099                          case IPOPT_TS_PRESPEC_RFC791:
9099 9100                                  /* Verify that the address matched */
9100 9101                                  off = opt[IPOPT_OFFSET] - 1;
9101 9102                                  bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
9102 9103                                  if (ip_type_v4(dst, ipst) != IRE_LOCAL) {
9103 9104                                          /* Not for us */
                                                   /*
                                                    * NOTE(review): this break only leaves
                                                    * the inner switch, so the room check
                                                    * and the insertion code below still
                                                    * run, with off left at
                                                    * opt[IPOPT_OFFSET] - 1.  Confirm that
                                                    * this is intended.
                                                    */
9104 9105                                          break;
9105 9106                                  }
9106 9107                                  /* FALLTHROUGH */
9107 9108                          case IPOPT_TS_TSANDADDR:
9108 9109                                  off = IP_ADDR_LEN + IPOPT_TS_TIMELEN;
9109 9110                                  break;
9110 9111                          default:
9111 9112                                  /*
9112 9113                                   * ip_*put_options should have already
9113 9114                                   * dropped this packet.
9114 9115                                   */
9115 9116                                  cmn_err(CE_PANIC, "ip_forward_options: "
9116 9117                                      "unknown IT - bug in ip_input_options?\n");
9117 9118                          }
9118 9119                          if (opt[IPOPT_OFFSET] - 1 + off > optlen) {
9119 9120                                  /* Increase overflow counter */
                                           /*
                                            * The counter is the high nibble of the
                                            * flags byte; the low nibble is preserved.
                                            */
9120 9121                                  off = (opt[IPOPT_POS_OV_FLG] >> 4) + 1;
9121 9122                                  opt[IPOPT_POS_OV_FLG] =
9122 9123                                      (uint8_t)((opt[IPOPT_POS_OV_FLG] & 0x0F) |
9123 9124                                      (off << 4));
9124 9125                                  break;
9125 9126                          }
                                   /* From here on 'off' is the write position again. */
9126 9127                          off = opt[IPOPT_OFFSET] - 1;
9127 9128                          switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
9128 9129                          case IPOPT_TS_PRESPEC:
9129 9130                          case IPOPT_TS_PRESPEC_RFC791:
9130 9131                          case IPOPT_TS_TSANDADDR:
9131 9132                                  /* Pick a reasonable addr on the outbound if */
9132 9133                                  ASSERT(dst_ill != NULL);
9133 9134                                  if (ip_select_source_v4(dst_ill, INADDR_ANY,
9134 9135                                      dst, INADDR_ANY, ALL_ZONES, ipst, &ifaddr,
9135 9136                                      NULL, NULL) != 0) {
9136 9137                                          /* No source! Shouldn't happen */
9137 9138                                          ifaddr = INADDR_ANY;
9138 9139                                  }
9139 9140                                  bcopy(&ifaddr, (char *)opt + off, IP_ADDR_LEN);
9140 9141                                  opt[IPOPT_OFFSET] += IP_ADDR_LEN;
9141 9142                                  /* FALLTHROUGH */
9142 9143                          case IPOPT_TS_TSONLY:
9143 9144                                  off = opt[IPOPT_OFFSET] - 1;
9144 9145                                  /* Compute # of milliseconds since midnight */
9145 9146                                  gethrestime(&now);
9146 9147                                  ts = (now.tv_sec % (24 * 60 * 60)) * 1000 +
9147 9148                                      NSEC2MSEC(now.tv_nsec);
9148 9149                                  bcopy(&ts, (char *)opt + off, IPOPT_TS_TIMELEN);
9149 9150                                  opt[IPOPT_OFFSET] += IPOPT_TS_TIMELEN;
9150 9151                                  break;
9151 9152                          }
9152 9153                          break;
9153 9154                  }
9154 9155          }
9155 9156          return (B_TRUE);
9156 9157  }
9157 9158  
9158 9159  /*
9159 9160   * Call ill_frag_timeout to do garbage collection. ill_frag_timeout
9160 9161   * returns 'true' if there are still fragments left on the queue, in
9161 9162   * which case we restart the timer.
            *
            * 'arg' is the ill_t the timer was posted for.  ill_lock is dropped while
            * ill_frag_timeout() runs; ill_fragtimer_executing is set for that window
            * so that ill_frag_timer_start() records a restart request in
            * ill_fragtimer_needrestart instead of posting another timeout.
9162 9163   */
9163 9164  void
9164 9165  ill_frag_timer(void *arg)
9165 9166  {
9166 9167          ill_t   *ill = (ill_t *)arg;
9167 9168          boolean_t frag_pending;
9168 9169          ip_stack_t *ipst = ill->ill_ipst;
                   /*
                    * Note: within this function 'timeout' names the reassembly timeout
                    * value, not the timeout() routine used by ill_frag_timer_start().
                    */
9169 9170          time_t  timeout;
9170 9171  
9171 9172          mutex_enter(&ill->ill_lock);
9172 9173          ASSERT(!ill->ill_fragtimer_executing);
9173 9174          if (ill->ill_state_flags & ILL_CONDEMNED) {
                           /* Condemned ill: forget the timer id; do not run or re-arm. */
9174 9175                  ill->ill_frag_timer_id = 0;
9175 9176                  mutex_exit(&ill->ill_lock);
9176 9177                  return;
9177 9178          }
9178 9179          ill->ill_fragtimer_executing = 1;
9179 9180          mutex_exit(&ill->ill_lock);
9180 9181  
                   /* Pick the reassembly timeout matching this ill's address family. */
9181 9182          timeout = (ill->ill_isv6 ? ipst->ips_ipv6_reassembly_timeout :
9182 9183              ipst->ips_ip_reassembly_timeout);
9183 9184  
9184 9185          frag_pending = ill_frag_timeout(ill, timeout);
9185 9186  
9186 9187          /*
9187 9188           * Restart the timer, if we have fragments pending or if someone
9188 9189           * wanted us to be scheduled again.
9189 9190           */
9190 9191          mutex_enter(&ill->ill_lock);
                   /*
                    * Both fields are cleared before the call below:
                    * ill_frag_timer_start() only posts a timeout when the handler is
                    * not marked as executing and ill_frag_timer_id is zero.
                    */
9191 9192          ill->ill_fragtimer_executing = 0;
9192 9193          ill->ill_frag_timer_id = 0;
9193 9194          if (frag_pending || ill->ill_fragtimer_needrestart)
9194 9195                  ill_frag_timer_start(ill);
9195 9196          mutex_exit(&ill->ill_lock);
9196 9197  }
9197 9198  
           /*
            * Arrange for ill_frag_timer() to be called for 'ill'.
            *
            * The caller must hold ill_lock (asserted).  Nothing is done when the ill
            * has ILL_CONDEMNED set.  If ill_frag_timer() is currently executing, only
            * ill_fragtimer_needrestart is set.  Otherwise, if no timer id is recorded
            * in ill_frag_timer_id, a timeout is posted and ill_fragtimer_needrestart
            * is cleared.
            */
9198 9199  void
9199 9200  ill_frag_timer_start(ill_t *ill)
9200 9201  {
9201 9202          ip_stack_t *ipst = ill->ill_ipst;
9202 9203          clock_t timeo_ms;
9203 9204  
9204 9205          ASSERT(MUTEX_HELD(&ill->ill_lock));
9205 9206  
9206 9207          /* If the ill is closing or opening don't proceed */
9207 9208          if (ill->ill_state_flags & ILL_CONDEMNED)
9208 9209                  return;
9209 9210  
9210 9211          if (ill->ill_fragtimer_executing) {
9211 9212                  /*
9212 9213                   * ill_frag_timer is currently executing. Just record the
9213 9214                   * fact that we want the timer to be restarted.
9214 9215                   * ill_frag_timer will post a timeout before it returns,
9215 9216                   * ensuring it will be called again.
9216 9217                   */
9217 9218                  ill->ill_fragtimer_needrestart = 1;
9218 9219                  return;
9219 9220          }
9220 9221  
9221 9222          if (ill->ill_frag_timer_id == 0) {
                           /*
                            * Reassembly timeout for this ill's address family, scaled
                            * by SECONDS (presumably milliseconds per second, given the
                            * variable name and MSEC_TO_TICK below - confirm).
                            */
9222 9223                  timeo_ms = (ill->ill_isv6 ? ipst->ips_ipv6_reassembly_timeout :
9223 9224                      ipst->ips_ip_reassembly_timeout) * SECONDS;
9224 9225  
9225 9226                  /*
9226 9227                   * The timer is neither running nor is the timeout handler
9227 9228                   * executing. Post a timeout so that ill_frag_timer will be
9228 9229                   * called
9229 9230                   */
                           /* The timer fires after half of timeo_ms (timeo_ms >> 1). */
9230 9231                  ill->ill_frag_timer_id = timeout(ill_frag_timer, ill,
9231 9232                      MSEC_TO_TICK(timeo_ms >> 1));
9232 9233                  ill->ill_fragtimer_needrestart = 0;
9233 9234          }
9234 9235  }
9235 9236  
9236 9237  /*
9237 9238   * Update any source route, record route or timestamp options.
9238 9239   * Check that we are at end of strict source route.
9239 9240   * The options have already been checked for sanity in ip_input_options().
            *
            * The options of 'ipha' are rewritten in place.  The ill used for source
            * address selection and for the drop accounting is ira->ira_ill.
            *
            * Returns B_TRUE when every option has been visited.  Returns B_FALSE via
            * bad_src_route when a strict source route (IPOPT_SSRR) still has unused
            * entries; on that path the checksum flags on 'mp' are cleared and the
            * packet is passed to ip_drop_input() and icmp_unreachable().
9240 9241   */
9241 9242  boolean_t
9242 9243  ip_input_local_options(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
9243 9244  {
9244 9245          ipoptp_t        opts;
9245 9246          uchar_t         *opt;
9246 9247          uint8_t         optval;
9247 9248          uint8_t         optlen;
9248 9249          ipaddr_t        dst;
9249 9250          ipaddr_t        ifaddr;
9250 9251          uint32_t        ts;
9251 9252          timestruc_t     now;
9252 9253          ill_t           *ill = ira->ira_ill;
9253 9254          ip_stack_t      *ipst = ill->ill_ipst;
9254 9255  
9255 9256          ip2dbg(("ip_input_local_options\n"));
9256 9257          opt = NULL;
9257 9258  
9258 9259          for (optval = ipoptp_first(&opts, ipha);
9259 9260              optval != IPOPT_EOL;
9260 9261              optval = ipoptp_next(&opts)) {
9261 9262                  ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
9262 9263                  opt = opts.ipoptp_cur;
9263 9264                  optlen = opts.ipoptp_len;
9264 9265                  ip2dbg(("ip_input_local_options: opt %d, len %d\n",
9265 9266                      optval, optlen));
9266 9267                  switch (optval) {
9267 9268                          uint32_t off;
9268 9269                  case IPOPT_SSRR:
9269 9270                  case IPOPT_LSRR:
                                   /*
                                    * Turn the option's pointer byte into a zero-based
                                    * byte offset from the start of the option.
                                    */
9270 9271                          off = opt[IPOPT_OFFSET];
9271 9272                          off--;
9272 9273                          if (optlen < IP_ADDR_LEN ||
9273 9274                              off > optlen - IP_ADDR_LEN) {
9274 9275                                  /* End of source route */
9275 9276                                  ip1dbg(("ip_input_local_options: end of SR\n"));
9276 9277                                  break;
9277 9278                          }
9278 9279                          /*
9279 9280                           * This will only happen if two consecutive entries
9280 9281                           * in the source route contains our address or if
9281 9282                           * it is a packet with a loose source route which
9282 9283                           * reaches us before consuming the whole source route
9283 9284                           */
9284 9285                          ip1dbg(("ip_input_local_options: not end of SR\n"));
9285 9286                          if (optval == IPOPT_SSRR) {
9286 9287                                  goto bad_src_route;
9287 9288                          }
9288 9289                          /*
9289 9290                           * Hack: instead of dropping the packet truncate the
9290 9291                           * source route to what has been used by filling the
9291 9292                           * rest with IPOPT_NOP.
9292 9293                           */
                                   /* The length byte shrinks to the consumed part. */
9293 9294                          opt[IPOPT_OLEN] = (uint8_t)off;
9294 9295                          while (off < optlen) {
9295 9296                                  opt[off++] = IPOPT_NOP;
9296 9297                          }
9297 9298                          break;
9298 9299                  case IPOPT_RR:
9299 9300                          off = opt[IPOPT_OFFSET];
9300 9301                          off--;
9301 9302                          if (optlen < IP_ADDR_LEN ||
9302 9303                              off > optlen - IP_ADDR_LEN) {
9303 9304                                  /* No more room - ignore */
9304 9305                                  ip1dbg((
9305 9306                                      "ip_input_local_options: end of RR\n"));
9306 9307                                  break;
9307 9308                          }
9308 9309                          /* Pick a reasonable address on the outbound if */
9309 9310                          if (ip_select_source_v4(ill, INADDR_ANY, ipha->ipha_dst,
9310 9311                              INADDR_ANY, ALL_ZONES, ipst, &ifaddr, NULL,
9311 9312                              NULL) != 0) {
9312 9313                                  /* No source! Shouldn't happen */
9313 9314                                  ifaddr = INADDR_ANY;
9314 9315                          }
9315 9316                          bcopy(&ifaddr, (char *)opt + off, IP_ADDR_LEN);
9316 9317                          opt[IPOPT_OFFSET] += IP_ADDR_LEN;
9317 9318                          break;
9318 9319                  case IPOPT_TS:
9319 9320                          off = 0;
9320 9321                          /* Insert timestamp if there is room */
                                   /*
                                    * In this first switch 'off' is set to the size of
                                    * the entry to be written; it feeds the room check
                                    * that follows the switch.
                                    */
9321 9322                          switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
9322 9323                          case IPOPT_TS_TSONLY:
9323 9324                                  off = IPOPT_TS_TIMELEN;
9324 9325                                  break;
9325 9326                          case IPOPT_TS_PRESPEC:
9326 9327                          case IPOPT_TS_PRESPEC_RFC791:
9327 9328                                  /* Verify that the address matched */
9328 9329                                  off = opt[IPOPT_OFFSET] - 1;
9329 9330                                  bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
9330 9331                                  if (ip_type_v4(dst, ipst) != IRE_LOCAL) {
9331 9332                                          /* Not for us */
                                                   /*
                                                    * NOTE(review): this break only leaves
                                                    * the inner switch, so the room check
                                                    * and the insertion code below still
                                                    * run, with off left at
                                                    * opt[IPOPT_OFFSET] - 1.  Confirm that
                                                    * this is intended.
                                                    */
9332 9333                                          break;
9333 9334                                  }
9334 9335                                  /* FALLTHROUGH */
9335 9336                          case IPOPT_TS_TSANDADDR:
9336 9337                                  off = IP_ADDR_LEN + IPOPT_TS_TIMELEN;
9337 9338                                  break;
9338 9339                          default:
9339 9340                                  /*
9340 9341                                   * ip_*put_options should have already
9341 9342                                   * dropped this packet.
9342 9343                                   */
9343 9344                                  cmn_err(CE_PANIC, "ip_input_local_options: "
9344 9345                                      "unknown IT - bug in ip_input_options?\n");
9345 9346                          }
9346 9347                          if (opt[IPOPT_OFFSET] - 1 + off > optlen) {
9347 9348                                  /* Increase overflow counter */
                                           /*
                                            * The counter is the high nibble of the
                                            * flags byte; the low nibble is preserved.
                                            */
9348 9349                                  off = (opt[IPOPT_POS_OV_FLG] >> 4) + 1;
9349 9350                                  opt[IPOPT_POS_OV_FLG] =
9350 9351                                      (uint8_t)((opt[IPOPT_POS_OV_FLG] & 0x0F) |
9351 9352                                      (off << 4));
9352 9353                                  break;
9353 9354                          }
                                   /* From here on 'off' is the write position again. */
9354 9355                          off = opt[IPOPT_OFFSET] - 1;
9355 9356                          switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
9356 9357                          case IPOPT_TS_PRESPEC:
9357 9358                          case IPOPT_TS_PRESPEC_RFC791:
9358 9359                          case IPOPT_TS_TSANDADDR:
9359 9360                                  /* Pick a reasonable addr on the outbound if */
9360 9361                                  if (ip_select_source_v4(ill, INADDR_ANY,
9361 9362                                      ipha->ipha_dst, INADDR_ANY, ALL_ZONES, ipst,
9362 9363                                      &ifaddr, NULL, NULL) != 0) {
9363 9364                                          /* No source! Shouldn't happen */
9364 9365                                          ifaddr = INADDR_ANY;
9365 9366                                  }
9366 9367                                  bcopy(&ifaddr, (char *)opt + off, IP_ADDR_LEN);
9367 9368                                  opt[IPOPT_OFFSET] += IP_ADDR_LEN;
9368 9369                                  /* FALLTHROUGH */
9369 9370                          case IPOPT_TS_TSONLY:
9370 9371                                  off = opt[IPOPT_OFFSET] - 1;
9371 9372                                  /* Compute # of milliseconds since midnight */
9372 9373                                  gethrestime(&now);
9373 9374                                  ts = (now.tv_sec % (24 * 60 * 60)) * 1000 +
9374 9375                                      NSEC2MSEC(now.tv_nsec);
9375 9376                                  bcopy(&ts, (char *)opt + off, IPOPT_TS_TIMELEN);
9376 9377                                  opt[IPOPT_OFFSET] += IPOPT_TS_TIMELEN;
9377 9378                                  break;
9378 9379                          }
9379 9380                          break;
9380 9381                  }
9381 9382          }
9382 9383          return (B_TRUE);
9383 9384  
9384 9385  bad_src_route:
9385 9386          /* make sure we clear any indication of a hardware checksum */
9386 9387          DB_CKSUMFLAGS(mp) = 0;
9387 9388          ip_drop_input("ICMP_SOURCE_ROUTE_FAILED", mp, ill);
9388 9389          icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, ira);
9389 9390          return (B_FALSE);
9390 9391  
9391 9392  }
9392 9393  
9393 9394  /*
9394 9395   * Process IP options in an inbound packet.  Always returns the nexthop.
9395 9396   * Normally this is the passed in nexthop, but if there is an option
9396 9397   * that affects the nexthop (such as a source route) that will be returned.
9397 9398   * Sets *errorp if there is an error, in which case an ICMP error has been sent
9398 9399   * and mp freed.
9399 9400   */
9400 9401  ipaddr_t
9401 9402  ip_input_options(ipha_t *ipha, ipaddr_t dst, mblk_t *mp,
9402 9403      ip_recv_attr_t *ira, int *errorp)
9403 9404  {
9404 9405          ip_stack_t      *ipst = ira->ira_ill->ill_ipst;
9405 9406          ipoptp_t        opts;
9406 9407          uchar_t         *opt;
9407 9408          uint8_t         optval;
9408 9409          uint8_t         optlen;
9409 9410          intptr_t        code = 0;
9410 9411          ire_t           *ire;
9411 9412  
9412 9413          ip2dbg(("ip_input_options\n"));
9413 9414          opt = NULL;
9414 9415          *errorp = 0;
9415 9416          for (optval = ipoptp_first(&opts, ipha);
9416 9417              optval != IPOPT_EOL;
9417 9418              optval = ipoptp_next(&opts)) {
9418 9419                  opt = opts.ipoptp_cur;
9419 9420                  optlen = opts.ipoptp_len;
9420 9421                  ip2dbg(("ip_input_options: opt %d, len %d\n",
9421 9422                      optval, optlen));
9422 9423                  /*
9423 9424                   * Note: we need to verify the checksum before we
9424 9425                   * modify anything thus this routine only extracts the next
9425 9426                   * hop dst from any source route.
9426 9427                   */
9427 9428                  switch (optval) {
9428 9429                          uint32_t off;
9429 9430                  case IPOPT_SSRR:
9430 9431                  case IPOPT_LSRR:
9431 9432                          if (ip_type_v4(dst, ipst) != IRE_LOCAL) {
9432 9433                                  if (optval == IPOPT_SSRR) {
9433 9434                                          ip1dbg(("ip_input_options: not next"
9434 9435                                              " strict source route 0x%x\n",
9435 9436                                              ntohl(dst)));
9436 9437                                          code = (char *)&ipha->ipha_dst -
9437 9438                                              (char *)ipha;
9438 9439                                          goto param_prob; /* RouterReq's */
9439 9440                                  }
9440 9441                                  ip2dbg(("ip_input_options: "
9441 9442                                      "not next source route 0x%x\n",
9442 9443                                      ntohl(dst)));
9443 9444                                  break;
9444 9445                          }
9445 9446  
9446 9447                          if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
9447 9448                                  ip1dbg((
9448 9449                                      "ip_input_options: bad option offset\n"));
9449 9450                                  code = (char *)&opt[IPOPT_OLEN] -
9450 9451                                      (char *)ipha;
9451 9452                                  goto param_prob;
9452 9453                          }
9453 9454                          off = opt[IPOPT_OFFSET];
9454 9455                          off--;
9455 9456                  redo_srr:
9456 9457                          if (optlen < IP_ADDR_LEN ||
9457 9458                              off > optlen - IP_ADDR_LEN) {
9458 9459                                  /* End of source route */
9459 9460                                  ip1dbg(("ip_input_options: end of SR\n"));
9460 9461                                  break;
9461 9462                          }
9462 9463                          bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
9463 9464                          ip1dbg(("ip_input_options: next hop 0x%x\n",
9464 9465                              ntohl(dst)));
9465 9466  
9466 9467                          /*
9467 9468                           * Check if our address is present more than
9468 9469                           * once as consecutive hops in source route.
9469 9470                           * XXX verify per-interface ip_forwarding
9470 9471                           * for source route?
9471 9472                           */
9472 9473                          if (ip_type_v4(dst, ipst) == IRE_LOCAL) {
9473 9474                                  off += IP_ADDR_LEN;
9474 9475                                  goto redo_srr;
9475 9476                          }
9476 9477  
9477 9478                          if (dst == htonl(INADDR_LOOPBACK)) {
9478 9479                                  ip1dbg(("ip_input_options: loopback addr in "
9479 9480                                      "source route!\n"));
9480 9481                                  goto bad_src_route;
9481 9482                          }
9482 9483                          /*
9483 9484                           * For strict: verify that dst is directly
9484 9485                           * reachable.
9485 9486                           */
9486 9487                          if (optval == IPOPT_SSRR) {
9487 9488                                  ire = ire_ftable_lookup_v4(dst, 0, 0,
9488 9489                                      IRE_INTERFACE, NULL, ALL_ZONES,
9489 9490                                      ira->ira_tsl,
9490 9491                                      MATCH_IRE_TYPE | MATCH_IRE_SECATTR, 0, ipst,
9491 9492                                      NULL);
9492 9493                                  if (ire == NULL) {
9493 9494                                          ip1dbg(("ip_input_options: SSRR not "
9494 9495                                              "directly reachable: 0x%x\n",
9495 9496                                              ntohl(dst)));
9496 9497                                          goto bad_src_route;
9497 9498                                  }
9498 9499                                  ire_refrele(ire);
9499 9500                          }
9500 9501                          /*
9501 9502                           * Defer update of the offset and the record route
9502 9503                           * until the packet is forwarded.
9503 9504                           */
9504 9505                          break;
9505 9506                  case IPOPT_RR:
9506 9507                          if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
9507 9508                                  ip1dbg((
9508 9509                                      "ip_input_options: bad option offset\n"));
9509 9510                                  code = (char *)&opt[IPOPT_OLEN] -
9510 9511                                      (char *)ipha;
9511 9512                                  goto param_prob;
9512 9513                          }
9513 9514                          break;
9514 9515                  case IPOPT_TS:
9515 9516                          /*
9516 9517                           * Verify that length >= 5 and that there is either
9517 9518                           * room for another timestamp or that the overflow
9518 9519                           * counter is not maxed out.
9519 9520                           */
9520 9521                          code = (char *)&opt[IPOPT_OLEN] - (char *)ipha;
9521 9522                          if (optlen < IPOPT_MINLEN_IT) {
9522 9523                                  goto param_prob;
9523 9524                          }
9524 9525                          if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
9525 9526                                  ip1dbg((
9526 9527                                      "ip_input_options: bad option offset\n"));
9527 9528                                  code = (char *)&opt[IPOPT_OFFSET] -
9528 9529                                      (char *)ipha;
9529 9530                                  goto param_prob;
9530 9531                          }
9531 9532                          switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
9532 9533                          case IPOPT_TS_TSONLY:
9533 9534                                  off = IPOPT_TS_TIMELEN;
9534 9535                                  break;
9535 9536                          case IPOPT_TS_TSANDADDR:
9536 9537                          case IPOPT_TS_PRESPEC:
9537 9538                          case IPOPT_TS_PRESPEC_RFC791:
9538 9539                                  off = IP_ADDR_LEN + IPOPT_TS_TIMELEN;
9539 9540                                  break;
9540 9541                          default:
9541 9542                                  code = (char *)&opt[IPOPT_POS_OV_FLG] -
9542 9543                                      (char *)ipha;
9543 9544                                  goto param_prob;
9544 9545                          }
9545 9546                          if (opt[IPOPT_OFFSET] - 1 + off > optlen &&
9546 9547                              (opt[IPOPT_POS_OV_FLG] & 0xF0) == 0xF0) {
9547 9548                                  /*
9548 9549                                   * No room and the overflow counter is 15
9549 9550                                   * already.
9550 9551                                   */
9551 9552                                  goto param_prob;
9552 9553                          }
9553 9554                          break;
9554 9555                  }
9555 9556          }
9556 9557  
9557 9558          if ((opts.ipoptp_flags & IPOPTP_ERROR) == 0) {
9558 9559                  return (dst);
9559 9560          }
9560 9561  
9561 9562          ip1dbg(("ip_input_options: error processing IP options."));
9562 9563          code = (char *)&opt[IPOPT_OFFSET] - (char *)ipha;
9563 9564  
9564 9565  param_prob:
9565 9566          /* make sure we clear any indication of a hardware checksum */
9566 9567          DB_CKSUMFLAGS(mp) = 0;
9567 9568          ip_drop_input("ICMP_PARAM_PROBLEM", mp, ira->ira_ill);
9568 9569          icmp_param_problem(mp, (uint8_t)code, ira);
9569 9570          *errorp = -1;
9570 9571          return (dst);
9571 9572  
9572 9573  bad_src_route:
9573 9574          /* make sure we clear any indication of a hardware checksum */
9574 9575          DB_CKSUMFLAGS(mp) = 0;
9575 9576          ip_drop_input("ICMP_SOURCE_ROUTE_FAILED", mp, ira->ira_ill);
9576 9577          icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, ira);
9577 9578          *errorp = -1;
9578 9579          return (dst);
9579 9580  }
9580 9581  
9581 9582  /*
9582 9583   * IP & ICMP info in >=14 msg's ...
9583 9584   *  - ip fixed part (mib2_ip_t)
9584 9585   *  - icmp fixed part (mib2_icmp_t)
9585 9586   *  - ipAddrEntryTable (ip 20)          all IPv4 ipifs
9586 9587   *  - ipRouteEntryTable (ip 21)         all IPv4 IREs
9587 9588   *  - ipNetToMediaEntryTable (ip 22)    all IPv4 Neighbor Cache entries
9588 9589   *  - ipRouteAttributeTable (ip 102)    labeled routes
9589 9590   *  - ip multicast membership (ip_member_t)
9590 9591   *  - ip multicast source filtering (ip_grpsrc_t)
9591 9592   *  - igmp fixed part (struct igmpstat)
9592 9593   *  - multicast routing stats (struct mrtstat)
9593 9594   *  - multicast routing vifs (array of struct vifctl)
9594 9595   *  - multicast routing routes (array of struct mfcctl)
9595 9596   *  - ip6 fixed part (mib2_ipv6IfStatsEntry_t)
9596 9597   *                                      One per ill plus one generic
9597 9598   *  - icmp6 fixed part (mib2_ipv6IfIcmpEntry_t)
9598 9599   *                                      One per ill plus one generic
9599 9600   *  - ipv6RouteEntry                    all IPv6 IREs
9600 9601   *  - ipv6RouteAttributeTable (ip6 102) labeled routes
9601 9602   *  - ipv6NetToMediaEntry               all IPv6 Neighbor Cache entries
9602 9603   *  - ipv6AddrEntry                     all IPv6 ipifs
9603 9604   *  - ipv6 multicast membership (ipv6_member_t)
9604 9605   *  - ipv6 multicast source filtering (ipv6_grpsrc_t)
9605 9606   *
9606 9607   * NOTE: original mpctl is copied for msg's 2..N, since its ctl part is
9607 9608   * already filled in by the caller.
9608 9609   * If legacy_req is true then MIB structures need to be truncated to their
9609 9610   * legacy sizes before being returned.
9610 9611   * Return value of 0 indicates that no messages were sent and caller
9611 9612   * should free mpctl.
            * Every other path returns 1, whether the full set of tables was sent
            * or the sequence stopped early because a helper returned NULL.
9612 9613   */
9613 9614  int
9614 9615  ip_snmp_get(queue_t *q, mblk_t *mpctl, int level, boolean_t legacy_req)
9615 9616  {
9616 9617          ip_stack_t *ipst;
9617 9618          sctp_stack_t *sctps;
9618 9619  
                   /*
                    * Pick the macro that maps q to its IP stack.
                    * NOTE(review): presumably a non-NULL q_next means q belongs to an
                    * ill rather than to a conn; confirm against the macro definitions.
                    */
9619 9620          if (q->q_next != NULL) {
9620 9621                  ipst = ILLQ_TO_IPST(q);
9621 9622          } else {
9622 9623                  ipst = CONNQ_TO_IPST(q);
9623 9624          }
9624 9625          ASSERT(ipst != NULL);
9625 9626          sctps = ipst->ips_netstack->netstack_sctp;
9626 9627  
                   /* Nothing to fill in; tell the caller that nothing was sent. */
9627 9628          if (mpctl == NULL || mpctl->b_cont == NULL) {
9628 9629                  return (0);
9629 9630          }
9630 9631  
9631 9632          /*
9632 9633           * For the purposes of the (broken) packet shell use
9633 9634           * of the level we make sure MIB2_TCP/MIB2_UDP can be used
9634 9635           * to make TCP and UDP appear first in the list of mib items.
9635 9636           * TBD: We could expand this and use it in netstat so that
9636 9637           * the kernel doesn't have to produce large tables (connections,
9637 9638           * routes, etc) when netstat only wants the statistics or a particular
9638 9639           * table.
9639 9640           */
                   /*
                    * Each helper from here on takes the current mpctl and hands back
                    * the message to use for the next table.  The helpers defined in
                    * this file reply on q with the message they were given and return
                    * a copy made beforehand, so a NULL return means the copy failed
                    * after a reply was already sent; hence the return of 1.
                    */
9640 9641          if (!(level == MIB2_TCP || level == MIB2_UDP)) {
9641 9642                  if ((mpctl = icmp_snmp_get(q, mpctl)) == NULL) {
9642 9643                          return (1);
9643 9644                  }
9644 9645          }
9645 9646  
9646 9647          if (level != MIB2_TCP) {
9647 9648                  if ((mpctl = udp_snmp_get(q, mpctl, legacy_req)) == NULL) {
9648 9649                          return (1);
9649 9650                  }
                           /* A UDP-only request stops after the UDP tables. */
9650 9651                  if (level == MIB2_UDP) {
9651 9652                          goto done;
9652 9653                  }
9653 9654          }
9654 9655  
9655 9656          if (level != MIB2_UDP) {
9656 9657                  if ((mpctl = tcp_snmp_get(q, mpctl, legacy_req)) == NULL) {
9657 9658                          return (1);
9658 9659                  }
                           /* A TCP-only request stops after the TCP tables. */
9659 9660                  if (level == MIB2_TCP) {
9660 9661                          goto done;
9661 9662                  }
9662 9663          }
9663 9664  
9664 9665          if ((mpctl = ip_snmp_get_mib2_ip_traffic_stats(q, mpctl,
9665 9666              ipst, legacy_req)) == NULL) {
9666 9667                  return (1);
9667 9668          }
9668 9669  
9669 9670          if ((mpctl = ip_snmp_get_mib2_ip6(q, mpctl, ipst,
9670 9671              legacy_req)) == NULL) {
9671 9672                  return (1);
9672 9673          }
9673 9674  
9674 9675          if ((mpctl = ip_snmp_get_mib2_icmp(q, mpctl, ipst)) == NULL) {
9675 9676                  return (1);
9676 9677          }
9677 9678  
9678 9679          if ((mpctl = ip_snmp_get_mib2_icmp6(q, mpctl, ipst)) == NULL) {
9679 9680                  return (1);
9680 9681          }
9681 9682  
9682 9683          if ((mpctl = ip_snmp_get_mib2_igmp(q, mpctl, ipst)) == NULL) {
9683 9684                  return (1);
9684 9685          }
9685 9686  
9686 9687          if ((mpctl = ip_snmp_get_mib2_multi(q, mpctl, ipst)) == NULL) {
9687 9688                  return (1);
9688 9689          }
9689 9690  
9690 9691          if ((mpctl = ip_snmp_get_mib2_ip_addr(q, mpctl, ipst,
9691 9692              legacy_req)) == NULL) {
9692 9693                  return (1);
9693 9694          }
9694 9695  
9695 9696          if ((mpctl = ip_snmp_get_mib2_ip6_addr(q, mpctl, ipst,
9696 9697              legacy_req)) == NULL) {
9697 9698                  return (1);
9698 9699          }
9699 9700  
9700 9701          if ((mpctl = ip_snmp_get_mib2_ip_group_mem(q, mpctl, ipst)) == NULL) {
9701 9702                  return (1);
9702 9703          }
9703 9704  
9704 9705          if ((mpctl = ip_snmp_get_mib2_ip6_group_mem(q, mpctl, ipst)) == NULL) {
9705 9706                  return (1);
9706 9707          }
9707 9708  
9708 9709          if ((mpctl = ip_snmp_get_mib2_ip_group_src(q, mpctl, ipst)) == NULL) {
9709 9710                  return (1);
9710 9711          }
9711 9712  
9712 9713          if ((mpctl = ip_snmp_get_mib2_ip6_group_src(q, mpctl, ipst)) == NULL) {
9713 9714                  return (1);
9714 9715          }
9715 9716  
9716 9717          if ((mpctl = ip_snmp_get_mib2_virt_multi(q, mpctl, ipst)) == NULL) {
9717 9718                  return (1);
9718 9719          }
9719 9720  
9720 9721          if ((mpctl = ip_snmp_get_mib2_multi_rtable(q, mpctl, ipst)) == NULL) {
9721 9722                  return (1);
9722 9723          }
9723 9724  
                   /* The two route/media helpers also take the requested level. */
9724 9725          mpctl = ip_snmp_get_mib2_ip_route_media(q, mpctl, level, ipst);
9725 9726          if (mpctl == NULL)
9726 9727                  return (1);
9727 9728  
9728 9729          mpctl = ip_snmp_get_mib2_ip6_route_media(q, mpctl, level, ipst);
9729 9730          if (mpctl == NULL)
9730 9731                  return (1);
9731 9732  
9732 9733          if ((mpctl = sctp_snmp_get_mib2(q, mpctl, sctps)) == NULL) {
9733 9734                  return (1);
9734 9735          }
9735 9736          if ((mpctl = ip_snmp_get_mib2_ip_dce(q, mpctl, ipst)) == NULL) {
9736 9737                  return (1);
9737 9738          }
9738 9739  done:
                   /* The message handed back by the last helper was never sent. */
9739 9740          freemsg(mpctl);
9740 9741          return (1);
9741 9742  }
9742 9743  
9743 9744  /*
            * Get global (legacy) IPv4 statistics
            *
            * Fills a mib2_ip_t from the stack-wide settings in ipst and from the
            * counters in ipmib, appends it to mpctl and replies on q with level
            * MIB2_IP, name 0.  Returns a copy of the original mpctl made before it
            * was filled in; the copy is NULL if copymsg() failed.
            */
9744 9745  static mblk_t *
9745 9746  ip_snmp_get_mib2_ip(queue_t *q, mblk_t *mpctl, mib2_ipIfStatsEntry_t *ipmib,
9746 9747      ip_stack_t *ipst, boolean_t legacy_req)
9747 9748  {
9748 9749          mib2_ip_t               old_ip_mib;
9749 9750          struct opthdr           *optp;
9750 9751          mblk_t                  *mp2ctl;
                   /* mae is never filled in; it is only an operand of LEGACY_MIB_SIZE. */
9751 9752          mib2_ipAddrEntry_t      mae;
9752 9753  
9753 9754          /*
9754 9755           * make a copy of the original message
9755 9756           */
9756 9757          mp2ctl = copymsg(mpctl);
9757 9758  
9758 9759          /* fixed length IP structure... */
                   /* The opthdr sits right after the T_optmgmt_ack at the front of mpctl. */
9759 9760          optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
9760 9761          optp->level = MIB2_IP;
9761 9762          optp->name = 0;
                   /* Forwarding is reported as 1 when forwarding and 2 otherwise. */
9762 9763          SET_MIB(old_ip_mib.ipForwarding,
9763 9764              (WE_ARE_FORWARDING(ipst) ? 1 : 2));
9764 9765          SET_MIB(old_ip_mib.ipDefaultTTL,
9765 9766              (uint32_t)ipst->ips_ip_def_ttl);
9766 9767          SET_MIB(old_ip_mib.ipReasmTimeout,
9767 9768              ipst->ips_ip_reassembly_timeout);
                   /* Only the address entry size differs for a legacy request. */
9768 9769          SET_MIB(old_ip_mib.ipAddrEntrySize,
9769 9770              (legacy_req) ? LEGACY_MIB_SIZE(&mae, mib2_ipAddrEntry_t) :
9770 9771              sizeof (mib2_ipAddrEntry_t));
9771 9772          SET_MIB(old_ip_mib.ipRouteEntrySize,
9772 9773              sizeof (mib2_ipRouteEntry_t));
9773 9774          SET_MIB(old_ip_mib.ipNetToMediaEntrySize,
9774 9775              sizeof (mib2_ipNetToMediaEntry_t));
9775 9776          SET_MIB(old_ip_mib.ipMemberEntrySize, sizeof (ip_member_t));
9776 9777          SET_MIB(old_ip_mib.ipGroupSourceEntrySize, sizeof (ip_grpsrc_t));
9777 9778          SET_MIB(old_ip_mib.ipRouteAttributeSize,
9778 9779              sizeof (mib2_ipAttributeEntry_t));
9779 9780          SET_MIB(old_ip_mib.transportMLPSize, sizeof (mib2_transportMLPEntry_t));
9780 9781          SET_MIB(old_ip_mib.ipDestEntrySize, sizeof (dest_cache_entry_t));
9781 9782  
9782 9783          /*
9783 9784           * Grab the statistics from the new IP MIB
9784 9785           */
                   /* The HC counters are cast down to the 32-bit legacy fields. */
9785 9786          SET_MIB(old_ip_mib.ipInReceives,
9786 9787              (uint32_t)ipmib->ipIfStatsHCInReceives);
9787 9788          SET_MIB(old_ip_mib.ipInHdrErrors, ipmib->ipIfStatsInHdrErrors);
9788 9789          SET_MIB(old_ip_mib.ipInAddrErrors, ipmib->ipIfStatsInAddrErrors);
9789 9790          SET_MIB(old_ip_mib.ipForwDatagrams,
9790 9791              (uint32_t)ipmib->ipIfStatsHCOutForwDatagrams);
9791 9792          SET_MIB(old_ip_mib.ipInUnknownProtos,
9792 9793              ipmib->ipIfStatsInUnknownProtos);
9793 9794          SET_MIB(old_ip_mib.ipInDiscards, ipmib->ipIfStatsInDiscards);
9794 9795          SET_MIB(old_ip_mib.ipInDelivers,
9795 9796              (uint32_t)ipmib->ipIfStatsHCInDelivers);
9796 9797          SET_MIB(old_ip_mib.ipOutRequests,
9797 9798              (uint32_t)ipmib->ipIfStatsHCOutRequests);
9798 9799          SET_MIB(old_ip_mib.ipOutDiscards, ipmib->ipIfStatsOutDiscards);
9799 9800          SET_MIB(old_ip_mib.ipOutNoRoutes, ipmib->ipIfStatsOutNoRoutes);
9800 9801          SET_MIB(old_ip_mib.ipReasmReqds, ipmib->ipIfStatsReasmReqds);
9801 9802          SET_MIB(old_ip_mib.ipReasmOKs, ipmib->ipIfStatsReasmOKs);
9802 9803          SET_MIB(old_ip_mib.ipReasmFails, ipmib->ipIfStatsReasmFails);
9803 9804          SET_MIB(old_ip_mib.ipFragOKs, ipmib->ipIfStatsOutFragOKs);
9804 9805          SET_MIB(old_ip_mib.ipFragFails, ipmib->ipIfStatsOutFragFails);
9805 9806          SET_MIB(old_ip_mib.ipFragCreates, ipmib->ipIfStatsOutFragCreates);
9806 9807  
9807 9808          /* ipRoutingDiscards is not being used */
9808 9809          SET_MIB(old_ip_mib.ipRoutingDiscards, 0);
9809 9810          SET_MIB(old_ip_mib.tcpInErrs, ipmib->tcpIfStatsInErrs);
9810 9811          SET_MIB(old_ip_mib.udpNoPorts, ipmib->udpIfStatsNoPorts);
9811 9812          SET_MIB(old_ip_mib.ipInCksumErrs, ipmib->ipIfStatsInCksumErrs);
9812 9813          SET_MIB(old_ip_mib.ipReasmDuplicates,
9813 9814              ipmib->ipIfStatsReasmDuplicates);
9814 9815          SET_MIB(old_ip_mib.ipReasmPartDups, ipmib->ipIfStatsReasmPartDups);
9815 9816          SET_MIB(old_ip_mib.ipForwProhibits, ipmib->ipIfStatsForwProhibits);
9816 9817          SET_MIB(old_ip_mib.udpInCksumErrs, ipmib->udpIfStatsInCksumErrs);
9817 9818          SET_MIB(old_ip_mib.udpInOverflows, ipmib->udpIfStatsInOverflows);
9818 9819          SET_MIB(old_ip_mib.rawipInOverflows,
9819 9820              ipmib->rawipIfStatsInOverflows);
9820 9821  
9821 9822          SET_MIB(old_ip_mib.ipsecInSucceeded, ipmib->ipsecIfStatsInSucceeded);
9822 9823          SET_MIB(old_ip_mib.ipsecInFailed, ipmib->ipsecIfStatsInFailed);
9823 9824          SET_MIB(old_ip_mib.ipInIPv6, ipmib->ipIfStatsInWrongIPVersion);
9824 9825          SET_MIB(old_ip_mib.ipOutIPv6, ipmib->ipIfStatsOutWrongIPVersion);
9825 9826          SET_MIB(old_ip_mib.ipOutSwitchIPv6,
9826 9827              ipmib->ipIfStatsOutSwitchIPVersion);
9827 9828  
                   /* A failed append is only logged; the reply below is still sent. */
9828 9829          if (!snmp_append_data(mpctl->b_cont, (char *)&old_ip_mib,
9829 9830              (int)sizeof (old_ip_mib))) {
9830 9831                  ip1dbg(("ip_snmp_get_mib2_ip: failed to allocate %u bytes\n",
9831 9832                      (uint_t)sizeof (old_ip_mib)));
9832 9833          }
9833 9834  
9834 9835          optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
9835 9836          ip3dbg(("ip_snmp_get_mib2_ip: level %d, name %d, len %d\n",
9836 9837              (int)optp->level, (int)optp->name, (int)optp->len));
9837 9838          qreply(q, mpctl);
9838 9839          return (mp2ctl);
9839 9840  }
9840 9841  
9841 9842  /*
            * Per interface IPv4 statistics
            *
            * Replies on q with level MIB2_IP, name MIB2_IP_TRAFFIC_STATS.  The data
            * is one entry flagged MIB2_UNKNOWN_INTERFACE taken from ipst->ips_ip_mib
            * followed by one entry per IPv4 ill.  The running totals are then passed
            * to ip_snmp_get_mib2_ip() to produce the legacy mib2_ip_t reply, and the
            * value that function returns is returned here.  Returns NULL if the
            * original mpctl could not be copied.
            */
9842 9843  static mblk_t *
9843 9844  ip_snmp_get_mib2_ip_traffic_stats(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst,
9844 9845      boolean_t legacy_req)
9845 9846  {
9846 9847          struct opthdr           *optp;
9847 9848          mblk_t                  *mp2ctl;
9848 9849          ill_t                   *ill;
9849 9850          ill_walk_context_t      ctx;
9850 9851          mblk_t                  *mp_tail = NULL;
9851 9852          mib2_ipIfStatsEntry_t   global_ip_mib;
                   /* mae is never filled in; it is only an operand of LEGACY_MIB_SIZE. */
9852 9853          mib2_ipAddrEntry_t      mae;
9853 9854  
9854 9855          /*
9855 9856           * Make a copy of the original message
9856 9857           */
9857 9858          mp2ctl = copymsg(mpctl);
9858 9859  
                   /* The opthdr sits right after the T_optmgmt_ack at the front of mpctl. */
9859 9860          optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
9860 9861          optp->level = MIB2_IP;
9861 9862          optp->name = MIB2_IP_TRAFFIC_STATS;
9862 9863          /* Include "unknown interface" ip_mib */
9863 9864          ipst->ips_ip_mib.ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv4;
9864 9865          ipst->ips_ip_mib.ipIfStatsIfIndex =
9865 9866              MIB2_UNKNOWN_INTERFACE; /* Flag to netstat */
9866 9867          SET_MIB(ipst->ips_ip_mib.ipIfStatsForwarding,
9867 9868              (ipst->ips_ip_forwarding ? 1 : 2));
9868 9869          SET_MIB(ipst->ips_ip_mib.ipIfStatsDefaultTTL,
9869 9870              (uint32_t)ipst->ips_ip_def_ttl);
9870 9871          SET_MIB(ipst->ips_ip_mib.ipIfStatsEntrySize,
9871 9872              sizeof (mib2_ipIfStatsEntry_t));
9872 9873          SET_MIB(ipst->ips_ip_mib.ipIfStatsAddrEntrySize,
9873 9874              sizeof (mib2_ipAddrEntry_t));
9874 9875          SET_MIB(ipst->ips_ip_mib.ipIfStatsRouteEntrySize,
9875 9876              sizeof (mib2_ipRouteEntry_t));
9876 9877          SET_MIB(ipst->ips_ip_mib.ipIfStatsNetToMediaEntrySize,
9877 9878              sizeof (mib2_ipNetToMediaEntry_t));
9878 9879          SET_MIB(ipst->ips_ip_mib.ipIfStatsMemberEntrySize,
9879 9880              sizeof (ip_member_t));
9880 9881          SET_MIB(ipst->ips_ip_mib.ipIfStatsGroupSourceEntrySize,
9881 9882              sizeof (ip_grpsrc_t));
9882 9883  
                   /* Work on a local snapshot so the stack-wide MIB is not touched below. */
9883 9884          bcopy(&ipst->ips_ip_mib, &global_ip_mib, sizeof (global_ip_mib));
9884 9885  
                   /* The legacy address entry size is applied to the snapshot only. */
9885 9886          if (legacy_req) {
9886 9887                  SET_MIB(global_ip_mib.ipIfStatsAddrEntrySize,
9887 9888                      LEGACY_MIB_SIZE(&mae, mib2_ipAddrEntry_t));
9888 9889          }
9889 9890  
                   /*
                    * The snapshot is appended before the per-ill loop, so this first
                    * entry does not include anything added to global_ip_mib below.
                    */
9890 9891          if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
9891 9892              (char *)&global_ip_mib, (int)sizeof (global_ip_mib))) {
9892 9893                  ip1dbg(("ip_snmp_get_mib2_ip_traffic_stats: "
9893 9894                      "failed to allocate %u bytes\n",
9894 9895                      (uint_t)sizeof (global_ip_mib)));
9895 9896          }
9896 9897  
                   /* Walk the IPv4 ills while holding ips_ill_g_lock as a reader. */
9897 9898          rw_enter(&ipst->ips_ill_g_lock, RW_READER);
9898 9899          ill = ILL_START_WALK_V4(&ctx, ipst);
9899 9900          for (; ill != NULL; ill = ill_next(&ctx, ill)) {
                           /* Refresh the descriptive fields kept in the per-ill MIB. */
9900 9901                  ill->ill_ip_mib->ipIfStatsIfIndex =
9901 9902                      ill->ill_phyint->phyint_ifindex;
9902 9903                  SET_MIB(ill->ill_ip_mib->ipIfStatsForwarding,
9903 9904                      (ipst->ips_ip_forwarding ? 1 : 2));
9904 9905                  SET_MIB(ill->ill_ip_mib->ipIfStatsDefaultTTL,
9905 9906                      (uint32_t)ipst->ips_ip_def_ttl);
9906 9907  
                           /*
                            * NOTE(review): presumably this accumulates the per-ill
                            * counters into global_ip_mib; confirm in the helper.
                            */
9907 9908                  ip_mib2_add_ip_stats(&global_ip_mib, ill->ill_ip_mib);
9908 9909                  if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
9909 9910                      (char *)ill->ill_ip_mib,
9910 9911                      (int)sizeof (*ill->ill_ip_mib))) {
9911 9912                          ip1dbg(("ip_snmp_get_mib2_ip_traffic_stats: "
9912 9913                              "failed to allocate %u bytes\n",
9913 9914                              (uint_t)sizeof (*ill->ill_ip_mib)));
9914 9915                  }
9915 9916          }
9916 9917          rw_exit(&ipst->ips_ill_g_lock);
9917 9918  
9918 9919          optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
9919 9920          ip3dbg(("ip_snmp_get_mib2_ip_traffic_stats: "
9920 9921              "level %d, name %d, len %d\n",
9921 9922              (int)optp->level, (int)optp->name, (int)optp->len));
9922 9923          qreply(q, mpctl);
9923 9924  
                   /* Without a copy there is no message left for the legacy reply. */
9924 9925          if (mp2ctl == NULL)
9925 9926                  return (NULL);
9926 9927  
9927 9928          return (ip_snmp_get_mib2_ip(q, mp2ctl, &global_ip_mib, ipst,
9928 9929              legacy_req));
9929 9930  }
9930 9931  
9931 9932  /*
            * Global IPv4 ICMP statistics
            *
            * Appends ipst->ips_icmp_mib to mpctl and replies on q with level
            * MIB2_ICMP, name 0.  Returns a copy of the original mpctl made before it
            * was filled in; the copy is NULL if copymsg() failed.
            */
9932 9933  static mblk_t *
9933 9934  ip_snmp_get_mib2_icmp(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
9934 9935  {
9935 9936          struct opthdr           *optp;
9936 9937          mblk_t                  *mp2ctl;
9937 9938  
9938 9939          /*
9939 9940           * Make a copy of the original message
9940 9941           */
9941 9942          mp2ctl = copymsg(mpctl);
9942 9943  
                   /* The opthdr sits right after the T_optmgmt_ack at the front of mpctl. */
9943 9944          optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
9944 9945          optp->level = MIB2_ICMP;
9945 9946          optp->name = 0;
                   /* A failed append is only logged; the reply below is still sent. */
9946 9947          if (!snmp_append_data(mpctl->b_cont, (char *)&ipst->ips_icmp_mib,
9947 9948              (int)sizeof (ipst->ips_icmp_mib))) {
9948 9949                  ip1dbg(("ip_snmp_get_mib2_icmp: failed to allocate %u bytes\n",
9949 9950                      (uint_t)sizeof (ipst->ips_icmp_mib)));
9950 9951          }
9951 9952          optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
9952 9953          ip3dbg(("ip_snmp_get_mib2_icmp: level %d, name %d, len %d\n",
9953 9954              (int)optp->level, (int)optp->name, (int)optp->len));
9954 9955          qreply(q, mpctl);
9955 9956          return (mp2ctl);
9956 9957  }
9957 9958  
9958 9959  /*
            * Global IPv4 IGMP statistics
            *
            * Appends ipst->ips_igmpstat to mpctl and replies on q with level
            * EXPER_IGMP, name 0.  Returns a copy of the original mpctl made before
            * it was filled in; the copy is NULL if copymsg() failed.
            */
9959 9960  static mblk_t *
9960 9961  ip_snmp_get_mib2_igmp(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
9961 9962  {
9962 9963          struct opthdr           *optp;
9963 9964          mblk_t                  *mp2ctl;
9964 9965  
9965 9966          /*
9966 9967           * make a copy of the original message
9967 9968           */
9968 9969          mp2ctl = copymsg(mpctl);
9969 9970  
                   /* The opthdr sits right after the T_optmgmt_ack at the front of mpctl. */
9970 9971          optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
9971 9972          optp->level = EXPER_IGMP;
9972 9973          optp->name = 0;
                   /* A failed append is only logged; the reply below is still sent. */
9973 9974          if (!snmp_append_data(mpctl->b_cont, (char *)&ipst->ips_igmpstat,
9974 9975              (int)sizeof (ipst->ips_igmpstat))) {
9975 9976                  ip1dbg(("ip_snmp_get_mib2_igmp: failed to allocate %u bytes\n",
9976 9977                      (uint_t)sizeof (ipst->ips_igmpstat)));
9977 9978          }
9978 9979          optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
9979 9980          ip3dbg(("ip_snmp_get_mib2_igmp: level %d, name %d, len %d\n",
9980 9981              (int)optp->level, (int)optp->name, (int)optp->len));
9981 9982          qreply(q, mpctl);
9982 9983          return (mp2ctl);
9983 9984  }
9984 9985  
9985 9986  /*
            * Global IPv4 Multicast Routing statistics
            *
            * Lets ip_mroute_stats() fill the data part of mpctl and replies on q
            * with level EXPER_DVMRP, name 0.  Returns a copy of the original mpctl
            * made before it was filled in; the copy is NULL if copymsg() failed.
            */
9986 9987  static mblk_t *
9987 9988  ip_snmp_get_mib2_multi(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
9988 9989  {
9989 9990          struct opthdr           *optp;
9990 9991          mblk_t                  *mp2ctl;
9991 9992  
9992 9993          /*
9993 9994           * make a copy of the original message
9994 9995           */
9995 9996          mp2ctl = copymsg(mpctl);
9996 9997  
                   /* The opthdr sits right after the T_optmgmt_ack at the front of mpctl. */
9997 9998          optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
9998 9999          optp->level = EXPER_DVMRP;
9999 10000          optp->name = 0;
                     /*
                      * NOTE(review): ip_mroute_stats() presumably appends the multicast
                      * routing statistics to the data block; confirm in its definition.
                      * A failure is only logged and the reply below is still sent.
                      */
10000 10001          if (!ip_mroute_stats(mpctl->b_cont, ipst)) {
10001 10002                  ip0dbg(("ip_mroute_stats: failed\n"));
10002 10003          }
10003 10004          optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
10004 10005          ip3dbg(("ip_snmp_get_mib2_multi: level %d, name %d, len %d\n",
10005 10006              (int)optp->level, (int)optp->name, (int)optp->len));
10006 10007          qreply(q, mpctl);
10007 10008          return (mp2ctl);
10008 10009  }
10009 10010  
10010 10011  /* IPv4 address information */
10011 10012  static mblk_t *
10012 10013  ip_snmp_get_mib2_ip_addr(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst,
10013 10014      boolean_t legacy_req)
10014 10015  {
10015 10016          struct opthdr           *optp;
10016 10017          mblk_t                  *mp2ctl;
10017 10018          mblk_t                  *mp_tail = NULL;
10018 10019          ill_t                   *ill;
10019 10020          ipif_t                  *ipif;
10020 10021          uint_t                  bitval;
10021 10022          mib2_ipAddrEntry_t      mae;
10022 10023          size_t                  mae_size;
10023 10024          zoneid_t                zoneid;
10024 10025          ill_walk_context_t      ctx;
10025 10026  
10026 10027          /*
10027 10028           * make a copy of the original message
10028 10029           */
10029 10030          mp2ctl = copymsg(mpctl);
10030 10031  
10031 10032          mae_size = (legacy_req) ? LEGACY_MIB_SIZE(&mae, mib2_ipAddrEntry_t) :
10032 10033              sizeof (mib2_ipAddrEntry_t);
10033 10034  
10034 10035          /* ipAddrEntryTable */
10035 10036  
10036 10037          optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10037 10038          optp->level = MIB2_IP;
10038 10039          optp->name = MIB2_IP_ADDR;
10039 10040          zoneid = Q_TO_CONN(q)->conn_zoneid;
10040 10041  
10041 10042          rw_enter(&ipst->ips_ill_g_lock, RW_READER);
10042 10043          ill = ILL_START_WALK_V4(&ctx, ipst);
10043 10044          for (; ill != NULL; ill = ill_next(&ctx, ill)) {
10044 10045                  for (ipif = ill->ill_ipif; ipif != NULL;
10045 10046                      ipif = ipif->ipif_next) {
10046 10047                          if (ipif->ipif_zoneid != zoneid &&
10047 10048                              ipif->ipif_zoneid != ALL_ZONES)
10048 10049                                  continue;
10049 10050                          /* Sum of count from dead IRE_LO* and our current */
10050 10051                          mae.ipAdEntInfo.ae_ibcnt = ipif->ipif_ib_pkt_count;
10051 10052                          if (ipif->ipif_ire_local != NULL) {
10052 10053                                  mae.ipAdEntInfo.ae_ibcnt +=
10053 10054                                      ipif->ipif_ire_local->ire_ib_pkt_count;
10054 10055                          }
10055 10056                          mae.ipAdEntInfo.ae_obcnt = 0;
10056 10057                          mae.ipAdEntInfo.ae_focnt = 0;
10057 10058  
10058 10059                          ipif_get_name(ipif, mae.ipAdEntIfIndex.o_bytes,
10059 10060                              OCTET_LENGTH);
10060 10061                          mae.ipAdEntIfIndex.o_length =
10061 10062                              mi_strlen(mae.ipAdEntIfIndex.o_bytes);
10062 10063                          mae.ipAdEntAddr = ipif->ipif_lcl_addr;
10063 10064                          mae.ipAdEntNetMask = ipif->ipif_net_mask;
10064 10065                          mae.ipAdEntInfo.ae_subnet = ipif->ipif_subnet;
10065 10066                          mae.ipAdEntInfo.ae_subnet_len =
10066 10067                              ip_mask_to_plen(ipif->ipif_net_mask);
10067 10068                          mae.ipAdEntInfo.ae_src_addr = ipif->ipif_lcl_addr;
10068 10069                          for (bitval = 1;
10069 10070                              bitval &&
10070 10071                              !(bitval & ipif->ipif_brd_addr);
10071 10072                              bitval <<= 1)
10072 10073                                  noop;
10073 10074                          mae.ipAdEntBcastAddr = bitval;
10074 10075                          mae.ipAdEntReasmMaxSize = IP_MAXPACKET;
10075 10076                          mae.ipAdEntInfo.ae_mtu = ipif->ipif_ill->ill_mtu;
10076 10077                          mae.ipAdEntInfo.ae_metric  = ipif->ipif_ill->ill_metric;
10077 10078                          mae.ipAdEntInfo.ae_broadcast_addr =
10078 10079                              ipif->ipif_brd_addr;
10079 10080                          mae.ipAdEntInfo.ae_pp_dst_addr =
10080 10081                              ipif->ipif_pp_dst_addr;
10081 10082                          mae.ipAdEntInfo.ae_flags = ipif->ipif_flags |
10082 10083                              ill->ill_flags | ill->ill_phyint->phyint_flags;
10083 10084                          mae.ipAdEntRetransmitTime =
10084 10085                              ill->ill_reachable_retrans_time;
10085 10086  
10086 10087                          if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
10087 10088                              (char *)&mae, (int)mae_size)) {
10088 10089                                  ip1dbg(("ip_snmp_get_mib2_ip_addr: failed to "
10089 10090                                      "allocate %u bytes\n", (uint_t)mae_size));
10090 10091                          }
10091 10092                  }
10092 10093          }
10093 10094          rw_exit(&ipst->ips_ill_g_lock);
10094 10095  
10095 10096          optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
10096 10097          ip3dbg(("ip_snmp_get_mib2_ip_addr: level %d, name %d, len %d\n",
10097 10098              (int)optp->level, (int)optp->name, (int)optp->len));
10098 10099          qreply(q, mpctl);
10099 10100          return (mp2ctl);
10100 10101  }
10101 10102  
10102 10103  /* IPv6 address information */
10103 10104  static mblk_t *
10104 10105  ip_snmp_get_mib2_ip6_addr(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst,
10105 10106      boolean_t legacy_req)
10106 10107  {
10107 10108          struct opthdr           *optp;
10108 10109          mblk_t                  *mp2ctl;
10109 10110          mblk_t                  *mp_tail = NULL;
10110 10111          ill_t                   *ill;
10111 10112          ipif_t                  *ipif;
10112 10113          mib2_ipv6AddrEntry_t    mae6;
10113 10114          size_t                  mae6_size;
10114 10115          zoneid_t                zoneid;
10115 10116          ill_walk_context_t      ctx;
10116 10117  
10117 10118          /*
10118 10119           * make a copy of the original message
10119 10120           */
10120 10121          mp2ctl = copymsg(mpctl);
10121 10122  
10122 10123          mae6_size = (legacy_req) ?
10123 10124              LEGACY_MIB_SIZE(&mae6, mib2_ipv6AddrEntry_t) :
10124 10125              sizeof (mib2_ipv6AddrEntry_t);
10125 10126  
10126 10127          /* ipv6AddrEntryTable */
10127 10128  
10128 10129          optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10129 10130          optp->level = MIB2_IP6;
10130 10131          optp->name = MIB2_IP6_ADDR;
10131 10132          zoneid = Q_TO_CONN(q)->conn_zoneid;
10132 10133  
10133 10134          rw_enter(&ipst->ips_ill_g_lock, RW_READER);
10134 10135          ill = ILL_START_WALK_V6(&ctx, ipst);
10135 10136          for (; ill != NULL; ill = ill_next(&ctx, ill)) {
10136 10137                  for (ipif = ill->ill_ipif; ipif != NULL;
10137 10138                      ipif = ipif->ipif_next) {
10138 10139                          if (ipif->ipif_zoneid != zoneid &&
10139 10140                              ipif->ipif_zoneid != ALL_ZONES)
10140 10141                                  continue;
10141 10142                          /* Sum of count from dead IRE_LO* and our current */
10142 10143                          mae6.ipv6AddrInfo.ae_ibcnt = ipif->ipif_ib_pkt_count;
10143 10144                          if (ipif->ipif_ire_local != NULL) {
10144 10145                                  mae6.ipv6AddrInfo.ae_ibcnt +=
10145 10146                                      ipif->ipif_ire_local->ire_ib_pkt_count;
10146 10147                          }
10147 10148                          mae6.ipv6AddrInfo.ae_obcnt = 0;
10148 10149                          mae6.ipv6AddrInfo.ae_focnt = 0;
10149 10150  
10150 10151                          ipif_get_name(ipif, mae6.ipv6AddrIfIndex.o_bytes,
10151 10152                              OCTET_LENGTH);
10152 10153                          mae6.ipv6AddrIfIndex.o_length =
10153 10154                              mi_strlen(mae6.ipv6AddrIfIndex.o_bytes);
10154 10155                          mae6.ipv6AddrAddress = ipif->ipif_v6lcl_addr;
10155 10156                          mae6.ipv6AddrPfxLength =
10156 10157                              ip_mask_to_plen_v6(&ipif->ipif_v6net_mask);
10157 10158                          mae6.ipv6AddrInfo.ae_subnet = ipif->ipif_v6subnet;
10158 10159                          mae6.ipv6AddrInfo.ae_subnet_len =
10159 10160                              mae6.ipv6AddrPfxLength;
10160 10161                          mae6.ipv6AddrInfo.ae_src_addr = ipif->ipif_v6lcl_addr;
10161 10162  
10162 10163                          /* Type: stateless(1), stateful(2), unknown(3) */
10163 10164                          if (ipif->ipif_flags & IPIF_ADDRCONF)
10164 10165                                  mae6.ipv6AddrType = 1;
10165 10166                          else
10166 10167                                  mae6.ipv6AddrType = 2;
10167 10168                          /* Anycast: true(1), false(2) */
10168 10169                          if (ipif->ipif_flags & IPIF_ANYCAST)
10169 10170                                  mae6.ipv6AddrAnycastFlag = 1;
10170 10171                          else
10171 10172                                  mae6.ipv6AddrAnycastFlag = 2;
10172 10173  
10173 10174                          /*
10174 10175                           * Address status: preferred(1), deprecated(2),
10175 10176                           * invalid(3), inaccessible(4), unknown(5)
10176 10177                           */
10177 10178                          if (ipif->ipif_flags & IPIF_NOLOCAL)
10178 10179                                  mae6.ipv6AddrStatus = 3;
10179 10180                          else if (ipif->ipif_flags & IPIF_DEPRECATED)
10180 10181                                  mae6.ipv6AddrStatus = 2;
10181 10182                          else
10182 10183                                  mae6.ipv6AddrStatus = 1;
10183 10184                          mae6.ipv6AddrInfo.ae_mtu = ipif->ipif_ill->ill_mtu;
10184 10185                          mae6.ipv6AddrInfo.ae_metric  =
10185 10186                              ipif->ipif_ill->ill_metric;
10186 10187                          mae6.ipv6AddrInfo.ae_pp_dst_addr =
10187 10188                              ipif->ipif_v6pp_dst_addr;
10188 10189                          mae6.ipv6AddrInfo.ae_flags = ipif->ipif_flags |
10189 10190                              ill->ill_flags | ill->ill_phyint->phyint_flags;
10190 10191                          mae6.ipv6AddrReasmMaxSize = IP_MAXPACKET;
10191 10192                          mae6.ipv6AddrIdentifier = ill->ill_token;
10192 10193                          mae6.ipv6AddrIdentifierLen = ill->ill_token_length;
10193 10194                          mae6.ipv6AddrReachableTime = ill->ill_reachable_time;
10194 10195                          mae6.ipv6AddrRetransmitTime =
10195 10196                              ill->ill_reachable_retrans_time;
10196 10197                          if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
10197 10198                              (char *)&mae6, (int)mae6_size)) {
10198 10199                                  ip1dbg(("ip_snmp_get_mib2_ip6_addr: failed to "
10199 10200                                      "allocate %u bytes\n",
10200 10201                                      (uint_t)mae6_size));
10201 10202                          }
10202 10203                  }
10203 10204          }
10204 10205          rw_exit(&ipst->ips_ill_g_lock);
10205 10206  
10206 10207          optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
10207 10208          ip3dbg(("ip_snmp_get_mib2_ip6_addr: level %d, name %d, len %d\n",
10208 10209              (int)optp->level, (int)optp->name, (int)optp->len));
10209 10210          qreply(q, mpctl);
10210 10211          return (mp2ctl);
10211 10212  }
10212 10213  
10213 10214  /* IPv4 multicast group membership. */
10214 10215  static mblk_t *
10215 10216  ip_snmp_get_mib2_ip_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
10216 10217  {
10217 10218          struct opthdr           *optp;
10218 10219          mblk_t                  *mp2ctl;
10219 10220          ill_t                   *ill;
10220 10221          ipif_t                  *ipif;
10221 10222          ilm_t                   *ilm;
10222 10223          ip_member_t             ipm;
10223 10224          mblk_t                  *mp_tail = NULL;
10224 10225          ill_walk_context_t      ctx;
10225 10226          zoneid_t                zoneid;
10226 10227  
10227 10228          /*
10228 10229           * make a copy of the original message
10229 10230           */
10230 10231          mp2ctl = copymsg(mpctl);
10231 10232          zoneid = Q_TO_CONN(q)->conn_zoneid;
10232 10233  
10233 10234          /* ipGroupMember table */
10234 10235          optp = (struct opthdr *)&mpctl->b_rptr[
10235 10236              sizeof (struct T_optmgmt_ack)];
10236 10237          optp->level = MIB2_IP;
10237 10238          optp->name = EXPER_IP_GROUP_MEMBERSHIP;
10238 10239  
10239 10240          rw_enter(&ipst->ips_ill_g_lock, RW_READER);
10240 10241          ill = ILL_START_WALK_V4(&ctx, ipst);
10241 10242          for (; ill != NULL; ill = ill_next(&ctx, ill)) {
10242 10243                  /* Make sure the ill isn't going away. */
10243 10244                  if (!ill_check_and_refhold(ill))
10244 10245                          continue;
10245 10246                  rw_exit(&ipst->ips_ill_g_lock);
10246 10247                  rw_enter(&ill->ill_mcast_lock, RW_READER);
10247 10248                  for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
10248 10249                          if (ilm->ilm_zoneid != zoneid &&
10249 10250                              ilm->ilm_zoneid != ALL_ZONES)
10250 10251                                  continue;
10251 10252  
10252 10253                          /* Is there an ipif for ilm_ifaddr? */
10253 10254                          for (ipif = ill->ill_ipif; ipif != NULL;
10254 10255                              ipif = ipif->ipif_next) {
10255 10256                                  if (!IPIF_IS_CONDEMNED(ipif) &&
10256 10257                                      ipif->ipif_lcl_addr == ilm->ilm_ifaddr &&
10257 10258                                      ilm->ilm_ifaddr != INADDR_ANY)
10258 10259                                          break;
10259 10260                          }
10260 10261                          if (ipif != NULL) {
10261 10262                                  ipif_get_name(ipif,
10262 10263                                      ipm.ipGroupMemberIfIndex.o_bytes,
10263 10264                                      OCTET_LENGTH);
10264 10265                          } else {
10265 10266                                  ill_get_name(ill,
10266 10267                                      ipm.ipGroupMemberIfIndex.o_bytes,
10267 10268                                      OCTET_LENGTH);
10268 10269                          }
10269 10270                          ipm.ipGroupMemberIfIndex.o_length =
10270 10271                              mi_strlen(ipm.ipGroupMemberIfIndex.o_bytes);
10271 10272  
10272 10273                          ipm.ipGroupMemberAddress = ilm->ilm_addr;
10273 10274                          ipm.ipGroupMemberRefCnt = ilm->ilm_refcnt;
10274 10275                          ipm.ipGroupMemberFilterMode = ilm->ilm_fmode;
10275 10276                          if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
10276 10277                              (char *)&ipm, (int)sizeof (ipm))) {
10277 10278                                  ip1dbg(("ip_snmp_get_mib2_ip_group: "
10278 10279                                      "failed to allocate %u bytes\n",
10279 10280                                      (uint_t)sizeof (ipm)));
10280 10281                          }
10281 10282                  }
10282 10283                  rw_exit(&ill->ill_mcast_lock);
10283 10284                  ill_refrele(ill);
10284 10285                  rw_enter(&ipst->ips_ill_g_lock, RW_READER);
10285 10286          }
10286 10287          rw_exit(&ipst->ips_ill_g_lock);
10287 10288          optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
10288 10289          ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
10289 10290              (int)optp->level, (int)optp->name, (int)optp->len));
10290 10291          qreply(q, mpctl);
10291 10292          return (mp2ctl);
10292 10293  }
10293 10294  
10294 10295  /* IPv6 multicast group membership. */
10295 10296  static mblk_t *
10296 10297  ip_snmp_get_mib2_ip6_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
10297 10298  {
10298 10299          struct opthdr           *optp;
10299 10300          mblk_t                  *mp2ctl;
10300 10301          ill_t                   *ill;
10301 10302          ilm_t                   *ilm;
10302 10303          ipv6_member_t           ipm6;
10303 10304          mblk_t                  *mp_tail = NULL;
10304 10305          ill_walk_context_t      ctx;
10305 10306          zoneid_t                zoneid;
10306 10307  
10307 10308          /*
10308 10309           * make a copy of the original message
10309 10310           */
10310 10311          mp2ctl = copymsg(mpctl);
10311 10312          zoneid = Q_TO_CONN(q)->conn_zoneid;
10312 10313  
10313 10314          /* ip6GroupMember table */
10314 10315          optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10315 10316          optp->level = MIB2_IP6;
10316 10317          optp->name = EXPER_IP6_GROUP_MEMBERSHIP;
10317 10318  
10318 10319          rw_enter(&ipst->ips_ill_g_lock, RW_READER);
10319 10320          ill = ILL_START_WALK_V6(&ctx, ipst);
10320 10321          for (; ill != NULL; ill = ill_next(&ctx, ill)) {
10321 10322                  /* Make sure the ill isn't going away. */
10322 10323                  if (!ill_check_and_refhold(ill))
10323 10324                          continue;
10324 10325                  rw_exit(&ipst->ips_ill_g_lock);
10325 10326                  /*
10326 10327                   * Normally we don't have any members on under IPMP interfaces.
10327 10328                   * We report them as a debugging aid.
10328 10329                   */
10329 10330                  rw_enter(&ill->ill_mcast_lock, RW_READER);
10330 10331                  ipm6.ipv6GroupMemberIfIndex = ill->ill_phyint->phyint_ifindex;
10331 10332                  for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
10332 10333                          if (ilm->ilm_zoneid != zoneid &&
10333 10334                              ilm->ilm_zoneid != ALL_ZONES)
10334 10335                                  continue;       /* not this zone */
10335 10336                          ipm6.ipv6GroupMemberAddress = ilm->ilm_v6addr;
10336 10337                          ipm6.ipv6GroupMemberRefCnt = ilm->ilm_refcnt;
10337 10338                          ipm6.ipv6GroupMemberFilterMode = ilm->ilm_fmode;
10338 10339                          if (!snmp_append_data2(mpctl->b_cont,
10339 10340                              &mp_tail,
10340 10341                              (char *)&ipm6, (int)sizeof (ipm6))) {
10341 10342                                  ip1dbg(("ip_snmp_get_mib2_ip6_group: "
10342 10343                                      "failed to allocate %u bytes\n",
10343 10344                                      (uint_t)sizeof (ipm6)));
10344 10345                          }
10345 10346                  }
10346 10347                  rw_exit(&ill->ill_mcast_lock);
10347 10348                  ill_refrele(ill);
10348 10349                  rw_enter(&ipst->ips_ill_g_lock, RW_READER);
10349 10350          }
10350 10351          rw_exit(&ipst->ips_ill_g_lock);
10351 10352  
10352 10353          optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
10353 10354          ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
10354 10355              (int)optp->level, (int)optp->name, (int)optp->len));
10355 10356          qreply(q, mpctl);
10356 10357          return (mp2ctl);
10357 10358  }
10358 10359  
10359 10360  /* IP multicast filtered sources */
10360 10361  static mblk_t *
10361 10362  ip_snmp_get_mib2_ip_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
10362 10363  {
10363 10364          struct opthdr           *optp;
10364 10365          mblk_t                  *mp2ctl;
10365 10366          ill_t                   *ill;
10366 10367          ipif_t                  *ipif;
10367 10368          ilm_t                   *ilm;
10368 10369          ip_grpsrc_t             ips;
10369 10370          mblk_t                  *mp_tail = NULL;
10370 10371          ill_walk_context_t      ctx;
10371 10372          zoneid_t                zoneid;
10372 10373          int                     i;
10373 10374          slist_t                 *sl;
10374 10375  
10375 10376          /*
10376 10377           * make a copy of the original message
10377 10378           */
10378 10379          mp2ctl = copymsg(mpctl);
10379 10380          zoneid = Q_TO_CONN(q)->conn_zoneid;
10380 10381  
10381 10382          /* ipGroupSource table */
10382 10383          optp = (struct opthdr *)&mpctl->b_rptr[
10383 10384              sizeof (struct T_optmgmt_ack)];
10384 10385          optp->level = MIB2_IP;
10385 10386          optp->name = EXPER_IP_GROUP_SOURCES;
10386 10387  
10387 10388          rw_enter(&ipst->ips_ill_g_lock, RW_READER);
10388 10389          ill = ILL_START_WALK_V4(&ctx, ipst);
10389 10390          for (; ill != NULL; ill = ill_next(&ctx, ill)) {
10390 10391                  /* Make sure the ill isn't going away. */
10391 10392                  if (!ill_check_and_refhold(ill))
10392 10393                          continue;
10393 10394                  rw_exit(&ipst->ips_ill_g_lock);
10394 10395                  rw_enter(&ill->ill_mcast_lock, RW_READER);
10395 10396                  for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
10396 10397                          sl = ilm->ilm_filter;
10397 10398                          if (ilm->ilm_zoneid != zoneid &&
10398 10399                              ilm->ilm_zoneid != ALL_ZONES)
10399 10400                                  continue;
10400 10401                          if (SLIST_IS_EMPTY(sl))
10401 10402                                  continue;
10402 10403  
10403 10404                          /* Is there an ipif for ilm_ifaddr? */
10404 10405                          for (ipif = ill->ill_ipif; ipif != NULL;
10405 10406                              ipif = ipif->ipif_next) {
10406 10407                                  if (!IPIF_IS_CONDEMNED(ipif) &&
10407 10408                                      ipif->ipif_lcl_addr == ilm->ilm_ifaddr &&
10408 10409                                      ilm->ilm_ifaddr != INADDR_ANY)
10409 10410                                          break;
10410 10411                          }
10411 10412                          if (ipif != NULL) {
10412 10413                                  ipif_get_name(ipif,
10413 10414                                      ips.ipGroupSourceIfIndex.o_bytes,
10414 10415                                      OCTET_LENGTH);
10415 10416                          } else {
10416 10417                                  ill_get_name(ill,
10417 10418                                      ips.ipGroupSourceIfIndex.o_bytes,
10418 10419                                      OCTET_LENGTH);
10419 10420                          }
10420 10421                          ips.ipGroupSourceIfIndex.o_length =
10421 10422                              mi_strlen(ips.ipGroupSourceIfIndex.o_bytes);
10422 10423  
10423 10424                          ips.ipGroupSourceGroup = ilm->ilm_addr;
10424 10425                          for (i = 0; i < sl->sl_numsrc; i++) {
10425 10426                                  if (!IN6_IS_ADDR_V4MAPPED(&sl->sl_addr[i]))
10426 10427                                          continue;
10427 10428                                  IN6_V4MAPPED_TO_IPADDR(&sl->sl_addr[i],
10428 10429                                      ips.ipGroupSourceAddress);
10429 10430                                  if (snmp_append_data2(mpctl->b_cont, &mp_tail,
10430 10431                                      (char *)&ips, (int)sizeof (ips)) == 0) {
10431 10432                                          ip1dbg(("ip_snmp_get_mib2_ip_group_src:"
10432 10433                                              " failed to allocate %u bytes\n",
10433 10434                                              (uint_t)sizeof (ips)));
10434 10435                                  }
10435 10436                          }
10436 10437                  }
10437 10438                  rw_exit(&ill->ill_mcast_lock);
10438 10439                  ill_refrele(ill);
10439 10440                  rw_enter(&ipst->ips_ill_g_lock, RW_READER);
10440 10441          }
10441 10442          rw_exit(&ipst->ips_ill_g_lock);
10442 10443          optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
10443 10444          ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
10444 10445              (int)optp->level, (int)optp->name, (int)optp->len));
10445 10446          qreply(q, mpctl);
10446 10447          return (mp2ctl);
10447 10448  }
10448 10449  
10449 10450  /* IPv6 multicast filtered sources. */
10450 10451  static mblk_t *
10451 10452  ip_snmp_get_mib2_ip6_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
10452 10453  {
10453 10454          struct opthdr           *optp;
10454 10455          mblk_t                  *mp2ctl;
10455 10456          ill_t                   *ill;
10456 10457          ilm_t                   *ilm;
10457 10458          ipv6_grpsrc_t           ips6;
10458 10459          mblk_t                  *mp_tail = NULL;
10459 10460          ill_walk_context_t      ctx;
10460 10461          zoneid_t                zoneid;
10461 10462          int                     i;
10462 10463          slist_t                 *sl;
10463 10464  
10464 10465          /*
10465 10466           * make a copy of the original message
10466 10467           */
10467 10468          mp2ctl = copymsg(mpctl);
10468 10469          zoneid = Q_TO_CONN(q)->conn_zoneid;
10469 10470  
10470 10471          /* ip6GroupMember table */
10471 10472          optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10472 10473          optp->level = MIB2_IP6;
10473 10474          optp->name = EXPER_IP6_GROUP_SOURCES;
10474 10475  
10475 10476          rw_enter(&ipst->ips_ill_g_lock, RW_READER);
10476 10477          ill = ILL_START_WALK_V6(&ctx, ipst);
10477 10478          for (; ill != NULL; ill = ill_next(&ctx, ill)) {
10478 10479                  /* Make sure the ill isn't going away. */
10479 10480                  if (!ill_check_and_refhold(ill))
10480 10481                          continue;
10481 10482                  rw_exit(&ipst->ips_ill_g_lock);
10482 10483                  /*
10483 10484                   * Normally we don't have any members on under IPMP interfaces.
10484 10485                   * We report them as a debugging aid.
10485 10486                   */
10486 10487                  rw_enter(&ill->ill_mcast_lock, RW_READER);
10487 10488                  ips6.ipv6GroupSourceIfIndex = ill->ill_phyint->phyint_ifindex;
10488 10489                  for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
10489 10490                          sl = ilm->ilm_filter;
10490 10491                          if (ilm->ilm_zoneid != zoneid &&
10491 10492                              ilm->ilm_zoneid != ALL_ZONES)
10492 10493                                  continue;
10493 10494                          if (SLIST_IS_EMPTY(sl))
10494 10495                                  continue;
10495 10496                          ips6.ipv6GroupSourceGroup = ilm->ilm_v6addr;
10496 10497                          for (i = 0; i < sl->sl_numsrc; i++) {
10497 10498                                  ips6.ipv6GroupSourceAddress = sl->sl_addr[i];
10498 10499                                  if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
10499 10500                                      (char *)&ips6, (int)sizeof (ips6))) {
10500 10501                                          ip1dbg(("ip_snmp_get_mib2_ip6_"
10501 10502                                              "group_src: failed to allocate "
10502 10503                                              "%u bytes\n",
10503 10504                                              (uint_t)sizeof (ips6)));
10504 10505                                  }
10505 10506                          }
10506 10507                  }
10507 10508                  rw_exit(&ill->ill_mcast_lock);
10508 10509                  ill_refrele(ill);
10509 10510                  rw_enter(&ipst->ips_ill_g_lock, RW_READER);
10510 10511          }
10511 10512          rw_exit(&ipst->ips_ill_g_lock);
10512 10513  
10513 10514          optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
10514 10515          ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
10515 10516              (int)optp->level, (int)optp->name, (int)optp->len));
10516 10517          qreply(q, mpctl);
10517 10518          return (mp2ctl);
10518 10519  }
10519 10520  
10520 10521  /* Multicast routing virtual interface table. */
10521 10522  static mblk_t *
10522 10523  ip_snmp_get_mib2_virt_multi(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
10523 10524  {
10524 10525          struct opthdr           *optp;
10525 10526          mblk_t                  *mp2ctl;
10526 10527  
10527 10528          /*
10528 10529           * make a copy of the original message
10529 10530           */
10530 10531          mp2ctl = copymsg(mpctl);
10531 10532  
10532 10533          optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10533 10534          optp->level = EXPER_DVMRP;
10534 10535          optp->name = EXPER_DVMRP_VIF;
10535 10536          if (!ip_mroute_vif(mpctl->b_cont, ipst)) {
10536 10537                  ip0dbg(("ip_mroute_vif: failed\n"));
10537 10538          }
10538 10539          optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
10539 10540          ip3dbg(("ip_snmp_get_mib2_virt_multi: level %d, name %d, len %d\n",
10540 10541              (int)optp->level, (int)optp->name, (int)optp->len));
10541 10542          qreply(q, mpctl);
10542 10543          return (mp2ctl);
10543 10544  }
10544 10545  
10545 10546  /* Multicast routing table. */
10546 10547  static mblk_t *
10547 10548  ip_snmp_get_mib2_multi_rtable(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
10548 10549  {
10549 10550          struct opthdr           *optp;
10550 10551          mblk_t                  *mp2ctl;
10551 10552  
10552 10553          /*
10553 10554           * make a copy of the original message
10554 10555           */
10555 10556          mp2ctl = copymsg(mpctl);
10556 10557  
10557 10558          optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10558 10559          optp->level = EXPER_DVMRP;
10559 10560          optp->name = EXPER_DVMRP_MRT;
10560 10561          if (!ip_mroute_mrt(mpctl->b_cont, ipst)) {
10561 10562                  ip0dbg(("ip_mroute_mrt: failed\n"));
10562 10563          }
10563 10564          optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
10564 10565          ip3dbg(("ip_snmp_get_mib2_multi_rtable: level %d, name %d, len %d\n",
10565 10566              (int)optp->level, (int)optp->name, (int)optp->len));
10566 10567          qreply(q, mpctl);
10567 10568          return (mp2ctl);
10568 10569  }
10569 10570  
10570 10571  /*
10571 10572   * Return ipRouteEntryTable, ipNetToMediaEntryTable, and ipRouteAttributeTable
10572 10573   * in one IRE walk.
10573 10574   */
10574 10575  static mblk_t *
10575 10576  ip_snmp_get_mib2_ip_route_media(queue_t *q, mblk_t *mpctl, int level,
10576 10577      ip_stack_t *ipst)
10577 10578  {
10578 10579          struct opthdr   *optp;
10579 10580          mblk_t          *mp2ctl;        /* Returned */
10580 10581          mblk_t          *mp3ctl;        /* nettomedia */
10581 10582          mblk_t          *mp4ctl;        /* routeattrs */
10582 10583          iproutedata_t   ird;
10583 10584          zoneid_t        zoneid;
10584 10585  
10585 10586          /*
10586 10587           * make copies of the original message
10587 10588           *      - mp2ctl is returned unchanged to the caller for its use
10588 10589           *      - mpctl is sent upstream as ipRouteEntryTable
10589 10590           *      - mp3ctl is sent upstream as ipNetToMediaEntryTable
10590 10591           *      - mp4ctl is sent upstream as ipRouteAttributeTable
10591 10592           */
10592 10593          mp2ctl = copymsg(mpctl);
10593 10594          mp3ctl = copymsg(mpctl);
10594 10595          mp4ctl = copymsg(mpctl);
10595 10596          if (mp3ctl == NULL || mp4ctl == NULL) {
10596 10597                  freemsg(mp4ctl);
10597 10598                  freemsg(mp3ctl);
10598 10599                  freemsg(mp2ctl);
10599 10600                  freemsg(mpctl);
10600 10601                  return (NULL);
10601 10602          }
10602 10603  
10603 10604          bzero(&ird, sizeof (ird));
10604 10605  
10605 10606          ird.ird_route.lp_head = mpctl->b_cont;
10606 10607          ird.ird_netmedia.lp_head = mp3ctl->b_cont;
10607 10608          ird.ird_attrs.lp_head = mp4ctl->b_cont;
10608 10609          /*
10609 10610           * If the level has been set the special EXPER_IP_AND_ALL_IRES value,
10610 10611           * then also include ire_testhidden IREs and IRE_IF_CLONE.  This is
10611 10612           * intended a temporary solution until a proper MIB API is provided
10612 10613           * that provides complete filtering/caller-opt-in.
10613 10614           */
10614 10615          if (level == EXPER_IP_AND_ALL_IRES)
10615 10616                  ird.ird_flags |= IRD_REPORT_ALL;
10616 10617  
10617 10618          zoneid = Q_TO_CONN(q)->conn_zoneid;
10618 10619          ire_walk_v4(ip_snmp_get2_v4, &ird, zoneid, ipst);
10619 10620  
10620 10621          /* ipRouteEntryTable in mpctl */
10621 10622          optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10622 10623          optp->level = MIB2_IP;
10623 10624          optp->name = MIB2_IP_ROUTE;
10624 10625          optp->len = msgdsize(ird.ird_route.lp_head);
10625 10626          ip3dbg(("ip_snmp_get_mib2_ip_route_media: level %d, name %d, len %d\n",
10626 10627              (int)optp->level, (int)optp->name, (int)optp->len));
10627 10628          qreply(q, mpctl);
10628 10629  
10629 10630          /* ipNetToMediaEntryTable in mp3ctl */
10630 10631          ncec_walk(NULL, ip_snmp_get2_v4_media, &ird, ipst);
10631 10632  
10632 10633          optp = (struct opthdr *)&mp3ctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10633 10634          optp->level = MIB2_IP;
10634 10635          optp->name = MIB2_IP_MEDIA;
10635 10636          optp->len = msgdsize(ird.ird_netmedia.lp_head);
10636 10637          ip3dbg(("ip_snmp_get_mib2_ip_route_media: level %d, name %d, len %d\n",
10637 10638              (int)optp->level, (int)optp->name, (int)optp->len));
10638 10639          qreply(q, mp3ctl);
10639 10640  
10640 10641          /* ipRouteAttributeTable in mp4ctl */
10641 10642          optp = (struct opthdr *)&mp4ctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10642 10643          optp->level = MIB2_IP;
10643 10644          optp->name = EXPER_IP_RTATTR;
10644 10645          optp->len = msgdsize(ird.ird_attrs.lp_head);
10645 10646          ip3dbg(("ip_snmp_get_mib2_ip_route_media: level %d, name %d, len %d\n",
10646 10647              (int)optp->level, (int)optp->name, (int)optp->len));
10647 10648          if (optp->len == 0)
10648 10649                  freemsg(mp4ctl);
10649 10650          else
10650 10651                  qreply(q, mp4ctl);
10651 10652  
10652 10653          return (mp2ctl);
10653 10654  }
10654 10655  
10655 10656  /*
10656 10657   * Return ipv6RouteEntryTable and ipv6RouteAttributeTable in one IRE walk, and
10657 10658   * ipv6NetToMediaEntryTable in an NDP walk.
              *
              * Three replies are built from mpctl and copies of it and are sent
              * upstream on q with qreply(): the route table (in mpctl itself), the
              * net-to-media table (mp3ctl) and the route attribute table (mp4ctl, which
              * is freed instead of sent when it ends up empty).  If level is
              * EXPER_IP_AND_ALL_IRES the walk callbacks are told to report all IREs via
              * IRD_REPORT_ALL.
              *
              * Returns a further copy of the original mpctl (mp2ctl) for the caller; that
              * copy is not checked here, so it is whatever copymsg() produced.  If
              * either the mp3ctl or the mp4ctl copy fails, all four messages, including
              * the caller's mpctl, are freed and NULL is returned.
10658 10659   */
10659 10660  static mblk_t *
10660 10661  ip_snmp_get_mib2_ip6_route_media(queue_t *q, mblk_t *mpctl, int level,
10661 10662      ip_stack_t *ipst)
10662 10663  {
10663 10664          struct opthdr   *optp;
10664 10665          mblk_t          *mp2ctl;        /* Returned */
10665 10666          mblk_t          *mp3ctl;        /* nettomedia */
10666 10667          mblk_t          *mp4ctl;        /* routeattrs */
10667 10668          iproutedata_t   ird;
10668 10669          zoneid_t        zoneid;
10669 10670  
10670 10671          /*
10671 10672           * make copies of the original message
10672 10673           *      - mp2ctl is returned unchanged to the caller for its use
10673 10674           *      - mpctl is sent upstream as ipv6RouteEntryTable
10674 10675           *      - mp3ctl is sent upstream as ipv6NetToMediaEntryTable
10675 10676           *      - mp4ctl is sent upstream as ipv6RouteAttributeTable
10676 10677           */
10677 10678          mp2ctl = copymsg(mpctl);
10678 10679          mp3ctl = copymsg(mpctl);
10679 10680          mp4ctl = copymsg(mpctl);
                     /*
                      * Only the two copies that are dereferenced below are checked.
                      * freemsg() is called on every message here, including any copy
                      * that is NULL.
                      */
10680 10681          if (mp3ctl == NULL || mp4ctl == NULL) {
10681 10682                  freemsg(mp4ctl);
10682 10683                  freemsg(mp3ctl);
10683 10684                  freemsg(mp2ctl);
10684 10685                  freemsg(mpctl);
10685 10686                  return (NULL);
10686 10687          }
10687 10688  
10688 10689          bzero(&ird, sizeof (ird));
10689 10690  
                     /*
                      * Each list head is the b_cont chain of the message that will
                      * carry the corresponding table.
                      */
10690 10691          ird.ird_route.lp_head = mpctl->b_cont;
10691 10692          ird.ird_netmedia.lp_head = mp3ctl->b_cont;
10692 10693          ird.ird_attrs.lp_head = mp4ctl->b_cont;
10693 10694          /*
10694 10695           * If the level has been set to the special EXPER_IP_AND_ALL_IRES
10695 10696           * value, then also include ire_testhidden IREs and IRE_IF_CLONE.  This
10696 10697           * is intended as a temporary solution until a proper MIB API is
10697 10698           * provided that provides complete filtering/caller-opt-in.
10698 10699           */
10699 10700          if (level == EXPER_IP_AND_ALL_IRES)
10700 10701                  ird.ird_flags |= IRD_REPORT_ALL;
10701 10702  
                     /* The zone id of the conn behind q is handed to the IRE walk. */
10702 10703          zoneid = Q_TO_CONN(q)->conn_zoneid;
10703 10704          ire_walk_v6(ip_snmp_get2_v6_route, &ird, zoneid, ipst);
10704 10705  
                     /* ipv6RouteEntryTable in mpctl */
10705 10706          optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10706 10707          optp->level = MIB2_IP6;
10707 10708          optp->name = MIB2_IP6_ROUTE;
10708 10709          optp->len = msgdsize(ird.ird_route.lp_head);
10709 10710          ip3dbg(("ip_snmp_get_mib2_ip6_route_media: level %d, name %d, len %d\n",
10710 10711              (int)optp->level, (int)optp->name, (int)optp->len));
10711 10712          qreply(q, mpctl);
10712 10713  
10713 10714          /* ipv6NetToMediaEntryTable in mp3ctl */
10714 10715          ncec_walk(NULL, ip_snmp_get2_v6_media, &ird, ipst);
10715 10716  
10716 10717          optp = (struct opthdr *)&mp3ctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10717 10718          optp->level = MIB2_IP6;
10718 10719          optp->name = MIB2_IP6_MEDIA;
10719 10720          optp->len = msgdsize(ird.ird_netmedia.lp_head);
10720 10721          ip3dbg(("ip_snmp_get_mib2_ip6_route_media: level %d, name %d, len %d\n",
10721 10722              (int)optp->level, (int)optp->name, (int)optp->len));
10722 10723          qreply(q, mp3ctl);
10723 10724  
10724 10725          /* ipv6RouteAttributeTable in mp4ctl */
10725 10726          optp = (struct opthdr *)&mp4ctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10726 10727          optp->level = MIB2_IP6;
10727 10728          optp->name = EXPER_IP_RTATTR;
10728 10729          optp->len = msgdsize(ird.ird_attrs.lp_head);
10729 10730          ip3dbg(("ip_snmp_get_mib2_ip6_route_media: level %d, name %d, len %d\n",
10730 10731              (int)optp->level, (int)optp->name, (int)optp->len));
                     /*
                      * Nothing was collected for the attribute table: drop the message
                      * rather than sending an empty reply.
                      */
10731 10732          if (optp->len == 0)
10732 10733                  freemsg(mp4ctl);
10733 10734          else
10734 10735                  qreply(q, mp4ctl);
10735 10736  
10736 10737          return (mp2ctl);
10737 10738  }
10738 10739  
10739 10740  /*
10740 10741   * IPv6 mib: One per ill
              *
              * Appends the stack-wide ips_ip6_mib entry (flagged with
              * MIB2_UNKNOWN_INTERFACE) followed by the ill_ip_mib of every ill returned
              * by the ILL_START_WALK_V6 walk to mpctl->b_cont, then sends mpctl upstream
              * on q with qreply().  The ill list is walked with ips_ill_g_lock held as
              * reader.
              *
              * When legacy_req is set, entries are appended with the sizes computed by
              * LEGACY_MIB_SIZE() and the EntrySize fields of each appended copy are
              * patched to match.
              *
              * Returns a copy of mpctl taken before it is modified; the copymsg() result
              * is not checked here.  Failures reported by snmp_append_data2() are only
              * logged through ip1dbg().
10741 10742   */
10742 10743  static mblk_t *
10743 10744  ip_snmp_get_mib2_ip6(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst,
10744 10745      boolean_t legacy_req)
10745 10746  {
10746 10747          struct opthdr           *optp;
10747 10748          mblk_t                  *mp2ctl;
10748 10749          ill_t                   *ill;
10749 10750          ill_walk_context_t      ctx;
10750 10751          mblk_t                  *mp_tail = NULL;
10751 10752          mib2_ipv6AddrEntry_t    mae6;
10752 10753          mib2_ipIfStatsEntry_t   *ise;
10753 10754          size_t                  ise_size, iae_size;
10754 10755  
10755 10756          /*
10756 10757           * Make a copy of the original message
10757 10758           */
10758 10759          mp2ctl = copymsg(mpctl);
10759 10760  
10760 10761          /* fixed length IPv6 structure ... */
10761 10762  
                     /*
                      * Pick the per-entry sizes to report.  mae6 is only handed to
                      * LEGACY_MIB_SIZE(); it is never filled in or appended here.
                      */
10762 10763          if (legacy_req) {
10763 10764                  ise_size = LEGACY_MIB_SIZE(&ipst->ips_ip6_mib,
10764 10765                      mib2_ipIfStatsEntry_t);
10765 10766                  iae_size = LEGACY_MIB_SIZE(&mae6, mib2_ipv6AddrEntry_t);
10766 10767          } else {
10767 10768                  ise_size = sizeof (mib2_ipIfStatsEntry_t);
10768 10769                  iae_size = sizeof (mib2_ipv6AddrEntry_t);
10769 10770          }
10770 10771  
10771 10772          optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10772 10773          optp->level = MIB2_IP6;
10773 10774          optp->name = 0;
10774 10775          /* Include "unknown interface" ip6_mib */
10775 10776          ipst->ips_ip6_mib.ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv6;
10776 10777          ipst->ips_ip6_mib.ipIfStatsIfIndex =
10777 10778              MIB2_UNKNOWN_INTERFACE; /* Flag to netstat */
10778 10779          SET_MIB(ipst->ips_ip6_mib.ipIfStatsForwarding,
10779 10780              ipst->ips_ipv6_forwarding ? 1 : 2);
10780 10781          SET_MIB(ipst->ips_ip6_mib.ipIfStatsDefaultHopLimit,
10781 10782              ipst->ips_ipv6_def_hops);
                     /*
                      * The stack-wide MIB always records the full structure sizes;
                      * legacy requests get these adjusted in the appended copy below.
                      */
10782 10783          SET_MIB(ipst->ips_ip6_mib.ipIfStatsEntrySize,
10783 10784              sizeof (mib2_ipIfStatsEntry_t));
10784 10785          SET_MIB(ipst->ips_ip6_mib.ipIfStatsAddrEntrySize,
10785 10786              sizeof (mib2_ipv6AddrEntry_t));
10786 10787          SET_MIB(ipst->ips_ip6_mib.ipIfStatsRouteEntrySize,
10787 10788              sizeof (mib2_ipv6RouteEntry_t));
10788 10789          SET_MIB(ipst->ips_ip6_mib.ipIfStatsNetToMediaEntrySize,
10789 10790              sizeof (mib2_ipv6NetToMediaEntry_t));
10790 10791          SET_MIB(ipst->ips_ip6_mib.ipIfStatsMemberEntrySize,
10791 10792              sizeof (ipv6_member_t));
10792 10793          SET_MIB(ipst->ips_ip6_mib.ipIfStatsGroupSourceEntrySize,
10793 10794              sizeof (ipv6_grpsrc_t));
10794 10795  
10795 10796          /*
10796 10797           * Synchronize 64- and 32-bit counters
10797 10798           */
10798 10799          SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsInReceives,
10799 10800              ipIfStatsHCInReceives);
10800 10801          SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsInDelivers,
10801 10802              ipIfStatsHCInDelivers);
10802 10803          SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsOutRequests,
10803 10804              ipIfStatsHCOutRequests);
10804 10805          SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsOutForwDatagrams,
10805 10806              ipIfStatsHCOutForwDatagrams);
10806 10807          SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsOutMcastPkts,
10807 10808              ipIfStatsHCOutMcastPkts);
10808 10809          SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsInMcastPkts,
10809 10810              ipIfStatsHCInMcastPkts);
10810 10811  
10811 10812          if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
10812 10813              (char *)&ipst->ips_ip6_mib, (int)ise_size)) {
10813 10814                  ip1dbg(("ip_snmp_get_mib2_ip6: failed to allocate %u bytes\n",
10814 10815                      (uint_t)ise_size));
10815 10816          } else if (legacy_req) {
10816 10817                  /* Adjust the EntrySize fields for legacy requests. */
                             /*
                              * NOTE(review): this presumes the entry just appended ends
                              * at mp_tail->b_wptr and is contiguous in that mblk --
                              * confirm against snmp_append_data2().
                              */
10817 10818                  ise =
10818 10819                      (mib2_ipIfStatsEntry_t *)(mp_tail->b_wptr - (int)ise_size);
10819 10820                  SET_MIB(ise->ipIfStatsEntrySize, ise_size);
10820 10821                  SET_MIB(ise->ipIfStatsAddrEntrySize, iae_size);
10821 10822          }
10822 10823  
                     /* Now one entry per ill, with the ill list held as reader. */
10823 10824          rw_enter(&ipst->ips_ill_g_lock, RW_READER);
10824 10825          ill = ILL_START_WALK_V6(&ctx, ipst);
10825 10826          for (; ill != NULL; ill = ill_next(&ctx, ill)) {
10826 10827                  ill->ill_ip_mib->ipIfStatsIfIndex =
10827 10828                      ill->ill_phyint->phyint_ifindex;
10828 10829                  SET_MIB(ill->ill_ip_mib->ipIfStatsForwarding,
10829 10830                      ipst->ips_ipv6_forwarding ? 1 : 2);
10830 10831                  SET_MIB(ill->ill_ip_mib->ipIfStatsDefaultHopLimit,
10831 10832                      ill->ill_max_hops);
10832 10833  
10833 10834                  /*
10834 10835                   * Synchronize 64- and 32-bit counters
10835 10836                   */
10836 10837                  SYNC32_MIB(ill->ill_ip_mib, ipIfStatsInReceives,
10837 10838                      ipIfStatsHCInReceives);
10838 10839                  SYNC32_MIB(ill->ill_ip_mib, ipIfStatsInDelivers,
10839 10840                      ipIfStatsHCInDelivers);
10840 10841                  SYNC32_MIB(ill->ill_ip_mib, ipIfStatsOutRequests,
10841 10842                      ipIfStatsHCOutRequests);
10842 10843                  SYNC32_MIB(ill->ill_ip_mib, ipIfStatsOutForwDatagrams,
10843 10844                      ipIfStatsHCOutForwDatagrams);
10844 10845                  SYNC32_MIB(ill->ill_ip_mib, ipIfStatsOutMcastPkts,
10845 10846                      ipIfStatsHCOutMcastPkts);
10846 10847                  SYNC32_MIB(ill->ill_ip_mib, ipIfStatsInMcastPkts,
10847 10848                      ipIfStatsHCInMcastPkts);
10848 10849  
10849 10850                  if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
10850 10851                      (char *)ill->ill_ip_mib, (int)ise_size)) {
10851 10852                          ip1dbg(("ip_snmp_get_mib2_ip6: failed to allocate "
10852 10853                          "%u bytes\n", (uint_t)ise_size));
10853 10854                  } else if (legacy_req) {
10854 10855                          /* Adjust the EntrySize fields for legacy requests. */
10855 10856                          ise = (mib2_ipIfStatsEntry_t *)(mp_tail->b_wptr -
10856 10857                              (int)ise_size);
10857 10858                          SET_MIB(ise->ipIfStatsEntrySize, ise_size);
10858 10859                          SET_MIB(ise->ipIfStatsAddrEntrySize, iae_size);
10859 10860                  }
10860 10861          }
10861 10862          rw_exit(&ipst->ips_ill_g_lock);
10862 10863  
                     /* Report the total data length and hand the reply upstream. */
10863 10864          optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
10864 10865          ip3dbg(("ip_snmp_get_mib2_ip6: level %d, name %d, len %d\n",
10865 10866              (int)optp->level, (int)optp->name, (int)optp->len));
10866 10867          qreply(q, mpctl);
10867 10868          return (mp2ctl);
10868 10869  }
10869 10870  
10870 10871  /*
10871 10872   * ICMPv6 mib: One per ill
              *
              * Appends the stack-wide ips_icmp6_mib entry (flagged with
              * MIB2_UNKNOWN_INTERFACE) followed by the ill_icmp6_mib of every ill
              * returned by the ILL_START_WALK_V6 walk to mpctl->b_cont, then sends mpctl
              * upstream on q with qreply().  The ill list is walked with
              * ips_ill_g_lock held as reader.
              *
              * Returns a copy of mpctl taken before it is modified; the copymsg() result
              * is not checked here.  Failures reported by snmp_append_data2() are only
              * logged through ip1dbg().
10872 10873   */
10873 10874  static mblk_t *
10874 10875  ip_snmp_get_mib2_icmp6(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
10875 10876  {
10876 10877          struct opthdr           *optp;
10877 10878          mblk_t                  *mp2ctl;
10878 10879          ill_t                   *ill;
10879 10880          ill_walk_context_t      ctx;
10880 10881          mblk_t                  *mp_tail = NULL;
10881 10882          /*
10882 10883           * Make a copy of the original message
10883 10884           */
10884 10885          mp2ctl = copymsg(mpctl);
10885 10886  
10886 10887          /* fixed length ICMPv6 structure ... */
10887 10888  
10888 10889          optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10889 10890          optp->level = MIB2_ICMP6;
10890 10891          optp->name = 0;
10891 10892          /* Include "unknown interface" icmp6_mib */
10892 10893          ipst->ips_icmp6_mib.ipv6IfIcmpIfIndex =
10893 10894              MIB2_UNKNOWN_INTERFACE; /* netstat flag */
10894 10895          ipst->ips_icmp6_mib.ipv6IfIcmpEntrySize =
10895 10896              sizeof (mib2_ipv6IfIcmpEntry_t);
10896 10897          if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
10897 10898              (char *)&ipst->ips_icmp6_mib,
10898 10899              (int)sizeof (ipst->ips_icmp6_mib))) {
10899 10900                  ip1dbg(("ip_snmp_get_mib2_icmp6: failed to allocate %u bytes\n",
10900 10901                      (uint_t)sizeof (ipst->ips_icmp6_mib)));
10901 10902          }
10902 10903  
                     /* Now one entry per ill, with the ill list held as reader. */
10903 10904          rw_enter(&ipst->ips_ill_g_lock, RW_READER);
10904 10905          ill = ILL_START_WALK_V6(&ctx, ipst);
10905 10906          for (; ill != NULL; ill = ill_next(&ctx, ill)) {
                             /* Stamp the entry with the interface index of its ill. */
10906 10907                  ill->ill_icmp6_mib->ipv6IfIcmpIfIndex =
10907 10908                      ill->ill_phyint->phyint_ifindex;
10908 10909                  if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
10909 10910                      (char *)ill->ill_icmp6_mib,
10910 10911                      (int)sizeof (*ill->ill_icmp6_mib))) {
10911 10912                          ip1dbg(("ip_snmp_get_mib2_icmp6: failed to allocate "
10912 10913                              "%u bytes\n",
10913 10914                              (uint_t)sizeof (*ill->ill_icmp6_mib)));
10914 10915                  }
10915 10916          }
10916 10917          rw_exit(&ipst->ips_ill_g_lock);
10917 10918  
                     /* Report the total data length and hand the reply upstream. */
10918 10919          optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
10919 10920          ip3dbg(("ip_snmp_get_mib2_icmp6: level %d, name %d, len %d\n",
10920 10921              (int)optp->level, (int)optp->name, (int)optp->len));
10921 10922          qreply(q, mpctl);
10922 10923          return (mp2ctl);
10923 10924  }
10924 10925  
10925 10926  /*
10926 10927   * ire_walk routine to create both ipRouteEntryTable and
10927 10928   * ipRouteAttributeTable in one IRE walk
              *
              * For the given IPv4 IRE, builds a mib2_ipRouteEntry_t and appends it to
              * ird->ird_route; if the IRE has a gateway security attribute with a gc,
              * a mib2_ipAttributeEntry_t is also appended to ird->ird_attrs.  Unless
              * IRD_REPORT_ALL is set in ird->ird_flags, ire_testhidden IREs and
              * IRE_IF_CLONE IREs are skipped.
              *
              * Nothing is returned: an IRE is silently left out if the scratch entry
              * cannot be allocated, and append failures are only logged through
              * ip1dbg().
10928 10929   */
10929 10930  static void
10930 10931  ip_snmp_get2_v4(ire_t *ire, iproutedata_t *ird)
10931 10932  {
10932 10933          ill_t                           *ill;
10933 10934          mib2_ipRouteEntry_t             *re;
10934 10935          mib2_ipAttributeEntry_t         iaes;
10935 10936          tsol_ire_gw_secattr_t           *attrp;
10936 10937          tsol_gc_t                       *gc = NULL;
10937 10938          tsol_gcgrp_t                    *gcgrp = NULL;
10938 10939          ip_stack_t                      *ipst = ire->ire_ipst;
10939 10940  
10940 10941          ASSERT(ire->ire_ipversion == IPV4_VERSION);
10941 10942  
10942 10943          if (!(ird->ird_flags & IRD_REPORT_ALL)) {
10943 10944                  if (ire->ire_testhidden)
10944 10945                          return;
10945 10946                  if (ire->ire_type & IRE_IF_CLONE)
10946 10947                          return;
10947 10948          }
10948 10949  
                     /* Zeroed scratch entry; it is freed again before returning. */
10949 10950          if ((re = kmem_zalloc(sizeof (*re), KM_NOSLEEP)) == NULL)
10950 10951                  return;
10951 10952  
                     /*
                      * If the IRE has a gateway security attribute with a gc, take the
                      * gc group's rwlock as reader while igsa_lock is held.  The rwlock
                      * stays held until the end of this function, where gc is used.
                      */
10952 10953          if ((attrp = ire->ire_gw_secattr) != NULL) {
10953 10954                  mutex_enter(&attrp->igsa_lock);
10954 10955                  if ((gc = attrp->igsa_gc) != NULL) {
10955 10956                          gcgrp = gc->gc_grp;
10956 10957                          ASSERT(gcgrp != NULL);
10957 10958                          rw_enter(&gcgrp->gcgrp_rwlock, RW_READER);
10958 10959                  }
10959 10960                  mutex_exit(&attrp->igsa_lock);
10960 10961          }
10961 10962          /*
10962 10963           * Return all IRE types for route table... let caller pick and choose
10963 10964           */
10964 10965          re->ipRouteDest = ire->ire_addr;
10965 10966          ill = ire->ire_ill;
                     /* The interface is reported by name; empty when there is no ill. */
10966 10967          re->ipRouteIfIndex.o_length = 0;
10967 10968          if (ill != NULL) {
10968 10969                  ill_get_name(ill, re->ipRouteIfIndex.o_bytes, OCTET_LENGTH);
10969 10970                  re->ipRouteIfIndex.o_length =
10970 10971                      mi_strlen(re->ipRouteIfIndex.o_bytes);
10971 10972          }
10972 10973          re->ipRouteMetric1 = -1;
10973 10974          re->ipRouteMetric2 = -1;
10974 10975          re->ipRouteMetric3 = -1;
10975 10976          re->ipRouteMetric4 = -1;
10976 10977  
10977 10978          re->ipRouteNextHop = ire->ire_gateway_addr;
10978 10979          /* indirect(4), direct(3), or invalid(2) */
10979 10980          if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE))
10980 10981                  re->ipRouteType = 2;
10981 10982          else if (ire->ire_type & IRE_ONLINK)
10982 10983                  re->ipRouteType = 3;
10983 10984          else
10984 10985                  re->ipRouteType = 4;
10985 10986  
10986 10987          re->ipRouteProto = -1;
10987 10988          re->ipRouteAge = gethrestime_sec() - ire->ire_create_time;
10988 10989          re->ipRouteMask = ire->ire_mask;
10989 10990          re->ipRouteMetric5 = -1;
                     /* Use the ill's MTU when the IRE metrics carry none (zero). */
10990 10991          re->ipRouteInfo.re_max_frag = ire->ire_metrics.iulp_mtu;
10991 10992          if (ire->ire_ill != NULL && re->ipRouteInfo.re_max_frag == 0)
10992 10993                  re->ipRouteInfo.re_max_frag = ire->ire_ill->ill_mtu;
10993 10994  
10994 10995          re->ipRouteInfo.re_frag_flag    = 0;
10995 10996          re->ipRouteInfo.re_rtt          = 0;
10996 10997          re->ipRouteInfo.re_src_addr     = 0;
10997 10998          re->ipRouteInfo.re_ref          = ire->ire_refcnt;
10998 10999          re->ipRouteInfo.re_obpkt        = ire->ire_ob_pkt_count;
10999 11000          re->ipRouteInfo.re_ibpkt        = ire->ire_ib_pkt_count;
11000 11001          re->ipRouteInfo.re_flags        = ire->ire_flags;
11001 11002  
11002 11003          /* Add the IRE_IF_CLONE's counters to their parent IRE_INTERFACE */
11003 11004          if (ire->ire_type & IRE_INTERFACE) {
11004 11005                  ire_t *child;
11005 11006  
                             /* The child list is walked with ips_ire_dep_lock as reader. */
11006 11007                  rw_enter(&ipst->ips_ire_dep_lock, RW_READER);
11007 11008                  child = ire->ire_dep_children;
11008 11009                  while (child != NULL) {
11009 11010                          re->ipRouteInfo.re_obpkt += child->ire_ob_pkt_count;
11010 11011                          re->ipRouteInfo.re_ibpkt += child->ire_ib_pkt_count;
11011 11012                          child = child->ire_dep_sib_next;
11012 11013                  }
11013 11014                  rw_exit(&ipst->ips_ire_dep_lock);
11014 11015          }
11015 11016  
                     /*
                      * IREs flagged RTF_DYNAMIC are reported as IRE_HOST_REDIRECT instead
                      * of with their own ire_type.
                      */
11016 11017          if (ire->ire_flags & RTF_DYNAMIC) {
11017 11018                  re->ipRouteInfo.re_ire_type     = IRE_HOST_REDIRECT;
11018 11019          } else {
11019 11020                  re->ipRouteInfo.re_ire_type     = ire->ire_type;
11020 11021          }
11021 11022  
11022 11023          if (!snmp_append_data2(ird->ird_route.lp_head, &ird->ird_route.lp_tail,
11023 11024              (char *)re, (int)sizeof (*re))) {
11024 11025                  ip1dbg(("ip_snmp_get2_v4: failed to allocate %u bytes\n",
11025 11026                      (uint_t)sizeof (*re)));
11026 11027          }
11027 11028  
                     /*
                      * Only IREs with a gc get an attribute entry; iae_routeidx records
                      * the current ird_idx so the entry can be matched to this route.
                      */
11028 11029          if (gc != NULL) {
11029 11030                  iaes.iae_routeidx = ird->ird_idx;
11030 11031                  iaes.iae_doi = gc->gc_db->gcdb_doi;
11031 11032                  iaes.iae_slrange = gc->gc_db->gcdb_slrange;
11032 11033  
11033 11034                  if (!snmp_append_data2(ird->ird_attrs.lp_head,
11034 11035                      &ird->ird_attrs.lp_tail, (char *)&iaes, sizeof (iaes))) {
11035 11036                          ip1dbg(("ip_snmp_get2_v4: failed to allocate %u "
11036 11037                              "bytes\n", (uint_t)sizeof (iaes)));
11037 11038                  }
11038 11039          }
11039 11040  
11040 11041          /* bump route index for next pass */
11041 11042          ird->ird_idx++;
11042 11043  
11043 11044          kmem_free(re, sizeof (*re));
                     /* Drop the gc group lock taken above, if any. */
11044 11045          if (gcgrp != NULL)
11045 11046                  rw_exit(&gcgrp->gcgrp_rwlock);
11046 11047  }
11047 11048  
11048 11049  /*
11049 11050   * ire_walk routine to create ipv6RouteEntryTable and ipv6RouteAttributeTable.
              *
              * For the given IPv6 IRE, builds a mib2_ipv6RouteEntry_t and appends it to
              * ird->ird_route; if the IRE has a gateway security attribute with a gc,
              * a mib2_ipAttributeEntry_t is also appended to ird->ird_attrs.  Unless
              * IRD_REPORT_ALL is set in ird->ird_flags, ire_testhidden IREs and
              * IRE_IF_CLONE IREs are skipped.
              *
              * Nothing is returned: an IRE is silently left out if the scratch entry
              * cannot be allocated, and append failures are only logged through
              * ip1dbg().
11050 11051   */
11051 11052  static void
11052 11053  ip_snmp_get2_v6_route(ire_t *ire, iproutedata_t *ird)
11053 11054  {
11054 11055          ill_t                           *ill;
11055 11056          mib2_ipv6RouteEntry_t           *re;
11056 11057          mib2_ipAttributeEntry_t         iaes;
11057 11058          tsol_ire_gw_secattr_t           *attrp;
11058 11059          tsol_gc_t                       *gc = NULL;
11059 11060          tsol_gcgrp_t                    *gcgrp = NULL;
11060 11061          ip_stack_t                      *ipst = ire->ire_ipst;
11061 11062  
11062 11063          ASSERT(ire->ire_ipversion == IPV6_VERSION);
11063 11064  
11064 11065          if (!(ird->ird_flags & IRD_REPORT_ALL)) {
11065 11066                  if (ire->ire_testhidden)
11066 11067                          return;
11067 11068                  if (ire->ire_type & IRE_IF_CLONE)
11068 11069                          return;
11069 11070          }
11070 11071  
                     /* Zeroed scratch entry; it is freed again before returning. */
11071 11072          if ((re = kmem_zalloc(sizeof (*re), KM_NOSLEEP)) == NULL)
11072 11073                  return;
11073 11074  
                     /*
                      * If the IRE has a gateway security attribute with a gc, take the
                      * gc group's rwlock as reader while igsa_lock is held.  The rwlock
                      * stays held until the end of this function, where gc is used.
                      */
11074 11075          if ((attrp = ire->ire_gw_secattr) != NULL) {
11075 11076                  mutex_enter(&attrp->igsa_lock);
11076 11077                  if ((gc = attrp->igsa_gc) != NULL) {
11077 11078                          gcgrp = gc->gc_grp;
11078 11079                          ASSERT(gcgrp != NULL);
11079 11080                          rw_enter(&gcgrp->gcgrp_rwlock, RW_READER);
11080 11081                  }
11081 11082                  mutex_exit(&attrp->igsa_lock);
11082 11083          }
11083 11084          /*
11084 11085           * Return all IRE types for route table... let caller pick and choose
11085 11086           */
11086 11087          re->ipv6RouteDest = ire->ire_addr_v6;
11087 11088          re->ipv6RoutePfxLength = ip_mask_to_plen_v6(&ire->ire_mask_v6);
11088 11089          re->ipv6RouteIndex = 0; /* Unique when multiple with same dest/plen */
                     /* The interface is reported by name; empty when there is no ill. */
11089 11090          re->ipv6RouteIfIndex.o_length = 0;
11090 11091          ill = ire->ire_ill;
11091 11092          if (ill != NULL) {
11092 11093                  ill_get_name(ill, re->ipv6RouteIfIndex.o_bytes, OCTET_LENGTH);
11093 11094                  re->ipv6RouteIfIndex.o_length =
11094 11095                      mi_strlen(re->ipv6RouteIfIndex.o_bytes);
11095 11096          }
11096 11097  
11097 11098          ASSERT(!(ire->ire_type & IRE_BROADCAST));
11098 11099  
                     /* The gateway address is copied out under ire_lock. */
11099 11100          mutex_enter(&ire->ire_lock);
11100 11101          re->ipv6RouteNextHop = ire->ire_gateway_addr_v6;
11101 11102          mutex_exit(&ire->ire_lock);
11102 11103  
11103 11104          /* remote(4), local(3), or discard(2) */
11104 11105          if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE))
11105 11106                  re->ipv6RouteType = 2;
11106 11107          else if (ire->ire_type & IRE_ONLINK)
11107 11108                  re->ipv6RouteType = 3;
11108 11109          else
11109 11110                  re->ipv6RouteType = 4;
11110 11111  
11111 11112          re->ipv6RouteProtocol   = -1;
11112 11113          re->ipv6RoutePolicy     = 0;
11113 11114          re->ipv6RouteAge        = gethrestime_sec() - ire->ire_create_time;
11114 11115          re->ipv6RouteNextHopRDI = 0;
11115 11116          re->ipv6RouteWeight     = 0;
11116 11117          re->ipv6RouteMetric     = 0;
                     /* Use the ill's MTU when the IRE metrics carry none (zero). */
11117 11118          re->ipv6RouteInfo.re_max_frag = ire->ire_metrics.iulp_mtu;
11118 11119          if (ire->ire_ill != NULL && re->ipv6RouteInfo.re_max_frag == 0)
11119 11120                  re->ipv6RouteInfo.re_max_frag = ire->ire_ill->ill_mtu;
11120 11121  
11121 11122          re->ipv6RouteInfo.re_frag_flag  = 0;
11122 11123          re->ipv6RouteInfo.re_rtt        = 0;
11123 11124          re->ipv6RouteInfo.re_src_addr   = ipv6_all_zeros;
11124 11125          re->ipv6RouteInfo.re_obpkt      = ire->ire_ob_pkt_count;
11125 11126          re->ipv6RouteInfo.re_ibpkt      = ire->ire_ib_pkt_count;
11126 11127          re->ipv6RouteInfo.re_ref        = ire->ire_refcnt;
11127 11128          re->ipv6RouteInfo.re_flags      = ire->ire_flags;
11128 11129  
11129 11130          /* Add the IRE_IF_CLONE's counters to their parent IRE_INTERFACE */
11130 11131          if (ire->ire_type & IRE_INTERFACE) {
11131 11132                  ire_t *child;
11132 11133  
                             /* The child list is walked with ips_ire_dep_lock as reader. */
11133 11134                  rw_enter(&ipst->ips_ire_dep_lock, RW_READER);
11134 11135                  child = ire->ire_dep_children;
11135 11136                  while (child != NULL) {
11136 11137                          re->ipv6RouteInfo.re_obpkt += child->ire_ob_pkt_count;
11137 11138                          re->ipv6RouteInfo.re_ibpkt += child->ire_ib_pkt_count;
11138 11139                          child = child->ire_dep_sib_next;
11139 11140                  }
11140 11141                  rw_exit(&ipst->ips_ire_dep_lock);
11141 11142          }
                     /*
                      * IREs flagged RTF_DYNAMIC are reported as IRE_HOST_REDIRECT instead
                      * of with their own ire_type.
                      */
11142 11143          if (ire->ire_flags & RTF_DYNAMIC) {
11143 11144                  re->ipv6RouteInfo.re_ire_type   = IRE_HOST_REDIRECT;
11144 11145          } else {
11145 11146                  re->ipv6RouteInfo.re_ire_type   = ire->ire_type;
11146 11147          }
11147 11148  
11148 11149          if (!snmp_append_data2(ird->ird_route.lp_head, &ird->ird_route.lp_tail,
11149 11150              (char *)re, (int)sizeof (*re))) {
11150 11151                  ip1dbg(("ip_snmp_get2_v6: failed to allocate %u bytes\n",
11151 11152                      (uint_t)sizeof (*re)));
11152 11153          }
11153 11154  
                     /*
                      * Only IREs with a gc get an attribute entry; iae_routeidx records
                      * the current ird_idx so the entry can be matched to this route.
                      */
11154 11155          if (gc != NULL) {
11155 11156                  iaes.iae_routeidx = ird->ird_idx;
11156 11157                  iaes.iae_doi = gc->gc_db->gcdb_doi;
11157 11158                  iaes.iae_slrange = gc->gc_db->gcdb_slrange;
11158 11159  
11159 11160                  if (!snmp_append_data2(ird->ird_attrs.lp_head,
11160 11161                      &ird->ird_attrs.lp_tail, (char *)&iaes, sizeof (iaes))) {
11161 11162                          ip1dbg(("ip_snmp_get2_v6: failed to allocate %u "
11162 11163                              "bytes\n", (uint_t)sizeof (iaes)));
11163 11164                  }
11164 11165          }
11165 11166  
11166 11167          /* bump route index for next pass */
11167 11168          ird->ird_idx++;
11168 11169  
11169 11170          kmem_free(re, sizeof (*re));
                     /* Drop the gc group lock taken above, if any. */
11170 11171          if (gcgrp != NULL)
11171 11172                  rw_exit(&gcgrp->gcgrp_rwlock);
11172 11173  }
11173 11174  
11174 11175  /*
11175 11176   * ncec_walk routine to create ipv6NetToMediaEntryTable
              *
              * For the given neighbor cache entry, builds a mib2_ipv6NetToMediaEntry_t
              * on the stack and appends it to the ird_netmedia list of the
              * iproutedata_t passed in ptr.  Entries whose ill is not IPv6, or whose
              * ill is of net type IRE_LOOPBACK, are skipped.
              *
              * Nothing is returned; an append failure is only logged through ip1dbg().
11176 11177   */
11177 11178  static void
11178 11179  ip_snmp_get2_v6_media(ncec_t *ncec, void *ptr)
11179 11180  {
11180 11181          iproutedata_t *ird              = ptr;
11181 11182          ill_t                           *ill;
11182 11183          mib2_ipv6NetToMediaEntry_t      ntme;
11183 11184  
11184 11185          ill = ncec->ncec_ill;
11185 11186          /* skip arpce entries, and loopback ncec entries */
11186 11187          if (ill->ill_isv6 == B_FALSE || ill->ill_net_type == IRE_LOOPBACK)
11187 11188                  return;
             
                     /*
                      * The whole structure is copied into the reply below, but the
                      * physical address bytes are only written up to o_length, and not
                      * at all when there is no link-layer address.  Zero it first so no
                      * stale stack contents are sent upstream.
                      */
                     bzero(&ntme, sizeof (ntme));
             
11188 11189          /*
11189 11190           * Neighbor cache entry attached to IRE with on-link
11190 11191           * destination.
11191 11192           * We report all IPMP groups on ncec_ill which is normally the upper.
11192 11193           */
11193 11194          ntme.ipv6NetToMediaIfIndex = ill->ill_phyint->phyint_ifindex;
11194 11195          ntme.ipv6NetToMediaNetAddress = ncec->ncec_addr;
11195 11196          ntme.ipv6NetToMediaPhysAddress.o_length = ill->ill_phys_addr_length;
11196 11197          if (ncec->ncec_lladdr != NULL) {
11197 11198                  bcopy(ncec->ncec_lladdr, ntme.ipv6NetToMediaPhysAddress.o_bytes,
11198 11199                      ntme.ipv6NetToMediaPhysAddress.o_length);
11199 11200          }
11200 11201          /*
11201 11202           * Note: Returns ND_* states. Should be:
11202 11203           * reachable(1), stale(2), delay(3), probe(4),
11203 11204           * invalid(5), unknown(6)
11204 11205           */
11205 11206          ntme.ipv6NetToMediaState = ncec->ncec_state;
11206 11207          ntme.ipv6NetToMediaLastUpdated = 0;
11207 11208  
11208 11209          /* other(1), dynamic(2), static(3), local(4) */
11209 11210          if (NCE_MYADDR(ncec)) {
11210 11211                  ntme.ipv6NetToMediaType = 4;
11211 11212          } else if (ncec->ncec_flags & NCE_F_PUBLISH) {
11212 11213                  ntme.ipv6NetToMediaType = 1; /* proxy */
11213 11214          } else if (ncec->ncec_flags & NCE_F_STATIC) {
11214 11215                  ntme.ipv6NetToMediaType = 3;
11215 11216          } else if (ncec->ncec_flags & (NCE_F_MCAST|NCE_F_BCAST)) {
11216 11217                  ntme.ipv6NetToMediaType = 1;
11217 11218          } else {
11218 11219                  ntme.ipv6NetToMediaType = 2;
11219 11220          }
11220 11221  
11221 11222          if (!snmp_append_data2(ird->ird_netmedia.lp_head,
11222 11223              &ird->ird_netmedia.lp_tail, (char *)&ntme, sizeof (ntme))) {
11223 11224                  ip1dbg(("ip_snmp_get2_v6_media: failed to allocate %u bytes\n",
11224 11225                      (uint_t)sizeof (ntme)));
11225 11226          }
11226 11227  }
11227 11228  
11228 11229  int
11229 11230  nce2ace(ncec_t *ncec)
11230 11231  {
11231 11232          int flags = 0;
11232 11233  
11233 11234          if (NCE_ISREACHABLE(ncec))
11234 11235                  flags |= ACE_F_RESOLVED;
11235 11236          if (ncec->ncec_flags & NCE_F_AUTHORITY)
11236 11237                  flags |= ACE_F_AUTHORITY;
11237 11238          if (ncec->ncec_flags & NCE_F_PUBLISH)
11238 11239                  flags |= ACE_F_PUBLISH;
11239 11240          if ((ncec->ncec_flags & NCE_F_NONUD) != 0)
11240 11241                  flags |= ACE_F_PERMANENT;
11241 11242          if (NCE_MYADDR(ncec))
11242 11243                  flags |= (ACE_F_MYADDR | ACE_F_AUTHORITY);
11243 11244          if (ncec->ncec_flags & NCE_F_UNVERIFIED)
11244 11245                  flags |= ACE_F_UNVERIFIED;
11245 11246          if (ncec->ncec_flags & NCE_F_AUTHORITY)
11246 11247                  flags |= ACE_F_AUTHORITY;
11247 11248          if (ncec->ncec_flags & NCE_F_DELAYED)
11248 11249                  flags |= ACE_F_DELAYED;
11249 11250          return (flags);
11250 11251  }
11251 11252  
11252 11253  /*
11253 11254   * ncec_walk routine to create ipNetToMediaEntryTable
11254 11255   */
11255 11256  static void
11256 11257  ip_snmp_get2_v4_media(ncec_t *ncec, void *ptr)
11257 11258  {
11258 11259          iproutedata_t *ird              = ptr;
11259 11260          ill_t                           *ill;
11260 11261          mib2_ipNetToMediaEntry_t        ntme;
11261 11262          const char                      *name = "unknown";
11262 11263          ipaddr_t                        ncec_addr;
11263 11264  
11264 11265          ill = ncec->ncec_ill;
11265 11266          if (ill->ill_isv6 || (ncec->ncec_flags & NCE_F_BCAST) ||
11266 11267              ill->ill_net_type == IRE_LOOPBACK)
11267 11268                  return;
11268 11269  
11269 11270          /* We report all IPMP groups on ncec_ill which is normally the upper. */
11270 11271          name = ill->ill_name;
11271 11272          /* Based on RFC 4293: other(1), inval(2), dyn(3), stat(4) */
11272 11273          if (NCE_MYADDR(ncec)) {
11273 11274                  ntme.ipNetToMediaType = 4;
11274 11275          } else if (ncec->ncec_flags & (NCE_F_MCAST|NCE_F_BCAST|NCE_F_PUBLISH)) {
11275 11276                  ntme.ipNetToMediaType = 1;
11276 11277          } else {
11277 11278                  ntme.ipNetToMediaType = 3;
11278 11279          }
11279 11280          ntme.ipNetToMediaIfIndex.o_length = MIN(OCTET_LENGTH, strlen(name));
11280 11281          bcopy(name, ntme.ipNetToMediaIfIndex.o_bytes,
11281 11282              ntme.ipNetToMediaIfIndex.o_length);
11282 11283  
11283 11284          IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
11284 11285          bcopy(&ncec_addr, &ntme.ipNetToMediaNetAddress, sizeof (ncec_addr));
11285 11286  
11286 11287          ntme.ipNetToMediaInfo.ntm_mask.o_length = sizeof (ipaddr_t);
11287 11288          ncec_addr = INADDR_BROADCAST;
11288 11289          bcopy(&ncec_addr, ntme.ipNetToMediaInfo.ntm_mask.o_bytes,
11289 11290              sizeof (ncec_addr));
11290 11291          /*
11291 11292           * map all the flags to the ACE counterpart.
11292 11293           */
11293 11294          ntme.ipNetToMediaInfo.ntm_flags = nce2ace(ncec);
11294 11295  
11295 11296          ntme.ipNetToMediaPhysAddress.o_length =
11296 11297              MIN(OCTET_LENGTH, ill->ill_phys_addr_length);
11297 11298  
11298 11299          if (!NCE_ISREACHABLE(ncec))
11299 11300                  ntme.ipNetToMediaPhysAddress.o_length = 0;
11300 11301          else {
11301 11302                  if (ncec->ncec_lladdr != NULL) {
11302 11303                          bcopy(ncec->ncec_lladdr,
11303 11304                              ntme.ipNetToMediaPhysAddress.o_bytes,
11304 11305                              ntme.ipNetToMediaPhysAddress.o_length);
11305 11306                  }
11306 11307          }
11307 11308  
11308 11309          if (!snmp_append_data2(ird->ird_netmedia.lp_head,
11309 11310              &ird->ird_netmedia.lp_tail, (char *)&ntme, sizeof (ntme))) {
11310 11311                  ip1dbg(("ip_snmp_get2_v4_media: failed to allocate %u bytes\n",
11311 11312                      (uint_t)sizeof (ntme)));
11312 11313          }
11313 11314  }
11314 11315  
11315 11316  /*
11316 11317   * return (0) if invalid set request, 1 otherwise, including non-tcp requests
11317 11318   */
11318 11319  /* ARGSUSED */
11319 11320  int
11320 11321  ip_snmp_set(queue_t *q, int level, int name, uchar_t *ptr, int len)
11321 11322  {
11322 11323          switch (level) {
11323 11324          case MIB2_IP:
11324 11325          case MIB2_ICMP:
11325 11326                  switch (name) {
11326 11327                  default:
11327 11328                          break;
11328 11329                  }
11329 11330                  return (1);
11330 11331          default:
11331 11332                  return (1);
11332 11333          }
11333 11334  }
11334 11335  
11335 11336  /*
11336 11337   * When there exists both a 64- and 32-bit counter of a particular type
11337 11338   * (i.e., InReceives), only the 64-bit counters are added.
11338 11339   */
11339 11340  void
11340 11341  ip_mib2_add_ip_stats(mib2_ipIfStatsEntry_t *o1, mib2_ipIfStatsEntry_t *o2)
11341 11342  {
11342 11343          UPDATE_MIB(o1, ipIfStatsInHdrErrors, o2->ipIfStatsInHdrErrors);
11343 11344          UPDATE_MIB(o1, ipIfStatsInTooBigErrors, o2->ipIfStatsInTooBigErrors);
11344 11345          UPDATE_MIB(o1, ipIfStatsInNoRoutes, o2->ipIfStatsInNoRoutes);
11345 11346          UPDATE_MIB(o1, ipIfStatsInAddrErrors, o2->ipIfStatsInAddrErrors);
11346 11347          UPDATE_MIB(o1, ipIfStatsInUnknownProtos, o2->ipIfStatsInUnknownProtos);
11347 11348          UPDATE_MIB(o1, ipIfStatsInTruncatedPkts, o2->ipIfStatsInTruncatedPkts);
11348 11349          UPDATE_MIB(o1, ipIfStatsInDiscards, o2->ipIfStatsInDiscards);
11349 11350          UPDATE_MIB(o1, ipIfStatsOutDiscards, o2->ipIfStatsOutDiscards);
11350 11351          UPDATE_MIB(o1, ipIfStatsOutFragOKs, o2->ipIfStatsOutFragOKs);
11351 11352          UPDATE_MIB(o1, ipIfStatsOutFragFails, o2->ipIfStatsOutFragFails);
11352 11353          UPDATE_MIB(o1, ipIfStatsOutFragCreates, o2->ipIfStatsOutFragCreates);
11353 11354          UPDATE_MIB(o1, ipIfStatsReasmReqds, o2->ipIfStatsReasmReqds);
11354 11355          UPDATE_MIB(o1, ipIfStatsReasmOKs, o2->ipIfStatsReasmOKs);
11355 11356          UPDATE_MIB(o1, ipIfStatsReasmFails, o2->ipIfStatsReasmFails);
11356 11357          UPDATE_MIB(o1, ipIfStatsOutNoRoutes, o2->ipIfStatsOutNoRoutes);
11357 11358          UPDATE_MIB(o1, ipIfStatsReasmDuplicates, o2->ipIfStatsReasmDuplicates);
11358 11359          UPDATE_MIB(o1, ipIfStatsReasmPartDups, o2->ipIfStatsReasmPartDups);
11359 11360          UPDATE_MIB(o1, ipIfStatsForwProhibits, o2->ipIfStatsForwProhibits);
11360 11361          UPDATE_MIB(o1, udpInCksumErrs, o2->udpInCksumErrs);
11361 11362          UPDATE_MIB(o1, udpInOverflows, o2->udpInOverflows);
11362 11363          UPDATE_MIB(o1, rawipInOverflows, o2->rawipInOverflows);
11363 11364          UPDATE_MIB(o1, ipIfStatsInWrongIPVersion,
11364 11365              o2->ipIfStatsInWrongIPVersion);
11365 11366          UPDATE_MIB(o1, ipIfStatsOutWrongIPVersion,
11366 11367              o2->ipIfStatsInWrongIPVersion);
11367 11368          UPDATE_MIB(o1, ipIfStatsOutSwitchIPVersion,
11368 11369              o2->ipIfStatsOutSwitchIPVersion);
11369 11370          UPDATE_MIB(o1, ipIfStatsHCInReceives, o2->ipIfStatsHCInReceives);
11370 11371          UPDATE_MIB(o1, ipIfStatsHCInOctets, o2->ipIfStatsHCInOctets);
11371 11372          UPDATE_MIB(o1, ipIfStatsHCInForwDatagrams,
11372 11373              o2->ipIfStatsHCInForwDatagrams);
11373 11374          UPDATE_MIB(o1, ipIfStatsHCInDelivers, o2->ipIfStatsHCInDelivers);
11374 11375          UPDATE_MIB(o1, ipIfStatsHCOutRequests, o2->ipIfStatsHCOutRequests);
11375 11376          UPDATE_MIB(o1, ipIfStatsHCOutForwDatagrams,
11376 11377              o2->ipIfStatsHCOutForwDatagrams);
11377 11378          UPDATE_MIB(o1, ipIfStatsOutFragReqds, o2->ipIfStatsOutFragReqds);
11378 11379          UPDATE_MIB(o1, ipIfStatsHCOutTransmits, o2->ipIfStatsHCOutTransmits);
11379 11380          UPDATE_MIB(o1, ipIfStatsHCOutOctets, o2->ipIfStatsHCOutOctets);
11380 11381          UPDATE_MIB(o1, ipIfStatsHCInMcastPkts, o2->ipIfStatsHCInMcastPkts);
11381 11382          UPDATE_MIB(o1, ipIfStatsHCInMcastOctets, o2->ipIfStatsHCInMcastOctets);
11382 11383          UPDATE_MIB(o1, ipIfStatsHCOutMcastPkts, o2->ipIfStatsHCOutMcastPkts);
11383 11384          UPDATE_MIB(o1, ipIfStatsHCOutMcastOctets,
11384 11385              o2->ipIfStatsHCOutMcastOctets);
11385 11386          UPDATE_MIB(o1, ipIfStatsHCInBcastPkts, o2->ipIfStatsHCInBcastPkts);
11386 11387          UPDATE_MIB(o1, ipIfStatsHCOutBcastPkts, o2->ipIfStatsHCOutBcastPkts);
11387 11388          UPDATE_MIB(o1, ipsecInSucceeded, o2->ipsecInSucceeded);
11388 11389          UPDATE_MIB(o1, ipsecInFailed, o2->ipsecInFailed);
11389 11390          UPDATE_MIB(o1, ipInCksumErrs, o2->ipInCksumErrs);
11390 11391          UPDATE_MIB(o1, tcpInErrs, o2->tcpInErrs);
11391 11392          UPDATE_MIB(o1, udpNoPorts, o2->udpNoPorts);
11392 11393  }
11393 11394  
11394 11395  void
11395 11396  ip_mib2_add_icmp6_stats(mib2_ipv6IfIcmpEntry_t *o1, mib2_ipv6IfIcmpEntry_t *o2)
11396 11397  {
11397 11398          UPDATE_MIB(o1, ipv6IfIcmpInMsgs, o2->ipv6IfIcmpInMsgs);
11398 11399          UPDATE_MIB(o1, ipv6IfIcmpInErrors, o2->ipv6IfIcmpInErrors);
11399 11400          UPDATE_MIB(o1, ipv6IfIcmpInDestUnreachs, o2->ipv6IfIcmpInDestUnreachs);
11400 11401          UPDATE_MIB(o1, ipv6IfIcmpInAdminProhibs, o2->ipv6IfIcmpInAdminProhibs);
11401 11402          UPDATE_MIB(o1, ipv6IfIcmpInTimeExcds, o2->ipv6IfIcmpInTimeExcds);
11402 11403          UPDATE_MIB(o1, ipv6IfIcmpInParmProblems, o2->ipv6IfIcmpInParmProblems);
11403 11404          UPDATE_MIB(o1, ipv6IfIcmpInPktTooBigs, o2->ipv6IfIcmpInPktTooBigs);
11404 11405          UPDATE_MIB(o1, ipv6IfIcmpInEchos, o2->ipv6IfIcmpInEchos);
11405 11406          UPDATE_MIB(o1, ipv6IfIcmpInEchoReplies, o2->ipv6IfIcmpInEchoReplies);
11406 11407          UPDATE_MIB(o1, ipv6IfIcmpInRouterSolicits,
11407 11408              o2->ipv6IfIcmpInRouterSolicits);
11408 11409          UPDATE_MIB(o1, ipv6IfIcmpInRouterAdvertisements,
11409 11410              o2->ipv6IfIcmpInRouterAdvertisements);
11410 11411          UPDATE_MIB(o1, ipv6IfIcmpInNeighborSolicits,
11411 11412              o2->ipv6IfIcmpInNeighborSolicits);
11412 11413          UPDATE_MIB(o1, ipv6IfIcmpInNeighborAdvertisements,
11413 11414              o2->ipv6IfIcmpInNeighborAdvertisements);
11414 11415          UPDATE_MIB(o1, ipv6IfIcmpInRedirects, o2->ipv6IfIcmpInRedirects);
11415 11416          UPDATE_MIB(o1, ipv6IfIcmpInGroupMembQueries,
11416 11417              o2->ipv6IfIcmpInGroupMembQueries);
11417 11418          UPDATE_MIB(o1, ipv6IfIcmpInGroupMembResponses,
11418 11419              o2->ipv6IfIcmpInGroupMembResponses);
11419 11420          UPDATE_MIB(o1, ipv6IfIcmpInGroupMembReductions,
11420 11421              o2->ipv6IfIcmpInGroupMembReductions);
11421 11422          UPDATE_MIB(o1, ipv6IfIcmpOutMsgs, o2->ipv6IfIcmpOutMsgs);
11422 11423          UPDATE_MIB(o1, ipv6IfIcmpOutErrors, o2->ipv6IfIcmpOutErrors);
11423 11424          UPDATE_MIB(o1, ipv6IfIcmpOutDestUnreachs,
11424 11425              o2->ipv6IfIcmpOutDestUnreachs);
11425 11426          UPDATE_MIB(o1, ipv6IfIcmpOutAdminProhibs,
11426 11427              o2->ipv6IfIcmpOutAdminProhibs);
11427 11428          UPDATE_MIB(o1, ipv6IfIcmpOutTimeExcds, o2->ipv6IfIcmpOutTimeExcds);
11428 11429          UPDATE_MIB(o1, ipv6IfIcmpOutParmProblems,
11429 11430              o2->ipv6IfIcmpOutParmProblems);
11430 11431          UPDATE_MIB(o1, ipv6IfIcmpOutPktTooBigs, o2->ipv6IfIcmpOutPktTooBigs);
11431 11432          UPDATE_MIB(o1, ipv6IfIcmpOutEchos, o2->ipv6IfIcmpOutEchos);
11432 11433          UPDATE_MIB(o1, ipv6IfIcmpOutEchoReplies, o2->ipv6IfIcmpOutEchoReplies);
11433 11434          UPDATE_MIB(o1, ipv6IfIcmpOutRouterSolicits,
11434 11435              o2->ipv6IfIcmpOutRouterSolicits);
11435 11436          UPDATE_MIB(o1, ipv6IfIcmpOutRouterAdvertisements,
11436 11437              o2->ipv6IfIcmpOutRouterAdvertisements);
11437 11438          UPDATE_MIB(o1, ipv6IfIcmpOutNeighborSolicits,
11438 11439              o2->ipv6IfIcmpOutNeighborSolicits);
11439 11440          UPDATE_MIB(o1, ipv6IfIcmpOutNeighborAdvertisements,
11440 11441              o2->ipv6IfIcmpOutNeighborAdvertisements);
11441 11442          UPDATE_MIB(o1, ipv6IfIcmpOutRedirects, o2->ipv6IfIcmpOutRedirects);
11442 11443          UPDATE_MIB(o1, ipv6IfIcmpOutGroupMembQueries,
11443 11444              o2->ipv6IfIcmpOutGroupMembQueries);
11444 11445          UPDATE_MIB(o1, ipv6IfIcmpOutGroupMembResponses,
11445 11446              o2->ipv6IfIcmpOutGroupMembResponses);
11446 11447          UPDATE_MIB(o1, ipv6IfIcmpOutGroupMembReductions,
11447 11448              o2->ipv6IfIcmpOutGroupMembReductions);
11448 11449          UPDATE_MIB(o1, ipv6IfIcmpInOverflows, o2->ipv6IfIcmpInOverflows);
11449 11450          UPDATE_MIB(o1, ipv6IfIcmpBadHoplimit, o2->ipv6IfIcmpBadHoplimit);
11450 11451          UPDATE_MIB(o1, ipv6IfIcmpInBadNeighborAdvertisements,
11451 11452              o2->ipv6IfIcmpInBadNeighborAdvertisements);
11452 11453          UPDATE_MIB(o1, ipv6IfIcmpInBadNeighborSolicitations,
11453 11454              o2->ipv6IfIcmpInBadNeighborSolicitations);
11454 11455          UPDATE_MIB(o1, ipv6IfIcmpInBadRedirects, o2->ipv6IfIcmpInBadRedirects);
11455 11456          UPDATE_MIB(o1, ipv6IfIcmpInGroupMembTotal,
11456 11457              o2->ipv6IfIcmpInGroupMembTotal);
11457 11458          UPDATE_MIB(o1, ipv6IfIcmpInGroupMembBadQueries,
11458 11459              o2->ipv6IfIcmpInGroupMembBadQueries);
11459 11460          UPDATE_MIB(o1, ipv6IfIcmpInGroupMembBadReports,
11460 11461              o2->ipv6IfIcmpInGroupMembBadReports);
11461 11462          UPDATE_MIB(o1, ipv6IfIcmpInGroupMembOurReports,
11462 11463              o2->ipv6IfIcmpInGroupMembOurReports);
11463 11464  }
11464 11465  
11465 11466  /*
11466 11467   * Called before the options are updated to check if this packet will
11467 11468   * be source routed from here.
11468 11469   * This routine assumes that the options are well formed i.e. that they
11469 11470   * have already been checked.
11470 11471   */
11471 11472  boolean_t
11472 11473  ip_source_routed(ipha_t *ipha, ip_stack_t *ipst)
11473 11474  {
11474 11475          ipoptp_t        opts;
11475 11476          uchar_t         *opt;
11476 11477          uint8_t         optval;
11477 11478          uint8_t         optlen;
11478 11479          ipaddr_t        dst;
11479 11480  
11480 11481          if (IS_SIMPLE_IPH(ipha)) {
11481 11482                  ip2dbg(("not source routed\n"));
11482 11483                  return (B_FALSE);
11483 11484          }
11484 11485          dst = ipha->ipha_dst;
11485 11486          for (optval = ipoptp_first(&opts, ipha);
11486 11487              optval != IPOPT_EOL;
11487 11488              optval = ipoptp_next(&opts)) {
11488 11489                  ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
11489 11490                  opt = opts.ipoptp_cur;
11490 11491                  optlen = opts.ipoptp_len;
11491 11492                  ip2dbg(("ip_source_routed: opt %d, len %d\n",
11492 11493                      optval, optlen));
11493 11494                  switch (optval) {
11494 11495                          uint32_t off;
11495 11496                  case IPOPT_SSRR:
11496 11497                  case IPOPT_LSRR:
11497 11498                          /*
11498 11499                           * If dst is one of our addresses and there are some
11499 11500                           * entries left in the source route return (true).
11500 11501                           */
11501 11502                          if (ip_type_v4(dst, ipst) != IRE_LOCAL) {
11502 11503                                  ip2dbg(("ip_source_routed: not next"
11503 11504                                      " source route 0x%x\n",
11504 11505                                      ntohl(dst)));
11505 11506                                  return (B_FALSE);
11506 11507                          }
11507 11508                          off = opt[IPOPT_OFFSET];
11508 11509                          off--;
11509 11510                          if (optlen < IP_ADDR_LEN ||
11510 11511                              off > optlen - IP_ADDR_LEN) {
11511 11512                                  /* End of source route */
11512 11513                                  ip1dbg(("ip_source_routed: end of SR\n"));
11513 11514                                  return (B_FALSE);
11514 11515                          }
11515 11516                          return (B_TRUE);
11516 11517                  }
11517 11518          }
11518 11519          ip2dbg(("not source routed\n"));
11519 11520          return (B_FALSE);
11520 11521  }
11521 11522  
11522 11523  /*
11523 11524   * ip_unbind is called by the transports to remove a conn from
11524 11525   * the fanout table.
11525 11526   */
11526 11527  void
11527 11528  ip_unbind(conn_t *connp)
11528 11529  {
11529 11530  
11530 11531          ASSERT(!MUTEX_HELD(&connp->conn_lock));
11531 11532  
11532 11533          if (is_system_labeled() && connp->conn_anon_port) {
11533 11534                  (void) tsol_mlp_anon(crgetzone(connp->conn_cred),
11534 11535                      connp->conn_mlp_type, connp->conn_proto,
11535 11536                      ntohs(connp->conn_lport), B_FALSE);
11536 11537                  connp->conn_anon_port = 0;
11537 11538          }
11538 11539          connp->conn_mlp_type = mlptSingle;
11539 11540  
11540 11541          ipcl_hash_remove(connp);
11541 11542  }
11542 11543  
11543 11544  /*
11544 11545   * Used for deciding the MSS size for the upper layer. Thus
11545 11546   * we need to check the outbound policy values in the conn.
11546 11547   */
11547 11548  int
11548 11549  conn_ipsec_length(conn_t *connp)
11549 11550  {
11550 11551          ipsec_latch_t *ipl;
11551 11552  
11552 11553          ipl = connp->conn_latch;
11553 11554          if (ipl == NULL)
11554 11555                  return (0);
11555 11556  
11556 11557          if (connp->conn_ixa->ixa_ipsec_policy == NULL)
11557 11558                  return (0);
11558 11559  
11559 11560          return (connp->conn_ixa->ixa_ipsec_policy->ipsp_act->ipa_ovhd);
11560 11561  }
11561 11562  
11562 11563  /*
11563 11564   * Returns an estimate of the IPsec headers size. This is used if
11564 11565   * we don't want to call into IPsec to get the exact size.
11565 11566   */
11566 11567  int
11567 11568  ipsec_out_extra_length(ip_xmit_attr_t *ixa)
11568 11569  {
11569 11570          ipsec_action_t *a;
11570 11571  
11571 11572          if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE))
11572 11573                  return (0);
11573 11574  
11574 11575          a = ixa->ixa_ipsec_action;
11575 11576          if (a == NULL) {
11576 11577                  ASSERT(ixa->ixa_ipsec_policy != NULL);
11577 11578                  a = ixa->ixa_ipsec_policy->ipsp_act;
11578 11579          }
11579 11580          ASSERT(a != NULL);
11580 11581  
11581 11582          return (a->ipa_ovhd);
11582 11583  }
11583 11584  
11584 11585  /*
11585 11586   * If there are any source route options, return the true final
11586 11587   * destination. Otherwise, return the destination.
11587 11588   */
11588 11589  ipaddr_t
11589 11590  ip_get_dst(ipha_t *ipha)
11590 11591  {
11591 11592          ipoptp_t        opts;
11592 11593          uchar_t         *opt;
11593 11594          uint8_t         optval;
11594 11595          uint8_t         optlen;
11595 11596          ipaddr_t        dst;
11596 11597          uint32_t off;
11597 11598  
11598 11599          dst = ipha->ipha_dst;
11599 11600  
11600 11601          if (IS_SIMPLE_IPH(ipha))
11601 11602                  return (dst);
11602 11603  
11603 11604          for (optval = ipoptp_first(&opts, ipha);
11604 11605              optval != IPOPT_EOL;
11605 11606              optval = ipoptp_next(&opts)) {
11606 11607                  opt = opts.ipoptp_cur;
11607 11608                  optlen = opts.ipoptp_len;
11608 11609                  ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
11609 11610                  switch (optval) {
11610 11611                  case IPOPT_SSRR:
11611 11612                  case IPOPT_LSRR:
11612 11613                          off = opt[IPOPT_OFFSET];
11613 11614                          /*
11614 11615                           * If one of the conditions is true, it means
11615 11616                           * end of options and dst already has the right
11616 11617                           * value.
11617 11618                           */
11618 11619                          if (!(optlen < IP_ADDR_LEN || off > optlen - 3)) {
11619 11620                                  off = optlen - IP_ADDR_LEN;
11620 11621                                  bcopy(&opt[off], &dst, IP_ADDR_LEN);
11621 11622                          }
11622 11623                          return (dst);
11623 11624                  default:
11624 11625                          break;
11625 11626                  }
11626 11627          }
11627 11628  
11628 11629          return (dst);
11629 11630  }
11630 11631  
11631 11632  /*
11632 11633   * Outbound IP fragmentation routine.
11633 11634   * Assumes the caller has checked whether or not fragmentation should
11634 11635   * be allowed. Here we copy the DF bit from the header to all the generated
11635 11636   * fragments.
11636 11637   */
11637 11638  int
11638 11639  ip_fragment_v4(mblk_t *mp_orig, nce_t *nce, iaflags_t ixaflags,
11639 11640      uint_t pkt_len, uint32_t max_frag, uint32_t xmit_hint, zoneid_t szone,
11640 11641      zoneid_t nolzid, pfirepostfrag_t postfragfn, uintptr_t *ixa_cookie)
11641 11642  {
11642 11643          int             i1;
11643 11644          int             hdr_len;
11644 11645          mblk_t          *hdr_mp;
11645 11646          ipha_t          *ipha;
11646 11647          int             ip_data_end;
11647 11648          int             len;
11648 11649          mblk_t          *mp = mp_orig;
11649 11650          int             offset;
11650 11651          ill_t           *ill = nce->nce_ill;
11651 11652          ip_stack_t      *ipst = ill->ill_ipst;
11652 11653          mblk_t          *carve_mp;
11653 11654          uint32_t        frag_flag;
11654 11655          uint_t          priority = mp->b_band;
11655 11656          int             error = 0;
11656 11657  
11657 11658          BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragReqds);
11658 11659  
11659 11660          if (pkt_len != msgdsize(mp)) {
11660 11661                  ip0dbg(("Packet length mismatch: %d, %ld\n",
11661 11662                      pkt_len, msgdsize(mp)));
11662 11663                  freemsg(mp);
11663 11664                  return (EINVAL);
11664 11665          }
11665 11666  
11666 11667          if (max_frag == 0) {
11667 11668                  ip1dbg(("ip_fragment_v4: max_frag is zero. Dropping packet\n"));
11668 11669                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
11669 11670                  ip_drop_output("FragFails: zero max_frag", mp, ill);
11670 11671                  freemsg(mp);
11671 11672                  return (EINVAL);
11672 11673          }
11673 11674  
11674 11675          ASSERT(MBLKL(mp) >= sizeof (ipha_t));
11675 11676          ipha = (ipha_t *)mp->b_rptr;
11676 11677          ASSERT(ntohs(ipha->ipha_length) == pkt_len);
11677 11678          frag_flag = ntohs(ipha->ipha_fragment_offset_and_flags) & IPH_DF;
11678 11679  
11679 11680          /*
11680 11681           * Establish the starting offset.  May not be zero if we are fragging
11681 11682           * a fragment that is being forwarded.
11682 11683           */
11683 11684          offset = ntohs(ipha->ipha_fragment_offset_and_flags) & IPH_OFFSET;
11684 11685  
11685 11686          /* TODO why is this test needed? */
11686 11687          if (((max_frag - ntohs(ipha->ipha_length)) & ~7) < 8) {
11687 11688                  /* TODO: notify ulp somehow */
11688 11689                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
11689 11690                  ip_drop_output("FragFails: bad starting offset", mp, ill);
11690 11691                  freemsg(mp);
11691 11692                  return (EINVAL);
11692 11693          }
11693 11694  
11694 11695          hdr_len = IPH_HDR_LENGTH(ipha);
11695 11696          ipha->ipha_hdr_checksum = 0;
11696 11697  
11697 11698          /*
11698 11699           * Establish the number of bytes maximum per frag, after putting
11699 11700           * in the header.
11700 11701           */
11701 11702          len = (max_frag - hdr_len) & ~7;
11702 11703  
11703 11704          /* Get a copy of the header for the trailing frags */
11704 11705          hdr_mp = ip_fragment_copyhdr((uchar_t *)ipha, hdr_len, offset, ipst,
11705 11706              mp);
11706 11707          if (hdr_mp == NULL) {
11707 11708                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
11708 11709                  ip_drop_output("FragFails: no hdr_mp", mp, ill);
11709 11710                  freemsg(mp);
11710 11711                  return (ENOBUFS);
11711 11712          }
11712 11713  
11713 11714          /* Store the starting offset, with the MoreFrags flag. */
11714 11715          i1 = offset | IPH_MF | frag_flag;
11715 11716          ipha->ipha_fragment_offset_and_flags = htons((uint16_t)i1);
11716 11717  
11717 11718          /* Establish the ending byte offset, based on the starting offset. */
11718 11719          offset <<= 3;
11719 11720          ip_data_end = offset + ntohs(ipha->ipha_length) - hdr_len;
11720 11721  
11721 11722          /* Store the length of the first fragment in the IP header. */
11722 11723          i1 = len + hdr_len;
11723 11724          ASSERT(i1 <= IP_MAXPACKET);
11724 11725          ipha->ipha_length = htons((uint16_t)i1);
11725 11726  
11726 11727          /*
11727 11728           * Compute the IP header checksum for the first frag.  We have to
11728 11729           * watch out that we stop at the end of the header.
11729 11730           */
11730 11731          ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
11731 11732  
11732 11733          /*
11733 11734           * Now carve off the first frag.  Note that this will include the
11734 11735           * original IP header.
11735 11736           */
11736 11737          if (!(mp = ip_carve_mp(&mp_orig, i1))) {
11737 11738                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
11738 11739                  ip_drop_output("FragFails: could not carve mp", mp_orig, ill);
11739 11740                  freeb(hdr_mp);
11740 11741                  freemsg(mp_orig);
11741 11742                  return (ENOBUFS);
11742 11743          }
11743 11744  
11744 11745          BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragCreates);
11745 11746  
11746 11747          error = postfragfn(mp, nce, ixaflags, i1, xmit_hint, szone, nolzid,
11747 11748              ixa_cookie);
11748 11749          if (error != 0 && error != EWOULDBLOCK) {
11749 11750                  /* No point in sending the other fragments */
11750 11751                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
11751 11752                  ip_drop_output("FragFails: postfragfn failed", mp_orig, ill);
11752 11753                  freeb(hdr_mp);
11753 11754                  freemsg(mp_orig);
11754 11755                  return (error);
11755 11756          }
11756 11757  
11757 11758          /* No need to redo state machine in loop */
11758 11759          ixaflags &= ~IXAF_REACH_CONF;
11759 11760  
11760 11761          /* Advance the offset to the second frag starting point. */
11761 11762          offset += len;
11762 11763          /*
11763 11764           * Update hdr_len from the copied header - there might be less options
11764 11765           * in the later fragments.
11765 11766           */
11766 11767          hdr_len = IPH_HDR_LENGTH(hdr_mp->b_rptr);
11767 11768          /* Loop until done. */
11768 11769          for (;;) {
11769 11770                  uint16_t        offset_and_flags;
11770 11771                  uint16_t        ip_len;
11771 11772  
11772 11773                  if (ip_data_end - offset > len) {
11773 11774                          /*
11774 11775                           * Carve off the appropriate amount from the original
11775 11776                           * datagram.
11776 11777                           */
11777 11778                          if (!(carve_mp = ip_carve_mp(&mp_orig, len))) {
11778 11779                                  mp = NULL;
11779 11780                                  break;
11780 11781                          }
11781 11782                          /*
11782 11783                           * More frags after this one.  Get another copy
11783 11784                           * of the header.
11784 11785                           */
11785 11786                          if (carve_mp->b_datap->db_ref == 1 &&
11786 11787                              hdr_mp->b_wptr - hdr_mp->b_rptr <
11787 11788                              carve_mp->b_rptr - carve_mp->b_datap->db_base) {
11788 11789                                  /* Inline IP header */
11789 11790                                  carve_mp->b_rptr -= hdr_mp->b_wptr -
11790 11791                                      hdr_mp->b_rptr;
11791 11792                                  bcopy(hdr_mp->b_rptr, carve_mp->b_rptr,
11792 11793                                      hdr_mp->b_wptr - hdr_mp->b_rptr);
11793 11794                                  mp = carve_mp;
11794 11795                          } else {
11795 11796                                  if (!(mp = copyb(hdr_mp))) {
11796 11797                                          freemsg(carve_mp);
11797 11798                                          break;
11798 11799                                  }
11799 11800                                  /* Get priority marking, if any. */
11800 11801                                  mp->b_band = priority;
11801 11802                                  mp->b_cont = carve_mp;
11802 11803                          }
11803 11804                          ipha = (ipha_t *)mp->b_rptr;
11804 11805                          offset_and_flags = IPH_MF;
11805 11806                  } else {
11806 11807                          /*
11807 11808                           * Last frag.  Consume the header. Set len to
11808 11809                           * the length of this last piece.
11809 11810                           */
11810 11811                          len = ip_data_end - offset;
11811 11812  
11812 11813                          /*
11813 11814                           * Carve off the appropriate amount from the original
11814 11815                           * datagram.
11815 11816                           */
11816 11817                          if (!(carve_mp = ip_carve_mp(&mp_orig, len))) {
11817 11818                                  mp = NULL;
11818 11819                                  break;
11819 11820                          }
11820 11821                          if (carve_mp->b_datap->db_ref == 1 &&
11821 11822                              hdr_mp->b_wptr - hdr_mp->b_rptr <
11822 11823                              carve_mp->b_rptr - carve_mp->b_datap->db_base) {
11823 11824                                  /* Inline IP header */
11824 11825                                  carve_mp->b_rptr -= hdr_mp->b_wptr -
11825 11826                                      hdr_mp->b_rptr;
11826 11827                                  bcopy(hdr_mp->b_rptr, carve_mp->b_rptr,
11827 11828                                      hdr_mp->b_wptr - hdr_mp->b_rptr);
11828 11829                                  mp = carve_mp;
11829 11830                                  freeb(hdr_mp);
11830 11831                                  hdr_mp = mp;
11831 11832                          } else {
11832 11833                                  mp = hdr_mp;
11833 11834                                  /* Get priority marking, if any. */
11834 11835                                  mp->b_band = priority;
11835 11836                                  mp->b_cont = carve_mp;
11836 11837                          }
11837 11838                          ipha = (ipha_t *)mp->b_rptr;
11838 11839                          /* A frag of a frag might have IPH_MF non-zero */
11839 11840                          offset_and_flags =
11840 11841                              ntohs(ipha->ipha_fragment_offset_and_flags) &
11841 11842                              IPH_MF;
11842 11843                  }
11843 11844                  offset_and_flags |= (uint16_t)(offset >> 3);
11844 11845                  offset_and_flags |= (uint16_t)frag_flag;
11845 11846                  /* Store the offset and flags in the IP header. */
11846 11847                  ipha->ipha_fragment_offset_and_flags = htons(offset_and_flags);
11847 11848  
11848 11849                  /* Store the length in the IP header. */
11849 11850                  ip_len = (uint16_t)(len + hdr_len);
11850 11851                  ipha->ipha_length = htons(ip_len);
11851 11852  
11852 11853                  /*
11853 11854                   * Set the IP header checksum.  Note that mp is just
11854 11855                   * the header, so this is easy to pass to ip_csum.
11855 11856                   */
11856 11857                  ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
11857 11858  
11858 11859                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragCreates);
11859 11860  
11860 11861                  error = postfragfn(mp, nce, ixaflags, ip_len, xmit_hint, szone,
11861 11862                      nolzid, ixa_cookie);
11862 11863                  /* All done if we just consumed the hdr_mp. */
11863 11864                  if (mp == hdr_mp) {
11864 11865                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragOKs);
11865 11866                          return (error);
11866 11867                  }
11867 11868                  if (error != 0 && error != EWOULDBLOCK) {
11868 11869                          DTRACE_PROBE2(ip__xmit__frag__fail, ill_t *, ill,
11869 11870                              mblk_t *, hdr_mp);
11870 11871                          /* No point in sending the other fragments */
11871 11872                          break;
11872 11873                  }
11873 11874  
11874 11875                  /* Otherwise, advance and loop. */
11875 11876                  offset += len;
11876 11877          }
11877 11878          /* Clean up following allocation failure. */
11878 11879          BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
11879 11880          ip_drop_output("FragFails: loop ended", NULL, ill);
11880 11881          if (mp != hdr_mp)
11881 11882                  freeb(hdr_mp);
11882 11883          if (mp != mp_orig)
11883 11884                  freemsg(mp_orig);
11884 11885          return (error);
11885 11886  }
11886 11887  
11887 11888  /*
11888 11889   * Copy the header plus those options which have the copy bit set
11889 11890   */
11890 11891  static mblk_t *
11891 11892  ip_fragment_copyhdr(uchar_t *rptr, int hdr_len, int offset, ip_stack_t *ipst,
11892 11893      mblk_t *src)
11893 11894  {
11894 11895          mblk_t  *mp;
11895 11896          uchar_t *up;
11896 11897  
11897 11898          /*
11898 11899           * Quick check if we need to look for options without the copy bit
11899 11900           * set
11900 11901           */
11901 11902          mp = allocb_tmpl(ipst->ips_ip_wroff_extra + hdr_len, src);
11902 11903          if (!mp)
11903 11904                  return (mp);
11904 11905          mp->b_rptr += ipst->ips_ip_wroff_extra;
11905 11906          if (hdr_len == IP_SIMPLE_HDR_LENGTH || offset != 0) {
11906 11907                  bcopy(rptr, mp->b_rptr, hdr_len);
11907 11908                  mp->b_wptr += hdr_len + ipst->ips_ip_wroff_extra;
11908 11909                  return (mp);
11909 11910          }
11910 11911          up  = mp->b_rptr;
11911 11912          bcopy(rptr, up, IP_SIMPLE_HDR_LENGTH);
11912 11913          up += IP_SIMPLE_HDR_LENGTH;
11913 11914          rptr += IP_SIMPLE_HDR_LENGTH;
11914 11915          hdr_len -= IP_SIMPLE_HDR_LENGTH;
11915 11916          while (hdr_len > 0) {
11916 11917                  uint32_t optval;
11917 11918                  uint32_t optlen;
11918 11919  
11919 11920                  optval = *rptr;
11920 11921                  if (optval == IPOPT_EOL)
11921 11922                          break;
11922 11923                  if (optval == IPOPT_NOP)
11923 11924                          optlen = 1;
11924 11925                  else
11925 11926                          optlen = rptr[1];
11926 11927                  if (optval & IPOPT_COPY) {
11927 11928                          bcopy(rptr, up, optlen);
11928 11929                          up += optlen;
11929 11930                  }
11930 11931                  rptr += optlen;
11931 11932                  hdr_len -= optlen;
11932 11933          }
11933 11934          /*
11934 11935           * Make sure that we drop an even number of words by filling
11935 11936           * with EOL to the next word boundary.
11936 11937           */
11937 11938          for (hdr_len = up - (mp->b_rptr + IP_SIMPLE_HDR_LENGTH);
11938 11939              hdr_len & 0x3; hdr_len++)
11939 11940                  *up++ = IPOPT_EOL;
11940 11941          mp->b_wptr = up;
11941 11942          /* Update header length */
11942 11943          mp->b_rptr[0] = (uint8_t)((IP_VERSION << 4) | ((up - mp->b_rptr) >> 2));
11943 11944          return (mp);
11944 11945  }
11945 11946  
11946 11947  /*
11947 11948   * Update any source route, record route, or timestamp options when
11948 11949   * sending a packet back to ourselves.
11949 11950   * Check that we are at end of strict source route.
11950 11951   * The options have been sanity checked by ip_output_options().
11951 11952   */
11952 11953  void
11953 11954  ip_output_local_options(ipha_t *ipha, ip_stack_t *ipst)
11954 11955  {
11955 11956          ipoptp_t        opts;
11956 11957          uchar_t         *opt;
11957 11958          uint8_t         optval;
11958 11959          uint8_t         optlen;
11959 11960          ipaddr_t        dst;
11960 11961          uint32_t        ts;
11961 11962          timestruc_t     now;
11962 11963          uint32_t        off = 0;
11963 11964  
11964 11965          for (optval = ipoptp_first(&opts, ipha);
11965 11966              optval != IPOPT_EOL;
11966 11967              optval = ipoptp_next(&opts)) {
11967 11968                  opt = opts.ipoptp_cur;
11968 11969                  optlen = opts.ipoptp_len;
11969 11970                  ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
11970 11971                  switch (optval) {
11971 11972                  case IPOPT_SSRR:
11972 11973                  case IPOPT_LSRR:
11973 11974                          off = opt[IPOPT_OFFSET];
11974 11975                          off--;
11975 11976                          if (optlen < IP_ADDR_LEN ||
11976 11977                              off > optlen - IP_ADDR_LEN) {
11977 11978                                  /* End of source route */
11978 11979                                  break;
11979 11980                          }
11980 11981                          /*
11981 11982                           * This will only happen if two consecutive entries
11982 11983                           * in the source route contains our address or if
11983 11984                           * it is a packet with a loose source route which
11984 11985                           * reaches us before consuming the whole source route
11985 11986                           */
11986 11987  
11987 11988                          if (optval == IPOPT_SSRR) {
11988 11989                                  return;
11989 11990                          }
11990 11991                          /*
11991 11992                           * Hack: instead of dropping the packet truncate the
11992 11993                           * source route to what has been used by filling the
11993 11994                           * rest with IPOPT_NOP.
11994 11995                           */
11995 11996                          opt[IPOPT_OLEN] = (uint8_t)off;
11996 11997                          while (off < optlen) {
11997 11998                                  opt[off++] = IPOPT_NOP;
11998 11999                          }
11999 12000                          break;
12000 12001                  case IPOPT_RR:
12001 12002                          off = opt[IPOPT_OFFSET];
12002 12003                          off--;
12003 12004                          if (optlen < IP_ADDR_LEN ||
12004 12005                              off > optlen - IP_ADDR_LEN) {
12005 12006                                  /* No more room - ignore */
12006 12007                                  ip1dbg((
12007 12008                                      "ip_output_local_options: end of RR\n"));
12008 12009                                  break;
12009 12010                          }
12010 12011                          dst = htonl(INADDR_LOOPBACK);
12011 12012                          bcopy(&dst, (char *)opt + off, IP_ADDR_LEN);
12012 12013                          opt[IPOPT_OFFSET] += IP_ADDR_LEN;
12013 12014                          break;
12014 12015                  case IPOPT_TS:
12015 12016                          /* Insert timestamp if there is romm */
12016 12017                          switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
12017 12018                          case IPOPT_TS_TSONLY:
12018 12019                                  off = IPOPT_TS_TIMELEN;
12019 12020                                  break;
12020 12021                          case IPOPT_TS_PRESPEC:
12021 12022                          case IPOPT_TS_PRESPEC_RFC791:
12022 12023                                  /* Verify that the address matched */
12023 12024                                  off = opt[IPOPT_OFFSET] - 1;
12024 12025                                  bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
12025 12026                                  if (ip_type_v4(dst, ipst) != IRE_LOCAL) {
12026 12027                                          /* Not for us */
12027 12028                                          break;
12028 12029                                  }
12029 12030                                  /* FALLTHROUGH */
12030 12031                          case IPOPT_TS_TSANDADDR:
12031 12032                                  off = IP_ADDR_LEN + IPOPT_TS_TIMELEN;
12032 12033                                  break;
12033 12034                          default:
12034 12035                                  /*
12035 12036                                   * ip_*put_options should have already
12036 12037                                   * dropped this packet.
12037 12038                                   */
12038 12039                                  cmn_err(CE_PANIC, "ip_output_local_options: "
12039 12040                                      "unknown IT - bug in ip_output_options?\n");
12040 12041                          }
12041 12042                          if (opt[IPOPT_OFFSET] - 1 + off > optlen) {
12042 12043                                  /* Increase overflow counter */
12043 12044                                  off = (opt[IPOPT_POS_OV_FLG] >> 4) + 1;
12044 12045                                  opt[IPOPT_POS_OV_FLG] = (uint8_t)
12045 12046                                      (opt[IPOPT_POS_OV_FLG] & 0x0F) |
12046 12047                                      (off << 4);
12047 12048                                  break;
12048 12049                          }
12049 12050                          off = opt[IPOPT_OFFSET] - 1;
12050 12051                          switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
12051 12052                          case IPOPT_TS_PRESPEC:
12052 12053                          case IPOPT_TS_PRESPEC_RFC791:
12053 12054                          case IPOPT_TS_TSANDADDR:
12054 12055                                  dst = htonl(INADDR_LOOPBACK);
12055 12056                                  bcopy(&dst, (char *)opt + off, IP_ADDR_LEN);
12056 12057                                  opt[IPOPT_OFFSET] += IP_ADDR_LEN;
12057 12058                                  /* FALLTHROUGH */
12058 12059                          case IPOPT_TS_TSONLY:
12059 12060                                  off = opt[IPOPT_OFFSET] - 1;
12060 12061                                  /* Compute # of milliseconds since midnight */
12061 12062                                  gethrestime(&now);
12062 12063                                  ts = (now.tv_sec % (24 * 60 * 60)) * 1000 +
12063 12064                                      NSEC2MSEC(now.tv_nsec);
12064 12065                                  bcopy(&ts, (char *)opt + off, IPOPT_TS_TIMELEN);
12065 12066                                  opt[IPOPT_OFFSET] += IPOPT_TS_TIMELEN;
12066 12067                                  break;
12067 12068                          }
12068 12069                          break;
12069 12070                  }
12070 12071          }
12071 12072  }
12072 12073  
12073 12074  /*
12074 12075   * Prepend an M_DATA fastpath header, and if none present prepend a
12075 12076   * DL_UNITDATA_REQ. Frees the mblk on failure.
12076 12077   *
12077 12078   * nce_dlur_mp and nce_fp_mp can not disappear once they have been set.
12078 12079   * If there is a change to them, the nce will be deleted (condemned) and
12079 12080   * a new nce_t will be created when packets are sent. Thus we need no locks
12080 12081   * to access those fields.
12081 12082   *
12082 12083   * We preserve b_band to support IPQoS. If a DL_UNITDATA_REQ is prepended
12083 12084   * we place b_band in dl_priority.dl_max.
12084 12085   */
12085 12086  static mblk_t *
12086 12087  ip_xmit_attach_llhdr(mblk_t *mp, nce_t *nce)
12087 12088  {
12088 12089          uint_t  hlen;
12089 12090          mblk_t *mp1;
12090 12091          uint_t  priority;
12091 12092          uchar_t *rptr;
12092 12093  
12093 12094          rptr = mp->b_rptr;
12094 12095  
12095 12096          ASSERT(DB_TYPE(mp) == M_DATA);
12096 12097          priority = mp->b_band;
12097 12098  
12098 12099          ASSERT(nce != NULL);
12099 12100          if ((mp1 = nce->nce_fp_mp) != NULL) {
12100 12101                  hlen = MBLKL(mp1);
12101 12102                  /*
12102 12103                   * Check if we have enough room to prepend fastpath
12103 12104                   * header
12104 12105                   */
12105 12106                  if (hlen != 0 && (rptr - mp->b_datap->db_base) >= hlen) {
12106 12107                          rptr -= hlen;
12107 12108                          bcopy(mp1->b_rptr, rptr, hlen);
12108 12109                          /*
12109 12110                           * Set the b_rptr to the start of the link layer
12110 12111                           * header
12111 12112                           */
12112 12113                          mp->b_rptr = rptr;
12113 12114                          return (mp);
12114 12115                  }
12115 12116                  mp1 = copyb(mp1);
12116 12117                  if (mp1 == NULL) {
12117 12118                          ill_t *ill = nce->nce_ill;
12118 12119  
12119 12120                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
12120 12121                          ip_drop_output("ipIfStatsOutDiscards", mp, ill);
12121 12122                          freemsg(mp);
12122 12123                          return (NULL);
12123 12124                  }
12124 12125                  mp1->b_band = priority;
12125 12126                  mp1->b_cont = mp;
12126 12127                  DB_CKSUMSTART(mp1) = DB_CKSUMSTART(mp);
12127 12128                  DB_CKSUMSTUFF(mp1) = DB_CKSUMSTUFF(mp);
12128 12129                  DB_CKSUMEND(mp1) = DB_CKSUMEND(mp);
12129 12130                  DB_CKSUMFLAGS(mp1) = DB_CKSUMFLAGS(mp);
12130 12131                  DB_LSOMSS(mp1) = DB_LSOMSS(mp);
12131 12132                  DTRACE_PROBE1(ip__xmit__copyb, (mblk_t *), mp1);
12132 12133                  /*
12133 12134                   * XXX disable ICK_VALID and compute checksum
12134 12135                   * here; can happen if nce_fp_mp changes and
12135 12136                   * it can't be copied now due to insufficient
12136 12137                   * space. (unlikely, fp mp can change, but it
12137 12138                   * does not increase in length)
12138 12139                   */
12139 12140                  return (mp1);
12140 12141          }
12141 12142          mp1 = copyb(nce->nce_dlur_mp);
12142 12143  
12143 12144          if (mp1 == NULL) {
12144 12145                  ill_t *ill = nce->nce_ill;
12145 12146  
12146 12147                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
12147 12148                  ip_drop_output("ipIfStatsOutDiscards", mp, ill);
12148 12149                  freemsg(mp);
12149 12150                  return (NULL);
12150 12151          }
12151 12152          mp1->b_cont = mp;
12152 12153          if (priority != 0) {
12153 12154                  mp1->b_band = priority;
12154 12155                  ((dl_unitdata_req_t *)(mp1->b_rptr))->dl_priority.dl_max =
12155 12156                      priority;
12156 12157          }
12157 12158          return (mp1);
12158 12159  }
12159 12160  
12160 12161  /*
12161 12162   * Finish the outbound IPsec processing. This function is called from
12162 12163   * ipsec_out_process() if the IPsec packet was processed
12163 12164   * synchronously, or from {ah,esp}_kcf_callback_outbound() if it was processed
12164 12165   * asynchronously.
12165 12166   *
12166 12167   * This is common to IPv4 and IPv6.
12167 12168   */
12168 12169  int
12169 12170  ip_output_post_ipsec(mblk_t *mp, ip_xmit_attr_t *ixa)
12170 12171  {
12171 12172          iaflags_t       ixaflags = ixa->ixa_flags;
12172 12173          uint_t          pktlen;
12173 12174  
12174 12175  
12175 12176          /* AH/ESP don't update ixa_pktlen when they modify the packet */
12176 12177          if (ixaflags & IXAF_IS_IPV4) {
12177 12178                  ipha_t          *ipha = (ipha_t *)mp->b_rptr;
12178 12179  
12179 12180                  ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
12180 12181                  pktlen = ntohs(ipha->ipha_length);
12181 12182          } else {
12182 12183                  ip6_t           *ip6h = (ip6_t *)mp->b_rptr;
12183 12184  
12184 12185                  ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV6_VERSION);
12185 12186                  pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
12186 12187          }
12187 12188  
12188 12189          /*
12189 12190           * We release any hard reference on the SAs here to make
12190 12191           * sure the SAs can be garbage collected. ipsr_sa has a soft reference
12191 12192           * on the SAs.
12192 12193           * If in the future we want the hard latching of the SAs in the
12193 12194           * ip_xmit_attr_t then we should remove this.
12194 12195           */
12195 12196          if (ixa->ixa_ipsec_esp_sa != NULL) {
12196 12197                  IPSA_REFRELE(ixa->ixa_ipsec_esp_sa);
12197 12198                  ixa->ixa_ipsec_esp_sa = NULL;
12198 12199          }
12199 12200          if (ixa->ixa_ipsec_ah_sa != NULL) {
12200 12201                  IPSA_REFRELE(ixa->ixa_ipsec_ah_sa);
12201 12202                  ixa->ixa_ipsec_ah_sa = NULL;
12202 12203          }
12203 12204  
12204 12205          /* Do we need to fragment? */
12205 12206          if ((ixa->ixa_flags & IXAF_IPV6_ADD_FRAGHDR) ||
12206 12207              pktlen > ixa->ixa_fragsize) {
12207 12208                  if (ixaflags & IXAF_IS_IPV4) {
12208 12209                          ASSERT(!(ixa->ixa_flags & IXAF_IPV6_ADD_FRAGHDR));
12209 12210                          /*
12210 12211                           * We check for the DF case in ipsec_out_process
12211 12212                           * hence this only handles the non-DF case.
12212 12213                           */
12213 12214                          return (ip_fragment_v4(mp, ixa->ixa_nce, ixa->ixa_flags,
12214 12215                              pktlen, ixa->ixa_fragsize,
12215 12216                              ixa->ixa_xmit_hint, ixa->ixa_zoneid,
12216 12217                              ixa->ixa_no_loop_zoneid, ixa->ixa_postfragfn,
12217 12218                              &ixa->ixa_cookie));
12218 12219                  } else {
12219 12220                          mp = ip_fraghdr_add_v6(mp, ixa->ixa_ident, ixa);
12220 12221                          if (mp == NULL) {
12221 12222                                  /* MIB and ip_drop_output already done */
12222 12223                                  return (ENOMEM);
12223 12224                          }
12224 12225                          pktlen += sizeof (ip6_frag_t);
12225 12226                          if (pktlen > ixa->ixa_fragsize) {
12226 12227                                  return (ip_fragment_v6(mp, ixa->ixa_nce,
12227 12228                                      ixa->ixa_flags, pktlen,
12228 12229                                      ixa->ixa_fragsize, ixa->ixa_xmit_hint,
12229 12230                                      ixa->ixa_zoneid, ixa->ixa_no_loop_zoneid,
12230 12231                                      ixa->ixa_postfragfn, &ixa->ixa_cookie));
12231 12232                          }
12232 12233                  }
12233 12234          }
12234 12235          return ((ixa->ixa_postfragfn)(mp, ixa->ixa_nce, ixa->ixa_flags,
12235 12236              pktlen, ixa->ixa_xmit_hint, ixa->ixa_zoneid,
12236 12237              ixa->ixa_no_loop_zoneid, NULL));
12237 12238  }
12238 12239  
12239 12240  /*
12240 12241   * Finish the inbound IPsec processing. This function is called from
12241 12242   * ipsec_out_process() if the IPsec packet was processed
12242 12243   * synchronously, or from {ah,esp}_kcf_callback_outbound() if it was processed
12243 12244   * asynchronously.
12244 12245   *
12245 12246   * This is common to IPv4 and IPv6.
12246 12247   */
12247 12248  void
12248 12249  ip_input_post_ipsec(mblk_t *mp, ip_recv_attr_t *ira)
12249 12250  {
12250 12251          iaflags_t       iraflags = ira->ira_flags;
12251 12252  
12252 12253          /* Length might have changed */
12253 12254          if (iraflags & IRAF_IS_IPV4) {
12254 12255                  ipha_t          *ipha = (ipha_t *)mp->b_rptr;
12255 12256  
12256 12257                  ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
12257 12258                  ira->ira_pktlen = ntohs(ipha->ipha_length);
12258 12259                  ira->ira_ip_hdr_length = IPH_HDR_LENGTH(ipha);
12259 12260                  ira->ira_protocol = ipha->ipha_protocol;
12260 12261  
12261 12262                  ip_fanout_v4(mp, ipha, ira);
12262 12263          } else {
12263 12264                  ip6_t           *ip6h = (ip6_t *)mp->b_rptr;
12264 12265                  uint8_t         *nexthdrp;
12265 12266  
12266 12267                  ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV6_VERSION);
12267 12268                  ira->ira_pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
12268 12269                  if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &ira->ira_ip_hdr_length,
12269 12270                      &nexthdrp)) {
12270 12271                          /* Malformed packet */
12271 12272                          BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
12272 12273                          ip_drop_input("ipIfStatsInDiscards", mp, ira->ira_ill);
12273 12274                          freemsg(mp);
12274 12275                          return;
12275 12276                  }
12276 12277                  ira->ira_protocol = *nexthdrp;
12277 12278                  ip_fanout_v6(mp, ip6h, ira);
12278 12279          }
12279 12280  }
12280 12281  
12281 12282  /*
12282 12283   * Select which AH & ESP SA's to use (if any) for the outbound packet.
12283 12284   *
12284 12285   * If this function returns B_TRUE, the requested SA's have been filled
12285 12286   * into the ixa_ipsec_*_sa pointers.
12286 12287   *
12287 12288   * If the function returns B_FALSE, the packet has been "consumed", most
12288 12289   * likely by an ACQUIRE sent up via PF_KEY to a key management daemon.
12289 12290   *
12290 12291   * The SA references created by the protocol-specific "select"
12291 12292   * function will be released in ip_output_post_ipsec.
12292 12293   */
12293 12294  static boolean_t
12294 12295  ipsec_out_select_sa(mblk_t *mp, ip_xmit_attr_t *ixa)
12295 12296  {
12296 12297          boolean_t need_ah_acquire = B_FALSE, need_esp_acquire = B_FALSE;
12297 12298          ipsec_policy_t *pp;
12298 12299          ipsec_action_t *ap;
12299 12300  
12300 12301          ASSERT(ixa->ixa_flags & IXAF_IPSEC_SECURE);
12301 12302          ASSERT((ixa->ixa_ipsec_policy != NULL) ||
12302 12303              (ixa->ixa_ipsec_action != NULL));
12303 12304  
12304 12305          ap = ixa->ixa_ipsec_action;
12305 12306          if (ap == NULL) {
12306 12307                  pp = ixa->ixa_ipsec_policy;
12307 12308                  ASSERT(pp != NULL);
12308 12309                  ap = pp->ipsp_act;
12309 12310                  ASSERT(ap != NULL);
12310 12311          }
12311 12312  
12312 12313          /*
12313 12314           * We have an action.  now, let's select SA's.
12314 12315           * A side effect of setting ixa_ipsec_*_sa is that it will
12315 12316           * be cached in the conn_t.
12316 12317           */
12317 12318          if (ap->ipa_want_esp) {
12318 12319                  if (ixa->ixa_ipsec_esp_sa == NULL) {
12319 12320                          need_esp_acquire = !ipsec_outbound_sa(mp, ixa,
12320 12321                              IPPROTO_ESP);
12321 12322                  }
12322 12323                  ASSERT(need_esp_acquire || ixa->ixa_ipsec_esp_sa != NULL);
12323 12324          }
12324 12325  
12325 12326          if (ap->ipa_want_ah) {
12326 12327                  if (ixa->ixa_ipsec_ah_sa == NULL) {
12327 12328                          need_ah_acquire = !ipsec_outbound_sa(mp, ixa,
12328 12329                              IPPROTO_AH);
12329 12330                  }
12330 12331                  ASSERT(need_ah_acquire || ixa->ixa_ipsec_ah_sa != NULL);
12331 12332                  /*
12332 12333                   * The ESP and AH processing order needs to be preserved
12333 12334                   * when both protocols are required (ESP should be applied
12334 12335                   * before AH for an outbound packet). Force an ESP ACQUIRE
12335 12336                   * when both ESP and AH are required, and an AH ACQUIRE
12336 12337                   * is needed.
12337 12338                   */
12338 12339                  if (ap->ipa_want_esp && need_ah_acquire)
12339 12340                          need_esp_acquire = B_TRUE;
12340 12341          }
12341 12342  
12342 12343          /*
12343 12344           * Send an ACQUIRE (extended, regular, or both) if we need one.
12344 12345           * Release SAs that got referenced, but will not be used until we
12345 12346           * acquire _all_ of the SAs we need.
12346 12347           */
12347 12348          if (need_ah_acquire || need_esp_acquire) {
12348 12349                  if (ixa->ixa_ipsec_ah_sa != NULL) {
12349 12350                          IPSA_REFRELE(ixa->ixa_ipsec_ah_sa);
12350 12351                          ixa->ixa_ipsec_ah_sa = NULL;
12351 12352                  }
12352 12353                  if (ixa->ixa_ipsec_esp_sa != NULL) {
12353 12354                          IPSA_REFRELE(ixa->ixa_ipsec_esp_sa);
12354 12355                          ixa->ixa_ipsec_esp_sa = NULL;
12355 12356                  }
12356 12357  
12357 12358                  sadb_acquire(mp, ixa, need_ah_acquire, need_esp_acquire);
12358 12359                  return (B_FALSE);
12359 12360          }
12360 12361  
12361 12362          return (B_TRUE);
12362 12363  }
12363 12364  
12364 12365  /*
12365 12366   * Handle IPsec output processing.
12366 12367   * This function is only entered once for a given packet.
12367 12368   * We try to do things synchronously, but if we need to have user-level
12368 12369   * set up SAs, or ESP or AH uses asynchronous kEF, then the operation
12369 12370   * will be completed
12370 12371   *  - when the SAs are added in esp_add_sa_finish/ah_add_sa_finish
12371 12372   *  - when asynchronous ESP is done it will do AH
12372 12373   *
12373 12374   * In all cases we come back in ip_output_post_ipsec() to fragment and
12374 12375   * send out the packet.
12375 12376   */
12376 12377  int
12377 12378  ipsec_out_process(mblk_t *mp, ip_xmit_attr_t *ixa)
12378 12379  {
12379 12380          ill_t           *ill = ixa->ixa_nce->nce_ill;
12380 12381          ip_stack_t      *ipst = ixa->ixa_ipst;
12381 12382          ipsec_stack_t   *ipss;
12382 12383          ipsec_policy_t  *pp;
12383 12384          ipsec_action_t  *ap;
12384 12385  
12385 12386          ASSERT(ixa->ixa_flags & IXAF_IPSEC_SECURE);
12386 12387  
12387 12388          ASSERT((ixa->ixa_ipsec_policy != NULL) ||
12388 12389              (ixa->ixa_ipsec_action != NULL));
12389 12390  
12390 12391          ipss = ipst->ips_netstack->netstack_ipsec;
12391 12392          if (!ipsec_loaded(ipss)) {
12392 12393                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
12393 12394                  ip_drop_packet(mp, B_TRUE, ill,
12394 12395                      DROPPER(ipss, ipds_ip_ipsec_not_loaded),
12395 12396                      &ipss->ipsec_dropper);
12396 12397                  return (ENOTSUP);
12397 12398          }
12398 12399  
12399 12400          ap = ixa->ixa_ipsec_action;
12400 12401          if (ap == NULL) {
12401 12402                  pp = ixa->ixa_ipsec_policy;
12402 12403                  ASSERT(pp != NULL);
12403 12404                  ap = pp->ipsp_act;
12404 12405                  ASSERT(ap != NULL);
12405 12406          }
12406 12407  
12407 12408          /* Handle explicit drop action and bypass. */
12408 12409          switch (ap->ipa_act.ipa_type) {
12409 12410          case IPSEC_ACT_DISCARD:
12410 12411          case IPSEC_ACT_REJECT:
12411 12412                  ip_drop_packet(mp, B_FALSE, ill,
12412 12413                      DROPPER(ipss, ipds_spd_explicit), &ipss->ipsec_spd_dropper);
12413 12414                  return (EHOSTUNREACH);  /* IPsec policy failure */
12414 12415          case IPSEC_ACT_BYPASS:
12415 12416                  return (ip_output_post_ipsec(mp, ixa));
12416 12417          }
12417 12418  
12418 12419          /*
12419 12420           * The order of processing is first insert a IP header if needed.
12420 12421           * Then insert the ESP header and then the AH header.
12421 12422           */
12422 12423          if ((ixa->ixa_flags & IXAF_IS_IPV4) && ap->ipa_want_se) {
12423 12424                  /*
12424 12425                   * First get the outer IP header before sending
12425 12426                   * it to ESP.
12426 12427                   */
12427 12428                  ipha_t *oipha, *iipha;
12428 12429                  mblk_t *outer_mp, *inner_mp;
12429 12430  
12430 12431                  if ((outer_mp = allocb(sizeof (ipha_t), BPRI_HI)) == NULL) {
12431 12432                          (void) mi_strlog(ill->ill_rq, 0,
12432 12433                              SL_ERROR|SL_TRACE|SL_CONSOLE,
12433 12434                              "ipsec_out_process: "
12434 12435                              "Self-Encapsulation failed: Out of memory\n");
12435 12436                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
12436 12437                          ip_drop_output("ipIfStatsOutDiscards", mp, ill);
12437 12438                          freemsg(mp);
12438 12439                          return (ENOBUFS);
12439 12440                  }
12440 12441                  inner_mp = mp;
12441 12442                  ASSERT(inner_mp->b_datap->db_type == M_DATA);
12442 12443                  oipha = (ipha_t *)outer_mp->b_rptr;
12443 12444                  iipha = (ipha_t *)inner_mp->b_rptr;
12444 12445                  *oipha = *iipha;
12445 12446                  outer_mp->b_wptr += sizeof (ipha_t);
12446 12447                  oipha->ipha_length = htons(ntohs(iipha->ipha_length) +
12447 12448                      sizeof (ipha_t));
12448 12449                  oipha->ipha_protocol = IPPROTO_ENCAP;
12449 12450                  oipha->ipha_version_and_hdr_length =
12450 12451                      IP_SIMPLE_HDR_VERSION;
12451 12452                  oipha->ipha_hdr_checksum = 0;
12452 12453                  oipha->ipha_hdr_checksum = ip_csum_hdr(oipha);
12453 12454                  outer_mp->b_cont = inner_mp;
12454 12455                  mp = outer_mp;
12455 12456  
12456 12457                  ixa->ixa_flags |= IXAF_IPSEC_TUNNEL;
12457 12458          }
12458 12459  
12459 12460          /* If we need to wait for a SA then we can't return any errno */
12460 12461          if (((ap->ipa_want_ah && (ixa->ixa_ipsec_ah_sa == NULL)) ||
12461 12462              (ap->ipa_want_esp && (ixa->ixa_ipsec_esp_sa == NULL))) &&
12462 12463              !ipsec_out_select_sa(mp, ixa))
12463 12464                  return (0);
12464 12465  
12465 12466          /*
12466 12467           * By now, we know what SA's to use.  Toss over to ESP & AH
12467 12468           * to do the heavy lifting.
12468 12469           */
12469 12470          if (ap->ipa_want_esp) {
12470 12471                  ASSERT(ixa->ixa_ipsec_esp_sa != NULL);
12471 12472  
12472 12473                  mp = ixa->ixa_ipsec_esp_sa->ipsa_output_func(mp, ixa);
12473 12474                  if (mp == NULL) {
12474 12475                          /*
12475 12476                           * Either it failed or is pending. In the former case
12476 12477                           * ipIfStatsInDiscards was increased.
12477 12478                           */
12478 12479                          return (0);
12479 12480                  }
12480 12481          }
12481 12482  
12482 12483          if (ap->ipa_want_ah) {
12483 12484                  ASSERT(ixa->ixa_ipsec_ah_sa != NULL);
12484 12485  
12485 12486                  mp = ixa->ixa_ipsec_ah_sa->ipsa_output_func(mp, ixa);
12486 12487                  if (mp == NULL) {
12487 12488                          /*
12488 12489                           * Either it failed or is pending. In the former case
12489 12490                           * ipIfStatsInDiscards was increased.
12490 12491                           */
12491 12492                          return (0);
12492 12493                  }
12493 12494          }
12494 12495          /*
12495 12496           * We are done with IPsec processing. Send it over
12496 12497           * the wire.
12497 12498           */
12498 12499          return (ip_output_post_ipsec(mp, ixa));
12499 12500  }
12500 12501  
12501 12502  /*
12502 12503   * ioctls that go through a down/up sequence may need to wait for the down
12503 12504   * to complete. This involves waiting for the ire and ipif refcnts to go down
12504 12505   * to zero. Subsequently the ioctl is restarted from ipif_ill_refrele_tail.
12505 12506   */
12506 12507  /* ARGSUSED */
12507 12508  void
12508 12509  ip_reprocess_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
12509 12510  {
12510 12511          struct iocblk *iocp;
12511 12512          mblk_t *mp1;
12512 12513          ip_ioctl_cmd_t *ipip;
12513 12514          int err;
12514 12515          sin_t   *sin;
12515 12516          struct lifreq *lifr;
12516 12517          struct ifreq *ifr;
12517 12518  
12518 12519          iocp = (struct iocblk *)mp->b_rptr;
12519 12520          ASSERT(ipsq != NULL);
12520 12521          /* Existence of mp1 verified in ip_wput_nondata */
12521 12522          mp1 = mp->b_cont->b_cont;
12522 12523          ipip = ip_sioctl_lookup(iocp->ioc_cmd);
12523 12524          if (ipip->ipi_cmd == SIOCSLIFNAME || ipip->ipi_cmd == IF_UNITSEL) {
12524 12525                  /*
12525 12526                   * Special case where ipx_current_ipif is not set:
12526 12527                   * ill_phyint_reinit merged the v4 and v6 into a single ipsq.
12527 12528                   * We are here as were not able to complete the operation in
12528 12529                   * ipif_set_values because we could not become exclusive on
12529 12530                   * the new ipsq.
12530 12531                   */
12531 12532                  ill_t *ill = q->q_ptr;
12532 12533                  ipsq_current_start(ipsq, ill->ill_ipif, ipip->ipi_cmd);
12533 12534          }
12534 12535          ASSERT(ipsq->ipsq_xop->ipx_current_ipif != NULL);
12535 12536  
12536 12537          if (ipip->ipi_cmd_type == IF_CMD) {
12537 12538                  /* This a old style SIOC[GS]IF* command */
12538 12539                  ifr = (struct ifreq *)mp1->b_rptr;
12539 12540                  sin = (sin_t *)&ifr->ifr_addr;
12540 12541          } else if (ipip->ipi_cmd_type == LIF_CMD) {
12541 12542                  /* This a new style SIOC[GS]LIF* command */
12542 12543                  lifr = (struct lifreq *)mp1->b_rptr;
12543 12544                  sin = (sin_t *)&lifr->lifr_addr;
12544 12545          } else {
12545 12546                  sin = NULL;
12546 12547          }
12547 12548  
12548 12549          err = (*ipip->ipi_func_restart)(ipsq->ipsq_xop->ipx_current_ipif, sin,
12549 12550              q, mp, ipip, mp1->b_rptr);
12550 12551  
12551 12552          DTRACE_PROBE4(ipif__ioctl, char *, "ip_reprocess_ioctl finish",
12552 12553              int, ipip->ipi_cmd,
12553 12554              ill_t *, ipsq->ipsq_xop->ipx_current_ipif->ipif_ill,
12554 12555              ipif_t *, ipsq->ipsq_xop->ipx_current_ipif);
12555 12556  
12556 12557          ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), ipsq);
12557 12558  }
12558 12559  
12559 12560  /*
12560 12561   * ioctl processing
12561 12562   *
12562 12563   * ioctl processing starts with ip_sioctl_copyin_setup(), which looks up
12563 12564   * the ioctl command in the ioctl tables, determines the copyin data size
12564 12565   * from the ipi_copyin_size field, and does an mi_copyin() of that size.
12565 12566   *
12566 12567   * ioctl processing then continues when the M_IOCDATA makes its way down to
12567 12568   * ip_wput_nondata().  The ioctl is looked up again in the ioctl table, its
12568 12569   * associated 'conn' is refheld till the end of the ioctl and the general
12569 12570   * ioctl processing function ip_process_ioctl() is called to extract the
12570 12571   * arguments and process the ioctl.  To simplify extraction, ioctl commands
12571 12572   * are "typed" based on the arguments they take (e.g., LIF_CMD which takes a
12572 12573   * `struct lifreq'), and a common extract function (e.g., ip_extract_lifreq())
12573 12574   * is used to extract the ioctl's arguments.
12574 12575   *
12575 12576   * ip_process_ioctl determines if the ioctl needs to be serialized, and if
12576 12577   * so goes thru the serialization primitive ipsq_try_enter. Then the
12577 12578   * appropriate function to handle the ioctl is called based on the entry in
12578 12579   * the ioctl table. ioctl completion is encapsulated in ip_ioctl_finish
12579 12580   * which also refreleases the 'conn' that was refheld at the start of the
12580 12581   * ioctl. Finally ipsq_exit is called if needed to exit the ipsq.
12581 12582   *
12582 12583   * Many exclusive ioctls go thru an internal down up sequence as part of
12583 12584   * the operation. For example an attempt to change the IP address of an
12584 12585   * ipif entails ipif_down, set address, ipif_up. Bringing down the interface
12585 12586   * does all the cleanup such as deleting all ires that use this address.
12586 12587   * Then we need to wait till all references to the interface go away.
12587 12588   */
12588 12589  void
12589 12590  ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
12590 12591  {
12591 12592          struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
12592 12593          ip_ioctl_cmd_t *ipip = arg;
12593 12594          ip_extract_func_t *extract_funcp;
12594 12595          cmd_info_t ci;
12595 12596          int err;
12596 12597          boolean_t entered_ipsq = B_FALSE;
12597 12598  
12598 12599          ip3dbg(("ip_process_ioctl: ioctl %X\n", iocp->ioc_cmd));
12599 12600  
12600 12601          if (ipip == NULL)
12601 12602                  ipip = ip_sioctl_lookup(iocp->ioc_cmd);
12602 12603  
12603 12604          /*
12604 12605           * SIOCLIFADDIF needs to go thru a special path since the
12605 12606           * ill may not exist yet. This happens in the case of lo0
12606 12607           * which is created using this ioctl.
12607 12608           */
12608 12609          if (ipip->ipi_cmd == SIOCLIFADDIF) {
12609 12610                  err = ip_sioctl_addif(NULL, NULL, q, mp, NULL, NULL);
12610 12611                  DTRACE_PROBE4(ipif__ioctl, char *, "ip_process_ioctl finish",
12611 12612                      int, ipip->ipi_cmd, ill_t *, NULL, ipif_t *, NULL);
12612 12613                  ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), NULL);
12613 12614                  return;
12614 12615          }
12615 12616  
12616 12617          ci.ci_ipif = NULL;
12617 12618          extract_funcp = NULL;
12618 12619          switch (ipip->ipi_cmd_type) {
12619 12620          case MISC_CMD:
12620 12621          case MSFILT_CMD:
12621 12622                  /*
12622 12623                   * All MISC_CMD ioctls come in here -- e.g. SIOCGLIFCONF.
12623 12624                   */
12624 12625                  if (ipip->ipi_cmd == IF_UNITSEL) {
12625 12626                          /* ioctl comes down the ill */
12626 12627                          ci.ci_ipif = ((ill_t *)q->q_ptr)->ill_ipif;
12627 12628                          ipif_refhold(ci.ci_ipif);
12628 12629                  }
12629 12630                  err = 0;
12630 12631                  ci.ci_sin = NULL;
12631 12632                  ci.ci_sin6 = NULL;
12632 12633                  ci.ci_lifr = NULL;
12633 12634                  extract_funcp = NULL;
12634 12635                  break;
12635 12636  
12636 12637          case IF_CMD:
12637 12638          case LIF_CMD:
12638 12639                  extract_funcp = ip_extract_lifreq;
12639 12640                  break;
12640 12641  
12641 12642          case ARP_CMD:
12642 12643          case XARP_CMD:
12643 12644                  extract_funcp = ip_extract_arpreq;
12644 12645                  break;
12645 12646  
12646 12647          default:
12647 12648                  ASSERT(0);
12648 12649          }
12649 12650  
12650 12651          if (extract_funcp != NULL) {
12651 12652                  err = (*extract_funcp)(q, mp, ipip, &ci);
12652 12653                  if (err != 0) {
12653 12654                          DTRACE_PROBE4(ipif__ioctl,
12654 12655                              char *, "ip_process_ioctl finish err",
12655 12656                              int, ipip->ipi_cmd, ill_t *, NULL, ipif_t *, NULL);
12656 12657                          ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), NULL);
12657 12658                          return;
12658 12659                  }
12659 12660  
12660 12661                  /*
12661 12662                   * All of the extraction functions return a refheld ipif.
12662 12663                   */
12663 12664                  ASSERT(ci.ci_ipif != NULL);
12664 12665          }
12665 12666  
12666 12667          if (!(ipip->ipi_flags & IPI_WR)) {
12667 12668                  /*
12668 12669                   * A return value of EINPROGRESS means the ioctl is
12669 12670                   * either queued and waiting for some reason or has
12670 12671                   * already completed.
12671 12672                   */
12672 12673                  err = (*ipip->ipi_func)(ci.ci_ipif, ci.ci_sin, q, mp, ipip,
12673 12674                      ci.ci_lifr);
12674 12675                  if (ci.ci_ipif != NULL) {
12675 12676                          DTRACE_PROBE4(ipif__ioctl,
12676 12677                              char *, "ip_process_ioctl finish RD",
12677 12678                              int, ipip->ipi_cmd, ill_t *, ci.ci_ipif->ipif_ill,
12678 12679                              ipif_t *, ci.ci_ipif);
12679 12680                          ipif_refrele(ci.ci_ipif);
12680 12681                  } else {
12681 12682                          DTRACE_PROBE4(ipif__ioctl,
12682 12683                              char *, "ip_process_ioctl finish RD",
12683 12684                              int, ipip->ipi_cmd, ill_t *, NULL, ipif_t *, NULL);
12684 12685                  }
12685 12686                  ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), NULL);
12686 12687                  return;
12687 12688          }
12688 12689  
12689 12690          ASSERT(ci.ci_ipif != NULL);
12690 12691  
12691 12692          /*
12692 12693           * If ipsq is non-NULL, we are already being called exclusively
12693 12694           */
12694 12695          ASSERT(ipsq == NULL || IAM_WRITER_IPSQ(ipsq));
12695 12696          if (ipsq == NULL) {
12696 12697                  ipsq = ipsq_try_enter(ci.ci_ipif, NULL, q, mp, ip_process_ioctl,
12697 12698                      NEW_OP, B_TRUE);
12698 12699                  if (ipsq == NULL) {
12699 12700                          ipif_refrele(ci.ci_ipif);
12700 12701                          return;
12701 12702                  }
12702 12703                  entered_ipsq = B_TRUE;
12703 12704          }
12704 12705          /*
12705 12706           * Release the ipif so that ipif_down and friends that wait for
12706 12707           * references to go away are not misled about the current ipif_refcnt
12707 12708           * values. We are writer so we can access the ipif even after releasing
12708 12709           * the ipif.
12709 12710           */
12710 12711          ipif_refrele(ci.ci_ipif);
12711 12712  
12712 12713          ipsq_current_start(ipsq, ci.ci_ipif, ipip->ipi_cmd);
12713 12714  
12714 12715          /*
12715 12716           * A return value of EINPROGRESS means the ioctl is
12716 12717           * either queued and waiting for some reason or has
12717 12718           * already completed.
12718 12719           */
12719 12720          err = (*ipip->ipi_func)(ci.ci_ipif, ci.ci_sin, q, mp, ipip, ci.ci_lifr);
12720 12721  
12721 12722          DTRACE_PROBE4(ipif__ioctl, char *, "ip_process_ioctl finish WR",
12722 12723              int, ipip->ipi_cmd,
12723 12724              ill_t *, ci.ci_ipif == NULL ? NULL : ci.ci_ipif->ipif_ill,
12724 12725              ipif_t *, ci.ci_ipif);
12725 12726          ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), ipsq);
12726 12727  
12727 12728          if (entered_ipsq)
12728 12729                  ipsq_exit(ipsq);
12729 12730  }
12730 12731  
12731 12732  /*
12732 12733   * Complete the ioctl. Typically ioctls use the mi package and need to
12733 12734   * do mi_copyout/mi_copy_done.
12734 12735   */
12735 12736  void
12736 12737  ip_ioctl_finish(queue_t *q, mblk_t *mp, int err, int mode, ipsq_t *ipsq)
12737 12738  {
12738 12739          conn_t  *connp = NULL;
12739 12740  
12740 12741          if (err == EINPROGRESS)
12741 12742                  return;
12742 12743  
12743 12744          if (CONN_Q(q)) {
12744 12745                  connp = Q_TO_CONN(q);
12745 12746                  ASSERT(connp->conn_ref >= 2);
12746 12747          }
12747 12748  
12748 12749          switch (mode) {
12749 12750          case COPYOUT:
12750 12751                  if (err == 0)
12751 12752                          mi_copyout(q, mp);
12752 12753                  else
12753 12754                          mi_copy_done(q, mp, err);
12754 12755                  break;
12755 12756  
12756 12757          case NO_COPYOUT:
12757 12758                  mi_copy_done(q, mp, err);
12758 12759                  break;
12759 12760  
12760 12761          default:
12761 12762                  ASSERT(mode == CONN_CLOSE);     /* aborted through CONN_CLOSE */
12762 12763                  break;
12763 12764          }
12764 12765  
12765 12766          /*
12766 12767           * The conn refhold and ioctlref placed on the conn at the start of the
12767 12768           * ioctl are released here.
12768 12769           */
12769 12770          if (connp != NULL) {
12770 12771                  CONN_DEC_IOCTLREF(connp);
12771 12772                  CONN_OPER_PENDING_DONE(connp);
12772 12773          }
12773 12774  
12774 12775          if (ipsq != NULL)
12775 12776                  ipsq_current_finish(ipsq);
12776 12777  }
12777 12778  
12778 12779  /* Handles all non data messages */
12779 12780  int
12780 12781  ip_wput_nondata(queue_t *q, mblk_t *mp)
12781 12782  {
12782 12783          mblk_t          *mp1;
12783 12784          struct iocblk   *iocp;
12784 12785          ip_ioctl_cmd_t  *ipip;
12785 12786          conn_t          *connp;
12786 12787          cred_t          *cr;
12787 12788          char            *proto_str;
12788 12789  
12789 12790          if (CONN_Q(q))
12790 12791                  connp = Q_TO_CONN(q);
12791 12792          else
12792 12793                  connp = NULL;
12793 12794  
12794 12795          iocp = NULL;
12795 12796          switch (DB_TYPE(mp)) {
12796 12797          case M_IOCTL:
12797 12798                  /*
12798 12799                   * IOCTL processing begins in ip_sioctl_copyin_setup which
12799 12800                   * will arrange to copy in associated control structures.
12800 12801                   */
12801 12802                  ip_sioctl_copyin_setup(q, mp);
12802 12803                  return (0);
12803 12804          case M_IOCDATA:
12804 12805                  /*
12805 12806                   * Ensure that this is associated with one of our trans-
12806 12807                   * parent ioctls.  If it's not ours, discard it if we're
12807 12808                   * running as a driver, or pass it on if we're a module.
12808 12809                   */
12809 12810                  iocp = (struct iocblk *)mp->b_rptr;
12810 12811                  ipip = ip_sioctl_lookup(iocp->ioc_cmd);
12811 12812                  if (ipip == NULL) {
12812 12813                          if (q->q_next == NULL) {
12813 12814                                  goto nak;
12814 12815                          } else {
12815 12816                                  putnext(q, mp);
12816 12817                          }
12817 12818                          return (0);
12818 12819                  }
12819 12820                  if ((q->q_next != NULL) && !(ipip->ipi_flags & IPI_MODOK)) {
12820 12821                          /*
12821 12822                           * The ioctl is one we recognise, but is not consumed
12822 12823                           * by IP as a module and we are a module, so we drop
12823 12824                           */
12824 12825                          goto nak;
12825 12826                  }
12826 12827  
12827 12828                  /* IOCTL continuation following copyin or copyout. */
12828 12829                  if (mi_copy_state(q, mp, NULL) == -1) {
12829 12830                          /*
12830 12831                           * The copy operation failed.  mi_copy_state already
12831 12832                           * cleaned up, so we're out of here.
12832 12833                           */
12833 12834                          return (0);
12834 12835                  }
12835 12836                  /*
12836 12837                   * If we just completed a copy in, we become writer and
12837 12838                   * continue processing in ip_sioctl_copyin_done.  If it
12838 12839                   * was a copy out, we call mi_copyout again.  If there is
12839 12840                   * nothing more to copy out, it will complete the IOCTL.
12840 12841                   */
12841 12842                  if (MI_COPY_DIRECTION(mp) == MI_COPY_IN) {
12842 12843                          if (!(mp1 = mp->b_cont) || !(mp1 = mp1->b_cont)) {
12843 12844                                  mi_copy_done(q, mp, EPROTO);
12844 12845                                  return (0);
12845 12846                          }
12846 12847                          /*
12847 12848                           * Check for cases that need more copying.  A return
12848 12849                           * value of 0 means a second copyin has been started,
12849 12850                           * so we return; a return value of 1 means no more
12850 12851                           * copying is needed, so we continue.
12851 12852                           */
12852 12853                          if (ipip->ipi_cmd_type == MSFILT_CMD &&
12853 12854                              MI_COPY_COUNT(mp) == 1) {
12854 12855                                  if (ip_copyin_msfilter(q, mp) == 0)
12855 12856                                          return (0);
12856 12857                          }
12857 12858                          /*
12858 12859                           * Refhold the conn, till the ioctl completes. This is
12859 12860                           * needed in case the ioctl ends up in the pending mp
12860 12861                           * list. Every mp in the ipx_pending_mp list must have
12861 12862                           * a refhold on the conn to resume processing. The
12862 12863                           * refhold is released when the ioctl completes
12863 12864                           * (whether normally or abnormally). An ioctlref is also
12864 12865                           * placed on the conn to prevent TCP from removing the
12865 12866                           * queue needed to send the ioctl reply back.
12866 12867                           * In all cases ip_ioctl_finish is called to finish
12867 12868                           * the ioctl and release the refholds.
12868 12869                           */
12869 12870                          if (connp != NULL) {
12870 12871                                  /* This is not a reentry */
12871 12872                                  CONN_INC_REF(connp);
12872 12873                                  CONN_INC_IOCTLREF(connp);
12873 12874                          } else {
12874 12875                                  if (!(ipip->ipi_flags & IPI_MODOK)) {
12875 12876                                          mi_copy_done(q, mp, EINVAL);
12876 12877                                          return (0);
12877 12878                                  }
12878 12879                          }
12879 12880  
12880 12881                          ip_process_ioctl(NULL, q, mp, ipip);
12881 12882  
12882 12883                  } else {
12883 12884                          mi_copyout(q, mp);
12884 12885                  }
12885 12886                  return (0);
12886 12887  
12887 12888          case M_IOCNAK:
12888 12889                  /*
12889 12890                   * The only way we could get here is if a resolver didn't like
12890 12891                   * an IOCTL we sent it.  This shouldn't happen.
12891 12892                   */
12892 12893                  (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
12893 12894                      "ip_wput_nondata: unexpected M_IOCNAK, ioc_cmd 0x%x",
12894 12895                      ((struct iocblk *)mp->b_rptr)->ioc_cmd);
12895 12896                  freemsg(mp);
12896 12897                  return (0);
12897 12898          case M_IOCACK:
12898 12899                  /* /dev/ip shouldn't see this */
12899 12900                  goto nak;
12900 12901          case M_FLUSH:
12901 12902                  if (*mp->b_rptr & FLUSHW)
12902 12903                          flushq(q, FLUSHALL);
12903 12904                  if (q->q_next) {
12904 12905                          putnext(q, mp);
12905 12906                          return (0);
12906 12907                  }
12907 12908                  if (*mp->b_rptr & FLUSHR) {
12908 12909                          *mp->b_rptr &= ~FLUSHW;
12909 12910                          qreply(q, mp);
12910 12911                          return (0);
12911 12912                  }
12912 12913                  freemsg(mp);
12913 12914                  return (0);
12914 12915          case M_CTL:
12915 12916                  break;
12916 12917          case M_PROTO:
12917 12918          case M_PCPROTO:
12918 12919                  /*
12919 12920                   * The only PROTO messages we expect are SNMP-related.
12920 12921                   */
12921 12922                  switch (((union T_primitives *)mp->b_rptr)->type) {
12922 12923                  case T_SVR4_OPTMGMT_REQ:
12923 12924                          ip2dbg(("ip_wput_nondata: T_SVR4_OPTMGMT_REQ "
12924 12925                              "flags %x\n",
12925 12926                              ((struct T_optmgmt_req *)mp->b_rptr)->MGMT_flags));
12926 12927  
12927 12928                          if (connp == NULL) {
12928 12929                                  proto_str = "T_SVR4_OPTMGMT_REQ";
12929 12930                                  goto protonak;
12930 12931                          }
12931 12932  
12932 12933                          /*
12933 12934                           * All Solaris components should pass a db_credp
12934 12935                           * for this TPI message, hence we ASSERT.
12935 12936                           * But in case there is some other M_PROTO that looks
12936 12937                           * like a TPI message sent by some other kernel
12937 12938                           * component, we check and return an error.
12938 12939                           */
12939 12940                          cr = msg_getcred(mp, NULL);
12940 12941                          ASSERT(cr != NULL);
12941 12942                          if (cr == NULL) {
12942 12943                                  mp = mi_tpi_err_ack_alloc(mp, TSYSERR, EINVAL);
12943 12944                                  if (mp != NULL)
12944 12945                                          qreply(q, mp);
12945 12946                                  return (0);
12946 12947                          }
12947 12948  
12948 12949                          if (!snmpcom_req(q, mp, ip_snmp_set, ip_snmp_get, cr)) {
12949 12950                                  proto_str = "Bad SNMPCOM request?";
12950 12951                                  goto protonak;
12951 12952                          }
12952 12953                          return (0);
12953 12954                  default:
12954 12955                          ip1dbg(("ip_wput_nondata: dropping M_PROTO prim %u\n",
12955 12956                              (int)*(uint_t *)mp->b_rptr));
12956 12957                          freemsg(mp);
12957 12958                          return (0);
12958 12959                  }
12959 12960          default:
12960 12961                  break;
12961 12962          }
12962 12963          if (q->q_next) {
12963 12964                  putnext(q, mp);
12964 12965          } else
12965 12966                  freemsg(mp);
12966 12967          return (0);
12967 12968  
12968 12969  nak:
12969 12970          iocp->ioc_error = EINVAL;
12970 12971          mp->b_datap->db_type = M_IOCNAK;
12971 12972          iocp->ioc_count = 0;
12972 12973          qreply(q, mp);
12973 12974          return (0);
12974 12975  
12975 12976  protonak:
12976 12977          cmn_err(CE_NOTE, "IP doesn't process %s as a module", proto_str);
12977 12978          if ((mp = mi_tpi_err_ack_alloc(mp, TPROTO, EINVAL)) != NULL)
12978 12979                  qreply(q, mp);
12979 12980          return (0);
12980 12981  }
12981 12982  
12982 12983  /*
12983 12984   * Process IP options in an outbound packet.  Verify that the nexthop in a
12984 12985   * strict source route is onlink.
12985 12986   * Returns non-zero if something fails in which case an ICMP error has been
12986 12987   * sent and mp freed.
12987 12988   *
12988 12989   * Assumes the ULP has called ip_massage_options to move nexthop into ipha_dst.
12989 12990   */
12990 12991  int
12991 12992  ip_output_options(mblk_t *mp, ipha_t *ipha, ip_xmit_attr_t *ixa, ill_t *ill)
12992 12993  {
12993 12994          ipoptp_t        opts;
12994 12995          uchar_t         *opt;
12995 12996          uint8_t         optval;
12996 12997          uint8_t         optlen;
12997 12998          ipaddr_t        dst;
12998 12999          intptr_t        code = 0;
12999 13000          ire_t           *ire;
13000 13001          ip_stack_t      *ipst = ixa->ixa_ipst;
13001 13002          ip_recv_attr_t  iras;
13002 13003  
13003 13004          ip2dbg(("ip_output_options\n"));
13004 13005  
13005 13006          opt = NULL;
13006 13007          dst = ipha->ipha_dst;
13007 13008          for (optval = ipoptp_first(&opts, ipha);
13008 13009              optval != IPOPT_EOL;
13009 13010              optval = ipoptp_next(&opts)) {
13010 13011                  opt = opts.ipoptp_cur;
13011 13012                  optlen = opts.ipoptp_len;
13012 13013                  ip2dbg(("ip_output_options: opt %d, len %d\n",
13013 13014                      optval, optlen));
13014 13015                  switch (optval) {
13015 13016                          uint32_t off;
13016 13017                  case IPOPT_SSRR:
13017 13018                  case IPOPT_LSRR:
13018 13019                          if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
13019 13020                                  ip1dbg((
13020 13021                                      "ip_output_options: bad option offset\n"));
13021 13022                                  code = (char *)&opt[IPOPT_OLEN] -
13022 13023                                      (char *)ipha;
13023 13024                                  goto param_prob;
13024 13025                          }
13025 13026                          off = opt[IPOPT_OFFSET];
13026 13027                          ip1dbg(("ip_output_options: next hop 0x%x\n",
13027 13028                              ntohl(dst)));
13028 13029                          /*
13029 13030                           * For strict: verify that dst is directly
13030 13031                           * reachable.
13031 13032                           */
13032 13033                          if (optval == IPOPT_SSRR) {
13033 13034                                  ire = ire_ftable_lookup_v4(dst, 0, 0,
13034 13035                                      IRE_INTERFACE, NULL, ALL_ZONES,
13035 13036                                      ixa->ixa_tsl,
13036 13037                                      MATCH_IRE_TYPE | MATCH_IRE_SECATTR, 0, ipst,
13037 13038                                      NULL);
13038 13039                                  if (ire == NULL) {
13039 13040                                          ip1dbg(("ip_output_options: SSRR not"
13040 13041                                              " directly reachable: 0x%x\n",
13041 13042                                              ntohl(dst)));
13042 13043                                          goto bad_src_route;
13043 13044                                  }
13044 13045                                  ire_refrele(ire);
13045 13046                          }
13046 13047                          break;
13047 13048                  case IPOPT_RR:
13048 13049                          if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
13049 13050                                  ip1dbg((
13050 13051                                      "ip_output_options: bad option offset\n"));
13051 13052                                  code = (char *)&opt[IPOPT_OLEN] -
13052 13053                                      (char *)ipha;
13053 13054                                  goto param_prob;
13054 13055                          }
13055 13056                          break;
13056 13057                  case IPOPT_TS:
13057 13058                          /*
13058 13059                           * Verify that length >=5 and that there is either
13059 13060                           * room for another timestamp or that the overflow
13060 13061                           * counter is not maxed out.
13061 13062                           */
13062 13063                          code = (char *)&opt[IPOPT_OLEN] - (char *)ipha;
13063 13064                          if (optlen < IPOPT_MINLEN_IT) {
13064 13065                                  goto param_prob;
13065 13066                          }
13066 13067                          if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
13067 13068                                  ip1dbg((
13068 13069                                      "ip_output_options: bad option offset\n"));
13069 13070                                  code = (char *)&opt[IPOPT_OFFSET] -
13070 13071                                      (char *)ipha;
13071 13072                                  goto param_prob;
13072 13073                          }
13073 13074                          switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
13074 13075                          case IPOPT_TS_TSONLY:
13075 13076                                  off = IPOPT_TS_TIMELEN;
13076 13077                                  break;
13077 13078                          case IPOPT_TS_TSANDADDR:
13078 13079                          case IPOPT_TS_PRESPEC:
13079 13080                          case IPOPT_TS_PRESPEC_RFC791:
13080 13081                                  off = IP_ADDR_LEN + IPOPT_TS_TIMELEN;
13081 13082                                  break;
13082 13083                          default:
13083 13084                                  code = (char *)&opt[IPOPT_POS_OV_FLG] -
13084 13085                                      (char *)ipha;
13085 13086                                  goto param_prob;
13086 13087                          }
13087 13088                          if (opt[IPOPT_OFFSET] - 1 + off > optlen &&
13088 13089                              (opt[IPOPT_POS_OV_FLG] & 0xF0) == 0xF0) {
13089 13090                                  /*
13090 13091                                   * No room and the overflow counter is 15
13091 13092                                   * already.
13092 13093                                   */
13093 13094                                  goto param_prob;
13094 13095                          }
13095 13096                          break;
13096 13097                  }
13097 13098          }
13098 13099  
13099 13100          if ((opts.ipoptp_flags & IPOPTP_ERROR) == 0)
13100 13101                  return (0);
13101 13102  
13102 13103          ip1dbg(("ip_output_options: error processing IP options."));
13103 13104          code = (char *)&opt[IPOPT_OFFSET] - (char *)ipha;
13104 13105  
13105 13106  param_prob:
13106 13107          bzero(&iras, sizeof (iras));
13107 13108          iras.ira_ill = iras.ira_rill = ill;
13108 13109          iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
13109 13110          iras.ira_rifindex = iras.ira_ruifindex;
13110 13111          iras.ira_flags = IRAF_IS_IPV4;
13111 13112  
13112 13113          ip_drop_output("ip_output_options", mp, ill);
13113 13114          icmp_param_problem(mp, (uint8_t)code, &iras);
13114 13115          ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
13115 13116          return (-1);
13116 13117  
13117 13118  bad_src_route:
13118 13119          bzero(&iras, sizeof (iras));
13119 13120          iras.ira_ill = iras.ira_rill = ill;
13120 13121          iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
13121 13122          iras.ira_rifindex = iras.ira_ruifindex;
13122 13123          iras.ira_flags = IRAF_IS_IPV4;
13123 13124  
13124 13125          ip_drop_input("ICMP_SOURCE_ROUTE_FAILED", mp, ill);
13125 13126          icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, &iras);
13126 13127          ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
13127 13128          return (-1);
13128 13129  }
13129 13130  
13130 13131  /*
13131 13132   * The maximum value of conn_drain_list_cnt is CONN_MAXDRAINCNT.
13132 13133   * conn_drain_list_cnt can be changed by setting conn_drain_nthreads
13133 13134   * thru /etc/system.
13134 13135   */
13135 13136  #define CONN_MAXDRAINCNT        64
13136 13137  
13137 13138  static void
13138 13139  conn_drain_init(ip_stack_t *ipst)
13139 13140  {
13140 13141          int i, j;
13141 13142          idl_tx_list_t *itl_tx;
13142 13143  
13143 13144          ipst->ips_conn_drain_list_cnt = conn_drain_nthreads;
13144 13145  
13145 13146          if ((ipst->ips_conn_drain_list_cnt == 0) ||
13146 13147              (ipst->ips_conn_drain_list_cnt > CONN_MAXDRAINCNT)) {
13147 13148                  /*
13148 13149                   * Default value of the number of drainers is the
13149 13150                   * number of cpus, subject to maximum of 8 drainers.
13150 13151                   */
13151 13152                  if (boot_max_ncpus != -1)
13152 13153                          ipst->ips_conn_drain_list_cnt = MIN(boot_max_ncpus, 8);
13153 13154                  else
13154 13155                          ipst->ips_conn_drain_list_cnt = MIN(max_ncpus, 8);
13155 13156          }
13156 13157  
13157 13158          ipst->ips_idl_tx_list =
13158 13159              kmem_zalloc(TX_FANOUT_SIZE * sizeof (idl_tx_list_t), KM_SLEEP);
13159 13160          for (i = 0; i < TX_FANOUT_SIZE; i++) {
13160 13161                  itl_tx =  &ipst->ips_idl_tx_list[i];
13161 13162                  itl_tx->txl_drain_list =
13162 13163                      kmem_zalloc(ipst->ips_conn_drain_list_cnt *
13163 13164                      sizeof (idl_t), KM_SLEEP);
13164 13165                  mutex_init(&itl_tx->txl_lock, NULL, MUTEX_DEFAULT, NULL);
13165 13166                  for (j = 0; j < ipst->ips_conn_drain_list_cnt; j++) {
13166 13167                          mutex_init(&itl_tx->txl_drain_list[j].idl_lock, NULL,
13167 13168                              MUTEX_DEFAULT, NULL);
13168 13169                          itl_tx->txl_drain_list[j].idl_itl = itl_tx;
13169 13170                  }
13170 13171          }
13171 13172  }
13172 13173  
13173 13174  static void
13174 13175  conn_drain_fini(ip_stack_t *ipst)
13175 13176  {
13176 13177          int i;
13177 13178          idl_tx_list_t *itl_tx;
13178 13179  
13179 13180          for (i = 0; i < TX_FANOUT_SIZE; i++) {
13180 13181                  itl_tx =  &ipst->ips_idl_tx_list[i];
13181 13182                  kmem_free(itl_tx->txl_drain_list,
13182 13183                      ipst->ips_conn_drain_list_cnt * sizeof (idl_t));
13183 13184          }
13184 13185          kmem_free(ipst->ips_idl_tx_list,
13185 13186              TX_FANOUT_SIZE * sizeof (idl_tx_list_t));
13186 13187          ipst->ips_idl_tx_list = NULL;
13187 13188  }
13188 13189  
13189 13190  /*
13190 13191   * Flow control has blocked us from proceeding.  Insert the given conn in one
13191 13192   * of the conn drain lists.  When flow control is unblocked, either ip_wsrv()
13192 13193   * (STREAMS) or ill_flow_enable() (direct) will be called back, which in turn
13193 13194   * will call conn_walk_drain().  See the flow control notes at the top of this
13194 13195   * file for more details.
13195 13196   */
13196 13197  void
13197 13198  conn_drain_insert(conn_t *connp, idl_tx_list_t *tx_list)
13198 13199  {
13199 13200          idl_t   *idl = tx_list->txl_drain_list;
13200 13201          uint_t  index;
13201 13202          ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
13202 13203  
13203 13204          mutex_enter(&connp->conn_lock);
13204 13205          if (connp->conn_state_flags & CONN_CLOSING) {
13205 13206                  /*
13206 13207                   * The conn is closing as a result of which CONN_CLOSING
13207 13208                   * is set. Return.
13208 13209                   */
13209 13210                  mutex_exit(&connp->conn_lock);
13210 13211                  return;
13211 13212          } else if (connp->conn_idl == NULL) {
13212 13213                  /*
13213 13214                   * Assign the next drain list round robin. We dont' use
13214 13215                   * a lock, and thus it may not be strictly round robin.
13215 13216                   * Atomicity of load/stores is enough to make sure that
13216 13217                   * conn_drain_list_index is always within bounds.
13217 13218                   */
13218 13219                  index = tx_list->txl_drain_index;
13219 13220                  ASSERT(index < ipst->ips_conn_drain_list_cnt);
13220 13221                  connp->conn_idl = &tx_list->txl_drain_list[index];
13221 13222                  index++;
13222 13223                  if (index == ipst->ips_conn_drain_list_cnt)
13223 13224                          index = 0;
13224 13225                  tx_list->txl_drain_index = index;
13225 13226          } else {
13226 13227                  ASSERT(connp->conn_idl->idl_itl == tx_list);
13227 13228          }
13228 13229          mutex_exit(&connp->conn_lock);
13229 13230  
13230 13231          idl = connp->conn_idl;
13231 13232          mutex_enter(&idl->idl_lock);
13232 13233          if ((connp->conn_drain_prev != NULL) ||
13233 13234              (connp->conn_state_flags & CONN_CLOSING)) {
13234 13235                  /*
13235 13236                   * The conn is either already in the drain list or closing.
13236 13237                   * (We needed to check for CONN_CLOSING again since close can
13237 13238                   * sneak in between dropping conn_lock and acquiring idl_lock.)
13238 13239                   */
13239 13240                  mutex_exit(&idl->idl_lock);
13240 13241                  return;
13241 13242          }
13242 13243  
13243 13244          /*
13244 13245           * The conn is not in the drain list. Insert it at the
13245 13246           * tail of the drain list. The drain list is circular
13246 13247           * and doubly linked. idl_conn points to the 1st element
13247 13248           * in the list.
13248 13249           */
13249 13250          if (idl->idl_conn == NULL) {
13250 13251                  idl->idl_conn = connp;
13251 13252                  connp->conn_drain_next = connp;
13252 13253                  connp->conn_drain_prev = connp;
13253 13254          } else {
13254 13255                  conn_t *head = idl->idl_conn;
13255 13256  
13256 13257                  connp->conn_drain_next = head;
13257 13258                  connp->conn_drain_prev = head->conn_drain_prev;
13258 13259                  head->conn_drain_prev->conn_drain_next = connp;
13259 13260                  head->conn_drain_prev = connp;
13260 13261          }
13261 13262          /*
13262 13263           * For non streams based sockets assert flow control.
13263 13264           */
13264 13265          conn_setqfull(connp, NULL);
13265 13266          mutex_exit(&idl->idl_lock);
13266 13267  }
13267 13268  
13268 13269  static void
13269 13270  conn_drain_remove(conn_t *connp)
13270 13271  {
13271 13272          idl_t *idl = connp->conn_idl;
13272 13273  
13273 13274          if (idl != NULL) {
13274 13275                  /*
13275 13276                   * Remove ourself from the drain list.
13276 13277                   */
13277 13278                  if (connp->conn_drain_next == connp) {
13278 13279                          /* Singleton in the list */
13279 13280                          ASSERT(connp->conn_drain_prev == connp);
13280 13281                          idl->idl_conn = NULL;
13281 13282                  } else {
13282 13283                          connp->conn_drain_prev->conn_drain_next =
13283 13284                              connp->conn_drain_next;
13284 13285                          connp->conn_drain_next->conn_drain_prev =
13285 13286                              connp->conn_drain_prev;
13286 13287                          if (idl->idl_conn == connp)
13287 13288                                  idl->idl_conn = connp->conn_drain_next;
13288 13289                  }
13289 13290  
13290 13291                  /*
13291 13292                   * NOTE: because conn_idl is associated with a specific drain
13292 13293                   * list which in turn is tied to the index the TX ring
13293 13294                   * (txl_cookie) hashes to, and because the TX ring can change
13294 13295                   * over the lifetime of the conn_t, we must clear conn_idl so
13295 13296                   * a subsequent conn_drain_insert() will set conn_idl again
13296 13297                   * based on the latest txl_cookie.
13297 13298                   */
13298 13299                  connp->conn_idl = NULL;
13299 13300          }
13300 13301          connp->conn_drain_next = NULL;
13301 13302          connp->conn_drain_prev = NULL;
13302 13303  
13303 13304          conn_clrqfull(connp, NULL);
13304 13305          /*
13305 13306           * For streams based sockets open up flow control.
13306 13307           */
13307 13308          if (!IPCL_IS_NONSTR(connp))
13308 13309                  enableok(connp->conn_wq);
13309 13310  }
13310 13311  
13311 13312  /*
13312 13313   * This conn is closing, and we are called from ip_close. OR
13313 13314   * this conn is draining because flow-control on the ill has been relieved.
13314 13315   *
13315 13316   * We must also need to remove conn's on this idl from the list, and also
13316 13317   * inform the sockfs upcalls about the change in flow-control.
13317 13318   */
13318 13319  static void
13319 13320  conn_drain(conn_t *connp, boolean_t closing)
13320 13321  {
13321 13322          idl_t *idl;
13322 13323          conn_t *next_connp;
13323 13324  
13324 13325          /*
13325 13326           * connp->conn_idl is stable at this point, and no lock is needed
13326 13327           * to check it. If we are called from ip_close, close has already
13327 13328           * set CONN_CLOSING, thus freezing the value of conn_idl, and
13328 13329           * called us only because conn_idl is non-null. If we are called thru
13329 13330           * service, conn_idl could be null, but it cannot change because
13330 13331           * service is single-threaded per queue, and there cannot be another
13331 13332           * instance of service trying to call conn_drain_insert on this conn
13332 13333           * now.
13333 13334           */
13334 13335          ASSERT(!closing || connp == NULL || connp->conn_idl != NULL);
13335 13336  
13336 13337          /*
13337 13338           * If the conn doesn't exist or is not on a drain list, bail.
13338 13339           */
13339 13340          if (connp == NULL || connp->conn_idl == NULL ||
13340 13341              connp->conn_drain_prev == NULL) {
13341 13342                  return;
13342 13343          }
13343 13344  
13344 13345          idl = connp->conn_idl;
13345 13346          ASSERT(MUTEX_HELD(&idl->idl_lock));
13346 13347  
13347 13348          if (!closing) {
13348 13349                  next_connp = connp->conn_drain_next;
13349 13350                  while (next_connp != connp) {
13350 13351                          conn_t *delconnp = next_connp;
13351 13352  
13352 13353                          next_connp = next_connp->conn_drain_next;
13353 13354                          conn_drain_remove(delconnp);
13354 13355                  }
13355 13356                  ASSERT(connp->conn_drain_next == idl->idl_conn);
13356 13357          }
13357 13358          conn_drain_remove(connp);
13358 13359  }
13359 13360  
13360 13361  /*
13361 13362   * Write service routine. Shared perimeter entry point.
13362 13363   * The device queue's messages has fallen below the low water mark and STREAMS
13363 13364   * has backenabled the ill_wq. Send sockfs notification about flow-control on
13364 13365   * each waiting conn.
13365 13366   */
13366 13367  int
13367 13368  ip_wsrv(queue_t *q)
13368 13369  {
13369 13370          ill_t   *ill;
13370 13371  
13371 13372          ill = (ill_t *)q->q_ptr;
13372 13373          if (ill->ill_state_flags == 0) {
13373 13374                  ip_stack_t *ipst = ill->ill_ipst;
13374 13375  
13375 13376                  /*
13376 13377                   * The device flow control has opened up.
13377 13378                   * Walk through conn drain lists and qenable the
13378 13379                   * first conn in each list. This makes sense only
13379 13380                   * if the stream is fully plumbed and setup.
13380 13381                   * Hence the ill_state_flags check above.
13381 13382                   */
13382 13383                  ip1dbg(("ip_wsrv: walking\n"));
13383 13384                  conn_walk_drain(ipst, &ipst->ips_idl_tx_list[0]);
13384 13385                  enableok(ill->ill_wq);
13385 13386          }
13386 13387          return (0);
13387 13388  }
13388 13389  
13389 13390  /*
13390 13391   * Callback to disable flow control in IP.
13391 13392   *
13392 13393   * This is a mac client callback added when the DLD_CAPAB_DIRECT capability
13393 13394   * is enabled.
13394 13395   *
13395 13396   * When MAC_TX() is not able to send any more packets, dld sets its queue
13396 13397   * to QFULL and enable the STREAMS flow control. Later, when the underlying
13397 13398   * driver is able to continue to send packets, it calls mac_tx_(ring_)update()
13398 13399   * function and wakes up corresponding mac worker threads, which in turn
13399 13400   * calls this callback function, and disables flow control.
13400 13401   */
13401 13402  void
13402 13403  ill_flow_enable(void *arg, ip_mac_tx_cookie_t cookie)
13403 13404  {
13404 13405          ill_t *ill = (ill_t *)arg;
13405 13406          ip_stack_t *ipst = ill->ill_ipst;
13406 13407          idl_tx_list_t *idl_txl;
13407 13408  
13408 13409          idl_txl = &ipst->ips_idl_tx_list[IDLHASHINDEX(cookie)];
13409 13410          mutex_enter(&idl_txl->txl_lock);
13410 13411          /* add code to to set a flag to indicate idl_txl is enabled */
13411 13412          conn_walk_drain(ipst, idl_txl);
13412 13413          mutex_exit(&idl_txl->txl_lock);
13413 13414  }
13414 13415  
13415 13416  /*
13416 13417   * Flow control has been relieved and STREAMS has backenabled us; drain
13417 13418   * all the conn lists on `tx_list'.
13418 13419   */
13419 13420  static void
13420 13421  conn_walk_drain(ip_stack_t *ipst, idl_tx_list_t *tx_list)
13421 13422  {
13422 13423          int i;
13423 13424          idl_t *idl;
13424 13425  
13425 13426          IP_STAT(ipst, ip_conn_walk_drain);
13426 13427  
13427 13428          for (i = 0; i < ipst->ips_conn_drain_list_cnt; i++) {
13428 13429                  idl = &tx_list->txl_drain_list[i];
13429 13430                  mutex_enter(&idl->idl_lock);
13430 13431                  conn_drain(idl->idl_conn, B_FALSE);
13431 13432                  mutex_exit(&idl->idl_lock);
13432 13433          }
13433 13434  }
13434 13435  
13435 13436  /*
13436 13437   * Determine if the ill and multicast aspects of that packets
13437 13438   * "matches" the conn.
13438 13439   */
13439 13440  boolean_t
13440 13441  conn_wantpacket(conn_t *connp, ip_recv_attr_t *ira, ipha_t *ipha)
13441 13442  {
13442 13443          ill_t           *ill = ira->ira_rill;
13443 13444          zoneid_t        zoneid = ira->ira_zoneid;
13444 13445          uint_t          in_ifindex;
13445 13446          ipaddr_t        dst, src;
13446 13447  
13447 13448          dst = ipha->ipha_dst;
13448 13449          src = ipha->ipha_src;
13449 13450  
13450 13451          /*
13451 13452           * conn_incoming_ifindex is set by IP_BOUND_IF which limits
13452 13453           * unicast, broadcast and multicast reception to
13453 13454           * conn_incoming_ifindex.
13454 13455           * conn_wantpacket is called for unicast, broadcast and
13455 13456           * multicast packets.
13456 13457           */
13457 13458          in_ifindex = connp->conn_incoming_ifindex;
13458 13459  
13459 13460          /* mpathd can bind to the under IPMP interface, which we allow */
13460 13461          if (in_ifindex != 0 && in_ifindex != ill->ill_phyint->phyint_ifindex) {
13461 13462                  if (!IS_UNDER_IPMP(ill))
13462 13463                          return (B_FALSE);
13463 13464  
13464 13465                  if (in_ifindex != ipmp_ill_get_ipmp_ifindex(ill))
13465 13466                          return (B_FALSE);
13466 13467          }
13467 13468  
13468 13469          if (!IPCL_ZONE_MATCH(connp, zoneid))
13469 13470                  return (B_FALSE);
13470 13471  
13471 13472          if (!(ira->ira_flags & IRAF_MULTICAST))
13472 13473                  return (B_TRUE);
13473 13474  
13474 13475          if (connp->conn_multi_router) {
13475 13476                  /* multicast packet and multicast router socket: send up */
13476 13477                  return (B_TRUE);
13477 13478          }
13478 13479  
13479 13480          if (ipha->ipha_protocol == IPPROTO_PIM ||
13480 13481              ipha->ipha_protocol == IPPROTO_RSVP)
13481 13482                  return (B_TRUE);
13482 13483  
13483 13484          return (conn_hasmembers_ill_withsrc_v4(connp, dst, src, ira->ira_ill));
13484 13485  }
13485 13486  
13486 13487  void
13487 13488  conn_setqfull(conn_t *connp, boolean_t *flow_stopped)
13488 13489  {
13489 13490          if (IPCL_IS_NONSTR(connp)) {
13490 13491                  (*connp->conn_upcalls->su_txq_full)
13491 13492                      (connp->conn_upper_handle, B_TRUE);
13492 13493                  if (flow_stopped != NULL)
13493 13494                          *flow_stopped = B_TRUE;
13494 13495          } else {
13495 13496                  queue_t *q = connp->conn_wq;
13496 13497  
13497 13498                  ASSERT(q != NULL);
13498 13499                  if (!(q->q_flag & QFULL)) {
13499 13500                          mutex_enter(QLOCK(q));
13500 13501                          if (!(q->q_flag & QFULL)) {
13501 13502                                  /* still need to set QFULL */
13502 13503                                  q->q_flag |= QFULL;
13503 13504                                  /* set flow_stopped to true under QLOCK */
13504 13505                                  if (flow_stopped != NULL)
13505 13506                                          *flow_stopped = B_TRUE;
13506 13507                                  mutex_exit(QLOCK(q));
13507 13508                          } else {
13508 13509                                  /* flow_stopped is left unchanged */
13509 13510                                  mutex_exit(QLOCK(q));
13510 13511                          }
13511 13512                  }
13512 13513          }
13513 13514  }
13514 13515  
13515 13516  void
13516 13517  conn_clrqfull(conn_t *connp, boolean_t *flow_stopped)
13517 13518  {
13518 13519          if (IPCL_IS_NONSTR(connp)) {
13519 13520                  (*connp->conn_upcalls->su_txq_full)
13520 13521                      (connp->conn_upper_handle, B_FALSE);
13521 13522                  if (flow_stopped != NULL)
13522 13523                          *flow_stopped = B_FALSE;
13523 13524          } else {
13524 13525                  queue_t *q = connp->conn_wq;
13525 13526  
13526 13527                  ASSERT(q != NULL);
13527 13528                  if (q->q_flag & QFULL) {
13528 13529                          mutex_enter(QLOCK(q));
13529 13530                          if (q->q_flag & QFULL) {
13530 13531                                  q->q_flag &= ~QFULL;
13531 13532                                  /* set flow_stopped to false under QLOCK */
13532 13533                                  if (flow_stopped != NULL)
13533 13534                                          *flow_stopped = B_FALSE;
13534 13535                                  mutex_exit(QLOCK(q));
13535 13536                                  if (q->q_flag & QWANTW)
13536 13537                                          qbackenable(q, 0);
13537 13538                          } else {
13538 13539                                  /* flow_stopped is left unchanged */
13539 13540                                  mutex_exit(QLOCK(q));
13540 13541                          }
13541 13542                  }
13542 13543          }
13543 13544  
13544 13545          mutex_enter(&connp->conn_lock);
13545 13546          connp->conn_blocked = B_FALSE;
13546 13547          mutex_exit(&connp->conn_lock);
13547 13548  }
13548 13549  
13549 13550  /*
13550 13551   * Return the length in bytes of the IPv4 headers (base header, label, and
13551 13552   * other IP options) that will be needed based on the
13552 13553   * ip_pkt_t structure passed by the caller.
13553 13554   *
13554 13555   * The returned length does not include the length of the upper level
13555 13556   * protocol (ULP) header.
13556 13557   * The caller needs to check that the length doesn't exceed the max for IPv4.
13557 13558   */
13558 13559  int
13559 13560  ip_total_hdrs_len_v4(const ip_pkt_t *ipp)
13560 13561  {
13561 13562          int len;
13562 13563  
13563 13564          len = IP_SIMPLE_HDR_LENGTH;
13564 13565          if (ipp->ipp_fields & IPPF_LABEL_V4) {
13565 13566                  ASSERT(ipp->ipp_label_len_v4 != 0);
13566 13567                  /* We need to round up here */
13567 13568                  len += (ipp->ipp_label_len_v4 + 3) & ~3;
13568 13569          }
13569 13570  
13570 13571          if (ipp->ipp_fields & IPPF_IPV4_OPTIONS) {
13571 13572                  ASSERT(ipp->ipp_ipv4_options_len != 0);
13572 13573                  ASSERT((ipp->ipp_ipv4_options_len & 3) == 0);
13573 13574                  len += ipp->ipp_ipv4_options_len;
13574 13575          }
13575 13576          return (len);
13576 13577  }
13577 13578  
13578 13579  /*
13579 13580   * All-purpose routine to build an IPv4 header with options based
13580 13581   * on the abstract ip_pkt_t.
13581 13582   *
13582 13583   * The caller has to set the source and destination address as well as
13583 13584   * ipha_length. The caller has to massage any source route and compensate
13584 13585   * for the ULP pseudo-header checksum due to the source route.
13585 13586   */
13586 13587  void
13587 13588  ip_build_hdrs_v4(uchar_t *buf, uint_t buf_len, const ip_pkt_t *ipp,
13588 13589      uint8_t protocol)
13589 13590  {
13590 13591          ipha_t  *ipha = (ipha_t *)buf;
13591 13592          uint8_t *cp;
13592 13593  
13593 13594          /* Initialize IPv4 header */
13594 13595          ipha->ipha_type_of_service = ipp->ipp_type_of_service;
13595 13596          ipha->ipha_length = 0;  /* Caller will set later */
13596 13597          ipha->ipha_ident = 0;
13597 13598          ipha->ipha_fragment_offset_and_flags = 0;
13598 13599          ipha->ipha_ttl = ipp->ipp_unicast_hops;
13599 13600          ipha->ipha_protocol = protocol;
13600 13601          ipha->ipha_hdr_checksum = 0;
13601 13602  
13602 13603          if ((ipp->ipp_fields & IPPF_ADDR) &&
13603 13604              IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
13604 13605                  ipha->ipha_src = ipp->ipp_addr_v4;
13605 13606  
13606 13607          cp = (uint8_t *)&ipha[1];
13607 13608          if (ipp->ipp_fields & IPPF_LABEL_V4) {
13608 13609                  ASSERT(ipp->ipp_label_len_v4 != 0);
13609 13610                  bcopy(ipp->ipp_label_v4, cp, ipp->ipp_label_len_v4);
13610 13611                  cp += ipp->ipp_label_len_v4;
13611 13612                  /* We need to round up here */
13612 13613                  while ((uintptr_t)cp & 0x3) {
13613 13614                          *cp++ = IPOPT_NOP;
13614 13615                  }
13615 13616          }
13616 13617  
13617 13618          if (ipp->ipp_fields & IPPF_IPV4_OPTIONS) {
13618 13619                  ASSERT(ipp->ipp_ipv4_options_len != 0);
13619 13620                  ASSERT((ipp->ipp_ipv4_options_len & 3) == 0);
13620 13621                  bcopy(ipp->ipp_ipv4_options, cp, ipp->ipp_ipv4_options_len);
13621 13622                  cp += ipp->ipp_ipv4_options_len;
13622 13623          }
13623 13624          ipha->ipha_version_and_hdr_length =
13624 13625              (uint8_t)((IP_VERSION << 4) + buf_len / 4);
13625 13626  
13626 13627          ASSERT((int)(cp - buf) == buf_len);
13627 13628  }
13628 13629  
13629 13630  /* Allocate the private structure */
13630 13631  static int
13631 13632  ip_priv_alloc(void **bufp)
13632 13633  {
13633 13634          void    *buf;
13634 13635  
13635 13636          if ((buf = kmem_alloc(sizeof (ip_priv_t), KM_NOSLEEP)) == NULL)
13636 13637                  return (ENOMEM);
13637 13638  
13638 13639          *bufp = buf;
13639 13640          return (0);
13640 13641  }
13641 13642  
13642 13643  /* Function to delete the private structure */
13643 13644  void
13644 13645  ip_priv_free(void *buf)
13645 13646  {
13646 13647          ASSERT(buf != NULL);
13647 13648          kmem_free(buf, sizeof (ip_priv_t));
13648 13649  }
13649 13650  
13650 13651  /*
13651 13652   * The entry point for IPPF processing.
13652 13653   * If the classifier (IPGPC_CLASSIFY) is not loaded and configured, the
13653 13654   * routine just returns.
13654 13655   *
13655 13656   * When called, ip_process generates an ipp_packet_t structure
13656 13657   * which holds the state information for this packet and invokes the
13657 13658   * the classifier (via ipp_packet_process). The classification, depending on
13658 13659   * configured filters, results in a list of actions for this packet. Invoking
13659 13660   * an action may cause the packet to be dropped, in which case we return NULL.
13660 13661   * proc indicates the callout position for
13661 13662   * this packet and ill is the interface this packet arrived on or will leave
13662 13663   * on (inbound and outbound resp.).
13663 13664   *
13664 13665   * We do the processing on the rill (mapped to the upper if ipmp), but MIB
13665 13666   * on the ill corrsponding to the destination IP address.
13666 13667   */
13667 13668  mblk_t *
13668 13669  ip_process(ip_proc_t proc, mblk_t *mp, ill_t *rill, ill_t *ill)
13669 13670  {
13670 13671          ip_priv_t       *priv;
13671 13672          ipp_action_id_t aid;
13672 13673          int             rc = 0;
13673 13674          ipp_packet_t    *pp;
13674 13675  
13675 13676          /* If the classifier is not loaded, return  */
13676 13677          if ((aid = ipp_action_lookup(IPGPC_CLASSIFY)) == IPP_ACTION_INVAL) {
13677 13678                  return (mp);
13678 13679          }
13679 13680  
13680 13681          ASSERT(mp != NULL);
13681 13682  
13682 13683          /* Allocate the packet structure */
13683 13684          rc = ipp_packet_alloc(&pp, "ip", aid);
13684 13685          if (rc != 0)
13685 13686                  goto drop;
13686 13687  
13687 13688          /* Allocate the private structure */
13688 13689          rc = ip_priv_alloc((void **)&priv);
13689 13690          if (rc != 0) {
13690 13691                  ipp_packet_free(pp);
13691 13692                  goto drop;
13692 13693          }
13693 13694          priv->proc = proc;
13694 13695          priv->ill_index = ill_get_upper_ifindex(rill);
13695 13696  
13696 13697          ipp_packet_set_private(pp, priv, ip_priv_free);
13697 13698          ipp_packet_set_data(pp, mp);
13698 13699  
13699 13700          /* Invoke the classifier */
13700 13701          rc = ipp_packet_process(&pp);
13701 13702          if (pp != NULL) {
13702 13703                  mp = ipp_packet_get_data(pp);
13703 13704                  ipp_packet_free(pp);
13704 13705                  if (rc != 0)
13705 13706                          goto drop;
13706 13707                  return (mp);
13707 13708          } else {
13708 13709                  /* No mp to trace in ip_drop_input/ip_drop_output  */
13709 13710                  mp = NULL;
13710 13711          }
13711 13712  drop:
13712 13713          if (proc == IPP_LOCAL_IN || proc == IPP_FWD_IN) {
13713 13714                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
13714 13715                  ip_drop_input("ip_process", mp, ill);
13715 13716          } else {
13716 13717                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
13717 13718                  ip_drop_output("ip_process", mp, ill);
13718 13719          }
13719 13720          freemsg(mp);
13720 13721          return (NULL);
13721 13722  }
13722 13723  
13723 13724  /*
13724 13725   * Propagate a multicast group membership operation (add/drop) on
13725 13726   * all the interfaces crossed by the related multirt routes.
13726 13727   * The call is considered successful if the operation succeeds
13727 13728   * on at least one interface.
13728 13729   *
13729 13730   * This assumes that a set of IRE_HOST/RTF_MULTIRT has been created for the
13730 13731   * multicast addresses with the ire argument being the first one.
13731 13732   * We walk the bucket to find all the of those.
13732 13733   *
13733 13734   * Common to IPv4 and IPv6.
13734 13735   */
13735 13736  static int
13736 13737  ip_multirt_apply_membership(int (*fn)(conn_t *, boolean_t,
13737 13738      const in6_addr_t *, ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *),
13738 13739      ire_t *ire, conn_t *connp, boolean_t checkonly, const in6_addr_t *v6group,
13739 13740      mcast_record_t fmode, const in6_addr_t *v6src)
13740 13741  {
13741 13742          ire_t           *ire_gw;
13742 13743          irb_t           *irb;
13743 13744          int             ifindex;
13744 13745          int             error = 0;
13745 13746          int             result;
13746 13747          ip_stack_t      *ipst = ire->ire_ipst;
13747 13748          ipaddr_t        group;
13748 13749          boolean_t       isv6;
13749 13750          int             match_flags;
13750 13751  
13751 13752          if (IN6_IS_ADDR_V4MAPPED(v6group)) {
13752 13753                  IN6_V4MAPPED_TO_IPADDR(v6group, group);
13753 13754                  isv6 = B_FALSE;
13754 13755          } else {
13755 13756                  isv6 = B_TRUE;
13756 13757          }
13757 13758  
13758 13759          irb = ire->ire_bucket;
13759 13760          ASSERT(irb != NULL);
13760 13761  
13761 13762          result = 0;
13762 13763          irb_refhold(irb);
13763 13764          for (; ire != NULL; ire = ire->ire_next) {
13764 13765                  if ((ire->ire_flags & RTF_MULTIRT) == 0)
13765 13766                          continue;
13766 13767  
13767 13768                  /* We handle -ifp routes by matching on the ill if set */
13768 13769                  match_flags = MATCH_IRE_TYPE;
13769 13770                  if (ire->ire_ill != NULL)
13770 13771                          match_flags |= MATCH_IRE_ILL;
13771 13772  
13772 13773                  if (isv6) {
13773 13774                          if (!IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, v6group))
13774 13775                                  continue;
13775 13776  
13776 13777                          ire_gw = ire_ftable_lookup_v6(&ire->ire_gateway_addr_v6,
13777 13778                              0, 0, IRE_INTERFACE, ire->ire_ill, ALL_ZONES, NULL,
13778 13779                              match_flags, 0, ipst, NULL);
13779 13780                  } else {
13780 13781                          if (ire->ire_addr != group)
13781 13782                                  continue;
13782 13783  
13783 13784                          ire_gw = ire_ftable_lookup_v4(ire->ire_gateway_addr,
13784 13785                              0, 0, IRE_INTERFACE, ire->ire_ill, ALL_ZONES, NULL,
13785 13786                              match_flags, 0, ipst, NULL);
13786 13787                  }
13787 13788                  /* No interface route exists for the gateway; skip this ire. */
13788 13789                  if (ire_gw == NULL)
13789 13790                          continue;
13790 13791                  if (ire_gw->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
13791 13792                          ire_refrele(ire_gw);
13792 13793                          continue;
13793 13794                  }
13794 13795                  ASSERT(ire_gw->ire_ill != NULL);        /* IRE_INTERFACE */
13795 13796                  ifindex = ire_gw->ire_ill->ill_phyint->phyint_ifindex;
13796 13797  
13797 13798                  /*
13798 13799                   * The operation is considered a success if
13799 13800                   * it succeeds at least once on any one interface.
13800 13801                   */
13801 13802                  error = fn(connp, checkonly, v6group, INADDR_ANY, ifindex,
13802 13803                      fmode, v6src);
13803 13804                  if (error == 0)
13804 13805                          result = CGTP_MCAST_SUCCESS;
13805 13806  
13806 13807                  ire_refrele(ire_gw);
13807 13808          }
13808 13809          irb_refrele(irb);
13809 13810          /*
13810 13811           * Consider the call as successful if we succeeded on at least
13811 13812           * one interface. Otherwise, return the last encountered error.
13812 13813           */
13813 13814          return (result == CGTP_MCAST_SUCCESS ? 0 : error);
13814 13815  }
13815 13816  
13816 13817  /*
13817 13818   * Return the expected CGTP hooks version number.
13818 13819   */
13819 13820  int
13820 13821  ip_cgtp_filter_supported(void)
13821 13822  {
13822 13823          return (ip_cgtp_filter_rev);
13823 13824  }
13824 13825  
13825 13826  /*
13826 13827   * CGTP hooks can be registered by invoking this function.
13827 13828   * Checks that the version number matches.
13828 13829   */
13829 13830  int
13830 13831  ip_cgtp_filter_register(netstackid_t stackid, cgtp_filter_ops_t *ops)
13831 13832  {
13832 13833          netstack_t *ns;
13833 13834          ip_stack_t *ipst;
13834 13835  
13835 13836          if (ops->cfo_filter_rev != CGTP_FILTER_REV)
13836 13837                  return (ENOTSUP);
13837 13838  
13838 13839          ns = netstack_find_by_stackid(stackid);
13839 13840          if (ns == NULL)
13840 13841                  return (EINVAL);
13841 13842          ipst = ns->netstack_ip;
13842 13843          ASSERT(ipst != NULL);
13843 13844  
13844 13845          if (ipst->ips_ip_cgtp_filter_ops != NULL) {
13845 13846                  netstack_rele(ns);
13846 13847                  return (EALREADY);
13847 13848          }
13848 13849  
13849 13850          ipst->ips_ip_cgtp_filter_ops = ops;
13850 13851  
13851 13852          ill_set_inputfn_all(ipst);
13852 13853  
13853 13854          netstack_rele(ns);
13854 13855          return (0);
13855 13856  }
13856 13857  
13857 13858  /*
13858 13859   * CGTP hooks can be unregistered by invoking this function.
13859 13860   * Returns ENXIO if there was no registration.
13860 13861   * Returns EBUSY if the ndd variable has not been turned off.
13861 13862   */
13862 13863  int
13863 13864  ip_cgtp_filter_unregister(netstackid_t stackid)
13864 13865  {
13865 13866          netstack_t *ns;
13866 13867          ip_stack_t *ipst;
13867 13868  
13868 13869          ns = netstack_find_by_stackid(stackid);
13869 13870          if (ns == NULL)
13870 13871                  return (EINVAL);
13871 13872          ipst = ns->netstack_ip;
13872 13873          ASSERT(ipst != NULL);
13873 13874  
13874 13875          if (ipst->ips_ip_cgtp_filter) {
13875 13876                  netstack_rele(ns);
13876 13877                  return (EBUSY);
13877 13878          }
13878 13879  
13879 13880          if (ipst->ips_ip_cgtp_filter_ops == NULL) {
13880 13881                  netstack_rele(ns);
13881 13882                  return (ENXIO);
13882 13883          }
13883 13884          ipst->ips_ip_cgtp_filter_ops = NULL;
13884 13885  
13885 13886          ill_set_inputfn_all(ipst);
13886 13887  
13887 13888          netstack_rele(ns);
13888 13889          return (0);
13889 13890  }
13890 13891  
13891 13892  /*
13892 13893   * Check whether there is a CGTP filter registration.
13893 13894   * Returns non-zero if there is a registration, otherwise returns zero.
13894 13895   * Note: returns zero if bad stackid.
13895 13896   */
13896 13897  int
13897 13898  ip_cgtp_filter_is_registered(netstackid_t stackid)
13898 13899  {
13899 13900          netstack_t *ns;
13900 13901          ip_stack_t *ipst;
13901 13902          int ret;
13902 13903  
13903 13904          ns = netstack_find_by_stackid(stackid);
13904 13905          if (ns == NULL)
13905 13906                  return (0);
13906 13907          ipst = ns->netstack_ip;
13907 13908          ASSERT(ipst != NULL);
13908 13909  
13909 13910          if (ipst->ips_ip_cgtp_filter_ops != NULL)
13910 13911                  ret = 1;
13911 13912          else
13912 13913                  ret = 0;
13913 13914  
13914 13915          netstack_rele(ns);
13915 13916          return (ret);
13916 13917  }
13917 13918  
13918 13919  static int
13919 13920  ip_squeue_switch(int val)
13920 13921  {
13921 13922          int rval;
13922 13923  
13923 13924          switch (val) {
13924 13925          case IP_SQUEUE_ENTER_NODRAIN:
13925 13926                  rval = SQ_NODRAIN;
13926 13927                  break;
13927 13928          case IP_SQUEUE_ENTER:
13928 13929                  rval = SQ_PROCESS;
13929 13930                  break;
13930 13931          case IP_SQUEUE_FILL:
13931 13932          default:
13932 13933                  rval = SQ_FILL;
13933 13934                  break;
13934 13935          }
13935 13936          return (rval);
13936 13937  }
13937 13938  
13938 13939  static void *
13939 13940  ip_kstat2_init(netstackid_t stackid, ip_stat_t *ip_statisticsp)
13940 13941  {
13941 13942          kstat_t *ksp;
13942 13943  
13943 13944          ip_stat_t template = {
13944 13945                  { "ip_udp_fannorm",             KSTAT_DATA_UINT64 },
13945 13946                  { "ip_udp_fanmb",               KSTAT_DATA_UINT64 },
13946 13947                  { "ip_recv_pullup",             KSTAT_DATA_UINT64 },
13947 13948                  { "ip_db_ref",                  KSTAT_DATA_UINT64 },
13948 13949                  { "ip_notaligned",              KSTAT_DATA_UINT64 },
13949 13950                  { "ip_multimblk",               KSTAT_DATA_UINT64 },
13950 13951                  { "ip_opt",                     KSTAT_DATA_UINT64 },
13951 13952                  { "ipsec_proto_ahesp",          KSTAT_DATA_UINT64 },
13952 13953                  { "ip_conn_flputbq",            KSTAT_DATA_UINT64 },
13953 13954                  { "ip_conn_walk_drain",         KSTAT_DATA_UINT64 },
13954 13955                  { "ip_out_sw_cksum",            KSTAT_DATA_UINT64 },
13955 13956                  { "ip_out_sw_cksum_bytes",      KSTAT_DATA_UINT64 },
13956 13957                  { "ip_in_sw_cksum",             KSTAT_DATA_UINT64 },
13957 13958                  { "ip_ire_reclaim_calls",       KSTAT_DATA_UINT64 },
13958 13959                  { "ip_ire_reclaim_deleted",     KSTAT_DATA_UINT64 },
13959 13960                  { "ip_nce_reclaim_calls",       KSTAT_DATA_UINT64 },
13960 13961                  { "ip_nce_reclaim_deleted",     KSTAT_DATA_UINT64 },
13961 13962                  { "ip_nce_mcast_reclaim_calls", KSTAT_DATA_UINT64 },
13962 13963                  { "ip_nce_mcast_reclaim_deleted",       KSTAT_DATA_UINT64 },
13963 13964                  { "ip_nce_mcast_reclaim_tqfail",        KSTAT_DATA_UINT64 },
13964 13965                  { "ip_dce_reclaim_calls",       KSTAT_DATA_UINT64 },
13965 13966                  { "ip_dce_reclaim_deleted",     KSTAT_DATA_UINT64 },
13966 13967                  { "ip_tcp_in_full_hw_cksum_err",        KSTAT_DATA_UINT64 },
13967 13968                  { "ip_tcp_in_part_hw_cksum_err",        KSTAT_DATA_UINT64 },

↓ open down ↓

13930 lines elided

↑ open up ↑

13968 13969                  { "ip_tcp_in_sw_cksum_err",             KSTAT_DATA_UINT64 },
13969 13970                  { "ip_udp_in_full_hw_cksum_err",        KSTAT_DATA_UINT64 },
13970 13971                  { "ip_udp_in_part_hw_cksum_err",        KSTAT_DATA_UINT64 },
13971 13972                  { "ip_udp_in_sw_cksum_err",     KSTAT_DATA_UINT64 },
13972 13973                  { "conn_in_recvdstaddr",        KSTAT_DATA_UINT64 },
13973 13974                  { "conn_in_recvopts",           KSTAT_DATA_UINT64 },
13974 13975                  { "conn_in_recvif",             KSTAT_DATA_UINT64 },
13975 13976                  { "conn_in_recvslla",           KSTAT_DATA_UINT64 },
13976 13977                  { "conn_in_recvucred",          KSTAT_DATA_UINT64 },
13977 13978                  { "conn_in_recvttl",            KSTAT_DATA_UINT64 },
     13979 +                { "conn_in_recvtos",            KSTAT_DATA_UINT64 },
13978 13980                  { "conn_in_recvhopopts",        KSTAT_DATA_UINT64 },
13979 13981                  { "conn_in_recvhoplimit",       KSTAT_DATA_UINT64 },
13980 13982                  { "conn_in_recvdstopts",        KSTAT_DATA_UINT64 },
13981 13983                  { "conn_in_recvrthdrdstopts",   KSTAT_DATA_UINT64 },
13982 13984                  { "conn_in_recvrthdr",          KSTAT_DATA_UINT64 },
13983 13985                  { "conn_in_recvpktinfo",        KSTAT_DATA_UINT64 },
13984 13986                  { "conn_in_recvtclass",         KSTAT_DATA_UINT64 },
13985 13987                  { "conn_in_timestamp",          KSTAT_DATA_UINT64 },
13986 13988          };
13987 13989

13988 13990          ksp = kstat_create_netstack("ip", 0, "ipstat", "net",
13989 13991              KSTAT_TYPE_NAMED, sizeof (template) / sizeof (kstat_named_t),
13990 13992              KSTAT_FLAG_VIRTUAL, stackid);
13991 13993  
13992 13994          if (ksp == NULL)
13993 13995                  return (NULL);
13994 13996  
13995 13997          bcopy(&template, ip_statisticsp, sizeof (template));
13996 13998          ksp->ks_data = (void *)ip_statisticsp;
13997 13999          ksp->ks_private = (void *)(uintptr_t)stackid;
13998 14000  
13999 14001          kstat_install(ksp);
14000 14002          return (ksp);
14001 14003  }
14002 14004  
14003 14005  static void
14004 14006  ip_kstat2_fini(netstackid_t stackid, kstat_t *ksp)
14005 14007  {
14006 14008          if (ksp != NULL) {
14007 14009                  ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private);
14008 14010                  kstat_delete_netstack(ksp, stackid);
14009 14011          }
14010 14012  }
14011 14013  
14012 14014  static void *
14013 14015  ip_kstat_init(netstackid_t stackid, ip_stack_t *ipst)
14014 14016  {
14015 14017          kstat_t *ksp;
14016 14018  
14017 14019          ip_named_kstat_t template = {
14018 14020                  { "forwarding",         KSTAT_DATA_UINT32, 0 },
14019 14021                  { "defaultTTL",         KSTAT_DATA_UINT32, 0 },
14020 14022                  { "inReceives",         KSTAT_DATA_UINT64, 0 },
14021 14023                  { "inHdrErrors",        KSTAT_DATA_UINT32, 0 },
14022 14024                  { "inAddrErrors",       KSTAT_DATA_UINT32, 0 },
14023 14025                  { "forwDatagrams",      KSTAT_DATA_UINT64, 0 },
14024 14026                  { "inUnknownProtos",    KSTAT_DATA_UINT32, 0 },
14025 14027                  { "inDiscards",         KSTAT_DATA_UINT32, 0 },
14026 14028                  { "inDelivers",         KSTAT_DATA_UINT64, 0 },
14027 14029                  { "outRequests",        KSTAT_DATA_UINT64, 0 },
14028 14030                  { "outDiscards",        KSTAT_DATA_UINT32, 0 },
14029 14031                  { "outNoRoutes",        KSTAT_DATA_UINT32, 0 },
14030 14032                  { "reasmTimeout",       KSTAT_DATA_UINT32, 0 },
14031 14033                  { "reasmReqds",         KSTAT_DATA_UINT32, 0 },
14032 14034                  { "reasmOKs",           KSTAT_DATA_UINT32, 0 },
14033 14035                  { "reasmFails",         KSTAT_DATA_UINT32, 0 },
14034 14036                  { "fragOKs",            KSTAT_DATA_UINT32, 0 },
14035 14037                  { "fragFails",          KSTAT_DATA_UINT32, 0 },
14036 14038                  { "fragCreates",        KSTAT_DATA_UINT32, 0 },
14037 14039                  { "addrEntrySize",      KSTAT_DATA_INT32, 0 },
14038 14040                  { "routeEntrySize",     KSTAT_DATA_INT32, 0 },
14039 14041                  { "netToMediaEntrySize",        KSTAT_DATA_INT32, 0 },
14040 14042                  { "routingDiscards",    KSTAT_DATA_UINT32, 0 },
14041 14043                  { "inErrs",             KSTAT_DATA_UINT32, 0 },
14042 14044                  { "noPorts",            KSTAT_DATA_UINT32, 0 },
14043 14045                  { "inCksumErrs",        KSTAT_DATA_UINT32, 0 },
14044 14046                  { "reasmDuplicates",    KSTAT_DATA_UINT32, 0 },
14045 14047                  { "reasmPartDups",      KSTAT_DATA_UINT32, 0 },
14046 14048                  { "forwProhibits",      KSTAT_DATA_UINT32, 0 },
14047 14049                  { "udpInCksumErrs",     KSTAT_DATA_UINT32, 0 },
14048 14050                  { "udpInOverflows",     KSTAT_DATA_UINT32, 0 },
14049 14051                  { "rawipInOverflows",   KSTAT_DATA_UINT32, 0 },
14050 14052                  { "ipsecInSucceeded",   KSTAT_DATA_UINT32, 0 },
14051 14053                  { "ipsecInFailed",      KSTAT_DATA_INT32, 0 },
14052 14054                  { "memberEntrySize",    KSTAT_DATA_INT32, 0 },
14053 14055                  { "inIPv6",             KSTAT_DATA_UINT32, 0 },
14054 14056                  { "outIPv6",            KSTAT_DATA_UINT32, 0 },
14055 14057                  { "outSwitchIPv6",      KSTAT_DATA_UINT32, 0 },
14056 14058          };
14057 14059  
14058 14060          ksp = kstat_create_netstack("ip", 0, "ip", "mib2", KSTAT_TYPE_NAMED,
14059 14061              NUM_OF_FIELDS(ip_named_kstat_t), 0, stackid);
14060 14062          if (ksp == NULL || ksp->ks_data == NULL)
14061 14063                  return (NULL);
14062 14064  
14063 14065          template.forwarding.value.ui32 = WE_ARE_FORWARDING(ipst) ? 1:2;
14064 14066          template.defaultTTL.value.ui32 = (uint32_t)ipst->ips_ip_def_ttl;
14065 14067          template.reasmTimeout.value.ui32 = ipst->ips_ip_reassembly_timeout;
14066 14068          template.addrEntrySize.value.i32 = sizeof (mib2_ipAddrEntry_t);
14067 14069          template.routeEntrySize.value.i32 = sizeof (mib2_ipRouteEntry_t);
14068 14070  
14069 14071          template.netToMediaEntrySize.value.i32 =
14070 14072              sizeof (mib2_ipNetToMediaEntry_t);
14071 14073  
14072 14074          template.memberEntrySize.value.i32 = sizeof (ipv6_member_t);
14073 14075  
14074 14076          bcopy(&template, ksp->ks_data, sizeof (template));
14075 14077          ksp->ks_update = ip_kstat_update;
14076 14078          ksp->ks_private = (void *)(uintptr_t)stackid;
14077 14079  
14078 14080          kstat_install(ksp);
14079 14081          return (ksp);
14080 14082  }
14081 14083  
14082 14084  static void
14083 14085  ip_kstat_fini(netstackid_t stackid, kstat_t *ksp)
14084 14086  {
14085 14087          if (ksp != NULL) {
14086 14088                  ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private);
14087 14089                  kstat_delete_netstack(ksp, stackid);
14088 14090          }
14089 14091  }
14090 14092  
14091 14093  static int
14092 14094  ip_kstat_update(kstat_t *kp, int rw)
14093 14095  {
14094 14096          ip_named_kstat_t *ipkp;
14095 14097          mib2_ipIfStatsEntry_t ipmib;
14096 14098          ill_walk_context_t ctx;
14097 14099          ill_t *ill;
14098 14100          netstackid_t    stackid = (zoneid_t)(uintptr_t)kp->ks_private;
14099 14101          netstack_t      *ns;
14100 14102          ip_stack_t      *ipst;
14101 14103  
14102 14104          if (kp->ks_data == NULL)
14103 14105                  return (EIO);
14104 14106  
14105 14107          if (rw == KSTAT_WRITE)
14106 14108                  return (EACCES);
14107 14109  
14108 14110          ns = netstack_find_by_stackid(stackid);
14109 14111          if (ns == NULL)
14110 14112                  return (-1);
14111 14113          ipst = ns->netstack_ip;
14112 14114          if (ipst == NULL) {
14113 14115                  netstack_rele(ns);
14114 14116                  return (-1);
14115 14117          }
14116 14118          ipkp = (ip_named_kstat_t *)kp->ks_data;
14117 14119  
14118 14120          bcopy(&ipst->ips_ip_mib, &ipmib, sizeof (ipmib));
14119 14121          rw_enter(&ipst->ips_ill_g_lock, RW_READER);
14120 14122          ill = ILL_START_WALK_V4(&ctx, ipst);
14121 14123          for (; ill != NULL; ill = ill_next(&ctx, ill))
14122 14124                  ip_mib2_add_ip_stats(&ipmib, ill->ill_ip_mib);
14123 14125          rw_exit(&ipst->ips_ill_g_lock);
14124 14126  
14125 14127          ipkp->forwarding.value.ui32 =           ipmib.ipIfStatsForwarding;
14126 14128          ipkp->defaultTTL.value.ui32 =           ipmib.ipIfStatsDefaultTTL;
14127 14129          ipkp->inReceives.value.ui64 =           ipmib.ipIfStatsHCInReceives;
14128 14130          ipkp->inHdrErrors.value.ui32 =          ipmib.ipIfStatsInHdrErrors;
14129 14131          ipkp->inAddrErrors.value.ui32 =         ipmib.ipIfStatsInAddrErrors;
14130 14132          ipkp->forwDatagrams.value.ui64 = ipmib.ipIfStatsHCOutForwDatagrams;
14131 14133          ipkp->inUnknownProtos.value.ui32 =      ipmib.ipIfStatsInUnknownProtos;
14132 14134          ipkp->inDiscards.value.ui32 =           ipmib.ipIfStatsInDiscards;
14133 14135          ipkp->inDelivers.value.ui64 =           ipmib.ipIfStatsHCInDelivers;
14134 14136          ipkp->outRequests.value.ui64 =          ipmib.ipIfStatsHCOutRequests;
14135 14137          ipkp->outDiscards.value.ui32 =          ipmib.ipIfStatsOutDiscards;
14136 14138          ipkp->outNoRoutes.value.ui32 =          ipmib.ipIfStatsOutNoRoutes;
14137 14139          ipkp->reasmTimeout.value.ui32 =         ipst->ips_ip_reassembly_timeout;
14138 14140          ipkp->reasmReqds.value.ui32 =           ipmib.ipIfStatsReasmReqds;
14139 14141          ipkp->reasmOKs.value.ui32 =             ipmib.ipIfStatsReasmOKs;
14140 14142          ipkp->reasmFails.value.ui32 =           ipmib.ipIfStatsReasmFails;
14141 14143          ipkp->fragOKs.value.ui32 =              ipmib.ipIfStatsOutFragOKs;
14142 14144          ipkp->fragFails.value.ui32 =            ipmib.ipIfStatsOutFragFails;
14143 14145          ipkp->fragCreates.value.ui32 =          ipmib.ipIfStatsOutFragCreates;
14144 14146  
14145 14147          ipkp->routingDiscards.value.ui32 =      0;
14146 14148          ipkp->inErrs.value.ui32 =               ipmib.tcpIfStatsInErrs;
14147 14149          ipkp->noPorts.value.ui32 =              ipmib.udpIfStatsNoPorts;
14148 14150          ipkp->inCksumErrs.value.ui32 =          ipmib.ipIfStatsInCksumErrs;
14149 14151          ipkp->reasmDuplicates.value.ui32 =      ipmib.ipIfStatsReasmDuplicates;
14150 14152          ipkp->reasmPartDups.value.ui32 =        ipmib.ipIfStatsReasmPartDups;
14151 14153          ipkp->forwProhibits.value.ui32 =        ipmib.ipIfStatsForwProhibits;
14152 14154          ipkp->udpInCksumErrs.value.ui32 =       ipmib.udpIfStatsInCksumErrs;
14153 14155          ipkp->udpInOverflows.value.ui32 =       ipmib.udpIfStatsInOverflows;
14154 14156          ipkp->rawipInOverflows.value.ui32 =     ipmib.rawipIfStatsInOverflows;
14155 14157          ipkp->ipsecInSucceeded.value.ui32 =     ipmib.ipsecIfStatsInSucceeded;
14156 14158          ipkp->ipsecInFailed.value.i32 =         ipmib.ipsecIfStatsInFailed;
14157 14159  
14158 14160          ipkp->inIPv6.value.ui32 =       ipmib.ipIfStatsInWrongIPVersion;
14159 14161          ipkp->outIPv6.value.ui32 =      ipmib.ipIfStatsOutWrongIPVersion;
14160 14162          ipkp->outSwitchIPv6.value.ui32 = ipmib.ipIfStatsOutSwitchIPVersion;
14161 14163  
14162 14164          netstack_rele(ns);
14163 14165  
14164 14166          return (0);
14165 14167  }
14166 14168  
14167 14169  static void *
14168 14170  icmp_kstat_init(netstackid_t stackid)
14169 14171  {
14170 14172          kstat_t *ksp;
14171 14173  
14172 14174          icmp_named_kstat_t template = {
14173 14175                  { "inMsgs",             KSTAT_DATA_UINT32 },
14174 14176                  { "inErrors",           KSTAT_DATA_UINT32 },
14175 14177                  { "inDestUnreachs",     KSTAT_DATA_UINT32 },
14176 14178                  { "inTimeExcds",        KSTAT_DATA_UINT32 },
14177 14179                  { "inParmProbs",        KSTAT_DATA_UINT32 },
14178 14180                  { "inSrcQuenchs",       KSTAT_DATA_UINT32 },
14179 14181                  { "inRedirects",        KSTAT_DATA_UINT32 },
14180 14182                  { "inEchos",            KSTAT_DATA_UINT32 },
14181 14183                  { "inEchoReps",         KSTAT_DATA_UINT32 },
14182 14184                  { "inTimestamps",       KSTAT_DATA_UINT32 },
14183 14185                  { "inTimestampReps",    KSTAT_DATA_UINT32 },
14184 14186                  { "inAddrMasks",        KSTAT_DATA_UINT32 },
14185 14187                  { "inAddrMaskReps",     KSTAT_DATA_UINT32 },
14186 14188                  { "outMsgs",            KSTAT_DATA_UINT32 },
14187 14189                  { "outErrors",          KSTAT_DATA_UINT32 },
14188 14190                  { "outDestUnreachs",    KSTAT_DATA_UINT32 },
14189 14191                  { "outTimeExcds",       KSTAT_DATA_UINT32 },
14190 14192                  { "outParmProbs",       KSTAT_DATA_UINT32 },
14191 14193                  { "outSrcQuenchs",      KSTAT_DATA_UINT32 },
14192 14194                  { "outRedirects",       KSTAT_DATA_UINT32 },
14193 14195                  { "outEchos",           KSTAT_DATA_UINT32 },
14194 14196                  { "outEchoReps",        KSTAT_DATA_UINT32 },
14195 14197                  { "outTimestamps",      KSTAT_DATA_UINT32 },
14196 14198                  { "outTimestampReps",   KSTAT_DATA_UINT32 },
14197 14199                  { "outAddrMasks",       KSTAT_DATA_UINT32 },
14198 14200                  { "outAddrMaskReps",    KSTAT_DATA_UINT32 },
14199 14201                  { "inChksumErrs",       KSTAT_DATA_UINT32 },
14200 14202                  { "inUnknowns",         KSTAT_DATA_UINT32 },
14201 14203                  { "inFragNeeded",       KSTAT_DATA_UINT32 },
14202 14204                  { "outFragNeeded",      KSTAT_DATA_UINT32 },
14203 14205                  { "outDrops",           KSTAT_DATA_UINT32 },
14204 14206                  { "inOverFlows",        KSTAT_DATA_UINT32 },
14205 14207                  { "inBadRedirects",     KSTAT_DATA_UINT32 },
14206 14208          };
14207 14209  
14208 14210          ksp = kstat_create_netstack("ip", 0, "icmp", "mib2", KSTAT_TYPE_NAMED,
14209 14211              NUM_OF_FIELDS(icmp_named_kstat_t), 0, stackid);
14210 14212          if (ksp == NULL || ksp->ks_data == NULL)
14211 14213                  return (NULL);
14212 14214  
14213 14215          bcopy(&template, ksp->ks_data, sizeof (template));
14214 14216  
14215 14217          ksp->ks_update = icmp_kstat_update;
14216 14218          ksp->ks_private = (void *)(uintptr_t)stackid;
14217 14219  
14218 14220          kstat_install(ksp);
14219 14221          return (ksp);
14220 14222  }
14221 14223  
14222 14224  static void
14223 14225  icmp_kstat_fini(netstackid_t stackid, kstat_t *ksp)
14224 14226  {
14225 14227          if (ksp != NULL) {
14226 14228                  ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private);
14227 14229                  kstat_delete_netstack(ksp, stackid);
14228 14230          }
14229 14231  }
14230 14232  
14231 14233  static int
14232 14234  icmp_kstat_update(kstat_t *kp, int rw)
14233 14235  {
14234 14236          icmp_named_kstat_t *icmpkp;
14235 14237          netstackid_t    stackid = (zoneid_t)(uintptr_t)kp->ks_private;
14236 14238          netstack_t      *ns;
14237 14239          ip_stack_t      *ipst;
14238 14240  
14239 14241          if (kp->ks_data == NULL)
14240 14242                  return (EIO);
14241 14243  
14242 14244          if (rw == KSTAT_WRITE)
14243 14245                  return (EACCES);
14244 14246  
14245 14247          ns = netstack_find_by_stackid(stackid);
14246 14248          if (ns == NULL)
14247 14249                  return (-1);
14248 14250          ipst = ns->netstack_ip;
14249 14251          if (ipst == NULL) {
14250 14252                  netstack_rele(ns);
14251 14253                  return (-1);
14252 14254          }
14253 14255          icmpkp = (icmp_named_kstat_t *)kp->ks_data;
14254 14256  
14255 14257          icmpkp->inMsgs.value.ui32 =         ipst->ips_icmp_mib.icmpInMsgs;
14256 14258          icmpkp->inErrors.value.ui32 =       ipst->ips_icmp_mib.icmpInErrors;
14257 14259          icmpkp->inDestUnreachs.value.ui32 =
14258 14260              ipst->ips_icmp_mib.icmpInDestUnreachs;
14259 14261          icmpkp->inTimeExcds.value.ui32 =    ipst->ips_icmp_mib.icmpInTimeExcds;
14260 14262          icmpkp->inParmProbs.value.ui32 =    ipst->ips_icmp_mib.icmpInParmProbs;
14261 14263          icmpkp->inSrcQuenchs.value.ui32 =   ipst->ips_icmp_mib.icmpInSrcQuenchs;
14262 14264          icmpkp->inRedirects.value.ui32 =    ipst->ips_icmp_mib.icmpInRedirects;
14263 14265          icmpkp->inEchos.value.ui32 =        ipst->ips_icmp_mib.icmpInEchos;
14264 14266          icmpkp->inEchoReps.value.ui32 =     ipst->ips_icmp_mib.icmpInEchoReps;
14265 14267          icmpkp->inTimestamps.value.ui32 =   ipst->ips_icmp_mib.icmpInTimestamps;
14266 14268          icmpkp->inTimestampReps.value.ui32 =
14267 14269              ipst->ips_icmp_mib.icmpInTimestampReps;
14268 14270          icmpkp->inAddrMasks.value.ui32 =    ipst->ips_icmp_mib.icmpInAddrMasks;
14269 14271          icmpkp->inAddrMaskReps.value.ui32 =
14270 14272              ipst->ips_icmp_mib.icmpInAddrMaskReps;
14271 14273          icmpkp->outMsgs.value.ui32 =        ipst->ips_icmp_mib.icmpOutMsgs;
14272 14274          icmpkp->outErrors.value.ui32 =      ipst->ips_icmp_mib.icmpOutErrors;
14273 14275          icmpkp->outDestUnreachs.value.ui32 =
14274 14276              ipst->ips_icmp_mib.icmpOutDestUnreachs;
14275 14277          icmpkp->outTimeExcds.value.ui32 =   ipst->ips_icmp_mib.icmpOutTimeExcds;
14276 14278          icmpkp->outParmProbs.value.ui32 =   ipst->ips_icmp_mib.icmpOutParmProbs;
14277 14279          icmpkp->outSrcQuenchs.value.ui32 =
14278 14280              ipst->ips_icmp_mib.icmpOutSrcQuenchs;
14279 14281          icmpkp->outRedirects.value.ui32 =   ipst->ips_icmp_mib.icmpOutRedirects;
14280 14282          icmpkp->outEchos.value.ui32 =       ipst->ips_icmp_mib.icmpOutEchos;
14281 14283          icmpkp->outEchoReps.value.ui32 =    ipst->ips_icmp_mib.icmpOutEchoReps;
14282 14284          icmpkp->outTimestamps.value.ui32 =
14283 14285              ipst->ips_icmp_mib.icmpOutTimestamps;
14284 14286          icmpkp->outTimestampReps.value.ui32 =
14285 14287              ipst->ips_icmp_mib.icmpOutTimestampReps;
14286 14288          icmpkp->outAddrMasks.value.ui32 =
14287 14289              ipst->ips_icmp_mib.icmpOutAddrMasks;
14288 14290          icmpkp->outAddrMaskReps.value.ui32 =
14289 14291              ipst->ips_icmp_mib.icmpOutAddrMaskReps;
14290 14292          icmpkp->inCksumErrs.value.ui32 =    ipst->ips_icmp_mib.icmpInCksumErrs;
14291 14293          icmpkp->inUnknowns.value.ui32 =     ipst->ips_icmp_mib.icmpInUnknowns;
14292 14294          icmpkp->inFragNeeded.value.ui32 =   ipst->ips_icmp_mib.icmpInFragNeeded;
14293 14295          icmpkp->outFragNeeded.value.ui32 =
14294 14296              ipst->ips_icmp_mib.icmpOutFragNeeded;
14295 14297          icmpkp->outDrops.value.ui32 =       ipst->ips_icmp_mib.icmpOutDrops;
14296 14298          icmpkp->inOverflows.value.ui32 =    ipst->ips_icmp_mib.icmpInOverflows;
14297 14299          icmpkp->inBadRedirects.value.ui32 =
14298 14300              ipst->ips_icmp_mib.icmpInBadRedirects;
14299 14301  
14300 14302          netstack_rele(ns);
14301 14303          return (0);
14302 14304  }
14303 14305  
14304 14306  /*
14305 14307   * This is the fanout function for raw socket opened for SCTP.  Note
14306 14308   * that it is called after SCTP checks that there is no socket which
14307 14309   * wants a packet.  Then before SCTP handles this out of the blue packet,
14308 14310   * this function is called to see if there is any raw socket for SCTP.
14309 14311   * If there is and it is bound to the correct address, the packet will
14310 14312   * be sent to that socket.  Note that only one raw socket can be bound to
14311 14313   * a port.  This is assured in ipcl_sctp_hash_insert().
              *
              * When ipcl_classify_raw() finds no conn, the packet is freed if
              * IRAF_SCTP_CSUM_ERR is set (counting sctpChecksumError) and is otherwise
              * passed to sctp_ootb_input().  When a conn is found, the packet is
              * freed if the conn is flow controlled (counting rawipIfStatsInOverflows),
              * run through ipsec_check_inbound_policy() when inbound policy is present
              * or IRAF_IPSEC_SECURE is set, and then handed to conn_recvicmp (when
              * IRAF_ICMP_ERROR is set) or conn_recv.
              *
              * ira->ira_ill and ira->ira_rill are set to NULL around the calls to
              * sctp_ootb_input() and conn_recv and restored afterwards.
14312 14314   */
14313 14315  void
14314 14316  ip_fanout_sctp_raw(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h, uint32_t ports,
14315 14317      ip_recv_attr_t *ira)
14316 14318  {
14317 14319          conn_t          *connp;
14318 14320          queue_t         *rq;
14319 14321          boolean_t       secure;
14320 14322          ill_t           *ill = ira->ira_ill;
14321 14323          ip_stack_t      *ipst = ill->ill_ipst;
14322 14324          ipsec_stack_t   *ipss = ipst->ips_netstack->netstack_ipsec;
14323 14325          sctp_stack_t    *sctps = ipst->ips_netstack->netstack_sctp;
14324 14326          iaflags_t       iraflags = ira->ira_flags;
14325 14327          ill_t           *rill = ira->ira_rill;
14326 14328  
14327 14329          secure = iraflags & IRAF_IPSEC_SECURE;
14328 14330  
14329 14331          connp = ipcl_classify_raw(mp, IPPROTO_SCTP, ports, ipha, ip6h,
14330 14332              ira, ipst);
14331 14333          if (connp == NULL) {
14332 14334                  /*
14333 14335                   * Although raw sctp is not summed, OOB chunks must be.
14334 14336                   * Drop the packet here if the sctp checksum failed.
14335 14337                   */
14336 14338                  if (iraflags & IRAF_SCTP_CSUM_ERR) {
14337 14339                          SCTPS_BUMP_MIB(sctps, sctpChecksumError);
14338 14340                          freemsg(mp);
14339 14341                          return;
14340 14342                  }
                             /* Hide the ills from SCTP for this call; restored below. */
14341 14343                  ira->ira_ill = ira->ira_rill = NULL;
14342 14344                  sctp_ootb_input(mp, ira, ipst);
14343 14345                  ira->ira_ill = ill;
14344 14346                  ira->ira_rill = rill;
14345 14347                  return;
14346 14348          }
14347 14349          rq = connp->conn_rq;
                     /*
                      * Non-STREAMS conns are checked via conn_flow_cntrld, STREAMS
                      * conns via canputnext() on the read queue.
                      */
14348 14350          if (IPCL_IS_NONSTR(connp) ? connp->conn_flow_cntrld : !canputnext(rq)) {
14349 14351                  CONN_DEC_REF(connp);
14350 14352                  BUMP_MIB(ill->ill_ip_mib, rawipIfStatsInOverflows);
14351 14353                  freemsg(mp);
14352 14354                  return;
14353 14355          }
14354 14356          if (((iraflags & IRAF_IS_IPV4) ?
14355 14357              CONN_INBOUND_POLICY_PRESENT(connp, ipss) :
14356 14358              CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss)) ||
14357 14359              secure) {
14358 14360                  mp = ipsec_check_inbound_policy(mp, connp, ipha,
14359 14361                      ip6h, ira);
14360 14362                  if (mp == NULL) {
14361 14363                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
14362 14364                          /* Note that mp is NULL */
14363 14365                          ip_drop_input("ipIfStatsInDiscards", mp, ill);
14364 14366                          CONN_DEC_REF(connp);
14365 14367                          return;
14366 14368                  }
14367 14369          }
14368 14370  
14369 14371          if (iraflags & IRAF_ICMP_ERROR) {
14370 14372                  (connp->conn_recvicmp)(connp, mp, NULL, ira);
14371 14373          } else {
                             /*
                              * NOTE(review): this declaration shadows the function-scope
                              * rill, which was initialized from the same ira field.
                              */
14372 14374                  ill_t *rill = ira->ira_rill;
14373 14375  
14374 14376                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
14375 14377                  /* This is the SOCK_RAW, IPPROTO_SCTP case. */
14376 14378                  ira->ira_ill = ira->ira_rill = NULL;
14377 14379                  (connp->conn_recv)(connp, mp, NULL, ira);
14378 14380                  ira->ira_ill = ill;
14379 14381                  ira->ira_rill = rill;
14380 14382          }
                     /*
                      * Drop the conn reference; presumably it was taken by
                      * ipcl_classify_raw() -- TODO confirm.
                      */
14381 14383          CONN_DEC_REF(connp);
14382 14384  }
14383 14385  
14384 14386  /*
14385 14387   * Free a packet that has the link-layer dl_unitdata_req_t or fast-path
14386 14388   * header before the ip payload.
              *
              * Counts the drop in ipIfStatsOutDiscards, removes the link-layer header
              * (by advancing b_rptr or by freeing the leading mblk) so that
              * ip_drop_output() is handed the remainder, and then frees the message.
              *
              * ill		interface whose MIB is updated and which is reported in
              *		the drop.
              * mp		message starting with the link-layer header.
              * is_fp_mp	ip_xmit() passes B_TRUE when the leading mblk is M_DATA.
              * fp_mp_len	ip_xmit() passes MBLKL(nce->nce_fp_mp), or 0 when the nce
              *		has no nce_fp_mp.
14387 14389   */
14388 14390  static void
14389 14391  ip_xmit_flowctl_drop(ill_t *ill, mblk_t *mp, boolean_t is_fp_mp, int fp_mp_len)
14390 14392  {
                     /* Number of bytes in the first mblk only, not the whole message. */
14391 14393          int len = (mp->b_wptr - mp->b_rptr);
14392 14394          mblk_t *ip_mp;
14393 14395  
14394 14396          BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
14395 14397          if (is_fp_mp || len != fp_mp_len) {
14396 14398                  if (len > fp_mp_len) {
14397 14399                          /*
14398 14400                           * fastpath header and ip header in the first mblk
14399 14401                           */
14400 14402                          mp->b_rptr += fp_mp_len;
14401 14403                  } else {
14402 14404                          /*
14403 14405                           * ip_xmit_attach_llhdr had to prepend an mblk to
14404 14406                           * attach the fastpath header before ip header.
14405 14407                           */
14406 14408                          ip_mp = mp->b_cont;
14407 14409                          freeb(mp);
14408 14410                          mp = ip_mp;
                                     /* Skip whatever part of the header spilled over. */
14409 14411                          mp->b_rptr += (fp_mp_len - len);
14410 14412                  }
14411 14413          } else {
                             /*
                              * Not flagged as fastpath and the first mblk is exactly
                              * fp_mp_len bytes: discard the leading mblk (presumably the
                              * dl_unitdata_req_t -- TODO confirm) and keep the rest.
                              */
14412 14414                  ip_mp = mp->b_cont;
14413 14415                  freeb(mp);
14414 14416                  mp = ip_mp;
14415 14417          }
14416 14418          ip_drop_output("ipIfStatsOutDiscards - flow ctl", mp, ill);
14417 14419          freemsg(mp);
14418 14420  }
14419 14421  
14420 14422  /*
14421 14423   * Normal post fragmentation function.
14422 14424   *
14423 14425   * Send a packet using the passed in nce. This handles both IPv4 and IPv6
14424 14426   * using the same state machine.
14425 14427   *
14426 14428   * We return an error on failure. In particular we return EWOULDBLOCK
14427 14429   * when the driver flow controls. In that case this ensures that ip_wsrv runs
14428 14430   * (currently by canputnext failure resulting in backenabling from GLD.)
14429 14431   * This allows the callers of conn_ip_output() to use EWOULDBLOCK as an
14430 14432   * indication that they can flow control until ip_wsrv() tells them to restart.
14431 14433   *
14432 14434   * If the nce passed by caller is incomplete, this function
14433 14435   * queues the packet and if necessary, sends ARP request and bails.
14434 14436   * If the Neighbor Cache passed is fully resolved, we simply prepend
14435 14437   * the link-layer header to the packet, do ipsec hw acceleration
14436 14438   * work if necessary, and send the packet out on the wire.
              *
              * Arguments:
              *   mp		M_DATA message (ASSERTed) to transmit.
              *   nce		supplies the ill (nce_ill) and the ncec (nce_common).
              *   ixaflags	IXAF_* flags; those tested here are IXAF_NO_TRACE,
              *		IXAF_IS_IPV4, IXAF_NO_PFHOOK, IXAF_NO_DEV_FLOW_CTL and
              *		IXAF_REACH_CONF.
              *   pkt_len	ASSERTed to equal msgdsize(mp); recomputed after the
              *		firewall hooks run.
              *   xmit_hint	passed through to the DLD direct transmit function.
              *   szone	only used for ipobs_hook(), after IP_REAL_ZONEID mapping.
              *   nolzid	not referenced in this function.
              *   ixacookie	if non-NULL, receives the non-zero cookie returned by the
              *		DLD direct transmit function, or 0 when canputnext()
              *		fails on the STREAMS path.
              *
              * Return values produced in this function:
              *   0		packet handed to the driver, queued on the ncec, or
              *		dropped because the ncec is ND_UNREACHABLE.
              *   EWOULDBLOCK	flow control, see above.
              *   ENOBUFS	ip_xmit_attach_llhdr() returned NULL.
              *   ENETUNREACH	the ncec state matched none of the known cases.
              *   other	the error set by FW_HOOKS/FW_HOOKS6 when mp comes back
              *		NULL from the hook.
14437 14439   */
14438 14440  /* ARGSUSED6 */
14439 14441  int
14440 14442  ip_xmit(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, uint_t pkt_len,
14441 14443      uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid, uintptr_t *ixacookie)
14442 14444  {
14443 14445          queue_t         *wq;
14444 14446          ill_t           *ill = nce->nce_ill;
14445 14447          ip_stack_t      *ipst = ill->ill_ipst;
14446 14448          uint64_t        delta;
14447 14449          boolean_t       isv6 = ill->ill_isv6;
14448 14450          boolean_t       fp_mp;
14449 14451          ncec_t          *ncec = nce->nce_common;
14450 14452          int64_t         now = LBOLT_FASTPATH64;
14451 14453          boolean_t       is_probe;
14452 14454  
14453 14455          DTRACE_PROBE1(ip__xmit, nce_t *, nce);
14454 14456  
14455 14457          ASSERT(mp != NULL);
14456 14458          ASSERT(mp->b_datap->db_type == M_DATA);
14457 14459          ASSERT(pkt_len == msgdsize(mp));
14458 14460  
14459 14461          /*
14460 14462           * If we have already been here and are coming back after ARP/ND,
14461 14463           * the IXAF_NO_TRACE flag is set. We skip FW_HOOKS, DTRACE and ipobs
14462 14464           * in that case since they have seen the packet when it came here
14463 14465           * the first time.
14464 14466           */
14465 14467          if (ixaflags & IXAF_NO_TRACE)
14466 14468                  goto sendit;
14467 14469  
14468 14470          if (ixaflags & IXAF_IS_IPV4) {
14469 14471                  ipha_t *ipha = (ipha_t *)mp->b_rptr;
14470 14472  
14471 14473                  ASSERT(!isv6);
14472 14474                  ASSERT(pkt_len == ntohs(((ipha_t *)mp->b_rptr)->ipha_length));
14473 14475                  if (HOOKS4_INTERESTED_PHYSICAL_OUT(ipst) &&
14474 14476                      !(ixaflags & IXAF_NO_PFHOOK)) {
14475 14477                          int     error;
14476 14478  
14477 14479                          FW_HOOKS(ipst->ips_ip4_physical_out_event,
14478 14480                              ipst->ips_ipv4firewall_physical_out,
14479 14481                              NULL, ill, ipha, mp, mp, 0, ipst, error);
14480 14482                          DTRACE_PROBE1(ip4__physical__out__end,
14481 14483                              mblk_t *, mp);
                                     /* A NULL mp means the hook took the packet. */
14482 14484                          if (mp == NULL)
14483 14485                                  return (error);
14484 14486  
14485 14487                          /* The length could have changed */
14486 14488                          pkt_len = msgdsize(mp);
14487 14489                  }
14488 14490                  if (ipst->ips_ip4_observe.he_interested) {
14489 14491                          /*
14490 14492                           * Note that for TX the zoneid is the sending
14491 14493                           * zone, whether or not MLP is in play.
14492 14494                           * Since the szone argument is the IP zoneid (i.e.,
14493 14495                           * zero for exclusive-IP zones) and ipobs wants
14494 14496                           * the system zoneid, we map it here.
14495 14497                           */
14496 14498                          szone = IP_REAL_ZONEID(szone, ipst);
14497 14499  
14498 14500                          /*
14499 14501                           * On the outbound path the destination zone will be
14500 14502                           * unknown as we're sending this packet out on the
14501 14503                           * wire.
14502 14504                           */
14503 14505                          ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone, ALL_ZONES,
14504 14506                              ill, ipst);
14505 14507                  }
14506 14508                  DTRACE_IP7(send, mblk_t *, mp,  conn_t *, NULL,
14507 14509                      void_ip_t *, ipha,  __dtrace_ipsr_ill_t *, ill,
14508 14510                      ipha_t *, ipha, ip6_t *, NULL, int, 0);
14509 14511          } else {
14510 14512                  ip6_t *ip6h = (ip6_t *)mp->b_rptr;
14511 14513  
14512 14514                  ASSERT(isv6);
14513 14515                  ASSERT(pkt_len ==
14514 14516                      ntohs(((ip6_t *)mp->b_rptr)->ip6_plen) + IPV6_HDR_LEN);
14515 14517                  if (HOOKS6_INTERESTED_PHYSICAL_OUT(ipst) &&
14516 14518                      !(ixaflags & IXAF_NO_PFHOOK)) {
14517 14519                          int     error;
14518 14520  
14519 14521                          FW_HOOKS6(ipst->ips_ip6_physical_out_event,
14520 14522                              ipst->ips_ipv6firewall_physical_out,
14521 14523                              NULL, ill, ip6h, mp, mp, 0, ipst, error);
14522 14524                          DTRACE_PROBE1(ip6__physical__out__end,
14523 14525                              mblk_t *, mp);
                                     /* A NULL mp means the hook took the packet. */
14524 14526                          if (mp == NULL)
14525 14527                                  return (error);
14526 14528  
14527 14529                          /* The length could have changed */
14528 14530                          pkt_len = msgdsize(mp);
14529 14531                  }
14530 14532                  if (ipst->ips_ip6_observe.he_interested) {
14531 14533                          /* See above */
14532 14534                          szone = IP_REAL_ZONEID(szone, ipst);
14533 14535  
14534 14536                          ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone, ALL_ZONES,
14535 14537                              ill, ipst);
14536 14538                  }
14537 14539                  DTRACE_IP7(send, mblk_t *, mp,  conn_t *, NULL,
14538 14540                      void_ip_t *, ip6h,  __dtrace_ipsr_ill_t *, ill,
14539 14541                      ipha_t *, NULL, ip6_t *, ip6h, int, 0);
14540 14542          }
14541 14543  
14542 14544  sendit:
14543 14545          /*
14544 14546           * We check the state without a lock because the state can never
14545 14547           * move "backwards" to initial or incomplete.
14546 14548           */
14547 14549          switch (ncec->ncec_state) {
14548 14550          case ND_REACHABLE:
14549 14551          case ND_STALE:
14550 14552          case ND_DELAY:
14551 14553          case ND_PROBE:
14552 14554                  mp = ip_xmit_attach_llhdr(mp, nce);
14553 14555                  if (mp == NULL) {
14554 14556                          /*
14555 14557                           * ip_xmit_attach_llhdr has increased
14556 14558                           * ipIfStatsOutDiscards and called ip_drop_output()
14557 14559                           */
14558 14560                          return (ENOBUFS);
14559 14561                  }
14560 14562                  /*
14561 14563                   * check if nce_fastpath completed and we tagged on a
14562 14564                   * copy of nce_fp_mp in ip_xmit_attach_llhdr().
14563 14565                   */
14564 14566                  fp_mp = (mp->b_datap->db_type == M_DATA);
14565 14567  
14566 14568                  if (fp_mp &&
14567 14569                      (ill->ill_capabilities & ILL_CAPAB_DLD_DIRECT)) {
14568 14570                          ill_dld_direct_t *idd;
14569 14571  
14570 14572                          idd = &ill->ill_dld_capab->idc_direct;
14571 14573                          /*
14572 14574                           * Send the packet directly to DLD, where it
14573 14575                           * may be queued depending on the availability
14574 14576                           * of transmit resources at the media layer.
14575 14577                           * Return value should be taken into
14576 14578                           * account and flow control the TCP.
14577 14579                           */
14578 14580                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits);
14579 14581                          UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets,
14580 14582                              pkt_len);
14581 14583  
14582 14584                          if (ixaflags & IXAF_NO_DEV_FLOW_CTL) {
                                             /* Result deliberately ignored on this path. */
14583 14585                                  (void) idd->idd_tx_df(idd->idd_tx_dh, mp,
14584 14586                                      (uintptr_t)xmit_hint, IP_DROP_ON_NO_DESC);
14585 14587                          } else {
14586 14588                                  uintptr_t cookie;
14587 14589  
                                             /* A non-zero cookie is reported as flow control. */
14588 14590                                  if ((cookie = idd->idd_tx_df(idd->idd_tx_dh,
14589 14591                                      mp, (uintptr_t)xmit_hint, 0)) != 0) {
14590 14592                                          if (ixacookie != NULL)
14591 14593                                                  *ixacookie = cookie;
14592 14594                                          return (EWOULDBLOCK);
14593 14595                                  }
14594 14596                          }
14595 14597                  } else {
                                     /* Send through the ill's write queue instead. */
14596 14598                          wq = ill->ill_wq;
14597 14599  
14598 14600                          if (!(ixaflags & IXAF_NO_DEV_FLOW_CTL) &&
14599 14601                              !canputnext(wq)) {
14600 14602                                  if (ixacookie != NULL)
14601 14603                                          *ixacookie = 0;
                                             /* The packet is freed here, not queued. */
14602 14604                                  ip_xmit_flowctl_drop(ill, mp, fp_mp,
14603 14605                                      nce->nce_fp_mp != NULL ?
14604 14606                                      MBLKL(nce->nce_fp_mp) : 0);
14605 14607                                  return (EWOULDBLOCK);
14606 14608                          }
14607 14609                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits);
14608 14610                          UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets,
14609 14611                              pkt_len);
14610 14612                          putnext(wq, mp);
14611 14613                  }
14612 14614  
14613 14615                  /*
14614 14616                   * The rest of this function implements Neighbor Unreachability
14615 14617                   * detection. Determine if the ncec is eligible for NUD.
14616 14618                   */
14617 14619                  if (ncec->ncec_flags & NCE_F_NONUD)
14618 14620                          return (0);
14619 14621  
14620 14622                  ASSERT(ncec->ncec_state != ND_INCOMPLETE);
14621 14623  
14622 14624                  /*
14623 14625                   * Check for upper layer advice
14624 14626                   */
14625 14627                  if (ixaflags & IXAF_REACH_CONF) {
14626 14628                          timeout_id_t tid;
14627 14629  
14628 14630                          /*
14629 14631                           * It should be o.k. to check the state without
14630 14632                           * a lock here, at most we lose an advice.
14631 14633                           */
14632 14634                          ncec->ncec_last = TICK_TO_MSEC(now);
14633 14635                          if (ncec->ncec_state != ND_REACHABLE) {
14634 14636                                  mutex_enter(&ncec->ncec_lock);
14635 14637                                  ncec->ncec_state = ND_REACHABLE;
14636 14638                                  tid = ncec->ncec_timeout_id;
14637 14639                                  ncec->ncec_timeout_id = 0;
14638 14640                                  mutex_exit(&ncec->ncec_lock);
                                             /* Cancel the saved timeout after dropping the lock. */
14639 14641                                  (void) untimeout(tid);
14640 14642                                  if (ip_debug > 2) {
14641 14643                                          /* ip1dbg */
14642 14644                                          pr_addr_dbg("ip_xmit: state"
14643 14645                                              " for %s changed to"
14644 14646                                              " REACHABLE\n", AF_INET6,
14645 14647                                              &ncec->ncec_addr);
14646 14648                                  }
14647 14649                          }
14648 14650                          return (0);
14649 14651                  }
14650 14652  
                             /*
                              * NOTE(review): delta is uint64_t but is printed below with
                              * PRId64 -- confirm whether PRIu64 was intended.
                              */
14651 14653                  delta =  TICK_TO_MSEC(now) - ncec->ncec_last;
14652 14654                  ip1dbg(("ip_xmit: delta = %" PRId64
14653 14655                      " ill_reachable_time = %d \n", delta,
14654 14656                      ill->ill_reachable_time));
14655 14657                  if (delta > (uint64_t)ill->ill_reachable_time) {
14656 14658                          mutex_enter(&ncec->ncec_lock);
14657 14659                          switch (ncec->ncec_state) {
14658 14660                          case ND_REACHABLE:
14659 14661                                  ASSERT((ncec->ncec_flags & NCE_F_NONUD) == 0);
14660 14662                                  /* FALLTHROUGH */
14661 14663                          case ND_STALE:
14662 14664                                  /*
14663 14665                                   * ND_REACHABLE is identical to
14664 14666                                   * ND_STALE in this specific case. If
14665 14667                                   * reachable time has expired for this
14666 14668                                   * neighbor (delta is greater than
14667 14669                                   * reachable time), conceptually, the
14668 14670                                   * neighbor cache is no longer in
14669 14671                                   * REACHABLE state, but already in
14670 14672                                   * STALE state.  So the correct
14671 14673                                   * transition here is to ND_DELAY.
14672 14674                                   */
14673 14675                                  ncec->ncec_state = ND_DELAY;
14674 14676                                  mutex_exit(&ncec->ncec_lock);
14675 14677                                  nce_restart_timer(ncec,
14676 14678                                      ipst->ips_delay_first_probe_time);
14677 14679                                  if (ip_debug > 3) {
14678 14680                                          /* ip2dbg */
14679 14681                                          pr_addr_dbg("ip_xmit: state"
14680 14682                                              " for %s changed to"
14681 14683                                              " DELAY\n", AF_INET6,
14682 14684                                              &ncec->ncec_addr);
14683 14685                                  }
14684 14686                                  break;
14685 14687                          case ND_DELAY:
14686 14688                          case ND_PROBE:
14687 14689                                  mutex_exit(&ncec->ncec_lock);
14688 14690                                  /* Timers have already started */
14689 14691                                  break;
14690 14692                          case ND_UNREACHABLE:
14691 14693                                  /*
14692 14694                                   * nce_timer has detected that this ncec
14693 14695                                   * is unreachable and initiated deleting
14694 14696                                   * this ncec.
14695 14697                                   * This is a harmless race where we found the
14696 14698                                   * ncec before it was deleted and have
14697 14699                                   * just sent out a packet using this
14698 14700                                   * unreachable ncec.
14699 14701                                   */
14700 14702                                  mutex_exit(&ncec->ncec_lock);
14701 14703                                  break;
14702 14704                          default:
14703 14705                                  ASSERT(0);
14704 14706                                  mutex_exit(&ncec->ncec_lock);
14705 14707                          }
14706 14708                  }
14707 14709                  return (0);
14708 14710  
14709 14711          case ND_INCOMPLETE:
14710 14712                  /*
14711 14713                   * the state could have changed since we didn't hold the lock.
14712 14714                   * Re-verify state under lock.
14713 14715                   */
14714 14716                  is_probe = ipmp_packet_is_probe(mp, nce->nce_ill);
14715 14717                  mutex_enter(&ncec->ncec_lock);
14716 14718                  if (NCE_ISREACHABLE(ncec)) {
14717 14719                          mutex_exit(&ncec->ncec_lock);
14718 14720                          goto sendit;
14719 14721                  }
14720 14722                  /* queue the packet */
14721 14723                  nce_queue_mp(ncec, mp, is_probe);
14722 14724                  mutex_exit(&ncec->ncec_lock);
14723 14725                  DTRACE_PROBE2(ip__xmit__incomplete,
14724 14726                      (ncec_t *), ncec, (mblk_t *), mp);
14725 14727                  return (0);
14726 14728  
14727 14729          case ND_INITIAL:
14728 14730                  /*
14729 14731                   * State could have changed since we didn't hold the lock, so
14730 14732                   * re-verify state.
14731 14733                   */
14732 14734                  is_probe = ipmp_packet_is_probe(mp, nce->nce_ill);
14733 14735                  mutex_enter(&ncec->ncec_lock);
14734 14736                  if (NCE_ISREACHABLE(ncec))  {
14735 14737                          mutex_exit(&ncec->ncec_lock);
14736 14738                          goto sendit;
14737 14739                  }
14738 14740                  nce_queue_mp(ncec, mp, is_probe);
                             /* Only the thread that moves INITIAL to INCOMPLETE resolves. */
14739 14741                  if (ncec->ncec_state == ND_INITIAL) {
14740 14742                          ncec->ncec_state = ND_INCOMPLETE;
14741 14743                          mutex_exit(&ncec->ncec_lock);
14742 14744                          /*
14743 14745                           * figure out the source we want to use
14744 14746                           * and resolve it.
14745 14747                           */
14746 14748                          ip_ndp_resolve(ncec);
14747 14749                  } else  {
14748 14750                          mutex_exit(&ncec->ncec_lock);
14749 14751                  }
14750 14752                  return (0);
14751 14753  
14752 14754          case ND_UNREACHABLE:
                             /* The packet is dropped but the caller still sees 0. */
14753 14755                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
14754 14756                  ip_drop_output("ipIfStatsOutDiscards - ND_UNREACHABLE",
14755 14757                      mp, ill);
14756 14758                  freemsg(mp);
14757 14759                  return (0);
14758 14760  
14759 14761          default:
14760 14762                  ASSERT(0);
14761 14763                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
14762 14764                  ip_drop_output("ipIfStatsOutDiscards - ND_other",
14763 14765                      mp, ill);
14764 14766                  freemsg(mp);
14765 14767                  return (ENETUNREACH);
14766 14768          }
14767 14769  }
14768 14770  
14769 14771  /*
14770 14772   * Return B_TRUE if the buffers differ in length or content.
14771 14773   * This is used for comparing extension header buffers.
14772 14774   * Note that an extension header would be declared different
14773 14775   * even if all that changed was the next header value in that header i.e.
14774 14776   * what really changed is the next extension header.
              *
              * If b_valid is false, bbuf is treated as a zero-length buffer, so the
              * result is B_FALSE only when alen is also zero.
              *
              * NOTE(review): the final return passes bcmp()'s int result through as a
              * boolean_t, so it is exactly B_TRUE only if bcmp() returns 1 on a
              * mismatch -- confirm for the bcmp() in use, or have callers test
              * the result for zero/non-zero only.
14775 14777   */
14776 14778  boolean_t
14777 14779  ip_cmpbuf(const void *abuf, uint_t alen, boolean_t b_valid, const void *bbuf,
14778 14780      uint_t blen)
14779 14781  {
14780 14782          if (!b_valid)
14781 14783                  blen = 0;
14782 14784  
14783 14785          if (alen != blen)
14784 14786                  return (B_TRUE);
14785 14787          if (alen == 0)
14786 14788                  return (B_FALSE);       /* Both zero length */
14787 14789          return (bcmp(abuf, bbuf, alen));
14788 14790  }
14789 14791  
14790 14792  /*
14791 14793   * Preallocate memory for ip_savebuf(). Returns B_TRUE if ok.
14792 14794   * Return B_FALSE if memory allocation fails - don't change any state!
              *
              * When src is non-NULL and the effective length is non-zero, a new buffer
              * of srclen bytes is obtained with mi_alloc(); otherwise the new buffer is
              * NULL.  srclen is treated as 0 when src_valid is false.  On success any
              * buffer previously in *dstp is released with mi_free(), and *dstp and
              * *dstlenp are updated to describe the new buffer (NULL and 0 if none).
              * Nothing is copied here.
              *
              * The caller must pass *dstlenp == 0 (ASSERTed).
14793 14795   */
14794 14796  boolean_t
14795 14797  ip_allocbuf(void **dstp, uint_t *dstlenp, boolean_t src_valid,
14796 14798      const void *src, uint_t srclen)
14797 14799  {
14798 14800          void *dst;
14799 14801  
14800 14802          if (!src_valid)
14801 14803                  srclen = 0;
14802 14804  
14803 14805          ASSERT(*dstlenp == 0);
14804 14806          if (src != NULL && srclen != 0) {
14805 14807                  dst = mi_alloc(srclen, BPRI_MED);
                             /* Fail before *dstp or *dstlenp have been touched. */
14806 14808                  if (dst == NULL)
14807 14809                          return (B_FALSE);
14808 14810          } else {
14809 14811                  dst = NULL;
14810 14812          }
14811 14813          if (*dstp != NULL)
14812 14814                  mi_free(*dstp);
14813 14815          *dstp = dst;
14814 14816          *dstlenp = dst == NULL ? 0 : srclen;
14815 14817          return (B_TRUE);
14816 14818  }
14817 14819  
14818 14820  /*
14819 14821   * Replace what is in *dst, *dstlen with the source.
14820 14822   * Assumes ip_allocbuf has already been called.
              *
              * srclen is treated as 0 when src_valid is false, and *dstlenp must
              * already equal that effective length (ASSERTed).  No memory is allocated
              * or freed here; srclen bytes are copied into the existing *dstp buffer
              * when src is non-NULL and the length is non-zero.
14821 14823   */
14822 14824  void
14823 14825  ip_savebuf(void **dstp, uint_t *dstlenp, boolean_t src_valid,
14824 14826      const void *src, uint_t srclen)
14825 14827  {
14826 14828          if (!src_valid)
14827 14829                  srclen = 0;
14828 14830  
14829 14831          ASSERT(*dstlenp == srclen);
14830 14832          if (src != NULL && srclen != 0)
14831 14833                  bcopy(src, *dstp, srclen);
14832 14834  }
14833 14835  
14834 14836  /*
14835 14837   * Free the storage pointed to by the members of an ip_pkt_t.
              *
              * Only members whose IPPF_* bit is set in ipp_fields are released, each
              * with kmem_free() using the length recorded alongside it.  The pointer
              * is then set to NULL, the length to 0, and the corresponding bits are
              * cleared from ipp_fields.  Other ipp_fields bits are left as they are,
              * and the ip_pkt_t itself is not freed.
14836 14838   */
14837 14839  void
14838 14840  ip_pkt_free(ip_pkt_t *ipp)
14839 14841  {
                     /* Snapshot of the flags; ipp_fields itself is updated at the end. */
14840 14842          uint_t  fields = ipp->ipp_fields;
14841 14843  
14842 14844          if (fields & IPPF_HOPOPTS) {
14843 14845                  kmem_free(ipp->ipp_hopopts, ipp->ipp_hopoptslen);
14844 14846                  ipp->ipp_hopopts = NULL;
14845 14847                  ipp->ipp_hopoptslen = 0;
14846 14848          }
14847 14849          if (fields & IPPF_RTHDRDSTOPTS) {
14848 14850                  kmem_free(ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen);
14849 14851                  ipp->ipp_rthdrdstopts = NULL;
14850 14852                  ipp->ipp_rthdrdstoptslen = 0;
14851 14853          }
14852 14854          if (fields & IPPF_DSTOPTS) {
14853 14855                  kmem_free(ipp->ipp_dstopts, ipp->ipp_dstoptslen);
14854 14856                  ipp->ipp_dstopts = NULL;
14855 14857                  ipp->ipp_dstoptslen = 0;
14856 14858          }
14857 14859          if (fields & IPPF_RTHDR) {
14858 14860                  kmem_free(ipp->ipp_rthdr, ipp->ipp_rthdrlen);
14859 14861                  ipp->ipp_rthdr = NULL;
14860 14862                  ipp->ipp_rthdrlen = 0;
14861 14863          }
14862 14864          if (fields & IPPF_IPV4_OPTIONS) {
14863 14865                  kmem_free(ipp->ipp_ipv4_options, ipp->ipp_ipv4_options_len);
14864 14866                  ipp->ipp_ipv4_options = NULL;
14865 14867                  ipp->ipp_ipv4_options_len = 0;
14866 14868          }
14867 14869          if (fields & IPPF_LABEL_V4) {
14868 14870                  kmem_free(ipp->ipp_label_v4, ipp->ipp_label_len_v4);
14869 14871                  ipp->ipp_label_v4 = NULL;
14870 14872                  ipp->ipp_label_len_v4 = 0;
14871 14873          }
14872 14874          if (fields & IPPF_LABEL_V6) {
14873 14875                  kmem_free(ipp->ipp_label_v6, ipp->ipp_label_len_v6);
14874 14876                  ipp->ipp_label_v6 = NULL;
14875 14877                  ipp->ipp_label_len_v6 = 0;
14876 14878          }
14877 14879          ipp->ipp_fields &= ~(IPPF_HOPOPTS | IPPF_RTHDRDSTOPTS | IPPF_DSTOPTS |
14878 14880              IPPF_RTHDR | IPPF_IPV4_OPTIONS | IPPF_LABEL_V4 | IPPF_LABEL_V6);
14879 14881  }
14880 14882  
14881 14883  /*
14882 14884   * Copy from src to dst and allocate as needed.
14883 14885   * Returns zero or ENOMEM.
14884 14886   *
14885 14887   * The caller must initialize dst to zero.
14886 14888   */
14887 14889  int
14888 14890  ip_pkt_copy(ip_pkt_t *src, ip_pkt_t *dst, int kmflag)
14889 14891  {
14890 14892          uint_t  fields = src->ipp_fields;
14891 14893  
14892 14894          /* Start with fields that don't require memory allocation */
14893 14895          dst->ipp_fields = fields &
14894 14896              ~(IPPF_HOPOPTS | IPPF_RTHDRDSTOPTS | IPPF_DSTOPTS |
14895 14897              IPPF_RTHDR | IPPF_IPV4_OPTIONS | IPPF_LABEL_V4 | IPPF_LABEL_V6);
14896 14898  
14897 14899          dst->ipp_addr = src->ipp_addr;
14898 14900          dst->ipp_unicast_hops = src->ipp_unicast_hops;
14899 14901          dst->ipp_hoplimit = src->ipp_hoplimit;
14900 14902          dst->ipp_tclass = src->ipp_tclass;
14901 14903          dst->ipp_type_of_service = src->ipp_type_of_service;
14902 14904  
14903 14905          if (!(fields & (IPPF_HOPOPTS | IPPF_RTHDRDSTOPTS | IPPF_DSTOPTS |
14904 14906              IPPF_RTHDR | IPPF_IPV4_OPTIONS | IPPF_LABEL_V4 | IPPF_LABEL_V6)))
14905 14907                  return (0);
14906 14908  
14907 14909          if (fields & IPPF_HOPOPTS) {
14908 14910                  dst->ipp_hopopts = kmem_alloc(src->ipp_hopoptslen, kmflag);
14909 14911                  if (dst->ipp_hopopts == NULL) {
14910 14912                          ip_pkt_free(dst);
14911 14913                          return (ENOMEM);
14912 14914                  }
14913 14915                  dst->ipp_fields |= IPPF_HOPOPTS;
14914 14916                  bcopy(src->ipp_hopopts, dst->ipp_hopopts,
14915 14917                      src->ipp_hopoptslen);
14916 14918                  dst->ipp_hopoptslen = src->ipp_hopoptslen;
14917 14919          }
14918 14920          if (fields & IPPF_RTHDRDSTOPTS) {
14919 14921                  dst->ipp_rthdrdstopts = kmem_alloc(src->ipp_rthdrdstoptslen,
14920 14922                      kmflag);
14921 14923                  if (dst->ipp_rthdrdstopts == NULL) {
14922 14924                          ip_pkt_free(dst);
14923 14925                          return (ENOMEM);
14924 14926                  }
14925 14927                  dst->ipp_fields |= IPPF_RTHDRDSTOPTS;
14926 14928                  bcopy(src->ipp_rthdrdstopts, dst->ipp_rthdrdstopts,
14927 14929                      src->ipp_rthdrdstoptslen);
14928 14930                  dst->ipp_rthdrdstoptslen = src->ipp_rthdrdstoptslen;
14929 14931          }
14930 14932          if (fields & IPPF_DSTOPTS) {
14931 14933                  dst->ipp_dstopts = kmem_alloc(src->ipp_dstoptslen, kmflag);
14932 14934                  if (dst->ipp_dstopts == NULL) {
14933 14935                          ip_pkt_free(dst);
14934 14936                          return (ENOMEM);
14935 14937                  }
14936 14938                  dst->ipp_fields |= IPPF_DSTOPTS;
14937 14939                  bcopy(src->ipp_dstopts, dst->ipp_dstopts,
14938 14940                      src->ipp_dstoptslen);
14939 14941                  dst->ipp_dstoptslen = src->ipp_dstoptslen;
14940 14942          }
14941 14943          if (fields & IPPF_RTHDR) {
14942 14944                  dst->ipp_rthdr = kmem_alloc(src->ipp_rthdrlen, kmflag);
14943 14945                  if (dst->ipp_rthdr == NULL) {
14944 14946                          ip_pkt_free(dst);
14945 14947                          return (ENOMEM);
14946 14948                  }
14947 14949                  dst->ipp_fields |= IPPF_RTHDR;
14948 14950                  bcopy(src->ipp_rthdr, dst->ipp_rthdr,
14949 14951                      src->ipp_rthdrlen);
14950 14952                  dst->ipp_rthdrlen = src->ipp_rthdrlen;
14951 14953          }
14952 14954          if (fields & IPPF_IPV4_OPTIONS) {
14953 14955                  dst->ipp_ipv4_options = kmem_alloc(src->ipp_ipv4_options_len,
14954 14956                      kmflag);
14955 14957                  if (dst->ipp_ipv4_options == NULL) {
14956 14958                          ip_pkt_free(dst);
14957 14959                          return (ENOMEM);
14958 14960                  }
14959 14961                  dst->ipp_fields |= IPPF_IPV4_OPTIONS;
14960 14962                  bcopy(src->ipp_ipv4_options, dst->ipp_ipv4_options,
14961 14963                      src->ipp_ipv4_options_len);
14962 14964                  dst->ipp_ipv4_options_len = src->ipp_ipv4_options_len;
14963 14965          }
14964 14966          if (fields & IPPF_LABEL_V4) {
14965 14967                  dst->ipp_label_v4 = kmem_alloc(src->ipp_label_len_v4, kmflag);
14966 14968                  if (dst->ipp_label_v4 == NULL) {
14967 14969                          ip_pkt_free(dst);
14968 14970                          return (ENOMEM);
14969 14971                  }
14970 14972                  dst->ipp_fields |= IPPF_LABEL_V4;
14971 14973                  bcopy(src->ipp_label_v4, dst->ipp_label_v4,
14972 14974                      src->ipp_label_len_v4);
14973 14975                  dst->ipp_label_len_v4 = src->ipp_label_len_v4;
14974 14976          }
14975 14977          if (fields & IPPF_LABEL_V6) {
14976 14978                  dst->ipp_label_v6 = kmem_alloc(src->ipp_label_len_v6, kmflag);
14977 14979                  if (dst->ipp_label_v6 == NULL) {
14978 14980                          ip_pkt_free(dst);
14979 14981                          return (ENOMEM);
14980 14982                  }
14981 14983                  dst->ipp_fields |= IPPF_LABEL_V6;
14982 14984                  bcopy(src->ipp_label_v6, dst->ipp_label_v6,
14983 14985                      src->ipp_label_len_v6);
14984 14986                  dst->ipp_label_len_v6 = src->ipp_label_len_v6;
14985 14987          }
14986 14988          if (fields & IPPF_FRAGHDR) {
14987 14989                  dst->ipp_fraghdr = kmem_alloc(src->ipp_fraghdrlen, kmflag);
14988 14990                  if (dst->ipp_fraghdr == NULL) {
14989 14991                          ip_pkt_free(dst);
14990 14992                          return (ENOMEM);
14991 14993                  }
14992 14994                  dst->ipp_fields |= IPPF_FRAGHDR;
14993 14995                  bcopy(src->ipp_fraghdr, dst->ipp_fraghdr,
14994 14996                      src->ipp_fraghdrlen);
14995 14997                  dst->ipp_fraghdrlen = src->ipp_fraghdrlen;
14996 14998          }
14997 14999          return (0);
14998 15000  }
14999 15001  
15000 15002  /*
15001 15003   * Returns INADDR_ANY if no source route
15002 15004   */
15003 15005  ipaddr_t
15004 15006  ip_pkt_source_route_v4(const ip_pkt_t *ipp)
15005 15007  {
15006 15008          ipaddr_t        nexthop = INADDR_ANY;
15007 15009          ipoptp_t        opts;
15008 15010          uchar_t         *opt;
15009 15011          uint8_t         optval;
15010 15012          uint8_t         optlen;
15011 15013          uint32_t        totallen;
15012 15014  
15013 15015          if (!(ipp->ipp_fields & IPPF_IPV4_OPTIONS))
15014 15016                  return (INADDR_ANY);
15015 15017  
15016 15018          totallen = ipp->ipp_ipv4_options_len;
15017 15019          if (totallen & 0x3)
15018 15020                  return (INADDR_ANY);
15019 15021  
15020 15022          for (optval = ipoptp_first2(&opts, totallen, ipp->ipp_ipv4_options);
15021 15023              optval != IPOPT_EOL;
15022 15024              optval = ipoptp_next(&opts)) {
15023 15025                  opt = opts.ipoptp_cur;
15024 15026                  switch (optval) {
15025 15027                          uint8_t off;
15026 15028                  case IPOPT_SSRR:
15027 15029                  case IPOPT_LSRR:
15028 15030                          if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
15029 15031                                  break;
15030 15032                          }
15031 15033                          optlen = opts.ipoptp_len;
15032 15034                          off = opt[IPOPT_OFFSET];
15033 15035                          off--;
15034 15036                          if (optlen < IP_ADDR_LEN ||
15035 15037                              off > optlen - IP_ADDR_LEN) {
15036 15038                                  /* End of source route */
15037 15039                                  break;
15038 15040                          }
15039 15041                          bcopy((char *)opt + off, &nexthop, IP_ADDR_LEN);
15040 15042                          if (nexthop == htonl(INADDR_LOOPBACK)) {
15041 15043                                  /* Ignore */
15042 15044                                  nexthop = INADDR_ANY;
15043 15045                                  break;
15044 15046                          }
15045 15047                          break;
15046 15048                  }
15047 15049          }
15048 15050          return (nexthop);
15049 15051  }
15050 15052  
15051 15053  /*
15052 15054   * Reverse a source route.
15053 15055   */
15054 15056  void
15055 15057  ip_pkt_source_route_reverse_v4(ip_pkt_t *ipp)
15056 15058  {
15057 15059          ipaddr_t        tmp;
15058 15060          ipoptp_t        opts;
15059 15061          uchar_t         *opt;
15060 15062          uint8_t         optval;
15061 15063          uint32_t        totallen;
15062 15064  
15063 15065          if (!(ipp->ipp_fields & IPPF_IPV4_OPTIONS))
15064 15066                  return;
15065 15067  
15066 15068          totallen = ipp->ipp_ipv4_options_len;
15067 15069          if (totallen & 0x3)
15068 15070                  return;
15069 15071  
15070 15072          for (optval = ipoptp_first2(&opts, totallen, ipp->ipp_ipv4_options);
15071 15073              optval != IPOPT_EOL;
15072 15074              optval = ipoptp_next(&opts)) {
15073 15075                  uint8_t off1, off2;
15074 15076  
15075 15077                  opt = opts.ipoptp_cur;
15076 15078                  switch (optval) {
15077 15079                  case IPOPT_SSRR:
15078 15080                  case IPOPT_LSRR:
15079 15081                          if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
15080 15082                                  break;
15081 15083                          }
15082 15084                          off1 = IPOPT_MINOFF_SR - 1;
15083 15085                          off2 = opt[IPOPT_OFFSET] - IP_ADDR_LEN - 1;
15084 15086                          while (off2 > off1) {
15085 15087                                  bcopy(opt + off2, &tmp, IP_ADDR_LEN);
15086 15088                                  bcopy(opt + off1, opt + off2, IP_ADDR_LEN);
15087 15089                                  bcopy(&tmp, opt + off2, IP_ADDR_LEN);
15088 15090                                  off2 -= IP_ADDR_LEN;
15089 15091                                  off1 += IP_ADDR_LEN;
15090 15092                          }
15091 15093                          opt[IPOPT_OFFSET] = IPOPT_MINOFF_SR;
15092 15094                          break;
15093 15095                  }
15094 15096          }
15095 15097  }
15096 15098  
15097 15099  /*
15098 15100   * Returns NULL if no routing header
15099 15101   */
15100 15102  in6_addr_t *
15101 15103  ip_pkt_source_route_v6(const ip_pkt_t *ipp)
15102 15104  {
15103 15105          in6_addr_t      *nexthop = NULL;
15104 15106          ip6_rthdr0_t    *rthdr;
15105 15107  
15106 15108          if (!(ipp->ipp_fields & IPPF_RTHDR))
15107 15109                  return (NULL);
15108 15110  
15109 15111          rthdr = (ip6_rthdr0_t *)ipp->ipp_rthdr;
15110 15112          if (rthdr->ip6r0_segleft == 0)
15111 15113                  return (NULL);
15112 15114  
15113 15115          nexthop = (in6_addr_t *)((char *)rthdr + sizeof (*rthdr));
15114 15116          return (nexthop);
15115 15117  }
15116 15118  
15117 15119  zoneid_t
15118 15120  ip_get_zoneid_v4(ipaddr_t addr, mblk_t *mp, ip_recv_attr_t *ira,
15119 15121      zoneid_t lookup_zoneid)
15120 15122  {
15121 15123          ip_stack_t      *ipst = ira->ira_ill->ill_ipst;
15122 15124          ire_t           *ire;
15123 15125          int             ire_flags = MATCH_IRE_TYPE;
15124 15126          zoneid_t        zoneid = ALL_ZONES;
15125 15127  
15126 15128          if (is_system_labeled() && !tsol_can_accept_raw(mp, ira, B_FALSE))
15127 15129                  return (ALL_ZONES);
15128 15130  
15129 15131          if (lookup_zoneid != ALL_ZONES)
15130 15132                  ire_flags |= MATCH_IRE_ZONEONLY;
15131 15133          ire = ire_ftable_lookup_v4(addr, 0, 0, IRE_LOCAL | IRE_LOOPBACK,
15132 15134              NULL, lookup_zoneid, NULL, ire_flags, 0, ipst, NULL);
15133 15135          if (ire != NULL) {
15134 15136                  zoneid = IP_REAL_ZONEID(ire->ire_zoneid, ipst);
15135 15137                  ire_refrele(ire);
15136 15138          }
15137 15139          return (zoneid);
15138 15140  }
15139 15141  
15140 15142  zoneid_t
15141 15143  ip_get_zoneid_v6(in6_addr_t *addr, mblk_t *mp, const ill_t *ill,
15142 15144      ip_recv_attr_t *ira, zoneid_t lookup_zoneid)
15143 15145  {
15144 15146          ip_stack_t      *ipst = ira->ira_ill->ill_ipst;
15145 15147          ire_t           *ire;
15146 15148          int             ire_flags = MATCH_IRE_TYPE;
15147 15149          zoneid_t        zoneid = ALL_ZONES;
15148 15150  
15149 15151          if (is_system_labeled() && !tsol_can_accept_raw(mp, ira, B_FALSE))
15150 15152                  return (ALL_ZONES);
15151 15153  
15152 15154          if (IN6_IS_ADDR_LINKLOCAL(addr))
15153 15155                  ire_flags |= MATCH_IRE_ILL;
15154 15156  
15155 15157          if (lookup_zoneid != ALL_ZONES)
15156 15158                  ire_flags |= MATCH_IRE_ZONEONLY;
15157 15159          ire = ire_ftable_lookup_v6(addr, NULL, NULL, IRE_LOCAL | IRE_LOOPBACK,
15158 15160              ill, lookup_zoneid, NULL, ire_flags, 0, ipst, NULL);
15159 15161          if (ire != NULL) {
15160 15162                  zoneid = IP_REAL_ZONEID(ire->ire_zoneid, ipst);
15161 15163                  ire_refrele(ire);
15162 15164          }
15163 15165          return (zoneid);
15164 15166  }
15165 15167  
15166 15168  /*
15167 15169   * IP obserability hook support functions.
15168 15170   */
15169 15171  static void
15170 15172  ipobs_init(ip_stack_t *ipst)
15171 15173  {
15172 15174          netid_t id;
15173 15175  
15174 15176          id = net_getnetidbynetstackid(ipst->ips_netstack->netstack_stackid);
15175 15177  
15176 15178          ipst->ips_ip4_observe_pr = net_protocol_lookup(id, NHF_INET);
15177 15179          VERIFY(ipst->ips_ip4_observe_pr != NULL);
15178 15180  
15179 15181          ipst->ips_ip6_observe_pr = net_protocol_lookup(id, NHF_INET6);
15180 15182          VERIFY(ipst->ips_ip6_observe_pr != NULL);
15181 15183  }
15182 15184  
15183 15185  static void
15184 15186  ipobs_fini(ip_stack_t *ipst)
15185 15187  {
15186 15188  
15187 15189          VERIFY(net_protocol_release(ipst->ips_ip4_observe_pr) == 0);
15188 15190          VERIFY(net_protocol_release(ipst->ips_ip6_observe_pr) == 0);
15189 15191  }
15190 15192  
15191 15193  /*
15192 15194   * hook_pkt_observe_t is composed in network byte order so that the
15193 15195   * entire mblk_t chain handed into hook_run can be used as-is.
15194 15196   * The caveat is that use of the fields, such as the zone fields,
15195 15197   * requires conversion into host byte order first.
15196 15198   */
15197 15199  void
15198 15200  ipobs_hook(mblk_t *mp, int htype, zoneid_t zsrc, zoneid_t zdst,
15199 15201      const ill_t *ill, ip_stack_t *ipst)
15200 15202  {
15201 15203          hook_pkt_observe_t *hdr;
15202 15204          uint64_t grifindex;
15203 15205          mblk_t *imp;
15204 15206  
15205 15207          imp = allocb(sizeof (*hdr), BPRI_HI);
15206 15208          if (imp == NULL)
15207 15209                  return;
15208 15210  
15209 15211          hdr = (hook_pkt_observe_t *)imp->b_rptr;
15210 15212          /*
15211 15213           * b_wptr is set to make the apparent size of the data in the mblk_t
15212 15214           * to exclude the pointers at the end of hook_pkt_observer_t.
15213 15215           */
15214 15216          imp->b_wptr = imp->b_rptr + sizeof (dl_ipnetinfo_t);
15215 15217          imp->b_cont = mp;
15216 15218  
15217 15219          ASSERT(DB_TYPE(mp) == M_DATA);
15218 15220  
15219 15221          if (IS_UNDER_IPMP(ill))
15220 15222                  grifindex = ipmp_ill_get_ipmp_ifindex(ill);
15221 15223          else
15222 15224                  grifindex = 0;
15223 15225  
15224 15226          hdr->hpo_version = 1;
15225 15227          hdr->hpo_htype = htons(htype);
15226 15228          hdr->hpo_pktlen = htonl((ulong_t)msgdsize(mp));
15227 15229          hdr->hpo_ifindex = htonl(ill->ill_phyint->phyint_ifindex);
15228 15230          hdr->hpo_grifindex = htonl(grifindex);
15229 15231          hdr->hpo_zsrc = htonl(zsrc);
15230 15232          hdr->hpo_zdst = htonl(zdst);
15231 15233          hdr->hpo_pkt = imp;
15232 15234          hdr->hpo_ctx = ipst->ips_netstack;
15233 15235  
15234 15236          if (ill->ill_isv6) {
15235 15237                  hdr->hpo_family = AF_INET6;
15236 15238                  (void) hook_run(ipst->ips_ipv6_net_data->netd_hooks,
15237 15239                      ipst->ips_ipv6observing, (hook_data_t)hdr);
15238 15240          } else {
15239 15241                  hdr->hpo_family = AF_INET;
15240 15242                  (void) hook_run(ipst->ips_ipv4_net_data->netd_hooks,
15241 15243                      ipst->ips_ipv4observing, (hook_data_t)hdr);
15242 15244          }
15243 15245  
15244 15246          imp->b_cont = NULL;
15245 15247          freemsg(imp);
15246 15248  }
15247 15249  
15248 15250  /*
15249 15251   * Utility routine that checks if `v4srcp' is a valid address on underlying
15250 15252   * interface `ill'.  If `ipifp' is non-NULL, it's set to a held ipif
15251 15253   * associated with `v4srcp' on success.  NOTE: if this is not called from
15252 15254   * inside the IPSQ (ill_g_lock is not held), `ill' may be removed from the
15253 15255   * group during or after this lookup.
15254 15256   */
15255 15257  boolean_t
15256 15258  ipif_lookup_testaddr_v4(ill_t *ill, const in_addr_t *v4srcp, ipif_t **ipifp)
15257 15259  {
15258 15260          ipif_t *ipif;
15259 15261  
15260 15262          ipif = ipif_lookup_addr_exact(*v4srcp, ill, ill->ill_ipst);
15261 15263          if (ipif != NULL) {
15262 15264                  if (ipifp != NULL)
15263 15265                          *ipifp = ipif;
15264 15266                  else
15265 15267                          ipif_refrele(ipif);
15266 15268                  return (B_TRUE);
15267 15269          }
15268 15270  
15269 15271          ip1dbg(("ipif_lookup_testaddr_v4: cannot find ipif for src %x\n",
15270 15272              *v4srcp));
15271 15273          return (B_FALSE);
15272 15274  }
15273 15275  
15274 15276  /*
15275 15277   * Transport protocol call back function for CPU state change.
15276 15278   */
15277 15279  /* ARGSUSED */
15278 15280  static int
15279 15281  ip_tp_cpu_update(cpu_setup_t what, int id, void *arg)
15280 15282  {
15281 15283          processorid_t cpu_seqid;
15282 15284          netstack_handle_t nh;
15283 15285          netstack_t *ns;
15284 15286  
15285 15287          ASSERT(MUTEX_HELD(&cpu_lock));
15286 15288  
15287 15289          switch (what) {
15288 15290          case CPU_CONFIG:
15289 15291          case CPU_ON:
15290 15292          case CPU_INIT:
15291 15293          case CPU_CPUPART_IN:
15292 15294                  cpu_seqid = cpu[id]->cpu_seqid;
15293 15295                  netstack_next_init(&nh);
15294 15296                  while ((ns = netstack_next(&nh)) != NULL) {
15295 15297                          tcp_stack_cpu_add(ns->netstack_tcp, cpu_seqid);
15296 15298                          sctp_stack_cpu_add(ns->netstack_sctp, cpu_seqid);
15297 15299                          udp_stack_cpu_add(ns->netstack_udp, cpu_seqid);
15298 15300                          netstack_rele(ns);
15299 15301                  }
15300 15302                  netstack_next_fini(&nh);
15301 15303                  break;
15302 15304          case CPU_UNCONFIG:
15303 15305          case CPU_OFF:
15304 15306          case CPU_CPUPART_OUT:
15305 15307                  /*
15306 15308                   * Nothing to do.  We don't remove the per CPU stats from
15307 15309                   * the IP stack even when the CPU goes offline.
15308 15310                   */
15309 15311                  break;
15310 15312          default:
15311 15313                  break;
15312 15314          }
15313 15315          return (0);
15314 15316  }

↓ open down ↓

1327 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX