1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  */
  24 
  25 /*
  26  * iptun - IP Tunneling Driver
  27  *
  28  * This module is a GLDv3 driver that implements virtual datalinks over IP
  29  * (a.k.a, IP tunneling).  The datalinks are managed through a dld ioctl
  30  * interface (see iptun_ctl.c), and registered with GLDv3 using
  31  * mac_register().  It implements the logic for various forms of IP (IPv4 or
  32  * IPv6) encapsulation within IP (IPv4 or IPv6) by interacting with the ip
  33  * module below it.  Each virtual IP tunnel datalink has a conn_t associated
  34  * with it representing the "outer" IP connection.
  35  *
  36  * The module implements the following locking semantics:
  37  *
  38  * Lookups and deletions in iptun_hash are synchronized using iptun_hash_lock.
  39  * See comments above iptun_hash_lock for details.
  40  *
  41  * No locks are ever held while calling up to GLDv3.  The general architecture
  42  * of GLDv3 requires this, as the mac perimeter (essentially a lock) for a
  43  * given link will be held while making downcalls (iptun_m_*() callbacks).
  44  * Because we need to hold locks while handling downcalls, holding these locks
  45  * while issuing upcalls results in deadlock scenarios.  See the block comment
  46  * above iptun_task_cb() for details on how we safely issue upcalls without
  47  * holding any locks.
  48  *
 * The contents of each iptun_t are protected by its iptun_lock mutex, which
 * is acquired in iptun_enter() (called by iptun_enter_by_linkid()) and
 * released in iptun_exit().
  52  *
  53  * See comments in iptun_delete() and iptun_free() for details on how the
  54  * iptun_t is deleted safely.
  55  */
  56 
  57 #include <sys/types.h>
  58 #include <sys/kmem.h>
  59 #include <sys/errno.h>
  60 #include <sys/modhash.h>
  61 #include <sys/list.h>
  62 #include <sys/strsun.h>
  63 #include <sys/file.h>
  64 #include <sys/systm.h>
  65 #include <sys/tihdr.h>
  66 #include <sys/param.h>
  67 #include <sys/mac_provider.h>
  68 #include <sys/mac_ipv4.h>
  69 #include <sys/mac_ipv6.h>
  70 #include <sys/mac_6to4.h>
  71 #include <sys/tsol/tnet.h>
  72 #include <sys/sunldi.h>
  73 #include <netinet/in.h>
  74 #include <netinet/ip6.h>
  75 #include <inet/ip.h>
  76 #include <inet/ip_ire.h>
  77 #include <inet/ipsec_impl.h>
  78 #include <sys/tsol/label.h>
  79 #include <sys/tsol/tnet.h>
  80 #include <inet/iptun.h>
  81 #include "iptun_impl.h"
  82 
  83 /* Do the tunnel type and address family match? */
  84 #define IPTUN_ADDR_MATCH(iptun_type, family)                            \
  85         ((iptun_type == IPTUN_TYPE_IPV4 && family == AF_INET) ||        \
  86         (iptun_type == IPTUN_TYPE_IPV6 && family == AF_INET6) ||        \
  87         (iptun_type == IPTUN_TYPE_6TO4 && family == AF_INET))
  88 
/* Convert an integral key (a datalink ID in this file) to a mod_hash key. */
#define IPTUN_HASH_KEY(key)     ((mod_hash_key_t)(uintptr_t)(key))

/*
 * Link MTU bounds.  The maxima leave room in an IP_MAXPACKET-sized outer
 * packet for the outer IP header (plus, for IPv6, an iptun_encaplim_t).
 */
#define IPTUN_MIN_IPV4_MTU      576             /* ip.h still uses 68 (!) */
#define IPTUN_MIN_IPV6_MTU      IPV6_MIN_MTU
#define IPTUN_MAX_IPV4_MTU      (IP_MAXPACKET - sizeof (ipha_t))
#define IPTUN_MAX_IPV6_MTU      (IP_MAXPACKET - sizeof (ip6_t) -        \
                                    sizeof (iptun_encaplim_t))

/* Accepted range for the MAC_PROP_IPTUN_HOPLIMIT property. */
#define IPTUN_MIN_HOPLIMIT      1
#define IPTUN_MAX_HOPLIMIT      UINT8_MAX

/* Accepted range for the MAC_PROP_IPTUN_ENCAPLIMIT property. */
#define IPTUN_MIN_ENCAPLIMIT    0
#define IPTUN_MAX_ENCAPLIMIT    UINT8_MAX

/* Mask of the "required" and "never" IPsec preference bits. */
#define IPTUN_IPSEC_REQ_MASK    (IPSEC_PREF_REQUIRED | IPSEC_PREF_NEVER)
 104 
/*
 * Initial contents for an iptun_encaplim_t.  The three initializer groups
 * are, in order: a leading pair (IPPROTO_NONE, 0), a tunnel encapsulation
 * limit option (IP6OPT_TUNNEL_LIMIT, length 1, value
 * IPTUN_DEFAULT_ENCAPLIMIT), and a PadN option (IP6OPT_PADN, length 1, one
 * zero byte).
 * NOTE(review): the leading pair is presumably the next-header/length of an
 * IPv6 destination options header -- confirm against the iptun_encaplim_t
 * definition in iptun_impl.h.
 */
static iptun_encaplim_t iptun_encaplim_init = {
        { IPPROTO_NONE, 0 },
        {   IP6OPT_TUNNEL_LIMIT,
            1,
            IPTUN_DEFAULT_ENCAPLIMIT }, /* filled in with actual value later */
        {   IP6OPT_PADN,
            1,
            0 }
};
 114 
/*
 * Table containing per-iptun-type information.
 * Since IPv6 can run over all of these we have the IPv6 min as the min MTU.
 *
 * Each row lists, in order: the tunnel type, the MAC plugin identifier, the
 * outer IP version, the minimum MTU, the maximum MTU, and a boolean that is
 * B_TRUE for the IPv4 and IPv6 types and B_FALSE for 6to4.
 * NOTE(review): the boolean is presumably iti_hasraddr (whether the type
 * takes a remote address); confirm the field order against iptun_typeinfo_t.
 * The table ends with an IPTUN_TYPE_UNKNOWN row.
 */
static iptun_typeinfo_t iptun_type_table[] = {
        { IPTUN_TYPE_IPV4, MAC_PLUGIN_IDENT_IPV4, IPV4_VERSION,
            IPTUN_MIN_IPV6_MTU, IPTUN_MAX_IPV4_MTU,     B_TRUE },
        { IPTUN_TYPE_IPV6, MAC_PLUGIN_IDENT_IPV6, IPV6_VERSION,
            IPTUN_MIN_IPV6_MTU, IPTUN_MAX_IPV6_MTU,     B_TRUE },
        { IPTUN_TYPE_6TO4, MAC_PLUGIN_IDENT_6TO4, IPV4_VERSION,
            IPTUN_MIN_IPV6_MTU, IPTUN_MAX_IPV4_MTU,     B_FALSE },
        { IPTUN_TYPE_UNKNOWN, NULL, 0, 0, 0, B_FALSE }
};
 128 
/*
 * iptun_hash is an iptun_t lookup table by link ID protected by
 * iptun_hash_lock.  While the hash table's integrity is maintained via
 * internal locking in the mod_hash_*() functions, we need additional locking
 * so that an iptun_t cannot be deleted after a hash lookup has returned an
 * iptun_t and before iptun_lock has been entered.  As such, we use
 * iptun_hash_lock when doing lookups and removals from iptun_hash.
 */
mod_hash_t      *iptun_hash;
static kmutex_t iptun_hash_lock;

/* Tunnel count reported by iptun_count(). */
static uint_t   iptun_tunnelcount;      /* total for all stacks */
/* NOTE(review): presumably the kmem cache iptun_t's are allocated from. */
kmem_cache_t    *iptun_cache;
/* Task queue on which iptun_task_dispatch() schedules iptun_task_cb(). */
ddi_taskq_t     *iptun_taskq;
 143 
/*
 * Kinds of deferred mac upcalls.  Each value selects which piece of tunnel
 * state iptun_task_cb() snapshots and which mac_*() routine it then calls.
 */
typedef enum {
        IPTUN_TASK_MTU_UPDATE,  /* tell mac about new tunnel link MTU */
        IPTUN_TASK_LADDR_UPDATE, /* tell mac about new local address */
        IPTUN_TASK_RADDR_UPDATE, /* tell mac about new remote address */
        IPTUN_TASK_LINK_UPDATE, /* tell mac about new link state */
        IPTUN_TASK_PDATA_UPDATE /* tell mac about updated plugin data */
} iptun_task_t;

/*
 * Argument handed to iptun_task_cb() through the task queue.  It is
 * allocated in iptun_task_dispatch() and freed by iptun_task_cb().  The
 * tunnel is identified by link ID rather than by pointer, and is looked up
 * again when the task runs.
 */
typedef struct iptun_task_data_s {
        iptun_task_t    itd_task;       /* which upcall to perform */
        datalink_id_t   itd_linkid;     /* link ID of the target tunnel */
} iptun_task_data_t;
 156 
/*
 * Forward declarations for file-local functions that are referenced before
 * their definitions.
 */
static void iptun_task_dispatch(iptun_t *, iptun_task_t);
static int iptun_enter(iptun_t *);
static void iptun_exit(iptun_t *);
static void iptun_headergen(iptun_t *, boolean_t);
static void iptun_drop_pkt(mblk_t *, uint64_t *);
static void iptun_input(void *, mblk_t *, void *, ip_recv_attr_t *);
static void iptun_input_icmp(void *, mblk_t *, void *, ip_recv_attr_t *);
static void iptun_output(iptun_t *, mblk_t *);
static uint32_t iptun_get_maxmtu(iptun_t *, ip_xmit_attr_t *, uint32_t);
static uint32_t iptun_update_mtu(iptun_t *, ip_xmit_attr_t *, uint32_t);
static uint32_t iptun_get_dst_pmtu(iptun_t *, ip_xmit_attr_t *);
static void iptun_update_dst_pmtu(iptun_t *, ip_xmit_attr_t *);
static int iptun_setladdr(iptun_t *, const struct sockaddr_storage *);

static void iptun_output_6to4(iptun_t *, mblk_t *);
static void iptun_output_common(iptun_t *, ip_xmit_attr_t *, mblk_t *);
static boolean_t iptun_verifyicmp(conn_t *, void *, icmph_t *, icmp6_t *,
    ip_recv_attr_t *);

static void iptun_notify(void *, ip_xmit_attr_t *, ixa_notify_type_t,
    ixa_notify_arg_t);

/* Defined later in the file; declared here so earlier code can refer to it. */
static mac_callbacks_t iptun_m_callbacks;
 180 
 181 static int
 182 iptun_m_getstat(void *arg, uint_t stat, uint64_t *val)
 183 {
 184         iptun_t *iptun = arg;
 185         int     err = 0;
 186 
 187         switch (stat) {
 188         case MAC_STAT_IERRORS:
 189                 *val = iptun->iptun_ierrors;
 190                 break;
 191         case MAC_STAT_OERRORS:
 192                 *val = iptun->iptun_oerrors;
 193                 break;
 194         case MAC_STAT_RBYTES:
 195                 *val = iptun->iptun_rbytes;
 196                 break;
 197         case MAC_STAT_IPACKETS:
 198                 *val = iptun->iptun_ipackets;
 199                 break;
 200         case MAC_STAT_OBYTES:
 201                 *val = iptun->iptun_obytes;
 202                 break;
 203         case MAC_STAT_OPACKETS:
 204                 *val = iptun->iptun_opackets;
 205                 break;
 206         case MAC_STAT_NORCVBUF:
 207                 *val = iptun->iptun_norcvbuf;
 208                 break;
 209         case MAC_STAT_NOXMTBUF:
 210                 *val = iptun->iptun_noxmtbuf;
 211                 break;
 212         default:
 213                 err = ENOTSUP;
 214         }
 215 
 216         return (err);
 217 }
 218 
 219 static int
 220 iptun_m_start(void *arg)
 221 {
 222         iptun_t *iptun = arg;
 223         int     err;
 224 
 225         if ((err = iptun_enter(iptun)) == 0) {
 226                 iptun->iptun_flags |= IPTUN_MAC_STARTED;
 227                 iptun_task_dispatch(iptun, IPTUN_TASK_LINK_UPDATE);
 228                 iptun_exit(iptun);
 229         }
 230         return (err);
 231 }
 232 
 233 static void
 234 iptun_m_stop(void *arg)
 235 {
 236         iptun_t *iptun = arg;
 237 
 238         if (iptun_enter(iptun) == 0) {
 239                 iptun->iptun_flags &= ~IPTUN_MAC_STARTED;
 240                 iptun_task_dispatch(iptun, IPTUN_TASK_LINK_UPDATE);
 241                 iptun_exit(iptun);
 242         }
 243 }
 244 
/*
 * iptun_m_setpromisc() does nothing and always succeeds.  This is because a
 * tunnel data-link only ever receives packets that are destined exclusively
 * for the local address of the tunnel.
 *
 * Both arguments are ignored; the return value is always 0.
 */
/* ARGSUSED */
static int
iptun_m_setpromisc(void *arg, boolean_t on)
{
        return (0);
}
 256 
/*
 * mac multicast callback.  Multicast address add/remove requests are not
 * supported: all arguments are ignored and ENOTSUP is always returned.
 */
/* ARGSUSED */
static int
iptun_m_multicst(void *arg, boolean_t add, const uint8_t *addrp)
{
        return (ENOTSUP);
}
 263 
 264 /*
 265  * iptun_m_unicst() sets the local address.
 266  */
 267 /* ARGSUSED */
 268 static int
 269 iptun_m_unicst(void *arg, const uint8_t *addrp)
 270 {
 271         iptun_t                 *iptun = arg;
 272         int                     err;
 273         struct sockaddr_storage ss;
 274         struct sockaddr_in      *sin;
 275         struct sockaddr_in6     *sin6;
 276 
 277         if ((err = iptun_enter(iptun)) == 0) {
 278                 switch (iptun->iptun_typeinfo->iti_ipvers) {
 279                 case IPV4_VERSION:
 280                         sin = (struct sockaddr_in *)&ss;
 281                         sin->sin_family = AF_INET;
 282                         bcopy(addrp, &sin->sin_addr, sizeof (in_addr_t));
 283                         break;
 284                 case IPV6_VERSION:
 285                         sin6 = (struct sockaddr_in6 *)&ss;
 286                         sin6->sin6_family = AF_INET6;
 287                         bcopy(addrp, &sin6->sin6_addr, sizeof (in6_addr_t));
 288                         break;
 289                 default:
 290                         ASSERT(0);
 291                 }
 292                 err = iptun_setladdr(iptun, &ss);
 293                 iptun_exit(iptun);
 294         }
 295         return (err);
 296 }
 297 
 298 static mblk_t *
 299 iptun_m_tx(void *arg, mblk_t *mpchain)
 300 {
 301         mblk_t  *mp, *nmp;
 302         iptun_t *iptun = arg;
 303 
 304         if (!IS_IPTUN_RUNNING(iptun)) {
 305                 iptun_drop_pkt(mpchain, &iptun->iptun_noxmtbuf);
 306                 return (NULL);
 307         }
 308 
 309         for (mp = mpchain; mp != NULL; mp = nmp) {
 310                 nmp = mp->b_next;
 311                 mp->b_next = NULL;
 312                 iptun_output(iptun, mp);
 313         }
 314 
 315         return (NULL);
 316 }
 317 
 318 /* ARGSUSED */
 319 static int
 320 iptun_m_setprop(void *barg, const char *pr_name, mac_prop_id_t pr_num,
 321     uint_t pr_valsize, const void *pr_val)
 322 {
 323         iptun_t         *iptun = barg;
 324         uint32_t        value = *(uint32_t *)pr_val;
 325         int             err;
 326 
 327         /*
 328          * We need to enter this iptun_t since we'll be modifying the outer
 329          * header.
 330          */
 331         if ((err = iptun_enter(iptun)) != 0)
 332                 return (err);
 333 
 334         switch (pr_num) {
 335         case MAC_PROP_IPTUN_HOPLIMIT:
 336                 if (value < IPTUN_MIN_HOPLIMIT || value > IPTUN_MAX_HOPLIMIT) {
 337                         err = EINVAL;
 338                         break;
 339                 }
 340                 if (value != iptun->iptun_hoplimit) {
 341                         iptun->iptun_hoplimit = (uint8_t)value;
 342                         iptun_headergen(iptun, B_TRUE);
 343                 }
 344                 break;
 345         case MAC_PROP_IPTUN_ENCAPLIMIT:
 346                 if (iptun->iptun_typeinfo->iti_type != IPTUN_TYPE_IPV6 ||
 347                     value > IPTUN_MAX_ENCAPLIMIT) {
 348                         err = EINVAL;
 349                         break;
 350                 }
 351                 if (value != iptun->iptun_encaplimit) {
 352                         iptun->iptun_encaplimit = (uint8_t)value;
 353                         iptun_headergen(iptun, B_TRUE);
 354                 }
 355                 break;
 356         case MAC_PROP_MTU: {
 357                 uint32_t maxmtu = iptun_get_maxmtu(iptun, NULL, 0);
 358 
 359                 if (value < iptun->iptun_typeinfo->iti_minmtu ||
 360                     value > maxmtu) {
 361                         err = EINVAL;
 362                         break;
 363                 }
 364                 iptun->iptun_flags |= IPTUN_FIXED_MTU;
 365                 if (value != iptun->iptun_mtu) {
 366                         iptun->iptun_mtu = value;
 367                         iptun_task_dispatch(iptun, IPTUN_TASK_MTU_UPDATE);
 368                 }
 369                 break;
 370         }
 371         default:
 372                 err = EINVAL;
 373         }
 374         iptun_exit(iptun);
 375         return (err);
 376 }
 377 
 378 /* ARGSUSED */
 379 static int
 380 iptun_m_getprop(void *barg, const char *pr_name, mac_prop_id_t pr_num,
 381     uint_t pr_valsize, void *pr_val)
 382 {
 383         iptun_t                 *iptun = barg;
 384         int                     err;
 385 
 386         if ((err = iptun_enter(iptun)) != 0)
 387                 return (err);
 388 
 389         switch (pr_num) {
 390         case MAC_PROP_IPTUN_HOPLIMIT:
 391                 ASSERT(pr_valsize >= sizeof (uint32_t));
 392                 *(uint32_t *)pr_val = iptun->iptun_hoplimit;
 393                 break;
 394 
 395         case MAC_PROP_IPTUN_ENCAPLIMIT:
 396                 *(uint32_t *)pr_val = iptun->iptun_encaplimit;
 397                 break;
 398         default:
 399                 err = ENOTSUP;
 400         }
 401 done:
 402         iptun_exit(iptun);
 403         return (err);
 404 }
 405 
 406 /* ARGSUSED */
 407 static void
 408 iptun_m_propinfo(void *barg, const char *pr_name, mac_prop_id_t pr_num,
 409     mac_prop_info_handle_t prh)
 410 {
 411         iptun_t                 *iptun = barg;
 412 
 413         switch (pr_num) {
 414         case MAC_PROP_IPTUN_HOPLIMIT:
 415                 mac_prop_info_set_range_uint32(prh,
 416                     IPTUN_MIN_HOPLIMIT, IPTUN_MAX_HOPLIMIT);
 417                 mac_prop_info_set_default_uint32(prh, IPTUN_DEFAULT_HOPLIMIT);
 418                 break;
 419 
 420         case MAC_PROP_IPTUN_ENCAPLIMIT:
 421                 if (iptun->iptun_typeinfo->iti_type != IPTUN_TYPE_IPV6)
 422                         break;
 423                 mac_prop_info_set_range_uint32(prh,
 424                     IPTUN_MIN_ENCAPLIMIT, IPTUN_MAX_ENCAPLIMIT);
 425                 mac_prop_info_set_default_uint32(prh, IPTUN_DEFAULT_ENCAPLIMIT);
 426                 break;
 427         case MAC_PROP_MTU:
 428                 mac_prop_info_set_range_uint32(prh,
 429                     iptun->iptun_typeinfo->iti_minmtu,
 430                     iptun_get_maxmtu(iptun, NULL, 0));
 431                 break;
 432         }
 433 }
 434 
/*
 * Return the current value of iptun_tunnelcount (the total for all stacks).
 * The value is read without taking any lock.
 */
uint_t
iptun_count(void)
{
        return (iptun_tunnelcount);
}
 440 
 441 /*
 442  * Enter an iptun_t exclusively.  This is essentially just a mutex, but we
 443  * don't allow iptun_enter() to succeed on a tunnel if it's in the process of
 444  * being deleted.
 445  */
 446 static int
 447 iptun_enter(iptun_t *iptun)
 448 {
 449         mutex_enter(&iptun->iptun_lock);
 450         while (iptun->iptun_flags & IPTUN_DELETE_PENDING)
 451                 cv_wait(&iptun->iptun_enter_cv, &iptun->iptun_lock);
 452         if (iptun->iptun_flags & IPTUN_CONDEMNED) {
 453                 mutex_exit(&iptun->iptun_lock);
 454                 return (ENOENT);
 455         }
 456         return (0);
 457 }
 458 
/*
 * Exit the tunnel entered in iptun_enter().
 *
 * This simply drops iptun_lock, so it must only be called after a
 * successful (0-returning) iptun_enter() on the same iptun_t.
 */
static void
iptun_exit(iptun_t *iptun)
{
        mutex_exit(&iptun->iptun_lock);
}
 467 
 468 /*
 469  * Enter the IP tunnel instance by datalink ID.
 470  */
 471 static int
 472 iptun_enter_by_linkid(datalink_id_t linkid, iptun_t **iptun)
 473 {
 474         int err;
 475 
 476         mutex_enter(&iptun_hash_lock);
 477         if (mod_hash_find(iptun_hash, IPTUN_HASH_KEY(linkid),
 478             (mod_hash_val_t *)iptun) == 0)
 479                 err = iptun_enter(*iptun);
 480         else
 481                 err = ENOENT;
 482         if (err != 0)
 483                 *iptun = NULL;
 484         mutex_exit(&iptun_hash_lock);
 485         return (err);
 486 }
 487 
/*
 * Handle tasks that were deferred through the iptun_taskq because they require
 * calling up to the mac module, and we can't call up to the mac module while
 * holding locks.
 *
 * This is tricky to get right without introducing race conditions and
 * deadlocks with the mac module, as we cannot issue an upcall while in the
 * iptun_t.  The reason is that upcalls may try and enter the mac perimeter,
 * while iptun callbacks (such as iptun_m_setprop()) called from the mac
 * module will already have the perimeter held, and will then try and enter
 * the iptun_t.  You can see the lock ordering problem with this; this will
 * deadlock.
 *
 * The safe way to do this is to enter the iptun_t in question and copy the
 * information we need out of it so that we can exit it and know that the
 * information being passed up to the upcalls won't be subject to modification
 * by other threads.  The problem now is that we need to exit it prior to
 * issuing the upcall, but once we do this, a thread could come along and
 * delete the iptun_t and thus the mac handle required to issue the upcall.
 * To prevent this, we set the IPTUN_UPCALL_PENDING flag prior to exiting the
 * iptun_t.  This flag is the condition associated with iptun_upcall_cv, which
 * iptun_delete() will cv_wait() on.  When the upcall completes, we clear
 * IPTUN_UPCALL_PENDING and cv_signal() any potentially waiting
 * iptun_delete().  We can thus still safely use iptun->iptun_mh after having
 * exited the iptun_t.
 *
 * arg is the iptun_task_data_t allocated by iptun_task_dispatch(); it is
 * freed here once its contents have been copied out.
 */
static void
iptun_task_cb(void *arg)
{
        iptun_task_data_t       *itd = arg;
        iptun_task_t            task = itd->itd_task;
        datalink_id_t           linkid = itd->itd_linkid;
        iptun_t                 *iptun;
        uint32_t                mtu;
        iptun_addr_t            addr;
        link_state_t            linkstate;
        size_t                  header_size;
        iptun_header_t          header;

        /* The request has been copied into locals; release it right away. */
        kmem_free(itd, sizeof (*itd));

        /*
         * Note that if the lookup fails, it's because the tunnel was deleted
         * between the time the task was dispatched and now.  That isn't an
         * error.
         */
        if (iptun_enter_by_linkid(linkid, &iptun) != 0)
                return;

        iptun->iptun_flags |= IPTUN_UPCALL_PENDING;

        /*
         * Phase 1 (inside the iptun_t): snapshot only the state needed by
         * the upcall selected by `task'.
         */
        switch (task) {
        case IPTUN_TASK_MTU_UPDATE:
                mtu = iptun->iptun_mtu;
                break;
        case IPTUN_TASK_LADDR_UPDATE:
                addr = iptun->iptun_laddr;
                break;
        case IPTUN_TASK_RADDR_UPDATE:
                addr = iptun->iptun_raddr;
                break;
        case IPTUN_TASK_LINK_UPDATE:
                linkstate = IS_IPTUN_RUNNING(iptun) ?
                    LINK_STATE_UP : LINK_STATE_DOWN;
                break;
        case IPTUN_TASK_PDATA_UPDATE:
                header_size = iptun->iptun_header_size;
                header = iptun->iptun_header;
                break;
        default:
                ASSERT(0);
        }

        iptun_exit(iptun);

        /*
         * Phase 2 (outside the iptun_t): issue the upcall using the
         * snapshot taken above.
         */
        switch (task) {
        case IPTUN_TASK_MTU_UPDATE:
                (void) mac_maxsdu_update(iptun->iptun_mh, mtu);
                break;
        case IPTUN_TASK_LADDR_UPDATE:
                mac_unicst_update(iptun->iptun_mh, (uint8_t *)&addr.ia_addr);
                break;
        case IPTUN_TASK_RADDR_UPDATE:
                mac_dst_update(iptun->iptun_mh, (uint8_t *)&addr.ia_addr);
                break;
        case IPTUN_TASK_LINK_UPDATE:
                mac_link_update(iptun->iptun_mh, linkstate);
                break;
        case IPTUN_TASK_PDATA_UPDATE:
                /* A zero-length header is passed up as "no plugin data". */
                if (mac_pdata_update(iptun->iptun_mh,
                    header_size == 0 ? NULL : &header, header_size) != 0)
                        atomic_inc_64(&iptun->iptun_taskq_fail);
                break;
        }

        /*
         * Take the lock directly rather than through iptun_enter(), then
         * clear the pending flag and signal iptun_upcall_cv.
         */
        mutex_enter(&iptun->iptun_lock);
        iptun->iptun_flags &= ~IPTUN_UPCALL_PENDING;
        cv_signal(&iptun->iptun_upcall_cv);
        mutex_exit(&iptun->iptun_lock);
}
 588 
 589 static void
 590 iptun_task_dispatch(iptun_t *iptun, iptun_task_t iptun_task)
 591 {
 592         iptun_task_data_t *itd;
 593 
 594         itd = kmem_alloc(sizeof (*itd), KM_NOSLEEP);
 595         if (itd == NULL) {
 596                 atomic_inc_64(&iptun->iptun_taskq_fail);
 597                 return;
 598         }
 599         itd->itd_task = iptun_task;
 600         itd->itd_linkid = iptun->iptun_linkid;
 601         if (ddi_taskq_dispatch(iptun_taskq, iptun_task_cb, itd, DDI_NOSLEEP)) {
 602                 atomic_inc_64(&iptun->iptun_taskq_fail);
 603                 kmem_free(itd, sizeof (*itd));
 604         }
 605 }
 606 
 607 /*
 608  * Convert an iptun_addr_t to sockaddr_storage.
 609  */
 610 static void
 611 iptun_getaddr(iptun_addr_t *iptun_addr, struct sockaddr_storage *ss)
 612 {
 613         struct sockaddr_in      *sin;
 614         struct sockaddr_in6     *sin6;
 615 
 616         bzero(ss, sizeof (*ss));
 617         switch (iptun_addr->ia_family) {
 618         case AF_INET:
 619                 sin = (struct sockaddr_in *)ss;
 620                 sin->sin_addr.s_addr = iptun_addr->ia_addr.iau_addr4;
 621                 break;
 622         case AF_INET6:
 623                 sin6 = (struct sockaddr_in6 *)ss;
 624                 sin6->sin6_addr = iptun_addr->ia_addr.iau_addr6;
 625                 break;
 626         default:
 627                 ASSERT(0);
 628         }
 629         ss->ss_family = iptun_addr->ia_family;
 630 }
 631 
 632 /*
 633  * General purpose function to set an IP tunnel source or destination address.
 634  */
 635 static int
 636 iptun_setaddr(iptun_type_t iptun_type, iptun_addr_t *iptun_addr,
 637     const struct sockaddr_storage *ss)
 638 {
 639         if (!IPTUN_ADDR_MATCH(iptun_type, ss->ss_family))
 640                 return (EINVAL);
 641 
 642         switch (ss->ss_family) {
 643         case AF_INET: {
 644                 struct sockaddr_in *sin = (struct sockaddr_in *)ss;
 645 
 646                 if ((sin->sin_addr.s_addr == INADDR_ANY) ||
 647                     (sin->sin_addr.s_addr == INADDR_BROADCAST) ||
 648                     CLASSD(sin->sin_addr.s_addr)) {
 649                         return (EADDRNOTAVAIL);
 650                 }
 651                 iptun_addr->ia_addr.iau_addr4 = sin->sin_addr.s_addr;
 652                 break;
 653         }
 654         case AF_INET6: {
 655                 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)ss;
 656 
 657                 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) ||
 658                     IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr) ||
 659                     IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
 660                         return (EADDRNOTAVAIL);
 661                 }
 662                 iptun_addr->ia_addr.iau_addr6 = sin6->sin6_addr;
 663                 break;
 664         }
 665         default:
 666                 return (EAFNOSUPPORT);
 667         }
 668         iptun_addr->ia_family = ss->ss_family;
 669         return (0);
 670 }
 671 
 672 static int
 673 iptun_setladdr(iptun_t *iptun, const struct sockaddr_storage *laddr)
 674 {
 675         return (iptun_setaddr(iptun->iptun_typeinfo->iti_type,
 676             &iptun->iptun_laddr, laddr));
 677 }
 678 
 679 static int
 680 iptun_setraddr(iptun_t *iptun, const struct sockaddr_storage *raddr)
 681 {
 682         if (!(iptun->iptun_typeinfo->iti_hasraddr))
 683                 return (EINVAL);
 684         return (iptun_setaddr(iptun->iptun_typeinfo->iti_type,
 685             &iptun->iptun_raddr, raddr));
 686 }
 687 
 688 static boolean_t
 689 iptun_canbind(iptun_t *iptun)
 690 {
 691         /*
 692          * A tunnel may bind when its source address has been set, and if its
 693          * tunnel type requires one, also its destination address.
 694          */
 695         return ((iptun->iptun_flags & IPTUN_LADDR) &&
 696             ((iptun->iptun_flags & IPTUN_RADDR) ||
 697             !(iptun->iptun_typeinfo->iti_hasraddr)));
 698 }
 699 
/*
 * Verify that the local address is valid, and insert in the fanout
 *
 * Works on a private ip_xmit_attr_t obtained from conn_get_ixa(); it only
 * replaces the conn's ixa (via conn_replace_ixa()) once everything else has
 * succeeded.  On success the tunnel is flagged IPTUN_BOUND, its MTU is
 * recomputed, and a link-state upcall is scheduled if the tunnel is running.
 *
 * Returns 0 on success; ENOMEM if no ixa could be obtained; EADDRNOTAVAIL if
 * the local address fails verification; or the error returned by
 * ip_attr_connect() or ipcl_conn_insert().
 */
static int
iptun_bind(iptun_t *iptun)
{
        conn_t                  *connp = iptun->iptun_connp;
        int                     error = 0;
        ip_xmit_attr_t          *ixa;
        ip_xmit_attr_t          *oldixa;
        iulp_t                  uinfo;
        ip_stack_t              *ipst = connp->conn_netstack->netstack_ip;

        /*
         * Get an exclusive ixa for this thread.
         * We defer updating conn_ixa until later to handle any concurrent
         * conn_ixa_cleanup thread.
         */
        ixa = conn_get_ixa(connp, B_FALSE);
        if (ixa == NULL)
                return (ENOMEM);

        /* We create PMTU state including for 6to4 */
        ixa->ixa_flags |= IXAF_PMTU_DISCOVERY;

        ASSERT(iptun_canbind(iptun));

        mutex_enter(&connp->conn_lock);
        /*
         * Note that conn_proto can't be set since the upper protocol
         * can be both 41 and 4 when IPv6 and IPv4 are over the same tunnel.
         * ipcl_iptun_classify doesn't use conn_proto.
         */
        connp->conn_ipversion = iptun->iptun_typeinfo->iti_ipvers;

        /*
         * Load the conn's local/foreign addresses and verify the local one.
         * Every failure path below drops conn_lock before jumping to done.
         */
        switch (iptun->iptun_typeinfo->iti_type) {
        case IPTUN_TYPE_IPV4:
                IN6_IPADDR_TO_V4MAPPED(iptun->iptun_laddr4,
                    &connp->conn_laddr_v6);
                IN6_IPADDR_TO_V4MAPPED(iptun->iptun_raddr4,
                    &connp->conn_faddr_v6);
                ixa->ixa_flags |= IXAF_IS_IPV4;
                if (ip_laddr_verify_v4(iptun->iptun_laddr4, IPCL_ZONEID(connp),
                    ipst, B_FALSE) != IPVL_UNICAST_UP) {
                        mutex_exit(&connp->conn_lock);
                        error = EADDRNOTAVAIL;
                        goto done;
                }
                break;
        case IPTUN_TYPE_IPV6:
                connp->conn_laddr_v6 = iptun->iptun_laddr6;
                connp->conn_faddr_v6 = iptun->iptun_raddr6;
                ixa->ixa_flags &= ~IXAF_IS_IPV4;
                /* We use a zero scopeid for now */
                if (ip_laddr_verify_v6(&iptun->iptun_laddr6, IPCL_ZONEID(connp),
                    ipst, B_FALSE, 0) != IPVL_UNICAST_UP) {
                        mutex_exit(&connp->conn_lock);
                        error = EADDRNOTAVAIL;
                        goto done;
                }
                break;
        case IPTUN_TYPE_6TO4:
                /*
                 * 6to4 has no fixed remote address: the foreign address is
                 * set to INADDR_ANY, a local address that is currently down
                 * is also accepted, and the ip_attr_connect() step below is
                 * skipped by jumping straight to the insert label.
                 */
                IN6_IPADDR_TO_V4MAPPED(iptun->iptun_laddr4,
                    &connp->conn_laddr_v6);
                IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &connp->conn_faddr_v6);
                ixa->ixa_flags |= IXAF_IS_IPV4;
                mutex_exit(&connp->conn_lock);

                switch (ip_laddr_verify_v4(iptun->iptun_laddr4,
                    IPCL_ZONEID(connp), ipst, B_FALSE)) {
                case IPVL_UNICAST_UP:
                case IPVL_UNICAST_DOWN:
                        break;
                default:
                        error = EADDRNOTAVAIL;
                        goto done;
                }
                goto insert;
        }

        /* In case previous destination was multirt */
        ip_attr_newdst(ixa);

        /*
         * When we set a tunnel's destination address, we do not
         * care if the destination is reachable.  Transient routing
         * issues should not inhibit the creation of a tunnel
         * interface, for example. Thus we pass B_FALSE here.
         *
         * NOTE(review): no B_FALSE argument is passed in the statements
         * that follow (ip_attr_connect() below is called with a flags
         * argument of 0); the last sentence above looks stale -- confirm.
         */
        connp->conn_saddr_v6 = connp->conn_laddr_v6;
        mutex_exit(&connp->conn_lock);

        /* As long as the MTU is large we avoid fragmentation */
        ixa->ixa_flags |= IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF;

        /* We handle IPsec in iptun_output_common */
        error = ip_attr_connect(connp, ixa, &connp->conn_saddr_v6,
            &connp->conn_faddr_v6, &connp->conn_faddr_v6, 0,
            &connp->conn_saddr_v6, &uinfo, 0);

        if (error != 0)
                goto done;

        /* saddr shouldn't change since it was already set */
        ASSERT(IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6,
            &connp->conn_saddr_v6));

        /* We set IXAF_VERIFY_PMTU to catch PMTU increases */
        ixa->ixa_flags |= IXAF_VERIFY_PMTU;
        ASSERT(uinfo.iulp_mtu != 0);

        /*
         * Allow setting new policies.
         * The addresses/ports are already set, thus the IPsec policy calls
         * can handle their passed-in conn's.
         */
        connp->conn_policy_cached = B_FALSE;

insert:
        error = ipcl_conn_insert(connp);
        if (error != 0)
                goto done;

        /* Atomically update v6lastdst and conn_ixa */
        mutex_enter(&connp->conn_lock);
        /* Record this as the "last" send even though we haven't sent any */
        connp->conn_v6lastdst = connp->conn_faddr_v6;

        iptun->iptun_flags |= IPTUN_BOUND;

        oldixa = conn_replace_ixa(connp, ixa);
        /* Done with conn_t */
        mutex_exit(&connp->conn_lock);
        /* Drop the reference on the ixa that was just replaced. */
        ixa_refrele(oldixa);

        /*
         * Now that we're bound with ip below us, this is a good
         * time to initialize the destination path MTU and to
         * re-calculate the tunnel's link MTU.
         */
        (void) iptun_update_mtu(iptun, ixa, 0);

        if (IS_IPTUN_RUNNING(iptun))
                iptun_task_dispatch(iptun, IPTUN_TASK_LINK_UPDATE);

done:
        /* Success and failure paths both release this thread's ixa hold. */
        ixa_refrele(ixa);
        return (error);
}
 849 
 850 static void
 851 iptun_unbind(iptun_t *iptun)
 852 {
 853         ASSERT(iptun->iptun_flags & IPTUN_BOUND);
 854         ASSERT(mutex_owned(&iptun->iptun_lock) ||
 855             (iptun->iptun_flags & IPTUN_CONDEMNED));
 856         ip_unbind(iptun->iptun_connp);
 857         iptun->iptun_flags &= ~IPTUN_BOUND;
 858         if (!(iptun->iptun_flags & IPTUN_CONDEMNED))
 859                 iptun_task_dispatch(iptun, IPTUN_TASK_LINK_UPDATE);
 860 }
 861 
 862 /*
 863  * Re-generate the template data-link header for a given IP tunnel given the
 864  * tunnel's current parameters.
 865  */
 866 static void
 867 iptun_headergen(iptun_t *iptun, boolean_t update_mac)
 868 {
 869         switch (iptun->iptun_typeinfo->iti_ipvers) {
 870         case IPV4_VERSION:
 871                 /*
 872                  * We only need to use a custom IP header if the administrator
 873                  * has supplied a non-default hoplimit.
 874                  */
 875                 if (iptun->iptun_hoplimit == IPTUN_DEFAULT_HOPLIMIT) {
 876                         iptun->iptun_header_size = 0;
 877                         break;
 878                 }
 879                 iptun->iptun_header_size = sizeof (ipha_t);
 880                 iptun->iptun_header4.ipha_version_and_hdr_length =
 881                     IP_SIMPLE_HDR_VERSION;
 882                 iptun->iptun_header4.ipha_fragment_offset_and_flags =
 883                     htons(IPH_DF);
 884                 iptun->iptun_header4.ipha_ttl = iptun->iptun_hoplimit;
 885                 break;
 886         case IPV6_VERSION: {
 887                 ip6_t   *ip6hp = &iptun->iptun_header6.it6h_ip6h;
 888 
 889                 /*
 890                  * We only need to use a custom IPv6 header if either the
 891                  * administrator has supplied a non-default hoplimit, or we
 892                  * need to include an encapsulation limit option in the outer
 893                  * header.
 894                  */
 895                 if (iptun->iptun_hoplimit == IPTUN_DEFAULT_HOPLIMIT &&
 896                     iptun->iptun_encaplimit == 0) {
 897                         iptun->iptun_header_size = 0;
 898                         break;
 899                 }
 900 
 901                 (void) memset(ip6hp, 0, sizeof (*ip6hp));
 902                 if (iptun->iptun_encaplimit == 0) {
 903                         iptun->iptun_header_size = sizeof (ip6_t);
 904                         ip6hp->ip6_nxt = IPPROTO_NONE;
 905                 } else {
 906                         iptun_encaplim_t        *iel;
 907 
 908                         iptun->iptun_header_size = sizeof (iptun_ipv6hdrs_t);
 909                         /*
 910                          * The mac_ipv6 plugin requires ip6_plen to be in host
 911                          * byte order and reflect the extension headers
 912                          * present in the template.  The actual network byte
 913                          * order ip6_plen will be set on a per-packet basis on
 914                          * transmit.
 915                          */
 916                         ip6hp->ip6_plen = sizeof (*iel);
 917                         ip6hp->ip6_nxt = IPPROTO_DSTOPTS;
 918                         iel = &iptun->iptun_header6.it6h_encaplim;
 919                         *iel = iptun_encaplim_init;
 920                         iel->iel_telopt.ip6ot_encap_limit =
 921                             iptun->iptun_encaplimit;
 922                 }
 923 
 924                 ip6hp->ip6_hlim = iptun->iptun_hoplimit;
 925                 break;
 926         }
 927         }
 928 
 929         if (update_mac)
 930                 iptun_task_dispatch(iptun, IPTUN_TASK_PDATA_UPDATE);
 931 }
 932 
 933 /*
 934  * Insert inbound and outbound IPv4 and IPv6 policy into the given policy
 935  * head.
 936  */
 937 static boolean_t
 938 iptun_insert_simple_policies(ipsec_policy_head_t *ph, ipsec_act_t *actp,
 939     uint_t n, netstack_t *ns)
 940 {
 941         int f = IPSEC_AF_V4;
 942 
 943         if (!ipsec_polhead_insert(ph, actp, n, f, IPSEC_TYPE_INBOUND, ns) ||
 944             !ipsec_polhead_insert(ph, actp, n, f, IPSEC_TYPE_OUTBOUND, ns))
 945                 return (B_FALSE);
 946 
 947         f = IPSEC_AF_V6;
 948         return (ipsec_polhead_insert(ph, actp, n, f, IPSEC_TYPE_INBOUND, ns) &&
 949             ipsec_polhead_insert(ph, actp, n, f, IPSEC_TYPE_OUTBOUND, ns));
 950 }
 951 
 952 /*
 953  * Used to set IPsec policy when policy is set through the IPTUN_CREATE or
 954  * IPTUN_MODIFY ioctls.
 955  */
 956 static int
 957 iptun_set_sec_simple(iptun_t *iptun, const ipsec_req_t *ipsr)
 958 {
 959         int             rc = 0;
 960         uint_t          nact;
 961         ipsec_act_t     *actp = NULL;
 962         boolean_t       clear_all, old_policy = B_FALSE;
 963         ipsec_tun_pol_t *itp;
 964         char            name[MAXLINKNAMELEN];
 965         uint64_t        gen;
 966         netstack_t      *ns = iptun->iptun_ns;
 967 
 968         /* Can't specify self-encap on a tunnel. */
 969         if (ipsr->ipsr_self_encap_req != 0)
 970                 return (EINVAL);
 971 
 972         /*
 973          * If it's a "clear-all" entry, unset the security flags and resume
 974          * normal cleartext (or inherit-from-global) policy.
 975          */
 976         clear_all = ((ipsr->ipsr_ah_req & IPTUN_IPSEC_REQ_MASK) == 0 &&
 977             (ipsr->ipsr_esp_req & IPTUN_IPSEC_REQ_MASK) == 0);
 978 
 979         ASSERT(mutex_owned(&iptun->iptun_lock));
 980         itp = iptun->iptun_itp;
 981         if (itp == NULL) {
 982                 if (clear_all)
 983                         goto bail;
 984                 if ((rc = dls_mgmt_get_linkinfo(iptun->iptun_linkid, name, NULL,
 985                     NULL, NULL)) != 0)
 986                         goto bail;
 987                 ASSERT(name[0] != '\0');
 988                 if ((itp = create_tunnel_policy(name, &rc, &gen, ns)) == NULL)
 989                         goto bail;
 990                 iptun->iptun_itp = itp;
 991         }
 992 
 993         /* Allocate the actvec now, before holding itp or polhead locks. */
 994         ipsec_actvec_from_req(ipsr, &actp, &nact, ns);
 995         if (actp == NULL) {
 996                 rc = ENOMEM;
 997                 goto bail;
 998         }
 999 
1000         /*
1001          * Just write on the active polhead.  Save the primary/secondary stuff
1002          * for spdsock operations.
1003          *
1004          * Mutex because we need to write to the polhead AND flags atomically.
1005          * Other threads will acquire the polhead lock as a reader if the
1006          * (unprotected) flag is set.
1007          */
1008         mutex_enter(&itp->itp_lock);
1009         if (itp->itp_flags & ITPF_P_TUNNEL) {
1010                 /* Oops, we lost a race.  Let's get out of here. */
1011                 rc = EBUSY;
1012                 goto mutex_bail;
1013         }
1014         old_policy = ((itp->itp_flags & ITPF_P_ACTIVE) != 0);
1015 
1016         if (old_policy) {
1017                 ITPF_CLONE(itp->itp_flags);
1018                 rc = ipsec_copy_polhead(itp->itp_policy, itp->itp_inactive, ns);
1019                 if (rc != 0) {
1020                         /* inactive has already been cleared. */
1021                         itp->itp_flags &= ~ITPF_IFLAGS;
1022                         goto mutex_bail;
1023                 }
1024                 rw_enter(&itp->itp_policy->iph_lock, RW_WRITER);
1025                 ipsec_polhead_flush(itp->itp_policy, ns);
1026         } else {
1027                 /* Else assume itp->itp_policy is already flushed. */
1028                 rw_enter(&itp->itp_policy->iph_lock, RW_WRITER);
1029         }
1030 
1031         if (clear_all) {
1032                 ASSERT(avl_numnodes(&itp->itp_policy->iph_rulebyid) == 0);
1033                 itp->itp_flags &= ~ITPF_PFLAGS;
1034                 rw_exit(&itp->itp_policy->iph_lock);
1035                 old_policy = B_FALSE;   /* Clear out the inactive one too. */
1036                 goto recover_bail;
1037         }
1038 
1039         if (iptun_insert_simple_policies(itp->itp_policy, actp, nact, ns)) {
1040                 rw_exit(&itp->itp_policy->iph_lock);
1041                 /*
1042                  * Adjust MTU and make sure the DL side knows what's up.
1043                  */
1044                 itp->itp_flags = ITPF_P_ACTIVE;
1045                 (void) iptun_update_mtu(iptun, NULL, 0);
1046                 old_policy = B_FALSE;   /* Blank out inactive - we succeeded */
1047         } else {
1048                 rw_exit(&itp->itp_policy->iph_lock);
1049                 rc = ENOMEM;
1050         }
1051 
1052 recover_bail:
1053         if (old_policy) {
1054                 /* Recover policy in in active polhead. */
1055                 ipsec_swap_policy(itp->itp_policy, itp->itp_inactive, ns);
1056                 ITPF_SWAP(itp->itp_flags);
1057         }
1058 
1059         /* Clear policy in inactive polhead. */
1060         itp->itp_flags &= ~ITPF_IFLAGS;
1061         rw_enter(&itp->itp_inactive->iph_lock, RW_WRITER);
1062         ipsec_polhead_flush(itp->itp_inactive, ns);
1063         rw_exit(&itp->itp_inactive->iph_lock);
1064 
1065 mutex_bail:
1066         mutex_exit(&itp->itp_lock);
1067 
1068 bail:
1069         if (actp != NULL)
1070                 ipsec_actvec_free(actp, nact);
1071 
1072         return (rc);
1073 }
1074 
1075 static iptun_typeinfo_t *
1076 iptun_gettypeinfo(iptun_type_t type)
1077 {
1078         int i;
1079 
1080         for (i = 0; iptun_type_table[i].iti_type != IPTUN_TYPE_UNKNOWN; i++) {
1081                 if (iptun_type_table[i].iti_type == type)
1082                         break;
1083         }
1084         return (&iptun_type_table[i]);
1085 }
1086 
1087 /*
1088  * Set the parameters included in ik on the tunnel iptun.  Parameters that can
1089  * only be set at creation time are set in iptun_create().
1090  */
1091 static int
1092 iptun_setparams(iptun_t *iptun, const iptun_kparams_t *ik)
1093 {
1094         int             err = 0;
1095         netstack_t      *ns = iptun->iptun_ns;
1096         iptun_addr_t    orig_laddr, orig_raddr;
1097         uint_t          orig_flags = iptun->iptun_flags;
1098 
1099         if (ik->iptun_kparam_flags & IPTUN_KPARAM_LADDR) {
1100                 if (orig_flags & IPTUN_LADDR)
1101                         orig_laddr = iptun->iptun_laddr;
1102                 if ((err = iptun_setladdr(iptun, &ik->iptun_kparam_laddr)) != 0)
1103                         return (err);
1104                 iptun->iptun_flags |= IPTUN_LADDR;
1105         }
1106 
1107         if (ik->iptun_kparam_flags & IPTUN_KPARAM_RADDR) {
1108                 if (orig_flags & IPTUN_RADDR)
1109                         orig_raddr = iptun->iptun_raddr;
1110                 if ((err = iptun_setraddr(iptun, &ik->iptun_kparam_raddr)) != 0)
1111                         goto done;
1112                 iptun->iptun_flags |= IPTUN_RADDR;
1113         }
1114 
1115         if (ik->iptun_kparam_flags & IPTUN_KPARAM_SECINFO) {
1116                 /*
1117                  * Set IPsec policy originating from the ifconfig(1M) command
1118                  * line.  This is traditionally called "simple" policy because
1119                  * the ipsec_req_t (iptun_kparam_secinfo) can only describe a
1120                  * simple policy of "do ESP on everything" and/or "do AH on
1121                  * everything" (as opposed to the rich policy that can be
1122                  * defined with ipsecconf(1M)).
1123                  */
1124                 if (iptun->iptun_typeinfo->iti_type == IPTUN_TYPE_6TO4) {
1125                         /*
1126                          * Can't set security properties for automatic
1127                          * tunnels.
1128                          */
1129                         err = EINVAL;
1130                         goto done;
1131                 }
1132 
1133                 if (!ipsec_loaded(ns->netstack_ipsec)) {
1134                         /* If IPsec can be loaded, try and load it now. */
1135                         if (ipsec_failed(ns->netstack_ipsec)) {
1136                                 err = EPROTONOSUPPORT;
1137                                 goto done;
1138                         }
1139                         ipsec_loader_loadnow(ns->netstack_ipsec);
1140                         /*
1141                          * ipsec_loader_loadnow() returns while IPsec is
1142                          * loaded asynchronously.  While a method exists to
1143                          * wait for IPsec to load (ipsec_loader_wait()), it
1144                          * requires use of a STREAMS queue to do a qwait().
1145                          * We're not in STREAMS context here, and so we can't
1146                          * use it.  This is not a problem in practice because
1147                          * in the vast majority of cases, key management and
1148                          * global policy will have loaded before any tunnels
1149                          * are plumbed, and so IPsec will already have been
1150                          * loaded.
1151                          */
1152                         err = EAGAIN;
1153                         goto done;
1154                 }
1155 
1156                 err = iptun_set_sec_simple(iptun, &ik->iptun_kparam_secinfo);
1157                 if (err == 0) {
1158                         iptun->iptun_flags |= IPTUN_SIMPLE_POLICY;
1159                         iptun->iptun_simple_policy = ik->iptun_kparam_secinfo;
1160                 }
1161         }
1162 done:
1163         if (err != 0) {
1164                 /* Restore original source and destination. */
1165                 if (ik->iptun_kparam_flags & IPTUN_KPARAM_LADDR &&
1166                     (orig_flags & IPTUN_LADDR))
1167                         iptun->iptun_laddr = orig_laddr;
1168                 if ((ik->iptun_kparam_flags & IPTUN_KPARAM_RADDR) &&
1169                     (orig_flags & IPTUN_RADDR))
1170                         iptun->iptun_raddr = orig_raddr;
1171                 iptun->iptun_flags = orig_flags;
1172         }
1173         return (err);
1174 }
1175 
1176 static int
1177 iptun_register(iptun_t *iptun)
1178 {
1179         mac_register_t  *mac;
1180         int             err;
1181 
1182         ASSERT(!(iptun->iptun_flags & IPTUN_MAC_REGISTERED));
1183 
1184         if ((mac = mac_alloc(MAC_VERSION)) == NULL)
1185                 return (EINVAL);
1186 
1187         mac->m_type_ident = iptun->iptun_typeinfo->iti_ident;
1188         mac->m_driver = iptun;
1189         mac->m_dip = iptun_dip;
1190         mac->m_instance = (uint_t)-1;
1191         mac->m_src_addr = (uint8_t *)&iptun->iptun_laddr.ia_addr;
1192         mac->m_dst_addr = iptun->iptun_typeinfo->iti_hasraddr ?
1193             (uint8_t *)&iptun->iptun_raddr.ia_addr : NULL;
1194         mac->m_callbacks = &iptun_m_callbacks;
1195         mac->m_min_sdu = iptun->iptun_typeinfo->iti_minmtu;
1196         mac->m_max_sdu = iptun->iptun_mtu;
1197         if (iptun->iptun_header_size != 0) {
1198                 mac->m_pdata = &iptun->iptun_header;
1199                 mac->m_pdata_size = iptun->iptun_header_size;
1200         }
1201         if ((err = mac_register(mac, &iptun->iptun_mh)) == 0)
1202                 iptun->iptun_flags |= IPTUN_MAC_REGISTERED;
1203         mac_free(mac);
1204         return (err);
1205 }
1206 
1207 static int
1208 iptun_unregister(iptun_t *iptun)
1209 {
1210         int err;
1211 
1212         ASSERT(iptun->iptun_flags & IPTUN_MAC_REGISTERED);
1213         if ((err = mac_unregister(iptun->iptun_mh)) == 0)
1214                 iptun->iptun_flags &= ~IPTUN_MAC_REGISTERED;
1215         return (err);
1216 }
1217 
1218 static conn_t *
1219 iptun_conn_create(iptun_t *iptun, netstack_t *ns, cred_t *credp)
1220 {
1221         conn_t *connp;
1222 
1223         if ((connp = ipcl_conn_create(IPCL_IPCCONN, KM_NOSLEEP, ns)) == NULL)
1224                 return (NULL);
1225 
1226         connp->conn_flags |= IPCL_IPTUN;
1227         connp->conn_iptun = iptun;
1228         connp->conn_recv = iptun_input;
1229         connp->conn_recvicmp = iptun_input_icmp;
1230         connp->conn_verifyicmp = iptun_verifyicmp;
1231 
1232         /*
1233          * Register iptun_notify to listen to capability changes detected by IP.
1234          * This upcall is made in the context of the call to conn_ip_output.
1235          */
1236         connp->conn_ixa->ixa_notify = iptun_notify;
1237         connp->conn_ixa->ixa_notify_cookie = iptun;
1238 
1239         /*
1240          * For exclusive stacks we set conn_zoneid to GLOBAL_ZONEID as is done
1241          * for all other conn_t's.
1242          *
1243          * Note that there's an important distinction between iptun_zoneid and
1244          * conn_zoneid.  The conn_zoneid is set to GLOBAL_ZONEID in non-global
1245          * exclusive stack zones to make the ip module believe that the
1246          * non-global zone is actually a global zone.  Therefore, when
1247          * interacting with the ip module, we must always use conn_zoneid.
1248          */
1249         connp->conn_zoneid = (ns->netstack_stackid == GLOBAL_NETSTACKID) ?
1250             crgetzoneid(credp) : GLOBAL_ZONEID;
1251         connp->conn_cred = credp;
1252         /* crfree() is done in ipcl_conn_destroy(), called by CONN_DEC_REF() */
1253         crhold(connp->conn_cred);
1254         connp->conn_cpid = NOPID;
1255 
1256         /* conn_allzones can not be set this early, hence no IPCL_ZONEID */
1257         connp->conn_ixa->ixa_zoneid = connp->conn_zoneid;
1258         ASSERT(connp->conn_ref == 1);
1259 
1260         /* Cache things in ixa without an extra refhold */
1261         ASSERT(!(connp->conn_ixa->ixa_free_flags & IXA_FREE_CRED));
1262         connp->conn_ixa->ixa_cred = connp->conn_cred;
1263         connp->conn_ixa->ixa_cpid = connp->conn_cpid;
1264         if (is_system_labeled())
1265                 connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred);
1266 
1267         /*
1268          * Have conn_ip_output drop packets should our outer source
1269          * go invalid
1270          */
1271         connp->conn_ixa->ixa_flags |= IXAF_VERIFY_SOURCE;
1272 
1273         switch (iptun->iptun_typeinfo->iti_ipvers) {
1274         case IPV4_VERSION:
1275                 connp->conn_family = AF_INET6;
1276                 break;
1277         case IPV6_VERSION:
1278                 connp->conn_family = AF_INET;
1279                 break;
1280         }
1281         mutex_enter(&connp->conn_lock);
1282         connp->conn_state_flags &= ~CONN_INCIPIENT;
1283         mutex_exit(&connp->conn_lock);
1284         return (connp);
1285 }
1286 
1287 static void
1288 iptun_conn_destroy(conn_t *connp)
1289 {
1290         ip_quiesce_conn(connp);
1291         connp->conn_iptun = NULL;
1292         ASSERT(connp->conn_ref == 1);
1293         CONN_DEC_REF(connp);
1294 }
1295 
1296 static iptun_t *
1297 iptun_alloc(void)
1298 {
1299         iptun_t *iptun;
1300 
1301         if ((iptun = kmem_cache_alloc(iptun_cache, KM_NOSLEEP)) != NULL) {
1302                 bzero(iptun, sizeof (*iptun));
1303                 atomic_inc_32(&iptun_tunnelcount);
1304         }
1305         return (iptun);
1306 }
1307 
1308 static void
1309 iptun_free(iptun_t *iptun)
1310 {
1311         ASSERT(iptun->iptun_flags & IPTUN_CONDEMNED);
1312 
1313         if (iptun->iptun_flags & IPTUN_HASH_INSERTED) {
1314                 iptun_stack_t   *iptuns = iptun->iptun_iptuns;
1315 
1316                 mutex_enter(&iptun_hash_lock);
1317                 VERIFY(mod_hash_remove(iptun_hash,
1318                     IPTUN_HASH_KEY(iptun->iptun_linkid),
1319                     (mod_hash_val_t *)&iptun) == 0);
1320                 mutex_exit(&iptun_hash_lock);
1321                 iptun->iptun_flags &= ~IPTUN_HASH_INSERTED;
1322                 mutex_enter(&iptuns->iptuns_lock);
1323                 list_remove(&iptuns->iptuns_iptunlist, iptun);
1324                 mutex_exit(&iptuns->iptuns_lock);
1325         }
1326 
1327         if (iptun->iptun_flags & IPTUN_BOUND)
1328                 iptun_unbind(iptun);
1329 
1330         /*
1331          * After iptun_unregister(), there will be no threads executing a
1332          * downcall from the mac module, including in the tx datapath.
1333          */
1334         if (iptun->iptun_flags & IPTUN_MAC_REGISTERED)
1335                 VERIFY(iptun_unregister(iptun) == 0);
1336 
1337         if (iptun->iptun_itp != NULL) {
1338                 /*
1339                  * Remove from the AVL tree, AND release the reference iptun_t
1340                  * itself holds on the ITP.
1341                  */
1342                 itp_unlink(iptun->iptun_itp, iptun->iptun_ns);
1343                 ITP_REFRELE(iptun->iptun_itp, iptun->iptun_ns);
1344                 iptun->iptun_itp = NULL;
1345                 iptun->iptun_flags &= ~IPTUN_SIMPLE_POLICY;
1346         }
1347 
1348         /*
1349          * After ipcl_conn_destroy(), there will be no threads executing an
1350          * upcall from ip (i.e., iptun_input()), and it is then safe to free
1351          * the iptun_t.
1352          */
1353         if (iptun->iptun_connp != NULL) {
1354                 iptun_conn_destroy(iptun->iptun_connp);
1355                 iptun->iptun_connp = NULL;
1356         }
1357 
1358         kmem_cache_free(iptun_cache, iptun);
1359         atomic_dec_32(&iptun_tunnelcount);
1360 }
1361 
1362 int
1363 iptun_create(iptun_kparams_t *ik, cred_t *credp)
1364 {
1365         iptun_t         *iptun = NULL;
1366         int             err = 0, mherr;
1367         char            linkname[MAXLINKNAMELEN];
1368         ipsec_tun_pol_t *itp;
1369         netstack_t      *ns = NULL;
1370         iptun_stack_t   *iptuns;
1371         datalink_id_t   tmpid;
1372         zoneid_t        zoneid = crgetzoneid(credp);
1373         boolean_t       link_created = B_FALSE;
1374 
1375         /* The tunnel type is mandatory */
1376         if (!(ik->iptun_kparam_flags & IPTUN_KPARAM_TYPE))
1377                 return (EINVAL);
1378 
1379         /*
1380          * Is the linkid that the caller wishes to associate with this new
1381          * tunnel assigned to this zone?
1382          */
1383         if (zone_check_datalink(&zoneid, ik->iptun_kparam_linkid) != 0) {
1384                 if (zoneid != GLOBAL_ZONEID)
1385                         return (EINVAL);
1386         } else if (zoneid == GLOBAL_ZONEID) {
1387                 return (EINVAL);
1388         }
1389 
1390         /*
1391          * Make sure that we're not trying to create a tunnel that has already
1392          * been created.
1393          */
1394         if (iptun_enter_by_linkid(ik->iptun_kparam_linkid, &iptun) == 0) {
1395                 iptun_exit(iptun);
1396                 iptun = NULL;
1397                 err = EEXIST;
1398                 goto done;
1399         }
1400 
1401         ns = netstack_find_by_cred(credp);
1402         iptuns = ns->netstack_iptun;
1403 
1404         if ((iptun = iptun_alloc()) == NULL) {
1405                 err = ENOMEM;
1406                 goto done;
1407         }
1408 
1409         iptun->iptun_linkid = ik->iptun_kparam_linkid;
1410         iptun->iptun_zoneid = zoneid;
1411         iptun->iptun_ns = ns;
1412 
1413         iptun->iptun_typeinfo = iptun_gettypeinfo(ik->iptun_kparam_type);
1414         if (iptun->iptun_typeinfo->iti_type == IPTUN_TYPE_UNKNOWN) {
1415                 err = EINVAL;
1416                 goto done;
1417         }
1418 
1419         if (ik->iptun_kparam_flags & IPTUN_KPARAM_IMPLICIT)
1420                 iptun->iptun_flags |= IPTUN_IMPLICIT;
1421 
1422         if ((err = iptun_setparams(iptun, ik)) != 0)
1423                 goto done;
1424 
1425         iptun->iptun_hoplimit = IPTUN_DEFAULT_HOPLIMIT;
1426         if (iptun->iptun_typeinfo->iti_type == IPTUN_TYPE_IPV6)
1427                 iptun->iptun_encaplimit = IPTUN_DEFAULT_ENCAPLIMIT;
1428 
1429         iptun_headergen(iptun, B_FALSE);
1430 
1431         iptun->iptun_connp = iptun_conn_create(iptun, ns, credp);
1432         if (iptun->iptun_connp == NULL) {
1433                 err = ENOMEM;
1434                 goto done;
1435         }
1436 
1437         iptun->iptun_mtu = iptun->iptun_typeinfo->iti_maxmtu;
1438         iptun->iptun_dpmtu = iptun->iptun_mtu;
1439 
1440         /*
1441          * Find an ITP based on linkname.  If we have parms already set via
1442          * the iptun_setparams() call above, it may have created an ITP for
1443          * us.  We always try get_tunnel_policy() for DEBUG correctness
1444          * checks, and we may wish to refactor this to only check when
1445          * iptun_itp is NULL.
1446          */
1447         if ((err = dls_mgmt_get_linkinfo(iptun->iptun_linkid, linkname, NULL,
1448             NULL, NULL)) != 0)
1449                 goto done;
1450         if ((itp = get_tunnel_policy(linkname, ns)) != NULL)
1451                 iptun->iptun_itp = itp;
1452 
1453         /*
1454          * See if we have the necessary IP addresses assigned to this tunnel
1455          * to try and bind them with ip underneath us.  If we're not ready to
1456          * bind yet, then we'll defer the bind operation until the addresses
1457          * are modified.
1458          */
1459         if (iptun_canbind(iptun) && ((err = iptun_bind(iptun)) != 0))
1460                 goto done;
1461 
1462         if ((err = iptun_register(iptun)) != 0)
1463                 goto done;
1464 
1465         err = dls_devnet_create(iptun->iptun_mh, iptun->iptun_linkid,
1466             iptun->iptun_zoneid);
1467         if (err != 0)
1468                 goto done;
1469         link_created = B_TRUE;
1470 
1471         /*
1472          * We hash by link-id as that is the key used by all other iptun
1473          * interfaces (modify, delete, etc.).
1474          */
1475         if ((mherr = mod_hash_insert(iptun_hash,
1476             IPTUN_HASH_KEY(iptun->iptun_linkid), (mod_hash_val_t)iptun)) == 0) {
1477                 mutex_enter(&iptuns->iptuns_lock);
1478                 list_insert_head(&iptuns->iptuns_iptunlist, iptun);
1479                 mutex_exit(&iptuns->iptuns_lock);
1480                 iptun->iptun_flags |= IPTUN_HASH_INSERTED;
1481         } else if (mherr == MH_ERR_NOMEM) {
1482                 err = ENOMEM;
1483         } else if (mherr == MH_ERR_DUPLICATE) {
1484                 err = EEXIST;
1485         } else {
1486                 err = EINVAL;
1487         }
1488 
1489 done:
1490         if (iptun == NULL && ns != NULL)
1491                 netstack_rele(ns);
1492         if (err != 0 && iptun != NULL) {
1493                 if (link_created) {
1494                         (void) dls_devnet_destroy(iptun->iptun_mh, &tmpid,
1495                             B_TRUE);
1496                 }
1497                 iptun->iptun_flags |= IPTUN_CONDEMNED;
1498                 iptun_free(iptun);
1499         }
1500         return (err);
1501 }
1502 
1503 int
1504 iptun_delete(datalink_id_t linkid, cred_t *credp)
1505 {
1506         int     err;
1507         iptun_t *iptun = NULL;
1508 
1509         if ((err = iptun_enter_by_linkid(linkid, &iptun)) != 0)
1510                 return (err);
1511 
1512         /* One cannot delete a tunnel that belongs to another zone. */
1513         if (iptun->iptun_zoneid != crgetzoneid(credp)) {
1514                 iptun_exit(iptun);
1515                 return (EACCES);
1516         }
1517 
1518         /*
1519          * We need to exit iptun in order to issue calls up the stack such as
1520          * dls_devnet_destroy().  If we call up while still in iptun, deadlock
1521          * with calls coming down the stack is possible.  We prevent other
1522          * threads from entering this iptun after we've exited it by setting
1523          * the IPTUN_DELETE_PENDING flag.  This will cause callers of
1524          * iptun_enter() to block waiting on iptun_enter_cv.  The assumption
1525          * here is that the functions we're calling while IPTUN_DELETE_PENDING
1526          * is set dont resuult in an iptun_enter() call, as that would result
1527          * in deadlock.
1528          */
1529         iptun->iptun_flags |= IPTUN_DELETE_PENDING;
1530 
1531         /* Wait for any pending upcall to the mac module to complete. */
1532         while (iptun->iptun_flags & IPTUN_UPCALL_PENDING)
1533                 cv_wait(&iptun->iptun_upcall_cv, &iptun->iptun_lock);
1534 
1535         iptun_exit(iptun);
1536 
1537         if ((err = dls_devnet_destroy(iptun->iptun_mh, &linkid, B_TRUE)) == 0) {
1538                 /*
1539                  * mac_disable() will fail with EBUSY if there are references
1540                  * to the iptun MAC.  If there are none, then mac_disable()
1541                  * will assure that none can be acquired until the MAC is
1542                  * unregistered.
1543                  *
1544                  * XXX CR 6791335 prevents us from calling mac_disable() prior
1545                  * to dls_devnet_destroy(), so we unfortunately need to
1546                  * attempt to re-create the devnet node if mac_disable()
1547                  * fails.
1548                  */
1549                 if ((err = mac_disable(iptun->iptun_mh)) != 0) {
1550                         (void) dls_devnet_create(iptun->iptun_mh, linkid,
1551                             iptun->iptun_zoneid);
1552                 }
1553         }
1554 
1555         /*
1556          * Now that we know the fate of this iptun_t, we need to clear
1557          * IPTUN_DELETE_PENDING, and set IPTUN_CONDEMNED if the iptun_t is
1558          * slated to be freed.  Either way, we need to signal the threads
1559          * waiting in iptun_enter() so that they can either fail if
1560          * IPTUN_CONDEMNED is set, or continue if it's not.
1561          */
1562         mutex_enter(&iptun->iptun_lock);
1563         iptun->iptun_flags &= ~IPTUN_DELETE_PENDING;
1564         if (err == 0)
1565                 iptun->iptun_flags |= IPTUN_CONDEMNED;
1566         cv_broadcast(&iptun->iptun_enter_cv);
1567         mutex_exit(&iptun->iptun_lock);
1568 
1569         /*
1570          * Note that there is no danger in calling iptun_free() after having
1571          * dropped the iptun_lock since callers of iptun_enter() at this point
1572          * are doing so from iptun_enter_by_linkid() (mac_disable() got rid of
1573          * threads entering from mac callbacks which call iptun_enter()
1574          * directly) which holds iptun_hash_lock, and iptun_free() grabs this
1575          * lock in order to remove the iptun_t from the hash table.
1576          */
1577         if (err == 0)
1578                 iptun_free(iptun);
1579 
1580         return (err);
1581 }
1582 
1583 int
1584 iptun_modify(const iptun_kparams_t *ik, cred_t *credp)
1585 {
1586         iptun_t         *iptun;
1587         boolean_t       laddr_change = B_FALSE, raddr_change = B_FALSE;
1588         int             err;
1589 
1590         if ((err = iptun_enter_by_linkid(ik->iptun_kparam_linkid, &iptun)) != 0)
1591                 return (err);
1592 
1593         /* One cannot modify a tunnel that belongs to another zone. */
1594         if (iptun->iptun_zoneid != crgetzoneid(credp)) {
1595                 err = EACCES;
1596                 goto done;
1597         }
1598 
1599         /* The tunnel type cannot be changed */
1600         if (ik->iptun_kparam_flags & IPTUN_KPARAM_TYPE) {
1601                 err = EINVAL;
1602                 goto done;
1603         }
1604 
1605         if ((err = iptun_setparams(iptun, ik)) != 0)
1606                 goto done;
1607         iptun_headergen(iptun, B_FALSE);
1608 
1609         /*
1610          * If any of the tunnel's addresses has been modified and the tunnel
1611          * has the necessary addresses assigned to it, we need to try to bind
1612          * with ip underneath us.  If we're not ready to bind yet, then we'll
1613          * try again when the addresses are modified later.
1614          */
1615         laddr_change = (ik->iptun_kparam_flags & IPTUN_KPARAM_LADDR);
1616         raddr_change = (ik->iptun_kparam_flags & IPTUN_KPARAM_RADDR);
1617         if (laddr_change || raddr_change) {
1618                 if (iptun->iptun_flags & IPTUN_BOUND)
1619                         iptun_unbind(iptun);
1620                 if (iptun_canbind(iptun) && (err = iptun_bind(iptun)) != 0) {
1621                         if (laddr_change)
1622                                 iptun->iptun_flags &= ~IPTUN_LADDR;
1623                         if (raddr_change)
1624                                 iptun->iptun_flags &= ~IPTUN_RADDR;
1625                         goto done;
1626                 }
1627         }
1628 
1629         if (laddr_change)
1630                 iptun_task_dispatch(iptun, IPTUN_TASK_LADDR_UPDATE);
1631         if (raddr_change)
1632                 iptun_task_dispatch(iptun, IPTUN_TASK_RADDR_UPDATE);
1633 
1634 done:
1635         iptun_exit(iptun);
1636         return (err);
1637 }
1638 
1639 /* Given an IP tunnel's datalink id, fill in its parameters. */
1640 int
1641 iptun_info(iptun_kparams_t *ik, cred_t *credp)
1642 {
1643         iptun_t *iptun;
1644         int     err;
1645 
1646         /* Is the tunnel link visible from the caller's zone? */
1647         if (!dls_devnet_islinkvisible(ik->iptun_kparam_linkid,
1648             crgetzoneid(credp)))
1649                 return (ENOENT);
1650 
1651         if ((err = iptun_enter_by_linkid(ik->iptun_kparam_linkid, &iptun)) != 0)
1652                 return (err);
1653 
1654         bzero(ik, sizeof (iptun_kparams_t));
1655 
1656         ik->iptun_kparam_linkid = iptun->iptun_linkid;
1657         ik->iptun_kparam_type = iptun->iptun_typeinfo->iti_type;
1658         ik->iptun_kparam_flags |= IPTUN_KPARAM_TYPE;
1659 
1660         if (iptun->iptun_flags & IPTUN_LADDR) {
1661                 iptun_getaddr(&iptun->iptun_laddr, &ik->iptun_kparam_laddr);
1662                 ik->iptun_kparam_flags |= IPTUN_KPARAM_LADDR;
1663         }
1664         if (iptun->iptun_flags & IPTUN_RADDR) {
1665                 iptun_getaddr(&iptun->iptun_raddr, &ik->iptun_kparam_raddr);
1666                 ik->iptun_kparam_flags |= IPTUN_KPARAM_RADDR;
1667         }
1668 
1669         if (iptun->iptun_flags & IPTUN_IMPLICIT)
1670                 ik->iptun_kparam_flags |= IPTUN_KPARAM_IMPLICIT;
1671 
1672         if (iptun->iptun_itp != NULL) {
1673                 mutex_enter(&iptun->iptun_itp->itp_lock);
1674                 if (iptun->iptun_itp->itp_flags & ITPF_P_ACTIVE) {
1675                         ik->iptun_kparam_flags |= IPTUN_KPARAM_IPSECPOL;
1676                         if (iptun->iptun_flags & IPTUN_SIMPLE_POLICY) {
1677                                 ik->iptun_kparam_flags |= IPTUN_KPARAM_SECINFO;
1678                                 ik->iptun_kparam_secinfo =
1679                                     iptun->iptun_simple_policy;
1680                         }
1681                 }
1682                 mutex_exit(&iptun->iptun_itp->itp_lock);
1683         }
1684 
1685 done:
1686         iptun_exit(iptun);
1687         return (err);
1688 }
1689 
1690 int
1691 iptun_set_6to4relay(netstack_t *ns, ipaddr_t relay_addr)
1692 {
1693         if (relay_addr == INADDR_BROADCAST || CLASSD(relay_addr))
1694                 return (EADDRNOTAVAIL);
1695         ns->netstack_iptun->iptuns_relay_rtr_addr = relay_addr;
1696         return (0);
1697 }
1698 
1699 void
1700 iptun_get_6to4relay(netstack_t *ns, ipaddr_t *relay_addr)
1701 {
1702         *relay_addr = ns->netstack_iptun->iptuns_relay_rtr_addr;
1703 }
1704 
1705 void
1706 iptun_set_policy(datalink_id_t linkid, ipsec_tun_pol_t *itp)
1707 {
1708         iptun_t *iptun;
1709 
1710         if (iptun_enter_by_linkid(linkid, &iptun) != 0)
1711                 return;
1712         if (iptun->iptun_itp != itp) {
1713                 ASSERT(iptun->iptun_itp == NULL);
1714                 ITP_REFHOLD(itp);
1715                 iptun->iptun_itp = itp;
1716         }
1717         /*
1718          * IPsec policy means IPsec overhead, which means lower MTU.
1719          * Refresh the MTU for this tunnel.
1720          */
1721         (void) iptun_update_mtu(iptun, NULL, 0);
1722         iptun_exit(iptun);
1723 }
1724 
1725 /*
1726  * Obtain the path MTU to the tunnel destination.
1727  * Can return zero in some cases.
1728  */
1729 static uint32_t
1730 iptun_get_dst_pmtu(iptun_t *iptun, ip_xmit_attr_t *ixa)
1731 {
1732         uint32_t        pmtu = 0;
1733         conn_t          *connp = iptun->iptun_connp;
1734         boolean_t       need_rele = B_FALSE;
1735 
1736         /*
1737          * We only obtain the pmtu for tunnels that have a remote tunnel
1738          * address.
1739          */
1740         if (!(iptun->iptun_flags & IPTUN_RADDR))
1741                 return (0);
1742 
1743         if (ixa == NULL) {
1744                 ixa = conn_get_ixa(connp, B_FALSE);
1745                 if (ixa == NULL)
1746                         return (0);
1747                 need_rele = B_TRUE;
1748         }
1749         /*
1750          * Guard against ICMP errors before we have sent, as well as against
1751          * and a thread which held conn_ixa.
1752          */
1753         if (ixa->ixa_ire != NULL) {
1754                 pmtu = ip_get_pmtu(ixa);
1755 
1756                 /*
1757                  * For both IPv4 and IPv6 we can have indication that the outer
1758                  * header needs fragmentation.
1759                  */
1760                 if (ixa->ixa_flags & IXAF_PMTU_TOO_SMALL) {
1761                         /* Must allow fragmentation in ip_output */
1762                         ixa->ixa_flags &= ~IXAF_DONTFRAG;
1763                 } else if (iptun->iptun_typeinfo->iti_type != IPTUN_TYPE_6TO4) {
1764                         ixa->ixa_flags |= IXAF_DONTFRAG;
1765                 } else {
1766                         /* ip_get_pmtu might have set this - we don't want it */
1767                         ixa->ixa_flags &= ~IXAF_PMTU_IPV4_DF;
1768                 }
1769         }
1770 
1771         if (need_rele)
1772                 ixa_refrele(ixa);
1773         return (pmtu);
1774 }
1775 
1776 /*
1777  * Update the ip_xmit_attr_t to capture the current lower path mtu as known
1778  * by ip.
1779  */
1780 static void
1781 iptun_update_dst_pmtu(iptun_t *iptun, ip_xmit_attr_t *ixa)
1782 {
1783         uint32_t        pmtu;
1784         conn_t          *connp = iptun->iptun_connp;
1785         boolean_t       need_rele = B_FALSE;
1786 
1787         /* IXAF_VERIFY_PMTU is not set if we don't have a fixed destination */
1788         if (!(iptun->iptun_flags & IPTUN_RADDR))
1789                 return;
1790 
1791         if (ixa == NULL) {
1792                 ixa = conn_get_ixa(connp, B_FALSE);
1793                 if (ixa == NULL)
1794                         return;
1795                 need_rele = B_TRUE;
1796         }
1797         /*
1798          * Guard against ICMP errors before we have sent, as well as against
1799          * and a thread which held conn_ixa.
1800          */
1801         if (ixa->ixa_ire != NULL) {
1802                 pmtu = ip_get_pmtu(ixa);
1803                 /*
1804                  * Update ixa_fragsize and ixa_pmtu.
1805                  */
1806                 ixa->ixa_fragsize = ixa->ixa_pmtu = pmtu;
1807 
1808                 /*
1809                  * For both IPv4 and IPv6 we can have indication that the outer
1810                  * header needs fragmentation.
1811                  */
1812                 if (ixa->ixa_flags & IXAF_PMTU_TOO_SMALL) {
1813                         /* Must allow fragmentation in ip_output */
1814                         ixa->ixa_flags &= ~IXAF_DONTFRAG;
1815                 } else if (iptun->iptun_typeinfo->iti_type != IPTUN_TYPE_6TO4) {
1816                         ixa->ixa_flags |= IXAF_DONTFRAG;
1817                 } else {
1818                         /* ip_get_pmtu might have set this - we don't want it */
1819                         ixa->ixa_flags &= ~IXAF_PMTU_IPV4_DF;
1820                 }
1821         }
1822 
1823         if (need_rele)
1824                 ixa_refrele(ixa);
1825 }
1826 
1827 /*
1828  * There is nothing that iptun can verify in addition to IP having
1829  * verified the IP addresses in the fanout.
1830  */
1831 /* ARGSUSED */
1832 static boolean_t
1833 iptun_verifyicmp(conn_t *connp, void *arg2, icmph_t *icmph, icmp6_t *icmp6,
1834     ip_recv_attr_t *ira)
1835 {
1836         return (B_TRUE);
1837 }
1838 
1839 /*
1840  * Notify function registered with ip_xmit_attr_t.
1841  */
1842 static void
1843 iptun_notify(void *arg, ip_xmit_attr_t *ixa, ixa_notify_type_t ntype,
1844     ixa_notify_arg_t narg)
1845 {
1846         iptun_t         *iptun = (iptun_t *)arg;
1847 
1848         switch (ntype) {
1849         case IXAN_PMTU:
1850                 (void) iptun_update_mtu(iptun, ixa, narg);
1851                 break;
1852         }
1853 }
1854 
1855 /*
1856  * Returns the max of old_ovhd and the overhead associated with pol.
1857  */
1858 static uint32_t
1859 iptun_max_policy_overhead(ipsec_policy_t *pol, uint32_t old_ovhd)
1860 {
1861         uint32_t new_ovhd = old_ovhd;
1862 
1863         while (pol != NULL) {
1864                 new_ovhd = max(new_ovhd,
1865                     ipsec_act_ovhd(&pol->ipsp_act->ipa_act));
1866                 pol = pol->ipsp_hash.hash_next;
1867         }
1868         return (new_ovhd);
1869 }
1870 
1871 static uint32_t
1872 iptun_get_ipsec_overhead(iptun_t *iptun)
1873 {
1874         ipsec_policy_root_t     *ipr;
1875         ipsec_policy_head_t     *iph;
1876         ipsec_policy_t          *pol;
1877         ipsec_selector_t        sel;
1878         int                     i;
1879         uint32_t                ipsec_ovhd = 0;
1880         ipsec_tun_pol_t         *itp = iptun->iptun_itp;
1881         netstack_t              *ns = iptun->iptun_ns;
1882 
1883         if (itp == NULL || !(itp->itp_flags & ITPF_P_ACTIVE)) {
1884                 /*
1885                  * Consult global policy, just in case.  This will only work
1886                  * if we have both source and destination addresses to work
1887                  * with.
1888                  */
1889                 if ((iptun->iptun_flags & (IPTUN_LADDR|IPTUN_RADDR)) !=
1890                     (IPTUN_LADDR|IPTUN_RADDR))
1891                         return (0);
1892 
1893                 iph = ipsec_system_policy(ns);
1894                 bzero(&sel, sizeof (sel));
1895                 sel.ips_isv4 =
1896                     (iptun->iptun_typeinfo->iti_ipvers == IPV4_VERSION);
1897                 switch (iptun->iptun_typeinfo->iti_ipvers) {
1898                 case IPV4_VERSION:
1899                         sel.ips_local_addr_v4 = iptun->iptun_laddr4;
1900                         sel.ips_remote_addr_v4 = iptun->iptun_raddr4;
1901                         break;
1902                 case IPV6_VERSION:
1903                         sel.ips_local_addr_v6 = iptun->iptun_laddr6;
1904                         sel.ips_remote_addr_v6 = iptun->iptun_raddr6;
1905                         break;
1906                 }
1907                 /* Check for both IPv4 and IPv6. */
1908                 sel.ips_protocol = IPPROTO_ENCAP;
1909                 pol = ipsec_find_policy_head(NULL, iph, IPSEC_TYPE_OUTBOUND,
1910                     &sel);
1911                 if (pol != NULL) {
1912                         ipsec_ovhd = ipsec_act_ovhd(&pol->ipsp_act->ipa_act);
1913                         IPPOL_REFRELE(pol);
1914                 }
1915                 sel.ips_protocol = IPPROTO_IPV6;
1916                 pol = ipsec_find_policy_head(NULL, iph, IPSEC_TYPE_OUTBOUND,
1917                     &sel);
1918                 if (pol != NULL) {
1919                         ipsec_ovhd = max(ipsec_ovhd,
1920                             ipsec_act_ovhd(&pol->ipsp_act->ipa_act));
1921                         IPPOL_REFRELE(pol);
1922                 }
1923                 IPPH_REFRELE(iph, ns);
1924         } else {
1925                 /*
1926                  * Look through all of the possible IPsec actions for the
1927                  * tunnel, and find the largest potential IPsec overhead.
1928                  */
1929                 iph = itp->itp_policy;
1930                 rw_enter(&iph->iph_lock, RW_READER);
1931                 ipr = &(iph->iph_root[IPSEC_TYPE_OUTBOUND]);
1932                 ipsec_ovhd = iptun_max_policy_overhead(
1933                     ipr->ipr_nonhash[IPSEC_AF_V4], 0);
1934                 ipsec_ovhd = iptun_max_policy_overhead(
1935                     ipr->ipr_nonhash[IPSEC_AF_V6], ipsec_ovhd);
1936                 for (i = 0; i < ipr->ipr_nchains; i++) {
1937                         ipsec_ovhd = iptun_max_policy_overhead(
1938                             ipr->ipr_hash[i].hash_head, ipsec_ovhd);
1939                 }
1940                 rw_exit(&iph->iph_lock);
1941         }
1942 
1943         return (ipsec_ovhd);
1944 }
1945 
1946 /*
1947  * Calculate and return the maximum possible upper MTU for the given tunnel.
1948  *
1949  * If new_pmtu is set then we also need to update the lower path MTU information
1950  * in the ip_xmit_attr_t. That is needed since we set IXAF_VERIFY_PMTU so that
1951  * we are notified by conn_ip_output() when the path MTU increases.
1952  */
1953 static uint32_t
1954 iptun_get_maxmtu(iptun_t *iptun, ip_xmit_attr_t *ixa, uint32_t new_pmtu)
1955 {
1956         size_t          header_size, ipsec_overhead;
1957         uint32_t        maxmtu, pmtu;
1958 
1959         /*
1960          * Start with the path-MTU to the remote address, which is either
1961          * provided as the new_pmtu argument, or obtained using
1962          * iptun_get_dst_pmtu().
1963          */
1964         if (new_pmtu != 0) {
1965                 if (iptun->iptun_flags & IPTUN_RADDR)
1966                         iptun->iptun_dpmtu = new_pmtu;
1967                 pmtu = new_pmtu;
1968         } else if (iptun->iptun_flags & IPTUN_RADDR) {
1969                 if ((pmtu = iptun_get_dst_pmtu(iptun, ixa)) == 0) {
1970                         /*
1971                          * We weren't able to obtain the path-MTU of the
1972                          * destination.  Use the previous value.
1973                          */
1974                         pmtu = iptun->iptun_dpmtu;
1975                 } else {
1976                         iptun->iptun_dpmtu = pmtu;
1977                 }
1978         } else {
1979                 /*
1980                  * We have no path-MTU information to go on, use the maximum
1981                  * possible value.
1982                  */
1983                 pmtu = iptun->iptun_typeinfo->iti_maxmtu;
1984         }
1985 
1986         /*
1987          * Now calculate tunneling overhead and subtract that from the
1988          * path-MTU information obtained above.
1989          */
1990         if (iptun->iptun_header_size != 0) {
1991                 header_size = iptun->iptun_header_size;
1992         } else {
1993                 switch (iptun->iptun_typeinfo->iti_ipvers) {
1994                 case IPV4_VERSION:
1995                         header_size = sizeof (ipha_t);
1996                         if (is_system_labeled())
1997                                 header_size += IP_MAX_OPT_LENGTH;
1998                         break;
1999                 case IPV6_VERSION:
2000                         header_size = sizeof (iptun_ipv6hdrs_t);
2001                         break;
2002                 }
2003         }
2004 
2005         ipsec_overhead = iptun_get_ipsec_overhead(iptun);
2006 
2007         maxmtu = pmtu - (header_size + ipsec_overhead);
2008         return (max(maxmtu, iptun->iptun_typeinfo->iti_minmtu));
2009 }
2010 
2011 /*
2012  * Re-calculate the tunnel's MTU as seen from above and notify the MAC layer
2013  * of any change in MTU.  The new_pmtu argument is the new lower path MTU to
2014  * the tunnel destination to be used in the tunnel MTU calculation.  Passing
2015  * in 0 for new_pmtu causes the lower path MTU to be dynamically updated using
2016  * ip_get_pmtu().
2017  *
2018  * If the calculated tunnel MTU is different than its previous value, then we
2019  * notify the MAC layer above us of this change using mac_maxsdu_update().
2020  */
2021 static uint32_t
2022 iptun_update_mtu(iptun_t *iptun, ip_xmit_attr_t *ixa, uint32_t new_pmtu)
2023 {
2024         uint32_t newmtu;
2025 
2026         /* We always update the ixa since we might have set IXAF_VERIFY_PMTU */
2027         iptun_update_dst_pmtu(iptun, ixa);
2028 
2029         /*
2030          * We return the current MTU without updating it if it was pegged to a
2031          * static value using the MAC_PROP_MTU link property.
2032          */
2033         if (iptun->iptun_flags & IPTUN_FIXED_MTU)
2034                 return (iptun->iptun_mtu);
2035 
2036         /* If the MTU isn't fixed, then use the maximum possible value. */
2037         newmtu = iptun_get_maxmtu(iptun, ixa, new_pmtu);
2038         /*
2039          * We only dynamically adjust the tunnel MTU for tunnels with
2040          * destinations because dynamic MTU calculations are based on the
2041          * destination path-MTU.
2042          */
2043         if ((iptun->iptun_flags & IPTUN_RADDR) && newmtu != iptun->iptun_mtu) {
2044                 iptun->iptun_mtu = newmtu;
2045                 if (iptun->iptun_flags & IPTUN_MAC_REGISTERED)
2046                         iptun_task_dispatch(iptun, IPTUN_TASK_MTU_UPDATE);
2047         }
2048 
2049         return (newmtu);
2050 }
2051 
2052 /*
2053  * Frees a packet or packet chain and bumps stat for each freed packet.
2054  */
2055 static void
2056 iptun_drop_pkt(mblk_t *mp, uint64_t *stat)
2057 {
2058         mblk_t *pktmp;
2059 
2060         for (pktmp = mp; pktmp != NULL; pktmp = mp) {
2061                 mp = mp->b_next;
2062                 pktmp->b_next = NULL;
2063                 if (stat != NULL)
2064                         atomic_inc_64(stat);
2065                 freemsg(pktmp);
2066         }
2067 }
2068 
2069 /*
2070  * Allocate and return a new mblk to hold an IP and ICMP header, and chain the
2071  * original packet to its b_cont.  Returns NULL on failure.
2072  */
2073 static mblk_t *
2074 iptun_build_icmperr(size_t hdrs_size, mblk_t *orig_pkt)
2075 {
2076         mblk_t *icmperr_mp;
2077 
2078         if ((icmperr_mp = allocb(hdrs_size, BPRI_MED)) != NULL) {
2079                 icmperr_mp->b_wptr += hdrs_size;
2080                 /* tack on the offending packet */
2081                 icmperr_mp->b_cont = orig_pkt;
2082         }
2083         return (icmperr_mp);
2084 }
2085 
2086 /*
2087  * Transmit an ICMP error.  mp->b_rptr points at the packet to be included in
2088  * the ICMP error.
2089  */
2090 static void
2091 iptun_sendicmp_v4(iptun_t *iptun, icmph_t *icmp, ipha_t *orig_ipha, mblk_t *mp,
2092     ts_label_t *tsl)
2093 {
2094         size_t  orig_pktsize, hdrs_size;
2095         mblk_t  *icmperr_mp;
2096         ipha_t  *new_ipha;
2097         icmph_t *new_icmp;
2098         ip_xmit_attr_t  ixas;
2099         conn_t  *connp = iptun->iptun_connp;
2100 
2101         orig_pktsize = msgdsize(mp);
2102         hdrs_size = sizeof (ipha_t) + sizeof (icmph_t);
2103         if ((icmperr_mp = iptun_build_icmperr(hdrs_size, mp)) == NULL) {
2104                 iptun_drop_pkt(mp, &iptun->iptun_noxmtbuf);
2105                 return;
2106         }
2107 
2108         new_ipha = (ipha_t *)icmperr_mp->b_rptr;
2109         new_icmp = (icmph_t *)(new_ipha + 1);
2110 
2111         new_ipha->ipha_version_and_hdr_length = IP_SIMPLE_HDR_VERSION;
2112         new_ipha->ipha_type_of_service = 0;
2113         new_ipha->ipha_ident = 0;
2114         new_ipha->ipha_fragment_offset_and_flags = 0;
2115         new_ipha->ipha_ttl = orig_ipha->ipha_ttl;
2116         new_ipha->ipha_protocol = IPPROTO_ICMP;
2117         new_ipha->ipha_src = orig_ipha->ipha_dst;
2118         new_ipha->ipha_dst = orig_ipha->ipha_src;
2119         new_ipha->ipha_hdr_checksum = 0; /* will be computed by ip */
2120         new_ipha->ipha_length = htons(hdrs_size + orig_pktsize);
2121 
2122         *new_icmp = *icmp;
2123         new_icmp->icmph_checksum = 0;
2124         new_icmp->icmph_checksum = IP_CSUM(icmperr_mp, sizeof (ipha_t), 0);
2125 
2126         bzero(&ixas, sizeof (ixas));
2127         ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
2128         if (new_ipha->ipha_src == INADDR_ANY) {
2129                 ixas.ixa_flags &= ~IXAF_VERIFY_SOURCE;
2130                 ixas.ixa_flags |= IXAF_SET_SOURCE;
2131         }
2132 
2133         ixas.ixa_zoneid = IPCL_ZONEID(connp);
2134         ixas.ixa_ipst = connp->conn_netstack->netstack_ip;
2135         ixas.ixa_cred = connp->conn_cred;
2136         ixas.ixa_cpid = NOPID;
2137         if (is_system_labeled())
2138                 ixas.ixa_tsl = tsl;
2139 
2140         ixas.ixa_ifindex = 0;
2141         ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
2142 
2143         (void) ip_output_simple(icmperr_mp, &ixas);
2144         ixa_cleanup(&ixas);
2145 }
2146 
2147 static void
2148 iptun_sendicmp_v6(iptun_t *iptun, icmp6_t *icmp6, ip6_t *orig_ip6h, mblk_t *mp,
2149     ts_label_t *tsl)
2150 {
2151         size_t  orig_pktsize, hdrs_size;
2152         mblk_t  *icmp6err_mp;
2153         ip6_t   *new_ip6h;
2154         icmp6_t *new_icmp6;
2155         ip_xmit_attr_t  ixas;
2156         conn_t  *connp = iptun->iptun_connp;
2157 
2158         orig_pktsize = msgdsize(mp);
2159         hdrs_size = sizeof (ip6_t) + sizeof (icmp6_t);
2160         if ((icmp6err_mp = iptun_build_icmperr(hdrs_size, mp)) == NULL) {
2161                 iptun_drop_pkt(mp, &iptun->iptun_noxmtbuf);
2162                 return;
2163         }
2164 
2165         new_ip6h = (ip6_t *)icmp6err_mp->b_rptr;
2166         new_icmp6 = (icmp6_t *)(new_ip6h + 1);
2167 
2168         new_ip6h->ip6_vcf = orig_ip6h->ip6_vcf;
2169         new_ip6h->ip6_plen = htons(sizeof (icmp6_t) + orig_pktsize);
2170         new_ip6h->ip6_hops = orig_ip6h->ip6_hops;
2171         new_ip6h->ip6_nxt = IPPROTO_ICMPV6;
2172         new_ip6h->ip6_src = orig_ip6h->ip6_dst;
2173         new_ip6h->ip6_dst = orig_ip6h->ip6_src;
2174 
2175         *new_icmp6 = *icmp6;
2176         /* The checksum is calculated in ip_output_simple and friends. */
2177         new_icmp6->icmp6_cksum = new_ip6h->ip6_plen;
2178 
2179         bzero(&ixas, sizeof (ixas));
2180         ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6;
2181         if (IN6_IS_ADDR_UNSPECIFIED(&new_ip6h->ip6_src)) {
2182                 ixas.ixa_flags &= ~IXAF_VERIFY_SOURCE;
2183                 ixas.ixa_flags |= IXAF_SET_SOURCE;
2184         }
2185 
2186         ixas.ixa_zoneid = IPCL_ZONEID(connp);
2187         ixas.ixa_ipst = connp->conn_netstack->netstack_ip;
2188         ixas.ixa_cred = connp->conn_cred;
2189         ixas.ixa_cpid = NOPID;
2190         if (is_system_labeled())
2191                 ixas.ixa_tsl = tsl;
2192 
2193         ixas.ixa_ifindex = 0;
2194         ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
2195 
2196         (void) ip_output_simple(icmp6err_mp, &ixas);
2197         ixa_cleanup(&ixas);
2198 }
2199 
2200 static void
2201 iptun_icmp_error_v4(iptun_t *iptun, ipha_t *orig_ipha, mblk_t *mp,
2202     uint8_t type, uint8_t code, ts_label_t *tsl)
2203 {
2204         icmph_t icmp;
2205 
2206         bzero(&icmp, sizeof (icmp));
2207         icmp.icmph_type = type;
2208         icmp.icmph_code = code;
2209 
2210         iptun_sendicmp_v4(iptun, &icmp, orig_ipha, mp, tsl);
2211 }
2212 
2213 static void
2214 iptun_icmp_fragneeded_v4(iptun_t *iptun, uint32_t newmtu, ipha_t *orig_ipha,
2215     mblk_t *mp, ts_label_t *tsl)
2216 {
2217         icmph_t icmp;
2218 
2219         icmp.icmph_type = ICMP_DEST_UNREACHABLE;
2220         icmp.icmph_code = ICMP_FRAGMENTATION_NEEDED;
2221         icmp.icmph_du_zero = 0;
2222         icmp.icmph_du_mtu = htons(newmtu);
2223 
2224         iptun_sendicmp_v4(iptun, &icmp, orig_ipha, mp, tsl);
2225 }
2226 
2227 static void
2228 iptun_icmp_error_v6(iptun_t *iptun, ip6_t *orig_ip6h, mblk_t *mp,
2229     uint8_t type, uint8_t code, uint32_t offset, ts_label_t *tsl)
2230 {
2231         icmp6_t icmp6;
2232 
2233         bzero(&icmp6, sizeof (icmp6));
2234         icmp6.icmp6_type = type;
2235         icmp6.icmp6_code = code;
2236         if (type == ICMP6_PARAM_PROB)
2237                 icmp6.icmp6_pptr = htonl(offset);
2238 
2239         iptun_sendicmp_v6(iptun, &icmp6, orig_ip6h, mp, tsl);
2240 }
2241 
2242 static void
2243 iptun_icmp_toobig_v6(iptun_t *iptun, uint32_t newmtu, ip6_t *orig_ip6h,
2244     mblk_t *mp, ts_label_t *tsl)
2245 {
2246         icmp6_t icmp6;
2247 
2248         icmp6.icmp6_type = ICMP6_PACKET_TOO_BIG;
2249         icmp6.icmp6_code = 0;
2250         icmp6.icmp6_mtu = htonl(newmtu);
2251 
2252         iptun_sendicmp_v6(iptun, &icmp6, orig_ip6h, mp, tsl);
2253 }
2254 
2255 /*
2256  * Determines if the packet pointed to by ipha or ip6h is an ICMP error.  The
2257  * mp argument is only used to do bounds checking.
2258  */
2259 static boolean_t
2260 is_icmp_error(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h)
2261 {
2262         uint16_t hlen;
2263 
2264         if (ipha != NULL) {
2265                 icmph_t *icmph;
2266 
2267                 ASSERT(ip6h == NULL);
2268                 if (ipha->ipha_protocol != IPPROTO_ICMP)
2269                         return (B_FALSE);
2270 
2271                 hlen = IPH_HDR_LENGTH(ipha);
2272                 icmph = (icmph_t *)((uint8_t *)ipha + hlen);
2273                 return (ICMP_IS_ERROR(icmph->icmph_type) ||
2274                     icmph->icmph_type == ICMP_REDIRECT);
2275         } else {
2276                 icmp6_t *icmp6;
2277                 uint8_t *nexthdrp;
2278 
2279                 ASSERT(ip6h != NULL);
2280                 if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &hlen, &nexthdrp) ||
2281                     *nexthdrp != IPPROTO_ICMPV6) {
2282                         return (B_FALSE);
2283                 }
2284 
2285                 icmp6 = (icmp6_t *)((uint8_t *)ip6h + hlen);
2286                 return (ICMP6_IS_ERROR(icmp6->icmp6_type) ||
2287                     icmp6->icmp6_type == ND_REDIRECT);
2288         }
2289 }
2290 
2291 /*
2292  * Find inner and outer IP headers from a tunneled packet as setup for calls
2293  * into ipsec_tun_{in,out}bound().
2294  * Note that we need to allow the outer header to be in a separate mblk from
2295  * the inner header.
2296  * If the caller knows the outer_hlen, the caller passes it in. Otherwise zero.
2297  */
2298 static size_t
2299 iptun_find_headers(mblk_t *mp, size_t outer_hlen, ipha_t **outer4,
2300     ipha_t **inner4, ip6_t **outer6, ip6_t **inner6)
2301 {
2302         ipha_t  *ipha;
2303         size_t  first_mblkl = MBLKL(mp);
2304         mblk_t  *inner_mp;
2305 
2306         /*
2307          * Don't bother handling packets that don't have a full IP header in
2308          * the fist mblk.  For the input path, the ip module ensures that this
2309          * won't happen, and on the output path, the IP tunneling MAC-type
2310          * plugins ensure that this also won't happen.
2311          */
2312         if (first_mblkl < sizeof (ipha_t))
2313                 return (0);
2314         ipha = (ipha_t *)(mp->b_rptr);
2315         switch (IPH_HDR_VERSION(ipha)) {
2316         case IPV4_VERSION:
2317                 *outer4 = ipha;
2318                 *outer6 = NULL;
2319                 if (outer_hlen == 0)
2320                         outer_hlen = IPH_HDR_LENGTH(ipha);
2321                 break;
2322         case IPV6_VERSION:
2323                 *outer4 = NULL;
2324                 *outer6 = (ip6_t *)ipha;
2325                 if (outer_hlen == 0)
2326                         outer_hlen = ip_hdr_length_v6(mp, (ip6_t *)ipha);
2327                 break;
2328         default:
2329                 return (0);
2330         }
2331 
2332         if (first_mblkl < outer_hlen ||
2333             (first_mblkl == outer_hlen && mp->b_cont == NULL))
2334                 return (0);
2335 
2336         /*
2337          * We don't bother doing a pullup here since the outer header will
2338          * just get stripped off soon on input anyway.  We just want to ensure
2339          * that the inner* pointer points to a full header.
2340          */
2341         if (first_mblkl == outer_hlen) {
2342                 inner_mp = mp->b_cont;
2343                 ipha = (ipha_t *)inner_mp->b_rptr;
2344         } else {
2345                 inner_mp = mp;
2346                 ipha = (ipha_t *)(mp->b_rptr + outer_hlen);
2347         }
2348         switch (IPH_HDR_VERSION(ipha)) {
2349         case IPV4_VERSION:
2350                 if (inner_mp->b_wptr - (uint8_t *)ipha < sizeof (ipha_t))
2351                         return (0);
2352                 *inner4 = ipha;
2353                 *inner6 = NULL;
2354                 break;
2355         case IPV6_VERSION:
2356                 if (inner_mp->b_wptr - (uint8_t *)ipha < sizeof (ip6_t))
2357                         return (0);
2358                 *inner4 = NULL;
2359                 *inner6 = (ip6_t *)ipha;
2360                 break;
2361         default:
2362                 return (0);
2363         }
2364 
2365         return (outer_hlen);
2366 }
2367 
2368 /*
2369  * Received ICMP error in response to an X over IPv4 packet that we
2370  * transmitted.
2371  *
2372  * NOTE: "outer" refers to what's inside the ICMP payload.  We will get one of
2373  * the following:
2374  *
2375  * [IPv4(0)][ICMPv4][IPv4(1)][IPv4(2)][ULP]
2376  *
2377  *      or
2378  *
2379  * [IPv4(0)][ICMPv4][IPv4(1)][IPv6][ULP]
2380  *
2381  * And "outer4" will get set to IPv4(1), and inner[46] will correspond to
2382  * whatever the very-inner packet is (IPv4(2) or IPv6).
2383  */
2384 static void
2385 iptun_input_icmp_v4(iptun_t *iptun, mblk_t *data_mp, icmph_t *icmph,
2386     ip_recv_attr_t *ira)
2387 {
2388         uint8_t *orig;
2389         ipha_t  *outer4, *inner4;
2390         ip6_t   *outer6, *inner6;
2391         int     outer_hlen;
2392         uint8_t type, code;
2393 
2394         ASSERT(data_mp->b_cont == NULL);
2395         /*
2396          * Temporarily move b_rptr forward so that iptun_find_headers() can
2397          * find headers in the ICMP packet payload.
2398          */
2399         orig = data_mp->b_rptr;
2400         data_mp->b_rptr = (uint8_t *)(icmph + 1);
2401         /*
2402          * The ip module ensures that ICMP errors contain at least the
2403          * original IP header (otherwise, the error would never have made it
2404          * here).
2405          */
2406         ASSERT(MBLKL(data_mp) >= 0);
2407         outer_hlen = iptun_find_headers(data_mp, 0, &outer4, &inner4, &outer6,
2408             &inner6);
2409         ASSERT(outer6 == NULL);
2410         data_mp->b_rptr = orig;
2411         if (outer_hlen == 0) {
2412                 iptun_drop_pkt(data_mp, &iptun->iptun_ierrors);
2413                 return;
2414         }
2415 
2416         /* Only ICMP errors due to tunneled packets should reach here. */
2417         ASSERT(outer4->ipha_protocol == IPPROTO_ENCAP ||
2418             outer4->ipha_protocol == IPPROTO_IPV6);
2419 
2420         data_mp = ipsec_tun_inbound(ira, data_mp, iptun->iptun_itp,
2421             inner4, inner6, outer4, outer6, -outer_hlen, iptun->iptun_ns);
2422         if (data_mp == NULL) {
2423                 /* Callee did all of the freeing. */
2424                 atomic_inc_64(&iptun->iptun_ierrors);
2425                 return;
2426         }
2427         /* We should never see reassembled fragment here. */
2428         ASSERT(data_mp->b_next == NULL);
2429 
2430         data_mp->b_rptr = (uint8_t *)outer4 + outer_hlen;
2431 
2432         /*
2433          * If the original packet being transmitted was itself an ICMP error,
2434          * then drop this packet.  We don't want to generate an ICMP error in
2435          * response to an ICMP error.
2436          */
2437         if (is_icmp_error(data_mp, inner4, inner6)) {
2438                 iptun_drop_pkt(data_mp, &iptun->iptun_norcvbuf);
2439                 return;
2440         }
2441 
2442         switch (icmph->icmph_type) {
2443         case ICMP_DEST_UNREACHABLE:
2444                 type = (inner4 != NULL ? icmph->icmph_type : ICMP6_DST_UNREACH);
2445                 switch (icmph->icmph_code) {
2446                 case ICMP_FRAGMENTATION_NEEDED: {
2447                         uint32_t newmtu;
2448 
2449                         /*
2450                          * We reconcile this with the fact that the tunnel may
2451                          * also have IPsec policy by letting iptun_update_mtu
2452                          * take care of it.
2453                          */
2454                         newmtu = iptun_update_mtu(iptun, NULL,
2455                             ntohs(icmph->icmph_du_mtu));
2456 
2457                         if (inner4 != NULL) {
2458                                 iptun_icmp_fragneeded_v4(iptun, newmtu, inner4,
2459                                     data_mp, ira->ira_tsl);
2460                         } else {
2461                                 iptun_icmp_toobig_v6(iptun, newmtu, inner6,
2462                                     data_mp, ira->ira_tsl);
2463                         }
2464                         return;
2465                 }
2466                 case ICMP_DEST_NET_UNREACH_ADMIN:
2467                 case ICMP_DEST_HOST_UNREACH_ADMIN:
2468                         code = (inner4 != NULL ? ICMP_DEST_NET_UNREACH_ADMIN :
2469                             ICMP6_DST_UNREACH_ADMIN);
2470                         break;
2471                 default:
2472                         code = (inner4 != NULL ? ICMP_HOST_UNREACHABLE :
2473                             ICMP6_DST_UNREACH_ADDR);
2474                         break;
2475                 }
2476                 break;
2477         case ICMP_TIME_EXCEEDED:
2478                 if (inner6 != NULL) {
2479                         type = ICMP6_TIME_EXCEEDED;
2480                         code = 0;
2481                 } /* else we're already set. */
2482                 break;
2483         case ICMP_PARAM_PROBLEM:
2484                 /*
2485                  * This is a problem with the outer header we transmitted.
2486                  * Treat this as an output error.
2487                  */
2488                 iptun_drop_pkt(data_mp, &iptun->iptun_oerrors);
2489                 return;
2490         default:
2491                 iptun_drop_pkt(data_mp, &iptun->iptun_norcvbuf);
2492                 return;
2493         }
2494 
2495         if (inner4 != NULL) {
2496                 iptun_icmp_error_v4(iptun, inner4, data_mp, type, code,
2497                     ira->ira_tsl);
2498         } else {
2499                 iptun_icmp_error_v6(iptun, inner6, data_mp, type, code, 0,
2500                     ira->ira_tsl);
2501         }
2502 }
2503 
2504 /*
2505  * Return B_TRUE if the IPv6 packet pointed to by ip6h contains a Tunnel
2506  * Encapsulation Limit destination option.  If there is one, set encaplim_ptr
2507  * to point to the option value.
2508  */
2509 static boolean_t
2510 iptun_find_encaplimit(mblk_t *mp, ip6_t *ip6h, uint8_t **encaplim_ptr)
2511 {
2512         ip_pkt_t        pkt;
2513         uint8_t         *endptr;
2514         ip6_dest_t      *destp;
2515         struct ip6_opt  *optp;
2516 
2517         pkt.ipp_fields = 0; /* must be initialized */
2518         (void) ip_find_hdr_v6(mp, ip6h, B_FALSE, &pkt, NULL);
2519         if ((pkt.ipp_fields & IPPF_DSTOPTS) != 0) {
2520                 destp = pkt.ipp_dstopts;
2521         } else if ((pkt.ipp_fields & IPPF_RTHDRDSTOPTS) != 0) {
2522                 destp = pkt.ipp_rthdrdstopts;
2523         } else {
2524                 return (B_FALSE);
2525         }
2526 
2527         endptr = (uint8_t *)destp + 8 * (destp->ip6d_len + 1);
2528         optp = (struct ip6_opt *)(destp + 1);
2529         while (endptr - (uint8_t *)optp > sizeof (*optp)) {
2530                 if (optp->ip6o_type == IP6OPT_TUNNEL_LIMIT) {
2531                         if ((uint8_t *)(optp + 1) >= endptr)
2532                                 return (B_FALSE);
2533                         *encaplim_ptr = (uint8_t *)&optp[1];
2534                         return (B_TRUE);
2535                 }
2536                 optp = (struct ip6_opt *)((uint8_t *)optp + optp->ip6o_len + 2);
2537         }
2538         return (B_FALSE);
2539 }
2540 
2541 /*
2542  * Received ICMPv6 error in response to an X over IPv6 packet that we
2543  * transmitted.
2544  *
2545  * NOTE: "outer" refers to what's inside the ICMP payload.  We will get one of
2546  * the following:
2547  *
2548  * [IPv6(0)][ICMPv6][IPv6(1)][IPv4][ULP]
2549  *
2550  *      or
2551  *
2552  * [IPv6(0)][ICMPv6][IPv6(1)][IPv6(2)][ULP]
2553  *
2554  * And "outer6" will get set to IPv6(1), and inner[46] will correspond to
2555  * whatever the very-inner packet is (IPv4 or IPv6(2)).
2556  */
2557 static void
2558 iptun_input_icmp_v6(iptun_t *iptun, mblk_t *data_mp, icmp6_t *icmp6h,
2559     ip_recv_attr_t *ira)
2560 {
2561         uint8_t *orig;
2562         ipha_t  *outer4, *inner4;
2563         ip6_t   *outer6, *inner6;
2564         int     outer_hlen;
2565         uint8_t type, code;
2566 
2567         ASSERT(data_mp->b_cont == NULL);
2568 
2569         /*
2570          * Temporarily move b_rptr forward so that iptun_find_headers() can
2571          * find IP headers in the ICMP packet payload.
2572          */
2573         orig = data_mp->b_rptr;
2574         data_mp->b_rptr = (uint8_t *)(icmp6h + 1);
2575         /*
2576          * The ip module ensures that ICMP errors contain at least the
2577          * original IP header (otherwise, the error would never have made it
2578          * here).
2579          */
2580         ASSERT(MBLKL(data_mp) >= 0);
2581         outer_hlen = iptun_find_headers(data_mp, 0, &outer4, &inner4, &outer6,
2582             &inner6);
2583         ASSERT(outer4 == NULL);
2584         data_mp->b_rptr = orig;      /* Restore r_ptr */
2585         if (outer_hlen == 0) {
2586                 iptun_drop_pkt(data_mp, &iptun->iptun_ierrors);
2587                 return;
2588         }
2589 
2590         data_mp = ipsec_tun_inbound(ira, data_mp, iptun->iptun_itp,
2591             inner4, inner6, outer4, outer6, -outer_hlen, iptun->iptun_ns);
2592         if (data_mp == NULL) {
2593                 /* Callee did all of the freeing. */
2594                 atomic_inc_64(&iptun->iptun_ierrors);
2595                 return;
2596         }
2597         /* We should never see reassembled fragment here. */
2598         ASSERT(data_mp->b_next == NULL);
2599 
2600         data_mp->b_rptr = (uint8_t *)outer6 + outer_hlen;
2601 
2602         /*
2603          * If the original packet being transmitted was itself an ICMP error,
2604          * then drop this packet.  We don't want to generate an ICMP error in
2605          * response to an ICMP error.
2606          */
2607         if (is_icmp_error(data_mp, inner4, inner6)) {
2608                 iptun_drop_pkt(data_mp, &iptun->iptun_norcvbuf);
2609                 return;
2610         }
2611 
2612         switch (icmp6h->icmp6_type) {
2613         case ICMP6_PARAM_PROB: {
2614                 uint8_t *encaplim_ptr;
2615 
2616                 /*
2617                  * If the ICMPv6 error points to a valid Tunnel Encapsulation
2618                  * Limit option and the limit value is 0, then fall through
2619                  * and send a host unreachable message.  Otherwise, treat the
2620                  * error as an output error, as there must have been a problem
2621                  * with a packet we sent.
2622                  */
2623                 if (!iptun_find_encaplimit(data_mp, outer6, &encaplim_ptr) ||
2624                     (icmp6h->icmp6_pptr !=
2625                     ((ptrdiff_t)encaplim_ptr - (ptrdiff_t)outer6)) ||
2626                     *encaplim_ptr != 0) {
2627                         iptun_drop_pkt(data_mp, &iptun->iptun_oerrors);
2628                         return;
2629                 }
2630                 /* FALLTHRU */
2631         }
2632         case ICMP6_TIME_EXCEEDED:
2633         case ICMP6_DST_UNREACH:
2634                 type = (inner4 != NULL ? ICMP_DEST_UNREACHABLE :
2635                     ICMP6_DST_UNREACH);
2636                 code = (inner4 != NULL ? ICMP_HOST_UNREACHABLE :
2637                     ICMP6_DST_UNREACH_ADDR);
2638                 break;
2639         case ICMP6_PACKET_TOO_BIG: {
2640                 uint32_t newmtu;
2641 
2642                 /*
2643                  * We reconcile this with the fact that the tunnel may also
2644                  * have IPsec policy by letting iptun_update_mtu take care of
2645                  * it.
2646                  */
2647                 newmtu = iptun_update_mtu(iptun, NULL,
2648                     ntohl(icmp6h->icmp6_mtu));
2649 
2650                 if (inner4 != NULL) {
2651                         iptun_icmp_fragneeded_v4(iptun, newmtu, inner4,
2652                             data_mp, ira->ira_tsl);
2653                 } else {
2654                         iptun_icmp_toobig_v6(iptun, newmtu, inner6, data_mp,
2655                             ira->ira_tsl);
2656                 }
2657                 return;
2658         }
2659         default:
2660                 iptun_drop_pkt(data_mp, &iptun->iptun_norcvbuf);
2661                 return;
2662         }
2663 
2664         if (inner4 != NULL) {
2665                 iptun_icmp_error_v4(iptun, inner4, data_mp, type, code,
2666                     ira->ira_tsl);
2667         } else {
2668                 iptun_icmp_error_v6(iptun, inner6, data_mp, type, code, 0,
2669                     ira->ira_tsl);
2670         }
2671 }
2672 
2673 /*
2674  * Called as conn_recvicmp from IP for ICMP errors.
2675  */
2676 /* ARGSUSED2 */
2677 static void
2678 iptun_input_icmp(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
2679 {
2680         conn_t          *connp = arg;
2681         iptun_t         *iptun = connp->conn_iptun;
2682         mblk_t          *tmpmp;
2683         size_t          hlen;
2684 
2685         ASSERT(IPCL_IS_IPTUN(connp));
2686 
2687         if (mp->b_cont != NULL) {
2688                 /*
2689                  * Since ICMP error processing necessitates access to bits
2690                  * that are within the ICMP error payload (the original packet
2691                  * that caused the error), pull everything up into a single
2692                  * block for convenience.
2693                  */
2694                 if ((tmpmp = msgpullup(mp, -1)) == NULL) {
2695                         iptun_drop_pkt(mp, &iptun->iptun_norcvbuf);
2696                         return;
2697                 }
2698                 freemsg(mp);
2699                 mp = tmpmp;
2700         }
2701 
2702         hlen = ira->ira_ip_hdr_length;
2703         switch (iptun->iptun_typeinfo->iti_ipvers) {
2704         case IPV4_VERSION:
2705                 /*
2706                  * The outer IP header coming up from IP is always ipha_t
2707                  * alligned (otherwise, we would have crashed in ip).
2708                  */
2709                 iptun_input_icmp_v4(iptun, mp, (icmph_t *)(mp->b_rptr + hlen),
2710                     ira);
2711                 break;
2712         case IPV6_VERSION:
2713                 iptun_input_icmp_v6(iptun, mp, (icmp6_t *)(mp->b_rptr + hlen),
2714                     ira);
2715                 break;
2716         }
2717 }
2718 
2719 static boolean_t
2720 iptun_in_6to4_ok(iptun_t *iptun, ipha_t *outer4, ip6_t *inner6)
2721 {
2722         ipaddr_t v4addr;
2723 
2724         /*
2725          * It's possible that someone sent us an IPv4-in-IPv4 packet with the
2726          * IPv4 address of a 6to4 tunnel as the destination.
2727          */
2728         if (inner6 == NULL)
2729                 return (B_FALSE);
2730 
2731         /*
2732          * Make sure that the IPv6 destination is within the site that this
2733          * 6to4 tunnel is routing for.  We don't want people bouncing random
2734          * tunneled IPv6 packets through this 6to4 router.
2735          */
2736         IN6_6TO4_TO_V4ADDR(&inner6->ip6_dst, (struct in_addr *)&v4addr);
2737         if (outer4->ipha_dst != v4addr)
2738                 return (B_FALSE);
2739 
2740         if (IN6_IS_ADDR_6TO4(&inner6->ip6_src)) {
2741                 /*
2742                  * Section 9 of RFC 3056 (security considerations) suggests
2743                  * that when a packet is from a 6to4 site (i.e., it's not a
2744                  * global address being forwarded froma relay router), make
2745                  * sure that the packet was tunneled by that site's 6to4
2746                  * router.
2747                  */
2748                 IN6_6TO4_TO_V4ADDR(&inner6->ip6_src, (struct in_addr *)&v4addr);
2749                 if (outer4->ipha_src != v4addr)
2750                         return (B_FALSE);
2751         } else {
2752                 /*
2753                  * Only accept packets from a relay router if we've configured
2754                  * outbound relay router functionality.
2755                  */
2756                 if (iptun->iptun_iptuns->iptuns_relay_rtr_addr == INADDR_ANY)
2757                         return (B_FALSE);
2758         }
2759 
2760         return (B_TRUE);
2761 }
2762 
2763 /*
2764  * Input function for everything that comes up from the ip module below us.
2765  * This is called directly from the ip module via connp->conn_recv().
2766  *
2767  * We receive M_DATA messages with IP-in-IP tunneled packets.
2768  */
2769 /* ARGSUSED2 */
2770 static void
2771 iptun_input(void *arg, mblk_t *data_mp, void *arg2, ip_recv_attr_t *ira)
2772 {
2773         conn_t  *connp = arg;
2774         iptun_t *iptun = connp->conn_iptun;
2775         int     outer_hlen;
2776         ipha_t  *outer4, *inner4;
2777         ip6_t   *outer6, *inner6;
2778 
2779         ASSERT(IPCL_IS_IPTUN(connp));
2780         ASSERT(DB_TYPE(data_mp) == M_DATA);
2781 
2782         outer_hlen = iptun_find_headers(data_mp, ira->ira_ip_hdr_length,
2783             &outer4, &inner4, &outer6, &inner6);
2784         if (outer_hlen == 0)
2785                 goto drop;
2786 
2787         /*
2788          * If the system is labeled, we call tsol_check_dest() on the packet
2789          * destination (our local tunnel address) to ensure that the packet as
2790          * labeled should be allowed to be sent to us.  We don't need to call
2791          * the more involved tsol_receive_local() since the tunnel link itself
2792          * cannot be assigned to shared-stack non-global zones.
2793          */
2794         if (ira->ira_flags & IRAF_SYSTEM_LABELED) {
2795                 if (ira->ira_tsl == NULL)
2796                         goto drop;
2797                 if (tsol_check_dest(ira->ira_tsl, (outer4 != NULL ?
2798                     (void *)&outer4->ipha_dst : (void *)&outer6->ip6_dst),
2799                     (outer4 != NULL ? IPV4_VERSION : IPV6_VERSION),
2800                     CONN_MAC_DEFAULT, B_FALSE, NULL) != 0)
2801                         goto drop;
2802         }
2803 
2804         data_mp = ipsec_tun_inbound(ira, data_mp, iptun->iptun_itp,
2805             inner4, inner6, outer4, outer6, outer_hlen, iptun->iptun_ns);
2806         if (data_mp == NULL) {
2807                 /* Callee did all of the freeing. */
2808                 return;
2809         }
2810 
2811         if (iptun->iptun_typeinfo->iti_type == IPTUN_TYPE_6TO4 &&
2812             !iptun_in_6to4_ok(iptun, outer4, inner6))
2813                 goto drop;
2814 
2815         /*
2816          * We need to statistically account for each packet individually, so
2817          * we might as well split up any b_next chains here.
2818          */
2819         do {
2820                 mblk_t  *mp;
2821 
2822                 mp = data_mp->b_next;
2823                 data_mp->b_next = NULL;
2824 
2825                 atomic_inc_64(&iptun->iptun_ipackets);
2826                 atomic_add_64(&iptun->iptun_rbytes, msgdsize(data_mp));
2827                 mac_rx(iptun->iptun_mh, NULL, data_mp);
2828 
2829                 data_mp = mp;
2830         } while (data_mp != NULL);
2831         return;
2832 drop:
2833         iptun_drop_pkt(data_mp, &iptun->iptun_ierrors);
2834 }
2835 
2836 /*
2837  * Do 6to4-specific header-processing on output.  Return B_TRUE if the packet
2838  * was processed without issue, or B_FALSE if the packet had issues and should
2839  * be dropped.
2840  */
2841 static boolean_t
2842 iptun_out_process_6to4(iptun_t *iptun, ipha_t *outer4, ip6_t *inner6)
2843 {
2844         ipaddr_t v4addr;
2845 
2846         /*
2847          * IPv6 source must be a 6to4 address.  This is because a conscious
2848          * decision was made to not allow a Solaris system to be used as a
2849          * relay router (for security reasons) when 6to4 was initially
2850          * integrated.  If this decision is ever reversed, the following check
2851          * can be removed.
2852          */
2853         if (!IN6_IS_ADDR_6TO4(&inner6->ip6_src))
2854                 return (B_FALSE);
2855 
2856         /*
2857          * RFC3056 mandates that the IPv4 source MUST be set to the IPv4
2858          * portion of the 6to4 IPv6 source address.  In other words, make sure
2859          * that we're tunneling packets from our own 6to4 site.
2860          */
2861         IN6_6TO4_TO_V4ADDR(&inner6->ip6_src, (struct in_addr *)&v4addr);
2862         if (outer4->ipha_src != v4addr)
2863                 return (B_FALSE);
2864 
2865         /*
2866          * Automatically set the destination of the outer IPv4 header as
2867          * described in RFC3056.  There are two possibilities:
2868          *
2869          * a. If the IPv6 destination is a 6to4 address, set the IPv4 address
2870          *    to the IPv4 portion of the 6to4 address.
2871          * b. If the IPv6 destination is a native IPv6 address, set the IPv4
2872          *    destination to the address of a relay router.
2873          *
2874          * Design Note: b shouldn't be necessary here, and this is a flaw in
2875          * the design of the 6to4relay command.  Instead of setting a 6to4
2876          * relay address in this module via an ioctl, the 6to4relay command
2877          * could simply add a IPv6 route for native IPv6 addresses (such as a
2878          * default route) in the forwarding table that uses a 6to4 destination
2879          * as its next hop, and the IPv4 portion of that address could be a
2880          * 6to4 relay address.  In order for this to work, IP would have to
2881          * resolve the next hop address, which would necessitate a link-layer
2882          * address resolver for 6to4 links, which doesn't exist today.
2883          *
2884          * In fact, if a resolver existed for 6to4 links, then setting the
2885          * IPv4 destination in the outer header could be done as part of
2886          * link-layer address resolution and fast-path header generation, and
2887          * not here.
2888          */
2889         if (IN6_IS_ADDR_6TO4(&inner6->ip6_dst)) {
2890                 /* destination is a 6to4 router */
2891                 IN6_6TO4_TO_V4ADDR(&inner6->ip6_dst,
2892                     (struct in_addr *)&outer4->ipha_dst);
2893 
2894                 /* Reject attempts to send to INADDR_ANY */
2895                 if (outer4->ipha_dst == INADDR_ANY)
2896                         return (B_FALSE);
2897         } else {
2898                 /*
2899                  * The destination is a native IPv6 address.  If output to a
2900                  * relay-router is enabled, use the relay-router's IPv4
2901                  * address as the destination.
2902                  */
2903                 if (iptun->iptun_iptuns->iptuns_relay_rtr_addr == INADDR_ANY)
2904                         return (B_FALSE);
2905                 outer4->ipha_dst = iptun->iptun_iptuns->iptuns_relay_rtr_addr;
2906         }
2907 
2908         /*
2909          * If the outer source and destination are equal, this means that the
2910          * 6to4 router somehow forwarded an IPv6 packet destined for its own
2911          * 6to4 site to its 6to4 tunnel interface, which will result in this
2912          * packet infinitely bouncing between ip and iptun.
2913          */
2914         return (outer4->ipha_src != outer4->ipha_dst);
2915 }
2916 
2917 /*
2918  * Process output packets with outer IPv4 headers.  Frees mp and bumps stat on
2919  * error.
2920  */
2921 static mblk_t *
2922 iptun_out_process_ipv4(iptun_t *iptun, mblk_t *mp, ipha_t *outer4,
2923     ipha_t *inner4, ip6_t *inner6, ip_xmit_attr_t *ixa)
2924 {
2925         uint8_t *innerptr = (inner4 != NULL ?
2926             (uint8_t *)inner4 : (uint8_t *)inner6);
2927         size_t  minmtu = iptun->iptun_typeinfo->iti_minmtu;
2928 
2929         if (inner4 != NULL) {
2930                 ASSERT(outer4->ipha_protocol == IPPROTO_ENCAP);
2931                 /*
2932                  * Copy the tos from the inner IPv4 header. We mask off ECN
2933                  * bits (bits 6 and 7) because there is currently no
2934                  * tunnel-tunnel communication to determine if both sides
2935                  * support ECN.  We opt for the safe choice: don't copy the
2936                  * ECN bits when doing encapsulation.
2937                  */
2938                 outer4->ipha_type_of_service =
2939                     inner4->ipha_type_of_service & ~0x03;
2940         } else {
2941                 ASSERT(outer4->ipha_protocol == IPPROTO_IPV6 &&
2942                     inner6 != NULL);
2943         }
2944         if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF)
2945                 outer4->ipha_fragment_offset_and_flags |= IPH_DF_HTONS;
2946         else
2947                 outer4->ipha_fragment_offset_and_flags &= ~IPH_DF_HTONS;
2948 
2949         /*
2950          * As described in section 3.2.2 of RFC4213, if the packet payload is
2951          * less than or equal to the minimum MTU size, then we need to allow
2952          * IPv4 to fragment the packet.  The reason is that even if we end up
2953          * receiving an ICMP frag-needed, the interface above this tunnel
2954          * won't be allowed to drop its MTU as a result, since the packet was
2955          * already smaller than the smallest allowable MTU for that interface.
2956          */
2957         if (mp->b_wptr - innerptr <= minmtu) {
2958                 outer4->ipha_fragment_offset_and_flags = 0;
2959                 ixa->ixa_flags &= ~IXAF_DONTFRAG;
2960         } else if (!(ixa->ixa_flags & IXAF_PMTU_TOO_SMALL) &&
2961             (iptun->iptun_typeinfo->iti_type != IPTUN_TYPE_6TO4)) {
2962                 ixa->ixa_flags |= IXAF_DONTFRAG;
2963         }
2964 
2965         ixa->ixa_ip_hdr_length = IPH_HDR_LENGTH(outer4);
2966         ixa->ixa_pktlen = msgdsize(mp);
2967         ixa->ixa_protocol = outer4->ipha_protocol;
2968 
2969         outer4->ipha_length = htons(ixa->ixa_pktlen);
2970         return (mp);
2971 }
2972 
2973 /*
2974  * Insert an encapsulation limit destination option in the packet provided.
2975  * Always consumes the mp argument and returns a new mblk pointer.
2976  */
2977 static mblk_t *
2978 iptun_insert_encaplimit(iptun_t *iptun, mblk_t *mp, ip6_t *outer6,
2979     uint8_t limit)
2980 {
2981         mblk_t                  *newmp;
2982         iptun_ipv6hdrs_t        *newouter6;
2983 
2984         ASSERT(outer6->ip6_nxt == IPPROTO_IPV6);
2985         ASSERT(mp->b_cont == NULL);
2986 
2987         mp->b_rptr += sizeof (ip6_t);
2988         newmp = allocb(sizeof (iptun_ipv6hdrs_t) + MBLKL(mp), BPRI_MED);
2989         if (newmp == NULL) {
2990                 iptun_drop_pkt(mp, &iptun->iptun_noxmtbuf);
2991                 return (NULL);
2992         }
2993         newmp->b_wptr += sizeof (iptun_ipv6hdrs_t);
2994         /* Copy the payload (Starting with the inner IPv6 header). */
2995         bcopy(mp->b_rptr, newmp->b_wptr, MBLKL(mp));
2996         newmp->b_wptr += MBLKL(mp);
2997         newouter6 = (iptun_ipv6hdrs_t *)newmp->b_rptr;
2998         /* Now copy the outer IPv6 header. */
2999         bcopy(outer6, &newouter6->it6h_ip6h, sizeof (ip6_t));
3000         newouter6->it6h_ip6h.ip6_nxt = IPPROTO_DSTOPTS;
3001         newouter6->it6h_encaplim = iptun_encaplim_init;
3002         newouter6->it6h_encaplim.iel_destopt.ip6d_nxt = outer6->ip6_nxt;
3003         newouter6->it6h_encaplim.iel_telopt.ip6ot_encap_limit = limit;
3004 
3005         /*
3006          * The payload length will be set at the end of
3007          * iptun_out_process_ipv6().
3008          */
3009 
3010         freemsg(mp);
3011         return (newmp);
3012 }
3013 
3014 /*
3015  * Process output packets with outer IPv6 headers.  Frees mp and bumps stats
3016  * on error.
3017  */
3018 static mblk_t *
3019 iptun_out_process_ipv6(iptun_t *iptun, mblk_t *mp, ip6_t *outer6,
3020     ipha_t *inner4, ip6_t *inner6, ip_xmit_attr_t *ixa)
3021 {
3022         uint8_t         *innerptr = (inner4 != NULL ?
3023             (uint8_t *)inner4 : (uint8_t *)inner6);
3024         size_t          minmtu = iptun->iptun_typeinfo->iti_minmtu;
3025         uint8_t         *limit, *configlimit;
3026         uint32_t        offset;
3027         iptun_ipv6hdrs_t *v6hdrs;
3028 
3029         if (inner6 != NULL && iptun_find_encaplimit(mp, inner6, &limit)) {
3030                 /*
3031                  * The inner packet is an IPv6 packet which itself contains an
3032                  * encapsulation limit option.  The limit variable points to
3033                  * the value in the embedded option.  Process the
3034                  * encapsulation limit option as specified in RFC 2473.
3035                  *
3036                  * If limit is 0, then we've exceeded the limit and we need to
3037                  * send back an ICMPv6 parameter problem message.
3038                  *
3039                  * If limit is > 0, then we decrement it by 1 and make sure
3040                  * that the encapsulation limit option in the outer header
3041                  * reflects that (adding an option if one isn't already
3042                  * there).
3043                  */
3044                 ASSERT(limit > mp->b_rptr && limit < mp->b_wptr);
3045                 if (*limit == 0) {
3046                         mp->b_rptr = (uint8_t *)inner6;
3047                         offset = limit - mp->b_rptr;
3048                         iptun_icmp_error_v6(iptun, inner6, mp, ICMP6_PARAM_PROB,
3049                             0, offset, ixa->ixa_tsl);
3050                         atomic_inc_64(&iptun->iptun_noxmtbuf);
3051                         return (NULL);
3052                 }
3053 
3054                 /*
3055                  * The outer header requires an encapsulation limit option.
3056                  * If there isn't one already, add one.
3057                  */
3058                 if (iptun->iptun_encaplimit == 0) {
3059                         if ((mp = iptun_insert_encaplimit(iptun, mp, outer6,
3060                             (*limit - 1))) == NULL)
3061                                 return (NULL);
3062                         v6hdrs = (iptun_ipv6hdrs_t *)mp->b_rptr;
3063                 } else {
3064                         /*
3065                          * There is an existing encapsulation limit option in
3066                          * the outer header.  If the inner encapsulation limit
3067                          * is less than the configured encapsulation limit,
3068                          * update the outer encapsulation limit to reflect
3069                          * this lesser value.
3070                          */
3071                         v6hdrs = (iptun_ipv6hdrs_t *)mp->b_rptr;
3072                         configlimit =
3073                             &v6hdrs->it6h_encaplim.iel_telopt.ip6ot_encap_limit;
3074                         if ((*limit - 1) < *configlimit)
3075                                 *configlimit = (*limit - 1);
3076                 }
3077                 ixa->ixa_ip_hdr_length = sizeof (iptun_ipv6hdrs_t);
3078                 ixa->ixa_protocol = v6hdrs->it6h_encaplim.iel_destopt.ip6d_nxt;
3079         } else {
3080                 ixa->ixa_ip_hdr_length = sizeof (ip6_t);
3081                 ixa->ixa_protocol = outer6->ip6_nxt;
3082         }
3083         /*
3084          * See iptun_output_process_ipv4() why we allow fragmentation for
3085          * small packets
3086          */
3087         if (mp->b_wptr - innerptr <= minmtu)
3088                 ixa->ixa_flags &= ~IXAF_DONTFRAG;
3089         else if (!(ixa->ixa_flags & IXAF_PMTU_TOO_SMALL))
3090                 ixa->ixa_flags |= IXAF_DONTFRAG;
3091 
3092         ixa->ixa_pktlen = msgdsize(mp);
3093         outer6->ip6_plen = htons(ixa->ixa_pktlen - sizeof (ip6_t));
3094         return (mp);
3095 }
3096 
3097 /*
3098  * The IP tunneling MAC-type plugins have already done most of the header
3099  * processing and validity checks.  We are simply responsible for multiplexing
3100  * down to the ip module below us.
3101  */
3102 static void
3103 iptun_output(iptun_t *iptun, mblk_t *mp)
3104 {
3105         conn_t  *connp = iptun->iptun_connp;
3106         mblk_t  *newmp;
3107         int     error;
3108         ip_xmit_attr_t  *ixa;
3109 
3110         ASSERT(mp->b_datap->db_type == M_DATA);
3111 
3112         if (mp->b_cont != NULL) {
3113                 if ((newmp = msgpullup(mp, -1)) == NULL) {
3114                         iptun_drop_pkt(mp, &iptun->iptun_noxmtbuf);
3115                         return;
3116                 }
3117                 freemsg(mp);
3118                 mp = newmp;
3119         }
3120 
3121         if (iptun->iptun_typeinfo->iti_type == IPTUN_TYPE_6TO4) {
3122                 iptun_output_6to4(iptun, mp);
3123                 return;
3124         }
3125 
3126         if (is_system_labeled()) {
3127                 /*
3128                  * Since the label can be different meaning a potentially
3129                  * different IRE,we always use a unique ip_xmit_attr_t.
3130                  */
3131                 ixa = conn_get_ixa_exclusive(connp);
3132         } else {
3133                 /*
3134                  * If no other thread is using conn_ixa this just gets a
3135                  * reference to conn_ixa. Otherwise we get a safe copy of
3136                  * conn_ixa.
3137                  */
3138                 ixa = conn_get_ixa(connp, B_FALSE);
3139         }
3140         if (ixa == NULL) {
3141                 iptun_drop_pkt(mp, &iptun->iptun_oerrors);
3142                 return;
3143         }
3144 
3145         /*
3146          * In case we got a safe copy of conn_ixa, then we need
3147          * to fill in any pointers in it.
3148          */
3149         if (ixa->ixa_ire == NULL) {
3150                 error = ip_attr_connect(connp, ixa, &connp->conn_saddr_v6,
3151                     &connp->conn_faddr_v6, &connp->conn_faddr_v6, 0,
3152                     NULL, NULL, 0);
3153                 if (error != 0) {
3154                         if (ixa->ixa_ire != NULL &&
3155                             (error == EHOSTUNREACH || error == ENETUNREACH)) {
3156                                 /*
3157                                  * Let conn_ip_output/ire_send_noroute return
3158                                  * the error and send any local ICMP error.
3159                                  */
3160                                 error = 0;
3161                         } else {
3162                                 ixa_refrele(ixa);
3163                                 iptun_drop_pkt(mp, &iptun->iptun_oerrors);
3164                                 return;
3165                         }
3166                 }
3167         }
3168 
3169         iptun_output_common(iptun, ixa, mp);
3170         ixa_refrele(ixa);
3171 }
3172 
3173 /*
3174  * We use an ixa based on the last destination.
3175  */
3176 static void
3177 iptun_output_6to4(iptun_t *iptun, mblk_t *mp)
3178 {
3179         conn_t          *connp = iptun->iptun_connp;
3180         ipha_t          *outer4, *inner4;
3181         ip6_t           *outer6, *inner6;
3182         ip_xmit_attr_t  *ixa;
3183         ip_xmit_attr_t  *oldixa;
3184         int             error;
3185         boolean_t       need_connect;
3186         in6_addr_t      v6dst;
3187 
3188         ASSERT(mp->b_cont == NULL);  /* Verified by iptun_output */
3189 
3190         /* Make sure we set ipha_dst before we look at ipha_dst */
3191 
3192         (void) iptun_find_headers(mp, 0, &outer4, &inner4, &outer6, &inner6);
3193         ASSERT(outer4 != NULL);
3194         if (!iptun_out_process_6to4(iptun, outer4, inner6)) {
3195                 iptun_drop_pkt(mp, &iptun->iptun_oerrors);
3196                 return;
3197         }
3198 
3199         if (is_system_labeled()) {
3200                 /*
3201                  * Since the label can be different meaning a potentially
3202                  * different IRE,we always use a unique ip_xmit_attr_t.
3203                  */
3204                 ixa = conn_get_ixa_exclusive(connp);
3205         } else {
3206                 /*
3207                  * If no other thread is using conn_ixa this just gets a
3208                  * reference to conn_ixa. Otherwise we get a safe copy of
3209                  * conn_ixa.
3210                  */
3211                 ixa = conn_get_ixa(connp, B_FALSE);
3212         }
3213         if (ixa == NULL) {
3214                 iptun_drop_pkt(mp, &iptun->iptun_oerrors);
3215                 return;
3216         }
3217 
3218         mutex_enter(&connp->conn_lock);
3219         if (connp->conn_v4lastdst == outer4->ipha_dst) {
3220                 need_connect = (ixa->ixa_ire == NULL);
3221         } else {
3222                 /* In case previous destination was multirt */
3223                 ip_attr_newdst(ixa);
3224 
3225                 /*
3226                  * We later update conn_ixa when we update conn_v4lastdst
3227                  * which enables subsequent packets to avoid redoing
3228                  * ip_attr_connect
3229                  */
3230                 need_connect = B_TRUE;
3231         }
3232         mutex_exit(&connp->conn_lock);
3233 
3234         /*
3235          * In case we got a safe copy of conn_ixa, or otherwise we don't
3236          * have a current ixa_ire, then we need to fill in any pointers in
3237          * the ixa.
3238          */
3239         if (need_connect) {
3240                 IN6_IPADDR_TO_V4MAPPED(outer4->ipha_dst, &v6dst);
3241 
3242                 /* We handle IPsec in iptun_output_common */
3243                 error = ip_attr_connect(connp, ixa, &connp->conn_saddr_v6,
3244                     &v6dst, &v6dst, 0, NULL, NULL, 0);
3245                 if (error != 0) {
3246                         if (ixa->ixa_ire != NULL &&
3247                             (error == EHOSTUNREACH || error == ENETUNREACH)) {
3248                                 /*
3249                                  * Let conn_ip_output/ire_send_noroute return
3250                                  * the error and send any local ICMP error.
3251                                  */
3252                                 error = 0;
3253                         } else {
3254                                 ixa_refrele(ixa);
3255                                 iptun_drop_pkt(mp, &iptun->iptun_oerrors);
3256                                 return;
3257                         }
3258                 }
3259         }
3260 
3261         iptun_output_common(iptun, ixa, mp);
3262 
3263         /* Atomically replace conn_ixa and conn_v4lastdst */
3264         mutex_enter(&connp->conn_lock);
3265         if (connp->conn_v4lastdst != outer4->ipha_dst) {
3266                 /* Remember the dst which corresponds to conn_ixa */
3267                 connp->conn_v6lastdst = v6dst;
3268                 oldixa = conn_replace_ixa(connp, ixa);
3269         } else {
3270                 oldixa = NULL;
3271         }
3272         mutex_exit(&connp->conn_lock);
3273         ixa_refrele(ixa);
3274         if (oldixa != NULL)
3275                 ixa_refrele(oldixa);
3276 }
3277 
3278 /*
3279  * Check the destination/label. Modifies *mpp by adding/removing CIPSO.
3280  *
3281  * We get the label from the message in order to honor the
3282  * ULPs/IPs choice of label. This will be NULL for forwarded
3283  * packets, neighbor discovery packets and some others.
3284  */
3285 static int
3286 iptun_output_check_label(mblk_t **mpp, ip_xmit_attr_t *ixa)
3287 {
3288         cred_t  *cr;
3289         int     adjust;
3290         int     iplen;
3291         int     err;
3292         ts_label_t *effective_tsl = NULL;
3293 
3294 
3295         ASSERT(is_system_labeled());
3296 
3297         cr = msg_getcred(*mpp, NULL);
3298         if (cr == NULL)
3299                 return (0);
3300 
3301         /*
3302          * We need to start with a label based on the IP/ULP above us
3303          */
3304         ip_xmit_attr_restore_tsl(ixa, cr);
3305 
3306         /*
3307          * Need to update packet with any CIPSO option since
3308          * conn_ip_output doesn't do that.
3309          */
3310         if (ixa->ixa_flags & IXAF_IS_IPV4) {
3311                 ipha_t *ipha;
3312 
3313                 ipha = (ipha_t *)(*mpp)->b_rptr;
3314                 iplen = ntohs(ipha->ipha_length);
3315                 err = tsol_check_label_v4(ixa->ixa_tsl,
3316                     ixa->ixa_zoneid, mpp, CONN_MAC_DEFAULT, B_FALSE,
3317                     ixa->ixa_ipst, &effective_tsl);
3318                 if (err != 0)
3319                         return (err);
3320 
3321                 ipha = (ipha_t *)(*mpp)->b_rptr;
3322                 adjust = (int)ntohs(ipha->ipha_length) - iplen;
3323         } else {
3324                 ip6_t *ip6h;
3325 
3326                 ip6h = (ip6_t *)(*mpp)->b_rptr;
3327                 iplen = ntohs(ip6h->ip6_plen);
3328 
3329                 err = tsol_check_label_v6(ixa->ixa_tsl,
3330                     ixa->ixa_zoneid, mpp, CONN_MAC_DEFAULT, B_FALSE,
3331                     ixa->ixa_ipst, &effective_tsl);
3332                 if (err != 0)
3333                         return (err);
3334 
3335                 ip6h = (ip6_t *)(*mpp)->b_rptr;
3336                 adjust = (int)ntohs(ip6h->ip6_plen) - iplen;
3337         }
3338 
3339         if (effective_tsl != NULL) {
3340                 /* Update the label */
3341                 ip_xmit_attr_replace_tsl(ixa, effective_tsl);
3342         }
3343         ixa->ixa_pktlen += adjust;
3344         ixa->ixa_ip_hdr_length += adjust;
3345         return (0);
3346 }
3347 
3348 
3349 static void
3350 iptun_output_common(iptun_t *iptun, ip_xmit_attr_t *ixa, mblk_t *mp)
3351 {
3352         ipsec_tun_pol_t *itp = iptun->iptun_itp;
3353         int             outer_hlen;
3354         mblk_t          *newmp;
3355         ipha_t          *outer4, *inner4;
3356         ip6_t           *outer6, *inner6;
3357         int             error;
3358         boolean_t       update_pktlen;
3359 
3360         ASSERT(ixa->ixa_ire != NULL);
3361 
3362         outer_hlen = iptun_find_headers(mp, 0, &outer4, &inner4, &outer6,
3363             &inner6);
3364         if (outer_hlen == 0) {
3365                 iptun_drop_pkt(mp, &iptun->iptun_oerrors);
3366                 return;
3367         }
3368 
3369         /* Save IXAF_DONTFRAG value */
3370         iaflags_t dontfrag = ixa->ixa_flags & IXAF_DONTFRAG;
3371 
3372         /* Perform header processing. */
3373         if (outer4 != NULL) {
3374                 mp = iptun_out_process_ipv4(iptun, mp, outer4, inner4, inner6,
3375                     ixa);
3376         } else {
3377                 mp = iptun_out_process_ipv6(iptun, mp, outer6, inner4, inner6,
3378                     ixa);
3379         }
3380         if (mp == NULL)
3381                 return;
3382 
3383         /*
3384          * Let's hope the compiler optimizes this with "branch taken".
3385          */
3386         if (itp != NULL && (itp->itp_flags & ITPF_P_ACTIVE)) {
3387                 /* This updates the ip_xmit_attr_t */
3388                 mp = ipsec_tun_outbound(mp, iptun, inner4, inner6, outer4,
3389                     outer6, outer_hlen, ixa);
3390                 if (mp == NULL) {
3391                         atomic_inc_64(&iptun->iptun_oerrors);
3392                         return;
3393                 }
3394                 if (is_system_labeled()) {
3395                         /*
3396                          * Might change the packet by adding/removing CIPSO.
3397                          * After this caller inner* and outer* and outer_hlen
3398                          * might be invalid.
3399                          */
3400                         error = iptun_output_check_label(&mp, ixa);
3401                         if (error != 0) {
3402                                 ip2dbg(("label check failed (%d)\n", error));
3403                                 iptun_drop_pkt(mp, &iptun->iptun_oerrors);
3404                                 return;
3405                         }
3406                 }
3407 
3408                 /*
3409                  * ipsec_tun_outbound() returns a chain of tunneled IP
3410                  * fragments linked with b_next (or a single message if the
3411                  * tunneled packet wasn't a fragment).
3412                  * If fragcache returned a list then we need to update
3413                  * ixa_pktlen for all packets in the list.
3414                  */
3415                 update_pktlen = (mp->b_next != NULL);
3416 
3417                 /*
3418                  * Otherwise, we're good to go.  The ixa has been updated with
3419                  * instructions for outbound IPsec processing.
3420                  */
3421                 for (newmp = mp; newmp != NULL; newmp = mp) {
3422                         size_t minmtu = iptun->iptun_typeinfo->iti_minmtu;
3423 
3424                         atomic_inc_64(&iptun->iptun_opackets);
3425                         atomic_add_64(&iptun->iptun_obytes, ixa->ixa_pktlen);
3426                         mp = mp->b_next;
3427                         newmp->b_next = NULL;
3428 
3429                         /*
3430                          * The IXAF_DONTFRAG flag is global, but there is
3431                          * a chain here.  Check if we're really already
3432                          * smaller than the minimum allowed MTU and reset here
3433                          * appropriately.  Otherwise one small packet can kill
3434                          * the whole chain's path mtu discovery.
3435                          * In addition, update the pktlen to the length of
3436                          * the actual packet being processed.
3437                          */
3438                         if (update_pktlen) {
3439                                 ixa->ixa_pktlen = msgdsize(newmp);
3440                                 if (ixa->ixa_pktlen <= minmtu)
3441                                         ixa->ixa_flags &= ~IXAF_DONTFRAG;
3442                         }
3443 
3444                         atomic_inc_64(&iptun->iptun_opackets);
3445                         atomic_add_64(&iptun->iptun_obytes, ixa->ixa_pktlen);
3446 
3447                         error = conn_ip_output(newmp, ixa);
3448 
3449                         /* Restore IXAF_DONTFRAG value */
3450                         ixa->ixa_flags |= dontfrag;
3451 
3452                         if (error == EMSGSIZE) {
3453                                 /* IPsec policy might have changed */
3454                                 (void) iptun_update_mtu(iptun, ixa, 0);
3455                         }
3456                 }
3457         } else {
3458                 /*
3459                  * The ip module will potentially apply global policy to the
3460                  * packet in its output path if there's no active tunnel
3461                  * policy.
3462                  */
3463                 ASSERT(ixa->ixa_ipsec_policy == NULL);
3464                 mp = ip_output_attach_policy(mp, outer4, outer6, NULL, ixa);
3465                 if (mp == NULL) {
3466                         atomic_inc_64(&iptun->iptun_oerrors);
3467                         return;
3468                 }
3469                 if (is_system_labeled()) {
3470                         /*
3471                          * Might change the packet by adding/removing CIPSO.
3472                          * After this caller inner* and outer* and outer_hlen
3473                          * might be invalid.
3474                          */
3475                         error = iptun_output_check_label(&mp, ixa);
3476                         if (error != 0) {
3477                                 ip2dbg(("label check failed (%d)\n", error));
3478                                 iptun_drop_pkt(mp, &iptun->iptun_oerrors);
3479                                 return;
3480                         }
3481                 }
3482 
3483                 atomic_inc_64(&iptun->iptun_opackets);
3484                 atomic_add_64(&iptun->iptun_obytes, ixa->ixa_pktlen);
3485 
3486                 error = conn_ip_output(mp, ixa);
3487                 if (error == EMSGSIZE) {
3488                         /* IPsec policy might have changed */
3489                         (void) iptun_update_mtu(iptun, ixa, 0);
3490                 }
3491         }
3492         if (ixa->ixa_flags & IXAF_IPSEC_SECURE)
3493                 ipsec_out_release_refs(ixa);
3494 }
3495 
3496 static mac_callbacks_t iptun_m_callbacks = {
3497         .mc_callbacks   = (MC_SETPROP | MC_GETPROP | MC_PROPINFO),
3498         .mc_getstat     = iptun_m_getstat,
3499         .mc_start       = iptun_m_start,
3500         .mc_stop        = iptun_m_stop,
3501         .mc_setpromisc  = iptun_m_setpromisc,
3502         .mc_multicst    = iptun_m_multicst,
3503         .mc_unicst      = iptun_m_unicst,
3504         .mc_tx          = iptun_m_tx,
3505         .mc_reserved    = NULL,
3506         .mc_setprop     = iptun_m_setprop,
3507         .mc_getprop     = iptun_m_getprop,
3508         .mc_propinfo    = iptun_m_propinfo
3509 };