1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 /*
  28  *  This module supports AF_TRILL sockets and TRILL layer-2 forwarding.
  29  */
  30 
  31 #include <sys/strsubr.h>
  32 #include <sys/socket.h>
  33 #include <sys/socketvar.h>
  34 #include <sys/modctl.h>
  35 #include <sys/cmn_err.h>
  36 #include <sys/tihdr.h>
  37 #include <sys/strsun.h>
  38 #include <sys/policy.h>
  39 #include <sys/ethernet.h>
  40 #include <sys/vlan.h>
  41 #include <net/trill.h>
  42 #include <net/if_dl.h>
  43 #include <sys/mac.h>
  44 #include <sys/mac_client.h>
  45 #include <sys/mac_provider.h>
  46 #include <sys/mac_client_priv.h>
  47 #include <sys/sdt.h>
  48 #include <sys/dls.h>
  49 #include <sys/sunddi.h>
  50 
  51 #include "trill_impl.h"
  52 
/*
 * Forward declarations of file-local routines that are referenced before
 * their definitions.
 */
static void trill_del_all(trill_inst_t *, boolean_t);
static int trill_del_nick(trill_inst_t *, uint16_t, boolean_t);
static void trill_stop_recv(trill_sock_t *);
static void trill_ctrl_input(trill_sock_t *, mblk_t *, const uint8_t *,
    uint16_t);
static trill_node_t *trill_node_lookup(trill_inst_t *, uint16_t);
static void trill_node_unref(trill_inst_t *, trill_node_t *);
static void trill_sock_unref(trill_sock_t *);
static void trill_kstats_init(trill_sock_t *, const char *);

/*
 * List of TRILL instances.  trill_instance_unref() holds trill_inst_rwlock
 * as writer while removing an instance from this list.
 */
static list_t trill_inst_list;
static krwlock_t trill_inst_rwlock;

/* Socket creation entry point registered through sinfo below. */
static sock_lower_handle_t trill_create(int, int, int, sock_downcalls_t **,
    uint_t *, int *, int, cred_t *);
  68 
/*
 * Socket module registration information: names the module "trill" and
 * supplies trill_create as its creation routine.  The last slot is left
 * NULL.
 */
static smod_reg_t sinfo = {
        SOCKMOD_VERSION,
        "trill",
        SOCK_UC_VERSION,
        SOCK_DC_VERSION,
        trill_create,
        NULL,
};

/* modlsockmod structure */
static struct modlsockmod sockmod = {
        &mod_sockmodops, "AF_TRILL socket module", &sinfo
};

/* modlinkage structure */
static struct modlinkage ml = {
        MODREV_1,
        { &sockmod, NULL }
};
  88 
/*
 * True when nickname n is neither RBRIDGE_NICKNAME_NONE nor
 * RBRIDGE_NICKNAME_UNUSED.  The argument is evaluated twice, so it must
 * not have side effects.
 */
#define VALID_NICK(n)   ((n) != RBRIDGE_NICKNAME_NONE && \
                        (n) != RBRIDGE_NICKNAME_UNUSED)
  91 
  92 static mblk_t *
  93 create_trill_header(trill_sock_t *tsock, mblk_t *mp, const uint8_t *daddr,
  94     boolean_t trill_hdr_ok, boolean_t multidest, uint16_t tci,
  95     size_t msglen)
  96 {
  97         int extra_hdr_len;
  98         struct ether_vlan_header *ethvlanhdr;
  99         mblk_t *hdr_mp;
 100         uint16_t etype;
 101 
 102         etype = msglen > 0 ? (uint16_t)msglen : ETHERTYPE_TRILL;
 103 
 104         /* When sending on the PVID, we must not give a VLAN ID */
 105         if (tci == tsock->ts_link->bl_pvid)
 106                 tci = TRILL_NO_TCI;
 107 
 108         /*
 109          * Create new Ethernet header and include additional space
 110          * for writing TRILL header and/or VLAN tag.
 111          */
 112         extra_hdr_len = (trill_hdr_ok ? 0 : sizeof (trill_header_t)) +
 113             (tci != TRILL_NO_TCI ? sizeof (struct ether_vlan_extinfo) : 0);
 114         hdr_mp = mac_header(tsock->ts_link->bl_mh, daddr,
 115             tci != TRILL_NO_TCI ? ETHERTYPE_VLAN : etype, mp, extra_hdr_len);
 116         if (hdr_mp == NULL) {
 117                 freemsg(mp);
 118                 return (NULL);
 119         }
 120 
 121         if (tci != TRILL_NO_TCI) {
 122                 /* LINTED: alignment */
 123                 ethvlanhdr = (struct ether_vlan_header *)hdr_mp->b_rptr;
 124                 ethvlanhdr->ether_tci = htons(tci);
 125                 ethvlanhdr->ether_type = htons(etype);
 126                 hdr_mp->b_wptr += sizeof (struct ether_vlan_extinfo);
 127         }
 128 
 129         if (!trill_hdr_ok) {
 130                 trill_header_t *thp;
 131                 /* LINTED: alignment */
 132                 thp = (trill_header_t *)hdr_mp->b_wptr;
 133                 (void) memset(thp, 0, sizeof (trill_header_t));
 134                 thp->th_hopcount = TRILL_DEFAULT_HOPS;
 135                 thp->th_multidest = (multidest ? 1:0);
 136                 hdr_mp->b_wptr += sizeof (trill_header_t);
 137         }
 138 
 139         hdr_mp->b_cont = mp;
 140         return (hdr_mp);
 141 }
 142 
 143 /*
 144  * TRILL local recv function. TRILL data frames that should be received
 145  * by the local system are decapsulated here and passed to bridging for
 146  * learning and local system receive. Only called when we are the forwarder
 147  * on the link (multi-dest frames) or the frame was destined for us.
 148  */
 149 static void
 150 trill_recv_local(trill_sock_t *tsock, mblk_t *mp, uint16_t ingressnick)
 151 {
 152         struct ether_header *inner_ethhdr;
 153 
 154         /* LINTED: alignment */
 155         inner_ethhdr = (struct ether_header *)mp->b_rptr;
 156         DTRACE_PROBE1(trill__recv__local, struct ether_header *, inner_ethhdr);
 157 
 158         DB_CKSUMFLAGS(mp) = 0;
 159         /*
 160          * Transmit the decapsulated frame on the link via Bridging.
 161          * Bridging does source address learning and appropriate forwarding.
 162          */
 163         bridge_trill_decaps(tsock->ts_link, mp, ingressnick);
 164         KSPINCR(tks_decap);
 165 }
 166 
 167 /*
 168  * Determines the outgoing link to reach a RBridge having the given nick
 169  * Assumes caller has acquired the trill instance rwlock.
 170  */
 171 static trill_sock_t *
 172 find_trill_link(trill_inst_t *tip, datalink_id_t linkid)
 173 {
 174         trill_sock_t *tsp = NULL;
 175 
 176         ASSERT(RW_LOCK_HELD(&tip->ti_rwlock));
 177         for (tsp = list_head(&tip->ti_socklist); tsp != NULL;
 178             tsp = list_next(&tip->ti_socklist, tsp)) {
 179                 if (tsp->ts_link != NULL && tsp->ts_link->bl_linkid == linkid) {
 180                         ASSERT(tsp->ts_link->bl_mh != NULL);
 181                         ASSERT(!(tsp->ts_flags & TSF_SHUTDOWN));
 182                         atomic_inc_uint(&tsp->ts_refs);
 183                         break;
 184                 }
 185         }
 186         return (tsp);
 187 }
 188 
 189 /*
 190  * TRILL destination forwarding function. Transmits the TRILL data packet
 191  * to the next-hop, adjacent RBridge.  Consumes passed mblk_t.
 192  */
 193 static void
 194 trill_dest_fwd(trill_inst_t *tip, mblk_t *fwd_mp, uint16_t adj_nick,
 195     boolean_t has_trill_hdr, boolean_t multidest, uint16_t dtnick)
 196 {
 197         trill_node_t *adj;
 198         trill_sock_t *tsock = NULL;
 199         trill_header_t *trillhdr;
 200         struct ether_header *ethhdr;
 201         int ethtype;
 202         int ethhdrlen;
 203 
 204         adj = trill_node_lookup(tip, adj_nick);
 205         if (adj == NULL || ((tsock = adj->tn_tsp) == NULL))
 206                 goto dest_fwd_fail;
 207 
 208         ASSERT(tsock->ts_link != NULL);
 209         ASSERT(!(tsock->ts_flags & TSF_SHUTDOWN));
 210         ASSERT(adj->tn_ni != NULL);
 211 
 212         DTRACE_PROBE3(trill__dest__fwd, uint16_t, adj_nick, trill_node_t,
 213             adj, trill_sock_t, tsock);
 214 
 215         /*
 216          * For broadcast links by using the dest address of
 217          * the RBridge to forward the frame should result in
 218          * savings. When the link is a bridged LAN or there are
 219          * many end stations the frame will not always be flooded.
 220          */
 221         fwd_mp = create_trill_header(tsock, fwd_mp, adj->tn_ni->tni_adjsnpa,
 222             has_trill_hdr, multidest, tsock->ts_desigvlan, 0);
 223         if (fwd_mp == NULL)
 224                 goto dest_fwd_fail;
 225 
 226         /* LINTED: alignment */
 227         ethhdr = (struct ether_header *)fwd_mp->b_rptr;
 228         ethtype = ntohs(ethhdr->ether_type);
 229         ASSERT(ethtype == ETHERTYPE_VLAN || ethtype == ETHERTYPE_TRILL);
 230 
 231         /* Pullup Ethernet and TRILL header (w/o TRILL options) */
 232         ethhdrlen = sizeof (struct ether_header) +
 233             (ethtype == ETHERTYPE_VLAN ? sizeof (struct ether_vlan_extinfo):0);
 234         if (!pullupmsg(fwd_mp, ethhdrlen + sizeof (trill_header_t)))
 235                 goto dest_fwd_fail;
 236         /* LINTED: alignment */
 237         trillhdr = (struct trill_header *)(fwd_mp->b_rptr + ethhdrlen);
 238 
 239         /* Update TRILL header with ingress and egress nicks for new frames */
 240         if (!has_trill_hdr) {
 241                 /* We are creating a new TRILL frame */
 242                 trillhdr->th_egressnick = (multidest ? dtnick:adj_nick);
 243                 rw_enter(&tip->ti_rwlock, RW_READER);
 244                 trillhdr->th_ingressnick = tip->ti_nick;
 245                 rw_exit(&tip->ti_rwlock);
 246                 if (!VALID_NICK(trillhdr->th_ingressnick))
 247                         goto dest_fwd_fail;
 248         }
 249 
 250         /* Set hop count and update header in packet */
 251         ASSERT(trillhdr->th_hopcount != 0);
 252         trillhdr->th_hopcount--;
 253 
 254         /* Clear checksum flag and transmit frame on the link */
 255         DB_CKSUMFLAGS(fwd_mp) = 0;
 256         DTRACE_PROBE1(trill__dest__fwd__tx, trill_header_t *, &trillhdr);
 257         fwd_mp = bridge_trill_output(tsock->ts_link, fwd_mp);
 258         if (fwd_mp == NULL) {
 259                 KSPINCR(tks_sent);
 260                 KSPINCR(tks_forward);
 261         } else {
 262                 freemsg(fwd_mp);
 263                 KSPINCR(tks_drops);
 264         }
 265         trill_node_unref(tip, adj);
 266         return;
 267 
 268 dest_fwd_fail:
 269         if (adj != NULL)
 270                 trill_node_unref(tip, adj);
 271         if (tsock != NULL)
 272                 KSPINCR(tks_drops);
 273         freemsg(fwd_mp);
 274 }
 275 
/*
 * TRILL multi-destination forwarding. Transmits the packet to the adjacencies
 * on the distribution tree determined by the egress nick. Source addr (saddr)
 * is NULL for new TRILL packets originating from us.
 *
 *   tip          - TRILL instance
 *   mp           - frame to forward
 *   egressnick   - nickname of the distribution tree root
 *   ingressnick  - nickname of the ingress RBridge; never forwarded to
 *   is_trill_pkt - passed to trill_dest_fwd() as has_trill_hdr
 *   saddr        - outer source MAC of the received frame, or NULL; the
 *                  adjacency with this address is skipped
 *   inner_vlan   - inner VLAN ID, or VLAN_ID_NONE to disable VLAN filtering
 *   free_mblk    - B_TRUE if mp is to be consumed here (either handed to
 *                  trill_dest_fwd() for one adjacency or freed); B_FALSE if
 *                  mp stays with the caller and only copies are forwarded
 */
static void
trill_multidest_fwd(trill_inst_t *tip, mblk_t *mp, uint16_t egressnick,
    uint16_t ingressnick, boolean_t is_trill_pkt, const uint8_t *saddr,
    int inner_vlan, boolean_t free_mblk)
{
        int idx;
        uint16_t adjnick;
        trill_node_t *dest;
        trill_node_t *adj;
        mblk_t *fwd_mp;
        boolean_t nicksaved = B_FALSE;
        /* Only meaningful once nicksaved is B_TRUE */
        uint16_t adjnicksaved;

        /* Lookup the egress nick info, this is the DT root */
        if ((dest = trill_node_lookup(tip, egressnick)) == NULL)
                goto fail_multidest_fwd;

        /* Send a copy to all our adjacencies on the DT root  */
        ASSERT(dest->tn_ni);
        for (idx = 0; idx < dest->tn_ni->tni_adjcount; idx++) {

                /* Check for a valid adjacency node */
                adjnick = TNI_ADJNICK(dest->tn_ni, idx);
                if (!VALID_NICK(adjnick) || ingressnick == adjnick ||
                    ((adj = trill_node_lookup(tip, adjnick)) == NULL))
                        continue;

                /* Do not forward back to adjacency that sent the pkt to us */
                ASSERT(adj->tn_ni != NULL);
                if ((saddr != NULL) &&
                    (memcmp(adj->tn_ni->tni_adjsnpa, saddr,
                    ETHERADDRL) == 0)) {
                        trill_node_unref(tip, adj);
                        continue;
                }

                /* Check if adj is marked as reaching inner VLAN downstream */
                if ((inner_vlan != VLAN_ID_NONE) &&
                    !TRILL_VLANISSET(TNI_VLANFILTERMAP(dest->tn_ni, idx),
                    inner_vlan)) {
                        trill_node_unref(tip, adj);
                        DTRACE_PROBE4(trill__multi__dest__fwd__vlanfiltered,
                            uint16_t, adjnick, uint16_t, ingressnick,
                            uint16_t, egressnick, int, inner_vlan);
                        continue;
                }

                /*
                 * The adjacency was only needed for the checks above;
                 * trill_dest_fwd() does its own lookup by nickname.
                 */
                trill_node_unref(tip, adj);

                /*
                 * Save the nick and look ahead to see if we should forward the
                 * frame to more adjacencies. We avoid doing a copy for this
                 * nick and use the passed mblk when we can consume the passed
                 * mblk.
                 */
                if (free_mblk && !nicksaved) {
                        adjnicksaved = adjnick;
                        nicksaved = B_TRUE;
                        continue;
                }

                /* If the copy fails, stop forwarding to further adjacencies */
                fwd_mp = copymsg(mp);
                if (fwd_mp == NULL)
                        break;
                DTRACE_PROBE2(trill__multi__dest__fwd, uint16_t,
                    adjnick, uint16_t, ingressnick);
                trill_dest_fwd(tip, fwd_mp, adjnick, is_trill_pkt,
                    B_TRUE, egressnick);
        }
        trill_node_unref(tip, dest);

        /* The saved adjacency gets the original mblk, which is consumed */
        if (nicksaved) {
                ASSERT(free_mblk);
                DTRACE_PROBE2(trill__multi__dest__fwd, uint16_t,
                    adjnicksaved, uint16_t, ingressnick);
                trill_dest_fwd(tip, mp, adjnicksaved, is_trill_pkt,
                    B_TRUE, egressnick);
                return;
        }

        /*
         * NOTE(review): nicksaved is only ever set when free_mblk is B_TRUE,
         * so callers passing B_FALSE always reach this label and fire the
         * __fail probe even when copies were forwarded above -- confirm
         * whether that is intended.
         */
fail_multidest_fwd:
        DTRACE_PROBE2(trill__multi__dest__fwd__fail, uint16_t,
            egressnick, uint16_t, ingressnick);
        if (free_mblk) {
                freemsg(mp);
        }
}
 368 
/*
 * TRILL data receive function. Forwards the received frame if necessary
 * and also determines if the received frame should be consumed locally.
 * Consumes passed mblk.
 *
 *   tsock   - socket bound to the link the frame arrived on
 *   mp      - received frame, starting at the TRILL header
 *   mpsaddr - outer Ethernet source address of the frame
 *
 * Single-destination frames are either decapsulated locally (egress nick is
 * ours) or forwarded towards the egress nick.  Multi-destination frames must
 * pass an adjacency check and a reverse path forwarding check; they are then
 * forwarded along the distribution tree and also received locally.  Frames
 * failing any check are freed and counted as drops.
 */
static void
trill_recv(trill_sock_t *tsock, mblk_t *mp, const uint8_t *mpsaddr)
{
        trill_header_t *trillhdr;
        trill_node_t *dest = NULL;
        trill_node_t *source = NULL;
        trill_node_t *adj;
        uint16_t ournick, adjnick, treeroot;
        struct ether_header *ethhdr;
        trill_inst_t *tip = tsock->ts_tip;
        uint8_t srcaddr[ETHERADDRL];
        size_t trillhdrlen;
        int inner_vlan = VLAN_ID_NONE;
        int tci;
        int idx;
        size_t min_size;

        /* Copy Ethernet source address before modifying packet */
        (void) memcpy(srcaddr, mpsaddr, ETHERADDRL);

        /* Pull up TRILL header if necessary. */
        min_size = sizeof (trill_header_t);
        if ((MBLKL(mp) < min_size ||
            !IS_P2ALIGNED(mp->b_rptr, TRILL_HDR_ALIGN)) &&
            !pullupmsg(mp, min_size))
                goto fail;

        /* LINTED: alignment */
        trillhdr = (trill_header_t *)mp->b_rptr;
        if (trillhdr->th_version != TRILL_PROTOCOL_VERS) {
                DTRACE_PROBE1(trill__recv__wrongversion,
                    trill_header_t *, trillhdr);
                goto fail;
        }

        /* Drop if unknown or invalid nickname */
        if (!VALID_NICK(trillhdr->th_egressnick) ||
            !VALID_NICK(trillhdr->th_ingressnick)) {
                DTRACE_PROBE1(trill__recv__invalidnick,
                    trill_header_t *, trillhdr);
                goto fail;
        }

        /* Snapshot our nickname and tree root under the instance lock */
        rw_enter(&tip->ti_rwlock, RW_READER);
        ournick = tip->ti_nick;
        treeroot = tip->ti_treeroot;
        rw_exit(&tip->ti_rwlock);
        /* Drop if we received a packet with our nick as ingress */
        if (trillhdr->th_ingressnick == ournick)
                goto fail;

        /* Re-pull any TRILL options and inner Ethernet header */
        min_size += GET_TRILL_OPTS_LEN(trillhdr) * sizeof (uint32_t) +
            sizeof (struct ether_header);
        if (MBLKL(mp) < min_size) {
                if (!pullupmsg(mp, min_size))
                        goto fail;
                /* b_rptr may have changed; refresh the header pointer */
                /* LINTED: alignment */
                trillhdr = (trill_header_t *)mp->b_rptr;
        }
        trillhdrlen = sizeof (trill_header_t) +
            (GET_TRILL_OPTS_LEN(trillhdr) * sizeof (uint32_t));

        /*
         * Get the inner Ethernet header, plus the inner VLAN header if there
         * is one.
         */
        /* LINTED: alignment */
        ethhdr = (struct ether_header *)(mp->b_rptr + trillhdrlen);
        if (ethhdr->ether_type == htons(ETHERTYPE_VLAN)) {
                min_size += sizeof (struct ether_vlan_extinfo);
                if (MBLKL(mp) < min_size) {
                        if (!pullupmsg(mp, min_size))
                                goto fail;
                        /* Refresh both pointers after the pullup */
                        /* LINTED: alignment */
                        trillhdr = (trill_header_t *)mp->b_rptr;
                        /* LINTED: alignment */
                        ethhdr = (struct ether_header *)(mp->b_rptr +
                            trillhdrlen);
                }

                tci = ntohs(((struct ether_vlan_header *)ethhdr)->ether_tci);
                inner_vlan = VLAN_ID(tci);
        }

        /* Known/single destination forwarding. */
        if (!trillhdr->th_multidest) {

                /* Inner MacDA must be unicast */
                if (ethhdr->ether_dhost.ether_addr_octet[0] & 1)
                        goto fail;

                /* Ingress and Egress nicks must be different */
                if (trillhdr->th_egressnick == trillhdr->th_ingressnick)
                        goto fail;

                DTRACE_PROBE1(trill__recv__singledest,
                    trill_header_t *, trillhdr);
                if (trillhdr->th_egressnick == ournick) {
                        /* Strip the TRILL header and receive locally */
                        mp->b_rptr += trillhdrlen;
                        trill_recv_local(tsock, mp, trillhdr->th_ingressnick);
                } else if (trillhdr->th_hopcount > 0) {
                        trill_dest_fwd(tip, mp, trillhdr->th_egressnick,
                            B_TRUE, B_FALSE, RBRIDGE_NICKNAME_NONE);
                } else {
                        /* Not for us and no hops left */
                        goto fail;
                }
                return;
        }

        /*
         * Multi-destination frame: perform checks verifying we have
         * received a valid multi-destination frame before receiving the
         * frame locally and forwarding the frame to other RBridges.
         *
         * Check if we received this multi-destination frame on a
         * adjacency in the distribution tree indicated by the frame's
         * egress nickname.
         */
        if ((dest = trill_node_lookup(tip, trillhdr->th_egressnick)) == NULL)
                goto fail;
        for (idx = 0; idx < dest->tn_ni->tni_adjcount; idx++) {
                adjnick = TNI_ADJNICK(dest->tn_ni, idx);
                if ((adj = trill_node_lookup(tip, adjnick)) == NULL)
                        continue;
                if (memcmp(adj->tn_ni->tni_adjsnpa, srcaddr, ETHERADDRL) == 0) {
                        trill_node_unref(tip, adj);
                        break;
                }
                trill_node_unref(tip, adj);
        }

        /* The loop ran to completion: no adjacency matched the sender */
        if (idx >= dest->tn_ni->tni_adjcount) {
                DTRACE_PROBE2(trill__recv__multidest__adjcheckfail,
                    trill_header_t *, trillhdr, trill_node_t *, dest);
                goto fail;
        }

        /*
         * Reverse path forwarding check. Check if the ingress RBridge
         * that has forwarded the frame advertised the use of the
         * distribution tree specified in the egress nick.
         */
        if ((source = trill_node_lookup(tip, trillhdr->th_ingressnick)) == NULL)
                goto fail;
        for (idx = 0; idx < source->tn_ni->tni_dtrootcount; idx++) {
                if (TNI_DTROOTNICK(source->tn_ni, idx) ==
                    trillhdr->th_egressnick)
                        break;
        }

        if (idx >= source->tn_ni->tni_dtrootcount) {
                /*
                 * Allow receipt of forwarded frame with the highest
                 * tree root RBridge as the egress RBridge when the
                 * ingress RBridge has not advertised the use of any
                 * distribution trees.
                 */
                if (source->tn_ni->tni_dtrootcount != 0 ||
                    trillhdr->th_egressnick != treeroot) {
                        DTRACE_PROBE3(
                            trill__recv__multidest__rpfcheckfail,
                            trill_header_t *, trillhdr, trill_node_t *,
                            source, trill_inst_t *, tip);
                        goto fail;
                }
        }

        /* Check hop count before doing any forwarding */
        if (trillhdr->th_hopcount == 0)
                goto fail;

        /* Forward frame using the distribution tree specified by egress nick */
        DTRACE_PROBE2(trill__recv__multidest, trill_header_t *,
            trillhdr, trill_node_t *, source);
        /* No path below jumps to fail, so the references are dropped here */
        trill_node_unref(tip, source);
        trill_node_unref(tip, dest);

        /* Tell forwarding not to free if we're the link forwarder. */
        trill_multidest_fwd(tip, mp, trillhdr->th_egressnick,
            trillhdr->th_ingressnick, B_TRUE, srcaddr, inner_vlan,
            B_FALSE);

        /*
         * Send de-capsulated frame locally if we are the link forwarder (also
         * does bridge learning).
         */
        mp->b_rptr += trillhdrlen;
        trill_recv_local(tsock, mp, trillhdr->th_ingressnick);
        KSPINCR(tks_recv);
        return;

fail:
        DTRACE_PROBE2(trill__recv__multidest__fail, mblk_t *, mp,
            trill_sock_t *, tsock);
        if (dest != NULL)
                trill_node_unref(tip, dest);
        if (source != NULL)
                trill_node_unref(tip, source);
        freemsg(mp);
        KSPINCR(tks_drops);
}
 576 
/*
 * Detach the socket from its bridge link and return it to the unbound
 * state.  Returns immediately if the socket is already unbound or has no
 * link.  If another thread is already doing this, wait for it to finish
 * and then re-check.  Otherwise wait for in-progress socket transmit
 * operations to drain, delete the kstats, drop the bridge link reference
 * and mark the socket TS_UNBND.
 */
static void
trill_stop_recv(trill_sock_t *tsock)
{
        mutex_enter(&tsock->ts_socklock);
stop_retry:
        if (tsock->ts_state == TS_UNBND || tsock->ts_link == NULL) {
                mutex_exit(&tsock->ts_socklock);
                return;
        }

        /*
         * If another thread is closing the socket then wait. Our callers
         * expect us to return only after the socket is closed.
         */
        if (tsock->ts_flags & TSF_CLOSEWAIT) {
                cv_wait(&tsock->ts_sockclosewait, &tsock->ts_socklock);
                goto stop_retry;
        }

        /*
         * Set state and flags to block new bind or close calls
         * while we close the socket.
         */
        tsock->ts_flags |= TSF_CLOSEWAIT;

        /* Wait until all AF_TRILL socket transmit operations are done */
        while (tsock->ts_sockthreadcount > 0)
                cv_wait(&tsock->ts_sockthreadwait, &tsock->ts_socklock);

        /*
         * We are guaranteed to be the only thread closing on the
         * socket while the TSF_CLOSEWAIT flag is set, all others cv_wait
         * for us to finish.
         */
        ASSERT(tsock->ts_link != NULL);
        if (tsock->ts_ksp != NULL)
                kstat_delete(tsock->ts_ksp);

        /*
         * Release lock before bridge_trill_lnunref to prevent deadlock
         * between trill_ctrl_input thread waiting to acquire ts_socklock
         * and bridge_trill_lnunref waiting for the trill thread to finish.
         */
        mutex_exit(&tsock->ts_socklock);

        /*
         * Release TRILL link reference from Bridging. On return from
         * bridge_trill_lnunref we can be sure there are no active TRILL data
         * threads for this link.
         */
        bridge_trill_lnunref(tsock->ts_link);

        /* Set socket as unbound & wakeup threads waiting for socket to close */
        mutex_enter(&tsock->ts_socklock);
        ASSERT(tsock->ts_link != NULL);
        tsock->ts_link = NULL;
        tsock->ts_state = TS_UNBND;
        tsock->ts_flags &= ~TSF_CLOSEWAIT;
        cv_broadcast(&tsock->ts_sockclosewait);
        mutex_exit(&tsock->ts_socklock);
}
 638 
 639 static int
 640 trill_start_recv(trill_sock_t *tsock, const struct sockaddr *sa, socklen_t len)
 641 {
 642         struct sockaddr_dl *lladdr = (struct sockaddr_dl *)sa;
 643         datalink_id_t linkid;
 644         int err = 0;
 645 
 646         if (len != sizeof (*lladdr))
 647                 return (EINVAL);
 648 
 649         mutex_enter(&tsock->ts_socklock);
 650         if (tsock->ts_tip == NULL || tsock->ts_state != TS_UNBND) {
 651                 err = EINVAL;
 652                 goto bind_error;
 653         }
 654 
 655         if (tsock->ts_flags & TSF_CLOSEWAIT || tsock->ts_link != NULL) {
 656                 err = EBUSY;
 657                 goto bind_error;
 658         }
 659 
 660         (void) memcpy(&(tsock->ts_lladdr), lladdr,
 661             sizeof (struct sockaddr_dl));
 662         (void) memcpy(&linkid, tsock->ts_lladdr.sdl_data,
 663             sizeof (datalink_id_t));
 664 
 665         tsock->ts_link = bridge_trill_lnref(tsock->ts_tip->ti_binst,
 666             linkid, tsock);
 667         if (tsock->ts_link == NULL) {
 668                 err = EINVAL;
 669                 goto bind_error;
 670         }
 671 
 672         trill_kstats_init(tsock, tsock->ts_tip->ti_bridgename);
 673         tsock->ts_state = TS_IDLE;
 674 
 675 bind_error:
 676         mutex_exit(&tsock->ts_socklock);
 677         return (err);
 678 }
 679 
 680 static int
 681 trill_do_unbind(trill_sock_t *tsock)
 682 {
 683         /* If a bind has not been done, we can't unbind. */
 684         if (tsock->ts_state != TS_IDLE)
 685                 return (EINVAL);
 686 
 687         trill_stop_recv(tsock);
 688         return (0);
 689 }
 690 
 691 static void
 692 trill_instance_unref(trill_inst_t *tip)
 693 {
 694         rw_enter(&trill_inst_rwlock, RW_WRITER);
 695         rw_enter(&tip->ti_rwlock, RW_WRITER);
 696         if (atomic_dec_uint_nv(&tip->ti_refs) == 0) {
 697                 list_remove(&trill_inst_list, tip);
 698                 rw_exit(&tip->ti_rwlock);
 699                 rw_exit(&trill_inst_rwlock);
 700                 if (tip->ti_binst != NULL)
 701                         bridge_trill_brunref(tip->ti_binst);
 702                 list_destroy(&tip->ti_socklist);
 703                 rw_destroy(&tip->ti_rwlock);
 704                 kmem_free(tip, sizeof (*tip));
 705         } else {
 706                 rw_exit(&tip->ti_rwlock);
 707                 rw_exit(&trill_inst_rwlock);
 708         }
 709 }
 710 
 711 /*
 712  * This is called when the bridge module receives a TRILL-encapsulated packet
 713  * on a given link or a packet identified as "TRILL control."  We must verify
 714  * that it's for us (it almost certainly will be), and then either decapsulate
 715  * (if it's to our nickname), forward (if it's to someone else), or send up one
 716  * of the sockets (if it's control traffic).
 717  *
 718  * Sadly, on Ethernet, the control traffic is identified by Outer.MacDA, and
 719  * not by TRILL header information.
 720  */
 721 static void
 722 trill_recv_pkt_cb(void *lptr, bridge_link_t *blp, mac_resource_handle_t rsrc,
 723     mblk_t *mp, mac_header_info_t *hdr_info)
 724 {
 725         trill_sock_t *tsock = lptr;
 726 
 727         _NOTE(ARGUNUSED(rsrc));
 728 
 729         ASSERT(tsock->ts_tip != NULL);
 730         ASSERT(tsock->ts_link != NULL);
 731         ASSERT(!(tsock->ts_flags & TSF_SHUTDOWN));
 732 
 733         /*
 734          * Only receive packet if the source address is not multicast (which is
 735          * bogus).
 736          */
 737         if (hdr_info->mhi_saddr[0] & 1)
 738                 goto discard;
 739 
 740         /*
 741          * Check if this is our own packet reflected back.  It should not be.
 742          */
 743         if (bcmp(hdr_info->mhi_saddr, blp->bl_local_mac, ETHERADDRL) == 0)
 744                 goto discard;
 745 
 746         /* Only receive unicast packet if addressed to us */
 747         if (hdr_info->mhi_dsttype == MAC_ADDRTYPE_UNICAST &&
 748             bcmp(hdr_info->mhi_daddr, blp->bl_local_mac, ETHERADDRL) != 0)
 749                 goto discard;
 750 
 751         if (hdr_info->mhi_bindsap == ETHERTYPE_TRILL) {
 752                 /* TRILL data packets */
 753                 trill_recv(tsock, mp, hdr_info->mhi_saddr);
 754         } else {
 755                 /* Design constraint for cheap IS-IS/BPDU comparison */
 756                 ASSERT(all_isis_rbridges[4] != bridge_group_address[4]);
 757                 /* Send received control packet upstream */
 758                 trill_ctrl_input(tsock, mp, hdr_info->mhi_saddr,
 759                     hdr_info->mhi_daddr[4] == all_isis_rbridges[4] ?
 760                     hdr_info->mhi_tci : TRILL_TCI_BPDU);
 761         }
 762 
 763         return;
 764 
 765 discard:
 766         freemsg(mp);
 767         KSPINCR(tks_drops);
 768 }
 769 
 770 /*
 771  * This is called when the bridge module discovers that the destination address
 772  * for a packet is not local -- it's through some remote node.  We must verify
 773  * that the remote node isn't our nickname (it shouldn't be), add a TRILL
 774  * header, and then use the IS-IS data to determine which link and which
 775  * next-hop RBridge should be used for output.  We then transmit on that link.
 776  *
 777  * The egress_nick is RBRIDGE_NICKNAME_NONE for the "unknown destination" case.
 778  */
 779 static void
 780 trill_encap_pkt_cb(void *lptr, bridge_link_t *blp, mac_header_info_t *hdr_info,
 781     mblk_t *mp, uint16_t egress_nick)
 782 {
 783         uint16_t ournick;
 784         uint16_t dtnick;
 785         trill_node_t *self = NULL;
 786         trill_sock_t *tsock = lptr;
 787         trill_inst_t *tip = tsock->ts_tip;
 788         int vlan = VLAN_ID_NONE;
 789 
 790         _NOTE(ARGUNUSED(blp));
 791         ASSERT(hdr_info->mhi_bindsap != ETHERTYPE_TRILL);
 792 
 793         /* egress_nick = RBRIDGE_NICKNAME_NONE is valid */
 794         if (egress_nick != RBRIDGE_NICKNAME_NONE && !VALID_NICK(egress_nick))
 795                 goto discard;
 796 
 797         /* Check if our own nick is valid before we do any forwarding */
 798         rw_enter(&tip->ti_rwlock, RW_READER);
 799         ournick = tip->ti_nick;
 800         dtnick = tip->ti_treeroot;
 801         rw_exit(&tip->ti_rwlock);
 802         if (!VALID_NICK(ournick))
 803                 goto discard;
 804 
 805         /*
 806          * For Multi-Destination forwarding determine our choice of
 807          * root distribution tree. If we didn't choose a distribution
 808          * tree (dtroots_count=0) then we use the highest priority tree
 809          * root (t_treeroot) else we drop the packet without forwarding.
 810          */
 811         if (egress_nick == RBRIDGE_NICKNAME_NONE) {
 812                 if ((self = trill_node_lookup(tip, ournick)) == NULL)
 813                         goto discard;
 814 
 815                 /*
 816                  * Use the first DT configured for now. In future we
 817                  * should have DT selection code here.
 818                  */
 819                 if (self->tn_ni->tni_dtrootcount > 0) {
 820                         dtnick = TNI_DTROOTNICK(self->tn_ni, 0);
 821                 }
 822 
 823                 trill_node_unref(tip, self);
 824                 if (!VALID_NICK(dtnick)) {
 825                         DTRACE_PROBE(trill__fwd__packet__nodtroot);
 826                         goto discard;
 827                 }
 828         }
 829 
 830         /*
 831          * Retrieve VLAN ID of the native frame used for VLAN
 832          * pruning of multi-destination frames.
 833          */
 834         if (hdr_info->mhi_istagged) {
 835                 vlan = VLAN_ID(hdr_info->mhi_tci);
 836         }
 837 
 838         DTRACE_PROBE2(trill__fwd__packet, mac_header_info_t *, hdr_info,
 839             uint16_t, egress_nick);
 840         if (egress_nick == RBRIDGE_NICKNAME_NONE) {
 841                 trill_multidest_fwd(tip, mp, dtnick,
 842                     ournick, B_FALSE, NULL, vlan, B_TRUE);
 843         } else {
 844                 trill_dest_fwd(tip, mp, egress_nick, B_FALSE, B_FALSE,
 845                     RBRIDGE_NICKNAME_NONE);
 846         }
 847         KSPINCR(tks_encap);
 848         return;
 849 
 850 discard:
 851         freemsg(mp);
 852 }
 853 
 854 /*
 855  * This is called when the bridge module has completely torn down a bridge
 856  * instance and all of the attached links.  We need to make the TRILL instance
 857  * go away at this point.
 858  */
 859 static void
 860 trill_br_dstr_cb(void *bptr, bridge_inst_t *bip)
 861 {
 862         trill_inst_t *tip = bptr;
 863 
 864         _NOTE(ARGUNUSED(bip));
 865         rw_enter(&tip->ti_rwlock, RW_WRITER);
 866         if (tip->ti_binst != NULL)
 867                 bridge_trill_brunref(tip->ti_binst);
 868         tip->ti_binst = NULL;
 869         rw_exit(&tip->ti_rwlock);
 870 }
 871 
 872 /*
 873  * This is called when the bridge module is tearing down a link, but before the
 874  * actual tear-down starts.  When this function returns, we must make sure that
 875  * we will not initiate any new transmits on this link.
 876  */
 877 static void
 878 trill_ln_dstr_cb(void *lptr, bridge_link_t *blp)
 879 {
 880         trill_sock_t *tsock = lptr;
 881 
 882         _NOTE(ARGUNUSED(blp));
 883         trill_stop_recv(tsock);
 884 }
 885 
 886 static void
 887 trill_init(void)
 888 {
 889         list_create(&trill_inst_list, sizeof (trill_inst_t),
 890             offsetof(trill_inst_t, ti_instnode));
 891         rw_init(&trill_inst_rwlock, NULL, RW_DRIVER, NULL);
 892         bridge_trill_register_cb(trill_recv_pkt_cb, trill_encap_pkt_cb,
 893             trill_br_dstr_cb, trill_ln_dstr_cb);
 894 }
 895 
 896 static void
 897 trill_fini(void)
 898 {
 899         bridge_trill_register_cb(NULL, NULL, NULL, NULL);
 900         rw_destroy(&trill_inst_rwlock);
 901         list_destroy(&trill_inst_list);
 902 }
 903 
 904 /* Loadable module configuration entry points */
 905 int
 906 _init(void)
 907 {
 908         int rc;
 909 
 910         trill_init();
 911         if ((rc = mod_install(&ml)) != 0)
 912                 trill_fini();
 913         return (rc);
 914 }
 915 
 916 int
 917 _info(struct modinfo *modinfop)
 918 {
 919         return (mod_info(&ml, modinfop));
 920 }
 921 
 922 int
 923 _fini(void)
 924 {
 925         int rc;
 926 
 927         rw_enter(&trill_inst_rwlock, RW_READER);
 928         rc = list_is_empty(&trill_inst_list) ? 0 : EBUSY;
 929         rw_exit(&trill_inst_rwlock);
 930         if (rc == 0 && ((rc = mod_remove(&ml)) == 0))
 931                 trill_fini();
 932         return (rc);
 933 }
 934 
 935 static void
 936 trill_kstats_init(trill_sock_t *tsock, const char *bname)
 937 {
 938         int i;
 939         char kstatname[KSTAT_STRLEN];
 940         kstat_named_t  *knt;
 941         static const char *sock_kstats_list[] = { TRILL_KSSOCK_NAMES };
 942         char link_name[MAXNAMELEN];
 943         int num;
 944         int err;
 945 
 946         bzero(link_name, sizeof (link_name));
 947         if ((err = dls_mgmt_get_linkinfo(tsock->ts_link->bl_linkid, link_name,
 948             NULL, NULL, NULL)) != 0) {
 949                 cmn_err(CE_WARN, "%s: trill_kstats_init: error %d retrieving"
 950                     " linkinfo for linkid:%d", "trill", err,
 951                     tsock->ts_link->bl_linkid);
 952                 return;
 953         }
 954 
 955         bzero(kstatname, sizeof (kstatname));
 956         (void) snprintf(kstatname, sizeof (kstatname), "%s-%s",
 957             bname, link_name);
 958 
 959         num = sizeof (sock_kstats_list) / sizeof (*sock_kstats_list);
 960         for (i = 0; i < num; i++) {
 961                 knt = (kstat_named_t *)&(tsock->ts_kstats);
 962                 kstat_named_init(&knt[i], sock_kstats_list[i],
 963                     KSTAT_DATA_UINT64);
 964         }
 965 
 966         tsock->ts_ksp = kstat_create_zone("trill", 0, kstatname, "sock",
 967             KSTAT_TYPE_NAMED, num, KSTAT_FLAG_VIRTUAL, GLOBAL_ZONEID);
 968         if (tsock->ts_ksp != NULL) {
 969                 tsock->ts_ksp->ks_data = &tsock->ts_kstats;
 970                 kstat_install(tsock->ts_ksp);
 971         }
 972 }
 973 
 974 static trill_sock_t *
 975 trill_do_open(int flags)
 976 {
 977         trill_sock_t *tsock;
 978         int kmflag = ((flags & SOCKET_NOSLEEP)) ? KM_NOSLEEP:KM_SLEEP;
 979 
 980         tsock = kmem_zalloc(sizeof (trill_sock_t), kmflag);
 981         if (tsock != NULL) {
 982                 tsock->ts_state = TS_UNBND;
 983                 tsock->ts_refs++;
 984                 mutex_init(&tsock->ts_socklock, NULL, MUTEX_DRIVER, NULL);
 985                 cv_init(&tsock->ts_sockthreadwait, NULL, CV_DRIVER, NULL);
 986                 cv_init(&tsock->ts_sockclosewait, NULL, CV_DRIVER, NULL);
 987         }
 988         return (tsock);
 989 }
 990 
 991 static int
 992 trill_find_bridge(trill_sock_t *tsock, const char *bname, boolean_t can_create)
 993 {
 994         trill_inst_t *tip, *newtip = NULL;
 995 
 996         /* Allocate some memory (speculatively) before taking locks */
 997         if (can_create)
 998                 newtip = kmem_zalloc(sizeof (*tip), KM_NOSLEEP);
 999 
1000         rw_enter(&trill_inst_rwlock, RW_WRITER);
1001         for (tip = list_head(&trill_inst_list); tip != NULL;
1002             tip = list_next(&trill_inst_list, tip)) {
1003                 if (strcmp(tip->ti_bridgename, bname) == 0)
1004                         break;
1005         }
1006         if (tip == NULL) {
1007                 if (!can_create || newtip == NULL) {
1008                         rw_exit(&trill_inst_rwlock);
1009                         return (can_create ? ENOMEM : ENOENT);
1010                 }
1011 
1012                 tip = newtip;
1013                 newtip = NULL;
1014                 (void) strcpy(tip->ti_bridgename, bname);
1015 
1016                 /* Register TRILL instance with bridging */
1017                 tip->ti_binst = bridge_trill_brref(bname, tip);
1018                 if (tip->ti_binst == NULL) {
1019                         rw_exit(&trill_inst_rwlock);
1020                         kmem_free(tip, sizeof (*tip));
1021                         return (ENOENT);
1022                 }
1023 
1024                 rw_init(&tip->ti_rwlock, NULL, RW_DRIVER, NULL);
1025                 list_create(&tip->ti_socklist, sizeof (trill_sock_t),
1026                     offsetof(trill_sock_t, ts_socklistnode));
1027                 list_insert_tail(&trill_inst_list, tip);
1028         }
1029         atomic_inc_uint(&tip->ti_refs);
1030         rw_exit(&trill_inst_rwlock);
1031 
1032         /* If we didn't need the preallocated memory, then discard now. */
1033         if (newtip != NULL)
1034                 kmem_free(newtip, sizeof (*newtip));
1035 
1036         rw_enter(&tip->ti_rwlock, RW_WRITER);
1037         list_insert_tail(&(tip->ti_socklist), tsock);
1038         tsock->ts_tip = tip;
1039         rw_exit(&tip->ti_rwlock);
1040         return (0);
1041 }
1042 
1043 static void
1044 trill_clear_bridge(trill_sock_t *tsock)
1045 {
1046         trill_inst_t *tip;
1047 
1048         if ((tip = tsock->ts_tip) == NULL)
1049                 return;
1050         rw_enter(&tip->ti_rwlock, RW_WRITER);
1051         list_remove(&tip->ti_socklist, tsock);
1052         if (list_is_empty(&tip->ti_socklist))
1053                 trill_del_all(tip, B_TRUE);
1054         rw_exit(&tip->ti_rwlock);
1055 }
1056 
1057 static void
1058 trill_sock_unref(trill_sock_t *tsock)
1059 {
1060         if (atomic_dec_uint_nv(&tsock->ts_refs) == 0) {
1061                 mutex_destroy(&tsock->ts_socklock);
1062                 cv_destroy(&tsock->ts_sockthreadwait);
1063                 cv_destroy(&tsock->ts_sockclosewait);
1064                 kmem_free(tsock, sizeof (trill_sock_t));
1065         }
1066 }
1067 
1068 static void
1069 trill_do_close(trill_sock_t *tsock)
1070 {
1071         trill_inst_t *tip;
1072 
1073         tip = tsock->ts_tip;
1074         trill_stop_recv(tsock);
1075         /* Remove socket from TRILL instance socket list */
1076         trill_clear_bridge(tsock);
1077         tsock->ts_flags |= TSF_SHUTDOWN;
1078         trill_sock_unref(tsock);
1079         if (tip != NULL)
1080                 trill_instance_unref(tip);
1081 }
1082 
1083 static void
1084 trill_del_all(trill_inst_t *tip, boolean_t lockheld)
1085 {
1086         int i;
1087 
1088         if (!lockheld)
1089                 rw_enter(&tip->ti_rwlock, RW_WRITER);
1090         for (i = RBRIDGE_NICKNAME_MIN; i < RBRIDGE_NICKNAME_MAX; i++) {
1091                 if (tip->ti_nodes[i] != NULL)
1092                         (void) trill_del_nick(tip, i, B_TRUE);
1093         }
1094         if (!lockheld)
1095                 rw_exit(&tip->ti_rwlock);
1096 }
1097 
1098 static void
1099 trill_node_free(trill_node_t *nick_entry)
1100 {
1101         trill_nickinfo_t *tni;
1102 
1103         tni = nick_entry->tn_ni;
1104         kmem_free(tni, TNI_TOTALSIZE(tni));
1105         kmem_free(nick_entry, sizeof (trill_node_t));
1106 }
1107 
1108 static void
1109 trill_node_unref(trill_inst_t *tip, trill_node_t *tnp)
1110 {
1111         if (atomic_dec_uint_nv(&tnp->tn_refs) == 0) {
1112                 if (tnp->tn_tsp != NULL)
1113                         trill_sock_unref(tnp->tn_tsp);
1114                 trill_node_free(tnp);
1115                 atomic_dec_uint(&tip->ti_nodecount);
1116         }
1117 }
1118 
1119 static trill_node_t *
1120 trill_node_lookup(trill_inst_t *tip, uint16_t nick)
1121 {
1122         trill_node_t *nick_entry;
1123 
1124         if (!VALID_NICK(nick))
1125                 return (NULL);
1126         rw_enter(&tip->ti_rwlock, RW_READER);
1127         nick_entry = tip->ti_nodes[nick];
1128         if (nick_entry != NULL) {
1129                 atomic_inc_uint(&nick_entry->tn_refs);
1130         }
1131         rw_exit(&tip->ti_rwlock);
1132         return (nick_entry);
1133 }
1134 
1135 static int
1136 trill_del_nick(trill_inst_t *tip, uint16_t nick, boolean_t lockheld)
1137 {
1138         trill_node_t *nick_entry;
1139         int rc = ENOENT;
1140 
1141         if (!lockheld)
1142                 rw_enter(&tip->ti_rwlock, RW_WRITER);
1143         if (VALID_NICK(nick)) {
1144                 nick_entry = tip->ti_nodes[nick];
1145                 if (nick_entry != NULL) {
1146                         trill_node_unref(tip, nick_entry);
1147                         tip->ti_nodes[nick] = NULL;
1148                         rc = 0;
1149                 }
1150         }
1151         if (!lockheld)
1152                 rw_exit(&tip->ti_rwlock);
1153         return (rc);
1154 }
1155 
1156 static int
1157 trill_add_nick(trill_inst_t *tip, void *arg, boolean_t self, int mode)
1158 {
1159         uint16_t nick;
1160         int size;
1161         trill_node_t *tnode;
1162         trill_nickinfo_t tnihdr;
1163 
1164         /* First make sure we have at least the header available */
1165         if (ddi_copyin(arg, &tnihdr, sizeof (trill_nickinfo_t), mode) != 0)
1166                 return (EFAULT);
1167 
1168         nick = tnihdr.tni_nick;
1169         if (!VALID_NICK(nick)) {
1170                 DTRACE_PROBE1(trill__add__nick__bad, trill_nickinfo_t *,
1171                     &tnihdr);
1172                 return (EINVAL);
1173         }
1174 
1175         size = TNI_TOTALSIZE(&tnihdr);
1176         if (size > TNI_MAXSIZE)
1177                 return (EINVAL);
1178         tnode = kmem_zalloc(sizeof (trill_node_t), KM_SLEEP);
1179         tnode->tn_ni = kmem_zalloc(size, KM_SLEEP);
1180         if (ddi_copyin(arg, tnode->tn_ni, size, mode) != 0) {
1181                 kmem_free(tnode->tn_ni, size);
1182                 kmem_free(tnode, sizeof (trill_node_t));
1183                 return (EFAULT);
1184         }
1185 
1186         tnode->tn_refs++;
1187         rw_enter(&tip->ti_rwlock, RW_WRITER);
1188         if (tip->ti_nodes[nick] != NULL)
1189                 (void) trill_del_nick(tip, nick, B_TRUE);
1190 
1191         if (self) {
1192                 tip->ti_nick = nick;
1193         } else {
1194                 tnode->tn_tsp = find_trill_link(tip,
1195                     tnode->tn_ni->tni_linkid);
1196         }
1197         DTRACE_PROBE2(trill__add__nick, trill_node_t *, tnode,
1198             uint16_t, nick);
1199         tip->ti_nodes[nick] = tnode;
1200         tip->ti_nodecount++;
1201         rw_exit(&tip->ti_rwlock);
1202         return (0);
1203 }
1204 
/*
 * Worker for the TRILL socket ioctls.  trill_ioctl() calls this with
 * tsock->ts_socklock held, after the privilege check.
 *
 * tsock  socket the ioctl was issued on
 * cmd    TRILL_* ioctl command
 * arg    command argument; a user address for most commands, but the value
 *        itself (a VLAN) for TRILL_PORTFLUSH and TRILL_NICKFLUSH
 * mode   ioctl mode flags passed through to ddi_copyin()/ddi_copyout()
 *
 * Returns 0 on success or an errno value; unknown commands yield ENOTSUP.
 * Note that several commands quietly return 0 without doing anything when
 * the socket has no TRILL instance (or no link) yet, while others return
 * EINVAL in the same situation.
 */
static int
trill_do_ioctl(trill_sock_t *tsock, int cmd, void *arg, int mode)
{
        int error = 0;
        /* Snapshot of the instance pointer; NULL until a bridge is attached */
        trill_inst_t *tip = tsock->ts_tip;

        switch (cmd) {
        /* Record the designated VLAN supplied by the caller on the socket */
        case TRILL_DESIGVLAN: {
                uint16_t desigvlan;

                if (ddi_copyin(arg, &desigvlan, sizeof (desigvlan), mode) != 0)
                        return (EFAULT);
                tsock->ts_desigvlan = desigvlan;
                break;
        }
        /* Hand a caller-supplied VLAN array to the bridge for this link */
        case TRILL_VLANFWDER: {
                uint8_t vlans[TRILL_VLANS_ARRSIZE];

                if (tsock->ts_link == NULL)
                        return (EINVAL);
                if ((ddi_copyin(arg, vlans, sizeof (vlans), mode)) != 0)
                        return (EFAULT);
                bridge_trill_setvlans(tsock->ts_link, vlans);
                break;
        }
        /* Install the entry for our own nickname (self == B_TRUE) */
        case TRILL_SETNICK:
                if (tip == NULL)
                        return (EINVAL);
                error = trill_add_nick(tip, arg, B_TRUE, mode);
                break;

        /* Copy our own nickname out to the caller */
        case TRILL_GETNICK:
                if (tip == NULL)
                        return (EINVAL);
                rw_enter(&tip->ti_rwlock, RW_READER);
                if (ddi_copyout(&tip->ti_nick, arg, sizeof (tip->ti_nick),
                    mode) != 0)
                        error = EFAULT;
                rw_exit(&tip->ti_rwlock);
                break;

        /* Install the entry for another RBridge's nickname */
        case TRILL_ADDNICK:
                if (tip == NULL)
                        break;
                error = trill_add_nick(tip, arg, B_FALSE, mode);
                break;

        /* Delete the entry for a single nickname */
        case TRILL_DELNICK: {
                uint16_t delnick;

                if (tip == NULL)
                        break;
                if (ddi_copyin(arg, &delnick, sizeof (delnick), mode) != 0)
                        return (EFAULT);
                error = trill_del_nick(tip, delnick, B_FALSE);
                break;
        }
        /* Delete every nickname entry of the instance */
        case TRILL_DELALL:
                if (tip == NULL)
                        break;
                trill_del_all(tip, B_FALSE);
                break;

        /* Set the instance-wide distribution tree root nickname */
        case TRILL_TREEROOT: {
                uint16_t treeroot;

                if (tip == NULL)
                        break;
                if (ddi_copyin(arg, &treeroot, sizeof (treeroot), mode) != 0)
                        return (EFAULT);
                if (!VALID_NICK(treeroot))
                        return (EINVAL);
                rw_enter(&tip->ti_rwlock, RW_WRITER);
                tip->ti_treeroot = treeroot;
                rw_exit(&tip->ti_rwlock);
                break;
        }
        /* Copy out the local MAC address of the socket's link */
        case TRILL_HWADDR:
                if (tsock->ts_link == NULL)
                        break;
                if (ddi_copyout(tsock->ts_link->bl_local_mac, arg, ETHERADDRL,
                    mode) != 0)
                        return (EFAULT);
                break;

        /*
         * Attach this socket to the TRILL instance of the named bridge,
         * creating the instance if it does not exist yet.  Only permitted
         * on an unbound socket.
         */
        case TRILL_NEWBRIDGE: {
                char bname[MAXLINKNAMELEN];

                if (tsock->ts_state != TS_UNBND)
                        return (ENOTSUP);
                /* ts_tip can only be set once */
                if (tip != NULL)
                        return (EEXIST);
                if (ddi_copyin(arg, bname, sizeof (bname), mode) != 0)
                        return (EFAULT);
                /* Never trust the caller to have terminated the name */
                bname[MAXLINKNAMELEN-1] = '\0';
                error = trill_find_bridge(tsock, bname, B_TRUE);
                break;
        }

        /*
         * Attach this socket to an already existing TRILL instance of the
         * named bridge; fails if there is none (can_create is B_FALSE).
         */
        case TRILL_GETBRIDGE: {
                char bname[MAXLINKNAMELEN];

                /* ts_tip can only be set once */
                if (tip != NULL)
                        return (EEXIST);
                if (ddi_copyin(arg, bname, sizeof (bname), mode) != 0)
                        return (EFAULT);
                /* Never trust the caller to have terminated the name */
                bname[MAXLINKNAMELEN - 1] = '\0';
                error = trill_find_bridge(tsock, bname, B_FALSE);
                break;
        }

        /*
         * Iterate over the nickname table: report the first populated entry
         * with a nickname greater than the one passed in.  When there is no
         * further entry, an all-zero record is returned.
         */
        case TRILL_LISTNICK: {
                trill_listnick_t tln;
                trill_node_t *tnp;
                trill_nickinfo_t *tnip;
                uint16_t nick;

                if (tip == NULL)
                        return (EINVAL);
                if (ddi_copyin(arg, &tln, sizeof (tln), mode) != 0)
                        return (EFAULT);
                nick = tln.tln_nick;
                if (nick >= RBRIDGE_NICKNAME_MAX) {
                        error = EINVAL;
                        break;
                }
                rw_enter(&tip->ti_rwlock, RW_READER);
                /* Pre-increment: the search starts just past the given nick */
                while (++nick < RBRIDGE_NICKNAME_MAX) {
                        if ((tnp = tip->ti_nodes[nick]) != NULL) {
                                tnip = tnp->tn_ni;
                                ASSERT(nick == tnip->tni_nick);
                                tln.tln_nick = nick;
                                bcopy(tnip->tni_adjsnpa, tln.tln_nexthop,
                                    ETHERADDRL);
                                tln.tln_ours = nick == tip->ti_nick;
                                /* No link to report for us or unlinked nodes */
                                if (tln.tln_ours || tnp->tn_tsp == NULL) {
                                        tln.tln_linkid =
                                            DATALINK_INVALID_LINKID;
                                } else {
                                        tln.tln_linkid =
                                            tnp->tn_tsp->ts_link->bl_linkid;
                                }
                                break;
                        }
                }
                rw_exit(&tip->ti_rwlock);
                /* Ran off the end of the table: signal "no more entries" */
                if (nick >= RBRIDGE_NICKNAME_MAX)
                        bzero(&tln, sizeof (tln));
                if (ddi_copyout(&tln, arg, sizeof (tln), mode) != 0)
                        return (EFAULT);
                break;
        }

        /*
         * Port flush: this is used when we lose AF on a port.  We must discard
         * all regular bridge forwarding entries on this port with the
         * indicated VLAN.
         */
        case TRILL_PORTFLUSH: {
                /* arg carries the VLAN value directly, not a user pointer */
                uint16_t vlan = (uint16_t)(uintptr_t)arg;

                if (tsock->ts_link == NULL)
                        return (EINVAL);
                bridge_trill_flush(tsock->ts_link, vlan, B_FALSE);
                break;
        }

        /*
         * Nick flush: this is used when we lose AF on a port.  We must discard
         * all bridge TRILL forwarding entries on this port with the indicated
         * VLAN.
         */
        case TRILL_NICKFLUSH: {
                /* arg carries the VLAN value directly, not a user pointer */
                uint16_t vlan = (uint16_t)(uintptr_t)arg;

                if (tsock->ts_link == NULL)
                        return (EINVAL);
                bridge_trill_flush(tsock->ts_link, vlan, B_TRUE);
                break;
        }

        /* Copy out the link's maximum SDU (sizeof (uint_t) bytes) */
        case TRILL_GETMTU:
                if (tsock->ts_link == NULL)
                        break;
                if (ddi_copyout(&tsock->ts_link->bl_maxsdu, arg,
                    sizeof (uint_t), mode) != 0)
                        return (EFAULT);
                break;

        default:
                error = ENOTSUP;
                break;
        }

        return (error);
}
1403 
1404 /*
1405  * Sends received packet back upstream on the TRILL socket.
1406  * Consumes passed mblk_t.
1407  */
1408 static void
1409 trill_ctrl_input(trill_sock_t *tsock, mblk_t *mp, const uint8_t *saddr,
1410     uint16_t tci)
1411 {
1412         int udi_size;
1413         mblk_t *mp1;
1414         struct T_unitdata_ind *tudi;
1415         struct sockaddr_dl *sdl;
1416         char *lladdr;
1417         int error;
1418 
1419         ASSERT(!(tsock->ts_flags & TSF_SHUTDOWN));
1420         if (tsock->ts_flow_ctrld) {
1421                 freemsg(mp);
1422                 KSPINCR(tks_drops);
1423                 return;
1424         }
1425 
1426         udi_size =  sizeof (struct T_unitdata_ind) +
1427             sizeof (struct sockaddr_dl);
1428         mp1 = allocb(udi_size, BPRI_MED);
1429         if (mp1 == NULL) {
1430                 freemsg(mp);
1431                 KSPINCR(tks_drops);
1432                 return;
1433         }
1434 
1435         mp1->b_cont = mp;
1436         mp = mp1;
1437         mp->b_datap->db_type = M_PROTO;
1438         /* LINTED: alignment */
1439         tudi = (struct T_unitdata_ind *)mp->b_rptr;
1440         mp->b_wptr = (uchar_t *)tudi + udi_size;
1441 
1442         tudi->PRIM_type = T_UNITDATA_IND;
1443         tudi->SRC_length = sizeof (struct sockaddr_dl);
1444         tudi->SRC_offset = sizeof (struct T_unitdata_ind);
1445         tudi->OPT_length = 0;
1446         tudi->OPT_offset = sizeof (struct T_unitdata_ind) +
1447             sizeof (struct sockaddr_dl);
1448 
1449         /* Information of the link on which packet was received. */
1450         sdl = (struct sockaddr_dl *)&tudi[1];
1451         (void) memset(sdl, 0, sizeof (struct sockaddr_dl));
1452         sdl->sdl_family = AF_TRILL;
1453 
1454         /* LINTED: alignment */
1455         *(datalink_id_t *)sdl->sdl_data = tsock->ts_link->bl_linkid;
1456         sdl->sdl_nlen = sizeof (tsock->ts_link->bl_linkid);
1457 
1458         lladdr = LLADDR(sdl);
1459         (void) memcpy(lladdr, saddr, ETHERADDRL);
1460         lladdr += ETHERADDRL;
1461         sdl->sdl_alen = ETHERADDRL;
1462 
1463         /* LINTED: alignment */
1464         *(uint16_t *)lladdr = tci;
1465         sdl->sdl_slen = sizeof (uint16_t);
1466 
1467         DTRACE_PROBE2(trill__ctrl__input, trill_sock_t *, tsock, mblk_t *, mp);
1468         (*tsock->ts_conn_upcalls->su_recv)(tsock->ts_conn_upper_handle,
1469             mp, msgdsize(mp), 0, &error, NULL);
1470 
1471         if (error == ENOSPC) {
1472                 mutex_enter(&tsock->ts_socklock);
1473                 (*tsock->ts_conn_upcalls->su_recv)(tsock->ts_conn_upper_handle,
1474                     NULL, 0, 0, &error, NULL);
1475                 if (error == ENOSPC)
1476                         tsock->ts_flow_ctrld = B_TRUE;
1477                 mutex_exit(&tsock->ts_socklock);
1478                 KSPINCR(tks_drops);
1479         } else if (error != 0) {
1480                 KSPINCR(tks_drops);
1481         } else {
1482                 KSPINCR(tks_recv);
1483         }
1484 
1485         DTRACE_PROBE2(trill__ctrl__input__done, trill_sock_t *,
1486             tsock, int, error);
1487 }
1488 
1489 /* ARGSUSED */
1490 static void
1491 trill_activate(sock_lower_handle_t proto_handle,
1492     sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls,
1493     int flags, cred_t *cr)
1494 {
1495         trill_sock_t *tsock = (trill_sock_t *)proto_handle;
1496         struct sock_proto_props sopp;
1497 
1498         tsock->ts_conn_upcalls = sock_upcalls;
1499         tsock->ts_conn_upper_handle = sock_handle;
1500 
1501         sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT |
1502             SOCKOPT_RCVLOWAT | SOCKOPT_MAXADDRLEN | SOCKOPT_MAXPSZ |
1503             SOCKOPT_MAXBLK | SOCKOPT_MINPSZ;
1504         sopp.sopp_wroff = 0;
1505         sopp.sopp_rxhiwat = SOCKET_RECVHIWATER;
1506         sopp.sopp_rxlowat = SOCKET_RECVLOWATER;
1507         sopp.sopp_maxaddrlen = sizeof (struct sockaddr_dl);
1508         sopp.sopp_maxpsz = INFPSZ;
1509         sopp.sopp_maxblk = INFPSZ;
1510         sopp.sopp_minpsz = 0;
1511         (*tsock->ts_conn_upcalls->su_set_proto_props)(
1512             tsock->ts_conn_upper_handle, &sopp);
1513 }
1514 
1515 /* ARGSUSED */
1516 static int
1517 trill_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
1518 {
1519         trill_sock_t *tsock = (trill_sock_t *)proto_handle;
1520 
1521         trill_do_close(tsock);
1522         return (0);
1523 }
1524 
1525 /* ARGSUSED */
1526 static int
1527 trill_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
1528     socklen_t len, cred_t *cr)
1529 {
1530         int error;
1531         trill_sock_t *tsock = (trill_sock_t *)proto_handle;
1532 
1533         if (sa == NULL)
1534                 error = trill_do_unbind(tsock);
1535         else
1536                 error = trill_start_recv(tsock, sa, len);
1537 
1538         return (error);
1539 }
1540 
1541 /* ARGSUSED */
1542 static int
1543 trill_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
1544     cred_t *cr)
1545 {
1546         trill_sock_t *tsock = (trill_sock_t *)proto_handle;
1547         struct sockaddr_dl *laddr;
1548         uint16_t tci;
1549 
1550         ASSERT(DB_TYPE(mp) == M_DATA);
1551         ASSERT(!(tsock->ts_flags & TSF_SHUTDOWN));
1552 
1553         if (msg->msg_name == NULL || msg->msg_namelen != sizeof (*laddr))
1554                 goto eproto;
1555 
1556         /*
1557          * The name is a datalink_id_t, the address is an Ethernet address, and
1558          * the selector value is the VLAN ID.
1559          */
1560         laddr = (struct sockaddr_dl *)msg->msg_name;
1561         if (laddr->sdl_nlen != sizeof (datalink_id_t) ||
1562             laddr->sdl_alen != ETHERADDRL ||
1563             (laddr->sdl_slen != sizeof (tci) && laddr->sdl_slen != 0))
1564                 goto eproto;
1565 
1566         mutex_enter(&tsock->ts_socklock);
1567         if (tsock->ts_state != TS_IDLE || tsock->ts_link == NULL) {
1568                 mutex_exit(&tsock->ts_socklock);
1569                 goto eproto;
1570         }
1571         atomic_inc_uint(&tsock->ts_sockthreadcount);
1572         mutex_exit(&tsock->ts_socklock);
1573 
1574         /*
1575          * Safe to dereference VLAN now, as we've checked the user's specified
1576          * values, and alignment is now guaranteed.
1577          */
1578         if (laddr->sdl_slen == 0) {
1579                 tci = TRILL_NO_TCI;
1580         } else {
1581                 /* LINTED: alignment */
1582                 tci = *(uint16_t *)(LLADDR(laddr) + ETHERADDRL);
1583         }
1584 
1585         mp = create_trill_header(tsock, mp, (const uchar_t *)LLADDR(laddr),
1586             B_TRUE, B_FALSE, tci, msgdsize(mp));
1587         if (mp != NULL) {
1588                 mp = bridge_trill_output(tsock->ts_link, mp);
1589                 if (mp == NULL) {
1590                         KSPINCR(tks_sent);
1591                 } else {
1592                         freemsg(mp);
1593                         KSPINCR(tks_drops);
1594                 }
1595         }
1596 
1597         /* Wake up any threads blocking on us */
1598         if (atomic_dec_uint_nv(&tsock->ts_sockthreadcount) == 0)
1599                 cv_broadcast(&tsock->ts_sockthreadwait);
1600         return (0);
1601 
1602 eproto:
1603         freemsg(mp);
1604         KSPINCR(tks_drops);
1605         return (EPROTO);
1606 }
1607 
1608 /* ARGSUSED */
1609 static int
1610 trill_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
1611     int mode, int32_t *rvalp, cred_t *cr)
1612 {
1613         trill_sock_t *tsock = (trill_sock_t *)proto_handle;
1614         int rc;
1615 
1616         switch (cmd) {
1617         /* List of unprivileged TRILL ioctls */
1618         case TRILL_GETNICK:
1619         case TRILL_GETBRIDGE:
1620         case TRILL_LISTNICK:
1621                 break;
1622         default:
1623                 if (secpolicy_dl_config(cr) != 0)
1624                         return (EPERM);
1625                 break;
1626         }
1627 
1628         /* Lock ensures socket state is unchanged during ioctl handling */
1629         mutex_enter(&tsock->ts_socklock);
1630         rc = trill_do_ioctl(tsock, cmd, (void *)arg, mode);
1631         mutex_exit(&tsock->ts_socklock);
1632         return (rc);
1633 }
1634 
1635 static void
1636 trill_clr_flowctrl(sock_lower_handle_t proto_handle)
1637 {
1638         trill_sock_t *tsock = (trill_sock_t *)proto_handle;
1639 
1640         mutex_enter(&tsock->ts_socklock);
1641         tsock->ts_flow_ctrld = B_FALSE;
1642         mutex_exit(&tsock->ts_socklock);
1643 }
1644 
/*
 * Downcall table handed to the socket layer by trill_create().  Only
 * activate, bind, send, flow-control clearing, ioctl and close are
 * implemented by this module; the remaining operations use the generic
 * "not supported" stubs or are left NULL.  The initializer is positional,
 * so the order of entries must match the member order of sock_downcalls_t.
 */
static sock_downcalls_t sock_trill_downcalls = {
        trill_activate,                 /* sd_activate */
        sock_accept_notsupp,            /* sd_accept */
        trill_bind,                     /* sd_bind */
        sock_listen_notsupp,            /* sd_listen */
        sock_connect_notsupp,           /* sd_connect */
        sock_getpeername_notsupp,       /* sd_getpeername */
        sock_getsockname_notsupp,       /* sd_getsockname */
        sock_getsockopt_notsupp,        /* sd_getsockopt */
        sock_setsockopt_notsupp,        /* sd_setsockopt */
        trill_send,                     /* sd_send */
        NULL,                           /* sd_send_uio */
        NULL,                           /* sd_recv_uio */
        NULL,                           /* sd_poll */
        sock_shutdown_notsupp,          /* sd_shutdown */
        /* NOTE(review): slot name taken from sock_downcalls_t - confirm */
        trill_clr_flowctrl,             /* sd_clr_flowctrl */
        trill_ioctl,                    /* sd_ioctl */
        trill_close                     /* sd_close */
};
1664 
1665 /* ARGSUSED */
1666 static sock_lower_handle_t
1667 trill_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
1668     uint_t *smodep, int *errorp, int flags, cred_t *credp)
1669 {
1670         trill_sock_t *tsock;
1671 
1672         if (family != AF_TRILL || type != SOCK_DGRAM || proto != 0) {
1673                 *errorp = EPROTONOSUPPORT;
1674                 return (NULL);
1675         }
1676 
1677         *sock_downcalls = &sock_trill_downcalls;
1678         *smodep = SM_ATOMIC;
1679         tsock = trill_do_open(flags);
1680         *errorp = (tsock != NULL) ? 0:ENOMEM;
1681         return ((sock_lower_handle_t)tsock);
1682 }