1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 #include <sys/types.h>
  27 #include <sys/conf.h>
  28 #include <sys/modctl.h>
  29 #include <sys/stat.h>
  30 #include <sys/stream.h>
  31 #include <sys/strsun.h>
  32 #include <sys/ddi.h>
  33 #include <sys/sunddi.h>
  34 #include <sys/priv_names.h>
  35 #include <inet/common.h>
  36 
  37 #define _SUN_TPI_VERSION 2
  38 #include <sys/tihdr.h>
  39 #include <sys/timod.h>
  40 #include <sys/tiuser.h>
  41 #include <sys/suntpi.h>
  42 #include <inet/common.h>
  43 #include <inet/ip.h>
  44 #include <inet/mi.h>
  45 #include <inet/proto_set.h>
  46 #include <sys/ib/clients/rds/rds.h>
  47 #include <sys/policy.h>
  48 #include <inet/ipclassifier.h>
  49 #include <sys/ib/clients/rds/rds_kstat.h>
  50 #include "sys/random.h"
  51 #include <sys/ib/clients/rds/rds_transport.h>
  52 #include <sys/ib/ibtl/ibti.h>
  53 
  54 
  55 #define RDS_NAME        "rds"
  56 #define RDS_STRTAB      rdsinfo
  57 #define RDS_DEVDESC     "RDS STREAMS driver"
  58 #define RDS_DEVMINOR    0
  59 #define RDS_DEVMTFLAGS D_MP | D_SYNCSTR
  60 #define RDS_DEFAULT_PRIV_MODE   0666
  61 
  62 #define rds_smallest_port       1
  63 #define rds_largest_port        65535
  64 
  65 #define RDS_RECV_HIWATER        (56 * 1024)
  66 #define RDS_RECV_LOWATER        128
  67 #define RDS_XMIT_HIWATER        (56 * 1024)
  68 #define RDS_XMIT_LOWATER        1024
  69 
  70 #define RDS_DPRINTF2    0 &&
  71 #define LABEL   "RDS"
  72 
  73 typedef struct rdsahdr_s {
  74         in_port_t       uha_src_port;   /* Source port */
  75         in_port_t       uha_dst_port;   /* Destination port */
  76 } rdsha_t;
  77 
  78 #define RDSH_SIZE       4
  79 
  80 int rds_recv_hiwat = RDS_RECV_HIWATER;
  81 int rds_recv_lowat = RDS_RECV_LOWATER;
  82 int rds_xmit_hiwat = RDS_XMIT_HIWATER;
  83 int rds_xmit_lowat = RDS_XMIT_LOWATER;
  84 
  85 int rdsdebug;
  86 
  87 static dev_info_t *rds_dev_info;
  88 
  89 /* Hint not protected by any lock */
  90 static  in_port_t       rds_next_port_to_try;
  91 
  92 ldi_ident_t rds_li;
  93 static int loopmax = rds_largest_port - rds_smallest_port + 1;
  94 
  95 /* global configuration variables */
  96 uint_t  UserBufferSize;
  97 uint_t  rds_rx_pkts_pending_hwm;
  98 
  99 extern void rds_ioctl(queue_t *, mblk_t *);
 100 extern void rds_ioctl_copyin_done(queue_t *q, mblk_t *mp);
 101 
 102 int rds_open_transport_driver();
 103 int rds_close_transport_driver();
 104 
 105 #define RDS_CURRENT_PORT_QUOTA()                                        \
 106         (rds_rx_pkts_pending_hwm/RDS_GET_NPORT())
 107 
 108 krwlock_t       rds_transport_lock;
 109 ldi_handle_t    rds_transport_handle = NULL;
 110 rds_transport_ops_t *rds_transport_ops = NULL;
 111 
 112 static int
 113 rds_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
 114 {
 115         int     ret;
 116 
 117         if (cmd != DDI_ATTACH)
 118                 return (DDI_FAILURE);
 119 
 120         rds_dev_info = devi;
 121 
 122         ret = ddi_create_minor_node(devi, RDS_NAME, S_IFCHR,
 123             RDS_DEVMINOR, DDI_PSEUDO, 0);
 124         if (ret != DDI_SUCCESS) {
 125                 return (ret);
 126         }
 127 
 128         return (DDI_SUCCESS);
 129 }
 130 
 131 static int
 132 rds_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
 133 {
 134         if (cmd != DDI_DETACH)
 135                 return (DDI_FAILURE);
 136 
 137         ASSERT(devi == rds_dev_info);
 138 
 139         ddi_remove_minor_node(devi, NULL);
 140 
 141         return (DDI_SUCCESS);
 142 }
 143 
 144 /* ARGSUSED */
 145 static int
 146 rds_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
 147 {
 148         int error = DDI_FAILURE;
 149 
 150         switch (cmd) {
 151         case DDI_INFO_DEVT2DEVINFO:
 152                 if (rds_dev_info != NULL) {
 153                         *result = (void *)rds_dev_info;
 154                         error = DDI_SUCCESS;
 155                 }
 156                 break;
 157 
 158         case DDI_INFO_DEVT2INSTANCE:
 159                 *result = NULL;
 160                 error = DDI_SUCCESS;
 161                 break;
 162 
 163         default:
 164                 break;
 165         }
 166 
 167         return (error);
 168 }
 169 
 170 
 171 /*ARGSUSED*/
 172 static int
 173 rds_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
 174 {
 175         rds_t   *rds;
 176         int     ret;
 177 
 178         if (is_system_labeled()) {
 179                 /*
 180                  * RDS socket is not supported on labeled systems
 181                  */
 182                 return (ESOCKTNOSUPPORT);
 183         }
 184 
 185         /* Open the transport driver if IB HW is present */
 186         rw_enter(&rds_transport_lock, RW_READER);
 187         if (rds_transport_handle == NULL) {
 188                 rw_exit(&rds_transport_lock);
 189                 ret = rds_open_transport_driver();
 190                 rw_enter(&rds_transport_lock, RW_READER);
 191 
 192                 if (ret != 0) {
 193                         /* Transport driver failed to load */
 194                         rw_exit(&rds_transport_lock);
 195                         return (ret);
 196                 }
 197         }
 198         rw_exit(&rds_transport_lock);
 199 
 200         if (sflag == MODOPEN) {
 201                 return (EINVAL);
 202         }
 203 
 204         /* Reopen not supported */
 205         if (q->q_ptr != NULL) {
 206                 dprint(2, ("%s: Reopen is not supported: %p", LABEL, q->q_ptr));
 207                 return (0);
 208         }
 209 
 210         rds = rds_create(q, credp);
 211         if (rds == NULL) {
 212                 dprint(2, ("%s: rds_create failed", LABEL));
 213                 return (0);
 214         }
 215 
 216         q->q_ptr = WR(q)->q_ptr = rds;
 217         rds->rds_state = TS_UNBND;
 218         rds->rds_family = AF_INET_OFFLOAD;
 219 
 220         q->q_hiwat = rds_recv_hiwat;
 221         q->q_lowat = rds_recv_lowat;
 222 
 223         qprocson(q);
 224 
 225         WR(q)->q_hiwat = rds_xmit_hiwat;
 226         WR(q)->q_lowat = rds_xmit_lowat;
 227 
 228         /* Set the Stream head watermarks */
 229         (void) proto_set_rx_hiwat(q, NULL, rds_recv_hiwat);
 230         (void) proto_set_rx_lowat(q, NULL, rds_recv_lowat);
 231 
 232         return (0);
 233 }
 234 
 235 static int
 236 rds_close(queue_t *q)
 237 {
 238         rds_t *rdsp = (rds_t *)q->q_ptr;
 239 
 240         qprocsoff(q);
 241 
 242         /*
 243          * NPORT should be decremented only if this socket was previously
 244          * bound to an RDS port.
 245          */
 246         if (rdsp->rds_state >= TS_IDLE) {
 247                 RDS_DECR_NPORT();
 248                 RDS_SET_PORT_QUOTA(RDS_CURRENT_PORT_QUOTA());
 249                 rds_transport_ops->
 250                     rds_transport_resume_port(ntohs(rdsp->rds_port));
 251         }
 252 
 253         /* close the transport driver if this is the last socket */
 254         if (RDS_GET_NPORT() == 1) {
 255                 (void) rds_close_transport_driver();
 256         }
 257 
 258         /*
 259          * We set the flags without holding a lock as this is
 260          * just a hint for the fanout lookup to skip this rds.
 261          * We dont free the struct until it's out of the hash and
 262          * the ref count goes down.
 263          */
 264         rdsp->rds_flags |= RDS_CLOSING;
 265         rds_bind_hash_remove(rdsp, B_FALSE);
 266         mutex_enter(&rdsp->rds_lock);
 267         ASSERT(rdsp->rds_refcnt > 0);
 268         if (rdsp->rds_refcnt != 1) {
 269                 cv_wait(&rdsp->rds_refcv, &rdsp->rds_lock);
 270         }
 271         mutex_exit(&rdsp->rds_lock);
 272         RDS_DEC_REF_CNT(rdsp);
 273         RD(q)->q_ptr = NULL;
 274         WR(q)->q_ptr = NULL;
 275         return (0);
 276 }
 277 
 278 /*
 279  * Add a new message to the socket
 280  */
 281 int
 282 rds_deliver_new_msg(mblk_t *mp, ipaddr_t local_addr, ipaddr_t rem_addr,
 283     in_port_t local_port, in_port_t rem_port, zoneid_t zoneid)
 284 {
 285         rds_t *rds;
 286         struct  T_unitdata_ind  *tudi;
 287         int     udi_size;       /* Size of T_unitdata_ind */
 288         mblk_t *mp1;
 289         sin_t   *sin;
 290         int error = 0;
 291 
 292         local_port = htons(local_port);
 293         rem_port = htons(rem_port);
 294 
 295         ASSERT(mp->b_datap->db_type == M_DATA);
 296         rds = rds_fanout(local_addr, rem_addr, local_port, rem_port, zoneid);
 297         if (rds == NULL) {
 298                 dprint(2, ("%s: rds_fanout failed: (0x%x 0x%x %d %d)", LABEL,
 299                     local_addr, rem_addr, ntohs(local_port), ntohs(rem_port)));
 300                 freemsg(mp);
 301                 return (error);
 302         }
 303 
 304         udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin_t);
 305 
 306         /* Allocate a message block for the T_UNITDATA_IND structure. */
 307         mp1 = allocb(udi_size, BPRI_MED);
 308         if (mp1 == NULL) {
 309                 dprint(2, ("%s: allocb failed", LABEL));
 310                 freemsg(mp);
 311                 return (ENOMEM);
 312         }
 313 
 314         mp1->b_cont = mp;
 315         mp = mp1;
 316         mp->b_datap->db_type = M_PROTO;
 317         tudi = (struct T_unitdata_ind *)(uintptr_t)mp->b_rptr;
 318         mp->b_wptr = (uchar_t *)tudi + udi_size;
 319         tudi->PRIM_type = T_UNITDATA_IND;
 320         tudi->SRC_length = sizeof (sin_t);
 321         tudi->SRC_offset = sizeof (struct T_unitdata_ind);
 322         tudi->OPT_offset = sizeof (struct T_unitdata_ind) + sizeof (sin_t);
 323         udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin_t));
 324         tudi->OPT_length = udi_size;
 325         sin = (sin_t *)&tudi[1];
 326         sin->sin_addr.s_addr = rem_addr;
 327         sin->sin_port = ntohs(rem_port);
 328         sin->sin_family = rds->rds_family;
 329         *(uint32_t *)(uintptr_t)&sin->sin_zero[0] = 0;
 330         *(uint32_t *)(uintptr_t)&sin->sin_zero[4] = 0;
 331 
 332         putnext(rds->rds_ulpd, mp);
 333 
 334         /* check port quota */
 335         if (RDS_GET_RXPKTS_PEND() > rds_rx_pkts_pending_hwm) {
 336                 ulong_t current_port_quota = RDS_GET_PORT_QUOTA();
 337                 if (rds->rds_port_quota > current_port_quota) {
 338                         /* this may result in stalling the port */
 339                         rds->rds_port_quota = current_port_quota;
 340                         (void) proto_set_rx_hiwat(rds->rds_ulpd, NULL,
 341                             rds->rds_port_quota * UserBufferSize);
 342                         RDS_INCR_PORT_QUOTA_ADJUSTED();
 343                 }
 344         }
 345 
 346         /*
 347          * canputnext() check is done after putnext as the protocol does
 348          * not allow dropping any received packet.
 349          */
 350         if (!canputnext(rds->rds_ulpd)) {
 351                 error = ENOSPC;
 352         }
 353 
 354         RDS_DEC_REF_CNT(rds);
 355         return (error);
 356 }
 357 
 358 
 359 /* Default structure copied into T_INFO_ACK messages */
 360 static struct T_info_ack rds_g_t_info_ack_ipv4 = {
 361         T_INFO_ACK,
 362         65535,  /* TSDU_size. Excl. headers */
 363         T_INVALID,      /* ETSU_size.  rds does not support expedited data. */
 364         T_INVALID,      /* CDATA_size. rds does not support connect data. */
 365         T_INVALID,      /* DDATA_size. rds does not support disconnect data. */
 366         sizeof (sin_t), /* ADDR_size. */
 367         0,              /* OPT_size - not initialized here */
 368         65535,          /* TIDU_size.  Excl. headers */
 369         T_CLTS,         /* SERV_type.  rds supports connection-less. */
 370         TS_UNBND,       /* CURRENT_state.  This is set from rds_state. */
 371         (XPG4_1|SENDZERO) /* PROVIDER_flag */
 372 };
 373 
 374 static in_port_t
 375 rds_update_next_port(in_port_t port)
 376 {
 377         (void) random_get_pseudo_bytes((uint8_t *)&port, sizeof (in_port_t));
 378         if (port < rds_smallest_port)
 379                 port = rds_smallest_port;
 380         return (port);
 381 }
 382 
 383 /* This routine creates a T_ERROR_ACK message and passes it upstream. */
 384 static void
 385 rds_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error)
 386 {
 387         if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
 388                 qreply(q, mp);
 389 }
 390 
 391 static void
 392 rds_capability_req(queue_t *q, mblk_t *mp)
 393 {
 394         t_uscalar_t     cap_bits1;
 395         struct T_capability_ack *tcap;
 396 
 397         cap_bits1 =
 398             ((struct T_capability_req *)(uintptr_t)mp->b_rptr)->CAP_bits1;
 399 
 400         mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack),
 401             mp->b_datap->db_type, T_CAPABILITY_ACK);
 402         if (mp == NULL)
 403                 return;
 404         tcap = (struct T_capability_ack *)(uintptr_t)mp->b_rptr;
 405         tcap->CAP_bits1 = 0;
 406 
 407         if (cap_bits1 & TC1_INFO) {
 408                 tcap->CAP_bits1 |= TC1_INFO;
 409                 *(&tcap->INFO_ack) = rds_g_t_info_ack_ipv4;
 410         }
 411 
 412         qreply(q, mp);
 413 }
 414 
 415 static void
 416 rds_info_req(queue_t *q, mblk_t *omp)
 417 {
 418         rds_t *rds = (rds_t *)q->q_ptr;
 419         struct T_info_ack *tap;
 420         mblk_t *mp;
 421 
 422         /* Create a T_INFO_ACK message. */
 423         mp = tpi_ack_alloc(omp, sizeof (struct T_info_ack), M_PCPROTO,
 424             T_INFO_ACK);
 425         if (mp == NULL)
 426                 return;
 427         tap = (struct T_info_ack *)(uintptr_t)mp->b_rptr;
 428         *tap = rds_g_t_info_ack_ipv4;
 429         tap->CURRENT_state = rds->rds_state;
 430         tap->OPT_size = 128;
 431         qreply(q, mp);
 432 }
 433 
 434 /*
 435  * NO locking protection here as sockfs will only send down
 436  * one bind operation at a time.
 437  */
 438 static void
 439 rds_bind(queue_t *q, mblk_t *mp)
 440 {
 441         sin_t           *sin;
 442         rds_t *rds;
 443         struct T_bind_req *tbr;
 444         in_port_t       port;   /* Host byte order */
 445         in_port_t       requested_port; /* Host byte order */
 446         struct T_bind_ack *tba;
 447         int             count;
 448         rds_bf_t        *rdsbf;
 449         in_port_t       lport;  /* Network byte order */
 450 
 451         rds = (rds_t *)q->q_ptr;
 452         if (((uintptr_t)mp->b_wptr - (uintptr_t)mp->b_rptr) < sizeof (*tbr)) {
 453                 rds_err_ack(q, mp, TPROTO, 0);
 454                 return;
 455         }
 456 
 457         /*
 458          * We don't allow multiple binds
 459          */
 460         if (rds->rds_state != TS_UNBND) {
 461                 rds_err_ack(q, mp, TOUTSTATE, 0);
 462                 return;
 463         }
 464 
 465         tbr = (struct T_bind_req *)(uintptr_t)mp->b_rptr;
 466         switch (tbr->ADDR_length) {
 467         case sizeof (sin_t):    /* Complete IPv4 address */
 468                 sin = (sin_t *)(uintptr_t)mi_offset_param(mp, tbr->ADDR_offset,
 469                     sizeof (sin_t));
 470                 if (sin == NULL || !OK_32PTR((char *)sin)) {
 471                         rds_err_ack(q, mp, TSYSERR, EINVAL);
 472                         return;
 473                 }
 474                 if (rds->rds_family != AF_INET_OFFLOAD ||
 475                     sin->sin_family != AF_INET_OFFLOAD) {
 476                         rds_err_ack(q, mp, TSYSERR, EAFNOSUPPORT);
 477                         return;
 478                 }
 479                 if (sin->sin_addr.s_addr == INADDR_ANY) {
 480                         rds_err_ack(q, mp, TBADADDR, 0);
 481                         return;
 482                 }
 483 
 484                 /*
 485                  * verify that the address is hosted on IB
 486                  * only exception is the loopback address.
 487                  */
 488                 if ((sin->sin_addr.s_addr != INADDR_LOOPBACK) &&
 489                     !rds_verify_bind_address(sin->sin_addr.s_addr)) {
 490                         rds_err_ack(q, mp, TBADADDR, 0);
 491                         return;
 492                 }
 493 
 494                 port = ntohs(sin->sin_port);
 495                 break;
 496         default:        /* Invalid request */
 497                 rds_err_ack(q, mp, TBADADDR, 0);
 498                 return;
 499         }
 500 
 501         requested_port = port;
 502 
 503         /*
 504          * TPI only sends down T_BIND_REQ for AF_INET and AF_INET6
 505          * since RDS socket is of type AF_INET_OFFLOAD a O_T_BIND_REQ
 506          * will be sent down. Treat O_T_BIND_REQ as T_BIND_REQ
 507          */
 508 
 509         if (requested_port == 0) {
 510                 /*
 511                  * If the application passed in zero for the port number, it
 512                  * doesn't care which port number we bind to. Get one in the
 513                  * valid range.
 514                  */
 515                 port = rds_update_next_port(rds_next_port_to_try);
 516         }
 517 
 518         ASSERT(port != 0);
 519         count = 0;
 520         for (;;) {
 521                 rds_t           *rds1;
 522                 ASSERT(sin->sin_addr.s_addr != INADDR_ANY);
 523                 /*
 524                  * Walk through the list of rds streams bound to
 525                  * requested port with the same IP address.
 526                  */
 527                 lport = htons(port);
 528                 rdsbf = &rds_bind_fanout[RDS_BIND_HASH(lport)];
 529                 mutex_enter(&rdsbf->rds_bf_lock);
 530                 for (rds1 = rdsbf->rds_bf_rds; rds1 != NULL;
 531                     rds1 = rds1->rds_bind_hash) {
 532                         if (lport != rds1->rds_port ||
 533                             rds1->rds_src != sin->sin_addr.s_addr ||
 534                             rds1->rds_zoneid != rds->rds_zoneid)
 535 
 536                                 continue;
 537                         break;
 538                 }
 539 
 540                 if (rds1 == NULL) {
 541                         /*
 542                          * No other stream has this IP address
 543                          * and port number. We can use it.
 544                          */
 545                         break;
 546                 }
 547                 mutex_exit(&rdsbf->rds_bf_lock);
 548                 if (requested_port != 0) {
 549                         /*
 550                          * We get here only when requested port
 551                          * is bound (and only first  of the for()
 552                          * loop iteration).
 553                          *
 554                          * The semantics of this bind request
 555                          * require it to fail so we return from
 556                          * the routine (and exit the loop).
 557                          *
 558                          */
 559                         rds_err_ack(q, mp, TADDRBUSY, 0);
 560                         return;
 561                 }
 562 
 563                 port = rds_update_next_port(port + 1);
 564 
 565                 if (++count >= loopmax) {
 566                         /*
 567                          * We've tried every possible port number and
 568                          * there are none available, so send an error
 569                          * to the user.
 570                          */
 571                         rds_err_ack(q, mp, TNOADDR, 0);
 572                         return;
 573                 }
 574         }
 575 
 576         /*
 577          * Copy the source address into our rds structure.
 578          */
 579         rds->rds_src = sin->sin_addr.s_addr;
 580         rds->rds_port = lport;
 581 
 582         /*
 583          * reset the next port if we choose the port
 584          */
 585         if (requested_port == 0) {
 586                 rds_next_port_to_try = port + 1;
 587         }
 588 
 589         rds->rds_state = TS_IDLE;
 590         rds_bind_hash_insert(rdsbf, rds);
 591         mutex_exit(&rdsbf->rds_bf_lock);
 592 
 593         /* Reset the message type in preparation for shipping it back. */
 594         mp->b_datap->db_type = M_PCPROTO;
 595         tba = (struct T_bind_ack *)(uintptr_t)mp->b_rptr;
 596         tba->PRIM_type = T_BIND_ACK;
 597 
 598         /* Increment the number of ports and set the port quota */
 599         RDS_INCR_NPORT();
 600         rds->rds_port_quota = RDS_CURRENT_PORT_QUOTA();
 601         RDS_SET_PORT_QUOTA(rds->rds_port_quota);
 602         (void) proto_set_rx_hiwat(RD(q), NULL,
 603             rds->rds_port_quota * UserBufferSize);
 604 
 605         qreply(q, mp);
 606 }
 607 
 608 static void
 609 rds_wput_other(queue_t *q, mblk_t *mp)
 610 {
 611         uchar_t *rptr = mp->b_rptr;
 612         struct datab *db;
 613         cred_t *cr;
 614 
 615         db = mp->b_datap;
 616         switch (db->db_type) {
 617         case M_DATA:
 618                 /* Not connected */
 619                 freemsg(mp);
 620                 return;
 621         case M_PROTO:
 622         case M_PCPROTO:
 623                 if ((uintptr_t)mp->b_wptr - (uintptr_t)rptr <
 624                     sizeof (t_scalar_t)) {
 625                         freemsg(mp);
 626                         return;
 627                 }
 628                 switch (((union T_primitives *)(uintptr_t)rptr)->type) {
 629                 case T_CAPABILITY_REQ:
 630                         rds_capability_req(q, mp);
 631                         return;
 632 
 633                 case T_INFO_REQ:
 634                         rds_info_req(q, mp);
 635                         return;
 636                 case O_T_BIND_REQ:
 637                 case T_BIND_REQ:
 638                         rds_bind(q, mp);
 639                         return;
 640                 case T_SVR4_OPTMGMT_REQ:
 641                 case T_OPTMGMT_REQ:
 642                         /*
 643                          * All Solaris components should pass a db_credp
 644                          * for this TPI message, hence we ASSERT.
 645                          * But in case there is some other M_PROTO that looks
 646                          * like a TPI message sent by some other kernel
 647                          * component, we check and return an error.
 648                          */
 649                         cr = msg_getcred(mp, NULL);
 650                         ASSERT(cr != NULL);
 651                         if (cr == NULL) {
 652                                 rds_err_ack(q, mp, TSYSERR, EINVAL);
 653                                 return;
 654                         }
 655                         if (((union T_primitives *)(uintptr_t)rptr)->type ==
 656                             T_SVR4_OPTMGMT_REQ) {
 657                                 svr4_optcom_req(q, mp, cr, &rds_opt_obj);
 658                         } else {
 659                                 tpi_optcom_req(q, mp, cr, &rds_opt_obj);
 660                         }
 661                         return;
 662                 case T_CONN_REQ:
 663                         /*
 664                          * We should not receive T_CONN_REQ as sockfs only
 665                          * sends down T_CONN_REQ if family == AF_INET/AF_INET6
 666                          * and type == SOCK_DGRAM/SOCK_RAW. For all others
 667                          * it simply calls soisconnected. see sotpi_connect()
 668                          * for details.
 669                          */
 670                 /* FALLTHRU */
 671                 default:
 672                         cmn_err(CE_PANIC, "type %d \n",
 673                             ((union T_primitives *)(uintptr_t)rptr)->type);
 674                 }
 675                 break;
 676         case M_FLUSH:
 677                 if (*rptr & FLUSHW)
 678                         flushq(q, FLUSHDATA);
 679                 break;
 680         case M_IOCTL:
 681                 rds_ioctl(q, mp);
 682                 break;
 683         case M_IOCDATA:
 684                 /* IOCTL continuation following copyin or copyout. */
 685                 if (mi_copy_state(q, mp, NULL) == -1) {
 686                         /*
 687                          * The copy operation failed.  mi_copy_state already
 688                          * cleaned up, so we're out of here.
 689                          */
 690                         return;
 691                 }
 692                 /*
 693                  * If we just completed a copy in, continue processing
 694                  * in rds_ioctl_copyin_done. If it was a copy out, we call
 695                  * mi_copyout again.  If there is nothing more to copy out,
 696                  * it will complete the IOCTL.
 697                  */
 698 
 699                 if (MI_COPY_DIRECTION(mp) == MI_COPY_IN)
 700                         rds_ioctl_copyin_done(q, mp);
 701                 else
 702                         mi_copyout(q, mp);
 703                 return;
 704 
 705         default:
 706                 cmn_err(CE_PANIC, "types %d \n", db->db_type);
 707         }
 708 }
 709 
 710 static int
 711 rds_wput(queue_t *q, mblk_t *mp)
 712 {
 713         struct  datab   *db;
 714         uchar_t *rptr = mp->b_rptr;
 715 
 716         db = mp->b_datap;
 717         switch (db->db_type) {
 718         case M_PROTO:
 719         case M_PCPROTO:
 720                 ASSERT(((uintptr_t)mp->b_wptr - (uintptr_t)rptr) <=
 721                     (uintptr_t)INT_MAX);
 722                 if ((uintptr_t)mp->b_wptr - (uintptr_t)rptr >=
 723                     sizeof (struct T_unitdata_req)) {
 724                         if (((union T_primitives *)(uintptr_t)rptr)->type
 725                             == T_UNITDATA_REQ) {
 726                                 /*
 727                                  *  We should never come here for T_UNITDATA_REQ
 728                                  */
 729                                 cmn_err(CE_PANIC, "rds_wput T_UNITDATA_REQ \n");
 730                         }
 731                 }
 732                 /* FALLTHRU */
 733         default:
 734                 rds_wput_other(q, mp);
 735                 return (0);
 736         }
 737 }
 738 
 739 static int
 740 rds_wput_data(queue_t *q, mblk_t *mp, uio_t *uiop)
 741 {
 742         uchar_t *rptr = mp->b_rptr;
 743         rds_t   *rds;
 744         mblk_t  *mp1;
 745         sin_t   *sin;
 746         ipaddr_t dst;
 747         uint16_t port;
 748         int ret = 0;
 749 
 750 #define tudr    ((struct T_unitdata_req *)(uintptr_t)rptr)
 751 
 752         rds = (rds_t *)q->q_ptr;
 753         /* Handle UNITDATA_REQ messages here */
 754         if (rds->rds_state == TS_UNBND) {
 755                 /* If a port has not been bound to the stream, fail. */
 756                 dprint(2, ("%s: socket is not bound to a port", LABEL));
 757                 freemsg(mp);
 758                 return (EPROTO);
 759         }
 760 
 761         mp1 = mp->b_cont;
 762         mp->b_cont = NULL;
 763         if (mp1 == NULL) {
 764                 dprint(2, ("%s: No message to send", LABEL));
 765                 freemsg(mp);
 766                 return (EPROTO);
 767         }
 768 
 769         /*
 770          * No options allowed
 771          */
 772         if (tudr->OPT_length != 0) {
 773                 ret = EINVAL;
 774                 goto done;
 775         }
 776 
 777         ASSERT(mp1->b_datap->db_ref == 1);
 778 
 779         if ((rptr + tudr->DEST_offset + tudr->DEST_length) >
 780             mp->b_wptr) {
 781                 ret = EDESTADDRREQ;
 782                 goto done;
 783         }
 784 
 785         sin = (sin_t *)(uintptr_t)&rptr[tudr->DEST_offset];
 786         if (!OK_32PTR((char *)sin) || tudr->DEST_length !=
 787             sizeof (sin_t) || sin->sin_family != AF_INET_OFFLOAD) {
 788                 ret = EDESTADDRREQ;
 789                 goto done;
 790         }
 791         /* Extract port and ipaddr */
 792         port = sin->sin_port;
 793         dst = sin->sin_addr.s_addr;
 794 
 795         if (port == 0 || dst == INADDR_ANY) {
 796                 ret = EDESTADDRREQ;
 797                 goto done;
 798         }
 799 
 800         ASSERT(rds_transport_ops != NULL);
 801         ret = rds_transport_ops->rds_transport_sendmsg(uiop, rds->rds_src, dst,
 802             ntohs(rds->rds_port), ntohs(port), rds->rds_zoneid);
 803         if (ret != 0) {
 804                 if ((ret != ENOBUFS) && (ret != ENOMEM)) {
 805                         /* ENOMEM is actually EWOULDBLOCK */
 806                         dprint(2, ("%s: rds_sendmsg returned %d", LABEL, ret));
 807                         goto done;
 808                 }
 809         }
 810 done:
 811         freemsg(mp1);
 812         freemsg(mp);
 813         return (ret);
 814 }
 815 
 816 /*
 817  * Make sure we dont return EINVAL and EWOULDBLOCK as it has
 818  * special meanings for the synchronous streams (rwnext()).
 819  * We should return ENOMEM which is changed to EWOULDBLOCK by kstrputmsg()
 820  */
 821 static int
 822 rds_wrw(queue_t *q, struiod_t *dp)
 823 {
 824         mblk_t  *mp = dp->d_mp;
 825         int error = 0;
 826         struct  datab   *db;
 827         uchar_t *rptr;
 828 
 829         db = mp->b_datap;
 830         rptr = mp->b_rptr;
 831         switch (db->db_type) {
 832         case M_PROTO:
 833         case M_PCPROTO:
 834                 ASSERT(((uintptr_t)mp->b_wptr - (uintptr_t)rptr) <=
 835                     (uintptr_t)INT_MAX);
 836                 if ((uintptr_t)mp->b_wptr - (uintptr_t)rptr >=
 837                     sizeof (struct T_unitdata_req)) {
 838                         /* Detect valid T_UNITDATA_REQ here */
 839                         if (((union T_primitives *)(uintptr_t)rptr)->type
 840                             == T_UNITDATA_REQ)
 841                         break;
 842                 }
 843                 /* FALLTHRU */
 844         default:
 845 
 846                 if (isuioq(q) && (error = struioget(q, mp, dp, 0))) {
 847                 /*
 848                  * Uio error of some sort, so just return the error.
 849                  */
 850                         goto done;
 851                 }
 852                 dp->d_mp = 0;
 853                 rds_wput_other(q, mp);
 854                 return (0);
 855         }
 856 
 857         dp->d_mp = 0;
 858         error = rds_wput_data(q, mp, &dp->d_uio);
 859 done:
 860         if (error == EWOULDBLOCK || error == EINVAL)
 861                 error = EIO;
 862 
 863         return (error);
 864 }
 865 
 866 static void
 867 rds_rsrv(queue_t *q)
 868 {
 869         rds_t   *rds = (rds_t *)q->q_ptr;
 870         ulong_t current_port_quota;
 871 
 872         /* update the port quota to the current level */
 873         current_port_quota = RDS_GET_PORT_QUOTA();
 874         if (rds->rds_port_quota != current_port_quota) {
 875                 rds->rds_port_quota = current_port_quota;
 876                 (void) proto_set_rx_hiwat(q, NULL,
 877                     rds->rds_port_quota * UserBufferSize);
 878         }
 879 
 880         /* No more messages in the q, unstall the socket */
 881         rds_transport_ops->rds_transport_resume_port(ntohs(rds->rds_port));
 882 }
 883 
 884 int
 885 rds_close_transport_driver()
 886 {
 887         ASSERT(rds_transport_ops != NULL);
 888 
 889         rw_enter(&rds_transport_lock, RW_WRITER);
 890         if (rds_transport_handle != NULL) {
 891                 rds_transport_ops->rds_transport_close_ib();
 892                 (void) ldi_close(rds_transport_handle, FNDELAY, kcred);
 893                 rds_transport_handle = NULL;
 894         }
 895         rw_exit(&rds_transport_lock);
 896 
 897         return (0);
 898 }
 899 
 900 
 901 int
 902 rds_open_transport_driver()
 903 {
 904         int ret = 0;
 905 
 906         rw_enter(&rds_transport_lock, RW_WRITER);
 907         if (rds_transport_handle != NULL) {
 908                 /*
 909                  * Someone beat us to it.
 910                  */
 911                 goto done;
 912         }
 913 
 914         if (ibt_hw_is_present() == 0) {
 915                 ret = ENODEV;
 916                 goto done;
 917         }
 918 
 919         if (rds_li == NULL) {
 920                 ret = EPROTONOSUPPORT;
 921                 goto done;
 922         }
 923 
 924         ret = ldi_open_by_name("/devices/ib/rdsib@0:rdsib",
 925             FREAD | FWRITE, kcred, &rds_transport_handle, rds_li);
 926         if (ret != 0) {
 927                 ret = EPROTONOSUPPORT;
 928                 rds_transport_handle = NULL;
 929                 goto done;
 930         }
 931 
 932         ret = rds_transport_ops->rds_transport_open_ib();
 933         if (ret != 0) {
 934                 (void) ldi_close(rds_transport_handle, FNDELAY, kcred);
 935                 rds_transport_handle = NULL;
 936         }
 937 done:
 938         rw_exit(&rds_transport_lock);
 939         return (ret);
 940 }
 941 
 942 static struct module_info info = {
 943         0, "rds", 1, INFPSZ, 65536, 1024
 944 };
 945 
 946 static struct qinit rinit = {
 947         NULL, (pfi_t)rds_rsrv, rds_open, rds_close, NULL, &info
 948 };
 949 
 950 static struct qinit winit = {
 951         (pfi_t)rds_wput, NULL, rds_open, rds_close, NULL, &info,
 952         NULL, rds_wrw, NULL, STRUIOT_STANDARD
 953 };
 954 
 955 struct streamtab rdsinfo = {
 956         &rinit, &winit, NULL, NULL
 957 };
 958 
 959 DDI_DEFINE_STREAM_OPS(rds_devops, nulldev, nulldev, rds_attach, rds_detach,
 960     nulldev, rds_info, RDS_DEVMTFLAGS, &RDS_STRTAB, ddi_quiesce_not_supported);
 961 
 962 /*
 963  * Module linkage information for the kernel.
 964  */
 965 static struct modldrv modldrv = {
 966         &mod_driverops,
 967         RDS_DEVDESC,
 968         &rds_devops
 969 };
 970 
 971 static struct modlinkage modlinkage = {
 972         MODREV_1,
 973         { &modldrv, NULL }
 974 };
 975 
 976 int
 977 _init(void)
 978 {
 979         int     ret;
 980 
 981         rds_init();
 982 
 983         ret = mod_install(&modlinkage);
 984         if (ret != 0)
 985                 goto done;
 986         ret = ldi_ident_from_mod(&modlinkage, &rds_li);
 987         if (ret != 0)
 988                 rds_li = NULL;
 989 done:
 990         return (ret);
 991 }
 992 
 993 int
 994 _fini(void)
 995 {
 996         int     ret;
 997 
 998         ret = mod_remove(&modlinkage);
 999         if (ret != 0) {
1000                 return (ret);
1001         }
1002 
1003         rds_fini();
1004 
1005         ldi_ident_release(rds_li);
1006         return (0);
1007 }
1008 
1009 int
1010 _info(struct modinfo *modinfop)
1011 {
1012         return (mod_info(&modlinkage, modinfop));
1013 }