1 /*
   2  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
   3  */
   4 
   5 /*
   6  * This file contains code imported from the OFED rds source file af_rds.c
   7  * Oracle elects to have and use the contents of af_rds.c under and governed
   8  * by the OpenIB.org BSD license (see below for full license text). However,
   9  * the following notice accompanied the original version of this file:
  10  */
  11 
  12 /*
  13  * Copyright (c) 2006 Oracle.  All rights reserved.
  14  *
  15  * This software is available to you under a choice of one of two
  16  * licenses.  You may choose to be licensed under the terms of the GNU
  17  * General Public License (GPL) Version 2, available from the file
  18  * COPYING in the main directory of this source tree, or the
  19  * OpenIB.org BSD license below:
  20  *
  21  *     Redistribution and use in source and binary forms, with or
  22  *     without modification, are permitted provided that the following
  23  *     conditions are met:
  24  *
  25  *      - Redistributions of source code must retain the above
  26  *        copyright notice, this list of conditions and the following
  27  *        disclaimer.
  28  *
  29  *      - Redistributions in binary form must reproduce the above
  30  *        copyright notice, this list of conditions and the following
  31  *        disclaimer in the documentation and/or other materials
  32  *        provided with the distribution.
  33  *
  34  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  35  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  36  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  37  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  38  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  39  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  40  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  41  * SOFTWARE.
  42  *
  43  */
  44 #include <sys/types.h>
  45 #include <sys/stat.h>
  46 #include <sys/conf.h>
  47 #include <sys/ddi.h>
  48 #include <sys/sunddi.h>
  49 #include <sys/modctl.h>
  50 #include <sys/rds.h>
  51 #include <sys/stropts.h>
  52 #include <sys/socket.h>
  53 #include <sys/socketvar.h>
  54 #include <sys/sockio.h>
  55 #include <sys/sysmacros.h>
  56 
  57 #include <inet/ip.h>
  58 #include <net/if_types.h>
  59 
  60 #include <sys/ib/clients/rdsv3/rdsv3.h>
  61 #include <sys/ib/clients/rdsv3/rdma.h>
  62 #include <sys/ib/clients/rdsv3/rdma_transport.h>
  63 #include <sys/ib/clients/rdsv3/rdsv3_debug.h>
  64 
  65 extern void rdsv3_remove_bound(struct rdsv3_sock *rds);
  66 extern int rdsv3_verify_bind_address(ipaddr_t addr);
  67 
  68 extern ddi_taskq_t      *rdsv3_taskq;
  69 extern struct rdma_cm_id *rdsv3_rdma_listen_id;
  70 
  71 /* this is just used for stats gathering :/ */
  72 kmutex_t rdsv3_sock_lock;
  73 static unsigned long rdsv3_sock_count;
  74 list_t rdsv3_sock_list;
  75 
  76 /*
  77  * This is called as the final descriptor referencing this socket is closed.
  78  * We have to unbind the socket so that another socket can be bound to the
  79  * address it was using.
  80  *
  81  * We have to be careful about racing with the incoming path.  sock_orphan()
  82  * sets SOCK_DEAD and we use that as an indicator to the rx path that new
  83  * messages shouldn't be queued.
  84  */
  85 /* ARGSUSED */
  86 static int
  87 rdsv3_release(sock_lower_handle_t proto_handle, int flgs, cred_t *cr)
  88 {
  89         struct rsock *sk = (struct rsock *)proto_handle;
  90         struct rdsv3_sock *rs;
  91 
  92         if (!sk)
  93                 goto out;
  94 
  95         rs = rdsv3_sk_to_rs(sk);
  96         RDSV3_DPRINTF4("rdsv3_release", "Enter(rs: %p, sk: %p)", rs, sk);
  97 
  98         rdsv3_sk_sock_orphan(sk);
  99         rdsv3_cong_remove_socket(rs);
 100         rdsv3_remove_bound(rs);
 101 
 102         /*
 103          * Note - rdsv3_clear_recv_queue grabs rs_recv_lock, so
 104          * that ensures the recv path has completed messing
 105          * with the socket.
 106          *
 107          * Note2 - rdsv3_clear_recv_queue(rs) should be called first
 108          * to prevent some race conditions, which is different from
 109          * the Linux code.
 110          */
 111         rdsv3_clear_recv_queue(rs);
 112         rdsv3_send_drop_to(rs, NULL);
 113         rdsv3_rdma_drop_keys(rs);
 114         (void) rdsv3_notify_queue_get(rs, NULL);
 115 
 116         mutex_enter(&rdsv3_sock_lock);
 117         list_remove_node(&rs->rs_item);
 118         rdsv3_sock_count--;
 119         mutex_exit(&rdsv3_sock_lock);
 120 
 121         while (sk->sk_refcount > 1) {
 122                 /* wait for 1 sec and try again */
 123                 delay(drv_usectohz(1000000));
 124         }
 125 
 126         /* this will free the rs and sk */
 127         rdsv3_sk_sock_put(sk);
 128 
 129         RDSV3_DPRINTF4("rdsv3_release", "Return (rds: %p)", rs);
 130 out:
 131         return (0);
 132 }
 133 
 134 void
 135 __rdsv3_wake_sk_sleep(struct rsock *sk)
 136 {
 137         /* wakup anyone waiting in recvmsg */
 138         if (!rdsv3_sk_sock_flag(sk, SOCK_DEAD) && sk->sk_sleep)
 139                 rdsv3_wake_up(sk->sk_sleep);
 140 }
 141 
 142 /*
 143  * Careful not to race with rdsv3_release -> sock_orphan which clears sk_sleep.
 144  * _bh() isn't OK here, we're called from interrupt handlers.  It's probably OK
 145  * to wake the waitqueue after sk_sleep is clear as we hold a sock ref, but
 146  * this seems more conservative.
 147  * NB - normally, one would use sk_callback_lock for this, but we can
 148  * get here from interrupts, whereas the network code grabs sk_callback_lock
 149  * with _lock_bh only - so relying on sk_callback_lock introduces livelocks.
 150  */
 151 void
 152 rdsv3_wake_sk_sleep(struct rdsv3_sock *rs)
 153 {
 154         RDSV3_DPRINTF4("rdsv3_wake_sk_sleep", "Enter(rs: %p)", rs);
 155 
 156         rw_enter(&rs->rs_recv_lock, RW_READER);
 157         __rdsv3_wake_sk_sleep(rdsv3_rs_to_sk(rs));
 158         rw_exit(&rs->rs_recv_lock);
 159 }
 160 
 161 /*ARGSUSED*/
 162 static int
 163 rdsv3_getname(sock_lower_handle_t proto_handle, struct sockaddr *addr,
 164     socklen_t *addr_len, cred_t *cr)
 165 {
 166         struct rsock *sk = (struct rsock *)proto_handle;
 167         struct sockaddr_in *sin = (struct sockaddr_in *)addr;
 168         struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
 169 
 170         RDSV3_DPRINTF4("rdsv3_getname", "Enter(rs: %p, port: %d)", rs,
 171             rs->rs_bound_port);
 172 
 173         sin->sin_port = rs->rs_bound_port;
 174         sin->sin_addr.s_addr = rs->rs_bound_addr;
 175 
 176         sin->sin_family = AF_INET_OFFLOAD;
 177 
 178         *addr_len = sizeof (*sin);
 179         return (0);
 180 }
 181 
 182 /*
 183  * RDS' poll is without a doubt the least intuitive part of the interface,
 184  * as POLLIN and POLLOUT do not behave entirely as you would expect from
 185  * a network protocol.
 186  *
 187  * POLLIN is asserted if
 188  *  -   there is data on the receive queue.
 189  *  -   to signal that a previously congested destination may have become
 190  *      uncongested
 191  *  -   A notification has been queued to the socket (this can be a congestion
 192  *      update, or a RDMA completion).
 193  *
 194  * POLLOUT is asserted if there is room on the send queue. This does not mean
 195  * however, that the next sendmsg() call will succeed. If the application tries
 196  * to send to a congested destination, the system call may still fail (and
 197  * return ENOBUFS).
 198  */
 199 /* ARGSUSED */
 200 static short
 201 rdsv3_poll(sock_lower_handle_t proto_handle, short events, int anyyet,
 202     cred_t *cr)
 203 {
 204         struct rsock    *sk = (struct rsock *)proto_handle;
 205         struct rdsv3_sock       *rs = rdsv3_sk_to_rs(sk);
 206         unsigned short mask = 0;
 207 
 208 #if 0
 209         RDSV3_DPRINTF4("rdsv3_poll", "enter(%p %x %d)", rs, events, anyyet);
 210 #endif
 211 
 212         /*
 213          * If rs_seen_congestion is on, wait until it's off.
 214          * This is implemented for the following OFED code.
 215          *      if (rs->rs_seen_congestion)
 216          *              poll_wait(file, &rds_poll_waitq, wait);
 217          */
 218         mutex_enter(&rs->rs_congested_lock);
 219         while (rs->rs_seen_congestion) {
 220                 cv_wait(&rs->rs_congested_cv,
 221                     &rs->rs_congested_lock);
 222         }
 223         mutex_exit(&rs->rs_congested_lock);
 224 
 225         rw_enter(&rs->rs_recv_lock, RW_READER);
 226         if (!rs->rs_cong_monitor) {
 227                 /*
 228                  * When a congestion map was updated, we signal POLLIN for
 229                  * "historical" reasons. Applications can also poll for
 230                  * WRBAND instead.
 231                  */
 232                 if (rdsv3_cong_updated_since(&rs->rs_cong_track))
 233                         mask |= (POLLIN | POLLRDNORM | POLLWRBAND);
 234         } else {
 235                 mutex_enter(&rs->rs_lock);
 236                 if (rs->rs_cong_notify)
 237                         mask |= (POLLIN | POLLRDNORM);
 238                 mutex_exit(&rs->rs_lock);
 239         }
 240         if (!list_is_empty(&rs->rs_recv_queue) ||
 241             !list_is_empty(&rs->rs_notify_queue))
 242                 mask |= (POLLIN | POLLRDNORM);
 243         if (rs->rs_snd_bytes < rdsv3_sk_sndbuf(rs))
 244                 mask |= (POLLOUT | POLLWRNORM);
 245 
 246         /* clear state any time we wake a seen-congested socket */
 247         if (mask) {
 248                 mutex_enter(&rs->rs_congested_lock);
 249                 rs->rs_seen_congestion = 0;
 250                 mutex_exit(&rs->rs_congested_lock);
 251         }
 252 
 253         rw_exit(&rs->rs_recv_lock);
 254 
 255 #if 0
 256         RDSV3_DPRINTF4("rdsv3_poll", "return(%p %x)", rs, mask);
 257 #endif
 258 
 259         return (mask);
 260 }
 261 
 262 /* ARGSUSED */
 263 static int
 264 rdsv3_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
 265     int mode, int32_t *rvalp, cred_t *cr)
 266 {
 267         ksocket_t       so4;
 268         struct lifconf  lifc;
 269         struct lifreq   lifr, *lifrp;
 270         struct ifconf   ifc;
 271         struct ifreq    ifr;
 272         int             rval = 0, rc, len;
 273         int             numifs;
 274         int             bufsize;
 275         void            *buf;
 276 
 277         RDSV3_DPRINTF4("rdsv3_ioctl", "enter: cmd: %d", cmd);
 278 
 279         /* Only ipv4 for now */
 280         rval = ksocket_socket(&so4, PF_INET, SOCK_DGRAM, 0, KSOCKET_NOSLEEP,
 281             CRED());
 282         if (rval != 0) {
 283                 RDSV3_DPRINTF2("rdsv3_ioctl", "ksocket_socket returned %d",
 284                     rval);
 285                 return (rval);
 286         }
 287 
 288         switch (cmd) {
 289         case SIOCGLIFNUM :
 290         case SIOCGIFNUM :
 291                 rval = rdsv3_do_ip_ioctl(so4, &buf, &bufsize, &numifs);
 292                 if (rval != 0) break;
 293                 if (cmd == SIOCGLIFNUM) {
 294                         struct lifnum   lifn;
 295                         lifn.lifn_family = AF_INET_OFFLOAD;
 296                         lifn.lifn_flags = 0;
 297                         lifn.lifn_count = numifs;
 298                         (void) ddi_copyout(&lifn, (void *)arg,
 299                             sizeof (struct lifnum), 0);
 300                 } else {
 301                         len = 0;
 302                         for (lifrp = (struct lifreq *)buf, rc = 0; rc < numifs;
 303                             rc++, lifrp++) {
 304                                 if (strlen(lifrp->lifr_name) <= IFNAMSIZ) {
 305                                         len++;
 306                                 }
 307                         }
 308                         (void) ddi_copyout(&len, (void *)arg,
 309                             sizeof (int), 0);
 310                 }
 311                 kmem_free(buf, bufsize);
 312                 break;
 313 
 314         case SIOCGLIFCONF :
 315                 if (ddi_copyin((void *)arg, &lifc, sizeof (struct lifconf), 0)
 316                     != 0) {
 317                         RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed lifc");
 318                         rval = EFAULT;
 319                         break;
 320                 }
 321 
 322                 rval = rdsv3_do_ip_ioctl(so4, &buf, &bufsize, &numifs);
 323                 if (rval != 0) {
 324                         RDSV3_DPRINTF2("rdsv3_ioctl",
 325                             "rdsv3_do_ip_ioctl failed: %d", rval);
 326                         break;
 327                 }
 328 
 329                 if ((lifc.lifc_len > 0) && (numifs > 0)) {
 330                         if (ddi_copyout(buf, (void *)lifc.lifc_req,
 331                             (lifc.lifc_len < bufsize) ? lifc.lifc_len :
 332                             bufsize, 0) != 0) {
 333                                 RDSV3_DPRINTF2("rdsv3_ioctl",
 334                                     "copyout of records failed");
 335                                 rval = EFAULT;
 336                         }
 337 
 338                 }
 339 
 340                 lifc.lifc_len = bufsize;
 341                 if (ddi_copyout(&lifc, (void *)arg, sizeof (struct lifconf),
 342                     0) != 0) {
 343                         RDSV3_DPRINTF2("rdsv3_ioctl",
 344                             "copyout of lifconf failed");
 345                         rval = EFAULT;
 346                 }
 347 
 348                 kmem_free(buf, bufsize);
 349                 break;
 350 
 351         case SIOCGIFCONF :
 352         case O_SIOCGIFCONF :
 353                 if (ddi_copyin((void *)arg, &ifc, sizeof (struct ifconf), 0)
 354                     != 0) {
 355                         RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed ifc");
 356                         rval = EFAULT;
 357                         break;
 358                 }
 359 
 360                 RDSV3_DPRINTF2("rdsv3_ioctl",
 361                     "O_SIOCGIFCONF: ifc_len: %d, req: %p",
 362                     ifc.ifc_len, ifc.ifc_req);
 363 
 364                 rval = rdsv3_do_ip_ioctl_old(so4, &buf, &bufsize, &numifs);
 365                 if (rval != 0) {
 366                         RDSV3_DPRINTF2("rdsv3_ioctl",
 367                             "rdsv3_do_ip_ioctl_old failed: %d", rval);
 368                         break;
 369                 }
 370 
 371                 if ((ifc.ifc_len > 0) && (numifs > 0)) {
 372                         if (ddi_copyout(buf, (void *)ifc.ifc_req,
 373                             (ifc.ifc_len < bufsize) ? ifc.ifc_len :
 374                             bufsize, 0) != 0) {
 375                                 RDSV3_DPRINTF2("rdsv3_ioctl",
 376                                     "copyout of records failed");
 377                                 rval = EFAULT;
 378                         }
 379 
 380                 }
 381 
 382                 ifc.ifc_len = bufsize;
 383                 if (ddi_copyout(&ifc, (void *)arg, sizeof (struct ifconf),
 384                     0) != 0) {
 385                         RDSV3_DPRINTF2("rdsv3_ioctl",
 386                             "copyout of ifconf failed");
 387                         rval = EFAULT;
 388                 }
 389 
 390                 kmem_free(buf, bufsize);
 391                 break;
 392 
 393         case SIOCGLIFFLAGS :
 394         case SIOCSLIFFLAGS :
 395         case SIOCGLIFMTU :
 396         case SIOCGLIFNETMASK :
 397         case SIOCGLIFINDEX :
 398                 if (ddi_copyin((void *)arg, &lifr, sizeof (struct lifreq), 0)
 399                     != 0) {
 400                         RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed lifr");
 401                         rval = EFAULT;
 402                         break;
 403                 }
 404 
 405                 rc = ksocket_ioctl(so4, cmd, (intptr_t)&lifr, &rval, CRED());
 406                 if (rc != 0) {
 407                         RDSV3_DPRINTF2("rdsv3_ioctl",
 408                             "ksocket_ioctl failed: %d, name: %s cmd: 0x%x",
 409                             rc, lifr.lifr_name, cmd);
 410                         break;
 411                 }
 412 
 413                 (void) ddi_copyout(&lifr, (void *)arg,
 414                     sizeof (struct lifreq), 0);
 415                 break;
 416 
 417         case SIOCGIFFLAGS :
 418         case SIOCSIFFLAGS :
 419         case SIOCGIFMTU :
 420         case SIOCGIFNETMASK :
 421         case SIOCGIFINDEX :
 422                 if (ddi_copyin((void *)arg, &ifr, sizeof (struct ifreq), 0)
 423                     != 0) {
 424                         RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed ifr");
 425                         rval = EFAULT;
 426                         break;
 427                 }
 428 
 429                 RDSV3_DPRINTF2("rdsv3_ioctl", "1. name: %s", ifr.ifr_name);
 430 
 431                 rc = ksocket_ioctl(so4, cmd, (intptr_t)&ifr, &rval, CRED());
 432                 if (rc != 0) {
 433                         RDSV3_DPRINTF2("rdsv3_ioctl",
 434                             "ksocket_ioctl failed: %d, name: %s cmd: 0x%x",
 435                             rc, ifr.ifr_name, cmd);
 436 
 437                         break;
 438                 }
 439 
 440                 RDSV3_DPRINTF2("rdsv3_ioctl", "2. name: %s", ifr.ifr_name);
 441 
 442                 (void) ddi_copyout(&ifr, (void *)arg,
 443                     sizeof (struct ifreq), 0);
 444                 break;
 445 
 446         default:
 447                 if ((cmd >= RDS_INFO_FIRST) &&
 448                     (cmd <= RDS_INFO_LAST)) {
 449                         return (rdsv3_info_ioctl((struct rsock *)proto_handle,
 450                             cmd, (char *)arg, rvalp));
 451                 }
 452                 RDSV3_DPRINTF2("rdsv3_ioctl", "Unknown ioctl cmd: %d",  cmd);
 453                 cmn_err(CE_CONT, "unsupported IOCTL cmd: %d \n", cmd);
 454                 rval = EOPNOTSUPP;
 455         }
 456 
 457         (void) ksocket_close(so4, CRED());
 458 
 459         RDSV3_DPRINTF4("rdsv3_ioctl", "return: %d cmd: %d", rval, cmd);
 460 
 461         *rvalp = rval;
 462         return (rval);
 463 }
 464 
 465 static int
 466 rdsv3_cancel_sent_to(struct rdsv3_sock *rs, char *optval, int len)
 467 {
 468         struct sockaddr_in sin;
 469 
 470         /* racing with another thread binding seems ok here */
 471         if (rs->rs_bound_addr == 0)
 472                 return (-ENOTCONN); /* XXX not a great errno */
 473 
 474         if (len < sizeof (struct sockaddr_in))
 475                 return (-EINVAL);
 476 
 477         if (ddi_copyin((void *)optval, &sin, sizeof (struct sockaddr_in),
 478             0) != 0) {
 479                 RDSV3_DPRINTF2("rdsv3_cancel_sent_to", "ddi_copyin failed sin");
 480                 return (-EFAULT);
 481         }
 482 
 483         rdsv3_send_drop_to(rs, &sin);
 484 
 485         return (0);
 486 }
 487 
 488 static int
 489 rdsv3_set_bool_option(unsigned char *optvar, char *optval, int optlen)
 490 {
 491         int value = *optval;
 492 
 493         if (optlen < sizeof (int))
 494                 return (-EINVAL);
 495         *optvar = !!value;
 496         return (0);
 497 }
 498 
 499 static int
 500 rdsv3_cong_monitor(struct rdsv3_sock *rs, char *optval, int optlen)
 501 {
 502         int ret;
 503 
 504         ret = rdsv3_set_bool_option(&rs->rs_cong_monitor, optval, optlen);
 505         if (ret == 0) {
 506                 if (rs->rs_cong_monitor) {
 507                         rdsv3_cong_add_socket(rs);
 508                 } else {
 509                         rdsv3_cong_remove_socket(rs);
 510                         rs->rs_cong_mask = 0;
 511                         rs->rs_cong_notify = 0;
 512                 }
 513         }
 514         return (ret);
 515 }
 516 
 517 /*ARGSUSED*/
 518 static int
 519 rdsv3_setsockopt(sock_lower_handle_t proto_handle, int level,
 520     int optname, const void *optval, socklen_t optlen, cred_t *cr)
 521 {
 522         struct rsock *sk = (struct rsock *)proto_handle;
 523         struct rdsv3_sock       *rs = rdsv3_sk_to_rs(sk);
 524         int     ret = 0;
 525 
 526         RDSV3_DPRINTF4("rdsv3_setsockopt", "enter(%p %d %d)",
 527             rs, level, optname);
 528 
 529         switch (optname) {
 530         case RDS_CANCEL_SENT_TO:
 531                 ret = rdsv3_cancel_sent_to(rs, (char *)optval, optlen);
 532                 break;
 533         case RDS_GET_MR:
 534                 ret = rdsv3_get_mr(rs, optval, optlen);
 535                 break;
 536         case RDS_GET_MR_FOR_DEST:
 537                 ret = rdsv3_get_mr_for_dest(rs, optval, optlen);
 538                 break;
 539         case RDS_FREE_MR:
 540                 ret = rdsv3_free_mr(rs, optval, optlen);
 541                 break;
 542         case RDS_RECVERR:
 543                 ret = rdsv3_set_bool_option(&rs->rs_recverr,
 544                     (char *)optval, optlen);
 545                 break;
 546         case RDS_CONG_MONITOR:
 547                 ret = rdsv3_cong_monitor(rs, (char *)optval, optlen);
 548                 break;
 549         case SO_SNDBUF:
 550                 sk->sk_sndbuf = *(uint_t *)optval;
 551                 return (ret);
 552         case SO_RCVBUF:
 553                 sk->sk_rcvbuf = *(uint_t *)optval;
 554                 return (ret);
 555         default:
 556 #if 1
 557                 break;
 558 #else
 559                 ret = -ENOPROTOOPT;
 560 #endif
 561         }
 562 out:
 563         return (ret);
 564 }
 565 
 566 /* XXX */
 567 /*ARGSUSED*/
 568 static int
 569 rdsv3_getsockopt(sock_lower_handle_t proto_handle, int level,
 570     int optname, void *optval, socklen_t *optlen, cred_t *cr)
 571 {
 572         struct rsock *sk = (struct rsock *)proto_handle;
 573         struct rdsv3_sock       *rs = rdsv3_sk_to_rs(sk);
 574         int ret = 0;
 575 
 576         RDSV3_DPRINTF4("rdsv3_getsockopt", "enter(%p %d %d)",
 577             rs, optname, *optlen);
 578 
 579         switch (optname) {
 580         case SO_SNDBUF:
 581                 RDSV3_DPRINTF4("rdsv3_getsockopt", "SO_SNDBUF(%d)",
 582                     sk->sk_sndbuf);
 583                 if (*optlen != 0) {
 584                         *((int *)optval) = sk->sk_sndbuf;
 585                         *optlen = sizeof (uint_t);
 586                 }
 587                 return (ret);
 588         case SO_RCVBUF:
 589                 RDSV3_DPRINTF4("rdsv3_getsockopt", "SO_RCVBUF(%d)",
 590                     sk->sk_rcvbuf);
 591                 if (*optlen != 0) {
 592                         *((int *)optval) = sk->sk_rcvbuf;
 593                         *optlen = sizeof (uint_t);
 594                 }
 595                 return (ret);
 596         case RDS_RECVERR:
 597                 RDSV3_DPRINTF4("rdsv3_getsockopt", "RDSV3_RECVERR(%d)",
 598                     rs->rs_recverr);
 599                 if (*optlen < sizeof (int))
 600                         return (-EINVAL);
 601                 else {
 602                         *(int *)optval = rs->rs_recverr;
 603                         *optlen = sizeof (int);
 604                 }
 605                 return (0);
 606         default:
 607                 RDSV3_DPRINTF2("rdsv3_getsockopt",
 608                     "Unknown: level: %d optname: %d", level, optname);
 609                 ret = -ENOPROTOOPT;
 610         }
 611 
 612         RDSV3_DPRINTF4("rdsv3_getsockopt", "return(%p %d %d)",
 613             rs, optname, ret);
 614         return (ret);
 615 }
 616 
 617 /*ARGSUSED*/
 618 static int rdsv3_connect(sock_lower_handle_t proto_handle,
 619     const struct sockaddr *addr, socklen_t addr_len, sock_connid_t *conn,
 620     cred_t *cr)
 621 {
 622         struct rsock *sk = (struct rsock *)proto_handle;
 623         struct sockaddr_in *sin = (struct sockaddr_in *)addr;
 624         struct rdsv3_sock       *rs = rdsv3_sk_to_rs(sk);
 625         int ret = 0;
 626 
 627         RDSV3_DPRINTF4("rdsv3_connect", "Enter(rs: %p)", rs);
 628 
 629         mutex_enter(&sk->sk_lock);
 630 
 631         if (addr_len != sizeof (struct sockaddr_in)) {
 632                 ret = -EINVAL;
 633                 goto out;
 634         }
 635 
 636         if (sin->sin_family != AF_INET_OFFLOAD) {
 637                 ret = -EAFNOSUPPORT;
 638                 goto out;
 639         }
 640 
 641         if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
 642                 ret = -EDESTADDRREQ;
 643                 goto out;
 644         }
 645 
 646         rs->rs_conn_addr = sin->sin_addr.s_addr;
 647         rs->rs_conn_port = sin->sin_port;
 648 
 649         sk->sk_upcalls->su_connected(sk->sk_upper_handle, 0, NULL, -1);
 650 
 651         RDSV3_DPRINTF4("rdsv3_connect", "Return(rs: %p)", rs);
 652 
 653 out:
 654         mutex_exit(&sk->sk_lock);
 655         return (ret);
 656 }
 657 
 658 /*ARGSUSED*/
 659 static int
 660 rdsv3_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
 661 {
 662         struct rsock *sk = (struct rsock *)proto_handle;
 663         struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
 664 
 665         RDSV3_DPRINTF4("rdsv3_shutdown", "Enter(rs: %p)", rs);
 666 
 667         return (0);
 668 }
 669 
 670 /*ARGSUSED*/
 671 void
 672 rdsv3_activate(sock_lower_handle_t proto_handle,
 673     sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls,
 674     int flags, cred_t *cr)
 675 {
 676         struct rsock *sk = (struct rsock *)proto_handle;
 677         struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
 678 
 679         RDSV3_DPRINTF4("rdsv3_activate", "Enter(rs: %p)", rs);
 680 
 681         sk->sk_upcalls = sock_upcalls;
 682         sk->sk_upper_handle = sock_handle;
 683 
 684         RDSV3_DPRINTF4("rdsv3_activate", "Return (rs: %p)", rs);
 685 }
 686 
 687 
 688 /* ARGSUSED */
 689 int
 690 rdsv3_send_uio(sock_lower_handle_t proto_handle, uio_t *uio,
 691     struct nmsghdr *msg, cred_t *cr)
 692 {
 693         struct rsock *sk = (struct rsock *)proto_handle;
 694         struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
 695         int ret;
 696 
 697         RDSV3_DPRINTF4("rdsv3_send_uio", "Enter(rs: %p)", rs);
 698         ret = rdsv3_sendmsg(rs, uio, msg, uio->uio_resid);
 699 
 700         RDSV3_DPRINTF4("rdsv3_send_uio", "Return(rs: %p ret %d)", rs, ret);
 701         if (ret < 0) {
 702                 return (-ret);
 703         }
 704 
 705         return (0);
 706 }
 707 
 708 /* ARGSUSED */
 709 int
 710 rdsv3_recv_uio(sock_lower_handle_t proto_handle, uio_t *uio,
 711     struct nmsghdr *msg, cred_t *cr)
 712 {
 713         struct rsock *sk = (struct rsock *)proto_handle;
 714         struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
 715         int ret;
 716 
 717         RDSV3_DPRINTF4("rdsv3_recv_uio", "Enter (rs: %p)", rs);
 718         ret = rdsv3_recvmsg(rs, uio, msg, uio->uio_resid, msg->msg_flags);
 719 
 720         RDSV3_DPRINTF4("rdsv3_recv_uio", "Return(rs: %p ret %d)", rs, ret);
 721 
 722         if (ret < 0) {
 723                 return (-ret);
 724         }
 725 
 726         return (0);
 727 }
 728 
 729 /*ARGSUSED*/
 730 int
 731 rdsv3_getpeername(sock_lower_handle_t  proto_handle, struct sockaddr *addr,
 732     socklen_t *addr_len, cred_t *cr)
 733 {
 734         struct sockaddr_in *sin = (struct sockaddr_in *)addr;
 735         struct rsock *sk = (struct rsock *)proto_handle;
 736         struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
 737 
 738         RDSV3_DPRINTF2("rdsv3_getpeername", "enter(rs: %p)", rs);
 739 
 740         (void) memset(sin->sin_zero, 0, sizeof (sin->sin_zero));
 741 
 742         /* racey, don't care */
 743         if (!rs->rs_conn_addr)
 744                 return (-ENOTCONN);
 745 
 746         sin->sin_port = rs->rs_conn_port;
 747         sin->sin_addr.s_addr = rs->rs_conn_addr;
 748 
 749         sin->sin_family = AF_INET_OFFLOAD;
 750 
 751         *addr_len = sizeof (*sin);
 752         return (0);
 753 }
 754 
 755 void
 756 rdsv3_clrflowctrl(sock_lower_handle_t proto_handle)
 757 {
 758         struct rsock *sk = (struct rsock *)proto_handle;
 759         struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk);
 760 
 761         RDSV3_DPRINTF2("rdsv3_clrflowctrl", "enter(rs: %p)", rs);
 762 }
 763 
 764 static struct sock_downcalls_s rdsv3_sock_downcalls = {
 765         .sd_close =             rdsv3_release,
 766         .sd_bind =              rdsv3_bind,
 767         .sd_connect =           rdsv3_connect,
 768         .sd_accept =            NULL,
 769         .sd_getsockname =       rdsv3_getname,
 770         .sd_poll =              rdsv3_poll,
 771         .sd_ioctl =             rdsv3_ioctl,
 772         .sd_listen =            NULL,
 773         .sd_shutdown =          rdsv3_shutdown,
 774         .sd_setsockopt =        rdsv3_setsockopt,
 775         .sd_getsockopt =        rdsv3_getsockopt,
 776         .sd_send_uio =          rdsv3_send_uio,
 777         .sd_recv_uio =          rdsv3_recv_uio,
 778         .sd_activate =          rdsv3_activate,
 779         .sd_getpeername =       rdsv3_getpeername,
 780         .sd_send =              NULL,
 781         .sd_clr_flowctrl =      NULL
 782 };
 783 
 784 sock_lower_handle_t
 785 rdsv3_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
 786     uint_t *smodep, int *errorp, int flags, cred_t *credp)
 787 {
 788         struct rdsv3_sock       *rs;
 789         struct rsock            *sk;
 790 
 791         RDSV3_DPRINTF4("rdsv3_create", "Enter (family: %d type: %d, proto: %d "
 792             "flags: %d", family, type, proto, flags);
 793 
 794         sk = rdsv3_sk_alloc();
 795         if (sk == NULL)
 796                 return (NULL);
 797         rdsv3_sock_init_data(sk);
 798 
 799         rs = rdsv3_sk_to_rs(sk);
 800         rs->rs_sk = sk;
 801         mutex_init(&rs->rs_lock, NULL, MUTEX_DRIVER, NULL);
 802         rw_init(&rs->rs_recv_lock, NULL, RW_DRIVER, NULL);
 803         list_create(&rs->rs_send_queue, sizeof (struct rdsv3_message),
 804             offsetof(struct rdsv3_message, m_sock_item));
 805         list_create(&rs->rs_recv_queue, sizeof (struct rdsv3_incoming),
 806             offsetof(struct rdsv3_incoming, i_item));
 807         list_create(&rs->rs_notify_queue, sizeof (struct rdsv3_notifier),
 808             offsetof(struct rdsv3_notifier, n_list));
 809         mutex_init(&rs->rs_rdma_lock, NULL, MUTEX_DRIVER, NULL);
 810         avl_create(&rs->rs_rdma_keys, rdsv3_mr_compare,
 811             sizeof (struct rdsv3_mr), offsetof(struct rdsv3_mr, r_rb_node));
 812         mutex_init(&rs->rs_conn_lock, NULL, MUTEX_DRIVER, NULL);
 813         mutex_init(&rs->rs_congested_lock, NULL, MUTEX_DRIVER, NULL);
 814         cv_init(&rs->rs_congested_cv, NULL, CV_DRIVER, NULL);
 815         rs->rs_cred = credp;
 816         rs->rs_zoneid = getzoneid();
 817         crhold(credp);
 818 
 819         mutex_enter(&rdsv3_sock_lock);
 820         list_insert_tail(&rdsv3_sock_list, rs);
 821         rdsv3_sock_count++;
 822         /* Initialize RDMA/IB on the 1st socket if not done at attach */
 823         if (rdsv3_sock_count == 1) {
 824                 rdsv3_rdma_init();
 825         }
 826         mutex_exit(&rdsv3_sock_lock);
 827 
 828         *errorp = 0;
 829         *smodep = SM_ATOMIC;
 830         *sock_downcalls = &rdsv3_sock_downcalls;
 831 
 832         RDSV3_DPRINTF4("rdsv3_create", "Return: %p", rs);
 833 
 834         return ((sock_lower_handle_t)rdsv3_rs_to_sk(rs));
 835 }
 836 
 837 void
 838 rdsv3_sock_addref(struct rdsv3_sock *rs)
 839 {
 840         RDSV3_DPRINTF4("rdsv3_sock_addref", "Enter(rs: %p)", rs);
 841         rdsv3_sk_sock_hold(rdsv3_rs_to_sk(rs));
 842 }
 843 
 844 void
 845 rdsv3_sock_put(struct rdsv3_sock *rs)
 846 {
 847         RDSV3_DPRINTF4("rdsv3_sock_put", "Enter(rs: %p)", rs);
 848         rdsv3_sk_sock_put(rdsv3_rs_to_sk(rs));
 849 }
 850 
 851 static void
 852 rdsv3_sock_inc_info(struct rsock *sock, unsigned int len,
 853     struct rdsv3_info_iterator *iter, struct rdsv3_info_lengths *lens)
 854 {
 855         struct rdsv3_sock *rs;
 856         struct rdsv3_incoming *inc;
 857         unsigned int total = 0;
 858 
 859         RDSV3_DPRINTF4("rdsv3_sock_inc_info", "Enter(rs: %p)",
 860             rdsv3_sk_to_rs(sock));
 861 
 862         len /= sizeof (struct rds_info_message);
 863 
 864         mutex_enter(&rdsv3_sock_lock);
 865 
 866         RDSV3_FOR_EACH_LIST_NODE(rs, &rdsv3_sock_list, rs_item) {
 867                 rw_enter(&rs->rs_recv_lock, RW_READER);
 868 
 869                 /* XXX too lazy to maintain counts.. */
 870                 RDSV3_FOR_EACH_LIST_NODE(inc, &rs->rs_recv_queue, i_item) {
 871                         total++;
 872                         if (total <= len)
 873                                 rdsv3_inc_info_copy(inc, iter, inc->i_saddr,
 874                                     rs->rs_bound_addr, 1);
 875                 }
 876 
 877                 rw_exit(&rs->rs_recv_lock);
 878         }
 879 
 880         mutex_exit(&rdsv3_sock_lock);
 881 
 882         lens->nr = total;
 883         lens->each = sizeof (struct rds_info_message);
 884 
 885         RDSV3_DPRINTF4("rdsv3_sock_inc_info", "return(rs: %p)",
 886             rdsv3_sk_to_rs(sock));
 887 }
 888 
 889 static void
 890 rdsv3_sock_info(struct rsock *sock, unsigned int len,
 891     struct rdsv3_info_iterator *iter, struct rdsv3_info_lengths *lens)
 892 {
 893         struct rds_info_socket sinfo;
 894         struct rdsv3_sock *rs;
 895         unsigned long bytes;
 896 
 897         RDSV3_DPRINTF4("rdsv3_sock_info", "Enter(rs: %p)",
 898             rdsv3_sk_to_rs(sock));
 899 
 900         len /= sizeof (struct rds_info_socket);
 901 
 902         mutex_enter(&rdsv3_sock_lock);
 903 
 904         if ((len < rdsv3_sock_count) || (iter->addr == NULL))
 905                 goto out;
 906 
 907         bytes = sizeof (struct rds_info_socket);
 908         RDSV3_FOR_EACH_LIST_NODE(rs, &rdsv3_sock_list, rs_item) {
 909                 sinfo.sndbuf = rdsv3_sk_sndbuf(rs);
 910                 sinfo.rcvbuf = rdsv3_sk_rcvbuf(rs);
 911                 sinfo.bound_addr = rs->rs_bound_addr;
 912                 sinfo.connected_addr = rs->rs_conn_addr;
 913                 sinfo.bound_port = rs->rs_bound_port;
 914                 sinfo.connected_port = rs->rs_conn_port;
 915 
 916                 rdsv3_info_copy(iter, &sinfo, bytes);
 917         }
 918 
 919         RDSV3_DPRINTF4("rdsv3_sock_info", "Return(rs: %p)",
 920             rdsv3_sk_to_rs(sock));
 921 
 922 out:
 923         lens->nr = rdsv3_sock_count;
 924         lens->each = sizeof (struct rds_info_socket);
 925 
 926         mutex_exit(&rdsv3_sock_lock);
 927 }
 928 
/*
 * Delayed-work item for the deferred RDMA initialisation.  rdsv3_init()
 * allocates it and queues it on rdsv3_wq; rdsv3_exit() cancels and frees it.
 */
rdsv3_delayed_work_t    *rdsv3_rdma_dwp = NULL;
/* Delay handed to rdsv3_queue_delayed_work() when queueing the item above. */
uint_t                  rdsv3_rdma_init_delay = 5; /* secs */
/* Handler installed on rdsv3_rdma_dwp by rdsv3_init(). */
extern void rdsv3_rdma_init_worker(struct rdsv3_work_s *work);
 932 
 933 void
 934 rdsv3_exit(void)
 935 {
 936         RDSV3_DPRINTF4("rdsv3_exit", "Enter");
 937 
 938         if (rdsv3_rdma_dwp) {
 939                 rdsv3_cancel_delayed_work(rdsv3_rdma_dwp);
 940         }
 941 
 942         (void) ddi_taskq_dispatch(rdsv3_taskq, rdsv3_rdma_exit,
 943             NULL, DDI_SLEEP);
 944         while (rdsv3_rdma_listen_id != NULL) {
 945                 RDSV3_DPRINTF5("rdsv3", "%s-%d Waiting for rdsv3_rdma_exit",
 946                     __func__, __LINE__);
 947                 delay(drv_usectohz(1000));
 948         }
 949 
 950         rdsv3_conn_exit();
 951         rdsv3_cong_exit();
 952         rdsv3_sysctl_exit();
 953         rdsv3_threads_exit();
 954         rdsv3_stats_exit();
 955         rdsv3_info_deregister_func(RDS_INFO_SOCKETS, rdsv3_sock_info);
 956         rdsv3_info_deregister_func(RDS_INFO_RECV_MESSAGES,
 957             rdsv3_sock_inc_info);
 958 
 959         if (rdsv3_rdma_dwp) {
 960                 kmem_free(rdsv3_rdma_dwp, sizeof (rdsv3_delayed_work_t));
 961                 rdsv3_rdma_dwp = NULL;
 962         }
 963 
 964         RDSV3_DPRINTF4("rdsv3_exit", "Return");
 965 }
 966 
 967 /*ARGSUSED*/
 968 int
 969 rdsv3_init()
 970 {
 971         int ret;
 972 
 973         RDSV3_DPRINTF4("rdsv3_init", "Enter");
 974 
 975         rdsv3_cong_init();
 976 
 977         ret = rdsv3_conn_init();
 978         if (ret)
 979                 goto out;
 980         ret = rdsv3_threads_init();
 981         if (ret)
 982                 goto out_conn;
 983         ret = rdsv3_sysctl_init();
 984         if (ret)
 985                 goto out_threads;
 986         ret = rdsv3_stats_init();
 987         if (ret)
 988                 goto out_sysctl;
 989 
 990         rdsv3_info_register_func(RDS_INFO_SOCKETS, rdsv3_sock_info);
 991         rdsv3_info_register_func(RDS_INFO_RECV_MESSAGES, rdsv3_sock_inc_info);
 992 
 993         /* rdsv3_rdma_init need to be called with a little delay */
 994         rdsv3_rdma_dwp = kmem_zalloc(sizeof (rdsv3_delayed_work_t), KM_SLEEP);
 995         RDSV3_INIT_DELAYED_WORK(rdsv3_rdma_dwp, rdsv3_rdma_init_worker);
 996         rdsv3_queue_delayed_work(rdsv3_wq, rdsv3_rdma_dwp,
 997             rdsv3_rdma_init_delay);
 998 
 999         RDSV3_DPRINTF4("rdsv3_init", "Return");
1000 
1001         goto out;
1002 
1003 out_stats:
1004         rdsv3_stats_exit();
1005 out_sysctl:
1006         rdsv3_sysctl_exit();
1007 out_threads:
1008         rdsv3_threads_exit();
1009 out_conn:
1010         rdsv3_conn_exit();
1011         rdsv3_cong_exit();
1012 out:
1013         return (ret);
1014 }