1 /* 2 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 3 */ 4 5 /* 6 * This file contains code imported from the OFED rds source file af_rds.c 7 * Oracle elects to have and use the contents of af_rds.c under and governed 8 * by the OpenIB.org BSD license (see below for full license text). However, 9 * the following notice accompanied the original version of this file: 10 */ 11 12 /* 13 * Copyright (c) 2006 Oracle. All rights reserved. 14 * 15 * This software is available to you under a choice of one of two 16 * licenses. You may choose to be licensed under the terms of the GNU 17 * General Public License (GPL) Version 2, available from the file 18 * COPYING in the main directory of this source tree, or the 19 * OpenIB.org BSD license below: 20 * 21 * Redistribution and use in source and binary forms, with or 22 * without modification, are permitted provided that the following 23 * conditions are met: 24 * 25 * - Redistributions of source code must retain the above 26 * copyright notice, this list of conditions and the following 27 * disclaimer. 28 * 29 * - Redistributions in binary form must reproduce the above 30 * copyright notice, this list of conditions and the following 31 * disclaimer in the documentation and/or other materials 32 * provided with the distribution. 33 * 34 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 35 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 36 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 37 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 38 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 39 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 40 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 41 * SOFTWARE. 42 * 43 */ 44 #include <sys/types.h> 45 #include <sys/stat.h> 46 #include <sys/conf.h> 47 #include <sys/ddi.h> 48 #include <sys/sunddi.h> 49 #include <sys/modctl.h> 50 #include <sys/rds.h> 51 #include <sys/stropts.h> 52 #include <sys/socket.h> 53 #include <sys/socketvar.h> 54 #include <sys/sockio.h> 55 #include <sys/sysmacros.h> 56 57 #include <inet/ip.h> 58 #include <net/if_types.h> 59 60 #include <sys/ib/clients/rdsv3/rdsv3.h> 61 #include <sys/ib/clients/rdsv3/rdma.h> 62 #include <sys/ib/clients/rdsv3/rdma_transport.h> 63 #include <sys/ib/clients/rdsv3/rdsv3_debug.h> 64 65 extern void rdsv3_remove_bound(struct rdsv3_sock *rds); 66 extern int rdsv3_verify_bind_address(ipaddr_t addr); 67 68 extern ddi_taskq_t *rdsv3_taskq; 69 extern struct rdma_cm_id *rdsv3_rdma_listen_id; 70 71 /* this is just used for stats gathering :/ */ 72 kmutex_t rdsv3_sock_lock; 73 static unsigned long rdsv3_sock_count; 74 list_t rdsv3_sock_list; 75 76 /* 77 * This is called as the final descriptor referencing this socket is closed. 78 * We have to unbind the socket so that another socket can be bound to the 79 * address it was using. 80 * 81 * We have to be careful about racing with the incoming path. sock_orphan() 82 * sets SOCK_DEAD and we use that as an indicator to the rx path that new 83 * messages shouldn't be queued. 84 */ 85 /* ARGSUSED */ 86 static int 87 rdsv3_release(sock_lower_handle_t proto_handle, int flgs, cred_t *cr) 88 { 89 struct rsock *sk = (struct rsock *)proto_handle; 90 struct rdsv3_sock *rs; 91 92 if (!sk) 93 goto out; 94 95 rs = rdsv3_sk_to_rs(sk); 96 RDSV3_DPRINTF4("rdsv3_release", "Enter(rs: %p, sk: %p)", rs, sk); 97 98 rdsv3_sk_sock_orphan(sk); 99 rdsv3_cong_remove_socket(rs); 100 rdsv3_remove_bound(rs); 101 102 /* 103 * Note - rdsv3_clear_recv_queue grabs rs_recv_lock, so 104 * that ensures the recv path has completed messing 105 * with the socket. 106 * 107 * Note2 - rdsv3_clear_recv_queue(rs) should be called first 108 * to prevent some race conditions, which is different from 109 * the Linux code. 110 */ 111 rdsv3_clear_recv_queue(rs); 112 rdsv3_send_drop_to(rs, NULL); 113 rdsv3_rdma_drop_keys(rs); 114 (void) rdsv3_notify_queue_get(rs, NULL); 115 116 mutex_enter(&rdsv3_sock_lock); 117 list_remove_node(&rs->rs_item); 118 rdsv3_sock_count--; 119 mutex_exit(&rdsv3_sock_lock); 120 121 while (sk->sk_refcount > 1) { 122 /* wait for 1 sec and try again */ 123 delay(drv_usectohz(1000000)); 124 } 125 126 /* this will free the rs and sk */ 127 rdsv3_sk_sock_put(sk); 128 129 RDSV3_DPRINTF4("rdsv3_release", "Return (rds: %p)", rs); 130 out: 131 return (0); 132 } 133 134 void 135 __rdsv3_wake_sk_sleep(struct rsock *sk) 136 { 137 /* wakup anyone waiting in recvmsg */ 138 if (!rdsv3_sk_sock_flag(sk, SOCK_DEAD) && sk->sk_sleep) 139 rdsv3_wake_up(sk->sk_sleep); 140 } 141 142 /* 143 * Careful not to race with rdsv3_release -> sock_orphan which clears sk_sleep. 144 * _bh() isn't OK here, we're called from interrupt handlers. It's probably OK 145 * to wake the waitqueue after sk_sleep is clear as we hold a sock ref, but 146 * this seems more conservative. 147 * NB - normally, one would use sk_callback_lock for this, but we can 148 * get here from interrupts, whereas the network code grabs sk_callback_lock 149 * with _lock_bh only - so relying on sk_callback_lock introduces livelocks. 150 */ 151 void 152 rdsv3_wake_sk_sleep(struct rdsv3_sock *rs) 153 { 154 RDSV3_DPRINTF4("rdsv3_wake_sk_sleep", "Enter(rs: %p)", rs); 155 156 rw_enter(&rs->rs_recv_lock, RW_READER); 157 __rdsv3_wake_sk_sleep(rdsv3_rs_to_sk(rs)); 158 rw_exit(&rs->rs_recv_lock); 159 } 160 161 /*ARGSUSED*/ 162 static int 163 rdsv3_getname(sock_lower_handle_t proto_handle, struct sockaddr *addr, 164 socklen_t *addr_len, cred_t *cr) 165 { 166 struct rsock *sk = (struct rsock *)proto_handle; 167 struct sockaddr_in *sin = (struct sockaddr_in *)addr; 168 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 169 170 RDSV3_DPRINTF4("rdsv3_getname", "Enter(rs: %p, port: %d)", rs, 171 rs->rs_bound_port); 172 173 sin->sin_port = rs->rs_bound_port; 174 sin->sin_addr.s_addr = rs->rs_bound_addr; 175 176 sin->sin_family = AF_INET_OFFLOAD; 177 178 *addr_len = sizeof (*sin); 179 return (0); 180 } 181 182 /* 183 * RDS' poll is without a doubt the least intuitive part of the interface, 184 * as POLLIN and POLLOUT do not behave entirely as you would expect from 185 * a network protocol. 186 * 187 * POLLIN is asserted if 188 * - there is data on the receive queue. 189 * - to signal that a previously congested destination may have become 190 * uncongested 191 * - A notification has been queued to the socket (this can be a congestion 192 * update, or a RDMA completion). 193 * 194 * POLLOUT is asserted if there is room on the send queue. This does not mean 195 * however, that the next sendmsg() call will succeed. If the application tries 196 * to send to a congested destination, the system call may still fail (and 197 * return ENOBUFS). 198 */ 199 /* ARGSUSED */ 200 static short 201 rdsv3_poll(sock_lower_handle_t proto_handle, short events, int anyyet, 202 cred_t *cr) 203 { 204 struct rsock *sk = (struct rsock *)proto_handle; 205 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 206 unsigned short mask = 0; 207 208 #if 0 209 RDSV3_DPRINTF4("rdsv3_poll", "enter(%p %x %d)", rs, events, anyyet); 210 #endif 211 212 /* 213 * If rs_seen_congestion is on, wait until it's off. 214 * This is implemented for the following OFED code. 215 * if (rs->rs_seen_congestion) 216 * poll_wait(file, &rds_poll_waitq, wait); 217 */ 218 mutex_enter(&rs->rs_congested_lock); 219 while (rs->rs_seen_congestion) { 220 cv_wait(&rs->rs_congested_cv, 221 &rs->rs_congested_lock); 222 } 223 mutex_exit(&rs->rs_congested_lock); 224 225 rw_enter(&rs->rs_recv_lock, RW_READER); 226 if (!rs->rs_cong_monitor) { 227 /* 228 * When a congestion map was updated, we signal POLLIN for 229 * "historical" reasons. Applications can also poll for 230 * WRBAND instead. 231 */ 232 if (rdsv3_cong_updated_since(&rs->rs_cong_track)) 233 mask |= (POLLIN | POLLRDNORM | POLLWRBAND); 234 } else { 235 mutex_enter(&rs->rs_lock); 236 if (rs->rs_cong_notify) 237 mask |= (POLLIN | POLLRDNORM); 238 mutex_exit(&rs->rs_lock); 239 } 240 if (!list_is_empty(&rs->rs_recv_queue) || 241 !list_is_empty(&rs->rs_notify_queue)) 242 mask |= (POLLIN | POLLRDNORM); 243 if (rs->rs_snd_bytes < rdsv3_sk_sndbuf(rs)) 244 mask |= (POLLOUT | POLLWRNORM); 245 246 /* clear state any time we wake a seen-congested socket */ 247 if (mask) { 248 mutex_enter(&rs->rs_congested_lock); 249 rs->rs_seen_congestion = 0; 250 mutex_exit(&rs->rs_congested_lock); 251 } 252 253 rw_exit(&rs->rs_recv_lock); 254 255 #if 0 256 RDSV3_DPRINTF4("rdsv3_poll", "return(%p %x)", rs, mask); 257 #endif 258 259 return (mask); 260 } 261 262 /* ARGSUSED */ 263 static int 264 rdsv3_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg, 265 int mode, int32_t *rvalp, cred_t *cr) 266 { 267 ksocket_t so4; 268 struct lifconf lifc; 269 struct lifreq lifr, *lifrp; 270 struct ifconf ifc; 271 struct ifreq ifr; 272 int rval = 0, rc, len; 273 int numifs; 274 int bufsize; 275 void *buf; 276 277 RDSV3_DPRINTF4("rdsv3_ioctl", "enter: cmd: %d", cmd); 278 279 /* Only ipv4 for now */ 280 rval = ksocket_socket(&so4, PF_INET, SOCK_DGRAM, 0, KSOCKET_NOSLEEP, 281 CRED()); 282 if (rval != 0) { 283 RDSV3_DPRINTF2("rdsv3_ioctl", "ksocket_socket returned %d", 284 rval); 285 return (rval); 286 } 287 288 switch (cmd) { 289 case SIOCGLIFNUM : 290 case SIOCGIFNUM : 291 rval = rdsv3_do_ip_ioctl(so4, &buf, &bufsize, &numifs); 292 if (rval != 0) break; 293 if (cmd == SIOCGLIFNUM) { 294 struct lifnum lifn; 295 lifn.lifn_family = AF_INET_OFFLOAD; 296 lifn.lifn_flags = 0; 297 lifn.lifn_count = numifs; 298 (void) ddi_copyout(&lifn, (void *)arg, 299 sizeof (struct lifnum), 0); 300 } else { 301 len = 0; 302 for (lifrp = (struct lifreq *)buf, rc = 0; rc < numifs; 303 rc++, lifrp++) { 304 if (strlen(lifrp->lifr_name) <= IFNAMSIZ) { 305 len++; 306 } 307 } 308 (void) ddi_copyout(&len, (void *)arg, 309 sizeof (int), 0); 310 } 311 kmem_free(buf, bufsize); 312 break; 313 314 case SIOCGLIFCONF : 315 if (ddi_copyin((void *)arg, &lifc, sizeof (struct lifconf), 0) 316 != 0) { 317 RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed lifc"); 318 rval = EFAULT; 319 break; 320 } 321 322 rval = rdsv3_do_ip_ioctl(so4, &buf, &bufsize, &numifs); 323 if (rval != 0) { 324 RDSV3_DPRINTF2("rdsv3_ioctl", 325 "rdsv3_do_ip_ioctl failed: %d", rval); 326 break; 327 } 328 329 if ((lifc.lifc_len > 0) && (numifs > 0)) { 330 if (ddi_copyout(buf, (void *)lifc.lifc_req, 331 (lifc.lifc_len < bufsize) ? lifc.lifc_len : 332 bufsize, 0) != 0) { 333 RDSV3_DPRINTF2("rdsv3_ioctl", 334 "copyout of records failed"); 335 rval = EFAULT; 336 } 337 338 } 339 340 lifc.lifc_len = bufsize; 341 if (ddi_copyout(&lifc, (void *)arg, sizeof (struct lifconf), 342 0) != 0) { 343 RDSV3_DPRINTF2("rdsv3_ioctl", 344 "copyout of lifconf failed"); 345 rval = EFAULT; 346 } 347 348 kmem_free(buf, bufsize); 349 break; 350 351 case SIOCGIFCONF : 352 case O_SIOCGIFCONF : 353 if (ddi_copyin((void *)arg, &ifc, sizeof (struct ifconf), 0) 354 != 0) { 355 RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed ifc"); 356 rval = EFAULT; 357 break; 358 } 359 360 RDSV3_DPRINTF2("rdsv3_ioctl", 361 "O_SIOCGIFCONF: ifc_len: %d, req: %p", 362 ifc.ifc_len, ifc.ifc_req); 363 364 rval = rdsv3_do_ip_ioctl_old(so4, &buf, &bufsize, &numifs); 365 if (rval != 0) { 366 RDSV3_DPRINTF2("rdsv3_ioctl", 367 "rdsv3_do_ip_ioctl_old failed: %d", rval); 368 break; 369 } 370 371 if ((ifc.ifc_len > 0) && (numifs > 0)) { 372 if (ddi_copyout(buf, (void *)ifc.ifc_req, 373 (ifc.ifc_len < bufsize) ? ifc.ifc_len : 374 bufsize, 0) != 0) { 375 RDSV3_DPRINTF2("rdsv3_ioctl", 376 "copyout of records failed"); 377 rval = EFAULT; 378 } 379 380 } 381 382 ifc.ifc_len = bufsize; 383 if (ddi_copyout(&ifc, (void *)arg, sizeof (struct ifconf), 384 0) != 0) { 385 RDSV3_DPRINTF2("rdsv3_ioctl", 386 "copyout of ifconf failed"); 387 rval = EFAULT; 388 } 389 390 kmem_free(buf, bufsize); 391 break; 392 393 case SIOCGLIFFLAGS : 394 case SIOCSLIFFLAGS : 395 case SIOCGLIFMTU : 396 case SIOCGLIFNETMASK : 397 case SIOCGLIFINDEX : 398 if (ddi_copyin((void *)arg, &lifr, sizeof (struct lifreq), 0) 399 != 0) { 400 RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed lifr"); 401 rval = EFAULT; 402 break; 403 } 404 405 rc = ksocket_ioctl(so4, cmd, (intptr_t)&lifr, &rval, CRED()); 406 if (rc != 0) { 407 RDSV3_DPRINTF2("rdsv3_ioctl", 408 "ksocket_ioctl failed: %d, name: %s cmd: 0x%x", 409 rc, lifr.lifr_name, cmd); 410 break; 411 } 412 413 (void) ddi_copyout(&lifr, (void *)arg, 414 sizeof (struct lifreq), 0); 415 break; 416 417 case SIOCGIFFLAGS : 418 case SIOCSIFFLAGS : 419 case SIOCGIFMTU : 420 case SIOCGIFNETMASK : 421 case SIOCGIFINDEX : 422 if (ddi_copyin((void *)arg, &ifr, sizeof (struct ifreq), 0) 423 != 0) { 424 RDSV3_DPRINTF2("rdsv3_ioctl", "ddi_copyin failed ifr"); 425 rval = EFAULT; 426 break; 427 } 428 429 RDSV3_DPRINTF2("rdsv3_ioctl", "1. name: %s", ifr.ifr_name); 430 431 rc = ksocket_ioctl(so4, cmd, (intptr_t)&ifr, &rval, CRED()); 432 if (rc != 0) { 433 RDSV3_DPRINTF2("rdsv3_ioctl", 434 "ksocket_ioctl failed: %d, name: %s cmd: 0x%x", 435 rc, ifr.ifr_name, cmd); 436 437 break; 438 } 439 440 RDSV3_DPRINTF2("rdsv3_ioctl", "2. name: %s", ifr.ifr_name); 441 442 (void) ddi_copyout(&ifr, (void *)arg, 443 sizeof (struct ifreq), 0); 444 break; 445 446 default: 447 if ((cmd >= RDS_INFO_FIRST) && 448 (cmd <= RDS_INFO_LAST)) { 449 return (rdsv3_info_ioctl((struct rsock *)proto_handle, 450 cmd, (char *)arg, rvalp)); 451 } 452 RDSV3_DPRINTF2("rdsv3_ioctl", "Unknown ioctl cmd: %d", cmd); 453 cmn_err(CE_CONT, "unsupported IOCTL cmd: %d \n", cmd); 454 rval = EOPNOTSUPP; 455 } 456 457 (void) ksocket_close(so4, CRED()); 458 459 RDSV3_DPRINTF4("rdsv3_ioctl", "return: %d cmd: %d", rval, cmd); 460 461 *rvalp = rval; 462 return (rval); 463 } 464 465 static int 466 rdsv3_cancel_sent_to(struct rdsv3_sock *rs, char *optval, int len) 467 { 468 struct sockaddr_in sin; 469 470 /* racing with another thread binding seems ok here */ 471 if (rs->rs_bound_addr == 0) 472 return (-ENOTCONN); /* XXX not a great errno */ 473 474 if (len < sizeof (struct sockaddr_in)) 475 return (-EINVAL); 476 477 if (ddi_copyin((void *)optval, &sin, sizeof (struct sockaddr_in), 478 0) != 0) { 479 RDSV3_DPRINTF2("rdsv3_cancel_sent_to", "ddi_copyin failed sin"); 480 return (-EFAULT); 481 } 482 483 rdsv3_send_drop_to(rs, &sin); 484 485 return (0); 486 } 487 488 static int 489 rdsv3_set_bool_option(unsigned char *optvar, char *optval, int optlen) 490 { 491 int value = *optval; 492 493 if (optlen < sizeof (int)) 494 return (-EINVAL); 495 *optvar = !!value; 496 return (0); 497 } 498 499 static int 500 rdsv3_cong_monitor(struct rdsv3_sock *rs, char *optval, int optlen) 501 { 502 int ret; 503 504 ret = rdsv3_set_bool_option(&rs->rs_cong_monitor, optval, optlen); 505 if (ret == 0) { 506 if (rs->rs_cong_monitor) { 507 rdsv3_cong_add_socket(rs); 508 } else { 509 rdsv3_cong_remove_socket(rs); 510 rs->rs_cong_mask = 0; 511 rs->rs_cong_notify = 0; 512 } 513 } 514 return (ret); 515 } 516 517 /*ARGSUSED*/ 518 static int 519 rdsv3_setsockopt(sock_lower_handle_t proto_handle, int level, 520 int optname, const void *optval, socklen_t optlen, cred_t *cr) 521 { 522 struct rsock *sk = (struct rsock *)proto_handle; 523 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 524 int ret = 0; 525 526 RDSV3_DPRINTF4("rdsv3_setsockopt", "enter(%p %d %d)", 527 rs, level, optname); 528 529 switch (optname) { 530 case RDS_CANCEL_SENT_TO: 531 ret = rdsv3_cancel_sent_to(rs, (char *)optval, optlen); 532 break; 533 case RDS_GET_MR: 534 ret = rdsv3_get_mr(rs, optval, optlen); 535 break; 536 case RDS_GET_MR_FOR_DEST: 537 ret = rdsv3_get_mr_for_dest(rs, optval, optlen); 538 break; 539 case RDS_FREE_MR: 540 ret = rdsv3_free_mr(rs, optval, optlen); 541 break; 542 case RDS_RECVERR: 543 ret = rdsv3_set_bool_option(&rs->rs_recverr, 544 (char *)optval, optlen); 545 break; 546 case RDS_CONG_MONITOR: 547 ret = rdsv3_cong_monitor(rs, (char *)optval, optlen); 548 break; 549 case SO_SNDBUF: 550 sk->sk_sndbuf = *(uint_t *)optval; 551 return (ret); 552 case SO_RCVBUF: 553 sk->sk_rcvbuf = *(uint_t *)optval; 554 return (ret); 555 default: 556 #if 1 557 break; 558 #else 559 ret = -ENOPROTOOPT; 560 #endif 561 } 562 out: 563 return (ret); 564 } 565 566 /* XXX */ 567 /*ARGSUSED*/ 568 static int 569 rdsv3_getsockopt(sock_lower_handle_t proto_handle, int level, 570 int optname, void *optval, socklen_t *optlen, cred_t *cr) 571 { 572 struct rsock *sk = (struct rsock *)proto_handle; 573 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 574 int ret = 0; 575 576 RDSV3_DPRINTF4("rdsv3_getsockopt", "enter(%p %d %d)", 577 rs, optname, *optlen); 578 579 switch (optname) { 580 case SO_SNDBUF: 581 RDSV3_DPRINTF4("rdsv3_getsockopt", "SO_SNDBUF(%d)", 582 sk->sk_sndbuf); 583 if (*optlen != 0) { 584 *((int *)optval) = sk->sk_sndbuf; 585 *optlen = sizeof (uint_t); 586 } 587 return (ret); 588 case SO_RCVBUF: 589 RDSV3_DPRINTF4("rdsv3_getsockopt", "SO_RCVBUF(%d)", 590 sk->sk_rcvbuf); 591 if (*optlen != 0) { 592 *((int *)optval) = sk->sk_rcvbuf; 593 *optlen = sizeof (uint_t); 594 } 595 return (ret); 596 case RDS_RECVERR: 597 RDSV3_DPRINTF4("rdsv3_getsockopt", "RDSV3_RECVERR(%d)", 598 rs->rs_recverr); 599 if (*optlen < sizeof (int)) 600 return (-EINVAL); 601 else { 602 *(int *)optval = rs->rs_recverr; 603 *optlen = sizeof (int); 604 } 605 return (0); 606 default: 607 RDSV3_DPRINTF2("rdsv3_getsockopt", 608 "Unknown: level: %d optname: %d", level, optname); 609 ret = -ENOPROTOOPT; 610 } 611 612 RDSV3_DPRINTF4("rdsv3_getsockopt", "return(%p %d %d)", 613 rs, optname, ret); 614 return (ret); 615 } 616 617 /*ARGSUSED*/ 618 static int rdsv3_connect(sock_lower_handle_t proto_handle, 619 const struct sockaddr *addr, socklen_t addr_len, sock_connid_t *conn, 620 cred_t *cr) 621 { 622 struct rsock *sk = (struct rsock *)proto_handle; 623 struct sockaddr_in *sin = (struct sockaddr_in *)addr; 624 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 625 int ret = 0; 626 627 RDSV3_DPRINTF4("rdsv3_connect", "Enter(rs: %p)", rs); 628 629 mutex_enter(&sk->sk_lock); 630 631 if (addr_len != sizeof (struct sockaddr_in)) { 632 ret = -EINVAL; 633 goto out; 634 } 635 636 if (sin->sin_family != AF_INET_OFFLOAD) { 637 ret = -EAFNOSUPPORT; 638 goto out; 639 } 640 641 if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) { 642 ret = -EDESTADDRREQ; 643 goto out; 644 } 645 646 rs->rs_conn_addr = sin->sin_addr.s_addr; 647 rs->rs_conn_port = sin->sin_port; 648 649 sk->sk_upcalls->su_connected(sk->sk_upper_handle, 0, NULL, -1); 650 651 RDSV3_DPRINTF4("rdsv3_connect", "Return(rs: %p)", rs); 652 653 out: 654 mutex_exit(&sk->sk_lock); 655 return (ret); 656 } 657 658 /*ARGSUSED*/ 659 static int 660 rdsv3_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr) 661 { 662 struct rsock *sk = (struct rsock *)proto_handle; 663 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 664 665 RDSV3_DPRINTF4("rdsv3_shutdown", "Enter(rs: %p)", rs); 666 667 return (0); 668 } 669 670 /*ARGSUSED*/ 671 void 672 rdsv3_activate(sock_lower_handle_t proto_handle, 673 sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls, 674 int flags, cred_t *cr) 675 { 676 struct rsock *sk = (struct rsock *)proto_handle; 677 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 678 679 RDSV3_DPRINTF4("rdsv3_activate", "Enter(rs: %p)", rs); 680 681 sk->sk_upcalls = sock_upcalls; 682 sk->sk_upper_handle = sock_handle; 683 684 RDSV3_DPRINTF4("rdsv3_activate", "Return (rs: %p)", rs); 685 } 686 687 688 /* ARGSUSED */ 689 int 690 rdsv3_send_uio(sock_lower_handle_t proto_handle, uio_t *uio, 691 struct nmsghdr *msg, cred_t *cr) 692 { 693 struct rsock *sk = (struct rsock *)proto_handle; 694 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 695 int ret; 696 697 RDSV3_DPRINTF4("rdsv3_send_uio", "Enter(rs: %p)", rs); 698 ret = rdsv3_sendmsg(rs, uio, msg, uio->uio_resid); 699 700 RDSV3_DPRINTF4("rdsv3_send_uio", "Return(rs: %p ret %d)", rs, ret); 701 if (ret < 0) { 702 return (-ret); 703 } 704 705 return (0); 706 } 707 708 /* ARGSUSED */ 709 int 710 rdsv3_recv_uio(sock_lower_handle_t proto_handle, uio_t *uio, 711 struct nmsghdr *msg, cred_t *cr) 712 { 713 struct rsock *sk = (struct rsock *)proto_handle; 714 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 715 int ret; 716 717 RDSV3_DPRINTF4("rdsv3_recv_uio", "Enter (rs: %p)", rs); 718 ret = rdsv3_recvmsg(rs, uio, msg, uio->uio_resid, msg->msg_flags); 719 720 RDSV3_DPRINTF4("rdsv3_recv_uio", "Return(rs: %p ret %d)", rs, ret); 721 722 if (ret < 0) { 723 return (-ret); 724 } 725 726 return (0); 727 } 728 729 /*ARGSUSED*/ 730 int 731 rdsv3_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr, 732 socklen_t *addr_len, cred_t *cr) 733 { 734 struct sockaddr_in *sin = (struct sockaddr_in *)addr; 735 struct rsock *sk = (struct rsock *)proto_handle; 736 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 737 738 RDSV3_DPRINTF2("rdsv3_getpeername", "enter(rs: %p)", rs); 739 740 (void) memset(sin->sin_zero, 0, sizeof (sin->sin_zero)); 741 742 /* racey, don't care */ 743 if (!rs->rs_conn_addr) 744 return (-ENOTCONN); 745 746 sin->sin_port = rs->rs_conn_port; 747 sin->sin_addr.s_addr = rs->rs_conn_addr; 748 749 sin->sin_family = AF_INET_OFFLOAD; 750 751 *addr_len = sizeof (*sin); 752 return (0); 753 } 754 755 void 756 rdsv3_clrflowctrl(sock_lower_handle_t proto_handle) 757 { 758 struct rsock *sk = (struct rsock *)proto_handle; 759 struct rdsv3_sock *rs = rdsv3_sk_to_rs(sk); 760 761 RDSV3_DPRINTF2("rdsv3_clrflowctrl", "enter(rs: %p)", rs); 762 } 763 764 static struct sock_downcalls_s rdsv3_sock_downcalls = { 765 .sd_close = rdsv3_release, 766 .sd_bind = rdsv3_bind, 767 .sd_connect = rdsv3_connect, 768 .sd_accept = NULL, 769 .sd_getsockname = rdsv3_getname, 770 .sd_poll = rdsv3_poll, 771 .sd_ioctl = rdsv3_ioctl, 772 .sd_listen = NULL, 773 .sd_shutdown = rdsv3_shutdown, 774 .sd_setsockopt = rdsv3_setsockopt, 775 .sd_getsockopt = rdsv3_getsockopt, 776 .sd_send_uio = rdsv3_send_uio, 777 .sd_recv_uio = rdsv3_recv_uio, 778 .sd_activate = rdsv3_activate, 779 .sd_getpeername = rdsv3_getpeername, 780 .sd_send = NULL, 781 .sd_clr_flowctrl = NULL 782 }; 783 784 sock_lower_handle_t 785 rdsv3_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, 786 uint_t *smodep, int *errorp, int flags, cred_t *credp) 787 { 788 struct rdsv3_sock *rs; 789 struct rsock *sk; 790 791 RDSV3_DPRINTF4("rdsv3_create", "Enter (family: %d type: %d, proto: %d " 792 "flags: %d", family, type, proto, flags); 793 794 sk = rdsv3_sk_alloc(); 795 if (sk == NULL) 796 return (NULL); 797 rdsv3_sock_init_data(sk); 798 799 rs = rdsv3_sk_to_rs(sk); 800 rs->rs_sk = sk; 801 mutex_init(&rs->rs_lock, NULL, MUTEX_DRIVER, NULL); 802 rw_init(&rs->rs_recv_lock, NULL, RW_DRIVER, NULL); 803 list_create(&rs->rs_send_queue, sizeof (struct rdsv3_message), 804 offsetof(struct rdsv3_message, m_sock_item)); 805 list_create(&rs->rs_recv_queue, sizeof (struct rdsv3_incoming), 806 offsetof(struct rdsv3_incoming, i_item)); 807 list_create(&rs->rs_notify_queue, sizeof (struct rdsv3_notifier), 808 offsetof(struct rdsv3_notifier, n_list)); 809 mutex_init(&rs->rs_rdma_lock, NULL, MUTEX_DRIVER, NULL); 810 avl_create(&rs->rs_rdma_keys, rdsv3_mr_compare, 811 sizeof (struct rdsv3_mr), offsetof(struct rdsv3_mr, r_rb_node)); 812 mutex_init(&rs->rs_conn_lock, NULL, MUTEX_DRIVER, NULL); 813 mutex_init(&rs->rs_congested_lock, NULL, MUTEX_DRIVER, NULL); 814 cv_init(&rs->rs_congested_cv, NULL, CV_DRIVER, NULL); 815 rs->rs_cred = credp; 816 rs->rs_zoneid = getzoneid(); 817 crhold(credp); 818 819 mutex_enter(&rdsv3_sock_lock); 820 list_insert_tail(&rdsv3_sock_list, rs); 821 rdsv3_sock_count++; 822 /* Initialize RDMA/IB on the 1st socket if not done at attach */ 823 if (rdsv3_sock_count == 1) { 824 rdsv3_rdma_init(); 825 } 826 mutex_exit(&rdsv3_sock_lock); 827 828 *errorp = 0; 829 *smodep = SM_ATOMIC; 830 *sock_downcalls = &rdsv3_sock_downcalls; 831 832 RDSV3_DPRINTF4("rdsv3_create", "Return: %p", rs); 833 834 return ((sock_lower_handle_t)rdsv3_rs_to_sk(rs)); 835 } 836 837 void 838 rdsv3_sock_addref(struct rdsv3_sock *rs) 839 { 840 RDSV3_DPRINTF4("rdsv3_sock_addref", "Enter(rs: %p)", rs); 841 rdsv3_sk_sock_hold(rdsv3_rs_to_sk(rs)); 842 } 843 844 void 845 rdsv3_sock_put(struct rdsv3_sock *rs) 846 { 847 RDSV3_DPRINTF4("rdsv3_sock_put", "Enter(rs: %p)", rs); 848 rdsv3_sk_sock_put(rdsv3_rs_to_sk(rs)); 849 } 850 851 static void 852 rdsv3_sock_inc_info(struct rsock *sock, unsigned int len, 853 struct rdsv3_info_iterator *iter, struct rdsv3_info_lengths *lens) 854 { 855 struct rdsv3_sock *rs; 856 struct rdsv3_incoming *inc; 857 unsigned int total = 0; 858 859 RDSV3_DPRINTF4("rdsv3_sock_inc_info", "Enter(rs: %p)", 860 rdsv3_sk_to_rs(sock)); 861 862 len /= sizeof (struct rds_info_message); 863 864 mutex_enter(&rdsv3_sock_lock); 865 866 RDSV3_FOR_EACH_LIST_NODE(rs, &rdsv3_sock_list, rs_item) { 867 rw_enter(&rs->rs_recv_lock, RW_READER); 868 869 /* XXX too lazy to maintain counts.. */ 870 RDSV3_FOR_EACH_LIST_NODE(inc, &rs->rs_recv_queue, i_item) { 871 total++; 872 if (total <= len) 873 rdsv3_inc_info_copy(inc, iter, inc->i_saddr, 874 rs->rs_bound_addr, 1); 875 } 876 877 rw_exit(&rs->rs_recv_lock); 878 } 879 880 mutex_exit(&rdsv3_sock_lock); 881 882 lens->nr = total; 883 lens->each = sizeof (struct rds_info_message); 884 885 RDSV3_DPRINTF4("rdsv3_sock_inc_info", "return(rs: %p)", 886 rdsv3_sk_to_rs(sock)); 887 } 888 889 static void 890 rdsv3_sock_info(struct rsock *sock, unsigned int len, 891 struct rdsv3_info_iterator *iter, struct rdsv3_info_lengths *lens) 892 { 893 struct rds_info_socket sinfo; 894 struct rdsv3_sock *rs; 895 unsigned long bytes; 896 897 RDSV3_DPRINTF4("rdsv3_sock_info", "Enter(rs: %p)", 898 rdsv3_sk_to_rs(sock)); 899 900 len /= sizeof (struct rds_info_socket); 901 902 mutex_enter(&rdsv3_sock_lock); 903 904 if ((len < rdsv3_sock_count) || (iter->addr == NULL)) 905 goto out; 906 907 bytes = sizeof (struct rds_info_socket); 908 RDSV3_FOR_EACH_LIST_NODE(rs, &rdsv3_sock_list, rs_item) { 909 sinfo.sndbuf = rdsv3_sk_sndbuf(rs); 910 sinfo.rcvbuf = rdsv3_sk_rcvbuf(rs); 911 sinfo.bound_addr = rs->rs_bound_addr; 912 sinfo.connected_addr = rs->rs_conn_addr; 913 sinfo.bound_port = rs->rs_bound_port; 914 sinfo.connected_port = rs->rs_conn_port; 915 916 rdsv3_info_copy(iter, &sinfo, bytes); 917 } 918 919 RDSV3_DPRINTF4("rdsv3_sock_info", "Return(rs: %p)", 920 rdsv3_sk_to_rs(sock)); 921 922 out: 923 lens->nr = rdsv3_sock_count; 924 lens->each = sizeof (struct rds_info_socket); 925 926 mutex_exit(&rdsv3_sock_lock); 927 } 928 929 rdsv3_delayed_work_t *rdsv3_rdma_dwp = NULL; 930 uint_t rdsv3_rdma_init_delay = 5; /* secs */ 931 extern void rdsv3_rdma_init_worker(struct rdsv3_work_s *work); 932 933 void 934 rdsv3_exit(void) 935 { 936 RDSV3_DPRINTF4("rdsv3_exit", "Enter"); 937 938 if (rdsv3_rdma_dwp) { 939 rdsv3_cancel_delayed_work(rdsv3_rdma_dwp); 940 } 941 942 (void) ddi_taskq_dispatch(rdsv3_taskq, rdsv3_rdma_exit, 943 NULL, DDI_SLEEP); 944 while (rdsv3_rdma_listen_id != NULL) { 945 RDSV3_DPRINTF5("rdsv3", "%s-%d Waiting for rdsv3_rdma_exit", 946 __func__, __LINE__); 947 delay(drv_usectohz(1000)); 948 } 949 950 rdsv3_conn_exit(); 951 rdsv3_cong_exit(); 952 rdsv3_sysctl_exit(); 953 rdsv3_threads_exit(); 954 rdsv3_stats_exit(); 955 rdsv3_info_deregister_func(RDS_INFO_SOCKETS, rdsv3_sock_info); 956 rdsv3_info_deregister_func(RDS_INFO_RECV_MESSAGES, 957 rdsv3_sock_inc_info); 958 959 if (rdsv3_rdma_dwp) { 960 kmem_free(rdsv3_rdma_dwp, sizeof (rdsv3_delayed_work_t)); 961 rdsv3_rdma_dwp = NULL; 962 } 963 964 RDSV3_DPRINTF4("rdsv3_exit", "Return"); 965 } 966 967 /*ARGSUSED*/ 968 int 969 rdsv3_init() 970 { 971 int ret; 972 973 RDSV3_DPRINTF4("rdsv3_init", "Enter"); 974 975 rdsv3_cong_init(); 976 977 ret = rdsv3_conn_init(); 978 if (ret) 979 goto out; 980 ret = rdsv3_threads_init(); 981 if (ret) 982 goto out_conn; 983 ret = rdsv3_sysctl_init(); 984 if (ret) 985 goto out_threads; 986 ret = rdsv3_stats_init(); 987 if (ret) 988 goto out_sysctl; 989 990 rdsv3_info_register_func(RDS_INFO_SOCKETS, rdsv3_sock_info); 991 rdsv3_info_register_func(RDS_INFO_RECV_MESSAGES, rdsv3_sock_inc_info); 992 993 /* rdsv3_rdma_init need to be called with a little delay */ 994 rdsv3_rdma_dwp = kmem_zalloc(sizeof (rdsv3_delayed_work_t), KM_SLEEP); 995 RDSV3_INIT_DELAYED_WORK(rdsv3_rdma_dwp, rdsv3_rdma_init_worker); 996 rdsv3_queue_delayed_work(rdsv3_wq, rdsv3_rdma_dwp, 997 rdsv3_rdma_init_delay); 998 999 RDSV3_DPRINTF4("rdsv3_init", "Return"); 1000 1001 goto out; 1002 1003 out_stats: 1004 rdsv3_stats_exit(); 1005 out_sysctl: 1006 rdsv3_sysctl_exit(); 1007 out_threads: 1008 rdsv3_threads_exit(); 1009 out_conn: 1010 rdsv3_conn_exit(); 1011 rdsv3_cong_exit(); 1012 out: 1013 return (ret); 1014 }