/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
 */

/*
 * Copyright (c) 2007, The Ohio State University. All rights reserved.
 *
 * Portions of this source code are developed by the team members of
 * The Ohio State University's Network-Based Computing Laboratory (NBCL),
 * headed by Professor Dhabaleswar K. (DK) Panda.
 *
 * Acknowledgements for contributions from developers:
 *    Ranjit Noronha: noronha@cse.ohio-state.edu
 *    Lei Chai      : chail@cse.ohio-state.edu
 *    Weikuan Yu    : yuw@cse.ohio-state.edu
 *
 */

/*
 * The rpcib plugin. Implements the interface for RDMATF's
 * interaction with IBTF.
 */

#include <sys/param.h>
#include <sys/types.h>
#include <sys/user.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/file.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/stropts.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/pathname.h>
#include <sys/kstat.h>
#include <sys/t_lock.h>
#include <sys/ddi.h>
#include <sys/cmn_err.h>
#include <sys/time.h>
#include <sys/isa_defs.h>
#include <sys/callb.h>
#include <sys/sunddi.h>
#include <sys/sunndi.h>
#include <sys/sdt.h>
#include <sys/ib/ibtl/ibti.h>
#include <rpc/rpc.h>
#include <rpc/ib.h>
#include <sys/modctl.h>
#include <sys/kstr.h>
#include <sys/sockio.h>
#include <sys/vnode.h>
#include <sys/tiuser.h>
#include <net/if.h>
#include <net/if_types.h>
#include <sys/cred.h>
#include <rpc/rpc_rdma.h>
#include <nfs/nfs.h>
#include <sys/atomic.h>

#define NFS_RDMA_PORT 20049


/*
 * Convenience structures for connection management
 */
typedef struct rpcib_ipaddrs {
    void *ri_list;      /* pointer to list of addresses */
    uint_t ri_count;    /* number of addresses in list */
    uint_t ri_size;     /* size of ri_list in bytes */
} rpcib_ipaddrs_t;


typedef struct rpcib_ping {
    rib_hca_t *hca;
    ibt_path_info_t path;
    ibt_ip_addr_t srcip;
    ibt_ip_addr_t dstip;
} rpcib_ping_t;

/*
 * Prototype declarations for driver ops
 */
static int rpcib_attach(dev_info_t *, ddi_attach_cmd_t);
static int rpcib_getinfo(dev_info_t *, ddi_info_cmd_t,
    void *, void **);
static int rpcib_detach(dev_info_t *, ddi_detach_cmd_t);
static boolean_t rpcib_rdma_capable_interface(struct lifreq *);
static int rpcib_do_ip_ioctl(int, int, void *);
static boolean_t rpcib_get_ib_addresses(rpcib_ipaddrs_t *,
rpcib_ipaddrs_t *); 115 static int rpcib_cache_kstat_update(kstat_t *, int); 116 static void rib_force_cleanup(void *); 117 static void rib_stop_hca_services(rib_hca_t *); 118 static void rib_attach_hca(void); 119 static int rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr, 120 struct netbuf *d_svcaddr, CONN **conn); 121 122 struct { 123 kstat_named_t cache_limit; 124 kstat_named_t cache_allocation; 125 kstat_named_t cache_hits; 126 kstat_named_t cache_misses; 127 kstat_named_t cache_misses_above_the_limit; 128 } rpcib_kstat = { 129 {"cache_limit", KSTAT_DATA_UINT64 }, 130 {"cache_allocation", KSTAT_DATA_UINT64 }, 131 {"cache_hits", KSTAT_DATA_UINT64 }, 132 {"cache_misses", KSTAT_DATA_UINT64 }, 133 {"cache_misses_above_the_limit", KSTAT_DATA_UINT64 }, 134 }; 135 136 /* rpcib cb_ops */ 137 static struct cb_ops rpcib_cbops = { 138 nulldev, /* open */ 139 nulldev, /* close */ 140 nodev, /* strategy */ 141 nodev, /* print */ 142 nodev, /* dump */ 143 nodev, /* read */ 144 nodev, /* write */ 145 nodev, /* ioctl */ 146 nodev, /* devmap */ 147 nodev, /* mmap */ 148 nodev, /* segmap */ 149 nochpoll, /* poll */ 150 ddi_prop_op, /* prop_op */ 151 NULL, /* stream */ 152 D_MP, /* cb_flag */ 153 CB_REV, /* rev */ 154 nodev, /* int (*cb_aread)() */ 155 nodev /* int (*cb_awrite)() */ 156 }; 157 158 /* 159 * Device options 160 */ 161 static struct dev_ops rpcib_ops = { 162 DEVO_REV, /* devo_rev, */ 163 0, /* refcnt */ 164 rpcib_getinfo, /* info */ 165 nulldev, /* identify */ 166 nulldev, /* probe */ 167 rpcib_attach, /* attach */ 168 rpcib_detach, /* detach */ 169 nodev, /* reset */ 170 &rpcib_cbops, /* driver ops - devctl interfaces */ 171 NULL, /* bus operations */ 172 NULL, /* power */ 173 ddi_quiesce_not_needed, /* quiesce */ 174 }; 175 176 /* 177 * Module linkage information. 178 */ 179 180 static struct modldrv rib_modldrv = { 181 &mod_driverops, /* Driver module */ 182 "RPCIB plugin driver", /* Driver name and version */ 183 &rpcib_ops, /* Driver ops */ 184 }; 185 186 static struct modlinkage rib_modlinkage = { 187 MODREV_1, 188 { (void *)&rib_modldrv, NULL } 189 }; 190 191 typedef struct rib_lrc_entry { 192 struct rib_lrc_entry *forw; 193 struct rib_lrc_entry *back; 194 char *lrc_buf; 195 196 uint32_t lrc_len; 197 void *avl_node; 198 bool_t registered; 199 200 struct mrc lrc_mhandle; 201 bool_t lrc_on_freed_list; 202 } rib_lrc_entry_t; 203 204 typedef struct cache_struct { 205 rib_lrc_entry_t r; 206 uint32_t len; 207 uint32_t elements; 208 kmutex_t node_lock; 209 avl_node_t avl_link; 210 } cache_avl_struct_t; 211 212 uint64_t cache_limit = 100 * 1024 * 1024; 213 static uint64_t cache_watermark = 80 * 1024 * 1024; 214 static bool_t stats_enabled = FALSE; 215 216 static uint64_t max_unsignaled_rws = 5; 217 int nfs_rdma_port = NFS_RDMA_PORT; 218 219 #define RIBNETID_TCP "tcp" 220 #define RIBNETID_TCP6 "tcp6" 221 222 /* 223 * rib_stat: private data pointer used when registering 224 * with the IBTF. It is returned to the consumer 225 * in all callbacks. 226 */ 227 static rpcib_state_t *rib_stat = NULL; 228 229 #define RNR_RETRIES IBT_RNR_RETRY_1 230 #define MAX_PORTS 2 231 #define RDMA_DUMMY_WRID 0x4D3A1D4D3A1D 232 #define RDMA_CONN_REAP_RETRY 10 /* 10 secs */ 233 234 int preposted_rbufs = RDMA_BUFS_GRANT; 235 int send_threshold = 1; 236 237 /* 238 * Old cards with Tavor driver have limited memory footprint 239 * when booted in 32bit. The rib_max_rbufs tunable can be 240 * tuned for more buffers if needed. 
 */

#if !defined(_ELF64) && !defined(__sparc)
int rib_max_rbufs = MAX_BUFS;
#else
int rib_max_rbufs = 10 * MAX_BUFS;
#endif  /* !(_ELF64) && !(__sparc) */

int rib_conn_timeout = 60 * 12;    /* 12 minutes */
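
/*
 * Illustrative note (not part of the original source): like other globals
 * in this module, the tunables above can be overridden at boot time via
 * /etc/system. The values shown are hypothetical examples only.
 *
 *    set rpcib:rib_max_rbufs = 0x2000
 *    set rpcib:rib_conn_timeout = 360
 */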

/*
 * State of the plugin.
 * ACCEPT = accepting new connections and requests.
 * NO_ACCEPT = not accepting new connections and requests.
 * This should eventually move to rpcib_state_t structure, since this
 * will tell in which state the plugin is for a particular type of service
 * like NFS, NLM or v4 Callback daemon. The plugin might be in accept
 * state for one and in no_accept state for the other.
 */
int plugin_state;
kmutex_t plugin_state_lock;

ldi_ident_t rpcib_li;

/*
 * RPCIB RDMATF operations
 */
static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle);
static rdma_stat rib_disconnect(CONN *conn);
static void rib_listen(struct rdma_svc_data *rd);
static void rib_listen_stop(struct rdma_svc_data *rd);
static rdma_stat rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf,
    uint_t buflen, struct mrc *buf_handle);
static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf,
    struct mrc buf_handle);
static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp,
    caddr_t buf, uint_t buflen, struct mrc *buf_handle);
static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf,
    struct mrc buf_handle);
static rdma_stat rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf,
    uint_t buflen, struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle,
    void *lrc);
static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf,
    struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *);
static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle,
    caddr_t buf, int len, int cpu);

static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf);

static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf);
static void *rib_rbuf_alloc(CONN *, rdma_buf_t *);

static void rib_rbuf_free(CONN *conn, int ptype, void *buf);

static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid);
static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid);
static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid);
static rdma_stat rib_post_resp_remove(CONN *conn, uint32_t msgid);
static rdma_stat rib_post_recv(CONN *conn, struct clist *cl);
static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid);
static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait);
static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait);
static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rpcib_ping_t *);
static rdma_stat rib_conn_get(struct netbuf *, struct netbuf *,
    int addr_type, void *, CONN **);
static rdma_stat rib_conn_release(CONN *conn);
static rdma_stat rib_connect(struct netbuf *, struct netbuf *, int,
    rpcib_ping_t *, CONN **);
static rdma_stat rib_getinfo(rdma_info_t *info);

static rib_lrc_entry_t *rib_get_cache_buf(CONN *conn, uint32_t len);
static void rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *buf);
static void rib_destroy_cache(rib_hca_t *hca);
static void rib_server_side_cache_reclaim(void *argp);
static int avl_compare(const void *t1, const void *t2);

static void rib_stop_services(rib_hca_t *);
static void rib_close_channels(rib_conn_list_t *);
static void rib_conn_close(void *);
static void rib_recv_rele(rib_qp_t *);
static rdma_stat rib_conn_release_locked(CONN *conn);

/*
 * RPCIB addressing operations
 */

/*
 * RDMA operations the RPCIB module exports
 */
static rdmaops_t rib_ops = {
    rib_reachable,
    rib_conn_get,
    rib_conn_release,
    rib_listen,
    rib_listen_stop,
    rib_registermem,
    rib_deregistermem,
    rib_registermemsync,
    rib_deregistermemsync,
    rib_syncmem,
    rib_reg_buf_alloc,
    rib_reg_buf_free,
    rib_send,
    rib_send_resp,
    rib_post_resp,
    rib_post_resp_remove,
    rib_post_recv,
    rib_recv,
    rib_read,
    rib_write,
    rib_getinfo,
};

/*
 * RDMATF RPCIB plugin details
 */
static rdma_mod_t rib_mod = {
    "ibtf",         /* api name */
    RDMATF_VERS_1,
    0,
    &rib_ops,       /* rdma op vector for ibtf */
};

static rdma_stat rpcib_open_hcas(rpcib_state_t *);
static rdma_stat rib_qp_init(rib_qp_t *, int);
static void rib_svc_scq_handler(ibt_cq_hdl_t, void *);
static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *);
static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *);
static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *);
static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num);
static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t adsp, caddr_t, uint_t,
    ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_mr_desc_t *);
static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t,
    ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t);
static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, rpcib_ping_t *);
static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *,
    rib_qp_t **);
static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t,
    rib_qp_t **);
static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *);
static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *);
static int rib_free_sendwait(struct send_wid *);
static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid);
static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd);
static void rdma_done_rem_list(rib_qp_t *);
static void rdma_done_notify(rib_qp_t *qp, uint32_t xid);

static void rib_async_handler(void *,
    ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *);
static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *);
static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *);
static int rib_free_svc_recv(struct svc_recv *);
static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t);
static void rib_free_wid(struct recv_wid *);
static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *);
static void rib_detach_hca(ibt_hca_hdl_t);
static void rib_close_a_channel(CONN *);
static void rib_send_hold(rib_qp_t *);
static void rib_send_rele(rib_qp_t *);

/*
 * Registration with IBTF as a consumer
 */
static struct ibt_clnt_modinfo_s rib_modinfo = {
    IBTI_V_CURR,
    IBT_GENERIC,
    rib_async_handler,  /* async event handler */
    NULL,               /* Memory Region Handler */
    "nfs/ib"
};

/*
 * Global structure
 */

typedef struct rpcib_s {
    dev_info_t *rpcib_dip;
    kmutex_t rpcib_mutex;
} rpcib_t;

rpcib_t rpcib;

/*
 * /etc/system controlled variable to control
 * debugging in rpcib kernel module.
 * Set it to a value greater than 1 to control
 * the amount of debugging output produced.
 */
int rib_debug = 0;
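
/*
 * Illustrative example (not part of the original source): rib_debug can be
 * set persistently with "set rpcib:rib_debug = 2" in /etc/system, or patched
 * on a running kernel with mdb, e.g. (value shown is hypothetical):
 *
 *    # echo 'rpcib`rib_debug/W 2' | mdb -kw
 */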

int
_init(void)
{
    int error;

    error = mod_install((struct modlinkage *)&rib_modlinkage);
    if (error != 0) {
        /*
         * Could not load module
         */
        return (error);
    }
    mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL);
    return (0);
}

int
_fini()
{
    int status;

    /*
     * Remove module
     */
    if ((status = mod_remove(&rib_modlinkage)) != 0) {
        return (status);
    }
    mutex_destroy(&plugin_state_lock);
    return (0);
}

int
_info(struct modinfo *modinfop)
{
    return (mod_info(&rib_modlinkage, modinfop));
}

/*
 * rpcib_getinfo()
 * Given the device number, return the devinfo pointer or the
 * instance number.
 * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach.
 */

/*ARGSUSED*/
static int
rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
{
    int ret = DDI_SUCCESS;

    switch (cmd) {
    case DDI_INFO_DEVT2DEVINFO:
        if (rpcib.rpcib_dip != NULL)
            *result = rpcib.rpcib_dip;
        else {
            *result = NULL;
            ret = DDI_FAILURE;
        }
        break;

    case DDI_INFO_DEVT2INSTANCE:
        *result = NULL;
        break;

    default:
        ret = DDI_FAILURE;
    }
    return (ret);
}

static void
rpcib_free_hca_list()
{
    rib_hca_t *hca, *hcap;

    rw_enter(&rib_stat->hcas_list_lock, RW_WRITER);
    hca = rib_stat->hcas_list;
    rib_stat->hcas_list = NULL;
    rw_exit(&rib_stat->hcas_list_lock);
    while (hca != NULL) {
        rw_enter(&hca->state_lock, RW_WRITER);
        hcap = hca;
        hca = hca->next;
        rib_stat->nhca_inited--;
        rib_mod.rdma_count--;
        hcap->state = HCA_DETACHED;
        rw_exit(&hcap->state_lock);
        rib_stop_hca_services(hcap);

        kmem_free(hcap, sizeof (*hcap));
    }
}

static rdma_stat
rpcib_free_service_list()
{
    rib_service_t *service;
    ibt_status_t ret;

    rw_enter(&rib_stat->service_list_lock, RW_WRITER);
    while (rib_stat->service_list != NULL) {
        service = rib_stat->service_list;
        ret = ibt_unbind_all_services(service->srv_hdl);
        if (ret != IBT_SUCCESS) {
            rw_exit(&rib_stat->service_list_lock);
#ifdef DEBUG
            cmn_err(CE_NOTE, "rpcib_free_service_list: "
                "ibt_unbind_all_services failed (%d)\n", (int)ret);
#endif
            return (RDMA_FAILED);
        }
        ret = ibt_deregister_service(rib_stat->ibt_clnt_hdl,
            service->srv_hdl);
        if (ret != IBT_SUCCESS) {
            rw_exit(&rib_stat->service_list_lock);
#ifdef DEBUG
            cmn_err(CE_NOTE, "rpcib_free_service_list: "
                "ibt_deregister_service failed (%d)\n", (int)ret);
#endif
            return (RDMA_FAILED);
        }
        rib_stat->service_list = service->next;
        kmem_free(service, sizeof (rib_service_t));
    }
    rw_exit(&rib_stat->service_list_lock);

    return (RDMA_SUCCESS);
}

static int
rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
    ibt_status_t ibt_status;
    rdma_stat r_status;

    switch (cmd) {
    case DDI_ATTACH:
        break;
    case DDI_RESUME:
        return (DDI_SUCCESS);
    default:
        return (DDI_FAILURE);
    }

    mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL);

    mutex_enter(&rpcib.rpcib_mutex);
    if (rpcib.rpcib_dip != NULL) {
        mutex_exit(&rpcib.rpcib_mutex);
        return (DDI_FAILURE);
    }
    rpcib.rpcib_dip = dip;
mutex_exit(&rpcib.rpcib_mutex); 584 /* 585 * Create the "rpcib" minor-node. 586 */ 587 if (ddi_create_minor_node(dip, 588 "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) { 589 /* Error message, no cmn_err as they print on console */ 590 return (DDI_FAILURE); 591 } 592 593 if (rib_stat == NULL) { 594 rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP); 595 mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL); 596 rw_init(&rib_stat->hcas_list_lock, NULL, RW_DRIVER, NULL); 597 mutex_init(&rib_stat->listen_lock, NULL, MUTEX_DRIVER, NULL); 598 } 599 600 rib_stat->hca_count = ibt_get_hca_list(NULL); 601 if (rib_stat->hca_count < 1) { 602 mutex_destroy(&rib_stat->listen_lock); 603 rw_destroy(&rib_stat->hcas_list_lock); 604 mutex_destroy(&rib_stat->open_hca_lock); 605 kmem_free(rib_stat, sizeof (*rib_stat)); 606 rib_stat = NULL; 607 return (DDI_FAILURE); 608 } 609 610 ibt_status = ibt_attach(&rib_modinfo, dip, 611 (void *)rib_stat, &rib_stat->ibt_clnt_hdl); 612 613 if (ibt_status != IBT_SUCCESS) { 614 mutex_destroy(&rib_stat->listen_lock); 615 rw_destroy(&rib_stat->hcas_list_lock); 616 mutex_destroy(&rib_stat->open_hca_lock); 617 kmem_free(rib_stat, sizeof (*rib_stat)); 618 rib_stat = NULL; 619 return (DDI_FAILURE); 620 } 621 622 rib_stat->service_list = NULL; 623 rw_init(&rib_stat->service_list_lock, NULL, RW_DRIVER, NULL); 624 mutex_enter(&rib_stat->open_hca_lock); 625 if (rpcib_open_hcas(rib_stat) != RDMA_SUCCESS) { 626 mutex_exit(&rib_stat->open_hca_lock); 627 goto open_fail; 628 } 629 mutex_exit(&rib_stat->open_hca_lock); 630 631 if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) != 632 DDI_PROP_SUCCESS) { 633 cmn_err(CE_WARN, "rpcib_attach: ddi-no-autodetach prop update " 634 "failed."); 635 goto register_fail; 636 } 637 638 /* 639 * Register with rdmatf 640 */ 641 r_status = rdma_register_mod(&rib_mod); 642 if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) { 643 cmn_err(CE_WARN, "rpcib_attach:rdma_register_mod failed, " 644 "status = %d", r_status); 645 goto register_fail; 646 } 647 648 return (DDI_SUCCESS); 649 650 register_fail: 651 652 open_fail: 653 (void) ibt_detach(rib_stat->ibt_clnt_hdl); 654 rpcib_free_hca_list(); 655 (void) rpcib_free_service_list(); 656 mutex_destroy(&rib_stat->listen_lock); 657 rw_destroy(&rib_stat->hcas_list_lock); 658 mutex_destroy(&rib_stat->open_hca_lock); 659 rw_destroy(&rib_stat->service_list_lock); 660 kmem_free(rib_stat, sizeof (*rib_stat)); 661 rib_stat = NULL; 662 return (DDI_FAILURE); 663 } 664 665 /*ARGSUSED*/ 666 static int 667 rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 668 { 669 switch (cmd) { 670 671 case DDI_DETACH: 672 break; 673 674 case DDI_SUSPEND: 675 default: 676 return (DDI_FAILURE); 677 } 678 679 /* 680 * Detach the hca and free resources 681 */ 682 mutex_enter(&plugin_state_lock); 683 plugin_state = NO_ACCEPT; 684 mutex_exit(&plugin_state_lock); 685 686 if (rpcib_free_service_list() != RDMA_SUCCESS) 687 return (DDI_FAILURE); 688 rpcib_free_hca_list(); 689 690 (void) ibt_detach(rib_stat->ibt_clnt_hdl); 691 mutex_destroy(&rib_stat->listen_lock); 692 rw_destroy(&rib_stat->hcas_list_lock); 693 mutex_destroy(&rib_stat->open_hca_lock); 694 rw_destroy(&rib_stat->service_list_lock); 695 696 kmem_free(rib_stat, sizeof (*rib_stat)); 697 rib_stat = NULL; 698 699 mutex_enter(&rpcib.rpcib_mutex); 700 rpcib.rpcib_dip = NULL; 701 mutex_exit(&rpcib.rpcib_mutex); 702 mutex_destroy(&rpcib.rpcib_mutex); 703 return (DDI_SUCCESS); 704 } 705 706 707 static void rib_rbufpool_free(rib_hca_t *, int); 708 static 
void rib_rbufpool_deregister(rib_hca_t *, int); 709 static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype); 710 static struct reply *rib_addreplylist(rib_qp_t *, uint32_t); 711 static rdma_stat rib_rem_replylist(rib_qp_t *); 712 static int rib_remreply(rib_qp_t *, struct reply *); 713 static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *); 714 static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *); 715 716 717 /* 718 * One CQ pair per HCA 719 */ 720 static rdma_stat 721 rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler, 722 rib_cq_t **cqp) 723 { 724 rib_cq_t *cq; 725 ibt_cq_attr_t cq_attr; 726 uint32_t real_size; 727 ibt_status_t status; 728 rdma_stat error = RDMA_SUCCESS; 729 730 cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP); 731 cq->rib_hca = hca; 732 bzero(&cq_attr, sizeof (cq_attr)); 733 cq_attr.cq_size = cq_size; 734 cq_attr.cq_flags = IBT_CQ_NO_FLAGS; 735 status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl, 736 &real_size); 737 if (status != IBT_SUCCESS) { 738 cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed," 739 " status=%d", status); 740 error = RDMA_FAILED; 741 goto fail; 742 } 743 ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, hca); 744 745 /* 746 * Enable CQ callbacks. CQ Callbacks are single shot 747 * (e.g. you have to call ibt_enable_cq_notify() 748 * after each callback to get another one). 749 */ 750 status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION); 751 if (status != IBT_SUCCESS) { 752 cmn_err(CE_WARN, "rib_create_cq: " 753 "enable_cq_notify failed, status %d", status); 754 error = RDMA_FAILED; 755 goto fail; 756 } 757 *cqp = cq; 758 759 return (error); 760 fail: 761 if (cq->rib_cq_hdl) 762 (void) ibt_free_cq(cq->rib_cq_hdl); 763 if (cq) 764 kmem_free(cq, sizeof (rib_cq_t)); 765 return (error); 766 } 767 768 /* 769 * rpcib_find_hca 770 * 771 * Caller should have already locked the hcas_lock before calling 772 * this function. 773 */ 774 static rib_hca_t * 775 rpcib_find_hca(rpcib_state_t *ribstat, ib_guid_t guid) 776 { 777 rib_hca_t *hca = ribstat->hcas_list; 778 779 while (hca && hca->hca_guid != guid) 780 hca = hca->next; 781 782 return (hca); 783 } 784 785 static rdma_stat 786 rpcib_open_hcas(rpcib_state_t *ribstat) 787 { 788 rib_hca_t *hca; 789 ibt_status_t ibt_status; 790 rdma_stat status; 791 ibt_hca_portinfo_t *pinfop; 792 ibt_pd_flags_t pd_flags = IBT_PD_NO_FLAGS; 793 uint_t size, cq_size; 794 int i; 795 kstat_t *ksp; 796 cache_avl_struct_t example_avl_node; 797 char rssc_name[32]; 798 int old_nhca_inited = ribstat->nhca_inited; 799 ib_guid_t *hca_guids; 800 801 ASSERT(MUTEX_HELD(&ribstat->open_hca_lock)); 802 803 ribstat->hca_count = ibt_get_hca_list(&hca_guids); 804 if (ribstat->hca_count == 0) 805 return (RDMA_FAILED); 806 807 rw_enter(&ribstat->hcas_list_lock, RW_WRITER); 808 /* 809 * Open a hca and setup for RDMA 810 */ 811 for (i = 0; i < ribstat->hca_count; i++) { 812 if (rpcib_find_hca(ribstat, hca_guids[i])) 813 continue; 814 hca = kmem_zalloc(sizeof (rib_hca_t), KM_SLEEP); 815 816 ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl, 817 hca_guids[i], &hca->hca_hdl); 818 if (ibt_status != IBT_SUCCESS) { 819 kmem_free(hca, sizeof (rib_hca_t)); 820 continue; 821 } 822 hca->hca_guid = hca_guids[i]; 823 hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl; 824 hca->state = HCA_INITED; 825 826 /* 827 * query HCA info 828 */ 829 ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs); 830 if (ibt_status != IBT_SUCCESS) { 831 goto fail1; 832 } 833 834 /* 835 * One PD (Protection Domain) per HCA. 
836 * A qp is allowed to access a memory region 837 * only when it's in the same PD as that of 838 * the memory region. 839 */ 840 ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl); 841 if (ibt_status != IBT_SUCCESS) { 842 goto fail1; 843 } 844 845 /* 846 * query HCA ports 847 */ 848 ibt_status = ibt_query_hca_ports(hca->hca_hdl, 849 0, &pinfop, &hca->hca_nports, &size); 850 if (ibt_status != IBT_SUCCESS) { 851 goto fail2; 852 } 853 hca->hca_ports = pinfop; 854 hca->hca_pinfosz = size; 855 pinfop = NULL; 856 857 cq_size = DEF_CQ_SIZE; /* default cq size */ 858 /* 859 * Create 2 pairs of cq's (1 pair for client 860 * and the other pair for server) on this hca. 861 * If number of qp's gets too large, then several 862 * cq's will be needed. 863 */ 864 status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler, 865 &hca->svc_rcq); 866 if (status != RDMA_SUCCESS) { 867 goto fail3; 868 } 869 870 status = rib_create_cq(hca, cq_size, rib_svc_scq_handler, 871 &hca->svc_scq); 872 if (status != RDMA_SUCCESS) { 873 goto fail3; 874 } 875 876 status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler, 877 &hca->clnt_rcq); 878 if (status != RDMA_SUCCESS) { 879 goto fail3; 880 } 881 882 status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler, 883 &hca->clnt_scq); 884 if (status != RDMA_SUCCESS) { 885 goto fail3; 886 } 887 888 /* 889 * Create buffer pools. 890 * Note rib_rbuf_create also allocates memory windows. 891 */ 892 hca->recv_pool = rib_rbufpool_create(hca, 893 RECV_BUFFER, rib_max_rbufs); 894 if (hca->recv_pool == NULL) { 895 goto fail3; 896 } 897 898 hca->send_pool = rib_rbufpool_create(hca, 899 SEND_BUFFER, rib_max_rbufs); 900 if (hca->send_pool == NULL) { 901 rib_rbufpool_destroy(hca, RECV_BUFFER); 902 goto fail3; 903 } 904 905 if (hca->server_side_cache == NULL) { 906 (void) sprintf(rssc_name, 907 "rib_srvr_cache_%llx", 908 (long long unsigned int) hca->hca_guid); 909 hca->server_side_cache = kmem_cache_create( 910 rssc_name, 911 sizeof (cache_avl_struct_t), 0, 912 NULL, 913 NULL, 914 rib_server_side_cache_reclaim, 915 hca, NULL, 0); 916 } 917 918 avl_create(&hca->avl_tree, 919 avl_compare, 920 sizeof (cache_avl_struct_t), 921 (uint_t)(uintptr_t)&example_avl_node.avl_link- 922 (uint_t)(uintptr_t)&example_avl_node); 923 924 rw_init(&hca->bound_services_lock, NULL, RW_DRIVER, 925 hca->iblock); 926 rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock); 927 rw_init(&hca->avl_rw_lock, 928 NULL, RW_DRIVER, hca->iblock); 929 mutex_init(&hca->cache_allocation_lock, 930 NULL, MUTEX_DRIVER, NULL); 931 hca->avl_init = TRUE; 932 933 /* Create kstats for the cache */ 934 ASSERT(INGLOBALZONE(curproc)); 935 936 if (!stats_enabled) { 937 ksp = kstat_create_zone("unix", 0, "rpcib_cache", "rpc", 938 KSTAT_TYPE_NAMED, 939 sizeof (rpcib_kstat) / sizeof (kstat_named_t), 940 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, 941 GLOBAL_ZONEID); 942 if (ksp) { 943 ksp->ks_data = (void *) &rpcib_kstat; 944 ksp->ks_update = rpcib_cache_kstat_update; 945 kstat_install(ksp); 946 stats_enabled = TRUE; 947 } 948 } 949 if (hca->cleanup_helper == NULL) { 950 char tq_name[sizeof (hca->hca_guid) * 2 + 1]; 951 952 (void) snprintf(tq_name, sizeof (tq_name), "%llX", 953 (unsigned long long int) hca->hca_guid); 954 hca->cleanup_helper = ddi_taskq_create(NULL, 955 tq_name, 1, TASKQ_DEFAULTPRI, 0); 956 } 957 958 mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); 959 cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL); 960 rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER, 961 hca->iblock); 962 
rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER, 963 hca->iblock); 964 mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock); 965 hca->inuse = TRUE; 966 967 hca->next = ribstat->hcas_list; 968 ribstat->hcas_list = hca; 969 ribstat->nhca_inited++; 970 ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz); 971 continue; 972 973 fail3: 974 ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz); 975 fail2: 976 (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl); 977 fail1: 978 (void) ibt_close_hca(hca->hca_hdl); 979 kmem_free(hca, sizeof (rib_hca_t)); 980 } 981 rw_exit(&ribstat->hcas_list_lock); 982 ibt_free_hca_list(hca_guids, ribstat->hca_count); 983 rib_mod.rdma_count = rib_stat->nhca_inited; 984 985 /* 986 * return success if at least one new hca has been configured. 987 */ 988 if (ribstat->nhca_inited != old_nhca_inited) 989 return (RDMA_SUCCESS); 990 else 991 return (RDMA_FAILED); 992 } 993 994 /* 995 * Callback routines 996 */ 997 998 /* 999 * SCQ handlers 1000 */ 1001 /* ARGSUSED */ 1002 static void 1003 rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1004 { 1005 ibt_status_t ibt_status; 1006 ibt_wc_t wc; 1007 struct send_wid *wd; 1008 CONN *conn; 1009 rib_qp_t *qp; 1010 int i; 1011 1012 /* 1013 * Re-enable cq notify here to avoid missing any 1014 * completion queue notification. 1015 */ 1016 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1017 1018 ibt_status = IBT_SUCCESS; 1019 while (ibt_status != IBT_CQ_EMPTY) { 1020 bzero(&wc, sizeof (wc)); 1021 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1022 if (ibt_status != IBT_SUCCESS) 1023 return; 1024 1025 /* 1026 * Got a send completion 1027 */ 1028 if (wc.wc_id != RDMA_DUMMY_WRID) { 1029 wd = (struct send_wid *)(uintptr_t)wc.wc_id; 1030 qp = wd->qp; 1031 conn = qptoc(qp); 1032 1033 mutex_enter(&wd->sendwait_lock); 1034 switch (wc.wc_status) { 1035 case IBT_WC_SUCCESS: 1036 wd->status = RDMA_SUCCESS; 1037 break; 1038 default: 1039 /* 1040 * RC Send Q Error Code Local state Remote State 1041 * ==================== =========== ============ 1042 * IBT_WC_BAD_RESPONSE_ERR ERROR None 1043 * IBT_WC_LOCAL_LEN_ERR ERROR None 1044 * IBT_WC_LOCAL_CHAN_OP_ERR ERROR None 1045 * IBT_WC_LOCAL_PROTECT_ERR ERROR None 1046 * IBT_WC_MEM_WIN_BIND_ERR ERROR None 1047 * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR ERROR 1048 * IBT_WC_REMOTE_ACCESS_ERR ERROR ERROR 1049 * IBT_WC_REMOTE_OP_ERR ERROR ERROR 1050 * IBT_WC_RNR_NAK_TIMEOUT_ERR ERROR None 1051 * IBT_WC_TRANS_TIMEOUT_ERR ERROR None 1052 * IBT_WC_WR_FLUSHED_ERR ERROR None 1053 */ 1054 /* 1055 * Channel in error state. Set connection to 1056 * ERROR and cleanup will happen either from 1057 * conn_release or from rib_conn_get 1058 */ 1059 wd->status = RDMA_FAILED; 1060 mutex_enter(&conn->c_lock); 1061 if (conn->c_state != C_DISCONN_PEND) 1062 conn->c_state = C_ERROR_CONN; 1063 mutex_exit(&conn->c_lock); 1064 break; 1065 } 1066 1067 if (wd->cv_sig == 1) { 1068 /* 1069 * Notify poster 1070 */ 1071 cv_signal(&wd->wait_cv); 1072 mutex_exit(&wd->sendwait_lock); 1073 } else { 1074 /* 1075 * Poster not waiting for notification. 
1076 * Free the send buffers and send_wid 1077 */ 1078 for (i = 0; i < wd->nsbufs; i++) { 1079 rib_rbuf_free(qptoc(wd->qp), 1080 SEND_BUFFER, 1081 (void *)(uintptr_t)wd->sbufaddr[i]); 1082 } 1083 1084 /* decrement the send ref count */ 1085 rib_send_rele(qp); 1086 1087 mutex_exit(&wd->sendwait_lock); 1088 (void) rib_free_sendwait(wd); 1089 } 1090 } 1091 } 1092 } 1093 1094 /* ARGSUSED */ 1095 static void 1096 rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1097 { 1098 ibt_status_t ibt_status; 1099 ibt_wc_t wc; 1100 struct send_wid *wd; 1101 rib_qp_t *qp; 1102 CONN *conn; 1103 int i; 1104 1105 /* 1106 * Re-enable cq notify here to avoid missing any 1107 * completion queue notification. 1108 */ 1109 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1110 1111 ibt_status = IBT_SUCCESS; 1112 while (ibt_status != IBT_CQ_EMPTY) { 1113 bzero(&wc, sizeof (wc)); 1114 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1115 if (ibt_status != IBT_SUCCESS) 1116 return; 1117 1118 /* 1119 * Got a send completion 1120 */ 1121 if (wc.wc_id != RDMA_DUMMY_WRID) { 1122 wd = (struct send_wid *)(uintptr_t)wc.wc_id; 1123 qp = wd->qp; 1124 conn = qptoc(qp); 1125 mutex_enter(&wd->sendwait_lock); 1126 1127 switch (wc.wc_status) { 1128 case IBT_WC_SUCCESS: 1129 wd->status = RDMA_SUCCESS; 1130 break; 1131 default: 1132 /* 1133 * Channel in error state. Set connection to 1134 * ERROR and cleanup will happen either from 1135 * conn_release or conn timeout. 1136 */ 1137 wd->status = RDMA_FAILED; 1138 mutex_enter(&conn->c_lock); 1139 if (conn->c_state != C_DISCONN_PEND) 1140 conn->c_state = C_ERROR_CONN; 1141 mutex_exit(&conn->c_lock); 1142 break; 1143 } 1144 1145 if (wd->cv_sig == 1) { 1146 /* 1147 * Update completion status and notify poster 1148 */ 1149 cv_signal(&wd->wait_cv); 1150 mutex_exit(&wd->sendwait_lock); 1151 } else { 1152 /* 1153 * Poster not waiting for notification. 1154 * Free the send buffers and send_wid 1155 */ 1156 for (i = 0; i < wd->nsbufs; i++) { 1157 rib_rbuf_free(qptoc(wd->qp), 1158 SEND_BUFFER, 1159 (void *)(uintptr_t)wd->sbufaddr[i]); 1160 } 1161 1162 /* decrement the send ref count */ 1163 rib_send_rele(qp); 1164 1165 mutex_exit(&wd->sendwait_lock); 1166 (void) rib_free_sendwait(wd); 1167 } 1168 } 1169 } 1170 } 1171 1172 /* 1173 * RCQ handler 1174 */ 1175 /* ARGSUSED */ 1176 static void 1177 rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1178 { 1179 rib_qp_t *qp; 1180 ibt_status_t ibt_status; 1181 ibt_wc_t wc; 1182 struct recv_wid *rwid; 1183 1184 /* 1185 * Re-enable cq notify here to avoid missing any 1186 * completion queue notification. 1187 */ 1188 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1189 1190 ibt_status = IBT_SUCCESS; 1191 while (ibt_status != IBT_CQ_EMPTY) { 1192 bzero(&wc, sizeof (wc)); 1193 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1194 if (ibt_status != IBT_SUCCESS) 1195 return; 1196 1197 rwid = (struct recv_wid *)(uintptr_t)wc.wc_id; 1198 qp = rwid->qp; 1199 1200 if (wc.wc_status == IBT_WC_SUCCESS) { 1201 XDR inxdrs, *xdrs; 1202 uint_t xid, vers, op, find_xid = 0; 1203 struct reply *r; 1204 CONN *conn = qptoc(qp); 1205 uint32_t rdma_credit = 0; 1206 1207 xdrs = &inxdrs; 1208 xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr, 1209 wc.wc_bytes_xfer, XDR_DECODE); 1210 /* 1211 * Treat xid as opaque (xid is the first entity 1212 * in the rpc rdma message). 1213 */ 1214 xid = *(uint32_t *)(uintptr_t)rwid->addr; 1215 1216 /* Skip xid and set the xdr position accordingly. 
*/ 1217 XDR_SETPOS(xdrs, sizeof (uint32_t)); 1218 (void) xdr_u_int(xdrs, &vers); 1219 (void) xdr_u_int(xdrs, &rdma_credit); 1220 (void) xdr_u_int(xdrs, &op); 1221 XDR_DESTROY(xdrs); 1222 1223 if (vers != RPCRDMA_VERS) { 1224 /* 1225 * Invalid RPC/RDMA version. Cannot 1226 * interoperate. Set connection to 1227 * ERROR state and bail out. 1228 */ 1229 mutex_enter(&conn->c_lock); 1230 if (conn->c_state != C_DISCONN_PEND) 1231 conn->c_state = C_ERROR_CONN; 1232 mutex_exit(&conn->c_lock); 1233 rib_rbuf_free(conn, RECV_BUFFER, 1234 (void *)(uintptr_t)rwid->addr); 1235 rib_free_wid(rwid); 1236 rib_recv_rele(qp); 1237 continue; 1238 } 1239 1240 mutex_enter(&qp->replylist_lock); 1241 for (r = qp->replylist; r != NULL; r = r->next) { 1242 if (r->xid == xid) { 1243 find_xid = 1; 1244 switch (op) { 1245 case RDMA_MSG: 1246 case RDMA_NOMSG: 1247 case RDMA_MSGP: 1248 r->status = RDMA_SUCCESS; 1249 r->vaddr_cq = rwid->addr; 1250 r->bytes_xfer = 1251 wc.wc_bytes_xfer; 1252 cv_signal(&r->wait_cv); 1253 break; 1254 default: 1255 rib_rbuf_free(qptoc(qp), 1256 RECV_BUFFER, 1257 (void *)(uintptr_t) 1258 rwid->addr); 1259 break; 1260 } 1261 break; 1262 } 1263 } 1264 mutex_exit(&qp->replylist_lock); 1265 if (find_xid == 0) { 1266 /* RPC caller not waiting for reply */ 1267 1268 DTRACE_PROBE1(rpcib__i__nomatchxid1, 1269 int, xid); 1270 1271 rib_rbuf_free(qptoc(qp), RECV_BUFFER, 1272 (void *)(uintptr_t)rwid->addr); 1273 } 1274 } else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) { 1275 CONN *conn = qptoc(qp); 1276 1277 /* 1278 * Connection being flushed. Just free 1279 * the posted buffer 1280 */ 1281 rib_rbuf_free(conn, RECV_BUFFER, 1282 (void *)(uintptr_t)rwid->addr); 1283 } else { 1284 CONN *conn = qptoc(qp); 1285 /* 1286 * RC Recv Q Error Code Local state Remote State 1287 * ==================== =========== ============ 1288 * IBT_WC_LOCAL_ACCESS_ERR ERROR ERROR when NAK recvd 1289 * IBT_WC_LOCAL_LEN_ERR ERROR ERROR when NAK recvd 1290 * IBT_WC_LOCAL_PROTECT_ERR ERROR ERROR when NAK recvd 1291 * IBT_WC_LOCAL_CHAN_OP_ERR ERROR ERROR when NAK recvd 1292 * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR ERROR when NAK recvd 1293 * IBT_WC_WR_FLUSHED_ERR None None 1294 */ 1295 /* 1296 * Channel in error state. Set connection 1297 * in ERROR state. 1298 */ 1299 mutex_enter(&conn->c_lock); 1300 if (conn->c_state != C_DISCONN_PEND) 1301 conn->c_state = C_ERROR_CONN; 1302 mutex_exit(&conn->c_lock); 1303 rib_rbuf_free(conn, RECV_BUFFER, 1304 (void *)(uintptr_t)rwid->addr); 1305 } 1306 rib_free_wid(rwid); 1307 rib_recv_rele(qp); 1308 } 1309 } 1310 1311 /* Server side */ 1312 /* ARGSUSED */ 1313 static void 1314 rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1315 { 1316 rdma_recv_data_t *rdp; 1317 rib_qp_t *qp; 1318 ibt_status_t ibt_status; 1319 ibt_wc_t wc; 1320 struct svc_recv *s_recvp; 1321 CONN *conn; 1322 mblk_t *mp; 1323 1324 /* 1325 * Re-enable cq notify here to avoid missing any 1326 * completion queue notification. 
     */
    (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);

    ibt_status = IBT_SUCCESS;
    while (ibt_status != IBT_CQ_EMPTY) {
        bzero(&wc, sizeof (wc));
        ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
        if (ibt_status != IBT_SUCCESS)
            return;

        s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id;
        qp = s_recvp->qp;
        conn = qptoc(qp);

        if (wc.wc_status == IBT_WC_SUCCESS) {
            XDR inxdrs, *xdrs;
            uint_t xid, vers, op;
            uint32_t rdma_credit;

            xdrs = &inxdrs;
            /* s_recvp->vaddr stores data */
            xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr,
                wc.wc_bytes_xfer, XDR_DECODE);

            /*
             * Treat xid as opaque (xid is the first entity
             * in the rpc rdma message).
             */
            xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr;
            /* Skip xid and set the xdr position accordingly. */
            XDR_SETPOS(xdrs, sizeof (uint32_t));
            if (!xdr_u_int(xdrs, &vers) ||
                !xdr_u_int(xdrs, &rdma_credit) ||
                !xdr_u_int(xdrs, &op)) {
                rib_rbuf_free(conn, RECV_BUFFER,
                    (void *)(uintptr_t)s_recvp->vaddr);
                XDR_DESTROY(xdrs);
                rib_recv_rele(qp);
                (void) rib_free_svc_recv(s_recvp);
                continue;
            }
            XDR_DESTROY(xdrs);

            if (vers != RPCRDMA_VERS) {
                /*
                 * Invalid RPC/RDMA version.
                 * Drop rpc rdma message.
                 */
                rib_rbuf_free(conn, RECV_BUFFER,
                    (void *)(uintptr_t)s_recvp->vaddr);
                rib_recv_rele(qp);
                (void) rib_free_svc_recv(s_recvp);
                continue;
            }
            /*
             * Is this for RDMA_DONE?
             */
            if (op == RDMA_DONE) {
                rib_rbuf_free(conn, RECV_BUFFER,
                    (void *)(uintptr_t)s_recvp->vaddr);
                /*
                 * Wake up the thread waiting on
                 * a RDMA_DONE for xid
                 */
                mutex_enter(&qp->rdlist_lock);
                rdma_done_notify(qp, xid);
                mutex_exit(&qp->rdlist_lock);
                rib_recv_rele(qp);
                (void) rib_free_svc_recv(s_recvp);
                continue;
            }

            mutex_enter(&plugin_state_lock);
            mutex_enter(&conn->c_lock);
            if ((plugin_state == ACCEPT) &&
                (conn->c_state == C_CONNECTED)) {
                conn->c_ref++;
                mutex_exit(&conn->c_lock);
                while ((mp = allocb(sizeof (*rdp), BPRI_LO))
                    == NULL)
                    (void) strwaitbuf(
                        sizeof (*rdp), BPRI_LO);
                /*
                 * Plugin is in accept state, hence the master
                 * transport queue for this is still accepting
                 * requests. Hence we can call svc_queuereq to
                 * queue this received msg.
                 */
                rdp = (rdma_recv_data_t *)mp->b_rptr;
                rdp->conn = conn;
                rdp->rpcmsg.addr =
                    (caddr_t)(uintptr_t)s_recvp->vaddr;
                rdp->rpcmsg.type = RECV_BUFFER;
                rdp->rpcmsg.len = wc.wc_bytes_xfer;
                rdp->status = wc.wc_status;
                mp->b_wptr += sizeof (*rdp);
                (void) svc_queuereq((queue_t *)rib_stat->q, mp,
                    FALSE);
                mutex_exit(&plugin_state_lock);
            } else {
                /*
                 * The master transport for this is going
                 * away and the queue is not accepting any more
                 * requests for krpc, so don't do anything, just
                 * free the msg.
1432 */ 1433 mutex_exit(&conn->c_lock); 1434 mutex_exit(&plugin_state_lock); 1435 rib_rbuf_free(conn, RECV_BUFFER, 1436 (void *)(uintptr_t)s_recvp->vaddr); 1437 } 1438 } else { 1439 rib_rbuf_free(conn, RECV_BUFFER, 1440 (void *)(uintptr_t)s_recvp->vaddr); 1441 } 1442 rib_recv_rele(qp); 1443 (void) rib_free_svc_recv(s_recvp); 1444 } 1445 } 1446 1447 static void 1448 rib_attach_hca() 1449 { 1450 mutex_enter(&rib_stat->open_hca_lock); 1451 (void) rpcib_open_hcas(rib_stat); 1452 rib_listen(NULL); 1453 mutex_exit(&rib_stat->open_hca_lock); 1454 } 1455 1456 /* 1457 * Handles DR event of IBT_HCA_DETACH_EVENT. 1458 */ 1459 /* ARGSUSED */ 1460 static void 1461 rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl, 1462 ibt_async_code_t code, ibt_async_event_t *event) 1463 { 1464 switch (code) { 1465 case IBT_HCA_ATTACH_EVENT: 1466 rib_attach_hca(); 1467 break; 1468 case IBT_HCA_DETACH_EVENT: 1469 rib_detach_hca(hca_hdl); 1470 #ifdef DEBUG 1471 cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n"); 1472 #endif 1473 break; 1474 case IBT_EVENT_PORT_UP: 1475 /* 1476 * A port is up. We should call rib_listen() since there is 1477 * a chance that rib_listen() may have failed during 1478 * rib_attach_hca() because the port had not been up yet. 1479 */ 1480 rib_listen(NULL); 1481 #ifdef DEBUG 1482 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n"); 1483 #endif 1484 break; 1485 #ifdef DEBUG 1486 case IBT_EVENT_PATH_MIGRATED: 1487 cmn_err(CE_NOTE, "rib_async_handler(): " 1488 "IBT_EVENT_PATH_MIGRATED\n"); 1489 break; 1490 case IBT_EVENT_SQD: 1491 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n"); 1492 break; 1493 case IBT_EVENT_COM_EST: 1494 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n"); 1495 break; 1496 case IBT_ERROR_CATASTROPHIC_CHAN: 1497 cmn_err(CE_NOTE, "rib_async_handler(): " 1498 "IBT_ERROR_CATASTROPHIC_CHAN\n"); 1499 break; 1500 case IBT_ERROR_INVALID_REQUEST_CHAN: 1501 cmn_err(CE_NOTE, "rib_async_handler(): " 1502 "IBT_ERROR_INVALID_REQUEST_CHAN\n"); 1503 break; 1504 case IBT_ERROR_ACCESS_VIOLATION_CHAN: 1505 cmn_err(CE_NOTE, "rib_async_handler(): " 1506 "IBT_ERROR_ACCESS_VIOLATION_CHAN\n"); 1507 break; 1508 case IBT_ERROR_PATH_MIGRATE_REQ: 1509 cmn_err(CE_NOTE, "rib_async_handler(): " 1510 "IBT_ERROR_PATH_MIGRATE_REQ\n"); 1511 break; 1512 case IBT_ERROR_CQ: 1513 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n"); 1514 break; 1515 case IBT_ERROR_PORT_DOWN: 1516 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n"); 1517 break; 1518 case IBT_ASYNC_OPAQUE1: 1519 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n"); 1520 break; 1521 case IBT_ASYNC_OPAQUE2: 1522 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n"); 1523 break; 1524 case IBT_ASYNC_OPAQUE3: 1525 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n"); 1526 break; 1527 case IBT_ASYNC_OPAQUE4: 1528 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n"); 1529 break; 1530 #endif 1531 default: 1532 break; 1533 } 1534 } 1535 1536 /* 1537 * Client's reachable function. 
1538 */ 1539 static rdma_stat 1540 rib_reachable(int addr_type, struct netbuf *raddr, void **handle) 1541 { 1542 rdma_stat status; 1543 rpcib_ping_t rpt; 1544 struct netbuf saddr; 1545 CONN *conn; 1546 1547 bzero(&saddr, sizeof (struct netbuf)); 1548 status = rib_connect(&saddr, raddr, addr_type, &rpt, &conn); 1549 1550 if (status == RDMA_SUCCESS) { 1551 *handle = (void *)rpt.hca; 1552 /* release the reference */ 1553 (void) rib_conn_release(conn); 1554 return (RDMA_SUCCESS); 1555 } else { 1556 *handle = NULL; 1557 DTRACE_PROBE(rpcib__i__pingfailed); 1558 return (RDMA_FAILED); 1559 } 1560 } 1561 1562 /* Client side qp creation */ 1563 static rdma_stat 1564 rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp) 1565 { 1566 rib_qp_t *kqp = NULL; 1567 CONN *conn; 1568 rdma_clnt_cred_ctrl_t *cc_info; 1569 1570 ASSERT(qp != NULL); 1571 *qp = NULL; 1572 1573 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP); 1574 conn = qptoc(kqp); 1575 kqp->hca = hca; 1576 kqp->rdmaconn.c_rdmamod = &rib_mod; 1577 kqp->rdmaconn.c_private = (caddr_t)kqp; 1578 1579 kqp->mode = RIB_CLIENT; 1580 kqp->chan_flags = IBT_BLOCKING; 1581 conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP); 1582 bcopy(raddr->buf, conn->c_raddr.buf, raddr->len); 1583 conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len; 1584 /* 1585 * Initialize 1586 */ 1587 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL); 1588 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL); 1589 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1590 cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL); 1591 mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1592 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock); 1593 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1594 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); 1595 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL); 1596 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock); 1597 /* 1598 * Initialize the client credit control 1599 * portion of the rdmaconn struct. 
1600 */ 1601 kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT; 1602 cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc; 1603 cc_info->clnt_cc_granted_ops = 0; 1604 cc_info->clnt_cc_in_flight_ops = 0; 1605 cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL); 1606 1607 *qp = kqp; 1608 return (RDMA_SUCCESS); 1609 } 1610 1611 /* Server side qp creation */ 1612 static rdma_stat 1613 rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp) 1614 { 1615 rib_qp_t *kqp = NULL; 1616 ibt_chan_sizes_t chan_sizes; 1617 ibt_rc_chan_alloc_args_t qp_attr; 1618 ibt_status_t ibt_status; 1619 rdma_srv_cred_ctrl_t *cc_info; 1620 1621 *qp = NULL; 1622 1623 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP); 1624 kqp->hca = hca; 1625 kqp->port_num = port; 1626 kqp->rdmaconn.c_rdmamod = &rib_mod; 1627 kqp->rdmaconn.c_private = (caddr_t)kqp; 1628 1629 /* 1630 * Create the qp handle 1631 */ 1632 bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t)); 1633 qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl; 1634 qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl; 1635 qp_attr.rc_pd = hca->pd_hdl; 1636 qp_attr.rc_hca_port_num = port; 1637 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX; 1638 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX; 1639 qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE; 1640 qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE; 1641 qp_attr.rc_clone_chan = NULL; 1642 qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR; 1643 qp_attr.rc_flags = IBT_WR_SIGNALED; 1644 1645 rw_enter(&hca->state_lock, RW_READER); 1646 if (hca->state != HCA_DETACHED) { 1647 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl, 1648 IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl, 1649 &chan_sizes); 1650 } else { 1651 rw_exit(&hca->state_lock); 1652 goto fail; 1653 } 1654 rw_exit(&hca->state_lock); 1655 1656 if (ibt_status != IBT_SUCCESS) { 1657 DTRACE_PROBE1(rpcib__i_svccreatechanfail, 1658 int, ibt_status); 1659 goto fail; 1660 } 1661 1662 kqp->mode = RIB_SERVER; 1663 kqp->chan_flags = IBT_BLOCKING; 1664 kqp->q = q; /* server ONLY */ 1665 1666 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL); 1667 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL); 1668 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1669 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1670 cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL); 1671 mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1672 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1673 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); 1674 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL); 1675 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock); 1676 /* 1677 * Set the private data area to qp to be used in callbacks 1678 */ 1679 ibt_set_chan_private(kqp->qp_hdl, (void *)kqp); 1680 kqp->rdmaconn.c_state = C_CONNECTED; 1681 1682 /* 1683 * Initialize the server credit control 1684 * portion of the rdmaconn struct. 
     */
    kqp->rdmaconn.c_cc_type = RDMA_CC_SRV;
    cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc;
    cc_info->srv_cc_buffers_granted = preposted_rbufs;
    cc_info->srv_cc_cur_buffers_used = 0;
    cc_info->srv_cc_posted = preposted_rbufs;

    *qp = kqp;

    return (RDMA_SUCCESS);
fail:
    if (kqp)
        kmem_free(kqp, sizeof (rib_qp_t));

    return (RDMA_FAILED);
}

/* ARGSUSED */
ibt_cm_status_t
rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event,
    ibt_cm_return_args_t *ret_args, void *priv_data,
    ibt_priv_data_len_t len)
{
    rib_hca_t *hca;

    hca = (rib_hca_t *)clnt_hdl;

    switch (event->cm_type) {

    /* got a connection close event */
    case IBT_CM_EVENT_CONN_CLOSED:
    {
        CONN *conn;
        rib_qp_t *qp;

        /* check reason why connection was closed */
        switch (event->cm_event.closed) {
        case IBT_CM_CLOSED_DREP_RCVD:
        case IBT_CM_CLOSED_DREQ_TIMEOUT:
        case IBT_CM_CLOSED_DUP:
        case IBT_CM_CLOSED_ABORT:
        case IBT_CM_CLOSED_ALREADY:
            /*
             * These cases indicate the local end initiated
             * the closing of the channel. Nothing to do here.
             */
            break;
        default:
            /*
             * Reason for CONN_CLOSED event must be one of
             * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
             * or IBT_CM_CLOSED_STALE. These indicate cases where
             * the remote end is closing the channel. In these
             * cases free the channel and transition to error
             * state
             */
            qp = ibt_get_chan_private(event->cm_channel);
            conn = qptoc(qp);
            mutex_enter(&conn->c_lock);
            if (conn->c_state == C_DISCONN_PEND) {
                mutex_exit(&conn->c_lock);
                break;
            }

            conn->c_state = C_ERROR_CONN;

            /*
             * Free the conn if c_ref is down to 0 already
             */
            if (conn->c_ref == 0) {
                /*
                 * Remove from list and free conn
                 */
                conn->c_state = C_DISCONN_PEND;
                mutex_exit(&conn->c_lock);
                rw_enter(&hca->state_lock, RW_READER);
                if (hca->state != HCA_DETACHED)
                    (void) rib_disconnect_channel(conn,
                        &hca->cl_conn_list);
                rw_exit(&hca->state_lock);
            } else {
                /*
                 * conn will be freed when c_ref goes to 0.
                 * Indicate to cleaning thread not to close
                 * the connection, but just free the channel.
                 */
                conn->c_flags |= C_CLOSE_NOTNEEDED;
                mutex_exit(&conn->c_lock);
            }
#ifdef DEBUG
            if (rib_debug)
                cmn_err(CE_NOTE, "rib_clnt_cm_handler: "
                    "(CONN_CLOSED) channel disconnected");
#endif
            break;
        }
        break;
    }
    default:
        break;
    }
    return (IBT_CM_ACCEPT);
}

/*
 * Connect to the server.
1791 */ 1792 rdma_stat 1793 rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, rpcib_ping_t *rptp) 1794 { 1795 ibt_chan_open_args_t chan_args; /* channel args */ 1796 ibt_chan_sizes_t chan_sizes; 1797 ibt_rc_chan_alloc_args_t qp_attr; 1798 ibt_status_t ibt_status; 1799 ibt_rc_returns_t ret_args; /* conn reject info */ 1800 int refresh = REFRESH_ATTEMPTS; /* refresh if IBT_CM_CONN_STALE */ 1801 ibt_ip_cm_info_t ipcm_info; 1802 uint8_t cmp_ip_pvt[IBT_IP_HDR_PRIV_DATA_SZ]; 1803 1804 1805 (void) bzero(&chan_args, sizeof (chan_args)); 1806 (void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t)); 1807 (void) bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t)); 1808 1809 ipcm_info.src_addr.family = rptp->srcip.family; 1810 switch (ipcm_info.src_addr.family) { 1811 case AF_INET: 1812 ipcm_info.src_addr.un.ip4addr = rptp->srcip.un.ip4addr; 1813 break; 1814 case AF_INET6: 1815 ipcm_info.src_addr.un.ip6addr = rptp->srcip.un.ip6addr; 1816 break; 1817 } 1818 1819 ipcm_info.dst_addr.family = rptp->srcip.family; 1820 switch (ipcm_info.dst_addr.family) { 1821 case AF_INET: 1822 ipcm_info.dst_addr.un.ip4addr = rptp->dstip.un.ip4addr; 1823 break; 1824 case AF_INET6: 1825 ipcm_info.dst_addr.un.ip6addr = rptp->dstip.un.ip6addr; 1826 break; 1827 } 1828 1829 ipcm_info.src_port = (in_port_t)nfs_rdma_port; 1830 1831 ibt_status = ibt_format_ip_private_data(&ipcm_info, 1832 IBT_IP_HDR_PRIV_DATA_SZ, cmp_ip_pvt); 1833 1834 if (ibt_status != IBT_SUCCESS) { 1835 cmn_err(CE_WARN, "ibt_format_ip_private_data failed\n"); 1836 return (-1); 1837 } 1838 1839 qp_attr.rc_hca_port_num = rptp->path.pi_prim_cep_path.cep_hca_port_num; 1840 /* Alloc a RC channel */ 1841 qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl; 1842 qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl; 1843 qp_attr.rc_pd = hca->pd_hdl; 1844 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX; 1845 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX; 1846 qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE; 1847 qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE; 1848 qp_attr.rc_clone_chan = NULL; 1849 qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR; 1850 qp_attr.rc_flags = IBT_WR_SIGNALED; 1851 1852 rptp->path.pi_sid = ibt_get_ip_sid(IPPROTO_TCP, nfs_rdma_port); 1853 chan_args.oc_path = &rptp->path; 1854 1855 chan_args.oc_cm_handler = rib_clnt_cm_handler; 1856 chan_args.oc_cm_clnt_private = (void *)hca; 1857 chan_args.oc_rdma_ra_out = 4; 1858 chan_args.oc_rdma_ra_in = 4; 1859 chan_args.oc_path_retry_cnt = 2; 1860 chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES; 1861 chan_args.oc_priv_data = cmp_ip_pvt; 1862 chan_args.oc_priv_data_len = IBT_IP_HDR_PRIV_DATA_SZ; 1863 1864 refresh: 1865 rw_enter(&hca->state_lock, RW_READER); 1866 if (hca->state != HCA_DETACHED) { 1867 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl, 1868 IBT_ACHAN_NO_FLAGS, 1869 &qp_attr, &qp->qp_hdl, 1870 &chan_sizes); 1871 } else { 1872 rw_exit(&hca->state_lock); 1873 return (RDMA_FAILED); 1874 } 1875 rw_exit(&hca->state_lock); 1876 1877 if (ibt_status != IBT_SUCCESS) { 1878 DTRACE_PROBE1(rpcib__i_conntosrv, 1879 int, ibt_status); 1880 return (RDMA_FAILED); 1881 } 1882 1883 /* Connect to the Server */ 1884 (void) bzero(&ret_args, sizeof (ret_args)); 1885 mutex_enter(&qp->cb_lock); 1886 ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS, 1887 IBT_BLOCKING, &chan_args, &ret_args); 1888 if (ibt_status != IBT_SUCCESS) { 1889 DTRACE_PROBE2(rpcib__i_openrctosrv, 1890 int, ibt_status, int, ret_args.rc_status); 1891 1892 (void) ibt_free_channel(qp->qp_hdl); 1893 qp->qp_hdl = NULL; 1894 mutex_exit(&qp->cb_lock); 1895 if (refresh-- && ibt_status == IBT_CM_FAILURE 
&& 1896 ret_args.rc_status == IBT_CM_CONN_STALE) { 1897 /* 1898 * Got IBT_CM_CONN_STALE probably because of stale 1899 * data on the passive end of a channel that existed 1900 * prior to reboot. Retry establishing a channel 1901 * REFRESH_ATTEMPTS times, during which time the 1902 * stale conditions on the server might clear up. 1903 */ 1904 goto refresh; 1905 } 1906 return (RDMA_FAILED); 1907 } 1908 mutex_exit(&qp->cb_lock); 1909 /* 1910 * Set the private data area to qp to be used in callbacks 1911 */ 1912 ibt_set_chan_private(qp->qp_hdl, (void *)qp); 1913 return (RDMA_SUCCESS); 1914 } 1915 1916 rdma_stat 1917 rib_ping_srv(int addr_type, struct netbuf *raddr, rpcib_ping_t *rptp) 1918 { 1919 uint_t i, addr_count; 1920 ibt_status_t ibt_status; 1921 uint8_t num_paths_p; 1922 ibt_ip_path_attr_t ipattr; 1923 ibt_path_ip_src_t srcip; 1924 rpcib_ipaddrs_t addrs4; 1925 rpcib_ipaddrs_t addrs6; 1926 struct sockaddr_in *sinp; 1927 struct sockaddr_in6 *sin6p; 1928 rdma_stat retval = RDMA_FAILED; 1929 rib_hca_t *hca; 1930 1931 if ((addr_type != AF_INET) && (addr_type != AF_INET6)) 1932 return (RDMA_INVAL); 1933 ASSERT(raddr->buf != NULL); 1934 1935 bzero(&ipattr, sizeof (ibt_ip_path_attr_t)); 1936 1937 if (!rpcib_get_ib_addresses(&addrs4, &addrs6) || 1938 (addrs4.ri_count == 0 && addrs6.ri_count == 0)) { 1939 retval = RDMA_FAILED; 1940 goto done2; 1941 } 1942 1943 if (addr_type == AF_INET) { 1944 addr_count = addrs4.ri_count; 1945 sinp = (struct sockaddr_in *)raddr->buf; 1946 rptp->dstip.family = AF_INET; 1947 rptp->dstip.un.ip4addr = sinp->sin_addr.s_addr; 1948 sinp = addrs4.ri_list; 1949 } else { 1950 addr_count = addrs6.ri_count; 1951 sin6p = (struct sockaddr_in6 *)raddr->buf; 1952 rptp->dstip.family = AF_INET6; 1953 rptp->dstip.un.ip6addr = sin6p->sin6_addr; 1954 sin6p = addrs6.ri_list; 1955 } 1956 1957 rw_enter(&rib_stat->hcas_list_lock, RW_READER); 1958 for (hca = rib_stat->hcas_list; hca; hca = hca->next) { 1959 rw_enter(&hca->state_lock, RW_READER); 1960 if (hca->state == HCA_DETACHED) { 1961 rw_exit(&hca->state_lock); 1962 continue; 1963 } 1964 1965 ipattr.ipa_dst_ip = &rptp->dstip; 1966 ipattr.ipa_hca_guid = hca->hca_guid; 1967 ipattr.ipa_ndst = 1; 1968 ipattr.ipa_max_paths = 1; 1969 ipattr.ipa_src_ip.family = rptp->dstip.family; 1970 for (i = 0; i < addr_count; i++) { 1971 num_paths_p = 0; 1972 if (addr_type == AF_INET) { 1973 ipattr.ipa_src_ip.un.ip4addr = 1974 sinp[i].sin_addr.s_addr; 1975 } else { 1976 ipattr.ipa_src_ip.un.ip6addr = 1977 sin6p[i].sin6_addr; 1978 } 1979 bzero(&srcip, sizeof (ibt_path_ip_src_t)); 1980 1981 ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl, 1982 IBT_PATH_NO_FLAGS, &ipattr, &rptp->path, 1983 &num_paths_p, &srcip); 1984 if (ibt_status == IBT_SUCCESS && 1985 num_paths_p != 0 && 1986 rptp->path.pi_hca_guid == hca->hca_guid) { 1987 rptp->hca = hca; 1988 rw_exit(&hca->state_lock); 1989 if (addr_type == AF_INET) { 1990 rptp->srcip.family = AF_INET; 1991 rptp->srcip.un.ip4addr = 1992 srcip.ip_primary.un.ip4addr; 1993 } else { 1994 rptp->srcip.family = AF_INET6; 1995 rptp->srcip.un.ip6addr = 1996 srcip.ip_primary.un.ip6addr; 1997 1998 } 1999 retval = RDMA_SUCCESS; 2000 goto done1; 2001 } 2002 } 2003 rw_exit(&hca->state_lock); 2004 } 2005 done1: 2006 rw_exit(&rib_stat->hcas_list_lock); 2007 done2: 2008 if (addrs4.ri_size > 0) 2009 kmem_free(addrs4.ri_list, addrs4.ri_size); 2010 if (addrs6.ri_size > 0) 2011 kmem_free(addrs6.ri_list, addrs6.ri_size); 2012 return (retval); 2013 } 2014 2015 /* 2016 * Close channel, remove from connection list and 2017 * free up 
resources allocated for that channel. 2018 */ 2019 rdma_stat 2020 rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list) 2021 { 2022 rib_qp_t *qp = ctoqp(conn); 2023 rib_hca_t *hca; 2024 2025 mutex_enter(&conn->c_lock); 2026 if (conn->c_timeout != NULL) { 2027 mutex_exit(&conn->c_lock); 2028 (void) untimeout(conn->c_timeout); 2029 mutex_enter(&conn->c_lock); 2030 } 2031 2032 while (conn->c_flags & C_CLOSE_PENDING) { 2033 cv_wait(&conn->c_cv, &conn->c_lock); 2034 } 2035 mutex_exit(&conn->c_lock); 2036 2037 /* 2038 * c_ref == 0 and connection is in C_DISCONN_PEND 2039 */ 2040 hca = qp->hca; 2041 if (conn_list != NULL) 2042 (void) rib_rm_conn(conn, conn_list); 2043 2044 /* 2045 * There is only one case where we get here with 2046 * qp_hdl = NULL, which is during connection setup on 2047 * the client. In such a case there are no posted 2048 * send/recv buffers. 2049 */ 2050 if (qp->qp_hdl != NULL) { 2051 mutex_enter(&qp->posted_rbufs_lock); 2052 while (qp->n_posted_rbufs) 2053 cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock); 2054 mutex_exit(&qp->posted_rbufs_lock); 2055 2056 mutex_enter(&qp->send_rbufs_lock); 2057 while (qp->n_send_rbufs) 2058 cv_wait(&qp->send_rbufs_cv, &qp->send_rbufs_lock); 2059 mutex_exit(&qp->send_rbufs_lock); 2060 2061 (void) ibt_free_channel(qp->qp_hdl); 2062 qp->qp_hdl = NULL; 2063 } 2064 2065 ASSERT(qp->rdlist == NULL); 2066 2067 if (qp->replylist != NULL) { 2068 (void) rib_rem_replylist(qp); 2069 } 2070 2071 cv_destroy(&qp->cb_conn_cv); 2072 cv_destroy(&qp->posted_rbufs_cv); 2073 cv_destroy(&qp->send_rbufs_cv); 2074 mutex_destroy(&qp->cb_lock); 2075 mutex_destroy(&qp->replylist_lock); 2076 mutex_destroy(&qp->posted_rbufs_lock); 2077 mutex_destroy(&qp->send_rbufs_lock); 2078 mutex_destroy(&qp->rdlist_lock); 2079 2080 cv_destroy(&conn->c_cv); 2081 mutex_destroy(&conn->c_lock); 2082 2083 if (conn->c_raddr.buf != NULL) { 2084 kmem_free(conn->c_raddr.buf, conn->c_raddr.len); 2085 } 2086 if (conn->c_laddr.buf != NULL) { 2087 kmem_free(conn->c_laddr.buf, conn->c_laddr.len); 2088 } 2089 if (conn->c_netid != NULL) { 2090 kmem_free(conn->c_netid, (strlen(conn->c_netid) + 1)); 2091 } 2092 if (conn->c_addrmask.buf != NULL) { 2093 kmem_free(conn->c_addrmask.buf, conn->c_addrmask.len); 2094 } 2095 2096 /* 2097 * Credit control cleanup. 2098 */ 2099 if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) { 2100 rdma_clnt_cred_ctrl_t *cc_info; 2101 cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc; 2102 cv_destroy(&cc_info->clnt_cc_cv); 2103 } 2104 2105 kmem_free(qp, sizeof (rib_qp_t)); 2106 2107 /* 2108 * If HCA has been DETACHED and the srv/clnt_conn_list is NULL, 2109 * then the hca is no longer being used. 2110 */ 2111 if (conn_list != NULL) { 2112 rw_enter(&hca->state_lock, RW_READER); 2113 if (hca->state == HCA_DETACHED) { 2114 rw_enter(&hca->srv_conn_list.conn_lock, RW_READER); 2115 if (hca->srv_conn_list.conn_hd == NULL) { 2116 rw_enter(&hca->cl_conn_list.conn_lock, 2117 RW_READER); 2118 2119 if (hca->cl_conn_list.conn_hd == NULL) { 2120 mutex_enter(&hca->inuse_lock); 2121 hca->inuse = FALSE; 2122 cv_signal(&hca->cb_cv); 2123 mutex_exit(&hca->inuse_lock); 2124 } 2125 rw_exit(&hca->cl_conn_list.conn_lock); 2126 } 2127 rw_exit(&hca->srv_conn_list.conn_lock); 2128 } 2129 rw_exit(&hca->state_lock); 2130 } 2131 2132 return (RDMA_SUCCESS); 2133 } 2134 2135 /* 2136 * All sends are done under the protection of 2137 * the wdesc->sendwait_lock. n_send_rbufs count 2138 * is protected using the send_rbufs_lock. 
2139 * lock ordering is: 2140 * sendwait_lock -> send_rbufs_lock 2141 */ 2142 2143 void 2144 rib_send_hold(rib_qp_t *qp) 2145 { 2146 mutex_enter(&qp->send_rbufs_lock); 2147 qp->n_send_rbufs++; 2148 mutex_exit(&qp->send_rbufs_lock); 2149 } 2150 2151 void 2152 rib_send_rele(rib_qp_t *qp) 2153 { 2154 mutex_enter(&qp->send_rbufs_lock); 2155 qp->n_send_rbufs--; 2156 if (qp->n_send_rbufs == 0) 2157 cv_signal(&qp->send_rbufs_cv); 2158 mutex_exit(&qp->send_rbufs_lock); 2159 } 2160 2161 void 2162 rib_recv_rele(rib_qp_t *qp) 2163 { 2164 mutex_enter(&qp->posted_rbufs_lock); 2165 qp->n_posted_rbufs--; 2166 if (qp->n_posted_rbufs == 0) 2167 cv_signal(&qp->posted_rbufs_cv); 2168 mutex_exit(&qp->posted_rbufs_lock); 2169 } 2170 2171 /* 2172 * Wait for send completion notification. Only on receiving a 2173 * notification be it a successful or error completion, free the 2174 * send_wid. 2175 */ 2176 static rdma_stat 2177 rib_sendwait(rib_qp_t *qp, struct send_wid *wd) 2178 { 2179 clock_t timout, cv_wait_ret; 2180 rdma_stat error = RDMA_SUCCESS; 2181 int i; 2182 2183 /* 2184 * Wait for send to complete 2185 */ 2186 ASSERT(wd != NULL); 2187 mutex_enter(&wd->sendwait_lock); 2188 if (wd->status == (uint_t)SEND_WAIT) { 2189 timout = drv_usectohz(SEND_WAIT_TIME * 1000000) + 2190 ddi_get_lbolt(); 2191 2192 if (qp->mode == RIB_SERVER) { 2193 while ((cv_wait_ret = cv_timedwait(&wd->wait_cv, 2194 &wd->sendwait_lock, timout)) > 0 && 2195 wd->status == (uint_t)SEND_WAIT) 2196 ; 2197 switch (cv_wait_ret) { 2198 case -1: /* timeout */ 2199 DTRACE_PROBE(rpcib__i__srvsendwait__timeout); 2200 2201 wd->cv_sig = 0; /* no signal needed */ 2202 error = RDMA_TIMEDOUT; 2203 break; 2204 default: /* got send completion */ 2205 break; 2206 } 2207 } else { 2208 while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv, 2209 &wd->sendwait_lock, timout)) > 0 && 2210 wd->status == (uint_t)SEND_WAIT) 2211 ; 2212 switch (cv_wait_ret) { 2213 case -1: /* timeout */ 2214 DTRACE_PROBE(rpcib__i__clntsendwait__timeout); 2215 2216 wd->cv_sig = 0; /* no signal needed */ 2217 error = RDMA_TIMEDOUT; 2218 break; 2219 case 0: /* interrupted */ 2220 DTRACE_PROBE(rpcib__i__clntsendwait__intr); 2221 2222 wd->cv_sig = 0; /* no signal needed */ 2223 error = RDMA_INTR; 2224 break; 2225 default: /* got send completion */ 2226 break; 2227 } 2228 } 2229 } 2230 2231 if (wd->status != (uint_t)SEND_WAIT) { 2232 /* got send completion */ 2233 if (wd->status != RDMA_SUCCESS) { 2234 switch (wd->status) { 2235 case RDMA_CONNLOST: 2236 error = RDMA_CONNLOST; 2237 break; 2238 default: 2239 error = RDMA_FAILED; 2240 break; 2241 } 2242 } 2243 for (i = 0; i < wd->nsbufs; i++) { 2244 rib_rbuf_free(qptoc(qp), SEND_BUFFER, 2245 (void *)(uintptr_t)wd->sbufaddr[i]); 2246 } 2247 2248 rib_send_rele(qp); 2249 2250 mutex_exit(&wd->sendwait_lock); 2251 (void) rib_free_sendwait(wd); 2252 2253 } else { 2254 mutex_exit(&wd->sendwait_lock); 2255 } 2256 return (error); 2257 } 2258 2259 static struct send_wid * 2260 rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp) 2261 { 2262 struct send_wid *wd; 2263 2264 wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP); 2265 wd->xid = xid; 2266 wd->cv_sig = cv_sig; 2267 wd->qp = qp; 2268 cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL); 2269 mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL); 2270 wd->status = (uint_t)SEND_WAIT; 2271 2272 return (wd); 2273 } 2274 2275 static int 2276 rib_free_sendwait(struct send_wid *wdesc) 2277 { 2278 cv_destroy(&wdesc->wait_cv); 2279 mutex_destroy(&wdesc->sendwait_lock); 2280 kmem_free(wdesc, sizeof 
(*wdesc)); 2281 2282 return (0); 2283 } 2284 2285 static rdma_stat 2286 rib_rem_rep(rib_qp_t *qp, struct reply *rep) 2287 { 2288 mutex_enter(&qp->replylist_lock); 2289 if (rep != NULL) { 2290 (void) rib_remreply(qp, rep); 2291 mutex_exit(&qp->replylist_lock); 2292 return (RDMA_SUCCESS); 2293 } 2294 mutex_exit(&qp->replylist_lock); 2295 return (RDMA_FAILED); 2296 } 2297 2298 /* 2299 * Send buffers are freed here only in case of error in posting 2300 * on QP. If the post succeeded, the send buffers are freed upon 2301 * send completion in rib_sendwait() or in the scq_handler. 2302 */ 2303 rdma_stat 2304 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid, 2305 int send_sig, int cv_sig, caddr_t *swid) 2306 { 2307 struct send_wid *wdesc; 2308 struct clist *clp; 2309 ibt_status_t ibt_status = IBT_SUCCESS; 2310 rdma_stat ret = RDMA_SUCCESS; 2311 ibt_send_wr_t tx_wr; 2312 int i, nds; 2313 ibt_wr_ds_t sgl[DSEG_MAX]; 2314 uint_t total_msg_size; 2315 rib_qp_t *qp; 2316 2317 qp = ctoqp(conn); 2318 2319 ASSERT(cl != NULL); 2320 2321 bzero(&tx_wr, sizeof (ibt_send_wr_t)); 2322 2323 nds = 0; 2324 total_msg_size = 0; 2325 clp = cl; 2326 while (clp != NULL) { 2327 if (nds >= DSEG_MAX) { 2328 DTRACE_PROBE(rpcib__i__sendandwait_dsegmax_exceeded); 2329 return (RDMA_FAILED); 2330 } 2331 sgl[nds].ds_va = clp->w.c_saddr; 2332 sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */ 2333 sgl[nds].ds_len = clp->c_len; 2334 total_msg_size += clp->c_len; 2335 clp = clp->c_next; 2336 nds++; 2337 } 2338 2339 if (send_sig) { 2340 /* Set SEND_SIGNAL flag. */ 2341 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2342 wdesc = rib_init_sendwait(msgid, cv_sig, qp); 2343 *swid = (caddr_t)wdesc; 2344 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2345 mutex_enter(&wdesc->sendwait_lock); 2346 wdesc->nsbufs = nds; 2347 for (i = 0; i < nds; i++) { 2348 wdesc->sbufaddr[i] = sgl[i].ds_va; 2349 } 2350 } else { 2351 tx_wr.wr_flags = IBT_WR_NO_FLAGS; 2352 *swid = NULL; 2353 tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID; 2354 } 2355 2356 tx_wr.wr_opcode = IBT_WRC_SEND; 2357 tx_wr.wr_trans = IBT_RC_SRV; 2358 tx_wr.wr_nds = nds; 2359 tx_wr.wr_sgl = sgl; 2360 2361 mutex_enter(&conn->c_lock); 2362 if (conn->c_state == C_CONNECTED) { 2363 ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL); 2364 } 2365 if (conn->c_state != C_CONNECTED || 2366 ibt_status != IBT_SUCCESS) { 2367 if (conn->c_state != C_DISCONN_PEND) 2368 conn->c_state = C_ERROR_CONN; 2369 mutex_exit(&conn->c_lock); 2370 if (send_sig) { 2371 for (i = 0; i < nds; i++) { 2372 rib_rbuf_free(conn, SEND_BUFFER, 2373 (void *)(uintptr_t)wdesc->sbufaddr[i]); 2374 } 2375 mutex_exit(&wdesc->sendwait_lock); 2376 (void) rib_free_sendwait(wdesc); 2377 } 2378 return (RDMA_CONNLOST); 2379 } 2380 2381 mutex_exit(&conn->c_lock); 2382 2383 if (send_sig) { 2384 rib_send_hold(qp); 2385 mutex_exit(&wdesc->sendwait_lock); 2386 if (cv_sig) { 2387 /* 2388 * cv_wait for send to complete. 2389 * We can fail due to a timeout or signal or 2390 * unsuccessful send. 2391 */ 2392 ret = rib_sendwait(qp, wdesc); 2393 2394 return (ret); 2395 } 2396 } 2397 2398 return (RDMA_SUCCESS); 2399 } 2400 2401 2402 rdma_stat 2403 rib_send(CONN *conn, struct clist *cl, uint32_t msgid) 2404 { 2405 rdma_stat ret; 2406 caddr_t wd; 2407 2408 /* send-wait & cv_signal */ 2409 ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd); 2410 return (ret); 2411 } 2412 2413 /* 2414 * Deprecated/obsolete interface not used currently 2415 * but earlier used for READ-READ protocol. 2416 * Send RPC reply and wait for RDMA_DONE. 
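 * The reply is posted through rib_send_and_wait() without blocking on
 * the send completion; the caller then waits on the rdma_done list
 * entry for up to REPLY_WAIT_TIME seconds for the remote end's
 * RDMA_DONE, returning RDMA_TIMEDOUT if it never arrives.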
2417 */ 2418 rdma_stat 2419 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid) 2420 { 2421 rdma_stat ret = RDMA_SUCCESS; 2422 struct rdma_done_list *rd; 2423 clock_t cv_wait_ret; 2424 caddr_t *wid = NULL; 2425 rib_qp_t *qp = ctoqp(conn); 2426 2427 mutex_enter(&qp->rdlist_lock); 2428 rd = rdma_done_add(qp, msgid); 2429 2430 /* No cv_signal (whether send-wait or no-send-wait) */ 2431 ret = rib_send_and_wait(conn, cl, msgid, 1, 0, wid); 2432 2433 if (ret != RDMA_SUCCESS) { 2434 rdma_done_rm(qp, rd); 2435 } else { 2436 /* 2437 * Wait for RDMA_DONE from remote end 2438 */ 2439 cv_wait_ret = cv_reltimedwait(&rd->rdma_done_cv, 2440 &qp->rdlist_lock, drv_usectohz(REPLY_WAIT_TIME * 1000000), 2441 TR_CLOCK_TICK); 2442 2443 rdma_done_rm(qp, rd); 2444 2445 if (cv_wait_ret < 0) { 2446 ret = RDMA_TIMEDOUT; 2447 } 2448 } 2449 2450 mutex_exit(&qp->rdlist_lock); 2451 return (ret); 2452 } 2453 2454 static struct recv_wid * 2455 rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid) 2456 { 2457 struct recv_wid *rwid; 2458 2459 rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP); 2460 rwid->xid = msgid; 2461 rwid->addr = sgl->ds_va; 2462 rwid->qp = qp; 2463 2464 return (rwid); 2465 } 2466 2467 static void 2468 rib_free_wid(struct recv_wid *rwid) 2469 { 2470 kmem_free(rwid, sizeof (struct recv_wid)); 2471 } 2472 2473 rdma_stat 2474 rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid) 2475 { 2476 rib_qp_t *qp = ctoqp(conn); 2477 struct clist *clp = cl; 2478 struct reply *rep; 2479 struct recv_wid *rwid; 2480 int nds; 2481 ibt_wr_ds_t sgl[DSEG_MAX]; 2482 ibt_recv_wr_t recv_wr; 2483 rdma_stat ret; 2484 ibt_status_t ibt_status; 2485 2486 /* 2487 * rdma_clnt_postrecv uses RECV_BUFFER. 2488 */ 2489 2490 nds = 0; 2491 while (cl != NULL) { 2492 if (nds >= DSEG_MAX) { 2493 ret = RDMA_FAILED; 2494 goto done; 2495 } 2496 sgl[nds].ds_va = cl->w.c_saddr; 2497 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2498 sgl[nds].ds_len = cl->c_len; 2499 cl = cl->c_next; 2500 nds++; 2501 } 2502 2503 if (nds != 1) { 2504 ret = RDMA_FAILED; 2505 goto done; 2506 } 2507 2508 bzero(&recv_wr, sizeof (ibt_recv_wr_t)); 2509 recv_wr.wr_nds = nds; 2510 recv_wr.wr_sgl = sgl; 2511 2512 rwid = rib_create_wid(qp, &sgl[0], msgid); 2513 if (rwid) { 2514 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid; 2515 } else { 2516 ret = RDMA_NORESOURCE; 2517 goto done; 2518 } 2519 rep = rib_addreplylist(qp, msgid); 2520 if (!rep) { 2521 rib_free_wid(rwid); 2522 ret = RDMA_NORESOURCE; 2523 goto done; 2524 } 2525 2526 mutex_enter(&conn->c_lock); 2527 2528 if (conn->c_state == C_CONNECTED) { 2529 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL); 2530 } 2531 2532 if (conn->c_state != C_CONNECTED || 2533 ibt_status != IBT_SUCCESS) { 2534 if (conn->c_state != C_DISCONN_PEND) 2535 conn->c_state = C_ERROR_CONN; 2536 mutex_exit(&conn->c_lock); 2537 rib_free_wid(rwid); 2538 (void) rib_rem_rep(qp, rep); 2539 ret = RDMA_CONNLOST; 2540 goto done; 2541 } 2542 2543 mutex_enter(&qp->posted_rbufs_lock); 2544 qp->n_posted_rbufs++; 2545 mutex_exit(&qp->posted_rbufs_lock); 2546 2547 mutex_exit(&conn->c_lock); 2548 return (RDMA_SUCCESS); 2549 2550 done: 2551 while (clp != NULL) { 2552 rib_rbuf_free(conn, RECV_BUFFER, 2553 (void *)(uintptr_t)clp->w.c_saddr3); 2554 clp = clp->c_next; 2555 } 2556 return (ret); 2557 } 2558 2559 rdma_stat 2560 rib_svc_post(CONN* conn, struct clist *cl) 2561 { 2562 rib_qp_t *qp = ctoqp(conn); 2563 struct svc_recv *s_recvp; 2564 int nds; 2565 ibt_wr_ds_t sgl[DSEG_MAX]; 2566 ibt_recv_wr_t recv_wr; 2567 ibt_status_t 
ibt_status; 2568 2569 nds = 0; 2570 while (cl != NULL) { 2571 if (nds >= DSEG_MAX) { 2572 return (RDMA_FAILED); 2573 } 2574 sgl[nds].ds_va = cl->w.c_saddr; 2575 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2576 sgl[nds].ds_len = cl->c_len; 2577 cl = cl->c_next; 2578 nds++; 2579 } 2580 2581 if (nds != 1) { 2582 rib_rbuf_free(conn, RECV_BUFFER, 2583 (caddr_t)(uintptr_t)sgl[0].ds_va); 2584 2585 return (RDMA_FAILED); 2586 } 2587 2588 bzero(&recv_wr, sizeof (ibt_recv_wr_t)); 2589 recv_wr.wr_nds = nds; 2590 recv_wr.wr_sgl = sgl; 2591 2592 s_recvp = rib_init_svc_recv(qp, &sgl[0]); 2593 /* Use s_recvp's addr as wr id */ 2594 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp; 2595 mutex_enter(&conn->c_lock); 2596 if (conn->c_state == C_CONNECTED) { 2597 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL); 2598 } 2599 if (conn->c_state != C_CONNECTED || 2600 ibt_status != IBT_SUCCESS) { 2601 if (conn->c_state != C_DISCONN_PEND) 2602 conn->c_state = C_ERROR_CONN; 2603 mutex_exit(&conn->c_lock); 2604 rib_rbuf_free(conn, RECV_BUFFER, 2605 (caddr_t)(uintptr_t)sgl[0].ds_va); 2606 (void) rib_free_svc_recv(s_recvp); 2607 2608 return (RDMA_CONNLOST); 2609 } 2610 mutex_exit(&conn->c_lock); 2611 2612 return (RDMA_SUCCESS); 2613 } 2614 2615 /* Client */ 2616 rdma_stat 2617 rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid) 2618 { 2619 return (rib_clnt_post(conn, cl, msgid)); 2620 } 2621 2622 /* Client */ 2623 rdma_stat 2624 rib_post_resp_remove(CONN* conn, uint32_t msgid) 2625 { 2626 rib_qp_t *qp = ctoqp(conn); 2627 struct reply *rep; 2628 2629 mutex_enter(&qp->replylist_lock); 2630 for (rep = qp->replylist; rep != NULL; rep = rep->next) { 2631 if (rep->xid == msgid) { 2632 if (rep->vaddr_cq) { 2633 rib_rbuf_free(conn, RECV_BUFFER, 2634 (caddr_t)(uintptr_t)rep->vaddr_cq); 2635 } 2636 (void) rib_remreply(qp, rep); 2637 break; 2638 } 2639 } 2640 mutex_exit(&qp->replylist_lock); 2641 2642 return (RDMA_SUCCESS); 2643 } 2644 2645 /* Server */ 2646 rdma_stat 2647 rib_post_recv(CONN *conn, struct clist *cl) 2648 { 2649 rib_qp_t *qp = ctoqp(conn); 2650 2651 if (rib_svc_post(conn, cl) == RDMA_SUCCESS) { 2652 mutex_enter(&qp->posted_rbufs_lock); 2653 qp->n_posted_rbufs++; 2654 mutex_exit(&qp->posted_rbufs_lock); 2655 return (RDMA_SUCCESS); 2656 } 2657 return (RDMA_FAILED); 2658 } 2659 2660 /* 2661 * Client side only interface to "recv" the rpc reply buf 2662 * posted earlier by rib_post_resp(conn, cl, msgid). 2663 */ 2664 rdma_stat 2665 rib_recv(CONN *conn, struct clist **clp, uint32_t msgid) 2666 { 2667 struct reply *rep = NULL; 2668 clock_t timout, cv_wait_ret; 2669 rdma_stat ret = RDMA_SUCCESS; 2670 rib_qp_t *qp = ctoqp(conn); 2671 2672 /* 2673 * Find the reply structure for this msgid 2674 */ 2675 mutex_enter(&qp->replylist_lock); 2676 2677 for (rep = qp->replylist; rep != NULL; rep = rep->next) { 2678 if (rep->xid == msgid) 2679 break; 2680 } 2681 2682 if (rep != NULL) { 2683 /* 2684 * If message not yet received, wait. 
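 * The wait below is bounded by REPLY_WAIT_TIME and may also be
 * broken by a signal; those cases are reported to the caller as
 * RDMA_TIMEDOUT and RDMA_INTR respectively.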
2685 */ 2686 if (rep->status == (uint_t)REPLY_WAIT) { 2687 timout = ddi_get_lbolt() + 2688 drv_usectohz(REPLY_WAIT_TIME * 1000000); 2689 2690 while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv, 2691 &qp->replylist_lock, timout)) > 0 && 2692 rep->status == (uint_t)REPLY_WAIT) 2693 ; 2694 2695 switch (cv_wait_ret) { 2696 case -1: /* timeout */ 2697 ret = RDMA_TIMEDOUT; 2698 break; 2699 case 0: 2700 ret = RDMA_INTR; 2701 break; 2702 default: 2703 break; 2704 } 2705 } 2706 2707 if (rep->status == RDMA_SUCCESS) { 2708 struct clist *cl = NULL; 2709 2710 /* 2711 * Got message successfully 2712 */ 2713 clist_add(&cl, 0, rep->bytes_xfer, NULL, 2714 (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL); 2715 *clp = cl; 2716 } else { 2717 if (rep->status != (uint_t)REPLY_WAIT) { 2718 /* 2719 * Got error in reply message. Free 2720 * recv buffer here. 2721 */ 2722 ret = rep->status; 2723 rib_rbuf_free(conn, RECV_BUFFER, 2724 (caddr_t)(uintptr_t)rep->vaddr_cq); 2725 } 2726 } 2727 (void) rib_remreply(qp, rep); 2728 } else { 2729 /* 2730 * No matching reply structure found for given msgid on the 2731 * reply wait list. 2732 */ 2733 ret = RDMA_INVAL; 2734 DTRACE_PROBE(rpcib__i__nomatchxid2); 2735 } 2736 2737 /* 2738 * Done. 2739 */ 2740 mutex_exit(&qp->replylist_lock); 2741 return (ret); 2742 } 2743 2744 /* 2745 * RDMA write a buffer to the remote address. 2746 */ 2747 rdma_stat 2748 rib_write(CONN *conn, struct clist *cl, int wait) 2749 { 2750 ibt_send_wr_t tx_wr; 2751 int cv_sig; 2752 ibt_wr_ds_t sgl[DSEG_MAX]; 2753 struct send_wid *wdesc; 2754 ibt_status_t ibt_status; 2755 rdma_stat ret = RDMA_SUCCESS; 2756 rib_qp_t *qp = ctoqp(conn); 2757 uint64_t n_writes = 0; 2758 2759 if (cl == NULL) { 2760 return (RDMA_FAILED); 2761 } 2762 2763 while ((cl != NULL)) { 2764 if (cl->c_len > 0) { 2765 bzero(&tx_wr, sizeof (ibt_send_wr_t)); 2766 tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->u.c_daddr; 2767 tx_wr.wr.rc.rcwr.rdma.rdma_rkey = 2768 cl->c_dmemhandle.mrc_rmr; /* rkey */ 2769 sgl[0].ds_va = cl->w.c_saddr; 2770 sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2771 sgl[0].ds_len = cl->c_len; 2772 2773 if (wait) { 2774 cv_sig = 1; 2775 } else { 2776 if (n_writes > max_unsignaled_rws) { 2777 n_writes = 0; 2778 cv_sig = 1; 2779 } else { 2780 cv_sig = 0; 2781 } 2782 } 2783 2784 if (cv_sig) { 2785 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2786 wdesc = rib_init_sendwait(0, cv_sig, qp); 2787 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2788 mutex_enter(&wdesc->sendwait_lock); 2789 } else { 2790 tx_wr.wr_flags = IBT_WR_NO_FLAGS; 2791 tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID; 2792 } 2793 tx_wr.wr_opcode = IBT_WRC_RDMAW; 2794 tx_wr.wr_trans = IBT_RC_SRV; 2795 tx_wr.wr_nds = 1; 2796 tx_wr.wr_sgl = sgl; 2797 2798 mutex_enter(&conn->c_lock); 2799 if (conn->c_state == C_CONNECTED) { 2800 ibt_status = 2801 ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL); 2802 } 2803 if (conn->c_state != C_CONNECTED || 2804 ibt_status != IBT_SUCCESS) { 2805 if (conn->c_state != C_DISCONN_PEND) 2806 conn->c_state = C_ERROR_CONN; 2807 mutex_exit(&conn->c_lock); 2808 if (cv_sig) { 2809 mutex_exit(&wdesc->sendwait_lock); 2810 (void) rib_free_sendwait(wdesc); 2811 } 2812 return (RDMA_CONNLOST); 2813 } 2814 2815 mutex_exit(&conn->c_lock); 2816 2817 /* 2818 * Wait for send to complete 2819 */ 2820 if (cv_sig) { 2821 2822 rib_send_hold(qp); 2823 mutex_exit(&wdesc->sendwait_lock); 2824 2825 ret = rib_sendwait(qp, wdesc); 2826 if (ret != 0) 2827 return (ret); 2828 } 2829 n_writes ++; 2830 } 2831 cl = cl->c_next; 2832 } 2833 return (RDMA_SUCCESS); 2834 } 2835 2836 /* 2837 
* RDMA Read a buffer from the remote address. 2838 */ 2839 rdma_stat 2840 rib_read(CONN *conn, struct clist *cl, int wait) 2841 { 2842 ibt_send_wr_t rx_wr; 2843 int cv_sig = 0; 2844 ibt_wr_ds_t sgl; 2845 struct send_wid *wdesc; 2846 ibt_status_t ibt_status = IBT_SUCCESS; 2847 rdma_stat ret = RDMA_SUCCESS; 2848 rib_qp_t *qp = ctoqp(conn); 2849 2850 if (cl == NULL) { 2851 return (RDMA_FAILED); 2852 } 2853 2854 while (cl != NULL) { 2855 bzero(&rx_wr, sizeof (ibt_send_wr_t)); 2856 /* 2857 * Remote address is at the head chunk item in list. 2858 */ 2859 rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->w.c_saddr; 2860 rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr; 2861 2862 sgl.ds_va = cl->u.c_daddr; 2863 sgl.ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */ 2864 sgl.ds_len = cl->c_len; 2865 2866 /* 2867 * If there are multiple chunks to be read, and 2868 * wait is set, ask for signal only for the last chunk 2869 * and wait only on the last chunk. The completion of 2870 * RDMA_READ on last chunk ensures that reads on all 2871 * previous chunks are also completed. 2872 */ 2873 if (wait && (cl->c_next == NULL)) { 2874 cv_sig = 1; 2875 wdesc = rib_init_sendwait(0, cv_sig, qp); 2876 rx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2877 rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2878 mutex_enter(&wdesc->sendwait_lock); 2879 } else { 2880 rx_wr.wr_flags = IBT_WR_NO_FLAGS; 2881 rx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID; 2882 } 2883 rx_wr.wr_opcode = IBT_WRC_RDMAR; 2884 rx_wr.wr_trans = IBT_RC_SRV; 2885 rx_wr.wr_nds = 1; 2886 rx_wr.wr_sgl = &sgl; 2887 2888 mutex_enter(&conn->c_lock); 2889 if (conn->c_state == C_CONNECTED) { 2890 ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL); 2891 } 2892 if (conn->c_state != C_CONNECTED || 2893 ibt_status != IBT_SUCCESS) { 2894 if (conn->c_state != C_DISCONN_PEND) 2895 conn->c_state = C_ERROR_CONN; 2896 mutex_exit(&conn->c_lock); 2897 if (wait && (cl->c_next == NULL)) { 2898 mutex_exit(&wdesc->sendwait_lock); 2899 (void) rib_free_sendwait(wdesc); 2900 } 2901 return (RDMA_CONNLOST); 2902 } 2903 2904 mutex_exit(&conn->c_lock); 2905 2906 /* 2907 * Wait for send to complete if this is the 2908 * last item in the list. 2909 */ 2910 if (wait && cl->c_next == NULL) { 2911 rib_send_hold(qp); 2912 mutex_exit(&wdesc->sendwait_lock); 2913 2914 ret = rib_sendwait(qp, wdesc); 2915 2916 if (ret != 0) 2917 return (ret); 2918 } 2919 cl = cl->c_next; 2920 } 2921 return (RDMA_SUCCESS); 2922 } 2923 2924 /* 2925 * rib_srv_cm_handler() 2926 * Connection Manager callback to handle RC connection requests. 2927 */ 2928 /* ARGSUSED */ 2929 static ibt_cm_status_t 2930 rib_srv_cm_handler(void *any, ibt_cm_event_t *event, 2931 ibt_cm_return_args_t *ret_args, void *priv_data, 2932 ibt_priv_data_len_t len) 2933 { 2934 queue_t *q; 2935 rib_qp_t *qp; 2936 rib_hca_t *hca; 2937 rdma_stat status = RDMA_SUCCESS; 2938 int i; 2939 struct clist cl; 2940 rdma_buf_t rdbuf = {0}; 2941 void *buf = NULL; 2942 CONN *conn; 2943 ibt_ip_cm_info_t ipinfo; 2944 struct sockaddr_in *s; 2945 struct sockaddr_in6 *s6; 2946 int sin_size = sizeof (struct sockaddr_in); 2947 int in_size = sizeof (struct in_addr); 2948 int sin6_size = sizeof (struct sockaddr_in6); 2949 2950 ASSERT(any != NULL); 2951 ASSERT(event != NULL); 2952 2953 hca = (rib_hca_t *)any; 2954 2955 /* got a connection request */ 2956 switch (event->cm_type) { 2957 case IBT_CM_EVENT_REQ_RCV: 2958 /* 2959 * If the plugin is in the NO_ACCEPT state, bail out. 
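 * NO_ACCEPT is set by rib_listen_stop() when KRPC tears the
 * listeners down, so new connection requests are simply rejected.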
2960 */ 2961 mutex_enter(&plugin_state_lock); 2962 if (plugin_state == NO_ACCEPT) { 2963 mutex_exit(&plugin_state_lock); 2964 return (IBT_CM_REJECT); 2965 } 2966 mutex_exit(&plugin_state_lock); 2967 2968 /* 2969 * Need to send a MRA MAD to CM so that it does not 2970 * timeout on us. 2971 */ 2972 (void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id, 2973 event->cm_event.req.req_timeout * 8, NULL, 0); 2974 2975 mutex_enter(&rib_stat->open_hca_lock); 2976 q = rib_stat->q; 2977 mutex_exit(&rib_stat->open_hca_lock); 2978 2979 status = rib_svc_create_chan(hca, (caddr_t)q, 2980 event->cm_event.req.req_prim_hca_port, &qp); 2981 2982 if (status) { 2983 return (IBT_CM_REJECT); 2984 } 2985 2986 ret_args->cm_ret.rep.cm_channel = qp->qp_hdl; 2987 ret_args->cm_ret.rep.cm_rdma_ra_out = 4; 2988 ret_args->cm_ret.rep.cm_rdma_ra_in = 4; 2989 ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES; 2990 2991 /* 2992 * Pre-posts RECV buffers 2993 */ 2994 conn = qptoc(qp); 2995 for (i = 0; i < preposted_rbufs; i++) { 2996 bzero(&rdbuf, sizeof (rdbuf)); 2997 rdbuf.type = RECV_BUFFER; 2998 buf = rib_rbuf_alloc(conn, &rdbuf); 2999 if (buf == NULL) { 3000 /* 3001 * A connection is not established yet. 3002 * Just flush the channel. Buffers 3003 * posted till now will error out with 3004 * IBT_WC_WR_FLUSHED_ERR. 3005 */ 3006 (void) ibt_flush_channel(qp->qp_hdl); 3007 (void) rib_disconnect_channel(conn, NULL); 3008 return (IBT_CM_REJECT); 3009 } 3010 3011 bzero(&cl, sizeof (cl)); 3012 cl.w.c_saddr3 = (caddr_t)rdbuf.addr; 3013 cl.c_len = rdbuf.len; 3014 cl.c_smemhandle.mrc_lmr = 3015 rdbuf.handle.mrc_lmr; /* lkey */ 3016 cl.c_next = NULL; 3017 status = rib_post_recv(conn, &cl); 3018 if (status != RDMA_SUCCESS) { 3019 /* 3020 * A connection is not established yet. 3021 * Just flush the channel. Buffers 3022 * posted till now will error out with 3023 * IBT_WC_WR_FLUSHED_ERR. 
3024 */ 3025 (void) ibt_flush_channel(qp->qp_hdl); 3026 (void) rib_disconnect_channel(conn, NULL); 3027 return (IBT_CM_REJECT); 3028 } 3029 } 3030 (void) rib_add_connlist(conn, &hca->srv_conn_list); 3031 3032 /* 3033 * Get the address translation 3034 */ 3035 rw_enter(&hca->state_lock, RW_READER); 3036 if (hca->state == HCA_DETACHED) { 3037 rw_exit(&hca->state_lock); 3038 return (IBT_CM_REJECT); 3039 } 3040 rw_exit(&hca->state_lock); 3041 3042 bzero(&ipinfo, sizeof (ibt_ip_cm_info_t)); 3043 3044 if (ibt_get_ip_data(event->cm_priv_data_len, 3045 event->cm_priv_data, 3046 &ipinfo) != IBT_SUCCESS) { 3047 3048 return (IBT_CM_REJECT); 3049 } 3050 3051 switch (ipinfo.src_addr.family) { 3052 case AF_INET: 3053 3054 conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1, 3055 KM_SLEEP); 3056 (void) strcpy(conn->c_netid, RIBNETID_TCP); 3057 3058 conn->c_raddr.maxlen = 3059 conn->c_raddr.len = sin_size; 3060 conn->c_raddr.buf = kmem_zalloc(sin_size, KM_SLEEP); 3061 3062 s = (struct sockaddr_in *)conn->c_raddr.buf; 3063 s->sin_family = AF_INET; 3064 bcopy((void *)&ipinfo.src_addr.un.ip4addr, 3065 &s->sin_addr, in_size); 3066 3067 conn->c_laddr.maxlen = 3068 conn->c_laddr.len = sin_size; 3069 conn->c_laddr.buf = kmem_zalloc(sin_size, KM_SLEEP); 3070 3071 s = (struct sockaddr_in *)conn->c_laddr.buf; 3072 s->sin_family = AF_INET; 3073 bcopy((void *)&ipinfo.dst_addr.un.ip4addr, 3074 &s->sin_addr, in_size); 3075 3076 conn->c_addrmask.maxlen = conn->c_addrmask.len = 3077 sizeof (struct sockaddr_in); 3078 conn->c_addrmask.buf = 3079 kmem_zalloc(conn->c_addrmask.len, KM_SLEEP); 3080 ((struct sockaddr_in *) 3081 conn->c_addrmask.buf)->sin_addr.s_addr = 3082 (uint32_t)~0; 3083 ((struct sockaddr_in *) 3084 conn->c_addrmask.buf)->sin_family = 3085 (sa_family_t)~0; 3086 break; 3087 3088 case AF_INET6: 3089 3090 conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1, 3091 KM_SLEEP); 3092 (void) strcpy(conn->c_netid, RIBNETID_TCP6); 3093 3094 conn->c_raddr.maxlen = 3095 conn->c_raddr.len = sin6_size; 3096 conn->c_raddr.buf = kmem_zalloc(sin6_size, KM_SLEEP); 3097 3098 s6 = (struct sockaddr_in6 *)conn->c_raddr.buf; 3099 s6->sin6_family = AF_INET6; 3100 bcopy((void *)&ipinfo.src_addr.un.ip6addr, 3101 &s6->sin6_addr, 3102 sizeof (struct in6_addr)); 3103 3104 conn->c_laddr.maxlen = 3105 conn->c_laddr.len = sin6_size; 3106 conn->c_laddr.buf = kmem_zalloc(sin6_size, KM_SLEEP); 3107 3108 s6 = (struct sockaddr_in6 *)conn->c_laddr.buf; 3109 s6->sin6_family = AF_INET6; 3110 bcopy((void *)&ipinfo.dst_addr.un.ip6addr, 3111 &s6->sin6_addr, 3112 sizeof (struct in6_addr)); 3113 3114 conn->c_addrmask.maxlen = conn->c_addrmask.len = 3115 sizeof (struct sockaddr_in6); 3116 conn->c_addrmask.buf = 3117 kmem_zalloc(conn->c_addrmask.len, KM_SLEEP); 3118 (void) memset(&((struct sockaddr_in6 *) 3119 conn->c_addrmask.buf)->sin6_addr, (uchar_t)~0, 3120 sizeof (struct in6_addr)); 3121 ((struct sockaddr_in6 *) 3122 conn->c_addrmask.buf)->sin6_family = 3123 (sa_family_t)~0; 3124 break; 3125 3126 default: 3127 return (IBT_CM_REJECT); 3128 } 3129 3130 break; 3131 3132 case IBT_CM_EVENT_CONN_CLOSED: 3133 { 3134 CONN *conn; 3135 rib_qp_t *qp; 3136 3137 switch (event->cm_event.closed) { 3138 case IBT_CM_CLOSED_DREP_RCVD: 3139 case IBT_CM_CLOSED_DREQ_TIMEOUT: 3140 case IBT_CM_CLOSED_DUP: 3141 case IBT_CM_CLOSED_ABORT: 3142 case IBT_CM_CLOSED_ALREADY: 3143 /* 3144 * These cases indicate the local end initiated 3145 * the closing of the channel. Nothing to do here. 
3146 */ 3147 break; 3148 default: 3149 /* 3150 * Reason for CONN_CLOSED event must be one of 3151 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD 3152 * or IBT_CM_CLOSED_STALE. These indicate cases were 3153 * the remote end is closing the channel. In these 3154 * cases free the channel and transition to error 3155 * state 3156 */ 3157 qp = ibt_get_chan_private(event->cm_channel); 3158 conn = qptoc(qp); 3159 mutex_enter(&conn->c_lock); 3160 if (conn->c_state == C_DISCONN_PEND) { 3161 mutex_exit(&conn->c_lock); 3162 break; 3163 } 3164 conn->c_state = C_ERROR_CONN; 3165 3166 /* 3167 * Free the conn if c_ref goes down to 0 3168 */ 3169 if (conn->c_ref == 0) { 3170 /* 3171 * Remove from list and free conn 3172 */ 3173 conn->c_state = C_DISCONN_PEND; 3174 mutex_exit(&conn->c_lock); 3175 (void) rib_disconnect_channel(conn, 3176 &hca->srv_conn_list); 3177 } else { 3178 /* 3179 * conn will be freed when c_ref goes to 0. 3180 * Indicate to cleaning thread not to close 3181 * the connection, but just free the channel. 3182 */ 3183 conn->c_flags |= C_CLOSE_NOTNEEDED; 3184 mutex_exit(&conn->c_lock); 3185 } 3186 DTRACE_PROBE(rpcib__i__srvcm_chandisconnect); 3187 break; 3188 } 3189 break; 3190 } 3191 case IBT_CM_EVENT_CONN_EST: 3192 /* 3193 * RTU received, hence connection established. 3194 */ 3195 if (rib_debug > 1) 3196 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3197 "(CONN_EST) channel established"); 3198 break; 3199 3200 default: 3201 if (rib_debug > 2) { 3202 /* Let CM handle the following events. */ 3203 if (event->cm_type == IBT_CM_EVENT_REP_RCV) { 3204 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3205 "server recv'ed IBT_CM_EVENT_REP_RCV\n"); 3206 } else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) { 3207 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3208 "server recv'ed IBT_CM_EVENT_LAP_RCV\n"); 3209 } else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) { 3210 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3211 "server recv'ed IBT_CM_EVENT_MRA_RCV\n"); 3212 } else if (event->cm_type == IBT_CM_EVENT_APR_RCV) { 3213 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3214 "server recv'ed IBT_CM_EVENT_APR_RCV\n"); 3215 } else if (event->cm_type == IBT_CM_EVENT_FAILURE) { 3216 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3217 "server recv'ed IBT_CM_EVENT_FAILURE\n"); 3218 } 3219 } 3220 return (IBT_CM_DEFAULT); 3221 } 3222 3223 /* accept all other CM messages (i.e. 
let the CM handle them) */
3224 return (IBT_CM_ACCEPT);
3225 }
3226
3227 static rdma_stat
3228 rib_register_service(rib_hca_t *hca, int service_type,
3229 uint8_t protocol_num, in_port_t dst_port)
3230 {
3231 ibt_srv_desc_t sdesc;
3232 ibt_hca_portinfo_t *port_infop;
3233 ib_svc_id_t srv_id;
3234 ibt_srv_hdl_t srv_hdl;
3235 uint_t port_size;
3236 uint_t pki, i, num_ports, nbinds;
3237 ibt_status_t ibt_status;
3238 rib_service_t *service;
3239 ib_pkey_t pkey;
3240
3241 /*
3242 * Query all ports for the given HCA
3243 */
3244 rw_enter(&hca->state_lock, RW_READER);
3245 if (hca->state != HCA_DETACHED) {
3246 ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop,
3247 &num_ports, &port_size);
3248 rw_exit(&hca->state_lock);
3249 } else {
3250 rw_exit(&hca->state_lock);
3251 return (RDMA_FAILED);
3252 }
3253 if (ibt_status != IBT_SUCCESS) {
3254 return (RDMA_FAILED);
3255 }
3256
3257 DTRACE_PROBE1(rpcib__i__regservice_numports,
3258 int, num_ports);
3259
3260 for (i = 0; i < num_ports; i++) {
3261 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) {
3262 DTRACE_PROBE1(rpcib__i__regservice__portinactive,
3263 int, i+1);
3264 } else if (port_infop[i].p_linkstate == IBT_PORT_ACTIVE) {
3265 DTRACE_PROBE1(rpcib__i__regservice__portactive,
3266 int, i+1);
3267 }
3268 }
3269
3270 /*
3271 * Get all the IP addresses on this system to register the
3272 * given "service type" on all DNS-recognized IP addrs.
3273 * Each service type such as NFS will have all the system's
3274 * IP addresses as its different names. For now the only
3275 * type of service we support in RPCIB is NFS.
3276 */
3277 rw_enter(&rib_stat->service_list_lock, RW_WRITER);
3278 /*
3279 * Start registering and binding the service
3280 * on the active ports of this HCA.
3281 */
3282 nbinds = 0;
3283 for (service = rib_stat->service_list;
3284 service && (service->srv_type != service_type);
3285 service = service->next)
3286 ;
3287
3288 if (service == NULL) {
3289 /*
3290 * We use IP addresses as the service names for
3291 * service registration. Register each of them
3292 * with CM to obtain a svc_id and svc_hdl. We do not
3293 * register the service with the machine's loopback address.
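 * The service id is derived from the protocol number and the port
 * via ibt_get_ip_sid(); once a srv_hdl/srv_id pair exists for this
 * service type it is reused and bound to each active port below.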
3294 */
3295 (void) bzero(&srv_id, sizeof (ib_svc_id_t));
3296 (void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t));
3297 (void) bzero(&sdesc, sizeof (ibt_srv_desc_t));
3298 sdesc.sd_handler = rib_srv_cm_handler;
3299 sdesc.sd_flags = 0;
3300 ibt_status = ibt_register_service(hca->ibt_clnt_hdl,
3301 &sdesc, ibt_get_ip_sid(protocol_num, dst_port),
3302 1, &srv_hdl, &srv_id);
3303 if ((ibt_status != IBT_SUCCESS) &&
3304 (ibt_status != IBT_CM_SERVICE_EXISTS)) {
3305 rw_exit(&rib_stat->service_list_lock);
3306 DTRACE_PROBE1(rpcib__i__regservice__ibtres,
3307 int, ibt_status);
3308 ibt_free_portinfo(port_infop, port_size);
3309 return (RDMA_FAILED);
3310 }
3311
3312 /*
3313 * Allocate and prepare a service entry
3314 */
3315 service = kmem_zalloc(sizeof (rib_service_t), KM_SLEEP);
3316
3317 service->srv_type = service_type;
3318 service->srv_hdl = srv_hdl;
3319 service->srv_id = srv_id;
3320
3321 service->next = rib_stat->service_list;
3322 rib_stat->service_list = service;
3323 DTRACE_PROBE1(rpcib__i__regservice__new__service,
3324 int, service->srv_type);
3325 } else {
3326 srv_hdl = service->srv_hdl;
3327 srv_id = service->srv_id;
3328 DTRACE_PROBE1(rpcib__i__regservice__existing__service,
3329 int, service->srv_type);
3330 }
3331
3332 for (i = 0; i < num_ports; i++) {
3333 ibt_sbind_hdl_t sbp;
3334 rib_hca_service_t *hca_srv;
3335 ib_gid_t gid;
3336
3337 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE)
3338 continue;
3339
3340 for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) {
3341 pkey = port_infop[i].p_pkey_tbl[pki];
3342
3343 rw_enter(&hca->bound_services_lock, RW_READER);
3344 gid = port_infop[i].p_sgid_tbl[0];
3345 for (hca_srv = hca->bound_services; hca_srv;
3346 hca_srv = hca_srv->next) {
3347 if ((hca_srv->srv_id == service->srv_id) &&
3348 (hca_srv->gid.gid_prefix ==
3349 gid.gid_prefix) &&
3350 (hca_srv->gid.gid_guid == gid.gid_guid))
3351 break;
3352 }
3353 rw_exit(&hca->bound_services_lock);
3354 if (hca_srv != NULL) {
3355 /*
3356 * port is already bound to the service
3357 */
3358 DTRACE_PROBE1(
3359 rpcib__i__regservice__already__bound,
3360 int, i+1);
3361 nbinds++;
3362 continue;
3363 }
3364
3365 if ((pkey & IBSRM_HB) &&
3366 (pkey != IB_PKEY_INVALID_FULL)) {
3367
3368 sbp = NULL;
3369 ibt_status = ibt_bind_service(srv_hdl,
3370 gid, NULL, hca, &sbp);
3371
3372 if (ibt_status == IBT_SUCCESS) {
3373 hca_srv = kmem_zalloc(
3374 sizeof (rib_hca_service_t),
3375 KM_SLEEP);
3376 hca_srv->srv_id = srv_id;
3377 hca_srv->gid = gid;
3378 hca_srv->sbind_hdl = sbp;
3379
3380 rw_enter(&hca->bound_services_lock,
3381 RW_WRITER);
3382 hca_srv->next = hca->bound_services;
3383 hca->bound_services = hca_srv;
3384 rw_exit(&hca->bound_services_lock);
3385 nbinds++;
3386 }
3387
3388 DTRACE_PROBE1(rpcib__i__regservice__bindres,
3389 int, ibt_status);
3390 }
3391 }
3392 }
3393 rw_exit(&rib_stat->service_list_lock);
3394
3395 ibt_free_portinfo(port_infop, port_size);
3396
3397 if (nbinds == 0) {
3398 return (RDMA_FAILED);
3399 } else {
3400 /*
3401 * Put this plugin into accept state, since at least
3402 * one registration was successful.
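 * Incoming REQs are rejected by rib_srv_cm_handler() while the
 * plugin is in the NO_ACCEPT state, so this transition is what
 * actually starts accepting connections.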
3403 */
3404 mutex_enter(&plugin_state_lock);
3405 plugin_state = ACCEPT;
3406 mutex_exit(&plugin_state_lock);
3407 return (RDMA_SUCCESS);
3408 }
3409 }
3410
3411 void
3412 rib_listen(struct rdma_svc_data *rd)
3413 {
3414 rdma_stat status;
3415 int n_listening = 0;
3416 rib_hca_t *hca;
3417
3418 mutex_enter(&rib_stat->listen_lock);
3419 /*
3420 * If the rd parameter is NULL, it means that rib_stat->q is
3421 * already initialized by a call from RDMA and we just want to
3422 * bring a newly attached HCA into the same listening state as
3423 * the other HCAs.
3424 */
3425 if (rd == NULL) {
3426 if (rib_stat->q == NULL) {
3427 mutex_exit(&rib_stat->listen_lock);
3428 return;
3429 }
3430 } else {
3431 rib_stat->q = &rd->q;
3432 }
3433 rw_enter(&rib_stat->hcas_list_lock, RW_READER);
3434 for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
3435 /*
3436 * First check if the HCA is still attached
3437 */
3438 rw_enter(&hca->state_lock, RW_READER);
3439 if (hca->state != HCA_INITED) {
3440 rw_exit(&hca->state_lock);
3441 continue;
3442 }
3443 rw_exit(&hca->state_lock);
3444
3445 /*
3446 * Right now the only service type is NFS. Hence
3447 * we force-feed this value. Ideally the service type
3448 * should be passed down in rdma_svc_data so it can be
3449 * communicated here.
3450 */
3451 status = rib_register_service(hca, NFS,
3452 IPPROTO_TCP, nfs_rdma_port);
3453 if (status == RDMA_SUCCESS)
3454 n_listening++;
3455 }
3456 rw_exit(&rib_stat->hcas_list_lock);
3457
3458 /*
3459 * Report the listening state back to the caller through
3460 * rd->active and rd->err_code.
3461 */
3462 if (rd) {
3463 if (n_listening > 0) {
3464 rd->active = 1;
3465 rd->err_code = RDMA_SUCCESS;
3466 } else {
3467 rd->active = 0;
3468 rd->err_code = RDMA_FAILED;
3469 }
3470 }
3471 mutex_exit(&rib_stat->listen_lock);
3472 }
3473
3474 /* XXXX */
3475 /* ARGSUSED */
3476 static void
3477 rib_listen_stop(struct rdma_svc_data *svcdata)
3478 {
3479 rib_hca_t *hca;
3480
3481 mutex_enter(&rib_stat->listen_lock);
3482 /*
3483 * KRPC called the RDMATF to stop the listeners; this means we
3484 * stop sending incoming or received requests up to the KRPC
3485 * master transport handle for RDMA-IB. It also means that the
3486 * master transport handle, responsible for us, is going away.
3487 */
3488 mutex_enter(&plugin_state_lock);
3489 plugin_state = NO_ACCEPT;
3490 if (svcdata != NULL)
3491 svcdata->active = 0;
3492 mutex_exit(&plugin_state_lock);
3493
3494 rw_enter(&rib_stat->hcas_list_lock, RW_READER);
3495 for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
3496 /*
3497 * First check if the HCA is still attached
3498 */
3499 rw_enter(&hca->state_lock, RW_READER);
3500 if (hca->state == HCA_DETACHED) {
3501 rw_exit(&hca->state_lock);
3502 continue;
3503 }
3504 rib_close_channels(&hca->srv_conn_list);
3505 rib_stop_services(hca);
3506 rw_exit(&hca->state_lock);
3507 }
3508 rw_exit(&rib_stat->hcas_list_lock);
3509
3510 /*
3511 * Avoid rib_listen() using the stale q field.
3512 * This could happen if a port goes up after all services
3513 * are already unregistered.
3514 */
3515 rib_stat->q = NULL;
3516 mutex_exit(&rib_stat->listen_lock);
3517 }
3518
3519 /*
3520 * Traverse the HCA's service list to unbind and deregister services.
3521 * For each bound service of the HCA to be removed, first find the
3522 * corresponding service handle (srv_hdl) and then unbind the service
3523 * by calling ibt_unbind_service().
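 * Only the per-port bindings of this HCA are removed here; the
 * service registration itself (the srv_hdl/srv_id kept on
 * rib_stat->service_list) is left in place.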
3524 */ 3525 static void 3526 rib_stop_services(rib_hca_t *hca) 3527 { 3528 rib_hca_service_t *srv_list, *to_remove; 3529 3530 /* 3531 * unbind and deregister the services for this service type. 3532 * Right now there is only one service type. In future it will 3533 * be passed down to this function. 3534 */ 3535 rw_enter(&hca->bound_services_lock, RW_READER); 3536 srv_list = hca->bound_services; 3537 hca->bound_services = NULL; 3538 rw_exit(&hca->bound_services_lock); 3539 3540 while (srv_list != NULL) { 3541 rib_service_t *sc; 3542 3543 to_remove = srv_list; 3544 srv_list = to_remove->next; 3545 rw_enter(&rib_stat->service_list_lock, RW_READER); 3546 for (sc = rib_stat->service_list; 3547 sc && (sc->srv_id != to_remove->srv_id); 3548 sc = sc->next) 3549 ; 3550 /* 3551 * if sc is NULL then the service doesn't exist anymore, 3552 * probably just removed completely through rib_stat. 3553 */ 3554 if (sc != NULL) 3555 (void) ibt_unbind_service(sc->srv_hdl, 3556 to_remove->sbind_hdl); 3557 rw_exit(&rib_stat->service_list_lock); 3558 kmem_free(to_remove, sizeof (rib_hca_service_t)); 3559 } 3560 } 3561 3562 static struct svc_recv * 3563 rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl) 3564 { 3565 struct svc_recv *recvp; 3566 3567 recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP); 3568 recvp->vaddr = sgl->ds_va; 3569 recvp->qp = qp; 3570 recvp->bytes_xfer = 0; 3571 return (recvp); 3572 } 3573 3574 static int 3575 rib_free_svc_recv(struct svc_recv *recvp) 3576 { 3577 kmem_free(recvp, sizeof (*recvp)); 3578 3579 return (0); 3580 } 3581 3582 static struct reply * 3583 rib_addreplylist(rib_qp_t *qp, uint32_t msgid) 3584 { 3585 struct reply *rep; 3586 3587 3588 rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP); 3589 if (rep == NULL) { 3590 DTRACE_PROBE(rpcib__i__addrreply__nomem); 3591 return (NULL); 3592 } 3593 rep->xid = msgid; 3594 rep->vaddr_cq = NULL; 3595 rep->bytes_xfer = 0; 3596 rep->status = (uint_t)REPLY_WAIT; 3597 rep->prev = NULL; 3598 cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL); 3599 3600 mutex_enter(&qp->replylist_lock); 3601 if (qp->replylist) { 3602 rep->next = qp->replylist; 3603 qp->replylist->prev = rep; 3604 } 3605 qp->rep_list_size++; 3606 3607 DTRACE_PROBE1(rpcib__i__addrreply__listsize, 3608 int, qp->rep_list_size); 3609 3610 qp->replylist = rep; 3611 mutex_exit(&qp->replylist_lock); 3612 3613 return (rep); 3614 } 3615 3616 static rdma_stat 3617 rib_rem_replylist(rib_qp_t *qp) 3618 { 3619 struct reply *r, *n; 3620 3621 mutex_enter(&qp->replylist_lock); 3622 for (r = qp->replylist; r != NULL; r = n) { 3623 n = r->next; 3624 (void) rib_remreply(qp, r); 3625 } 3626 mutex_exit(&qp->replylist_lock); 3627 3628 return (RDMA_SUCCESS); 3629 } 3630 3631 static int 3632 rib_remreply(rib_qp_t *qp, struct reply *rep) 3633 { 3634 3635 ASSERT(MUTEX_HELD(&qp->replylist_lock)); 3636 if (rep->prev) { 3637 rep->prev->next = rep->next; 3638 } 3639 if (rep->next) { 3640 rep->next->prev = rep->prev; 3641 } 3642 if (qp->replylist == rep) 3643 qp->replylist = rep->next; 3644 3645 cv_destroy(&rep->wait_cv); 3646 qp->rep_list_size--; 3647 3648 DTRACE_PROBE1(rpcib__i__remreply__listsize, 3649 int, qp->rep_list_size); 3650 3651 kmem_free(rep, sizeof (*rep)); 3652 3653 return (0); 3654 } 3655 3656 rdma_stat 3657 rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen, 3658 struct mrc *buf_handle) 3659 { 3660 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ 3661 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ 3662 rdma_stat status; 3663 rib_hca_t *hca = (ctoqp(conn))->hca; 3664 
3665 /* 3666 * Note: ALL buffer pools use the same memory type RDMARW. 3667 */ 3668 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc); 3669 if (status == RDMA_SUCCESS) { 3670 buf_handle->mrc_linfo = (uintptr_t)mr_hdl; 3671 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; 3672 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; 3673 } else { 3674 buf_handle->mrc_linfo = NULL; 3675 buf_handle->mrc_lmr = 0; 3676 buf_handle->mrc_rmr = 0; 3677 } 3678 return (status); 3679 } 3680 3681 static rdma_stat 3682 rib_reg_mem(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size, 3683 ibt_mr_flags_t spec, 3684 ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp) 3685 { 3686 ibt_mr_attr_t mem_attr; 3687 ibt_status_t ibt_status; 3688 mem_attr.mr_vaddr = (uintptr_t)buf; 3689 mem_attr.mr_len = (ib_msglen_t)size; 3690 mem_attr.mr_as = (struct as *)(caddr_t)adsp; 3691 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE | 3692 IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE | 3693 IBT_MR_ENABLE_WINDOW_BIND | spec; 3694 3695 rw_enter(&hca->state_lock, RW_READER); 3696 if (hca->state != HCA_DETACHED) { 3697 ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl, 3698 &mem_attr, mr_hdlp, mr_descp); 3699 rw_exit(&hca->state_lock); 3700 } else { 3701 rw_exit(&hca->state_lock); 3702 return (RDMA_FAILED); 3703 } 3704 3705 if (ibt_status != IBT_SUCCESS) { 3706 return (RDMA_FAILED); 3707 } 3708 return (RDMA_SUCCESS); 3709 } 3710 3711 rdma_stat 3712 rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen, 3713 struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc) 3714 { 3715 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ 3716 rib_lrc_entry_t *l; 3717 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ 3718 rdma_stat status; 3719 rib_hca_t *hca = (ctoqp(conn))->hca; 3720 3721 /* 3722 * Non-coherent memory registration. 3723 */ 3724 l = (rib_lrc_entry_t *)lrc; 3725 if (l) { 3726 if (l->registered) { 3727 buf_handle->mrc_linfo = 3728 (uintptr_t)l->lrc_mhandle.mrc_linfo; 3729 buf_handle->mrc_lmr = 3730 (uint32_t)l->lrc_mhandle.mrc_lmr; 3731 buf_handle->mrc_rmr = 3732 (uint32_t)l->lrc_mhandle.mrc_rmr; 3733 *sync_handle = (RIB_SYNCMEM_HANDLE) 3734 (uintptr_t)l->lrc_mhandle.mrc_linfo; 3735 return (RDMA_SUCCESS); 3736 } else { 3737 /* Always register the whole buffer */ 3738 buf = (caddr_t)l->lrc_buf; 3739 buflen = l->lrc_len; 3740 } 3741 } 3742 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc); 3743 3744 if (status == RDMA_SUCCESS) { 3745 if (l) { 3746 l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl; 3747 l->lrc_mhandle.mrc_lmr = (uint32_t)mr_desc.md_lkey; 3748 l->lrc_mhandle.mrc_rmr = (uint32_t)mr_desc.md_rkey; 3749 l->registered = TRUE; 3750 } 3751 buf_handle->mrc_linfo = (uintptr_t)mr_hdl; 3752 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; 3753 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; 3754 *sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl; 3755 } else { 3756 buf_handle->mrc_linfo = NULL; 3757 buf_handle->mrc_lmr = 0; 3758 buf_handle->mrc_rmr = 0; 3759 } 3760 return (status); 3761 } 3762 3763 /* ARGSUSED */ 3764 rdma_stat 3765 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle) 3766 { 3767 rib_hca_t *hca = (ctoqp(conn))->hca; 3768 /* 3769 * Allow memory deregistration even if HCA is 3770 * getting detached. Need all outstanding 3771 * memory registrations to be deregistered 3772 * before HCA_DETACH_EVENT can be accepted. 
3773 */ 3774 (void) ibt_deregister_mr(hca->hca_hdl, 3775 (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo); 3776 return (RDMA_SUCCESS); 3777 } 3778 3779 /* ARGSUSED */ 3780 rdma_stat 3781 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle, 3782 RIB_SYNCMEM_HANDLE sync_handle, void *lrc) 3783 { 3784 rib_lrc_entry_t *l; 3785 l = (rib_lrc_entry_t *)lrc; 3786 if (l) 3787 if (l->registered) 3788 return (RDMA_SUCCESS); 3789 3790 (void) rib_deregistermem(conn, buf, buf_handle); 3791 3792 return (RDMA_SUCCESS); 3793 } 3794 3795 /* ARGSUSED */ 3796 rdma_stat 3797 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf, 3798 int len, int cpu) 3799 { 3800 ibt_status_t status; 3801 rib_hca_t *hca = (ctoqp(conn))->hca; 3802 ibt_mr_sync_t mr_segment; 3803 3804 mr_segment.ms_handle = (ibt_mr_hdl_t)shandle; 3805 mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf; 3806 mr_segment.ms_len = (ib_memlen_t)len; 3807 if (cpu) { 3808 /* make incoming data visible to memory */ 3809 mr_segment.ms_flags = IBT_SYNC_WRITE; 3810 } else { 3811 /* make memory changes visible to IO */ 3812 mr_segment.ms_flags = IBT_SYNC_READ; 3813 } 3814 rw_enter(&hca->state_lock, RW_READER); 3815 if (hca->state != HCA_DETACHED) { 3816 status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1); 3817 rw_exit(&hca->state_lock); 3818 } else { 3819 rw_exit(&hca->state_lock); 3820 return (RDMA_FAILED); 3821 } 3822 3823 if (status == IBT_SUCCESS) 3824 return (RDMA_SUCCESS); 3825 else { 3826 return (RDMA_FAILED); 3827 } 3828 } 3829 3830 /* 3831 * XXXX ???? 3832 */ 3833 static rdma_stat 3834 rib_getinfo(rdma_info_t *info) 3835 { 3836 /* 3837 * XXXX Hack! 3838 */ 3839 info->addrlen = 16; 3840 info->mts = 1000000; 3841 info->mtu = 1000000; 3842 3843 return (RDMA_SUCCESS); 3844 } 3845 3846 rib_bufpool_t * 3847 rib_rbufpool_create(rib_hca_t *hca, int ptype, int num) 3848 { 3849 rib_bufpool_t *rbp = NULL; 3850 bufpool_t *bp = NULL; 3851 caddr_t buf; 3852 ibt_mr_attr_t mem_attr; 3853 ibt_status_t ibt_status; 3854 int i, j; 3855 3856 rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP); 3857 3858 bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) + 3859 num * sizeof (void *), KM_SLEEP); 3860 3861 mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock); 3862 bp->numelems = num; 3863 3864 3865 switch (ptype) { 3866 case SEND_BUFFER: 3867 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3868 bp->rsize = RPC_MSG_SZ; 3869 break; 3870 case RECV_BUFFER: 3871 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3872 bp->rsize = RPC_BUF_SIZE; 3873 break; 3874 default: 3875 goto fail; 3876 } 3877 3878 /* 3879 * Register the pool. 
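 * Each of the num buffers of rsize bytes is registered as its own
 * memory region; the handles and descriptors are kept in
 * rbp->mr_hdl[] and rbp->mr_desc[], parallel to the buffer list.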
3880 */ 3881 bp->bufsize = num * bp->rsize; 3882 bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP); 3883 rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num * 3884 sizeof (ibt_mr_hdl_t), KM_SLEEP); 3885 rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num * 3886 sizeof (ibt_mr_desc_t), KM_SLEEP); 3887 rw_enter(&hca->state_lock, RW_READER); 3888 3889 if (hca->state == HCA_DETACHED) { 3890 rw_exit(&hca->state_lock); 3891 goto fail; 3892 } 3893 3894 for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) { 3895 bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t)); 3896 mem_attr.mr_vaddr = (uintptr_t)buf; 3897 mem_attr.mr_len = (ib_msglen_t)bp->rsize; 3898 mem_attr.mr_as = NULL; 3899 ibt_status = ibt_register_mr(hca->hca_hdl, 3900 hca->pd_hdl, &mem_attr, 3901 &rbp->mr_hdl[i], 3902 &rbp->mr_desc[i]); 3903 if (ibt_status != IBT_SUCCESS) { 3904 for (j = 0; j < i; j++) { 3905 (void) ibt_deregister_mr(hca->hca_hdl, 3906 rbp->mr_hdl[j]); 3907 } 3908 rw_exit(&hca->state_lock); 3909 goto fail; 3910 } 3911 } 3912 rw_exit(&hca->state_lock); 3913 buf = (caddr_t)bp->buf; 3914 for (i = 0; i < num; i++, buf += bp->rsize) { 3915 bp->buflist[i] = (void *)buf; 3916 } 3917 bp->buffree = num - 1; /* no. of free buffers */ 3918 rbp->bpool = bp; 3919 3920 return (rbp); 3921 fail: 3922 if (bp) { 3923 if (bp->buf) 3924 kmem_free(bp->buf, bp->bufsize); 3925 kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *)); 3926 } 3927 if (rbp) { 3928 if (rbp->mr_hdl) 3929 kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t)); 3930 if (rbp->mr_desc) 3931 kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t)); 3932 kmem_free(rbp, sizeof (rib_bufpool_t)); 3933 } 3934 return (NULL); 3935 } 3936 3937 static void 3938 rib_rbufpool_deregister(rib_hca_t *hca, int ptype) 3939 { 3940 int i; 3941 rib_bufpool_t *rbp = NULL; 3942 bufpool_t *bp; 3943 3944 /* 3945 * Obtain pool address based on type of pool 3946 */ 3947 switch (ptype) { 3948 case SEND_BUFFER: 3949 rbp = hca->send_pool; 3950 break; 3951 case RECV_BUFFER: 3952 rbp = hca->recv_pool; 3953 break; 3954 default: 3955 return; 3956 } 3957 if (rbp == NULL) 3958 return; 3959 3960 bp = rbp->bpool; 3961 3962 /* 3963 * Deregister the pool memory and free it. 3964 */ 3965 for (i = 0; i < bp->numelems; i++) { 3966 (void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]); 3967 } 3968 } 3969 3970 static void 3971 rib_rbufpool_free(rib_hca_t *hca, int ptype) 3972 { 3973 3974 rib_bufpool_t *rbp = NULL; 3975 bufpool_t *bp; 3976 3977 /* 3978 * Obtain pool address based on type of pool 3979 */ 3980 switch (ptype) { 3981 case SEND_BUFFER: 3982 rbp = hca->send_pool; 3983 break; 3984 case RECV_BUFFER: 3985 rbp = hca->recv_pool; 3986 break; 3987 default: 3988 return; 3989 } 3990 if (rbp == NULL) 3991 return; 3992 3993 bp = rbp->bpool; 3994 3995 /* 3996 * Free the pool memory. 3997 */ 3998 if (rbp->mr_hdl) 3999 kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t)); 4000 4001 if (rbp->mr_desc) 4002 kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t)); 4003 if (bp->buf) 4004 kmem_free(bp->buf, bp->bufsize); 4005 mutex_destroy(&bp->buflock); 4006 kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *)); 4007 kmem_free(rbp, sizeof (rib_bufpool_t)); 4008 } 4009 4010 void 4011 rib_rbufpool_destroy(rib_hca_t *hca, int ptype) 4012 { 4013 /* 4014 * Deregister the pool memory and free it. 4015 */ 4016 rib_rbufpool_deregister(hca, ptype); 4017 rib_rbufpool_free(hca, ptype); 4018 } 4019 4020 /* 4021 * Fetch a buffer from the pool of type specified in rdbuf->type. 
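 * RDMA_LONG_BUFFER requests are satisfied from the registered
 * buffer cache via rib_get_cache_buf(); SEND_BUFFER and RECV_BUFFER
 * requests come from the pre-registered pools via rib_rbuf_alloc().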
4022 */ 4023 static rdma_stat 4024 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf) 4025 { 4026 rib_lrc_entry_t *rlep; 4027 4028 if (rdbuf->type == RDMA_LONG_BUFFER) { 4029 rlep = rib_get_cache_buf(conn, rdbuf->len); 4030 rdbuf->rb_private = (caddr_t)rlep; 4031 rdbuf->addr = rlep->lrc_buf; 4032 rdbuf->handle = rlep->lrc_mhandle; 4033 return (RDMA_SUCCESS); 4034 } 4035 4036 rdbuf->addr = rib_rbuf_alloc(conn, rdbuf); 4037 if (rdbuf->addr) { 4038 switch (rdbuf->type) { 4039 case SEND_BUFFER: 4040 rdbuf->len = RPC_MSG_SZ; /* 1K */ 4041 break; 4042 case RECV_BUFFER: 4043 rdbuf->len = RPC_BUF_SIZE; /* 2K */ 4044 break; 4045 default: 4046 rdbuf->len = 0; 4047 } 4048 return (RDMA_SUCCESS); 4049 } else 4050 return (RDMA_FAILED); 4051 } 4052 4053 /* 4054 * Fetch a buffer of specified type. 4055 * Note that rdbuf->handle is mw's rkey. 4056 */ 4057 static void * 4058 rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf) 4059 { 4060 rib_qp_t *qp = ctoqp(conn); 4061 rib_hca_t *hca = qp->hca; 4062 rdma_btype ptype = rdbuf->type; 4063 void *buf; 4064 rib_bufpool_t *rbp = NULL; 4065 bufpool_t *bp; 4066 int i; 4067 4068 /* 4069 * Obtain pool address based on type of pool 4070 */ 4071 switch (ptype) { 4072 case SEND_BUFFER: 4073 rbp = hca->send_pool; 4074 break; 4075 case RECV_BUFFER: 4076 rbp = hca->recv_pool; 4077 break; 4078 default: 4079 return (NULL); 4080 } 4081 if (rbp == NULL) 4082 return (NULL); 4083 4084 bp = rbp->bpool; 4085 4086 mutex_enter(&bp->buflock); 4087 if (bp->buffree < 0) { 4088 mutex_exit(&bp->buflock); 4089 return (NULL); 4090 } 4091 4092 /* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. */ 4093 buf = bp->buflist[bp->buffree]; 4094 rdbuf->addr = buf; 4095 rdbuf->len = bp->rsize; 4096 for (i = bp->numelems - 1; i >= 0; i--) { 4097 if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) { 4098 rdbuf->handle.mrc_rmr = 4099 (uint32_t)rbp->mr_desc[i].md_rkey; 4100 rdbuf->handle.mrc_linfo = 4101 (uintptr_t)rbp->mr_hdl[i]; 4102 rdbuf->handle.mrc_lmr = 4103 (uint32_t)rbp->mr_desc[i].md_lkey; 4104 bp->buffree--; 4105 4106 mutex_exit(&bp->buflock); 4107 4108 return (buf); 4109 } 4110 } 4111 4112 mutex_exit(&bp->buflock); 4113 4114 return (NULL); 4115 } 4116 4117 static void 4118 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf) 4119 { 4120 4121 if (rdbuf->type == RDMA_LONG_BUFFER) { 4122 rib_free_cache_buf(conn, (rib_lrc_entry_t *)rdbuf->rb_private); 4123 rdbuf->rb_private = NULL; 4124 return; 4125 } 4126 rib_rbuf_free(conn, rdbuf->type, rdbuf->addr); 4127 } 4128 4129 static void 4130 rib_rbuf_free(CONN *conn, int ptype, void *buf) 4131 { 4132 rib_qp_t *qp = ctoqp(conn); 4133 rib_hca_t *hca = qp->hca; 4134 rib_bufpool_t *rbp = NULL; 4135 bufpool_t *bp; 4136 4137 /* 4138 * Obtain pool address based on type of pool 4139 */ 4140 switch (ptype) { 4141 case SEND_BUFFER: 4142 rbp = hca->send_pool; 4143 break; 4144 case RECV_BUFFER: 4145 rbp = hca->recv_pool; 4146 break; 4147 default: 4148 return; 4149 } 4150 if (rbp == NULL) 4151 return; 4152 4153 bp = rbp->bpool; 4154 4155 mutex_enter(&bp->buflock); 4156 if (++bp->buffree >= bp->numelems) { 4157 /* 4158 * Should never happen 4159 */ 4160 bp->buffree--; 4161 } else { 4162 bp->buflist[bp->buffree] = buf; 4163 } 4164 mutex_exit(&bp->buflock); 4165 } 4166 4167 static rdma_stat 4168 rib_add_connlist(CONN *cn, rib_conn_list_t *connlist) 4169 { 4170 rw_enter(&connlist->conn_lock, RW_WRITER); 4171 if (connlist->conn_hd) { 4172 cn->c_next = connlist->conn_hd; 4173 connlist->conn_hd->c_prev = cn; 4174 } 4175 connlist->conn_hd = cn; 4176 
static rdma_stat
rib_add_connlist(CONN *cn, rib_conn_list_t *connlist)
{
	rw_enter(&connlist->conn_lock, RW_WRITER);
	if (connlist->conn_hd) {
		cn->c_next = connlist->conn_hd;
		connlist->conn_hd->c_prev = cn;
	}
	connlist->conn_hd = cn;
	rw_exit(&connlist->conn_lock);

	return (RDMA_SUCCESS);
}

static rdma_stat
rib_rm_conn(CONN *cn, rib_conn_list_t *connlist)
{
	rw_enter(&connlist->conn_lock, RW_WRITER);
	if (cn->c_prev) {
		cn->c_prev->c_next = cn->c_next;
	}
	if (cn->c_next) {
		cn->c_next->c_prev = cn->c_prev;
	}
	if (connlist->conn_hd == cn)
		connlist->conn_hd = cn->c_next;
	rw_exit(&connlist->conn_lock);

	return (RDMA_SUCCESS);
}

/* ARGSUSED */
static rdma_stat
rib_conn_get(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr,
    int addr_type, void *handle, CONN **conn)
{
	rdma_stat status;
	rpcib_ping_t rpt;

	status = rib_connect(s_svcaddr, d_svcaddr, addr_type, &rpt, conn);
	return (status);
}

/*
 * rib_find_hca_connection
 *
 * If there is an existing connection to the specified address then it is
 * returned in conn, otherwise conn is set to NULL.
 * Also cleans up any connection that is in the error state.
 */
static int
rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr,
    struct netbuf *d_svcaddr, CONN **conn)
{
	CONN *cn;
	clock_t cv_stat, timout;

	*conn = NULL;
again:
	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
	cn = hca->cl_conn_list.conn_hd;
	while (cn != NULL) {
		/*
		 * First, clear up any connection in the ERROR state
		 */
		mutex_enter(&cn->c_lock);
		if (cn->c_state == C_ERROR_CONN) {
			if (cn->c_ref == 0) {
				/*
				 * Remove connection from list and destroy it.
				 */
				cn->c_state = C_DISCONN_PEND;
				mutex_exit(&cn->c_lock);
				rw_exit(&hca->cl_conn_list.conn_lock);
				rib_conn_close((void *)cn);
				goto again;
			}
			mutex_exit(&cn->c_lock);
			cn = cn->c_next;
			continue;
		}
		if (cn->c_state == C_DISCONN_PEND) {
			mutex_exit(&cn->c_lock);
			cn = cn->c_next;
			continue;
		}

		/*
		 * The source address is checked only if one is specified;
		 * this is the case for retries.
		 */
		if ((cn->c_raddr.len == d_svcaddr->len) &&
		    (bcmp(d_svcaddr->buf, cn->c_raddr.buf,
		    d_svcaddr->len) == 0) &&
		    ((s_svcaddr->len == 0) ||
		    ((cn->c_laddr.len == s_svcaddr->len) &&
		    (bcmp(s_svcaddr->buf, cn->c_laddr.buf,
		    s_svcaddr->len) == 0)))) {
			/*
			 * Our connection. Give up conn list lock
			 * as we are done traversing the list.
			 */
			rw_exit(&hca->cl_conn_list.conn_lock);
			if (cn->c_state == C_CONNECTED) {
				cn->c_ref++;	/* sharing a conn */
				mutex_exit(&cn->c_lock);
				*conn = cn;
				return (RDMA_SUCCESS);
			}
			if (cn->c_state == C_CONN_PEND) {
				/*
				 * Hold a reference to this conn before
				 * we give up the lock.
				 */
				cn->c_ref++;
				timout = ddi_get_lbolt() +
				    drv_usectohz(CONN_WAIT_TIME * 1000000);
				while ((cv_stat = cv_timedwait_sig(&cn->c_cv,
				    &cn->c_lock, timout)) > 0 &&
				    cn->c_state == C_CONN_PEND)
					;
				if (cv_stat == 0) {
					(void) rib_conn_release_locked(cn);
					return (RDMA_INTR);
				}
				if (cv_stat < 0) {
					(void) rib_conn_release_locked(cn);
					return (RDMA_TIMEDOUT);
				}
				if (cn->c_state == C_CONNECTED) {
					*conn = cn;
					mutex_exit(&cn->c_lock);
					return (RDMA_SUCCESS);
				} else {
					(void) rib_conn_release_locked(cn);
					return (RDMA_TIMEDOUT);
				}
			}
		}
		mutex_exit(&cn->c_lock);
		cn = cn->c_next;
	}
	rw_exit(&hca->cl_conn_list.conn_lock);
	*conn = NULL;
	return (RDMA_FAILED);
}

/*
 * Connection management.
 * IBTF does not support recycling of channels, so a connection is only
 * ever in one of four states: C_CONN_PEND, C_CONNECTED, C_ERROR_CONN or
 * C_DISCONN_PEND. There is no C_IDLE state.
 * C_CONN_PEND state: Connection establishment to the server is in progress.
 * C_CONNECTED state: A fully established connection is in C_CONNECTED state.
 * It has an RC channel associated with it. ibt_post_send/recv are allowed
 * only in this state.
 * C_ERROR_CONN state: A connection transitions to this state when WRs on the
 * channel are completed in error, an IBT_CM_EVENT_CONN_CLOSED event happens
 * on the channel, or an IBT_HCA_DETACH_EVENT occurs on the HCA.
 * C_DISCONN_PEND state: When a connection is in C_ERROR_CONN state and
 * c_ref drops to 0 (indicating that RPC has no more references to the
 * connection), the connection should be destroyed. A connection transitions
 * into this state when it is being destroyed.
 */
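/*
 * Illustrative sketch (not part of the driver logic): a typical client-side
 * walk through the states above, where src and dst are struct netbuf
 * addresses. rib_conn_get() either shares an existing C_CONNECTED conn,
 * waits on one that is C_CONN_PEND, or calls rib_connect() to set up a
 * new one:
 *
 *	CONN *conn;
 *
 *	if (rib_conn_get(&src, &dst, AF_INET, NULL, &conn) == RDMA_SUCCESS) {
 *		... post sends/recvs on conn ...
 *		(void) rib_conn_release(conn);
 *	}
 *
 * rib_conn_release() drops c_ref and, once it reaches zero, arms the idle
 * timeout that eventually reaps the connection (see below).
 */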
/* ARGSUSED */
static rdma_stat
rib_connect(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr,
    int addr_type, rpcib_ping_t *rpt, CONN **conn)
{
	CONN *cn;
	int status;
	rib_hca_t *hca;
	rib_qp_t *qp;
	int s_addr_len;
	char *s_addr_buf;

	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
		rw_enter(&hca->state_lock, RW_READER);
		if (hca->state != HCA_DETACHED) {
			status = rib_find_hca_connection(hca, s_svcaddr,
			    d_svcaddr, conn);
			rw_exit(&hca->state_lock);
			if ((status == RDMA_INTR) || (status == RDMA_SUCCESS)) {
				rw_exit(&rib_stat->hcas_list_lock);
				return (status);
			}
		} else
			rw_exit(&hca->state_lock);
	}
	rw_exit(&rib_stat->hcas_list_lock);

	/*
	 * No existing connection found, establish a new connection.
	 */
	bzero(rpt, sizeof (rpcib_ping_t));

	status = rib_ping_srv(addr_type, d_svcaddr, rpt);
	if (status != RDMA_SUCCESS) {
		return (RDMA_FAILED);
	}
	hca = rpt->hca;

	if (rpt->srcip.family == AF_INET) {
		s_addr_len = sizeof (rpt->srcip.un.ip4addr);
		s_addr_buf = (char *)&rpt->srcip.un.ip4addr;
	} else if (rpt->srcip.family == AF_INET6) {
		s_addr_len = sizeof (rpt->srcip.un.ip6addr);
		s_addr_buf = (char *)&rpt->srcip.un.ip6addr;
	} else {
		return (RDMA_FAILED);
	}

	/*
	 * Channel to server doesn't exist yet, create one.
	 */
	if (rib_clnt_create_chan(hca, d_svcaddr, &qp) != RDMA_SUCCESS) {
		return (RDMA_FAILED);
	}
	cn = qptoc(qp);
	cn->c_state = C_CONN_PEND;
	cn->c_ref = 1;

	cn->c_laddr.buf = kmem_alloc(s_addr_len, KM_SLEEP);
	bcopy(s_addr_buf, cn->c_laddr.buf, s_addr_len);
	cn->c_laddr.len = cn->c_laddr.maxlen = s_addr_len;

	if (rpt->srcip.family == AF_INET) {
		cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1, KM_SLEEP);
		(void) strcpy(cn->c_netid, RIBNETID_TCP);

		cn->c_addrmask.len = cn->c_addrmask.maxlen =
		    sizeof (struct sockaddr_in);
		cn->c_addrmask.buf = kmem_zalloc(cn->c_addrmask.len, KM_SLEEP);

		((struct sockaddr_in *)cn->c_addrmask.buf)->sin_addr.s_addr =
		    (uint32_t)~0;
		((struct sockaddr_in *)cn->c_addrmask.buf)->sin_family =
		    (ushort_t)~0;

	} else {
		cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1, KM_SLEEP);
		(void) strcpy(cn->c_netid, RIBNETID_TCP6);

		cn->c_addrmask.len = cn->c_addrmask.maxlen =
		    sizeof (struct sockaddr_in6);
		cn->c_addrmask.buf = kmem_zalloc(cn->c_addrmask.len, KM_SLEEP);

		(void) memset(
		    &((struct sockaddr_in6 *)cn->c_addrmask.buf)->sin6_addr,
		    (uchar_t)~0, sizeof (struct in6_addr));
		((struct sockaddr_in6 *)cn->c_addrmask.buf)->sin6_family =
		    (sa_family_t)~0;
	}

	/*
	 * Add to conn list.
	 * We had given up the READER lock. In the time since then,
	 * another thread might have created the connection we are
	 * trying to create here. But for now, that is quite all right -
	 * there might be two connections between a pair of hosts instead
	 * of one. If we really want to close that window, then we need
	 * to check the list again after acquiring the WRITER lock.
	 */
	(void) rib_add_connlist(cn, &hca->cl_conn_list);
	status = rib_conn_to_srv(hca, qp, rpt);
	mutex_enter(&cn->c_lock);

	if (cn->c_flags & C_CLOSE_PENDING) {
		/*
		 * This handles the case where the module or HCA
		 * detached while the connection was being established.
		 * In such a case close the connection immediately if
		 * this is the only reference.
		 */
		if (cn->c_ref == 1) {
			cn->c_ref--;
			cn->c_state = C_DISCONN_PEND;
			mutex_exit(&cn->c_lock);
			rib_conn_close((void *)cn);
			return (RDMA_FAILED);
		}

		/*
		 * Connection to be closed later when c_ref = 0
		 */
		status = RDMA_FAILED;
	}

	if (status == RDMA_SUCCESS) {
		cn->c_state = C_CONNECTED;
		*conn = cn;
	} else {
		cn->c_state = C_ERROR_CONN;
		cn->c_ref--;
	}
	cv_signal(&cn->c_cv);
	mutex_exit(&cn->c_lock);
	return (status);
}

static void
rib_conn_close(void *rarg)
{
	CONN *conn = (CONN *)rarg;
	rib_qp_t *qp = ctoqp(conn);

	mutex_enter(&conn->c_lock);
	if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) {

		conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING);

		/*
		 * Live connection in CONNECTED state.
		 */
		if (conn->c_state == C_CONNECTED) {
			conn->c_state = C_ERROR_CONN;
		}
		mutex_exit(&conn->c_lock);

		rib_close_a_channel(conn);

		mutex_enter(&conn->c_lock);
		conn->c_flags &= ~C_CLOSE_PENDING;
	}

	mutex_exit(&conn->c_lock);

	if (qp->mode == RIB_SERVER)
		(void) rib_disconnect_channel(conn,
		    &qp->hca->srv_conn_list);
	else
		(void) rib_disconnect_channel(conn,
		    &qp->hca->cl_conn_list);
}

static void
rib_conn_timeout_call(void *carg)
{
	time_t idle_time;
	CONN *conn = (CONN *)carg;
	rib_hca_t *hca = ctoqp(conn)->hca;
	int error;

	mutex_enter(&conn->c_lock);
	if ((conn->c_ref > 0) ||
	    (conn->c_state == C_DISCONN_PEND)) {
		conn->c_timeout = NULL;
		mutex_exit(&conn->c_lock);
		return;
	}

	idle_time = (gethrestime_sec() - conn->c_last_used);

	if ((idle_time <= rib_conn_timeout) &&
	    (conn->c_state != C_ERROR_CONN)) {
		/*
		 * There was activity after the timeout was armed;
		 * extend the conn's life, unless the conn is already
		 * in the error state.
		 */
		conn->c_timeout = timeout(rib_conn_timeout_call, conn,
		    SEC_TO_TICK(rib_conn_timeout - idle_time));
		mutex_exit(&conn->c_lock);
		return;
	}

	error = ddi_taskq_dispatch(hca->cleanup_helper, rib_conn_close,
	    (void *)conn, DDI_NOSLEEP);

	/*
	 * If the taskq dispatch above fails, then reset the timeout
	 * to try again after 10 secs.
	 */
	if (error != DDI_SUCCESS) {
		conn->c_timeout = timeout(rib_conn_timeout_call, conn,
		    SEC_TO_TICK(RDMA_CONN_REAP_RETRY));
		mutex_exit(&conn->c_lock);
		return;
	}

	conn->c_state = C_DISCONN_PEND;
	mutex_exit(&conn->c_lock);
}
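/*
 * Reap note: idle-connection reaping works in two halves.
 * rib_conn_release_locked() below arms a timeout once c_ref drops to zero;
 * when it fires, rib_conn_timeout_call() above either re-arms itself (if
 * the conn was used recently and is not in error) or hands the conn to the
 * cleanup taskq, which closes it via rib_conn_close().
 */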
static rdma_stat
rib_conn_release(CONN *conn)
{
	mutex_enter(&conn->c_lock);
	return (rib_conn_release_locked(conn));
}

/*
 * Expects conn->c_lock to be held on entry;
 * c_lock is released before returning.
 */
static rdma_stat
rib_conn_release_locked(CONN *conn)
{
	conn->c_ref--;

	conn->c_last_used = gethrestime_sec();
	if (conn->c_ref > 0) {
		mutex_exit(&conn->c_lock);
		return (RDMA_SUCCESS);
	}

	/*
	 * If a conn is C_ERROR_CONN, close the channel.
	 */
	if (conn->c_ref == 0 && conn->c_state == C_ERROR_CONN) {
		conn->c_state = C_DISCONN_PEND;
		mutex_exit(&conn->c_lock);
		rib_conn_close((void *)conn);
		return (RDMA_SUCCESS);
	}

	/*
	 * c_ref == 0, set a timeout for conn release
	 */
	if (conn->c_timeout == NULL) {
		conn->c_timeout = timeout(rib_conn_timeout_call, conn,
		    SEC_TO_TICK(rib_conn_timeout));
	}

	mutex_exit(&conn->c_lock);
	return (RDMA_SUCCESS);
}

/*
 * Add at front of list
 */
static struct rdma_done_list *
rdma_done_add(rib_qp_t *qp, uint32_t xid)
{
	struct rdma_done_list *rd;

	ASSERT(MUTEX_HELD(&qp->rdlist_lock));

	rd = kmem_alloc(sizeof (*rd), KM_SLEEP);
	rd->xid = xid;
	cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL);

	rd->prev = NULL;
	rd->next = qp->rdlist;
	if (qp->rdlist != NULL)
		qp->rdlist->prev = rd;
	qp->rdlist = rd;

	return (rd);
}

static void
rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd)
{
	struct rdma_done_list *r;

	ASSERT(MUTEX_HELD(&qp->rdlist_lock));

	r = rd->next;
	if (r != NULL) {
		r->prev = rd->prev;
	}

	r = rd->prev;
	if (r != NULL) {
		r->next = rd->next;
	} else {
		qp->rdlist = rd->next;
	}

	cv_destroy(&rd->rdma_done_cv);
	kmem_free(rd, sizeof (*rd));
}

static void
rdma_done_rem_list(rib_qp_t *qp)
{
	struct rdma_done_list *r, *n;

	mutex_enter(&qp->rdlist_lock);
	for (r = qp->rdlist; r != NULL; r = n) {
		n = r->next;
		rdma_done_rm(qp, r);
	}
	mutex_exit(&qp->rdlist_lock);
}

static void
rdma_done_notify(rib_qp_t *qp, uint32_t xid)
{
	struct rdma_done_list *r = qp->rdlist;

	ASSERT(MUTEX_HELD(&qp->rdlist_lock));

	while (r) {
		if (r->xid == xid) {
			cv_signal(&r->rdma_done_cv);
			return;
		} else {
			r = r->next;
		}
	}
	DTRACE_PROBE1(rpcib__i__donenotify__nomatchxid,
	    int, xid);
}
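/*
 * rdma_done list note: entries are keyed by RPC xid. A thread that needs
 * to wait for the peer's RDMA_DONE adds an entry with rdma_done_add() and
 * sleeps on its rdma_done_cv; rdma_done_notify() signals the matching entry
 * when the RDMA_DONE for that xid arrives, and rdma_done_rem_list() empties
 * the whole list for a qp.
 */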
/*
 * Expects conn->c_lock to be held by the caller.
 */
static void
rib_close_a_channel(CONN *conn)
{
	rib_qp_t *qp;
	qp = ctoqp(conn);

	if (qp->qp_hdl == NULL) {
		/* channel already freed */
		return;
	}

	/*
	 * Call ibt_close_rc_channel in blocking mode
	 * with no callbacks.
	 */
	(void) ibt_close_rc_channel(qp->qp_hdl, IBT_NOCALLBACKS,
	    NULL, 0, NULL, NULL, 0);
}

/*
 * Goes through all connections and closes the channel.
 * This will cause all the WRs on those channels to be
 * flushed.
 */
static void
rib_close_channels(rib_conn_list_t *connlist)
{
	CONN *conn, *tmp;

	rw_enter(&connlist->conn_lock, RW_READER);
	conn = connlist->conn_hd;
	while (conn != NULL) {
		mutex_enter(&conn->c_lock);
		tmp = conn->c_next;
		if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) {

			if (conn->c_state == C_CONN_PEND) {
				conn->c_flags |= C_CLOSE_PENDING;
				goto next;
			}

			conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING);

			/*
			 * Live connection in CONNECTED state.
			 */
			if (conn->c_state == C_CONNECTED)
				conn->c_state = C_ERROR_CONN;
			mutex_exit(&conn->c_lock);

			rib_close_a_channel(conn);

			mutex_enter(&conn->c_lock);
			conn->c_flags &= ~C_CLOSE_PENDING;
			/* Signal a pending rib_disconnect_channel() */
			cv_signal(&conn->c_cv);
		}
next:
		mutex_exit(&conn->c_lock);
		conn = tmp;
	}
	rw_exit(&connlist->conn_lock);
}

/*
 * Frees up all connections that are no longer being referenced
 */
static void
rib_purge_connlist(rib_conn_list_t *connlist)
{
	CONN *conn;

top:
	rw_enter(&connlist->conn_lock, RW_READER);
	conn = connlist->conn_hd;
	while (conn != NULL) {
		mutex_enter(&conn->c_lock);

		/*
		 * At this point connection is either in ERROR
		 * or DISCONN_PEND state. If in DISCONN_PEND state
		 * then some other thread is culling that connection.
		 * If not and if c_ref is 0, then destroy the connection.
		 */
		if (conn->c_ref == 0 &&
		    conn->c_state != C_DISCONN_PEND) {
			/*
			 * Cull the connection
			 */
			conn->c_state = C_DISCONN_PEND;
			mutex_exit(&conn->c_lock);
			rw_exit(&connlist->conn_lock);
			(void) rib_disconnect_channel(conn, connlist);
			goto top;
		} else {
			/*
			 * conn disconnect already scheduled or will
			 * happen from conn_release when c_ref drops to 0.
			 */
			mutex_exit(&conn->c_lock);
		}
		conn = conn->c_next;
	}
	rw_exit(&connlist->conn_lock);

	/*
	 * At this point, only connections with c_ref != 0 are on the list
	 */
}

/*
 * Free all the HCA resources and close
 * the hca.
 */
static void
rib_free_hca(rib_hca_t *hca)
{
	(void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl);
	(void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl);
	(void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl);
	(void) ibt_free_cq(hca->svc_scq->rib_cq_hdl);

	kmem_free(hca->clnt_rcq, sizeof (rib_cq_t));
	kmem_free(hca->clnt_scq, sizeof (rib_cq_t));
	kmem_free(hca->svc_rcq, sizeof (rib_cq_t));
	kmem_free(hca->svc_scq, sizeof (rib_cq_t));

	rib_rbufpool_destroy(hca, RECV_BUFFER);
	rib_rbufpool_destroy(hca, SEND_BUFFER);
	rib_destroy_cache(hca);
	if (rib_mod.rdma_count == 0)
		(void) rdma_unregister_mod(&rib_mod);
	(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
	(void) ibt_close_hca(hca->hca_hdl);
	hca->hca_hdl = NULL;
}

static void
rib_stop_hca_services(rib_hca_t *hca)
{
	rib_stop_services(hca);
	rib_close_channels(&hca->cl_conn_list);
	rib_close_channels(&hca->srv_conn_list);

	rib_purge_connlist(&hca->cl_conn_list);
	rib_purge_connlist(&hca->srv_conn_list);

	if ((rib_stat->hcas_list == NULL) && stats_enabled) {
		kstat_delete_byname_zone("unix", 0, "rpcib_cache",
		    GLOBAL_ZONEID);
		stats_enabled = FALSE;
	}

	rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
	if (hca->srv_conn_list.conn_hd == NULL &&
	    hca->cl_conn_list.conn_hd == NULL) {
		/*
		 * conn_lists are NULL, so destroy
		 * buffers, close hca and be done.
		 */
		rib_free_hca(hca);
	}
	rw_exit(&hca->cl_conn_list.conn_lock);
	rw_exit(&hca->srv_conn_list.conn_lock);

	if (hca->hca_hdl != NULL) {
		mutex_enter(&hca->inuse_lock);
		while (hca->inuse)
			cv_wait(&hca->cb_cv, &hca->inuse_lock);
		mutex_exit(&hca->inuse_lock);

		rib_free_hca(hca);
	}
	rw_destroy(&hca->bound_services_lock);

	if (hca->cleanup_helper != NULL) {
		ddi_taskq_destroy(hca->cleanup_helper);
		hca->cleanup_helper = NULL;
	}
}

/*
 * Cleans and closes up all uses of the HCA
 */
static void
rib_detach_hca(ibt_hca_hdl_t hca_hdl)
{
	rib_hca_t *hca = NULL;
	rib_hca_t **hcap;

	rw_enter(&rib_stat->hcas_list_lock, RW_WRITER);
	for (hcap = &rib_stat->hcas_list; *hcap; hcap = &(*hcap)->next) {
		hca = *hcap;
		rw_enter(&hca->state_lock, RW_WRITER);
		if (hca->hca_hdl == hca_hdl) {
			/*
			 * Mark as detached and remove from
			 * hca list.
			 */
			hca->state = HCA_DETACHED;
			*hcap = hca->next;
			rib_stat->nhca_inited--;
			rib_mod.rdma_count--;
			rw_exit(&hca->state_lock);
			break;
		}
		rw_exit(&hca->state_lock);
	}
	rw_exit(&rib_stat->hcas_list_lock);

	if (hca == NULL)
		return;
	ASSERT(hca->hca_hdl == hca_hdl);

	/*
	 * Stop all services on the HCA
	 * Go through cl_conn_list and close all rc_channels
	 * Go through srv_conn_list and close all rc_channels
	 * Free connections whose c_ref has dropped to 0
	 * Destroy all CQs
	 * Deregister and release all buffer pool memory after all
	 * connections are destroyed
	 * Free the protection domain
	 * ibt_close_hca()
	 */
	rib_stop_hca_services(hca);

	kmem_free(hca, sizeof (*hca));
}

static void
rib_server_side_cache_reclaim(void *argp)
{
	cache_avl_struct_t *rcas;
	rib_lrc_entry_t *rb;
	rib_hca_t *hca = (rib_hca_t *)argp;

	rw_enter(&hca->avl_rw_lock, RW_WRITER);
	rcas = avl_first(&hca->avl_tree);
	if (rcas != NULL)
		avl_remove(&hca->avl_tree, rcas);

	while (rcas != NULL) {
		while (rcas->r.forw != &rcas->r) {
			rcas->elements--;
			rb = rcas->r.forw;
			remque(rb);
			if (rb->registered)
				(void) rib_deregistermem_via_hca(hca,
				    rb->lrc_buf, rb->lrc_mhandle);

			hca->cache_allocation -= rb->lrc_len;
			kmem_free(rb->lrc_buf, rb->lrc_len);
			kmem_free(rb, sizeof (rib_lrc_entry_t));
		}
		mutex_destroy(&rcas->node_lock);
		kmem_cache_free(hca->server_side_cache, rcas);
		rcas = avl_first(&hca->avl_tree);
		if (rcas != NULL)
			avl_remove(&hca->avl_tree, rcas);
	}
	rw_exit(&hca->avl_rw_lock);
}

static void
rib_server_side_cache_cleanup(void *argp)
{
	cache_avl_struct_t *rcas;
	rib_lrc_entry_t *rb;
	rib_hca_t *hca = (rib_hca_t *)argp;

	mutex_enter(&hca->cache_allocation_lock);
	if (hca->cache_allocation < cache_limit) {
		mutex_exit(&hca->cache_allocation_lock);
		return;
	}
	mutex_exit(&hca->cache_allocation_lock);

	rw_enter(&hca->avl_rw_lock, RW_WRITER);
	rcas = avl_last(&hca->avl_tree);
	if (rcas != NULL)
		avl_remove(&hca->avl_tree, rcas);

	while (rcas != NULL) {
		while (rcas->r.forw != &rcas->r) {
			rcas->elements--;
			rb = rcas->r.forw;
			remque(rb);
			if (rb->registered)
				(void) rib_deregistermem_via_hca(hca,
				    rb->lrc_buf, rb->lrc_mhandle);

			hca->cache_allocation -= rb->lrc_len;

			kmem_free(rb->lrc_buf, rb->lrc_len);
			kmem_free(rb, sizeof (rib_lrc_entry_t));
		}
		mutex_destroy(&rcas->node_lock);
		if (hca->server_side_cache) {
			kmem_cache_free(hca->server_side_cache, rcas);
		}

		if (hca->cache_allocation < cache_limit) {
			rw_exit(&hca->avl_rw_lock);
			return;
		}

		rcas = avl_last(&hca->avl_tree);
		if (rcas != NULL)
			avl_remove(&hca->avl_tree, rcas);
	}
	rw_exit(&hca->avl_rw_lock);
}

static int
avl_compare(const void *t1, const void *t2)
{
	if (((cache_avl_struct_t *)t1)->len == ((cache_avl_struct_t *)t2)->len)
		return (0);

	if (((cache_avl_struct_t *)t1)->len < ((cache_avl_struct_t *)t2)->len)
		return (-1);

	return (1);
}

static void
rib_destroy_cache(rib_hca_t *hca)
{
	if (hca->avl_init) {
		rib_server_side_cache_reclaim((void *)hca);
		if (hca->server_side_cache) {
			kmem_cache_destroy(hca->server_side_cache);
			hca->server_side_cache = NULL;
		}
		avl_destroy(&hca->avl_tree);
		mutex_destroy(&hca->cache_allocation_lock);
		rw_destroy(&hca->avl_rw_lock);
	}
	hca->avl_init = FALSE;
}

static void
rib_force_cleanup(void *hca)
{
	if (((rib_hca_t *)hca)->cleanup_helper != NULL)
		(void) ddi_taskq_dispatch(
		    ((rib_hca_t *)hca)->cleanup_helper,
		    rib_server_side_cache_cleanup,
		    (void *)hca, DDI_NOSLEEP);
}
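/*
 * Server-side cache note: the cache is an AVL tree of cache_avl_struct_t
 * nodes keyed by buffer length (see avl_compare() above), and each node
 * carries a circular queue of pre-registered buffers of that length.
 * rib_get_cache_buf() below reuses a queued buffer on a hit, falls back to
 * a fresh allocation on a miss, and kicks off rib_force_cleanup() once
 * cache_allocation approaches cache_limit.
 */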
static rib_lrc_entry_t *
rib_get_cache_buf(CONN *conn, uint32_t len)
{
	cache_avl_struct_t cas, *rcas;
	rib_hca_t *hca = (ctoqp(conn))->hca;
	rib_lrc_entry_t *reply_buf;
	avl_index_t where = NULL;
	uint64_t c_alloc = 0;

	if (!hca->avl_init)
		goto error_alloc;

	cas.len = len;

	rw_enter(&hca->avl_rw_lock, RW_READER);

	mutex_enter(&hca->cache_allocation_lock);
	c_alloc = hca->cache_allocation;
	mutex_exit(&hca->cache_allocation_lock);

	if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas,
	    &where)) == NULL) {
		/* Am I above the cache limit */
		if ((c_alloc + len) >= cache_limit) {
			rib_force_cleanup((void *)hca);
			rw_exit(&hca->avl_rw_lock);
			mutex_enter(&hca->cache_allocation_lock);
			hca->cache_misses_above_the_limit++;
			mutex_exit(&hca->cache_allocation_lock);

			/* Allocate and register the buffer directly */
			goto error_alloc;
		}

		rw_exit(&hca->avl_rw_lock);
		rw_enter(&hca->avl_rw_lock, RW_WRITER);

		/* Recheck to make sure no other thread added the entry in */
		if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree,
		    &cas, &where)) == NULL) {
			/* Allocate an avl tree entry */
			rcas = (cache_avl_struct_t *)
			    kmem_cache_alloc(hca->server_side_cache, KM_SLEEP);

			bzero(rcas, sizeof (cache_avl_struct_t));
			rcas->elements = 0;
			rcas->r.forw = &rcas->r;
			rcas->r.back = &rcas->r;
			rcas->len = len;
			mutex_init(&rcas->node_lock, NULL, MUTEX_DEFAULT, NULL);
			avl_insert(&hca->avl_tree, rcas, where);
		}
	}

	mutex_enter(&rcas->node_lock);

	if (rcas->r.forw != &rcas->r && rcas->elements > 0) {
		reply_buf = rcas->r.forw;
		remque(reply_buf);
		rcas->elements--;
		mutex_exit(&rcas->node_lock);
		rw_exit(&hca->avl_rw_lock);

		mutex_enter(&hca->cache_allocation_lock);
		hca->cache_hits++;
		hca->cache_allocation -= len;
		mutex_exit(&hca->cache_allocation_lock);
	} else {
		/* Am I above the cache limit */
		mutex_exit(&rcas->node_lock);
		if ((c_alloc + len) >= cache_limit) {
			rib_force_cleanup((void *)hca);
			rw_exit(&hca->avl_rw_lock);

			mutex_enter(&hca->cache_allocation_lock);
			hca->cache_misses_above_the_limit++;
			mutex_exit(&hca->cache_allocation_lock);
			/* Allocate and register the buffer directly */
			goto error_alloc;
		}
		rw_exit(&hca->avl_rw_lock);
		mutex_enter(&hca->cache_allocation_lock);
		hca->cache_misses++;
		mutex_exit(&hca->cache_allocation_lock);
		/* Allocate a reply_buf entry */
		reply_buf = (rib_lrc_entry_t *)
		    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
		bzero(reply_buf, sizeof (rib_lrc_entry_t));
		reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
		reply_buf->lrc_len = len;
		reply_buf->registered = FALSE;
		reply_buf->avl_node = (void *)rcas;
	}

	return (reply_buf);

error_alloc:
	reply_buf = (rib_lrc_entry_t *)
	    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
	bzero(reply_buf, sizeof (rib_lrc_entry_t));
	reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
	reply_buf->lrc_len = len;
	reply_buf->registered = FALSE;
	reply_buf->avl_node = NULL;

	return (reply_buf);
}

/*
 * Return a pre-registered buffer back to the cache (without
 * deregistering the buffer).
 */
static void
rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf)
{
	cache_avl_struct_t cas, *rcas;
	avl_index_t where = NULL;
	rib_hca_t *hca = (ctoqp(conn))->hca;

	if (!hca->avl_init)
		goto error_free;

	cas.len = reg_buf->lrc_len;
	rw_enter(&hca->avl_rw_lock, RW_READER);
	if ((rcas = (cache_avl_struct_t *)
	    avl_find(&hca->avl_tree, &cas, &where)) == NULL) {
		rw_exit(&hca->avl_rw_lock);
		goto error_free;
	} else {
		cas.len = reg_buf->lrc_len;
		mutex_enter(&rcas->node_lock);
		insque(reg_buf, &rcas->r);
		rcas->elements++;
		mutex_exit(&rcas->node_lock);
		rw_exit(&hca->avl_rw_lock);
		mutex_enter(&hca->cache_allocation_lock);
		hca->cache_allocation += cas.len;
		mutex_exit(&hca->cache_allocation_lock);
	}

	return;

error_free:

	if (reg_buf->registered)
		(void) rib_deregistermem_via_hca(hca,
		    reg_buf->lrc_buf, reg_buf->lrc_mhandle);
	kmem_free(reg_buf->lrc_buf, reg_buf->lrc_len);
	kmem_free(reg_buf, sizeof (rib_lrc_entry_t));
}

static rdma_stat
rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf,
    uint_t buflen, struct mrc *buf_handle)
{
	ibt_mr_hdl_t mr_hdl = NULL;	/* memory region handle */
	ibt_mr_desc_t mr_desc;		/* vaddr, lkey, rkey */
	rdma_stat status;

	/*
	 * Note: ALL buffer pools use the same memory type RDMARW.
	 */
	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
	if (status == RDMA_SUCCESS) {
		buf_handle->mrc_linfo = (uint64_t)(uintptr_t)mr_hdl;
		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
	} else {
		buf_handle->mrc_linfo = NULL;
		buf_handle->mrc_lmr = 0;
		buf_handle->mrc_rmr = 0;
	}
	return (status);
}

/* ARGSUSED */
static rdma_stat
rib_deregistermemsync_via_hca(rib_hca_t *hca, caddr_t buf,
    struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle)
{

	(void) rib_deregistermem_via_hca(hca, buf, buf_handle);
	return (RDMA_SUCCESS);
}

/* ARGSUSED */
static rdma_stat
rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle)
{

	(void) ibt_deregister_mr(hca->hca_hdl,
	    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
	return (RDMA_SUCCESS);
}

/*
 * Check if the IP interface named by `lifrp' is RDMA-capable.
 */
static boolean_t
rpcib_rdma_capable_interface(struct lifreq *lifrp)
{
	char ifname[LIFNAMSIZ];
	char *cp;

	if (lifrp->lifr_type == IFT_IB)
		return (B_TRUE);

	/*
	 * Strip off the logical interface portion before getting
	 * intimate with the name.
	 */
	(void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ);
	if ((cp = strchr(ifname, ':')) != NULL)
		*cp = '\0';

	return (strcmp("lo0", ifname) == 0);
}

static int
rpcib_do_ip_ioctl(int cmd, int len, void *arg)
{
	vnode_t *kkvp, *vp;
	TIUSER *tiptr;
	struct strioctl iocb;
	k_sigset_t smask;
	int err = 0;

	if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kkvp) == 0) {
		if (t_kopen(NULL, kkvp->v_rdev, FREAD|FWRITE,
		    &tiptr, CRED()) == 0) {
			vp = tiptr->fp->f_vnode;
		} else {
			VN_RELE(kkvp);
			return (EPROTO);
		}
	} else {
		return (EPROTO);
	}

	iocb.ic_cmd = cmd;
	iocb.ic_timout = 0;
	iocb.ic_len = len;
	iocb.ic_dp = (caddr_t)arg;
	sigintr(&smask, 0);
	err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
	sigunintr(&smask);
	(void) t_kclose(tiptr, 0);
	VN_RELE(kkvp);
	return (err);
}
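/*
 * Illustrative sketch (not part of the driver logic): rpcib_do_ip_ioctl()
 * above wraps a transparent I_STR ioctl issued on a kernel handle to
 * /dev/udp, so a caller can, for example, fetch the interface count as
 * follows (rpcib_do_lifconf() below does exactly this before issuing
 * SIOCGLIFCONF):
 *
 *	struct lifnum lifn;
 *
 *	bzero(&lifn, sizeof (struct lifnum));
 *	lifn.lifn_family = AF_UNSPEC;
 *	if (rpcib_do_ip_ioctl(SIOCGLIFNUM, sizeof (struct lifnum),
 *	    &lifn) == 0) {
 *		... lifn.lifn_count is the current interface count ...
 *	}
 */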
/*
 * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'.
 * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes.
 */
static int
rpcib_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep)
{
	int err;
	struct lifnum lifn;

	bzero(&lifn, sizeof (struct lifnum));
	lifn.lifn_family = AF_UNSPEC;

	err = rpcib_do_ip_ioctl(SIOCGLIFNUM, sizeof (struct lifnum), &lifn);
	if (err != 0)
		return (err);

	/*
	 * Pad the interface count to account for additional interfaces that
	 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF.
	 */
	lifn.lifn_count += 4;

	bzero(lifcp, sizeof (struct lifconf));
	lifcp->lifc_family = AF_UNSPEC;
	lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq);
	lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP);

	err = rpcib_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp);
	if (err != 0) {
		kmem_free(lifcp->lifc_buf, *bufsizep);
		return (err);
	}
	return (0);
}

static boolean_t
rpcib_get_ib_addresses(rpcib_ipaddrs_t *addrs4, rpcib_ipaddrs_t *addrs6)
{
	uint_t i, nifs;
	uint_t bufsize;
	struct lifconf lifc;
	struct lifreq *lifrp;
	struct sockaddr_in *sinp;
	struct sockaddr_in6 *sin6p;

	bzero(addrs4, sizeof (rpcib_ipaddrs_t));
	bzero(addrs6, sizeof (rpcib_ipaddrs_t));

	if (rpcib_do_lifconf(&lifc, &bufsize) != 0)
		return (B_FALSE);

	if ((nifs = lifc.lifc_len / sizeof (struct lifreq)) == 0) {
		kmem_free(lifc.lifc_buf, bufsize);
		return (B_FALSE);
	}

	/*
	 * Worst case is that all of the addresses are IB-capable and have
	 * the same address family, so size our buffers accordingly.
	 */
	addrs4->ri_size = nifs * sizeof (struct sockaddr_in);
	addrs4->ri_list = kmem_zalloc(addrs4->ri_size, KM_SLEEP);
	addrs6->ri_size = nifs * sizeof (struct sockaddr_in6);
	addrs6->ri_list = kmem_zalloc(addrs6->ri_size, KM_SLEEP);

	for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) {
		if (!rpcib_rdma_capable_interface(lifrp))
			continue;

		if (lifrp->lifr_addr.ss_family == AF_INET) {
			sinp = addrs4->ri_list;
			bcopy(&lifrp->lifr_addr, &sinp[addrs4->ri_count++],
			    sizeof (struct sockaddr_in));
		} else if (lifrp->lifr_addr.ss_family == AF_INET6) {
			sin6p = addrs6->ri_list;
			bcopy(&lifrp->lifr_addr, &sin6p[addrs6->ri_count++],
			    sizeof (struct sockaddr_in6));
		}
	}

	kmem_free(lifc.lifc_buf, bufsize);
	return (B_TRUE);
}

/* ARGSUSED */
static int
rpcib_cache_kstat_update(kstat_t *ksp, int rw)
{
	rib_hca_t *hca;

	if (KSTAT_WRITE == rw) {
		return (EACCES);
	}

	rpcib_kstat.cache_limit.value.ui64 =
	    (uint64_t)cache_limit;
	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
		rpcib_kstat.cache_allocation.value.ui64 +=
		    (uint64_t)hca->cache_allocation;
		rpcib_kstat.cache_hits.value.ui64 +=
		    (uint64_t)hca->cache_hits;
		rpcib_kstat.cache_misses.value.ui64 +=
		    (uint64_t)hca->cache_misses;
		rpcib_kstat.cache_misses_above_the_limit.value.ui64 +=
		    (uint64_t)hca->cache_misses_above_the_limit;
	}
	rw_exit(&rib_stat->hcas_list_lock);
	return (0);
}
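/*
 * Observability note: the update routine above folds the per-HCA counters
 * into the rpcib_kstat template. Assuming the kstat is created elsewhere
 * in this file under module "unix" with name "rpcib_cache" (consistent
 * with the kstat_delete_byname_zone() call in rib_stop_hca_services()),
 * the counters can be read from userland with, for example:
 *
 *	$ kstat -m unix -n rpcib_cache
 */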