1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright 2013 Nexenta Systems, Inc. All rights reserved. 24 */ 25 26 /* 27 * Copyright (c) 2007, The Ohio State University. All rights reserved. 28 * 29 * Portions of this source code are developed by the team members of 30 * The Ohio State University's Network-Based Computing Laboratory (NBCL), 31 * headed by Professor Dhabaleswar K. (DK) Panda. 32 * 33 * Acknowledgements to contributions from developers: 34 * Ranjit Noronha: noronha@cse.ohio-state.edu 35 * Lei Chai : chail@cse.ohio-state.edu 36 * Weikuan Yu : yuw@cse.ohio-state.edu 37 * 38 */ 39 40 /* 41 * The rpcib plugin. Implements the interface for RDMATF's 42 * interaction with IBTF. 43 */ 44 45 #include <sys/param.h> 46 #include <sys/types.h> 47 #include <sys/user.h> 48 #include <sys/systm.h> 49 #include <sys/sysmacros.h> 50 #include <sys/proc.h> 51 #include <sys/socket.h> 52 #include <sys/file.h> 53 #include <sys/stream.h> 54 #include <sys/strsubr.h> 55 #include <sys/stropts.h> 56 #include <sys/errno.h> 57 #include <sys/kmem.h> 58 #include <sys/debug.h> 59 #include <sys/pathname.h> 60 #include <sys/kstat.h> 61 #include <sys/t_lock.h> 62 #include <sys/ddi.h> 63 #include <sys/cmn_err.h> 64 #include <sys/time.h> 65 #include <sys/isa_defs.h> 66 #include <sys/callb.h> 67 #include <sys/sunddi.h> 68 #include <sys/sunndi.h> 69 #include <sys/sdt.h> 70 #include <sys/ib/ibtl/ibti.h> 71 #include <rpc/rpc.h> 72 #include <rpc/ib.h> 73 #include <sys/modctl.h> 74 #include <sys/kstr.h> 75 #include <sys/sockio.h> 76 #include <sys/vnode.h> 77 #include <sys/tiuser.h> 78 #include <net/if.h> 79 #include <net/if_types.h> 80 #include <sys/cred.h> 81 #include <rpc/rpc_rdma.h> 82 #include <nfs/nfs.h> 83 #include <sys/atomic.h> 84 85 #define NFS_RDMA_PORT 20049 86 87 88 /* 89 * Convenience structures for connection management 90 */ 91 typedef struct rpcib_ipaddrs { 92 void *ri_list; /* pointer to list of addresses */ 93 uint_t ri_count; /* number of addresses in list */ 94 uint_t ri_size; /* size of ri_list in bytes */ 95 } rpcib_ipaddrs_t; 96 97 98 typedef struct rpcib_ping { 99 rib_hca_t *hca; 100 ibt_path_info_t path; 101 ibt_ip_addr_t srcip; 102 ibt_ip_addr_t dstip; 103 } rpcib_ping_t; 104 105 /* 106 * Prototype declarations for driver ops 107 */ 108 static int rpcib_attach(dev_info_t *, ddi_attach_cmd_t); 109 static int rpcib_getinfo(dev_info_t *, ddi_info_cmd_t, 110 void *, void **); 111 static int rpcib_detach(dev_info_t *, ddi_detach_cmd_t); 112 static boolean_t rpcib_rdma_capable_interface(struct lifreq *); 113 static int rpcib_do_ip_ioctl(int, int, void *); 114 static boolean_t rpcib_get_ib_addresses(rpcib_ipaddrs_t *,
rpcib_ipaddrs_t *); 115 static int rpcib_cache_kstat_update(kstat_t *, int); 116 static void rib_force_cleanup(void *); 117 static void rib_stop_hca_services(rib_hca_t *); 118 static void rib_attach_hca(void); 119 static int rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr, 120 struct netbuf *d_svcaddr, CONN **conn); 121 122 struct { 123 kstat_named_t cache_limit; 124 kstat_named_t cache_allocation; 125 kstat_named_t cache_hits; 126 kstat_named_t cache_misses; 127 kstat_named_t cache_misses_above_the_limit; 128 } rpcib_kstat = { 129 {"cache_limit", KSTAT_DATA_UINT64 }, 130 {"cache_allocation", KSTAT_DATA_UINT64 }, 131 {"cache_hits", KSTAT_DATA_UINT64 }, 132 {"cache_misses", KSTAT_DATA_UINT64 }, 133 {"cache_misses_above_the_limit", KSTAT_DATA_UINT64 }, 134 }; 135 136 /* rpcib cb_ops */ 137 static struct cb_ops rpcib_cbops = { 138 nulldev, /* open */ 139 nulldev, /* close */ 140 nodev, /* strategy */ 141 nodev, /* print */ 142 nodev, /* dump */ 143 nodev, /* read */ 144 nodev, /* write */ 145 nodev, /* ioctl */ 146 nodev, /* devmap */ 147 nodev, /* mmap */ 148 nodev, /* segmap */ 149 nochpoll, /* poll */ 150 ddi_prop_op, /* prop_op */ 151 NULL, /* stream */ 152 D_MP, /* cb_flag */ 153 CB_REV, /* rev */ 154 nodev, /* int (*cb_aread)() */ 155 nodev /* int (*cb_awrite)() */ 156 }; 157 158 /* 159 * Device options 160 */ 161 static struct dev_ops rpcib_ops = { 162 DEVO_REV, /* devo_rev, */ 163 0, /* refcnt */ 164 rpcib_getinfo, /* info */ 165 nulldev, /* identify */ 166 nulldev, /* probe */ 167 rpcib_attach, /* attach */ 168 rpcib_detach, /* detach */ 169 nodev, /* reset */ 170 &rpcib_cbops, /* driver ops - devctl interfaces */ 171 NULL, /* bus operations */ 172 NULL, /* power */ 173 ddi_quiesce_not_needed, /* quiesce */ 174 }; 175 176 /* 177 * Module linkage information. 178 */ 179 180 static struct modldrv rib_modldrv = { 181 &mod_driverops, /* Driver module */ 182 "RPCIB plugin driver", /* Driver name and version */ 183 &rpcib_ops, /* Driver ops */ 184 }; 185 186 static struct modlinkage rib_modlinkage = { 187 MODREV_1, 188 (void *)&rib_modldrv, 189 NULL 190 }; 191 192 typedef struct rib_lrc_entry { 193 struct rib_lrc_entry *forw; 194 struct rib_lrc_entry *back; 195 char *lrc_buf; 196 197 uint32_t lrc_len; 198 void *avl_node; 199 bool_t registered; 200 201 struct mrc lrc_mhandle; 202 bool_t lrc_on_freed_list; 203 } rib_lrc_entry_t; 204 205 typedef struct cache_struct { 206 rib_lrc_entry_t r; 207 uint32_t len; 208 uint32_t elements; 209 kmutex_t node_lock; 210 avl_node_t avl_link; 211 } cache_avl_struct_t; 212 213 uint64_t cache_limit = 100 * 1024 * 1024; 214 static uint64_t cache_watermark = 80 * 1024 * 1024; 215 static bool_t stats_enabled = FALSE; 216 217 static uint64_t max_unsignaled_rws = 5; 218 int nfs_rdma_port = NFS_RDMA_PORT; 219 220 #define RIBNETID_TCP "tcp" 221 #define RIBNETID_TCP6 "tcp6" 222 223 /* 224 * rib_stat: private data pointer used when registering 225 * with the IBTF. It is returned to the consumer 226 * in all callbacks. 227 */ 228 static rpcib_state_t *rib_stat = NULL; 229 230 #define RNR_RETRIES IBT_RNR_RETRY_1 231 #define MAX_PORTS 2 232 #define RDMA_DUMMY_WRID 0x4D3A1D4D3A1D 233 #define RDMA_CONN_REAP_RETRY 10 /* 10 secs */ 234 235 int preposted_rbufs = RDMA_BUFS_GRANT; 236 int send_threshold = 1; 237 238 /* 239 * Old cards with Tavor driver have limited memory footprint 240 * when booted in 32bit. The rib_max_rbufs tunable can be 241 * tuned for more buffers if needed. 
242 */ 243 244 #if !defined(_ELF64) && !defined(__sparc) 245 int rib_max_rbufs = MAX_BUFS; 246 #else 247 int rib_max_rbufs = 10 * MAX_BUFS; 248 #endif /* !(_ELF64) && !(__sparc) */ 249 250 int rib_conn_timeout = 60 * 12; /* 12 minutes */ 251 252 /* 253 * State of the plugin. 254 * ACCEPT = accepting new connections and requests. 255 * NO_ACCEPT = not accepting new connections and requests. 256 * This should eventually move to rpcib_state_t structure, since this 257 * will tell in which state the plugin is for a particular type of service 258 * like NFS, NLM or v4 Callback daemon. The plugin might be in accept 259 * state for one and in no_accept state for the other. 260 */ 261 int plugin_state; 262 kmutex_t plugin_state_lock; 263 264 ldi_ident_t rpcib_li; 265 266 /* 267 * RPCIB RDMATF operations 268 */ 269 static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle); 270 static rdma_stat rib_disconnect(CONN *conn); 271 static void rib_listen(struct rdma_svc_data *rd); 272 static void rib_listen_stop(struct rdma_svc_data *rd); 273 static rdma_stat rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf, 274 uint_t buflen, struct mrc *buf_handle); 275 static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf, 276 struct mrc buf_handle); 277 static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, 278 caddr_t buf, uint_t buflen, struct mrc *buf_handle); 279 static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, 280 struct mrc buf_handle); 281 static rdma_stat rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf, 282 uint_t buflen, struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, 283 void *lrc); 284 static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf, 285 struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *); 286 static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, 287 caddr_t buf, int len, int cpu); 288 289 static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf); 290 291 static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf); 292 static void *rib_rbuf_alloc(CONN *, rdma_buf_t *); 293 294 static void rib_rbuf_free(CONN *conn, int ptype, void *buf); 295 296 static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid); 297 static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid); 298 static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid); 299 static rdma_stat rib_post_resp_remove(CONN *conn, uint32_t msgid); 300 static rdma_stat rib_post_recv(CONN *conn, struct clist *cl); 301 static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid); 302 static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait); 303 static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait); 304 static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rpcib_ping_t *); 305 static rdma_stat rib_conn_get(struct netbuf *, struct netbuf *, 306 int addr_type, void *, CONN **); 307 static rdma_stat rib_conn_release(CONN *conn); 308 static rdma_stat rib_connect(struct netbuf *, struct netbuf *, int, 309 rpcib_ping_t *, CONN **); 310 static rdma_stat rib_getinfo(rdma_info_t *info); 311 312 static rib_lrc_entry_t *rib_get_cache_buf(CONN *conn, uint32_t len); 313 static void rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *buf); 314 static void rib_destroy_cache(rib_hca_t *hca); 315 static void rib_server_side_cache_reclaim(void *argp); 316 static int avl_compare(const void *t1, const void *t2); 317 318 static void rib_stop_services(rib_hca_t
*); 319 static void rib_close_channels(rib_conn_list_t *); 320 static void rib_conn_close(void *); 321 static void rib_recv_rele(rib_qp_t *); 322 static rdma_stat rib_conn_release_locked(CONN *conn); 323 324 /* 325 * RPCIB addressing operations 326 */ 327 328 /* 329 * RDMA operations the RPCIB module exports 330 */ 331 static rdmaops_t rib_ops = { 332 rib_reachable, 333 rib_conn_get, 334 rib_conn_release, 335 rib_listen, 336 rib_listen_stop, 337 rib_registermem, 338 rib_deregistermem, 339 rib_registermemsync, 340 rib_deregistermemsync, 341 rib_syncmem, 342 rib_reg_buf_alloc, 343 rib_reg_buf_free, 344 rib_send, 345 rib_send_resp, 346 rib_post_resp, 347 rib_post_resp_remove, 348 rib_post_recv, 349 rib_recv, 350 rib_read, 351 rib_write, 352 rib_getinfo, 353 }; 354 355 /* 356 * RDMATF RPCIB plugin details 357 */ 358 static rdma_mod_t rib_mod = { 359 "ibtf", /* api name */ 360 RDMATF_VERS_1, 361 0, 362 &rib_ops, /* rdma op vector for ibtf */ 363 }; 364 365 static rdma_stat rpcib_open_hcas(rpcib_state_t *); 366 static rdma_stat rib_qp_init(rib_qp_t *, int); 367 static void rib_svc_scq_handler(ibt_cq_hdl_t, void *); 368 static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *); 369 static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *); 370 static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *); 371 static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num); 372 static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t adsp, caddr_t, uint_t, 373 ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_mr_desc_t *); 374 static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t, 375 ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t); 376 static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, rpcib_ping_t *); 377 static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *, 378 rib_qp_t **); 379 static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t, 380 rib_qp_t **); 381 static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *); 382 static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *); 383 static int rib_free_sendwait(struct send_wid *); 384 static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid); 385 static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd); 386 static void rdma_done_rem_list(rib_qp_t *); 387 static void rdma_done_notify(rib_qp_t *qp, uint32_t xid); 388 389 static void rib_async_handler(void *, 390 ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *); 391 static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *); 392 static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *); 393 static int rib_free_svc_recv(struct svc_recv *); 394 static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t); 395 static void rib_free_wid(struct recv_wid *); 396 static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *); 397 static void rib_detach_hca(ibt_hca_hdl_t); 398 static void rib_close_a_channel(CONN *); 399 static void rib_send_hold(rib_qp_t *); 400 static void rib_send_rele(rib_qp_t *); 401 402 /* 403 * Registration with IBTF as a consumer 404 */ 405 static struct ibt_clnt_modinfo_s rib_modinfo = { 406 IBTI_V_CURR, 407 IBT_GENERIC, 408 rib_async_handler, /* async event handler */ 409 NULL, /* Memory Region Handler */ 410 "nfs/ib" 411 }; 412 413 /* 414 * Global structure 415 */ 416 417 typedef struct rpcib_s { 418 dev_info_t *rpcib_dip; 419 kmutex_t rpcib_mutex; 420 } rpcib_t; 421 422 rpcib_t rpcib; 423 424 /* 425 * /etc/system controlled variable to control 426 * debugging in rpcib kernel
module. 427 * Set it to values greater than 1 to control 428 * the amount of debugging messages required. 429 */ 430 int rib_debug = 0; 431 432 int 433 _init(void) 434 { 435 int error; 436 437 error = mod_install((struct modlinkage *)&rib_modlinkage); 438 if (error != 0) { 439 /* 440 * Could not load module 441 */ 442 return (error); 443 } 444 mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL); 445 return (0); 446 } 447 448 int 449 _fini() 450 { 451 int status; 452 453 /* 454 * Remove module 455 */ 456 if ((status = mod_remove(&rib_modlinkage)) != 0) { 457 return (status); 458 } 459 mutex_destroy(&plugin_state_lock); 460 return (0); 461 } 462 463 int 464 _info(struct modinfo *modinfop) 465 { 466 return (mod_info(&rib_modlinkage, modinfop)); 467 } 468 469 /* 470 * rpcib_getinfo() 471 * Given the device number, return the devinfo pointer or the 472 * instance number. 473 * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach. 474 */ 475 476 /*ARGSUSED*/ 477 static int 478 rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) 479 { 480 int ret = DDI_SUCCESS; 481 482 switch (cmd) { 483 case DDI_INFO_DEVT2DEVINFO: 484 if (rpcib.rpcib_dip != NULL) 485 *result = rpcib.rpcib_dip; 486 else { 487 *result = NULL; 488 ret = DDI_FAILURE; 489 } 490 break; 491 492 case DDI_INFO_DEVT2INSTANCE: 493 *result = NULL; 494 break; 495 496 default: 497 ret = DDI_FAILURE; 498 } 499 return (ret); 500 } 501 502 static void 503 rpcib_free_hca_list() 504 { 505 rib_hca_t *hca, *hcap; 506 507 rw_enter(&rib_stat->hcas_list_lock, RW_WRITER); 508 hca = rib_stat->hcas_list; 509 rib_stat->hcas_list = NULL; 510 rw_exit(&rib_stat->hcas_list_lock); 511 while (hca != NULL) { 512 rw_enter(&hca->state_lock, RW_WRITER); 513 hcap = hca; 514 hca = hca->next; 515 rib_stat->nhca_inited--; 516 rib_mod.rdma_count--; 517 hcap->state = HCA_DETACHED; 518 rw_exit(&hcap->state_lock); 519 rib_stop_hca_services(hcap); 520 521 kmem_free(hcap, sizeof (*hcap)); 522 } 523 } 524 525 static rdma_stat 526 rpcib_free_service_list() 527 { 528 rib_service_t *service; 529 ibt_status_t ret; 530 531 rw_enter(&rib_stat->service_list_lock, RW_WRITER); 532 while (rib_stat->service_list != NULL) { 533 service = rib_stat->service_list; 534 ret = ibt_unbind_all_services(service->srv_hdl); 535 if (ret != IBT_SUCCESS) { 536 rw_exit(&rib_stat->service_list_lock); 537 #ifdef DEBUG 538 cmn_err(CE_NOTE, "rpcib_free_service_list: " 539 "ibt_unbind_all_services failed (%d)\n", (int)ret); 540 #endif 541 return (RDMA_FAILED); 542 } 543 ret = ibt_deregister_service(rib_stat->ibt_clnt_hdl, 544 service->srv_hdl); 545 if (ret != IBT_SUCCESS) { 546 rw_exit(&rib_stat->service_list_lock); 547 #ifdef DEBUG 548 cmn_err(CE_NOTE, "rpcib_free_service_list: " 549 "ibt_deregister_service failed (%d)\n", (int)ret); 550 #endif 551 return (RDMA_FAILED); 552 } 553 rib_stat->service_list = service->next; 554 kmem_free(service, sizeof (rib_service_t)); 555 } 556 rw_exit(&rib_stat->service_list_lock); 557 558 return (RDMA_SUCCESS); 559 } 560 561 static int 562 rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 563 { 564 ibt_status_t ibt_status; 565 rdma_stat r_status; 566 567 switch (cmd) { 568 case DDI_ATTACH: 569 break; 570 case DDI_RESUME: 571 return (DDI_SUCCESS); 572 default: 573 return (DDI_FAILURE); 574 } 575 576 mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL); 577 578 mutex_enter(&rpcib.rpcib_mutex); 579 if (rpcib.rpcib_dip != NULL) { 580 mutex_exit(&rpcib.rpcib_mutex); 581 return (DDI_FAILURE); 582 } 583 rpcib.rpcib_dip = dip; 584
mutex_exit(&rpcib.rpcib_mutex); 585 /* 586 * Create the "rpcib" minor-node. 587 */ 588 if (ddi_create_minor_node(dip, 589 "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) { 590 /* Error message, no cmn_err as they print on console */ 591 return (DDI_FAILURE); 592 } 593 594 if (rib_stat == NULL) { 595 rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP); 596 mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL); 597 rw_init(&rib_stat->hcas_list_lock, NULL, RW_DRIVER, NULL); 598 mutex_init(&rib_stat->listen_lock, NULL, MUTEX_DRIVER, NULL); 599 } 600 601 rib_stat->hca_count = ibt_get_hca_list(NULL); 602 if (rib_stat->hca_count < 1) { 603 mutex_destroy(&rib_stat->listen_lock); 604 rw_destroy(&rib_stat->hcas_list_lock); 605 mutex_destroy(&rib_stat->open_hca_lock); 606 kmem_free(rib_stat, sizeof (*rib_stat)); 607 rib_stat = NULL; 608 return (DDI_FAILURE); 609 } 610 611 ibt_status = ibt_attach(&rib_modinfo, dip, 612 (void *)rib_stat, &rib_stat->ibt_clnt_hdl); 613 614 if (ibt_status != IBT_SUCCESS) { 615 mutex_destroy(&rib_stat->listen_lock); 616 rw_destroy(&rib_stat->hcas_list_lock); 617 mutex_destroy(&rib_stat->open_hca_lock); 618 kmem_free(rib_stat, sizeof (*rib_stat)); 619 rib_stat = NULL; 620 return (DDI_FAILURE); 621 } 622 623 rib_stat->service_list = NULL; 624 rw_init(&rib_stat->service_list_lock, NULL, RW_DRIVER, NULL); 625 mutex_enter(&rib_stat->open_hca_lock); 626 if (rpcib_open_hcas(rib_stat) != RDMA_SUCCESS) { 627 mutex_exit(&rib_stat->open_hca_lock); 628 goto open_fail; 629 } 630 mutex_exit(&rib_stat->open_hca_lock); 631 632 if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) != 633 DDI_PROP_SUCCESS) { 634 cmn_err(CE_WARN, "rpcib_attach: ddi-no-autodetach prop update " 635 "failed."); 636 goto register_fail; 637 } 638 639 /* 640 * Register with rdmatf 641 */ 642 r_status = rdma_register_mod(&rib_mod); 643 if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) { 644 cmn_err(CE_WARN, "rpcib_attach:rdma_register_mod failed, " 645 "status = %d", r_status); 646 goto register_fail; 647 } 648 649 return (DDI_SUCCESS); 650 651 register_fail: 652 653 open_fail: 654 (void) ibt_detach(rib_stat->ibt_clnt_hdl); 655 rpcib_free_hca_list(); 656 (void) rpcib_free_service_list(); 657 mutex_destroy(&rib_stat->listen_lock); 658 rw_destroy(&rib_stat->hcas_list_lock); 659 mutex_destroy(&rib_stat->open_hca_lock); 660 rw_destroy(&rib_stat->service_list_lock); 661 kmem_free(rib_stat, sizeof (*rib_stat)); 662 rib_stat = NULL; 663 return (DDI_FAILURE); 664 } 665 666 /*ARGSUSED*/ 667 static int 668 rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 669 { 670 switch (cmd) { 671 672 case DDI_DETACH: 673 break; 674 675 case DDI_SUSPEND: 676 default: 677 return (DDI_FAILURE); 678 } 679 680 /* 681 * Detach the hca and free resources 682 */ 683 mutex_enter(&plugin_state_lock); 684 plugin_state = NO_ACCEPT; 685 mutex_exit(&plugin_state_lock); 686 687 if (rpcib_free_service_list() != RDMA_SUCCESS) 688 return (DDI_FAILURE); 689 rpcib_free_hca_list(); 690 691 (void) ibt_detach(rib_stat->ibt_clnt_hdl); 692 mutex_destroy(&rib_stat->listen_lock); 693 rw_destroy(&rib_stat->hcas_list_lock); 694 mutex_destroy(&rib_stat->open_hca_lock); 695 rw_destroy(&rib_stat->service_list_lock); 696 697 kmem_free(rib_stat, sizeof (*rib_stat)); 698 rib_stat = NULL; 699 700 mutex_enter(&rpcib.rpcib_mutex); 701 rpcib.rpcib_dip = NULL; 702 mutex_exit(&rpcib.rpcib_mutex); 703 mutex_destroy(&rpcib.rpcib_mutex); 704 return (DDI_SUCCESS); 705 } 706 707 708 static void rib_rbufpool_free(rib_hca_t *, int); 709 static 
void rib_rbufpool_deregister(rib_hca_t *, int); 710 static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype); 711 static struct reply *rib_addreplylist(rib_qp_t *, uint32_t); 712 static rdma_stat rib_rem_replylist(rib_qp_t *); 713 static int rib_remreply(rib_qp_t *, struct reply *); 714 static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *); 715 static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *); 716 717 718 /* 719 * One CQ pair per HCA 720 */ 721 static rdma_stat 722 rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler, 723 rib_cq_t **cqp) 724 { 725 rib_cq_t *cq; 726 ibt_cq_attr_t cq_attr; 727 uint32_t real_size; 728 ibt_status_t status; 729 rdma_stat error = RDMA_SUCCESS; 730 731 cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP); 732 cq->rib_hca = hca; 733 bzero(&cq_attr, sizeof (cq_attr)); 734 cq_attr.cq_size = cq_size; 735 cq_attr.cq_flags = IBT_CQ_NO_FLAGS; 736 status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl, 737 &real_size); 738 if (status != IBT_SUCCESS) { 739 cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed," 740 " status=%d", status); 741 error = RDMA_FAILED; 742 goto fail; 743 } 744 ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, hca); 745 746 /* 747 * Enable CQ callbacks. CQ Callbacks are single shot 748 * (e.g. you have to call ibt_enable_cq_notify() 749 * after each callback to get another one). 750 */ 751 status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION); 752 if (status != IBT_SUCCESS) { 753 cmn_err(CE_WARN, "rib_create_cq: " 754 "enable_cq_notify failed, status %d", status); 755 error = RDMA_FAILED; 756 goto fail; 757 } 758 *cqp = cq; 759 760 return (error); 761 fail: 762 if (cq->rib_cq_hdl) 763 (void) ibt_free_cq(cq->rib_cq_hdl); 764 if (cq) 765 kmem_free(cq, sizeof (rib_cq_t)); 766 return (error); 767 } 768 769 /* 770 * rpcib_find_hca 771 * 772 * Caller should have already locked the hcas_lock before calling 773 * this function. 774 */ 775 static rib_hca_t * 776 rpcib_find_hca(rpcib_state_t *ribstat, ib_guid_t guid) 777 { 778 rib_hca_t *hca = ribstat->hcas_list; 779 780 while (hca && hca->hca_guid != guid) 781 hca = hca->next; 782 783 return (hca); 784 } 785 786 static rdma_stat 787 rpcib_open_hcas(rpcib_state_t *ribstat) 788 { 789 rib_hca_t *hca; 790 ibt_status_t ibt_status; 791 rdma_stat status; 792 ibt_hca_portinfo_t *pinfop; 793 ibt_pd_flags_t pd_flags = IBT_PD_NO_FLAGS; 794 uint_t size, cq_size; 795 int i; 796 kstat_t *ksp; 797 cache_avl_struct_t example_avl_node; 798 char rssc_name[32]; 799 int old_nhca_inited = ribstat->nhca_inited; 800 ib_guid_t *hca_guids; 801 802 ASSERT(MUTEX_HELD(&ribstat->open_hca_lock)); 803 804 ribstat->hca_count = ibt_get_hca_list(&hca_guids); 805 if (ribstat->hca_count == 0) 806 return (RDMA_FAILED); 807 808 rw_enter(&ribstat->hcas_list_lock, RW_WRITER); 809 /* 810 * Open a hca and setup for RDMA 811 */ 812 for (i = 0; i < ribstat->hca_count; i++) { 813 if (rpcib_find_hca(ribstat, hca_guids[i])) 814 continue; 815 hca = kmem_zalloc(sizeof (rib_hca_t), KM_SLEEP); 816 817 ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl, 818 hca_guids[i], &hca->hca_hdl); 819 if (ibt_status != IBT_SUCCESS) { 820 kmem_free(hca, sizeof (rib_hca_t)); 821 continue; 822 } 823 hca->hca_guid = hca_guids[i]; 824 hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl; 825 hca->state = HCA_INITED; 826 827 /* 828 * query HCA info 829 */ 830 ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs); 831 if (ibt_status != IBT_SUCCESS) { 832 goto fail1; 833 } 834 835 /* 836 * One PD (Protection Domain) per HCA. 
837 * A qp is allowed to access a memory region 838 * only when it's in the same PD as that of 839 * the memory region. 840 */ 841 ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl); 842 if (ibt_status != IBT_SUCCESS) { 843 goto fail1; 844 } 845 846 /* 847 * query HCA ports 848 */ 849 ibt_status = ibt_query_hca_ports(hca->hca_hdl, 850 0, &pinfop, &hca->hca_nports, &size); 851 if (ibt_status != IBT_SUCCESS) { 852 goto fail2; 853 } 854 hca->hca_ports = pinfop; 855 hca->hca_pinfosz = size; 856 pinfop = NULL; 857 858 cq_size = DEF_CQ_SIZE; /* default cq size */ 859 /* 860 * Create 2 pairs of cq's (1 pair for client 861 * and the other pair for server) on this hca. 862 * If number of qp's gets too large, then several 863 * cq's will be needed. 864 */ 865 status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler, 866 &hca->svc_rcq); 867 if (status != RDMA_SUCCESS) { 868 goto fail3; 869 } 870 871 status = rib_create_cq(hca, cq_size, rib_svc_scq_handler, 872 &hca->svc_scq); 873 if (status != RDMA_SUCCESS) { 874 goto fail3; 875 } 876 877 status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler, 878 &hca->clnt_rcq); 879 if (status != RDMA_SUCCESS) { 880 goto fail3; 881 } 882 883 status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler, 884 &hca->clnt_scq); 885 if (status != RDMA_SUCCESS) { 886 goto fail3; 887 } 888 889 /* 890 * Create buffer pools. 891 * Note rib_rbuf_create also allocates memory windows. 892 */ 893 hca->recv_pool = rib_rbufpool_create(hca, 894 RECV_BUFFER, rib_max_rbufs); 895 if (hca->recv_pool == NULL) { 896 goto fail3; 897 } 898 899 hca->send_pool = rib_rbufpool_create(hca, 900 SEND_BUFFER, rib_max_rbufs); 901 if (hca->send_pool == NULL) { 902 rib_rbufpool_destroy(hca, RECV_BUFFER); 903 goto fail3; 904 } 905 906 if (hca->server_side_cache == NULL) { 907 (void) sprintf(rssc_name, 908 "rib_srvr_cache_%llx", 909 (long long unsigned int) hca->hca_guid); 910 hca->server_side_cache = kmem_cache_create( 911 rssc_name, 912 sizeof (cache_avl_struct_t), 0, 913 NULL, 914 NULL, 915 rib_server_side_cache_reclaim, 916 hca, NULL, 0); 917 } 918 919 avl_create(&hca->avl_tree, 920 avl_compare, 921 sizeof (cache_avl_struct_t), 922 (uint_t)(uintptr_t)&example_avl_node.avl_link- 923 (uint_t)(uintptr_t)&example_avl_node); 924 925 rw_init(&hca->bound_services_lock, NULL, RW_DRIVER, 926 hca->iblock); 927 rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock); 928 rw_init(&hca->avl_rw_lock, 929 NULL, RW_DRIVER, hca->iblock); 930 mutex_init(&hca->cache_allocation_lock, 931 NULL, MUTEX_DRIVER, NULL); 932 hca->avl_init = TRUE; 933 934 /* Create kstats for the cache */ 935 ASSERT(INGLOBALZONE(curproc)); 936 937 if (!stats_enabled) { 938 ksp = kstat_create_zone("unix", 0, "rpcib_cache", "rpc", 939 KSTAT_TYPE_NAMED, 940 sizeof (rpcib_kstat) / sizeof (kstat_named_t), 941 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, 942 GLOBAL_ZONEID); 943 if (ksp) { 944 ksp->ks_data = (void *) &rpcib_kstat; 945 ksp->ks_update = rpcib_cache_kstat_update; 946 kstat_install(ksp); 947 stats_enabled = TRUE; 948 } 949 } 950 if (hca->cleanup_helper == NULL) { 951 char tq_name[sizeof (hca->hca_guid) * 2 + 1]; 952 953 (void) snprintf(tq_name, sizeof (tq_name), "%llX", 954 (unsigned long long int) hca->hca_guid); 955 hca->cleanup_helper = ddi_taskq_create(NULL, 956 tq_name, 1, TASKQ_DEFAULTPRI, 0); 957 } 958 959 mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); 960 cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL); 961 rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER, 962 hca->iblock); 963 
rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER, 964 hca->iblock); 965 mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock); 966 hca->inuse = TRUE; 967 968 hca->next = ribstat->hcas_list; 969 ribstat->hcas_list = hca; 970 ribstat->nhca_inited++; 971 ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz); 972 continue; 973 974 fail3: 975 ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz); 976 fail2: 977 (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl); 978 fail1: 979 (void) ibt_close_hca(hca->hca_hdl); 980 kmem_free(hca, sizeof (rib_hca_t)); 981 } 982 rw_exit(&ribstat->hcas_list_lock); 983 ibt_free_hca_list(hca_guids, ribstat->hca_count); 984 rib_mod.rdma_count = rib_stat->nhca_inited; 985 986 /* 987 * return success if at least one new hca has been configured. 988 */ 989 if (ribstat->nhca_inited != old_nhca_inited) 990 return (RDMA_SUCCESS); 991 else 992 return (RDMA_FAILED); 993 } 994 995 /* 996 * Callback routines 997 */ 998 999 /* 1000 * SCQ handlers 1001 */ 1002 /* ARGSUSED */ 1003 static void 1004 rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1005 { 1006 ibt_status_t ibt_status; 1007 ibt_wc_t wc; 1008 struct send_wid *wd; 1009 CONN *conn; 1010 rib_qp_t *qp; 1011 int i; 1012 1013 /* 1014 * Re-enable cq notify here to avoid missing any 1015 * completion queue notification. 1016 */ 1017 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1018 1019 ibt_status = IBT_SUCCESS; 1020 while (ibt_status != IBT_CQ_EMPTY) { 1021 bzero(&wc, sizeof (wc)); 1022 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1023 if (ibt_status != IBT_SUCCESS) 1024 return; 1025 1026 /* 1027 * Got a send completion 1028 */ 1029 if (wc.wc_id != RDMA_DUMMY_WRID) { 1030 wd = (struct send_wid *)(uintptr_t)wc.wc_id; 1031 qp = wd->qp; 1032 conn = qptoc(qp); 1033 1034 mutex_enter(&wd->sendwait_lock); 1035 switch (wc.wc_status) { 1036 case IBT_WC_SUCCESS: 1037 wd->status = RDMA_SUCCESS; 1038 break; 1039 default: 1040 /* 1041 * RC Send Q Error Code Local state Remote State 1042 * ==================== =========== ============ 1043 * IBT_WC_BAD_RESPONSE_ERR ERROR None 1044 * IBT_WC_LOCAL_LEN_ERR ERROR None 1045 * IBT_WC_LOCAL_CHAN_OP_ERR ERROR None 1046 * IBT_WC_LOCAL_PROTECT_ERR ERROR None 1047 * IBT_WC_MEM_WIN_BIND_ERR ERROR None 1048 * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR ERROR 1049 * IBT_WC_REMOTE_ACCESS_ERR ERROR ERROR 1050 * IBT_WC_REMOTE_OP_ERR ERROR ERROR 1051 * IBT_WC_RNR_NAK_TIMEOUT_ERR ERROR None 1052 * IBT_WC_TRANS_TIMEOUT_ERR ERROR None 1053 * IBT_WC_WR_FLUSHED_ERR ERROR None 1054 */ 1055 /* 1056 * Channel in error state. Set connection to 1057 * ERROR and cleanup will happen either from 1058 * conn_release or from rib_conn_get 1059 */ 1060 wd->status = RDMA_FAILED; 1061 mutex_enter(&conn->c_lock); 1062 if (conn->c_state != C_DISCONN_PEND) 1063 conn->c_state = C_ERROR_CONN; 1064 mutex_exit(&conn->c_lock); 1065 break; 1066 } 1067 1068 if (wd->cv_sig == 1) { 1069 /* 1070 * Notify poster 1071 */ 1072 cv_signal(&wd->wait_cv); 1073 mutex_exit(&wd->sendwait_lock); 1074 } else { 1075 /* 1076 * Poster not waiting for notification. 
1077 * Free the send buffers and send_wid 1078 */ 1079 for (i = 0; i < wd->nsbufs; i++) { 1080 rib_rbuf_free(qptoc(wd->qp), 1081 SEND_BUFFER, 1082 (void *)(uintptr_t)wd->sbufaddr[i]); 1083 } 1084 1085 /* decrement the send ref count */ 1086 rib_send_rele(qp); 1087 1088 mutex_exit(&wd->sendwait_lock); 1089 (void) rib_free_sendwait(wd); 1090 } 1091 } 1092 } 1093 } 1094 1095 /* ARGSUSED */ 1096 static void 1097 rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1098 { 1099 ibt_status_t ibt_status; 1100 ibt_wc_t wc; 1101 struct send_wid *wd; 1102 rib_qp_t *qp; 1103 CONN *conn; 1104 int i; 1105 1106 /* 1107 * Re-enable cq notify here to avoid missing any 1108 * completion queue notification. 1109 */ 1110 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1111 1112 ibt_status = IBT_SUCCESS; 1113 while (ibt_status != IBT_CQ_EMPTY) { 1114 bzero(&wc, sizeof (wc)); 1115 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1116 if (ibt_status != IBT_SUCCESS) 1117 return; 1118 1119 /* 1120 * Got a send completion 1121 */ 1122 if (wc.wc_id != RDMA_DUMMY_WRID) { 1123 wd = (struct send_wid *)(uintptr_t)wc.wc_id; 1124 qp = wd->qp; 1125 conn = qptoc(qp); 1126 mutex_enter(&wd->sendwait_lock); 1127 1128 switch (wc.wc_status) { 1129 case IBT_WC_SUCCESS: 1130 wd->status = RDMA_SUCCESS; 1131 break; 1132 default: 1133 /* 1134 * Channel in error state. Set connection to 1135 * ERROR and cleanup will happen either from 1136 * conn_release or conn timeout. 1137 */ 1138 wd->status = RDMA_FAILED; 1139 mutex_enter(&conn->c_lock); 1140 if (conn->c_state != C_DISCONN_PEND) 1141 conn->c_state = C_ERROR_CONN; 1142 mutex_exit(&conn->c_lock); 1143 break; 1144 } 1145 1146 if (wd->cv_sig == 1) { 1147 /* 1148 * Update completion status and notify poster 1149 */ 1150 cv_signal(&wd->wait_cv); 1151 mutex_exit(&wd->sendwait_lock); 1152 } else { 1153 /* 1154 * Poster not waiting for notification. 1155 * Free the send buffers and send_wid 1156 */ 1157 for (i = 0; i < wd->nsbufs; i++) { 1158 rib_rbuf_free(qptoc(wd->qp), 1159 SEND_BUFFER, 1160 (void *)(uintptr_t)wd->sbufaddr[i]); 1161 } 1162 1163 /* decrement the send ref count */ 1164 rib_send_rele(qp); 1165 1166 mutex_exit(&wd->sendwait_lock); 1167 (void) rib_free_sendwait(wd); 1168 } 1169 } 1170 } 1171 } 1172 1173 /* 1174 * RCQ handler 1175 */ 1176 /* ARGSUSED */ 1177 static void 1178 rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1179 { 1180 rib_qp_t *qp; 1181 ibt_status_t ibt_status; 1182 ibt_wc_t wc; 1183 struct recv_wid *rwid; 1184 1185 /* 1186 * Re-enable cq notify here to avoid missing any 1187 * completion queue notification. 1188 */ 1189 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1190 1191 ibt_status = IBT_SUCCESS; 1192 while (ibt_status != IBT_CQ_EMPTY) { 1193 bzero(&wc, sizeof (wc)); 1194 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1195 if (ibt_status != IBT_SUCCESS) 1196 return; 1197 1198 rwid = (struct recv_wid *)(uintptr_t)wc.wc_id; 1199 qp = rwid->qp; 1200 1201 if (wc.wc_status == IBT_WC_SUCCESS) { 1202 XDR inxdrs, *xdrs; 1203 uint_t xid, vers, op, find_xid = 0; 1204 struct reply *r; 1205 CONN *conn = qptoc(qp); 1206 uint32_t rdma_credit = 0; 1207 1208 xdrs = &inxdrs; 1209 xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr, 1210 wc.wc_bytes_xfer, XDR_DECODE); 1211 /* 1212 * Treat xid as opaque (xid is the first entity 1213 * in the rpc rdma message). 1214 */ 1215 xid = *(uint32_t *)(uintptr_t)rwid->addr; 1216 1217 /* Skip xid and set the xdr position accordingly. 
*/ 1218 XDR_SETPOS(xdrs, sizeof (uint32_t)); 1219 (void) xdr_u_int(xdrs, &vers); 1220 (void) xdr_u_int(xdrs, &rdma_credit); 1221 (void) xdr_u_int(xdrs, &op); 1222 XDR_DESTROY(xdrs); 1223 1224 if (vers != RPCRDMA_VERS) { 1225 /* 1226 * Invalid RPC/RDMA version. Cannot 1227 * interoperate. Set connection to 1228 * ERROR state and bail out. 1229 */ 1230 mutex_enter(&conn->c_lock); 1231 if (conn->c_state != C_DISCONN_PEND) 1232 conn->c_state = C_ERROR_CONN; 1233 mutex_exit(&conn->c_lock); 1234 rib_rbuf_free(conn, RECV_BUFFER, 1235 (void *)(uintptr_t)rwid->addr); 1236 rib_free_wid(rwid); 1237 rib_recv_rele(qp); 1238 continue; 1239 } 1240 1241 mutex_enter(&qp->replylist_lock); 1242 for (r = qp->replylist; r != NULL; r = r->next) { 1243 if (r->xid == xid) { 1244 find_xid = 1; 1245 switch (op) { 1246 case RDMA_MSG: 1247 case RDMA_NOMSG: 1248 case RDMA_MSGP: 1249 r->status = RDMA_SUCCESS; 1250 r->vaddr_cq = rwid->addr; 1251 r->bytes_xfer = 1252 wc.wc_bytes_xfer; 1253 cv_signal(&r->wait_cv); 1254 break; 1255 default: 1256 rib_rbuf_free(qptoc(qp), 1257 RECV_BUFFER, 1258 (void *)(uintptr_t) 1259 rwid->addr); 1260 break; 1261 } 1262 break; 1263 } 1264 } 1265 mutex_exit(&qp->replylist_lock); 1266 if (find_xid == 0) { 1267 /* RPC caller not waiting for reply */ 1268 1269 DTRACE_PROBE1(rpcib__i__nomatchxid1, 1270 int, xid); 1271 1272 rib_rbuf_free(qptoc(qp), RECV_BUFFER, 1273 (void *)(uintptr_t)rwid->addr); 1274 } 1275 } else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) { 1276 CONN *conn = qptoc(qp); 1277 1278 /* 1279 * Connection being flushed. Just free 1280 * the posted buffer 1281 */ 1282 rib_rbuf_free(conn, RECV_BUFFER, 1283 (void *)(uintptr_t)rwid->addr); 1284 } else { 1285 CONN *conn = qptoc(qp); 1286 /* 1287 * RC Recv Q Error Code Local state Remote State 1288 * ==================== =========== ============ 1289 * IBT_WC_LOCAL_ACCESS_ERR ERROR ERROR when NAK recvd 1290 * IBT_WC_LOCAL_LEN_ERR ERROR ERROR when NAK recvd 1291 * IBT_WC_LOCAL_PROTECT_ERR ERROR ERROR when NAK recvd 1292 * IBT_WC_LOCAL_CHAN_OP_ERR ERROR ERROR when NAK recvd 1293 * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR ERROR when NAK recvd 1294 * IBT_WC_WR_FLUSHED_ERR None None 1295 */ 1296 /* 1297 * Channel in error state. Set connection 1298 * in ERROR state. 1299 */ 1300 mutex_enter(&conn->c_lock); 1301 if (conn->c_state != C_DISCONN_PEND) 1302 conn->c_state = C_ERROR_CONN; 1303 mutex_exit(&conn->c_lock); 1304 rib_rbuf_free(conn, RECV_BUFFER, 1305 (void *)(uintptr_t)rwid->addr); 1306 } 1307 rib_free_wid(rwid); 1308 rib_recv_rele(qp); 1309 } 1310 } 1311 1312 /* Server side */ 1313 /* ARGSUSED */ 1314 static void 1315 rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 1316 { 1317 rdma_recv_data_t *rdp; 1318 rib_qp_t *qp; 1319 ibt_status_t ibt_status; 1320 ibt_wc_t wc; 1321 struct svc_recv *s_recvp; 1322 CONN *conn; 1323 mblk_t *mp; 1324 1325 /* 1326 * Re-enable cq notify here to avoid missing any 1327 * completion queue notification. 
1328 */ 1329 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION); 1330 1331 ibt_status = IBT_SUCCESS; 1332 while (ibt_status != IBT_CQ_EMPTY) { 1333 bzero(&wc, sizeof (wc)); 1334 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL); 1335 if (ibt_status != IBT_SUCCESS) 1336 return; 1337 1338 s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id; 1339 qp = s_recvp->qp; 1340 conn = qptoc(qp); 1341 1342 if (wc.wc_status == IBT_WC_SUCCESS) { 1343 XDR inxdrs, *xdrs; 1344 uint_t xid, vers, op; 1345 uint32_t rdma_credit; 1346 1347 xdrs = &inxdrs; 1348 /* s_recvp->vaddr stores data */ 1349 xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr, 1350 wc.wc_bytes_xfer, XDR_DECODE); 1351 1352 /* 1353 * Treat xid as opaque (xid is the first entity 1354 * in the rpc rdma message). 1355 */ 1356 xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr; 1357 /* Skip xid and set the xdr position accordingly. */ 1358 XDR_SETPOS(xdrs, sizeof (uint32_t)); 1359 if (!xdr_u_int(xdrs, &vers) || 1360 !xdr_u_int(xdrs, &rdma_credit) || 1361 !xdr_u_int(xdrs, &op)) { 1362 rib_rbuf_free(conn, RECV_BUFFER, 1363 (void *)(uintptr_t)s_recvp->vaddr); 1364 XDR_DESTROY(xdrs); 1365 rib_recv_rele(qp); 1366 (void) rib_free_svc_recv(s_recvp); 1367 continue; 1368 } 1369 XDR_DESTROY(xdrs); 1370 1371 if (vers != RPCRDMA_VERS) { 1372 /* 1373 * Invalid RPC/RDMA version. 1374 * Drop rpc rdma message. 1375 */ 1376 rib_rbuf_free(conn, RECV_BUFFER, 1377 (void *)(uintptr_t)s_recvp->vaddr); 1378 rib_recv_rele(qp); 1379 (void) rib_free_svc_recv(s_recvp); 1380 continue; 1381 } 1382 /* 1383 * Is this for RDMA_DONE? 1384 */ 1385 if (op == RDMA_DONE) { 1386 rib_rbuf_free(conn, RECV_BUFFER, 1387 (void *)(uintptr_t)s_recvp->vaddr); 1388 /* 1389 * Wake up the thread waiting on 1390 * a RDMA_DONE for xid 1391 */ 1392 mutex_enter(&qp->rdlist_lock); 1393 rdma_done_notify(qp, xid); 1394 mutex_exit(&qp->rdlist_lock); 1395 rib_recv_rele(qp); 1396 (void) rib_free_svc_recv(s_recvp); 1397 continue; 1398 } 1399 1400 mutex_enter(&plugin_state_lock); 1401 mutex_enter(&conn->c_lock); 1402 if ((plugin_state == ACCEPT) && 1403 (conn->c_state == C_CONNECTED)) { 1404 conn->c_ref++; 1405 mutex_exit(&conn->c_lock); 1406 while ((mp = allocb(sizeof (*rdp), BPRI_LO)) 1407 == NULL) 1408 (void) strwaitbuf( 1409 sizeof (*rdp), BPRI_LO); 1410 /* 1411 * Plugin is in accept state, hence the master 1412 * transport queue for this is still accepting 1413 * requests. Hence we can call svc_queuereq to 1414 * queue this received msg. 1415 */ 1416 rdp = (rdma_recv_data_t *)mp->b_rptr; 1417 rdp->conn = conn; 1418 rdp->rpcmsg.addr = 1419 (caddr_t)(uintptr_t)s_recvp->vaddr; 1420 rdp->rpcmsg.type = RECV_BUFFER; 1421 rdp->rpcmsg.len = wc.wc_bytes_xfer; 1422 rdp->status = wc.wc_status; 1423 mp->b_wptr += sizeof (*rdp); 1424 (void) svc_queuereq((queue_t *)rib_stat->q, mp, 1425 FALSE); 1426 mutex_exit(&plugin_state_lock); 1427 } else { 1428 /* 1429 * The master transport for this is going 1430 * away and the queue is not accepting any more 1431 * requests for krpc, so don't do anything, just 1432 * free the msg.
1433 */ 1434 mutex_exit(&conn->c_lock); 1435 mutex_exit(&plugin_state_lock); 1436 rib_rbuf_free(conn, RECV_BUFFER, 1437 (void *)(uintptr_t)s_recvp->vaddr); 1438 } 1439 } else { 1440 rib_rbuf_free(conn, RECV_BUFFER, 1441 (void *)(uintptr_t)s_recvp->vaddr); 1442 } 1443 rib_recv_rele(qp); 1444 (void) rib_free_svc_recv(s_recvp); 1445 } 1446 } 1447 1448 static void 1449 rib_attach_hca() 1450 { 1451 mutex_enter(&rib_stat->open_hca_lock); 1452 (void) rpcib_open_hcas(rib_stat); 1453 rib_listen(NULL); 1454 mutex_exit(&rib_stat->open_hca_lock); 1455 } 1456 1457 /* 1458 * Handles DR event of IBT_HCA_DETACH_EVENT. 1459 */ 1460 /* ARGSUSED */ 1461 static void 1462 rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl, 1463 ibt_async_code_t code, ibt_async_event_t *event) 1464 { 1465 switch (code) { 1466 case IBT_HCA_ATTACH_EVENT: 1467 rib_attach_hca(); 1468 break; 1469 case IBT_HCA_DETACH_EVENT: 1470 rib_detach_hca(hca_hdl); 1471 #ifdef DEBUG 1472 cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n"); 1473 #endif 1474 break; 1475 case IBT_EVENT_PORT_UP: 1476 /* 1477 * A port is up. We should call rib_listen() since there is 1478 * a chance that rib_listen() may have failed during 1479 * rib_attach_hca() because the port had not been up yet. 1480 */ 1481 rib_listen(NULL); 1482 #ifdef DEBUG 1483 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n"); 1484 #endif 1485 break; 1486 #ifdef DEBUG 1487 case IBT_EVENT_PATH_MIGRATED: 1488 cmn_err(CE_NOTE, "rib_async_handler(): " 1489 "IBT_EVENT_PATH_MIGRATED\n"); 1490 break; 1491 case IBT_EVENT_SQD: 1492 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n"); 1493 break; 1494 case IBT_EVENT_COM_EST: 1495 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n"); 1496 break; 1497 case IBT_ERROR_CATASTROPHIC_CHAN: 1498 cmn_err(CE_NOTE, "rib_async_handler(): " 1499 "IBT_ERROR_CATASTROPHIC_CHAN\n"); 1500 break; 1501 case IBT_ERROR_INVALID_REQUEST_CHAN: 1502 cmn_err(CE_NOTE, "rib_async_handler(): " 1503 "IBT_ERROR_INVALID_REQUEST_CHAN\n"); 1504 break; 1505 case IBT_ERROR_ACCESS_VIOLATION_CHAN: 1506 cmn_err(CE_NOTE, "rib_async_handler(): " 1507 "IBT_ERROR_ACCESS_VIOLATION_CHAN\n"); 1508 break; 1509 case IBT_ERROR_PATH_MIGRATE_REQ: 1510 cmn_err(CE_NOTE, "rib_async_handler(): " 1511 "IBT_ERROR_PATH_MIGRATE_REQ\n"); 1512 break; 1513 case IBT_ERROR_CQ: 1514 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n"); 1515 break; 1516 case IBT_ERROR_PORT_DOWN: 1517 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n"); 1518 break; 1519 case IBT_ASYNC_OPAQUE1: 1520 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n"); 1521 break; 1522 case IBT_ASYNC_OPAQUE2: 1523 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n"); 1524 break; 1525 case IBT_ASYNC_OPAQUE3: 1526 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n"); 1527 break; 1528 case IBT_ASYNC_OPAQUE4: 1529 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n"); 1530 break; 1531 #endif 1532 default: 1533 break; 1534 } 1535 } 1536 1537 /* 1538 * Client's reachable function. 
1539 */ 1540 static rdma_stat 1541 rib_reachable(int addr_type, struct netbuf *raddr, void **handle) 1542 { 1543 rdma_stat status; 1544 rpcib_ping_t rpt; 1545 struct netbuf saddr; 1546 CONN *conn; 1547 1548 bzero(&saddr, sizeof (struct netbuf)); 1549 status = rib_connect(&saddr, raddr, addr_type, &rpt, &conn); 1550 1551 if (status == RDMA_SUCCESS) { 1552 *handle = (void *)rpt.hca; 1553 /* release the reference */ 1554 (void) rib_conn_release(conn); 1555 return (RDMA_SUCCESS); 1556 } else { 1557 *handle = NULL; 1558 DTRACE_PROBE(rpcib__i__pingfailed); 1559 return (RDMA_FAILED); 1560 } 1561 } 1562 1563 /* Client side qp creation */ 1564 static rdma_stat 1565 rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp) 1566 { 1567 rib_qp_t *kqp = NULL; 1568 CONN *conn; 1569 rdma_clnt_cred_ctrl_t *cc_info; 1570 1571 ASSERT(qp != NULL); 1572 *qp = NULL; 1573 1574 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP); 1575 conn = qptoc(kqp); 1576 kqp->hca = hca; 1577 kqp->rdmaconn.c_rdmamod = &rib_mod; 1578 kqp->rdmaconn.c_private = (caddr_t)kqp; 1579 1580 kqp->mode = RIB_CLIENT; 1581 kqp->chan_flags = IBT_BLOCKING; 1582 conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP); 1583 bcopy(raddr->buf, conn->c_raddr.buf, raddr->len); 1584 conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len; 1585 /* 1586 * Initialize 1587 */ 1588 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL); 1589 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL); 1590 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1591 cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL); 1592 mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1593 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock); 1594 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1595 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); 1596 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL); 1597 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock); 1598 /* 1599 * Initialize the client credit control 1600 * portion of the rdmaconn struct. 
1601 */ 1602 kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT; 1603 cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc; 1604 cc_info->clnt_cc_granted_ops = 0; 1605 cc_info->clnt_cc_in_flight_ops = 0; 1606 cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL); 1607 1608 *qp = kqp; 1609 return (RDMA_SUCCESS); 1610 } 1611 1612 /* Server side qp creation */ 1613 static rdma_stat 1614 rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp) 1615 { 1616 rib_qp_t *kqp = NULL; 1617 ibt_chan_sizes_t chan_sizes; 1618 ibt_rc_chan_alloc_args_t qp_attr; 1619 ibt_status_t ibt_status; 1620 rdma_srv_cred_ctrl_t *cc_info; 1621 1622 *qp = NULL; 1623 1624 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP); 1625 kqp->hca = hca; 1626 kqp->port_num = port; 1627 kqp->rdmaconn.c_rdmamod = &rib_mod; 1628 kqp->rdmaconn.c_private = (caddr_t)kqp; 1629 1630 /* 1631 * Create the qp handle 1632 */ 1633 bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t)); 1634 qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl; 1635 qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl; 1636 qp_attr.rc_pd = hca->pd_hdl; 1637 qp_attr.rc_hca_port_num = port; 1638 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX; 1639 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX; 1640 qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE; 1641 qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE; 1642 qp_attr.rc_clone_chan = NULL; 1643 qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR; 1644 qp_attr.rc_flags = IBT_WR_SIGNALED; 1645 1646 rw_enter(&hca->state_lock, RW_READER); 1647 if (hca->state != HCA_DETACHED) { 1648 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl, 1649 IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl, 1650 &chan_sizes); 1651 } else { 1652 rw_exit(&hca->state_lock); 1653 goto fail; 1654 } 1655 rw_exit(&hca->state_lock); 1656 1657 if (ibt_status != IBT_SUCCESS) { 1658 DTRACE_PROBE1(rpcib__i_svccreatechanfail, 1659 int, ibt_status); 1660 goto fail; 1661 } 1662 1663 kqp->mode = RIB_SERVER; 1664 kqp->chan_flags = IBT_BLOCKING; 1665 kqp->q = q; /* server ONLY */ 1666 1667 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL); 1668 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL); 1669 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1670 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1671 cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL); 1672 mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock); 1673 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock); 1674 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock); 1675 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL); 1676 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock); 1677 /* 1678 * Set the private data area to qp to be used in callbacks 1679 */ 1680 ibt_set_chan_private(kqp->qp_hdl, (void *)kqp); 1681 kqp->rdmaconn.c_state = C_CONNECTED; 1682 1683 /* 1684 * Initialize the server credit control 1685 * portion of the rdmaconn struct. 
1686 */ 1687 kqp->rdmaconn.c_cc_type = RDMA_CC_SRV; 1688 cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc; 1689 cc_info->srv_cc_buffers_granted = preposted_rbufs; 1690 cc_info->srv_cc_cur_buffers_used = 0; 1691 cc_info->srv_cc_posted = preposted_rbufs; 1692 1693 *qp = kqp; 1694 1695 return (RDMA_SUCCESS); 1696 fail: 1697 if (kqp) 1698 kmem_free(kqp, sizeof (rib_qp_t)); 1699 1700 return (RDMA_FAILED); 1701 } 1702 1703 /* ARGSUSED */ 1704 ibt_cm_status_t 1705 rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event, 1706 ibt_cm_return_args_t *ret_args, void *priv_data, 1707 ibt_priv_data_len_t len) 1708 { 1709 rib_hca_t *hca; 1710 1711 hca = (rib_hca_t *)clnt_hdl; 1712 1713 switch (event->cm_type) { 1714 1715 /* got a connection close event */ 1716 case IBT_CM_EVENT_CONN_CLOSED: 1717 { 1718 CONN *conn; 1719 rib_qp_t *qp; 1720 1721 /* check reason why connection was closed */ 1722 switch (event->cm_event.closed) { 1723 case IBT_CM_CLOSED_DREP_RCVD: 1724 case IBT_CM_CLOSED_DREQ_TIMEOUT: 1725 case IBT_CM_CLOSED_DUP: 1726 case IBT_CM_CLOSED_ABORT: 1727 case IBT_CM_CLOSED_ALREADY: 1728 /* 1729 * These cases indicate the local end initiated 1730 * the closing of the channel. Nothing to do here. 1731 */ 1732 break; 1733 default: 1734 /* 1735 * Reason for CONN_CLOSED event must be one of 1736 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD 1737 * or IBT_CM_CLOSED_STALE. These indicate cases where 1738 * the remote end is closing the channel. In these 1739 * cases free the channel and transition to error 1740 * state. 1741 */ 1742 qp = ibt_get_chan_private(event->cm_channel); 1743 conn = qptoc(qp); 1744 mutex_enter(&conn->c_lock); 1745 if (conn->c_state == C_DISCONN_PEND) { 1746 mutex_exit(&conn->c_lock); 1747 break; 1748 } 1749 1750 conn->c_state = C_ERROR_CONN; 1751 1752 /* 1753 * Free the conn if c_ref is down to 0 already 1754 */ 1755 if (conn->c_ref == 0) { 1756 /* 1757 * Remove from list and free conn 1758 */ 1759 conn->c_state = C_DISCONN_PEND; 1760 mutex_exit(&conn->c_lock); 1761 rw_enter(&hca->state_lock, RW_READER); 1762 if (hca->state != HCA_DETACHED) 1763 (void) rib_disconnect_channel(conn, 1764 &hca->cl_conn_list); 1765 rw_exit(&hca->state_lock); 1766 } else { 1767 /* 1768 * conn will be freed when c_ref goes to 0. 1769 * Indicate to cleaning thread not to close 1770 * the connection, but just free the channel. 1771 */ 1772 conn->c_flags |= C_CLOSE_NOTNEEDED; 1773 mutex_exit(&conn->c_lock); 1774 } 1775 #ifdef DEBUG 1776 if (rib_debug) 1777 cmn_err(CE_NOTE, "rib_clnt_cm_handler: " 1778 "(CONN_CLOSED) channel disconnected"); 1779 #endif 1780 break; 1781 } 1782 break; 1783 } 1784 default: 1785 break; 1786 } 1787 return (IBT_CM_ACCEPT); 1788 } 1789 1790 /* 1791 * Connect to the server.
1792 */ 1793 rdma_stat 1794 rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, rpcib_ping_t *rptp) 1795 { 1796 ibt_chan_open_args_t chan_args; /* channel args */ 1797 ibt_chan_sizes_t chan_sizes; 1798 ibt_rc_chan_alloc_args_t qp_attr; 1799 ibt_status_t ibt_status; 1800 ibt_rc_returns_t ret_args; /* conn reject info */ 1801 int refresh = REFRESH_ATTEMPTS; /* refresh if IBT_CM_CONN_STALE */ 1802 ibt_ip_cm_info_t ipcm_info; 1803 uint8_t cmp_ip_pvt[IBT_IP_HDR_PRIV_DATA_SZ]; 1804 1805 1806 (void) bzero(&chan_args, sizeof (chan_args)); 1807 (void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t)); 1808 (void) bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t)); 1809 1810 ipcm_info.src_addr.family = rptp->srcip.family; 1811 switch (ipcm_info.src_addr.family) { 1812 case AF_INET: 1813 ipcm_info.src_addr.un.ip4addr = rptp->srcip.un.ip4addr; 1814 break; 1815 case AF_INET6: 1816 ipcm_info.src_addr.un.ip6addr = rptp->srcip.un.ip6addr; 1817 break; 1818 } 1819 1820 ipcm_info.dst_addr.family = rptp->srcip.family; 1821 switch (ipcm_info.dst_addr.family) { 1822 case AF_INET: 1823 ipcm_info.dst_addr.un.ip4addr = rptp->dstip.un.ip4addr; 1824 break; 1825 case AF_INET6: 1826 ipcm_info.dst_addr.un.ip6addr = rptp->dstip.un.ip6addr; 1827 break; 1828 } 1829 1830 ipcm_info.src_port = (in_port_t)nfs_rdma_port; 1831 1832 ibt_status = ibt_format_ip_private_data(&ipcm_info, 1833 IBT_IP_HDR_PRIV_DATA_SZ, cmp_ip_pvt); 1834 1835 if (ibt_status != IBT_SUCCESS) { 1836 cmn_err(CE_WARN, "ibt_format_ip_private_data failed\n"); 1837 return (-1); 1838 } 1839 1840 qp_attr.rc_hca_port_num = rptp->path.pi_prim_cep_path.cep_hca_port_num; 1841 /* Alloc a RC channel */ 1842 qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl; 1843 qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl; 1844 qp_attr.rc_pd = hca->pd_hdl; 1845 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX; 1846 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX; 1847 qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE; 1848 qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE; 1849 qp_attr.rc_clone_chan = NULL; 1850 qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR; 1851 qp_attr.rc_flags = IBT_WR_SIGNALED; 1852 1853 rptp->path.pi_sid = ibt_get_ip_sid(IPPROTO_TCP, nfs_rdma_port); 1854 chan_args.oc_path = &rptp->path; 1855 1856 chan_args.oc_cm_handler = rib_clnt_cm_handler; 1857 chan_args.oc_cm_clnt_private = (void *)hca; 1858 chan_args.oc_rdma_ra_out = 4; 1859 chan_args.oc_rdma_ra_in = 4; 1860 chan_args.oc_path_retry_cnt = 2; 1861 chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES; 1862 chan_args.oc_priv_data = cmp_ip_pvt; 1863 chan_args.oc_priv_data_len = IBT_IP_HDR_PRIV_DATA_SZ; 1864 1865 refresh: 1866 rw_enter(&hca->state_lock, RW_READER); 1867 if (hca->state != HCA_DETACHED) { 1868 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl, 1869 IBT_ACHAN_NO_FLAGS, 1870 &qp_attr, &qp->qp_hdl, 1871 &chan_sizes); 1872 } else { 1873 rw_exit(&hca->state_lock); 1874 return (RDMA_FAILED); 1875 } 1876 rw_exit(&hca->state_lock); 1877 1878 if (ibt_status != IBT_SUCCESS) { 1879 DTRACE_PROBE1(rpcib__i_conntosrv, 1880 int, ibt_status); 1881 return (RDMA_FAILED); 1882 } 1883 1884 /* Connect to the Server */ 1885 (void) bzero(&ret_args, sizeof (ret_args)); 1886 mutex_enter(&qp->cb_lock); 1887 ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS, 1888 IBT_BLOCKING, &chan_args, &ret_args); 1889 if (ibt_status != IBT_SUCCESS) { 1890 DTRACE_PROBE2(rpcib__i_openrctosrv, 1891 int, ibt_status, int, ret_args.rc_status); 1892 1893 (void) ibt_free_channel(qp->qp_hdl); 1894 qp->qp_hdl = NULL; 1895 mutex_exit(&qp->cb_lock); 1896 if (refresh-- && ibt_status == IBT_CM_FAILURE 
&& 1897 ret_args.rc_status == IBT_CM_CONN_STALE) { 1898 /* 1899 * Got IBT_CM_CONN_STALE probably because of stale 1900 * data on the passive end of a channel that existed 1901 * prior to reboot. Retry establishing a channel 1902 * REFRESH_ATTEMPTS times, during which time the 1903 * stale conditions on the server might clear up. 1904 */ 1905 goto refresh; 1906 } 1907 return (RDMA_FAILED); 1908 } 1909 mutex_exit(&qp->cb_lock); 1910 /* 1911 * Set the private data area to qp to be used in callbacks 1912 */ 1913 ibt_set_chan_private(qp->qp_hdl, (void *)qp); 1914 return (RDMA_SUCCESS); 1915 } 1916 1917 rdma_stat 1918 rib_ping_srv(int addr_type, struct netbuf *raddr, rpcib_ping_t *rptp) 1919 { 1920 uint_t i, addr_count; 1921 ibt_status_t ibt_status; 1922 uint8_t num_paths_p; 1923 ibt_ip_path_attr_t ipattr; 1924 ibt_path_ip_src_t srcip; 1925 rpcib_ipaddrs_t addrs4; 1926 rpcib_ipaddrs_t addrs6; 1927 struct sockaddr_in *sinp; 1928 struct sockaddr_in6 *sin6p; 1929 rdma_stat retval = RDMA_FAILED; 1930 rib_hca_t *hca; 1931 1932 if ((addr_type != AF_INET) && (addr_type != AF_INET6)) 1933 return (RDMA_INVAL); 1934 ASSERT(raddr->buf != NULL); 1935 1936 bzero(&ipattr, sizeof (ibt_ip_path_attr_t)); 1937 1938 if (!rpcib_get_ib_addresses(&addrs4, &addrs6) || 1939 (addrs4.ri_count == 0 && addrs6.ri_count == 0)) { 1940 retval = RDMA_FAILED; 1941 goto done2; 1942 } 1943 1944 if (addr_type == AF_INET) { 1945 addr_count = addrs4.ri_count; 1946 sinp = (struct sockaddr_in *)raddr->buf; 1947 rptp->dstip.family = AF_INET; 1948 rptp->dstip.un.ip4addr = sinp->sin_addr.s_addr; 1949 sinp = addrs4.ri_list; 1950 } else { 1951 addr_count = addrs6.ri_count; 1952 sin6p = (struct sockaddr_in6 *)raddr->buf; 1953 rptp->dstip.family = AF_INET6; 1954 rptp->dstip.un.ip6addr = sin6p->sin6_addr; 1955 sin6p = addrs6.ri_list; 1956 } 1957 1958 rw_enter(&rib_stat->hcas_list_lock, RW_READER); 1959 for (hca = rib_stat->hcas_list; hca; hca = hca->next) { 1960 rw_enter(&hca->state_lock, RW_READER); 1961 if (hca->state == HCA_DETACHED) { 1962 rw_exit(&hca->state_lock); 1963 continue; 1964 } 1965 1966 ipattr.ipa_dst_ip = &rptp->dstip; 1967 ipattr.ipa_hca_guid = hca->hca_guid; 1968 ipattr.ipa_ndst = 1; 1969 ipattr.ipa_max_paths = 1; 1970 ipattr.ipa_src_ip.family = rptp->dstip.family; 1971 for (i = 0; i < addr_count; i++) { 1972 num_paths_p = 0; 1973 if (addr_type == AF_INET) { 1974 ipattr.ipa_src_ip.un.ip4addr = 1975 sinp[i].sin_addr.s_addr; 1976 } else { 1977 ipattr.ipa_src_ip.un.ip6addr = 1978 sin6p[i].sin6_addr; 1979 } 1980 bzero(&srcip, sizeof (ibt_path_ip_src_t)); 1981 1982 ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl, 1983 IBT_PATH_NO_FLAGS, &ipattr, &rptp->path, 1984 &num_paths_p, &srcip); 1985 if (ibt_status == IBT_SUCCESS && 1986 num_paths_p != 0 && 1987 rptp->path.pi_hca_guid == hca->hca_guid) { 1988 rptp->hca = hca; 1989 rw_exit(&hca->state_lock); 1990 if (addr_type == AF_INET) { 1991 rptp->srcip.family = AF_INET; 1992 rptp->srcip.un.ip4addr = 1993 srcip.ip_primary.un.ip4addr; 1994 } else { 1995 rptp->srcip.family = AF_INET6; 1996 rptp->srcip.un.ip6addr = 1997 srcip.ip_primary.un.ip6addr; 1998 1999 } 2000 retval = RDMA_SUCCESS; 2001 goto done1; 2002 } 2003 } 2004 rw_exit(&hca->state_lock); 2005 } 2006 done1: 2007 rw_exit(&rib_stat->hcas_list_lock); 2008 done2: 2009 if (addrs4.ri_size > 0) 2010 kmem_free(addrs4.ri_list, addrs4.ri_size); 2011 if (addrs6.ri_size > 0) 2012 kmem_free(addrs6.ri_list, addrs6.ri_size); 2013 return (retval); 2014 } 2015 2016 /* 2017 * Close channel, remove from connection list and 2018 * free up 
resources allocated for that channel. 2019 */ 2020 rdma_stat 2021 rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list) 2022 { 2023 rib_qp_t *qp = ctoqp(conn); 2024 rib_hca_t *hca; 2025 2026 mutex_enter(&conn->c_lock); 2027 if (conn->c_timeout != NULL) { 2028 mutex_exit(&conn->c_lock); 2029 (void) untimeout(conn->c_timeout); 2030 mutex_enter(&conn->c_lock); 2031 } 2032 2033 while (conn->c_flags & C_CLOSE_PENDING) { 2034 cv_wait(&conn->c_cv, &conn->c_lock); 2035 } 2036 mutex_exit(&conn->c_lock); 2037 2038 /* 2039 * c_ref == 0 and connection is in C_DISCONN_PEND 2040 */ 2041 hca = qp->hca; 2042 if (conn_list != NULL) 2043 (void) rib_rm_conn(conn, conn_list); 2044 2045 /* 2046 * There is only one case where we get here with 2047 * qp_hdl = NULL, which is during connection setup on 2048 * the client. In such a case there are no posted 2049 * send/recv buffers. 2050 */ 2051 if (qp->qp_hdl != NULL) { 2052 mutex_enter(&qp->posted_rbufs_lock); 2053 while (qp->n_posted_rbufs) 2054 cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock); 2055 mutex_exit(&qp->posted_rbufs_lock); 2056 2057 mutex_enter(&qp->send_rbufs_lock); 2058 while (qp->n_send_rbufs) 2059 cv_wait(&qp->send_rbufs_cv, &qp->send_rbufs_lock); 2060 mutex_exit(&qp->send_rbufs_lock); 2061 2062 (void) ibt_free_channel(qp->qp_hdl); 2063 qp->qp_hdl = NULL; 2064 } 2065 2066 ASSERT(qp->rdlist == NULL); 2067 2068 if (qp->replylist != NULL) { 2069 (void) rib_rem_replylist(qp); 2070 } 2071 2072 cv_destroy(&qp->cb_conn_cv); 2073 cv_destroy(&qp->posted_rbufs_cv); 2074 cv_destroy(&qp->send_rbufs_cv); 2075 mutex_destroy(&qp->cb_lock); 2076 mutex_destroy(&qp->replylist_lock); 2077 mutex_destroy(&qp->posted_rbufs_lock); 2078 mutex_destroy(&qp->send_rbufs_lock); 2079 mutex_destroy(&qp->rdlist_lock); 2080 2081 cv_destroy(&conn->c_cv); 2082 mutex_destroy(&conn->c_lock); 2083 2084 if (conn->c_raddr.buf != NULL) { 2085 kmem_free(conn->c_raddr.buf, conn->c_raddr.len); 2086 } 2087 if (conn->c_laddr.buf != NULL) { 2088 kmem_free(conn->c_laddr.buf, conn->c_laddr.len); 2089 } 2090 if (conn->c_netid != NULL) { 2091 kmem_free(conn->c_netid, (strlen(conn->c_netid) + 1)); 2092 } 2093 if (conn->c_addrmask.buf != NULL) { 2094 kmem_free(conn->c_addrmask.buf, conn->c_addrmask.len); 2095 } 2096 2097 /* 2098 * Credit control cleanup. 2099 */ 2100 if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) { 2101 rdma_clnt_cred_ctrl_t *cc_info; 2102 cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc; 2103 cv_destroy(&cc_info->clnt_cc_cv); 2104 } 2105 2106 kmem_free(qp, sizeof (rib_qp_t)); 2107 2108 /* 2109 * If HCA has been DETACHED and the srv/clnt_conn_list is NULL, 2110 * then the hca is no longer being used. 2111 */ 2112 if (conn_list != NULL) { 2113 rw_enter(&hca->state_lock, RW_READER); 2114 if (hca->state == HCA_DETACHED) { 2115 rw_enter(&hca->srv_conn_list.conn_lock, RW_READER); 2116 if (hca->srv_conn_list.conn_hd == NULL) { 2117 rw_enter(&hca->cl_conn_list.conn_lock, 2118 RW_READER); 2119 2120 if (hca->cl_conn_list.conn_hd == NULL) { 2121 mutex_enter(&hca->inuse_lock); 2122 hca->inuse = FALSE; 2123 cv_signal(&hca->cb_cv); 2124 mutex_exit(&hca->inuse_lock); 2125 } 2126 rw_exit(&hca->cl_conn_list.conn_lock); 2127 } 2128 rw_exit(&hca->srv_conn_list.conn_lock); 2129 } 2130 rw_exit(&hca->state_lock); 2131 } 2132 2133 return (RDMA_SUCCESS); 2134 } 2135 2136 /* 2137 * All sends are done under the protection of 2138 * the wdesc->sendwait_lock. n_send_rbufs count 2139 * is protected using the send_rbufs_lock. 
2140 * lock ordering is: 2141 * sendwait_lock -> send_rbufs_lock 2142 */ 2143 2144 void 2145 rib_send_hold(rib_qp_t *qp) 2146 { 2147 mutex_enter(&qp->send_rbufs_lock); 2148 qp->n_send_rbufs++; 2149 mutex_exit(&qp->send_rbufs_lock); 2150 } 2151 2152 void 2153 rib_send_rele(rib_qp_t *qp) 2154 { 2155 mutex_enter(&qp->send_rbufs_lock); 2156 qp->n_send_rbufs--; 2157 if (qp->n_send_rbufs == 0) 2158 cv_signal(&qp->send_rbufs_cv); 2159 mutex_exit(&qp->send_rbufs_lock); 2160 } 2161 2162 void 2163 rib_recv_rele(rib_qp_t *qp) 2164 { 2165 mutex_enter(&qp->posted_rbufs_lock); 2166 qp->n_posted_rbufs--; 2167 if (qp->n_posted_rbufs == 0) 2168 cv_signal(&qp->posted_rbufs_cv); 2169 mutex_exit(&qp->posted_rbufs_lock); 2170 } 2171 2172 /* 2173 * Wait for send completion notification. Only on receiving a 2174 * notification be it a successful or error completion, free the 2175 * send_wid. 2176 */ 2177 static rdma_stat 2178 rib_sendwait(rib_qp_t *qp, struct send_wid *wd) 2179 { 2180 clock_t timout, cv_wait_ret; 2181 rdma_stat error = RDMA_SUCCESS; 2182 int i; 2183 2184 /* 2185 * Wait for send to complete 2186 */ 2187 ASSERT(wd != NULL); 2188 mutex_enter(&wd->sendwait_lock); 2189 if (wd->status == (uint_t)SEND_WAIT) { 2190 timout = drv_usectohz(SEND_WAIT_TIME * 1000000) + 2191 ddi_get_lbolt(); 2192 2193 if (qp->mode == RIB_SERVER) { 2194 while ((cv_wait_ret = cv_timedwait(&wd->wait_cv, 2195 &wd->sendwait_lock, timout)) > 0 && 2196 wd->status == (uint_t)SEND_WAIT) 2197 ; 2198 switch (cv_wait_ret) { 2199 case -1: /* timeout */ 2200 DTRACE_PROBE(rpcib__i__srvsendwait__timeout); 2201 2202 wd->cv_sig = 0; /* no signal needed */ 2203 error = RDMA_TIMEDOUT; 2204 break; 2205 default: /* got send completion */ 2206 break; 2207 } 2208 } else { 2209 while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv, 2210 &wd->sendwait_lock, timout)) > 0 && 2211 wd->status == (uint_t)SEND_WAIT) 2212 ; 2213 switch (cv_wait_ret) { 2214 case -1: /* timeout */ 2215 DTRACE_PROBE(rpcib__i__clntsendwait__timeout); 2216 2217 wd->cv_sig = 0; /* no signal needed */ 2218 error = RDMA_TIMEDOUT; 2219 break; 2220 case 0: /* interrupted */ 2221 DTRACE_PROBE(rpcib__i__clntsendwait__intr); 2222 2223 wd->cv_sig = 0; /* no signal needed */ 2224 error = RDMA_INTR; 2225 break; 2226 default: /* got send completion */ 2227 break; 2228 } 2229 } 2230 } 2231 2232 if (wd->status != (uint_t)SEND_WAIT) { 2233 /* got send completion */ 2234 if (wd->status != RDMA_SUCCESS) { 2235 switch (wd->status) { 2236 case RDMA_CONNLOST: 2237 error = RDMA_CONNLOST; 2238 break; 2239 default: 2240 error = RDMA_FAILED; 2241 break; 2242 } 2243 } 2244 for (i = 0; i < wd->nsbufs; i++) { 2245 rib_rbuf_free(qptoc(qp), SEND_BUFFER, 2246 (void *)(uintptr_t)wd->sbufaddr[i]); 2247 } 2248 2249 rib_send_rele(qp); 2250 2251 mutex_exit(&wd->sendwait_lock); 2252 (void) rib_free_sendwait(wd); 2253 2254 } else { 2255 mutex_exit(&wd->sendwait_lock); 2256 } 2257 return (error); 2258 } 2259 2260 static struct send_wid * 2261 rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp) 2262 { 2263 struct send_wid *wd; 2264 2265 wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP); 2266 wd->xid = xid; 2267 wd->cv_sig = cv_sig; 2268 wd->qp = qp; 2269 cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL); 2270 mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL); 2271 wd->status = (uint_t)SEND_WAIT; 2272 2273 return (wd); 2274 } 2275 2276 static int 2277 rib_free_sendwait(struct send_wid *wdesc) 2278 { 2279 cv_destroy(&wdesc->wait_cv); 2280 mutex_destroy(&wdesc->sendwait_lock); 2281 kmem_free(wdesc, sizeof 
(*wdesc)); 2282 2283 return (0); 2284 } 2285 2286 static rdma_stat 2287 rib_rem_rep(rib_qp_t *qp, struct reply *rep) 2288 { 2289 mutex_enter(&qp->replylist_lock); 2290 if (rep != NULL) { 2291 (void) rib_remreply(qp, rep); 2292 mutex_exit(&qp->replylist_lock); 2293 return (RDMA_SUCCESS); 2294 } 2295 mutex_exit(&qp->replylist_lock); 2296 return (RDMA_FAILED); 2297 } 2298 2299 /* 2300 * Send buffers are freed here only in case of error in posting 2301 * on QP. If the post succeeded, the send buffers are freed upon 2302 * send completion in rib_sendwait() or in the scq_handler. 2303 */ 2304 rdma_stat 2305 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid, 2306 int send_sig, int cv_sig, caddr_t *swid) 2307 { 2308 struct send_wid *wdesc; 2309 struct clist *clp; 2310 ibt_status_t ibt_status = IBT_SUCCESS; 2311 rdma_stat ret = RDMA_SUCCESS; 2312 ibt_send_wr_t tx_wr; 2313 int i, nds; 2314 ibt_wr_ds_t sgl[DSEG_MAX]; 2315 uint_t total_msg_size; 2316 rib_qp_t *qp; 2317 2318 qp = ctoqp(conn); 2319 2320 ASSERT(cl != NULL); 2321 2322 bzero(&tx_wr, sizeof (ibt_send_wr_t)); 2323 2324 nds = 0; 2325 total_msg_size = 0; 2326 clp = cl; 2327 while (clp != NULL) { 2328 if (nds >= DSEG_MAX) { 2329 DTRACE_PROBE(rpcib__i__sendandwait_dsegmax_exceeded); 2330 return (RDMA_FAILED); 2331 } 2332 sgl[nds].ds_va = clp->w.c_saddr; 2333 sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */ 2334 sgl[nds].ds_len = clp->c_len; 2335 total_msg_size += clp->c_len; 2336 clp = clp->c_next; 2337 nds++; 2338 } 2339 2340 if (send_sig) { 2341 /* Set SEND_SIGNAL flag. */ 2342 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2343 wdesc = rib_init_sendwait(msgid, cv_sig, qp); 2344 *swid = (caddr_t)wdesc; 2345 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2346 mutex_enter(&wdesc->sendwait_lock); 2347 wdesc->nsbufs = nds; 2348 for (i = 0; i < nds; i++) { 2349 wdesc->sbufaddr[i] = sgl[i].ds_va; 2350 } 2351 } else { 2352 tx_wr.wr_flags = IBT_WR_NO_FLAGS; 2353 *swid = NULL; 2354 tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID; 2355 } 2356 2357 tx_wr.wr_opcode = IBT_WRC_SEND; 2358 tx_wr.wr_trans = IBT_RC_SRV; 2359 tx_wr.wr_nds = nds; 2360 tx_wr.wr_sgl = sgl; 2361 2362 mutex_enter(&conn->c_lock); 2363 if (conn->c_state == C_CONNECTED) { 2364 ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL); 2365 } 2366 if (conn->c_state != C_CONNECTED || 2367 ibt_status != IBT_SUCCESS) { 2368 if (conn->c_state != C_DISCONN_PEND) 2369 conn->c_state = C_ERROR_CONN; 2370 mutex_exit(&conn->c_lock); 2371 if (send_sig) { 2372 for (i = 0; i < nds; i++) { 2373 rib_rbuf_free(conn, SEND_BUFFER, 2374 (void *)(uintptr_t)wdesc->sbufaddr[i]); 2375 } 2376 mutex_exit(&wdesc->sendwait_lock); 2377 (void) rib_free_sendwait(wdesc); 2378 } 2379 return (RDMA_CONNLOST); 2380 } 2381 2382 mutex_exit(&conn->c_lock); 2383 2384 if (send_sig) { 2385 rib_send_hold(qp); 2386 mutex_exit(&wdesc->sendwait_lock); 2387 if (cv_sig) { 2388 /* 2389 * cv_wait for send to complete. 2390 * We can fail due to a timeout or signal or 2391 * unsuccessful send. 2392 */ 2393 ret = rib_sendwait(qp, wdesc); 2394 2395 return (ret); 2396 } 2397 } 2398 2399 return (RDMA_SUCCESS); 2400 } 2401 2402 2403 rdma_stat 2404 rib_send(CONN *conn, struct clist *cl, uint32_t msgid) 2405 { 2406 rdma_stat ret; 2407 caddr_t wd; 2408 2409 /* send-wait & cv_signal */ 2410 ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd); 2411 return (ret); 2412 } 2413 2414 /* 2415 * Deprecated/obsolete interface not used currently 2416 * but earlier used for READ-READ protocol. 2417 * Send RPC reply and wait for RDMA_DONE. 
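 * The reply is posted as a signaled send with no cv wait; the caller then blocks on the rdma_done list for up to REPLY_WAIT_TIME seconds for the peer's RDMA_DONE and returns RDMA_TIMEDOUT if it never arrives.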
2418 */ 2419 rdma_stat 2420 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid) 2421 { 2422 rdma_stat ret = RDMA_SUCCESS; 2423 struct rdma_done_list *rd; 2424 clock_t cv_wait_ret; 2425 caddr_t *wid = NULL; 2426 rib_qp_t *qp = ctoqp(conn); 2427 2428 mutex_enter(&qp->rdlist_lock); 2429 rd = rdma_done_add(qp, msgid); 2430 2431 /* No cv_signal (whether send-wait or no-send-wait) */ 2432 ret = rib_send_and_wait(conn, cl, msgid, 1, 0, wid); 2433 2434 if (ret != RDMA_SUCCESS) { 2435 rdma_done_rm(qp, rd); 2436 } else { 2437 /* 2438 * Wait for RDMA_DONE from remote end 2439 */ 2440 cv_wait_ret = cv_reltimedwait(&rd->rdma_done_cv, 2441 &qp->rdlist_lock, drv_usectohz(REPLY_WAIT_TIME * 1000000), 2442 TR_CLOCK_TICK); 2443 2444 rdma_done_rm(qp, rd); 2445 2446 if (cv_wait_ret < 0) { 2447 ret = RDMA_TIMEDOUT; 2448 } 2449 } 2450 2451 mutex_exit(&qp->rdlist_lock); 2452 return (ret); 2453 } 2454 2455 static struct recv_wid * 2456 rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid) 2457 { 2458 struct recv_wid *rwid; 2459 2460 rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP); 2461 rwid->xid = msgid; 2462 rwid->addr = sgl->ds_va; 2463 rwid->qp = qp; 2464 2465 return (rwid); 2466 } 2467 2468 static void 2469 rib_free_wid(struct recv_wid *rwid) 2470 { 2471 kmem_free(rwid, sizeof (struct recv_wid)); 2472 } 2473 2474 rdma_stat 2475 rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid) 2476 { 2477 rib_qp_t *qp = ctoqp(conn); 2478 struct clist *clp = cl; 2479 struct reply *rep; 2480 struct recv_wid *rwid; 2481 int nds; 2482 ibt_wr_ds_t sgl[DSEG_MAX]; 2483 ibt_recv_wr_t recv_wr; 2484 rdma_stat ret; 2485 ibt_status_t ibt_status; 2486 2487 /* 2488 * rdma_clnt_postrecv uses RECV_BUFFER. 2489 */ 2490 2491 nds = 0; 2492 while (cl != NULL) { 2493 if (nds >= DSEG_MAX) { 2494 ret = RDMA_FAILED; 2495 goto done; 2496 } 2497 sgl[nds].ds_va = cl->w.c_saddr; 2498 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2499 sgl[nds].ds_len = cl->c_len; 2500 cl = cl->c_next; 2501 nds++; 2502 } 2503 2504 if (nds != 1) { 2505 ret = RDMA_FAILED; 2506 goto done; 2507 } 2508 2509 bzero(&recv_wr, sizeof (ibt_recv_wr_t)); 2510 recv_wr.wr_nds = nds; 2511 recv_wr.wr_sgl = sgl; 2512 2513 rwid = rib_create_wid(qp, &sgl[0], msgid); 2514 if (rwid) { 2515 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid; 2516 } else { 2517 ret = RDMA_NORESOURCE; 2518 goto done; 2519 } 2520 rep = rib_addreplylist(qp, msgid); 2521 if (!rep) { 2522 rib_free_wid(rwid); 2523 ret = RDMA_NORESOURCE; 2524 goto done; 2525 } 2526 2527 mutex_enter(&conn->c_lock); 2528 2529 if (conn->c_state == C_CONNECTED) { 2530 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL); 2531 } 2532 2533 if (conn->c_state != C_CONNECTED || 2534 ibt_status != IBT_SUCCESS) { 2535 if (conn->c_state != C_DISCONN_PEND) 2536 conn->c_state = C_ERROR_CONN; 2537 mutex_exit(&conn->c_lock); 2538 rib_free_wid(rwid); 2539 (void) rib_rem_rep(qp, rep); 2540 ret = RDMA_CONNLOST; 2541 goto done; 2542 } 2543 2544 mutex_enter(&qp->posted_rbufs_lock); 2545 qp->n_posted_rbufs++; 2546 mutex_exit(&qp->posted_rbufs_lock); 2547 2548 mutex_exit(&conn->c_lock); 2549 return (RDMA_SUCCESS); 2550 2551 done: 2552 while (clp != NULL) { 2553 rib_rbuf_free(conn, RECV_BUFFER, 2554 (void *)(uintptr_t)clp->w.c_saddr3); 2555 clp = clp->c_next; 2556 } 2557 return (ret); 2558 } 2559 2560 rdma_stat 2561 rib_svc_post(CONN* conn, struct clist *cl) 2562 { 2563 rib_qp_t *qp = ctoqp(conn); 2564 struct svc_recv *s_recvp; 2565 int nds; 2566 ibt_wr_ds_t sgl[DSEG_MAX]; 2567 ibt_recv_wr_t recv_wr; 2568 ibt_status_t 
ibt_status; 2569 2570 nds = 0; 2571 while (cl != NULL) { 2572 if (nds >= DSEG_MAX) { 2573 return (RDMA_FAILED); 2574 } 2575 sgl[nds].ds_va = cl->w.c_saddr; 2576 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2577 sgl[nds].ds_len = cl->c_len; 2578 cl = cl->c_next; 2579 nds++; 2580 } 2581 2582 if (nds != 1) { 2583 rib_rbuf_free(conn, RECV_BUFFER, 2584 (caddr_t)(uintptr_t)sgl[0].ds_va); 2585 2586 return (RDMA_FAILED); 2587 } 2588 2589 bzero(&recv_wr, sizeof (ibt_recv_wr_t)); 2590 recv_wr.wr_nds = nds; 2591 recv_wr.wr_sgl = sgl; 2592 2593 s_recvp = rib_init_svc_recv(qp, &sgl[0]); 2594 /* Use s_recvp's addr as wr id */ 2595 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp; 2596 mutex_enter(&conn->c_lock); 2597 if (conn->c_state == C_CONNECTED) { 2598 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL); 2599 } 2600 if (conn->c_state != C_CONNECTED || 2601 ibt_status != IBT_SUCCESS) { 2602 if (conn->c_state != C_DISCONN_PEND) 2603 conn->c_state = C_ERROR_CONN; 2604 mutex_exit(&conn->c_lock); 2605 rib_rbuf_free(conn, RECV_BUFFER, 2606 (caddr_t)(uintptr_t)sgl[0].ds_va); 2607 (void) rib_free_svc_recv(s_recvp); 2608 2609 return (RDMA_CONNLOST); 2610 } 2611 mutex_exit(&conn->c_lock); 2612 2613 return (RDMA_SUCCESS); 2614 } 2615 2616 /* Client */ 2617 rdma_stat 2618 rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid) 2619 { 2620 return (rib_clnt_post(conn, cl, msgid)); 2621 } 2622 2623 /* Client */ 2624 rdma_stat 2625 rib_post_resp_remove(CONN* conn, uint32_t msgid) 2626 { 2627 rib_qp_t *qp = ctoqp(conn); 2628 struct reply *rep; 2629 2630 mutex_enter(&qp->replylist_lock); 2631 for (rep = qp->replylist; rep != NULL; rep = rep->next) { 2632 if (rep->xid == msgid) { 2633 if (rep->vaddr_cq) { 2634 rib_rbuf_free(conn, RECV_BUFFER, 2635 (caddr_t)(uintptr_t)rep->vaddr_cq); 2636 } 2637 (void) rib_remreply(qp, rep); 2638 break; 2639 } 2640 } 2641 mutex_exit(&qp->replylist_lock); 2642 2643 return (RDMA_SUCCESS); 2644 } 2645 2646 /* Server */ 2647 rdma_stat 2648 rib_post_recv(CONN *conn, struct clist *cl) 2649 { 2650 rib_qp_t *qp = ctoqp(conn); 2651 2652 if (rib_svc_post(conn, cl) == RDMA_SUCCESS) { 2653 mutex_enter(&qp->posted_rbufs_lock); 2654 qp->n_posted_rbufs++; 2655 mutex_exit(&qp->posted_rbufs_lock); 2656 return (RDMA_SUCCESS); 2657 } 2658 return (RDMA_FAILED); 2659 } 2660 2661 /* 2662 * Client side only interface to "recv" the rpc reply buf 2663 * posted earlier by rib_post_resp(conn, cl, msgid). 2664 */ 2665 rdma_stat 2666 rib_recv(CONN *conn, struct clist **clp, uint32_t msgid) 2667 { 2668 struct reply *rep = NULL; 2669 clock_t timout, cv_wait_ret; 2670 rdma_stat ret = RDMA_SUCCESS; 2671 rib_qp_t *qp = ctoqp(conn); 2672 2673 /* 2674 * Find the reply structure for this msgid 2675 */ 2676 mutex_enter(&qp->replylist_lock); 2677 2678 for (rep = qp->replylist; rep != NULL; rep = rep->next) { 2679 if (rep->xid == msgid) 2680 break; 2681 } 2682 2683 if (rep != NULL) { 2684 /* 2685 * If message not yet received, wait. 
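 * The wait is bounded by REPLY_WAIT_TIME seconds and may be interrupted by a signal; a timeout yields RDMA_TIMEDOUT and an interrupt yields RDMA_INTR.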
2686 */ 2687 if (rep->status == (uint_t)REPLY_WAIT) { 2688 timout = ddi_get_lbolt() + 2689 drv_usectohz(REPLY_WAIT_TIME * 1000000); 2690 2691 while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv, 2692 &qp->replylist_lock, timout)) > 0 && 2693 rep->status == (uint_t)REPLY_WAIT) 2694 ; 2695 2696 switch (cv_wait_ret) { 2697 case -1: /* timeout */ 2698 ret = RDMA_TIMEDOUT; 2699 break; 2700 case 0: 2701 ret = RDMA_INTR; 2702 break; 2703 default: 2704 break; 2705 } 2706 } 2707 2708 if (rep->status == RDMA_SUCCESS) { 2709 struct clist *cl = NULL; 2710 2711 /* 2712 * Got message successfully 2713 */ 2714 clist_add(&cl, 0, rep->bytes_xfer, NULL, 2715 (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL); 2716 *clp = cl; 2717 } else { 2718 if (rep->status != (uint_t)REPLY_WAIT) { 2719 /* 2720 * Got error in reply message. Free 2721 * recv buffer here. 2722 */ 2723 ret = rep->status; 2724 rib_rbuf_free(conn, RECV_BUFFER, 2725 (caddr_t)(uintptr_t)rep->vaddr_cq); 2726 } 2727 } 2728 (void) rib_remreply(qp, rep); 2729 } else { 2730 /* 2731 * No matching reply structure found for given msgid on the 2732 * reply wait list. 2733 */ 2734 ret = RDMA_INVAL; 2735 DTRACE_PROBE(rpcib__i__nomatchxid2); 2736 } 2737 2738 /* 2739 * Done. 2740 */ 2741 mutex_exit(&qp->replylist_lock); 2742 return (ret); 2743 } 2744 2745 /* 2746 * RDMA write a buffer to the remote address. 2747 */ 2748 rdma_stat 2749 rib_write(CONN *conn, struct clist *cl, int wait) 2750 { 2751 ibt_send_wr_t tx_wr; 2752 int cv_sig; 2753 ibt_wr_ds_t sgl[DSEG_MAX]; 2754 struct send_wid *wdesc; 2755 ibt_status_t ibt_status; 2756 rdma_stat ret = RDMA_SUCCESS; 2757 rib_qp_t *qp = ctoqp(conn); 2758 uint64_t n_writes = 0; 2759 2760 if (cl == NULL) { 2761 return (RDMA_FAILED); 2762 } 2763 2764 while ((cl != NULL)) { 2765 if (cl->c_len > 0) { 2766 bzero(&tx_wr, sizeof (ibt_send_wr_t)); 2767 tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->u.c_daddr; 2768 tx_wr.wr.rc.rcwr.rdma.rdma_rkey = 2769 cl->c_dmemhandle.mrc_rmr; /* rkey */ 2770 sgl[0].ds_va = cl->w.c_saddr; 2771 sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */ 2772 sgl[0].ds_len = cl->c_len; 2773 2774 if (wait) { 2775 cv_sig = 1; 2776 } else { 2777 if (n_writes > max_unsignaled_rws) { 2778 n_writes = 0; 2779 cv_sig = 1; 2780 } else { 2781 cv_sig = 0; 2782 } 2783 } 2784 2785 if (cv_sig) { 2786 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2787 wdesc = rib_init_sendwait(0, cv_sig, qp); 2788 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2789 mutex_enter(&wdesc->sendwait_lock); 2790 } else { 2791 tx_wr.wr_flags = IBT_WR_NO_FLAGS; 2792 tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID; 2793 } 2794 tx_wr.wr_opcode = IBT_WRC_RDMAW; 2795 tx_wr.wr_trans = IBT_RC_SRV; 2796 tx_wr.wr_nds = 1; 2797 tx_wr.wr_sgl = sgl; 2798 2799 mutex_enter(&conn->c_lock); 2800 if (conn->c_state == C_CONNECTED) { 2801 ibt_status = 2802 ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL); 2803 } 2804 if (conn->c_state != C_CONNECTED || 2805 ibt_status != IBT_SUCCESS) { 2806 if (conn->c_state != C_DISCONN_PEND) 2807 conn->c_state = C_ERROR_CONN; 2808 mutex_exit(&conn->c_lock); 2809 if (cv_sig) { 2810 mutex_exit(&wdesc->sendwait_lock); 2811 (void) rib_free_sendwait(wdesc); 2812 } 2813 return (RDMA_CONNLOST); 2814 } 2815 2816 mutex_exit(&conn->c_lock); 2817 2818 /* 2819 * Wait for send to complete 2820 */ 2821 if (cv_sig) { 2822 2823 rib_send_hold(qp); 2824 mutex_exit(&wdesc->sendwait_lock); 2825 2826 ret = rib_sendwait(qp, wdesc); 2827 if (ret != 0) 2828 return (ret); 2829 } 2830 n_writes ++; 2831 } 2832 cl = cl->c_next; 2833 } 2834 return (RDMA_SUCCESS); 2835 } 2836 2837 /* 2838 
* RDMA Read a buffer from the remote address. 2839 */ 2840 rdma_stat 2841 rib_read(CONN *conn, struct clist *cl, int wait) 2842 { 2843 ibt_send_wr_t rx_wr; 2844 int cv_sig = 0; 2845 ibt_wr_ds_t sgl; 2846 struct send_wid *wdesc; 2847 ibt_status_t ibt_status = IBT_SUCCESS; 2848 rdma_stat ret = RDMA_SUCCESS; 2849 rib_qp_t *qp = ctoqp(conn); 2850 2851 if (cl == NULL) { 2852 return (RDMA_FAILED); 2853 } 2854 2855 while (cl != NULL) { 2856 bzero(&rx_wr, sizeof (ibt_send_wr_t)); 2857 /* 2858 * Remote address is at the head chunk item in list. 2859 */ 2860 rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->w.c_saddr; 2861 rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr; 2862 2863 sgl.ds_va = cl->u.c_daddr; 2864 sgl.ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */ 2865 sgl.ds_len = cl->c_len; 2866 2867 /* 2868 * If there are multiple chunks to be read, and 2869 * wait is set, ask for signal only for the last chunk 2870 * and wait only on the last chunk. The completion of 2871 * RDMA_READ on last chunk ensures that reads on all 2872 * previous chunks are also completed. 2873 */ 2874 if (wait && (cl->c_next == NULL)) { 2875 cv_sig = 1; 2876 wdesc = rib_init_sendwait(0, cv_sig, qp); 2877 rx_wr.wr_flags = IBT_WR_SEND_SIGNAL; 2878 rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc; 2879 mutex_enter(&wdesc->sendwait_lock); 2880 } else { 2881 rx_wr.wr_flags = IBT_WR_NO_FLAGS; 2882 rx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID; 2883 } 2884 rx_wr.wr_opcode = IBT_WRC_RDMAR; 2885 rx_wr.wr_trans = IBT_RC_SRV; 2886 rx_wr.wr_nds = 1; 2887 rx_wr.wr_sgl = &sgl; 2888 2889 mutex_enter(&conn->c_lock); 2890 if (conn->c_state == C_CONNECTED) { 2891 ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL); 2892 } 2893 if (conn->c_state != C_CONNECTED || 2894 ibt_status != IBT_SUCCESS) { 2895 if (conn->c_state != C_DISCONN_PEND) 2896 conn->c_state = C_ERROR_CONN; 2897 mutex_exit(&conn->c_lock); 2898 if (wait && (cl->c_next == NULL)) { 2899 mutex_exit(&wdesc->sendwait_lock); 2900 (void) rib_free_sendwait(wdesc); 2901 } 2902 return (RDMA_CONNLOST); 2903 } 2904 2905 mutex_exit(&conn->c_lock); 2906 2907 /* 2908 * Wait for send to complete if this is the 2909 * last item in the list. 2910 */ 2911 if (wait && cl->c_next == NULL) { 2912 rib_send_hold(qp); 2913 mutex_exit(&wdesc->sendwait_lock); 2914 2915 ret = rib_sendwait(qp, wdesc); 2916 2917 if (ret != 0) 2918 return (ret); 2919 } 2920 cl = cl->c_next; 2921 } 2922 return (RDMA_SUCCESS); 2923 } 2924 2925 /* 2926 * rib_srv_cm_handler() 2927 * Connection Manager callback to handle RC connection requests. 2928 */ 2929 /* ARGSUSED */ 2930 static ibt_cm_status_t 2931 rib_srv_cm_handler(void *any, ibt_cm_event_t *event, 2932 ibt_cm_return_args_t *ret_args, void *priv_data, 2933 ibt_priv_data_len_t len) 2934 { 2935 queue_t *q; 2936 rib_qp_t *qp; 2937 rib_hca_t *hca; 2938 rdma_stat status = RDMA_SUCCESS; 2939 int i; 2940 struct clist cl; 2941 rdma_buf_t rdbuf = {0}; 2942 void *buf = NULL; 2943 CONN *conn; 2944 ibt_ip_cm_info_t ipinfo; 2945 struct sockaddr_in *s; 2946 struct sockaddr_in6 *s6; 2947 int sin_size = sizeof (struct sockaddr_in); 2948 int in_size = sizeof (struct in_addr); 2949 int sin6_size = sizeof (struct sockaddr_in6); 2950 2951 ASSERT(any != NULL); 2952 ASSERT(event != NULL); 2953 2954 hca = (rib_hca_t *)any; 2955 2956 /* got a connection request */ 2957 switch (event->cm_type) { 2958 case IBT_CM_EVENT_REQ_RCV: 2959 /* 2960 * If the plugin is in the NO_ACCEPT state, bail out. 
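 * NO_ACCEPT is set by rib_listen_stop() when the listeners are being torn down, so new connection requests are rejected.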
2961 */ 2962 mutex_enter(&plugin_state_lock); 2963 if (plugin_state == NO_ACCEPT) { 2964 mutex_exit(&plugin_state_lock); 2965 return (IBT_CM_REJECT); 2966 } 2967 mutex_exit(&plugin_state_lock); 2968 2969 /* 2970 * Need to send a MRA MAD to CM so that it does not 2971 * timeout on us. 2972 */ 2973 (void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id, 2974 event->cm_event.req.req_timeout * 8, NULL, 0); 2975 2976 mutex_enter(&rib_stat->open_hca_lock); 2977 q = rib_stat->q; 2978 mutex_exit(&rib_stat->open_hca_lock); 2979 2980 status = rib_svc_create_chan(hca, (caddr_t)q, 2981 event->cm_event.req.req_prim_hca_port, &qp); 2982 2983 if (status) { 2984 return (IBT_CM_REJECT); 2985 } 2986 2987 ret_args->cm_ret.rep.cm_channel = qp->qp_hdl; 2988 ret_args->cm_ret.rep.cm_rdma_ra_out = 4; 2989 ret_args->cm_ret.rep.cm_rdma_ra_in = 4; 2990 ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES; 2991 2992 /* 2993 * Pre-posts RECV buffers 2994 */ 2995 conn = qptoc(qp); 2996 for (i = 0; i < preposted_rbufs; i++) { 2997 bzero(&rdbuf, sizeof (rdbuf)); 2998 rdbuf.type = RECV_BUFFER; 2999 buf = rib_rbuf_alloc(conn, &rdbuf); 3000 if (buf == NULL) { 3001 /* 3002 * A connection is not established yet. 3003 * Just flush the channel. Buffers 3004 * posted till now will error out with 3005 * IBT_WC_WR_FLUSHED_ERR. 3006 */ 3007 (void) ibt_flush_channel(qp->qp_hdl); 3008 (void) rib_disconnect_channel(conn, NULL); 3009 return (IBT_CM_REJECT); 3010 } 3011 3012 bzero(&cl, sizeof (cl)); 3013 cl.w.c_saddr3 = (caddr_t)rdbuf.addr; 3014 cl.c_len = rdbuf.len; 3015 cl.c_smemhandle.mrc_lmr = 3016 rdbuf.handle.mrc_lmr; /* lkey */ 3017 cl.c_next = NULL; 3018 status = rib_post_recv(conn, &cl); 3019 if (status != RDMA_SUCCESS) { 3020 /* 3021 * A connection is not established yet. 3022 * Just flush the channel. Buffers 3023 * posted till now will error out with 3024 * IBT_WC_WR_FLUSHED_ERR. 
3025 */ 3026 (void) ibt_flush_channel(qp->qp_hdl); 3027 (void) rib_disconnect_channel(conn, NULL); 3028 return (IBT_CM_REJECT); 3029 } 3030 } 3031 (void) rib_add_connlist(conn, &hca->srv_conn_list); 3032 3033 /* 3034 * Get the address translation 3035 */ 3036 rw_enter(&hca->state_lock, RW_READER); 3037 if (hca->state == HCA_DETACHED) { 3038 rw_exit(&hca->state_lock); 3039 return (IBT_CM_REJECT); 3040 } 3041 rw_exit(&hca->state_lock); 3042 3043 bzero(&ipinfo, sizeof (ibt_ip_cm_info_t)); 3044 3045 if (ibt_get_ip_data(event->cm_priv_data_len, 3046 event->cm_priv_data, 3047 &ipinfo) != IBT_SUCCESS) { 3048 3049 return (IBT_CM_REJECT); 3050 } 3051 3052 switch (ipinfo.src_addr.family) { 3053 case AF_INET: 3054 3055 conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1, 3056 KM_SLEEP); 3057 (void) strcpy(conn->c_netid, RIBNETID_TCP); 3058 3059 conn->c_raddr.maxlen = 3060 conn->c_raddr.len = sin_size; 3061 conn->c_raddr.buf = kmem_zalloc(sin_size, KM_SLEEP); 3062 3063 s = (struct sockaddr_in *)conn->c_raddr.buf; 3064 s->sin_family = AF_INET; 3065 bcopy((void *)&ipinfo.src_addr.un.ip4addr, 3066 &s->sin_addr, in_size); 3067 3068 conn->c_laddr.maxlen = 3069 conn->c_laddr.len = sin_size; 3070 conn->c_laddr.buf = kmem_zalloc(sin_size, KM_SLEEP); 3071 3072 s = (struct sockaddr_in *)conn->c_laddr.buf; 3073 s->sin_family = AF_INET; 3074 bcopy((void *)&ipinfo.dst_addr.un.ip4addr, 3075 &s->sin_addr, in_size); 3076 3077 conn->c_addrmask.maxlen = conn->c_addrmask.len = 3078 sizeof (struct sockaddr_in); 3079 conn->c_addrmask.buf = 3080 kmem_zalloc(conn->c_addrmask.len, KM_SLEEP); 3081 ((struct sockaddr_in *) 3082 conn->c_addrmask.buf)->sin_addr.s_addr = 3083 (uint32_t)~0; 3084 ((struct sockaddr_in *) 3085 conn->c_addrmask.buf)->sin_family = 3086 (sa_family_t)~0; 3087 break; 3088 3089 case AF_INET6: 3090 3091 conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1, 3092 KM_SLEEP); 3093 (void) strcpy(conn->c_netid, RIBNETID_TCP6); 3094 3095 conn->c_raddr.maxlen = 3096 conn->c_raddr.len = sin6_size; 3097 conn->c_raddr.buf = kmem_zalloc(sin6_size, KM_SLEEP); 3098 3099 s6 = (struct sockaddr_in6 *)conn->c_raddr.buf; 3100 s6->sin6_family = AF_INET6; 3101 bcopy((void *)&ipinfo.src_addr.un.ip6addr, 3102 &s6->sin6_addr, 3103 sizeof (struct in6_addr)); 3104 3105 conn->c_laddr.maxlen = 3106 conn->c_laddr.len = sin6_size; 3107 conn->c_laddr.buf = kmem_zalloc(sin6_size, KM_SLEEP); 3108 3109 s6 = (struct sockaddr_in6 *)conn->c_laddr.buf; 3110 s6->sin6_family = AF_INET6; 3111 bcopy((void *)&ipinfo.dst_addr.un.ip6addr, 3112 &s6->sin6_addr, 3113 sizeof (struct in6_addr)); 3114 3115 conn->c_addrmask.maxlen = conn->c_addrmask.len = 3116 sizeof (struct sockaddr_in6); 3117 conn->c_addrmask.buf = 3118 kmem_zalloc(conn->c_addrmask.len, KM_SLEEP); 3119 (void) memset(&((struct sockaddr_in6 *) 3120 conn->c_addrmask.buf)->sin6_addr, (uchar_t)~0, 3121 sizeof (struct in6_addr)); 3122 ((struct sockaddr_in6 *) 3123 conn->c_addrmask.buf)->sin6_family = 3124 (sa_family_t)~0; 3125 break; 3126 3127 default: 3128 return (IBT_CM_REJECT); 3129 } 3130 3131 break; 3132 3133 case IBT_CM_EVENT_CONN_CLOSED: 3134 { 3135 CONN *conn; 3136 rib_qp_t *qp; 3137 3138 switch (event->cm_event.closed) { 3139 case IBT_CM_CLOSED_DREP_RCVD: 3140 case IBT_CM_CLOSED_DREQ_TIMEOUT: 3141 case IBT_CM_CLOSED_DUP: 3142 case IBT_CM_CLOSED_ABORT: 3143 case IBT_CM_CLOSED_ALREADY: 3144 /* 3145 * These cases indicate the local end initiated 3146 * the closing of the channel. Nothing to do here. 
3147 */ 3148 break; 3149 default: 3150 /* 3151 * Reason for CONN_CLOSED event must be one of 3152 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD 3153 * or IBT_CM_CLOSED_STALE. These indicate cases where 3154 * the remote end is closing the channel. In these 3155 * cases free the channel and transition to error 3156 * state. 3157 */ 3158 qp = ibt_get_chan_private(event->cm_channel); 3159 conn = qptoc(qp); 3160 mutex_enter(&conn->c_lock); 3161 if (conn->c_state == C_DISCONN_PEND) { 3162 mutex_exit(&conn->c_lock); 3163 break; 3164 } 3165 conn->c_state = C_ERROR_CONN; 3166 3167 /* 3168 * Free the conn if c_ref goes down to 0 3169 */ 3170 if (conn->c_ref == 0) { 3171 /* 3172 * Remove from list and free conn 3173 */ 3174 conn->c_state = C_DISCONN_PEND; 3175 mutex_exit(&conn->c_lock); 3176 (void) rib_disconnect_channel(conn, 3177 &hca->srv_conn_list); 3178 } else { 3179 /* 3180 * conn will be freed when c_ref goes to 0. 3181 * Indicate to cleaning thread not to close 3182 * the connection, but just free the channel. 3183 */ 3184 conn->c_flags |= C_CLOSE_NOTNEEDED; 3185 mutex_exit(&conn->c_lock); 3186 } 3187 DTRACE_PROBE(rpcib__i__srvcm_chandisconnect); 3188 break; 3189 } 3190 break; 3191 } 3192 case IBT_CM_EVENT_CONN_EST: 3193 /* 3194 * RTU received, hence connection established. 3195 */ 3196 if (rib_debug > 1) 3197 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3198 "(CONN_EST) channel established"); 3199 break; 3200 3201 default: 3202 if (rib_debug > 2) { 3203 /* Let CM handle the following events. */ 3204 if (event->cm_type == IBT_CM_EVENT_REP_RCV) { 3205 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3206 "server recv'ed IBT_CM_EVENT_REP_RCV\n"); 3207 } else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) { 3208 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3209 "server recv'ed IBT_CM_EVENT_LAP_RCV\n"); 3210 } else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) { 3211 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3212 "server recv'ed IBT_CM_EVENT_MRA_RCV\n"); 3213 } else if (event->cm_type == IBT_CM_EVENT_APR_RCV) { 3214 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3215 "server recv'ed IBT_CM_EVENT_APR_RCV\n"); 3216 } else if (event->cm_type == IBT_CM_EVENT_FAILURE) { 3217 cmn_err(CE_NOTE, "rib_srv_cm_handler: " 3218 "server recv'ed IBT_CM_EVENT_FAILURE\n"); 3219 } 3220 } 3221 return (IBT_CM_DEFAULT); 3222 3223 /* accept all other CM messages (i.e.
let the CM handle them) */ 3225 return (IBT_CM_ACCEPT); 3226 } 3227 3228 static rdma_stat 3229 rib_register_service(rib_hca_t *hca, int service_type, 3230 uint8_t protocol_num, in_port_t dst_port) 3231 { 3232 ibt_srv_desc_t sdesc; 3233 ibt_hca_portinfo_t *port_infop; 3234 ib_svc_id_t srv_id; 3235 ibt_srv_hdl_t srv_hdl; 3236 uint_t port_size; 3237 uint_t pki, i, num_ports, nbinds; 3238 ibt_status_t ibt_status; 3239 rib_service_t *service; 3240 ib_pkey_t pkey; 3241 3242 /* 3243 * Query all ports for the given HCA 3244 */ 3245 rw_enter(&hca->state_lock, RW_READER); 3246 if (hca->state != HCA_DETACHED) { 3247 ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop, 3248 &num_ports, &port_size); 3249 rw_exit(&hca->state_lock); 3250 } else { 3251 rw_exit(&hca->state_lock); 3252 return (RDMA_FAILED); 3253 } 3254 if (ibt_status != IBT_SUCCESS) { 3255 return (RDMA_FAILED); 3256 } 3257 3258 DTRACE_PROBE1(rpcib__i__regservice_numports, 3259 int, num_ports); 3260 3261 for (i = 0; i < num_ports; i++) { 3262 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) { 3263 DTRACE_PROBE1(rpcib__i__regservice__portinactive, 3264 int, i+1); 3265 } else if (port_infop[i].p_linkstate == IBT_PORT_ACTIVE) { 3266 DTRACE_PROBE1(rpcib__i__regservice__portactive, 3267 int, i+1); 3268 } 3269 } 3270 3271 /* 3272 * Get all the IP addresses on this system to register the 3273 * given "service type" on all DNS recognized IP addrs. 3274 * Each service type such as NFS will have all the system's 3275 * IP addresses as its different names. For now the only 3276 * type of service we support in RPCIB is NFS. 3277 */ 3278 rw_enter(&rib_stat->service_list_lock, RW_WRITER); 3279 /* 3280 * Start registering and binding the service to 3281 * active ports on this HCA. 3282 */ 3283 nbinds = 0; 3284 for (service = rib_stat->service_list; 3285 service && (service->srv_type != service_type); 3286 service = service->next) 3287 ; 3288 3289 if (service == NULL) { 3290 /* 3291 * We use IP addresses as the service names for 3292 * service registration. Register each of them 3293 * with CM to obtain a svc_id and svc_hdl. We do not 3294 * register the service with the machine's loopback address.
3295 */ 3296 (void) bzero(&srv_id, sizeof (ib_svc_id_t)); 3297 (void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t)); 3298 (void) bzero(&sdesc, sizeof (ibt_srv_desc_t)); 3299 sdesc.sd_handler = rib_srv_cm_handler; 3300 sdesc.sd_flags = 0; 3301 ibt_status = ibt_register_service(hca->ibt_clnt_hdl, 3302 &sdesc, ibt_get_ip_sid(protocol_num, dst_port), 3303 1, &srv_hdl, &srv_id); 3304 if ((ibt_status != IBT_SUCCESS) && 3305 (ibt_status != IBT_CM_SERVICE_EXISTS)) { 3306 rw_exit(&rib_stat->service_list_lock); 3307 DTRACE_PROBE1(rpcib__i__regservice__ibtres, 3308 int, ibt_status); 3309 ibt_free_portinfo(port_infop, port_size); 3310 return (RDMA_FAILED); 3311 } 3312 3313 /* 3314 * Allocate and prepare a service entry 3315 */ 3316 service = kmem_zalloc(sizeof (rib_service_t), KM_SLEEP); 3317 3318 service->srv_type = service_type; 3319 service->srv_hdl = srv_hdl; 3320 service->srv_id = srv_id; 3321 3322 service->next = rib_stat->service_list; 3323 rib_stat->service_list = service; 3324 DTRACE_PROBE1(rpcib__i__regservice__new__service, 3325 int, service->srv_type); 3326 } else { 3327 srv_hdl = service->srv_hdl; 3328 srv_id = service->srv_id; 3329 DTRACE_PROBE1(rpcib__i__regservice__existing__service, 3330 int, service->srv_type); 3331 } 3332 3333 for (i = 0; i < num_ports; i++) { 3334 ibt_sbind_hdl_t sbp; 3335 rib_hca_service_t *hca_srv; 3336 ib_gid_t gid; 3337 3338 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) 3339 continue; 3340 3341 for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) { 3342 pkey = port_infop[i].p_pkey_tbl[pki]; 3343 3344 rw_enter(&hca->bound_services_lock, RW_READER); 3345 gid = port_infop[i].p_sgid_tbl[0]; 3346 for (hca_srv = hca->bound_services; hca_srv; 3347 hca_srv = hca_srv->next) { 3348 if ((hca_srv->srv_id == service->srv_id) && 3349 (hca_srv->gid.gid_prefix == 3350 gid.gid_prefix) && 3351 (hca_srv->gid.gid_guid == gid.gid_guid)) 3352 break; 3353 } 3354 rw_exit(&hca->bound_services_lock); 3355 if (hca_srv != NULL) { 3356 /* 3357 * port is already bound to the service 3358 */ 3359 DTRACE_PROBE1( 3360 rpcib__i__regservice__already__bound, 3361 int, i+1); 3362 nbinds++; 3363 continue; 3364 } 3365 3366 if ((pkey & IBSRM_HB) && 3367 (pkey != IB_PKEY_INVALID_FULL)) { 3368 3369 sbp = NULL; 3370 ibt_status = ibt_bind_service(srv_hdl, 3371 gid, NULL, hca, &sbp); 3372 3373 if (ibt_status == IBT_SUCCESS) { 3374 hca_srv = kmem_zalloc( 3375 sizeof (rib_hca_service_t), 3376 KM_SLEEP); 3377 hca_srv->srv_id = srv_id; 3378 hca_srv->gid = gid; 3379 hca_srv->sbind_hdl = sbp; 3380 3381 rw_enter(&hca->bound_services_lock, 3382 RW_WRITER); 3383 hca_srv->next = hca->bound_services; 3384 hca->bound_services = hca_srv; 3385 rw_exit(&hca->bound_services_lock); 3386 nbinds++; 3387 } 3388 3389 DTRACE_PROBE1(rpcib__i__regservice__bindres, 3390 int, ibt_status); 3391 } 3392 } 3393 } 3394 rw_exit(&rib_stat->service_list_lock); 3395 3396 ibt_free_portinfo(port_infop, port_size); 3397 3398 if (nbinds == 0) { 3399 return (RDMA_FAILED); 3400 } else { 3401 /* 3402 * Put this plugin into accept state, since at least 3403 * one registration was successful.
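 * rib_srv_cm_handler() will now accept incoming connection requests.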
3404 */ 3405 mutex_enter(&plugin_state_lock); 3406 plugin_state = ACCEPT; 3407 mutex_exit(&plugin_state_lock); 3408 return (RDMA_SUCCESS); 3409 } 3410 } 3411 3412 void 3413 rib_listen(struct rdma_svc_data *rd) 3414 { 3415 rdma_stat status; 3416 int n_listening = 0; 3417 rib_hca_t *hca; 3418 3419 mutex_enter(&rib_stat->listen_lock); 3420 /* 3421 * If the rd parameter is NULL, it means that rib_stat->q is 3422 * already initialized by a call from RDMA and we just want to 3423 * add a newly attached HCA to the same listening state as other 3424 * HCAs. 3425 */ 3426 if (rd == NULL) { 3427 if (rib_stat->q == NULL) { 3428 mutex_exit(&rib_stat->listen_lock); 3429 return; 3430 } 3431 } else { 3432 rib_stat->q = &rd->q; 3433 } 3434 rw_enter(&rib_stat->hcas_list_lock, RW_READER); 3435 for (hca = rib_stat->hcas_list; hca; hca = hca->next) { 3436 /* 3437 * First check if the HCA is still attached 3438 */ 3439 rw_enter(&hca->state_lock, RW_READER); 3440 if (hca->state != HCA_INITED) { 3441 rw_exit(&hca->state_lock); 3442 continue; 3443 } 3444 rw_exit(&hca->state_lock); 3445 3446 /* 3447 * Right now the only service type is NFS. Hence 3448 * force feed this value. Ideally to communicate 3449 * the service type it should be passed down in 3450 * rdma_svc_data. 3451 */ 3452 status = rib_register_service(hca, NFS, 3453 IPPROTO_TCP, nfs_rdma_port); 3454 if (status == RDMA_SUCCESS) 3455 n_listening++; 3456 } 3457 rw_exit(&rib_stat->hcas_list_lock); 3458 3459 /* 3460 * Report whether the service is active on any HCA; the caller 3461 * can check rd->err_code for the result. 3462 */ 3463 if (rd) { 3464 if (n_listening > 0) { 3465 rd->active = 1; 3466 rd->err_code = RDMA_SUCCESS; 3467 } else { 3468 rd->active = 0; 3469 rd->err_code = RDMA_FAILED; 3470 } 3471 } 3472 mutex_exit(&rib_stat->listen_lock); 3473 } 3474 3475 /* XXXX */ 3476 /* ARGSUSED */ 3477 static void 3478 rib_listen_stop(struct rdma_svc_data *svcdata) 3479 { 3480 rib_hca_t *hca; 3481 3482 mutex_enter(&rib_stat->listen_lock); 3483 /* 3484 * KRPC called the RDMATF to stop the listeners; this means we 3485 * stop sending incoming or received requests to the KRPC master 3486 * transport handle for RDMA-IB. This also means that the 3487 * master transport handle, responsible for us, is going away. 3488 */ 3489 mutex_enter(&plugin_state_lock); 3490 plugin_state = NO_ACCEPT; 3491 if (svcdata != NULL) 3492 svcdata->active = 0; 3493 mutex_exit(&plugin_state_lock); 3494 3495 rw_enter(&rib_stat->hcas_list_lock, RW_READER); 3496 for (hca = rib_stat->hcas_list; hca; hca = hca->next) { 3497 /* 3498 * First check if the HCA is still attached 3499 */ 3500 rw_enter(&hca->state_lock, RW_READER); 3501 if (hca->state == HCA_DETACHED) { 3502 rw_exit(&hca->state_lock); 3503 continue; 3504 } 3505 rib_close_channels(&hca->srv_conn_list); 3506 rib_stop_services(hca); 3507 rw_exit(&hca->state_lock); 3508 } 3509 rw_exit(&rib_stat->hcas_list_lock); 3510 3511 /* 3512 * Avoid rib_listen() using the stale q field. 3513 * This could happen if a port goes up after all services 3514 * are already unregistered. 3515 */ 3516 rib_stat->q = NULL; 3517 mutex_exit(&rib_stat->listen_lock); 3518 } 3519 3520 /* 3521 * Traverse the HCA's service list to unbind and deregister services. 3522 * For each bound service of the HCA to be removed, first find the corresponding 3523 * service handle (srv_hdl) and then unbind the service by calling 3524 * ibt_unbind_service().
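 * The bound service entry itself is then freed; if the service no longer exists in rib_stat->service_list, only the entry is freed.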
3525 */ 3526 static void 3527 rib_stop_services(rib_hca_t *hca) 3528 { 3529 rib_hca_service_t *srv_list, *to_remove; 3530 3531 /* 3532 * unbind and deregister the services for this service type. 3533 * Right now there is only one service type. In future it will 3534 * be passed down to this function. 3535 */ 3536 rw_enter(&hca->bound_services_lock, RW_READER); 3537 srv_list = hca->bound_services; 3538 hca->bound_services = NULL; 3539 rw_exit(&hca->bound_services_lock); 3540 3541 while (srv_list != NULL) { 3542 rib_service_t *sc; 3543 3544 to_remove = srv_list; 3545 srv_list = to_remove->next; 3546 rw_enter(&rib_stat->service_list_lock, RW_READER); 3547 for (sc = rib_stat->service_list; 3548 sc && (sc->srv_id != to_remove->srv_id); 3549 sc = sc->next) 3550 ; 3551 /* 3552 * if sc is NULL then the service doesn't exist anymore, 3553 * probably just removed completely through rib_stat. 3554 */ 3555 if (sc != NULL) 3556 (void) ibt_unbind_service(sc->srv_hdl, 3557 to_remove->sbind_hdl); 3558 rw_exit(&rib_stat->service_list_lock); 3559 kmem_free(to_remove, sizeof (rib_hca_service_t)); 3560 } 3561 } 3562 3563 static struct svc_recv * 3564 rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl) 3565 { 3566 struct svc_recv *recvp; 3567 3568 recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP); 3569 recvp->vaddr = sgl->ds_va; 3570 recvp->qp = qp; 3571 recvp->bytes_xfer = 0; 3572 return (recvp); 3573 } 3574 3575 static int 3576 rib_free_svc_recv(struct svc_recv *recvp) 3577 { 3578 kmem_free(recvp, sizeof (*recvp)); 3579 3580 return (0); 3581 } 3582 3583 static struct reply * 3584 rib_addreplylist(rib_qp_t *qp, uint32_t msgid) 3585 { 3586 struct reply *rep; 3587 3588 3589 rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP); 3590 if (rep == NULL) { 3591 DTRACE_PROBE(rpcib__i__addrreply__nomem); 3592 return (NULL); 3593 } 3594 rep->xid = msgid; 3595 rep->vaddr_cq = NULL; 3596 rep->bytes_xfer = 0; 3597 rep->status = (uint_t)REPLY_WAIT; 3598 rep->prev = NULL; 3599 cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL); 3600 3601 mutex_enter(&qp->replylist_lock); 3602 if (qp->replylist) { 3603 rep->next = qp->replylist; 3604 qp->replylist->prev = rep; 3605 } 3606 qp->rep_list_size++; 3607 3608 DTRACE_PROBE1(rpcib__i__addrreply__listsize, 3609 int, qp->rep_list_size); 3610 3611 qp->replylist = rep; 3612 mutex_exit(&qp->replylist_lock); 3613 3614 return (rep); 3615 } 3616 3617 static rdma_stat 3618 rib_rem_replylist(rib_qp_t *qp) 3619 { 3620 struct reply *r, *n; 3621 3622 mutex_enter(&qp->replylist_lock); 3623 for (r = qp->replylist; r != NULL; r = n) { 3624 n = r->next; 3625 (void) rib_remreply(qp, r); 3626 } 3627 mutex_exit(&qp->replylist_lock); 3628 3629 return (RDMA_SUCCESS); 3630 } 3631 3632 static int 3633 rib_remreply(rib_qp_t *qp, struct reply *rep) 3634 { 3635 3636 ASSERT(MUTEX_HELD(&qp->replylist_lock)); 3637 if (rep->prev) { 3638 rep->prev->next = rep->next; 3639 } 3640 if (rep->next) { 3641 rep->next->prev = rep->prev; 3642 } 3643 if (qp->replylist == rep) 3644 qp->replylist = rep->next; 3645 3646 cv_destroy(&rep->wait_cv); 3647 qp->rep_list_size--; 3648 3649 DTRACE_PROBE1(rpcib__i__remreply__listsize, 3650 int, qp->rep_list_size); 3651 3652 kmem_free(rep, sizeof (*rep)); 3653 3654 return (0); 3655 } 3656 3657 rdma_stat 3658 rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen, 3659 struct mrc *buf_handle) 3660 { 3661 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ 3662 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ 3663 rdma_stat status; 3664 rib_hca_t *hca = (ctoqp(conn))->hca; 3665 
3666 /* 3667 * Note: ALL buffer pools use the same memory type RDMARW. 3668 */ 3669 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc); 3670 if (status == RDMA_SUCCESS) { 3671 buf_handle->mrc_linfo = (uintptr_t)mr_hdl; 3672 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; 3673 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; 3674 } else { 3675 buf_handle->mrc_linfo = NULL; 3676 buf_handle->mrc_lmr = 0; 3677 buf_handle->mrc_rmr = 0; 3678 } 3679 return (status); 3680 } 3681 3682 static rdma_stat 3683 rib_reg_mem(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size, 3684 ibt_mr_flags_t spec, 3685 ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp) 3686 { 3687 ibt_mr_attr_t mem_attr; 3688 ibt_status_t ibt_status; 3689 mem_attr.mr_vaddr = (uintptr_t)buf; 3690 mem_attr.mr_len = (ib_msglen_t)size; 3691 mem_attr.mr_as = (struct as *)(caddr_t)adsp; 3692 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE | 3693 IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE | 3694 IBT_MR_ENABLE_WINDOW_BIND | spec; 3695 3696 rw_enter(&hca->state_lock, RW_READER); 3697 if (hca->state != HCA_DETACHED) { 3698 ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl, 3699 &mem_attr, mr_hdlp, mr_descp); 3700 rw_exit(&hca->state_lock); 3701 } else { 3702 rw_exit(&hca->state_lock); 3703 return (RDMA_FAILED); 3704 } 3705 3706 if (ibt_status != IBT_SUCCESS) { 3707 return (RDMA_FAILED); 3708 } 3709 return (RDMA_SUCCESS); 3710 } 3711 3712 rdma_stat 3713 rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen, 3714 struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc) 3715 { 3716 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */ 3717 rib_lrc_entry_t *l; 3718 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */ 3719 rdma_stat status; 3720 rib_hca_t *hca = (ctoqp(conn))->hca; 3721 3722 /* 3723 * Non-coherent memory registration. 3724 */ 3725 l = (rib_lrc_entry_t *)lrc; 3726 if (l) { 3727 if (l->registered) { 3728 buf_handle->mrc_linfo = 3729 (uintptr_t)l->lrc_mhandle.mrc_linfo; 3730 buf_handle->mrc_lmr = 3731 (uint32_t)l->lrc_mhandle.mrc_lmr; 3732 buf_handle->mrc_rmr = 3733 (uint32_t)l->lrc_mhandle.mrc_rmr; 3734 *sync_handle = (RIB_SYNCMEM_HANDLE) 3735 (uintptr_t)l->lrc_mhandle.mrc_linfo; 3736 return (RDMA_SUCCESS); 3737 } else { 3738 /* Always register the whole buffer */ 3739 buf = (caddr_t)l->lrc_buf; 3740 buflen = l->lrc_len; 3741 } 3742 } 3743 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc); 3744 3745 if (status == RDMA_SUCCESS) { 3746 if (l) { 3747 l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl; 3748 l->lrc_mhandle.mrc_lmr = (uint32_t)mr_desc.md_lkey; 3749 l->lrc_mhandle.mrc_rmr = (uint32_t)mr_desc.md_rkey; 3750 l->registered = TRUE; 3751 } 3752 buf_handle->mrc_linfo = (uintptr_t)mr_hdl; 3753 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; 3754 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; 3755 *sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl; 3756 } else { 3757 buf_handle->mrc_linfo = NULL; 3758 buf_handle->mrc_lmr = 0; 3759 buf_handle->mrc_rmr = 0; 3760 } 3761 return (status); 3762 } 3763 3764 /* ARGSUSED */ 3765 rdma_stat 3766 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle) 3767 { 3768 rib_hca_t *hca = (ctoqp(conn))->hca; 3769 /* 3770 * Allow memory deregistration even if HCA is 3771 * getting detached. Need all outstanding 3772 * memory registrations to be deregistered 3773 * before HCA_DETACH_EVENT can be accepted. 
3774 */ 3775 (void) ibt_deregister_mr(hca->hca_hdl, 3776 (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo); 3777 return (RDMA_SUCCESS); 3778 } 3779 3780 /* ARGSUSED */ 3781 rdma_stat 3782 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle, 3783 RIB_SYNCMEM_HANDLE sync_handle, void *lrc) 3784 { 3785 rib_lrc_entry_t *l; 3786 l = (rib_lrc_entry_t *)lrc; 3787 if (l) 3788 if (l->registered) 3789 return (RDMA_SUCCESS); 3790 3791 (void) rib_deregistermem(conn, buf, buf_handle); 3792 3793 return (RDMA_SUCCESS); 3794 } 3795 3796 /* ARGSUSED */ 3797 rdma_stat 3798 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf, 3799 int len, int cpu) 3800 { 3801 ibt_status_t status; 3802 rib_hca_t *hca = (ctoqp(conn))->hca; 3803 ibt_mr_sync_t mr_segment; 3804 3805 mr_segment.ms_handle = (ibt_mr_hdl_t)shandle; 3806 mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf; 3807 mr_segment.ms_len = (ib_memlen_t)len; 3808 if (cpu) { 3809 /* make incoming data visible to memory */ 3810 mr_segment.ms_flags = IBT_SYNC_WRITE; 3811 } else { 3812 /* make memory changes visible to IO */ 3813 mr_segment.ms_flags = IBT_SYNC_READ; 3814 } 3815 rw_enter(&hca->state_lock, RW_READER); 3816 if (hca->state != HCA_DETACHED) { 3817 status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1); 3818 rw_exit(&hca->state_lock); 3819 } else { 3820 rw_exit(&hca->state_lock); 3821 return (RDMA_FAILED); 3822 } 3823 3824 if (status == IBT_SUCCESS) 3825 return (RDMA_SUCCESS); 3826 else { 3827 return (RDMA_FAILED); 3828 } 3829 } 3830 3831 /* 3832 * XXXX ???? 3833 */ 3834 static rdma_stat 3835 rib_getinfo(rdma_info_t *info) 3836 { 3837 /* 3838 * XXXX Hack! 3839 */ 3840 info->addrlen = 16; 3841 info->mts = 1000000; 3842 info->mtu = 1000000; 3843 3844 return (RDMA_SUCCESS); 3845 } 3846 3847 rib_bufpool_t * 3848 rib_rbufpool_create(rib_hca_t *hca, int ptype, int num) 3849 { 3850 rib_bufpool_t *rbp = NULL; 3851 bufpool_t *bp = NULL; 3852 caddr_t buf; 3853 ibt_mr_attr_t mem_attr; 3854 ibt_status_t ibt_status; 3855 int i, j; 3856 3857 rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP); 3858 3859 bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) + 3860 num * sizeof (void *), KM_SLEEP); 3861 3862 mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock); 3863 bp->numelems = num; 3864 3865 3866 switch (ptype) { 3867 case SEND_BUFFER: 3868 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3869 bp->rsize = RPC_MSG_SZ; 3870 break; 3871 case RECV_BUFFER: 3872 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3873 bp->rsize = RPC_BUF_SIZE; 3874 break; 3875 default: 3876 goto fail; 3877 } 3878 3879 /* 3880 * Register the pool. 
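 * Each buffer is registered as its own memory region; if any registration fails, the regions registered so far are deregistered and the pool allocation is undone.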
3881 */ 3882 bp->bufsize = num * bp->rsize; 3883 bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP); 3884 rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num * 3885 sizeof (ibt_mr_hdl_t), KM_SLEEP); 3886 rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num * 3887 sizeof (ibt_mr_desc_t), KM_SLEEP); 3888 rw_enter(&hca->state_lock, RW_READER); 3889 3890 if (hca->state == HCA_DETACHED) { 3891 rw_exit(&hca->state_lock); 3892 goto fail; 3893 } 3894 3895 for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) { 3896 bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t)); 3897 mem_attr.mr_vaddr = (uintptr_t)buf; 3898 mem_attr.mr_len = (ib_msglen_t)bp->rsize; 3899 mem_attr.mr_as = NULL; 3900 ibt_status = ibt_register_mr(hca->hca_hdl, 3901 hca->pd_hdl, &mem_attr, 3902 &rbp->mr_hdl[i], 3903 &rbp->mr_desc[i]); 3904 if (ibt_status != IBT_SUCCESS) { 3905 for (j = 0; j < i; j++) { 3906 (void) ibt_deregister_mr(hca->hca_hdl, 3907 rbp->mr_hdl[j]); 3908 } 3909 rw_exit(&hca->state_lock); 3910 goto fail; 3911 } 3912 } 3913 rw_exit(&hca->state_lock); 3914 buf = (caddr_t)bp->buf; 3915 for (i = 0; i < num; i++, buf += bp->rsize) { 3916 bp->buflist[i] = (void *)buf; 3917 } 3918 bp->buffree = num - 1; /* no. of free buffers */ 3919 rbp->bpool = bp; 3920 3921 return (rbp); 3922 fail: 3923 if (bp) { 3924 if (bp->buf) 3925 kmem_free(bp->buf, bp->bufsize); 3926 kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *)); 3927 } 3928 if (rbp) { 3929 if (rbp->mr_hdl) 3930 kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t)); 3931 if (rbp->mr_desc) 3932 kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t)); 3933 kmem_free(rbp, sizeof (rib_bufpool_t)); 3934 } 3935 return (NULL); 3936 } 3937 3938 static void 3939 rib_rbufpool_deregister(rib_hca_t *hca, int ptype) 3940 { 3941 int i; 3942 rib_bufpool_t *rbp = NULL; 3943 bufpool_t *bp; 3944 3945 /* 3946 * Obtain pool address based on type of pool 3947 */ 3948 switch (ptype) { 3949 case SEND_BUFFER: 3950 rbp = hca->send_pool; 3951 break; 3952 case RECV_BUFFER: 3953 rbp = hca->recv_pool; 3954 break; 3955 default: 3956 return; 3957 } 3958 if (rbp == NULL) 3959 return; 3960 3961 bp = rbp->bpool; 3962 3963 /* 3964 * Deregister the pool memory and free it. 3965 */ 3966 for (i = 0; i < bp->numelems; i++) { 3967 (void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]); 3968 } 3969 } 3970 3971 static void 3972 rib_rbufpool_free(rib_hca_t *hca, int ptype) 3973 { 3974 3975 rib_bufpool_t *rbp = NULL; 3976 bufpool_t *bp; 3977 3978 /* 3979 * Obtain pool address based on type of pool 3980 */ 3981 switch (ptype) { 3982 case SEND_BUFFER: 3983 rbp = hca->send_pool; 3984 break; 3985 case RECV_BUFFER: 3986 rbp = hca->recv_pool; 3987 break; 3988 default: 3989 return; 3990 } 3991 if (rbp == NULL) 3992 return; 3993 3994 bp = rbp->bpool; 3995 3996 /* 3997 * Free the pool memory. 3998 */ 3999 if (rbp->mr_hdl) 4000 kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t)); 4001 4002 if (rbp->mr_desc) 4003 kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t)); 4004 if (bp->buf) 4005 kmem_free(bp->buf, bp->bufsize); 4006 mutex_destroy(&bp->buflock); 4007 kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *)); 4008 kmem_free(rbp, sizeof (rib_bufpool_t)); 4009 } 4010 4011 void 4012 rib_rbufpool_destroy(rib_hca_t *hca, int ptype) 4013 { 4014 /* 4015 * Deregister the pool memory and free it. 4016 */ 4017 rib_rbufpool_deregister(hca, ptype); 4018 rib_rbufpool_free(hca, ptype); 4019 } 4020 4021 /* 4022 * Fetch a buffer from the pool of type specified in rdbuf->type. 
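 * RDMA_LONG_BUFFER requests are satisfied from the long reply cache; SEND_BUFFER and RECV_BUFFER requests come from the pre-registered buffer pools.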
4023 */ 4024 static rdma_stat 4025 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf) 4026 { 4027 rib_lrc_entry_t *rlep; 4028 4029 if (rdbuf->type == RDMA_LONG_BUFFER) { 4030 rlep = rib_get_cache_buf(conn, rdbuf->len); 4031 rdbuf->rb_private = (caddr_t)rlep; 4032 rdbuf->addr = rlep->lrc_buf; 4033 rdbuf->handle = rlep->lrc_mhandle; 4034 return (RDMA_SUCCESS); 4035 } 4036 4037 rdbuf->addr = rib_rbuf_alloc(conn, rdbuf); 4038 if (rdbuf->addr) { 4039 switch (rdbuf->type) { 4040 case SEND_BUFFER: 4041 rdbuf->len = RPC_MSG_SZ; /* 1K */ 4042 break; 4043 case RECV_BUFFER: 4044 rdbuf->len = RPC_BUF_SIZE; /* 2K */ 4045 break; 4046 default: 4047 rdbuf->len = 0; 4048 } 4049 return (RDMA_SUCCESS); 4050 } else 4051 return (RDMA_FAILED); 4052 } 4053 4054 /* 4055 * Fetch a buffer of specified type. 4056 * Note that rdbuf->handle is mw's rkey. 4057 */ 4058 static void * 4059 rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf) 4060 { 4061 rib_qp_t *qp = ctoqp(conn); 4062 rib_hca_t *hca = qp->hca; 4063 rdma_btype ptype = rdbuf->type; 4064 void *buf; 4065 rib_bufpool_t *rbp = NULL; 4066 bufpool_t *bp; 4067 int i; 4068 4069 /* 4070 * Obtain pool address based on type of pool 4071 */ 4072 switch (ptype) { 4073 case SEND_BUFFER: 4074 rbp = hca->send_pool; 4075 break; 4076 case RECV_BUFFER: 4077 rbp = hca->recv_pool; 4078 break; 4079 default: 4080 return (NULL); 4081 } 4082 if (rbp == NULL) 4083 return (NULL); 4084 4085 bp = rbp->bpool; 4086 4087 mutex_enter(&bp->buflock); 4088 if (bp->buffree < 0) { 4089 mutex_exit(&bp->buflock); 4090 return (NULL); 4091 } 4092 4093 /* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. */ 4094 buf = bp->buflist[bp->buffree]; 4095 rdbuf->addr = buf; 4096 rdbuf->len = bp->rsize; 4097 for (i = bp->numelems - 1; i >= 0; i--) { 4098 if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) { 4099 rdbuf->handle.mrc_rmr = 4100 (uint32_t)rbp->mr_desc[i].md_rkey; 4101 rdbuf->handle.mrc_linfo = 4102 (uintptr_t)rbp->mr_hdl[i]; 4103 rdbuf->handle.mrc_lmr = 4104 (uint32_t)rbp->mr_desc[i].md_lkey; 4105 bp->buffree--; 4106 4107 mutex_exit(&bp->buflock); 4108 4109 return (buf); 4110 } 4111 } 4112 4113 mutex_exit(&bp->buflock); 4114 4115 return (NULL); 4116 } 4117 4118 static void 4119 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf) 4120 { 4121 4122 if (rdbuf->type == RDMA_LONG_BUFFER) { 4123 rib_free_cache_buf(conn, (rib_lrc_entry_t *)rdbuf->rb_private); 4124 rdbuf->rb_private = NULL; 4125 return; 4126 } 4127 rib_rbuf_free(conn, rdbuf->type, rdbuf->addr); 4128 } 4129 4130 static void 4131 rib_rbuf_free(CONN *conn, int ptype, void *buf) 4132 { 4133 rib_qp_t *qp = ctoqp(conn); 4134 rib_hca_t *hca = qp->hca; 4135 rib_bufpool_t *rbp = NULL; 4136 bufpool_t *bp; 4137 4138 /* 4139 * Obtain pool address based on type of pool 4140 */ 4141 switch (ptype) { 4142 case SEND_BUFFER: 4143 rbp = hca->send_pool; 4144 break; 4145 case RECV_BUFFER: 4146 rbp = hca->recv_pool; 4147 break; 4148 default: 4149 return; 4150 } 4151 if (rbp == NULL) 4152 return; 4153 4154 bp = rbp->bpool; 4155 4156 mutex_enter(&bp->buflock); 4157 if (++bp->buffree >= bp->numelems) { 4158 /* 4159 * Should never happen 4160 */ 4161 bp->buffree--; 4162 } else { 4163 bp->buflist[bp->buffree] = buf; 4164 } 4165 mutex_exit(&bp->buflock); 4166 } 4167 4168 static rdma_stat 4169 rib_add_connlist(CONN *cn, rib_conn_list_t *connlist) 4170 { 4171 rw_enter(&connlist->conn_lock, RW_WRITER); 4172 if (connlist->conn_hd) { 4173 cn->c_next = connlist->conn_hd; 4174 connlist->conn_hd->c_prev = cn; 4175 } 4176 connlist->conn_hd = cn; 4177 
rw_exit(&connlist->conn_lock); 4178 4179 return (RDMA_SUCCESS); 4180 } 4181 4182 static rdma_stat 4183 rib_rm_conn(CONN *cn, rib_conn_list_t *connlist) 4184 { 4185 rw_enter(&connlist->conn_lock, RW_WRITER); 4186 if (cn->c_prev) { 4187 cn->c_prev->c_next = cn->c_next; 4188 } 4189 if (cn->c_next) { 4190 cn->c_next->c_prev = cn->c_prev; 4191 } 4192 if (connlist->conn_hd == cn) 4193 connlist->conn_hd = cn->c_next; 4194 rw_exit(&connlist->conn_lock); 4195 4196 return (RDMA_SUCCESS); 4197 } 4198 4199 /* ARGSUSED */ 4200 static rdma_stat 4201 rib_conn_get(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr, 4202 int addr_type, void *handle, CONN **conn) 4203 { 4204 rdma_stat status; 4205 rpcib_ping_t rpt; 4206 4207 status = rib_connect(s_svcaddr, d_svcaddr, addr_type, &rpt, conn); 4208 return (status); 4209 } 4210 4211 /* 4212 * rib_find_hca_connection 4213 * 4214 * if there is an existing connection to the specified address then 4215 * it will be returned in conn, otherwise conn will be set to NULL. 4216 * Also cleans up any connection that is in error state. 4217 */ 4218 static int 4219 rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr, 4220 struct netbuf *d_svcaddr, CONN **conn) 4221 { 4222 CONN *cn; 4223 clock_t cv_stat, timout; 4224 4225 *conn = NULL; 4226 again: 4227 rw_enter(&hca->cl_conn_list.conn_lock, RW_READER); 4228 cn = hca->cl_conn_list.conn_hd; 4229 while (cn != NULL) { 4230 /* 4231 * First, clear up any connection in the ERROR state 4232 */ 4233 mutex_enter(&cn->c_lock); 4234 if (cn->c_state == C_ERROR_CONN) { 4235 if (cn->c_ref == 0) { 4236 /* 4237 * Remove connection from list and destroy it. 4238 */ 4239 cn->c_state = C_DISCONN_PEND; 4240 mutex_exit(&cn->c_lock); 4241 rw_exit(&hca->cl_conn_list.conn_lock); 4242 rib_conn_close((void *)cn); 4243 goto again; 4244 } 4245 mutex_exit(&cn->c_lock); 4246 cn = cn->c_next; 4247 continue; 4248 } 4249 if (cn->c_state == C_DISCONN_PEND) { 4250 mutex_exit(&cn->c_lock); 4251 cn = cn->c_next; 4252 continue; 4253 } 4254 4255 /* 4256 * source address is only checked for if there is one, 4257 * this is the case for retries. 4258 */ 4259 if ((cn->c_raddr.len == d_svcaddr->len) && 4260 (bcmp(d_svcaddr->buf, cn->c_raddr.buf, 4261 d_svcaddr->len) == 0) && 4262 ((s_svcaddr->len == 0) || 4263 ((cn->c_laddr.len == s_svcaddr->len) && 4264 (bcmp(s_svcaddr->buf, cn->c_laddr.buf, 4265 s_svcaddr->len) == 0)))) { 4266 /* 4267 * Our connection. Give up conn list lock 4268 * as we are done traversing the list. 4269 */ 4270 rw_exit(&hca->cl_conn_list.conn_lock); 4271 if (cn->c_state == C_CONNECTED) { 4272 cn->c_ref++; /* sharing a conn */ 4273 mutex_exit(&cn->c_lock); 4274 *conn = cn; 4275 return (RDMA_SUCCESS); 4276 } 4277 if (cn->c_state == C_CONN_PEND) { 4278 /* 4279 * Hold a reference to this conn before 4280 * we give up the lock. 
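* cv_timedwait_sig() drops c_lock while it sleeps, so this
* reference is what keeps the connection from being destroyed
* while we wait for it to leave the C_CONN_PEND state.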
4281 */ 4282 cn->c_ref++; 4283 timout = ddi_get_lbolt() + 4284 drv_usectohz(CONN_WAIT_TIME * 1000000); 4285 while ((cv_stat = cv_timedwait_sig(&cn->c_cv, 4286 &cn->c_lock, timout)) > 0 && 4287 cn->c_state == C_CONN_PEND) 4288 ; 4289 if (cv_stat == 0) { 4290 (void) rib_conn_release_locked(cn); 4291 return (RDMA_INTR); 4292 } 4293 if (cv_stat < 0) { 4294 (void) rib_conn_release_locked(cn); 4295 return (RDMA_TIMEDOUT); 4296 } 4297 if (cn->c_state == C_CONNECTED) { 4298 *conn = cn; 4299 mutex_exit(&cn->c_lock); 4300 return (RDMA_SUCCESS); 4301 } else { 4302 (void) rib_conn_release_locked(cn); 4303 return (RDMA_TIMEDOUT); 4304 } 4305 } 4306 } 4307 mutex_exit(&cn->c_lock); 4308 cn = cn->c_next; 4309 } 4310 rw_exit(&hca->cl_conn_list.conn_lock); 4311 *conn = NULL; 4312 return (RDMA_FAILED); 4313 } 4314 4315 /* 4316 * Connection management. 4317 * IBTF does not support recycling of channels. So connections are only 4318 * in four states - C_CONN_PEND, or C_CONNECTED, or C_ERROR_CONN or 4319 * C_DISCONN_PEND state. No C_IDLE state. 4320 * C_CONN_PEND state: Connection establishment in progress to the server. 4321 * C_CONNECTED state: A connection when created is in C_CONNECTED state. 4322 * It has an RC channel associated with it. ibt_post_send/recv are allowed 4323 * only in this state. 4324 * C_ERROR_CONN state: A connection transitions to this state when WRs on the 4325 * channel are completed in error or an IBT_CM_EVENT_CONN_CLOSED event 4326 * happens on the channel or a IBT_HCA_DETACH_EVENT occurs on the HCA. 4327 * C_DISCONN_PEND state: When a connection is in C_ERROR_CONN state and when 4328 * c_ref drops to 0 (this indicates that RPC has no more references to this 4329 * connection), the connection should be destroyed. A connection transitions 4330 * into this state when it is being destroyed. 4331 */ 4332 /* ARGSUSED */ 4333 static rdma_stat 4334 rib_connect(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr, 4335 int addr_type, rpcib_ping_t *rpt, CONN **conn) 4336 { 4337 CONN *cn; 4338 int status; 4339 rib_hca_t *hca; 4340 rib_qp_t *qp; 4341 int s_addr_len; 4342 char *s_addr_buf; 4343 4344 rw_enter(&rib_stat->hcas_list_lock, RW_READER); 4345 for (hca = rib_stat->hcas_list; hca; hca = hca->next) { 4346 rw_enter(&hca->state_lock, RW_READER); 4347 if (hca->state != HCA_DETACHED) { 4348 status = rib_find_hca_connection(hca, s_svcaddr, 4349 d_svcaddr, conn); 4350 rw_exit(&hca->state_lock); 4351 if ((status == RDMA_INTR) || (status == RDMA_SUCCESS)) { 4352 rw_exit(&rib_stat->hcas_list_lock); 4353 return (status); 4354 } 4355 } else 4356 rw_exit(&hca->state_lock); 4357 } 4358 rw_exit(&rib_stat->hcas_list_lock); 4359 4360 /* 4361 * No existing connection found, establish a new connection. 4362 */ 4363 bzero(rpt, sizeof (rpcib_ping_t)); 4364 4365 status = rib_ping_srv(addr_type, d_svcaddr, rpt); 4366 if (status != RDMA_SUCCESS) { 4367 return (RDMA_FAILED); 4368 } 4369 hca = rpt->hca; 4370 4371 if (rpt->srcip.family == AF_INET) { 4372 s_addr_len = sizeof (rpt->srcip.un.ip4addr); 4373 s_addr_buf = (char *)&rpt->srcip.un.ip4addr; 4374 } else if (rpt->srcip.family == AF_INET6) { 4375 s_addr_len = sizeof (rpt->srcip.un.ip6addr); 4376 s_addr_buf = (char *)&rpt->srcip.un.ip6addr; 4377 } else { 4378 return (RDMA_FAILED); 4379 } 4380 4381 /* 4382 * Channel to server doesn't exist yet, create one. 
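* rib_clnt_create_chan() allocates the RC channel and its queue
* pair; the new connection is then added to the client connection
* list in C_CONN_PEND state and rib_conn_to_srv() performs the
* actual connection establishment.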
4383 */
4384 if (rib_clnt_create_chan(hca, d_svcaddr, &qp) != RDMA_SUCCESS) {
4385 return (RDMA_FAILED);
4386 }
4387 cn = qptoc(qp);
4388 cn->c_state = C_CONN_PEND;
4389 cn->c_ref = 1;
4390
4391 cn->c_laddr.buf = kmem_alloc(s_addr_len, KM_SLEEP);
4392 bcopy(s_addr_buf, cn->c_laddr.buf, s_addr_len);
4393 cn->c_laddr.len = cn->c_laddr.maxlen = s_addr_len;
4394
4395 if (rpt->srcip.family == AF_INET) {
4396 cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1, KM_SLEEP);
4397 (void) strcpy(cn->c_netid, RIBNETID_TCP);
4398
4399 cn->c_addrmask.len = cn->c_addrmask.maxlen =
4400 sizeof (struct sockaddr_in);
4401 cn->c_addrmask.buf = kmem_zalloc(cn->c_addrmask.len, KM_SLEEP);
4402
4403 ((struct sockaddr_in *)cn->c_addrmask.buf)->sin_addr.s_addr =
4404 (uint32_t)~0;
4405 ((struct sockaddr_in *)cn->c_addrmask.buf)->sin_family =
4406 (ushort_t)~0;
4407
4408 } else {
4409 cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1, KM_SLEEP);
4410 (void) strcpy(cn->c_netid, RIBNETID_TCP6);
4411
4412 cn->c_addrmask.len = cn->c_addrmask.maxlen =
4413 sizeof (struct sockaddr_in6);
4414 cn->c_addrmask.buf = kmem_zalloc(cn->c_addrmask.len, KM_SLEEP);
4415
4416 (void) memset(
4417 &((struct sockaddr_in6 *)cn->c_addrmask.buf)->sin6_addr,
4418 (uchar_t)~0, sizeof (struct in6_addr));
4419 ((struct sockaddr_in6 *)cn->c_addrmask.buf)->sin6_family =
4420 (sa_family_t)~0;
4421 }
4422
4423 /*
4424 * Add to conn list.
4425 * We had given up the READER lock. In the time since then,
4426 * another thread might have created the connection we are
4427 * trying to establish here. But for now that is quite all
4428 * right - there might be two connections between a pair of
4429 * hosts instead of one. If we really want to close that
4430 * window, we would need to check the list again after
4431 * acquiring the WRITER lock.
4432 */
4433 (void) rib_add_connlist(cn, &hca->cl_conn_list);
4434 status = rib_conn_to_srv(hca, qp, rpt);
4435 mutex_enter(&cn->c_lock);
4436
4437 if (cn->c_flags & C_CLOSE_PENDING) {
4438 /*
4439 * This handles the case where the module or the
4440 * HCA detached while the connection was being
4441 * established. In that case, close the
4442 * connection immediately if this is the
4443 * only reference.
4444 */
4445 if (cn->c_ref == 1) {
4446 cn->c_ref--;
4447 cn->c_state = C_DISCONN_PEND;
4448 mutex_exit(&cn->c_lock);
4449 rib_conn_close((void *)cn);
4450 return (RDMA_FAILED);
4451 }
4452
4453 /*
4454 * The connection will be closed later, when c_ref drops to 0.
4455 */
4456 status = RDMA_FAILED;
4457 }
4458
4459 if (status == RDMA_SUCCESS) {
4460 cn->c_state = C_CONNECTED;
4461 *conn = cn;
4462 } else {
4463 cn->c_state = C_ERROR_CONN;
4464 cn->c_ref--;
4465 }
4466 cv_signal(&cn->c_cv);
4467 mutex_exit(&cn->c_lock);
4468 return (status);
4469 }
4470
4471 static void
4472 rib_conn_close(void *rarg)
4473 {
4474 CONN *conn = (CONN *)rarg;
4475 rib_qp_t *qp = ctoqp(conn);
4476
4477 mutex_enter(&conn->c_lock);
4478 if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) {
4479
4480 conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING);
4481
4482 /*
4483 * Live connection in CONNECTED state.
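* Move it to C_ERROR_CONN first so that no further work is
* posted on the channel while it is being closed.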
4484 */ 4485 if (conn->c_state == C_CONNECTED) { 4486 conn->c_state = C_ERROR_CONN; 4487 } 4488 mutex_exit(&conn->c_lock); 4489 4490 rib_close_a_channel(conn); 4491 4492 mutex_enter(&conn->c_lock); 4493 conn->c_flags &= ~C_CLOSE_PENDING; 4494 } 4495 4496 mutex_exit(&conn->c_lock); 4497 4498 if (qp->mode == RIB_SERVER) 4499 (void) rib_disconnect_channel(conn, 4500 &qp->hca->srv_conn_list); 4501 else 4502 (void) rib_disconnect_channel(conn, 4503 &qp->hca->cl_conn_list); 4504 } 4505 4506 static void 4507 rib_conn_timeout_call(void *carg) 4508 { 4509 time_t idle_time; 4510 CONN *conn = (CONN *)carg; 4511 rib_hca_t *hca = ctoqp(conn)->hca; 4512 int error; 4513 4514 mutex_enter(&conn->c_lock); 4515 if ((conn->c_ref > 0) || 4516 (conn->c_state == C_DISCONN_PEND)) { 4517 conn->c_timeout = NULL; 4518 mutex_exit(&conn->c_lock); 4519 return; 4520 } 4521 4522 idle_time = (gethrestime_sec() - conn->c_last_used); 4523 4524 if ((idle_time <= rib_conn_timeout) && 4525 (conn->c_state != C_ERROR_CONN)) { 4526 /* 4527 * There was activity after the last timeout. 4528 * Extend the conn life. Unless the conn is 4529 * already in error state. 4530 */ 4531 conn->c_timeout = timeout(rib_conn_timeout_call, conn, 4532 SEC_TO_TICK(rib_conn_timeout - idle_time)); 4533 mutex_exit(&conn->c_lock); 4534 return; 4535 } 4536 4537 error = ddi_taskq_dispatch(hca->cleanup_helper, rib_conn_close, 4538 (void *)conn, DDI_NOSLEEP); 4539 4540 /* 4541 * If taskq dispatch fails above, then reset the timeout 4542 * to try again after 10 secs. 4543 */ 4544 4545 if (error != DDI_SUCCESS) { 4546 conn->c_timeout = timeout(rib_conn_timeout_call, conn, 4547 SEC_TO_TICK(RDMA_CONN_REAP_RETRY)); 4548 mutex_exit(&conn->c_lock); 4549 return; 4550 } 4551 4552 conn->c_state = C_DISCONN_PEND; 4553 mutex_exit(&conn->c_lock); 4554 } 4555 4556 static rdma_stat 4557 rib_conn_release(CONN *conn) 4558 { 4559 mutex_enter(&conn->c_lock); 4560 return (rib_conn_release_locked(conn)); 4561 } 4562 4563 /* 4564 * Expects conn->c_lock to be held on entry. 4565 * c_lock released on return 4566 */ 4567 static rdma_stat 4568 rib_conn_release_locked(CONN *conn) 4569 { 4570 conn->c_ref--; 4571 4572 conn->c_last_used = gethrestime_sec(); 4573 if (conn->c_ref > 0) { 4574 mutex_exit(&conn->c_lock); 4575 return (RDMA_SUCCESS); 4576 } 4577 4578 /* 4579 * If a conn is C_ERROR_CONN, close the channel. 
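* This was the last reference, so transition to C_DISCONN_PEND
* and tear the channel down now instead of waiting for the idle
* timeout to fire.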
4580 */ 4581 if (conn->c_ref == 0 && conn->c_state == C_ERROR_CONN) { 4582 conn->c_state = C_DISCONN_PEND; 4583 mutex_exit(&conn->c_lock); 4584 rib_conn_close((void *)conn); 4585 return (RDMA_SUCCESS); 4586 } 4587 4588 /* 4589 * c_ref == 0, set a timeout for conn release 4590 */ 4591 4592 if (conn->c_timeout == NULL) { 4593 conn->c_timeout = timeout(rib_conn_timeout_call, conn, 4594 SEC_TO_TICK(rib_conn_timeout)); 4595 } 4596 4597 mutex_exit(&conn->c_lock); 4598 return (RDMA_SUCCESS); 4599 } 4600 4601 /* 4602 * Add at front of list 4603 */ 4604 static struct rdma_done_list * 4605 rdma_done_add(rib_qp_t *qp, uint32_t xid) 4606 { 4607 struct rdma_done_list *rd; 4608 4609 ASSERT(MUTEX_HELD(&qp->rdlist_lock)); 4610 4611 rd = kmem_alloc(sizeof (*rd), KM_SLEEP); 4612 rd->xid = xid; 4613 cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL); 4614 4615 rd->prev = NULL; 4616 rd->next = qp->rdlist; 4617 if (qp->rdlist != NULL) 4618 qp->rdlist->prev = rd; 4619 qp->rdlist = rd; 4620 4621 return (rd); 4622 } 4623 4624 static void 4625 rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd) 4626 { 4627 struct rdma_done_list *r; 4628 4629 ASSERT(MUTEX_HELD(&qp->rdlist_lock)); 4630 4631 r = rd->next; 4632 if (r != NULL) { 4633 r->prev = rd->prev; 4634 } 4635 4636 r = rd->prev; 4637 if (r != NULL) { 4638 r->next = rd->next; 4639 } else { 4640 qp->rdlist = rd->next; 4641 } 4642 4643 cv_destroy(&rd->rdma_done_cv); 4644 kmem_free(rd, sizeof (*rd)); 4645 } 4646 4647 static void 4648 rdma_done_rem_list(rib_qp_t *qp) 4649 { 4650 struct rdma_done_list *r, *n; 4651 4652 mutex_enter(&qp->rdlist_lock); 4653 for (r = qp->rdlist; r != NULL; r = n) { 4654 n = r->next; 4655 rdma_done_rm(qp, r); 4656 } 4657 mutex_exit(&qp->rdlist_lock); 4658 } 4659 4660 static void 4661 rdma_done_notify(rib_qp_t *qp, uint32_t xid) 4662 { 4663 struct rdma_done_list *r = qp->rdlist; 4664 4665 ASSERT(MUTEX_HELD(&qp->rdlist_lock)); 4666 4667 while (r) { 4668 if (r->xid == xid) { 4669 cv_signal(&r->rdma_done_cv); 4670 return; 4671 } else { 4672 r = r->next; 4673 } 4674 } 4675 DTRACE_PROBE1(rpcib__i__donenotify__nomatchxid, 4676 int, xid); 4677 } 4678 4679 /* 4680 * Expects conn->c_lock to be held by the caller. 4681 */ 4682 4683 static void 4684 rib_close_a_channel(CONN *conn) 4685 { 4686 rib_qp_t *qp; 4687 qp = ctoqp(conn); 4688 4689 if (qp->qp_hdl == NULL) { 4690 /* channel already freed */ 4691 return; 4692 } 4693 4694 /* 4695 * Call ibt_close_rc_channel in blocking mode 4696 * with no callbacks. 4697 */ 4698 (void) ibt_close_rc_channel(qp->qp_hdl, IBT_NOCALLBACKS, 4699 NULL, 0, NULL, NULL, 0); 4700 } 4701 4702 /* 4703 * Goes through all connections and closes the channel 4704 * This will cause all the WRs on those channels to be 4705 * flushed. 4706 */ 4707 static void 4708 rib_close_channels(rib_conn_list_t *connlist) 4709 { 4710 CONN *conn, *tmp; 4711 4712 rw_enter(&connlist->conn_lock, RW_READER); 4713 conn = connlist->conn_hd; 4714 while (conn != NULL) { 4715 mutex_enter(&conn->c_lock); 4716 tmp = conn->c_next; 4717 if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) { 4718 4719 if (conn->c_state == C_CONN_PEND) { 4720 conn->c_flags |= C_CLOSE_PENDING; 4721 goto next; 4722 } 4723 4724 conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING); 4725 4726 /* 4727 * Live connection in CONNECTED state. 
4728 */ 4729 if (conn->c_state == C_CONNECTED) 4730 conn->c_state = C_ERROR_CONN; 4731 mutex_exit(&conn->c_lock); 4732 4733 rib_close_a_channel(conn); 4734 4735 mutex_enter(&conn->c_lock); 4736 conn->c_flags &= ~C_CLOSE_PENDING; 4737 /* Signal a pending rib_disconnect_channel() */ 4738 cv_signal(&conn->c_cv); 4739 } 4740 next: 4741 mutex_exit(&conn->c_lock); 4742 conn = tmp; 4743 } 4744 rw_exit(&connlist->conn_lock); 4745 } 4746 4747 /* 4748 * Frees up all connections that are no longer being referenced 4749 */ 4750 static void 4751 rib_purge_connlist(rib_conn_list_t *connlist) 4752 { 4753 CONN *conn; 4754 4755 top: 4756 rw_enter(&connlist->conn_lock, RW_READER); 4757 conn = connlist->conn_hd; 4758 while (conn != NULL) { 4759 mutex_enter(&conn->c_lock); 4760 4761 /* 4762 * At this point connection is either in ERROR 4763 * or DISCONN_PEND state. If in DISCONN_PEND state 4764 * then some other thread is culling that connection. 4765 * If not and if c_ref is 0, then destroy the connection. 4766 */ 4767 if (conn->c_ref == 0 && 4768 conn->c_state != C_DISCONN_PEND) { 4769 /* 4770 * Cull the connection 4771 */ 4772 conn->c_state = C_DISCONN_PEND; 4773 mutex_exit(&conn->c_lock); 4774 rw_exit(&connlist->conn_lock); 4775 (void) rib_disconnect_channel(conn, connlist); 4776 goto top; 4777 } else { 4778 /* 4779 * conn disconnect already scheduled or will 4780 * happen from conn_release when c_ref drops to 0. 4781 */ 4782 mutex_exit(&conn->c_lock); 4783 } 4784 conn = conn->c_next; 4785 } 4786 rw_exit(&connlist->conn_lock); 4787 4788 /* 4789 * At this point, only connections with c_ref != 0 are on the list 4790 */ 4791 } 4792 4793 /* 4794 * Free all the HCA resources and close 4795 * the hca. 4796 */ 4797 4798 static void 4799 rib_free_hca(rib_hca_t *hca) 4800 { 4801 (void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl); 4802 (void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl); 4803 (void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl); 4804 (void) ibt_free_cq(hca->svc_scq->rib_cq_hdl); 4805 4806 kmem_free(hca->clnt_rcq, sizeof (rib_cq_t)); 4807 kmem_free(hca->clnt_scq, sizeof (rib_cq_t)); 4808 kmem_free(hca->svc_rcq, sizeof (rib_cq_t)); 4809 kmem_free(hca->svc_scq, sizeof (rib_cq_t)); 4810 4811 rib_rbufpool_destroy(hca, RECV_BUFFER); 4812 rib_rbufpool_destroy(hca, SEND_BUFFER); 4813 rib_destroy_cache(hca); 4814 if (rib_mod.rdma_count == 0) 4815 (void) rdma_unregister_mod(&rib_mod); 4816 (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl); 4817 (void) ibt_close_hca(hca->hca_hdl); 4818 hca->hca_hdl = NULL; 4819 } 4820 4821 4822 static void 4823 rib_stop_hca_services(rib_hca_t *hca) 4824 { 4825 rib_stop_services(hca); 4826 rib_close_channels(&hca->cl_conn_list); 4827 rib_close_channels(&hca->srv_conn_list); 4828 4829 rib_purge_connlist(&hca->cl_conn_list); 4830 rib_purge_connlist(&hca->srv_conn_list); 4831 4832 if ((rib_stat->hcas_list == NULL) && stats_enabled) { 4833 kstat_delete_byname_zone("unix", 0, "rpcib_cache", 4834 GLOBAL_ZONEID); 4835 stats_enabled = FALSE; 4836 } 4837 4838 rw_enter(&hca->srv_conn_list.conn_lock, RW_READER); 4839 rw_enter(&hca->cl_conn_list.conn_lock, RW_READER); 4840 if (hca->srv_conn_list.conn_hd == NULL && 4841 hca->cl_conn_list.conn_hd == NULL) { 4842 /* 4843 * conn_lists are NULL, so destroy 4844 * buffers, close hca and be done. 
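* rib_free_hca() clears hca->hca_hdl, so the check further down
* will not attempt to free the HCA a second time.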
4845 */
4846 rib_free_hca(hca);
4847 }
4848 rw_exit(&hca->cl_conn_list.conn_lock);
4849 rw_exit(&hca->srv_conn_list.conn_lock);
4850
4851 if (hca->hca_hdl != NULL) {
4852 mutex_enter(&hca->inuse_lock);
4853 while (hca->inuse)
4854 cv_wait(&hca->cb_cv, &hca->inuse_lock);
4855 mutex_exit(&hca->inuse_lock);
4856
4857 rib_free_hca(hca);
4858 }
4859 rw_destroy(&hca->bound_services_lock);
4860
4861 if (hca->cleanup_helper != NULL) {
4862 ddi_taskq_destroy(hca->cleanup_helper);
4863 hca->cleanup_helper = NULL;
4864 }
4865 }
4866
4867 /*
4868 * Cleans and closes up all uses of the HCA
4869 */
4870 static void
4871 rib_detach_hca(ibt_hca_hdl_t hca_hdl)
4872 {
4873 rib_hca_t *hca = NULL;
4874 rib_hca_t **hcap;
4875
4876 rw_enter(&rib_stat->hcas_list_lock, RW_WRITER);
4877 for (hcap = &rib_stat->hcas_list; *hcap; hcap = &(*hcap)->next) {
4878 hca = *hcap;
4879 rw_enter(&hca->state_lock, RW_WRITER);
4880 if (hca->hca_hdl == hca_hdl) {
4881 /*
4882 * Mark as detached and remove from
4883 * hca list.
4884 */
4885 hca->state = HCA_DETACHED;
4886 *hcap = hca->next;
4887 rib_stat->nhca_inited--;
4888 rib_mod.rdma_count--;
4889 rw_exit(&hca->state_lock);
4890 break;
4891 }
4892 rw_exit(&hca->state_lock);
4893 }
4894 rw_exit(&rib_stat->hcas_list_lock);
4895
4896 if (hca == NULL)
4897 return;
4898 ASSERT(hca->hca_hdl == hca_hdl);
4899
4900 /*
4901 * Stop all services on the HCA
4902 * Go through cl_conn_list and close all rc_channels
4903 * Go through srv_conn_list and close all rc_channels
4904 * Free connections whose c_ref has dropped to 0
4905 * Destroy all CQs
4906 * Deregister and release all buffer pool memory after all
4907 * connections are destroyed
4908 * Free the protection domain
4909 * ibt_close_hca()
4910 */
4911 rib_stop_hca_services(hca);
4912
4913 kmem_free(hca, sizeof (*hca));
4914 }
4915
4916 static void
4917 rib_server_side_cache_reclaim(void *argp)
4918 {
4919 cache_avl_struct_t *rcas;
4920 rib_lrc_entry_t *rb;
4921 rib_hca_t *hca = (rib_hca_t *)argp;
4922
4923 rw_enter(&hca->avl_rw_lock, RW_WRITER);
4924 rcas = avl_first(&hca->avl_tree);
4925 if (rcas != NULL)
4926 avl_remove(&hca->avl_tree, rcas);
4927
4928 while (rcas != NULL) {
4929 while (rcas->r.forw != &rcas->r) {
4930 rcas->elements--;
4931 rb = rcas->r.forw;
4932 remque(rb);
4933 if (rb->registered)
4934 (void) rib_deregistermem_via_hca(hca,
4935 rb->lrc_buf, rb->lrc_mhandle);
4936
4937 hca->cache_allocation -= rb->lrc_len;
4938 kmem_free(rb->lrc_buf, rb->lrc_len);
4939 kmem_free(rb, sizeof (rib_lrc_entry_t));
4940 }
4941 mutex_destroy(&rcas->node_lock);
4942 kmem_cache_free(hca->server_side_cache, rcas);
4943 rcas = avl_first(&hca->avl_tree);
4944 if (rcas != NULL)
4945 avl_remove(&hca->avl_tree, rcas);
4946 }
4947 rw_exit(&hca->avl_rw_lock);
4948 }
4949
4950 static void
4951 rib_server_side_cache_cleanup(void *argp)
4952 {
4953 cache_avl_struct_t *rcas;
4954 rib_lrc_entry_t *rb;
4955 rib_hca_t *hca = (rib_hca_t *)argp;
4956
4957 mutex_enter(&hca->cache_allocation_lock);
4958 if (hca->cache_allocation < cache_limit) {
4959 mutex_exit(&hca->cache_allocation_lock);
4960 return;
4961 }
4962 mutex_exit(&hca->cache_allocation_lock);
4963
4964 rw_enter(&hca->avl_rw_lock, RW_WRITER);
4965 rcas = avl_last(&hca->avl_tree);
4966 if (rcas != NULL)
4967 avl_remove(&hca->avl_tree, rcas);
4968
4969 while (rcas != NULL) {
4970 while (rcas->r.forw != &rcas->r) {
4971 rcas->elements--;
4972 rb = rcas->r.forw;
4973 remque(rb);
4974 if (rb->registered)
4975 (void) rib_deregistermem_via_hca(hca,
4976 rb->lrc_buf, rb->lrc_mhandle);
4977
4978
hca->cache_allocation -= rb->lrc_len; 4979 4980 kmem_free(rb->lrc_buf, rb->lrc_len); 4981 kmem_free(rb, sizeof (rib_lrc_entry_t)); 4982 } 4983 mutex_destroy(&rcas->node_lock); 4984 if (hca->server_side_cache) { 4985 kmem_cache_free(hca->server_side_cache, rcas); 4986 } 4987 4988 if (hca->cache_allocation < cache_limit) { 4989 rw_exit(&hca->avl_rw_lock); 4990 return; 4991 } 4992 4993 rcas = avl_last(&hca->avl_tree); 4994 if (rcas != NULL) 4995 avl_remove(&hca->avl_tree, rcas); 4996 } 4997 rw_exit(&hca->avl_rw_lock); 4998 } 4999 5000 static int 5001 avl_compare(const void *t1, const void *t2) 5002 { 5003 if (((cache_avl_struct_t *)t1)->len == ((cache_avl_struct_t *)t2)->len) 5004 return (0); 5005 5006 if (((cache_avl_struct_t *)t1)->len < ((cache_avl_struct_t *)t2)->len) 5007 return (-1); 5008 5009 return (1); 5010 } 5011 5012 static void 5013 rib_destroy_cache(rib_hca_t *hca) 5014 { 5015 if (hca->avl_init) { 5016 rib_server_side_cache_reclaim((void *)hca); 5017 if (hca->server_side_cache) { 5018 kmem_cache_destroy(hca->server_side_cache); 5019 hca->server_side_cache = NULL; 5020 } 5021 avl_destroy(&hca->avl_tree); 5022 mutex_destroy(&hca->cache_allocation_lock); 5023 rw_destroy(&hca->avl_rw_lock); 5024 } 5025 hca->avl_init = FALSE; 5026 } 5027 5028 static void 5029 rib_force_cleanup(void *hca) 5030 { 5031 if (((rib_hca_t *)hca)->cleanup_helper != NULL) 5032 (void) ddi_taskq_dispatch( 5033 ((rib_hca_t *)hca)->cleanup_helper, 5034 rib_server_side_cache_cleanup, 5035 (void *)hca, DDI_NOSLEEP); 5036 } 5037 5038 static rib_lrc_entry_t * 5039 rib_get_cache_buf(CONN *conn, uint32_t len) 5040 { 5041 cache_avl_struct_t cas, *rcas; 5042 rib_hca_t *hca = (ctoqp(conn))->hca; 5043 rib_lrc_entry_t *reply_buf; 5044 avl_index_t where = NULL; 5045 uint64_t c_alloc = 0; 5046 5047 if (!hca->avl_init) 5048 goto error_alloc; 5049 5050 cas.len = len; 5051 5052 rw_enter(&hca->avl_rw_lock, RW_READER); 5053 5054 mutex_enter(&hca->cache_allocation_lock); 5055 c_alloc = hca->cache_allocation; 5056 mutex_exit(&hca->cache_allocation_lock); 5057 5058 if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas, 5059 &where)) == NULL) { 5060 /* Am I above the cache limit */ 5061 if ((c_alloc + len) >= cache_limit) { 5062 rib_force_cleanup((void *)hca); 5063 rw_exit(&hca->avl_rw_lock); 5064 mutex_enter(&hca->cache_allocation_lock); 5065 hca->cache_misses_above_the_limit ++; 5066 mutex_exit(&hca->cache_allocation_lock); 5067 5068 /* Allocate and register the buffer directly */ 5069 goto error_alloc; 5070 } 5071 5072 rw_exit(&hca->avl_rw_lock); 5073 rw_enter(&hca->avl_rw_lock, RW_WRITER); 5074 5075 /* Recheck to make sure no other thread added the entry in */ 5076 if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, 5077 &cas, &where)) == NULL) { 5078 /* Allocate an avl tree entry */ 5079 rcas = (cache_avl_struct_t *) 5080 kmem_cache_alloc(hca->server_side_cache, KM_SLEEP); 5081 5082 bzero(rcas, sizeof (cache_avl_struct_t)); 5083 rcas->elements = 0; 5084 rcas->r.forw = &rcas->r; 5085 rcas->r.back = &rcas->r; 5086 rcas->len = len; 5087 mutex_init(&rcas->node_lock, NULL, MUTEX_DEFAULT, NULL); 5088 avl_insert(&hca->avl_tree, rcas, where); 5089 } 5090 } 5091 5092 mutex_enter(&rcas->node_lock); 5093 5094 if (rcas->r.forw != &rcas->r && rcas->elements > 0) { 5095 reply_buf = rcas->r.forw; 5096 remque(reply_buf); 5097 rcas->elements--; 5098 mutex_exit(&rcas->node_lock); 5099 rw_exit(&hca->avl_rw_lock); 5100 5101 mutex_enter(&hca->cache_allocation_lock); 5102 hca->cache_hits++; 5103 hca->cache_allocation -= len; 5104 
mutex_exit(&hca->cache_allocation_lock);
5105 } else {
5106 /* Am I above the cache limit */
5107 mutex_exit(&rcas->node_lock);
5108 if ((c_alloc + len) >= cache_limit) {
5109 rib_force_cleanup((void *)hca);
5110 rw_exit(&hca->avl_rw_lock);
5111
5112 mutex_enter(&hca->cache_allocation_lock);
5113 hca->cache_misses_above_the_limit++;
5114 mutex_exit(&hca->cache_allocation_lock);
5115 /* Allocate and register the buffer directly */
5116 goto error_alloc;
5117 }
5118 rw_exit(&hca->avl_rw_lock);
5119 mutex_enter(&hca->cache_allocation_lock);
5120 hca->cache_misses++;
5121 mutex_exit(&hca->cache_allocation_lock);
5122 /* Allocate a reply_buf entry */
5123 reply_buf = (rib_lrc_entry_t *)
5124 kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
5125 bzero(reply_buf, sizeof (rib_lrc_entry_t));
5126 reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
5127 reply_buf->lrc_len = len;
5128 reply_buf->registered = FALSE;
5129 reply_buf->avl_node = (void *)rcas;
5130 }
5131
5132 return (reply_buf);
5133
5134 error_alloc:
5135 reply_buf = (rib_lrc_entry_t *)
5136 kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
5137 bzero(reply_buf, sizeof (rib_lrc_entry_t));
5138 reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
5139 reply_buf->lrc_len = len;
5140 reply_buf->registered = FALSE;
5141 reply_buf->avl_node = NULL;
5142
5143 return (reply_buf);
5144 }
5145
5146 /*
5147 * Return a pre-registered buffer back to the cache (without
5148 * deregistering it).
5149 */
5150
5151 static void
5152 rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf)
5153 {
5154 cache_avl_struct_t cas, *rcas;
5155 avl_index_t where = NULL;
5156 rib_hca_t *hca = (ctoqp(conn))->hca;
5157
5158 if (!hca->avl_init)
5159 goto error_free;
5160
5161 cas.len = reg_buf->lrc_len;
5162 rw_enter(&hca->avl_rw_lock, RW_READER);
5163 if ((rcas = (cache_avl_struct_t *)
5164 avl_find(&hca->avl_tree, &cas, &where)) == NULL) {
5165 rw_exit(&hca->avl_rw_lock);
5166 goto error_free;
5167 } else {
5168 cas.len = reg_buf->lrc_len;
5169 mutex_enter(&rcas->node_lock);
5170 insque(reg_buf, &rcas->r);
5171 rcas->elements++;
5172 mutex_exit(&rcas->node_lock);
5173 rw_exit(&hca->avl_rw_lock);
5174 mutex_enter(&hca->cache_allocation_lock);
5175 hca->cache_allocation += cas.len;
5176 mutex_exit(&hca->cache_allocation_lock);
5177 }
5178
5179 return;
5180
5181 error_free:
5182
5183 if (reg_buf->registered)
5184 (void) rib_deregistermem_via_hca(hca,
5185 reg_buf->lrc_buf, reg_buf->lrc_mhandle);
5186 kmem_free(reg_buf->lrc_buf, reg_buf->lrc_len);
5187 kmem_free(reg_buf, sizeof (rib_lrc_entry_t));
5188 }
5189
5190 static rdma_stat
5191 rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf,
5192 uint_t buflen, struct mrc *buf_handle)
5193 {
5194 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */
5195 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */
5196 rdma_stat status;
5197
5198
5199 /*
5200 * Note: ALL buffer pools use the same memory type RDMARW.
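* On success the handle returned to the caller carries the
* ibt_mr_hdl_t in mrc_linfo (used later for deregistration) and
* the local and remote keys in mrc_lmr and mrc_rmr.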
5201 */ 5202 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc); 5203 if (status == RDMA_SUCCESS) { 5204 buf_handle->mrc_linfo = (uint64_t)(uintptr_t)mr_hdl; 5205 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey; 5206 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey; 5207 } else { 5208 buf_handle->mrc_linfo = NULL; 5209 buf_handle->mrc_lmr = 0; 5210 buf_handle->mrc_rmr = 0; 5211 } 5212 return (status); 5213 } 5214 5215 /* ARGSUSED */ 5216 static rdma_stat 5217 rib_deregistermemsync_via_hca(rib_hca_t *hca, caddr_t buf, 5218 struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle) 5219 { 5220 5221 (void) rib_deregistermem_via_hca(hca, buf, buf_handle); 5222 return (RDMA_SUCCESS); 5223 } 5224 5225 /* ARGSUSED */ 5226 static rdma_stat 5227 rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle) 5228 { 5229 5230 (void) ibt_deregister_mr(hca->hca_hdl, 5231 (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo); 5232 return (RDMA_SUCCESS); 5233 } 5234 5235 /* 5236 * Check if the IP interface named by `lifrp' is RDMA-capable. 5237 */ 5238 static boolean_t 5239 rpcib_rdma_capable_interface(struct lifreq *lifrp) 5240 { 5241 char ifname[LIFNAMSIZ]; 5242 char *cp; 5243 5244 if (lifrp->lifr_type == IFT_IB) 5245 return (B_TRUE); 5246 5247 /* 5248 * Strip off the logical interface portion before getting 5249 * intimate with the name. 5250 */ 5251 (void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ); 5252 if ((cp = strchr(ifname, ':')) != NULL) 5253 *cp = '\0'; 5254 5255 return (strcmp("lo0", ifname) == 0); 5256 } 5257 5258 static int 5259 rpcib_do_ip_ioctl(int cmd, int len, void *arg) 5260 { 5261 vnode_t *kkvp, *vp; 5262 TIUSER *tiptr; 5263 struct strioctl iocb; 5264 k_sigset_t smask; 5265 int err = 0; 5266 5267 if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kkvp) == 0) { 5268 if (t_kopen(NULL, kkvp->v_rdev, FREAD|FWRITE, 5269 &tiptr, CRED()) == 0) { 5270 vp = tiptr->fp->f_vnode; 5271 } else { 5272 VN_RELE(kkvp); 5273 return (EPROTO); 5274 } 5275 } else { 5276 return (EPROTO); 5277 } 5278 5279 iocb.ic_cmd = cmd; 5280 iocb.ic_timout = 0; 5281 iocb.ic_len = len; 5282 iocb.ic_dp = (caddr_t)arg; 5283 sigintr(&smask, 0); 5284 err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb); 5285 sigunintr(&smask); 5286 (void) t_kclose(tiptr, 0); 5287 VN_RELE(kkvp); 5288 return (err); 5289 } 5290 5291 /* 5292 * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'. 5293 * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes. 5294 */ 5295 static int 5296 rpcib_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep) 5297 { 5298 int err; 5299 struct lifnum lifn; 5300 5301 bzero(&lifn, sizeof (struct lifnum)); 5302 lifn.lifn_family = AF_UNSPEC; 5303 5304 err = rpcib_do_ip_ioctl(SIOCGLIFNUM, sizeof (struct lifnum), &lifn); 5305 if (err != 0) 5306 return (err); 5307 5308 /* 5309 * Pad the interface count to account for additional interfaces that 5310 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF. 
5311 */ 5312 lifn.lifn_count += 4; 5313 5314 bzero(lifcp, sizeof (struct lifconf)); 5315 lifcp->lifc_family = AF_UNSPEC; 5316 lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq); 5317 lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP); 5318 5319 err = rpcib_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp); 5320 if (err != 0) { 5321 kmem_free(lifcp->lifc_buf, *bufsizep); 5322 return (err); 5323 } 5324 return (0); 5325 } 5326 5327 static boolean_t 5328 rpcib_get_ib_addresses(rpcib_ipaddrs_t *addrs4, rpcib_ipaddrs_t *addrs6) 5329 { 5330 uint_t i, nifs; 5331 uint_t bufsize; 5332 struct lifconf lifc; 5333 struct lifreq *lifrp; 5334 struct sockaddr_in *sinp; 5335 struct sockaddr_in6 *sin6p; 5336 5337 bzero(addrs4, sizeof (rpcib_ipaddrs_t)); 5338 bzero(addrs6, sizeof (rpcib_ipaddrs_t)); 5339 5340 if (rpcib_do_lifconf(&lifc, &bufsize) != 0) 5341 return (B_FALSE); 5342 5343 if ((nifs = lifc.lifc_len / sizeof (struct lifreq)) == 0) { 5344 kmem_free(lifc.lifc_buf, bufsize); 5345 return (B_FALSE); 5346 } 5347 5348 /* 5349 * Worst case is that all of the addresses are IB-capable and have 5350 * the same address family, so size our buffers accordingly. 5351 */ 5352 addrs4->ri_size = nifs * sizeof (struct sockaddr_in); 5353 addrs4->ri_list = kmem_zalloc(addrs4->ri_size, KM_SLEEP); 5354 addrs6->ri_size = nifs * sizeof (struct sockaddr_in6); 5355 addrs6->ri_list = kmem_zalloc(addrs6->ri_size, KM_SLEEP); 5356 5357 for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) { 5358 if (!rpcib_rdma_capable_interface(lifrp)) 5359 continue; 5360 5361 if (lifrp->lifr_addr.ss_family == AF_INET) { 5362 sinp = addrs4->ri_list; 5363 bcopy(&lifrp->lifr_addr, &sinp[addrs4->ri_count++], 5364 sizeof (struct sockaddr_in)); 5365 } else if (lifrp->lifr_addr.ss_family == AF_INET6) { 5366 sin6p = addrs6->ri_list; 5367 bcopy(&lifrp->lifr_addr, &sin6p[addrs6->ri_count++], 5368 sizeof (struct sockaddr_in6)); 5369 } 5370 } 5371 5372 kmem_free(lifc.lifc_buf, bufsize); 5373 return (B_TRUE); 5374 } 5375 5376 /* ARGSUSED */ 5377 static int 5378 rpcib_cache_kstat_update(kstat_t *ksp, int rw) 5379 { 5380 rib_hca_t *hca; 5381 5382 if (KSTAT_WRITE == rw) { 5383 return (EACCES); 5384 } 5385 5386 rpcib_kstat.cache_limit.value.ui64 = 5387 (uint64_t)cache_limit; 5388 rw_enter(&rib_stat->hcas_list_lock, RW_READER); 5389 for (hca = rib_stat->hcas_list; hca; hca = hca->next) { 5390 rpcib_kstat.cache_allocation.value.ui64 += 5391 (uint64_t)hca->cache_allocation; 5392 rpcib_kstat.cache_hits.value.ui64 += 5393 (uint64_t)hca->cache_hits; 5394 rpcib_kstat.cache_misses.value.ui64 += 5395 (uint64_t)hca->cache_misses; 5396 rpcib_kstat.cache_misses_above_the_limit.value.ui64 += 5397 (uint64_t)hca->cache_misses_above_the_limit; 5398 } 5399 rw_exit(&rib_stat->hcas_list_lock); 5400 return (0); 5401 }
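
/*
* The cache counters aggregated above are exported through the
* "rpcib_cache" kstat ("unix" module, instance 0, global zone), as
* seen in rib_stop_hca_services(). A minimal user-space reader is
* sketched below for illustration only; it is not part of this
* driver, it assumes it runs in the global zone, and it must be
* linked with -lkstat. "cache_hits" is just one of the counters
* updated by rpcib_cache_kstat_update().
*
*	#include <kstat.h>
*	#include <stdio.h>
*
*	int
*	main(void)
*	{
*		kstat_ctl_t *kc;
*		kstat_t *ksp;
*		kstat_named_t *kn;
*
*		if ((kc = kstat_open()) == NULL)
*			return (1);
*		ksp = kstat_lookup(kc, "unix", 0, "rpcib_cache");
*		if (ksp != NULL && kstat_read(kc, ksp, NULL) != -1) {
*			kn = kstat_data_lookup(ksp, "cache_hits");
*			if (kn != NULL)
*				(void) printf("cache_hits = %llu\n",
*				    (u_longlong_t)kn->value.ui64);
*		}
*		(void) kstat_close(kc);
*		return (0);
*	}
*/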