1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
24 */
25
26 /*
27 * Copyright (c) 2007, The Ohio State University. All rights reserved.
28 *
29  * Portions of this source code are developed by the team members of
30 * The Ohio State University's Network-Based Computing Laboratory (NBCL),
31 * headed by Professor Dhabaleswar K. (DK) Panda.
32 *
33  * Acknowledgements for contributions from developers:
34 * Ranjit Noronha: noronha@cse.ohio-state.edu
35 * Lei Chai : chail@cse.ohio-state.edu
36 * Weikuan Yu : yuw@cse.ohio-state.edu
37 *
38 */
39
40 /*
41 * The rpcib plugin. Implements the interface for RDMATF's
42 * interaction with IBTF.
43 */
44
45 #include <sys/param.h>
46 #include <sys/types.h>
47 #include <sys/user.h>
48 #include <sys/systm.h>
49 #include <sys/sysmacros.h>
50 #include <sys/proc.h>
51 #include <sys/socket.h>
52 #include <sys/file.h>
53 #include <sys/stream.h>
54 #include <sys/strsubr.h>
55 #include <sys/stropts.h>
56 #include <sys/errno.h>
57 #include <sys/kmem.h>
58 #include <sys/debug.h>
59 #include <sys/pathname.h>
60 #include <sys/kstat.h>
61 #include <sys/t_lock.h>
62 #include <sys/ddi.h>
63 #include <sys/cmn_err.h>
64 #include <sys/time.h>
65 #include <sys/isa_defs.h>
66 #include <sys/callb.h>
67 #include <sys/sunddi.h>
68 #include <sys/sunndi.h>
69 #include <sys/sdt.h>
70 #include <sys/ib/ibtl/ibti.h>
71 #include <rpc/rpc.h>
72 #include <rpc/ib.h>
73 #include <sys/modctl.h>
74 #include <sys/kstr.h>
75 #include <sys/sockio.h>
76 #include <sys/vnode.h>
77 #include <sys/tiuser.h>
78 #include <net/if.h>
79 #include <net/if_types.h>
80 #include <sys/cred.h>
81 #include <rpc/rpc_rdma.h>
82 #include <nfs/nfs.h>
83 #include <sys/atomic.h>
84
85 #define NFS_RDMA_PORT 20049
86
87
88 /*
89 * Convenience structures for connection management
90 */
91 typedef struct rpcib_ipaddrs {
92 void *ri_list; /* pointer to list of addresses */
93 uint_t ri_count; /* number of addresses in list */
94 uint_t ri_size; /* size of ri_list in bytes */
95 } rpcib_ipaddrs_t;
96
97
98 typedef struct rpcib_ping {
99 rib_hca_t *hca;
100 ibt_path_info_t path;
101 ibt_ip_addr_t srcip;
102 ibt_ip_addr_t dstip;
103 } rpcib_ping_t;
104
105 /*
106 * Prototype declarations for driver ops
107 */
108 static int rpcib_attach(dev_info_t *, ddi_attach_cmd_t);
109 static int rpcib_getinfo(dev_info_t *, ddi_info_cmd_t,
110 void *, void **);
111 static int rpcib_detach(dev_info_t *, ddi_detach_cmd_t);
112 static boolean_t rpcib_rdma_capable_interface(struct lifreq *);
113 static int rpcib_do_ip_ioctl(int, int, void *);
114 static boolean_t rpcib_get_ib_addresses(rpcib_ipaddrs_t *, rpcib_ipaddrs_t *);
115 static int rpcib_cache_kstat_update(kstat_t *, int);
116 static void rib_force_cleanup(void *);
117 static void rib_stop_hca_services(rib_hca_t *);
118 static void rib_attach_hca(void);
119 static int rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr,
120 struct netbuf *d_svcaddr, CONN **conn);
121
122 struct {
123 kstat_named_t cache_limit;
124 kstat_named_t cache_allocation;
125 kstat_named_t cache_hits;
126 kstat_named_t cache_misses;
127 kstat_named_t cache_misses_above_the_limit;
128 } rpcib_kstat = {
129 {"cache_limit", KSTAT_DATA_UINT64 },
130 {"cache_allocation", KSTAT_DATA_UINT64 },
131 {"cache_hits", KSTAT_DATA_UINT64 },
132 {"cache_misses", KSTAT_DATA_UINT64 },
133 {"cache_misses_above_the_limit", KSTAT_DATA_UINT64 },
134 };
135
136 /* rpcib cb_ops */
137 static struct cb_ops rpcib_cbops = {
138 nulldev, /* open */
139 nulldev, /* close */
140 nodev, /* strategy */
141 nodev, /* print */
142 nodev, /* dump */
143 nodev, /* read */
144 nodev, /* write */
145 nodev, /* ioctl */
146 nodev, /* devmap */
147 nodev, /* mmap */
148 nodev, /* segmap */
149 nochpoll, /* poll */
150 ddi_prop_op, /* prop_op */
151 NULL, /* stream */
152 D_MP, /* cb_flag */
153 CB_REV, /* rev */
154 nodev, /* int (*cb_aread)() */
155 nodev /* int (*cb_awrite)() */
156 };
157
158 /*
159  * Device operations
160 */
161 static struct dev_ops rpcib_ops = {
162 DEVO_REV, /* devo_rev, */
163 0, /* refcnt */
164 rpcib_getinfo, /* info */
165 nulldev, /* identify */
166 nulldev, /* probe */
167 rpcib_attach, /* attach */
168 rpcib_detach, /* detach */
169 nodev, /* reset */
170 &rpcib_cbops, /* driver ops - devctl interfaces */
171 NULL, /* bus operations */
172 NULL, /* power */
173 ddi_quiesce_not_needed, /* quiesce */
174 };
175
176 /*
177 * Module linkage information.
178 */
179
180 static struct modldrv rib_modldrv = {
181 &mod_driverops, /* Driver module */
182 "RPCIB plugin driver", /* Driver name and version */
183 &rpcib_ops, /* Driver ops */
184 };
185
186 static struct modlinkage rib_modlinkage = {
187 MODREV_1,
188 { (void *)&rib_modldrv, NULL }
189 };
190
191 typedef struct rib_lrc_entry {
192 struct rib_lrc_entry *forw;
193 struct rib_lrc_entry *back;
194 char *lrc_buf;
195
196 uint32_t lrc_len;
197 void *avl_node;
198 bool_t registered;
199
200 struct mrc lrc_mhandle;
201 bool_t lrc_on_freed_list;
202 } rib_lrc_entry_t;
203
204 typedef struct cache_struct {
205 rib_lrc_entry_t r;
206 uint32_t len;
207 uint32_t elements;
208 kmutex_t node_lock;
209 avl_node_t avl_link;
210 } cache_avl_struct_t;
211
212 uint64_t cache_limit = 100 * 1024 * 1024;
213 static uint64_t cache_watermark = 80 * 1024 * 1024;
214 static bool_t stats_enabled = FALSE;
215
216 static uint64_t max_unsignaled_rws = 5;
217 int nfs_rdma_port = NFS_RDMA_PORT;
218
219 #define RIBNETID_TCP "tcp"
220 #define RIBNETID_TCP6 "tcp6"
221
222 /*
223 * rib_stat: private data pointer used when registering
224 * with the IBTF. It is returned to the consumer
225 * in all callbacks.
226 */
227 static rpcib_state_t *rib_stat = NULL;
228
229 #define RNR_RETRIES IBT_RNR_RETRY_1
230 #define MAX_PORTS 2
231 #define RDMA_DUMMY_WRID 0x4D3A1D4D3A1D
232 #define RDMA_CONN_REAP_RETRY 10 /* 10 secs */
233
234 int preposted_rbufs = RDMA_BUFS_GRANT;
235 int send_threshold = 1;
236
237 /*
238  * Old cards with the Tavor driver have a limited memory footprint
239  * when booted in 32-bit mode. The rib_max_rbufs tunable can be
240  * increased if more buffers are needed.
241 */
242
243 #if !defined(_ELF64) && !defined(__sparc)
244 int rib_max_rbufs = MAX_BUFS;
245 #else
246 int rib_max_rbufs = 10 * MAX_BUFS;
247 #endif /* !(_ELF64) && !(__sparc) */
248
249 int rib_conn_timeout = 60 * 12; /* 12 minutes */
250
251 /*
252 * State of the plugin.
253 * ACCEPT = accepting new connections and requests.
254  * NO_ACCEPT = not accepting new connections and requests.
255  * This should eventually move to the rpcib_state_t structure, since it
256  * indicates the state of the plugin for a particular type of service,
257  * such as NFS, NLM or the v4 callback daemon. The plugin might be in accept
258  * state for one and in no_accept state for another.
259 */
260 int plugin_state;
261 kmutex_t plugin_state_lock;
262
263 ldi_ident_t rpcib_li;
264
265 /*
266 * RPCIB RDMATF operations
267 */
268 static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle);
269 static rdma_stat rib_disconnect(CONN *conn);
270 static void rib_listen(struct rdma_svc_data *rd);
271 static void rib_listen_stop(struct rdma_svc_data *rd);
272 static rdma_stat rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf,
273 uint_t buflen, struct mrc *buf_handle);
274 static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf,
275 struct mrc buf_handle);
276 static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp,
277 caddr_t buf, uint_t buflen, struct mrc *buf_handle);
278 static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf,
279 struct mrc buf_handle);
280 static rdma_stat rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf,
281 uint_t buflen, struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle,
282 void *lrc);
283 static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf,
284 struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *);
285 static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle,
286 caddr_t buf, int len, int cpu);
287
288 static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf);
289
290 static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf);
291 static void *rib_rbuf_alloc(CONN *, rdma_buf_t *);
292
293 static void rib_rbuf_free(CONN *conn, int ptype, void *buf);
294
295 static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid);
296 static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid);
297 static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid);
298 static rdma_stat rib_post_resp_remove(CONN *conn, uint32_t msgid);
299 static rdma_stat rib_post_recv(CONN *conn, struct clist *cl);
300 static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid);
301 static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait);
302 static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait);
303 static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rpcib_ping_t *);
304 static rdma_stat rib_conn_get(struct netbuf *, struct netbuf *,
305 int addr_type, void *, CONN **);
306 static rdma_stat rib_conn_release(CONN *conn);
307 static rdma_stat rib_connect(struct netbuf *, struct netbuf *, int,
308 rpcib_ping_t *, CONN **);
309 static rdma_stat rib_getinfo(rdma_info_t *info);
310
311 static rib_lrc_entry_t *rib_get_cache_buf(CONN *conn, uint32_t len);
312 static void rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *buf);
313 static void rib_destroy_cache(rib_hca_t *hca);
314 static void rib_server_side_cache_reclaim(void *argp);
315 static int avl_compare(const void *t1, const void *t2);
316
317 static void rib_stop_services(rib_hca_t *);
318 static void rib_close_channels(rib_conn_list_t *);
319 static void rib_conn_close(void *);
320 static void rib_recv_rele(rib_qp_t *);
321 static rdma_stat rib_conn_release_locked(CONN *conn);
322
323 /*
324 * RPCIB addressing operations
325 */
326
327 /*
328 * RDMA operations the RPCIB module exports
329 */
330 static rdmaops_t rib_ops = {
331 rib_reachable,
332 rib_conn_get,
333 rib_conn_release,
334 rib_listen,
335 rib_listen_stop,
336 rib_registermem,
337 rib_deregistermem,
338 rib_registermemsync,
339 rib_deregistermemsync,
340 rib_syncmem,
341 rib_reg_buf_alloc,
342 rib_reg_buf_free,
343 rib_send,
344 rib_send_resp,
345 rib_post_resp,
346 rib_post_resp_remove,
347 rib_post_recv,
348 rib_recv,
349 rib_read,
350 rib_write,
351 rib_getinfo,
352 };
353
354 /*
355 * RDMATF RPCIB plugin details
356 */
357 static rdma_mod_t rib_mod = {
358 "ibtf", /* api name */
359 RDMATF_VERS_1,
360 0,
361 &rib_ops, /* rdma op vector for ibtf */
362 };
363
364 static rdma_stat rpcib_open_hcas(rpcib_state_t *);
365 static rdma_stat rib_qp_init(rib_qp_t *, int);
366 static void rib_svc_scq_handler(ibt_cq_hdl_t, void *);
367 static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *);
368 static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *);
369 static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *);
370 static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num);
371 static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t adsp, caddr_t, uint_t,
372 ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_mr_desc_t *);
373 static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t,
374 ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t);
375 static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, rpcib_ping_t *);
376 static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *,
377 rib_qp_t **);
378 static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t,
379 rib_qp_t **);
380 static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *);
381 static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *);
382 static int rib_free_sendwait(struct send_wid *);
383 static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid);
384 static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd);
385 static void rdma_done_rem_list(rib_qp_t *);
386 static void rdma_done_notify(rib_qp_t *qp, uint32_t xid);
387
388 static void rib_async_handler(void *,
389 ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *);
390 static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *);
391 static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *);
392 static int rib_free_svc_recv(struct svc_recv *);
393 static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t);
394 static void rib_free_wid(struct recv_wid *);
395 static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *);
396 static void rib_detach_hca(ibt_hca_hdl_t);
397 static void rib_close_a_channel(CONN *);
398 static void rib_send_hold(rib_qp_t *);
399 static void rib_send_rele(rib_qp_t *);
400
401 /*
402 * Registration with IBTF as a consumer
403 */
404 static struct ibt_clnt_modinfo_s rib_modinfo = {
405 IBTI_V_CURR,
406 IBT_GENERIC,
407 rib_async_handler, /* async event handler */
408 NULL, /* Memory Region Handler */
409 "nfs/ib"
410 };
411
412 /*
413  * Global structure
414 */
415
416 typedef struct rpcib_s {
417 dev_info_t *rpcib_dip;
418 kmutex_t rpcib_mutex;
419 } rpcib_t;
420
421 rpcib_t rpcib;
422
423 /*
424  * /etc/system tunable that controls debugging
425  * in the rpcib kernel module.
426  * Set it to a value greater than 1 to increase
427  * the amount of debugging output.
428 */
429 int rib_debug = 0;
430
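/*
 * Loadable module entry points
 */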
431 int
432 _init(void)
433 {
434 int error;
435
436 error = mod_install((struct modlinkage *)&rib_modlinkage);
437 if (error != 0) {
438 /*
439 * Could not load module
440 */
441 return (error);
442 }
443 mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL);
444 return (0);
445 }
446
447 int
448 _fini()
449 {
450 int status;
451
452 /*
453 * Remove module
454 */
455 if ((status = mod_remove(&rib_modlinkage)) != 0) {
456 return (status);
457 }
458 mutex_destroy(&plugin_state_lock);
459 return (0);
460 }
461
462 int
463 _info(struct modinfo *modinfop)
464 {
465 return (mod_info(&rib_modlinkage, modinfop));
466 }
467
468 /*
469 * rpcib_getinfo()
470 * Given the device number, return the devinfo pointer or the
471 * instance number.
472 * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach.
473 */
474
475 /*ARGSUSED*/
476 static int
477 rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
478 {
479 int ret = DDI_SUCCESS;
480
481 switch (cmd) {
482 case DDI_INFO_DEVT2DEVINFO:
483 if (rpcib.rpcib_dip != NULL)
484 *result = rpcib.rpcib_dip;
485 else {
486 *result = NULL;
487 ret = DDI_FAILURE;
488 }
489 break;
490
491 case DDI_INFO_DEVT2INSTANCE:
492 *result = NULL;
493 break;
494
495 default:
496 ret = DDI_FAILURE;
497 }
498 return (ret);
499 }
500
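/*
 * Remove all HCAs from the global list, mark each one detached,
 * stop its services and free its resources.
 */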
501 static void
502 rpcib_free_hca_list()
503 {
504 rib_hca_t *hca, *hcap;
505
506 rw_enter(&rib_stat->hcas_list_lock, RW_WRITER);
507 hca = rib_stat->hcas_list;
508 rib_stat->hcas_list = NULL;
509 rw_exit(&rib_stat->hcas_list_lock);
510 while (hca != NULL) {
511 rw_enter(&hca->state_lock, RW_WRITER);
512 hcap = hca;
513 hca = hca->next;
514 rib_stat->nhca_inited--;
515 rib_mod.rdma_count--;
516 hcap->state = HCA_DETACHED;
517 rw_exit(&hcap->state_lock);
518 rib_stop_hca_services(hcap);
519
520 kmem_free(hcap, sizeof (*hcap));
521 }
522 }
523
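/*
 * Unbind and deregister every service on the global service list,
 * freeing the list entries as they are removed.
 */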
524 static rdma_stat
525 rpcib_free_service_list()
526 {
527 rib_service_t *service;
528 ibt_status_t ret;
529
530 rw_enter(&rib_stat->service_list_lock, RW_WRITER);
531 while (rib_stat->service_list != NULL) {
532 service = rib_stat->service_list;
533 ret = ibt_unbind_all_services(service->srv_hdl);
534 if (ret != IBT_SUCCESS) {
535 rw_exit(&rib_stat->service_list_lock);
536 #ifdef DEBUG
537 cmn_err(CE_NOTE, "rpcib_free_service_list: "
538 "ibt_unbind_all_services failed (%d)\n", (int)ret);
539 #endif
540 return (RDMA_FAILED);
541 }
542 ret = ibt_deregister_service(rib_stat->ibt_clnt_hdl,
543 service->srv_hdl);
544 if (ret != IBT_SUCCESS) {
545 rw_exit(&rib_stat->service_list_lock);
546 #ifdef DEBUG
547 cmn_err(CE_NOTE, "rpcib_free_service_list: "
548 "ibt_deregister_service failed (%d)\n", (int)ret);
549 #endif
550 return (RDMA_FAILED);
551 }
552 rib_stat->service_list = service->next;
553 kmem_free(service, sizeof (rib_service_t));
554 }
555 rw_exit(&rib_stat->service_list_lock);
556
557 return (RDMA_SUCCESS);
558 }
559
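/*
 * Driver attach: create the minor node, allocate the global state,
 * attach to the IBTF, open the available HCAs and register the
 * plugin with the RDMATF.
 */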
560 static int
561 rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
562 {
563 ibt_status_t ibt_status;
564 rdma_stat r_status;
565
566 switch (cmd) {
567 case DDI_ATTACH:
568 break;
569 case DDI_RESUME:
570 return (DDI_SUCCESS);
571 default:
572 return (DDI_FAILURE);
573 }
574
575 mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL);
576
577 mutex_enter(&rpcib.rpcib_mutex);
578 if (rpcib.rpcib_dip != NULL) {
579 mutex_exit(&rpcib.rpcib_mutex);
580 return (DDI_FAILURE);
581 }
582 rpcib.rpcib_dip = dip;
583 mutex_exit(&rpcib.rpcib_mutex);
584 /*
585 * Create the "rpcib" minor-node.
586 */
587 if (ddi_create_minor_node(dip,
588 "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) {
589 		/* No cmn_err() here, as such messages print on the console */
590 return (DDI_FAILURE);
591 }
592
593 if (rib_stat == NULL) {
594 rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP);
595 mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL);
596 rw_init(&rib_stat->hcas_list_lock, NULL, RW_DRIVER, NULL);
597 mutex_init(&rib_stat->listen_lock, NULL, MUTEX_DRIVER, NULL);
598 }
599
600 rib_stat->hca_count = ibt_get_hca_list(NULL);
601 if (rib_stat->hca_count < 1) {
602 mutex_destroy(&rib_stat->listen_lock);
603 rw_destroy(&rib_stat->hcas_list_lock);
604 mutex_destroy(&rib_stat->open_hca_lock);
605 kmem_free(rib_stat, sizeof (*rib_stat));
606 rib_stat = NULL;
607 return (DDI_FAILURE);
608 }
609
610 ibt_status = ibt_attach(&rib_modinfo, dip,
611 (void *)rib_stat, &rib_stat->ibt_clnt_hdl);
612
613 if (ibt_status != IBT_SUCCESS) {
614 mutex_destroy(&rib_stat->listen_lock);
615 rw_destroy(&rib_stat->hcas_list_lock);
616 mutex_destroy(&rib_stat->open_hca_lock);
617 kmem_free(rib_stat, sizeof (*rib_stat));
618 rib_stat = NULL;
619 return (DDI_FAILURE);
620 }
621
622 rib_stat->service_list = NULL;
623 rw_init(&rib_stat->service_list_lock, NULL, RW_DRIVER, NULL);
624 mutex_enter(&rib_stat->open_hca_lock);
625 if (rpcib_open_hcas(rib_stat) != RDMA_SUCCESS) {
626 mutex_exit(&rib_stat->open_hca_lock);
627 goto open_fail;
628 }
629 mutex_exit(&rib_stat->open_hca_lock);
630
631 if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) !=
632 DDI_PROP_SUCCESS) {
633 cmn_err(CE_WARN, "rpcib_attach: ddi-no-autodetach prop update "
634 "failed.");
635 goto register_fail;
636 }
637
638 /*
639 * Register with rdmatf
640 */
641 r_status = rdma_register_mod(&rib_mod);
642 if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) {
643 cmn_err(CE_WARN, "rpcib_attach:rdma_register_mod failed, "
644 "status = %d", r_status);
645 goto register_fail;
646 }
647
648 return (DDI_SUCCESS);
649
650 register_fail:
651
652 open_fail:
653 (void) ibt_detach(rib_stat->ibt_clnt_hdl);
654 rpcib_free_hca_list();
655 (void) rpcib_free_service_list();
656 mutex_destroy(&rib_stat->listen_lock);
657 rw_destroy(&rib_stat->hcas_list_lock);
658 mutex_destroy(&rib_stat->open_hca_lock);
659 rw_destroy(&rib_stat->service_list_lock);
660 kmem_free(rib_stat, sizeof (*rib_stat));
661 rib_stat = NULL;
662 return (DDI_FAILURE);
663 }
664
665 /*ARGSUSED*/
666 static int
667 rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
668 {
669 switch (cmd) {
670
671 case DDI_DETACH:
672 break;
673
674 case DDI_SUSPEND:
675 default:
676 return (DDI_FAILURE);
677 }
678
679 /*
680 * Detach the hca and free resources
681 */
682 mutex_enter(&plugin_state_lock);
683 plugin_state = NO_ACCEPT;
684 mutex_exit(&plugin_state_lock);
685
686 if (rpcib_free_service_list() != RDMA_SUCCESS)
687 return (DDI_FAILURE);
688 rpcib_free_hca_list();
689
690 (void) ibt_detach(rib_stat->ibt_clnt_hdl);
691 mutex_destroy(&rib_stat->listen_lock);
692 rw_destroy(&rib_stat->hcas_list_lock);
693 mutex_destroy(&rib_stat->open_hca_lock);
694 rw_destroy(&rib_stat->service_list_lock);
695
696 kmem_free(rib_stat, sizeof (*rib_stat));
697 rib_stat = NULL;
698
699 mutex_enter(&rpcib.rpcib_mutex);
700 rpcib.rpcib_dip = NULL;
701 mutex_exit(&rpcib.rpcib_mutex);
702 mutex_destroy(&rpcib.rpcib_mutex);
703 return (DDI_SUCCESS);
704 }
705
706
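/*
 * Forward declarations for buffer pool, reply list and
 * connection list helpers.
 */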
707 static void rib_rbufpool_free(rib_hca_t *, int);
708 static void rib_rbufpool_deregister(rib_hca_t *, int);
709 static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype);
710 static struct reply *rib_addreplylist(rib_qp_t *, uint32_t);
711 static rdma_stat rib_rem_replylist(rib_qp_t *);
712 static int rib_remreply(rib_qp_t *, struct reply *);
713 static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *);
714 static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *);
715
716
717 /*
718 * One CQ pair per HCA
719 */
720 static rdma_stat
721 rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler,
722 rib_cq_t **cqp)
723 {
724 rib_cq_t *cq;
725 ibt_cq_attr_t cq_attr;
726 uint32_t real_size;
727 ibt_status_t status;
728 rdma_stat error = RDMA_SUCCESS;
729
730 cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP);
731 cq->rib_hca = hca;
732 bzero(&cq_attr, sizeof (cq_attr));
733 cq_attr.cq_size = cq_size;
734 cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
735 status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl,
736 &real_size);
737 if (status != IBT_SUCCESS) {
738 cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed,"
739 " status=%d", status);
740 error = RDMA_FAILED;
741 goto fail;
742 }
743 ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, hca);
744
745 /*
746 	 * Enable CQ callbacks. CQ callbacks are single shot
747 	 * (i.e. you have to call ibt_enable_cq_notify()
748 	 * after each callback to arm the next one).
749 */
750 status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION);
751 if (status != IBT_SUCCESS) {
752 cmn_err(CE_WARN, "rib_create_cq: "
753 "enable_cq_notify failed, status %d", status);
754 error = RDMA_FAILED;
755 goto fail;
756 }
757 *cqp = cq;
758
759 return (error);
760 fail:
761 if (cq->rib_cq_hdl)
762 (void) ibt_free_cq(cq->rib_cq_hdl);
763 if (cq)
764 kmem_free(cq, sizeof (rib_cq_t));
765 return (error);
766 }
767
768 /*
769 * rpcib_find_hca
770 *
771  * Caller must hold the hcas_list_lock before calling
772  * this function.
773 */
774 static rib_hca_t *
775 rpcib_find_hca(rpcib_state_t *ribstat, ib_guid_t guid)
776 {
777 rib_hca_t *hca = ribstat->hcas_list;
778
779 while (hca && hca->hca_guid != guid)
780 hca = hca->next;
781
782 return (hca);
783 }
784
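/*
 * Open every HCA reported by ibt_get_hca_list() that is not already
 * on our list, and set up its PD, CQs, buffer pools, server-side
 * cache and locks for RDMA use. Returns RDMA_SUCCESS if at least
 * one new HCA was initialized.
 */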
785 static rdma_stat
786 rpcib_open_hcas(rpcib_state_t *ribstat)
787 {
788 rib_hca_t *hca;
789 ibt_status_t ibt_status;
790 rdma_stat status;
791 ibt_hca_portinfo_t *pinfop;
792 ibt_pd_flags_t pd_flags = IBT_PD_NO_FLAGS;
793 uint_t size, cq_size;
794 int i;
795 kstat_t *ksp;
796 cache_avl_struct_t example_avl_node;
797 char rssc_name[32];
798 int old_nhca_inited = ribstat->nhca_inited;
799 ib_guid_t *hca_guids;
800
801 ASSERT(MUTEX_HELD(&ribstat->open_hca_lock));
802
803 ribstat->hca_count = ibt_get_hca_list(&hca_guids);
804 if (ribstat->hca_count == 0)
805 return (RDMA_FAILED);
806
807 rw_enter(&ribstat->hcas_list_lock, RW_WRITER);
808 /*
809 * Open a hca and setup for RDMA
810 */
811 for (i = 0; i < ribstat->hca_count; i++) {
812 if (rpcib_find_hca(ribstat, hca_guids[i]))
813 continue;
814 hca = kmem_zalloc(sizeof (rib_hca_t), KM_SLEEP);
815
816 ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl,
817 hca_guids[i], &hca->hca_hdl);
818 if (ibt_status != IBT_SUCCESS) {
819 kmem_free(hca, sizeof (rib_hca_t));
820 continue;
821 }
822 hca->hca_guid = hca_guids[i];
823 hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl;
824 hca->state = HCA_INITED;
825
826 /*
827 * query HCA info
828 */
829 ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs);
830 if (ibt_status != IBT_SUCCESS) {
831 goto fail1;
832 }
833
834 /*
835 * One PD (Protection Domain) per HCA.
836 		 * A QP is allowed to access a memory region
837 		 * only when the QP and the memory region
838 		 * belong to the same PD.
839 */
840 ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl);
841 if (ibt_status != IBT_SUCCESS) {
842 goto fail1;
843 }
844
845 /*
846 * query HCA ports
847 */
848 ibt_status = ibt_query_hca_ports(hca->hca_hdl,
849 0, &pinfop, &hca->hca_nports, &size);
850 if (ibt_status != IBT_SUCCESS) {
851 goto fail2;
852 }
853 hca->hca_ports = pinfop;
854 hca->hca_pinfosz = size;
855 pinfop = NULL;
856
857 cq_size = DEF_CQ_SIZE; /* default cq size */
858 /*
859 		 * Create 2 pairs of CQs (1 pair for the client
860 		 * and the other pair for the server) on this hca.
861 		 * If the number of QPs gets too large, then several
862 		 * CQs will be needed.
863 */
864 status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler,
865 &hca->svc_rcq);
866 if (status != RDMA_SUCCESS) {
867 goto fail3;
868 }
869
870 status = rib_create_cq(hca, cq_size, rib_svc_scq_handler,
871 &hca->svc_scq);
872 if (status != RDMA_SUCCESS) {
873 goto fail3;
874 }
875
876 status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler,
877 &hca->clnt_rcq);
878 if (status != RDMA_SUCCESS) {
879 goto fail3;
880 }
881
882 status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler,
883 &hca->clnt_scq);
884 if (status != RDMA_SUCCESS) {
885 goto fail3;
886 }
887
888 /*
889 * Create buffer pools.
890 		 * Note rib_rbufpool_create() also allocates memory windows.
891 */
892 hca->recv_pool = rib_rbufpool_create(hca,
893 RECV_BUFFER, rib_max_rbufs);
894 if (hca->recv_pool == NULL) {
895 goto fail3;
896 }
897
898 hca->send_pool = rib_rbufpool_create(hca,
899 SEND_BUFFER, rib_max_rbufs);
900 if (hca->send_pool == NULL) {
901 rib_rbufpool_destroy(hca, RECV_BUFFER);
902 goto fail3;
903 }
904
905 if (hca->server_side_cache == NULL) {
906 (void) sprintf(rssc_name,
907 "rib_srvr_cache_%llx",
908 (long long unsigned int) hca->hca_guid);
909 hca->server_side_cache = kmem_cache_create(
910 rssc_name,
911 sizeof (cache_avl_struct_t), 0,
912 NULL,
913 NULL,
914 rib_server_side_cache_reclaim,
915 hca, NULL, 0);
916 }
917
918 avl_create(&hca->avl_tree,
919 avl_compare,
920 sizeof (cache_avl_struct_t),
921 (uint_t)(uintptr_t)&example_avl_node.avl_link-
922 (uint_t)(uintptr_t)&example_avl_node);
923
924 rw_init(&hca->bound_services_lock, NULL, RW_DRIVER,
925 hca->iblock);
926 rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock);
927 rw_init(&hca->avl_rw_lock,
928 NULL, RW_DRIVER, hca->iblock);
929 mutex_init(&hca->cache_allocation_lock,
930 NULL, MUTEX_DRIVER, NULL);
931 hca->avl_init = TRUE;
932
933 /* Create kstats for the cache */
934 ASSERT(INGLOBALZONE(curproc));
935
936 if (!stats_enabled) {
937 ksp = kstat_create_zone("unix", 0, "rpcib_cache", "rpc",
938 KSTAT_TYPE_NAMED,
939 sizeof (rpcib_kstat) / sizeof (kstat_named_t),
940 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE,
941 GLOBAL_ZONEID);
942 if (ksp) {
943 ksp->ks_data = (void *) &rpcib_kstat;
944 ksp->ks_update = rpcib_cache_kstat_update;
945 kstat_install(ksp);
946 stats_enabled = TRUE;
947 }
948 }
949 if (hca->cleanup_helper == NULL) {
950 char tq_name[sizeof (hca->hca_guid) * 2 + 1];
951
952 (void) snprintf(tq_name, sizeof (tq_name), "%llX",
953 (unsigned long long int) hca->hca_guid);
954 hca->cleanup_helper = ddi_taskq_create(NULL,
955 tq_name, 1, TASKQ_DEFAULTPRI, 0);
956 }
957
958 mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
959 cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL);
960 rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER,
961 hca->iblock);
962 rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER,
963 hca->iblock);
964 mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock);
965 hca->inuse = TRUE;
966
967 hca->next = ribstat->hcas_list;
968 ribstat->hcas_list = hca;
969 ribstat->nhca_inited++;
970 ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
971 continue;
972
973 fail3:
974 ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
975 fail2:
976 (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
977 fail1:
978 (void) ibt_close_hca(hca->hca_hdl);
979 kmem_free(hca, sizeof (rib_hca_t));
980 }
981 rw_exit(&ribstat->hcas_list_lock);
982 ibt_free_hca_list(hca_guids, ribstat->hca_count);
983 rib_mod.rdma_count = rib_stat->nhca_inited;
984
985 /*
986 * return success if at least one new hca has been configured.
987 */
988 if (ribstat->nhca_inited != old_nhca_inited)
989 return (RDMA_SUCCESS);
990 else
991 return (RDMA_FAILED);
992 }
993
994 /*
995 * Callback routines
996 */
997
998 /*
999 * SCQ handlers
1000 */
1001 /* ARGSUSED */
1002 static void
1003 rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1004 {
1005 ibt_status_t ibt_status;
1006 ibt_wc_t wc;
1007 struct send_wid *wd;
1008 CONN *conn;
1009 rib_qp_t *qp;
1010 int i;
1011
1012 /*
1013 * Re-enable cq notify here to avoid missing any
1014 * completion queue notification.
1015 */
1016 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1017
1018 ibt_status = IBT_SUCCESS;
1019 while (ibt_status != IBT_CQ_EMPTY) {
1020 bzero(&wc, sizeof (wc));
1021 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1022 if (ibt_status != IBT_SUCCESS)
1023 return;
1024
1025 /*
1026 * Got a send completion
1027 */
1028 if (wc.wc_id != RDMA_DUMMY_WRID) {
1029 wd = (struct send_wid *)(uintptr_t)wc.wc_id;
1030 qp = wd->qp;
1031 conn = qptoc(qp);
1032
1033 mutex_enter(&wd->sendwait_lock);
1034 switch (wc.wc_status) {
1035 case IBT_WC_SUCCESS:
1036 wd->status = RDMA_SUCCESS;
1037 break;
1038 default:
1039 /*
1040 * RC Send Q Error Code Local state Remote State
1041 * ==================== =========== ============
1042 * IBT_WC_BAD_RESPONSE_ERR ERROR None
1043 * IBT_WC_LOCAL_LEN_ERR ERROR None
1044 * IBT_WC_LOCAL_CHAN_OP_ERR ERROR None
1045 * IBT_WC_LOCAL_PROTECT_ERR ERROR None
1046 * IBT_WC_MEM_WIN_BIND_ERR ERROR None
1047 * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR ERROR
1048 * IBT_WC_REMOTE_ACCESS_ERR ERROR ERROR
1049 * IBT_WC_REMOTE_OP_ERR ERROR ERROR
1050 * IBT_WC_RNR_NAK_TIMEOUT_ERR ERROR None
1051 * IBT_WC_TRANS_TIMEOUT_ERR ERROR None
1052 * IBT_WC_WR_FLUSHED_ERR ERROR None
1053 */
1054 /*
1055 * Channel in error state. Set connection to
1056 * ERROR and cleanup will happen either from
1057 * conn_release or from rib_conn_get
1058 */
1059 wd->status = RDMA_FAILED;
1060 mutex_enter(&conn->c_lock);
1061 if (conn->c_state != C_DISCONN_PEND)
1062 conn->c_state = C_ERROR_CONN;
1063 mutex_exit(&conn->c_lock);
1064 break;
1065 }
1066
1067 if (wd->cv_sig == 1) {
1068 /*
1069 * Notify poster
1070 */
1071 cv_signal(&wd->wait_cv);
1072 mutex_exit(&wd->sendwait_lock);
1073 } else {
1074 /*
1075 * Poster not waiting for notification.
1076 * Free the send buffers and send_wid
1077 */
1078 for (i = 0; i < wd->nsbufs; i++) {
1079 rib_rbuf_free(qptoc(wd->qp),
1080 SEND_BUFFER,
1081 (void *)(uintptr_t)wd->sbufaddr[i]);
1082 }
1083
1084 /* decrement the send ref count */
1085 rib_send_rele(qp);
1086
1087 mutex_exit(&wd->sendwait_lock);
1088 (void) rib_free_sendwait(wd);
1089 }
1090 }
1091 }
1092 }
1093
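/*
 * Server-side send completion handler; the logic mirrors
 * rib_clnt_scq_handler() above.
 */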
1094 /* ARGSUSED */
1095 static void
1096 rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1097 {
1098 ibt_status_t ibt_status;
1099 ibt_wc_t wc;
1100 struct send_wid *wd;
1101 rib_qp_t *qp;
1102 CONN *conn;
1103 int i;
1104
1105 /*
1106 * Re-enable cq notify here to avoid missing any
1107 * completion queue notification.
1108 */
1109 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1110
1111 ibt_status = IBT_SUCCESS;
1112 while (ibt_status != IBT_CQ_EMPTY) {
1113 bzero(&wc, sizeof (wc));
1114 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1115 if (ibt_status != IBT_SUCCESS)
1116 return;
1117
1118 /*
1119 * Got a send completion
1120 */
1121 if (wc.wc_id != RDMA_DUMMY_WRID) {
1122 wd = (struct send_wid *)(uintptr_t)wc.wc_id;
1123 qp = wd->qp;
1124 conn = qptoc(qp);
1125 mutex_enter(&wd->sendwait_lock);
1126
1127 switch (wc.wc_status) {
1128 case IBT_WC_SUCCESS:
1129 wd->status = RDMA_SUCCESS;
1130 break;
1131 default:
1132 /*
1133 * Channel in error state. Set connection to
1134 * ERROR and cleanup will happen either from
1135 * conn_release or conn timeout.
1136 */
1137 wd->status = RDMA_FAILED;
1138 mutex_enter(&conn->c_lock);
1139 if (conn->c_state != C_DISCONN_PEND)
1140 conn->c_state = C_ERROR_CONN;
1141 mutex_exit(&conn->c_lock);
1142 break;
1143 }
1144
1145 if (wd->cv_sig == 1) {
1146 /*
1147 * Update completion status and notify poster
1148 */
1149 cv_signal(&wd->wait_cv);
1150 mutex_exit(&wd->sendwait_lock);
1151 } else {
1152 /*
1153 * Poster not waiting for notification.
1154 * Free the send buffers and send_wid
1155 */
1156 for (i = 0; i < wd->nsbufs; i++) {
1157 rib_rbuf_free(qptoc(wd->qp),
1158 SEND_BUFFER,
1159 (void *)(uintptr_t)wd->sbufaddr[i]);
1160 }
1161
1162 /* decrement the send ref count */
1163 rib_send_rele(qp);
1164
1165 mutex_exit(&wd->sendwait_lock);
1166 (void) rib_free_sendwait(wd);
1167 }
1168 }
1169 }
1170 }
1171
1172 /*
1173 * RCQ handler
1174 */
1175 /* ARGSUSED */
1176 static void
1177 rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1178 {
1179 rib_qp_t *qp;
1180 ibt_status_t ibt_status;
1181 ibt_wc_t wc;
1182 struct recv_wid *rwid;
1183
1184 /*
1185 * Re-enable cq notify here to avoid missing any
1186 * completion queue notification.
1187 */
1188 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1189
1190 ibt_status = IBT_SUCCESS;
1191 while (ibt_status != IBT_CQ_EMPTY) {
1192 bzero(&wc, sizeof (wc));
1193 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1194 if (ibt_status != IBT_SUCCESS)
1195 return;
1196
1197 rwid = (struct recv_wid *)(uintptr_t)wc.wc_id;
1198 qp = rwid->qp;
1199
1200 if (wc.wc_status == IBT_WC_SUCCESS) {
1201 XDR inxdrs, *xdrs;
1202 uint_t xid, vers, op, find_xid = 0;
1203 struct reply *r;
1204 CONN *conn = qptoc(qp);
1205 uint32_t rdma_credit = 0;
1206
1207 xdrs = &inxdrs;
1208 xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr,
1209 wc.wc_bytes_xfer, XDR_DECODE);
1210 /*
1211 * Treat xid as opaque (xid is the first entity
1212 * in the rpc rdma message).
1213 */
1214 xid = *(uint32_t *)(uintptr_t)rwid->addr;
1215
1216 /* Skip xid and set the xdr position accordingly. */
1217 XDR_SETPOS(xdrs, sizeof (uint32_t));
1218 (void) xdr_u_int(xdrs, &vers);
1219 (void) xdr_u_int(xdrs, &rdma_credit);
1220 (void) xdr_u_int(xdrs, &op);
1221 XDR_DESTROY(xdrs);
1222
1223 if (vers != RPCRDMA_VERS) {
1224 /*
1225 * Invalid RPC/RDMA version. Cannot
1226 * interoperate. Set connection to
1227 * ERROR state and bail out.
1228 */
1229 mutex_enter(&conn->c_lock);
1230 if (conn->c_state != C_DISCONN_PEND)
1231 conn->c_state = C_ERROR_CONN;
1232 mutex_exit(&conn->c_lock);
1233 rib_rbuf_free(conn, RECV_BUFFER,
1234 (void *)(uintptr_t)rwid->addr);
1235 rib_free_wid(rwid);
1236 rib_recv_rele(qp);
1237 continue;
1238 }
1239
1240 mutex_enter(&qp->replylist_lock);
1241 for (r = qp->replylist; r != NULL; r = r->next) {
1242 if (r->xid == xid) {
1243 find_xid = 1;
1244 switch (op) {
1245 case RDMA_MSG:
1246 case RDMA_NOMSG:
1247 case RDMA_MSGP:
1248 r->status = RDMA_SUCCESS;
1249 r->vaddr_cq = rwid->addr;
1250 r->bytes_xfer =
1251 wc.wc_bytes_xfer;
1252 cv_signal(&r->wait_cv);
1253 break;
1254 default:
1255 rib_rbuf_free(qptoc(qp),
1256 RECV_BUFFER,
1257 (void *)(uintptr_t)
1258 rwid->addr);
1259 break;
1260 }
1261 break;
1262 }
1263 }
1264 mutex_exit(&qp->replylist_lock);
1265 if (find_xid == 0) {
1266 /* RPC caller not waiting for reply */
1267
1268 DTRACE_PROBE1(rpcib__i__nomatchxid1,
1269 int, xid);
1270
1271 rib_rbuf_free(qptoc(qp), RECV_BUFFER,
1272 (void *)(uintptr_t)rwid->addr);
1273 }
1274 } else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) {
1275 CONN *conn = qptoc(qp);
1276
1277 /*
1278 * Connection being flushed. Just free
1279 * the posted buffer
1280 */
1281 rib_rbuf_free(conn, RECV_BUFFER,
1282 (void *)(uintptr_t)rwid->addr);
1283 } else {
1284 CONN *conn = qptoc(qp);
1285 /*
1286 * RC Recv Q Error Code Local state Remote State
1287 * ==================== =========== ============
1288 * IBT_WC_LOCAL_ACCESS_ERR ERROR ERROR when NAK recvd
1289 * IBT_WC_LOCAL_LEN_ERR ERROR ERROR when NAK recvd
1290 * IBT_WC_LOCAL_PROTECT_ERR ERROR ERROR when NAK recvd
1291 * IBT_WC_LOCAL_CHAN_OP_ERR ERROR ERROR when NAK recvd
1292 * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR ERROR when NAK recvd
1293 * IBT_WC_WR_FLUSHED_ERR None None
1294 */
1295 /*
1296 * Channel in error state. Set connection
1297 * in ERROR state.
1298 */
1299 mutex_enter(&conn->c_lock);
1300 if (conn->c_state != C_DISCONN_PEND)
1301 conn->c_state = C_ERROR_CONN;
1302 mutex_exit(&conn->c_lock);
1303 rib_rbuf_free(conn, RECV_BUFFER,
1304 (void *)(uintptr_t)rwid->addr);
1305 }
1306 rib_free_wid(rwid);
1307 rib_recv_rele(qp);
1308 }
1309 }
1310
1311 /* Server side */
1312 /* ARGSUSED */
1313 static void
1314 rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1315 {
1316 rdma_recv_data_t *rdp;
1317 rib_qp_t *qp;
1318 ibt_status_t ibt_status;
1319 ibt_wc_t wc;
1320 struct svc_recv *s_recvp;
1321 CONN *conn;
1322 mblk_t *mp;
1323
1324 /*
1325 * Re-enable cq notify here to avoid missing any
1326 * completion queue notification.
1327 */
1328 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1329
1330 ibt_status = IBT_SUCCESS;
1331 while (ibt_status != IBT_CQ_EMPTY) {
1332 bzero(&wc, sizeof (wc));
1333 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1334 if (ibt_status != IBT_SUCCESS)
1335 return;
1336
1337 s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id;
1338 qp = s_recvp->qp;
1339 conn = qptoc(qp);
1340
1341 if (wc.wc_status == IBT_WC_SUCCESS) {
1342 XDR inxdrs, *xdrs;
1343 uint_t xid, vers, op;
1344 uint32_t rdma_credit;
1345
1346 xdrs = &inxdrs;
1347 /* s_recvp->vaddr stores data */
1348 xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr,
1349 wc.wc_bytes_xfer, XDR_DECODE);
1350
1351 /*
1352 * Treat xid as opaque (xid is the first entity
1353 * in the rpc rdma message).
1354 */
1355 xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr;
1356 /* Skip xid and set the xdr position accordingly. */
1357 XDR_SETPOS(xdrs, sizeof (uint32_t));
1358 if (!xdr_u_int(xdrs, &vers) ||
1359 !xdr_u_int(xdrs, &rdma_credit) ||
1360 !xdr_u_int(xdrs, &op)) {
1361 rib_rbuf_free(conn, RECV_BUFFER,
1362 (void *)(uintptr_t)s_recvp->vaddr);
1363 XDR_DESTROY(xdrs);
1364 rib_recv_rele(qp);
1365 (void) rib_free_svc_recv(s_recvp);
1366 continue;
1367 }
1368 XDR_DESTROY(xdrs);
1369
1370 if (vers != RPCRDMA_VERS) {
1371 /*
1372 * Invalid RPC/RDMA version.
1373 * Drop rpc rdma message.
1374 */
1375 rib_rbuf_free(conn, RECV_BUFFER,
1376 (void *)(uintptr_t)s_recvp->vaddr);
1377 rib_recv_rele(qp);
1378 (void) rib_free_svc_recv(s_recvp);
1379 continue;
1380 }
1381 /*
1382 * Is this for RDMA_DONE?
1383 */
1384 if (op == RDMA_DONE) {
1385 rib_rbuf_free(conn, RECV_BUFFER,
1386 (void *)(uintptr_t)s_recvp->vaddr);
1387 /*
1388 * Wake up the thread waiting on
1389 * a RDMA_DONE for xid
1390 */
1391 mutex_enter(&qp->rdlist_lock);
1392 rdma_done_notify(qp, xid);
1393 mutex_exit(&qp->rdlist_lock);
1394 rib_recv_rele(qp);
1395 (void) rib_free_svc_recv(s_recvp);
1396 continue;
1397 }
1398
1399 mutex_enter(&plugin_state_lock);
1400 mutex_enter(&conn->c_lock);
1401 if ((plugin_state == ACCEPT) &&
1402 (conn->c_state == C_CONNECTED)) {
1403 conn->c_ref++;
1404 mutex_exit(&conn->c_lock);
1405 while ((mp = allocb(sizeof (*rdp), BPRI_LO))
1406 == NULL)
1407 (void) strwaitbuf(
1408 sizeof (*rdp), BPRI_LO);
1409 /*
1410 				 * The plugin is in the accept state, so the master
1411 				 * transport queue for this connection is still accepting
1412 				 * requests. We can therefore call svc_queuereq() to
1413 				 * queue this received msg.
1414 */
1415 rdp = (rdma_recv_data_t *)mp->b_rptr;
1416 rdp->conn = conn;
1417 rdp->rpcmsg.addr =
1418 (caddr_t)(uintptr_t)s_recvp->vaddr;
1419 rdp->rpcmsg.type = RECV_BUFFER;
1420 rdp->rpcmsg.len = wc.wc_bytes_xfer;
1421 rdp->status = wc.wc_status;
1422 mp->b_wptr += sizeof (*rdp);
1423 (void) svc_queuereq((queue_t *)rib_stat->q, mp,
1424 FALSE);
1425 mutex_exit(&plugin_state_lock);
1426 } else {
1427 /*
1428 * The master transport for this is going
1429 				 * away and the queue is not accepting any more
1430 * requests for krpc, so don't do anything, just
1431 * free the msg.
1432 */
1433 mutex_exit(&conn->c_lock);
1434 mutex_exit(&plugin_state_lock);
1435 rib_rbuf_free(conn, RECV_BUFFER,
1436 (void *)(uintptr_t)s_recvp->vaddr);
1437 }
1438 } else {
1439 rib_rbuf_free(conn, RECV_BUFFER,
1440 (void *)(uintptr_t)s_recvp->vaddr);
1441 }
1442 rib_recv_rele(qp);
1443 (void) rib_free_svc_recv(s_recvp);
1444 }
1445 }
1446
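/*
 * Handles DR event of IBT_HCA_ATTACH_EVENT.
 * Opens any newly attached HCAs and restarts the RDMA listeners.
 */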
1447 static void
1448 rib_attach_hca()
1449 {
1450 mutex_enter(&rib_stat->open_hca_lock);
1451 (void) rpcib_open_hcas(rib_stat);
1452 rib_listen(NULL);
1453 mutex_exit(&rib_stat->open_hca_lock);
1454 }
1455
1456 /*
1457 * Handles DR event of IBT_HCA_DETACH_EVENT.
1458 */
1459 /* ARGSUSED */
1460 static void
1461 rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
1462 ibt_async_code_t code, ibt_async_event_t *event)
1463 {
1464 switch (code) {
1465 case IBT_HCA_ATTACH_EVENT:
1466 rib_attach_hca();
1467 break;
1468 case IBT_HCA_DETACH_EVENT:
1469 rib_detach_hca(hca_hdl);
1470 #ifdef DEBUG
1471 cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n");
1472 #endif
1473 break;
1474 case IBT_EVENT_PORT_UP:
1475 /*
1476 * A port is up. We should call rib_listen() since there is
1477 * a chance that rib_listen() may have failed during
1478 * rib_attach_hca() because the port had not been up yet.
1479 */
1480 rib_listen(NULL);
1481 #ifdef DEBUG
1482 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n");
1483 #endif
1484 break;
1485 #ifdef DEBUG
1486 case IBT_EVENT_PATH_MIGRATED:
1487 cmn_err(CE_NOTE, "rib_async_handler(): "
1488 "IBT_EVENT_PATH_MIGRATED\n");
1489 break;
1490 case IBT_EVENT_SQD:
1491 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n");
1492 break;
1493 case IBT_EVENT_COM_EST:
1494 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n");
1495 break;
1496 case IBT_ERROR_CATASTROPHIC_CHAN:
1497 cmn_err(CE_NOTE, "rib_async_handler(): "
1498 "IBT_ERROR_CATASTROPHIC_CHAN\n");
1499 break;
1500 case IBT_ERROR_INVALID_REQUEST_CHAN:
1501 cmn_err(CE_NOTE, "rib_async_handler(): "
1502 "IBT_ERROR_INVALID_REQUEST_CHAN\n");
1503 break;
1504 case IBT_ERROR_ACCESS_VIOLATION_CHAN:
1505 cmn_err(CE_NOTE, "rib_async_handler(): "
1506 "IBT_ERROR_ACCESS_VIOLATION_CHAN\n");
1507 break;
1508 case IBT_ERROR_PATH_MIGRATE_REQ:
1509 cmn_err(CE_NOTE, "rib_async_handler(): "
1510 "IBT_ERROR_PATH_MIGRATE_REQ\n");
1511 break;
1512 case IBT_ERROR_CQ:
1513 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n");
1514 break;
1515 case IBT_ERROR_PORT_DOWN:
1516 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n");
1517 break;
1518 case IBT_ASYNC_OPAQUE1:
1519 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n");
1520 break;
1521 case IBT_ASYNC_OPAQUE2:
1522 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n");
1523 break;
1524 case IBT_ASYNC_OPAQUE3:
1525 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n");
1526 break;
1527 case IBT_ASYNC_OPAQUE4:
1528 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n");
1529 break;
1530 #endif
1531 default:
1532 break;
1533 }
1534 }
1535
1536 /*
1537 * Client's reachable function.
1538 */
1539 static rdma_stat
1540 rib_reachable(int addr_type, struct netbuf *raddr, void **handle)
1541 {
1542 rdma_stat status;
1543 rpcib_ping_t rpt;
1544 struct netbuf saddr;
1545 CONN *conn;
1546
1547 bzero(&saddr, sizeof (struct netbuf));
1548 status = rib_connect(&saddr, raddr, addr_type, &rpt, &conn);
1549
1550 if (status == RDMA_SUCCESS) {
1551 *handle = (void *)rpt.hca;
1552 /* release the reference */
1553 (void) rib_conn_release(conn);
1554 return (RDMA_SUCCESS);
1555 } else {
1556 *handle = NULL;
1557 DTRACE_PROBE(rpcib__i__pingfailed);
1558 return (RDMA_FAILED);
1559 }
1560 }
1561
1562 /* Client side qp creation */
1563 static rdma_stat
1564 rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp)
1565 {
1566 rib_qp_t *kqp = NULL;
1567 CONN *conn;
1568 rdma_clnt_cred_ctrl_t *cc_info;
1569
1570 ASSERT(qp != NULL);
1571 *qp = NULL;
1572
1573 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1574 conn = qptoc(kqp);
1575 kqp->hca = hca;
1576 kqp->rdmaconn.c_rdmamod = &rib_mod;
1577 kqp->rdmaconn.c_private = (caddr_t)kqp;
1578
1579 kqp->mode = RIB_CLIENT;
1580 kqp->chan_flags = IBT_BLOCKING;
1581 conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP);
1582 bcopy(raddr->buf, conn->c_raddr.buf, raddr->len);
1583 conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len;
1584 /*
1585 	 * Initialize the QP locks and condition variables
1586 */
1587 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1588 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1589 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1590 cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL);
1591 mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1592 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock);
1593 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1594 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1595 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1596 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1597 /*
1598 * Initialize the client credit control
1599 * portion of the rdmaconn struct.
1600 */
1601 kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT;
1602 cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
1603 cc_info->clnt_cc_granted_ops = 0;
1604 cc_info->clnt_cc_in_flight_ops = 0;
1605 cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL);
1606
1607 *qp = kqp;
1608 return (RDMA_SUCCESS);
1609 }
1610
1611 /* Server side qp creation */
1612 static rdma_stat
1613 rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp)
1614 {
1615 rib_qp_t *kqp = NULL;
1616 ibt_chan_sizes_t chan_sizes;
1617 ibt_rc_chan_alloc_args_t qp_attr;
1618 ibt_status_t ibt_status;
1619 rdma_srv_cred_ctrl_t *cc_info;
1620
1621 *qp = NULL;
1622
1623 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1624 kqp->hca = hca;
1625 kqp->port_num = port;
1626 kqp->rdmaconn.c_rdmamod = &rib_mod;
1627 kqp->rdmaconn.c_private = (caddr_t)kqp;
1628
1629 /*
1630 * Create the qp handle
1631 */
1632 bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1633 qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl;
1634 qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl;
1635 qp_attr.rc_pd = hca->pd_hdl;
1636 qp_attr.rc_hca_port_num = port;
1637 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1638 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1639 qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1640 qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1641 qp_attr.rc_clone_chan = NULL;
1642 qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1643 qp_attr.rc_flags = IBT_WR_SIGNALED;
1644
1645 rw_enter(&hca->state_lock, RW_READER);
1646 if (hca->state != HCA_DETACHED) {
1647 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1648 IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl,
1649 &chan_sizes);
1650 } else {
1651 rw_exit(&hca->state_lock);
1652 goto fail;
1653 }
1654 rw_exit(&hca->state_lock);
1655
1656 if (ibt_status != IBT_SUCCESS) {
1657 DTRACE_PROBE1(rpcib__i_svccreatechanfail,
1658 int, ibt_status);
1659 goto fail;
1660 }
1661
1662 kqp->mode = RIB_SERVER;
1663 kqp->chan_flags = IBT_BLOCKING;
1664 kqp->q = q; /* server ONLY */
1665
1666 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1667 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1668 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1669 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1670 cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL);
1671 mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1672 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1673 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1674 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1675 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1676 /*
1677 * Set the private data area to qp to be used in callbacks
1678 */
1679 ibt_set_chan_private(kqp->qp_hdl, (void *)kqp);
1680 kqp->rdmaconn.c_state = C_CONNECTED;
1681
1682 /*
1683 * Initialize the server credit control
1684 * portion of the rdmaconn struct.
1685 */
1686 kqp->rdmaconn.c_cc_type = RDMA_CC_SRV;
1687 cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc;
1688 cc_info->srv_cc_buffers_granted = preposted_rbufs;
1689 cc_info->srv_cc_cur_buffers_used = 0;
1690 cc_info->srv_cc_posted = preposted_rbufs;
1691
1692 *qp = kqp;
1693
1694 return (RDMA_SUCCESS);
1695 fail:
1696 if (kqp)
1697 kmem_free(kqp, sizeof (rib_qp_t));
1698
1699 return (RDMA_FAILED);
1700 }
1701
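/*
 * Client-side connection manager (CM) event handler. On a connection
 * close initiated by the remote end, the connection is moved to the
 * error state and freed once its reference count drops to zero.
 */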
1702 /* ARGSUSED */
1703 ibt_cm_status_t
1704 rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event,
1705 ibt_cm_return_args_t *ret_args, void *priv_data,
1706 ibt_priv_data_len_t len)
1707 {
1708 rib_hca_t *hca;
1709
1710 hca = (rib_hca_t *)clnt_hdl;
1711
1712 switch (event->cm_type) {
1713
1714 /* got a connection close event */
1715 case IBT_CM_EVENT_CONN_CLOSED:
1716 {
1717 CONN *conn;
1718 rib_qp_t *qp;
1719
1720 /* check reason why connection was closed */
1721 switch (event->cm_event.closed) {
1722 case IBT_CM_CLOSED_DREP_RCVD:
1723 case IBT_CM_CLOSED_DREQ_TIMEOUT:
1724 case IBT_CM_CLOSED_DUP:
1725 case IBT_CM_CLOSED_ABORT:
1726 case IBT_CM_CLOSED_ALREADY:
1727 /*
1728 * These cases indicate the local end initiated
1729 * the closing of the channel. Nothing to do here.
1730 */
1731 break;
1732 default:
1733 /*
1734 * Reason for CONN_CLOSED event must be one of
1735 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
1736 			 * or IBT_CM_CLOSED_STALE. These indicate cases where
1737 			 * the remote end is closing the channel. In these
1738 			 * cases free the channel and transition to the error
1739 			 * state.
1740 */
1741 qp = ibt_get_chan_private(event->cm_channel);
1742 conn = qptoc(qp);
1743 mutex_enter(&conn->c_lock);
1744 if (conn->c_state == C_DISCONN_PEND) {
1745 mutex_exit(&conn->c_lock);
1746 break;
1747 }
1748
1749 conn->c_state = C_ERROR_CONN;
1750
1751 /*
1752 * Free the conn if c_ref is down to 0 already
1753 */
1754 if (conn->c_ref == 0) {
1755 /*
1756 * Remove from list and free conn
1757 */
1758 conn->c_state = C_DISCONN_PEND;
1759 mutex_exit(&conn->c_lock);
1760 rw_enter(&hca->state_lock, RW_READER);
1761 if (hca->state != HCA_DETACHED)
1762 (void) rib_disconnect_channel(conn,
1763 &hca->cl_conn_list);
1764 rw_exit(&hca->state_lock);
1765 } else {
1766 /*
1767 * conn will be freed when c_ref goes to 0.
1768 * Indicate to cleaning thread not to close
1769 * the connection, but just free the channel.
1770 */
1771 conn->c_flags |= C_CLOSE_NOTNEEDED;
1772 mutex_exit(&conn->c_lock);
1773 }
1774 #ifdef DEBUG
1775 if (rib_debug)
1776 cmn_err(CE_NOTE, "rib_clnt_cm_handler: "
1777 "(CONN_CLOSED) channel disconnected");
1778 #endif
1779 break;
1780 }
1781 break;
1782 }
1783 default:
1784 break;
1785 }
1786 return (IBT_CM_ACCEPT);
1787 }
1788
1789 /*
1790 * Connect to the server.
1791 */
1792 rdma_stat
1793 rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, rpcib_ping_t *rptp)
1794 {
1795 ibt_chan_open_args_t chan_args; /* channel args */
1796 ibt_chan_sizes_t chan_sizes;
1797 ibt_rc_chan_alloc_args_t qp_attr;
1798 ibt_status_t ibt_status;
1799 ibt_rc_returns_t ret_args; /* conn reject info */
1800 int refresh = REFRESH_ATTEMPTS; /* refresh if IBT_CM_CONN_STALE */
1801 ibt_ip_cm_info_t ipcm_info;
1802 uint8_t cmp_ip_pvt[IBT_IP_HDR_PRIV_DATA_SZ];
1803
1804
1805 (void) bzero(&chan_args, sizeof (chan_args));
1806 (void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1807 (void) bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t));
1808
1809 ipcm_info.src_addr.family = rptp->srcip.family;
1810 switch (ipcm_info.src_addr.family) {
1811 case AF_INET:
1812 ipcm_info.src_addr.un.ip4addr = rptp->srcip.un.ip4addr;
1813 break;
1814 case AF_INET6:
1815 ipcm_info.src_addr.un.ip6addr = rptp->srcip.un.ip6addr;
1816 break;
1817 }
1818
1819 ipcm_info.dst_addr.family = rptp->srcip.family;
1820 switch (ipcm_info.dst_addr.family) {
1821 case AF_INET:
1822 ipcm_info.dst_addr.un.ip4addr = rptp->dstip.un.ip4addr;
1823 break;
1824 case AF_INET6:
1825 ipcm_info.dst_addr.un.ip6addr = rptp->dstip.un.ip6addr;
1826 break;
1827 }
1828
1829 ipcm_info.src_port = (in_port_t)nfs_rdma_port;
1830
1831 ibt_status = ibt_format_ip_private_data(&ipcm_info,
1832 IBT_IP_HDR_PRIV_DATA_SZ, cmp_ip_pvt);
1833
1834 if (ibt_status != IBT_SUCCESS) {
1835 cmn_err(CE_WARN, "ibt_format_ip_private_data failed\n");
1836 return (-1);
1837 }
1838
1839 qp_attr.rc_hca_port_num = rptp->path.pi_prim_cep_path.cep_hca_port_num;
1840 /* Alloc a RC channel */
1841 qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl;
1842 qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl;
1843 qp_attr.rc_pd = hca->pd_hdl;
1844 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1845 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1846 qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1847 qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1848 qp_attr.rc_clone_chan = NULL;
1849 qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1850 qp_attr.rc_flags = IBT_WR_SIGNALED;
1851
1852 rptp->path.pi_sid = ibt_get_ip_sid(IPPROTO_TCP, nfs_rdma_port);
1853 chan_args.oc_path = &rptp->path;
1854
1855 chan_args.oc_cm_handler = rib_clnt_cm_handler;
1856 chan_args.oc_cm_clnt_private = (void *)hca;
1857 chan_args.oc_rdma_ra_out = 4;
1858 chan_args.oc_rdma_ra_in = 4;
1859 chan_args.oc_path_retry_cnt = 2;
1860 chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES;
1861 chan_args.oc_priv_data = cmp_ip_pvt;
1862 chan_args.oc_priv_data_len = IBT_IP_HDR_PRIV_DATA_SZ;
1863
1864 refresh:
1865 rw_enter(&hca->state_lock, RW_READER);
1866 if (hca->state != HCA_DETACHED) {
1867 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1868 IBT_ACHAN_NO_FLAGS,
1869 &qp_attr, &qp->qp_hdl,
1870 &chan_sizes);
1871 } else {
1872 rw_exit(&hca->state_lock);
1873 return (RDMA_FAILED);
1874 }
1875 rw_exit(&hca->state_lock);
1876
1877 if (ibt_status != IBT_SUCCESS) {
1878 DTRACE_PROBE1(rpcib__i_conntosrv,
1879 int, ibt_status);
1880 return (RDMA_FAILED);
1881 }
1882
1883 /* Connect to the Server */
1884 (void) bzero(&ret_args, sizeof (ret_args));
1885 mutex_enter(&qp->cb_lock);
1886 ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS,
1887 IBT_BLOCKING, &chan_args, &ret_args);
1888 if (ibt_status != IBT_SUCCESS) {
1889 DTRACE_PROBE2(rpcib__i_openrctosrv,
1890 int, ibt_status, int, ret_args.rc_status);
1891
1892 (void) ibt_free_channel(qp->qp_hdl);
1893 qp->qp_hdl = NULL;
1894 mutex_exit(&qp->cb_lock);
1895 if (refresh-- && ibt_status == IBT_CM_FAILURE &&
1896 ret_args.rc_status == IBT_CM_CONN_STALE) {
1897 /*
1898 * Got IBT_CM_CONN_STALE probably because of stale
1899 * data on the passive end of a channel that existed
1900 * prior to reboot. Retry establishing a channel
1901 * REFRESH_ATTEMPTS times, during which time the
1902 * stale conditions on the server might clear up.
1903 */
1904 goto refresh;
1905 }
1906 return (RDMA_FAILED);
1907 }
1908 mutex_exit(&qp->cb_lock);
1909 /*
1910 * Set the private data area to qp to be used in callbacks
1911 */
1912 ibt_set_chan_private(qp->qp_hdl, (void *)qp);
1913 return (RDMA_SUCCESS);
1914 }
1915
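/*
 * Find an HCA and a local source IP address that have an IB path to
 * the given destination address. On success the HCA, path and source
 * and destination addresses are returned in *rptp.
 */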
1916 rdma_stat
1917 rib_ping_srv(int addr_type, struct netbuf *raddr, rpcib_ping_t *rptp)
1918 {
1919 uint_t i, addr_count;
1920 ibt_status_t ibt_status;
1921 uint8_t num_paths_p;
1922 ibt_ip_path_attr_t ipattr;
1923 ibt_path_ip_src_t srcip;
1924 rpcib_ipaddrs_t addrs4;
1925 rpcib_ipaddrs_t addrs6;
1926 struct sockaddr_in *sinp;
1927 struct sockaddr_in6 *sin6p;
1928 rdma_stat retval = RDMA_FAILED;
1929 rib_hca_t *hca;
1930
1931 if ((addr_type != AF_INET) && (addr_type != AF_INET6))
1932 return (RDMA_INVAL);
1933 ASSERT(raddr->buf != NULL);
1934
1935 bzero(&ipattr, sizeof (ibt_ip_path_attr_t));
1936
1937 if (!rpcib_get_ib_addresses(&addrs4, &addrs6) ||
1938 (addrs4.ri_count == 0 && addrs6.ri_count == 0)) {
1939 retval = RDMA_FAILED;
1940 goto done2;
1941 }
1942
1943 if (addr_type == AF_INET) {
1944 addr_count = addrs4.ri_count;
1945 sinp = (struct sockaddr_in *)raddr->buf;
1946 rptp->dstip.family = AF_INET;
1947 rptp->dstip.un.ip4addr = sinp->sin_addr.s_addr;
1948 sinp = addrs4.ri_list;
1949 } else {
1950 addr_count = addrs6.ri_count;
1951 sin6p = (struct sockaddr_in6 *)raddr->buf;
1952 rptp->dstip.family = AF_INET6;
1953 rptp->dstip.un.ip6addr = sin6p->sin6_addr;
1954 sin6p = addrs6.ri_list;
1955 }
1956
1957 rw_enter(&rib_stat->hcas_list_lock, RW_READER);
1958 for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
1959 rw_enter(&hca->state_lock, RW_READER);
1960 if (hca->state == HCA_DETACHED) {
1961 rw_exit(&hca->state_lock);
1962 continue;
1963 }
1964
1965 ipattr.ipa_dst_ip = &rptp->dstip;
1966 ipattr.ipa_hca_guid = hca->hca_guid;
1967 ipattr.ipa_ndst = 1;
1968 ipattr.ipa_max_paths = 1;
1969 ipattr.ipa_src_ip.family = rptp->dstip.family;
1970 for (i = 0; i < addr_count; i++) {
1971 num_paths_p = 0;
1972 if (addr_type == AF_INET) {
1973 ipattr.ipa_src_ip.un.ip4addr =
1974 sinp[i].sin_addr.s_addr;
1975 } else {
1976 ipattr.ipa_src_ip.un.ip6addr =
1977 sin6p[i].sin6_addr;
1978 }
1979 bzero(&srcip, sizeof (ibt_path_ip_src_t));
1980
1981 ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl,
1982 IBT_PATH_NO_FLAGS, &ipattr, &rptp->path,
1983 &num_paths_p, &srcip);
1984 if (ibt_status == IBT_SUCCESS &&
1985 num_paths_p != 0 &&
1986 rptp->path.pi_hca_guid == hca->hca_guid) {
1987 rptp->hca = hca;
1988 rw_exit(&hca->state_lock);
1989 if (addr_type == AF_INET) {
1990 rptp->srcip.family = AF_INET;
1991 rptp->srcip.un.ip4addr =
1992 srcip.ip_primary.un.ip4addr;
1993 } else {
1994 rptp->srcip.family = AF_INET6;
1995 rptp->srcip.un.ip6addr =
1996 srcip.ip_primary.un.ip6addr;
1997
1998 }
1999 retval = RDMA_SUCCESS;
2000 goto done1;
2001 }
2002 }
2003 rw_exit(&hca->state_lock);
2004 }
2005 done1:
2006 rw_exit(&rib_stat->hcas_list_lock);
2007 done2:
2008 if (addrs4.ri_size > 0)
2009 kmem_free(addrs4.ri_list, addrs4.ri_size);
2010 if (addrs6.ri_size > 0)
2011 kmem_free(addrs6.ri_list, addrs6.ri_size);
2012 return (retval);
2013 }
2014
2015 /*
2016 * Close channel, remove from connection list and
2017 * free up resources allocated for that channel.
2018 */
2019 rdma_stat
2020 rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list)
2021 {
2022 rib_qp_t *qp = ctoqp(conn);
2023 rib_hca_t *hca;
2024
2025 mutex_enter(&conn->c_lock);
2026 if (conn->c_timeout != NULL) {
2027 mutex_exit(&conn->c_lock);
2028 (void) untimeout(conn->c_timeout);
2029 mutex_enter(&conn->c_lock);
2030 }
2031
2032 while (conn->c_flags & C_CLOSE_PENDING) {
2033 cv_wait(&conn->c_cv, &conn->c_lock);
2034 }
2035 mutex_exit(&conn->c_lock);
2036
2037 /*
2038 * c_ref == 0 and connection is in C_DISCONN_PEND
2039 */
2040 hca = qp->hca;
2041 if (conn_list != NULL)
2042 (void) rib_rm_conn(conn, conn_list);
2043
2044 /*
2045 * There is only one case where we get here with
2046 * qp_hdl = NULL, which is during connection setup on
2047 * the client. In such a case there are no posted
2048 * send/recv buffers.
2049 */
2050 if (qp->qp_hdl != NULL) {
2051 mutex_enter(&qp->posted_rbufs_lock);
2052 while (qp->n_posted_rbufs)
2053 cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock);
2054 mutex_exit(&qp->posted_rbufs_lock);
2055
2056 mutex_enter(&qp->send_rbufs_lock);
2057 while (qp->n_send_rbufs)
2058 cv_wait(&qp->send_rbufs_cv, &qp->send_rbufs_lock);
2059 mutex_exit(&qp->send_rbufs_lock);
2060
2061 (void) ibt_free_channel(qp->qp_hdl);
2062 qp->qp_hdl = NULL;
2063 }
2064
2065 ASSERT(qp->rdlist == NULL);
2066
2067 if (qp->replylist != NULL) {
2068 (void) rib_rem_replylist(qp);
2069 }
2070
2071 cv_destroy(&qp->cb_conn_cv);
2072 cv_destroy(&qp->posted_rbufs_cv);
2073 cv_destroy(&qp->send_rbufs_cv);
2074 mutex_destroy(&qp->cb_lock);
2075 mutex_destroy(&qp->replylist_lock);
2076 mutex_destroy(&qp->posted_rbufs_lock);
2077 mutex_destroy(&qp->send_rbufs_lock);
2078 mutex_destroy(&qp->rdlist_lock);
2079
2080 cv_destroy(&conn->c_cv);
2081 mutex_destroy(&conn->c_lock);
2082
2083 if (conn->c_raddr.buf != NULL) {
2084 kmem_free(conn->c_raddr.buf, conn->c_raddr.len);
2085 }
2086 if (conn->c_laddr.buf != NULL) {
2087 kmem_free(conn->c_laddr.buf, conn->c_laddr.len);
2088 }
2089 if (conn->c_netid != NULL) {
2090 kmem_free(conn->c_netid, (strlen(conn->c_netid) + 1));
2091 }
2092 if (conn->c_addrmask.buf != NULL) {
2093 kmem_free(conn->c_addrmask.buf, conn->c_addrmask.len);
2094 }
2095
2096 /*
2097 * Credit control cleanup.
2098 */
2099 if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) {
2100 rdma_clnt_cred_ctrl_t *cc_info;
2101 cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
2102 cv_destroy(&cc_info->clnt_cc_cv);
2103 }
2104
2105 kmem_free(qp, sizeof (rib_qp_t));
2106
2107 /*
2108 * If HCA has been DETACHED and the srv/clnt_conn_list is NULL,
2109 * then the hca is no longer being used.
2110 */
2111 if (conn_list != NULL) {
2112 rw_enter(&hca->state_lock, RW_READER);
2113 if (hca->state == HCA_DETACHED) {
2114 rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
2115 if (hca->srv_conn_list.conn_hd == NULL) {
2116 rw_enter(&hca->cl_conn_list.conn_lock,
2117 RW_READER);
2118
2119 if (hca->cl_conn_list.conn_hd == NULL) {
2120 mutex_enter(&hca->inuse_lock);
2121 hca->inuse = FALSE;
2122 cv_signal(&hca->cb_cv);
2123 mutex_exit(&hca->inuse_lock);
2124 }
2125 rw_exit(&hca->cl_conn_list.conn_lock);
2126 }
2127 rw_exit(&hca->srv_conn_list.conn_lock);
2128 }
2129 rw_exit(&hca->state_lock);
2130 }
2131
2132 return (RDMA_SUCCESS);
2133 }
2134
2135 /*
2136 * All sends are done under the protection of
2137 * the wdesc->sendwait_lock. n_send_rbufs count
2138 * is protected using the send_rbufs_lock.
2139 * lock ordering is:
2140 * sendwait_lock -> send_rbufs_lock
2141 */
2142
2143 void
2144 rib_send_hold(rib_qp_t *qp)
2145 {
2146 mutex_enter(&qp->send_rbufs_lock);
2147 qp->n_send_rbufs++;
2148 mutex_exit(&qp->send_rbufs_lock);
2149 }
2150
2151 void
2152 rib_send_rele(rib_qp_t *qp)
2153 {
2154 mutex_enter(&qp->send_rbufs_lock);
2155 qp->n_send_rbufs--;
2156 if (qp->n_send_rbufs == 0)
2157 cv_signal(&qp->send_rbufs_cv);
2158 mutex_exit(&qp->send_rbufs_lock);
2159 }
2160
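/*
 * Release one posted receive buffer. The last release wakes
 * rib_disconnect_channel(), which waits for n_posted_rbufs to drain
 * before freeing the channel.
 */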
2161 void
2162 rib_recv_rele(rib_qp_t *qp)
2163 {
2164 mutex_enter(&qp->posted_rbufs_lock);
2165 qp->n_posted_rbufs--;
2166 if (qp->n_posted_rbufs == 0)
2167 cv_signal(&qp->posted_rbufs_cv);
2168 mutex_exit(&qp->posted_rbufs_lock);
2169 }
2170
/*
 * Wait for a send completion notification. The send_wid is freed only
 * after a completion notification, successful or in error, has been
 * received.
 */
2176 static rdma_stat
2177 rib_sendwait(rib_qp_t *qp, struct send_wid *wd)
2178 {
2179 clock_t timout, cv_wait_ret;
2180 rdma_stat error = RDMA_SUCCESS;
2181 int i;
2182
2183 /*
2184 * Wait for send to complete
2185 */
2186 ASSERT(wd != NULL);
2187 mutex_enter(&wd->sendwait_lock);
2188 if (wd->status == (uint_t)SEND_WAIT) {
2189 timout = drv_usectohz(SEND_WAIT_TIME * 1000000) +
2190 ddi_get_lbolt();
2191
2192 if (qp->mode == RIB_SERVER) {
2193 while ((cv_wait_ret = cv_timedwait(&wd->wait_cv,
2194 &wd->sendwait_lock, timout)) > 0 &&
2195 wd->status == (uint_t)SEND_WAIT)
2196 ;
2197 switch (cv_wait_ret) {
2198 case -1: /* timeout */
2199 DTRACE_PROBE(rpcib__i__srvsendwait__timeout);
2200
2201 wd->cv_sig = 0; /* no signal needed */
2202 error = RDMA_TIMEDOUT;
2203 break;
2204 default: /* got send completion */
2205 break;
2206 }
2207 } else {
2208 while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv,
2209 &wd->sendwait_lock, timout)) > 0 &&
2210 wd->status == (uint_t)SEND_WAIT)
2211 ;
2212 switch (cv_wait_ret) {
2213 case -1: /* timeout */
2214 DTRACE_PROBE(rpcib__i__clntsendwait__timeout);
2215
2216 wd->cv_sig = 0; /* no signal needed */
2217 error = RDMA_TIMEDOUT;
2218 break;
2219 case 0: /* interrupted */
2220 DTRACE_PROBE(rpcib__i__clntsendwait__intr);
2221
2222 wd->cv_sig = 0; /* no signal needed */
2223 error = RDMA_INTR;
2224 break;
2225 default: /* got send completion */
2226 break;
2227 }
2228 }
2229 }
2230
2231 if (wd->status != (uint_t)SEND_WAIT) {
2232 /* got send completion */
2233 if (wd->status != RDMA_SUCCESS) {
2234 switch (wd->status) {
2235 case RDMA_CONNLOST:
2236 error = RDMA_CONNLOST;
2237 break;
2238 default:
2239 error = RDMA_FAILED;
2240 break;
2241 }
2242 }
2243 for (i = 0; i < wd->nsbufs; i++) {
2244 rib_rbuf_free(qptoc(qp), SEND_BUFFER,
2245 (void *)(uintptr_t)wd->sbufaddr[i]);
2246 }
2247
2248 rib_send_rele(qp);
2249
2250 mutex_exit(&wd->sendwait_lock);
2251 (void) rib_free_sendwait(wd);
2252
2253 } else {
2254 mutex_exit(&wd->sendwait_lock);
2255 }
2256 return (error);
2257 }
2258
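/*
 * Allocate and initialize a send work-id descriptor that tracks an
 * outstanding send until its completion is reaped in rib_sendwait()
 * or in the scq_handler.
 */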
2259 static struct send_wid *
2260 rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp)
2261 {
2262 struct send_wid *wd;
2263
2264 wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP);
2265 wd->xid = xid;
2266 wd->cv_sig = cv_sig;
2267 wd->qp = qp;
2268 cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL);
2269 mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL);
2270 wd->status = (uint_t)SEND_WAIT;
2271
2272 return (wd);
2273 }
2274
2275 static int
2276 rib_free_sendwait(struct send_wid *wdesc)
2277 {
2278 cv_destroy(&wdesc->wait_cv);
2279 mutex_destroy(&wdesc->sendwait_lock);
2280 kmem_free(wdesc, sizeof (*wdesc));
2281
2282 return (0);
2283 }
2284
2285 static rdma_stat
2286 rib_rem_rep(rib_qp_t *qp, struct reply *rep)
2287 {
2288 mutex_enter(&qp->replylist_lock);
2289 if (rep != NULL) {
2290 (void) rib_remreply(qp, rep);
2291 mutex_exit(&qp->replylist_lock);
2292 return (RDMA_SUCCESS);
2293 }
2294 mutex_exit(&qp->replylist_lock);
2295 return (RDMA_FAILED);
2296 }
2297
2298 /*
2299 * Send buffers are freed here only in case of error in posting
2300 * on QP. If the post succeeded, the send buffers are freed upon
2301 * send completion in rib_sendwait() or in the scq_handler.
2302 */
2303 rdma_stat
2304 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid,
2305 int send_sig, int cv_sig, caddr_t *swid)
2306 {
2307 struct send_wid *wdesc;
2308 struct clist *clp;
2309 ibt_status_t ibt_status = IBT_SUCCESS;
2310 rdma_stat ret = RDMA_SUCCESS;
2311 ibt_send_wr_t tx_wr;
2312 int i, nds;
2313 ibt_wr_ds_t sgl[DSEG_MAX];
2314 uint_t total_msg_size;
2315 rib_qp_t *qp;
2316
2317 qp = ctoqp(conn);
2318
2319 ASSERT(cl != NULL);
2320
2321 bzero(&tx_wr, sizeof (ibt_send_wr_t));
2322
2323 nds = 0;
2324 total_msg_size = 0;
2325 clp = cl;
2326 while (clp != NULL) {
2327 if (nds >= DSEG_MAX) {
2328 DTRACE_PROBE(rpcib__i__sendandwait_dsegmax_exceeded);
2329 return (RDMA_FAILED);
2330 }
2331 sgl[nds].ds_va = clp->w.c_saddr;
2332 sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */
2333 sgl[nds].ds_len = clp->c_len;
2334 total_msg_size += clp->c_len;
2335 clp = clp->c_next;
2336 nds++;
2337 }
2338
2339 if (send_sig) {
2340 /* Set SEND_SIGNAL flag. */
2341 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2342 wdesc = rib_init_sendwait(msgid, cv_sig, qp);
2343 *swid = (caddr_t)wdesc;
2344 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2345 mutex_enter(&wdesc->sendwait_lock);
2346 wdesc->nsbufs = nds;
2347 for (i = 0; i < nds; i++) {
2348 wdesc->sbufaddr[i] = sgl[i].ds_va;
2349 }
2350 } else {
2351 tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2352 *swid = NULL;
2353 tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
2354 }
2355
2356 tx_wr.wr_opcode = IBT_WRC_SEND;
2357 tx_wr.wr_trans = IBT_RC_SRV;
2358 tx_wr.wr_nds = nds;
2359 tx_wr.wr_sgl = sgl;
2360
2361 mutex_enter(&conn->c_lock);
2362 if (conn->c_state == C_CONNECTED) {
2363 ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2364 }
2365 if (conn->c_state != C_CONNECTED ||
2366 ibt_status != IBT_SUCCESS) {
2367 if (conn->c_state != C_DISCONN_PEND)
2368 conn->c_state = C_ERROR_CONN;
2369 mutex_exit(&conn->c_lock);
2370 if (send_sig) {
2371 for (i = 0; i < nds; i++) {
2372 rib_rbuf_free(conn, SEND_BUFFER,
2373 (void *)(uintptr_t)wdesc->sbufaddr[i]);
2374 }
2375 mutex_exit(&wdesc->sendwait_lock);
2376 (void) rib_free_sendwait(wdesc);
2377 }
2378 return (RDMA_CONNLOST);
2379 }
2380
2381 mutex_exit(&conn->c_lock);
2382
2383 if (send_sig) {
2384 rib_send_hold(qp);
2385 mutex_exit(&wdesc->sendwait_lock);
2386 if (cv_sig) {
2387 /*
2388 * cv_wait for send to complete.
2389 * We can fail due to a timeout or signal or
2390 * unsuccessful send.
2391 */
2392 ret = rib_sendwait(qp, wdesc);
2393
2394 return (ret);
2395 }
2396 }
2397
2398 return (RDMA_SUCCESS);
2399 }
2400
2401
2402 rdma_stat
2403 rib_send(CONN *conn, struct clist *cl, uint32_t msgid)
2404 {
2405 rdma_stat ret;
2406 caddr_t wd;
2407
2408 /* send-wait & cv_signal */
2409 ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd);
2410 return (ret);
2411 }
2412
2413 /*
 * Deprecated/obsolete interface; not currently used, but formerly
 * used for the READ-READ protocol.
 * Send the RPC reply and wait for RDMA_DONE.
2417 */
2418 rdma_stat
2419 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid)
2420 {
2421 rdma_stat ret = RDMA_SUCCESS;
2422 struct rdma_done_list *rd;
2423 clock_t cv_wait_ret;
2424 caddr_t *wid = NULL;
2425 rib_qp_t *qp = ctoqp(conn);
2426
2427 mutex_enter(&qp->rdlist_lock);
2428 rd = rdma_done_add(qp, msgid);
2429
2430 /* No cv_signal (whether send-wait or no-send-wait) */
2431 ret = rib_send_and_wait(conn, cl, msgid, 1, 0, wid);
2432
2433 if (ret != RDMA_SUCCESS) {
2434 rdma_done_rm(qp, rd);
2435 } else {
2436 /*
2437 * Wait for RDMA_DONE from remote end
2438 */
2439 cv_wait_ret = cv_reltimedwait(&rd->rdma_done_cv,
2440 &qp->rdlist_lock, drv_usectohz(REPLY_WAIT_TIME * 1000000),
2441 TR_CLOCK_TICK);
2442
2443 rdma_done_rm(qp, rd);
2444
2445 if (cv_wait_ret < 0) {
2446 ret = RDMA_TIMEDOUT;
2447 }
2448 }
2449
2450 mutex_exit(&qp->rdlist_lock);
2451 return (ret);
2452 }
2453
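/*
 * Allocate a receive work-id that ties a posted receive buffer
 * (sgl->ds_va) to the RPC xid it is expected to carry.
 */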
2454 static struct recv_wid *
2455 rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid)
2456 {
2457 struct recv_wid *rwid;
2458
2459 rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP);
2460 rwid->xid = msgid;
2461 rwid->addr = sgl->ds_va;
2462 rwid->qp = qp;
2463
2464 return (rwid);
2465 }
2466
2467 static void
2468 rib_free_wid(struct recv_wid *rwid)
2469 {
2470 kmem_free(rwid, sizeof (struct recv_wid));
2471 }
2472
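/*
 * Post a single RECV_BUFFER on the client QP and add a reply entry for
 * msgid so that rib_recv() can later match the completion. On failure
 * the caller's receive buffers are returned to the pool.
 */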
2473 rdma_stat
2474 rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid)
2475 {
2476 rib_qp_t *qp = ctoqp(conn);
2477 struct clist *clp = cl;
2478 struct reply *rep;
2479 struct recv_wid *rwid;
2480 int nds;
2481 ibt_wr_ds_t sgl[DSEG_MAX];
2482 ibt_recv_wr_t recv_wr;
2483 rdma_stat ret;
2484 ibt_status_t ibt_status;
2485
2486 /*
2487 * rdma_clnt_postrecv uses RECV_BUFFER.
2488 */
2489
2490 nds = 0;
2491 while (cl != NULL) {
2492 if (nds >= DSEG_MAX) {
2493 ret = RDMA_FAILED;
2494 goto done;
2495 }
2496 sgl[nds].ds_va = cl->w.c_saddr;
2497 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2498 sgl[nds].ds_len = cl->c_len;
2499 cl = cl->c_next;
2500 nds++;
2501 }
2502
2503 if (nds != 1) {
2504 ret = RDMA_FAILED;
2505 goto done;
2506 }
2507
2508 bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2509 recv_wr.wr_nds = nds;
2510 recv_wr.wr_sgl = sgl;
2511
2512 rwid = rib_create_wid(qp, &sgl[0], msgid);
2513 if (rwid) {
2514 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid;
2515 } else {
2516 ret = RDMA_NORESOURCE;
2517 goto done;
2518 }
2519 rep = rib_addreplylist(qp, msgid);
2520 if (!rep) {
2521 rib_free_wid(rwid);
2522 ret = RDMA_NORESOURCE;
2523 goto done;
2524 }
2525
2526 mutex_enter(&conn->c_lock);
2527
2528 if (conn->c_state == C_CONNECTED) {
2529 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2530 }
2531
2532 if (conn->c_state != C_CONNECTED ||
2533 ibt_status != IBT_SUCCESS) {
2534 if (conn->c_state != C_DISCONN_PEND)
2535 conn->c_state = C_ERROR_CONN;
2536 mutex_exit(&conn->c_lock);
2537 rib_free_wid(rwid);
2538 (void) rib_rem_rep(qp, rep);
2539 ret = RDMA_CONNLOST;
2540 goto done;
2541 }
2542
2543 mutex_enter(&qp->posted_rbufs_lock);
2544 qp->n_posted_rbufs++;
2545 mutex_exit(&qp->posted_rbufs_lock);
2546
2547 mutex_exit(&conn->c_lock);
2548 return (RDMA_SUCCESS);
2549
2550 done:
2551 while (clp != NULL) {
2552 rib_rbuf_free(conn, RECV_BUFFER,
2553 (void *)(uintptr_t)clp->w.c_saddr3);
2554 clp = clp->c_next;
2555 }
2556 return (ret);
2557 }
2558
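/*
 * Post a single RECV_BUFFER on a server-side QP. The buffer is freed
 * here only if the post cannot be issued.
 */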
2559 rdma_stat
2560 rib_svc_post(CONN* conn, struct clist *cl)
2561 {
2562 rib_qp_t *qp = ctoqp(conn);
2563 struct svc_recv *s_recvp;
2564 int nds;
2565 ibt_wr_ds_t sgl[DSEG_MAX];
2566 ibt_recv_wr_t recv_wr;
2567 ibt_status_t ibt_status;
2568
2569 nds = 0;
2570 while (cl != NULL) {
2571 if (nds >= DSEG_MAX) {
2572 return (RDMA_FAILED);
2573 }
2574 sgl[nds].ds_va = cl->w.c_saddr;
2575 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2576 sgl[nds].ds_len = cl->c_len;
2577 cl = cl->c_next;
2578 nds++;
2579 }
2580
2581 if (nds != 1) {
2582 rib_rbuf_free(conn, RECV_BUFFER,
2583 (caddr_t)(uintptr_t)sgl[0].ds_va);
2584
2585 return (RDMA_FAILED);
2586 }
2587
2588 bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2589 recv_wr.wr_nds = nds;
2590 recv_wr.wr_sgl = sgl;
2591
2592 s_recvp = rib_init_svc_recv(qp, &sgl[0]);
2593 /* Use s_recvp's addr as wr id */
2594 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp;
2595 mutex_enter(&conn->c_lock);
2596 if (conn->c_state == C_CONNECTED) {
2597 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2598 }
2599 if (conn->c_state != C_CONNECTED ||
2600 ibt_status != IBT_SUCCESS) {
2601 if (conn->c_state != C_DISCONN_PEND)
2602 conn->c_state = C_ERROR_CONN;
2603 mutex_exit(&conn->c_lock);
2604 rib_rbuf_free(conn, RECV_BUFFER,
2605 (caddr_t)(uintptr_t)sgl[0].ds_va);
2606 (void) rib_free_svc_recv(s_recvp);
2607
2608 return (RDMA_CONNLOST);
2609 }
2610 mutex_exit(&conn->c_lock);
2611
2612 return (RDMA_SUCCESS);
2613 }
2614
2615 /* Client */
2616 rdma_stat
2617 rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid)
2618 {
2619 return (rib_clnt_post(conn, cl, msgid));
2620 }
2621
2622 /* Client */
2623 rdma_stat
2624 rib_post_resp_remove(CONN* conn, uint32_t msgid)
2625 {
2626 rib_qp_t *qp = ctoqp(conn);
2627 struct reply *rep;
2628
2629 mutex_enter(&qp->replylist_lock);
2630 for (rep = qp->replylist; rep != NULL; rep = rep->next) {
2631 if (rep->xid == msgid) {
2632 if (rep->vaddr_cq) {
2633 rib_rbuf_free(conn, RECV_BUFFER,
2634 (caddr_t)(uintptr_t)rep->vaddr_cq);
2635 }
2636 (void) rib_remreply(qp, rep);
2637 break;
2638 }
2639 }
2640 mutex_exit(&qp->replylist_lock);
2641
2642 return (RDMA_SUCCESS);
2643 }
2644
2645 /* Server */
2646 rdma_stat
2647 rib_post_recv(CONN *conn, struct clist *cl)
2648 {
2649 rib_qp_t *qp = ctoqp(conn);
2650
2651 if (rib_svc_post(conn, cl) == RDMA_SUCCESS) {
2652 mutex_enter(&qp->posted_rbufs_lock);
2653 qp->n_posted_rbufs++;
2654 mutex_exit(&qp->posted_rbufs_lock);
2655 return (RDMA_SUCCESS);
2656 }
2657 return (RDMA_FAILED);
2658 }
2659
2660 /*
 * Client-side-only interface to "recv" the RPC reply buffer
 * posted earlier by rib_post_resp(conn, cl, msgid).
2663 */
2664 rdma_stat
2665 rib_recv(CONN *conn, struct clist **clp, uint32_t msgid)
2666 {
2667 struct reply *rep = NULL;
2668 clock_t timout, cv_wait_ret;
2669 rdma_stat ret = RDMA_SUCCESS;
2670 rib_qp_t *qp = ctoqp(conn);
2671
2672 /*
2673 * Find the reply structure for this msgid
2674 */
2675 mutex_enter(&qp->replylist_lock);
2676
2677 for (rep = qp->replylist; rep != NULL; rep = rep->next) {
2678 if (rep->xid == msgid)
2679 break;
2680 }
2681
2682 if (rep != NULL) {
2683 /*
2684 * If message not yet received, wait.
2685 */
2686 if (rep->status == (uint_t)REPLY_WAIT) {
2687 timout = ddi_get_lbolt() +
2688 drv_usectohz(REPLY_WAIT_TIME * 1000000);
2689
2690 while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv,
2691 &qp->replylist_lock, timout)) > 0 &&
2692 rep->status == (uint_t)REPLY_WAIT)
2693 ;
2694
2695 switch (cv_wait_ret) {
2696 case -1: /* timeout */
2697 ret = RDMA_TIMEDOUT;
2698 break;
2699 case 0:
2700 ret = RDMA_INTR;
2701 break;
2702 default:
2703 break;
2704 }
2705 }
2706
2707 if (rep->status == RDMA_SUCCESS) {
2708 struct clist *cl = NULL;
2709
2710 /*
2711 * Got message successfully
2712 */
2713 clist_add(&cl, 0, rep->bytes_xfer, NULL,
2714 (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL);
2715 *clp = cl;
2716 } else {
2717 if (rep->status != (uint_t)REPLY_WAIT) {
2718 /*
2719 * Got error in reply message. Free
2720 * recv buffer here.
2721 */
2722 ret = rep->status;
2723 rib_rbuf_free(conn, RECV_BUFFER,
2724 (caddr_t)(uintptr_t)rep->vaddr_cq);
2725 }
2726 }
2727 (void) rib_remreply(qp, rep);
2728 } else {
2729 /*
2730 * No matching reply structure found for given msgid on the
2731 * reply wait list.
2732 */
2733 ret = RDMA_INVAL;
2734 DTRACE_PROBE(rpcib__i__nomatchxid2);
2735 }
2736
2737 /*
2738 * Done.
2739 */
2740 mutex_exit(&qp->replylist_lock);
2741 return (ret);
2742 }
2743
2744 /*
2745 * RDMA write a buffer to the remote address.
2746 */
2747 rdma_stat
2748 rib_write(CONN *conn, struct clist *cl, int wait)
2749 {
2750 ibt_send_wr_t tx_wr;
2751 int cv_sig;
2752 ibt_wr_ds_t sgl[DSEG_MAX];
2753 struct send_wid *wdesc;
2754 ibt_status_t ibt_status;
2755 rdma_stat ret = RDMA_SUCCESS;
2756 rib_qp_t *qp = ctoqp(conn);
2757 uint64_t n_writes = 0;
2758
2759 if (cl == NULL) {
2760 return (RDMA_FAILED);
2761 }
2762
2763 while ((cl != NULL)) {
2764 if (cl->c_len > 0) {
2765 bzero(&tx_wr, sizeof (ibt_send_wr_t));
2766 tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->u.c_daddr;
2767 tx_wr.wr.rc.rcwr.rdma.rdma_rkey =
2768 cl->c_dmemhandle.mrc_rmr; /* rkey */
2769 sgl[0].ds_va = cl->w.c_saddr;
2770 sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2771 sgl[0].ds_len = cl->c_len;
2772
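			/*
			 * Request a completion either when the caller asked
			 * to wait, or after more than max_unsignaled_rws
			 * consecutive unsignaled writes, so that completions
			 * are still reaped periodically on a long stream of
			 * unsignaled writes.
			 */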
2773 if (wait) {
2774 cv_sig = 1;
2775 } else {
2776 if (n_writes > max_unsignaled_rws) {
2777 n_writes = 0;
2778 cv_sig = 1;
2779 } else {
2780 cv_sig = 0;
2781 }
2782 }
2783
2784 if (cv_sig) {
2785 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2786 wdesc = rib_init_sendwait(0, cv_sig, qp);
2787 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2788 mutex_enter(&wdesc->sendwait_lock);
2789 } else {
2790 tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2791 tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
2792 }
2793 tx_wr.wr_opcode = IBT_WRC_RDMAW;
2794 tx_wr.wr_trans = IBT_RC_SRV;
2795 tx_wr.wr_nds = 1;
2796 tx_wr.wr_sgl = sgl;
2797
2798 mutex_enter(&conn->c_lock);
2799 if (conn->c_state == C_CONNECTED) {
2800 ibt_status =
2801 ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2802 }
2803 if (conn->c_state != C_CONNECTED ||
2804 ibt_status != IBT_SUCCESS) {
2805 if (conn->c_state != C_DISCONN_PEND)
2806 conn->c_state = C_ERROR_CONN;
2807 mutex_exit(&conn->c_lock);
2808 if (cv_sig) {
2809 mutex_exit(&wdesc->sendwait_lock);
2810 (void) rib_free_sendwait(wdesc);
2811 }
2812 return (RDMA_CONNLOST);
2813 }
2814
2815 mutex_exit(&conn->c_lock);
2816
2817 /*
2818 * Wait for send to complete
2819 */
2820 if (cv_sig) {
2821
2822 rib_send_hold(qp);
2823 mutex_exit(&wdesc->sendwait_lock);
2824
2825 ret = rib_sendwait(qp, wdesc);
2826 if (ret != 0)
2827 return (ret);
2828 }
			n_writes++;
2830 }
2831 cl = cl->c_next;
2832 }
2833 return (RDMA_SUCCESS);
2834 }
2835
2836 /*
2837 * RDMA Read a buffer from the remote address.
2838 */
2839 rdma_stat
2840 rib_read(CONN *conn, struct clist *cl, int wait)
2841 {
2842 ibt_send_wr_t rx_wr;
2843 int cv_sig = 0;
2844 ibt_wr_ds_t sgl;
2845 struct send_wid *wdesc;
2846 ibt_status_t ibt_status = IBT_SUCCESS;
2847 rdma_stat ret = RDMA_SUCCESS;
2848 rib_qp_t *qp = ctoqp(conn);
2849
2850 if (cl == NULL) {
2851 return (RDMA_FAILED);
2852 }
2853
2854 while (cl != NULL) {
2855 bzero(&rx_wr, sizeof (ibt_send_wr_t));
2856 /*
2857 * Remote address is at the head chunk item in list.
2858 */
2859 rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->w.c_saddr;
2860 rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr;
2861
2862 sgl.ds_va = cl->u.c_daddr;
2863 sgl.ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */
2864 sgl.ds_len = cl->c_len;
2865
2866 /*
2867 * If there are multiple chunks to be read, and
2868 * wait is set, ask for signal only for the last chunk
2869 * and wait only on the last chunk. The completion of
2870 * RDMA_READ on last chunk ensures that reads on all
2871 * previous chunks are also completed.
2872 */
2873 if (wait && (cl->c_next == NULL)) {
2874 cv_sig = 1;
2875 wdesc = rib_init_sendwait(0, cv_sig, qp);
2876 rx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2877 rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2878 mutex_enter(&wdesc->sendwait_lock);
2879 } else {
2880 rx_wr.wr_flags = IBT_WR_NO_FLAGS;
2881 rx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
2882 }
2883 rx_wr.wr_opcode = IBT_WRC_RDMAR;
2884 rx_wr.wr_trans = IBT_RC_SRV;
2885 rx_wr.wr_nds = 1;
2886 rx_wr.wr_sgl = &sgl;
2887
2888 mutex_enter(&conn->c_lock);
2889 if (conn->c_state == C_CONNECTED) {
2890 ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL);
2891 }
2892 if (conn->c_state != C_CONNECTED ||
2893 ibt_status != IBT_SUCCESS) {
2894 if (conn->c_state != C_DISCONN_PEND)
2895 conn->c_state = C_ERROR_CONN;
2896 mutex_exit(&conn->c_lock);
2897 if (wait && (cl->c_next == NULL)) {
2898 mutex_exit(&wdesc->sendwait_lock);
2899 (void) rib_free_sendwait(wdesc);
2900 }
2901 return (RDMA_CONNLOST);
2902 }
2903
2904 mutex_exit(&conn->c_lock);
2905
2906 /*
2907 * Wait for send to complete if this is the
2908 * last item in the list.
2909 */
2910 if (wait && cl->c_next == NULL) {
2911 rib_send_hold(qp);
2912 mutex_exit(&wdesc->sendwait_lock);
2913
2914 ret = rib_sendwait(qp, wdesc);
2915
2916 if (ret != 0)
2917 return (ret);
2918 }
2919 cl = cl->c_next;
2920 }
2921 return (RDMA_SUCCESS);
2922 }
2923
2924 /*
2925 * rib_srv_cm_handler()
2926 * Connection Manager callback to handle RC connection requests.
2927 */
2928 /* ARGSUSED */
2929 static ibt_cm_status_t
2930 rib_srv_cm_handler(void *any, ibt_cm_event_t *event,
2931 ibt_cm_return_args_t *ret_args, void *priv_data,
2932 ibt_priv_data_len_t len)
2933 {
2934 queue_t *q;
2935 rib_qp_t *qp;
2936 rib_hca_t *hca;
2937 rdma_stat status = RDMA_SUCCESS;
2938 int i;
2939 struct clist cl;
2940 rdma_buf_t rdbuf = {0};
2941 void *buf = NULL;
2942 CONN *conn;
2943 ibt_ip_cm_info_t ipinfo;
2944 struct sockaddr_in *s;
2945 struct sockaddr_in6 *s6;
2946 int sin_size = sizeof (struct sockaddr_in);
2947 int in_size = sizeof (struct in_addr);
2948 int sin6_size = sizeof (struct sockaddr_in6);
2949
2950 ASSERT(any != NULL);
2951 ASSERT(event != NULL);
2952
2953 hca = (rib_hca_t *)any;
2954
2955 /* got a connection request */
2956 switch (event->cm_type) {
2957 case IBT_CM_EVENT_REQ_RCV:
2958 /*
2959 * If the plugin is in the NO_ACCEPT state, bail out.
2960 */
2961 mutex_enter(&plugin_state_lock);
2962 if (plugin_state == NO_ACCEPT) {
2963 mutex_exit(&plugin_state_lock);
2964 return (IBT_CM_REJECT);
2965 }
2966 mutex_exit(&plugin_state_lock);
2967
2968 /*
		 * Need to send an MRA MAD to the CM so that it does not
		 * time out on us.
2971 */
2972 (void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id,
2973 event->cm_event.req.req_timeout * 8, NULL, 0);
2974
2975 mutex_enter(&rib_stat->open_hca_lock);
2976 q = rib_stat->q;
2977 mutex_exit(&rib_stat->open_hca_lock);
2978
2979 status = rib_svc_create_chan(hca, (caddr_t)q,
2980 event->cm_event.req.req_prim_hca_port, &qp);
2981
2982 if (status) {
2983 return (IBT_CM_REJECT);
2984 }
2985
2986 ret_args->cm_ret.rep.cm_channel = qp->qp_hdl;
2987 ret_args->cm_ret.rep.cm_rdma_ra_out = 4;
2988 ret_args->cm_ret.rep.cm_rdma_ra_in = 4;
2989 ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES;
2990
2991 /*
2992 * Pre-posts RECV buffers
2993 */
2994 conn = qptoc(qp);
2995 for (i = 0; i < preposted_rbufs; i++) {
2996 bzero(&rdbuf, sizeof (rdbuf));
2997 rdbuf.type = RECV_BUFFER;
2998 buf = rib_rbuf_alloc(conn, &rdbuf);
2999 if (buf == NULL) {
3000 /*
3001 * A connection is not established yet.
3002 * Just flush the channel. Buffers
3003 * posted till now will error out with
3004 * IBT_WC_WR_FLUSHED_ERR.
3005 */
3006 (void) ibt_flush_channel(qp->qp_hdl);
3007 (void) rib_disconnect_channel(conn, NULL);
3008 return (IBT_CM_REJECT);
3009 }
3010
3011 bzero(&cl, sizeof (cl));
3012 cl.w.c_saddr3 = (caddr_t)rdbuf.addr;
3013 cl.c_len = rdbuf.len;
3014 cl.c_smemhandle.mrc_lmr =
3015 rdbuf.handle.mrc_lmr; /* lkey */
3016 cl.c_next = NULL;
3017 status = rib_post_recv(conn, &cl);
3018 if (status != RDMA_SUCCESS) {
3019 /*
3020 * A connection is not established yet.
3021 * Just flush the channel. Buffers
3022 * posted till now will error out with
3023 * IBT_WC_WR_FLUSHED_ERR.
3024 */
3025 (void) ibt_flush_channel(qp->qp_hdl);
3026 (void) rib_disconnect_channel(conn, NULL);
3027 return (IBT_CM_REJECT);
3028 }
3029 }
3030 (void) rib_add_connlist(conn, &hca->srv_conn_list);
3031
3032 /*
3033 * Get the address translation
3034 */
3035 rw_enter(&hca->state_lock, RW_READER);
3036 if (hca->state == HCA_DETACHED) {
3037 rw_exit(&hca->state_lock);
3038 return (IBT_CM_REJECT);
3039 }
3040 rw_exit(&hca->state_lock);
3041
3042 bzero(&ipinfo, sizeof (ibt_ip_cm_info_t));
3043
3044 if (ibt_get_ip_data(event->cm_priv_data_len,
3045 event->cm_priv_data,
3046 &ipinfo) != IBT_SUCCESS) {
3047
3048 return (IBT_CM_REJECT);
3049 }
3050
3051 switch (ipinfo.src_addr.family) {
3052 case AF_INET:
3053
3054 conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1,
3055 KM_SLEEP);
3056 (void) strcpy(conn->c_netid, RIBNETID_TCP);
3057
3058 conn->c_raddr.maxlen =
3059 conn->c_raddr.len = sin_size;
3060 conn->c_raddr.buf = kmem_zalloc(sin_size, KM_SLEEP);
3061
3062 s = (struct sockaddr_in *)conn->c_raddr.buf;
3063 s->sin_family = AF_INET;
3064 bcopy((void *)&ipinfo.src_addr.un.ip4addr,
3065 &s->sin_addr, in_size);
3066
3067 conn->c_laddr.maxlen =
3068 conn->c_laddr.len = sin_size;
3069 conn->c_laddr.buf = kmem_zalloc(sin_size, KM_SLEEP);
3070
3071 s = (struct sockaddr_in *)conn->c_laddr.buf;
3072 s->sin_family = AF_INET;
3073 bcopy((void *)&ipinfo.dst_addr.un.ip4addr,
3074 &s->sin_addr, in_size);
3075
3076 conn->c_addrmask.maxlen = conn->c_addrmask.len =
3077 sizeof (struct sockaddr_in);
3078 conn->c_addrmask.buf =
3079 kmem_zalloc(conn->c_addrmask.len, KM_SLEEP);
3080 ((struct sockaddr_in *)
3081 conn->c_addrmask.buf)->sin_addr.s_addr =
3082 (uint32_t)~0;
3083 ((struct sockaddr_in *)
3084 conn->c_addrmask.buf)->sin_family =
3085 (sa_family_t)~0;
3086 break;
3087
3088 case AF_INET6:
3089
3090 conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1,
3091 KM_SLEEP);
3092 (void) strcpy(conn->c_netid, RIBNETID_TCP6);
3093
3094 conn->c_raddr.maxlen =
3095 conn->c_raddr.len = sin6_size;
3096 conn->c_raddr.buf = kmem_zalloc(sin6_size, KM_SLEEP);
3097
3098 s6 = (struct sockaddr_in6 *)conn->c_raddr.buf;
3099 s6->sin6_family = AF_INET6;
3100 bcopy((void *)&ipinfo.src_addr.un.ip6addr,
3101 &s6->sin6_addr,
3102 sizeof (struct in6_addr));
3103
3104 conn->c_laddr.maxlen =
3105 conn->c_laddr.len = sin6_size;
3106 conn->c_laddr.buf = kmem_zalloc(sin6_size, KM_SLEEP);
3107
3108 s6 = (struct sockaddr_in6 *)conn->c_laddr.buf;
3109 s6->sin6_family = AF_INET6;
3110 bcopy((void *)&ipinfo.dst_addr.un.ip6addr,
3111 &s6->sin6_addr,
3112 sizeof (struct in6_addr));
3113
3114 conn->c_addrmask.maxlen = conn->c_addrmask.len =
3115 sizeof (struct sockaddr_in6);
3116 conn->c_addrmask.buf =
3117 kmem_zalloc(conn->c_addrmask.len, KM_SLEEP);
3118 (void) memset(&((struct sockaddr_in6 *)
3119 conn->c_addrmask.buf)->sin6_addr, (uchar_t)~0,
3120 sizeof (struct in6_addr));
3121 ((struct sockaddr_in6 *)
3122 conn->c_addrmask.buf)->sin6_family =
3123 (sa_family_t)~0;
3124 break;
3125
3126 default:
3127 return (IBT_CM_REJECT);
3128 }
3129
3130 break;
3131
3132 case IBT_CM_EVENT_CONN_CLOSED:
3133 {
3134 CONN *conn;
3135 rib_qp_t *qp;
3136
3137 switch (event->cm_event.closed) {
3138 case IBT_CM_CLOSED_DREP_RCVD:
3139 case IBT_CM_CLOSED_DREQ_TIMEOUT:
3140 case IBT_CM_CLOSED_DUP:
3141 case IBT_CM_CLOSED_ABORT:
3142 case IBT_CM_CLOSED_ALREADY:
3143 /*
3144 * These cases indicate the local end initiated
3145 * the closing of the channel. Nothing to do here.
3146 */
3147 break;
3148 default:
3149 /*
3150 * Reason for CONN_CLOSED event must be one of
3151 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
			 * or IBT_CM_CLOSED_STALE. These indicate cases where
			 * the remote end is closing the channel. In these
			 * cases free the channel and transition to the error
			 * state.
3156 */
3157 qp = ibt_get_chan_private(event->cm_channel);
3158 conn = qptoc(qp);
3159 mutex_enter(&conn->c_lock);
3160 if (conn->c_state == C_DISCONN_PEND) {
3161 mutex_exit(&conn->c_lock);
3162 break;
3163 }
3164 conn->c_state = C_ERROR_CONN;
3165
3166 /*
3167 * Free the conn if c_ref goes down to 0
3168 */
3169 if (conn->c_ref == 0) {
3170 /*
3171 * Remove from list and free conn
3172 */
3173 conn->c_state = C_DISCONN_PEND;
3174 mutex_exit(&conn->c_lock);
3175 (void) rib_disconnect_channel(conn,
3176 &hca->srv_conn_list);
3177 } else {
3178 /*
3179 * conn will be freed when c_ref goes to 0.
3180 * Indicate to cleaning thread not to close
3181 * the connection, but just free the channel.
3182 */
3183 conn->c_flags |= C_CLOSE_NOTNEEDED;
3184 mutex_exit(&conn->c_lock);
3185 }
3186 DTRACE_PROBE(rpcib__i__srvcm_chandisconnect);
3187 break;
3188 }
3189 break;
3190 }
3191 case IBT_CM_EVENT_CONN_EST:
3192 /*
3193 * RTU received, hence connection established.
3194 */
3195 if (rib_debug > 1)
3196 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3197 "(CONN_EST) channel established");
3198 break;
3199
3200 default:
3201 if (rib_debug > 2) {
3202 /* Let CM handle the following events. */
3203 if (event->cm_type == IBT_CM_EVENT_REP_RCV) {
3204 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3205 "server recv'ed IBT_CM_EVENT_REP_RCV\n");
3206 } else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) {
3207 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3208 "server recv'ed IBT_CM_EVENT_LAP_RCV\n");
3209 } else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) {
3210 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3211 "server recv'ed IBT_CM_EVENT_MRA_RCV\n");
3212 } else if (event->cm_type == IBT_CM_EVENT_APR_RCV) {
3213 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3214 "server recv'ed IBT_CM_EVENT_APR_RCV\n");
3215 } else if (event->cm_type == IBT_CM_EVENT_FAILURE) {
3216 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3217 "server recv'ed IBT_CM_EVENT_FAILURE\n");
3218 }
3219 }
3220 return (IBT_CM_DEFAULT);
3221 }
3222
3223 /* accept all other CM messages (i.e. let the CM handle them) */
3224 return (IBT_CM_ACCEPT);
3225 }
3226
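/*
 * Register the given service type with the CM, using an IP service id
 * derived from protocol_num and dst_port, and bind it on the active
 * ports of this HCA. Returns RDMA_SUCCESS if at least one binding
 * succeeded.
 */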
3227 static rdma_stat
3228 rib_register_service(rib_hca_t *hca, int service_type,
3229 uint8_t protocol_num, in_port_t dst_port)
3230 {
3231 ibt_srv_desc_t sdesc;
3232 ibt_hca_portinfo_t *port_infop;
3233 ib_svc_id_t srv_id;
3234 ibt_srv_hdl_t srv_hdl;
3235 uint_t port_size;
3236 uint_t pki, i, num_ports, nbinds;
3237 ibt_status_t ibt_status;
3238 rib_service_t *service;
3239 ib_pkey_t pkey;
3240
3241 /*
3242 * Query all ports for the given HCA
3243 */
3244 rw_enter(&hca->state_lock, RW_READER);
3245 if (hca->state != HCA_DETACHED) {
3246 ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop,
3247 &num_ports, &port_size);
3248 rw_exit(&hca->state_lock);
3249 } else {
3250 rw_exit(&hca->state_lock);
3251 return (RDMA_FAILED);
3252 }
3253 if (ibt_status != IBT_SUCCESS) {
3254 return (RDMA_FAILED);
3255 }
3256
3257 DTRACE_PROBE1(rpcib__i__regservice_numports,
3258 int, num_ports);
3259
3260 for (i = 0; i < num_ports; i++) {
3261 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) {
3262 DTRACE_PROBE1(rpcib__i__regservice__portinactive,
3263 int, i+1);
		} else {
3265 DTRACE_PROBE1(rpcib__i__regservice__portactive,
3266 int, i+1);
3267 }
3268 }
3269
3270 /*
3271 * Get all the IP addresses on this system to register the
3272 * given "service type" on all DNS recognized IP addrs.
	 * Each service type such as NFS will have all the system's
	 * IP addresses as its different names. For now the only
3275 * type of service we support in RPCIB is NFS.
3276 */
3277 rw_enter(&rib_stat->service_list_lock, RW_WRITER);
3278 /*
	 * Start registering and binding the service on the
	 * active ports of this HCA.
3281 */
3282 nbinds = 0;
3283 for (service = rib_stat->service_list;
3284 service && (service->srv_type != service_type);
3285 service = service->next)
3286 ;
3287
3288 if (service == NULL) {
3289 /*
3290 * We use IP addresses as the service names for
3291 * service registration. Register each of them
3292 * with CM to obtain a svc_id and svc_hdl. We do not
		 * register the service with the machine's loopback address.
3294 */
3295 (void) bzero(&srv_id, sizeof (ib_svc_id_t));
3296 (void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t));
3297 (void) bzero(&sdesc, sizeof (ibt_srv_desc_t));
3298 sdesc.sd_handler = rib_srv_cm_handler;
3299 sdesc.sd_flags = 0;
3300 ibt_status = ibt_register_service(hca->ibt_clnt_hdl,
3301 &sdesc, ibt_get_ip_sid(protocol_num, dst_port),
3302 1, &srv_hdl, &srv_id);
3303 if ((ibt_status != IBT_SUCCESS) &&
3304 (ibt_status != IBT_CM_SERVICE_EXISTS)) {
3305 rw_exit(&rib_stat->service_list_lock);
3306 DTRACE_PROBE1(rpcib__i__regservice__ibtres,
3307 int, ibt_status);
3308 ibt_free_portinfo(port_infop, port_size);
3309 return (RDMA_FAILED);
3310 }
3311
3312 /*
3313 * Allocate and prepare a service entry
3314 */
3315 service = kmem_zalloc(sizeof (rib_service_t), KM_SLEEP);
3316
3317 service->srv_type = service_type;
3318 service->srv_hdl = srv_hdl;
3319 service->srv_id = srv_id;
3320
3321 service->next = rib_stat->service_list;
3322 rib_stat->service_list = service;
3323 DTRACE_PROBE1(rpcib__i__regservice__new__service,
3324 int, service->srv_type);
3325 } else {
3326 srv_hdl = service->srv_hdl;
3327 srv_id = service->srv_id;
3328 DTRACE_PROBE1(rpcib__i__regservice__existing__service,
3329 int, service->srv_type);
3330 }
3331
3332 for (i = 0; i < num_ports; i++) {
3333 ibt_sbind_hdl_t sbp;
3334 rib_hca_service_t *hca_srv;
3335 ib_gid_t gid;
3336
3337 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE)
3338 continue;
3339
3340 for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) {
3341 pkey = port_infop[i].p_pkey_tbl[pki];
3342
3343 rw_enter(&hca->bound_services_lock, RW_READER);
3344 gid = port_infop[i].p_sgid_tbl[0];
3345 for (hca_srv = hca->bound_services; hca_srv;
3346 hca_srv = hca_srv->next) {
3347 if ((hca_srv->srv_id == service->srv_id) &&
3348 (hca_srv->gid.gid_prefix ==
3349 gid.gid_prefix) &&
3350 (hca_srv->gid.gid_guid == gid.gid_guid))
3351 break;
3352 }
3353 rw_exit(&hca->bound_services_lock);
3354 if (hca_srv != NULL) {
3355 /*
				 * port is already bound to the service
3357 */
3358 DTRACE_PROBE1(
3359 rpcib__i__regservice__already__bound,
3360 int, i+1);
3361 nbinds++;
3362 continue;
3363 }
3364
3365 if ((pkey & IBSRM_HB) &&
3366 (pkey != IB_PKEY_INVALID_FULL)) {
3367
3368 sbp = NULL;
3369 ibt_status = ibt_bind_service(srv_hdl,
3370 gid, NULL, hca, &sbp);
3371
3372 if (ibt_status == IBT_SUCCESS) {
3373 hca_srv = kmem_zalloc(
3374 sizeof (rib_hca_service_t),
3375 KM_SLEEP);
3376 hca_srv->srv_id = srv_id;
3377 hca_srv->gid = gid;
3378 hca_srv->sbind_hdl = sbp;
3379
3380 rw_enter(&hca->bound_services_lock,
3381 RW_WRITER);
3382 hca_srv->next = hca->bound_services;
3383 hca->bound_services = hca_srv;
3384 rw_exit(&hca->bound_services_lock);
3385 nbinds++;
3386 }
3387
3388 DTRACE_PROBE1(rpcib__i__regservice__bindres,
3389 int, ibt_status);
3390 }
3391 }
3392 }
3393 rw_exit(&rib_stat->service_list_lock);
3394
3395 ibt_free_portinfo(port_infop, port_size);
3396
3397 if (nbinds == 0) {
3398 return (RDMA_FAILED);
3399 } else {
3400 /*
		 * Put this plugin into the ACCEPT state, since at least
3402 * one registration was successful.
3403 */
3404 mutex_enter(&plugin_state_lock);
3405 plugin_state = ACCEPT;
3406 mutex_exit(&plugin_state_lock);
3407 return (RDMA_SUCCESS);
3408 }
3409 }
3410
3411 void
3412 rib_listen(struct rdma_svc_data *rd)
3413 {
3414 rdma_stat status;
3415 int n_listening = 0;
3416 rib_hca_t *hca;
3417
3418 mutex_enter(&rib_stat->listen_lock);
3419 /*
	 * If the rd parameter is NULL, rib_stat->q has already been
	 * initialized by an earlier call from RDMA, and we just want to
	 * add a newly attached HCA to the same listening state as the
	 * other HCAs.
3424 */
3425 if (rd == NULL) {
3426 if (rib_stat->q == NULL) {
3427 mutex_exit(&rib_stat->listen_lock);
3428 return;
3429 }
3430 } else {
3431 rib_stat->q = &rd->q;
3432 }
3433 rw_enter(&rib_stat->hcas_list_lock, RW_READER);
3434 for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
3435 /*
3436 * First check if a hca is still attached
3437 */
3438 rw_enter(&hca->state_lock, RW_READER);
3439 if (hca->state != HCA_INITED) {
3440 rw_exit(&hca->state_lock);
3441 continue;
3442 }
3443 rw_exit(&hca->state_lock);
3444
3445 /*
		 * Right now the only service type is NFS, so that value
		 * is hard-coded here. Ideally the service type should
		 * be passed down in rdma_svc_data.
3450 */
3451 status = rib_register_service(hca, NFS,
3452 IPPROTO_TCP, nfs_rdma_port);
3453 if (status == RDMA_SUCCESS)
3454 n_listening++;
3455 }
3456 rw_exit(&rib_stat->hcas_list_lock);
3457
3458 /*
	 * Report whether the service is active on at least one HCA;
	 * rd->err_code carries a more detailed error on failure.
3461 */
3462 if (rd) {
3463 if (n_listening > 0) {
3464 rd->active = 1;
3465 rd->err_code = RDMA_SUCCESS;
3466 } else {
3467 rd->active = 0;
3468 rd->err_code = RDMA_FAILED;
3469 }
3470 }
3471 mutex_exit(&rib_stat->listen_lock);
3472 }
3473
3474 /* XXXX */
3475 /* ARGSUSED */
3476 static void
3477 rib_listen_stop(struct rdma_svc_data *svcdata)
3478 {
3479 rib_hca_t *hca;
3480
3481 mutex_enter(&rib_stat->listen_lock);
3482 /*
	 * KRPC called the RDMATF to stop the listeners. This means we stop
	 * sending incoming or received requests to the KRPC master
	 * transport handle for RDMA-IB. It also means that the master
	 * transport handle, responsible for us, is going away.
3487 */
3488 mutex_enter(&plugin_state_lock);
3489 plugin_state = NO_ACCEPT;
3490 if (svcdata != NULL)
3491 svcdata->active = 0;
3492 mutex_exit(&plugin_state_lock);
3493
3494 rw_enter(&rib_stat->hcas_list_lock, RW_READER);
3495 for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
3496 /*
3497 * First check if a hca is still attached
3498 */
3499 rw_enter(&hca->state_lock, RW_READER);
3500 if (hca->state == HCA_DETACHED) {
3501 rw_exit(&hca->state_lock);
3502 continue;
3503 }
3504 rib_close_channels(&hca->srv_conn_list);
3505 rib_stop_services(hca);
3506 rw_exit(&hca->state_lock);
3507 }
3508 rw_exit(&rib_stat->hcas_list_lock);
3509
3510 /*
3511 * Avoid rib_listen() using the stale q field.
3512 * This could happen if a port goes up after all services
3513 * are already unregistered.
3514 */
3515 rib_stat->q = NULL;
3516 mutex_exit(&rib_stat->listen_lock);
3517 }
3518
3519 /*
3520 * Traverse the HCA's service list to unbind and deregister services.
3521 * For each bound service of HCA to be removed, first find the corresponding
3522 * service handle (srv_hdl) and then unbind the service by calling
3523 * ibt_unbind_service().
3524 */
3525 static void
3526 rib_stop_services(rib_hca_t *hca)
3527 {
3528 rib_hca_service_t *srv_list, *to_remove;
3529
3530 /*
3531 * unbind and deregister the services for this service type.
	 * Right now there is only one service type. In the future it will
3533 * be passed down to this function.
3534 */
3535 rw_enter(&hca->bound_services_lock, RW_READER);
3536 srv_list = hca->bound_services;
3537 hca->bound_services = NULL;
3538 rw_exit(&hca->bound_services_lock);
3539
3540 while (srv_list != NULL) {
3541 rib_service_t *sc;
3542
3543 to_remove = srv_list;
3544 srv_list = to_remove->next;
3545 rw_enter(&rib_stat->service_list_lock, RW_READER);
3546 for (sc = rib_stat->service_list;
3547 sc && (sc->srv_id != to_remove->srv_id);
3548 sc = sc->next)
3549 ;
3550 /*
		 * If sc is NULL then the service no longer exists,
		 * probably having been removed entirely through rib_stat.
3553 */
3554 if (sc != NULL)
3555 (void) ibt_unbind_service(sc->srv_hdl,
3556 to_remove->sbind_hdl);
3557 rw_exit(&rib_stat->service_list_lock);
3558 kmem_free(to_remove, sizeof (rib_hca_service_t));
3559 }
3560 }
3561
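/*
 * Allocate a work-id descriptor for a receive buffer posted on a
 * server-side QP.
 */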
3562 static struct svc_recv *
3563 rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl)
3564 {
3565 struct svc_recv *recvp;
3566
3567 recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP);
3568 recvp->vaddr = sgl->ds_va;
3569 recvp->qp = qp;
3570 recvp->bytes_xfer = 0;
3571 return (recvp);
3572 }
3573
3574 static int
3575 rib_free_svc_recv(struct svc_recv *recvp)
3576 {
3577 kmem_free(recvp, sizeof (*recvp));
3578
3579 return (0);
3580 }
3581
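/*
 * Allocate a reply-wait entry for msgid and link it at the head of the
 * QP's reply list. rib_recv() later waits on this entry for the reply
 * completion.
 */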
3582 static struct reply *
3583 rib_addreplylist(rib_qp_t *qp, uint32_t msgid)
3584 {
3585 struct reply *rep;
3586
3587
3588 rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP);
3589 if (rep == NULL) {
3590 DTRACE_PROBE(rpcib__i__addrreply__nomem);
3591 return (NULL);
3592 }
3593 rep->xid = msgid;
3594 rep->vaddr_cq = NULL;
3595 rep->bytes_xfer = 0;
3596 rep->status = (uint_t)REPLY_WAIT;
3597 rep->prev = NULL;
3598 cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL);
3599
3600 mutex_enter(&qp->replylist_lock);
3601 if (qp->replylist) {
3602 rep->next = qp->replylist;
3603 qp->replylist->prev = rep;
3604 }
3605 qp->rep_list_size++;
3606
3607 DTRACE_PROBE1(rpcib__i__addrreply__listsize,
3608 int, qp->rep_list_size);
3609
3610 qp->replylist = rep;
3611 mutex_exit(&qp->replylist_lock);
3612
3613 return (rep);
3614 }
3615
3616 static rdma_stat
3617 rib_rem_replylist(rib_qp_t *qp)
3618 {
3619 struct reply *r, *n;
3620
3621 mutex_enter(&qp->replylist_lock);
3622 for (r = qp->replylist; r != NULL; r = n) {
3623 n = r->next;
3624 (void) rib_remreply(qp, r);
3625 }
3626 mutex_exit(&qp->replylist_lock);
3627
3628 return (RDMA_SUCCESS);
3629 }
3630
3631 static int
3632 rib_remreply(rib_qp_t *qp, struct reply *rep)
3633 {
3634
3635 ASSERT(MUTEX_HELD(&qp->replylist_lock));
3636 if (rep->prev) {
3637 rep->prev->next = rep->next;
3638 }
3639 if (rep->next) {
3640 rep->next->prev = rep->prev;
3641 }
3642 if (qp->replylist == rep)
3643 qp->replylist = rep->next;
3644
3645 cv_destroy(&rep->wait_cv);
3646 qp->rep_list_size--;
3647
3648 DTRACE_PROBE1(rpcib__i__remreply__listsize,
3649 int, qp->rep_list_size);
3650
3651 kmem_free(rep, sizeof (*rep));
3652
3653 return (0);
3654 }
3655
3656 rdma_stat
3657 rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen,
3658 struct mrc *buf_handle)
3659 {
3660 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */
3661 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */
3662 rdma_stat status;
3663 rib_hca_t *hca = (ctoqp(conn))->hca;
3664
3665 /*
3666 * Note: ALL buffer pools use the same memory type RDMARW.
3667 */
3668 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
3669 if (status == RDMA_SUCCESS) {
3670 buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3671 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3672 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3673 } else {
3674 buf_handle->mrc_linfo = NULL;
3675 buf_handle->mrc_lmr = 0;
3676 buf_handle->mrc_rmr = 0;
3677 }
3678 return (status);
3679 }
3680
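/*
 * Register a memory region with the HCA's protection domain, enabling
 * local write, remote read/write and window binding, plus any
 * caller-specified flags in 'spec'.
 */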
3681 static rdma_stat
3682 rib_reg_mem(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size,
3683 ibt_mr_flags_t spec,
3684 ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp)
3685 {
3686 ibt_mr_attr_t mem_attr;
3687 ibt_status_t ibt_status;
3688 mem_attr.mr_vaddr = (uintptr_t)buf;
3689 mem_attr.mr_len = (ib_msglen_t)size;
3690 mem_attr.mr_as = (struct as *)(caddr_t)adsp;
3691 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE |
3692 IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE |
3693 IBT_MR_ENABLE_WINDOW_BIND | spec;
3694
3695 rw_enter(&hca->state_lock, RW_READER);
3696 if (hca->state != HCA_DETACHED) {
3697 ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl,
3698 &mem_attr, mr_hdlp, mr_descp);
3699 rw_exit(&hca->state_lock);
3700 } else {
3701 rw_exit(&hca->state_lock);
3702 return (RDMA_FAILED);
3703 }
3704
3705 if (ibt_status != IBT_SUCCESS) {
3706 return (RDMA_FAILED);
3707 }
3708 return (RDMA_SUCCESS);
3709 }
3710
3711 rdma_stat
3712 rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen,
3713 struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc)
3714 {
3715 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */
3716 rib_lrc_entry_t *l;
3717 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */
3718 rdma_stat status;
3719 rib_hca_t *hca = (ctoqp(conn))->hca;
3720
3721 /*
3722 * Non-coherent memory registration.
3723 */
3724 l = (rib_lrc_entry_t *)lrc;
3725 if (l) {
3726 if (l->registered) {
3727 buf_handle->mrc_linfo =
3728 (uintptr_t)l->lrc_mhandle.mrc_linfo;
3729 buf_handle->mrc_lmr =
3730 (uint32_t)l->lrc_mhandle.mrc_lmr;
3731 buf_handle->mrc_rmr =
3732 (uint32_t)l->lrc_mhandle.mrc_rmr;
3733 *sync_handle = (RIB_SYNCMEM_HANDLE)
3734 (uintptr_t)l->lrc_mhandle.mrc_linfo;
3735 return (RDMA_SUCCESS);
3736 } else {
3737 /* Always register the whole buffer */
3738 buf = (caddr_t)l->lrc_buf;
3739 buflen = l->lrc_len;
3740 }
3741 }
3742 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
3743
3744 if (status == RDMA_SUCCESS) {
3745 if (l) {
3746 l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl;
3747 l->lrc_mhandle.mrc_lmr = (uint32_t)mr_desc.md_lkey;
3748 l->lrc_mhandle.mrc_rmr = (uint32_t)mr_desc.md_rkey;
3749 l->registered = TRUE;
3750 }
3751 buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3752 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3753 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3754 *sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl;
3755 } else {
3756 buf_handle->mrc_linfo = NULL;
3757 buf_handle->mrc_lmr = 0;
3758 buf_handle->mrc_rmr = 0;
3759 }
3760 return (status);
3761 }
3762
3763 /* ARGSUSED */
3764 rdma_stat
3765 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle)
3766 {
3767 rib_hca_t *hca = (ctoqp(conn))->hca;
3768 /*
3769 * Allow memory deregistration even if HCA is
3770 * getting detached. Need all outstanding
3771 * memory registrations to be deregistered
3772 * before HCA_DETACH_EVENT can be accepted.
3773 */
3774 (void) ibt_deregister_mr(hca->hca_hdl,
3775 (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
3776 return (RDMA_SUCCESS);
3777 }
3778
3779 /* ARGSUSED */
3780 rdma_stat
3781 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle,
3782 RIB_SYNCMEM_HANDLE sync_handle, void *lrc)
3783 {
3784 rib_lrc_entry_t *l;
3785 l = (rib_lrc_entry_t *)lrc;
3786 if (l)
3787 if (l->registered)
3788 return (RDMA_SUCCESS);
3789
3790 (void) rib_deregistermem(conn, buf, buf_handle);
3791
3792 return (RDMA_SUCCESS);
3793 }
3794
3795 /* ARGSUSED */
3796 rdma_stat
3797 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf,
3798 int len, int cpu)
3799 {
3800 ibt_status_t status;
3801 rib_hca_t *hca = (ctoqp(conn))->hca;
3802 ibt_mr_sync_t mr_segment;
3803
3804 mr_segment.ms_handle = (ibt_mr_hdl_t)shandle;
3805 mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf;
3806 mr_segment.ms_len = (ib_memlen_t)len;
3807 if (cpu) {
3808 /* make incoming data visible to memory */
3809 mr_segment.ms_flags = IBT_SYNC_WRITE;
3810 } else {
3811 /* make memory changes visible to IO */
3812 mr_segment.ms_flags = IBT_SYNC_READ;
3813 }
3814 rw_enter(&hca->state_lock, RW_READER);
3815 if (hca->state != HCA_DETACHED) {
3816 status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1);
3817 rw_exit(&hca->state_lock);
3818 } else {
3819 rw_exit(&hca->state_lock);
3820 return (RDMA_FAILED);
3821 }
3822
3823 if (status == IBT_SUCCESS)
3824 return (RDMA_SUCCESS);
3825 else {
3826 return (RDMA_FAILED);
3827 }
3828 }
3829
3830 /*
3831 * XXXX ????
3832 */
3833 static rdma_stat
3834 rib_getinfo(rdma_info_t *info)
3835 {
3836 /*
3837 * XXXX Hack!
3838 */
3839 info->addrlen = 16;
3840 info->mts = 1000000;
3841 info->mtu = 1000000;
3842
3843 return (RDMA_SUCCESS);
3844 }
3845
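/*
 * Create a pool of 'num' pre-registered buffers of the given type:
 * RPC_MSG_SZ-sized SEND_BUFFERs or RPC_BUF_SIZE-sized RECV_BUFFERs.
 * Each buffer is registered with the HCA so that rib_rbuf_alloc() can
 * hand out its lkey/rkey along with the buffer.
 */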
3846 rib_bufpool_t *
3847 rib_rbufpool_create(rib_hca_t *hca, int ptype, int num)
3848 {
3849 rib_bufpool_t *rbp = NULL;
3850 bufpool_t *bp = NULL;
3851 caddr_t buf;
3852 ibt_mr_attr_t mem_attr;
3853 ibt_status_t ibt_status;
3854 int i, j;
3855
3856 rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP);
3857
3858 bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) +
3859 num * sizeof (void *), KM_SLEEP);
3860
3861 mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock);
3862 bp->numelems = num;
3863
3864
3865 switch (ptype) {
3866 case SEND_BUFFER:
3867 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3868 bp->rsize = RPC_MSG_SZ;
3869 break;
3870 case RECV_BUFFER:
3871 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3872 bp->rsize = RPC_BUF_SIZE;
3873 break;
3874 default:
3875 goto fail;
3876 }
3877
3878 /*
3879 * Register the pool.
3880 */
3881 bp->bufsize = num * bp->rsize;
3882 bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP);
3883 rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num *
3884 sizeof (ibt_mr_hdl_t), KM_SLEEP);
3885 rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num *
3886 sizeof (ibt_mr_desc_t), KM_SLEEP);
3887 rw_enter(&hca->state_lock, RW_READER);
3888
3889 if (hca->state == HCA_DETACHED) {
3890 rw_exit(&hca->state_lock);
3891 goto fail;
3892 }
3893
3894 for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) {
3895 bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t));
3896 mem_attr.mr_vaddr = (uintptr_t)buf;
3897 mem_attr.mr_len = (ib_msglen_t)bp->rsize;
3898 mem_attr.mr_as = NULL;
3899 ibt_status = ibt_register_mr(hca->hca_hdl,
3900 hca->pd_hdl, &mem_attr,
3901 &rbp->mr_hdl[i],
3902 &rbp->mr_desc[i]);
3903 if (ibt_status != IBT_SUCCESS) {
3904 for (j = 0; j < i; j++) {
3905 (void) ibt_deregister_mr(hca->hca_hdl,
3906 rbp->mr_hdl[j]);
3907 }
3908 rw_exit(&hca->state_lock);
3909 goto fail;
3910 }
3911 }
3912 rw_exit(&hca->state_lock);
3913 buf = (caddr_t)bp->buf;
3914 for (i = 0; i < num; i++, buf += bp->rsize) {
3915 bp->buflist[i] = (void *)buf;
3916 }
3917 bp->buffree = num - 1; /* no. of free buffers */
3918 rbp->bpool = bp;
3919
3920 return (rbp);
3921 fail:
3922 if (bp) {
3923 if (bp->buf)
3924 kmem_free(bp->buf, bp->bufsize);
3925 kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *));
3926 }
3927 if (rbp) {
3928 if (rbp->mr_hdl)
3929 kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t));
3930 if (rbp->mr_desc)
3931 kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t));
3932 kmem_free(rbp, sizeof (rib_bufpool_t));
3933 }
3934 return (NULL);
3935 }
3936
3937 static void
3938 rib_rbufpool_deregister(rib_hca_t *hca, int ptype)
3939 {
3940 int i;
3941 rib_bufpool_t *rbp = NULL;
3942 bufpool_t *bp;
3943
3944 /*
3945 * Obtain pool address based on type of pool
3946 */
3947 switch (ptype) {
3948 case SEND_BUFFER:
3949 rbp = hca->send_pool;
3950 break;
3951 case RECV_BUFFER:
3952 rbp = hca->recv_pool;
3953 break;
3954 default:
3955 return;
3956 }
3957 if (rbp == NULL)
3958 return;
3959
3960 bp = rbp->bpool;
3961
3962 /*
	 * Deregister the pool memory; the buffers themselves are freed
	 * later by rib_rbufpool_free().
3964 */
3965 for (i = 0; i < bp->numelems; i++) {
3966 (void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]);
3967 }
3968 }
3969
3970 static void
3971 rib_rbufpool_free(rib_hca_t *hca, int ptype)
3972 {
3973
3974 rib_bufpool_t *rbp = NULL;
3975 bufpool_t *bp;
3976
3977 /*
3978 * Obtain pool address based on type of pool
3979 */
3980 switch (ptype) {
3981 case SEND_BUFFER:
3982 rbp = hca->send_pool;
3983 break;
3984 case RECV_BUFFER:
3985 rbp = hca->recv_pool;
3986 break;
3987 default:
3988 return;
3989 }
3990 if (rbp == NULL)
3991 return;
3992
3993 bp = rbp->bpool;
3994
3995 /*
3996 * Free the pool memory.
3997 */
3998 if (rbp->mr_hdl)
3999 kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t));
4000
4001 if (rbp->mr_desc)
4002 kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t));
4003 if (bp->buf)
4004 kmem_free(bp->buf, bp->bufsize);
4005 mutex_destroy(&bp->buflock);
4006 kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *));
4007 kmem_free(rbp, sizeof (rib_bufpool_t));
4008 }
4009
4010 void
4011 rib_rbufpool_destroy(rib_hca_t *hca, int ptype)
4012 {
4013 /*
4014 * Deregister the pool memory and free it.
4015 */
4016 rib_rbufpool_deregister(hca, ptype);
4017 rib_rbufpool_free(hca, ptype);
4018 }
4019
4020 /*
4021 * Fetch a buffer from the pool of type specified in rdbuf->type.
4022 */
4023 static rdma_stat
4024 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf)
4025 {
4026 rib_lrc_entry_t *rlep;
4027
4028 if (rdbuf->type == RDMA_LONG_BUFFER) {
4029 rlep = rib_get_cache_buf(conn, rdbuf->len);
4030 rdbuf->rb_private = (caddr_t)rlep;
4031 rdbuf->addr = rlep->lrc_buf;
4032 rdbuf->handle = rlep->lrc_mhandle;
4033 return (RDMA_SUCCESS);
4034 }
4035
4036 rdbuf->addr = rib_rbuf_alloc(conn, rdbuf);
4037 if (rdbuf->addr) {
4038 switch (rdbuf->type) {
4039 case SEND_BUFFER:
4040 rdbuf->len = RPC_MSG_SZ; /* 1K */
4041 break;
4042 case RECV_BUFFER:
4043 rdbuf->len = RPC_BUF_SIZE; /* 2K */
4044 break;
4045 default:
4046 rdbuf->len = 0;
4047 }
4048 return (RDMA_SUCCESS);
4049 } else
4050 return (RDMA_FAILED);
4051 }
4052
4053 /*
4054 * Fetch a buffer of specified type.
4055 * Note that rdbuf->handle is mw's rkey.
4056 */
4057 static void *
4058 rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf)
4059 {
4060 rib_qp_t *qp = ctoqp(conn);
4061 rib_hca_t *hca = qp->hca;
4062 rdma_btype ptype = rdbuf->type;
4063 void *buf;
4064 rib_bufpool_t *rbp = NULL;
4065 bufpool_t *bp;
4066 int i;
4067
4068 /*
4069 * Obtain pool address based on type of pool
4070 */
4071 switch (ptype) {
4072 case SEND_BUFFER:
4073 rbp = hca->send_pool;
4074 break;
4075 case RECV_BUFFER:
4076 rbp = hca->recv_pool;
4077 break;
4078 default:
4079 return (NULL);
4080 }
4081 if (rbp == NULL)
4082 return (NULL);
4083
4084 bp = rbp->bpool;
4085
4086 mutex_enter(&bp->buflock);
4087 if (bp->buffree < 0) {
4088 mutex_exit(&bp->buflock);
4089 return (NULL);
4090 }
4091
4092 /* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. */
4093 buf = bp->buflist[bp->buffree];
4094 rdbuf->addr = buf;
4095 rdbuf->len = bp->rsize;
4096 for (i = bp->numelems - 1; i >= 0; i--) {
4097 if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) {
4098 rdbuf->handle.mrc_rmr =
4099 (uint32_t)rbp->mr_desc[i].md_rkey;
4100 rdbuf->handle.mrc_linfo =
4101 (uintptr_t)rbp->mr_hdl[i];
4102 rdbuf->handle.mrc_lmr =
4103 (uint32_t)rbp->mr_desc[i].md_lkey;
4104 bp->buffree--;
4105
4106 mutex_exit(&bp->buflock);
4107
4108 return (buf);
4109 }
4110 }
4111
4112 mutex_exit(&bp->buflock);
4113
4114 return (NULL);
4115 }
4116
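/*
* Return a buffer obtained through rib_reg_buf_alloc(): RDMA_LONG_BUFFER
* buffers go back to the HCA's server-side cache, pool buffers go back
* to their buffer pool.
*/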
4117 static void
4118 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf)
4119 {
4120
4121 if (rdbuf->type == RDMA_LONG_BUFFER) {
4122 rib_free_cache_buf(conn, (rib_lrc_entry_t *)rdbuf->rb_private);
4123 rdbuf->rb_private = NULL;
4124 return;
4125 }
4126 rib_rbuf_free(conn, rdbuf->type, rdbuf->addr);
4127 }
4128
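/*
* Return a buffer to the free list of the send or receive buffer pool.
*/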
4129 static void
4130 rib_rbuf_free(CONN *conn, int ptype, void *buf)
4131 {
4132 rib_qp_t *qp = ctoqp(conn);
4133 rib_hca_t *hca = qp->hca;
4134 rib_bufpool_t *rbp = NULL;
4135 bufpool_t *bp;
4136
4137 /*
4138 * Obtain pool address based on type of pool
4139 */
4140 switch (ptype) {
4141 case SEND_BUFFER:
4142 rbp = hca->send_pool;
4143 break;
4144 case RECV_BUFFER:
4145 rbp = hca->recv_pool;
4146 break;
4147 default:
4148 return;
4149 }
4150 if (rbp == NULL)
4151 return;
4152
4153 bp = rbp->bpool;
4154
4155 mutex_enter(&bp->buflock);
4156 if (++bp->buffree >= bp->numelems) {
4157 /*
4158 * Should never happen
4159 */
4160 bp->buffree--;
4161 } else {
4162 bp->buflist[bp->buffree] = buf;
4163 }
4164 mutex_exit(&bp->buflock);
4165 }
4166
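/*
* Insert the connection at the head of the given connection list.
*/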
4167 static rdma_stat
4168 rib_add_connlist(CONN *cn, rib_conn_list_t *connlist)
4169 {
4170 rw_enter(&connlist->conn_lock, RW_WRITER);
4171 if (connlist->conn_hd) {
4172 cn->c_next = connlist->conn_hd;
4173 connlist->conn_hd->c_prev = cn;
4174 }
4175 connlist->conn_hd = cn;
4176 rw_exit(&connlist->conn_lock);
4177
4178 return (RDMA_SUCCESS);
4179 }
4180
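/*
* Unlink the connection from the given connection list.
*/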
4181 static rdma_stat
4182 rib_rm_conn(CONN *cn, rib_conn_list_t *connlist)
4183 {
4184 rw_enter(&connlist->conn_lock, RW_WRITER);
4185 if (cn->c_prev) {
4186 cn->c_prev->c_next = cn->c_next;
4187 }
4188 if (cn->c_next) {
4189 cn->c_next->c_prev = cn->c_prev;
4190 }
4191 if (connlist->conn_hd == cn)
4192 connlist->conn_hd = cn->c_next;
4193 rw_exit(&connlist->conn_lock);
4194
4195 return (RDMA_SUCCESS);
4196 }
4197
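/*
* RDMATF entry point for obtaining a connection to d_svcaddr;
* a thin wrapper around rib_connect().
*/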
4198 /* ARGSUSED */
4199 static rdma_stat
4200 rib_conn_get(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr,
4201 int addr_type, void *handle, CONN **conn)
4202 {
4203 rdma_stat status;
4204 rpcib_ping_t rpt;
4205
4206 status = rib_connect(s_svcaddr, d_svcaddr, addr_type, &rpt, conn);
4207 return (status);
4208 }
4209
4210 /*
4211 * rib_find_hca_connection
4212 *
* If there is an existing connection to the specified address, it is
* returned in conn; otherwise conn is set to NULL. Also cleans up any
* connection that is in the error state.
4216 */
4217 static int
4218 rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr,
4219 struct netbuf *d_svcaddr, CONN **conn)
4220 {
4221 CONN *cn;
4222 clock_t cv_stat, timout;
4223
4224 *conn = NULL;
4225 again:
4226 rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
4227 cn = hca->cl_conn_list.conn_hd;
4228 while (cn != NULL) {
4229 /*
4230 * First, clear up any connection in the ERROR state
4231 */
4232 mutex_enter(&cn->c_lock);
4233 if (cn->c_state == C_ERROR_CONN) {
4234 if (cn->c_ref == 0) {
4235 /*
4236 * Remove connection from list and destroy it.
4237 */
4238 cn->c_state = C_DISCONN_PEND;
4239 mutex_exit(&cn->c_lock);
4240 rw_exit(&hca->cl_conn_list.conn_lock);
4241 rib_conn_close((void *)cn);
4242 goto again;
4243 }
4244 mutex_exit(&cn->c_lock);
4245 cn = cn->c_next;
4246 continue;
4247 }
4248 if (cn->c_state == C_DISCONN_PEND) {
4249 mutex_exit(&cn->c_lock);
4250 cn = cn->c_next;
4251 continue;
4252 }
4253
4254 /*
* The source address is only checked when one is supplied,
* which is the case for retries.
4257 */
4258 if ((cn->c_raddr.len == d_svcaddr->len) &&
4259 (bcmp(d_svcaddr->buf, cn->c_raddr.buf,
4260 d_svcaddr->len) == 0) &&
4261 ((s_svcaddr->len == 0) ||
4262 ((cn->c_laddr.len == s_svcaddr->len) &&
4263 (bcmp(s_svcaddr->buf, cn->c_laddr.buf,
4264 s_svcaddr->len) == 0)))) {
4265 /*
4266 * Our connection. Give up conn list lock
4267 * as we are done traversing the list.
4268 */
4269 rw_exit(&hca->cl_conn_list.conn_lock);
4270 if (cn->c_state == C_CONNECTED) {
4271 cn->c_ref++; /* sharing a conn */
4272 mutex_exit(&cn->c_lock);
4273 *conn = cn;
4274 return (RDMA_SUCCESS);
4275 }
4276 if (cn->c_state == C_CONN_PEND) {
4277 /*
4278 * Hold a reference to this conn before
4279 * we give up the lock.
4280 */
4281 cn->c_ref++;
4282 timout = ddi_get_lbolt() +
4283 drv_usectohz(CONN_WAIT_TIME * 1000000);
4284 while ((cv_stat = cv_timedwait_sig(&cn->c_cv,
4285 &cn->c_lock, timout)) > 0 &&
4286 cn->c_state == C_CONN_PEND)
4287 ;
4288 if (cv_stat == 0) {
4289 (void) rib_conn_release_locked(cn);
4290 return (RDMA_INTR);
4291 }
4292 if (cv_stat < 0) {
4293 (void) rib_conn_release_locked(cn);
4294 return (RDMA_TIMEDOUT);
4295 }
4296 if (cn->c_state == C_CONNECTED) {
4297 *conn = cn;
4298 mutex_exit(&cn->c_lock);
4299 return (RDMA_SUCCESS);
4300 } else {
4301 (void) rib_conn_release_locked(cn);
4302 return (RDMA_TIMEDOUT);
4303 }
4304 }
4305 }
4306 mutex_exit(&cn->c_lock);
4307 cn = cn->c_next;
4308 }
4309 rw_exit(&hca->cl_conn_list.conn_lock);
4310 *conn = NULL;
4311 return (RDMA_FAILED);
4312 }
4313
4314 /*
4315 * Connection management.
* IBTF does not support recycling of channels, so a connection is always
* in one of four states: C_CONN_PEND, C_CONNECTED, C_ERROR_CONN or
* C_DISCONN_PEND. There is no C_IDLE state.
* C_CONN_PEND state: Connection establishment to the server is in progress.
* C_CONNECTED state: A connection when created is in C_CONNECTED state.
* It has an RC channel associated with it. ibt_post_send/recv are allowed
* only in this state.
* C_ERROR_CONN state: A connection transitions to this state when WRs on the
* channel are completed in error, an IBT_CM_EVENT_CONN_CLOSED event
* happens on the channel, or an IBT_HCA_DETACH_EVENT occurs on the HCA.
* C_DISCONN_PEND state: When a connection is in the C_ERROR_CONN state and
* c_ref drops to 0 (indicating that RPC holds no more references to the
* connection), the connection should be destroyed. A connection transitions
* into this state while it is being destroyed.
4330 */
4331 /* ARGSUSED */
4332 static rdma_stat
4333 rib_connect(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr,
4334 int addr_type, rpcib_ping_t *rpt, CONN **conn)
4335 {
4336 CONN *cn;
4337 int status;
4338 rib_hca_t *hca;
4339 rib_qp_t *qp;
4340 int s_addr_len;
4341 char *s_addr_buf;
4342
4343 rw_enter(&rib_stat->hcas_list_lock, RW_READER);
4344 for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
4345 rw_enter(&hca->state_lock, RW_READER);
4346 if (hca->state != HCA_DETACHED) {
4347 status = rib_find_hca_connection(hca, s_svcaddr,
4348 d_svcaddr, conn);
4349 rw_exit(&hca->state_lock);
4350 if ((status == RDMA_INTR) || (status == RDMA_SUCCESS)) {
4351 rw_exit(&rib_stat->hcas_list_lock);
4352 return (status);
4353 }
4354 } else
4355 rw_exit(&hca->state_lock);
4356 }
4357 rw_exit(&rib_stat->hcas_list_lock);
4358
4359 /*
4360 * No existing connection found, establish a new connection.
4361 */
4362 bzero(rpt, sizeof (rpcib_ping_t));
4363
4364 status = rib_ping_srv(addr_type, d_svcaddr, rpt);
4365 if (status != RDMA_SUCCESS) {
4366 return (RDMA_FAILED);
4367 }
4368 hca = rpt->hca;
4369
4370 if (rpt->srcip.family == AF_INET) {
4371 s_addr_len = sizeof (rpt->srcip.un.ip4addr);
4372 s_addr_buf = (char *)&rpt->srcip.un.ip4addr;
4373 } else if (rpt->srcip.family == AF_INET6) {
4374 s_addr_len = sizeof (rpt->srcip.un.ip6addr);
4375 s_addr_buf = (char *)&rpt->srcip.un.ip6addr;
4376 } else {
4377 return (RDMA_FAILED);
4378 }
4379
4380 /*
4381 * Channel to server doesn't exist yet, create one.
4382 */
4383 if (rib_clnt_create_chan(hca, d_svcaddr, &qp) != RDMA_SUCCESS) {
4384 return (RDMA_FAILED);
4385 }
4386 cn = qptoc(qp);
4387 cn->c_state = C_CONN_PEND;
4388 cn->c_ref = 1;
4389
4390 cn->c_laddr.buf = kmem_alloc(s_addr_len, KM_SLEEP);
4391 bcopy(s_addr_buf, cn->c_laddr.buf, s_addr_len);
4392 cn->c_laddr.len = cn->c_laddr.maxlen = s_addr_len;
4393
4394 if (rpt->srcip.family == AF_INET) {
4395 cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1, KM_SLEEP);
4396 (void) strcpy(cn->c_netid, RIBNETID_TCP);
4397
4398 cn->c_addrmask.len = cn->c_addrmask.maxlen =
4399 sizeof (struct sockaddr_in);
4400 cn->c_addrmask.buf = kmem_zalloc(cn->c_addrmask.len, KM_SLEEP);
4401
4402 ((struct sockaddr_in *)cn->c_addrmask.buf)->sin_addr.s_addr =
4403 (uint32_t)~0;
4404 ((struct sockaddr_in *)cn->c_addrmask.buf)->sin_family =
4405 (ushort_t)~0;
4406
4407 } else {
4408 cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1, KM_SLEEP);
4409 (void) strcpy(cn->c_netid, RIBNETID_TCP6);
4410
4411 cn->c_addrmask.len = cn->c_addrmask.maxlen =
4412 sizeof (struct sockaddr_in6);
4413 cn->c_addrmask.buf = kmem_zalloc(cn->c_addrmask.len, KM_SLEEP);
4414
4415 (void) memset(
4416 &((struct sockaddr_in6 *)cn->c_addrmask.buf)->sin6_addr,
4417 (uchar_t)~0, sizeof (struct in6_addr));
4418 ((struct sockaddr_in6 *)cn->c_addrmask.buf)->sin6_family =
4419 (sa_family_t)~0;
4420 }
4421
4422 /*
4423 * Add to conn list.
* We had given up the READER lock. In the time since then,
* another thread might have created the connection we are
* attempting here. For now, that is quite all right - there
* might be two connections between a pair of hosts instead
* of one. If we really want to close that window, we would
* need to re-check the list after acquiring the
* WRITER lock.
4431 */
4432 (void) rib_add_connlist(cn, &hca->cl_conn_list);
4433 status = rib_conn_to_srv(hca, qp, rpt);
4434 mutex_enter(&cn->c_lock);
4435
4436 if (cn->c_flags & C_CLOSE_PENDING) {
4437 /*
* This handles the case where the module or HCA
* detached while the connection was being
* established. In such a case, close the
* connection immediately if this is the
* only reference.
4443 */
4444 if (cn->c_ref == 1) {
4445 cn->c_ref--;
4446 cn->c_state = C_DISCONN_PEND;
4447 mutex_exit(&cn->c_lock);
4448 rib_conn_close((void *)cn);
4449 return (RDMA_FAILED);
4450 }
4451
4452 /*
4453 * Connection to be closed later when c_ref = 0
4454 */
4455 status = RDMA_FAILED;
4456 }
4457
4458 if (status == RDMA_SUCCESS) {
4459 cn->c_state = C_CONNECTED;
4460 *conn = cn;
4461 } else {
4462 cn->c_state = C_ERROR_CONN;
4463 cn->c_ref--;
4464 }
4465 cv_signal(&cn->c_cv);
4466 mutex_exit(&cn->c_lock);
4467 return (status);
4468 }
4469
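/*
* Close the RC channel of a connection, unless a close is already under
* way, and then remove the connection from its HCA's client or server
* connection list.
*/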
4470 static void
4471 rib_conn_close(void *rarg)
4472 {
4473 CONN *conn = (CONN *)rarg;
4474 rib_qp_t *qp = ctoqp(conn);
4475
4476 mutex_enter(&conn->c_lock);
4477 if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) {
4478
4479 conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING);
4480
4481 /*
4482 * Live connection in CONNECTED state.
4483 */
4484 if (conn->c_state == C_CONNECTED) {
4485 conn->c_state = C_ERROR_CONN;
4486 }
4487 mutex_exit(&conn->c_lock);
4488
4489 rib_close_a_channel(conn);
4490
4491 mutex_enter(&conn->c_lock);
4492 conn->c_flags &= ~C_CLOSE_PENDING;
4493 }
4494
4495 mutex_exit(&conn->c_lock);
4496
4497 if (qp->mode == RIB_SERVER)
4498 (void) rib_disconnect_channel(conn,
4499 &qp->hca->srv_conn_list);
4500 else
4501 (void) rib_disconnect_channel(conn,
4502 &qp->hca->cl_conn_list);
4503 }
4504
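/*
* timeout(9F) callback used to reap idle connections. If the connection
* is still unreferenced and has been idle longer than rib_conn_timeout,
* or is in the error state, rib_conn_close() is dispatched to the HCA's
* cleanup taskq; a connection with recent activity gets its timeout
* re-armed instead.
*/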
4505 static void
4506 rib_conn_timeout_call(void *carg)
4507 {
4508 time_t idle_time;
4509 CONN *conn = (CONN *)carg;
4510 rib_hca_t *hca = ctoqp(conn)->hca;
4511 int error;
4512
4513 mutex_enter(&conn->c_lock);
4514 if ((conn->c_ref > 0) ||
4515 (conn->c_state == C_DISCONN_PEND)) {
4516 conn->c_timeout = NULL;
4517 mutex_exit(&conn->c_lock);
4518 return;
4519 }
4520
4521 idle_time = (gethrestime_sec() - conn->c_last_used);
4522
4523 if ((idle_time <= rib_conn_timeout) &&
4524 (conn->c_state != C_ERROR_CONN)) {
4525 /*
* There was activity after the last timeout, so
* extend the conn's life, unless the conn is
* already in the error state.
4529 */
4530 conn->c_timeout = timeout(rib_conn_timeout_call, conn,
4531 SEC_TO_TICK(rib_conn_timeout - idle_time));
4532 mutex_exit(&conn->c_lock);
4533 return;
4534 }
4535
4536 error = ddi_taskq_dispatch(hca->cleanup_helper, rib_conn_close,
4537 (void *)conn, DDI_NOSLEEP);
4538
4539 /*
4540 * If taskq dispatch fails above, then reset the timeout
4541 * to try again after 10 secs.
4542 */
4543
4544 if (error != DDI_SUCCESS) {
4545 conn->c_timeout = timeout(rib_conn_timeout_call, conn,
4546 SEC_TO_TICK(RDMA_CONN_REAP_RETRY));
4547 mutex_exit(&conn->c_lock);
4548 return;
4549 }
4550
4551 conn->c_state = C_DISCONN_PEND;
4552 mutex_exit(&conn->c_lock);
4553 }
4554
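/*
* Drop a reference on the connection.
*/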
4555 static rdma_stat
4556 rib_conn_release(CONN *conn)
4557 {
4558 mutex_enter(&conn->c_lock);
4559 return (rib_conn_release_locked(conn));
4560 }
4561
4562 /*
* Expects conn->c_lock to be held on entry; c_lock is released on return.
4565 */
4566 static rdma_stat
4567 rib_conn_release_locked(CONN *conn)
4568 {
4569 conn->c_ref--;
4570
4571 conn->c_last_used = gethrestime_sec();
4572 if (conn->c_ref > 0) {
4573 mutex_exit(&conn->c_lock);
4574 return (RDMA_SUCCESS);
4575 }
4576
4577 /*
4578 * If a conn is C_ERROR_CONN, close the channel.
4579 */
4580 if (conn->c_ref == 0 && conn->c_state == C_ERROR_CONN) {
4581 conn->c_state = C_DISCONN_PEND;
4582 mutex_exit(&conn->c_lock);
4583 rib_conn_close((void *)conn);
4584 return (RDMA_SUCCESS);
4585 }
4586
4587 /*
4588 * c_ref == 0, set a timeout for conn release
4589 */
4590
4591 if (conn->c_timeout == NULL) {
4592 conn->c_timeout = timeout(rib_conn_timeout_call, conn,
4593 SEC_TO_TICK(rib_conn_timeout));
4594 }
4595
4596 mutex_exit(&conn->c_lock);
4597 return (RDMA_SUCCESS);
4598 }
4599
4600 /*
4601 * Add at front of list
4602 */
4603 static struct rdma_done_list *
4604 rdma_done_add(rib_qp_t *qp, uint32_t xid)
4605 {
4606 struct rdma_done_list *rd;
4607
4608 ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4609
4610 rd = kmem_alloc(sizeof (*rd), KM_SLEEP);
4611 rd->xid = xid;
4612 cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL);
4613
4614 rd->prev = NULL;
4615 rd->next = qp->rdlist;
4616 if (qp->rdlist != NULL)
4617 qp->rdlist->prev = rd;
4618 qp->rdlist = rd;
4619
4620 return (rd);
4621 }
4622
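/*
* Unlink an entry from the qp's rdma_done list and free it.
* Expects qp->rdlist_lock to be held.
*/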
4623 static void
4624 rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd)
4625 {
4626 struct rdma_done_list *r;
4627
4628 ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4629
4630 r = rd->next;
4631 if (r != NULL) {
4632 r->prev = rd->prev;
4633 }
4634
4635 r = rd->prev;
4636 if (r != NULL) {
4637 r->next = rd->next;
4638 } else {
4639 qp->rdlist = rd->next;
4640 }
4641
4642 cv_destroy(&rd->rdma_done_cv);
4643 kmem_free(rd, sizeof (*rd));
4644 }
4645
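/*
* Tear down the entire rdma_done list of the qp.
*/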
4646 static void
4647 rdma_done_rem_list(rib_qp_t *qp)
4648 {
4649 struct rdma_done_list *r, *n;
4650
4651 mutex_enter(&qp->rdlist_lock);
4652 for (r = qp->rdlist; r != NULL; r = n) {
4653 n = r->next;
4654 rdma_done_rm(qp, r);
4655 }
4656 mutex_exit(&qp->rdlist_lock);
4657 }
4658
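/*
* Wake up the thread waiting on the rdma_done entry with the
* matching xid, if there is one.
*/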
4659 static void
4660 rdma_done_notify(rib_qp_t *qp, uint32_t xid)
4661 {
4662 struct rdma_done_list *r = qp->rdlist;
4663
4664 ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4665
4666 while (r) {
4667 if (r->xid == xid) {
4668 cv_signal(&r->rdma_done_cv);
4669 return;
4670 } else {
4671 r = r->next;
4672 }
4673 }
4674 DTRACE_PROBE1(rpcib__i__donenotify__nomatchxid,
4675 int, xid);
4676 }
4677
4678 /*
* Called after the caller has set C_CLOSE_PENDING on the connection;
* conn->c_lock is not held across the blocking ibt_close_rc_channel() call.
4680 */
4681
4682 static void
4683 rib_close_a_channel(CONN *conn)
4684 {
4685 rib_qp_t *qp;
4686 qp = ctoqp(conn);
4687
4688 if (qp->qp_hdl == NULL) {
4689 /* channel already freed */
4690 return;
4691 }
4692
4693 /*
4694 * Call ibt_close_rc_channel in blocking mode
4695 * with no callbacks.
4696 */
4697 (void) ibt_close_rc_channel(qp->qp_hdl, IBT_NOCALLBACKS,
4698 NULL, 0, NULL, NULL, 0);
4699 }
4700
4701 /*
4702 * Goes through all connections and closes the channel
4703 * This will cause all the WRs on those channels to be
4704 * flushed.
4705 */
4706 static void
4707 rib_close_channels(rib_conn_list_t *connlist)
4708 {
4709 CONN *conn, *tmp;
4710
4711 rw_enter(&connlist->conn_lock, RW_READER);
4712 conn = connlist->conn_hd;
4713 while (conn != NULL) {
4714 mutex_enter(&conn->c_lock);
4715 tmp = conn->c_next;
4716 if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) {
4717
4718 if (conn->c_state == C_CONN_PEND) {
4719 conn->c_flags |= C_CLOSE_PENDING;
4720 goto next;
4721 }
4722
4723 conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING);
4724
4725 /*
4726 * Live connection in CONNECTED state.
4727 */
4728 if (conn->c_state == C_CONNECTED)
4729 conn->c_state = C_ERROR_CONN;
4730 mutex_exit(&conn->c_lock);
4731
4732 rib_close_a_channel(conn);
4733
4734 mutex_enter(&conn->c_lock);
4735 conn->c_flags &= ~C_CLOSE_PENDING;
4736 /* Signal a pending rib_disconnect_channel() */
4737 cv_signal(&conn->c_cv);
4738 }
4739 next:
4740 mutex_exit(&conn->c_lock);
4741 conn = tmp;
4742 }
4743 rw_exit(&connlist->conn_lock);
4744 }
4745
4746 /*
4747 * Frees up all connections that are no longer being referenced
4748 */
4749 static void
4750 rib_purge_connlist(rib_conn_list_t *connlist)
4751 {
4752 CONN *conn;
4753
4754 top:
4755 rw_enter(&connlist->conn_lock, RW_READER);
4756 conn = connlist->conn_hd;
4757 while (conn != NULL) {
4758 mutex_enter(&conn->c_lock);
4759
4760 /*
4761 * At this point connection is either in ERROR
4762 * or DISCONN_PEND state. If in DISCONN_PEND state
4763 * then some other thread is culling that connection.
4764 * If not and if c_ref is 0, then destroy the connection.
4765 */
4766 if (conn->c_ref == 0 &&
4767 conn->c_state != C_DISCONN_PEND) {
4768 /*
4769 * Cull the connection
4770 */
4771 conn->c_state = C_DISCONN_PEND;
4772 mutex_exit(&conn->c_lock);
4773 rw_exit(&connlist->conn_lock);
4774 (void) rib_disconnect_channel(conn, connlist);
4775 goto top;
4776 } else {
4777 /*
4778 * conn disconnect already scheduled or will
4779 * happen from conn_release when c_ref drops to 0.
4780 */
4781 mutex_exit(&conn->c_lock);
4782 }
4783 conn = conn->c_next;
4784 }
4785 rw_exit(&connlist->conn_lock);
4786
4787 /*
4788 * At this point, only connections with c_ref != 0 are on the list
4789 */
4790 }
4791
4792 /*
4793 * Free all the HCA resources and close
4794 * the hca.
4795 */
4796
4797 static void
4798 rib_free_hca(rib_hca_t *hca)
4799 {
4800 (void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl);
4801 (void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl);
4802 (void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl);
4803 (void) ibt_free_cq(hca->svc_scq->rib_cq_hdl);
4804
4805 kmem_free(hca->clnt_rcq, sizeof (rib_cq_t));
4806 kmem_free(hca->clnt_scq, sizeof (rib_cq_t));
4807 kmem_free(hca->svc_rcq, sizeof (rib_cq_t));
4808 kmem_free(hca->svc_scq, sizeof (rib_cq_t));
4809
4810 rib_rbufpool_destroy(hca, RECV_BUFFER);
4811 rib_rbufpool_destroy(hca, SEND_BUFFER);
4812 rib_destroy_cache(hca);
4813 if (rib_mod.rdma_count == 0)
4814 (void) rdma_unregister_mod(&rib_mod);
4815 (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
4816 (void) ibt_close_hca(hca->hca_hdl);
4817 hca->hca_hdl = NULL;
4818 }
4819
4820
4821 static void
4822 rib_stop_hca_services(rib_hca_t *hca)
4823 {
4824 rib_stop_services(hca);
4825 rib_close_channels(&hca->cl_conn_list);
4826 rib_close_channels(&hca->srv_conn_list);
4827
4828 rib_purge_connlist(&hca->cl_conn_list);
4829 rib_purge_connlist(&hca->srv_conn_list);
4830
4831 if ((rib_stat->hcas_list == NULL) && stats_enabled) {
4832 kstat_delete_byname_zone("unix", 0, "rpcib_cache",
4833 GLOBAL_ZONEID);
4834 stats_enabled = FALSE;
4835 }
4836
4837 rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
4838 rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
4839 if (hca->srv_conn_list.conn_hd == NULL &&
4840 hca->cl_conn_list.conn_hd == NULL) {
4841 /*
* Both conn lists are empty, so destroy
* buffers, close the HCA and be done.
4844 */
4845 rib_free_hca(hca);
4846 }
4847 rw_exit(&hca->cl_conn_list.conn_lock);
4848 rw_exit(&hca->srv_conn_list.conn_lock);
4849
4850 if (hca->hca_hdl != NULL) {
4851 mutex_enter(&hca->inuse_lock);
4852 while (hca->inuse)
4853 cv_wait(&hca->cb_cv, &hca->inuse_lock);
4854 mutex_exit(&hca->inuse_lock);
4855
4856 rib_free_hca(hca);
4857 }
4858 rw_destroy(&hca->bound_services_lock);
4859
4860 if (hca->cleanup_helper != NULL) {
4861 ddi_taskq_destroy(hca->cleanup_helper);
4862 hca->cleanup_helper = NULL;
4863 }
4864 }
4865
4866 /*
* Cleans up and closes all uses of the HCA
4868 */
4869 static void
4870 rib_detach_hca(ibt_hca_hdl_t hca_hdl)
4871 {
4872 rib_hca_t *hca = NULL;
4873 rib_hca_t **hcap;
4874
4875 rw_enter(&rib_stat->hcas_list_lock, RW_WRITER);
4876 for (hcap = &rib_stat->hcas_list; *hcap; hcap = &(*hcap)->next) {
4877 hca = *hcap;
4878 rw_enter(&hca->state_lock, RW_WRITER);
4879 if (hca->hca_hdl == hca_hdl) {
4880 /*
4881 * Mark as detached and remove from
4882 * hca list.
4883 */
4884 hca->state = HCA_DETACHED;
4885 *hcap = hca->next;
4886 rib_stat->nhca_inited--;
4887 rib_mod.rdma_count--;
4888 rw_exit(&hca->state_lock);
4889 break;
4890 }
4891 rw_exit(&hca->state_lock);
4892 }
4893 rw_exit(&rib_stat->hcas_list_lock);
4894
4895 if (hca == NULL)
4896 return;
4897 ASSERT(hca->hca_hdl == hca_hdl);
4898
4899 /*
4900 * Stop all services on the HCA
4901 * Go through cl_conn_list and close all rc_channels
* Go through srv_conn_list and close all rc_channels
* Free connections whose c_ref has dropped to 0
* Destroy all CQs
* Deregister and release all buffer pool memory after all
* connections are destroyed
4907 * Free the protection domain
4908 * ibt_close_hca()
4909 */
4910 rib_stop_hca_services(hca);
4911
4912 kmem_free(hca, sizeof (*hca));
4913 }
4914
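/*
* Empty the HCA's server-side buffer cache: deregister and free every
* cached buffer and release the AVL nodes.
*/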
4915 static void
4916 rib_server_side_cache_reclaim(void *argp)
4917 {
4918 cache_avl_struct_t *rcas;
4919 rib_lrc_entry_t *rb;
4920 rib_hca_t *hca = (rib_hca_t *)argp;
4921
4922 rw_enter(&hca->avl_rw_lock, RW_WRITER);
4923 rcas = avl_first(&hca->avl_tree);
4924 if (rcas != NULL)
4925 avl_remove(&hca->avl_tree, rcas);
4926
4927 while (rcas != NULL) {
4928 while (rcas->r.forw != &rcas->r) {
4929 rcas->elements--;
4930 rb = rcas->r.forw;
4931 remque(rb);
4932 if (rb->registered)
4933 (void) rib_deregistermem_via_hca(hca,
4934 rb->lrc_buf, rb->lrc_mhandle);
4935
4936 hca->cache_allocation -= rb->lrc_len;
4937 kmem_free(rb->lrc_buf, rb->lrc_len);
4938 kmem_free(rb, sizeof (rib_lrc_entry_t));
4939 }
4940 mutex_destroy(&rcas->node_lock);
4941 kmem_cache_free(hca->server_side_cache, rcas);
4942 rcas = avl_first(&hca->avl_tree);
4943 if (rcas != NULL)
4944 avl_remove(&hca->avl_tree, rcas);
4945 }
4946 rw_exit(&hca->avl_rw_lock);
4947 }
4948
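/*
* Trim the server-side buffer cache when its total allocation has
* reached cache_limit, discarding cached buffers starting from the
* largest sizes (tail of the AVL tree) until the allocation drops back
* below the limit.
*/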
4949 static void
4950 rib_server_side_cache_cleanup(void *argp)
4951 {
4952 cache_avl_struct_t *rcas;
4953 rib_lrc_entry_t *rb;
4954 rib_hca_t *hca = (rib_hca_t *)argp;
4955
4956 mutex_enter(&hca->cache_allocation_lock);
4957 if (hca->cache_allocation < cache_limit) {
4958 mutex_exit(&hca->cache_allocation_lock);
4959 return;
4960 }
4961 mutex_exit(&hca->cache_allocation_lock);
4962
4963 rw_enter(&hca->avl_rw_lock, RW_WRITER);
4964 rcas = avl_last(&hca->avl_tree);
4965 if (rcas != NULL)
4966 avl_remove(&hca->avl_tree, rcas);
4967
4968 while (rcas != NULL) {
4969 while (rcas->r.forw != &rcas->r) {
4970 rcas->elements--;
4971 rb = rcas->r.forw;
4972 remque(rb);
4973 if (rb->registered)
4974 (void) rib_deregistermem_via_hca(hca,
4975 rb->lrc_buf, rb->lrc_mhandle);
4976
4977 hca->cache_allocation -= rb->lrc_len;
4978
4979 kmem_free(rb->lrc_buf, rb->lrc_len);
4980 kmem_free(rb, sizeof (rib_lrc_entry_t));
4981 }
4982 mutex_destroy(&rcas->node_lock);
4983 if (hca->server_side_cache) {
4984 kmem_cache_free(hca->server_side_cache, rcas);
4985 }
4986
4987 if (hca->cache_allocation < cache_limit) {
4988 rw_exit(&hca->avl_rw_lock);
4989 return;
4990 }
4991
4992 rcas = avl_last(&hca->avl_tree);
4993 if (rcas != NULL)
4994 avl_remove(&hca->avl_tree, rcas);
4995 }
4996 rw_exit(&hca->avl_rw_lock);
4997 }
4998
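/*
* Comparator for the buffer cache AVL tree; nodes are ordered by
* buffer length.
*/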
4999 static int
5000 avl_compare(const void *t1, const void *t2)
5001 {
5002 if (((cache_avl_struct_t *)t1)->len == ((cache_avl_struct_t *)t2)->len)
5003 return (0);
5004
5005 if (((cache_avl_struct_t *)t1)->len < ((cache_avl_struct_t *)t2)->len)
5006 return (-1);
5007
5008 return (1);
5009 }
5010
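/*
* Tear down the HCA's server-side buffer cache and its locks.
*/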
5011 static void
5012 rib_destroy_cache(rib_hca_t *hca)
5013 {
5014 if (hca->avl_init) {
5015 rib_server_side_cache_reclaim((void *)hca);
5016 if (hca->server_side_cache) {
5017 kmem_cache_destroy(hca->server_side_cache);
5018 hca->server_side_cache = NULL;
5019 }
5020 avl_destroy(&hca->avl_tree);
5021 mutex_destroy(&hca->cache_allocation_lock);
5022 rw_destroy(&hca->avl_rw_lock);
5023 }
5024 hca->avl_init = FALSE;
5025 }
5026
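/*
* Schedule an asynchronous cache trim on the HCA's cleanup taskq.
*/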
5027 static void
5028 rib_force_cleanup(void *hca)
5029 {
5030 if (((rib_hca_t *)hca)->cleanup_helper != NULL)
5031 (void) ddi_taskq_dispatch(
5032 ((rib_hca_t *)hca)->cleanup_helper,
5033 rib_server_side_cache_cleanup,
5034 (void *)hca, DDI_NOSLEEP);
5035 }
5036
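/*
* Get a long-reply buffer of the requested length, preferably from the
* HCA's server-side cache. When the cache is uninitialized, over
* cache_limit or has no buffer of this length, allocate a fresh,
* not-yet-registered buffer instead.
*/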
5037 static rib_lrc_entry_t *
5038 rib_get_cache_buf(CONN *conn, uint32_t len)
5039 {
5040 cache_avl_struct_t cas, *rcas;
5041 rib_hca_t *hca = (ctoqp(conn))->hca;
5042 rib_lrc_entry_t *reply_buf;
5043 avl_index_t where = NULL;
5044 uint64_t c_alloc = 0;
5045
5046 if (!hca->avl_init)
5047 goto error_alloc;
5048
5049 cas.len = len;
5050
5051 rw_enter(&hca->avl_rw_lock, RW_READER);
5052
5053 mutex_enter(&hca->cache_allocation_lock);
5054 c_alloc = hca->cache_allocation;
5055 mutex_exit(&hca->cache_allocation_lock);
5056
5057 if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas,
5058 &where)) == NULL) {
/* Are we above the cache limit? */
5060 if ((c_alloc + len) >= cache_limit) {
5061 rib_force_cleanup((void *)hca);
5062 rw_exit(&hca->avl_rw_lock);
5063 mutex_enter(&hca->cache_allocation_lock);
hca->cache_misses_above_the_limit++;
5065 mutex_exit(&hca->cache_allocation_lock);
5066
5067 /* Allocate and register the buffer directly */
5068 goto error_alloc;
5069 }
5070
5071 rw_exit(&hca->avl_rw_lock);
5072 rw_enter(&hca->avl_rw_lock, RW_WRITER);
5073
5074 /* Recheck to make sure no other thread added the entry in */
5075 if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree,
5076 &cas, &where)) == NULL) {
5077 /* Allocate an avl tree entry */
5078 rcas = (cache_avl_struct_t *)
5079 kmem_cache_alloc(hca->server_side_cache, KM_SLEEP);
5080
5081 bzero(rcas, sizeof (cache_avl_struct_t));
5082 rcas->elements = 0;
5083 rcas->r.forw = &rcas->r;
5084 rcas->r.back = &rcas->r;
5085 rcas->len = len;
5086 mutex_init(&rcas->node_lock, NULL, MUTEX_DEFAULT, NULL);
5087 avl_insert(&hca->avl_tree, rcas, where);
5088 }
5089 }
5090
5091 mutex_enter(&rcas->node_lock);
5092
5093 if (rcas->r.forw != &rcas->r && rcas->elements > 0) {
5094 reply_buf = rcas->r.forw;
5095 remque(reply_buf);
5096 rcas->elements--;
5097 mutex_exit(&rcas->node_lock);
5098 rw_exit(&hca->avl_rw_lock);
5099
5100 mutex_enter(&hca->cache_allocation_lock);
5101 hca->cache_hits++;
5102 hca->cache_allocation -= len;
5103 mutex_exit(&hca->cache_allocation_lock);
5104 } else {
/* Are we above the cache limit? */
5106 mutex_exit(&rcas->node_lock);
5107 if ((c_alloc + len) >= cache_limit) {
5108 rib_force_cleanup((void *)hca);
5109 rw_exit(&hca->avl_rw_lock);
5110
5111 mutex_enter(&hca->cache_allocation_lock);
5112 hca->cache_misses_above_the_limit++;
5113 mutex_exit(&hca->cache_allocation_lock);
5114 /* Allocate and register the buffer directly */
5115 goto error_alloc;
5116 }
5117 rw_exit(&hca->avl_rw_lock);
5118 mutex_enter(&hca->cache_allocation_lock);
5119 hca->cache_misses++;
5120 mutex_exit(&hca->cache_allocation_lock);
5121 /* Allocate a reply_buf entry */
5122 reply_buf = (rib_lrc_entry_t *)
5123 kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
5124 bzero(reply_buf, sizeof (rib_lrc_entry_t));
5125 reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
5126 reply_buf->lrc_len = len;
5127 reply_buf->registered = FALSE;
5128 reply_buf->avl_node = (void *)rcas;
5129 }
5130
5131 return (reply_buf);
5132
5133 error_alloc:
5134 reply_buf = (rib_lrc_entry_t *)
5135 kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
5136 bzero(reply_buf, sizeof (rib_lrc_entry_t));
5137 reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
5138 reply_buf->lrc_len = len;
5139 reply_buf->registered = FALSE;
5140 reply_buf->avl_node = NULL;
5141
5142 return (reply_buf);
5143 }
5144
5145 /*
* Return a pre-registered buffer back to the cache (without
* unregistering the buffer).
5148 */
5149
5150 static void
5151 rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf)
5152 {
5153 cache_avl_struct_t cas, *rcas;
5154 avl_index_t where = NULL;
5155 rib_hca_t *hca = (ctoqp(conn))->hca;
5156
5157 if (!hca->avl_init)
5158 goto error_free;
5159
5160 cas.len = reg_buf->lrc_len;
5161 rw_enter(&hca->avl_rw_lock, RW_READER);
5162 if ((rcas = (cache_avl_struct_t *)
5163 avl_find(&hca->avl_tree, &cas, &where)) == NULL) {
5164 rw_exit(&hca->avl_rw_lock);
5165 goto error_free;
5166 } else {
5167 cas.len = reg_buf->lrc_len;
5168 mutex_enter(&rcas->node_lock);
5169 insque(reg_buf, &rcas->r);
rcas->elements++;
5171 mutex_exit(&rcas->node_lock);
5172 rw_exit(&hca->avl_rw_lock);
5173 mutex_enter(&hca->cache_allocation_lock);
5174 hca->cache_allocation += cas.len;
5175 mutex_exit(&hca->cache_allocation_lock);
5176 }
5177
5178 return;
5179
5180 error_free:
5181
5182 if (reg_buf->registered)
5183 (void) rib_deregistermem_via_hca(hca,
5184 reg_buf->lrc_buf, reg_buf->lrc_mhandle);
5185 kmem_free(reg_buf->lrc_buf, reg_buf->lrc_len);
5186 kmem_free(reg_buf, sizeof (rib_lrc_entry_t));
5187 }
5188
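/*
* Register a buffer with the HCA and return the MR handle and
* lkey/rkey in buf_handle.
*/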
5189 static rdma_stat
5190 rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf,
5191 uint_t buflen, struct mrc *buf_handle)
5192 {
5193 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */
5194 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */
5195 rdma_stat status;
5196
5197
5198 /*
5199 * Note: ALL buffer pools use the same memory type RDMARW.
5200 */
5201 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
5202 if (status == RDMA_SUCCESS) {
5203 buf_handle->mrc_linfo = (uint64_t)(uintptr_t)mr_hdl;
5204 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
5205 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
5206 } else {
5207 buf_handle->mrc_linfo = NULL;
5208 buf_handle->mrc_lmr = 0;
5209 buf_handle->mrc_rmr = 0;
5210 }
5211 return (status);
5212 }
5213
5214 /* ARGSUSED */
5215 static rdma_stat
5216 rib_deregistermemsync_via_hca(rib_hca_t *hca, caddr_t buf,
5217 struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle)
5218 {
5219
5220 (void) rib_deregistermem_via_hca(hca, buf, buf_handle);
5221 return (RDMA_SUCCESS);
5222 }
5223
5224 /* ARGSUSED */
5225 static rdma_stat
5226 rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle)
5227 {
5228
5229 (void) ibt_deregister_mr(hca->hca_hdl,
5230 (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
5231 return (RDMA_SUCCESS);
5232 }
5233
5234 /*
5235 * Check if the IP interface named by `lifrp' is RDMA-capable.
5236 */
5237 static boolean_t
5238 rpcib_rdma_capable_interface(struct lifreq *lifrp)
5239 {
5240 char ifname[LIFNAMSIZ];
5241 char *cp;
5242
5243 if (lifrp->lifr_type == IFT_IB)
5244 return (B_TRUE);
5245
5246 /*
5247 * Strip off the logical interface portion before getting
5248 * intimate with the name.
5249 */
5250 (void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ);
5251 if ((cp = strchr(ifname, ':')) != NULL)
5252 *cp = '\0';
5253
5254 return (strcmp("lo0", ifname) == 0);
5255 }
5256
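/*
* Open /dev/udp and push the given ioctl down to IP, returning the
* ioctl's error status.
*/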
5257 static int
5258 rpcib_do_ip_ioctl(int cmd, int len, void *arg)
5259 {
5260 vnode_t *kkvp, *vp;
5261 TIUSER *tiptr;
5262 struct strioctl iocb;
5263 k_sigset_t smask;
5264 int err = 0;
5265
5266 if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kkvp) == 0) {
5267 if (t_kopen(NULL, kkvp->v_rdev, FREAD|FWRITE,
5268 &tiptr, CRED()) == 0) {
5269 vp = tiptr->fp->f_vnode;
5270 } else {
5271 VN_RELE(kkvp);
5272 return (EPROTO);
5273 }
5274 } else {
5275 return (EPROTO);
5276 }
5277
5278 iocb.ic_cmd = cmd;
5279 iocb.ic_timout = 0;
5280 iocb.ic_len = len;
5281 iocb.ic_dp = (caddr_t)arg;
5282 sigintr(&smask, 0);
5283 err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
5284 sigunintr(&smask);
5285 (void) t_kclose(tiptr, 0);
5286 VN_RELE(kkvp);
5287 return (err);
5288 }
5289
5290 /*
5291 * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'.
5292 * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes.
5293 */
5294 static int
5295 rpcib_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep)
5296 {
5297 int err;
5298 struct lifnum lifn;
5299
5300 bzero(&lifn, sizeof (struct lifnum));
5301 lifn.lifn_family = AF_UNSPEC;
5302
5303 err = rpcib_do_ip_ioctl(SIOCGLIFNUM, sizeof (struct lifnum), &lifn);
5304 if (err != 0)
5305 return (err);
5306
5307 /*
5308 * Pad the interface count to account for additional interfaces that
5309 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF.
5310 */
5311 lifn.lifn_count += 4;
5312
5313 bzero(lifcp, sizeof (struct lifconf));
5314 lifcp->lifc_family = AF_UNSPEC;
5315 lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq);
5316 lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP);
5317
5318 err = rpcib_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp);
5319 if (err != 0) {
5320 kmem_free(lifcp->lifc_buf, *bufsizep);
5321 return (err);
5322 }
5323 return (0);
5324 }
5325
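/*
* Build the lists of IPv4 and IPv6 addresses configured on RDMA-capable
* interfaces; the caller is expected to free ri_list in both lists.
*/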
5326 static boolean_t
5327 rpcib_get_ib_addresses(rpcib_ipaddrs_t *addrs4, rpcib_ipaddrs_t *addrs6)
5328 {
5329 uint_t i, nifs;
5330 uint_t bufsize;
5331 struct lifconf lifc;
5332 struct lifreq *lifrp;
5333 struct sockaddr_in *sinp;
5334 struct sockaddr_in6 *sin6p;
5335
5336 bzero(addrs4, sizeof (rpcib_ipaddrs_t));
5337 bzero(addrs6, sizeof (rpcib_ipaddrs_t));
5338
5339 if (rpcib_do_lifconf(&lifc, &bufsize) != 0)
5340 return (B_FALSE);
5341
5342 if ((nifs = lifc.lifc_len / sizeof (struct lifreq)) == 0) {
5343 kmem_free(lifc.lifc_buf, bufsize);
5344 return (B_FALSE);
5345 }
5346
5347 /*
5348 * Worst case is that all of the addresses are IB-capable and have
5349 * the same address family, so size our buffers accordingly.
5350 */
5351 addrs4->ri_size = nifs * sizeof (struct sockaddr_in);
5352 addrs4->ri_list = kmem_zalloc(addrs4->ri_size, KM_SLEEP);
5353 addrs6->ri_size = nifs * sizeof (struct sockaddr_in6);
5354 addrs6->ri_list = kmem_zalloc(addrs6->ri_size, KM_SLEEP);
5355
5356 for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) {
5357 if (!rpcib_rdma_capable_interface(lifrp))
5358 continue;
5359
5360 if (lifrp->lifr_addr.ss_family == AF_INET) {
5361 sinp = addrs4->ri_list;
5362 bcopy(&lifrp->lifr_addr, &sinp[addrs4->ri_count++],
5363 sizeof (struct sockaddr_in));
5364 } else if (lifrp->lifr_addr.ss_family == AF_INET6) {
5365 sin6p = addrs6->ri_list;
5366 bcopy(&lifrp->lifr_addr, &sin6p[addrs6->ri_count++],
5367 sizeof (struct sockaddr_in6));
5368 }
5369 }
5370
5371 kmem_free(lifc.lifc_buf, bufsize);
5372 return (B_TRUE);
5373 }
5374
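/*
* kstat update routine for the rpcib_cache kstat; rejects writes and
* accumulates the per-HCA cache counters into rpcib_kstat.
*/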
5375 /* ARGSUSED */
5376 static int
5377 rpcib_cache_kstat_update(kstat_t *ksp, int rw)
5378 {
5379 rib_hca_t *hca;
5380
5381 if (KSTAT_WRITE == rw) {
5382 return (EACCES);
5383 }
5384
5385 rpcib_kstat.cache_limit.value.ui64 =
5386 (uint64_t)cache_limit;
5387 rw_enter(&rib_stat->hcas_list_lock, RW_READER);
5388 for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
5389 rpcib_kstat.cache_allocation.value.ui64 +=
5390 (uint64_t)hca->cache_allocation;
5391 rpcib_kstat.cache_hits.value.ui64 +=
5392 (uint64_t)hca->cache_hits;
5393 rpcib_kstat.cache_misses.value.ui64 +=
5394 (uint64_t)hca->cache_misses;
5395 rpcib_kstat.cache_misses_above_the_limit.value.ui64 +=
5396 (uint64_t)hca->cache_misses_above_the_limit;
5397 }
5398 rw_exit(&rib_stat->hcas_list_lock);
5399 return (0);
5400 }