7127 remove -Wno-missing-braces from Makefile.uts
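This change wraps the ml_linkage portion of the rib_modlinkage initializer in its own brace pair so that rpcib.c compiles cleanly once the blanket -Wno-missing-braces override is removed from Makefile.uts. A minimal sketch of the pattern follows; the struct shape mirrors modlinkage from sys/modctl.h (an int revision followed by an array of pointers), but the identifiers and array size here are illustrative only:

	struct linkage {
		int	ml_rev;
		void	*ml_linkage[7];		/* array member */
	};
	static int drv;				/* stand-in for a modldrv */

	/* gcc -Wmissing-braces warns: array elements listed flat, no inner braces */
	static struct linkage flat = { 1, (void *)&drv, 0 };

	/* warning-clean: the array initializer gets its own brace pair */
	static struct linkage nested = { 1, { (void *)&drv, 0 } };

Both forms produce identical data; the inner braces only make the nesting explicit, which is what lets the warning stay enabled kernel-wide.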
--- old/usr/src/uts/common/rpc/rpcib.c
+++ new/usr/src/uts/common/rpc/rpcib.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
24 24 */
25 25
26 26 /*
27 27 * Copyright (c) 2007, The Ohio State University. All rights reserved.
28 28 *
29 29 * Portions of this source code were developed by the team members of
30 30 * The Ohio State University's Network-Based Computing Laboratory (NBCL),
31 31 * headed by Professor Dhabaleswar K. (DK) Panda.
32 32 *
33 33 * Acknowledgements to contributions from developers:
34 34 * Ranjit Noronha: noronha@cse.ohio-state.edu
35 35 * Lei Chai : chail@cse.ohio-state.edu
36 36 * Weikuan Yu : yuw@cse.ohio-state.edu
37 37 *
38 38 */
39 39
40 40 /*
41 41 * The rpcib plugin. Implements the interface for RDMATF's
42 42 * interaction with IBTF.
43 43 */
44 44
45 45 #include <sys/param.h>
46 46 #include <sys/types.h>
47 47 #include <sys/user.h>
48 48 #include <sys/systm.h>
49 49 #include <sys/sysmacros.h>
50 50 #include <sys/proc.h>
51 51 #include <sys/socket.h>
52 52 #include <sys/file.h>
53 53 #include <sys/stream.h>
54 54 #include <sys/strsubr.h>
55 55 #include <sys/stropts.h>
56 56 #include <sys/errno.h>
57 57 #include <sys/kmem.h>
58 58 #include <sys/debug.h>
59 59 #include <sys/pathname.h>
60 60 #include <sys/kstat.h>
61 61 #include <sys/t_lock.h>
62 62 #include <sys/ddi.h>
63 63 #include <sys/cmn_err.h>
64 64 #include <sys/time.h>
65 65 #include <sys/isa_defs.h>
66 66 #include <sys/callb.h>
67 67 #include <sys/sunddi.h>
68 68 #include <sys/sunndi.h>
69 69 #include <sys/sdt.h>
70 70 #include <sys/ib/ibtl/ibti.h>
71 71 #include <rpc/rpc.h>
72 72 #include <rpc/ib.h>
73 73 #include <sys/modctl.h>
74 74 #include <sys/kstr.h>
75 75 #include <sys/sockio.h>
76 76 #include <sys/vnode.h>
77 77 #include <sys/tiuser.h>
78 78 #include <net/if.h>
79 79 #include <net/if_types.h>
80 80 #include <sys/cred.h>
81 81 #include <rpc/rpc_rdma.h>
82 82 #include <nfs/nfs.h>
83 83 #include <sys/atomic.h>
84 84
85 85 #define NFS_RDMA_PORT 20049
86 86
87 87
88 88 /*
89 89 * Convenience structures for connection management
90 90 */
91 91 typedef struct rpcib_ipaddrs {
92 92 void *ri_list; /* pointer to list of addresses */
93 93 uint_t ri_count; /* number of addresses in list */
94 94 uint_t ri_size; /* size of ri_list in bytes */
95 95 } rpcib_ipaddrs_t;
96 96
97 97
98 98 typedef struct rpcib_ping {
99 99 rib_hca_t *hca;
100 100 ibt_path_info_t path;
101 101 ibt_ip_addr_t srcip;
102 102 ibt_ip_addr_t dstip;
103 103 } rpcib_ping_t;
104 104
105 105 /*
106 106 * Prototype declarations for driver ops
107 107 */
108 108 static int rpcib_attach(dev_info_t *, ddi_attach_cmd_t);
109 109 static int rpcib_getinfo(dev_info_t *, ddi_info_cmd_t,
110 110 void *, void **);
111 111 static int rpcib_detach(dev_info_t *, ddi_detach_cmd_t);
112 112 static boolean_t rpcib_rdma_capable_interface(struct lifreq *);
113 113 static int rpcib_do_ip_ioctl(int, int, void *);
114 114 static boolean_t rpcib_get_ib_addresses(rpcib_ipaddrs_t *, rpcib_ipaddrs_t *);
115 115 static int rpcib_cache_kstat_update(kstat_t *, int);
116 116 static void rib_force_cleanup(void *);
117 117 static void rib_stop_hca_services(rib_hca_t *);
118 118 static void rib_attach_hca(void);
119 119 static int rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr,
120 120 struct netbuf *d_svcaddr, CONN **conn);
121 121
122 122 struct {
123 123 kstat_named_t cache_limit;
124 124 kstat_named_t cache_allocation;
125 125 kstat_named_t cache_hits;
126 126 kstat_named_t cache_misses;
127 127 kstat_named_t cache_misses_above_the_limit;
128 128 } rpcib_kstat = {
129 129 {"cache_limit", KSTAT_DATA_UINT64 },
130 130 {"cache_allocation", KSTAT_DATA_UINT64 },
131 131 {"cache_hits", KSTAT_DATA_UINT64 },
132 132 {"cache_misses", KSTAT_DATA_UINT64 },
133 133 {"cache_misses_above_the_limit", KSTAT_DATA_UINT64 },
134 134 };
135 135
136 136 /* rpcib cb_ops */
137 137 static struct cb_ops rpcib_cbops = {
138 138 nulldev, /* open */
139 139 nulldev, /* close */
140 140 nodev, /* strategy */
141 141 nodev, /* print */
142 142 nodev, /* dump */
143 143 nodev, /* read */
144 144 nodev, /* write */
145 145 nodev, /* ioctl */
146 146 nodev, /* devmap */
147 147 nodev, /* mmap */
148 148 nodev, /* segmap */
149 149 nochpoll, /* poll */
150 150 ddi_prop_op, /* prop_op */
151 151 NULL, /* stream */
152 152 D_MP, /* cb_flag */
153 153 CB_REV, /* rev */
154 154 nodev, /* int (*cb_aread)() */
155 155 nodev /* int (*cb_awrite)() */
156 156 };
157 157
158 158 /*
159 159 * Device options
160 160 */
161 161 static struct dev_ops rpcib_ops = {
162 162 DEVO_REV, /* devo_rev, */
163 163 0, /* refcnt */
164 164 rpcib_getinfo, /* info */
165 165 nulldev, /* identify */
166 166 nulldev, /* probe */
167 167 rpcib_attach, /* attach */
168 168 rpcib_detach, /* detach */
169 169 nodev, /* reset */
170 170 &rpcib_cbops, /* driver ops - devctl interfaces */
171 171 NULL, /* bus operations */
172 172 NULL, /* power */
173 173 ddi_quiesce_not_needed, /* quiesce */
174 174 };
175 175
176 176 /*
177 177 * Module linkage information.
[ 177 lines elided ]
178 178 */
179 179
180 180 static struct modldrv rib_modldrv = {
181 181 &mod_driverops, /* Driver module */
182 182 "RPCIB plugin driver", /* Driver name and version */
183 183 &rpcib_ops, /* Driver ops */
184 184 };
185 185
186 186 static struct modlinkage rib_modlinkage = {
187 187 MODREV_1,
188 - (void *)&rib_modldrv,
189 - NULL
188 + { (void *)&rib_modldrv, NULL }
190 189 };
191 190
192 191 typedef struct rib_lrc_entry {
193 192 struct rib_lrc_entry *forw;
194 193 struct rib_lrc_entry *back;
195 194 char *lrc_buf;
196 195
197 196 uint32_t lrc_len;
198 197 void *avl_node;
199 198 bool_t registered;
200 199
201 200 struct mrc lrc_mhandle;
202 201 bool_t lrc_on_freed_list;
203 202 } rib_lrc_entry_t;
204 203
205 204 typedef struct cache_struct {
206 205 rib_lrc_entry_t r;
207 206 uint32_t len;
208 207 uint32_t elements;
209 208 kmutex_t node_lock;
210 209 avl_node_t avl_link;
211 210 } cache_avl_struct_t;
212 211
213 212 uint64_t cache_limit = 100 * 1024 * 1024;
214 213 static uint64_t cache_watermark = 80 * 1024 * 1024;
215 214 static bool_t stats_enabled = FALSE;
216 215
217 216 static uint64_t max_unsignaled_rws = 5;
218 217 int nfs_rdma_port = NFS_RDMA_PORT;
219 218
220 219 #define RIBNETID_TCP "tcp"
221 220 #define RIBNETID_TCP6 "tcp6"
222 221
223 222 /*
224 223 * rib_stat: private data pointer used when registering
225 224 * with the IBTF. It is returned to the consumer
226 225 * in all callbacks.
227 226 */
228 227 static rpcib_state_t *rib_stat = NULL;
229 228
230 229 #define RNR_RETRIES IBT_RNR_RETRY_1
231 230 #define MAX_PORTS 2
232 231 #define RDMA_DUMMY_WRID 0x4D3A1D4D3A1D
233 232 #define RDMA_CONN_REAP_RETRY 10 /* 10 secs */
234 233
235 234 int preposted_rbufs = RDMA_BUFS_GRANT;
236 235 int send_threshold = 1;
237 236
238 237 /*
239 238 * Old cards with Tavor driver have limited memory footprint
240 239 * when booted in 32bit. The rib_max_rbufs tunable can be
241 240 * tuned for more buffers if needed.
242 241 */
243 242
244 243 #if !defined(_ELF64) && !defined(__sparc)
245 244 int rib_max_rbufs = MAX_BUFS;
246 245 #else
247 246 int rib_max_rbufs = 10 * MAX_BUFS;
248 247 #endif /* !(_ELF64) && !(__sparc) */
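As a usage sketch (assuming the standard /etc/system syntax for module tunables; the value shown is arbitrary), rib_max_rbufs could be raised with an entry such as

	set rpcib:rib_max_rbufs = 512

which takes effect on the next boot, when the send and receive buffer pools are sized from this value as each HCA is opened.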
249 248
250 249 int rib_conn_timeout = 60 * 12; /* 12 minutes */
251 250
252 251 /*
253 252 * State of the plugin.
254 253 * ACCEPT = accepting new connections and requests.
255 254 * NO_ACCEPT = not accepting new connection and requests.
256 255 * This should eventually move to rpcib_state_t structure, since this
257 256 * will tell in which state the plugin is for a particular type of service
258 257 * like NFS, NLM or v4 Callback daemon. The plugin might be in accept
259 258 * state for one and in no_accept state for the other.
260 259 */
261 260 int plugin_state;
262 261 kmutex_t plugin_state_lock;
263 262
264 263 ldi_ident_t rpcib_li;
265 264
266 265 /*
267 266 * RPCIB RDMATF operations
268 267 */
269 268 static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle);
270 269 static rdma_stat rib_disconnect(CONN *conn);
271 270 static void rib_listen(struct rdma_svc_data *rd);
272 271 static void rib_listen_stop(struct rdma_svc_data *rd);
273 272 static rdma_stat rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf,
274 273 uint_t buflen, struct mrc *buf_handle);
275 274 static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf,
276 275 struct mrc buf_handle);
277 276 static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp,
278 277 caddr_t buf, uint_t buflen, struct mrc *buf_handle);
279 278 static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf,
280 279 struct mrc buf_handle);
281 280 static rdma_stat rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf,
282 281 uint_t buflen, struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle,
283 282 void *lrc);
284 283 static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf,
285 284 struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *);
286 285 static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle,
287 286 caddr_t buf, int len, int cpu);
288 287
289 288 static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf);
290 289
291 290 static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf);
292 291 static void *rib_rbuf_alloc(CONN *, rdma_buf_t *);
293 292
294 293 static void rib_rbuf_free(CONN *conn, int ptype, void *buf);
295 294
296 295 static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid);
297 296 static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid);
298 297 static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid);
299 298 static rdma_stat rib_post_resp_remove(CONN *conn, uint32_t msgid);
300 299 static rdma_stat rib_post_recv(CONN *conn, struct clist *cl);
301 300 static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid);
302 301 static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait);
303 302 static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait);
304 303 static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rpcib_ping_t *);
305 304 static rdma_stat rib_conn_get(struct netbuf *, struct netbuf *,
306 305 int addr_type, void *, CONN **);
307 306 static rdma_stat rib_conn_release(CONN *conn);
308 307 static rdma_stat rib_connect(struct netbuf *, struct netbuf *, int,
309 308 rpcib_ping_t *, CONN **);
310 309 static rdma_stat rib_getinfo(rdma_info_t *info);
311 310
312 311 static rib_lrc_entry_t *rib_get_cache_buf(CONN *conn, uint32_t len);
313 312 static void rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *buf);
314 313 static void rib_destroy_cache(rib_hca_t *hca);
315 314 static void rib_server_side_cache_reclaim(void *argp);
316 315 static int avl_compare(const void *t1, const void *t2);
317 316
318 317 static void rib_stop_services(rib_hca_t *);
319 318 static void rib_close_channels(rib_conn_list_t *);
320 319 static void rib_conn_close(void *);
321 320 static void rib_recv_rele(rib_qp_t *);
322 321 static rdma_stat rib_conn_release_locked(CONN *conn);
323 322
324 323 /*
325 324 * RPCIB addressing operations
326 325 */
327 326
328 327 /*
329 328 * RDMA operations the RPCIB module exports
330 329 */
331 330 static rdmaops_t rib_ops = {
332 331 rib_reachable,
333 332 rib_conn_get,
334 333 rib_conn_release,
335 334 rib_listen,
336 335 rib_listen_stop,
337 336 rib_registermem,
338 337 rib_deregistermem,
339 338 rib_registermemsync,
340 339 rib_deregistermemsync,
341 340 rib_syncmem,
342 341 rib_reg_buf_alloc,
343 342 rib_reg_buf_free,
344 343 rib_send,
345 344 rib_send_resp,
346 345 rib_post_resp,
347 346 rib_post_resp_remove,
348 347 rib_post_recv,
349 348 rib_recv,
350 349 rib_read,
351 350 rib_write,
352 351 rib_getinfo,
353 352 };
354 353
355 354 /*
356 355 * RDMATF RPCIB plugin details
357 356 */
358 357 static rdma_mod_t rib_mod = {
359 358 "ibtf", /* api name */
360 359 RDMATF_VERS_1,
361 360 0,
362 361 &rib_ops, /* rdma op vector for ibtf */
363 362 };
364 363
365 364 static rdma_stat rpcib_open_hcas(rpcib_state_t *);
366 365 static rdma_stat rib_qp_init(rib_qp_t *, int);
367 366 static void rib_svc_scq_handler(ibt_cq_hdl_t, void *);
368 367 static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *);
369 368 static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *);
370 369 static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *);
371 370 static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num);
372 371 static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t adsp, caddr_t, uint_t,
373 372 ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_mr_desc_t *);
374 373 static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t,
375 374 ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t);
376 375 static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, rpcib_ping_t *);
377 376 static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *,
378 377 rib_qp_t **);
379 378 static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t,
380 379 rib_qp_t **);
381 380 static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *);
382 381 static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *);
383 382 static int rib_free_sendwait(struct send_wid *);
384 383 static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid);
385 384 static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd);
386 385 static void rdma_done_rem_list(rib_qp_t *);
387 386 static void rdma_done_notify(rib_qp_t *qp, uint32_t xid);
388 387
389 388 static void rib_async_handler(void *,
390 389 ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *);
391 390 static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *);
392 391 static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *);
393 392 static int rib_free_svc_recv(struct svc_recv *);
394 393 static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t);
395 394 static void rib_free_wid(struct recv_wid *);
396 395 static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *);
397 396 static void rib_detach_hca(ibt_hca_hdl_t);
398 397 static void rib_close_a_channel(CONN *);
399 398 static void rib_send_hold(rib_qp_t *);
400 399 static void rib_send_rele(rib_qp_t *);
401 400
402 401 /*
403 402 * Registration with IBTF as a consumer
404 403 */
405 404 static struct ibt_clnt_modinfo_s rib_modinfo = {
406 405 IBTI_V_CURR,
407 406 IBT_GENERIC,
408 407 rib_async_handler, /* async event handler */
409 408 NULL, /* Memory Region Handler */
410 409 "nfs/ib"
411 410 };
412 411
413 412 /*
414 413 * Global structure
415 414 */
416 415
417 416 typedef struct rpcib_s {
418 417 dev_info_t *rpcib_dip;
419 418 kmutex_t rpcib_mutex;
420 419 } rpcib_t;
421 420
422 421 rpcib_t rpcib;
423 422
424 423 /*
425 424 * /etc/system controlled variable to control
426 425 * debugging in rpcib kernel module.
427 426 * Set it to values greater than 1 to control
428 427 * the amount of debugging messages required.
429 428 */
430 429 int rib_debug = 0;
431 430
432 431 int
433 432 _init(void)
434 433 {
435 434 int error;
436 435
437 436 error = mod_install((struct modlinkage *)&rib_modlinkage);
438 437 if (error != 0) {
439 438 /*
440 439 * Could not load module
441 440 */
442 441 return (error);
443 442 }
444 443 mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL);
445 444 return (0);
446 445 }
447 446
448 447 int
449 448 _fini()
450 449 {
451 450 int status;
452 451
453 452 /*
454 453 * Remove module
455 454 */
456 455 if ((status = mod_remove(&rib_modlinkage)) != 0) {
457 456 return (status);
458 457 }
459 458 mutex_destroy(&plugin_state_lock);
460 459 return (0);
461 460 }
462 461
463 462 int
464 463 _info(struct modinfo *modinfop)
465 464 {
466 465 return (mod_info(&rib_modlinkage, modinfop));
467 466 }
468 467
469 468 /*
470 469 * rpcib_getinfo()
471 470 * Given the device number, return the devinfo pointer or the
472 471 * instance number.
473 472 * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach.
474 473 */
475 474
476 475 /*ARGSUSED*/
477 476 static int
478 477 rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
479 478 {
480 479 int ret = DDI_SUCCESS;
481 480
482 481 switch (cmd) {
483 482 case DDI_INFO_DEVT2DEVINFO:
484 483 if (rpcib.rpcib_dip != NULL)
485 484 *result = rpcib.rpcib_dip;
486 485 else {
487 486 *result = NULL;
488 487 ret = DDI_FAILURE;
489 488 }
490 489 break;
491 490
492 491 case DDI_INFO_DEVT2INSTANCE:
493 492 *result = NULL;
494 493 break;
495 494
496 495 default:
497 496 ret = DDI_FAILURE;
498 497 }
499 498 return (ret);
500 499 }
501 500
502 501 static void
503 502 rpcib_free_hca_list()
504 503 {
505 504 rib_hca_t *hca, *hcap;
506 505
507 506 rw_enter(&rib_stat->hcas_list_lock, RW_WRITER);
508 507 hca = rib_stat->hcas_list;
509 508 rib_stat->hcas_list = NULL;
510 509 rw_exit(&rib_stat->hcas_list_lock);
511 510 while (hca != NULL) {
512 511 rw_enter(&hca->state_lock, RW_WRITER);
513 512 hcap = hca;
514 513 hca = hca->next;
515 514 rib_stat->nhca_inited--;
516 515 rib_mod.rdma_count--;
517 516 hcap->state = HCA_DETACHED;
518 517 rw_exit(&hcap->state_lock);
519 518 rib_stop_hca_services(hcap);
520 519
521 520 kmem_free(hcap, sizeof (*hcap));
522 521 }
523 522 }
524 523
525 524 static rdma_stat
526 525 rpcib_free_service_list()
527 526 {
528 527 rib_service_t *service;
529 528 ibt_status_t ret;
530 529
531 530 rw_enter(&rib_stat->service_list_lock, RW_WRITER);
532 531 while (rib_stat->service_list != NULL) {
533 532 service = rib_stat->service_list;
534 533 ret = ibt_unbind_all_services(service->srv_hdl);
535 534 if (ret != IBT_SUCCESS) {
536 535 rw_exit(&rib_stat->service_list_lock);
537 536 #ifdef DEBUG
538 537 cmn_err(CE_NOTE, "rpcib_free_service_list: "
539 538 "ibt_unbind_all_services failed (%d)\n", (int)ret);
540 539 #endif
541 540 return (RDMA_FAILED);
542 541 }
543 542 ret = ibt_deregister_service(rib_stat->ibt_clnt_hdl,
544 543 service->srv_hdl);
545 544 if (ret != IBT_SUCCESS) {
546 545 rw_exit(&rib_stat->service_list_lock);
547 546 #ifdef DEBUG
548 547 cmn_err(CE_NOTE, "rpcib_free_service_list: "
549 548 "ibt_deregister_service failed (%d)\n", (int)ret);
550 549 #endif
551 550 return (RDMA_FAILED);
552 551 }
553 552 rib_stat->service_list = service->next;
554 553 kmem_free(service, sizeof (rib_service_t));
555 554 }
556 555 rw_exit(&rib_stat->service_list_lock);
557 556
558 557 return (RDMA_SUCCESS);
559 558 }
560 559
561 560 static int
562 561 rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
563 562 {
564 563 ibt_status_t ibt_status;
565 564 rdma_stat r_status;
566 565
567 566 switch (cmd) {
568 567 case DDI_ATTACH:
569 568 break;
570 569 case DDI_RESUME:
571 570 return (DDI_SUCCESS);
572 571 default:
573 572 return (DDI_FAILURE);
574 573 }
575 574
576 575 mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL);
577 576
578 577 mutex_enter(&rpcib.rpcib_mutex);
579 578 if (rpcib.rpcib_dip != NULL) {
580 579 mutex_exit(&rpcib.rpcib_mutex);
581 580 return (DDI_FAILURE);
582 581 }
583 582 rpcib.rpcib_dip = dip;
584 583 mutex_exit(&rpcib.rpcib_mutex);
585 584 /*
586 585 * Create the "rpcib" minor-node.
587 586 */
588 587 if (ddi_create_minor_node(dip,
589 588 "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) {
590 589 /* Error message, no cmn_err as they print on console */
591 590 return (DDI_FAILURE);
592 591 }
593 592
594 593 if (rib_stat == NULL) {
595 594 rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP);
596 595 mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL);
597 596 rw_init(&rib_stat->hcas_list_lock, NULL, RW_DRIVER, NULL);
598 597 mutex_init(&rib_stat->listen_lock, NULL, MUTEX_DRIVER, NULL);
599 598 }
600 599
601 600 rib_stat->hca_count = ibt_get_hca_list(NULL);
602 601 if (rib_stat->hca_count < 1) {
603 602 mutex_destroy(&rib_stat->listen_lock);
604 603 rw_destroy(&rib_stat->hcas_list_lock);
605 604 mutex_destroy(&rib_stat->open_hca_lock);
606 605 kmem_free(rib_stat, sizeof (*rib_stat));
607 606 rib_stat = NULL;
608 607 return (DDI_FAILURE);
609 608 }
610 609
611 610 ibt_status = ibt_attach(&rib_modinfo, dip,
612 611 (void *)rib_stat, &rib_stat->ibt_clnt_hdl);
613 612
614 613 if (ibt_status != IBT_SUCCESS) {
615 614 mutex_destroy(&rib_stat->listen_lock);
616 615 rw_destroy(&rib_stat->hcas_list_lock);
617 616 mutex_destroy(&rib_stat->open_hca_lock);
618 617 kmem_free(rib_stat, sizeof (*rib_stat));
619 618 rib_stat = NULL;
620 619 return (DDI_FAILURE);
621 620 }
622 621
623 622 rib_stat->service_list = NULL;
624 623 rw_init(&rib_stat->service_list_lock, NULL, RW_DRIVER, NULL);
625 624 mutex_enter(&rib_stat->open_hca_lock);
626 625 if (rpcib_open_hcas(rib_stat) != RDMA_SUCCESS) {
627 626 mutex_exit(&rib_stat->open_hca_lock);
628 627 goto open_fail;
629 628 }
630 629 mutex_exit(&rib_stat->open_hca_lock);
631 630
632 631 if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) !=
633 632 DDI_PROP_SUCCESS) {
634 633 cmn_err(CE_WARN, "rpcib_attach: ddi-no-autodetach prop update "
635 634 "failed.");
636 635 goto register_fail;
637 636 }
638 637
639 638 /*
640 639 * Register with rdmatf
641 640 */
642 641 r_status = rdma_register_mod(&rib_mod);
643 642 if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) {
644 643 cmn_err(CE_WARN, "rpcib_attach:rdma_register_mod failed, "
645 644 "status = %d", r_status);
646 645 goto register_fail;
647 646 }
648 647
649 648 return (DDI_SUCCESS);
650 649
651 650 register_fail:
652 651
653 652 open_fail:
654 653 (void) ibt_detach(rib_stat->ibt_clnt_hdl);
655 654 rpcib_free_hca_list();
656 655 (void) rpcib_free_service_list();
657 656 mutex_destroy(&rib_stat->listen_lock);
658 657 rw_destroy(&rib_stat->hcas_list_lock);
659 658 mutex_destroy(&rib_stat->open_hca_lock);
660 659 rw_destroy(&rib_stat->service_list_lock);
661 660 kmem_free(rib_stat, sizeof (*rib_stat));
662 661 rib_stat = NULL;
663 662 return (DDI_FAILURE);
664 663 }
665 664
666 665 /*ARGSUSED*/
667 666 static int
668 667 rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
669 668 {
670 669 switch (cmd) {
671 670
672 671 case DDI_DETACH:
673 672 break;
674 673
675 674 case DDI_SUSPEND:
676 675 default:
677 676 return (DDI_FAILURE);
678 677 }
679 678
680 679 /*
681 680 * Detach the hca and free resources
682 681 */
683 682 mutex_enter(&plugin_state_lock);
684 683 plugin_state = NO_ACCEPT;
685 684 mutex_exit(&plugin_state_lock);
686 685
687 686 if (rpcib_free_service_list() != RDMA_SUCCESS)
688 687 return (DDI_FAILURE);
689 688 rpcib_free_hca_list();
690 689
691 690 (void) ibt_detach(rib_stat->ibt_clnt_hdl);
692 691 mutex_destroy(&rib_stat->listen_lock);
693 692 rw_destroy(&rib_stat->hcas_list_lock);
694 693 mutex_destroy(&rib_stat->open_hca_lock);
695 694 rw_destroy(&rib_stat->service_list_lock);
696 695
697 696 kmem_free(rib_stat, sizeof (*rib_stat));
698 697 rib_stat = NULL;
699 698
700 699 mutex_enter(&rpcib.rpcib_mutex);
701 700 rpcib.rpcib_dip = NULL;
702 701 mutex_exit(&rpcib.rpcib_mutex);
703 702 mutex_destroy(&rpcib.rpcib_mutex);
704 703 return (DDI_SUCCESS);
705 704 }
706 705
707 706
708 707 static void rib_rbufpool_free(rib_hca_t *, int);
709 708 static void rib_rbufpool_deregister(rib_hca_t *, int);
710 709 static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype);
711 710 static struct reply *rib_addreplylist(rib_qp_t *, uint32_t);
712 711 static rdma_stat rib_rem_replylist(rib_qp_t *);
713 712 static int rib_remreply(rib_qp_t *, struct reply *);
714 713 static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *);
715 714 static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *);
716 715
717 716
718 717 /*
719 718 * One CQ pair per HCA
720 719 */
721 720 static rdma_stat
722 721 rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler,
723 722 rib_cq_t **cqp)
724 723 {
725 724 rib_cq_t *cq;
726 725 ibt_cq_attr_t cq_attr;
727 726 uint32_t real_size;
728 727 ibt_status_t status;
729 728 rdma_stat error = RDMA_SUCCESS;
730 729
731 730 cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP);
732 731 cq->rib_hca = hca;
733 732 bzero(&cq_attr, sizeof (cq_attr));
734 733 cq_attr.cq_size = cq_size;
735 734 cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
736 735 status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl,
737 736 &real_size);
738 737 if (status != IBT_SUCCESS) {
739 738 cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed,"
740 739 " status=%d", status);
741 740 error = RDMA_FAILED;
742 741 goto fail;
743 742 }
744 743 ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, hca);
745 744
746 745 /*
747 746 * Enable CQ callbacks. CQ Callbacks are single shot
748 747 * (e.g. you have to call ibt_enable_cq_notify()
749 748 * after each callback to get another one).
750 749 */
751 750 status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION);
752 751 if (status != IBT_SUCCESS) {
753 752 cmn_err(CE_WARN, "rib_create_cq: "
754 753 "enable_cq_notify failed, status %d", status);
755 754 error = RDMA_FAILED;
756 755 goto fail;
757 756 }
758 757 *cqp = cq;
759 758
760 759 return (error);
761 760 fail:
762 761 if (cq->rib_cq_hdl)
763 762 (void) ibt_free_cq(cq->rib_cq_hdl);
764 763 if (cq)
765 764 kmem_free(cq, sizeof (rib_cq_t));
766 765 return (error);
767 766 }
768 767
769 768 /*
770 769 * rpcib_find_hca
771 770 *
772 771 * Caller should have already locked the hcas_lock before calling
773 772 * this function.
774 773 */
775 774 static rib_hca_t *
776 775 rpcib_find_hca(rpcib_state_t *ribstat, ib_guid_t guid)
777 776 {
778 777 rib_hca_t *hca = ribstat->hcas_list;
779 778
780 779 while (hca && hca->hca_guid != guid)
781 780 hca = hca->next;
782 781
783 782 return (hca);
784 783 }
785 784
786 785 static rdma_stat
787 786 rpcib_open_hcas(rpcib_state_t *ribstat)
788 787 {
789 788 rib_hca_t *hca;
790 789 ibt_status_t ibt_status;
791 790 rdma_stat status;
792 791 ibt_hca_portinfo_t *pinfop;
793 792 ibt_pd_flags_t pd_flags = IBT_PD_NO_FLAGS;
794 793 uint_t size, cq_size;
795 794 int i;
796 795 kstat_t *ksp;
797 796 cache_avl_struct_t example_avl_node;
798 797 char rssc_name[32];
799 798 int old_nhca_inited = ribstat->nhca_inited;
800 799 ib_guid_t *hca_guids;
801 800
802 801 ASSERT(MUTEX_HELD(&ribstat->open_hca_lock));
803 802
804 803 ribstat->hca_count = ibt_get_hca_list(&hca_guids);
805 804 if (ribstat->hca_count == 0)
806 805 return (RDMA_FAILED);
807 806
808 807 rw_enter(&ribstat->hcas_list_lock, RW_WRITER);
809 808 /*
810 809 * Open a hca and setup for RDMA
811 810 */
812 811 for (i = 0; i < ribstat->hca_count; i++) {
813 812 if (rpcib_find_hca(ribstat, hca_guids[i]))
814 813 continue;
815 814 hca = kmem_zalloc(sizeof (rib_hca_t), KM_SLEEP);
816 815
817 816 ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl,
818 817 hca_guids[i], &hca->hca_hdl);
819 818 if (ibt_status != IBT_SUCCESS) {
820 819 kmem_free(hca, sizeof (rib_hca_t));
821 820 continue;
822 821 }
823 822 hca->hca_guid = hca_guids[i];
824 823 hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl;
825 824 hca->state = HCA_INITED;
826 825
827 826 /*
828 827 * query HCA info
829 828 */
830 829 ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs);
831 830 if (ibt_status != IBT_SUCCESS) {
832 831 goto fail1;
833 832 }
834 833
835 834 /*
836 835 * One PD (Protection Domain) per HCA.
837 836 * A qp is allowed to access a memory region
838 837 * only when it's in the same PD as that of
839 838 * the memory region.
840 839 */
841 840 ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl);
842 841 if (ibt_status != IBT_SUCCESS) {
843 842 goto fail1;
844 843 }
845 844
846 845 /*
847 846 * query HCA ports
848 847 */
849 848 ibt_status = ibt_query_hca_ports(hca->hca_hdl,
850 849 0, &pinfop, &hca->hca_nports, &size);
851 850 if (ibt_status != IBT_SUCCESS) {
852 851 goto fail2;
853 852 }
854 853 hca->hca_ports = pinfop;
855 854 hca->hca_pinfosz = size;
856 855 pinfop = NULL;
857 856
858 857 cq_size = DEF_CQ_SIZE; /* default cq size */
859 858 /*
860 859 * Create 2 pairs of cq's (1 pair for client
861 860 * and the other pair for server) on this hca.
862 861 * If number of qp's gets too large, then several
863 862 * cq's will be needed.
864 863 */
865 864 status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler,
866 865 &hca->svc_rcq);
867 866 if (status != RDMA_SUCCESS) {
868 867 goto fail3;
869 868 }
870 869
871 870 status = rib_create_cq(hca, cq_size, rib_svc_scq_handler,
872 871 &hca->svc_scq);
873 872 if (status != RDMA_SUCCESS) {
874 873 goto fail3;
875 874 }
876 875
877 876 status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler,
878 877 &hca->clnt_rcq);
879 878 if (status != RDMA_SUCCESS) {
880 879 goto fail3;
881 880 }
882 881
883 882 status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler,
884 883 &hca->clnt_scq);
885 884 if (status != RDMA_SUCCESS) {
886 885 goto fail3;
887 886 }
888 887
889 888 /*
890 889 * Create buffer pools.
891 890 * Note rib_rbuf_create also allocates memory windows.
892 891 */
893 892 hca->recv_pool = rib_rbufpool_create(hca,
894 893 RECV_BUFFER, rib_max_rbufs);
895 894 if (hca->recv_pool == NULL) {
896 895 goto fail3;
897 896 }
898 897
899 898 hca->send_pool = rib_rbufpool_create(hca,
900 899 SEND_BUFFER, rib_max_rbufs);
901 900 if (hca->send_pool == NULL) {
902 901 rib_rbufpool_destroy(hca, RECV_BUFFER);
903 902 goto fail3;
904 903 }
905 904
906 905 if (hca->server_side_cache == NULL) {
907 906 (void) sprintf(rssc_name,
908 907 "rib_srvr_cache_%llx",
909 908 (long long unsigned int) hca->hca_guid);
910 909 hca->server_side_cache = kmem_cache_create(
911 910 rssc_name,
912 911 sizeof (cache_avl_struct_t), 0,
913 912 NULL,
914 913 NULL,
915 914 rib_server_side_cache_reclaim,
916 915 hca, NULL, 0);
917 916 }
918 917
919 918 avl_create(&hca->avl_tree,
920 919 avl_compare,
921 920 sizeof (cache_avl_struct_t),
922 921 (uint_t)(uintptr_t)&example_avl_node.avl_link-
923 922 (uint_t)(uintptr_t)&example_avl_node);
924 923
925 924 rw_init(&hca->bound_services_lock, NULL, RW_DRIVER,
926 925 hca->iblock);
927 926 rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock);
928 927 rw_init(&hca->avl_rw_lock,
929 928 NULL, RW_DRIVER, hca->iblock);
930 929 mutex_init(&hca->cache_allocation_lock,
931 930 NULL, MUTEX_DRIVER, NULL);
932 931 hca->avl_init = TRUE;
933 932
934 933 /* Create kstats for the cache */
935 934 ASSERT(INGLOBALZONE(curproc));
936 935
937 936 if (!stats_enabled) {
938 937 ksp = kstat_create_zone("unix", 0, "rpcib_cache", "rpc",
939 938 KSTAT_TYPE_NAMED,
940 939 sizeof (rpcib_kstat) / sizeof (kstat_named_t),
941 940 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE,
942 941 GLOBAL_ZONEID);
943 942 if (ksp) {
944 943 ksp->ks_data = (void *) &rpcib_kstat;
945 944 ksp->ks_update = rpcib_cache_kstat_update;
946 945 kstat_install(ksp);
947 946 stats_enabled = TRUE;
948 947 }
949 948 }
950 949 if (hca->cleanup_helper == NULL) {
951 950 char tq_name[sizeof (hca->hca_guid) * 2 + 1];
952 951
953 952 (void) snprintf(tq_name, sizeof (tq_name), "%llX",
954 953 (unsigned long long int) hca->hca_guid);
955 954 hca->cleanup_helper = ddi_taskq_create(NULL,
956 955 tq_name, 1, TASKQ_DEFAULTPRI, 0);
957 956 }
958 957
959 958 mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
960 959 cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL);
961 960 rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER,
962 961 hca->iblock);
963 962 rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER,
964 963 hca->iblock);
965 964 mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock);
966 965 hca->inuse = TRUE;
967 966
968 967 hca->next = ribstat->hcas_list;
969 968 ribstat->hcas_list = hca;
970 969 ribstat->nhca_inited++;
971 970 ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
972 971 continue;
973 972
974 973 fail3:
975 974 ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
976 975 fail2:
977 976 (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
978 977 fail1:
979 978 (void) ibt_close_hca(hca->hca_hdl);
980 979 kmem_free(hca, sizeof (rib_hca_t));
981 980 }
982 981 rw_exit(&ribstat->hcas_list_lock);
983 982 ibt_free_hca_list(hca_guids, ribstat->hca_count);
984 983 rib_mod.rdma_count = rib_stat->nhca_inited;
985 984
986 985 /*
987 986 * return success if at least one new hca has been configured.
988 987 */
989 988 if (ribstat->nhca_inited != old_nhca_inited)
990 989 return (RDMA_SUCCESS);
991 990 else
992 991 return (RDMA_FAILED);
993 992 }
994 993
995 994 /*
996 995 * Callback routines
997 996 */
998 997
999 998 /*
1000 999 * SCQ handlers
1001 1000 */
1002 1001 /* ARGSUSED */
1003 1002 static void
1004 1003 rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1005 1004 {
1006 1005 ibt_status_t ibt_status;
1007 1006 ibt_wc_t wc;
1008 1007 struct send_wid *wd;
1009 1008 CONN *conn;
1010 1009 rib_qp_t *qp;
1011 1010 int i;
1012 1011
1013 1012 /*
1014 1013 * Re-enable cq notify here to avoid missing any
1015 1014 * completion queue notification.
1016 1015 */
1017 1016 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1018 1017
1019 1018 ibt_status = IBT_SUCCESS;
1020 1019 while (ibt_status != IBT_CQ_EMPTY) {
1021 1020 bzero(&wc, sizeof (wc));
1022 1021 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1023 1022 if (ibt_status != IBT_SUCCESS)
1024 1023 return;
1025 1024
1026 1025 /*
1027 1026 * Got a send completion
1028 1027 */
1029 1028 if (wc.wc_id != RDMA_DUMMY_WRID) {
1030 1029 wd = (struct send_wid *)(uintptr_t)wc.wc_id;
1031 1030 qp = wd->qp;
1032 1031 conn = qptoc(qp);
1033 1032
1034 1033 mutex_enter(&wd->sendwait_lock);
1035 1034 switch (wc.wc_status) {
1036 1035 case IBT_WC_SUCCESS:
1037 1036 wd->status = RDMA_SUCCESS;
1038 1037 break;
1039 1038 default:
1040 1039 /*
1041 1040 * RC Send Q Error Code Local state Remote State
1042 1041 * ==================== =========== ============
1043 1042 * IBT_WC_BAD_RESPONSE_ERR ERROR None
1044 1043 * IBT_WC_LOCAL_LEN_ERR ERROR None
1045 1044 * IBT_WC_LOCAL_CHAN_OP_ERR ERROR None
1046 1045 * IBT_WC_LOCAL_PROTECT_ERR ERROR None
1047 1046 * IBT_WC_MEM_WIN_BIND_ERR ERROR None
1048 1047 * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR ERROR
1049 1048 * IBT_WC_REMOTE_ACCESS_ERR ERROR ERROR
1050 1049 * IBT_WC_REMOTE_OP_ERR ERROR ERROR
1051 1050 * IBT_WC_RNR_NAK_TIMEOUT_ERR ERROR None
1052 1051 * IBT_WC_TRANS_TIMEOUT_ERR ERROR None
1053 1052 * IBT_WC_WR_FLUSHED_ERR ERROR None
1054 1053 */
1055 1054 /*
1056 1055 * Channel in error state. Set connection to
1057 1056 * ERROR and cleanup will happen either from
1058 1057 * conn_release or from rib_conn_get
1059 1058 */
1060 1059 wd->status = RDMA_FAILED;
1061 1060 mutex_enter(&conn->c_lock);
1062 1061 if (conn->c_state != C_DISCONN_PEND)
1063 1062 conn->c_state = C_ERROR_CONN;
1064 1063 mutex_exit(&conn->c_lock);
1065 1064 break;
1066 1065 }
1067 1066
1068 1067 if (wd->cv_sig == 1) {
1069 1068 /*
1070 1069 * Notify poster
1071 1070 */
1072 1071 cv_signal(&wd->wait_cv);
1073 1072 mutex_exit(&wd->sendwait_lock);
1074 1073 } else {
1075 1074 /*
1076 1075 * Poster not waiting for notification.
1077 1076 * Free the send buffers and send_wid
1078 1077 */
1079 1078 for (i = 0; i < wd->nsbufs; i++) {
1080 1079 rib_rbuf_free(qptoc(wd->qp),
1081 1080 SEND_BUFFER,
1082 1081 (void *)(uintptr_t)wd->sbufaddr[i]);
1083 1082 }
1084 1083
1085 1084 /* decrement the send ref count */
1086 1085 rib_send_rele(qp);
1087 1086
1088 1087 mutex_exit(&wd->sendwait_lock);
1089 1088 (void) rib_free_sendwait(wd);
1090 1089 }
1091 1090 }
1092 1091 }
1093 1092 }
1094 1093
1095 1094 /* ARGSUSED */
1096 1095 static void
1097 1096 rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1098 1097 {
1099 1098 ibt_status_t ibt_status;
1100 1099 ibt_wc_t wc;
1101 1100 struct send_wid *wd;
1102 1101 rib_qp_t *qp;
1103 1102 CONN *conn;
1104 1103 int i;
1105 1104
1106 1105 /*
1107 1106 * Re-enable cq notify here to avoid missing any
1108 1107 * completion queue notification.
1109 1108 */
1110 1109 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1111 1110
1112 1111 ibt_status = IBT_SUCCESS;
1113 1112 while (ibt_status != IBT_CQ_EMPTY) {
1114 1113 bzero(&wc, sizeof (wc));
1115 1114 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1116 1115 if (ibt_status != IBT_SUCCESS)
1117 1116 return;
1118 1117
1119 1118 /*
1120 1119 * Got a send completion
1121 1120 */
1122 1121 if (wc.wc_id != RDMA_DUMMY_WRID) {
1123 1122 wd = (struct send_wid *)(uintptr_t)wc.wc_id;
1124 1123 qp = wd->qp;
1125 1124 conn = qptoc(qp);
1126 1125 mutex_enter(&wd->sendwait_lock);
1127 1126
1128 1127 switch (wc.wc_status) {
1129 1128 case IBT_WC_SUCCESS:
1130 1129 wd->status = RDMA_SUCCESS;
1131 1130 break;
1132 1131 default:
1133 1132 /*
1134 1133 * Channel in error state. Set connection to
1135 1134 * ERROR and cleanup will happen either from
1136 1135 * conn_release or conn timeout.
1137 1136 */
1138 1137 wd->status = RDMA_FAILED;
1139 1138 mutex_enter(&conn->c_lock);
1140 1139 if (conn->c_state != C_DISCONN_PEND)
1141 1140 conn->c_state = C_ERROR_CONN;
1142 1141 mutex_exit(&conn->c_lock);
1143 1142 break;
1144 1143 }
1145 1144
1146 1145 if (wd->cv_sig == 1) {
1147 1146 /*
1148 1147 * Update completion status and notify poster
1149 1148 */
1150 1149 cv_signal(&wd->wait_cv);
1151 1150 mutex_exit(&wd->sendwait_lock);
1152 1151 } else {
1153 1152 /*
1154 1153 * Poster not waiting for notification.
1155 1154 * Free the send buffers and send_wid
1156 1155 */
1157 1156 for (i = 0; i < wd->nsbufs; i++) {
1158 1157 rib_rbuf_free(qptoc(wd->qp),
1159 1158 SEND_BUFFER,
1160 1159 (void *)(uintptr_t)wd->sbufaddr[i]);
1161 1160 }
1162 1161
1163 1162 /* decrement the send ref count */
1164 1163 rib_send_rele(qp);
1165 1164
1166 1165 mutex_exit(&wd->sendwait_lock);
1167 1166 (void) rib_free_sendwait(wd);
1168 1167 }
1169 1168 }
1170 1169 }
1171 1170 }
1172 1171
1173 1172 /*
1174 1173 * RCQ handler
1175 1174 */
1176 1175 /* ARGSUSED */
1177 1176 static void
1178 1177 rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1179 1178 {
1180 1179 rib_qp_t *qp;
1181 1180 ibt_status_t ibt_status;
1182 1181 ibt_wc_t wc;
1183 1182 struct recv_wid *rwid;
1184 1183
1185 1184 /*
1186 1185 * Re-enable cq notify here to avoid missing any
1187 1186 * completion queue notification.
1188 1187 */
1189 1188 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1190 1189
1191 1190 ibt_status = IBT_SUCCESS;
1192 1191 while (ibt_status != IBT_CQ_EMPTY) {
1193 1192 bzero(&wc, sizeof (wc));
1194 1193 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1195 1194 if (ibt_status != IBT_SUCCESS)
1196 1195 return;
1197 1196
1198 1197 rwid = (struct recv_wid *)(uintptr_t)wc.wc_id;
1199 1198 qp = rwid->qp;
1200 1199
1201 1200 if (wc.wc_status == IBT_WC_SUCCESS) {
1202 1201 XDR inxdrs, *xdrs;
1203 1202 uint_t xid, vers, op, find_xid = 0;
1204 1203 struct reply *r;
1205 1204 CONN *conn = qptoc(qp);
1206 1205 uint32_t rdma_credit = 0;
1207 1206
1208 1207 xdrs = &inxdrs;
1209 1208 xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr,
1210 1209 wc.wc_bytes_xfer, XDR_DECODE);
1211 1210 /*
1212 1211 * Treat xid as opaque (xid is the first entity
1213 1212 * in the rpc rdma message).
1214 1213 */
1215 1214 xid = *(uint32_t *)(uintptr_t)rwid->addr;
1216 1215
1217 1216 /* Skip xid and set the xdr position accordingly. */
1218 1217 XDR_SETPOS(xdrs, sizeof (uint32_t));
1219 1218 (void) xdr_u_int(xdrs, &vers);
1220 1219 (void) xdr_u_int(xdrs, &rdma_credit);
1221 1220 (void) xdr_u_int(xdrs, &op);
1222 1221 XDR_DESTROY(xdrs);
1223 1222
1224 1223 if (vers != RPCRDMA_VERS) {
1225 1224 /*
1226 1225 * Invalid RPC/RDMA version. Cannot
1227 1226 * interoperate. Set connection to
1228 1227 * ERROR state and bail out.
1229 1228 */
1230 1229 mutex_enter(&conn->c_lock);
1231 1230 if (conn->c_state != C_DISCONN_PEND)
1232 1231 conn->c_state = C_ERROR_CONN;
1233 1232 mutex_exit(&conn->c_lock);
1234 1233 rib_rbuf_free(conn, RECV_BUFFER,
1235 1234 (void *)(uintptr_t)rwid->addr);
1236 1235 rib_free_wid(rwid);
1237 1236 rib_recv_rele(qp);
1238 1237 continue;
1239 1238 }
1240 1239
1241 1240 mutex_enter(&qp->replylist_lock);
1242 1241 for (r = qp->replylist; r != NULL; r = r->next) {
1243 1242 if (r->xid == xid) {
1244 1243 find_xid = 1;
1245 1244 switch (op) {
1246 1245 case RDMA_MSG:
1247 1246 case RDMA_NOMSG:
1248 1247 case RDMA_MSGP:
1249 1248 r->status = RDMA_SUCCESS;
1250 1249 r->vaddr_cq = rwid->addr;
1251 1250 r->bytes_xfer =
1252 1251 wc.wc_bytes_xfer;
1253 1252 cv_signal(&r->wait_cv);
1254 1253 break;
1255 1254 default:
1256 1255 rib_rbuf_free(qptoc(qp),
1257 1256 RECV_BUFFER,
1258 1257 (void *)(uintptr_t)
1259 1258 rwid->addr);
1260 1259 break;
1261 1260 }
1262 1261 break;
1263 1262 }
1264 1263 }
1265 1264 mutex_exit(&qp->replylist_lock);
1266 1265 if (find_xid == 0) {
1267 1266 /* RPC caller not waiting for reply */
1268 1267
1269 1268 DTRACE_PROBE1(rpcib__i__nomatchxid1,
1270 1269 int, xid);
1271 1270
1272 1271 rib_rbuf_free(qptoc(qp), RECV_BUFFER,
1273 1272 (void *)(uintptr_t)rwid->addr);
1274 1273 }
1275 1274 } else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) {
1276 1275 CONN *conn = qptoc(qp);
1277 1276
1278 1277 /*
1279 1278 * Connection being flushed. Just free
1280 1279 * the posted buffer
1281 1280 */
1282 1281 rib_rbuf_free(conn, RECV_BUFFER,
1283 1282 (void *)(uintptr_t)rwid->addr);
1284 1283 } else {
1285 1284 CONN *conn = qptoc(qp);
1286 1285 /*
1287 1286 * RC Recv Q Error Code Local state Remote State
1288 1287 * ==================== =========== ============
1289 1288 * IBT_WC_LOCAL_ACCESS_ERR ERROR ERROR when NAK recvd
1290 1289 * IBT_WC_LOCAL_LEN_ERR ERROR ERROR when NAK recvd
1291 1290 * IBT_WC_LOCAL_PROTECT_ERR ERROR ERROR when NAK recvd
1292 1291 * IBT_WC_LOCAL_CHAN_OP_ERR ERROR ERROR when NAK recvd
1293 1292 * IBT_WC_REMOTE_INVALID_REQ_ERR ERROR ERROR when NAK recvd
1294 1293 * IBT_WC_WR_FLUSHED_ERR None None
1295 1294 */
1296 1295 /*
1297 1296 * Channel in error state. Set connection
1298 1297 * in ERROR state.
1299 1298 */
1300 1299 mutex_enter(&conn->c_lock);
1301 1300 if (conn->c_state != C_DISCONN_PEND)
1302 1301 conn->c_state = C_ERROR_CONN;
1303 1302 mutex_exit(&conn->c_lock);
1304 1303 rib_rbuf_free(conn, RECV_BUFFER,
1305 1304 (void *)(uintptr_t)rwid->addr);
1306 1305 }
1307 1306 rib_free_wid(rwid);
1308 1307 rib_recv_rele(qp);
1309 1308 }
1310 1309 }
1311 1310
1312 1311 /* Server side */
1313 1312 /* ARGSUSED */
1314 1313 static void
1315 1314 rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1316 1315 {
1317 1316 rdma_recv_data_t *rdp;
1318 1317 rib_qp_t *qp;
1319 1318 ibt_status_t ibt_status;
1320 1319 ibt_wc_t wc;
1321 1320 struct svc_recv *s_recvp;
1322 1321 CONN *conn;
1323 1322 mblk_t *mp;
1324 1323
1325 1324 /*
1326 1325 * Re-enable cq notify here to avoid missing any
1327 1326 * completion queue notification.
1328 1327 */
1329 1328 (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1330 1329
1331 1330 ibt_status = IBT_SUCCESS;
1332 1331 while (ibt_status != IBT_CQ_EMPTY) {
1333 1332 bzero(&wc, sizeof (wc));
1334 1333 ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1335 1334 if (ibt_status != IBT_SUCCESS)
1336 1335 return;
1337 1336
1338 1337 s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id;
1339 1338 qp = s_recvp->qp;
1340 1339 conn = qptoc(qp);
1341 1340
1342 1341 if (wc.wc_status == IBT_WC_SUCCESS) {
1343 1342 XDR inxdrs, *xdrs;
1344 1343 uint_t xid, vers, op;
1345 1344 uint32_t rdma_credit;
1346 1345
1347 1346 xdrs = &inxdrs;
1348 1347 /* s_recvp->vaddr stores data */
1349 1348 xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr,
1350 1349 wc.wc_bytes_xfer, XDR_DECODE);
1351 1350
1352 1351 /*
1353 1352 * Treat xid as opaque (xid is the first entity
1354 1353 * in the rpc rdma message).
1355 1354 */
1356 1355 xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr;
1357 1356 /* Skip xid and set the xdr position accordingly. */
1358 1357 XDR_SETPOS(xdrs, sizeof (uint32_t));
1359 1358 if (!xdr_u_int(xdrs, &vers) ||
1360 1359 !xdr_u_int(xdrs, &rdma_credit) ||
1361 1360 !xdr_u_int(xdrs, &op)) {
1362 1361 rib_rbuf_free(conn, RECV_BUFFER,
1363 1362 (void *)(uintptr_t)s_recvp->vaddr);
1364 1363 XDR_DESTROY(xdrs);
1365 1364 rib_recv_rele(qp);
1366 1365 (void) rib_free_svc_recv(s_recvp);
1367 1366 continue;
1368 1367 }
1369 1368 XDR_DESTROY(xdrs);
1370 1369
1371 1370 if (vers != RPCRDMA_VERS) {
1372 1371 /*
1373 1372 * Invalid RPC/RDMA version.
1374 1373 * Drop rpc rdma message.
1375 1374 */
1376 1375 rib_rbuf_free(conn, RECV_BUFFER,
1377 1376 (void *)(uintptr_t)s_recvp->vaddr);
1378 1377 rib_recv_rele(qp);
1379 1378 (void) rib_free_svc_recv(s_recvp);
1380 1379 continue;
1381 1380 }
1382 1381 /*
1383 1382 * Is this for RDMA_DONE?
1384 1383 */
1385 1384 if (op == RDMA_DONE) {
1386 1385 rib_rbuf_free(conn, RECV_BUFFER,
1387 1386 (void *)(uintptr_t)s_recvp->vaddr);
1388 1387 /*
1389 1388 * Wake up the thread waiting on
1390 1389 * a RDMA_DONE for xid
1391 1390 */
1392 1391 mutex_enter(&qp->rdlist_lock);
1393 1392 rdma_done_notify(qp, xid);
1394 1393 mutex_exit(&qp->rdlist_lock);
1395 1394 rib_recv_rele(qp);
1396 1395 (void) rib_free_svc_recv(s_recvp);
1397 1396 continue;
1398 1397 }
1399 1398
1400 1399 mutex_enter(&plugin_state_lock);
1401 1400 mutex_enter(&conn->c_lock);
1402 1401 if ((plugin_state == ACCEPT) &&
1403 1402 (conn->c_state == C_CONNECTED)) {
1404 1403 conn->c_ref++;
1405 1404 mutex_exit(&conn->c_lock);
1406 1405 while ((mp = allocb(sizeof (*rdp), BPRI_LO))
1407 1406 == NULL)
1408 1407 (void) strwaitbuf(
1409 1408 sizeof (*rdp), BPRI_LO);
1410 1409 /*
1411 1410 * Plugin is in accept state, hence the master
1412 1411 * transport queue for this is still accepting
1413 1412 * requests. Hence we can call svc_queuereq to
1414 1413 * queue this received msg.
1415 1414 */
1416 1415 rdp = (rdma_recv_data_t *)mp->b_rptr;
1417 1416 rdp->conn = conn;
1418 1417 rdp->rpcmsg.addr =
1419 1418 (caddr_t)(uintptr_t)s_recvp->vaddr;
1420 1419 rdp->rpcmsg.type = RECV_BUFFER;
1421 1420 rdp->rpcmsg.len = wc.wc_bytes_xfer;
1422 1421 rdp->status = wc.wc_status;
1423 1422 mp->b_wptr += sizeof (*rdp);
1424 1423 (void) svc_queuereq((queue_t *)rib_stat->q, mp,
1425 1424 FALSE);
1426 1425 mutex_exit(&plugin_state_lock);
1427 1426 } else {
1428 1427 /*
1429 1428 * The master transport for this is going
1430 1429 * away and the queue is not accepting any more
1431 1430 * requests for krpc, so don't do anything, just
1432 1431 * free the msg.
1433 1432 */
1434 1433 mutex_exit(&conn->c_lock);
1435 1434 mutex_exit(&plugin_state_lock);
1436 1435 rib_rbuf_free(conn, RECV_BUFFER,
1437 1436 (void *)(uintptr_t)s_recvp->vaddr);
1438 1437 }
1439 1438 } else {
1440 1439 rib_rbuf_free(conn, RECV_BUFFER,
1441 1440 (void *)(uintptr_t)s_recvp->vaddr);
1442 1441 }
1443 1442 rib_recv_rele(qp);
1444 1443 (void) rib_free_svc_recv(s_recvp);
1445 1444 }
1446 1445 }
1447 1446
1448 1447 static void
1449 1448 rib_attach_hca()
1450 1449 {
1451 1450 mutex_enter(&rib_stat->open_hca_lock);
1452 1451 (void) rpcib_open_hcas(rib_stat);
1453 1452 rib_listen(NULL);
1454 1453 mutex_exit(&rib_stat->open_hca_lock);
1455 1454 }
1456 1455
1457 1456 /*
1458 1457 * Handles DR event of IBT_HCA_DETACH_EVENT.
1459 1458 */
1460 1459 /* ARGSUSED */
1461 1460 static void
1462 1461 rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
1463 1462 ibt_async_code_t code, ibt_async_event_t *event)
1464 1463 {
1465 1464 switch (code) {
1466 1465 case IBT_HCA_ATTACH_EVENT:
1467 1466 rib_attach_hca();
1468 1467 break;
1469 1468 case IBT_HCA_DETACH_EVENT:
1470 1469 rib_detach_hca(hca_hdl);
1471 1470 #ifdef DEBUG
1472 1471 cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n");
1473 1472 #endif
1474 1473 break;
1475 1474 case IBT_EVENT_PORT_UP:
1476 1475 /*
1477 1476 * A port is up. We should call rib_listen() since there is
1478 1477 * a chance that rib_listen() may have failed during
1479 1478 * rib_attach_hca() because the port had not been up yet.
1480 1479 */
1481 1480 rib_listen(NULL);
1482 1481 #ifdef DEBUG
1483 1482 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n");
1484 1483 #endif
1485 1484 break;
1486 1485 #ifdef DEBUG
1487 1486 case IBT_EVENT_PATH_MIGRATED:
1488 1487 cmn_err(CE_NOTE, "rib_async_handler(): "
1489 1488 "IBT_EVENT_PATH_MIGRATED\n");
1490 1489 break;
1491 1490 case IBT_EVENT_SQD:
1492 1491 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n");
1493 1492 break;
1494 1493 case IBT_EVENT_COM_EST:
1495 1494 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n");
1496 1495 break;
1497 1496 case IBT_ERROR_CATASTROPHIC_CHAN:
1498 1497 cmn_err(CE_NOTE, "rib_async_handler(): "
1499 1498 "IBT_ERROR_CATASTROPHIC_CHAN\n");
1500 1499 break;
1501 1500 case IBT_ERROR_INVALID_REQUEST_CHAN:
1502 1501 cmn_err(CE_NOTE, "rib_async_handler(): "
1503 1502 "IBT_ERROR_INVALID_REQUEST_CHAN\n");
1504 1503 break;
1505 1504 case IBT_ERROR_ACCESS_VIOLATION_CHAN:
1506 1505 cmn_err(CE_NOTE, "rib_async_handler(): "
1507 1506 "IBT_ERROR_ACCESS_VIOLATION_CHAN\n");
1508 1507 break;
1509 1508 case IBT_ERROR_PATH_MIGRATE_REQ:
1510 1509 cmn_err(CE_NOTE, "rib_async_handler(): "
1511 1510 "IBT_ERROR_PATH_MIGRATE_REQ\n");
1512 1511 break;
1513 1512 case IBT_ERROR_CQ:
1514 1513 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n");
1515 1514 break;
1516 1515 case IBT_ERROR_PORT_DOWN:
1517 1516 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n");
1518 1517 break;
1519 1518 case IBT_ASYNC_OPAQUE1:
1520 1519 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n");
1521 1520 break;
1522 1521 case IBT_ASYNC_OPAQUE2:
1523 1522 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n");
1524 1523 break;
1525 1524 case IBT_ASYNC_OPAQUE3:
1526 1525 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n");
1527 1526 break;
1528 1527 case IBT_ASYNC_OPAQUE4:
1529 1528 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n");
1530 1529 break;
1531 1530 #endif
1532 1531 default:
1533 1532 break;
1534 1533 }
1535 1534 }
1536 1535
1537 1536 /*
1538 1537 * Client's reachable function.
1539 1538 */
1540 1539 static rdma_stat
1541 1540 rib_reachable(int addr_type, struct netbuf *raddr, void **handle)
1542 1541 {
1543 1542 rdma_stat status;
1544 1543 rpcib_ping_t rpt;
1545 1544 struct netbuf saddr;
1546 1545 CONN *conn;
1547 1546
1548 1547 bzero(&saddr, sizeof (struct netbuf));
1549 1548 status = rib_connect(&saddr, raddr, addr_type, &rpt, &conn);
1550 1549
1551 1550 if (status == RDMA_SUCCESS) {
1552 1551 *handle = (void *)rpt.hca;
1553 1552 /* release the reference */
1554 1553 (void) rib_conn_release(conn);
1555 1554 return (RDMA_SUCCESS);
1556 1555 } else {
1557 1556 *handle = NULL;
1558 1557 DTRACE_PROBE(rpcib__i__pingfailed);
1559 1558 return (RDMA_FAILED);
1560 1559 }
1561 1560 }
1562 1561
1563 1562 /* Client side qp creation */
1564 1563 static rdma_stat
1565 1564 rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp)
1566 1565 {
1567 1566 rib_qp_t *kqp = NULL;
1568 1567 CONN *conn;
1569 1568 rdma_clnt_cred_ctrl_t *cc_info;
1570 1569
1571 1570 ASSERT(qp != NULL);
1572 1571 *qp = NULL;
1573 1572
1574 1573 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1575 1574 conn = qptoc(kqp);
1576 1575 kqp->hca = hca;
1577 1576 kqp->rdmaconn.c_rdmamod = &rib_mod;
1578 1577 kqp->rdmaconn.c_private = (caddr_t)kqp;
1579 1578
1580 1579 kqp->mode = RIB_CLIENT;
1581 1580 kqp->chan_flags = IBT_BLOCKING;
1582 1581 conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP);
1583 1582 bcopy(raddr->buf, conn->c_raddr.buf, raddr->len);
1584 1583 conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len;
1585 1584 /*
1586 1585 * Initialize
1587 1586 */
1588 1587 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1589 1588 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1590 1589 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1591 1590 cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL);
1592 1591 mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1593 1592 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock);
1594 1593 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1595 1594 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1596 1595 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1597 1596 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1598 1597 /*
1599 1598 * Initialize the client credit control
1600 1599 * portion of the rdmaconn struct.
1601 1600 */
1602 1601 kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT;
1603 1602 cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
1604 1603 cc_info->clnt_cc_granted_ops = 0;
1605 1604 cc_info->clnt_cc_in_flight_ops = 0;
1606 1605 cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL);
1607 1606
1608 1607 *qp = kqp;
1609 1608 return (RDMA_SUCCESS);
1610 1609 }
1611 1610
1612 1611 /* Server side qp creation */
1613 1612 static rdma_stat
1614 1613 rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp)
1615 1614 {
1616 1615 rib_qp_t *kqp = NULL;
1617 1616 ibt_chan_sizes_t chan_sizes;
1618 1617 ibt_rc_chan_alloc_args_t qp_attr;
1619 1618 ibt_status_t ibt_status;
1620 1619 rdma_srv_cred_ctrl_t *cc_info;
1621 1620
1622 1621 *qp = NULL;
1623 1622
1624 1623 kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1625 1624 kqp->hca = hca;
1626 1625 kqp->port_num = port;
1627 1626 kqp->rdmaconn.c_rdmamod = &rib_mod;
1628 1627 kqp->rdmaconn.c_private = (caddr_t)kqp;
1629 1628
1630 1629 /*
1631 1630 * Create the qp handle
1632 1631 */
1633 1632 bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1634 1633 qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl;
1635 1634 qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl;
1636 1635 qp_attr.rc_pd = hca->pd_hdl;
1637 1636 qp_attr.rc_hca_port_num = port;
1638 1637 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1639 1638 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1640 1639 qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1641 1640 qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1642 1641 qp_attr.rc_clone_chan = NULL;
1643 1642 qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1644 1643 qp_attr.rc_flags = IBT_WR_SIGNALED;
1645 1644
1646 1645 rw_enter(&hca->state_lock, RW_READER);
1647 1646 if (hca->state != HCA_DETACHED) {
1648 1647 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1649 1648 IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl,
1650 1649 &chan_sizes);
1651 1650 } else {
1652 1651 rw_exit(&hca->state_lock);
1653 1652 goto fail;
1654 1653 }
1655 1654 rw_exit(&hca->state_lock);
1656 1655
1657 1656 if (ibt_status != IBT_SUCCESS) {
1658 1657 DTRACE_PROBE1(rpcib__i_svccreatechanfail,
1659 1658 int, ibt_status);
1660 1659 goto fail;
1661 1660 }
1662 1661
1663 1662 kqp->mode = RIB_SERVER;
1664 1663 kqp->chan_flags = IBT_BLOCKING;
1665 1664 kqp->q = q; /* server ONLY */
1666 1665
1667 1666 cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1668 1667 cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1669 1668 mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1670 1669 mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1671 1670 cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL);
1672 1671 mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1673 1672 mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1674 1673 mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1675 1674 cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1676 1675 mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1677 1676 /*
1678 1677 * Set the private data area to qp to be used in callbacks
1679 1678 */
1680 1679 ibt_set_chan_private(kqp->qp_hdl, (void *)kqp);
1681 1680 kqp->rdmaconn.c_state = C_CONNECTED;
1682 1681
1683 1682 /*
1684 1683 * Initialize the server credit control
1685 1684 * portion of the rdmaconn struct.
1686 1685 */
1687 1686 kqp->rdmaconn.c_cc_type = RDMA_CC_SRV;
1688 1687 cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc;
1689 1688 cc_info->srv_cc_buffers_granted = preposted_rbufs;
1690 1689 cc_info->srv_cc_cur_buffers_used = 0;
1691 1690 cc_info->srv_cc_posted = preposted_rbufs;
1692 1691
1693 1692 *qp = kqp;
1694 1693
1695 1694 return (RDMA_SUCCESS);
1696 1695 fail:
1697 1696 if (kqp)
1698 1697 kmem_free(kqp, sizeof (rib_qp_t));
1699 1698
1700 1699 return (RDMA_FAILED);
1701 1700 }
1702 1701
1703 1702 /* ARGSUSED */
1704 1703 ibt_cm_status_t
1705 1704 rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event,
1706 1705 ibt_cm_return_args_t *ret_args, void *priv_data,
1707 1706 ibt_priv_data_len_t len)
1708 1707 {
1709 1708 rib_hca_t *hca;
1710 1709
1711 1710 hca = (rib_hca_t *)clnt_hdl;
1712 1711
1713 1712 switch (event->cm_type) {
1714 1713
1715 1714 /* got a connection close event */
1716 1715 case IBT_CM_EVENT_CONN_CLOSED:
1717 1716 {
1718 1717 CONN *conn;
1719 1718 rib_qp_t *qp;
1720 1719
1721 1720 /* check reason why connection was closed */
1722 1721 switch (event->cm_event.closed) {
1723 1722 case IBT_CM_CLOSED_DREP_RCVD:
1724 1723 case IBT_CM_CLOSED_DREQ_TIMEOUT:
1725 1724 case IBT_CM_CLOSED_DUP:
1726 1725 case IBT_CM_CLOSED_ABORT:
1727 1726 case IBT_CM_CLOSED_ALREADY:
1728 1727 /*
1729 1728 * These cases indicate the local end initiated
1730 1729 * the closing of the channel. Nothing to do here.
1731 1730 */
1732 1731 break;
1733 1732 default:
1734 1733 /*
1735 1734 * Reason for CONN_CLOSED event must be one of
1736 1735 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
1737 1736 	 * or IBT_CM_CLOSED_STALE. These indicate cases where
1738 1737 * the remote end is closing the channel. In these
1739 1738 * cases free the channel and transition to error
1740 1739 * state
1741 1740 */
1742 1741 qp = ibt_get_chan_private(event->cm_channel);
1743 1742 conn = qptoc(qp);
1744 1743 mutex_enter(&conn->c_lock);
1745 1744 if (conn->c_state == C_DISCONN_PEND) {
1746 1745 mutex_exit(&conn->c_lock);
1747 1746 break;
1748 1747 }
1749 1748
1750 1749 conn->c_state = C_ERROR_CONN;
1751 1750
1752 1751 /*
1753 1752 * Free the conn if c_ref is down to 0 already
1754 1753 */
1755 1754 if (conn->c_ref == 0) {
1756 1755 /*
1757 1756 * Remove from list and free conn
1758 1757 */
1759 1758 conn->c_state = C_DISCONN_PEND;
1760 1759 mutex_exit(&conn->c_lock);
1761 1760 rw_enter(&hca->state_lock, RW_READER);
1762 1761 if (hca->state != HCA_DETACHED)
1763 1762 (void) rib_disconnect_channel(conn,
1764 1763 &hca->cl_conn_list);
1765 1764 rw_exit(&hca->state_lock);
1766 1765 } else {
1767 1766 /*
1768 1767 * conn will be freed when c_ref goes to 0.
1769 1768 * Indicate to cleaning thread not to close
1770 1769 * the connection, but just free the channel.
1771 1770 */
1772 1771 conn->c_flags |= C_CLOSE_NOTNEEDED;
1773 1772 mutex_exit(&conn->c_lock);
1774 1773 }
1775 1774 #ifdef DEBUG
1776 1775 if (rib_debug)
1777 1776 cmn_err(CE_NOTE, "rib_clnt_cm_handler: "
1778 1777 "(CONN_CLOSED) channel disconnected");
1779 1778 #endif
1780 1779 break;
1781 1780 }
1782 1781 break;
1783 1782 }
1784 1783 default:
1785 1784 break;
1786 1785 }
1787 1786 return (IBT_CM_ACCEPT);
1788 1787 }
1789 1788
1790 1789 /*
1791 1790 * Connect to the server.
1792 1791 */
1793 1792 rdma_stat
1794 1793 rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, rpcib_ping_t *rptp)
1795 1794 {
1796 1795 ibt_chan_open_args_t chan_args; /* channel args */
1797 1796 ibt_chan_sizes_t chan_sizes;
1798 1797 ibt_rc_chan_alloc_args_t qp_attr;
1799 1798 ibt_status_t ibt_status;
1800 1799 ibt_rc_returns_t ret_args; /* conn reject info */
1801 1800 int refresh = REFRESH_ATTEMPTS; /* refresh if IBT_CM_CONN_STALE */
1802 1801 ibt_ip_cm_info_t ipcm_info;
1803 1802 uint8_t cmp_ip_pvt[IBT_IP_HDR_PRIV_DATA_SZ];
1804 1803
1805 1804
1806 1805 (void) bzero(&chan_args, sizeof (chan_args));
1807 1806 (void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1808 1807 (void) bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t));
1809 1808
1810 1809 ipcm_info.src_addr.family = rptp->srcip.family;
1811 1810 switch (ipcm_info.src_addr.family) {
1812 1811 case AF_INET:
1813 1812 ipcm_info.src_addr.un.ip4addr = rptp->srcip.un.ip4addr;
1814 1813 break;
1815 1814 case AF_INET6:
1816 1815 ipcm_info.src_addr.un.ip6addr = rptp->srcip.un.ip6addr;
1817 1816 break;
1818 1817 }
1819 1818
1820 1819 ipcm_info.dst_addr.family = rptp->srcip.family;
1821 1820 switch (ipcm_info.dst_addr.family) {
1822 1821 case AF_INET:
1823 1822 ipcm_info.dst_addr.un.ip4addr = rptp->dstip.un.ip4addr;
1824 1823 break;
1825 1824 case AF_INET6:
1826 1825 ipcm_info.dst_addr.un.ip6addr = rptp->dstip.un.ip6addr;
1827 1826 break;
1828 1827 }
1829 1828
1830 1829 ipcm_info.src_port = (in_port_t)nfs_rdma_port;
1831 1830
1832 1831 ibt_status = ibt_format_ip_private_data(&ipcm_info,
1833 1832 IBT_IP_HDR_PRIV_DATA_SZ, cmp_ip_pvt);
1834 1833
1835 1834 if (ibt_status != IBT_SUCCESS) {
1836 1835 cmn_err(CE_WARN, "ibt_format_ip_private_data failed\n");
1837 1836 return (-1);
1838 1837 }
1839 1838
1840 1839 qp_attr.rc_hca_port_num = rptp->path.pi_prim_cep_path.cep_hca_port_num;
1841 1840 /* Alloc a RC channel */
1842 1841 qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl;
1843 1842 qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl;
1844 1843 qp_attr.rc_pd = hca->pd_hdl;
1845 1844 qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1846 1845 qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1847 1846 qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1848 1847 qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1849 1848 qp_attr.rc_clone_chan = NULL;
1850 1849 qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1851 1850 qp_attr.rc_flags = IBT_WR_SIGNALED;
1852 1851
1853 1852 rptp->path.pi_sid = ibt_get_ip_sid(IPPROTO_TCP, nfs_rdma_port);
1854 1853 chan_args.oc_path = &rptp->path;
1855 1854
1856 1855 chan_args.oc_cm_handler = rib_clnt_cm_handler;
1857 1856 chan_args.oc_cm_clnt_private = (void *)hca;
1858 1857 chan_args.oc_rdma_ra_out = 4;
1859 1858 chan_args.oc_rdma_ra_in = 4;
1860 1859 chan_args.oc_path_retry_cnt = 2;
1861 1860 chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES;
1862 1861 chan_args.oc_priv_data = cmp_ip_pvt;
1863 1862 chan_args.oc_priv_data_len = IBT_IP_HDR_PRIV_DATA_SZ;
1864 1863
1865 1864 refresh:
1866 1865 rw_enter(&hca->state_lock, RW_READER);
1867 1866 if (hca->state != HCA_DETACHED) {
1868 1867 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1869 1868 IBT_ACHAN_NO_FLAGS,
1870 1869 &qp_attr, &qp->qp_hdl,
1871 1870 &chan_sizes);
1872 1871 } else {
1873 1872 rw_exit(&hca->state_lock);
1874 1873 return (RDMA_FAILED);
1875 1874 }
1876 1875 rw_exit(&hca->state_lock);
1877 1876
1878 1877 if (ibt_status != IBT_SUCCESS) {
1879 1878 DTRACE_PROBE1(rpcib__i_conntosrv,
1880 1879 int, ibt_status);
1881 1880 return (RDMA_FAILED);
1882 1881 }
1883 1882
1884 1883 /* Connect to the Server */
1885 1884 (void) bzero(&ret_args, sizeof (ret_args));
1886 1885 mutex_enter(&qp->cb_lock);
1887 1886 ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS,
1888 1887 IBT_BLOCKING, &chan_args, &ret_args);
1889 1888 if (ibt_status != IBT_SUCCESS) {
1890 1889 DTRACE_PROBE2(rpcib__i_openrctosrv,
1891 1890 int, ibt_status, int, ret_args.rc_status);
1892 1891
1893 1892 (void) ibt_free_channel(qp->qp_hdl);
1894 1893 qp->qp_hdl = NULL;
1895 1894 mutex_exit(&qp->cb_lock);
1896 1895 if (refresh-- && ibt_status == IBT_CM_FAILURE &&
1897 1896 ret_args.rc_status == IBT_CM_CONN_STALE) {
1898 1897 /*
1899 1898 * Got IBT_CM_CONN_STALE probably because of stale
1900 1899 * data on the passive end of a channel that existed
1901 1900 * prior to reboot. Retry establishing a channel
1902 1901 * REFRESH_ATTEMPTS times, during which time the
1903 1902 * stale conditions on the server might clear up.
1904 1903 */
1905 1904 goto refresh;
1906 1905 }
1907 1906 return (RDMA_FAILED);
1908 1907 }
1909 1908 mutex_exit(&qp->cb_lock);
1910 1909 /*
1911 1910 * Set the private data area to qp to be used in callbacks
1912 1911 */
1913 1912 ibt_set_chan_private(qp->qp_hdl, (void *)qp);
1914 1913 return (RDMA_SUCCESS);
1915 1914 }
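/*
 * The REQ private data formatted above with ibt_format_ip_private_data()
 * carries the source/destination addresses and port to the passive side.
 * A minimal sketch of the matching decode, as done by rib_srv_cm_handler()
 * further below (error handling elided):
 *
 *	ibt_ip_cm_info_t ipinfo;
 *
 *	bzero(&ipinfo, sizeof (ibt_ip_cm_info_t));
 *	if (ibt_get_ip_data(event->cm_priv_data_len, event->cm_priv_data,
 *	    &ipinfo) == IBT_SUCCESS) {
 *		... ipinfo.src_addr and ipinfo.dst_addr now hold the
 *		    addresses the active side placed in the REQ ...
 *	}
 */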
1916 1915
1917 1916 rdma_stat
1918 1917 rib_ping_srv(int addr_type, struct netbuf *raddr, rpcib_ping_t *rptp)
1919 1918 {
1920 1919 uint_t i, addr_count;
1921 1920 ibt_status_t ibt_status;
1922 1921 uint8_t num_paths_p;
1923 1922 ibt_ip_path_attr_t ipattr;
1924 1923 ibt_path_ip_src_t srcip;
1925 1924 rpcib_ipaddrs_t addrs4;
1926 1925 rpcib_ipaddrs_t addrs6;
1927 1926 struct sockaddr_in *sinp;
1928 1927 struct sockaddr_in6 *sin6p;
1929 1928 rdma_stat retval = RDMA_FAILED;
1930 1929 rib_hca_t *hca;
1931 1930
1932 1931 if ((addr_type != AF_INET) && (addr_type != AF_INET6))
1933 1932 return (RDMA_INVAL);
1934 1933 ASSERT(raddr->buf != NULL);
1935 1934
1936 1935 bzero(&ipattr, sizeof (ibt_ip_path_attr_t));
1937 1936
1938 1937 if (!rpcib_get_ib_addresses(&addrs4, &addrs6) ||
1939 1938 (addrs4.ri_count == 0 && addrs6.ri_count == 0)) {
1940 1939 retval = RDMA_FAILED;
1941 1940 goto done2;
1942 1941 }
1943 1942
1944 1943 if (addr_type == AF_INET) {
1945 1944 addr_count = addrs4.ri_count;
1946 1945 sinp = (struct sockaddr_in *)raddr->buf;
1947 1946 rptp->dstip.family = AF_INET;
1948 1947 rptp->dstip.un.ip4addr = sinp->sin_addr.s_addr;
1949 1948 sinp = addrs4.ri_list;
1950 1949 } else {
1951 1950 addr_count = addrs6.ri_count;
1952 1951 sin6p = (struct sockaddr_in6 *)raddr->buf;
1953 1952 rptp->dstip.family = AF_INET6;
1954 1953 rptp->dstip.un.ip6addr = sin6p->sin6_addr;
1955 1954 sin6p = addrs6.ri_list;
1956 1955 }
1957 1956
1958 1957 rw_enter(&rib_stat->hcas_list_lock, RW_READER);
1959 1958 for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
1960 1959 rw_enter(&hca->state_lock, RW_READER);
1961 1960 if (hca->state == HCA_DETACHED) {
1962 1961 rw_exit(&hca->state_lock);
1963 1962 continue;
1964 1963 }
1965 1964
1966 1965 ipattr.ipa_dst_ip = &rptp->dstip;
1967 1966 ipattr.ipa_hca_guid = hca->hca_guid;
1968 1967 ipattr.ipa_ndst = 1;
1969 1968 ipattr.ipa_max_paths = 1;
1970 1969 ipattr.ipa_src_ip.family = rptp->dstip.family;
1971 1970 for (i = 0; i < addr_count; i++) {
1972 1971 num_paths_p = 0;
1973 1972 if (addr_type == AF_INET) {
1974 1973 ipattr.ipa_src_ip.un.ip4addr =
1975 1974 sinp[i].sin_addr.s_addr;
1976 1975 } else {
1977 1976 ipattr.ipa_src_ip.un.ip6addr =
1978 1977 sin6p[i].sin6_addr;
1979 1978 }
1980 1979 bzero(&srcip, sizeof (ibt_path_ip_src_t));
1981 1980
1982 1981 ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl,
1983 1982 IBT_PATH_NO_FLAGS, &ipattr, &rptp->path,
1984 1983 &num_paths_p, &srcip);
1985 1984 if (ibt_status == IBT_SUCCESS &&
1986 1985 num_paths_p != 0 &&
1987 1986 rptp->path.pi_hca_guid == hca->hca_guid) {
1988 1987 rptp->hca = hca;
1989 1988 rw_exit(&hca->state_lock);
1990 1989 if (addr_type == AF_INET) {
1991 1990 rptp->srcip.family = AF_INET;
1992 1991 rptp->srcip.un.ip4addr =
1993 1992 srcip.ip_primary.un.ip4addr;
1994 1993 } else {
1995 1994 rptp->srcip.family = AF_INET6;
1996 1995 rptp->srcip.un.ip6addr =
1997 1996 srcip.ip_primary.un.ip6addr;
1998 1997
1999 1998 }
2000 1999 retval = RDMA_SUCCESS;
2001 2000 goto done1;
2002 2001 }
2003 2002 }
2004 2003 rw_exit(&hca->state_lock);
2005 2004 }
2006 2005 done1:
2007 2006 rw_exit(&rib_stat->hcas_list_lock);
2008 2007 done2:
2009 2008 if (addrs4.ri_size > 0)
2010 2009 kmem_free(addrs4.ri_list, addrs4.ri_size);
2011 2010 if (addrs6.ri_size > 0)
2012 2011 kmem_free(addrs6.ri_list, addrs6.ri_size);
2013 2012 return (retval);
2014 2013 }
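/*
 * A rough sketch of how the results of rib_ping_srv() feed the channel
 * setup in rib_conn_to_srv() above; the qp is assumed to come from the
 * client channel creation path and most error handling is elided:
 *
 *	rpcib_ping_t rpt;
 *
 *	bzero(&rpt, sizeof (rpcib_ping_t));
 *	if (rib_ping_srv(AF_INET, raddr, &rpt) == RDMA_SUCCESS)
 *		(void) rib_conn_to_srv(rpt.hca, qp, &rpt);
 */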
2015 2014
2016 2015 /*
2017 2016 * Close channel, remove from connection list and
2018 2017 * free up resources allocated for that channel.
2019 2018 */
2020 2019 rdma_stat
2021 2020 rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list)
2022 2021 {
2023 2022 rib_qp_t *qp = ctoqp(conn);
2024 2023 rib_hca_t *hca;
2025 2024
2026 2025 mutex_enter(&conn->c_lock);
2027 2026 if (conn->c_timeout != NULL) {
2028 2027 mutex_exit(&conn->c_lock);
2029 2028 (void) untimeout(conn->c_timeout);
2030 2029 mutex_enter(&conn->c_lock);
2031 2030 }
2032 2031
2033 2032 while (conn->c_flags & C_CLOSE_PENDING) {
2034 2033 cv_wait(&conn->c_cv, &conn->c_lock);
2035 2034 }
2036 2035 mutex_exit(&conn->c_lock);
2037 2036
2038 2037 /*
2039 2038 * c_ref == 0 and connection is in C_DISCONN_PEND
2040 2039 */
2041 2040 hca = qp->hca;
2042 2041 if (conn_list != NULL)
2043 2042 (void) rib_rm_conn(conn, conn_list);
2044 2043
2045 2044 /*
2046 2045 * There is only one case where we get here with
2047 2046 * qp_hdl = NULL, which is during connection setup on
2048 2047 * the client. In such a case there are no posted
2049 2048 * send/recv buffers.
2050 2049 */
2051 2050 if (qp->qp_hdl != NULL) {
2052 2051 mutex_enter(&qp->posted_rbufs_lock);
2053 2052 while (qp->n_posted_rbufs)
2054 2053 cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock);
2055 2054 mutex_exit(&qp->posted_rbufs_lock);
2056 2055
2057 2056 mutex_enter(&qp->send_rbufs_lock);
2058 2057 while (qp->n_send_rbufs)
2059 2058 cv_wait(&qp->send_rbufs_cv, &qp->send_rbufs_lock);
2060 2059 mutex_exit(&qp->send_rbufs_lock);
2061 2060
2062 2061 (void) ibt_free_channel(qp->qp_hdl);
2063 2062 qp->qp_hdl = NULL;
2064 2063 }
2065 2064
2066 2065 ASSERT(qp->rdlist == NULL);
2067 2066
2068 2067 if (qp->replylist != NULL) {
2069 2068 (void) rib_rem_replylist(qp);
2070 2069 }
2071 2070
2072 2071 cv_destroy(&qp->cb_conn_cv);
2073 2072 cv_destroy(&qp->posted_rbufs_cv);
2074 2073 cv_destroy(&qp->send_rbufs_cv);
2075 2074 mutex_destroy(&qp->cb_lock);
2076 2075 mutex_destroy(&qp->replylist_lock);
2077 2076 mutex_destroy(&qp->posted_rbufs_lock);
2078 2077 mutex_destroy(&qp->send_rbufs_lock);
2079 2078 mutex_destroy(&qp->rdlist_lock);
2080 2079
2081 2080 cv_destroy(&conn->c_cv);
2082 2081 mutex_destroy(&conn->c_lock);
2083 2082
2084 2083 if (conn->c_raddr.buf != NULL) {
2085 2084 kmem_free(conn->c_raddr.buf, conn->c_raddr.len);
2086 2085 }
2087 2086 if (conn->c_laddr.buf != NULL) {
2088 2087 kmem_free(conn->c_laddr.buf, conn->c_laddr.len);
2089 2088 }
2090 2089 if (conn->c_netid != NULL) {
2091 2090 kmem_free(conn->c_netid, (strlen(conn->c_netid) + 1));
2092 2091 }
2093 2092 if (conn->c_addrmask.buf != NULL) {
2094 2093 kmem_free(conn->c_addrmask.buf, conn->c_addrmask.len);
2095 2094 }
2096 2095
2097 2096 /*
2098 2097 * Credit control cleanup.
2099 2098 */
2100 2099 if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) {
2101 2100 rdma_clnt_cred_ctrl_t *cc_info;
2102 2101 cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
2103 2102 cv_destroy(&cc_info->clnt_cc_cv);
2104 2103 }
2105 2104
2106 2105 kmem_free(qp, sizeof (rib_qp_t));
2107 2106
2108 2107 /*
2109 2108 * If HCA has been DETACHED and the srv/clnt_conn_list is NULL,
2110 2109 * then the hca is no longer being used.
2111 2110 */
2112 2111 if (conn_list != NULL) {
2113 2112 rw_enter(&hca->state_lock, RW_READER);
2114 2113 if (hca->state == HCA_DETACHED) {
2115 2114 rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
2116 2115 if (hca->srv_conn_list.conn_hd == NULL) {
2117 2116 rw_enter(&hca->cl_conn_list.conn_lock,
2118 2117 RW_READER);
2119 2118
2120 2119 if (hca->cl_conn_list.conn_hd == NULL) {
2121 2120 mutex_enter(&hca->inuse_lock);
2122 2121 hca->inuse = FALSE;
2123 2122 cv_signal(&hca->cb_cv);
2124 2123 mutex_exit(&hca->inuse_lock);
2125 2124 }
2126 2125 rw_exit(&hca->cl_conn_list.conn_lock);
2127 2126 }
2128 2127 rw_exit(&hca->srv_conn_list.conn_lock);
2129 2128 }
2130 2129 rw_exit(&hca->state_lock);
2131 2130 }
2132 2131
2133 2132 return (RDMA_SUCCESS);
2134 2133 }
2135 2134
2136 2135 /*
2137 2136 * All sends are done under the protection of
2138 2137 * the wdesc->sendwait_lock. n_send_rbufs count
2139 2138 * is protected using the send_rbufs_lock.
2140 2139 * lock ordering is:
2141 2140 * sendwait_lock -> send_rbufs_lock
2142 2141 */
2143 2142
2144 2143 void
2145 2144 rib_send_hold(rib_qp_t *qp)
2146 2145 {
2147 2146 mutex_enter(&qp->send_rbufs_lock);
2148 2147 qp->n_send_rbufs++;
2149 2148 mutex_exit(&qp->send_rbufs_lock);
2150 2149 }
2151 2150
2152 2151 void
2153 2152 rib_send_rele(rib_qp_t *qp)
2154 2153 {
2155 2154 mutex_enter(&qp->send_rbufs_lock);
2156 2155 qp->n_send_rbufs--;
2157 2156 if (qp->n_send_rbufs == 0)
2158 2157 cv_signal(&qp->send_rbufs_cv);
2159 2158 mutex_exit(&qp->send_rbufs_lock);
2160 2159 }
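/*
 * A minimal sketch of how a signaled sender follows this ordering,
 * condensed from rib_send_and_wait() and rib_sendwait() below (error
 * handling elided):
 *
 *	mutex_enter(&wdesc->sendwait_lock);
 *	... post the work request ...
 *	rib_send_hold(qp);			takes send_rbufs_lock inside
 *	mutex_exit(&wdesc->sendwait_lock);
 *	...
 *	mutex_enter(&wdesc->sendwait_lock);	on completion
 *	rib_send_rele(qp);			again sendwait -> send_rbufs
 *	mutex_exit(&wdesc->sendwait_lock);
 */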
2161 2160
2162 2161 void
2163 2162 rib_recv_rele(rib_qp_t *qp)
2164 2163 {
2165 2164 mutex_enter(&qp->posted_rbufs_lock);
2166 2165 qp->n_posted_rbufs--;
2167 2166 if (qp->n_posted_rbufs == 0)
2168 2167 cv_signal(&qp->posted_rbufs_cv);
2169 2168 mutex_exit(&qp->posted_rbufs_lock);
2170 2169 }
2171 2170
2172 2171 /*
2173 2172 	 * Wait for send completion notification. Only on receiving a
2174 2173 	 * notification, be it a successful or an error completion, is the
2175 2174 	 * send_wid freed.
2176 2175 */
2177 2176 static rdma_stat
2178 2177 rib_sendwait(rib_qp_t *qp, struct send_wid *wd)
2179 2178 {
2180 2179 clock_t timout, cv_wait_ret;
2181 2180 rdma_stat error = RDMA_SUCCESS;
2182 2181 int i;
2183 2182
2184 2183 /*
2185 2184 * Wait for send to complete
2186 2185 */
2187 2186 ASSERT(wd != NULL);
2188 2187 mutex_enter(&wd->sendwait_lock);
2189 2188 if (wd->status == (uint_t)SEND_WAIT) {
2190 2189 timout = drv_usectohz(SEND_WAIT_TIME * 1000000) +
2191 2190 ddi_get_lbolt();
2192 2191
2193 2192 if (qp->mode == RIB_SERVER) {
2194 2193 while ((cv_wait_ret = cv_timedwait(&wd->wait_cv,
2195 2194 &wd->sendwait_lock, timout)) > 0 &&
2196 2195 wd->status == (uint_t)SEND_WAIT)
2197 2196 ;
2198 2197 switch (cv_wait_ret) {
2199 2198 case -1: /* timeout */
2200 2199 DTRACE_PROBE(rpcib__i__srvsendwait__timeout);
2201 2200
2202 2201 wd->cv_sig = 0; /* no signal needed */
2203 2202 error = RDMA_TIMEDOUT;
2204 2203 break;
2205 2204 default: /* got send completion */
2206 2205 break;
2207 2206 }
2208 2207 } else {
2209 2208 while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv,
2210 2209 &wd->sendwait_lock, timout)) > 0 &&
2211 2210 wd->status == (uint_t)SEND_WAIT)
2212 2211 ;
2213 2212 switch (cv_wait_ret) {
2214 2213 case -1: /* timeout */
2215 2214 DTRACE_PROBE(rpcib__i__clntsendwait__timeout);
2216 2215
2217 2216 wd->cv_sig = 0; /* no signal needed */
2218 2217 error = RDMA_TIMEDOUT;
2219 2218 break;
2220 2219 case 0: /* interrupted */
2221 2220 DTRACE_PROBE(rpcib__i__clntsendwait__intr);
2222 2221
2223 2222 wd->cv_sig = 0; /* no signal needed */
2224 2223 error = RDMA_INTR;
2225 2224 break;
2226 2225 default: /* got send completion */
2227 2226 break;
2228 2227 }
2229 2228 }
2230 2229 }
2231 2230
2232 2231 if (wd->status != (uint_t)SEND_WAIT) {
2233 2232 /* got send completion */
2234 2233 if (wd->status != RDMA_SUCCESS) {
2235 2234 switch (wd->status) {
2236 2235 case RDMA_CONNLOST:
2237 2236 error = RDMA_CONNLOST;
2238 2237 break;
2239 2238 default:
2240 2239 error = RDMA_FAILED;
2241 2240 break;
2242 2241 }
2243 2242 }
2244 2243 for (i = 0; i < wd->nsbufs; i++) {
2245 2244 rib_rbuf_free(qptoc(qp), SEND_BUFFER,
2246 2245 (void *)(uintptr_t)wd->sbufaddr[i]);
2247 2246 }
2248 2247
2249 2248 rib_send_rele(qp);
2250 2249
2251 2250 mutex_exit(&wd->sendwait_lock);
2252 2251 (void) rib_free_sendwait(wd);
2253 2252
2254 2253 } else {
2255 2254 mutex_exit(&wd->sendwait_lock);
2256 2255 }
2257 2256 return (error);
2258 2257 }
2259 2258
2260 2259 static struct send_wid *
2261 2260 rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp)
2262 2261 {
2263 2262 struct send_wid *wd;
2264 2263
2265 2264 wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP);
2266 2265 wd->xid = xid;
2267 2266 wd->cv_sig = cv_sig;
2268 2267 wd->qp = qp;
2269 2268 cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL);
2270 2269 mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL);
2271 2270 wd->status = (uint_t)SEND_WAIT;
2272 2271
2273 2272 return (wd);
2274 2273 }
2275 2274
2276 2275 static int
2277 2276 rib_free_sendwait(struct send_wid *wdesc)
2278 2277 {
2279 2278 cv_destroy(&wdesc->wait_cv);
2280 2279 mutex_destroy(&wdesc->sendwait_lock);
2281 2280 kmem_free(wdesc, sizeof (*wdesc));
2282 2281
2283 2282 return (0);
2284 2283 }
2285 2284
2286 2285 static rdma_stat
2287 2286 rib_rem_rep(rib_qp_t *qp, struct reply *rep)
2288 2287 {
2289 2288 mutex_enter(&qp->replylist_lock);
2290 2289 if (rep != NULL) {
2291 2290 (void) rib_remreply(qp, rep);
2292 2291 mutex_exit(&qp->replylist_lock);
2293 2292 return (RDMA_SUCCESS);
2294 2293 }
2295 2294 mutex_exit(&qp->replylist_lock);
2296 2295 return (RDMA_FAILED);
2297 2296 }
2298 2297
2299 2298 /*
2300 2299 * Send buffers are freed here only in case of error in posting
2301 2300 * on QP. If the post succeeded, the send buffers are freed upon
2302 2301 * send completion in rib_sendwait() or in the scq_handler.
2303 2302 */
2304 2303 rdma_stat
2305 2304 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid,
2306 2305 int send_sig, int cv_sig, caddr_t *swid)
2307 2306 {
2308 2307 struct send_wid *wdesc;
2309 2308 struct clist *clp;
2310 2309 ibt_status_t ibt_status = IBT_SUCCESS;
2311 2310 rdma_stat ret = RDMA_SUCCESS;
2312 2311 ibt_send_wr_t tx_wr;
2313 2312 int i, nds;
2314 2313 ibt_wr_ds_t sgl[DSEG_MAX];
2315 2314 uint_t total_msg_size;
2316 2315 rib_qp_t *qp;
2317 2316
2318 2317 qp = ctoqp(conn);
2319 2318
2320 2319 ASSERT(cl != NULL);
2321 2320
2322 2321 bzero(&tx_wr, sizeof (ibt_send_wr_t));
2323 2322
2324 2323 nds = 0;
2325 2324 total_msg_size = 0;
2326 2325 clp = cl;
2327 2326 while (clp != NULL) {
2328 2327 if (nds >= DSEG_MAX) {
2329 2328 DTRACE_PROBE(rpcib__i__sendandwait_dsegmax_exceeded);
2330 2329 return (RDMA_FAILED);
2331 2330 }
2332 2331 sgl[nds].ds_va = clp->w.c_saddr;
2333 2332 sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */
2334 2333 sgl[nds].ds_len = clp->c_len;
2335 2334 total_msg_size += clp->c_len;
2336 2335 clp = clp->c_next;
2337 2336 nds++;
2338 2337 }
2339 2338
2340 2339 if (send_sig) {
2341 2340 /* Set SEND_SIGNAL flag. */
2342 2341 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2343 2342 wdesc = rib_init_sendwait(msgid, cv_sig, qp);
2344 2343 *swid = (caddr_t)wdesc;
2345 2344 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2346 2345 mutex_enter(&wdesc->sendwait_lock);
2347 2346 wdesc->nsbufs = nds;
2348 2347 for (i = 0; i < nds; i++) {
2349 2348 wdesc->sbufaddr[i] = sgl[i].ds_va;
2350 2349 }
2351 2350 } else {
2352 2351 tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2353 2352 *swid = NULL;
2354 2353 tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
2355 2354 }
2356 2355
2357 2356 tx_wr.wr_opcode = IBT_WRC_SEND;
2358 2357 tx_wr.wr_trans = IBT_RC_SRV;
2359 2358 tx_wr.wr_nds = nds;
2360 2359 tx_wr.wr_sgl = sgl;
2361 2360
2362 2361 mutex_enter(&conn->c_lock);
2363 2362 if (conn->c_state == C_CONNECTED) {
2364 2363 ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2365 2364 }
2366 2365 if (conn->c_state != C_CONNECTED ||
2367 2366 ibt_status != IBT_SUCCESS) {
2368 2367 if (conn->c_state != C_DISCONN_PEND)
2369 2368 conn->c_state = C_ERROR_CONN;
2370 2369 mutex_exit(&conn->c_lock);
2371 2370 if (send_sig) {
2372 2371 for (i = 0; i < nds; i++) {
2373 2372 rib_rbuf_free(conn, SEND_BUFFER,
2374 2373 (void *)(uintptr_t)wdesc->sbufaddr[i]);
2375 2374 }
2376 2375 mutex_exit(&wdesc->sendwait_lock);
2377 2376 (void) rib_free_sendwait(wdesc);
2378 2377 }
2379 2378 return (RDMA_CONNLOST);
2380 2379 }
2381 2380
2382 2381 mutex_exit(&conn->c_lock);
2383 2382
2384 2383 if (send_sig) {
2385 2384 rib_send_hold(qp);
2386 2385 mutex_exit(&wdesc->sendwait_lock);
2387 2386 if (cv_sig) {
2388 2387 /*
2389 2388 * cv_wait for send to complete.
2390 2389 * We can fail due to a timeout or signal or
2391 2390 * unsuccessful send.
2392 2391 */
2393 2392 ret = rib_sendwait(qp, wdesc);
2394 2393
2395 2394 return (ret);
2396 2395 }
2397 2396 }
2398 2397
2399 2398 return (RDMA_SUCCESS);
2400 2399 }
2401 2400
2402 2401
2403 2402 rdma_stat
2404 2403 rib_send(CONN *conn, struct clist *cl, uint32_t msgid)
2405 2404 {
2406 2405 rdma_stat ret;
2407 2406 caddr_t wd;
2408 2407
2409 2408 /* send-wait & cv_signal */
2410 2409 ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd);
2411 2410 return (ret);
2412 2411 }
2413 2412
2414 2413 /*
2415 2414 	 * Deprecated/obsolete interface, not currently used,
2416 2415 	 * that was earlier used for the READ-READ protocol.
2417 2416 * Send RPC reply and wait for RDMA_DONE.
2418 2417 */
2419 2418 rdma_stat
2420 2419 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid)
2421 2420 {
2422 2421 rdma_stat ret = RDMA_SUCCESS;
2423 2422 struct rdma_done_list *rd;
2424 2423 clock_t cv_wait_ret;
2425 2424 caddr_t *wid = NULL;
2426 2425 rib_qp_t *qp = ctoqp(conn);
2427 2426
2428 2427 mutex_enter(&qp->rdlist_lock);
2429 2428 rd = rdma_done_add(qp, msgid);
2430 2429
2431 2430 /* No cv_signal (whether send-wait or no-send-wait) */
2432 2431 ret = rib_send_and_wait(conn, cl, msgid, 1, 0, wid);
2433 2432
2434 2433 if (ret != RDMA_SUCCESS) {
2435 2434 rdma_done_rm(qp, rd);
2436 2435 } else {
2437 2436 /*
2438 2437 * Wait for RDMA_DONE from remote end
2439 2438 */
2440 2439 cv_wait_ret = cv_reltimedwait(&rd->rdma_done_cv,
2441 2440 &qp->rdlist_lock, drv_usectohz(REPLY_WAIT_TIME * 1000000),
2442 2441 TR_CLOCK_TICK);
2443 2442
2444 2443 rdma_done_rm(qp, rd);
2445 2444
2446 2445 if (cv_wait_ret < 0) {
2447 2446 ret = RDMA_TIMEDOUT;
2448 2447 }
2449 2448 }
2450 2449
2451 2450 mutex_exit(&qp->rdlist_lock);
2452 2451 return (ret);
2453 2452 }
2454 2453
2455 2454 static struct recv_wid *
2456 2455 rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid)
2457 2456 {
2458 2457 struct recv_wid *rwid;
2459 2458
2460 2459 rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP);
2461 2460 rwid->xid = msgid;
2462 2461 rwid->addr = sgl->ds_va;
2463 2462 rwid->qp = qp;
2464 2463
2465 2464 return (rwid);
2466 2465 }
2467 2466
2468 2467 static void
2469 2468 rib_free_wid(struct recv_wid *rwid)
2470 2469 {
2471 2470 kmem_free(rwid, sizeof (struct recv_wid));
2472 2471 }
2473 2472
2474 2473 rdma_stat
2475 2474 rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid)
2476 2475 {
2477 2476 rib_qp_t *qp = ctoqp(conn);
2478 2477 struct clist *clp = cl;
2479 2478 struct reply *rep;
2480 2479 struct recv_wid *rwid;
2481 2480 int nds;
2482 2481 ibt_wr_ds_t sgl[DSEG_MAX];
2483 2482 ibt_recv_wr_t recv_wr;
2484 2483 rdma_stat ret;
2485 2484 ibt_status_t ibt_status;
2486 2485
2487 2486 /*
2488 2487 * rdma_clnt_postrecv uses RECV_BUFFER.
2489 2488 */
2490 2489
2491 2490 nds = 0;
2492 2491 while (cl != NULL) {
2493 2492 if (nds >= DSEG_MAX) {
2494 2493 ret = RDMA_FAILED;
2495 2494 goto done;
2496 2495 }
2497 2496 sgl[nds].ds_va = cl->w.c_saddr;
2498 2497 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2499 2498 sgl[nds].ds_len = cl->c_len;
2500 2499 cl = cl->c_next;
2501 2500 nds++;
2502 2501 }
2503 2502
2504 2503 if (nds != 1) {
2505 2504 ret = RDMA_FAILED;
2506 2505 goto done;
2507 2506 }
2508 2507
2509 2508 bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2510 2509 recv_wr.wr_nds = nds;
2511 2510 recv_wr.wr_sgl = sgl;
2512 2511
2513 2512 rwid = rib_create_wid(qp, &sgl[0], msgid);
2514 2513 if (rwid) {
2515 2514 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid;
2516 2515 } else {
2517 2516 ret = RDMA_NORESOURCE;
2518 2517 goto done;
2519 2518 }
2520 2519 rep = rib_addreplylist(qp, msgid);
2521 2520 if (!rep) {
2522 2521 rib_free_wid(rwid);
2523 2522 ret = RDMA_NORESOURCE;
2524 2523 goto done;
2525 2524 }
2526 2525
2527 2526 mutex_enter(&conn->c_lock);
2528 2527
2529 2528 if (conn->c_state == C_CONNECTED) {
2530 2529 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2531 2530 }
2532 2531
2533 2532 if (conn->c_state != C_CONNECTED ||
2534 2533 ibt_status != IBT_SUCCESS) {
2535 2534 if (conn->c_state != C_DISCONN_PEND)
2536 2535 conn->c_state = C_ERROR_CONN;
2537 2536 mutex_exit(&conn->c_lock);
2538 2537 rib_free_wid(rwid);
2539 2538 (void) rib_rem_rep(qp, rep);
2540 2539 ret = RDMA_CONNLOST;
2541 2540 goto done;
2542 2541 }
2543 2542
2544 2543 mutex_enter(&qp->posted_rbufs_lock);
2545 2544 qp->n_posted_rbufs++;
2546 2545 mutex_exit(&qp->posted_rbufs_lock);
2547 2546
2548 2547 mutex_exit(&conn->c_lock);
2549 2548 return (RDMA_SUCCESS);
2550 2549
2551 2550 done:
2552 2551 while (clp != NULL) {
2553 2552 rib_rbuf_free(conn, RECV_BUFFER,
2554 2553 (void *)(uintptr_t)clp->w.c_saddr3);
2555 2554 clp = clp->c_next;
2556 2555 }
2557 2556 return (ret);
2558 2557 }
2559 2558
2560 2559 rdma_stat
2561 2560 rib_svc_post(CONN* conn, struct clist *cl)
2562 2561 {
2563 2562 rib_qp_t *qp = ctoqp(conn);
2564 2563 struct svc_recv *s_recvp;
2565 2564 int nds;
2566 2565 ibt_wr_ds_t sgl[DSEG_MAX];
2567 2566 ibt_recv_wr_t recv_wr;
2568 2567 ibt_status_t ibt_status;
2569 2568
2570 2569 nds = 0;
2571 2570 while (cl != NULL) {
2572 2571 if (nds >= DSEG_MAX) {
2573 2572 return (RDMA_FAILED);
2574 2573 }
2575 2574 sgl[nds].ds_va = cl->w.c_saddr;
2576 2575 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2577 2576 sgl[nds].ds_len = cl->c_len;
2578 2577 cl = cl->c_next;
2579 2578 nds++;
2580 2579 }
2581 2580
2582 2581 if (nds != 1) {
2583 2582 rib_rbuf_free(conn, RECV_BUFFER,
2584 2583 (caddr_t)(uintptr_t)sgl[0].ds_va);
2585 2584
2586 2585 return (RDMA_FAILED);
2587 2586 }
2588 2587
2589 2588 bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2590 2589 recv_wr.wr_nds = nds;
2591 2590 recv_wr.wr_sgl = sgl;
2592 2591
2593 2592 s_recvp = rib_init_svc_recv(qp, &sgl[0]);
2594 2593 /* Use s_recvp's addr as wr id */
2595 2594 recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp;
2596 2595 mutex_enter(&conn->c_lock);
2597 2596 if (conn->c_state == C_CONNECTED) {
2598 2597 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2599 2598 }
2600 2599 if (conn->c_state != C_CONNECTED ||
2601 2600 ibt_status != IBT_SUCCESS) {
2602 2601 if (conn->c_state != C_DISCONN_PEND)
2603 2602 conn->c_state = C_ERROR_CONN;
2604 2603 mutex_exit(&conn->c_lock);
2605 2604 rib_rbuf_free(conn, RECV_BUFFER,
2606 2605 (caddr_t)(uintptr_t)sgl[0].ds_va);
2607 2606 (void) rib_free_svc_recv(s_recvp);
2608 2607
2609 2608 return (RDMA_CONNLOST);
2610 2609 }
2611 2610 mutex_exit(&conn->c_lock);
2612 2611
2613 2612 return (RDMA_SUCCESS);
2614 2613 }
2615 2614
2616 2615 /* Client */
2617 2616 rdma_stat
2618 2617 rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid)
2619 2618 {
2620 2619 return (rib_clnt_post(conn, cl, msgid));
2621 2620 }
2622 2621
2623 2622 /* Client */
2624 2623 rdma_stat
2625 2624 rib_post_resp_remove(CONN* conn, uint32_t msgid)
2626 2625 {
2627 2626 rib_qp_t *qp = ctoqp(conn);
2628 2627 struct reply *rep;
2629 2628
2630 2629 mutex_enter(&qp->replylist_lock);
2631 2630 for (rep = qp->replylist; rep != NULL; rep = rep->next) {
2632 2631 if (rep->xid == msgid) {
2633 2632 if (rep->vaddr_cq) {
2634 2633 rib_rbuf_free(conn, RECV_BUFFER,
2635 2634 (caddr_t)(uintptr_t)rep->vaddr_cq);
2636 2635 }
2637 2636 (void) rib_remreply(qp, rep);
2638 2637 break;
2639 2638 }
2640 2639 }
2641 2640 mutex_exit(&qp->replylist_lock);
2642 2641
2643 2642 return (RDMA_SUCCESS);
2644 2643 }
2645 2644
2646 2645 /* Server */
2647 2646 rdma_stat
2648 2647 rib_post_recv(CONN *conn, struct clist *cl)
2649 2648 {
2650 2649 rib_qp_t *qp = ctoqp(conn);
2651 2650
2652 2651 if (rib_svc_post(conn, cl) == RDMA_SUCCESS) {
2653 2652 mutex_enter(&qp->posted_rbufs_lock);
2654 2653 qp->n_posted_rbufs++;
2655 2654 mutex_exit(&qp->posted_rbufs_lock);
2656 2655 return (RDMA_SUCCESS);
2657 2656 }
2658 2657 return (RDMA_FAILED);
2659 2658 }
2660 2659
2661 2660 /*
2662 2661 * Client side only interface to "recv" the rpc reply buf
2663 2662 * posted earlier by rib_post_resp(conn, cl, msgid).
2664 2663 */
2665 2664 rdma_stat
2666 2665 rib_recv(CONN *conn, struct clist **clp, uint32_t msgid)
2667 2666 {
2668 2667 struct reply *rep = NULL;
2669 2668 clock_t timout, cv_wait_ret;
2670 2669 rdma_stat ret = RDMA_SUCCESS;
2671 2670 rib_qp_t *qp = ctoqp(conn);
2672 2671
2673 2672 /*
2674 2673 * Find the reply structure for this msgid
2675 2674 */
2676 2675 mutex_enter(&qp->replylist_lock);
2677 2676
2678 2677 for (rep = qp->replylist; rep != NULL; rep = rep->next) {
2679 2678 if (rep->xid == msgid)
2680 2679 break;
2681 2680 }
2682 2681
2683 2682 if (rep != NULL) {
2684 2683 /*
2685 2684 * If message not yet received, wait.
2686 2685 */
2687 2686 if (rep->status == (uint_t)REPLY_WAIT) {
2688 2687 timout = ddi_get_lbolt() +
2689 2688 drv_usectohz(REPLY_WAIT_TIME * 1000000);
2690 2689
2691 2690 while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv,
2692 2691 &qp->replylist_lock, timout)) > 0 &&
2693 2692 rep->status == (uint_t)REPLY_WAIT)
2694 2693 ;
2695 2694
2696 2695 switch (cv_wait_ret) {
2697 2696 case -1: /* timeout */
2698 2697 ret = RDMA_TIMEDOUT;
2699 2698 break;
2700 2699 case 0:
2701 2700 ret = RDMA_INTR;
2702 2701 break;
2703 2702 default:
2704 2703 break;
2705 2704 }
2706 2705 }
2707 2706
2708 2707 if (rep->status == RDMA_SUCCESS) {
2709 2708 struct clist *cl = NULL;
2710 2709
2711 2710 /*
2712 2711 * Got message successfully
2713 2712 */
2714 2713 clist_add(&cl, 0, rep->bytes_xfer, NULL,
2715 2714 (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL);
2716 2715 *clp = cl;
2717 2716 } else {
2718 2717 if (rep->status != (uint_t)REPLY_WAIT) {
2719 2718 /*
2720 2719 * Got error in reply message. Free
2721 2720 * recv buffer here.
2722 2721 */
2723 2722 ret = rep->status;
2724 2723 rib_rbuf_free(conn, RECV_BUFFER,
2725 2724 (caddr_t)(uintptr_t)rep->vaddr_cq);
2726 2725 }
2727 2726 }
2728 2727 (void) rib_remreply(qp, rep);
2729 2728 } else {
2730 2729 /*
2731 2730 * No matching reply structure found for given msgid on the
2732 2731 * reply wait list.
2733 2732 */
2734 2733 ret = RDMA_INVAL;
2735 2734 DTRACE_PROBE(rpcib__i__nomatchxid2);
2736 2735 }
2737 2736
2738 2737 /*
2739 2738 * Done.
2740 2739 */
2741 2740 mutex_exit(&qp->replylist_lock);
2742 2741 return (ret);
2743 2742 }
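/*
 * A rough sketch of the expected client-side pairing of the calls above;
 * clist setup and most error handling are elided, and conn, recv_cl,
 * send_cl and xid are assumed to come from the RPC/RDMA caller:
 *
 *	if (rib_post_resp(conn, recv_cl, xid) != RDMA_SUCCESS)
 *		return;				could not pre-post reply buf
 *	if (rib_send(conn, send_cl, xid) != RDMA_SUCCESS) {
 *		(void) rib_post_resp_remove(conn, xid);	undo the pre-post
 *		return;
 *	}
 *	status = rib_recv(conn, &reply_cl, xid);	wait for the reply
 */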
2744 2743
2745 2744 /*
2746 2745 * RDMA write a buffer to the remote address.
2747 2746 */
2748 2747 rdma_stat
2749 2748 rib_write(CONN *conn, struct clist *cl, int wait)
2750 2749 {
2751 2750 ibt_send_wr_t tx_wr;
2752 2751 int cv_sig;
2753 2752 ibt_wr_ds_t sgl[DSEG_MAX];
2754 2753 struct send_wid *wdesc;
2755 2754 ibt_status_t ibt_status;
2756 2755 rdma_stat ret = RDMA_SUCCESS;
2757 2756 rib_qp_t *qp = ctoqp(conn);
2758 2757 uint64_t n_writes = 0;
2759 2758
2760 2759 if (cl == NULL) {
2761 2760 return (RDMA_FAILED);
2762 2761 }
2763 2762
2764 2763 while ((cl != NULL)) {
2765 2764 if (cl->c_len > 0) {
2766 2765 bzero(&tx_wr, sizeof (ibt_send_wr_t));
2767 2766 tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->u.c_daddr;
2768 2767 tx_wr.wr.rc.rcwr.rdma.rdma_rkey =
2769 2768 cl->c_dmemhandle.mrc_rmr; /* rkey */
2770 2769 sgl[0].ds_va = cl->w.c_saddr;
2771 2770 sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2772 2771 sgl[0].ds_len = cl->c_len;
2773 2772
2774 2773 if (wait) {
2775 2774 cv_sig = 1;
2776 2775 } else {
2777 2776 if (n_writes > max_unsignaled_rws) {
2778 2777 n_writes = 0;
2779 2778 cv_sig = 1;
2780 2779 } else {
2781 2780 cv_sig = 0;
2782 2781 }
2783 2782 }
2784 2783
2785 2784 if (cv_sig) {
2786 2785 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2787 2786 wdesc = rib_init_sendwait(0, cv_sig, qp);
2788 2787 tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2789 2788 mutex_enter(&wdesc->sendwait_lock);
2790 2789 } else {
2791 2790 tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2792 2791 tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
2793 2792 }
2794 2793 tx_wr.wr_opcode = IBT_WRC_RDMAW;
2795 2794 tx_wr.wr_trans = IBT_RC_SRV;
2796 2795 tx_wr.wr_nds = 1;
2797 2796 tx_wr.wr_sgl = sgl;
2798 2797
2799 2798 mutex_enter(&conn->c_lock);
2800 2799 if (conn->c_state == C_CONNECTED) {
2801 2800 ibt_status =
2802 2801 ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2803 2802 }
2804 2803 if (conn->c_state != C_CONNECTED ||
2805 2804 ibt_status != IBT_SUCCESS) {
2806 2805 if (conn->c_state != C_DISCONN_PEND)
2807 2806 conn->c_state = C_ERROR_CONN;
2808 2807 mutex_exit(&conn->c_lock);
2809 2808 if (cv_sig) {
2810 2809 mutex_exit(&wdesc->sendwait_lock);
2811 2810 (void) rib_free_sendwait(wdesc);
2812 2811 }
2813 2812 return (RDMA_CONNLOST);
2814 2813 }
2815 2814
2816 2815 mutex_exit(&conn->c_lock);
2817 2816
2818 2817 /*
2819 2818 * Wait for send to complete
2820 2819 */
2821 2820 if (cv_sig) {
2822 2821
2823 2822 rib_send_hold(qp);
2824 2823 mutex_exit(&wdesc->sendwait_lock);
2825 2824
2826 2825 ret = rib_sendwait(qp, wdesc);
2827 2826 if (ret != 0)
2828 2827 return (ret);
2829 2828 }
2830 2829 n_writes ++;
2831 2830 }
2832 2831 cl = cl->c_next;
2833 2832 }
2834 2833 return (RDMA_SUCCESS);
2835 2834 }
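/*
 * A minimal sketch of a single-chunk RDMA write using the function above;
 * buffer registration is elided and local_va, local_lkey, remote_va,
 * remote_rkey and len are illustrative names assumed to be supplied by
 * the caller:
 *
 *	struct clist wcl;
 *
 *	bzero(&wcl, sizeof (struct clist));
 *	wcl.w.c_saddr = (uint64)(uintptr_t)local_va;	source
 *	wcl.c_smemhandle.mrc_lmr = local_lkey;
 *	wcl.u.c_daddr = remote_va;			destination
 *	wcl.c_dmemhandle.mrc_rmr = remote_rkey;
 *	wcl.c_len = len;
 *	wcl.c_next = NULL;
 *	(void) rib_write(conn, &wcl, 1);		wait == 1
 */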
2836 2835
2837 2836 /*
2838 2837 * RDMA Read a buffer from the remote address.
2839 2838 */
2840 2839 rdma_stat
2841 2840 rib_read(CONN *conn, struct clist *cl, int wait)
2842 2841 {
2843 2842 ibt_send_wr_t rx_wr;
2844 2843 int cv_sig = 0;
2845 2844 ibt_wr_ds_t sgl;
2846 2845 struct send_wid *wdesc;
2847 2846 ibt_status_t ibt_status = IBT_SUCCESS;
2848 2847 rdma_stat ret = RDMA_SUCCESS;
2849 2848 rib_qp_t *qp = ctoqp(conn);
2850 2849
2851 2850 if (cl == NULL) {
2852 2851 return (RDMA_FAILED);
2853 2852 }
2854 2853
2855 2854 while (cl != NULL) {
2856 2855 bzero(&rx_wr, sizeof (ibt_send_wr_t));
2857 2856 /*
2858 2857 * Remote address is at the head chunk item in list.
2859 2858 */
2860 2859 rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->w.c_saddr;
2861 2860 rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr;
2862 2861
2863 2862 sgl.ds_va = cl->u.c_daddr;
2864 2863 sgl.ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */
2865 2864 sgl.ds_len = cl->c_len;
2866 2865
2867 2866 /*
2868 2867 * If there are multiple chunks to be read, and
2869 2868 * wait is set, ask for signal only for the last chunk
2870 2869 * and wait only on the last chunk. The completion of
2871 2870 * RDMA_READ on last chunk ensures that reads on all
2872 2871 * previous chunks are also completed.
2873 2872 */
2874 2873 if (wait && (cl->c_next == NULL)) {
2875 2874 cv_sig = 1;
2876 2875 wdesc = rib_init_sendwait(0, cv_sig, qp);
2877 2876 rx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2878 2877 rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2879 2878 mutex_enter(&wdesc->sendwait_lock);
2880 2879 } else {
2881 2880 rx_wr.wr_flags = IBT_WR_NO_FLAGS;
2882 2881 rx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
2883 2882 }
2884 2883 rx_wr.wr_opcode = IBT_WRC_RDMAR;
2885 2884 rx_wr.wr_trans = IBT_RC_SRV;
2886 2885 rx_wr.wr_nds = 1;
2887 2886 rx_wr.wr_sgl = &sgl;
2888 2887
2889 2888 mutex_enter(&conn->c_lock);
2890 2889 if (conn->c_state == C_CONNECTED) {
2891 2890 ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL);
2892 2891 }
2893 2892 if (conn->c_state != C_CONNECTED ||
2894 2893 ibt_status != IBT_SUCCESS) {
2895 2894 if (conn->c_state != C_DISCONN_PEND)
2896 2895 conn->c_state = C_ERROR_CONN;
2897 2896 mutex_exit(&conn->c_lock);
2898 2897 if (wait && (cl->c_next == NULL)) {
2899 2898 mutex_exit(&wdesc->sendwait_lock);
2900 2899 (void) rib_free_sendwait(wdesc);
2901 2900 }
2902 2901 return (RDMA_CONNLOST);
2903 2902 }
2904 2903
2905 2904 mutex_exit(&conn->c_lock);
2906 2905
2907 2906 /*
2908 2907 * Wait for send to complete if this is the
2909 2908 * last item in the list.
2910 2909 */
2911 2910 if (wait && cl->c_next == NULL) {
2912 2911 rib_send_hold(qp);
2913 2912 mutex_exit(&wdesc->sendwait_lock);
2914 2913
2915 2914 ret = rib_sendwait(qp, wdesc);
2916 2915
2917 2916 if (ret != 0)
2918 2917 return (ret);
2919 2918 }
2920 2919 cl = cl->c_next;
2921 2920 }
2922 2921 return (RDMA_SUCCESS);
2923 2922 }
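/*
 * A short usage sketch for the function above; with wait set, only the
 * completion of the last chunk in the chain is waited for, which also
 * guarantees that the reads of all earlier chunks have completed
 * (cl_head is assumed to be a clist chain prepared by the caller):
 *
 *	if (rib_read(conn, cl_head, 1) != RDMA_SUCCESS) {
 *		... the wait timed out or the connection was marked
 *		    C_ERROR_CONN; the caller fails or retries the op ...
 *	}
 */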
2924 2923
2925 2924 /*
2926 2925 * rib_srv_cm_handler()
2927 2926 * Connection Manager callback to handle RC connection requests.
2928 2927 */
2929 2928 /* ARGSUSED */
2930 2929 static ibt_cm_status_t
2931 2930 rib_srv_cm_handler(void *any, ibt_cm_event_t *event,
2932 2931 ibt_cm_return_args_t *ret_args, void *priv_data,
2933 2932 ibt_priv_data_len_t len)
2934 2933 {
2935 2934 queue_t *q;
2936 2935 rib_qp_t *qp;
2937 2936 rib_hca_t *hca;
2938 2937 rdma_stat status = RDMA_SUCCESS;
2939 2938 int i;
2940 2939 struct clist cl;
2941 2940 rdma_buf_t rdbuf = {0};
2942 2941 void *buf = NULL;
2943 2942 CONN *conn;
2944 2943 ibt_ip_cm_info_t ipinfo;
2945 2944 struct sockaddr_in *s;
2946 2945 struct sockaddr_in6 *s6;
2947 2946 int sin_size = sizeof (struct sockaddr_in);
2948 2947 int in_size = sizeof (struct in_addr);
2949 2948 int sin6_size = sizeof (struct sockaddr_in6);
2950 2949
2951 2950 ASSERT(any != NULL);
2952 2951 ASSERT(event != NULL);
2953 2952
2954 2953 hca = (rib_hca_t *)any;
2955 2954
2956 2955 /* got a connection request */
2957 2956 switch (event->cm_type) {
2958 2957 case IBT_CM_EVENT_REQ_RCV:
2959 2958 /*
2960 2959 * If the plugin is in the NO_ACCEPT state, bail out.
2961 2960 */
2962 2961 mutex_enter(&plugin_state_lock);
2963 2962 if (plugin_state == NO_ACCEPT) {
2964 2963 mutex_exit(&plugin_state_lock);
2965 2964 return (IBT_CM_REJECT);
2966 2965 }
2967 2966 mutex_exit(&plugin_state_lock);
2968 2967
2969 2968 /*
2970 2969 * Need to send a MRA MAD to CM so that it does not
2971 2970 		 * time out on us.
2972 2971 */
2973 2972 (void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id,
2974 2973 event->cm_event.req.req_timeout * 8, NULL, 0);
2975 2974
2976 2975 mutex_enter(&rib_stat->open_hca_lock);
2977 2976 q = rib_stat->q;
2978 2977 mutex_exit(&rib_stat->open_hca_lock);
2979 2978
2980 2979 status = rib_svc_create_chan(hca, (caddr_t)q,
2981 2980 event->cm_event.req.req_prim_hca_port, &qp);
2982 2981
2983 2982 if (status) {
2984 2983 return (IBT_CM_REJECT);
2985 2984 }
2986 2985
2987 2986 ret_args->cm_ret.rep.cm_channel = qp->qp_hdl;
2988 2987 ret_args->cm_ret.rep.cm_rdma_ra_out = 4;
2989 2988 ret_args->cm_ret.rep.cm_rdma_ra_in = 4;
2990 2989 ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES;
2991 2990
2992 2991 /*
2993 2992 * Pre-posts RECV buffers
2994 2993 */
2995 2994 conn = qptoc(qp);
2996 2995 for (i = 0; i < preposted_rbufs; i++) {
2997 2996 bzero(&rdbuf, sizeof (rdbuf));
2998 2997 rdbuf.type = RECV_BUFFER;
2999 2998 buf = rib_rbuf_alloc(conn, &rdbuf);
3000 2999 if (buf == NULL) {
3001 3000 /*
3002 3001 * A connection is not established yet.
3003 3002 * Just flush the channel. Buffers
3004 3003 * posted till now will error out with
3005 3004 * IBT_WC_WR_FLUSHED_ERR.
3006 3005 */
3007 3006 (void) ibt_flush_channel(qp->qp_hdl);
3008 3007 (void) rib_disconnect_channel(conn, NULL);
3009 3008 return (IBT_CM_REJECT);
3010 3009 }
3011 3010
3012 3011 bzero(&cl, sizeof (cl));
3013 3012 cl.w.c_saddr3 = (caddr_t)rdbuf.addr;
3014 3013 cl.c_len = rdbuf.len;
3015 3014 cl.c_smemhandle.mrc_lmr =
3016 3015 rdbuf.handle.mrc_lmr; /* lkey */
3017 3016 cl.c_next = NULL;
3018 3017 status = rib_post_recv(conn, &cl);
3019 3018 if (status != RDMA_SUCCESS) {
3020 3019 /*
3021 3020 * A connection is not established yet.
3022 3021 * Just flush the channel. Buffers
3023 3022 * posted till now will error out with
3024 3023 * IBT_WC_WR_FLUSHED_ERR.
3025 3024 */
3026 3025 (void) ibt_flush_channel(qp->qp_hdl);
3027 3026 (void) rib_disconnect_channel(conn, NULL);
3028 3027 return (IBT_CM_REJECT);
3029 3028 }
3030 3029 }
3031 3030 (void) rib_add_connlist(conn, &hca->srv_conn_list);
3032 3031
3033 3032 /*
3034 3033 * Get the address translation
3035 3034 */
3036 3035 rw_enter(&hca->state_lock, RW_READER);
3037 3036 if (hca->state == HCA_DETACHED) {
3038 3037 rw_exit(&hca->state_lock);
3039 3038 return (IBT_CM_REJECT);
3040 3039 }
3041 3040 rw_exit(&hca->state_lock);
3042 3041
3043 3042 bzero(&ipinfo, sizeof (ibt_ip_cm_info_t));
3044 3043
3045 3044 if (ibt_get_ip_data(event->cm_priv_data_len,
3046 3045 event->cm_priv_data,
3047 3046 &ipinfo) != IBT_SUCCESS) {
3048 3047
3049 3048 return (IBT_CM_REJECT);
3050 3049 }
3051 3050
3052 3051 switch (ipinfo.src_addr.family) {
3053 3052 case AF_INET:
3054 3053
3055 3054 conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1,
3056 3055 KM_SLEEP);
3057 3056 (void) strcpy(conn->c_netid, RIBNETID_TCP);
3058 3057
3059 3058 conn->c_raddr.maxlen =
3060 3059 conn->c_raddr.len = sin_size;
3061 3060 conn->c_raddr.buf = kmem_zalloc(sin_size, KM_SLEEP);
3062 3061
3063 3062 s = (struct sockaddr_in *)conn->c_raddr.buf;
3064 3063 s->sin_family = AF_INET;
3065 3064 bcopy((void *)&ipinfo.src_addr.un.ip4addr,
3066 3065 &s->sin_addr, in_size);
3067 3066
3068 3067 conn->c_laddr.maxlen =
3069 3068 conn->c_laddr.len = sin_size;
3070 3069 conn->c_laddr.buf = kmem_zalloc(sin_size, KM_SLEEP);
3071 3070
3072 3071 s = (struct sockaddr_in *)conn->c_laddr.buf;
3073 3072 s->sin_family = AF_INET;
3074 3073 bcopy((void *)&ipinfo.dst_addr.un.ip4addr,
3075 3074 &s->sin_addr, in_size);
3076 3075
3077 3076 conn->c_addrmask.maxlen = conn->c_addrmask.len =
3078 3077 sizeof (struct sockaddr_in);
3079 3078 conn->c_addrmask.buf =
3080 3079 kmem_zalloc(conn->c_addrmask.len, KM_SLEEP);
3081 3080 ((struct sockaddr_in *)
3082 3081 conn->c_addrmask.buf)->sin_addr.s_addr =
3083 3082 (uint32_t)~0;
3084 3083 ((struct sockaddr_in *)
3085 3084 conn->c_addrmask.buf)->sin_family =
3086 3085 (sa_family_t)~0;
3087 3086 break;
3088 3087
3089 3088 case AF_INET6:
3090 3089
3091 3090 conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1,
3092 3091 KM_SLEEP);
3093 3092 (void) strcpy(conn->c_netid, RIBNETID_TCP6);
3094 3093
3095 3094 conn->c_raddr.maxlen =
3096 3095 conn->c_raddr.len = sin6_size;
3097 3096 conn->c_raddr.buf = kmem_zalloc(sin6_size, KM_SLEEP);
3098 3097
3099 3098 s6 = (struct sockaddr_in6 *)conn->c_raddr.buf;
3100 3099 s6->sin6_family = AF_INET6;
3101 3100 bcopy((void *)&ipinfo.src_addr.un.ip6addr,
3102 3101 &s6->sin6_addr,
3103 3102 sizeof (struct in6_addr));
3104 3103
3105 3104 conn->c_laddr.maxlen =
3106 3105 conn->c_laddr.len = sin6_size;
3107 3106 conn->c_laddr.buf = kmem_zalloc(sin6_size, KM_SLEEP);
3108 3107
3109 3108 s6 = (struct sockaddr_in6 *)conn->c_laddr.buf;
3110 3109 s6->sin6_family = AF_INET6;
3111 3110 bcopy((void *)&ipinfo.dst_addr.un.ip6addr,
3112 3111 &s6->sin6_addr,
3113 3112 sizeof (struct in6_addr));
3114 3113
3115 3114 conn->c_addrmask.maxlen = conn->c_addrmask.len =
3116 3115 sizeof (struct sockaddr_in6);
3117 3116 conn->c_addrmask.buf =
3118 3117 kmem_zalloc(conn->c_addrmask.len, KM_SLEEP);
3119 3118 (void) memset(&((struct sockaddr_in6 *)
3120 3119 conn->c_addrmask.buf)->sin6_addr, (uchar_t)~0,
3121 3120 sizeof (struct in6_addr));
3122 3121 ((struct sockaddr_in6 *)
3123 3122 conn->c_addrmask.buf)->sin6_family =
3124 3123 (sa_family_t)~0;
3125 3124 break;
3126 3125
3127 3126 default:
3128 3127 return (IBT_CM_REJECT);
3129 3128 }
3130 3129
3131 3130 break;
3132 3131
3133 3132 case IBT_CM_EVENT_CONN_CLOSED:
3134 3133 {
3135 3134 CONN *conn;
3136 3135 rib_qp_t *qp;
3137 3136
3138 3137 switch (event->cm_event.closed) {
3139 3138 case IBT_CM_CLOSED_DREP_RCVD:
3140 3139 case IBT_CM_CLOSED_DREQ_TIMEOUT:
3141 3140 case IBT_CM_CLOSED_DUP:
3142 3141 case IBT_CM_CLOSED_ABORT:
3143 3142 case IBT_CM_CLOSED_ALREADY:
3144 3143 /*
3145 3144 * These cases indicate the local end initiated
3146 3145 * the closing of the channel. Nothing to do here.
3147 3146 */
3148 3147 break;
3149 3148 default:
3150 3149 /*
3151 3150 * Reason for CONN_CLOSED event must be one of
3152 3151 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
3153 3152 			 * or IBT_CM_CLOSED_STALE. These indicate cases where
3154 3153 * the remote end is closing the channel. In these
3155 3154 * cases free the channel and transition to error
3156 3155 * state
3157 3156 */
3158 3157 qp = ibt_get_chan_private(event->cm_channel);
3159 3158 conn = qptoc(qp);
3160 3159 mutex_enter(&conn->c_lock);
3161 3160 if (conn->c_state == C_DISCONN_PEND) {
3162 3161 mutex_exit(&conn->c_lock);
3163 3162 break;
3164 3163 }
3165 3164 conn->c_state = C_ERROR_CONN;
3166 3165
3167 3166 /*
3168 3167 * Free the conn if c_ref goes down to 0
3169 3168 */
3170 3169 if (conn->c_ref == 0) {
3171 3170 /*
3172 3171 * Remove from list and free conn
3173 3172 */
3174 3173 conn->c_state = C_DISCONN_PEND;
3175 3174 mutex_exit(&conn->c_lock);
3176 3175 (void) rib_disconnect_channel(conn,
3177 3176 &hca->srv_conn_list);
3178 3177 } else {
3179 3178 /*
3180 3179 * conn will be freed when c_ref goes to 0.
3181 3180 * Indicate to cleaning thread not to close
3182 3181 * the connection, but just free the channel.
3183 3182 */
3184 3183 conn->c_flags |= C_CLOSE_NOTNEEDED;
3185 3184 mutex_exit(&conn->c_lock);
3186 3185 }
3187 3186 DTRACE_PROBE(rpcib__i__srvcm_chandisconnect);
3188 3187 break;
3189 3188 }
3190 3189 break;
3191 3190 }
3192 3191 case IBT_CM_EVENT_CONN_EST:
3193 3192 /*
3194 3193 * RTU received, hence connection established.
3195 3194 */
3196 3195 if (rib_debug > 1)
3197 3196 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3198 3197 "(CONN_EST) channel established");
3199 3198 break;
3200 3199
3201 3200 default:
3202 3201 if (rib_debug > 2) {
3203 3202 /* Let CM handle the following events. */
3204 3203 if (event->cm_type == IBT_CM_EVENT_REP_RCV) {
3205 3204 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3206 3205 "server recv'ed IBT_CM_EVENT_REP_RCV\n");
3207 3206 } else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) {
3208 3207 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3209 3208 "server recv'ed IBT_CM_EVENT_LAP_RCV\n");
3210 3209 } else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) {
3211 3210 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3212 3211 "server recv'ed IBT_CM_EVENT_MRA_RCV\n");
3213 3212 } else if (event->cm_type == IBT_CM_EVENT_APR_RCV) {
3214 3213 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3215 3214 "server recv'ed IBT_CM_EVENT_APR_RCV\n");
3216 3215 } else if (event->cm_type == IBT_CM_EVENT_FAILURE) {
3217 3216 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3218 3217 "server recv'ed IBT_CM_EVENT_FAILURE\n");
3219 3218 }
3220 3219 }
3221 3220 return (IBT_CM_DEFAULT);
3222 3221 }
3223 3222
3224 3223 /* accept all other CM messages (i.e. let the CM handle them) */
3225 3224 return (IBT_CM_ACCEPT);
3226 3225 }
3227 3226
3228 3227 static rdma_stat
3229 3228 rib_register_service(rib_hca_t *hca, int service_type,
3230 3229 uint8_t protocol_num, in_port_t dst_port)
3231 3230 {
3232 3231 ibt_srv_desc_t sdesc;
3233 3232 ibt_hca_portinfo_t *port_infop;
3234 3233 ib_svc_id_t srv_id;
3235 3234 ibt_srv_hdl_t srv_hdl;
3236 3235 uint_t port_size;
3237 3236 uint_t pki, i, num_ports, nbinds;
3238 3237 ibt_status_t ibt_status;
3239 3238 rib_service_t *service;
3240 3239 ib_pkey_t pkey;
3241 3240
3242 3241 /*
3243 3242 * Query all ports for the given HCA
3244 3243 */
3245 3244 rw_enter(&hca->state_lock, RW_READER);
3246 3245 if (hca->state != HCA_DETACHED) {
3247 3246 ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop,
3248 3247 &num_ports, &port_size);
3249 3248 rw_exit(&hca->state_lock);
3250 3249 } else {
3251 3250 rw_exit(&hca->state_lock);
3252 3251 return (RDMA_FAILED);
3253 3252 }
3254 3253 if (ibt_status != IBT_SUCCESS) {
3255 3254 return (RDMA_FAILED);
3256 3255 }
3257 3256
3258 3257 DTRACE_PROBE1(rpcib__i__regservice_numports,
3259 3258 int, num_ports);
3260 3259
3261 3260 for (i = 0; i < num_ports; i++) {
3262 3261 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) {
3263 3262 DTRACE_PROBE1(rpcib__i__regservice__portinactive,
3264 3263 int, i+1);
3265 3264 } else if (port_infop[i].p_linkstate == IBT_PORT_ACTIVE) {
3266 3265 DTRACE_PROBE1(rpcib__i__regservice__portactive,
3267 3266 int, i+1);
3268 3267 }
3269 3268 }
3270 3269
3271 3270 /*
3272 3271 * Get all the IP addresses on this system to register the
3273 3272 * given "service type" on all DNS recognized IP addrs.
3274 3273 	 * Each service type, such as NFS, will have all the system's
3275 3274 	 * IP addresses as its different names. For now the only
3276 3275 * type of service we support in RPCIB is NFS.
3277 3276 */
3278 3277 rw_enter(&rib_stat->service_list_lock, RW_WRITER);
3279 3278 /*
3280 3279 	 * Start registering and binding the service on the
3281 3280 	 * active ports of this HCA.
3282 3281 */
3283 3282 nbinds = 0;
3284 3283 for (service = rib_stat->service_list;
3285 3284 service && (service->srv_type != service_type);
3286 3285 service = service->next)
3287 3286 ;
3288 3287
3289 3288 if (service == NULL) {
3290 3289 /*
3291 3290 * We use IP addresses as the service names for
3292 3291 * service registration. Register each of them
3293 3292 * with CM to obtain a svc_id and svc_hdl. We do not
3294 3293 		 * register the service with the machine's loopback address.
3295 3294 */
3296 3295 (void) bzero(&srv_id, sizeof (ib_svc_id_t));
3297 3296 (void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t));
3298 3297 (void) bzero(&sdesc, sizeof (ibt_srv_desc_t));
3299 3298 sdesc.sd_handler = rib_srv_cm_handler;
3300 3299 sdesc.sd_flags = 0;
3301 3300 ibt_status = ibt_register_service(hca->ibt_clnt_hdl,
3302 3301 &sdesc, ibt_get_ip_sid(protocol_num, dst_port),
3303 3302 1, &srv_hdl, &srv_id);
3304 3303 if ((ibt_status != IBT_SUCCESS) &&
3305 3304 (ibt_status != IBT_CM_SERVICE_EXISTS)) {
3306 3305 rw_exit(&rib_stat->service_list_lock);
3307 3306 DTRACE_PROBE1(rpcib__i__regservice__ibtres,
3308 3307 int, ibt_status);
3309 3308 ibt_free_portinfo(port_infop, port_size);
3310 3309 return (RDMA_FAILED);
3311 3310 }
3312 3311
3313 3312 /*
3314 3313 * Allocate and prepare a service entry
3315 3314 */
3316 3315 service = kmem_zalloc(sizeof (rib_service_t), KM_SLEEP);
3317 3316
3318 3317 service->srv_type = service_type;
3319 3318 service->srv_hdl = srv_hdl;
3320 3319 service->srv_id = srv_id;
3321 3320
3322 3321 service->next = rib_stat->service_list;
3323 3322 rib_stat->service_list = service;
3324 3323 DTRACE_PROBE1(rpcib__i__regservice__new__service,
3325 3324 int, service->srv_type);
3326 3325 } else {
3327 3326 srv_hdl = service->srv_hdl;
3328 3327 srv_id = service->srv_id;
3329 3328 DTRACE_PROBE1(rpcib__i__regservice__existing__service,
3330 3329 int, service->srv_type);
3331 3330 }
3332 3331
3333 3332 for (i = 0; i < num_ports; i++) {
3334 3333 ibt_sbind_hdl_t sbp;
3335 3334 rib_hca_service_t *hca_srv;
3336 3335 ib_gid_t gid;
3337 3336
3338 3337 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE)
3339 3338 continue;
3340 3339
3341 3340 for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) {
3342 3341 pkey = port_infop[i].p_pkey_tbl[pki];
3343 3342
3344 3343 rw_enter(&hca->bound_services_lock, RW_READER);
3345 3344 gid = port_infop[i].p_sgid_tbl[0];
3346 3345 for (hca_srv = hca->bound_services; hca_srv;
3347 3346 hca_srv = hca_srv->next) {
3348 3347 if ((hca_srv->srv_id == service->srv_id) &&
3349 3348 (hca_srv->gid.gid_prefix ==
3350 3349 gid.gid_prefix) &&
3351 3350 (hca_srv->gid.gid_guid == gid.gid_guid))
3352 3351 break;
3353 3352 }
3354 3353 rw_exit(&hca->bound_services_lock);
3355 3354 if (hca_srv != NULL) {
3356 3355 /*
3357 3356 				 * port is already bound to the service
3358 3357 */
3359 3358 DTRACE_PROBE1(
3360 3359 rpcib__i__regservice__already__bound,
3361 3360 int, i+1);
3362 3361 nbinds++;
3363 3362 continue;
3364 3363 }
3365 3364
3366 3365 if ((pkey & IBSRM_HB) &&
3367 3366 (pkey != IB_PKEY_INVALID_FULL)) {
3368 3367
3369 3368 sbp = NULL;
3370 3369 ibt_status = ibt_bind_service(srv_hdl,
3371 3370 gid, NULL, hca, &sbp);
3372 3371
3373 3372 if (ibt_status == IBT_SUCCESS) {
3374 3373 hca_srv = kmem_zalloc(
3375 3374 sizeof (rib_hca_service_t),
3376 3375 KM_SLEEP);
3377 3376 hca_srv->srv_id = srv_id;
3378 3377 hca_srv->gid = gid;
3379 3378 hca_srv->sbind_hdl = sbp;
3380 3379
3381 3380 rw_enter(&hca->bound_services_lock,
3382 3381 RW_WRITER);
3383 3382 hca_srv->next = hca->bound_services;
3384 3383 hca->bound_services = hca_srv;
3385 3384 rw_exit(&hca->bound_services_lock);
3386 3385 nbinds++;
3387 3386 }
3388 3387
3389 3388 DTRACE_PROBE1(rpcib__i__regservice__bindres,
3390 3389 int, ibt_status);
3391 3390 }
3392 3391 }
3393 3392 }
3394 3393 rw_exit(&rib_stat->service_list_lock);
3395 3394
3396 3395 ibt_free_portinfo(port_infop, port_size);
3397 3396
3398 3397 if (nbinds == 0) {
3399 3398 return (RDMA_FAILED);
3400 3399 } else {
3401 3400 /*
3402 3401 		 * Put this plugin into accept state, since at least
3403 3402 * one registration was successful.
3404 3403 */
3405 3404 mutex_enter(&plugin_state_lock);
3406 3405 plugin_state = ACCEPT;
3407 3406 mutex_exit(&plugin_state_lock);
3408 3407 return (RDMA_SUCCESS);
3409 3408 }
3410 3409 }
3411 3410
3412 3411 void
3413 3412 rib_listen(struct rdma_svc_data *rd)
3414 3413 {
3415 3414 rdma_stat status;
3416 3415 int n_listening = 0;
3417 3416 rib_hca_t *hca;
3418 3417
3419 3418 mutex_enter(&rib_stat->listen_lock);
3420 3419 /*
3421 3420 	 * If the rd parameter is NULL, it means that rib_stat->q was
3422 3421 	 * already initialized by an earlier call from RDMA and we just want
3423 3422 	 * to bring a newly attached HCA to the same listening state as the
3424 3423 	 * other HCAs.
3425 3424 */
3426 3425 if (rd == NULL) {
3427 3426 if (rib_stat->q == NULL) {
3428 3427 mutex_exit(&rib_stat->listen_lock);
3429 3428 return;
3430 3429 }
3431 3430 } else {
3432 3431 rib_stat->q = &rd->q;
3433 3432 }
3434 3433 rw_enter(&rib_stat->hcas_list_lock, RW_READER);
3435 3434 for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
3436 3435 /*
3437 3436 * First check if a hca is still attached
3438 3437 */
3439 3438 rw_enter(&hca->state_lock, RW_READER);
3440 3439 if (hca->state != HCA_INITED) {
3441 3440 rw_exit(&hca->state_lock);
3442 3441 continue;
3443 3442 }
3444 3443 rw_exit(&hca->state_lock);
3445 3444
3446 3445 /*
3447 3446 		 * Right now the only service type is NFS, so the value is
3448 3447 		 * hard-coded here. Ideally the service type should be passed
3449 3448 		 * down in rdma_svc_data so it can be communicated to this
3450 3449 		 * layer.
3451 3450 */
3452 3451 status = rib_register_service(hca, NFS,
3453 3452 IPPROTO_TCP, nfs_rdma_port);
3454 3453 if (status == RDMA_SUCCESS)
3455 3454 n_listening++;
3456 3455 }
3457 3456 rw_exit(&rib_stat->hcas_list_lock);
3458 3457
3459 3458 /*
3460 3459 	 * Report whether the service is active on at least one HCA;
3461 3460 	 * rd->err_code carries the detailed status.
3462 3461 */
3463 3462 if (rd) {
3464 3463 if (n_listening > 0) {
3465 3464 rd->active = 1;
3466 3465 rd->err_code = RDMA_SUCCESS;
3467 3466 } else {
3468 3467 rd->active = 0;
3469 3468 rd->err_code = RDMA_FAILED;
3470 3469 }
3471 3470 }
3472 3471 mutex_exit(&rib_stat->listen_lock);
3473 3472 }
3474 3473
3475 3474 /* XXXX */
3476 3475 /* ARGSUSED */
3477 3476 static void
3478 3477 rib_listen_stop(struct rdma_svc_data *svcdata)
3479 3478 {
3480 3479 rib_hca_t *hca;
3481 3480
3482 3481 mutex_enter(&rib_stat->listen_lock);
3483 3482 /*
3484 3483 	 * KRPC called the RDMATF to stop the listeners. This means we
3485 3484 	 * stop sending incoming or received requests to the KRPC master
3486 3485 	 * transport handle for RDMA-IB. It also means that the
3487 3486 	 * master transport handle, responsible for us, is going away.
3488 3487 */
3489 3488 mutex_enter(&plugin_state_lock);
3490 3489 plugin_state = NO_ACCEPT;
3491 3490 if (svcdata != NULL)
3492 3491 svcdata->active = 0;
3493 3492 mutex_exit(&plugin_state_lock);
3494 3493
3495 3494 rw_enter(&rib_stat->hcas_list_lock, RW_READER);
3496 3495 for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
3497 3496 /*
3498 3497 		 * First check if the HCA is still attached
3499 3498 */
3500 3499 rw_enter(&hca->state_lock, RW_READER);
3501 3500 if (hca->state == HCA_DETACHED) {
3502 3501 rw_exit(&hca->state_lock);
3503 3502 continue;
3504 3503 }
3505 3504 rib_close_channels(&hca->srv_conn_list);
3506 3505 rib_stop_services(hca);
3507 3506 rw_exit(&hca->state_lock);
3508 3507 }
3509 3508 rw_exit(&rib_stat->hcas_list_lock);
3510 3509
3511 3510 /*
3512 3511 * Avoid rib_listen() using the stale q field.
3513 3512 * This could happen if a port goes up after all services
3514 3513 * are already unregistered.
3515 3514 */
3516 3515 rib_stat->q = NULL;
3517 3516 mutex_exit(&rib_stat->listen_lock);
3518 3517 }
3519 3518
3520 3519 /*
3521 3520 * Traverse the HCA's service list to unbind and deregister services.
3522 3521  * For each bound service of the HCA being removed, first find the corresponding
3523 3522 * service handle (srv_hdl) and then unbind the service by calling
3524 3523 * ibt_unbind_service().
3525 3524 */
3526 3525 static void
3527 3526 rib_stop_services(rib_hca_t *hca)
3528 3527 {
3529 3528 rib_hca_service_t *srv_list, *to_remove;
3530 3529
3531 3530 /*
3532 3531 	 * Unbind and deregister the services for this service type.
3533 3532 	 * Right now there is only one service type. In the future it
3534 3533 	 * will be passed down to this function.
3535 3534 */
3536 3535 rw_enter(&hca->bound_services_lock, RW_READER);
3537 3536 srv_list = hca->bound_services;
3538 3537 hca->bound_services = NULL;
3539 3538 rw_exit(&hca->bound_services_lock);
3540 3539
3541 3540 while (srv_list != NULL) {
3542 3541 rib_service_t *sc;
3543 3542
3544 3543 to_remove = srv_list;
3545 3544 srv_list = to_remove->next;
3546 3545 rw_enter(&rib_stat->service_list_lock, RW_READER);
3547 3546 for (sc = rib_stat->service_list;
3548 3547 sc && (sc->srv_id != to_remove->srv_id);
3549 3548 sc = sc->next)
3550 3549 ;
3551 3550 /*
3552 3551 		 * If sc is NULL, the service no longer exists; it was
3553 3552 		 * probably already removed completely through rib_stat.
3554 3553 */
3555 3554 if (sc != NULL)
3556 3555 (void) ibt_unbind_service(sc->srv_hdl,
3557 3556 to_remove->sbind_hdl);
3558 3557 rw_exit(&rib_stat->service_list_lock);
3559 3558 kmem_free(to_remove, sizeof (rib_hca_service_t));
3560 3559 }
3561 3560 }
3562 3561
3563 3562 static struct svc_recv *
3564 3563 rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl)
3565 3564 {
3566 3565 struct svc_recv *recvp;
3567 3566
3568 3567 recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP);
3569 3568 recvp->vaddr = sgl->ds_va;
3570 3569 recvp->qp = qp;
3571 3570 recvp->bytes_xfer = 0;
3572 3571 return (recvp);
3573 3572 }
3574 3573
3575 3574 static int
3576 3575 rib_free_svc_recv(struct svc_recv *recvp)
3577 3576 {
3578 3577 kmem_free(recvp, sizeof (*recvp));
3579 3578
3580 3579 return (0);
3581 3580 }
3582 3581
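/*
 * Allocate a reply tracking entry for the given xid and insert it at the
 * head of the qp's reply list.  Returns NULL if the allocation fails.
 */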
3583 3582 static struct reply *
3584 3583 rib_addreplylist(rib_qp_t *qp, uint32_t msgid)
3585 3584 {
3586 3585 struct reply *rep;
3587 3586
3588 3587
3589 3588 rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP);
3590 3589 if (rep == NULL) {
3591 3590 DTRACE_PROBE(rpcib__i__addrreply__nomem);
3592 3591 return (NULL);
3593 3592 }
3594 3593 rep->xid = msgid;
3595 3594 rep->vaddr_cq = NULL;
3596 3595 rep->bytes_xfer = 0;
3597 3596 rep->status = (uint_t)REPLY_WAIT;
3598 3597 rep->prev = NULL;
3599 3598 cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL);
3600 3599
3601 3600 mutex_enter(&qp->replylist_lock);
3602 3601 if (qp->replylist) {
3603 3602 rep->next = qp->replylist;
3604 3603 qp->replylist->prev = rep;
3605 3604 }
3606 3605 qp->rep_list_size++;
3607 3606
3608 3607 DTRACE_PROBE1(rpcib__i__addrreply__listsize,
3609 3608 int, qp->rep_list_size);
3610 3609
3611 3610 qp->replylist = rep;
3612 3611 mutex_exit(&qp->replylist_lock);
3613 3612
3614 3613 return (rep);
3615 3614 }
3616 3615
3617 3616 static rdma_stat
3618 3617 rib_rem_replylist(rib_qp_t *qp)
3619 3618 {
3620 3619 struct reply *r, *n;
3621 3620
3622 3621 mutex_enter(&qp->replylist_lock);
3623 3622 for (r = qp->replylist; r != NULL; r = n) {
3624 3623 n = r->next;
3625 3624 (void) rib_remreply(qp, r);
3626 3625 }
3627 3626 mutex_exit(&qp->replylist_lock);
3628 3627
3629 3628 return (RDMA_SUCCESS);
3630 3629 }
3631 3630
3632 3631 static int
3633 3632 rib_remreply(rib_qp_t *qp, struct reply *rep)
3634 3633 {
3635 3634
3636 3635 ASSERT(MUTEX_HELD(&qp->replylist_lock));
3637 3636 if (rep->prev) {
3638 3637 rep->prev->next = rep->next;
3639 3638 }
3640 3639 if (rep->next) {
3641 3640 rep->next->prev = rep->prev;
3642 3641 }
3643 3642 if (qp->replylist == rep)
3644 3643 qp->replylist = rep->next;
3645 3644
3646 3645 cv_destroy(&rep->wait_cv);
3647 3646 qp->rep_list_size--;
3648 3647
3649 3648 DTRACE_PROBE1(rpcib__i__remreply__listsize,
3650 3649 int, qp->rep_list_size);
3651 3650
3652 3651 kmem_free(rep, sizeof (*rep));
3653 3652
3654 3653 return (0);
3655 3654 }
3656 3655
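/*
 * Register the memory region [buf, buf + buflen) with the connection's HCA.
 * On success the memory region handle and the local/remote keys are
 * returned in buf_handle; on failure buf_handle is zeroed out.
 */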
3657 3656 rdma_stat
3658 3657 rib_registermem(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen,
3659 3658 struct mrc *buf_handle)
3660 3659 {
3661 3660 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */
3662 3661 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */
3663 3662 rdma_stat status;
3664 3663 rib_hca_t *hca = (ctoqp(conn))->hca;
3665 3664
3666 3665 /*
3667 3666 * Note: ALL buffer pools use the same memory type RDMARW.
3668 3667 */
3669 3668 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
3670 3669 if (status == RDMA_SUCCESS) {
3671 3670 buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3672 3671 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3673 3672 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3674 3673 } else {
3675 3674 buf_handle->mrc_linfo = NULL;
3676 3675 buf_handle->mrc_lmr = 0;
3677 3676 buf_handle->mrc_rmr = 0;
3678 3677 }
3679 3678 return (status);
3680 3679 }
3681 3680
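/*
 * Common registration helper: build the ibt_mr_attr_t (local write, remote
 * read/write and window bind enabled, plus any caller-specified flags) and
 * call ibt_register_mr() while holding the HCA state lock, so that the
 * registration cannot race with an HCA detach.
 */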
3682 3681 static rdma_stat
3683 3682 rib_reg_mem(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size,
3684 3683 ibt_mr_flags_t spec,
3685 3684 ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp)
3686 3685 {
3687 3686 ibt_mr_attr_t mem_attr;
3688 3687 ibt_status_t ibt_status;
3689 3688 mem_attr.mr_vaddr = (uintptr_t)buf;
3690 3689 mem_attr.mr_len = (ib_msglen_t)size;
3691 3690 mem_attr.mr_as = (struct as *)(caddr_t)adsp;
3692 3691 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE |
3693 3692 IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE |
3694 3693 IBT_MR_ENABLE_WINDOW_BIND | spec;
3695 3694
3696 3695 rw_enter(&hca->state_lock, RW_READER);
3697 3696 if (hca->state != HCA_DETACHED) {
3698 3697 ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl,
3699 3698 &mem_attr, mr_hdlp, mr_descp);
3700 3699 rw_exit(&hca->state_lock);
3701 3700 } else {
3702 3701 rw_exit(&hca->state_lock);
3703 3702 return (RDMA_FAILED);
3704 3703 }
3705 3704
3706 3705 if (ibt_status != IBT_SUCCESS) {
3707 3706 return (RDMA_FAILED);
3708 3707 }
3709 3708 return (RDMA_SUCCESS);
3710 3709 }
3711 3710
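/*
 * Register memory for non-coherent use.  If a long reply cache entry (lrc)
 * is supplied and is already registered, simply reuse its cached handles;
 * otherwise register the whole cache buffer (or the caller's buffer) and,
 * for cache entries, remember the handles for later reuse.
 *
 * Illustrative caller sequence (a sketch, not lifted from this file; the
 * conn/buf/len/mrc/sh/lrc names are placeholders):
 *
 *	(void) rib_registermemsync(conn, NULL, buf, len, &mrc, &sh, lrc);
 *	... post RDMA work requests against mrc ...
 *	(void) rib_syncmem(conn, sh, buf, len, 0);
 *	(void) rib_deregistermemsync(conn, buf, mrc, sh, lrc);
 */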
3712 3711 rdma_stat
3713 3712 rib_registermemsync(CONN *conn, caddr_t adsp, caddr_t buf, uint_t buflen,
3714 3713 struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc)
3715 3714 {
3716 3715 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */
3717 3716 rib_lrc_entry_t *l;
3718 3717 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */
3719 3718 rdma_stat status;
3720 3719 rib_hca_t *hca = (ctoqp(conn))->hca;
3721 3720
3722 3721 /*
3723 3722 * Non-coherent memory registration.
3724 3723 */
3725 3724 l = (rib_lrc_entry_t *)lrc;
3726 3725 if (l) {
3727 3726 if (l->registered) {
3728 3727 buf_handle->mrc_linfo =
3729 3728 (uintptr_t)l->lrc_mhandle.mrc_linfo;
3730 3729 buf_handle->mrc_lmr =
3731 3730 (uint32_t)l->lrc_mhandle.mrc_lmr;
3732 3731 buf_handle->mrc_rmr =
3733 3732 (uint32_t)l->lrc_mhandle.mrc_rmr;
3734 3733 *sync_handle = (RIB_SYNCMEM_HANDLE)
3735 3734 (uintptr_t)l->lrc_mhandle.mrc_linfo;
3736 3735 return (RDMA_SUCCESS);
3737 3736 } else {
3738 3737 /* Always register the whole buffer */
3739 3738 buf = (caddr_t)l->lrc_buf;
3740 3739 buflen = l->lrc_len;
3741 3740 }
3742 3741 }
3743 3742 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
3744 3743
3745 3744 if (status == RDMA_SUCCESS) {
3746 3745 if (l) {
3747 3746 l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl;
3748 3747 l->lrc_mhandle.mrc_lmr = (uint32_t)mr_desc.md_lkey;
3749 3748 l->lrc_mhandle.mrc_rmr = (uint32_t)mr_desc.md_rkey;
3750 3749 l->registered = TRUE;
3751 3750 }
3752 3751 buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3753 3752 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3754 3753 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3755 3754 *sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl;
3756 3755 } else {
3757 3756 buf_handle->mrc_linfo = NULL;
3758 3757 buf_handle->mrc_lmr = 0;
3759 3758 buf_handle->mrc_rmr = 0;
3760 3759 }
3761 3760 return (status);
3762 3761 }
3763 3762
3764 3763 /* ARGSUSED */
3765 3764 rdma_stat
3766 3765 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle)
3767 3766 {
3768 3767 rib_hca_t *hca = (ctoqp(conn))->hca;
3769 3768 /*
3770 3769 * Allow memory deregistration even if HCA is
3771 3770 * getting detached. Need all outstanding
3772 3771 * memory registrations to be deregistered
3773 3772 * before HCA_DETACH_EVENT can be accepted.
3774 3773 */
3775 3774 (void) ibt_deregister_mr(hca->hca_hdl,
3776 3775 (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
3777 3776 return (RDMA_SUCCESS);
3778 3777 }
3779 3778
3780 3779 /* ARGSUSED */
3781 3780 rdma_stat
3782 3781 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle,
3783 3782 RIB_SYNCMEM_HANDLE sync_handle, void *lrc)
3784 3783 {
3785 3784 rib_lrc_entry_t *l;
3786 3785 l = (rib_lrc_entry_t *)lrc;
3787 3786 if (l)
3788 3787 if (l->registered)
3789 3788 return (RDMA_SUCCESS);
3790 3789
3791 3790 (void) rib_deregistermem(conn, buf, buf_handle);
3792 3791
3793 3792 return (RDMA_SUCCESS);
3794 3793 }
3795 3794
3796 3795 /* ARGSUSED */
3797 3796 rdma_stat
3798 3797 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf,
3799 3798 int len, int cpu)
3800 3799 {
3801 3800 ibt_status_t status;
3802 3801 rib_hca_t *hca = (ctoqp(conn))->hca;
3803 3802 ibt_mr_sync_t mr_segment;
3804 3803
3805 3804 mr_segment.ms_handle = (ibt_mr_hdl_t)shandle;
3806 3805 mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf;
3807 3806 mr_segment.ms_len = (ib_memlen_t)len;
3808 3807 if (cpu) {
3809 3808 /* make incoming data visible to memory */
3810 3809 mr_segment.ms_flags = IBT_SYNC_WRITE;
3811 3810 } else {
3812 3811 /* make memory changes visible to IO */
3813 3812 mr_segment.ms_flags = IBT_SYNC_READ;
3814 3813 }
3815 3814 rw_enter(&hca->state_lock, RW_READER);
3816 3815 if (hca->state != HCA_DETACHED) {
3817 3816 status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1);
3818 3817 rw_exit(&hca->state_lock);
3819 3818 } else {
3820 3819 rw_exit(&hca->state_lock);
3821 3820 return (RDMA_FAILED);
3822 3821 }
3823 3822
3824 3823 if (status == IBT_SUCCESS)
3825 3824 return (RDMA_SUCCESS);
3826 3825 else {
3827 3826 return (RDMA_FAILED);
3828 3827 }
3829 3828 }
3830 3829
3831 3830 /*
3832 3831 * XXXX ????
3833 3832 */
3834 3833 static rdma_stat
3835 3834 rib_getinfo(rdma_info_t *info)
3836 3835 {
3837 3836 /*
3838 3837 * XXXX Hack!
3839 3838 */
3840 3839 info->addrlen = 16;
3841 3840 info->mts = 1000000;
3842 3841 info->mtu = 1000000;
3843 3842
3844 3843 return (RDMA_SUCCESS);
3845 3844 }
3846 3845
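/*
 * Create a pre-registered buffer pool of 'num' buffers for the given pool
 * type: SEND_BUFFER pools use RPC_MSG_SZ (1K) buffers and RECV_BUFFER pools
 * use RPC_BUF_SIZE (2K) buffers.  Each buffer is registered with the HCA
 * and its memory region handle and descriptor are kept with the pool.
 */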
3847 3846 rib_bufpool_t *
3848 3847 rib_rbufpool_create(rib_hca_t *hca, int ptype, int num)
3849 3848 {
3850 3849 rib_bufpool_t *rbp = NULL;
3851 3850 bufpool_t *bp = NULL;
3852 3851 caddr_t buf;
3853 3852 ibt_mr_attr_t mem_attr;
3854 3853 ibt_status_t ibt_status;
3855 3854 int i, j;
3856 3855
3857 3856 rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP);
3858 3857
3859 3858 bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) +
3860 3859 num * sizeof (void *), KM_SLEEP);
3861 3860
3862 3861 mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock);
3863 3862 bp->numelems = num;
3864 3863
3865 3864
3866 3865 switch (ptype) {
3867 3866 case SEND_BUFFER:
3868 3867 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3869 3868 bp->rsize = RPC_MSG_SZ;
3870 3869 break;
3871 3870 case RECV_BUFFER:
3872 3871 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3873 3872 bp->rsize = RPC_BUF_SIZE;
3874 3873 break;
3875 3874 default:
3876 3875 goto fail;
3877 3876 }
3878 3877
3879 3878 /*
3880 3879 * Register the pool.
3881 3880 */
3882 3881 bp->bufsize = num * bp->rsize;
3883 3882 bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP);
3884 3883 rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num *
3885 3884 sizeof (ibt_mr_hdl_t), KM_SLEEP);
3886 3885 rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num *
3887 3886 sizeof (ibt_mr_desc_t), KM_SLEEP);
3888 3887 rw_enter(&hca->state_lock, RW_READER);
3889 3888
3890 3889 if (hca->state == HCA_DETACHED) {
3891 3890 rw_exit(&hca->state_lock);
3892 3891 goto fail;
3893 3892 }
3894 3893
3895 3894 for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) {
3896 3895 bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t));
3897 3896 mem_attr.mr_vaddr = (uintptr_t)buf;
3898 3897 mem_attr.mr_len = (ib_msglen_t)bp->rsize;
3899 3898 mem_attr.mr_as = NULL;
3900 3899 ibt_status = ibt_register_mr(hca->hca_hdl,
3901 3900 hca->pd_hdl, &mem_attr,
3902 3901 &rbp->mr_hdl[i],
3903 3902 &rbp->mr_desc[i]);
3904 3903 if (ibt_status != IBT_SUCCESS) {
3905 3904 for (j = 0; j < i; j++) {
3906 3905 (void) ibt_deregister_mr(hca->hca_hdl,
3907 3906 rbp->mr_hdl[j]);
3908 3907 }
3909 3908 rw_exit(&hca->state_lock);
3910 3909 goto fail;
3911 3910 }
3912 3911 }
3913 3912 rw_exit(&hca->state_lock);
3914 3913 buf = (caddr_t)bp->buf;
3915 3914 for (i = 0; i < num; i++, buf += bp->rsize) {
3916 3915 bp->buflist[i] = (void *)buf;
3917 3916 }
3918 3917 bp->buffree = num - 1; /* no. of free buffers */
3919 3918 rbp->bpool = bp;
3920 3919
3921 3920 return (rbp);
3922 3921 fail:
3923 3922 if (bp) {
3924 3923 if (bp->buf)
3925 3924 kmem_free(bp->buf, bp->bufsize);
3926 3925 kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *));
3927 3926 }
3928 3927 if (rbp) {
3929 3928 if (rbp->mr_hdl)
3930 3929 kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t));
3931 3930 if (rbp->mr_desc)
3932 3931 kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t));
3933 3932 kmem_free(rbp, sizeof (rib_bufpool_t));
3934 3933 }
3935 3934 return (NULL);
3936 3935 }
3937 3936
3938 3937 static void
3939 3938 rib_rbufpool_deregister(rib_hca_t *hca, int ptype)
3940 3939 {
3941 3940 int i;
3942 3941 rib_bufpool_t *rbp = NULL;
3943 3942 bufpool_t *bp;
3944 3943
3945 3944 /*
3946 3945 * Obtain pool address based on type of pool
3947 3946 */
3948 3947 switch (ptype) {
3949 3948 case SEND_BUFFER:
3950 3949 rbp = hca->send_pool;
3951 3950 break;
3952 3951 case RECV_BUFFER:
3953 3952 rbp = hca->recv_pool;
3954 3953 break;
3955 3954 default:
3956 3955 return;
3957 3956 }
3958 3957 if (rbp == NULL)
3959 3958 return;
3960 3959
3961 3960 bp = rbp->bpool;
3962 3961
3963 3962 /*
3964 3963 * Deregister the pool memory and free it.
3965 3964 */
3966 3965 for (i = 0; i < bp->numelems; i++) {
3967 3966 (void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]);
3968 3967 }
3969 3968 }
3970 3969
3971 3970 static void
3972 3971 rib_rbufpool_free(rib_hca_t *hca, int ptype)
3973 3972 {
3974 3973
3975 3974 rib_bufpool_t *rbp = NULL;
3976 3975 bufpool_t *bp;
3977 3976
3978 3977 /*
3979 3978 * Obtain pool address based on type of pool
3980 3979 */
3981 3980 switch (ptype) {
3982 3981 case SEND_BUFFER:
3983 3982 rbp = hca->send_pool;
3984 3983 break;
3985 3984 case RECV_BUFFER:
3986 3985 rbp = hca->recv_pool;
3987 3986 break;
3988 3987 default:
3989 3988 return;
3990 3989 }
3991 3990 if (rbp == NULL)
3992 3991 return;
3993 3992
3994 3993 bp = rbp->bpool;
3995 3994
3996 3995 /*
3997 3996 * Free the pool memory.
3998 3997 */
3999 3998 if (rbp->mr_hdl)
4000 3999 kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t));
4001 4000
4002 4001 if (rbp->mr_desc)
4003 4002 kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t));
4004 4003 if (bp->buf)
4005 4004 kmem_free(bp->buf, bp->bufsize);
4006 4005 mutex_destroy(&bp->buflock);
4007 4006 kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *));
4008 4007 kmem_free(rbp, sizeof (rib_bufpool_t));
4009 4008 }
4010 4009
4011 4010 void
4012 4011 rib_rbufpool_destroy(rib_hca_t *hca, int ptype)
4013 4012 {
4014 4013 /*
4015 4014 * Deregister the pool memory and free it.
4016 4015 */
4017 4016 rib_rbufpool_deregister(hca, ptype);
4018 4017 rib_rbufpool_free(hca, ptype);
4019 4018 }
4020 4019
4021 4020 /*
4022 4021 * Fetch a buffer from the pool of type specified in rdbuf->type.
4023 4022 */
4024 4023 static rdma_stat
4025 4024 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf)
4026 4025 {
4027 4026 rib_lrc_entry_t *rlep;
4028 4027
4029 4028 if (rdbuf->type == RDMA_LONG_BUFFER) {
4030 4029 rlep = rib_get_cache_buf(conn, rdbuf->len);
4031 4030 rdbuf->rb_private = (caddr_t)rlep;
4032 4031 rdbuf->addr = rlep->lrc_buf;
4033 4032 rdbuf->handle = rlep->lrc_mhandle;
4034 4033 return (RDMA_SUCCESS);
4035 4034 }
4036 4035
4037 4036 rdbuf->addr = rib_rbuf_alloc(conn, rdbuf);
4038 4037 if (rdbuf->addr) {
4039 4038 switch (rdbuf->type) {
4040 4039 case SEND_BUFFER:
4041 4040 rdbuf->len = RPC_MSG_SZ; /* 1K */
4042 4041 break;
4043 4042 case RECV_BUFFER:
4044 4043 rdbuf->len = RPC_BUF_SIZE; /* 2K */
4045 4044 break;
4046 4045 default:
4047 4046 rdbuf->len = 0;
4048 4047 }
4049 4048 return (RDMA_SUCCESS);
4050 4049 } else
4051 4050 return (RDMA_FAILED);
4052 4051 }
4053 4052
4054 4053 /*
4055 4054 * Fetch a buffer of specified type.
4056 4055 * Note that rdbuf->handle is mw's rkey.
4057 4056 */
4058 4057 static void *
4059 4058 rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf)
4060 4059 {
4061 4060 rib_qp_t *qp = ctoqp(conn);
4062 4061 rib_hca_t *hca = qp->hca;
4063 4062 rdma_btype ptype = rdbuf->type;
4064 4063 void *buf;
4065 4064 rib_bufpool_t *rbp = NULL;
4066 4065 bufpool_t *bp;
4067 4066 int i;
4068 4067
4069 4068 /*
4070 4069 * Obtain pool address based on type of pool
4071 4070 */
4072 4071 switch (ptype) {
4073 4072 case SEND_BUFFER:
4074 4073 rbp = hca->send_pool;
4075 4074 break;
4076 4075 case RECV_BUFFER:
4077 4076 rbp = hca->recv_pool;
4078 4077 break;
4079 4078 default:
4080 4079 return (NULL);
4081 4080 }
4082 4081 if (rbp == NULL)
4083 4082 return (NULL);
4084 4083
4085 4084 bp = rbp->bpool;
4086 4085
4087 4086 mutex_enter(&bp->buflock);
4088 4087 if (bp->buffree < 0) {
4089 4088 mutex_exit(&bp->buflock);
4090 4089 return (NULL);
4091 4090 }
4092 4091
4093 4092 /* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. */
4094 4093 buf = bp->buflist[bp->buffree];
4095 4094 rdbuf->addr = buf;
4096 4095 rdbuf->len = bp->rsize;
4097 4096 for (i = bp->numelems - 1; i >= 0; i--) {
4098 4097 if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) {
4099 4098 rdbuf->handle.mrc_rmr =
4100 4099 (uint32_t)rbp->mr_desc[i].md_rkey;
4101 4100 rdbuf->handle.mrc_linfo =
4102 4101 (uintptr_t)rbp->mr_hdl[i];
4103 4102 rdbuf->handle.mrc_lmr =
4104 4103 (uint32_t)rbp->mr_desc[i].md_lkey;
4105 4104 bp->buffree--;
4106 4105
4107 4106 mutex_exit(&bp->buflock);
4108 4107
4109 4108 return (buf);
4110 4109 }
4111 4110 }
4112 4111
4113 4112 mutex_exit(&bp->buflock);
4114 4113
4115 4114 return (NULL);
4116 4115 }
4117 4116
4118 4117 static void
4119 4118 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf)
4120 4119 {
4121 4120
4122 4121 if (rdbuf->type == RDMA_LONG_BUFFER) {
4123 4122 rib_free_cache_buf(conn, (rib_lrc_entry_t *)rdbuf->rb_private);
4124 4123 rdbuf->rb_private = NULL;
4125 4124 return;
4126 4125 }
4127 4126 rib_rbuf_free(conn, rdbuf->type, rdbuf->addr);
4128 4127 }
4129 4128
4130 4129 static void
4131 4130 rib_rbuf_free(CONN *conn, int ptype, void *buf)
4132 4131 {
4133 4132 rib_qp_t *qp = ctoqp(conn);
4134 4133 rib_hca_t *hca = qp->hca;
4135 4134 rib_bufpool_t *rbp = NULL;
4136 4135 bufpool_t *bp;
4137 4136
4138 4137 /*
4139 4138 * Obtain pool address based on type of pool
4140 4139 */
4141 4140 switch (ptype) {
4142 4141 case SEND_BUFFER:
4143 4142 rbp = hca->send_pool;
4144 4143 break;
4145 4144 case RECV_BUFFER:
4146 4145 rbp = hca->recv_pool;
4147 4146 break;
4148 4147 default:
4149 4148 return;
4150 4149 }
4151 4150 if (rbp == NULL)
4152 4151 return;
4153 4152
4154 4153 bp = rbp->bpool;
4155 4154
4156 4155 mutex_enter(&bp->buflock);
4157 4156 if (++bp->buffree >= bp->numelems) {
4158 4157 /*
4159 4158 * Should never happen
4160 4159 */
4161 4160 bp->buffree--;
4162 4161 } else {
4163 4162 bp->buflist[bp->buffree] = buf;
4164 4163 }
4165 4164 mutex_exit(&bp->buflock);
4166 4165 }
4167 4166
4168 4167 static rdma_stat
4169 4168 rib_add_connlist(CONN *cn, rib_conn_list_t *connlist)
4170 4169 {
4171 4170 rw_enter(&connlist->conn_lock, RW_WRITER);
4172 4171 if (connlist->conn_hd) {
4173 4172 cn->c_next = connlist->conn_hd;
4174 4173 connlist->conn_hd->c_prev = cn;
4175 4174 }
4176 4175 connlist->conn_hd = cn;
4177 4176 rw_exit(&connlist->conn_lock);
4178 4177
4179 4178 return (RDMA_SUCCESS);
4180 4179 }
4181 4180
4182 4181 static rdma_stat
4183 4182 rib_rm_conn(CONN *cn, rib_conn_list_t *connlist)
4184 4183 {
4185 4184 rw_enter(&connlist->conn_lock, RW_WRITER);
4186 4185 if (cn->c_prev) {
4187 4186 cn->c_prev->c_next = cn->c_next;
4188 4187 }
4189 4188 if (cn->c_next) {
4190 4189 cn->c_next->c_prev = cn->c_prev;
4191 4190 }
4192 4191 if (connlist->conn_hd == cn)
4193 4192 connlist->conn_hd = cn->c_next;
4194 4193 rw_exit(&connlist->conn_lock);
4195 4194
4196 4195 return (RDMA_SUCCESS);
4197 4196 }
4198 4197
4199 4198 /* ARGSUSED */
4200 4199 static rdma_stat
4201 4200 rib_conn_get(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr,
4202 4201 int addr_type, void *handle, CONN **conn)
4203 4202 {
4204 4203 rdma_stat status;
4205 4204 rpcib_ping_t rpt;
4206 4205
4207 4206 status = rib_connect(s_svcaddr, d_svcaddr, addr_type, &rpt, conn);
4208 4207 return (status);
4209 4208 }
4210 4209
4211 4210 /*
4212 4211 * rib_find_hca_connection
4213 4212 *
4214 4213  * If there is an existing connection to the specified address, then
4215 4214 * it will be returned in conn, otherwise conn will be set to NULL.
4216 4215 * Also cleans up any connection that is in error state.
4217 4216 */
4218 4217 static int
4219 4218 rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr,
4220 4219 struct netbuf *d_svcaddr, CONN **conn)
4221 4220 {
4222 4221 CONN *cn;
4223 4222 clock_t cv_stat, timout;
4224 4223
4225 4224 *conn = NULL;
4226 4225 again:
4227 4226 rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
4228 4227 cn = hca->cl_conn_list.conn_hd;
4229 4228 while (cn != NULL) {
4230 4229 /*
4231 4230 		 * First, clean up any connection in the ERROR state
4232 4231 */
4233 4232 mutex_enter(&cn->c_lock);
4234 4233 if (cn->c_state == C_ERROR_CONN) {
4235 4234 if (cn->c_ref == 0) {
4236 4235 /*
4237 4236 * Remove connection from list and destroy it.
4238 4237 */
4239 4238 cn->c_state = C_DISCONN_PEND;
4240 4239 mutex_exit(&cn->c_lock);
4241 4240 rw_exit(&hca->cl_conn_list.conn_lock);
4242 4241 rib_conn_close((void *)cn);
4243 4242 goto again;
4244 4243 }
4245 4244 mutex_exit(&cn->c_lock);
4246 4245 cn = cn->c_next;
4247 4246 continue;
4248 4247 }
4249 4248 if (cn->c_state == C_DISCONN_PEND) {
4250 4249 mutex_exit(&cn->c_lock);
4251 4250 cn = cn->c_next;
4252 4251 continue;
4253 4252 }
4254 4253
4255 4254 /*
4256 4255 		 * The source address is only checked if one is present;
4257 4256 		 * this is the case for retries.
4258 4257 */
4259 4258 if ((cn->c_raddr.len == d_svcaddr->len) &&
4260 4259 (bcmp(d_svcaddr->buf, cn->c_raddr.buf,
4261 4260 d_svcaddr->len) == 0) &&
4262 4261 ((s_svcaddr->len == 0) ||
4263 4262 ((cn->c_laddr.len == s_svcaddr->len) &&
4264 4263 (bcmp(s_svcaddr->buf, cn->c_laddr.buf,
4265 4264 s_svcaddr->len) == 0)))) {
4266 4265 /*
4267 4266 * Our connection. Give up conn list lock
4268 4267 * as we are done traversing the list.
4269 4268 */
4270 4269 rw_exit(&hca->cl_conn_list.conn_lock);
4271 4270 if (cn->c_state == C_CONNECTED) {
4272 4271 cn->c_ref++; /* sharing a conn */
4273 4272 mutex_exit(&cn->c_lock);
4274 4273 *conn = cn;
4275 4274 return (RDMA_SUCCESS);
4276 4275 }
4277 4276 if (cn->c_state == C_CONN_PEND) {
4278 4277 /*
4279 4278 * Hold a reference to this conn before
4280 4279 * we give up the lock.
4281 4280 */
4282 4281 cn->c_ref++;
4283 4282 timout = ddi_get_lbolt() +
4284 4283 drv_usectohz(CONN_WAIT_TIME * 1000000);
4285 4284 while ((cv_stat = cv_timedwait_sig(&cn->c_cv,
4286 4285 &cn->c_lock, timout)) > 0 &&
4287 4286 cn->c_state == C_CONN_PEND)
4288 4287 ;
4289 4288 if (cv_stat == 0) {
4290 4289 (void) rib_conn_release_locked(cn);
4291 4290 return (RDMA_INTR);
4292 4291 }
4293 4292 if (cv_stat < 0) {
4294 4293 (void) rib_conn_release_locked(cn);
4295 4294 return (RDMA_TIMEDOUT);
4296 4295 }
4297 4296 if (cn->c_state == C_CONNECTED) {
4298 4297 *conn = cn;
4299 4298 mutex_exit(&cn->c_lock);
4300 4299 return (RDMA_SUCCESS);
4301 4300 } else {
4302 4301 (void) rib_conn_release_locked(cn);
4303 4302 return (RDMA_TIMEDOUT);
4304 4303 }
4305 4304 }
4306 4305 }
4307 4306 mutex_exit(&cn->c_lock);
4308 4307 cn = cn->c_next;
4309 4308 }
4310 4309 rw_exit(&hca->cl_conn_list.conn_lock);
4311 4310 *conn = NULL;
4312 4311 return (RDMA_FAILED);
4313 4312 }
4314 4313
4315 4314 /*
4316 4315 * Connection management.
4317 4316 * IBTF does not support recycling of channels. So connections are only
4318 4317  * in one of four states: C_CONN_PEND, C_CONNECTED, C_ERROR_CONN or
4319 4318  * C_DISCONN_PEND. There is no C_IDLE state.
4320 4319  * C_CONN_PEND state: Connection establishment to the server is in progress.
4321 4320  * C_CONNECTED state: A connection, once created, is in the C_CONNECTED state.
4322 4321  * It has an RC channel associated with it. ibt_post_send/recv are allowed
4323 4322  * only in this state.
4324 4323  * C_ERROR_CONN state: A connection transitions to this state when WRs on the
4325 4324  * channel are completed in error, when an IBT_CM_EVENT_CONN_CLOSED event
4326 4325  * happens on the channel, or when an IBT_HCA_DETACH_EVENT occurs on the HCA.
4327 4326 * C_DISCONN_PEND state: When a connection is in C_ERROR_CONN state and when
4328 4327 * c_ref drops to 0 (this indicates that RPC has no more references to this
4329 4328 * connection), the connection should be destroyed. A connection transitions
4330 4329 * into this state when it is being destroyed.
4331 4330 */
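/*
 * Typical lifecycle (sketch): a client connection created by rib_connect()
 * starts in C_CONN_PEND and moves to C_CONNECTED on success (or to
 * C_ERROR_CONN on failure); channel errors move it to C_ERROR_CONN, and it
 * enters C_DISCONN_PEND once it is no longer referenced and is being torn
 * down (see rib_conn_release_locked() and rib_conn_timeout_call()).
 */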
4332 4331 /* ARGSUSED */
4333 4332 static rdma_stat
4334 4333 rib_connect(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr,
4335 4334 int addr_type, rpcib_ping_t *rpt, CONN **conn)
4336 4335 {
4337 4336 CONN *cn;
4338 4337 int status;
4339 4338 rib_hca_t *hca;
4340 4339 rib_qp_t *qp;
4341 4340 int s_addr_len;
4342 4341 char *s_addr_buf;
4343 4342
4344 4343 rw_enter(&rib_stat->hcas_list_lock, RW_READER);
4345 4344 for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
4346 4345 rw_enter(&hca->state_lock, RW_READER);
4347 4346 if (hca->state != HCA_DETACHED) {
4348 4347 status = rib_find_hca_connection(hca, s_svcaddr,
4349 4348 d_svcaddr, conn);
4350 4349 rw_exit(&hca->state_lock);
4351 4350 if ((status == RDMA_INTR) || (status == RDMA_SUCCESS)) {
4352 4351 rw_exit(&rib_stat->hcas_list_lock);
4353 4352 return (status);
4354 4353 }
4355 4354 } else
4356 4355 rw_exit(&hca->state_lock);
4357 4356 }
4358 4357 rw_exit(&rib_stat->hcas_list_lock);
4359 4358
4360 4359 /*
4361 4360 * No existing connection found, establish a new connection.
4362 4361 */
4363 4362 bzero(rpt, sizeof (rpcib_ping_t));
4364 4363
4365 4364 status = rib_ping_srv(addr_type, d_svcaddr, rpt);
4366 4365 if (status != RDMA_SUCCESS) {
4367 4366 return (RDMA_FAILED);
4368 4367 }
4369 4368 hca = rpt->hca;
4370 4369
4371 4370 if (rpt->srcip.family == AF_INET) {
4372 4371 s_addr_len = sizeof (rpt->srcip.un.ip4addr);
4373 4372 s_addr_buf = (char *)&rpt->srcip.un.ip4addr;
4374 4373 } else if (rpt->srcip.family == AF_INET6) {
4375 4374 s_addr_len = sizeof (rpt->srcip.un.ip6addr);
4376 4375 s_addr_buf = (char *)&rpt->srcip.un.ip6addr;
4377 4376 } else {
4378 4377 return (RDMA_FAILED);
4379 4378 }
4380 4379
4381 4380 /*
4382 4381 * Channel to server doesn't exist yet, create one.
4383 4382 */
4384 4383 if (rib_clnt_create_chan(hca, d_svcaddr, &qp) != RDMA_SUCCESS) {
4385 4384 return (RDMA_FAILED);
4386 4385 }
4387 4386 cn = qptoc(qp);
4388 4387 cn->c_state = C_CONN_PEND;
4389 4388 cn->c_ref = 1;
4390 4389
4391 4390 cn->c_laddr.buf = kmem_alloc(s_addr_len, KM_SLEEP);
4392 4391 bcopy(s_addr_buf, cn->c_laddr.buf, s_addr_len);
4393 4392 cn->c_laddr.len = cn->c_laddr.maxlen = s_addr_len;
4394 4393
4395 4394 if (rpt->srcip.family == AF_INET) {
4396 4395 cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1, KM_SLEEP);
4397 4396 (void) strcpy(cn->c_netid, RIBNETID_TCP);
4398 4397
4399 4398 cn->c_addrmask.len = cn->c_addrmask.maxlen =
4400 4399 sizeof (struct sockaddr_in);
4401 4400 cn->c_addrmask.buf = kmem_zalloc(cn->c_addrmask.len, KM_SLEEP);
4402 4401
4403 4402 ((struct sockaddr_in *)cn->c_addrmask.buf)->sin_addr.s_addr =
4404 4403 (uint32_t)~0;
4405 4404 ((struct sockaddr_in *)cn->c_addrmask.buf)->sin_family =
4406 4405 (ushort_t)~0;
4407 4406
4408 4407 } else {
4409 4408 cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1, KM_SLEEP);
4410 4409 (void) strcpy(cn->c_netid, RIBNETID_TCP6);
4411 4410
4412 4411 cn->c_addrmask.len = cn->c_addrmask.maxlen =
4413 4412 sizeof (struct sockaddr_in6);
4414 4413 cn->c_addrmask.buf = kmem_zalloc(cn->c_addrmask.len, KM_SLEEP);
4415 4414
4416 4415 (void) memset(
4417 4416 &((struct sockaddr_in6 *)cn->c_addrmask.buf)->sin6_addr,
4418 4417 (uchar_t)~0, sizeof (struct in6_addr));
4419 4418 ((struct sockaddr_in6 *)cn->c_addrmask.buf)->sin6_family =
4420 4419 (sa_family_t)~0;
4421 4420 }
4422 4421
4423 4422 /*
4424 4423 * Add to conn list.
4425 4424 * We had given up the READER lock. In the time since then,
4426 4425 * another thread might have created the connection we are
4427 4426 	 * attempting here. But for now, that is quite all right - there
4428 4427 	 * might be two connections between a pair of hosts instead
4429 4428 	 * of one. If we really want to close that window, then
4430 4429 	 * we need to check the list again after acquiring the
4431 4430 * WRITER lock.
4432 4431 */
4433 4432 (void) rib_add_connlist(cn, &hca->cl_conn_list);
4434 4433 status = rib_conn_to_srv(hca, qp, rpt);
4435 4434 mutex_enter(&cn->c_lock);
4436 4435
4437 4436 if (cn->c_flags & C_CLOSE_PENDING) {
4438 4437 /*
4439 4438 		 * This handles the case where the module or
4440 4439 		 * HCA detached while the connection was being
4441 4440 		 * established. In such a case, close the
4442 4441 * connection immediately if this is the
4443 4442 * only reference.
4444 4443 */
4445 4444 if (cn->c_ref == 1) {
4446 4445 cn->c_ref--;
4447 4446 cn->c_state = C_DISCONN_PEND;
4448 4447 mutex_exit(&cn->c_lock);
4449 4448 rib_conn_close((void *)cn);
4450 4449 return (RDMA_FAILED);
4451 4450 }
4452 4451
4453 4452 /*
4454 4453 * Connection to be closed later when c_ref = 0
4455 4454 */
4456 4455 status = RDMA_FAILED;
4457 4456 }
4458 4457
4459 4458 if (status == RDMA_SUCCESS) {
4460 4459 cn->c_state = C_CONNECTED;
4461 4460 *conn = cn;
4462 4461 } else {
4463 4462 cn->c_state = C_ERROR_CONN;
4464 4463 cn->c_ref--;
4465 4464 }
4466 4465 cv_signal(&cn->c_cv);
4467 4466 mutex_exit(&cn->c_lock);
4468 4467 return (status);
4469 4468 }
4470 4469
4471 4470 static void
4472 4471 rib_conn_close(void *rarg)
4473 4472 {
4474 4473 CONN *conn = (CONN *)rarg;
4475 4474 rib_qp_t *qp = ctoqp(conn);
4476 4475
4477 4476 mutex_enter(&conn->c_lock);
4478 4477 if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) {
4479 4478
4480 4479 conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING);
4481 4480
4482 4481 /*
4483 4482 * Live connection in CONNECTED state.
4484 4483 */
4485 4484 if (conn->c_state == C_CONNECTED) {
4486 4485 conn->c_state = C_ERROR_CONN;
4487 4486 }
4488 4487 mutex_exit(&conn->c_lock);
4489 4488
4490 4489 rib_close_a_channel(conn);
4491 4490
4492 4491 mutex_enter(&conn->c_lock);
4493 4492 conn->c_flags &= ~C_CLOSE_PENDING;
4494 4493 }
4495 4494
4496 4495 mutex_exit(&conn->c_lock);
4497 4496
4498 4497 if (qp->mode == RIB_SERVER)
4499 4498 (void) rib_disconnect_channel(conn,
4500 4499 &qp->hca->srv_conn_list);
4501 4500 else
4502 4501 (void) rib_disconnect_channel(conn,
4503 4502 &qp->hca->cl_conn_list);
4504 4503 }
4505 4504
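/*
 * Timeout callback used to reap idle connections.  If the connection has
 * been re-referenced or is already being torn down, just clear the timeout;
 * if there was recent activity, re-arm the timer; otherwise dispatch
 * rib_conn_close() on the HCA's cleanup taskq and mark the connection
 * C_DISCONN_PEND (retrying later if the dispatch fails).
 */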
4506 4505 static void
4507 4506 rib_conn_timeout_call(void *carg)
4508 4507 {
4509 4508 time_t idle_time;
4510 4509 CONN *conn = (CONN *)carg;
4511 4510 rib_hca_t *hca = ctoqp(conn)->hca;
4512 4511 int error;
4513 4512
4514 4513 mutex_enter(&conn->c_lock);
4515 4514 if ((conn->c_ref > 0) ||
4516 4515 (conn->c_state == C_DISCONN_PEND)) {
4517 4516 conn->c_timeout = NULL;
4518 4517 mutex_exit(&conn->c_lock);
4519 4518 return;
4520 4519 }
4521 4520
4522 4521 idle_time = (gethrestime_sec() - conn->c_last_used);
4523 4522
4524 4523 if ((idle_time <= rib_conn_timeout) &&
4525 4524 (conn->c_state != C_ERROR_CONN)) {
4526 4525 /*
4527 4526 		 * There was activity after the last timeout, so
4528 4527 		 * extend the conn's life, unless the conn is
4529 4528 		 * already in error state.
4530 4529 */
4531 4530 conn->c_timeout = timeout(rib_conn_timeout_call, conn,
4532 4531 SEC_TO_TICK(rib_conn_timeout - idle_time));
4533 4532 mutex_exit(&conn->c_lock);
4534 4533 return;
4535 4534 }
4536 4535
4537 4536 error = ddi_taskq_dispatch(hca->cleanup_helper, rib_conn_close,
4538 4537 (void *)conn, DDI_NOSLEEP);
4539 4538
4540 4539 /*
4541 4540 * If taskq dispatch fails above, then reset the timeout
4542 4541 * to try again after 10 secs.
4543 4542 */
4544 4543
4545 4544 if (error != DDI_SUCCESS) {
4546 4545 conn->c_timeout = timeout(rib_conn_timeout_call, conn,
4547 4546 SEC_TO_TICK(RDMA_CONN_REAP_RETRY));
4548 4547 mutex_exit(&conn->c_lock);
4549 4548 return;
4550 4549 }
4551 4550
4552 4551 conn->c_state = C_DISCONN_PEND;
4553 4552 mutex_exit(&conn->c_lock);
4554 4553 }
4555 4554
4556 4555 static rdma_stat
4557 4556 rib_conn_release(CONN *conn)
4558 4557 {
4559 4558 mutex_enter(&conn->c_lock);
4560 4559 return (rib_conn_release_locked(conn));
4561 4560 }
4562 4561
4563 4562 /*
4564 4563 * Expects conn->c_lock to be held on entry.
4565 4564 * c_lock released on return
4566 4565 */
4567 4566 static rdma_stat
4568 4567 rib_conn_release_locked(CONN *conn)
4569 4568 {
4570 4569 conn->c_ref--;
4571 4570
4572 4571 conn->c_last_used = gethrestime_sec();
4573 4572 if (conn->c_ref > 0) {
4574 4573 mutex_exit(&conn->c_lock);
4575 4574 return (RDMA_SUCCESS);
4576 4575 }
4577 4576
4578 4577 /*
4579 4578 * If a conn is C_ERROR_CONN, close the channel.
4580 4579 */
4581 4580 if (conn->c_ref == 0 && conn->c_state == C_ERROR_CONN) {
4582 4581 conn->c_state = C_DISCONN_PEND;
4583 4582 mutex_exit(&conn->c_lock);
4584 4583 rib_conn_close((void *)conn);
4585 4584 return (RDMA_SUCCESS);
4586 4585 }
4587 4586
4588 4587 /*
4589 4588 * c_ref == 0, set a timeout for conn release
4590 4589 */
4591 4590
4592 4591 if (conn->c_timeout == NULL) {
4593 4592 conn->c_timeout = timeout(rib_conn_timeout_call, conn,
4594 4593 SEC_TO_TICK(rib_conn_timeout));
4595 4594 }
4596 4595
4597 4596 mutex_exit(&conn->c_lock);
4598 4597 return (RDMA_SUCCESS);
4599 4598 }
4600 4599
4601 4600 /*
4602 4601 * Add at front of list
4603 4602 */
4604 4603 static struct rdma_done_list *
4605 4604 rdma_done_add(rib_qp_t *qp, uint32_t xid)
4606 4605 {
4607 4606 struct rdma_done_list *rd;
4608 4607
4609 4608 ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4610 4609
4611 4610 rd = kmem_alloc(sizeof (*rd), KM_SLEEP);
4612 4611 rd->xid = xid;
4613 4612 cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL);
4614 4613
4615 4614 rd->prev = NULL;
4616 4615 rd->next = qp->rdlist;
4617 4616 if (qp->rdlist != NULL)
4618 4617 qp->rdlist->prev = rd;
4619 4618 qp->rdlist = rd;
4620 4619
4621 4620 return (rd);
4622 4621 }
4623 4622
4624 4623 static void
4625 4624 rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd)
4626 4625 {
4627 4626 struct rdma_done_list *r;
4628 4627
4629 4628 ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4630 4629
4631 4630 r = rd->next;
4632 4631 if (r != NULL) {
4633 4632 r->prev = rd->prev;
4634 4633 }
4635 4634
4636 4635 r = rd->prev;
4637 4636 if (r != NULL) {
4638 4637 r->next = rd->next;
4639 4638 } else {
4640 4639 qp->rdlist = rd->next;
4641 4640 }
4642 4641
4643 4642 cv_destroy(&rd->rdma_done_cv);
4644 4643 kmem_free(rd, sizeof (*rd));
4645 4644 }
4646 4645
4647 4646 static void
4648 4647 rdma_done_rem_list(rib_qp_t *qp)
4649 4648 {
4650 4649 struct rdma_done_list *r, *n;
4651 4650
4652 4651 mutex_enter(&qp->rdlist_lock);
4653 4652 for (r = qp->rdlist; r != NULL; r = n) {
4654 4653 n = r->next;
4655 4654 rdma_done_rm(qp, r);
4656 4655 }
4657 4656 mutex_exit(&qp->rdlist_lock);
4658 4657 }
4659 4658
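/*
 * Wake up the waiter whose rdma_done entry matches the given xid; fire a
 * DTrace probe if no matching xid is found on the list.
 */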
4660 4659 static void
4661 4660 rdma_done_notify(rib_qp_t *qp, uint32_t xid)
4662 4661 {
4663 4662 struct rdma_done_list *r = qp->rdlist;
4664 4663
4665 4664 ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4666 4665
4667 4666 while (r) {
4668 4667 if (r->xid == xid) {
4669 4668 cv_signal(&r->rdma_done_cv);
4670 4669 return;
4671 4670 } else {
4672 4671 r = r->next;
4673 4672 }
4674 4673 }
4675 4674 DTRACE_PROBE1(rpcib__i__donenotify__nomatchxid,
4676 4675 int, xid);
4677 4676 }
4678 4677
4679 4678 /*
4680 4679 * Expects conn->c_lock to be held by the caller.
4681 4680 */
4682 4681
4683 4682 static void
4684 4683 rib_close_a_channel(CONN *conn)
4685 4684 {
4686 4685 rib_qp_t *qp;
4687 4686 qp = ctoqp(conn);
4688 4687
4689 4688 if (qp->qp_hdl == NULL) {
4690 4689 /* channel already freed */
4691 4690 return;
4692 4691 }
4693 4692
4694 4693 /*
4695 4694 * Call ibt_close_rc_channel in blocking mode
4696 4695 * with no callbacks.
4697 4696 */
4698 4697 (void) ibt_close_rc_channel(qp->qp_hdl, IBT_NOCALLBACKS,
4699 4698 NULL, 0, NULL, NULL, 0);
4700 4699 }
4701 4700
4702 4701 /*
4703 4702  * Goes through all connections and closes their channels.
4704 4703 * This will cause all the WRs on those channels to be
4705 4704 * flushed.
4706 4705 */
4707 4706 static void
4708 4707 rib_close_channels(rib_conn_list_t *connlist)
4709 4708 {
4710 4709 CONN *conn, *tmp;
4711 4710
4712 4711 rw_enter(&connlist->conn_lock, RW_READER);
4713 4712 conn = connlist->conn_hd;
4714 4713 while (conn != NULL) {
4715 4714 mutex_enter(&conn->c_lock);
4716 4715 tmp = conn->c_next;
4717 4716 if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) {
4718 4717
4719 4718 if (conn->c_state == C_CONN_PEND) {
4720 4719 conn->c_flags |= C_CLOSE_PENDING;
4721 4720 goto next;
4722 4721 }
4723 4722
4724 4723 conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING);
4725 4724
4726 4725 /*
4727 4726 * Live connection in CONNECTED state.
4728 4727 */
4729 4728 if (conn->c_state == C_CONNECTED)
4730 4729 conn->c_state = C_ERROR_CONN;
4731 4730 mutex_exit(&conn->c_lock);
4732 4731
4733 4732 rib_close_a_channel(conn);
4734 4733
4735 4734 mutex_enter(&conn->c_lock);
4736 4735 conn->c_flags &= ~C_CLOSE_PENDING;
4737 4736 /* Signal a pending rib_disconnect_channel() */
4738 4737 cv_signal(&conn->c_cv);
4739 4738 }
4740 4739 next:
4741 4740 mutex_exit(&conn->c_lock);
4742 4741 conn = tmp;
4743 4742 }
4744 4743 rw_exit(&connlist->conn_lock);
4745 4744 }
4746 4745
4747 4746 /*
4748 4747 * Frees up all connections that are no longer being referenced
4749 4748 */
4750 4749 static void
4751 4750 rib_purge_connlist(rib_conn_list_t *connlist)
4752 4751 {
4753 4752 CONN *conn;
4754 4753
4755 4754 top:
4756 4755 rw_enter(&connlist->conn_lock, RW_READER);
4757 4756 conn = connlist->conn_hd;
4758 4757 while (conn != NULL) {
4759 4758 mutex_enter(&conn->c_lock);
4760 4759
4761 4760 /*
4762 4761 		 * At this point the connection is either in ERROR
4763 4762 * or DISCONN_PEND state. If in DISCONN_PEND state
4764 4763 * then some other thread is culling that connection.
4765 4764 * If not and if c_ref is 0, then destroy the connection.
4766 4765 */
4767 4766 if (conn->c_ref == 0 &&
4768 4767 conn->c_state != C_DISCONN_PEND) {
4769 4768 /*
4770 4769 * Cull the connection
4771 4770 */
4772 4771 conn->c_state = C_DISCONN_PEND;
4773 4772 mutex_exit(&conn->c_lock);
4774 4773 rw_exit(&connlist->conn_lock);
4775 4774 (void) rib_disconnect_channel(conn, connlist);
4776 4775 goto top;
4777 4776 } else {
4778 4777 /*
4779 4778 * conn disconnect already scheduled or will
4780 4779 * happen from conn_release when c_ref drops to 0.
4781 4780 */
4782 4781 mutex_exit(&conn->c_lock);
4783 4782 }
4784 4783 conn = conn->c_next;
4785 4784 }
4786 4785 rw_exit(&connlist->conn_lock);
4787 4786
4788 4787 /*
4789 4788 * At this point, only connections with c_ref != 0 are on the list
4790 4789 */
4791 4790 }
4792 4791
4793 4792 /*
4794 4793 * Free all the HCA resources and close
4795 4794 * the hca.
4796 4795 */
4797 4796
4798 4797 static void
4799 4798 rib_free_hca(rib_hca_t *hca)
4800 4799 {
4801 4800 (void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl);
4802 4801 (void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl);
4803 4802 (void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl);
4804 4803 (void) ibt_free_cq(hca->svc_scq->rib_cq_hdl);
4805 4804
4806 4805 kmem_free(hca->clnt_rcq, sizeof (rib_cq_t));
4807 4806 kmem_free(hca->clnt_scq, sizeof (rib_cq_t));
4808 4807 kmem_free(hca->svc_rcq, sizeof (rib_cq_t));
4809 4808 kmem_free(hca->svc_scq, sizeof (rib_cq_t));
4810 4809
4811 4810 rib_rbufpool_destroy(hca, RECV_BUFFER);
4812 4811 rib_rbufpool_destroy(hca, SEND_BUFFER);
4813 4812 rib_destroy_cache(hca);
4814 4813 if (rib_mod.rdma_count == 0)
4815 4814 (void) rdma_unregister_mod(&rib_mod);
4816 4815 (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
4817 4816 (void) ibt_close_hca(hca->hca_hdl);
4818 4817 hca->hca_hdl = NULL;
4819 4818 }
4820 4819
4821 4820
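/*
 * Quiesce an HCA: stop its services, close and purge both connection lists,
 * remove the cache kstats if this was the last HCA, and free the HCA's
 * resources once all connections and in-progress callbacks have drained.
 */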
4822 4821 static void
4823 4822 rib_stop_hca_services(rib_hca_t *hca)
4824 4823 {
4825 4824 rib_stop_services(hca);
4826 4825 rib_close_channels(&hca->cl_conn_list);
4827 4826 rib_close_channels(&hca->srv_conn_list);
4828 4827
4829 4828 rib_purge_connlist(&hca->cl_conn_list);
4830 4829 rib_purge_connlist(&hca->srv_conn_list);
4831 4830
4832 4831 if ((rib_stat->hcas_list == NULL) && stats_enabled) {
4833 4832 kstat_delete_byname_zone("unix", 0, "rpcib_cache",
4834 4833 GLOBAL_ZONEID);
4835 4834 stats_enabled = FALSE;
4836 4835 }
4837 4836
4838 4837 rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
4839 4838 rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
4840 4839 if (hca->srv_conn_list.conn_hd == NULL &&
4841 4840 hca->cl_conn_list.conn_hd == NULL) {
4842 4841 /*
4843 4842 * conn_lists are NULL, so destroy
4844 4843 * buffers, close hca and be done.
4845 4844 */
4846 4845 rib_free_hca(hca);
4847 4846 }
4848 4847 rw_exit(&hca->cl_conn_list.conn_lock);
4849 4848 rw_exit(&hca->srv_conn_list.conn_lock);
4850 4849
4851 4850 if (hca->hca_hdl != NULL) {
4852 4851 mutex_enter(&hca->inuse_lock);
4853 4852 while (hca->inuse)
4854 4853 cv_wait(&hca->cb_cv, &hca->inuse_lock);
4855 4854 mutex_exit(&hca->inuse_lock);
4856 4855
4857 4856 rib_free_hca(hca);
4858 4857 }
4859 4858 rw_destroy(&hca->bound_services_lock);
4860 4859
4861 4860 if (hca->cleanup_helper != NULL) {
4862 4861 ddi_taskq_destroy(hca->cleanup_helper);
4863 4862 hca->cleanup_helper = NULL;
4864 4863 }
4865 4864 }
4866 4865
4867 4866 /*
4868 4867  * Cleans up and closes all uses of the HCA
4869 4868 */
4870 4869 static void
4871 4870 rib_detach_hca(ibt_hca_hdl_t hca_hdl)
4872 4871 {
4873 4872 rib_hca_t *hca = NULL;
4874 4873 rib_hca_t **hcap;
4875 4874
4876 4875 rw_enter(&rib_stat->hcas_list_lock, RW_WRITER);
4877 4876 for (hcap = &rib_stat->hcas_list; *hcap; hcap = &(*hcap)->next) {
4878 4877 hca = *hcap;
4879 4878 rw_enter(&hca->state_lock, RW_WRITER);
4880 4879 if (hca->hca_hdl == hca_hdl) {
4881 4880 /*
4882 4881 * Mark as detached and remove from
4883 4882 * hca list.
4884 4883 */
4885 4884 hca->state = HCA_DETACHED;
4886 4885 *hcap = hca->next;
4887 4886 rib_stat->nhca_inited--;
4888 4887 rib_mod.rdma_count--;
4889 4888 rw_exit(&hca->state_lock);
4890 4889 break;
4891 4890 }
4892 4891 rw_exit(&hca->state_lock);
4893 4892 }
4894 4893 rw_exit(&rib_stat->hcas_list_lock);
4895 4894
4896 4895 if (hca == NULL)
4897 4896 return;
4898 4897 ASSERT(hca->hca_hdl == hca_hdl);
4899 4898
4900 4899 /*
4901 4900 * Stop all services on the HCA
4902 4901 * Go through cl_conn_list and close all rc_channels
4903 4902 * Go through svr_conn_list and close all rc_channels
4904 4903 * Free connections whose c_ref has dropped to 0
4905 4904 * Destroy all CQs
4906 4905 	 * Deregister and release all buffer pool memory after all
4907 4906 * connections are destroyed
4908 4907 * Free the protection domain
4909 4908 * ibt_close_hca()
4910 4909 */
4911 4910 rib_stop_hca_services(hca);
4912 4911
4913 4912 kmem_free(hca, sizeof (*hca));
4914 4913 }
4915 4914
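/*
 * Empty the server-side long reply cache: walk the AVL tree and, for every
 * node, deregister and free each cached buffer, then free the node itself.
 */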
4916 4915 static void
4917 4916 rib_server_side_cache_reclaim(void *argp)
4918 4917 {
4919 4918 cache_avl_struct_t *rcas;
4920 4919 rib_lrc_entry_t *rb;
4921 4920 rib_hca_t *hca = (rib_hca_t *)argp;
4922 4921
4923 4922 rw_enter(&hca->avl_rw_lock, RW_WRITER);
4924 4923 rcas = avl_first(&hca->avl_tree);
4925 4924 if (rcas != NULL)
4926 4925 avl_remove(&hca->avl_tree, rcas);
4927 4926
4928 4927 while (rcas != NULL) {
4929 4928 while (rcas->r.forw != &rcas->r) {
4930 4929 rcas->elements--;
4931 4930 rb = rcas->r.forw;
4932 4931 remque(rb);
4933 4932 if (rb->registered)
4934 4933 (void) rib_deregistermem_via_hca(hca,
4935 4934 rb->lrc_buf, rb->lrc_mhandle);
4936 4935
4937 4936 hca->cache_allocation -= rb->lrc_len;
4938 4937 kmem_free(rb->lrc_buf, rb->lrc_len);
4939 4938 kmem_free(rb, sizeof (rib_lrc_entry_t));
4940 4939 }
4941 4940 mutex_destroy(&rcas->node_lock);
4942 4941 kmem_cache_free(hca->server_side_cache, rcas);
4943 4942 rcas = avl_first(&hca->avl_tree);
4944 4943 if (rcas != NULL)
4945 4944 avl_remove(&hca->avl_tree, rcas);
4946 4945 }
4947 4946 rw_exit(&hca->avl_rw_lock);
4948 4947 }
4949 4948
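/*
 * Trim the server-side cache when it has grown past cache_limit, starting
 * with the largest entries (the tail of the AVL tree) and stopping as soon
 * as the cache drops back below the limit.
 */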
4950 4949 static void
4951 4950 rib_server_side_cache_cleanup(void *argp)
4952 4951 {
4953 4952 cache_avl_struct_t *rcas;
4954 4953 rib_lrc_entry_t *rb;
4955 4954 rib_hca_t *hca = (rib_hca_t *)argp;
4956 4955
4957 4956 mutex_enter(&hca->cache_allocation_lock);
4958 4957 if (hca->cache_allocation < cache_limit) {
4959 4958 mutex_exit(&hca->cache_allocation_lock);
4960 4959 return;
4961 4960 }
4962 4961 mutex_exit(&hca->cache_allocation_lock);
4963 4962
4964 4963 rw_enter(&hca->avl_rw_lock, RW_WRITER);
4965 4964 rcas = avl_last(&hca->avl_tree);
4966 4965 if (rcas != NULL)
4967 4966 avl_remove(&hca->avl_tree, rcas);
4968 4967
4969 4968 while (rcas != NULL) {
4970 4969 while (rcas->r.forw != &rcas->r) {
4971 4970 rcas->elements--;
4972 4971 rb = rcas->r.forw;
4973 4972 remque(rb);
4974 4973 if (rb->registered)
4975 4974 (void) rib_deregistermem_via_hca(hca,
4976 4975 rb->lrc_buf, rb->lrc_mhandle);
4977 4976
4978 4977 hca->cache_allocation -= rb->lrc_len;
4979 4978
4980 4979 kmem_free(rb->lrc_buf, rb->lrc_len);
4981 4980 kmem_free(rb, sizeof (rib_lrc_entry_t));
4982 4981 }
4983 4982 mutex_destroy(&rcas->node_lock);
4984 4983 if (hca->server_side_cache) {
4985 4984 kmem_cache_free(hca->server_side_cache, rcas);
4986 4985 }
4987 4986
4988 4987 if (hca->cache_allocation < cache_limit) {
4989 4988 rw_exit(&hca->avl_rw_lock);
4990 4989 return;
4991 4990 }
4992 4991
4993 4992 rcas = avl_last(&hca->avl_tree);
4994 4993 if (rcas != NULL)
4995 4994 avl_remove(&hca->avl_tree, rcas);
4996 4995 }
4997 4996 rw_exit(&hca->avl_rw_lock);
4998 4997 }
4999 4998
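/*
 * AVL comparison routine: cache nodes are ordered by buffer length.
 */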
5000 4999 static int
5001 5000 avl_compare(const void *t1, const void *t2)
5002 5001 {
5003 5002 if (((cache_avl_struct_t *)t1)->len == ((cache_avl_struct_t *)t2)->len)
5004 5003 return (0);
5005 5004
5006 5005 if (((cache_avl_struct_t *)t1)->len < ((cache_avl_struct_t *)t2)->len)
5007 5006 return (-1);
5008 5007
5009 5008 return (1);
5010 5009 }
5011 5010
5012 5011 static void
5013 5012 rib_destroy_cache(rib_hca_t *hca)
5014 5013 {
5015 5014 if (hca->avl_init) {
5016 5015 rib_server_side_cache_reclaim((void *)hca);
5017 5016 if (hca->server_side_cache) {
5018 5017 kmem_cache_destroy(hca->server_side_cache);
5019 5018 hca->server_side_cache = NULL;
5020 5019 }
5021 5020 avl_destroy(&hca->avl_tree);
5022 5021 mutex_destroy(&hca->cache_allocation_lock);
5023 5022 rw_destroy(&hca->avl_rw_lock);
5024 5023 }
5025 5024 hca->avl_init = FALSE;
5026 5025 }
5027 5026
5028 5027 static void
5029 5028 rib_force_cleanup(void *hca)
5030 5029 {
5031 5030 if (((rib_hca_t *)hca)->cleanup_helper != NULL)
5032 5031 (void) ddi_taskq_dispatch(
5033 5032 ((rib_hca_t *)hca)->cleanup_helper,
5034 5033 rib_server_side_cache_cleanup,
5035 5034 (void *)hca, DDI_NOSLEEP);
5036 5035 }
5037 5036
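/*
 * Return a pre-registered reply buffer of exactly 'len' bytes from the
 * server-side cache.  The cache is keyed by length in an AVL tree; on a
 * hit a buffer is taken from that node's free list, while a miss (or an
 * over-limit cache) falls back to allocating a fresh, unregistered buffer
 * and, when over the limit, also kicks off an asynchronous cache cleanup.
 */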
5038 5037 static rib_lrc_entry_t *
5039 5038 rib_get_cache_buf(CONN *conn, uint32_t len)
5040 5039 {
5041 5040 cache_avl_struct_t cas, *rcas;
5042 5041 rib_hca_t *hca = (ctoqp(conn))->hca;
5043 5042 rib_lrc_entry_t *reply_buf;
5044 5043 avl_index_t where = NULL;
5045 5044 uint64_t c_alloc = 0;
5046 5045
5047 5046 if (!hca->avl_init)
5048 5047 goto error_alloc;
5049 5048
5050 5049 cas.len = len;
5051 5050
5052 5051 rw_enter(&hca->avl_rw_lock, RW_READER);
5053 5052
5054 5053 mutex_enter(&hca->cache_allocation_lock);
5055 5054 c_alloc = hca->cache_allocation;
5056 5055 mutex_exit(&hca->cache_allocation_lock);
5057 5056
5058 5057 if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas,
5059 5058 &where)) == NULL) {
5060 5059 		/* Am I above the cache limit? */
5061 5060 if ((c_alloc + len) >= cache_limit) {
5062 5061 rib_force_cleanup((void *)hca);
5063 5062 rw_exit(&hca->avl_rw_lock);
5064 5063 mutex_enter(&hca->cache_allocation_lock);
5065 5064 hca->cache_misses_above_the_limit ++;
5066 5065 mutex_exit(&hca->cache_allocation_lock);
5067 5066
5068 5067 /* Allocate and register the buffer directly */
5069 5068 goto error_alloc;
5070 5069 }
5071 5070
5072 5071 rw_exit(&hca->avl_rw_lock);
5073 5072 rw_enter(&hca->avl_rw_lock, RW_WRITER);
5074 5073
5075 5074 /* Recheck to make sure no other thread added the entry in */
5076 5075 if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree,
5077 5076 &cas, &where)) == NULL) {
5078 5077 /* Allocate an avl tree entry */
5079 5078 rcas = (cache_avl_struct_t *)
5080 5079 kmem_cache_alloc(hca->server_side_cache, KM_SLEEP);
5081 5080
5082 5081 bzero(rcas, sizeof (cache_avl_struct_t));
5083 5082 rcas->elements = 0;
5084 5083 rcas->r.forw = &rcas->r;
5085 5084 rcas->r.back = &rcas->r;
5086 5085 rcas->len = len;
5087 5086 mutex_init(&rcas->node_lock, NULL, MUTEX_DEFAULT, NULL);
5088 5087 avl_insert(&hca->avl_tree, rcas, where);
5089 5088 }
5090 5089 }
5091 5090
5092 5091 mutex_enter(&rcas->node_lock);
5093 5092
5094 5093 if (rcas->r.forw != &rcas->r && rcas->elements > 0) {
5095 5094 reply_buf = rcas->r.forw;
5096 5095 remque(reply_buf);
5097 5096 rcas->elements--;
5098 5097 mutex_exit(&rcas->node_lock);
5099 5098 rw_exit(&hca->avl_rw_lock);
5100 5099
5101 5100 mutex_enter(&hca->cache_allocation_lock);
5102 5101 hca->cache_hits++;
5103 5102 hca->cache_allocation -= len;
5104 5103 mutex_exit(&hca->cache_allocation_lock);
5105 5104 } else {
5106 5105 		/* Am I above the cache limit? */
5107 5106 mutex_exit(&rcas->node_lock);
5108 5107 if ((c_alloc + len) >= cache_limit) {
5109 5108 rib_force_cleanup((void *)hca);
5110 5109 rw_exit(&hca->avl_rw_lock);
5111 5110
5112 5111 mutex_enter(&hca->cache_allocation_lock);
5113 5112 hca->cache_misses_above_the_limit++;
5114 5113 mutex_exit(&hca->cache_allocation_lock);
5115 5114 /* Allocate and register the buffer directly */
5116 5115 goto error_alloc;
5117 5116 }
5118 5117 rw_exit(&hca->avl_rw_lock);
5119 5118 mutex_enter(&hca->cache_allocation_lock);
5120 5119 hca->cache_misses++;
5121 5120 mutex_exit(&hca->cache_allocation_lock);
5122 5121 /* Allocate a reply_buf entry */
5123 5122 reply_buf = (rib_lrc_entry_t *)
5124 5123 kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
5125 5124 bzero(reply_buf, sizeof (rib_lrc_entry_t));
5126 5125 reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
5127 5126 reply_buf->lrc_len = len;
5128 5127 reply_buf->registered = FALSE;
5129 5128 reply_buf->avl_node = (void *)rcas;
5130 5129 }
5131 5130
5132 5131 return (reply_buf);
5133 5132
5134 5133 error_alloc:
5135 5134 reply_buf = (rib_lrc_entry_t *)
5136 5135 kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
5137 5136 bzero(reply_buf, sizeof (rib_lrc_entry_t));
5138 5137 reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
5139 5138 reply_buf->lrc_len = len;
5140 5139 reply_buf->registered = FALSE;
5141 5140 reply_buf->avl_node = NULL;
5142 5141
5143 5142 return (reply_buf);
5144 5143 }
5145 5144
5146 5145 /*
5147 5146  * Return a pre-registered buffer back to the cache (without
5148 5147  * unregistering the buffer).
5149 5148 */
5150 5149
5151 5150 static void
5152 5151 rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf)
5153 5152 {
5154 5153 cache_avl_struct_t cas, *rcas;
5155 5154 avl_index_t where = NULL;
5156 5155 rib_hca_t *hca = (ctoqp(conn))->hca;
5157 5156
5158 5157 if (!hca->avl_init)
5159 5158 goto error_free;
5160 5159
5161 5160 cas.len = reg_buf->lrc_len;
5162 5161 rw_enter(&hca->avl_rw_lock, RW_READER);
5163 5162 if ((rcas = (cache_avl_struct_t *)
5164 5163 avl_find(&hca->avl_tree, &cas, &where)) == NULL) {
5165 5164 rw_exit(&hca->avl_rw_lock);
5166 5165 goto error_free;
5167 5166 } else {
5168 5167 cas.len = reg_buf->lrc_len;
5169 5168 mutex_enter(&rcas->node_lock);
5170 5169 insque(reg_buf, &rcas->r);
5171 5170 rcas->elements ++;
5172 5171 mutex_exit(&rcas->node_lock);
5173 5172 rw_exit(&hca->avl_rw_lock);
5174 5173 mutex_enter(&hca->cache_allocation_lock);
5175 5174 hca->cache_allocation += cas.len;
5176 5175 mutex_exit(&hca->cache_allocation_lock);
5177 5176 }
5178 5177
5179 5178 return;
5180 5179
5181 5180 error_free:
5182 5181
5183 5182 if (reg_buf->registered)
5184 5183 (void) rib_deregistermem_via_hca(hca,
5185 5184 reg_buf->lrc_buf, reg_buf->lrc_mhandle);
5186 5185 kmem_free(reg_buf->lrc_buf, reg_buf->lrc_len);
5187 5186 kmem_free(reg_buf, sizeof (rib_lrc_entry_t));
5188 5187 }
5189 5188
5190 5189 static rdma_stat
5191 5190 rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf,
5192 5191 uint_t buflen, struct mrc *buf_handle)
5193 5192 {
5194 5193 ibt_mr_hdl_t mr_hdl = NULL; /* memory region handle */
5195 5194 ibt_mr_desc_t mr_desc; /* vaddr, lkey, rkey */
5196 5195 rdma_stat status;
5197 5196
5198 5197
5199 5198 /*
5200 5199 * Note: ALL buffer pools use the same memory type RDMARW.
5201 5200 */
5202 5201 status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
5203 5202 if (status == RDMA_SUCCESS) {
5204 5203 buf_handle->mrc_linfo = (uint64_t)(uintptr_t)mr_hdl;
5205 5204 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
5206 5205 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
5207 5206 } else {
5208 5207 buf_handle->mrc_linfo = NULL;
5209 5208 buf_handle->mrc_lmr = 0;
5210 5209 buf_handle->mrc_rmr = 0;
5211 5210 }
5212 5211 return (status);
5213 5212 }
5214 5213
5215 5214 /* ARGSUSED */
5216 5215 static rdma_stat
5217 5216 rib_deregistermemsync_via_hca(rib_hca_t *hca, caddr_t buf,
5218 5217 struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle)
5219 5218 {
5220 5219
5221 5220 (void) rib_deregistermem_via_hca(hca, buf, buf_handle);
5222 5221 return (RDMA_SUCCESS);
5223 5222 }
5224 5223
5225 5224 /* ARGSUSED */
5226 5225 static rdma_stat
5227 5226 rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle)
5228 5227 {
5229 5228
5230 5229 (void) ibt_deregister_mr(hca->hca_hdl,
5231 5230 (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
5232 5231 return (RDMA_SUCCESS);
5233 5232 }
5234 5233
5235 5234 /*
5236 5235 * Check if the IP interface named by `lifrp' is RDMA-capable.
5237 5236 */
5238 5237 static boolean_t
5239 5238 rpcib_rdma_capable_interface(struct lifreq *lifrp)
5240 5239 {
5241 5240 char ifname[LIFNAMSIZ];
5242 5241 char *cp;
5243 5242
5244 5243 if (lifrp->lifr_type == IFT_IB)
5245 5244 return (B_TRUE);
5246 5245
5247 5246 /*
5248 5247 * Strip off the logical interface portion before getting
5249 5248 * intimate with the name.
5250 5249 */
5251 5250 (void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ);
5252 5251 if ((cp = strchr(ifname, ':')) != NULL)
5253 5252 *cp = '\0';
5254 5253
5255 5254 return (strcmp("lo0", ifname) == 0);
5256 5255 }
5257 5256
5258 5257 static int
5259 5258 rpcib_do_ip_ioctl(int cmd, int len, void *arg)
5260 5259 {
5261 5260 vnode_t *kkvp, *vp;
5262 5261 TIUSER *tiptr;
5263 5262 struct strioctl iocb;
5264 5263 k_sigset_t smask;
5265 5264 int err = 0;
5266 5265
5267 5266 if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kkvp) == 0) {
5268 5267 if (t_kopen(NULL, kkvp->v_rdev, FREAD|FWRITE,
5269 5268 &tiptr, CRED()) == 0) {
5270 5269 vp = tiptr->fp->f_vnode;
5271 5270 } else {
5272 5271 VN_RELE(kkvp);
5273 5272 return (EPROTO);
5274 5273 }
5275 5274 } else {
5276 5275 return (EPROTO);
5277 5276 }
5278 5277
5279 5278 iocb.ic_cmd = cmd;
5280 5279 iocb.ic_timout = 0;
5281 5280 iocb.ic_len = len;
5282 5281 iocb.ic_dp = (caddr_t)arg;
5283 5282 sigintr(&smask, 0);
5284 5283 err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
5285 5284 sigunintr(&smask);
5286 5285 (void) t_kclose(tiptr, 0);
5287 5286 VN_RELE(kkvp);
5288 5287 return (err);
5289 5288 }
5290 5289
5291 5290 /*
5292 5291 * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'.
5293 5292 * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes.
5294 5293 */
5295 5294 static int
5296 5295 rpcib_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep)
5297 5296 {
5298 5297 int err;
5299 5298 struct lifnum lifn;
5300 5299
5301 5300 bzero(&lifn, sizeof (struct lifnum));
5302 5301 lifn.lifn_family = AF_UNSPEC;
5303 5302
5304 5303 err = rpcib_do_ip_ioctl(SIOCGLIFNUM, sizeof (struct lifnum), &lifn);
5305 5304 if (err != 0)
5306 5305 return (err);
5307 5306
5308 5307 /*
5309 5308 * Pad the interface count to account for additional interfaces that
5310 5309 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF.
5311 5310 */
5312 5311 lifn.lifn_count += 4;
5313 5312
5314 5313 bzero(lifcp, sizeof (struct lifconf));
5315 5314 lifcp->lifc_family = AF_UNSPEC;
5316 5315 lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq);
5317 5316 lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP);
5318 5317
5319 5318 err = rpcib_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp);
5320 5319 if (err != 0) {
5321 5320 kmem_free(lifcp->lifc_buf, *bufsizep);
5322 5321 return (err);
5323 5322 }
5324 5323 return (0);
5325 5324 }
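rpcib_do_lifconf() follows the usual two-step SIOCGLIFNUM / SIOCGLIFCONF pattern, padding the interface count so the buffer still fits if interfaces are plumbed between the two ioctls. A hedged userland equivalent (illustrative only; it uses a socket rather than the kernel t_kopen()/kstr_ioctl() path above):

/*
 * Userland sketch of the SIOCGLIFNUM + SIOCGLIFCONF pattern.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <net/if.h>
#include <stdlib.h>
#include <strings.h>
#include <unistd.h>

static int
get_lifconf(int s, struct lifconf *lifcp)
{
	struct lifnum lifn;

	bzero(&lifn, sizeof (lifn));
	lifn.lifn_family = AF_UNSPEC;
	if (ioctl(s, SIOCGLIFNUM, &lifn) < 0)
		return (-1);

	/* Pad for interfaces plumbed between the two ioctls. */
	lifn.lifn_count += 4;

	bzero(lifcp, sizeof (*lifcp));
	lifcp->lifc_family = AF_UNSPEC;
	lifcp->lifc_len = lifn.lifn_count * sizeof (struct lifreq);
	if ((lifcp->lifc_buf = malloc(lifcp->lifc_len)) == NULL)
		return (-1);

	if (ioctl(s, SIOCGLIFCONF, lifcp) < 0) {
		free(lifcp->lifc_buf);
		return (-1);
	}
	return (0);
}

A caller would open the socket with socket(AF_INET, SOCK_DGRAM, 0), call get_lifconf(), walk lifc_req for lifc_len / sizeof (struct lifreq) entries, and free lifc_buf when done.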
5326 5325
5327 5326 static boolean_t
5328 5327 rpcib_get_ib_addresses(rpcib_ipaddrs_t *addrs4, rpcib_ipaddrs_t *addrs6)
5329 5328 {
5330 5329 uint_t i, nifs;
5331 5330 uint_t bufsize;
5332 5331 struct lifconf lifc;
5333 5332 struct lifreq *lifrp;
5334 5333 struct sockaddr_in *sinp;
5335 5334 struct sockaddr_in6 *sin6p;
5336 5335
5337 5336 bzero(addrs4, sizeof (rpcib_ipaddrs_t));
5338 5337 bzero(addrs6, sizeof (rpcib_ipaddrs_t));
5339 5338
5340 5339 if (rpcib_do_lifconf(&lifc, &bufsize) != 0)
5341 5340 return (B_FALSE);
5342 5341
5343 5342 if ((nifs = lifc.lifc_len / sizeof (struct lifreq)) == 0) {
5344 5343 kmem_free(lifc.lifc_buf, bufsize);
5345 5344 return (B_FALSE);
5346 5345 }
5347 5346
5348 5347 /*
5349 5348 * Worst case is that all of the addresses are IB-capable and have
5350 5349 * the same address family, so size our buffers accordingly.
5351 5350 */
5352 5351 addrs4->ri_size = nifs * sizeof (struct sockaddr_in);
5353 5352 addrs4->ri_list = kmem_zalloc(addrs4->ri_size, KM_SLEEP);
5354 5353 addrs6->ri_size = nifs * sizeof (struct sockaddr_in6);
5355 5354 addrs6->ri_list = kmem_zalloc(addrs6->ri_size, KM_SLEEP);
5356 5355
5357 5356 for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) {
5358 5357 if (!rpcib_rdma_capable_interface(lifrp))
5359 5358 continue;
5360 5359
5361 5360 if (lifrp->lifr_addr.ss_family == AF_INET) {
5362 5361 sinp = addrs4->ri_list;
5363 5362 bcopy(&lifrp->lifr_addr, &sinp[addrs4->ri_count++],
5364 5363 sizeof (struct sockaddr_in));
5365 5364 } else if (lifrp->lifr_addr.ss_family == AF_INET6) {
5366 5365 sin6p = addrs6->ri_list;
5367 5366 bcopy(&lifrp->lifr_addr, &sin6p[addrs6->ri_count++],
5368 5367 sizeof (struct sockaddr_in6));
5369 5368 }
5370 5369 }
5371 5370
5372 5371 kmem_free(lifc.lifc_buf, bufsize);
5373 5372 return (B_TRUE);
5374 5373 }
5375 5374
5376 5375 /* ARGSUSED */
5377 5376 static int
5378 5377 rpcib_cache_kstat_update(kstat_t *ksp, int rw)
5379 5378 {
5380 5379 rib_hca_t *hca;
5381 5380
5382 5381 if (KSTAT_WRITE == rw) {
5383 5382 return (EACCES);
5384 5383 }
5385 5384
5386 5385 rpcib_kstat.cache_limit.value.ui64 =
5387 5386 (uint64_t)cache_limit;
5388 5387 rw_enter(&rib_stat->hcas_list_lock, RW_READER);
5389 5388 for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
5390 5389 rpcib_kstat.cache_allocation.value.ui64 +=
5391 5390 (uint64_t)hca->cache_allocation;
5392 5391 rpcib_kstat.cache_hits.value.ui64 +=
5393 5392 (uint64_t)hca->cache_hits;
5394 5393 rpcib_kstat.cache_misses.value.ui64 +=
5395 5394 (uint64_t)hca->cache_misses;
5396 5395 rpcib_kstat.cache_misses_above_the_limit.value.ui64 +=
5397 5396 (uint64_t)hca->cache_misses_above_the_limit;
5398 5397 }
5399 5398 rw_exit(&rib_stat->hcas_list_lock);
5400 5399 return (0);
5401 5400 }
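The update routine above is read-only (writes return EACCES) and re-aggregates the per-HCA counters on every read. A userland consumer could fetch the values through libkstat(3LIB); note that the module/instance/name triple and the statistic name used below are placeholders, since the kstat_create() call is in a portion of rpcib.c elided from this view.

/*
 * Hedged libkstat sketch; link with -lkstat.  The lookup strings are
 * assumptions for illustration only.
 */
#include <kstat.h>
#include <stdio.h>

int
main(void)
{
	kstat_ctl_t *kc;
	kstat_t *ksp;
	kstat_named_t *kn;

	if ((kc = kstat_open()) == NULL)
		return (1);

	/* Placeholder module/instance/name; confirm against the source. */
	if ((ksp = kstat_lookup(kc, "unix", 0, "rpcib_cache")) != NULL &&
	    kstat_read(kc, ksp, NULL) != -1 &&
	    (kn = kstat_data_lookup(ksp, "cache_hits")) != NULL)
		(void) printf("cache_hits = %llu\n",
		    (unsigned long long)kn->value.ui64);

	(void) kstat_close(kc);
	return (0);
}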
5202 lines elided