1 /*
2 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
3 */
4
5 /*
6 * This file contains code imported from the OFED rds source file ib.c
7 * Oracle elects to have and use the contents of ib.c under and governed
8 * by the OpenIB.org BSD license (see below for full license text). However,
9 * the following notice accompanied the original version of this file:
10 */
11
12 /*
13 * Copyright (c) 2006 Oracle. All rights reserved.
14 *
15 * This software is available to you under a choice of one of two
16 * licenses. You may choose to be licensed under the terms of the GNU
17 * General Public License (GPL) Version 2, available from the file
18 * COPYING in the main directory of this source tree, or the
19 * OpenIB.org BSD license below:
20 *
21 * Redistribution and use in source and binary forms, with or
22 * without modification, are permitted provided that the following
23 * conditions are met:
24 *
25 * - Redistributions of source code must retain the above
26 * copyright notice, this list of conditions and the following
27 * disclaimer.
28 *
29 * - Redistributions in binary form must reproduce the above
30 * copyright notice, this list of conditions and the following
31 * disclaimer in the documentation and/or other materials
32 * provided with the distribution.
33 *
34 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
35 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
36 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
37 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
38 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
39 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
40 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
41 * SOFTWARE.
42 *
43 */
44 #include <sys/sysmacros.h>
45 #include <sys/rds.h>
46
47 #include <sys/ib/ibtl/ibti.h>
48 #include <sys/ib/clients/rdsv3/rdsv3.h>
49 #include <sys/ib/clients/rdsv3/ib.h>
50 #include <sys/ib/clients/rdsv3/rdsv3_debug.h>
51
52 unsigned int rdsv3_ib_retry_count = RDSV3_IB_DEFAULT_RETRY_COUNT;
53
54 struct list rdsv3_ib_devices;
55
56 /* NOTE: if also grabbing ibdev lock, grab this first */
57 kmutex_t ib_nodev_conns_lock;
58 list_t ib_nodev_conns;
59
60 extern int rdsv3_ib_frag_constructor(void *buf, void *arg, int kmflags);
61 extern void rdsv3_ib_frag_destructor(void *buf, void *arg);
62
63 void
64 rdsv3_ib_add_one(ib_device_t *device)
65 {
66 struct rdsv3_ib_device *rds_ibdev;
67 ibt_hca_attr_t *dev_attr;
68 char name[64];
69
70 RDSV3_DPRINTF2("rdsv3_ib_add_one", "device: %p", device);
71
72 /* Only handle IB (no iWARP) devices */
73 if (device->node_type != RDMA_NODE_IB_CA)
74 return;
75
76 dev_attr = (ibt_hca_attr_t *)kmem_alloc(sizeof (*dev_attr),
77 KM_NOSLEEP);
78 if (!dev_attr)
79 return;
80
81 if (ibt_query_hca(ib_get_ibt_hca_hdl(device), dev_attr)) {
82 RDSV3_DPRINTF2("rdsv3_ib_add_one",
83 "Query device failed for %s", device->name);
84 goto free_attr;
85 }
86
87 /* We depend on Reserved Lkey */
88 if (!(dev_attr->hca_flags2 & IBT_HCA2_RES_LKEY)) {
89 RDSV3_DPRINTF2("rdsv3_ib_add_one",
90 "Reserved Lkey support is required: %s",
91 device->name);
92 goto free_attr;
93 }
94
95 rds_ibdev = kmem_zalloc(sizeof (*rds_ibdev), KM_NOSLEEP);
96 if (!rds_ibdev)
97 goto free_attr;
98
99 rds_ibdev->ibt_hca_hdl = ib_get_ibt_hca_hdl(device);
100 rds_ibdev->hca_attr = *dev_attr;
101
102 rw_init(&rds_ibdev->rwlock, NULL, RW_DRIVER, NULL);
103 mutex_init(&rds_ibdev->spinlock, NULL, MUTEX_DRIVER, NULL);
104
105 rds_ibdev->max_wrs = dev_attr->hca_max_chan_sz;
106 rds_ibdev->max_sge = min(dev_attr->hca_max_sgl, RDSV3_IB_MAX_SGE);
107
108 rds_ibdev->max_initiator_depth = (uint_t)dev_attr->hca_max_rdma_in_qp;
109 rds_ibdev->max_responder_resources =
110 (uint_t)dev_attr->hca_max_rdma_in_qp;
111
112 rds_ibdev->dev = device;
113 rds_ibdev->pd = ib_alloc_pd(device);
114 if (IS_ERR(rds_ibdev->pd))
115 goto free_dev;
116
117 if (rdsv3_ib_create_mr_pool(rds_ibdev) != 0) {
118 goto free_dev;
119 }
120
121 if (rdsv3_ib_create_inc_pool(rds_ibdev) != 0) {
122 rdsv3_ib_destroy_mr_pool(rds_ibdev);
123 goto free_dev;
124 }
125
126 (void) snprintf(name, 64, "RDSV3_IB_FRAG_%llx",
127 (longlong_t)htonll(dev_attr->hca_node_guid));
128 rds_ibdev->ib_frag_slab = kmem_cache_create(name,
129 sizeof (struct rdsv3_page_frag), 0, rdsv3_ib_frag_constructor,
130 rdsv3_ib_frag_destructor, NULL, (void *)rds_ibdev, NULL, 0);
131 if (rds_ibdev->ib_frag_slab == NULL) {
132 RDSV3_DPRINTF2("rdsv3_ib_add_one",
133 "kmem_cache_create for ib_frag_slab failed for device: %s",
134 device->name);
135 rdsv3_ib_destroy_mr_pool(rds_ibdev);
136 rdsv3_ib_destroy_inc_pool(rds_ibdev);
137 goto free_dev;
138 }
139
140 rds_ibdev->aft_hcagp = rdsv3_af_grp_create(rds_ibdev->ibt_hca_hdl,
141 (uint64_t)rds_ibdev->hca_attr.hca_node_guid);
142 if (rds_ibdev->aft_hcagp == NULL) {
143 rdsv3_ib_destroy_mr_pool(rds_ibdev);
144 rdsv3_ib_destroy_inc_pool(rds_ibdev);
145 kmem_cache_destroy(rds_ibdev->ib_frag_slab);
146 goto free_dev;
147 }
148 rds_ibdev->fmr_soft_cq = rdsv3_af_thr_create(rdsv3_ib_drain_mrlist_fn,
149 (void *)rds_ibdev->fmr_pool, SCQ_HCA_BIND_CPU,
150 rds_ibdev->aft_hcagp);
151 if (rds_ibdev->fmr_soft_cq == NULL) {
152 rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp);
153 rdsv3_ib_destroy_mr_pool(rds_ibdev);
154 rdsv3_ib_destroy_inc_pool(rds_ibdev);
155 kmem_cache_destroy(rds_ibdev->ib_frag_slab);
156 goto free_dev;
157 }
158
159 rds_ibdev->inc_soft_cq = rdsv3_af_thr_create(rdsv3_ib_drain_inclist,
160 (void *)rds_ibdev->inc_pool, SCQ_HCA_BIND_CPU,
161 rds_ibdev->aft_hcagp);
162 if (rds_ibdev->inc_soft_cq == NULL) {
163 rdsv3_af_thr_destroy(rds_ibdev->fmr_soft_cq);
164 rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp);
165 rdsv3_ib_destroy_mr_pool(rds_ibdev);
166 rdsv3_ib_destroy_inc_pool(rds_ibdev);
167 kmem_cache_destroy(rds_ibdev->ib_frag_slab);
168 goto free_dev;
169 }
170
171 list_create(&rds_ibdev->ipaddr_list, sizeof (struct rdsv3_ib_ipaddr),
172 offsetof(struct rdsv3_ib_ipaddr, list));
173 list_create(&rds_ibdev->conn_list, sizeof (struct rdsv3_ib_connection),
174 offsetof(struct rdsv3_ib_connection, ib_node));
175
176 list_insert_tail(&rdsv3_ib_devices, rds_ibdev);
177
178 ib_set_client_data(device, &rdsv3_ib_client, rds_ibdev);
179
180 RDSV3_DPRINTF2("rdsv3_ib_add_one", "Return: device: %p", device);
181
182 goto free_attr;
183
184 err_pd:
185 (void) ib_dealloc_pd(rds_ibdev->pd);
186 free_dev:
187 mutex_destroy(&rds_ibdev->spinlock);
188 rw_destroy(&rds_ibdev->rwlock);
189 kmem_free(rds_ibdev, sizeof (*rds_ibdev));
190 free_attr:
191 kmem_free(dev_attr, sizeof (*dev_attr));
192 }
193
194 void
195 rdsv3_ib_remove_one(struct ib_device *device)
196 {
197 struct rdsv3_ib_device *rds_ibdev;
198 struct rdsv3_ib_ipaddr *i_ipaddr, *i_next;
199
200 RDSV3_DPRINTF2("rdsv3_ib_remove_one", "device: %p", device);
201
202 rds_ibdev = ib_get_client_data(device, &rdsv3_ib_client);
203 if (!rds_ibdev)
204 return;
205
206 RDSV3_FOR_EACH_LIST_NODE_SAFE(i_ipaddr, i_next, &rds_ibdev->ipaddr_list,
207 list) {
208 list_remove_node(&i_ipaddr->list);
209 kmem_free(i_ipaddr, sizeof (*i_ipaddr));
210 }
211
212 rdsv3_ib_destroy_conns(rds_ibdev);
213
214 if (rds_ibdev->fmr_soft_cq)
215 rdsv3_af_thr_destroy(rds_ibdev->fmr_soft_cq);
216 if (rds_ibdev->inc_soft_cq)
217 rdsv3_af_thr_destroy(rds_ibdev->inc_soft_cq);
218
219 rdsv3_ib_destroy_mr_pool(rds_ibdev);
220 rdsv3_ib_destroy_inc_pool(rds_ibdev);
221
222 kmem_cache_destroy(rds_ibdev->ib_frag_slab);
223
224 rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp);
225
226 #if 0
227 while (ib_dealloc_pd(rds_ibdev->pd)) {
228 RDSV3_DPRINTF5("rdsv3_ib_remove_one",
229 "%s-%d Failed to dealloc pd %p",
230 __func__, __LINE__, rds_ibdev->pd);
231 delay(drv_usectohz(1000));
232 }
233 #else
234 if (ib_dealloc_pd(rds_ibdev->pd)) {
235 RDSV3_DPRINTF2("rdsv3_ib_remove_one",
236 "Failed to dealloc pd %p\n", rds_ibdev->pd);
237 }
238 #endif
239
240 list_destroy(&rds_ibdev->ipaddr_list);
241 list_destroy(&rds_ibdev->conn_list);
242 list_remove_node(&rds_ibdev->list);
243 mutex_destroy(&rds_ibdev->spinlock);
244 rw_destroy(&rds_ibdev->rwlock);
245 kmem_free(rds_ibdev, sizeof (*rds_ibdev));
246
247 RDSV3_DPRINTF2("rdsv3_ib_remove_one", "Return: device: %p", device);
248 }
249
250 struct ib_client rdsv3_ib_client = {
251 .name = "rdsv3_ib",
252 .add = rdsv3_ib_add_one,
253 .remove = rdsv3_ib_remove_one,
254 .clnt_hdl = NULL,
255 .state = IB_CLNT_UNINITIALIZED
256 };
257
258 static int
259 rds_ib_conn_info_visitor(struct rdsv3_connection *conn,
260 void *buffer)
261 {
262 struct rds_info_rdma_connection *iinfo = buffer;
263 struct rdsv3_ib_connection *ic;
264
265 RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p",
266 conn, buffer);
267
268 /* We will only ever look at IB transports */
269 if (conn->c_trans != &rdsv3_ib_transport)
270 return (0);
271
272 iinfo->src_addr = conn->c_laddr;
273 iinfo->dst_addr = conn->c_faddr;
274
275 (void) memset(&iinfo->src_gid, 0, sizeof (iinfo->src_gid));
276 (void) memset(&iinfo->dst_gid, 0, sizeof (iinfo->dst_gid));
277 if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) {
278 struct rdsv3_ib_device *rds_ibdev;
279 struct rdma_dev_addr *dev_addr;
280
281 ic = conn->c_transport_data;
282 dev_addr = &ic->i_cm_id->route.addr.dev_addr;
283
284 ib_addr_get_sgid(dev_addr, (union ib_gid *)&iinfo->src_gid);
285 ib_addr_get_dgid(dev_addr, (union ib_gid *)&iinfo->dst_gid);
286
287 rds_ibdev = ib_get_client_data(ic->i_cm_id->device,
288 &rdsv3_ib_client);
289 iinfo->max_send_wr = ic->i_send_ring.w_nr;
290 iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
291 iinfo->max_send_sge = rds_ibdev->max_sge;
292 }
293
294 RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p",
295 conn, buffer);
296 return (1);
297 }
298
299 static void
300 rds_ib_ic_info(struct rsock *sock, unsigned int len,
301 struct rdsv3_info_iterator *iter,
302 struct rdsv3_info_lengths *lens)
303 {
304 RDSV3_DPRINTF4("rds_ib_ic_info", "sk: %p iter: %p, lens: %p, len: %d",
305 sock, iter, lens, len);
306
307 rdsv3_for_each_conn_info(sock, len, iter, lens,
308 rds_ib_conn_info_visitor,
309 sizeof (struct rds_info_rdma_connection));
310 }
311
312 /*
313 * Early RDS/IB was built to only bind to an address if there is an IPoIB
314 * device with that address set.
315 *
316 * If it were me, I'd advocate for something more flexible. Sending and
317 * receiving should be device-agnostic. Transports would try and maintain
318 * connections between peers who have messages queued. Userspace would be
319 * allowed to influence which paths have priority. We could call userspace
320 * asserting this policy "routing".
321 */
322 static int
323 rds_ib_laddr_check(uint32_be_t addr)
324 {
325 int ret;
326 struct rdma_cm_id *cm_id;
327 struct sockaddr_in sin;
328
329 RDSV3_DPRINTF4("rds_ib_laddr_check", "addr: %x", ntohl(addr));
330
331 /*
332 * Create a CMA ID and try to bind it. This catches both
333 * IB and iWARP capable NICs.
334 */
335 cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP);
336 if (!cm_id)
337 return (-EADDRNOTAVAIL);
338
339 (void) memset(&sin, 0, sizeof (sin));
340 sin.sin_family = AF_INET;
341 sin.sin_addr.s_addr = rdsv3_scaddr_to_ibaddr(addr);
342
343 /* rdma_bind_addr will only succeed for IB & iWARP devices */
344 ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
345 /*
346 * due to this, we will claim to support iWARP devices unless we
347 * check node_type.
348 */
349 if (ret || cm_id->device->node_type != RDMA_NODE_IB_CA)
350 ret = -EADDRNOTAVAIL;
351
352 RDSV3_DPRINTF5("rds_ib_laddr_check",
353 "addr %u.%u.%u.%u ret %d node type %d",
354 NIPQUAD(addr), ret,
355 cm_id->device ? cm_id->device->node_type : -1);
356
357 rdma_destroy_id(cm_id);
358
359 return (ret);
360 }
361
362 void
363 rdsv3_ib_exit(void)
364 {
365 RDSV3_DPRINTF4("rds_ib_exit", "Enter");
366
367 rdsv3_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
368 rdsv3_ib_destroy_nodev_conns();
369 ib_unregister_client(&rdsv3_ib_client);
370 rdsv3_ib_sysctl_exit();
371 rdsv3_ib_recv_exit();
372 rdsv3_trans_unregister(&rdsv3_ib_transport);
373 kmem_free(rdsv3_ib_stats,
374 nr_cpus * sizeof (struct rdsv3_ib_statistics));
375 mutex_destroy(&ib_nodev_conns_lock);
376 list_destroy(&ib_nodev_conns);
377 list_destroy(&rdsv3_ib_devices);
378
379 RDSV3_DPRINTF4("rds_ib_exit", "Return");
380 }
381
382 struct rdsv3_transport rdsv3_ib_transport = {
383 .laddr_check = rds_ib_laddr_check,
384 .xmit_complete = rdsv3_ib_xmit_complete,
385 .xmit = rdsv3_ib_xmit,
386 .xmit_cong_map = NULL,
387 .xmit_rdma = rdsv3_ib_xmit_rdma,
388 .recv = rdsv3_ib_recv,
389 .conn_alloc = rdsv3_ib_conn_alloc,
390 .conn_free = rdsv3_ib_conn_free,
391 .conn_connect = rdsv3_ib_conn_connect,
392 .conn_shutdown = rdsv3_ib_conn_shutdown,
393 .inc_copy_to_user = rdsv3_ib_inc_copy_to_user,
394 .inc_free = rdsv3_ib_inc_free,
395 .cm_initiate_connect = rdsv3_ib_cm_initiate_connect,
396 .cm_handle_connect = rdsv3_ib_cm_handle_connect,
397 .cm_connect_complete = rdsv3_ib_cm_connect_complete,
398 .stats_info_copy = rdsv3_ib_stats_info_copy,
399 .exit = rdsv3_ib_exit,
400 .get_mr = rdsv3_ib_get_mr,
401 .sync_mr = rdsv3_ib_sync_mr,
402 .free_mr = rdsv3_ib_free_mr,
403 .flush_mrs = rdsv3_ib_flush_mrs,
404 .t_name = "infiniband",
405 .t_type = RDS_TRANS_IB
406 };
407
408 int
409 rdsv3_ib_init(void)
410 {
411 int ret;
412
413 RDSV3_DPRINTF4("rds_ib_init", "Enter");
414
415 list_create(&rdsv3_ib_devices, sizeof (struct rdsv3_ib_device),
416 offsetof(struct rdsv3_ib_device, list));
417 list_create(&ib_nodev_conns, sizeof (struct rdsv3_ib_connection),
418 offsetof(struct rdsv3_ib_connection, ib_node));
419 mutex_init(&ib_nodev_conns_lock, NULL, MUTEX_DRIVER, NULL);
420
421 /* allocate space for ib statistics */
422 ASSERT(rdsv3_ib_stats == NULL);
423 rdsv3_ib_stats = kmem_zalloc(nr_cpus *
424 sizeof (struct rdsv3_ib_statistics), KM_SLEEP);
425
426 rdsv3_ib_client.dip = rdsv3_dev_info;
427 ret = ib_register_client(&rdsv3_ib_client);
428 if (ret)
429 goto out;
430
431 ret = rdsv3_ib_sysctl_init();
432 if (ret)
433 goto out_ibreg;
434
435 ret = rdsv3_ib_recv_init();
436 if (ret)
437 goto out_sysctl;
438
439 ret = rdsv3_trans_register(&rdsv3_ib_transport);
440 if (ret)
441 goto out_recv;
442
443 rdsv3_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
444
445 RDSV3_DPRINTF4("rds_ib_init", "Return");
446
447 return (0);
448
449 out_recv:
450 rdsv3_ib_recv_exit();
451 out_sysctl:
452 rdsv3_ib_sysctl_exit();
453 out_ibreg:
454 ib_unregister_client(&rdsv3_ib_client);
455 out:
456 kmem_free(rdsv3_ib_stats,
457 nr_cpus * sizeof (struct rdsv3_ib_statistics));
458 mutex_destroy(&ib_nodev_conns_lock);
459 list_destroy(&ib_nodev_conns);
460 list_destroy(&rdsv3_ib_devices);
461 return (ret);
462 }