1 /* 2 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 3 */ 4 5 /* 6 * This file contains code imported from the OFED rds source file ib.c 7 * Oracle elects to have and use the contents of ib.c under and governed 8 * by the OpenIB.org BSD license (see below for full license text). However, 9 * the following notice accompanied the original version of this file: 10 */ 11 12 /* 13 * Copyright (c) 2006 Oracle. All rights reserved. 14 * 15 * This software is available to you under a choice of one of two 16 * licenses. You may choose to be licensed under the terms of the GNU 17 * General Public License (GPL) Version 2, available from the file 18 * COPYING in the main directory of this source tree, or the 19 * OpenIB.org BSD license below: 20 * 21 * Redistribution and use in source and binary forms, with or 22 * without modification, are permitted provided that the following 23 * conditions are met: 24 * 25 * - Redistributions of source code must retain the above 26 * copyright notice, this list of conditions and the following 27 * disclaimer. 28 * 29 * - Redistributions in binary form must reproduce the above 30 * copyright notice, this list of conditions and the following 31 * disclaimer in the documentation and/or other materials 32 * provided with the distribution. 33 * 34 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 35 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 36 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 37 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 38 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 39 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 40 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 41 * SOFTWARE. 42 * 43 */ 44 #include <sys/sysmacros.h> 45 #include <sys/rds.h> 46 47 #include <sys/ib/ibtl/ibti.h> 48 #include <sys/ib/clients/rdsv3/rdsv3.h> 49 #include <sys/ib/clients/rdsv3/ib.h> 50 #include <sys/ib/clients/rdsv3/rdsv3_debug.h> 51 52 unsigned int rdsv3_ib_retry_count = RDSV3_IB_DEFAULT_RETRY_COUNT; 53 54 struct list rdsv3_ib_devices; 55 56 /* NOTE: if also grabbing ibdev lock, grab this first */ 57 kmutex_t ib_nodev_conns_lock; 58 list_t ib_nodev_conns; 59 60 extern int rdsv3_ib_frag_constructor(void *buf, void *arg, int kmflags); 61 extern void rdsv3_ib_frag_destructor(void *buf, void *arg); 62 63 void 64 rdsv3_ib_add_one(ib_device_t *device) 65 { 66 struct rdsv3_ib_device *rds_ibdev; 67 ibt_hca_attr_t *dev_attr; 68 char name[64]; 69 70 RDSV3_DPRINTF2("rdsv3_ib_add_one", "device: %p", device); 71 72 /* Only handle IB (no iWARP) devices */ 73 if (device->node_type != RDMA_NODE_IB_CA) 74 return; 75 76 dev_attr = (ibt_hca_attr_t *)kmem_alloc(sizeof (*dev_attr), 77 KM_NOSLEEP); 78 if (!dev_attr) 79 return; 80 81 if (ibt_query_hca(ib_get_ibt_hca_hdl(device), dev_attr)) { 82 RDSV3_DPRINTF2("rdsv3_ib_add_one", 83 "Query device failed for %s", device->name); 84 goto free_attr; 85 } 86 87 /* We depend on Reserved Lkey */ 88 if (!(dev_attr->hca_flags2 & IBT_HCA2_RES_LKEY)) { 89 RDSV3_DPRINTF2("rdsv3_ib_add_one", 90 "Reserved Lkey support is required: %s", 91 device->name); 92 goto free_attr; 93 } 94 95 rds_ibdev = kmem_zalloc(sizeof (*rds_ibdev), KM_NOSLEEP); 96 if (!rds_ibdev) 97 goto free_attr; 98 99 rds_ibdev->ibt_hca_hdl = ib_get_ibt_hca_hdl(device); 100 rds_ibdev->hca_attr = *dev_attr; 101 102 rw_init(&rds_ibdev->rwlock, NULL, RW_DRIVER, NULL); 103 mutex_init(&rds_ibdev->spinlock, NULL, MUTEX_DRIVER, NULL); 104 105 rds_ibdev->max_wrs = dev_attr->hca_max_chan_sz; 106 rds_ibdev->max_sge = min(dev_attr->hca_max_sgl, RDSV3_IB_MAX_SGE); 107 108 rds_ibdev->max_initiator_depth = (uint_t)dev_attr->hca_max_rdma_in_qp; 109 rds_ibdev->max_responder_resources = 110 (uint_t)dev_attr->hca_max_rdma_in_qp; 111 112 rds_ibdev->dev = device; 113 rds_ibdev->pd = ib_alloc_pd(device); 114 if (IS_ERR(rds_ibdev->pd)) 115 goto free_dev; 116 117 if (rdsv3_ib_create_mr_pool(rds_ibdev) != 0) { 118 goto free_dev; 119 } 120 121 if (rdsv3_ib_create_inc_pool(rds_ibdev) != 0) { 122 rdsv3_ib_destroy_mr_pool(rds_ibdev); 123 goto free_dev; 124 } 125 126 (void) snprintf(name, 64, "RDSV3_IB_FRAG_%llx", 127 (longlong_t)htonll(dev_attr->hca_node_guid)); 128 rds_ibdev->ib_frag_slab = kmem_cache_create(name, 129 sizeof (struct rdsv3_page_frag), 0, rdsv3_ib_frag_constructor, 130 rdsv3_ib_frag_destructor, NULL, (void *)rds_ibdev, NULL, 0); 131 if (rds_ibdev->ib_frag_slab == NULL) { 132 RDSV3_DPRINTF2("rdsv3_ib_add_one", 133 "kmem_cache_create for ib_frag_slab failed for device: %s", 134 device->name); 135 rdsv3_ib_destroy_mr_pool(rds_ibdev); 136 rdsv3_ib_destroy_inc_pool(rds_ibdev); 137 goto free_dev; 138 } 139 140 rds_ibdev->aft_hcagp = rdsv3_af_grp_create(rds_ibdev->ibt_hca_hdl, 141 (uint64_t)rds_ibdev->hca_attr.hca_node_guid); 142 if (rds_ibdev->aft_hcagp == NULL) { 143 rdsv3_ib_destroy_mr_pool(rds_ibdev); 144 rdsv3_ib_destroy_inc_pool(rds_ibdev); 145 kmem_cache_destroy(rds_ibdev->ib_frag_slab); 146 goto free_dev; 147 } 148 rds_ibdev->fmr_soft_cq = rdsv3_af_thr_create(rdsv3_ib_drain_mrlist_fn, 149 (void *)rds_ibdev->fmr_pool, SCQ_HCA_BIND_CPU, 150 rds_ibdev->aft_hcagp); 151 if (rds_ibdev->fmr_soft_cq == NULL) { 152 rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp); 153 rdsv3_ib_destroy_mr_pool(rds_ibdev); 154 rdsv3_ib_destroy_inc_pool(rds_ibdev); 155 kmem_cache_destroy(rds_ibdev->ib_frag_slab); 156 goto free_dev; 157 } 158 159 rds_ibdev->inc_soft_cq = rdsv3_af_thr_create(rdsv3_ib_drain_inclist, 160 (void *)rds_ibdev->inc_pool, SCQ_HCA_BIND_CPU, 161 rds_ibdev->aft_hcagp); 162 if (rds_ibdev->inc_soft_cq == NULL) { 163 rdsv3_af_thr_destroy(rds_ibdev->fmr_soft_cq); 164 rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp); 165 rdsv3_ib_destroy_mr_pool(rds_ibdev); 166 rdsv3_ib_destroy_inc_pool(rds_ibdev); 167 kmem_cache_destroy(rds_ibdev->ib_frag_slab); 168 goto free_dev; 169 } 170 171 list_create(&rds_ibdev->ipaddr_list, sizeof (struct rdsv3_ib_ipaddr), 172 offsetof(struct rdsv3_ib_ipaddr, list)); 173 list_create(&rds_ibdev->conn_list, sizeof (struct rdsv3_ib_connection), 174 offsetof(struct rdsv3_ib_connection, ib_node)); 175 176 list_insert_tail(&rdsv3_ib_devices, rds_ibdev); 177 178 ib_set_client_data(device, &rdsv3_ib_client, rds_ibdev); 179 180 RDSV3_DPRINTF2("rdsv3_ib_add_one", "Return: device: %p", device); 181 182 goto free_attr; 183 184 err_pd: 185 (void) ib_dealloc_pd(rds_ibdev->pd); 186 free_dev: 187 mutex_destroy(&rds_ibdev->spinlock); 188 rw_destroy(&rds_ibdev->rwlock); 189 kmem_free(rds_ibdev, sizeof (*rds_ibdev)); 190 free_attr: 191 kmem_free(dev_attr, sizeof (*dev_attr)); 192 } 193 194 void 195 rdsv3_ib_remove_one(struct ib_device *device) 196 { 197 struct rdsv3_ib_device *rds_ibdev; 198 struct rdsv3_ib_ipaddr *i_ipaddr, *i_next; 199 200 RDSV3_DPRINTF2("rdsv3_ib_remove_one", "device: %p", device); 201 202 rds_ibdev = ib_get_client_data(device, &rdsv3_ib_client); 203 if (!rds_ibdev) 204 return; 205 206 RDSV3_FOR_EACH_LIST_NODE_SAFE(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, 207 list) { 208 list_remove_node(&i_ipaddr->list); 209 kmem_free(i_ipaddr, sizeof (*i_ipaddr)); 210 } 211 212 rdsv3_ib_destroy_conns(rds_ibdev); 213 214 if (rds_ibdev->fmr_soft_cq) 215 rdsv3_af_thr_destroy(rds_ibdev->fmr_soft_cq); 216 if (rds_ibdev->inc_soft_cq) 217 rdsv3_af_thr_destroy(rds_ibdev->inc_soft_cq); 218 219 rdsv3_ib_destroy_mr_pool(rds_ibdev); 220 rdsv3_ib_destroy_inc_pool(rds_ibdev); 221 222 kmem_cache_destroy(rds_ibdev->ib_frag_slab); 223 224 rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp); 225 226 #if 0 227 while (ib_dealloc_pd(rds_ibdev->pd)) { 228 RDSV3_DPRINTF5("rdsv3_ib_remove_one", 229 "%s-%d Failed to dealloc pd %p", 230 __func__, __LINE__, rds_ibdev->pd); 231 delay(drv_usectohz(1000)); 232 } 233 #else 234 if (ib_dealloc_pd(rds_ibdev->pd)) { 235 RDSV3_DPRINTF2("rdsv3_ib_remove_one", 236 "Failed to dealloc pd %p\n", rds_ibdev->pd); 237 } 238 #endif 239 240 list_destroy(&rds_ibdev->ipaddr_list); 241 list_destroy(&rds_ibdev->conn_list); 242 list_remove_node(&rds_ibdev->list); 243 mutex_destroy(&rds_ibdev->spinlock); 244 rw_destroy(&rds_ibdev->rwlock); 245 kmem_free(rds_ibdev, sizeof (*rds_ibdev)); 246 247 RDSV3_DPRINTF2("rdsv3_ib_remove_one", "Return: device: %p", device); 248 } 249 250 struct ib_client rdsv3_ib_client = { 251 .name = "rdsv3_ib", 252 .add = rdsv3_ib_add_one, 253 .remove = rdsv3_ib_remove_one, 254 .clnt_hdl = NULL, 255 .state = IB_CLNT_UNINITIALIZED 256 }; 257 258 static int 259 rds_ib_conn_info_visitor(struct rdsv3_connection *conn, 260 void *buffer) 261 { 262 struct rds_info_rdma_connection *iinfo = buffer; 263 struct rdsv3_ib_connection *ic; 264 265 RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p", 266 conn, buffer); 267 268 /* We will only ever look at IB transports */ 269 if (conn->c_trans != &rdsv3_ib_transport) 270 return (0); 271 272 iinfo->src_addr = conn->c_laddr; 273 iinfo->dst_addr = conn->c_faddr; 274 275 (void) memset(&iinfo->src_gid, 0, sizeof (iinfo->src_gid)); 276 (void) memset(&iinfo->dst_gid, 0, sizeof (iinfo->dst_gid)); 277 if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) { 278 struct rdsv3_ib_device *rds_ibdev; 279 struct rdma_dev_addr *dev_addr; 280 281 ic = conn->c_transport_data; 282 dev_addr = &ic->i_cm_id->route.addr.dev_addr; 283 284 ib_addr_get_sgid(dev_addr, (union ib_gid *)&iinfo->src_gid); 285 ib_addr_get_dgid(dev_addr, (union ib_gid *)&iinfo->dst_gid); 286 287 rds_ibdev = ib_get_client_data(ic->i_cm_id->device, 288 &rdsv3_ib_client); 289 iinfo->max_send_wr = ic->i_send_ring.w_nr; 290 iinfo->max_recv_wr = ic->i_recv_ring.w_nr; 291 iinfo->max_send_sge = rds_ibdev->max_sge; 292 } 293 294 RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p", 295 conn, buffer); 296 return (1); 297 } 298 299 static void 300 rds_ib_ic_info(struct rsock *sock, unsigned int len, 301 struct rdsv3_info_iterator *iter, 302 struct rdsv3_info_lengths *lens) 303 { 304 RDSV3_DPRINTF4("rds_ib_ic_info", "sk: %p iter: %p, lens: %p, len: %d", 305 sock, iter, lens, len); 306 307 rdsv3_for_each_conn_info(sock, len, iter, lens, 308 rds_ib_conn_info_visitor, 309 sizeof (struct rds_info_rdma_connection)); 310 } 311 312 /* 313 * Early RDS/IB was built to only bind to an address if there is an IPoIB 314 * device with that address set. 315 * 316 * If it were me, I'd advocate for something more flexible. Sending and 317 * receiving should be device-agnostic. Transports would try and maintain 318 * connections between peers who have messages queued. Userspace would be 319 * allowed to influence which paths have priority. We could call userspace 320 * asserting this policy "routing". 321 */ 322 static int 323 rds_ib_laddr_check(uint32_be_t addr) 324 { 325 int ret; 326 struct rdma_cm_id *cm_id; 327 struct sockaddr_in sin; 328 329 RDSV3_DPRINTF4("rds_ib_laddr_check", "addr: %x", ntohl(addr)); 330 331 /* 332 * Create a CMA ID and try to bind it. This catches both 333 * IB and iWARP capable NICs. 334 */ 335 cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP); 336 if (!cm_id) 337 return (-EADDRNOTAVAIL); 338 339 (void) memset(&sin, 0, sizeof (sin)); 340 sin.sin_family = AF_INET; 341 sin.sin_addr.s_addr = rdsv3_scaddr_to_ibaddr(addr); 342 343 /* rdma_bind_addr will only succeed for IB & iWARP devices */ 344 ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); 345 /* 346 * due to this, we will claim to support iWARP devices unless we 347 * check node_type. 348 */ 349 if (ret || cm_id->device->node_type != RDMA_NODE_IB_CA) 350 ret = -EADDRNOTAVAIL; 351 352 RDSV3_DPRINTF5("rds_ib_laddr_check", 353 "addr %u.%u.%u.%u ret %d node type %d", 354 NIPQUAD(addr), ret, 355 cm_id->device ? cm_id->device->node_type : -1); 356 357 rdma_destroy_id(cm_id); 358 359 return (ret); 360 } 361 362 void 363 rdsv3_ib_exit(void) 364 { 365 RDSV3_DPRINTF4("rds_ib_exit", "Enter"); 366 367 rdsv3_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); 368 rdsv3_ib_destroy_nodev_conns(); 369 ib_unregister_client(&rdsv3_ib_client); 370 rdsv3_ib_sysctl_exit(); 371 rdsv3_ib_recv_exit(); 372 rdsv3_trans_unregister(&rdsv3_ib_transport); 373 kmem_free(rdsv3_ib_stats, 374 nr_cpus * sizeof (struct rdsv3_ib_statistics)); 375 mutex_destroy(&ib_nodev_conns_lock); 376 list_destroy(&ib_nodev_conns); 377 list_destroy(&rdsv3_ib_devices); 378 379 RDSV3_DPRINTF4("rds_ib_exit", "Return"); 380 } 381 382 struct rdsv3_transport rdsv3_ib_transport = { 383 .laddr_check = rds_ib_laddr_check, 384 .xmit_complete = rdsv3_ib_xmit_complete, 385 .xmit = rdsv3_ib_xmit, 386 .xmit_cong_map = NULL, 387 .xmit_rdma = rdsv3_ib_xmit_rdma, 388 .recv = rdsv3_ib_recv, 389 .conn_alloc = rdsv3_ib_conn_alloc, 390 .conn_free = rdsv3_ib_conn_free, 391 .conn_connect = rdsv3_ib_conn_connect, 392 .conn_shutdown = rdsv3_ib_conn_shutdown, 393 .inc_copy_to_user = rdsv3_ib_inc_copy_to_user, 394 .inc_free = rdsv3_ib_inc_free, 395 .cm_initiate_connect = rdsv3_ib_cm_initiate_connect, 396 .cm_handle_connect = rdsv3_ib_cm_handle_connect, 397 .cm_connect_complete = rdsv3_ib_cm_connect_complete, 398 .stats_info_copy = rdsv3_ib_stats_info_copy, 399 .exit = rdsv3_ib_exit, 400 .get_mr = rdsv3_ib_get_mr, 401 .sync_mr = rdsv3_ib_sync_mr, 402 .free_mr = rdsv3_ib_free_mr, 403 .flush_mrs = rdsv3_ib_flush_mrs, 404 .t_name = "infiniband", 405 .t_type = RDS_TRANS_IB 406 }; 407 408 int 409 rdsv3_ib_init(void) 410 { 411 int ret; 412 413 RDSV3_DPRINTF4("rds_ib_init", "Enter"); 414 415 list_create(&rdsv3_ib_devices, sizeof (struct rdsv3_ib_device), 416 offsetof(struct rdsv3_ib_device, list)); 417 list_create(&ib_nodev_conns, sizeof (struct rdsv3_ib_connection), 418 offsetof(struct rdsv3_ib_connection, ib_node)); 419 mutex_init(&ib_nodev_conns_lock, NULL, MUTEX_DRIVER, NULL); 420 421 /* allocate space for ib statistics */ 422 ASSERT(rdsv3_ib_stats == NULL); 423 rdsv3_ib_stats = kmem_zalloc(nr_cpus * 424 sizeof (struct rdsv3_ib_statistics), KM_SLEEP); 425 426 rdsv3_ib_client.dip = rdsv3_dev_info; 427 ret = ib_register_client(&rdsv3_ib_client); 428 if (ret) 429 goto out; 430 431 ret = rdsv3_ib_sysctl_init(); 432 if (ret) 433 goto out_ibreg; 434 435 ret = rdsv3_ib_recv_init(); 436 if (ret) 437 goto out_sysctl; 438 439 ret = rdsv3_trans_register(&rdsv3_ib_transport); 440 if (ret) 441 goto out_recv; 442 443 rdsv3_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); 444 445 RDSV3_DPRINTF4("rds_ib_init", "Return"); 446 447 return (0); 448 449 out_recv: 450 rdsv3_ib_recv_exit(); 451 out_sysctl: 452 rdsv3_ib_sysctl_exit(); 453 out_ibreg: 454 ib_unregister_client(&rdsv3_ib_client); 455 out: 456 kmem_free(rdsv3_ib_stats, 457 nr_cpus * sizeof (struct rdsv3_ib_statistics)); 458 mutex_destroy(&ib_nodev_conns_lock); 459 list_destroy(&ib_nodev_conns); 460 list_destroy(&rdsv3_ib_devices); 461 return (ret); 462 }