1 /*
   2  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
   3  */
   4 
   5 /*
   6  * This file contains code imported from the OFED rds source file ib.c
   7  * Oracle elects to have and use the contents of ib.c under and governed
   8  * by the OpenIB.org BSD license (see below for full license text). However,
   9  * the following notice accompanied the original version of this file:
  10  */
  11 
  12 /*
  13  * Copyright (c) 2006 Oracle.  All rights reserved.
  14  *
  15  * This software is available to you under a choice of one of two
  16  * licenses.  You may choose to be licensed under the terms of the GNU
  17  * General Public License (GPL) Version 2, available from the file
  18  * COPYING in the main directory of this source tree, or the
  19  * OpenIB.org BSD license below:
  20  *
  21  *     Redistribution and use in source and binary forms, with or
  22  *     without modification, are permitted provided that the following
  23  *     conditions are met:
  24  *
  25  *      - Redistributions of source code must retain the above
  26  *        copyright notice, this list of conditions and the following
  27  *        disclaimer.
  28  *
  29  *      - Redistributions in binary form must reproduce the above
  30  *        copyright notice, this list of conditions and the following
  31  *        disclaimer in the documentation and/or other materials
  32  *        provided with the distribution.
  33  *
  34  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  35  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  36  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  37  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  38  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  39  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  40  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  41  * SOFTWARE.
  42  *
  43  */
  44 #include <sys/sysmacros.h>
  45 #include <sys/rds.h>
  46 
  47 #include <sys/ib/ibtl/ibti.h>
  48 #include <sys/ib/clients/rdsv3/rdsv3.h>
  49 #include <sys/ib/clients/rdsv3/ib.h>
  50 #include <sys/ib/clients/rdsv3/rdsv3_debug.h>
  51 
/* Retry count setting; starts out at the compiled-in default. */
unsigned int rdsv3_ib_retry_count = RDSV3_IB_DEFAULT_RETRY_COUNT;

/*
 * List of struct rdsv3_ib_device, linked through the structure's 'list'
 * member.  Created in rdsv3_ib_init(); rdsv3_ib_add_one() appends to it.
 */
struct list     rdsv3_ib_devices;

/* NOTE: if also grabbing ibdev lock, grab this first */
kmutex_t ib_nodev_conns_lock;
/*
 * List of struct rdsv3_ib_connection, linked through the structure's
 * 'ib_node' member.  Created in rdsv3_ib_init().
 */
list_t ib_nodev_conns;

/*
 * Defined elsewhere; installed as the constructor/destructor callbacks of
 * the per-device fragment kmem cache created in rdsv3_ib_add_one().
 */
extern int rdsv3_ib_frag_constructor(void *buf, void *arg, int kmflags);
extern void rdsv3_ib_frag_destructor(void *buf, void *arg);
  62 
  63 void
  64 rdsv3_ib_add_one(ib_device_t *device)
  65 {
  66         struct rdsv3_ib_device *rds_ibdev;
  67         ibt_hca_attr_t *dev_attr;
  68         char name[64];
  69 
  70         RDSV3_DPRINTF2("rdsv3_ib_add_one", "device: %p", device);
  71 
  72         /* Only handle IB (no iWARP) devices */
  73         if (device->node_type != RDMA_NODE_IB_CA)
  74                 return;
  75 
  76         dev_attr = (ibt_hca_attr_t *)kmem_alloc(sizeof (*dev_attr),
  77             KM_NOSLEEP);
  78         if (!dev_attr)
  79                 return;
  80 
  81         if (ibt_query_hca(ib_get_ibt_hca_hdl(device), dev_attr)) {
  82                 RDSV3_DPRINTF2("rdsv3_ib_add_one",
  83                     "Query device failed for %s", device->name);
  84                 goto free_attr;
  85         }
  86 
  87         /* We depend on Reserved Lkey */
  88         if (!(dev_attr->hca_flags2 & IBT_HCA2_RES_LKEY)) {
  89                 RDSV3_DPRINTF2("rdsv3_ib_add_one",
  90                     "Reserved Lkey support is required: %s",
  91                     device->name);
  92                 goto free_attr;
  93         }
  94 
  95         rds_ibdev = kmem_zalloc(sizeof (*rds_ibdev), KM_NOSLEEP);
  96         if (!rds_ibdev)
  97                 goto free_attr;
  98 
  99         rds_ibdev->ibt_hca_hdl = ib_get_ibt_hca_hdl(device);
 100         rds_ibdev->hca_attr =  *dev_attr;
 101 
 102         rw_init(&rds_ibdev->rwlock, NULL, RW_DRIVER, NULL);
 103         mutex_init(&rds_ibdev->spinlock, NULL, MUTEX_DRIVER, NULL);
 104 
 105         rds_ibdev->max_wrs = dev_attr->hca_max_chan_sz;
 106         rds_ibdev->max_sge = min(dev_attr->hca_max_sgl, RDSV3_IB_MAX_SGE);
 107 
 108         rds_ibdev->max_initiator_depth = (uint_t)dev_attr->hca_max_rdma_in_qp;
 109         rds_ibdev->max_responder_resources =
 110             (uint_t)dev_attr->hca_max_rdma_in_qp;
 111 
 112         rds_ibdev->dev = device;
 113         rds_ibdev->pd = ib_alloc_pd(device);
 114         if (IS_ERR(rds_ibdev->pd))
 115                 goto free_dev;
 116 
 117         if (rdsv3_ib_create_mr_pool(rds_ibdev) != 0) {
 118                 goto free_dev;
 119         }
 120 
 121         if (rdsv3_ib_create_inc_pool(rds_ibdev) != 0) {
 122                 rdsv3_ib_destroy_mr_pool(rds_ibdev);
 123                 goto free_dev;
 124         }
 125 
 126         (void) snprintf(name, 64, "RDSV3_IB_FRAG_%llx",
 127             (longlong_t)htonll(dev_attr->hca_node_guid));
 128         rds_ibdev->ib_frag_slab = kmem_cache_create(name,
 129             sizeof (struct rdsv3_page_frag), 0, rdsv3_ib_frag_constructor,
 130             rdsv3_ib_frag_destructor, NULL, (void *)rds_ibdev, NULL, 0);
 131         if (rds_ibdev->ib_frag_slab == NULL) {
 132                 RDSV3_DPRINTF2("rdsv3_ib_add_one",
 133                     "kmem_cache_create for ib_frag_slab failed for device: %s",
 134                     device->name);
 135                 rdsv3_ib_destroy_mr_pool(rds_ibdev);
 136                 rdsv3_ib_destroy_inc_pool(rds_ibdev);
 137                 goto free_dev;
 138         }
 139 
 140         rds_ibdev->aft_hcagp = rdsv3_af_grp_create(rds_ibdev->ibt_hca_hdl,
 141             (uint64_t)rds_ibdev->hca_attr.hca_node_guid);
 142         if (rds_ibdev->aft_hcagp == NULL) {
 143                 rdsv3_ib_destroy_mr_pool(rds_ibdev);
 144                 rdsv3_ib_destroy_inc_pool(rds_ibdev);
 145                 kmem_cache_destroy(rds_ibdev->ib_frag_slab);
 146                 goto free_dev;
 147         }
 148         rds_ibdev->fmr_soft_cq = rdsv3_af_thr_create(rdsv3_ib_drain_mrlist_fn,
 149             (void *)rds_ibdev->fmr_pool, SCQ_HCA_BIND_CPU,
 150             rds_ibdev->aft_hcagp);
 151         if (rds_ibdev->fmr_soft_cq == NULL) {
 152                 rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp);
 153                 rdsv3_ib_destroy_mr_pool(rds_ibdev);
 154                 rdsv3_ib_destroy_inc_pool(rds_ibdev);
 155                 kmem_cache_destroy(rds_ibdev->ib_frag_slab);
 156                 goto free_dev;
 157         }
 158 
 159         rds_ibdev->inc_soft_cq = rdsv3_af_thr_create(rdsv3_ib_drain_inclist,
 160             (void *)rds_ibdev->inc_pool, SCQ_HCA_BIND_CPU,
 161             rds_ibdev->aft_hcagp);
 162         if (rds_ibdev->inc_soft_cq == NULL) {
 163                 rdsv3_af_thr_destroy(rds_ibdev->fmr_soft_cq);
 164                 rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp);
 165                 rdsv3_ib_destroy_mr_pool(rds_ibdev);
 166                 rdsv3_ib_destroy_inc_pool(rds_ibdev);
 167                 kmem_cache_destroy(rds_ibdev->ib_frag_slab);
 168                 goto free_dev;
 169         }
 170 
 171         list_create(&rds_ibdev->ipaddr_list, sizeof (struct rdsv3_ib_ipaddr),
 172             offsetof(struct rdsv3_ib_ipaddr, list));
 173         list_create(&rds_ibdev->conn_list, sizeof (struct rdsv3_ib_connection),
 174             offsetof(struct rdsv3_ib_connection, ib_node));
 175 
 176         list_insert_tail(&rdsv3_ib_devices, rds_ibdev);
 177 
 178         ib_set_client_data(device, &rdsv3_ib_client, rds_ibdev);
 179 
 180         RDSV3_DPRINTF2("rdsv3_ib_add_one", "Return: device: %p", device);
 181 
 182         goto free_attr;
 183 
 184 err_pd:
 185         (void) ib_dealloc_pd(rds_ibdev->pd);
 186 free_dev:
 187         mutex_destroy(&rds_ibdev->spinlock);
 188         rw_destroy(&rds_ibdev->rwlock);
 189         kmem_free(rds_ibdev, sizeof (*rds_ibdev));
 190 free_attr:
 191         kmem_free(dev_attr, sizeof (*dev_attr));
 192 }
 193 
 194 void
 195 rdsv3_ib_remove_one(struct ib_device *device)
 196 {
 197         struct rdsv3_ib_device *rds_ibdev;
 198         struct rdsv3_ib_ipaddr *i_ipaddr, *i_next;
 199 
 200         RDSV3_DPRINTF2("rdsv3_ib_remove_one", "device: %p", device);
 201 
 202         rds_ibdev = ib_get_client_data(device, &rdsv3_ib_client);
 203         if (!rds_ibdev)
 204                 return;
 205 
 206         RDSV3_FOR_EACH_LIST_NODE_SAFE(i_ipaddr, i_next, &rds_ibdev->ipaddr_list,
 207             list) {
 208                 list_remove_node(&i_ipaddr->list);
 209                 kmem_free(i_ipaddr, sizeof (*i_ipaddr));
 210         }
 211 
 212         rdsv3_ib_destroy_conns(rds_ibdev);
 213 
 214         if (rds_ibdev->fmr_soft_cq)
 215                 rdsv3_af_thr_destroy(rds_ibdev->fmr_soft_cq);
 216         if (rds_ibdev->inc_soft_cq)
 217                 rdsv3_af_thr_destroy(rds_ibdev->inc_soft_cq);
 218 
 219         rdsv3_ib_destroy_mr_pool(rds_ibdev);
 220         rdsv3_ib_destroy_inc_pool(rds_ibdev);
 221 
 222         kmem_cache_destroy(rds_ibdev->ib_frag_slab);
 223 
 224         rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp);
 225 
 226 #if 0
 227         while (ib_dealloc_pd(rds_ibdev->pd)) {
 228                 RDSV3_DPRINTF5("rdsv3_ib_remove_one",
 229                     "%s-%d Failed to dealloc pd %p",
 230                     __func__, __LINE__, rds_ibdev->pd);
 231                 delay(drv_usectohz(1000));
 232         }
 233 #else
 234         if (ib_dealloc_pd(rds_ibdev->pd)) {
 235                 RDSV3_DPRINTF2("rdsv3_ib_remove_one",
 236                     "Failed to dealloc pd %p\n", rds_ibdev->pd);
 237         }
 238 #endif
 239 
 240         list_destroy(&rds_ibdev->ipaddr_list);
 241         list_destroy(&rds_ibdev->conn_list);
 242         list_remove_node(&rds_ibdev->list);
 243         mutex_destroy(&rds_ibdev->spinlock);
 244         rw_destroy(&rds_ibdev->rwlock);
 245         kmem_free(rds_ibdev, sizeof (*rds_ibdev));
 246 
 247         RDSV3_DPRINTF2("rdsv3_ib_remove_one", "Return: device: %p", device);
 248 }
 249 
/*
 * Client descriptor handed to ib_register_client() and
 * ib_unregister_client().  The add/remove callbacks are the two functions
 * above; the same object is the key used with ib_set_client_data() and
 * ib_get_client_data().  The 'dip' member is filled in by rdsv3_ib_init()
 * before registration.
 */
struct ib_client rdsv3_ib_client = {
        .name           = "rdsv3_ib",
        .add            = rdsv3_ib_add_one,
        .remove         = rdsv3_ib_remove_one,
        .clnt_hdl       = NULL,
        .state          = IB_CLNT_UNINITIALIZED
};
 257 
 258 static int
 259 rds_ib_conn_info_visitor(struct rdsv3_connection *conn,
 260     void *buffer)
 261 {
 262         struct rds_info_rdma_connection *iinfo = buffer;
 263         struct rdsv3_ib_connection *ic;
 264 
 265         RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p",
 266             conn, buffer);
 267 
 268         /* We will only ever look at IB transports */
 269         if (conn->c_trans != &rdsv3_ib_transport)
 270                 return (0);
 271 
 272         iinfo->src_addr = conn->c_laddr;
 273         iinfo->dst_addr = conn->c_faddr;
 274 
 275         (void) memset(&iinfo->src_gid, 0, sizeof (iinfo->src_gid));
 276         (void) memset(&iinfo->dst_gid, 0, sizeof (iinfo->dst_gid));
 277         if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) {
 278                 struct rdsv3_ib_device *rds_ibdev;
 279                 struct rdma_dev_addr *dev_addr;
 280 
 281                 ic = conn->c_transport_data;
 282                 dev_addr = &ic->i_cm_id->route.addr.dev_addr;
 283 
 284                 ib_addr_get_sgid(dev_addr, (union ib_gid *)&iinfo->src_gid);
 285                 ib_addr_get_dgid(dev_addr, (union ib_gid *)&iinfo->dst_gid);
 286 
 287                 rds_ibdev = ib_get_client_data(ic->i_cm_id->device,
 288                     &rdsv3_ib_client);
 289                 iinfo->max_send_wr = ic->i_send_ring.w_nr;
 290                 iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
 291                 iinfo->max_send_sge = rds_ibdev->max_sge;
 292         }
 293 
 294         RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p",
 295             conn, buffer);
 296         return (1);
 297 }
 298 
 299 static void
 300 rds_ib_ic_info(struct rsock *sock, unsigned int len,
 301     struct rdsv3_info_iterator *iter,
 302     struct rdsv3_info_lengths *lens)
 303 {
 304         RDSV3_DPRINTF4("rds_ib_ic_info", "sk: %p iter: %p, lens: %p, len: %d",
 305             sock, iter, lens, len);
 306 
 307         rdsv3_for_each_conn_info(sock, len, iter, lens,
 308             rds_ib_conn_info_visitor,
 309             sizeof (struct rds_info_rdma_connection));
 310 }
 311 
 312 /*
 313  * Early RDS/IB was built to only bind to an address if there is an IPoIB
 314  * device with that address set.
 315  *
 316  * If it were me, I'd advocate for something more flexible.  Sending and
 317  * receiving should be device-agnostic.  Transports would try and maintain
 318  * connections between peers who have messages queued.  Userspace would be
 319  * allowed to influence which paths have priority.  We could call userspace
 320  * asserting this policy "routing".
 321  */
 322 static int
 323 rds_ib_laddr_check(uint32_be_t addr)
 324 {
 325         int ret;
 326         struct rdma_cm_id *cm_id;
 327         struct sockaddr_in sin;
 328 
 329         RDSV3_DPRINTF4("rds_ib_laddr_check", "addr: %x", ntohl(addr));
 330 
 331         /*
 332          * Create a CMA ID and try to bind it. This catches both
 333          * IB and iWARP capable NICs.
 334          */
 335         cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP);
 336         if (!cm_id)
 337                 return (-EADDRNOTAVAIL);
 338 
 339         (void) memset(&sin, 0, sizeof (sin));
 340         sin.sin_family = AF_INET;
 341         sin.sin_addr.s_addr = rdsv3_scaddr_to_ibaddr(addr);
 342 
 343         /* rdma_bind_addr will only succeed for IB & iWARP devices */
 344         ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
 345         /*
 346          * due to this, we will claim to support iWARP devices unless we
 347          * check node_type.
 348          */
 349         if (ret || cm_id->device->node_type != RDMA_NODE_IB_CA)
 350                 ret = -EADDRNOTAVAIL;
 351 
 352         RDSV3_DPRINTF5("rds_ib_laddr_check",
 353             "addr %u.%u.%u.%u ret %d node type %d",
 354             NIPQUAD(addr), ret,
 355             cm_id->device ? cm_id->device->node_type : -1);
 356 
 357         rdma_destroy_id(cm_id);
 358 
 359         return (ret);
 360 }
 361 
 362 void
 363 rdsv3_ib_exit(void)
 364 {
 365         RDSV3_DPRINTF4("rds_ib_exit", "Enter");
 366 
 367         rdsv3_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
 368         rdsv3_ib_destroy_nodev_conns();
 369         ib_unregister_client(&rdsv3_ib_client);
 370         rdsv3_ib_sysctl_exit();
 371         rdsv3_ib_recv_exit();
 372         rdsv3_trans_unregister(&rdsv3_ib_transport);
 373         kmem_free(rdsv3_ib_stats,
 374             nr_cpus * sizeof (struct rdsv3_ib_statistics));
 375         mutex_destroy(&ib_nodev_conns_lock);
 376         list_destroy(&ib_nodev_conns);
 377         list_destroy(&rdsv3_ib_devices);
 378 
 379         RDSV3_DPRINTF4("rds_ib_exit", "Return");
 380 }
 381 
/*
 * Transport operations table for RDS over InfiniBand; registered with
 * rdsv3_trans_register() in rdsv3_ib_init() and unregistered in
 * rdsv3_ib_exit().  Of the handlers listed, only laddr_check and exit are
 * defined in this file.  xmit_cong_map is deliberately left NULL.
 */
struct rdsv3_transport rdsv3_ib_transport = {
        .laddr_check            = rds_ib_laddr_check,
        .xmit_complete          = rdsv3_ib_xmit_complete,
        .xmit                   = rdsv3_ib_xmit,
        .xmit_cong_map          = NULL,
        .xmit_rdma              = rdsv3_ib_xmit_rdma,
        .recv                   = rdsv3_ib_recv,
        .conn_alloc             = rdsv3_ib_conn_alloc,
        .conn_free              = rdsv3_ib_conn_free,
        .conn_connect           = rdsv3_ib_conn_connect,
        .conn_shutdown          = rdsv3_ib_conn_shutdown,
        .inc_copy_to_user       = rdsv3_ib_inc_copy_to_user,
        .inc_free               = rdsv3_ib_inc_free,
        .cm_initiate_connect    = rdsv3_ib_cm_initiate_connect,
        .cm_handle_connect      = rdsv3_ib_cm_handle_connect,
        .cm_connect_complete    = rdsv3_ib_cm_connect_complete,
        .stats_info_copy        = rdsv3_ib_stats_info_copy,
        .exit                   = rdsv3_ib_exit,
        .get_mr                 = rdsv3_ib_get_mr,
        .sync_mr                = rdsv3_ib_sync_mr,
        .free_mr                = rdsv3_ib_free_mr,
        .flush_mrs              = rdsv3_ib_flush_mrs,
        .t_name                 = "infiniband",
        .t_type                 = RDS_TRANS_IB
};
 407 
 408 int
 409 rdsv3_ib_init(void)
 410 {
 411         int ret;
 412 
 413         RDSV3_DPRINTF4("rds_ib_init", "Enter");
 414 
 415         list_create(&rdsv3_ib_devices, sizeof (struct rdsv3_ib_device),
 416             offsetof(struct rdsv3_ib_device, list));
 417         list_create(&ib_nodev_conns, sizeof (struct rdsv3_ib_connection),
 418             offsetof(struct rdsv3_ib_connection, ib_node));
 419         mutex_init(&ib_nodev_conns_lock, NULL, MUTEX_DRIVER, NULL);
 420 
 421         /* allocate space for ib statistics */
 422         ASSERT(rdsv3_ib_stats == NULL);
 423         rdsv3_ib_stats = kmem_zalloc(nr_cpus *
 424             sizeof (struct rdsv3_ib_statistics), KM_SLEEP);
 425 
 426         rdsv3_ib_client.dip = rdsv3_dev_info;
 427         ret = ib_register_client(&rdsv3_ib_client);
 428         if (ret)
 429                 goto out;
 430 
 431         ret = rdsv3_ib_sysctl_init();
 432         if (ret)
 433                 goto out_ibreg;
 434 
 435         ret = rdsv3_ib_recv_init();
 436         if (ret)
 437                 goto out_sysctl;
 438 
 439         ret = rdsv3_trans_register(&rdsv3_ib_transport);
 440         if (ret)
 441                 goto out_recv;
 442 
 443         rdsv3_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
 444 
 445         RDSV3_DPRINTF4("rds_ib_init", "Return");
 446 
 447         return (0);
 448 
 449 out_recv:
 450         rdsv3_ib_recv_exit();
 451 out_sysctl:
 452         rdsv3_ib_sysctl_exit();
 453 out_ibreg:
 454         ib_unregister_client(&rdsv3_ib_client);
 455 out:
 456         kmem_free(rdsv3_ib_stats,
 457             nr_cpus * sizeof (struct rdsv3_ib_statistics));
 458         mutex_destroy(&ib_nodev_conns_lock);
 459         list_destroy(&ib_nodev_conns);
 460         list_destroy(&rdsv3_ib_devices);
 461         return (ret);
 462 }