1 /*
   2  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
   3  */
   4 
   5 /*
   6  * This file contains code imported from the OFED rds source file ib.c
   7  * Oracle elects to have and use the contents of ib.c under and governed
   8  * by the OpenIB.org BSD license (see below for full license text). However,
   9  * the following notice accompanied the original version of this file:
  10  */
  11 
  12 /*
  13  * Copyright (c) 2006 Oracle.  All rights reserved.
  14  *
  15  * This software is available to you under a choice of one of two
  16  * licenses.  You may choose to be licensed under the terms of the GNU
  17  * General Public License (GPL) Version 2, available from the file
  18  * COPYING in the main directory of this source tree, or the
  19  * OpenIB.org BSD license below:
  20  *
  21  *     Redistribution and use in source and binary forms, with or
  22  *     without modification, are permitted provided that the following
  23  *     conditions are met:
  24  *
  25  *      - Redistributions of source code must retain the above
  26  *        copyright notice, this list of conditions and the following
  27  *        disclaimer.
  28  *
  29  *      - Redistributions in binary form must reproduce the above
  30  *        copyright notice, this list of conditions and the following
  31  *        disclaimer in the documentation and/or other materials
  32  *        provided with the distribution.
  33  *
  34  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  35  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  36  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  37  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  38  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  39  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  40  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  41  * SOFTWARE.
  42  *
  43  */
  44 #include <sys/sysmacros.h>
  45 #include <sys/rds.h>
  46 
  47 #include <sys/ib/ibtl/ibti.h>
  48 #include <sys/ib/clients/rdsv3/rdsv3.h>
  49 #include <sys/ib/clients/rdsv3/ib.h>
  50 #include <sys/ib/clients/rdsv3/rdsv3_debug.h>
  51 
/* Global initialized to the compiled-in default; not referenced in this file. */
unsigned int rdsv3_ib_retry_count = RDSV3_IB_DEFAULT_RETRY_COUNT;

/*
 * List of struct rdsv3_ib_device entries. Created in rdsv3_ib_init(),
 * appended to by rdsv3_ib_add_one(), destroyed in rdsv3_ib_exit().
 */
struct list     rdsv3_ib_devices;

/* NOTE: if also grabbing ibdev lock, grab this first */
kmutex_t ib_nodev_conns_lock;
/*
 * List of struct rdsv3_ib_connection entries linked through ib_node;
 * created in rdsv3_ib_init(). Presumably protected by ib_nodev_conns_lock
 * above -- the locking itself happens outside this file, confirm there.
 */
list_t ib_nodev_conns;

/*
 * Constructor/destructor callbacks handed to kmem_cache_create() for the
 * per-device fragment cache in rdsv3_ib_add_one(); defined elsewhere.
 */
extern int rdsv3_ib_frag_constructor(void *buf, void *arg, int kmflags);
extern void rdsv3_ib_frag_destructor(void *buf, void *arg);
  62 
  63 void
  64 rdsv3_ib_add_one(ib_device_t *device)
  65 {
  66         struct rdsv3_ib_device *rds_ibdev;
  67         ibt_hca_attr_t *dev_attr;
  68         char name[64];
  69 
  70         RDSV3_DPRINTF2("rdsv3_ib_add_one", "device: %p", device);
  71 
  72         /* Only handle IB (no iWARP) devices */
  73         if (device->node_type != RDMA_NODE_IB_CA)
  74                 return;
  75 
  76         dev_attr = (ibt_hca_attr_t *)kmem_alloc(sizeof (*dev_attr),
  77             KM_NOSLEEP);
  78         if (!dev_attr)
  79                 return;
  80 
  81         if (ibt_query_hca(ib_get_ibt_hca_hdl(device), dev_attr)) {
  82                 RDSV3_DPRINTF2("rdsv3_ib_add_one",
  83                     "Query device failed for %s", device->name);
  84                 goto free_attr;
  85         }
  86 
  87         /* We depend on Reserved Lkey */
  88         if (!(dev_attr->hca_flags2 & IBT_HCA2_RES_LKEY)) {
  89                 RDSV3_DPRINTF2("rdsv3_ib_add_one",
  90                     "Reserved Lkey support is required: %s",
  91                     device->name);
  92                 goto free_attr;
  93         }
  94 
  95         rds_ibdev = kmem_zalloc(sizeof (*rds_ibdev), KM_NOSLEEP);
  96         if (!rds_ibdev)
  97                 goto free_attr;
  98 
  99         rds_ibdev->ibt_hca_hdl = ib_get_ibt_hca_hdl(device);
 100         rds_ibdev->hca_attr =  *dev_attr;
 101 
 102         rw_init(&rds_ibdev->rwlock, NULL, RW_DRIVER, NULL);
 103         mutex_init(&rds_ibdev->spinlock, NULL, MUTEX_DRIVER, NULL);
 104 
 105         rds_ibdev->max_wrs = dev_attr->hca_max_chan_sz;
 106         rds_ibdev->max_sge = min(dev_attr->hca_max_sgl, RDSV3_IB_MAX_SGE);
 107 
 108         rds_ibdev->max_initiator_depth = (uint_t)dev_attr->hca_max_rdma_in_qp;
 109         rds_ibdev->max_responder_resources =
 110             (uint_t)dev_attr->hca_max_rdma_in_qp;
 111 
 112         rds_ibdev->dev = device;
 113         rds_ibdev->pd = ib_alloc_pd(device);
 114         if (IS_ERR(rds_ibdev->pd))
 115                 goto free_dev;
 116 
 117         if (rdsv3_ib_create_mr_pool(rds_ibdev) != 0) {
 118                 goto free_dev;
 119         }
 120 
 121         if (rdsv3_ib_create_inc_pool(rds_ibdev) != 0) {
 122                 rdsv3_ib_destroy_mr_pool(rds_ibdev);
 123                 goto free_dev;
 124         }
 125 
 126         (void) snprintf(name, 64, "RDSV3_IB_FRAG_%llx",
 127             (longlong_t)htonll(dev_attr->hca_node_guid));
 128         rds_ibdev->ib_frag_slab = kmem_cache_create(name,
 129             sizeof (struct rdsv3_page_frag), 0, rdsv3_ib_frag_constructor,
 130             rdsv3_ib_frag_destructor, NULL, (void *)rds_ibdev, NULL, 0);
 131         if (rds_ibdev->ib_frag_slab == NULL) {
 132                 RDSV3_DPRINTF2("rdsv3_ib_add_one",
 133                     "kmem_cache_create for ib_frag_slab failed for device: %s",
 134                     device->name);
 135                 rdsv3_ib_destroy_mr_pool(rds_ibdev);
 136                 rdsv3_ib_destroy_inc_pool(rds_ibdev);
 137                 goto free_dev;
 138         }
 139 
 140         rds_ibdev->aft_hcagp = rdsv3_af_grp_create(rds_ibdev->ibt_hca_hdl,
 141             (uint64_t)rds_ibdev->hca_attr.hca_node_guid);
 142         if (rds_ibdev->aft_hcagp == NULL) {
 143                 rdsv3_ib_destroy_mr_pool(rds_ibdev);
 144                 rdsv3_ib_destroy_inc_pool(rds_ibdev);
 145                 kmem_cache_destroy(rds_ibdev->ib_frag_slab);
 146                 goto free_dev;
 147         }
 148         rds_ibdev->fmr_soft_cq = rdsv3_af_thr_create(rdsv3_ib_drain_mrlist_fn,
 149             (void *)rds_ibdev->fmr_pool, SCQ_HCA_BIND_CPU,
 150             rds_ibdev->aft_hcagp);
 151         if (rds_ibdev->fmr_soft_cq == NULL) {
 152                 rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp);
 153                 rdsv3_ib_destroy_mr_pool(rds_ibdev);
 154                 rdsv3_ib_destroy_inc_pool(rds_ibdev);
 155                 kmem_cache_destroy(rds_ibdev->ib_frag_slab);
 156                 goto free_dev;
 157         }
 158 
 159         rds_ibdev->inc_soft_cq = rdsv3_af_thr_create(rdsv3_ib_drain_inclist,
 160             (void *)rds_ibdev->inc_pool, SCQ_HCA_BIND_CPU,
 161             rds_ibdev->aft_hcagp);
 162         if (rds_ibdev->inc_soft_cq == NULL) {
 163                 rdsv3_af_thr_destroy(rds_ibdev->fmr_soft_cq);
 164                 rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp);
 165                 rdsv3_ib_destroy_mr_pool(rds_ibdev);
 166                 rdsv3_ib_destroy_inc_pool(rds_ibdev);
 167                 kmem_cache_destroy(rds_ibdev->ib_frag_slab);
 168                 goto free_dev;
 169         }
 170 
 171         list_create(&rds_ibdev->ipaddr_list, sizeof (struct rdsv3_ib_ipaddr),
 172             offsetof(struct rdsv3_ib_ipaddr, list));
 173         list_create(&rds_ibdev->conn_list, sizeof (struct rdsv3_ib_connection),
 174             offsetof(struct rdsv3_ib_connection, ib_node));
 175 
 176         list_insert_tail(&rdsv3_ib_devices, rds_ibdev);
 177 
 178         ib_set_client_data(device, &rdsv3_ib_client, rds_ibdev);
 179 
 180         RDSV3_DPRINTF2("rdsv3_ib_add_one", "Return: device: %p", device);
 181 
 182         goto free_attr;
 183 
 184 err_pd:
 185         (void) ib_dealloc_pd(rds_ibdev->pd);
 186 free_dev:
 187         mutex_destroy(&rds_ibdev->spinlock);
 188         rw_destroy(&rds_ibdev->rwlock);
 189         kmem_free(rds_ibdev, sizeof (*rds_ibdev));
 190 free_attr:
 191         kmem_free(dev_attr, sizeof (*dev_attr));
 192 }
 193 
 194 void
 195 rdsv3_ib_remove_one(struct ib_device *device)
 196 {
 197         struct rdsv3_ib_device *rds_ibdev;
 198         struct rdsv3_ib_ipaddr *i_ipaddr, *i_next;
 199 
 200         RDSV3_DPRINTF2("rdsv3_ib_remove_one", "device: %p", device);
 201 
 202         rds_ibdev = ib_get_client_data(device, &rdsv3_ib_client);
 203         if (!rds_ibdev)
 204                 return;
 205 
 206         RDSV3_FOR_EACH_LIST_NODE_SAFE(i_ipaddr, i_next, &rds_ibdev->ipaddr_list,
 207             list) {
 208                 list_remove_node(&i_ipaddr->list);
 209                 kmem_free(i_ipaddr, sizeof (*i_ipaddr));
 210         }
 211 
 212         rdsv3_ib_destroy_conns(rds_ibdev);
 213 
 214         if (rds_ibdev->fmr_soft_cq)
 215                 rdsv3_af_thr_destroy(rds_ibdev->fmr_soft_cq);
 216         if (rds_ibdev->inc_soft_cq)
 217                 rdsv3_af_thr_destroy(rds_ibdev->inc_soft_cq);
 218 
 219         rdsv3_ib_destroy_mr_pool(rds_ibdev);
 220         rdsv3_ib_destroy_inc_pool(rds_ibdev);
 221 
 222         kmem_cache_destroy(rds_ibdev->ib_frag_slab);
 223 
 224         rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp);
 225 
 226 #if 0
 227         while (ib_dealloc_pd(rds_ibdev->pd)) {
 228 #ifndef __lock_lint
 229                 RDSV3_DPRINTF5("rdsv3_ib_remove_one",
 230                     "%s-%d Failed to dealloc pd %p",
 231                     __func__, __LINE__, rds_ibdev->pd);
 232 #endif
 233                 delay(drv_usectohz(1000));
 234         }
 235 #else
 236         if (ib_dealloc_pd(rds_ibdev->pd)) {
 237 #ifndef __lock_lint
 238                 RDSV3_DPRINTF2("rdsv3_ib_remove_one",
 239                     "Failed to dealloc pd %p\n", rds_ibdev->pd);
 240 #endif
 241         }
 242 #endif
 243 
 244         list_destroy(&rds_ibdev->ipaddr_list);
 245         list_destroy(&rds_ibdev->conn_list);
 246         list_remove_node(&rds_ibdev->list);
 247         mutex_destroy(&rds_ibdev->spinlock);
 248         rw_destroy(&rds_ibdev->rwlock);
 249         kmem_free(rds_ibdev, sizeof (*rds_ibdev));
 250 
 251         RDSV3_DPRINTF2("rdsv3_ib_remove_one", "Return: device: %p", device);
 252 }
 253 
/*
 * IB client descriptor for RDSv3: names the client and wires up the
 * per-device add/remove callbacks defined above. It is passed to
 * ib_register_client()/ib_unregister_client() in rdsv3_ib_init() and
 * rdsv3_ib_exit(), and rdsv3_ib_init() fills in its dip field first.
 *
 * Two spellings of the same initializer are kept: designated initializers
 * for normal builds and a positional form when __lock_lint is defined.
 */
#ifndef __lock_lint
struct ib_client rdsv3_ib_client = {
        .name           = "rdsv3_ib",
        .add            = rdsv3_ib_add_one,
        .remove         = rdsv3_ib_remove_one,
        .clnt_hdl       = NULL,
        .state          = IB_CLNT_UNINITIALIZED
};
#else
/*
 * NOTE(review): positional form carries one more NULL than the designated
 * form names -- presumably clnt_hdl plus dip; confirm the field order
 * against the struct ib_client definition.
 */
struct ib_client rdsv3_ib_client = {
        "rdsv3_ib",
        rdsv3_ib_add_one,
        rdsv3_ib_remove_one,
        NULL,
        NULL,
        IB_CLNT_UNINITIALIZED
};
#endif
 272 
 273 static int
 274 rds_ib_conn_info_visitor(struct rdsv3_connection *conn,
 275     void *buffer)
 276 {
 277         struct rds_info_rdma_connection *iinfo = buffer;
 278         struct rdsv3_ib_connection *ic;
 279 
 280         RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p",
 281             conn, buffer);
 282 
 283         /* We will only ever look at IB transports */
 284         if (conn->c_trans != &rdsv3_ib_transport)
 285                 return (0);
 286 
 287         iinfo->src_addr = conn->c_laddr;
 288         iinfo->dst_addr = conn->c_faddr;
 289 
 290         (void) memset(&iinfo->src_gid, 0, sizeof (iinfo->src_gid));
 291         (void) memset(&iinfo->dst_gid, 0, sizeof (iinfo->dst_gid));
 292         if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) {
 293                 struct rdsv3_ib_device *rds_ibdev;
 294                 struct rdma_dev_addr *dev_addr;
 295 
 296                 ic = conn->c_transport_data;
 297                 dev_addr = &ic->i_cm_id->route.addr.dev_addr;
 298 
 299                 ib_addr_get_sgid(dev_addr, (union ib_gid *)&iinfo->src_gid);
 300                 ib_addr_get_dgid(dev_addr, (union ib_gid *)&iinfo->dst_gid);
 301 
 302                 rds_ibdev = ib_get_client_data(ic->i_cm_id->device,
 303                     &rdsv3_ib_client);
 304                 iinfo->max_send_wr = ic->i_send_ring.w_nr;
 305                 iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
 306                 iinfo->max_send_sge = rds_ibdev->max_sge;
 307         }
 308 
 309         RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p",
 310             conn, buffer);
 311         return (1);
 312 }
 313 
 314 static void
 315 rds_ib_ic_info(struct rsock *sock, unsigned int len,
 316     struct rdsv3_info_iterator *iter,
 317     struct rdsv3_info_lengths *lens)
 318 {
 319         RDSV3_DPRINTF4("rds_ib_ic_info", "sk: %p iter: %p, lens: %p, len: %d",
 320             sock, iter, lens, len);
 321 
 322         rdsv3_for_each_conn_info(sock, len, iter, lens,
 323             rds_ib_conn_info_visitor,
 324             sizeof (struct rds_info_rdma_connection));
 325 }
 326 
 327 /*
 328  * Early RDS/IB was built to only bind to an address if there is an IPoIB
 329  * device with that address set.
 330  *
 331  * If it were me, I'd advocate for something more flexible.  Sending and
 332  * receiving should be device-agnostic.  Transports would try and maintain
 333  * connections between peers who have messages queued.  Userspace would be
 334  * allowed to influence which paths have priority.  We could call userspace
 335  * asserting this policy "routing".
 336  */
 337 static int
 338 rds_ib_laddr_check(uint32_be_t addr)
 339 {
 340         int ret;
 341         struct rdma_cm_id *cm_id;
 342         struct sockaddr_in sin;
 343 
 344         RDSV3_DPRINTF4("rds_ib_laddr_check", "addr: %x", ntohl(addr));
 345 
 346         /*
 347          * Create a CMA ID and try to bind it. This catches both
 348          * IB and iWARP capable NICs.
 349          */
 350         cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP);
 351         if (!cm_id)
 352                 return (-EADDRNOTAVAIL);
 353 
 354         (void) memset(&sin, 0, sizeof (sin));
 355         sin.sin_family = AF_INET;
 356         sin.sin_addr.s_addr = rdsv3_scaddr_to_ibaddr(addr);
 357 
 358         /* rdma_bind_addr will only succeed for IB & iWARP devices */
 359         ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
 360         /*
 361          * due to this, we will claim to support iWARP devices unless we
 362          * check node_type.
 363          */
 364         if (ret || cm_id->device->node_type != RDMA_NODE_IB_CA)
 365                 ret = -EADDRNOTAVAIL;
 366 
 367         RDSV3_DPRINTF5("rds_ib_laddr_check",
 368             "addr %u.%u.%u.%u ret %d node type %d",
 369             NIPQUAD(addr), ret,
 370             cm_id->device ? cm_id->device->node_type : -1);
 371 
 372         rdma_destroy_id(cm_id);
 373 
 374         return (ret);
 375 }
 376 
 377 void
 378 rdsv3_ib_exit(void)
 379 {
 380         RDSV3_DPRINTF4("rds_ib_exit", "Enter");
 381 
 382         rdsv3_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
 383         rdsv3_ib_destroy_nodev_conns();
 384         ib_unregister_client(&rdsv3_ib_client);
 385         rdsv3_ib_sysctl_exit();
 386         rdsv3_ib_recv_exit();
 387         rdsv3_trans_unregister(&rdsv3_ib_transport);
 388         kmem_free(rdsv3_ib_stats,
 389             nr_cpus * sizeof (struct rdsv3_ib_statistics));
 390         mutex_destroy(&ib_nodev_conns_lock);
 391         list_destroy(&ib_nodev_conns);
 392         list_destroy(&rdsv3_ib_devices);
 393 
 394         RDSV3_DPRINTF4("rds_ib_exit", "Return");
 395 }
 396 
/*
 * Operations table describing the InfiniBand transport. It is handed to
 * rdsv3_trans_register()/rdsv3_trans_unregister() in rdsv3_ib_init() and
 * rdsv3_ib_exit(), and rds_ib_conn_info_visitor() compares a connection's
 * c_trans against its address to recognize IB connections.
 *
 * Only laddr_check and exit are implemented in this file; the remaining
 * handlers are defined elsewhere. xmit_cong_map is explicitly left NULL.
 * When __lock_lint is defined only an uninitialized definition is emitted.
 */
#ifndef __lock_lint
struct rdsv3_transport rdsv3_ib_transport = {
        .laddr_check            = rds_ib_laddr_check,
        .xmit_complete          = rdsv3_ib_xmit_complete,
        .xmit                   = rdsv3_ib_xmit,
        .xmit_cong_map          = NULL,
        .xmit_rdma              = rdsv3_ib_xmit_rdma,
        .recv                   = rdsv3_ib_recv,
        .conn_alloc             = rdsv3_ib_conn_alloc,
        .conn_free              = rdsv3_ib_conn_free,
        .conn_connect           = rdsv3_ib_conn_connect,
        .conn_shutdown          = rdsv3_ib_conn_shutdown,
        .inc_copy_to_user       = rdsv3_ib_inc_copy_to_user,
        .inc_free               = rdsv3_ib_inc_free,
        .cm_initiate_connect    = rdsv3_ib_cm_initiate_connect,
        .cm_handle_connect      = rdsv3_ib_cm_handle_connect,
        .cm_connect_complete    = rdsv3_ib_cm_connect_complete,
        .stats_info_copy        = rdsv3_ib_stats_info_copy,
        .exit                   = rdsv3_ib_exit,
        .get_mr                 = rdsv3_ib_get_mr,
        .sync_mr                = rdsv3_ib_sync_mr,
        .free_mr                = rdsv3_ib_free_mr,
        .flush_mrs              = rdsv3_ib_flush_mrs,
        .t_name                 = "infiniband",
        .t_type                 = RDS_TRANS_IB
};
#else
struct rdsv3_transport rdsv3_ib_transport;
#endif
 426 
 427 int
 428 rdsv3_ib_init(void)
 429 {
 430         int ret;
 431 
 432         RDSV3_DPRINTF4("rds_ib_init", "Enter");
 433 
 434         list_create(&rdsv3_ib_devices, sizeof (struct rdsv3_ib_device),
 435             offsetof(struct rdsv3_ib_device, list));
 436         list_create(&ib_nodev_conns, sizeof (struct rdsv3_ib_connection),
 437             offsetof(struct rdsv3_ib_connection, ib_node));
 438         mutex_init(&ib_nodev_conns_lock, NULL, MUTEX_DRIVER, NULL);
 439 
 440         /* allocate space for ib statistics */
 441         ASSERT(rdsv3_ib_stats == NULL);
 442         rdsv3_ib_stats = kmem_zalloc(nr_cpus *
 443             sizeof (struct rdsv3_ib_statistics), KM_SLEEP);
 444 
 445         rdsv3_ib_client.dip = rdsv3_dev_info;
 446         ret = ib_register_client(&rdsv3_ib_client);
 447         if (ret)
 448                 goto out;
 449 
 450         ret = rdsv3_ib_sysctl_init();
 451         if (ret)
 452                 goto out_ibreg;
 453 
 454         ret = rdsv3_ib_recv_init();
 455         if (ret)
 456                 goto out_sysctl;
 457 
 458         ret = rdsv3_trans_register(&rdsv3_ib_transport);
 459         if (ret)
 460                 goto out_recv;
 461 
 462         rdsv3_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
 463 
 464         RDSV3_DPRINTF4("rds_ib_init", "Return");
 465 
 466         return (0);
 467 
 468 out_recv:
 469         rdsv3_ib_recv_exit();
 470 out_sysctl:
 471         rdsv3_ib_sysctl_exit();
 472 out_ibreg:
 473         ib_unregister_client(&rdsv3_ib_client);
 474 out:
 475         kmem_free(rdsv3_ib_stats,
 476             nr_cpus * sizeof (struct rdsv3_ib_statistics));
 477         mutex_destroy(&ib_nodev_conns_lock);
 478         list_destroy(&ib_nodev_conns);
 479         list_destroy(&rdsv3_ib_devices);
 480         return (ret);
 481 }