1 /*
   2  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
   3  */
   4 
   5 /*
   6  * This file contains code imported from the OFED rds source file ib_cm.c
   7  * Oracle elects to have and use the contents of ib_cm.c under and governed
   8  * by the OpenIB.org BSD license (see below for full license text). However,
   9  * the following notice accompanied the original version of this file:
  10  */
  11 
  12 /*
  13  * Copyright (c) 2006 Oracle.  All rights reserved.
  14  *
  15  * This software is available to you under a choice of one of two
  16  * licenses.  You may choose to be licensed under the terms of the GNU
  17  * General Public License (GPL) Version 2, available from the file
  18  * COPYING in the main directory of this source tree, or the
  19  * OpenIB.org BSD license below:
  20  *
  21  *     Redistribution and use in source and binary forms, with or
  22  *     without modification, are permitted provided that the following
  23  *     conditions are met:
  24  *
  25  *      - Redistributions of source code must retain the above
  26  *        copyright notice, this list of conditions and the following
  27  *        disclaimer.
  28  *
  29  *      - Redistributions in binary form must reproduce the above
  30  *        copyright notice, this list of conditions and the following
  31  *        disclaimer in the documentation and/or other materials
  32  *        provided with the distribution.
  33  *
  34  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  35  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  36  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  37  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  38  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  39  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  40  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  41  * SOFTWARE.
  42  *
  43  */
  44 #include <sys/rds.h>
  45 
  46 #include <sys/ib/clients/of/ofed_kernel.h>
  47 #include <sys/ib/clients/of/rdma/ib_addr.h>
  48 #include <sys/ib/clients/of/rdma/rdma_cm.h>
  49 
  50 #include <sys/ib/clients/rdsv3/rdsv3.h>
  51 #include <sys/ib/clients/rdsv3/ib.h>
  52 #include <sys/ib/clients/rdsv3/rdsv3_debug.h>
  53 
  54 extern int rdsv3_enable_snd_cq;
  55 
  56 /*
  57  * Set the selected protocol version
  58  */
  59 static void
  60 rdsv3_ib_set_protocol(struct rdsv3_connection *conn, unsigned int version)
  61 {
  62         RDSV3_DPRINTF4("rdsv3_ib_set_protocol", "conn: %p version: %d",
  63             conn, version);
  64         conn->c_version = version;
  65 }
  66 
  67 /*
  68  * Set up flow control
  69  */
  70 static void
  71 rdsv3_ib_set_flow_control(struct rdsv3_connection *conn, uint32_t credits)
  72 {
  73         struct rdsv3_ib_connection *ic = conn->c_transport_data;
  74 
  75         RDSV3_DPRINTF2("rdsv3_ib_set_flow_control",
  76             "Enter: conn: %p credits: %d", conn, credits);
  77 
  78         if (rdsv3_ib_sysctl_flow_control && credits != 0) {
  79                 /* We're doing flow control */
  80                 ic->i_flowctl = 1;
  81                 rdsv3_ib_send_add_credits(conn, credits);
  82         } else {
  83                 ic->i_flowctl = 0;
  84         }
  85 
  86         RDSV3_DPRINTF2("rdsv3_ib_set_flow_control",
  87             "Return: conn: %p credits: %d",
  88             conn, credits);
  89 }
  90 
  91 /*
  92  * Tune RNR behavior. Without flow control, we use a rather
  93  * low timeout, but not the absolute minimum - this should
  94  * be tunable.
  95  *
  96  * We already set the RNR retry count to 7 (which is the
  97  * smallest infinite number :-) above.
  98  * If flow control is off, we want to change this back to 0
  99  * so that we learn quickly when our credit accounting is
 100  * buggy.
 101  *
 102  * Caller passes in a qp_attr pointer - don't waste stack spacv
 103  * by allocation this twice.
 104  */
 105 static void
 106 rdsv3_ib_tune_rnr(struct rdsv3_ib_connection *ic, struct ib_qp_attr *attr)
 107 {
 108         int ret;
 109 
 110         RDSV3_DPRINTF2("rdsv3_ib_tune_rnr", "Enter ic: %p attr: %p",
 111             ic, attr);
 112 
 113         attr->min_rnr_timer = IB_RNR_TIMER_000_32;
 114         ret = ib_modify_qp(ic->i_cm_id->qp, attr, IB_QP_MIN_RNR_TIMER);
 115         if (ret)
 116                 RDSV3_DPRINTF2("rdsv3_ib_tune_rnr",
 117                     "ib_modify_qp(IB_QP_MIN_RNR_TIMER): err=%d", -ret);
 118 }
 119 
 120 /*
 121  * Connection established.
 122  * We get here for both outgoing and incoming connection.
 123  */
 124 void
 125 rdsv3_ib_cm_connect_complete(struct rdsv3_connection *conn,
 126     struct rdma_cm_event *event)
 127 {
 128         const struct rdsv3_ib_connect_private *dp = NULL;
 129         struct rdsv3_ib_connection *ic = conn->c_transport_data;
 130         struct rdsv3_ib_device *rds_ibdev =
 131             ib_get_client_data(ic->i_cm_id->device, &rdsv3_ib_client);
 132         struct ib_qp_attr qp_attr;
 133         int err;
 134 
 135         RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete",
 136             "Enter conn: %p event: %p", conn, event);
 137 
 138         if (event->param.conn.private_data_len >= sizeof (*dp)) {
 139                 dp = event->param.conn.private_data;
 140 
 141                 /* make sure it isn't empty data */
 142                 if (dp->dp_protocol_major) {
 143                         rdsv3_ib_set_protocol(conn,
 144                             RDS_PROTOCOL(dp->dp_protocol_major,
 145                             dp->dp_protocol_minor));
 146                         rdsv3_ib_set_flow_control(conn,
 147                             ntohl(dp->dp_credit));
 148                 }
 149         }
 150 
 151         if (conn->c_version < RDS_PROTOCOL(3, 1)) {
 152                 RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete",
 153                     "RDS/IB: Connection to %u.%u.%u.%u version %u.%u failed",
 154                     NIPQUAD(conn->c_faddr),
 155                     RDS_PROTOCOL_MAJOR(conn->c_version),
 156                     RDS_PROTOCOL_MINOR(conn->c_version));
 157                 rdsv3_conn_destroy(conn);
 158                 return;
 159         } else {
 160                 RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete",
 161                     "RDS/IB: connected to %u.%u.%u.%u version %u.%u%s",
 162                     NIPQUAD(conn->c_faddr),
 163                     RDS_PROTOCOL_MAJOR(conn->c_version),
 164                     RDS_PROTOCOL_MINOR(conn->c_version),
 165                     ic->i_flowctl ? ", flow control" : "");
 166         }
 167 
 168         ASSERT(ic->i_soft_cq == NULL);
 169         ic->i_soft_cq = rdsv3_af_intr_thr_create(rdsv3_ib_tasklet_fn,
 170             (void *)ic, SCQ_INTR_BIND_CPU, rds_ibdev->aft_hcagp,
 171             ic->i_cq->ibt_cq);
 172         if (rdsv3_enable_snd_cq) {
 173                 ic->i_snd_soft_cq = rdsv3_af_intr_thr_create(
 174                     rdsv3_ib_snd_tasklet_fn,
 175                     (void *)ic, SCQ_INTR_BIND_CPU, rds_ibdev->aft_hcagp,
 176                     ic->i_snd_cq->ibt_cq);
 177         }
 178         /* rdsv3_ib_refill_fn is expecting i_max_recv_alloc set */
 179         ic->i_max_recv_alloc = rdsv3_ib_sysctl_max_recv_allocation;
 180         ic->i_refill_rq = rdsv3_af_thr_create(rdsv3_ib_refill_fn, (void *)conn,
 181             SCQ_WRK_BIND_CPU, rds_ibdev->aft_hcagp);
 182         rdsv3_af_grp_draw(rds_ibdev->aft_hcagp);
 183 
 184         (void) ib_req_notify_cq(ic->i_cq, IB_CQ_SOLICITED);
 185         if (rdsv3_enable_snd_cq) {
 186                 (void) ib_req_notify_cq(ic->i_snd_cq, IB_CQ_NEXT_COMP);
 187         }
 188 
 189         /*
 190          * Init rings and fill recv. this needs to wait until protocol
 191          * negotiation
 192          * is complete, since ring layout is different from 3.0 to 3.1.
 193          */
 194         rdsv3_ib_send_init_ring(ic);
 195         rdsv3_ib_recv_init_ring(ic);
 196         /*
 197          * Post receive buffers - as a side effect, this will update
 198          * the posted credit count.
 199          */
 200         (void) rdsv3_ib_recv_refill(conn, 1);
 201 
 202         /* Tune RNR behavior */
 203         rdsv3_ib_tune_rnr(ic, &qp_attr);
 204 
 205         qp_attr.qp_state = IB_QPS_RTS;
 206         err = ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE);
 207         if (err)
 208                 RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete",
 209                     "ib_modify_qp(IB_QP_STATE, RTS): err=%d", err);
 210 
 211         /* update ib_device with this local ipaddr & conn */
 212         err = rdsv3_ib_update_ipaddr(rds_ibdev, conn->c_laddr);
 213         if (err)
 214                 RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete",
 215                     "rdsv3_ib_update_ipaddr failed (%d)", err);
 216         rdsv3_ib_add_conn(rds_ibdev, conn);
 217 
 218         /*
 219          * If the peer gave us the last packet it saw, process this as if
 220          * we had received a regular ACK.
 221          */
 222         if (dp && dp->dp_ack_seq)
 223                 rdsv3_send_drop_acked(conn, ntohll(dp->dp_ack_seq), NULL);
 224 
 225         rdsv3_connect_complete(conn);
 226 
 227         RDSV3_DPRINTF2("rdsv3_ib_cm_connect_complete",
 228             "Return conn: %p event: %p",
 229             conn, event);
 230 }
 231 
 232 static void
 233 rdsv3_ib_cm_fill_conn_param(struct rdsv3_connection *conn,
 234     struct rdma_conn_param *conn_param,
 235     struct rdsv3_ib_connect_private *dp,
 236     uint32_t protocol_version,
 237     uint32_t max_responder_resources,
 238     uint32_t max_initiator_depth)
 239 {
 240         struct rdsv3_ib_connection *ic = conn->c_transport_data;
 241         struct rdsv3_ib_device *rds_ibdev;
 242 
 243         RDSV3_DPRINTF2("rdsv3_ib_cm_fill_conn_param",
 244             "Enter conn: %p conn_param: %p private: %p version: %d",
 245             conn, conn_param, dp, protocol_version);
 246 
 247         (void) memset(conn_param, 0, sizeof (struct rdma_conn_param));
 248 
 249         rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rdsv3_ib_client);
 250 
 251         conn_param->responder_resources =
 252             MIN(rds_ibdev->max_responder_resources, max_responder_resources);
 253         conn_param->initiator_depth =
 254             MIN(rds_ibdev->max_initiator_depth, max_initiator_depth);
 255         conn_param->retry_count = min(rdsv3_ib_retry_count, 7);
 256         conn_param->rnr_retry_count = 7;
 257 
 258         if (dp) {
 259                 (void) memset(dp, 0, sizeof (*dp));
 260                 dp->dp_saddr = conn->c_laddr;
 261                 dp->dp_daddr = conn->c_faddr;
 262                 dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version);
 263                 dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version);
 264                 dp->dp_protocol_minor_mask =
 265                     htons(RDSV3_IB_SUPPORTED_PROTOCOLS);
 266                 dp->dp_ack_seq = rdsv3_ib_piggyb_ack(ic);
 267 
 268                 /* Advertise flow control */
 269                 if (ic->i_flowctl) {
 270                         unsigned int credits;
 271 
 272                         credits = IB_GET_POST_CREDITS(
 273                             atomic_get(&ic->i_credits));
 274                         dp->dp_credit = htonl(credits);
 275                         atomic_add_32(&ic->i_credits,
 276                             -IB_SET_POST_CREDITS(credits));
 277                 }
 278 
 279                 conn_param->private_data = dp;
 280                 conn_param->private_data_len = sizeof (*dp);
 281         }
 282 
 283         RDSV3_DPRINTF2("rdsv3_ib_cm_fill_conn_param",
 284             "Return conn: %p conn_param: %p private: %p version: %d",
 285             conn, conn_param, dp, protocol_version);
 286 }
 287 
 288 static void
 289 rdsv3_ib_cq_event_handler(struct ib_event *event, void *data)
 290 {
 291         RDSV3_DPRINTF3("rdsv3_ib_cq_event_handler", "event %u data %p",
 292             event->event, data);
 293 }
 294 
 295 static void
 296 rdsv3_ib_snd_cq_comp_handler(struct ib_cq *cq, void *context)
 297 {
 298         struct rdsv3_connection *conn = context;
 299         struct rdsv3_ib_connection *ic = conn->c_transport_data;
 300 
 301         RDSV3_DPRINTF4("rdsv3_ib_snd_cq_comp_handler",
 302             "Enter(conn: %p ic: %p cq: %p)", conn, ic, cq);
 303 
 304         rdsv3_af_thr_fire(ic->i_snd_soft_cq);
 305 }
 306 
 307 void
 308 rdsv3_ib_snd_tasklet_fn(void *data)
 309 {
 310         struct rdsv3_ib_connection *ic = (struct rdsv3_ib_connection *)data;
 311         struct rdsv3_connection *conn = ic->conn;
 312         struct rdsv3_ib_ack_state ack_state = { 0, };
 313         ibt_wc_t wc;
 314         uint_t polled;
 315 
 316         RDSV3_DPRINTF4("rdsv3_ib_snd_tasklet_fn",
 317             "Enter(conn: %p ic: %p)", conn, ic);
 318 
 319         /*
 320          * Poll in a loop before and after enabling the next event
 321          */
 322         while (ibt_poll_cq(RDSV3_CQ2CQHDL(ic->i_snd_cq), &wc, 1, &polled) ==
 323             IBT_SUCCESS) {
 324                 RDSV3_DPRINTF4("rdsv3_ib_snd_tasklet_fn",
 325                     "wc_id 0x%llx type %d status %u byte_len %u imm_data %u\n",
 326                     (unsigned long long)wc.wc_id, wc.wc_type, wc.wc_status,
 327                     wc.wc_bytes_xfer, ntohl(wc.wc_immed_data));
 328 
 329                 ASSERT(wc.wc_id & RDSV3_IB_SEND_OP);
 330                 rdsv3_ib_send_cqe_handler(ic, &wc);
 331         }
 332         (void) ibt_enable_cq_notify(RDSV3_CQ2CQHDL(ic->i_snd_cq),
 333             IBT_NEXT_COMPLETION);
 334         while (ibt_poll_cq(RDSV3_CQ2CQHDL(ic->i_snd_cq), &wc, 1, &polled) ==
 335             IBT_SUCCESS) {
 336                 RDSV3_DPRINTF4("rdsv3_ib_snd_tasklet_fn",
 337                     "wc_id 0x%llx type %d status %u byte_len %u imm_data %u\n",
 338                     (unsigned long long)wc.wc_id, wc.wc_type, wc.wc_status,
 339                     wc.wc_bytes_xfer, ntohl(wc.wc_immed_data));
 340 
 341                 ASSERT(wc.wc_id & RDSV3_IB_SEND_OP);
 342                 rdsv3_ib_send_cqe_handler(ic, &wc);
 343         }
 344 }
 345 
 346 static void
 347 rdsv3_ib_cq_comp_handler(struct ib_cq *cq, void *context)
 348 {
 349         struct rdsv3_connection *conn = context;
 350         struct rdsv3_ib_connection *ic = conn->c_transport_data;
 351 
 352         RDSV3_DPRINTF4("rdsv3_ib_cq_comp_handler",
 353             "Enter(conn: %p cq: %p)", conn, cq);
 354 
 355         rdsv3_ib_stats_inc(s_ib_evt_handler_call);
 356 
 357         rdsv3_af_thr_fire(ic->i_soft_cq);
 358 }
 359 
 360 void
 361 rdsv3_ib_refill_fn(void *data)
 362 {
 363         struct rdsv3_connection *conn = (struct rdsv3_connection *)data;
 364 
 365         (void) rdsv3_ib_recv_refill(conn, 0);
 366 }
 367 
 368 void
 369 rdsv3_ib_tasklet_fn(void *data)
 370 {
 371         struct rdsv3_ib_connection *ic = (struct rdsv3_ib_connection *)data;
 372         struct rdsv3_connection *conn = ic->conn;
 373         struct rdsv3_ib_ack_state ack_state = { 0, };
 374         ibt_wc_t wc[RDSV3_IB_WC_POLL_SIZE];
 375         uint_t polled;
 376         int i;
 377 
 378         RDSV3_DPRINTF4("rdsv3_ib_tasklet_fn",
 379             "Enter(conn: %p ic: %p)", conn, ic);
 380 
 381         rdsv3_ib_stats_inc(s_ib_tasklet_call);
 382 
 383         /*
 384          * Poll in a loop before and after enabling the next event
 385          */
 386         while (ibt_poll_cq(RDSV3_CQ2CQHDL(ic->i_cq), &wc[0],
 387             RDSV3_IB_WC_POLL_SIZE, &polled) == IBT_SUCCESS) {
 388                 for (i = 0; i < polled; i++) {
 389                         RDSV3_DPRINTF4("rdsv3_ib_tasklet_fn",
 390                         "wc_id 0x%llx type %d status %u byte_len %u \
 391                             imm_data %u\n",
 392                             (unsigned long long)wc[i].wc_id, wc[i].wc_type,
 393                             wc[i].wc_status, wc[i].wc_bytes_xfer,
 394                             ntohl(wc[i].wc_immed_data));
 395 
 396                         if (wc[i].wc_id & RDSV3_IB_SEND_OP) {
 397                                 rdsv3_ib_send_cqe_handler(ic, &wc[i]);
 398                         } else {
 399                                 rdsv3_ib_recv_cqe_handler(ic, &wc[i],
 400                                     &ack_state);
 401                         }
 402                 }
 403         }
 404         (void) ibt_enable_cq_notify(RDSV3_CQ2CQHDL(ic->i_cq),
 405             IBT_NEXT_SOLICITED);
 406         while (ibt_poll_cq(RDSV3_CQ2CQHDL(ic->i_cq), &wc[0],
 407             RDSV3_IB_WC_POLL_SIZE, &polled) == IBT_SUCCESS) {
 408                 for (i = 0; i < polled; i++) {
 409                         RDSV3_DPRINTF4("rdsv3_ib_tasklet_fn",
 410                         "wc_id 0x%llx type %d status %u byte_len %u \
 411                             imm_data %u\n",
 412                             (unsigned long long)wc[i].wc_id, wc[i].wc_type,
 413                             wc[i].wc_status, wc[i].wc_bytes_xfer,
 414                             ntohl(wc[i].wc_immed_data));
 415 
 416                         if (wc[i].wc_id & RDSV3_IB_SEND_OP) {
 417                                 rdsv3_ib_send_cqe_handler(ic, &wc[i]);
 418                         } else {
 419                                 rdsv3_ib_recv_cqe_handler(ic, &wc[i],
 420                                     &ack_state);
 421                         }
 422                 }
 423         }
 424 
 425         if (ack_state.ack_next_valid) {
 426                 rdsv3_ib_set_ack(ic, ack_state.ack_next,
 427                     ack_state.ack_required);
 428         }
 429         if (ack_state.ack_recv_valid && ack_state.ack_recv > ic->i_ack_recv) {
 430                 rdsv3_send_drop_acked(conn, ack_state.ack_recv, NULL);
 431                 ic->i_ack_recv = ack_state.ack_recv;
 432         }
 433         if (rdsv3_conn_up(conn)) {
 434                 if (!test_bit(RDSV3_LL_SEND_FULL, &conn->c_flags))
 435                         (void) rdsv3_send_xmit(ic->conn);
 436                 rdsv3_ib_attempt_ack(ic);
 437         }
 438 }
 439 
 440 static void
 441 rdsv3_ib_qp_event_handler(struct ib_event *event, void *data)
 442 {
 443         struct rdsv3_connection *conn = data;
 444         struct rdsv3_ib_connection *ic = conn->c_transport_data;
 445 
 446         RDSV3_DPRINTF2("rdsv3_ib_qp_event_handler", "conn %p ic %p event %u",
 447             conn, ic, event->event);
 448 
 449         switch (event->event) {
 450         case IB_EVENT_COMM_EST:
 451                 (void) rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
 452                 break;
 453         default:
 454                 if (conn) {
 455                         RDSV3_DPRINTF2("rdsv3_ib_qp_event_handler",
 456                             "RDS/IB: Fatal QP Event %u - "
 457                             "connection %u.%u.%u.%u ->%u.%u.%u.%u "
 458                             "...reconnecting",
 459                             event->event, NIPQUAD(conn->c_laddr),
 460                             NIPQUAD(conn->c_faddr));
 461                         rdsv3_conn_drop(conn);
 462                 } else {
 463                         RDSV3_DPRINTF2("rdsv3_ib_qp_event_handler",
 464                             "RDS/IB: Fatal QP Event %u - connection"
 465                             "...reconnecting", event->event);
 466                 }
 467                 break;
 468         }
 469 
 470         RDSV3_DPRINTF2("rdsv3_ib_qp_event_handler", "Return conn: %p event: %p",
 471             conn, event);
 472 }
 473 
 474 extern int rdsv3_ib_alloc_hdrs(ib_device_t *dev,
 475     struct rdsv3_ib_connection *ic);
 476 extern void rdsv3_ib_free_hdrs(ib_device_t *dev,
 477     struct rdsv3_ib_connection *ic);
 478 
 479 /*
 480  * This needs to be very careful to not leave IS_ERR pointers around for
 481  * cleanup to trip over.
 482  */
 483 static int
 484 rdsv3_ib_setup_qp(struct rdsv3_connection *conn)
 485 {
 486         struct rdsv3_ib_connection *ic = conn->c_transport_data;
 487         struct ib_device *dev = ic->i_cm_id->device;
 488         struct ib_qp_init_attr attr;
 489         struct rdsv3_ib_device *rds_ibdev;
 490         ibt_send_wr_t *wrp;
 491         ibt_wr_ds_t *sgl;
 492         int ret, i;
 493 
 494         RDSV3_DPRINTF2("rdsv3_ib_setup_qp", "Enter conn: %p", conn);
 495 
 496         /*
 497          * rdsv3_ib_add_one creates a rdsv3_ib_device object per IB device,
 498          * and allocates a protection domain, memory range and FMR pool
 499          * for each.  If that fails for any reason, it will not register
 500          * the rds_ibdev at all.
 501          */
 502         rds_ibdev = ib_get_client_data(dev, &rdsv3_ib_client);
 503         if (!rds_ibdev) {
 504                 RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
 505                     "RDS/IB: No client_data for device %s", dev->name);
 506                 return (-EOPNOTSUPP);
 507         }
 508         ic->rds_ibdev = rds_ibdev;
 509 
 510         if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1)
 511                 rdsv3_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1);
 512         if (rds_ibdev->max_wrs < ic->i_recv_ring.w_nr + 1)
 513                 rdsv3_ib_ring_resize(&ic->i_recv_ring, rds_ibdev->max_wrs - 1);
 514 
 515         /* Protection domain and memory range */
 516         ic->i_pd = rds_ibdev->pd;
 517 
 518         /*
 519          * IB_CQ_VECTOR_LEAST_ATTACHED and/or the corresponding feature is
 520          * not implmeneted in Hermon yet, but we can pass it to ib_create_cq()
 521          * anyway.
 522          */
 523         ic->i_cq = ib_create_cq(dev, rdsv3_ib_cq_comp_handler,
 524             rdsv3_ib_cq_event_handler, conn,
 525             ic->i_recv_ring.w_nr + ic->i_send_ring.w_nr + 1,
 526             rdsv3_af_grp_get_sched(ic->rds_ibdev->aft_hcagp));
 527         if (IS_ERR(ic->i_cq)) {
 528                 ret = PTR_ERR(ic->i_cq);
 529                 ic->i_cq = NULL;
 530                 RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
 531                     "ib_create_cq failed: %d", ret);
 532                 goto out;
 533         }
 534         if (rdsv3_enable_snd_cq) {
 535                 ic->i_snd_cq = ib_create_cq(dev, rdsv3_ib_snd_cq_comp_handler,
 536                     rdsv3_ib_cq_event_handler, conn, ic->i_send_ring.w_nr + 1,
 537                     rdsv3_af_grp_get_sched(ic->rds_ibdev->aft_hcagp));
 538                 if (IS_ERR(ic->i_snd_cq)) {
 539                         ret = PTR_ERR(ic->i_snd_cq);
 540                         (void) ib_destroy_cq(ic->i_cq);
 541                         ic->i_cq = NULL;
 542                         ic->i_snd_cq = NULL;
 543                         RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
 544                             "ib_create_cq send cq failed: %d", ret);
 545                         goto out;
 546                 }
 547         }
 548 
 549         /* XXX negotiate max send/recv with remote? */
 550         (void) memset(&attr, 0, sizeof (attr));
 551         attr.event_handler = rdsv3_ib_qp_event_handler;
 552         attr.qp_context = conn;
 553         /* + 1 to allow for the single ack message */
 554         attr.cap.max_send_wr = ic->i_send_ring.w_nr + 1;
 555         attr.cap.max_recv_wr = ic->i_recv_ring.w_nr + 1;
 556         attr.cap.max_send_sge = rds_ibdev->max_sge;
 557         attr.cap.max_recv_sge = RDSV3_IB_RECV_SGE;
 558         attr.sq_sig_type = IB_SIGNAL_REQ_WR;
 559         attr.qp_type = IB_QPT_RC;
 560         if (rdsv3_enable_snd_cq) {
 561                 attr.send_cq = ic->i_snd_cq;
 562         } else {
 563                 attr.send_cq = ic->i_cq;
 564         }
 565         attr.recv_cq = ic->i_cq;
 566 
 567         /*
 568          * XXX this can fail if max_*_wr is too large?  Are we supposed
 569          * to back off until we get a value that the hardware can support?
 570          */
 571         ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr);
 572         if (ret) {
 573                 RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
 574                     "rdma_create_qp failed: %d", ret);
 575                 goto out;
 576         }
 577 
 578         ret = rdsv3_ib_alloc_hdrs(dev, ic);
 579         if (ret != 0) {
 580                 ret = -ENOMEM;
 581                 RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
 582                     "rdsv3_ib_alloc_hdrs failed: %d", ret);
 583                 goto out;
 584         }
 585 
 586         ic->i_sends = kmem_alloc(ic->i_send_ring.w_nr *
 587             sizeof (struct rdsv3_ib_send_work), KM_NOSLEEP);
 588         if (ic->i_sends == NULL) {
 589                 ret = -ENOMEM;
 590                 RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
 591                     "send allocation failed: %d", ret);
 592                 goto out;
 593         }
 594         (void) memset(ic->i_sends, 0, ic->i_send_ring.w_nr *
 595             sizeof (struct rdsv3_ib_send_work));
 596 
 597         ic->i_send_wrs =
 598             kmem_alloc(ic->i_send_ring.w_nr * (sizeof (ibt_send_wr_t) +
 599             RDSV3_IB_MAX_SGE * sizeof (ibt_wr_ds_t)), KM_NOSLEEP);
 600         if (ic->i_send_wrs == NULL) {
 601                 ret = -ENOMEM;
 602                 RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
 603                     "Send WR allocation failed: %d", ret);
 604                 goto out;
 605         }
 606         sgl = (ibt_wr_ds_t *)((uint8_t *)ic->i_send_wrs +
 607             (ic->i_send_ring.w_nr * sizeof (ibt_send_wr_t)));
 608         for (i = 0; i < ic->i_send_ring.w_nr; i++) {
 609                 wrp = &ic->i_send_wrs[i];
 610                 wrp->wr_sgl = &sgl[i * RDSV3_IB_MAX_SGE];
 611         }
 612 
 613         ic->i_recvs = kmem_alloc(ic->i_recv_ring.w_nr *
 614             sizeof (struct rdsv3_ib_recv_work), KM_NOSLEEP);
 615         if (ic->i_recvs == NULL) {
 616                 ret = -ENOMEM;
 617                 RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
 618                     "recv allocation failed: %d", ret);
 619                 goto out;
 620         }
 621         (void) memset(ic->i_recvs, 0, ic->i_recv_ring.w_nr *
 622             sizeof (struct rdsv3_ib_recv_work));
 623 
 624         ic->i_recv_wrs =
 625             kmem_alloc(ic->i_recv_ring.w_nr * sizeof (ibt_recv_wr_t),
 626             KM_NOSLEEP);
 627         if (ic->i_recv_wrs == NULL) {
 628                 ret = -ENOMEM;
 629                 RDSV3_DPRINTF2("rdsv3_ib_setup_qp",
 630                     "Recv WR allocation failed: %d", ret);
 631                 goto out;
 632         }
 633 
 634         rdsv3_ib_recv_init_ack(ic);
 635 
 636         RDSV3_DPRINTF2("rdsv3_ib_setup_qp", "conn %p pd %p mr %p cq %p",
 637             conn, ic->i_pd, ic->i_mr, ic->i_cq);
 638 
 639 out:
 640         return (ret);
 641 }
 642 
 643 static uint32_t
 644 rdsv3_ib_protocol_compatible(struct rdma_cm_event *event)
 645 {
 646         const struct rdsv3_ib_connect_private *dp =
 647             event->param.conn.private_data;
 648         uint16_t common;
 649         uint32_t version = 0;
 650 
 651         RDSV3_DPRINTF2("rdsv3_ib_protocol_compatible", "Enter event: %p",
 652             event);
 653 
 654         /*
 655          * rdma_cm private data is odd - when there is any private data in the
 656          * request, we will be given a pretty large buffer without telling us
 657          * the
 658          * original size. The only way to tell the difference is by looking at
 659          * the contents, which are initialized to zero.
 660          * If the protocol version fields aren't set,
 661          * this is a connection attempt
 662          * from an older version. This could could be 3.0 or 2.0 -
 663          * we can't tell.
 664          * We really should have changed this for OFED 1.3 :-(
 665          */
 666 
 667         /* Be paranoid. RDS always has privdata */
 668         if (!event->param.conn.private_data_len) {
 669                 RDSV3_DPRINTF2("rdsv3_ib_protocol_compatible",
 670                     "RDS incoming connection has no private data, rejecting");
 671                 return (0);
 672         }
 673 
 674         /* Even if len is crap *now* I still want to check it. -ASG */
 675         if (event->param.conn.private_data_len < sizeof (*dp) ||
 676             dp->dp_protocol_major == 0)
 677                 return (RDS_PROTOCOL_3_0);
 678 
 679         common = ntohs(dp->dp_protocol_minor_mask) &
 680             RDSV3_IB_SUPPORTED_PROTOCOLS;
 681         if (dp->dp_protocol_major == 3 && common) {
 682                 version = RDS_PROTOCOL_3_0;
 683                 while ((common >>= 1) != 0)
 684                         version++;
 685         } else {
 686                 RDSV3_DPRINTF2("rdsv3_ib_protocol_compatible",
 687                     "RDS: Connection from %u.%u.%u.%u using "
 688                     "incompatible protocol version %u.%u\n",
 689                     NIPQUAD(dp->dp_saddr),
 690                     dp->dp_protocol_major,
 691                     dp->dp_protocol_minor);
 692         }
 693 
 694         RDSV3_DPRINTF2("rdsv3_ib_protocol_compatible", "Return event: %p",
 695             event);
 696 
 697         return (version);
 698 }
 699 
 700 int
 701 rdsv3_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
 702     struct rdma_cm_event *event)
 703 {
 704         uint64_be_t lguid = cm_id->route.path_rec->sgid.global.interface_id;
 705         uint64_be_t fguid = cm_id->route.path_rec->dgid.global.interface_id;
 706         const struct rdsv3_ib_connect_private *dp =
 707             event->param.conn.private_data;
 708         struct rdsv3_ib_connect_private dp_rep;
 709         struct rdsv3_connection *conn = NULL;
 710         struct rdsv3_ib_connection *ic = NULL;
 711         struct rdma_conn_param conn_param;
 712         uint32_t version;
 713         int err, destroy = 1;
 714         boolean_t conn_created = B_FALSE;
 715 
 716         RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
 717             "Enter cm_id: %p event: %p", cm_id, event);
 718 
 719         /* Check whether the remote protocol version matches ours. */
 720         version = rdsv3_ib_protocol_compatible(event);
 721         if (!version) {
 722                 RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
 723                     "version mismatch");
 724                 goto out;
 725         }
 726 
 727         RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
 728             "saddr %u.%u.%u.%u daddr %u.%u.%u.%u RDSv%d.%d lguid 0x%llx fguid "
 729             "0x%llx", NIPQUAD(dp->dp_saddr), NIPQUAD(dp->dp_daddr),
 730             RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version),
 731             (unsigned long long)ntohll(lguid),
 732             (unsigned long long)ntohll(fguid));
 733 
 734         conn = rdsv3_conn_create(dp->dp_daddr, dp->dp_saddr,
 735             &rdsv3_ib_transport, KM_NOSLEEP);
 736         if (IS_ERR(conn)) {
 737                 RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
 738                     "rdsv3_conn_create failed (%ld)", PTR_ERR(conn));
 739                 conn = NULL;
 740                 goto out;
 741         }
 742 
 743         /*
 744          * The connection request may occur while the
 745          * previous connection exist, e.g. in case of failover.
 746          * But as connections may be initiated simultaneously
 747          * by both hosts, we have a random backoff mechanism -
 748          * see the comment above rdsv3_queue_reconnect()
 749          */
 750         mutex_enter(&conn->c_cm_lock);
 751         if (!rdsv3_conn_transition(conn, RDSV3_CONN_DOWN,
 752             RDSV3_CONN_CONNECTING)) {
 753                 if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) {
 754                         RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
 755                             "incoming connect when connected: %p",
 756                             conn);
 757                         rdsv3_conn_drop(conn);
 758                         rdsv3_ib_stats_inc(s_ib_listen_closed_stale);
 759                         mutex_exit(&conn->c_cm_lock);
 760                         goto out;
 761                 } else if (rdsv3_conn_state(conn) == RDSV3_CONN_CONNECTING) {
 762                         /* Wait and see - our connect may still be succeeding */
 763                         RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
 764                             "peer-to-peer connection request: %p, "
 765                             "lguid: 0x%llx fguid: 0x%llx",
 766                             conn, lguid, fguid);
 767                         rdsv3_ib_stats_inc(s_ib_connect_raced);
 768                 }
 769                 mutex_exit(&conn->c_cm_lock);
 770                 goto out;
 771         }
 772 
 773         ic = conn->c_transport_data;
 774 
 775         rdsv3_ib_set_protocol(conn, version);
 776         rdsv3_ib_set_flow_control(conn, ntohl(dp->dp_credit));
 777 
 778         /*
 779          * If the peer gave us the last packet it saw, process this as if
 780          * we had received a regular ACK.
 781          */
 782         if (dp->dp_ack_seq)
 783                 rdsv3_send_drop_acked(conn, ntohll(dp->dp_ack_seq), NULL);
 784 
 785         ASSERT(!cm_id->context);
 786         ASSERT(!ic->i_cm_id);
 787 
 788         if (ic->i_cm_id != NULL)
 789                 RDSV3_PANIC();
 790 
 791         ic->i_cm_id = cm_id;
 792         cm_id->context = conn;
 793 
 794         /*
 795          * We got halfway through setting up the ib_connection, if we
 796          * fail now, we have to take the long route out of this mess.
 797          */
 798         destroy = 0;
 799 
 800         err = rdsv3_ib_setup_qp(conn);
 801         if (err) {
 802                 RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
 803                     "rdsv3_ib_setup_qp failed (%d)", err);
 804                 mutex_exit(&conn->c_cm_lock);
 805                 rdsv3_conn_drop(conn);
 806                 goto out;
 807         }
 808 
 809         rdsv3_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version,
 810             event->param.conn.responder_resources,
 811             event->param.conn.initiator_depth);
 812 
 813         /* rdma_accept() calls rdma_reject() internally if it fails */
 814         err = rdma_accept(cm_id, &conn_param);
 815         mutex_exit(&conn->c_cm_lock);
 816         if (err) {
 817                 RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
 818                     "rdma_accept failed (%d)", err);
 819                 rdsv3_conn_drop(conn);
 820                 goto out;
 821         }
 822 
 823         RDSV3_DPRINTF2("rdsv3_ib_cm_handle_connect",
 824             "Return cm_id: %p event: %p", cm_id, event);
 825 
 826         return (0);
 827 
 828 out:
 829         (void) rdma_reject(cm_id, NULL, 0);
 830         return (destroy);
 831 }
 832 
 833 
 834 int
 835 rdsv3_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
 836 {
 837         struct rdsv3_connection *conn = cm_id->context;
 838         struct rdsv3_ib_connection *ic = conn->c_transport_data;
 839         struct rdma_conn_param conn_param;
 840         struct rdsv3_ib_connect_private dp;
 841         int ret;
 842 
 843         RDSV3_DPRINTF2("rdsv3_ib_cm_initiate_connect", "Enter: cm_id: %p",
 844             cm_id);
 845 
 846         /*
 847          * If the peer doesn't do protocol negotiation, we must
 848          * default to RDSv3.0
 849          */
 850         rdsv3_ib_set_protocol(conn, RDS_PROTOCOL_3_0);
 851         ic->i_flowctl =
 852             rdsv3_ib_sysctl_flow_control;       /* advertise flow control */
 853 
 854         ret = rdsv3_ib_setup_qp(conn);
 855         if (ret) {
 856                 RDSV3_DPRINTF2("rdsv3_ib_cm_initiate_connect",
 857                     "rdsv3_ib_setup_qp failed (%d)", ret);
 858                 rdsv3_conn_drop(conn);
 859                 goto out;
 860         }
 861 
 862         rdsv3_ib_cm_fill_conn_param(conn, &conn_param, &dp,
 863             RDS_PROTOCOL_VERSION, UINT_MAX, UINT_MAX);
 864 
 865         ret = rdma_connect(cm_id, &conn_param);
 866         if (ret) {
 867                 RDSV3_DPRINTF2("rdsv3_ib_cm_initiate_connect",
 868                     "rdma_connect failed (%d)", ret);
 869                 rdsv3_conn_drop(conn);
 870         }
 871 
 872         RDSV3_DPRINTF2("rdsv3_ib_cm_initiate_connect",
 873             "Return: cm_id: %p", cm_id);
 874 
 875 out:
 876         /*
 877          * Beware - returning non-zero tells the rdma_cm to destroy
 878          * the cm_id. We should certainly not do it as long as we still
 879          * "own" the cm_id.
 880          */
 881         if (ret) {
 882                 if (ic->i_cm_id == cm_id)
 883                         ret = 0;
 884         }
 885         return (ret);
 886 }
 887 
 888 int
 889 rdsv3_ib_conn_connect(struct rdsv3_connection *conn)
 890 {
 891         struct rdsv3_ib_connection *ic = conn->c_transport_data;
 892         struct sockaddr_in src, dest;
 893         ipaddr_t        laddr, faddr;
 894         int ret;
 895 
 896         RDSV3_DPRINTF2("rdsv3_ib_conn_connect", "Enter: conn: %p", conn);
 897 
 898         /*
 899          * XXX I wonder what affect the port space has
 900          */
 901         /* delegate cm event handler to rdma_transport */
 902         ic->i_cm_id = rdma_create_id(rdsv3_rdma_cm_event_handler, conn,
 903             RDMA_PS_TCP);
 904         if (IS_ERR(ic->i_cm_id)) {
 905                 ret = PTR_ERR(ic->i_cm_id);
 906                 ic->i_cm_id = NULL;
 907                 RDSV3_DPRINTF2("rdsv3_ib_conn_connect",
 908                     "rdma_create_id() failed: %d", ret);
 909                 goto out;
 910         }
 911 
 912         RDSV3_DPRINTF3("rdsv3_ib_conn_connect",
 913             "created cm id %p for conn %p", ic->i_cm_id, conn);
 914 
 915         /* The ipaddr should be in the network order */
 916         laddr = conn->c_laddr;
 917         faddr = conn->c_faddr;
 918         ret = rdsv3_sc_path_lookup(&laddr, &faddr);
 919         if (ret == 0) {
 920                 RDSV3_DPRINTF2(LABEL, "Path not found (0x%x 0x%x)",
 921                     ntohl(laddr), ntohl(faddr));
 922         }
 923 
 924         src.sin_family = AF_INET;
 925         src.sin_addr.s_addr = (uint32_t)laddr;
 926         src.sin_port = (uint16_t)htons(0);
 927 
 928         dest.sin_family = AF_INET;
 929         dest.sin_addr.s_addr = (uint32_t)faddr;
 930         dest.sin_port = (uint16_t)htons(RDSV3_PORT);
 931 
 932         ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src,
 933             (struct sockaddr *)&dest,
 934             RDSV3_RDMA_RESOLVE_TIMEOUT_MS);
 935         if (ret) {
 936                 RDSV3_DPRINTF2("rdsv3_ib_conn_connect",
 937                     "addr resolve failed for cm id %p: %d", ic->i_cm_id, ret);
 938                 rdma_destroy_id(ic->i_cm_id);
 939                 ic->i_cm_id = NULL;
 940         }
 941 
 942         RDSV3_DPRINTF2("rdsv3_ib_conn_connect", "Return: conn: %p", conn);
 943 
 944 out:
 945         return (ret);
 946 }
 947 
 948 /*
 949  * This is so careful about only cleaning up resources that were built up
 950  * so that it can be called at any point during startup.  In fact it
 951  * can be called multiple times for a given connection.
 952  */
 953 void
 954 rdsv3_ib_conn_shutdown(struct rdsv3_connection *conn)
 955 {
 956         struct rdsv3_ib_connection *ic = conn->c_transport_data;
 957         int err = 0;
 958 
 959         RDSV3_DPRINTF2("rdsv3_ib_conn_shutdown",
 960             "cm %p pd %p cq %p qp %p", ic->i_cm_id,
 961             ic->i_pd, ic->i_cq, ic->i_cm_id ? ic->i_cm_id->qp : NULL);
 962 
 963         if (ic->i_cm_id) {
 964                 struct ib_device *dev = ic->i_cm_id->device;
 965 
 966                 RDSV3_DPRINTF2("rdsv3_ib_conn_shutdown",
 967                     "disconnecting cm %p", ic->i_cm_id);
 968                 err = rdma_disconnect(ic->i_cm_id);
 969                 if (err) {
 970                         /*
 971                          * Actually this may happen quite frequently, when
 972                          * an outgoing connect raced with an incoming connect.
 973                          */
 974                         RDSV3_DPRINTF2("rdsv3_ib_conn_shutdown",
 975                             "failed to disconnect, cm: %p err %d",
 976                             ic->i_cm_id, err);
 977                 }
 978 
 979                 if (ic->i_cm_id->qp) {
 980                         (void) ibt_flush_qp(
 981                             ib_get_ibt_channel_hdl(ic->i_cm_id));
 982                         /*
 983                          * Don't wait for the send ring to be empty -- there
 984                          * may be completed non-signaled entries sitting on
 985                          * there. We unmap these below.
 986                          */
 987                         rdsv3_wait_event(&ic->i_recv_ring.w_empty_wait,
 988                             rdsv3_ib_ring_empty(&ic->i_recv_ring));
 989                         /*
 990                          * Note that Linux original code calls
 991                          * rdma_destroy_qp() after rdsv3_ib_recv_clear_ring(ic).
 992                          */
 993                         rdma_destroy_qp(ic->i_cm_id);
 994                 }
 995 
 996                 if (rdsv3_enable_snd_cq) {
 997                         if (ic->i_snd_soft_cq) {
 998                                 rdsv3_af_thr_destroy(ic->i_snd_soft_cq);
 999                                 ic->i_snd_soft_cq = NULL;
1000                         }
1001                         if (ic->i_snd_cq)
1002                                 (void) ib_destroy_cq(ic->i_snd_cq);
1003                 }
1004                 if (ic->i_soft_cq) {
1005                         rdsv3_af_thr_destroy(ic->i_soft_cq);
1006                         ic->i_soft_cq = NULL;
1007                 }
1008                 if (ic->i_refill_rq) {
1009                         rdsv3_af_thr_destroy(ic->i_refill_rq);
1010                         ic->i_refill_rq = NULL;
1011                 }
1012                 if (ic->i_cq)
1013                         (void) ib_destroy_cq(ic->i_cq);
1014 
1015                 if (ic->i_mr)
1016                         rdsv3_ib_free_hdrs(dev, ic);
1017 
1018                 if (ic->i_sends)
1019                         rdsv3_ib_send_clear_ring(ic);
1020                 if (ic->i_recvs)
1021                         rdsv3_ib_recv_clear_ring(ic);
1022 
1023                 rdma_destroy_id(ic->i_cm_id);
1024 
1025                 /*
1026                  * Move connection back to the nodev list.
1027                  */
1028                 if (ic->i_on_dev_list)
1029                         rdsv3_ib_remove_conn(ic->rds_ibdev, conn);
1030 
1031                 ic->i_cm_id = NULL;
1032                 ic->i_pd = NULL;
1033                 ic->i_mr = NULL;
1034                 ic->i_cq = NULL;
1035                 ic->i_snd_cq = NULL;
1036                 ic->i_send_hdrs = NULL;
1037                 ic->i_recv_hdrs = NULL;
1038                 ic->i_ack = NULL;
1039         }
1040         ASSERT(!ic->i_on_dev_list);
1041 
1042         /* Clear pending transmit */
1043         if (ic->i_rm) {
1044                 rdsv3_message_put(ic->i_rm);
1045                 ic->i_rm = NULL;
1046         }
1047 
1048         /* Clear the ACK state */
1049         clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
1050         ic->i_ack_next = 0;
1051         ic->i_ack_recv = 0;
1052 
1053         /* Clear flow control state */
1054         ic->i_flowctl = 0;
1055         ic->i_credits = 0;
1056 
1057         rdsv3_ib_ring_init(&ic->i_send_ring, rdsv3_ib_sysctl_max_send_wr);
1058         rdsv3_ib_ring_init(&ic->i_recv_ring, rdsv3_ib_sysctl_max_recv_wr);
1059 
1060         if (ic->i_ibinc) {
1061                 rdsv3_inc_put(&ic->i_ibinc->ii_inc);
1062                 ic->i_ibinc = NULL;
1063         }
1064 
1065         if (ic->i_sends) {
1066                 kmem_free(ic->i_sends,
1067                     ic->i_send_ring.w_nr * sizeof (struct rdsv3_ib_send_work));
1068                 ic->i_sends = NULL;
1069         }
1070         if (ic->i_send_wrs) {
1071                 kmem_free(ic->i_send_wrs, ic->i_send_ring.w_nr *
1072                     (sizeof (ibt_send_wr_t) +
1073                     RDSV3_IB_MAX_SGE * sizeof (ibt_wr_ds_t)));
1074                 ic->i_send_wrs = NULL;
1075         }
1076         if (ic->i_recvs) {
1077                 kmem_free(ic->i_recvs,
1078                     ic->i_recv_ring.w_nr * sizeof (struct rdsv3_ib_recv_work));
1079                 ic->i_recvs = NULL;
1080         }
1081         if (ic->i_recv_wrs) {
1082                 kmem_free(ic->i_recv_wrs, ic->i_recv_ring.w_nr *
1083                     (sizeof (ibt_recv_wr_t)));
1084                 ic->i_recv_wrs = NULL;
1085         }
1086 
1087         RDSV3_DPRINTF2("rdsv3_ib_conn_shutdown", "Return conn: %p", conn);
1088 }
1089 
1090 /* ARGSUSED */
1091 int
1092 rdsv3_ib_conn_alloc(struct rdsv3_connection *conn, int gfp)
1093 {
1094         struct rdsv3_ib_connection *ic;
1095 
1096         RDSV3_DPRINTF2("rdsv3_ib_conn_alloc", "conn: %p", conn);
1097 
1098         /* XXX too lazy? */
1099         ic = kmem_zalloc(sizeof (struct rdsv3_ib_connection), gfp);
1100         if (!ic)
1101                 return (-ENOMEM);
1102 
1103         list_link_init(&ic->ib_node);
1104 
1105         mutex_init(&ic->i_recv_mutex, NULL, MUTEX_DRIVER, NULL);
1106         mutex_init(&ic->i_ack_lock, NULL, MUTEX_DRIVER, NULL);
1107 
1108         /*
1109          * rdsv3_ib_conn_shutdown() waits for these to be emptied so they
1110          * must be initialized before it can be called.
1111          */
1112         rdsv3_ib_ring_init(&ic->i_send_ring, rdsv3_ib_sysctl_max_send_wr);
1113         rdsv3_ib_ring_init(&ic->i_recv_ring, rdsv3_ib_sysctl_max_recv_wr);
1114 
1115         ic->conn = conn;
1116         conn->c_transport_data = ic;
1117 
1118         mutex_enter(&ib_nodev_conns_lock);
1119         list_insert_tail(&ib_nodev_conns, ic);
1120         mutex_exit(&ib_nodev_conns_lock);
1121 
1122         RDSV3_DPRINTF2("rdsv3_ib_conn_alloc", "conn %p conn ic %p",
1123             conn, conn->c_transport_data);
1124         return (0);
1125 }
1126 
1127 /*
1128  * Free a connection. Connection must be shut down and not set for reconnect.
1129  */
1130 void
1131 rdsv3_ib_conn_free(void *arg)
1132 {
1133         struct rdsv3_ib_connection *ic = arg;
1134         kmutex_t        *lock_ptr;
1135 
1136         RDSV3_DPRINTF2("rdsv3_ib_conn_free", "ic %p\n", ic);
1137 
1138         /*
1139          * Conn is either on a dev's list or on the nodev list.
1140          * A race with shutdown() or connect() would cause problems
1141          * (since rds_ibdev would change) but that should never happen.
1142          */
1143         lock_ptr = ic->i_on_dev_list ?
1144             &ic->rds_ibdev->spinlock : &ib_nodev_conns_lock;
1145 
1146         mutex_enter(lock_ptr);
1147         list_remove_node(&ic->ib_node);
1148         mutex_exit(lock_ptr);
1149         kmem_free(ic, sizeof (*ic));
1150 }
1151 
/*
 * Error hook for an IB connection: the only action taken is to hand
 * the connection to rdsv3_conn_drop().
 */
void
__rdsv3_ib_conn_error(struct rdsv3_connection *conn)
{
        rdsv3_conn_drop(conn);
}