1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
  24  */
  25 
  26 /*
  27  * Copyright (c) 2007, The Ohio State University. All rights reserved.
  28  *
 * Portions of this source code were developed by the team members of
  30  * The Ohio State University's Network-Based Computing Laboratory (NBCL),
  31  * headed by Professor Dhabaleswar K. (DK) Panda.
  32  *
 * Acknowledgements to contributions from developers:
  34  *   Ranjit Noronha: noronha@cse.ohio-state.edu
  35  *   Lei Chai      : chail@cse.ohio-state.edu
  36  *   Weikuan Yu    : yuw@cse.ohio-state.edu
  37  *
  38  */
  39 
  40 /*
  41  * The rpcib plugin. Implements the interface for RDMATF's
  42  * interaction with IBTF.
  43  */
  44 
  45 #include <sys/param.h>
  46 #include <sys/types.h>
  47 #include <sys/user.h>
  48 #include <sys/systm.h>
  49 #include <sys/sysmacros.h>
  50 #include <sys/proc.h>
  51 #include <sys/socket.h>
  52 #include <sys/file.h>
  53 #include <sys/stream.h>
  54 #include <sys/strsubr.h>
  55 #include <sys/stropts.h>
  56 #include <sys/errno.h>
  57 #include <sys/kmem.h>
  58 #include <sys/debug.h>
  59 #include <sys/pathname.h>
  60 #include <sys/kstat.h>
  61 #include <sys/t_lock.h>
  62 #include <sys/ddi.h>
  63 #include <sys/cmn_err.h>
  64 #include <sys/time.h>
  65 #include <sys/isa_defs.h>
  66 #include <sys/callb.h>
  67 #include <sys/sunddi.h>
  68 #include <sys/sunndi.h>
  69 #include <sys/sdt.h>
  70 #include <sys/ib/ibtl/ibti.h>
  71 #include <rpc/rpc.h>
  72 #include <rpc/ib.h>
  73 #include <sys/modctl.h>
  74 #include <sys/kstr.h>
  75 #include <sys/sockio.h>
  76 #include <sys/vnode.h>
  77 #include <sys/tiuser.h>
  78 #include <net/if.h>
  79 #include <net/if_types.h>
  80 #include <sys/cred.h>
  81 #include <rpc/rpc_rdma.h>
  82 #include <nfs/nfs.h>
  83 #include <sys/atomic.h>
  84 
  85 #define NFS_RDMA_PORT   20049
  86 
  87 
  88 /*
  89  * Convenience structures for connection management
  90  */
  91 typedef struct rpcib_ipaddrs {
  92         void    *ri_list;       /* pointer to list of addresses */
  93         uint_t  ri_count;       /* number of addresses in list */
  94         uint_t  ri_size;        /* size of ri_list in bytes */
  95 } rpcib_ipaddrs_t;
  96 
  97 
/*
 * Result of probing a peer for RDMA reachability: the HCA and IB path
 * that reached it, plus the resolved source/destination IP addresses.
 * Filled in by rib_ping_srv() and consumed by rib_connect().
 */
typedef struct rpcib_ping {
	rib_hca_t  *hca;	/* HCA through which the peer was reached */
	ibt_path_info_t path;	/* IB path information to the peer */
	ibt_ip_addr_t srcip;	/* local (source) IP address */
	ibt_ip_addr_t dstip;	/* remote (destination) IP address */
} rpcib_ping_t;
 104 
 105 /*
 106  * Prototype declarations for driver ops
 107  */
 108 static int      rpcib_attach(dev_info_t *, ddi_attach_cmd_t);
 109 static int      rpcib_getinfo(dev_info_t *, ddi_info_cmd_t,
 110                                 void *, void **);
 111 static int      rpcib_detach(dev_info_t *, ddi_detach_cmd_t);
 112 static boolean_t rpcib_rdma_capable_interface(struct lifreq *);
 113 static int      rpcib_do_ip_ioctl(int, int, void *);
 114 static boolean_t rpcib_get_ib_addresses(rpcib_ipaddrs_t *, rpcib_ipaddrs_t *);
 115 static int rpcib_cache_kstat_update(kstat_t *, int);
 116 static void rib_force_cleanup(void *);
 117 static void rib_stop_hca_services(rib_hca_t *);
 118 static void rib_attach_hca(void);
 119 static int rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr,
 120                 struct netbuf *d_svcaddr, CONN **conn);
 121 
/*
 * Named kstat counters for the server-side registered-buffer cache.
 * NOTE(review): presumably serviced by rpcib_cache_kstat_update()
 * (prototype above) — confirm against the kstat creation site.
 */
struct {
	kstat_named_t cache_limit;
	kstat_named_t cache_allocation;
	kstat_named_t cache_hits;
	kstat_named_t cache_misses;
	kstat_named_t cache_misses_above_the_limit;
} rpcib_kstat = {
	{"cache_limit",			KSTAT_DATA_UINT64 },
	{"cache_allocation",		KSTAT_DATA_UINT64 },
	{"cache_hits",			KSTAT_DATA_UINT64 },
	{"cache_misses",		KSTAT_DATA_UINT64 },
	{"cache_misses_above_the_limit", KSTAT_DATA_UINT64 },
};
 135 
 136 /* rpcib cb_ops */
 137 static struct cb_ops rpcib_cbops = {
 138         nulldev,                /* open */
 139         nulldev,                /* close */
 140         nodev,                  /* strategy */
 141         nodev,                  /* print */
 142         nodev,                  /* dump */
 143         nodev,                  /* read */
 144         nodev,                  /* write */
 145         nodev,                  /* ioctl */
 146         nodev,                  /* devmap */
 147         nodev,                  /* mmap */
 148         nodev,                  /* segmap */
 149         nochpoll,               /* poll */
 150         ddi_prop_op,            /* prop_op */
 151         NULL,                   /* stream */
 152         D_MP,                   /* cb_flag */
 153         CB_REV,                 /* rev */
 154         nodev,                  /* int (*cb_aread)() */
 155         nodev                   /* int (*cb_awrite)() */
 156 };
 157 
 158 /*
 159  * Device options
 160  */
 161 static struct dev_ops rpcib_ops = {
 162         DEVO_REV,               /* devo_rev, */
 163         0,                      /* refcnt  */
 164         rpcib_getinfo,          /* info */
 165         nulldev,                /* identify */
 166         nulldev,                /* probe */
 167         rpcib_attach,           /* attach */
 168         rpcib_detach,           /* detach */
 169         nodev,                  /* reset */
 170         &rpcib_cbops,                   /* driver ops - devctl interfaces */
 171         NULL,                   /* bus operations */
 172         NULL,                   /* power */
 173         ddi_quiesce_not_needed,         /* quiesce */
 174 };
 175 
 176 /*
 177  * Module linkage information.
 178  */
 179 
 180 static struct modldrv rib_modldrv = {
 181         &mod_driverops,             /* Driver module */
 182         "RPCIB plugin driver",  /* Driver name and version */
 183         &rpcib_ops,         /* Driver ops */
 184 };
 185 
 186 static struct modlinkage rib_modlinkage = {
 187         MODREV_1,
 188         { (void *)&rib_modldrv, NULL }
 189 };
 190 
 191 typedef struct rib_lrc_entry {
 192         struct rib_lrc_entry *forw;
 193         struct rib_lrc_entry *back;
 194         char *lrc_buf;
 195 
 196         uint32_t lrc_len;
 197         void  *avl_node;
 198         bool_t registered;
 199 
 200         struct mrc lrc_mhandle;
 201         bool_t lrc_on_freed_list;
 202 } rib_lrc_entry_t;
 203 
 204 typedef struct cache_struct     {
 205         rib_lrc_entry_t         r;
 206         uint32_t                len;
 207         uint32_t                elements;
 208         kmutex_t                node_lock;
 209         avl_node_t              avl_link;
 210 } cache_avl_struct_t;
 211 
/* Upper bound (bytes) on the server-side buffer cache; tunable. */
uint64_t	cache_limit = 100 * 1024 * 1024;
/* Watermark below cache_limit; reclaim behavior is defined elsewhere. */
static uint64_t cache_watermark = 80 * 1024 * 1024;
/* TRUE once the rpcib kstats have been created. */
static bool_t	stats_enabled = FALSE;

/* Max RDMA writes posted without requesting a completion signal. */
static uint64_t max_unsignaled_rws = 5;
int nfs_rdma_port = NFS_RDMA_PORT;	/* IANA NFS-over-RDMA port */

#define	RIBNETID_TCP	"tcp"
#define	RIBNETID_TCP6	"tcp6"
 221 
 222 /*
 223  * rib_stat: private data pointer used when registering
 224  *      with the IBTF.  It is returned to the consumer
 225  *      in all callbacks.
 226  */
 227 static rpcib_state_t *rib_stat = NULL;
 228 
 229 #define RNR_RETRIES     IBT_RNR_RETRY_1
 230 #define MAX_PORTS       2
 231 #define RDMA_DUMMY_WRID 0x4D3A1D4D3A1D
 232 #define RDMA_CONN_REAP_RETRY    10      /* 10 secs */
 233 
 234 int preposted_rbufs = RDMA_BUFS_GRANT;
 235 int send_threshold = 1;
 236 
 237 /*
 238  * Old cards with Tavor driver have limited memory footprint
 239  * when booted in 32bit. The rib_max_rbufs tunable can be
 240  * tuned for more buffers if needed.
 241  */
 242 
 243 #if !defined(_ELF64) && !defined(__sparc)
 244 int rib_max_rbufs = MAX_BUFS;
 245 #else
 246 int rib_max_rbufs = 10 * MAX_BUFS;
 247 #endif  /* !(_ELF64) && !(__sparc) */
 248 
 249 int rib_conn_timeout = 60 * 12;         /* 12 minutes */
 250 
 251 /*
 252  * State of the plugin.
 253  * ACCEPT = accepting new connections and requests.
 254  * NO_ACCEPT = not accepting new connection and requests.
 255  * This should eventually move to rpcib_state_t structure, since this
 256  * will tell in which state the plugin is for a particular type of service
 257  * like NFS, NLM or v4 Callback deamon. The plugin might be in accept
 258  * state for one and in no_accept state for the other.
 259  */
 260 int             plugin_state;
 261 kmutex_t        plugin_state_lock;
 262 
 263 ldi_ident_t rpcib_li;
 264 
 265 /*
 266  * RPCIB RDMATF operations
 267  */
 268 static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle);
 269 static rdma_stat rib_disconnect(CONN *conn);
 270 static void rib_listen(struct rdma_svc_data *rd);
 271 static void rib_listen_stop(struct rdma_svc_data *rd);
 272 static rdma_stat rib_registermem(CONN *conn, caddr_t  adsp, caddr_t buf,
 273         uint_t buflen, struct mrc *buf_handle);
 274 static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf,
 275         struct mrc buf_handle);
 276 static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp,
 277                 caddr_t buf, uint_t buflen, struct mrc *buf_handle);
 278 static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf,
 279                 struct mrc buf_handle);
 280 static rdma_stat rib_registermemsync(CONN *conn,  caddr_t adsp, caddr_t buf,
 281         uint_t buflen, struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle,
 282         void *lrc);
 283 static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf,
 284         struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *);
 285 static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle,
 286         caddr_t buf, int len, int cpu);
 287 
 288 static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf);
 289 
 290 static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf);
 291 static void *rib_rbuf_alloc(CONN *, rdma_buf_t *);
 292 
 293 static void rib_rbuf_free(CONN *conn, int ptype, void *buf);
 294 
 295 static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid);
 296 static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid);
 297 static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid);
 298 static rdma_stat rib_post_resp_remove(CONN *conn, uint32_t msgid);
 299 static rdma_stat rib_post_recv(CONN *conn, struct clist *cl);
 300 static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid);
 301 static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait);
 302 static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait);
 303 static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rpcib_ping_t *);
 304 static rdma_stat rib_conn_get(struct netbuf *, struct netbuf *,
 305         int addr_type, void *, CONN **);
 306 static rdma_stat rib_conn_release(CONN *conn);
 307 static rdma_stat rib_connect(struct netbuf *, struct netbuf *, int,
 308         rpcib_ping_t *, CONN **);
 309 static rdma_stat rib_getinfo(rdma_info_t *info);
 310 
 311 static rib_lrc_entry_t *rib_get_cache_buf(CONN *conn, uint32_t len);
 312 static void rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *buf);
 313 static void rib_destroy_cache(rib_hca_t *hca);
 314 static  void    rib_server_side_cache_reclaim(void *argp);
 315 static int avl_compare(const void *t1, const void *t2);
 316 
 317 static void rib_stop_services(rib_hca_t *);
 318 static void rib_close_channels(rib_conn_list_t *);
 319 static void rib_conn_close(void *);
 320 static void rib_recv_rele(rib_qp_t *);
 321 static rdma_stat rib_conn_release_locked(CONN *conn);
 322 
 323 /*
 324  * RPCIB addressing operations
 325  */
 326 
 327 /*
 328  * RDMA operations the RPCIB module exports
 329  */
 330 static rdmaops_t rib_ops = {
 331         rib_reachable,
 332         rib_conn_get,
 333         rib_conn_release,
 334         rib_listen,
 335         rib_listen_stop,
 336         rib_registermem,
 337         rib_deregistermem,
 338         rib_registermemsync,
 339         rib_deregistermemsync,
 340         rib_syncmem,
 341         rib_reg_buf_alloc,
 342         rib_reg_buf_free,
 343         rib_send,
 344         rib_send_resp,
 345         rib_post_resp,
 346         rib_post_resp_remove,
 347         rib_post_recv,
 348         rib_recv,
 349         rib_read,
 350         rib_write,
 351         rib_getinfo,
 352 };
 353 
 354 /*
 355  * RDMATF RPCIB plugin details
 356  */
 357 static rdma_mod_t rib_mod = {
 358         "ibtf",         /* api name */
 359         RDMATF_VERS_1,
 360         0,
 361         &rib_ops,   /* rdma op vector for ibtf */
 362 };
 363 
 364 static rdma_stat rpcib_open_hcas(rpcib_state_t *);
 365 static rdma_stat rib_qp_init(rib_qp_t *, int);
 366 static void rib_svc_scq_handler(ibt_cq_hdl_t, void *);
 367 static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *);
 368 static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *);
 369 static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *);
 370 static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num);
 371 static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t adsp, caddr_t, uint_t,
 372         ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_mr_desc_t *);
 373 static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t,
 374         ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t);
 375 static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, rpcib_ping_t *);
 376 static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *,
 377         rib_qp_t **);
 378 static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t,
 379         rib_qp_t **);
 380 static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *);
 381 static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *);
 382 static int rib_free_sendwait(struct send_wid *);
 383 static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid);
 384 static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd);
 385 static void rdma_done_rem_list(rib_qp_t *);
 386 static void rdma_done_notify(rib_qp_t *qp, uint32_t xid);
 387 
 388 static void rib_async_handler(void *,
 389         ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *);
 390 static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *);
 391 static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *);
 392 static int rib_free_svc_recv(struct svc_recv *);
 393 static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t);
 394 static void rib_free_wid(struct recv_wid *);
 395 static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *);
 396 static void rib_detach_hca(ibt_hca_hdl_t);
 397 static void rib_close_a_channel(CONN *);
 398 static void rib_send_hold(rib_qp_t *);
 399 static void rib_send_rele(rib_qp_t *);
 400 
 401 /*
 402  * Registration with IBTF as a consumer
 403  */
 404 static struct ibt_clnt_modinfo_s rib_modinfo = {
 405         IBTI_V_CURR,
 406         IBT_GENERIC,
 407         rib_async_handler,      /* async event handler */
 408         NULL,                   /* Memory Region Handler */
 409         "nfs/ib"
 410 };
 411 
 412 /*
 * Global structure
 414  */
 415 
typedef struct rpcib_s {
	dev_info_t	*rpcib_dip;	/* devinfo node, set at attach time */
	kmutex_t	rpcib_mutex;	/* protects rpcib_dip */
} rpcib_t;

rpcib_t rpcib;	/* single instance of this pseudo driver */
 422 
 423 /*
 424  * /etc/system controlled variable to control
 425  * debugging in rpcib kernel module.
 426  * Set it to values greater that 1 to control
 427  * the amount of debugging messages required.
 428  */
 429 int rib_debug = 0;
 430 
 431 int
 432 _init(void)
 433 {
 434         int error;
 435 
 436         error = mod_install((struct modlinkage *)&rib_modlinkage);
 437         if (error != 0) {
 438                 /*
 439                  * Could not load module
 440                  */
 441                 return (error);
 442         }
 443         mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL);
 444         return (0);
 445 }
 446 
 447 int
 448 _fini()
 449 {
 450         int status;
 451 
 452         /*
 453          * Remove module
 454          */
 455         if ((status = mod_remove(&rib_modlinkage)) != 0) {
 456                 return (status);
 457         }
 458         mutex_destroy(&plugin_state_lock);
 459         return (0);
 460 }
 461 
 462 int
 463 _info(struct modinfo *modinfop)
 464 {
 465         return (mod_info(&rib_modlinkage, modinfop));
 466 }
 467 
 468 /*
 469  * rpcib_getinfo()
 470  * Given the device number, return the devinfo pointer or the
 471  * instance number.
 472  * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach.
 473  */
 474 
 475 /*ARGSUSED*/
 476 static int
 477 rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
 478 {
 479         int ret = DDI_SUCCESS;
 480 
 481         switch (cmd) {
 482         case DDI_INFO_DEVT2DEVINFO:
 483                 if (rpcib.rpcib_dip != NULL)
 484                         *result = rpcib.rpcib_dip;
 485                 else {
 486                         *result = NULL;
 487                         ret = DDI_FAILURE;
 488                 }
 489                 break;
 490 
 491         case DDI_INFO_DEVT2INSTANCE:
 492                 *result = NULL;
 493                 break;
 494 
 495         default:
 496                 ret = DDI_FAILURE;
 497         }
 498         return (ret);
 499 }
 500 
 501 static void
 502 rpcib_free_hca_list()
 503 {
 504         rib_hca_t *hca, *hcap;
 505 
 506         rw_enter(&rib_stat->hcas_list_lock, RW_WRITER);
 507         hca = rib_stat->hcas_list;
 508         rib_stat->hcas_list = NULL;
 509         rw_exit(&rib_stat->hcas_list_lock);
 510         while (hca != NULL) {
 511                 rw_enter(&hca->state_lock, RW_WRITER);
 512                 hcap = hca;
 513                 hca = hca->next;
 514                 rib_stat->nhca_inited--;
 515                 rib_mod.rdma_count--;
 516                 hcap->state = HCA_DETACHED;
 517                 rw_exit(&hcap->state_lock);
 518                 rib_stop_hca_services(hcap);
 519 
 520                 kmem_free(hcap, sizeof (*hcap));
 521         }
 522 }
 523 
 524 static rdma_stat
 525 rpcib_free_service_list()
 526 {
 527         rib_service_t *service;
 528         ibt_status_t ret;
 529 
 530         rw_enter(&rib_stat->service_list_lock, RW_WRITER);
 531         while (rib_stat->service_list != NULL) {
 532                 service = rib_stat->service_list;
 533                 ret = ibt_unbind_all_services(service->srv_hdl);
 534                 if (ret != IBT_SUCCESS) {
 535                         rw_exit(&rib_stat->service_list_lock);
 536 #ifdef DEBUG
 537                         cmn_err(CE_NOTE, "rpcib_free_service_list: "
 538                             "ibt_unbind_all_services failed (%d)\n", (int)ret);
 539 #endif
 540                         return (RDMA_FAILED);
 541                 }
 542                 ret = ibt_deregister_service(rib_stat->ibt_clnt_hdl,
 543                     service->srv_hdl);
 544                 if (ret != IBT_SUCCESS) {
 545                         rw_exit(&rib_stat->service_list_lock);
 546 #ifdef DEBUG
 547                         cmn_err(CE_NOTE, "rpcib_free_service_list: "
 548                             "ibt_deregister_service failed (%d)\n", (int)ret);
 549 #endif
 550                         return (RDMA_FAILED);
 551                 }
 552                 rib_stat->service_list = service->next;
 553                 kmem_free(service, sizeof (rib_service_t));
 554         }
 555         rw_exit(&rib_stat->service_list_lock);
 556 
 557         return (RDMA_SUCCESS);
 558 }
 559 
 560 static int
 561 rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
 562 {
 563         ibt_status_t    ibt_status;
 564         rdma_stat       r_status;
 565 
 566         switch (cmd) {
 567         case DDI_ATTACH:
 568                 break;
 569         case DDI_RESUME:
 570                 return (DDI_SUCCESS);
 571         default:
 572                 return (DDI_FAILURE);
 573         }
 574 
 575         mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL);
 576 
 577         mutex_enter(&rpcib.rpcib_mutex);
 578         if (rpcib.rpcib_dip != NULL) {
 579                 mutex_exit(&rpcib.rpcib_mutex);
 580                 return (DDI_FAILURE);
 581         }
 582         rpcib.rpcib_dip = dip;
 583         mutex_exit(&rpcib.rpcib_mutex);
 584         /*
 585          * Create the "rpcib" minor-node.
 586          */
 587         if (ddi_create_minor_node(dip,
 588             "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) {
 589                 /* Error message, no cmn_err as they print on console */
 590                 return (DDI_FAILURE);
 591         }
 592 
 593         if (rib_stat == NULL) {
 594                 rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP);
 595                 mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL);
 596                 rw_init(&rib_stat->hcas_list_lock, NULL, RW_DRIVER, NULL);
 597                 mutex_init(&rib_stat->listen_lock, NULL, MUTEX_DRIVER, NULL);
 598         }
 599 
 600         rib_stat->hca_count = ibt_get_hca_list(NULL);
 601         if (rib_stat->hca_count < 1) {
 602                 mutex_destroy(&rib_stat->listen_lock);
 603                 rw_destroy(&rib_stat->hcas_list_lock);
 604                 mutex_destroy(&rib_stat->open_hca_lock);
 605                 kmem_free(rib_stat, sizeof (*rib_stat));
 606                 rib_stat = NULL;
 607                 return (DDI_FAILURE);
 608         }
 609 
 610         ibt_status = ibt_attach(&rib_modinfo, dip,
 611             (void *)rib_stat, &rib_stat->ibt_clnt_hdl);
 612 
 613         if (ibt_status != IBT_SUCCESS) {
 614                 mutex_destroy(&rib_stat->listen_lock);
 615                 rw_destroy(&rib_stat->hcas_list_lock);
 616                 mutex_destroy(&rib_stat->open_hca_lock);
 617                 kmem_free(rib_stat, sizeof (*rib_stat));
 618                 rib_stat = NULL;
 619                 return (DDI_FAILURE);
 620         }
 621 
 622         rib_stat->service_list = NULL;
 623         rw_init(&rib_stat->service_list_lock, NULL, RW_DRIVER, NULL);
 624         mutex_enter(&rib_stat->open_hca_lock);
 625         if (rpcib_open_hcas(rib_stat) != RDMA_SUCCESS) {
 626                 mutex_exit(&rib_stat->open_hca_lock);
 627                 goto open_fail;
 628         }
 629         mutex_exit(&rib_stat->open_hca_lock);
 630 
 631         if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) !=
 632             DDI_PROP_SUCCESS) {
 633                 cmn_err(CE_WARN, "rpcib_attach: ddi-no-autodetach prop update "
 634                     "failed.");
 635                 goto register_fail;
 636         }
 637 
 638         /*
 639          * Register with rdmatf
 640          */
 641         r_status = rdma_register_mod(&rib_mod);
 642         if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) {
 643                 cmn_err(CE_WARN, "rpcib_attach:rdma_register_mod failed, "
 644                     "status = %d", r_status);
 645                 goto register_fail;
 646         }
 647 
 648         return (DDI_SUCCESS);
 649 
 650 register_fail:
 651 
 652 open_fail:
 653         (void) ibt_detach(rib_stat->ibt_clnt_hdl);
 654         rpcib_free_hca_list();
 655         (void) rpcib_free_service_list();
 656         mutex_destroy(&rib_stat->listen_lock);
 657         rw_destroy(&rib_stat->hcas_list_lock);
 658         mutex_destroy(&rib_stat->open_hca_lock);
 659         rw_destroy(&rib_stat->service_list_lock);
 660         kmem_free(rib_stat, sizeof (*rib_stat));
 661         rib_stat = NULL;
 662         return (DDI_FAILURE);
 663 }
 664 
/*
 * detach(9E) entry point.  Stops accepting new work, tears down all
 * registered services and HCAs, detaches from IBTF and frees the
 * global state.  Only DDI_DETACH is supported (DDI_SUSPEND fails).
 */
/*ARGSUSED*/
static int
rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	switch (cmd) {

	case DDI_DETACH:
		break;

	case DDI_SUSPEND:
	default:
		return (DDI_FAILURE);
	}

	/*
	 * Detach the hca and free resources
	 */
	mutex_enter(&plugin_state_lock);
	plugin_state = NO_ACCEPT;
	mutex_exit(&plugin_state_lock);

	/*
	 * NOTE(review): if freeing the service list fails we return
	 * DDI_FAILURE with plugin_state already at NO_ACCEPT, leaving
	 * the driver attached but no longer accepting work.
	 */
	if (rpcib_free_service_list() != RDMA_SUCCESS)
		return (DDI_FAILURE);
	rpcib_free_hca_list();

	/* All HCAs/services are gone; detach from IBTF and free state. */
	(void) ibt_detach(rib_stat->ibt_clnt_hdl);
	mutex_destroy(&rib_stat->listen_lock);
	rw_destroy(&rib_stat->hcas_list_lock);
	mutex_destroy(&rib_stat->open_hca_lock);
	rw_destroy(&rib_stat->service_list_lock);

	kmem_free(rib_stat, sizeof (*rib_stat));
	rib_stat = NULL;

	/* Clear the devinfo pointer last so attach can run again. */
	mutex_enter(&rpcib.rpcib_mutex);
	rpcib.rpcib_dip = NULL;
	mutex_exit(&rpcib.rpcib_mutex);
	mutex_destroy(&rpcib.rpcib_mutex);
	return (DDI_SUCCESS);
}
 705 
 706 
 707 static void rib_rbufpool_free(rib_hca_t *, int);
 708 static void rib_rbufpool_deregister(rib_hca_t *, int);
 709 static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype);
 710 static struct reply *rib_addreplylist(rib_qp_t *, uint32_t);
 711 static rdma_stat rib_rem_replylist(rib_qp_t *);
 712 static int rib_remreply(rib_qp_t *, struct reply *);
 713 static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *);
 714 static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *);
 715 
 716 
 717 /*
 718  * One CQ pair per HCA
 719  */
 720 static rdma_stat
 721 rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler,
 722         rib_cq_t **cqp)
 723 {
 724         rib_cq_t        *cq;
 725         ibt_cq_attr_t   cq_attr;
 726         uint32_t        real_size;
 727         ibt_status_t    status;
 728         rdma_stat       error = RDMA_SUCCESS;
 729 
 730         cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP);
 731         cq->rib_hca = hca;
 732         bzero(&cq_attr, sizeof (cq_attr));
 733         cq_attr.cq_size = cq_size;
 734         cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
 735         status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl,
 736             &real_size);
 737         if (status != IBT_SUCCESS) {
 738                 cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed,"
 739                     " status=%d", status);
 740                 error = RDMA_FAILED;
 741                 goto fail;
 742         }
 743         ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, hca);
 744 
 745         /*
 746          * Enable CQ callbacks. CQ Callbacks are single shot
 747          * (e.g. you have to call ibt_enable_cq_notify()
 748          * after each callback to get another one).
 749          */
 750         status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION);
 751         if (status != IBT_SUCCESS) {
 752                 cmn_err(CE_WARN, "rib_create_cq: "
 753                     "enable_cq_notify failed, status %d", status);
 754                 error = RDMA_FAILED;
 755                 goto fail;
 756         }
 757         *cqp = cq;
 758 
 759         return (error);
 760 fail:
 761         if (cq->rib_cq_hdl)
 762                 (void) ibt_free_cq(cq->rib_cq_hdl);
 763         if (cq)
 764                 kmem_free(cq, sizeof (rib_cq_t));
 765         return (error);
 766 }
 767 
 768 /*
 769  * rpcib_find_hca
 770  *
 771  * Caller should have already locked the hcas_lock before calling
 772  * this function.
 773  */
 774 static rib_hca_t *
 775 rpcib_find_hca(rpcib_state_t *ribstat, ib_guid_t guid)
 776 {
 777         rib_hca_t *hca = ribstat->hcas_list;
 778 
 779         while (hca && hca->hca_guid != guid)
 780                 hca = hca->next;
 781 
 782         return (hca);
 783 }
 784 
 785 static rdma_stat
 786 rpcib_open_hcas(rpcib_state_t *ribstat)
 787 {
 788         rib_hca_t               *hca;
 789         ibt_status_t            ibt_status;
 790         rdma_stat               status;
 791         ibt_hca_portinfo_t      *pinfop;
 792         ibt_pd_flags_t          pd_flags = IBT_PD_NO_FLAGS;
 793         uint_t                  size, cq_size;
 794         int                     i;
 795         kstat_t *ksp;
 796         cache_avl_struct_t example_avl_node;
 797         char rssc_name[32];
 798         int old_nhca_inited = ribstat->nhca_inited;
 799         ib_guid_t               *hca_guids;
 800 
 801         ASSERT(MUTEX_HELD(&ribstat->open_hca_lock));
 802 
 803         ribstat->hca_count = ibt_get_hca_list(&hca_guids);
 804         if (ribstat->hca_count == 0)
 805                 return (RDMA_FAILED);
 806 
 807         rw_enter(&ribstat->hcas_list_lock, RW_WRITER);
 808         /*
 809          * Open a hca and setup for RDMA
 810          */
 811         for (i = 0; i < ribstat->hca_count; i++) {
 812                 if (rpcib_find_hca(ribstat, hca_guids[i]))
 813                         continue;
 814                 hca = kmem_zalloc(sizeof (rib_hca_t), KM_SLEEP);
 815 
 816                 ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl,
 817                     hca_guids[i], &hca->hca_hdl);
 818                 if (ibt_status != IBT_SUCCESS) {
 819                         kmem_free(hca, sizeof (rib_hca_t));
 820                         continue;
 821                 }
 822                 hca->hca_guid = hca_guids[i];
 823                 hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl;
 824                 hca->state = HCA_INITED;
 825 
 826                 /*
 827                  * query HCA info
 828                  */
 829                 ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs);
 830                 if (ibt_status != IBT_SUCCESS) {
 831                         goto fail1;
 832                 }
 833 
 834                 /*
 835                  * One PD (Protection Domain) per HCA.
 836                  * A qp is allowed to access a memory region
 837                  * only when it's in the same PD as that of
 838                  * the memory region.
 839                  */
 840                 ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl);
 841                 if (ibt_status != IBT_SUCCESS) {
 842                         goto fail1;
 843                 }
 844 
 845                 /*
 846                  * query HCA ports
 847                  */
 848                 ibt_status = ibt_query_hca_ports(hca->hca_hdl,
 849                     0, &pinfop, &hca->hca_nports, &size);
 850                 if (ibt_status != IBT_SUCCESS) {
 851                         goto fail2;
 852                 }
 853                 hca->hca_ports = pinfop;
 854                 hca->hca_pinfosz = size;
 855                 pinfop = NULL;
 856 
 857                 cq_size = DEF_CQ_SIZE; /* default cq size */
 858                 /*
 859                  * Create 2 pairs of cq's (1 pair for client
 860                  * and the other pair for server) on this hca.
 861                  * If number of qp's gets too large, then several
 862                  * cq's will be needed.
 863                  */
 864                 status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler,
 865                     &hca->svc_rcq);
 866                 if (status != RDMA_SUCCESS) {
 867                         goto fail3;
 868                 }
 869 
 870                 status = rib_create_cq(hca, cq_size, rib_svc_scq_handler,
 871                     &hca->svc_scq);
 872                 if (status != RDMA_SUCCESS) {
 873                         goto fail3;
 874                 }
 875 
 876                 status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler,
 877                     &hca->clnt_rcq);
 878                 if (status != RDMA_SUCCESS) {
 879                         goto fail3;
 880                 }
 881 
 882                 status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler,
 883                     &hca->clnt_scq);
 884                 if (status != RDMA_SUCCESS) {
 885                         goto fail3;
 886                 }
 887 
 888                 /*
 889                  * Create buffer pools.
 890                  * Note rib_rbuf_create also allocates memory windows.
 891                  */
 892                 hca->recv_pool = rib_rbufpool_create(hca,
 893                     RECV_BUFFER, rib_max_rbufs);
 894                 if (hca->recv_pool == NULL) {
 895                         goto fail3;
 896                 }
 897 
 898                 hca->send_pool = rib_rbufpool_create(hca,
 899                     SEND_BUFFER, rib_max_rbufs);
 900                 if (hca->send_pool == NULL) {
 901                         rib_rbufpool_destroy(hca, RECV_BUFFER);
 902                         goto fail3;
 903                 }
 904 
 905                 if (hca->server_side_cache == NULL) {
 906                         (void) sprintf(rssc_name,
 907                             "rib_srvr_cache_%llx",
 908                             (long long unsigned int) hca->hca_guid);
 909                         hca->server_side_cache = kmem_cache_create(
 910                             rssc_name,
 911                             sizeof (cache_avl_struct_t), 0,
 912                             NULL,
 913                             NULL,
 914                             rib_server_side_cache_reclaim,
 915                             hca, NULL, 0);
 916                 }
 917 
 918                 avl_create(&hca->avl_tree,
 919                     avl_compare,
 920                     sizeof (cache_avl_struct_t),
 921                     (uint_t)(uintptr_t)&example_avl_node.avl_link-
 922                     (uint_t)(uintptr_t)&example_avl_node);
 923 
 924                 rw_init(&hca->bound_services_lock, NULL, RW_DRIVER,
 925                     hca->iblock);
 926                 rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock);
 927                 rw_init(&hca->avl_rw_lock,
 928                     NULL, RW_DRIVER, hca->iblock);
 929                 mutex_init(&hca->cache_allocation_lock,
 930                     NULL, MUTEX_DRIVER, NULL);
 931                 hca->avl_init = TRUE;
 932 
 933                 /* Create kstats for the cache */
 934                 ASSERT(INGLOBALZONE(curproc));
 935 
 936                 if (!stats_enabled) {
 937                         ksp = kstat_create_zone("unix", 0, "rpcib_cache", "rpc",
 938                             KSTAT_TYPE_NAMED,
 939                             sizeof (rpcib_kstat) / sizeof (kstat_named_t),
 940                             KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE,
 941                             GLOBAL_ZONEID);
 942                         if (ksp) {
 943                                 ksp->ks_data = (void *) &rpcib_kstat;
 944                                 ksp->ks_update = rpcib_cache_kstat_update;
 945                                 kstat_install(ksp);
 946                                 stats_enabled = TRUE;
 947                         }
 948                 }
 949                 if (hca->cleanup_helper == NULL) {
 950                         char tq_name[sizeof (hca->hca_guid) * 2 + 1];
 951 
 952                         (void) snprintf(tq_name, sizeof (tq_name), "%llX",
 953                             (unsigned long long int) hca->hca_guid);
 954                         hca->cleanup_helper = ddi_taskq_create(NULL,
 955                             tq_name, 1, TASKQ_DEFAULTPRI, 0);
 956                 }
 957 
 958                 mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
 959                 cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL);
 960                 rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER,
 961                     hca->iblock);
 962                 rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER,
 963                     hca->iblock);
 964                 mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock);
 965                 hca->inuse = TRUE;
 966 
 967                 hca->next = ribstat->hcas_list;
 968                 ribstat->hcas_list = hca;
 969                 ribstat->nhca_inited++;
 970                 ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
 971                 continue;
 972 
 973 fail3:
 974                 ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
 975 fail2:
 976                 (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
 977 fail1:
 978                 (void) ibt_close_hca(hca->hca_hdl);
 979                 kmem_free(hca, sizeof (rib_hca_t));
 980         }
 981         rw_exit(&ribstat->hcas_list_lock);
 982         ibt_free_hca_list(hca_guids, ribstat->hca_count);
 983         rib_mod.rdma_count = rib_stat->nhca_inited;
 984 
 985         /*
 986          * return success if at least one new hca has been configured.
 987          */
 988         if (ribstat->nhca_inited != old_nhca_inited)
 989                 return (RDMA_SUCCESS);
 990         else
 991                 return (RDMA_FAILED);
 992 }
 993 
 994 /*
 995  * Callback routines
 996  */
 997 
 998 /*
 999  * SCQ handlers
1000  */
/*
 * Client-side send completion queue (SCQ) handler.
 *
 * Drains the CQ; for every real send completion (wc_id !=
 * RDMA_DUMMY_WRID) the completion status is recorded in the send
 * wait id (wd->status).  A poster waiting on the completion
 * (wd->cv_sig == 1) is signalled and left to do its own cleanup;
 * otherwise the send buffers and the wait id are freed here and the
 * qp's send reference is dropped.  A failed completion also moves the
 * connection to C_ERROR_CONN (unless a disconnect is already pending).
 */
/* ARGSUSED */
static void
rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
        ibt_status_t    ibt_status;
        ibt_wc_t        wc;
        struct send_wid *wd;
        CONN            *conn;
        rib_qp_t        *qp;
        int             i;

        /*
         * Re-enable cq notify here to avoid missing any
         * completion queue notification.
         */
        (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);

        /* Poll one completion at a time until the CQ is drained. */
        ibt_status = IBT_SUCCESS;
        while (ibt_status != IBT_CQ_EMPTY) {
                bzero(&wc, sizeof (wc));
                ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
                if (ibt_status != IBT_SUCCESS)
                        return;

                /*
                 * Got a send completion
                 */
                if (wc.wc_id != RDMA_DUMMY_WRID) {
                        /* wc_id carries the send_wid posted with the WR */
                        wd = (struct send_wid *)(uintptr_t)wc.wc_id;
                        qp = wd->qp;
                        conn = qptoc(qp);

                        mutex_enter(&wd->sendwait_lock);
                        switch (wc.wc_status) {
                        case IBT_WC_SUCCESS:
                                wd->status = RDMA_SUCCESS;
                                break;
                        default:
/*
 *    RC Send Q Error Code              Local state     Remote State
 *    ====================              ===========     ============
 *    IBT_WC_BAD_RESPONSE_ERR             ERROR           None
 *    IBT_WC_LOCAL_LEN_ERR                ERROR           None
 *    IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           None
 *    IBT_WC_LOCAL_PROTECT_ERR            ERROR           None
 *    IBT_WC_MEM_WIN_BIND_ERR             ERROR           None
 *    IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR
 *    IBT_WC_REMOTE_ACCESS_ERR            ERROR           ERROR
 *    IBT_WC_REMOTE_OP_ERR                ERROR           ERROR
 *    IBT_WC_RNR_NAK_TIMEOUT_ERR          ERROR           None
 *    IBT_WC_TRANS_TIMEOUT_ERR            ERROR           None
 *    IBT_WC_WR_FLUSHED_ERR               ERROR           None
 */
                                /*
                                 * Channel in error state. Set connection to
                                 * ERROR and cleanup will happen either from
                                 * conn_release  or from rib_conn_get
                                 */
                                wd->status = RDMA_FAILED;
                                mutex_enter(&conn->c_lock);
                                if (conn->c_state != C_DISCONN_PEND)
                                        conn->c_state = C_ERROR_CONN;
                                mutex_exit(&conn->c_lock);
                                break;
                        }

                        if (wd->cv_sig == 1) {
                                /*
                                 * Notify poster
                                 */
                                cv_signal(&wd->wait_cv);
                                mutex_exit(&wd->sendwait_lock);
                        } else {
                                /*
                                 * Poster not waiting for notification.
                                 * Free the send buffers and send_wid
                                 */
                                for (i = 0; i < wd->nsbufs; i++) {
                                        rib_rbuf_free(qptoc(wd->qp),
                                            SEND_BUFFER,
                                            (void *)(uintptr_t)wd->sbufaddr[i]);
                                }

                                /* decrement the send ref count */
                                rib_send_rele(qp);

                                mutex_exit(&wd->sendwait_lock);
                                (void) rib_free_sendwait(wd);
                        }
                }
        }
}
1093 
/*
 * Server-side send completion queue (SCQ) handler.
 *
 * Same structure as rib_clnt_scq_handler(): drain the CQ, record the
 * status of each real send completion in its send wait id, signal a
 * waiting poster (wd->cv_sig == 1) or otherwise free the send buffers
 * and wait id here and drop the qp's send reference.  Failed
 * completions move the connection to C_ERROR_CONN unless a disconnect
 * is already pending.
 */
/* ARGSUSED */
static void
rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
        ibt_status_t    ibt_status;
        ibt_wc_t        wc;
        struct send_wid *wd;
        rib_qp_t        *qp;
        CONN            *conn;
        int             i;

        /*
         * Re-enable cq notify here to avoid missing any
         * completion queue notification.
         */
        (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);

        /* Poll one completion at a time until the CQ is drained. */
        ibt_status = IBT_SUCCESS;
        while (ibt_status != IBT_CQ_EMPTY) {
                bzero(&wc, sizeof (wc));
                ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
                if (ibt_status != IBT_SUCCESS)
                        return;

                /*
                 * Got a send completion
                 */
                if (wc.wc_id != RDMA_DUMMY_WRID) {
                        /* wc_id carries the send_wid posted with the WR */
                        wd = (struct send_wid *)(uintptr_t)wc.wc_id;
                        qp = wd->qp;
                        conn = qptoc(qp);
                        mutex_enter(&wd->sendwait_lock);

                        switch (wc.wc_status) {
                        case IBT_WC_SUCCESS:
                                wd->status = RDMA_SUCCESS;
                                break;
                        default:
                                /*
                                 * Channel in error state. Set connection to
                                 * ERROR and cleanup will happen either from
                                 * conn_release  or conn timeout.
                                 */
                                wd->status = RDMA_FAILED;
                                mutex_enter(&conn->c_lock);
                                if (conn->c_state != C_DISCONN_PEND)
                                        conn->c_state = C_ERROR_CONN;
                                mutex_exit(&conn->c_lock);
                                break;
                        }

                        if (wd->cv_sig == 1) {
                                /*
                                 * Update completion status and notify poster
                                 */
                                cv_signal(&wd->wait_cv);
                                mutex_exit(&wd->sendwait_lock);
                        } else {
                                /*
                                 * Poster not waiting for notification.
                                 * Free the send buffers and send_wid
                                 */
                                for (i = 0; i < wd->nsbufs; i++) {
                                        rib_rbuf_free(qptoc(wd->qp),
                                            SEND_BUFFER,
                                            (void *)(uintptr_t)wd->sbufaddr[i]);
                                }

                                /* decrement the send ref count */
                                rib_send_rele(qp);

                                mutex_exit(&wd->sendwait_lock);
                                (void) rib_free_sendwait(wd);
                        }
                }
        }
}
1171 
1172 /*
1173  * RCQ handler
1174  */
/*
 * Client-side receive completion queue handler.
 *
 * For each successfully received buffer: decode the RPC/RDMA header
 * (xid, version, credits, op), drop the connection into C_ERROR_CONN
 * on a version mismatch, and otherwise match the xid against the qp's
 * reply list so the waiting RPC caller can be handed the buffer
 * (r->vaddr_cq) and signalled.  Buffers with no matching waiter,
 * flushed completions, and error completions are freed here; a
 * non-flush error also moves the connection to C_ERROR_CONN.  Every
 * iteration releases the recv_wid and the qp's receive reference.
 */
/* ARGSUSED */
static void
rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
        rib_qp_t        *qp;
        ibt_status_t    ibt_status;
        ibt_wc_t        wc;
        struct recv_wid *rwid;

        /*
         * Re-enable cq notify here to avoid missing any
         * completion queue notification.
         */
        (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);

        /* Poll one completion at a time until the CQ is drained. */
        ibt_status = IBT_SUCCESS;
        while (ibt_status != IBT_CQ_EMPTY) {
                bzero(&wc, sizeof (wc));
                ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
                if (ibt_status != IBT_SUCCESS)
                        return;

                /* wc_id carries the recv_wid posted with the buffer */
                rwid = (struct recv_wid *)(uintptr_t)wc.wc_id;
                qp = rwid->qp;

                if (wc.wc_status == IBT_WC_SUCCESS) {
                        XDR     inxdrs, *xdrs;
                        uint_t  xid, vers, op, find_xid = 0;
                        struct reply    *r;
                        CONN *conn = qptoc(qp);
                        uint32_t rdma_credit = 0;

                        xdrs = &inxdrs;
                        xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr,
                            wc.wc_bytes_xfer, XDR_DECODE);
                        /*
                         * Treat xid as opaque (xid is the first entity
                         * in the rpc rdma message).
                         */
                        xid = *(uint32_t *)(uintptr_t)rwid->addr;

                        /* Skip xid and set the xdr position accordingly. */
                        XDR_SETPOS(xdrs, sizeof (uint32_t));
                        (void) xdr_u_int(xdrs, &vers);
                        (void) xdr_u_int(xdrs, &rdma_credit);
                        (void) xdr_u_int(xdrs, &op);
                        XDR_DESTROY(xdrs);

                        if (vers != RPCRDMA_VERS) {
                                /*
                                 * Invalid RPC/RDMA version. Cannot
                                 * interoperate.  Set connection to
                                 * ERROR state and bail out.
                                 */
                                mutex_enter(&conn->c_lock);
                                if (conn->c_state != C_DISCONN_PEND)
                                        conn->c_state = C_ERROR_CONN;
                                mutex_exit(&conn->c_lock);
                                rib_rbuf_free(conn, RECV_BUFFER,
                                    (void *)(uintptr_t)rwid->addr);
                                rib_free_wid(rwid);
                                rib_recv_rele(qp);
                                continue;
                        }

                        /* Find the caller waiting on this xid, if any. */
                        mutex_enter(&qp->replylist_lock);
                        for (r = qp->replylist; r != NULL; r = r->next) {
                                if (r->xid == xid) {
                                        find_xid = 1;
                                        switch (op) {
                                        case RDMA_MSG:
                                        case RDMA_NOMSG:
                                        case RDMA_MSGP:
                                                /*
                                                 * Hand the buffer to the
                                                 * waiter and wake it up.
                                                 */
                                                r->status = RDMA_SUCCESS;
                                                r->vaddr_cq = rwid->addr;
                                                r->bytes_xfer =
                                                    wc.wc_bytes_xfer;
                                                cv_signal(&r->wait_cv);
                                                break;
                                        default:
                                                /* unknown op: drop buffer */
                                                rib_rbuf_free(qptoc(qp),
                                                    RECV_BUFFER,
                                                    (void *)(uintptr_t)
                                                    rwid->addr);
                                                break;
                                        }
                                        break;
                                }
                        }
                        mutex_exit(&qp->replylist_lock);
                        if (find_xid == 0) {
                                /* RPC caller not waiting for reply */

                                DTRACE_PROBE1(rpcib__i__nomatchxid1,
                                    int, xid);

                                rib_rbuf_free(qptoc(qp), RECV_BUFFER,
                                    (void *)(uintptr_t)rwid->addr);
                        }
                } else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) {
                        CONN *conn = qptoc(qp);

                        /*
                         * Connection being flushed. Just free
                         * the posted buffer
                         */
                        rib_rbuf_free(conn, RECV_BUFFER,
                            (void *)(uintptr_t)rwid->addr);
                } else {
                        CONN *conn = qptoc(qp);
/*
 *  RC Recv Q Error Code                Local state     Remote State
 *  ====================                ===========     ============
 *  IBT_WC_LOCAL_ACCESS_ERR             ERROR           ERROR when NAK recvd
 *  IBT_WC_LOCAL_LEN_ERR                ERROR           ERROR when NAK recvd
 *  IBT_WC_LOCAL_PROTECT_ERR            ERROR           ERROR when NAK recvd
 *  IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           ERROR when NAK recvd
 *  IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR when NAK recvd
 *  IBT_WC_WR_FLUSHED_ERR               None            None
 */
                        /*
                         * Channel in error state. Set connection
                         * in ERROR state.
                         */
                        mutex_enter(&conn->c_lock);
                        if (conn->c_state != C_DISCONN_PEND)
                                conn->c_state = C_ERROR_CONN;
                        mutex_exit(&conn->c_lock);
                        rib_rbuf_free(conn, RECV_BUFFER,
                            (void *)(uintptr_t)rwid->addr);
                }
                rib_free_wid(rwid);
                rib_recv_rele(qp);
        }
}
1310 
/*
 * Server-side receive completion queue handler.
 *
 * For each successfully received buffer: decode the RPC/RDMA header
 * and drop messages whose header fails to decode or whose version is
 * not RPCRDMA_VERS.  An RDMA_DONE message wakes the thread waiting on
 * that xid.  Any other message is wrapped in an mblk and queued to the
 * krpc master transport via svc_queuereq() — but only while the plugin
 * is in ACCEPT state and the connection is C_CONNECTED; otherwise the
 * buffer is freed.  Every iteration releases the qp's receive
 * reference and the svc_recv wait id.
 */
/* ARGSUSED */
static void
rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
        rdma_recv_data_t *rdp;
        rib_qp_t        *qp;
        ibt_status_t    ibt_status;
        ibt_wc_t        wc;
        struct svc_recv *s_recvp;
        CONN            *conn;
        mblk_t          *mp;

        /*
         * Re-enable cq notify here to avoid missing any
         * completion queue notification.
         */
        (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);

        /* Poll one completion at a time until the CQ is drained. */
        ibt_status = IBT_SUCCESS;
        while (ibt_status != IBT_CQ_EMPTY) {
                bzero(&wc, sizeof (wc));
                ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
                if (ibt_status != IBT_SUCCESS)
                        return;

                /* wc_id carries the svc_recv posted with the buffer */
                s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id;
                qp = s_recvp->qp;
                conn = qptoc(qp);

                if (wc.wc_status == IBT_WC_SUCCESS) {
                        XDR     inxdrs, *xdrs;
                        uint_t  xid, vers, op;
                        uint32_t rdma_credit;

                        xdrs = &inxdrs;
                        /* s_recvp->vaddr stores data */
                        xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr,
                            wc.wc_bytes_xfer, XDR_DECODE);

                        /*
                         * Treat xid as opaque (xid is the first entity
                         * in the rpc rdma message).
                         */
                        xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr;
                        /* Skip xid and set the xdr position accordingly. */
                        XDR_SETPOS(xdrs, sizeof (uint32_t));
                        if (!xdr_u_int(xdrs, &vers) ||
                            !xdr_u_int(xdrs, &rdma_credit) ||
                            !xdr_u_int(xdrs, &op)) {
                                /* Malformed header: drop the message. */
                                rib_rbuf_free(conn, RECV_BUFFER,
                                    (void *)(uintptr_t)s_recvp->vaddr);
                                XDR_DESTROY(xdrs);
                                rib_recv_rele(qp);
                                (void) rib_free_svc_recv(s_recvp);
                                continue;
                        }
                        XDR_DESTROY(xdrs);

                        if (vers != RPCRDMA_VERS) {
                                /*
                                 * Invalid RPC/RDMA version.
                                 * Drop rpc rdma message.
                                 */
                                rib_rbuf_free(conn, RECV_BUFFER,
                                    (void *)(uintptr_t)s_recvp->vaddr);
                                rib_recv_rele(qp);
                                (void) rib_free_svc_recv(s_recvp);
                                continue;
                        }
                        /*
                         * Is this for RDMA_DONE?
                         */
                        if (op == RDMA_DONE) {
                                rib_rbuf_free(conn, RECV_BUFFER,
                                    (void *)(uintptr_t)s_recvp->vaddr);
                                /*
                                 * Wake up the thread waiting on
                                 * a RDMA_DONE for xid
                                 */
                                mutex_enter(&qp->rdlist_lock);
                                rdma_done_notify(qp, xid);
                                mutex_exit(&qp->rdlist_lock);
                                rib_recv_rele(qp);
                                (void) rib_free_svc_recv(s_recvp);
                                continue;
                        }

                        mutex_enter(&plugin_state_lock);
                        mutex_enter(&conn->c_lock);
                        if ((plugin_state == ACCEPT) &&
                            (conn->c_state == C_CONNECTED)) {
                                conn->c_ref++;
                                mutex_exit(&conn->c_lock);
                                /* Block (not fail) until an mblk exists. */
                                while ((mp = allocb(sizeof (*rdp), BPRI_LO))
                                    == NULL)
                                        (void) strwaitbuf(
                                            sizeof (*rdp), BPRI_LO);
                                /*
                                 * Plugin is in accept state, hence the master
                                 * transport queue for this is still accepting
                                 * requests. Hence we can call svc_queuereq to
                                 * queue this received msg.
                                 */
                                rdp = (rdma_recv_data_t *)mp->b_rptr;
                                rdp->conn = conn;
                                rdp->rpcmsg.addr =
                                    (caddr_t)(uintptr_t)s_recvp->vaddr;
                                rdp->rpcmsg.type = RECV_BUFFER;
                                rdp->rpcmsg.len = wc.wc_bytes_xfer;
                                rdp->status = wc.wc_status;
                                mp->b_wptr += sizeof (*rdp);
                                (void) svc_queuereq((queue_t *)rib_stat->q, mp,
                                    FALSE);
                                mutex_exit(&plugin_state_lock);
                        } else {
                                /*
                                 * The master transport for this is going
                                 * away and the queue is not accepting anymore
                                 * requests for krpc, so don't do anything, just
                                 * free the msg.
                                 */
                                mutex_exit(&conn->c_lock);
                                mutex_exit(&plugin_state_lock);
                                rib_rbuf_free(conn, RECV_BUFFER,
                                    (void *)(uintptr_t)s_recvp->vaddr);
                        }
                } else {
                        /* Receive failed: just free the posted buffer. */
                        rib_rbuf_free(conn, RECV_BUFFER,
                            (void *)(uintptr_t)s_recvp->vaddr);
                }
                rib_recv_rele(qp);
                (void) rib_free_svc_recv(s_recvp);
        }
}
1446 
1447 static void
1448 rib_attach_hca()
1449 {
1450         mutex_enter(&rib_stat->open_hca_lock);
1451         (void) rpcib_open_hcas(rib_stat);
1452         rib_listen(NULL);
1453         mutex_exit(&rib_stat->open_hca_lock);
1454 }
1455 
1456 /*
1457  * Handles DR event of IBT_HCA_DETACH_EVENT.
1458  */
1459 /* ARGSUSED */
1460 static void
1461 rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
1462         ibt_async_code_t code, ibt_async_event_t *event)
1463 {
1464         switch (code) {
1465         case IBT_HCA_ATTACH_EVENT:
1466                 rib_attach_hca();
1467                 break;
1468         case IBT_HCA_DETACH_EVENT:
1469                 rib_detach_hca(hca_hdl);
1470 #ifdef DEBUG
1471                 cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n");
1472 #endif
1473                 break;
1474         case IBT_EVENT_PORT_UP:
1475                 /*
1476                  * A port is up. We should call rib_listen() since there is
1477                  * a chance that rib_listen() may have failed during
1478                  * rib_attach_hca() because the port had not been up yet.
1479                  */
1480                 rib_listen(NULL);
1481 #ifdef DEBUG
1482                 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n");
1483 #endif
1484                 break;
1485 #ifdef DEBUG
1486         case IBT_EVENT_PATH_MIGRATED:
1487                 cmn_err(CE_NOTE, "rib_async_handler(): "
1488                     "IBT_EVENT_PATH_MIGRATED\n");
1489                 break;
1490         case IBT_EVENT_SQD:
1491                 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n");
1492                 break;
1493         case IBT_EVENT_COM_EST:
1494                 cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n");
1495                 break;
1496         case IBT_ERROR_CATASTROPHIC_CHAN:
1497                 cmn_err(CE_NOTE, "rib_async_handler(): "
1498                     "IBT_ERROR_CATASTROPHIC_CHAN\n");
1499                 break;
1500         case IBT_ERROR_INVALID_REQUEST_CHAN:
1501                 cmn_err(CE_NOTE, "rib_async_handler(): "
1502                     "IBT_ERROR_INVALID_REQUEST_CHAN\n");
1503                 break;
1504         case IBT_ERROR_ACCESS_VIOLATION_CHAN:
1505                 cmn_err(CE_NOTE, "rib_async_handler(): "
1506                     "IBT_ERROR_ACCESS_VIOLATION_CHAN\n");
1507                 break;
1508         case IBT_ERROR_PATH_MIGRATE_REQ:
1509                 cmn_err(CE_NOTE, "rib_async_handler(): "
1510                     "IBT_ERROR_PATH_MIGRATE_REQ\n");
1511                 break;
1512         case IBT_ERROR_CQ:
1513                 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n");
1514                 break;
1515         case IBT_ERROR_PORT_DOWN:
1516                 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n");
1517                 break;
1518         case IBT_ASYNC_OPAQUE1:
1519                 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n");
1520                 break;
1521         case IBT_ASYNC_OPAQUE2:
1522                 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n");
1523                 break;
1524         case IBT_ASYNC_OPAQUE3:
1525                 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n");
1526                 break;
1527         case IBT_ASYNC_OPAQUE4:
1528                 cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n");
1529                 break;
1530 #endif
1531         default:
1532                 break;
1533         }
1534 }
1535 
1536 /*
1537  * Client's reachable function.
1538  */
1539 static rdma_stat
1540 rib_reachable(int addr_type, struct netbuf *raddr, void **handle)
1541 {
1542         rdma_stat       status;
1543         rpcib_ping_t    rpt;
1544         struct netbuf   saddr;
1545         CONN            *conn;
1546 
1547         bzero(&saddr, sizeof (struct netbuf));
1548         status = rib_connect(&saddr, raddr, addr_type, &rpt, &conn);
1549 
1550         if (status == RDMA_SUCCESS) {
1551                 *handle = (void *)rpt.hca;
1552                 /* release the reference */
1553                 (void) rib_conn_release(conn);
1554                 return (RDMA_SUCCESS);
1555         } else {
1556                 *handle = NULL;
1557                 DTRACE_PROBE(rpcib__i__pingfailed);
1558                 return (RDMA_FAILED);
1559         }
1560 }
1561 
1562 /* Client side qp creation */
1563 static rdma_stat
1564 rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp)
1565 {
1566         rib_qp_t        *kqp = NULL;
1567         CONN            *conn;
1568         rdma_clnt_cred_ctrl_t *cc_info;
1569 
1570         ASSERT(qp != NULL);
1571         *qp = NULL;
1572 
1573         kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1574         conn = qptoc(kqp);
1575         kqp->hca = hca;
1576         kqp->rdmaconn.c_rdmamod = &rib_mod;
1577         kqp->rdmaconn.c_private = (caddr_t)kqp;
1578 
1579         kqp->mode = RIB_CLIENT;
1580         kqp->chan_flags = IBT_BLOCKING;
1581         conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP);
1582         bcopy(raddr->buf, conn->c_raddr.buf, raddr->len);
1583         conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len;
1584         /*
1585          * Initialize
1586          */
1587         cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1588         cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1589         mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1590         cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL);
1591         mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1592         mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock);
1593         mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1594         mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1595         cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1596         mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1597         /*
1598          * Initialize the client credit control
1599          * portion of the rdmaconn struct.
1600          */
1601         kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT;
1602         cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
1603         cc_info->clnt_cc_granted_ops = 0;
1604         cc_info->clnt_cc_in_flight_ops = 0;
1605         cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL);
1606 
1607         *qp = kqp;
1608         return (RDMA_SUCCESS);
1609 }
1610 
/*
 * Server side qp creation.
 *
 * Allocate a reliable-connected (RC) channel on 'hca' for HCA port
 * 'port' and wrap it in a rib_qp_t.  'q' is the server-only queue
 * pointer recorded in the qp.  On success, *qp holds the new qp,
 * already marked C_CONNECTED with the qp set as the channel's private
 * data for completion/CM callbacks.  On failure the qp memory is
 * freed and RDMA_FAILED is returned.
 */
static rdma_stat
rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp)
{
	rib_qp_t	*kqp = NULL;
	ibt_chan_sizes_t	chan_sizes;
	ibt_rc_chan_alloc_args_t	qp_attr;
	ibt_status_t		ibt_status;
	rdma_srv_cred_ctrl_t *cc_info;

	*qp = NULL;

	kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
	kqp->hca = hca;
	kqp->port_num = port;
	kqp->rdmaconn.c_rdmamod = &rib_mod;
	kqp->rdmaconn.c_private = (caddr_t)kqp;

	/*
	 * Create the qp handle.  The channel shares the server-side
	 * send/receive CQs and protection domain of the HCA.
	 */
	bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
	qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl;
	qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl;
	qp_attr.rc_pd = hca->pd_hdl;
	qp_attr.rc_hca_port_num = port;
	qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
	qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
	qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
	qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
	qp_attr.rc_clone_chan = NULL;
	qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
	qp_attr.rc_flags = IBT_WR_SIGNALED;

	/* Allocate the channel only while the HCA is still attached. */
	rw_enter(&hca->state_lock, RW_READER);
	if (hca->state != HCA_DETACHED) {
		ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
		    IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl,
		    &chan_sizes);
	} else {
		rw_exit(&hca->state_lock);
		goto fail;
	}
	rw_exit(&hca->state_lock);

	if (ibt_status != IBT_SUCCESS) {
		DTRACE_PROBE1(rpcib__i_svccreatechanfail,
		    int, ibt_status);
		goto fail;
	}

	kqp->mode = RIB_SERVER;
	kqp->chan_flags = IBT_BLOCKING;
	kqp->q = q;  /* server ONLY */

	cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
	mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
	cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
	mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
	mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
	cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
	/*
	 * Set the private data area to qp to be used in callbacks
	 */
	ibt_set_chan_private(kqp->qp_hdl, (void *)kqp);
	kqp->rdmaconn.c_state = C_CONNECTED;

	/*
	 * Initialize the server credit control
	 * portion of the rdmaconn struct.
	 */
	kqp->rdmaconn.c_cc_type = RDMA_CC_SRV;
	cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc;
	cc_info->srv_cc_buffers_granted = preposted_rbufs;
	cc_info->srv_cc_cur_buffers_used = 0;
	cc_info->srv_cc_posted = preposted_rbufs;

	*qp = kqp;

	return (RDMA_SUCCESS);
fail:
	if (kqp)
		kmem_free(kqp, sizeof (rib_qp_t));

	return (RDMA_FAILED);
}
1701 
/*
 * Client-side connection manager (CM) event handler, registered via
 * chan_args.oc_cm_handler in rib_conn_to_srv().  Only the
 * IBT_CM_EVENT_CONN_CLOSED event is acted upon: when the remote end
 * closed the channel, the connection is transitioned to error state
 * and torn down - immediately if no references remain, otherwise
 * deferred to the cleanup path via C_CLOSE_NOTNEEDED.  All events are
 * accepted.
 */
/* ARGSUSED */
ibt_cm_status_t
rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event,
    ibt_cm_return_args_t *ret_args, void *priv_data,
    ibt_priv_data_len_t len)
{
	rib_hca_t	*hca;

	/* clnt_hdl was set to the hca in chan_args.oc_cm_clnt_private */
	hca = (rib_hca_t *)clnt_hdl;

	switch (event->cm_type) {

	/* got a connection close event */
	case IBT_CM_EVENT_CONN_CLOSED:
	{
		CONN	*conn;
		rib_qp_t *qp;

		/* check reason why connection was closed */
		switch (event->cm_event.closed) {
		case IBT_CM_CLOSED_DREP_RCVD:
		case IBT_CM_CLOSED_DREQ_TIMEOUT:
		case IBT_CM_CLOSED_DUP:
		case IBT_CM_CLOSED_ABORT:
		case IBT_CM_CLOSED_ALREADY:
			/*
			 * These cases indicate the local end initiated
			 * the closing of the channel. Nothing to do here.
			 */
			break;
		default:
			/*
			 * Reason for CONN_CLOSED event must be one of
			 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
			 * or IBT_CM_CLOSED_STALE. These indicate cases were
			 * the remote end is closing the channel. In these
			 * cases free the channel and transition to error
			 * state
			 */
			qp = ibt_get_chan_private(event->cm_channel);
			conn = qptoc(qp);
			mutex_enter(&conn->c_lock);
			/* Teardown already in progress elsewhere; bail. */
			if (conn->c_state == C_DISCONN_PEND) {
				mutex_exit(&conn->c_lock);
				break;
			}

			conn->c_state = C_ERROR_CONN;

			/*
			 * Free the conn if c_ref is down to 0 already
			 */
			if (conn->c_ref == 0) {
				/*
				 * Remove from list and free conn
				 */
				conn->c_state = C_DISCONN_PEND;
				mutex_exit(&conn->c_lock);
				/* skip teardown if the HCA is going away */
				rw_enter(&hca->state_lock, RW_READER);
				if (hca->state != HCA_DETACHED)
					(void) rib_disconnect_channel(conn,
					    &hca->cl_conn_list);
				rw_exit(&hca->state_lock);
			} else {
				/*
				 * conn will be freed when c_ref goes to 0.
				 * Indicate to cleaning thread not to close
				 * the connection, but just free the channel.
				 */
				conn->c_flags |= C_CLOSE_NOTNEEDED;
				mutex_exit(&conn->c_lock);
			}
#ifdef DEBUG
			if (rib_debug)
				cmn_err(CE_NOTE, "rib_clnt_cm_handler: "
				    "(CONN_CLOSED) channel disconnected");
#endif
			break;
		}
		break;
	}
	default:
		break;
	}
	return (IBT_CM_ACCEPT);
}
1788 
1789 /*
1790  * Connect to the server.
1791  */
1792 rdma_stat
1793 rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, rpcib_ping_t *rptp)
1794 {
1795         ibt_chan_open_args_t    chan_args;      /* channel args */
1796         ibt_chan_sizes_t        chan_sizes;
1797         ibt_rc_chan_alloc_args_t        qp_attr;
1798         ibt_status_t            ibt_status;
1799         ibt_rc_returns_t        ret_args;       /* conn reject info */
1800         int refresh = REFRESH_ATTEMPTS; /* refresh if IBT_CM_CONN_STALE */
1801         ibt_ip_cm_info_t        ipcm_info;
1802         uint8_t cmp_ip_pvt[IBT_IP_HDR_PRIV_DATA_SZ];
1803 
1804 
1805         (void) bzero(&chan_args, sizeof (chan_args));
1806         (void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1807         (void) bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t));
1808 
1809         ipcm_info.src_addr.family = rptp->srcip.family;
1810         switch (ipcm_info.src_addr.family) {
1811         case AF_INET:
1812                 ipcm_info.src_addr.un.ip4addr = rptp->srcip.un.ip4addr;
1813                 break;
1814         case AF_INET6:
1815                 ipcm_info.src_addr.un.ip6addr = rptp->srcip.un.ip6addr;
1816                 break;
1817         }
1818 
1819         ipcm_info.dst_addr.family = rptp->srcip.family;
1820         switch (ipcm_info.dst_addr.family) {
1821         case AF_INET:
1822                 ipcm_info.dst_addr.un.ip4addr = rptp->dstip.un.ip4addr;
1823                 break;
1824         case AF_INET6:
1825                 ipcm_info.dst_addr.un.ip6addr = rptp->dstip.un.ip6addr;
1826                 break;
1827         }
1828 
1829         ipcm_info.src_port = (in_port_t)nfs_rdma_port;
1830 
1831         ibt_status = ibt_format_ip_private_data(&ipcm_info,
1832             IBT_IP_HDR_PRIV_DATA_SZ, cmp_ip_pvt);
1833 
1834         if (ibt_status != IBT_SUCCESS) {
1835                 cmn_err(CE_WARN, "ibt_format_ip_private_data failed\n");
1836                 return (-1);
1837         }
1838 
1839         qp_attr.rc_hca_port_num = rptp->path.pi_prim_cep_path.cep_hca_port_num;
1840         /* Alloc a RC channel */
1841         qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl;
1842         qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl;
1843         qp_attr.rc_pd = hca->pd_hdl;
1844         qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1845         qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1846         qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1847         qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1848         qp_attr.rc_clone_chan = NULL;
1849         qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1850         qp_attr.rc_flags = IBT_WR_SIGNALED;
1851 
1852         rptp->path.pi_sid = ibt_get_ip_sid(IPPROTO_TCP, nfs_rdma_port);
1853         chan_args.oc_path = &rptp->path;
1854 
1855         chan_args.oc_cm_handler = rib_clnt_cm_handler;
1856         chan_args.oc_cm_clnt_private = (void *)hca;
1857         chan_args.oc_rdma_ra_out = 4;
1858         chan_args.oc_rdma_ra_in = 4;
1859         chan_args.oc_path_retry_cnt = 2;
1860         chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES;
1861         chan_args.oc_priv_data = cmp_ip_pvt;
1862         chan_args.oc_priv_data_len = IBT_IP_HDR_PRIV_DATA_SZ;
1863 
1864 refresh:
1865         rw_enter(&hca->state_lock, RW_READER);
1866         if (hca->state != HCA_DETACHED) {
1867                 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1868                     IBT_ACHAN_NO_FLAGS,
1869                     &qp_attr, &qp->qp_hdl,
1870                     &chan_sizes);
1871         } else {
1872                 rw_exit(&hca->state_lock);
1873                 return (RDMA_FAILED);
1874         }
1875         rw_exit(&hca->state_lock);
1876 
1877         if (ibt_status != IBT_SUCCESS) {
1878                 DTRACE_PROBE1(rpcib__i_conntosrv,
1879                     int, ibt_status);
1880                 return (RDMA_FAILED);
1881         }
1882 
1883         /* Connect to the Server */
1884         (void) bzero(&ret_args, sizeof (ret_args));
1885         mutex_enter(&qp->cb_lock);
1886         ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS,
1887             IBT_BLOCKING, &chan_args, &ret_args);
1888         if (ibt_status != IBT_SUCCESS) {
1889                 DTRACE_PROBE2(rpcib__i_openrctosrv,
1890                     int, ibt_status, int, ret_args.rc_status);
1891 
1892                 (void) ibt_free_channel(qp->qp_hdl);
1893                 qp->qp_hdl = NULL;
1894                 mutex_exit(&qp->cb_lock);
1895                 if (refresh-- && ibt_status == IBT_CM_FAILURE &&
1896                     ret_args.rc_status == IBT_CM_CONN_STALE) {
1897                         /*
1898                          * Got IBT_CM_CONN_STALE probably because of stale
1899                          * data on the passive end of a channel that existed
1900                          * prior to reboot. Retry establishing a channel
1901                          * REFRESH_ATTEMPTS times, during which time the
1902                          * stale conditions on the server might clear up.
1903                          */
1904                         goto refresh;
1905                 }
1906                 return (RDMA_FAILED);
1907         }
1908         mutex_exit(&qp->cb_lock);
1909         /*
1910          * Set the private data area to qp to be used in callbacks
1911          */
1912         ibt_set_chan_private(qp->qp_hdl, (void *)qp);
1913         return (RDMA_SUCCESS);
1914 }
1915 
1916 rdma_stat
1917 rib_ping_srv(int addr_type, struct netbuf *raddr, rpcib_ping_t *rptp)
1918 {
1919         uint_t                  i, addr_count;
1920         ibt_status_t            ibt_status;
1921         uint8_t                 num_paths_p;
1922         ibt_ip_path_attr_t      ipattr;
1923         ibt_path_ip_src_t       srcip;
1924         rpcib_ipaddrs_t         addrs4;
1925         rpcib_ipaddrs_t         addrs6;
1926         struct sockaddr_in      *sinp;
1927         struct sockaddr_in6     *sin6p;
1928         rdma_stat               retval = RDMA_FAILED;
1929         rib_hca_t *hca;
1930 
1931         if ((addr_type != AF_INET) && (addr_type != AF_INET6))
1932                 return (RDMA_INVAL);
1933         ASSERT(raddr->buf != NULL);
1934 
1935         bzero(&ipattr, sizeof (ibt_ip_path_attr_t));
1936 
1937         if (!rpcib_get_ib_addresses(&addrs4, &addrs6) ||
1938             (addrs4.ri_count == 0 && addrs6.ri_count == 0)) {
1939                 retval = RDMA_FAILED;
1940                 goto done2;
1941         }
1942 
1943         if (addr_type == AF_INET) {
1944                 addr_count = addrs4.ri_count;
1945                 sinp = (struct sockaddr_in *)raddr->buf;
1946                 rptp->dstip.family = AF_INET;
1947                 rptp->dstip.un.ip4addr = sinp->sin_addr.s_addr;
1948                 sinp = addrs4.ri_list;
1949         } else {
1950                 addr_count = addrs6.ri_count;
1951                 sin6p = (struct sockaddr_in6 *)raddr->buf;
1952                 rptp->dstip.family = AF_INET6;
1953                 rptp->dstip.un.ip6addr = sin6p->sin6_addr;
1954                 sin6p = addrs6.ri_list;
1955         }
1956 
1957         rw_enter(&rib_stat->hcas_list_lock, RW_READER);
1958         for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
1959                 rw_enter(&hca->state_lock, RW_READER);
1960                 if (hca->state == HCA_DETACHED) {
1961                         rw_exit(&hca->state_lock);
1962                         continue;
1963                 }
1964 
1965                 ipattr.ipa_dst_ip       = &rptp->dstip;
1966                 ipattr.ipa_hca_guid     = hca->hca_guid;
1967                 ipattr.ipa_ndst         = 1;
1968                 ipattr.ipa_max_paths    = 1;
1969                 ipattr.ipa_src_ip.family = rptp->dstip.family;
1970                 for (i = 0; i < addr_count; i++) {
1971                         num_paths_p = 0;
1972                         if (addr_type == AF_INET) {
1973                                 ipattr.ipa_src_ip.un.ip4addr =
1974                                     sinp[i].sin_addr.s_addr;
1975                         } else {
1976                                 ipattr.ipa_src_ip.un.ip6addr =
1977                                     sin6p[i].sin6_addr;
1978                         }
1979                         bzero(&srcip, sizeof (ibt_path_ip_src_t));
1980 
1981                         ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl,
1982                             IBT_PATH_NO_FLAGS, &ipattr, &rptp->path,
1983                             &num_paths_p, &srcip);
1984                         if (ibt_status == IBT_SUCCESS &&
1985                             num_paths_p != 0 &&
1986                             rptp->path.pi_hca_guid == hca->hca_guid) {
1987                                 rptp->hca = hca;
1988                                 rw_exit(&hca->state_lock);
1989                                 if (addr_type == AF_INET) {
1990                                         rptp->srcip.family = AF_INET;
1991                                         rptp->srcip.un.ip4addr =
1992                                             srcip.ip_primary.un.ip4addr;
1993                                 } else {
1994                                         rptp->srcip.family = AF_INET6;
1995                                         rptp->srcip.un.ip6addr =
1996                                             srcip.ip_primary.un.ip6addr;
1997 
1998                                 }
1999                                 retval = RDMA_SUCCESS;
2000                                 goto done1;
2001                         }
2002                 }
2003                 rw_exit(&hca->state_lock);
2004         }
2005 done1:
2006         rw_exit(&rib_stat->hcas_list_lock);
2007 done2:
2008         if (addrs4.ri_size > 0)
2009                 kmem_free(addrs4.ri_list, addrs4.ri_size);
2010         if (addrs6.ri_size > 0)
2011                 kmem_free(addrs6.ri_list, addrs6.ri_size);
2012         return (retval);
2013 }
2014 
2015 /*
2016  * Close channel, remove from connection list and
2017  * free up resources allocated for that channel.
2018  */
2019 rdma_stat
2020 rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list)
2021 {
2022         rib_qp_t        *qp = ctoqp(conn);
2023         rib_hca_t       *hca;
2024 
2025         mutex_enter(&conn->c_lock);
2026         if (conn->c_timeout != NULL) {
2027                 mutex_exit(&conn->c_lock);
2028                 (void) untimeout(conn->c_timeout);
2029                 mutex_enter(&conn->c_lock);
2030         }
2031 
2032         while (conn->c_flags & C_CLOSE_PENDING) {
2033                 cv_wait(&conn->c_cv, &conn->c_lock);
2034         }
2035         mutex_exit(&conn->c_lock);
2036 
2037         /*
2038          * c_ref == 0 and connection is in C_DISCONN_PEND
2039          */
2040         hca = qp->hca;
2041         if (conn_list != NULL)
2042                 (void) rib_rm_conn(conn, conn_list);
2043 
2044         /*
2045          * There is only one case where we get here with
2046          * qp_hdl = NULL, which is during connection setup on
2047          * the client. In such a case there are no posted
2048          * send/recv buffers.
2049          */
2050         if (qp->qp_hdl != NULL) {
2051                 mutex_enter(&qp->posted_rbufs_lock);
2052                 while (qp->n_posted_rbufs)
2053                         cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock);
2054                 mutex_exit(&qp->posted_rbufs_lock);
2055 
2056                 mutex_enter(&qp->send_rbufs_lock);
2057                 while (qp->n_send_rbufs)
2058                         cv_wait(&qp->send_rbufs_cv, &qp->send_rbufs_lock);
2059                         mutex_exit(&qp->send_rbufs_lock);
2060 
2061                 (void) ibt_free_channel(qp->qp_hdl);
2062                         qp->qp_hdl = NULL;
2063         }
2064 
2065         ASSERT(qp->rdlist == NULL);
2066 
2067         if (qp->replylist != NULL) {
2068                 (void) rib_rem_replylist(qp);
2069         }
2070 
2071         cv_destroy(&qp->cb_conn_cv);
2072         cv_destroy(&qp->posted_rbufs_cv);
2073         cv_destroy(&qp->send_rbufs_cv);
2074         mutex_destroy(&qp->cb_lock);
2075         mutex_destroy(&qp->replylist_lock);
2076         mutex_destroy(&qp->posted_rbufs_lock);
2077         mutex_destroy(&qp->send_rbufs_lock);
2078         mutex_destroy(&qp->rdlist_lock);
2079 
2080         cv_destroy(&conn->c_cv);
2081         mutex_destroy(&conn->c_lock);
2082 
2083         if (conn->c_raddr.buf != NULL) {
2084                 kmem_free(conn->c_raddr.buf, conn->c_raddr.len);
2085         }
2086         if (conn->c_laddr.buf != NULL) {
2087                 kmem_free(conn->c_laddr.buf, conn->c_laddr.len);
2088         }
2089         if (conn->c_netid != NULL) {
2090                 kmem_free(conn->c_netid, (strlen(conn->c_netid) + 1));
2091         }
2092         if (conn->c_addrmask.buf != NULL) {
2093                 kmem_free(conn->c_addrmask.buf, conn->c_addrmask.len);
2094         }
2095 
2096         /*
2097          * Credit control cleanup.
2098          */
2099         if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) {
2100                 rdma_clnt_cred_ctrl_t *cc_info;
2101                 cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
2102                 cv_destroy(&cc_info->clnt_cc_cv);
2103         }
2104 
2105         kmem_free(qp, sizeof (rib_qp_t));
2106 
2107         /*
2108          * If HCA has been DETACHED and the srv/clnt_conn_list is NULL,
2109          * then the hca is no longer being used.
2110          */
2111         if (conn_list != NULL) {
2112                 rw_enter(&hca->state_lock, RW_READER);
2113                 if (hca->state == HCA_DETACHED) {
2114                         rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
2115                         if (hca->srv_conn_list.conn_hd == NULL) {
2116                                 rw_enter(&hca->cl_conn_list.conn_lock,
2117                                     RW_READER);
2118 
2119                                 if (hca->cl_conn_list.conn_hd == NULL) {
2120                                         mutex_enter(&hca->inuse_lock);
2121                                         hca->inuse = FALSE;
2122                                         cv_signal(&hca->cb_cv);
2123                                         mutex_exit(&hca->inuse_lock);
2124                                 }
2125                                 rw_exit(&hca->cl_conn_list.conn_lock);
2126                         }
2127                         rw_exit(&hca->srv_conn_list.conn_lock);
2128                 }
2129                 rw_exit(&hca->state_lock);
2130         }
2131 
2132         return (RDMA_SUCCESS);
2133 }
2134 
2135 /*
2136  * All sends are done under the protection of
2137  * the wdesc->sendwait_lock. n_send_rbufs count
2138  * is protected using the send_rbufs_lock.
2139  * lock ordering is:
2140  * sendwait_lock -> send_rbufs_lock
2141  */
2142 
2143 void
2144 rib_send_hold(rib_qp_t *qp)
2145 {
2146         mutex_enter(&qp->send_rbufs_lock);
2147         qp->n_send_rbufs++;
2148         mutex_exit(&qp->send_rbufs_lock);
2149 }
2150 
2151 void
2152 rib_send_rele(rib_qp_t *qp)
2153 {
2154         mutex_enter(&qp->send_rbufs_lock);
2155         qp->n_send_rbufs--;
2156         if (qp->n_send_rbufs == 0)
2157                 cv_signal(&qp->send_rbufs_cv);
2158         mutex_exit(&qp->send_rbufs_lock);
2159 }
2160 
2161 void
2162 rib_recv_rele(rib_qp_t *qp)
2163 {
2164         mutex_enter(&qp->posted_rbufs_lock);
2165         qp->n_posted_rbufs--;
2166         if (qp->n_posted_rbufs == 0)
2167                 cv_signal(&qp->posted_rbufs_cv);
2168         mutex_exit(&qp->posted_rbufs_lock);
2169 }
2170 
2171 /*
2172  * Wait for send completion notification. Only on receiving a
2173  * notification be it a successful or error completion, free the
2174  * send_wid.
2175  */
static rdma_stat
rib_sendwait(rib_qp_t *qp, struct send_wid *wd)
{
	clock_t timout, cv_wait_ret;
	rdma_stat error = RDMA_SUCCESS;
	int	i;

	/*
	 * Wait for send to complete.  wd->status is set by the send
	 * completion handler; SEND_WAIT means no completion yet.
	 */
	ASSERT(wd != NULL);
	mutex_enter(&wd->sendwait_lock);
	if (wd->status == (uint_t)SEND_WAIT) {
		/* absolute deadline, SEND_WAIT_TIME seconds from now */
		timout = drv_usectohz(SEND_WAIT_TIME * 1000000) +
		    ddi_get_lbolt();

		if (qp->mode == RIB_SERVER) {
			/* server threads are not signalable */
			while ((cv_wait_ret = cv_timedwait(&wd->wait_cv,
			    &wd->sendwait_lock, timout)) > 0 &&
			    wd->status == (uint_t)SEND_WAIT)
				;
			switch (cv_wait_ret) {
			case -1:	/* timeout */
				DTRACE_PROBE(rpcib__i__srvsendwait__timeout);

				wd->cv_sig = 0;		/* no signal needed */
				error = RDMA_TIMEDOUT;
				break;
			default:	/* got send completion */
				break;
			}
		} else {
			/* client side: also wake on a kernel signal */
			while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv,
			    &wd->sendwait_lock, timout)) > 0 &&
			    wd->status == (uint_t)SEND_WAIT)
				;
			switch (cv_wait_ret) {
			case -1:	/* timeout */
				DTRACE_PROBE(rpcib__i__clntsendwait__timeout);

				wd->cv_sig = 0;		/* no signal needed */
				error = RDMA_TIMEDOUT;
				break;
			case 0:		/* interrupted */
				DTRACE_PROBE(rpcib__i__clntsendwait__intr);

				wd->cv_sig = 0;		/* no signal needed */
				error = RDMA_INTR;
				break;
			default:	/* got send completion */
				break;
			}
		}
	}

	if (wd->status != (uint_t)SEND_WAIT) {
		/*
		 * Got send completion: map the completion status to an
		 * rdma_stat, free the send buffers and the wait id.
		 */
		if (wd->status != RDMA_SUCCESS) {
			switch (wd->status) {
			case RDMA_CONNLOST:
				error = RDMA_CONNLOST;
				break;
			default:
				error = RDMA_FAILED;
				break;
			}
		}
		for (i = 0; i < wd->nsbufs; i++) {
			rib_rbuf_free(qptoc(qp), SEND_BUFFER,
			    (void *)(uintptr_t)wd->sbufaddr[i]);
		}

		rib_send_rele(qp);

		mutex_exit(&wd->sendwait_lock);
		(void) rib_free_sendwait(wd);

	} else {
		/*
		 * Timed out / interrupted with no completion; the
		 * completion handler will free wd later (cv_sig was
		 * cleared above so it will not signal us).
		 */
		mutex_exit(&wd->sendwait_lock);
	}
	return (error);
}
2258 
2259 static struct send_wid *
2260 rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp)
2261 {
2262         struct send_wid *wd;
2263 
2264         wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP);
2265         wd->xid = xid;
2266         wd->cv_sig = cv_sig;
2267         wd->qp = qp;
2268         cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL);
2269         mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL);
2270         wd->status = (uint_t)SEND_WAIT;
2271 
2272         return (wd);
2273 }
2274 
2275 static int
2276 rib_free_sendwait(struct send_wid *wdesc)
2277 {
2278         cv_destroy(&wdesc->wait_cv);
2279         mutex_destroy(&wdesc->sendwait_lock);
2280         kmem_free(wdesc, sizeof (*wdesc));
2281 
2282         return (0);
2283 }
2284 
2285 static rdma_stat
2286 rib_rem_rep(rib_qp_t *qp, struct reply *rep)
2287 {
2288         mutex_enter(&qp->replylist_lock);
2289         if (rep != NULL) {
2290                 (void) rib_remreply(qp, rep);
2291                 mutex_exit(&qp->replylist_lock);
2292                 return (RDMA_SUCCESS);
2293         }
2294         mutex_exit(&qp->replylist_lock);
2295         return (RDMA_FAILED);
2296 }
2297 
/*
 * Build a single SEND work request from the chunk list 'cl' and post
 * it on the connection's RC channel.
 *
 * send_sig != 0: post a signaled send tracked by a send_wid, which is
 * returned to the caller through 'swid' (so 'swid' must be non-NULL).
 * cv_sig != 0: additionally block in rib_sendwait() until the send
 * completes, fails or times out.
 *
 * Send buffers are freed here only in case of error in posting
 * on QP. If the post succeeded, the send buffers are freed upon
 * send completion in rib_sendwait() or in the scq_handler.
 */
rdma_stat
rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid,
        int send_sig, int cv_sig, caddr_t *swid)
{
        struct send_wid *wdesc;
        struct clist    *clp;
        ibt_status_t    ibt_status = IBT_SUCCESS;
        rdma_stat       ret = RDMA_SUCCESS;
        ibt_send_wr_t   tx_wr;
        int             i, nds;
        ibt_wr_ds_t     sgl[DSEG_MAX];
        uint_t          total_msg_size;
        rib_qp_t        *qp;

        qp = ctoqp(conn);

        ASSERT(cl != NULL);

        bzero(&tx_wr, sizeof (ibt_send_wr_t));

        /* Translate the chunk list into a scatter/gather list. */
        nds = 0;
        total_msg_size = 0;     /* accumulated but not otherwise used here */
        clp = cl;
        while (clp != NULL) {
                if (nds >= DSEG_MAX) {
                        DTRACE_PROBE(rpcib__i__sendandwait_dsegmax_exceeded);
                        return (RDMA_FAILED);
                }
                sgl[nds].ds_va = clp->w.c_saddr;
                sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */
                sgl[nds].ds_len = clp->c_len;
                total_msg_size += clp->c_len;
                clp = clp->c_next;
                nds++;
        }

        if (send_sig) {
                /* Set SEND_SIGNAL flag. */
                tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
                wdesc = rib_init_sendwait(msgid, cv_sig, qp);
                *swid = (caddr_t)wdesc;
                tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
                /*
                 * sendwait_lock is held from here across the post; it is
                 * released below on both the error and success paths,
                 * after the buffer addresses have been recorded.
                 */
                mutex_enter(&wdesc->sendwait_lock);
                wdesc->nsbufs = nds;
                for (i = 0; i < nds; i++) {
                        wdesc->sbufaddr[i] = sgl[i].ds_va;
                }
        } else {
                /* Unsignaled send; no descriptor to wait on. */
                tx_wr.wr_flags = IBT_WR_NO_FLAGS;
                *swid = NULL;
                tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
        }

        tx_wr.wr_opcode = IBT_WRC_SEND;
        tx_wr.wr_trans = IBT_RC_SRV;
        tx_wr.wr_nds = nds;
        tx_wr.wr_sgl = sgl;

        /* Post only while the connection is up. */
        mutex_enter(&conn->c_lock);
        if (conn->c_state == C_CONNECTED) {
                ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
        }
        /*
         * ibt_status is examined only when the post above was actually
         * attempted (the || short-circuits on c_state first).
         */
        if (conn->c_state != C_CONNECTED ||
            ibt_status != IBT_SUCCESS) {
                if (conn->c_state != C_DISCONN_PEND)
                        conn->c_state = C_ERROR_CONN;
                mutex_exit(&conn->c_lock);
                if (send_sig) {
                        /* Post failed: return the send buffers here. */
                        for (i = 0; i < nds; i++) {
                                rib_rbuf_free(conn, SEND_BUFFER,
                                    (void *)(uintptr_t)wdesc->sbufaddr[i]);
                        }
                        mutex_exit(&wdesc->sendwait_lock);
                        (void) rib_free_sendwait(wdesc);
                }
                return (RDMA_CONNLOST);
        }

        mutex_exit(&conn->c_lock);

        if (send_sig) {
                rib_send_hold(qp);
                mutex_exit(&wdesc->sendwait_lock);
                if (cv_sig) {
                        /*
                         * cv_wait for send to complete.
                         * We can fail due to a timeout or signal or
                         * unsuccessful send.
                         */
                        ret = rib_sendwait(qp, wdesc);

                        return (ret);
                }
        }

        return (RDMA_SUCCESS);
}
2400 
2401 
2402 rdma_stat
2403 rib_send(CONN *conn, struct clist *cl, uint32_t msgid)
2404 {
2405         rdma_stat       ret;
2406         caddr_t         wd;
2407 
2408         /* send-wait & cv_signal */
2409         ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd);
2410         return (ret);
2411 }
2412 
2413 /*
2414  * Deprecated/obsolete interface not used currently
2415  * but earlier used for READ-READ protocol.
2416  * Send RPC reply and wait for RDMA_DONE.
2417  */
2418 rdma_stat
2419 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid)
2420 {
2421         rdma_stat ret = RDMA_SUCCESS;
2422         struct rdma_done_list *rd;
2423         clock_t cv_wait_ret;
2424         caddr_t *wid = NULL;
2425         rib_qp_t *qp = ctoqp(conn);
2426 
2427         mutex_enter(&qp->rdlist_lock);
2428         rd = rdma_done_add(qp, msgid);
2429 
2430         /* No cv_signal (whether send-wait or no-send-wait) */
2431         ret = rib_send_and_wait(conn, cl, msgid, 1, 0, wid);
2432 
2433         if (ret != RDMA_SUCCESS) {
2434                 rdma_done_rm(qp, rd);
2435         } else {
2436                 /*
2437                  * Wait for RDMA_DONE from remote end
2438                  */
2439                 cv_wait_ret = cv_reltimedwait(&rd->rdma_done_cv,
2440                     &qp->rdlist_lock, drv_usectohz(REPLY_WAIT_TIME * 1000000),
2441                     TR_CLOCK_TICK);
2442 
2443                 rdma_done_rm(qp, rd);
2444 
2445                 if (cv_wait_ret < 0) {
2446                         ret = RDMA_TIMEDOUT;
2447                 }
2448         }
2449 
2450         mutex_exit(&qp->rdlist_lock);
2451         return (ret);
2452 }
2453 
2454 static struct recv_wid *
2455 rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid)
2456 {
2457         struct recv_wid *rwid;
2458 
2459         rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP);
2460         rwid->xid = msgid;
2461         rwid->addr = sgl->ds_va;
2462         rwid->qp = qp;
2463 
2464         return (rwid);
2465 }
2466 
2467 static void
2468 rib_free_wid(struct recv_wid *rwid)
2469 {
2470         kmem_free(rwid, sizeof (struct recv_wid));
2471 }
2472 
/*
 * Client: post a single RECV buffer for the RPC reply identified by
 * msgid and enter a reply descriptor on the qp's reply wait list
 * (consumed later by rib_recv()).  Exactly one chunk is expected in
 * 'cl'.  On any failure, every buffer on the original chunk list is
 * returned to the RECV_BUFFER pool and an error status is returned.
 */
rdma_stat
rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid)
{
        rib_qp_t        *qp = ctoqp(conn);
        struct clist    *clp = cl;      /* head saved for the error path */
        struct reply    *rep;
        struct recv_wid *rwid;
        int             nds;
        ibt_wr_ds_t     sgl[DSEG_MAX];
        ibt_recv_wr_t   recv_wr;
        rdma_stat       ret;
        ibt_status_t    ibt_status;

        /*
         * rdma_clnt_postrecv uses RECV_BUFFER.
         */

        /* Translate the chunk list into a scatter/gather list. */
        nds = 0;
        while (cl != NULL) {
                if (nds >= DSEG_MAX) {
                        ret = RDMA_FAILED;
                        goto done;
                }
                sgl[nds].ds_va = cl->w.c_saddr;
                sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
                sgl[nds].ds_len = cl->c_len;
                cl = cl->c_next;
                nds++;
        }

        /* Only a single-segment receive is supported here. */
        if (nds != 1) {
                ret = RDMA_FAILED;
                goto done;
        }

        bzero(&recv_wr, sizeof (ibt_recv_wr_t));
        recv_wr.wr_nds = nds;
        recv_wr.wr_sgl = sgl;

        /* The work-request id carries the recv_wid descriptor. */
        rwid = rib_create_wid(qp, &sgl[0], msgid);
        if (rwid) {
                recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid;
        } else {
                ret = RDMA_NORESOURCE;
                goto done;
        }
        /* Queue a reply descriptor so rib_recv() can find this xid. */
        rep = rib_addreplylist(qp, msgid);
        if (!rep) {
                rib_free_wid(rwid);
                ret = RDMA_NORESOURCE;
                goto done;
        }

        mutex_enter(&conn->c_lock);

        if (conn->c_state == C_CONNECTED) {
                ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
        }

        /*
         * ibt_status is examined only when the post above was actually
         * attempted (the || short-circuits on c_state first).
         */
        if (conn->c_state != C_CONNECTED ||
            ibt_status != IBT_SUCCESS) {
                if (conn->c_state != C_DISCONN_PEND)
                        conn->c_state = C_ERROR_CONN;
                mutex_exit(&conn->c_lock);
                rib_free_wid(rwid);
                (void) rib_rem_rep(qp, rep);
                ret = RDMA_CONNLOST;
                goto done;
        }

        /* Account for the posted buffer while still holding c_lock. */
        mutex_enter(&qp->posted_rbufs_lock);
        qp->n_posted_rbufs++;
        mutex_exit(&qp->posted_rbufs_lock);

        mutex_exit(&conn->c_lock);
        return (RDMA_SUCCESS);

done:
        /* Failure: return every buffer on the caller's chunk list. */
        while (clp != NULL) {
                rib_rbuf_free(conn, RECV_BUFFER,
                    (void *)(uintptr_t)clp->w.c_saddr3);
                clp = clp->c_next;
        }
        return (ret);
}
2558 
2559 rdma_stat
2560 rib_svc_post(CONN* conn, struct clist *cl)
2561 {
2562         rib_qp_t        *qp = ctoqp(conn);
2563         struct svc_recv *s_recvp;
2564         int             nds;
2565         ibt_wr_ds_t     sgl[DSEG_MAX];
2566         ibt_recv_wr_t   recv_wr;
2567         ibt_status_t    ibt_status;
2568 
2569         nds = 0;
2570         while (cl != NULL) {
2571                 if (nds >= DSEG_MAX) {
2572                         return (RDMA_FAILED);
2573                 }
2574                 sgl[nds].ds_va = cl->w.c_saddr;
2575                 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2576                 sgl[nds].ds_len = cl->c_len;
2577                 cl = cl->c_next;
2578                 nds++;
2579         }
2580 
2581         if (nds != 1) {
2582                 rib_rbuf_free(conn, RECV_BUFFER,
2583                     (caddr_t)(uintptr_t)sgl[0].ds_va);
2584 
2585                 return (RDMA_FAILED);
2586         }
2587 
2588         bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2589         recv_wr.wr_nds = nds;
2590         recv_wr.wr_sgl = sgl;
2591 
2592         s_recvp = rib_init_svc_recv(qp, &sgl[0]);
2593         /* Use s_recvp's addr as wr id */
2594         recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp;
2595         mutex_enter(&conn->c_lock);
2596         if (conn->c_state == C_CONNECTED) {
2597                 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2598         }
2599         if (conn->c_state != C_CONNECTED ||
2600             ibt_status != IBT_SUCCESS) {
2601                 if (conn->c_state != C_DISCONN_PEND)
2602                         conn->c_state = C_ERROR_CONN;
2603                 mutex_exit(&conn->c_lock);
2604                 rib_rbuf_free(conn, RECV_BUFFER,
2605                     (caddr_t)(uintptr_t)sgl[0].ds_va);
2606                 (void) rib_free_svc_recv(s_recvp);
2607 
2608                 return (RDMA_CONNLOST);
2609         }
2610         mutex_exit(&conn->c_lock);
2611 
2612         return (RDMA_SUCCESS);
2613 }
2614 
2615 /* Client */
2616 rdma_stat
2617 rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid)
2618 {
2619         return (rib_clnt_post(conn, cl, msgid));
2620 }
2621 
2622 /* Client */
2623 rdma_stat
2624 rib_post_resp_remove(CONN* conn, uint32_t msgid)
2625 {
2626         rib_qp_t        *qp = ctoqp(conn);
2627         struct reply    *rep;
2628 
2629         mutex_enter(&qp->replylist_lock);
2630         for (rep = qp->replylist; rep != NULL; rep = rep->next) {
2631                 if (rep->xid == msgid) {
2632                         if (rep->vaddr_cq) {
2633                                 rib_rbuf_free(conn, RECV_BUFFER,
2634                                     (caddr_t)(uintptr_t)rep->vaddr_cq);
2635                         }
2636                         (void) rib_remreply(qp, rep);
2637                         break;
2638                 }
2639         }
2640         mutex_exit(&qp->replylist_lock);
2641 
2642         return (RDMA_SUCCESS);
2643 }
2644 
2645 /* Server */
2646 rdma_stat
2647 rib_post_recv(CONN *conn, struct clist *cl)
2648 {
2649         rib_qp_t        *qp = ctoqp(conn);
2650 
2651         if (rib_svc_post(conn, cl) == RDMA_SUCCESS) {
2652                 mutex_enter(&qp->posted_rbufs_lock);
2653                 qp->n_posted_rbufs++;
2654                 mutex_exit(&qp->posted_rbufs_lock);
2655                 return (RDMA_SUCCESS);
2656         }
2657         return (RDMA_FAILED);
2658 }
2659 
/*
 * Client side only interface to "recv" the rpc reply buf
 * posted earlier by rib_post_resp(conn, cl, msgid).
 *
 * Blocks (interruptibly) for up to REPLY_WAIT_TIME seconds for the
 * reply to arrive.  On success a one-element chunk list describing
 * the received buffer is returned through *clp; presumably the
 * caller then owns that RECV buffer and must release it — confirm
 * against callers.
 */
rdma_stat
rib_recv(CONN *conn, struct clist **clp, uint32_t msgid)
{
        struct reply *rep = NULL;
        clock_t timout, cv_wait_ret;
        rdma_stat ret = RDMA_SUCCESS;
        rib_qp_t *qp = ctoqp(conn);

        /*
         * Find the reply structure for this msgid
         */
        mutex_enter(&qp->replylist_lock);

        for (rep = qp->replylist; rep != NULL; rep = rep->next) {
                if (rep->xid == msgid)
                        break;
        }

        if (rep != NULL) {
                /*
                 * If message not yet received, wait.
                 */
                if (rep->status == (uint_t)REPLY_WAIT) {
                        timout = ddi_get_lbolt() +
                            drv_usectohz(REPLY_WAIT_TIME * 1000000);

                        /*
                         * Re-check rep->status after each wakeup; the
                         * loop guards against wakeups that occur while
                         * the reply is still pending.
                         */
                        while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv,
                            &qp->replylist_lock, timout)) > 0 &&
                            rep->status == (uint_t)REPLY_WAIT)
                                ;

                        switch (cv_wait_ret) {
                        case -1:        /* timeout */
                                ret = RDMA_TIMEDOUT;
                                break;
                        case 0:         /* interrupted by a signal */
                                ret = RDMA_INTR;
                                break;
                        default:
                                break;
                        }
                }

                if (rep->status == RDMA_SUCCESS) {
                        struct clist *cl = NULL;

                        /*
                         * Got message successfully
                         */
                        clist_add(&cl, 0, rep->bytes_xfer, NULL,
                            (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL);
                        *clp = cl;
                } else {
                        /*
                         * rep->status still REPLY_WAIT means no reply
                         * arrived (timeout/interrupt above); the buffer
                         * stays posted, so nothing is freed here.
                         */
                        if (rep->status != (uint_t)REPLY_WAIT) {
                                /*
                                 * Got error in reply message. Free
                                 * recv buffer here.
                                 */
                                ret = rep->status;
                                rib_rbuf_free(conn, RECV_BUFFER,
                                    (caddr_t)(uintptr_t)rep->vaddr_cq);
                        }
                }
                /* Reply descriptor is consumed in every outcome. */
                (void) rib_remreply(qp, rep);
        } else {
                /*
                 * No matching reply structure found for given msgid on the
                 * reply wait list.
                 */
                ret = RDMA_INVAL;
                DTRACE_PROBE(rpcib__i__nomatchxid2);
        }

        /*
         * Done.
         */
        mutex_exit(&qp->replylist_lock);
        return (ret);
}
2743 
/*
 * RDMA write a buffer to the remote address.
 *
 * Walks the chunk list issuing one RDMA WRITE per non-empty chunk.
 * When 'wait' is set, every write is signaled and waited on.
 * Otherwise writes are posted unsignaled until more than
 * max_unsignaled_rws have been issued, at which point one signaled
 * write is posted and waited on before continuing.
 */
rdma_stat
rib_write(CONN *conn, struct clist *cl, int wait)
{
        ibt_send_wr_t   tx_wr;
        int             cv_sig;
        ibt_wr_ds_t     sgl[DSEG_MAX];
        struct send_wid *wdesc;
        ibt_status_t    ibt_status;
        rdma_stat       ret = RDMA_SUCCESS;
        rib_qp_t        *qp = ctoqp(conn);
        uint64_t        n_writes = 0;

        if (cl == NULL) {
                return (RDMA_FAILED);
        }

        while ((cl != NULL)) {
                /* Zero-length chunks are skipped entirely. */
                if (cl->c_len > 0) {
                        bzero(&tx_wr, sizeof (ibt_send_wr_t));
                        /* Remote (destination) address and rkey. */
                        tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->u.c_daddr;
                        tx_wr.wr.rc.rcwr.rdma.rdma_rkey =
                            cl->c_dmemhandle.mrc_rmr; /* rkey */
                        /* Local (source) segment. */
                        sgl[0].ds_va = cl->w.c_saddr;
                        sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
                        sgl[0].ds_len = cl->c_len;

                        /*
                         * Decide whether this write is signaled: always
                         * when 'wait' is requested, otherwise only after
                         * more than max_unsignaled_rws unsignaled writes
                         * have accumulated.
                         */
                        if (wait) {
                                cv_sig = 1;
                        } else {
                                if (n_writes > max_unsignaled_rws) {
                                        n_writes = 0;
                                        cv_sig = 1;
                                } else {
                                        cv_sig = 0;
                                }
                        }

                        if (cv_sig) {
                                tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
                                wdesc = rib_init_sendwait(0, cv_sig, qp);
                                tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
                                /* Held across the post; released below. */
                                mutex_enter(&wdesc->sendwait_lock);
                        } else {
                                tx_wr.wr_flags = IBT_WR_NO_FLAGS;
                                tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
                        }
                        tx_wr.wr_opcode = IBT_WRC_RDMAW;
                        tx_wr.wr_trans = IBT_RC_SRV;
                        tx_wr.wr_nds = 1;
                        tx_wr.wr_sgl = sgl;

                        /* Post only while the connection is up. */
                        mutex_enter(&conn->c_lock);
                        if (conn->c_state == C_CONNECTED) {
                                ibt_status =
                                    ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
                        }
                        if (conn->c_state != C_CONNECTED ||
                            ibt_status != IBT_SUCCESS) {
                                if (conn->c_state != C_DISCONN_PEND)
                                        conn->c_state = C_ERROR_CONN;
                                mutex_exit(&conn->c_lock);
                                if (cv_sig) {
                                        mutex_exit(&wdesc->sendwait_lock);
                                        (void) rib_free_sendwait(wdesc);
                                }
                                return (RDMA_CONNLOST);
                        }

                        mutex_exit(&conn->c_lock);

                        /*
                         * Wait for send to complete
                         */
                        if (cv_sig) {

                                rib_send_hold(qp);
                                mutex_exit(&wdesc->sendwait_lock);

                                ret = rib_sendwait(qp, wdesc);
                                if (ret != 0)
                                        return (ret);
                        }
                        n_writes ++;
                }
                cl = cl->c_next;
        }
        return (RDMA_SUCCESS);
}
2835 
/*
 * RDMA Read a buffer from the remote address.
 *
 * Issues one RDMA READ per chunk in the list.  When 'wait' is set,
 * only the last chunk's read is signaled and waited on; its
 * completion implies all earlier reads posted here have completed.
 */
rdma_stat
rib_read(CONN *conn, struct clist *cl, int wait)
{
        ibt_send_wr_t   rx_wr;
        int             cv_sig = 0;
        ibt_wr_ds_t     sgl;
        struct send_wid *wdesc;
        ibt_status_t    ibt_status = IBT_SUCCESS;
        rdma_stat       ret = RDMA_SUCCESS;
        rib_qp_t        *qp = ctoqp(conn);

        if (cl == NULL) {
                return (RDMA_FAILED);
        }

        while (cl != NULL) {
                bzero(&rx_wr, sizeof (ibt_send_wr_t));
                /*
                 * Remote address is at the head chunk item in list.
                 */
                rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->w.c_saddr;
                rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr;

                /* Local (destination) segment. */
                sgl.ds_va = cl->u.c_daddr;
                sgl.ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */
                sgl.ds_len = cl->c_len;

                /*
                 * If there are multiple chunks to be read, and
                 * wait is set, ask for signal only for the last chunk
                 * and wait only on the last chunk. The completion of
                 * RDMA_READ on last chunk ensures that reads on all
                 * previous chunks are also completed.
                 */
                if (wait && (cl->c_next == NULL)) {
                        cv_sig = 1;
                        wdesc = rib_init_sendwait(0, cv_sig, qp);
                        rx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
                        rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
                        /* Held across the post; released below. */
                        mutex_enter(&wdesc->sendwait_lock);
                } else {
                        rx_wr.wr_flags = IBT_WR_NO_FLAGS;
                        rx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
                }
                rx_wr.wr_opcode = IBT_WRC_RDMAR;
                rx_wr.wr_trans = IBT_RC_SRV;
                rx_wr.wr_nds = 1;
                rx_wr.wr_sgl = &sgl;

                /* Post only while the connection is up. */
                mutex_enter(&conn->c_lock);
                if (conn->c_state == C_CONNECTED) {
                        ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL);
                }
                if (conn->c_state != C_CONNECTED ||
                    ibt_status != IBT_SUCCESS) {
                        if (conn->c_state != C_DISCONN_PEND)
                                conn->c_state = C_ERROR_CONN;
                        mutex_exit(&conn->c_lock);
                        /* wdesc exists only on the signaled last chunk. */
                        if (wait && (cl->c_next == NULL)) {
                                mutex_exit(&wdesc->sendwait_lock);
                                (void) rib_free_sendwait(wdesc);
                        }
                        return (RDMA_CONNLOST);
                }

                mutex_exit(&conn->c_lock);

                /*
                 * Wait for send to complete if this is the
                 * last item in the list.
                 */
                if (wait && cl->c_next == NULL) {
                        rib_send_hold(qp);
                        mutex_exit(&wdesc->sendwait_lock);

                        ret = rib_sendwait(qp, wdesc);

                        if (ret != 0)
                                return (ret);
                }
                cl = cl->c_next;
        }
        return (RDMA_SUCCESS);
}
2923 
2924 /*
2925  * rib_srv_cm_handler()
2926  *    Connection Manager callback to handle RC connection requests.
2927  */
2928 /* ARGSUSED */
2929 static ibt_cm_status_t
2930 rib_srv_cm_handler(void *any, ibt_cm_event_t *event,
2931         ibt_cm_return_args_t *ret_args, void *priv_data,
2932         ibt_priv_data_len_t len)
2933 {
2934         queue_t         *q;
2935         rib_qp_t        *qp;
2936         rib_hca_t       *hca;
2937         rdma_stat       status = RDMA_SUCCESS;
2938         int             i;
2939         struct clist    cl;
2940         rdma_buf_t      rdbuf = {0};
2941         void            *buf = NULL;
2942         CONN            *conn;
2943         ibt_ip_cm_info_t        ipinfo;
2944         struct sockaddr_in *s;
2945         struct sockaddr_in6 *s6;
2946         int sin_size = sizeof (struct sockaddr_in);
2947         int in_size = sizeof (struct in_addr);
2948         int sin6_size = sizeof (struct sockaddr_in6);
2949 
2950         ASSERT(any != NULL);
2951         ASSERT(event != NULL);
2952 
2953         hca = (rib_hca_t *)any;
2954 
2955         /* got a connection request */
2956         switch (event->cm_type) {
2957         case IBT_CM_EVENT_REQ_RCV:
2958                 /*
2959                  * If the plugin is in the NO_ACCEPT state, bail out.
2960                  */
2961                 mutex_enter(&plugin_state_lock);
2962                 if (plugin_state == NO_ACCEPT) {
2963                         mutex_exit(&plugin_state_lock);
2964                         return (IBT_CM_REJECT);
2965                 }
2966                 mutex_exit(&plugin_state_lock);
2967 
2968                 /*
2969                  * Need to send a MRA MAD to CM so that it does not
2970                  * timeout on us.
2971                  */
2972                 (void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id,
2973                     event->cm_event.req.req_timeout * 8, NULL, 0);
2974 
2975                 mutex_enter(&rib_stat->open_hca_lock);
2976                 q = rib_stat->q;
2977                 mutex_exit(&rib_stat->open_hca_lock);
2978 
2979                 status = rib_svc_create_chan(hca, (caddr_t)q,
2980                     event->cm_event.req.req_prim_hca_port, &qp);
2981 
2982                 if (status) {
2983                         return (IBT_CM_REJECT);
2984                 }
2985 
2986                 ret_args->cm_ret.rep.cm_channel = qp->qp_hdl;
2987                 ret_args->cm_ret.rep.cm_rdma_ra_out = 4;
2988                 ret_args->cm_ret.rep.cm_rdma_ra_in = 4;
2989                 ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES;
2990 
2991                 /*
2992                  * Pre-posts RECV buffers
2993                  */
2994                 conn = qptoc(qp);
2995                 for (i = 0; i < preposted_rbufs; i++) {
2996                         bzero(&rdbuf, sizeof (rdbuf));
2997                         rdbuf.type = RECV_BUFFER;
2998                         buf = rib_rbuf_alloc(conn, &rdbuf);
2999                         if (buf == NULL) {
3000                                 /*
3001                                  * A connection is not established yet.
3002                                  * Just flush the channel. Buffers
3003                                  * posted till now will error out with
3004                                  * IBT_WC_WR_FLUSHED_ERR.
3005                                  */
3006                                 (void) ibt_flush_channel(qp->qp_hdl);
3007                                 (void) rib_disconnect_channel(conn, NULL);
3008                                 return (IBT_CM_REJECT);
3009                         }
3010 
3011                         bzero(&cl, sizeof (cl));
3012                         cl.w.c_saddr3 = (caddr_t)rdbuf.addr;
3013                         cl.c_len = rdbuf.len;
3014                         cl.c_smemhandle.mrc_lmr =
3015                             rdbuf.handle.mrc_lmr; /* lkey */
3016                         cl.c_next = NULL;
3017                         status = rib_post_recv(conn, &cl);
3018                         if (status != RDMA_SUCCESS) {
3019                                 /*
3020                                  * A connection is not established yet.
3021                                  * Just flush the channel. Buffers
3022                                  * posted till now will error out with
3023                                  * IBT_WC_WR_FLUSHED_ERR.
3024                                  */
3025                                 (void) ibt_flush_channel(qp->qp_hdl);
3026                                 (void) rib_disconnect_channel(conn, NULL);
3027                                 return (IBT_CM_REJECT);
3028                         }
3029                 }
3030                 (void) rib_add_connlist(conn, &hca->srv_conn_list);
3031 
3032                 /*
3033                  * Get the address translation
3034                  */
3035                 rw_enter(&hca->state_lock, RW_READER);
3036                 if (hca->state == HCA_DETACHED) {
3037                         rw_exit(&hca->state_lock);
3038                         return (IBT_CM_REJECT);
3039                 }
3040                 rw_exit(&hca->state_lock);
3041 
3042                 bzero(&ipinfo, sizeof (ibt_ip_cm_info_t));
3043 
3044                 if (ibt_get_ip_data(event->cm_priv_data_len,
3045                     event->cm_priv_data,
3046                     &ipinfo) != IBT_SUCCESS) {
3047 
3048                         return (IBT_CM_REJECT);
3049                 }
3050 
3051                 switch (ipinfo.src_addr.family) {
3052                 case AF_INET:
3053 
3054                         conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1,
3055                             KM_SLEEP);
3056                         (void) strcpy(conn->c_netid, RIBNETID_TCP);
3057 
3058                         conn->c_raddr.maxlen =
3059                             conn->c_raddr.len = sin_size;
3060                         conn->c_raddr.buf = kmem_zalloc(sin_size, KM_SLEEP);
3061 
3062                         s = (struct sockaddr_in *)conn->c_raddr.buf;
3063                         s->sin_family = AF_INET;
3064                         bcopy((void *)&ipinfo.src_addr.un.ip4addr,
3065                             &s->sin_addr, in_size);
3066 
3067                         conn->c_laddr.maxlen =
3068                             conn->c_laddr.len = sin_size;
3069                         conn->c_laddr.buf = kmem_zalloc(sin_size, KM_SLEEP);
3070 
3071                         s = (struct sockaddr_in *)conn->c_laddr.buf;
3072                         s->sin_family = AF_INET;
3073                         bcopy((void *)&ipinfo.dst_addr.un.ip4addr,
3074                             &s->sin_addr, in_size);
3075 
3076                         conn->c_addrmask.maxlen = conn->c_addrmask.len =
3077                             sizeof (struct sockaddr_in);
3078                         conn->c_addrmask.buf =
3079                             kmem_zalloc(conn->c_addrmask.len, KM_SLEEP);
3080                         ((struct sockaddr_in *)
3081                             conn->c_addrmask.buf)->sin_addr.s_addr =
3082                             (uint32_t)~0;
3083                         ((struct sockaddr_in *)
3084                             conn->c_addrmask.buf)->sin_family =
3085                             (sa_family_t)~0;
3086                         break;
3087 
3088                 case AF_INET6:
3089 
3090                         conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1,
3091                             KM_SLEEP);
3092                         (void) strcpy(conn->c_netid, RIBNETID_TCP6);
3093 
3094                         conn->c_raddr.maxlen =
3095                             conn->c_raddr.len = sin6_size;
3096                         conn->c_raddr.buf = kmem_zalloc(sin6_size, KM_SLEEP);
3097 
3098                         s6 = (struct sockaddr_in6 *)conn->c_raddr.buf;
3099                         s6->sin6_family = AF_INET6;
3100                         bcopy((void *)&ipinfo.src_addr.un.ip6addr,
3101                             &s6->sin6_addr,
3102                             sizeof (struct in6_addr));
3103 
3104                         conn->c_laddr.maxlen =
3105                             conn->c_laddr.len = sin6_size;
3106                         conn->c_laddr.buf = kmem_zalloc(sin6_size, KM_SLEEP);
3107 
3108                         s6 = (struct sockaddr_in6 *)conn->c_laddr.buf;
3109                         s6->sin6_family = AF_INET6;
3110                         bcopy((void *)&ipinfo.dst_addr.un.ip6addr,
3111                             &s6->sin6_addr,
3112                             sizeof (struct in6_addr));
3113 
3114                         conn->c_addrmask.maxlen = conn->c_addrmask.len =
3115                             sizeof (struct sockaddr_in6);
3116                         conn->c_addrmask.buf =
3117                             kmem_zalloc(conn->c_addrmask.len, KM_SLEEP);
3118                         (void) memset(&((struct sockaddr_in6 *)
3119                             conn->c_addrmask.buf)->sin6_addr, (uchar_t)~0,
3120                             sizeof (struct in6_addr));
3121                         ((struct sockaddr_in6 *)
3122                             conn->c_addrmask.buf)->sin6_family =
3123                             (sa_family_t)~0;
3124                         break;
3125 
3126                 default:
3127                         return (IBT_CM_REJECT);
3128                 }
3129 
3130                 break;
3131 
3132         case IBT_CM_EVENT_CONN_CLOSED:
3133         {
3134                 CONN            *conn;
3135                 rib_qp_t        *qp;
3136 
3137                 switch (event->cm_event.closed) {
3138                 case IBT_CM_CLOSED_DREP_RCVD:
3139                 case IBT_CM_CLOSED_DREQ_TIMEOUT:
3140                 case IBT_CM_CLOSED_DUP:
3141                 case IBT_CM_CLOSED_ABORT:
3142                 case IBT_CM_CLOSED_ALREADY:
3143                         /*
3144                          * These cases indicate the local end initiated
3145                          * the closing of the channel. Nothing to do here.
3146                          */
3147                         break;
3148                 default:
3149                         /*
3150                          * Reason for CONN_CLOSED event must be one of
3151                          * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
                         * or IBT_CM_CLOSED_STALE. These indicate cases where
3153                          * the remote end is closing the channel. In these
3154                          * cases free the channel and transition to error
3155                          * state
3156                          */
3157                         qp = ibt_get_chan_private(event->cm_channel);
3158                         conn = qptoc(qp);
3159                         mutex_enter(&conn->c_lock);
3160                         if (conn->c_state == C_DISCONN_PEND) {
3161                                 mutex_exit(&conn->c_lock);
3162                                 break;
3163                         }
3164                         conn->c_state = C_ERROR_CONN;
3165 
3166                         /*
3167                          * Free the conn if c_ref goes down to 0
3168                          */
3169                         if (conn->c_ref == 0) {
3170                                 /*
3171                                  * Remove from list and free conn
3172                                  */
3173                                 conn->c_state = C_DISCONN_PEND;
3174                                 mutex_exit(&conn->c_lock);
3175                                 (void) rib_disconnect_channel(conn,
3176                                     &hca->srv_conn_list);
3177                         } else {
3178                                 /*
3179                                  * conn will be freed when c_ref goes to 0.
3180                                  * Indicate to cleaning thread not to close
3181                                  * the connection, but just free the channel.
3182                                  */
3183                                 conn->c_flags |= C_CLOSE_NOTNEEDED;
3184                                 mutex_exit(&conn->c_lock);
3185                         }
3186                         DTRACE_PROBE(rpcib__i__srvcm_chandisconnect);
3187                         break;
3188                 }
3189                 break;
3190         }
3191         case IBT_CM_EVENT_CONN_EST:
3192                 /*
3193                  * RTU received, hence connection established.
3194                  */
3195                 if (rib_debug > 1)
3196                         cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3197                             "(CONN_EST) channel established");
3198                 break;
3199 
3200         default:
3201                 if (rib_debug > 2) {
3202                         /* Let CM handle the following events. */
3203                         if (event->cm_type == IBT_CM_EVENT_REP_RCV) {
3204                                 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3205                                     "server recv'ed IBT_CM_EVENT_REP_RCV\n");
3206                         } else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) {
3207                                 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3208                                     "server recv'ed IBT_CM_EVENT_LAP_RCV\n");
3209                         } else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) {
3210                                 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3211                                     "server recv'ed IBT_CM_EVENT_MRA_RCV\n");
3212                         } else if (event->cm_type == IBT_CM_EVENT_APR_RCV) {
3213                                 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3214                                     "server recv'ed IBT_CM_EVENT_APR_RCV\n");
3215                         } else if (event->cm_type == IBT_CM_EVENT_FAILURE) {
3216                                 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3217                                     "server recv'ed IBT_CM_EVENT_FAILURE\n");
3218                         }
3219                 }
3220                 return (IBT_CM_DEFAULT);
3221         }
3222 
3223         /* accept all other CM messages (i.e. let the CM handle them) */
3224         return (IBT_CM_ACCEPT);
3225 }
3226 
3227 static rdma_stat
3228 rib_register_service(rib_hca_t *hca, int service_type,
3229         uint8_t protocol_num, in_port_t dst_port)
3230 {
3231         ibt_srv_desc_t          sdesc;
3232         ibt_hca_portinfo_t      *port_infop;
3233         ib_svc_id_t             srv_id;
3234         ibt_srv_hdl_t           srv_hdl;
3235         uint_t                  port_size;
3236         uint_t                  pki, i, num_ports, nbinds;
3237         ibt_status_t            ibt_status;
3238         rib_service_t           *service;
3239         ib_pkey_t               pkey;
3240 
3241         /*
3242          * Query all ports for the given HCA
3243          */
3244         rw_enter(&hca->state_lock, RW_READER);
3245         if (hca->state != HCA_DETACHED) {
3246                 ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop,
3247                     &num_ports, &port_size);
3248                 rw_exit(&hca->state_lock);
3249         } else {
3250                 rw_exit(&hca->state_lock);
3251                 return (RDMA_FAILED);
3252         }
3253         if (ibt_status != IBT_SUCCESS) {
3254                 return (RDMA_FAILED);
3255         }
3256 
3257         DTRACE_PROBE1(rpcib__i__regservice_numports,
3258             int, num_ports);
3259 
3260         for (i = 0; i < num_ports; i++) {
3261                 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) {
3262                         DTRACE_PROBE1(rpcib__i__regservice__portinactive,
3263                             int, i+1);
3264                 } else if (port_infop[i].p_linkstate == IBT_PORT_ACTIVE) {
3265                         DTRACE_PROBE1(rpcib__i__regservice__portactive,
3266                             int, i+1);
3267                 }
3268         }
3269 
3270         /*
3271          * Get all the IP addresses on this system to register the
3272          * given "service type" on all DNS recognized IP addrs.
3273          * Each service type such as NFS will have all the systems
3274          * IP addresses as its different names. For now the only
3275          * type of service we support in RPCIB is NFS.
3276          */
3277         rw_enter(&rib_stat->service_list_lock, RW_WRITER);
3278         /*
3279          * Start registering and binding service to active
3280          * on active ports on this HCA.
3281          */
3282         nbinds = 0;
3283         for (service = rib_stat->service_list;
3284             service && (service->srv_type != service_type);
3285             service = service->next)
3286                 ;
3287 
3288         if (service == NULL) {
3289                 /*
3290                  * We use IP addresses as the service names for
3291                  * service registration.  Register each of them
3292                  * with CM to obtain a svc_id and svc_hdl.  We do not
3293                  * register the service with machine's loopback address.
3294                  */
3295                 (void) bzero(&srv_id, sizeof (ib_svc_id_t));
3296                 (void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t));
3297                 (void) bzero(&sdesc, sizeof (ibt_srv_desc_t));
3298                 sdesc.sd_handler = rib_srv_cm_handler;
3299                 sdesc.sd_flags = 0;
3300                 ibt_status = ibt_register_service(hca->ibt_clnt_hdl,
3301                     &sdesc, ibt_get_ip_sid(protocol_num, dst_port),
3302                     1, &srv_hdl, &srv_id);
3303                 if ((ibt_status != IBT_SUCCESS) &&
3304                     (ibt_status != IBT_CM_SERVICE_EXISTS)) {
3305                         rw_exit(&rib_stat->service_list_lock);
3306                         DTRACE_PROBE1(rpcib__i__regservice__ibtres,
3307                             int, ibt_status);
3308                         ibt_free_portinfo(port_infop, port_size);
3309                         return (RDMA_FAILED);
3310                 }
3311 
3312                 /*
3313                  * Allocate and prepare a service entry
3314                  */
3315                 service = kmem_zalloc(sizeof (rib_service_t), KM_SLEEP);
3316 
3317                 service->srv_type = service_type;
3318                 service->srv_hdl = srv_hdl;
3319                 service->srv_id = srv_id;
3320 
3321                 service->next = rib_stat->service_list;
3322                 rib_stat->service_list = service;
3323                 DTRACE_PROBE1(rpcib__i__regservice__new__service,
3324                     int, service->srv_type);
3325         } else {
3326                 srv_hdl = service->srv_hdl;
3327                 srv_id = service->srv_id;
3328                 DTRACE_PROBE1(rpcib__i__regservice__existing__service,
3329                     int, service->srv_type);
3330         }
3331 
3332         for (i = 0; i < num_ports; i++) {
3333                 ibt_sbind_hdl_t         sbp;
3334                 rib_hca_service_t       *hca_srv;
3335                 ib_gid_t                gid;
3336 
3337                 if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE)
3338                         continue;
3339 
3340                 for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) {
3341                         pkey = port_infop[i].p_pkey_tbl[pki];
3342 
3343                         rw_enter(&hca->bound_services_lock, RW_READER);
3344                         gid = port_infop[i].p_sgid_tbl[0];
3345                         for (hca_srv = hca->bound_services; hca_srv;
3346                             hca_srv = hca_srv->next) {
3347                                 if ((hca_srv->srv_id == service->srv_id) &&
3348                                     (hca_srv->gid.gid_prefix ==
3349                                     gid.gid_prefix) &&
3350                                     (hca_srv->gid.gid_guid == gid.gid_guid))
3351                                         break;
3352                         }
3353                         rw_exit(&hca->bound_services_lock);
3354                         if (hca_srv != NULL) {
3355                                 /*
3356                                  * port is alreay bound the the service
3357                                  */
3358                                 DTRACE_PROBE1(
3359                                     rpcib__i__regservice__already__bound,
3360                                     int, i+1);
3361                                 nbinds++;
3362                                 continue;
3363                         }
3364 
3365                         if ((pkey & IBSRM_HB) &&
3366                             (pkey != IB_PKEY_INVALID_FULL)) {
3367 
3368                                 sbp = NULL;
3369                                 ibt_status = ibt_bind_service(srv_hdl,
3370                                     gid, NULL, hca, &sbp);
3371 
3372                                 if (ibt_status == IBT_SUCCESS) {
3373                                         hca_srv = kmem_zalloc(
3374                                             sizeof (rib_hca_service_t),
3375                                             KM_SLEEP);
3376                                         hca_srv->srv_id = srv_id;
3377                                         hca_srv->gid = gid;
3378                                         hca_srv->sbind_hdl = sbp;
3379 
3380                                         rw_enter(&hca->bound_services_lock,
3381                                             RW_WRITER);
3382                                         hca_srv->next = hca->bound_services;
3383                                         hca->bound_services = hca_srv;
3384                                         rw_exit(&hca->bound_services_lock);
3385                                         nbinds++;
3386                                 }
3387 
3388                                 DTRACE_PROBE1(rpcib__i__regservice__bindres,
3389                                     int, ibt_status);
3390                         }
3391                 }
3392         }
3393         rw_exit(&rib_stat->service_list_lock);
3394 
3395         ibt_free_portinfo(port_infop, port_size);
3396 
3397         if (nbinds == 0) {
3398                 return (RDMA_FAILED);
3399         } else {
3400                 /*
3401                  * Put this plugin into accept state, since atleast
3402                  * one registration was successful.
3403                  */
3404                 mutex_enter(&plugin_state_lock);
3405                 plugin_state = ACCEPT;
3406                 mutex_exit(&plugin_state_lock);
3407                 return (RDMA_SUCCESS);
3408         }
3409 }
3410 
3411 void
3412 rib_listen(struct rdma_svc_data *rd)
3413 {
3414         rdma_stat status;
3415         int n_listening = 0;
3416         rib_hca_t *hca;
3417 
3418         mutex_enter(&rib_stat->listen_lock);
3419         /*
3420          * if rd parameter is NULL then it means that rib_stat->q is
3421          * already initialized by a call from RDMA and we just want to
3422          * add a newly attached HCA to the same listening state as other
3423          * HCAs.
3424          */
3425         if (rd == NULL) {
3426                 if (rib_stat->q == NULL) {
3427                         mutex_exit(&rib_stat->listen_lock);
3428                         return;
3429                 }
3430         } else {
3431                 rib_stat->q = &rd->q;
3432         }
3433         rw_enter(&rib_stat->hcas_list_lock, RW_READER);
3434         for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
3435                 /*
3436                  * First check if a hca is still attached
3437                  */
3438                 rw_enter(&hca->state_lock, RW_READER);
3439                 if (hca->state != HCA_INITED) {
3440                         rw_exit(&hca->state_lock);
3441                         continue;
3442                 }
3443                 rw_exit(&hca->state_lock);
3444 
3445                 /*
3446                  * Right now the only service type is NFS. Hence
3447                  * force feed this value. Ideally to communicate
3448                  * the service type it should be passed down in
3449                  * rdma_svc_data.
3450                  */
3451                 status = rib_register_service(hca, NFS,
3452                     IPPROTO_TCP, nfs_rdma_port);
3453                 if (status == RDMA_SUCCESS)
3454                         n_listening++;
3455         }
3456         rw_exit(&rib_stat->hcas_list_lock);
3457 
3458         /*
3459          * Service active on an HCA, check rd->err_code for more
3460          * explainable errors.
3461          */
3462         if (rd) {
3463                 if (n_listening > 0) {
3464                         rd->active = 1;
3465                         rd->err_code = RDMA_SUCCESS;
3466                 } else {
3467                         rd->active = 0;
3468                         rd->err_code = RDMA_FAILED;
3469                 }
3470         }
3471         mutex_exit(&rib_stat->listen_lock);
3472 }
3473 
/* XXXX */
/* ARGSUSED */
static void
rib_listen_stop(struct rdma_svc_data *svcdata)
{
        rib_hca_t               *hca;

        mutex_enter(&rib_stat->listen_lock);
        /*
         * KRPC called the RDMATF to stop the listeners, which means
         * stop sending incoming or received requests to the KRPC master
         * transport handle for RDMA-IB.  This also means that the
         * master transport handle, responsible for us, is going away.
         */
        mutex_enter(&plugin_state_lock);
        plugin_state = NO_ACCEPT;       /* reject new server connections */
        if (svcdata != NULL)
                svcdata->active = 0;
        mutex_exit(&plugin_state_lock);

        rw_enter(&rib_stat->hcas_list_lock, RW_READER);
        for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
                /*
                 * First check if a hca is still attached
                 */
                rw_enter(&hca->state_lock, RW_READER);
                if (hca->state == HCA_DETACHED) {
                        rw_exit(&hca->state_lock);
                        continue;
                }
                /* Tear down server-side channels and unbind services. */
                rib_close_channels(&hca->srv_conn_list);
                rib_stop_services(hca);
                rw_exit(&hca->state_lock);
        }
        rw_exit(&rib_stat->hcas_list_lock);

        /*
         * Avoid rib_listen() using the stale q field.
         * This could happen if a port goes up after all services
         * are already unregistered.
         */
        rib_stat->q = NULL;
        mutex_exit(&rib_stat->listen_lock);
}
3518 
3519 /*
3520  * Traverse the HCA's service list to unbind and deregister services.
3521  * For each bound service of HCA to be removed, first find the corresponding
3522  * service handle (srv_hdl) and then unbind the service by calling
3523  * ibt_unbind_service().
3524  */
3525 static void
3526 rib_stop_services(rib_hca_t *hca)
3527 {
3528         rib_hca_service_t *srv_list, *to_remove;
3529 
3530         /*
3531          * unbind and deregister the services for this service type.
3532          * Right now there is only one service type. In future it will
3533          * be passed down to this function.
3534          */
3535         rw_enter(&hca->bound_services_lock, RW_READER);
3536         srv_list = hca->bound_services;
3537         hca->bound_services = NULL;
3538         rw_exit(&hca->bound_services_lock);
3539 
3540         while (srv_list != NULL) {
3541                 rib_service_t *sc;
3542 
3543                 to_remove = srv_list;
3544                 srv_list = to_remove->next;
3545                 rw_enter(&rib_stat->service_list_lock, RW_READER);
3546                 for (sc = rib_stat->service_list;
3547                     sc && (sc->srv_id != to_remove->srv_id);
3548                     sc = sc->next)
3549                         ;
3550                 /*
3551                  * if sc is NULL then the service doesn't exist anymore,
3552                  * probably just removed completely through rib_stat.
3553                  */
3554                 if (sc != NULL)
3555                         (void) ibt_unbind_service(sc->srv_hdl,
3556                             to_remove->sbind_hdl);
3557                 rw_exit(&rib_stat->service_list_lock);
3558                 kmem_free(to_remove, sizeof (rib_hca_service_t));
3559         }
3560 }
3561 
3562 static struct svc_recv *
3563 rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl)
3564 {
3565         struct svc_recv *recvp;
3566 
3567         recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP);
3568         recvp->vaddr = sgl->ds_va;
3569         recvp->qp = qp;
3570         recvp->bytes_xfer = 0;
3571         return (recvp);
3572 }
3573 
3574 static int
3575 rib_free_svc_recv(struct svc_recv *recvp)
3576 {
3577         kmem_free(recvp, sizeof (*recvp));
3578 
3579         return (0);
3580 }
3581 
3582 static struct reply *
3583 rib_addreplylist(rib_qp_t *qp, uint32_t msgid)
3584 {
3585         struct reply    *rep;
3586 
3587 
3588         rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP);
3589         if (rep == NULL) {
3590                 DTRACE_PROBE(rpcib__i__addrreply__nomem);
3591                 return (NULL);
3592         }
3593         rep->xid = msgid;
3594         rep->vaddr_cq = NULL;
3595         rep->bytes_xfer = 0;
3596         rep->status = (uint_t)REPLY_WAIT;
3597         rep->prev = NULL;
3598         cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL);
3599 
3600         mutex_enter(&qp->replylist_lock);
3601         if (qp->replylist) {
3602                 rep->next = qp->replylist;
3603                 qp->replylist->prev = rep;
3604         }
3605         qp->rep_list_size++;
3606 
3607         DTRACE_PROBE1(rpcib__i__addrreply__listsize,
3608             int, qp->rep_list_size);
3609 
3610         qp->replylist = rep;
3611         mutex_exit(&qp->replylist_lock);
3612 
3613         return (rep);
3614 }
3615 
3616 static rdma_stat
3617 rib_rem_replylist(rib_qp_t *qp)
3618 {
3619         struct reply    *r, *n;
3620 
3621         mutex_enter(&qp->replylist_lock);
3622         for (r = qp->replylist; r != NULL; r = n) {
3623                 n = r->next;
3624                 (void) rib_remreply(qp, r);
3625         }
3626         mutex_exit(&qp->replylist_lock);
3627 
3628         return (RDMA_SUCCESS);
3629 }
3630 
3631 static int
3632 rib_remreply(rib_qp_t *qp, struct reply *rep)
3633 {
3634 
3635         ASSERT(MUTEX_HELD(&qp->replylist_lock));
3636         if (rep->prev) {
3637                 rep->prev->next = rep->next;
3638         }
3639         if (rep->next) {
3640                 rep->next->prev = rep->prev;
3641         }
3642         if (qp->replylist == rep)
3643                 qp->replylist = rep->next;
3644 
3645         cv_destroy(&rep->wait_cv);
3646         qp->rep_list_size--;
3647 
3648         DTRACE_PROBE1(rpcib__i__remreply__listsize,
3649             int, qp->rep_list_size);
3650 
3651         kmem_free(rep, sizeof (*rep));
3652 
3653         return (0);
3654 }
3655 
3656 rdma_stat
3657 rib_registermem(CONN *conn,  caddr_t adsp, caddr_t buf, uint_t buflen,
3658         struct mrc *buf_handle)
3659 {
3660         ibt_mr_hdl_t    mr_hdl = NULL;  /* memory region handle */
3661         ibt_mr_desc_t   mr_desc;        /* vaddr, lkey, rkey */
3662         rdma_stat       status;
3663         rib_hca_t       *hca = (ctoqp(conn))->hca;
3664 
3665         /*
3666          * Note: ALL buffer pools use the same memory type RDMARW.
3667          */
3668         status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
3669         if (status == RDMA_SUCCESS) {
3670                 buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3671                 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3672                 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3673         } else {
3674                 buf_handle->mrc_linfo = NULL;
3675                 buf_handle->mrc_lmr = 0;
3676                 buf_handle->mrc_rmr = 0;
3677         }
3678         return (status);
3679 }
3680 
3681 static rdma_stat
3682 rib_reg_mem(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size,
3683         ibt_mr_flags_t spec,
3684         ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp)
3685 {
3686         ibt_mr_attr_t   mem_attr;
3687         ibt_status_t    ibt_status;
3688         mem_attr.mr_vaddr = (uintptr_t)buf;
3689         mem_attr.mr_len = (ib_msglen_t)size;
3690         mem_attr.mr_as = (struct as *)(caddr_t)adsp;
3691         mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE |
3692             IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE |
3693             IBT_MR_ENABLE_WINDOW_BIND | spec;
3694 
3695         rw_enter(&hca->state_lock, RW_READER);
3696         if (hca->state != HCA_DETACHED) {
3697                 ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl,
3698                     &mem_attr, mr_hdlp, mr_descp);
3699                 rw_exit(&hca->state_lock);
3700         } else {
3701                 rw_exit(&hca->state_lock);
3702                 return (RDMA_FAILED);
3703         }
3704 
3705         if (ibt_status != IBT_SUCCESS) {
3706                 return (RDMA_FAILED);
3707         }
3708         return (RDMA_SUCCESS);
3709 }
3710 
3711 rdma_stat
3712 rib_registermemsync(CONN *conn,  caddr_t adsp, caddr_t buf, uint_t buflen,
3713         struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc)
3714 {
3715         ibt_mr_hdl_t    mr_hdl = NULL;  /* memory region handle */
3716         rib_lrc_entry_t *l;
3717         ibt_mr_desc_t   mr_desc;        /* vaddr, lkey, rkey */
3718         rdma_stat       status;
3719         rib_hca_t       *hca = (ctoqp(conn))->hca;
3720 
3721         /*
3722          * Non-coherent memory registration.
3723          */
3724         l = (rib_lrc_entry_t *)lrc;
3725         if (l) {
3726                 if (l->registered) {
3727                         buf_handle->mrc_linfo =
3728                             (uintptr_t)l->lrc_mhandle.mrc_linfo;
3729                         buf_handle->mrc_lmr =
3730                             (uint32_t)l->lrc_mhandle.mrc_lmr;
3731                         buf_handle->mrc_rmr =
3732                             (uint32_t)l->lrc_mhandle.mrc_rmr;
3733                         *sync_handle = (RIB_SYNCMEM_HANDLE)
3734                             (uintptr_t)l->lrc_mhandle.mrc_linfo;
3735                         return (RDMA_SUCCESS);
3736                 } else {
3737                         /* Always register the whole buffer */
3738                         buf = (caddr_t)l->lrc_buf;
3739                         buflen = l->lrc_len;
3740                 }
3741         }
3742         status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
3743 
3744         if (status == RDMA_SUCCESS) {
3745                 if (l) {
3746                         l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl;
3747                         l->lrc_mhandle.mrc_lmr   = (uint32_t)mr_desc.md_lkey;
3748                         l->lrc_mhandle.mrc_rmr   = (uint32_t)mr_desc.md_rkey;
3749                         l->registered                 = TRUE;
3750                 }
3751                 buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3752                 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3753                 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3754                 *sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl;
3755         } else {
3756                 buf_handle->mrc_linfo = NULL;
3757                 buf_handle->mrc_lmr = 0;
3758                 buf_handle->mrc_rmr = 0;
3759         }
3760         return (status);
3761 }
3762 
3763 /* ARGSUSED */
3764 rdma_stat
3765 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle)
3766 {
3767         rib_hca_t *hca = (ctoqp(conn))->hca;
3768         /*
3769          * Allow memory deregistration even if HCA is
3770          * getting detached. Need all outstanding
3771          * memory registrations to be deregistered
3772          * before HCA_DETACH_EVENT can be accepted.
3773          */
3774         (void) ibt_deregister_mr(hca->hca_hdl,
3775             (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
3776         return (RDMA_SUCCESS);
3777 }
3778 
3779 /* ARGSUSED */
3780 rdma_stat
3781 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle,
3782                 RIB_SYNCMEM_HANDLE sync_handle, void *lrc)
3783 {
3784         rib_lrc_entry_t *l;
3785         l = (rib_lrc_entry_t *)lrc;
3786         if (l)
3787                 if (l->registered)
3788                         return (RDMA_SUCCESS);
3789 
3790         (void) rib_deregistermem(conn, buf, buf_handle);
3791 
3792         return (RDMA_SUCCESS);
3793 }
3794 
3795 /* ARGSUSED */
3796 rdma_stat
3797 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf,
3798                 int len, int cpu)
3799 {
3800         ibt_status_t    status;
3801         rib_hca_t *hca = (ctoqp(conn))->hca;
3802         ibt_mr_sync_t   mr_segment;
3803 
3804         mr_segment.ms_handle = (ibt_mr_hdl_t)shandle;
3805         mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf;
3806         mr_segment.ms_len = (ib_memlen_t)len;
3807         if (cpu) {
3808                 /* make incoming data visible to memory */
3809                 mr_segment.ms_flags = IBT_SYNC_WRITE;
3810         } else {
3811                 /* make memory changes visible to IO */
3812                 mr_segment.ms_flags = IBT_SYNC_READ;
3813         }
3814         rw_enter(&hca->state_lock, RW_READER);
3815         if (hca->state != HCA_DETACHED) {
3816                 status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1);
3817                 rw_exit(&hca->state_lock);
3818         } else {
3819                 rw_exit(&hca->state_lock);
3820                 return (RDMA_FAILED);
3821         }
3822 
3823         if (status == IBT_SUCCESS)
3824                 return (RDMA_SUCCESS);
3825         else {
3826                 return (RDMA_FAILED);
3827         }
3828 }
3829 
3830 /*
3831  * XXXX ????
3832  */
3833 static rdma_stat
3834 rib_getinfo(rdma_info_t *info)
3835 {
3836         /*
3837          * XXXX Hack!
3838          */
3839         info->addrlen = 16;
3840         info->mts = 1000000;
3841         info->mtu = 1000000;
3842 
3843         return (RDMA_SUCCESS);
3844 }
3845 
/*
 * Create a pool of "num" pre-registered buffers of type "ptype"
 * (SEND_BUFFER or RECV_BUFFER) on the given HCA.  The backing storage
 * is a single contiguous allocation and each buffer slice is
 * registered as its own IB memory region.  Returns NULL on failure
 * (unknown pool type, detached HCA, or a registration error).
 */
rib_bufpool_t *
rib_rbufpool_create(rib_hca_t *hca, int ptype, int num)
{
        rib_bufpool_t   *rbp = NULL;
        bufpool_t       *bp = NULL;
        caddr_t         buf;
        ibt_mr_attr_t   mem_attr;
        ibt_status_t    ibt_status;
        int             i, j;

        rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP);

        /* The bufpool_t is followed by an array of num buffer pointers. */
        bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) +
            num * sizeof (void *), KM_SLEEP);

        mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock);
        bp->numelems = num;


        /* Select per-buffer size and registration flags by pool type. */
        switch (ptype) {
        case SEND_BUFFER:
                mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
                bp->rsize = RPC_MSG_SZ;
                break;
        case RECV_BUFFER:
                mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
                bp->rsize = RPC_BUF_SIZE;
                break;
        default:
                goto fail;
        }

        /*
         * Register the pool.
         */
        bp->bufsize = num * bp->rsize;
        bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP);
        rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num *
            sizeof (ibt_mr_hdl_t), KM_SLEEP);
        rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num *
            sizeof (ibt_mr_desc_t), KM_SLEEP);
        rw_enter(&hca->state_lock, RW_READER);

        if (hca->state == HCA_DETACHED) {
                rw_exit(&hca->state_lock);
                goto fail;
        }

        /* Register each buffer slice as a separate memory region. */
        for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) {
                bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t));
                mem_attr.mr_vaddr = (uintptr_t)buf;
                mem_attr.mr_len = (ib_msglen_t)bp->rsize;
                mem_attr.mr_as = NULL;
                ibt_status = ibt_register_mr(hca->hca_hdl,
                    hca->pd_hdl, &mem_attr,
                    &rbp->mr_hdl[i],
                    &rbp->mr_desc[i]);
                if (ibt_status != IBT_SUCCESS) {
                        /* Undo the registrations completed so far. */
                        for (j = 0; j < i; j++) {
                                (void) ibt_deregister_mr(hca->hca_hdl,
                                    rbp->mr_hdl[j]);
                        }
                        rw_exit(&hca->state_lock);
                        goto fail;
                }
        }
        rw_exit(&hca->state_lock);
        /* Populate the free list with every buffer in the pool. */
        buf = (caddr_t)bp->buf;
        for (i = 0; i < num; i++, buf += bp->rsize) {
                bp->buflist[i] = (void *)buf;
        }
        bp->buffree = num - 1;       /* no. of free buffers */
        rbp->bpool = bp;

        return (rbp);
fail:
        /* Unwind whatever was allocated; MRs were deregistered above. */
        if (bp) {
                if (bp->buf)
                        kmem_free(bp->buf, bp->bufsize);
                kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *));
        }
        if (rbp) {
                if (rbp->mr_hdl)
                        kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t));
                if (rbp->mr_desc)
                        kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t));
                kmem_free(rbp, sizeof (rib_bufpool_t));
        }
        return (NULL);
}
3936 
3937 static void
3938 rib_rbufpool_deregister(rib_hca_t *hca, int ptype)
3939 {
3940         int i;
3941         rib_bufpool_t *rbp = NULL;
3942         bufpool_t *bp;
3943 
3944         /*
3945          * Obtain pool address based on type of pool
3946          */
3947         switch (ptype) {
3948                 case SEND_BUFFER:
3949                         rbp = hca->send_pool;
3950                         break;
3951                 case RECV_BUFFER:
3952                         rbp = hca->recv_pool;
3953                         break;
3954                 default:
3955                         return;
3956         }
3957         if (rbp == NULL)
3958                 return;
3959 
3960         bp = rbp->bpool;
3961 
3962         /*
3963          * Deregister the pool memory and free it.
3964          */
3965         for (i = 0; i < bp->numelems; i++) {
3966                 (void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]);
3967         }
3968 }
3969 
3970 static void
3971 rib_rbufpool_free(rib_hca_t *hca, int ptype)
3972 {
3973 
3974         rib_bufpool_t *rbp = NULL;
3975         bufpool_t *bp;
3976 
3977         /*
3978          * Obtain pool address based on type of pool
3979          */
3980         switch (ptype) {
3981                 case SEND_BUFFER:
3982                         rbp = hca->send_pool;
3983                         break;
3984                 case RECV_BUFFER:
3985                         rbp = hca->recv_pool;
3986                         break;
3987                 default:
3988                         return;
3989         }
3990         if (rbp == NULL)
3991                 return;
3992 
3993         bp = rbp->bpool;
3994 
3995         /*
3996          * Free the pool memory.
3997          */
3998         if (rbp->mr_hdl)
3999                 kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t));
4000 
4001         if (rbp->mr_desc)
4002                 kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t));
4003         if (bp->buf)
4004                 kmem_free(bp->buf, bp->bufsize);
4005         mutex_destroy(&bp->buflock);
4006         kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *));
4007         kmem_free(rbp, sizeof (rib_bufpool_t));
4008 }
4009 
4010 void
4011 rib_rbufpool_destroy(rib_hca_t *hca, int ptype)
4012 {
4013         /*
4014          * Deregister the pool memory and free it.
4015          */
4016         rib_rbufpool_deregister(hca, ptype);
4017         rib_rbufpool_free(hca, ptype);
4018 }
4019 
4020 /*
4021  * Fetch a buffer from the pool of type specified in rdbuf->type.
4022  */
4023 static rdma_stat
4024 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf)
4025 {
4026         rib_lrc_entry_t *rlep;
4027 
4028         if (rdbuf->type ==  RDMA_LONG_BUFFER) {
4029                 rlep = rib_get_cache_buf(conn, rdbuf->len);
4030                 rdbuf->rb_private =  (caddr_t)rlep;
4031                 rdbuf->addr = rlep->lrc_buf;
4032                 rdbuf->handle = rlep->lrc_mhandle;
4033                 return (RDMA_SUCCESS);
4034         }
4035 
4036         rdbuf->addr = rib_rbuf_alloc(conn, rdbuf);
4037         if (rdbuf->addr) {
4038                 switch (rdbuf->type) {
4039                 case SEND_BUFFER:
4040                         rdbuf->len = RPC_MSG_SZ;     /* 1K */
4041                         break;
4042                 case RECV_BUFFER:
4043                         rdbuf->len = RPC_BUF_SIZE; /* 2K */
4044                         break;
4045                 default:
4046                         rdbuf->len = 0;
4047                 }
4048                 return (RDMA_SUCCESS);
4049         } else
4050                 return (RDMA_FAILED);
4051 }
4052 
4053 /*
4054  * Fetch a buffer of specified type.
4055  * Note that rdbuf->handle is mw's rkey.
4056  */
4057 static void *
4058 rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf)
4059 {
4060         rib_qp_t        *qp = ctoqp(conn);
4061         rib_hca_t       *hca = qp->hca;
4062         rdma_btype      ptype = rdbuf->type;
4063         void            *buf;
4064         rib_bufpool_t   *rbp = NULL;
4065         bufpool_t       *bp;
4066         int             i;
4067 
4068         /*
4069          * Obtain pool address based on type of pool
4070          */
4071         switch (ptype) {
4072         case SEND_BUFFER:
4073                 rbp = hca->send_pool;
4074                 break;
4075         case RECV_BUFFER:
4076                 rbp = hca->recv_pool;
4077                 break;
4078         default:
4079                 return (NULL);
4080         }
4081         if (rbp == NULL)
4082                 return (NULL);
4083 
4084         bp = rbp->bpool;
4085 
4086         mutex_enter(&bp->buflock);
4087         if (bp->buffree < 0) {
4088                 mutex_exit(&bp->buflock);
4089                 return (NULL);
4090         }
4091 
4092         /* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. */
4093         buf = bp->buflist[bp->buffree];
4094         rdbuf->addr = buf;
4095         rdbuf->len = bp->rsize;
4096         for (i = bp->numelems - 1; i >= 0; i--) {
4097                 if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) {
4098                         rdbuf->handle.mrc_rmr =
4099                             (uint32_t)rbp->mr_desc[i].md_rkey;
4100                         rdbuf->handle.mrc_linfo =
4101                             (uintptr_t)rbp->mr_hdl[i];
4102                         rdbuf->handle.mrc_lmr =
4103                             (uint32_t)rbp->mr_desc[i].md_lkey;
4104                         bp->buffree--;
4105 
4106                         mutex_exit(&bp->buflock);
4107 
4108                         return (buf);
4109                 }
4110         }
4111 
4112         mutex_exit(&bp->buflock);
4113 
4114         return (NULL);
4115 }
4116 
4117 static void
4118 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf)
4119 {
4120 
4121         if (rdbuf->type == RDMA_LONG_BUFFER) {
4122                 rib_free_cache_buf(conn, (rib_lrc_entry_t *)rdbuf->rb_private);
4123                 rdbuf->rb_private = NULL;
4124                 return;
4125         }
4126         rib_rbuf_free(conn, rdbuf->type, rdbuf->addr);
4127 }
4128 
4129 static void
4130 rib_rbuf_free(CONN *conn, int ptype, void *buf)
4131 {
4132         rib_qp_t *qp = ctoqp(conn);
4133         rib_hca_t *hca = qp->hca;
4134         rib_bufpool_t *rbp = NULL;
4135         bufpool_t *bp;
4136 
4137         /*
4138          * Obtain pool address based on type of pool
4139          */
4140         switch (ptype) {
4141         case SEND_BUFFER:
4142                 rbp = hca->send_pool;
4143                 break;
4144         case RECV_BUFFER:
4145                 rbp = hca->recv_pool;
4146                 break;
4147         default:
4148                 return;
4149         }
4150         if (rbp == NULL)
4151                 return;
4152 
4153         bp = rbp->bpool;
4154 
4155         mutex_enter(&bp->buflock);
4156         if (++bp->buffree >= bp->numelems) {
4157                 /*
4158                  * Should never happen
4159                  */
4160                 bp->buffree--;
4161         } else {
4162                 bp->buflist[bp->buffree] = buf;
4163         }
4164         mutex_exit(&bp->buflock);
4165 }
4166 
4167 static rdma_stat
4168 rib_add_connlist(CONN *cn, rib_conn_list_t *connlist)
4169 {
4170         rw_enter(&connlist->conn_lock, RW_WRITER);
4171         if (connlist->conn_hd) {
4172                 cn->c_next = connlist->conn_hd;
4173                 connlist->conn_hd->c_prev = cn;
4174         }
4175         connlist->conn_hd = cn;
4176         rw_exit(&connlist->conn_lock);
4177 
4178         return (RDMA_SUCCESS);
4179 }
4180 
4181 static rdma_stat
4182 rib_rm_conn(CONN *cn, rib_conn_list_t *connlist)
4183 {
4184         rw_enter(&connlist->conn_lock, RW_WRITER);
4185         if (cn->c_prev) {
4186                 cn->c_prev->c_next = cn->c_next;
4187         }
4188         if (cn->c_next) {
4189                 cn->c_next->c_prev = cn->c_prev;
4190         }
4191         if (connlist->conn_hd == cn)
4192                 connlist->conn_hd = cn->c_next;
4193         rw_exit(&connlist->conn_lock);
4194 
4195         return (RDMA_SUCCESS);
4196 }
4197 
4198 /* ARGSUSED */
4199 static rdma_stat
4200 rib_conn_get(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr,
4201     int addr_type, void *handle, CONN **conn)
4202 {
4203         rdma_stat status;
4204         rpcib_ping_t rpt;
4205 
4206         status = rib_connect(s_svcaddr, d_svcaddr, addr_type, &rpt, conn);
4207         return (status);
4208 }
4209 
4210 /*
4211  * rib_find_hca_connection
4212  *
4213  * if there is an existing connection to the specified address then
4214  * it will be returned in conn, otherwise conn will be set to NULL.
4215  * Also cleans up any connection that is in error state.
4216  */
4217 static int
4218 rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr,
4219     struct netbuf *d_svcaddr, CONN **conn)
4220 {
4221         CONN *cn;
4222         clock_t cv_stat, timout;
4223 
4224         *conn = NULL;
4225 again:
4226         rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
4227         cn = hca->cl_conn_list.conn_hd;
4228         while (cn != NULL) {
4229                 /*
4230                  * First, clear up any connection in the ERROR state
4231                  */
4232                 mutex_enter(&cn->c_lock);
4233                 if (cn->c_state == C_ERROR_CONN) {
4234                         if (cn->c_ref == 0) {
4235                                 /*
4236                                  * Remove connection from list and destroy it.
4237                                  */
4238                                 cn->c_state = C_DISCONN_PEND;
4239                                 mutex_exit(&cn->c_lock);
4240                                 rw_exit(&hca->cl_conn_list.conn_lock);
4241                                 rib_conn_close((void *)cn);
4242                                 goto again;
4243                         }
4244                         mutex_exit(&cn->c_lock);
4245                         cn = cn->c_next;
4246                         continue;
4247                 }
4248                 if (cn->c_state == C_DISCONN_PEND) {
4249                         mutex_exit(&cn->c_lock);
4250                         cn = cn->c_next;
4251                         continue;
4252                 }
4253 
4254                 /*
4255                  * source address is only checked for if there is one,
4256                  * this is the case for retries.
4257                  */
4258                 if ((cn->c_raddr.len == d_svcaddr->len) &&
4259                     (bcmp(d_svcaddr->buf, cn->c_raddr.buf,
4260                     d_svcaddr->len) == 0) &&
4261                     ((s_svcaddr->len == 0) ||
4262                     ((cn->c_laddr.len == s_svcaddr->len) &&
4263                     (bcmp(s_svcaddr->buf, cn->c_laddr.buf,
4264                     s_svcaddr->len) == 0)))) {
4265                         /*
4266                          * Our connection. Give up conn list lock
4267                          * as we are done traversing the list.
4268                          */
4269                         rw_exit(&hca->cl_conn_list.conn_lock);
4270                         if (cn->c_state == C_CONNECTED) {
4271                                 cn->c_ref++; /* sharing a conn */
4272                                 mutex_exit(&cn->c_lock);
4273                                 *conn = cn;
4274                                 return (RDMA_SUCCESS);
4275                         }
4276                         if (cn->c_state == C_CONN_PEND) {
4277                                 /*
4278                                  * Hold a reference to this conn before
4279                                  * we give up the lock.
4280                                  */
4281                                 cn->c_ref++;
4282                                 timout =  ddi_get_lbolt() +
4283                                     drv_usectohz(CONN_WAIT_TIME * 1000000);
4284                                 while ((cv_stat = cv_timedwait_sig(&cn->c_cv,
4285                                     &cn->c_lock, timout)) > 0 &&
4286                                     cn->c_state == C_CONN_PEND)
4287                                         ;
4288                                 if (cv_stat == 0) {
4289                                         (void) rib_conn_release_locked(cn);
4290                                         return (RDMA_INTR);
4291                                 }
4292                                 if (cv_stat < 0) {
4293                                         (void) rib_conn_release_locked(cn);
4294                                         return (RDMA_TIMEDOUT);
4295                                 }
4296                                 if (cn->c_state == C_CONNECTED) {
4297                                         *conn = cn;
4298                                         mutex_exit(&cn->c_lock);
4299                                         return (RDMA_SUCCESS);
4300                                 } else {
4301                                         (void) rib_conn_release_locked(cn);
4302                                         return (RDMA_TIMEDOUT);
4303                                 }
4304                         }
4305                 }
4306                 mutex_exit(&cn->c_lock);
4307                 cn = cn->c_next;
4308         }
4309         rw_exit(&hca->cl_conn_list.conn_lock);
4310         *conn = NULL;
4311         return (RDMA_FAILED);
4312 }
4313 
4314 /*
4315  * Connection management.
4316  * IBTF does not support recycling of channels. So connections are only
4317  * in four states - C_CONN_PEND, or C_CONNECTED, or C_ERROR_CONN or
4318  * C_DISCONN_PEND state. No C_IDLE state.
4319  * C_CONN_PEND state: Connection establishment in progress to the server.
4320  * C_CONNECTED state: A connection when created is in C_CONNECTED state.
4321  * It has an RC channel associated with it. ibt_post_send/recv are allowed
4322  * only in this state.
4323  * C_ERROR_CONN state: A connection transitions to this state when WRs on the
4324  * channel are completed in error or an IBT_CM_EVENT_CONN_CLOSED event
4325  * happens on the channel or a IBT_HCA_DETACH_EVENT occurs on the HCA.
4326  * C_DISCONN_PEND state: When a connection is in C_ERROR_CONN state and when
4327  * c_ref drops to 0 (this indicates that RPC has no more references to this
4328  * connection), the connection should be destroyed. A connection transitions
4329  * into this state when it is being destroyed.
4330  */
4331 /* ARGSUSED */
4332 static rdma_stat
4333 rib_connect(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr,
4334     int addr_type, rpcib_ping_t *rpt, CONN **conn)
4335 {
4336         CONN *cn;
4337         int status;
4338         rib_hca_t *hca;
4339         rib_qp_t *qp;
4340         int s_addr_len;
4341         char *s_addr_buf;
4342 
4343         rw_enter(&rib_stat->hcas_list_lock, RW_READER);
4344         for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
4345                 rw_enter(&hca->state_lock, RW_READER);
4346                 if (hca->state != HCA_DETACHED) {
4347                         status = rib_find_hca_connection(hca, s_svcaddr,
4348                             d_svcaddr, conn);
4349                         rw_exit(&hca->state_lock);
4350                         if ((status == RDMA_INTR) || (status == RDMA_SUCCESS)) {
4351                                 rw_exit(&rib_stat->hcas_list_lock);
4352                                 return (status);
4353                         }
4354                 } else
4355                         rw_exit(&hca->state_lock);
4356         }
4357         rw_exit(&rib_stat->hcas_list_lock);
4358 
4359         /*
4360          * No existing connection found, establish a new connection.
4361          */
4362         bzero(rpt, sizeof (rpcib_ping_t));
4363 
4364         status = rib_ping_srv(addr_type, d_svcaddr, rpt);
4365         if (status != RDMA_SUCCESS) {
4366                 return (RDMA_FAILED);
4367         }
4368         hca = rpt->hca;
4369 
4370         if (rpt->srcip.family == AF_INET) {
4371                 s_addr_len = sizeof (rpt->srcip.un.ip4addr);
4372                 s_addr_buf = (char *)&rpt->srcip.un.ip4addr;
4373         } else if (rpt->srcip.family == AF_INET6) {
4374                 s_addr_len = sizeof (rpt->srcip.un.ip6addr);
4375                 s_addr_buf = (char *)&rpt->srcip.un.ip6addr;
4376         } else {
4377                 return (RDMA_FAILED);
4378         }
4379 
4380         /*
4381          * Channel to server doesn't exist yet, create one.
4382          */
4383         if (rib_clnt_create_chan(hca, d_svcaddr, &qp) != RDMA_SUCCESS) {
4384                 return (RDMA_FAILED);
4385         }
4386         cn = qptoc(qp);
4387         cn->c_state = C_CONN_PEND;
4388         cn->c_ref = 1;
4389 
4390         cn->c_laddr.buf = kmem_alloc(s_addr_len, KM_SLEEP);
4391         bcopy(s_addr_buf, cn->c_laddr.buf, s_addr_len);
4392         cn->c_laddr.len = cn->c_laddr.maxlen = s_addr_len;
4393 
4394         if (rpt->srcip.family == AF_INET) {
4395                 cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1, KM_SLEEP);
4396                 (void) strcpy(cn->c_netid, RIBNETID_TCP);
4397 
4398                 cn->c_addrmask.len = cn->c_addrmask.maxlen =
4399                     sizeof (struct sockaddr_in);
4400                 cn->c_addrmask.buf = kmem_zalloc(cn->c_addrmask.len, KM_SLEEP);
4401 
4402                 ((struct sockaddr_in *)cn->c_addrmask.buf)->sin_addr.s_addr =
4403                     (uint32_t)~0;
4404                 ((struct sockaddr_in *)cn->c_addrmask.buf)->sin_family =
4405                     (ushort_t)~0;
4406 
4407         } else {
4408                 cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1, KM_SLEEP);
4409                 (void) strcpy(cn->c_netid, RIBNETID_TCP6);
4410 
4411                 cn->c_addrmask.len = cn->c_addrmask.maxlen =
4412                     sizeof (struct sockaddr_in6);
4413                 cn->c_addrmask.buf = kmem_zalloc(cn->c_addrmask.len, KM_SLEEP);
4414 
4415                 (void) memset(
4416                     &((struct sockaddr_in6 *)cn->c_addrmask.buf)->sin6_addr,
4417                     (uchar_t)~0, sizeof (struct in6_addr));
4418                 ((struct sockaddr_in6 *)cn->c_addrmask.buf)->sin6_family =
4419                     (sa_family_t)~0;
4420         }
4421 
4422         /*
4423          * Add to conn list.
4424          * We had given up the READER lock. In the time since then,
4425          * another thread might have created the connection we are
4426          * trying here. But for now, that is quiet alright - there
4427          * might be two connections between a pair of hosts instead
4428          * of one. If we really want to close that window,
4429          * then need to check the list after acquiring the
4430          * WRITER lock.
4431          */
4432         (void) rib_add_connlist(cn, &hca->cl_conn_list);
4433         status = rib_conn_to_srv(hca, qp, rpt);
4434         mutex_enter(&cn->c_lock);
4435 
4436         if (cn->c_flags & C_CLOSE_PENDING) {
4437                 /*
4438                  * This handles a case where the module or
4439                  * HCA detached in the time a connection is
4440                  * established. In such a case close the
4441                  * connection immediately if this is the
4442                  * only reference.
4443                  */
4444                 if (cn->c_ref == 1) {
4445                         cn->c_ref--;
4446                         cn->c_state = C_DISCONN_PEND;
4447                         mutex_exit(&cn->c_lock);
4448                         rib_conn_close((void *)cn);
4449                         return (RDMA_FAILED);
4450                 }
4451 
4452                 /*
4453                  * Connection to be closed later when c_ref = 0
4454                  */
4455                 status = RDMA_FAILED;
4456         }
4457 
4458         if (status == RDMA_SUCCESS) {
4459                 cn->c_state = C_CONNECTED;
4460                 *conn = cn;
4461         } else {
4462                 cn->c_state = C_ERROR_CONN;
4463                 cn->c_ref--;
4464         }
4465         cv_signal(&cn->c_cv);
4466         mutex_exit(&cn->c_lock);
4467         return (status);
4468 }
4469 
/*
 * Close and disconnect a connection.  The C_CLOSE_NOTNEEDED flag makes
 * this idempotent: only the first caller actually closes the RC channel;
 * C_CLOSE_PENDING is held across the close so other threads can tell a
 * close is in flight.  Finally the connection is removed from the
 * server or client connection list and destroyed.
 */
static void
rib_conn_close(void *rarg)
{
        CONN *conn = (CONN *)rarg;
        rib_qp_t *qp = ctoqp(conn);

        mutex_enter(&conn->c_lock);
        if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) {

                conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING);

                /*
                 * Live connection in CONNECTED state.
                 */
                if (conn->c_state == C_CONNECTED) {
                        conn->c_state = C_ERROR_CONN;
                }
                /* Drop c_lock around the (blocking) channel close. */
                mutex_exit(&conn->c_lock);

                rib_close_a_channel(conn);

                mutex_enter(&conn->c_lock);
                conn->c_flags &= ~C_CLOSE_PENDING;
        }

        mutex_exit(&conn->c_lock);

        if (qp->mode == RIB_SERVER)
                (void) rib_disconnect_channel(conn,
                    &qp->hca->srv_conn_list);
        else
                (void) rib_disconnect_channel(conn,
                    &qp->hca->cl_conn_list);
}
4504 
/*
 * timeout(9F) callback armed by rib_conn_release_locked() when a
 * connection's reference count drops to zero.  If the connection is
 * back in use (or already being torn down) it does nothing; if there
 * was recent activity the timer is re-armed for the remaining idle
 * budget; otherwise the connection close is dispatched to the HCA's
 * cleanup taskq.
 */
static void
rib_conn_timeout_call(void *carg)
{
        time_t idle_time;
        CONN *conn = (CONN *)carg;
        rib_hca_t *hca = ctoqp(conn)->hca;
        int error;

        mutex_enter(&conn->c_lock);
        /* Connection is in use again, or teardown already in progress. */
        if ((conn->c_ref > 0) ||
            (conn->c_state == C_DISCONN_PEND)) {
                conn->c_timeout = NULL;
                mutex_exit(&conn->c_lock);
                return;
        }

        idle_time = (gethrestime_sec() - conn->c_last_used);

        if ((idle_time <= rib_conn_timeout) &&
            (conn->c_state != C_ERROR_CONN)) {
                /*
                 * There was activity after the last timeout.
                 * Extend the conn life. Unless the conn is
                 * already in error state.
                 */
                conn->c_timeout = timeout(rib_conn_timeout_call, conn,
                    SEC_TO_TICK(rib_conn_timeout - idle_time));
                mutex_exit(&conn->c_lock);
                return;
        }

        /* Close from taskq context; cannot block in a timeout callback. */
        error = ddi_taskq_dispatch(hca->cleanup_helper, rib_conn_close,
            (void *)conn, DDI_NOSLEEP);

        /*
         * If taskq dispatch fails above, then reset the timeout
         * to try again after 10 secs.
         */

        if (error != DDI_SUCCESS) {
                conn->c_timeout = timeout(rib_conn_timeout_call, conn,
                    SEC_TO_TICK(RDMA_CONN_REAP_RETRY));
                mutex_exit(&conn->c_lock);
                return;
        }

        /* Mark it so no other thread attempts the teardown. */
        conn->c_state = C_DISCONN_PEND;
        mutex_exit(&conn->c_lock);
}
4554 
/*
 * Drop one reference on "conn".  Convenience wrapper that acquires
 * c_lock and hands off to rib_conn_release_locked(), which releases
 * the lock before returning.
 */
static rdma_stat
rib_conn_release(CONN *conn)
{
        mutex_enter(&conn->c_lock);
        return (rib_conn_release_locked(conn));
}
4561 
4562 /*
4563  * Expects conn->c_lock to be held on entry.
4564  * c_lock released on return
4565  */
4566 static rdma_stat
4567 rib_conn_release_locked(CONN *conn)
4568 {
4569         conn->c_ref--;
4570 
4571         conn->c_last_used = gethrestime_sec();
4572         if (conn->c_ref > 0) {
4573                 mutex_exit(&conn->c_lock);
4574                 return (RDMA_SUCCESS);
4575         }
4576 
4577         /*
4578          * If a conn is C_ERROR_CONN, close the channel.
4579          */
4580         if (conn->c_ref == 0 && conn->c_state == C_ERROR_CONN) {
4581                 conn->c_state = C_DISCONN_PEND;
4582                 mutex_exit(&conn->c_lock);
4583                 rib_conn_close((void *)conn);
4584                 return (RDMA_SUCCESS);
4585         }
4586 
4587         /*
4588          * c_ref == 0, set a timeout for conn release
4589          */
4590 
4591         if (conn->c_timeout == NULL) {
4592                 conn->c_timeout = timeout(rib_conn_timeout_call, conn,
4593                     SEC_TO_TICK(rib_conn_timeout));
4594         }
4595 
4596         mutex_exit(&conn->c_lock);
4597         return (RDMA_SUCCESS);
4598 }
4599 
4600 /*
4601  * Add at front of list
4602  */
4603 static struct rdma_done_list *
4604 rdma_done_add(rib_qp_t *qp, uint32_t xid)
4605 {
4606         struct rdma_done_list *rd;
4607 
4608         ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4609 
4610         rd = kmem_alloc(sizeof (*rd), KM_SLEEP);
4611         rd->xid = xid;
4612         cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL);
4613 
4614         rd->prev = NULL;
4615         rd->next = qp->rdlist;
4616         if (qp->rdlist != NULL)
4617                 qp->rdlist->prev = rd;
4618         qp->rdlist = rd;
4619 
4620         return (rd);
4621 }
4622 
4623 static void
4624 rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd)
4625 {
4626         struct rdma_done_list *r;
4627 
4628         ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4629 
4630         r = rd->next;
4631         if (r != NULL) {
4632                 r->prev = rd->prev;
4633         }
4634 
4635         r = rd->prev;
4636         if (r != NULL) {
4637                 r->next = rd->next;
4638         } else {
4639                 qp->rdlist = rd->next;
4640         }
4641 
4642         cv_destroy(&rd->rdma_done_cv);
4643         kmem_free(rd, sizeof (*rd));
4644 }
4645 
4646 static void
4647 rdma_done_rem_list(rib_qp_t *qp)
4648 {
4649         struct rdma_done_list   *r, *n;
4650 
4651         mutex_enter(&qp->rdlist_lock);
4652         for (r = qp->rdlist; r != NULL; r = n) {
4653                 n = r->next;
4654                 rdma_done_rm(qp, r);
4655         }
4656         mutex_exit(&qp->rdlist_lock);
4657 }
4658 
4659 static void
4660 rdma_done_notify(rib_qp_t *qp, uint32_t xid)
4661 {
4662         struct rdma_done_list *r = qp->rdlist;
4663 
4664         ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4665 
4666         while (r) {
4667                 if (r->xid == xid) {
4668                         cv_signal(&r->rdma_done_cv);
4669                         return;
4670                 } else {
4671                         r = r->next;
4672                 }
4673         }
4674         DTRACE_PROBE1(rpcib__i__donenotify__nomatchxid,
4675             int, xid);
4676 }
4677 
4678 /*
4679  * Expects conn->c_lock to be held by the caller.
4680  */
4681 
4682 static void
4683 rib_close_a_channel(CONN *conn)
4684 {
4685         rib_qp_t        *qp;
4686         qp = ctoqp(conn);
4687 
4688         if (qp->qp_hdl == NULL) {
4689                 /* channel already freed */
4690                 return;
4691         }
4692 
4693         /*
4694          * Call ibt_close_rc_channel in blocking mode
4695          * with no callbacks.
4696          */
4697         (void) ibt_close_rc_channel(qp->qp_hdl, IBT_NOCALLBACKS,
4698             NULL, 0, NULL, NULL, 0);
4699 }
4700 
4701 /*
4702  * Goes through all connections and closes the channel
4703  * This will cause all the WRs on those channels to be
4704  * flushed.
4705  */
4706 static void
4707 rib_close_channels(rib_conn_list_t *connlist)
4708 {
4709         CONN            *conn, *tmp;
4710 
4711         rw_enter(&connlist->conn_lock, RW_READER);
4712         conn = connlist->conn_hd;
4713         while (conn != NULL) {
4714                 mutex_enter(&conn->c_lock);
4715                 tmp = conn->c_next;
4716                 if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) {
4717 
4718                         if (conn->c_state == C_CONN_PEND) {
4719                                 conn->c_flags |= C_CLOSE_PENDING;
4720                                 goto next;
4721                         }
4722 
4723                         conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING);
4724 
4725                         /*
4726                          * Live connection in CONNECTED state.
4727                          */
4728                         if (conn->c_state == C_CONNECTED)
4729                                 conn->c_state = C_ERROR_CONN;
4730                         mutex_exit(&conn->c_lock);
4731 
4732                         rib_close_a_channel(conn);
4733 
4734                         mutex_enter(&conn->c_lock);
4735                         conn->c_flags &= ~C_CLOSE_PENDING;
4736                         /* Signal a pending rib_disconnect_channel() */
4737                         cv_signal(&conn->c_cv);
4738                 }
4739 next:
4740                 mutex_exit(&conn->c_lock);
4741                 conn = tmp;
4742         }
4743         rw_exit(&connlist->conn_lock);
4744 }
4745 
4746 /*
4747  * Frees up all connections that are no longer being referenced
4748  */
4749 static void
4750 rib_purge_connlist(rib_conn_list_t *connlist)
4751 {
4752         CONN            *conn;
4753 
4754 top:
4755         rw_enter(&connlist->conn_lock, RW_READER);
4756         conn = connlist->conn_hd;
4757         while (conn != NULL) {
4758                 mutex_enter(&conn->c_lock);
4759 
4760                 /*
4761                  * At this point connection is either in ERROR
4762                  * or DISCONN_PEND state. If in DISCONN_PEND state
4763                  * then some other thread is culling that connection.
4764                  * If not and if c_ref is 0, then destroy the connection.
4765                  */
4766                 if (conn->c_ref == 0 &&
4767                     conn->c_state != C_DISCONN_PEND) {
4768                         /*
4769                          * Cull the connection
4770                          */
4771                         conn->c_state = C_DISCONN_PEND;
4772                         mutex_exit(&conn->c_lock);
4773                         rw_exit(&connlist->conn_lock);
4774                         (void) rib_disconnect_channel(conn, connlist);
4775                         goto top;
4776                 } else {
4777                         /*
4778                          * conn disconnect already scheduled or will
4779                          * happen from conn_release when c_ref drops to 0.
4780                          */
4781                         mutex_exit(&conn->c_lock);
4782                 }
4783                 conn = conn->c_next;
4784         }
4785         rw_exit(&connlist->conn_lock);
4786 
4787         /*
4788          * At this point, only connections with c_ref != 0 are on the list
4789          */
4790 }
4791 
4792 /*
4793  * Free all the HCA resources and close
4794  * the hca.
4795  */
4796 
4797 static void
4798 rib_free_hca(rib_hca_t *hca)
4799 {
4800         (void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl);
4801         (void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl);
4802         (void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl);
4803         (void) ibt_free_cq(hca->svc_scq->rib_cq_hdl);
4804 
4805         kmem_free(hca->clnt_rcq, sizeof (rib_cq_t));
4806         kmem_free(hca->clnt_scq, sizeof (rib_cq_t));
4807         kmem_free(hca->svc_rcq, sizeof (rib_cq_t));
4808         kmem_free(hca->svc_scq, sizeof (rib_cq_t));
4809 
4810         rib_rbufpool_destroy(hca, RECV_BUFFER);
4811         rib_rbufpool_destroy(hca, SEND_BUFFER);
4812         rib_destroy_cache(hca);
4813         if (rib_mod.rdma_count == 0)
4814                 (void) rdma_unregister_mod(&rib_mod);
4815         (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
4816         (void) ibt_close_hca(hca->hca_hdl);
4817         hca->hca_hdl = NULL;
4818 }
4819 
4820 
/*
 * Stop all services on the HCA, close and purge its connection
 * lists, and free the HCA resources once nothing references them.
 */
static void
rib_stop_hca_services(rib_hca_t *hca)
{
	rib_stop_services(hca);
	rib_close_channels(&hca->cl_conn_list);
	rib_close_channels(&hca->srv_conn_list);

	rib_purge_connlist(&hca->cl_conn_list);
	rib_purge_connlist(&hca->srv_conn_list);

	/* No HCAs left at all: retire the global cache kstat too. */
	if ((rib_stat->hcas_list == NULL) && stats_enabled) {
		kstat_delete_byname_zone("unix", 0, "rpcib_cache",
		    GLOBAL_ZONEID);
		stats_enabled = FALSE;
	}

	rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
	if (hca->srv_conn_list.conn_hd == NULL &&
	    hca->cl_conn_list.conn_hd == NULL) {
		/*
		 * conn_lists are NULL, so destroy
		 * buffers, close hca and be done.
		 */
		rib_free_hca(hca);
	}
	rw_exit(&hca->cl_conn_list.conn_lock);
	rw_exit(&hca->srv_conn_list.conn_lock);

	/*
	 * rib_free_hca() clears hca_hdl, so this path runs only when
	 * the connection lists were not empty above; wait for pending
	 * callback activity (hca->inuse) to drain before freeing.
	 */
	if (hca->hca_hdl != NULL) {
		mutex_enter(&hca->inuse_lock);
		while (hca->inuse)
			cv_wait(&hca->cb_cv, &hca->inuse_lock);
		mutex_exit(&hca->inuse_lock);

		rib_free_hca(hca);
	}
	rw_destroy(&hca->bound_services_lock);

	if (hca->cleanup_helper != NULL) {
		ddi_taskq_destroy(hca->cleanup_helper);
		hca->cleanup_helper = NULL;
	}
}
4865 
4866 /*
4867  * Cleans and closes up all uses of the HCA
4868  */
4869 static void
4870 rib_detach_hca(ibt_hca_hdl_t hca_hdl)
4871 {
4872         rib_hca_t *hca = NULL;
4873         rib_hca_t **hcap;
4874 
4875         rw_enter(&rib_stat->hcas_list_lock, RW_WRITER);
4876         for (hcap = &rib_stat->hcas_list; *hcap; hcap = &(*hcap)->next) {
4877                 hca = *hcap;
4878                 rw_enter(&hca->state_lock, RW_WRITER);
4879                 if (hca->hca_hdl == hca_hdl) {
4880                         /*
4881                          * Mark as detached and remove from
4882                          * hca list.
4883                          */
4884                         hca->state = HCA_DETACHED;
4885                         *hcap = hca->next;
4886                         rib_stat->nhca_inited--;
4887                         rib_mod.rdma_count--;
4888                         rw_exit(&hca->state_lock);
4889                         break;
4890                 }
4891                 rw_exit(&hca->state_lock);
4892         }
4893         rw_exit(&rib_stat->hcas_list_lock);
4894 
4895         if (hca == NULL)
4896                 return;
4897         ASSERT(hca->hca_hdl == hca_hdl);
4898 
4899         /*
4900          * Stop all services on the HCA
4901          * Go through cl_conn_list and close all rc_channels
4902          * Go through svr_conn_list and close all rc_channels
4903          * Free connections whose c_ref has dropped to 0
4904          * Destroy all CQs
4905          * Deregister and released all buffer pool memory after all
4906          * connections are destroyed
4907          * Free the protection domain
4908          * ibt_close_hca()
4909          */
4910         rib_stop_hca_services(hca);
4911 
4912         kmem_free(hca, sizeof (*hca));
4913 }
4914 
4915 static void
4916 rib_server_side_cache_reclaim(void *argp)
4917 {
4918         cache_avl_struct_t    *rcas;
4919         rib_lrc_entry_t         *rb;
4920         rib_hca_t *hca = (rib_hca_t *)argp;
4921 
4922         rw_enter(&hca->avl_rw_lock, RW_WRITER);
4923         rcas = avl_first(&hca->avl_tree);
4924         if (rcas != NULL)
4925                 avl_remove(&hca->avl_tree, rcas);
4926 
4927         while (rcas != NULL) {
4928                 while (rcas->r.forw != &rcas->r) {
4929                         rcas->elements--;
4930                         rb = rcas->r.forw;
4931                         remque(rb);
4932                         if (rb->registered)
4933                                 (void) rib_deregistermem_via_hca(hca,
4934                                     rb->lrc_buf, rb->lrc_mhandle);
4935 
4936                         hca->cache_allocation -= rb->lrc_len;
4937                         kmem_free(rb->lrc_buf, rb->lrc_len);
4938                         kmem_free(rb, sizeof (rib_lrc_entry_t));
4939                 }
4940                 mutex_destroy(&rcas->node_lock);
4941                 kmem_cache_free(hca->server_side_cache, rcas);
4942                 rcas = avl_first(&hca->avl_tree);
4943                 if (rcas != NULL)
4944                         avl_remove(&hca->avl_tree, rcas);
4945         }
4946         rw_exit(&hca->avl_rw_lock);
4947 }
4948 
/*
 * Taskq callback that trims the server-side buffer cache back under
 * cache_limit by evicting whole AVL nodes from the tail of the tree
 * (largest buffer length first, per avl_compare's ordering by len).
 */
static void
rib_server_side_cache_cleanup(void *argp)
{
	cache_avl_struct_t    *rcas;
	rib_lrc_entry_t         *rb;
	rib_hca_t *hca = (rib_hca_t *)argp;

	/* Nothing to do if the cache is already below the limit. */
	mutex_enter(&hca->cache_allocation_lock);
	if (hca->cache_allocation < cache_limit) {
		mutex_exit(&hca->cache_allocation_lock);
		return;
	}
	mutex_exit(&hca->cache_allocation_lock);

	rw_enter(&hca->avl_rw_lock, RW_WRITER);
	rcas = avl_last(&hca->avl_tree);
	if (rcas != NULL)
		avl_remove(&hca->avl_tree, rcas);

	while (rcas != NULL) {
		/* Free every buffer queued on this node. */
		while (rcas->r.forw != &rcas->r) {
			rcas->elements--;
			rb = rcas->r.forw;
			remque(rb);
			if (rb->registered)
				(void) rib_deregistermem_via_hca(hca,
				    rb->lrc_buf, rb->lrc_mhandle);

			/*
			 * NOTE(review): cache_allocation is updated and
			 * read below without cache_allocation_lock;
			 * presumably the WRITER hold on avl_rw_lock is
			 * considered sufficient — confirm against the
			 * other updaters of this counter.
			 */
			hca->cache_allocation -= rb->lrc_len;

			kmem_free(rb->lrc_buf, rb->lrc_len);
			kmem_free(rb, sizeof (rib_lrc_entry_t));
		}
		mutex_destroy(&rcas->node_lock);
		if (hca->server_side_cache) {
			kmem_cache_free(hca->server_side_cache, rcas);
		}

		/* Stop as soon as we are back under the limit. */
		if (hca->cache_allocation < cache_limit) {
			rw_exit(&hca->avl_rw_lock);
			return;
		}

		rcas = avl_last(&hca->avl_tree);
		if (rcas != NULL)
			avl_remove(&hca->avl_tree, rcas);
	}
	rw_exit(&hca->avl_rw_lock);
}
4998 
4999 static int
5000 avl_compare(const void *t1, const void *t2)
5001 {
5002         if (((cache_avl_struct_t *)t1)->len == ((cache_avl_struct_t *)t2)->len)
5003                 return (0);
5004 
5005         if (((cache_avl_struct_t *)t1)->len < ((cache_avl_struct_t *)t2)->len)
5006                 return (-1);
5007 
5008         return (1);
5009 }
5010 
5011 static void
5012 rib_destroy_cache(rib_hca_t *hca)
5013 {
5014         if (hca->avl_init) {
5015                 rib_server_side_cache_reclaim((void *)hca);
5016                 if (hca->server_side_cache) {
5017                         kmem_cache_destroy(hca->server_side_cache);
5018                         hca->server_side_cache = NULL;
5019                 }
5020                 avl_destroy(&hca->avl_tree);
5021                 mutex_destroy(&hca->cache_allocation_lock);
5022                 rw_destroy(&hca->avl_rw_lock);
5023         }
5024         hca->avl_init = FALSE;
5025 }
5026 
5027 static void
5028 rib_force_cleanup(void *hca)
5029 {
5030         if (((rib_hca_t *)hca)->cleanup_helper != NULL)
5031                 (void) ddi_taskq_dispatch(
5032                     ((rib_hca_t *)hca)->cleanup_helper,
5033                     rib_server_side_cache_cleanup,
5034                     (void *)hca, DDI_NOSLEEP);
5035 }
5036 
5037 static rib_lrc_entry_t *
5038 rib_get_cache_buf(CONN *conn, uint32_t len)
5039 {
5040         cache_avl_struct_t      cas, *rcas;
5041         rib_hca_t       *hca = (ctoqp(conn))->hca;
5042         rib_lrc_entry_t *reply_buf;
5043         avl_index_t where = NULL;
5044         uint64_t c_alloc = 0;
5045 
5046         if (!hca->avl_init)
5047                 goto  error_alloc;
5048 
5049         cas.len = len;
5050 
5051         rw_enter(&hca->avl_rw_lock, RW_READER);
5052 
5053         mutex_enter(&hca->cache_allocation_lock);
5054         c_alloc = hca->cache_allocation;
5055         mutex_exit(&hca->cache_allocation_lock);
5056 
5057         if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas,
5058             &where)) == NULL) {
5059                 /* Am I above the cache limit */
5060                 if ((c_alloc + len) >= cache_limit) {
5061                         rib_force_cleanup((void *)hca);
5062                         rw_exit(&hca->avl_rw_lock);
5063                         mutex_enter(&hca->cache_allocation_lock);
5064                         hca->cache_misses_above_the_limit ++;
5065                         mutex_exit(&hca->cache_allocation_lock);
5066 
5067                         /* Allocate and register the buffer directly */
5068                         goto error_alloc;
5069                 }
5070 
5071                 rw_exit(&hca->avl_rw_lock);
5072                 rw_enter(&hca->avl_rw_lock, RW_WRITER);
5073 
5074                 /* Recheck to make sure no other thread added the entry in */
5075                 if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree,
5076                     &cas, &where)) == NULL) {
5077                         /* Allocate an avl tree entry */
5078                         rcas = (cache_avl_struct_t *)
5079                             kmem_cache_alloc(hca->server_side_cache, KM_SLEEP);
5080 
5081                         bzero(rcas, sizeof (cache_avl_struct_t));
5082                         rcas->elements = 0;
5083                         rcas->r.forw = &rcas->r;
5084                         rcas->r.back = &rcas->r;
5085                         rcas->len = len;
5086                         mutex_init(&rcas->node_lock, NULL, MUTEX_DEFAULT, NULL);
5087                         avl_insert(&hca->avl_tree, rcas, where);
5088                 }
5089         }
5090 
5091         mutex_enter(&rcas->node_lock);
5092 
5093         if (rcas->r.forw != &rcas->r && rcas->elements > 0) {
5094                 reply_buf = rcas->r.forw;
5095                 remque(reply_buf);
5096                 rcas->elements--;
5097                 mutex_exit(&rcas->node_lock);
5098                 rw_exit(&hca->avl_rw_lock);
5099 
5100                 mutex_enter(&hca->cache_allocation_lock);
5101                 hca->cache_hits++;
5102                 hca->cache_allocation -= len;
5103                 mutex_exit(&hca->cache_allocation_lock);
5104         } else {
5105                 /* Am I above the cache limit */
5106                 mutex_exit(&rcas->node_lock);
5107                 if ((c_alloc + len) >= cache_limit) {
5108                         rib_force_cleanup((void *)hca);
5109                         rw_exit(&hca->avl_rw_lock);
5110 
5111                         mutex_enter(&hca->cache_allocation_lock);
5112                         hca->cache_misses_above_the_limit++;
5113                         mutex_exit(&hca->cache_allocation_lock);
5114                         /* Allocate and register the buffer directly */
5115                         goto error_alloc;
5116                 }
5117                 rw_exit(&hca->avl_rw_lock);
5118                 mutex_enter(&hca->cache_allocation_lock);
5119                 hca->cache_misses++;
5120                 mutex_exit(&hca->cache_allocation_lock);
5121                 /* Allocate a reply_buf entry */
5122                 reply_buf = (rib_lrc_entry_t *)
5123                     kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
5124                 bzero(reply_buf, sizeof (rib_lrc_entry_t));
5125                 reply_buf->lrc_buf  = kmem_alloc(len, KM_SLEEP);
5126                 reply_buf->lrc_len  = len;
5127                 reply_buf->registered = FALSE;
5128                 reply_buf->avl_node = (void *)rcas;
5129         }
5130 
5131         return (reply_buf);
5132 
5133 error_alloc:
5134         reply_buf = (rib_lrc_entry_t *)
5135             kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
5136         bzero(reply_buf, sizeof (rib_lrc_entry_t));
5137         reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
5138         reply_buf->lrc_len = len;
5139         reply_buf->registered = FALSE;
5140         reply_buf->avl_node = NULL;
5141 
5142         return (reply_buf);
5143 }
5144 
5145 /*
5146  * Return a pre-registered back to the cache (without
5147  * unregistering the buffer)..
5148  */
5149 
5150 static void
5151 rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf)
5152 {
5153         cache_avl_struct_t    cas, *rcas;
5154         avl_index_t where = NULL;
5155         rib_hca_t       *hca = (ctoqp(conn))->hca;
5156 
5157         if (!hca->avl_init)
5158                 goto  error_free;
5159 
5160         cas.len = reg_buf->lrc_len;
5161         rw_enter(&hca->avl_rw_lock, RW_READER);
5162         if ((rcas = (cache_avl_struct_t *)
5163             avl_find(&hca->avl_tree, &cas, &where)) == NULL) {
5164                 rw_exit(&hca->avl_rw_lock);
5165                 goto error_free;
5166         } else {
5167                 cas.len = reg_buf->lrc_len;
5168                 mutex_enter(&rcas->node_lock);
5169                 insque(reg_buf, &rcas->r);
5170                 rcas->elements ++;
5171                 mutex_exit(&rcas->node_lock);
5172                 rw_exit(&hca->avl_rw_lock);
5173                 mutex_enter(&hca->cache_allocation_lock);
5174                 hca->cache_allocation += cas.len;
5175                 mutex_exit(&hca->cache_allocation_lock);
5176         }
5177 
5178         return;
5179 
5180 error_free:
5181 
5182         if (reg_buf->registered)
5183                 (void) rib_deregistermem_via_hca(hca,
5184                     reg_buf->lrc_buf, reg_buf->lrc_mhandle);
5185         kmem_free(reg_buf->lrc_buf, reg_buf->lrc_len);
5186         kmem_free(reg_buf, sizeof (rib_lrc_entry_t));
5187 }
5188 
5189 static rdma_stat
5190 rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf,
5191         uint_t buflen, struct mrc *buf_handle)
5192 {
5193         ibt_mr_hdl_t    mr_hdl = NULL;  /* memory region handle */
5194         ibt_mr_desc_t   mr_desc;        /* vaddr, lkey, rkey */
5195         rdma_stat       status;
5196 
5197 
5198         /*
5199          * Note: ALL buffer pools use the same memory type RDMARW.
5200          */
5201         status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
5202         if (status == RDMA_SUCCESS) {
5203                 buf_handle->mrc_linfo = (uint64_t)(uintptr_t)mr_hdl;
5204                 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
5205                 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
5206         } else {
5207                 buf_handle->mrc_linfo = NULL;
5208                 buf_handle->mrc_lmr = 0;
5209                 buf_handle->mrc_rmr = 0;
5210         }
5211         return (status);
5212 }
5213 
/* ARGSUSED */
/*
 * Sync-aware deregistration entry point.  No separate sync teardown
 * is required here, so this simply deregisters the memory region;
 * sync_handle is unused (ARGSUSED).  Always returns RDMA_SUCCESS.
 */
static rdma_stat
rib_deregistermemsync_via_hca(rib_hca_t *hca, caddr_t buf,
    struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle)
{

	(void) rib_deregistermem_via_hca(hca, buf, buf_handle);
	return (RDMA_SUCCESS);
}
5223 
/* ARGSUSED */
/*
 * Deregister the IBT memory region whose handle is stashed in
 * buf_handle.mrc_linfo.  buf is unused (ARGSUSED) and any failure
 * from ibt_deregister_mr() is ignored; always returns RDMA_SUCCESS.
 */
static rdma_stat
rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle)
{

	(void) ibt_deregister_mr(hca->hca_hdl,
	    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
	return (RDMA_SUCCESS);
}
5233 
5234 /*
5235  * Check if the IP interface named by `lifrp' is RDMA-capable.
5236  */
5237 static boolean_t
5238 rpcib_rdma_capable_interface(struct lifreq *lifrp)
5239 {
5240         char ifname[LIFNAMSIZ];
5241         char *cp;
5242 
5243         if (lifrp->lifr_type == IFT_IB)
5244                 return (B_TRUE);
5245 
5246         /*
5247          * Strip off the logical interface portion before getting
5248          * intimate with the name.
5249          */
5250         (void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ);
5251         if ((cp = strchr(ifname, ':')) != NULL)
5252                 *cp = '\0';
5253 
5254         return (strcmp("lo0", ifname) == 0);
5255 }
5256 
5257 static int
5258 rpcib_do_ip_ioctl(int cmd, int len, void *arg)
5259 {
5260         vnode_t *kkvp, *vp;
5261         TIUSER  *tiptr;
5262         struct  strioctl iocb;
5263         k_sigset_t smask;
5264         int     err = 0;
5265 
5266         if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kkvp) == 0) {
5267                 if (t_kopen(NULL, kkvp->v_rdev, FREAD|FWRITE,
5268                     &tiptr, CRED()) == 0) {
5269                         vp = tiptr->fp->f_vnode;
5270                 } else {
5271                         VN_RELE(kkvp);
5272                         return (EPROTO);
5273                 }
5274         } else {
5275                 return (EPROTO);
5276         }
5277 
5278         iocb.ic_cmd = cmd;
5279         iocb.ic_timout = 0;
5280         iocb.ic_len = len;
5281         iocb.ic_dp = (caddr_t)arg;
5282         sigintr(&smask, 0);
5283         err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
5284         sigunintr(&smask);
5285         (void) t_kclose(tiptr, 0);
5286         VN_RELE(kkvp);
5287         return (err);
5288 }
5289 
5290 /*
5291  * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'.
5292  * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes.
5293  */
5294 static int
5295 rpcib_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep)
5296 {
5297         int err;
5298         struct lifnum lifn;
5299 
5300         bzero(&lifn, sizeof (struct lifnum));
5301         lifn.lifn_family = AF_UNSPEC;
5302 
5303         err = rpcib_do_ip_ioctl(SIOCGLIFNUM, sizeof (struct lifnum), &lifn);
5304         if (err != 0)
5305                 return (err);
5306 
5307         /*
5308          * Pad the interface count to account for additional interfaces that
5309          * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF.
5310          */
5311         lifn.lifn_count += 4;
5312 
5313         bzero(lifcp, sizeof (struct lifconf));
5314         lifcp->lifc_family = AF_UNSPEC;
5315         lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq);
5316         lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP);
5317 
5318         err = rpcib_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp);
5319         if (err != 0) {
5320                 kmem_free(lifcp->lifc_buf, *bufsizep);
5321                 return (err);
5322         }
5323         return (0);
5324 }
5325 
5326 static boolean_t
5327 rpcib_get_ib_addresses(rpcib_ipaddrs_t *addrs4, rpcib_ipaddrs_t *addrs6)
5328 {
5329         uint_t i, nifs;
5330         uint_t bufsize;
5331         struct lifconf lifc;
5332         struct lifreq *lifrp;
5333         struct sockaddr_in *sinp;
5334         struct sockaddr_in6 *sin6p;
5335 
5336         bzero(addrs4, sizeof (rpcib_ipaddrs_t));
5337         bzero(addrs6, sizeof (rpcib_ipaddrs_t));
5338 
5339         if (rpcib_do_lifconf(&lifc, &bufsize) != 0)
5340                 return (B_FALSE);
5341 
5342         if ((nifs = lifc.lifc_len / sizeof (struct lifreq)) == 0) {
5343                 kmem_free(lifc.lifc_buf, bufsize);
5344                 return (B_FALSE);
5345         }
5346 
5347         /*
5348          * Worst case is that all of the addresses are IB-capable and have
5349          * the same address family, so size our buffers accordingly.
5350          */
5351         addrs4->ri_size = nifs * sizeof (struct sockaddr_in);
5352         addrs4->ri_list = kmem_zalloc(addrs4->ri_size, KM_SLEEP);
5353         addrs6->ri_size = nifs * sizeof (struct sockaddr_in6);
5354         addrs6->ri_list = kmem_zalloc(addrs6->ri_size, KM_SLEEP);
5355 
5356         for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) {
5357                 if (!rpcib_rdma_capable_interface(lifrp))
5358                         continue;
5359 
5360                 if (lifrp->lifr_addr.ss_family == AF_INET) {
5361                         sinp = addrs4->ri_list;
5362                         bcopy(&lifrp->lifr_addr, &sinp[addrs4->ri_count++],
5363                             sizeof (struct sockaddr_in));
5364                 } else if (lifrp->lifr_addr.ss_family == AF_INET6) {
5365                         sin6p = addrs6->ri_list;
5366                         bcopy(&lifrp->lifr_addr, &sin6p[addrs6->ri_count++],
5367                             sizeof (struct sockaddr_in6));
5368                 }
5369         }
5370 
5371         kmem_free(lifc.lifc_buf, bufsize);
5372         return (B_TRUE);
5373 }
5374 
5375 /* ARGSUSED */
5376 static int
5377 rpcib_cache_kstat_update(kstat_t *ksp, int rw)
5378 {
5379         rib_hca_t *hca;
5380 
5381         if (KSTAT_WRITE == rw) {
5382                 return (EACCES);
5383         }
5384 
5385         rpcib_kstat.cache_limit.value.ui64 =
5386             (uint64_t)cache_limit;
5387         rw_enter(&rib_stat->hcas_list_lock, RW_READER);
5388         for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
5389                 rpcib_kstat.cache_allocation.value.ui64 +=
5390                     (uint64_t)hca->cache_allocation;
5391                 rpcib_kstat.cache_hits.value.ui64 +=
5392                     (uint64_t)hca->cache_hits;
5393                 rpcib_kstat.cache_misses.value.ui64 +=
5394                     (uint64_t)hca->cache_misses;
5395                 rpcib_kstat.cache_misses_above_the_limit.value.ui64 +=
5396                     (uint64_t)hca->cache_misses_above_the_limit;
5397         }
5398         rw_exit(&rib_stat->hcas_list_lock);
5399         return (0);
5400 }