1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 
  26 /*
  27  * hermon_wr.c
  28  *    Hermon Work Request Processing Routines
  29  *
  30  *    Implements all the routines necessary to provide the PostSend(),
  31  *    PostRecv() and PostSRQ() verbs.  Also contains all the code
  32  *    necessary to implement the Hermon WRID tracking mechanism.
  33  */
  34 
  35 #include <sys/types.h>
  36 #include <sys/conf.h>
  37 #include <sys/ddi.h>
  38 #include <sys/sunddi.h>
  39 #include <sys/modctl.h>
  40 #include <sys/avl.h>
  41 
  42 #include <sys/ib/adapters/hermon/hermon.h>
  43 
  44 static uint32_t hermon_wr_get_immediate(ibt_send_wr_t *wr);
  45 static int hermon_wr_bind_check(hermon_state_t *state, ibt_send_wr_t *wr);
  46 static int hermon_wqe_send_build(hermon_state_t *state, hermon_qphdl_t qp,
  47     ibt_send_wr_t *wr, uint64_t *desc, uint_t *size);
  48 static int hermon_wqe_mlx_build(hermon_state_t *state, hermon_qphdl_t qp,
  49     ibt_send_wr_t *wr, uint64_t *desc, uint_t *size);
  50 static void hermon_wqe_headroom(uint_t from, hermon_qphdl_t qp);
  51 static int hermon_wqe_recv_build(hermon_state_t *state, hermon_qphdl_t qp,
  52     ibt_recv_wr_t *wr, uint64_t *desc);
  53 static int hermon_wqe_srq_build(hermon_state_t *state, hermon_srqhdl_t srq,
  54     ibt_recv_wr_t *wr, uint64_t *desc);
  55 static hermon_workq_avl_t *hermon_wrid_wqavl_find(hermon_cqhdl_t cq, uint_t qpn,
  56     uint_t send_or_recv);
  57 static void hermon_cq_workq_add(hermon_cqhdl_t cq, hermon_workq_avl_t *wqavl);
  58 static void hermon_cq_workq_remove(hermon_cqhdl_t cq,
  59     hermon_workq_avl_t *wqavl);
  60 
/*
 * "Null" scatter/gather entry: zero address, L_Key 0x00000100, zero
 * length.  NOTE(review): not referenced in this portion of the file;
 * presumably used elsewhere to stamp unused data segments — confirm
 * against the rest of hermon_wr.c.
 */
static	ibt_wr_ds_t	null_sgl = { 0, 0x00000100, 0 };

/*
 * Add ability to try to debug RDMA_READ/RDMA_WRITE failures.
 * Bit-mask of debug behaviors (0 disables all):
 *
 *	0x1 - print rkey used during post_send
 *	0x2 - print sgls used during post_send
 *	0x4 - print FMR comings and goings
 */
int hermon_rdma_debug = 0x0;
  71 
/*
 * hermon_post_send_ud()
 *    Build and post "num_wr" Send work requests (starting at "wr") onto
 *    the send queue of a UD, RFCI, or FC-command QP.  Handles the
 *    IBT_WRC_SEND, IBT_WRC_SEND_LSO, IBT_WRC_INIT_SEND_FCMD and
 *    IBT_WRC_FAST_REG_PMR opcodes.  WQEs are constructed in place in
 *    the send queue; after the whole chain is built, the UAR doorbell
 *    is rung once and the "headroom" WQEs ahead of the new tail are
 *    re-stamped with an invalid pattern.
 *
 *    Must be entered with qp->qp_sq_lock held; the lock is released
 *    before returning on all paths (success, error, and PIO failure),
 *    as declared by the _NOTE() below.
 *
 *    Returns DDI_SUCCESS or an IBT_* error code.  If "num_posted" is
 *    non-NULL, it is set to the number of WRs actually posted (which
 *    may be fewer than num_wr if an error stops the chain early).
 */
static int
hermon_post_send_ud(hermon_state_t *state, hermon_qphdl_t qp,
    ibt_send_wr_t *wr, uint_t num_wr, uint_t *num_posted)
{
	hermon_hw_snd_wqe_ud_t		*ud;
	hermon_workq_hdr_t		*wq;
	hermon_ahhdl_t			ah;
	ibt_wr_rfci_send_t		*rfci;
	ibt_wr_init_send_t		*is;
	ibt_ud_dest_t			*dest;
	uint64_t			*desc;
	uint32_t			desc_sz;
	uint32_t			signaled_dbd, solicited;
	uint32_t			head, tail, next_tail, qsize_msk;
	uint32_t			hdrmwqes;
	uint32_t			nopcode, fence, immed_data = 0;
	hermon_hw_wqe_sgl_t		*ds, *old_ds;
	ibt_wr_ds_t			*sgl;
	int				nds;
	int				i, j, last_ds, num_ds, status;
	uint32_t			*wqe_start;
	int				sectperwqe;
	uint_t				posted_cnt = 0;
	int				total_len, strong_order, fc_bits, cksum;


	/* initialize the FMA retry loop */
	hermon_pio_init(fm_loop_cnt, fm_status, fm_test_num);

	ASSERT(MUTEX_HELD(&qp->qp_sq_lock));
	_NOTE(LOCK_RELEASED_AS_SIDE_EFFECT(&qp->qp_sq_lock))

	/*
	 * Load barrier before sampling work queue state below.
	 * NOTE(review): the original comment here said "Grab the lock
	 * for the WRID list", but no lock is acquired at this point --
	 * qp_sq_lock is already held on entry; this is only a barrier.
	 */
	membar_consumer();

	/* Save away some initial QP state */
	wq = qp->qp_sq_wqhdr;
	qsize_msk = wq->wq_mask;
	hdrmwqes  = qp->qp_sq_hdrmwqes;		/* in WQEs  */
	/* derived from qp_sq_log_wqesz; used to step the headroom stamp */
	sectperwqe = 1 << (qp->qp_sq_log_wqesz - 2);

	tail	  = wq->wq_tail;
	head	  = wq->wq_head;
	status	  = DDI_SUCCESS;

post_next:
	/*
	 * Check for "queue full" condition.  If the queue
	 * is already full, then no more WQEs can be posted.
	 * So break out, ring a doorbell (if necessary) and
	 * return an error
	 */
	if (wq->wq_full != 0) {
		status = IBT_QP_FULL;
		goto done;
	}

	/*
	 * Compute the next tail; if posting here would run the tail
	 * (plus headroom) into the head, mark the queue full so the
	 * *next* post attempt fails.
	 */
	next_tail = (tail + 1) & qsize_msk;
	if (((tail + hdrmwqes) & qsize_msk) == head) {
		wq->wq_full = 1;
	}

	desc = HERMON_QP_SQ_ENTRY(qp, tail);

	nds = wr->wr_nds;
	sgl = wr->wr_sgl;
	num_ds = 0;
	strong_order = 0;
	fc_bits = 0;
	cksum = 0;

	/*
	 * Build a Send or Send_LSO WQE
	 */
	switch (wr->wr_opcode) {
	case IBT_WRC_SEND_LSO:
		if (wr->wr_trans != IBT_UD_SRV) {
			status = IBT_QP_SRV_TYPE_INVALID;
			goto done;
		}
		nopcode = HERMON_WQE_SEND_NOPCODE_LSO;
		if (wr->wr_flags & IBT_WR_SEND_CKSUM)
			cksum = 0x30;
		if (wr->wr.ud_lso.lso_hdr_sz > 60) {
			nopcode |= (1 << 6);	/* ReRead bit must be set */
		}
		dest = wr->wr.ud_lso.lso_ud_dest;
		ah = (hermon_ahhdl_t)dest->ud_ah;
		if (ah == NULL) {
			status = IBT_AH_HDL_INVALID;
			goto done;
		}
		/* UD address segment sits just past the control segment */
		ud = (hermon_hw_snd_wqe_ud_t *)((uintptr_t)desc +
		    sizeof (hermon_hw_snd_wqe_ctrl_t));
		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)ud +
		    sizeof (hermon_hw_snd_wqe_ud_t));
		HERMON_WQE_BUILD_UD(qp, ud, ah, dest);

		/*
		 * LSO area: 4 bytes of LSO info plus the header, rounded
		 * up to a 16-byte boundary.  Verify the header and all
		 * data segments fit within a single WQE.
		 */
		total_len = (4 + 0xf + wr->wr.ud_lso.lso_hdr_sz) & ~0xf;
		if ((uintptr_t)ds + total_len + (nds * 16) >
		    (uintptr_t)desc + (1 << qp->qp_sq_log_wqesz)) {
			status = IBT_QP_SGL_LEN_INVALID;
			goto done;
		}
		old_ds = ds;
		/*
		 * Copy the LSO header past the first dword of the LSO
		 * segment (HERMON_WQE_BUILD_LSO fills that dword later,
		 * after the SGL is in place).
		 */
		bcopy(wr->wr.ud_lso.lso_hdr, (uint32_t *)old_ds + 1,
		    wr->wr.ud_lso.lso_hdr_sz);
		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)ds + total_len);
		i = 0;
		break;

	case IBT_WRC_SEND:
		nopcode = HERMON_WQE_SEND_NOPCODE_SEND;
		if (qp->qp_serv_type == HERMON_QP_UD) {
			if (wr->wr_trans != IBT_UD_SRV) {
				status = IBT_QP_SRV_TYPE_INVALID;
				goto done;
			}
			if (wr->wr_flags & IBT_WR_SEND_CKSUM)
				cksum = 0x30;
			dest = wr->wr.ud.udwr_dest;
		} else if (qp->qp_serv_type == HERMON_QP_RFCI) {
			if (wr->wr_trans != IBT_RFCI_SRV) {
				status = IBT_QP_SRV_TYPE_INVALID;
				goto done;
			}
			rfci = &wr->wr.fc.rfci_send;
			if ((wr->wr_flags & IBT_WR_SEND_FC_CRC) != 0) {
				nopcode |= (rfci->rfci_eof << 16);
				fc_bits = 0x40;	/* set FCRC */
			}
			dest = rfci->rfci_dest;
		} else {
			status = IBT_QP_OP_TYPE_INVALID;
			goto done;
		}
		if (wr->wr_flags & IBT_WR_SEND_IMMED) {
			/* "|=" changes 0xa to 0xb without touching FCEOF */
			nopcode |= HERMON_WQE_SEND_NOPCODE_SENDI;
			immed_data = wr->wr.ud.udwr_immed;
		}
		ah = (hermon_ahhdl_t)dest->ud_ah;
		if (ah == NULL) {
			status = IBT_AH_HDL_INVALID;
			goto done;
		}
		/* UD address segment sits just past the control segment */
		ud = (hermon_hw_snd_wqe_ud_t *)((uintptr_t)desc +
		    sizeof (hermon_hw_snd_wqe_ctrl_t));
		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)ud +
		    sizeof (hermon_hw_snd_wqe_ud_t));
		HERMON_WQE_BUILD_UD(qp, ud, ah, dest);
		i = 0;
		break;

	case IBT_WRC_INIT_SEND_FCMD:
		if (qp->qp_serv_type != HERMON_QP_FCMND) {
			status = IBT_QP_OP_TYPE_INVALID;
			goto done;
		}
		if (wr->wr_trans != IBT_FCMD_SRV) {
			status = IBT_QP_SRV_TYPE_INVALID;
			goto done;
		}
		nopcode = HERMON_WQE_FCP_OPCODE_INIT_AND_SEND;
		is = wr->wr.fc.fc_is;
		dest = is->is_ctl.fc_dest;
		ah = (hermon_ahhdl_t)dest->ud_ah;
		if (ah == NULL) {
			status = IBT_AH_HDL_INVALID;
			goto done;
		}
		ud = (hermon_hw_snd_wqe_ud_t *)((uintptr_t)desc +
		    sizeof (hermon_hw_snd_wqe_ctrl_t));
		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)ud +
		    sizeof (hermon_hw_snd_wqe_ud_t));
		HERMON_WQE_BUILD_UD(qp, ud, ah, dest);
		/*
		 * Remember where the FCP-3 Init Segment goes (filled in
		 * after the SGL, below) and move ds beyond it.
		 */
		old_ds = ds;
		/* move ds beyond the FCP-3 Init Segment */
		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)ds + 0x10);
		i = 0;
		break;

	case IBT_WRC_FAST_REG_PMR:
	{
		hermon_hw_snd_wqe_frwr_t	*frwr;

		if (qp->qp_serv_type != HERMON_QP_FCMND) {
			status = IBT_QP_OP_TYPE_INVALID;
			goto done;
		}
		if (wr->wr_trans != IBT_FCMD_SRV) {
			status = IBT_QP_SRV_TYPE_INVALID;
			goto done;
		}
		nopcode = HERMON_WQE_SEND_NOPCODE_FRWR;
		frwr = (hermon_hw_snd_wqe_frwr_t *)((uintptr_t)desc +
		    sizeof (hermon_hw_snd_wqe_ctrl_t));
		HERMON_WQE_BUILD_FRWR(qp, frwr, wr->wr.fc.reg_pmr);
		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)frwr +
		    sizeof (hermon_hw_snd_wqe_frwr_t));
		nds = 0;		/* FRWR carries no data segments */
		strong_order = 0x80;
		break;
	}

#if 0
	/* firmware does not support this */
	case IBT_WRC_LOCAL_INVALIDATE:
	{
		hermon_hw_snd_wqe_local_inv_t	*li;

		if (qp->qp_serv_type != HERMON_QP_FCMND) {
			status = IBT_QP_OP_TYPE_INVALID;
			goto done;
		}
		if (wr->wr_trans != IBT_FCMD_SRV) {
			status = IBT_QP_SRV_TYPE_INVALID;
			goto done;
		}
		nopcode = HERMON_WQE_SEND_NOPCODE_LCL_INV;
		li = (hermon_hw_snd_wqe_local_inv_t *)((uintptr_t)desc +
		    sizeof (hermon_hw_snd_wqe_ctrl_t));
		HERMON_WQE_BUILD_LI(qp, li, wr->wr.fc.li);
		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)li +
		    sizeof (hermon_hw_snd_wqe_local_inv_t));
		nds = 0;
		strong_order = 0x80;
		break;
	}
#endif
	default:
		status = IBT_QP_OP_TYPE_INVALID;
		goto done;
	}

	/* Reject requests whose SGL exceeds the queue's configured max */
	if (nds > qp->qp_sq_sgl) {
		status = IBT_QP_SGL_LEN_INVALID;
		goto done;
	}
	/* Count the non-empty SGL entries that will actually be written */
	for (last_ds = num_ds, j = i; j < nds; j++) {
		if (sgl[j].ds_len != 0)
			last_ds++;	/* real last ds of wqe to fill */
	}
	/* WQE descriptor size, in 16-byte units */
	desc_sz = ((uintptr_t)&ds[last_ds] - (uintptr_t)desc) >> 0x4;
	/* Fill data segments last-to-first, skipping zero-length entries */
	for (j = nds; --j >= i; ) {
		if (sgl[j].ds_len == 0) {
			continue;
		}

		/*
		 * Fill in the Data Segment(s) for the current WQE, using the
		 * information contained in the scatter-gather list of the
		 * work request.
		 */
		last_ds--;
		HERMON_WQE_BUILD_DATA_SEG_SEND(&ds[last_ds], &sgl[j]);
	}

	/* make SGL writes visible before the segment headers below */
	membar_producer();

	if (wr->wr_opcode == IBT_WRC_SEND_LSO) {
		HERMON_WQE_BUILD_LSO(qp, old_ds, wr->wr.ud_lso.lso_mss,
		    wr->wr.ud_lso.lso_hdr_sz);
	} else if (wr->wr_opcode == IBT_WRC_INIT_SEND_FCMD) {
		/* This sits in the STAMP, so must be set after setting SGL */
		HERMON_WQE_BUILD_FCP3_INIT(old_ds, is->is_ctl.fc_frame_ctrl,
		    is->is_cs_priority, is->is_tx_seq_id, is->is_fc_mtu,
		    is->is_dest_id, is->is_op, is->is_rem_exch,
		    is->is_exch_qp_idx);

		/* The following will be used in HERMON_WQE_SET_CTRL_SEGMENT */
		/* SIT bit in FCP-3 ctrl segment */
		desc_sz |= (is->is_ctl.fc_frame_ctrl & IBT_FCTL_SIT) ? 0x80 : 0;
		/* LS bit in FCP-3 ctrl segment */
		fc_bits |= (is->is_ctl.fc_frame_ctrl & IBT_FCTL_LAST_SEQ) ?
		    0x10000 : 0;
		fc_bits |= ((is->is_ctl.fc_routing_ctrl & 0xF) << 20) |
		    (is->is_ctl.fc_seq_id << 24);
		immed_data = is->is_ctl.fc_parameter;
	}

	fence = (wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0;

	/* 0xC requests a CQE for this WQE (all-signaled QP or explicit flag) */
	signaled_dbd = ((qp->qp_sq_sigtype == HERMON_QP_SQ_ALL_SIGNALED) ||
	    (wr->wr_flags & IBT_WR_SEND_SIGNAL)) ? 0xC : 0;

	solicited = (wr->wr_flags & IBT_WR_SEND_SOLICIT) ? 0x2 : 0;

	HERMON_WQE_SET_CTRL_SEGMENT(desc, desc_sz, fence, immed_data,
	    solicited, signaled_dbd, cksum, qp, strong_order, fc_bits);

	/* Record the caller's WR ID so completion processing can return it */
	wq->wq_wrid[tail] = wr->wr_id;

	tail = next_tail;

	/* Update some of the state in the QP */
	wq->wq_tail = tail;

	/* WQE contents must be globally visible before handing ownership */
	membar_producer();

	/* Now set the ownership bit and opcode (first dword). */
	HERMON_SET_SEND_WQE_OWNER(qp, (uint32_t *)desc, nopcode);

	posted_cnt++;
	if (--num_wr > 0) {
		/* do the invalidate of the headroom */
		wqe_start = (uint32_t *)HERMON_QP_SQ_ENTRY(qp,
		    (tail + hdrmwqes) & qsize_msk);
		for (i = 16; i < sectperwqe; i += 16) {
			wqe_start[i] = 0xFFFFFFFF;
		}

		wr++;
		goto post_next;
	}
done:
	if (posted_cnt != 0) {
		ddi_acc_handle_t uarhdl = hermon_get_uarhdl(state);

		/* flush all WQE stores before ringing the doorbell */
		membar_producer();

		/* the FMA retry loop starts for Hermon doorbell register. */
		hermon_pio_start(state, uarhdl, pio_error, fm_loop_cnt,
		    fm_status, fm_test_num);

		HERMON_UAR_DOORBELL(state, uarhdl,
		    (uint64_t *)(void *)&state->hs_uar->send,
		    (uint64_t)qp->qp_ring);

		/* the FMA retry loop ends. */
		hermon_pio_end(state, uarhdl, pio_error, fm_loop_cnt,
		    fm_status, fm_test_num);

		/* do the invalidate of the headroom */
		wqe_start = (uint32_t *)HERMON_QP_SQ_ENTRY(qp,
		    (tail + hdrmwqes) & qsize_msk);
		for (i = 16; i < sectperwqe; i += 16) {
			wqe_start[i] = 0xFFFFFFFF;
		}
	}
	if (num_posted != NULL)
		*num_posted = posted_cnt;

	mutex_exit(&qp->qp_sq_lock);

	return (status);

pio_error:
	/* PIO to the doorbell failed: report lost service to FMA */
	mutex_exit(&qp->qp_sq_lock);
	hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
	return (ibc_get_ci_failure(0));
}
 424 
/*
 * hermon_post_send_rc()
 *    Build and post "num_wr" Send work requests (starting at "wr") onto
 *    the send queue of an RC QP.  Supported opcodes: Send (with
 *    optional immediate or remote-invalidate), RDMA Read/Write,
 *    Compare-Swap and Fetch-Add atomics, memory window Bind, fast
 *    memory registration (FRWR), and Local Invalidate.  After building
 *    the chain, rings the UAR doorbell once and re-stamps the headroom
 *    WQEs ahead of the new tail.
 *
 *    Must be entered with qp->qp_sq_lock held; the lock is released
 *    before returning on all paths (success, error, and PIO failure),
 *    per the _NOTE() below.
 *
 *    Returns DDI_SUCCESS or an IBT_* error code.  If "num_posted" is
 *    non-NULL, it is set to the number of WRs actually posted.
 */
static int
hermon_post_send_rc(hermon_state_t *state, hermon_qphdl_t qp,
    ibt_send_wr_t *wr, uint_t num_wr, uint_t *num_posted)
{
	uint64_t			*desc;
	hermon_workq_hdr_t		*wq;
	uint32_t			desc_sz;
	uint32_t			signaled_dbd, solicited;
	uint32_t			head, tail, next_tail, qsize_msk;
	uint32_t			hdrmwqes;
	int				status;
	uint32_t			nopcode, fence, immed_data = 0;
	hermon_hw_snd_wqe_remaddr_t	*rc;
	hermon_hw_snd_wqe_atomic_t	*at;
	hermon_hw_snd_wqe_bind_t	*bn;
	hermon_hw_snd_wqe_frwr_t	*frwr;
	hermon_hw_snd_wqe_local_inv_t	*li;
	hermon_hw_wqe_sgl_t		*ds;
	ibt_wr_ds_t			*sgl;
	int				nds;
	int				i, last_ds, num_ds;
	uint32_t			*wqe_start;
	int				sectperwqe;
	uint_t				posted_cnt = 0;
	int				strong_order;
	int				print_rdma;	/* hermon_rdma_debug snapshot */
	int				rlen;		/* total SGL byte length (debug) */
	uint32_t			rkey;
	uint64_t			raddr;

	/* initialize the FMA retry loop */
	hermon_pio_init(fm_loop_cnt, fm_status, fm_test_num);

	ASSERT(MUTEX_HELD(&qp->qp_sq_lock));
	_NOTE(LOCK_RELEASED_AS_SIDE_EFFECT(&qp->qp_sq_lock))

	/* Save away some initial QP state */
	wq = qp->qp_sq_wqhdr;
	qsize_msk = wq->wq_mask;
	hdrmwqes  = qp->qp_sq_hdrmwqes;		/* in WQEs  */
	/* derived from qp_sq_log_wqesz; used to step the headroom stamp */
	sectperwqe = 1 << (qp->qp_sq_log_wqesz - 2);

	tail	  = wq->wq_tail;
	head	  = wq->wq_head;
	status	  = DDI_SUCCESS;

post_next:
	/* per-WR debug/accumulator state */
	print_rdma = 0;
	rlen = 0;
	strong_order = 0;

	/*
	 * Check for "queue full" condition.  If the queue
	 * is already full, then no more WQEs can be posted.
	 * So break out, ring a doorbell (if necessary) and
	 * return an error
	 */
	if (wq->wq_full != 0) {
		status = IBT_QP_FULL;
		goto done;
	}
	/*
	 * Compute the next tail; if posting here would run the tail
	 * (plus headroom) into the head, mark the queue full so the
	 * *next* post attempt fails.
	 */
	next_tail = (tail + 1) & qsize_msk;
	if (((tail + hdrmwqes) & qsize_msk) == head) {
		wq->wq_full = 1;
	}

	desc = HERMON_QP_SQ_ENTRY(qp, tail);

	/* default: data segments directly follow the control segment */
	ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)desc +
	    sizeof (hermon_hw_snd_wqe_ctrl_t));
	nds = wr->wr_nds;
	sgl = wr->wr_sgl;
	num_ds = 0;
	if (wr->wr_trans != IBT_RC_SRV) {
		status = IBT_QP_SRV_TYPE_INVALID;
		goto done;
	}

	/*
	 * Validate the operation type.  For RC requests, we allow
	 * "Send", "RDMA Read", "RDMA Write", various "Atomic"
	 * operations, and memory window "Bind"
	 */
	switch (wr->wr_opcode) {
	default:
		status = IBT_QP_OP_TYPE_INVALID;
		goto done;

	case IBT_WRC_SEND:
		if (wr->wr_flags & IBT_WR_SEND_REMOTE_INVAL) {
			/* immediate field carries the R_Key to invalidate */
			nopcode = HERMON_WQE_SEND_NOPCODE_SND_INV;
			immed_data = wr->wr.rc.rcwr.send_inval;
		} else if (wr->wr_flags & IBT_WR_SEND_IMMED) {
			nopcode = HERMON_WQE_SEND_NOPCODE_SENDI;
			immed_data = wr->wr.rc.rcwr.send_immed;
		} else {
			nopcode = HERMON_WQE_SEND_NOPCODE_SEND;
		}
		break;

	/*
	 * If this is an RDMA Read or RDMA Write request, then fill
	 * in the "Remote Address" header fields.
	 */
	case IBT_WRC_RDMAW:
		if (wr->wr_flags & IBT_WR_SEND_IMMED) {
			nopcode = HERMON_WQE_SEND_NOPCODE_RDMAWI;
			immed_data = wr->wr.rc.rcwr.rdma.rdma_immed;
		} else {
			nopcode = HERMON_WQE_SEND_NOPCODE_RDMAW;
		}
		/* FALLTHROUGH */
	case IBT_WRC_RDMAR:
		if (wr->wr_opcode == IBT_WRC_RDMAR)
			nopcode = HERMON_WQE_SEND_NOPCODE_RDMAR;
		rc = (hermon_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
		    sizeof (hermon_hw_snd_wqe_ctrl_t));

		/*
		 * Build the Remote Address Segment for the WQE, using
		 * the information from the RC work request.
		 */
		HERMON_WQE_BUILD_REMADDR(qp, rc, &wr->wr.rc.rcwr.rdma);

		/* snapshot rkey/raddr for the debug printout below */
		if (hermon_rdma_debug) {
			print_rdma = hermon_rdma_debug;
			rkey = wr->wr.rc.rcwr.rdma.rdma_rkey;
			raddr = wr->wr.rc.rcwr.rdma.rdma_raddr;
		}

		/* Update "ds" for filling in Data Segments (below) */
		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)rc +
		    sizeof (hermon_hw_snd_wqe_remaddr_t));
		break;

	/*
	 * If this is one of the Atomic type operations (i.e
	 * Compare-Swap or Fetch-Add), then fill in both the "Remote
	 * Address" header fields and the "Atomic" header fields.
	 */
	case IBT_WRC_CSWAP:
		nopcode = HERMON_WQE_SEND_NOPCODE_ATMCS;
		/* FALLTHROUGH */
	case IBT_WRC_FADD:
		if (wr->wr_opcode == IBT_WRC_FADD)
			nopcode = HERMON_WQE_SEND_NOPCODE_ATMFA;
		rc = (hermon_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
		    sizeof (hermon_hw_snd_wqe_ctrl_t));
		at = (hermon_hw_snd_wqe_atomic_t *)((uintptr_t)rc +
		    sizeof (hermon_hw_snd_wqe_remaddr_t));

		/*
		 * Build the Remote Address and Atomic Segments for
		 * the WQE, using the information from the RC Atomic
		 * work request.
		 */
		HERMON_WQE_BUILD_RC_ATOMIC_REMADDR(qp, rc, wr);
		HERMON_WQE_BUILD_ATOMIC(qp, at, wr->wr.rc.rcwr.atomic);

		/* Update "ds" for filling in Data Segments (below) */
		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)at +
		    sizeof (hermon_hw_snd_wqe_atomic_t));

		/*
		 * Update "nds" and "sgl" because Atomic requests have
		 * only a single Data Segment.
		 */
		nds = 1;
		sgl = wr->wr_sgl;
		break;

	/*
	 * If this is memory window Bind operation, then we call the
	 * hermon_wr_bind_check() routine to validate the request and
	 * to generate the updated RKey.  If this is successful, then
	 * we fill in the WQE's "Bind" header fields.
	 */
	case IBT_WRC_BIND:
		nopcode = HERMON_WQE_SEND_NOPCODE_BIND;
		status = hermon_wr_bind_check(state, wr);
		if (status != DDI_SUCCESS)
			goto done;

		bn = (hermon_hw_snd_wqe_bind_t *)((uintptr_t)desc +
		    sizeof (hermon_hw_snd_wqe_ctrl_t));

		/*
		 * Build the Bind Memory Window Segments for the WQE,
		 * using the information from the RC Bind memory
		 * window work request.
		 */
		HERMON_WQE_BUILD_BIND(qp, bn, wr->wr.rc.rcwr.bind);

		/*
		 * Update the "ds" pointer.  Even though the "bind"
		 * operation requires no SGLs, this is necessary to
		 * facilitate the correct descriptor size calculations
		 * (below).
		 */
		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)bn +
		    sizeof (hermon_hw_snd_wqe_bind_t));
		nds = 0;
		break;

	case IBT_WRC_FAST_REG_PMR:
		/* Fast registration of a physical memory region (FRWR) */
		nopcode = HERMON_WQE_SEND_NOPCODE_FRWR;
		frwr = (hermon_hw_snd_wqe_frwr_t *)((uintptr_t)desc +
		    sizeof (hermon_hw_snd_wqe_ctrl_t));
		HERMON_WQE_BUILD_FRWR(qp, frwr, wr->wr.rc.rcwr.reg_pmr);
		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)frwr +
		    sizeof (hermon_hw_snd_wqe_frwr_t));
		nds = 0;
		strong_order = 0x80;
		break;

	case IBT_WRC_LOCAL_INVALIDATE:
		nopcode = HERMON_WQE_SEND_NOPCODE_LCL_INV;
		li = (hermon_hw_snd_wqe_local_inv_t *)((uintptr_t)desc +
		    sizeof (hermon_hw_snd_wqe_ctrl_t));
		HERMON_WQE_BUILD_LI(qp, li, wr->wr.rc.rcwr.li);
		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)li +
		    sizeof (hermon_hw_snd_wqe_local_inv_t));
		nds = 0;
		strong_order = 0x80;
		break;
	}

	/*
	 * Now fill in the Data Segments (SGL) for the Send WQE based
	 * on the values setup above (i.e. "sgl", "nds", and the "ds"
	 * pointer. Start by checking for a valid number of SGL entries
	 */
	if (nds > qp->qp_sq_sgl) {
		status = IBT_QP_SGL_LEN_INVALID;
		goto done;
	}

	/* Count the non-empty SGL entries that will actually be written */
	for (last_ds = num_ds, i = 0; i < nds; i++) {
		if (sgl[i].ds_len != 0)
			last_ds++;	/* real last ds of wqe to fill */
	}
	/* WQE descriptor size, in 16-byte units */
	desc_sz = ((uintptr_t)&ds[last_ds] - (uintptr_t)desc) >> 0x4;
	/* Fill data segments last-to-first, skipping zero-length entries */
	for (i = nds; --i >= 0; ) {
		if (sgl[i].ds_len == 0) {
			continue;
		}
		rlen += sgl[i].ds_len;
		if (print_rdma & 0x2)
			IBTF_DPRINTF_L2("rdma", "post: [%d]: laddr %llx  "
			    "llen %x", i, sgl[i].ds_va, sgl[i].ds_len);

		/*
		 * Fill in the Data Segment(s) for the current WQE, using the
		 * information contained in the scatter-gather list of the
		 * work request.
		 */
		last_ds--;
		HERMON_WQE_BUILD_DATA_SEG_SEND(&ds[last_ds], &sgl[i]);
	}
	/* ensure RDMA READ does not exceed HCA limit */
	if ((wr->wr_opcode == IBT_WRC_RDMAR) && (desc_sz >
	    state->hs_ibtfinfo.hca_attr->hca_conn_rdma_read_sgl_sz + 2)) {
		status = IBT_QP_SGL_LEN_INVALID;
		goto done;
	}

	if (print_rdma & 0x1) {
		IBTF_DPRINTF_L2("rdma", "post: indx %x  rkey %x  raddr %llx  "
		    "total len %x", tail, rkey, raddr, rlen);
	}

	fence = (wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0;

	/* 0xC requests a CQE for this WQE (all-signaled QP or explicit flag) */
	signaled_dbd = ((qp->qp_sq_sigtype == HERMON_QP_SQ_ALL_SIGNALED) ||
	    (wr->wr_flags & IBT_WR_SEND_SIGNAL)) ? 0xC : 0;

	solicited = (wr->wr_flags & IBT_WR_SEND_SOLICIT) ? 0x2 : 0;

	HERMON_WQE_SET_CTRL_SEGMENT(desc, desc_sz, fence, immed_data, solicited,
	    signaled_dbd, 0, qp, strong_order, 0);

	/* Record the caller's WR ID so completion processing can return it */
	wq->wq_wrid[tail] = wr->wr_id;

	tail = next_tail;

	/* Update some of the state in the QP */
	wq->wq_tail = tail;

	/* WQE contents must be globally visible before handing ownership */
	membar_producer();

	/* Now set the ownership bit of the first one in the chain. */
	HERMON_SET_SEND_WQE_OWNER(qp, (uint32_t *)desc, nopcode);

	posted_cnt++;
	if (--num_wr > 0) {
		/* do the invalidate of the headroom */
		wqe_start = (uint32_t *)HERMON_QP_SQ_ENTRY(qp,
		    (tail + hdrmwqes) & qsize_msk);
		for (i = 16; i < sectperwqe; i += 16) {
			wqe_start[i] = 0xFFFFFFFF;
		}

		wr++;
		goto post_next;
	}
done:

	if (posted_cnt != 0) {
		ddi_acc_handle_t uarhdl = hermon_get_uarhdl(state);

		/* flush all WQE stores before ringing the doorbell */
		membar_producer();

		/* the FMA retry loop starts for Hermon doorbell register. */
		hermon_pio_start(state, uarhdl, pio_error, fm_loop_cnt,
		    fm_status, fm_test_num);

		/* Ring the doorbell */
		HERMON_UAR_DOORBELL(state, uarhdl,
		    (uint64_t *)(void *)&state->hs_uar->send,
		    (uint64_t)qp->qp_ring);

		/* the FMA retry loop ends. */
		hermon_pio_end(state, uarhdl, pio_error, fm_loop_cnt,
		    fm_status, fm_test_num);

		/* do the invalidate of the headroom */
		wqe_start = (uint32_t *)HERMON_QP_SQ_ENTRY(qp,
		    (tail + hdrmwqes) & qsize_msk);
		for (i = 16; i < sectperwqe; i += 16) {
			wqe_start[i] = 0xFFFFFFFF;
		}
	}
	/*
	 * Update the "num_posted" return value (if necessary).
	 * Then drop the locks and return success.
	 */
	if (num_posted != NULL) {
		*num_posted = posted_cnt;
	}

	mutex_exit(&qp->qp_sq_lock);
	return (status);

pio_error:
	/* PIO to the doorbell failed: report lost service to FMA */
	mutex_exit(&qp->qp_sq_lock);
	hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
	return (ibc_get_ci_failure(0));
}
 773 
 774 /*
 775  * hermon_post_send()
 776  *    Context: Can be called from interrupt or base context.
 777  */
 778 int
 779 hermon_post_send(hermon_state_t *state, hermon_qphdl_t qp,
 780     ibt_send_wr_t *wr, uint_t num_wr, uint_t *num_posted)
 781 {
 782         ibt_send_wr_t                   *curr_wr;
 783         hermon_workq_hdr_t              *wq;
 784         hermon_ahhdl_t                  ah;
 785         uint64_t                        *desc, *prev;
 786         uint32_t                        desc_sz;
 787         uint32_t                        signaled_dbd, solicited;
 788         uint32_t                        head, tail, next_tail, qsize_msk;
 789         uint32_t                        hdrmwqes;
 790         uint_t                          currindx, wrindx, numremain;
 791         uint_t                          chainlen;
 792         uint_t                          posted_cnt, maxstat;
 793         uint_t                          total_posted;
 794         int                             status;
 795         uint32_t                        nopcode, fence, immed_data = 0;
 796         uint32_t                        prev_nopcode;
 797         uint_t                          qp_state;
 798 
 799         /* initialize the FMA retry loop */
 800         hermon_pio_init(fm_loop_cnt, fm_status, fm_test);
 801 
 802         /*
 803          * Check for user-mappable QP memory.  Note:  We do not allow kernel
 804          * clients to post to QP memory that is accessible directly by the
 805          * user.  If the QP memory is user accessible, then return an error.
 806          */
 807         if (qp->qp_alloc_flags & IBT_QP_USER_MAP) {
 808                 return (IBT_QP_HDL_INVALID);
 809         }
 810 
 811         mutex_enter(&qp->qp_sq_lock);
 812 
 813         /*
 814          * Check QP state.  Can not post Send requests from the "Reset",
 815          * "Init", or "RTR" states
 816          */
 817         qp_state = qp->qp_state_for_post_send;
 818         if ((qp_state == HERMON_QP_RESET) ||
 819             (qp_state == HERMON_QP_INIT) ||
 820             (qp_state == HERMON_QP_RTR)) {
 821                 mutex_exit(&qp->qp_sq_lock);
 822                 return (IBT_QP_STATE_INVALID);
 823         }
 824 
 825         if (qp->qp_is_special)
 826                 goto post_many;
 827 
 828         /* Use these optimized functions most of the time */
 829         if (qp->qp_type == IBT_UD_RQP) {
 830                 return (hermon_post_send_ud(state, qp, wr, num_wr, num_posted));
 831         }
 832 
 833         if (qp->qp_serv_type == HERMON_QP_RC) {
 834                 return (hermon_post_send_rc(state, qp, wr, num_wr, num_posted));
 835         }
 836 
 837         if (qp->qp_serv_type == HERMON_QP_UC)
 838                 goto post_many;
 839 
 840         mutex_exit(&qp->qp_sq_lock);
 841         return (IBT_QP_SRV_TYPE_INVALID);
 842 
 843 post_many:
 844         /* general loop for non-optimized posting */
 845 
 846         /* Save away some initial QP state */
 847         wq = qp->qp_sq_wqhdr;
 848         qsize_msk = wq->wq_mask;
 849         tail      = wq->wq_tail;
 850         head      = wq->wq_head;
 851         hdrmwqes  = qp->qp_sq_hdrmwqes;              /* in WQEs  */
 852 
 853         /* Initialize posted_cnt */
 854         posted_cnt = 0;
 855         total_posted = 0;
 856 
 857         /*
 858          * For each ibt_send_wr_t in the wr[] list passed in, parse the
 859          * request and build a Send WQE.  NOTE:  Because we are potentially
 860          * building a chain of WQEs to post, we want to build them all first,
 861          * and set the valid (HW Ownership) bit on all but the first.
 862          * However, we do not want to validate the first one until the
 863          * entire chain of WQEs has been built.  Then in the final
 864          * we set the valid bit in the first, flush if needed, and as a last
 865          * step ring the appropriate doorbell.  NOTE: the doorbell ring may
 866          * NOT be needed if the HCA is already processing, but the doorbell
 867          * ring will be done regardless. NOTE ALSO:  It is possible for
 868          * more Work Requests to be posted than the HW will support at one
 869          * shot.  If this happens, we need to be able to post and ring
 870          * several chains here until the the entire request is complete.
 871          * NOTE ALSO:  the term "chain" is used to differentiate it from
 872          * Work Request List passed in; and because that's the terminology
 873          * from the previous generations of HCA - but the WQEs are not, in fact
 874          * chained together for Hermon
 875          */
 876 
 877         wrindx = 0;
 878         numremain = num_wr;
 879         status    = DDI_SUCCESS;
 880         while ((wrindx < num_wr) && (status == DDI_SUCCESS)) {
 881                 /*
 882                  * For the first WQE on a new chain we need "prev" to point
 883                  * to the current descriptor.
 884                  */
 885                 prev = HERMON_QP_SQ_ENTRY(qp, tail);
 886 
 887                 /*
 888                  * Break the request up into lists that are less than or
 889                  * equal to the maximum number of WQEs that can be posted
 890                  * per doorbell ring - 256 currently
 891                  */
 892                 chainlen = (numremain > HERMON_QP_MAXDESC_PER_DB) ?
 893                     HERMON_QP_MAXDESC_PER_DB : numremain;
 894                 numremain -= chainlen;
 895 
 896                 for (currindx = 0; currindx < chainlen; currindx++, wrindx++) {
 897                         /*
 898                          * Check for "queue full" condition.  If the queue
 899                          * is already full, then no more WQEs can be posted.
 900                          * So break out, ring a doorbell (if necessary) and
 901                          * return an error
 902                          */
 903                         if (wq->wq_full != 0) {
 904                                 status = IBT_QP_FULL;
 905                                 break;
 906                         }
 907 
 908                         /*
 909                          * Increment the "tail index". Check for "queue
 910                          * full" condition incl. headroom.  If we detect that
 911                          * the current work request is going to fill the work
 912                          * queue, then we mark this condition and continue.
 913                          * Don't need >=, because going one-by-one we have to
 914                          * hit it exactly sooner or later
 915                          */
 916 
 917                         next_tail = (tail + 1) & qsize_msk;
 918                         if (((tail + hdrmwqes) & qsize_msk) == head) {
 919                                 wq->wq_full = 1;
 920                         }
 921 
 922                         /*
 923                          * Get the address of the location where the next
 924                          * Send WQE should be built
 925                          */
 926                         desc = HERMON_QP_SQ_ENTRY(qp, tail);
 927                         /*
 928                          * Call hermon_wqe_send_build() to build the WQE
 929                          * at the given address.  This routine uses the
 930                          * information in the ibt_send_wr_t list (wr[]) and
 931                          * returns the size of the WQE when it returns.
 932                          */
 933                         status = hermon_wqe_send_build(state, qp,
 934                             &wr[wrindx], desc, &desc_sz);
 935                         if (status != DDI_SUCCESS) {
 936                                 break;
 937                         }
 938 
 939                         /*
 940                          * Now, build the Ctrl Segment based on
 941                          * what was just done
 942                          */
 943                         curr_wr = &wr[wrindx];
 944 
 945                         switch (curr_wr->wr_opcode) {
 946                         case IBT_WRC_RDMAW:
 947                                 if (curr_wr->wr_flags & IBT_WR_SEND_IMMED) {
 948                                         nopcode =
 949                                             HERMON_WQE_SEND_NOPCODE_RDMAWI;
 950                                         immed_data =
 951                                             hermon_wr_get_immediate(curr_wr);
 952                                 } else {
 953                                         nopcode = HERMON_WQE_SEND_NOPCODE_RDMAW;
 954                                 }
 955                                 break;
 956 
 957                         case IBT_WRC_SEND:
 958                                 if (curr_wr->wr_flags & IBT_WR_SEND_IMMED) {
 959                                         nopcode = HERMON_WQE_SEND_NOPCODE_SENDI;
 960                                         immed_data =
 961                                             hermon_wr_get_immediate(curr_wr);
 962                                 } else {
 963                                         nopcode = HERMON_WQE_SEND_NOPCODE_SEND;
 964                                 }
 965                                 break;
 966 
 967                         case IBT_WRC_SEND_LSO:
 968                                 nopcode = HERMON_WQE_SEND_NOPCODE_LSO;
 969                                 break;
 970 
 971                         case IBT_WRC_RDMAR:
 972                                 nopcode = HERMON_WQE_SEND_NOPCODE_RDMAR;
 973                                 break;
 974 
 975                         case IBT_WRC_CSWAP:
 976                                 nopcode = HERMON_WQE_SEND_NOPCODE_ATMCS;
 977                                 break;
 978 
 979                         case IBT_WRC_FADD:
 980                                 nopcode = HERMON_WQE_SEND_NOPCODE_ATMFA;
 981                                 break;
 982 
 983                         case IBT_WRC_BIND:
 984                                 nopcode = HERMON_WQE_SEND_NOPCODE_BIND;
 985                                 break;
 986                         }
 987 
 988                         fence = (curr_wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0;
 989 
 990                         /*
 991                          * now, build up the control segment, leaving the
 992                          * owner bit as it is
 993                          */
 994 
 995                         if ((qp->qp_sq_sigtype == HERMON_QP_SQ_ALL_SIGNALED) ||
 996                             (curr_wr->wr_flags & IBT_WR_SEND_SIGNAL)) {
 997                                 signaled_dbd = 0xC;
 998                         } else {
 999                                 signaled_dbd = 0;
1000                         }
1001                         if (curr_wr->wr_flags & IBT_WR_SEND_SOLICIT)
1002                                 solicited = 0x2;
1003                         else
1004                                 solicited = 0;
1005 
1006                         if (qp->qp_is_special) {
1007                                 /* Ensure correctness, set the ReRead bit */
1008                                 nopcode |= (1 << 6);
1009                                 ah = (hermon_ahhdl_t)
1010                                     curr_wr->wr.ud.udwr_dest->ud_ah;
1011                                 mutex_enter(&ah->ah_lock);
1012                                 maxstat = ah->ah_udav->max_stat_rate;
1013                                 HERMON_WQE_SET_MLX_CTRL_SEGMENT(desc, desc_sz,
1014                                     signaled_dbd, maxstat, ah->ah_udav->rlid,
1015                                     qp, ah->ah_udav->sl);
1016                                 mutex_exit(&ah->ah_lock);
1017                         } else {
1018                                 HERMON_WQE_SET_CTRL_SEGMENT(desc, desc_sz,
1019                                     fence, immed_data, solicited,
1020                                     signaled_dbd, 0, qp, 0, 0);
1021                         }
1022                         wq->wq_wrid[tail] = curr_wr->wr_id;
1023 
1024                         /*
1025                          * If this is not the first descriptor on the current
1026                          * chain, then set the ownership bit.
1027                          */
1028                         if (currindx != 0) {            /* not the first */
1029                                 membar_producer();
1030                                 HERMON_SET_SEND_WQE_OWNER(qp,
1031                                     (uint32_t *)desc, nopcode);
1032                         } else
1033                                 prev_nopcode = nopcode;
1034 
1035                         /*
1036                          * Update the current "tail index" and increment
1037                          * "posted_cnt"
1038                          */
1039                         tail = next_tail;
1040                         posted_cnt++;
1041                 }
1042 
1043                 /*
1044                  * If we reach here and there are one or more WQEs which have
1045                  * been successfully built as a chain, we have to finish up
1046                  * and prepare them for writing to the HW
1047                  * The steps are:
1048                  *      1. do the headroom fixup
1049                  *      2. add in the size of the headroom for the sync
1050                  *      3. write the owner bit for the first WQE
1051                  *      4. sync them
1052                  *      5. fix up the structures
1053                  *      6. hit the doorbell in UAR
1054                  */
1055                 if (posted_cnt != 0) {
1056                         ddi_acc_handle_t uarhdl = hermon_get_uarhdl(state);
1057 
1058                         /* do the invalidate of the headroom */
1059 
1060                         hermon_wqe_headroom(tail, qp);
1061 
1062                         /* Update some of the state in the QP */
1063                         wq->wq_tail = tail;
1064                         total_posted += posted_cnt;
1065                         posted_cnt = 0;
1066 
1067                         membar_producer();
1068 
1069                         /*
1070                          * Now set the ownership bit of the first
1071                          * one in the chain
1072                          */
1073                         HERMON_SET_SEND_WQE_OWNER(qp, (uint32_t *)prev,
1074                             prev_nopcode);
1075 
1076                         /* the FMA retry loop starts for Hermon doorbell. */
1077                         hermon_pio_start(state, uarhdl, pio_error, fm_loop_cnt,
1078                             fm_status, fm_test);
1079 
1080                         HERMON_UAR_DOORBELL(state, uarhdl,
1081                             (uint64_t *)(void *)&state->hs_uar->send,
1082                             (uint64_t)qp->qp_ring);
1083 
1084                         /* the FMA retry loop ends. */
1085                         hermon_pio_end(state, uarhdl, pio_error, fm_loop_cnt,
1086                             fm_status, fm_test);
1087                 }
1088         }
1089 
1090         /*
1091          * Update the "num_posted" return value (if necessary).
1092          * Then drop the locks and return success.
1093          */
1094         if (num_posted != NULL) {
1095                 *num_posted = total_posted;
1096         }
1097         mutex_exit(&qp->qp_sq_lock);
1098         return (status);
1099 
1100 pio_error:
1101         mutex_exit(&qp->qp_sq_lock);
1102         hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
1103         return (ibc_get_ci_failure(0));
1104 }
1105 
1106 
1107 /*
1108  * hermon_post_recv()
1109  *    Context: Can be called from interrupt or base context.
1110  */
1111 int
1112 hermon_post_recv(hermon_state_t *state, hermon_qphdl_t qp,
1113     ibt_recv_wr_t *wr, uint_t num_wr, uint_t *num_posted)
1114 {
1115         uint64_t                        *desc;
1116         hermon_workq_hdr_t              *wq;
1117         uint32_t                        head, tail, next_tail, qsize_msk;
1118         uint_t                          wrindx;
1119         uint_t                          posted_cnt;
1120         int                             status;
1121 
1122         /*
1123          * Check for user-mappable QP memory.  Note:  We do not allow kernel
1124          * clients to post to QP memory that is accessible directly by the
1125          * user.  If the QP memory is user accessible, then return an error.
1126          */
1127         if (qp->qp_alloc_flags & IBT_QP_USER_MAP) {
1128                 return (IBT_QP_HDL_INVALID);
1129         }
1130 
1131         /* Initialize posted_cnt */
1132         posted_cnt = 0;
1133 
1134         mutex_enter(&qp->qp_lock);
1135 
1136         /*
1137          * Check if QP is associated with an SRQ
1138          */
1139         if (qp->qp_alloc_flags & IBT_QP_USES_SRQ) {
1140                 mutex_exit(&qp->qp_lock);
1141                 return (IBT_SRQ_IN_USE);
1142         }
1143 
1144         /*
1145          * Check QP state.  Can not post Recv requests from the "Reset" state
1146          */
1147         if (qp->qp_state == HERMON_QP_RESET) {
1148                 mutex_exit(&qp->qp_lock);
1149                 return (IBT_QP_STATE_INVALID);
1150         }
1151 
1152         /* Check that work request transport type is valid */
1153         if ((qp->qp_type != IBT_UD_RQP) &&
1154             (qp->qp_serv_type != HERMON_QP_RC) &&
1155             (qp->qp_serv_type != HERMON_QP_UC)) {
1156                 mutex_exit(&qp->qp_lock);
1157                 return (IBT_QP_SRV_TYPE_INVALID);
1158         }
1159 
1160         /*
1161          * Grab the lock for the WRID list, i.e., membar_consumer().
1162          * This is not needed because the mutex_enter() above has
1163          * the same effect.
1164          */
1165 
1166         /* Save away some initial QP state */
1167         wq = qp->qp_rq_wqhdr;
1168         qsize_msk = wq->wq_mask;
1169         tail      = wq->wq_tail;
1170         head      = wq->wq_head;
1171 
1172         wrindx = 0;
1173         status    = DDI_SUCCESS;
1174 
1175         for (wrindx = 0; wrindx < num_wr; wrindx++) {
1176                 if (wq->wq_full != 0) {
1177                         status = IBT_QP_FULL;
1178                         break;
1179                 }
1180                 next_tail = (tail + 1) & qsize_msk;
1181                 if (next_tail == head) {
1182                         wq->wq_full = 1;
1183                 }
1184                 desc = HERMON_QP_RQ_ENTRY(qp, tail);
1185                 status = hermon_wqe_recv_build(state, qp, &wr[wrindx], desc);
1186                 if (status != DDI_SUCCESS) {
1187                         break;
1188                 }
1189 
1190                 wq->wq_wrid[tail] = wr[wrindx].wr_id;
1191                 qp->qp_rq_wqecntr++;
1192 
1193                 tail = next_tail;
1194                 posted_cnt++;
1195         }
1196 
1197         if (posted_cnt != 0) {
1198 
1199                 wq->wq_tail = tail;
1200 
1201                 membar_producer();      /* ensure wrids are visible */
1202 
1203                 /* Update the doorbell record w/ wqecntr */
1204                 HERMON_UAR_DB_RECORD_WRITE(qp->qp_rq_vdbr,
1205                     qp->qp_rq_wqecntr & 0xFFFF);
1206         }
1207 
1208         if (num_posted != NULL) {
1209                 *num_posted = posted_cnt;
1210         }
1211 
1212 
1213         mutex_exit(&qp->qp_lock);
1214         return (status);
1215 }
1216 
1217 /*
1218  * hermon_post_srq()
1219  *    Context: Can be called from interrupt or base context.
1220  */
1221 int
1222 hermon_post_srq(hermon_state_t *state, hermon_srqhdl_t srq,
1223     ibt_recv_wr_t *wr, uint_t num_wr, uint_t *num_posted)
1224 {
1225         uint64_t                        *desc;
1226         hermon_workq_hdr_t              *wq;
1227         uint_t                          indx, wrindx;
1228         uint_t                          posted_cnt;
1229         int                             status;
1230 
1231         mutex_enter(&srq->srq_lock);
1232 
1233         /*
1234          * Check for user-mappable QP memory.  Note:  We do not allow kernel
1235          * clients to post to QP memory that is accessible directly by the
1236          * user.  If the QP memory is user accessible, then return an error.
1237          */
1238         if (srq->srq_is_umap) {
1239                 mutex_exit(&srq->srq_lock);
1240                 return (IBT_SRQ_HDL_INVALID);
1241         }
1242 
1243         /*
1244          * Check SRQ state.  Can not post Recv requests when SRQ is in error
1245          */
1246         if (srq->srq_state == HERMON_SRQ_STATE_ERROR) {
1247                 mutex_exit(&srq->srq_lock);
1248                 return (IBT_QP_STATE_INVALID);
1249         }
1250 
1251         status = DDI_SUCCESS;
1252         posted_cnt = 0;
1253         wq = srq->srq_wq_wqhdr;
1254         indx = wq->wq_head;
1255 
1256         for (wrindx = 0; wrindx < num_wr; wrindx++) {
1257 
1258                 if (indx == wq->wq_tail) {
1259                         status = IBT_QP_FULL;
1260                         break;
1261                 }
1262                 desc = HERMON_SRQ_WQE_ADDR(srq, indx);
1263 
1264                 wq->wq_wrid[indx] = wr[wrindx].wr_id;
1265 
1266                 status = hermon_wqe_srq_build(state, srq, &wr[wrindx], desc);
1267                 if (status != DDI_SUCCESS) {
1268                         break;
1269                 }
1270 
1271                 posted_cnt++;
1272                 indx = htons(((uint16_t *)desc)[1]);
1273                 wq->wq_head = indx;
1274         }
1275 
1276         if (posted_cnt != 0) {
1277 
1278                 srq->srq_wq_wqecntr += posted_cnt;
1279 
1280                 membar_producer();      /* ensure wrids are visible */
1281 
1282                 /* Ring the doorbell w/ wqecntr */
1283                 HERMON_UAR_DB_RECORD_WRITE(srq->srq_wq_vdbr,
1284                     srq->srq_wq_wqecntr & 0xFFFF);
1285         }
1286 
1287         if (num_posted != NULL) {
1288                 *num_posted = posted_cnt;
1289         }
1290 
1291         mutex_exit(&srq->srq_lock);
1292         return (status);
1293 }
1294 
1295 
1296 /*
1297  * hermon_wqe_send_build()
1298  *    Context: Can be called from interrupt or base context.
1299  */
1300 static int
1301 hermon_wqe_send_build(hermon_state_t *state, hermon_qphdl_t qp,
1302     ibt_send_wr_t *wr, uint64_t *desc, uint_t *size)
1303 {
1304         hermon_hw_snd_wqe_ud_t          *ud;
1305         hermon_hw_snd_wqe_remaddr_t     *rc;
1306         hermon_hw_snd_wqe_atomic_t      *at;
1307         hermon_hw_snd_wqe_remaddr_t     *uc;
1308         hermon_hw_snd_wqe_bind_t        *bn;
1309         hermon_hw_wqe_sgl_t             *ds, *old_ds;
1310         ibt_ud_dest_t                   *dest;
1311         ibt_wr_ds_t                     *sgl;
1312         hermon_ahhdl_t                  ah;
1313         uint32_t                        nds;
1314         int                             i, j, last_ds, num_ds, status;
1315         int                             tmpsize;
1316 
1317         ASSERT(MUTEX_HELD(&qp->qp_sq_lock));
1318 
1319         /* Initialize the information for the Data Segments */
1320         ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)desc +
1321             sizeof (hermon_hw_snd_wqe_ctrl_t));
1322         nds = wr->wr_nds;
1323         sgl = wr->wr_sgl;
1324         num_ds = 0;
1325         i = 0;
1326 
1327         /*
1328          * Build a Send WQE depends first and foremost on the transport
1329          * type of Work Request (i.e. UD, RC, or UC)
1330          */
1331         switch (wr->wr_trans) {
1332         case IBT_UD_SRV:
1333                 /* Ensure that work request transport type matches QP type */
1334                 if (qp->qp_serv_type != HERMON_QP_UD) {
1335                         return (IBT_QP_SRV_TYPE_INVALID);
1336                 }
1337 
1338                 /*
1339                  * Validate the operation type.  For UD requests, only the
1340                  * "Send" and "Send LSO" operations are valid.
1341                  */
1342                 if (wr->wr_opcode != IBT_WRC_SEND &&
1343                     wr->wr_opcode != IBT_WRC_SEND_LSO) {
1344                         return (IBT_QP_OP_TYPE_INVALID);
1345                 }
1346 
1347                 /*
1348                  * If this is a Special QP (QP0 or QP1), then we need to
1349                  * build MLX WQEs instead.  So jump to hermon_wqe_mlx_build()
1350                  * and return whatever status it returns
1351                  */
1352                 if (qp->qp_is_special) {
1353                         if (wr->wr_opcode == IBT_WRC_SEND_LSO) {
1354                                 return (IBT_QP_OP_TYPE_INVALID);
1355                         }
1356                         status = hermon_wqe_mlx_build(state, qp,
1357                             wr, desc, size);
1358                         return (status);
1359                 }
1360 
1361                 /*
1362                  * Otherwise, if this is a normal UD Send request, then fill
1363                  * all the fields in the Hermon UD header for the WQE.  Note:
1364                  * to do this we'll need to extract some information from the
1365                  * Address Handle passed with the work request.
1366                  */
1367                 ud = (hermon_hw_snd_wqe_ud_t *)((uintptr_t)desc +
1368                     sizeof (hermon_hw_snd_wqe_ctrl_t));
1369                 if (wr->wr_opcode == IBT_WRC_SEND) {
1370                         dest = wr->wr.ud.udwr_dest;
1371                 } else {
1372                         dest = wr->wr.ud_lso.lso_ud_dest;
1373                 }
1374                 ah = (hermon_ahhdl_t)dest->ud_ah;
1375                 if (ah == NULL) {
1376                         return (IBT_AH_HDL_INVALID);
1377                 }
1378 
1379                 /*
1380                  * Build the Unreliable Datagram Segment for the WQE, using
1381                  * the information from the address handle and the work
1382                  * request.
1383                  */
1384                 /* mutex_enter(&ah->ah_lock); */
1385                 if (wr->wr_opcode == IBT_WRC_SEND) {
1386                         HERMON_WQE_BUILD_UD(qp, ud, ah, wr->wr.ud.udwr_dest);
1387                 } else {        /* IBT_WRC_SEND_LSO */
1388                         HERMON_WQE_BUILD_UD(qp, ud, ah,
1389                             wr->wr.ud_lso.lso_ud_dest);
1390                 }
1391                 /* mutex_exit(&ah->ah_lock); */
1392 
1393                 /* Update "ds" for filling in Data Segments (below) */
1394                 ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)ud +
1395                     sizeof (hermon_hw_snd_wqe_ud_t));
1396 
1397                 if (wr->wr_opcode == IBT_WRC_SEND_LSO) {
1398                         int total_len;
1399 
1400                         total_len = (4 + 0xf + wr->wr.ud_lso.lso_hdr_sz) & ~0xf;
1401                         if ((uintptr_t)ds + total_len + (nds * 16) >
1402                             (uintptr_t)desc + (1 << qp->qp_sq_log_wqesz))
1403                                 return (IBT_QP_SGL_LEN_INVALID);
1404 
1405                         bcopy(wr->wr.ud_lso.lso_hdr, (uint32_t *)ds + 1,
1406                             wr->wr.ud_lso.lso_hdr_sz);
1407                         old_ds = ds;
1408                         ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)ds + total_len);
1409                         for (; i < nds; i++) {
1410                                 if (sgl[i].ds_len == 0)
1411                                         continue;
1412                                 HERMON_WQE_BUILD_DATA_SEG_SEND(&ds[num_ds],
1413                                     &sgl[i]);
1414                                 num_ds++;
1415                                 i++;
1416                                 break;
1417                         }
1418                         membar_producer();
1419                         HERMON_WQE_BUILD_LSO(qp, old_ds, wr->wr.ud_lso.lso_mss,
1420                             wr->wr.ud_lso.lso_hdr_sz);
1421                 }
1422 
1423                 break;
1424 
1425         case IBT_RC_SRV:
1426                 /* Ensure that work request transport type matches QP type */
1427                 if (qp->qp_serv_type != HERMON_QP_RC) {
1428                         return (IBT_QP_SRV_TYPE_INVALID);
1429                 }
1430 
1431                 /*
1432                  * Validate the operation type.  For RC requests, we allow
1433                  * "Send", "RDMA Read", "RDMA Write", various "Atomic"
1434                  * operations, and memory window "Bind"
1435                  */
1436                 if ((wr->wr_opcode != IBT_WRC_SEND) &&
1437                     (wr->wr_opcode != IBT_WRC_RDMAR) &&
1438                     (wr->wr_opcode != IBT_WRC_RDMAW) &&
1439                     (wr->wr_opcode != IBT_WRC_CSWAP) &&
1440                     (wr->wr_opcode != IBT_WRC_FADD) &&
1441                     (wr->wr_opcode != IBT_WRC_BIND)) {
1442                         return (IBT_QP_OP_TYPE_INVALID);
1443                 }
1444 
1445                 /*
1446                  * If this is a Send request, then all we need to do is break
1447                  * out and here and begin the Data Segment processing below
1448                  */
1449                 if (wr->wr_opcode == IBT_WRC_SEND) {
1450                         break;
1451                 }
1452 
1453                 /*
1454                  * If this is an RDMA Read or RDMA Write request, then fill
1455                  * in the "Remote Address" header fields.
1456                  */
1457                 if ((wr->wr_opcode == IBT_WRC_RDMAR) ||
1458                     (wr->wr_opcode == IBT_WRC_RDMAW)) {
1459                         rc = (hermon_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1460                             sizeof (hermon_hw_snd_wqe_ctrl_t));
1461 
1462                         /*
1463                          * Build the Remote Address Segment for the WQE, using
1464                          * the information from the RC work request.
1465                          */
1466                         HERMON_WQE_BUILD_REMADDR(qp, rc, &wr->wr.rc.rcwr.rdma);
1467 
1468                         /* Update "ds" for filling in Data Segments (below) */
1469                         ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)rc +
1470                             sizeof (hermon_hw_snd_wqe_remaddr_t));
1471                         break;
1472                 }
1473 
1474                 /*
1475                  * If this is one of the Atomic type operations (i.e
1476                  * Compare-Swap or Fetch-Add), then fill in both the "Remote
1477                  * Address" header fields and the "Atomic" header fields.
1478                  */
1479                 if ((wr->wr_opcode == IBT_WRC_CSWAP) ||
1480                     (wr->wr_opcode == IBT_WRC_FADD)) {
1481                         rc = (hermon_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1482                             sizeof (hermon_hw_snd_wqe_ctrl_t));
1483                         at = (hermon_hw_snd_wqe_atomic_t *)((uintptr_t)rc +
1484                             sizeof (hermon_hw_snd_wqe_remaddr_t));
1485 
1486                         /*
1487                          * Build the Remote Address and Atomic Segments for
1488                          * the WQE, using the information from the RC Atomic
1489                          * work request.
1490                          */
1491                         HERMON_WQE_BUILD_RC_ATOMIC_REMADDR(qp, rc, wr);
1492                         HERMON_WQE_BUILD_ATOMIC(qp, at, wr->wr.rc.rcwr.atomic);
1493 
1494                         /* Update "ds" for filling in Data Segments (below) */
1495                         ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)at +
1496                             sizeof (hermon_hw_snd_wqe_atomic_t));
1497 
1498                         /*
1499                          * Update "nds" and "sgl" because Atomic requests have
1500                          * only a single Data Segment (and they are encoded
1501                          * somewhat differently in the work request.
1502                          */
1503                         nds = 1;
1504                         sgl = wr->wr_sgl;
1505                         break;
1506                 }
1507 
1508                 /*
1509                  * If this is memory window Bind operation, then we call the
1510                  * hermon_wr_bind_check() routine to validate the request and
1511                  * to generate the updated RKey.  If this is successful, then
1512                  * we fill in the WQE's "Bind" header fields.
1513                  */
1514                 if (wr->wr_opcode == IBT_WRC_BIND) {
1515                         status = hermon_wr_bind_check(state, wr);
1516                         if (status != DDI_SUCCESS) {
1517                                 return (status);
1518                         }
1519 
1520                         bn = (hermon_hw_snd_wqe_bind_t *)((uintptr_t)desc +
1521                             sizeof (hermon_hw_snd_wqe_ctrl_t));
1522 
1523                         /*
1524                          * Build the Bind Memory Window Segments for the WQE,
1525                          * using the information from the RC Bind memory
1526                          * window work request.
1527                          */
1528                         HERMON_WQE_BUILD_BIND(qp, bn, wr->wr.rc.rcwr.bind);
1529 
1530                         /*
1531                          * Update the "ds" pointer.  Even though the "bind"
1532                          * operation requires no SGLs, this is necessary to
1533                          * facilitate the correct descriptor size calculations
1534                          * (below).
1535                          */
1536                         ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)bn +
1537                             sizeof (hermon_hw_snd_wqe_bind_t));
1538                         nds = 0;
1539                 }
1540                 break;
1541 
1542         case IBT_UC_SRV:
1543                 /* Ensure that work request transport type matches QP type */
1544                 if (qp->qp_serv_type != HERMON_QP_UC) {
1545                         return (IBT_QP_SRV_TYPE_INVALID);
1546                 }
1547 
1548                 /*
1549                  * Validate the operation type.  For UC requests, we only
1550                  * allow "Send", "RDMA Write", and memory window "Bind".
1551                  * Note: Unlike RC, UC does not allow "RDMA Read" or "Atomic"
1552                  * operations
1553                  */
1554                 if ((wr->wr_opcode != IBT_WRC_SEND) &&
1555                     (wr->wr_opcode != IBT_WRC_RDMAW) &&
1556                     (wr->wr_opcode != IBT_WRC_BIND)) {
1557                         return (IBT_QP_OP_TYPE_INVALID);
1558                 }
1559 
1560                 /*
1561                  * If this is a Send request, then all we need to do is break
1562                  * out and here and begin the Data Segment processing below
1563                  */
1564                 if (wr->wr_opcode == IBT_WRC_SEND) {
1565                         break;
1566                 }
1567 
1568                 /*
1569                  * If this is an RDMA Write request, then fill in the "Remote
1570                  * Address" header fields.
1571                  */
1572                 if (wr->wr_opcode == IBT_WRC_RDMAW) {
1573                         uc = (hermon_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1574                             sizeof (hermon_hw_snd_wqe_ctrl_t));
1575 
1576                         /*
1577                          * Build the Remote Address Segment for the WQE, using
1578                          * the information from the UC work request.
1579                          */
1580                         HERMON_WQE_BUILD_REMADDR(qp, uc, &wr->wr.uc.ucwr.rdma);
1581 
1582                         /* Update "ds" for filling in Data Segments (below) */
1583                         ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)uc +
1584                             sizeof (hermon_hw_snd_wqe_remaddr_t));
1585                         break;
1586                 }
1587 
1588                 /*
1589                  * If this is memory window Bind operation, then we call the
1590                  * hermon_wr_bind_check() routine to validate the request and
1591                  * to generate the updated RKey.  If this is successful, then
1592                  * we fill in the WQE's "Bind" header fields.
1593                  */
1594                 if (wr->wr_opcode == IBT_WRC_BIND) {
1595                         status = hermon_wr_bind_check(state, wr);
1596                         if (status != DDI_SUCCESS) {
1597                                 return (status);
1598                         }
1599 
1600                         bn = (hermon_hw_snd_wqe_bind_t *)((uintptr_t)desc +
1601                             sizeof (hermon_hw_snd_wqe_ctrl_t));
1602 
1603                         /*
1604                          * Build the Bind Memory Window Segments for the WQE,
1605                          * using the information from the UC Bind memory
1606                          * window work request.
1607                          */
1608                         HERMON_WQE_BUILD_BIND(qp, bn, wr->wr.uc.ucwr.bind);
1609 
1610                         /*
1611                          * Update the "ds" pointer.  Even though the "bind"
1612                          * operation requires no SGLs, this is necessary to
1613                          * facilitate the correct descriptor size calculations
1614                          * (below).
1615                          */
1616                         ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)bn +
1617                             sizeof (hermon_hw_snd_wqe_bind_t));
1618                         nds = 0;
1619                 }
1620                 break;
1621 
1622         default:
1623                 return (IBT_QP_SRV_TYPE_INVALID);
1624         }
1625 
1626         /*
1627          * Now fill in the Data Segments (SGL) for the Send WQE based on
1628          * the values setup above (i.e. "sgl", "nds", and the "ds" pointer
1629          * Start by checking for a valid number of SGL entries
1630          */
1631         if (nds > qp->qp_sq_sgl) {
1632                 return (IBT_QP_SGL_LEN_INVALID);
1633         }
1634 
1635         /*
1636          * For each SGL in the Send Work Request, fill in the Send WQE's data
1637          * segments.  Note: We skip any SGL with zero size because Hermon
1638          * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1639          * the encoding for zero means a 2GB transfer.
1640          */
1641         for (last_ds = num_ds, j = i; j < nds; j++) {
1642                 if (sgl[j].ds_len != 0)
1643                         last_ds++;      /* real last ds of wqe to fill */
1644         }
1645 
1646         /*
1647          * Return the size of descriptor (in 16-byte chunks)
1648          * For Hermon, we want them (for now) to be on stride size
1649          * boundaries, which was implicit in Tavor/Arbel
1650          *
1651          */
1652         tmpsize = ((uintptr_t)&ds[last_ds] - (uintptr_t)desc);
1653 
1654         *size = tmpsize >> 0x4;
1655 
1656         for (j = nds; --j >= i; ) {
1657                 if (sgl[j].ds_len == 0) {
1658                         continue;
1659                 }
1660 
1661                 /*
1662                  * Fill in the Data Segment(s) for the current WQE, using the
1663                  * information contained in the scatter-gather list of the
1664                  * work request.
1665                  */
1666                 last_ds--;
1667                 HERMON_WQE_BUILD_DATA_SEG_SEND(&ds[last_ds], &sgl[j]);
1668         }
1669 
1670         return (DDI_SUCCESS);
1671 }
1672 
1673 
1674 
1675 /*
1676  * hermon_wqe_mlx_build()
1677  *    Context: Can be called from interrupt or base context.
1678  */
1679 static int
1680 hermon_wqe_mlx_build(hermon_state_t *state, hermon_qphdl_t qp,
1681     ibt_send_wr_t *wr, uint64_t *desc, uint_t *size)
1682 {
1683         hermon_ahhdl_t          ah;
1684         hermon_hw_udav_t        *udav;
1685         ib_lrh_hdr_t            *lrh;
1686         ib_grh_t                *grh;
1687         ib_bth_hdr_t            *bth;
1688         ib_deth_hdr_t           *deth;
1689         hermon_hw_wqe_sgl_t     *ds;
1690         ibt_wr_ds_t             *sgl;
1691         uint8_t                 *mgmtclass, *hpoint, *hcount;
1692         uint32_t                nds, offset, pktlen;
1693         uint32_t                desc_sz;
1694         int                     i, num_ds;
1695         int                     tmpsize;
1696 
1697         ASSERT(MUTEX_HELD(&qp->qp_sq_lock));
1698 
1699         /* Initialize the information for the Data Segments */
1700         ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)desc +
1701             sizeof (hermon_hw_mlx_wqe_nextctrl_t));
1702 
1703         /*
1704          * Pull the address handle from the work request. The UDAV will
1705          * be used to answer some questions about the request.
1706          */
1707         ah = (hermon_ahhdl_t)wr->wr.ud.udwr_dest->ud_ah;
1708         if (ah == NULL) {
1709                 return (IBT_AH_HDL_INVALID);
1710         }
1711         mutex_enter(&ah->ah_lock);
1712         udav = ah->ah_udav;
1713 
1714         /*
1715          * If the request is for QP1 and the destination LID is equal to
1716          * the Permissive LID, then return an error.  This combination is
1717          * not allowed
1718          */
1719         if ((udav->rlid == IB_LID_PERMISSIVE) &&
1720             (qp->qp_is_special == HERMON_QP_GSI)) {
1721                 mutex_exit(&ah->ah_lock);
1722                 return (IBT_AH_HDL_INVALID);
1723         }
1724 
1725         /*
1726          * Calculate the size of the packet headers, including the GRH
1727          * (if necessary)
1728          */
1729         desc_sz = sizeof (ib_lrh_hdr_t) + sizeof (ib_bth_hdr_t) +
1730             sizeof (ib_deth_hdr_t);
1731         if (udav->grh) {
1732                 desc_sz += sizeof (ib_grh_t);
1733         }
1734 
1735         /*
1736          * Begin to build the first "inline" data segment for the packet
1737          * headers.  Note:  By specifying "inline" we can build the contents
1738          * of the MAD packet headers directly into the work queue (as part
1739          * descriptor).  This has the advantage of both speeding things up
1740          * and of not requiring the driver to allocate/register any additional
1741          * memory for the packet headers.
1742          */
1743         HERMON_WQE_BUILD_INLINE(qp, &ds[0], desc_sz);
1744         desc_sz += 4;
1745 
1746         /*
1747          * Build Local Route Header (LRH)
1748          *    We start here by building the LRH into a temporary location.
1749          *    When we have finished we copy the LRH data into the descriptor.
1750          *
1751          *    Notice that the VL values are hardcoded.  This is not a problem
1752          *    because VL15 is decided later based on the value in the MLX
1753          *    transport "next/ctrl" header (see the "vl15" bit below), and it
1754          *    is otherwise (meaning for QP1) chosen from the SL-to-VL table
1755          *    values.  This rule does not hold for loopback packets however
1756          *    (all of which bypass the SL-to-VL tables) and it is the reason
1757          *    that non-QP0 MADs are setup with VL hardcoded to zero below.
1758          *
1759          *    Notice also that Source LID is hardcoded to the Permissive LID
1760          *    (0xFFFF).  This is also not a problem because if the Destination
1761          *    LID is not the Permissive LID, then the "slr" value in the MLX
1762          *    transport "next/ctrl" header will be set to zero and the hardware
1763          *    will pull the LID from value in the port.
1764          */
1765         lrh = (ib_lrh_hdr_t *)((uintptr_t)&ds[0] + 4);
1766         pktlen = (desc_sz + 0x100) >> 2;
1767         HERMON_WQE_BUILD_MLX_LRH(lrh, qp, udav, pktlen);
1768 
1769         /*
1770          * Build Global Route Header (GRH)
1771          *    This is only built if necessary as defined by the "grh" bit in
1772          *    the address vector.  Note:  We also calculate the offset to the
1773          *    next header (BTH) based on whether or not the "grh" bit is set.
1774          */
1775         if (udav->grh) {
1776                 /*
1777                  * If the request is for QP0, then return an error.  The
1778                  * combination of global routine (GRH) and QP0 is not allowed.
1779                  */
1780                 if (qp->qp_is_special == HERMON_QP_SMI) {
1781                         mutex_exit(&ah->ah_lock);
1782                         return (IBT_AH_HDL_INVALID);
1783                 }
1784                 grh = (ib_grh_t *)((uintptr_t)lrh + sizeof (ib_lrh_hdr_t));
1785                 HERMON_WQE_BUILD_MLX_GRH(state, grh, qp, udav, pktlen);
1786 
1787                 bth = (ib_bth_hdr_t *)((uintptr_t)grh + sizeof (ib_grh_t));
1788         } else {
1789                 bth = (ib_bth_hdr_t *)((uintptr_t)lrh + sizeof (ib_lrh_hdr_t));
1790         }
1791         mutex_exit(&ah->ah_lock);
1792 
1793 
1794         /*
1795          * Build Base Transport Header (BTH)
1796          *    Notice that the M, PadCnt, and TVer fields are all set
1797          *    to zero implicitly.  This is true for all Management Datagrams
1798          *    MADs whether GSI are SMI.
1799          */
1800         HERMON_WQE_BUILD_MLX_BTH(state, bth, qp, wr);
1801 
1802         /*
1803          * Build Datagram Extended Transport Header (DETH)
1804          */
1805         deth = (ib_deth_hdr_t *)((uintptr_t)bth + sizeof (ib_bth_hdr_t));
1806         HERMON_WQE_BUILD_MLX_DETH(deth, qp);
1807 
1808         /* Ensure that the Data Segment is aligned on a 16-byte boundary */
1809         ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)deth + sizeof (ib_deth_hdr_t));
1810         ds = (hermon_hw_wqe_sgl_t *)(((uintptr_t)ds + 0xF) & ~0xF);
1811         nds = wr->wr_nds;
1812         sgl = wr->wr_sgl;
1813         num_ds = 0;
1814 
1815         /*
1816          * Now fill in the Data Segments (SGL) for the MLX WQE based on the
1817          * values set up above (i.e. "sgl", "nds", and the "ds" pointer
1818          * Start by checking for a valid number of SGL entries
1819          */
1820         if (nds > qp->qp_sq_sgl) {
1821                 return (IBT_QP_SGL_LEN_INVALID);
1822         }
1823 
1824         /*
1825          * For each SGL in the Send Work Request, fill in the MLX WQE's data
1826          * segments.  Note: We skip any SGL with zero size because Hermon
1827          * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1828          * the encoding for zero means a 2GB transfer.  Because of this special
1829          * encoding in the hardware, we mask the requested length with
1830          * HERMON_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1831          * zero.)
1832          */
1833         mgmtclass = hpoint = hcount = NULL;
1834         offset = 0;
1835         for (i = 0; i < nds; i++) {
1836                 if (sgl[i].ds_len == 0) {
1837                         continue;
1838                 }
1839 
1840                 /*
1841                  * Fill in the Data Segment(s) for the MLX send WQE, using
1842                  * the information contained in the scatter-gather list of
1843                  * the work request.
1844                  */
1845                 HERMON_WQE_BUILD_DATA_SEG_SEND(&ds[num_ds], &sgl[i]);
1846 
1847                 /*
1848                  * Search through the contents of all MADs posted to QP0 to
1849                  * initialize pointers to the places where Directed Route "hop
1850                  * pointer", "hop count", and "mgmtclass" would be.  Hermon
1851                  * needs these updated (i.e. incremented or decremented, as
1852                  * necessary) by software.
1853                  */
1854                 if (qp->qp_is_special == HERMON_QP_SMI) {
1855 
1856                         HERMON_SPECIAL_QP_DRMAD_GET_MGMTCLASS(mgmtclass,
1857                             offset, sgl[i].ds_va, sgl[i].ds_len);
1858 
1859                         HERMON_SPECIAL_QP_DRMAD_GET_HOPPOINTER(hpoint,
1860                             offset, sgl[i].ds_va, sgl[i].ds_len);
1861 
1862                         HERMON_SPECIAL_QP_DRMAD_GET_HOPCOUNT(hcount,
1863                             offset, sgl[i].ds_va, sgl[i].ds_len);
1864 
1865                         offset += sgl[i].ds_len;
1866                 }
1867                 num_ds++;
1868         }
1869 
1870         /*
1871          * Hermon's Directed Route MADs need to have the "hop pointer"
1872          * incremented/decremented (as necessary) depending on whether it is
1873          * currently less than or greater than the "hop count" (i.e. whether
1874          * the MAD is a request or a response.)
1875          */
1876         if (qp->qp_is_special == HERMON_QP_SMI) {
1877                 HERMON_SPECIAL_QP_DRMAD_DO_HOPPOINTER_MODIFY(*mgmtclass,
1878                     *hpoint, *hcount);
1879         }
1880 
1881         /*
1882          * Now fill in the ICRC Data Segment.  This data segment is inlined
1883          * just like the packets headers above, but it is only four bytes and
1884          * set to zero (to indicate that we wish the hardware to generate ICRC.
1885          */
1886         HERMON_WQE_BUILD_INLINE_ICRC(qp, &ds[num_ds], 4, 0);
1887         num_ds++;
1888 
1889         /*
1890          * Return the size of descriptor (in 16-byte chunks)
1891          * For Hermon, we want them (for now) to be on stride size
1892          * boundaries, which was implicit in Tavor/Arbel
1893          */
1894         tmpsize = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc);
1895 
1896         *size = tmpsize >> 0x04;
1897 
1898         return (DDI_SUCCESS);
1899 }
1900 
1901 
1902 
1903 /*
1904  * hermon_wqe_recv_build()
1905  *    Context: Can be called from interrupt or base context.
1906  */
1907 /* ARGSUSED */
1908 static int
1909 hermon_wqe_recv_build(hermon_state_t *state, hermon_qphdl_t qp,
1910     ibt_recv_wr_t *wr, uint64_t *desc)
1911 {
1912         hermon_hw_wqe_sgl_t     *ds;
1913         int                     i, num_ds;
1914 
1915         ASSERT(MUTEX_HELD(&qp->qp_lock));
1916 
1917         /*
1918          * Fill in the Data Segments (SGL) for the Recv WQE  - don't
1919          * need to have a reserved for the ctrl, there is none on the
1920          * recv queue for hermon, but will need to put an invalid
1921          * (null) scatter pointer per PRM
1922          */
1923         ds = (hermon_hw_wqe_sgl_t *)(uintptr_t)desc;
1924         num_ds = 0;
1925 
1926         /* Check for valid number of SGL entries */
1927         if (wr->wr_nds > qp->qp_rq_sgl) {
1928                 return (IBT_QP_SGL_LEN_INVALID);
1929         }
1930 
1931         /*
1932          * For each SGL in the Recv Work Request, fill in the Recv WQE's data
1933          * segments.  Note: We skip any SGL with zero size because Hermon
1934          * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1935          * the encoding for zero means a 2GB transfer.  Because of this special
1936          * encoding in the hardware, we mask the requested length with
1937          * HERMON_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1938          * zero.)
1939          */
1940         for (i = 0; i < wr->wr_nds; i++) {
1941                 if (wr->wr_sgl[i].ds_len == 0) {
1942                         continue;
1943                 }
1944 
1945                 /*
1946                  * Fill in the Data Segment(s) for the receive WQE, using the
1947                  * information contained in the scatter-gather list of the
1948                  * work request.
1949                  */
1950                 HERMON_WQE_BUILD_DATA_SEG_RECV(&ds[num_ds], &wr->wr_sgl[i]);
1951                 num_ds++;
1952         }
1953 
1954         /* put the null sgl pointer as well if needed */
1955         if (num_ds < qp->qp_rq_sgl) {
1956                 HERMON_WQE_BUILD_DATA_SEG_RECV(&ds[num_ds], &null_sgl);
1957         }
1958 
1959         return (DDI_SUCCESS);
1960 }
1961 
1962 
1963 
1964 /*
1965  * hermon_wqe_srq_build()
1966  *    Context: Can be called from interrupt or base context.
1967  */
1968 /* ARGSUSED */
1969 static int
1970 hermon_wqe_srq_build(hermon_state_t *state, hermon_srqhdl_t srq,
1971     ibt_recv_wr_t *wr, uint64_t *desc)
1972 {
1973         hermon_hw_wqe_sgl_t     *ds;
1974         int                     i, num_ds;
1975 
1976         ASSERT(MUTEX_HELD(&srq->srq_lock));
1977 
1978         /* Fill in the Data Segments (SGL) for the Recv WQE */
1979         ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)desc +
1980             sizeof (hermon_hw_srq_wqe_next_t));
1981         num_ds = 0;
1982 
1983         /* Check for valid number of SGL entries */
1984         if (wr->wr_nds > srq->srq_wq_sgl) {
1985                 return (IBT_QP_SGL_LEN_INVALID);
1986         }
1987 
1988         /*
1989          * For each SGL in the Recv Work Request, fill in the Recv WQE's data
1990          * segments.  Note: We skip any SGL with zero size because Hermon
1991          * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1992          * the encoding for zero means a 2GB transfer.  Because of this special
1993          * encoding in the hardware, we mask the requested length with
1994          * HERMON_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1995          * zero.)
1996          */
1997         for (i = 0; i < wr->wr_nds; i++) {
1998                 if (wr->wr_sgl[i].ds_len == 0) {
1999                         continue;
2000                 }
2001 
2002                 /*
2003                  * Fill in the Data Segment(s) for the receive WQE, using the
2004                  * information contained in the scatter-gather list of the
2005                  * work request.
2006                  */
2007                 HERMON_WQE_BUILD_DATA_SEG_RECV(&ds[num_ds], &wr->wr_sgl[i]);
2008                 num_ds++;
2009         }
2010 
2011         /*
2012          * put in the null sgl pointer as well, if needed
2013          */
2014         if (num_ds < srq->srq_wq_sgl) {
2015                 HERMON_WQE_BUILD_DATA_SEG_RECV(&ds[num_ds], &null_sgl);
2016         }
2017 
2018         return (DDI_SUCCESS);
2019 }
2020 
2021 
2022 /*
2023  * hermon_wr_get_immediate()
2024  *    Context: Can be called from interrupt or base context.
2025  */
2026 static uint32_t
2027 hermon_wr_get_immediate(ibt_send_wr_t *wr)
2028 {
2029         /*
2030          * This routine extracts the "immediate data" from the appropriate
2031          * location in the IBTF work request.  Because of the way the
2032          * work request structure is defined, the location for this data
2033          * depends on the actual work request operation type.
2034          */
2035 
2036         /* For RDMA Write, test if RC or UC */
2037         if (wr->wr_opcode == IBT_WRC_RDMAW) {
2038                 if (wr->wr_trans == IBT_RC_SRV) {
2039                         return (wr->wr.rc.rcwr.rdma.rdma_immed);
2040                 } else {  /* IBT_UC_SRV */
2041                         return (wr->wr.uc.ucwr.rdma.rdma_immed);
2042                 }
2043         }
2044 
2045         /* For Send, test if RC, UD, or UC */
2046         if (wr->wr_opcode == IBT_WRC_SEND) {
2047                 if (wr->wr_trans == IBT_RC_SRV) {
2048                         return (wr->wr.rc.rcwr.send_immed);
2049                 } else if (wr->wr_trans == IBT_UD_SRV) {
2050                         return (wr->wr.ud.udwr_immed);
2051                 } else {  /* IBT_UC_SRV */
2052                         return (wr->wr.uc.ucwr.send_immed);
2053                 }
2054         }
2055 
2056         /*
2057          * If any other type of request, then immediate is undefined
2058          */
2059         return (0);
2060 }
2061 
2062 /*
2063  * hermon_wqe_headroom()
2064  *      Context: can be called from interrupt or base, currently only from
2065  *      base context.
2066  * Routine that fills in the headroom for the Send Queue
2067  */
2068 
2069 static void
2070 hermon_wqe_headroom(uint_t from, hermon_qphdl_t qp)
2071 {
2072         uint32_t        *wqe_start, *wqe_top, *wqe_base, qsize;
2073         int             hdrmwqes, wqesizebytes, sectperwqe;
2074         uint32_t        invalue;
2075         int             i, j;
2076 
2077         qsize    = qp->qp_sq_bufsz;
2078         wqesizebytes = 1 << qp->qp_sq_log_wqesz;
2079         sectperwqe = wqesizebytes >> 6;   /* 64 bytes/section */
2080         hdrmwqes = qp->qp_sq_hdrmwqes;
2081         wqe_base  = (uint32_t *)HERMON_QP_SQ_ENTRY(qp, 0);
2082         wqe_top   = (uint32_t *)HERMON_QP_SQ_ENTRY(qp, qsize);
2083         wqe_start = (uint32_t *)HERMON_QP_SQ_ENTRY(qp, from);
2084 
2085         for (i = 0; i < hdrmwqes; i++)       {
2086                 for (j = 0; j < sectperwqe; j++) {
2087                         if (j == 0) {           /* 1st section of wqe */
2088                                 /* perserve ownership bit */
2089                                 invalue = ddi_get32(qp->qp_wqinfo.qa_acchdl,
2090                                     wqe_start) | 0x7FFFFFFF;
2091                         } else {
2092                                 /* or just invalidate it */
2093                                 invalue = 0xFFFFFFFF;
2094                         }
2095                         ddi_put32(qp->qp_wqinfo.qa_acchdl, wqe_start, invalue);
2096                         wqe_start += 16;        /* move 64 bytes */
2097                 }
2098                 if (wqe_start == wqe_top)       /* hit the end of the queue */
2099                         wqe_start = wqe_base;   /* wrap to start */
2100         }
2101 }
2102 
2103 /*
2104  * hermon_wr_bind_check()
2105  *    Context: Can be called from interrupt or base context.
2106  */
2107 /* ARGSUSED */
2108 static int
2109 hermon_wr_bind_check(hermon_state_t *state, ibt_send_wr_t *wr)
2110 {
2111         ibt_bind_flags_t        bind_flags;
2112         uint64_t                vaddr, len;
2113         uint64_t                reg_start_addr, reg_end_addr;
2114         hermon_mwhdl_t          mw;
2115         hermon_mrhdl_t          mr;
2116         hermon_rsrc_t           *mpt;
2117         uint32_t                new_rkey;
2118 
2119         /* Check for a valid Memory Window handle in the WR */
2120         mw = (hermon_mwhdl_t)wr->wr.rc.rcwr.bind->bind_ibt_mw_hdl;
2121         if (mw == NULL) {
2122                 return (IBT_MW_HDL_INVALID);
2123         }
2124 
2125         /* Check for a valid Memory Region handle in the WR */
2126         mr = (hermon_mrhdl_t)wr->wr.rc.rcwr.bind->bind_ibt_mr_hdl;
2127         if (mr == NULL) {
2128                 return (IBT_MR_HDL_INVALID);
2129         }
2130 
2131         mutex_enter(&mr->mr_lock);
2132         mutex_enter(&mw->mr_lock);
2133 
2134         /*
2135          * Check here to see if the memory region has already been partially
2136          * deregistered as a result of a hermon_umap_umemlock_cb() callback.
2137          * If so, this is an error, return failure.
2138          */
2139         if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
2140                 mutex_exit(&mr->mr_lock);
2141                 mutex_exit(&mw->mr_lock);
2142                 return (IBT_MR_HDL_INVALID);
2143         }
2144 
2145         /* Check for a valid Memory Window RKey (i.e. a matching RKey) */
2146         if (mw->mr_rkey != wr->wr.rc.rcwr.bind->bind_rkey) {
2147                 mutex_exit(&mr->mr_lock);
2148                 mutex_exit(&mw->mr_lock);
2149                 return (IBT_MR_RKEY_INVALID);
2150         }
2151 
2152         /* Check for a valid Memory Region LKey (i.e. a matching LKey) */
2153         if (mr->mr_lkey != wr->wr.rc.rcwr.bind->bind_lkey) {
2154                 mutex_exit(&mr->mr_lock);
2155                 mutex_exit(&mw->mr_lock);
2156                 return (IBT_MR_LKEY_INVALID);
2157         }
2158 
2159         /*
2160          * Now check for valid "vaddr" and "len".  Note:  We don't check the
2161          * "vaddr" range when "len == 0" (i.e. on unbind operations)
2162          */
2163         len = wr->wr.rc.rcwr.bind->bind_len;
2164         if (len != 0) {
2165                 vaddr = wr->wr.rc.rcwr.bind->bind_va;
2166                 reg_start_addr = mr->mr_bindinfo.bi_addr;
2167                 reg_end_addr   = mr->mr_bindinfo.bi_addr +
2168                     (mr->mr_bindinfo.bi_len - 1);
2169                 if ((vaddr < reg_start_addr) || (vaddr > reg_end_addr)) {
2170                         mutex_exit(&mr->mr_lock);
2171                         mutex_exit(&mw->mr_lock);
2172                         return (IBT_MR_VA_INVALID);
2173                 }
2174                 vaddr = (vaddr + len) - 1;
2175                 if (vaddr > reg_end_addr) {
2176                         mutex_exit(&mr->mr_lock);
2177                         mutex_exit(&mw->mr_lock);
2178                         return (IBT_MR_LEN_INVALID);
2179                 }
2180         }
2181 
2182         /*
2183          * Validate the bind access flags.  Remote Write and Atomic access for
2184          * the Memory Window require that Local Write access be set in the
2185          * corresponding Memory Region.
2186          */
2187         bind_flags = wr->wr.rc.rcwr.bind->bind_flags;
2188         if (((bind_flags & IBT_WR_BIND_WRITE) ||
2189             (bind_flags & IBT_WR_BIND_ATOMIC)) &&
2190             !(mr->mr_accflag & IBT_MR_LOCAL_WRITE)) {
2191                 mutex_exit(&mr->mr_lock);
2192                 mutex_exit(&mw->mr_lock);
2193                 return (IBT_MR_ACCESS_REQ_INVALID);
2194         }
2195 
2196         /* Calculate the new RKey for the Memory Window */
2197         mpt = mw->mr_mptrsrcp;
2198         new_rkey = hermon_mr_keycalc(mpt->hr_indx);
2199         new_rkey = hermon_mr_key_swap(new_rkey);
2200 
2201         wr->wr.rc.rcwr.bind->bind_rkey_out = new_rkey;
2202         mw->mr_rkey = new_rkey;
2203 
2204         mutex_exit(&mr->mr_lock);
2205         mutex_exit(&mw->mr_lock);
2206         return (DDI_SUCCESS);
2207 }
2208 
2209 
2210 /*
2211  * hermon_wrid_from_reset_handling()
2212  *    Context: Can be called from interrupt or base context.
2213  */
2214 /* ARGSUSED */
int
hermon_wrid_from_reset_handling(hermon_state_t *state, hermon_qphdl_t qp)
{
	hermon_workq_hdr_t	*swq, *rwq;

	/*
	 * Userland-mapped QPs track their WRIDs in userland; the kernel
	 * keeps no WRID state for them, so there is nothing to set up.
	 */
	if (qp->qp_alloc_flags & IBT_QP_USER_MAP)
		return (DDI_SUCCESS);

#ifdef __lock_lint
	mutex_enter(&qp->qp_rq_cqhdl->cq_lock);
	mutex_enter(&qp->qp_sq_cqhdl->cq_lock);
#else
	/*
	 * grab the cq lock(s) to modify the wqavl tree
	 *
	 * Lock order is RQ CQ first, then SQ CQ; the second lock is taken
	 * only when the two CQ handles are distinct (and non-NULL), since
	 * send and receive queues may share a single CQ.
	 */
	if (qp->qp_rq_cqhdl)
		mutex_enter(&qp->qp_rq_cqhdl->cq_lock);
	if (qp->qp_rq_cqhdl != qp->qp_sq_cqhdl &&
	    qp->qp_sq_cqhdl != NULL)
		mutex_enter(&qp->qp_sq_cqhdl->cq_lock);
#endif

	/* Chain the newly allocated work queue header to the CQ's list */
	if (qp->qp_sq_cqhdl)
		hermon_cq_workq_add(qp->qp_sq_cqhdl, &qp->qp_sq_wqavl);

	/* Reset the send work queue indices to an empty state */
	swq = qp->qp_sq_wqhdr;
	swq->wq_head = 0;
	swq->wq_tail = 0;
	swq->wq_full = 0;

	/*
	 * Now we repeat all the above operations for the receive work queue,
	 * or shared receive work queue.
	 *
	 * Note: We still use the 'qp_rq_cqhdl' even in the SRQ case.
	 */

#ifdef __lock_lint
	mutex_enter(&qp->qp_srqhdl->srq_lock);
#else
	if (qp->qp_alloc_flags & IBT_QP_USES_SRQ) {
		/*
		 * SRQ case: the receive queue state lives in the SRQ, so we
		 * only take the SRQ lock here; its indices are shared with
		 * other QPs and must not be reset.
		 */
		mutex_enter(&qp->qp_srqhdl->srq_lock);
	} else {
		/* Non-SRQ case: reset this QP's own receive queue indices */
		rwq = qp->qp_rq_wqhdr;
		rwq->wq_head = 0;
		rwq->wq_tail = 0;
		rwq->wq_full = 0;
		qp->qp_rq_wqecntr = 0;
	}
#endif
	hermon_cq_workq_add(qp->qp_rq_cqhdl, &qp->qp_rq_wqavl);

#ifdef __lock_lint
	mutex_exit(&qp->qp_srqhdl->srq_lock);
#else
	if (qp->qp_alloc_flags & IBT_QP_USES_SRQ) {
		mutex_exit(&qp->qp_srqhdl->srq_lock);
	}
#endif

#ifdef __lock_lint
	mutex_exit(&qp->qp_sq_cqhdl->cq_lock);
	mutex_exit(&qp->qp_rq_cqhdl->cq_lock);
#else
	/* Release the CQ lock(s) in the reverse of acquisition order */
	if (qp->qp_rq_cqhdl != qp->qp_sq_cqhdl &&
	    qp->qp_sq_cqhdl != NULL)
		mutex_exit(&qp->qp_sq_cqhdl->cq_lock);
	if (qp->qp_rq_cqhdl)
		mutex_exit(&qp->qp_rq_cqhdl->cq_lock);
#endif
	return (DDI_SUCCESS);
}
2286 
2287 
2288 /*
2289  * hermon_wrid_to_reset_handling()
2290  *    Context: Can be called from interrupt or base context.
2291  */
int
hermon_wrid_to_reset_handling(hermon_state_t *state, hermon_qphdl_t qp)
{
	/*
	 * Userland-mapped QPs track their WRIDs in userland; the kernel
	 * keeps no WRID state for them, so there is nothing to tear down.
	 */
	if (qp->qp_alloc_flags & IBT_QP_USER_MAP)
		return (DDI_SUCCESS);

	/*
	 * If there are unpolled entries in these CQs, they are
	 * polled/flushed.
	 * Grab the CQ lock(s) before manipulating the lists.
	 */
#ifdef __lock_lint
	mutex_enter(&qp->qp_rq_cqhdl->cq_lock);
	mutex_enter(&qp->qp_sq_cqhdl->cq_lock);
#else
	/*
	 * grab the cq lock(s) to modify the wqavl tree
	 *
	 * Lock order is RQ CQ first, then SQ CQ; the second lock is taken
	 * only when the two CQ handles are distinct (and non-NULL), since
	 * send and receive queues may share a single CQ.
	 */
	if (qp->qp_rq_cqhdl)
		mutex_enter(&qp->qp_rq_cqhdl->cq_lock);
	if (qp->qp_rq_cqhdl != qp->qp_sq_cqhdl &&
	    qp->qp_sq_cqhdl != NULL)
		mutex_enter(&qp->qp_sq_cqhdl->cq_lock);
#endif

#ifdef __lock_lint
	mutex_enter(&qp->qp_srqhdl->srq_lock);
#else
	/* The SRQ lock protects the shared receive queue's free list */
	if (qp->qp_alloc_flags & IBT_QP_USES_SRQ) {
		mutex_enter(&qp->qp_srqhdl->srq_lock);
	}
#endif
	/*
	 * Flush the entries on the CQ for this QP's QPN.
	 */
	hermon_cq_entries_flush(state, qp);

#ifdef __lock_lint
	mutex_exit(&qp->qp_srqhdl->srq_lock);
#else
	if (qp->qp_alloc_flags & IBT_QP_USES_SRQ) {
		mutex_exit(&qp->qp_srqhdl->srq_lock);
	}
#endif

	/* Unchain this QP's work queue headers from the CQs' AVL trees */
	hermon_cq_workq_remove(qp->qp_rq_cqhdl, &qp->qp_rq_wqavl);
	if (qp->qp_sq_cqhdl != NULL)
		hermon_cq_workq_remove(qp->qp_sq_cqhdl, &qp->qp_sq_wqavl);

#ifdef __lock_lint
	mutex_exit(&qp->qp_sq_cqhdl->cq_lock);
	mutex_exit(&qp->qp_rq_cqhdl->cq_lock);
#else
	/* Release the CQ lock(s) in the reverse of acquisition order */
	if (qp->qp_rq_cqhdl != qp->qp_sq_cqhdl &&
	    qp->qp_sq_cqhdl != NULL)
		mutex_exit(&qp->qp_sq_cqhdl->cq_lock);
	if (qp->qp_rq_cqhdl)
		mutex_exit(&qp->qp_rq_cqhdl->cq_lock);
#endif

	return (IBT_SUCCESS);
}
2352 
2353 
2354 /*
2355  * hermon_wrid_get_entry()
2356  *    Context: Can be called from interrupt or base context.
2357  */
uint64_t
hermon_wrid_get_entry(hermon_cqhdl_t cq, hermon_hw_cqe_t *cqe)
{
	hermon_workq_avl_t	*wqa;
	hermon_workq_hdr_t	*wq;
	uint64_t		wrid;
	uint_t			send_or_recv, qpnum;
	uint32_t		indx;

	/*
	 * Determine whether this CQE is a send or receive completion.
	 */
	send_or_recv = HERMON_CQE_SENDRECV_GET(cq, cqe);

	/* Find the work queue for this QP number (send or receive side) */
	qpnum = HERMON_CQE_QPNUM_GET(cq, cqe);
	wqa = hermon_wrid_wqavl_find(cq, qpnum, send_or_recv);
	wq = wqa->wqa_wq;

	/*
	 * Regardless of whether the completion is the result of a "success"
	 * or a "failure", we lock the list of "containers" and attempt to
	 * search for the first matching completion (i.e. the first WR
	 * with a matching WQE addr and size).  Once we find it, we pull out
	 * the "wrid" field and return it (see below).  XXX Note: One possible
	 * future enhancement would be to enable this routine to skip over
	 * any "unsignaled" completions to go directly to the next "signaled"
	 * entry on success.
	 */
	/* Mask the CQE's WQE address down to an index into the wrid array */
	indx = HERMON_CQE_WQEADDRSZ_GET(cq, cqe) & wq->wq_mask;
	wrid = wq->wq_wrid[indx];
	if (wqa->wqa_srq_en) {
		struct hermon_sw_srq_s	*srq;
		uint64_t		*desc;

		/* put wqe back on the srq free list */
		srq = wqa->wqa_srq;
		mutex_enter(&srq->srq_lock);
		desc = HERMON_SRQ_WQE_ADDR(srq, wq->wq_tail);
		/*
		 * Link the current free-list tail WQE to this newly freed
		 * index (stored big-endian in the descriptor), then make
		 * this index the new tail.
		 */
		((uint16_t *)desc)[1] = htons(indx);
		wq->wq_tail = indx;
		mutex_exit(&srq->srq_lock);
	} else {
		/* Non-SRQ: advance the head past this entry and clear full */
		wq->wq_head = (indx + 1) & wq->wq_mask;
		wq->wq_full = 0;
	}

	return (wrid);
}
2407 
2408 
2409 int
2410 hermon_wrid_workq_compare(const void *p1, const void *p2)
2411 {
2412         hermon_workq_compare_t  *cmpp;
2413         hermon_workq_avl_t      *curr;
2414 
2415         cmpp = (hermon_workq_compare_t *)p1;
2416         curr = (hermon_workq_avl_t *)p2;
2417 
2418         if (cmpp->cmp_qpn < curr->wqa_qpn)
2419                 return (-1);
2420         else if (cmpp->cmp_qpn > curr->wqa_qpn)
2421                 return (+1);
2422         else if (cmpp->cmp_type < curr->wqa_type)
2423                 return (-1);
2424         else if (cmpp->cmp_type > curr->wqa_type)
2425                 return (+1);
2426         else
2427                 return (0);
2428 }
2429 
<![CDATA[

/*
 * hermon_wrid_wqavl_find()
 *    Context: Can be called from interrupt or base context.
 */]]>
2435 static hermon_workq_avl_t *
2436 hermon_wrid_wqavl_find(hermon_cqhdl_t cq, uint_t qpn, uint_t wq_type)
2437 {
2438         hermon_workq_avl_t      *curr;
2439         hermon_workq_compare_t  cmp;
2440 
2441         /*
2442          * Walk the CQ's work queue list, trying to find a send or recv queue
2443          * with the same QP number.  We do this even if we are going to later
2444          * create a new entry because it helps us easily find the end of the
2445          * list.
2446          */
2447         cmp.cmp_qpn = qpn;
2448         cmp.cmp_type = wq_type;
2449 #ifdef __lock_lint
2450         hermon_wrid_workq_compare(NULL, NULL);
2451 #endif
2452         curr = avl_find(&cq->cq_wrid_wqhdr_avl_tree, &cmp, NULL);
2453 
2454         return (curr);
2455 }
2456 
2457 
2458 /*
2459  * hermon_wrid_wqhdr_create()
2460  *    Context: Can be called from base context.
2461  */
2462 /* ARGSUSED */
2463 hermon_workq_hdr_t *
2464 hermon_wrid_wqhdr_create(int bufsz)
2465 {
2466         hermon_workq_hdr_t      *wqhdr;
2467 
2468         /*
2469          * Allocate space for the wqhdr, and an array to record all the wrids.
2470          */
2471         wqhdr = (hermon_workq_hdr_t *)kmem_zalloc(sizeof (*wqhdr), KM_NOSLEEP);
2472         if (wqhdr == NULL) {
2473                 return (NULL);
2474         }
2475         _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*wqhdr))
2476         wqhdr->wq_wrid = kmem_zalloc(bufsz * sizeof (uint64_t), KM_NOSLEEP);
2477         if (wqhdr->wq_wrid == NULL) {
2478                 kmem_free(wqhdr, sizeof (*wqhdr));
2479                 return (NULL);
2480         }
2481         wqhdr->wq_size = bufsz;
2482         wqhdr->wq_mask = bufsz - 1;
2483 
2484         return (wqhdr);
2485 }
2486 
2487 void
2488 hermon_wrid_wqhdr_destroy(hermon_workq_hdr_t *wqhdr)
2489 {
2490         kmem_free(wqhdr->wq_wrid, wqhdr->wq_size * sizeof (uint64_t));
2491         kmem_free(wqhdr, sizeof (*wqhdr));
2492 }
2493 
2494 
2495 /*
2496  * hermon_cq_workq_add()
2497  *    Context: Can be called from interrupt or base context.
2498  */
2499 static void
2500 hermon_cq_workq_add(hermon_cqhdl_t cq, hermon_workq_avl_t *wqavl)
2501 {
2502         hermon_workq_compare_t  cmp;
2503         avl_index_t             where;
2504 
2505         cmp.cmp_qpn = wqavl->wqa_qpn;
2506         cmp.cmp_type = wqavl->wqa_type;
2507 #ifdef __lock_lint
2508         hermon_wrid_workq_compare(NULL, NULL);
2509 #endif
2510         (void) avl_find(&cq->cq_wrid_wqhdr_avl_tree, &cmp, &where);
2511         avl_insert(&cq->cq_wrid_wqhdr_avl_tree, wqavl, where);
2512 }
2513 
2514 
2515 /*
2516  * hermon_cq_workq_remove()
2517  *    Context: Can be called from interrupt or base context.
2518  */
static void
hermon_cq_workq_remove(hermon_cqhdl_t cq, hermon_workq_avl_t *wqavl)
{
#ifdef __lock_lint
	hermon_wrid_workq_compare(NULL, NULL);
#endif
	/*
	 * Unlink this work queue AVL node from the CQ's tree; the node
	 * must currently be in the tree (see hermon_cq_workq_add()).
	 * Caller holds the CQ lock.
	 */
	avl_remove(&cq->cq_wrid_wqhdr_avl_tree, wqavl);
}