1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 
  26 /*
  27  * hermon_wr.c
  28  *    Hermon Work Request Processing Routines
  29  *
  30  *    Implements all the routines necessary to provide the PostSend(),
  31  *    PostRecv() and PostSRQ() verbs.  Also contains all the code
  32  *    necessary to implement the Hermon WRID tracking mechanism.
  33  */
  34 
  35 #include <sys/types.h>
  36 #include <sys/conf.h>
  37 #include <sys/ddi.h>
  38 #include <sys/sunddi.h>
  39 #include <sys/modctl.h>
  40 #include <sys/avl.h>
  41 
  42 #include <sys/ib/adapters/hermon/hermon.h>
  43 
/*
 * Forward declarations for the file-local WQE construction and
 * WRID-tracking helpers implemented later in this file.
 */
static uint32_t hermon_wr_get_immediate(ibt_send_wr_t *wr);
static int hermon_wr_bind_check(hermon_state_t *state, ibt_send_wr_t *wr);
static int hermon_wqe_send_build(hermon_state_t *state, hermon_qphdl_t qp,
    ibt_send_wr_t *wr, uint64_t *desc, uint_t *size);
static int hermon_wqe_mlx_build(hermon_state_t *state, hermon_qphdl_t qp,
    ibt_send_wr_t *wr, uint64_t *desc, uint_t *size);
static void hermon_wqe_headroom(uint_t from, hermon_qphdl_t qp);
static int hermon_wqe_recv_build(hermon_state_t *state, hermon_qphdl_t qp,
    ibt_recv_wr_t *wr, uint64_t *desc);
static int hermon_wqe_srq_build(hermon_state_t *state, hermon_srqhdl_t srq,
    ibt_recv_wr_t *wr, uint64_t *desc);
static hermon_workq_avl_t *hermon_wrid_wqavl_find(hermon_cqhdl_t cq, uint_t qpn,
    uint_t send_or_recv);
static void hermon_cq_workq_add(hermon_cqhdl_t cq, hermon_workq_avl_t *wqavl);
static void hermon_cq_workq_remove(hermon_cqhdl_t cq,
    hermon_workq_avl_t *wqavl);

/*
 * A null (zero-length) scatter/gather entry; used by code outside this
 * chunk when a work request carries no real data segments.
 */
static  ibt_wr_ds_t     null_sgl = { 0, 0x00000100, 0 };

/*
 * Add ability to try to debug RDMA_READ/RDMA_WRITE failures.
 *
 *      0x1 - print rkey used during post_send
 *      0x2 - print sgls used during post_send
 *      0x4 - print FMR comings and goings
 */
int hermon_rdma_debug = 0x0;
  71 
/*
 * hermon_post_send_ud()
 *    Post a chain of "num_wr" Send work requests onto the send queue of
 *    a UD, RFCI, or FC-command (FCMND) QP.  One WQE is built per request
 *    directly in the send queue memory; the WQE ownership/opcode word is
 *    set only after a producer barrier, and the HCA doorbell is rung once
 *    for all WQEs successfully built.
 *
 *    Caller must hold qp->qp_sq_lock; this routine drops that lock
 *    before returning (on both success and error paths).
 *
 *    Returns DDI_SUCCESS, or an IBT_* error for the first bad/unpostable
 *    request (any WQEs already built are still posted and counted via
 *    *num_posted), or an FMA CI-failure code if the doorbell PIO fails.
 */
static int
hermon_post_send_ud(hermon_state_t *state, hermon_qphdl_t qp,
    ibt_send_wr_t *wr, uint_t num_wr, uint_t *num_posted)
{
        hermon_hw_snd_wqe_ud_t          *ud;
        hermon_workq_hdr_t              *wq;
        hermon_ahhdl_t                  ah;
        ibt_wr_rfci_send_t              *rfci;
        ibt_wr_init_send_t              *is;
        ibt_ud_dest_t                   *dest;
        uint64_t                        *desc;
        uint32_t                        desc_sz;
        uint32_t                        signaled_dbd, solicited;
        uint32_t                        head, tail, next_tail, qsize_msk;
        uint32_t                        hdrmwqes;
        uint32_t                        nopcode, fence, immed_data = 0;
        hermon_hw_wqe_sgl_t             *ds, *old_ds;
        ibt_wr_ds_t                     *sgl;
        int                             nds;
        int                             i, j, last_ds, num_ds, status;
        uint32_t                        *wqe_start;
        int                             sectperwqe;
        uint_t                          posted_cnt = 0;
        int                             total_len, strong_order, fc_bits, cksum;


        /* initialize the FMA retry loop */
        hermon_pio_init(fm_loop_cnt, fm_status, fm_test_num);

        ASSERT(MUTEX_HELD(&qp->qp_sq_lock));

        /*
         * NOTE(review): despite the historical "grab the lock" wording
         * here, no lock is taken; membar_consumer() only ensures the
         * loads of the work queue state below are not reordered ahead
         * of prior updates made visible by other CPUs.
         */
        membar_consumer();

        /* Save away some initial QP state */
        wq = qp->qp_sq_wqhdr;
        qsize_msk = wq->wq_mask;        /* queue size is a power of two */
        hdrmwqes  = qp->qp_sq_hdrmwqes;              /* in WQEs  */
        /* loop bound used when stamping 64-byte sections of a WQE below */
        sectperwqe = 1 << (qp->qp_sq_log_wqesz - 2);

        tail      = wq->wq_tail;
        head      = wq->wq_head;
        status    = DDI_SUCCESS;

post_next:
        /*
         * Check for "queue full" condition.  If the queue
         * is already full, then no more WQEs can be posted.
         * So break out, ring a doorbell (if necessary) and
         * return an error
         */
        if (wq->wq_full != 0) {
                status = IBT_QP_FULL;
                goto done;
        }

        /*
         * Mark the queue full when this post consumes the last slot
         * that still leaves "hdrmwqes" of headroom before "head".
         */
        next_tail = (tail + 1) & qsize_msk;
        if (((tail + hdrmwqes) & qsize_msk) == head) {
                wq->wq_full = 1;
        }

        /* Address of the WQE being built for this request */
        desc = HERMON_QP_SQ_ENTRY(qp, tail);

        nds = wr->wr_nds;
        sgl = wr->wr_sgl;
        num_ds = 0;
        strong_order = 0;
        fc_bits = 0;
        cksum = 0;

        /*
         * Build a Send or Send_LSO WQE
         */
        switch (wr->wr_opcode) {
        case IBT_WRC_SEND_LSO:
                if (wr->wr_trans != IBT_UD_SRV) {
                        status = IBT_QP_SRV_TYPE_INVALID;
                        goto done;
                }
                nopcode = HERMON_WQE_SEND_NOPCODE_LSO;
                if (wr->wr_flags & IBT_WR_SEND_CKSUM)
                        cksum = 0x30;
                if (wr->wr.ud_lso.lso_hdr_sz > 60) {
                        nopcode |= (1 << 6);      /* ReRead bit must be set */
                }
                dest = wr->wr.ud_lso.lso_ud_dest;
                ah = (hermon_ahhdl_t)dest->ud_ah;
                if (ah == NULL) {
                        status = IBT_AH_HDL_INVALID;
                        goto done;
                }
                /* UD address segment follows the control segment */
                ud = (hermon_hw_snd_wqe_ud_t *)((uintptr_t)desc +
                    sizeof (hermon_hw_snd_wqe_ctrl_t));
                ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)ud +
                    sizeof (hermon_hw_snd_wqe_ud_t));
                HERMON_WQE_BUILD_UD(qp, ud, ah, dest);

                /*
                 * LSO segment size: 4 bytes of LSO info plus the LSO
                 * header, rounded up to a 16-byte boundary.  Make sure
                 * that segment plus all data segments fits in the WQE.
                 */
                total_len = (4 + 0xf + wr->wr.ud_lso.lso_hdr_sz) & ~0xf;
                if ((uintptr_t)ds + total_len + (nds * 16) >
                    (uintptr_t)desc + (1 << qp->qp_sq_log_wqesz)) {
                        status = IBT_QP_SGL_LEN_INVALID;
                        goto done;
                }
                /* copy the LSO header in; the LSO info word is set later */
                old_ds = ds;
                bcopy(wr->wr.ud_lso.lso_hdr, (uint32_t *)old_ds + 1,
                    wr->wr.ud_lso.lso_hdr_sz);
                ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)ds + total_len);
                i = 0;
                break;

        case IBT_WRC_SEND:
                nopcode = HERMON_WQE_SEND_NOPCODE_SEND;
                if (qp->qp_serv_type == HERMON_QP_UD) {
                        if (wr->wr_trans != IBT_UD_SRV) {
                                status = IBT_QP_SRV_TYPE_INVALID;
                                goto done;
                        }
                        if (wr->wr_flags & IBT_WR_SEND_CKSUM)
                                cksum = 0x30;
                        dest = wr->wr.ud.udwr_dest;
                } else if (qp->qp_serv_type == HERMON_QP_RFCI) {
                        if (wr->wr_trans != IBT_RFCI_SRV) {
                                status = IBT_QP_SRV_TYPE_INVALID;
                                goto done;
                        }
                        rfci = &wr->wr.fc.rfci_send;
                        if ((wr->wr_flags & IBT_WR_SEND_FC_CRC) != 0) {
                                nopcode |= (rfci->rfci_eof << 16);
                                fc_bits = 0x40; /* set FCRC */
                        }
                        dest = rfci->rfci_dest;
                } else {
                        status = IBT_QP_OP_TYPE_INVALID;
                        goto done;
                }
                if (wr->wr_flags & IBT_WR_SEND_IMMED) {
                        /* "|=" changes 0xa to 0xb without touching FCEOF */
                        nopcode |= HERMON_WQE_SEND_NOPCODE_SENDI;
                        /*
                         * NOTE(review): the immediate is read through the
                         * "ud" arm of the union even on the RFCI path --
                         * presumably the field shares the same offset;
                         * confirm against ibt_send_wr_t's union layout.
                         */
                        immed_data = wr->wr.ud.udwr_immed;
                }
                ah = (hermon_ahhdl_t)dest->ud_ah;
                if (ah == NULL) {
                        status = IBT_AH_HDL_INVALID;
                        goto done;
                }
                /* UD address segment follows the control segment */
                ud = (hermon_hw_snd_wqe_ud_t *)((uintptr_t)desc +
                    sizeof (hermon_hw_snd_wqe_ctrl_t));
                ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)ud +
                    sizeof (hermon_hw_snd_wqe_ud_t));
                HERMON_WQE_BUILD_UD(qp, ud, ah, dest);
                i = 0;
                break;

        case IBT_WRC_INIT_SEND_FCMD:
                if (qp->qp_serv_type != HERMON_QP_FCMND) {
                        status = IBT_QP_OP_TYPE_INVALID;
                        goto done;
                }
                if (wr->wr_trans != IBT_FCMD_SRV) {
                        status = IBT_QP_SRV_TYPE_INVALID;
                        goto done;
                }
                nopcode = HERMON_WQE_FCP_OPCODE_INIT_AND_SEND;
                is = wr->wr.fc.fc_is;
                dest = is->is_ctl.fc_dest;
                ah = (hermon_ahhdl_t)dest->ud_ah;
                if (ah == NULL) {
                        status = IBT_AH_HDL_INVALID;
                        goto done;
                }
                ud = (hermon_hw_snd_wqe_ud_t *)((uintptr_t)desc +
                    sizeof (hermon_hw_snd_wqe_ctrl_t));
                ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)ud +
                    sizeof (hermon_hw_snd_wqe_ud_t));
                HERMON_WQE_BUILD_UD(qp, ud, ah, dest);
                /* remember where the Init Segment goes; filled in later */
                old_ds = ds;
                /* move ds beyond the FCP-3 Init Segment */
                ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)ds + 0x10);
                i = 0;
                break;

        case IBT_WRC_FAST_REG_PMR:
        {
                hermon_hw_snd_wqe_frwr_t        *frwr;

                if (qp->qp_serv_type != HERMON_QP_FCMND) {
                        status = IBT_QP_OP_TYPE_INVALID;
                        goto done;
                }
                if (wr->wr_trans != IBT_FCMD_SRV) {
                        status = IBT_QP_SRV_TYPE_INVALID;
                        goto done;
                }
                nopcode = HERMON_WQE_SEND_NOPCODE_FRWR;
                frwr = (hermon_hw_snd_wqe_frwr_t *)((uintptr_t)desc +
                    sizeof (hermon_hw_snd_wqe_ctrl_t));
                HERMON_WQE_BUILD_FRWR(qp, frwr, wr->wr.fc.reg_pmr);
                ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)frwr +
                    sizeof (hermon_hw_snd_wqe_frwr_t));
                /* FRWR carries no data segments; request strong ordering */
                nds = 0;
                strong_order = 0x80;
                break;
        }

#if 0
        /* firmware does not support this */
        case IBT_WRC_LOCAL_INVALIDATE:
        {
                hermon_hw_snd_wqe_local_inv_t   *li;

                if (qp->qp_serv_type != HERMON_QP_FCMND) {
                        status = IBT_QP_OP_TYPE_INVALID;
                        goto done;
                }
                if (wr->wr_trans != IBT_FCMD_SRV) {
                        status = IBT_QP_SRV_TYPE_INVALID;
                        goto done;
                }
                nopcode = HERMON_WQE_SEND_NOPCODE_LCL_INV;
                li = (hermon_hw_snd_wqe_local_inv_t *)((uintptr_t)desc +
                    sizeof (hermon_hw_snd_wqe_ctrl_t));
                HERMON_WQE_BUILD_LI(qp, li, wr->wr.fc.li);
                ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)li +
                    sizeof (hermon_hw_snd_wqe_local_inv_t));
                nds = 0;
                strong_order = 0x80;
                break;
        }
#endif
        default:
                status = IBT_QP_OP_TYPE_INVALID;
                goto done;
        }

        /* Reject requests with more SGL entries than the SQ supports */
        if (nds > qp->qp_sq_sgl) {
                status = IBT_QP_SGL_LEN_INVALID;
                goto done;
        }
        /*
         * Count the non-zero-length SGL entries; "last_ds" ends up one
         * past the final data segment slot that will be written.
         */
        for (last_ds = num_ds, j = i; j < nds; j++) {
                if (sgl[j].ds_len != 0)
                        last_ds++;      /* real last ds of wqe to fill */
        }
        /* descriptor size in 16-byte units, as the hardware expects */
        desc_sz = ((uintptr_t)&ds[last_ds] - (uintptr_t)desc) >> 0x4;
        /* walk the SGL backwards, filling data segments last-to-first */
        for (j = nds; --j >= i; ) {
                if (sgl[j].ds_len == 0) {
                        continue;
                }

                /*
                 * Fill in the Data Segment(s) for the current WQE, using the
                 * information contained in the scatter-gather list of the
                 * work request.
                 */
                last_ds--;
                HERMON_WQE_BUILD_DATA_SEG_SEND(&ds[last_ds], &sgl[j]);
        }

        /* data segments must be visible before the segments written below */
        membar_producer();

        if (wr->wr_opcode == IBT_WRC_SEND_LSO) {
                HERMON_WQE_BUILD_LSO(qp, old_ds, wr->wr.ud_lso.lso_mss,
                    wr->wr.ud_lso.lso_hdr_sz);
        } else if (wr->wr_opcode == IBT_WRC_INIT_SEND_FCMD) {
                /* This sits in the STAMP, so must be set after setting SGL */
                HERMON_WQE_BUILD_FCP3_INIT(old_ds, is->is_ctl.fc_frame_ctrl,
                    is->is_cs_priority, is->is_tx_seq_id, is->is_fc_mtu,
                    is->is_dest_id, is->is_op, is->is_rem_exch,
                    is->is_exch_qp_idx);

                /* The following will be used in HERMON_WQE_SET_CTRL_SEGMENT */
                /* SIT bit in FCP-3 ctrl segment */
                desc_sz |= (is->is_ctl.fc_frame_ctrl & IBT_FCTL_SIT) ? 0x80 : 0;
                /* LS bit in FCP-3 ctrl segment */
                fc_bits |= (is->is_ctl.fc_frame_ctrl & IBT_FCTL_LAST_SEQ) ?
                    0x10000 : 0;
                fc_bits |= ((is->is_ctl.fc_routing_ctrl & 0xF) << 20) |
                    (is->is_ctl.fc_seq_id << 24);
                immed_data = is->is_ctl.fc_parameter;
        }

        /* control segment flag bits derived from the WR flags */
        fence = (wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0;

        signaled_dbd = ((qp->qp_sq_sigtype == HERMON_QP_SQ_ALL_SIGNALED) ||
            (wr->wr_flags & IBT_WR_SEND_SIGNAL)) ? 0xC : 0;

        solicited = (wr->wr_flags & IBT_WR_SEND_SOLICIT) ? 0x2 : 0;

        HERMON_WQE_SET_CTRL_SEGMENT(desc, desc_sz, fence, immed_data,
            solicited, signaled_dbd, cksum, qp, strong_order, fc_bits);

        /* record the caller's work request ID for completion processing */
        wq->wq_wrid[tail] = wr->wr_id;

        tail = next_tail;

        /* Update some of the state in the QP */
        wq->wq_tail = tail;

        /* entire WQE must be visible before ownership passes to hardware */
        membar_producer();

        /* Now set the ownership bit and opcode (first dword). */
        HERMON_SET_SEND_WQE_OWNER(qp, (uint32_t *)desc, nopcode);

        posted_cnt++;
        if (--num_wr > 0) {
                /* do the invalidate of the headroom */
                wqe_start = (uint32_t *)HERMON_QP_SQ_ENTRY(qp,
                    (tail + hdrmwqes) & qsize_msk);
                /* stamp the first dword of each 64-byte section */
                for (i = 16; i < sectperwqe; i += 16) {
                        wqe_start[i] = 0xFFFFFFFF;
                }

                wr++;
                goto post_next;
        }
done:
        if (posted_cnt != 0) {
                ddi_acc_handle_t uarhdl = hermon_get_uarhdl(state);

                /* WQEs must be globally visible before the doorbell */
                membar_producer();

                /* the FMA retry loop starts for Hermon doorbell register. */
                hermon_pio_start(state, uarhdl, pio_error, fm_loop_cnt,
                    fm_status, fm_test_num);

                HERMON_UAR_DOORBELL(state, uarhdl,
                    (uint64_t *)(void *)&state->hs_uar->send,
                    (uint64_t)qp->qp_ring);

                /* the FMA retry loop ends. */
                hermon_pio_end(state, uarhdl, pio_error, fm_loop_cnt,
                    fm_status, fm_test_num);

                /* do the invalidate of the headroom */
                wqe_start = (uint32_t *)HERMON_QP_SQ_ENTRY(qp,
                    (tail + hdrmwqes) & qsize_msk);
                for (i = 16; i < sectperwqe; i += 16) {
                        wqe_start[i] = 0xFFFFFFFF;
                }
        }
        /* report how many WQEs were actually posted (if requested) */
        if (num_posted != NULL)
                *num_posted = posted_cnt;

        mutex_exit(&qp->qp_sq_lock);

        return (status);

pio_error:
        /* doorbell PIO failed: report the FM ereport and a CI failure */
        mutex_exit(&qp->qp_sq_lock);
        hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
        return (ibc_get_ci_failure(0));
}
 423 
/*
 * hermon_post_send_rc()
 *    Post a chain of "num_wr" Send work requests onto the send queue of
 *    an RC QP.  Supported operations are Send (with optional immediate
 *    or remote-invalidate), RDMA Read/Write, Atomic Compare-Swap and
 *    Fetch-Add, memory window Bind, Fast Register PMR, and Local
 *    Invalidate.  One WQE is built per request; the ownership/opcode
 *    word is set after a producer barrier, and the doorbell is rung
 *    once for all WQEs successfully built.
 *
 *    Caller must hold qp->qp_sq_lock; this routine drops that lock
 *    before returning (on both success and error paths).
 *
 *    Returns DDI_SUCCESS, or an IBT_* error for the first bad request
 *    (earlier WQEs are still posted and counted via *num_posted), or an
 *    FMA CI-failure code if the doorbell PIO fails.
 */
static int
hermon_post_send_rc(hermon_state_t *state, hermon_qphdl_t qp,
    ibt_send_wr_t *wr, uint_t num_wr, uint_t *num_posted)
{
        uint64_t                        *desc;
        hermon_workq_hdr_t              *wq;
        uint32_t                        desc_sz;
        uint32_t                        signaled_dbd, solicited;
        uint32_t                        head, tail, next_tail, qsize_msk;
        uint32_t                        hdrmwqes;
        int                             status;
        uint32_t                        nopcode, fence, immed_data = 0;
        hermon_hw_snd_wqe_remaddr_t     *rc;
        hermon_hw_snd_wqe_atomic_t      *at;
        hermon_hw_snd_wqe_bind_t        *bn;
        hermon_hw_snd_wqe_frwr_t        *frwr;
        hermon_hw_snd_wqe_local_inv_t   *li;
        hermon_hw_wqe_sgl_t             *ds;
        ibt_wr_ds_t                     *sgl;
        int                             nds;
        int                             i, last_ds, num_ds;
        uint32_t                        *wqe_start;
        int                             sectperwqe;
        uint_t                          posted_cnt = 0;
        int                             strong_order;
        int                             print_rdma;     /* hermon_rdma_debug */
        int                             rlen;
        uint32_t                        rkey;
        uint64_t                        raddr;

        /* initialize the FMA retry loop */
        hermon_pio_init(fm_loop_cnt, fm_status, fm_test_num);

        ASSERT(MUTEX_HELD(&qp->qp_sq_lock));

        /* Save away some initial QP state */
        wq = qp->qp_sq_wqhdr;
        qsize_msk = wq->wq_mask;        /* queue size is a power of two */
        hdrmwqes  = qp->qp_sq_hdrmwqes;              /* in WQEs  */
        /* loop bound used when stamping 64-byte sections of a WQE below */
        sectperwqe = 1 << (qp->qp_sq_log_wqesz - 2);

        tail      = wq->wq_tail;
        head      = wq->wq_head;
        status    = DDI_SUCCESS;

post_next:
        /* per-request debug/accounting state */
        print_rdma = 0;
        rlen = 0;
        strong_order = 0;

        /*
         * Check for "queue full" condition.  If the queue
         * is already full, then no more WQEs can be posted.
         * So break out, ring a doorbell (if necessary) and
         * return an error
         */
        if (wq->wq_full != 0) {
                status = IBT_QP_FULL;
                goto done;
        }
        /*
         * Mark the queue full when this post consumes the last slot
         * that still leaves "hdrmwqes" of headroom before "head".
         */
        next_tail = (tail + 1) & qsize_msk;
        if (((tail + hdrmwqes) & qsize_msk) == head) {
                wq->wq_full = 1;
        }

        /* Address of the WQE being built for this request */
        desc = HERMON_QP_SQ_ENTRY(qp, tail);

        /* default: data segments directly follow the control segment */
        ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)desc +
            sizeof (hermon_hw_snd_wqe_ctrl_t));
        nds = wr->wr_nds;
        sgl = wr->wr_sgl;
        num_ds = 0;
        if (wr->wr_trans != IBT_RC_SRV) {
                status = IBT_QP_SRV_TYPE_INVALID;
                goto done;
        }

        /*
         * Validate the operation type.  For RC requests, we allow
         * "Send", "RDMA Read", "RDMA Write", various "Atomic"
         * operations, and memory window "Bind"
         */
        switch (wr->wr_opcode) {
        default:
                status = IBT_QP_OP_TYPE_INVALID;
                goto done;

        case IBT_WRC_SEND:
                if (wr->wr_flags & IBT_WR_SEND_REMOTE_INVAL) {
                        /* Send with Invalidate: rkey rides in immed field */
                        nopcode = HERMON_WQE_SEND_NOPCODE_SND_INV;
                        immed_data = wr->wr.rc.rcwr.send_inval;
                } else if (wr->wr_flags & IBT_WR_SEND_IMMED) {
                        nopcode = HERMON_WQE_SEND_NOPCODE_SENDI;
                        immed_data = wr->wr.rc.rcwr.send_immed;
                } else {
                        nopcode = HERMON_WQE_SEND_NOPCODE_SEND;
                }
                break;

        /*
         * If this is an RDMA Read or RDMA Write request, then fill
         * in the "Remote Address" header fields.
         */
        case IBT_WRC_RDMAW:
                if (wr->wr_flags & IBT_WR_SEND_IMMED) {
                        nopcode = HERMON_WQE_SEND_NOPCODE_RDMAWI;
                        immed_data = wr->wr.rc.rcwr.rdma.rdma_immed;
                } else {
                        nopcode = HERMON_WQE_SEND_NOPCODE_RDMAW;
                }
                /* FALLTHROUGH */
        case IBT_WRC_RDMAR:
                if (wr->wr_opcode == IBT_WRC_RDMAR)
                        nopcode = HERMON_WQE_SEND_NOPCODE_RDMAR;
                rc = (hermon_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
                    sizeof (hermon_hw_snd_wqe_ctrl_t));

                /*
                 * Build the Remote Address Segment for the WQE, using
                 * the information from the RC work request.
                 */
                HERMON_WQE_BUILD_REMADDR(qp, rc, &wr->wr.rc.rcwr.rdma);

                /* capture rkey/raddr for the optional debug output below */
                if (hermon_rdma_debug) {
                        print_rdma = hermon_rdma_debug;
                        rkey = wr->wr.rc.rcwr.rdma.rdma_rkey;
                        raddr = wr->wr.rc.rcwr.rdma.rdma_raddr;
                }

                /* Update "ds" for filling in Data Segments (below) */
                ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)rc +
                    sizeof (hermon_hw_snd_wqe_remaddr_t));
                break;

        /*
         * If this is one of the Atomic type operations (i.e
         * Compare-Swap or Fetch-Add), then fill in both the "Remote
         * Address" header fields and the "Atomic" header fields.
         */
        case IBT_WRC_CSWAP:
                nopcode = HERMON_WQE_SEND_NOPCODE_ATMCS;
                /* FALLTHROUGH */
        case IBT_WRC_FADD:
                if (wr->wr_opcode == IBT_WRC_FADD)
                        nopcode = HERMON_WQE_SEND_NOPCODE_ATMFA;
                rc = (hermon_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
                    sizeof (hermon_hw_snd_wqe_ctrl_t));
                at = (hermon_hw_snd_wqe_atomic_t *)((uintptr_t)rc +
                    sizeof (hermon_hw_snd_wqe_remaddr_t));

                /*
                 * Build the Remote Address and Atomic Segments for
                 * the WQE, using the information from the RC Atomic
                 * work request.
                 */
                HERMON_WQE_BUILD_RC_ATOMIC_REMADDR(qp, rc, wr);
                HERMON_WQE_BUILD_ATOMIC(qp, at, wr->wr.rc.rcwr.atomic);

                /* Update "ds" for filling in Data Segments (below) */
                ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)at +
                    sizeof (hermon_hw_snd_wqe_atomic_t));

                /*
                 * Update "nds" and "sgl" because Atomic requests have
                 * only a single Data Segment.
                 */
                nds = 1;
                sgl = wr->wr_sgl;
                break;

        /*
         * If this is memory window Bind operation, then we call the
         * hermon_wr_bind_check() routine to validate the request and
         * to generate the updated RKey.  If this is successful, then
         * we fill in the WQE's "Bind" header fields.
         */
        case IBT_WRC_BIND:
                nopcode = HERMON_WQE_SEND_NOPCODE_BIND;
                status = hermon_wr_bind_check(state, wr);
                if (status != DDI_SUCCESS)
                        goto done;

                bn = (hermon_hw_snd_wqe_bind_t *)((uintptr_t)desc +
                    sizeof (hermon_hw_snd_wqe_ctrl_t));

                /*
                 * Build the Bind Memory Window Segments for the WQE,
                 * using the information from the RC Bind memory
                 * window work request.
                 */
                HERMON_WQE_BUILD_BIND(qp, bn, wr->wr.rc.rcwr.bind);

                /*
                 * Update the "ds" pointer.  Even though the "bind"
                 * operation requires no SGLs, this is necessary to
                 * facilitate the correct descriptor size calculations
                 * (below).
                 */
                ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)bn +
                    sizeof (hermon_hw_snd_wqe_bind_t));
                nds = 0;
                break;

        case IBT_WRC_FAST_REG_PMR:
                nopcode = HERMON_WQE_SEND_NOPCODE_FRWR;
                frwr = (hermon_hw_snd_wqe_frwr_t *)((uintptr_t)desc +
                    sizeof (hermon_hw_snd_wqe_ctrl_t));
                HERMON_WQE_BUILD_FRWR(qp, frwr, wr->wr.rc.rcwr.reg_pmr);
                ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)frwr +
                    sizeof (hermon_hw_snd_wqe_frwr_t));
                /* FRWR carries no data segments; request strong ordering */
                nds = 0;
                strong_order = 0x80;
                break;

        case IBT_WRC_LOCAL_INVALIDATE:
                nopcode = HERMON_WQE_SEND_NOPCODE_LCL_INV;
                li = (hermon_hw_snd_wqe_local_inv_t *)((uintptr_t)desc +
                    sizeof (hermon_hw_snd_wqe_ctrl_t));
                HERMON_WQE_BUILD_LI(qp, li, wr->wr.rc.rcwr.li);
                ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)li +
                    sizeof (hermon_hw_snd_wqe_local_inv_t));
                nds = 0;
                strong_order = 0x80;
                break;
        }

        /*
         * Now fill in the Data Segments (SGL) for the Send WQE based
         * on the values setup above (i.e. "sgl", "nds", and the "ds"
         * pointer. Start by checking for a valid number of SGL entries
         */
        if (nds > qp->qp_sq_sgl) {
                status = IBT_QP_SGL_LEN_INVALID;
                goto done;
        }

        /*
         * Count the non-zero-length SGL entries; "last_ds" ends up one
         * past the final data segment slot that will be written.
         */
        for (last_ds = num_ds, i = 0; i < nds; i++) {
                if (sgl[i].ds_len != 0)
                        last_ds++;      /* real last ds of wqe to fill */
        }
        /* descriptor size in 16-byte units, as the hardware expects */
        desc_sz = ((uintptr_t)&ds[last_ds] - (uintptr_t)desc) >> 0x4;
        /* walk the SGL backwards, filling data segments last-to-first */
        for (i = nds; --i >= 0; ) {
                if (sgl[i].ds_len == 0) {
                        continue;
                }
                rlen += sgl[i].ds_len;
                if (print_rdma & 0x2)
                        IBTF_DPRINTF_L2("rdma", "post: [%d]: laddr %llx  "
                            "llen %x", i, sgl[i].ds_va, sgl[i].ds_len);

                /*
                 * Fill in the Data Segment(s) for the current WQE, using the
                 * information contained in the scatter-gather list of the
                 * work request.
                 */
                last_ds--;
                HERMON_WQE_BUILD_DATA_SEG_SEND(&ds[last_ds], &sgl[i]);
        }
        /* ensure RDMA READ does not exceed HCA limit */
        if ((wr->wr_opcode == IBT_WRC_RDMAR) && (desc_sz >
            state->hs_ibtfinfo.hca_attr->hca_conn_rdma_read_sgl_sz + 2)) {
                status = IBT_QP_SGL_LEN_INVALID;
                goto done;
        }

        if (print_rdma & 0x1) {
                IBTF_DPRINTF_L2("rdma", "post: indx %x  rkey %x  raddr %llx  "
                    "total len %x", tail, rkey, raddr, rlen);
        }

        /* control segment flag bits derived from the WR flags */
        fence = (wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0;

        signaled_dbd = ((qp->qp_sq_sigtype == HERMON_QP_SQ_ALL_SIGNALED) ||
            (wr->wr_flags & IBT_WR_SEND_SIGNAL)) ? 0xC : 0;

        solicited = (wr->wr_flags & IBT_WR_SEND_SOLICIT) ? 0x2 : 0;

        HERMON_WQE_SET_CTRL_SEGMENT(desc, desc_sz, fence, immed_data, solicited,
            signaled_dbd, 0, qp, strong_order, 0);

        /* record the caller's work request ID for completion processing */
        wq->wq_wrid[tail] = wr->wr_id;

        tail = next_tail;

        /* Update some of the state in the QP */
        wq->wq_tail = tail;

        /* entire WQE must be visible before ownership passes to hardware */
        membar_producer();

        /* Now set the ownership bit of the first one in the chain. */
        HERMON_SET_SEND_WQE_OWNER(qp, (uint32_t *)desc, nopcode);

        posted_cnt++;
        if (--num_wr > 0) {
                /* do the invalidate of the headroom */
                wqe_start = (uint32_t *)HERMON_QP_SQ_ENTRY(qp,
                    (tail + hdrmwqes) & qsize_msk);
                /* stamp the first dword of each 64-byte section */
                for (i = 16; i < sectperwqe; i += 16) {
                        wqe_start[i] = 0xFFFFFFFF;
                }

                wr++;
                goto post_next;
        }
done:

        if (posted_cnt != 0) {
                ddi_acc_handle_t uarhdl = hermon_get_uarhdl(state);

                /* WQEs must be globally visible before the doorbell */
                membar_producer();

                /* the FMA retry loop starts for Hermon doorbell register. */
                hermon_pio_start(state, uarhdl, pio_error, fm_loop_cnt,
                    fm_status, fm_test_num);

                /* Ring the doorbell */
                HERMON_UAR_DOORBELL(state, uarhdl,
                    (uint64_t *)(void *)&state->hs_uar->send,
                    (uint64_t)qp->qp_ring);

                /* the FMA retry loop ends. */
                hermon_pio_end(state, uarhdl, pio_error, fm_loop_cnt,
                    fm_status, fm_test_num);

                /* do the invalidate of the headroom */
                wqe_start = (uint32_t *)HERMON_QP_SQ_ENTRY(qp,
                    (tail + hdrmwqes) & qsize_msk);
                for (i = 16; i < sectperwqe; i += 16) {
                        wqe_start[i] = 0xFFFFFFFF;
                }
        }
        /*
         * Update the "num_posted" return value (if necessary).
         * Then drop the locks and return success.
         */
        if (num_posted != NULL) {
                *num_posted = posted_cnt;
        }

        mutex_exit(&qp->qp_sq_lock);
        return (status);

pio_error:
        /* doorbell PIO failed: report the FM ereport and a CI failure */
        mutex_exit(&qp->qp_sq_lock);
        hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
        return (ibc_get_ci_failure(0));
}
 771 
 772 /*
 773  * hermon_post_send()
 774  *    Context: Can be called from interrupt or base context.
 775  */
 776 int
 777 hermon_post_send(hermon_state_t *state, hermon_qphdl_t qp,
 778     ibt_send_wr_t *wr, uint_t num_wr, uint_t *num_posted)
 779 {
 780         ibt_send_wr_t                   *curr_wr;
 781         hermon_workq_hdr_t              *wq;
 782         hermon_ahhdl_t                  ah;
 783         uint64_t                        *desc, *prev;
 784         uint32_t                        desc_sz;
 785         uint32_t                        signaled_dbd, solicited;
 786         uint32_t                        head, tail, next_tail, qsize_msk;
 787         uint32_t                        hdrmwqes;
 788         uint_t                          currindx, wrindx, numremain;
 789         uint_t                          chainlen;
 790         uint_t                          posted_cnt, maxstat;
 791         uint_t                          total_posted;
 792         int                             status;
 793         uint32_t                        nopcode, fence, immed_data = 0;
 794         uint32_t                        prev_nopcode;
 795         uint_t                          qp_state;
 796 
 797         /* initialize the FMA retry loop */
 798         hermon_pio_init(fm_loop_cnt, fm_status, fm_test);
 799 
 800         /*
 801          * Check for user-mappable QP memory.  Note:  We do not allow kernel
 802          * clients to post to QP memory that is accessible directly by the
 803          * user.  If the QP memory is user accessible, then return an error.
 804          */
 805         if (qp->qp_alloc_flags & IBT_QP_USER_MAP) {
 806                 return (IBT_QP_HDL_INVALID);
 807         }
 808 
 809         mutex_enter(&qp->qp_sq_lock);
 810 
 811         /*
 812          * Check QP state.  Can not post Send requests from the "Reset",
 813          * "Init", or "RTR" states
 814          */
 815         qp_state = qp->qp_state_for_post_send;
 816         if ((qp_state == HERMON_QP_RESET) ||
 817             (qp_state == HERMON_QP_INIT) ||
 818             (qp_state == HERMON_QP_RTR)) {
 819                 mutex_exit(&qp->qp_sq_lock);
 820                 return (IBT_QP_STATE_INVALID);
 821         }
 822 
 823         if (qp->qp_is_special)
 824                 goto post_many;
 825 
 826         /* Use these optimized functions most of the time */
 827         if (qp->qp_type == IBT_UD_RQP) {
 828                 return (hermon_post_send_ud(state, qp, wr, num_wr, num_posted));
 829         }
 830 
 831         if (qp->qp_serv_type == HERMON_QP_RC) {
 832                 return (hermon_post_send_rc(state, qp, wr, num_wr, num_posted));
 833         }
 834 
 835         if (qp->qp_serv_type == HERMON_QP_UC)
 836                 goto post_many;
 837 
 838         mutex_exit(&qp->qp_sq_lock);
 839         return (IBT_QP_SRV_TYPE_INVALID);
 840 
 841 post_many:
 842         /* general loop for non-optimized posting */
 843 
 844         /* Save away some initial QP state */
 845         wq = qp->qp_sq_wqhdr;
 846         qsize_msk = wq->wq_mask;
 847         tail      = wq->wq_tail;
 848         head      = wq->wq_head;
 849         hdrmwqes  = qp->qp_sq_hdrmwqes;              /* in WQEs  */
 850 
 851         /* Initialize posted_cnt */
 852         posted_cnt = 0;
 853         total_posted = 0;
 854 
 855         /*
 856          * For each ibt_send_wr_t in the wr[] list passed in, parse the
 857          * request and build a Send WQE.  NOTE:  Because we are potentially
 858          * building a chain of WQEs to post, we want to build them all first,
 859          * and set the valid (HW Ownership) bit on all but the first.
 860          * However, we do not want to validate the first one until the
 861          * entire chain of WQEs has been built.  Then in the final
 862          * we set the valid bit in the first, flush if needed, and as a last
 863          * step ring the appropriate doorbell.  NOTE: the doorbell ring may
 864          * NOT be needed if the HCA is already processing, but the doorbell
 865          * ring will be done regardless. NOTE ALSO:  It is possible for
 866          * more Work Requests to be posted than the HW will support at one
 867          * shot.  If this happens, we need to be able to post and ring
 868          * several chains here until the the entire request is complete.
 869          * NOTE ALSO:  the term "chain" is used to differentiate it from
 870          * Work Request List passed in; and because that's the terminology
 871          * from the previous generations of HCA - but the WQEs are not, in fact
 872          * chained together for Hermon
 873          */
 874 
 875         wrindx = 0;
 876         numremain = num_wr;
 877         status    = DDI_SUCCESS;
 878         while ((wrindx < num_wr) && (status == DDI_SUCCESS)) {
 879                 /*
 880                  * For the first WQE on a new chain we need "prev" to point
 881                  * to the current descriptor.
 882                  */
 883                 prev = HERMON_QP_SQ_ENTRY(qp, tail);
 884 
 885                 /*
 886                  * Break the request up into lists that are less than or
 887                  * equal to the maximum number of WQEs that can be posted
 888                  * per doorbell ring - 256 currently
 889                  */
 890                 chainlen = (numremain > HERMON_QP_MAXDESC_PER_DB) ?
 891                     HERMON_QP_MAXDESC_PER_DB : numremain;
 892                 numremain -= chainlen;
 893 
 894                 for (currindx = 0; currindx < chainlen; currindx++, wrindx++) {
 895                         /*
 896                          * Check for "queue full" condition.  If the queue
 897                          * is already full, then no more WQEs can be posted.
 898                          * So break out, ring a doorbell (if necessary) and
 899                          * return an error
 900                          */
 901                         if (wq->wq_full != 0) {
 902                                 status = IBT_QP_FULL;
 903                                 break;
 904                         }
 905 
 906                         /*
 907                          * Increment the "tail index". Check for "queue
 908                          * full" condition incl. headroom.  If we detect that
 909                          * the current work request is going to fill the work
 910                          * queue, then we mark this condition and continue.
 911                          * Don't need >=, because going one-by-one we have to
 912                          * hit it exactly sooner or later
 913                          */
 914 
 915                         next_tail = (tail + 1) & qsize_msk;
 916                         if (((tail + hdrmwqes) & qsize_msk) == head) {
 917                                 wq->wq_full = 1;
 918                         }
 919 
 920                         /*
 921                          * Get the address of the location where the next
 922                          * Send WQE should be built
 923                          */
 924                         desc = HERMON_QP_SQ_ENTRY(qp, tail);
 925                         /*
 926                          * Call hermon_wqe_send_build() to build the WQE
 927                          * at the given address.  This routine uses the
 928                          * information in the ibt_send_wr_t list (wr[]) and
 929                          * returns the size of the WQE when it returns.
 930                          */
 931                         status = hermon_wqe_send_build(state, qp,
 932                             &wr[wrindx], desc, &desc_sz);
 933                         if (status != DDI_SUCCESS) {
 934                                 break;
 935                         }
 936 
 937                         /*
 938                          * Now, build the Ctrl Segment based on
 939                          * what was just done
 940                          */
 941                         curr_wr = &wr[wrindx];
 942 
 943                         switch (curr_wr->wr_opcode) {
 944                         case IBT_WRC_RDMAW:
 945                                 if (curr_wr->wr_flags & IBT_WR_SEND_IMMED) {
 946                                         nopcode =
 947                                             HERMON_WQE_SEND_NOPCODE_RDMAWI;
 948                                         immed_data =
 949                                             hermon_wr_get_immediate(curr_wr);
 950                                 } else {
 951                                         nopcode = HERMON_WQE_SEND_NOPCODE_RDMAW;
 952                                 }
 953                                 break;
 954 
 955                         case IBT_WRC_SEND:
 956                                 if (curr_wr->wr_flags & IBT_WR_SEND_IMMED) {
 957                                         nopcode = HERMON_WQE_SEND_NOPCODE_SENDI;
 958                                         immed_data =
 959                                             hermon_wr_get_immediate(curr_wr);
 960                                 } else {
 961                                         nopcode = HERMON_WQE_SEND_NOPCODE_SEND;
 962                                 }
 963                                 break;
 964 
 965                         case IBT_WRC_SEND_LSO:
 966                                 nopcode = HERMON_WQE_SEND_NOPCODE_LSO;
 967                                 break;
 968 
 969                         case IBT_WRC_RDMAR:
 970                                 nopcode = HERMON_WQE_SEND_NOPCODE_RDMAR;
 971                                 break;
 972 
 973                         case IBT_WRC_CSWAP:
 974                                 nopcode = HERMON_WQE_SEND_NOPCODE_ATMCS;
 975                                 break;
 976 
 977                         case IBT_WRC_FADD:
 978                                 nopcode = HERMON_WQE_SEND_NOPCODE_ATMFA;
 979                                 break;
 980 
 981                         case IBT_WRC_BIND:
 982                                 nopcode = HERMON_WQE_SEND_NOPCODE_BIND;
 983                                 break;
 984                         }
 985 
 986                         fence = (curr_wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0;
 987 
 988                         /*
 989                          * now, build up the control segment, leaving the
 990                          * owner bit as it is
 991                          */
 992 
 993                         if ((qp->qp_sq_sigtype == HERMON_QP_SQ_ALL_SIGNALED) ||
 994                             (curr_wr->wr_flags & IBT_WR_SEND_SIGNAL)) {
 995                                 signaled_dbd = 0xC;
 996                         } else {
 997                                 signaled_dbd = 0;
 998                         }
 999                         if (curr_wr->wr_flags & IBT_WR_SEND_SOLICIT)
1000                                 solicited = 0x2;
1001                         else
1002                                 solicited = 0;
1003 
1004                         if (qp->qp_is_special) {
1005                                 /* Ensure correctness, set the ReRead bit */
1006                                 nopcode |= (1 << 6);
1007                                 ah = (hermon_ahhdl_t)
1008                                     curr_wr->wr.ud.udwr_dest->ud_ah;
1009                                 mutex_enter(&ah->ah_lock);
1010                                 maxstat = ah->ah_udav->max_stat_rate;
1011                                 HERMON_WQE_SET_MLX_CTRL_SEGMENT(desc, desc_sz,
1012                                     signaled_dbd, maxstat, ah->ah_udav->rlid,
1013                                     qp, ah->ah_udav->sl);
1014                                 mutex_exit(&ah->ah_lock);
1015                         } else {
1016                                 HERMON_WQE_SET_CTRL_SEGMENT(desc, desc_sz,
1017                                     fence, immed_data, solicited,
1018                                     signaled_dbd, 0, qp, 0, 0);
1019                         }
1020                         wq->wq_wrid[tail] = curr_wr->wr_id;
1021 
1022                         /*
1023                          * If this is not the first descriptor on the current
1024                          * chain, then set the ownership bit.
1025                          */
1026                         if (currindx != 0) {            /* not the first */
1027                                 membar_producer();
1028                                 HERMON_SET_SEND_WQE_OWNER(qp,
1029                                     (uint32_t *)desc, nopcode);
1030                         } else
1031                                 prev_nopcode = nopcode;
1032 
1033                         /*
1034                          * Update the current "tail index" and increment
1035                          * "posted_cnt"
1036                          */
1037                         tail = next_tail;
1038                         posted_cnt++;
1039                 }
1040 
1041                 /*
1042                  * If we reach here and there are one or more WQEs which have
1043                  * been successfully built as a chain, we have to finish up
1044                  * and prepare them for writing to the HW
1045                  * The steps are:
1046                  *      1. do the headroom fixup
1047                  *      2. add in the size of the headroom for the sync
1048                  *      3. write the owner bit for the first WQE
1049                  *      4. sync them
1050                  *      5. fix up the structures
1051                  *      6. hit the doorbell in UAR
1052                  */
1053                 if (posted_cnt != 0) {
1054                         ddi_acc_handle_t uarhdl = hermon_get_uarhdl(state);
1055 
1056                         /* do the invalidate of the headroom */
1057 
1058                         hermon_wqe_headroom(tail, qp);
1059 
1060                         /* Update some of the state in the QP */
1061                         wq->wq_tail = tail;
1062                         total_posted += posted_cnt;
1063                         posted_cnt = 0;
1064 
1065                         membar_producer();
1066 
1067                         /*
1068                          * Now set the ownership bit of the first
1069                          * one in the chain
1070                          */
1071                         HERMON_SET_SEND_WQE_OWNER(qp, (uint32_t *)prev,
1072                             prev_nopcode);
1073 
1074                         /* the FMA retry loop starts for Hermon doorbell. */
1075                         hermon_pio_start(state, uarhdl, pio_error, fm_loop_cnt,
1076                             fm_status, fm_test);
1077 
1078                         HERMON_UAR_DOORBELL(state, uarhdl,
1079                             (uint64_t *)(void *)&state->hs_uar->send,
1080                             (uint64_t)qp->qp_ring);
1081 
1082                         /* the FMA retry loop ends. */
1083                         hermon_pio_end(state, uarhdl, pio_error, fm_loop_cnt,
1084                             fm_status, fm_test);
1085                 }
1086         }
1087 
1088         /*
1089          * Update the "num_posted" return value (if necessary).
1090          * Then drop the locks and return success.
1091          */
1092         if (num_posted != NULL) {
1093                 *num_posted = total_posted;
1094         }
1095         mutex_exit(&qp->qp_sq_lock);
1096         return (status);
1097 
1098 pio_error:
1099         mutex_exit(&qp->qp_sq_lock);
1100         hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
1101         return (ibc_get_ci_failure(0));
1102 }
1103 
1104 
1105 /*
1106  * hermon_post_recv()
1107  *    Context: Can be called from interrupt or base context.
1108  */
1109 int
1110 hermon_post_recv(hermon_state_t *state, hermon_qphdl_t qp,
1111     ibt_recv_wr_t *wr, uint_t num_wr, uint_t *num_posted)
1112 {
1113         uint64_t                        *desc;
1114         hermon_workq_hdr_t              *wq;
1115         uint32_t                        head, tail, next_tail, qsize_msk;
1116         uint_t                          wrindx;
1117         uint_t                          posted_cnt;
1118         int                             status;
1119 
1120         /*
1121          * Check for user-mappable QP memory.  Note:  We do not allow kernel
1122          * clients to post to QP memory that is accessible directly by the
1123          * user.  If the QP memory is user accessible, then return an error.
1124          */
1125         if (qp->qp_alloc_flags & IBT_QP_USER_MAP) {
1126                 return (IBT_QP_HDL_INVALID);
1127         }
1128 
1129         /* Initialize posted_cnt */
1130         posted_cnt = 0;
1131 
1132         mutex_enter(&qp->qp_lock);
1133 
1134         /*
1135          * Check if QP is associated with an SRQ
1136          */
1137         if (qp->qp_alloc_flags & IBT_QP_USES_SRQ) {
1138                 mutex_exit(&qp->qp_lock);
1139                 return (IBT_SRQ_IN_USE);
1140         }
1141 
1142         /*
1143          * Check QP state.  Can not post Recv requests from the "Reset" state
1144          */
1145         if (qp->qp_state == HERMON_QP_RESET) {
1146                 mutex_exit(&qp->qp_lock);
1147                 return (IBT_QP_STATE_INVALID);
1148         }
1149 
1150         /* Check that work request transport type is valid */
1151         if ((qp->qp_type != IBT_UD_RQP) &&
1152             (qp->qp_serv_type != HERMON_QP_RC) &&
1153             (qp->qp_serv_type != HERMON_QP_UC)) {
1154                 mutex_exit(&qp->qp_lock);
1155                 return (IBT_QP_SRV_TYPE_INVALID);
1156         }
1157 
1158         /*
1159          * Grab the lock for the WRID list, i.e., membar_consumer().
1160          * This is not needed because the mutex_enter() above has
1161          * the same effect.
1162          */
1163 
1164         /* Save away some initial QP state */
1165         wq = qp->qp_rq_wqhdr;
1166         qsize_msk = wq->wq_mask;
1167         tail      = wq->wq_tail;
1168         head      = wq->wq_head;
1169 
1170         wrindx = 0;
1171         status    = DDI_SUCCESS;
1172 
1173         for (wrindx = 0; wrindx < num_wr; wrindx++) {
1174                 if (wq->wq_full != 0) {
1175                         status = IBT_QP_FULL;
1176                         break;
1177                 }
1178                 next_tail = (tail + 1) & qsize_msk;
1179                 if (next_tail == head) {
1180                         wq->wq_full = 1;
1181                 }
1182                 desc = HERMON_QP_RQ_ENTRY(qp, tail);
1183                 status = hermon_wqe_recv_build(state, qp, &wr[wrindx], desc);
1184                 if (status != DDI_SUCCESS) {
1185                         break;
1186                 }
1187 
1188                 wq->wq_wrid[tail] = wr[wrindx].wr_id;
1189                 qp->qp_rq_wqecntr++;
1190 
1191                 tail = next_tail;
1192                 posted_cnt++;
1193         }
1194 
1195         if (posted_cnt != 0) {
1196 
1197                 wq->wq_tail = tail;
1198 
1199                 membar_producer();      /* ensure wrids are visible */
1200 
1201                 /* Update the doorbell record w/ wqecntr */
1202                 HERMON_UAR_DB_RECORD_WRITE(qp->qp_rq_vdbr,
1203                     qp->qp_rq_wqecntr & 0xFFFF);
1204         }
1205 
1206         if (num_posted != NULL) {
1207                 *num_posted = posted_cnt;
1208         }
1209 
1210 
1211         mutex_exit(&qp->qp_lock);
1212         return (status);
1213 }
1214 
1215 /*
1216  * hermon_post_srq()
1217  *    Context: Can be called from interrupt or base context.
1218  */
1219 int
1220 hermon_post_srq(hermon_state_t *state, hermon_srqhdl_t srq,
1221     ibt_recv_wr_t *wr, uint_t num_wr, uint_t *num_posted)
1222 {
1223         uint64_t                        *desc;
1224         hermon_workq_hdr_t              *wq;
1225         uint_t                          indx, wrindx;
1226         uint_t                          posted_cnt;
1227         int                             status;
1228 
1229         mutex_enter(&srq->srq_lock);
1230 
1231         /*
1232          * Check for user-mappable QP memory.  Note:  We do not allow kernel
1233          * clients to post to QP memory that is accessible directly by the
1234          * user.  If the QP memory is user accessible, then return an error.
1235          */
1236         if (srq->srq_is_umap) {
1237                 mutex_exit(&srq->srq_lock);
1238                 return (IBT_SRQ_HDL_INVALID);
1239         }
1240 
1241         /*
1242          * Check SRQ state.  Can not post Recv requests when SRQ is in error
1243          */
1244         if (srq->srq_state == HERMON_SRQ_STATE_ERROR) {
1245                 mutex_exit(&srq->srq_lock);
1246                 return (IBT_QP_STATE_INVALID);
1247         }
1248 
1249         status = DDI_SUCCESS;
1250         posted_cnt = 0;
1251         wq = srq->srq_wq_wqhdr;
1252         indx = wq->wq_head;
1253 
1254         for (wrindx = 0; wrindx < num_wr; wrindx++) {
1255 
1256                 if (indx == wq->wq_tail) {
1257                         status = IBT_QP_FULL;
1258                         break;
1259                 }
1260                 desc = HERMON_SRQ_WQE_ADDR(srq, indx);
1261 
1262                 wq->wq_wrid[indx] = wr[wrindx].wr_id;
1263 
1264                 status = hermon_wqe_srq_build(state, srq, &wr[wrindx], desc);
1265                 if (status != DDI_SUCCESS) {
1266                         break;
1267                 }
1268 
1269                 posted_cnt++;
1270                 indx = htons(((uint16_t *)desc)[1]);
1271                 wq->wq_head = indx;
1272         }
1273 
1274         if (posted_cnt != 0) {
1275 
1276                 srq->srq_wq_wqecntr += posted_cnt;
1277 
1278                 membar_producer();      /* ensure wrids are visible */
1279 
1280                 /* Ring the doorbell w/ wqecntr */
1281                 HERMON_UAR_DB_RECORD_WRITE(srq->srq_wq_vdbr,
1282                     srq->srq_wq_wqecntr & 0xFFFF);
1283         }
1284 
1285         if (num_posted != NULL) {
1286                 *num_posted = posted_cnt;
1287         }
1288 
1289         mutex_exit(&srq->srq_lock);
1290         return (status);
1291 }
1292 
1293 
1294 /*
1295  * hermon_wqe_send_build()
1296  *    Context: Can be called from interrupt or base context.
1297  */
1298 static int
1299 hermon_wqe_send_build(hermon_state_t *state, hermon_qphdl_t qp,
1300     ibt_send_wr_t *wr, uint64_t *desc, uint_t *size)
1301 {
1302         hermon_hw_snd_wqe_ud_t          *ud;
1303         hermon_hw_snd_wqe_remaddr_t     *rc;
1304         hermon_hw_snd_wqe_atomic_t      *at;
1305         hermon_hw_snd_wqe_remaddr_t     *uc;
1306         hermon_hw_snd_wqe_bind_t        *bn;
1307         hermon_hw_wqe_sgl_t             *ds, *old_ds;
1308         ibt_ud_dest_t                   *dest;
1309         ibt_wr_ds_t                     *sgl;
1310         hermon_ahhdl_t                  ah;
1311         uint32_t                        nds;
1312         int                             i, j, last_ds, num_ds, status;
1313         int                             tmpsize;
1314 
1315         ASSERT(MUTEX_HELD(&qp->qp_sq_lock));
1316 
1317         /* Initialize the information for the Data Segments */
1318         ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)desc +
1319             sizeof (hermon_hw_snd_wqe_ctrl_t));
1320         nds = wr->wr_nds;
1321         sgl = wr->wr_sgl;
1322         num_ds = 0;
1323         i = 0;
1324 
1325         /*
1326          * Build a Send WQE depends first and foremost on the transport
1327          * type of Work Request (i.e. UD, RC, or UC)
1328          */
1329         switch (wr->wr_trans) {
1330         case IBT_UD_SRV:
1331                 /* Ensure that work request transport type matches QP type */
1332                 if (qp->qp_serv_type != HERMON_QP_UD) {
1333                         return (IBT_QP_SRV_TYPE_INVALID);
1334                 }
1335 
1336                 /*
1337                  * Validate the operation type.  For UD requests, only the
1338                  * "Send" and "Send LSO" operations are valid.
1339                  */
1340                 if (wr->wr_opcode != IBT_WRC_SEND &&
1341                     wr->wr_opcode != IBT_WRC_SEND_LSO) {
1342                         return (IBT_QP_OP_TYPE_INVALID);
1343                 }
1344 
1345                 /*
1346                  * If this is a Special QP (QP0 or QP1), then we need to
1347                  * build MLX WQEs instead.  So jump to hermon_wqe_mlx_build()
1348                  * and return whatever status it returns
1349                  */
1350                 if (qp->qp_is_special) {
1351                         if (wr->wr_opcode == IBT_WRC_SEND_LSO) {
1352                                 return (IBT_QP_OP_TYPE_INVALID);
1353                         }
1354                         status = hermon_wqe_mlx_build(state, qp,
1355                             wr, desc, size);
1356                         return (status);
1357                 }
1358 
1359                 /*
1360                  * Otherwise, if this is a normal UD Send request, then fill
1361                  * all the fields in the Hermon UD header for the WQE.  Note:
1362                  * to do this we'll need to extract some information from the
1363                  * Address Handle passed with the work request.
1364                  */
1365                 ud = (hermon_hw_snd_wqe_ud_t *)((uintptr_t)desc +
1366                     sizeof (hermon_hw_snd_wqe_ctrl_t));
1367                 if (wr->wr_opcode == IBT_WRC_SEND) {
1368                         dest = wr->wr.ud.udwr_dest;
1369                 } else {
1370                         dest = wr->wr.ud_lso.lso_ud_dest;
1371                 }
1372                 ah = (hermon_ahhdl_t)dest->ud_ah;
1373                 if (ah == NULL) {
1374                         return (IBT_AH_HDL_INVALID);
1375                 }
1376 
1377                 /*
1378                  * Build the Unreliable Datagram Segment for the WQE, using
1379                  * the information from the address handle and the work
1380                  * request.
1381                  */
1382                 /* mutex_enter(&ah->ah_lock); */
1383                 if (wr->wr_opcode == IBT_WRC_SEND) {
1384                         HERMON_WQE_BUILD_UD(qp, ud, ah, wr->wr.ud.udwr_dest);
1385                 } else {        /* IBT_WRC_SEND_LSO */
1386                         HERMON_WQE_BUILD_UD(qp, ud, ah,
1387                             wr->wr.ud_lso.lso_ud_dest);
1388                 }
1389                 /* mutex_exit(&ah->ah_lock); */
1390 
1391                 /* Update "ds" for filling in Data Segments (below) */
1392                 ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)ud +
1393                     sizeof (hermon_hw_snd_wqe_ud_t));
1394 
1395                 if (wr->wr_opcode == IBT_WRC_SEND_LSO) {
1396                         int total_len;
1397 
1398                         total_len = (4 + 0xf + wr->wr.ud_lso.lso_hdr_sz) & ~0xf;
1399                         if ((uintptr_t)ds + total_len + (nds * 16) >
1400                             (uintptr_t)desc + (1 << qp->qp_sq_log_wqesz))
1401                                 return (IBT_QP_SGL_LEN_INVALID);
1402 
1403                         bcopy(wr->wr.ud_lso.lso_hdr, (uint32_t *)ds + 1,
1404                             wr->wr.ud_lso.lso_hdr_sz);
1405                         old_ds = ds;
1406                         ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)ds + total_len);
1407                         for (; i < nds; i++) {
1408                                 if (sgl[i].ds_len == 0)
1409                                         continue;
1410                                 HERMON_WQE_BUILD_DATA_SEG_SEND(&ds[num_ds],
1411                                     &sgl[i]);
1412                                 num_ds++;
1413                                 i++;
1414                                 break;
1415                         }
1416                         membar_producer();
1417                         HERMON_WQE_BUILD_LSO(qp, old_ds, wr->wr.ud_lso.lso_mss,
1418                             wr->wr.ud_lso.lso_hdr_sz);
1419                 }
1420 
1421                 break;
1422 
1423         case IBT_RC_SRV:
1424                 /* Ensure that work request transport type matches QP type */
1425                 if (qp->qp_serv_type != HERMON_QP_RC) {
1426                         return (IBT_QP_SRV_TYPE_INVALID);
1427                 }
1428 
1429                 /*
1430                  * Validate the operation type.  For RC requests, we allow
1431                  * "Send", "RDMA Read", "RDMA Write", various "Atomic"
1432                  * operations, and memory window "Bind"
1433                  */
1434                 if ((wr->wr_opcode != IBT_WRC_SEND) &&
1435                     (wr->wr_opcode != IBT_WRC_RDMAR) &&
1436                     (wr->wr_opcode != IBT_WRC_RDMAW) &&
1437                     (wr->wr_opcode != IBT_WRC_CSWAP) &&
1438                     (wr->wr_opcode != IBT_WRC_FADD) &&
1439                     (wr->wr_opcode != IBT_WRC_BIND)) {
1440                         return (IBT_QP_OP_TYPE_INVALID);
1441                 }
1442 
1443                 /*
1444                  * If this is a Send request, then all we need to do is break
1445                  * out and here and begin the Data Segment processing below
1446                  */
1447                 if (wr->wr_opcode == IBT_WRC_SEND) {
1448                         break;
1449                 }
1450 
1451                 /*
1452                  * If this is an RDMA Read or RDMA Write request, then fill
1453                  * in the "Remote Address" header fields.
1454                  */
1455                 if ((wr->wr_opcode == IBT_WRC_RDMAR) ||
1456                     (wr->wr_opcode == IBT_WRC_RDMAW)) {
1457                         rc = (hermon_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1458                             sizeof (hermon_hw_snd_wqe_ctrl_t));
1459 
1460                         /*
1461                          * Build the Remote Address Segment for the WQE, using
1462                          * the information from the RC work request.
1463                          */
1464                         HERMON_WQE_BUILD_REMADDR(qp, rc, &wr->wr.rc.rcwr.rdma);
1465 
1466                         /* Update "ds" for filling in Data Segments (below) */
1467                         ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)rc +
1468                             sizeof (hermon_hw_snd_wqe_remaddr_t));
1469                         break;
1470                 }
1471 
1472                 /*
1473                  * If this is one of the Atomic type operations (i.e
1474                  * Compare-Swap or Fetch-Add), then fill in both the "Remote
1475                  * Address" header fields and the "Atomic" header fields.
1476                  */
1477                 if ((wr->wr_opcode == IBT_WRC_CSWAP) ||
1478                     (wr->wr_opcode == IBT_WRC_FADD)) {
1479                         rc = (hermon_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1480                             sizeof (hermon_hw_snd_wqe_ctrl_t));
1481                         at = (hermon_hw_snd_wqe_atomic_t *)((uintptr_t)rc +
1482                             sizeof (hermon_hw_snd_wqe_remaddr_t));
1483 
1484                         /*
1485                          * Build the Remote Address and Atomic Segments for
1486                          * the WQE, using the information from the RC Atomic
1487                          * work request.
1488                          */
1489                         HERMON_WQE_BUILD_RC_ATOMIC_REMADDR(qp, rc, wr);
1490                         HERMON_WQE_BUILD_ATOMIC(qp, at, wr->wr.rc.rcwr.atomic);
1491 
1492                         /* Update "ds" for filling in Data Segments (below) */
1493                         ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)at +
1494                             sizeof (hermon_hw_snd_wqe_atomic_t));
1495 
1496                         /*
1497                          * Update "nds" and "sgl" because Atomic requests have
1498                          * only a single Data Segment (and they are encoded
1499                          * somewhat differently in the work request.
1500                          */
1501                         nds = 1;
1502                         sgl = wr->wr_sgl;
1503                         break;
1504                 }
1505 
1506                 /*
1507                  * If this is memory window Bind operation, then we call the
1508                  * hermon_wr_bind_check() routine to validate the request and
1509                  * to generate the updated RKey.  If this is successful, then
1510                  * we fill in the WQE's "Bind" header fields.
1511                  */
1512                 if (wr->wr_opcode == IBT_WRC_BIND) {
1513                         status = hermon_wr_bind_check(state, wr);
1514                         if (status != DDI_SUCCESS) {
1515                                 return (status);
1516                         }
1517 
1518                         bn = (hermon_hw_snd_wqe_bind_t *)((uintptr_t)desc +
1519                             sizeof (hermon_hw_snd_wqe_ctrl_t));
1520 
1521                         /*
1522                          * Build the Bind Memory Window Segments for the WQE,
1523                          * using the information from the RC Bind memory
1524                          * window work request.
1525                          */
1526                         HERMON_WQE_BUILD_BIND(qp, bn, wr->wr.rc.rcwr.bind);
1527 
1528                         /*
1529                          * Update the "ds" pointer.  Even though the "bind"
1530                          * operation requires no SGLs, this is necessary to
1531                          * facilitate the correct descriptor size calculations
1532                          * (below).
1533                          */
1534                         ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)bn +
1535                             sizeof (hermon_hw_snd_wqe_bind_t));
1536                         nds = 0;
1537                 }
1538                 break;
1539 
1540         case IBT_UC_SRV:
1541                 /* Ensure that work request transport type matches QP type */
1542                 if (qp->qp_serv_type != HERMON_QP_UC) {
1543                         return (IBT_QP_SRV_TYPE_INVALID);
1544                 }
1545 
1546                 /*
1547                  * Validate the operation type.  For UC requests, we only
1548                  * allow "Send", "RDMA Write", and memory window "Bind".
1549                  * Note: Unlike RC, UC does not allow "RDMA Read" or "Atomic"
1550                  * operations
1551                  */
1552                 if ((wr->wr_opcode != IBT_WRC_SEND) &&
1553                     (wr->wr_opcode != IBT_WRC_RDMAW) &&
1554                     (wr->wr_opcode != IBT_WRC_BIND)) {
1555                         return (IBT_QP_OP_TYPE_INVALID);
1556                 }
1557 
1558                 /*
1559                  * If this is a Send request, then all we need to do is break
1560                  * out and here and begin the Data Segment processing below
1561                  */
1562                 if (wr->wr_opcode == IBT_WRC_SEND) {
1563                         break;
1564                 }
1565 
1566                 /*
1567                  * If this is an RDMA Write request, then fill in the "Remote
1568                  * Address" header fields.
1569                  */
1570                 if (wr->wr_opcode == IBT_WRC_RDMAW) {
1571                         uc = (hermon_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1572                             sizeof (hermon_hw_snd_wqe_ctrl_t));
1573 
1574                         /*
1575                          * Build the Remote Address Segment for the WQE, using
1576                          * the information from the UC work request.
1577                          */
1578                         HERMON_WQE_BUILD_REMADDR(qp, uc, &wr->wr.uc.ucwr.rdma);
1579 
1580                         /* Update "ds" for filling in Data Segments (below) */
1581                         ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)uc +
1582                             sizeof (hermon_hw_snd_wqe_remaddr_t));
1583                         break;
1584                 }
1585 
1586                 /*
1587                  * If this is memory window Bind operation, then we call the
1588                  * hermon_wr_bind_check() routine to validate the request and
1589                  * to generate the updated RKey.  If this is successful, then
1590                  * we fill in the WQE's "Bind" header fields.
1591                  */
1592                 if (wr->wr_opcode == IBT_WRC_BIND) {
1593                         status = hermon_wr_bind_check(state, wr);
1594                         if (status != DDI_SUCCESS) {
1595                                 return (status);
1596                         }
1597 
1598                         bn = (hermon_hw_snd_wqe_bind_t *)((uintptr_t)desc +
1599                             sizeof (hermon_hw_snd_wqe_ctrl_t));
1600 
1601                         /*
1602                          * Build the Bind Memory Window Segments for the WQE,
1603                          * using the information from the UC Bind memory
1604                          * window work request.
1605                          */
1606                         HERMON_WQE_BUILD_BIND(qp, bn, wr->wr.uc.ucwr.bind);
1607 
1608                         /*
1609                          * Update the "ds" pointer.  Even though the "bind"
1610                          * operation requires no SGLs, this is necessary to
1611                          * facilitate the correct descriptor size calculations
1612                          * (below).
1613                          */
1614                         ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)bn +
1615                             sizeof (hermon_hw_snd_wqe_bind_t));
1616                         nds = 0;
1617                 }
1618                 break;
1619 
1620         default:
1621                 return (IBT_QP_SRV_TYPE_INVALID);
1622         }
1623 
1624         /*
1625          * Now fill in the Data Segments (SGL) for the Send WQE based on
1626          * the values setup above (i.e. "sgl", "nds", and the "ds" pointer
1627          * Start by checking for a valid number of SGL entries
1628          */
1629         if (nds > qp->qp_sq_sgl) {
1630                 return (IBT_QP_SGL_LEN_INVALID);
1631         }
1632 
1633         /*
1634          * For each SGL in the Send Work Request, fill in the Send WQE's data
1635          * segments.  Note: We skip any SGL with zero size because Hermon
1636          * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1637          * the encoding for zero means a 2GB transfer.
1638          */
1639         for (last_ds = num_ds, j = i; j < nds; j++) {
1640                 if (sgl[j].ds_len != 0)
1641                         last_ds++;      /* real last ds of wqe to fill */
1642         }
1643 
1644         /*
1645          * Return the size of descriptor (in 16-byte chunks)
1646          * For Hermon, we want them (for now) to be on stride size
1647          * boundaries, which was implicit in Tavor/Arbel
1648          *
1649          */
1650         tmpsize = ((uintptr_t)&ds[last_ds] - (uintptr_t)desc);
1651 
1652         *size = tmpsize >> 0x4;
1653 
1654         for (j = nds; --j >= i; ) {
1655                 if (sgl[j].ds_len == 0) {
1656                         continue;
1657                 }
1658 
1659                 /*
1660                  * Fill in the Data Segment(s) for the current WQE, using the
1661                  * information contained in the scatter-gather list of the
1662                  * work request.
1663                  */
1664                 last_ds--;
1665                 HERMON_WQE_BUILD_DATA_SEG_SEND(&ds[last_ds], &sgl[j]);
1666         }
1667 
1668         return (DDI_SUCCESS);
1669 }
1670 
1671 
1672 
1673 /*
1674  * hermon_wqe_mlx_build()
1675  *    Context: Can be called from interrupt or base context.
1676  */
1677 static int
1678 hermon_wqe_mlx_build(hermon_state_t *state, hermon_qphdl_t qp,
1679     ibt_send_wr_t *wr, uint64_t *desc, uint_t *size)
1680 {
1681         hermon_ahhdl_t          ah;
1682         hermon_hw_udav_t        *udav;
1683         ib_lrh_hdr_t            *lrh;
1684         ib_grh_t                *grh;
1685         ib_bth_hdr_t            *bth;
1686         ib_deth_hdr_t           *deth;
1687         hermon_hw_wqe_sgl_t     *ds;
1688         ibt_wr_ds_t             *sgl;
1689         uint8_t                 *mgmtclass, *hpoint, *hcount;
1690         uint32_t                nds, offset, pktlen;
1691         uint32_t                desc_sz;
1692         int                     i, num_ds;
1693         int                     tmpsize;
1694 
1695         ASSERT(MUTEX_HELD(&qp->qp_sq_lock));
1696 
1697         /* Initialize the information for the Data Segments */
1698         ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)desc +
1699             sizeof (hermon_hw_mlx_wqe_nextctrl_t));
1700 
1701         /*
1702          * Pull the address handle from the work request. The UDAV will
1703          * be used to answer some questions about the request.
1704          */
1705         ah = (hermon_ahhdl_t)wr->wr.ud.udwr_dest->ud_ah;
1706         if (ah == NULL) {
1707                 return (IBT_AH_HDL_INVALID);
1708         }
1709         mutex_enter(&ah->ah_lock);
1710         udav = ah->ah_udav;
1711 
1712         /*
1713          * If the request is for QP1 and the destination LID is equal to
1714          * the Permissive LID, then return an error.  This combination is
1715          * not allowed
1716          */
1717         if ((udav->rlid == IB_LID_PERMISSIVE) &&
1718             (qp->qp_is_special == HERMON_QP_GSI)) {
1719                 mutex_exit(&ah->ah_lock);
1720                 return (IBT_AH_HDL_INVALID);
1721         }
1722 
1723         /*
1724          * Calculate the size of the packet headers, including the GRH
1725          * (if necessary)
1726          */
1727         desc_sz = sizeof (ib_lrh_hdr_t) + sizeof (ib_bth_hdr_t) +
1728             sizeof (ib_deth_hdr_t);
1729         if (udav->grh) {
1730                 desc_sz += sizeof (ib_grh_t);
1731         }
1732 
1733         /*
1734          * Begin to build the first "inline" data segment for the packet
1735          * headers.  Note:  By specifying "inline" we can build the contents
1736          * of the MAD packet headers directly into the work queue (as part
1737          * descriptor).  This has the advantage of both speeding things up
1738          * and of not requiring the driver to allocate/register any additional
1739          * memory for the packet headers.
1740          */
1741         HERMON_WQE_BUILD_INLINE(qp, &ds[0], desc_sz);
1742         desc_sz += 4;
1743 
1744         /*
1745          * Build Local Route Header (LRH)
1746          *    We start here by building the LRH into a temporary location.
1747          *    When we have finished we copy the LRH data into the descriptor.
1748          *
1749          *    Notice that the VL values are hardcoded.  This is not a problem
1750          *    because VL15 is decided later based on the value in the MLX
1751          *    transport "next/ctrl" header (see the "vl15" bit below), and it
1752          *    is otherwise (meaning for QP1) chosen from the SL-to-VL table
1753          *    values.  This rule does not hold for loopback packets however
1754          *    (all of which bypass the SL-to-VL tables) and it is the reason
1755          *    that non-QP0 MADs are setup with VL hardcoded to zero below.
1756          *
1757          *    Notice also that Source LID is hardcoded to the Permissive LID
1758          *    (0xFFFF).  This is also not a problem because if the Destination
1759          *    LID is not the Permissive LID, then the "slr" value in the MLX
1760          *    transport "next/ctrl" header will be set to zero and the hardware
1761          *    will pull the LID from value in the port.
1762          */
1763         lrh = (ib_lrh_hdr_t *)((uintptr_t)&ds[0] + 4);
1764         pktlen = (desc_sz + 0x100) >> 2;
1765         HERMON_WQE_BUILD_MLX_LRH(lrh, qp, udav, pktlen);
1766 
1767         /*
1768          * Build Global Route Header (GRH)
1769          *    This is only built if necessary as defined by the "grh" bit in
1770          *    the address vector.  Note:  We also calculate the offset to the
1771          *    next header (BTH) based on whether or not the "grh" bit is set.
1772          */
1773         if (udav->grh) {
1774                 /*
1775                  * If the request is for QP0, then return an error.  The
1776                  * combination of global routine (GRH) and QP0 is not allowed.
1777                  */
1778                 if (qp->qp_is_special == HERMON_QP_SMI) {
1779                         mutex_exit(&ah->ah_lock);
1780                         return (IBT_AH_HDL_INVALID);
1781                 }
1782                 grh = (ib_grh_t *)((uintptr_t)lrh + sizeof (ib_lrh_hdr_t));
1783                 HERMON_WQE_BUILD_MLX_GRH(state, grh, qp, udav, pktlen);
1784 
1785                 bth = (ib_bth_hdr_t *)((uintptr_t)grh + sizeof (ib_grh_t));
1786         } else {
1787                 bth = (ib_bth_hdr_t *)((uintptr_t)lrh + sizeof (ib_lrh_hdr_t));
1788         }
1789         mutex_exit(&ah->ah_lock);
1790 
1791 
1792         /*
1793          * Build Base Transport Header (BTH)
1794          *    Notice that the M, PadCnt, and TVer fields are all set
1795          *    to zero implicitly.  This is true for all Management Datagrams
1796          *    MADs whether GSI are SMI.
1797          */
1798         HERMON_WQE_BUILD_MLX_BTH(state, bth, qp, wr);
1799 
1800         /*
1801          * Build Datagram Extended Transport Header (DETH)
1802          */
1803         deth = (ib_deth_hdr_t *)((uintptr_t)bth + sizeof (ib_bth_hdr_t));
1804         HERMON_WQE_BUILD_MLX_DETH(deth, qp);
1805 
1806         /* Ensure that the Data Segment is aligned on a 16-byte boundary */
1807         ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)deth + sizeof (ib_deth_hdr_t));
1808         ds = (hermon_hw_wqe_sgl_t *)(((uintptr_t)ds + 0xF) & ~0xF);
1809         nds = wr->wr_nds;
1810         sgl = wr->wr_sgl;
1811         num_ds = 0;
1812 
1813         /*
1814          * Now fill in the Data Segments (SGL) for the MLX WQE based on the
1815          * values set up above (i.e. "sgl", "nds", and the "ds" pointer
1816          * Start by checking for a valid number of SGL entries
1817          */
1818         if (nds > qp->qp_sq_sgl) {
1819                 return (IBT_QP_SGL_LEN_INVALID);
1820         }
1821 
1822         /*
1823          * For each SGL in the Send Work Request, fill in the MLX WQE's data
1824          * segments.  Note: We skip any SGL with zero size because Hermon
1825          * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1826          * the encoding for zero means a 2GB transfer.  Because of this special
1827          * encoding in the hardware, we mask the requested length with
1828          * HERMON_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1829          * zero.)
1830          */
1831         mgmtclass = hpoint = hcount = NULL;
1832         offset = 0;
1833         for (i = 0; i < nds; i++) {
1834                 if (sgl[i].ds_len == 0) {
1835                         continue;
1836                 }
1837 
1838                 /*
1839                  * Fill in the Data Segment(s) for the MLX send WQE, using
1840                  * the information contained in the scatter-gather list of
1841                  * the work request.
1842                  */
1843                 HERMON_WQE_BUILD_DATA_SEG_SEND(&ds[num_ds], &sgl[i]);
1844 
1845                 /*
1846                  * Search through the contents of all MADs posted to QP0 to
1847                  * initialize pointers to the places where Directed Route "hop
1848                  * pointer", "hop count", and "mgmtclass" would be.  Hermon
1849                  * needs these updated (i.e. incremented or decremented, as
1850                  * necessary) by software.
1851                  */
1852                 if (qp->qp_is_special == HERMON_QP_SMI) {
1853 
1854                         HERMON_SPECIAL_QP_DRMAD_GET_MGMTCLASS(mgmtclass,
1855                             offset, sgl[i].ds_va, sgl[i].ds_len);
1856 
1857                         HERMON_SPECIAL_QP_DRMAD_GET_HOPPOINTER(hpoint,
1858                             offset, sgl[i].ds_va, sgl[i].ds_len);
1859 
1860                         HERMON_SPECIAL_QP_DRMAD_GET_HOPCOUNT(hcount,
1861                             offset, sgl[i].ds_va, sgl[i].ds_len);
1862 
1863                         offset += sgl[i].ds_len;
1864                 }
1865                 num_ds++;
1866         }
1867 
1868         /*
1869          * Hermon's Directed Route MADs need to have the "hop pointer"
1870          * incremented/decremented (as necessary) depending on whether it is
1871          * currently less than or greater than the "hop count" (i.e. whether
1872          * the MAD is a request or a response.)
1873          */
1874         if (qp->qp_is_special == HERMON_QP_SMI) {
1875                 HERMON_SPECIAL_QP_DRMAD_DO_HOPPOINTER_MODIFY(*mgmtclass,
1876                     *hpoint, *hcount);
1877         }
1878 
1879         /*
1880          * Now fill in the ICRC Data Segment.  This data segment is inlined
1881          * just like the packets headers above, but it is only four bytes and
1882          * set to zero (to indicate that we wish the hardware to generate ICRC.
1883          */
1884         HERMON_WQE_BUILD_INLINE_ICRC(qp, &ds[num_ds], 4, 0);
1885         num_ds++;
1886 
1887         /*
1888          * Return the size of descriptor (in 16-byte chunks)
1889          * For Hermon, we want them (for now) to be on stride size
1890          * boundaries, which was implicit in Tavor/Arbel
1891          */
1892         tmpsize = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc);
1893 
1894         *size = tmpsize >> 0x04;
1895 
1896         return (DDI_SUCCESS);
1897 }
1898 
1899 
1900 
1901 /*
1902  * hermon_wqe_recv_build()
1903  *    Context: Can be called from interrupt or base context.
1904  */
1905 /* ARGSUSED */
1906 static int
1907 hermon_wqe_recv_build(hermon_state_t *state, hermon_qphdl_t qp,
1908     ibt_recv_wr_t *wr, uint64_t *desc)
1909 {
1910         hermon_hw_wqe_sgl_t     *ds;
1911         int                     i, num_ds;
1912 
1913         ASSERT(MUTEX_HELD(&qp->qp_lock));
1914 
1915         /*
1916          * Fill in the Data Segments (SGL) for the Recv WQE  - don't
1917          * need to have a reserved for the ctrl, there is none on the
1918          * recv queue for hermon, but will need to put an invalid
1919          * (null) scatter pointer per PRM
1920          */
1921         ds = (hermon_hw_wqe_sgl_t *)(uintptr_t)desc;
1922         num_ds = 0;
1923 
1924         /* Check for valid number of SGL entries */
1925         if (wr->wr_nds > qp->qp_rq_sgl) {
1926                 return (IBT_QP_SGL_LEN_INVALID);
1927         }
1928 
1929         /*
1930          * For each SGL in the Recv Work Request, fill in the Recv WQE's data
1931          * segments.  Note: We skip any SGL with zero size because Hermon
1932          * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1933          * the encoding for zero means a 2GB transfer.  Because of this special
1934          * encoding in the hardware, we mask the requested length with
1935          * HERMON_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1936          * zero.)
1937          */
1938         for (i = 0; i < wr->wr_nds; i++) {
1939                 if (wr->wr_sgl[i].ds_len == 0) {
1940                         continue;
1941                 }
1942 
1943                 /*
1944                  * Fill in the Data Segment(s) for the receive WQE, using the
1945                  * information contained in the scatter-gather list of the
1946                  * work request.
1947                  */
1948                 HERMON_WQE_BUILD_DATA_SEG_RECV(&ds[num_ds], &wr->wr_sgl[i]);
1949                 num_ds++;
1950         }
1951 
1952         /* put the null sgl pointer as well if needed */
1953         if (num_ds < qp->qp_rq_sgl) {
1954                 HERMON_WQE_BUILD_DATA_SEG_RECV(&ds[num_ds], &null_sgl);
1955         }
1956 
1957         return (DDI_SUCCESS);
1958 }
1959 
1960 
1961 
1962 /*
1963  * hermon_wqe_srq_build()
1964  *    Context: Can be called from interrupt or base context.
1965  */
1966 /* ARGSUSED */
1967 static int
1968 hermon_wqe_srq_build(hermon_state_t *state, hermon_srqhdl_t srq,
1969     ibt_recv_wr_t *wr, uint64_t *desc)
1970 {
1971         hermon_hw_wqe_sgl_t     *ds;
1972         int                     i, num_ds;
1973 
1974         ASSERT(MUTEX_HELD(&srq->srq_lock));
1975 
1976         /* Fill in the Data Segments (SGL) for the Recv WQE */
1977         ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)desc +
1978             sizeof (hermon_hw_srq_wqe_next_t));
1979         num_ds = 0;
1980 
1981         /* Check for valid number of SGL entries */
1982         if (wr->wr_nds > srq->srq_wq_sgl) {
1983                 return (IBT_QP_SGL_LEN_INVALID);
1984         }
1985 
1986         /*
1987          * For each SGL in the Recv Work Request, fill in the Recv WQE's data
1988          * segments.  Note: We skip any SGL with zero size because Hermon
1989          * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1990          * the encoding for zero means a 2GB transfer.  Because of this special
1991          * encoding in the hardware, we mask the requested length with
1992          * HERMON_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1993          * zero.)
1994          */
1995         for (i = 0; i < wr->wr_nds; i++) {
1996                 if (wr->wr_sgl[i].ds_len == 0) {
1997                         continue;
1998                 }
1999 
2000                 /*
2001                  * Fill in the Data Segment(s) for the receive WQE, using the
2002                  * information contained in the scatter-gather list of the
2003                  * work request.
2004                  */
2005                 HERMON_WQE_BUILD_DATA_SEG_RECV(&ds[num_ds], &wr->wr_sgl[i]);
2006                 num_ds++;
2007         }
2008 
2009         /*
2010          * put in the null sgl pointer as well, if needed
2011          */
2012         if (num_ds < srq->srq_wq_sgl) {
2013                 HERMON_WQE_BUILD_DATA_SEG_RECV(&ds[num_ds], &null_sgl);
2014         }
2015 
2016         return (DDI_SUCCESS);
2017 }
2018 
2019 
2020 /*
2021  * hermon_wr_get_immediate()
2022  *    Context: Can be called from interrupt or base context.
2023  */
2024 static uint32_t
2025 hermon_wr_get_immediate(ibt_send_wr_t *wr)
2026 {
2027         /*
2028          * This routine extracts the "immediate data" from the appropriate
2029          * location in the IBTF work request.  Because of the way the
2030          * work request structure is defined, the location for this data
2031          * depends on the actual work request operation type.
2032          */
2033 
2034         /* For RDMA Write, test if RC or UC */
2035         if (wr->wr_opcode == IBT_WRC_RDMAW) {
2036                 if (wr->wr_trans == IBT_RC_SRV) {
2037                         return (wr->wr.rc.rcwr.rdma.rdma_immed);
2038                 } else {  /* IBT_UC_SRV */
2039                         return (wr->wr.uc.ucwr.rdma.rdma_immed);
2040                 }
2041         }
2042 
2043         /* For Send, test if RC, UD, or UC */
2044         if (wr->wr_opcode == IBT_WRC_SEND) {
2045                 if (wr->wr_trans == IBT_RC_SRV) {
2046                         return (wr->wr.rc.rcwr.send_immed);
2047                 } else if (wr->wr_trans == IBT_UD_SRV) {
2048                         return (wr->wr.ud.udwr_immed);
2049                 } else {  /* IBT_UC_SRV */
2050                         return (wr->wr.uc.ucwr.send_immed);
2051                 }
2052         }
2053 
2054         /*
2055          * If any other type of request, then immediate is undefined
2056          */
2057         return (0);
2058 }
2059 
2060 /*
2061  * hermon_wqe_headroom()
2062  *      Context: can be called from interrupt or base, currently only from
2063  *      base context.
2064  * Routine that fills in the headroom for the Send Queue
2065  */
2066 
2067 static void
2068 hermon_wqe_headroom(uint_t from, hermon_qphdl_t qp)
2069 {
2070         uint32_t        *wqe_start, *wqe_top, *wqe_base, qsize;
2071         int             hdrmwqes, wqesizebytes, sectperwqe;
2072         uint32_t        invalue;
2073         int             i, j;
2074 
2075         qsize    = qp->qp_sq_bufsz;
2076         wqesizebytes = 1 << qp->qp_sq_log_wqesz;
2077         sectperwqe = wqesizebytes >> 6;   /* 64 bytes/section */
2078         hdrmwqes = qp->qp_sq_hdrmwqes;
2079         wqe_base  = (uint32_t *)HERMON_QP_SQ_ENTRY(qp, 0);
2080         wqe_top   = (uint32_t *)HERMON_QP_SQ_ENTRY(qp, qsize);
2081         wqe_start = (uint32_t *)HERMON_QP_SQ_ENTRY(qp, from);
2082 
2083         for (i = 0; i < hdrmwqes; i++)       {
2084                 for (j = 0; j < sectperwqe; j++) {
2085                         if (j == 0) {           /* 1st section of wqe */
2086                                 /* perserve ownership bit */
2087                                 invalue = ddi_get32(qp->qp_wqinfo.qa_acchdl,
2088                                     wqe_start) | 0x7FFFFFFF;
2089                         } else {
2090                                 /* or just invalidate it */
2091                                 invalue = 0xFFFFFFFF;
2092                         }
2093                         ddi_put32(qp->qp_wqinfo.qa_acchdl, wqe_start, invalue);
2094                         wqe_start += 16;        /* move 64 bytes */
2095                 }
2096                 if (wqe_start == wqe_top)       /* hit the end of the queue */
2097                         wqe_start = wqe_base;   /* wrap to start */
2098         }
2099 }
2100 
2101 /*
2102  * hermon_wr_bind_check()
2103  *    Context: Can be called from interrupt or base context.
2104  */
2105 /* ARGSUSED */
2106 static int
2107 hermon_wr_bind_check(hermon_state_t *state, ibt_send_wr_t *wr)
2108 {
2109         ibt_bind_flags_t        bind_flags;
2110         uint64_t                vaddr, len;
2111         uint64_t                reg_start_addr, reg_end_addr;
2112         hermon_mwhdl_t          mw;
2113         hermon_mrhdl_t          mr;
2114         hermon_rsrc_t           *mpt;
2115         uint32_t                new_rkey;
2116 
2117         /* Check for a valid Memory Window handle in the WR */
2118         mw = (hermon_mwhdl_t)wr->wr.rc.rcwr.bind->bind_ibt_mw_hdl;
2119         if (mw == NULL) {
2120                 return (IBT_MW_HDL_INVALID);
2121         }
2122 
2123         /* Check for a valid Memory Region handle in the WR */
2124         mr = (hermon_mrhdl_t)wr->wr.rc.rcwr.bind->bind_ibt_mr_hdl;
2125         if (mr == NULL) {
2126                 return (IBT_MR_HDL_INVALID);
2127         }
2128 
2129         mutex_enter(&mr->mr_lock);
2130         mutex_enter(&mw->mr_lock);
2131 
2132         /*
2133          * Check here to see if the memory region has already been partially
2134          * deregistered as a result of a hermon_umap_umemlock_cb() callback.
2135          * If so, this is an error, return failure.
2136          */
2137         if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
2138                 mutex_exit(&mr->mr_lock);
2139                 mutex_exit(&mw->mr_lock);
2140                 return (IBT_MR_HDL_INVALID);
2141         }
2142 
2143         /* Check for a valid Memory Window RKey (i.e. a matching RKey) */
2144         if (mw->mr_rkey != wr->wr.rc.rcwr.bind->bind_rkey) {
2145                 mutex_exit(&mr->mr_lock);
2146                 mutex_exit(&mw->mr_lock);
2147                 return (IBT_MR_RKEY_INVALID);
2148         }
2149 
2150         /* Check for a valid Memory Region LKey (i.e. a matching LKey) */
2151         if (mr->mr_lkey != wr->wr.rc.rcwr.bind->bind_lkey) {
2152                 mutex_exit(&mr->mr_lock);
2153                 mutex_exit(&mw->mr_lock);
2154                 return (IBT_MR_LKEY_INVALID);
2155         }
2156 
2157         /*
2158          * Now check for valid "vaddr" and "len".  Note:  We don't check the
2159          * "vaddr" range when "len == 0" (i.e. on unbind operations)
2160          */
2161         len = wr->wr.rc.rcwr.bind->bind_len;
2162         if (len != 0) {
2163                 vaddr = wr->wr.rc.rcwr.bind->bind_va;
2164                 reg_start_addr = mr->mr_bindinfo.bi_addr;
2165                 reg_end_addr   = mr->mr_bindinfo.bi_addr +
2166                     (mr->mr_bindinfo.bi_len - 1);
2167                 if ((vaddr < reg_start_addr) || (vaddr > reg_end_addr)) {
2168                         mutex_exit(&mr->mr_lock);
2169                         mutex_exit(&mw->mr_lock);
2170                         return (IBT_MR_VA_INVALID);
2171                 }
2172                 vaddr = (vaddr + len) - 1;
2173                 if (vaddr > reg_end_addr) {
2174                         mutex_exit(&mr->mr_lock);
2175                         mutex_exit(&mw->mr_lock);
2176                         return (IBT_MR_LEN_INVALID);
2177                 }
2178         }
2179 
2180         /*
2181          * Validate the bind access flags.  Remote Write and Atomic access for
2182          * the Memory Window require that Local Write access be set in the
2183          * corresponding Memory Region.
2184          */
2185         bind_flags = wr->wr.rc.rcwr.bind->bind_flags;
2186         if (((bind_flags & IBT_WR_BIND_WRITE) ||
2187             (bind_flags & IBT_WR_BIND_ATOMIC)) &&
2188             !(mr->mr_accflag & IBT_MR_LOCAL_WRITE)) {
2189                 mutex_exit(&mr->mr_lock);
2190                 mutex_exit(&mw->mr_lock);
2191                 return (IBT_MR_ACCESS_REQ_INVALID);
2192         }
2193 
2194         /* Calculate the new RKey for the Memory Window */
2195         mpt = mw->mr_mptrsrcp;
2196         new_rkey = hermon_mr_keycalc(mpt->hr_indx);
2197         new_rkey = hermon_mr_key_swap(new_rkey);
2198 
2199         wr->wr.rc.rcwr.bind->bind_rkey_out = new_rkey;
2200         mw->mr_rkey = new_rkey;
2201 
2202         mutex_exit(&mr->mr_lock);
2203         mutex_exit(&mw->mr_lock);
2204         return (DDI_SUCCESS);
2205 }
2206 
2207 
2208 /*
2209  * hermon_wrid_from_reset_handling()
2210  *    Context: Can be called from interrupt or base context.
2211  */
/* ARGSUSED */
int
hermon_wrid_from_reset_handling(hermon_state_t *state, hermon_qphdl_t qp)
{
	hermon_workq_hdr_t	*swq, *rwq;

	/*
	 * WRID tracking for user-mappable QPs is not done in the kernel;
	 * there is nothing to (re)initialize here.
	 */
	if (qp->qp_alloc_flags & IBT_QP_USER_MAP)
		return (DDI_SUCCESS);

	/*
	 * Grab the cq lock(s) to modify the wqavl tree.  The receive CQ
	 * lock is always taken first; the send CQ lock is taken only when
	 * it is a distinct CQ (avoids re-entering the same mutex when both
	 * work queues share one CQ).
	 */
	if (qp->qp_rq_cqhdl)
		mutex_enter(&qp->qp_rq_cqhdl->cq_lock);
	if (qp->qp_rq_cqhdl != qp->qp_sq_cqhdl &&
	    qp->qp_sq_cqhdl != NULL)
		mutex_enter(&qp->qp_sq_cqhdl->cq_lock);

	/* Chain the newly allocated work queue header to the CQ's list */
	if (qp->qp_sq_cqhdl)
		hermon_cq_workq_add(qp->qp_sq_cqhdl, &qp->qp_sq_wqavl);

	/* Reset the send work queue's head/tail/full indicators */
	swq = qp->qp_sq_wqhdr;
	swq->wq_head = 0;
	swq->wq_tail = 0;
	swq->wq_full = 0;

	/*
	 * Now we repeat all the above operations for the receive work queue,
	 * or shared receive work queue.
	 *
	 * Note: We still use the 'qp_rq_cqhdl' even in the SRQ case.
	 */

	if (qp->qp_alloc_flags & IBT_QP_USES_SRQ) {
		/*
		 * SRQ case: the receive-side state lives in the SRQ, so we
		 * only hold the SRQ lock across the workq_add below; the
		 * per-QP receive counters are not touched.
		 */
		mutex_enter(&qp->qp_srqhdl->srq_lock);
	} else {
		rwq = qp->qp_rq_wqhdr;
		rwq->wq_head = 0;
		rwq->wq_tail = 0;
		rwq->wq_full = 0;
		qp->qp_rq_wqecntr = 0;
	}
	hermon_cq_workq_add(qp->qp_rq_cqhdl, &qp->qp_rq_wqavl);

	if (qp->qp_alloc_flags & IBT_QP_USES_SRQ) {
		mutex_exit(&qp->qp_srqhdl->srq_lock);
	}

	/* Drop the CQ lock(s) in the reverse order of acquisition */
	if (qp->qp_rq_cqhdl != qp->qp_sq_cqhdl &&
	    qp->qp_sq_cqhdl != NULL)
		mutex_exit(&qp->qp_sq_cqhdl->cq_lock);
	if (qp->qp_rq_cqhdl)
		mutex_exit(&qp->qp_rq_cqhdl->cq_lock);
	return (DDI_SUCCESS);
}
2266 
2267 
2268 /*
2269  * hermon_wrid_to_reset_handling()
2270  *    Context: Can be called from interrupt or base context.
2271  */
int
hermon_wrid_to_reset_handling(hermon_state_t *state, hermon_qphdl_t qp)
{
	/*
	 * WRID tracking for user-mappable QPs is not done in the kernel;
	 * there is nothing to tear down here.
	 */
	if (qp->qp_alloc_flags & IBT_QP_USER_MAP)
		return (DDI_SUCCESS);

	/*
	 * If there are unpolled entries in these CQs, they are
	 * polled/flushed.
	 * Grab the CQ lock(s) before manipulating the lists.
	 */
	/* grab the cq lock(s) to modify the wqavl tree */
	if (qp->qp_rq_cqhdl)
		mutex_enter(&qp->qp_rq_cqhdl->cq_lock);
	if (qp->qp_rq_cqhdl != qp->qp_sq_cqhdl &&
	    qp->qp_sq_cqhdl != NULL)
		mutex_enter(&qp->qp_sq_cqhdl->cq_lock);

	/* Hold the SRQ lock (if the QP uses one) across the flush below */
	if (qp->qp_alloc_flags & IBT_QP_USES_SRQ) {
		mutex_enter(&qp->qp_srqhdl->srq_lock);
	}
	/*
	 * Flush the entries on the CQ for this QP's QPN.
	 */
	hermon_cq_entries_flush(state, qp);

	if (qp->qp_alloc_flags & IBT_QP_USES_SRQ) {
		mutex_exit(&qp->qp_srqhdl->srq_lock);
	}

	/* Unchain this QP's work queue headers from the CQ wqavl tree(s) */
	hermon_cq_workq_remove(qp->qp_rq_cqhdl, &qp->qp_rq_wqavl);
	if (qp->qp_sq_cqhdl != NULL)
		hermon_cq_workq_remove(qp->qp_sq_cqhdl, &qp->qp_sq_wqavl);

	/* Drop the CQ lock(s) in the reverse order of acquisition */
	if (qp->qp_rq_cqhdl != qp->qp_sq_cqhdl &&
	    qp->qp_sq_cqhdl != NULL)
		mutex_exit(&qp->qp_sq_cqhdl->cq_lock);
	if (qp->qp_rq_cqhdl)
		mutex_exit(&qp->qp_rq_cqhdl->cq_lock);

	return (IBT_SUCCESS);
}
2314 
2315 
2316 /*
2317  * hermon_wrid_get_entry()
2318  *    Context: Can be called from interrupt or base context.
2319  */
uint64_t
hermon_wrid_get_entry(hermon_cqhdl_t cq, hermon_hw_cqe_t *cqe)
{
	hermon_workq_avl_t	*wqa;
	hermon_workq_hdr_t	*wq;
	uint64_t		wrid;
	uint_t			send_or_recv, qpnum;
	uint32_t		indx;

	/*
	 * Determine whether this CQE is a send or receive completion.
	 */
	send_or_recv = HERMON_CQE_SENDRECV_GET(cq, cqe);

	/* Find the work queue for this QP number (send or receive side) */
	qpnum = HERMON_CQE_QPNUM_GET(cq, cqe);
	wqa = hermon_wrid_wqavl_find(cq, qpnum, send_or_recv);
	/*
	 * NOTE(review): wqa is dereferenced without a NULL check, so a CQE
	 * for a QPN with no wqavl entry on this CQ would panic here.
	 * Presumably the caller only passes CQEs for QPs still attached to
	 * the CQ's tree -- confirm against the callers of this routine.
	 */
	wq = wqa->wqa_wq;

	/*
	 * Regardless of whether the completion is the result of a "success"
	 * or a "failure", we lock the list of "containers" and attempt to
	 * search for the the first matching completion (i.e. the first WR
	 * with a matching WQE addr and size).  Once we find it, we pull out
	 * the "wrid" field and return it (see below).  XXX Note: One possible
	 * future enhancement would be to enable this routine to skip over
	 * any "unsignaled" completions to go directly to the next "signaled"
	 * entry on success.
	 */
	indx = HERMON_CQE_WQEADDRSZ_GET(cq, cqe) & wq->wq_mask;
	wrid = wq->wq_wrid[indx];
	if (wqa->wqa_srq_en) {
		struct hermon_sw_srq_s	*srq;
		uint64_t		*desc;

		/* put wqe back on the srq free list */
		srq = wqa->wqa_srq;
		mutex_enter(&srq->srq_lock);
		desc = HERMON_SRQ_WQE_ADDR(srq, wq->wq_tail);
		/*
		 * Link the current free-list tail WQE to the just-completed
		 * index by storing it in the descriptor's second 16-bit word
		 * (big-endian, as the hardware expects), then make that
		 * index the new tail.
		 */
		((uint16_t *)desc)[1] = htons(indx);
		wq->wq_tail = indx;
		mutex_exit(&srq->srq_lock);
	} else {
		/* Ordinary WQ: advance head past this entry and clear full */
		wq->wq_head = (indx + 1) & wq->wq_mask;
		wq->wq_full = 0;
	}

	return (wrid);
}
2369 
2370 
2371 int
2372 hermon_wrid_workq_compare(const void *p1, const void *p2)
2373 {
2374         hermon_workq_compare_t  *cmpp;
2375         hermon_workq_avl_t      *curr;
2376 
2377         cmpp = (hermon_workq_compare_t *)p1;
2378         curr = (hermon_workq_avl_t *)p2;
2379 
2380         if (cmpp->cmp_qpn < curr->wqa_qpn)
2381                 return (-1);
2382         else if (cmpp->cmp_qpn > curr->wqa_qpn)
2383                 return (+1);
2384         else if (cmpp->cmp_type < curr->wqa_type)
2385                 return (-1);
2386         else if (cmpp->cmp_type > curr->wqa_type)
2387                 return (+1);
2388         else
2389                 return (0);
2390 }
2391 
2392 
2393 /*
2394  * hermon_wrid_workq_find()
2395  *    Context: Can be called from interrupt or base context.
2396  */
2397 static hermon_workq_avl_t *
2398 hermon_wrid_wqavl_find(hermon_cqhdl_t cq, uint_t qpn, uint_t wq_type)
2399 {
2400         hermon_workq_avl_t      *curr;
2401         hermon_workq_compare_t  cmp;
2402 
2403         /*
2404          * Walk the CQ's work queue list, trying to find a send or recv queue
2405          * with the same QP number.  We do this even if we are going to later
2406          * create a new entry because it helps us easily find the end of the
2407          * list.
2408          */
2409         cmp.cmp_qpn = qpn;
2410         cmp.cmp_type = wq_type;
2411         curr = avl_find(&cq->cq_wrid_wqhdr_avl_tree, &cmp, NULL);
2412 
2413         return (curr);
2414 }
2415 
2416 
2417 /*
2418  * hermon_wrid_wqhdr_create()
2419  *    Context: Can be called from base context.
2420  */
2421 /* ARGSUSED */
2422 hermon_workq_hdr_t *
2423 hermon_wrid_wqhdr_create(int bufsz)
2424 {
2425         hermon_workq_hdr_t      *wqhdr;
2426 
2427         /*
2428          * Allocate space for the wqhdr, and an array to record all the wrids.
2429          */
2430         wqhdr = (hermon_workq_hdr_t *)kmem_zalloc(sizeof (*wqhdr), KM_NOSLEEP);
2431         if (wqhdr == NULL) {
2432                 return (NULL);
2433         }
2434         wqhdr->wq_wrid = kmem_zalloc(bufsz * sizeof (uint64_t), KM_NOSLEEP);
2435         if (wqhdr->wq_wrid == NULL) {
2436                 kmem_free(wqhdr, sizeof (*wqhdr));
2437                 return (NULL);
2438         }
2439         wqhdr->wq_size = bufsz;
2440         wqhdr->wq_mask = bufsz - 1;
2441 
2442         return (wqhdr);
2443 }
2444 
2445 void
2446 hermon_wrid_wqhdr_destroy(hermon_workq_hdr_t *wqhdr)
2447 {
2448         kmem_free(wqhdr->wq_wrid, wqhdr->wq_size * sizeof (uint64_t));
2449         kmem_free(wqhdr, sizeof (*wqhdr));
2450 }
2451 
2452 
2453 /*
2454  * hermon_cq_workq_add()
2455  *    Context: Can be called from interrupt or base context.
2456  */
2457 static void
2458 hermon_cq_workq_add(hermon_cqhdl_t cq, hermon_workq_avl_t *wqavl)
2459 {
2460         hermon_workq_compare_t  cmp;
2461         avl_index_t             where;
2462 
2463         cmp.cmp_qpn = wqavl->wqa_qpn;
2464         cmp.cmp_type = wqavl->wqa_type;
2465         (void) avl_find(&cq->cq_wrid_wqhdr_avl_tree, &cmp, &where);
2466         avl_insert(&cq->cq_wrid_wqhdr_avl_tree, wqavl, where);
2467 }
2468 
2469 
2470 /*
2471  * hermon_cq_workq_remove()
2472  *    Context: Can be called from interrupt or base context.
2473  */
static void
hermon_cq_workq_remove(hermon_cqhdl_t cq, hermon_workq_avl_t *wqavl)
{
	/* Detach this work queue's AVL node from the CQ's wqavl tree */
	avl_remove(&cq->cq_wrid_wqhdr_avl_tree, wqavl);
}