/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * tavor_wr.c
 *    Tavor Work Request Processing Routines
 *
 *    Implements all the routines necessary to provide the PostSend(),
 *    PostRecv() and PostSRQ() verbs.  Also contains all the code
 *    necessary to implement the Tavor WRID tracking mechanism.
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/avl.h>

#include <sys/ib/adapters/tavor/tavor.h>

static void tavor_qp_send_doorbell(tavor_state_t *state, uint32_t nda,
    uint32_t nds, uint32_t qpn, uint32_t fence, uint32_t nopcode);
#pragma inline(tavor_qp_send_doorbell)
static void tavor_qp_recv_doorbell(tavor_state_t *state, uint32_t nda,
    uint32_t nds, uint32_t qpn, uint32_t credits);
#pragma inline(tavor_qp_recv_doorbell)
static uint32_t tavor_wr_get_immediate(ibt_send_wr_t *wr);
static int tavor_wr_bind_check(tavor_state_t *state, ibt_send_wr_t *wr);
static int tavor_wqe_send_build(tavor_state_t *state, tavor_qphdl_t qp,
    ibt_send_wr_t *wr, uint64_t *desc, uint_t *size);
static void tavor_wqe_send_linknext(ibt_send_wr_t *curr_wr,
    ibt_send_wr_t *prev_wr, uint64_t *curr_desc, uint_t curr_descsz,
    uint64_t *prev_desc, tavor_sw_wqe_dbinfo_t *dbinfo, tavor_qphdl_t qp);
static int tavor_wqe_mlx_build(tavor_state_t *state, tavor_qphdl_t qp,
    ibt_send_wr_t *wr, uint64_t *desc, uint_t *size);
static void tavor_wqe_mlx_linknext(ibt_send_wr_t *prev_wr, uint64_t *curr_desc,
    uint_t curr_descsz, uint64_t *prev_desc, tavor_sw_wqe_dbinfo_t *dbinfo,
    tavor_qphdl_t qp);
static int tavor_wqe_recv_build(tavor_state_t *state, tavor_qphdl_t qp,
    ibt_recv_wr_t *wr, uint64_t *desc, uint_t *size);
static void tavor_wqe_recv_linknext(uint64_t *desc, uint_t desc_sz,
    uint64_t *prev, tavor_qphdl_t qp);
static int tavor_wqe_srq_build(tavor_state_t *state, tavor_srqhdl_t srq,
    ibt_recv_wr_t *wr, uint64_t *desc);
static void tavor_wqe_srq_linknext(uint64_t *desc, uint64_t *prev,
    tavor_srqhdl_t srq);
static void tavor_wqe_sync(void *hdl, uint_t sync_from,
    uint_t sync_to, uint_t sync_type, uint_t flag);
static tavor_wrid_entry_t *tavor_wrid_find_match(tavor_workq_hdr_t *wq,
    tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe);
static void tavor_wrid_reaplist_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wq);
static tavor_workq_hdr_t *tavor_wrid_wqhdr_find(tavor_cqhdl_t cq, uint_t qpn,
    uint_t send_or_recv);
static tavor_workq_hdr_t *tavor_wrid_wqhdr_create(tavor_state_t *state,
    tavor_cqhdl_t cq, uint_t qpn, uint_t wq_type, uint_t create_wql);
static uint32_t tavor_wrid_get_wqeaddrsz(tavor_workq_hdr_t *wq);
static void tavor_wrid_wqhdr_add(tavor_workq_hdr_t *wqhdr,
    tavor_wrid_list_hdr_t *wrid_list);
static void tavor_wrid_wqhdr_remove(tavor_workq_hdr_t *wqhdr,
    tavor_wrid_list_hdr_t *wrid_list);
static tavor_workq_hdr_t *tavor_wrid_list_reap(tavor_wrid_list_hdr_t *wq);
static void tavor_wrid_wqhdr_lock_both(tavor_qphdl_t qp);
static void tavor_wrid_wqhdr_unlock_both(tavor_qphdl_t qp);
static void tavor_cq_wqhdr_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr);
static void tavor_cq_wqhdr_remove(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr);

/*
 * tavor_post_send()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_post_send(tavor_state_t *state, tavor_qphdl_t qp,
    ibt_send_wr_t *wr, uint_t num_wr, uint_t *num_posted)
{
        tavor_sw_wqe_dbinfo_t           dbinfo;
        tavor_wrid_list_hdr_t           *wridlist;
        tavor_wrid_entry_t              *wre_last;
        uint64_t                        *desc, *prev, *first;
        uint32_t                        desc_sz, first_sz;
        uint32_t                        wqeaddrsz, signaled_dbd;
        uint32_t                        head, tail, next_tail, qsize_msk;
        uint32_t                        sync_from, sync_to;
        uint_t                          currindx, wrindx, numremain;
        uint_t                          chainlen, chainbegin, posted_cnt;
        uint_t                          maxdb = TAVOR_QP_MAXDESC_PER_DB;
        int                             status;

        TAVOR_TNF_ENTER(tavor_post_send);

        /*
         * Check for user-mappable QP memory.  Note:  We do not allow kernel
         * clients to post to QP memory that is accessible directly by the
         * user.  If the QP memory is user accessible, then return an error.
         */
        if (qp->qp_is_umap) {
                TNF_PROBE_0(tavor_post_send_inv_usrmapped_type,
                    TAVOR_TNF_ERROR, "");
                TAVOR_TNF_EXIT(tavor_post_send);
                return (IBT_QP_HDL_INVALID);
        }

        /* Initialize posted_cnt */
        posted_cnt = 0;

        mutex_enter(&qp->qp_lock);

        /*
         * Check QP state.  Cannot post Send requests from the "Reset",
         * "Init", or "RTR" states
         */
        if ((qp->qp_state == TAVOR_QP_RESET) ||
            (qp->qp_state == TAVOR_QP_INIT) ||
            (qp->qp_state == TAVOR_QP_RTR)) {
                mutex_exit(&qp->qp_lock);
                TNF_PROBE_0(tavor_post_send_inv_qpstate_fail,
                    TAVOR_TNF_ERROR, "");
                TAVOR_TNF_EXIT(tavor_post_send);
                return (IBT_QP_STATE_INVALID);
        }

        /* Grab the lock for the WRID list */
        mutex_enter(&qp->qp_sq_wqhdr->wq_wrid_wql->wql_lock);
        wridlist  = qp->qp_sq_wqhdr->wq_wrid_post;

        /* Save away some initial QP state */
        qsize_msk = qp->qp_sq_wqhdr->wq_size - 1;
        tail      = qp->qp_sq_wqhdr->wq_tail;
        head      = qp->qp_sq_wqhdr->wq_head;

        /*
         * For each ibt_send_wr_t in the wr[] list passed in, parse the
         * request and build a Send WQE.  Note:  Because we are potentially
         * building a chain of WQEs, we want to link them all together.
         * However, we do not want to link the first one to the previous
         * WQE until the entire chain has been linked.  Then in the last
         * step we ring the appropriate doorbell.  Note:  It is possible for
         * more Work Requests to be posted than the HW will support in one
         * shot.  If this happens, we need to be able to post and ring
         * several chains here until the entire request is complete.
         */
        wrindx = 0;
        numremain = num_wr;
        status    = DDI_SUCCESS;
        while ((wrindx < num_wr) && (status == DDI_SUCCESS)) {
                /*
                 * For the first WQE on a new chain we need "prev" to point
                 * to the current descriptor.  As we begin to process
                 * further, "prev" will be updated to point to the previous
                 * WQE on the current chain (see below).
                 */
                prev = TAVOR_QP_SQ_ENTRY(qp, tail);

                /*
                 * Before we begin, save the current "tail index" for later
                 * DMA sync
                 */
                sync_from = tail;

                /*
                 * Break the request up into chains that are less than or
                 * equal to the maximum number of WQEs that can be posted
                 * per doorbell ring
                 */
                chainlen   = (numremain > maxdb) ? maxdb : numremain;
                numremain -= chainlen;
                chainbegin = wrindx;
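
                /*
                 * Illustrative arithmetic (TAVOR_QP_MAXDESC_PER_DB is
                 * defined in the driver headers; 256 below is just an
                 * assumed value): if maxdb were 256 and the caller passed
                 * num_wr = 600, this loop would build and doorbell three
                 * successive chains of 256, 256, and 88 WQEs.
                 */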
                for (currindx = 0; currindx < chainlen; currindx++, wrindx++) {
                        /*
                         * Check for "queue full" condition.  If the queue
                         * is already full, then no more WQEs can be posted.
                         * So break out, ring a doorbell (if necessary) and
                         * return an error
                         */
                        if (qp->qp_sq_wqhdr->wq_full != 0) {
                                status = IBT_QP_FULL;
                                TNF_PROBE_0_DEBUG(tavor_post_send_sqfull,
                                    TAVOR_TNF_TRACE, "");
                                break;
                        }

                        /*
                         * Increment the "tail index" and check for "queue
                         * full" condition.  If we detect that the current
                         * work request is going to fill the work queue, then
                         * we mark this condition and continue.
                         */
                        next_tail = (tail + 1) & qsize_msk;
                        if (next_tail == head) {
                                qp->qp_sq_wqhdr->wq_full = 1;
                        }
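
                        /*
                         * To illustrate the mask arithmetic (which relies
                         * on the work queue size being a power of two):
                         * with wq_size = 64, qsize_msk = 0x3F, so a tail
                         * of 63 wraps to next_tail = 0.  If head is still
                         * 0 at that point, this WQE consumes the last free
                         * slot and the queue is marked full.
                         */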

                        /*
                         * Get the address of the location where the next
                         * Send WQE should be built
                         */
                        desc = TAVOR_QP_SQ_ENTRY(qp, tail);

                        /*
                         * Call tavor_wqe_send_build() to build the WQE
                         * at the given address.  This routine uses the
                         * information in the ibt_send_wr_t list (wr[]) and
                         * returns the size of the WQE when it returns.
                         */
                        status = tavor_wqe_send_build(state, qp,
                            &wr[wrindx], desc, &desc_sz);
                        if (status != DDI_SUCCESS) {
                                TNF_PROBE_0(tavor_post_send_bldwqe_fail,
                                    TAVOR_TNF_ERROR, "");
                                break;
                        }

                        /*
                         * Add a WRID entry to the WRID list.  Need to
                         * calculate the "wqeaddrsz" and "signaled_dbd"
                         * values to pass to tavor_wrid_add_entry()
                         */
                        wqeaddrsz = TAVOR_QP_WQEADDRSZ((uint64_t *)(uintptr_t)
                            ((uint64_t)(uintptr_t)desc - qp->qp_desc_off),
                            desc_sz);
                        if ((qp->qp_sq_sigtype == TAVOR_QP_SQ_ALL_SIGNALED) ||
                            (wr[wrindx].wr_flags & IBT_WR_SEND_SIGNAL)) {
                                signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED;
                        } else {
                                signaled_dbd = 0;
                        }
                        tavor_wrid_add_entry(qp->qp_sq_wqhdr,
                            wr[wrindx].wr_id, wqeaddrsz, signaled_dbd);

                        /*
                         * If this is not the first descriptor on the current
                         * chain, then link it to the previous WQE.  Otherwise,
                         * save the address and size of this descriptor (in
                         * "first" and "first_sz" respectively) and continue.
                         * Note: Linking a WQE to the previous one will
                         * depend on whether the two WQEs are from "special
                         * QPs" (i.e. MLX transport WQEs) or whether they are
                         * normal Send WQEs.
                         */
                        if (currindx != 0) {
                                if (qp->qp_is_special) {
                                        tavor_wqe_mlx_linknext(&wr[wrindx - 1],
                                            desc, desc_sz, prev, NULL, qp);
                                } else {
                                        tavor_wqe_send_linknext(&wr[wrindx],
                                            &wr[wrindx - 1], desc, desc_sz,
                                            prev, NULL, qp);
                                }
                                prev = desc;
                        } else {
                                first    = desc;
                                first_sz = desc_sz;
                        }

                        /*
                         * Update the current "tail index" and increment
                         * "posted_cnt"
                         */
                        tail = next_tail;
                        posted_cnt++;
                }

                /*
                 * If we reach here and there are one or more WQEs which have
                 * been successfully chained together, then we need to link
                 * the current chain to the previously executing chain of
                 * descriptors (if there is one) and ring the doorbell for the
                 * send work queue.
                 */
                if (currindx != 0) {
                        /*
                         * Before we link the chain, we need to ensure that the
                         * "next" field on the last WQE is set to NULL (to
                         * indicate the end of the chain).  Note: Just as it
                         * did above, the format for the "next" fields in a
                         * given WQE depends on whether the WQE is MLX
                         * transport or not.
                         */
                        if (qp->qp_is_special) {
                                tavor_wqe_mlx_linknext(&wr[chainbegin +
                                    currindx - 1], NULL, 0, prev, NULL, qp);
                        } else {
                                tavor_wqe_send_linknext(NULL,
                                    &wr[chainbegin + currindx - 1], NULL, 0,
                                    prev, NULL, qp);
                        }

                        /* Save away updated "tail index" for the DMA sync */
                        sync_to = tail;

                        /* Do a DMA sync for current send WQE(s) */
                        tavor_wqe_sync(qp, sync_from, sync_to, TAVOR_WR_SEND,
                            DDI_DMA_SYNC_FORDEV);

                        /*
                         * Now link the chain to the old chain (if there was
                         * one).  Note: we still need to pay attention to
                         * whether the QP uses MLX transport WQEs or not.
                         */
                        if (qp->qp_is_special) {
                                tavor_wqe_mlx_linknext(NULL, first, first_sz,
                                    qp->qp_sq_lastwqeaddr, &dbinfo, qp);
                        } else {
                                tavor_wqe_send_linknext(&wr[chainbegin], NULL,
                                    first, first_sz, qp->qp_sq_lastwqeaddr,
                                    &dbinfo, qp);
                        }

                        /*
                         * If there was a valid previous WQE (i.e. non-NULL),
                         * then sync it too.  This is because we have updated
                         * its "next" fields and we want to ensure that the
                         * hardware can see the changes.
                         */
                        if (qp->qp_sq_lastwqeaddr != NULL) {
                                sync_to   = sync_from;
                                sync_from = (sync_from - 1) & qsize_msk;
                                tavor_wqe_sync(qp, sync_from, sync_to,
                                    TAVOR_WR_SEND, DDI_DMA_SYNC_FORDEV);
                        }

                        /*
                         * Now if the WRID tail entry is non-NULL, then this
                         * represents the entry to which we are chaining the
                         * new entries.  Since we are going to ring the
                         * doorbell for this WQE, we want to set its "dbd" bit.
                         *
                         * On the other hand, if the tail is NULL, even though
                         * we will have rung the doorbell for the previous WQE
                         * (for the hardware's sake) it is irrelevant to our
                         * purposes (for tracking WRIDs) because we know the
                         * request must have already completed.
                         */
                        wre_last = wridlist->wl_wre_old_tail;
                        if (wre_last != NULL) {
                                wre_last->wr_signaled_dbd |=
                                    TAVOR_WRID_ENTRY_DOORBELLED;
                        }

                        /* Update some of the state in the QP */
                        qp->qp_sq_lastwqeaddr         = desc;
                        qp->qp_sq_wqhdr->wq_tail = tail;

                        /* Ring the doorbell */
                        tavor_qp_send_doorbell(state,
                            (uint32_t)((uintptr_t)first - qp->qp_desc_off),
                            first_sz, qp->qp_qpnum, dbinfo.db_fence,
                            dbinfo.db_nopcode);
                }
        }

        /*
         * Update the "num_posted" return value (if necessary).  Then drop
         * the locks and return the status.
         */
        if (num_posted != NULL) {
                *num_posted = posted_cnt;
        }

        mutex_exit(&qp->qp_sq_wqhdr->wq_wrid_wql->wql_lock);
        mutex_exit(&qp->qp_lock);

        TAVOR_TNF_EXIT(tavor_post_send);
        return (status);
}
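
/*
 * An illustrative usage sketch (hypothetical; not part of the driver):
 * posting a single signaled UD Send with one data segment through
 * tavor_post_send().  The "my_"-prefixed names are placeholders, and only
 * the ibt_send_wr_t fields actually consumed by the routines in this file
 * are shown.
 *
 *	ibt_send_wr_t	wr;
 *	uint_t		posted;
 *	int		status;
 *
 *	wr.wr_id	= my_wrid;		(caller's completion cookie)
 *	wr.wr_flags	= IBT_WR_SEND_SIGNAL;	(request a signaled CQE)
 *	wr.wr_trans	= IBT_UD_SRV;
 *	wr.wr_opcode	= IBT_WRC_SEND;
 *	wr.wr.ud.udwr_dest = my_ud_dest;	(must carry a valid ud_ah)
 *	wr.wr_nds	= 1;
 *	wr.wr_sgl	= &my_sgl;		(a single ibt_wr_ds_t)
 *
 *	status = tavor_post_send(state, qp, &wr, 1, &posted);
 */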


/*
 * tavor_post_recv()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_post_recv(tavor_state_t *state, tavor_qphdl_t qp,
    ibt_recv_wr_t *wr, uint_t num_wr, uint_t *num_posted)
{
        uint64_t                        *desc, *prev, *first;
        uint32_t                        desc_sz, first_sz;
        uint32_t                        wqeaddrsz, signaled_dbd;
        uint32_t                        head, tail, next_tail, qsize_msk;
        uint32_t                        sync_from, sync_to;
        uint_t                          currindx, wrindx, numremain;
        uint_t                          chainlen, posted_cnt;
        uint_t                          maxdb = TAVOR_QP_MAXDESC_PER_DB;
        int                             status;

        TAVOR_TNF_ENTER(tavor_post_recv);

        /*
         * Check for user-mappable QP memory.  Note:  We do not allow kernel
         * clients to post to QP memory that is accessible directly by the
         * user.  If the QP memory is user accessible, then return an error.
         */
        if (qp->qp_is_umap) {
                TNF_PROBE_0(tavor_post_recv_inv_usrmapped_type,
                    TAVOR_TNF_ERROR, "");
                TAVOR_TNF_EXIT(tavor_post_recv);
                return (IBT_QP_HDL_INVALID);
        }

        /* Initialize posted_cnt */
        posted_cnt = 0;

        mutex_enter(&qp->qp_lock);

        /*
         * Check if QP is associated with an SRQ
         */
        if (qp->qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
                mutex_exit(&qp->qp_lock);
                TNF_PROBE_0(tavor_post_recv_fail_qp_on_srq,
                    TAVOR_TNF_ERROR, "");
                TAVOR_TNF_EXIT(tavor_post_recv);
                return (IBT_SRQ_IN_USE);
        }

        /*
         * Check QP state.  Cannot post Recv requests from the "Reset" state
         */
        if (qp->qp_state == TAVOR_QP_RESET) {
                mutex_exit(&qp->qp_lock);
                TNF_PROBE_0(tavor_post_recv_inv_qpstate_fail,
                    TAVOR_TNF_ERROR, "");
                TAVOR_TNF_EXIT(tavor_post_recv);
                return (IBT_QP_STATE_INVALID);
        }

        /* Grab the lock for the WRID list */
        mutex_enter(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock);

        /* Save away some initial QP state */
        qsize_msk = qp->qp_rq_wqhdr->wq_size - 1;
        tail      = qp->qp_rq_wqhdr->wq_tail;
        head      = qp->qp_rq_wqhdr->wq_head;

        /*
         * For each ibt_recv_wr_t in the wr[] list passed in, parse the
         * request and build a Recv WQE.  Note:  Because we are potentially
         * building a chain of WQEs, we want to link them all together.
         * However, we do not want to link the first one to the previous
         * WQE until the entire chain has been linked.  Then in the last
         * step we ring the appropriate doorbell.  Note:  It is possible for
         * more Work Requests to be posted than the HW will support in one
         * shot.  If this happens, we need to be able to post and ring
         * several chains here until the entire request is complete.
         */
        wrindx = 0;
        numremain = num_wr;
        status    = DDI_SUCCESS;
        while ((wrindx < num_wr) && (status == DDI_SUCCESS)) {
                /*
                 * For the first WQE on a new chain we need "prev" to point
                 * to the current descriptor.  As we begin to process
                 * further, "prev" will be updated to point to the previous
                 * WQE on the current chain (see below).
                 */
                prev = TAVOR_QP_RQ_ENTRY(qp, tail);

                /*
                 * Before we begin, save the current "tail index" for later
                 * DMA sync
                 */
                sync_from = tail;

                /*
                 * Break the request up into chains that are less than or
                 * equal to the maximum number of WQEs that can be posted
                 * per doorbell ring
                 */
                chainlen = (numremain > maxdb) ? maxdb : numremain;
                numremain -= chainlen;
                for (currindx = 0; currindx < chainlen; currindx++, wrindx++) {
                        /*
                         * Check for "queue full" condition.  If the queue
                         * is already full, then no more WQEs can be posted.
                         * So break out, ring a doorbell (if necessary) and
                         * return an error
                         */
                        if (qp->qp_rq_wqhdr->wq_full != 0) {
                                status = IBT_QP_FULL;
                                TNF_PROBE_0_DEBUG(tavor_post_recv_rqfull,
                                    TAVOR_TNF_TRACE, "");
                                break;
                        }

                        /*
                         * Increment the "tail index" and check for "queue
                         * full" condition.  If we detect that the current
                         * work request is going to fill the work queue, then
                         * we mark this condition and continue.
                         */
                        next_tail = (tail + 1) & qsize_msk;
                        if (next_tail == head) {
                                qp->qp_rq_wqhdr->wq_full = 1;
                        }

                        /*
                         * Get the address of the location where the next
                         * Recv WQE should be built
                         */
                        desc = TAVOR_QP_RQ_ENTRY(qp, tail);

                        /*
                         * Call tavor_wqe_recv_build() to build the WQE
                         * at the given address.  This routine uses the
                         * information in the ibt_recv_wr_t list (wr[]) and
                         * returns the size of the WQE when it returns.
                         */
                        status = tavor_wqe_recv_build(state, qp, &wr[wrindx],
                            desc, &desc_sz);
                        if (status != DDI_SUCCESS) {
                                TNF_PROBE_0(tavor_post_recv_bldwqe_fail,
                                    TAVOR_TNF_ERROR, "");
                                break;
                        }

                        /*
                         * Add a WRID entry to the WRID list.  Need to
                         * calculate the "wqeaddrsz" and "signaled_dbd"
                         * values to pass to tavor_wrid_add_entry().  Note:
                         * all Recv WQEs are essentially "signaled" and
                         * "doorbelled" (since Tavor HW requires all
                         * Recv WQEs to have their "DBD" bits set).
                         */
                        wqeaddrsz = TAVOR_QP_WQEADDRSZ((uint64_t *)(uintptr_t)
                            ((uint64_t)(uintptr_t)desc - qp->qp_desc_off),
                            desc_sz);
                        signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED |
                            TAVOR_WRID_ENTRY_DOORBELLED;
                        tavor_wrid_add_entry(qp->qp_rq_wqhdr,
                            wr[wrindx].wr_id, wqeaddrsz, signaled_dbd);

                        /*
                         * If this is not the first descriptor on the current
                         * chain, then link it to the previous WQE.  Otherwise,
                         * save the address and size of this descriptor (in
                         * "first" and "first_sz" respectively) and continue.
                         */
                        if (currindx != 0) {
                                tavor_wqe_recv_linknext(desc, desc_sz, prev,
                                    qp);
                                prev = desc;
                        } else {
                                first    = desc;
                                first_sz = desc_sz;
                        }

                        /*
                         * Update the current "tail index" and increment
                         * "posted_cnt"
                         */
                        tail = next_tail;
                        posted_cnt++;
                }

                /*
                 * If we reach here and there are one or more WQEs which have
                 * been successfully chained together, then we need to link
                 * the current chain to the previously executing chain of
                 * descriptors (if there is one) and ring the doorbell for the
                 * recv work queue.
                 */
                if (currindx != 0) {
                        /*
                         * Before we link the chain, we need to ensure that the
                         * "next" field on the last WQE is set to NULL (to
                         * indicate the end of the chain).
                         */
                        tavor_wqe_recv_linknext(NULL, 0, prev, qp);

                        /* Save away updated "tail index" for the DMA sync */
                        sync_to = tail;

                        /* Do a DMA sync for current recv WQE(s) */
                        tavor_wqe_sync(qp, sync_from, sync_to, TAVOR_WR_RECV,
                            DDI_DMA_SYNC_FORDEV);

                        /*
                         * Now link the chain to the old chain (if there was
                         * one).
                         */
                        tavor_wqe_recv_linknext(first, first_sz,
                            qp->qp_rq_lastwqeaddr, qp);

                        /*
                         * If there was a valid previous WQE (i.e. non-NULL),
                         * then sync it too.  This is because we have updated
                         * its "next" fields and we want to ensure that the
                         * hardware can see the changes.
                         */
                        if (qp->qp_rq_lastwqeaddr != NULL) {
                                sync_to   = sync_from;
                                sync_from = (sync_from - 1) & qsize_msk;
                                tavor_wqe_sync(qp, sync_from, sync_to,
                                    TAVOR_WR_RECV, DDI_DMA_SYNC_FORDEV);
                        }

                        /* Update some of the state in the QP */
                        qp->qp_rq_lastwqeaddr         = desc;
                        qp->qp_rq_wqhdr->wq_tail = tail;

                        /* Ring the doorbell */
                        tavor_qp_recv_doorbell(state,
                            (uint32_t)((uintptr_t)first - qp->qp_desc_off),
                            first_sz, qp->qp_qpnum, (chainlen % maxdb));
                }
        }

        /*
         * Update the "num_posted" return value (if necessary).  Then drop
         * the locks and return the status.
         */
        if (num_posted != NULL) {
                *num_posted = posted_cnt;
        }

        mutex_exit(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock);
        mutex_exit(&qp->qp_lock);

        TAVOR_TNF_EXIT(tavor_post_recv);
        return (status);
}

/*
 * tavor_post_srq()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_post_srq(tavor_state_t *state, tavor_srqhdl_t srq,
    ibt_recv_wr_t *wr, uint_t num_wr, uint_t *num_posted)
{
        uint64_t                        *desc, *prev, *first, *last_wqe_addr;
        uint32_t                        signaled_dbd;
        uint32_t                        sync_indx;
        uint_t                          currindx, wrindx, numremain;
        uint_t                          chainlen, posted_cnt;
        uint_t                          maxdb = TAVOR_QP_MAXDESC_PER_DB;
        int                             status;

        TAVOR_TNF_ENTER(tavor_post_srq);

        /*
         * Check for user-mappable SRQ memory.  Note:  We do not allow kernel
         * clients to post to SRQ memory that is accessible directly by the
         * user.  If the SRQ memory is user accessible, then return an error.
         */
        if (srq->srq_is_umap) {
                TNF_PROBE_0(tavor_post_srq_inv_usrmapped_type,
                    TAVOR_TNF_ERROR, "");
                TAVOR_TNF_EXIT(tavor_post_srq);
                return (IBT_SRQ_HDL_INVALID);
        }

        /* Initialize posted_cnt */
        posted_cnt = 0;

        mutex_enter(&srq->srq_lock);

        /*
         * Check SRQ state.  Cannot post Recv requests when SRQ is in error
         */
        if (srq->srq_state == TAVOR_SRQ_STATE_ERROR) {
                mutex_exit(&srq->srq_lock);
                TNF_PROBE_0(tavor_post_srq_inv_srqstate_fail,
                    TAVOR_TNF_ERROR, "");
                TAVOR_TNF_EXIT(tavor_post_srq);
                return (IBT_QP_STATE_INVALID);
        }

        /* Grab the lock for the WRID list */
        mutex_enter(&srq->srq_wrid_wql->wql_lock);

        /*
         * For each ibt_recv_wr_t in the wr[] list passed in, parse the
         * request and build a Recv WQE.  Note:  Because we are potentially
         * building a chain of WQEs, we want to link them all together.
         * However, we do not want to link the first one to the previous
         * WQE until the entire chain has been linked.  Then in the last
         * step we ring the appropriate doorbell.  Note:  It is possible for
         * more Work Requests to be posted than the HW will support in one
         * shot.  If this happens, we need to be able to post and ring
         * several chains here until the entire request is complete.
         */
        wrindx = 0;
        numremain = num_wr;
        status    = DDI_SUCCESS;
        while ((wrindx < num_wr) && (status == DDI_SUCCESS)) {
                /*
                 * For the first WQE on a new chain we need "prev" to point
                 * to the last WQE posted on the SRQ (or to be NULL if
                 * nothing has been posted yet).  As we begin to process
                 * further, "prev" will be updated to point to the previous
                 * WQE on the current chain (see below).
                 */
                if (srq->srq_wq_lastwqeindx == -1) {
                        prev = NULL;
                } else {
                        prev = TAVOR_SRQ_WQE_ADDR(srq, srq->srq_wq_lastwqeindx);
                }
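
                /*
                 * Note that, unlike the Send and Recv work queues above,
                 * which are managed as rings using "head" and "tail"
                 * indices, the SRQ work queue hands out descriptors from a
                 * free list: the next free WQE is named by
                 * wl_free_list_indx (see below), and a value of -1 there
                 * means that no free WQEs remain.
                 */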

                /*
                 * Break the request up into chains that are less than or
                 * equal to the maximum number of WQEs that can be posted
                 * per doorbell ring
                 */
                chainlen = (numremain > maxdb) ? maxdb : numremain;
                numremain -= chainlen;
                for (currindx = 0; currindx < chainlen; currindx++, wrindx++) {

                        /*
                         * Check for "queue full" condition.  If the queue
                         * is already full, then no more WQEs can be posted.
                         * So break out, ring a doorbell (if necessary) and
                         * return an error
                         */
                        if (srq->srq_wridlist->wl_free_list_indx == -1) {
                                status = IBT_QP_FULL;
                                TNF_PROBE_0_DEBUG(tavor_post_srq_wqfull,
                                    TAVOR_TNF_TRACE, "");
                                break;
                        }

                        /*
                         * Get the address of the location where the next
                         * Recv WQE should be built
                         */
                        desc = TAVOR_SRQ_WQE_ADDR(srq,
                            srq->srq_wridlist->wl_free_list_indx);

                        /*
                         * Add a WRID entry to the WRID list.  Need to
                         * set the "signaled_dbd" value to pass to
                         * tavor_wrid_add_entry_srq().  Note: all Recv WQEs
                         * are essentially "signaled"
                         *
                         * The 'size' is stored at srq_alloc time, in the
                         * srq_wq_stride.  This is a constant value required
                         * for SRQ.
                         */
                        signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED;
                        tavor_wrid_add_entry_srq(srq, wr[wrindx].wr_id,
                            signaled_dbd);

                        /*
                         * Call tavor_wqe_srq_build() to build the WQE
                         * at the given address.  This routine uses the
                         * information in the ibt_recv_wr_t list (wr[]).
                         * (No size is returned here; SRQ WQEs have the
                         * fixed size noted above.)
                         */
                        status = tavor_wqe_srq_build(state, srq, &wr[wrindx],
                            desc);
                        if (status != DDI_SUCCESS) {
                                TNF_PROBE_0(tavor_post_srq_bldwqe_fail,
                                    TAVOR_TNF_ERROR, "");
                                break;
                        }

                        /*
                         * If this is not the first descriptor on the current
                         * chain, then link it to the previous WQE.  Otherwise,
                         * save the address of this descriptor (in "first") and
                         * continue.
                         */
                        if (currindx != 0) {
                                tavor_wqe_srq_linknext(desc, prev, srq);
                                sync_indx = TAVOR_SRQ_WQE_INDEX(
                                    srq->srq_wq_buf, prev,
                                    srq->srq_wq_log_wqesz);

                                /* Do a DMA sync for previous recv WQE */
                                tavor_wqe_sync(srq, sync_indx, sync_indx+1,
                                    TAVOR_WR_SRQ, DDI_DMA_SYNC_FORDEV);

                                prev = desc;
                        } else {

                                /*
                                 * In this case, the chain so far consists of
                                 * only this one WQE, so it is both the first
                                 * and the most recent descriptor.  So set
                                 * prev to first, here.
                                 */
                                first = prev = desc;
                        }

                        /*
                         * Increment "posted_cnt"
                         */
                        posted_cnt++;
                }

                /*
                 * If we reach here and there are one or more WQEs which have
                 * been successfully chained together, then we need to link
                 * the current chain to the previously executing chain of
                 * descriptors (if there is one) and ring the doorbell for the
                 * recv work queue.
                 */
                if (currindx != 0) {
                        /*
                         * Before we link the chain, we need to ensure that the
                         * "next" field on the last WQE is set to NULL (to
                         * indicate the end of the chain).
                         */
                        tavor_wqe_srq_linknext(NULL, prev, srq);

                        sync_indx = TAVOR_SRQ_WQE_INDEX(srq->srq_wq_buf, prev,
                            srq->srq_wq_log_wqesz);

                        /* Do a DMA sync for current recv WQE */
                        tavor_wqe_sync(srq, sync_indx, sync_indx+1,
                            TAVOR_WR_SRQ, DDI_DMA_SYNC_FORDEV);

                        /*
                         * Now link the chain to the old chain (if there was
                         * one).
                         */
                        if (srq->srq_wq_lastwqeindx == -1) {
                                last_wqe_addr = NULL;
                        } else {
                                last_wqe_addr = TAVOR_SRQ_WQE_ADDR(srq,
                                    srq->srq_wq_lastwqeindx);
                        }
                        tavor_wqe_srq_linknext(first, last_wqe_addr, srq);

                        /*
                         * If there was a valid previous WQE (i.e. valid index),
                         * then sync it too.  This is because we have updated
                         * its "next" fields and we want to ensure that the
                         * hardware can see the changes.
                         */
                        if (srq->srq_wq_lastwqeindx != -1) {
                                sync_indx = srq->srq_wq_lastwqeindx;
                                tavor_wqe_sync(srq, sync_indx, sync_indx+1,
                                    TAVOR_WR_SRQ, DDI_DMA_SYNC_FORDEV);
                        }

                        /* Update some of the state in the SRQ */
                        srq->srq_wq_lastwqeindx = TAVOR_SRQ_WQE_INDEX(
                            srq->srq_wq_buf, desc,
                            srq->srq_wq_log_wqesz);

                        /* Ring the doorbell (SRQ requires an NDS of 0) */
                        tavor_qp_recv_doorbell(state,
                            (uint32_t)((uintptr_t)first - srq->srq_desc_off),
                            0, srq->srq_srqnum, (chainlen % maxdb));
                }
        }

        /*
         * Update the "num_posted" return value (if necessary).  Then drop
         * the locks and return the status.
         */
        if (num_posted != NULL) {
                *num_posted = posted_cnt;
        }

        mutex_exit(&srq->srq_wrid_wql->wql_lock);
        mutex_exit(&srq->srq_lock);

        TAVOR_TNF_EXIT(tavor_post_srq);
        return (status);
}


/*
 * tavor_qp_send_doorbell()
 *    Context: Can be called from interrupt or base context.
 */
static void
tavor_qp_send_doorbell(tavor_state_t *state, uint32_t nda, uint32_t nds,
    uint32_t qpn, uint32_t fence, uint32_t nopcode)
{
        uint64_t        doorbell = 0;

        /* Build the doorbell from the parameters */
        doorbell = (((uint64_t)nda & TAVOR_QPSNDDB_NDA_MASK) <<
            TAVOR_QPSNDDB_NDA_SHIFT) |
            ((uint64_t)fence << TAVOR_QPSNDDB_F_SHIFT) |
            ((uint64_t)nopcode << TAVOR_QPSNDDB_NOPCODE_SHIFT) |
            ((uint64_t)qpn << TAVOR_QPSNDDB_QPN_SHIFT) | nds;
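
        /*
         * The result is a single 64-bit word packing the "next descriptor
         * address" (NDA), fence bit, opcode, QP number, and descriptor
         * size (NDS), so that the single UAR write below hands the
         * hardware a complete description of the newly-linked chain.
         */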

        TNF_PROBE_1_DEBUG(tavor_qp_send_doorbell, TAVOR_TNF_TRACE, "",
            tnf_ulong, doorbell, doorbell);

        /* Write the doorbell to UAR */
        TAVOR_UAR_DOORBELL(state, (uint64_t *)&state->ts_uar->send,
            doorbell);
}


/*
 * tavor_qp_recv_doorbell()
 *    Context: Can be called from interrupt or base context.
 */
static void
tavor_qp_recv_doorbell(tavor_state_t *state, uint32_t nda, uint32_t nds,
    uint32_t qpn, uint32_t credits)
{
        uint64_t        doorbell = 0;

        /* Build the doorbell from the parameters */
        doorbell = (((uint64_t)nda & TAVOR_QPRCVDB_NDA_MASK) <<
            TAVOR_QPRCVDB_NDA_SHIFT) |
            ((uint64_t)nds << TAVOR_QPRCVDB_NDS_SHIFT) |
            ((uint64_t)qpn << TAVOR_QPRCVDB_QPN_SHIFT) | credits;

        TNF_PROBE_1_DEBUG(tavor_qp_recv_doorbell, TAVOR_TNF_TRACE, "",
            tnf_ulong, doorbell, doorbell);

        /* Write the doorbell to UAR */
        TAVOR_UAR_DOORBELL(state, (uint64_t *)&state->ts_uar->recv,
            doorbell);
}


/*
 * tavor_wqe_send_build()
 *    Context: Can be called from interrupt or base context.
 */
static int
tavor_wqe_send_build(tavor_state_t *state, tavor_qphdl_t qp,
    ibt_send_wr_t *wr, uint64_t *desc, uint_t *size)
{
        tavor_hw_snd_wqe_ud_t           *ud;
        tavor_hw_snd_wqe_remaddr_t      *rc;
        tavor_hw_snd_wqe_atomic_t       *at;
        tavor_hw_snd_wqe_remaddr_t      *uc;
        tavor_hw_snd_wqe_bind_t         *bn;
        tavor_hw_wqe_sgl_t              *ds;
        ibt_wr_ds_t                     *sgl;
        tavor_ahhdl_t                   ah;
        uint32_t                        nds;
        int                             i, num_ds, status;

        TAVOR_TNF_ENTER(tavor_wqe_send_build);

        ASSERT(MUTEX_HELD(&qp->qp_lock));

        /* Initialize the information for the Data Segments */
        ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc +
            sizeof (tavor_hw_snd_wqe_nextctrl_t));
        nds = wr->wr_nds;
        sgl = wr->wr_sgl;
        num_ds = 0;

        /*
         * Building a Send WQE depends first and foremost on the transport
         * type of the Work Request (i.e. UD, RC, or UC)
         */
        switch (wr->wr_trans) {
        case IBT_UD_SRV:
                /* Ensure that work request transport type matches QP type */
                if (qp->qp_serv_type != TAVOR_QP_UD) {
                        TNF_PROBE_0(tavor_wqe_send_build_inv_servtype_fail,
                            TAVOR_TNF_ERROR, "");
                        TAVOR_TNF_EXIT(tavor_wqe_send_build);
                        return (IBT_QP_SRV_TYPE_INVALID);
                }

                /*
                 * Validate the operation type.  For UD requests, only the
                 * "Send" operation is valid
                 */
                if (wr->wr_opcode != IBT_WRC_SEND) {
                        TNF_PROBE_0(tavor_wqe_send_build_inv_optype_fail,
                            TAVOR_TNF_ERROR, "");
                        TAVOR_TNF_EXIT(tavor_wqe_send_build);
                        return (IBT_QP_OP_TYPE_INVALID);
                }

                /*
                 * If this is a Special QP (QP0 or QP1), then we need to
                 * build MLX WQEs instead.  So jump to tavor_wqe_mlx_build()
                 * and return whatever status it returns
                 */
                if (qp->qp_is_special) {
                        status = tavor_wqe_mlx_build(state, qp, wr, desc, size);
                        TAVOR_TNF_EXIT(tavor_wqe_send_build);
                        return (status);
                }

                /*
                 * Otherwise, if this is a normal UD Send request, then fill
                 * all the fields in the Tavor UD header for the WQE.  Note:
                 * to do this we'll need to extract some information from the
                 * Address Handle passed with the work request.
                 */
                ud = (tavor_hw_snd_wqe_ud_t *)((uintptr_t)desc +
                    sizeof (tavor_hw_snd_wqe_nextctrl_t));
                ah = (tavor_ahhdl_t)wr->wr.ud.udwr_dest->ud_ah;
                if (ah == NULL) {
                        TNF_PROBE_0(tavor_wqe_send_build_invahhdl_fail,
                            TAVOR_TNF_ERROR, "");
                        TAVOR_TNF_EXIT(tavor_wqe_send_build);
                        return (IBT_AH_HDL_INVALID);
                }

                /*
                 * Build the Unreliable Datagram Segment for the WQE, using
                 * the information from the address handle and the work
                 * request.
                 */
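                /*
                 * (The AH lock is presumably held across the copy because
                 * address handle contents can change after creation;
                 * holding ah_lock gives the WQE a consistent snapshot of
                 * the UD address vector.)
                 */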
1025                 mutex_enter(&ah->ah_lock);
1026                 TAVOR_WQE_BUILD_UD(qp, ud, ah, wr);
1027                 mutex_exit(&ah->ah_lock);
1028 
1029                 /* Update "ds" for filling in Data Segments (below) */
1030                 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)ud +
1031                     sizeof (tavor_hw_snd_wqe_ud_t));
1032                 break;
1033 
1034         case IBT_RC_SRV:
1035                 /* Ensure that work request transport type matches QP type */
1036                 if (qp->qp_serv_type != TAVOR_QP_RC) {
1037                         TNF_PROBE_0(tavor_wqe_send_build_inv_servtype_fail,
1038                             TAVOR_TNF_ERROR, "");
1039                         TAVOR_TNF_EXIT(tavor_wqe_send_build);
1040                         return (IBT_QP_SRV_TYPE_INVALID);
1041                 }
1042 
1043                 /*
1044                  * Validate the operation type.  For RC requests, we allow
1045                  * "Send", "RDMA Read", "RDMA Write", various "Atomic"
1046                  * operations, and memory window "Bind"
1047                  */
1048                 if ((wr->wr_opcode != IBT_WRC_SEND) &&
1049                     (wr->wr_opcode != IBT_WRC_RDMAR) &&
1050                     (wr->wr_opcode != IBT_WRC_RDMAW) &&
1051                     (wr->wr_opcode != IBT_WRC_CSWAP) &&
1052                     (wr->wr_opcode != IBT_WRC_FADD) &&
1053                     (wr->wr_opcode != IBT_WRC_BIND)) {
1054                         TNF_PROBE_0(tavor_wqe_send_build_inv_optype_fail,
1055                             TAVOR_TNF_ERROR, "");
1056                         TAVOR_TNF_EXIT(tavor_wqe_send_build);
1057                         return (IBT_QP_OP_TYPE_INVALID);
1058                 }
1059 
1060                 /*
1061                  * If this is a Send request, then all we need to do is break
1062                  * out and here and begin the Data Segment processing below
1063                  */
1064                 if (wr->wr_opcode == IBT_WRC_SEND) {
1065                         break;
1066                 }
1067 
1068                 /*
1069                  * If this is an RDMA Read or RDMA Write request, then fill
1070                  * in the "Remote Address" header fields.
1071                  */
1072                 if ((wr->wr_opcode == IBT_WRC_RDMAR) ||
1073                     (wr->wr_opcode == IBT_WRC_RDMAW)) {
1074                         rc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1075                             sizeof (tavor_hw_snd_wqe_nextctrl_t));
1076 
1077                         /*
1078                          * Build the Remote Address Segment for the WQE, using
1079                          * the information from the RC work request.
1080                          */
1081                         TAVOR_WQE_BUILD_REMADDR(qp, rc, &wr->wr.rc.rcwr.rdma);
1082 
1083                         /* Update "ds" for filling in Data Segments (below) */
1084                         ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)rc +
1085                             sizeof (tavor_hw_snd_wqe_remaddr_t));
1086                         break;
1087                 }
1088 
1089                 /*
1090                  * If this is one of the Atomic type operations (i.e
1091                  * Compare-Swap or Fetch-Add), then fill in both the "Remote
1092                  * Address" header fields and the "Atomic" header fields.
1093                  */
1094                 if ((wr->wr_opcode == IBT_WRC_CSWAP) ||
1095                     (wr->wr_opcode == IBT_WRC_FADD)) {
1096                         rc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1097                             sizeof (tavor_hw_snd_wqe_nextctrl_t));
1098                         at = (tavor_hw_snd_wqe_atomic_t *)((uintptr_t)rc +
1099                             sizeof (tavor_hw_snd_wqe_remaddr_t));
1100 
1101                         /*
1102                          * Build the Remote Address and Atomic Segments for
1103                          * the WQE, using the information from the RC Atomic
1104                          * work request.
1105                          */
1106                         TAVOR_WQE_BUILD_RC_ATOMIC_REMADDR(qp, rc, wr);
1107                         TAVOR_WQE_BUILD_ATOMIC(qp, at, wr->wr.rc.rcwr.atomic);
1108 
1109                         /* Update "ds" for filling in Data Segments (below) */
1110                         ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)at +
1111                             sizeof (tavor_hw_snd_wqe_atomic_t));
1112 
1113                         /*
1114                          * Update "nds" and "sgl" because Atomic requests have
1115                          * only a single Data Segment (and they are encoded
1116                          * somewhat differently in the work request.
1117                          */
1118                         nds = 1;
1119                         sgl = wr->wr_sgl;
1120                         break;
1121                 }
1122 
1123                 /*
1124                  * If this is memory window Bind operation, then we call the
1125                  * tavor_wr_bind_check() routine to validate the request and
1126                  * to generate the updated RKey.  If this is successful, then
1127                  * we fill in the WQE's "Bind" header fields.
1128                  */
1129                 if (wr->wr_opcode == IBT_WRC_BIND) {
1130                         status = tavor_wr_bind_check(state, wr);
1131                         if (status != DDI_SUCCESS) {
1132                                 TNF_PROBE_0(tavor_wqe_send_build_bind_fail,
1133                                     TAVOR_TNF_ERROR, "");
1134                                 TAVOR_TNF_EXIT(tavor_wqe_send_build);
1135                                 return (status);
1136                         }
1137 
1138                         bn = (tavor_hw_snd_wqe_bind_t *)((uintptr_t)desc +
1139                             sizeof (tavor_hw_snd_wqe_nextctrl_t));
1140 
1141                         /*
1142                          * Build the Bind Memory Window Segments for the WQE,
1143                          * using the information from the RC Bind memory
1144                          * window work request.
1145                          */
1146                         TAVOR_WQE_BUILD_BIND(qp, bn, wr->wr.rc.rcwr.bind);
1147 
1148                         /*
1149                          * Update the "ds" pointer.  Even though the "bind"
1150                          * operation requires no SGLs, this is necessary to
1151                          * facilitate the correct descriptor size calculations
1152                          * (below).
1153                          */
1154                         ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)bn +
1155                             sizeof (tavor_hw_snd_wqe_bind_t));
1156                         nds = 0;
1157                 }
1158                 break;
1159 
1160         case IBT_UC_SRV:
1161                 /* Ensure that work request transport type matches QP type */
1162                 if (qp->qp_serv_type != TAVOR_QP_UC) {
1163                         TNF_PROBE_0(tavor_wqe_send_build_inv_servtype_fail,
1164                             TAVOR_TNF_ERROR, "");
1165                         TAVOR_TNF_EXIT(tavor_wqe_send_build);
1166                         return (IBT_QP_SRV_TYPE_INVALID);
1167                 }
1168 
1169                 /*
1170                  * Validate the operation type.  For UC requests, we only
1171                  * allow "Send", "RDMA Write", and memory window "Bind".
1172                  * Note: Unlike RC, UC does not allow "RDMA Read" or "Atomic"
                 * operations.
1174                  */
1175                 if ((wr->wr_opcode != IBT_WRC_SEND) &&
1176                     (wr->wr_opcode != IBT_WRC_RDMAW) &&
1177                     (wr->wr_opcode != IBT_WRC_BIND)) {
1178                         TNF_PROBE_0(tavor_wqe_send_build_inv_optype_fail,
1179                             TAVOR_TNF_ERROR, "");
1180                         TAVOR_TNF_EXIT(tavor_wqe_send_build);
1181                         return (IBT_QP_OP_TYPE_INVALID);
1182                 }
1183 
1184                 /*
                 * If this is a Send request, then all we need to do is break
                 * out here and begin the Data Segment processing below.
1187                  */
1188                 if (wr->wr_opcode == IBT_WRC_SEND) {
1189                         break;
1190                 }
1191 
1192                 /*
1193                  * If this is an RDMA Write request, then fill in the "Remote
1194                  * Address" header fields.
1195                  */
1196                 if (wr->wr_opcode == IBT_WRC_RDMAW) {
1197                         uc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1198                             sizeof (tavor_hw_snd_wqe_nextctrl_t));
1199 
1200                         /*
1201                          * Build the Remote Address Segment for the WQE, using
1202                          * the information from the UC work request.
1203                          */
1204                         TAVOR_WQE_BUILD_REMADDR(qp, uc, &wr->wr.uc.ucwr.rdma);
1205 
1206                         /* Update "ds" for filling in Data Segments (below) */
1207                         ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)uc +
1208                             sizeof (tavor_hw_snd_wqe_remaddr_t));
1209                         break;
1210                 }
1211 
1212                 /*
                 * If this is a memory window Bind operation, then we call the
1214                  * tavor_wr_bind_check() routine to validate the request and
1215                  * to generate the updated RKey.  If this is successful, then
1216                  * we fill in the WQE's "Bind" header fields.
1217                  */
1218                 if (wr->wr_opcode == IBT_WRC_BIND) {
1219                         status = tavor_wr_bind_check(state, wr);
1220                         if (status != DDI_SUCCESS) {
1221                                 TNF_PROBE_0(tavor_wqe_send_build_bind_fail,
1222                                     TAVOR_TNF_ERROR, "");
1223                                 TAVOR_TNF_EXIT(tavor_wqe_send_build);
1224                                 return (status);
1225                         }
1226 
1227                         bn = (tavor_hw_snd_wqe_bind_t *)((uintptr_t)desc +
1228                             sizeof (tavor_hw_snd_wqe_nextctrl_t));
1229 
1230                         /*
1231                          * Build the Bind Memory Window Segments for the WQE,
1232                          * using the information from the UC Bind memory
1233                          * window work request.
1234                          */
1235                         TAVOR_WQE_BUILD_BIND(qp, bn, wr->wr.uc.ucwr.bind);
1236 
1237                         /*
1238                          * Update the "ds" pointer.  Even though the "bind"
1239                          * operation requires no SGLs, this is necessary to
1240                          * facilitate the correct descriptor size calculations
1241                          * (below).
1242                          */
1243                         ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)bn +
1244                             sizeof (tavor_hw_snd_wqe_bind_t));
1245                         nds = 0;
1246                 }
1247                 break;
1248 
1249         default:
                TNF_PROBE_0(tavor_wqe_send_build_inv_transport_fail,
1251                     TAVOR_TNF_ERROR, "");
1252                 TAVOR_TNF_EXIT(tavor_wqe_send_build);
1253                 return (IBT_QP_SRV_TYPE_INVALID);
1254         }
1255 
1256         /*
1257          * Now fill in the Data Segments (SGL) for the Send WQE based on
         * the values set up above (i.e. "sgl", "nds", and the "ds" pointer).
         * Start by checking for a valid number of SGL entries.
1260          */
1261         if (nds > qp->qp_sq_sgl) {
1262                 TNF_PROBE_0(tavor_wqe_send_build_toomanysgl_fail,
1263                     TAVOR_TNF_ERROR, "");
1264                 TAVOR_TNF_EXIT(tavor_wqe_send_build);
1265                 return (IBT_QP_SGL_LEN_INVALID);
1266         }
1267 
1268         /*
1269          * For each SGL in the Send Work Request, fill in the Send WQE's data
1270          * segments.  Note: We skip any SGL with zero size because Tavor
1271          * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1272          * the encoding for zero means a 2GB transfer.  Because of this special
1273          * encoding in the hardware, we mask the requested length with
1274          * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
         * zero).
1276          */
1277         for (i = 0; i < nds; i++) {
1278                 if (sgl[i].ds_len == 0) {
1279                         continue;
1280                 }
1281 
1282                 /*
1283                  * Fill in the Data Segment(s) for the current WQE, using the
1284                  * information contained in the scatter-gather list of the
1285                  * work request.
1286                  */
1287                 TAVOR_WQE_BUILD_DATA_SEG(qp, &ds[num_ds], &sgl[i]);
1288                 num_ds++;
1289         }
1290 
1291         /* Return the size of descriptor (in 16-byte chunks) */
1292         *size = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc) >> 4;
1293 
1294         TAVOR_TNF_EXIT(tavor_wqe_send_build);
1295         return (DDI_SUCCESS);
1296 }
1297 
1298 
1299 /*
1300  * tavor_wqe_send_linknext()
1301  *    Context: Can be called from interrupt or base context.
1302  */
1303 static void
1304 tavor_wqe_send_linknext(ibt_send_wr_t *curr_wr, ibt_send_wr_t *prev_wr,
1305     uint64_t *curr_desc, uint_t curr_descsz, uint64_t *prev_desc,
1306     tavor_sw_wqe_dbinfo_t *dbinfo, tavor_qphdl_t qp)
1307 {
1308         uint64_t        next, ctrl;
1309         uint32_t        nopcode, fence;
1310 
1311         /*
1312          * Calculate the "next" field of the descriptor.  This amounts to
1313          * setting up the "next_wqe_addr", "nopcode", "fence", and "nds"
1314          * fields (see tavor_hw.h for more).  Note:  If there is no next
1315          * descriptor (i.e. if the current descriptor is the last WQE on
1316          * the chain), then set "next" to zero.
1317          */
1318         if (curr_desc != NULL) {
1319                 /*
1320                  * Determine the value for the Tavor WQE "nopcode" field
1321                  * by using the IBTF opcode from the work request
1322                  */
1323                 switch (curr_wr->wr_opcode) {
1324                 case IBT_WRC_RDMAW:
1325                         if (curr_wr->wr_flags & IBT_WR_SEND_IMMED) {
1326                                 nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAWI;
1327                         } else {
1328                                 nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAW;
1329                         }
1330                         break;
1331 
1332                 case IBT_WRC_SEND:
1333                         if (curr_wr->wr_flags & IBT_WR_SEND_IMMED) {
1334                                 nopcode = TAVOR_WQE_SEND_NOPCODE_SENDI;
1335                         } else {
1336                                 nopcode = TAVOR_WQE_SEND_NOPCODE_SEND;
1337                         }
1338                         break;
1339 
1340                 case IBT_WRC_RDMAR:
1341                         nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAR;
1342                         break;
1343 
1344                 case IBT_WRC_CSWAP:
1345                         nopcode = TAVOR_WQE_SEND_NOPCODE_ATMCS;
1346                         break;
1347 
1348                 case IBT_WRC_FADD:
1349                         nopcode = TAVOR_WQE_SEND_NOPCODE_ATMFA;
1350                         break;
1351 
1352                 case IBT_WRC_BIND:
1353                         nopcode = TAVOR_WQE_SEND_NOPCODE_BIND;
1354                         break;
1355                 }
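
                /*
                 * Note: no "default" case is needed above because
                 * tavor_wqe_send_build() has already rejected any work
                 * request with an unsupported opcode, so "nopcode" is
                 * always set by one of the cases in this switch.
                 */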
1356 
1357                 curr_desc = (uint64_t *)(uintptr_t)((uintptr_t)curr_desc
1358                     - qp->qp_desc_off);
1359                 next  = ((uint64_t)(uintptr_t)curr_desc &
1360                     TAVOR_WQE_NDA_MASK) << 32;
1361                 next  = next | ((uint64_t)nopcode << 32);
1362                 fence = (curr_wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0;
1363                 if (fence) {
1364                         next = next | TAVOR_WQE_SEND_FENCE_MASK;
1365                 }
1366                 next = next | (curr_descsz & TAVOR_WQE_NDS_MASK);
1367 
1368                 /*
1369                  * If a send queue doorbell will be rung for the next
1370                  * WQE on the chain, then set the current WQE's "dbd" bit.
1371                  * Note: We also update the "dbinfo" structure here to pass
1372                  * back information about what should (later) be included
1373                  * in the send queue doorbell.
1374                  */
1375                 if (dbinfo) {
1376                         next = next | TAVOR_WQE_DBD_MASK;
1377                         dbinfo->db_nopcode = nopcode;
1378                         dbinfo->db_fence   = fence;
1379                 }
1380         } else {
1381                 next = 0;
1382         }
1383 
1384         /*
1385          * If this WQE is supposed to be linked to the previous descriptor,
1386          * then we need to update not only the previous WQE's "next" fields
1387          * but we must also update this WQE's "ctrl" fields (i.e. the "c", "e",
1388          * "s", "i" and "immediate" fields - see tavor_hw.h for more).  Note:
1389          * the "e" bit is always hardcoded to zero.
1390          */
1391         if (prev_desc != NULL) {
1392                 /*
1393                  * If a send queue doorbell will be rung for the next WQE on
1394                  * the chain, then update the current WQE's "next" field and
1395                  * return.
1396                  * Note: We don't want to modify the "ctrl" field here because
1397                  * that portion of the previous WQE has already been set
1398                  * correctly at some previous point in time.
1399                  */
1400                 if (dbinfo) {
1401                         TAVOR_WQE_LINKFIRST(qp, prev_desc, next);
1402                         return;
1403                 }
1404 
1405                 ctrl = 0;
1406 
1407                 /* Set the "c" (i.e. "signaled") bit appropriately */
1408                 if (prev_wr->wr_flags & IBT_WR_SEND_SIGNAL) {
1409                         ctrl = ctrl | TAVOR_WQE_SEND_SIGNALED_MASK;
1410                 }
1411 
1412                 /* Set the "s" (i.e. "solicited") bit appropriately */
1413                 if (prev_wr->wr_flags & IBT_WR_SEND_SOLICIT) {
1414                         ctrl = ctrl | TAVOR_WQE_SEND_SOLICIT_MASK;
1415                 }
1416 
1417                 /* Set the "i" bit and the immediate data appropriately */
1418                 if (prev_wr->wr_flags & IBT_WR_SEND_IMMED) {
1419                         ctrl = ctrl | TAVOR_WQE_SEND_IMMEDIATE_MASK;
1420                         ctrl = ctrl | tavor_wr_get_immediate(prev_wr);
1421                 }
1422 
1423                 TAVOR_WQE_LINKNEXT(qp, prev_desc, ctrl, next);
1424         }
1425 }
1426 
1427 
1428 /*
1429  * tavor_wqe_mlx_build()
1430  *    Context: Can be called from interrupt or base context.
1431  */
1432 static int
1433 tavor_wqe_mlx_build(tavor_state_t *state, tavor_qphdl_t qp,
1434     ibt_send_wr_t *wr, uint64_t *desc, uint_t *size)
1435 {
1436         tavor_hw_udav_t         udav;
1437         tavor_ahhdl_t           ah;
1438         ib_lrh_hdr_t            *lrh;
1439         ib_grh_t                *grh;
1440         ib_bth_hdr_t            *bth;
1441         ib_deth_hdr_t           *deth;
1442         tavor_hw_wqe_sgl_t      *ds;
1443         ibt_wr_ds_t             *sgl;
1444         uint8_t                 *mgmtclass, *hpoint, *hcount;
1445         uint64_t                data;
1446         uint32_t                nds, offset, pktlen;
1447         uint32_t                desc_sz, udav_sz;
1448         int                     i, num_ds;
1449 
1450         TAVOR_TNF_ENTER(tavor_wqe_mlx_build);
1451 
1452         ASSERT(MUTEX_HELD(&qp->qp_lock));
1453 
1454         /* Initialize the information for the Data Segments */
1455         ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc +
1456             sizeof (tavor_hw_mlx_wqe_nextctrl_t));
1457 
1458         /*
1459          * Pull the address handle from the work request and read in
1460          * the contents of the UDAV.  This will be used to answer some
1461          * questions about the request.
1462          */
1463         ah = (tavor_ahhdl_t)wr->wr.ud.udwr_dest->ud_ah;
1464         if (ah == NULL) {
1465                 TNF_PROBE_0(tavor_wqe_mlx_build_invahhdl_fail,
1466                     TAVOR_TNF_ERROR, "");
1467                 TAVOR_TNF_EXIT(tavor_wqe_mlx_build);
1468                 return (IBT_AH_HDL_INVALID);
1469         }
1470         mutex_enter(&ah->ah_lock);
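        /*
         * Copy the UDAV contents out of the DDI-mapped UDAV resource, one
         * 64-bit word at a time, into the local "udav" structure.
         */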
1471         udav_sz = sizeof (tavor_hw_udav_t) >> 3;
1472         for (i = 0; i < udav_sz; i++) {
1473                 data = ddi_get64(ah->ah_udavrsrcp->tr_acchdl,
1474                     ((uint64_t *)ah->ah_udavrsrcp->tr_addr + i));
1475                 ((uint64_t *)&udav)[i] = data;
1476         }
1477         mutex_exit(&ah->ah_lock);
1478 
1479         /*
1480          * If the request is for QP1 and the destination LID is equal to
1481          * the Permissive LID, then return an error.  This combination is
         * not allowed.
1483          */
1484         if ((udav.rlid == IB_LID_PERMISSIVE) &&
1485             (qp->qp_is_special == TAVOR_QP_GSI)) {
1486                 TNF_PROBE_0(tavor_wqe_mlx_build_permissiveLIDonQP1_fail,
1487                     TAVOR_TNF_ERROR, "");
1488                 TAVOR_TNF_EXIT(tavor_wqe_mlx_build);
1489                 return (IBT_AH_HDL_INVALID);
1490         }
1491 
1492         /*
1493          * Calculate the size of the packet headers, including the GRH
         * (if necessary).
1495          */
1496         desc_sz = sizeof (ib_lrh_hdr_t) + sizeof (ib_bth_hdr_t) +
1497             sizeof (ib_deth_hdr_t);
1498         if (udav.grh) {
1499                 desc_sz += sizeof (ib_grh_t);
1500         }
1501 
1502         /*
1503          * Begin to build the first "inline" data segment for the packet
1504          * headers.  Note:  By specifying "inline" we can build the contents
         * of the MAD packet headers directly into the work queue (as part of
         * the descriptor).  This has the advantage of both speeding things up
1507          * and of not requiring the driver to allocate/register any additional
1508          * memory for the packet headers.
1509          */
1510         TAVOR_WQE_BUILD_INLINE(qp, &ds[0], desc_sz);
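        /*
         * Add 4 bytes to "desc_sz" to account for the ICRC word that
         * terminates the MAD packet (it is built as its own inline data
         * segment at the end of this routine), so that the ICRC is
         * reflected in the packet length calculated below.
         */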
1511         desc_sz += 4;
1512 
1513         /*
1514          * Build Local Route Header (LRH)
1515          *    We start here by building the LRH into a temporary location.
1516          *    When we have finished we copy the LRH data into the descriptor.
1517          *
1518          *    Notice that the VL values are hardcoded.  This is not a problem
1519          *    because VL15 is decided later based on the value in the MLX
1520          *    transport "next/ctrl" header (see the "vl15" bit below), and it
1521          *    is otherwise (meaning for QP1) chosen from the SL-to-VL table
         *    values.  This rule does not hold for loopback packets, however
         *    (all of which bypass the SL-to-VL tables), and it is the reason
         *    that non-QP0 MADs are set up with VL hardcoded to zero below.
1525          *
1526          *    Notice also that Source LID is hardcoded to the Permissive LID
1527          *    (0xFFFF).  This is also not a problem because if the Destination
1528          *    LID is not the Permissive LID, then the "slr" value in the MLX
1529          *    transport "next/ctrl" header will be set to zero and the hardware
         *    will pull the LID from the value in the port.
1531          */
1532         lrh = (ib_lrh_hdr_t *)((uintptr_t)&ds[0] + 4);
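        /*
         * The LRH "PktLen" field counts 4-byte words; the 0x100 added below
         * accounts for the fixed 256-byte MAD payload on top of the header
         * and ICRC bytes already in "desc_sz".
         */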
1533         pktlen = (desc_sz + 0x100) >> 2;
1534         TAVOR_WQE_BUILD_MLX_LRH(lrh, qp, udav, pktlen);
1535 
1536         /*
1537          * Build Global Route Header (GRH)
1538          *    This is only built if necessary as defined by the "grh" bit in
1539          *    the address vector.  Note:  We also calculate the offset to the
1540          *    next header (BTH) based on whether or not the "grh" bit is set.
1541          */
1542         if (udav.grh) {
1543                 /*
1544                  * If the request is for QP0, then return an error.  The
                 * combination of global routing (GRH) and QP0 is not allowed.
1546                  */
1547                 if (qp->qp_is_special == TAVOR_QP_SMI) {
1548                         TNF_PROBE_0(tavor_wqe_mlx_build_GRHonQP0_fail,
1549                             TAVOR_TNF_ERROR, "");
1550                         TAVOR_TNF_EXIT(tavor_wqe_mlx_build);
1551                         return (IBT_AH_HDL_INVALID);
1552                 }
1553                 grh = (ib_grh_t *)((uintptr_t)lrh + sizeof (ib_lrh_hdr_t));
1554                 TAVOR_WQE_BUILD_MLX_GRH(state, grh, qp, udav, pktlen);
1555 
1556                 bth = (ib_bth_hdr_t *)((uintptr_t)grh + sizeof (ib_grh_t));
1557         } else {
1558                 bth = (ib_bth_hdr_t *)((uintptr_t)lrh + sizeof (ib_lrh_hdr_t));
1559         }
1562         /*
1563          * Build Base Transport Header (BTH)
1564          *    Notice that the M, PadCnt, and TVer fields are all set
         *    to zero implicitly.  This is true for all Management
         *    Datagrams (MADs), whether GSI or SMI.
1567          */
1568         TAVOR_WQE_BUILD_MLX_BTH(state, bth, qp, wr);
1569 
1570         /*
1571          * Build Datagram Extended Transport Header (DETH)
1572          */
1573         deth = (ib_deth_hdr_t *)((uintptr_t)bth + sizeof (ib_bth_hdr_t));
1574         TAVOR_WQE_BUILD_MLX_DETH(deth, qp);
1575 
1576         /* Ensure that the Data Segment is aligned on a 16-byte boundary */
1577         ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)deth + sizeof (ib_deth_hdr_t));
1578         ds = (tavor_hw_wqe_sgl_t *)(((uintptr_t)ds + 0xF) & ~0xF);
1579         nds = wr->wr_nds;
1580         sgl = wr->wr_sgl;
1581         num_ds = 0;
1582 
1583         /*
1584          * Now fill in the Data Segments (SGL) for the MLX WQE based on the
         * values set up above (i.e. "sgl", "nds", and the "ds" pointer).
         * Start by checking for a valid number of SGL entries.
1587          */
1588         if (nds > qp->qp_sq_sgl) {
1589                 TNF_PROBE_0(tavor_wqe_mlx_build_toomanysgl_fail,
1590                     TAVOR_TNF_ERROR, "");
1591                 TAVOR_TNF_EXIT(tavor_wqe_mlx_build);
1592                 return (IBT_QP_SGL_LEN_INVALID);
1593         }
1594 
1595         /*
1596          * For each SGL in the Send Work Request, fill in the MLX WQE's data
1597          * segments.  Note: We skip any SGL with zero size because Tavor
1598          * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1599          * the encoding for zero means a 2GB transfer.  Because of this special
1600          * encoding in the hardware, we mask the requested length with
1601          * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
         * zero).
1603          */
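        /* State for locating the Directed Route MAD fields (QP0 only) */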
1604         mgmtclass = hpoint = hcount = NULL;
1605         offset = 0;
1606         for (i = 0; i < nds; i++) {
1607                 if (sgl[i].ds_len == 0) {
1608                         continue;
1609                 }
1610 
1611                 /*
1612                  * Fill in the Data Segment(s) for the MLX send WQE, using
1613                  * the information contained in the scatter-gather list of
1614                  * the work request.
1615                  */
1616                 TAVOR_WQE_BUILD_DATA_SEG(qp, &ds[num_ds], &sgl[i]);
1617 
1618                 /*
1619                  * Search through the contents of all MADs posted to QP0 to
1620                  * initialize pointers to the places where Directed Route "hop
1621                  * pointer", "hop count", and "mgmtclass" would be.  Tavor
1622                  * needs these updated (i.e. incremented or decremented, as
1623                  * necessary) by software.
1624                  */
1625                 if (qp->qp_is_special == TAVOR_QP_SMI) {
1626 
1627                         TAVOR_SPECIAL_QP_DRMAD_GET_MGMTCLASS(mgmtclass,
1628                             offset, sgl[i].ds_va, sgl[i].ds_len);
1629 
1630                         TAVOR_SPECIAL_QP_DRMAD_GET_HOPPOINTER(hpoint,
1631                             offset, sgl[i].ds_va, sgl[i].ds_len);
1632 
1633                         TAVOR_SPECIAL_QP_DRMAD_GET_HOPCOUNT(hcount,
1634                             offset, sgl[i].ds_va, sgl[i].ds_len);
1635 
1636                         offset += sgl[i].ds_len;
1637                 }
1638                 num_ds++;
1639         }
1640 
1641         /*
1642          * Tavor's Directed Route MADs need to have the "hop pointer"
1643          * incremented/decremented (as necessary) depending on whether it is
1644          * currently less than or greater than the "hop count" (i.e. whether
1645          * the MAD is a request or a response.)
1646          */
1647         if (qp->qp_is_special == TAVOR_QP_SMI) {
1648                 TAVOR_SPECIAL_QP_DRMAD_DO_HOPPOINTER_MODIFY(*mgmtclass,
1649                     *hpoint, *hcount);
1650         }
1651 
1652         /*
1653          * Now fill in the ICRC Data Segment.  This data segment is inlined
         * just like the packet headers above, but it is only four bytes and
         * set to zero (to indicate that we wish the hardware to generate
         * the ICRC).
1656          */
1657         TAVOR_WQE_BUILD_INLINE_ICRC(qp, &ds[num_ds], 4, 0);
1658         num_ds++;
1659 
1660         /* Return the size of descriptor (in 16-byte chunks) */
        *size = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc) >> 4;
1662 
1663         TAVOR_TNF_EXIT(tavor_wqe_mlx_build);
1664         return (DDI_SUCCESS);
1665 }
1666 
1667 
1668 /*
1669  * tavor_wqe_mlx_linknext()
1670  *    Context: Can be called from interrupt or base context.
1671  */
1672 static void
1673 tavor_wqe_mlx_linknext(ibt_send_wr_t *prev_wr, uint64_t *curr_desc,
1674     uint_t curr_descsz, uint64_t *prev_desc, tavor_sw_wqe_dbinfo_t *dbinfo,
1675     tavor_qphdl_t qp)
1676 {
1677         tavor_hw_udav_t         udav;
1678         tavor_ahhdl_t           ah;
1679         uint64_t                next, ctrl, data;
1680         uint_t                  nopcode;
1681         uint_t                  udav_sz;
1682         int                     i;
1683 
1684         /*
1685          * Calculate the "next" field of the descriptor.  This amounts to
1686          * setting up the "next_wqe_addr", "nopcode", and "nds" fields (see
1687          * tavor_hw.h for more).  Note:  If there is no next descriptor (i.e.
1688          * if the current descriptor is the last WQE on the chain), then set
1689          * "next" to zero.
1690          */
1691         if (curr_desc != NULL) {
1692                 /*
1693                  * The only valid Tavor WQE "nopcode" for MLX transport
1694                  * requests is the "Send" code.
1695                  */
1696                 nopcode = TAVOR_WQE_SEND_NOPCODE_SEND;
                curr_desc = (uint64_t *)(uintptr_t)((uintptr_t)curr_desc -
                    qp->qp_desc_off);
1699                 next = (uint64_t)((uintptr_t)curr_desc &
1700                     TAVOR_WQE_NDA_MASK) << 32;
1701                 next = next | ((uint64_t)nopcode << 32);
1702                 next = next | (curr_descsz & TAVOR_WQE_NDS_MASK);
1703 
1704                 /*
1705                  * If a send queue doorbell will be rung for the next
1706                  * WQE on the chain, then set the current WQE's "dbd" bit.
1707                  * Note: We also update the "dbinfo" structure here to pass
1708                  * back information about what should (later) be included
1709                  * in the send queue doorbell.
1710                  */
1711                 if (dbinfo) {
1712                         next = next | TAVOR_WQE_DBD_MASK;
1713                         dbinfo->db_nopcode = nopcode;
1714                         dbinfo->db_fence   = 0;
1715                 }
1716         } else {
1717                 next = 0;
1718         }
1719 
1720         /*
1721          * If this WQE is supposed to be linked to the previous descriptor,
1722          * then we need to update not only the previous WQE's "next" fields
1723          * but we must also update this WQE's "ctrl" fields (i.e. the "vl15",
1724          * "slr", "max_srate", "sl", "c", "e", "rlid", and "vcrc" fields -
1725          * see tavor_hw.h for more) Note: the "e" bit and "vcrc" fields are
1726          * always hardcoded to zero.
1727          */
1728         if (prev_desc != NULL) {
1729                 /*
1730                  * If a send queue doorbell will be rung for the next WQE on
1731                  * the chain, then update the current WQE's "next" field and
1732                  * return.
1733                  * Note: We don't want to modify the "ctrl" field here because
1734                  * that portion of the previous WQE has already been set
1735                  * correctly at some previous point in time.
1736                  */
1737                 if (dbinfo) {
1738                         TAVOR_WQE_LINKFIRST(qp, prev_desc, next);
1739                         return;
1740                 }
1741 
1742                 /*
1743                  * Pull the address handle from the work request and read in
1744                  * the contents of the UDAV.  This will be used to answer some
1745                  * questions about the request.
1746                  */
1747                 ah = (tavor_ahhdl_t)prev_wr->wr.ud.udwr_dest->ud_ah;
1748                 mutex_enter(&ah->ah_lock);
1749                 udav_sz = sizeof (tavor_hw_udav_t) >> 3;
1750                 for (i = 0; i < udav_sz; i++) {
1751                         data = ddi_get64(ah->ah_udavrsrcp->tr_acchdl,
1752                             ((uint64_t *)ah->ah_udavrsrcp->tr_addr + i));
1753                         ((uint64_t *)&udav)[i] = data;
1754                 }
1755                 mutex_exit(&ah->ah_lock);
1756 
1757                 ctrl = 0;
1758 
1759                 /* Only QP0 uses VL15, otherwise use VL in the packet */
1760                 if (qp->qp_is_special == TAVOR_QP_SMI) {
1761                         ctrl = ctrl | TAVOR_WQE_MLXHDR_VL15_MASK;
1762                 }
1763 
1764                 /*
1765                  * The SLR (Source LID Replace) bit determines whether the
1766                  * source LID for an outgoing MLX packet should come from the
1767                  * PortInfo (SLR = 0) or should be left as it is in the
1768                  * descriptor (SLR = 1).  The latter is necessary for packets
1769                  * to be sent with the Permissive LID.
1770                  */
1771                 if (udav.rlid == IB_LID_PERMISSIVE) {
1772                         ctrl = ctrl | TAVOR_WQE_MLXHDR_SLR_MASK;
1773                 }
1774 
1775                 /* Fill in the max static rate from the address handle */
1776                 ctrl = ctrl | ((uint64_t)udav.max_stat_rate <<
1777                     TAVOR_WQE_MLXHDR_SRATE_SHIFT);
1778 
1779                 /* All VL15 (i.e. SMI) traffic is required to use SL 0 */
1780                 if (qp->qp_is_special != TAVOR_QP_SMI) {
1781                         ctrl = ctrl | ((uint64_t)udav.sl <<
1782                             TAVOR_WQE_MLXHDR_SL_SHIFT);
1783                 }
1784 
1785                 /* Set the "c" (i.e. "signaled") bit appropriately */
1786                 if (prev_wr->wr_flags & IBT_WR_SEND_SIGNAL) {
1787                         ctrl = ctrl | TAVOR_WQE_MLXHDR_SIGNALED_MASK;
1788                 }
1789 
1790                 /* Fill in the destination LID from the address handle */
1791                 ctrl = ctrl | ((uint64_t)udav.rlid <<
1792                     TAVOR_WQE_MLXHDR_RLID_SHIFT);
1793 
1794                 TAVOR_WQE_LINKNEXT(qp, prev_desc, ctrl, next);
1795         }
1796 }
1797 
1798 
1799 /*
1800  * tavor_wqe_recv_build()
1801  *    Context: Can be called from interrupt or base context.
1802  */
1803 /* ARGSUSED */
1804 static int
1805 tavor_wqe_recv_build(tavor_state_t *state, tavor_qphdl_t qp,
1806     ibt_recv_wr_t *wr, uint64_t *desc, uint_t *size)
1807 {
1808         tavor_hw_wqe_sgl_t      *ds;
1809         int                     i, num_ds;
1810 
1811         TAVOR_TNF_ENTER(tavor_wqe_recv_build);
1812 
1813         ASSERT(MUTEX_HELD(&qp->qp_lock));
1814 
1815         /* Check that work request transport type is valid */
1816         if ((qp->qp_serv_type != TAVOR_QP_UD) &&
1817             (qp->qp_serv_type != TAVOR_QP_RC) &&
1818             (qp->qp_serv_type != TAVOR_QP_UC)) {
                TNF_PROBE_0(tavor_wqe_recv_build_inv_servtype_fail,
                    TAVOR_TNF_ERROR, "");
                TAVOR_TNF_EXIT(tavor_wqe_recv_build);
1822                 return (IBT_QP_SRV_TYPE_INVALID);
1823         }
1824 
1825         /* Fill in the Data Segments (SGL) for the Recv WQE */
1826         ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc +
1827             sizeof (tavor_hw_rcv_wqe_nextctrl_t));
1828         num_ds = 0;
1829 
1830         /* Check for valid number of SGL entries */
1831         if (wr->wr_nds > qp->qp_rq_sgl) {
1832                 TNF_PROBE_0(tavor_wqe_recv_build_toomanysgl_fail,
1833                     TAVOR_TNF_ERROR, "");
1834                 TAVOR_TNF_EXIT(tavor_wqe_recv_build);
1835                 return (IBT_QP_SGL_LEN_INVALID);
1836         }
1837 
1838         /*
1839          * For each SGL in the Recv Work Request, fill in the Recv WQE's data
1840          * segments.  Note: We skip any SGL with zero size because Tavor
1841          * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1842          * the encoding for zero means a 2GB transfer.  Because of this special
1843          * encoding in the hardware, we mask the requested length with
1844          * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
         * zero).
1846          */
1847         for (i = 0; i < wr->wr_nds; i++) {
1848                 if (wr->wr_sgl[i].ds_len == 0) {
1849                         continue;
1850                 }
1851 
1852                 /*
1853                  * Fill in the Data Segment(s) for the receive WQE, using the
1854                  * information contained in the scatter-gather list of the
1855                  * work request.
1856                  */
1857                 TAVOR_WQE_BUILD_DATA_SEG(qp, &ds[num_ds], &wr->wr_sgl[i]);
1858                 num_ds++;
1859         }
1860 
1861         /* Return the size of descriptor (in 16-byte chunks) */
        *size = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc) >> 4;
1863 
1864         TAVOR_TNF_EXIT(tavor_wqe_recv_build);
1865         return (DDI_SUCCESS);
1866 }
1867 
1868 
1869 /*
1870  * tavor_wqe_recv_linknext()
1871  *    Context: Can be called from interrupt or base context.
1872  */
1873 static void
1874 tavor_wqe_recv_linknext(uint64_t *curr_desc, uint_t curr_descsz,
1875     uint64_t *prev_desc, tavor_qphdl_t qp)
1876 {
1877         uint64_t        next;
1878 
1879         /*
1880          * Calculate the "next" field of the descriptor.  This amounts to
1881          * setting up the "next_wqe_addr", "dbd", and "nds" fields (see
1882          * tavor_hw.h for more).  Note:  If there is no next descriptor (i.e.
1883          * if the current descriptor is the last WQE on the chain), then set
1884          * "next" field to TAVOR_WQE_DBD_MASK.  This is because the Tavor
1885          * hardware requires the "dbd" bit to be set to one for all Recv WQEs.
1886          * In either case, we must add a single bit in the "reserved" field
1887          * (TAVOR_RCV_WQE_NDA0_WA_MASK) following the NDA.  This is the
1888          * workaround for a known Tavor errata that can cause Recv WQEs with
1889          * zero in the NDA field to behave improperly.
1890          */
1891         if (curr_desc != NULL) {
1892                 curr_desc = (uint64_t *)(uintptr_t)((uintptr_t)curr_desc -
1893                     qp->qp_desc_off);
1894                 next = (uint64_t)((uintptr_t)curr_desc &
1895                     TAVOR_WQE_NDA_MASK) << 32;
1896                 next = next | (curr_descsz & TAVOR_WQE_NDS_MASK) |
1897                     TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;
1898         } else {
1899                 next = TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;
1900         }
1901 
1902         /*
1903          * If this WQE is supposed to be linked to the previous descriptor,
1904          * then we need to update not only the previous WQE's "next" fields
1905          * but we must also update this WQE's "ctrl" fields (i.e. the "c" and
1906          * "e" bits - see tavor_hw.h for more).  Note: both the "c" and "e"
1907          * bits are always hardcoded to zero.
1908          */
1909         if (prev_desc != NULL) {
1910                 TAVOR_WQE_LINKNEXT(qp, prev_desc, 0, next);
1911         }
1912 }
1913 
1914 
1915 /*
1916  * tavor_wqe_srq_build()
1917  *    Context: Can be called from interrupt or base context.
1918  */
1919 /* ARGSUSED */
1920 static int
1921 tavor_wqe_srq_build(tavor_state_t *state, tavor_srqhdl_t srq,
1922     ibt_recv_wr_t *wr, uint64_t *desc)
1923 {
1924         tavor_hw_wqe_sgl_t      *ds;
1925         ibt_wr_ds_t             end_sgl;
1926         int                     i, num_ds;
1927 
        TAVOR_TNF_ENTER(tavor_wqe_srq_build);
1929 
1930         ASSERT(MUTEX_HELD(&srq->srq_lock));
1931 
1932         /* Fill in the Data Segments (SGL) for the Recv WQE */
1933         ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc +
1934             sizeof (tavor_hw_rcv_wqe_nextctrl_t));
1935         num_ds = 0;
1936 
1937         /* Check for valid number of SGL entries */
1938         if (wr->wr_nds > srq->srq_wq_sgl) {
1939                 TNF_PROBE_0(tavor_wqe_srq_build_toomanysgl_fail,
1940                     TAVOR_TNF_ERROR, "");
1941                 TAVOR_TNF_EXIT(tavor_wqe_srq_build);
1942                 return (IBT_QP_SGL_LEN_INVALID);
1943         }
1944 
1945         /*
1946          * For each SGL in the Recv Work Request, fill in the Recv WQE's data
1947          * segments.  Note: We skip any SGL with zero size because Tavor
1948          * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1949          * the encoding for zero means a 2GB transfer.  Because of this special
1950          * encoding in the hardware, we mask the requested length with
1951          * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
         * zero).
1953          */
1954         for (i = 0; i < wr->wr_nds; i++) {
1955                 if (wr->wr_sgl[i].ds_len == 0) {
1956                         continue;
1957                 }
1958 
1959                 /*
1960                  * Fill in the Data Segment(s) for the receive WQE, using the
1961                  * information contained in the scatter-gather list of the
1962                  * work request.
1963                  */
1964                 TAVOR_WQE_BUILD_DATA_SEG_SRQ(srq, &ds[num_ds], &wr->wr_sgl[i]);
1965                 num_ds++;
1966         }
1967 
1968         /*
1969          * For SRQ, if the number of data segments is less than the maximum
         * specified at alloc, then we have to fill in a special "key" entry
         * in the SGL slot immediately after the last valid one in this post
         * request.  We do that here.
1973          */
1974         if (num_ds < srq->srq_wq_sgl) {
1975                 end_sgl.ds_va  = 0;
1976                 end_sgl.ds_len = 0;
1977                 end_sgl.ds_key = 0x1;
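                /*
                 * A "key" value of 1 is the special marker referred to
                 * above; it identifies this entry as the terminator of the
                 * valid SGL entries for this WQE.
                 */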
1978                 TAVOR_WQE_BUILD_DATA_SEG_SRQ(srq, &ds[num_ds], &end_sgl);
1979         }
1980 
1981         TAVOR_TNF_EXIT(tavor_wqe_srq_build);
1982         return (DDI_SUCCESS);
1983 }
1984 
1985 
1986 /*
1987  * tavor_wqe_srq_linknext()
1988  *    Context: Can be called from interrupt or base context.
1989  */
1990 static void
1991 tavor_wqe_srq_linknext(uint64_t *curr_desc, uint64_t *prev_desc,
1992     tavor_srqhdl_t srq)
1993 {
1994         uint64_t        next;
1995 
1996         /*
1997          * Calculate the "next" field of the descriptor.  This amounts to
1998          * setting up the "next_wqe_addr", "dbd", and "nds" fields (see
1999          * tavor_hw.h for more).  Note:  If there is no next descriptor (i.e.
2000          * if the current descriptor is the last WQE on the chain), then set
2001          * "next" field to TAVOR_WQE_DBD_MASK.  This is because the Tavor
2002          * hardware requires the "dbd" bit to be set to one for all Recv WQEs.
2003          * In either case, we must add a single bit in the "reserved" field
2004          * (TAVOR_RCV_WQE_NDA0_WA_MASK) following the NDA.  This is the
2005          * workaround for a known Tavor errata that can cause Recv WQEs with
2006          * zero in the NDA field to behave improperly.
2007          */
2008         if (curr_desc != NULL) {
2009                 curr_desc = (uint64_t *)(uintptr_t)((uintptr_t)curr_desc -
2010                     srq->srq_desc_off);
2011                 next = (uint64_t)((uintptr_t)curr_desc &
2012                     TAVOR_WQE_NDA_MASK) << 32;
2013                 next = next | TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;
2014         } else {
2015                 next = TAVOR_RCV_WQE_NDA0_WA_MASK;
2016         }
2017 
2018         /*
2019          * If this WQE is supposed to be linked to the previous descriptor,
2020          * then we need to update not only the previous WQE's "next" fields
2021          * but we must also update this WQE's "ctrl" fields (i.e. the "c" and
2022          * "e" bits - see tavor_hw.h for more).  Note: both the "c" and "e"
2023          * bits are always hardcoded to zero.
2024          */
2025         if (prev_desc != NULL) {
2026                 TAVOR_WQE_LINKNEXT_SRQ(srq, prev_desc, 0, next);
2027         }
2028 }
2029 
2030 
2031 /*
2032  * tavor_wr_get_immediate()
2033  *    Context: Can be called from interrupt or base context.
2034  */
2035 static uint32_t
2036 tavor_wr_get_immediate(ibt_send_wr_t *wr)
2037 {
2038         /*
2039          * This routine extracts the "immediate data" from the appropriate
2040          * location in the IBTF work request.  Because of the way the
2041          * work request structure is defined, the location for this data
2042          * depends on the actual work request operation type.
2043          */
2044 
2045         /* For RDMA Write, test if RC or UC */
2046         if (wr->wr_opcode == IBT_WRC_RDMAW) {
2047                 if (wr->wr_trans == IBT_RC_SRV) {
2048                         return (wr->wr.rc.rcwr.rdma.rdma_immed);
2049                 } else {  /* IBT_UC_SRV */
2050                         return (wr->wr.uc.ucwr.rdma.rdma_immed);
2051                 }
2052         }
2053 
2054         /* For Send, test if RC, UD, or UC */
2055         if (wr->wr_opcode == IBT_WRC_SEND) {
2056                 if (wr->wr_trans == IBT_RC_SRV) {
2057                         return (wr->wr.rc.rcwr.send_immed);
2058                 } else if (wr->wr_trans == IBT_UD_SRV) {
2059                         return (wr->wr.ud.udwr_immed);
2060                 } else {  /* IBT_UC_SRV */
2061                         return (wr->wr.uc.ucwr.send_immed);
2062                 }
2063         }
2064 
2065         /*
2066          * If any other type of request, then immediate is undefined
2067          */
2068         return (0);
2069 }
2070 
2071 
2072 /*
2073  * tavor_wqe_sync()
2074  *    Context: Can be called from interrupt or base context.
2075  */
2076 static void
2077 tavor_wqe_sync(void *hdl, uint_t sync_from, uint_t sync_to,
2078     uint_t sync_type, uint_t flag)
2079 {
2080         tavor_qphdl_t           qp;
2081         tavor_srqhdl_t          srq;
2082         uint_t                  is_sync_req;
2083         uint64_t                *wqe_from, *wqe_to, *wqe_base, *wqe_top;
2084         ddi_dma_handle_t        dmahdl;
2085         off_t                   offset;
2086         size_t                  length;
2087         uint32_t                qsize;
2088         int                     status;
2089 
2090         TAVOR_TNF_ENTER(tavor_wqe_sync);
2091 
2092         if (sync_type == TAVOR_WR_SRQ) {
2093                 srq = (tavor_srqhdl_t)hdl;
2094                 is_sync_req = srq->srq_sync;
2095                 /* Get the DMA handle from SRQ context */
2096                 dmahdl = srq->srq_mrhdl->mr_bindinfo.bi_dmahdl;
2097         } else {
2098                 qp = (tavor_qphdl_t)hdl;
2099                 is_sync_req = qp->qp_sync;
2100                 /* Get the DMA handle from QP context */
2101                 dmahdl = qp->qp_mrhdl->mr_bindinfo.bi_dmahdl;
2102         }
2103 
2104         /* Determine if the work queues need to be synced or not */
2105         if (is_sync_req == 0) {
2106                 TAVOR_TNF_EXIT(tavor_wqe_sync);
2107                 return;
2108         }
2109 
2110         /*
2111          * Depending on the type of the work queue, we grab information
2112          * about the address ranges we need to DMA sync.
2113          */
2114         if (sync_type == TAVOR_WR_SEND) {
2115                 wqe_from = TAVOR_QP_SQ_ENTRY(qp, sync_from);
2116                 wqe_to   = TAVOR_QP_SQ_ENTRY(qp, sync_to);
2117                 qsize    = qp->qp_sq_bufsz;
2118 
2119                 wqe_base = TAVOR_QP_SQ_ENTRY(qp, 0);
2120                 wqe_top  = TAVOR_QP_SQ_ENTRY(qp, qsize);
2121         } else if (sync_type == TAVOR_WR_RECV) {
2122                 wqe_from = TAVOR_QP_RQ_ENTRY(qp, sync_from);
2123                 wqe_to   = TAVOR_QP_RQ_ENTRY(qp, sync_to);
2124                 qsize    = qp->qp_rq_bufsz;
2125 
2126                 wqe_base = TAVOR_QP_RQ_ENTRY(qp, 0);
2127                 wqe_top  = TAVOR_QP_RQ_ENTRY(qp, qsize);
2128         } else {
2129                 wqe_from = TAVOR_SRQ_WQ_ENTRY(srq, sync_from);
2130                 wqe_to   = TAVOR_SRQ_WQ_ENTRY(srq, sync_to);
2131                 qsize    = srq->srq_wq_bufsz;
2132 
2133                 wqe_base = TAVOR_SRQ_WQ_ENTRY(srq, 0);
2134                 wqe_top  = TAVOR_SRQ_WQ_ENTRY(srq, qsize);
2135         }
2136 
2137         /*
2138          * There are two possible cases for the beginning and end of the WQE
2139          * chain we are trying to sync.  Either this is the simple case, where
2140          * the end of the chain is below the beginning of the chain, or it is
2141          * the "wrap-around" case, where the end of the chain has wrapped over
2142          * the end of the queue.  In the former case, we simply need to
2143          * calculate the span from beginning to end and sync it.  In the latter
2144          * case, however, we need to calculate the span from the top of the
2145          * work queue to the end of the chain and sync that, and then we need
2146          * to find the other portion (from beginning of chain to end of queue)
2147          * and sync that as well.  Note: if the "top to end" span is actually
2148          * zero length, then we don't do a DMA sync because a zero length DMA
2149          * sync unnecessarily syncs the entire work queue.
2150          */
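        /*
         * For example, assuming a 64-entry work queue: a sync from entry 60
         * to entry 4 wraps around, so we first sync entries 0 through 3
         * ("top to end") and then entries 60 through 63 ("beginning to
         * bottom").
         */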
2151         if (wqe_to > wqe_from) {
2152                 /* "From Beginning to End" */
2153                 offset = (off_t)((uintptr_t)wqe_from - (uintptr_t)wqe_base);
2154                 length = (size_t)((uintptr_t)wqe_to - (uintptr_t)wqe_from);
2155 
2156                 status = ddi_dma_sync(dmahdl, offset, length, flag);
2157                 if (status != DDI_SUCCESS) {
2158                         TNF_PROBE_0(tavor_wqe_sync_fail, TAVOR_TNF_ERROR, "");
2159                         TAVOR_TNF_EXIT(tavor_wqe_sync);
2160                         return;
2161                 }
2162         } else {
2163                 /* "From Top to End" */
2164                 offset = (off_t)0;
2165                 length = (size_t)((uintptr_t)wqe_to - (uintptr_t)wqe_base);
2166                 if (length) {
2167                         status = ddi_dma_sync(dmahdl, offset, length, flag);
2168                         if (status != DDI_SUCCESS) {
2169                                 TNF_PROBE_0(tavor_wqe_sync_fail,
2170                                     TAVOR_TNF_ERROR, "");
2171                                 TAVOR_TNF_EXIT(tavor_wqe_sync);
2172                                 return;
2173                         }
2174                 }
2175 
2176                 /* "From Beginning to Bottom" */
2177                 offset = (off_t)((uintptr_t)wqe_from - (uintptr_t)wqe_base);
2178                 length = (size_t)((uintptr_t)wqe_top - (uintptr_t)wqe_from);
2179                 status = ddi_dma_sync(dmahdl, offset, length, flag);
2180                 if (status != DDI_SUCCESS) {
2181                         TNF_PROBE_0(tavor_wqe_sync_fail, TAVOR_TNF_ERROR, "");
2182                         TAVOR_TNF_EXIT(tavor_wqe_sync);
2183                         return;
2184                 }
2185         }
2186 
2187         TAVOR_TNF_EXIT(tavor_wqe_sync);
2188 }
2189 
2190 
2191 /*
2192  * tavor_wr_bind_check()
2193  *    Context: Can be called from interrupt or base context.
2194  */
2195 static int
2196 tavor_wr_bind_check(tavor_state_t *state, ibt_send_wr_t *wr)
2197 {
2198         ibt_bind_flags_t        bind_flags;
2199         uint64_t                vaddr, len;
2200         uint64_t                reg_start_addr, reg_end_addr;
2201         tavor_mwhdl_t           mw;
2202         tavor_mrhdl_t           mr;
2203         tavor_rsrc_t            *mpt;
2204         uint32_t                new_rkey;
2205 
2206         TAVOR_TNF_ENTER(tavor_wr_bind_check);
2207 
2208         /* Check for a valid Memory Window handle in the WR */
2209         mw = (tavor_mwhdl_t)wr->wr.rc.rcwr.bind->bind_ibt_mw_hdl;
2210         if (mw == NULL) {
2211                 TNF_PROBE_0(tavor_wr_bind_check_invmwhdl_fail,
2212                     TAVOR_TNF_ERROR, "");
2213                 TAVOR_TNF_EXIT(tavor_wr_bind_check);
2214                 return (IBT_MW_HDL_INVALID);
2215         }
2216 
2217         /* Check for a valid Memory Region handle in the WR */
2218         mr = (tavor_mrhdl_t)wr->wr.rc.rcwr.bind->bind_ibt_mr_hdl;
2219         if (mr == NULL) {
2220                 TNF_PROBE_0(tavor_wr_bind_check_invmrhdl_fail,
2221                     TAVOR_TNF_ERROR, "");
2222                 TAVOR_TNF_EXIT(tavor_wr_bind_check);
2223                 return (IBT_MR_HDL_INVALID);
2224         }
2225 
2226         mutex_enter(&mr->mr_lock);
2227         mutex_enter(&mw->mr_lock);
2228 
2229         /*
2230          * Check here to see if the memory region has already been partially
2231          * deregistered as a result of a tavor_umap_umemlock_cb() callback.
2232          * If so, this is an error, return failure.
2233          */
2234         if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
2235                 mutex_exit(&mr->mr_lock);
2236                 mutex_exit(&mw->mr_lock);
2237                 TNF_PROBE_0(tavor_wr_bind_check_invmrhdl2_fail,
2238                     TAVOR_TNF_ERROR, "");
2239                 TAVOR_TNF_EXIT(tavor_wr_bind_check);
2240                 return (IBT_MR_HDL_INVALID);
2241         }
2242 
2243         /* Check for a valid Memory Window RKey (i.e. a matching RKey) */
2244         if (mw->mr_rkey != wr->wr.rc.rcwr.bind->bind_rkey) {
2245                 mutex_exit(&mr->mr_lock);
2246                 mutex_exit(&mw->mr_lock);
2247                 TNF_PROBE_0(tavor_wr_bind_check_invrkey_fail,
2248                     TAVOR_TNF_ERROR, "");
2249                 TAVOR_TNF_EXIT(tavor_wr_bind_check);
2250                 return (IBT_MR_RKEY_INVALID);
2251         }
2252 
2253         /* Check for a valid Memory Region LKey (i.e. a matching LKey) */
2254         if (mr->mr_lkey != wr->wr.rc.rcwr.bind->bind_lkey) {
2255                 mutex_exit(&mr->mr_lock);
2256                 mutex_exit(&mw->mr_lock);
2257                 TNF_PROBE_0(tavor_wr_bind_check_invlkey_fail,
2258                     TAVOR_TNF_ERROR, "");
2259                 TAVOR_TNF_EXIT(tavor_wr_bind_check);
2260                 return (IBT_MR_LKEY_INVALID);
2261         }
2262 
2263         /*
2264          * Now check for valid "vaddr" and "len".  Note:  We don't check the
2265          * "vaddr" range when "len == 0" (i.e. on unbind operations)
2266          */
2267         len = wr->wr.rc.rcwr.bind->bind_len;
2268         if (len != 0) {
2269                 vaddr = wr->wr.rc.rcwr.bind->bind_va;
2270                 reg_start_addr = mr->mr_bindinfo.bi_addr;
2271                 reg_end_addr   = mr->mr_bindinfo.bi_addr +
2272                     (mr->mr_bindinfo.bi_len - 1);
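                /*
                 * The two checks below enforce that the bind range
                 * [vaddr, vaddr + len - 1] falls entirely within the
                 * registered region [reg_start_addr, reg_end_addr].
                 */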
2273                 if ((vaddr < reg_start_addr) || (vaddr > reg_end_addr)) {
2274                         mutex_exit(&mr->mr_lock);
2275                         mutex_exit(&mw->mr_lock);
2276                         TNF_PROBE_0(tavor_wr_bind_check_inv_vaddr_fail,
2277                             TAVOR_TNF_ERROR, "");
2278                         TAVOR_TNF_EXIT(tavor_wr_bind_check);
2279                         return (IBT_MR_VA_INVALID);
2280                 }
2281                 vaddr = (vaddr + len) - 1;
2282                 if (vaddr > reg_end_addr) {
2283                         mutex_exit(&mr->mr_lock);
2284                         mutex_exit(&mw->mr_lock);
2285                         TNF_PROBE_0(tavor_wr_bind_check_invlen_fail,
2286                             TAVOR_TNF_ERROR, "");
2287                         TAVOR_TNF_EXIT(tavor_wr_bind_check);
2288                         return (IBT_MR_LEN_INVALID);
2289                 }
2290         }
2291 
2292         /*
2293          * Validate the bind access flags.  Remote Write and Atomic access for
2294          * the Memory Window require that Local Write access be set in the
2295          * corresponding Memory Region.
2296          */
2297         bind_flags = wr->wr.rc.rcwr.bind->bind_flags;
2298         if (((bind_flags & IBT_WR_BIND_WRITE) ||
2299             (bind_flags & IBT_WR_BIND_ATOMIC)) &&
2300             !(mr->mr_accflag & IBT_MR_LOCAL_WRITE)) {
2301                 mutex_exit(&mr->mr_lock);
2302                 mutex_exit(&mw->mr_lock);
2303                 TNF_PROBE_0(tavor_wr_bind_check_invflags_fail,
2304                     TAVOR_TNF_ERROR, "");
2305                 TAVOR_TNF_EXIT(tavor_wr_bind_check);
2306                 return (IBT_MR_ACCESS_REQ_INVALID);
2307         }
2308 
2309         /* Calculate the new RKey for the Memory Window */
2310         mpt = mw->mr_mptrsrcp;
2311         tavor_mr_keycalc(state, mpt->tr_indx, &new_rkey);
2312 
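        /* Return the new RKey to the consumer and update the MW handle */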
2313         wr->wr.rc.rcwr.bind->bind_rkey_out = new_rkey;
2314         mw->mr_rkey = new_rkey;
2315 
2316         mutex_exit(&mr->mr_lock);
2317         mutex_exit(&mw->mr_lock);
2318         TAVOR_TNF_EXIT(tavor_wr_bind_check);
2319         return (DDI_SUCCESS);
2320 }
2321 
2322 
2323 /*
2324  * tavor_wrid_from_reset_handling()
2325  *    Context: Can be called from interrupt or base context.
2326  */
2327 int
2328 tavor_wrid_from_reset_handling(tavor_state_t *state, tavor_qphdl_t qp)
2329 {
2330         tavor_workq_hdr_t       *swq, *rwq;
2331         tavor_wrid_list_hdr_t   *s_wridlist, *r_wridlist;
2332         uint_t                  create_new_swq = 0, create_new_rwq = 0;
2333         uint_t                  create_wql = 0;
2334         uint_t                  qp_srq_en;
2335 
2336         TAVOR_TNF_ENTER(tavor_wrid_from_reset_handling);
2337 
2338         /*
2339          * For each of this QP's Work Queues, make sure we have a (properly
2340          * initialized) Work Request ID list attached to the relevant
2341          * completion queue.  Grab the CQ lock(s) before manipulating the
2342          * lists.
2343          */
2344         tavor_wrid_wqhdr_lock_both(qp);
2345         swq = tavor_wrid_wqhdr_find(qp->qp_sq_cqhdl, qp->qp_qpnum,
2346             TAVOR_WR_SEND);
2347         if (swq == NULL) {
2348                 /* Couldn't find matching work queue header, create it */
2349                 create_new_swq = create_wql = 1;
2350                 swq = tavor_wrid_wqhdr_create(state, qp->qp_sq_cqhdl,
2351                     qp->qp_qpnum, TAVOR_WR_SEND, create_wql);
2352                 if (swq == NULL) {
2353                         /*
2354                          * If we couldn't find/allocate space for the workq
2355                          * header, then drop the lock(s) and return failure.
2356                          */
2357                         tavor_wrid_wqhdr_unlock_both(qp);
2358                         TNF_PROBE_0(tavor_wrid_from_reset_handling_wqhdr_fail,
2359                             TAVOR_TNF_ERROR, "");
2360                         TAVOR_TNF_EXIT(tavor_wrid_from_reset_handling);
2361                         return (ibc_get_ci_failure(0));
2362                 }
2363         }
2364         _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*swq))
2365         qp->qp_sq_wqhdr = swq;
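
        /* Reset the send workq header to reflect an empty queue */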
2366         swq->wq_size = qp->qp_sq_bufsz;
2367         swq->wq_head = 0;
2368         swq->wq_tail = 0;
2369         swq->wq_full = 0;
2370 
2371         /*
2372          * Allocate space for the tavor_wrid_entry_t container
2373          */
2374         s_wridlist = tavor_wrid_get_list(swq->wq_size);
2375         if (s_wridlist == NULL) {
2376                 /*
2377                  * If we couldn't allocate space for tracking the WRID
2378                  * entries, then cleanup the workq header from above (if
2379                  * necessary, i.e. if we created the workq header).  Then
2380                  * drop the lock(s) and return failure.
2381                  */
2382                 if (create_new_swq) {
2383                         tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl, swq);
2384                 }
2385 
2386                 tavor_wrid_wqhdr_unlock_both(qp);
2387                 TNF_PROBE_0(tavor_wrid_from_reset_handling_wridlist_fail,
2388                     TAVOR_TNF_ERROR, "");
2389                 TAVOR_TNF_EXIT(tavor_wrid_from_reset_handling);
2390                 return (ibc_get_ci_failure(0));
2391         }
2392         _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*s_wridlist))
2393         s_wridlist->wl_wqhdr = swq;
2394 
2395         /* Chain the new WRID list container to the workq hdr list */
2396         mutex_enter(&swq->wq_wrid_wql->wql_lock);
2397         tavor_wrid_wqhdr_add(swq, s_wridlist);
2398         mutex_exit(&swq->wq_wrid_wql->wql_lock);
2399 
2400         qp_srq_en = qp->qp_srq_en;
2401 
2402 #ifdef __lock_lint
2403         mutex_enter(&qp->qp_srqhdl->srq_lock);
2404 #else
2405         if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2406                 mutex_enter(&qp->qp_srqhdl->srq_lock);
2407         }
2408 #endif
2409         /*
2410          * Now we repeat all the above operations for the receive work queue,
2411          * or shared receive work queue.
2412          *
2413          * Note: We still use the 'qp_rq_cqhdl' even in the SRQ case.
2414          */
2415         rwq = tavor_wrid_wqhdr_find(qp->qp_rq_cqhdl, qp->qp_qpnum,
2416             TAVOR_WR_RECV);
2417         if (rwq == NULL) {
2418                 create_new_rwq = create_wql = 1;
2419 
2420                 /*
2421                  * If this QP is associated with an SRQ, and this isn't the
2422                  * first QP on the SRQ, then the 'srq_wrid_wql' will already be
2423                  * created.  Since the WQL is created at 'wqhdr_create' time we
2424                  * pass in the flag 'create_wql' here to be 0 if we have
2425                  * already created it.  And later on below we then next setup
2426                  * the WQL and rwq information based off the existing SRQ info.
2427                  */
2428                 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED &&
2429                     qp->qp_srqhdl->srq_wrid_wql != NULL) {
2430                         create_wql = 0;
2431                 }
2432 
2433                 rwq = tavor_wrid_wqhdr_create(state, qp->qp_rq_cqhdl,
2434                     qp->qp_qpnum, TAVOR_WR_RECV, create_wql);
2435                 if (rwq == NULL) {
2436                         /*
2437                          * If we couldn't find/allocate space for the workq
2438                          * header, then free all the send queue resources we
2439                          * just allocated and setup (above), drop the lock(s)
2440                          * and return failure.
2441                          */
2442                         mutex_enter(&swq->wq_wrid_wql->wql_lock);
2443                         tavor_wrid_wqhdr_remove(swq, s_wridlist);
2444                         mutex_exit(&swq->wq_wrid_wql->wql_lock);
2445                         if (create_new_swq) {
2446                                 tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl,
2447                                     swq);
2448                         }
2449 
2450 #ifdef __lock_lint
2451                         mutex_exit(&qp->qp_srqhdl->srq_lock);
2452 #else
2453                         if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2454                                 mutex_exit(&qp->qp_srqhdl->srq_lock);
2455                         }
2456 #endif
2457 
2458                         tavor_wrid_wqhdr_unlock_both(qp);
2459                         TNF_PROBE_0(tavor_wrid_from_reset_handling_wqhdr_fail,
2460                             TAVOR_TNF_ERROR, "");
2461                         TAVOR_TNF_EXIT(tavor_wrid_from_reset_handling);
2462                         return (ibc_get_ci_failure(0));
2463                 }
2464         }
2465         _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*rwq))
2466 
2467         /*
2468          * Setup receive workq hdr
2469          *
2470          * If the QP is on an SRQ, we setup the SRQ specific fields, setting
2471          * keeping a copy of the rwq pointer, setting the rwq bufsize
2472          * appropriately, and initializing our part of the WQLock.
2473          *
2474          * In the normal QP case, the QP recv queue bufsize is used.
2475          */
2476         if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2477                 rwq->wq_size = qp->qp_srqhdl->srq_wq_bufsz;
2478                 if (qp->qp_srqhdl->srq_wrid_wql == NULL) {
2479                         qp->qp_srqhdl->srq_wrid_wql = rwq->wq_wrid_wql;
2480                 } else {
2481                         rwq->wq_wrid_wql = qp->qp_srqhdl->srq_wrid_wql;
2482                 }
2483                 tavor_wql_refcnt_inc(qp->qp_srqhdl->srq_wrid_wql);
2484 
2485         } else {
2486                 rwq->wq_size = qp->qp_rq_bufsz;
2487         }
2488 
2489         qp->qp_rq_wqhdr = rwq;
2490         rwq->wq_head = 0;
2491         rwq->wq_tail = 0;
2492         rwq->wq_full = 0;
2493 
2494         /*
2495          * Allocate space for the tavor_wrid_entry_t container.
2496          *
2497          * If QP is on an SRQ, and the wrq_wridlist is NULL then we must
2498          * allocate the wridlist normally.  However, if the srq_wridlist is !=
2499          * NULL, then we know this SRQ has already been initialized, thus the
2500          * wridlist has already been initialized.  So we re-use the
2501          * srq_wridlist as the r_wridlist for this QP in this case.
2502          */
2503         if (qp_srq_en == TAVOR_QP_SRQ_ENABLED &&
2504             qp->qp_srqhdl->srq_wridlist != NULL) {
2505                 /* Use existing srq_wridlist pointer */
2506                 r_wridlist = qp->qp_srqhdl->srq_wridlist;
2507                 ASSERT(r_wridlist != NULL);
2508         } else {
2509                 /* Allocate memory for the r_wridlist */
2510                 r_wridlist = tavor_wrid_get_list(rwq->wq_size);
2511         }
2512 
2513         /*
2514          * If the memory allocation failed for r_wridlist (or the SRQ pointer
2515          * is mistakenly NULL), we cleanup our previous swq allocation from
2516          * above
2517          */
2518         if (r_wridlist == NULL) {
2519                 /*
2520                  * If we couldn't allocate space for tracking the WRID
2521                  * entries, then cleanup all the stuff from above.  Then
2522                  * drop the lock(s) and return failure.
2523                  */
2524                 mutex_enter(&swq->wq_wrid_wql->wql_lock);
2525                 tavor_wrid_wqhdr_remove(swq, s_wridlist);
2526                 mutex_exit(&swq->wq_wrid_wql->wql_lock);
2527                 if (create_new_swq) {
2528                         tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl, swq);
2529                 }
2530                 if (create_new_rwq) {
2531                         tavor_cq_wqhdr_remove(qp->qp_rq_cqhdl, rwq);
2532                 }
2533 
2534 #ifdef __lock_lint
2535                 mutex_exit(&qp->qp_srqhdl->srq_lock);
2536 #else
2537                 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2538                         mutex_exit(&qp->qp_srqhdl->srq_lock);
2539                 }
2540 #endif
2541 
2542                 tavor_wrid_wqhdr_unlock_both(qp);
2543                 TNF_PROBE_0(tavor_wrid_from_reset_handling_wridlist_fail,
2544                     TAVOR_TNF_ERROR, "");
2545                 TAVOR_TNF_EXIT(tavor_wrid_from_reset_handling);
2546                 return (ibc_get_ci_failure(0));
2547         }
2548         _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*r_wridlist))
2549 
2550         /*
2551          * Initialize the wridlist
2552          *
2553          * In the normal QP case, there is no special initialization needed.
2554          * We simply setup the wridlist backpointer to be the receive wqhdr
2555          * (rwq).
2556          *
         * But in the SRQ case, no backpointer to the wqhdr is possible.
         * Instead we set 'wl_srq_en', specifying that this wridlist is on an
         * SRQ and thus potentially shared across the multiple QPs using that
         * SRQ.  We also set the srq_wridlist pointer to be the r_wridlist and
         * initialize the freelist to an invalid index.  This srq_wridlist
         * pointer is checked above on future transitions from the Reset
         * state, to tell us that the srq_wridlist has already been
         * initialized.
2564          *
2565          * And finally, if we are in a non-UMAP case, we setup the srq wrid
2566          * free list.
2567          */
2568         if (qp_srq_en == TAVOR_QP_SRQ_ENABLED &&
2569             qp->qp_srqhdl->srq_wridlist == NULL) {
2570                 r_wridlist->wl_srq_en = 1;
2571                 r_wridlist->wl_free_list_indx = -1;
2572                 qp->qp_srqhdl->srq_wridlist = r_wridlist;
2573 
2574                 /* Initialize srq wrid free list */
2575                 if (qp->qp_srqhdl->srq_is_umap == 0) {
2576                         mutex_enter(&rwq->wq_wrid_wql->wql_lock);
2577                         tavor_wrid_list_srq_init(r_wridlist, qp->qp_srqhdl, 0);
2578                         mutex_exit(&rwq->wq_wrid_wql->wql_lock);
2579                 }
2580         } else {
2581                 r_wridlist->wl_wqhdr = rwq;
2582         }
2583 
2584         /* Chain the WRID list "container" to the workq hdr list */
2585         mutex_enter(&rwq->wq_wrid_wql->wql_lock);
2586         tavor_wrid_wqhdr_add(rwq, r_wridlist);
2587         mutex_exit(&rwq->wq_wrid_wql->wql_lock);
2588 
2589 #ifdef __lock_lint
2590         mutex_exit(&qp->qp_srqhdl->srq_lock);
2591 #else
2592         if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2593                 mutex_exit(&qp->qp_srqhdl->srq_lock);
2594         }
2595 #endif
2596 
2597         _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*r_wridlist))
2598         _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*rwq))
2599         _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*s_wridlist))
2600         _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*swq))
2601 
2602         tavor_wrid_wqhdr_unlock_both(qp);
2603         TAVOR_TNF_EXIT(tavor_wrid_from_reset_handling);
2604         return (DDI_SUCCESS);
2605 }
2606 
2607 
2608 /*
2609  * tavor_wrid_to_reset_handling()
2610  *    Context: Can be called from interrupt or base context.
2611  */
2612 void
2613 tavor_wrid_to_reset_handling(tavor_state_t *state, tavor_qphdl_t qp)
2614 {
2615         uint_t          free_wqhdr = 0;
2616 
2617         TAVOR_TNF_ENTER(tavor_wrid_to_reset_handling);
2618 
2619         /*
2620          * For each of this QP's Work Queues, move the WRID "container" to
2621          * the "reapable" list.  Although there may still be unpolled
2622          * entries in these containers, it is not a big deal.  We will not
2623          * reap the list until either the Poll CQ command detects an empty
2624          * condition or the CQ itself is freed.  Grab the CQ lock(s) before
2625          * manipulating the lists.
2626          */
2627         mutex_enter(&qp->qp_rq_cqhdl->cq_lock);
2628         tavor_wrid_wqhdr_lock_both(qp);
2629         tavor_wrid_reaplist_add(qp->qp_sq_cqhdl, qp->qp_sq_wqhdr);
2630 
2631         /*
         * Add the receive work queue header on to the reaplist.  But if we
         * are on an SRQ, then don't add anything to the reaplist.  Instead
         * we flush the SRQ entries on the CQ, remove the wridlist from the
         * WQHDR, and free the WQHDR (if needed).  We must hold the WQL for
         * these operations, yet the call to tavor_cq_wqhdr_remove() grabs
         * the WQL internally.  So we drop the WQL before that call.  Then
         * release the CQ WQHDR locks and the CQ lock and return.
2639          */
2640         if (qp->qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2641 
2642                 /*
2643                  * Pull off all (if any) entries for this QP from CQ.  This
2644                  * only includes entries that have not yet been polled
2645                  */
2646                 mutex_enter(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock);
2647                 tavor_cq_srq_entries_flush(state, qp);
2648 
2649                 /* Remove wridlist from WQHDR */
2650                 tavor_wrid_wqhdr_remove(qp->qp_rq_wqhdr,
2651                     qp->qp_rq_wqhdr->wq_wrid_post);
2652 
2653                 /* If wridlist chain is now empty, remove the wqhdr as well */
2654                 if (qp->qp_rq_wqhdr->wq_wrid_post == NULL) {
2655                         free_wqhdr = 1;
2656                 } else {
2657                         free_wqhdr = 0;
2658                 }
2659 
2660                 mutex_exit(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock);
2661 
2662                 /* Free the WQHDR */
2663                 if (free_wqhdr) {
2664                         tavor_cq_wqhdr_remove(qp->qp_rq_cqhdl, qp->qp_rq_wqhdr);
2665                 }
2666         } else {
2667                 tavor_wrid_reaplist_add(qp->qp_rq_cqhdl, qp->qp_rq_wqhdr);
2668         }
2669         tavor_wrid_wqhdr_unlock_both(qp);
2670         mutex_exit(&qp->qp_rq_cqhdl->cq_lock);
2671 
2672         TAVOR_TNF_EXIT(tavor_wrid_to_reset_handling);
2673 }
2674 
2675 
2676 /*
2677  * tavor_wrid_add_entry()
2678  *    Context: Can be called from interrupt or base context.
2679  */
2680 void
2681 tavor_wrid_add_entry(tavor_workq_hdr_t *wq, uint64_t wrid, uint32_t wqeaddrsz,
2682     uint_t signaled_dbd)
2683 {
2684         tavor_wrid_entry_t      *wre_tmp;
2685         uint32_t                head, tail, size;
2686 
2687         TAVOR_TNF_ENTER(tavor_wrid_add_entry);
2688 
2689         ASSERT(MUTEX_HELD(&wq->wq_wrid_wql->wql_lock));
2690 
2691         /*
2692          * Find the entry in the container pointed to by the "tail" index.
2693          * Add all of the relevant information to that entry, including WRID,
2694          * "wqeaddrsz" parameter, and whether it was signaled/unsignaled
2695          * and/or doorbelled.
2696          */
2697         head = wq->wq_wrid_post->wl_head;
2698         tail = wq->wq_wrid_post->wl_tail;
2699         size = wq->wq_wrid_post->wl_size;
2700         wre_tmp = &wq->wq_wrid_post->wl_wre[tail];
        wre_tmp->wr_wrid = wrid;
        wre_tmp->wr_wqeaddrsz = wqeaddrsz;
        wre_tmp->wr_signaled_dbd = signaled_dbd;
2704 
2705         /*
2706          * Update the "wrid_old_tail" pointer to point to the entry we just
2707          * inserted into the queue.  By tracking this pointer (the pointer to
2708          * the most recently inserted entry) it will possible later in the
2709          * PostSend() and PostRecv() code paths to find the entry that needs
2710          * its "doorbelled" flag set (see comment in tavor_post_recv() and/or
2711          * tavor_post_send()).
2712          */
2713         wq->wq_wrid_post->wl_wre_old_tail = wre_tmp;
2714 
2715         /* Update the tail index */
2716         tail = ((tail + 1) & (size - 1));
2717         wq->wq_wrid_post->wl_tail = tail;
2718 
2719         /*
2720          * If the "tail" index has just wrapped over into the "head" index,
2721          * then we have filled the container.  We use the "full" flag to
2722          * indicate this condition and to distinguish it from the "empty"
2723          * condition (where head and tail are also equal).
2724          */
2725         if (head == tail) {
2726                 wq->wq_wrid_post->wl_full = 1;
2727         }
2728         TAVOR_TNF_EXIT(tavor_wrid_add_entry);
2729 }
2730 
2731 /*
2732  * tavor_wrid_add_entry_srq()
 *    Context: Can be called from interrupt or base context.
2734  */
2735 void
2736 tavor_wrid_add_entry_srq(tavor_srqhdl_t srq, uint64_t wrid, uint_t signaled_dbd)
2737 {
2738         tavor_wrid_entry_t      *wre;
2739         uint64_t                *wl_wqe;
2740         uint32_t                wqe_index;
2741 
2742         TAVOR_TNF_ENTER(tavor_wrid_add_entry_srq);
2743 
2744         /*
2745          * Find the next available WQE from the SRQ free_list.  Then update the
2746          * free_list to point to the next entry
2747          */
2748         wl_wqe = TAVOR_SRQ_WQE_ADDR(srq, srq->srq_wridlist->wl_free_list_indx);
2749 
2750         wqe_index = srq->srq_wridlist->wl_free_list_indx;
2751 
2752         /* ASSERT on impossible wqe_index values */
2753         ASSERT(wqe_index < srq->srq_wq_bufsz);
2754 
2755         /*
2756          * Setup the WRE.
2757          *
2758          * Given the 'wqe_index' value, we store the WRID at this WRE offset.
2759          * And we set the WRE to be signaled_dbd so that on poll CQ we can find
2760          * this information and associate the WRID to the WQE found on the CQE.
2761          */
2762         wre = &srq->srq_wridlist->wl_wre[wqe_index];
2763         wre->wr_wrid = wrid;
2764         wre->wr_signaled_dbd  = signaled_dbd;
2765 
2766         /* Update the free list index */
2767         srq->srq_wridlist->wl_free_list_indx = ddi_get32(
2768             srq->srq_wridlist->wl_acchdl, (uint32_t *)wl_wqe);
2769 
2770         TAVOR_TNF_EXIT(tavor_wrid_add_entry_srq);
2771 }
2772 
2773 
2774 /*
2775  * tavor_wrid_get_entry()
2776  *    Context: Can be called from interrupt or base context.
2777  */
2778 uint64_t
2779 tavor_wrid_get_entry(tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe,
2780     tavor_wrid_entry_t *wre)
2781 {
2782         tavor_workq_hdr_t       *wq;
2783         tavor_wrid_entry_t      *wre_tmp;
2784         uint64_t                wrid;
2785         uint_t                  send_or_recv, qpnum, error, opcode;
2786 
2787         TAVOR_TNF_ENTER(tavor_wrid_get_entry);
2788 
2789         /* Lock the list of work queues associated with this CQ */
2790         mutex_enter(&cq->cq_wrid_wqhdr_lock);
2791 
2792         /*
2793          * Determine whether this CQE is a send or receive completion (and
2794          * whether it was a "successful" completion or not)
2795          */
2796         opcode = TAVOR_CQE_OPCODE_GET(cq, cqe);
2797         if ((opcode == TAVOR_CQE_SEND_ERR_OPCODE) ||
2798             (opcode == TAVOR_CQE_RECV_ERR_OPCODE)) {
2799                 error = 1;
2800                 send_or_recv = (opcode == TAVOR_CQE_SEND_ERR_OPCODE) ?
2801                     TAVOR_COMPLETION_SEND : TAVOR_COMPLETION_RECV;
2802         } else {
2803                 error = 0;
2804                 send_or_recv = TAVOR_CQE_SENDRECV_GET(cq, cqe);
2805         }
2806 
2807         /* Find the work queue for this QP number (send or receive side) */
2808         qpnum = TAVOR_CQE_QPNUM_GET(cq, cqe);
2809         wq = tavor_wrid_wqhdr_find(cq, qpnum, send_or_recv);
2810         ASSERT(wq != NULL);
2811 
2812         /*
2813          * Regardless of whether the completion is the result of a "success"
2814          * or a "failure", we lock the list of "containers" and attempt to
2815          * search for the the first matching completion (i.e. the first WR
2816          * with a matching WQE addr and size).  Once we find it, we pull out
2817          * the "wrid" field and return it (see below).  Note: One possible
2818          * future enhancement would be to enable this routine to skip over
2819          * any "unsignaled" completions to go directly to the next "signaled"
2820          * entry on success. XXX
2821          */
2822         mutex_enter(&wq->wq_wrid_wql->wql_lock);
2823         wre_tmp = tavor_wrid_find_match(wq, cq, cqe);
2824 
2825         /*
2826          * If this is a "successful" completion, then we assert that this
2827          * completion must be a "signaled" completion.
2828          */
2829         ASSERT(error || (wre_tmp->wr_signaled_dbd & TAVOR_WRID_ENTRY_SIGNALED));
2830 
2831         /*
2832          * If the completion is a "failed" completion, then we save away the
2833          * contents of the entry (into the "wre" field passed in) for use
2834          * in later CQE processing. Note: We use the tavor_wrid_get_wqeaddrsz()
2835          * function to grab "wqeaddrsz" from the next entry in the container.
         * This is required for error processing (where updating these fields
         * properly is necessary for correct handling of the "error" CQE).
2838          */
2839         if (error && (wre != NULL)) {
2840                 *wre = *wre_tmp;
2841                 wre->wr_wqeaddrsz = tavor_wrid_get_wqeaddrsz(wq);
2842         }
2843 
2844         /* Pull out the WRID and return it */
2845         wrid = wre_tmp->wr_wrid;
2846 
2847         mutex_exit(&wq->wq_wrid_wql->wql_lock);
2848         mutex_exit(&cq->cq_wrid_wqhdr_lock);
2849 
2850         TAVOR_TNF_EXIT(tavor_wrid_get_entry);
2851         return (wrid);
2852 }
2853 
2854 
2855 /*
2856  * tavor_wrid_find_match()
2857  *    Context: Can be called from interrupt or base context.
2858  */
2859 static tavor_wrid_entry_t *
2860 tavor_wrid_find_match(tavor_workq_hdr_t *wq, tavor_cqhdl_t cq,
2861     tavor_hw_cqe_t *cqe)
2862 {
2863         tavor_wrid_entry_t      *curr = NULL;
2864         tavor_wrid_list_hdr_t   *container;
2865         uint32_t                wqeaddr_size;
2866         uint32_t                head, tail, size;
        int                     found = 0, last_container = 0;
2868 
2869         TAVOR_TNF_ENTER(tavor_wrid_find_match);
2870 
2871         ASSERT(MUTEX_HELD(&wq->wq_wrid_wql->wql_lock));
2872 
2873         /* Pull the "wqeaddrsz" information from the CQE */
2874         wqeaddr_size = TAVOR_CQE_WQEADDRSZ_GET(cq, cqe);
2875 
2876         /*
2877          * Walk the "containers" list(s), find first WR with a matching WQE
2878          * addr.  If the current "container" is not the last one on the list,
2879          * i.e. not the current one to which we are posting new WRID entries,
2880          * then we do not attempt to update the "q_head", "q_tail", and
2881          * "q_full" indicators on the main work queue header.  We do, however,
2882          * update the "head" and "full" indicators on the individual containers
2883          * as we go.  This is imperative because we need to be able to
2884          * determine when the current container has been emptied (so that we
2885          * can move on to the next container).
2886          */
2887         container = wq->wq_wrid_poll;
2888         while (container != NULL) {
2889                 /* Is this the last/only "container" on the list */
2890                 last_container = (container != wq->wq_wrid_post) ? 0 : 1;
2891 
2892                 /*
2893                  * First check if we are on an SRQ.  If so, we grab the entry
                 * and break out.  Since SRQ wridlists are never added to the
                 * reaplist, an SRQ wridlist can only be the last container.
2896                  */
2897                 if (container->wl_srq_en) {
2898                         ASSERT(last_container == 1);
2899                         curr = tavor_wrid_find_match_srq(container, cq, cqe);
2900                         break;
2901                 }
2902 
2903                 /*
2904                  * Grab the current "head", "tail" and "size" fields before
2905                  * walking the list in the current container. Note: the "size"
2906                  * field here must always be a power-of-2.  The "full"
2907                  * parameter is checked (and updated) here to distinguish the
2908                  * "queue full" condition from "queue empty".
2909                  */
2910                 head = container->wl_head;
2911                 tail = container->wl_tail;
2912                 size = container->wl_size;
2913                 while ((head != tail) || (container->wl_full)) {
2914                         container->wl_full = 0;
2915                         curr = &container->wl_wre[head];
2916                         head = ((head + 1) & (size - 1));
2917 
2918                         /*
2919                          * If the current entry's "wqeaddrsz" matches the one
2920                          * we're searching for, then this must correspond to
2921                          * the work request that caused the completion.  Set
2922                          * the "found" flag and bail out.
2923                          */
2924                         if (curr->wr_wqeaddrsz == wqeaddr_size) {
2925                                 found = 1;
2926                                 break;
2927                         }
2928                 }
2929 
2930                 /*
2931                  * If the current container is empty (having reached here the
2932                  * "head == tail" condition can only mean that the container
2933                  * is empty), then NULL out the "wrid_old_tail" field (see
2934                  * tavor_post_send() and tavor_post_recv() for more details)
2935                  * and (potentially) remove the current container from future
2936                  * searches.
2937                  */
2938                 if (head == tail) {
2939 
2940                         container->wl_wre_old_tail = NULL;
2941                         /*
2942                          * If this wasn't the last "container" on the chain,
2943                          * i.e. the one to which new WRID entries will be
2944                          * added, then remove it from the list.
2945                          * Note: we don't "lose" the memory pointed to by this
2946                          * because we should have already put this container
2947                          * on the "reapable" list (from where it will later be
2948                          * pulled).
2949                          */
2950                         if (!last_container) {
2951                                 wq->wq_wrid_poll = container->wl_next;
2952                         }
2953                 }
2954 
2955                 /* Update the head index for the container */
2956                 container->wl_head = head;
2957 
2958                 /*
                 * If the entry was found in this container, then bail out.
                 * Else reset the "curr" pointer and move on to the next
                 * container (if there is one).  Note: the only real reason
                 * for setting "curr = NULL" here is so that the ASSERT below
                 * can catch the case where no matching entry was found on
                 * any of the lists.
2965                  */
2966                 if (found) {
2967                         break;
2968                 } else {
2969                         curr = NULL;
2970                         container = container->wl_next;
2971                 }
2972         }
2973 
2974         /*
         * Update the work queue header's "head" and "full" conditions to
         * match the last entry on the container list.  (Note: only if we
         * were pulling entries from the last container, i.e. not from
         * earlier containers that may be on the "reapable" list.)
2979          */
2980         if (last_container) {
2981                 wq->wq_head = wq->wq_wrid_post->wl_head;
2982                 wq->wq_full = wq->wq_wrid_post->wl_full;
2983         }
2984 
2985         /* Ensure that we've actually found what we were searching for */
2986         ASSERT(curr != NULL);
2987 
2988         TAVOR_TNF_EXIT(tavor_wrid_find_match);
2989         return (curr);
2990 }
2991 
2992 
2993 /*
2994  * tavor_wrid_find_match_srq()
2995  *    Context: Can be called from interrupt or base context.
2996  */
2997 tavor_wrid_entry_t *
2998 tavor_wrid_find_match_srq(tavor_wrid_list_hdr_t *wl, tavor_cqhdl_t cq,
2999     tavor_hw_cqe_t *cqe)
3000 {
3001         tavor_wrid_entry_t      *wre;
3002         uint64_t                *wl_wqe;
3003         uint32_t                wqe_index;
3004         uint64_t                wqe_addr;
3005         uint32_t                cqe_wqe_addr;
3006 
3007         /* Grab the WQE addr out of the CQE */
3008         cqe_wqe_addr = TAVOR_CQE_WQEADDRSZ_GET(cq, cqe) & 0xFFFFFFC0;
3009 
3010         /*
         * Use the WQE addr as the lower 32 bits; we add back the
         * 'wl_srq_desc_off' because we have a zero-based queue.  OR'ing on
         * the upper 32 bits of 'wl_srq_wq_buf' then gives us the WQE addr
         * in the SRQ Work Queue itself.  We use this address as the index
         * to find out which Work Queue Entry this CQE corresponds with.
         *
         * We also use this address below to add the WQE back on to the free
         * list.
3019          */
3020         wqe_addr = ((uintptr_t)wl->wl_srq_wq_buf & 0xFFFFFFFF00000000ull) |
3021             (cqe_wqe_addr + wl->wl_srq_desc_off);
3022 
3023         /*
         * Given the 'wqe_addr' just calculated and the srq buf address, we
         * find the 'wqe_index'.  This indexes into the wl_wre list for this
         * specific WQE; the 'wre' returned below contains the WRID that we
         * are looking for.
3028          */
3029         wqe_index = TAVOR_SRQ_WQE_INDEX(wl->wl_srq_wq_buf, wqe_addr,
3030             wl->wl_srq_log_wqesz);
3031 
3032         /* ASSERT on impossible wqe_index values */
3033         ASSERT(wqe_index < wl->wl_srq_wq_bufsz);
3034 
3035         /* Get the pointer to this WQE */
3036         wl_wqe = (uint64_t *)(uintptr_t)wqe_addr;
3037 
3038         /* Put this WQE index back on the free list */
3039         ddi_put32(wl->wl_acchdl, (uint32_t *)wl_wqe, wl->wl_free_list_indx);
3040         wl->wl_free_list_indx = wqe_index;
3041 
3042         /* Using the index, return the Work Request ID Entry (wre) */
3043         wre = &wl->wl_wre[wqe_index];
3044 
3045         return (wre);
3046 }
3047 
3048 
3049 /*
3050  * tavor_wrid_cq_reap()
3051  *    Context: Can be called from interrupt or base context.
3052  */
3053 void
3054 tavor_wrid_cq_reap(tavor_cqhdl_t cq)
3055 {
3056         tavor_workq_hdr_t       *consume_wqhdr;
3057         tavor_wrid_list_hdr_t   *container, *to_free;
3058 
3059         ASSERT(MUTEX_HELD(&cq->cq_lock));
3060 
3061         TAVOR_TNF_ENTER(tavor_wrid_cq_reap);
3062 
3063         /* Lock the list of work queues associated with this CQ */
3064         mutex_enter(&cq->cq_wrid_wqhdr_lock);
3065 
3066         /* Walk the "reapable" list and free up containers */
3067         container = cq->cq_wrid_reap_head;
3068         while (container != NULL) {
3069                 to_free   = container;
3070                 container = container->wl_reap_next;
3071                 /*
3072                  * If reaping the WRID list containers pulls the last
3073                  * container from the given work queue header, then we free
3074                  * the work queue header as well.
3075                  */
3076                 consume_wqhdr = tavor_wrid_list_reap(to_free);
3077                 if (consume_wqhdr != NULL) {
3078                         tavor_cq_wqhdr_remove(cq, consume_wqhdr);
3079                 }
3080         }
3081 
3082         /* Once finished reaping, we reset the CQ's reap list */
3083         cq->cq_wrid_reap_head = cq->cq_wrid_reap_tail = NULL;
3084 
3085         mutex_exit(&cq->cq_wrid_wqhdr_lock);
3086         TAVOR_TNF_EXIT(tavor_wrid_cq_reap);
3087 }
3088 
3089 
3090 /*
3091  * tavor_wrid_cq_force_reap()
3092  *    Context: Can be called from interrupt or base context.
3093  */
3094 void
3095 tavor_wrid_cq_force_reap(tavor_cqhdl_t cq)
3096 {
3097         tavor_workq_hdr_t       *curr;
3098         tavor_wrid_list_hdr_t   *container, *to_free;
3099         avl_tree_t              *treep;
3100         void                    *cookie = NULL;
3101 
3102         ASSERT(MUTEX_HELD(&cq->cq_lock));
3103 
        TAVOR_TNF_ENTER(tavor_wrid_cq_force_reap);
3105 
3106         /*
3107          * The first step is to walk the "reapable" list and free up those
3108          * containers.  This is necessary because the containers on the
3109          * reapable list are not otherwise connected to the work queue headers
3110          * anymore.
3111          */
3112         tavor_wrid_cq_reap(cq);
3113 
3114         /* Now lock the list of work queues associated with this CQ */
3115         mutex_enter(&cq->cq_wrid_wqhdr_lock);
3116 
3117         /*
3118          * Walk the list of work queue headers and free up all the WRID list
         * containers chained to it.  Note: We don't need to grab the locks
         * for each of the individual WRID lists here because the only way
         * things could be added or removed from the lists at this point
         * would be through posting a work request to a QP.  But if we've
         * come this far, then we can be assured that there are no longer
         * any QPs associated with the CQ that we are trying to free.
3125          */
3126 #ifdef __lock_lint
3127         tavor_wrid_wqhdr_compare(NULL, NULL);
3128 #endif
3129         treep = &cq->cq_wrid_wqhdr_avl_tree;
3130         while ((curr = avl_destroy_nodes(treep, &cookie)) != NULL) {
3131                 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*curr))
3132                 container = curr->wq_wrid_poll;
3133                 while (container != NULL) {
3134                         to_free   = container;
3135                         container = container->wl_next;
3136                         /*
3137                          * If reaping the WRID list containers pulls the last
3138                          * container from the given work queue header, then
3139                          * we free the work queue header as well.  Note: we
3140                          * ignore the return value because we know that the
3141                          * work queue header should always be freed once the
3142                          * list of containers has come to an end.
3143                          */
3144                         (void) tavor_wrid_list_reap(to_free);
3145                         if (container == NULL) {
3146                                 tavor_cq_wqhdr_remove(cq, curr);
3147                         }
3148                 }
3149         }
3150         avl_destroy(treep);
3151 
3152         mutex_exit(&cq->cq_wrid_wqhdr_lock);
        TAVOR_TNF_EXIT(tavor_wrid_cq_force_reap);
3154 }
3155 
3156 
3157 /*
3158  * tavor_wrid_get_list()
3159  *    Context: Can be called from interrupt or base context.
3160  */
3161 tavor_wrid_list_hdr_t *
3162 tavor_wrid_get_list(uint32_t qsize)
3163 {
3164         tavor_wrid_list_hdr_t   *wridlist;
3165         uint32_t                size;
3166 
3167         /*
3168          * The WRID list "container" consists of the tavor_wrid_list_hdr_t,
3169          * which holds the pointers necessary for maintaining the "reapable"
3170          * list, chaining together multiple "containers" old and new, and
3171          * tracking the head, tail, size, etc. for each container.
3172          *
3173          * The "container" also holds all the tavor_wrid_entry_t's, which is
3174          * allocated separately, one for each entry on the corresponding work
3175          * queue.
3176          */
3177         size = sizeof (tavor_wrid_list_hdr_t);
3178 
3179         /*
         * Note that this allocation has to be a NOSLEEP operation here
         * because we are holding the "cq_wrid_wqhdr_lock" and, therefore,
         * could get raised to the interrupt level.
3183          */
3184         wridlist = (tavor_wrid_list_hdr_t *)kmem_zalloc(size, KM_NOSLEEP);
3185         if (wridlist == NULL) {
3186                 return (NULL);
3187         }
3188         _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*wridlist))
3189 
3190         /* Complete the "container" initialization */
3191         wridlist->wl_size = qsize;
3192         wridlist->wl_full = 0;
3193         wridlist->wl_head = 0;
3194         wridlist->wl_tail = 0;
3195         wridlist->wl_wre = (tavor_wrid_entry_t *)kmem_zalloc(qsize *
3196             sizeof (tavor_wrid_entry_t), KM_NOSLEEP);
3197         if (wridlist->wl_wre == NULL) {
3198                 kmem_free(wridlist, size);
3199                 return (NULL);
3200         }
3201         wridlist->wl_wre_old_tail  = NULL;
3202         wridlist->wl_reap_next = NULL;
3203         wridlist->wl_next  = NULL;
3204         wridlist->wl_prev  = NULL;
3205         wridlist->wl_srq_en = 0;
3206 
3207         return (wridlist);
3208 }
3209 
3210 /*
3211  * tavor_wrid_list_srq_init()
 *    Context: Can be called from interrupt or base context.
3213  */
3214 void
3215 tavor_wrid_list_srq_init(tavor_wrid_list_hdr_t *wridlist, tavor_srqhdl_t srq,
3216     uint_t wq_start)
3217 {
3218         uint64_t *wl_wqe;
3219         int wqe_index;
3220 
3221         ASSERT(MUTEX_HELD(&srq->srq_wrid_wql->wql_lock));
3222 
3223         /* Setup pointers for use later when we are polling the CQ */
3224         wridlist->wl_srq_wq_buf = srq->srq_wq_buf;
3225         wridlist->wl_srq_wq_bufsz = srq->srq_wq_bufsz;
3226         wridlist->wl_srq_log_wqesz = srq->srq_wq_log_wqesz;
3227         wridlist->wl_srq_desc_off = srq->srq_desc_off;
3228         wridlist->wl_acchdl = srq->srq_wqinfo.qa_acchdl;
3229 
        /* Verify that the given 'wq_start' index is within the WQ buffer */
        ASSERT(wq_start < srq->srq_wq_bufsz);
3232 
3233         /*
3234          * Initialize wridlist free list
3235          *
3236          * For each WQ up to the size of our queue, we store an index in the WQ
3237          * memory itself, representing the next available free entry.  The
3238          * 'wl_free_list_indx' always holds the index of the next available
3239          * free entry in the WQ.  If 'wl_free_list_indx' is -1, then we are
3240          * completely full.  This gives us the advantage of being able to have
3241          * entries complete or be polled off the WQ out-of-order.
3242          *
3243          * For now, we write the free_list entries inside the WQ itself.  It
3244          * may be useful in the future to store this information in a separate
3245          * structure for debugging purposes.
3246          */
3247         for (wqe_index = wq_start; wqe_index < srq->srq_wq_bufsz; wqe_index++) {
3248                 wl_wqe = TAVOR_SRQ_WQE_ADDR(srq, wqe_index);
3249                 ddi_put32(wridlist->wl_acchdl, (uint32_t *)wl_wqe,
3250                     wridlist->wl_free_list_indx);
3251                 wridlist->wl_free_list_indx = wqe_index;
3252         }
3253 }
3254 
3255 
3256 /*
3257  * tavor_wrid_reaplist_add()
3258  *    Context: Can be called from interrupt or base context.
3259  */
3260 static void
3261 tavor_wrid_reaplist_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wq)
3262 {
3263         ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3264 
3265         TAVOR_TNF_ENTER(tavor_wrid_reaplist_add);
3266 
3267         mutex_enter(&wq->wq_wrid_wql->wql_lock);
3268 
3269         /*
3270          * Add the "post" container (the last one on the current chain) to
3271          * the CQ's "reapable" list
3272          */
3273         if ((cq->cq_wrid_reap_head == NULL) &&
3274             (cq->cq_wrid_reap_tail == NULL)) {
3275                 cq->cq_wrid_reap_head = wq->wq_wrid_post;
3276                 cq->cq_wrid_reap_tail = wq->wq_wrid_post;
3277         } else {
3278                 cq->cq_wrid_reap_tail->wl_reap_next = wq->wq_wrid_post;
3279                 cq->cq_wrid_reap_tail = wq->wq_wrid_post;
3280         }
3281 
3282         mutex_exit(&wq->wq_wrid_wql->wql_lock);
3283 }
3284 
3285 
3286 int
3287 tavor_wrid_wqhdr_compare(const void *p1, const void *p2)
3288 {
3289         tavor_workq_compare_t   *cmpp;
3290         tavor_workq_hdr_t       *curr;
3291 
3292         cmpp = (tavor_workq_compare_t *)p1;
3293         curr = (tavor_workq_hdr_t *)p2;
3294 
3295         if (cmpp->cmp_qpn < curr->wq_qpn)
3296                 return (-1);
3297         else if (cmpp->cmp_qpn > curr->wq_qpn)
3298                 return (+1);
3299         else if (cmpp->cmp_type < curr->wq_type)
3300                 return (-1);
3301         else if (cmpp->cmp_type > curr->wq_type)
3302                 return (+1);
3303         else
3304                 return (0);
3305 }
3306 
3307 
3308 /*
3309  * tavor_wrid_wqhdr_find()
3310  *    Context: Can be called from interrupt or base context.
3311  */
3312 static tavor_workq_hdr_t *
3313 tavor_wrid_wqhdr_find(tavor_cqhdl_t cq, uint_t qpn, uint_t wq_type)
3314 {
3315         tavor_workq_hdr_t       *curr;
3316         tavor_workq_compare_t   cmp;
3317 
3318         TAVOR_TNF_ENTER(tavor_wrid_wqhdr_find);
3319 
3320         ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3321 
3322         /*
3323          * Walk the CQ's work queue list, trying to find a send or recv queue
3324          * with the same QP number.  We do this even if we are going to later
3325          * create a new entry because it helps us easily find the end of the
3326          * list.
3327          */
3328         cmp.cmp_qpn = qpn;
3329         cmp.cmp_type = wq_type;
3330 #ifdef __lock_lint
3331         tavor_wrid_wqhdr_compare(NULL, NULL);
3332 #endif
3333         curr = avl_find(&cq->cq_wrid_wqhdr_avl_tree, &cmp, NULL);
3334 
3335         TAVOR_TNF_EXIT(tavor_wrid_wqhdr_find);
3336         return (curr);
3337 }
3338 
3339 
3340 /*
3341  * tavor_wrid_wqhdr_create()
3342  *    Context: Can be called from interrupt or base context.
3343  */
3344 static tavor_workq_hdr_t *
3345 tavor_wrid_wqhdr_create(tavor_state_t *state, tavor_cqhdl_t cq, uint_t qpn,
3346     uint_t wq_type, uint_t create_wql)
3347 {
3348         tavor_workq_hdr_t       *wqhdr_tmp;
3349 
3350         TAVOR_TNF_ENTER(tavor_wrid_wqhdr_create);
3351 
3352         ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3353 
3354         /*
         * Allocate space for a work queue header structure and initialize it.
3356          * Each work queue header structure includes a "wq_wrid_wql"
3357          * which needs to be initialized.  Note that this allocation has to be
3358          * a NOSLEEP operation because we are holding the "cq_wrid_wqhdr_lock"
3359          * and, therefore, could get raised to the interrupt level.
3360          */
3361         wqhdr_tmp = (tavor_workq_hdr_t *)kmem_zalloc(
3362             sizeof (tavor_workq_hdr_t), KM_NOSLEEP);
3363         if (wqhdr_tmp == NULL) {
3364                 TAVOR_TNF_EXIT(tavor_wrid_wqhdr_create);
3365                 return (NULL);
3366         }
3367         _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*wqhdr_tmp))
3368         wqhdr_tmp->wq_qpn    = qpn;
3369         wqhdr_tmp->wq_type   = wq_type;
3370 
3371         if (create_wql) {
3372                 wqhdr_tmp->wq_wrid_wql = tavor_wrid_wql_create(state);
3373                 if (wqhdr_tmp->wq_wrid_wql == NULL) {
3374                         kmem_free(wqhdr_tmp, sizeof (tavor_workq_hdr_t));
3375                         TAVOR_TNF_EXIT(tavor_wrid_wqhdr_create);
3376                         return (NULL);
3377                 }
3378         }
3379 
3380         wqhdr_tmp->wq_wrid_poll = NULL;
3381         wqhdr_tmp->wq_wrid_post = NULL;
3382 
3383         /* Chain the newly allocated work queue header to the CQ's list */
3384         tavor_cq_wqhdr_add(cq, wqhdr_tmp);
3385 
3386         TAVOR_TNF_EXIT(tavor_wrid_wqhdr_create);
3387         return (wqhdr_tmp);
3388 }
3389 
3390 
3391 /*
3392  * tavor_wrid_wql_create()
3393  *    Context: Can be called from interrupt or base context.
3394  */
3395 tavor_wq_lock_t *
3396 tavor_wrid_wql_create(tavor_state_t *state)
3397 {
3398         tavor_wq_lock_t *wql;
3399 
3400         TAVOR_TNF_ENTER(tavor_wrid_wql_create);
3401 
3402         /*
3403          * Allocate the WQL and initialize it.
3404          */
3405         wql = kmem_zalloc(sizeof (tavor_wq_lock_t), KM_NOSLEEP);
3406         if (wql == NULL) {
                TAVOR_TNF_EXIT(tavor_wrid_wql_create);
3408                 return (NULL);
3409         }
3410 
3411         mutex_init(&wql->wql_lock, NULL, MUTEX_DRIVER,
3412             DDI_INTR_PRI(state->ts_intrmsi_pri));
3413 
3414         /* Add refcount to WQL */
3415         tavor_wql_refcnt_inc(wql);
3416 
3417         TAVOR_TNF_EXIT(tavor_wrid_wql_create);
3418         return (wql);
3419 }
3420 
3421 
3422 /*
3423  * tavor_wrid_get_wqeaddrsz()
3424  *    Context: Can be called from interrupt or base context.
3425  */
3426 static uint32_t
3427 tavor_wrid_get_wqeaddrsz(tavor_workq_hdr_t *wq)
3428 {
3429         tavor_wrid_entry_t      *wre;
3430         uint32_t                wqeaddrsz;
3431         uint32_t                head;
3432 
3433         /*
3434          * If the container is empty, then there is no next entry. So just
3435          * return zero.  Note: the "head == tail" condition here can only
3436          * mean that the container is empty because we have previously pulled
3437          * something from the container.
3438          *
3439          * If the container is not empty, then find the next entry and return
3440          * the contents of its "wqeaddrsz" field.
3441          */
3442         if (wq->wq_wrid_poll->wl_head == wq->wq_wrid_poll->wl_tail) {
3443                 wqeaddrsz = 0;
3444         } else {
3445                 /*
3446                  * We don't need to calculate the "next" head pointer here
3447                  * because "head" should already point to the next entry on
3448                  * the list (since we just pulled something off - in
3449                  * tavor_wrid_find_match() - and moved the head index forward.)
3450                  */
3451                 head = wq->wq_wrid_poll->wl_head;
3452                 wre = &wq->wq_wrid_poll->wl_wre[head];
3453                 wqeaddrsz = wre->wr_wqeaddrsz;
3454         }
3455         return (wqeaddrsz);
3456 }
3457 
3458 
3459 /*
3460  * tavor_wrid_wqhdr_add()
3461  *    Context: Can be called from interrupt or base context.
3462  */
3463 static void
3464 tavor_wrid_wqhdr_add(tavor_workq_hdr_t *wqhdr,
3465     tavor_wrid_list_hdr_t *wridlist)
3466 {
3467         ASSERT(MUTEX_HELD(&wqhdr->wq_wrid_wql->wql_lock));
3468 
3469         /* Chain the new WRID list "container" to the work queue list */
3470         if ((wqhdr->wq_wrid_post == NULL) &&
3471             (wqhdr->wq_wrid_poll == NULL)) {
3472                 wqhdr->wq_wrid_poll = wridlist;
3473                 wqhdr->wq_wrid_post = wridlist;
3474         } else {
3475                 wqhdr->wq_wrid_post->wl_next = wridlist;
3476                 wridlist->wl_prev = wqhdr->wq_wrid_post;
3477                 wqhdr->wq_wrid_post = wridlist;
3478         }
3479 }
3480 
3481 
3482 /*
3483  * tavor_wrid_wqhdr_remove()
3484  *    Context: Can be called from interrupt or base context.
3485  *
 *    Note: this is only called to remove the most recently added WRID list
 *    container (i.e. from tavor_wrid_from_reset_handling() above)
3488  */
3489 static void
3490 tavor_wrid_wqhdr_remove(tavor_workq_hdr_t *wqhdr,
3491     tavor_wrid_list_hdr_t *wridlist)
3492 {
3493         tavor_wrid_list_hdr_t   *prev, *next;
3494 
3495         ASSERT(MUTEX_HELD(&wqhdr->wq_wrid_wql->wql_lock));
3496 
3497         /* Unlink the WRID list "container" from the work queue list */
3498         prev = wridlist->wl_prev;
3499         next = wridlist->wl_next;
3500         if (prev != NULL) {
3501                 prev->wl_next = next;
3502         }
3503         if (next != NULL) {
3504                 next->wl_prev = prev;
3505         }
3506 
3507         /*
3508          * Update any pointers in the work queue hdr that may point to this
3509          * WRID list container
3510          */
3511         if (wqhdr->wq_wrid_post == wridlist) {
3512                 wqhdr->wq_wrid_post = prev;
3513         }
3514         if (wqhdr->wq_wrid_poll == wridlist) {
3515                 wqhdr->wq_wrid_poll = NULL;
3516         }
3517 }
3518 
3519 
3520 /*
3521  * tavor_wrid_list_reap()
3522  *    Context: Can be called from interrupt or base context.
3523  *    Note: The "wqhdr_list_lock" must be held.
3524  */
3525 static tavor_workq_hdr_t *
3526 tavor_wrid_list_reap(tavor_wrid_list_hdr_t *wridlist)
3527 {
3528         tavor_workq_hdr_t       *wqhdr, *consume_wqhdr = NULL;
3529         tavor_wrid_list_hdr_t   *prev, *next;
3530         uint32_t                size;
3531 
3532         TAVOR_TNF_ENTER(tavor_wrid_list_reap);
3533 
3534         /* Get the back pointer to the work queue header (see below) */
3535         wqhdr = wridlist->wl_wqhdr;
3536         mutex_enter(&wqhdr->wq_wrid_wql->wql_lock);
3537 
3538         /* Unlink the WRID list "container" from the work queue list */
3539         prev = wridlist->wl_prev;
3540         next = wridlist->wl_next;
3541         if (prev != NULL) {
3542                 prev->wl_next = next;
3543         }
3544         if (next != NULL) {
3545                 next->wl_prev = prev;
3546         }
3547 
3548         /*
3549          * If the back pointer to the work queue header shows that it
3550          * was pointing to the entry we are about to remove, then the work
3551          * queue header is reapable as well.
3552          */
3553         if ((wqhdr->wq_wrid_poll == wridlist) &&
3554             (wqhdr->wq_wrid_post == wridlist)) {
3555                 consume_wqhdr = wqhdr;
3556         }
3557 
3558         /* Be sure to update the "poll" and "post" container pointers */
3559         if (wqhdr->wq_wrid_poll == wridlist) {
3560                 wqhdr->wq_wrid_poll = next;
3561         }
3562         if (wqhdr->wq_wrid_post == wridlist) {
3563                 wqhdr->wq_wrid_post = NULL;
3564         }
3565 
3566         /* Calculate the size and free the container */
3567         size = (wridlist->wl_size * sizeof (tavor_wrid_entry_t));
3568         kmem_free(wridlist->wl_wre, size);
3569         kmem_free(wridlist, sizeof (tavor_wrid_list_hdr_t));
3570 
3571         mutex_exit(&wqhdr->wq_wrid_wql->wql_lock);
3572 
3573         TAVOR_TNF_EXIT(tavor_wrid_list_reap);
3574         return (consume_wqhdr);
3575 }
3576 
3577 
3578 /*
3579  * tavor_wrid_wqhdr_lock_both()
3580  *    Context: Can be called from interrupt or base context.
3581  */
3582 static void
3583 tavor_wrid_wqhdr_lock_both(tavor_qphdl_t qp)
3584 {
3585         tavor_cqhdl_t   sq_cq, rq_cq;
3586 
3587         sq_cq = qp->qp_sq_cqhdl;
3588         rq_cq = qp->qp_rq_cqhdl;
3589 
3590 _NOTE(MUTEX_ACQUIRED_AS_SIDE_EFFECT(&sq_cq->cq_wrid_wqhdr_lock))
3591 _NOTE(MUTEX_ACQUIRED_AS_SIDE_EFFECT(&rq_cq->cq_wrid_wqhdr_lock))
3592 
3593         /*
         * If both work queues (send and recv) share a completion queue, then
         * grab the common lock.  If they use different CQs (hence different
         * "cq_wrid_wqhdr_lock" locks), then grab the send one first, then
         * the receive.  We do this consistently (releasing in the opposite
         * order) in tavor_wrid_wqhdr_unlock_both() below to avoid
         * introducing any kind of deadlock condition.  Note:  We add the
         * "__lock_lint" code here to fake out warlock into thinking we've
         * grabbed both locks (when, in fact, we only needed the one).
3602          */
3603         if (sq_cq == rq_cq) {
3604                 mutex_enter(&sq_cq->cq_wrid_wqhdr_lock);
3605 #ifdef  __lock_lint
3606                 mutex_enter(&rq_cq->cq_wrid_wqhdr_lock);
3607 #endif
3608         } else {
3609                 mutex_enter(&sq_cq->cq_wrid_wqhdr_lock);
3610                 mutex_enter(&rq_cq->cq_wrid_wqhdr_lock);
3611         }
3612 }
3613 
3614 /*
3615  * tavor_wrid_wqhdr_unlock_both()
3616  *    Context: Can be called from interrupt or base context.
3617  */
3618 static void
3619 tavor_wrid_wqhdr_unlock_both(tavor_qphdl_t qp)
3620 {
3621         tavor_cqhdl_t   sq_cq, rq_cq;
3622 
3623         sq_cq = qp->qp_sq_cqhdl;
3624         rq_cq = qp->qp_rq_cqhdl;
3625 
3626 _NOTE(LOCK_RELEASED_AS_SIDE_EFFECT(&rq_cq->cq_wrid_wqhdr_lock))
3627 _NOTE(LOCK_RELEASED_AS_SIDE_EFFECT(&sq_cq->cq_wrid_wqhdr_lock))
3628 
3629         /*
3630          * See tavor_wrid_wqhdr_lock_both() above for more detail
3631          */
3632         if (sq_cq == rq_cq) {
3633 #ifdef  __lock_lint
3634                 mutex_exit(&rq_cq->cq_wrid_wqhdr_lock);
3635 #endif
3636                 mutex_exit(&sq_cq->cq_wrid_wqhdr_lock);
3637         } else {
3638                 mutex_exit(&rq_cq->cq_wrid_wqhdr_lock);
3639                 mutex_exit(&sq_cq->cq_wrid_wqhdr_lock);
3640         }
3641 }
3642 
3643 
3644 /*
3645  * tavor_cq_wqhdr_add()
3646  *    Context: Can be called from interrupt or base context.
3647  */
3648 static void
3649 tavor_cq_wqhdr_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr)
3650 {
3651         tavor_workq_compare_t   cmp;
3652         avl_index_t             where;
3653 
3654         ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3655 
3656         cmp.cmp_qpn = wqhdr->wq_qpn;
3657         cmp.cmp_type = wqhdr->wq_type;
3658 #ifdef __lock_lint
3659         tavor_wrid_wqhdr_compare(NULL, NULL);
3660 #endif
3661         (void) avl_find(&cq->cq_wrid_wqhdr_avl_tree, &cmp, &where);
3662         /*
3663          * If the CQ's work queue list is empty, then just add it.
3664          * Otherwise, chain it to the beginning of the list.
3665          */
3666         avl_insert(&cq->cq_wrid_wqhdr_avl_tree, wqhdr, where);
3667 }
3668 
3669 
3670 /*
3671  * tavor_cq_wqhdr_remove()
3672  *    Context: Can be called from interrupt or base context.
3673  */
3674 static void
3675 tavor_cq_wqhdr_remove(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr)
3676 {
3677         ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3678 
3679 #ifdef __lock_lint
3680         tavor_wrid_wqhdr_compare(NULL, NULL);
3681 #endif
3682         /* Remove "wqhdr" from the work queue header list on "cq" */
3683         avl_remove(&cq->cq_wrid_wqhdr_avl_tree, wqhdr);
3684 
3685         /*
3686          * Release reference to WQL; If this is the last reference, this call
3687          * also has the side effect of freeing up the 'wq_wrid_wql' memory.
3688          */
3689         tavor_wql_refcnt_dec(wqhdr->wq_wrid_wql);
3690 
3691         /* Free the memory associated with "wqhdr" */
3692         kmem_free(wqhdr, sizeof (tavor_workq_hdr_t));
3693 }
3694 
3695 
3696 /*
3697  * tavor_wql_refcnt_inc()
 *    Context: Can be called from interrupt or base context.
3699  */
3700 void
3701 tavor_wql_refcnt_inc(tavor_wq_lock_t *wql)
3702 {
3703         ASSERT(wql != NULL);
3704 
3705         mutex_enter(&wql->wql_lock);
3706         wql->wql_refcnt++;
3707         mutex_exit(&wql->wql_lock);
3708 }
3709 
3710 /*
3711  * tavor_wql_refcnt_dec()
 *    Context: Can be called from interrupt or base context.
3713  */
3714 void
3715 tavor_wql_refcnt_dec(tavor_wq_lock_t *wql)
3716 {
3717         int     refcnt;
3718 
3719         ASSERT(wql != NULL);
3720 
3721         mutex_enter(&wql->wql_lock);
3722         wql->wql_refcnt--;
3723         refcnt = wql->wql_refcnt;
3724         mutex_exit(&wql->wql_lock);
3725 
3726         /*
3727          *
3728          * Free up WQL memory if we're the last one associated with this
3729          * structure.
3730          */
3731         if (refcnt == 0) {
3732                 mutex_destroy(&wql->wql_lock);
3733                 kmem_free(wql, sizeof (tavor_wq_lock_t));
3734         }
3735 }