8368 remove warlock leftovers from usr/src/uts
--- old/usr/src/uts/common/io/ib/adapters/tavor/tavor_wr.c
+++ new/usr/src/uts/common/io/ib/adapters/tavor/tavor_wr.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
24 24 * Use is subject to license terms.
25 25 */
26 26
27 27 /*
28 28 * tavor_wr.c
29 29 * Tavor Work Request Processing Routines
30 30 *
31 31 * Implements all the routines necessary to provide the PostSend(),
32 32 * PostRecv() and PostSRQ() verbs. Also contains all the code
33 33 * necessary to implement the Tavor WRID tracking mechanism.
34 34 */
35 35
36 36 #include <sys/types.h>
37 37 #include <sys/conf.h>
38 38 #include <sys/ddi.h>
39 39 #include <sys/sunddi.h>
40 40 #include <sys/modctl.h>
41 41 #include <sys/avl.h>
42 42
43 43 #include <sys/ib/adapters/tavor/tavor.h>
44 44
45 45 static void tavor_qp_send_doorbell(tavor_state_t *state, uint32_t nda,
46 46 uint32_t nds, uint32_t qpn, uint32_t fence, uint32_t nopcode);
47 47 #pragma inline(tavor_qp_send_doorbell)
48 48 static void tavor_qp_recv_doorbell(tavor_state_t *state, uint32_t nda,
49 49 uint32_t nds, uint32_t qpn, uint32_t credits);
50 50 #pragma inline(tavor_qp_recv_doorbell)
51 51 static uint32_t tavor_wr_get_immediate(ibt_send_wr_t *wr);
52 52 static int tavor_wr_bind_check(tavor_state_t *state, ibt_send_wr_t *wr);
53 53 static int tavor_wqe_send_build(tavor_state_t *state, tavor_qphdl_t qp,
54 54 ibt_send_wr_t *wr, uint64_t *desc, uint_t *size);
55 55 static void tavor_wqe_send_linknext(ibt_send_wr_t *curr_wr,
56 56 ibt_send_wr_t *prev_wr, uint64_t *curr_desc, uint_t curr_descsz,
57 57 uint64_t *prev_desc, tavor_sw_wqe_dbinfo_t *dbinfo, tavor_qphdl_t qp);
58 58 static int tavor_wqe_mlx_build(tavor_state_t *state, tavor_qphdl_t qp,
59 59 ibt_send_wr_t *wr, uint64_t *desc, uint_t *size);
60 60 static void tavor_wqe_mlx_linknext(ibt_send_wr_t *prev_wr, uint64_t *curr_desc,
61 61 uint_t curr_descsz, uint64_t *prev_desc, tavor_sw_wqe_dbinfo_t *dbinfo,
62 62 tavor_qphdl_t qp);
63 63 static int tavor_wqe_recv_build(tavor_state_t *state, tavor_qphdl_t qp,
64 64 ibt_recv_wr_t *wr, uint64_t *desc, uint_t *size);
65 65 static void tavor_wqe_recv_linknext(uint64_t *desc, uint_t desc_sz,
66 66 uint64_t *prev, tavor_qphdl_t qp);
67 67 static int tavor_wqe_srq_build(tavor_state_t *state, tavor_srqhdl_t srq,
68 68 ibt_recv_wr_t *wr, uint64_t *desc);
69 69 static void tavor_wqe_srq_linknext(uint64_t *desc, uint64_t *prev,
70 70 tavor_srqhdl_t srq);
71 71 static void tavor_wqe_sync(void *hdl, uint_t sync_from,
72 72 uint_t sync_to, uint_t sync_type, uint_t flag);
73 73 static tavor_wrid_entry_t *tavor_wrid_find_match(tavor_workq_hdr_t *wq,
74 74 tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe);
75 75 static void tavor_wrid_reaplist_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wq);
76 76 static tavor_workq_hdr_t *tavor_wrid_wqhdr_find(tavor_cqhdl_t cq, uint_t qpn,
77 77 uint_t send_or_recv);
78 78 static tavor_workq_hdr_t *tavor_wrid_wqhdr_create(tavor_state_t *state,
79 79 tavor_cqhdl_t cq, uint_t qpn, uint_t wq_type, uint_t create_wql);
80 80 static uint32_t tavor_wrid_get_wqeaddrsz(tavor_workq_hdr_t *wq);
81 81 static void tavor_wrid_wqhdr_add(tavor_workq_hdr_t *wqhdr,
82 82 tavor_wrid_list_hdr_t *wrid_list);
83 83 static void tavor_wrid_wqhdr_remove(tavor_workq_hdr_t *wqhdr,
84 84 tavor_wrid_list_hdr_t *wrid_list);
85 85 static tavor_workq_hdr_t *tavor_wrid_list_reap(tavor_wrid_list_hdr_t *wq);
86 86 static void tavor_wrid_wqhdr_lock_both(tavor_qphdl_t qp);
87 87 static void tavor_wrid_wqhdr_unlock_both(tavor_qphdl_t qp);
88 88 static void tavor_cq_wqhdr_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr);
89 89 static void tavor_cq_wqhdr_remove(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr);
90 90
91 91 /*
92 92 * tavor_post_send()
93 93 * Context: Can be called from interrupt or base context.
94 94 */
95 95 int
96 96 tavor_post_send(tavor_state_t *state, tavor_qphdl_t qp,
97 97 ibt_send_wr_t *wr, uint_t num_wr, uint_t *num_posted)
98 98 {
99 99 tavor_sw_wqe_dbinfo_t dbinfo;
100 100 tavor_wrid_list_hdr_t *wridlist;
101 101 tavor_wrid_entry_t *wre_last;
102 102 uint64_t *desc, *prev, *first;
103 103 uint32_t desc_sz, first_sz;
104 104 uint32_t wqeaddrsz, signaled_dbd;
105 105 uint32_t head, tail, next_tail, qsize_msk;
106 106 uint32_t sync_from, sync_to;
107 107 uint_t currindx, wrindx, numremain;
108 108 uint_t chainlen, chainbegin, posted_cnt;
109 109 uint_t maxdb = TAVOR_QP_MAXDESC_PER_DB;
110 110 int status;
111 111
112 112 TAVOR_TNF_ENTER(tavor_post_send);
113 113
114 114 /*
115 115 * Check for user-mappable QP memory. Note: We do not allow kernel
116 116 * clients to post to QP memory that is accessible directly by the
117 117 * user. If the QP memory is user accessible, then return an error.
118 118 */
119 119 if (qp->qp_is_umap) {
120 120 TNF_PROBE_0(tavor_post_send_inv_usrmapped_type,
121 121 TAVOR_TNF_ERROR, "");
122 122 TAVOR_TNF_EXIT(tavor_post_send);
123 123 return (IBT_QP_HDL_INVALID);
124 124 }
125 125
126 126 /* Initialize posted_cnt */
127 127 posted_cnt = 0;
128 128
129 129 mutex_enter(&qp->qp_lock);
130 130
131 131 /*
132 132 * Check QP state. Cannot post Send requests from the "Reset",
133 133 * "Init", or "RTR" states
134 134 */
135 135 if ((qp->qp_state == TAVOR_QP_RESET) ||
136 136 (qp->qp_state == TAVOR_QP_INIT) ||
137 137 (qp->qp_state == TAVOR_QP_RTR)) {
138 138 mutex_exit(&qp->qp_lock);
139 139 TNF_PROBE_0(tavor_post_send_inv_qpstate_fail,
140 140 TAVOR_TNF_ERROR, "");
141 141 TAVOR_TNF_EXIT(tavor_post_send);
142 142 return (IBT_QP_STATE_INVALID);
143 143 }
144 144
145 145 /* Grab the lock for the WRID list */
146 146 mutex_enter(&qp->qp_sq_wqhdr->wq_wrid_wql->wql_lock);
147 147 wridlist = qp->qp_sq_wqhdr->wq_wrid_post;
148 148
149 149 /* Save away some initial QP state */
150 150 qsize_msk = qp->qp_sq_wqhdr->wq_size - 1;
151 151 tail = qp->qp_sq_wqhdr->wq_tail;
152 152 head = qp->qp_sq_wqhdr->wq_head;
153 153
154 154 /*
155 155 * For each ibt_send_wr_t in the wr[] list passed in, parse the
156 156 * request and build a Send WQE. Note: Because we are potentially
157 157 * building a chain of WQEs, we want to link them all together.
158 158 * However, we do not want to link the first one to the previous
159 159 * WQE until the entire chain has been linked. Then in the last
160 160 * step we ring the appropriate doorbell. Note: It is possible for
161 161 * more Work Requests to be posted than the HW will support at one
162 162 * shot. If this happens, we need to be able to post and ring
163 163 * several chains here until the entire request is complete.
164 164 */
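/*
 * A minimal stand-alone sketch of the chaining policy described
 * above, assuming TAVOR_QP_MAXDESC_PER_DB is 256; MAXDB and
 * post_in_chains() are illustrative stand-ins, not driver code.
 */
#include <stdio.h>

#define	MAXDB	256	/* assumed TAVOR_QP_MAXDESC_PER_DB */

static void
post_in_chains(unsigned int num_wr)
{
	unsigned int numremain, chainlen;

	numremain = num_wr;
	while (numremain != 0) {
		/* Same computation as the driver's "chainlen" below */
		chainlen = (numremain > MAXDB) ? MAXDB : numremain;
		numremain -= chainlen;
		(void) printf("build %u WQEs, ring one doorbell\n",
		    chainlen);
	}
}

int
main(void)
{
	post_in_chains(300);	/* two chains: 256 WQEs, then 44 */
	return (0);
}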
165 165 wrindx = 0;
166 166 numremain = num_wr;
167 167 status = DDI_SUCCESS;
168 168 while ((wrindx < num_wr) && (status == DDI_SUCCESS)) {
169 169 /*
170 170 * For the first WQE on a new chain we need "prev" to point
171 171 * to the current descriptor. As we begin to process
172 172 * further, "prev" will be updated to point to the previous
173 173 * WQE on the current chain (see below).
174 174 */
175 175 prev = TAVOR_QP_SQ_ENTRY(qp, tail);
176 176
177 177 /*
178 178 * Before we begin, save the current "tail index" for later
179 179 * DMA sync
180 180 */
181 181 sync_from = tail;
182 182
183 183 /*
184 184 * Break the request up into chains that are less than or
185 185 * equal to the maximum number of WQEs that can be posted
186 186 * per doorbell ring
187 187 */
188 188 chainlen = (numremain > maxdb) ? maxdb : numremain;
189 189 numremain -= chainlen;
190 190 chainbegin = wrindx;
191 191 for (currindx = 0; currindx < chainlen; currindx++, wrindx++) {
192 192 /*
193 193 * Check for "queue full" condition. If the queue
194 194 * is already full, then no more WQEs can be posted.
195 195 * So break out, ring a doorbell (if necessary) and
196 196 * return an error
197 197 */
198 198 if (qp->qp_sq_wqhdr->wq_full != 0) {
199 199 status = IBT_QP_FULL;
200 200 TNF_PROBE_0_DEBUG(tavor_post_send_sqfull,
201 201 TAVOR_TNF_TRACE, "");
202 202 break;
203 203 }
204 204
205 205 /*
206 206 * Increment the "tail index" and check for "queue
207 207 * full" condition. If we detect that the current
208 208 * work request is going to fill the work queue, then
209 209 * we mark this condition and continue.
210 210 */
211 211 next_tail = (tail + 1) & qsize_msk;
212 212 if (next_tail == head) {
213 213 qp->qp_sq_wqhdr->wq_full = 1;
214 214 }
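/*
 * A self-contained check of the wraparound arithmetic used just
 * above, assuming a work queue size of 8 (so qsize_msk == 7).
 */
#include <assert.h>

int
main(void)
{
	unsigned int qsize_msk = 8 - 1;	/* wq_size is a power of two */
	unsigned int head = 0, tail = 7, next_tail;

	next_tail = (tail + 1) & qsize_msk;
	assert(next_tail == 0);		/* tail wraps to the start */
	assert(next_tail == head);	/* queue would now be full */
	return (0);
}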
215 215
216 216 /*
217 217 * Get the address of the location where the next
218 218 * Send WQE should be built
219 219 */
220 220 desc = TAVOR_QP_SQ_ENTRY(qp, tail);
221 221
222 222 /*
223 223 * Call tavor_wqe_send_build() to build the WQE
224 224 * at the given address. This routine uses the
225 225 * information in the ibt_send_wr_t list (wr[]) and
226 226 * returns the size of the WQE when it returns.
227 227 */
228 228 status = tavor_wqe_send_build(state, qp,
229 229 &wr[wrindx], desc, &desc_sz);
230 230 if (status != DDI_SUCCESS) {
231 231 TNF_PROBE_0(tavor_post_send_bldwqe_fail,
232 232 TAVOR_TNF_ERROR, "");
233 233 break;
234 234 }
235 235
236 236 /*
237 237 * Add a WRID entry to the WRID list. Need to
238 238 * calculate the "wqeaddrsz" and "signaled_dbd"
239 239 * values to pass to tavor_wrid_add_entry()
240 240 */
241 241 wqeaddrsz = TAVOR_QP_WQEADDRSZ((uint64_t *)(uintptr_t)
242 242 ((uint64_t)(uintptr_t)desc - qp->qp_desc_off),
243 243 desc_sz);
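/*
 * A sketch of the packing TAVOR_QP_WQEADDRSZ() is assumed to perform:
 * WQEs are 64-byte aligned, so the low 6 bits of the queue-relative
 * address can carry the descriptor size in 16-byte chunks.  The
 * helper and its field widths are assumptions, not the real macro.
 */
#include <stdint.h>

static uint32_t
wqeaddrsz_pack(uint32_t qrel_addr, uint32_t size16)
{
	return ((qrel_addr & ~0x3FU) | (size16 & 0x3FU));
}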
244 244 if ((qp->qp_sq_sigtype == TAVOR_QP_SQ_ALL_SIGNALED) ||
245 245 (wr[wrindx].wr_flags & IBT_WR_SEND_SIGNAL)) {
246 246 signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED;
247 247 } else {
248 248 signaled_dbd = 0;
249 249 }
250 250 tavor_wrid_add_entry(qp->qp_sq_wqhdr,
251 251 wr[wrindx].wr_id, wqeaddrsz, signaled_dbd);
252 252
253 253 /*
254 254 * If this is not the first descriptor on the current
255 255 * chain, then link it to the previous WQE. Otherwise,
256 256 * save the address and size of this descriptor (in
257 257 * "first" and "first_sz" respectively) and continue.
258 258 * Note: Linking a WQE to the previous one will
259 259 * depend on whether the two WQEs are from "special
260 260 * QPs" (i.e. MLX transport WQEs) or whether they are
261 261 * normal Send WQEs.
262 262 */
263 263 if (currindx != 0) {
264 264 if (qp->qp_is_special) {
265 265 tavor_wqe_mlx_linknext(&wr[wrindx - 1],
266 266 desc, desc_sz, prev, NULL, qp);
267 267 } else {
268 268 tavor_wqe_send_linknext(&wr[wrindx],
269 269 &wr[wrindx - 1], desc, desc_sz,
270 270 prev, NULL, qp);
271 271 }
272 272 prev = desc;
273 273 } else {
274 274 first = desc;
275 275 first_sz = desc_sz;
276 276 }
277 277
278 278 /*
279 279 * Update the current "tail index" and increment
280 280 * "posted_cnt"
281 281 */
282 282 tail = next_tail;
283 283 posted_cnt++;
284 284 }
285 285
286 286 /*
287 287 * If we reach here and there are one or more WQEs which have
288 288 * been successfully chained together, then we need to link
289 289 * the current chain to the previously executing chain of
290 290 * descriptors (if there is one) and ring the doorbell for the
291 291 * send work queue.
292 292 */
293 293 if (currindx != 0) {
294 294 /*
295 295 * Before we link the chain, we need to ensure that the
296 296 * "next" field on the last WQE is set to NULL (to
297 297 * indicate the end of the chain). Note: Just as it
298 298 * did above, the format for the "next" fields in a
299 299 * given WQE depends on whether the WQE is MLX
300 300 * transport or not.
301 301 */
302 302 if (qp->qp_is_special) {
303 303 tavor_wqe_mlx_linknext(&wr[chainbegin +
304 304 currindx - 1], NULL, 0, prev, NULL, qp);
305 305 } else {
306 306 tavor_wqe_send_linknext(NULL,
307 307 &wr[chainbegin + currindx - 1], NULL, 0,
308 308 prev, NULL, qp);
309 309 }
310 310
311 311 /* Save away updated "tail index" for the DMA sync */
312 312 sync_to = tail;
313 313
314 314 /* Do a DMA sync for current send WQE(s) */
315 315 tavor_wqe_sync(qp, sync_from, sync_to, TAVOR_WR_SEND,
316 316 DDI_DMA_SYNC_FORDEV);
317 317
318 318 /*
319 319 * Now link the chain to the old chain (if there was
320 320 * one). Note: still need to pay attention to whether
321 321 * the QP used MLX transport WQEs or not.
322 322 */
323 323 if (qp->qp_is_special) {
324 324 tavor_wqe_mlx_linknext(NULL, first, first_sz,
325 325 qp->qp_sq_lastwqeaddr, &dbinfo, qp);
326 326 } else {
327 327 tavor_wqe_send_linknext(&wr[chainbegin], NULL,
328 328 first, first_sz, qp->qp_sq_lastwqeaddr,
329 329 &dbinfo, qp);
330 330 }
331 331
332 332 /*
333 333 * If there was a valid previous WQE (i.e. non-NULL),
334 334 * then sync it too. This is because we have updated
335 335 * its "next" fields and we want to ensure that the
336 336 * hardware can see the changes.
337 337 */
338 338 if (qp->qp_sq_lastwqeaddr != NULL) {
339 339 sync_to = sync_from;
340 340 sync_from = (sync_from - 1) & qsize_msk;
341 341 tavor_wqe_sync(qp, sync_from, sync_to,
342 342 TAVOR_WR_SEND, DDI_DMA_SYNC_FORDEV);
343 343 }
344 344
345 345 /*
346 346 * Now if the WRID tail entry is non-NULL, then this
347 347 * represents the entry to which we are chaining the
348 348 * new entries. Since we are going to ring the
349 349 * doorbell for this WQE, we want to set its "dbd" bit.
350 350 *
351 351 * On the other hand, if the tail is NULL, even though
352 352 * we will have rung the doorbell for the previous WQE
353 353 * (for the hardware's sake) it is irrelevant to our
354 354 * purposes (for tracking WRIDs) because we know the
355 355 * request must have already completed.
356 356 */
357 357 wre_last = wridlist->wl_wre_old_tail;
358 358 if (wre_last != NULL) {
359 359 wre_last->wr_signaled_dbd |=
360 360 TAVOR_WRID_ENTRY_DOORBELLED;
361 361 }
362 362
363 363 /* Update some of the state in the QP */
364 364 qp->qp_sq_lastwqeaddr = desc;
365 365 qp->qp_sq_wqhdr->wq_tail = tail;
366 366
367 367 /* Ring the doorbell */
368 368 tavor_qp_send_doorbell(state,
369 369 (uint32_t)((uintptr_t)first - qp->qp_desc_off),
370 370 first_sz, qp->qp_qpnum, dbinfo.db_fence,
371 371 dbinfo.db_nopcode);
372 372 }
373 373 }
374 374
375 375 /*
376 376 * Update the "num_posted" return value (if necessary). Then drop
377 377 * the locks and return the status.
378 378 */
379 379 if (num_posted != NULL) {
380 380 *num_posted = posted_cnt;
381 381 }
382 382
383 383 mutex_exit(&qp->qp_sq_wqhdr->wq_wrid_wql->wql_lock);
384 384 mutex_exit(&qp->qp_lock);
385 385
386 386 TAVOR_TNF_EXIT(tavor_post_send);
387 387 return (status);
388 388 }
389 389
390 390
391 391 /*
392 392 * tavor_post_recv()
393 393 * Context: Can be called from interrupt or base context.
394 394 */
395 395 int
396 396 tavor_post_recv(tavor_state_t *state, tavor_qphdl_t qp,
397 397 ibt_recv_wr_t *wr, uint_t num_wr, uint_t *num_posted)
398 398 {
399 399 uint64_t *desc, *prev, *first;
400 400 uint32_t desc_sz, first_sz;
401 401 uint32_t wqeaddrsz, signaled_dbd;
402 402 uint32_t head, tail, next_tail, qsize_msk;
403 403 uint32_t sync_from, sync_to;
404 404 uint_t currindx, wrindx, numremain;
405 405 uint_t chainlen, posted_cnt;
406 406 uint_t maxdb = TAVOR_QP_MAXDESC_PER_DB;
407 407 int status;
408 408
409 409 TAVOR_TNF_ENTER(tavor_post_recv);
410 410
411 411 /*
412 412 * Check for user-mappable QP memory. Note: We do not allow kernel
413 413 * clients to post to QP memory that is accessible directly by the
414 414 * user. If the QP memory is user accessible, then return an error.
415 415 */
416 416 if (qp->qp_is_umap) {
417 417 TNF_PROBE_0(tavor_post_recv_inv_usrmapped_type,
418 418 TAVOR_TNF_ERROR, "");
419 419 TAVOR_TNF_EXIT(tavor_post_recv);
420 420 return (IBT_QP_HDL_INVALID);
421 421 }
422 422
423 423 /* Initialize posted_cnt */
424 424 posted_cnt = 0;
425 425
426 426 mutex_enter(&qp->qp_lock);
427 427
428 428 /*
429 429 * Check if QP is associated with an SRQ
430 430 */
431 431 if (qp->qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
432 432 mutex_exit(&qp->qp_lock);
433 433 TNF_PROBE_0(tavor_post_recv_fail_qp_on_srq,
434 434 TAVOR_TNF_ERROR, "");
435 435 TAVOR_TNF_EXIT(tavor_post_recv);
436 436 return (IBT_SRQ_IN_USE);
437 437 }
438 438
439 439 /*
440 440 * Check QP state. Cannot post Recv requests from the "Reset" state
441 441 */
442 442 if (qp->qp_state == TAVOR_QP_RESET) {
443 443 mutex_exit(&qp->qp_lock);
444 444 TNF_PROBE_0(tavor_post_recv_inv_qpstate_fail,
445 445 TAVOR_TNF_ERROR, "");
446 446 TAVOR_TNF_EXIT(tavor_post_recv);
447 447 return (IBT_QP_STATE_INVALID);
448 448 }
449 449
450 450 /* Grab the lock for the WRID list */
451 451 mutex_enter(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock);
452 452
453 453 /* Save away some initial QP state */
454 454 qsize_msk = qp->qp_rq_wqhdr->wq_size - 1;
455 455 tail = qp->qp_rq_wqhdr->wq_tail;
456 456 head = qp->qp_rq_wqhdr->wq_head;
457 457
458 458 /*
459 459 * For each ibt_recv_wr_t in the wr[] list passed in, parse the
460 460 * request and build a Recv WQE. Note: Because we are potentially
461 461 * building a chain of WQEs, we want to link them all together.
462 462 * However, we do not want to link the first one to the previous
463 463 * WQE until the entire chain has been linked. Then in the last
464 464 * step we ring the appropriate doorbell. Note: It is possible for
465 465 * more Work Requests to be posted than the HW will support at one
466 466 * shot. If this happens, we need to be able to post and ring
467 467 * several chains here until the entire request is complete.
468 468 */
469 469 wrindx = 0;
470 470 numremain = num_wr;
471 471 status = DDI_SUCCESS;
472 472 while ((wrindx < num_wr) && (status == DDI_SUCCESS)) {
473 473 /*
474 474 * For the first WQE on a new chain we need "prev" to point
475 475 * to the current descriptor. As we begin to process
476 476 * further, "prev" will be updated to point to the previous
477 477 * WQE on the current chain (see below).
478 478 */
479 479 prev = TAVOR_QP_RQ_ENTRY(qp, tail);
480 480
481 481 /*
482 482 * Before we begin, save the current "tail index" for later
483 483 * DMA sync
484 484 */
485 485 sync_from = tail;
486 486
487 487 /*
488 488 * Break the request up into chains that are less than or
489 489 * equal to the maximum number of WQEs that can be posted
490 490 * per doorbell ring
491 491 */
492 492 chainlen = (numremain > maxdb) ? maxdb : numremain;
493 493 numremain -= chainlen;
494 494 for (currindx = 0; currindx < chainlen; currindx++, wrindx++) {
495 495 /*
496 496 * Check for "queue full" condition. If the queue
497 497 * is already full, then no more WQEs can be posted.
498 498 * So break out, ring a doorbell (if necessary) and
499 499 * return an error
500 500 */
501 501 if (qp->qp_rq_wqhdr->wq_full != 0) {
502 502 status = IBT_QP_FULL;
503 503 TNF_PROBE_0_DEBUG(tavor_post_recv_rqfull,
504 504 TAVOR_TNF_TRACE, "");
505 505 break;
506 506 }
507 507
508 508 /*
509 509 * Increment the "tail index" and check for "queue
510 510 * full" condition. If we detect that the current
511 511 * work request is going to fill the work queue, then
512 512 * we mark this condition and continue.
513 513 */
514 514 next_tail = (tail + 1) & qsize_msk;
515 515 if (next_tail == head) {
516 516 qp->qp_rq_wqhdr->wq_full = 1;
517 517 }
518 518
519 519 /*
520 520 * Get the address of the location where the next
521 521 * Recv WQE should be built
522 522 */
523 523 desc = TAVOR_QP_RQ_ENTRY(qp, tail);
524 524
525 525 /*
526 526 * Call tavor_wqe_recv_build() to build the WQE
527 527 * at the given address. This routine uses the
528 528 * information in the ibt_recv_wr_t list (wr[]) and
529 529 * returns the size of the WQE when it returns.
530 530 */
531 531 status = tavor_wqe_recv_build(state, qp, &wr[wrindx],
532 532 desc, &desc_sz);
533 533 if (status != DDI_SUCCESS) {
534 534 TNF_PROBE_0(tavor_post_recv_bldwqe_fail,
535 535 TAVOR_TNF_ERROR, "");
536 536 break;
537 537 }
538 538
539 539 /*
540 540 * Add a WRID entry to the WRID list. Need to
541 541 * calculate the "wqeaddrsz" and "signaled_dbd"
542 542 * values to pass to tavor_wrid_add_entry(). Note:
543 543 * all Recv WQEs are essentially "signaled" and
544 544 * "doorbelled" (since Tavor HW requires all
545 545 * Recv WQEs to have their "DBD" bits set).
546 546 */
547 547 wqeaddrsz = TAVOR_QP_WQEADDRSZ((uint64_t *)(uintptr_t)
548 548 ((uint64_t)(uintptr_t)desc - qp->qp_desc_off),
549 549 desc_sz);
550 550 signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED |
551 551 TAVOR_WRID_ENTRY_DOORBELLED;
552 552 tavor_wrid_add_entry(qp->qp_rq_wqhdr,
553 553 wr[wrindx].wr_id, wqeaddrsz, signaled_dbd);
554 554
555 555 /*
556 556 * If this is not the first descriptor on the current
557 557 * chain, then link it to the previous WQE. Otherwise,
558 558 * save the address and size of this descriptor (in
559 559 * "first" and "first_sz" respectively) and continue.
560 560 */
561 561 if (currindx != 0) {
562 562 tavor_wqe_recv_linknext(desc, desc_sz, prev,
563 563 qp);
564 564 prev = desc;
565 565 } else {
566 566 first = desc;
567 567 first_sz = desc_sz;
568 568 }
569 569
570 570 /*
571 571 * Update the current "tail index" and increment
572 572 * "posted_cnt"
573 573 */
574 574 tail = next_tail;
575 575 posted_cnt++;
576 576 }
577 577
578 578 /*
579 579 * If we reach here and there are one or more WQEs which have
580 580 * been successfully chained together, then we need to link
581 581 * the current chain to the previously executing chain of
582 582 * descriptors (if there is one) and ring the doorbell for the
583 583 * recv work queue.
584 584 */
585 585 if (currindx != 0) {
586 586 /*
587 587 * Before we link the chain, we need to ensure that the
588 588 * "next" field on the last WQE is set to NULL (to
589 589 * indicate the end of the chain).
590 590 */
591 591 tavor_wqe_recv_linknext(NULL, 0, prev, qp);
592 592
593 593 /* Save away updated "tail index" for the DMA sync */
594 594 sync_to = tail;
595 595
596 596 /* Do a DMA sync for current recv WQE(s) */
597 597 tavor_wqe_sync(qp, sync_from, sync_to, TAVOR_WR_RECV,
598 598 DDI_DMA_SYNC_FORDEV);
599 599
600 600 /*
601 601 * Now link the chain to the old chain (if there was
602 602 * one).
603 603 */
604 604 tavor_wqe_recv_linknext(first, first_sz,
605 605 qp->qp_rq_lastwqeaddr, qp);
606 606
607 607 /*
608 608 * If there was a valid previous WQE (i.e. non-NULL),
609 609 * then sync it too. This is because we have updated
610 610 * its "next" fields and we want to ensure that the
611 611 * hardware can see the changes.
612 612 */
613 613 if (qp->qp_rq_lastwqeaddr != NULL) {
614 614 sync_to = sync_from;
615 615 sync_from = (sync_from - 1) & qsize_msk;
616 616 tavor_wqe_sync(qp, sync_from, sync_to,
617 617 TAVOR_WR_RECV, DDI_DMA_SYNC_FORDEV);
618 618 }
619 619
620 620 /* Update some of the state in the QP */
621 621 qp->qp_rq_lastwqeaddr = desc;
622 622 qp->qp_rq_wqhdr->wq_tail = tail;
623 623
624 624 /* Ring the doorbell */
625 625 tavor_qp_recv_doorbell(state,
626 626 (uint32_t)((uintptr_t)first - qp->qp_desc_off),
627 627 first_sz, qp->qp_qpnum, (chainlen % maxdb));
628 628 }
629 629 }
630 630
631 631 /*
632 632 * Update the "num_posted" return value (if necessary). Then drop
633 633 * the locks and return the status.
634 634 */
635 635 if (num_posted != NULL) {
636 636 *num_posted = posted_cnt;
637 637 }
638 638
639 639 mutex_exit(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock);
640 640 mutex_exit(&qp->qp_lock);
641 641
642 642 TAVOR_TNF_EXIT(tavor_post_recv);
643 643 return (status);
644 644 }
645 645
646 646 /*
647 647 * tavor_post_srq()
648 648 * Context: Can be called from interrupt or base context.
649 649 */
650 650 int
651 651 tavor_post_srq(tavor_state_t *state, tavor_srqhdl_t srq,
652 652 ibt_recv_wr_t *wr, uint_t num_wr, uint_t *num_posted)
653 653 {
654 654 uint64_t *desc, *prev, *first, *last_wqe_addr;
655 655 uint32_t signaled_dbd;
656 656 uint32_t sync_indx;
657 657 uint_t currindx, wrindx, numremain;
658 658 uint_t chainlen, posted_cnt;
659 659 uint_t maxdb = TAVOR_QP_MAXDESC_PER_DB;
660 660 int status;
661 661
662 662 TAVOR_TNF_ENTER(tavor_post_srq);
663 663
664 664 /*
665 665 * Check for user-mappable SRQ memory. Note: We do not allow kernel
666 666 * clients to post to SRQ memory that is accessible directly by the
667 667 * user. If the SRQ memory is user accessible, then return an error.
668 668 */
669 669 if (srq->srq_is_umap) {
670 670 TNF_PROBE_0(tavor_post_srq_inv_usrmapped_type,
671 671 TAVOR_TNF_ERROR, "");
672 672 TAVOR_TNF_EXIT(tavor_post_srq);
673 673 return (IBT_SRQ_HDL_INVALID);
674 674 }
675 675
676 676 /* Initialize posted_cnt */
677 677 posted_cnt = 0;
678 678
679 679 mutex_enter(&srq->srq_lock);
680 680
681 681 /*
682 682 * Check SRQ state. Cannot post Recv requests when SRQ is in error
683 683 */
684 684 if (srq->srq_state == TAVOR_SRQ_STATE_ERROR) {
685 685 mutex_exit(&srq->srq_lock);
686 686 TNF_PROBE_0(tavor_post_srq_inv_srqstate_fail,
687 687 TAVOR_TNF_ERROR, "");
688 688 TAVOR_TNF_EXIT(tavor_post_srq);
689 689 return (IBT_QP_STATE_INVALID);
690 690 }
691 691
692 692 /* Grab the lock for the WRID list */
693 693 mutex_enter(&srq->srq_wrid_wql->wql_lock);
694 694
695 695 /*
696 696 * For each ibt_recv_wr_t in the wr[] list passed in, parse the
697 697 * request and build a Recv WQE. Note: Because we are potentially
698 698 * building a chain of WQEs, we want to link them all together.
699 699 * However, we do not want to link the first one to the previous
700 700 * WQE until the entire chain has been linked. Then in the last
701 701 * step we ring the appropriate doorbell. Note: It is possible for
702 702 * more Work Requests to be posted than the HW will support at one
703 703 * shot. If this happens, we need to be able to post and ring
704 704 * several chains here until the entire request is complete.
705 705 */
706 706 wrindx = 0;
707 707 numremain = num_wr;
708 708 status = DDI_SUCCESS;
709 709 while ((wrindx < num_wr) && (status == DDI_SUCCESS)) {
710 710 /*
711 711 * For the first WQE on a new chain we need "prev" to point
712 712 * to the current descriptor. As we begin to process
713 713 * further, "prev" will be updated to point to the previous
714 714 * WQE on the current chain (see below).
715 715 */
716 716 if (srq->srq_wq_lastwqeindx == -1) {
717 717 prev = NULL;
718 718 } else {
719 719 prev = TAVOR_SRQ_WQE_ADDR(srq, srq->srq_wq_lastwqeindx);
720 720 }
721 721
722 722 /*
723 723 * Break the request up into chains that are less than or
724 724 * equal to the maximum number of WQEs that can be posted
725 725 * per doorbell ring
726 726 */
727 727 chainlen = (numremain > maxdb) ? maxdb : numremain;
728 728 numremain -= chainlen;
729 729 for (currindx = 0; currindx < chainlen; currindx++, wrindx++) {
730 730
731 731 /*
732 732 * Check for "queue full" condition. If the queue
733 733 * is already full, then no more WQEs can be posted.
734 734 * So break out, ring a doorbell (if necessary) and
735 735 * return an error
736 736 */
737 737 if (srq->srq_wridlist->wl_free_list_indx == -1) {
738 738 status = IBT_QP_FULL;
739 739 TNF_PROBE_0_DEBUG(tavor_post_srq_wqfull,
740 740 TAVOR_TNF_TRACE, "");
741 741 break;
742 742 }
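/*
 * A hedged sketch of the "-1 means empty" free-list convention tested
 * above, assuming free WQEs are linked together by index.  The
 * srq_freelist_t type and freelist_get() helper are hypothetical.
 */
#include <stdint.h>

typedef struct {
	int32_t	*fl_next;	/* per-entry index of next free WQE */
	int32_t	fl_head;	/* head of free list; -1 means empty */
} srq_freelist_t;

static int32_t
freelist_get(srq_freelist_t *fl)
{
	int32_t indx = fl->fl_head;

	if (indx != -1)
		fl->fl_head = fl->fl_next[indx];
	return (indx);		/* -1 corresponds to IBT_QP_FULL above */
}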
743 743
744 744 /*
745 745 * Get the address of the location where the next
746 746 * Recv WQE should be built
747 747 */
748 748 desc = TAVOR_SRQ_WQE_ADDR(srq,
749 749 srq->srq_wridlist->wl_free_list_indx);
750 750
751 751 /*
752 752 * Add a WRID entry to the WRID list. Need to
753 753 * set the "signaled_dbd" values to pass to
754 754 * tavor_wrid_add_entry(). Note: all Recv WQEs are
755 755 * essentially "signaled"
756 756 *
757 757 * The 'size' is stored at srq_alloc time, in the
758 758 * srq_wq_stride. This is a constant value required
759 759 * for SRQ.
760 760 */
761 761 signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED;
762 762 tavor_wrid_add_entry_srq(srq, wr[wrindx].wr_id,
763 763 signaled_dbd);
764 764
765 765 /*
766 766 * Call tavor_wqe_srq_build() to build the WQE
767 767 * at the given address. This routine uses the
768 768 * information in the ibt_recv_wr_t list (wr[]) and
769 769 * returns the size of the WQE when it returns.
770 770 */
771 771 status = tavor_wqe_srq_build(state, srq, &wr[wrindx],
772 772 desc);
773 773 if (status != DDI_SUCCESS) {
774 774 TNF_PROBE_0(tavor_post_recv_bldwqe_fail,
775 775 TAVOR_TNF_ERROR, "");
776 776 break;
777 777 }
778 778
779 779 /*
780 780 * If this is not the first descriptor on the current
781 781 * chain, then link it to the previous WQE. Otherwise,
782 782 * save the address of this descriptor (in "first") and
783 783 * continue.
784 784 */
785 785 if (currindx != 0) {
786 786 tavor_wqe_srq_linknext(desc, prev, srq);
787 787 sync_indx = TAVOR_SRQ_WQE_INDEX(
788 788 srq->srq_wq_buf, prev,
789 789 srq->srq_wq_log_wqesz);
790 790
791 791 /* Do a DMA sync for previous recv WQE */
792 792 tavor_wqe_sync(srq, sync_indx, sync_indx+1,
793 793 TAVOR_WR_SRQ, DDI_DMA_SYNC_FORDEV);
794 794
795 795 prev = desc;
796 796 } else {
797 797
798 798 /*
799 799 * In this case, the last WQE on the chain is
800 800 * also considered 'first'. So set prev to
801 801 * first, here.
802 802 */
803 803 first = prev = desc;
804 804 }
805 805
806 806 /*
807 807 * Increment "posted_cnt"
808 808 */
809 809 posted_cnt++;
810 810 }
811 811
812 812 /*
813 813 * If we reach here and there are one or more WQEs which have
814 814 * been successfully chained together, then we need to link
815 815 * the current chain to the previously executing chain of
816 816 * descriptors (if there is one) and ring the doorbell for the
817 817 * recv work queue.
818 818 */
819 819 if (currindx != 0) {
820 820 /*
821 821 * Before we link the chain, we need to ensure that the
822 822 * "next" field on the last WQE is set to NULL (to
823 823 * indicate the end of the chain).
824 824 */
825 825 tavor_wqe_srq_linknext(NULL, prev, srq);
826 826
827 827 sync_indx = TAVOR_SRQ_WQE_INDEX(srq->srq_wq_buf, prev,
828 828 srq->srq_wq_log_wqesz);
829 829
830 830 /* Do a DMA sync for current recv WQE */
831 831 tavor_wqe_sync(srq, sync_indx, sync_indx+1,
832 832 TAVOR_WR_SRQ, DDI_DMA_SYNC_FORDEV);
833 833
834 834 /*
835 835 * Now link the chain to the old chain (if there was
836 836 * one).
837 837 */
838 838 if (srq->srq_wq_lastwqeindx == -1) {
839 839 last_wqe_addr = NULL;
840 840 } else {
841 841 last_wqe_addr = TAVOR_SRQ_WQE_ADDR(srq,
842 842 srq->srq_wq_lastwqeindx);
843 843 }
844 844 tavor_wqe_srq_linknext(first, last_wqe_addr, srq);
845 845
846 846 /*
847 847 * If there was a valid previous WQE (i.e. valid index),
848 848 * then sync it too. This is because we have updated
849 849 * its "next" fields and we want to ensure that the
850 850 * hardware can see the changes.
851 851 */
852 852 if (srq->srq_wq_lastwqeindx != -1) {
853 853 sync_indx = srq->srq_wq_lastwqeindx;
854 854 tavor_wqe_sync(srq, sync_indx, sync_indx+1,
855 855 TAVOR_WR_SRQ, DDI_DMA_SYNC_FORDEV);
856 856 }
857 857
858 858 /* Update some of the state in the SRQ */
859 859 srq->srq_wq_lastwqeindx = TAVOR_SRQ_WQE_INDEX(
860 860 srq->srq_wq_buf, desc,
861 861 srq->srq_wq_log_wqesz);
862 862
863 863 /* Ring the doorbell */
864 864 /* SRQ needs NDS of 0 */
865 865 tavor_qp_recv_doorbell(state,
866 866 (uint32_t)((uintptr_t)first - srq->srq_desc_off),
867 867 0, srq->srq_srqnum, (chainlen % maxdb));
868 868 }
869 869 }
870 870
871 871 /*
872 872 * Update the "num_posted" return value (if necessary). Then drop
873 873 * the locks and return the status.
874 874 */
875 875 if (num_posted != NULL) {
876 876 *num_posted = posted_cnt;
877 877 }
878 878
879 879 mutex_exit(&srq->srq_wrid_wql->wql_lock);
880 880 mutex_exit(&srq->srq_lock);
881 881
882 882 TAVOR_TNF_EXIT(tavor_post_srq);
883 883 return (status);
884 884 }
885 885
886 886
887 887 /*
888 888 * tavor_qp_send_doorbell()
889 889 * Context: Can be called from interrupt or base context.
890 890 */
891 891 static void
892 892 tavor_qp_send_doorbell(tavor_state_t *state, uint32_t nda, uint32_t nds,
893 893 uint32_t qpn, uint32_t fence, uint32_t nopcode)
894 894 {
895 895 uint64_t doorbell = 0;
896 896
897 897 /* Build the doorbell from the parameters */
898 898 doorbell = (((uint64_t)nda & TAVOR_QPSNDDB_NDA_MASK) <<
899 899 TAVOR_QPSNDDB_NDA_SHIFT) |
900 900 ((uint64_t)fence << TAVOR_QPSNDDB_F_SHIFT) |
901 901 ((uint64_t)nopcode << TAVOR_QPSNDDB_NOPCODE_SHIFT) |
902 902 ((uint64_t)qpn << TAVOR_QPSNDDB_QPN_SHIFT) | nds;
903 903
904 904 TNF_PROBE_1_DEBUG(tavor_qp_send_doorbell, TAVOR_TNF_TRACE, "",
905 905 tnf_ulong, doorbell, doorbell);
906 906
907 907 /* Write the doorbell to UAR */
908 908 TAVOR_UAR_DOORBELL(state, (uint64_t *)&state->ts_uar->send,
909 909 doorbell);
910 910 }
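/*
 * A stand-alone sketch of assembling the 64-bit send doorbell.  Every
 * shift and mask below is an illustrative assumption; the real
 * constants are the TAVOR_QPSNDDB_* definitions in tavor_hw.h.
 */
#include <stdint.h>

static uint64_t
build_send_doorbell(uint32_t nda, uint32_t nds, uint32_t qpn,
    uint32_t fence, uint32_t nopcode)
{
	uint64_t db = 0;

	db |= ((uint64_t)nda & 0xFFFFFFC0) << 32;	/* 64B-aligned NDA */
	db |= (uint64_t)fence << 37;			/* assumed F bit */
	db |= (uint64_t)nopcode << 32;			/* assumed opcode */
	db |= (uint64_t)qpn << 8;			/* assumed QPN */
	db |= (nds & 0xFF);				/* size, 16B chunks */
	return (db);
}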
911 911
912 912
913 913 /*
914 914 * tavor_qp_recv_doorbell()
915 915 * Context: Can be called from interrupt or base context.
916 916 */
917 917 static void
918 918 tavor_qp_recv_doorbell(tavor_state_t *state, uint32_t nda, uint32_t nds,
919 919 uint32_t qpn, uint32_t credits)
920 920 {
921 921 uint64_t doorbell = 0;
922 922
923 923 /* Build the doorbell from the parameters */
924 924 doorbell = (((uint64_t)nda & TAVOR_QPRCVDB_NDA_MASK) <<
925 925 TAVOR_QPRCVDB_NDA_SHIFT) |
926 926 ((uint64_t)nds << TAVOR_QPRCVDB_NDS_SHIFT) |
927 927 ((uint64_t)qpn << TAVOR_QPRCVDB_QPN_SHIFT) | credits;
928 928
929 929 TNF_PROBE_1_DEBUG(tavor_qp_recv_doorbell, TAVOR_TNF_TRACE, "",
930 930 tnf_ulong, doorbell, doorbell);
931 931
932 932 /* Write the doorbell to UAR */
933 933 TAVOR_UAR_DOORBELL(state, (uint64_t *)&state->ts_uar->recv,
934 934 doorbell);
935 935 }
936 936
937 937
938 938 /*
939 939 * tavor_wqe_send_build()
940 940 * Context: Can be called from interrupt or base context.
941 941 */
942 942 static int
943 943 tavor_wqe_send_build(tavor_state_t *state, tavor_qphdl_t qp,
944 944 ibt_send_wr_t *wr, uint64_t *desc, uint_t *size)
945 945 {
946 946 tavor_hw_snd_wqe_ud_t *ud;
947 947 tavor_hw_snd_wqe_remaddr_t *rc;
948 948 tavor_hw_snd_wqe_atomic_t *at;
949 949 tavor_hw_snd_wqe_remaddr_t *uc;
950 950 tavor_hw_snd_wqe_bind_t *bn;
951 951 tavor_hw_wqe_sgl_t *ds;
952 952 ibt_wr_ds_t *sgl;
953 953 tavor_ahhdl_t ah;
954 954 uint32_t nds;
955 955 int i, num_ds, status;
956 956
957 957 TAVOR_TNF_ENTER(tavor_wqe_send_build);
958 958
959 959 ASSERT(MUTEX_HELD(&qp->qp_lock));
960 960
961 961 /* Initialize the information for the Data Segments */
962 962 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc +
963 963 sizeof (tavor_hw_snd_wqe_nextctrl_t));
964 964 nds = wr->wr_nds;
965 965 sgl = wr->wr_sgl;
966 966 num_ds = 0;
967 967
968 968 /*
969 969 * Building a Send WQE depends first and foremost on the transport
970 970 * type of Work Request (i.e. UD, RC, or UC)
971 971 */
972 972 switch (wr->wr_trans) {
973 973 case IBT_UD_SRV:
974 974 /* Ensure that work request transport type matches QP type */
975 975 if (qp->qp_serv_type != TAVOR_QP_UD) {
976 976 TNF_PROBE_0(tavor_wqe_send_build_inv_servtype_fail,
977 977 TAVOR_TNF_ERROR, "");
978 978 TAVOR_TNF_EXIT(tavor_wqe_send_build);
979 979 return (IBT_QP_SRV_TYPE_INVALID);
980 980 }
981 981
982 982 /*
983 983 * Validate the operation type. For UD requests, only the
984 984 * "Send" operation is valid
985 985 */
986 986 if (wr->wr_opcode != IBT_WRC_SEND) {
987 987 TNF_PROBE_0(tavor_wqe_send_build_inv_optype_fail,
988 988 TAVOR_TNF_ERROR, "");
989 989 TAVOR_TNF_EXIT(tavor_wqe_send_build);
990 990 return (IBT_QP_OP_TYPE_INVALID);
991 991 }
992 992
993 993 /*
994 994 * If this is a Special QP (QP0 or QP1), then we need to
995 995 * build MLX WQEs instead. So jump to tavor_wqe_mlx_build()
996 996 * and return whatever status it returns
997 997 */
998 998 if (qp->qp_is_special) {
999 999 status = tavor_wqe_mlx_build(state, qp, wr, desc, size);
1000 1000 TAVOR_TNF_EXIT(tavor_wqe_send_build);
1001 1001 return (status);
1002 1002 }
1003 1003
1004 1004 /*
1005 1005 * Otherwise, if this is a normal UD Send request, then fill
1006 1006 * all the fields in the Tavor UD header for the WQE. Note:
1007 1007 * to do this we'll need to extract some information from the
1008 1008 * Address Handle passed with the work request.
1009 1009 */
1010 1010 ud = (tavor_hw_snd_wqe_ud_t *)((uintptr_t)desc +
1011 1011 sizeof (tavor_hw_snd_wqe_nextctrl_t));
1012 1012 ah = (tavor_ahhdl_t)wr->wr.ud.udwr_dest->ud_ah;
1013 1013 if (ah == NULL) {
1014 1014 TNF_PROBE_0(tavor_wqe_send_build_invahhdl_fail,
1015 1015 TAVOR_TNF_ERROR, "");
1016 1016 TAVOR_TNF_EXIT(tavor_wqe_send_build);
1017 1017 return (IBT_AH_HDL_INVALID);
1018 1018 }
1019 1019
1020 1020 /*
1021 1021 * Build the Unreliable Datagram Segment for the WQE, using
1022 1022 * the information from the address handle and the work
1023 1023 * request.
1024 1024 */
1025 1025 mutex_enter(&ah->ah_lock);
1026 1026 TAVOR_WQE_BUILD_UD(qp, ud, ah, wr);
1027 1027 mutex_exit(&ah->ah_lock);
1028 1028
1029 1029 /* Update "ds" for filling in Data Segments (below) */
1030 1030 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)ud +
1031 1031 sizeof (tavor_hw_snd_wqe_ud_t));
1032 1032 break;
1033 1033
1034 1034 case IBT_RC_SRV:
1035 1035 /* Ensure that work request transport type matches QP type */
1036 1036 if (qp->qp_serv_type != TAVOR_QP_RC) {
1037 1037 TNF_PROBE_0(tavor_wqe_send_build_inv_servtype_fail,
1038 1038 TAVOR_TNF_ERROR, "");
1039 1039 TAVOR_TNF_EXIT(tavor_wqe_send_build);
1040 1040 return (IBT_QP_SRV_TYPE_INVALID);
1041 1041 }
1042 1042
1043 1043 /*
1044 1044 * Validate the operation type. For RC requests, we allow
1045 1045 * "Send", "RDMA Read", "RDMA Write", various "Atomic"
1046 1046 * operations, and memory window "Bind"
1047 1047 */
1048 1048 if ((wr->wr_opcode != IBT_WRC_SEND) &&
1049 1049 (wr->wr_opcode != IBT_WRC_RDMAR) &&
1050 1050 (wr->wr_opcode != IBT_WRC_RDMAW) &&
1051 1051 (wr->wr_opcode != IBT_WRC_CSWAP) &&
1052 1052 (wr->wr_opcode != IBT_WRC_FADD) &&
1053 1053 (wr->wr_opcode != IBT_WRC_BIND)) {
1054 1054 TNF_PROBE_0(tavor_wqe_send_build_inv_optype_fail,
1055 1055 TAVOR_TNF_ERROR, "");
1056 1056 TAVOR_TNF_EXIT(tavor_wqe_send_build);
1057 1057 return (IBT_QP_OP_TYPE_INVALID);
1058 1058 }
1059 1059
1060 1060 /*
1061 1061 * If this is a Send request, then all we need to do is break
1062 1062 * out here and begin the Data Segment processing below
1063 1063 */
1064 1064 if (wr->wr_opcode == IBT_WRC_SEND) {
1065 1065 break;
1066 1066 }
1067 1067
1068 1068 /*
1069 1069 * If this is an RDMA Read or RDMA Write request, then fill
1070 1070 * in the "Remote Address" header fields.
1071 1071 */
1072 1072 if ((wr->wr_opcode == IBT_WRC_RDMAR) ||
1073 1073 (wr->wr_opcode == IBT_WRC_RDMAW)) {
1074 1074 rc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1075 1075 sizeof (tavor_hw_snd_wqe_nextctrl_t));
1076 1076
1077 1077 /*
1078 1078 * Build the Remote Address Segment for the WQE, using
1079 1079 * the information from the RC work request.
1080 1080 */
1081 1081 TAVOR_WQE_BUILD_REMADDR(qp, rc, &wr->wr.rc.rcwr.rdma);
1082 1082
1083 1083 /* Update "ds" for filling in Data Segments (below) */
1084 1084 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)rc +
1085 1085 sizeof (tavor_hw_snd_wqe_remaddr_t));
1086 1086 break;
1087 1087 }
1088 1088
1089 1089 /*
1090 1090 * If this is one of the Atomic type operations (i.e.
1091 1091 * Compare-Swap or Fetch-Add), then fill in both the "Remote
1092 1092 * Address" header fields and the "Atomic" header fields.
1093 1093 */
1094 1094 if ((wr->wr_opcode == IBT_WRC_CSWAP) ||
1095 1095 (wr->wr_opcode == IBT_WRC_FADD)) {
1096 1096 rc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1097 1097 sizeof (tavor_hw_snd_wqe_nextctrl_t));
1098 1098 at = (tavor_hw_snd_wqe_atomic_t *)((uintptr_t)rc +
1099 1099 sizeof (tavor_hw_snd_wqe_remaddr_t));
1100 1100
1101 1101 /*
1102 1102 * Build the Remote Address and Atomic Segments for
1103 1103 * the WQE, using the information from the RC Atomic
1104 1104 * work request.
1105 1105 */
1106 1106 TAVOR_WQE_BUILD_RC_ATOMIC_REMADDR(qp, rc, wr);
1107 1107 TAVOR_WQE_BUILD_ATOMIC(qp, at, wr->wr.rc.rcwr.atomic);
1108 1108
1109 1109 /* Update "ds" for filling in Data Segments (below) */
1110 1110 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)at +
1111 1111 sizeof (tavor_hw_snd_wqe_atomic_t));
1112 1112
1113 1113 /*
1114 1114 * Update "nds" and "sgl" because Atomic requests have
1115 1115 * only a single Data Segment (and they are encoded
1116 1116 * somewhat differently in the work request).
1117 1117 */
1118 1118 nds = 1;
1119 1119 sgl = wr->wr_sgl;
1120 1120 break;
1121 1121 }
1122 1122
1123 1123 /*
1124 1124 * If this is memory window Bind operation, then we call the
1125 1125 * tavor_wr_bind_check() routine to validate the request and
1126 1126 * to generate the updated RKey. If this is successful, then
1127 1127 * we fill in the WQE's "Bind" header fields.
1128 1128 */
1129 1129 if (wr->wr_opcode == IBT_WRC_BIND) {
1130 1130 status = tavor_wr_bind_check(state, wr);
1131 1131 if (status != DDI_SUCCESS) {
1132 1132 TNF_PROBE_0(tavor_wqe_send_build_bind_fail,
1133 1133 TAVOR_TNF_ERROR, "");
1134 1134 TAVOR_TNF_EXIT(tavor_wqe_send_build);
1135 1135 return (status);
1136 1136 }
1137 1137
1138 1138 bn = (tavor_hw_snd_wqe_bind_t *)((uintptr_t)desc +
1139 1139 sizeof (tavor_hw_snd_wqe_nextctrl_t));
1140 1140
1141 1141 /*
1142 1142 * Build the Bind Memory Window Segments for the WQE,
1143 1143 * using the information from the RC Bind memory
1144 1144 * window work request.
1145 1145 */
1146 1146 TAVOR_WQE_BUILD_BIND(qp, bn, wr->wr.rc.rcwr.bind);
1147 1147
1148 1148 /*
1149 1149 * Update the "ds" pointer. Even though the "bind"
1150 1150 * operation requires no SGLs, this is necessary to
1151 1151 * facilitate the correct descriptor size calculations
1152 1152 * (below).
1153 1153 */
1154 1154 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)bn +
1155 1155 sizeof (tavor_hw_snd_wqe_bind_t));
1156 1156 nds = 0;
1157 1157 }
1158 1158 break;
1159 1159
1160 1160 case IBT_UC_SRV:
1161 1161 /* Ensure that work request transport type matches QP type */
1162 1162 if (qp->qp_serv_type != TAVOR_QP_UC) {
1163 1163 TNF_PROBE_0(tavor_wqe_send_build_inv_servtype_fail,
1164 1164 TAVOR_TNF_ERROR, "");
1165 1165 TAVOR_TNF_EXIT(tavor_wqe_send_build);
1166 1166 return (IBT_QP_SRV_TYPE_INVALID);
1167 1167 }
1168 1168
1169 1169 /*
1170 1170 * Validate the operation type. For UC requests, we only
1171 1171 * allow "Send", "RDMA Write", and memory window "Bind".
1172 1172 * Note: Unlike RC, UC does not allow "RDMA Read" or "Atomic"
1173 1173 * operations
1174 1174 */
1175 1175 if ((wr->wr_opcode != IBT_WRC_SEND) &&
1176 1176 (wr->wr_opcode != IBT_WRC_RDMAW) &&
1177 1177 (wr->wr_opcode != IBT_WRC_BIND)) {
1178 1178 TNF_PROBE_0(tavor_wqe_send_build_inv_optype_fail,
1179 1179 TAVOR_TNF_ERROR, "");
1180 1180 TAVOR_TNF_EXIT(tavor_wqe_send_build);
1181 1181 return (IBT_QP_OP_TYPE_INVALID);
1182 1182 }
1183 1183
1184 1184 /*
1185 1185 * If this is a Send request, then all we need to do is break
1186 1186 * out here and begin the Data Segment processing below
1187 1187 */
1188 1188 if (wr->wr_opcode == IBT_WRC_SEND) {
1189 1189 break;
1190 1190 }
1191 1191
1192 1192 /*
1193 1193 * If this is an RDMA Write request, then fill in the "Remote
1194 1194 * Address" header fields.
1195 1195 */
1196 1196 if (wr->wr_opcode == IBT_WRC_RDMAW) {
1197 1197 uc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1198 1198 sizeof (tavor_hw_snd_wqe_nextctrl_t));
1199 1199
1200 1200 /*
1201 1201 * Build the Remote Address Segment for the WQE, using
1202 1202 * the information from the UC work request.
1203 1203 */
1204 1204 TAVOR_WQE_BUILD_REMADDR(qp, uc, &wr->wr.uc.ucwr.rdma);
1205 1205
1206 1206 /* Update "ds" for filling in Data Segments (below) */
1207 1207 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)uc +
1208 1208 sizeof (tavor_hw_snd_wqe_remaddr_t));
1209 1209 break;
1210 1210 }
1211 1211
1212 1212 /*
1213 1213 * If this is memory window Bind operation, then we call the
1214 1214 * tavor_wr_bind_check() routine to validate the request and
1215 1215 * to generate the updated RKey. If this is successful, then
1216 1216 * we fill in the WQE's "Bind" header fields.
1217 1217 */
1218 1218 if (wr->wr_opcode == IBT_WRC_BIND) {
1219 1219 status = tavor_wr_bind_check(state, wr);
1220 1220 if (status != DDI_SUCCESS) {
1221 1221 TNF_PROBE_0(tavor_wqe_send_build_bind_fail,
1222 1222 TAVOR_TNF_ERROR, "");
1223 1223 TAVOR_TNF_EXIT(tavor_wqe_send_build);
1224 1224 return (status);
1225 1225 }
1226 1226
1227 1227 bn = (tavor_hw_snd_wqe_bind_t *)((uintptr_t)desc +
1228 1228 sizeof (tavor_hw_snd_wqe_nextctrl_t));
1229 1229
1230 1230 /*
1231 1231 * Build the Bind Memory Window Segments for the WQE,
1232 1232 * using the information from the UC Bind memory
1233 1233 * window work request.
1234 1234 */
1235 1235 TAVOR_WQE_BUILD_BIND(qp, bn, wr->wr.uc.ucwr.bind);
1236 1236
1237 1237 /*
1238 1238 * Update the "ds" pointer. Even though the "bind"
1239 1239 * operation requires no SGLs, this is necessary to
1240 1240 * facilitate the correct descriptor size calculations
1241 1241 * (below).
1242 1242 */
1243 1243 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)bn +
1244 1244 sizeof (tavor_hw_snd_wqe_bind_t));
1245 1245 nds = 0;
1246 1246 }
1247 1247 break;
1248 1248
1249 1249 default:
1250 1250 TNF_PROBE_0(tavor_wqe_send_build_inv_tranport_fail,
1251 1251 TAVOR_TNF_ERROR, "");
1252 1252 TAVOR_TNF_EXIT(tavor_wqe_send_build);
1253 1253 return (IBT_QP_SRV_TYPE_INVALID);
1254 1254 }
1255 1255
1256 1256 /*
1257 1257 * Now fill in the Data Segments (SGL) for the Send WQE based on
1258 1258 * the values set up above (i.e. "sgl", "nds", and the "ds" pointer).
1259 1259 * Start by checking for a valid number of SGL entries
1260 1260 */
1261 1261 if (nds > qp->qp_sq_sgl) {
1262 1262 TNF_PROBE_0(tavor_wqe_send_build_toomanysgl_fail,
1263 1263 TAVOR_TNF_ERROR, "");
1264 1264 TAVOR_TNF_EXIT(tavor_wqe_send_build);
1265 1265 return (IBT_QP_SGL_LEN_INVALID);
1266 1266 }
1267 1267
1268 1268 /*
1269 1269 * For each SGL in the Send Work Request, fill in the Send WQE's data
1270 1270 * segments. Note: We skip any SGL with zero size because Tavor
1271 1271 * hardware cannot handle a zero for "byte_cnt" in the WQE. Actually
1272 1272 * the encoding for zero means a 2GB transfer. Because of this special
1273 1273 * encoding in the hardware, we mask the requested length with
1274 1274 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1275 1275 * zero.)
1276 1276 */
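/*
 * The special zero encoding in isolation.  The 31-bit mask value is
 * an assumption standing in for TAVOR_WQE_SGL_BYTE_CNT_MASK.
 */
#include <assert.h>
#include <stdint.h>

#define	BYTE_CNT_MASK	0x7FFFFFFFU	/* assumed 31-bit byte count */

int
main(void)
{
	uint64_t two_gb = 0x80000000ULL;

	/* 2GB masks down to 0, which the hardware decodes as 2GB... */
	assert((two_gb & BYTE_CNT_MASK) == 0);
	/* ...so a genuinely zero-length SGL must be skipped instead */
	return (0);
}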
1277 1277 for (i = 0; i < nds; i++) {
1278 1278 if (sgl[i].ds_len == 0) {
1279 1279 continue;
1280 1280 }
1281 1281
1282 1282 /*
1283 1283 * Fill in the Data Segment(s) for the current WQE, using the
1284 1284 * information contained in the scatter-gather list of the
1285 1285 * work request.
1286 1286 */
1287 1287 TAVOR_WQE_BUILD_DATA_SEG(qp, &ds[num_ds], &sgl[i]);
1288 1288 num_ds++;
1289 1289 }
1290 1290
1291 1291 /* Return the size of descriptor (in 16-byte chunks) */
1292 1292 *size = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc) >> 4;
1293 1293
1294 1294 TAVOR_TNF_EXIT(tavor_wqe_send_build);
1295 1295 return (DDI_SUCCESS);
1296 1296 }
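/*
 * A worked example of the size computation at the end of the routine
 * above, assuming a 16-byte next/ctrl header and 16-byte data
 * segments (the real sizes come from tavor_hw.h).
 */
#include <assert.h>
#include <stdint.h>

int
main(void)
{
	uintptr_t desc = 0x1000;		/* start of the WQE */
	uintptr_t ds = desc + 16;		/* assumed 16B header */
	int num_ds = 2;				/* two 16B data segments */
	uintptr_t end = ds + (uintptr_t)num_ds * 16;
	unsigned int size = (unsigned int)((end - desc) >> 4);

	assert(size == 3);			/* three 16-byte chunks */
	return (0);
}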
1297 1297
1298 1298
1299 1299 /*
1300 1300 * tavor_wqe_send_linknext()
1301 1301 * Context: Can be called from interrupt or base context.
1302 1302 */
1303 1303 static void
1304 1304 tavor_wqe_send_linknext(ibt_send_wr_t *curr_wr, ibt_send_wr_t *prev_wr,
1305 1305 uint64_t *curr_desc, uint_t curr_descsz, uint64_t *prev_desc,
1306 1306 tavor_sw_wqe_dbinfo_t *dbinfo, tavor_qphdl_t qp)
1307 1307 {
1308 1308 uint64_t next, ctrl;
1309 1309 uint32_t nopcode, fence;
1310 1310
1311 1311 /*
1312 1312 * Calculate the "next" field of the descriptor. This amounts to
1313 1313 * setting up the "next_wqe_addr", "nopcode", "fence", and "nds"
1314 1314 * fields (see tavor_hw.h for more). Note: If there is no next
1315 1315 * descriptor (i.e. if the current descriptor is the last WQE on
1316 1316 * the chain), then set "next" to zero.
1317 1317 */
1318 1318 if (curr_desc != NULL) {
1319 1319 /*
1320 1320 * Determine the value for the Tavor WQE "nopcode" field
1321 1321 * by using the IBTF opcode from the work request
1322 1322 */
1323 1323 switch (curr_wr->wr_opcode) {
1324 1324 case IBT_WRC_RDMAW:
1325 1325 if (curr_wr->wr_flags & IBT_WR_SEND_IMMED) {
1326 1326 nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAWI;
1327 1327 } else {
1328 1328 nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAW;
1329 1329 }
1330 1330 break;
1331 1331
1332 1332 case IBT_WRC_SEND:
1333 1333 if (curr_wr->wr_flags & IBT_WR_SEND_IMMED) {
1334 1334 nopcode = TAVOR_WQE_SEND_NOPCODE_SENDI;
1335 1335 } else {
1336 1336 nopcode = TAVOR_WQE_SEND_NOPCODE_SEND;
1337 1337 }
1338 1338 break;
1339 1339
1340 1340 case IBT_WRC_RDMAR:
1341 1341 nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAR;
1342 1342 break;
1343 1343
1344 1344 case IBT_WRC_CSWAP:
1345 1345 nopcode = TAVOR_WQE_SEND_NOPCODE_ATMCS;
1346 1346 break;
1347 1347
1348 1348 case IBT_WRC_FADD:
1349 1349 nopcode = TAVOR_WQE_SEND_NOPCODE_ATMFA;
1350 1350 break;
1351 1351
1352 1352 case IBT_WRC_BIND:
1353 1353 nopcode = TAVOR_WQE_SEND_NOPCODE_BIND;
1354 1354 break;
1355 1355 }
1356 1356
1357 1357 curr_desc = (uint64_t *)(uintptr_t)((uintptr_t)curr_desc
1358 1358 - qp->qp_desc_off);
1359 1359 next = ((uint64_t)(uintptr_t)curr_desc &
1360 1360 TAVOR_WQE_NDA_MASK) << 32;
1361 1361 next = next | ((uint64_t)nopcode << 32);
1362 1362 fence = (curr_wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0;
1363 1363 if (fence) {
1364 1364 next = next | TAVOR_WQE_SEND_FENCE_MASK;
1365 1365 }
1366 1366 next = next | (curr_descsz & TAVOR_WQE_NDS_MASK);
1367 1367
1368 1368 /*
1369 1369 * If a send queue doorbell will be rung for the next
1370 1370 * WQE on the chain, then set the current WQE's "dbd" bit.
1371 1371 * Note: We also update the "dbinfo" structure here to pass
1372 1372 * back information about what should (later) be included
1373 1373 * in the send queue doorbell.
1374 1374 */
1375 1375 if (dbinfo) {
1376 1376 next = next | TAVOR_WQE_DBD_MASK;
1377 1377 dbinfo->db_nopcode = nopcode;
1378 1378 dbinfo->db_fence = fence;
1379 1379 }
1380 1380 } else {
1381 1381 next = 0;
1382 1382 }
1383 1383
1384 1384 /*
1385 1385 * If this WQE is supposed to be linked to the previous descriptor,
1386 1386 * then we need to update not only the previous WQE's "next" fields
1387 1387 * but also its "ctrl" fields (i.e. the "c", "e",
1388 1388 * "s", "i" and "immediate" fields - see tavor_hw.h for more). Note:
1389 1389 * the "e" bit is always hardcoded to zero.
1390 1390 */
1391 1391 if (prev_desc != NULL) {
1392 1392 /*
1393 1393 * If a send queue doorbell will be rung for the next WQE on
1394 1394 * the chain, then update the current WQE's "next" field and
1395 1395 * return.
1396 1396 * Note: We don't want to modify the "ctrl" field here because
1397 1397 * that portion of the previous WQE has already been set
1398 1398 * correctly at some previous point in time.
1399 1399 */
1400 1400 if (dbinfo) {
1401 1401 TAVOR_WQE_LINKFIRST(qp, prev_desc, next);
1402 1402 return;
1403 1403 }
1404 1404
1405 1405 ctrl = 0;
1406 1406
1407 1407 /* Set the "c" (i.e. "signaled") bit appropriately */
1408 1408 if (prev_wr->wr_flags & IBT_WR_SEND_SIGNAL) {
1409 1409 ctrl = ctrl | TAVOR_WQE_SEND_SIGNALED_MASK;
1410 1410 }
1411 1411
1412 1412 /* Set the "s" (i.e. "solicited") bit appropriately */
1413 1413 if (prev_wr->wr_flags & IBT_WR_SEND_SOLICIT) {
1414 1414 ctrl = ctrl | TAVOR_WQE_SEND_SOLICIT_MASK;
1415 1415 }
1416 1416
1417 1417 /* Set the "i" bit and the immediate data appropriately */
1418 1418 if (prev_wr->wr_flags & IBT_WR_SEND_IMMED) {
1419 1419 ctrl = ctrl | TAVOR_WQE_SEND_IMMEDIATE_MASK;
1420 1420 ctrl = ctrl | tavor_wr_get_immediate(prev_wr);
1421 1421 }
1422 1422
1423 1423 TAVOR_WQE_LINKNEXT(qp, prev_desc, ctrl, next);
1424 1424 }
1425 1425 }
1426 1426
1427 1427
1428 1428 /*
1429 1429 * tavor_wqe_mlx_build()
1430 1430 * Context: Can be called from interrupt or base context.
1431 1431 */
1432 1432 static int
1433 1433 tavor_wqe_mlx_build(tavor_state_t *state, tavor_qphdl_t qp,
1434 1434 ibt_send_wr_t *wr, uint64_t *desc, uint_t *size)
1435 1435 {
1436 1436 tavor_hw_udav_t udav;
1437 1437 tavor_ahhdl_t ah;
1438 1438 ib_lrh_hdr_t *lrh;
1439 1439 ib_grh_t *grh;
1440 1440 ib_bth_hdr_t *bth;
1441 1441 ib_deth_hdr_t *deth;
1442 1442 tavor_hw_wqe_sgl_t *ds;
1443 1443 ibt_wr_ds_t *sgl;
1444 1444 uint8_t *mgmtclass, *hpoint, *hcount;
1445 1445 uint64_t data;
1446 1446 uint32_t nds, offset, pktlen;
1447 1447 uint32_t desc_sz, udav_sz;
1448 1448 int i, num_ds;
1449 1449
1450 1450 TAVOR_TNF_ENTER(tavor_wqe_mlx_build);
1451 1451
1452 1452 ASSERT(MUTEX_HELD(&qp->qp_lock));
1453 1453
1454 1454 /* Initialize the information for the Data Segments */
1455 1455 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc +
1456 1456 sizeof (tavor_hw_mlx_wqe_nextctrl_t));
1457 1457
1458 1458 /*
1459 1459 * Pull the address handle from the work request and read in
1460 1460 * the contents of the UDAV. This will be used to answer some
1461 1461 * questions about the request.
1462 1462 */
1463 1463 ah = (tavor_ahhdl_t)wr->wr.ud.udwr_dest->ud_ah;
1464 1464 if (ah == NULL) {
1465 1465 TNF_PROBE_0(tavor_wqe_mlx_build_invahhdl_fail,
1466 1466 TAVOR_TNF_ERROR, "");
1467 1467 TAVOR_TNF_EXIT(tavor_wqe_mlx_build);
1468 1468 return (IBT_AH_HDL_INVALID);
1469 1469 }
1470 1470 mutex_enter(&ah->ah_lock);
1471 1471 udav_sz = sizeof (tavor_hw_udav_t) >> 3;
1472 1472 for (i = 0; i < udav_sz; i++) {
1473 1473 data = ddi_get64(ah->ah_udavrsrcp->tr_acchdl,
1474 1474 ((uint64_t *)ah->ah_udavrsrcp->tr_addr + i));
1475 1475 ((uint64_t *)&udav)[i] = data;
1476 1476 }
1477 1477 mutex_exit(&ah->ah_lock);
1478 1478
1479 1479 /*
1480 1480 * If the request is for QP1 and the destination LID is equal to
1481 1481 * the Permissive LID, then return an error. This combination is
1482 1482 * not allowed
1483 1483 */
1484 1484 if ((udav.rlid == IB_LID_PERMISSIVE) &&
1485 1485 (qp->qp_is_special == TAVOR_QP_GSI)) {
1486 1486 TNF_PROBE_0(tavor_wqe_mlx_build_permissiveLIDonQP1_fail,
1487 1487 TAVOR_TNF_ERROR, "");
1488 1488 TAVOR_TNF_EXIT(tavor_wqe_mlx_build);
1489 1489 return (IBT_AH_HDL_INVALID);
1490 1490 }
1491 1491
1492 1492 /*
1493 1493 * Calculate the size of the packet headers, including the GRH
1494 1494 * (if necessary)
1495 1495 */
1496 1496 desc_sz = sizeof (ib_lrh_hdr_t) + sizeof (ib_bth_hdr_t) +
1497 1497 sizeof (ib_deth_hdr_t);
1498 1498 if (udav.grh) {
1499 1499 desc_sz += sizeof (ib_grh_t);
1500 1500 }
1501 1501
1502 1502 /*
1503 1503 * Begin to build the first "inline" data segment for the packet
1504 1504 * headers. Note: By specifying "inline" we can build the contents
1505 1505 * of the MAD packet headers directly into the work queue (as part
1506 1506 * of the descriptor). This has the advantage of both speeding things up
1507 1507 * and of not requiring the driver to allocate/register any additional
1508 1508 * memory for the packet headers.
1509 1509 */
1510 1510 TAVOR_WQE_BUILD_INLINE(qp, &ds[0], desc_sz);
1511 1511 desc_sz += 4;
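/*
 * A sketch of what TAVOR_WQE_BUILD_INLINE() is assumed to produce: a
 * 32-bit header word whose high bit flags the segment as inline and
 * whose low bits carry the byte count.  That header word is why
 * desc_sz grows by 4 above.  The bit positions are assumptions.
 */
#include <stdint.h>

static uint32_t
build_inline_hdr(uint32_t byte_cnt)
{
	return ((1U << 31) | (byte_cnt & 0x3FFU));	/* assumed widths */
}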
1512 1512
1513 1513 /*
1514 1514 * Build Local Route Header (LRH)
1515 1515 * We start here by building the LRH into a temporary location.
1516 1516 * When we have finished we copy the LRH data into the descriptor.
1517 1517 *
1518 1518 * Notice that the VL values are hardcoded. This is not a problem
1519 1519 * because VL15 is decided later based on the value in the MLX
1520 1520 * transport "next/ctrl" header (see the "vl15" bit below), and it
1521 1521 * is otherwise (meaning for QP1) chosen from the SL-to-VL table
1522 1522 * values. This rule does not hold for loopback packets however
1523 1523 * (all of which bypass the SL-to-VL tables) and it is the reason
1524 1524 * that non-QP0 MADs are set up with VL hardcoded to zero below.
1525 1525 *
1526 1526 * Notice also that Source LID is hardcoded to the Permissive LID
1527 1527 * (0xFFFF). This is also not a problem because if the Destination
1528 1528 * LID is not the Permissive LID, then the "slr" value in the MLX
1529 1529 * transport "next/ctrl" header will be set to zero and the hardware
1530 1530 * will pull the LID from the value in the port.
1531 1531 */
1532 1532 lrh = (ib_lrh_hdr_t *)((uintptr_t)&ds[0] + 4);
1533 1533 pktlen = (desc_sz + 0x100) >> 2;
1534 1534 TAVOR_WQE_BUILD_MLX_LRH(lrh, qp, udav, pktlen);
1535 1535
1536 1536 /*
1537 1537 * Build Global Route Header (GRH)
1538 1538 * This is only built if necessary as defined by the "grh" bit in
1539 1539 * the address vector. Note: We also calculate the offset to the
1540 1540 * next header (BTH) based on whether or not the "grh" bit is set.
1541 1541 */
1542 1542 if (udav.grh) {
1543 1543 /*
1544 1544 * If the request is for QP0, then return an error. The
1545 1545	 * combination of global routing (GRH) and QP0 is not allowed.
1546 1546 */
1547 1547 if (qp->qp_is_special == TAVOR_QP_SMI) {
1548 1548 TNF_PROBE_0(tavor_wqe_mlx_build_GRHonQP0_fail,
1549 1549 TAVOR_TNF_ERROR, "");
1550 1550 TAVOR_TNF_EXIT(tavor_wqe_mlx_build);
1551 1551 return (IBT_AH_HDL_INVALID);
1552 1552 }
1553 1553 grh = (ib_grh_t *)((uintptr_t)lrh + sizeof (ib_lrh_hdr_t));
1554 1554 TAVOR_WQE_BUILD_MLX_GRH(state, grh, qp, udav, pktlen);
1555 1555
1556 1556 bth = (ib_bth_hdr_t *)((uintptr_t)grh + sizeof (ib_grh_t));
1557 1557 } else {
1558 1558 bth = (ib_bth_hdr_t *)((uintptr_t)lrh + sizeof (ib_lrh_hdr_t));
1559 1559 }
1560 1560
1561 1561
1562 1562 /*
1563 1563 * Build Base Transport Header (BTH)
1564 1564 * Notice that the M, PadCnt, and TVer fields are all set
1565 1565	 * to zero implicitly. This is true for all Management Datagrams
1566 1566	 * (MADs), whether GSI or SMI.
1567 1567 */
1568 1568 TAVOR_WQE_BUILD_MLX_BTH(state, bth, qp, wr);
1569 1569
1570 1570 /*
1571 1571 * Build Datagram Extended Transport Header (DETH)
1572 1572 */
1573 1573 deth = (ib_deth_hdr_t *)((uintptr_t)bth + sizeof (ib_bth_hdr_t));
1574 1574 TAVOR_WQE_BUILD_MLX_DETH(deth, qp);
1575 1575
1576 1576 /* Ensure that the Data Segment is aligned on a 16-byte boundary */
1577 1577 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)deth + sizeof (ib_deth_hdr_t));
1578 1578 ds = (tavor_hw_wqe_sgl_t *)(((uintptr_t)ds + 0xF) & ~0xF);
1579 1579 nds = wr->wr_nds;
1580 1580 sgl = wr->wr_sgl;
1581 1581 num_ds = 0;
1582 1582
1583 1583 /*
1584 1584 * Now fill in the Data Segments (SGL) for the MLX WQE based on the
1585 1585	 * values set up above (i.e. "sgl", "nds", and the "ds" pointer).
1586 1586	 * Start by checking for a valid number of SGL entries.
1587 1587 */
1588 1588 if (nds > qp->qp_sq_sgl) {
1589 1589 TNF_PROBE_0(tavor_wqe_mlx_build_toomanysgl_fail,
1590 1590 TAVOR_TNF_ERROR, "");
1591 1591 TAVOR_TNF_EXIT(tavor_wqe_mlx_build);
1592 1592 return (IBT_QP_SGL_LEN_INVALID);
1593 1593 }
1594 1594
1595 1595 /*
1596 1596 * For each SGL in the Send Work Request, fill in the MLX WQE's data
1597 1597 * segments. Note: We skip any SGL with zero size because Tavor
1598 1598 * hardware cannot handle a zero for "byte_cnt" in the WQE. Actually
1599 1599 * the encoding for zero means a 2GB transfer. Because of this special
1600 1600 * encoding in the hardware, we mask the requested length with
1601 1601 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1602 1602 * zero.)
1603 1603 */
1604 1604 mgmtclass = hpoint = hcount = NULL;
1605 1605 offset = 0;
1606 1606 for (i = 0; i < nds; i++) {
1607 1607 if (sgl[i].ds_len == 0) {
1608 1608 continue;
1609 1609 }
1610 1610
1611 1611 /*
1612 1612 * Fill in the Data Segment(s) for the MLX send WQE, using
1613 1613 * the information contained in the scatter-gather list of
1614 1614 * the work request.
1615 1615 */
1616 1616 TAVOR_WQE_BUILD_DATA_SEG(qp, &ds[num_ds], &sgl[i]);
1617 1617
1618 1618 /*
1619 1619 * Search through the contents of all MADs posted to QP0 to
1620 1620 * initialize pointers to the places where Directed Route "hop
1621 1621 * pointer", "hop count", and "mgmtclass" would be. Tavor
1622 1622 * needs these updated (i.e. incremented or decremented, as
1623 1623 * necessary) by software.
1624 1624 */
1625 1625 if (qp->qp_is_special == TAVOR_QP_SMI) {
1626 1626
1627 1627 TAVOR_SPECIAL_QP_DRMAD_GET_MGMTCLASS(mgmtclass,
1628 1628 offset, sgl[i].ds_va, sgl[i].ds_len);
1629 1629
1630 1630 TAVOR_SPECIAL_QP_DRMAD_GET_HOPPOINTER(hpoint,
1631 1631 offset, sgl[i].ds_va, sgl[i].ds_len);
1632 1632
1633 1633 TAVOR_SPECIAL_QP_DRMAD_GET_HOPCOUNT(hcount,
1634 1634 offset, sgl[i].ds_va, sgl[i].ds_len);
1635 1635
1636 1636 offset += sgl[i].ds_len;
1637 1637 }
1638 1638 num_ds++;
1639 1639 }
1640 1640
1641 1641 /*
1642 1642 * Tavor's Directed Route MADs need to have the "hop pointer"
1643 1643 * incremented/decremented (as necessary) depending on whether it is
1644 1644 * currently less than or greater than the "hop count" (i.e. whether
1645 1645 * the MAD is a request or a response.)
1646 1646 */
1647 1647 if (qp->qp_is_special == TAVOR_QP_SMI) {
1648 1648 TAVOR_SPECIAL_QP_DRMAD_DO_HOPPOINTER_MODIFY(*mgmtclass,
1649 1649 *hpoint, *hcount);
1650 1650 }
1651 1651
1652 1652 /*
1653 1653 * Now fill in the ICRC Data Segment. This data segment is inlined
1654 1654 * just like the packets headers above, but it is only four bytes and
1655 1655	 * set to zero (to indicate that we wish the hardware to generate ICRC).
1656 1656 */
1657 1657 TAVOR_WQE_BUILD_INLINE_ICRC(qp, &ds[num_ds], 4, 0);
1658 1658 num_ds++;
1659 1659
1660 1660 /* Return the size of descriptor (in 16-byte chunks) */
1661 1661 *size = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc) >> 0x4;
1662 1662
1663 1663 TAVOR_TNF_EXIT(tavor_wqe_mlx_build);
1664 1664 return (DDI_SUCCESS);
1665 1665 }
1666 1666
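[Editor's sketch] The "inline" segments built above (via TAVOR_WQE_BUILD_INLINE and TAVOR_WQE_BUILD_INLINE_ICRC) are easier to see in isolation. The following is a minimal sketch, not part of tavor_wr.c: the 0x80000000 flag value and the sketch_* names are assumptions for illustration only; the driver's macros in tavor_hw.h are authoritative.

/*
 * Sketch: an "inline" data segment replaces the usual address/lkey
 * scatter-gather entry with a tagged byte count followed immediately
 * by the payload bytes, so no extra registered memory is needed.
 */
#define	SKETCH_INLINE_FLAG	0x80000000u	/* assumed inline tag bit */

static void
sketch_build_inline(uint32_t *seg, const void *data, uint32_t len)
{
	seg[0] = SKETCH_INLINE_FLAG | len;	/* tagged byte count */
	bcopy(data, &seg[1], len);		/* payload lives in the WQE */
}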
1667 1667
1668 1668 /*
1669 1669 * tavor_wqe_mlx_linknext()
1670 1670 * Context: Can be called from interrupt or base context.
1671 1671 */
1672 1672 static void
1673 1673 tavor_wqe_mlx_linknext(ibt_send_wr_t *prev_wr, uint64_t *curr_desc,
1674 1674 uint_t curr_descsz, uint64_t *prev_desc, tavor_sw_wqe_dbinfo_t *dbinfo,
1675 1675 tavor_qphdl_t qp)
1676 1676 {
1677 1677 tavor_hw_udav_t udav;
1678 1678 tavor_ahhdl_t ah;
1679 1679 uint64_t next, ctrl, data;
1680 1680 uint_t nopcode;
1681 1681 uint_t udav_sz;
1682 1682 int i;
1683 1683
1684 1684 /*
1685 1685 * Calculate the "next" field of the descriptor. This amounts to
1686 1686 * setting up the "next_wqe_addr", "nopcode", and "nds" fields (see
1687 1687 * tavor_hw.h for more). Note: If there is no next descriptor (i.e.
1688 1688 * if the current descriptor is the last WQE on the chain), then set
1689 1689 * "next" to zero.
1690 1690 */
1691 1691 if (curr_desc != NULL) {
1692 1692 /*
1693 1693 * The only valid Tavor WQE "nopcode" for MLX transport
1694 1694 * requests is the "Send" code.
1695 1695 */
1696 1696 nopcode = TAVOR_WQE_SEND_NOPCODE_SEND;
1697 1697 curr_desc = (uint64_t *)(uintptr_t)((uint64_t)
1698 1698 (uintptr_t)curr_desc - qp->qp_desc_off);
1699 1699 next = (uint64_t)((uintptr_t)curr_desc &
1700 1700 TAVOR_WQE_NDA_MASK) << 32;
1701 1701 next = next | ((uint64_t)nopcode << 32);
1702 1702 next = next | (curr_descsz & TAVOR_WQE_NDS_MASK);
1703 1703
1704 1704 /*
1705 1705 * If a send queue doorbell will be rung for the next
1706 1706 * WQE on the chain, then set the current WQE's "dbd" bit.
1707 1707 * Note: We also update the "dbinfo" structure here to pass
1708 1708 * back information about what should (later) be included
1709 1709 * in the send queue doorbell.
1710 1710 */
1711 1711 if (dbinfo) {
1712 1712 next = next | TAVOR_WQE_DBD_MASK;
1713 1713 dbinfo->db_nopcode = nopcode;
1714 1714 dbinfo->db_fence = 0;
1715 1715 }
1716 1716 } else {
1717 1717 next = 0;
1718 1718 }
1719 1719
1720 1720 /*
1721 1721 * If this WQE is supposed to be linked to the previous descriptor,
1722 1722 * then we need to update not only the previous WQE's "next" fields
1723 1723 * but we must also update this WQE's "ctrl" fields (i.e. the "vl15",
1724 1724 * "slr", "max_srate", "sl", "c", "e", "rlid", and "vcrc" fields -
1725 1725 * see tavor_hw.h for more) Note: the "e" bit and "vcrc" fields are
1726 1726 * always hardcoded to zero.
1727 1727 */
1728 1728 if (prev_desc != NULL) {
1729 1729 /*
1730 1730 * If a send queue doorbell will be rung for the next WQE on
1731 1731 * the chain, then update the current WQE's "next" field and
1732 1732 * return.
1733 1733 * Note: We don't want to modify the "ctrl" field here because
1734 1734 * that portion of the previous WQE has already been set
1735 1735 * correctly at some previous point in time.
1736 1736 */
1737 1737 if (dbinfo) {
1738 1738 TAVOR_WQE_LINKFIRST(qp, prev_desc, next);
1739 1739 return;
1740 1740 }
1741 1741
1742 1742 /*
1743 1743 * Pull the address handle from the work request and read in
1744 1744 * the contents of the UDAV. This will be used to answer some
1745 1745 * questions about the request.
1746 1746 */
1747 1747 ah = (tavor_ahhdl_t)prev_wr->wr.ud.udwr_dest->ud_ah;
1748 1748 mutex_enter(&ah->ah_lock);
1749 1749 udav_sz = sizeof (tavor_hw_udav_t) >> 3;
1750 1750 for (i = 0; i < udav_sz; i++) {
1751 1751 data = ddi_get64(ah->ah_udavrsrcp->tr_acchdl,
1752 1752 ((uint64_t *)ah->ah_udavrsrcp->tr_addr + i));
1753 1753 ((uint64_t *)&udav)[i] = data;
1754 1754 }
1755 1755 mutex_exit(&ah->ah_lock);
1756 1756
1757 1757 ctrl = 0;
1758 1758
1759 1759 /* Only QP0 uses VL15, otherwise use VL in the packet */
1760 1760 if (qp->qp_is_special == TAVOR_QP_SMI) {
1761 1761 ctrl = ctrl | TAVOR_WQE_MLXHDR_VL15_MASK;
1762 1762 }
1763 1763
1764 1764 /*
1765 1765 * The SLR (Source LID Replace) bit determines whether the
1766 1766 * source LID for an outgoing MLX packet should come from the
1767 1767 * PortInfo (SLR = 0) or should be left as it is in the
1768 1768 * descriptor (SLR = 1). The latter is necessary for packets
1769 1769 * to be sent with the Permissive LID.
1770 1770 */
1771 1771 if (udav.rlid == IB_LID_PERMISSIVE) {
1772 1772 ctrl = ctrl | TAVOR_WQE_MLXHDR_SLR_MASK;
1773 1773 }
1774 1774
1775 1775 /* Fill in the max static rate from the address handle */
1776 1776 ctrl = ctrl | ((uint64_t)udav.max_stat_rate <<
1777 1777 TAVOR_WQE_MLXHDR_SRATE_SHIFT);
1778 1778
1779 1779 /* All VL15 (i.e. SMI) traffic is required to use SL 0 */
1780 1780 if (qp->qp_is_special != TAVOR_QP_SMI) {
1781 1781 ctrl = ctrl | ((uint64_t)udav.sl <<
1782 1782 TAVOR_WQE_MLXHDR_SL_SHIFT);
1783 1783 }
1784 1784
1785 1785 /* Set the "c" (i.e. "signaled") bit appropriately */
1786 1786 if (prev_wr->wr_flags & IBT_WR_SEND_SIGNAL) {
1787 1787 ctrl = ctrl | TAVOR_WQE_MLXHDR_SIGNALED_MASK;
1788 1788 }
1789 1789
1790 1790 /* Fill in the destination LID from the address handle */
1791 1791 ctrl = ctrl | ((uint64_t)udav.rlid <<
1792 1792 TAVOR_WQE_MLXHDR_RLID_SHIFT);
1793 1793
1794 1794 TAVOR_WQE_LINKNEXT(qp, prev_desc, ctrl, next);
1795 1795 }
1796 1796 }
1797 1797
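[Editor's sketch] The 64-bit "next" word assembled in tavor_wqe_mlx_linknext() above packs the next-WQE address, the opcode, the descriptor size, and the doorbell bit into a single store. A standalone sketch of that packing follows; the mask and shift values are placeholders (only the 0xFFFFFFC0 WQE-address alignment is visible in this file), and TAVOR_WQE_NDA_MASK, TAVOR_WQE_NDS_MASK, and TAVOR_WQE_DBD_MASK in tavor_hw.h are authoritative.

/*
 * Sketch: packing of the WQE "next" doubleword.
 */
static uint64_t
sketch_pack_next(uint32_t nda, uint32_t nopcode, uint32_t nds, int dbd)
{
	uint64_t next;

	next  = ((uint64_t)(nda & 0xFFFFFFC0) << 32);	/* 64B-aligned addr */
	next |= ((uint64_t)nopcode << 32);		/* shares the high word */
	next |= (nds & 0x3F);				/* 16B chunks (assumed mask) */
	if (dbd)
		next |= 0x80000000ull;			/* dbd bit (assumed position) */
	return (next);
}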
1798 1798
1799 1799 /*
1800 1800 * tavor_wqe_recv_build()
1801 1801 * Context: Can be called from interrupt or base context.
1802 1802 */
1803 1803 /* ARGSUSED */
1804 1804 static int
1805 1805 tavor_wqe_recv_build(tavor_state_t *state, tavor_qphdl_t qp,
1806 1806 ibt_recv_wr_t *wr, uint64_t *desc, uint_t *size)
1807 1807 {
1808 1808 tavor_hw_wqe_sgl_t *ds;
1809 1809 int i, num_ds;
1810 1810
1811 1811 TAVOR_TNF_ENTER(tavor_wqe_recv_build);
1812 1812
1813 1813 ASSERT(MUTEX_HELD(&qp->qp_lock));
1814 1814
1815 1815 /* Check that work request transport type is valid */
1816 1816 if ((qp->qp_serv_type != TAVOR_QP_UD) &&
1817 1817 (qp->qp_serv_type != TAVOR_QP_RC) &&
1818 1818 (qp->qp_serv_type != TAVOR_QP_UC)) {
1819 1819 TNF_PROBE_0(tavor_build_recv_wqe_inv_servtype_fail,
1820 1820 TAVOR_TNF_ERROR, "");
1821 1821 TAVOR_TNF_EXIT(tavor_build_recv_wqe);
1822 1822 return (IBT_QP_SRV_TYPE_INVALID);
1823 1823 }
1824 1824
1825 1825 /* Fill in the Data Segments (SGL) for the Recv WQE */
1826 1826 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc +
1827 1827 sizeof (tavor_hw_rcv_wqe_nextctrl_t));
1828 1828 num_ds = 0;
1829 1829
1830 1830 /* Check for valid number of SGL entries */
1831 1831 if (wr->wr_nds > qp->qp_rq_sgl) {
1832 1832 TNF_PROBE_0(tavor_wqe_recv_build_toomanysgl_fail,
1833 1833 TAVOR_TNF_ERROR, "");
1834 1834 TAVOR_TNF_EXIT(tavor_wqe_recv_build);
1835 1835 return (IBT_QP_SGL_LEN_INVALID);
1836 1836 }
1837 1837
1838 1838 /*
1839 1839 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
1840 1840 * segments. Note: We skip any SGL with zero size because Tavor
1841 1841 * hardware cannot handle a zero for "byte_cnt" in the WQE. Actually
1842 1842 * the encoding for zero means a 2GB transfer. Because of this special
1843 1843 * encoding in the hardware, we mask the requested length with
1844 1844 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1845 1845 * zero.)
1846 1846 */
1847 1847 for (i = 0; i < wr->wr_nds; i++) {
1848 1848 if (wr->wr_sgl[i].ds_len == 0) {
1849 1849 continue;
1850 1850 }
1851 1851
1852 1852 /*
1853 1853 * Fill in the Data Segment(s) for the receive WQE, using the
1854 1854 * information contained in the scatter-gather list of the
1855 1855 * work request.
1856 1856 */
1857 1857 TAVOR_WQE_BUILD_DATA_SEG(qp, &ds[num_ds], &wr->wr_sgl[i]);
1858 1858 num_ds++;
1859 1859 }
1860 1860
1861 1861 /* Return the size of descriptor (in 16-byte chunks) */
1862 1862 *size = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc) >> 0x4;
1863 1863
1864 1864 TAVOR_TNF_EXIT(tavor_wqe_recv_build);
1865 1865 return (DDI_SUCCESS);
1866 1866 }
1867 1867
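[Editor's sketch] The zero-length/2GB rule described in the comment above reduces to one invariant: a "byte_cnt" of zero means a 2GB transfer to the hardware, so zero-length SGEs must never reach the hardware and a length of exactly 2GB must wrap to zero. A sketch follows, assuming TAVOR_WQE_SGL_BYTE_CNT_MASK is the low 31 bits (the actual definition is in tavor_hw.h).

/*
 * Sketch: encoding an SGE byte count for the WQE.
 */
static uint32_t
sketch_encode_byte_cnt(uint64_t ds_len)
{
	ASSERT(ds_len != 0);		/* callers skip zero-length SGEs */
	return ((uint32_t)(ds_len & 0x7FFFFFFF));	/* 2GB encodes as zero */
}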
1868 1868
1869 1869 /*
1870 1870 * tavor_wqe_recv_linknext()
1871 1871 * Context: Can be called from interrupt or base context.
1872 1872 */
1873 1873 static void
1874 1874 tavor_wqe_recv_linknext(uint64_t *curr_desc, uint_t curr_descsz,
1875 1875 uint64_t *prev_desc, tavor_qphdl_t qp)
1876 1876 {
1877 1877 uint64_t next;
1878 1878
1879 1879 /*
1880 1880 * Calculate the "next" field of the descriptor. This amounts to
1881 1881 * setting up the "next_wqe_addr", "dbd", and "nds" fields (see
1882 1882 * tavor_hw.h for more). Note: If there is no next descriptor (i.e.
1883 1883 * if the current descriptor is the last WQE on the chain), then set
1884 1884 * "next" field to TAVOR_WQE_DBD_MASK. This is because the Tavor
1885 1885 * hardware requires the "dbd" bit to be set to one for all Recv WQEs.
1886 1886 * In either case, we must add a single bit in the "reserved" field
1887 1887 * (TAVOR_RCV_WQE_NDA0_WA_MASK) following the NDA. This is the
1888 1888 * workaround for a known Tavor errata that can cause Recv WQEs with
1889 1889 * zero in the NDA field to behave improperly.
1890 1890 */
1891 1891 if (curr_desc != NULL) {
1892 1892 curr_desc = (uint64_t *)(uintptr_t)((uintptr_t)curr_desc -
1893 1893 qp->qp_desc_off);
1894 1894 next = (uint64_t)((uintptr_t)curr_desc &
1895 1895 TAVOR_WQE_NDA_MASK) << 32;
1896 1896 next = next | (curr_descsz & TAVOR_WQE_NDS_MASK) |
1897 1897 TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;
1898 1898 } else {
1899 1899 next = TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;
1900 1900 }
1901 1901
1902 1902 /*
1903 1903 * If this WQE is supposed to be linked to the previous descriptor,
1904 1904 * then we need to update not only the previous WQE's "next" fields
1905 1905 * but we must also update this WQE's "ctrl" fields (i.e. the "c" and
1906 1906 * "e" bits - see tavor_hw.h for more). Note: both the "c" and "e"
1907 1907 * bits are always hardcoded to zero.
1908 1908 */
1909 1909 if (prev_desc != NULL) {
1910 1910 TAVOR_WQE_LINKNEXT(qp, prev_desc, 0, next);
1911 1911 }
1912 1912 }
1913 1913
1914 1914
1915 1915 /*
1916 1916 * tavor_wqe_srq_build()
1917 1917 * Context: Can be called from interrupt or base context.
1918 1918 */
1919 1919 /* ARGSUSED */
1920 1920 static int
1921 1921 tavor_wqe_srq_build(tavor_state_t *state, tavor_srqhdl_t srq,
1922 1922 ibt_recv_wr_t *wr, uint64_t *desc)
1923 1923 {
1924 1924 tavor_hw_wqe_sgl_t *ds;
1925 1925 ibt_wr_ds_t end_sgl;
1926 1926 int i, num_ds;
1927 1927
1928 1928	TAVOR_TNF_ENTER(tavor_wqe_srq_build);
1929 1929
1930 1930 ASSERT(MUTEX_HELD(&srq->srq_lock));
1931 1931
1932 1932 /* Fill in the Data Segments (SGL) for the Recv WQE */
1933 1933 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc +
1934 1934 sizeof (tavor_hw_rcv_wqe_nextctrl_t));
1935 1935 num_ds = 0;
1936 1936
1937 1937 /* Check for valid number of SGL entries */
1938 1938 if (wr->wr_nds > srq->srq_wq_sgl) {
1939 1939 TNF_PROBE_0(tavor_wqe_srq_build_toomanysgl_fail,
1940 1940 TAVOR_TNF_ERROR, "");
1941 1941 TAVOR_TNF_EXIT(tavor_wqe_srq_build);
1942 1942 return (IBT_QP_SGL_LEN_INVALID);
1943 1943 }
1944 1944
1945 1945 /*
1946 1946 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
1947 1947 * segments. Note: We skip any SGL with zero size because Tavor
1948 1948 * hardware cannot handle a zero for "byte_cnt" in the WQE. Actually
1949 1949 * the encoding for zero means a 2GB transfer. Because of this special
1950 1950 * encoding in the hardware, we mask the requested length with
1951 1951 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1952 1952 * zero.)
1953 1953 */
1954 1954 for (i = 0; i < wr->wr_nds; i++) {
1955 1955 if (wr->wr_sgl[i].ds_len == 0) {
1956 1956 continue;
1957 1957 }
1958 1958
1959 1959 /*
1960 1960 * Fill in the Data Segment(s) for the receive WQE, using the
1961 1961 * information contained in the scatter-gather list of the
1962 1962 * work request.
1963 1963 */
1964 1964 TAVOR_WQE_BUILD_DATA_SEG_SRQ(srq, &ds[num_ds], &wr->wr_sgl[i]);
1965 1965 num_ds++;
1966 1966 }
1967 1967
1968 1968 /*
1969 1969 * For SRQ, if the number of data segments is less than the maximum
1970 1970 * specified at alloc, then we have to fill in a special "key" entry in
1971 1971 * the sgl entry after the last valid one in this post request. We do
1972 1972 * that here.
1973 1973 */
1974 1974 if (num_ds < srq->srq_wq_sgl) {
1975 1975 end_sgl.ds_va = 0;
1976 1976 end_sgl.ds_len = 0;
1977 1977 end_sgl.ds_key = 0x1;
1978 1978 TAVOR_WQE_BUILD_DATA_SEG_SRQ(srq, &ds[num_ds], &end_sgl);
1979 1979 }
1980 1980
1981 1981 TAVOR_TNF_EXIT(tavor_wqe_srq_build);
1982 1982 return (DDI_SUCCESS);
1983 1983 }
1984 1984
1985 1985
1986 1986 /*
1987 1987 * tavor_wqe_srq_linknext()
1988 1988 * Context: Can be called from interrupt or base context.
1989 1989 */
1990 1990 static void
1991 1991 tavor_wqe_srq_linknext(uint64_t *curr_desc, uint64_t *prev_desc,
1992 1992 tavor_srqhdl_t srq)
1993 1993 {
1994 1994 uint64_t next;
1995 1995
1996 1996 /*
1997 1997 * Calculate the "next" field of the descriptor. This amounts to
1998 1998 * setting up the "next_wqe_addr", "dbd", and "nds" fields (see
1999 1999 * tavor_hw.h for more). Note: If there is no next descriptor (i.e.
2000 2000 * if the current descriptor is the last WQE on the chain), then set
2001 2001 * "next" field to TAVOR_WQE_DBD_MASK. This is because the Tavor
2002 2002 * hardware requires the "dbd" bit to be set to one for all Recv WQEs.
2003 2003 * In either case, we must add a single bit in the "reserved" field
2004 2004 * (TAVOR_RCV_WQE_NDA0_WA_MASK) following the NDA. This is the
2005 2005 * workaround for a known Tavor errata that can cause Recv WQEs with
2006 2006 * zero in the NDA field to behave improperly.
2007 2007 */
2008 2008 if (curr_desc != NULL) {
2009 2009 curr_desc = (uint64_t *)(uintptr_t)((uintptr_t)curr_desc -
2010 2010 srq->srq_desc_off);
2011 2011 next = (uint64_t)((uintptr_t)curr_desc &
2012 2012 TAVOR_WQE_NDA_MASK) << 32;
2013 2013 next = next | TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;
2014 2014 } else {
2015 2015 next = TAVOR_RCV_WQE_NDA0_WA_MASK;
2016 2016 }
2017 2017
2018 2018 /*
2019 2019 * If this WQE is supposed to be linked to the previous descriptor,
2020 2020 * then we need to update not only the previous WQE's "next" fields
2021 2021 * but we must also update this WQE's "ctrl" fields (i.e. the "c" and
2022 2022 * "e" bits - see tavor_hw.h for more). Note: both the "c" and "e"
2023 2023 * bits are always hardcoded to zero.
2024 2024 */
2025 2025 if (prev_desc != NULL) {
2026 2026 TAVOR_WQE_LINKNEXT_SRQ(srq, prev_desc, 0, next);
2027 2027 }
2028 2028 }
2029 2029
2030 2030
2031 2031 /*
2032 2032 * tavor_wr_get_immediate()
2033 2033 * Context: Can be called from interrupt or base context.
2034 2034 */
2035 2035 static uint32_t
2036 2036 tavor_wr_get_immediate(ibt_send_wr_t *wr)
2037 2037 {
2038 2038 /*
2039 2039 * This routine extracts the "immediate data" from the appropriate
2040 2040 * location in the IBTF work request. Because of the way the
2041 2041 * work request structure is defined, the location for this data
2042 2042 * depends on the actual work request operation type.
2043 2043 */
2044 2044
2045 2045 /* For RDMA Write, test if RC or UC */
2046 2046 if (wr->wr_opcode == IBT_WRC_RDMAW) {
2047 2047 if (wr->wr_trans == IBT_RC_SRV) {
2048 2048 return (wr->wr.rc.rcwr.rdma.rdma_immed);
2049 2049 } else { /* IBT_UC_SRV */
2050 2050 return (wr->wr.uc.ucwr.rdma.rdma_immed);
2051 2051 }
2052 2052 }
2053 2053
2054 2054 /* For Send, test if RC, UD, or UC */
2055 2055 if (wr->wr_opcode == IBT_WRC_SEND) {
2056 2056 if (wr->wr_trans == IBT_RC_SRV) {
2057 2057 return (wr->wr.rc.rcwr.send_immed);
2058 2058 } else if (wr->wr_trans == IBT_UD_SRV) {
2059 2059 return (wr->wr.ud.udwr_immed);
2060 2060 } else { /* IBT_UC_SRV */
2061 2061 return (wr->wr.uc.ucwr.send_immed);
2062 2062 }
2063 2063 }
2064 2064
2065 2065 /*
2066 2066 * If any other type of request, then immediate is undefined
2067 2067 */
2068 2068 return (0);
2069 2069 }
2070 2070
2071 2071
2072 2072 /*
2073 2073 * tavor_wqe_sync()
2074 2074 * Context: Can be called from interrupt or base context.
2075 2075 */
2076 2076 static void
2077 2077 tavor_wqe_sync(void *hdl, uint_t sync_from, uint_t sync_to,
2078 2078 uint_t sync_type, uint_t flag)
2079 2079 {
2080 2080 tavor_qphdl_t qp;
2081 2081 tavor_srqhdl_t srq;
2082 2082 uint_t is_sync_req;
2083 2083 uint64_t *wqe_from, *wqe_to, *wqe_base, *wqe_top;
2084 2084 ddi_dma_handle_t dmahdl;
2085 2085 off_t offset;
2086 2086 size_t length;
2087 2087 uint32_t qsize;
2088 2088 int status;
2089 2089
2090 2090 TAVOR_TNF_ENTER(tavor_wqe_sync);
2091 2091
2092 2092 if (sync_type == TAVOR_WR_SRQ) {
2093 2093 srq = (tavor_srqhdl_t)hdl;
2094 2094 is_sync_req = srq->srq_sync;
2095 2095 /* Get the DMA handle from SRQ context */
2096 2096 dmahdl = srq->srq_mrhdl->mr_bindinfo.bi_dmahdl;
2097 2097 } else {
2098 2098 qp = (tavor_qphdl_t)hdl;
2099 2099 is_sync_req = qp->qp_sync;
2100 2100 /* Get the DMA handle from QP context */
2101 2101 dmahdl = qp->qp_mrhdl->mr_bindinfo.bi_dmahdl;
2102 2102 }
2103 2103
2104 2104 /* Determine if the work queues need to be synced or not */
2105 2105 if (is_sync_req == 0) {
2106 2106 TAVOR_TNF_EXIT(tavor_wqe_sync);
2107 2107 return;
2108 2108 }
2109 2109
2110 2110 /*
2111 2111 * Depending on the type of the work queue, we grab information
2112 2112 * about the address ranges we need to DMA sync.
2113 2113 */
2114 2114 if (sync_type == TAVOR_WR_SEND) {
2115 2115 wqe_from = TAVOR_QP_SQ_ENTRY(qp, sync_from);
2116 2116 wqe_to = TAVOR_QP_SQ_ENTRY(qp, sync_to);
2117 2117 qsize = qp->qp_sq_bufsz;
2118 2118
2119 2119 wqe_base = TAVOR_QP_SQ_ENTRY(qp, 0);
2120 2120 wqe_top = TAVOR_QP_SQ_ENTRY(qp, qsize);
2121 2121 } else if (sync_type == TAVOR_WR_RECV) {
2122 2122 wqe_from = TAVOR_QP_RQ_ENTRY(qp, sync_from);
2123 2123 wqe_to = TAVOR_QP_RQ_ENTRY(qp, sync_to);
2124 2124 qsize = qp->qp_rq_bufsz;
2125 2125
2126 2126 wqe_base = TAVOR_QP_RQ_ENTRY(qp, 0);
2127 2127 wqe_top = TAVOR_QP_RQ_ENTRY(qp, qsize);
2128 2128 } else {
2129 2129 wqe_from = TAVOR_SRQ_WQ_ENTRY(srq, sync_from);
2130 2130 wqe_to = TAVOR_SRQ_WQ_ENTRY(srq, sync_to);
2131 2131 qsize = srq->srq_wq_bufsz;
2132 2132
2133 2133 wqe_base = TAVOR_SRQ_WQ_ENTRY(srq, 0);
2134 2134 wqe_top = TAVOR_SRQ_WQ_ENTRY(srq, qsize);
2135 2135 }
2136 2136
2137 2137 /*
2138 2138 * There are two possible cases for the beginning and end of the WQE
2139 2139 * chain we are trying to sync. Either this is the simple case, where
2140 2140 * the end of the chain is below the beginning of the chain, or it is
2141 2141 * the "wrap-around" case, where the end of the chain has wrapped over
2142 2142 * the end of the queue. In the former case, we simply need to
2143 2143 * calculate the span from beginning to end and sync it. In the latter
2144 2144 * case, however, we need to calculate the span from the top of the
2145 2145 * work queue to the end of the chain and sync that, and then we need
2146 2146 * to find the other portion (from beginning of chain to end of queue)
2147 2147 * and sync that as well. Note: if the "top to end" span is actually
2148 2148 * zero length, then we don't do a DMA sync because a zero length DMA
2149 2149 * sync unnecessarily syncs the entire work queue.
2150 2150 */
2151 2151 if (wqe_to > wqe_from) {
2152 2152 /* "From Beginning to End" */
2153 2153 offset = (off_t)((uintptr_t)wqe_from - (uintptr_t)wqe_base);
2154 2154 length = (size_t)((uintptr_t)wqe_to - (uintptr_t)wqe_from);
2155 2155
2156 2156 status = ddi_dma_sync(dmahdl, offset, length, flag);
2157 2157 if (status != DDI_SUCCESS) {
2158 2158 TNF_PROBE_0(tavor_wqe_sync_fail, TAVOR_TNF_ERROR, "");
2159 2159 TAVOR_TNF_EXIT(tavor_wqe_sync);
2160 2160 return;
2161 2161 }
2162 2162 } else {
2163 2163 /* "From Top to End" */
2164 2164 offset = (off_t)0;
2165 2165 length = (size_t)((uintptr_t)wqe_to - (uintptr_t)wqe_base);
2166 2166 if (length) {
2167 2167 status = ddi_dma_sync(dmahdl, offset, length, flag);
2168 2168 if (status != DDI_SUCCESS) {
2169 2169 TNF_PROBE_0(tavor_wqe_sync_fail,
2170 2170 TAVOR_TNF_ERROR, "");
2171 2171 TAVOR_TNF_EXIT(tavor_wqe_sync);
2172 2172 return;
2173 2173 }
2174 2174 }
2175 2175
2176 2176 /* "From Beginning to Bottom" */
2177 2177 offset = (off_t)((uintptr_t)wqe_from - (uintptr_t)wqe_base);
2178 2178 length = (size_t)((uintptr_t)wqe_top - (uintptr_t)wqe_from);
2179 2179 status = ddi_dma_sync(dmahdl, offset, length, flag);
2180 2180 if (status != DDI_SUCCESS) {
2181 2181 TNF_PROBE_0(tavor_wqe_sync_fail, TAVOR_TNF_ERROR, "");
2182 2182 TAVOR_TNF_EXIT(tavor_wqe_sync);
2183 2183 return;
2184 2184 }
2185 2185 }
2186 2186
2187 2187 TAVOR_TNF_EXIT(tavor_wqe_sync);
2188 2188 }
2189 2189
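[Editor's sketch] The wrap-around handling in tavor_wqe_sync() reduces to simple interval arithmetic over the queue buffer. A sketch follows, assuming a queue of fixed-stride entries indexed from zero (the driver computes the actual entry addresses with the TAVOR_QP_*_ENTRY and TAVOR_SRQ_WQ_ENTRY macros); the sketch_* names are illustrative.

typedef struct sketch_span {
	off_t	ss_off;
	size_t	ss_len;
} sketch_span_t;

/*
 * Sketch: compute the one or two [offset, length) spans handed to
 * ddi_dma_sync().  "esz" is the (assumed fixed) entry stride in bytes.
 * The zero-length "top to end" span is skipped, as in the code above,
 * because a zero-length DMA sync would sync the entire work queue.
 */
static int
sketch_sync_spans(uint_t from, uint_t to, uint_t qsize, size_t esz,
    sketch_span_t span[2])
{
	int n = 0;

	if (to > from) {			/* simple case: no wrap */
		span[0].ss_off = (off_t)(from * esz);
		span[0].ss_len = (to - from) * esz;
		return (1);
	}
	if (to != 0) {				/* "from top to end" */
		span[n].ss_off = 0;
		span[n].ss_len = to * esz;
		n++;
	}
	span[n].ss_off = (off_t)(from * esz);	/* "from beginning to bottom" */
	span[n].ss_len = (qsize - from) * esz;
	return (n + 1);
}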
2190 2190
2191 2191 /*
2192 2192 * tavor_wr_bind_check()
2193 2193 * Context: Can be called from interrupt or base context.
2194 2194 */
2195 2195 static int
2196 2196 tavor_wr_bind_check(tavor_state_t *state, ibt_send_wr_t *wr)
2197 2197 {
2198 2198 ibt_bind_flags_t bind_flags;
2199 2199 uint64_t vaddr, len;
2200 2200 uint64_t reg_start_addr, reg_end_addr;
2201 2201 tavor_mwhdl_t mw;
2202 2202 tavor_mrhdl_t mr;
2203 2203 tavor_rsrc_t *mpt;
2204 2204 uint32_t new_rkey;
2205 2205
2206 2206 TAVOR_TNF_ENTER(tavor_wr_bind_check);
2207 2207
2208 2208 /* Check for a valid Memory Window handle in the WR */
2209 2209 mw = (tavor_mwhdl_t)wr->wr.rc.rcwr.bind->bind_ibt_mw_hdl;
2210 2210 if (mw == NULL) {
2211 2211 TNF_PROBE_0(tavor_wr_bind_check_invmwhdl_fail,
2212 2212 TAVOR_TNF_ERROR, "");
2213 2213 TAVOR_TNF_EXIT(tavor_wr_bind_check);
2214 2214 return (IBT_MW_HDL_INVALID);
2215 2215 }
2216 2216
2217 2217 /* Check for a valid Memory Region handle in the WR */
2218 2218 mr = (tavor_mrhdl_t)wr->wr.rc.rcwr.bind->bind_ibt_mr_hdl;
2219 2219 if (mr == NULL) {
2220 2220 TNF_PROBE_0(tavor_wr_bind_check_invmrhdl_fail,
2221 2221 TAVOR_TNF_ERROR, "");
2222 2222 TAVOR_TNF_EXIT(tavor_wr_bind_check);
2223 2223 return (IBT_MR_HDL_INVALID);
2224 2224 }
2225 2225
2226 2226 mutex_enter(&mr->mr_lock);
2227 2227 mutex_enter(&mw->mr_lock);
2228 2228
2229 2229 /*
2230 2230 * Check here to see if the memory region has already been partially
2231 2231 * deregistered as a result of a tavor_umap_umemlock_cb() callback.
2232 2232 * If so, this is an error, return failure.
2233 2233 */
2234 2234 if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
2235 2235 mutex_exit(&mr->mr_lock);
2236 2236 mutex_exit(&mw->mr_lock);
2237 2237 TNF_PROBE_0(tavor_wr_bind_check_invmrhdl2_fail,
2238 2238 TAVOR_TNF_ERROR, "");
2239 2239 TAVOR_TNF_EXIT(tavor_wr_bind_check);
2240 2240 return (IBT_MR_HDL_INVALID);
2241 2241 }
2242 2242
2243 2243 /* Check for a valid Memory Window RKey (i.e. a matching RKey) */
2244 2244 if (mw->mr_rkey != wr->wr.rc.rcwr.bind->bind_rkey) {
2245 2245 mutex_exit(&mr->mr_lock);
2246 2246 mutex_exit(&mw->mr_lock);
2247 2247 TNF_PROBE_0(tavor_wr_bind_check_invrkey_fail,
2248 2248 TAVOR_TNF_ERROR, "");
2249 2249 TAVOR_TNF_EXIT(tavor_wr_bind_check);
2250 2250 return (IBT_MR_RKEY_INVALID);
2251 2251 }
2252 2252
2253 2253 /* Check for a valid Memory Region LKey (i.e. a matching LKey) */
2254 2254 if (mr->mr_lkey != wr->wr.rc.rcwr.bind->bind_lkey) {
2255 2255 mutex_exit(&mr->mr_lock);
2256 2256 mutex_exit(&mw->mr_lock);
2257 2257 TNF_PROBE_0(tavor_wr_bind_check_invlkey_fail,
2258 2258 TAVOR_TNF_ERROR, "");
2259 2259 TAVOR_TNF_EXIT(tavor_wr_bind_check);
2260 2260 return (IBT_MR_LKEY_INVALID);
2261 2261 }
2262 2262
2263 2263 /*
2264 2264 * Now check for valid "vaddr" and "len". Note: We don't check the
2265 2265 * "vaddr" range when "len == 0" (i.e. on unbind operations)
2266 2266 */
2267 2267 len = wr->wr.rc.rcwr.bind->bind_len;
2268 2268 if (len != 0) {
2269 2269 vaddr = wr->wr.rc.rcwr.bind->bind_va;
2270 2270 reg_start_addr = mr->mr_bindinfo.bi_addr;
2271 2271 reg_end_addr = mr->mr_bindinfo.bi_addr +
2272 2272 (mr->mr_bindinfo.bi_len - 1);
2273 2273 if ((vaddr < reg_start_addr) || (vaddr > reg_end_addr)) {
2274 2274 mutex_exit(&mr->mr_lock);
2275 2275 mutex_exit(&mw->mr_lock);
2276 2276 TNF_PROBE_0(tavor_wr_bind_check_inv_vaddr_fail,
2277 2277 TAVOR_TNF_ERROR, "");
2278 2278 TAVOR_TNF_EXIT(tavor_wr_bind_check);
2279 2279 return (IBT_MR_VA_INVALID);
2280 2280 }
2281 2281 vaddr = (vaddr + len) - 1;
2282 2282 if (vaddr > reg_end_addr) {
2283 2283 mutex_exit(&mr->mr_lock);
2284 2284 mutex_exit(&mw->mr_lock);
2285 2285 TNF_PROBE_0(tavor_wr_bind_check_invlen_fail,
2286 2286 TAVOR_TNF_ERROR, "");
2287 2287 TAVOR_TNF_EXIT(tavor_wr_bind_check);
2288 2288 return (IBT_MR_LEN_INVALID);
2289 2289 }
2290 2290 }
2291 2291
2292 2292 /*
2293 2293 * Validate the bind access flags. Remote Write and Atomic access for
2294 2294 * the Memory Window require that Local Write access be set in the
2295 2295 * corresponding Memory Region.
2296 2296 */
2297 2297 bind_flags = wr->wr.rc.rcwr.bind->bind_flags;
2298 2298 if (((bind_flags & IBT_WR_BIND_WRITE) ||
2299 2299 (bind_flags & IBT_WR_BIND_ATOMIC)) &&
2300 2300 !(mr->mr_accflag & IBT_MR_LOCAL_WRITE)) {
2301 2301 mutex_exit(&mr->mr_lock);
2302 2302 mutex_exit(&mw->mr_lock);
2303 2303 TNF_PROBE_0(tavor_wr_bind_check_invflags_fail,
2304 2304 TAVOR_TNF_ERROR, "");
2305 2305 TAVOR_TNF_EXIT(tavor_wr_bind_check);
2306 2306 return (IBT_MR_ACCESS_REQ_INVALID);
2307 2307 }
2308 2308
2309 2309 /* Calculate the new RKey for the Memory Window */
2310 2310 mpt = mw->mr_mptrsrcp;
2311 2311 tavor_mr_keycalc(state, mpt->tr_indx, &new_rkey);
2312 2312
2313 2313 wr->wr.rc.rcwr.bind->bind_rkey_out = new_rkey;
2314 2314 mw->mr_rkey = new_rkey;
2315 2315
2316 2316 mutex_exit(&mr->mr_lock);
2317 2317 mutex_exit(&mw->mr_lock);
2318 2318 TAVOR_TNF_EXIT(tavor_wr_bind_check);
2319 2319 return (DDI_SUCCESS);
2320 2320 }
2321 2321
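[Editor's sketch] Stripped of locking and trace probes, the address-range validation in tavor_wr_bind_check() is a simple containment test. A sketch follows (names are illustrative); note that end addresses are computed as addr + len - 1 throughout so that a region ending at the top of the address space does not overflow to zero.

/*
 * Sketch: does [bind_va, bind_va + bind_len) fall entirely within the
 * registered region?  A bind_len of zero is an unbind and is never
 * range-checked, matching the code above.
 */
static int
sketch_bind_in_range(uint64_t bind_va, uint64_t bind_len,
    uint64_t reg_va, uint64_t reg_len)
{
	uint64_t reg_end = reg_va + (reg_len - 1);

	if (bind_len == 0)
		return (1);
	if ((bind_va < reg_va) || (bind_va > reg_end))
		return (0);
	return (((bind_va + bind_len) - 1) <= reg_end);
}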
2322 2322
2323 2323 /*
2324 2324 * tavor_wrid_from_reset_handling()
2325 2325 * Context: Can be called from interrupt or base context.
2326 2326 */
2327 2327 int
2328 2328 tavor_wrid_from_reset_handling(tavor_state_t *state, tavor_qphdl_t qp)
2329 2329 {
2330 2330 tavor_workq_hdr_t *swq, *rwq;
2331 2331 tavor_wrid_list_hdr_t *s_wridlist, *r_wridlist;
2332 2332 uint_t create_new_swq = 0, create_new_rwq = 0;
2333 2333 uint_t create_wql = 0;
2334 2334 uint_t qp_srq_en;
2335 2335
2336 2336 TAVOR_TNF_ENTER(tavor_wrid_from_reset_handling);
2337 2337
2338 2338 /*
2339 2339 * For each of this QP's Work Queues, make sure we have a (properly
2340 2340 * initialized) Work Request ID list attached to the relevant
2341 2341 * completion queue. Grab the CQ lock(s) before manipulating the
2342 2342 * lists.
2343 2343 */
2344 2344 tavor_wrid_wqhdr_lock_both(qp);
2345 2345 swq = tavor_wrid_wqhdr_find(qp->qp_sq_cqhdl, qp->qp_qpnum,
2346 2346 TAVOR_WR_SEND);
2347 2347 if (swq == NULL) {
2348 2348 /* Couldn't find matching work queue header, create it */
2349 2349 create_new_swq = create_wql = 1;
2350 2350 swq = tavor_wrid_wqhdr_create(state, qp->qp_sq_cqhdl,
2351 2351 qp->qp_qpnum, TAVOR_WR_SEND, create_wql);
2352 2352 if (swq == NULL) {
2353 2353 /*
2354 2354 * If we couldn't find/allocate space for the workq
2355 2355 * header, then drop the lock(s) and return failure.
2356 2356 */
2357 2357 tavor_wrid_wqhdr_unlock_both(qp);
2358 2358 TNF_PROBE_0(tavor_wrid_from_reset_handling_wqhdr_fail,
2359 2359 TAVOR_TNF_ERROR, "");
2360 2360 TAVOR_TNF_EXIT(tavor_wrid_from_reset_handling);
2361 2361 return (ibc_get_ci_failure(0));
2362 2362 }
2363 2363 }
2364 - _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*swq))
2365 2364 qp->qp_sq_wqhdr = swq;
2366 2365 swq->wq_size = qp->qp_sq_bufsz;
2367 2366 swq->wq_head = 0;
2368 2367 swq->wq_tail = 0;
2369 2368 swq->wq_full = 0;
2370 2369
2371 2370 /*
2372 2371 * Allocate space for the tavor_wrid_entry_t container
2373 2372 */
2374 2373 s_wridlist = tavor_wrid_get_list(swq->wq_size);
2375 2374 if (s_wridlist == NULL) {
2376 2375 /*
2377 2376 * If we couldn't allocate space for tracking the WRID
2378 2377 * entries, then cleanup the workq header from above (if
2379 2378 * necessary, i.e. if we created the workq header). Then
2380 2379 * drop the lock(s) and return failure.
2381 2380 */
2382 2381 if (create_new_swq) {
2383 2382 tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl, swq);
2384 2383 }
2385 2384
2386 2385 tavor_wrid_wqhdr_unlock_both(qp);
2387 2386 TNF_PROBE_0(tavor_wrid_from_reset_handling_wridlist_fail,
2388 2387 TAVOR_TNF_ERROR, "");
2389 2388 TAVOR_TNF_EXIT(tavor_wrid_from_reset_handling);
2390 2389 return (ibc_get_ci_failure(0));
2391 2390 }
2392 - _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*s_wridlist))
2393 2391 s_wridlist->wl_wqhdr = swq;
2394 2392
2395 2393 /* Chain the new WRID list container to the workq hdr list */
2396 2394 mutex_enter(&swq->wq_wrid_wql->wql_lock);
2397 2395 tavor_wrid_wqhdr_add(swq, s_wridlist);
2398 2396 mutex_exit(&swq->wq_wrid_wql->wql_lock);
2399 2397
2400 2398 qp_srq_en = qp->qp_srq_en;
2401 2399
2402 -#ifdef __lock_lint
2403 - mutex_enter(&qp->qp_srqhdl->srq_lock);
2404 -#else
2405 2400 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2406 2401 mutex_enter(&qp->qp_srqhdl->srq_lock);
2407 2402 }
2408 -#endif
2403 +
2409 2404 /*
2410 2405 * Now we repeat all the above operations for the receive work queue,
2411 2406 * or shared receive work queue.
2412 2407 *
2413 2408 * Note: We still use the 'qp_rq_cqhdl' even in the SRQ case.
2414 2409 */
2415 2410 rwq = tavor_wrid_wqhdr_find(qp->qp_rq_cqhdl, qp->qp_qpnum,
2416 2411 TAVOR_WR_RECV);
2417 2412 if (rwq == NULL) {
2418 2413 create_new_rwq = create_wql = 1;
2419 2414
2420 2415 /*
2421 2416 * If this QP is associated with an SRQ, and this isn't the
2422 2417 * first QP on the SRQ, then the 'srq_wrid_wql' will already be
2423 2418 * created. Since the WQL is created at 'wqhdr_create' time we
2424 2419 * pass in the flag 'create_wql' here to be 0 if we have
2425 2420	 * already created it. Later on below, we then set up the WQL
2426 2421	 * and rwq information based on the existing SRQ info.
2427 2422 */
2428 2423 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED &&
2429 2424 qp->qp_srqhdl->srq_wrid_wql != NULL) {
2430 2425 create_wql = 0;
2431 2426 }
2432 2427
2433 2428 rwq = tavor_wrid_wqhdr_create(state, qp->qp_rq_cqhdl,
2434 2429 qp->qp_qpnum, TAVOR_WR_RECV, create_wql);
2435 2430 if (rwq == NULL) {
2436 2431 /*
2437 2432 * If we couldn't find/allocate space for the workq
2438 2433 * header, then free all the send queue resources we
2439 2434 * just allocated and setup (above), drop the lock(s)
2440 2435 * and return failure.
2441 2436 */
2442 2437 mutex_enter(&swq->wq_wrid_wql->wql_lock);
2443 2438 tavor_wrid_wqhdr_remove(swq, s_wridlist);
2444 2439 mutex_exit(&swq->wq_wrid_wql->wql_lock);
2445 2440 if (create_new_swq) {
2446 2441 tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl,
2447 2442 swq);
2448 2443 }
2449 2444
2450 -#ifdef __lock_lint
2451 - mutex_exit(&qp->qp_srqhdl->srq_lock);
2452 -#else
2453 2445 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2454 2446 mutex_exit(&qp->qp_srqhdl->srq_lock);
2455 2447 }
2456 -#endif
2457 2448
2458 2449 tavor_wrid_wqhdr_unlock_both(qp);
2459 2450 TNF_PROBE_0(tavor_wrid_from_reset_handling_wqhdr_fail,
2460 2451 TAVOR_TNF_ERROR, "");
2461 2452 TAVOR_TNF_EXIT(tavor_wrid_from_reset_handling);
2462 2453 return (ibc_get_ci_failure(0));
2463 2454 }
2464 2455 }
2465 - _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*rwq))
2466 2456
2467 2457 /*
2468 2458 * Setup receive workq hdr
2469 2459 *
2470 2460	 * If the QP is on an SRQ, we set up the SRQ-specific fields, keeping
2471 2461	 * a copy of the rwq pointer, setting the rwq bufsize appropriately,
2472 2462	 * and initializing our part of the WQLock.
2473 2463 *
2474 2464 * In the normal QP case, the QP recv queue bufsize is used.
2475 2465 */
2476 2466 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2477 2467 rwq->wq_size = qp->qp_srqhdl->srq_wq_bufsz;
2478 2468 if (qp->qp_srqhdl->srq_wrid_wql == NULL) {
2479 2469 qp->qp_srqhdl->srq_wrid_wql = rwq->wq_wrid_wql;
2480 2470 } else {
2481 2471 rwq->wq_wrid_wql = qp->qp_srqhdl->srq_wrid_wql;
2482 2472 }
2483 2473 tavor_wql_refcnt_inc(qp->qp_srqhdl->srq_wrid_wql);
2484 2474
2485 2475 } else {
2486 2476 rwq->wq_size = qp->qp_rq_bufsz;
2487 2477 }
2488 2478
2489 2479 qp->qp_rq_wqhdr = rwq;
2490 2480 rwq->wq_head = 0;
2491 2481 rwq->wq_tail = 0;
2492 2482 rwq->wq_full = 0;
2493 2483
2494 2484 /*
2495 2485 * Allocate space for the tavor_wrid_entry_t container.
2496 2486 *
2497 2487	 * If the QP is on an SRQ, and the srq_wridlist is NULL, then we must
2498 2488 * allocate the wridlist normally. However, if the srq_wridlist is !=
2499 2489 * NULL, then we know this SRQ has already been initialized, thus the
2500 2490 * wridlist has already been initialized. So we re-use the
2501 2491 * srq_wridlist as the r_wridlist for this QP in this case.
2502 2492 */
2503 2493 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED &&
2504 2494 qp->qp_srqhdl->srq_wridlist != NULL) {
2505 2495 /* Use existing srq_wridlist pointer */
2506 2496 r_wridlist = qp->qp_srqhdl->srq_wridlist;
2507 2497 ASSERT(r_wridlist != NULL);
2508 2498 } else {
2509 2499 /* Allocate memory for the r_wridlist */
2510 2500 r_wridlist = tavor_wrid_get_list(rwq->wq_size);
2511 2501 }
2512 2502
2513 2503 /*
2514 2504 * If the memory allocation failed for r_wridlist (or the SRQ pointer
2515 2505 * is mistakenly NULL), we cleanup our previous swq allocation from
2516 2506 * above
2517 2507 */
2518 2508 if (r_wridlist == NULL) {
2519 2509 /*
2520 2510 * If we couldn't allocate space for tracking the WRID
2521 2511 * entries, then cleanup all the stuff from above. Then
2522 2512 * drop the lock(s) and return failure.
2523 2513 */
2524 2514 mutex_enter(&swq->wq_wrid_wql->wql_lock);
2525 2515 tavor_wrid_wqhdr_remove(swq, s_wridlist);
2526 2516 mutex_exit(&swq->wq_wrid_wql->wql_lock);
2527 2517 if (create_new_swq) {
2528 2518 tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl, swq);
2529 2519 }
2530 2520 if (create_new_rwq) {
2531 2521 tavor_cq_wqhdr_remove(qp->qp_rq_cqhdl, rwq);
2532 2522 }
2533 2523
2534 -#ifdef __lock_lint
2535 - mutex_exit(&qp->qp_srqhdl->srq_lock);
2536 -#else
2537 2524 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2538 2525 mutex_exit(&qp->qp_srqhdl->srq_lock);
2539 2526 }
2540 -#endif
2541 2527
2542 2528 tavor_wrid_wqhdr_unlock_both(qp);
2543 2529 TNF_PROBE_0(tavor_wrid_from_reset_handling_wridlist_fail,
2544 2530 TAVOR_TNF_ERROR, "");
2545 2531 TAVOR_TNF_EXIT(tavor_wrid_from_reset_handling);
2546 2532 return (ibc_get_ci_failure(0));
2547 2533 }
2548 - _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*r_wridlist))
2549 2534
2550 2535 /*
2551 2536 * Initialize the wridlist
2552 2537 *
2553 2538 * In the normal QP case, there is no special initialization needed.
2554 2539 * We simply setup the wridlist backpointer to be the receive wqhdr
2555 2540 * (rwq).
2556 2541 *
2557 2542 * But in the SRQ case, there is no backpointer to the wqhdr possible.
2558 2543 * Instead we set 'wl_srq_en', specifying this wridlist is on an SRQ
2559 2544 * and thus potentially shared across multiple QPs with the SRQ. We
2560 2545 * also setup the srq_wridlist pointer to be the r_wridlist, and
2561 2546 * intialize the freelist to an invalid index. This srq_wridlist
2562 2547 * pointer is used above on future moves from_reset to let us know that
2563 2548 * the srq_wridlist has been initialized already.
2564 2549 *
2565 2550 * And finally, if we are in a non-UMAP case, we setup the srq wrid
2566 2551 * free list.
2567 2552 */
2568 2553 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED &&
2569 2554 qp->qp_srqhdl->srq_wridlist == NULL) {
2570 2555 r_wridlist->wl_srq_en = 1;
2571 2556 r_wridlist->wl_free_list_indx = -1;
2572 2557 qp->qp_srqhdl->srq_wridlist = r_wridlist;
2573 2558
2574 2559 /* Initialize srq wrid free list */
2575 2560 if (qp->qp_srqhdl->srq_is_umap == 0) {
2576 2561 mutex_enter(&rwq->wq_wrid_wql->wql_lock);
2577 2562 tavor_wrid_list_srq_init(r_wridlist, qp->qp_srqhdl, 0);
2578 2563 mutex_exit(&rwq->wq_wrid_wql->wql_lock);
2579 2564 }
2580 2565 } else {
2581 2566 r_wridlist->wl_wqhdr = rwq;
2582 2567 }
2583 2568
2584 2569 /* Chain the WRID list "container" to the workq hdr list */
2585 2570 mutex_enter(&rwq->wq_wrid_wql->wql_lock);
2586 2571 tavor_wrid_wqhdr_add(rwq, r_wridlist);
2587 2572 mutex_exit(&rwq->wq_wrid_wql->wql_lock);
2588 2573
2589 -#ifdef __lock_lint
2590 - mutex_exit(&qp->qp_srqhdl->srq_lock);
2591 -#else
2592 2574 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2593 2575 mutex_exit(&qp->qp_srqhdl->srq_lock);
2594 2576 }
2595 -#endif
2596 2577
2597 - _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*r_wridlist))
2598 - _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*rwq))
2599 - _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*s_wridlist))
2600 - _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*swq))
2601 -
2602 2578 tavor_wrid_wqhdr_unlock_both(qp);
2603 2579 TAVOR_TNF_EXIT(tavor_wrid_from_reset_handling);
2604 2580 return (DDI_SUCCESS);
2605 2581 }
2606 2582
2607 2583
2608 2584 /*
2609 2585 * tavor_wrid_to_reset_handling()
2610 2586 * Context: Can be called from interrupt or base context.
2611 2587 */
2612 2588 void
2613 2589 tavor_wrid_to_reset_handling(tavor_state_t *state, tavor_qphdl_t qp)
2614 2590 {
2615 2591 uint_t free_wqhdr = 0;
2616 2592
2617 2593 TAVOR_TNF_ENTER(tavor_wrid_to_reset_handling);
2618 2594
2619 2595 /*
2620 2596 * For each of this QP's Work Queues, move the WRID "container" to
2621 2597 * the "reapable" list. Although there may still be unpolled
2622 2598 * entries in these containers, it is not a big deal. We will not
2623 2599 * reap the list until either the Poll CQ command detects an empty
2624 2600 * condition or the CQ itself is freed. Grab the CQ lock(s) before
2625 2601 * manipulating the lists.
2626 2602 */
2627 2603 mutex_enter(&qp->qp_rq_cqhdl->cq_lock);
2628 2604 tavor_wrid_wqhdr_lock_both(qp);
2629 2605 tavor_wrid_reaplist_add(qp->qp_sq_cqhdl, qp->qp_sq_wqhdr);
2630 2606
2631 2607 /*
2632 2608 * Add the receive work queue header on to the reaplist. But if we are
2633 2609 * on SRQ, then don't add anything to the reaplist. Instead we flush
2634 2610 * the SRQ entries on the CQ, remove wridlist from WQHDR, and free the
2635 2611 * WQHDR (if needed). We must hold the WQL for these operations, yet
2636 2612 * the call to tavor_cq_wqhdr_remove grabs the WQL internally. So we
2637 2613 * drop WQL before that call. Then release the CQ WQHDR locks and the
2638 2614 * CQ lock and return.
2639 2615 */
2640 2616 if (qp->qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2641 2617
2642 2618 /*
2643 2619 * Pull off all (if any) entries for this QP from CQ. This
2644 2620 * only includes entries that have not yet been polled
2645 2621 */
2646 2622 mutex_enter(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock);
2647 2623 tavor_cq_srq_entries_flush(state, qp);
2648 2624
2649 2625 /* Remove wridlist from WQHDR */
2650 2626 tavor_wrid_wqhdr_remove(qp->qp_rq_wqhdr,
2651 2627 qp->qp_rq_wqhdr->wq_wrid_post);
2652 2628
2653 2629 /* If wridlist chain is now empty, remove the wqhdr as well */
2654 2630 if (qp->qp_rq_wqhdr->wq_wrid_post == NULL) {
2655 2631 free_wqhdr = 1;
2656 2632 } else {
2657 2633 free_wqhdr = 0;
2658 2634 }
2659 2635
2660 2636 mutex_exit(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock);
2661 2637
2662 2638 /* Free the WQHDR */
2663 2639 if (free_wqhdr) {
2664 2640 tavor_cq_wqhdr_remove(qp->qp_rq_cqhdl, qp->qp_rq_wqhdr);
2665 2641 }
2666 2642 } else {
2667 2643 tavor_wrid_reaplist_add(qp->qp_rq_cqhdl, qp->qp_rq_wqhdr);
2668 2644 }
2669 2645 tavor_wrid_wqhdr_unlock_both(qp);
2670 2646 mutex_exit(&qp->qp_rq_cqhdl->cq_lock);
2671 2647
2672 2648 TAVOR_TNF_EXIT(tavor_wrid_to_reset_handling);
2673 2649 }
2674 2650
2675 2651
2676 2652 /*
2677 2653 * tavor_wrid_add_entry()
2678 2654 * Context: Can be called from interrupt or base context.
2679 2655 */
2680 2656 void
2681 2657 tavor_wrid_add_entry(tavor_workq_hdr_t *wq, uint64_t wrid, uint32_t wqeaddrsz,
2682 2658 uint_t signaled_dbd)
2683 2659 {
2684 2660 tavor_wrid_entry_t *wre_tmp;
2685 2661 uint32_t head, tail, size;
2686 2662
2687 2663 TAVOR_TNF_ENTER(tavor_wrid_add_entry);
2688 2664
2689 2665 ASSERT(MUTEX_HELD(&wq->wq_wrid_wql->wql_lock));
2690 2666
2691 2667 /*
2692 2668 * Find the entry in the container pointed to by the "tail" index.
2693 2669 * Add all of the relevant information to that entry, including WRID,
2694 2670 * "wqeaddrsz" parameter, and whether it was signaled/unsignaled
2695 2671 * and/or doorbelled.
2696 2672 */
2697 2673 head = wq->wq_wrid_post->wl_head;
2698 2674 tail = wq->wq_wrid_post->wl_tail;
2699 2675 size = wq->wq_wrid_post->wl_size;
2700 2676 wre_tmp = &wq->wq_wrid_post->wl_wre[tail];
2701 2677 wre_tmp->wr_wrid = wrid;
2702 2678 wre_tmp->wr_wqeaddrsz = wqeaddrsz;
2703 2679 wre_tmp->wr_signaled_dbd = signaled_dbd;
2704 2680
2705 2681 /*
2706 2682 * Update the "wrid_old_tail" pointer to point to the entry we just
2707 2683 * inserted into the queue. By tracking this pointer (the pointer to
2708 2684	 * the most recently inserted entry) it will be possible later in the
2709 2685 * PostSend() and PostRecv() code paths to find the entry that needs
2710 2686 * its "doorbelled" flag set (see comment in tavor_post_recv() and/or
2711 2687 * tavor_post_send()).
2712 2688 */
2713 2689 wq->wq_wrid_post->wl_wre_old_tail = wre_tmp;
2714 2690
2715 2691 /* Update the tail index */
2716 2692 tail = ((tail + 1) & (size - 1));
2717 2693 wq->wq_wrid_post->wl_tail = tail;
2718 2694
2719 2695 /*
2720 2696 * If the "tail" index has just wrapped over into the "head" index,
2721 2697 * then we have filled the container. We use the "full" flag to
2722 2698 * indicate this condition and to distinguish it from the "empty"
2723 2699 * condition (where head and tail are also equal).
2724 2700 */
2725 2701 if (head == tail) {
2726 2702 wq->wq_wrid_post->wl_full = 1;
2727 2703 }
2728 2704 TAVOR_TNF_EXIT(tavor_wrid_add_entry);
2729 2705 }
2730 2706
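[Editor's sketch] The head/tail/full bookkeeping above is the standard power-of-two ring discipline: because "head == tail" is ambiguous between empty and full, an explicit "full" flag breaks the tie. A sketch of the producer side follows (the consumer in tavor_wrid_find_match() clears the flag as it drains entries); the sketch_* names are illustrative.

/*
 * Sketch: producer side of the WRID ring.  "sr_size" must be a power
 * of two so that wrap-around is a simple mask, as in the code above.
 */
typedef struct sketch_ring {
	uint32_t	sr_head;
	uint32_t	sr_tail;
	uint32_t	sr_size;
	uint_t		sr_full;
} sketch_ring_t;

static int
sketch_ring_push(sketch_ring_t *r)
{
	if (r->sr_full)
		return (0);			/* container is full */
	r->sr_tail = (r->sr_tail + 1) & (r->sr_size - 1);
	if (r->sr_tail == r->sr_head)
		r->sr_full = 1;			/* tail wrapped onto head */
	return (1);
}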
2731 2707 /*
2732 2708 * tavor_wrid_add_entry_srq()
2733 2709 * Context: Can be called from interrupt or base context
2734 2710 */
2735 2711 void
2736 2712 tavor_wrid_add_entry_srq(tavor_srqhdl_t srq, uint64_t wrid, uint_t signaled_dbd)
2737 2713 {
2738 2714 tavor_wrid_entry_t *wre;
2739 2715 uint64_t *wl_wqe;
2740 2716 uint32_t wqe_index;
2741 2717
2742 2718 TAVOR_TNF_ENTER(tavor_wrid_add_entry_srq);
2743 2719
2744 2720 /*
2745 2721 * Find the next available WQE from the SRQ free_list. Then update the
2746 2722 * free_list to point to the next entry
2747 2723 */
2748 2724 wl_wqe = TAVOR_SRQ_WQE_ADDR(srq, srq->srq_wridlist->wl_free_list_indx);
2749 2725
2750 2726 wqe_index = srq->srq_wridlist->wl_free_list_indx;
2751 2727
2752 2728 /* ASSERT on impossible wqe_index values */
2753 2729 ASSERT(wqe_index < srq->srq_wq_bufsz);
2754 2730
2755 2731 /*
2756 2732 * Setup the WRE.
2757 2733 *
2758 2734 * Given the 'wqe_index' value, we store the WRID at this WRE offset.
2759 2735 * And we set the WRE to be signaled_dbd so that on poll CQ we can find
2760 2736 * this information and associate the WRID to the WQE found on the CQE.
2761 2737 */
2762 2738 wre = &srq->srq_wridlist->wl_wre[wqe_index];
2763 2739 wre->wr_wrid = wrid;
2764 2740 wre->wr_signaled_dbd = signaled_dbd;
2765 2741
2766 2742 /* Update the free list index */
2767 2743 srq->srq_wridlist->wl_free_list_indx = ddi_get32(
2768 2744 srq->srq_wridlist->wl_acchdl, (uint32_t *)wl_wqe);
2769 2745
2770 2746 TAVOR_TNF_EXIT(tavor_wrid_add_entry_srq);
2771 2747 }
2772 2748
2773 2749
2774 2750 /*
2775 2751 * tavor_wrid_get_entry()
2776 2752 * Context: Can be called from interrupt or base context.
2777 2753 */
2778 2754 uint64_t
2779 2755 tavor_wrid_get_entry(tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe,
2780 2756 tavor_wrid_entry_t *wre)
2781 2757 {
2782 2758 tavor_workq_hdr_t *wq;
2783 2759 tavor_wrid_entry_t *wre_tmp;
2784 2760 uint64_t wrid;
2785 2761 uint_t send_or_recv, qpnum, error, opcode;
2786 2762
2787 2763 TAVOR_TNF_ENTER(tavor_wrid_get_entry);
2788 2764
2789 2765 /* Lock the list of work queues associated with this CQ */
2790 2766 mutex_enter(&cq->cq_wrid_wqhdr_lock);
2791 2767
2792 2768 /*
2793 2769 * Determine whether this CQE is a send or receive completion (and
2794 2770 * whether it was a "successful" completion or not)
2795 2771 */
2796 2772 opcode = TAVOR_CQE_OPCODE_GET(cq, cqe);
2797 2773 if ((opcode == TAVOR_CQE_SEND_ERR_OPCODE) ||
2798 2774 (opcode == TAVOR_CQE_RECV_ERR_OPCODE)) {
2799 2775 error = 1;
2800 2776 send_or_recv = (opcode == TAVOR_CQE_SEND_ERR_OPCODE) ?
2801 2777 TAVOR_COMPLETION_SEND : TAVOR_COMPLETION_RECV;
2802 2778 } else {
2803 2779 error = 0;
2804 2780 send_or_recv = TAVOR_CQE_SENDRECV_GET(cq, cqe);
2805 2781 }
2806 2782
2807 2783 /* Find the work queue for this QP number (send or receive side) */
2808 2784 qpnum = TAVOR_CQE_QPNUM_GET(cq, cqe);
2809 2785 wq = tavor_wrid_wqhdr_find(cq, qpnum, send_or_recv);
2810 2786 ASSERT(wq != NULL);
2811 2787
2812 2788 /*
2813 2789 * Regardless of whether the completion is the result of a "success"
2814 2790 * or a "failure", we lock the list of "containers" and attempt to
2815 2791	 * search for the first matching completion (i.e. the first WR
2816 2792 * with a matching WQE addr and size). Once we find it, we pull out
2817 2793 * the "wrid" field and return it (see below). Note: One possible
2818 2794 * future enhancement would be to enable this routine to skip over
2819 2795 * any "unsignaled" completions to go directly to the next "signaled"
2820 2796 * entry on success. XXX
2821 2797 */
2822 2798 mutex_enter(&wq->wq_wrid_wql->wql_lock);
2823 2799 wre_tmp = tavor_wrid_find_match(wq, cq, cqe);
2824 2800
2825 2801 /*
2826 2802 * If this is a "successful" completion, then we assert that this
2827 2803 * completion must be a "signaled" completion.
2828 2804 */
2829 2805 ASSERT(error || (wre_tmp->wr_signaled_dbd & TAVOR_WRID_ENTRY_SIGNALED));
2830 2806
2831 2807 /*
2832 2808 * If the completion is a "failed" completion, then we save away the
2833 2809 * contents of the entry (into the "wre" field passed in) for use
2834 2810 * in later CQE processing. Note: We use the tavor_wrid_get_wqeaddrsz()
2835 2811 * function to grab "wqeaddrsz" from the next entry in the container.
2836 2812 * This is required for error processing (where updating these fields
2837 2813 * properly is necessary to correct handling of the "error" CQE)
2838 2814 */
2839 2815 if (error && (wre != NULL)) {
2840 2816 *wre = *wre_tmp;
2841 2817 wre->wr_wqeaddrsz = tavor_wrid_get_wqeaddrsz(wq);
2842 2818 }
2843 2819
2844 2820 /* Pull out the WRID and return it */
2845 2821 wrid = wre_tmp->wr_wrid;
2846 2822
2847 2823 mutex_exit(&wq->wq_wrid_wql->wql_lock);
2848 2824 mutex_exit(&cq->cq_wrid_wqhdr_lock);
2849 2825
2850 2826 TAVOR_TNF_EXIT(tavor_wrid_get_entry);
2851 2827 return (wrid);
2852 2828 }
2853 2829
2854 2830
2855 2831 /*
2856 2832 * tavor_wrid_find_match()
2857 2833 * Context: Can be called from interrupt or base context.
2858 2834 */
2859 2835 static tavor_wrid_entry_t *
2860 2836 tavor_wrid_find_match(tavor_workq_hdr_t *wq, tavor_cqhdl_t cq,
2861 2837 tavor_hw_cqe_t *cqe)
2862 2838 {
2863 2839 tavor_wrid_entry_t *curr = NULL;
2864 2840 tavor_wrid_list_hdr_t *container;
2865 2841 uint32_t wqeaddr_size;
2866 2842 uint32_t head, tail, size;
2867 2843 int found = 0, last_container;
2868 2844
2869 2845 TAVOR_TNF_ENTER(tavor_wrid_find_match);
2870 2846
2871 2847 ASSERT(MUTEX_HELD(&wq->wq_wrid_wql->wql_lock));
2872 2848
2873 2849 /* Pull the "wqeaddrsz" information from the CQE */
2874 2850 wqeaddr_size = TAVOR_CQE_WQEADDRSZ_GET(cq, cqe);
2875 2851
2876 2852 /*
2877 2853 * Walk the "containers" list(s), find first WR with a matching WQE
2878 2854 * addr. If the current "container" is not the last one on the list,
2879 2855 * i.e. not the current one to which we are posting new WRID entries,
2880 2856 * then we do not attempt to update the "q_head", "q_tail", and
2881 2857 * "q_full" indicators on the main work queue header. We do, however,
2882 2858 * update the "head" and "full" indicators on the individual containers
2883 2859 * as we go. This is imperative because we need to be able to
2884 2860 * determine when the current container has been emptied (so that we
2885 2861 * can move on to the next container).
2886 2862 */
2887 2863 container = wq->wq_wrid_poll;
2888 2864 while (container != NULL) {
2889 2865 /* Is this the last/only "container" on the list */
2890 2866 last_container = (container != wq->wq_wrid_post) ? 0 : 1;
2891 2867
2892 2868 /*
2893 2869 * First check if we are on an SRQ. If so, we grab the entry
2894 2870 * and break out. Since SRQ wridlist's are never added to
2895 2871 * reaplist, they can only be the last container.
2896 2872 */
2897 2873 if (container->wl_srq_en) {
2898 2874 ASSERT(last_container == 1);
2899 2875 curr = tavor_wrid_find_match_srq(container, cq, cqe);
2900 2876 break;
2901 2877 }
2902 2878
2903 2879 /*
2904 2880 * Grab the current "head", "tail" and "size" fields before
2905 2881 * walking the list in the current container. Note: the "size"
2906 2882 * field here must always be a power-of-2. The "full"
2907 2883 * parameter is checked (and updated) here to distinguish the
2908 2884 * "queue full" condition from "queue empty".
2909 2885 */
2910 2886 head = container->wl_head;
2911 2887 tail = container->wl_tail;
2912 2888 size = container->wl_size;
2913 2889 while ((head != tail) || (container->wl_full)) {
2914 2890 container->wl_full = 0;
2915 2891 curr = &container->wl_wre[head];
2916 2892 head = ((head + 1) & (size - 1));
2917 2893
2918 2894 /*
2919 2895 * If the current entry's "wqeaddrsz" matches the one
2920 2896 * we're searching for, then this must correspond to
2921 2897 * the work request that caused the completion. Set
2922 2898 * the "found" flag and bail out.
2923 2899 */
2924 2900 if (curr->wr_wqeaddrsz == wqeaddr_size) {
2925 2901 found = 1;
2926 2902 break;
2927 2903 }
2928 2904 }
2929 2905
2930 2906 /*
2931 2907 * If the current container is empty (having reached here the
2932 2908 * "head == tail" condition can only mean that the container
2933 2909 * is empty), then NULL out the "wrid_old_tail" field (see
2934 2910 * tavor_post_send() and tavor_post_recv() for more details)
2935 2911 * and (potentially) remove the current container from future
2936 2912 * searches.
2937 2913 */
2938 2914 if (head == tail) {
2939 2915
2940 2916 container->wl_wre_old_tail = NULL;
2941 2917 /*
2942 2918 * If this wasn't the last "container" on the chain,
2943 2919 * i.e. the one to which new WRID entries will be
2944 2920 * added, then remove it from the list.
2945 2921 * Note: we don't "lose" the memory pointed to by this
2946 2922 * because we should have already put this container
2947 2923 * on the "reapable" list (from where it will later be
2948 2924 * pulled).
2949 2925 */
2950 2926 if (!last_container) {
2951 2927 wq->wq_wrid_poll = container->wl_next;
2952 2928 }
2953 2929 }
2954 2930
2955 2931 /* Update the head index for the container */
2956 2932 container->wl_head = head;
2957 2933
2958 2934 /*
2959 2935 * If the entry was found in this container, then continue to
2960 2936 * bail out. Else reset the "curr" pointer and move on to the
2961 2937 * next container (if there is one). Note: the only real
2962 2938 * reason for setting "curr = NULL" here is so that the ASSERT
2963 2939 * below can catch the case where no matching entry was found
2964 2940 * on any of the lists.
2965 2941 */
2966 2942 if (found) {
2967 2943 break;
2968 2944 } else {
2969 2945 curr = NULL;
2970 2946 container = container->wl_next;
2971 2947 }
2972 2948 }
2973 2949
2974 2950 /*
2975 2951 * Update work queue header's "head" and "full" conditions to match
2976 2952 * the last entry on the container list. (Note: Only if we're pulling
2977 2953 * entries from the last work queue portion of the list, i.e. not from
2978 2954 * the previous portions that may be the "reapable" list.)
2979 2955 */
2980 2956 if (last_container) {
2981 2957 wq->wq_head = wq->wq_wrid_post->wl_head;
2982 2958 wq->wq_full = wq->wq_wrid_post->wl_full;
2983 2959 }
2984 2960
2985 2961 /* Ensure that we've actually found what we were searching for */
2986 2962 ASSERT(curr != NULL);
2987 2963
2988 2964 TAVOR_TNF_EXIT(tavor_wrid_find_match);
2989 2965 return (curr);
2990 2966 }
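
/*
 * Illustrative sketch (editor's addition, not driver source): the
 * container walk above is a power-of-two ring buffer in which
 * "head == tail" alone is ambiguous, so a separate "full" flag is
 * needed to tell a full queue from an empty one.  Names here are
 * hypothetical.
 */
#include <stdint.h>

typedef struct ring {
	uint32_t	r_head;		/* next entry to consume */
	uint32_t	r_tail;		/* next entry to fill */
	uint32_t	r_size;		/* must be a power of two */
	int		r_full;		/* disambiguates head == tail */
} ring_t;

/* Consume one entry; returns the consumed index, or -1 if empty */
static int
ring_consume(ring_t *r)
{
	uint32_t curr;

	if ((r->r_head == r->r_tail) && !r->r_full)
		return (-1);			/* queue is empty */
	r->r_full = 0;
	curr = r->r_head;
	r->r_head = (r->r_head + 1) & (r->r_size - 1);	/* masked wrap */
	return ((int)curr);
}
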
2991 2967
2992 2968
2993 2969 /*
2994 2970 * tavor_wrid_find_match_srq()
2995 2971 * Context: Can be called from interrupt or base context.
2996 2972 */
2997 2973 tavor_wrid_entry_t *
2998 2974 tavor_wrid_find_match_srq(tavor_wrid_list_hdr_t *wl, tavor_cqhdl_t cq,
2999 2975 tavor_hw_cqe_t *cqe)
3000 2976 {
3001 2977 tavor_wrid_entry_t *wre;
3002 2978 uint64_t *wl_wqe;
3003 2979 uint32_t wqe_index;
3004 2980 uint64_t wqe_addr;
3005 2981 uint32_t cqe_wqe_addr;
3006 2982
3007 2983 /* Grab the WQE addr out of the CQE */
3008 2984 cqe_wqe_addr = TAVOR_CQE_WQEADDRSZ_GET(cq, cqe) & 0xFFFFFFC0;
3009 2985
3010 2986 /*
3011 2987 	 * Use the WQE addr as the lower 32 bits; we add back the
3012 2988 	 * 'wl_srq_desc_off' because we have a zero-based queue.  OR'ing
3013 2989 	 * on the upper 32 bits of 'wl_srq_wq_buf' then gives us the WQE
3014 2990 	 * addr in the SRQ Work Queue itself.  We use this address as the
3015 2991 	 * index to find out which Work Queue Entry this CQE corresponds with.
3016 2992 *
3017 2993 * We also use this address below to add the WQE back on to the free
3018 2994 * list.
3019 2995 */
3020 2996 wqe_addr = ((uintptr_t)wl->wl_srq_wq_buf & 0xFFFFFFFF00000000ull) |
3021 2997 (cqe_wqe_addr + wl->wl_srq_desc_off);
3022 2998
3023 2999 /*
3024 3000 * Given the 'wqe_addr' just calculated and the srq buf address, we
3025 3001 * find the 'wqe_index'. The 'wre' returned below contains the WRID
3026 3002 * that we are looking for. This indexes into the wre_list for this
3027 3003 * specific WQE.
3028 3004 */
3029 3005 wqe_index = TAVOR_SRQ_WQE_INDEX(wl->wl_srq_wq_buf, wqe_addr,
3030 3006 wl->wl_srq_log_wqesz);
3031 3007
3032 3008 /* ASSERT on impossible wqe_index values */
3033 3009 ASSERT(wqe_index < wl->wl_srq_wq_bufsz);
3034 3010
3035 3011 /* Get the pointer to this WQE */
3036 3012 wl_wqe = (uint64_t *)(uintptr_t)wqe_addr;
3037 3013
3038 3014 /* Put this WQE index back on the free list */
3039 3015 ddi_put32(wl->wl_acchdl, (uint32_t *)wl_wqe, wl->wl_free_list_indx);
3040 3016 wl->wl_free_list_indx = wqe_index;
3041 3017
3042 3018 /* Using the index, return the Work Request ID Entry (wre) */
3043 3019 wre = &wl->wl_wre[wqe_index];
3044 3020
3045 3021 return (wre);
3046 3022 }
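
/*
 * Illustrative sketch (editor's addition, not driver source): the
 * 64-bit WQE address reconstruction performed above, together with a
 * plausible expansion of the TAVOR_SRQ_WQE_INDEX() macro (an
 * assumption; the real macro lives in the tavor headers).  The queue
 * base supplies the upper 32 bits, the descriptor offset rebases the
 * zero-based CQE value, and the index falls out of a shift by the
 * log2 WQE size.
 */
#include <stdint.h>

static uint32_t
srq_wqe_index(uintptr_t wq_buf, uint32_t cqe_wqe_addr, uint32_t desc_off,
    uint32_t log_wqesz)
{
	uint64_t wqe_addr;

	/* Upper 32 bits from the queue buffer, lower 32 from the CQE */
	wqe_addr = ((uint64_t)wq_buf & 0xFFFFFFFF00000000ull) |
	    (cqe_wqe_addr + desc_off);

	/* Byte offset into the queue, divided by the WQE size */
	return ((uint32_t)((wqe_addr - (uint64_t)wq_buf) >> log_wqesz));
}
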
3047 3023
3048 3024
3049 3025 /*
3050 3026 * tavor_wrid_cq_reap()
3051 3027 * Context: Can be called from interrupt or base context.
3052 3028 */
3053 3029 void
3054 3030 tavor_wrid_cq_reap(tavor_cqhdl_t cq)
3055 3031 {
3056 3032 tavor_workq_hdr_t *consume_wqhdr;
3057 3033 tavor_wrid_list_hdr_t *container, *to_free;
3058 3034
3059 3035 ASSERT(MUTEX_HELD(&cq->cq_lock));
3060 3036
3061 3037 TAVOR_TNF_ENTER(tavor_wrid_cq_reap);
3062 3038
3063 3039 /* Lock the list of work queues associated with this CQ */
3064 3040 mutex_enter(&cq->cq_wrid_wqhdr_lock);
3065 3041
3066 3042 /* Walk the "reapable" list and free up containers */
3067 3043 container = cq->cq_wrid_reap_head;
3068 3044 while (container != NULL) {
3069 3045 to_free = container;
3070 3046 container = container->wl_reap_next;
3071 3047 /*
3072 3048 * If reaping the WRID list containers pulls the last
3073 3049 * container from the given work queue header, then we free
3074 3050 * the work queue header as well.
3075 3051 */
3076 3052 consume_wqhdr = tavor_wrid_list_reap(to_free);
3077 3053 if (consume_wqhdr != NULL) {
3078 3054 tavor_cq_wqhdr_remove(cq, consume_wqhdr);
3079 3055 }
3080 3056 }
3081 3057
3082 3058 /* Once finished reaping, we reset the CQ's reap list */
3083 3059 cq->cq_wrid_reap_head = cq->cq_wrid_reap_tail = NULL;
3084 3060
3085 3061 mutex_exit(&cq->cq_wrid_wqhdr_lock);
3086 3062 TAVOR_TNF_EXIT(tavor_wrid_cq_reap);
3087 3063 }
3088 3064
3089 3065
3090 3066 /*
3091 3067 * tavor_wrid_cq_force_reap()
3092 3068 * Context: Can be called from interrupt or base context.
3093 3069 */
3094 3070 void
3095 3071 tavor_wrid_cq_force_reap(tavor_cqhdl_t cq)
3096 3072 {
3097 3073 tavor_workq_hdr_t *curr;
3098 3074 tavor_wrid_list_hdr_t *container, *to_free;
3099 3075 avl_tree_t *treep;
3100 3076 void *cookie = NULL;
3101 3077
3102 3078 ASSERT(MUTEX_HELD(&cq->cq_lock));
3103 3079
3104 3080 	TAVOR_TNF_ENTER(tavor_wrid_cq_force_reap);
3105 3081
3106 3082 /*
3107 3083 * The first step is to walk the "reapable" list and free up those
3108 3084 * containers. This is necessary because the containers on the
3109 3085 * reapable list are not otherwise connected to the work queue headers
3110 3086 * anymore.
3111 3087 */
3112 3088 tavor_wrid_cq_reap(cq);
3113 3089
3114 3090 /* Now lock the list of work queues associated with this CQ */
3115 3091 mutex_enter(&cq->cq_wrid_wqhdr_lock);
3116 3092
3117 3093 /*
3118 3094 * Walk the list of work queue headers and free up all the WRID list
3119 3095 * containers chained to it. Note: We don't need to grab the locks
3120 3096 * for each of the individual WRID lists here because the only way
3121 3097 * things can be added or removed from the list at this point would be
3122 3098 	 * by posting a work request to a QP.  But if we've come this far,
3123 3099 	 * then we can be assured that there are no longer any QPs associated
3124 3100 * with the CQ that we are trying to free.
3125 3101 */
3126 -#ifdef __lock_lint
3127 - tavor_wrid_wqhdr_compare(NULL, NULL);
3128 -#endif
3129 3102 treep = &cq->cq_wrid_wqhdr_avl_tree;
3130 3103 while ((curr = avl_destroy_nodes(treep, &cookie)) != NULL) {
3131 - _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*curr))
3132 3104 container = curr->wq_wrid_poll;
3133 3105 while (container != NULL) {
3134 3106 to_free = container;
3135 3107 container = container->wl_next;
3136 3108 /*
3137 3109 * If reaping the WRID list containers pulls the last
3138 3110 * container from the given work queue header, then
3139 3111 * we free the work queue header as well. Note: we
3140 3112 * ignore the return value because we know that the
3141 3113 * work queue header should always be freed once the
3142 3114 * list of containers has come to an end.
3143 3115 */
3144 3116 (void) tavor_wrid_list_reap(to_free);
3145 3117 if (container == NULL) {
3146 3118 tavor_cq_wqhdr_remove(cq, curr);
3147 3119 }
3148 3120 }
3149 3121 }
3150 3122 avl_destroy(treep);
3151 3123
3152 3124 mutex_exit(&cq->cq_wrid_wqhdr_lock);
3153 3125 	TAVOR_TNF_EXIT(tavor_wrid_cq_force_reap);
3154 3126 }
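
/*
 * Illustrative sketch (editor's addition, not driver source): the
 * avl_destroy_nodes() teardown idiom used above, assuming the
 * <sys/avl.h> and <sys/kmem.h> interfaces.  Each call hands back one
 * node without rebalancing, with "cookie" carrying the walk state;
 * avl_destroy() then releases the (now empty) tree's own state.  The
 * node type here is hypothetical.
 */
typedef struct my_node {
	avl_node_t	mn_avl_link;	/* embedded AVL linkage */
} my_node_t;

static void
my_tree_teardown(avl_tree_t *treep)
{
	my_node_t	*node;
	void		*cookie = NULL;

	while ((node = avl_destroy_nodes(treep, &cookie)) != NULL)
		kmem_free(node, sizeof (my_node_t));
	avl_destroy(treep);
}
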
3155 3127
3156 3128
3157 3129 /*
3158 3130 * tavor_wrid_get_list()
3159 3131 * Context: Can be called from interrupt or base context.
3160 3132 */
3161 3133 tavor_wrid_list_hdr_t *
3162 3134 tavor_wrid_get_list(uint32_t qsize)
3163 3135 {
3164 3136 tavor_wrid_list_hdr_t *wridlist;
3165 3137 uint32_t size;
3166 3138
3167 3139 /*
3168 3140 * The WRID list "container" consists of the tavor_wrid_list_hdr_t,
3169 3141 * which holds the pointers necessary for maintaining the "reapable"
3170 3142 * list, chaining together multiple "containers" old and new, and
3171 3143 * tracking the head, tail, size, etc. for each container.
3172 3144 *
3173 3145 	 * The "container" also holds all the tavor_wrid_entry_t's, which
3174 3146 	 * are allocated separately, one for each entry on the corresponding work
3175 3147 * queue.
3176 3148 */
3177 3149 size = sizeof (tavor_wrid_list_hdr_t);
3178 3150
3179 3151 /*
3180 3152 * Note that this allocation has to be a NOSLEEP operation here
3181 3153 * because we are holding the "wqhdr_list_lock" and, therefore,
3182 3154 * could get raised to the interrupt level.
3183 3155 */
3184 3156 wridlist = (tavor_wrid_list_hdr_t *)kmem_zalloc(size, KM_NOSLEEP);
3185 3157 if (wridlist == NULL) {
3186 3158 return (NULL);
3187 3159 }
3188 - _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*wridlist))
3189 3160
3190 3161 /* Complete the "container" initialization */
3191 3162 wridlist->wl_size = qsize;
3192 3163 wridlist->wl_full = 0;
3193 3164 wridlist->wl_head = 0;
3194 3165 wridlist->wl_tail = 0;
3195 3166 wridlist->wl_wre = (tavor_wrid_entry_t *)kmem_zalloc(qsize *
3196 3167 sizeof (tavor_wrid_entry_t), KM_NOSLEEP);
3197 3168 if (wridlist->wl_wre == NULL) {
3198 3169 kmem_free(wridlist, size);
3199 3170 return (NULL);
3200 3171 }
3201 3172 wridlist->wl_wre_old_tail = NULL;
3202 3173 wridlist->wl_reap_next = NULL;
3203 3174 wridlist->wl_next = NULL;
3204 3175 wridlist->wl_prev = NULL;
3205 3176 wridlist->wl_srq_en = 0;
3206 3177
3207 3178 return (wridlist);
3208 3179 }
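
/*
 * Illustrative sketch (editor's addition, not driver source): the
 * two-stage KM_NOSLEEP allocation-with-rollback pattern used above.
 * Because the caller may hold a lock at interrupt level, neither
 * allocation may sleep, and a failed second allocation must unwind
 * the first.  Types here are hypothetical stand-ins.
 */
typedef struct my_entry { uint64_t me_wrid; } my_entry_t;
typedef struct my_list {
	uint32_t	ml_size;
	my_entry_t	*ml_entries;
} my_list_t;

static my_list_t *
my_list_alloc(uint32_t qsize)
{
	my_list_t *lp;

	lp = kmem_zalloc(sizeof (my_list_t), KM_NOSLEEP);
	if (lp == NULL)
		return (NULL);
	lp->ml_entries = kmem_zalloc(qsize * sizeof (my_entry_t),
	    KM_NOSLEEP);
	if (lp->ml_entries == NULL) {
		kmem_free(lp, sizeof (my_list_t));	/* roll back */
		return (NULL);
	}
	lp->ml_size = qsize;
	return (lp);
}
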
3209 3180
3210 3181 /*
3211 3182 * tavor_wrid_list_srq_init()
3212 3183 * Context: Can be called from interrupt or base context
3213 3184 */
3214 3185 void
3215 3186 tavor_wrid_list_srq_init(tavor_wrid_list_hdr_t *wridlist, tavor_srqhdl_t srq,
3216 3187 uint_t wq_start)
3217 3188 {
3218 3189 uint64_t *wl_wqe;
3219 3190 int wqe_index;
3220 3191
3221 3192 ASSERT(MUTEX_HELD(&srq->srq_wrid_wql->wql_lock));
3222 3193
3223 3194 /* Setup pointers for use later when we are polling the CQ */
3224 3195 wridlist->wl_srq_wq_buf = srq->srq_wq_buf;
3225 3196 wridlist->wl_srq_wq_bufsz = srq->srq_wq_bufsz;
3226 3197 wridlist->wl_srq_log_wqesz = srq->srq_wq_log_wqesz;
3227 3198 wridlist->wl_srq_desc_off = srq->srq_desc_off;
3228 3199 wridlist->wl_acchdl = srq->srq_wqinfo.qa_acchdl;
3229 3200
3230 3201 	/* Sanity-check wq_start, the index at which buf initialization begins */
3231 3202 ASSERT(wq_start >= 0 && wq_start < srq->srq_wq_bufsz);
3232 3203
3233 3204 /*
3234 3205 * Initialize wridlist free list
3235 3206 *
3236 3207 	 * For each WQE, up to the size of our queue, we store an index in the WQ
3237 3208 * memory itself, representing the next available free entry. The
3238 3209 * 'wl_free_list_indx' always holds the index of the next available
3239 3210 * free entry in the WQ. If 'wl_free_list_indx' is -1, then we are
3240 3211 * completely full. This gives us the advantage of being able to have
3241 3212 * entries complete or be polled off the WQ out-of-order.
3242 3213 *
3243 3214 * For now, we write the free_list entries inside the WQ itself. It
3244 3215 * may be useful in the future to store this information in a separate
3245 3216 * structure for debugging purposes.
3246 3217 */
3247 3218 for (wqe_index = wq_start; wqe_index < srq->srq_wq_bufsz; wqe_index++) {
3248 3219 wl_wqe = TAVOR_SRQ_WQE_ADDR(srq, wqe_index);
3249 3220 ddi_put32(wridlist->wl_acchdl, (uint32_t *)wl_wqe,
3250 3221 wridlist->wl_free_list_indx);
3251 3222 wridlist->wl_free_list_indx = wqe_index;
3252 3223 }
3253 3224 }
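
/*
 * Illustrative sketch (editor's addition, not driver source): the
 * intrusive free list built above.  Each free WQE's first 32-bit word
 * stores the index of the next free entry, and a single head index
 * threads them together (with a sentinel meaning "no free entries,
 * queue completely full"), so entries can complete and be recycled
 * out of order.  Plain-C stand-in for the ddi_put32()-based kernel
 * version; the pop side, presumably used when posting, is inferred.
 */
#include <stdint.h>

#define	FREE_NONE	0xFFFFFFFFu	/* no free entries: queue full */

/* Push entry "indx" onto the free list; "stride" is WQE size in words */
static void
free_list_push(uint32_t *wq_words, uint32_t stride, uint32_t *headp,
    uint32_t indx)
{
	wq_words[indx * stride] = *headp;	/* link to old head */
	*headp = indx;
}

/* Pop the next free entry, or FREE_NONE if none remain */
static uint32_t
free_list_pop(uint32_t *wq_words, uint32_t stride, uint32_t *headp)
{
	uint32_t indx = *headp;

	if (indx != FREE_NONE)
		*headp = wq_words[indx * stride];
	return (indx);
}
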
3254 3225
3255 3226
3256 3227 /*
3257 3228 * tavor_wrid_reaplist_add()
3258 3229 * Context: Can be called from interrupt or base context.
3259 3230 */
3260 3231 static void
3261 3232 tavor_wrid_reaplist_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wq)
3262 3233 {
3263 3234 ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3264 3235
3265 3236 TAVOR_TNF_ENTER(tavor_wrid_reaplist_add);
3266 3237
3267 3238 mutex_enter(&wq->wq_wrid_wql->wql_lock);
3268 3239
3269 3240 /*
3270 3241 * Add the "post" container (the last one on the current chain) to
3271 3242 * the CQ's "reapable" list
3272 3243 */
3273 3244 if ((cq->cq_wrid_reap_head == NULL) &&
3274 3245 (cq->cq_wrid_reap_tail == NULL)) {
3275 3246 cq->cq_wrid_reap_head = wq->wq_wrid_post;
3276 3247 cq->cq_wrid_reap_tail = wq->wq_wrid_post;
3277 3248 } else {
3278 3249 cq->cq_wrid_reap_tail->wl_reap_next = wq->wq_wrid_post;
3279 3250 cq->cq_wrid_reap_tail = wq->wq_wrid_post;
3280 3251 }
3281 3252
3282 3253 mutex_exit(&wq->wq_wrid_wql->wql_lock);
3283 3254 }
3284 3255
3285 3256
3286 3257 int
3287 3258 tavor_wrid_wqhdr_compare(const void *p1, const void *p2)
3288 3259 {
3289 3260 tavor_workq_compare_t *cmpp;
3290 3261 tavor_workq_hdr_t *curr;
3291 3262
3292 3263 cmpp = (tavor_workq_compare_t *)p1;
3293 3264 curr = (tavor_workq_hdr_t *)p2;
3294 3265
3295 3266 if (cmpp->cmp_qpn < curr->wq_qpn)
3296 3267 return (-1);
3297 3268 else if (cmpp->cmp_qpn > curr->wq_qpn)
3298 3269 return (+1);
3299 3270 else if (cmpp->cmp_type < curr->wq_type)
3300 3271 return (-1);
3301 3272 else if (cmpp->cmp_type > curr->wq_type)
3302 3273 return (+1);
3303 3274 else
3304 3275 return (0);
3305 3276 }
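
/*
 * Illustrative sketch (editor's addition, not driver source): how a
 * three-way comparator like the one above is wired into an AVL tree
 * with avl_create() (from <sys/avl.h>; offsetof() from
 * <sys/sysmacros.h>).  The comparator must return only -1, 0, or +1,
 * ordering first by QP number and then by work queue type.  The
 * "wq_avl_link" field name is an assumption for the avl_node_t
 * embedded in each tavor_workq_hdr_t.
 */
static void
wqhdr_avl_init(avl_tree_t *treep)
{
	avl_create(treep, tavor_wrid_wqhdr_compare,
	    sizeof (tavor_workq_hdr_t),
	    offsetof(tavor_workq_hdr_t, wq_avl_link));
}
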
3306 3277
3307 3278
3308 3279 /*
3309 3280 * tavor_wrid_wqhdr_find()
3310 3281 * Context: Can be called from interrupt or base context.
3311 3282 */
3312 3283 static tavor_workq_hdr_t *
3313 3284 tavor_wrid_wqhdr_find(tavor_cqhdl_t cq, uint_t qpn, uint_t wq_type)
3314 3285 {
3315 3286 tavor_workq_hdr_t *curr;
3316 3287 tavor_workq_compare_t cmp;
3317 3288
3318 3289 TAVOR_TNF_ENTER(tavor_wrid_wqhdr_find);
3319 3290
3320 3291 ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3321 3292
3322 3293 /*
3323 3294 * Walk the CQ's work queue list, trying to find a send or recv queue
3324 3295 * with the same QP number. We do this even if we are going to later
3325 3296 * create a new entry because it helps us easily find the end of the
3326 3297 * list.
3327 3298 */
3328 3299 cmp.cmp_qpn = qpn;
3329 3300 cmp.cmp_type = wq_type;
3330 -#ifdef __lock_lint
3331 - tavor_wrid_wqhdr_compare(NULL, NULL);
3332 -#endif
3333 3301 curr = avl_find(&cq->cq_wrid_wqhdr_avl_tree, &cmp, NULL);
3334 3302
3335 3303 TAVOR_TNF_EXIT(tavor_wrid_wqhdr_find);
3336 3304 return (curr);
3337 3305 }
3338 3306
3339 3307
3340 3308 /*
3341 3309 * tavor_wrid_wqhdr_create()
3342 3310 * Context: Can be called from interrupt or base context.
3343 3311 */
3344 3312 static tavor_workq_hdr_t *
3345 3313 tavor_wrid_wqhdr_create(tavor_state_t *state, tavor_cqhdl_t cq, uint_t qpn,
3346 3314 uint_t wq_type, uint_t create_wql)
3347 3315 {
3348 3316 tavor_workq_hdr_t *wqhdr_tmp;
3349 3317
3350 3318 TAVOR_TNF_ENTER(tavor_wrid_wqhdr_create);
3351 3319
3352 3320 ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3353 3321
3354 3322 /*
3355 3323 	 * Allocate space for a work queue header structure and initialize it.
3356 3324 * Each work queue header structure includes a "wq_wrid_wql"
3357 3325 * which needs to be initialized. Note that this allocation has to be
3358 3326 * a NOSLEEP operation because we are holding the "cq_wrid_wqhdr_lock"
3359 3327 * and, therefore, could get raised to the interrupt level.
3360 3328 */
3361 3329 wqhdr_tmp = (tavor_workq_hdr_t *)kmem_zalloc(
3362 3330 sizeof (tavor_workq_hdr_t), KM_NOSLEEP);
3363 3331 if (wqhdr_tmp == NULL) {
3364 3332 TAVOR_TNF_EXIT(tavor_wrid_wqhdr_create);
3365 3333 return (NULL);
3366 3334 }
3367 - _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*wqhdr_tmp))
3368 3335 wqhdr_tmp->wq_qpn = qpn;
3369 3336 wqhdr_tmp->wq_type = wq_type;
3370 3337
3371 3338 if (create_wql) {
3372 3339 wqhdr_tmp->wq_wrid_wql = tavor_wrid_wql_create(state);
3373 3340 if (wqhdr_tmp->wq_wrid_wql == NULL) {
3374 3341 kmem_free(wqhdr_tmp, sizeof (tavor_workq_hdr_t));
3375 3342 TAVOR_TNF_EXIT(tavor_wrid_wqhdr_create);
3376 3343 return (NULL);
3377 3344 }
3378 3345 }
3379 3346
3380 3347 wqhdr_tmp->wq_wrid_poll = NULL;
3381 3348 wqhdr_tmp->wq_wrid_post = NULL;
3382 3349
3383 3350 /* Chain the newly allocated work queue header to the CQ's list */
3384 3351 tavor_cq_wqhdr_add(cq, wqhdr_tmp);
3385 3352
3386 3353 TAVOR_TNF_EXIT(tavor_wrid_wqhdr_create);
3387 3354 return (wqhdr_tmp);
3388 3355 }
3389 3356
3390 3357
3391 3358 /*
3392 3359 * tavor_wrid_wql_create()
3393 3360 * Context: Can be called from interrupt or base context.
3394 3361 */
3395 3362 tavor_wq_lock_t *
3396 3363 tavor_wrid_wql_create(tavor_state_t *state)
3397 3364 {
3398 3365 tavor_wq_lock_t *wql;
3399 3366
3400 3367 TAVOR_TNF_ENTER(tavor_wrid_wql_create);
3401 3368
3402 3369 /*
3403 3370 * Allocate the WQL and initialize it.
3404 3371 */
3405 3372 wql = kmem_zalloc(sizeof (tavor_wq_lock_t), KM_NOSLEEP);
3406 3373 if (wql == NULL) {
3407 3374 		TAVOR_TNF_EXIT(tavor_wrid_wql_create);
3408 3375 return (NULL);
3409 3376 }
3410 3377
3411 3378 mutex_init(&wql->wql_lock, NULL, MUTEX_DRIVER,
3412 3379 DDI_INTR_PRI(state->ts_intrmsi_pri));
3413 3380
3414 3381 /* Add refcount to WQL */
3415 3382 tavor_wql_refcnt_inc(wql);
3416 3383
3417 3384 TAVOR_TNF_EXIT(tavor_wrid_wql_create);
3418 3385 return (wql);
3419 3386 }
3420 3387
3421 3388
3422 3389 /*
3423 3390 * tavor_wrid_get_wqeaddrsz()
3424 3391 * Context: Can be called from interrupt or base context.
3425 3392 */
3426 3393 static uint32_t
3427 3394 tavor_wrid_get_wqeaddrsz(tavor_workq_hdr_t *wq)
3428 3395 {
3429 3396 tavor_wrid_entry_t *wre;
3430 3397 uint32_t wqeaddrsz;
3431 3398 uint32_t head;
3432 3399
3433 3400 /*
3434 3401 * If the container is empty, then there is no next entry. So just
3435 3402 * return zero. Note: the "head == tail" condition here can only
3436 3403 * mean that the container is empty because we have previously pulled
3437 3404 * something from the container.
3438 3405 *
3439 3406 * If the container is not empty, then find the next entry and return
3440 3407 * the contents of its "wqeaddrsz" field.
3441 3408 */
3442 3409 if (wq->wq_wrid_poll->wl_head == wq->wq_wrid_poll->wl_tail) {
3443 3410 wqeaddrsz = 0;
3444 3411 } else {
3445 3412 /*
3446 3413 * We don't need to calculate the "next" head pointer here
3447 3414 * because "head" should already point to the next entry on
3448 3415 * the list (since we just pulled something off - in
3449 3416 * tavor_wrid_find_match() - and moved the head index forward.)
3450 3417 */
3451 3418 head = wq->wq_wrid_poll->wl_head;
3452 3419 wre = &wq->wq_wrid_poll->wl_wre[head];
3453 3420 wqeaddrsz = wre->wr_wqeaddrsz;
3454 3421 }
3455 3422 return (wqeaddrsz);
3456 3423 }
3457 3424
3458 3425
3459 3426 /*
3460 3427 * tavor_wrid_wqhdr_add()
3461 3428 * Context: Can be called from interrupt or base context.
3462 3429 */
3463 3430 static void
3464 3431 tavor_wrid_wqhdr_add(tavor_workq_hdr_t *wqhdr,
3465 3432 tavor_wrid_list_hdr_t *wridlist)
3466 3433 {
3467 3434 ASSERT(MUTEX_HELD(&wqhdr->wq_wrid_wql->wql_lock));
3468 3435
3469 3436 /* Chain the new WRID list "container" to the work queue list */
3470 3437 if ((wqhdr->wq_wrid_post == NULL) &&
3471 3438 (wqhdr->wq_wrid_poll == NULL)) {
3472 3439 wqhdr->wq_wrid_poll = wridlist;
3473 3440 wqhdr->wq_wrid_post = wridlist;
3474 3441 } else {
3475 3442 wqhdr->wq_wrid_post->wl_next = wridlist;
3476 3443 wridlist->wl_prev = wqhdr->wq_wrid_post;
3477 3444 wqhdr->wq_wrid_post = wridlist;
3478 3445 }
3479 3446 }
3480 3447
3481 3448
3482 3449 /*
3483 3450 * tavor_wrid_wqhdr_remove()
3484 3451 * Context: Can be called from interrupt or base context.
3485 3452 *
3486 3453 * Note: this is only called to remove the most recently added WRID list
3487 3454 * container (i.e. in tavor_from_reset() above)
3488 3455 */
3489 3456 static void
3490 3457 tavor_wrid_wqhdr_remove(tavor_workq_hdr_t *wqhdr,
3491 3458 tavor_wrid_list_hdr_t *wridlist)
3492 3459 {
3493 3460 tavor_wrid_list_hdr_t *prev, *next;
3494 3461
3495 3462 ASSERT(MUTEX_HELD(&wqhdr->wq_wrid_wql->wql_lock));
3496 3463
3497 3464 /* Unlink the WRID list "container" from the work queue list */
3498 3465 prev = wridlist->wl_prev;
3499 3466 next = wridlist->wl_next;
3500 3467 if (prev != NULL) {
3501 3468 prev->wl_next = next;
3502 3469 }
3503 3470 if (next != NULL) {
3504 3471 next->wl_prev = prev;
3505 3472 }
3506 3473
3507 3474 /*
3508 3475 * Update any pointers in the work queue hdr that may point to this
3509 3476 * WRID list container
3510 3477 */
3511 3478 if (wqhdr->wq_wrid_post == wridlist) {
3512 3479 wqhdr->wq_wrid_post = prev;
3513 3480 }
3514 3481 if (wqhdr->wq_wrid_poll == wridlist) {
3515 3482 wqhdr->wq_wrid_poll = NULL;
3516 3483 }
3517 3484 }
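
/*
 * Illustrative sketch (editor's addition, not driver source): the
 * NULL-safe doubly-linked unlink used above and again in
 * tavor_wrid_list_reap() below.  Either neighbor may be absent, and
 * any owner pointers aimed at the departing node must be re-aimed
 * after the splice.  Types are hypothetical.
 */
typedef struct dnode {
	struct dnode	*d_prev;
	struct dnode	*d_next;
} dnode_t;

static void
dlist_unlink(dnode_t **headp, dnode_t *np)
{
	if (np->d_prev != NULL)
		np->d_prev->d_next = np->d_next;
	if (np->d_next != NULL)
		np->d_next->d_prev = np->d_prev;
	if (*headp == np)		/* re-aim the owner's pointer */
		*headp = np->d_next;
	np->d_prev = np->d_next = NULL;
}
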
3518 3485
3519 3486
3520 3487 /*
3521 3488 * tavor_wrid_list_reap()
3522 3489 * Context: Can be called from interrupt or base context.
3523 3490 * Note: The "wqhdr_list_lock" must be held.
3524 3491 */
3525 3492 static tavor_workq_hdr_t *
3526 3493 tavor_wrid_list_reap(tavor_wrid_list_hdr_t *wridlist)
3527 3494 {
3528 3495 tavor_workq_hdr_t *wqhdr, *consume_wqhdr = NULL;
3529 3496 tavor_wrid_list_hdr_t *prev, *next;
3530 3497 uint32_t size;
3531 3498
3532 3499 TAVOR_TNF_ENTER(tavor_wrid_list_reap);
3533 3500
3534 3501 /* Get the back pointer to the work queue header (see below) */
3535 3502 wqhdr = wridlist->wl_wqhdr;
3536 3503 mutex_enter(&wqhdr->wq_wrid_wql->wql_lock);
3537 3504
3538 3505 /* Unlink the WRID list "container" from the work queue list */
3539 3506 prev = wridlist->wl_prev;
3540 3507 next = wridlist->wl_next;
3541 3508 if (prev != NULL) {
3542 3509 prev->wl_next = next;
3543 3510 }
3544 3511 if (next != NULL) {
3545 3512 next->wl_prev = prev;
3546 3513 }
3547 3514
3548 3515 /*
3549 3516 * If the back pointer to the work queue header shows that it
3550 3517 * was pointing to the entry we are about to remove, then the work
3551 3518 * queue header is reapable as well.
3552 3519 */
3553 3520 if ((wqhdr->wq_wrid_poll == wridlist) &&
3554 3521 (wqhdr->wq_wrid_post == wridlist)) {
3555 3522 consume_wqhdr = wqhdr;
3556 3523 }
3557 3524
3558 3525 /* Be sure to update the "poll" and "post" container pointers */
3559 3526 if (wqhdr->wq_wrid_poll == wridlist) {
3560 3527 wqhdr->wq_wrid_poll = next;
3561 3528 }
3562 3529 if (wqhdr->wq_wrid_post == wridlist) {
3563 3530 wqhdr->wq_wrid_post = NULL;
3564 3531 }
3565 3532
3566 3533 /* Calculate the size and free the container */
3567 3534 size = (wridlist->wl_size * sizeof (tavor_wrid_entry_t));
3568 3535 kmem_free(wridlist->wl_wre, size);
3569 3536 kmem_free(wridlist, sizeof (tavor_wrid_list_hdr_t));
3570 3537
3571 3538 mutex_exit(&wqhdr->wq_wrid_wql->wql_lock);
3572 3539
3573 3540 TAVOR_TNF_EXIT(tavor_wrid_list_reap);
3574 3541 return (consume_wqhdr);
3575 3542 }
3576 3543
3577 3544
3578 3545 /*
3579 3546 * tavor_wrid_wqhdr_lock_both()
3580 3547 * Context: Can be called from interrupt or base context.
3581 3548 */
3582 3549 static void
3583 3550 tavor_wrid_wqhdr_lock_both(tavor_qphdl_t qp)
3584 3551 {
3585 3552 tavor_cqhdl_t sq_cq, rq_cq;
3586 3553
3587 3554 sq_cq = qp->qp_sq_cqhdl;
3588 3555 rq_cq = qp->qp_rq_cqhdl;
3589 3556
3590 -_NOTE(MUTEX_ACQUIRED_AS_SIDE_EFFECT(&sq_cq->cq_wrid_wqhdr_lock))
3591 -_NOTE(MUTEX_ACQUIRED_AS_SIDE_EFFECT(&rq_cq->cq_wrid_wqhdr_lock))
3592 -
3593 3557 /*
3594 3558 * If both work queues (send and recv) share a completion queue, then
3595 3559 * grab the common lock. If they use different CQs (hence different
3596 3560 * "cq_wrid_wqhdr_list" locks), then grab the send one first, then the
3597 3561 * receive. We do this consistently and correctly in
3598 3562 * tavor_wrid_wqhdr_unlock_both() below to avoid introducing any kind
3599 - * of dead lock condition. Note: We add the "__lock_lint" code here
3600 - * to fake out warlock into thinking we've grabbed both locks (when,
3601 - * in fact, we only needed the one).
3563 + * of deadlock condition.
3602 3564 */
3603 3565 if (sq_cq == rq_cq) {
3604 3566 mutex_enter(&sq_cq->cq_wrid_wqhdr_lock);
3605 -#ifdef __lock_lint
3606 - mutex_enter(&rq_cq->cq_wrid_wqhdr_lock);
3607 -#endif
3608 3567 } else {
3609 3568 mutex_enter(&sq_cq->cq_wrid_wqhdr_lock);
3610 3569 mutex_enter(&rq_cq->cq_wrid_wqhdr_lock);
3611 3570 }
3612 3571 }
3613 3572
3614 3573 /*
3615 3574 * tavor_wrid_wqhdr_unlock_both()
3616 3575 * Context: Can be called from interrupt or base context.
3617 3576 */
3618 3577 static void
3619 3578 tavor_wrid_wqhdr_unlock_both(tavor_qphdl_t qp)
3620 3579 {
3621 3580 tavor_cqhdl_t sq_cq, rq_cq;
3622 3581
3623 3582 sq_cq = qp->qp_sq_cqhdl;
3624 3583 rq_cq = qp->qp_rq_cqhdl;
3625 3584
3626 -_NOTE(LOCK_RELEASED_AS_SIDE_EFFECT(&rq_cq->cq_wrid_wqhdr_lock))
3627 -_NOTE(LOCK_RELEASED_AS_SIDE_EFFECT(&sq_cq->cq_wrid_wqhdr_lock))
3628 -
3629 3585 /*
3630 3586 * See tavor_wrid_wqhdr_lock_both() above for more detail
3631 3587 */
3632 3588 if (sq_cq == rq_cq) {
3633 -#ifdef __lock_lint
3634 - mutex_exit(&rq_cq->cq_wrid_wqhdr_lock);
3635 -#endif
3636 3589 mutex_exit(&sq_cq->cq_wrid_wqhdr_lock);
3637 3590 } else {
3638 3591 mutex_exit(&rq_cq->cq_wrid_wqhdr_lock);
3639 3592 mutex_exit(&sq_cq->cq_wrid_wqhdr_lock);
3640 3593 }
3641 3594 }
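
/*
 * Illustrative sketch (editor's addition, not driver source): the
 * lock-ordering discipline the two routines above rely on.  When two
 * locks must be held at once, acquiring them in one fixed order (send
 * CQ before receive CQ) and releasing in the reverse order prevents
 * an ABBA deadlock; when both work queues share a CQ, the single
 * common lock is taken exactly once.  Plain pthread stand-in.
 */
#include <pthread.h>

static void
lock_pair(pthread_mutex_t *sq, pthread_mutex_t *rq)
{
	if (sq == rq) {
		(void) pthread_mutex_lock(sq);	/* shared CQ: one lock */
	} else {
		(void) pthread_mutex_lock(sq);	/* fixed order: send... */
		(void) pthread_mutex_lock(rq);	/* ...then receive */
	}
}

static void
unlock_pair(pthread_mutex_t *sq, pthread_mutex_t *rq)
{
	if (sq == rq) {
		(void) pthread_mutex_unlock(sq);
	} else {
		(void) pthread_mutex_unlock(rq);	/* reverse order */
		(void) pthread_mutex_unlock(sq);
	}
}
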
3642 3595
3643 3596
3644 3597 /*
3645 3598 * tavor_cq_wqhdr_add()
3646 3599 * Context: Can be called from interrupt or base context.
3647 3600 */
3648 3601 static void
3649 3602 tavor_cq_wqhdr_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr)
3650 3603 {
3651 3604 tavor_workq_compare_t cmp;
3652 3605 avl_index_t where;
3653 3606
3654 3607 ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3655 3608
3656 3609 cmp.cmp_qpn = wqhdr->wq_qpn;
3657 3610 cmp.cmp_type = wqhdr->wq_type;
3658 -#ifdef __lock_lint
3659 - tavor_wrid_wqhdr_compare(NULL, NULL);
3660 -#endif
3661 3611 (void) avl_find(&cq->cq_wrid_wqhdr_avl_tree, &cmp, &where);
3662 3612 /*
3663 3613 	 * Insert the new work queue header into the AVL tree at the
3664 3614 	 * insertion point identified by the avl_find() call above.
3665 3615 */
3666 3616 avl_insert(&cq->cq_wrid_wqhdr_avl_tree, wqhdr, where);
3667 3617 }
3668 3618
3669 3619
3670 3620 /*
3671 3621 * tavor_cq_wqhdr_remove()
3672 3622 * Context: Can be called from interrupt or base context.
3673 3623 */
3674 3624 static void
3675 3625 tavor_cq_wqhdr_remove(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr)
3676 3626 {
3677 3627 ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3678 3628
3679 -#ifdef __lock_lint
3680 - tavor_wrid_wqhdr_compare(NULL, NULL);
3681 -#endif
3682 3629 /* Remove "wqhdr" from the work queue header list on "cq" */
3683 3630 avl_remove(&cq->cq_wrid_wqhdr_avl_tree, wqhdr);
3684 3631
3685 3632 /*
3686 3633 * Release reference to WQL; If this is the last reference, this call
3687 3634 * also has the side effect of freeing up the 'wq_wrid_wql' memory.
3688 3635 */
3689 3636 tavor_wql_refcnt_dec(wqhdr->wq_wrid_wql);
3690 3637
3691 3638 /* Free the memory associated with "wqhdr" */
3692 3639 kmem_free(wqhdr, sizeof (tavor_workq_hdr_t));
3693 3640 }
3694 3641
3695 3642
3696 3643 /*
3697 3644 * tavor_wql_refcnt_inc()
3698 3645 * Context: Can be called from interrupt or base context
3699 3646 */
3700 3647 void
3701 3648 tavor_wql_refcnt_inc(tavor_wq_lock_t *wql)
3702 3649 {
3703 3650 ASSERT(wql != NULL);
3704 3651
3705 3652 mutex_enter(&wql->wql_lock);
3706 3653 wql->wql_refcnt++;
3707 3654 mutex_exit(&wql->wql_lock);
3708 3655 }
3709 3656
3710 3657 /*
3711 3658 * tavor_wql_refcnt_dec()
3712 3659 * Context: Can be called from interrupt or base context
3713 3660 */
3714 3661 void
3715 3662 tavor_wql_refcnt_dec(tavor_wq_lock_t *wql)
3716 3663 {
3717 3664 int refcnt;
3718 3665
3719 3666 ASSERT(wql != NULL);
3720 3667
3721 3668 mutex_enter(&wql->wql_lock);
3722 3669 wql->wql_refcnt--;
3723 3670 refcnt = wql->wql_refcnt;
3724 3671 mutex_exit(&wql->wql_lock);
3725 3672
3726 3673 	/*
3728 3675 * Free up WQL memory if we're the last one associated with this
3729 3676 * structure.
3730 3677 */
3731 3678 if (refcnt == 0) {
3732 3679 mutex_destroy(&wql->wql_lock);
3733 3680 kmem_free(wql, sizeof (tavor_wq_lock_t));
3734 3681 }
3735 3682 }
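
/*
 * Illustrative sketch (editor's addition, not driver source): the
 * mutex-protected reference count implemented by the two routines
 * above.  The count is snapshotted inside the lock, but the teardown
 * happens outside it, since the mutex being destroyed lives inside
 * the object being freed.  Plain pthread stand-in with hypothetical
 * names.
 */
#include <pthread.h>
#include <stdlib.h>

typedef struct refobj {
	pthread_mutex_t	ro_lock;
	int		ro_refcnt;
} refobj_t;

static void
refobj_rele(refobj_t *rp)
{
	int refcnt;

	(void) pthread_mutex_lock(&rp->ro_lock);
	refcnt = --rp->ro_refcnt;
	(void) pthread_mutex_unlock(&rp->ro_lock);

	/* Last reference: destroy the lock and free the object */
	if (refcnt == 0) {
		(void) pthread_mutex_destroy(&rp->ro_lock);
		free(rp);
	}
}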