/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * tavor_srq.c
 *    Tavor Shared Receive Queue Processing Routines
 *
 *    Implements all the routines necessary for allocating, freeing, querying,
 *    modifying and posting shared receive queues.
 */

#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/bitmap.h>

#include <sys/ib/adapters/tavor/tavor.h>

static void tavor_srq_sgl_to_logwqesz(tavor_state_t *state, uint_t num_sgl,
    tavor_qp_wq_type_t wq_type, uint_t *logwqesz, uint_t *max_sgl);

/*
 * tavor_srq_alloc()
 *    Context: Can be called only from user or kernel context.
 */
int
tavor_srq_alloc(tavor_state_t *state, tavor_srq_info_t *srqinfo,
    uint_t sleepflag, tavor_srq_options_t *op)
{
        ibt_srq_hdl_t           ibt_srqhdl;
        tavor_pdhdl_t           pd;
        ibt_srq_sizes_t         *sizes;
        ibt_srq_sizes_t         *real_sizes;
        tavor_srqhdl_t          *srqhdl;
        ibt_srq_flags_t         flags;
        tavor_rsrc_t            *srqc, *rsrc;
        tavor_hw_srqc_t         srqc_entry;
        uint32_t                *buf;
        tavor_srqhdl_t          srq;
        tavor_umap_db_entry_t   *umapdb;
        ibt_mr_attr_t           mr_attr;
        tavor_mr_options_t      mr_op;
        tavor_mrhdl_t           mr;
        uint64_t                addr;
        uint64_t                value, srq_desc_off;
        uint32_t                lkey;
        uint32_t                log_srq_size;
        uint32_t                uarpg;
        uint_t                  wq_location, dma_xfer_mode, srq_is_umap;
        int                     flag, status;
        char                    *errormsg;
        uint_t                  max_sgl;
        uint_t                  wqesz;

        TAVOR_TNF_ENTER(tavor_srq_alloc);
        /*
         * Check the "options" flag.  Currently this flag tells the driver
         * whether the SRQ's work queues should come from normal system
         * memory or be allocated from DDR memory.
         */
        if (op == NULL) {
                wq_location = TAVOR_QUEUE_LOCATION_NORMAL;
        } else {
                wq_location = op->srqo_wq_loc;
        }
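
        /*
         * Illustrative (hypothetical) usage sketch: a caller that wants the
         * work queues placed in HCA-attached DDR memory would pass an
         * options struct rather than NULL, e.g. (assuming the DDR location
         * constant from tavor.h):
         *
         *      tavor_srq_options_t op;
         *      op.srqo_wq_loc = TAVOR_QUEUE_LOCATION_INDDR;
         *      (void) tavor_srq_alloc(state, &srqinfo, TAVOR_SLEEP, &op);
         *
         * Passing NULL simply selects normal system memory, as above.
         */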

        /*
         * Extract the necessary info from the tavor_srq_info_t structure
         */
        real_sizes = srqinfo->srqi_real_sizes;
        sizes      = srqinfo->srqi_sizes;
        pd         = srqinfo->srqi_pd;
        ibt_srqhdl = srqinfo->srqi_ibt_srqhdl;
        flags      = srqinfo->srqi_flags;
        srqhdl     = srqinfo->srqi_srqhdl;

        /*
         * Determine whether the SRQ is being allocated for userland access
         * or for kernel access.  If the SRQ is being allocated for userland
         * access, then look up the UAR doorbell page number for the current
         * process.  Note:  If this is not found (e.g. if the process has not
         * previously open()'d the Tavor driver), then an error is returned.
         */
        srq_is_umap = (flags & IBT_SRQ_USER_MAP) ? 1 : 0;
        if (srq_is_umap) {
                status = tavor_umap_db_find(state->ts_instance, ddi_get_pid(),
                    MLNX_UMAP_UARPG_RSRC, &value, 0, NULL);
                if (status != DDI_SUCCESS) {
                        /*
                         * Set "status" and "errormsg" and goto failure.
                         * Note: nothing has been allocated yet and the PD
                         * reference count has not been bumped, so we jump
                         * to the plain failure exit rather than into the
                         * resource-cleanup chain.
                         */
                        TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "failed UAR page");
                        goto srqalloc_fail;
                }
                uarpg = ((tavor_rsrc_t *)(uintptr_t)value)->tr_indx;
        }

        /* Increase PD refcnt */
        tavor_pd_refcnt_inc(pd);

        /* Allocate an SRQ context entry */
        status = tavor_rsrc_alloc(state, TAVOR_SRQC, 1, sleepflag, &srqc);
        if (status != DDI_SUCCESS) {
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed SRQ context");
                goto srqalloc_fail1;
        }

        /* Allocate the SRQ Handle entry */
        status = tavor_rsrc_alloc(state, TAVOR_SRQHDL, 1, sleepflag, &rsrc);
        if (status != DDI_SUCCESS) {
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed SRQ handle");
                goto srqalloc_fail2;
        }

        srq = (tavor_srqhdl_t)rsrc->tr_addr;

        srq->srq_srqnum = srqc->tr_indx;  /* just use index */

        /*
         * If this will be a user-mappable SRQ, then allocate an entry for
         * the "userland resources database".  This will later be added to
         * the database (after all further SRQ operations are successful).
         * If we fail here, we must undo the reference counts and the
         * previous resource allocation.
         */
        if (srq_is_umap) {
                umapdb = tavor_umap_db_alloc(state->ts_instance,
                    srq->srq_srqnum, MLNX_UMAP_SRQMEM_RSRC,
                    (uint64_t)(uintptr_t)rsrc);
                if (umapdb == NULL) {
                        /* Set "status" and "errormsg" and goto failure */
                        TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umap add");
                        goto srqalloc_fail3;
                }
        }

        /*
         * Calculate the appropriate size for the SRQ.
         * Note:  All Tavor SRQs must be a power-of-2 in size.  Also
         * they may not be any smaller than TAVOR_SRQ_MIN_SIZE.  This step
         * is to round the requested size up to the next highest power-of-2
         */
        sizes->srq_wr_sz = max(sizes->srq_wr_sz, TAVOR_SRQ_MIN_SIZE);
        log_srq_size = highbit(sizes->srq_wr_sz);
        if (ISP2(sizes->srq_wr_sz)) {
                log_srq_size = log_srq_size - 1;
        }
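
        /*
         * The two lines above compute ceil(log2(srq_wr_sz)): highbit()
         * returns the 1-based position of the most significant bit, which
         * overshoots by one exactly when the size is already a power-of-2.
         * For example, a request for 1000 WRs gives highbit(1000) = 10
         * (real size 1024), while a request for exactly 1024 gives
         * highbit(1024) - 1 = 10 as well.
         */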

        /*
         * Next we verify that the rounded-up size is valid (i.e. consistent
         * with the device limits and/or software-configured limits).  If not,
         * then obviously we have a lot of cleanup to do before returning.
         */
        if (log_srq_size > state->ts_cfg_profile->cp_log_max_srq_sz) {
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_HCA_WR_EXCEEDED, "max SRQ size");
                goto srqalloc_fail4;
        }

        /*
         * Next we verify that the requested number of SGL is valid (i.e.
         * consistent with the device limits and/or software-configured
         * limits).  If not, then obviously the same cleanup needs to be done.
         */
        max_sgl = state->ts_cfg_profile->cp_srq_max_sgl;
        if (sizes->srq_sgl_sz > max_sgl) {
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_HCA_SGL_EXCEEDED, "max SRQ SGL");
                goto srqalloc_fail4;
        }

        /*
         * Determine the SRQ's WQE sizes.  This depends on the requested
         * number of SGLs.  Note: This also has the side-effect of
         * calculating the real number of SGLs (for the calculated WQE size).
         */
        tavor_srq_sgl_to_logwqesz(state, sizes->srq_sgl_sz,
            TAVOR_QP_WQ_TYPE_RECVQ, &srq->srq_wq_log_wqesz,
            &srq->srq_wq_sgl);

        /*
         * Allocate the memory for SRQ work queues.  Note:  The location from
         * which we will allocate these work queues was passed in through
         * the tavor_srq_options_t structure.  Since Tavor work queues are not
         * allowed to cross a 32-bit (4GB) boundary, the alignment of the work
         * queue memory is very important.  We used to allocate work queues
         * (the combined receive and send queues) so that they would be aligned
         * on their combined size.  That alignment guaranteed that they would
         * never cross the 4GB boundary (Tavor work queues are on the order of
         * MBs at maximum).  Now we are able to relax this alignment constraint
         * by ensuring that the IB address assigned to the queue memory (as a
         * result of the tavor_mr_register() call) is offset from zero.
         * Previously, we had wanted to use the ddi_dma_mem_alloc() routine to
         * guarantee the alignment, but when attempting to use IOMMU bypass
         * mode we found that we were not allowed to specify any alignment that
         * was more restrictive than the system page size.  So we avoided this
         * constraint by passing two alignment values, one for the memory
         * allocation itself and the other for the DMA handle (for later bind).
         * This used to cause more memory than necessary to be allocated (in
         * order to guarantee the more restrictive alignment constraint).  But
         * by guaranteeing the zero-based IB virtual address for the queue, we
         * are able to conserve this memory.
         *
         * Note: If the SRQ is not user-mappable, then it may come from either
         * kernel system memory or from HCA-attached local DDR memory.
         *
         * Note2: We align this queue on a pagesize boundary.  This is required
         * to make sure that all the resulting IB addresses will start at 0, for
         * a zero-based queue.  By making sure we are aligned on at least a
         * page, any offset we use into our queue will be the same as when we
         * perform tavor_srq_modify() operations later.
         */
        wqesz = (1 << srq->srq_wq_log_wqesz);
        srq->srq_wqinfo.qa_size = (1 << log_srq_size) * wqesz;
        srq->srq_wqinfo.qa_alloc_align = PAGESIZE;
        srq->srq_wqinfo.qa_bind_align = PAGESIZE;
        if (srq_is_umap) {
                srq->srq_wqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
        } else {
                srq->srq_wqinfo.qa_location = wq_location;
        }
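
        /*
         * Sizing sketch (illustrative numbers only): with a rounded-up
         * log_srq_size of 10 and a 128-byte WQE (srq_wq_log_wqesz = 7),
         * qa_size = 1024 * 128 = 128KB of page-aligned queue memory.
         */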
        status = tavor_queue_alloc(state, &srq->srq_wqinfo, sleepflag);
        if (status != DDI_SUCCESS) {
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed srq");
                goto srqalloc_fail4;
        }
        buf = (uint32_t *)srq->srq_wqinfo.qa_buf_aligned;

        /*
         * Register the memory for the SRQ work queues.  The memory for the SRQ
         * must be registered in the Tavor TPT tables.  This gives us the LKey
         * to specify in the SRQ context later.  Note: If the work queue is to
         * be allocated from DDR memory, then only a "bypass" mapping is
         * appropriate.  And if the SRQ memory is user-mappable, then we force
         * DDI_DMA_CONSISTENT mapping.  Also, in order to meet the alignment
         * restriction, we pass the "mro_bind_override_addr" flag in the call
         * to tavor_mr_register().  This guarantees that the resulting IB vaddr
         * will be zero-based (modulo the offset into the first page).  If we
         * fail here, we still have plenty of resource and reference count
         * cleanup to do.
         */
        flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP :
            IBT_MR_NOSLEEP;
        mr_attr.mr_vaddr = (uint64_t)(uintptr_t)buf;
        mr_attr.mr_len   = srq->srq_wqinfo.qa_size;
        mr_attr.mr_as    = NULL;
        mr_attr.mr_flags = flag | IBT_MR_ENABLE_LOCAL_WRITE;
        if (srq_is_umap) {
                mr_op.mro_bind_type = state->ts_cfg_profile->cp_iommu_bypass;
        } else {
                if (wq_location == TAVOR_QUEUE_LOCATION_NORMAL) {
                        mr_op.mro_bind_type =
                            state->ts_cfg_profile->cp_iommu_bypass;
                        dma_xfer_mode =
                            state->ts_cfg_profile->cp_streaming_consistent;
                        if (dma_xfer_mode == DDI_DMA_STREAMING) {
                                mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
                        }
                } else {
                        mr_op.mro_bind_type = TAVOR_BINDMEM_BYPASS;
                }
        }
        mr_op.mro_bind_dmahdl = srq->srq_wqinfo.qa_dmahdl;
        mr_op.mro_bind_override_addr = 1;
        status = tavor_mr_register(state, pd, &mr_attr, &mr, &mr_op);
        if (status != DDI_SUCCESS) {
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed register mr");
                goto srqalloc_fail5;
        }
        addr = mr->mr_bindinfo.bi_addr;
        lkey = mr->mr_lkey;

        /*
         * Calculate the offset between the kernel virtual address space
         * and the IB virtual address space.  This will be used when
         * posting work requests to properly initialize each WQE.
         */
        srq_desc_off = (uint64_t)(uintptr_t)srq->srq_wqinfo.qa_buf_aligned -
            (uint64_t)mr->mr_bindinfo.bi_addr;
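
        /*
         * For example (illustrative addresses): with a kernel buffer VA of
         * 0xffffff0012345000 bound to a zero-based IB vaddr of 0x0,
         * srq_desc_off equals the kernel VA itself; the posting code can
         * then recover a WQE's IB address by subtracting srq_desc_off from
         * its kernel VA.
         */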

        /*
         * Create WQL and Wridlist for use by this SRQ
         */
        srq->srq_wrid_wql = tavor_wrid_wql_create(state);
        if (srq->srq_wrid_wql == NULL) {
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed wql create");
                goto srqalloc_fail6;
        }

        srq->srq_wridlist = tavor_wrid_get_list(1 << log_srq_size);
        if (srq->srq_wridlist == NULL) {
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed wridlist create");
                goto srqalloc_fail7;
        }

        srq->srq_wridlist->wl_srq_en = 1;
        srq->srq_wridlist->wl_free_list_indx = -1;

        /*
         * Fill in all the return arguments (if necessary).  This includes
         * real queue size and real SGLs.
         */
        if (real_sizes != NULL) {
                real_sizes->srq_wr_sz = (1 << log_srq_size);
                real_sizes->srq_sgl_sz = srq->srq_wq_sgl;
        }

        /*
         * Fill in the SRQC entry.  This is the final step before passing
         * ownership of the SRQC entry to the Tavor hardware.  We use all of
         * the information collected/calculated above to fill in the
         * requisite portions of the SRQC.  Note: If this SRQ is going to be
         * used for userland access, then we need to set the UAR page number
         * appropriately (otherwise it's a "don't care")
         */
        bzero(&srqc_entry, sizeof (tavor_hw_srqc_t));
        srqc_entry.wqe_addr_h      = (addr >> 32);
        srqc_entry.next_wqe_addr_l = 0;
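        /*
         * Note: the SRQC "ds" (descriptor size) field is expressed in
         * 16-byte chunks, hence the shift by 4 below.
         */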
        srqc_entry.ds              = (wqesz >> 4);
        srqc_entry.state           = TAVOR_SRQ_STATE_HW_OWNER;
        srqc_entry.pd              = pd->pd_pdnum;
        srqc_entry.lkey            = lkey;
        srqc_entry.wqe_cnt         = 0;
        if (srq_is_umap) {
                srqc_entry.uar     = uarpg;
        } else {
                srqc_entry.uar     = 0;
        }

        /*
         * Write the SRQC entry to hardware.  Lastly, we pass ownership of
         * the entry to the hardware (using the Tavor SW2HW_SRQ firmware
         * command).  Note: In general, this operation shouldn't fail.  But
         * if it does, we have to undo everything we've done above before
         * returning error.
         */
        status = tavor_cmn_ownership_cmd_post(state, SW2HW_SRQ, &srqc_entry,
            sizeof (tavor_hw_srqc_t), srq->srq_srqnum,
            sleepflag);
        if (status != TAVOR_CMD_SUCCESS) {
                cmn_err(CE_CONT, "Tavor: SW2HW_SRQ command failed: %08x\n",
                    status);
                TNF_PROBE_1(tavor_srq_alloc_sw2hw_srq_cmd_fail,
                    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_FAILURE, "tavor SW2HW_SRQ command");
                goto srqalloc_fail8;
        }

        /*
         * Fill in the rest of the Tavor SRQ handle.  We can update
         * the following fields for use in further operations on the SRQ.
         */
        srq->srq_srqcrsrcp      = srqc;
        srq->srq_rsrcp          = rsrc;
        srq->srq_mrhdl          = mr;
        srq->srq_refcnt         = 0;
        srq->srq_is_umap        = srq_is_umap;
        srq->srq_uarpg          = (srq->srq_is_umap) ? uarpg : 0;
        srq->srq_umap_dhp       = (devmap_cookie_t)NULL;
        srq->srq_pdhdl          = pd;
        srq->srq_wq_lastwqeindx = -1;
        srq->srq_wq_bufsz       = (1 << log_srq_size);
        srq->srq_wq_buf         = buf;
        srq->srq_desc_off       = srq_desc_off;
        srq->srq_hdlrarg        = (void *)ibt_srqhdl;
        srq->srq_state          = 0;
        srq->srq_real_sizes.srq_wr_sz = (1 << log_srq_size);
        srq->srq_real_sizes.srq_sgl_sz = srq->srq_wq_sgl;

        /* Determine if a later ddi_dma_sync will be necessary */
        srq->srq_sync = TAVOR_SRQ_IS_SYNC_REQ(state, srq->srq_wqinfo);

        /*
         * Put the SRQ handle in the Tavor SRQNum-to-SRQhdl list.  Then fill
         * in the "srqhdl" argument and return success.
         */
        ASSERT(state->ts_srqhdl[srqc->tr_indx] == NULL);
        state->ts_srqhdl[srqc->tr_indx] = srq;

        /*
         * If this is a user-mappable SRQ, then we need to insert the
         * previously allocated entry into the "userland resources database".
         * This will allow for later lookup during devmap() (i.e. mmap())
         * calls.
         */
        if (srq->srq_is_umap) {
                tavor_umap_db_add(umapdb);
        } else {
                mutex_enter(&srq->srq_wrid_wql->wql_lock);
                tavor_wrid_list_srq_init(srq->srq_wridlist, srq, 0);
                mutex_exit(&srq->srq_wrid_wql->wql_lock);
        }

        *srqhdl = srq;

        TAVOR_TNF_EXIT(tavor_srq_alloc);
        return (DDI_SUCCESS);

/*
 * The following is cleanup for all possible failure cases in this routine
 */
srqalloc_fail8:
        kmem_free(srq->srq_wridlist->wl_wre, srq->srq_wridlist->wl_size *
            sizeof (tavor_wrid_entry_t));
        kmem_free(srq->srq_wridlist, sizeof (tavor_wrid_list_hdr_t));
srqalloc_fail7:
        tavor_wql_refcnt_dec(srq->srq_wrid_wql);
srqalloc_fail6:
        if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
            TAVOR_SLEEPFLAG_FOR_CONTEXT()) != DDI_SUCCESS) {
                TAVOR_WARNING(state, "failed to deregister SRQ memory");
        }
srqalloc_fail5:
        tavor_queue_free(state, &srq->srq_wqinfo);
srqalloc_fail4:
        if (srq_is_umap) {
                tavor_umap_db_free(umapdb);
        }
srqalloc_fail3:
        tavor_rsrc_free(state, &rsrc);
srqalloc_fail2:
        tavor_rsrc_free(state, &srqc);
srqalloc_fail1:
        tavor_pd_refcnt_dec(pd);
srqalloc_fail:
        TNF_PROBE_1(tavor_srq_alloc_fail, TAVOR_TNF_ERROR, "",
            tnf_string, msg, errormsg);
        TAVOR_TNF_EXIT(tavor_srq_alloc);
        return (status);
}


/*
 * tavor_srq_free()
 *    Context: Can be called only from user or kernel context.
 */
/* ARGSUSED */
int
tavor_srq_free(tavor_state_t *state, tavor_srqhdl_t *srqhdl, uint_t sleepflag)
{
        tavor_rsrc_t            *srqc, *rsrc;
        tavor_umap_db_entry_t   *umapdb;
        uint64_t                value;
        tavor_srqhdl_t          srq;
        tavor_mrhdl_t           mr;
        tavor_pdhdl_t           pd;
        tavor_hw_srqc_t         srqc_entry;
        uint32_t                srqnum;
        uint32_t                size;
        uint_t                  maxprot;
        int                     status;

        TAVOR_TNF_ENTER(tavor_srq_free);

        /*
         * Pull all the necessary information from the Tavor Shared Receive
         * Queue handle.  This is necessary here because the resource for the
         * SRQ handle is going to be freed up as part of this operation.
         */
        srq     = *srqhdl;
        mutex_enter(&srq->srq_lock);
        srqc    = srq->srq_srqcrsrcp;
        rsrc    = srq->srq_rsrcp;
        pd      = srq->srq_pdhdl;
        mr      = srq->srq_mrhdl;
        srqnum  = srq->srq_srqnum;

        /*
         * If any QPs are still associated with (i.e. still referencing) the
         * SRQ, then return an error.  Otherwise, we continue below with the
         * SRQ lock held.
         */
        if (srq->srq_refcnt != 0) {
                mutex_exit(&srq->srq_lock);
                TNF_PROBE_1(tavor_srq_free_refcnt_fail, TAVOR_TNF_ERROR, "",
                    tnf_int, refcnt, srq->srq_refcnt);
                TAVOR_TNF_EXIT(tavor_srq_free);
                return (IBT_SRQ_IN_USE);
        }

        /*
         * If this was a user-mappable SRQ, then we need to remove its entry
         * from the "userland resources database".  If it is also currently
         * mmap()'d out to a user process, then we need to call
         * devmap_devmem_remap() to remap the SRQ memory to an invalid mapping.
         * We also need to invalidate the SRQ tracking information for the
         * user mapping.
         */
        if (srq->srq_is_umap) {
                status = tavor_umap_db_find(state->ts_instance, srq->srq_srqnum,
                    MLNX_UMAP_SRQMEM_RSRC, &value, TAVOR_UMAP_DB_REMOVE,
                    &umapdb);
                if (status != DDI_SUCCESS) {
                        mutex_exit(&srq->srq_lock);
                        TAVOR_WARNING(state, "failed to find in database");
                        TAVOR_TNF_EXIT(tavor_srq_free);
                        return (ibc_get_ci_failure(0));
                }
                tavor_umap_db_free(umapdb);
                if (srq->srq_umap_dhp != NULL) {
                        maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
                        status = devmap_devmem_remap(srq->srq_umap_dhp,
                            state->ts_dip, 0, 0, srq->srq_wqinfo.qa_size,
                            maxprot, DEVMAP_MAPPING_INVALID, NULL);
                        if (status != DDI_SUCCESS) {
                                mutex_exit(&srq->srq_lock);
                                TAVOR_WARNING(state, "failed in SRQ memory "
                                    "devmap_devmem_remap()");
                                TAVOR_TNF_EXIT(tavor_srq_free);
                                return (ibc_get_ci_failure(0));
                        }
                        srq->srq_umap_dhp = (devmap_cookie_t)NULL;
                }
        }

        /*
         * Put NULL into the Tavor SRQNum-to-SRQHdl list.  This will allow any
         * in-progress events to detect that the SRQ corresponding to this
         * number has been freed.
         */
        state->ts_srqhdl[srqc->tr_indx] = NULL;

        mutex_exit(&srq->srq_lock);

        /*
         * Reclaim the SRQC entry from hardware (using the Tavor HW2SW_SRQ
         * firmware command).  If the ownership transfer fails for any reason,
         * then it is an indication that something (either in HW or SW) has
         * gone seriously wrong.
         */
        status = tavor_cmn_ownership_cmd_post(state, HW2SW_SRQ, &srqc_entry,
            sizeof (tavor_hw_srqc_t), srqnum, sleepflag);
        if (status != TAVOR_CMD_SUCCESS) {
                TAVOR_WARNING(state, "failed to reclaim SRQC ownership");
                cmn_err(CE_CONT, "Tavor: HW2SW_SRQ command failed: %08x\n",
                    status);
                TNF_PROBE_1(tavor_srq_free_hw2sw_srq_cmd_fail,
                    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
                TAVOR_TNF_EXIT(tavor_srq_free);
                return (IBT_FAILURE);
        }

        /*
         * Deregister the memory for the Shared Receive Queue.  If this fails
         * for any reason, then it is an indication that something (either
         * in HW or SW) has gone seriously wrong.  So we print a warning
         * message and return.
         */
        status = tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
            sleepflag);
        if (status != DDI_SUCCESS) {
                TAVOR_WARNING(state, "failed to deregister SRQ memory");
                TNF_PROBE_0(tavor_srq_free_dereg_mr_fail, TAVOR_TNF_ERROR, "");
                TAVOR_TNF_EXIT(tavor_srq_free);
                return (IBT_FAILURE);
        }

        /* Calculate the size and free the wridlist container */
        if (srq->srq_wridlist != NULL) {
                size = (srq->srq_wridlist->wl_size *
                    sizeof (tavor_wrid_entry_t));
                kmem_free(srq->srq_wridlist->wl_wre, size);
                kmem_free(srq->srq_wridlist, sizeof (tavor_wrid_list_hdr_t));

                /*
                 * Release reference to WQL; If this is the last reference,
                 * this call also has the side effect of freeing up the
                 * 'srq_wrid_wql' memory.
                 */
                tavor_wql_refcnt_dec(srq->srq_wrid_wql);
        }

        /* Free the memory for the SRQ */
        tavor_queue_free(state, &srq->srq_wqinfo);

        /* Free the Tavor SRQ Handle */
        tavor_rsrc_free(state, &rsrc);

        /* Free the SRQC entry resource */
        tavor_rsrc_free(state, &srqc);

        /* Decrement the reference count on the protection domain (PD) */
        tavor_pd_refcnt_dec(pd);

        /* Set the srqhdl pointer to NULL and return success */
        *srqhdl = NULL;

        TAVOR_TNF_EXIT(tavor_srq_free);
        return (DDI_SUCCESS);
}


/*
 * tavor_srq_modify()
 *    Context: Can be called only from user or kernel context.
 */
int
tavor_srq_modify(tavor_state_t *state, tavor_srqhdl_t srq, uint_t size,
    uint_t *real_size, uint_t sleepflag)
{
        tavor_qalloc_info_t     new_srqinfo, old_srqinfo;
        tavor_rsrc_t            *mtt, *mpt, *old_mtt;
        tavor_bind_info_t       bind;
        tavor_bind_info_t       old_bind;
        tavor_rsrc_pool_info_t  *rsrc_pool;
        tavor_mrhdl_t           mr;
        tavor_hw_mpt_t          mpt_entry;
        tavor_wrid_entry_t      *wre_new, *wre_old;
        uint64_t                mtt_ddrbaseaddr, mtt_addr;
        uint64_t                srq_desc_off;
        uint32_t                *buf, srq_old_bufsz;
        uint32_t                wqesz;
        uint_t                  max_srq_size;
        uint_t                  dma_xfer_mode, mtt_pgsize_bits;
        uint_t                  srq_sync, log_srq_size, maxprot;
        uint_t                  wq_location;
        int                     status;
        char                    *errormsg;

        TAVOR_TNF_ENTER(tavor_srq_modify);

        /*
         * Check the "inddr" flag.  This flag tells the driver whether the
         * SRQ's work queues should come from normal system memory or be
         * allocated from DDR memory.
         */
        wq_location = state->ts_cfg_profile->cp_srq_wq_inddr;

        /*
         * If the size requested is larger than the device capability, return
         * Insufficient Resources
         */
        max_srq_size = (1 << state->ts_cfg_profile->cp_log_max_srq_sz);
        if (size > max_srq_size) {
                TNF_PROBE_0(tavor_srq_modify_size_larger_than_maxsize,
                    TAVOR_TNF_ERROR, "");
                TAVOR_TNF_EXIT(tavor_srq_modify);
                return (IBT_HCA_WR_EXCEEDED);
        }

        /*
         * Calculate the appropriate size for the SRQ.
         * Note:  All Tavor SRQs must be a power-of-2 in size.  Also
         * they may not be any smaller than TAVOR_SRQ_MIN_SIZE.  This step
         * is to round the requested size up to the next highest power-of-2
         */
        size = max(size, TAVOR_SRQ_MIN_SIZE);
        log_srq_size = highbit(size);
        if (ISP2(size)) {
                log_srq_size = log_srq_size - 1;
        }

        /*
         * Next we verify that the rounded-up size is valid (i.e. consistent
         * with the device limits and/or software-configured limits).
         */
        if (log_srq_size > state->ts_cfg_profile->cp_log_max_srq_sz) {
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_HCA_WR_EXCEEDED, "max SRQ size");
                goto srqmodify_fail;
        }

        /*
         * Allocate the memory for the newly resized Shared Receive Queue.
         *
         * Note: If the SRQ is not user-mappable, then it may come from either
         * kernel system memory or from HCA-attached local DDR memory.
         *
         * Note2: We align this queue on a pagesize boundary.  This is required
         * to make sure that all the resulting IB addresses will start at 0,
         * for a zero-based queue.  By making sure we are aligned on at least a
         * page, any offset we use into our queue will be the same as it was
         * when we allocated it at tavor_srq_alloc() time.
         */
        wqesz = (1 << srq->srq_wq_log_wqesz);
        new_srqinfo.qa_size = (1 << log_srq_size) * wqesz;
        new_srqinfo.qa_alloc_align = PAGESIZE;
        new_srqinfo.qa_bind_align  = PAGESIZE;
        if (srq->srq_is_umap) {
                new_srqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
        } else {
                new_srqinfo.qa_location = wq_location;
        }
        status = tavor_queue_alloc(state, &new_srqinfo, sleepflag);
        if (status != DDI_SUCCESS) {
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed srq");
                goto srqmodify_fail;
        }
        buf = (uint32_t *)new_srqinfo.qa_buf_aligned;

        /*
         * Allocate the memory for the new WRE list.  This will be used later
         * when we resize the wridlist based on the new SRQ size.
         */
        wre_new = (tavor_wrid_entry_t *)kmem_zalloc((1 << log_srq_size) *
            sizeof (tavor_wrid_entry_t), sleepflag);
        if (wre_new == NULL) {
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE,
                    "failed wre_new alloc");
                goto srqmodify_fail;
        }

        /*
         * Fill in the "bind" struct.  This struct provides the majority
         * of the information that will be used to distinguish between an
         * "addr" binding (as is the case here) and a "buf" binding (see
         * below).  The "bind" struct is later passed to tavor_mr_mem_bind()
         * which does most of the "heavy lifting" for the Tavor memory
         * registration routines.
         */
        bzero(&bind, sizeof (tavor_bind_info_t));
        bind.bi_type  = TAVOR_BINDHDL_VADDR;
        bind.bi_addr  = (uint64_t)(uintptr_t)buf;
        bind.bi_len   = new_srqinfo.qa_size;
        bind.bi_as    = NULL;
        bind.bi_flags = ((sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP :
            IBT_MR_NOSLEEP) | IBT_MR_ENABLE_LOCAL_WRITE;
        if (srq->srq_is_umap) {
                bind.bi_bypass = state->ts_cfg_profile->cp_iommu_bypass;
        } else {
                if (wq_location == TAVOR_QUEUE_LOCATION_NORMAL) {
                        bind.bi_bypass =
                            state->ts_cfg_profile->cp_iommu_bypass;
                        dma_xfer_mode =
                            state->ts_cfg_profile->cp_streaming_consistent;
                        if (dma_xfer_mode == DDI_DMA_STREAMING) {
                                bind.bi_flags |= IBT_MR_NONCOHERENT;
                        }
                } else {
                        bind.bi_bypass = TAVOR_BINDMEM_BYPASS;
                }
        }
        status = tavor_mr_mtt_bind(state, &bind, new_srqinfo.qa_dmahdl, &mtt,
            &mtt_pgsize_bits);
        if (status != DDI_SUCCESS) {
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(status, "failed mtt bind");
                /* Free wre_new with the size it was allocated with */
                kmem_free(wre_new, (1 << log_srq_size) *
                    sizeof (tavor_wrid_entry_t));
                tavor_queue_free(state, &new_srqinfo);
                goto srqmodify_fail;
        }

        /*
         * Calculate the offset between the kernel virtual address space
         * and the IB virtual address space.  This will be used when
         * posting work requests to properly initialize each WQE.
         *
         * Note: bind addr is zero-based (from alloc) so we calculate the
         * correct new offset here.
         */
        bind.bi_addr = bind.bi_addr & ((1 << mtt_pgsize_bits) - 1);
        srq_desc_off = (uint64_t)(uintptr_t)new_srqinfo.qa_buf_aligned -
            (uint64_t)bind.bi_addr;

        /*
         * Get the base address for the MTT table.  This will be necessary
         * below when we are modifying the MPT entry.
         */
        rsrc_pool = &state->ts_rsrc_hdl[TAVOR_MTT];
        mtt_ddrbaseaddr = (uint64_t)(uintptr_t)rsrc_pool->rsrc_ddr_offset;

        /*
         * Fill in the MPT entry.  This is the final step before passing
         * ownership of the MPT entry to the Tavor hardware.  We use all of
         * the information collected/calculated above to fill in the
         * requisite portions of the MPT.
         */
        bzero(&mpt_entry, sizeof (tavor_hw_mpt_t));
        mpt_entry.reg_win_len   = bind.bi_len;
        mtt_addr = mtt_ddrbaseaddr + (mtt->tr_indx << TAVOR_MTT_SIZE_SHIFT);
        mpt_entry.mttseg_addr_h = mtt_addr >> 32;
        mpt_entry.mttseg_addr_l = mtt_addr >> 6;
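
        /*
         * Note: the low word of the MTT segment address is expressed in
         * 64-byte units (presumably one MTT segment of eight 8-byte
         * entries), hence the shift by 6 above.
         */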

        /*
         * Now we grab the SRQ lock.  Since we will be updating the actual
         * SRQ location and the producer/consumer indexes, we should hold
         * the lock.
         *
         * We do a TAVOR_NOSLEEP here (and below), though, because we are
         * holding the "srq_lock" and if we got raised to interrupt level
         * by priority inversion, we would not want to block in this routine
         * waiting for success.
         */
        mutex_enter(&srq->srq_lock);

        /*
         * Copy old entries to new buffer
         */
        srq_old_bufsz = srq->srq_wq_bufsz;
        bcopy(srq->srq_wq_buf, buf, srq_old_bufsz * wqesz);
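
        /*
         * The old WQEs occupy the first srq_old_bufsz slots of the new
         * (larger) ring; the wridlist update further below re-initializes
         * only the entries from srq_old_bufsz onward as new free space.
         */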

        /* Determine if a later ddi_dma_sync will be necessary */
        srq_sync = TAVOR_SRQ_IS_SYNC_REQ(state, srq->srq_wqinfo);

        /* Sync the entire "new" SRQ for use by hardware (if necessary) */
        if (srq_sync) {
                (void) ddi_dma_sync(bind.bi_dmahdl, 0,
                    new_srqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
        }

        /*
         * Setup MPT information for use in the MODIFY_MPT command
         */
        mr = srq->srq_mrhdl;
        mutex_enter(&mr->mr_lock);
        mpt = srq->srq_mrhdl->mr_mptrsrcp;

        /*
         * MODIFY_MPT
         *
         * If this fails for any reason, then it is an indication that
         * something (either in HW or SW) has gone seriously wrong.  So we
         * print a warning message and return.
         */
        status = tavor_modify_mpt_cmd_post(state, &mpt_entry, mpt->tr_indx,
            TAVOR_CMD_MODIFY_MPT_RESIZESRQ, sleepflag);
        if (status != TAVOR_CMD_SUCCESS) {
                cmn_err(CE_CONT, "Tavor: MODIFY_MPT command failed: %08x\n",
                    status);
                TNF_PROBE_1(tavor_mr_common_reg_sw2hw_mpt_cmd_fail,
                    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
                TAVOR_TNF_FAIL(status, "MODIFY_MPT command failed");
                /* Undo the new MTT binding (the old one remains in use) */
                (void) tavor_mr_mtt_unbind(state, &bind, mtt);
                /* Free wre_new with the size it was allocated with */
                kmem_free(wre_new, (1 << log_srq_size) *
                    sizeof (tavor_wrid_entry_t));
                tavor_queue_free(state, &new_srqinfo);
                mutex_exit(&mr->mr_lock);
                mutex_exit(&srq->srq_lock);
                return (ibc_get_ci_failure(0));
        }

        /*
         * Update the Tavor Shared Receive Queue handle with all the new
         * information.  At the same time, save away all the necessary
         * information for freeing up the old resources
         */
        old_srqinfo = srq->srq_wqinfo;
        old_mtt     = srq->srq_mrhdl->mr_mttrsrcp;
        bcopy(&srq->srq_mrhdl->mr_bindinfo, &old_bind,
            sizeof (tavor_bind_info_t));

        /* Now set the new info */
        srq->srq_wqinfo   = new_srqinfo;
        srq->srq_wq_buf   = buf;
        srq->srq_wq_bufsz = (1 << log_srq_size);
        bcopy(&bind, &srq->srq_mrhdl->mr_bindinfo, sizeof (tavor_bind_info_t));
        srq->srq_mrhdl->mr_mttrsrcp = mtt;
        srq->srq_desc_off = srq_desc_off;
        srq->srq_real_sizes.srq_wr_sz = (1 << log_srq_size);

        /* Update MR mtt pagesize */
        mr->mr_logmttpgsz = mtt_pgsize_bits;
        mutex_exit(&mr->mr_lock);

        if (srq->srq_wrid_wql != NULL) {
                mutex_enter(&srq->srq_wrid_wql->wql_lock);
        }

        /*
         * Initialize the new wridlist, if needed.
         *
         * If a wridlist is already set up on the SRQ (i.e. a QP associated
         * with the SRQ has moved "from_reset"), then we must update it based
         * on the new SRQ size.  The new list of Work Request ID Entries was
         * allocated above; here we copy the old entries over to it and
         * re-initialize the SRQ wridlist in the non-umap case.
         */
        wre_old = NULL;
        if (srq->srq_wridlist != NULL) {
                wre_old = srq->srq_wridlist->wl_wre;

                bcopy(wre_old, wre_new, srq_old_bufsz *
                    sizeof (tavor_wrid_entry_t));

                /* Setup new sizes in wre */
                srq->srq_wridlist->wl_wre = wre_new;
                srq->srq_wridlist->wl_size = srq->srq_wq_bufsz;

                if (!srq->srq_is_umap) {
                        tavor_wrid_list_srq_init(srq->srq_wridlist, srq,
                            srq_old_bufsz);
                }
        }

        if (srq->srq_wrid_wql != NULL) {
                mutex_exit(&srq->srq_wrid_wql->wql_lock);
        }

        /*
         * If the "old" SRQ was a user-mappable SRQ that is currently mmap()'d
         * out to a user process, then we need to call devmap_devmem_remap()
         * to invalidate the mapping to the SRQ memory.  We also need to
         * invalidate the SRQ tracking information for the user mapping.
         *
         * Note: The remap really shouldn't ever fail.  If it does, it is an
         * indication that something has gone seriously wrong.  So we print
         * a warning message and return error (knowing, of course, that the
         * "old" SRQ memory will be leaked).
         */
        if ((srq->srq_is_umap) && (srq->srq_umap_dhp != NULL)) {
                maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
                status = devmap_devmem_remap(srq->srq_umap_dhp,
                    state->ts_dip, 0, 0, srq->srq_wqinfo.qa_size, maxprot,
                    DEVMAP_MAPPING_INVALID, NULL);
                if (status != DDI_SUCCESS) {
                        mutex_exit(&srq->srq_lock);
                        TAVOR_WARNING(state, "failed in SRQ memory "
                            "devmap_devmem_remap()");
                        /* We can, however, free the memory for old wre */
                        if (wre_old != NULL) {
                                kmem_free(wre_old, srq_old_bufsz *
                                    sizeof (tavor_wrid_entry_t));
                        }
                        TAVOR_TNF_EXIT(tavor_srq_modify);
                        return (ibc_get_ci_failure(0));
                }
                srq->srq_umap_dhp = (devmap_cookie_t)NULL;
        }

        /*
         * Drop the SRQ lock now.  The only thing left to do is to free up
         * the old resources.
         */
        mutex_exit(&srq->srq_lock);

        /*
         * Unbind the MTT entries.
         */
        status = tavor_mr_mtt_unbind(state, &old_bind, old_mtt);
        if (status != DDI_SUCCESS) {
                TAVOR_WARNING(state, "failed to unbind old SRQ memory");
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
                    "failed to unbind (old)");
                goto srqmodify_fail;
        }

        /* Free the memory for old wre */
        if (wre_old != NULL) {
                kmem_free(wre_old, srq_old_bufsz *
                    sizeof (tavor_wrid_entry_t));
        }

        /* Free the memory for the old SRQ */
        tavor_queue_free(state, &old_srqinfo);

        /*
         * Fill in the return arguments (if necessary).  This includes the
         * real new shared receive queue size.
         */
        if (real_size != NULL) {
                *real_size = (1 << log_srq_size);
        }

        TAVOR_TNF_EXIT(tavor_srq_modify);
        return (DDI_SUCCESS);

srqmodify_fail:
        TNF_PROBE_1(tavor_srq_modify_fail, TAVOR_TNF_ERROR, "",
            tnf_string, msg, errormsg);
        TAVOR_TNF_EXIT(tavor_srq_modify);
        return (status);
}


/*
 * tavor_srq_refcnt_inc()
 *    Context: Can be called from interrupt or base context.
 */
void
tavor_srq_refcnt_inc(tavor_srqhdl_t srq)
{
        mutex_enter(&srq->srq_lock);
        TNF_PROBE_1_DEBUG(tavor_srq_refcnt_inc, TAVOR_TNF_TRACE, "",
            tnf_uint, refcnt, srq->srq_refcnt);
        srq->srq_refcnt++;
        mutex_exit(&srq->srq_lock);
}


/*
 * tavor_srq_refcnt_dec()
 *    Context: Can be called from interrupt or base context.
 */
void
tavor_srq_refcnt_dec(tavor_srqhdl_t srq)
{
        mutex_enter(&srq->srq_lock);
        srq->srq_refcnt--;
        TNF_PROBE_1_DEBUG(tavor_srq_refcnt_dec, TAVOR_TNF_TRACE, "",
            tnf_uint, refcnt, srq->srq_refcnt);
        mutex_exit(&srq->srq_lock);
}


/*
 * tavor_srqhdl_from_srqnum()
 *    Context: Can be called from interrupt or base context.
 *
 *    This routine is important because changing the unconstrained
 *    portion of the SRQ number is critical to the detection of a
 *    potential race condition in the SRQ handler code (i.e. the case
 *    where an SRQ is freed and alloc'd again before an event for the
 *    "old" SRQ can be handled).
 *
 *    While this is not a perfect solution (not sure that one exists)
 *    it does help to mitigate the chance that this race condition will
 *    cause us to deliver a "stale" event to the new SRQ owner.  Note:
 *    this solution does not scale well because the number of constrained
 *    bits increases (and, hence, the number of unconstrained bits
 *    decreases) as the number of supported SRQs grows.  For small and
 *    intermediate values, it should hopefully provide sufficient
 *    protection.
 */
tavor_srqhdl_t
tavor_srqhdl_from_srqnum(tavor_state_t *state, uint_t srqnum)
{
        uint_t  srqindx, srqmask;

        /* Calculate the SRQ table index from the srqnum */
        srqmask = (1 << state->ts_cfg_profile->cp_log_num_srq) - 1;
        srqindx = srqnum & srqmask;
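
        /*
         * For example (illustrative values): with cp_log_num_srq = 10 the
         * mask is 0x3ff, so an event-reported srqnum of 0x1404 maps to
         * table index 0x004; the bits above the mask are the unconstrained
         * portion referred to above.
         */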
        return (state->ts_srqhdl[srqindx]);
}


/*
 * tavor_srq_sgl_to_logwqesz()
 *    Context: Can be called from interrupt or base context.
 */
static void
tavor_srq_sgl_to_logwqesz(tavor_state_t *state, uint_t num_sgl,
    tavor_qp_wq_type_t wq_type, uint_t *logwqesz, uint_t *max_sgl)
{
        uint_t  max_size, log2, actual_sgl;

        TAVOR_TNF_ENTER(tavor_srq_sgl_to_logwqesz);

        switch (wq_type) {
        case TAVOR_QP_WQ_TYPE_RECVQ:
                /*
                 * Use requested maximum SGL to calculate max descriptor size
                 * (while guaranteeing that the descriptor size is a
                 * power-of-2 cachelines).
                 */
                max_size = (TAVOR_QP_WQE_MLX_RCV_HDRS + (num_sgl << 4));
                log2 = highbit(max_size);
                if (ISP2(max_size)) {
                        log2 = log2 - 1;
                }

                /* Make sure descriptor is at least the minimum size */
                log2 = max(log2, TAVOR_QP_WQE_LOG_MINIMUM);

                /* Calculate actual number of SGL (given WQE size) */
                actual_sgl = ((1 << log2) - TAVOR_QP_WQE_MLX_RCV_HDRS) >> 4;
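
                /*
                 * Worked example (purely illustrative; assumes a 16-byte
                 * receive header for the sake of the arithmetic): with
                 * num_sgl = 8, each 16-byte SGL entry contributes
                 * num_sgl << 4 = 128 bytes, so max_size = 16 + 128 = 144;
                 * highbit(144) = 8, so the WQE rounds up to 256 bytes and
                 * actual_sgl becomes (256 - 16) >> 4 = 15.
                 */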
                break;

        default:
                TAVOR_WARNING(state, "unexpected work queue type");
                TNF_PROBE_0(tavor_srq_sgl_to_logwqesz_inv_wqtype_fail,
                    TAVOR_TNF_ERROR, "");
                /*
                 * Fall back to the minimum WQE size so that the return
                 * values below are never left uninitialized.
                 */
                log2 = TAVOR_QP_WQE_LOG_MINIMUM;
                actual_sgl = 0;
                break;
        }

        /* Fill in the return values */
        *logwqesz = log2;
        *max_sgl  = min(state->ts_cfg_profile->cp_srq_max_sgl, actual_sgl);

        TAVOR_TNF_EXIT(tavor_srq_sgl_to_logwqesz);
}