/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * tavor_cq.c
 *    Tavor Completion Queue Processing Routines
 *
 *    Implements all the routines necessary for allocating, freeing, resizing,
 *    and handling the completion type events that the Tavor hardware can
 *    generate.
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/bitmap.h>
#include <sys/sysmacros.h>

#include <sys/ib/adapters/tavor/tavor.h>

static void tavor_cq_doorbell(tavor_state_t *state, uint32_t cq_cmd,
    uint32_t cqn, uint32_t cq_param);
#pragma inline(tavor_cq_doorbell)
static int tavor_cq_cqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
    tavor_hw_cqe_t *cqe, ibt_wc_t *wc);
static int tavor_cq_errcqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
    tavor_hw_cqe_t *cqe, ibt_wc_t *wc);
static void tavor_cqe_sync(tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe,
    uint_t flag);
static void tavor_cq_resize_helper(tavor_cqhdl_t cq, tavor_hw_cqe_t *new_cqbuf,
    uint32_t old_cons_indx, uint32_t num_newcqe);

/*
 * tavor_cq_alloc()
 *    Context: Can be called only from user or kernel context.
 */
int
tavor_cq_alloc(tavor_state_t *state, ibt_cq_hdl_t ibt_cqhdl,
    ibt_cq_attr_t *cq_attr, uint_t *actual_size, tavor_cqhdl_t *cqhdl,
    uint_t sleepflag)
{
        tavor_rsrc_t            *cqc, *rsrc;
        tavor_umap_db_entry_t   *umapdb;
        tavor_hw_cqc_t          cqc_entry;
        tavor_cqhdl_t           cq;
        ibt_mr_attr_t           mr_attr;
        tavor_mr_options_t      op;
        tavor_pdhdl_t           pd;
        tavor_mrhdl_t           mr;
        tavor_hw_cqe_t          *buf;
        uint64_t                addr, value;
        uint32_t                log_cq_size, lkey, uarpg;
        uint_t                  dma_xfer_mode, cq_sync, cq_is_umap;
        int                     status, i, flag;
        char                    *errormsg;

        TAVOR_TNF_ENTER(tavor_cq_alloc);

        /*
         * Determine whether the CQ is being allocated for userland access or
         * for kernel access.  If the CQ is being allocated for userland
         * access, then look up the UAR doorbell page number for the current
         * process.  Note:  If this is not found (e.g. if the process has not
         * previously open()'d the Tavor driver), then an error is returned.
         */
        cq_is_umap = (cq_attr->cq_flags & IBT_CQ_USER_MAP) ? 1 : 0;
        if (cq_is_umap) {
                status = tavor_umap_db_find(state->ts_instance, ddi_get_pid(),
                    MLNX_UMAP_UARPG_RSRC, &value, 0, NULL);
                if (status != DDI_SUCCESS) {
                        /* Set "status" and "errormsg" and goto failure */
                        TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "failed UAR page");
                        goto cqalloc_fail;
                }
                uarpg = ((tavor_rsrc_t *)(uintptr_t)value)->tr_indx;
        }

        /* Use the internal protection domain (PD) for setting up CQs */
        pd = state->ts_pdhdl_internal;

        /* Increment the reference count on the protection domain (PD) */
        tavor_pd_refcnt_inc(pd);

        /*
         * Allocate a CQ context entry.  This will be filled in with all
         * the necessary parameters to define the Completion Queue.  And then
         * ownership will be passed to the hardware in the final step
         * below.  If we fail here, we must undo the protection domain
         * reference count.
         */
        status = tavor_rsrc_alloc(state, TAVOR_CQC, 1, sleepflag, &cqc);
        if (status != DDI_SUCCESS) {
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed CQ context");
                goto cqalloc_fail1;
        }

        /*
         * Allocate the software structure for tracking the completion queue
         * (i.e. the Tavor Completion Queue handle).  If we fail here, we must
         * undo the protection domain reference count and the previous
         * resource allocation.
         */
        status = tavor_rsrc_alloc(state, TAVOR_CQHDL, 1, sleepflag, &rsrc);
        if (status != DDI_SUCCESS) {
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed CQ handle");
                goto cqalloc_fail2;
        }
        cq = (tavor_cqhdl_t)rsrc->tr_addr;
        cq->cq_is_umap = cq_is_umap;

        /* Use the index as CQ number */
        cq->cq_cqnum = cqc->tr_indx;

        /*
         * If this will be a user-mappable CQ, then allocate an entry for
         * the "userland resources database".  This will later be added to
         * the database (after all further CQ operations are successful).
         * If we fail here, we must undo the reference counts and the
         * previous resource allocation.
         */
        if (cq->cq_is_umap) {
                umapdb = tavor_umap_db_alloc(state->ts_instance, cq->cq_cqnum,
                    MLNX_UMAP_CQMEM_RSRC, (uint64_t)(uintptr_t)rsrc);
                if (umapdb == NULL) {
                        /* Set "status" and "errormsg" and goto failure */
                        TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umap add");
                        goto cqalloc_fail3;
                }
        }

        /*
         * Calculate the appropriate size for the completion queue.
         * Note:  All Tavor CQs must be a power-of-2 minus 1 in size.  Also
         * they may not be any smaller than TAVOR_CQ_MIN_SIZE.  This step is
         * to round the requested size up to the next highest power-of-2
         */
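        /*
         * (Illustrative arithmetic: a request for 1000 CQEs becomes
         * max(1000, TAVOR_CQ_MIN_SIZE), and highbit(1000) yields 10, so a
         * 2^10 = 1024-entry queue is allocated, of which 1023 entries are
         * usable.  A request for exactly 1024 rounds up to a 2048-entry
         * queue, since only 2^n - 1 entries of a 2^n queue are usable.)
         */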
        cq_attr->cq_size = max(cq_attr->cq_size, TAVOR_CQ_MIN_SIZE);
        log_cq_size = highbit(cq_attr->cq_size);

        /*
         * Next we verify that the rounded-up size is valid (i.e. consistent
         * with the device limits and/or software-configured limits)
         */
        if (log_cq_size > state->ts_cfg_profile->cp_log_max_cq_sz) {
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_HCA_CQ_EXCEEDED, "max CQ size");
                goto cqalloc_fail4;
        }

        /*
         * Allocate the memory for the Completion Queue.
         *
         * Note: Although we use the common queue allocation routine, we
         * always specify TAVOR_QUEUE_LOCATION_NORMAL (i.e. CQ located in
         * kernel system memory) for kernel CQs because it would be
         * inefficient to have CQs located in DDR memory.  This is primarily
         * because CQs are read from (by software) more than they are written
         * to. (We always specify TAVOR_QUEUE_LOCATION_USERLAND for all
         * user-mappable CQs for a similar reason.)
         * It is also worth noting that, unlike Tavor QP work queues,
         * completion queues do not have the same strict alignment
         * requirements.  It is sufficient for the CQ memory to be both
         * aligned to and bound to addresses which are a multiple of CQE size.
         */
        cq->cq_cqinfo.qa_size = (1 << log_cq_size) * sizeof (tavor_hw_cqe_t);
        cq->cq_cqinfo.qa_alloc_align = sizeof (tavor_hw_cqe_t);
        cq->cq_cqinfo.qa_bind_align  = sizeof (tavor_hw_cqe_t);
        if (cq->cq_is_umap) {
                cq->cq_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
        } else {
                cq->cq_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_NORMAL;
        }
        status = tavor_queue_alloc(state, &cq->cq_cqinfo, sleepflag);
        if (status != DDI_SUCCESS) {
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed completion queue");
                goto cqalloc_fail4;
        }
        buf = (tavor_hw_cqe_t *)cq->cq_cqinfo.qa_buf_aligned;

        /*
         * Initialize each of the Completion Queue Entries (CQE) by setting
         * their ownership to hardware ("owner" bit set to HW).  This is in
         * preparation for the final transfer of ownership (below) of the
         * CQ context itself.
         */
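        /*
         * (Ownership convention, as used by tavor_cq_poll() below: HW-owned
         * CQEs are skipped by the poll loop; the hardware hands a CQE back
         * to software when it writes a completion, and software returns the
         * entry to HW ownership once it has been consumed.)
         */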
        for (i = 0; i < (1 << log_cq_size); i++) {
                TAVOR_CQE_OWNER_SET_HW(cq, &buf[i]);
        }

        /*
         * Register the memory for the CQ.  The memory for the CQ must
         * be registered in the Tavor TPT tables.  This gives us the LKey
         * to specify in the CQ context below.  Note: If this is a user-
         * mappable CQ, then we will force DDI_DMA_CONSISTENT mapping.
         */
        flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP : IBT_MR_NOSLEEP;
        mr_attr.mr_vaddr = (uint64_t)(uintptr_t)buf;
        mr_attr.mr_len   = cq->cq_cqinfo.qa_size;
        mr_attr.mr_as    = NULL;
        mr_attr.mr_flags = flag | IBT_MR_ENABLE_LOCAL_WRITE;
        if (cq->cq_is_umap) {
                dma_xfer_mode = DDI_DMA_CONSISTENT;
        } else {
                dma_xfer_mode = state->ts_cfg_profile->cp_streaming_consistent;
        }
        if (dma_xfer_mode == DDI_DMA_STREAMING) {
                mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
        }
        op.mro_bind_type   = state->ts_cfg_profile->cp_iommu_bypass;
        op.mro_bind_dmahdl = cq->cq_cqinfo.qa_dmahdl;
        op.mro_bind_override_addr = 0;
        status = tavor_mr_register(state, pd, &mr_attr, &mr, &op);
        if (status != DDI_SUCCESS) {
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed register mr");
                goto cqalloc_fail5;
        }
        addr = mr->mr_bindinfo.bi_addr;
        lkey = mr->mr_lkey;

        /* Determine if later ddi_dma_sync will be necessary */
        cq_sync = TAVOR_CQ_IS_SYNC_REQ(state, cq->cq_cqinfo);

        /* Sync entire CQ for use by the hardware (if necessary). */
        if (cq_sync) {
                (void) ddi_dma_sync(mr->mr_bindinfo.bi_dmahdl, 0,
                    cq->cq_cqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
        }

        /*
         * Fill in the CQC entry.  This is the final step before passing
         * ownership of the CQC entry to the Tavor hardware.  We use all of
         * the information collected/calculated above to fill in the
         * requisite portions of the CQC.  Note: If this CQ is going to be
         * used for userland access, then we need to set the UAR page number
         * appropriately (otherwise it's a "don't care")
         */
        bzero(&cqc_entry, sizeof (tavor_hw_cqc_t));
        cq->cq_eqnum            = TAVOR_CQ_EQNUM_GET(cq->cq_cqnum);
        cq->cq_erreqnum         = TAVOR_CQ_ERREQNUM_GET(cq->cq_cqnum);
        cqc_entry.xlat          = TAVOR_VA2PA_XLAT_ENABLED;
        cqc_entry.state         = TAVOR_CQ_DISARMED;
        cqc_entry.start_addr_h  = (addr >> 32);
        cqc_entry.start_addr_l  = (addr & 0xFFFFFFFF);
        cqc_entry.log_cq_sz     = log_cq_size;
        if (cq->cq_is_umap) {
                cqc_entry.usr_page = uarpg;
        } else {
                cqc_entry.usr_page = 0;
        }
        cqc_entry.pd            = pd->pd_pdnum;
        cqc_entry.lkey          = lkey;
        cqc_entry.e_eqn         = cq->cq_erreqnum;
        cqc_entry.c_eqn         = cq->cq_eqnum;
        cqc_entry.cqn           = cq->cq_cqnum;

        /*
         * Write the CQC entry to hardware.  Lastly, we pass ownership of
         * the entry to the hardware (using the Tavor SW2HW_CQ firmware
         * command).  Note: In general, this operation shouldn't fail.  But
         * if it does, we have to undo everything we've done above before
         * returning error.
         */
        status = tavor_cmn_ownership_cmd_post(state, SW2HW_CQ, &cqc_entry,
            sizeof (tavor_hw_cqc_t), cq->cq_cqnum, sleepflag);
        if (status != TAVOR_CMD_SUCCESS) {
                cmn_err(CE_CONT, "Tavor: SW2HW_CQ command failed: %08x\n",
                    status);
                TNF_PROBE_1(tavor_cq_alloc_sw2hw_cq_cmd_fail,
                    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(ibc_get_ci_failure(0), "tavor SW2HW_CQ command");
                goto cqalloc_fail6;
        }

        /*
         * Fill in the rest of the Tavor Completion Queue handle.  Having
         * successfully transferred ownership of the CQC, we can update the
         * following fields for use in further operations on the CQ.
         */
        cq->cq_cqcrsrcp         = cqc;
        cq->cq_rsrcp            = rsrc;
        cq->cq_consindx         = 0;
        cq->cq_buf              = buf;
        cq->cq_bufsz            = (1 << log_cq_size);
        cq->cq_mrhdl            = mr;
        cq->cq_sync             = cq_sync;
        cq->cq_refcnt           = 0;
        cq->cq_is_special       = 0;
        cq->cq_uarpg            = uarpg;
        cq->cq_umap_dhp         = (devmap_cookie_t)NULL;
        avl_create(&cq->cq_wrid_wqhdr_avl_tree, tavor_wrid_wqhdr_compare,
            sizeof (struct tavor_workq_hdr_s),
            offsetof(struct tavor_workq_hdr_s, wq_avl_link));

        cq->cq_wrid_reap_head   = NULL;
        cq->cq_wrid_reap_tail   = NULL;
        cq->cq_hdlrarg          = (void *)ibt_cqhdl;

        /*
         * Put CQ handle in Tavor CQNum-to-CQHdl list.  Then fill in the
         * "actual_size" and "cqhdl" and return success
         */
        ASSERT(state->ts_cqhdl[cqc->tr_indx] == NULL);
        state->ts_cqhdl[cqc->tr_indx] = cq;

        /*
         * If this is a user-mappable CQ, then we need to insert the previously
         * allocated entry into the "userland resources database".  This will
         * allow for later lookup during devmap() (i.e. mmap()) calls.
         */
        if (cq->cq_is_umap) {
                tavor_umap_db_add(umapdb);
        }

        /*
         * Fill in the return arguments (if necessary).  This includes the
         * real completion queue size.
         */
        if (actual_size != NULL) {
                *actual_size = (1 << log_cq_size) - 1;
        }
        *cqhdl = cq;

        TAVOR_TNF_EXIT(tavor_cq_alloc);
        return (DDI_SUCCESS);

/*
 * The following is cleanup for all possible failure cases in this routine
 */
cqalloc_fail6:
        if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
            sleepflag) != DDI_SUCCESS) {
                TAVOR_WARNING(state, "failed to deregister CQ memory");
        }
cqalloc_fail5:
        tavor_queue_free(state, &cq->cq_cqinfo);
cqalloc_fail4:
        if (cq_is_umap) {
                tavor_umap_db_free(umapdb);
        }
cqalloc_fail3:
        tavor_rsrc_free(state, &rsrc);
cqalloc_fail2:
        tavor_rsrc_free(state, &cqc);
cqalloc_fail1:
        tavor_pd_refcnt_dec(pd);
cqalloc_fail:
        TNF_PROBE_1(tavor_cq_alloc_fail, TAVOR_TNF_ERROR, "",
            tnf_string, msg, errormsg);
        TAVOR_TNF_EXIT(tavor_cq_alloc);
        return (status);
}


/*
 * tavor_cq_free()
 *    Context: Can be called only from user or kernel context.
 */
/* ARGSUSED */
int
tavor_cq_free(tavor_state_t *state, tavor_cqhdl_t *cqhdl, uint_t sleepflag)
{
        tavor_rsrc_t            *cqc, *rsrc;
        tavor_umap_db_entry_t   *umapdb;
        tavor_hw_cqc_t          cqc_entry;
        tavor_pdhdl_t           pd;
        tavor_mrhdl_t           mr;
        tavor_cqhdl_t           cq;
        uint32_t                cqnum;
        uint64_t                value;
        uint_t                  maxprot;
        int                     status;

        TAVOR_TNF_ENTER(tavor_cq_free);

        /*
         * Pull all the necessary information from the Tavor Completion Queue
         * handle.  This is necessary here because the resource for the
         * CQ handle is going to be freed up as part of this operation.
         */
        cq      = *cqhdl;
        mutex_enter(&cq->cq_lock);
        cqc     = cq->cq_cqcrsrcp;
        rsrc    = cq->cq_rsrcp;
        pd      = state->ts_pdhdl_internal;
        mr      = cq->cq_mrhdl;
        cqnum   = cq->cq_cqnum;

        /*
         * If there are work queues still associated with the CQ, then return
         * an error.  Otherwise, we continue to hold the CQ lock for the
         * teardown steps below.
         */
        if (cq->cq_refcnt != 0) {
                mutex_exit(&cq->cq_lock);
                TNF_PROBE_1(tavor_cq_free_refcnt_fail, TAVOR_TNF_ERROR, "",
                    tnf_int, refcnt, cq->cq_refcnt);
                TAVOR_TNF_EXIT(tavor_cq_free);
                return (IBT_CQ_BUSY);
        }

        /*
         * If this was a user-mappable CQ, then we need to remove its entry
         * from the "userland resources database".  If it is also currently
         * mmap()'d out to a user process, then we need to call
         * devmap_devmem_remap() to remap the CQ memory to an invalid mapping.
         * We also need to invalidate the CQ tracking information for the
         * user mapping.
         */
        if (cq->cq_is_umap) {
                status = tavor_umap_db_find(state->ts_instance, cqnum,
                    MLNX_UMAP_CQMEM_RSRC, &value, TAVOR_UMAP_DB_REMOVE,
                    &umapdb);
                if (status != DDI_SUCCESS) {
                        mutex_exit(&cq->cq_lock);
                        TAVOR_WARNING(state, "failed to find in database");
                        TAVOR_TNF_EXIT(tavor_cq_free);
                        return (ibc_get_ci_failure(0));
                }
                tavor_umap_db_free(umapdb);
                if (cq->cq_umap_dhp != NULL) {
                        maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
                        status = devmap_devmem_remap(cq->cq_umap_dhp,
                            state->ts_dip, 0, 0, cq->cq_cqinfo.qa_size,
                            maxprot, DEVMAP_MAPPING_INVALID, NULL);
                        if (status != DDI_SUCCESS) {
                                mutex_exit(&cq->cq_lock);
                                TAVOR_WARNING(state, "failed in CQ memory "
                                    "devmap_devmem_remap()");
                                TAVOR_TNF_EXIT(tavor_cq_free);
                                return (ibc_get_ci_failure(0));
                        }
                        cq->cq_umap_dhp = (devmap_cookie_t)NULL;
                }
        }

        /*
         * Put NULL into the Tavor CQNum-to-CQHdl list.  This will allow any
         * in-progress events to detect that the CQ corresponding to this
         * number has been freed.
         */
        state->ts_cqhdl[cqc->tr_indx] = NULL;

        /*
         * While we hold the CQ lock, do a "forced reap" of the workQ WRID
         * list.  This cleans up all the structures associated with the WRID
         * processing for this CQ.  Once we complete, drop the lock and finish
         * the deallocation of the CQ.
         */
        tavor_wrid_cq_force_reap(cq);

        mutex_exit(&cq->cq_lock);

        /*
         * Reclaim CQC entry from hardware (using the Tavor HW2SW_CQ
         * firmware command).  If the ownership transfer fails for any reason,
         * then it is an indication that something (either in HW or SW) has
         * gone seriously wrong.
         */
        status = tavor_cmn_ownership_cmd_post(state, HW2SW_CQ, &cqc_entry,
            sizeof (tavor_hw_cqc_t), cqnum, sleepflag);
        if (status != TAVOR_CMD_SUCCESS) {
                TAVOR_WARNING(state, "failed to reclaim CQC ownership");
                cmn_err(CE_CONT, "Tavor: HW2SW_CQ command failed: %08x\n",
                    status);
                TNF_PROBE_1(tavor_cq_free_hw2sw_cq_cmd_fail,
                    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
                TAVOR_TNF_EXIT(tavor_cq_free);
                return (ibc_get_ci_failure(0));
        }

        /*
         * Deregister the memory for the Completion Queue.  If this fails
         * for any reason, then it is an indication that something (either
         * in HW or SW) has gone seriously wrong.  So we print a warning
         * message and return.
         */
        status = tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
            sleepflag);
        if (status != DDI_SUCCESS) {
                TAVOR_WARNING(state, "failed to deregister CQ memory");
                TNF_PROBE_0(tavor_cq_free_dereg_mr_fail, TAVOR_TNF_ERROR, "");
                TAVOR_TNF_EXIT(tavor_cq_free);
                return (ibc_get_ci_failure(0));
        }

        /* Free the memory for the CQ */
        tavor_queue_free(state, &cq->cq_cqinfo);

        /* Free the Tavor Completion Queue handle */
        tavor_rsrc_free(state, &rsrc);

        /* Free up the CQC entry resource */
        tavor_rsrc_free(state, &cqc);

        /* Decrement the reference count on the protection domain (PD) */
        tavor_pd_refcnt_dec(pd);

        /* Set the cqhdl pointer to NULL and return success */
        *cqhdl = NULL;

        TAVOR_TNF_EXIT(tavor_cq_free);
        return (DDI_SUCCESS);
}


/*
 * tavor_cq_resize()
 *    Context: Can be called only from user or kernel context.
 */
int
tavor_cq_resize(tavor_state_t *state, tavor_cqhdl_t cq, uint_t req_size,
    uint_t *actual_size, uint_t sleepflag)
{
        tavor_hw_cqc_t          cqc_entry;
        tavor_qalloc_info_t     new_cqinfo, old_cqinfo;
        ibt_mr_attr_t           mr_attr;
        tavor_mr_options_t      op;
        tavor_pdhdl_t           pd;
        tavor_mrhdl_t           mr, mr_old;
        tavor_hw_cqe_t          *buf;
        uint32_t                new_prod_indx, old_cons_indx;
        uint_t                  dma_xfer_mode, cq_sync, log_cq_size, maxprot;
        int                     status, i, flag;
        char                    *errormsg;

        TAVOR_TNF_ENTER(tavor_cq_resize);

        /* Use the internal protection domain (PD) for CQs */
        pd = state->ts_pdhdl_internal;

        /*
         * Calculate the appropriate size for the new resized completion queue.
         * Note:  All Tavor CQs must be a power-of-2 minus 1 in size.  Also
         * they may not be any smaller than TAVOR_CQ_MIN_SIZE.  This step is
         * to round the requested size up to the next highest power-of-2
         */
        req_size = max(req_size, TAVOR_CQ_MIN_SIZE);
        log_cq_size = highbit(req_size);

        /*
         * Next we verify that the rounded-up size is valid (i.e. consistent
         * with the device limits and/or software-configured limits)
         */
        if (log_cq_size > state->ts_cfg_profile->cp_log_max_cq_sz) {
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_HCA_CQ_EXCEEDED, "max CQ size");
                goto cqresize_fail;
        }

        /*
         * Allocate the memory for the newly resized Completion Queue.
         *
         * Note: Although we use the common queue allocation routine, we
         * always specify TAVOR_QUEUE_LOCATION_NORMAL (i.e. CQ located in
         * kernel system memory) for kernel CQs because it would be
         * inefficient to have CQs located in DDR memory.  This is the same
         * as we do when we first allocate completion queues primarily
         * because CQs are read from (by software) more than they are written
         * to. (We always specify TAVOR_QUEUE_LOCATION_USERLAND for all
         * user-mappable CQs for a similar reason.)
         * It is also worth noting that, unlike Tavor QP work queues,
         * completion queues do not have the same strict alignment
         * requirements.  It is sufficient for the CQ memory to be both
         * aligned to and bound to addresses which are a multiple of CQE size.
         */
        new_cqinfo.qa_size = (1 << log_cq_size) * sizeof (tavor_hw_cqe_t);
        new_cqinfo.qa_alloc_align = sizeof (tavor_hw_cqe_t);
        new_cqinfo.qa_bind_align  = sizeof (tavor_hw_cqe_t);
        if (cq->cq_is_umap) {
                new_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
        } else {
                new_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_NORMAL;
        }
        status = tavor_queue_alloc(state, &new_cqinfo, sleepflag);
        if (status != DDI_SUCCESS) {
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed completion queue");
                goto cqresize_fail;
        }
        buf = (tavor_hw_cqe_t *)new_cqinfo.qa_buf_aligned;

        /*
         * Initialize each of the Completion Queue Entries (CQE) by setting
         * their ownership to hardware ("owner" bit set to HW).  This is in
         * preparation for the final resize operation (below).
         */
        for (i = 0; i < (1 << log_cq_size); i++) {
                TAVOR_CQE_OWNER_SET_HW(cq, &buf[i]);
        }

        /*
         * Register the memory for the CQ.  The memory for the CQ must
         * be registered in the Tavor TPT tables.  This gives us the LKey
         * to specify in the CQ context below.
         */
        flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP : IBT_MR_NOSLEEP;
        mr_attr.mr_vaddr = (uint64_t)(uintptr_t)buf;
        mr_attr.mr_len   = new_cqinfo.qa_size;
        mr_attr.mr_as    = NULL;
        mr_attr.mr_flags = flag | IBT_MR_ENABLE_LOCAL_WRITE;
        if (cq->cq_is_umap) {
                dma_xfer_mode = DDI_DMA_CONSISTENT;
        } else {
                dma_xfer_mode = state->ts_cfg_profile->cp_streaming_consistent;
        }
        if (dma_xfer_mode == DDI_DMA_STREAMING) {
                mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
        }
        op.mro_bind_type = state->ts_cfg_profile->cp_iommu_bypass;
        op.mro_bind_dmahdl = new_cqinfo.qa_dmahdl;
        op.mro_bind_override_addr = 0;
        status = tavor_mr_register(state, pd, &mr_attr, &mr, &op);
        if (status != DDI_SUCCESS) {
                tavor_queue_free(state, &new_cqinfo);
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed register mr");
                goto cqresize_fail;
        }

        /* Determine if later ddi_dma_sync will be necessary */
        cq_sync = TAVOR_CQ_IS_SYNC_REQ(state, new_cqinfo);

        /* Sync entire "new" CQ for use by hardware (if necessary) */
        if (cq_sync) {
                (void) ddi_dma_sync(mr->mr_bindinfo.bi_dmahdl, 0,
                    new_cqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
        }

        /*
         * Now we grab the CQ lock.  Since we will be updating the actual
         * CQ location and the producer/consumer indexes, we should hold
         * the lock.
         *
         * We do a TAVOR_NOSLEEP here (and below), though, because we are
         * holding the "cq_lock" and if we got raised to interrupt level
         * by priority inversion, we would not want to block in this routine
         * waiting for success.
         */
        mutex_enter(&cq->cq_lock);

        /*
         * Determine the current CQ "consumer index".
         *
         * Note:  This will depend on whether the CQ had previously been
         * mapped for user access or whether it is a kernel CQ.  If this
         * is a kernel CQ, then all PollCQ() operations have come through
         * the IBTF and, hence, the driver's CQ state structure will
         * contain the current consumer index.  If, however, the user has
         * accessed this CQ by bypassing the driver (OS-bypass), then we
         * need to query the firmware to determine the current CQ consumer
         * index.  This also assumes that the user process will not continue
         * to consume entries while at the same time doing the ResizeCQ()
         * operation.  If the user process does not guarantee this, then it
         * may see duplicate or missed completions.  But under no
         * circumstances should this panic the system.
         */
        if (cq->cq_is_umap) {
                status = tavor_cmn_query_cmd_post(state, QUERY_CQ,
                    cq->cq_cqnum, &cqc_entry, sizeof (tavor_hw_cqc_t),
                    TAVOR_NOSLEEP);
                if (status != TAVOR_CMD_SUCCESS) {
                        /* Query CQ has failed, drop CQ lock and cleanup */
                        mutex_exit(&cq->cq_lock);
                        if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
                            sleepflag) != DDI_SUCCESS) {
                                TAVOR_WARNING(state, "failed to deregister "
                                    "CQ memory");
                        }
                        tavor_queue_free(state, &new_cqinfo);
                        TAVOR_WARNING(state, "failed to query CQ context");

                        /* Set "status" and "errormsg" and goto failure */
                        TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
                            "failed QUERY_CQ");
                        goto cqresize_fail;
                }
                old_cons_indx = cqc_entry.cons_indx;
        } else {
                old_cons_indx = cq->cq_consindx;
        }

        /*
         * Fill in the CQC entry.  For the resize operation this is the
         * final step before attempting the resize on the CQC entry.
         * We use all of the information collected/calculated above to fill
         * in the requisite portions of the CQC.
         */
        bzero(&cqc_entry, sizeof (tavor_hw_cqc_t));
        cqc_entry.start_addr_h  = (mr->mr_bindinfo.bi_addr >> 32);
        cqc_entry.start_addr_l  = (mr->mr_bindinfo.bi_addr & 0xFFFFFFFF);
        cqc_entry.log_cq_sz     = log_cq_size;
        cqc_entry.lkey          = mr->mr_lkey;

        /*
         * Write the CQC entry to hardware.  Lastly, we pass ownership of
         * the entry to the hardware (using the Tavor RESIZE_CQ firmware
         * command).  Note: In general, this operation shouldn't fail.  But
         * if it does, we have to undo everything we've done above before
         * returning error.  Also note that the status returned may indicate
         * the code to return to the IBTF.
         */
        status = tavor_resize_cq_cmd_post(state, &cqc_entry, cq->cq_cqnum,
            &new_prod_indx, TAVOR_CMD_NOSLEEP_SPIN);
        if (status != TAVOR_CMD_SUCCESS) {
                /* Resize attempt has failed, drop CQ lock and cleanup */
                mutex_exit(&cq->cq_lock);
                if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
                    sleepflag) != DDI_SUCCESS) {
                        TAVOR_WARNING(state, "failed to deregister CQ memory");
                }
                tavor_queue_free(state, &new_cqinfo);
                if (status == TAVOR_CMD_BAD_SIZE) {
                        TAVOR_TNF_EXIT(tavor_cq_resize);
                        return (IBT_CQ_SZ_INSUFFICIENT);
                } else {
                        cmn_err(CE_CONT, "Tavor: RESIZE_CQ command failed: "
                            "%08x\n", status);
                        TNF_PROBE_1(tavor_cq_resize_cq_cmd_fail,
                            TAVOR_TNF_ERROR, "", tnf_uint, status, status);
                        TAVOR_TNF_EXIT(tavor_cq_resize);
                        return (ibc_get_ci_failure(0));
                }
        }

        /*
         * The CQ resize attempt was successful.  Before dropping the CQ lock,
         * copy all of the CQEs from the "old" CQ into the "new" CQ.  Note:
         * the Tavor firmware guarantees us that sufficient space is set aside
         * in the "new" CQ to handle any un-polled CQEs from the "old" CQ.
         * The two parameters to this helper function ("old_cons_indx" and
         * "new_prod_indx") essentially indicate the starting index and number
         * of any CQEs that might remain in the "old" CQ memory.
         */
        tavor_cq_resize_helper(cq, buf, old_cons_indx, new_prod_indx);

        /* Sync entire "new" CQ for use by hardware (if necessary) */
        if (cq_sync) {
                (void) ddi_dma_sync(mr->mr_bindinfo.bi_dmahdl, 0,
                    new_cqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
        }

        /*
         * Update the Tavor Completion Queue handle with all the new
         * information.  At the same time, save away all the necessary
         * information for freeing up the old resources
         */
        mr_old                  = cq->cq_mrhdl;
        old_cqinfo              = cq->cq_cqinfo;
        cq->cq_cqinfo           = new_cqinfo;
        cq->cq_consindx         = 0;
        cq->cq_buf              = buf;
        cq->cq_bufsz            = (1 << log_cq_size);
        cq->cq_mrhdl            = mr;
        cq->cq_sync             = cq_sync;

        /*
         * If the "old" CQ was a user-mappable CQ that is currently mmap()'d
         * out to a user process, then we need to call devmap_devmem_remap()
         * to invalidate the mapping to the CQ memory.  We also need to
         * invalidate the CQ tracking information for the user mapping.
         */
        if ((cq->cq_is_umap) && (cq->cq_umap_dhp != NULL)) {
                maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
                status = devmap_devmem_remap(cq->cq_umap_dhp,
                    state->ts_dip, 0, 0, cq->cq_cqinfo.qa_size, maxprot,
                    DEVMAP_MAPPING_INVALID, NULL);
                if (status != DDI_SUCCESS) {
                        mutex_exit(&cq->cq_lock);
                        TAVOR_WARNING(state, "failed in CQ memory "
                            "devmap_devmem_remap()");
                        TAVOR_TNF_EXIT(tavor_cq_resize);
                        return (ibc_get_ci_failure(0));
                }
                cq->cq_umap_dhp = (devmap_cookie_t)NULL;
        }

        /*
         * Drop the CQ lock now.  The only thing left to do is to free up
         * the old resources.
         */
        mutex_exit(&cq->cq_lock);

        /*
         * Deregister the memory for the old Completion Queue.  Note: We
         * have no good way to clean up if this fails, and deregistration
         * really shouldn't ever fail.  So, if it does, it is an indication
         * that something has gone seriously wrong.  We print a warning
         * message and return error (knowing, of course, that the "old"
         * CQ memory will be leaked)
         */
        status = tavor_mr_deregister(state, &mr_old, TAVOR_MR_DEREG_ALL,
            sleepflag);
        if (status != DDI_SUCCESS) {
                TAVOR_WARNING(state, "failed to deregister old CQ memory");
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
                    "failed deregister mr (old)");
                goto cqresize_fail;
        }

        /* Free the memory for the old CQ */
        tavor_queue_free(state, &old_cqinfo);

        /*
         * Fill in the return arguments (if necessary).  This includes the
         * real new completion queue size.
         */
        if (actual_size != NULL) {
                *actual_size = (1 << log_cq_size) - 1;
        }

        TAVOR_TNF_EXIT(tavor_cq_resize);
        return (DDI_SUCCESS);

cqresize_fail:
        TNF_PROBE_1(tavor_cq_resize_fail, TAVOR_TNF_ERROR, "",
            tnf_string, msg, errormsg);
        TAVOR_TNF_EXIT(tavor_cq_resize);
        return (status);
}


/*
 * tavor_cq_notify()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_cq_notify(tavor_state_t *state, tavor_cqhdl_t cq,
    ibt_cq_notify_flags_t flags)
{
        uint_t          cqnum;

        TAVOR_TNF_ENTER(tavor_cq_notify);

        /*
         * Determine if we are trying to get the next completion or the next
         * "solicited" completion.  Then hit the appropriate doorbell.
         *
         * NOTE: Please see the comment in tavor_event.c:tavor_eq_poll
         * regarding why we do not have to do an extra PIO read here, and we
         * will not lose an event after writing this doorbell.
         */
        cqnum = cq->cq_cqnum;
        if (flags == IBT_NEXT_COMPLETION) {
                tavor_cq_doorbell(state, TAVOR_CQDB_NOTIFY_CQ, cqnum,
                    TAVOR_CQDB_DEFAULT_PARAM);

        } else if (flags == IBT_NEXT_SOLICITED) {
                tavor_cq_doorbell(state, TAVOR_CQDB_NOTIFY_CQ_SOLICIT,
                    cqnum, TAVOR_CQDB_DEFAULT_PARAM);

        } else {
                TNF_PROBE_1(tavor_cq_notify_invflags_fail, TAVOR_TNF_ERROR, "",
                    tnf_int, flags, flags);
                TAVOR_TNF_EXIT(tavor_cq_notify);
                return (IBT_CQ_NOTIFY_TYPE_INVALID);
        }

        TAVOR_TNF_EXIT(tavor_cq_notify);
        return (DDI_SUCCESS);
}


/*
 * tavor_cq_poll()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_cq_poll(tavor_state_t *state, tavor_cqhdl_t cq, ibt_wc_t *wc_p,
    uint_t num_wc, uint_t *num_polled)
{
        tavor_hw_cqe_t  *cqe;
        uint32_t        cons_indx, wrap_around_mask;
        uint32_t        polled_cnt, num_to_increment;
        int             status;

        TAVOR_TNF_ENTER(tavor_cq_poll);

        /*
         * Check for user-mappable CQ memory.  Note:  We do not allow kernel
         * clients to poll CQ memory that is accessible directly by the user.
         * If the CQ memory is user accessible, then return an error.
         */
        if (cq->cq_is_umap) {
                TNF_PROBE_0(tavor_cq_poll_inv_usrmapped_type,
                    TAVOR_TNF_ERROR, "");
                TAVOR_TNF_EXIT(tavor_cq_poll);
                return (IBT_CQ_HDL_INVALID);
        }

        mutex_enter(&cq->cq_lock);

        /* Get the consumer index */
        cons_indx = cq->cq_consindx;

        /*
         * Calculate the wrap around mask.  Note: This operation only works
         * because all Tavor completion queues have power-of-2 sizes
         */
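        /*
         * (For example, a 256-entry CQ yields a mask of 0xFF, so
         * incrementing past index 255 wraps the consumer index back to 0.)
         */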
        wrap_around_mask = (cq->cq_bufsz - 1);

        /* Calculate the pointer to the first CQ entry */
        cqe = &cq->cq_buf[cons_indx];

        /* Sync the current CQE to read */
        tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);

        /*
         * Keep pulling entries from the CQ until we find an entry owned by
         * the hardware.  As long as the CQEs are owned by SW, process each
         * entry by calling tavor_cq_cqe_consume() and updating the CQ
         * consumer index.  Note:  We only update the consumer index if
         * tavor_cq_cqe_consume() returns TAVOR_CQ_SYNC_AND_DB.  Otherwise,
         * it indicates that we are going to "recycle" the CQE (probably
         * because it is an error CQE and corresponds to more than one
         * completion).
         */
        polled_cnt = 0;
        while (TAVOR_CQE_OWNER_IS_SW(cq, cqe)) {
                status = tavor_cq_cqe_consume(state, cq, cqe,
                    &wc_p[polled_cnt++]);
                if (status == TAVOR_CQ_SYNC_AND_DB) {
                        /* Reset entry to hardware ownership */
                        TAVOR_CQE_OWNER_SET_HW(cq, cqe);

                        /* Sync the current CQE for device */
                        tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORDEV);

                        /* Increment the consumer index */
                        cons_indx = (cons_indx + 1) & wrap_around_mask;

                        /* Update the pointer to the next CQ entry */
                        cqe = &cq->cq_buf[cons_indx];

                        /* Sync the next CQE to read */
                        tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);
                }

                /*
                 * If we have run out of space to store work completions,
                 * then stop and return the ones we have pulled off the CQ.
                 */
                if (polled_cnt >= num_wc) {
                        break;
                }
        }

        /*
         * Now we only ring the doorbell (to update the consumer index) if
         * we've actually consumed a CQ entry.  If we have, for example,
         * pulled from a CQE that we are still in the process of "recycling"
         * for error purposes, then we would not update the consumer index.
         */
        if ((polled_cnt != 0) && (cq->cq_consindx != cons_indx)) {
                /*
                 * Post doorbell to update the consumer index.  Doorbell
                 * value indicates number of entries consumed (minus 1)
                 */
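                /*
                 * (Worked example: with cq_bufsz = 8, an old consumer index
                 * of 6, and a new cons_indx of 2, four entries were consumed
                 * and the doorbell value is ((2 + 8) - 6) - 1 = 3.)
                 */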
                if (cons_indx > cq->cq_consindx) {
                        num_to_increment = (cons_indx - cq->cq_consindx) - 1;
                } else {
                        num_to_increment = ((cons_indx + cq->cq_bufsz) -
                            cq->cq_consindx) - 1;
                }
                cq->cq_consindx = cons_indx;
                tavor_cq_doorbell(state, TAVOR_CQDB_INCR_CONSINDX,
                    cq->cq_cqnum, num_to_increment);

        } else if (polled_cnt == 0) {
                /*
                 * If the CQ is empty, we can try to free up some of the WRID
                 * list containers.  See tavor_wr.c for more details on this
                 * operation.
                 */
                tavor_wrid_cq_reap(cq);
        }

        mutex_exit(&cq->cq_lock);

        /* Set "num_polled" (if necessary) */
        if (num_polled != NULL) {
                *num_polled = polled_cnt;
        }

        /* Set CQ_EMPTY condition if needed, otherwise return success */
        if (polled_cnt == 0) {
                status = IBT_CQ_EMPTY;
        } else {
                status = DDI_SUCCESS;
        }

        /*
         * Check if the system is currently panicking.  If it is, then call
         * the Tavor interrupt service routine.  This step is necessary here
         * because we might be in a polled I/O mode and without the call to
         * tavor_isr() - and its subsequent calls to poll and rearm each
         * event queue - we might overflow our EQs and render the system
         * unable to sync/dump.
         */
        if (ddi_in_panic() != 0) {
                (void) tavor_isr((caddr_t)state, (caddr_t)NULL);
        }

        TAVOR_TNF_EXIT(tavor_cq_poll);
        return (status);
}


/*
 * tavor_cq_handler()
 *    Context: Only called from interrupt context
 */
int
tavor_cq_handler(tavor_state_t *state, tavor_eqhdl_t eq,
    tavor_hw_eqe_t *eqe)
{
        tavor_cqhdl_t           cq;
        uint_t                  cqnum;
        uint_t                  eqe_evttype;

        TAVOR_TNF_ENTER(tavor_cq_handler);

        eqe_evttype = TAVOR_EQE_EVTTYPE_GET(eq, eqe);

        ASSERT(eqe_evttype == TAVOR_EVT_COMPLETION ||
            eqe_evttype == TAVOR_EVT_EQ_OVERFLOW);

        if (eqe_evttype == TAVOR_EVT_EQ_OVERFLOW) {
                TNF_PROBE_0(tavor_cq_handler_eq_overflow_condition,
                    TAVOR_TNF_ERROR, "");
                tavor_eq_overflow_handler(state, eq, eqe);

                TAVOR_TNF_EXIT(tavor_cq_handler);
                return (DDI_FAILURE);
        }

        /* Get the CQ handle from CQ number in event descriptor */
        cqnum = TAVOR_EQE_CQNUM_GET(eq, eqe);
        cq = tavor_cqhdl_from_cqnum(state, cqnum);

        /*
         * Post the EQ doorbell to move the CQ to the "disarmed" state.
         * This operation is to enable subsequent CQ doorbells (e.g. those
         * that can be rung by tavor_cq_notify() above) to rearm the CQ.
         */
        tavor_eq_doorbell(state, TAVOR_EQDB_DISARM_CQ, eq->eq_eqnum, cqnum);

        /*
         * If the CQ handle is NULL, this is probably an indication
         * that the CQ has been freed already.  In which case, we
         * should not deliver this event.
         *
         * We also check that the CQ number in the handle is the
         * same as the CQ number in the event queue entry.  This
         * extra check allows us to handle the case where a CQ was
         * freed and then allocated again in the time it took to
         * handle the event queue processing.  By constantly incrementing
         * the non-constrained portion of the CQ number every time
         * a new CQ is allocated, we mitigate (somewhat) the chance
         * that a stale event could be passed to the client's CQ
         * handler.
         *
         * Lastly, we check if "ts_ibtfpriv" is NULL.  If it is, it means
         * that we have either received this event before we finished
         * attaching to the IBTF or we have received it while we are in
         * the process of detaching.
         */
        if ((cq != NULL) && (cq->cq_cqnum == cqnum) &&
            (state->ts_ibtfpriv != NULL)) {
                TAVOR_DO_IBTF_CQ_CALLB(state, cq);
        } else {
                TNF_PROBE_2(tavor_cq_handler_dropped_event,
                    TAVOR_TNF_ERROR, "", tnf_uint, ev_cqnum, cqnum,
                    tnf_uint, hdl_cqnum, cqnum);
        }

        TAVOR_TNF_EXIT(tavor_cq_handler);
        return (DDI_SUCCESS);
}


/*
 * tavor_cq_err_handler()
 *    Context: Only called from interrupt context
 */
int
tavor_cq_err_handler(tavor_state_t *state, tavor_eqhdl_t eq,
    tavor_hw_eqe_t *eqe)
{
        tavor_cqhdl_t           cq;
        uint_t                  cqnum;
        ibc_async_event_t       event;
        ibt_async_code_t        type;
        uint_t                  eqe_evttype;

        TAVOR_TNF_ENTER(tavor_cq_err_handler);

        eqe_evttype = TAVOR_EQE_EVTTYPE_GET(eq, eqe);

        ASSERT(eqe_evttype == TAVOR_EVT_CQ_ERRORS ||
            eqe_evttype == TAVOR_EVT_EQ_OVERFLOW);

        if (eqe_evttype == TAVOR_EVT_EQ_OVERFLOW) {
                TNF_PROBE_0(tavor_cq_err_handler_eq_overflow_condition,
                    TAVOR_TNF_ERROR, "");
                tavor_eq_overflow_handler(state, eq, eqe);

                TAVOR_TNF_EXIT(tavor_cq_err_handler);
                return (DDI_FAILURE);
        }

        /* Get the CQ handle from CQ number in event descriptor */
        cqnum = TAVOR_EQE_CQNUM_GET(eq, eqe);
        cq = tavor_cqhdl_from_cqnum(state, cqnum);

        /*
         * If the CQ handle is NULL, this is probably an indication
         * that the CQ has been freed already.  In which case, we
         * should not deliver this event.
         *
         * We also check that the CQ number in the handle is the
         * same as the CQ number in the event queue entry.  This
         * extra check allows us to handle the case where a CQ was
         * freed and then allocated again in the time it took to
         * handle the event queue processing.  By constantly incrementing
         * the non-constrained portion of the CQ number every time
         * a new CQ is allocated, we mitigate (somewhat) the chance
         * that a stale event could be passed to the client's CQ
         * handler.
         *
         * And then we check if "ts_ibtfpriv" is NULL.  If it is, it means
         * that we have either received this event before we finished
         * attaching to the IBTF or we have received it while we are in
         * the process of detaching.
         */
        if ((cq != NULL) && (cq->cq_cqnum == cqnum) &&
            (state->ts_ibtfpriv != NULL)) {
                event.ev_cq_hdl = (ibt_cq_hdl_t)cq->cq_hdlrarg;
                type            = IBT_ERROR_CQ;

                TAVOR_DO_IBTF_ASYNC_CALLB(state, type, &event);
        } else {
                TNF_PROBE_2(tavor_cq_err_handler_dropped_event,
                    TAVOR_TNF_ERROR, "", tnf_uint, ev_cqnum, cqnum,
                    tnf_uint, hdl_cqnum, cqnum);
        }

        TAVOR_TNF_EXIT(tavor_cq_err_handler);
        return (DDI_SUCCESS);
}


/*
 * tavor_cq_refcnt_inc()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_cq_refcnt_inc(tavor_cqhdl_t cq, uint_t is_special)
{
        /*
         * Increment the completion queue's reference count.  Note: In order
         * to ensure compliance with IBA C11-15, we must ensure that a given
         * CQ is not used for both special (SMI/GSI) QP and non-special QP.
         * This is accomplished here by keeping track of how the referenced
         * CQ is being used.
         */
        mutex_enter(&cq->cq_lock);
        TNF_PROBE_1_DEBUG(tavor_cq_refcnt_inc, TAVOR_TNF_TRACE, "",
            tnf_uint, refcnt, cq->cq_refcnt);
        if (cq->cq_refcnt == 0) {
                cq->cq_is_special = is_special;
        } else {
                if (cq->cq_is_special != is_special) {
                        mutex_exit(&cq->cq_lock);
                        return (DDI_FAILURE);
                }
        }
        cq->cq_refcnt++;
        mutex_exit(&cq->cq_lock);
        return (DDI_SUCCESS);
}


/*
 * tavor_cq_refcnt_dec()
 *    Context: Can be called from interrupt or base context.
 */
void
tavor_cq_refcnt_dec(tavor_cqhdl_t cq)
{
        /* Decrement the completion queue's reference count */
        mutex_enter(&cq->cq_lock);
        cq->cq_refcnt--;
        TNF_PROBE_1_DEBUG(tavor_cq_refcnt_dec, TAVOR_TNF_TRACE, "",
            tnf_uint, refcnt, cq->cq_refcnt);
        mutex_exit(&cq->cq_lock);
}


/*
 * tavor_cq_doorbell()
 *    Context: Can be called from interrupt or base context.
 */
static void
tavor_cq_doorbell(tavor_state_t *state, uint32_t cq_cmd, uint32_t cqn,
    uint32_t cq_param)
{
        uint64_t        doorbell = 0;

        /* Build the doorbell from the parameters */
        doorbell = ((uint64_t)cq_cmd << TAVOR_CQDB_CMD_SHIFT) |
            ((uint64_t)cqn << TAVOR_CQDB_CQN_SHIFT) | cq_param;
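
        /*
         * (Doorbell layout, per the construction above: the command occupies
         * the bits at and above TAVOR_CQDB_CMD_SHIFT, the CQ number those at
         * and above TAVOR_CQDB_CQN_SHIFT, and cq_param the low-order bits of
         * the 64-bit doorbell word.)
         */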
1246 
1247         TNF_PROBE_1_DEBUG(tavor_cq_doorbell, TAVOR_TNF_TRACE, "",
1248             tnf_ulong, doorbell, doorbell);
1249 
1250         /* Write the doorbell to UAR */
1251         TAVOR_UAR_DOORBELL(state, (uint64_t *)&state->ts_uar->cq,
1252             doorbell);
1253 }
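
/*
 * To make the doorbell layout concrete, a small sketch of packing the
 * 64-bit CQ doorbell word.  The shift values below are hypothetical
 * stand-ins; the real TAVOR_CQDB_CMD_SHIFT and TAVOR_CQDB_CQN_SHIFT
 * values come from the Tavor headers.
 */
#define EX_CQDB_CMD_SHIFT       56      /* hypothetical */
#define EX_CQDB_CQN_SHIFT       32      /* hypothetical */

static uint64_t
ex_cq_doorbell_pack(uint32_t cq_cmd, uint32_t cqn, uint32_t cq_param)
{
        /* Command in the high bits, CQ number next, parameter in low 32 */
        return (((uint64_t)cq_cmd << EX_CQDB_CMD_SHIFT) |
            ((uint64_t)cqn << EX_CQDB_CQN_SHIFT) | cq_param);
}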
1254 
1255 
1256 /*
1257  * tavor_cqhdl_from_cqnum()
1258  *    Context: Can be called from interrupt or base context.
1259  *
1260  *    This routine is important because changing the unconstrained
1261  *    portion of the CQ number is critical to the detection of a
1262  *    potential race condition in the CQ handler code (i.e. the case
1263  *    where a CQ is freed and alloc'd again before an event for the
1264  *    "old" CQ can be handled).
1265  *
1266  *    While this is not a perfect solution (not sure that one exists)
1267  *    it does help to mitigate the chance that this race condition will
1268  *    cause us to deliver a "stale" event to the new CQ owner.  Note:
1269  *    this solution does not scale well because the number of constrained
1270  *    bits increases (and, hence, the number of unconstrained bits
1271  *    decreases) as the number of supported CQs grows.  For small and
1272  *    intermediate numbers of supported CQs, it should provide
1273  *    sufficient protection.
1274  */
1275 tavor_cqhdl_t
1276 tavor_cqhdl_from_cqnum(tavor_state_t *state, uint_t cqnum)
1277 {
1278         uint_t  cqindx, cqmask;
1279 
1280         /* Calculate the CQ table index from the cqnum */
1281         cqmask = (1 << state->ts_cfg_profile->cp_log_num_cq) - 1;
1282         cqindx = cqnum & cqmask;
1283         return (state->ts_cqhdl[cqindx]);
1284 }
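
/*
 * A small sketch (hypothetical "ex_" naming) of the lookup-and-validate
 * pattern built on the routine above.  The low "constrained" bits of
 * the CQ number index the handle table; comparing the full CQ number
 * against the one stored in the handle is what catches a slot that has
 * been freed and reallocated, since the "unconstrained" bits change on
 * every allocation.
 */
static int
ex_cq_lookup_is_current(tavor_cqhdl_t *cq_table, uint_t log_num_cq,
    uint_t cqnum)
{
        uint_t          cqmask = (1 << log_num_cq) - 1;
        tavor_cqhdl_t   cq = cq_table[cqnum & cqmask];

        return ((cq != NULL) && (cq->cq_cqnum == cqnum));
}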
1285 
1286 
1287 /*
1288  * tavor_cq_cqe_consume()
1289  *    Context: Can be called from interrupt or base context.
1290  */
1291 static int
1292 tavor_cq_cqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
1293     tavor_hw_cqe_t *cqe, ibt_wc_t *wc)
1294 {
1295         uint_t          flags, type, opcode, qpnum, qp1_indx;
1296         int             status;
1297 
1298         TAVOR_TNF_ENTER(tavor_cq_cqe_consume);
1299 
1300         /*
1301          * Determine if this is an "error" CQE by examining "opcode".  If it
1302          * is an error CQE, then call tavor_cq_errcqe_consume() and return
1303          * whatever status it returns.  Otherwise, this is a successful
1304          * completion.
1305          */
1306         opcode = TAVOR_CQE_OPCODE_GET(cq, cqe);
1307         if ((opcode == TAVOR_CQE_SEND_ERR_OPCODE) ||
1308             (opcode == TAVOR_CQE_RECV_ERR_OPCODE)) {
1309                 status = tavor_cq_errcqe_consume(state, cq, cqe, wc);
1310                 TAVOR_TNF_EXIT(tavor_cq_cqe_consume);
1311                 return (status);
1312         }
1313 
1314         /*
1315          * Fetch the Work Request ID using the information in the CQE.
1316          * See tavor_wr.c for more details.
1317          */
1318         wc->wc_id = tavor_wrid_get_entry(cq, cqe, NULL);
1319 
1320         /*
1321          * Parse the CQE opcode to determine completion type.  This will set
1322          * not only the type of the completion, but also any flags that might
1323          * be associated with it (e.g. whether immediate data is present).
1324          */
1325         flags = IBT_WC_NO_FLAGS;
1326         if (TAVOR_CQE_SENDRECV_GET(cq, cqe) != TAVOR_COMPLETION_RECV) {
1327 
1328                 /* Send CQE */
1329                 switch (opcode) {
1330                 case TAVOR_CQE_SND_RDMAWR_IMM:
1331                         flags |= IBT_WC_IMMED_DATA_PRESENT;
1332                         /* FALLTHROUGH */
1333                 case TAVOR_CQE_SND_RDMAWR:
1334                         type = IBT_WRC_RDMAW;
1335                         break;
1336 
1337                 case TAVOR_CQE_SND_SEND_IMM:
1338                         flags |= IBT_WC_IMMED_DATA_PRESENT;
1339                         /* FALLTHROUGH */
1340                 case TAVOR_CQE_SND_SEND:
1341                         type = IBT_WRC_SEND;
1342                         break;
1343 
1344                 case TAVOR_CQE_SND_RDMARD:
1345                         type = IBT_WRC_RDMAR;
1346                         break;
1347 
1348                 case TAVOR_CQE_SND_ATOMIC_CS:
1349                         type = IBT_WRC_CSWAP;
1350                         break;
1351 
1352                 case TAVOR_CQE_SND_ATOMIC_FA:
1353                         type = IBT_WRC_FADD;
1354                         break;
1355 
1356                 case TAVOR_CQE_SND_BIND_MW:
1357                         type = IBT_WRC_BIND;
1358                         break;
1359 
1360                 default:
1361                         TAVOR_WARNING(state, "unknown send CQE type");
1362                         wc->wc_status = IBT_WC_LOCAL_QP_OP_ERR;
1363                         TNF_PROBE_1(tavor_cq_cqe_consume_unknown_send_type,
1364                             TAVOR_TNF_ERROR, "", tnf_uint, opcode, opcode);
1365                         TAVOR_TNF_EXIT(tavor_cq_cqe_consume);
1366                         return (TAVOR_CQ_SYNC_AND_DB);
1367                 }
1368         } else {
1369 
1370                 /* Receive CQE */
1371                 switch (opcode & 0x1F) {
1372                 case TAVOR_CQE_RCV_RECV_IMM:
1373                         /* FALLTHROUGH */
1374                 case TAVOR_CQE_RCV_RECV_IMM2:
1375                         /*
1376                          * Note:  According to the Tavor PRM, all QP1 recv
1377                          * completions look like the result of a Send with
1378                          * Immediate.  They are not, however (MADs are Send
1379                          * Only), so we need to check the QP number and set
1380                          * the flag only if the QP is not QP1.
1381                          */
1382                         qpnum    = TAVOR_CQE_QPNUM_GET(cq, cqe);
1383                         qp1_indx = state->ts_spec_qp1->tr_indx;
1384                         if ((qpnum < qp1_indx) || (qpnum > qp1_indx + 1)) {
1385                                 flags |= IBT_WC_IMMED_DATA_PRESENT;
1386                         }
1387                         /* FALLTHROUGH */
1388                 case TAVOR_CQE_RCV_RECV:
1389                         /* FALLTHROUGH */
1390                 case TAVOR_CQE_RCV_RECV2:
1391                         type = IBT_WRC_RECV;
1392                         break;
1393 
1394                 case TAVOR_CQE_RCV_RDMAWR_IMM:
1395                         /* FALLTHROUGH */
1396                 case TAVOR_CQE_RCV_RDMAWR_IMM2:
1397                         flags |= IBT_WC_IMMED_DATA_PRESENT;
1398                         type = IBT_WRC_RECV_RDMAWI;
1399                         break;
1400 
1401                 default:
1402                         TAVOR_WARNING(state, "unknown recv CQE type");
1403                         wc->wc_status = IBT_WC_LOCAL_QP_OP_ERR;
1404                         TNF_PROBE_1(tavor_cq_cqe_consume_unknown_rcv_type,
1405                             TAVOR_TNF_ERROR, "", tnf_uint, opcode, opcode);
1406                         TAVOR_TNF_EXIT(tavor_cq_cqe_consume);
1407                         return (TAVOR_CQ_SYNC_AND_DB);
1408                 }
1409         }
1410         wc->wc_type = type;
1411 
1412         /*
1413          * Check for GRH, update the flags, then fill in the "wc_flags"
1414          * field in the work completion.
1415          */
1416         if (TAVOR_CQE_GRH_GET(cq, cqe) != 0) {
1417                 flags |= IBT_WC_GRH_PRESENT;
1418         }
1419         wc->wc_flags = flags;
1420 
1421         /* If we got here, completion status must be success */
1422         wc->wc_status = IBT_WC_SUCCESS;
1423 
1424         /*
1425          * Parse the remaining contents of the CQE into the work completion.
1426          * This means filling in SL, QP number, SLID, immediate data, etc.
1427          * Note:  Not all of these fields are valid in a given completion.
1428          * Many of them depend on the actual type of completion.  So we fill
1429          * in all of the fields and leave it up to the IBTF and consumer to
1430          * sort out which are valid based on their context.
1431          */
1432         wc->wc_sl      = TAVOR_CQE_SL_GET(cq, cqe);
1433         wc->wc_immed_data = TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cq, cqe);
1434         wc->wc_qpn     = TAVOR_CQE_DQPN_GET(cq, cqe);
1435         wc->wc_res_hash        = 0;
1436         wc->wc_slid    = TAVOR_CQE_DLID_GET(cq, cqe);
1437         wc->wc_ethertype  = (wc->wc_immed_data & 0xFFFF);
1438         wc->wc_pkey_ix         = (wc->wc_immed_data >> 16);
1439 
1440         /*
1441          * Depending on whether the completion was a receive or a send
1442          * completion, fill in "bytes transferred" as appropriate.  Also,
1443          * if necessary, fill in the "path bits" field.
1444          */
1445         if (TAVOR_CQE_SENDRECV_GET(cq, cqe) == TAVOR_COMPLETION_RECV) {
1446                 wc->wc_path_bits = TAVOR_CQE_PATHBITS_GET(cq, cqe);
1447                 wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cq, cqe);
1448 
1449         } else if ((wc->wc_type == IBT_WRC_RDMAR) ||
1450             (wc->wc_type == IBT_WRC_CSWAP) || (wc->wc_type == IBT_WRC_FADD)) {
1451                 wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cq, cqe);
1452         }
1453 
1454         TAVOR_TNF_EXIT(tavor_cq_cqe_consume);
1455         return (TAVOR_CQ_SYNC_AND_DB);
1456 }
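
/*
 * A minimal sketch of the field overlay noted above: the CQE's
 * immediate/ethertype/pkey/credits word is reported whole as
 * "wc_immed_data", and its two halves are also exposed separately
 * because which interpretation applies depends on the completion
 * type.  The "ex_" name is hypothetical.
 */
static void
ex_wc_split_immed(ibt_wc_t *wc, uint32_t imm_eth_pkey_cred)
{
        wc->wc_immed_data = imm_eth_pkey_cred;
        wc->wc_ethertype  = (imm_eth_pkey_cred & 0xFFFF);  /* low 16 bits */
        wc->wc_pkey_ix    = (imm_eth_pkey_cred >> 16);     /* high 16 bits */
}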
1457 
1458 
1459 /*
1460  * tavor_cq_errcqe_consume()
1461  *    Context: Can be called from interrupt or base context.
1462  */
1463 static int
1464 tavor_cq_errcqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
1465     tavor_hw_cqe_t *cqe, ibt_wc_t *wc)
1466 {
1467         uint64_t                next_wqeaddr;
1468         uint32_t                imm_eth_pkey_cred;
1469         uint_t                  nextwqesize, dbd;
1470         uint_t                  doorbell_cnt, status;
1471         tavor_wrid_entry_t      wre;
1472 
1473         TAVOR_TNF_ENTER(tavor_cq_errcqe_consume);
1474 
1475         /*
1476          * Fetch the Work Request ID using the information in the CQE.
1477          * See tavor_wr.c for more details.
1478          */
1479         wc->wc_id = tavor_wrid_get_entry(cq, cqe, &wre);
1480 
1481         /*
1482          * Parse the CQE opcode to determine completion type.  We know that
1483          * the CQE is an error completion, so we extract only the completion
1484          * status here.
1485          */
1486         imm_eth_pkey_cred = TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cq, cqe);
1487         status = imm_eth_pkey_cred >> TAVOR_CQE_ERR_STATUS_SHIFT;
1488         switch (status) {
1489         case TAVOR_CQE_LOC_LEN_ERR:
1490                 status = IBT_WC_LOCAL_LEN_ERR;
1491                 break;
1492 
1493         case TAVOR_CQE_LOC_OP_ERR:
1494                 status = IBT_WC_LOCAL_QP_OP_ERR;
1495                 break;
1496 
1497         case TAVOR_CQE_LOC_PROT_ERR:
1498                 status = IBT_WC_LOCAL_PROTECT_ERR;
1499                 break;
1500 
1501         case TAVOR_CQE_WR_FLUSHED_ERR:
1502                 status = IBT_WC_WR_FLUSHED_ERR;
1503                 break;
1504 
1505         case TAVOR_CQE_MW_BIND_ERR:
1506                 status = IBT_WC_MEM_WIN_BIND_ERR;
1507                 break;
1508 
1509         case TAVOR_CQE_BAD_RESPONSE_ERR:
1510                 status = IBT_WC_BAD_RESPONSE_ERR;
1511                 break;
1512 
1513         case TAVOR_CQE_LOCAL_ACCESS_ERR:
1514                 status = IBT_WC_LOCAL_ACCESS_ERR;
1515                 break;
1516 
1517         case TAVOR_CQE_REM_INV_REQ_ERR:
1518                 status = IBT_WC_REMOTE_INVALID_REQ_ERR;
1519                 break;
1520 
1521         case TAVOR_CQE_REM_ACC_ERR:
1522                 status = IBT_WC_REMOTE_ACCESS_ERR;
1523                 break;
1524 
1525         case TAVOR_CQE_REM_OP_ERR:
1526                 status = IBT_WC_REMOTE_OP_ERR;
1527                 break;
1528 
1529         case TAVOR_CQE_TRANS_TO_ERR:
1530                 status = IBT_WC_TRANS_TIMEOUT_ERR;
1531                 break;
1532 
1533         case TAVOR_CQE_RNRNAK_TO_ERR:
1534                 status = IBT_WC_RNR_NAK_TIMEOUT_ERR;
1535                 break;
1536 
1537         /*
1538          * The following error codes are not supported in the Tavor driver
1539          * as they relate only to Reliable Datagram completion statuses:
1540          *    case TAVOR_CQE_LOCAL_RDD_VIO_ERR:
1541          *    case TAVOR_CQE_REM_INV_RD_REQ_ERR:
1542          *    case TAVOR_CQE_EEC_REM_ABORTED_ERR:
1543          *    case TAVOR_CQE_INV_EEC_NUM_ERR:
1544          *    case TAVOR_CQE_INV_EEC_STATE_ERR:
1545          *    case TAVOR_CQE_LOC_EEC_ERR:
1546          */
1547 
1548         default:
1549                 TAVOR_WARNING(state, "unknown error CQE status");
1550                 status = IBT_WC_LOCAL_QP_OP_ERR;
1551                 TNF_PROBE_1(tavor_cq_errcqe_consume_unknown_status,
1552                     TAVOR_TNF_ERROR, "", tnf_uint, status, status);
1553                 break;
1554         }
1555         wc->wc_status = status;
1556 
1557         /*
1558          * Now we do all the checking that's necessary to handle completion
1559          * queue entry "recycling".
1560          *
1561          * It is not necessary here to try to sync the WQE as we are only
1562          * attempting to read from the Work Queue (and hardware does not
1563          * write to it).
1564          */
1565 
1566         /*
1567          * We can get the doorbell info, WQE address, and size for the
1568          * next WQE from the "wre" (which was filled in above by the
1569          * call to tavor_wrid_get_entry()).
1570          */
1571         dbd = (wre.wr_signaled_dbd & TAVOR_WRID_ENTRY_DOORBELLED) ? 1 : 0;
1572         next_wqeaddr = wre.wr_wqeaddrsz;
1573         nextwqesize  = wre.wr_wqeaddrsz & TAVOR_WQE_NDS_MASK;
1574 
1575         /*
1576          * Get the doorbell count from the CQE.  This indicates how many
1577          * completions this one CQE represents.
1578          */
1579         doorbell_cnt = imm_eth_pkey_cred & TAVOR_CQE_ERR_DBDCNT_MASK;
1580 
1581         /*
1582          * Determine if we're ready to consume this CQE yet or not.  If the
1583          * next WQE has size zero (i.e. no next WQE) or if the doorbell count
1584          * is down to zero, then this is the last/only completion represented
1585          * by the current CQE (return TAVOR_CQ_SYNC_AND_DB).  Otherwise, the
1586          * current CQE needs to be recycled (see below).
1587          */
1588         if ((nextwqesize == 0) || ((doorbell_cnt == 0) && (dbd == 1))) {
1589                 /*
1590                  * Consume the CQE
1591                  *    Return status to indicate that doorbell and sync may be
1592                  *    necessary.
1593                  */
1594                 TAVOR_TNF_EXIT(tavor_cq_errcqe_consume);
1595                 return (TAVOR_CQ_SYNC_AND_DB);
1596 
1597         } else {
1598                 /*
1599                  * Recycle the CQE for use in the next PollCQ() call
1600                  *    Decrement the doorbell count, modify the error status,
1601                  *    and update the WQE address and size (to point to the
1602                  *    next WQE on the chain).  Put these updated entries back
1603                  *    into the CQE.
1604                  *    Despite the fact that we have updated the CQE, it is not
1605                  *    necessary for us to attempt to sync this entry just yet,
1606                  *    as we have not changed the "hardware's view" of the
1607                  *    entry (i.e. we have not modified the "owner" bit, which
1608                  *    is all that the Tavor hardware really cares about).
1609                  */
1610                 doorbell_cnt = doorbell_cnt - dbd;
1611                 TAVOR_CQE_IMM_ETH_PKEY_CRED_SET(cq, cqe,
1612                     ((TAVOR_CQE_WR_FLUSHED_ERR << TAVOR_CQE_ERR_STATUS_SHIFT) |
1613                     (doorbell_cnt & TAVOR_CQE_ERR_DBDCNT_MASK)));
1614                 TAVOR_CQE_WQEADDRSZ_SET(cq, cqe,
1615                     TAVOR_QP_WQEADDRSZ(next_wqeaddr, nextwqesize));
1616 
1617                 TAVOR_TNF_EXIT(tavor_cq_errcqe_consume);
1618                 return (TAVOR_CQ_RECYCLE_ENTRY);
1619         }
1620 }
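
/*
 * A distilled sketch (hypothetical "ex_" naming) of the
 * consume-or-recycle decision above.  One error CQE can stand for a
 * whole chain of flushed work requests; it is fully consumed only when
 * there is no next WQE, or when the doorbell count has drained on a
 * doorbelled entry.  Otherwise it is rewritten in place and seen again
 * on the next poll.
 */
static int
ex_errcqe_is_consumed(uint_t nextwqesize, uint_t doorbell_cnt, uint_t dbd)
{
        return ((nextwqesize == 0) || ((doorbell_cnt == 0) && (dbd == 1)));
}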
1621 
1622 
1623 /*
1624  * tavor_cqe_sync()
1625  *    Context: Can be called from interrupt or base context.
1626  */
1627 static void
1628 tavor_cqe_sync(tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe, uint_t flag)
1629 {
1630         ddi_dma_handle_t        dmahdl;
1631         off_t                   offset;
1632         int                     status;
1633 
1634         TAVOR_TNF_ENTER(tavor_cqe_sync);
1635 
1636         /* Determine if CQ needs to be synced or not */
1637         if (cq->cq_sync == 0) {
1638                 TAVOR_TNF_EXIT(tavor_cqe_sync);
1639                 return;
1640         }
1641 
1642         /* Get the DMA handle from CQ context */
1643         dmahdl = cq->cq_mrhdl->mr_bindinfo.bi_dmahdl;
1644 
1645         /* Calculate the offset of the CQE to be synced */
1646         offset = (off_t)((uintptr_t)cqe - (uintptr_t)&cq->cq_buf[0]);
1647         status = ddi_dma_sync(dmahdl, offset, sizeof (tavor_hw_cqe_t), flag);
1648         if (status != DDI_SUCCESS) {
1649                 TNF_PROBE_0(tavor_cqe_sync_getnextentry_fail,
1650                     TAVOR_TNF_ERROR, "");
1651                 TAVOR_TNF_EXIT(tavor_cqe_sync);
1652                 return;
1653         }
1654 
1655         TAVOR_TNF_EXIT(tavor_cqe_sync);
1656 }
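
/*
 * A minimal sketch of the per-entry sync arithmetic above, assuming a
 * CQE that lives inside the DMA-bound buffer "buf".  The offset is the
 * plain pointer difference from the start of the buffer, so only one
 * CQE's worth of memory is handed to ddi_dma_sync().  The "ex_" name
 * is hypothetical.
 */
static void
ex_sync_one_cqe(ddi_dma_handle_t dmahdl, tavor_hw_cqe_t *buf,
    tavor_hw_cqe_t *cqe, uint_t flag)
{
        off_t   offset = (off_t)((uintptr_t)cqe - (uintptr_t)buf);

        (void) ddi_dma_sync(dmahdl, offset, sizeof (tavor_hw_cqe_t), flag);
}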
1657 
1658 
1659 /*
1660  * tavor_cq_resize_helper()
1661  *    Context: Can be called only from user or kernel context.
1662  */
1663 static void
1664 tavor_cq_resize_helper(tavor_cqhdl_t cq, tavor_hw_cqe_t *new_cqbuf,
1665     uint32_t old_cons_indx, uint32_t num_newcqe)
1666 {
1667         tavor_hw_cqe_t  *old_cqe, *new_cqe;
1668         uint32_t        new_cons_indx, wrap_around_mask;
1669         int             i;
1670 
1671         TAVOR_TNF_ENTER(tavor_cq_resize_helper);
1672 
1673         ASSERT(MUTEX_HELD(&cq->cq_lock));
1674 
1675         /* Start at the beginning of the "new" CQ */
1676         new_cons_indx = 0;
1677 
1678         /*
1679          * Calculate the wrap around mask.  Note: This operation only works
1680          * because all Tavor completion queues have power-of-2 sizes
1681          */
1682         wrap_around_mask = (cq->cq_bufsz - 1);
1683 
1684         /*
1685          * Calculate the pointers to the first CQ entry (in the "old" CQ)
1686          * and the first CQ entry in the "new" CQ
1687          */
1688         old_cqe = &cq->cq_buf[old_cons_indx];
1689         new_cqe = &new_cqbuf[new_cons_indx];
1690 
1691         /* Sync entire "old" CQ for use by software (if necessary). */
1692         if (cq->cq_sync) {
1693                 (void) ddi_dma_sync(cq->cq_mrhdl->mr_bindinfo.bi_dmahdl,
1694                     0, cq->cq_cqinfo.qa_size, DDI_DMA_SYNC_FORCPU);
1695         }
1696 
1697         /*
1698          * Copy the "num_newcqe" outstanding entries (counted by the caller
1699          * as those still owned by software) from the "old" CQ into the
1700          * "new" CQ, updating the respective indices and pointers as we go.
1701          */
1702         for (i = 0; i < num_newcqe; i++) {
1703 
1704                 /* Copy this old CQE into the "new_cqe" pointer */
1705                 bcopy(old_cqe, new_cqe, sizeof (tavor_hw_cqe_t));
1706 
1707                 /* Increment the consumer index (for both CQs) */
1708                 old_cons_indx = (old_cons_indx + 1) & wrap_around_mask;
1709                 new_cons_indx = (new_cons_indx + 1);
1710 
1711                 /* Update the pointer to the next CQ entry */
1712                 old_cqe = &cq->cq_buf[old_cons_indx];
1713                 new_cqe = &new_cqbuf[new_cons_indx];
1714         }
1715 
1716         TAVOR_TNF_EXIT(tavor_cq_resize_helper);
1717 }
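
/*
 * A tiny sketch of the wraparound arithmetic relied on above and
 * throughout this file: because Tavor CQ sizes are powers of two,
 * "(indx + 1) & (size - 1)" steps an index around the ring with a
 * simple mask instead of a divide.  The "ex_" name is hypothetical.
 */
static uint32_t
ex_ring_next(uint32_t indx, uint32_t bufsz_pow2)
{
        return ((indx + 1) & (bufsz_pow2 - 1));
}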
1718 
1719 /*
1720  * tavor_cq_srq_entries_flush()
1721  *    Context: Can be called from interrupt or base context.
1722  */
1723 void
1724 tavor_cq_srq_entries_flush(tavor_state_t *state, tavor_qphdl_t qp)
1725 {
1726         tavor_cqhdl_t           cq;
1727         tavor_workq_hdr_t       *wqhdr;
1728         tavor_hw_cqe_t          *cqe;
1729         tavor_hw_cqe_t          *next_cqe;
1730         uint32_t                cons_indx, tail_cons_indx, wrap_around_mask;
1731         uint32_t                new_indx, check_indx, indx;
1732         uint32_t                num_to_increment;
1733         int                     cqe_qpnum, cqe_type;
1734         int                     outstanding_cqes, removed_cqes;
1735         int                     i;
1736 
1737         ASSERT(MUTEX_HELD(&qp->qp_rq_cqhdl->cq_lock));
1738 
1739         cq = qp->qp_rq_cqhdl;
1740         wqhdr = qp->qp_rq_wqhdr;
1741 
1742         ASSERT(wqhdr->wq_wrid_post != NULL);
1743         ASSERT(wqhdr->wq_wrid_post->wl_srq_en != 0);
1744 
1745         /*
1746          * Check for user-mapped CQ memory.  Note:  We do not allow kernel
1747          * clients to modify any user-mapped CQ.  If the CQ is
1748          * user-mapped, then we simply return here, and this "flush" function
1749          * becomes a NO-OP in this case.
1750          */
1751         if (cq->cq_is_umap) {
1752                 return;
1753         }
1754 
1755         /* Get the consumer index */
1756         cons_indx = cq->cq_consindx;
1757 
1758         /*
1759          * Calculate the wrap around mask.  Note: This operation only works
1760          * because all Tavor completion queues have power-of-2 sizes
1761          */
1762         wrap_around_mask = (cq->cq_bufsz - 1);
1763 
1764         /* Calculate the pointer to the first CQ entry */
1765         cqe = &cq->cq_buf[cons_indx];
1766 
1767         /* Sync the current CQE to read */
1768         tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);
1769 
1770         /*
1771          * Loop through the CQ looking for entries owned by software.  If an
1772          * entry is owned by software then we increment an 'outstanding_cqes'
1773          * count to know how many entries total we have on our CQ.  We use this
1774          * value further down to know how many entries to loop through looking
1775          * for our same QP number.
1776          */
1777         outstanding_cqes = 0;
1778         tail_cons_indx = cons_indx;
1779         while (TAVOR_CQE_OWNER_IS_SW(cq, cqe)) {
1780                 /* increment total cqes count */
1781                 outstanding_cqes++;
1782 
1783                 /* increment the consumer index */
1784                 tail_cons_indx = (tail_cons_indx + 1) & wrap_around_mask;
1785 
1786                 /* update the pointer to the next cq entry */
1787                 cqe = &cq->cq_buf[tail_cons_indx];
1788 
1789                 /* sync the next cqe to read */
1790                 tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);
1791         }
1792 
1793         /*
1794          * Using the 'tail_cons_indx' that was just set, we now know how
1795          * many CQEs in total are outstanding.  Set 'check_indx' and
1796          * 'new_indx' to the last entry identified by 'tail_cons_indx'.
1797          */
1798         check_indx = new_indx = (tail_cons_indx - 1) & wrap_around_mask;
1799 
1800         for (i = 0; i < outstanding_cqes; i++) {
1801                 cqe = &cq->cq_buf[check_indx];
1802 
1803                 /* Grab QP number from CQE */
1804                 cqe_qpnum = TAVOR_CQE_QPNUM_GET(cq, cqe);
1805                 cqe_type = TAVOR_CQE_SENDRECV_GET(cq, cqe);
1806 
1807                 /*
1808                  * If the QP number is the same in the CQE as the QP that we
1809                  * have on this SRQ, then we must free up the entry off the
1810                  * SRQ.  We also make sure that the completion type is of the
1811                  * 'TAVOR_COMPLETION_RECV' type.  So any send completions on
1812                  * this CQ will be left as-is.  The handling of returning
1813                  * entries back to HW ownership happens further down.
1814                  */
1815                 if (cqe_qpnum == qp->qp_qpnum &&
1816                     cqe_type == TAVOR_COMPLETION_RECV) {
1817 
1818                         /* Add back to SRQ free list */
1819                         (void) tavor_wrid_find_match_srq(wqhdr->wq_wrid_post,
1820                             cq, cqe);
1821                 } else {
1822                         /* Do Copy */
1823                         if (check_indx != new_indx) {
1824                                 next_cqe = &cq->cq_buf[new_indx];
1825 
1826                                 /*
1827                                  * Copy the CQE into the "next_cqe"
1828                                  * pointer.
1829                                  */
1830                                 bcopy(cqe, next_cqe, sizeof (tavor_hw_cqe_t));
1831                         }
1832                         new_indx = (new_indx - 1) & wrap_around_mask;
1833                 }
1834                 /* Move index to next CQE to check */
1835                 check_indx = (check_indx - 1) & wrap_around_mask;
1836         }
1837 
1838         /* Initialize removed cqes count */
1839         removed_cqes = 0;
1840 
1841         /* If an entry was removed */
1842         if (check_indx != new_indx) {
1843 
1844                 /*
1845                  * Set current pointer back to the beginning consumer index.
1846                  * At this point, all unclaimed entries have been copied to the
1847                  * index specified by 'new_indx'.  This 'new_indx' will be used
1848                  * as the new consumer index after we mark all freed entries as
1849                  * having HW ownership.  We do that here.
1850                  */
1851 
1852                 /* Loop through all entries until we reach our new pointer */
1853                 for (indx = cons_indx; indx <= new_indx;
1854                     indx = (indx + 1) & wrap_around_mask) {
1855                         removed_cqes++;
1856                         cqe = &cq->cq_buf[indx];
1857 
1858                         /* Reset entry to hardware ownership */
1859                         TAVOR_CQE_OWNER_SET_HW(cq, cqe);
1860                 }
1861         }
1862 
1863         /*
1864          * Update consumer index to be the 'new_indx'.  This moves it past all
1865          * removed entries.  Because 'new_indx' is pointing to the last
1866          * previously valid SW owned entry, we add 1 to point the cons_indx to
1867          * the first HW owned entry.
1868          */
1869         cons_indx = (new_indx + 1) & wrap_around_mask;
1870 
1871         /*
1872          * Now we only ring the doorbell (to update the consumer index) if
1873          * we've actually consumed a CQ entry.  If we found no QP number
1874          * matches above, then we would not have removed anything.  So only if
1875          * something was removed do we ring the doorbell.
1876          */
1877         if ((removed_cqes != 0) && (cq->cq_consindx != cons_indx)) {
1878                 /*
1879                  * Post doorbell to update the consumer index.  Doorbell
1880                  * value indicates number of entries consumed (minus 1)
1881                  */
1882                 if (cons_indx > cq->cq_consindx) {
1883                         num_to_increment = (cons_indx - cq->cq_consindx) - 1;
1884                 } else {
1885                         num_to_increment = ((cons_indx + cq->cq_bufsz) -
1886                             cq->cq_consindx) - 1;
1887                 }
1888                 cq->cq_consindx = cons_indx;
1889 
1890                 tavor_cq_doorbell(state, TAVOR_CQDB_INCR_CONSINDX,
1891                     cq->cq_cqnum, num_to_increment);
1892         }
1893 }
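
/*
 * A distilled sketch (hypothetical "ex_" naming) of the doorbell
 * arithmetic above: the doorbell takes "entries consumed minus one",
 * and when the new consumer index has wrapped below the old one, the
 * buffer size is added back in before subtracting.
 */
static uint32_t
ex_consindx_delta(uint32_t old_indx, uint32_t new_indx, uint32_t bufsz)
{
        if (new_indx > old_indx)
                return ((new_indx - old_indx) - 1);
        return (((new_indx + bufsz) - old_indx) - 1);
}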