/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * tavor_cq.c
 *    Tavor Completion Queue Processing Routines
 *
 *    Implements all the routines necessary for allocating, freeing, resizing,
 *    and handling the completion type events that the Tavor hardware can
 *    generate.
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/bitmap.h>
#include <sys/sysmacros.h>

#include <sys/ib/adapters/tavor/tavor.h>

static void tavor_cq_doorbell(tavor_state_t *state, uint32_t cq_cmd,
    uint32_t cqn, uint32_t cq_param);
#pragma inline(tavor_cq_doorbell)
static int tavor_cq_cqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
    tavor_hw_cqe_t *cqe, ibt_wc_t *wc);
static int tavor_cq_errcqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
    tavor_hw_cqe_t *cqe, ibt_wc_t *wc);
static void tavor_cqe_sync(tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe,
    uint_t flag);
static void tavor_cq_resize_helper(tavor_cqhdl_t cq, tavor_hw_cqe_t *new_cqbuf,
    uint32_t old_cons_indx, uint32_t num_newcqe);

/*
 * tavor_cq_alloc()
 *    Context: Can be called only from user or kernel context.
 */
int
tavor_cq_alloc(tavor_state_t *state, ibt_cq_hdl_t ibt_cqhdl,
    ibt_cq_attr_t *cq_attr, uint_t *actual_size, tavor_cqhdl_t *cqhdl,
    uint_t sleepflag)
{
        tavor_rsrc_t            *cqc, *rsrc;
        tavor_umap_db_entry_t   *umapdb;
        tavor_hw_cqc_t          cqc_entry;
        tavor_cqhdl_t           cq;
        ibt_mr_attr_t           mr_attr;
        tavor_mr_options_t      op;
        tavor_pdhdl_t           pd;
        tavor_mrhdl_t           mr;
        tavor_hw_cqe_t          *buf;
        uint64_t                addr, value;
        uint32_t                log_cq_size, lkey, uarpg;
        uint_t                  dma_xfer_mode, cq_sync, cq_is_umap;
        int                     status, i, flag;
        char                    *errormsg;

        TAVOR_TNF_ENTER(tavor_cq_alloc);

        _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*cq_attr))

        /*
         * Determine whether CQ is being allocated for userland access or
         * whether it is being allocated for kernel access.  If the CQ is
         * being allocated for userland access, then lookup the UAR doorbell
         * page number for the current process.  Note:  If this is not found
         * (e.g. if the process has not previously open()'d the Tavor driver),
         * then an error is returned.
         */
        cq_is_umap = (cq_attr->cq_flags & IBT_CQ_USER_MAP) ? 1 : 0;
        if (cq_is_umap) {
                status = tavor_umap_db_find(state->ts_instance, ddi_get_pid(),
                    MLNX_UMAP_UARPG_RSRC, &value, 0, NULL);
                if (status != DDI_SUCCESS) {
                        /* Set "status" and "errormsg" and goto failure */
                        TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "failed UAR page");
                        goto cqalloc_fail;
                }
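                /*
                 * The UAR page index saved below is later written into the
                 * CQC "usr_page" field, associating this CQ with the
                 * process's mapped doorbell (UAR) page.
                 */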
                uarpg = ((tavor_rsrc_t *)(uintptr_t)value)->tr_indx;
        }

        /* Use the internal protection domain (PD) for setting up CQs */
        pd = state->ts_pdhdl_internal;

        /* Increment the reference count on the protection domain (PD) */
        tavor_pd_refcnt_inc(pd);

        /*
         * Allocate a CQ context entry.  This will be filled in with all
         * the necessary parameters to define the Completion Queue.  And then
         * ownership will be passed to the hardware in the final step
         * below.  If we fail here, we must undo the protection domain
         * reference count.
         */
        status = tavor_rsrc_alloc(state, TAVOR_CQC, 1, sleepflag, &cqc);
        if (status != DDI_SUCCESS) {
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed CQ context");
                goto cqalloc_fail1;
        }

        /*
         * Allocate the software structure for tracking the completion queue
         * (i.e. the Tavor Completion Queue handle).  If we fail here, we must
         * undo the protection domain reference count and the previous
         * resource allocation.
         */
        status = tavor_rsrc_alloc(state, TAVOR_CQHDL, 1, sleepflag, &rsrc);
        if (status != DDI_SUCCESS) {
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed CQ handle");
                goto cqalloc_fail2;
        }
        cq = (tavor_cqhdl_t)rsrc->tr_addr;
        _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*cq))
        cq->cq_is_umap = cq_is_umap;

        /* Use the index as CQ number */
        cq->cq_cqnum = cqc->tr_indx;

        /*
         * If this will be a user-mappable CQ, then allocate an entry for
         * the "userland resources database".  This will later be added to
         * the database (after all further CQ operations are successful).
         * If we fail here, we must undo the reference counts and the
         * previous resource allocation.
         */
        if (cq->cq_is_umap) {
                umapdb = tavor_umap_db_alloc(state->ts_instance, cq->cq_cqnum,
                    MLNX_UMAP_CQMEM_RSRC, (uint64_t)(uintptr_t)rsrc);
                if (umapdb == NULL) {
                        /* Set "status" and "errormsg" and goto failure */
                        TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umap add");
                        goto cqalloc_fail3;
                }
        }

        /*
         * Calculate the appropriate size for the completion queue.
         * Note:  All Tavor CQs must be a power-of-2 minus 1 in size.  Also
         * they may not be any smaller than TAVOR_CQ_MIN_SIZE.  This step is
         * to round the requested size up to the next highest power-of-2
         */
        cq_attr->cq_size = max(cq_attr->cq_size, TAVOR_CQ_MIN_SIZE);
        log_cq_size = highbit(cq_attr->cq_size);
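
        /*
         * Note: highbit() returns the 1-based position of the most
         * significant set bit, so (1 << log_cq_size) is always strictly
         * greater than cq_attr->cq_size.  The usable size of the queue,
         * (2^log_cq_size - 1), therefore always covers the request: e.g.
         * a request for 256 entries yields log_cq_size = 9 (512 slots,
         * 511 usable), while a request for 255 yields log_cq_size = 8
         * (256 slots, 255 usable).
         */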

        /*
         * Next we verify that the rounded-up size is valid (i.e. consistent
         * with the device limits and/or software-configured limits)
         */
        if (log_cq_size > state->ts_cfg_profile->cp_log_max_cq_sz) {
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_HCA_CQ_EXCEEDED, "max CQ size");
                goto cqalloc_fail4;
        }

        /*
         * Allocate the memory for Completion Queue.
         *
         * Note: Although we use the common queue allocation routine, we
         * always specify TAVOR_QUEUE_LOCATION_NORMAL (i.e. CQ located in
         * kernel system memory) for kernel CQs because it would be
         * inefficient to have CQs located in DDR memory.  This is primarily
         * because CQs are read from (by software) more than they are written
         * to. (We always specify TAVOR_QUEUE_LOCATION_USERLAND for all
         * user-mappable CQs for a similar reason.)
         * It is also worth noting that, unlike Tavor QP work queues,
         * completion queues do not have the same strict alignment
         * requirements.  It is sufficient for the CQ memory to be both
         * aligned to and bound to addresses which are a multiple of CQE size.
         */
        cq->cq_cqinfo.qa_size = (1 << log_cq_size) * sizeof (tavor_hw_cqe_t);
        cq->cq_cqinfo.qa_alloc_align = sizeof (tavor_hw_cqe_t);
        cq->cq_cqinfo.qa_bind_align  = sizeof (tavor_hw_cqe_t);
        if (cq->cq_is_umap) {
                cq->cq_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
        } else {
                cq->cq_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_NORMAL;
        }
        status = tavor_queue_alloc(state, &cq->cq_cqinfo, sleepflag);
        if (status != DDI_SUCCESS) {
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed completion queue");
                goto cqalloc_fail4;
        }
        buf = (tavor_hw_cqe_t *)cq->cq_cqinfo.qa_buf_aligned;
        _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))

        /*
         * Initialize each of the Completion Queue Entries (CQE) by setting
         * their ownership to hardware ("owner" bit set to HW).  This is in
         * preparation for the final transfer of ownership (below) of the
         * CQ context itself.
         */
        for (i = 0; i < (1 << log_cq_size); i++) {
                TAVOR_CQE_OWNER_SET_HW(cq, &buf[i]);
        }

        /*
         * Register the memory for the CQ.  The memory for the CQ must
         * be registered in the Tavor TPT tables.  This gives us the LKey
         * to specify in the CQ context below.  Note: If this is a user-
         * mappable CQ, then we will force DDI_DMA_CONSISTENT mapping.
         */
        flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP : IBT_MR_NOSLEEP;
        mr_attr.mr_vaddr = (uint64_t)(uintptr_t)buf;
        mr_attr.mr_len   = cq->cq_cqinfo.qa_size;
        mr_attr.mr_as    = NULL;
        mr_attr.mr_flags = flag | IBT_MR_ENABLE_LOCAL_WRITE;
        if (cq->cq_is_umap) {
                dma_xfer_mode = DDI_DMA_CONSISTENT;
        } else {
                dma_xfer_mode = state->ts_cfg_profile->cp_streaming_consistent;
        }
        if (dma_xfer_mode == DDI_DMA_STREAMING) {
                mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
        }
        op.mro_bind_type   = state->ts_cfg_profile->cp_iommu_bypass;
        op.mro_bind_dmahdl = cq->cq_cqinfo.qa_dmahdl;
        op.mro_bind_override_addr = 0;
        status = tavor_mr_register(state, pd, &mr_attr, &mr, &op);
        if (status != DDI_SUCCESS) {
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed register mr");
                goto cqalloc_fail5;
        }
        _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
        addr = mr->mr_bindinfo.bi_addr;
        lkey = mr->mr_lkey;

        /* Determine if later ddi_dma_sync will be necessary */
        cq_sync = TAVOR_CQ_IS_SYNC_REQ(state, cq->cq_cqinfo);
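
        /*
         * (Typically a sync is required only when the CQ memory was bound
         * with a non-coherent, i.e. DDI_DMA_STREAMING, mapping; the exact
         * test is encapsulated by the TAVOR_CQ_IS_SYNC_REQ() macro.)
         */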

        /* Sync entire CQ for use by the hardware (if necessary). */
        if (cq_sync) {
                (void) ddi_dma_sync(mr->mr_bindinfo.bi_dmahdl, 0,
                    cq->cq_cqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
        }

        /*
         * Fill in the CQC entry.  This is the final step before passing
         * ownership of the CQC entry to the Tavor hardware.  We use all of
         * the information collected/calculated above to fill in the
         * requisite portions of the CQC.  Note: If this CQ is going to be
         * used for userland access, then we need to set the UAR page number
         * appropriately (otherwise it's a "don't care")
         */
        bzero(&cqc_entry, sizeof (tavor_hw_cqc_t));
        cq->cq_eqnum            = TAVOR_CQ_EQNUM_GET(cq->cq_cqnum);
        cq->cq_erreqnum         = TAVOR_CQ_ERREQNUM_GET(cq->cq_cqnum);
        cqc_entry.xlat          = TAVOR_VA2PA_XLAT_ENABLED;
        cqc_entry.state         = TAVOR_CQ_DISARMED;
        cqc_entry.start_addr_h  = (addr >> 32);
        cqc_entry.start_addr_l  = (addr & 0xFFFFFFFF);
        cqc_entry.log_cq_sz     = log_cq_size;
        if (cq->cq_is_umap) {
                cqc_entry.usr_page = uarpg;
        } else {
                cqc_entry.usr_page = 0;
        }
        cqc_entry.pd            = pd->pd_pdnum;
        cqc_entry.lkey          = lkey;
        cqc_entry.e_eqn         = cq->cq_erreqnum;
        cqc_entry.c_eqn         = cq->cq_eqnum;
        cqc_entry.cqn           = cq->cq_cqnum;

        /*
         * Write the CQC entry to hardware.  Lastly, we pass ownership of
         * the entry to the hardware (using the Tavor SW2HW_CQ firmware
         * command).  Note: In general, this operation shouldn't fail.  But
         * if it does, we have to undo everything we've done above before
         * returning error.
         */
        status = tavor_cmn_ownership_cmd_post(state, SW2HW_CQ, &cqc_entry,
            sizeof (tavor_hw_cqc_t), cq->cq_cqnum, sleepflag);
        if (status != TAVOR_CMD_SUCCESS) {
                cmn_err(CE_CONT, "Tavor: SW2HW_CQ command failed: %08x\n",
                    status);
                TNF_PROBE_1(tavor_cq_alloc_sw2hw_cq_cmd_fail,
                    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(ibc_get_ci_failure(0), "tavor SW2HW_CQ command");
                goto cqalloc_fail6;
        }

        /*
         * Fill in the rest of the Tavor Completion Queue handle.  Having
         * successfully transferred ownership of the CQC, we can update the
         * following fields for use in further operations on the CQ.
         */
        cq->cq_cqcrsrcp   = cqc;
        cq->cq_rsrcp      = rsrc;
        cq->cq_consindx   = 0;
        cq->cq_buf        = buf;
        cq->cq_bufsz      = (1 << log_cq_size);
        cq->cq_mrhdl      = mr;
        cq->cq_sync       = cq_sync;
        cq->cq_refcnt     = 0;
        cq->cq_is_special = 0;
        cq->cq_uarpg      = uarpg;
        cq->cq_umap_dhp   = (devmap_cookie_t)NULL;
        avl_create(&cq->cq_wrid_wqhdr_avl_tree, tavor_wrid_wqhdr_compare,
            sizeof (struct tavor_workq_hdr_s),
            offsetof(struct tavor_workq_hdr_s, wq_avl_link));

        cq->cq_wrid_reap_head  = NULL;
        cq->cq_wrid_reap_tail  = NULL;
        cq->cq_hdlrarg         = (void *)ibt_cqhdl;

        /*
         * Put CQ handle in Tavor CQNum-to-CQHdl list.  Then fill in the
         * "actual_size" and "cqhdl" and return success
         */
        ASSERT(state->ts_cqhdl[cqc->tr_indx] == NULL);
        state->ts_cqhdl[cqc->tr_indx] = cq;

        /*
         * If this is a user-mappable CQ, then we need to insert the previously
         * allocated entry into the "userland resources database".  This will
         * allow for later lookup during devmap() (i.e. mmap()) calls.
         */
        if (cq->cq_is_umap) {
                tavor_umap_db_add(umapdb);
        }

        /*
         * Fill in the return arguments (if necessary).  This includes the
         * real completion queue size.
         */
        if (actual_size != NULL) {
                *actual_size = (1 << log_cq_size) - 1;
        }
        *cqhdl = cq;

        TAVOR_TNF_EXIT(tavor_cq_alloc);
        return (DDI_SUCCESS);

/*
 * The following is cleanup for all possible failure cases in this routine
 */
cqalloc_fail6:
        if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
            sleepflag) != DDI_SUCCESS) {
                TAVOR_WARNING(state, "failed to deregister CQ memory");
        }
cqalloc_fail5:
        tavor_queue_free(state, &cq->cq_cqinfo);
cqalloc_fail4:
        if (cq_is_umap) {
                tavor_umap_db_free(umapdb);
        }
cqalloc_fail3:
        tavor_rsrc_free(state, &rsrc);
cqalloc_fail2:
        tavor_rsrc_free(state, &cqc);
cqalloc_fail1:
        tavor_pd_refcnt_dec(pd);
cqalloc_fail:
        TNF_PROBE_1(tavor_cq_alloc_fail, TAVOR_TNF_ERROR, "",
            tnf_string, msg, errormsg);
        TAVOR_TNF_EXIT(tavor_cq_alloc);
        return (status);
}


/*
 * tavor_cq_free()
 *    Context: Can be called only from user or kernel context.
 */
/* ARGSUSED */
int
tavor_cq_free(tavor_state_t *state, tavor_cqhdl_t *cqhdl, uint_t sleepflag)
{
        tavor_rsrc_t            *cqc, *rsrc;
        tavor_umap_db_entry_t   *umapdb;
        tavor_hw_cqc_t          cqc_entry;
        tavor_pdhdl_t           pd;
        tavor_mrhdl_t           mr;
        tavor_cqhdl_t           cq;
        uint32_t                cqnum;
        uint64_t                value;
        uint_t                  maxprot;
        int                     status;

        TAVOR_TNF_ENTER(tavor_cq_free);

        /*
         * Pull all the necessary information from the Tavor Completion Queue
         * handle.  This is necessary here because the resource for the
         * CQ handle is going to be freed up as part of this operation.
         */
        cq      = *cqhdl;
        mutex_enter(&cq->cq_lock);
        cqc     = cq->cq_cqcrsrcp;
        rsrc    = cq->cq_rsrcp;
        pd      = state->ts_pdhdl_internal;
        mr      = cq->cq_mrhdl;
        cqnum   = cq->cq_cqnum;

        /*
         * If there are work queues still associated with the CQ, then return
         * an error.  Otherwise, we will be holding the CQ lock.
         */
        if (cq->cq_refcnt != 0) {
                mutex_exit(&cq->cq_lock);
                TNF_PROBE_1(tavor_cq_free_refcnt_fail, TAVOR_TNF_ERROR, "",
                    tnf_int, refcnt, cq->cq_refcnt);
                TAVOR_TNF_EXIT(tavor_cq_free);
                return (IBT_CQ_BUSY);
        }

        /*
         * If this was a user-mappable CQ, then we need to remove its entry
         * from the "userland resources database".  If it is also currently
         * mmap()'d out to a user process, then we need to call
         * devmap_devmem_remap() to remap the CQ memory to an invalid mapping.
         * We also need to invalidate the CQ tracking information for the
         * user mapping.
         */
        if (cq->cq_is_umap) {
                status = tavor_umap_db_find(state->ts_instance, cqnum,
                    MLNX_UMAP_CQMEM_RSRC, &value, TAVOR_UMAP_DB_REMOVE,
                    &umapdb);
                if (status != DDI_SUCCESS) {
                        mutex_exit(&cq->cq_lock);
                        TAVOR_WARNING(state, "failed to find in database");
                        TAVOR_TNF_EXIT(tavor_cq_free);
                        return (ibc_get_ci_failure(0));
                }
                tavor_umap_db_free(umapdb);
                if (cq->cq_umap_dhp != NULL) {
                        maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
                        status = devmap_devmem_remap(cq->cq_umap_dhp,
                            state->ts_dip, 0, 0, cq->cq_cqinfo.qa_size,
                            maxprot, DEVMAP_MAPPING_INVALID, NULL);
                        if (status != DDI_SUCCESS) {
                                mutex_exit(&cq->cq_lock);
                                TAVOR_WARNING(state, "failed in CQ memory "
                                    "devmap_devmem_remap()");
                                TAVOR_TNF_EXIT(tavor_cq_free);
                                return (ibc_get_ci_failure(0));
                        }
                        cq->cq_umap_dhp = (devmap_cookie_t)NULL;
                }
        }

        /*
         * Put NULL into the Tavor CQNum-to-CQHdl list.  This will allow any
         * in-progress events to detect that the CQ corresponding to this
         * number has been freed.
         */
        state->ts_cqhdl[cqc->tr_indx] = NULL;

        /*
         * While we hold the CQ lock, do a "forced reap" of the workQ WRID
         * list.  This cleans up all the structures associated with the WRID
         * processing for this CQ.  Once we complete, drop the lock and finish
         * the deallocation of the CQ.
         */
        tavor_wrid_cq_force_reap(cq);

        mutex_exit(&cq->cq_lock);
        _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*cq))

        /*
         * Reclaim CQC entry from hardware (using the Tavor HW2SW_CQ
         * firmware command).  If the ownership transfer fails for any reason,
         * then it is an indication that something (either in HW or SW) has
         * gone seriously wrong.
         */
        status = tavor_cmn_ownership_cmd_post(state, HW2SW_CQ, &cqc_entry,
            sizeof (tavor_hw_cqc_t), cqnum, sleepflag);
        if (status != TAVOR_CMD_SUCCESS) {
                TAVOR_WARNING(state, "failed to reclaim CQC ownership");
                cmn_err(CE_CONT, "Tavor: HW2SW_CQ command failed: %08x\n",
                    status);
                TNF_PROBE_1(tavor_cq_free_hw2sw_cq_cmd_fail,
                    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
                TAVOR_TNF_EXIT(tavor_cq_free);
                return (ibc_get_ci_failure(0));
        }

        /*
         * Deregister the memory for the Completion Queue.  If this fails
         * for any reason, then it is an indication that something (either
         * in HW or SW) has gone seriously wrong.  So we print a warning
         * message and return.
         */
        status = tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
            sleepflag);
        if (status != DDI_SUCCESS) {
                TAVOR_WARNING(state, "failed to deregister CQ memory");
                TNF_PROBE_0(tavor_cq_free_dereg_mr_fail, TAVOR_TNF_ERROR, "");
                TAVOR_TNF_EXIT(tavor_cq_free);
                return (ibc_get_ci_failure(0));
        }

        /* Free the memory for the CQ */
        tavor_queue_free(state, &cq->cq_cqinfo);

        /* Free the Tavor Completion Queue handle */
        tavor_rsrc_free(state, &rsrc);

        /* Free up the CQC entry resource */
        tavor_rsrc_free(state, &cqc);

        /* Decrement the reference count on the protection domain (PD) */
        tavor_pd_refcnt_dec(pd);

        /* Set the cqhdl pointer to NULL and return success */
        *cqhdl = NULL;

        TAVOR_TNF_EXIT(tavor_cq_free);
        return (DDI_SUCCESS);
}


/*
 * tavor_cq_resize()
 *    Context: Can be called only from user or kernel context.
 */
int
tavor_cq_resize(tavor_state_t *state, tavor_cqhdl_t cq, uint_t req_size,
    uint_t *actual_size, uint_t sleepflag)
{
        tavor_hw_cqc_t          cqc_entry;
        tavor_qalloc_info_t     new_cqinfo, old_cqinfo;
        ibt_mr_attr_t           mr_attr;
        tavor_mr_options_t      op;
        tavor_pdhdl_t           pd;
        tavor_mrhdl_t           mr, mr_old;
        tavor_hw_cqe_t          *buf;
        uint32_t                new_prod_indx, old_cons_indx;
        uint_t                  dma_xfer_mode, cq_sync, log_cq_size, maxprot;
        int                     status, i, flag;
        char                    *errormsg;

        TAVOR_TNF_ENTER(tavor_cq_resize);

        /* Use the internal protection domain (PD) for CQs */
        pd = state->ts_pdhdl_internal;

        /*
         * Calculate the appropriate size for the new resized completion queue.
         * Note:  All Tavor CQs must be a power-of-2 minus 1 in size.  Also
         * they may not be any smaller than TAVOR_CQ_MIN_SIZE.  This step is
         * to round the requested size up to the next highest power-of-2
         */
        req_size = max(req_size, TAVOR_CQ_MIN_SIZE);
        log_cq_size = highbit(req_size);

        /*
         * Next we verify that the rounded-up size is valid (i.e. consistent
         * with the device limits and/or software-configured limits)
         */
        if (log_cq_size > state->ts_cfg_profile->cp_log_max_cq_sz) {
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_HCA_CQ_EXCEEDED, "max CQ size");
                goto cqresize_fail;
        }

        /*
         * Allocate the memory for newly resized Completion Queue.
         *
         * Note: Although we use the common queue allocation routine, we
         * always specify TAVOR_QUEUE_LOCATION_NORMAL (i.e. CQ located in
         * kernel system memory) for kernel CQs because it would be
         * inefficient to have CQs located in DDR memory.  This is the same
         * as we do when we first allocate completion queues primarily
         * because CQs are read from (by software) more than they are written
         * to. (We always specify TAVOR_QUEUE_LOCATION_USERLAND for all
         * user-mappable CQs for a similar reason.)
         * It is also worth noting that, unlike Tavor QP work queues,
         * completion queues do not have the same strict alignment
         * requirements.  It is sufficient for the CQ memory to be both
         * aligned to and bound to addresses which are a multiple of CQE size.
         */
        new_cqinfo.qa_size = (1 << log_cq_size) * sizeof (tavor_hw_cqe_t);
        new_cqinfo.qa_alloc_align = sizeof (tavor_hw_cqe_t);
        new_cqinfo.qa_bind_align  = sizeof (tavor_hw_cqe_t);
        if (cq->cq_is_umap) {
                new_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
        } else {
                new_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_NORMAL;
        }
        status = tavor_queue_alloc(state, &new_cqinfo, sleepflag);
        if (status != DDI_SUCCESS) {
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed completion queue");
                goto cqresize_fail;
        }
        buf = (tavor_hw_cqe_t *)new_cqinfo.qa_buf_aligned;
        _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))

        /*
         * Initialize each of the Completion Queue Entries (CQE) by setting
         * their ownership to hardware ("owner" bit set to HW).  This is in
         * preparation for the final resize operation (below).
         */
        for (i = 0; i < (1 << log_cq_size); i++) {
                TAVOR_CQE_OWNER_SET_HW(cq, &buf[i]);
        }

        /*
         * Register the memory for the CQ.  The memory for the CQ must
         * be registered in the Tavor TPT tables.  This gives us the LKey
         * to specify in the CQ context below.
         */
        flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP : IBT_MR_NOSLEEP;
        mr_attr.mr_vaddr = (uint64_t)(uintptr_t)buf;
        mr_attr.mr_len   = new_cqinfo.qa_size;
        mr_attr.mr_as    = NULL;
        mr_attr.mr_flags = flag | IBT_MR_ENABLE_LOCAL_WRITE;
        if (cq->cq_is_umap) {
                dma_xfer_mode = DDI_DMA_CONSISTENT;
        } else {
                dma_xfer_mode = state->ts_cfg_profile->cp_streaming_consistent;
        }
        if (dma_xfer_mode == DDI_DMA_STREAMING) {
                mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
        }
        op.mro_bind_type = state->ts_cfg_profile->cp_iommu_bypass;
        op.mro_bind_dmahdl = new_cqinfo.qa_dmahdl;
        op.mro_bind_override_addr = 0;
        status = tavor_mr_register(state, pd, &mr_attr, &mr, &op);
        if (status != DDI_SUCCESS) {
                tavor_queue_free(state, &new_cqinfo);
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed register mr");
                goto cqresize_fail;
        }
        _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))

        /* Determine if later ddi_dma_sync will be necessary */
        cq_sync = TAVOR_CQ_IS_SYNC_REQ(state, new_cqinfo);

        /* Sync entire "new" CQ for use by hardware (if necessary) */
        if (cq_sync) {
                (void) ddi_dma_sync(mr->mr_bindinfo.bi_dmahdl, 0,
                    new_cqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
        }

        /*
         * Now we grab the CQ lock.  Since we will be updating the actual
         * CQ location and the producer/consumer indexes, we should hold
         * the lock.
         *
         * We do a TAVOR_NOSLEEP here (and below), though, because we are
         * holding the "cq_lock" and if we got raised to interrupt level
         * by priority inversion, we would not want to block in this routine
         * waiting for success.
         */
        mutex_enter(&cq->cq_lock);

        /*
         * Determine the current CQ "consumer index".
         *
         * Note:  This will depend on whether the CQ had previously been
         * mapped for user access or whether it is a kernel CQ.  If this
         * is a kernel CQ, then all PollCQ() operations have come through
         * the IBTF and, hence, the driver's CQ state structure will
         * contain the current consumer index.  If, however, the user has
         * accessed this CQ by bypassing the driver (OS-bypass), then we
         * need to query the firmware to determine the current CQ consumer
         * index.  This also assumes that the user process will not continue
         * to consume entries while at the same time doing the ResizeCQ()
         * operation.  If the user process does not guarantee this, then it
         * may see duplicate or missed completions.  But under no
         * circumstances should this panic the system.
         */
        if (cq->cq_is_umap) {
                status = tavor_cmn_query_cmd_post(state, QUERY_CQ,
                    cq->cq_cqnum, &cqc_entry, sizeof (tavor_hw_cqc_t),
                    TAVOR_NOSLEEP);
                if (status != TAVOR_CMD_SUCCESS) {
                        /* Query CQ has failed, drop CQ lock and cleanup */
                        mutex_exit(&cq->cq_lock);
                        if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
                            sleepflag) != DDI_SUCCESS) {
                                TAVOR_WARNING(state, "failed to deregister "
                                    "CQ memory");
                        }
                        tavor_queue_free(state, &new_cqinfo);
                        TAVOR_WARNING(state, "failed to query CQ context");

                        /* Set "status" and "errormsg" and goto failure */
                        TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
                            "failed QUERY_CQ command");
                        goto cqresize_fail;
                }
                old_cons_indx = cqc_entry.cons_indx;
        } else {
                old_cons_indx = cq->cq_consindx;
        }

        /*
         * Fill in the CQC entry.  For the resize operation this is the
         * final step before attempting the resize operation on the CQC entry.
         * We use all of the information collected/calculated above to fill
         * in the requisite portions of the CQC.
         */
        bzero(&cqc_entry, sizeof (tavor_hw_cqc_t));
        cqc_entry.start_addr_h  = (mr->mr_bindinfo.bi_addr >> 32);
        cqc_entry.start_addr_l  = (mr->mr_bindinfo.bi_addr & 0xFFFFFFFF);
        cqc_entry.log_cq_sz     = log_cq_size;
        cqc_entry.lkey          = mr->mr_lkey;

        /*
         * Write the CQC entry to hardware.  Lastly, we pass ownership of
         * the entry to the hardware (using the Tavor RESIZE_CQ firmware
         * command).  Note: In general, this operation shouldn't fail.  But
         * if it does, we have to undo everything we've done above before
         * returning error.  Also note that the status returned may indicate
         * the code to return to the IBTF.
         */
        status = tavor_resize_cq_cmd_post(state, &cqc_entry, cq->cq_cqnum,
            &new_prod_indx, TAVOR_CMD_NOSLEEP_SPIN);
        if (status != TAVOR_CMD_SUCCESS) {
                /* Resize attempt has failed, drop CQ lock and cleanup */
                mutex_exit(&cq->cq_lock);
                if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
                    sleepflag) != DDI_SUCCESS) {
                        TAVOR_WARNING(state, "failed to deregister CQ memory");
                }
                tavor_queue_free(state, &new_cqinfo);
                if (status == TAVOR_CMD_BAD_SIZE) {
                        TAVOR_TNF_EXIT(tavor_cq_resize);
                        return (IBT_CQ_SZ_INSUFFICIENT);
                } else {
                        cmn_err(CE_CONT, "Tavor: RESIZE_CQ command failed: "
                            "%08x\n", status);
                        TNF_PROBE_1(tavor_cq_resize_cq_cmd_fail,
                            TAVOR_TNF_ERROR, "", tnf_uint, status, status);
                        TAVOR_TNF_EXIT(tavor_cq_resize);
                        return (ibc_get_ci_failure(0));
                }
        }

        /*
         * The CQ resize attempt was successful.  Before dropping the CQ lock,
         * copy all of the CQEs from the "old" CQ into the "new" CQ.  Note:
         * the Tavor firmware guarantees us that sufficient space is set aside
         * in the "new" CQ to handle any un-polled CQEs from the "old" CQ.
         * The two parameters to this helper function ("old_cons_indx" and
         * "new_prod_indx") essentially indicate the starting index and number
         * of any CQEs that might remain in the "old" CQ memory.
         */
        tavor_cq_resize_helper(cq, buf, old_cons_indx, new_prod_indx);

        /* Sync entire "new" CQ for use by hardware (if necessary) */
        if (cq_sync) {
                (void) ddi_dma_sync(mr->mr_bindinfo.bi_dmahdl, 0,
                    new_cqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
        }

        /*
         * Update the Tavor Completion Queue handle with all the new
         * information.  At the same time, save away all the necessary
         * information for freeing up the old resources
         */
        mr_old          = cq->cq_mrhdl;
        old_cqinfo      = cq->cq_cqinfo;
        cq->cq_cqinfo   = new_cqinfo;
        cq->cq_consindx = 0;
        cq->cq_buf      = buf;
        cq->cq_bufsz    = (1 << log_cq_size);
        cq->cq_mrhdl    = mr;
        cq->cq_sync     = cq_sync;

        /*
         * If "old" CQ was a user-mappable CQ that is currently mmap()'d out
         * to a user process, then we need to call devmap_devmem_remap() to
         * invalidate the mapping to the CQ memory.  We also need to
         * invalidate the CQ tracking information for the user mapping.
         */
        if ((cq->cq_is_umap) && (cq->cq_umap_dhp != NULL)) {
                maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
                status = devmap_devmem_remap(cq->cq_umap_dhp,
                    state->ts_dip, 0, 0, cq->cq_cqinfo.qa_size, maxprot,
                    DEVMAP_MAPPING_INVALID, NULL);
                if (status != DDI_SUCCESS) {
                        mutex_exit(&cq->cq_lock);
                        TAVOR_WARNING(state, "failed in CQ memory "
                            "devmap_devmem_remap()");
                        TAVOR_TNF_EXIT(tavor_cq_resize);
                        return (ibc_get_ci_failure(0));
                }
                cq->cq_umap_dhp = (devmap_cookie_t)NULL;
        }

        /*
         * Drop the CQ lock now.  The only thing left to do is to free up
         * the old resources.
         */
        mutex_exit(&cq->cq_lock);

        /*
         * Deregister the memory for the old Completion Queue.  Note: We
         * really can't return error here because we have no good way to
         * cleanup.  Plus, the deregistration really shouldn't ever fail.
         * So, if it does, it is an indication that something has gone
         * seriously wrong.  So we print a warning message and return error
         * (knowing, of course, that the "old" CQ memory will be leaked)
         */
        status = tavor_mr_deregister(state, &mr_old, TAVOR_MR_DEREG_ALL,
            sleepflag);
        if (status != DDI_SUCCESS) {
                TAVOR_WARNING(state, "failed to deregister old CQ memory");
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
                    "failed deregister mr (old)");
                goto cqresize_fail;
        }

        /* Free the memory for the old CQ */
        tavor_queue_free(state, &old_cqinfo);

        /*
         * Fill in the return arguments (if necessary).  This includes the
         * real new completion queue size.
         */
        if (actual_size != NULL) {
                *actual_size = (1 << log_cq_size) - 1;
        }

        TAVOR_TNF_EXIT(tavor_cq_resize);
        return (DDI_SUCCESS);

cqresize_fail:
        TNF_PROBE_1(tavor_cq_resize_fail, TAVOR_TNF_ERROR, "",
            tnf_string, msg, errormsg);
        TAVOR_TNF_EXIT(tavor_cq_resize);
        return (status);
}


/*
 * tavor_cq_notify()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_cq_notify(tavor_state_t *state, tavor_cqhdl_t cq,
    ibt_cq_notify_flags_t flags)
{
        uint_t          cqnum;

        TAVOR_TNF_ENTER(tavor_cq_notify);

        /*
         * Determine if we are trying to get the next completion or the next
         * "solicited" completion.  Then hit the appropriate doorbell.
         *
         * NOTE: Please see the comment in tavor_event.c:tavor_eq_poll
         * regarding why we do not have to do an extra PIO read here, and we
         * will not lose an event after writing this doorbell.
         */
        cqnum = cq->cq_cqnum;
        if (flags == IBT_NEXT_COMPLETION) {
                tavor_cq_doorbell(state, TAVOR_CQDB_NOTIFY_CQ, cqnum,
                    TAVOR_CQDB_DEFAULT_PARAM);

        } else if (flags == IBT_NEXT_SOLICITED) {
                tavor_cq_doorbell(state, TAVOR_CQDB_NOTIFY_CQ_SOLICIT,
                    cqnum, TAVOR_CQDB_DEFAULT_PARAM);

        } else {
                TNF_PROBE_1(tavor_cq_notify_invflags_fail, TAVOR_TNF_ERROR, "",
                    tnf_int, flags, flags);
                TAVOR_TNF_EXIT(tavor_cq_notify);
                return (IBT_CQ_NOTIFY_TYPE_INVALID);
        }

        TAVOR_TNF_EXIT(tavor_cq_notify);
        return (DDI_SUCCESS);
}


/*
 * tavor_cq_poll()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_cq_poll(tavor_state_t *state, tavor_cqhdl_t cq, ibt_wc_t *wc_p,
    uint_t num_wc, uint_t *num_polled)
{
        tavor_hw_cqe_t  *cqe;
        uint32_t        cons_indx, wrap_around_mask;
        uint32_t        polled_cnt, num_to_increment;
        int             status;

        TAVOR_TNF_ENTER(tavor_cq_poll);

        /*
         * Check for user-mappable CQ memory.  Note:  We do not allow kernel
         * clients to poll CQ memory that is accessible directly by the user.
         * If the CQ memory is user accessible, then return an error.
         */
        if (cq->cq_is_umap) {
                TNF_PROBE_0(tavor_cq_poll_inv_usrmapped_type,
                    TAVOR_TNF_ERROR, "");
                TAVOR_TNF_EXIT(tavor_cq_poll);
                return (IBT_CQ_HDL_INVALID);
        }

        mutex_enter(&cq->cq_lock);

        /* Get the consumer index */
        cons_indx = cq->cq_consindx;

        /*
         * Calculate the wrap around mask.  Note: This operation only works
         * because all Tavor completion queues have power-of-2 sizes
         */
        wrap_around_mask = (cq->cq_bufsz - 1);
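
        /*
         * Because the queue size is a power of 2, masking with
         * (cq_bufsz - 1) is equivalent to taking the index modulo the
         * queue size.  For example, with cq_bufsz of 256 the mask is
         * 0xFF, so index 255 advances to (255 + 1) & 0xFF == 0, wrapping
         * back to the start of the queue.
         */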

        /* Calculate the pointer to the first CQ entry */
        cqe = &cq->cq_buf[cons_indx];

        /* Sync the current CQE to read */
        tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);

        /*
         * Keep pulling entries from the CQ until we find an entry owned by
         * the hardware.  As long as there are CQEs owned by SW, process
         * each entry by calling tavor_cq_cqe_consume() and updating the CQ
         * consumer index.  Note:  We only update the consumer index if
         * tavor_cq_cqe_consume() returns TAVOR_CQ_SYNC_AND_DB.  Otherwise,
         * it indicates that we are going to "recycle" the CQE (probably
         * because it is an error CQE and corresponds to more than one
         * completion).
         */
        polled_cnt = 0;
        while (TAVOR_CQE_OWNER_IS_SW(cq, cqe)) {
                status = tavor_cq_cqe_consume(state, cq, cqe,
                    &wc_p[polled_cnt++]);
                if (status == TAVOR_CQ_SYNC_AND_DB) {
                        /* Reset entry to hardware ownership */
                        TAVOR_CQE_OWNER_SET_HW(cq, cqe);

                        /* Sync the current CQE for device */
                        tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORDEV);

                        /* Increment the consumer index */
                        cons_indx = (cons_indx + 1) & wrap_around_mask;

                        /* Update the pointer to the next CQ entry */
                        cqe = &cq->cq_buf[cons_indx];

                        /* Sync the next CQE to read */
                        tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);
                }

                /*
                 * If we have run out of space to store work completions,
                 * then stop and return the ones we have pulled off the CQ.
                 */
                if (polled_cnt >= num_wc) {
                        break;
                }
        }

        /*
         * Now we only ring the doorbell (to update the consumer index) if
         * we've actually consumed a CQ entry.  If we have, for example,
         * pulled from a CQE that we are still in the process of "recycling"
         * for error purposes, then we would not update the consumer index.
         */
        if ((polled_cnt != 0) && (cq->cq_consindx != cons_indx)) {
                /*
                 * Post doorbell to update the consumer index.  Doorbell
                 * value indicates number of entries consumed (minus 1)
                 */
                if (cons_indx > cq->cq_consindx) {
                        num_to_increment = (cons_indx - cq->cq_consindx) - 1;
                } else {
                        num_to_increment = ((cons_indx + cq->cq_bufsz) -
                            cq->cq_consindx) - 1;
                }
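                /*
                 * Worked example: with cq_bufsz of 8, moving the consumer
                 * index from 6 to 2 wraps past the end of the queue, so
                 * the doorbell value is ((2 + 8) - 6) - 1 = 3, telling the
                 * hardware that four entries were consumed.
                 */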
                cq->cq_consindx = cons_indx;
                tavor_cq_doorbell(state, TAVOR_CQDB_INCR_CONSINDX,
                    cq->cq_cqnum, num_to_increment);

        } else if (polled_cnt == 0) {
                /*
                 * If the CQ is empty, we can try to free up some of the WRID
                 * list containers.  See tavor_wr.c for more details on this
                 * operation.
                 */
                tavor_wrid_cq_reap(cq);
        }

        mutex_exit(&cq->cq_lock);

        /* Set "num_polled" (if necessary) */
        if (num_polled != NULL) {
                *num_polled = polled_cnt;
        }

        /* Set CQ_EMPTY condition if needed, otherwise return success */
        if (polled_cnt == 0) {
                status = IBT_CQ_EMPTY;
        } else {
                status = DDI_SUCCESS;
        }

        /*
         * Check if the system is currently panicking.  If it is, then call
         * the Tavor interrupt service routine.  This step is necessary here
         * because we might be in a polled I/O mode and without the call to
         * tavor_isr() - and its subsequent calls to poll and rearm each
         * event queue - we might overflow our EQs and render the system
         * unable to sync/dump.
         */
        if (ddi_in_panic() != 0) {
                (void) tavor_isr((caddr_t)state, (caddr_t)NULL);
        }

        TAVOR_TNF_EXIT(tavor_cq_poll);
        return (status);
}


/*
 * tavor_cq_handler()
 *    Context: Only called from interrupt context
 */
int
tavor_cq_handler(tavor_state_t *state, tavor_eqhdl_t eq,
    tavor_hw_eqe_t *eqe)
{
        tavor_cqhdl_t           cq;
        uint_t                  cqnum;
        uint_t                  eqe_evttype;

        TAVOR_TNF_ENTER(tavor_cq_handler);

        eqe_evttype = TAVOR_EQE_EVTTYPE_GET(eq, eqe);

        ASSERT(eqe_evttype == TAVOR_EVT_COMPLETION ||
            eqe_evttype == TAVOR_EVT_EQ_OVERFLOW);

        if (eqe_evttype == TAVOR_EVT_EQ_OVERFLOW) {
                TNF_PROBE_0(tavor_cq_handler_eq_overflow_condition,
                    TAVOR_TNF_ERROR, "");
                tavor_eq_overflow_handler(state, eq, eqe);

                TAVOR_TNF_EXIT(tavor_cq_handler);
                return (DDI_FAILURE);
        }

        /* Get the CQ handle from CQ number in event descriptor */
        cqnum = TAVOR_EQE_CQNUM_GET(eq, eqe);
        cq = tavor_cqhdl_from_cqnum(state, cqnum);

        /*
         * Post the EQ doorbell to move the CQ to the "disarmed" state.
         * This operation is to enable subsequent CQ doorbells (e.g. those
         * that can be rung by tavor_cq_notify() above) to rearm the CQ.
         */
        tavor_eq_doorbell(state, TAVOR_EQDB_DISARM_CQ, eq->eq_eqnum, cqnum);

        /*
         * If the CQ handle is NULL, this is probably an indication
         * that the CQ has been freed already.  In which case, we
         * should not deliver this event.
         *
         * We also check that the CQ number in the handle is the
         * same as the CQ number in the event queue entry.  This
         * extra check allows us to handle the case where a CQ was
         * freed and then allocated again in the time it took to
         * handle the event queue processing.  By constantly incrementing
         * the non-constrained portion of the CQ number every time
         * a new CQ is allocated, we mitigate (somewhat) the chance
         * that a stale event could be passed to the client's CQ
         * handler.
         *
         * Lastly, we check if "ts_ibtfpriv" is NULL.  If it is then it
         * means that we have either received this event before we
         * finished attaching to the IBTF or we've received it while we
         * are in the process of detaching.
         */
        if ((cq != NULL) && (cq->cq_cqnum == cqnum) &&
            (state->ts_ibtfpriv != NULL)) {
                TAVOR_DO_IBTF_CQ_CALLB(state, cq);
        } else {
                TNF_PROBE_2(tavor_cq_handler_dropped_event,
                    TAVOR_TNF_ERROR, "", tnf_uint, ev_cqnum, cqnum,
                    tnf_uint, hdl_cqnum, cqnum);
        }

        TAVOR_TNF_EXIT(tavor_cq_handler);
        return (DDI_SUCCESS);
}


/*
 * tavor_cq_err_handler()
 *    Context: Only called from interrupt context
 */
int
tavor_cq_err_handler(tavor_state_t *state, tavor_eqhdl_t eq,
    tavor_hw_eqe_t *eqe)
{
        tavor_cqhdl_t           cq;
        uint_t                  cqnum;
        ibc_async_event_t       event;
        ibt_async_code_t        type;
        uint_t                  eqe_evttype;

        TAVOR_TNF_ENTER(tavor_cq_err_handler);

        eqe_evttype = TAVOR_EQE_EVTTYPE_GET(eq, eqe);

        ASSERT(eqe_evttype == TAVOR_EVT_CQ_ERRORS ||
            eqe_evttype == TAVOR_EVT_EQ_OVERFLOW);

        if (eqe_evttype == TAVOR_EVT_EQ_OVERFLOW) {
                TNF_PROBE_0(tavor_cq_err_handler_eq_overflow_condition,
                    TAVOR_TNF_ERROR, "");
                tavor_eq_overflow_handler(state, eq, eqe);

                TAVOR_TNF_EXIT(tavor_cq_err_handler);
                return (DDI_FAILURE);
        }

        /* cmn_err(CE_CONT, "CQ Error handler\n"); */

        /* Get the CQ handle from CQ number in event descriptor */
        cqnum = TAVOR_EQE_CQNUM_GET(eq, eqe);
        cq = tavor_cqhdl_from_cqnum(state, cqnum);

        /*
         * If the CQ handle is NULL, this is probably an indication
         * that the CQ has been freed already.  In which case, we
         * should not deliver this event.
         *
         * We also check that the CQ number in the handle is the
         * same as the CQ number in the event queue entry.  This
         * extra check allows us to handle the case where a CQ was
         * freed and then allocated again in the time it took to
         * handle the event queue processing.  By constantly incrementing
         * the non-constrained portion of the CQ number every time
         * a new CQ is allocated, we mitigate (somewhat) the chance
         * that a stale event could be passed to the client's CQ
         * handler.
         *
         * And then we check if "ts_ibtfpriv" is NULL.  If it is then it
         * means that we have either received this event before we
1174          * finished attaching to the IBTF or we've received it while we
1175          * are in the process of detaching.
1176          */
1177         if ((cq != NULL) && (cq->cq_cqnum == cqnum) &&
1178             (state->ts_ibtfpriv != NULL)) {
1179                 event.ev_cq_hdl = (ibt_cq_hdl_t)cq->cq_hdlrarg;
1180                 type            = IBT_ERROR_CQ;
1181 
1182                 TAVOR_DO_IBTF_ASYNC_CALLB(state, type, &event);
1183         } else {
1184                 TNF_PROBE_2(tavor_cq_err_handler_dropped_event,
1185                     TAVOR_TNF_ERROR, "", tnf_uint, ev_cqnum, cqnum,
1186                     tnf_uint, hdl_cqnum, cqnum);
1187         }
1188 
1189         TAVOR_TNF_EXIT(tavor_cq_err_handler);
1190         return (DDI_SUCCESS);
1191 }
1192 
1193 
1194 /*
1195  * tavor_cq_refcnt_inc()
1196  *    Context: Can be called from interrupt or base context.
1197  */
1198 int
1199 tavor_cq_refcnt_inc(tavor_cqhdl_t cq, uint_t is_special)
1200 {
1201         /*
1202          * Increment the completion queue's reference count.  Note: In order
1203          * to ensure compliance with IBA C11-15, we must ensure that a given
1204          * CQ is not used for both special (SMI/GSI) QP and non-special QP.
1205          * This is accomplished here by keeping track of how the referenced
1206          * CQ is being used.
1207          */
1208         mutex_enter(&cq->cq_lock);
1209         TNF_PROBE_1_DEBUG(tavor_cq_refcnt_inc, TAVOR_TNF_TRACE, "",
1210             tnf_uint, refcnt, cq->cq_refcnt);
1211         if (cq->cq_refcnt == 0) {
1212                 cq->cq_is_special = is_special;
1213         } else {
1214                 if (cq->cq_is_special != is_special) {
1215                         mutex_exit(&cq->cq_lock);
1216                         return (DDI_FAILURE);
1217                 }
1218         }
1219         cq->cq_refcnt++;
1220         mutex_exit(&cq->cq_lock);
1221         return (DDI_SUCCESS);
1222 }
1223 
1224 
1225 /*
1226  * tavor_cq_refcnt_dec()
1227  *    Context: Can be called from interrupt or base context.
1228  */
1229 void
1230 tavor_cq_refcnt_dec(tavor_cqhdl_t cq)
1231 {
1232         /* Decrement the completion queue's reference count */
1233         mutex_enter(&cq->cq_lock);
1234         cq->cq_refcnt--;
1235         TNF_PROBE_1_DEBUG(tavor_cq_refcnt_dec, TAVOR_TNF_TRACE, "",
1236             tnf_uint, refcnt, cq->cq_refcnt);
1237         mutex_exit(&cq->cq_lock);
1238 }
1239 
1240 
1241 /*
1242  * tavor_cq_doorbell()
1243  *    Context: Can be called from interrupt or base context.
1244  */
1245 static void
1246 tavor_cq_doorbell(tavor_state_t *state, uint32_t cq_cmd, uint32_t cqn,
1247     uint32_t cq_param)
1248 {
1249         uint64_t        doorbell = 0;
1250 
1251         /* Build the doorbell from the parameters */
1252         doorbell = ((uint64_t)cq_cmd << TAVOR_CQDB_CMD_SHIFT) |
1253             ((uint64_t)cqn << TAVOR_CQDB_CQN_SHIFT) | cq_param;
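
        /*
         * Layout sketch (shift values below are assumed for illustration
         * and are not taken from tavor_hw.h):  if TAVOR_CQDB_CMD_SHIFT
         * were 56 and TAVOR_CQDB_CQN_SHIFT were 32, then cq_cmd would
         * occupy the top byte, cqn the next 24 bits, and cq_param the
         * low 32 bits of the 64-bit doorbell word written to the UAR
         * below.
         */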
1254 
1255         TNF_PROBE_1_DEBUG(tavor_cq_doorbell, TAVOR_TNF_TRACE, "",
1256             tnf_ulong, doorbell, doorbell);
1257 
1258         /* Write the doorbell to UAR */
1259         TAVOR_UAR_DOORBELL(state, (uint64_t *)&state->ts_uar->cq,
1260             doorbell);
1261 }
1262 
1263 
1264 /*
1265  * tavor_cqhdl_from_cqnum()
1266  *    Context: Can be called from interrupt or base context.
1267  *
1268  *    This routine is important because changing the unconstrained
1269  *    portion of the CQ number is critical to the detection of a
1270  *    potential race condition in the CQ handler code (i.e. the case
1271  *    where a CQ is freed and alloc'd again before an event for the
1272  *    "old" CQ can be handled).
1273  *
 *    While this is not a perfect solution (we're not sure that one
 *    exists), it does help to mitigate the chance that this race
 *    condition will cause us to deliver a "stale" event to the new CQ
 *    owner.  Note: this solution does not scale well because the number
 *    of constrained bits increases (and, hence, the number of
 *    unconstrained bits decreases) as the number of supported CQs
 *    grows.  For small and intermediate numbers of CQs, it should
 *    provide sufficient protection.
1282  */
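/*
 * Illustrative example (field widths assumed for illustration only):
 * with cp_log_num_cq == 16, the low 16 bits of a CQ number are the
 * "constrained" portion (they select the ts_cqhdl[] slot below), while
 * the remaining high bits are the "unconstrained" portion that is
 * incremented on each allocation.  Two CQs that reuse the same table
 * slot will therefore (usually) differ in their full CQ number, which
 * is what the (cq->cq_cqnum == cqnum) check in tavor_cq_err_handler()
 * relies upon to reject stale events.
 */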
1283 tavor_cqhdl_t
1284 tavor_cqhdl_from_cqnum(tavor_state_t *state, uint_t cqnum)
1285 {
1286         uint_t  cqindx, cqmask;
1287 
1288         /* Calculate the CQ table index from the cqnum */
1289         cqmask = (1 << state->ts_cfg_profile->cp_log_num_cq) - 1;
1290         cqindx = cqnum & cqmask;
1291         return (state->ts_cqhdl[cqindx]);
1292 }
1293 
1294 
1295 /*
1296  * tavor_cq_cqe_consume()
1297  *    Context: Can be called from interrupt or base context.
1298  */
1299 static int
1300 tavor_cq_cqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
1301     tavor_hw_cqe_t *cqe, ibt_wc_t *wc)
1302 {
1303         uint_t          flags, type, opcode, qpnum, qp1_indx;
1304         int             status;
1305 
1306         TAVOR_TNF_ENTER(tavor_cq_cqe_consume);
1307 
1308         /*
1309          * Determine if this is an "error" CQE by examining "opcode".  If it
1310          * is an error CQE, then call tavor_cq_errcqe_consume() and return
1311          * whatever status it returns.  Otherwise, this is a successful
1312          * completion.
1313          */
1314         opcode = TAVOR_CQE_OPCODE_GET(cq, cqe);
1315         if ((opcode == TAVOR_CQE_SEND_ERR_OPCODE) ||
1316             (opcode == TAVOR_CQE_RECV_ERR_OPCODE)) {
1317                 status = tavor_cq_errcqe_consume(state, cq, cqe, wc);
1318                 TAVOR_TNF_EXIT(tavor_cq_cqe_consume);
1319                 return (status);
1320         }
1321 
1322         /*
1323          * Fetch the Work Request ID using the information in the CQE.
1324          * See tavor_wr.c for more details.
1325          */
1326         wc->wc_id = tavor_wrid_get_entry(cq, cqe, NULL);
1327 
1328         /*
1329          * Parse the CQE opcode to determine completion type.  This will set
1330          * not only the type of the completion, but also any flags that might
1331          * be associated with it (e.g. whether immediate data is present).
1332          */
1333         flags = IBT_WC_NO_FLAGS;
1334         if (TAVOR_CQE_SENDRECV_GET(cq, cqe) != TAVOR_COMPLETION_RECV) {
1335 
1336                 /* Send CQE */
1337                 switch (opcode) {
1338                 case TAVOR_CQE_SND_RDMAWR_IMM:
1339                         flags |= IBT_WC_IMMED_DATA_PRESENT;
1340                         /* FALLTHROUGH */
1341                 case TAVOR_CQE_SND_RDMAWR:
1342                         type = IBT_WRC_RDMAW;
1343                         break;
1344 
1345                 case TAVOR_CQE_SND_SEND_IMM:
1346                         flags |= IBT_WC_IMMED_DATA_PRESENT;
1347                         /* FALLTHROUGH */
1348                 case TAVOR_CQE_SND_SEND:
1349                         type = IBT_WRC_SEND;
1350                         break;
1351 
1352                 case TAVOR_CQE_SND_RDMARD:
1353                         type = IBT_WRC_RDMAR;
1354                         break;
1355 
1356                 case TAVOR_CQE_SND_ATOMIC_CS:
1357                         type = IBT_WRC_CSWAP;
1358                         break;
1359 
1360                 case TAVOR_CQE_SND_ATOMIC_FA:
1361                         type = IBT_WRC_FADD;
1362                         break;
1363 
1364                 case TAVOR_CQE_SND_BIND_MW:
1365                         type = IBT_WRC_BIND;
1366                         break;
1367 
1368                 default:
1369                         TAVOR_WARNING(state, "unknown send CQE type");
1370                         wc->wc_status = IBT_WC_LOCAL_QP_OP_ERR;
1371                         TNF_PROBE_1(tavor_cq_cqe_consume_unknown_send_type,
1372                             TAVOR_TNF_ERROR, "", tnf_uint, opcode, opcode);
1373                         TAVOR_TNF_EXIT(tavor_cq_cqe_consume);
1374                         return (TAVOR_CQ_SYNC_AND_DB);
1375                 }
1376         } else {
1377 
1378                 /* Receive CQE */
1379                 switch (opcode & 0x1F) {
1380                 case TAVOR_CQE_RCV_RECV_IMM:
1381                         /* FALLTHROUGH */
1382                 case TAVOR_CQE_RCV_RECV_IMM2:
                        /*
                         * Note:  According to the Tavor PRM, all QP1 recv
                         * completions look like the result of a Send with
                         * Immediate.  They are not, however (MADs are Send
                         * Only), so we need to check the QP number and set
                         * the flag only if it is non-QP1.
                         */
1390                         qpnum    = TAVOR_CQE_QPNUM_GET(cq, cqe);
1391                         qp1_indx = state->ts_spec_qp1->tr_indx;
1392                         if ((qpnum < qp1_indx) || (qpnum > qp1_indx + 1)) {
1393                                 flags |= IBT_WC_IMMED_DATA_PRESENT;
1394                         }
1395                         /* FALLTHROUGH */
1396                 case TAVOR_CQE_RCV_RECV:
1397                         /* FALLTHROUGH */
1398                 case TAVOR_CQE_RCV_RECV2:
1399                         type = IBT_WRC_RECV;
1400                         break;
1401 
1402                 case TAVOR_CQE_RCV_RDMAWR_IMM:
1403                         /* FALLTHROUGH */
1404                 case TAVOR_CQE_RCV_RDMAWR_IMM2:
1405                         flags |= IBT_WC_IMMED_DATA_PRESENT;
1406                         type = IBT_WRC_RECV_RDMAWI;
1407                         break;
1408 
1409                 default:
1410                         TAVOR_WARNING(state, "unknown recv CQE type");
1411                         wc->wc_status = IBT_WC_LOCAL_QP_OP_ERR;
1412                         TNF_PROBE_1(tavor_cq_cqe_consume_unknown_rcv_type,
1413                             TAVOR_TNF_ERROR, "", tnf_uint, opcode, opcode);
1414                         TAVOR_TNF_EXIT(tavor_cq_cqe_consume);
1415                         return (TAVOR_CQ_SYNC_AND_DB);
1416                 }
1417         }
1418         wc->wc_type = type;
1419 
1420         /*
         * Check for GRH, update the flags, then fill in the "wc_flags"
         * field in the work completion
1423          */
1424         if (TAVOR_CQE_GRH_GET(cq, cqe) != 0) {
1425                 flags |= IBT_WC_GRH_PRESENT;
1426         }
1427         wc->wc_flags = flags;
1428 
1429         /* If we got here, completion status must be success */
1430         wc->wc_status = IBT_WC_SUCCESS;
1431 
1432         /*
1433          * Parse the remaining contents of the CQE into the work completion.
1434          * This means filling in SL, QP number, SLID, immediate data, etc.
1435          * Note:  Not all of these fields are valid in a given completion.
1436          * Many of them depend on the actual type of completion.  So we fill
1437          * in all of the fields and leave it up to the IBTF and consumer to
1438          * sort out which are valid based on their context.
1439          */
        wc->wc_sl         = TAVOR_CQE_SL_GET(cq, cqe);
        wc->wc_immed_data = TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cq, cqe);
        wc->wc_qpn        = TAVOR_CQE_DQPN_GET(cq, cqe);
        wc->wc_res_hash   = 0;
        wc->wc_slid       = TAVOR_CQE_DLID_GET(cq, cqe);
        wc->wc_ethertype  = (wc->wc_immed_data & 0xFFFF);
        wc->wc_pkey_ix    = (wc->wc_immed_data >> 16);
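
        /*
         * Worked example (value invented for illustration):  if
         * wc_immed_data were 0x00128100, then wc_pkey_ix would be 0x0012
         * (the high 16 bits) and wc_ethertype would be 0x8100 (the low
         * 16 bits), per the mask and shift above.
         */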
1447 
1448         /*
1449          * Depending on whether the completion was a receive or a send
1450          * completion, fill in "bytes transferred" as appropriate.  Also,
1451          * if necessary, fill in the "path bits" field.
1452          */
1453         if (TAVOR_CQE_SENDRECV_GET(cq, cqe) == TAVOR_COMPLETION_RECV) {
1454                 wc->wc_path_bits = TAVOR_CQE_PATHBITS_GET(cq, cqe);
1455                 wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cq, cqe);
1456 
1457         } else if ((wc->wc_type == IBT_WRC_RDMAR) ||
1458             (wc->wc_type == IBT_WRC_CSWAP) || (wc->wc_type == IBT_WRC_FADD)) {
1459                 wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cq, cqe);
1460         }
1461 
1462         TAVOR_TNF_EXIT(tavor_cq_cqe_consume);
1463         return (TAVOR_CQ_SYNC_AND_DB);
1464 }
1465 
1466 
1467 /*
1468  * tavor_cq_errcqe_consume()
1469  *    Context: Can be called from interrupt or base context.
1470  */
1471 static int
1472 tavor_cq_errcqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
1473     tavor_hw_cqe_t *cqe, ibt_wc_t *wc)
1474 {
1475         uint64_t                next_wqeaddr;
1476         uint32_t                imm_eth_pkey_cred;
1477         uint_t                  nextwqesize, dbd;
1478         uint_t                  doorbell_cnt, status;
1479         tavor_wrid_entry_t      wre;
1480 
1481         TAVOR_TNF_ENTER(tavor_cq_errcqe_consume);
1482 
1483         /*
1484          * Fetch the Work Request ID using the information in the CQE.
1485          * See tavor_wr.c for more details.
1486          */
1487         wc->wc_id = tavor_wrid_get_entry(cq, cqe, &wre);
1488 
1489         /*
1490          * Parse the CQE opcode to determine completion type.  We know that
1491          * the CQE is an error completion, so we extract only the completion
1492          * status here.
1493          */
1494         imm_eth_pkey_cred = TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cq, cqe);
1495         status = imm_eth_pkey_cred >> TAVOR_CQE_ERR_STATUS_SHIFT;
1496         switch (status) {
1497         case TAVOR_CQE_LOC_LEN_ERR:
1498                 status = IBT_WC_LOCAL_LEN_ERR;
1499                 break;
1500 
1501         case TAVOR_CQE_LOC_OP_ERR:
1502                 status = IBT_WC_LOCAL_QP_OP_ERR;
1503                 break;
1504 
1505         case TAVOR_CQE_LOC_PROT_ERR:
1506                 status = IBT_WC_LOCAL_PROTECT_ERR;
1507                 break;
1508 
1509         case TAVOR_CQE_WR_FLUSHED_ERR:
1510                 status = IBT_WC_WR_FLUSHED_ERR;
1511                 break;
1512 
1513         case TAVOR_CQE_MW_BIND_ERR:
1514                 status = IBT_WC_MEM_WIN_BIND_ERR;
1515                 break;
1516 
1517         case TAVOR_CQE_BAD_RESPONSE_ERR:
1518                 status = IBT_WC_BAD_RESPONSE_ERR;
1519                 break;
1520 
1521         case TAVOR_CQE_LOCAL_ACCESS_ERR:
1522                 status = IBT_WC_LOCAL_ACCESS_ERR;
1523                 break;
1524 
1525         case TAVOR_CQE_REM_INV_REQ_ERR:
1526                 status = IBT_WC_REMOTE_INVALID_REQ_ERR;
1527                 break;
1528 
1529         case TAVOR_CQE_REM_ACC_ERR:
1530                 status = IBT_WC_REMOTE_ACCESS_ERR;
1531                 break;
1532 
1533         case TAVOR_CQE_REM_OP_ERR:
1534                 status = IBT_WC_REMOTE_OP_ERR;
1535                 break;
1536 
1537         case TAVOR_CQE_TRANS_TO_ERR:
1538                 status = IBT_WC_TRANS_TIMEOUT_ERR;
1539                 break;
1540 
1541         case TAVOR_CQE_RNRNAK_TO_ERR:
1542                 status = IBT_WC_RNR_NAK_TIMEOUT_ERR;
1543                 break;
1544 
1545         /*
1546          * The following error codes are not supported in the Tavor driver
1547          * as they relate only to Reliable Datagram completion statuses:
1548          *    case TAVOR_CQE_LOCAL_RDD_VIO_ERR:
1549          *    case TAVOR_CQE_REM_INV_RD_REQ_ERR:
1550          *    case TAVOR_CQE_EEC_REM_ABORTED_ERR:
1551          *    case TAVOR_CQE_INV_EEC_NUM_ERR:
1552          *    case TAVOR_CQE_INV_EEC_STATE_ERR:
1553          *    case TAVOR_CQE_LOC_EEC_ERR:
1554          */
1555 
1556         default:
1557                 TAVOR_WARNING(state, "unknown error CQE status");
1558                 status = IBT_WC_LOCAL_QP_OP_ERR;
1559                 TNF_PROBE_1(tavor_cq_errcqe_consume_unknown_status,
1560                     TAVOR_TNF_ERROR, "", tnf_uint, status, status);
1561                 break;
1562         }
1563         wc->wc_status = status;
1564 
1565         /*
         * Now we do all the checking that's necessary to handle completion
         * queue entry "recycling."
1568          *
1569          * It is not necessary here to try to sync the WQE as we are only
1570          * attempting to read from the Work Queue (and hardware does not
1571          * write to it).
1572          */
1573 
        /*
         * We can get the doorbell info, the WQE address, and the size of
         * the next WQE from the "wre" (which was filled in above by the
         * call to the tavor_wrid_get_entry() routine).
         */
1579         dbd = (wre.wr_signaled_dbd & TAVOR_WRID_ENTRY_DOORBELLED) ? 1 : 0;
1580         next_wqeaddr = wre.wr_wqeaddrsz;
1581         nextwqesize  = wre.wr_wqeaddrsz & TAVOR_WQE_NDS_MASK;
1582 
1583         /*
1584          * Get the doorbell count from the CQE.  This indicates how many
1585          * completions this one CQE represents.
1586          */
1587         doorbell_cnt = imm_eth_pkey_cred & TAVOR_CQE_ERR_DBDCNT_MASK;
1588 
1589         /*
1590          * Determine if we're ready to consume this CQE yet or not.  If the
1591          * next WQE has size zero (i.e. no next WQE) or if the doorbell count
1592          * is down to zero, then this is the last/only completion represented
1593          * by the current CQE (return TAVOR_CQ_SYNC_AND_DB).  Otherwise, the
1594          * current CQE needs to be recycled (see below).
1595          */
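        /*
         * Illustrative walk-through (counts invented for illustration):
         * if this error CQE stands in for three outstanding WQEs on the
         * chain, the first two passes through this code recycle the CQE
         * (decrementing the doorbell count by "dbd" and advancing the
         * WQE address/size each time), and the final pass consumes it,
         * returning TAVOR_CQ_SYNC_AND_DB.
         */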
1596         if ((nextwqesize == 0) || ((doorbell_cnt == 0) && (dbd == 1))) {
1597                 /*
1598                  * Consume the CQE
1599                  *    Return status to indicate that doorbell and sync may be
1600                  *    necessary.
1601                  */
1602                 TAVOR_TNF_EXIT(tavor_cq_errcqe_consume);
1603                 return (TAVOR_CQ_SYNC_AND_DB);
1604 
1605         } else {
                /*
                 * Recycle the CQE for use in the next PollCQ() call.
                 *    Decrement the doorbell count, modify the error status,
                 *    and update the WQE address and size (to point to the
                 *    next WQE on the chain).  Put these updated entries back
                 *    into the CQE.
                 *    Despite the fact that we have updated the CQE, it is not
                 *    necessary for us to attempt to sync this entry just yet,
                 *    as we have not changed the "hardware's view" of the
                 *    entry (i.e. we have not modified the "owner" bit, which
                 *    is all that the Tavor hardware really cares about).
                 */
1618                 doorbell_cnt = doorbell_cnt - dbd;
1619                 TAVOR_CQE_IMM_ETH_PKEY_CRED_SET(cq, cqe,
1620                     ((TAVOR_CQE_WR_FLUSHED_ERR << TAVOR_CQE_ERR_STATUS_SHIFT) |
1621                     (doorbell_cnt & TAVOR_CQE_ERR_DBDCNT_MASK)));
1622                 TAVOR_CQE_WQEADDRSZ_SET(cq, cqe,
1623                     TAVOR_QP_WQEADDRSZ(next_wqeaddr, nextwqesize));
1624 
1625                 TAVOR_TNF_EXIT(tavor_cq_errcqe_consume);
1626                 return (TAVOR_CQ_RECYCLE_ENTRY);
1627         }
1628 }
1629 
1630 
1631 /*
1632  * tavor_cqe_sync()
1633  *    Context: Can be called from interrupt or base context.
1634  */
1635 static void
1636 tavor_cqe_sync(tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe, uint_t flag)
1637 {
1638         ddi_dma_handle_t        dmahdl;
1639         off_t                   offset;
1640         int                     status;
1641 
1642         TAVOR_TNF_ENTER(tavor_cqe_sync);
1643 
1644         /* Determine if CQ needs to be synced or not */
1645         if (cq->cq_sync == 0) {
1646                 TAVOR_TNF_EXIT(tavor_cqe_sync);
1647                 return;
1648         }
1649 
1650         /* Get the DMA handle from CQ context */
1651         dmahdl = cq->cq_mrhdl->mr_bindinfo.bi_dmahdl;
1652 
        /* Calculate the byte offset of this CQE within the CQ buffer */
1654         offset = (off_t)((uintptr_t)cqe - (uintptr_t)&cq->cq_buf[0]);
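        /*
         * For example (a 32-byte CQE size is assumed here purely for
         * illustration):  if "cqe" points at cq_buf[5], the offset above
         * works out to 5 * 32 == 160 bytes, so only this single entry is
         * handed to ddi_dma_sync() below.
         */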
1655         status = ddi_dma_sync(dmahdl, offset, sizeof (tavor_hw_cqe_t), flag);
1656         if (status != DDI_SUCCESS) {
1657                 TNF_PROBE_0(tavor_cqe_sync_getnextentry_fail,
1658                     TAVOR_TNF_ERROR, "");
1659                 TAVOR_TNF_EXIT(tavor_cqe_sync);
1660                 return;
1661         }
1662 
1663         TAVOR_TNF_EXIT(tavor_cqe_sync);
1664 }
1665 
1666 
1667 /*
1668  * tavor_cq_resize_helper()
1669  *    Context: Can be called only from user or kernel context.
1670  */
1671 static void
1672 tavor_cq_resize_helper(tavor_cqhdl_t cq, tavor_hw_cqe_t *new_cqbuf,
1673     uint32_t old_cons_indx, uint32_t num_newcqe)
1674 {
1675         tavor_hw_cqe_t  *old_cqe, *new_cqe;
1676         uint32_t        new_cons_indx, wrap_around_mask;
1677         int             i;
1678 
1679         TAVOR_TNF_ENTER(tavor_cq_resize_helper);
1680 
1681         ASSERT(MUTEX_HELD(&cq->cq_lock));
1682 
        /* Consumption of the "new" CQ starts from index zero */
        new_cons_indx = 0;
1685 
1686         /*
1687          * Calculate the wrap around mask.  Note: This operation only works
1688          * because all Tavor completion queues have power-of-2 sizes
1689          */
1690         wrap_around_mask = (cq->cq_bufsz - 1);
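
        /*
         * For example (size invented for illustration):  with cq_bufsz ==
         * 256, wrap_around_mask == 0xFF, so incrementing the consumer
         * index past the last entry ((255 + 1) & 0xFF) wraps it back to
         * zero without any conditional test.
         */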
1691 
1692         /*
1693          * Calculate the pointers to the first CQ entry (in the "old" CQ)
1694          * and the first CQ entry in the "new" CQ
1695          */
1696         old_cqe = &cq->cq_buf[old_cons_indx];
1697         new_cqe = &new_cqbuf[new_cons_indx];
1698 
1699         /* Sync entire "old" CQ for use by software (if necessary). */
1700         if (cq->cq_sync) {
1701                 (void) ddi_dma_sync(cq->cq_mrhdl->mr_bindinfo.bi_dmahdl,
1702                     0, cq->cq_cqinfo.qa_size, DDI_DMA_SYNC_FORCPU);
1703         }
1704 
        /*
         * Copy exactly "num_newcqe" entries (the outstanding software-owned
         * entries) from the "old" CQ into the "new" CQ, updating the
         * respective indices and pointers in both CQs as we go.
         */
1710         for (i = 0; i < num_newcqe; i++) {
1711 
1712                 /* Copy this old CQE into the "new_cqe" pointer */
1713                 bcopy(old_cqe, new_cqe, sizeof (tavor_hw_cqe_t));
1714 
1715                 /* Increment the consumer index (for both CQs) */
1716                 old_cons_indx = (old_cons_indx + 1) & wrap_around_mask;
1717                 new_cons_indx = (new_cons_indx + 1);
1718 
1719                 /* Update the pointer to the next CQ entry */
1720                 old_cqe = &cq->cq_buf[old_cons_indx];
1721                 new_cqe = &new_cqbuf[new_cons_indx];
1722         }
1723 
1724         TAVOR_TNF_EXIT(tavor_cq_resize_helper);
1725 }
1726 
1727 /*
1728  * tavor_cq_srq_entries_flush()
 *    Context: Can be called from interrupt or base context.
1730  */
1731 void
1732 tavor_cq_srq_entries_flush(tavor_state_t *state, tavor_qphdl_t qp)
1733 {
1734         tavor_cqhdl_t           cq;
1735         tavor_workq_hdr_t       *wqhdr;
1736         tavor_hw_cqe_t          *cqe;
1737         tavor_hw_cqe_t          *next_cqe;
1738         uint32_t                cons_indx, tail_cons_indx, wrap_around_mask;
1739         uint32_t                new_indx, check_indx, indx;
1740         uint32_t                num_to_increment;
1741         int                     cqe_qpnum, cqe_type;
1742         int                     outstanding_cqes, removed_cqes;
1743         int                     i;
1744 
1745         ASSERT(MUTEX_HELD(&qp->qp_rq_cqhdl->cq_lock));
1746 
1747         cq = qp->qp_rq_cqhdl;
1748         wqhdr = qp->qp_rq_wqhdr;
1749 
1750         ASSERT(wqhdr->wq_wrid_post != NULL);
1751         ASSERT(wqhdr->wq_wrid_post->wl_srq_en != 0);
1752 
        /*
         * Check for user-mapped CQ memory.  Note:  We do not allow kernel
         * clients to modify any user-mapped CQ.  If the CQ is user-mapped,
         * then we simply return here, and this "flush" function becomes a
         * NO-OP in this case.
         */
1759         if (cq->cq_is_umap) {
1760                 return;
1761         }
1762 
1763         /* Get the consumer index */
1764         cons_indx = cq->cq_consindx;
1765 
1766         /*
1767          * Calculate the wrap around mask.  Note: This operation only works
1768          * because all Tavor completion queues have power-of-2 sizes
1769          */
1770         wrap_around_mask = (cq->cq_bufsz - 1);
1771 
1772         /* Calculate the pointer to the first CQ entry */
1773         cqe = &cq->cq_buf[cons_indx];
1774 
1775         /* Sync the current CQE to read */
1776         tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);
1777 
1778         /*
1779          * Loop through the CQ looking for entries owned by software.  If an
1780          * entry is owned by software then we increment an 'outstanding_cqes'
1781          * count to know how many entries total we have on our CQ.  We use this
1782          * value further down to know how many entries to loop through looking
1783          * for our same QP number.
1784          */
1785         outstanding_cqes = 0;
1786         tail_cons_indx = cons_indx;
1787         while (TAVOR_CQE_OWNER_IS_SW(cq, cqe)) {
1788                 /* increment total cqes count */
1789                 outstanding_cqes++;
1790 
1791                 /* increment the consumer index */
1792                 tail_cons_indx = (tail_cons_indx + 1) & wrap_around_mask;
1793 
1794                 /* update the pointer to the next cq entry */
1795                 cqe = &cq->cq_buf[tail_cons_indx];
1796 
1797                 /* sync the next cqe to read */
1798                 tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);
1799         }
1800 
        /*
         * Using the 'tail_cons_indx' that was just set, we now know how
         * many CQEs there are in total.  Set 'check_indx' and 'new_indx'
         * to the last entry identified by 'tail_cons_indx'.
         */
1806         check_indx = new_indx = (tail_cons_indx - 1) & wrap_around_mask;
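
        /*
         * Sketch of the compaction below (entries invented for
         * illustration):  scanning backward from the tail, suppose the CQ
         * holds [A, B, C, D] (oldest to newest) and that B and D belong
         * to this SRQ's QP.  D and B are returned to the SRQ free list,
         * while C and then A are copied down toward the tail; afterward
         * the survivors occupy the two newest slots (in their original
         * order) and 'new_indx' has slid past the freed slots, which are
         * reset to hardware ownership further down.
         */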
1807 
1808         for (i = 0; i < outstanding_cqes; i++) {
1809                 cqe = &cq->cq_buf[check_indx];
1810 
1811                 /* Grab QP number from CQE */
1812                 cqe_qpnum = TAVOR_CQE_QPNUM_GET(cq, cqe);
1813                 cqe_type = TAVOR_CQE_SENDRECV_GET(cq, cqe);
1814 
1815                 /*
1816                  * If the QP number is the same in the CQE as the QP that we
1817                  * have on this SRQ, then we must free up the entry off the
1818                  * SRQ.  We also make sure that the completion type is of the
1819                  * 'TAVOR_COMPLETION_RECV' type.  So any send completions on
1820                  * this CQ will be left as-is.  The handling of returning
1821                  * entries back to HW ownership happens further down.
1822                  */
1823                 if (cqe_qpnum == qp->qp_qpnum &&
1824                     cqe_type == TAVOR_COMPLETION_RECV) {
1825 
1826                         /* Add back to SRQ free list */
1827                         (void) tavor_wrid_find_match_srq(wqhdr->wq_wrid_post,
1828                             cq, cqe);
1829                 } else {
1830                         /* Do Copy */
1831                         if (check_indx != new_indx) {
1832                                 next_cqe = &cq->cq_buf[new_indx];
1833 
1834                                 /*
1835                                  * Copy the CQE into the "next_cqe"
1836                                  * pointer.
1837                                  */
1838                                 bcopy(cqe, next_cqe, sizeof (tavor_hw_cqe_t));
1839                         }
1840                         new_indx = (new_indx - 1) & wrap_around_mask;
1841                 }
1842                 /* Move index to next CQE to check */
1843                 check_indx = (check_indx - 1) & wrap_around_mask;
1844         }
1845 
1846         /* Initialize removed cqes count */
1847         removed_cqes = 0;
1848 
1849         /* If an entry was removed */
1850         if (check_indx != new_indx) {
1851 
                /*
                 * At this point, all surviving entries have been copied
                 * toward the tail of the CQ, just above 'new_indx'.  The
                 * entry after 'new_indx' will become the new consumer
                 * index once we mark all of the freed entries as having
                 * HW ownership, which we do here.
                 */
1859 
                /*
                 * Loop through all entries from the old consumer index up
                 * to (and including) 'new_indx'.  Note: the termination
                 * test is on index equality rather than "indx <= new_indx"
                 * so that the loop still behaves correctly when the range
                 * wraps around the end of the CQ buffer.
                 */
                for (indx = cons_indx; ;
                    indx = (indx + 1) & wrap_around_mask) {
                        removed_cqes++;
                        cqe = &cq->cq_buf[indx];

                        /* Reset entry to hardware ownership */
                        TAVOR_CQE_OWNER_SET_HW(cq, cqe);

                        if (indx == new_indx)
                                break;
                }
1869         }
1870 
1871         /*
1872          * Update consumer index to be the 'new_indx'.  This moves it past all
1873          * removed entries.  Because 'new_indx' is pointing to the last
1874          * previously valid SW owned entry, we add 1 to point the cons_indx to
1875          * the first HW owned entry.
1876          */
1877         cons_indx = (new_indx + 1) & wrap_around_mask;
1878 
1879         /*
1880          * Now we only ring the doorbell (to update the consumer index) if
1881          * we've actually consumed a CQ entry.  If we found no QP number
1882          * matches above, then we would not have removed anything.  So only if
1883          * something was removed do we ring the doorbell.
1884          */
1885         if ((removed_cqes != 0) && (cq->cq_consindx != cons_indx)) {
1886                 /*
1887                  * Post doorbell to update the consumer index.  Doorbell
1888                  * value indicates number of entries consumed (minus 1)
1889                  */
1890                 if (cons_indx > cq->cq_consindx) {
1891                         num_to_increment = (cons_indx - cq->cq_consindx) - 1;
1892                 } else {
1893                         num_to_increment = ((cons_indx + cq->cq_bufsz) -
1894                             cq->cq_consindx) - 1;
1895                 }
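
                /*
                 * Worked example (indices invented for illustration):
                 * with cq_bufsz == 256, cq_consindx == 250, and a new
                 * cons_indx of 2, the wrapped branch above yields
                 * ((2 + 256) - 250) - 1 == 7, i.e. eight entries
                 * consumed, expressed as "count minus one" per the
                 * doorbell convention noted above.
                 */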
1896                 cq->cq_consindx = cons_indx;
1897 
1898                 tavor_cq_doorbell(state, TAVOR_CQDB_INCR_CONSINDX,
1899                     cq->cq_cqnum, num_to_increment);
1900         }
1901 }