1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * tavor_cq.c 29 * Tavor Completion Queue Processing Routines 30 * 31 * Implements all the routines necessary for allocating, freeing, resizing, 32 * and handling the completion type events that the Tavor hardware can 33 * generate. 
34 */ 35 36 #include <sys/types.h> 37 #include <sys/conf.h> 38 #include <sys/ddi.h> 39 #include <sys/sunddi.h> 40 #include <sys/modctl.h> 41 #include <sys/bitmap.h> 42 #include <sys/sysmacros.h> 43 44 #include <sys/ib/adapters/tavor/tavor.h> 45 46 static void tavor_cq_doorbell(tavor_state_t *state, uint32_t cq_cmd, 47 uint32_t cqn, uint32_t cq_param); 48 #pragma inline(tavor_cq_doorbell) 49 static int tavor_cq_cqe_consume(tavor_state_t *state, tavor_cqhdl_t cq, 50 tavor_hw_cqe_t *cqe, ibt_wc_t *wc); 51 static int tavor_cq_errcqe_consume(tavor_state_t *state, tavor_cqhdl_t cq, 52 tavor_hw_cqe_t *cqe, ibt_wc_t *wc); 53 static void tavor_cqe_sync(tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe, 54 uint_t flag); 55 static void tavor_cq_resize_helper(tavor_cqhdl_t cq, tavor_hw_cqe_t *new_cqbuf, 56 uint32_t old_cons_indx, uint32_t num_newcqe); 57 58 /* 59 * tavor_cq_alloc() 60 * Context: Can be called only from user or kernel context. 61 */ 62 int 63 tavor_cq_alloc(tavor_state_t *state, ibt_cq_hdl_t ibt_cqhdl, 64 ibt_cq_attr_t *cq_attr, uint_t *actual_size, tavor_cqhdl_t *cqhdl, 65 uint_t sleepflag) 66 { 67 tavor_rsrc_t *cqc, *rsrc; 68 tavor_umap_db_entry_t *umapdb; 69 tavor_hw_cqc_t cqc_entry; 70 tavor_cqhdl_t cq; 71 ibt_mr_attr_t mr_attr; 72 tavor_mr_options_t op; 73 tavor_pdhdl_t pd; 74 tavor_mrhdl_t mr; 75 tavor_hw_cqe_t *buf; 76 uint64_t addr, value; 77 uint32_t log_cq_size, lkey, uarpg; 78 uint_t dma_xfer_mode, cq_sync, cq_is_umap; 79 int status, i, flag; 80 char *errormsg; 81 82 TAVOR_TNF_ENTER(tavor_cq_alloc); 83 84 /* 85 * Determine whether CQ is being allocated for userland access or 86 * whether it is being allocated for kernel access. If the CQ is 87 * being allocated for userland access, then lookup the UAR doorbell 88 * page number for the current process. Note: If this is not found 89 * (e.g. if the process has not previously open()'d the Tavor driver), 90 * then an error is returned. 91 */ 92 cq_is_umap = (cq_attr->cq_flags & IBT_CQ_USER_MAP) ? 
1 : 0; 93 if (cq_is_umap) { 94 status = tavor_umap_db_find(state->ts_instance, ddi_get_pid(), 95 MLNX_UMAP_UARPG_RSRC, &value, 0, NULL); 96 if (status != DDI_SUCCESS) { 97 /* Set "status" and "errormsg" and goto failure */ 98 TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "failed UAR page"); 99 goto cqalloc_fail; 100 } 101 uarpg = ((tavor_rsrc_t *)(uintptr_t)value)->tr_indx; 102 } 103 104 /* Use the internal protection domain (PD) for setting up CQs */ 105 pd = state->ts_pdhdl_internal; 106 107 /* Increment the reference count on the protection domain (PD) */ 108 tavor_pd_refcnt_inc(pd); 109 110 /* 111 * Allocate an CQ context entry. This will be filled in with all 112 * the necessary parameters to define the Completion Queue. And then 113 * ownership will be passed to the hardware in the final step 114 * below. If we fail here, we must undo the protection domain 115 * reference count. 116 */ 117 status = tavor_rsrc_alloc(state, TAVOR_CQC, 1, sleepflag, &cqc); 118 if (status != DDI_SUCCESS) { 119 /* Set "status" and "errormsg" and goto failure */ 120 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed CQ context"); 121 goto cqalloc_fail1; 122 } 123 124 /* 125 * Allocate the software structure for tracking the completion queue 126 * (i.e. the Tavor Completion Queue handle). If we fail here, we must 127 * undo the protection domain reference count and the previous 128 * resource allocation. 129 */ 130 status = tavor_rsrc_alloc(state, TAVOR_CQHDL, 1, sleepflag, &rsrc); 131 if (status != DDI_SUCCESS) { 132 /* Set "status" and "errormsg" and goto failure */ 133 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed CQ handle"); 134 goto cqalloc_fail2; 135 } 136 cq = (tavor_cqhdl_t)rsrc->tr_addr; 137 cq->cq_is_umap = cq_is_umap; 138 139 /* Use the index as CQ number */ 140 cq->cq_cqnum = cqc->tr_indx; 141 142 /* 143 * If this will be a user-mappable CQ, then allocate an entry for 144 * the "userland resources database". 
This will later be added to 145 * the database (after all further CQ operations are successful). 146 * If we fail here, we must undo the reference counts and the 147 * previous resource allocation. 148 */ 149 if (cq->cq_is_umap) { 150 umapdb = tavor_umap_db_alloc(state->ts_instance, cq->cq_cqnum, 151 MLNX_UMAP_CQMEM_RSRC, (uint64_t)(uintptr_t)rsrc); 152 if (umapdb == NULL) { 153 /* Set "status" and "errormsg" and goto failure */ 154 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umap add"); 155 goto cqalloc_fail3; 156 } 157 } 158 159 /* 160 * Calculate the appropriate size for the completion queue. 161 * Note: All Tavor CQs must be a power-of-2 minus 1 in size. Also 162 * they may not be any smaller than TAVOR_CQ_MIN_SIZE. This step is 163 * to round the requested size up to the next highest power-of-2 164 */ 165 cq_attr->cq_size = max(cq_attr->cq_size, TAVOR_CQ_MIN_SIZE); 166 log_cq_size = highbit(cq_attr->cq_size); 167 168 /* 169 * Next we verify that the rounded-up size is valid (i.e. consistent 170 * with the device limits and/or software-configured limits) 171 */ 172 if (log_cq_size > state->ts_cfg_profile->cp_log_max_cq_sz) { 173 /* Set "status" and "errormsg" and goto failure */ 174 TAVOR_TNF_FAIL(IBT_HCA_CQ_EXCEEDED, "max CQ size"); 175 goto cqalloc_fail4; 176 } 177 178 /* 179 * Allocate the memory for Completion Queue. 180 * 181 * Note: Although we use the common queue allocation routine, we 182 * always specify TAVOR_QUEUE_LOCATION_NORMAL (i.e. CQ located in 183 * kernel system memory) for kernel CQs because it would be 184 * inefficient to have CQs located in DDR memory. This is primarily 185 * because CQs are read from (by software) more than they are written 186 * to. (We always specify TAVOR_QUEUE_LOCATION_USERLAND for all 187 * user-mappable CQs for a similar reason.) 188 * It is also worth noting that, unlike Tavor QP work queues, 189 * completion queues do not have the same strict alignment 190 * requirements. 
It is sufficient for the CQ memory to be both 191 * aligned to and bound to addresses which are a multiple of CQE size. 192 */ 193 cq->cq_cqinfo.qa_size = (1 << log_cq_size) * sizeof (tavor_hw_cqe_t); 194 cq->cq_cqinfo.qa_alloc_align = sizeof (tavor_hw_cqe_t); 195 cq->cq_cqinfo.qa_bind_align = sizeof (tavor_hw_cqe_t); 196 if (cq->cq_is_umap) { 197 cq->cq_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND; 198 } else { 199 cq->cq_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_NORMAL; 200 } 201 status = tavor_queue_alloc(state, &cq->cq_cqinfo, sleepflag); 202 if (status != DDI_SUCCESS) { 203 /* Set "status" and "errormsg" and goto failure */ 204 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed completion queue"); 205 goto cqalloc_fail4; 206 } 207 buf = (tavor_hw_cqe_t *)cq->cq_cqinfo.qa_buf_aligned; 208 209 /* 210 * Initialize each of the Completion Queue Entries (CQE) by setting 211 * their ownership to hardware ("owner" bit set to HW). This is in 212 * preparation for the final transfer of ownership (below) of the 213 * CQ context itself. 214 */ 215 for (i = 0; i < (1 << log_cq_size); i++) { 216 TAVOR_CQE_OWNER_SET_HW(cq, &buf[i]); 217 } 218 219 /* 220 * Register the memory for the CQ. The memory for the CQ must 221 * be registered in the Tavor TPT tables. This gives us the LKey 222 * to specify in the CQ context below. Note: If this is a user- 223 * mappable CQ, then we will force DDI_DMA_CONSISTENT mapping. 224 */ 225 flag = (sleepflag == TAVOR_SLEEP) ? 
IBT_MR_SLEEP : IBT_MR_NOSLEEP; 226 mr_attr.mr_vaddr = (uint64_t)(uintptr_t)buf; 227 mr_attr.mr_len = cq->cq_cqinfo.qa_size; 228 mr_attr.mr_as = NULL; 229 mr_attr.mr_flags = flag | IBT_MR_ENABLE_LOCAL_WRITE; 230 if (cq->cq_is_umap) { 231 dma_xfer_mode = DDI_DMA_CONSISTENT; 232 } else { 233 dma_xfer_mode = state->ts_cfg_profile->cp_streaming_consistent; 234 } 235 if (dma_xfer_mode == DDI_DMA_STREAMING) { 236 mr_attr.mr_flags |= IBT_MR_NONCOHERENT; 237 } 238 op.mro_bind_type = state->ts_cfg_profile->cp_iommu_bypass; 239 op.mro_bind_dmahdl = cq->cq_cqinfo.qa_dmahdl; 240 op.mro_bind_override_addr = 0; 241 status = tavor_mr_register(state, pd, &mr_attr, &mr, &op); 242 if (status != DDI_SUCCESS) { 243 /* Set "status" and "errormsg" and goto failure */ 244 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed register mr"); 245 goto cqalloc_fail5; 246 } 247 addr = mr->mr_bindinfo.bi_addr; 248 lkey = mr->mr_lkey; 249 250 /* Determine if later ddi_dma_sync will be necessary */ 251 cq_sync = TAVOR_CQ_IS_SYNC_REQ(state, cq->cq_cqinfo); 252 253 /* Sync entire CQ for use by the hardware (if necessary). */ 254 if (cq_sync) { 255 (void) ddi_dma_sync(mr->mr_bindinfo.bi_dmahdl, 0, 256 cq->cq_cqinfo.qa_size, DDI_DMA_SYNC_FORDEV); 257 } 258 259 /* 260 * Fill in the CQC entry. This is the final step before passing 261 * ownership of the CQC entry to the Tavor hardware. We use all of 262 * the information collected/calculated above to fill in the 263 * requisite portions of the CQC. 
Note: If this CQ is going to be 264 * used for userland access, then we need to set the UAR page number 265 * appropriately (otherwise it's a "don't care") 266 */ 267 bzero(&cqc_entry, sizeof (tavor_hw_cqc_t)); 268 cq->cq_eqnum = TAVOR_CQ_EQNUM_GET(cq->cq_cqnum); 269 cq->cq_erreqnum = TAVOR_CQ_ERREQNUM_GET(cq->cq_cqnum); 270 cqc_entry.xlat = TAVOR_VA2PA_XLAT_ENABLED; 271 cqc_entry.state = TAVOR_CQ_DISARMED; 272 cqc_entry.start_addr_h = (addr >> 32); 273 cqc_entry.start_addr_l = (addr & 0xFFFFFFFF); 274 cqc_entry.log_cq_sz = log_cq_size; 275 if (cq->cq_is_umap) { 276 cqc_entry.usr_page = uarpg; 277 } else { 278 cqc_entry.usr_page = 0; 279 } 280 cqc_entry.pd = pd->pd_pdnum; 281 cqc_entry.lkey = lkey; 282 cqc_entry.e_eqn = cq->cq_erreqnum; 283 cqc_entry.c_eqn = cq->cq_eqnum; 284 cqc_entry.cqn = cq->cq_cqnum; 285 286 /* 287 * Write the CQC entry to hardware. Lastly, we pass ownership of 288 * the entry to the hardware (using the Tavor SW2HW_CQ firmware 289 * command). Note: In general, this operation shouldn't fail. But 290 * if it does, we have to undo everything we've done above before 291 * returning error. 292 */ 293 status = tavor_cmn_ownership_cmd_post(state, SW2HW_CQ, &cqc_entry, 294 sizeof (tavor_hw_cqc_t), cq->cq_cqnum, sleepflag); 295 if (status != TAVOR_CMD_SUCCESS) { 296 cmn_err(CE_CONT, "Tavor: SW2HW_CQ command failed: %08x\n", 297 status); 298 TNF_PROBE_1(tavor_cq_alloc_sw2hw_cq_cmd_fail, 299 TAVOR_TNF_ERROR, "", tnf_uint, status, status); 300 /* Set "status" and "errormsg" and goto failure */ 301 TAVOR_TNF_FAIL(ibc_get_ci_failure(0), "tavor SW2HW_CQ command"); 302 goto cqalloc_fail6; 303 } 304 305 /* 306 * Fill in the rest of the Tavor Completion Queue handle. Having 307 * successfully transferred ownership of the CQC, we can update the 308 * following fields for use in further operations on the CQ. 
309 */ 310 cq->cq_cqcrsrcp = cqc; 311 cq->cq_rsrcp = rsrc; 312 cq->cq_consindx = 0; 313 cq->cq_buf = buf; 314 cq->cq_bufsz = (1 << log_cq_size); 315 cq->cq_mrhdl = mr; 316 cq->cq_sync = cq_sync; 317 cq->cq_refcnt = 0; 318 cq->cq_is_special = 0; 319 cq->cq_uarpg = uarpg; 320 cq->cq_umap_dhp = (devmap_cookie_t)NULL; 321 avl_create(&cq->cq_wrid_wqhdr_avl_tree, tavor_wrid_wqhdr_compare, 322 sizeof (struct tavor_workq_hdr_s), 323 offsetof(struct tavor_workq_hdr_s, wq_avl_link)); 324 325 cq->cq_wrid_reap_head = NULL; 326 cq->cq_wrid_reap_tail = NULL; 327 cq->cq_hdlrarg = (void *)ibt_cqhdl; 328 329 /* 330 * Put CQ handle in Tavor CQNum-to-CQHdl list. Then fill in the 331 * "actual_size" and "cqhdl" and return success 332 */ 333 ASSERT(state->ts_cqhdl[cqc->tr_indx] == NULL); 334 state->ts_cqhdl[cqc->tr_indx] = cq; 335 336 /* 337 * If this is a user-mappable CQ, then we need to insert the previously 338 * allocated entry into the "userland resources database". This will 339 * allow for later lookup during devmap() (i.e. mmap()) calls. 340 */ 341 if (cq->cq_is_umap) { 342 tavor_umap_db_add(umapdb); 343 } 344 345 /* 346 * Fill in the return arguments (if necessary). This includes the 347 * real completion queue size. 
348 */ 349 if (actual_size != NULL) { 350 *actual_size = (1 << log_cq_size) - 1; 351 } 352 *cqhdl = cq; 353 354 TAVOR_TNF_EXIT(tavor_cq_alloc); 355 return (DDI_SUCCESS); 356 357 /* 358 * The following is cleanup for all possible failure cases in this routine 359 */ 360 cqalloc_fail6: 361 if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL, 362 sleepflag) != DDI_SUCCESS) { 363 TAVOR_WARNING(state, "failed to deregister CQ memory"); 364 } 365 cqalloc_fail5: 366 tavor_queue_free(state, &cq->cq_cqinfo); 367 cqalloc_fail4: 368 if (cq_is_umap) { 369 tavor_umap_db_free(umapdb); 370 } 371 cqalloc_fail3: 372 tavor_rsrc_free(state, &rsrc); 373 cqalloc_fail2: 374 tavor_rsrc_free(state, &cqc); 375 cqalloc_fail1: 376 tavor_pd_refcnt_dec(pd); 377 cqalloc_fail: 378 TNF_PROBE_1(tavor_cq_alloc_fail, TAVOR_TNF_ERROR, "", 379 tnf_string, msg, errormsg); 380 TAVOR_TNF_EXIT(tavor_cq_alloc); 381 return (status); 382 } 383 384 385 /* 386 * tavor_cq_free() 387 * Context: Can be called only from user or kernel context. 388 */ 389 /* ARGSUSED */ 390 int 391 tavor_cq_free(tavor_state_t *state, tavor_cqhdl_t *cqhdl, uint_t sleepflag) 392 { 393 tavor_rsrc_t *cqc, *rsrc; 394 tavor_umap_db_entry_t *umapdb; 395 tavor_hw_cqc_t cqc_entry; 396 tavor_pdhdl_t pd; 397 tavor_mrhdl_t mr; 398 tavor_cqhdl_t cq; 399 uint32_t cqnum; 400 uint64_t value; 401 uint_t maxprot; 402 int status; 403 404 TAVOR_TNF_ENTER(tavor_cq_free); 405 406 /* 407 * Pull all the necessary information from the Tavor Completion Queue 408 * handle. This is necessary here because the resource for the 409 * CQ handle is going to be freed up as part of this operation. 410 */ 411 cq = *cqhdl; 412 mutex_enter(&cq->cq_lock); 413 cqc = cq->cq_cqcrsrcp; 414 rsrc = cq->cq_rsrcp; 415 pd = state->ts_pdhdl_internal; 416 mr = cq->cq_mrhdl; 417 cqnum = cq->cq_cqnum; 418 419 /* 420 * If there are work queues still associated with the CQ, then return 421 * an error. Otherwise, we will be holding the CQ lock. 
422 */ 423 if (cq->cq_refcnt != 0) { 424 mutex_exit(&cq->cq_lock); 425 TNF_PROBE_1(tavor_cq_free_refcnt_fail, TAVOR_TNF_ERROR, "", 426 tnf_int, refcnt, cq->cq_refcnt); 427 TAVOR_TNF_EXIT(tavor_cq_free); 428 return (IBT_CQ_BUSY); 429 } 430 431 /* 432 * If this was a user-mappable CQ, then we need to remove its entry 433 * from the "userland resources database". If it is also currently 434 * mmap()'d out to a user process, then we need to call 435 * devmap_devmem_remap() to remap the CQ memory to an invalid mapping. 436 * We also need to invalidate the CQ tracking information for the 437 * user mapping. 438 */ 439 if (cq->cq_is_umap) { 440 status = tavor_umap_db_find(state->ts_instance, cqnum, 441 MLNX_UMAP_CQMEM_RSRC, &value, TAVOR_UMAP_DB_REMOVE, 442 &umapdb); 443 if (status != DDI_SUCCESS) { 444 mutex_exit(&cq->cq_lock); 445 TAVOR_WARNING(state, "failed to find in database"); 446 TAVOR_TNF_EXIT(tavor_cq_free); 447 return (ibc_get_ci_failure(0)); 448 } 449 tavor_umap_db_free(umapdb); 450 if (cq->cq_umap_dhp != NULL) { 451 maxprot = (PROT_READ | PROT_WRITE | PROT_USER); 452 status = devmap_devmem_remap(cq->cq_umap_dhp, 453 state->ts_dip, 0, 0, cq->cq_cqinfo.qa_size, 454 maxprot, DEVMAP_MAPPING_INVALID, NULL); 455 if (status != DDI_SUCCESS) { 456 mutex_exit(&cq->cq_lock); 457 TAVOR_WARNING(state, "failed in CQ memory " 458 "devmap_devmem_remap()"); 459 TAVOR_TNF_EXIT(tavor_cq_free); 460 return (ibc_get_ci_failure(0)); 461 } 462 cq->cq_umap_dhp = (devmap_cookie_t)NULL; 463 } 464 } 465 466 /* 467 * Put NULL into the Tavor CQNum-to-CQHdl list. This will allow any 468 * in-progress events to detect that the CQ corresponding to this 469 * number has been freed. 470 */ 471 state->ts_cqhdl[cqc->tr_indx] = NULL; 472 473 /* 474 * While we hold the CQ lock, do a "forced reap" of the workQ WRID 475 * list. This cleans up all the structures associated with the WRID 476 * processing for this CQ. Once we complete, drop the lock and finish 477 * the deallocation of the CQ. 
478 */ 479 tavor_wrid_cq_force_reap(cq); 480 481 mutex_exit(&cq->cq_lock); 482 483 /* 484 * Reclaim CQC entry from hardware (using the Tavor HW2SW_CQ 485 * firmware command). If the ownership transfer fails for any reason, 486 * then it is an indication that something (either in HW or SW) has 487 * gone seriously wrong. 488 */ 489 status = tavor_cmn_ownership_cmd_post(state, HW2SW_CQ, &cqc_entry, 490 sizeof (tavor_hw_cqc_t), cqnum, sleepflag); 491 if (status != TAVOR_CMD_SUCCESS) { 492 TAVOR_WARNING(state, "failed to reclaim CQC ownership"); 493 cmn_err(CE_CONT, "Tavor: HW2SW_CQ command failed: %08x\n", 494 status); 495 TNF_PROBE_1(tavor_cq_free_hw2sw_cq_cmd_fail, 496 TAVOR_TNF_ERROR, "", tnf_uint, status, status); 497 TAVOR_TNF_EXIT(tavor_cq_free); 498 return (ibc_get_ci_failure(0)); 499 } 500 501 /* 502 * Deregister the memory for the Completion Queue. If this fails 503 * for any reason, then it is an indication that something (either 504 * in HW or SW) has gone seriously wrong. So we print a warning 505 * message and return. 506 */ 507 status = tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL, 508 sleepflag); 509 if (status != DDI_SUCCESS) { 510 TAVOR_WARNING(state, "failed to deregister CQ memory"); 511 TNF_PROBE_0(tavor_cq_free_dereg_mr_fail, TAVOR_TNF_ERROR, ""); 512 TAVOR_TNF_EXIT(tavor_cq_free); 513 return (ibc_get_ci_failure(0)); 514 } 515 516 /* Free the memory for the CQ */ 517 tavor_queue_free(state, &cq->cq_cqinfo); 518 519 /* Free the Tavor Completion Queue handle */ 520 tavor_rsrc_free(state, &rsrc); 521 522 /* Free up the CQC entry resource */ 523 tavor_rsrc_free(state, &cqc); 524 525 /* Decrement the reference count on the protection domain (PD) */ 526 tavor_pd_refcnt_dec(pd); 527 528 /* Set the cqhdl pointer to NULL and return success */ 529 *cqhdl = NULL; 530 531 TAVOR_TNF_EXIT(tavor_cq_free); 532 return (DDI_SUCCESS); 533 } 534 535 536 /* 537 * tavor_cq_resize() 538 * Context: Can be called only from user or kernel context. 
539 */ 540 int 541 tavor_cq_resize(tavor_state_t *state, tavor_cqhdl_t cq, uint_t req_size, 542 uint_t *actual_size, uint_t sleepflag) 543 { 544 tavor_hw_cqc_t cqc_entry; 545 tavor_qalloc_info_t new_cqinfo, old_cqinfo; 546 ibt_mr_attr_t mr_attr; 547 tavor_mr_options_t op; 548 tavor_pdhdl_t pd; 549 tavor_mrhdl_t mr, mr_old; 550 tavor_hw_cqe_t *buf; 551 uint32_t new_prod_indx, old_cons_indx; 552 uint_t dma_xfer_mode, cq_sync, log_cq_size, maxprot; 553 int status, i, flag; 554 char *errormsg; 555 556 TAVOR_TNF_ENTER(tavor_cq_resize); 557 558 /* Use the internal protection domain (PD) for CQs */ 559 pd = state->ts_pdhdl_internal; 560 561 /* 562 * Calculate the appropriate size for the new resized completion queue. 563 * Note: All Tavor CQs must be a power-of-2 minus 1 in size. Also 564 * they may not be any smaller than TAVOR_CQ_MIN_SIZE. This step is 565 * to round the requested size up to the next highest power-of-2 566 */ 567 req_size = max(req_size, TAVOR_CQ_MIN_SIZE); 568 log_cq_size = highbit(req_size); 569 570 /* 571 * Next we verify that the rounded-up size is valid (i.e. consistent 572 * with the device limits and/or software-configured limits) 573 */ 574 if (log_cq_size > state->ts_cfg_profile->cp_log_max_cq_sz) { 575 /* Set "status" and "errormsg" and goto failure */ 576 TAVOR_TNF_FAIL(IBT_HCA_CQ_EXCEEDED, "max CQ size"); 577 goto cqresize_fail; 578 } 579 580 /* 581 * Allocate the memory for newly resized Completion Queue. 582 * 583 * Note: Although we use the common queue allocation routine, we 584 * always specify TAVOR_QUEUE_LOCATION_NORMAL (i.e. CQ located in 585 * kernel system memory) for kernel CQs because it would be 586 * inefficient to have CQs located in DDR memory. This is the same 587 * as we do when we first allocate completion queues primarily 588 * because CQs are read from (by software) more than they are written 589 * to. (We always specify TAVOR_QUEUE_LOCATION_USERLAND for all 590 * user-mappable CQs for a similar reason.) 
591 * It is also worth noting that, unlike Tavor QP work queues, 592 * completion queues do not have the same strict alignment 593 * requirements. It is sufficient for the CQ memory to be both 594 * aligned to and bound to addresses which are a multiple of CQE size. 595 */ 596 new_cqinfo.qa_size = (1 << log_cq_size) * sizeof (tavor_hw_cqe_t); 597 new_cqinfo.qa_alloc_align = sizeof (tavor_hw_cqe_t); 598 new_cqinfo.qa_bind_align = sizeof (tavor_hw_cqe_t); 599 if (cq->cq_is_umap) { 600 new_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND; 601 } else { 602 new_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_NORMAL; 603 } 604 status = tavor_queue_alloc(state, &new_cqinfo, sleepflag); 605 if (status != DDI_SUCCESS) { 606 /* Set "status" and "errormsg" and goto failure */ 607 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed completion queue"); 608 goto cqresize_fail; 609 } 610 buf = (tavor_hw_cqe_t *)new_cqinfo.qa_buf_aligned; 611 612 /* 613 * Initialize each of the Completion Queue Entries (CQE) by setting 614 * their ownership to hardware ("owner" bit set to HW). This is in 615 * preparation for the final resize operation (below). 616 */ 617 for (i = 0; i < (1 << log_cq_size); i++) { 618 TAVOR_CQE_OWNER_SET_HW(cq, &buf[i]); 619 } 620 621 /* 622 * Register the memory for the CQ. The memory for the CQ must 623 * be registered in the Tavor TPT tables. This gives us the LKey 624 * to specify in the CQ context below. 625 */ 626 flag = (sleepflag == TAVOR_SLEEP) ? 
IBT_MR_SLEEP : IBT_MR_NOSLEEP; 627 mr_attr.mr_vaddr = (uint64_t)(uintptr_t)buf; 628 mr_attr.mr_len = new_cqinfo.qa_size; 629 mr_attr.mr_as = NULL; 630 mr_attr.mr_flags = flag | IBT_MR_ENABLE_LOCAL_WRITE; 631 if (cq->cq_is_umap) { 632 dma_xfer_mode = DDI_DMA_CONSISTENT; 633 } else { 634 dma_xfer_mode = state->ts_cfg_profile->cp_streaming_consistent; 635 } 636 if (dma_xfer_mode == DDI_DMA_STREAMING) { 637 mr_attr.mr_flags |= IBT_MR_NONCOHERENT; 638 } 639 op.mro_bind_type = state->ts_cfg_profile->cp_iommu_bypass; 640 op.mro_bind_dmahdl = new_cqinfo.qa_dmahdl; 641 op.mro_bind_override_addr = 0; 642 status = tavor_mr_register(state, pd, &mr_attr, &mr, &op); 643 if (status != DDI_SUCCESS) { 644 tavor_queue_free(state, &new_cqinfo); 645 /* Set "status" and "errormsg" and goto failure */ 646 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed register mr"); 647 goto cqresize_fail; 648 } 649 650 /* Determine if later ddi_dma_sync will be necessary */ 651 cq_sync = TAVOR_CQ_IS_SYNC_REQ(state, new_cqinfo); 652 653 /* Sync entire "new" CQ for use by hardware (if necessary) */ 654 if (cq_sync) { 655 (void) ddi_dma_sync(mr->mr_bindinfo.bi_dmahdl, 0, 656 new_cqinfo.qa_size, DDI_DMA_SYNC_FORDEV); 657 } 658 659 /* 660 * Now we grab the CQ lock. Since we will be updating the actual 661 * CQ location and the producer/consumer indexes, we should hold 662 * the lock. 663 * 664 * We do a TAVOR_NOSLEEP here (and below), though, because we are 665 * holding the "cq_lock" and if we got raised to interrupt level 666 * by priority inversion, we would not want to block in this routine 667 * waiting for success. 668 */ 669 mutex_enter(&cq->cq_lock); 670 671 /* 672 * Determine the current CQ "consumer index". 673 * 674 * Note: This will depend on whether the CQ had previously been 675 * mapped for user access or whether it is a kernel CQ. 
If this 676 * is a kernel CQ, then all PollCQ() operations have come through 677 * the IBTF and, hence, the driver's CQ state structure will 678 * contain the current consumer index. If, however, the user has 679 * accessed this CQ by bypassing the driver (OS-bypass), then we 680 * need to query the firmware to determine the current CQ consumer 681 * index. This also assumes that the user process will not continue 682 * to consume entries while at the same time doing the ResizeCQ() 683 * operation. If the user process does not guarantee this, then it 684 * may see duplicate or missed completions. But under no 685 * circumstances should this panic the system. 686 */ 687 if (cq->cq_is_umap) { 688 status = tavor_cmn_query_cmd_post(state, QUERY_CQ, 689 cq->cq_cqnum, &cqc_entry, sizeof (tavor_hw_cqc_t), 690 TAVOR_NOSLEEP); 691 if (status != TAVOR_CMD_SUCCESS) { 692 /* Query CQ has failed, drop CQ lock and cleanup */ 693 mutex_exit(&cq->cq_lock); 694 if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL, 695 sleepflag) != DDI_SUCCESS) { 696 TAVOR_WARNING(state, "failed to deregister " 697 "CQ memory"); 698 } 699 tavor_queue_free(state, &new_cqinfo); 700 TAVOR_WARNING(state, "failed to find in database"); 701 702 /* Set "status" and "errormsg" and goto failure */ 703 TAVOR_TNF_FAIL(ibc_get_ci_failure(0), 704 "failed umap lookup"); 705 goto cqresize_fail; 706 } 707 old_cons_indx = cqc_entry.cons_indx; 708 } else { 709 old_cons_indx = cq->cq_consindx; 710 } 711 712 /* 713 * Fill in the CQC entry. For the resize operation this is the 714 * final step before attempting the resize operation on the CQC entry. 715 * We use all of the information collected/calculated above to fill 716 * in the requisite portions of the CQC. 
717 */ 718 bzero(&cqc_entry, sizeof (tavor_hw_cqc_t)); 719 cqc_entry.start_addr_h = (mr->mr_bindinfo.bi_addr >> 32); 720 cqc_entry.start_addr_l = (mr->mr_bindinfo.bi_addr & 0xFFFFFFFF); 721 cqc_entry.log_cq_sz = log_cq_size; 722 cqc_entry.lkey = mr->mr_lkey; 723 724 /* 725 * Write the CQC entry to hardware. Lastly, we pass ownership of 726 * the entry to the hardware (using the Tavor RESIZE_CQ firmware 727 * command). Note: In general, this operation shouldn't fail. But 728 * if it does, we have to undo everything we've done above before 729 * returning error. Also note that the status returned may indicate 730 * the code to return to the IBTF. 731 */ 732 status = tavor_resize_cq_cmd_post(state, &cqc_entry, cq->cq_cqnum, 733 &new_prod_indx, TAVOR_CMD_NOSLEEP_SPIN); 734 if (status != TAVOR_CMD_SUCCESS) { 735 /* Resize attempt has failed, drop CQ lock and cleanup */ 736 mutex_exit(&cq->cq_lock); 737 if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL, 738 sleepflag) != DDI_SUCCESS) { 739 TAVOR_WARNING(state, "failed to deregister CQ memory"); 740 } 741 tavor_queue_free(state, &new_cqinfo); 742 if (status == TAVOR_CMD_BAD_SIZE) { 743 TAVOR_TNF_EXIT(tavor_cq_resize); 744 return (IBT_CQ_SZ_INSUFFICIENT); 745 } else { 746 cmn_err(CE_CONT, "Tavor: RESIZE_CQ command failed: " 747 "%08x\n", status); 748 TNF_PROBE_1(tavor_cq_resize_cq_cmd_fail, 749 TAVOR_TNF_ERROR, "", tnf_uint, status, status); 750 TAVOR_TNF_EXIT(tavor_cq_resize); 751 return (ibc_get_ci_failure(0)); 752 } 753 } 754 755 /* 756 * The CQ resize attempt was successful. Before dropping the CQ lock, 757 * copy all of the CQEs from the "old" CQ into the "new" CQ. Note: 758 * the Tavor firmware guarantees us that sufficient space is set aside 759 * in the "new" CQ to handle any un-polled CQEs from the "old" CQ. 760 * The two parameters to this helper function ("old_cons_indx" and 761 * "new_prod_indx") essentially indicate the starting index and number 762 * of any CQEs that might remain in the "old" CQ memory. 
763 */ 764 tavor_cq_resize_helper(cq, buf, old_cons_indx, new_prod_indx); 765 766 /* Sync entire "new" CQ for use by hardware (if necessary) */ 767 if (cq_sync) { 768 (void) ddi_dma_sync(mr->mr_bindinfo.bi_dmahdl, 0, 769 new_cqinfo.qa_size, DDI_DMA_SYNC_FORDEV); 770 } 771 772 /* 773 * Update the Tavor Completion Queue handle with all the new 774 * information. At the same time, save away all the necessary 775 * information for freeing up the old resources 776 */ 777 mr_old = cq->cq_mrhdl; 778 old_cqinfo = cq->cq_cqinfo; 779 cq->cq_cqinfo = new_cqinfo; 780 cq->cq_consindx = 0; 781 cq->cq_buf = buf; 782 cq->cq_bufsz = (1 << log_cq_size); 783 cq->cq_mrhdl = mr; 784 cq->cq_sync = cq_sync; 785 786 /* 787 * If "old" CQ was a user-mappable CQ that is currently mmap()'d out 788 * to a user process, then we need to call devmap_devmem_remap() to 789 * invalidate the mapping to the CQ memory. We also need to 790 * invalidate the CQ tracking information for the user mapping. 791 */ 792 if ((cq->cq_is_umap) && (cq->cq_umap_dhp != NULL)) { 793 maxprot = (PROT_READ | PROT_WRITE | PROT_USER); 794 status = devmap_devmem_remap(cq->cq_umap_dhp, 795 state->ts_dip, 0, 0, cq->cq_cqinfo.qa_size, maxprot, 796 DEVMAP_MAPPING_INVALID, NULL); 797 if (status != DDI_SUCCESS) { 798 mutex_exit(&cq->cq_lock); 799 TAVOR_WARNING(state, "failed in CQ memory " 800 "devmap_devmem_remap()"); 801 TAVOR_TNF_EXIT(tavor_cq_free); 802 return (ibc_get_ci_failure(0)); 803 } 804 cq->cq_umap_dhp = (devmap_cookie_t)NULL; 805 } 806 807 /* 808 * Drop the CQ lock now. The only thing left to do is to free up 809 * the old resources. 810 */ 811 mutex_exit(&cq->cq_lock); 812 813 /* 814 * Deregister the memory for the old Completion Queue. Note: We 815 * really can't return error here because we have no good way to 816 * cleanup. Plus, the deregistration really shouldn't ever happen. 817 * So, if it does, it is an indication that something has gone 818 * seriously wrong. 
	 * So we print a warning message and return error
	 * (knowing, of course, that the "old" CQ memory will be leaked)
	 */
	status = tavor_mr_deregister(state, &mr_old, TAVOR_MR_DEREG_ALL,
	    sleepflag);
	if (status != DDI_SUCCESS) {
		TAVOR_WARNING(state, "failed to deregister old CQ memory");
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
		    "failed deregister mr (old)");
		goto cqresize_fail;
	}

	/* Free the memory for the old CQ */
	tavor_queue_free(state, &old_cqinfo);

	/*
	 * Fill in the return arguments (if necessary).  This includes the
	 * real new completion queue size.  Note: the hardware queue holds
	 * (1 << log_cq_size) entries, but one entry is reserved, hence
	 * the "- 1" reported back to the caller.
	 */
	if (actual_size != NULL) {
		*actual_size = (1 << log_cq_size) - 1;
	}

	TAVOR_TNF_EXIT(tavor_cq_resize);
	return (DDI_SUCCESS);

cqresize_fail:
	TNF_PROBE_1(tavor_cq_resize_fail, TAVOR_TNF_ERROR, "",
	    tnf_string, msg, errormsg);
	TAVOR_TNF_EXIT(tavor_cq_resize);
	return (status);
}


/*
 * tavor_cq_notify()
 *    Context: Can be called from interrupt or base context.
 *
 *    Requests a completion notification ("arms" the CQ) by ringing the
 *    appropriate CQ doorbell.  Returns DDI_SUCCESS, or
 *    IBT_CQ_NOTIFY_TYPE_INVALID if "flags" is not one of the two
 *    supported notification types.
 */
int
tavor_cq_notify(tavor_state_t *state, tavor_cqhdl_t cq,
    ibt_cq_notify_flags_t flags)
{
	uint_t		cqnum;

	TAVOR_TNF_ENTER(tavor_cq_notify);

	/*
	 * Determine if we are trying to get the next completion or the next
	 * "solicited" completion.  Then hit the appropriate doorbell.
	 *
	 * NOTE: Please see the comment in tavor_event.c:tavor_eq_poll
	 * regarding why we do not have to do an extra PIO read here, and we
	 * will not lose an event after writing this doorbell.
	 */
	cqnum = cq->cq_cqnum;
	if (flags == IBT_NEXT_COMPLETION) {
		tavor_cq_doorbell(state, TAVOR_CQDB_NOTIFY_CQ, cqnum,
		    TAVOR_CQDB_DEFAULT_PARAM);

	} else if (flags == IBT_NEXT_SOLICITED) {
		tavor_cq_doorbell(state, TAVOR_CQDB_NOTIFY_CQ_SOLICIT,
		    cqnum, TAVOR_CQDB_DEFAULT_PARAM);

	} else {
		TNF_PROBE_1(tavor_cq_notify_invflags_fail, TAVOR_TNF_ERROR, "",
		    tnf_int, flags, flags);
		TAVOR_TNF_EXIT(tavor_cq_notify);
		return (IBT_CQ_NOTIFY_TYPE_INVALID);
	}

	TAVOR_TNF_EXIT(tavor_cq_notify);
	return (DDI_SUCCESS);
}


/*
 * tavor_cq_poll()
 *    Context: Can be called from interrupt or base context.
 *
 *    Polls up to "num_wc" work completions off of CQ "cq" into the
 *    caller-supplied array "wc_p".  The number actually polled is
 *    returned through "num_polled" (if non-NULL).  Returns DDI_SUCCESS,
 *    IBT_CQ_EMPTY if nothing was polled, or IBT_CQ_HDL_INVALID for a
 *    user-mapped CQ.
 */
int
tavor_cq_poll(tavor_state_t *state, tavor_cqhdl_t cq, ibt_wc_t *wc_p,
    uint_t num_wc, uint_t *num_polled)
{
	tavor_hw_cqe_t	*cqe;
	uint32_t	cons_indx, wrap_around_mask;
	uint32_t	polled_cnt, num_to_increment;
	int		status;

	TAVOR_TNF_ENTER(tavor_cq_poll);

	/*
	 * Check for user-mappable CQ memory.  Note:  We do not allow kernel
	 * clients to poll CQ memory that is accessible directly by the user.
	 * If the CQ memory is user accessible, then return an error.
	 */
	if (cq->cq_is_umap) {
		TNF_PROBE_0(tavor_cq_poll_inv_usrmapped_type,
		    TAVOR_TNF_ERROR, "");
		TAVOR_TNF_EXIT(tavor_cq_poll);
		return (IBT_CQ_HDL_INVALID);
	}

	mutex_enter(&cq->cq_lock);

	/* Get the consumer index */
	cons_indx = cq->cq_consindx;

	/*
	 * Calculate the wrap around mask.  Note: This operation only works
	 * because all Tavor completion queues have power-of-2 sizes
	 */
	wrap_around_mask = (cq->cq_bufsz - 1);

	/* Calculate the pointer to the first CQ entry */
	cqe = &cq->cq_buf[cons_indx];

	/* Sync the current CQE to read */
	tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);

	/*
	 * Keep pulling entries from the CQ until we find an entry owned by
	 * the hardware.  As long as there are CQEs owned by SW, process
	 * each entry by calling tavor_cq_cqe_consume() and updating the CQ
	 * consumer index.  Note:  We only update the consumer index if
	 * tavor_cq_cqe_consume() returns TAVOR_CQ_SYNC_AND_DB.  Otherwise,
	 * it indicates that we are going to "recycle" the CQE (probably
	 * because it is a error CQE and corresponds to more than one
	 * completion).  A recycled CQE still produces a work completion
	 * (polled_cnt is incremented on every consume call), but the entry
	 * itself is left in place to be consumed again on a later pass.
	 */
	polled_cnt = 0;
	while (TAVOR_CQE_OWNER_IS_SW(cq, cqe)) {
		status = tavor_cq_cqe_consume(state, cq, cqe,
		    &wc_p[polled_cnt++]);
		if (status == TAVOR_CQ_SYNC_AND_DB) {
			/* Reset entry to hardware ownership */
			TAVOR_CQE_OWNER_SET_HW(cq, cqe);

			/* Sync the current CQE for device */
			tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORDEV);

			/* Increment the consumer index */
			cons_indx = (cons_indx + 1) & wrap_around_mask;

			/* Update the pointer to the next CQ entry */
			cqe = &cq->cq_buf[cons_indx];

			/* Sync the next CQE to read */
			tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);
		}

		/*
		 * If we have run out of space to store work completions,
		 * then stop and return the ones we have pulled off the CQ.
		 */
		if (polled_cnt >= num_wc) {
			break;
		}
	}

	/*
	 * Now we only ring the doorbell (to update the consumer index) if
	 * we've actually consumed a CQ entry.  If we have, for example,
	 * pulled from a CQE that we are still in the process of "recycling"
	 * for error purposes, then we would not update the consumer index.
	 */
	if ((polled_cnt != 0) && (cq->cq_consindx != cons_indx)) {
		/*
		 * Post doorbell to update the consumer index.  Doorbell
		 * value indicates number of entries consumed (minus 1).
		 * The "else" arm accounts for the consumer index having
		 * wrapped around the (power-of-2 sized) CQ buffer.
		 */
		if (cons_indx > cq->cq_consindx) {
			num_to_increment = (cons_indx - cq->cq_consindx) - 1;
		} else {
			num_to_increment = ((cons_indx + cq->cq_bufsz) -
			    cq->cq_consindx) - 1;
		}
		cq->cq_consindx = cons_indx;
		tavor_cq_doorbell(state, TAVOR_CQDB_INCR_CONSINDX,
		    cq->cq_cqnum, num_to_increment);

	} else if (polled_cnt == 0) {
		/*
		 * If the CQ is empty, we can try to free up some of the WRID
		 * list containers.  See tavor_wr.c for more details on this
		 * operation.
		 */
		tavor_wrid_cq_reap(cq);
	}

	mutex_exit(&cq->cq_lock);

	/* Set "num_polled" (if necessary) */
	if (num_polled != NULL) {
		*num_polled = polled_cnt;
	}

	/* Set CQ_EMPTY condition if needed, otherwise return success */
	if (polled_cnt == 0) {
		status = IBT_CQ_EMPTY;
	} else {
		status = DDI_SUCCESS;
	}

	/*
	 * Check if the system is currently panicking.  If it is, then call
	 * the Tavor interrupt service routine.  This step is necessary here
	 * because we might be in a polled I/O mode and without the call to
	 * tavor_isr() - and its subsequent calls to poll and rearm each
	 * event queue - we might overflow our EQs and render the system
	 * unable to sync/dump.
	 */
	if (ddi_in_panic() != 0) {
		(void) tavor_isr((caddr_t)state, (caddr_t)NULL);
	}

	TAVOR_TNF_EXIT(tavor_cq_poll);
	return (status);
}


/*
 * tavor_cq_handler()
 *    Context: Only called from interrupt context
 *
 *    Handles a completion event from EQ "eq": disarms the CQ via the EQ
 *    doorbell and, if the CQ handle is still valid, invokes the IBTF
 *    completion callback.  Returns DDI_SUCCESS, or DDI_FAILURE on an
 *    EQ overflow event.
 */
int
tavor_cq_handler(tavor_state_t *state, tavor_eqhdl_t eq,
    tavor_hw_eqe_t *eqe)
{
	tavor_cqhdl_t		cq;
	uint_t			cqnum;
	uint_t			eqe_evttype;

	TAVOR_TNF_ENTER(tavor_cq_handler);

	eqe_evttype = TAVOR_EQE_EVTTYPE_GET(eq, eqe);

	ASSERT(eqe_evttype == TAVOR_EVT_COMPLETION ||
	    eqe_evttype == TAVOR_EVT_EQ_OVERFLOW);

	if (eqe_evttype == TAVOR_EVT_EQ_OVERFLOW) {
		TNF_PROBE_0(tavor_cq_handler_eq_overflow_condition,
		    TAVOR_TNF_ERROR, "");
		tavor_eq_overflow_handler(state, eq, eqe);

		TAVOR_TNF_EXIT(tavor_cq_handler);
		return (DDI_FAILURE);
	}


	/* Get the CQ handle from CQ number in event descriptor */
	cqnum = TAVOR_EQE_CQNUM_GET(eq, eqe);
	cq = tavor_cqhdl_from_cqnum(state, cqnum);

	/*
	 * Post the EQ doorbell to move the CQ to the "disarmed" state.
	 * This operation is to enable subsequent CQ doorbells (e.g. those
	 * that can be rung by tavor_cq_notify() above) to rearm the CQ.
	 */
	tavor_eq_doorbell(state, TAVOR_EQDB_DISARM_CQ, eq->eq_eqnum, cqnum);

	/*
	 * If the CQ handle is NULL, this is probably an indication
	 * that the CQ has been freed already.  In which case, we
	 * should not deliver this event.
	 *
	 * We also check that the CQ number in the handle is the
	 * same as the CQ number in the event queue entry.  This
	 * extra check allows us to handle the case where a CQ was
	 * freed and then allocated again in the time it took to
	 * handle the event queue processing.  By constantly incrementing
	 * the non-constrained portion of the CQ number every time
	 * a new CQ is allocated, we mitigate (somewhat) the chance
	 * that a stale event could be passed to the client's CQ
	 * handler.
	 *
	 * Lastly, we check if "ts_ibtfpriv" is NULL.  If it is then it
	 * means that we've have either received this event before we
	 * finished attaching to the IBTF or we've received it while we
	 * are in the process of detaching.
	 */
	if ((cq != NULL) && (cq->cq_cqnum == cqnum) &&
	    (state->ts_ibtfpriv != NULL)) {
		TAVOR_DO_IBTF_CQ_CALLB(state, cq);
	} else {
		/*
		 * NOTE(review): both probe arguments log "cqnum"; "cq" may
		 * be NULL here, so the handle's CQ number cannot safely be
		 * logged for hdl_cqnum.
		 */
		TNF_PROBE_2(tavor_cq_handler_dropped_event,
		    TAVOR_TNF_ERROR, "", tnf_uint, ev_cqnum, cqnum,
		    tnf_uint, hdl_cqnum, cqnum);
	}

	TAVOR_TNF_EXIT(tavor_cq_handler);
	return (DDI_SUCCESS);
}


/*
 * tavor_cq_err_handler()
 *    Context: Only called from interrupt context
 *
 *    Handles a CQ error event from EQ "eq" by delivering an
 *    IBT_ERROR_CQ async event to the IBTF (when the CQ handle is still
 *    valid).  Returns DDI_SUCCESS, or DDI_FAILURE on an EQ overflow
 *    event.
 */
int
tavor_cq_err_handler(tavor_state_t *state, tavor_eqhdl_t eq,
    tavor_hw_eqe_t *eqe)
{
	tavor_cqhdl_t		cq;
	uint_t			cqnum;
	ibc_async_event_t	event;
	ibt_async_code_t	type;
	uint_t			eqe_evttype;

	TAVOR_TNF_ENTER(tavor_cq_err_handler);

	eqe_evttype = TAVOR_EQE_EVTTYPE_GET(eq, eqe);

	ASSERT(eqe_evttype == TAVOR_EVT_CQ_ERRORS ||
	    eqe_evttype == TAVOR_EVT_EQ_OVERFLOW);

	if (eqe_evttype == TAVOR_EVT_EQ_OVERFLOW) {
		TNF_PROBE_0(tavor_cq_err_handler_eq_overflow_condition,
		    TAVOR_TNF_ERROR, "");
		tavor_eq_overflow_handler(state, eq, eqe);

		TAVOR_TNF_EXIT(tavor_cq_err_handler);
		return (DDI_FAILURE);
	}

	/* cmn_err(CE_CONT, "CQ Error handler\n"); */

	/* Get the CQ handle from CQ number in event descriptor */
	cqnum = TAVOR_EQE_CQNUM_GET(eq, eqe);
	cq = tavor_cqhdl_from_cqnum(state, cqnum);

	/*
	 * If the CQ handle is NULL, this is probably an indication
	 * that the CQ has been freed already.  In which case, we
	 * should not deliver this event.
	 *
	 * We also check that the CQ number in the handle is the
	 * same as the CQ number in the event queue entry.  This
	 * extra check allows us to handle the case where a CQ was
	 * freed and then allocated again in the time it took to
	 * handle the event queue processing.  By constantly incrementing
	 * the non-constrained portion of the CQ number every time
	 * a new CQ is allocated, we mitigate (somewhat) the chance
	 * that a stale event could be passed to the client's CQ
	 * handler.
	 *
	 * And then we check if "ts_ibtfpriv" is NULL.  If it is then it
	 * means that we've have either received this event before we
	 * finished attaching to the IBTF or we've received it while we
	 * are in the process of detaching.
	 */
	if ((cq != NULL) && (cq->cq_cqnum == cqnum) &&
	    (state->ts_ibtfpriv != NULL)) {
		event.ev_cq_hdl = (ibt_cq_hdl_t)cq->cq_hdlrarg;
		type		= IBT_ERROR_CQ;

		TAVOR_DO_IBTF_ASYNC_CALLB(state, type, &event);
	} else {
		TNF_PROBE_2(tavor_cq_err_handler_dropped_event,
		    TAVOR_TNF_ERROR, "", tnf_uint, ev_cqnum, cqnum,
		    tnf_uint, hdl_cqnum, cqnum);
	}

	TAVOR_TNF_EXIT(tavor_cq_err_handler);
	return (DDI_SUCCESS);
}


/*
 * tavor_cq_refcnt_inc()
 *    Context: Can be called from interrupt or base context.
 *
 *    Returns DDI_SUCCESS on success, or DDI_FAILURE if the CQ is
 *    already in use with the opposite "special" (SMI/GSI) designation.
 */
int
tavor_cq_refcnt_inc(tavor_cqhdl_t cq, uint_t is_special)
{
	/*
	 * Increment the completion queue's reference count.  Note: In order
	 * to ensure compliance with IBA C11-15, we must ensure that a given
	 * CQ is not used for both special (SMI/GSI) QP and non-special QP.
	 * This is accomplished here by keeping track of how the referenced
	 * CQ is being used.  The first reference fixes the CQ's "special"
	 * designation; subsequent references must match it.
	 */
	mutex_enter(&cq->cq_lock);
	TNF_PROBE_1_DEBUG(tavor_cq_refcnt_inc, TAVOR_TNF_TRACE, "",
	    tnf_uint, refcnt, cq->cq_refcnt);
	if (cq->cq_refcnt == 0) {
		cq->cq_is_special = is_special;
	} else {
		if (cq->cq_is_special != is_special) {
			mutex_exit(&cq->cq_lock);
			return (DDI_FAILURE);
		}
	}
	cq->cq_refcnt++;
	mutex_exit(&cq->cq_lock);
	return (DDI_SUCCESS);
}


/*
 * tavor_cq_refcnt_dec()
 *    Context: Can be called from interrupt or base context.
 */
void
tavor_cq_refcnt_dec(tavor_cqhdl_t cq)
{
	/* Decrement the completion queue's reference count */
	mutex_enter(&cq->cq_lock);
	cq->cq_refcnt--;
	TNF_PROBE_1_DEBUG(tavor_cq_refcnt_dec, TAVOR_TNF_TRACE, "",
	    tnf_uint, refcnt, cq->cq_refcnt);
	mutex_exit(&cq->cq_lock);
}


/*
 * tavor_cq_doorbell()
 *    Context: Can be called from interrupt or base context.
 *
 *    Builds a 64-bit CQ doorbell from the command, CQ number, and
 *    parameter, then writes it to the UAR.
 */
static void
tavor_cq_doorbell(tavor_state_t *state, uint32_t cq_cmd, uint32_t cqn,
    uint32_t cq_param)
{
	uint64_t	doorbell = 0;

	/* Build the doorbell from the parameters */
	doorbell = ((uint64_t)cq_cmd << TAVOR_CQDB_CMD_SHIFT) |
	    ((uint64_t)cqn << TAVOR_CQDB_CQN_SHIFT) | cq_param;

	TNF_PROBE_1_DEBUG(tavor_cq_doorbell, TAVOR_TNF_TRACE, "",
	    tnf_ulong, doorbell, doorbell);

	/* Write the doorbell to UAR */
	TAVOR_UAR_DOORBELL(state, (uint64_t *)&state->ts_uar->cq,
	    doorbell);
}


/*
 * tavor_cqhdl_from_cqnum()
 *    Context: Can be called from interrupt or base context.
 *
 *    This routine is important because changing the unconstrained
 *    portion of the CQ number is critical to the detection of a
 *    potential race condition in the CQ handler code (i.e. the case
 *    where a CQ is freed and alloc'd again before an event for the
 *    "old" CQ can be handled).
 *
 *    While this is not a perfect solution (not sure that one exists)
 *    it does help to mitigate the chance that this race condition will
 *    cause us to deliver a "stale" event to the new CQ owner.  Note:
 *    this solution does not scale well because the number of constrained
 *    bits increases (and, hence, the number of unconstrained bits
 *    decreases) as the number of supported CQs grows.  For small and
 *    intermediate values, it should hopefully provide sufficient
 *    protection.
 */
tavor_cqhdl_t
tavor_cqhdl_from_cqnum(tavor_state_t *state, uint_t cqnum)
{
	uint_t	cqindx, cqmask;

	/* Calculate the CQ table index from the cqnum */
	cqmask = (1 << state->ts_cfg_profile->cp_log_num_cq) - 1;
	cqindx = cqnum & cqmask;
	return (state->ts_cqhdl[cqindx]);
}


/*
 * tavor_cq_cqe_consume()
 *    Context: Can be called from interrupt or base context.
 *
 *    Translates one hardware CQE into the caller-supplied ibt_wc_t.
 *    Returns TAVOR_CQ_SYNC_AND_DB when the CQE has been fully consumed
 *    (error CQEs may instead return TAVOR_CQ_RECYCLE_ENTRY via
 *    tavor_cq_errcqe_consume()).
 */
static int
tavor_cq_cqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
    tavor_hw_cqe_t *cqe, ibt_wc_t *wc)
{
	uint_t		flags, type, opcode, qpnum, qp1_indx;
	int		status;

	TAVOR_TNF_ENTER(tavor_cq_cqe_consume);

	/*
	 * Determine if this is an "error" CQE by examining "opcode".  If it
	 * is an error CQE, then call tavor_cq_errcqe_consume() and return
	 * whatever status it returns.  Otherwise, this is a successful
	 * completion.
	 */
	opcode = TAVOR_CQE_OPCODE_GET(cq, cqe);
	if ((opcode == TAVOR_CQE_SEND_ERR_OPCODE) ||
	    (opcode == TAVOR_CQE_RECV_ERR_OPCODE)) {
		status = tavor_cq_errcqe_consume(state, cq, cqe, wc);
		TAVOR_TNF_EXIT(tavor_cq_cqe_consume);
		return (status);
	}

	/*
	 * Fetch the Work Request ID using the information in the CQE.
	 * See tavor_wr.c for more details.
	 */
	wc->wc_id = tavor_wrid_get_entry(cq, cqe, NULL);

	/*
	 * Parse the CQE opcode to determine completion type.  This will set
	 * not only the type of the completion, but also any flags that might
	 * be associated with it (e.g. whether immediate data is present).
	 */
	flags = IBT_WC_NO_FLAGS;
	if (TAVOR_CQE_SENDRECV_GET(cq, cqe) != TAVOR_COMPLETION_RECV) {

		/* Send CQE */
		switch (opcode) {
		case TAVOR_CQE_SND_RDMAWR_IMM:
			flags |= IBT_WC_IMMED_DATA_PRESENT;
			/* FALLTHROUGH */
		case TAVOR_CQE_SND_RDMAWR:
			type = IBT_WRC_RDMAW;
			break;

		case TAVOR_CQE_SND_SEND_IMM:
			flags |= IBT_WC_IMMED_DATA_PRESENT;
			/* FALLTHROUGH */
		case TAVOR_CQE_SND_SEND:
			type = IBT_WRC_SEND;
			break;

		case TAVOR_CQE_SND_RDMARD:
			type = IBT_WRC_RDMAR;
			break;

		case TAVOR_CQE_SND_ATOMIC_CS:
			type = IBT_WRC_CSWAP;
			break;

		case TAVOR_CQE_SND_ATOMIC_FA:
			type = IBT_WRC_FADD;
			break;

		case TAVOR_CQE_SND_BIND_MW:
			type = IBT_WRC_BIND;
			break;

		default:
			TAVOR_WARNING(state, "unknown send CQE type");
			wc->wc_status = IBT_WC_LOCAL_QP_OP_ERR;
			TNF_PROBE_1(tavor_cq_cqe_consume_unknown_send_type,
			    TAVOR_TNF_ERROR, "", tnf_uint, opcode, opcode);
			TAVOR_TNF_EXIT(tavor_cq_cqe_consume);
			return (TAVOR_CQ_SYNC_AND_DB);
		}
	} else {

		/* Receive CQE */
		switch (opcode & 0x1F) {
		case TAVOR_CQE_RCV_RECV_IMM:
			/* FALLTHROUGH */
		case TAVOR_CQE_RCV_RECV_IMM2:
			/*
			 * Note:  According to the Tavor PRM, all QP1 recv
			 * completions look like the result of a Send with
			 * Immediate.  They are not, however, (MADs are Send
			 * Only) so we need to check the QP number and set
			 * the flag only if it is non-QP1.
			 */
			qpnum	 = TAVOR_CQE_QPNUM_GET(cq, cqe);
			qp1_indx = state->ts_spec_qp1->tr_indx;
			if ((qpnum < qp1_indx) || (qpnum > qp1_indx + 1)) {
				flags |= IBT_WC_IMMED_DATA_PRESENT;
			}
			/* FALLTHROUGH */
		case TAVOR_CQE_RCV_RECV:
			/* FALLTHROUGH */
		case TAVOR_CQE_RCV_RECV2:
			type = IBT_WRC_RECV;
			break;

		case TAVOR_CQE_RCV_RDMAWR_IMM:
			/* FALLTHROUGH */
		case TAVOR_CQE_RCV_RDMAWR_IMM2:
			flags |= IBT_WC_IMMED_DATA_PRESENT;
			type = IBT_WRC_RECV_RDMAWI;
			break;

		default:
			TAVOR_WARNING(state, "unknown recv CQE type");
			wc->wc_status = IBT_WC_LOCAL_QP_OP_ERR;
			TNF_PROBE_1(tavor_cq_cqe_consume_unknown_rcv_type,
			    TAVOR_TNF_ERROR, "", tnf_uint, opcode, opcode);
			TAVOR_TNF_EXIT(tavor_cq_cqe_consume);
			return (TAVOR_CQ_SYNC_AND_DB);
		}
	}
	wc->wc_type = type;

	/*
	 * Check for GRH, update the flags, then fill in "wc_flags" field
	 * in the work completion
	 */
	if (TAVOR_CQE_GRH_GET(cq, cqe) != 0) {
		flags |= IBT_WC_GRH_PRESENT;
	}
	wc->wc_flags = flags;

	/* If we got here, completion status must be success */
	wc->wc_status = IBT_WC_SUCCESS;

	/*
	 * Parse the remaining contents of the CQE into the work completion.
	 * This means filling in SL, QP number, SLID, immediate data, etc.
	 * Note:  Not all of these fields are valid in a given completion.
	 * Many of them depend on the actual type of completion.  So we fill
	 * in all of the fields and leave it up to the IBTF and consumer to
	 * sort out which are valid based on their context.
	 */
	wc->wc_sl	  = TAVOR_CQE_SL_GET(cq, cqe);
	wc->wc_immed_data = TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cq, cqe);
	wc->wc_qpn	  = TAVOR_CQE_DQPN_GET(cq, cqe);
	wc->wc_res_hash	  = 0;
	wc->wc_slid	  = TAVOR_CQE_DLID_GET(cq, cqe);
	wc->wc_ethertype  = (wc->wc_immed_data & 0xFFFF);
	wc->wc_pkey_ix	  = (wc->wc_immed_data >> 16);

	/*
	 * Depending on whether the completion was a receive or a send
	 * completion, fill in "bytes transferred" as appropriate.  Also,
	 * if necessary, fill in the "path bits" field.
	 */
	if (TAVOR_CQE_SENDRECV_GET(cq, cqe) == TAVOR_COMPLETION_RECV) {
		wc->wc_path_bits = TAVOR_CQE_PATHBITS_GET(cq, cqe);
		wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cq, cqe);

	} else if ((wc->wc_type == IBT_WRC_RDMAR) ||
	    (wc->wc_type == IBT_WRC_CSWAP) || (wc->wc_type == IBT_WRC_FADD)) {
		wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cq, cqe);
	}

	TAVOR_TNF_EXIT(tavor_cq_cqe_consume);
	return (TAVOR_CQ_SYNC_AND_DB);
}


/*
 * tavor_cq_errcqe_consume()
 *    Context: Can be called from interrupt or base context.
 *
 *    Translates one error CQE into the caller-supplied ibt_wc_t.
 *    Returns TAVOR_CQ_SYNC_AND_DB when this is the last completion the
 *    CQE represents, or TAVOR_CQ_RECYCLE_ENTRY when the CQE was updated
 *    in place to represent the next flushed work request.
 */
static int
tavor_cq_errcqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
    tavor_hw_cqe_t *cqe, ibt_wc_t *wc)
{
	uint64_t		next_wqeaddr;
	uint32_t		imm_eth_pkey_cred;
	uint_t			nextwqesize, dbd;
	uint_t			doorbell_cnt, status;
	tavor_wrid_entry_t	wre;

	TAVOR_TNF_ENTER(tavor_cq_errcqe_consume);

	/*
	 * Fetch the Work Request ID using the information in the CQE.
	 * See tavor_wr.c for more details.
	 */
	wc->wc_id = tavor_wrid_get_entry(cq, cqe, &wre);

	/*
	 * Parse the CQE opcode to determine completion type.  We know that
	 * the CQE is an error completion, so we extract only the completion
	 * status here.
	 */
	imm_eth_pkey_cred = TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cq, cqe);
	status = imm_eth_pkey_cred >> TAVOR_CQE_ERR_STATUS_SHIFT;
	switch (status) {
	case TAVOR_CQE_LOC_LEN_ERR:
		status = IBT_WC_LOCAL_LEN_ERR;
		break;

	case TAVOR_CQE_LOC_OP_ERR:
		status = IBT_WC_LOCAL_QP_OP_ERR;
		break;

	case TAVOR_CQE_LOC_PROT_ERR:
		status = IBT_WC_LOCAL_PROTECT_ERR;
		break;

	case TAVOR_CQE_WR_FLUSHED_ERR:
		status = IBT_WC_WR_FLUSHED_ERR;
		break;

	case TAVOR_CQE_MW_BIND_ERR:
		status = IBT_WC_MEM_WIN_BIND_ERR;
		break;

	case TAVOR_CQE_BAD_RESPONSE_ERR:
		status = IBT_WC_BAD_RESPONSE_ERR;
		break;

	case TAVOR_CQE_LOCAL_ACCESS_ERR:
		status = IBT_WC_LOCAL_ACCESS_ERR;
		break;

	case TAVOR_CQE_REM_INV_REQ_ERR:
		status = IBT_WC_REMOTE_INVALID_REQ_ERR;
		break;

	case TAVOR_CQE_REM_ACC_ERR:
		status = IBT_WC_REMOTE_ACCESS_ERR;
		break;

	case TAVOR_CQE_REM_OP_ERR:
		status = IBT_WC_REMOTE_OP_ERR;
		break;

	case TAVOR_CQE_TRANS_TO_ERR:
		status = IBT_WC_TRANS_TIMEOUT_ERR;
		break;

	case TAVOR_CQE_RNRNAK_TO_ERR:
		status = IBT_WC_RNR_NAK_TIMEOUT_ERR;
		break;

	/*
	 * The following error codes are not supported in the Tavor driver
	 * as they relate only to Reliable Datagram completion statuses:
	 *    case TAVOR_CQE_LOCAL_RDD_VIO_ERR:
	 *    case TAVOR_CQE_REM_INV_RD_REQ_ERR:
	 *    case TAVOR_CQE_EEC_REM_ABORTED_ERR:
	 *    case TAVOR_CQE_INV_EEC_NUM_ERR:
	 *    case TAVOR_CQE_INV_EEC_STATE_ERR:
	 *    case TAVOR_CQE_LOC_EEC_ERR:
	 */

	default:
		TAVOR_WARNING(state, "unknown error CQE status");
		status = IBT_WC_LOCAL_QP_OP_ERR;
		TNF_PROBE_1(tavor_cq_errcqe_consume_unknown_status,
		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
		break;
	}
	wc->wc_status = status;

	/*
	 * Now we do all the checking that's necessary to handle completion
	 * queue entry "recycling"
	 *
	 * It is not necessary here to try to sync the WQE as we are only
	 * attempting to read from the Work Queue (and hardware does not
	 * write to it).
	 */

	/*
	 * We can get doorbell info, WQE address, size for the next WQE
	 * from the "wre" (which was filled in above in the call to the
	 * tavor_wrid_get_entry() routine)
	 */
	dbd = (wre.wr_signaled_dbd & TAVOR_WRID_ENTRY_DOORBELLED) ? 1 : 0;
	next_wqeaddr = wre.wr_wqeaddrsz;
	nextwqesize  = wre.wr_wqeaddrsz & TAVOR_WQE_NDS_MASK;

	/*
	 * Get the doorbell count from the CQE.  This indicates how many
	 * completions this one CQE represents.
	 */
	doorbell_cnt = imm_eth_pkey_cred & TAVOR_CQE_ERR_DBDCNT_MASK;

	/*
	 * Determine if we're ready to consume this CQE yet or not.  If the
	 * next WQE has size zero (i.e. no next WQE) or if the doorbell count
	 * is down to zero, then this is the last/only completion represented
	 * by the current CQE (return TAVOR_CQ_SYNC_AND_DB).  Otherwise, the
	 * current CQE needs to be recycled (see below).
	 */
	if ((nextwqesize == 0) || ((doorbell_cnt == 0) && (dbd == 1))) {
		/*
		 * Consume the CQE
		 *    Return status to indicate that doorbell and sync may be
		 *    necessary.
		 */
		TAVOR_TNF_EXIT(tavor_cq_errcqe_consume);
		return (TAVOR_CQ_SYNC_AND_DB);

	} else {
		/*
		 * Recycle the CQE for use in the next PollCQ() call
		 *    Decrement the doorbell count, modify the error status,
		 *    and update the WQE address and size (to point to the
		 *    next WQE on the chain.  Put these update entries back
		 *    into the CQE.
		 *    Despite the fact that we have updated the CQE, it is not
		 *    necessary for us to attempt to sync this entry just yet
		 *    as we have not changed the "owner" bit - which
		 *    is all that the Tavor hardware really cares about.
		 */
		doorbell_cnt = doorbell_cnt - dbd;
		TAVOR_CQE_IMM_ETH_PKEY_CRED_SET(cq, cqe,
		    ((TAVOR_CQE_WR_FLUSHED_ERR << TAVOR_CQE_ERR_STATUS_SHIFT) |
		    (doorbell_cnt & TAVOR_CQE_ERR_DBDCNT_MASK)));
		TAVOR_CQE_WQEADDRSZ_SET(cq, cqe,
		    TAVOR_QP_WQEADDRSZ(next_wqeaddr, nextwqesize));

		TAVOR_TNF_EXIT(tavor_cq_errcqe_consume);
		return (TAVOR_CQ_RECYCLE_ENTRY);
	}
}


/*
 * tavor_cqe_sync()
 *    Context: Can be called from interrupt or base context.
 *
 *    DMA-syncs a single CQE (in the direction given by "flag") when the
 *    CQ requires syncing; a no-op otherwise.
 */
static void
tavor_cqe_sync(tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe, uint_t flag)
{
	ddi_dma_handle_t	dmahdl;
	off_t			offset;
	int			status;

	TAVOR_TNF_ENTER(tavor_cqe_sync);

	/* Determine if CQ needs to be synced or not */
	if (cq->cq_sync == 0) {
		TAVOR_TNF_EXIT(tavor_cqe_sync);
		return;
	}

	/* Get the DMA handle from CQ context */
	dmahdl = cq->cq_mrhdl->mr_bindinfo.bi_dmahdl;

	/* Calculate offset of next CQE */
	offset = (off_t)((uintptr_t)cqe - (uintptr_t)&cq->cq_buf[0]);
	status = ddi_dma_sync(dmahdl, offset, sizeof (tavor_hw_cqe_t), flag);
	if (status != DDI_SUCCESS) {
		TNF_PROBE_0(tavor_cqe_sync_getnextentry_fail,
		    TAVOR_TNF_ERROR, "");
		TAVOR_TNF_EXIT(tavor_cqe_sync);
		return;
	}

	TAVOR_TNF_EXIT(tavor_cqe_sync);
}


/*
 * tavor_cq_resize_helper()
 *    Context: Can be called only from user or kernel context.
1662 */ 1663 static void 1664 tavor_cq_resize_helper(tavor_cqhdl_t cq, tavor_hw_cqe_t *new_cqbuf, 1665 uint32_t old_cons_indx, uint32_t num_newcqe) 1666 { 1667 tavor_hw_cqe_t *old_cqe, *new_cqe; 1668 uint32_t new_cons_indx, wrap_around_mask; 1669 int i; 1670 1671 TAVOR_TNF_ENTER(tavor_cq_resize_helper); 1672 1673 ASSERT(MUTEX_HELD(&cq->cq_lock)); 1674 1675 /* Get the consumer index */ 1676 new_cons_indx = 0; 1677 1678 /* 1679 * Calculate the wrap around mask. Note: This operation only works 1680 * because all Tavor completion queues have power-of-2 sizes 1681 */ 1682 wrap_around_mask = (cq->cq_bufsz - 1); 1683 1684 /* 1685 * Calculate the pointers to the first CQ entry (in the "old" CQ) 1686 * and the first CQ entry in the "new" CQ 1687 */ 1688 old_cqe = &cq->cq_buf[old_cons_indx]; 1689 new_cqe = &new_cqbuf[new_cons_indx]; 1690 1691 /* Sync entire "old" CQ for use by software (if necessary). */ 1692 if (cq->cq_sync) { 1693 (void) ddi_dma_sync(cq->cq_mrhdl->mr_bindinfo.bi_dmahdl, 1694 0, cq->cq_cqinfo.qa_size, DDI_DMA_SYNC_FORCPU); 1695 } 1696 1697 /* 1698 * Keep pulling entries from the "old" CQ until we find an entry owned 1699 * by the hardware. Process each entry by copying it into the "new" 1700 * CQ and updating respective indices and pointers in the "old" CQ. 1701 */ 1702 for (i = 0; i < num_newcqe; i++) { 1703 1704 /* Copy this old CQE into the "new_cqe" pointer */ 1705 bcopy(old_cqe, new_cqe, sizeof (tavor_hw_cqe_t)); 1706 1707 /* Increment the consumer index (for both CQs) */ 1708 old_cons_indx = (old_cons_indx + 1) & wrap_around_mask; 1709 new_cons_indx = (new_cons_indx + 1); 1710 1711 /* Update the pointer to the next CQ entry */ 1712 old_cqe = &cq->cq_buf[old_cons_indx]; 1713 new_cqe = &new_cqbuf[new_cons_indx]; 1714 } 1715 1716 TAVOR_TNF_EXIT(tavor_cq_resize_helper); 1717 } 1718 1719 /* 1720 * tavor_cq_srq_entries_flush() 1721 * Context: Can be called from interrupt or base context. 
 */
void
tavor_cq_srq_entries_flush(tavor_state_t *state, tavor_qphdl_t qp)
{
	tavor_cqhdl_t		cq;
	tavor_workq_hdr_t	*wqhdr;
	tavor_hw_cqe_t		*cqe;
	tavor_hw_cqe_t		*next_cqe;
	uint32_t		cons_indx, tail_cons_indx, wrap_around_mask;
	uint32_t		new_indx, check_indx, indx;
	uint32_t		num_to_increment;
	int			cqe_qpnum, cqe_type;
	int			outstanding_cqes, removed_cqes;
	int			i;

	ASSERT(MUTEX_HELD(&qp->qp_rq_cqhdl->cq_lock));

	cq = qp->qp_rq_cqhdl;
	wqhdr = qp->qp_rq_wqhdr;

	ASSERT(wqhdr->wq_wrid_post != NULL);
	ASSERT(wqhdr->wq_wrid_post->wl_srq_en != 0);

	/*
	 * Check for user-mapped CQ memory.  Note:  We do not allow kernel
	 * clients to modify any userland mapping CQ.  If the CQ is
	 * user-mapped, then we simply return here, and this "flush" function
	 * becomes a NO-OP in this case.
	 */
	if (cq->cq_is_umap) {
		return;
	}

	/* Get the consumer index */
	cons_indx = cq->cq_consindx;

	/*
	 * Calculate the wrap around mask.  Note: This operation only works
	 * because all Tavor completion queues have power-of-2 sizes
	 */
	wrap_around_mask = (cq->cq_bufsz - 1);

	/* Calculate the pointer to the first CQ entry */
	cqe = &cq->cq_buf[cons_indx];

	/* Sync the current CQE to read */
	tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);

	/*
	 * Loop through the CQ looking for entries owned by software.  If an
	 * entry is owned by software then we increment an 'outstanding_cqes'
	 * count to know how many entries total we have on our CQ.  We use this
	 * value further down to know how many entries to loop through looking
	 * for our same QP number.
	 */
	outstanding_cqes = 0;
	tail_cons_indx = cons_indx;
	while (TAVOR_CQE_OWNER_IS_SW(cq, cqe)) {
		/* increment total cqes count */
		outstanding_cqes++;

		/* increment the consumer index */
		tail_cons_indx = (tail_cons_indx + 1) & wrap_around_mask;

		/* update the pointer to the next cq entry */
		cqe = &cq->cq_buf[tail_cons_indx];

		/* sync the next cqe to read */
		tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);
	}

	/*
	 * Using the 'tail_cons_indx' that was just set, we now know how many
	 * total CQEs possible there are.  Set the 'check_indx' and the
	 * 'new_indx' to the last entry identified by 'tail_cons_indx'
	 */
	check_indx = new_indx = (tail_cons_indx - 1) & wrap_around_mask;

	/*
	 * Walk the outstanding CQEs backwards (from newest to oldest).
	 * Entries belonging to this QP's SRQ receives are reclaimed;
	 * all other entries are compacted downward (toward 'new_indx')
	 * so no unrelated completion is lost.
	 */
	for (i = 0; i < outstanding_cqes; i++) {
		cqe = &cq->cq_buf[check_indx];

		/* Grab QP number from CQE */
		cqe_qpnum = TAVOR_CQE_QPNUM_GET(cq, cqe);
		cqe_type = TAVOR_CQE_SENDRECV_GET(cq, cqe);

		/*
		 * If the QP number is the same in the CQE as the QP that we
		 * have on this SRQ, then we must free up the entry off the
		 * SRQ.  We also make sure that the completion type is of the
		 * 'TAVOR_COMPLETION_RECV' type.  So any send completions on
		 * this CQ will be left as-is.  The handling of returning
		 * entries back to HW ownership happens further down.
		 */
		if (cqe_qpnum == qp->qp_qpnum &&
		    cqe_type == TAVOR_COMPLETION_RECV) {

			/* Add back to SRQ free list */
			(void) tavor_wrid_find_match_srq(wqhdr->wq_wrid_post,
			    cq, cqe);
		} else {
			/* Do Copy */
			if (check_indx != new_indx) {
				next_cqe = &cq->cq_buf[new_indx];

				/*
				 * Copy the CQE into the "next_cqe"
				 * pointer.
				 */
				bcopy(cqe, next_cqe, sizeof (tavor_hw_cqe_t));
			}
			new_indx = (new_indx - 1) & wrap_around_mask;
		}
		/* Move index to next CQE to check */
		check_indx = (check_indx - 1) & wrap_around_mask;
	}

	/* Initialize removed cqes count */
	removed_cqes = 0;

	/* If an entry was removed */
	if (check_indx != new_indx) {

		/*
		 * Set current pointer back to the beginning consumer index.
		 * At this point, all unclaimed entries have been copied to the
		 * index specified by 'new_indx'.  This 'new_indx' will be used
		 * as the new consumer index after we mark all freed entries as
		 * having HW ownership.  We do that here.
		 *
		 * NOTE(review): the loop condition below compares raw index
		 * values ('indx <= new_indx') rather than a wrapped distance;
		 * presumably the wrap case is handled by terminating early —
		 * confirm against the surrounding consumer-index invariants.
		 */

		/* Loop through all entries until we reach our new pointer */
		for (indx = cons_indx; indx <= new_indx;
		    indx = (indx + 1) & wrap_around_mask) {
			removed_cqes++;
			cqe = &cq->cq_buf[indx];

			/* Reset entry to hardware ownership */
			TAVOR_CQE_OWNER_SET_HW(cq, cqe);
		}
	}

	/*
	 * Update consumer index to be the 'new_indx'.  This moves it past all
	 * removed entries.  Because 'new_indx' is pointing to the last
	 * previously valid SW owned entry, we add 1 to point the cons_indx to
	 * the first HW owned entry.
	 */
	cons_indx = (new_indx + 1) & wrap_around_mask;

	/*
	 * Now we only ring the doorbell (to update the consumer index) if
	 * we've actually consumed a CQ entry.  If we found no QP number
	 * matches above, then we would not have removed anything.  So only if
	 * something was removed do we ring the doorbell.
	 */
	if ((removed_cqes != 0) && (cq->cq_consindx != cons_indx)) {
		/*
		 * Post doorbell to update the consumer index.  Doorbell
		 * value indicates number of entries consumed (minus 1).
		 * The "else" arm accounts for the consumer index having
		 * wrapped around the (power-of-2 sized) CQ buffer.
		 */
		if (cons_indx > cq->cq_consindx) {
			num_to_increment = (cons_indx - cq->cq_consindx) - 1;
		} else {
			num_to_increment = ((cons_indx + cq->cq_bufsz) -
			    cq->cq_consindx) - 1;
		}
		cq->cq_consindx = cons_indx;

		tavor_cq_doorbell(state, TAVOR_CQDB_INCR_CONSINDX,
		    cq->cq_cqnum, num_to_increment);
	}
}