Print this page
8368 remove warlock leftovers from usr/src/uts
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/common/io/ib/adapters/tavor/tavor_cq.c
+++ new/usr/src/uts/common/io/ib/adapters/tavor/tavor_cq.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 24 * Use is subject to license terms.
25 25 */
26 26
27 27 /*
28 28 * tavor_cq.c
29 29 * Tavor Completion Queue Processing Routines
30 30 *
31 31 * Implements all the routines necessary for allocating, freeing, resizing,
32 32 * and handling the completion type events that the Tavor hardware can
33 33 * generate.
34 34 */
35 35
36 36 #include <sys/types.h>
37 37 #include <sys/conf.h>
38 38 #include <sys/ddi.h>
39 39 #include <sys/sunddi.h>
40 40 #include <sys/modctl.h>
41 41 #include <sys/bitmap.h>
42 42 #include <sys/sysmacros.h>
43 43
44 44 #include <sys/ib/adapters/tavor/tavor.h>
45 45
46 46 static void tavor_cq_doorbell(tavor_state_t *state, uint32_t cq_cmd,
47 47 uint32_t cqn, uint32_t cq_param);
48 48 #pragma inline(tavor_cq_doorbell)
49 49 static int tavor_cq_cqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
50 50 tavor_hw_cqe_t *cqe, ibt_wc_t *wc);
51 51 static int tavor_cq_errcqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
52 52 tavor_hw_cqe_t *cqe, ibt_wc_t *wc);
53 53 static void tavor_cqe_sync(tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe,
54 54 uint_t flag);
55 55 static void tavor_cq_resize_helper(tavor_cqhdl_t cq, tavor_hw_cqe_t *new_cqbuf,
56 56 uint32_t old_cons_indx, uint32_t num_newcqe);
57 57
58 58 /*
59 59 * tavor_cq_alloc()
60 60 * Context: Can be called only from user or kernel context.
61 61 */
62 62 int
63 63 tavor_cq_alloc(tavor_state_t *state, ibt_cq_hdl_t ibt_cqhdl,
64 64 ibt_cq_attr_t *cq_attr, uint_t *actual_size, tavor_cqhdl_t *cqhdl,
65 65 uint_t sleepflag)
66 66 {
67 67 tavor_rsrc_t *cqc, *rsrc;
68 68 tavor_umap_db_entry_t *umapdb;
69 69 tavor_hw_cqc_t cqc_entry;
70 70 tavor_cqhdl_t cq;
71 71 ibt_mr_attr_t mr_attr;
72 72 tavor_mr_options_t op;
73 73 tavor_pdhdl_t pd;
↓ open down ↓ |
73 lines elided |
↑ open up ↑ |
74 74 tavor_mrhdl_t mr;
75 75 tavor_hw_cqe_t *buf;
76 76 uint64_t addr, value;
77 77 uint32_t log_cq_size, lkey, uarpg;
78 78 uint_t dma_xfer_mode, cq_sync, cq_is_umap;
79 79 int status, i, flag;
80 80 char *errormsg;
81 81
82 82 TAVOR_TNF_ENTER(tavor_cq_alloc);
83 83
84 - _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*cq_attr))
85 -
86 84 /*
87 85 * Determine whether CQ is being allocated for userland access or
88 86 * whether it is being allocated for kernel access. If the CQ is
89 87 * being allocated for userland access, then lookup the UAR doorbell
90 88 * page number for the current process. Note: If this is not found
91 89 * (e.g. if the process has not previously open()'d the Tavor driver),
92 90 * then an error is returned.
93 91 */
94 92 cq_is_umap = (cq_attr->cq_flags & IBT_CQ_USER_MAP) ? 1 : 0;
95 93 if (cq_is_umap) {
96 94 status = tavor_umap_db_find(state->ts_instance, ddi_get_pid(),
97 95 MLNX_UMAP_UARPG_RSRC, &value, 0, NULL);
98 96 if (status != DDI_SUCCESS) {
99 97 /* Set "status" and "errormsg" and goto failure */
100 98 TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "failed UAR page");
101 99 goto cqalloc_fail;
102 100 }
103 101 uarpg = ((tavor_rsrc_t *)(uintptr_t)value)->tr_indx;
104 102 }
105 103
106 104 /* Use the internal protection domain (PD) for setting up CQs */
107 105 pd = state->ts_pdhdl_internal;
108 106
109 107 /* Increment the reference count on the protection domain (PD) */
110 108 tavor_pd_refcnt_inc(pd);
111 109
112 110 /*
113 111 * Allocate an CQ context entry. This will be filled in with all
114 112 * the necessary parameters to define the Completion Queue. And then
115 113 * ownership will be passed to the hardware in the final step
116 114 * below. If we fail here, we must undo the protection domain
117 115 * reference count.
118 116 */
119 117 status = tavor_rsrc_alloc(state, TAVOR_CQC, 1, sleepflag, &cqc);
120 118 if (status != DDI_SUCCESS) {
121 119 /* Set "status" and "errormsg" and goto failure */
122 120 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed CQ context");
123 121 goto cqalloc_fail1;
124 122 }
125 123
126 124 /*
127 125 * Allocate the software structure for tracking the completion queue
128 126 * (i.e. the Tavor Completion Queue handle). If we fail here, we must
↓ open down ↓ |
33 lines elided |
↑ open up ↑ |
129 127 * undo the protection domain reference count and the previous
130 128 * resource allocation.
131 129 */
132 130 status = tavor_rsrc_alloc(state, TAVOR_CQHDL, 1, sleepflag, &rsrc);
133 131 if (status != DDI_SUCCESS) {
134 132 /* Set "status" and "errormsg" and goto failure */
135 133 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed CQ handle");
136 134 goto cqalloc_fail2;
137 135 }
138 136 cq = (tavor_cqhdl_t)rsrc->tr_addr;
139 - _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*cq))
140 137 cq->cq_is_umap = cq_is_umap;
141 138
142 139 /* Use the index as CQ number */
143 140 cq->cq_cqnum = cqc->tr_indx;
144 141
145 142 /*
146 143 * If this will be a user-mappable CQ, then allocate an entry for
147 144 * the "userland resources database". This will later be added to
148 145 * the database (after all further CQ operations are successful).
149 146 * If we fail here, we must undo the reference counts and the
150 147 * previous resource allocation.
151 148 */
152 149 if (cq->cq_is_umap) {
153 150 umapdb = tavor_umap_db_alloc(state->ts_instance, cq->cq_cqnum,
154 151 MLNX_UMAP_CQMEM_RSRC, (uint64_t)(uintptr_t)rsrc);
155 152 if (umapdb == NULL) {
156 153 /* Set "status" and "errormsg" and goto failure */
157 154 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umap add");
158 155 goto cqalloc_fail3;
159 156 }
160 157 }
161 158
162 159 /*
163 160 * Calculate the appropriate size for the completion queue.
164 161 * Note: All Tavor CQs must be a power-of-2 minus 1 in size. Also
165 162 * they may not be any smaller than TAVOR_CQ_MIN_SIZE. This step is
166 163 * to round the requested size up to the next highest power-of-2
167 164 */
168 165 cq_attr->cq_size = max(cq_attr->cq_size, TAVOR_CQ_MIN_SIZE);
169 166 log_cq_size = highbit(cq_attr->cq_size);
170 167
171 168 /*
172 169 * Next we verify that the rounded-up size is valid (i.e. consistent
173 170 * with the device limits and/or software-configured limits)
174 171 */
175 172 if (log_cq_size > state->ts_cfg_profile->cp_log_max_cq_sz) {
176 173 /* Set "status" and "errormsg" and goto failure */
177 174 TAVOR_TNF_FAIL(IBT_HCA_CQ_EXCEEDED, "max CQ size");
178 175 goto cqalloc_fail4;
179 176 }
180 177
181 178 /*
182 179 * Allocate the memory for Completion Queue.
183 180 *
184 181 * Note: Although we use the common queue allocation routine, we
185 182 * always specify TAVOR_QUEUE_LOCATION_NORMAL (i.e. CQ located in
186 183 * kernel system memory) for kernel CQs because it would be
187 184 * inefficient to have CQs located in DDR memory. This is primarily
188 185 * because CQs are read from (by software) more than they are written
189 186 * to. (We always specify TAVOR_QUEUE_LOCATION_USERLAND for all
190 187 * user-mappable CQs for a similar reason.)
191 188 * It is also worth noting that, unlike Tavor QP work queues,
192 189 * completion queues do not have the same strict alignment
193 190 * requirements. It is sufficient for the CQ memory to be both
194 191 * aligned to and bound to addresses which are a multiple of CQE size.
195 192 */
196 193 cq->cq_cqinfo.qa_size = (1 << log_cq_size) * sizeof (tavor_hw_cqe_t);
197 194 cq->cq_cqinfo.qa_alloc_align = sizeof (tavor_hw_cqe_t);
198 195 cq->cq_cqinfo.qa_bind_align = sizeof (tavor_hw_cqe_t);
199 196 if (cq->cq_is_umap) {
200 197 cq->cq_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
↓ open down ↓ |
51 lines elided |
↑ open up ↑ |
201 198 } else {
202 199 cq->cq_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_NORMAL;
203 200 }
204 201 status = tavor_queue_alloc(state, &cq->cq_cqinfo, sleepflag);
205 202 if (status != DDI_SUCCESS) {
206 203 /* Set "status" and "errormsg" and goto failure */
207 204 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed completion queue");
208 205 goto cqalloc_fail4;
209 206 }
210 207 buf = (tavor_hw_cqe_t *)cq->cq_cqinfo.qa_buf_aligned;
211 - _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))
212 208
213 209 /*
214 210 * Initialize each of the Completion Queue Entries (CQE) by setting
215 211 * their ownership to hardware ("owner" bit set to HW). This is in
216 212 * preparation for the final transfer of ownership (below) of the
217 213 * CQ context itself.
218 214 */
219 215 for (i = 0; i < (1 << log_cq_size); i++) {
220 216 TAVOR_CQE_OWNER_SET_HW(cq, &buf[i]);
221 217 }
222 218
223 219 /*
224 220 * Register the memory for the CQ. The memory for the CQ must
225 221 * be registered in the Tavor TPT tables. This gives us the LKey
226 222 * to specify in the CQ context below. Note: If this is a user-
227 223 * mappable CQ, then we will force DDI_DMA_CONSISTENT mapping.
228 224 */
229 225 flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP : IBT_MR_NOSLEEP;
230 226 mr_attr.mr_vaddr = (uint64_t)(uintptr_t)buf;
231 227 mr_attr.mr_len = cq->cq_cqinfo.qa_size;
232 228 mr_attr.mr_as = NULL;
233 229 mr_attr.mr_flags = flag | IBT_MR_ENABLE_LOCAL_WRITE;
234 230 if (cq->cq_is_umap) {
235 231 dma_xfer_mode = DDI_DMA_CONSISTENT;
236 232 } else {
237 233 dma_xfer_mode = state->ts_cfg_profile->cp_streaming_consistent;
238 234 }
239 235 if (dma_xfer_mode == DDI_DMA_STREAMING) {
240 236 mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
↓ open down ↓ |
19 lines elided |
↑ open up ↑ |
241 237 }
242 238 op.mro_bind_type = state->ts_cfg_profile->cp_iommu_bypass;
243 239 op.mro_bind_dmahdl = cq->cq_cqinfo.qa_dmahdl;
244 240 op.mro_bind_override_addr = 0;
245 241 status = tavor_mr_register(state, pd, &mr_attr, &mr, &op);
246 242 if (status != DDI_SUCCESS) {
247 243 /* Set "status" and "errormsg" and goto failure */
248 244 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed register mr");
249 245 goto cqalloc_fail5;
250 246 }
251 - _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
252 247 addr = mr->mr_bindinfo.bi_addr;
253 248 lkey = mr->mr_lkey;
254 249
255 250 /* Determine if later ddi_dma_sync will be necessary */
256 251 cq_sync = TAVOR_CQ_IS_SYNC_REQ(state, cq->cq_cqinfo);
257 252
258 253 /* Sync entire CQ for use by the hardware (if necessary). */
259 254 if (cq_sync) {
260 255 (void) ddi_dma_sync(mr->mr_bindinfo.bi_dmahdl, 0,
261 256 cq->cq_cqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
262 257 }
263 258
264 259 /*
265 260 * Fill in the CQC entry. This is the final step before passing
266 261 * ownership of the CQC entry to the Tavor hardware. We use all of
267 262 * the information collected/calculated above to fill in the
268 263 * requisite portions of the CQC. Note: If this CQ is going to be
269 264 * used for userland access, then we need to set the UAR page number
270 265 * appropriately (otherwise it's a "don't care")
271 266 */
272 267 bzero(&cqc_entry, sizeof (tavor_hw_cqc_t));
273 268 cq->cq_eqnum = TAVOR_CQ_EQNUM_GET(cq->cq_cqnum);
274 269 cq->cq_erreqnum = TAVOR_CQ_ERREQNUM_GET(cq->cq_cqnum);
275 270 cqc_entry.xlat = TAVOR_VA2PA_XLAT_ENABLED;
276 271 cqc_entry.state = TAVOR_CQ_DISARMED;
277 272 cqc_entry.start_addr_h = (addr >> 32);
278 273 cqc_entry.start_addr_l = (addr & 0xFFFFFFFF);
279 274 cqc_entry.log_cq_sz = log_cq_size;
280 275 if (cq->cq_is_umap) {
281 276 cqc_entry.usr_page = uarpg;
282 277 } else {
283 278 cqc_entry.usr_page = 0;
284 279 }
285 280 cqc_entry.pd = pd->pd_pdnum;
286 281 cqc_entry.lkey = lkey;
287 282 cqc_entry.e_eqn = cq->cq_erreqnum;
288 283 cqc_entry.c_eqn = cq->cq_eqnum;
289 284 cqc_entry.cqn = cq->cq_cqnum;
290 285
291 286 /*
292 287 * Write the CQC entry to hardware. Lastly, we pass ownership of
293 288 * the entry to the hardware (using the Tavor SW2HW_CQ firmware
294 289 * command). Note: In general, this operation shouldn't fail. But
295 290 * if it does, we have to undo everything we've done above before
296 291 * returning error.
297 292 */
298 293 status = tavor_cmn_ownership_cmd_post(state, SW2HW_CQ, &cqc_entry,
299 294 sizeof (tavor_hw_cqc_t), cq->cq_cqnum, sleepflag);
300 295 if (status != TAVOR_CMD_SUCCESS) {
301 296 cmn_err(CE_CONT, "Tavor: SW2HW_CQ command failed: %08x\n",
302 297 status);
303 298 TNF_PROBE_1(tavor_cq_alloc_sw2hw_cq_cmd_fail,
304 299 TAVOR_TNF_ERROR, "", tnf_uint, status, status);
305 300 /* Set "status" and "errormsg" and goto failure */
306 301 TAVOR_TNF_FAIL(ibc_get_ci_failure(0), "tavor SW2HW_CQ command");
307 302 goto cqalloc_fail6;
308 303 }
309 304
310 305 /*
311 306 * Fill in the rest of the Tavor Completion Queue handle. Having
312 307 * successfully transferred ownership of the CQC, we can update the
313 308 * following fields for use in further operations on the CQ.
314 309 */
315 310 cq->cq_cqcrsrcp = cqc;
316 311 cq->cq_rsrcp = rsrc;
317 312 cq->cq_consindx = 0;
318 313 cq->cq_buf = buf;
319 314 cq->cq_bufsz = (1 << log_cq_size);
320 315 cq->cq_mrhdl = mr;
321 316 cq->cq_sync = cq_sync;
322 317 cq->cq_refcnt = 0;
323 318 cq->cq_is_special = 0;
324 319 cq->cq_uarpg = uarpg;
325 320 cq->cq_umap_dhp = (devmap_cookie_t)NULL;
326 321 avl_create(&cq->cq_wrid_wqhdr_avl_tree, tavor_wrid_wqhdr_compare,
327 322 sizeof (struct tavor_workq_hdr_s),
328 323 offsetof(struct tavor_workq_hdr_s, wq_avl_link));
329 324
330 325 cq->cq_wrid_reap_head = NULL;
331 326 cq->cq_wrid_reap_tail = NULL;
332 327 cq->cq_hdlrarg = (void *)ibt_cqhdl;
333 328
334 329 /*
335 330 * Put CQ handle in Tavor CQNum-to-CQHdl list. Then fill in the
336 331 * "actual_size" and "cqhdl" and return success
337 332 */
338 333 ASSERT(state->ts_cqhdl[cqc->tr_indx] == NULL);
339 334 state->ts_cqhdl[cqc->tr_indx] = cq;
340 335
341 336 /*
342 337 * If this is a user-mappable CQ, then we need to insert the previously
343 338 * allocated entry into the "userland resources database". This will
344 339 * allow for later lookup during devmap() (i.e. mmap()) calls.
345 340 */
346 341 if (cq->cq_is_umap) {
347 342 tavor_umap_db_add(umapdb);
348 343 }
349 344
350 345 /*
351 346 * Fill in the return arguments (if necessary). This includes the
352 347 * real completion queue size.
353 348 */
354 349 if (actual_size != NULL) {
355 350 *actual_size = (1 << log_cq_size) - 1;
356 351 }
357 352 *cqhdl = cq;
358 353
359 354 TAVOR_TNF_EXIT(tavor_cq_alloc);
360 355 return (DDI_SUCCESS);
361 356
362 357 /*
363 358 * The following is cleanup for all possible failure cases in this routine
364 359 */
365 360 cqalloc_fail6:
366 361 if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
367 362 sleepflag) != DDI_SUCCESS) {
368 363 TAVOR_WARNING(state, "failed to deregister CQ memory");
369 364 }
370 365 cqalloc_fail5:
371 366 tavor_queue_free(state, &cq->cq_cqinfo);
372 367 cqalloc_fail4:
373 368 if (cq_is_umap) {
374 369 tavor_umap_db_free(umapdb);
375 370 }
376 371 cqalloc_fail3:
377 372 tavor_rsrc_free(state, &rsrc);
378 373 cqalloc_fail2:
379 374 tavor_rsrc_free(state, &cqc);
380 375 cqalloc_fail1:
381 376 tavor_pd_refcnt_dec(pd);
382 377 cqalloc_fail:
383 378 TNF_PROBE_1(tavor_cq_alloc_fail, TAVOR_TNF_ERROR, "",
384 379 tnf_string, msg, errormsg);
385 380 TAVOR_TNF_EXIT(tavor_cq_alloc);
386 381 return (status);
387 382 }
388 383
389 384
390 385 /*
391 386 * tavor_cq_free()
392 387 * Context: Can be called only from user or kernel context.
393 388 */
394 389 /* ARGSUSED */
395 390 int
396 391 tavor_cq_free(tavor_state_t *state, tavor_cqhdl_t *cqhdl, uint_t sleepflag)
397 392 {
398 393 tavor_rsrc_t *cqc, *rsrc;
399 394 tavor_umap_db_entry_t *umapdb;
400 395 tavor_hw_cqc_t cqc_entry;
401 396 tavor_pdhdl_t pd;
402 397 tavor_mrhdl_t mr;
403 398 tavor_cqhdl_t cq;
404 399 uint32_t cqnum;
405 400 uint64_t value;
406 401 uint_t maxprot;
407 402 int status;
408 403
409 404 TAVOR_TNF_ENTER(tavor_cq_free);
410 405
411 406 /*
412 407 * Pull all the necessary information from the Tavor Completion Queue
413 408 * handle. This is necessary here because the resource for the
414 409 * CQ handle is going to be freed up as part of this operation.
415 410 */
416 411 cq = *cqhdl;
417 412 mutex_enter(&cq->cq_lock);
418 413 cqc = cq->cq_cqcrsrcp;
419 414 rsrc = cq->cq_rsrcp;
420 415 pd = state->ts_pdhdl_internal;
421 416 mr = cq->cq_mrhdl;
422 417 cqnum = cq->cq_cqnum;
423 418
424 419 /*
425 420 * If there are work queues still associated with the CQ, then return
426 421 * an error. Otherwise, we will be holding the CQ lock.
427 422 */
428 423 if (cq->cq_refcnt != 0) {
429 424 mutex_exit(&cq->cq_lock);
430 425 TNF_PROBE_1(tavor_cq_free_refcnt_fail, TAVOR_TNF_ERROR, "",
431 426 tnf_int, refcnt, cq->cq_refcnt);
432 427 TAVOR_TNF_EXIT(tavor_cq_free);
433 428 return (IBT_CQ_BUSY);
434 429 }
435 430
436 431 /*
437 432 * If this was a user-mappable CQ, then we need to remove its entry
438 433 * from the "userland resources database". If it is also currently
439 434 * mmap()'d out to a user process, then we need to call
440 435 * devmap_devmem_remap() to remap the CQ memory to an invalid mapping.
441 436 * We also need to invalidate the CQ tracking information for the
442 437 * user mapping.
443 438 */
444 439 if (cq->cq_is_umap) {
445 440 status = tavor_umap_db_find(state->ts_instance, cqnum,
446 441 MLNX_UMAP_CQMEM_RSRC, &value, TAVOR_UMAP_DB_REMOVE,
447 442 &umapdb);
448 443 if (status != DDI_SUCCESS) {
449 444 mutex_exit(&cq->cq_lock);
450 445 TAVOR_WARNING(state, "failed to find in database");
451 446 TAVOR_TNF_EXIT(tavor_cq_free);
452 447 return (ibc_get_ci_failure(0));
453 448 }
454 449 tavor_umap_db_free(umapdb);
455 450 if (cq->cq_umap_dhp != NULL) {
456 451 maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
457 452 status = devmap_devmem_remap(cq->cq_umap_dhp,
458 453 state->ts_dip, 0, 0, cq->cq_cqinfo.qa_size,
459 454 maxprot, DEVMAP_MAPPING_INVALID, NULL);
460 455 if (status != DDI_SUCCESS) {
461 456 mutex_exit(&cq->cq_lock);
462 457 TAVOR_WARNING(state, "failed in CQ memory "
463 458 "devmap_devmem_remap()");
464 459 TAVOR_TNF_EXIT(tavor_cq_free);
465 460 return (ibc_get_ci_failure(0));
466 461 }
467 462 cq->cq_umap_dhp = (devmap_cookie_t)NULL;
468 463 }
469 464 }
470 465
471 466 /*
472 467 * Put NULL into the Tavor CQNum-to-CQHdl list. This will allow any
473 468 * in-progress events to detect that the CQ corresponding to this
474 469 * number has been freed.
475 470 */
476 471 state->ts_cqhdl[cqc->tr_indx] = NULL;
↓ open down ↓ |
215 lines elided |
↑ open up ↑ |
477 472
478 473 /*
479 474 * While we hold the CQ lock, do a "forced reap" of the workQ WRID
480 475 * list. This cleans up all the structures associated with the WRID
481 476 * processing for this CQ. Once we complete, drop the lock and finish
482 477 * the deallocation of the CQ.
483 478 */
484 479 tavor_wrid_cq_force_reap(cq);
485 480
486 481 mutex_exit(&cq->cq_lock);
487 - _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*cq))
488 482
489 483 /*
490 484 * Reclaim CQC entry from hardware (using the Tavor HW2SW_CQ
491 485 * firmware command). If the ownership transfer fails for any reason,
492 486 * then it is an indication that something (either in HW or SW) has
493 487 * gone seriously wrong.
494 488 */
495 489 status = tavor_cmn_ownership_cmd_post(state, HW2SW_CQ, &cqc_entry,
496 490 sizeof (tavor_hw_cqc_t), cqnum, sleepflag);
497 491 if (status != TAVOR_CMD_SUCCESS) {
498 492 TAVOR_WARNING(state, "failed to reclaim CQC ownership");
499 493 cmn_err(CE_CONT, "Tavor: HW2SW_CQ command failed: %08x\n",
500 494 status);
501 495 TNF_PROBE_1(tavor_cq_free_hw2sw_cq_cmd_fail,
502 496 TAVOR_TNF_ERROR, "", tnf_uint, status, status);
503 497 TAVOR_TNF_EXIT(tavor_cq_free);
504 498 return (ibc_get_ci_failure(0));
505 499 }
506 500
507 501 /*
508 502 * Deregister the memory for the Completion Queue. If this fails
509 503 * for any reason, then it is an indication that something (either
510 504 * in HW or SW) has gone seriously wrong. So we print a warning
511 505 * message and return.
512 506 */
513 507 status = tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
514 508 sleepflag);
515 509 if (status != DDI_SUCCESS) {
516 510 TAVOR_WARNING(state, "failed to deregister CQ memory");
517 511 TNF_PROBE_0(tavor_cq_free_dereg_mr_fail, TAVOR_TNF_ERROR, "");
518 512 TAVOR_TNF_EXIT(tavor_cq_free);
519 513 return (ibc_get_ci_failure(0));
520 514 }
521 515
522 516 /* Free the memory for the CQ */
523 517 tavor_queue_free(state, &cq->cq_cqinfo);
524 518
525 519 /* Free the Tavor Completion Queue handle */
526 520 tavor_rsrc_free(state, &rsrc);
527 521
528 522 /* Free up the CQC entry resource */
529 523 tavor_rsrc_free(state, &cqc);
530 524
531 525 /* Decrement the reference count on the protection domain (PD) */
532 526 tavor_pd_refcnt_dec(pd);
533 527
534 528 /* Set the cqhdl pointer to NULL and return success */
535 529 *cqhdl = NULL;
536 530
537 531 TAVOR_TNF_EXIT(tavor_cq_free);
538 532 return (DDI_SUCCESS);
539 533 }
540 534
541 535
542 536 /*
543 537 * tavor_cq_resize()
544 538 * Context: Can be called only from user or kernel context.
545 539 */
546 540 int
547 541 tavor_cq_resize(tavor_state_t *state, tavor_cqhdl_t cq, uint_t req_size,
548 542 uint_t *actual_size, uint_t sleepflag)
549 543 {
550 544 tavor_hw_cqc_t cqc_entry;
551 545 tavor_qalloc_info_t new_cqinfo, old_cqinfo;
552 546 ibt_mr_attr_t mr_attr;
553 547 tavor_mr_options_t op;
554 548 tavor_pdhdl_t pd;
555 549 tavor_mrhdl_t mr, mr_old;
556 550 tavor_hw_cqe_t *buf;
557 551 uint32_t new_prod_indx, old_cons_indx;
558 552 uint_t dma_xfer_mode, cq_sync, log_cq_size, maxprot;
559 553 int status, i, flag;
560 554 char *errormsg;
561 555
562 556 TAVOR_TNF_ENTER(tavor_cq_resize);
563 557
564 558 /* Use the internal protection domain (PD) for CQs */
565 559 pd = state->ts_pdhdl_internal;
566 560
567 561 /*
568 562 * Calculate the appropriate size for the new resized completion queue.
569 563 * Note: All Tavor CQs must be a power-of-2 minus 1 in size. Also
570 564 * they may not be any smaller than TAVOR_CQ_MIN_SIZE. This step is
571 565 * to round the requested size up to the next highest power-of-2
572 566 */
573 567 req_size = max(req_size, TAVOR_CQ_MIN_SIZE);
574 568 log_cq_size = highbit(req_size);
575 569
576 570 /*
577 571 * Next we verify that the rounded-up size is valid (i.e. consistent
578 572 * with the device limits and/or software-configured limits)
579 573 */
580 574 if (log_cq_size > state->ts_cfg_profile->cp_log_max_cq_sz) {
581 575 /* Set "status" and "errormsg" and goto failure */
582 576 TAVOR_TNF_FAIL(IBT_HCA_CQ_EXCEEDED, "max CQ size");
583 577 goto cqresize_fail;
584 578 }
585 579
586 580 /*
587 581 * Allocate the memory for newly resized Completion Queue.
588 582 *
589 583 * Note: Although we use the common queue allocation routine, we
590 584 * always specify TAVOR_QUEUE_LOCATION_NORMAL (i.e. CQ located in
591 585 * kernel system memory) for kernel CQs because it would be
592 586 * inefficient to have CQs located in DDR memory. This is the same
593 587 * as we do when we first allocate completion queues primarily
594 588 * because CQs are read from (by software) more than they are written
595 589 * to. (We always specify TAVOR_QUEUE_LOCATION_USERLAND for all
596 590 * user-mappable CQs for a similar reason.)
597 591 * It is also worth noting that, unlike Tavor QP work queues,
598 592 * completion queues do not have the same strict alignment
599 593 * requirements. It is sufficient for the CQ memory to be both
600 594 * aligned to and bound to addresses which are a multiple of CQE size.
601 595 */
602 596 new_cqinfo.qa_size = (1 << log_cq_size) * sizeof (tavor_hw_cqe_t);
603 597 new_cqinfo.qa_alloc_align = sizeof (tavor_hw_cqe_t);
604 598 new_cqinfo.qa_bind_align = sizeof (tavor_hw_cqe_t);
605 599 if (cq->cq_is_umap) {
606 600 new_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
↓ open down ↓ |
109 lines elided |
↑ open up ↑ |
607 601 } else {
608 602 new_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_NORMAL;
609 603 }
610 604 status = tavor_queue_alloc(state, &new_cqinfo, sleepflag);
611 605 if (status != DDI_SUCCESS) {
612 606 /* Set "status" and "errormsg" and goto failure */
613 607 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed completion queue");
614 608 goto cqresize_fail;
615 609 }
616 610 buf = (tavor_hw_cqe_t *)new_cqinfo.qa_buf_aligned;
617 - _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))
618 611
619 612 /*
620 613 * Initialize each of the Completion Queue Entries (CQE) by setting
621 614 * their ownership to hardware ("owner" bit set to HW). This is in
622 615 * preparation for the final resize operation (below).
623 616 */
624 617 for (i = 0; i < (1 << log_cq_size); i++) {
625 618 TAVOR_CQE_OWNER_SET_HW(cq, &buf[i]);
626 619 }
627 620
628 621 /*
629 622 * Register the memory for the CQ. The memory for the CQ must
630 623 * be registered in the Tavor TPT tables. This gives us the LKey
631 624 * to specify in the CQ context below.
632 625 */
633 626 flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP : IBT_MR_NOSLEEP;
634 627 mr_attr.mr_vaddr = (uint64_t)(uintptr_t)buf;
635 628 mr_attr.mr_len = new_cqinfo.qa_size;
636 629 mr_attr.mr_as = NULL;
637 630 mr_attr.mr_flags = flag | IBT_MR_ENABLE_LOCAL_WRITE;
638 631 if (cq->cq_is_umap) {
639 632 dma_xfer_mode = DDI_DMA_CONSISTENT;
640 633 } else {
641 634 dma_xfer_mode = state->ts_cfg_profile->cp_streaming_consistent;
642 635 }
643 636 if (dma_xfer_mode == DDI_DMA_STREAMING) {
644 637 mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
645 638 }
↓ open down ↓ |
18 lines elided |
↑ open up ↑ |
646 639 op.mro_bind_type = state->ts_cfg_profile->cp_iommu_bypass;
647 640 op.mro_bind_dmahdl = new_cqinfo.qa_dmahdl;
648 641 op.mro_bind_override_addr = 0;
649 642 status = tavor_mr_register(state, pd, &mr_attr, &mr, &op);
650 643 if (status != DDI_SUCCESS) {
651 644 tavor_queue_free(state, &new_cqinfo);
652 645 /* Set "status" and "errormsg" and goto failure */
653 646 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed register mr");
654 647 goto cqresize_fail;
655 648 }
656 - _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
657 649
658 650 /* Determine if later ddi_dma_sync will be necessary */
659 651 cq_sync = TAVOR_CQ_IS_SYNC_REQ(state, new_cqinfo);
660 652
661 653 /* Sync entire "new" CQ for use by hardware (if necessary) */
662 654 if (cq_sync) {
663 655 (void) ddi_dma_sync(mr->mr_bindinfo.bi_dmahdl, 0,
664 656 new_cqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
665 657 }
666 658
667 659 /*
668 660 * Now we grab the CQ lock. Since we will be updating the actual
669 661 * CQ location and the producer/consumer indexes, we should hold
670 662 * the lock.
671 663 *
672 664 * We do a TAVOR_NOSLEEP here (and below), though, because we are
673 665 * holding the "cq_lock" and if we got raised to interrupt level
674 666 * by priority inversion, we would not want to block in this routine
675 667 * waiting for success.
676 668 */
677 669 mutex_enter(&cq->cq_lock);
678 670
679 671 /*
680 672 * Determine the current CQ "consumer index".
681 673 *
682 674 * Note: This will depend on whether the CQ had previously been
683 675 * mapped for user access or whether it is a kernel CQ. If this
684 676 * is a kernel CQ, then all PollCQ() operations have come through
685 677 * the IBTF and, hence, the driver's CQ state structure will
686 678 * contain the current consumer index. If, however, the user has
687 679 * accessed this CQ by bypassing the driver (OS-bypass), then we
688 680 * need to query the firmware to determine the current CQ consumer
689 681 * index. This also assumes that the user process will not continue
690 682 * to consume entries while at the same time doing the ResizeCQ()
691 683 * operation. If the user process does not guarantee this, then it
692 684 * may see duplicate or missed completions. But under no
693 685 * circumstances should this panic the system.
694 686 */
695 687 if (cq->cq_is_umap) {
696 688 status = tavor_cmn_query_cmd_post(state, QUERY_CQ,
697 689 cq->cq_cqnum, &cqc_entry, sizeof (tavor_hw_cqc_t),
698 690 TAVOR_NOSLEEP);
699 691 if (status != TAVOR_CMD_SUCCESS) {
700 692 /* Query CQ has failed, drop CQ lock and cleanup */
701 693 mutex_exit(&cq->cq_lock);
702 694 if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
703 695 sleepflag) != DDI_SUCCESS) {
704 696 TAVOR_WARNING(state, "failed to deregister "
705 697 "CQ memory");
706 698 }
707 699 tavor_queue_free(state, &new_cqinfo);
708 700 TAVOR_WARNING(state, "failed to find in database");
709 701
710 702 /* Set "status" and "errormsg" and goto failure */
711 703 TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
712 704 "failed umap lookup");
713 705 goto cqresize_fail;
714 706 }
715 707 old_cons_indx = cqc_entry.cons_indx;
716 708 } else {
717 709 old_cons_indx = cq->cq_consindx;
718 710 }
719 711
720 712 /*
721 713 * Fill in the CQC entry. For the resize operation this is the
722 714 * final step before attempting the resize operation on the CQC entry.
723 715 * We use all of the information collected/calculated above to fill
724 716 * in the requisite portions of the CQC.
725 717 */
726 718 bzero(&cqc_entry, sizeof (tavor_hw_cqc_t));
727 719 cqc_entry.start_addr_h = (mr->mr_bindinfo.bi_addr >> 32);
728 720 cqc_entry.start_addr_l = (mr->mr_bindinfo.bi_addr & 0xFFFFFFFF);
729 721 cqc_entry.log_cq_sz = log_cq_size;
730 722 cqc_entry.lkey = mr->mr_lkey;
731 723
732 724 /*
733 725 * Write the CQC entry to hardware. Lastly, we pass ownership of
734 726 * the entry to the hardware (using the Tavor RESIZE_CQ firmware
735 727 * command). Note: In general, this operation shouldn't fail. But
736 728 * if it does, we have to undo everything we've done above before
737 729 * returning error. Also note that the status returned may indicate
738 730 * the code to return to the IBTF.
739 731 */
740 732 status = tavor_resize_cq_cmd_post(state, &cqc_entry, cq->cq_cqnum,
741 733 &new_prod_indx, TAVOR_CMD_NOSLEEP_SPIN);
742 734 if (status != TAVOR_CMD_SUCCESS) {
743 735 /* Resize attempt has failed, drop CQ lock and cleanup */
744 736 mutex_exit(&cq->cq_lock);
745 737 if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
746 738 sleepflag) != DDI_SUCCESS) {
747 739 TAVOR_WARNING(state, "failed to deregister CQ memory");
748 740 }
749 741 tavor_queue_free(state, &new_cqinfo);
750 742 if (status == TAVOR_CMD_BAD_SIZE) {
751 743 TAVOR_TNF_EXIT(tavor_cq_resize);
752 744 return (IBT_CQ_SZ_INSUFFICIENT);
753 745 } else {
754 746 cmn_err(CE_CONT, "Tavor: RESIZE_CQ command failed: "
755 747 "%08x\n", status);
756 748 TNF_PROBE_1(tavor_cq_resize_cq_cmd_fail,
757 749 TAVOR_TNF_ERROR, "", tnf_uint, status, status);
758 750 TAVOR_TNF_EXIT(tavor_cq_resize);
759 751 return (ibc_get_ci_failure(0));
760 752 }
761 753 }
762 754
763 755 /*
764 756 * The CQ resize attempt was successful. Before dropping the CQ lock,
765 757 * copy all of the CQEs from the "old" CQ into the "new" CQ. Note:
766 758 * the Tavor firmware guarantees us that sufficient space is set aside
767 759 * in the "new" CQ to handle any un-polled CQEs from the "old" CQ.
768 760 * The two parameters to this helper function ("old_cons_indx" and
769 761 * "new_prod_indx") essentially indicate the starting index and number
770 762 * of any CQEs that might remain in the "old" CQ memory.
771 763 */
772 764 tavor_cq_resize_helper(cq, buf, old_cons_indx, new_prod_indx);
773 765
774 766 /* Sync entire "new" CQ for use by hardware (if necessary) */
775 767 if (cq_sync) {
776 768 (void) ddi_dma_sync(mr->mr_bindinfo.bi_dmahdl, 0,
777 769 new_cqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
778 770 }
779 771
780 772 /*
781 773 * Update the Tavor Completion Queue handle with all the new
782 774 * information. At the same time, save away all the necessary
783 775 * information for freeing up the old resources
784 776 */
785 777 mr_old = cq->cq_mrhdl;
786 778 old_cqinfo = cq->cq_cqinfo;
787 779 cq->cq_cqinfo = new_cqinfo;
788 780 cq->cq_consindx = 0;
789 781 cq->cq_buf = buf;
790 782 cq->cq_bufsz = (1 << log_cq_size);
791 783 cq->cq_mrhdl = mr;
792 784 cq->cq_sync = cq_sync;
793 785
794 786 /*
795 787 * If "old" CQ was a user-mappable CQ that is currently mmap()'d out
796 788 * to a user process, then we need to call devmap_devmem_remap() to
797 789 * invalidate the mapping to the CQ memory. We also need to
798 790 * invalidate the CQ tracking information for the user mapping.
799 791 */
800 792 if ((cq->cq_is_umap) && (cq->cq_umap_dhp != NULL)) {
801 793 maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
802 794 status = devmap_devmem_remap(cq->cq_umap_dhp,
803 795 state->ts_dip, 0, 0, cq->cq_cqinfo.qa_size, maxprot,
804 796 DEVMAP_MAPPING_INVALID, NULL);
805 797 if (status != DDI_SUCCESS) {
806 798 mutex_exit(&cq->cq_lock);
807 799 TAVOR_WARNING(state, "failed in CQ memory "
808 800 "devmap_devmem_remap()");
809 801 TAVOR_TNF_EXIT(tavor_cq_free);
810 802 return (ibc_get_ci_failure(0));
811 803 }
812 804 cq->cq_umap_dhp = (devmap_cookie_t)NULL;
813 805 }
814 806
815 807 /*
816 808 * Drop the CQ lock now. The only thing left to do is to free up
817 809 * the old resources.
818 810 */
819 811 mutex_exit(&cq->cq_lock);
820 812
821 813 /*
822 814 * Deregister the memory for the old Completion Queue. Note: We
823 815 * really can't return error here because we have no good way to
824 816 * cleanup. Plus, the deregistration really shouldn't ever happen.
825 817 * So, if it does, it is an indication that something has gone
826 818 * seriously wrong. So we print a warning message and return error
827 819 * (knowing, of course, that the "old" CQ memory will be leaked)
828 820 */
829 821 status = tavor_mr_deregister(state, &mr_old, TAVOR_MR_DEREG_ALL,
830 822 sleepflag);
831 823 if (status != DDI_SUCCESS) {
832 824 TAVOR_WARNING(state, "failed to deregister old CQ memory");
833 825 /* Set "status" and "errormsg" and goto failure */
834 826 TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
835 827 "failed deregister mr (old)");
836 828 goto cqresize_fail;
837 829 }
838 830
839 831 /* Free the memory for the old CQ */
840 832 tavor_queue_free(state, &old_cqinfo);
841 833
842 834 /*
843 835 * Fill in the return arguments (if necessary). This includes the
844 836 * real new completion queue size.
845 837 */
846 838 if (actual_size != NULL) {
847 839 *actual_size = (1 << log_cq_size) - 1;
848 840 }
849 841
850 842 TAVOR_TNF_EXIT(tavor_cq_resize);
851 843 return (DDI_SUCCESS);
852 844
853 845 cqresize_fail:
854 846 TNF_PROBE_1(tavor_cq_resize_fail, TAVOR_TNF_ERROR, "",
855 847 tnf_string, msg, errormsg);
856 848 TAVOR_TNF_EXIT(tavor_cq_resize);
857 849 return (status);
858 850 }
859 851
860 852
861 853 /*
862 854 * tavor_cq_notify()
863 855 * Context: Can be called from interrupt or base context.
864 856 */
865 857 int
866 858 tavor_cq_notify(tavor_state_t *state, tavor_cqhdl_t cq,
867 859 ibt_cq_notify_flags_t flags)
868 860 {
869 861 uint_t cqnum;
870 862
871 863 TAVOR_TNF_ENTER(tavor_cq_notify);
872 864
873 865 /*
874 866 * Determine if we are trying to get the next completion or the next
875 867 * "solicited" completion. Then hit the appropriate doorbell.
876 868 *
877 869 * NOTE: Please see the comment in tavor_event.c:tavor_eq_poll
878 870 * regarding why we do not have to do an extra PIO read here, and we
879 871 * will not lose an event after writing this doorbell.
880 872 */
881 873 cqnum = cq->cq_cqnum;
882 874 if (flags == IBT_NEXT_COMPLETION) {
883 875 tavor_cq_doorbell(state, TAVOR_CQDB_NOTIFY_CQ, cqnum,
884 876 TAVOR_CQDB_DEFAULT_PARAM);
885 877
886 878 } else if (flags == IBT_NEXT_SOLICITED) {
887 879 tavor_cq_doorbell(state, TAVOR_CQDB_NOTIFY_CQ_SOLICIT,
888 880 cqnum, TAVOR_CQDB_DEFAULT_PARAM);
889 881
890 882 } else {
891 883 TNF_PROBE_1(tavor_cq_notify_invflags_fail, TAVOR_TNF_ERROR, "",
892 884 tnf_int, flags, flags);
893 885 TAVOR_TNF_EXIT(tavor_cq_notify);
894 886 return (IBT_CQ_NOTIFY_TYPE_INVALID);
895 887 }
896 888
897 889 TAVOR_TNF_EXIT(tavor_cq_notify);
898 890 return (DDI_SUCCESS);
899 891 }
900 892
901 893
/*
 * tavor_cq_poll()
 *    Context: Can be called from interrupt or base context.
 *
 *    Polls up to "num_wc" work completions from the given CQ into the
 *    "wc_p" array.  For each entry actually consumed, ownership of the
 *    CQE is handed back to hardware and (at the end) the CQ doorbell is
 *    rung once to advance the hardware's view of the consumer index.
 *    On return "*num_polled" (if non-NULL) holds the number of
 *    completions copied out; IBT_CQ_EMPTY is returned when none were
 *    available.
 */
int
tavor_cq_poll(tavor_state_t *state, tavor_cqhdl_t cq, ibt_wc_t *wc_p,
    uint_t num_wc, uint_t *num_polled)
{
	tavor_hw_cqe_t	*cqe;
	uint32_t	cons_indx, wrap_around_mask;
	uint32_t	polled_cnt, num_to_increment;
	int		status;

	TAVOR_TNF_ENTER(tavor_cq_poll);

	/*
	 * Check for user-mappable CQ memory.  Note:  We do not allow kernel
	 * clients to poll CQ memory that is accessible directly by the user.
	 * If the CQ memory is user accessible, then return an error.
	 */
	if (cq->cq_is_umap) {
		TNF_PROBE_0(tavor_cq_poll_inv_usrmapped_type,
		    TAVOR_TNF_ERROR, "");
		TAVOR_TNF_EXIT(tavor_cq_poll);
		return (IBT_CQ_HDL_INVALID);
	}

	mutex_enter(&cq->cq_lock);

	/* Get the consumer index */
	cons_indx = cq->cq_consindx;

	/*
	 * Calculate the wrap around mask.  Note: This operation only works
	 * because all Tavor completion queues have power-of-2 sizes
	 */
	wrap_around_mask = (cq->cq_bufsz - 1);

	/* Calculate the pointer to the first CQ entry */
	cqe = &cq->cq_buf[cons_indx];

	/* Sync the current CQE to read */
	tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);

	/*
	 * Keep pulling entries from the CQ until we find an entry owned by
	 * the hardware.  As long as there the CQE's owned by SW, process
	 * each entry by calling tavor_cq_cqe_consume() and updating the CQ
	 * consumer index.  Note:  We only update the consumer index if
	 * tavor_cq_cqe_consume() returns TAVOR_CQ_SYNC_AND_DB.  Otherwise,
	 * it indicates that we are going to "recycle" the CQE (probably
	 * because it is a error CQE and corresponds to more than one
	 * completion).
	 */
	polled_cnt = 0;
	while (TAVOR_CQE_OWNER_IS_SW(cq, cqe)) {
		status = tavor_cq_cqe_consume(state, cq, cqe,
		    &wc_p[polled_cnt++]);
		if (status == TAVOR_CQ_SYNC_AND_DB) {
			/* Reset entry to hardware ownership */
			TAVOR_CQE_OWNER_SET_HW(cq, cqe);

			/* Sync the current CQE for device */
			tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORDEV);

			/* Increment the consumer index */
			cons_indx = (cons_indx + 1) & wrap_around_mask;

			/* Update the pointer to the next CQ entry */
			cqe = &cq->cq_buf[cons_indx];

			/* Sync the next CQE to read */
			tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);
		}

		/*
		 * If we have run out of space to store work completions,
		 * then stop and return the ones we have pulled of the CQ.
		 */
		if (polled_cnt >= num_wc) {
			break;
		}
	}

	/*
	 * Now we only ring the doorbell (to update the consumer index) if
	 * we've actually consumed a CQ entry.  If we have, for example,
	 * pulled from a CQE that we are still in the process of "recycling"
	 * for error purposes, then we would not update the consumer index.
	 */
	if ((polled_cnt != 0) && (cq->cq_consindx != cons_indx)) {
		/*
		 * Post doorbell to update the consumer index.  Doorbell
		 * value indicates number of entries consumed (minus 1)
		 */
		if (cons_indx > cq->cq_consindx) {
			num_to_increment = (cons_indx - cq->cq_consindx) - 1;
		} else {
			/* Consumer index wrapped around the end of the CQ */
			num_to_increment = ((cons_indx + cq->cq_bufsz) -
			    cq->cq_consindx) - 1;
		}
		cq->cq_consindx = cons_indx;
		tavor_cq_doorbell(state, TAVOR_CQDB_INCR_CONSINDX,
		    cq->cq_cqnum, num_to_increment);

	} else if (polled_cnt == 0) {
		/*
		 * If the CQ is empty, we can try to free up some of the WRID
		 * list containers.  See tavor_wr.c for more details on this
		 * operation.
		 */
		tavor_wrid_cq_reap(cq);
	}

	mutex_exit(&cq->cq_lock);

	/* Set "num_polled" (if necessary) */
	if (num_polled != NULL) {
		*num_polled = polled_cnt;
	}

	/* Set CQ_EMPTY condition if needed, otherwise return success */
	if (polled_cnt == 0) {
		status = IBT_CQ_EMPTY;
	} else {
		status = DDI_SUCCESS;
	}

	/*
	 * Check if the system is currently panicking.  If it is, then call
	 * the Tavor interrupt service routine.  This step is necessary here
	 * because we might be in a polled I/O mode and without the call to
	 * tavor_isr() - and its subsequent calls to poll and rearm each
	 * event queue - we might overflow our EQs and render the system
	 * unable to sync/dump.
	 */
	if (ddi_in_panic() != 0) {
		(void) tavor_isr((caddr_t)state, (caddr_t)NULL);
	}

	TAVOR_TNF_EXIT(tavor_cq_poll);
	return (status);
}
1045 1037
1046 1038
/*
 * tavor_cq_handler()
 *    Context: Only called from interrupt context
 *
 *    Handles a completion event EQE: disarms the CQ via the EQ doorbell
 *    and, if the CQ handle is still valid, delivers the IBTF completion
 *    callback.  EQ overflow events are forwarded to the overflow handler
 *    and DDI_FAILURE is returned.
 */
int
tavor_cq_handler(tavor_state_t *state, tavor_eqhdl_t eq,
    tavor_hw_eqe_t *eqe)
{
	tavor_cqhdl_t		cq;
	uint_t			cqnum;
	uint_t			eqe_evttype;

	TAVOR_TNF_ENTER(tavor_cq_handler);

	eqe_evttype = TAVOR_EQE_EVTTYPE_GET(eq, eqe);

	ASSERT(eqe_evttype == TAVOR_EVT_COMPLETION ||
	    eqe_evttype == TAVOR_EVT_EQ_OVERFLOW);

	if (eqe_evttype == TAVOR_EVT_EQ_OVERFLOW) {
		TNF_PROBE_0(tavor_cq_handler_eq_overflow_condition,
		    TAVOR_TNF_ERROR, "");
		tavor_eq_overflow_handler(state, eq, eqe);

		TAVOR_TNF_EXIT(tavor_cq_handler);
		return (DDI_FAILURE);
	}


	/* Get the CQ handle from CQ number in event descriptor */
	cqnum = TAVOR_EQE_CQNUM_GET(eq, eqe);
	cq = tavor_cqhdl_from_cqnum(state, cqnum);

	/*
	 * Post the EQ doorbell to move the CQ to the "disarmed" state.
	 * This operation is to enable subsequent CQ doorbells (e.g. those
	 * that can be rung by tavor_cq_notify() above) to rearm the CQ.
	 */
	tavor_eq_doorbell(state, TAVOR_EQDB_DISARM_CQ, eq->eq_eqnum, cqnum);

	/*
	 * If the CQ handle is NULL, this is probably an indication
	 * that the CQ has been freed already.  In which case, we
	 * should not deliver this event.
	 *
	 * We also check that the CQ number in the handle is the
	 * same as the CQ number in the event queue entry.  This
	 * extra check allows us to handle the case where a CQ was
	 * freed and then allocated again in the time it took to
	 * handle the event queue processing.  By constantly incrementing
	 * the non-constrained portion of the CQ number every time
	 * a new CQ is allocated, we mitigate (somewhat) the chance
	 * that a stale event could be passed to the client's CQ
	 * handler.
	 *
	 * Lastly, we check if "ts_ibtfpriv" is NULL.  If it is then it
	 * means that we've have either received this event before we
	 * finished attaching to the IBTF or we've received it while we
	 * are in the process of detaching.
	 */
	if ((cq != NULL) && (cq->cq_cqnum == cqnum) &&
	    (state->ts_ibtfpriv != NULL)) {
		TAVOR_DO_IBTF_CQ_CALLB(state, cq);
	} else {
		/*
		 * NOTE(review): both probe arguments log the EQE's cqnum;
		 * "hdl_cqnum" presumably meant cq->cq_cqnum (but cq may be
		 * NULL on this path) -- confirm intent.
		 */
		TNF_PROBE_2(tavor_cq_handler_dropped_event,
		    TAVOR_TNF_ERROR, "", tnf_uint, ev_cqnum, cqnum,
		    tnf_uint, hdl_cqnum, cqnum);
	}

	TAVOR_TNF_EXIT(tavor_cq_handler);
	return (DDI_SUCCESS);
}
1119 1111
1120 1112
/*
 * tavor_cq_err_handler()
 *    Context: Only called from interrupt context
 *
 *    Handles a CQ error event EQE by delivering an IBT_ERROR_CQ async
 *    event to the IBTF (when the CQ handle is still valid).  EQ overflow
 *    events are forwarded to the overflow handler and DDI_FAILURE is
 *    returned.
 */
int
tavor_cq_err_handler(tavor_state_t *state, tavor_eqhdl_t eq,
    tavor_hw_eqe_t *eqe)
{
	tavor_cqhdl_t		cq;
	uint_t			cqnum;
	ibc_async_event_t	event;
	ibt_async_code_t	type;
	uint_t			eqe_evttype;

	TAVOR_TNF_ENTER(tavor_cq_err_handler);

	eqe_evttype = TAVOR_EQE_EVTTYPE_GET(eq, eqe);

	ASSERT(eqe_evttype == TAVOR_EVT_CQ_ERRORS ||
	    eqe_evttype == TAVOR_EVT_EQ_OVERFLOW);

	if (eqe_evttype == TAVOR_EVT_EQ_OVERFLOW) {
		TNF_PROBE_0(tavor_cq_err_handler_eq_overflow_condition,
		    TAVOR_TNF_ERROR, "");
		tavor_eq_overflow_handler(state, eq, eqe);

		TAVOR_TNF_EXIT(tavor_cq_err_handler);
		return (DDI_FAILURE);
	}

	/* cmn_err(CE_CONT, "CQ Error handler\n"); */

	/* Get the CQ handle from CQ number in event descriptor */
	cqnum = TAVOR_EQE_CQNUM_GET(eq, eqe);
	cq = tavor_cqhdl_from_cqnum(state, cqnum);

	/*
	 * If the CQ handle is NULL, this is probably an indication
	 * that the CQ has been freed already.  In which case, we
	 * should not deliver this event.
	 *
	 * We also check that the CQ number in the handle is the
	 * same as the CQ number in the event queue entry.  This
	 * extra check allows us to handle the case where a CQ was
	 * freed and then allocated again in the time it took to
	 * handle the event queue processing.  By constantly incrementing
	 * the non-constrained portion of the CQ number every time
	 * a new CQ is allocated, we mitigate (somewhat) the chance
	 * that a stale event could be passed to the client's CQ
	 * handler.
	 *
	 * And then we check if "ts_ibtfpriv" is NULL.  If it is then it
	 * means that we've have either received this event before we
	 * finished attaching to the IBTF or we've received it while we
	 * are in the process of detaching.
	 */
	if ((cq != NULL) && (cq->cq_cqnum == cqnum) &&
	    (state->ts_ibtfpriv != NULL)) {
		event.ev_cq_hdl = (ibt_cq_hdl_t)cq->cq_hdlrarg;
		type		= IBT_ERROR_CQ;

		TAVOR_DO_IBTF_ASYNC_CALLB(state, type, &event);
	} else {
		/*
		 * NOTE(review): as in tavor_cq_handler(), both probe
		 * arguments log the EQE's cqnum -- confirm intent.
		 */
		TNF_PROBE_2(tavor_cq_err_handler_dropped_event,
		    TAVOR_TNF_ERROR, "", tnf_uint, ev_cqnum, cqnum,
		    tnf_uint, hdl_cqnum, cqnum);
	}

	TAVOR_TNF_EXIT(tavor_cq_err_handler);
	return (DDI_SUCCESS);
}
1192 1184
1193 1185
1194 1186 /*
1195 1187 * tavor_cq_refcnt_inc()
1196 1188 * Context: Can be called from interrupt or base context.
1197 1189 */
1198 1190 int
1199 1191 tavor_cq_refcnt_inc(tavor_cqhdl_t cq, uint_t is_special)
1200 1192 {
1201 1193 /*
1202 1194 * Increment the completion queue's reference count. Note: In order
1203 1195 * to ensure compliance with IBA C11-15, we must ensure that a given
1204 1196 * CQ is not used for both special (SMI/GSI) QP and non-special QP.
1205 1197 * This is accomplished here by keeping track of how the referenced
1206 1198 * CQ is being used.
1207 1199 */
1208 1200 mutex_enter(&cq->cq_lock);
1209 1201 TNF_PROBE_1_DEBUG(tavor_cq_refcnt_inc, TAVOR_TNF_TRACE, "",
1210 1202 tnf_uint, refcnt, cq->cq_refcnt);
1211 1203 if (cq->cq_refcnt == 0) {
1212 1204 cq->cq_is_special = is_special;
1213 1205 } else {
1214 1206 if (cq->cq_is_special != is_special) {
1215 1207 mutex_exit(&cq->cq_lock);
1216 1208 return (DDI_FAILURE);
1217 1209 }
1218 1210 }
1219 1211 cq->cq_refcnt++;
1220 1212 mutex_exit(&cq->cq_lock);
1221 1213 return (DDI_SUCCESS);
1222 1214 }
1223 1215
1224 1216
1225 1217 /*
1226 1218 * tavor_cq_refcnt_dec()
1227 1219 * Context: Can be called from interrupt or base context.
1228 1220 */
1229 1221 void
1230 1222 tavor_cq_refcnt_dec(tavor_cqhdl_t cq)
1231 1223 {
1232 1224 /* Decrement the completion queue's reference count */
1233 1225 mutex_enter(&cq->cq_lock);
1234 1226 cq->cq_refcnt--;
1235 1227 TNF_PROBE_1_DEBUG(tavor_cq_refcnt_dec, TAVOR_TNF_TRACE, "",
1236 1228 tnf_uint, refcnt, cq->cq_refcnt);
1237 1229 mutex_exit(&cq->cq_lock);
1238 1230 }
1239 1231
1240 1232
1241 1233 /*
1242 1234 * tavor_cq_doorbell()
1243 1235 * Context: Can be called from interrupt or base context.
1244 1236 */
1245 1237 static void
1246 1238 tavor_cq_doorbell(tavor_state_t *state, uint32_t cq_cmd, uint32_t cqn,
1247 1239 uint32_t cq_param)
1248 1240 {
1249 1241 uint64_t doorbell = 0;
1250 1242
1251 1243 /* Build the doorbell from the parameters */
1252 1244 doorbell = ((uint64_t)cq_cmd << TAVOR_CQDB_CMD_SHIFT) |
1253 1245 ((uint64_t)cqn << TAVOR_CQDB_CQN_SHIFT) | cq_param;
1254 1246
1255 1247 TNF_PROBE_1_DEBUG(tavor_cq_doorbell, TAVOR_TNF_TRACE, "",
1256 1248 tnf_ulong, doorbell, doorbell);
1257 1249
1258 1250 /* Write the doorbell to UAR */
1259 1251 TAVOR_UAR_DOORBELL(state, (uint64_t *)&state->ts_uar->cq,
1260 1252 doorbell);
1261 1253 }
1262 1254
1263 1255
1264 1256 /*
1265 1257 * tavor_cqhdl_from_cqnum()
1266 1258 * Context: Can be called from interrupt or base context.
1267 1259 *
1268 1260 * This routine is important because changing the unconstrained
1269 1261 * portion of the CQ number is critical to the detection of a
1270 1262 * potential race condition in the CQ handler code (i.e. the case
1271 1263 * where a CQ is freed and alloc'd again before an event for the
1272 1264 * "old" CQ can be handled).
1273 1265 *
1274 1266 * While this is not a perfect solution (not sure that one exists)
1275 1267 * it does help to mitigate the chance that this race condition will
1276 1268 * cause us to deliver a "stale" event to the new CQ owner. Note:
1277 1269 * this solution does not scale well because the number of constrained
1278 1270 * bits increases (and, hence, the number of unconstrained bits
1279 1271 * decreases) as the number of supported CQs grows. For small and
1280 1272 * intermediate values, it should hopefully provide sufficient
1281 1273 * protection.
1282 1274 */
1283 1275 tavor_cqhdl_t
1284 1276 tavor_cqhdl_from_cqnum(tavor_state_t *state, uint_t cqnum)
1285 1277 {
1286 1278 uint_t cqindx, cqmask;
1287 1279
1288 1280 /* Calculate the CQ table index from the cqnum */
1289 1281 cqmask = (1 << state->ts_cfg_profile->cp_log_num_cq) - 1;
1290 1282 cqindx = cqnum & cqmask;
1291 1283 return (state->ts_cqhdl[cqindx]);
1292 1284 }
1293 1285
1294 1286
/*
 * tavor_cq_cqe_consume()
 *    Context: Can be called from interrupt or base context.
 *
 *    Translates one successful CQE into an ibt_wc_t work completion
 *    (work request ID, completion type, flags, and the various CQE
 *    fields).  Error CQEs are handed off to tavor_cq_errcqe_consume(),
 *    whose return value (consume vs. recycle) is propagated back to
 *    the caller.
 */
static int
tavor_cq_cqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
    tavor_hw_cqe_t *cqe, ibt_wc_t *wc)
{
	uint_t		flags, type, opcode, qpnum, qp1_indx;
	int		status;

	TAVOR_TNF_ENTER(tavor_cq_cqe_consume);

	/*
	 * Determine if this is an "error" CQE by examining "opcode".  If it
	 * is an error CQE, then call tavor_cq_errcqe_consume() and return
	 * whatever status it returns.  Otherwise, this is a successful
	 * completion.
	 */
	opcode = TAVOR_CQE_OPCODE_GET(cq, cqe);
	if ((opcode == TAVOR_CQE_SEND_ERR_OPCODE) ||
	    (opcode == TAVOR_CQE_RECV_ERR_OPCODE)) {
		status = tavor_cq_errcqe_consume(state, cq, cqe, wc);
		TAVOR_TNF_EXIT(tavor_cq_cqe_consume);
		return (status);
	}

	/*
	 * Fetch the Work Request ID using the information in the CQE.
	 * See tavor_wr.c for more details.
	 */
	wc->wc_id = tavor_wrid_get_entry(cq, cqe, NULL);

	/*
	 * Parse the CQE opcode to determine completion type.  This will set
	 * not only the type of the completion, but also any flags that might
	 * be associated with it (e.g. whether immediate data is present).
	 */
	flags = IBT_WC_NO_FLAGS;
	if (TAVOR_CQE_SENDRECV_GET(cq, cqe) != TAVOR_COMPLETION_RECV) {

		/* Send CQE */
		switch (opcode) {
		case TAVOR_CQE_SND_RDMAWR_IMM:
			flags |= IBT_WC_IMMED_DATA_PRESENT;
			/* FALLTHROUGH */
		case TAVOR_CQE_SND_RDMAWR:
			type = IBT_WRC_RDMAW;
			break;

		case TAVOR_CQE_SND_SEND_IMM:
			flags |= IBT_WC_IMMED_DATA_PRESENT;
			/* FALLTHROUGH */
		case TAVOR_CQE_SND_SEND:
			type = IBT_WRC_SEND;
			break;

		case TAVOR_CQE_SND_RDMARD:
			type = IBT_WRC_RDMAR;
			break;

		case TAVOR_CQE_SND_ATOMIC_CS:
			type = IBT_WRC_CSWAP;
			break;

		case TAVOR_CQE_SND_ATOMIC_FA:
			type = IBT_WRC_FADD;
			break;

		case TAVOR_CQE_SND_BIND_MW:
			type = IBT_WRC_BIND;
			break;

		default:
			/*
			 * Unrecognized send opcode: report a local QP
			 * operation error but still consume the entry.
			 */
			TAVOR_WARNING(state, "unknown send CQE type");
			wc->wc_status = IBT_WC_LOCAL_QP_OP_ERR;
			TNF_PROBE_1(tavor_cq_cqe_consume_unknown_send_type,
			    TAVOR_TNF_ERROR, "", tnf_uint, opcode, opcode);
			TAVOR_TNF_EXIT(tavor_cq_cqe_consume);
			return (TAVOR_CQ_SYNC_AND_DB);
		}
	} else {

		/* Receive CQE */
		switch (opcode & 0x1F) {
		case TAVOR_CQE_RCV_RECV_IMM:
			/* FALLTHROUGH */
		case TAVOR_CQE_RCV_RECV_IMM2:
			/*
			 * Note:  According to the Tavor PRM, all QP1 recv
			 * completions look like the result of a Send with
			 * Immediate.  They are not, however, (MADs are Send
			 * Only) so we need to check the QP number and set
			 * the flag only if it is non-QP1.
			 */
			qpnum = TAVOR_CQE_QPNUM_GET(cq, cqe);
			qp1_indx = state->ts_spec_qp1->tr_indx;
			if ((qpnum < qp1_indx) || (qpnum > qp1_indx + 1)) {
				flags |= IBT_WC_IMMED_DATA_PRESENT;
			}
			/* FALLTHROUGH */
		case TAVOR_CQE_RCV_RECV:
			/* FALLTHROUGH */
		case TAVOR_CQE_RCV_RECV2:
			type = IBT_WRC_RECV;
			break;

		case TAVOR_CQE_RCV_RDMAWR_IMM:
			/* FALLTHROUGH */
		case TAVOR_CQE_RCV_RDMAWR_IMM2:
			flags |= IBT_WC_IMMED_DATA_PRESENT;
			type = IBT_WRC_RECV_RDMAWI;
			break;

		default:
			/*
			 * Unrecognized receive opcode: report a local QP
			 * operation error but still consume the entry.
			 */
			TAVOR_WARNING(state, "unknown recv CQE type");
			wc->wc_status = IBT_WC_LOCAL_QP_OP_ERR;
			TNF_PROBE_1(tavor_cq_cqe_consume_unknown_rcv_type,
			    TAVOR_TNF_ERROR, "", tnf_uint, opcode, opcode);
			TAVOR_TNF_EXIT(tavor_cq_cqe_consume);
			return (TAVOR_CQ_SYNC_AND_DB);
		}
	}
	wc->wc_type = type;

	/*
	 * Check for GRH, update the flags, then fill in "wc_flags" field
	 * in the work completion
	 */
	if (TAVOR_CQE_GRH_GET(cq, cqe) != 0) {
		flags |= IBT_WC_GRH_PRESENT;
	}
	wc->wc_flags = flags;

	/* If we got here, completion status must be success */
	wc->wc_status = IBT_WC_SUCCESS;

	/*
	 * Parse the remaining contents of the CQE into the work completion.
	 * This means filling in SL, QP number, SLID, immediate data, etc.
	 * Note:  Not all of these fields are valid in a given completion.
	 * Many of them depend on the actual type of completion.  So we fill
	 * in all of the fields and leave it up to the IBTF and consumer to
	 * sort out which are valid based on their context.
	 */
	wc->wc_sl	  = TAVOR_CQE_SL_GET(cq, cqe);
	wc->wc_immed_data = TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cq, cqe);
	wc->wc_qpn	  = TAVOR_CQE_DQPN_GET(cq, cqe);
	wc->wc_res_hash	  = 0;
	wc->wc_slid	  = TAVOR_CQE_DLID_GET(cq, cqe);
	wc->wc_ethertype  = (wc->wc_immed_data & 0xFFFF);
	wc->wc_pkey_ix	  = (wc->wc_immed_data >> 16);

	/*
	 * Depending on whether the completion was a receive or a send
	 * completion, fill in "bytes transferred" as appropriate.  Also,
	 * if necessary, fill in the "path bits" field.
	 */
	if (TAVOR_CQE_SENDRECV_GET(cq, cqe) == TAVOR_COMPLETION_RECV) {
		wc->wc_path_bits = TAVOR_CQE_PATHBITS_GET(cq, cqe);
		wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cq, cqe);

	} else if ((wc->wc_type == IBT_WRC_RDMAR) ||
	    (wc->wc_type == IBT_WRC_CSWAP) || (wc->wc_type == IBT_WRC_FADD)) {
		wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cq, cqe);
	}

	TAVOR_TNF_EXIT(tavor_cq_cqe_consume);
	return (TAVOR_CQ_SYNC_AND_DB);
}
1465 1457
1466 1458
/*
 * tavor_cq_errcqe_consume()
 *    Context: Can be called from interrupt or base context.
 *
 *    Translates an error CQE into an ibt_wc_t completion status.  A
 *    single error CQE can represent multiple flushed work requests, so
 *    this routine either consumes the entry (TAVOR_CQ_SYNC_AND_DB) or
 *    rewrites it in place for reuse by the next poll
 *    (TAVOR_CQ_RECYCLE_ENTRY).
 */
static int
tavor_cq_errcqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
    tavor_hw_cqe_t *cqe, ibt_wc_t *wc)
{
	uint64_t		next_wqeaddr;
	uint32_t		imm_eth_pkey_cred;
	uint_t			nextwqesize, dbd;
	uint_t			doorbell_cnt, status;
	tavor_wrid_entry_t	wre;

	TAVOR_TNF_ENTER(tavor_cq_errcqe_consume);

	/*
	 * Fetch the Work Request ID using the information in the CQE.
	 * See tavor_wr.c for more details.
	 */
	wc->wc_id = tavor_wrid_get_entry(cq, cqe, &wre);

	/*
	 * Parse the CQE opcode to determine completion type.  We know that
	 * the CQE is an error completion, so we extract only the completion
	 * status here.
	 */
	imm_eth_pkey_cred = TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cq, cqe);
	status = imm_eth_pkey_cred >> TAVOR_CQE_ERR_STATUS_SHIFT;
	/* Map the hardware error status to its IBTF equivalent */
	switch (status) {
	case TAVOR_CQE_LOC_LEN_ERR:
		status = IBT_WC_LOCAL_LEN_ERR;
		break;

	case TAVOR_CQE_LOC_OP_ERR:
		status = IBT_WC_LOCAL_QP_OP_ERR;
		break;

	case TAVOR_CQE_LOC_PROT_ERR:
		status = IBT_WC_LOCAL_PROTECT_ERR;
		break;

	case TAVOR_CQE_WR_FLUSHED_ERR:
		status = IBT_WC_WR_FLUSHED_ERR;
		break;

	case TAVOR_CQE_MW_BIND_ERR:
		status = IBT_WC_MEM_WIN_BIND_ERR;
		break;

	case TAVOR_CQE_BAD_RESPONSE_ERR:
		status = IBT_WC_BAD_RESPONSE_ERR;
		break;

	case TAVOR_CQE_LOCAL_ACCESS_ERR:
		status = IBT_WC_LOCAL_ACCESS_ERR;
		break;

	case TAVOR_CQE_REM_INV_REQ_ERR:
		status = IBT_WC_REMOTE_INVALID_REQ_ERR;
		break;

	case TAVOR_CQE_REM_ACC_ERR:
		status = IBT_WC_REMOTE_ACCESS_ERR;
		break;

	case TAVOR_CQE_REM_OP_ERR:
		status = IBT_WC_REMOTE_OP_ERR;
		break;

	case TAVOR_CQE_TRANS_TO_ERR:
		status = IBT_WC_TRANS_TIMEOUT_ERR;
		break;

	case TAVOR_CQE_RNRNAK_TO_ERR:
		status = IBT_WC_RNR_NAK_TIMEOUT_ERR;
		break;

	/*
	 * The following error codes are not supported in the Tavor driver
	 * as they relate only to Reliable Datagram completion statuses:
	 *    case TAVOR_CQE_LOCAL_RDD_VIO_ERR:
	 *    case TAVOR_CQE_REM_INV_RD_REQ_ERR:
	 *    case TAVOR_CQE_EEC_REM_ABORTED_ERR:
	 *    case TAVOR_CQE_INV_EEC_NUM_ERR:
	 *    case TAVOR_CQE_INV_EEC_STATE_ERR:
	 *    case TAVOR_CQE_LOC_EEC_ERR:
	 */

	default:
		TAVOR_WARNING(state, "unknown error CQE status");
		status = IBT_WC_LOCAL_QP_OP_ERR;
		TNF_PROBE_1(tavor_cq_errcqe_consume_unknown_status,
		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
		break;
	}
	wc->wc_status = status;

	/*
	 * Now we do all the checking that's necessary to handle completion
	 * queue entry "recycling"
	 *
	 * It is not necessary here to try to sync the WQE as we are only
	 * attempting to read from the Work Queue (and hardware does not
	 * write to it).
	 */

	/*
	 * We can get doorbell info, WQE address, size for the next WQE
	 * from the "wre" (which was filled in above in the call to the
	 * tavor_wrid_get_entry() routine)
	 */
	dbd = (wre.wr_signaled_dbd & TAVOR_WRID_ENTRY_DOORBELLED) ? 1 : 0;
	next_wqeaddr = wre.wr_wqeaddrsz;
	nextwqesize  = wre.wr_wqeaddrsz & TAVOR_WQE_NDS_MASK;

	/*
	 * Get the doorbell count from the CQE.  This indicates how many
	 * completions this one CQE represents.
	 */
	doorbell_cnt = imm_eth_pkey_cred & TAVOR_CQE_ERR_DBDCNT_MASK;

	/*
	 * Determine if we're ready to consume this CQE yet or not.  If the
	 * next WQE has size zero (i.e. no next WQE) or if the doorbell count
	 * is down to zero, then this is the last/only completion represented
	 * by the current CQE (return TAVOR_CQ_SYNC_AND_DB).  Otherwise, the
	 * current CQE needs to be recycled (see below).
	 */
	if ((nextwqesize == 0) || ((doorbell_cnt == 0) && (dbd == 1))) {
		/*
		 * Consume the CQE
		 *    Return status to indicate that doorbell and sync may be
		 *    necessary.
		 */
		TAVOR_TNF_EXIT(tavor_cq_errcqe_consume);
		return (TAVOR_CQ_SYNC_AND_DB);

	} else {
		/*
		 * Recycle the CQE for use in the next PollCQ() call
		 *    Decrement the doorbell count, modify the error status,
		 *    and update the WQE address and size (to point to the
		 *    next WQE on the chain.  Put these update entries back
		 *    into the CQE.
		 *    Despite the fact that we have updated the CQE, it is not
		 *    necessary for us to attempt to sync this entry just yet
		 *    as we have not changed the "hardware's view" of the
		 *    entry (i.e. we have not modified the "owner" bit - which
		 *    is all that the Tavor hardware really cares about).
		 */
		doorbell_cnt = doorbell_cnt - dbd;
		TAVOR_CQE_IMM_ETH_PKEY_CRED_SET(cq, cqe,
		    ((TAVOR_CQE_WR_FLUSHED_ERR << TAVOR_CQE_ERR_STATUS_SHIFT) |
		    (doorbell_cnt & TAVOR_CQE_ERR_DBDCNT_MASK)));
		TAVOR_CQE_WQEADDRSZ_SET(cq, cqe,
		    TAVOR_QP_WQEADDRSZ(next_wqeaddr, nextwqesize));

		TAVOR_TNF_EXIT(tavor_cq_errcqe_consume);
		return (TAVOR_CQ_RECYCLE_ENTRY);
	}
}
1629 1621
1630 1622
1631 1623 /*
1632 1624 * tavor_cqe_sync()
1633 1625 * Context: Can be called from interrupt or base context.
1634 1626 */
1635 1627 static void
1636 1628 tavor_cqe_sync(tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe, uint_t flag)
1637 1629 {
1638 1630 ddi_dma_handle_t dmahdl;
1639 1631 off_t offset;
1640 1632 int status;
1641 1633
1642 1634 TAVOR_TNF_ENTER(tavor_cqe_sync);
1643 1635
1644 1636 /* Determine if CQ needs to be synced or not */
1645 1637 if (cq->cq_sync == 0) {
1646 1638 TAVOR_TNF_EXIT(tavor_cqe_sync);
1647 1639 return;
1648 1640 }
1649 1641
1650 1642 /* Get the DMA handle from CQ context */
1651 1643 dmahdl = cq->cq_mrhdl->mr_bindinfo.bi_dmahdl;
1652 1644
1653 1645 /* Calculate offset of next CQE */
1654 1646 offset = (off_t)((uintptr_t)cqe - (uintptr_t)&cq->cq_buf[0]);
1655 1647 status = ddi_dma_sync(dmahdl, offset, sizeof (tavor_hw_cqe_t), flag);
1656 1648 if (status != DDI_SUCCESS) {
1657 1649 TNF_PROBE_0(tavor_cqe_sync_getnextentry_fail,
1658 1650 TAVOR_TNF_ERROR, "");
1659 1651 TAVOR_TNF_EXIT(tavor_cqe_sync);
1660 1652 return;
1661 1653 }
1662 1654
1663 1655 TAVOR_TNF_EXIT(tavor_cqe_sync);
1664 1656 }
1665 1657
1666 1658
1667 1659 /*
1668 1660 * tavor_cq_resize_helper()
1669 1661 * Context: Can be called only from user or kernel context.
1670 1662 */
1671 1663 static void
1672 1664 tavor_cq_resize_helper(tavor_cqhdl_t cq, tavor_hw_cqe_t *new_cqbuf,
1673 1665 uint32_t old_cons_indx, uint32_t num_newcqe)
1674 1666 {
1675 1667 tavor_hw_cqe_t *old_cqe, *new_cqe;
1676 1668 uint32_t new_cons_indx, wrap_around_mask;
1677 1669 int i;
1678 1670
1679 1671 TAVOR_TNF_ENTER(tavor_cq_resize_helper);
1680 1672
1681 1673 ASSERT(MUTEX_HELD(&cq->cq_lock));
1682 1674
1683 1675 /* Get the consumer index */
1684 1676 new_cons_indx = 0;
1685 1677
1686 1678 /*
1687 1679 * Calculate the wrap around mask. Note: This operation only works
1688 1680 * because all Tavor completion queues have power-of-2 sizes
1689 1681 */
1690 1682 wrap_around_mask = (cq->cq_bufsz - 1);
1691 1683
1692 1684 /*
1693 1685 * Calculate the pointers to the first CQ entry (in the "old" CQ)
1694 1686 * and the first CQ entry in the "new" CQ
1695 1687 */
1696 1688 old_cqe = &cq->cq_buf[old_cons_indx];
1697 1689 new_cqe = &new_cqbuf[new_cons_indx];
1698 1690
1699 1691 /* Sync entire "old" CQ for use by software (if necessary). */
1700 1692 if (cq->cq_sync) {
1701 1693 (void) ddi_dma_sync(cq->cq_mrhdl->mr_bindinfo.bi_dmahdl,
1702 1694 0, cq->cq_cqinfo.qa_size, DDI_DMA_SYNC_FORCPU);
1703 1695 }
1704 1696
1705 1697 /*
1706 1698 * Keep pulling entries from the "old" CQ until we find an entry owned
1707 1699 * by the hardware. Process each entry by copying it into the "new"
1708 1700 * CQ and updating respective indices and pointers in the "old" CQ.
1709 1701 */
1710 1702 for (i = 0; i < num_newcqe; i++) {
1711 1703
1712 1704 /* Copy this old CQE into the "new_cqe" pointer */
1713 1705 bcopy(old_cqe, new_cqe, sizeof (tavor_hw_cqe_t));
1714 1706
1715 1707 /* Increment the consumer index (for both CQs) */
1716 1708 old_cons_indx = (old_cons_indx + 1) & wrap_around_mask;
1717 1709 new_cons_indx = (new_cons_indx + 1);
1718 1710
1719 1711 /* Update the pointer to the next CQ entry */
1720 1712 old_cqe = &cq->cq_buf[old_cons_indx];
1721 1713 new_cqe = &new_cqbuf[new_cons_indx];
1722 1714 }
1723 1715
1724 1716 TAVOR_TNF_EXIT(tavor_cq_resize_helper);
1725 1717 }
1726 1718
/*
 * tavor_cq_srq_entries_flush()
 *    Context: Can be called from interrupt or base context.
 *
 * Flushes all receive completions for the given QP out of its receive
 * CQ.  Matching receive CQEs have their work requests returned to the
 * SRQ free list (via tavor_wrid_find_match_srq()); all other CQEs are
 * compacted toward the tail of the CQ, the freed slots are handed back
 * to hardware ownership, and the consumer index doorbell is rung for
 * the entries consumed.  Caller must hold the CQ lock.
 */
void
tavor_cq_srq_entries_flush(tavor_state_t *state, tavor_qphdl_t qp)
{
	tavor_cqhdl_t		cq;
	tavor_workq_hdr_t	*wqhdr;
	tavor_hw_cqe_t		*cqe;
	tavor_hw_cqe_t		*next_cqe;
	uint32_t		cons_indx, tail_cons_indx, wrap_around_mask;
	uint32_t		new_indx, check_indx, indx;
	uint32_t		num_to_increment;
	int			cqe_qpnum, cqe_type;
	int			outstanding_cqes, removed_cqes;
	int			i;

	/* Caller must already hold the receive CQ's lock */
	ASSERT(MUTEX_HELD(&qp->qp_rq_cqhdl->cq_lock));

	cq = qp->qp_rq_cqhdl;
	wqhdr = qp->qp_rq_wqhdr;

	/* Only QPs attached to an SRQ are ever flushed this way */
	ASSERT(wqhdr->wq_wrid_post != NULL);
	ASSERT(wqhdr->wq_wrid_post->wl_srq_en != 0);

	/*
	 * Check for user-mapped CQ memory.  Note:  We do not allow kernel
	 * clients to modify any userland mapping CQ.  If the CQ is
	 * user-mapped, then we simply return here, and this "flush" function
	 * becomes a NO-OP in this case.
	 */
	if (cq->cq_is_umap) {
		return;
	}

	/* Get the consumer index */
	cons_indx = cq->cq_consindx;

	/*
	 * Calculate the wrap around mask.  Note: This operation only works
	 * because all Tavor completion queues have power-of-2 sizes
	 */
	wrap_around_mask = (cq->cq_bufsz - 1);

	/* Calculate the pointer to the first CQ entry */
	cqe = &cq->cq_buf[cons_indx];

	/* Sync the current CQE to read */
	tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);

	/*
	 * Loop through the CQ looking for entries owned by software.  If an
	 * entry is owned by software then we increment an 'outstanding_cqes'
	 * count to know how many entries total we have on our CQ.  We use this
	 * value further down to know how many entries to loop through looking
	 * for our same QP number.
	 */
	outstanding_cqes = 0;
	tail_cons_indx = cons_indx;
	while (TAVOR_CQE_OWNER_IS_SW(cq, cqe)) {
		/* increment total cqes count */
		outstanding_cqes++;

		/* increment the consumer index */
		tail_cons_indx = (tail_cons_indx + 1) & wrap_around_mask;

		/* update the pointer to the next cq entry */
		cqe = &cq->cq_buf[tail_cons_indx];

		/* sync the next cqe to read */
		tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);
	}

	/*
	 * Using the 'tail_cons_indx' that was just set, we now know how many
	 * total CQEs possible there are.  Set the 'check_indx' and the
	 * 'new_indx' to the last entry identified by 'tail_cons_indx'
	 */
	check_indx = new_indx = (tail_cons_indx - 1) & wrap_around_mask;

	/*
	 * Walk backward from the last software-owned entry toward the
	 * consumer index.  'check_indx' scans every entry; 'new_indx' only
	 * advances past entries we keep, so kept entries are compacted
	 * toward the tail of the CQ.
	 */
	for (i = 0; i < outstanding_cqes; i++) {
		cqe = &cq->cq_buf[check_indx];

		/* Grab QP number from CQE */
		cqe_qpnum = TAVOR_CQE_QPNUM_GET(cq, cqe);
		cqe_type = TAVOR_CQE_SENDRECV_GET(cq, cqe);

		/*
		 * If the QP number is the same in the CQE as the QP that we
		 * have on this SRQ, then we must free up the entry off the
		 * SRQ.  We also make sure that the completion type is of the
		 * 'TAVOR_COMPLETION_RECV' type.  So any send completions on
		 * this CQ will be left as-is.  The handling of returning
		 * entries back to HW ownership happens further down.
		 */
		if (cqe_qpnum == qp->qp_qpnum &&
		    cqe_type == TAVOR_COMPLETION_RECV) {

			/* Add back to SRQ free list */
			(void) tavor_wrid_find_match_srq(wqhdr->wq_wrid_post,
			    cq, cqe);
		} else {
			/* Do Copy */
			if (check_indx != new_indx) {
				next_cqe = &cq->cq_buf[new_indx];

				/*
				 * Copy the CQE into the "next_cqe"
				 * pointer.
				 */
				bcopy(cqe, next_cqe, sizeof (tavor_hw_cqe_t));
			}
			new_indx = (new_indx - 1) & wrap_around_mask;
		}
		/* Move index to next CQE to check */
		check_indx = (check_indx - 1) & wrap_around_mask;
	}

	/* Initialize removed cqes count */
	removed_cqes = 0;

	/* If an entry was removed */
	if (check_indx != new_indx) {

		/*
		 * Set current pointer back to the beginning consumer index.
		 * At this point, all unclaimed entries have been copied to the
		 * index specified by 'new_indx'.  This 'new_indx' will be used
		 * as the new consumer index after we mark all freed entries as
		 * having HW ownership.  We do that here.
		 */

		/*
		 * NOTE(review): this loop's "indx <= new_indx" termination
		 * assumes the [cons_indx, new_indx] range does not wrap
		 * around the end of the CQ buffer; if it wrapped, the loop
		 * would exit early — TODO confirm wraparound cannot occur
		 * here.
		 */

		/* Loop through all entries until we reach our new pointer */
		for (indx = cons_indx; indx <= new_indx;
		    indx = (indx + 1) & wrap_around_mask) {
			removed_cqes++;
			cqe = &cq->cq_buf[indx];

			/* Reset entry to hardware ownership */
			TAVOR_CQE_OWNER_SET_HW(cq, cqe);
		}
	}

	/*
	 * Update consumer index to be the 'new_indx'.  This moves it past all
	 * removed entries.  Because 'new_indx' is pointing to the last
	 * previously valid SW owned entry, we add 1 to point the cons_indx to
	 * the first HW owned entry.
	 */
	cons_indx = (new_indx + 1) & wrap_around_mask;

	/*
	 * Now we only ring the doorbell (to update the consumer index) if
	 * we've actually consumed a CQ entry.  If we found no QP number
	 * matches above, then we would not have removed anything.  So only if
	 * something was removed do we ring the doorbell.
	 */
	if ((removed_cqes != 0) && (cq->cq_consindx != cons_indx)) {
		/*
		 * Post doorbell to update the consumer index.  Doorbell
		 * value indicates number of entries consumed (minus 1)
		 */
		if (cons_indx > cq->cq_consindx) {
			num_to_increment = (cons_indx - cq->cq_consindx) - 1;
		} else {
			num_to_increment = ((cons_indx + cq->cq_bufsz) -
			    cq->cq_consindx) - 1;
		}
		cq->cq_consindx = cons_indx;

		tavor_cq_doorbell(state, TAVOR_CQDB_INCR_CONSINDX,
		    cq->cq_cqnum, num_to_increment);
	}
}
↓ open down ↓ |
1235 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX