8368 remove warlock leftovers from usr/src/uts
--- old/usr/src/uts/common/io/ib/adapters/tavor/tavor_wr.c
+++ new/usr/src/uts/common/io/ib/adapters/tavor/tavor_wr.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
24 24 * Use is subject to license terms.
25 25 */
26 26
27 27 /*
28 28 * tavor_wr.c
29 29 * Tavor Work Request Processing Routines
30 30 *
31 31 * Implements all the routines necessary to provide the PostSend(),
32 32 * PostRecv() and PostSRQ() verbs. Also contains all the code
33 33 * necessary to implement the Tavor WRID tracking mechanism.
34 34 */
35 35
36 36 #include <sys/types.h>
37 37 #include <sys/conf.h>
38 38 #include <sys/ddi.h>
39 39 #include <sys/sunddi.h>
40 40 #include <sys/modctl.h>
41 41 #include <sys/avl.h>
42 42
43 43 #include <sys/ib/adapters/tavor/tavor.h>
44 44
45 45 static void tavor_qp_send_doorbell(tavor_state_t *state, uint32_t nda,
46 46 uint32_t nds, uint32_t qpn, uint32_t fence, uint32_t nopcode);
47 47 #pragma inline(tavor_qp_send_doorbell)
48 48 static void tavor_qp_recv_doorbell(tavor_state_t *state, uint32_t nda,
49 49 uint32_t nds, uint32_t qpn, uint32_t credits);
50 50 #pragma inline(tavor_qp_recv_doorbell)
51 51 static uint32_t tavor_wr_get_immediate(ibt_send_wr_t *wr);
52 52 static int tavor_wr_bind_check(tavor_state_t *state, ibt_send_wr_t *wr);
53 53 static int tavor_wqe_send_build(tavor_state_t *state, tavor_qphdl_t qp,
54 54 ibt_send_wr_t *wr, uint64_t *desc, uint_t *size);
55 55 static void tavor_wqe_send_linknext(ibt_send_wr_t *curr_wr,
56 56 ibt_send_wr_t *prev_wr, uint64_t *curr_desc, uint_t curr_descsz,
57 57 uint64_t *prev_desc, tavor_sw_wqe_dbinfo_t *dbinfo, tavor_qphdl_t qp);
58 58 static int tavor_wqe_mlx_build(tavor_state_t *state, tavor_qphdl_t qp,
59 59 ibt_send_wr_t *wr, uint64_t *desc, uint_t *size);
60 60 static void tavor_wqe_mlx_linknext(ibt_send_wr_t *prev_wr, uint64_t *curr_desc,
61 61 uint_t curr_descsz, uint64_t *prev_desc, tavor_sw_wqe_dbinfo_t *dbinfo,
62 62 tavor_qphdl_t qp);
63 63 static int tavor_wqe_recv_build(tavor_state_t *state, tavor_qphdl_t qp,
64 64 ibt_recv_wr_t *wr, uint64_t *desc, uint_t *size);
65 65 static void tavor_wqe_recv_linknext(uint64_t *desc, uint_t desc_sz,
66 66 uint64_t *prev, tavor_qphdl_t qp);
67 67 static int tavor_wqe_srq_build(tavor_state_t *state, tavor_srqhdl_t srq,
68 68 ibt_recv_wr_t *wr, uint64_t *desc);
69 69 static void tavor_wqe_srq_linknext(uint64_t *desc, uint64_t *prev,
70 70 tavor_srqhdl_t srq);
71 71 static void tavor_wqe_sync(void *hdl, uint_t sync_from,
72 72 uint_t sync_to, uint_t sync_type, uint_t flag);
73 73 static tavor_wrid_entry_t *tavor_wrid_find_match(tavor_workq_hdr_t *wq,
74 74 tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe);
75 75 static void tavor_wrid_reaplist_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wq);
76 76 static tavor_workq_hdr_t *tavor_wrid_wqhdr_find(tavor_cqhdl_t cq, uint_t qpn,
77 77 uint_t send_or_recv);
78 78 static tavor_workq_hdr_t *tavor_wrid_wqhdr_create(tavor_state_t *state,
79 79 tavor_cqhdl_t cq, uint_t qpn, uint_t wq_type, uint_t create_wql);
80 80 static uint32_t tavor_wrid_get_wqeaddrsz(tavor_workq_hdr_t *wq);
81 81 static void tavor_wrid_wqhdr_add(tavor_workq_hdr_t *wqhdr,
82 82 tavor_wrid_list_hdr_t *wrid_list);
83 83 static void tavor_wrid_wqhdr_remove(tavor_workq_hdr_t *wqhdr,
84 84 tavor_wrid_list_hdr_t *wrid_list);
85 85 static tavor_workq_hdr_t *tavor_wrid_list_reap(tavor_wrid_list_hdr_t *wq);
86 86 static void tavor_wrid_wqhdr_lock_both(tavor_qphdl_t qp);
87 87 static void tavor_wrid_wqhdr_unlock_both(tavor_qphdl_t qp);
88 88 static void tavor_cq_wqhdr_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr);
89 89 static void tavor_cq_wqhdr_remove(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr);
90 90
91 91 /*
92 92 * tavor_post_send()
93 93 * Context: Can be called from interrupt or base context.
94 94 */
95 95 int
96 96 tavor_post_send(tavor_state_t *state, tavor_qphdl_t qp,
97 97 ibt_send_wr_t *wr, uint_t num_wr, uint_t *num_posted)
98 98 {
99 99 tavor_sw_wqe_dbinfo_t dbinfo;
100 100 tavor_wrid_list_hdr_t *wridlist;
101 101 tavor_wrid_entry_t *wre_last;
102 102 uint64_t *desc, *prev, *first;
103 103 uint32_t desc_sz, first_sz;
104 104 uint32_t wqeaddrsz, signaled_dbd;
105 105 uint32_t head, tail, next_tail, qsize_msk;
106 106 uint32_t sync_from, sync_to;
107 107 uint_t currindx, wrindx, numremain;
108 108 uint_t chainlen, chainbegin, posted_cnt;
109 109 uint_t maxdb = TAVOR_QP_MAXDESC_PER_DB;
110 110 int status;
111 111
112 112 TAVOR_TNF_ENTER(tavor_post_send);
113 113
114 114 /*
115 115 * Check for user-mappable QP memory. Note: We do not allow kernel
116 116 * clients to post to QP memory that is accessible directly by the
117 117 * user. If the QP memory is user accessible, then return an error.
118 118 */
119 119 if (qp->qp_is_umap) {
120 120 TNF_PROBE_0(tavor_post_send_inv_usrmapped_type,
121 121 TAVOR_TNF_ERROR, "");
122 122 TAVOR_TNF_EXIT(tavor_post_send);
123 123 return (IBT_QP_HDL_INVALID);
124 124 }
125 125
126 126 /* Initialize posted_cnt */
127 127 posted_cnt = 0;
128 128
129 129 mutex_enter(&qp->qp_lock);
130 130
131 131 /*
132 132 * Check QP state. Cannot post Send requests from the "Reset",
133 133 * "Init", or "RTR" states
134 134 */
135 135 if ((qp->qp_state == TAVOR_QP_RESET) ||
136 136 (qp->qp_state == TAVOR_QP_INIT) ||
137 137 (qp->qp_state == TAVOR_QP_RTR)) {
138 138 mutex_exit(&qp->qp_lock);
139 139 TNF_PROBE_0(tavor_post_send_inv_qpstate_fail,
140 140 TAVOR_TNF_ERROR, "");
141 141 TAVOR_TNF_EXIT(tavor_post_send);
142 142 return (IBT_QP_STATE_INVALID);
143 143 }
144 144
145 145 /* Grab the lock for the WRID list */
146 146 mutex_enter(&qp->qp_sq_wqhdr->wq_wrid_wql->wql_lock);
147 147 wridlist = qp->qp_sq_wqhdr->wq_wrid_post;
148 148
149 149 /* Save away some initial QP state */
150 150 qsize_msk = qp->qp_sq_wqhdr->wq_size - 1;
151 151 tail = qp->qp_sq_wqhdr->wq_tail;
152 152 head = qp->qp_sq_wqhdr->wq_head;
153 153
154 154 /*
155 155 * For each ibt_send_wr_t in the wr[] list passed in, parse the
156 156 * request and build a Send WQE. Note: Because we are potentially
157 157 * building a chain of WQEs, we want to link them all together.
158 158 * However, we do not want to link the first one to the previous
159 159 * WQE until the entire chain has been linked. Then in the last
160 160 * step we ring the appropriate doorbell. Note: It is possible for
161 161 * more Work Requests to be posted than the HW will support at one
162 162 * shot. If this happens, we need to be able to post and ring
163 163 * several chains here until the entire request is complete.
164 164 */
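/*
 * A minimal stand-alone sketch of the chaining policy described
 * above, assuming TAVOR_QP_MAXDESC_PER_DB is 256; MAXDB and
 * post_in_chains() are illustrative stand-ins, not driver code.
 */
#include <stdio.h>

#define	MAXDB	256	/* assumed TAVOR_QP_MAXDESC_PER_DB */

static void
post_in_chains(unsigned int num_wr)
{
	unsigned int numremain, chainlen;

	numremain = num_wr;
	while (numremain != 0) {
		/* Same computation as the driver's "chainlen" below */
		chainlen = (numremain > MAXDB) ? MAXDB : numremain;
		numremain -= chainlen;
		(void) printf("build %u WQEs, ring one doorbell\n",
		    chainlen);
	}
}

int
main(void)
{
	post_in_chains(300);	/* two chains: 256 WQEs, then 44 */
	return (0);
}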
165 165 wrindx = 0;
166 166 numremain = num_wr;
167 167 status = DDI_SUCCESS;
168 168 while ((wrindx < num_wr) && (status == DDI_SUCCESS)) {
169 169 /*
170 170 * For the first WQE on a new chain we need "prev" to point
171 171 * to the current descriptor. As we begin to process
172 172 * further, "prev" will be updated to point to the previous
173 173 * WQE on the current chain (see below).
174 174 */
175 175 prev = TAVOR_QP_SQ_ENTRY(qp, tail);
176 176
177 177 /*
178 178 * Before we begin, save the current "tail index" for later
179 179 * DMA sync
180 180 */
181 181 sync_from = tail;
182 182
183 183 /*
184 184 * Break the request up into chains that are less than or
185 185 * equal to the maximum number of WQEs that can be posted
186 186 * per doorbell ring
187 187 */
188 188 chainlen = (numremain > maxdb) ? maxdb : numremain;
189 189 numremain -= chainlen;
190 190 chainbegin = wrindx;
191 191 for (currindx = 0; currindx < chainlen; currindx++, wrindx++) {
192 192 /*
193 193 * Check for "queue full" condition. If the queue
194 194 * is already full, then no more WQEs can be posted.
195 195 * So break out, ring a doorbell (if necessary) and
196 196 * return an error
197 197 */
198 198 if (qp->qp_sq_wqhdr->wq_full != 0) {
199 199 status = IBT_QP_FULL;
200 200 TNF_PROBE_0_DEBUG(tavor_post_send_sqfull,
201 201 TAVOR_TNF_TRACE, "");
202 202 break;
203 203 }
204 204
205 205 /*
206 206 * Increment the "tail index" and check for "queue
207 207 * full" condition. If we detect that the current
208 208 * work request is going to fill the work queue, then
209 209 * we mark this condition and continue.
210 210 */
211 211 next_tail = (tail + 1) & qsize_msk;
212 212 if (next_tail == head) {
213 213 qp->qp_sq_wqhdr->wq_full = 1;
214 214 }
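/*
 * A self-contained check of the wraparound arithmetic used just
 * above, assuming a work queue size of 8 (so qsize_msk == 7).
 */
#include <assert.h>

int
main(void)
{
	unsigned int qsize_msk = 8 - 1;	/* wq_size is a power of two */
	unsigned int head = 0, tail = 7, next_tail;

	next_tail = (tail + 1) & qsize_msk;
	assert(next_tail == 0);		/* tail wraps to the start */
	assert(next_tail == head);	/* queue would now be full */
	return (0);
}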
215 215
216 216 /*
217 217 * Get the address of the location where the next
218 218 * Send WQE should be built
219 219 */
220 220 desc = TAVOR_QP_SQ_ENTRY(qp, tail);
221 221
222 222 /*
223 223 * Call tavor_wqe_send_build() to build the WQE
224 224 * at the given address. This routine uses the
225 225 * information in the ibt_send_wr_t list (wr[]) and
226 226 * returns the size of the WQE when it returns.
227 227 */
228 228 status = tavor_wqe_send_build(state, qp,
229 229 &wr[wrindx], desc, &desc_sz);
230 230 if (status != DDI_SUCCESS) {
231 231 TNF_PROBE_0(tavor_post_send_bldwqe_fail,
232 232 TAVOR_TNF_ERROR, "");
233 233 break;
234 234 }
235 235
236 236 /*
237 237 * Add a WRID entry to the WRID list. Need to
238 238 * calculate the "wqeaddrsz" and "signaled_dbd"
239 239 * values to pass to tavor_wrid_add_entry()
240 240 */
241 241 wqeaddrsz = TAVOR_QP_WQEADDRSZ((uint64_t *)(uintptr_t)
242 242 ((uint64_t)(uintptr_t)desc - qp->qp_desc_off),
243 243 desc_sz);
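/*
 * A sketch of the packing TAVOR_QP_WQEADDRSZ() is assumed to perform:
 * WQEs are 64-byte aligned, so the low 6 bits of the queue-relative
 * address can carry the descriptor size in 16-byte chunks.  The
 * helper and its field widths are assumptions, not the real macro.
 */
#include <stdint.h>

static uint32_t
wqeaddrsz_pack(uint32_t qrel_addr, uint32_t size16)
{
	return ((qrel_addr & ~0x3FU) | (size16 & 0x3FU));
}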
244 244 if ((qp->qp_sq_sigtype == TAVOR_QP_SQ_ALL_SIGNALED) ||
245 245 (wr[wrindx].wr_flags & IBT_WR_SEND_SIGNAL)) {
246 246 signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED;
247 247 } else {
248 248 signaled_dbd = 0;
249 249 }
250 250 tavor_wrid_add_entry(qp->qp_sq_wqhdr,
251 251 wr[wrindx].wr_id, wqeaddrsz, signaled_dbd);
252 252
253 253 /*
254 254 * If this is not the first descriptor on the current
255 255 * chain, then link it to the previous WQE. Otherwise,
256 256 * save the address and size of this descriptor (in
257 257 * "first" and "first_sz" respectively) and continue.
258 258 * Note: Linking a WQE to the previous one will
259 259 * depend on whether the two WQEs are from "special
260 260 * QPs" (i.e. MLX transport WQEs) or whether they are
261 261 * normal Send WQEs.
262 262 */
263 263 if (currindx != 0) {
264 264 if (qp->qp_is_special) {
265 265 tavor_wqe_mlx_linknext(&wr[wrindx - 1],
266 266 desc, desc_sz, prev, NULL, qp);
267 267 } else {
268 268 tavor_wqe_send_linknext(&wr[wrindx],
269 269 &wr[wrindx - 1], desc, desc_sz,
270 270 prev, NULL, qp);
271 271 }
272 272 prev = desc;
273 273 } else {
274 274 first = desc;
275 275 first_sz = desc_sz;
276 276 }
277 277
278 278 /*
279 279 * Update the current "tail index" and increment
280 280 * "posted_cnt"
281 281 */
282 282 tail = next_tail;
283 283 posted_cnt++;
284 284 }
285 285
286 286 /*
287 287 * If we reach here and there are one or more WQEs which have
288 288 * been successfully chained together, then we need to link
289 289 * the current chain to the previously executing chain of
290 290 * descriptors (if there is one) and ring the doorbell for the
291 291 * send work queue.
292 292 */
293 293 if (currindx != 0) {
294 294 /*
295 295 * Before we link the chain, we need to ensure that the
296 296 * "next" field on the last WQE is set to NULL (to
297 297 * indicate the end of the chain). Note: Just as it
298 298 * did above, the format for the "next" fields in a
299 299 * given WQE depends on whether the WQE is MLX
300 300 * transport or not.
301 301 */
302 302 if (qp->qp_is_special) {
303 303 tavor_wqe_mlx_linknext(&wr[chainbegin +
304 304 currindx - 1], NULL, 0, prev, NULL, qp);
305 305 } else {
306 306 tavor_wqe_send_linknext(NULL,
307 307 &wr[chainbegin + currindx - 1], NULL, 0,
308 308 prev, NULL, qp);
309 309 }
310 310
311 311 /* Save away updated "tail index" for the DMA sync */
312 312 sync_to = tail;
313 313
314 314 /* Do a DMA sync for current send WQE(s) */
315 315 tavor_wqe_sync(qp, sync_from, sync_to, TAVOR_WR_SEND,
316 316 DDI_DMA_SYNC_FORDEV);
317 317
318 318 /*
319 319 * Now link the chain to the old chain (if there was
320 320 * one). Note: still need to pay attention to whether
321 321 * the QP used MLX transport WQEs or not.
322 322 */
323 323 if (qp->qp_is_special) {
324 324 tavor_wqe_mlx_linknext(NULL, first, first_sz,
325 325 qp->qp_sq_lastwqeaddr, &dbinfo, qp);
326 326 } else {
327 327 tavor_wqe_send_linknext(&wr[chainbegin], NULL,
328 328 first, first_sz, qp->qp_sq_lastwqeaddr,
329 329 &dbinfo, qp);
330 330 }
331 331
332 332 /*
333 333 * If there was a valid previous WQE (i.e. non-NULL),
334 334 * then sync it too. This is because we have updated
335 335 * its "next" fields and we want to ensure that the
336 336 * hardware can see the changes.
337 337 */
338 338 if (qp->qp_sq_lastwqeaddr != NULL) {
339 339 sync_to = sync_from;
340 340 sync_from = (sync_from - 1) & qsize_msk;
341 341 tavor_wqe_sync(qp, sync_from, sync_to,
342 342 TAVOR_WR_SEND, DDI_DMA_SYNC_FORDEV);
343 343 }
344 344
345 345 /*
346 346 * Now if the WRID tail entry is non-NULL, then this
347 347 * represents the entry to which we are chaining the
348 348 * new entries. Since we are going to ring the
349 349 * doorbell for this WQE, we want to set its "dbd" bit.
350 350 *
351 351 * On the other hand, if the tail is NULL, even though
352 352 * we will have rung the doorbell for the previous WQE
353 353 * (for the hardware's sake) it is irrelevant to our
354 354 * purposes (for tracking WRIDs) because we know the
355 355 * request must have already completed.
356 356 */
357 357 wre_last = wridlist->wl_wre_old_tail;
358 358 if (wre_last != NULL) {
359 359 wre_last->wr_signaled_dbd |=
360 360 TAVOR_WRID_ENTRY_DOORBELLED;
361 361 }
362 362
363 363 /* Update some of the state in the QP */
364 364 qp->qp_sq_lastwqeaddr = desc;
365 365 qp->qp_sq_wqhdr->wq_tail = tail;
366 366
367 367 /* Ring the doorbell */
368 368 tavor_qp_send_doorbell(state,
369 369 (uint32_t)((uintptr_t)first - qp->qp_desc_off),
370 370 first_sz, qp->qp_qpnum, dbinfo.db_fence,
371 371 dbinfo.db_nopcode);
372 372 }
373 373 }
374 374
375 375 /*
376 376 * Update the "num_posted" return value (if necessary). Then drop
377 377 * the locks and return the status.
378 378 */
379 379 if (num_posted != NULL) {
380 380 *num_posted = posted_cnt;
381 381 }
382 382
383 383 mutex_exit(&qp->qp_sq_wqhdr->wq_wrid_wql->wql_lock);
384 384 mutex_exit(&qp->qp_lock);
385 385
386 386 TAVOR_TNF_EXIT(tavor_post_send);
387 387 return (status);
388 388 }
389 389
390 390
391 391 /*
392 392 * tavor_post_recv()
393 393 * Context: Can be called from interrupt or base context.
394 394 */
395 395 int
396 396 tavor_post_recv(tavor_state_t *state, tavor_qphdl_t qp,
397 397 ibt_recv_wr_t *wr, uint_t num_wr, uint_t *num_posted)
398 398 {
399 399 uint64_t *desc, *prev, *first;
400 400 uint32_t desc_sz, first_sz;
401 401 uint32_t wqeaddrsz, signaled_dbd;
402 402 uint32_t head, tail, next_tail, qsize_msk;
403 403 uint32_t sync_from, sync_to;
404 404 uint_t currindx, wrindx, numremain;
405 405 uint_t chainlen, posted_cnt;
406 406 uint_t maxdb = TAVOR_QP_MAXDESC_PER_DB;
407 407 int status;
408 408
409 409 TAVOR_TNF_ENTER(tavor_post_recv);
410 410
411 411 /*
412 412 * Check for user-mappable QP memory. Note: We do not allow kernel
413 413 * clients to post to QP memory that is accessible directly by the
414 414 * user. If the QP memory is user accessible, then return an error.
415 415 */
416 416 if (qp->qp_is_umap) {
417 417 TNF_PROBE_0(tavor_post_recv_inv_usrmapped_type,
418 418 TAVOR_TNF_ERROR, "");
419 419 TAVOR_TNF_EXIT(tavor_post_recv);
420 420 return (IBT_QP_HDL_INVALID);
421 421 }
422 422
423 423 /* Initialize posted_cnt */
424 424 posted_cnt = 0;
425 425
426 426 mutex_enter(&qp->qp_lock);
427 427
428 428 /*
429 429 * Check if QP is associated with an SRQ
430 430 */
431 431 if (qp->qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
432 432 mutex_exit(&qp->qp_lock);
433 433 TNF_PROBE_0(tavor_post_recv_fail_qp_on_srq,
434 434 TAVOR_TNF_ERROR, "");
435 435 TAVOR_TNF_EXIT(tavor_post_recv);
436 436 return (IBT_SRQ_IN_USE);
437 437 }
438 438
439 439 /*
440 440 * Check QP state. Cannot post Recv requests from the "Reset" state
441 441 */
442 442 if (qp->qp_state == TAVOR_QP_RESET) {
443 443 mutex_exit(&qp->qp_lock);
444 444 TNF_PROBE_0(tavor_post_recv_inv_qpstate_fail,
445 445 TAVOR_TNF_ERROR, "");
446 446 TAVOR_TNF_EXIT(tavor_post_recv);
447 447 return (IBT_QP_STATE_INVALID);
448 448 }
449 449
450 450 /* Grab the lock for the WRID list */
451 451 mutex_enter(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock);
452 452
453 453 /* Save away some initial QP state */
454 454 qsize_msk = qp->qp_rq_wqhdr->wq_size - 1;
455 455 tail = qp->qp_rq_wqhdr->wq_tail;
456 456 head = qp->qp_rq_wqhdr->wq_head;
457 457
458 458 /*
459 459 * For each ibt_recv_wr_t in the wr[] list passed in, parse the
460 460 * request and build a Recv WQE. Note: Because we are potentially
461 461 * building a chain of WQEs, we want to link them all together.
462 462 * However, we do not want to link the first one to the previous
463 463 * WQE until the entire chain has been linked. Then in the last
464 464 * step we ring the appropriate doorbell. Note: It is possible for
465 465 * more Work Requests to be posted than the HW will support at one
466 466 * shot. If this happens, we need to be able to post and ring
467 467 * several chains here until the entire request is complete.
468 468 */
469 469 wrindx = 0;
470 470 numremain = num_wr;
471 471 status = DDI_SUCCESS;
472 472 while ((wrindx < num_wr) && (status == DDI_SUCCESS)) {
473 473 /*
474 474 * For the first WQE on a new chain we need "prev" to point
475 475 * to the current descriptor. As we begin to process
476 476 * further, "prev" will be updated to point to the previous
477 477 * WQE on the current chain (see below).
478 478 */
479 479 prev = TAVOR_QP_RQ_ENTRY(qp, tail);
480 480
481 481 /*
482 482 * Before we begin, save the current "tail index" for later
483 483 * DMA sync
484 484 */
485 485 sync_from = tail;
486 486
487 487 /*
488 488 * Break the request up into chains that are less than or
489 489 * equal to the maximum number of WQEs that can be posted
490 490 * per doorbell ring
491 491 */
492 492 chainlen = (numremain > maxdb) ? maxdb : numremain;
493 493 numremain -= chainlen;
494 494 for (currindx = 0; currindx < chainlen; currindx++, wrindx++) {
495 495 /*
496 496 * Check for "queue full" condition. If the queue
497 497 * is already full, then no more WQEs can be posted.
498 498 * So break out, ring a doorbell (if necessary) and
499 499 * return an error
500 500 */
501 501 if (qp->qp_rq_wqhdr->wq_full != 0) {
502 502 status = IBT_QP_FULL;
503 503 TNF_PROBE_0_DEBUG(tavor_post_recv_rqfull,
504 504 TAVOR_TNF_TRACE, "");
505 505 break;
506 506 }
507 507
508 508 /*
509 509 * Increment the "tail index" and check for "queue
510 510 * full" condition. If we detect that the current
511 511 * work request is going to fill the work queue, then
512 512 * we mark this condition and continue.
513 513 */
514 514 next_tail = (tail + 1) & qsize_msk;
515 515 if (next_tail == head) {
516 516 qp->qp_rq_wqhdr->wq_full = 1;
517 517 }
518 518
519 519 /*
520 520 * Get the address of the location where the next
521 521 * Recv WQE should be built
522 522 */
523 523 desc = TAVOR_QP_RQ_ENTRY(qp, tail);
524 524
525 525 /*
526 526 * Call tavor_wqe_recv_build() to build the WQE
527 527 * at the given address. This routine uses the
528 528 * information in the ibt_recv_wr_t list (wr[]) and
529 529 * returns the size of the WQE when it returns.
530 530 */
531 531 status = tavor_wqe_recv_build(state, qp, &wr[wrindx],
532 532 desc, &desc_sz);
533 533 if (status != DDI_SUCCESS) {
534 534 TNF_PROBE_0(tavor_post_recv_bldwqe_fail,
535 535 TAVOR_TNF_ERROR, "");
536 536 break;
537 537 }
538 538
539 539 /*
540 540 * Add a WRID entry to the WRID list. Need to
541 541 * calculate the "wqeaddrsz" and "signaled_dbd"
542 542 * values to pass to tavor_wrid_add_entry(). Note:
543 543 * all Recv WQEs are essentially "signaled" and
544 544 * "doorbelled" (since Tavor HW requires all
545 545 * Recv WQEs to have their "DBD" bits set).
546 546 */
547 547 wqeaddrsz = TAVOR_QP_WQEADDRSZ((uint64_t *)(uintptr_t)
548 548 ((uint64_t)(uintptr_t)desc - qp->qp_desc_off),
549 549 desc_sz);
550 550 signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED |
551 551 TAVOR_WRID_ENTRY_DOORBELLED;
552 552 tavor_wrid_add_entry(qp->qp_rq_wqhdr,
553 553 wr[wrindx].wr_id, wqeaddrsz, signaled_dbd);
554 554
555 555 /*
556 556 * If this is not the first descriptor on the current
557 557 * chain, then link it to the previous WQE. Otherwise,
558 558 * save the address and size of this descriptor (in
559 559 * "first" and "first_sz" respectively) and continue.
560 560 */
561 561 if (currindx != 0) {
562 562 tavor_wqe_recv_linknext(desc, desc_sz, prev,
563 563 qp);
564 564 prev = desc;
565 565 } else {
566 566 first = desc;
567 567 first_sz = desc_sz;
568 568 }
569 569
570 570 /*
571 571 * Update the current "tail index" and increment
572 572 * "posted_cnt"
573 573 */
574 574 tail = next_tail;
575 575 posted_cnt++;
576 576 }
577 577
578 578 /*
579 579 * If we reach here and there are one or more WQEs which have
580 580 * been successfully chained together, then we need to link
581 581 * the current chain to the previously executing chain of
582 582 * descriptors (if there is one) and ring the doorbell for the
583 583 * recv work queue.
584 584 */
585 585 if (currindx != 0) {
586 586 /*
587 587 * Before we link the chain, we need to ensure that the
588 588 * "next" field on the last WQE is set to NULL (to
589 589 * indicate the end of the chain).
590 590 */
591 591 tavor_wqe_recv_linknext(NULL, 0, prev, qp);
592 592
593 593 /* Save away updated "tail index" for the DMA sync */
594 594 sync_to = tail;
595 595
596 596 /* Do a DMA sync for current recv WQE(s) */
597 597 tavor_wqe_sync(qp, sync_from, sync_to, TAVOR_WR_RECV,
598 598 DDI_DMA_SYNC_FORDEV);
599 599
600 600 /*
601 601 * Now link the chain to the old chain (if there was
602 602 * one).
603 603 */
604 604 tavor_wqe_recv_linknext(first, first_sz,
605 605 qp->qp_rq_lastwqeaddr, qp);
606 606
607 607 /*
608 608 * If there was a valid previous WQE (i.e. non-NULL),
609 609 * then sync it too. This is because we have updated
610 610 * its "next" fields and we want to ensure that the
611 611 * hardware can see the changes.
612 612 */
613 613 if (qp->qp_rq_lastwqeaddr != NULL) {
614 614 sync_to = sync_from;
615 615 sync_from = (sync_from - 1) & qsize_msk;
616 616 tavor_wqe_sync(qp, sync_from, sync_to,
617 617 TAVOR_WR_RECV, DDI_DMA_SYNC_FORDEV);
618 618 }
619 619
620 620 /* Update some of the state in the QP */
621 621 qp->qp_rq_lastwqeaddr = desc;
622 622 qp->qp_rq_wqhdr->wq_tail = tail;
623 623
624 624 /* Ring the doorbell */
625 625 tavor_qp_recv_doorbell(state,
626 626 (uint32_t)((uintptr_t)first - qp->qp_desc_off),
627 627 first_sz, qp->qp_qpnum, (chainlen % maxdb));
628 628 }
629 629 }
630 630
631 631 /*
632 632 * Update the "num_posted" return value (if necessary). Then drop
633 633 * the locks and return the status.
634 634 */
635 635 if (num_posted != NULL) {
636 636 *num_posted = posted_cnt;
637 637 }
638 638
639 639 mutex_exit(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock);
640 640 mutex_exit(&qp->qp_lock);
641 641
642 642 TAVOR_TNF_EXIT(tavor_post_recv);
643 643 return (status);
644 644 }
645 645
646 646 /*
647 647 * tavor_post_srq()
648 648 * Context: Can be called from interrupt or base context.
649 649 */
650 650 int
651 651 tavor_post_srq(tavor_state_t *state, tavor_srqhdl_t srq,
652 652 ibt_recv_wr_t *wr, uint_t num_wr, uint_t *num_posted)
653 653 {
654 654 uint64_t *desc, *prev, *first, *last_wqe_addr;
655 655 uint32_t signaled_dbd;
656 656 uint32_t sync_indx;
657 657 uint_t currindx, wrindx, numremain;
658 658 uint_t chainlen, posted_cnt;
659 659 uint_t maxdb = TAVOR_QP_MAXDESC_PER_DB;
660 660 int status;
661 661
662 662 TAVOR_TNF_ENTER(tavor_post_srq);
663 663
664 664 /*
665 665 * Check for user-mappable SRQ memory. Note: We do not allow kernel
666 666 * clients to post to SRQ memory that is accessible directly by the
667 667 * user. If the SRQ memory is user accessible, then return an error.
668 668 */
669 669 if (srq->srq_is_umap) {
670 670 TNF_PROBE_0(tavor_post_srq_inv_usrmapped_type,
671 671 TAVOR_TNF_ERROR, "");
672 672 TAVOR_TNF_EXIT(tavor_post_srq);
673 673 return (IBT_SRQ_HDL_INVALID);
674 674 }
675 675
676 676 /* Initialize posted_cnt */
677 677 posted_cnt = 0;
678 678
679 679 mutex_enter(&srq->srq_lock);
680 680
681 681 /*
682 682 * Check SRQ state. Cannot post Recv requests when SRQ is in error
683 683 */
684 684 if (srq->srq_state == TAVOR_SRQ_STATE_ERROR) {
685 685 mutex_exit(&srq->srq_lock);
686 686 TNF_PROBE_0(tavor_post_srq_inv_srqstate_fail,
687 687 TAVOR_TNF_ERROR, "");
688 688 TAVOR_TNF_EXIT(tavor_post_srq);
689 689 return (IBT_QP_STATE_INVALID);
690 690 }
691 691
692 692 /* Grab the lock for the WRID list */
693 693 mutex_enter(&srq->srq_wrid_wql->wql_lock);
694 694
695 695 /*
696 696 * For each ibt_recv_wr_t in the wr[] list passed in, parse the
697 697 * request and build a Recv WQE. Note: Because we are potentially
698 698 * building a chain of WQEs, we want to link them all together.
699 699 * However, we do not want to link the first one to the previous
700 700 * WQE until the entire chain has been linked. Then in the last
701 701 * step we ring the appropriate doorbell. Note: It is possible for
702 702 * more Work Requests to be posted than the HW will support at one
703 703 * shot. If this happens, we need to be able to post and ring
704 704 * several chains here until the entire request is complete.
705 705 */
706 706 wrindx = 0;
707 707 numremain = num_wr;
708 708 status = DDI_SUCCESS;
709 709 while ((wrindx < num_wr) && (status == DDI_SUCCESS)) {
710 710 /*
711 711 * For the first WQE on a new chain we need "prev" to point
712 712 * to the current descriptor. As we begin to process
713 713 * further, "prev" will be updated to point to the previous
714 714 * WQE on the current chain (see below).
715 715 */
716 716 if (srq->srq_wq_lastwqeindx == -1) {
717 717 prev = NULL;
718 718 } else {
719 719 prev = TAVOR_SRQ_WQE_ADDR(srq, srq->srq_wq_lastwqeindx);
720 720 }
721 721
722 722 /*
723 723 * Break the request up into chains that are less than or
724 724 * equal to the maximum number of WQEs that can be posted
725 725 * per doorbell ring
726 726 */
727 727 chainlen = (numremain > maxdb) ? maxdb : numremain;
728 728 numremain -= chainlen;
729 729 for (currindx = 0; currindx < chainlen; currindx++, wrindx++) {
730 730
731 731 /*
732 732 * Check for "queue full" condition. If the queue
733 733 * is already full, then no more WQEs can be posted.
734 734 * So break out, ring a doorbell (if necessary) and
735 735 * return an error
736 736 */
737 737 if (srq->srq_wridlist->wl_free_list_indx == -1) {
738 738 status = IBT_QP_FULL;
739 739 TNF_PROBE_0_DEBUG(tavor_post_srq_wqfull,
740 740 TAVOR_TNF_TRACE, "");
741 741 break;
742 742 }
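/*
 * A hedged sketch of the "-1 means empty" free-list convention tested
 * above, assuming free WQEs are linked together by index.  The
 * srq_freelist_t type and freelist_get() helper are hypothetical.
 */
#include <stdint.h>

typedef struct {
	int32_t	*fl_next;	/* per-entry index of next free WQE */
	int32_t	fl_head;	/* head of free list; -1 means empty */
} srq_freelist_t;

static int32_t
freelist_get(srq_freelist_t *fl)
{
	int32_t indx = fl->fl_head;

	if (indx != -1)
		fl->fl_head = fl->fl_next[indx];
	return (indx);		/* -1 corresponds to IBT_QP_FULL above */
}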
743 743
744 744 /*
745 745 * Get the address of the location where the next
746 746 * Recv WQE should be built
747 747 */
748 748 desc = TAVOR_SRQ_WQE_ADDR(srq,
749 749 srq->srq_wridlist->wl_free_list_indx);
750 750
751 751 /*
752 752 * Add a WRID entry to the WRID list. Need to
753 753 * set the "signaled_dbd" values to pass to
754 754 * tavor_wrid_add_entry(). Note: all Recv WQEs are
755 755 * essentially "signaled"
756 756 *
757 757 * The 'size' is stored at srq_alloc time, in the
758 758 * srq_wq_stride. This is a constant value required
759 759 * for SRQ.
760 760 */
761 761 signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED;
762 762 tavor_wrid_add_entry_srq(srq, wr[wrindx].wr_id,
763 763 signaled_dbd);
764 764
765 765 /*
766 766 * Call tavor_wqe_srq_build() to build the WQE
767 767 * at the given address. This routine uses the
768 768 * information in the ibt_recv_wr_t list (wr[]) and
769 769 * returns the size of the WQE when it returns.
770 770 */
771 771 status = tavor_wqe_srq_build(state, srq, &wr[wrindx],
772 772 desc);
773 773 if (status != DDI_SUCCESS) {
774 774 TNF_PROBE_0(tavor_post_recv_bldwqe_fail,
775 775 TAVOR_TNF_ERROR, "");
776 776 break;
777 777 }
778 778
779 779 /*
780 780 * If this is not the first descriptor on the current
781 781 * chain, then link it to the previous WQE. Otherwise,
782 782 * save the address of this descriptor (in "first") and
783 783 * continue.
784 784 */
785 785 if (currindx != 0) {
786 786 tavor_wqe_srq_linknext(desc, prev, srq);
787 787 sync_indx = TAVOR_SRQ_WQE_INDEX(
788 788 srq->srq_wq_buf, prev,
789 789 srq->srq_wq_log_wqesz);
790 790
791 791 /* Do a DMA sync for previous recv WQE */
792 792 tavor_wqe_sync(srq, sync_indx, sync_indx+1,
793 793 TAVOR_WR_SRQ, DDI_DMA_SYNC_FORDEV);
794 794
795 795 prev = desc;
796 796 } else {
797 797
798 798 /*
799 799 * In this case, the last WQE on the chain is
800 800 * also considered 'first'. So set prev to
801 801 * first, here.
802 802 */
803 803 first = prev = desc;
804 804 }
805 805
806 806 /*
807 807 * Increment "posted_cnt"
808 808 */
809 809 posted_cnt++;
810 810 }
811 811
812 812 /*
813 813 * If we reach here and there are one or more WQEs which have
814 814 * been successfully chained together, then we need to link
815 815 * the current chain to the previously executing chain of
816 816 * descriptors (if there is one) and ring the doorbell for the
817 817 * recv work queue.
818 818 */
819 819 if (currindx != 0) {
820 820 /*
821 821 * Before we link the chain, we need to ensure that the
822 822 * "next" field on the last WQE is set to NULL (to
823 823 * indicate the end of the chain).
824 824 */
825 825 tavor_wqe_srq_linknext(NULL, prev, srq);
826 826
827 827 sync_indx = TAVOR_SRQ_WQE_INDEX(srq->srq_wq_buf, prev,
828 828 srq->srq_wq_log_wqesz);
829 829
830 830 /* Do a DMA sync for current recv WQE */
831 831 tavor_wqe_sync(srq, sync_indx, sync_indx+1,
832 832 TAVOR_WR_SRQ, DDI_DMA_SYNC_FORDEV);
833 833
834 834 /*
835 835 * Now link the chain to the old chain (if there was
836 836 * one).
837 837 */
838 838 if (srq->srq_wq_lastwqeindx == -1) {
839 839 last_wqe_addr = NULL;
840 840 } else {
841 841 last_wqe_addr = TAVOR_SRQ_WQE_ADDR(srq,
842 842 srq->srq_wq_lastwqeindx);
843 843 }
844 844 tavor_wqe_srq_linknext(first, last_wqe_addr, srq);
845 845
846 846 /*
847 847 * If there was a valid previous WQE (i.e. valid index),
848 848 * then sync it too. This is because we have updated
849 849 * its "next" fields and we want to ensure that the
850 850 * hardware can see the changes.
851 851 */
852 852 if (srq->srq_wq_lastwqeindx != -1) {
853 853 sync_indx = srq->srq_wq_lastwqeindx;
854 854 tavor_wqe_sync(srq, sync_indx, sync_indx+1,
855 855 TAVOR_WR_SRQ, DDI_DMA_SYNC_FORDEV);
856 856 }
857 857
858 858 /* Update some of the state in the SRQ */
859 859 srq->srq_wq_lastwqeindx = TAVOR_SRQ_WQE_INDEX(
860 860 srq->srq_wq_buf, desc,
861 861 srq->srq_wq_log_wqesz);
862 862
863 863 /* Ring the doorbell */
864 864 /* SRQ needs NDS of 0 */
865 865 tavor_qp_recv_doorbell(state,
866 866 (uint32_t)((uintptr_t)first - srq->srq_desc_off),
867 867 0, srq->srq_srqnum, (chainlen % maxdb));
868 868 }
869 869 }
870 870
871 871 /*
872 872 * Update the "num_posted" return value (if necessary). Then drop
873 873 * the locks and return the status.
874 874 */
875 875 if (num_posted != NULL) {
876 876 *num_posted = posted_cnt;
877 877 }
878 878
879 879 mutex_exit(&srq->srq_wrid_wql->wql_lock);
880 880 mutex_exit(&srq->srq_lock);
881 881
882 882 TAVOR_TNF_EXIT(tavor_post_srq);
883 883 return (status);
884 884 }
885 885
886 886
887 887 /*
888 888 * tavor_qp_send_doorbell()
889 889 * Context: Can be called from interrupt or base context.
890 890 */
891 891 static void
892 892 tavor_qp_send_doorbell(tavor_state_t *state, uint32_t nda, uint32_t nds,
893 893 uint32_t qpn, uint32_t fence, uint32_t nopcode)
894 894 {
895 895 uint64_t doorbell = 0;
896 896
897 897 /* Build the doorbell from the parameters */
898 898 doorbell = (((uint64_t)nda & TAVOR_QPSNDDB_NDA_MASK) <<
899 899 TAVOR_QPSNDDB_NDA_SHIFT) |
900 900 ((uint64_t)fence << TAVOR_QPSNDDB_F_SHIFT) |
901 901 ((uint64_t)nopcode << TAVOR_QPSNDDB_NOPCODE_SHIFT) |
902 902 ((uint64_t)qpn << TAVOR_QPSNDDB_QPN_SHIFT) | nds;
903 903
904 904 TNF_PROBE_1_DEBUG(tavor_qp_send_doorbell, TAVOR_TNF_TRACE, "",
905 905 tnf_ulong, doorbell, doorbell);
906 906
907 907 /* Write the doorbell to UAR */
908 908 TAVOR_UAR_DOORBELL(state, (uint64_t *)&state->ts_uar->send,
909 909 doorbell);
910 910 }
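/*
 * A stand-alone sketch of assembling the 64-bit send doorbell.  Every
 * shift and mask below is an illustrative assumption; the real
 * constants are the TAVOR_QPSNDDB_* definitions in tavor_hw.h.
 */
#include <stdint.h>

static uint64_t
build_send_doorbell(uint32_t nda, uint32_t nds, uint32_t qpn,
    uint32_t fence, uint32_t nopcode)
{
	uint64_t db = 0;

	db |= ((uint64_t)nda & 0xFFFFFFC0) << 32;	/* 64B-aligned NDA */
	db |= (uint64_t)fence << 37;			/* assumed F bit */
	db |= (uint64_t)nopcode << 32;			/* assumed opcode */
	db |= (uint64_t)qpn << 8;			/* assumed QPN */
	db |= (nds & 0xFF);				/* size, 16B chunks */
	return (db);
}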
911 911
912 912
913 913 /*
914 914 * tavor_qp_recv_doorbell()
915 915 * Context: Can be called from interrupt or base context.
916 916 */
917 917 static void
918 918 tavor_qp_recv_doorbell(tavor_state_t *state, uint32_t nda, uint32_t nds,
919 919 uint32_t qpn, uint32_t credits)
920 920 {
921 921 uint64_t doorbell = 0;
922 922
923 923 /* Build the doorbell from the parameters */
924 924 doorbell = (((uint64_t)nda & TAVOR_QPRCVDB_NDA_MASK) <<
925 925 TAVOR_QPRCVDB_NDA_SHIFT) |
926 926 ((uint64_t)nds << TAVOR_QPRCVDB_NDS_SHIFT) |
927 927 ((uint64_t)qpn << TAVOR_QPRCVDB_QPN_SHIFT) | credits;
928 928
929 929 TNF_PROBE_1_DEBUG(tavor_qp_recv_doorbell, TAVOR_TNF_TRACE, "",
930 930 tnf_ulong, doorbell, doorbell);
931 931
932 932 /* Write the doorbell to UAR */
933 933 TAVOR_UAR_DOORBELL(state, (uint64_t *)&state->ts_uar->recv,
934 934 doorbell);
935 935 }
936 936
937 937
938 938 /*
939 939 * tavor_wqe_send_build()
940 940 * Context: Can be called from interrupt or base context.
941 941 */
942 942 static int
943 943 tavor_wqe_send_build(tavor_state_t *state, tavor_qphdl_t qp,
944 944 ibt_send_wr_t *wr, uint64_t *desc, uint_t *size)
945 945 {
946 946 tavor_hw_snd_wqe_ud_t *ud;
947 947 tavor_hw_snd_wqe_remaddr_t *rc;
948 948 tavor_hw_snd_wqe_atomic_t *at;
949 949 tavor_hw_snd_wqe_remaddr_t *uc;
950 950 tavor_hw_snd_wqe_bind_t *bn;
951 951 tavor_hw_wqe_sgl_t *ds;
952 952 ibt_wr_ds_t *sgl;
953 953 tavor_ahhdl_t ah;
954 954 uint32_t nds;
955 955 int i, num_ds, status;
956 956
957 957 TAVOR_TNF_ENTER(tavor_wqe_send_build);
958 958
959 959 ASSERT(MUTEX_HELD(&qp->qp_lock));
960 960
961 961 /* Initialize the information for the Data Segments */
962 962 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc +
963 963 sizeof (tavor_hw_snd_wqe_nextctrl_t));
964 964 nds = wr->wr_nds;
965 965 sgl = wr->wr_sgl;
966 966 num_ds = 0;
967 967
968 968 /*
969 969 * Building a Send WQE depends first and foremost on the transport
970 970 * type of Work Request (i.e. UD, RC, or UC)
971 971 */
972 972 switch (wr->wr_trans) {
973 973 case IBT_UD_SRV:
974 974 /* Ensure that work request transport type matches QP type */
975 975 if (qp->qp_serv_type != TAVOR_QP_UD) {
976 976 TNF_PROBE_0(tavor_wqe_send_build_inv_servtype_fail,
977 977 TAVOR_TNF_ERROR, "");
978 978 TAVOR_TNF_EXIT(tavor_wqe_send_build);
979 979 return (IBT_QP_SRV_TYPE_INVALID);
980 980 }
981 981
982 982 /*
983 983 * Validate the operation type. For UD requests, only the
984 984 * "Send" operation is valid
985 985 */
986 986 if (wr->wr_opcode != IBT_WRC_SEND) {
987 987 TNF_PROBE_0(tavor_wqe_send_build_inv_optype_fail,
988 988 TAVOR_TNF_ERROR, "");
989 989 TAVOR_TNF_EXIT(tavor_wqe_send_build);
990 990 return (IBT_QP_OP_TYPE_INVALID);
991 991 }
992 992
993 993 /*
994 994 * If this is a Special QP (QP0 or QP1), then we need to
995 995 * build MLX WQEs instead. So jump to tavor_wqe_mlx_build()
996 996 * and return whatever status it returns
997 997 */
998 998 if (qp->qp_is_special) {
999 999 status = tavor_wqe_mlx_build(state, qp, wr, desc, size);
1000 1000 TAVOR_TNF_EXIT(tavor_wqe_send_build);
1001 1001 return (status);
1002 1002 }
1003 1003
1004 1004 /*
1005 1005 * Otherwise, if this is a normal UD Send request, then fill
1006 1006 * all the fields in the Tavor UD header for the WQE. Note:
1007 1007 * to do this we'll need to extract some information from the
1008 1008 * Address Handle passed with the work request.
1009 1009 */
1010 1010 ud = (tavor_hw_snd_wqe_ud_t *)((uintptr_t)desc +
1011 1011 sizeof (tavor_hw_snd_wqe_nextctrl_t));
1012 1012 ah = (tavor_ahhdl_t)wr->wr.ud.udwr_dest->ud_ah;
1013 1013 if (ah == NULL) {
1014 1014 TNF_PROBE_0(tavor_wqe_send_build_invahhdl_fail,
1015 1015 TAVOR_TNF_ERROR, "");
1016 1016 TAVOR_TNF_EXIT(tavor_wqe_send_build);
1017 1017 return (IBT_AH_HDL_INVALID);
1018 1018 }
1019 1019
1020 1020 /*
1021 1021 * Build the Unreliable Datagram Segment for the WQE, using
1022 1022 * the information from the address handle and the work
1023 1023 * request.
1024 1024 */
1025 1025 mutex_enter(&ah->ah_lock);
1026 1026 TAVOR_WQE_BUILD_UD(qp, ud, ah, wr);
1027 1027 mutex_exit(&ah->ah_lock);
1028 1028
1029 1029 /* Update "ds" for filling in Data Segments (below) */
1030 1030 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)ud +
1031 1031 sizeof (tavor_hw_snd_wqe_ud_t));
1032 1032 break;
1033 1033
1034 1034 case IBT_RC_SRV:
1035 1035 /* Ensure that work request transport type matches QP type */
1036 1036 if (qp->qp_serv_type != TAVOR_QP_RC) {
1037 1037 TNF_PROBE_0(tavor_wqe_send_build_inv_servtype_fail,
1038 1038 TAVOR_TNF_ERROR, "");
1039 1039 TAVOR_TNF_EXIT(tavor_wqe_send_build);
1040 1040 return (IBT_QP_SRV_TYPE_INVALID);
1041 1041 }
1042 1042
1043 1043 /*
1044 1044 * Validate the operation type. For RC requests, we allow
1045 1045 * "Send", "RDMA Read", "RDMA Write", various "Atomic"
1046 1046 * operations, and memory window "Bind"
1047 1047 */
1048 1048 if ((wr->wr_opcode != IBT_WRC_SEND) &&
1049 1049 (wr->wr_opcode != IBT_WRC_RDMAR) &&
1050 1050 (wr->wr_opcode != IBT_WRC_RDMAW) &&
1051 1051 (wr->wr_opcode != IBT_WRC_CSWAP) &&
1052 1052 (wr->wr_opcode != IBT_WRC_FADD) &&
1053 1053 (wr->wr_opcode != IBT_WRC_BIND)) {
1054 1054 TNF_PROBE_0(tavor_wqe_send_build_inv_optype_fail,
1055 1055 TAVOR_TNF_ERROR, "");
1056 1056 TAVOR_TNF_EXIT(tavor_wqe_send_build);
1057 1057 return (IBT_QP_OP_TYPE_INVALID);
1058 1058 }
1059 1059
1060 1060 /*
1061 1061 * If this is a Send request, then all we need to do is break
1062 1062 * out here and begin the Data Segment processing below
1063 1063 */
1064 1064 if (wr->wr_opcode == IBT_WRC_SEND) {
1065 1065 break;
1066 1066 }
1067 1067
1068 1068 /*
1069 1069 * If this is an RDMA Read or RDMA Write request, then fill
1070 1070 * in the "Remote Address" header fields.
1071 1071 */
1072 1072 if ((wr->wr_opcode == IBT_WRC_RDMAR) ||
1073 1073 (wr->wr_opcode == IBT_WRC_RDMAW)) {
1074 1074 rc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1075 1075 sizeof (tavor_hw_snd_wqe_nextctrl_t));
1076 1076
1077 1077 /*
1078 1078 * Build the Remote Address Segment for the WQE, using
1079 1079 * the information from the RC work request.
1080 1080 */
1081 1081 TAVOR_WQE_BUILD_REMADDR(qp, rc, &wr->wr.rc.rcwr.rdma);
1082 1082
1083 1083 /* Update "ds" for filling in Data Segments (below) */
1084 1084 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)rc +
1085 1085 sizeof (tavor_hw_snd_wqe_remaddr_t));
1086 1086 break;
1087 1087 }
1088 1088
1089 1089 /*
1090 1090 * If this is one of the Atomic type operations (i.e.
1091 1091 * Compare-Swap or Fetch-Add), then fill in both the "Remote
1092 1092 * Address" header fields and the "Atomic" header fields.
1093 1093 */
1094 1094 if ((wr->wr_opcode == IBT_WRC_CSWAP) ||
1095 1095 (wr->wr_opcode == IBT_WRC_FADD)) {
1096 1096 rc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1097 1097 sizeof (tavor_hw_snd_wqe_nextctrl_t));
1098 1098 at = (tavor_hw_snd_wqe_atomic_t *)((uintptr_t)rc +
1099 1099 sizeof (tavor_hw_snd_wqe_remaddr_t));
1100 1100
1101 1101 /*
1102 1102 * Build the Remote Address and Atomic Segments for
1103 1103 * the WQE, using the information from the RC Atomic
1104 1104 * work request.
1105 1105 */
1106 1106 TAVOR_WQE_BUILD_RC_ATOMIC_REMADDR(qp, rc, wr);
1107 1107 TAVOR_WQE_BUILD_ATOMIC(qp, at, wr->wr.rc.rcwr.atomic);
1108 1108
1109 1109 /* Update "ds" for filling in Data Segments (below) */
1110 1110 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)at +
1111 1111 sizeof (tavor_hw_snd_wqe_atomic_t));
1112 1112
1113 1113 /*
1114 1114 * Update "nds" and "sgl" because Atomic requests have
1115 1115 * only a single Data Segment (and they are encoded
1116 1116 * somewhat differently in the work request).
1117 1117 */
1118 1118 nds = 1;
1119 1119 sgl = wr->wr_sgl;
1120 1120 break;
1121 1121 }
1122 1122
1123 1123 /*
1124 1124 * If this is memory window Bind operation, then we call the
1125 1125 * tavor_wr_bind_check() routine to validate the request and
1126 1126 * to generate the updated RKey. If this is successful, then
1127 1127 * we fill in the WQE's "Bind" header fields.
1128 1128 */
1129 1129 if (wr->wr_opcode == IBT_WRC_BIND) {
1130 1130 status = tavor_wr_bind_check(state, wr);
1131 1131 if (status != DDI_SUCCESS) {
1132 1132 TNF_PROBE_0(tavor_wqe_send_build_bind_fail,
1133 1133 TAVOR_TNF_ERROR, "");
1134 1134 TAVOR_TNF_EXIT(tavor_wqe_send_build);
1135 1135 return (status);
1136 1136 }
1137 1137
1138 1138 bn = (tavor_hw_snd_wqe_bind_t *)((uintptr_t)desc +
1139 1139 sizeof (tavor_hw_snd_wqe_nextctrl_t));
1140 1140
1141 1141 /*
1142 1142 * Build the Bind Memory Window Segments for the WQE,
1143 1143 * using the information from the RC Bind memory
1144 1144 * window work request.
1145 1145 */
1146 1146 TAVOR_WQE_BUILD_BIND(qp, bn, wr->wr.rc.rcwr.bind);
1147 1147
1148 1148 /*
1149 1149 * Update the "ds" pointer. Even though the "bind"
1150 1150 * operation requires no SGLs, this is necessary to
1151 1151 * facilitate the correct descriptor size calculations
1152 1152 * (below).
1153 1153 */
1154 1154 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)bn +
1155 1155 sizeof (tavor_hw_snd_wqe_bind_t));
1156 1156 nds = 0;
1157 1157 }
1158 1158 break;
1159 1159
1160 1160 case IBT_UC_SRV:
1161 1161 /* Ensure that work request transport type matches QP type */
1162 1162 if (qp->qp_serv_type != TAVOR_QP_UC) {
1163 1163 TNF_PROBE_0(tavor_wqe_send_build_inv_servtype_fail,
1164 1164 TAVOR_TNF_ERROR, "");
1165 1165 TAVOR_TNF_EXIT(tavor_wqe_send_build);
1166 1166 return (IBT_QP_SRV_TYPE_INVALID);
1167 1167 }
1168 1168
1169 1169 /*
1170 1170 * Validate the operation type. For UC requests, we only
1171 1171 * allow "Send", "RDMA Write", and memory window "Bind".
1172 1172 * Note: Unlike RC, UC does not allow "RDMA Read" or "Atomic"
1173 1173 * operations
1174 1174 */
1175 1175 if ((wr->wr_opcode != IBT_WRC_SEND) &&
1176 1176 (wr->wr_opcode != IBT_WRC_RDMAW) &&
1177 1177 (wr->wr_opcode != IBT_WRC_BIND)) {
1178 1178 TNF_PROBE_0(tavor_wqe_send_build_inv_optype_fail,
1179 1179 TAVOR_TNF_ERROR, "");
1180 1180 TAVOR_TNF_EXIT(tavor_wqe_send_build);
1181 1181 return (IBT_QP_OP_TYPE_INVALID);
1182 1182 }
1183 1183
1184 1184 /*
1185 1185 * If this is a Send request, then all we need to do is break
1186 1186 * out here and begin the Data Segment processing below
1187 1187 */
1188 1188 if (wr->wr_opcode == IBT_WRC_SEND) {
1189 1189 break;
1190 1190 }
1191 1191
1192 1192 /*
1193 1193 * If this is an RDMA Write request, then fill in the "Remote
1194 1194 * Address" header fields.
1195 1195 */
1196 1196 if (wr->wr_opcode == IBT_WRC_RDMAW) {
1197 1197 uc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1198 1198 sizeof (tavor_hw_snd_wqe_nextctrl_t));
1199 1199
1200 1200 /*
1201 1201 * Build the Remote Address Segment for the WQE, using
1202 1202 * the information from the UC work request.
1203 1203 */
1204 1204 TAVOR_WQE_BUILD_REMADDR(qp, uc, &wr->wr.uc.ucwr.rdma);
1205 1205
1206 1206 /* Update "ds" for filling in Data Segments (below) */
1207 1207 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)uc +
1208 1208 sizeof (tavor_hw_snd_wqe_remaddr_t));
1209 1209 break;
1210 1210 }
1211 1211
1212 1212 /*
1213 1213 * If this is memory window Bind operation, then we call the
1214 1214 * tavor_wr_bind_check() routine to validate the request and
1215 1215 * to generate the updated RKey. If this is successful, then
1216 1216 * we fill in the WQE's "Bind" header fields.
1217 1217 */
1218 1218 if (wr->wr_opcode == IBT_WRC_BIND) {
1219 1219 status = tavor_wr_bind_check(state, wr);
1220 1220 if (status != DDI_SUCCESS) {
1221 1221 TNF_PROBE_0(tavor_wqe_send_build_bind_fail,
1222 1222 TAVOR_TNF_ERROR, "");
1223 1223 TAVOR_TNF_EXIT(tavor_wqe_send_build);
1224 1224 return (status);
1225 1225 }
1226 1226
1227 1227 bn = (tavor_hw_snd_wqe_bind_t *)((uintptr_t)desc +
1228 1228 sizeof (tavor_hw_snd_wqe_nextctrl_t));
1229 1229
1230 1230 /*
1231 1231 * Build the Bind Memory Window Segments for the WQE,
1232 1232 * using the information from the UC Bind memory
1233 1233 * window work request.
1234 1234 */
1235 1235 TAVOR_WQE_BUILD_BIND(qp, bn, wr->wr.uc.ucwr.bind);
1236 1236
1237 1237 /*
1238 1238 * Update the "ds" pointer. Even though the "bind"
1239 1239 * operation requires no SGLs, this is necessary to
1240 1240 * facilitate the correct descriptor size calculations
1241 1241 * (below).
1242 1242 */
1243 1243 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)bn +
1244 1244 sizeof (tavor_hw_snd_wqe_bind_t));
1245 1245 nds = 0;
1246 1246 }
1247 1247 break;
1248 1248
1249 1249 default:
1250 1250 TNF_PROBE_0(tavor_wqe_send_build_inv_tranport_fail,
1251 1251 TAVOR_TNF_ERROR, "");
1252 1252 TAVOR_TNF_EXIT(tavor_wqe_send_build);
1253 1253 return (IBT_QP_SRV_TYPE_INVALID);
1254 1254 }
1255 1255
1256 1256 /*
1257 1257 * Now fill in the Data Segments (SGL) for the Send WQE based on
1258 1258 * the values set up above (i.e. "sgl", "nds", and the "ds" pointer).
1259 1259 * Start by checking for a valid number of SGL entries
1260 1260 */
1261 1261 if (nds > qp->qp_sq_sgl) {
1262 1262 TNF_PROBE_0(tavor_wqe_send_build_toomanysgl_fail,
1263 1263 TAVOR_TNF_ERROR, "");
1264 1264 TAVOR_TNF_EXIT(tavor_wqe_send_build);
1265 1265 return (IBT_QP_SGL_LEN_INVALID);
1266 1266 }
1267 1267
1268 1268 /*
1269 1269 * For each SGL in the Send Work Request, fill in the Send WQE's data
1270 1270 * segments. Note: We skip any SGL with zero size because Tavor
1271 1271 * hardware cannot handle a zero for "byte_cnt" in the WQE. Actually
1272 1272 * the encoding for zero means a 2GB transfer. Because of this special
1273 1273 * encoding in the hardware, we mask the requested length with
1274 1274 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1275 1275 * zero.)
1276 1276 */
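/*
 * The special zero encoding in isolation.  The 31-bit mask value is
 * an assumption standing in for TAVOR_WQE_SGL_BYTE_CNT_MASK.
 */
#include <assert.h>
#include <stdint.h>

#define	BYTE_CNT_MASK	0x7FFFFFFFU	/* assumed 31-bit byte count */

int
main(void)
{
	uint64_t two_gb = 0x80000000ULL;

	/* 2GB masks down to 0, which the hardware decodes as 2GB... */
	assert((two_gb & BYTE_CNT_MASK) == 0);
	/* ...so a genuinely zero-length SGL must be skipped instead */
	return (0);
}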
1277 1277 for (i = 0; i < nds; i++) {
1278 1278 if (sgl[i].ds_len == 0) {
1279 1279 continue;
1280 1280 }
1281 1281
1282 1282 /*
1283 1283 * Fill in the Data Segment(s) for the current WQE, using the
1284 1284 * information contained in the scatter-gather list of the
1285 1285 * work request.
1286 1286 */
1287 1287 TAVOR_WQE_BUILD_DATA_SEG(qp, &ds[num_ds], &sgl[i]);
1288 1288 num_ds++;
1289 1289 }
1290 1290
1291 1291 /* Return the size of descriptor (in 16-byte chunks) */
1292 1292 *size = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc) >> 4;
1293 1293
1294 1294 TAVOR_TNF_EXIT(tavor_wqe_send_build);
1295 1295 return (DDI_SUCCESS);
1296 1296 }
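/*
 * A worked example of the size computation at the end of the routine
 * above, assuming a 16-byte next/ctrl header and 16-byte data
 * segments (the real sizes come from tavor_hw.h).
 */
#include <assert.h>
#include <stdint.h>

int
main(void)
{
	uintptr_t desc = 0x1000;		/* start of the WQE */
	uintptr_t ds = desc + 16;		/* assumed 16B header */
	int num_ds = 2;				/* two 16B data segments */
	uintptr_t end = ds + (uintptr_t)num_ds * 16;
	unsigned int size = (unsigned int)((end - desc) >> 4);

	assert(size == 3);			/* three 16-byte chunks */
	return (0);
}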
1297 1297
1298 1298
1299 1299 /*
1300 1300 * tavor_wqe_send_linknext()
1301 1301 * Context: Can be called from interrupt or base context.
1302 1302 */
1303 1303 static void
1304 1304 tavor_wqe_send_linknext(ibt_send_wr_t *curr_wr, ibt_send_wr_t *prev_wr,
1305 1305 uint64_t *curr_desc, uint_t curr_descsz, uint64_t *prev_desc,
1306 1306 tavor_sw_wqe_dbinfo_t *dbinfo, tavor_qphdl_t qp)
1307 1307 {
1308 1308 uint64_t next, ctrl;
1309 1309 uint32_t nopcode, fence;
1310 1310
1311 1311 /*
1312 1312 * Calculate the "next" field of the descriptor. This amounts to
1313 1313 * setting up the "next_wqe_addr", "nopcode", "fence", and "nds"
1314 1314 * fields (see tavor_hw.h for more). Note: If there is no next
1315 1315 * descriptor (i.e. if the current descriptor is the last WQE on
1316 1316 * the chain), then set "next" to zero.
1317 1317 */
1318 1318 if (curr_desc != NULL) {
1319 1319 /*
1320 1320 * Determine the value for the Tavor WQE "nopcode" field
1321 1321 * by using the IBTF opcode from the work request
1322 1322 */
1323 1323 switch (curr_wr->wr_opcode) {
1324 1324 case IBT_WRC_RDMAW:
1325 1325 if (curr_wr->wr_flags & IBT_WR_SEND_IMMED) {
1326 1326 nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAWI;
1327 1327 } else {
1328 1328 nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAW;
1329 1329 }
1330 1330 break;
1331 1331
1332 1332 case IBT_WRC_SEND:
1333 1333 if (curr_wr->wr_flags & IBT_WR_SEND_IMMED) {
1334 1334 nopcode = TAVOR_WQE_SEND_NOPCODE_SENDI;
1335 1335 } else {
1336 1336 nopcode = TAVOR_WQE_SEND_NOPCODE_SEND;
1337 1337 }
1338 1338 break;
1339 1339
1340 1340 case IBT_WRC_RDMAR:
1341 1341 nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAR;
1342 1342 break;
1343 1343
1344 1344 case IBT_WRC_CSWAP:
1345 1345 nopcode = TAVOR_WQE_SEND_NOPCODE_ATMCS;
1346 1346 break;
1347 1347
1348 1348 case IBT_WRC_FADD:
1349 1349 nopcode = TAVOR_WQE_SEND_NOPCODE_ATMFA;
1350 1350 break;
1351 1351
1352 1352 case IBT_WRC_BIND:
1353 1353 nopcode = TAVOR_WQE_SEND_NOPCODE_BIND;
1354 1354 break;
1355 1355 }
1356 1356
1357 1357 curr_desc = (uint64_t *)(uintptr_t)((uintptr_t)curr_desc
1358 1358 - qp->qp_desc_off);
1359 1359 next = ((uint64_t)(uintptr_t)curr_desc &
1360 1360 TAVOR_WQE_NDA_MASK) << 32;
1361 1361 next = next | ((uint64_t)nopcode << 32);
1362 1362 fence = (curr_wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0;
1363 1363 if (fence) {
1364 1364 next = next | TAVOR_WQE_SEND_FENCE_MASK;
1365 1365 }
1366 1366 next = next | (curr_descsz & TAVOR_WQE_NDS_MASK);
1367 1367
1368 1368 /*
1369 1369 * If a send queue doorbell will be rung for the next
1370 1370 * WQE on the chain, then set the current WQE's "dbd" bit.
1371 1371 * Note: We also update the "dbinfo" structure here to pass
1372 1372 * back information about what should (later) be included
1373 1373 * in the send queue doorbell.
1374 1374 */
1375 1375 if (dbinfo) {
1376 1376 next = next | TAVOR_WQE_DBD_MASK;
1377 1377 dbinfo->db_nopcode = nopcode;
1378 1378 dbinfo->db_fence = fence;
1379 1379 }
1380 1380 } else {
1381 1381 next = 0;
1382 1382 }
1383 1383
1384 1384 /*
1385 1385 * If this WQE is supposed to be linked to the previous descriptor,
1386 1386 * then we need to update not only the previous WQE's "next" fields
1387 1387 * but also its "ctrl" fields (i.e. the "c", "e",
1388 1388 * "s", "i" and "immediate" fields - see tavor_hw.h for more). Note:
1389 1389 * the "e" bit is always hardcoded to zero.
1390 1390 */
1391 1391 if (prev_desc != NULL) {
1392 1392 /*
1393 1393 * If a send queue doorbell will be rung for the next WQE on
1394 1394 * the chain, then update the current WQE's "next" field and
1395 1395 * return.
1396 1396 * Note: We don't want to modify the "ctrl" field here because
1397 1397 * that portion of the previous WQE has already been set
1398 1398 * correctly at some previous point in time.
1399 1399 */
1400 1400 if (dbinfo) {
1401 1401 TAVOR_WQE_LINKFIRST(qp, prev_desc, next);
1402 1402 return;
1403 1403 }
1404 1404
1405 1405 ctrl = 0;
1406 1406
1407 1407 /* Set the "c" (i.e. "signaled") bit appropriately */
1408 1408 if (prev_wr->wr_flags & IBT_WR_SEND_SIGNAL) {
1409 1409 ctrl = ctrl | TAVOR_WQE_SEND_SIGNALED_MASK;
1410 1410 }
1411 1411
1412 1412 /* Set the "s" (i.e. "solicited") bit appropriately */
1413 1413 if (prev_wr->wr_flags & IBT_WR_SEND_SOLICIT) {
1414 1414 ctrl = ctrl | TAVOR_WQE_SEND_SOLICIT_MASK;
1415 1415 }
1416 1416
1417 1417 /* Set the "i" bit and the immediate data appropriately */
1418 1418 if (prev_wr->wr_flags & IBT_WR_SEND_IMMED) {
1419 1419 ctrl = ctrl | TAVOR_WQE_SEND_IMMEDIATE_MASK;
1420 1420 ctrl = ctrl | tavor_wr_get_immediate(prev_wr);
1421 1421 }
1422 1422
1423 1423 TAVOR_WQE_LINKNEXT(qp, prev_desc, ctrl, next);
1424 1424 }
1425 1425 }
1426 1426
1427 1427
1428 1428 /*
1429 1429 * tavor_wqe_mlx_build()
1430 1430 * Context: Can be called from interrupt or base context.
1431 1431 */
1432 1432 static int
1433 1433 tavor_wqe_mlx_build(tavor_state_t *state, tavor_qphdl_t qp,
1434 1434 ibt_send_wr_t *wr, uint64_t *desc, uint_t *size)
1435 1435 {
1436 1436 tavor_hw_udav_t udav;
1437 1437 tavor_ahhdl_t ah;
1438 1438 ib_lrh_hdr_t *lrh;
1439 1439 ib_grh_t *grh;
1440 1440 ib_bth_hdr_t *bth;
1441 1441 ib_deth_hdr_t *deth;
1442 1442 tavor_hw_wqe_sgl_t *ds;
1443 1443 ibt_wr_ds_t *sgl;
1444 1444 uint8_t *mgmtclass, *hpoint, *hcount;
1445 1445 uint64_t data;
1446 1446 uint32_t nds, offset, pktlen;
1447 1447 uint32_t desc_sz, udav_sz;
1448 1448 int i, num_ds;
1449 1449
1450 1450 TAVOR_TNF_ENTER(tavor_wqe_mlx_build);
1451 1451
1452 1452 ASSERT(MUTEX_HELD(&qp->qp_lock));
1453 1453
1454 1454 /* Initialize the information for the Data Segments */
1455 1455 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc +
1456 1456 sizeof (tavor_hw_mlx_wqe_nextctrl_t));
1457 1457
1458 1458 /*
1459 1459 * Pull the address handle from the work request and read in
1460 1460 * the contents of the UDAV. This will be used to answer some
1461 1461 * questions about the request.
1462 1462 */
1463 1463 ah = (tavor_ahhdl_t)wr->wr.ud.udwr_dest->ud_ah;
1464 1464 if (ah == NULL) {
1465 1465 TNF_PROBE_0(tavor_wqe_mlx_build_invahhdl_fail,
1466 1466 TAVOR_TNF_ERROR, "");
1467 1467 TAVOR_TNF_EXIT(tavor_wqe_mlx_build);
1468 1468 return (IBT_AH_HDL_INVALID);
1469 1469 }
1470 1470 mutex_enter(&ah->ah_lock);
1471 1471 udav_sz = sizeof (tavor_hw_udav_t) >> 3;
1472 1472 for (i = 0; i < udav_sz; i++) {
1473 1473 data = ddi_get64(ah->ah_udavrsrcp->tr_acchdl,
1474 1474 ((uint64_t *)ah->ah_udavrsrcp->tr_addr + i));
1475 1475 ((uint64_t *)&udav)[i] = data;
1476 1476 }
1477 1477 mutex_exit(&ah->ah_lock);
1478 1478
1479 1479 /*
1480 1480 * If the request is for QP1 and the destination LID is equal to
1481 1481 * the Permissive LID, then return an error. This combination is
1482 1482 * not allowed
1483 1483 */
1484 1484 if ((udav.rlid == IB_LID_PERMISSIVE) &&
1485 1485 (qp->qp_is_special == TAVOR_QP_GSI)) {
1486 1486 TNF_PROBE_0(tavor_wqe_mlx_build_permissiveLIDonQP1_fail,
1487 1487 TAVOR_TNF_ERROR, "");
1488 1488 TAVOR_TNF_EXIT(tavor_wqe_mlx_build);
1489 1489 return (IBT_AH_HDL_INVALID);
1490 1490 }
1491 1491
1492 1492 /*
1493 1493 * Calculate the size of the packet headers, including the GRH
1494 1494 * (if necessary)
1495 1495 */
1496 1496 desc_sz = sizeof (ib_lrh_hdr_t) + sizeof (ib_bth_hdr_t) +
1497 1497 sizeof (ib_deth_hdr_t);
1498 1498 if (udav.grh) {
1499 1499 desc_sz += sizeof (ib_grh_t);
1500 1500 }
1501 1501
1502 1502 /*
1503 1503 * Begin to build the first "inline" data segment for the packet
1504 1504 * headers. Note: By specifying "inline" we can build the contents
1505 1505 * of the MAD packet headers directly into the work queue (as part
1506 1506 * of the descriptor). This has the advantage of both speeding things up
1507 1507 * and of not requiring the driver to allocate/register any additional
1508 1508 * memory for the packet headers.
1509 1509 */
1510 1510 TAVOR_WQE_BUILD_INLINE(qp, &ds[0], desc_sz);
1511 1511 desc_sz += 4;
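/*
 * A sketch of what TAVOR_WQE_BUILD_INLINE() is assumed to produce: a
 * 32-bit header word whose high bit flags the segment as inline and
 * whose low bits carry the byte count.  That header word is why
 * desc_sz grows by 4 above.  The bit positions are assumptions.
 */
#include <stdint.h>

static uint32_t
build_inline_hdr(uint32_t byte_cnt)
{
	return ((1U << 31) | (byte_cnt & 0x3FFU));	/* assumed widths */
}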
1512 1512
1513 1513 /*
1514 1514 * Build Local Route Header (LRH)
1515 1515 * We start here by building the LRH into a temporary location.
1516 1516 * When we have finished we copy the LRH data into the descriptor.
1517 1517 *
1518 1518 * Notice that the VL values are hardcoded. This is not a problem
1519 1519 * because VL15 is decided later based on the value in the MLX
1520 1520 * transport "next/ctrl" header (see the "vl15" bit below), and it
1521 1521 * is otherwise (meaning for QP1) chosen from the SL-to-VL table
1522 1522 * values. This rule does not hold for loopback packets however
1523 1523 * (all of which bypass the SL-to-VL tables) and it is the reason
1524 1524 * that non-QP0 MADs are set up with VL hardcoded to zero below.
1525 1525 *
1526 1526 * Notice also that Source LID is hardcoded to the Permissive LID
1527 1527 * (0xFFFF). This is also not a problem because if the Destination
1528 1528 * LID is not the Permissive LID, then the "slr" value in the MLX
1529 1529 * transport "next/ctrl" header will be set to zero and the hardware
1530 1530 * will pull the LID from the value in the port.
1531 1531 */
1532 1532 lrh = (ib_lrh_hdr_t *)((uintptr_t)&ds[0] + 4);
1533 1533 pktlen = (desc_sz + 0x100) >> 2;
1534 1534 TAVOR_WQE_BUILD_MLX_LRH(lrh, qp, udav, pktlen);
1535 1535
1536 1536 /*
1537 1537 * Build Global Route Header (GRH)
1538 1538 * This is only built if necessary as defined by the "grh" bit in
1539 1539 * the address vector. Note: We also calculate the offset to the
1540 1540 * next header (BTH) based on whether or not the "grh" bit is set.
1541 1541 */
1542 1542 if (udav.grh) {
1543 1543 /*
1544 1544 * If the request is for QP0, then return an error. The
1545 1545	 * combination of global routing (GRH) and QP0 is not allowed.
1546 1546 */
1547 1547 if (qp->qp_is_special == TAVOR_QP_SMI) {
1548 1548 TNF_PROBE_0(tavor_wqe_mlx_build_GRHonQP0_fail,
1549 1549 TAVOR_TNF_ERROR, "");
1550 1550 TAVOR_TNF_EXIT(tavor_wqe_mlx_build);
1551 1551 return (IBT_AH_HDL_INVALID);
1552 1552 }
1553 1553 grh = (ib_grh_t *)((uintptr_t)lrh + sizeof (ib_lrh_hdr_t));
1554 1554 TAVOR_WQE_BUILD_MLX_GRH(state, grh, qp, udav, pktlen);
1555 1555
1556 1556 bth = (ib_bth_hdr_t *)((uintptr_t)grh + sizeof (ib_grh_t));
1557 1557 } else {
1558 1558 bth = (ib_bth_hdr_t *)((uintptr_t)lrh + sizeof (ib_lrh_hdr_t));
1559 1559 }
1560 1560
1561 1561
1562 1562 /*
1563 1563 * Build Base Transport Header (BTH)
1564 1564 * Notice that the M, PadCnt, and TVer fields are all set
1565 1565	 * to zero implicitly. This is true for all Management Datagrams
1566 1566	 * (MADs), whether GSI or SMI.
1567 1567 */
1568 1568 TAVOR_WQE_BUILD_MLX_BTH(state, bth, qp, wr);
1569 1569
1570 1570 /*
1571 1571 * Build Datagram Extended Transport Header (DETH)
1572 1572 */
1573 1573 deth = (ib_deth_hdr_t *)((uintptr_t)bth + sizeof (ib_bth_hdr_t));
1574 1574 TAVOR_WQE_BUILD_MLX_DETH(deth, qp);
1575 1575
1576 1576 /* Ensure that the Data Segment is aligned on a 16-byte boundary */
1577 1577 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)deth + sizeof (ib_deth_hdr_t));
1578 1578 ds = (tavor_hw_wqe_sgl_t *)(((uintptr_t)ds + 0xF) & ~0xF);
1579 1579 nds = wr->wr_nds;
1580 1580 sgl = wr->wr_sgl;
1581 1581 num_ds = 0;
1582 1582
1583 1583 /*
1584 1584 * Now fill in the Data Segments (SGL) for the MLX WQE based on the
1585 1585	 * values set up above (i.e. "sgl", "nds", and the "ds" pointer).
1586 1586	 * Start by checking for a valid number of SGL entries.
1587 1587 */
1588 1588 if (nds > qp->qp_sq_sgl) {
1589 1589 TNF_PROBE_0(tavor_wqe_mlx_build_toomanysgl_fail,
1590 1590 TAVOR_TNF_ERROR, "");
1591 1591 TAVOR_TNF_EXIT(tavor_wqe_mlx_build);
1592 1592 return (IBT_QP_SGL_LEN_INVALID);
1593 1593 }
1594 1594
1595 1595 /*
1596 1596 * For each SGL in the Send Work Request, fill in the MLX WQE's data
1597 1597 * segments. Note: We skip any SGL with zero size because Tavor
1598 1598 * hardware cannot handle a zero for "byte_cnt" in the WQE. Actually
1599 1599 * the encoding for zero means a 2GB transfer. Because of this special
1600 1600 * encoding in the hardware, we mask the requested length with
1601 1601 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1602 1602 * zero.)
1603 1603 */
1604 1604 mgmtclass = hpoint = hcount = NULL;
1605 1605 offset = 0;
1606 1606 for (i = 0; i < nds; i++) {
1607 1607 if (sgl[i].ds_len == 0) {
1608 1608 continue;
1609 1609 }
1610 1610
1611 1611 /*
1612 1612 * Fill in the Data Segment(s) for the MLX send WQE, using
1613 1613 * the information contained in the scatter-gather list of
1614 1614 * the work request.
1615 1615 */
1616 1616 TAVOR_WQE_BUILD_DATA_SEG(qp, &ds[num_ds], &sgl[i]);
1617 1617
1618 1618 /*
1619 1619 * Search through the contents of all MADs posted to QP0 to
1620 1620 * initialize pointers to the places where Directed Route "hop
1621 1621 * pointer", "hop count", and "mgmtclass" would be. Tavor
1622 1622 * needs these updated (i.e. incremented or decremented, as
1623 1623 * necessary) by software.
1624 1624 */
1625 1625 if (qp->qp_is_special == TAVOR_QP_SMI) {
1626 1626
1627 1627 TAVOR_SPECIAL_QP_DRMAD_GET_MGMTCLASS(mgmtclass,
1628 1628 offset, sgl[i].ds_va, sgl[i].ds_len);
1629 1629
1630 1630 TAVOR_SPECIAL_QP_DRMAD_GET_HOPPOINTER(hpoint,
1631 1631 offset, sgl[i].ds_va, sgl[i].ds_len);
1632 1632
1633 1633 TAVOR_SPECIAL_QP_DRMAD_GET_HOPCOUNT(hcount,
1634 1634 offset, sgl[i].ds_va, sgl[i].ds_len);
1635 1635
1636 1636 offset += sgl[i].ds_len;
1637 1637 }
1638 1638 num_ds++;
1639 1639 }
1640 1640
1641 1641 /*
1642 1642 * Tavor's Directed Route MADs need to have the "hop pointer"
1643 1643 * incremented/decremented (as necessary) depending on whether it is
1644 1644 * currently less than or greater than the "hop count" (i.e. whether
1645 1645 * the MAD is a request or a response.)
1646 1646 */
1647 1647 if (qp->qp_is_special == TAVOR_QP_SMI) {
1648 1648 TAVOR_SPECIAL_QP_DRMAD_DO_HOPPOINTER_MODIFY(*mgmtclass,
1649 1649 *hpoint, *hcount);
1650 1650 }
1651 1651
1652 1652 /*
1653 1653 * Now fill in the ICRC Data Segment. This data segment is inlined
1654 1654 * just like the packets headers above, but it is only four bytes and
1655 1655	 * set to zero (to indicate that we wish the hardware to generate ICRC).
1656 1656 */
1657 1657 TAVOR_WQE_BUILD_INLINE_ICRC(qp, &ds[num_ds], 4, 0);
1658 1658 num_ds++;
1659 1659
1660 1660 /* Return the size of descriptor (in 16-byte chunks) */
1661 1661 *size = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc) >> 0x4;
1662 1662
1663 1663 TAVOR_TNF_EXIT(tavor_wqe_mlx_build);
1664 1664 return (DDI_SUCCESS);
1665 1665 }
1666 1666
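[Editor's sketch] The "inline" segments built above (via TAVOR_WQE_BUILD_INLINE and TAVOR_WQE_BUILD_INLINE_ICRC) are easier to see in isolation. The following is a minimal sketch, not part of tavor_wr.c: the 0x80000000 flag value and the sketch_* names are assumptions for illustration only; the driver's macros in tavor_hw.h are authoritative.

/*
 * Sketch: an "inline" data segment replaces the usual address/lkey
 * scatter-gather entry with a tagged byte count followed immediately
 * by the payload bytes, so no extra registered memory is needed.
 */
#define	SKETCH_INLINE_FLAG	0x80000000u	/* assumed inline tag bit */

static void
sketch_build_inline(uint32_t *seg, const void *data, uint32_t len)
{
	seg[0] = SKETCH_INLINE_FLAG | len;	/* tagged byte count */
	bcopy(data, &seg[1], len);		/* payload lives in the WQE */
}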
1667 1667
1668 1668 /*
1669 1669 * tavor_wqe_mlx_linknext()
1670 1670 * Context: Can be called from interrupt or base context.
1671 1671 */
1672 1672 static void
1673 1673 tavor_wqe_mlx_linknext(ibt_send_wr_t *prev_wr, uint64_t *curr_desc,
1674 1674 uint_t curr_descsz, uint64_t *prev_desc, tavor_sw_wqe_dbinfo_t *dbinfo,
1675 1675 tavor_qphdl_t qp)
1676 1676 {
1677 1677 tavor_hw_udav_t udav;
1678 1678 tavor_ahhdl_t ah;
1679 1679 uint64_t next, ctrl, data;
1680 1680 uint_t nopcode;
1681 1681 uint_t udav_sz;
1682 1682 int i;
1683 1683
1684 1684 /*
1685 1685 * Calculate the "next" field of the descriptor. This amounts to
1686 1686 * setting up the "next_wqe_addr", "nopcode", and "nds" fields (see
1687 1687 * tavor_hw.h for more). Note: If there is no next descriptor (i.e.
1688 1688 * if the current descriptor is the last WQE on the chain), then set
1689 1689 * "next" to zero.
1690 1690 */
1691 1691 if (curr_desc != NULL) {
1692 1692 /*
1693 1693 * The only valid Tavor WQE "nopcode" for MLX transport
1694 1694 * requests is the "Send" code.
1695 1695 */
1696 1696 nopcode = TAVOR_WQE_SEND_NOPCODE_SEND;
1697 1697 curr_desc = (uint64_t *)(uintptr_t)((uint64_t)
1698 1698 (uintptr_t)curr_desc - qp->qp_desc_off);
1699 1699 next = (uint64_t)((uintptr_t)curr_desc &
1700 1700 TAVOR_WQE_NDA_MASK) << 32;
1701 1701 next = next | ((uint64_t)nopcode << 32);
1702 1702 next = next | (curr_descsz & TAVOR_WQE_NDS_MASK);
1703 1703
1704 1704 /*
1705 1705 * If a send queue doorbell will be rung for the next
1706 1706 * WQE on the chain, then set the current WQE's "dbd" bit.
1707 1707 * Note: We also update the "dbinfo" structure here to pass
1708 1708 * back information about what should (later) be included
1709 1709 * in the send queue doorbell.
1710 1710 */
1711 1711 if (dbinfo) {
1712 1712 next = next | TAVOR_WQE_DBD_MASK;
1713 1713 dbinfo->db_nopcode = nopcode;
1714 1714 dbinfo->db_fence = 0;
1715 1715 }
1716 1716 } else {
1717 1717 next = 0;
1718 1718 }
1719 1719
1720 1720 /*
1721 1721 * If this WQE is supposed to be linked to the previous descriptor,
1722 1722 * then we need to update not only the previous WQE's "next" fields
1723 1723 * but we must also update this WQE's "ctrl" fields (i.e. the "vl15",
1724 1724 * "slr", "max_srate", "sl", "c", "e", "rlid", and "vcrc" fields -
1725 1725 * see tavor_hw.h for more) Note: the "e" bit and "vcrc" fields are
1726 1726 * always hardcoded to zero.
1727 1727 */
1728 1728 if (prev_desc != NULL) {
1729 1729 /*
1730 1730 * If a send queue doorbell will be rung for the next WQE on
1731 1731 * the chain, then update the current WQE's "next" field and
1732 1732 * return.
1733 1733 * Note: We don't want to modify the "ctrl" field here because
1734 1734 * that portion of the previous WQE has already been set
1735 1735 * correctly at some previous point in time.
1736 1736 */
1737 1737 if (dbinfo) {
1738 1738 TAVOR_WQE_LINKFIRST(qp, prev_desc, next);
1739 1739 return;
1740 1740 }
1741 1741
1742 1742 /*
1743 1743 * Pull the address handle from the work request and read in
1744 1744 * the contents of the UDAV. This will be used to answer some
1745 1745 * questions about the request.
1746 1746 */
1747 1747 ah = (tavor_ahhdl_t)prev_wr->wr.ud.udwr_dest->ud_ah;
1748 1748 mutex_enter(&ah->ah_lock);
1749 1749 udav_sz = sizeof (tavor_hw_udav_t) >> 3;
1750 1750 for (i = 0; i < udav_sz; i++) {
1751 1751 data = ddi_get64(ah->ah_udavrsrcp->tr_acchdl,
1752 1752 ((uint64_t *)ah->ah_udavrsrcp->tr_addr + i));
1753 1753 ((uint64_t *)&udav)[i] = data;
1754 1754 }
1755 1755 mutex_exit(&ah->ah_lock);
1756 1756
1757 1757 ctrl = 0;
1758 1758
1759 1759 /* Only QP0 uses VL15, otherwise use VL in the packet */
1760 1760 if (qp->qp_is_special == TAVOR_QP_SMI) {
1761 1761 ctrl = ctrl | TAVOR_WQE_MLXHDR_VL15_MASK;
1762 1762 }
1763 1763
1764 1764 /*
1765 1765 * The SLR (Source LID Replace) bit determines whether the
1766 1766 * source LID for an outgoing MLX packet should come from the
1767 1767 * PortInfo (SLR = 0) or should be left as it is in the
1768 1768 * descriptor (SLR = 1). The latter is necessary for packets
1769 1769 * to be sent with the Permissive LID.
1770 1770 */
1771 1771 if (udav.rlid == IB_LID_PERMISSIVE) {
1772 1772 ctrl = ctrl | TAVOR_WQE_MLXHDR_SLR_MASK;
1773 1773 }
1774 1774
1775 1775 /* Fill in the max static rate from the address handle */
1776 1776 ctrl = ctrl | ((uint64_t)udav.max_stat_rate <<
1777 1777 TAVOR_WQE_MLXHDR_SRATE_SHIFT);
1778 1778
1779 1779 /* All VL15 (i.e. SMI) traffic is required to use SL 0 */
1780 1780 if (qp->qp_is_special != TAVOR_QP_SMI) {
1781 1781 ctrl = ctrl | ((uint64_t)udav.sl <<
1782 1782 TAVOR_WQE_MLXHDR_SL_SHIFT);
1783 1783 }
1784 1784
1785 1785 /* Set the "c" (i.e. "signaled") bit appropriately */
1786 1786 if (prev_wr->wr_flags & IBT_WR_SEND_SIGNAL) {
1787 1787 ctrl = ctrl | TAVOR_WQE_MLXHDR_SIGNALED_MASK;
1788 1788 }
1789 1789
1790 1790 /* Fill in the destination LID from the address handle */
1791 1791 ctrl = ctrl | ((uint64_t)udav.rlid <<
1792 1792 TAVOR_WQE_MLXHDR_RLID_SHIFT);
1793 1793
1794 1794 TAVOR_WQE_LINKNEXT(qp, prev_desc, ctrl, next);
1795 1795 }
1796 1796 }
1797 1797
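[Editor's sketch] The 64-bit "next" word assembled in tavor_wqe_mlx_linknext() above packs the next-WQE address, the opcode, the descriptor size, and the doorbell bit into a single store. A standalone sketch of that packing follows; the mask and shift values are placeholders (only the 0xFFFFFFC0 WQE-address alignment is visible in this file), and TAVOR_WQE_NDA_MASK, TAVOR_WQE_NDS_MASK, and TAVOR_WQE_DBD_MASK in tavor_hw.h are authoritative.

/*
 * Sketch: packing of the WQE "next" doubleword.
 */
static uint64_t
sketch_pack_next(uint32_t nda, uint32_t nopcode, uint32_t nds, int dbd)
{
	uint64_t next;

	next  = ((uint64_t)(nda & 0xFFFFFFC0) << 32);	/* 64B-aligned addr */
	next |= ((uint64_t)nopcode << 32);		/* shares the high word */
	next |= (nds & 0x3F);				/* 16B chunks (assumed mask) */
	if (dbd)
		next |= 0x80000000ull;			/* dbd bit (assumed position) */
	return (next);
}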
1798 1798
1799 1799 /*
1800 1800 * tavor_wqe_recv_build()
1801 1801 * Context: Can be called from interrupt or base context.
1802 1802 */
1803 1803 /* ARGSUSED */
1804 1804 static int
1805 1805 tavor_wqe_recv_build(tavor_state_t *state, tavor_qphdl_t qp,
1806 1806 ibt_recv_wr_t *wr, uint64_t *desc, uint_t *size)
1807 1807 {
1808 1808 tavor_hw_wqe_sgl_t *ds;
1809 1809 int i, num_ds;
1810 1810
1811 1811 TAVOR_TNF_ENTER(tavor_wqe_recv_build);
1812 1812
1813 1813 ASSERT(MUTEX_HELD(&qp->qp_lock));
1814 1814
1815 1815 /* Check that work request transport type is valid */
1816 1816 if ((qp->qp_serv_type != TAVOR_QP_UD) &&
1817 1817 (qp->qp_serv_type != TAVOR_QP_RC) &&
1818 1818 (qp->qp_serv_type != TAVOR_QP_UC)) {
1819 1819 TNF_PROBE_0(tavor_build_recv_wqe_inv_servtype_fail,
1820 1820 TAVOR_TNF_ERROR, "");
1821 1821 TAVOR_TNF_EXIT(tavor_build_recv_wqe);
1822 1822 return (IBT_QP_SRV_TYPE_INVALID);
1823 1823 }
1824 1824
1825 1825 /* Fill in the Data Segments (SGL) for the Recv WQE */
1826 1826 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc +
1827 1827 sizeof (tavor_hw_rcv_wqe_nextctrl_t));
1828 1828 num_ds = 0;
1829 1829
1830 1830 /* Check for valid number of SGL entries */
1831 1831 if (wr->wr_nds > qp->qp_rq_sgl) {
1832 1832 TNF_PROBE_0(tavor_wqe_recv_build_toomanysgl_fail,
1833 1833 TAVOR_TNF_ERROR, "");
1834 1834 TAVOR_TNF_EXIT(tavor_wqe_recv_build);
1835 1835 return (IBT_QP_SGL_LEN_INVALID);
1836 1836 }
1837 1837
1838 1838 /*
1839 1839 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
1840 1840 * segments. Note: We skip any SGL with zero size because Tavor
1841 1841 * hardware cannot handle a zero for "byte_cnt" in the WQE. Actually
1842 1842 * the encoding for zero means a 2GB transfer. Because of this special
1843 1843 * encoding in the hardware, we mask the requested length with
1844 1844 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1845 1845 * zero.)
1846 1846 */
1847 1847 for (i = 0; i < wr->wr_nds; i++) {
1848 1848 if (wr->wr_sgl[i].ds_len == 0) {
1849 1849 continue;
1850 1850 }
1851 1851
1852 1852 /*
1853 1853 * Fill in the Data Segment(s) for the receive WQE, using the
1854 1854 * information contained in the scatter-gather list of the
1855 1855 * work request.
1856 1856 */
1857 1857 TAVOR_WQE_BUILD_DATA_SEG(qp, &ds[num_ds], &wr->wr_sgl[i]);
1858 1858 num_ds++;
1859 1859 }
1860 1860
1861 1861 /* Return the size of descriptor (in 16-byte chunks) */
1862 1862 *size = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc) >> 0x4;
1863 1863
1864 1864 TAVOR_TNF_EXIT(tavor_wqe_recv_build);
1865 1865 return (DDI_SUCCESS);
1866 1866 }
1867 1867
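[Editor's sketch] The zero-length/2GB rule described in the comment above reduces to one invariant: a "byte_cnt" of zero means a 2GB transfer to the hardware, so zero-length SGEs must never reach the hardware and a length of exactly 2GB must wrap to zero. A sketch follows, assuming TAVOR_WQE_SGL_BYTE_CNT_MASK is the low 31 bits (the actual definition is in tavor_hw.h).

/*
 * Sketch: encoding an SGE byte count for the WQE.
 */
static uint32_t
sketch_encode_byte_cnt(uint64_t ds_len)
{
	ASSERT(ds_len != 0);		/* callers skip zero-length SGEs */
	return ((uint32_t)(ds_len & 0x7FFFFFFF));	/* 2GB encodes as zero */
}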
1868 1868
1869 1869 /*
1870 1870 * tavor_wqe_recv_linknext()
1871 1871 * Context: Can be called from interrupt or base context.
1872 1872 */
1873 1873 static void
1874 1874 tavor_wqe_recv_linknext(uint64_t *curr_desc, uint_t curr_descsz,
1875 1875 uint64_t *prev_desc, tavor_qphdl_t qp)
1876 1876 {
1877 1877 uint64_t next;
1878 1878
1879 1879 /*
1880 1880 * Calculate the "next" field of the descriptor. This amounts to
1881 1881 * setting up the "next_wqe_addr", "dbd", and "nds" fields (see
1882 1882 * tavor_hw.h for more). Note: If there is no next descriptor (i.e.
1883 1883 * if the current descriptor is the last WQE on the chain), then set
1884 1884 * "next" field to TAVOR_WQE_DBD_MASK. This is because the Tavor
1885 1885 * hardware requires the "dbd" bit to be set to one for all Recv WQEs.
1886 1886 * In either case, we must add a single bit in the "reserved" field
1887 1887 * (TAVOR_RCV_WQE_NDA0_WA_MASK) following the NDA. This is the
1888 1888 * workaround for a known Tavor errata that can cause Recv WQEs with
1889 1889 * zero in the NDA field to behave improperly.
1890 1890 */
1891 1891 if (curr_desc != NULL) {
1892 1892 curr_desc = (uint64_t *)(uintptr_t)((uintptr_t)curr_desc -
1893 1893 qp->qp_desc_off);
1894 1894 next = (uint64_t)((uintptr_t)curr_desc &
1895 1895 TAVOR_WQE_NDA_MASK) << 32;
1896 1896 next = next | (curr_descsz & TAVOR_WQE_NDS_MASK) |
1897 1897 TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;
1898 1898 } else {
1899 1899 next = TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;
1900 1900 }
1901 1901
1902 1902 /*
1903 1903 * If this WQE is supposed to be linked to the previous descriptor,
1904 1904 * then we need to update not only the previous WQE's "next" fields
1905 1905 * but we must also update this WQE's "ctrl" fields (i.e. the "c" and
1906 1906 * "e" bits - see tavor_hw.h for more). Note: both the "c" and "e"
1907 1907 * bits are always hardcoded to zero.
1908 1908 */
1909 1909 if (prev_desc != NULL) {
1910 1910 TAVOR_WQE_LINKNEXT(qp, prev_desc, 0, next);
1911 1911 }
1912 1912 }
1913 1913
1914 1914
1915 1915 /*
1916 1916 * tavor_wqe_srq_build()
1917 1917 * Context: Can be called from interrupt or base context.
1918 1918 */
1919 1919 /* ARGSUSED */
1920 1920 static int
1921 1921 tavor_wqe_srq_build(tavor_state_t *state, tavor_srqhdl_t srq,
1922 1922 ibt_recv_wr_t *wr, uint64_t *desc)
1923 1923 {
1924 1924 tavor_hw_wqe_sgl_t *ds;
1925 1925 ibt_wr_ds_t end_sgl;
1926 1926 int i, num_ds;
1927 1927
1928 1928	TAVOR_TNF_ENTER(tavor_wqe_srq_build);
1929 1929
1930 1930 ASSERT(MUTEX_HELD(&srq->srq_lock));
1931 1931
1932 1932 /* Fill in the Data Segments (SGL) for the Recv WQE */
1933 1933 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc +
1934 1934 sizeof (tavor_hw_rcv_wqe_nextctrl_t));
1935 1935 num_ds = 0;
1936 1936
1937 1937 /* Check for valid number of SGL entries */
1938 1938 if (wr->wr_nds > srq->srq_wq_sgl) {
1939 1939 TNF_PROBE_0(tavor_wqe_srq_build_toomanysgl_fail,
1940 1940 TAVOR_TNF_ERROR, "");
1941 1941 TAVOR_TNF_EXIT(tavor_wqe_srq_build);
1942 1942 return (IBT_QP_SGL_LEN_INVALID);
1943 1943 }
1944 1944
1945 1945 /*
1946 1946 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
1947 1947 * segments. Note: We skip any SGL with zero size because Tavor
1948 1948 * hardware cannot handle a zero for "byte_cnt" in the WQE. Actually
1949 1949 * the encoding for zero means a 2GB transfer. Because of this special
1950 1950 * encoding in the hardware, we mask the requested length with
1951 1951 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1952 1952 * zero.)
1953 1953 */
1954 1954 for (i = 0; i < wr->wr_nds; i++) {
1955 1955 if (wr->wr_sgl[i].ds_len == 0) {
1956 1956 continue;
1957 1957 }
1958 1958
1959 1959 /*
1960 1960 * Fill in the Data Segment(s) for the receive WQE, using the
1961 1961 * information contained in the scatter-gather list of the
1962 1962 * work request.
1963 1963 */
1964 1964 TAVOR_WQE_BUILD_DATA_SEG_SRQ(srq, &ds[num_ds], &wr->wr_sgl[i]);
1965 1965 num_ds++;
1966 1966 }
1967 1967
1968 1968 /*
1969 1969 * For SRQ, if the number of data segments is less than the maximum
1970 1970 * specified at alloc, then we have to fill in a special "key" entry in
1971 1971 * the sgl entry after the last valid one in this post request. We do
1972 1972 * that here.
1973 1973 */
1974 1974 if (num_ds < srq->srq_wq_sgl) {
1975 1975 end_sgl.ds_va = 0;
1976 1976 end_sgl.ds_len = 0;
1977 1977 end_sgl.ds_key = 0x1;
1978 1978 TAVOR_WQE_BUILD_DATA_SEG_SRQ(srq, &ds[num_ds], &end_sgl);
1979 1979 }
1980 1980
1981 1981 TAVOR_TNF_EXIT(tavor_wqe_srq_build);
1982 1982 return (DDI_SUCCESS);
1983 1983 }
1984 1984
1985 1985
1986 1986 /*
1987 1987 * tavor_wqe_srq_linknext()
1988 1988 * Context: Can be called from interrupt or base context.
1989 1989 */
1990 1990 static void
1991 1991 tavor_wqe_srq_linknext(uint64_t *curr_desc, uint64_t *prev_desc,
1992 1992 tavor_srqhdl_t srq)
1993 1993 {
1994 1994 uint64_t next;
1995 1995
1996 1996 /*
1997 1997 * Calculate the "next" field of the descriptor. This amounts to
1998 1998 * setting up the "next_wqe_addr", "dbd", and "nds" fields (see
1999 1999 * tavor_hw.h for more). Note: If there is no next descriptor (i.e.
2000 2000 * if the current descriptor is the last WQE on the chain), then set
2001 2001 * "next" field to TAVOR_WQE_DBD_MASK. This is because the Tavor
2002 2002 * hardware requires the "dbd" bit to be set to one for all Recv WQEs.
2003 2003 * In either case, we must add a single bit in the "reserved" field
2004 2004 * (TAVOR_RCV_WQE_NDA0_WA_MASK) following the NDA. This is the
2005 2005 * workaround for a known Tavor errata that can cause Recv WQEs with
2006 2006 * zero in the NDA field to behave improperly.
2007 2007 */
2008 2008 if (curr_desc != NULL) {
2009 2009 curr_desc = (uint64_t *)(uintptr_t)((uintptr_t)curr_desc -
2010 2010 srq->srq_desc_off);
2011 2011 next = (uint64_t)((uintptr_t)curr_desc &
2012 2012 TAVOR_WQE_NDA_MASK) << 32;
2013 2013 next = next | TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;
2014 2014 } else {
2015 2015 next = TAVOR_RCV_WQE_NDA0_WA_MASK;
2016 2016 }
2017 2017
2018 2018 /*
2019 2019 * If this WQE is supposed to be linked to the previous descriptor,
2020 2020 * then we need to update not only the previous WQE's "next" fields
2021 2021 * but we must also update this WQE's "ctrl" fields (i.e. the "c" and
2022 2022 * "e" bits - see tavor_hw.h for more). Note: both the "c" and "e"
2023 2023 * bits are always hardcoded to zero.
2024 2024 */
2025 2025 if (prev_desc != NULL) {
2026 2026 TAVOR_WQE_LINKNEXT_SRQ(srq, prev_desc, 0, next);
2027 2027 }
2028 2028 }
2029 2029
2030 2030
2031 2031 /*
2032 2032 * tavor_wr_get_immediate()
2033 2033 * Context: Can be called from interrupt or base context.
2034 2034 */
2035 2035 static uint32_t
2036 2036 tavor_wr_get_immediate(ibt_send_wr_t *wr)
2037 2037 {
2038 2038 /*
2039 2039 * This routine extracts the "immediate data" from the appropriate
2040 2040 * location in the IBTF work request. Because of the way the
2041 2041 * work request structure is defined, the location for this data
2042 2042 * depends on the actual work request operation type.
2043 2043 */
2044 2044
2045 2045 /* For RDMA Write, test if RC or UC */
2046 2046 if (wr->wr_opcode == IBT_WRC_RDMAW) {
2047 2047 if (wr->wr_trans == IBT_RC_SRV) {
2048 2048 return (wr->wr.rc.rcwr.rdma.rdma_immed);
2049 2049 } else { /* IBT_UC_SRV */
2050 2050 return (wr->wr.uc.ucwr.rdma.rdma_immed);
2051 2051 }
2052 2052 }
2053 2053
2054 2054 /* For Send, test if RC, UD, or UC */
2055 2055 if (wr->wr_opcode == IBT_WRC_SEND) {
2056 2056 if (wr->wr_trans == IBT_RC_SRV) {
2057 2057 return (wr->wr.rc.rcwr.send_immed);
2058 2058 } else if (wr->wr_trans == IBT_UD_SRV) {
2059 2059 return (wr->wr.ud.udwr_immed);
2060 2060 } else { /* IBT_UC_SRV */
2061 2061 return (wr->wr.uc.ucwr.send_immed);
2062 2062 }
2063 2063 }
2064 2064
2065 2065 /*
2066 2066 * If any other type of request, then immediate is undefined
2067 2067 */
2068 2068 return (0);
2069 2069 }
2070 2070
2071 2071
2072 2072 /*
2073 2073 * tavor_wqe_sync()
2074 2074 * Context: Can be called from interrupt or base context.
2075 2075 */
2076 2076 static void
2077 2077 tavor_wqe_sync(void *hdl, uint_t sync_from, uint_t sync_to,
2078 2078 uint_t sync_type, uint_t flag)
2079 2079 {
2080 2080 tavor_qphdl_t qp;
2081 2081 tavor_srqhdl_t srq;
2082 2082 uint_t is_sync_req;
2083 2083 uint64_t *wqe_from, *wqe_to, *wqe_base, *wqe_top;
2084 2084 ddi_dma_handle_t dmahdl;
2085 2085 off_t offset;
2086 2086 size_t length;
2087 2087 uint32_t qsize;
2088 2088 int status;
2089 2089
2090 2090 TAVOR_TNF_ENTER(tavor_wqe_sync);
2091 2091
2092 2092 if (sync_type == TAVOR_WR_SRQ) {
2093 2093 srq = (tavor_srqhdl_t)hdl;
2094 2094 is_sync_req = srq->srq_sync;
2095 2095 /* Get the DMA handle from SRQ context */
2096 2096 dmahdl = srq->srq_mrhdl->mr_bindinfo.bi_dmahdl;
2097 2097 } else {
2098 2098 qp = (tavor_qphdl_t)hdl;
2099 2099 is_sync_req = qp->qp_sync;
2100 2100 /* Get the DMA handle from QP context */
2101 2101 dmahdl = qp->qp_mrhdl->mr_bindinfo.bi_dmahdl;
2102 2102 }
2103 2103
2104 2104 /* Determine if the work queues need to be synced or not */
2105 2105 if (is_sync_req == 0) {
2106 2106 TAVOR_TNF_EXIT(tavor_wqe_sync);
2107 2107 return;
2108 2108 }
2109 2109
2110 2110 /*
2111 2111 * Depending on the type of the work queue, we grab information
2112 2112 * about the address ranges we need to DMA sync.
2113 2113 */
2114 2114 if (sync_type == TAVOR_WR_SEND) {
2115 2115 wqe_from = TAVOR_QP_SQ_ENTRY(qp, sync_from);
2116 2116 wqe_to = TAVOR_QP_SQ_ENTRY(qp, sync_to);
2117 2117 qsize = qp->qp_sq_bufsz;
2118 2118
2119 2119 wqe_base = TAVOR_QP_SQ_ENTRY(qp, 0);
2120 2120 wqe_top = TAVOR_QP_SQ_ENTRY(qp, qsize);
2121 2121 } else if (sync_type == TAVOR_WR_RECV) {
2122 2122 wqe_from = TAVOR_QP_RQ_ENTRY(qp, sync_from);
2123 2123 wqe_to = TAVOR_QP_RQ_ENTRY(qp, sync_to);
2124 2124 qsize = qp->qp_rq_bufsz;
2125 2125
2126 2126 wqe_base = TAVOR_QP_RQ_ENTRY(qp, 0);
2127 2127 wqe_top = TAVOR_QP_RQ_ENTRY(qp, qsize);
2128 2128 } else {
2129 2129 wqe_from = TAVOR_SRQ_WQ_ENTRY(srq, sync_from);
2130 2130 wqe_to = TAVOR_SRQ_WQ_ENTRY(srq, sync_to);
2131 2131 qsize = srq->srq_wq_bufsz;
2132 2132
2133 2133 wqe_base = TAVOR_SRQ_WQ_ENTRY(srq, 0);
2134 2134 wqe_top = TAVOR_SRQ_WQ_ENTRY(srq, qsize);
2135 2135 }
2136 2136
2137 2137 /*
2138 2138 * There are two possible cases for the beginning and end of the WQE
2139 2139 * chain we are trying to sync. Either this is the simple case, where
2140 2140 * the end of the chain is below the beginning of the chain, or it is
2141 2141 * the "wrap-around" case, where the end of the chain has wrapped over
2142 2142 * the end of the queue. In the former case, we simply need to
2143 2143 * calculate the span from beginning to end and sync it. In the latter
2144 2144 * case, however, we need to calculate the span from the top of the
2145 2145 * work queue to the end of the chain and sync that, and then we need
2146 2146 * to find the other portion (from beginning of chain to end of queue)
2147 2147 * and sync that as well. Note: if the "top to end" span is actually
2148 2148 * zero length, then we don't do a DMA sync because a zero length DMA
2149 2149 * sync unnecessarily syncs the entire work queue.
2150 2150 */
2151 2151 if (wqe_to > wqe_from) {
2152 2152 /* "From Beginning to End" */
2153 2153 offset = (off_t)((uintptr_t)wqe_from - (uintptr_t)wqe_base);
2154 2154 length = (size_t)((uintptr_t)wqe_to - (uintptr_t)wqe_from);
2155 2155
2156 2156 status = ddi_dma_sync(dmahdl, offset, length, flag);
2157 2157 if (status != DDI_SUCCESS) {
2158 2158 TNF_PROBE_0(tavor_wqe_sync_fail, TAVOR_TNF_ERROR, "");
2159 2159 TAVOR_TNF_EXIT(tavor_wqe_sync);
2160 2160 return;
2161 2161 }
2162 2162 } else {
2163 2163 /* "From Top to End" */
2164 2164 offset = (off_t)0;
2165 2165 length = (size_t)((uintptr_t)wqe_to - (uintptr_t)wqe_base);
2166 2166 if (length) {
2167 2167 status = ddi_dma_sync(dmahdl, offset, length, flag);
2168 2168 if (status != DDI_SUCCESS) {
2169 2169 TNF_PROBE_0(tavor_wqe_sync_fail,
2170 2170 TAVOR_TNF_ERROR, "");
2171 2171 TAVOR_TNF_EXIT(tavor_wqe_sync);
2172 2172 return;
2173 2173 }
2174 2174 }
2175 2175
2176 2176 /* "From Beginning to Bottom" */
2177 2177 offset = (off_t)((uintptr_t)wqe_from - (uintptr_t)wqe_base);
2178 2178 length = (size_t)((uintptr_t)wqe_top - (uintptr_t)wqe_from);
2179 2179 status = ddi_dma_sync(dmahdl, offset, length, flag);
2180 2180 if (status != DDI_SUCCESS) {
2181 2181 TNF_PROBE_0(tavor_wqe_sync_fail, TAVOR_TNF_ERROR, "");
2182 2182 TAVOR_TNF_EXIT(tavor_wqe_sync);
2183 2183 return;
2184 2184 }
2185 2185 }
2186 2186
2187 2187 TAVOR_TNF_EXIT(tavor_wqe_sync);
2188 2188 }
2189 2189
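[Editor's sketch] The wrap-around handling in tavor_wqe_sync() reduces to simple interval arithmetic over the queue buffer. A sketch follows, assuming a queue of fixed-stride entries indexed from zero (the driver computes the actual entry addresses with the TAVOR_QP_*_ENTRY and TAVOR_SRQ_WQ_ENTRY macros); the sketch_* names are illustrative.

typedef struct sketch_span {
	off_t	ss_off;
	size_t	ss_len;
} sketch_span_t;

/*
 * Sketch: compute the one or two [offset, length) spans handed to
 * ddi_dma_sync().  "esz" is the (assumed fixed) entry stride in bytes.
 * The zero-length "top to end" span is skipped, as in the code above,
 * because a zero-length DMA sync would sync the entire work queue.
 */
static int
sketch_sync_spans(uint_t from, uint_t to, uint_t qsize, size_t esz,
    sketch_span_t span[2])
{
	int n = 0;

	if (to > from) {			/* simple case: no wrap */
		span[0].ss_off = (off_t)(from * esz);
		span[0].ss_len = (to - from) * esz;
		return (1);
	}
	if (to != 0) {				/* "from top to end" */
		span[n].ss_off = 0;
		span[n].ss_len = to * esz;
		n++;
	}
	span[n].ss_off = (off_t)(from * esz);	/* "from beginning to bottom" */
	span[n].ss_len = (qsize - from) * esz;
	return (n + 1);
}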
2190 2190
2191 2191 /*
2192 2192 * tavor_wr_bind_check()
2193 2193 * Context: Can be called from interrupt or base context.
2194 2194 */
2195 2195 static int
2196 2196 tavor_wr_bind_check(tavor_state_t *state, ibt_send_wr_t *wr)
2197 2197 {
2198 2198 ibt_bind_flags_t bind_flags;
2199 2199 uint64_t vaddr, len;
2200 2200 uint64_t reg_start_addr, reg_end_addr;
2201 2201 tavor_mwhdl_t mw;
2202 2202 tavor_mrhdl_t mr;
2203 2203 tavor_rsrc_t *mpt;
2204 2204 uint32_t new_rkey;
2205 2205
2206 2206 TAVOR_TNF_ENTER(tavor_wr_bind_check);
2207 2207
2208 2208 /* Check for a valid Memory Window handle in the WR */
2209 2209 mw = (tavor_mwhdl_t)wr->wr.rc.rcwr.bind->bind_ibt_mw_hdl;
2210 2210 if (mw == NULL) {
2211 2211 TNF_PROBE_0(tavor_wr_bind_check_invmwhdl_fail,
2212 2212 TAVOR_TNF_ERROR, "");
2213 2213 TAVOR_TNF_EXIT(tavor_wr_bind_check);
2214 2214 return (IBT_MW_HDL_INVALID);
2215 2215 }
2216 2216
2217 2217 /* Check for a valid Memory Region handle in the WR */
2218 2218 mr = (tavor_mrhdl_t)wr->wr.rc.rcwr.bind->bind_ibt_mr_hdl;
2219 2219 if (mr == NULL) {
2220 2220 TNF_PROBE_0(tavor_wr_bind_check_invmrhdl_fail,
2221 2221 TAVOR_TNF_ERROR, "");
2222 2222 TAVOR_TNF_EXIT(tavor_wr_bind_check);
2223 2223 return (IBT_MR_HDL_INVALID);
2224 2224 }
2225 2225
2226 2226 mutex_enter(&mr->mr_lock);
2227 2227 mutex_enter(&mw->mr_lock);
2228 2228
2229 2229 /*
2230 2230 * Check here to see if the memory region has already been partially
2231 2231 * deregistered as a result of a tavor_umap_umemlock_cb() callback.
2232 2232 * If so, this is an error, return failure.
2233 2233 */
2234 2234 if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
2235 2235 mutex_exit(&mr->mr_lock);
2236 2236 mutex_exit(&mw->mr_lock);
2237 2237 TNF_PROBE_0(tavor_wr_bind_check_invmrhdl2_fail,
2238 2238 TAVOR_TNF_ERROR, "");
2239 2239 TAVOR_TNF_EXIT(tavor_wr_bind_check);
2240 2240 return (IBT_MR_HDL_INVALID);
2241 2241 }
2242 2242
2243 2243 /* Check for a valid Memory Window RKey (i.e. a matching RKey) */
2244 2244 if (mw->mr_rkey != wr->wr.rc.rcwr.bind->bind_rkey) {
2245 2245 mutex_exit(&mr->mr_lock);
2246 2246 mutex_exit(&mw->mr_lock);
2247 2247 TNF_PROBE_0(tavor_wr_bind_check_invrkey_fail,
2248 2248 TAVOR_TNF_ERROR, "");
2249 2249 TAVOR_TNF_EXIT(tavor_wr_bind_check);
2250 2250 return (IBT_MR_RKEY_INVALID);
2251 2251 }
2252 2252
2253 2253 /* Check for a valid Memory Region LKey (i.e. a matching LKey) */
2254 2254 if (mr->mr_lkey != wr->wr.rc.rcwr.bind->bind_lkey) {
2255 2255 mutex_exit(&mr->mr_lock);
2256 2256 mutex_exit(&mw->mr_lock);
2257 2257 TNF_PROBE_0(tavor_wr_bind_check_invlkey_fail,
2258 2258 TAVOR_TNF_ERROR, "");
2259 2259 TAVOR_TNF_EXIT(tavor_wr_bind_check);
2260 2260 return (IBT_MR_LKEY_INVALID);
2261 2261 }
2262 2262
2263 2263 /*
2264 2264 * Now check for valid "vaddr" and "len". Note: We don't check the
2265 2265 * "vaddr" range when "len == 0" (i.e. on unbind operations)
2266 2266 */
2267 2267 len = wr->wr.rc.rcwr.bind->bind_len;
2268 2268 if (len != 0) {
2269 2269 vaddr = wr->wr.rc.rcwr.bind->bind_va;
2270 2270 reg_start_addr = mr->mr_bindinfo.bi_addr;
2271 2271 reg_end_addr = mr->mr_bindinfo.bi_addr +
2272 2272 (mr->mr_bindinfo.bi_len - 1);
2273 2273 if ((vaddr < reg_start_addr) || (vaddr > reg_end_addr)) {
2274 2274 mutex_exit(&mr->mr_lock);
2275 2275 mutex_exit(&mw->mr_lock);
2276 2276 TNF_PROBE_0(tavor_wr_bind_check_inv_vaddr_fail,
2277 2277 TAVOR_TNF_ERROR, "");
2278 2278 TAVOR_TNF_EXIT(tavor_wr_bind_check);
2279 2279 return (IBT_MR_VA_INVALID);
2280 2280 }
2281 2281 vaddr = (vaddr + len) - 1;
2282 2282 if (vaddr > reg_end_addr) {
2283 2283 mutex_exit(&mr->mr_lock);
2284 2284 mutex_exit(&mw->mr_lock);
2285 2285 TNF_PROBE_0(tavor_wr_bind_check_invlen_fail,
2286 2286 TAVOR_TNF_ERROR, "");
2287 2287 TAVOR_TNF_EXIT(tavor_wr_bind_check);
2288 2288 return (IBT_MR_LEN_INVALID);
2289 2289 }
2290 2290 }
2291 2291
2292 2292 /*
2293 2293 * Validate the bind access flags. Remote Write and Atomic access for
2294 2294 * the Memory Window require that Local Write access be set in the
2295 2295 * corresponding Memory Region.
2296 2296 */
2297 2297 bind_flags = wr->wr.rc.rcwr.bind->bind_flags;
2298 2298 if (((bind_flags & IBT_WR_BIND_WRITE) ||
2299 2299 (bind_flags & IBT_WR_BIND_ATOMIC)) &&
2300 2300 !(mr->mr_accflag & IBT_MR_LOCAL_WRITE)) {
2301 2301 mutex_exit(&mr->mr_lock);
2302 2302 mutex_exit(&mw->mr_lock);
2303 2303 TNF_PROBE_0(tavor_wr_bind_check_invflags_fail,
2304 2304 TAVOR_TNF_ERROR, "");
2305 2305 TAVOR_TNF_EXIT(tavor_wr_bind_check);
2306 2306 return (IBT_MR_ACCESS_REQ_INVALID);
2307 2307 }
2308 2308
2309 2309 /* Calculate the new RKey for the Memory Window */
2310 2310 mpt = mw->mr_mptrsrcp;
2311 2311 tavor_mr_keycalc(state, mpt->tr_indx, &new_rkey);
2312 2312
2313 2313 wr->wr.rc.rcwr.bind->bind_rkey_out = new_rkey;
2314 2314 mw->mr_rkey = new_rkey;
2315 2315
2316 2316 mutex_exit(&mr->mr_lock);
2317 2317 mutex_exit(&mw->mr_lock);
2318 2318 TAVOR_TNF_EXIT(tavor_wr_bind_check);
2319 2319 return (DDI_SUCCESS);
2320 2320 }
2321 2321
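[Editor's sketch] Stripped of locking and trace probes, the address-range validation in tavor_wr_bind_check() is a simple containment test. A sketch follows (names are illustrative); note that end addresses are computed as addr + len - 1 throughout so that a region ending at the top of the address space does not overflow to zero.

/*
 * Sketch: does [bind_va, bind_va + bind_len) fall entirely within the
 * registered region?  A bind_len of zero is an unbind and is never
 * range-checked, matching the code above.
 */
static int
sketch_bind_in_range(uint64_t bind_va, uint64_t bind_len,
    uint64_t reg_va, uint64_t reg_len)
{
	uint64_t reg_end = reg_va + (reg_len - 1);

	if (bind_len == 0)
		return (1);
	if ((bind_va < reg_va) || (bind_va > reg_end))
		return (0);
	return (((bind_va + bind_len) - 1) <= reg_end);
}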
2322 2322
2323 2323 /*
2324 2324 * tavor_wrid_from_reset_handling()
2325 2325 * Context: Can be called from interrupt or base context.
2326 2326 */
2327 2327 int
2328 2328 tavor_wrid_from_reset_handling(tavor_state_t *state, tavor_qphdl_t qp)
2329 2329 {
2330 2330 tavor_workq_hdr_t *swq, *rwq;
2331 2331 tavor_wrid_list_hdr_t *s_wridlist, *r_wridlist;
2332 2332 uint_t create_new_swq = 0, create_new_rwq = 0;
2333 2333 uint_t create_wql = 0;
2334 2334 uint_t qp_srq_en;
2335 2335
2336 2336 TAVOR_TNF_ENTER(tavor_wrid_from_reset_handling);
2337 2337
2338 2338 /*
2339 2339 * For each of this QP's Work Queues, make sure we have a (properly
2340 2340 * initialized) Work Request ID list attached to the relevant
2341 2341 * completion queue. Grab the CQ lock(s) before manipulating the
2342 2342 * lists.
2343 2343 */
2344 2344 tavor_wrid_wqhdr_lock_both(qp);
2345 2345 swq = tavor_wrid_wqhdr_find(qp->qp_sq_cqhdl, qp->qp_qpnum,
2346 2346 TAVOR_WR_SEND);
2347 2347 if (swq == NULL) {
2348 2348 /* Couldn't find matching work queue header, create it */
2349 2349 create_new_swq = create_wql = 1;
2350 2350 swq = tavor_wrid_wqhdr_create(state, qp->qp_sq_cqhdl,
2351 2351 qp->qp_qpnum, TAVOR_WR_SEND, create_wql);
2352 2352 if (swq == NULL) {
2353 2353 /*
2354 2354 * If we couldn't find/allocate space for the workq
2355 2355 * header, then drop the lock(s) and return failure.
2356 2356 */
2357 2357 tavor_wrid_wqhdr_unlock_both(qp);
2358 2358 TNF_PROBE_0(tavor_wrid_from_reset_handling_wqhdr_fail,
2359 2359 TAVOR_TNF_ERROR, "");
2360 2360 TAVOR_TNF_EXIT(tavor_wrid_from_reset_handling);
2361 2361 return (ibc_get_ci_failure(0));
2362 2362 }
2363 2363 }
2364 - _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*swq))
2365 2364 qp->qp_sq_wqhdr = swq;
2366 2365 swq->wq_size = qp->qp_sq_bufsz;
2367 2366 swq->wq_head = 0;
2368 2367 swq->wq_tail = 0;
2369 2368 swq->wq_full = 0;
2370 2369
2371 2370 /*
2372 2371 * Allocate space for the tavor_wrid_entry_t container
2373 2372 */
2374 2373 s_wridlist = tavor_wrid_get_list(swq->wq_size);
2375 2374 if (s_wridlist == NULL) {
2376 2375 /*
2377 2376 * If we couldn't allocate space for tracking the WRID
2378 2377 * entries, then cleanup the workq header from above (if
2379 2378 * necessary, i.e. if we created the workq header). Then
2380 2379 * drop the lock(s) and return failure.
2381 2380 */
2382 2381 if (create_new_swq) {
2383 2382 tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl, swq);
2384 2383 }
2385 2384
2386 2385 tavor_wrid_wqhdr_unlock_both(qp);
2387 2386 TNF_PROBE_0(tavor_wrid_from_reset_handling_wridlist_fail,
2388 2387 TAVOR_TNF_ERROR, "");
2389 2388 TAVOR_TNF_EXIT(tavor_wrid_from_reset_handling);
2390 2389 return (ibc_get_ci_failure(0));
2391 2390 }
2392 - _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*s_wridlist))
2393 2391 s_wridlist->wl_wqhdr = swq;
2394 2392
2395 2393 /* Chain the new WRID list container to the workq hdr list */
2396 2394 mutex_enter(&swq->wq_wrid_wql->wql_lock);
2397 2395 tavor_wrid_wqhdr_add(swq, s_wridlist);
2398 2396 mutex_exit(&swq->wq_wrid_wql->wql_lock);
2399 2397
2400 2398 qp_srq_en = qp->qp_srq_en;
2401 2399
2402 -#ifdef __lock_lint
2403 - mutex_enter(&qp->qp_srqhdl->srq_lock);
2404 -#else
2405 2400 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2406 2401 mutex_enter(&qp->qp_srqhdl->srq_lock);
2407 2402 }
2408 -#endif
2403 +
2409 2404 /*
2410 2405 * Now we repeat all the above operations for the receive work queue,
2411 2406 * or shared receive work queue.
2412 2407 *
2413 2408 * Note: We still use the 'qp_rq_cqhdl' even in the SRQ case.
2414 2409 */
2415 2410 rwq = tavor_wrid_wqhdr_find(qp->qp_rq_cqhdl, qp->qp_qpnum,
2416 2411 TAVOR_WR_RECV);
2417 2412 if (rwq == NULL) {
2418 2413 create_new_rwq = create_wql = 1;
2419 2414
2420 2415 /*
2421 2416 * If this QP is associated with an SRQ, and this isn't the
2422 2417 * first QP on the SRQ, then the 'srq_wrid_wql' will already be
2423 2418 * created. Since the WQL is created at 'wqhdr_create' time we
2424 2419 * pass in the flag 'create_wql' here to be 0 if we have
2425 2420	 * already created it. Later on below, we then set up the WQL
2426 2421	 * and rwq information based on the existing SRQ info.
2427 2422 */
2428 2423 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED &&
2429 2424 qp->qp_srqhdl->srq_wrid_wql != NULL) {
2430 2425 create_wql = 0;
2431 2426 }
2432 2427
2433 2428 rwq = tavor_wrid_wqhdr_create(state, qp->qp_rq_cqhdl,
2434 2429 qp->qp_qpnum, TAVOR_WR_RECV, create_wql);
2435 2430 if (rwq == NULL) {
2436 2431 /*
2437 2432 * If we couldn't find/allocate space for the workq
2438 2433 * header, then free all the send queue resources we
2439 2434 * just allocated and setup (above), drop the lock(s)
2440 2435 * and return failure.
2441 2436 */
2442 2437 mutex_enter(&swq->wq_wrid_wql->wql_lock);
2443 2438 tavor_wrid_wqhdr_remove(swq, s_wridlist);
2444 2439 mutex_exit(&swq->wq_wrid_wql->wql_lock);
2445 2440 if (create_new_swq) {
2446 2441 tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl,
2447 2442 swq);
2448 2443 }
2449 2444
2450 -#ifdef __lock_lint
2451 - mutex_exit(&qp->qp_srqhdl->srq_lock);
2452 -#else
2453 2445 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2454 2446 mutex_exit(&qp->qp_srqhdl->srq_lock);
2455 2447 }
2456 -#endif
2457 2448
2458 2449 tavor_wrid_wqhdr_unlock_both(qp);
2459 2450 TNF_PROBE_0(tavor_wrid_from_reset_handling_wqhdr_fail,
2460 2451 TAVOR_TNF_ERROR, "");
2461 2452 TAVOR_TNF_EXIT(tavor_wrid_from_reset_handling);
2462 2453 return (ibc_get_ci_failure(0));
2463 2454 }
2464 2455 }
2465 - _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*rwq))
2466 2456
2467 2457 /*
2468 2458 * Setup receive workq hdr
2469 2459 *
2470 2460	 * If the QP is on an SRQ, we set up the SRQ-specific fields, keeping
2471 2461	 * a copy of the rwq pointer, setting the rwq bufsize appropriately,
2472 2462	 * and initializing our part of the WQLock.
2473 2463 *
2474 2464 * In the normal QP case, the QP recv queue bufsize is used.
2475 2465 */
2476 2466 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2477 2467 rwq->wq_size = qp->qp_srqhdl->srq_wq_bufsz;
2478 2468 if (qp->qp_srqhdl->srq_wrid_wql == NULL) {
2479 2469 qp->qp_srqhdl->srq_wrid_wql = rwq->wq_wrid_wql;
2480 2470 } else {
2481 2471 rwq->wq_wrid_wql = qp->qp_srqhdl->srq_wrid_wql;
2482 2472 }
2483 2473 tavor_wql_refcnt_inc(qp->qp_srqhdl->srq_wrid_wql);
2484 2474
2485 2475 } else {
2486 2476 rwq->wq_size = qp->qp_rq_bufsz;
2487 2477 }
2488 2478
2489 2479 qp->qp_rq_wqhdr = rwq;
2490 2480 rwq->wq_head = 0;
2491 2481 rwq->wq_tail = 0;
2492 2482 rwq->wq_full = 0;
2493 2483
2494 2484 /*
2495 2485 * Allocate space for the tavor_wrid_entry_t container.
2496 2486 *
2497 2487	 * If the QP is on an SRQ, and the srq_wridlist is NULL, then we must
2498 2488 * allocate the wridlist normally. However, if the srq_wridlist is !=
2499 2489 * NULL, then we know this SRQ has already been initialized, thus the
2500 2490 * wridlist has already been initialized. So we re-use the
2501 2491 * srq_wridlist as the r_wridlist for this QP in this case.
2502 2492 */
2503 2493 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED &&
2504 2494 qp->qp_srqhdl->srq_wridlist != NULL) {
2505 2495 /* Use existing srq_wridlist pointer */
2506 2496 r_wridlist = qp->qp_srqhdl->srq_wridlist;
2507 2497 ASSERT(r_wridlist != NULL);
2508 2498 } else {
2509 2499 /* Allocate memory for the r_wridlist */
2510 2500 r_wridlist = tavor_wrid_get_list(rwq->wq_size);
2511 2501 }
2512 2502
2513 2503 /*
2514 2504 * If the memory allocation failed for r_wridlist (or the SRQ pointer
2515 2505 * is mistakenly NULL), we cleanup our previous swq allocation from
2516 2506 * above
2517 2507 */
2518 2508 if (r_wridlist == NULL) {
2519 2509 /*
2520 2510 * If we couldn't allocate space for tracking the WRID
2521 2511 * entries, then cleanup all the stuff from above. Then
2522 2512 * drop the lock(s) and return failure.
2523 2513 */
2524 2514 mutex_enter(&swq->wq_wrid_wql->wql_lock);
2525 2515 tavor_wrid_wqhdr_remove(swq, s_wridlist);
2526 2516 mutex_exit(&swq->wq_wrid_wql->wql_lock);
2527 2517 if (create_new_swq) {
2528 2518 tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl, swq);
2529 2519 }
2530 2520 if (create_new_rwq) {
2531 2521 tavor_cq_wqhdr_remove(qp->qp_rq_cqhdl, rwq);
2532 2522 }
2533 2523
2534 -#ifdef __lock_lint
2535 - mutex_exit(&qp->qp_srqhdl->srq_lock);
2536 -#else
2537 2524 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2538 2525 mutex_exit(&qp->qp_srqhdl->srq_lock);
2539 2526 }
2540 -#endif
2541 2527
2542 2528 tavor_wrid_wqhdr_unlock_both(qp);
2543 2529 TNF_PROBE_0(tavor_wrid_from_reset_handling_wridlist_fail,
2544 2530 TAVOR_TNF_ERROR, "");
2545 2531 TAVOR_TNF_EXIT(tavor_wrid_from_reset_handling);
2546 2532 return (ibc_get_ci_failure(0));
2547 2533 }
2548 - _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*r_wridlist))
2549 2534
2550 2535 /*
2551 2536 * Initialize the wridlist
2552 2537 *
2553 2538 * In the normal QP case, there is no special initialization needed.
2554 2539 * We simply setup the wridlist backpointer to be the receive wqhdr
2555 2540 * (rwq).
2556 2541 *
2557 2542 * But in the SRQ case, there is no backpointer to the wqhdr possible.
2558 2543 * Instead we set 'wl_srq_en', specifying this wridlist is on an SRQ
2559 2544 * and thus potentially shared across multiple QPs with the SRQ. We
2560 2545 * also setup the srq_wridlist pointer to be the r_wridlist, and
2561 2546 * intialize the freelist to an invalid index. This srq_wridlist
2562 2547 * pointer is used above on future moves from_reset to let us know that
2563 2548 * the srq_wridlist has been initialized already.
2564 2549 *
2565 2550 * And finally, if we are in a non-UMAP case, we setup the srq wrid
2566 2551 * free list.
2567 2552 */
2568 2553 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED &&
2569 2554 qp->qp_srqhdl->srq_wridlist == NULL) {
2570 2555 r_wridlist->wl_srq_en = 1;
2571 2556 r_wridlist->wl_free_list_indx = -1;
2572 2557 qp->qp_srqhdl->srq_wridlist = r_wridlist;
2573 2558
2574 2559 /* Initialize srq wrid free list */
2575 2560 if (qp->qp_srqhdl->srq_is_umap == 0) {
2576 2561 mutex_enter(&rwq->wq_wrid_wql->wql_lock);
2577 2562 tavor_wrid_list_srq_init(r_wridlist, qp->qp_srqhdl, 0);
2578 2563 mutex_exit(&rwq->wq_wrid_wql->wql_lock);
2579 2564 }
2580 2565 } else {
2581 2566 r_wridlist->wl_wqhdr = rwq;
2582 2567 }
2583 2568
2584 2569 /* Chain the WRID list "container" to the workq hdr list */
2585 2570 mutex_enter(&rwq->wq_wrid_wql->wql_lock);
2586 2571 tavor_wrid_wqhdr_add(rwq, r_wridlist);
2587 2572 mutex_exit(&rwq->wq_wrid_wql->wql_lock);
2588 2573
2589 -#ifdef __lock_lint
2590 - mutex_exit(&qp->qp_srqhdl->srq_lock);
2591 -#else
2592 2574 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2593 2575 mutex_exit(&qp->qp_srqhdl->srq_lock);
2594 2576 }
2595 -#endif
2596 2577
2597 - _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*r_wridlist))
2598 - _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*rwq))
2599 - _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*s_wridlist))
2600 - _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*swq))
2601 -
2602 2578 tavor_wrid_wqhdr_unlock_both(qp);
2603 2579 TAVOR_TNF_EXIT(tavor_wrid_from_reset_handling);
2604 2580 return (DDI_SUCCESS);
2605 2581 }
2606 2582
2607 2583
2608 2584 /*
2609 2585 * tavor_wrid_to_reset_handling()
2610 2586 * Context: Can be called from interrupt or base context.
2611 2587 */
2612 2588 void
2613 2589 tavor_wrid_to_reset_handling(tavor_state_t *state, tavor_qphdl_t qp)
2614 2590 {
2615 2591 uint_t free_wqhdr = 0;
2616 2592
2617 2593 TAVOR_TNF_ENTER(tavor_wrid_to_reset_handling);
2618 2594
2619 2595 /*
2620 2596 * For each of this QP's Work Queues, move the WRID "container" to
2621 2597 * the "reapable" list. Although there may still be unpolled
2622 2598 * entries in these containers, it is not a big deal. We will not
2623 2599 * reap the list until either the Poll CQ command detects an empty
2624 2600 * condition or the CQ itself is freed. Grab the CQ lock(s) before
2625 2601 * manipulating the lists.
2626 2602 */
2627 2603 mutex_enter(&qp->qp_rq_cqhdl->cq_lock);
2628 2604 tavor_wrid_wqhdr_lock_both(qp);
2629 2605 tavor_wrid_reaplist_add(qp->qp_sq_cqhdl, qp->qp_sq_wqhdr);
2630 2606
2631 2607 /*
2632 2608 * Add the receive work queue header on to the reaplist. But if we are
2633 2609 * on SRQ, then don't add anything to the reaplist. Instead we flush
2634 2610 * the SRQ entries on the CQ, remove wridlist from WQHDR, and free the
2635 2611 * WQHDR (if needed). We must hold the WQL for these operations, yet
2636 2612 * the call to tavor_cq_wqhdr_remove grabs the WQL internally. So we
2637 2613 * drop WQL before that call. Then release the CQ WQHDR locks and the
2638 2614 * CQ lock and return.
2639 2615 */
2640 2616 if (qp->qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2641 2617
2642 2618 /*
2643 2619 * Pull off all (if any) entries for this QP from CQ. This
2644 2620 * only includes entries that have not yet been polled
2645 2621 */
2646 2622 mutex_enter(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock);
2647 2623 tavor_cq_srq_entries_flush(state, qp);
2648 2624
2649 2625 /* Remove wridlist from WQHDR */
2650 2626 tavor_wrid_wqhdr_remove(qp->qp_rq_wqhdr,
2651 2627 qp->qp_rq_wqhdr->wq_wrid_post);
2652 2628
2653 2629 /* If wridlist chain is now empty, remove the wqhdr as well */
2654 2630 if (qp->qp_rq_wqhdr->wq_wrid_post == NULL) {
2655 2631 free_wqhdr = 1;
2656 2632 } else {
2657 2633 free_wqhdr = 0;
2658 2634 }
2659 2635
2660 2636 mutex_exit(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock);
2661 2637
2662 2638 /* Free the WQHDR */
2663 2639 if (free_wqhdr) {
2664 2640 tavor_cq_wqhdr_remove(qp->qp_rq_cqhdl, qp->qp_rq_wqhdr);
2665 2641 }
2666 2642 } else {
2667 2643 tavor_wrid_reaplist_add(qp->qp_rq_cqhdl, qp->qp_rq_wqhdr);
2668 2644 }
2669 2645 tavor_wrid_wqhdr_unlock_both(qp);
2670 2646 mutex_exit(&qp->qp_rq_cqhdl->cq_lock);
2671 2647
2672 2648 TAVOR_TNF_EXIT(tavor_wrid_to_reset_handling);
2673 2649 }
2674 2650
2675 2651
2676 2652 /*
2677 2653 * tavor_wrid_add_entry()
2678 2654 * Context: Can be called from interrupt or base context.
2679 2655 */
2680 2656 void
2681 2657 tavor_wrid_add_entry(tavor_workq_hdr_t *wq, uint64_t wrid, uint32_t wqeaddrsz,
2682 2658 uint_t signaled_dbd)
2683 2659 {
2684 2660 tavor_wrid_entry_t *wre_tmp;
2685 2661 uint32_t head, tail, size;
2686 2662
2687 2663 TAVOR_TNF_ENTER(tavor_wrid_add_entry);
2688 2664
2689 2665 ASSERT(MUTEX_HELD(&wq->wq_wrid_wql->wql_lock));
2690 2666
2691 2667 /*
2692 2668 * Find the entry in the container pointed to by the "tail" index.
2693 2669 * Add all of the relevant information to that entry, including WRID,
2694 2670 * "wqeaddrsz" parameter, and whether it was signaled/unsignaled
2695 2671 * and/or doorbelled.
2696 2672 */
2697 2673 head = wq->wq_wrid_post->wl_head;
2698 2674 tail = wq->wq_wrid_post->wl_tail;
2699 2675 size = wq->wq_wrid_post->wl_size;
2700 2676 wre_tmp = &wq->wq_wrid_post->wl_wre[tail];
2701 2677 wre_tmp->wr_wrid = wrid;
2702 2678 wre_tmp->wr_wqeaddrsz = wqeaddrsz;
2703 2679 wre_tmp->wr_signaled_dbd = signaled_dbd;
2704 2680
2705 2681 /*
2706 2682 * Update the "wrid_old_tail" pointer to point to the entry we just
2707 2683 * inserted into the queue. By tracking this pointer (the pointer to
2708 2684	 * the most recently inserted entry) it will be possible later in the
2709 2685 * PostSend() and PostRecv() code paths to find the entry that needs
2710 2686 * its "doorbelled" flag set (see comment in tavor_post_recv() and/or
2711 2687 * tavor_post_send()).
2712 2688 */
2713 2689 wq->wq_wrid_post->wl_wre_old_tail = wre_tmp;
2714 2690
2715 2691 /* Update the tail index */
2716 2692 tail = ((tail + 1) & (size - 1));
2717 2693 wq->wq_wrid_post->wl_tail = tail;
2718 2694
2719 2695 /*
2720 2696 * If the "tail" index has just wrapped over into the "head" index,
2721 2697 * then we have filled the container. We use the "full" flag to
2722 2698 * indicate this condition and to distinguish it from the "empty"
2723 2699 * condition (where head and tail are also equal).
2724 2700 */
2725 2701 if (head == tail) {
2726 2702 wq->wq_wrid_post->wl_full = 1;
2727 2703 }
2728 2704 TAVOR_TNF_EXIT(tavor_wrid_add_entry);
2729 2705 }
2730 2706
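[Editor's sketch] The head/tail/full bookkeeping above is the standard power-of-two ring discipline: because "head == tail" is ambiguous between empty and full, an explicit "full" flag breaks the tie. A sketch of the producer side follows (the consumer in tavor_wrid_find_match() clears the flag as it drains entries); the sketch_* names are illustrative.

/*
 * Sketch: producer side of the WRID ring.  "sr_size" must be a power
 * of two so that wrap-around is a simple mask, as in the code above.
 */
typedef struct sketch_ring {
	uint32_t	sr_head;
	uint32_t	sr_tail;
	uint32_t	sr_size;
	uint_t		sr_full;
} sketch_ring_t;

static int
sketch_ring_push(sketch_ring_t *r)
{
	if (r->sr_full)
		return (0);			/* container is full */
	r->sr_tail = (r->sr_tail + 1) & (r->sr_size - 1);
	if (r->sr_tail == r->sr_head)
		r->sr_full = 1;			/* tail wrapped onto head */
	return (1);
}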
2731 2707 /*
2732 2708 * tavor_wrid_add_entry_srq()
2733 2709 * Context: Can be called from interrupt or base context
2734 2710 */
2735 2711 void
2736 2712 tavor_wrid_add_entry_srq(tavor_srqhdl_t srq, uint64_t wrid, uint_t signaled_dbd)
2737 2713 {
2738 2714 tavor_wrid_entry_t *wre;
2739 2715 uint64_t *wl_wqe;
2740 2716 uint32_t wqe_index;
2741 2717
2742 2718 TAVOR_TNF_ENTER(tavor_wrid_add_entry_srq);
2743 2719
2744 2720 /*
2745 2721 * Find the next available WQE from the SRQ free_list. Then update the
2746 2722 * free_list to point to the next entry
2747 2723 */
2748 2724 wl_wqe = TAVOR_SRQ_WQE_ADDR(srq, srq->srq_wridlist->wl_free_list_indx);
2749 2725
2750 2726 wqe_index = srq->srq_wridlist->wl_free_list_indx;
2751 2727
2752 2728 /* ASSERT on impossible wqe_index values */
2753 2729 ASSERT(wqe_index < srq->srq_wq_bufsz);
2754 2730
2755 2731 /*
2756 2732 * Setup the WRE.
2757 2733 *
2758 2734 * Given the 'wqe_index' value, we store the WRID at this WRE offset.
2759 2735 * And we set the WRE to be signaled_dbd so that on poll CQ we can find
2760 2736 * this information and associate the WRID to the WQE found on the CQE.
2761 2737 */
2762 2738 wre = &srq->srq_wridlist->wl_wre[wqe_index];
2763 2739 wre->wr_wrid = wrid;
2764 2740 wre->wr_signaled_dbd = signaled_dbd;
2765 2741
2766 2742 /* Update the free list index */
2767 2743 srq->srq_wridlist->wl_free_list_indx = ddi_get32(
2768 2744 srq->srq_wridlist->wl_acchdl, (uint32_t *)wl_wqe);
2769 2745
2770 2746 TAVOR_TNF_EXIT(tavor_wrid_add_entry_srq);
2771 2747 }
2772 2748
2773 2749
2774 2750 /*
2775 2751 * tavor_wrid_get_entry()
2776 2752 * Context: Can be called from interrupt or base context.
2777 2753 */
2778 2754 uint64_t
2779 2755 tavor_wrid_get_entry(tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe,
2780 2756 tavor_wrid_entry_t *wre)
2781 2757 {
2782 2758 tavor_workq_hdr_t *wq;
2783 2759 tavor_wrid_entry_t *wre_tmp;
2784 2760 uint64_t wrid;
2785 2761 uint_t send_or_recv, qpnum, error, opcode;
2786 2762
2787 2763 TAVOR_TNF_ENTER(tavor_wrid_get_entry);
2788 2764
2789 2765 /* Lock the list of work queues associated with this CQ */
2790 2766 mutex_enter(&cq->cq_wrid_wqhdr_lock);
2791 2767
2792 2768 /*
2793 2769 * Determine whether this CQE is a send or receive completion (and
2794 2770 * whether it was a "successful" completion or not)
2795 2771 */
2796 2772 opcode = TAVOR_CQE_OPCODE_GET(cq, cqe);
2797 2773 if ((opcode == TAVOR_CQE_SEND_ERR_OPCODE) ||
2798 2774 (opcode == TAVOR_CQE_RECV_ERR_OPCODE)) {
2799 2775 error = 1;
2800 2776 send_or_recv = (opcode == TAVOR_CQE_SEND_ERR_OPCODE) ?
2801 2777 TAVOR_COMPLETION_SEND : TAVOR_COMPLETION_RECV;
2802 2778 } else {
2803 2779 error = 0;
2804 2780 send_or_recv = TAVOR_CQE_SENDRECV_GET(cq, cqe);
2805 2781 }
2806 2782
2807 2783 /* Find the work queue for this QP number (send or receive side) */
2808 2784 qpnum = TAVOR_CQE_QPNUM_GET(cq, cqe);
2809 2785 wq = tavor_wrid_wqhdr_find(cq, qpnum, send_or_recv);
2810 2786 ASSERT(wq != NULL);
2811 2787
2812 2788 /*
2813 2789 * Regardless of whether the completion is the result of a "success"
2814 2790 * or a "failure", we lock the list of "containers" and attempt to
2815 2791	 * search for the first matching completion (i.e. the first WR
2816 2792 * with a matching WQE addr and size). Once we find it, we pull out
2817 2793 * the "wrid" field and return it (see below). Note: One possible
2818 2794 * future enhancement would be to enable this routine to skip over
2819 2795 * any "unsignaled" completions to go directly to the next "signaled"
2820 2796 * entry on success. XXX
2821 2797 */
2822 2798 mutex_enter(&wq->wq_wrid_wql->wql_lock);
2823 2799 wre_tmp = tavor_wrid_find_match(wq, cq, cqe);
2824 2800
2825 2801 /*
2826 2802 * If this is a "successful" completion, then we assert that this
2827 2803 * completion must be a "signaled" completion.
2828 2804 */
2829 2805 ASSERT(error || (wre_tmp->wr_signaled_dbd & TAVOR_WRID_ENTRY_SIGNALED));
2830 2806
2831 2807 /*
2832 2808 * If the completion is a "failed" completion, then we save away the
2833 2809 * contents of the entry (into the "wre" field passed in) for use
2834 2810 * in later CQE processing. Note: We use the tavor_wrid_get_wqeaddrsz()
2835 2811 * function to grab "wqeaddrsz" from the next entry in the container.
2836 2812 * This is required for error processing (where updating these fields
2837 2813 * properly is necessary to correct handling of the "error" CQE)
2838 2814 */
2839 2815 if (error && (wre != NULL)) {
2840 2816 *wre = *wre_tmp;
2841 2817 wre->wr_wqeaddrsz = tavor_wrid_get_wqeaddrsz(wq);
2842 2818 }
2843 2819
2844 2820 /* Pull out the WRID and return it */
2845 2821 wrid = wre_tmp->wr_wrid;
2846 2822
2847 2823 mutex_exit(&wq->wq_wrid_wql->wql_lock);
2848 2824 mutex_exit(&cq->cq_wrid_wqhdr_lock);
2849 2825
2850 2826 TAVOR_TNF_EXIT(tavor_wrid_get_entry);
2851 2827 return (wrid);
2852 2828 }
2853 2829
2854 2830
2855 2831 /*
2856 2832 * tavor_wrid_find_match()
2857 2833 * Context: Can be called from interrupt or base context.
2858 2834 */
2859 2835 static tavor_wrid_entry_t *
2860 2836 tavor_wrid_find_match(tavor_workq_hdr_t *wq, tavor_cqhdl_t cq,
2861 2837 tavor_hw_cqe_t *cqe)
2862 2838 {
2863 2839 tavor_wrid_entry_t *curr = NULL;
2864 2840 tavor_wrid_list_hdr_t *container;
2865 2841 uint32_t wqeaddr_size;
2866 2842 uint32_t head, tail, size;
2867 2843 int found = 0, last_container;
2868 2844
2869 2845 TAVOR_TNF_ENTER(tavor_wrid_find_match);
2870 2846
2871 2847 ASSERT(MUTEX_HELD(&wq->wq_wrid_wql->wql_lock));
2872 2848
2873 2849 /* Pull the "wqeaddrsz" information from the CQE */
2874 2850 wqeaddr_size = TAVOR_CQE_WQEADDRSZ_GET(cq, cqe);
2875 2851
2876 2852 /*
2877 2853 * Walk the "containers" list(s), find first WR with a matching WQE
2878 2854 * addr. If the current "container" is not the last one on the list,
2879 2855 * i.e. not the current one to which we are posting new WRID entries,
2880 2856 * then we do not attempt to update the "q_head", "q_tail", and
2881 2857 * "q_full" indicators on the main work queue header. We do, however,
2882 2858 * update the "head" and "full" indicators on the individual containers
2883 2859 * as we go. This is imperative because we need to be able to
2884 2860 * determine when the current container has been emptied (so that we
2885 2861 * can move on to the next container).
2886 2862 */
2887 2863 container = wq->wq_wrid_poll;
2888 2864 while (container != NULL) {
2889 2865 /* Is this the last/only "container" on the list */
2890 2866 last_container = (container != wq->wq_wrid_post) ? 0 : 1;
2891 2867
2892 2868 /*
2893 2869 * First check if we are on an SRQ. If so, we grab the entry
2894 2870 * and break out. Since SRQ wridlist's are never added to
2895 2871 * reaplist, they can only be the last container.
2896 2872 */
2897 2873 if (container->wl_srq_en) {
2898 2874 ASSERT(last_container == 1);
2899 2875 curr = tavor_wrid_find_match_srq(container, cq, cqe);
2900 2876 break;
2901 2877 }
2902 2878
2903 2879 /*
2904 2880 * Grab the current "head", "tail" and "size" fields before
2905 2881 * walking the list in the current container. Note: the "size"
2906 2882 * field here must always be a power-of-2. The "full"
2907 2883 * parameter is checked (and updated) here to distinguish the
2908 2884 * "queue full" condition from "queue empty".
2909 2885 */
2910 2886 head = container->wl_head;
2911 2887 tail = container->wl_tail;
2912 2888 size = container->wl_size;
2913 2889 while ((head != tail) || (container->wl_full)) {
2914 2890 container->wl_full = 0;
2915 2891 curr = &container->wl_wre[head];
2916 2892 head = ((head + 1) & (size - 1));
2917 2893
2918 2894 /*
2919 2895 * If the current entry's "wqeaddrsz" matches the one
2920 2896 * we're searching for, then this must correspond to
2921 2897 * the work request that caused the completion. Set
2922 2898 * the "found" flag and bail out.
2923 2899 */
2924 2900 if (curr->wr_wqeaddrsz == wqeaddr_size) {
2925 2901 found = 1;
2926 2902 break;
2927 2903 }
2928 2904 }
2929 2905
2930 2906 /*
2931 2907 * If the current container is empty (having reached here the
2932 2908 * "head == tail" condition can only mean that the container
2933 2909 * is empty), then NULL out the "wrid_old_tail" field (see
2934 2910 * tavor_post_send() and tavor_post_recv() for more details)
2935 2911 * and (potentially) remove the current container from future
2936 2912 * searches.
2937 2913 */
2938 2914 if (head == tail) {
2939 2915
2940 2916 container->wl_wre_old_tail = NULL;
2941 2917 /*
2942 2918 * If this wasn't the last "container" on the chain,
2943 2919 * i.e. the one to which new WRID entries will be
2944 2920 * added, then remove it from the list.
2945 2921 * Note: we don't "lose" the memory pointed to by this
2946 2922 * because we should have already put this container
2947 2923 * on the "reapable" list (from where it will later be
2948 2924 * pulled).
2949 2925 */
2950 2926 if (!last_container) {
2951 2927 wq->wq_wrid_poll = container->wl_next;
2952 2928 }
2953 2929 }
2954 2930
2955 2931 /* Update the head index for the container */
2956 2932 container->wl_head = head;
2957 2933
2958 2934 /*
2959 2935 * If the entry was found in this container, then continue to
2960 2936 * bail out. Else reset the "curr" pointer and move on to the
2961 2937 * next container (if there is one). Note: the only real
2962 2938 * reason for setting "curr = NULL" here is so that the ASSERT
2963 2939 * below can catch the case where no matching entry was found
2964 2940 * on any of the lists.
2965 2941 */
2966 2942 if (found) {
2967 2943 break;
2968 2944 } else {
2969 2945 curr = NULL;
2970 2946 container = container->wl_next;
2971 2947 }
2972 2948 }
2973 2949
2974 2950 /*
2975 2951 * Update work queue header's "head" and "full" conditions to match
2976 2952 * the last entry on the container list. (Note: Only if we're pulling
2977 2953 * entries from the last work queue portion of the list, i.e. not from
2978 2954 * the previous portions that may be the "reapable" list.)
2979 2955 */
2980 2956 if (last_container) {
2981 2957 wq->wq_head = wq->wq_wrid_post->wl_head;
2982 2958 wq->wq_full = wq->wq_wrid_post->wl_full;
2983 2959 }
2984 2960
2985 2961 /* Ensure that we've actually found what we were searching for */
2986 2962 ASSERT(curr != NULL);
2987 2963
2988 2964 TAVOR_TNF_EXIT(tavor_wrid_find_match);
2989 2965 return (curr);
2990 2966 }
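
/*
 * Illustrative sketch (editor's addition, not driver source): the
 * container walk above is a power-of-two ring buffer in which
 * "head == tail" alone is ambiguous, so a separate "full" flag is
 * needed to tell a full queue from an empty one.  Names here are
 * hypothetical.
 */
#include <stdint.h>

typedef struct ring {
	uint32_t	r_head;		/* next entry to consume */
	uint32_t	r_tail;		/* next entry to fill */
	uint32_t	r_size;		/* must be a power of two */
	int		r_full;		/* disambiguates head == tail */
} ring_t;

/* Consume one entry; returns the consumed index, or -1 if empty */
static int
ring_consume(ring_t *r)
{
	uint32_t curr;

	if ((r->r_head == r->r_tail) && !r->r_full)
		return (-1);			/* queue is empty */
	r->r_full = 0;
	curr = r->r_head;
	r->r_head = (r->r_head + 1) & (r->r_size - 1);	/* masked wrap */
	return ((int)curr);
}
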
2991 2967
2992 2968
2993 2969 /*
2994 2970 * tavor_wrid_find_match_srq()
2995 2971 * Context: Can be called from interrupt or base context.
2996 2972 */
2997 2973 tavor_wrid_entry_t *
2998 2974 tavor_wrid_find_match_srq(tavor_wrid_list_hdr_t *wl, tavor_cqhdl_t cq,
2999 2975 tavor_hw_cqe_t *cqe)
3000 2976 {
3001 2977 tavor_wrid_entry_t *wre;
3002 2978 uint64_t *wl_wqe;
3003 2979 uint32_t wqe_index;
3004 2980 uint64_t wqe_addr;
3005 2981 uint32_t cqe_wqe_addr;
3006 2982
3007 2983 /* Grab the WQE addr out of the CQE */
3008 2984 cqe_wqe_addr = TAVOR_CQE_WQEADDRSZ_GET(cq, cqe) & 0xFFFFFFC0;
3009 2985
3010 2986 /*
3011 2987 	 * Use the WQE addr as the lower 32 bits; we add back the
3012 2988 	 * 'wl_srq_desc_off' because we have a zero-based queue.  OR'ing
3013 2989 	 * on the upper 32 bits of 'wl_srq_wq_buf' then gives us the WQE
3014 2990 	 * addr in the SRQ Work Queue itself.  We use this address as the
3015 2991 	 * index to find out which Work Queue Entry this CQE corresponds with.
3016 2992 *
3017 2993 * We also use this address below to add the WQE back on to the free
3018 2994 * list.
3019 2995 */
3020 2996 wqe_addr = ((uintptr_t)wl->wl_srq_wq_buf & 0xFFFFFFFF00000000ull) |
3021 2997 (cqe_wqe_addr + wl->wl_srq_desc_off);
3022 2998
3023 2999 /*
3024 3000 * Given the 'wqe_addr' just calculated and the srq buf address, we
3025 3001 * find the 'wqe_index'. The 'wre' returned below contains the WRID
3026 3002 * that we are looking for. This indexes into the wre_list for this
3027 3003 * specific WQE.
3028 3004 */
3029 3005 wqe_index = TAVOR_SRQ_WQE_INDEX(wl->wl_srq_wq_buf, wqe_addr,
3030 3006 wl->wl_srq_log_wqesz);
3031 3007
3032 3008 /* ASSERT on impossible wqe_index values */
3033 3009 ASSERT(wqe_index < wl->wl_srq_wq_bufsz);
3034 3010
3035 3011 /* Get the pointer to this WQE */
3036 3012 wl_wqe = (uint64_t *)(uintptr_t)wqe_addr;
3037 3013
3038 3014 /* Put this WQE index back on the free list */
3039 3015 ddi_put32(wl->wl_acchdl, (uint32_t *)wl_wqe, wl->wl_free_list_indx);
3040 3016 wl->wl_free_list_indx = wqe_index;
3041 3017
3042 3018 /* Using the index, return the Work Request ID Entry (wre) */
3043 3019 wre = &wl->wl_wre[wqe_index];
3044 3020
3045 3021 return (wre);
3046 3022 }
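
/*
 * Illustrative sketch (editor's addition, not driver source): the
 * 64-bit WQE address reconstruction performed above, together with a
 * plausible expansion of the TAVOR_SRQ_WQE_INDEX() macro (an
 * assumption; the real macro lives in the tavor headers).  The queue
 * base supplies the upper 32 bits, the descriptor offset rebases the
 * zero-based CQE value, and the index falls out of a shift by the
 * log2 WQE size.
 */
#include <stdint.h>

static uint32_t
srq_wqe_index(uintptr_t wq_buf, uint32_t cqe_wqe_addr, uint32_t desc_off,
    uint32_t log_wqesz)
{
	uint64_t wqe_addr;

	/* Upper 32 bits from the queue buffer, lower 32 from the CQE */
	wqe_addr = ((uint64_t)wq_buf & 0xFFFFFFFF00000000ull) |
	    (cqe_wqe_addr + desc_off);

	/* Byte offset into the queue, divided by the WQE size */
	return ((uint32_t)((wqe_addr - (uint64_t)wq_buf) >> log_wqesz));
}
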
3047 3023
3048 3024
3049 3025 /*
3050 3026 * tavor_wrid_cq_reap()
3051 3027 * Context: Can be called from interrupt or base context.
3052 3028 */
3053 3029 void
3054 3030 tavor_wrid_cq_reap(tavor_cqhdl_t cq)
3055 3031 {
3056 3032 tavor_workq_hdr_t *consume_wqhdr;
3057 3033 tavor_wrid_list_hdr_t *container, *to_free;
3058 3034
3059 3035 ASSERT(MUTEX_HELD(&cq->cq_lock));
3060 3036
3061 3037 TAVOR_TNF_ENTER(tavor_wrid_cq_reap);
3062 3038
3063 3039 /* Lock the list of work queues associated with this CQ */
3064 3040 mutex_enter(&cq->cq_wrid_wqhdr_lock);
3065 3041
3066 3042 /* Walk the "reapable" list and free up containers */
3067 3043 container = cq->cq_wrid_reap_head;
3068 3044 while (container != NULL) {
3069 3045 to_free = container;
3070 3046 container = container->wl_reap_next;
3071 3047 /*
3072 3048 * If reaping the WRID list containers pulls the last
3073 3049 * container from the given work queue header, then we free
3074 3050 * the work queue header as well.
3075 3051 */
3076 3052 consume_wqhdr = tavor_wrid_list_reap(to_free);
3077 3053 if (consume_wqhdr != NULL) {
3078 3054 tavor_cq_wqhdr_remove(cq, consume_wqhdr);
3079 3055 }
3080 3056 }
3081 3057
3082 3058 /* Once finished reaping, we reset the CQ's reap list */
3083 3059 cq->cq_wrid_reap_head = cq->cq_wrid_reap_tail = NULL;
3084 3060
3085 3061 mutex_exit(&cq->cq_wrid_wqhdr_lock);
3086 3062 TAVOR_TNF_EXIT(tavor_wrid_cq_reap);
3087 3063 }
3088 3064
3089 3065
3090 3066 /*
3091 3067 * tavor_wrid_cq_force_reap()
3092 3068 * Context: Can be called from interrupt or base context.
3093 3069 */
3094 3070 void
3095 3071 tavor_wrid_cq_force_reap(tavor_cqhdl_t cq)
3096 3072 {
3097 3073 tavor_workq_hdr_t *curr;
3098 3074 tavor_wrid_list_hdr_t *container, *to_free;
3099 3075 avl_tree_t *treep;
3100 3076 void *cookie = NULL;
3101 3077
3102 3078 ASSERT(MUTEX_HELD(&cq->cq_lock));
3103 3079
3104 3080 	TAVOR_TNF_ENTER(tavor_wrid_cq_force_reap);
3105 3081
3106 3082 /*
3107 3083 * The first step is to walk the "reapable" list and free up those
3108 3084 * containers. This is necessary because the containers on the
3109 3085 * reapable list are not otherwise connected to the work queue headers
3110 3086 * anymore.
3111 3087 */
3112 3088 tavor_wrid_cq_reap(cq);
3113 3089
3114 3090 /* Now lock the list of work queues associated with this CQ */
3115 3091 mutex_enter(&cq->cq_wrid_wqhdr_lock);
3116 3092
3117 3093 /*
3118 3094 * Walk the list of work queue headers and free up all the WRID list
3119 3095 * containers chained to it. Note: We don't need to grab the locks
3120 3096 * for each of the individual WRID lists here because the only way
3121 3097 * things can be added or removed from the list at this point would be
3122 3098 	 * by posting a work request to a QP.  But if we've come this far,
3123 3099 	 * then we can be assured that there are no longer any QPs associated
3124 3100 * with the CQ that we are trying to free.
3125 3101 */
3126 -#ifdef __lock_lint
3127 - tavor_wrid_wqhdr_compare(NULL, NULL);
3128 -#endif
3129 3102 treep = &cq->cq_wrid_wqhdr_avl_tree;
3130 3103 while ((curr = avl_destroy_nodes(treep, &cookie)) != NULL) {
3131 - _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*curr))
3132 3104 container = curr->wq_wrid_poll;
3133 3105 while (container != NULL) {
3134 3106 to_free = container;
3135 3107 container = container->wl_next;
3136 3108 /*
3137 3109 * If reaping the WRID list containers pulls the last
3138 3110 * container from the given work queue header, then
3139 3111 * we free the work queue header as well. Note: we
3140 3112 * ignore the return value because we know that the
3141 3113 * work queue header should always be freed once the
3142 3114 * list of containers has come to an end.
3143 3115 */
3144 3116 (void) tavor_wrid_list_reap(to_free);
3145 3117 if (container == NULL) {
3146 3118 tavor_cq_wqhdr_remove(cq, curr);
3147 3119 }
3148 3120 }
3149 3121 }
3150 3122 avl_destroy(treep);
3151 3123
3152 3124 mutex_exit(&cq->cq_wrid_wqhdr_lock);
3153 3125 	TAVOR_TNF_EXIT(tavor_wrid_cq_force_reap);
3154 3126 }
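
/*
 * Illustrative sketch (editor's addition, not driver source): the
 * avl_destroy_nodes() teardown idiom used above, assuming the
 * <sys/avl.h> and <sys/kmem.h> interfaces.  Each call hands back one
 * node without rebalancing, with "cookie" carrying the walk state;
 * avl_destroy() then releases the (now empty) tree's own state.  The
 * node type here is hypothetical.
 */
typedef struct my_node {
	avl_node_t	mn_avl_link;	/* embedded AVL linkage */
} my_node_t;

static void
my_tree_teardown(avl_tree_t *treep)
{
	my_node_t	*node;
	void		*cookie = NULL;

	while ((node = avl_destroy_nodes(treep, &cookie)) != NULL)
		kmem_free(node, sizeof (my_node_t));
	avl_destroy(treep);
}
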
3155 3127
3156 3128
3157 3129 /*
3158 3130 * tavor_wrid_get_list()
3159 3131 * Context: Can be called from interrupt or base context.
3160 3132 */
3161 3133 tavor_wrid_list_hdr_t *
3162 3134 tavor_wrid_get_list(uint32_t qsize)
3163 3135 {
3164 3136 tavor_wrid_list_hdr_t *wridlist;
3165 3137 uint32_t size;
3166 3138
3167 3139 /*
3168 3140 * The WRID list "container" consists of the tavor_wrid_list_hdr_t,
3169 3141 * which holds the pointers necessary for maintaining the "reapable"
3170 3142 * list, chaining together multiple "containers" old and new, and
3171 3143 * tracking the head, tail, size, etc. for each container.
3172 3144 *
3173 3145 	 * The "container" also holds all the tavor_wrid_entry_t's, which
3174 3146 	 * are allocated separately, one for each entry on the corresponding work
3175 3147 * queue.
3176 3148 */
3177 3149 size = sizeof (tavor_wrid_list_hdr_t);
3178 3150
3179 3151 /*
3180 3152 * Note that this allocation has to be a NOSLEEP operation here
3181 3153 * because we are holding the "wqhdr_list_lock" and, therefore,
3182 3154 * could get raised to the interrupt level.
3183 3155 */
3184 3156 wridlist = (tavor_wrid_list_hdr_t *)kmem_zalloc(size, KM_NOSLEEP);
3185 3157 if (wridlist == NULL) {
3186 3158 return (NULL);
3187 3159 }
3188 - _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*wridlist))
3189 3160
3190 3161 /* Complete the "container" initialization */
3191 3162 wridlist->wl_size = qsize;
3192 3163 wridlist->wl_full = 0;
3193 3164 wridlist->wl_head = 0;
3194 3165 wridlist->wl_tail = 0;
3195 3166 wridlist->wl_wre = (tavor_wrid_entry_t *)kmem_zalloc(qsize *
3196 3167 sizeof (tavor_wrid_entry_t), KM_NOSLEEP);
3197 3168 if (wridlist->wl_wre == NULL) {
3198 3169 kmem_free(wridlist, size);
3199 3170 return (NULL);
3200 3171 }
3201 3172 wridlist->wl_wre_old_tail = NULL;
3202 3173 wridlist->wl_reap_next = NULL;
3203 3174 wridlist->wl_next = NULL;
3204 3175 wridlist->wl_prev = NULL;
3205 3176 wridlist->wl_srq_en = 0;
3206 3177
3207 3178 return (wridlist);
3208 3179 }
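
/*
 * Illustrative sketch (editor's addition, not driver source): the
 * two-stage KM_NOSLEEP allocation-with-rollback pattern used above.
 * Because the caller may hold a lock at interrupt level, neither
 * allocation may sleep, and a failed second allocation must unwind
 * the first.  Types here are hypothetical stand-ins.
 */
typedef struct my_entry { uint64_t me_wrid; } my_entry_t;
typedef struct my_list {
	uint32_t	ml_size;
	my_entry_t	*ml_entries;
} my_list_t;

static my_list_t *
my_list_alloc(uint32_t qsize)
{
	my_list_t *lp;

	lp = kmem_zalloc(sizeof (my_list_t), KM_NOSLEEP);
	if (lp == NULL)
		return (NULL);
	lp->ml_entries = kmem_zalloc(qsize * sizeof (my_entry_t),
	    KM_NOSLEEP);
	if (lp->ml_entries == NULL) {
		kmem_free(lp, sizeof (my_list_t));	/* roll back */
		return (NULL);
	}
	lp->ml_size = qsize;
	return (lp);
}
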
3209 3180
3210 3181 /*
3211 3182 * tavor_wrid_list_srq_init()
3212 3183 * Context: Can be called from interrupt or base context
3213 3184 */
3214 3185 void
3215 3186 tavor_wrid_list_srq_init(tavor_wrid_list_hdr_t *wridlist, tavor_srqhdl_t srq,
3216 3187 uint_t wq_start)
3217 3188 {
3218 3189 uint64_t *wl_wqe;
3219 3190 int wqe_index;
3220 3191
3221 3192 ASSERT(MUTEX_HELD(&srq->srq_wrid_wql->wql_lock));
3222 3193
3223 3194 /* Setup pointers for use later when we are polling the CQ */
3224 3195 wridlist->wl_srq_wq_buf = srq->srq_wq_buf;
3225 3196 wridlist->wl_srq_wq_bufsz = srq->srq_wq_bufsz;
3226 3197 wridlist->wl_srq_log_wqesz = srq->srq_wq_log_wqesz;
3227 3198 wridlist->wl_srq_desc_off = srq->srq_desc_off;
3228 3199 wridlist->wl_acchdl = srq->srq_wqinfo.qa_acchdl;
3229 3200
3230 3201 	/* Sanity-check wq_start, the index at which buf initialization begins */
3231 3202 ASSERT(wq_start >= 0 && wq_start < srq->srq_wq_bufsz);
3232 3203
3233 3204 /*
3234 3205 * Initialize wridlist free list
3235 3206 *
3236 3207 	 * For each WQE, up to the size of our queue, we store an index in the WQ
3237 3208 * memory itself, representing the next available free entry. The
3238 3209 * 'wl_free_list_indx' always holds the index of the next available
3239 3210 * free entry in the WQ. If 'wl_free_list_indx' is -1, then we are
3240 3211 * completely full. This gives us the advantage of being able to have
3241 3212 * entries complete or be polled off the WQ out-of-order.
3242 3213 *
3243 3214 * For now, we write the free_list entries inside the WQ itself. It
3244 3215 * may be useful in the future to store this information in a separate
3245 3216 * structure for debugging purposes.
3246 3217 */
3247 3218 for (wqe_index = wq_start; wqe_index < srq->srq_wq_bufsz; wqe_index++) {
3248 3219 wl_wqe = TAVOR_SRQ_WQE_ADDR(srq, wqe_index);
3249 3220 ddi_put32(wridlist->wl_acchdl, (uint32_t *)wl_wqe,
3250 3221 wridlist->wl_free_list_indx);
3251 3222 wridlist->wl_free_list_indx = wqe_index;
3252 3223 }
3253 3224 }
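
/*
 * Illustrative sketch (editor's addition, not driver source): the
 * intrusive free list built above.  Each free WQE's first 32-bit word
 * stores the index of the next free entry, and a single head index
 * threads them together (with a sentinel meaning "no free entries,
 * queue completely full"), so entries can complete and be recycled
 * out of order.  Plain-C stand-in for the ddi_put32()-based kernel
 * version; the pop side, presumably used when posting, is inferred.
 */
#include <stdint.h>

#define	FREE_NONE	0xFFFFFFFFu	/* no free entries: queue full */

/* Push entry "indx" onto the free list; "stride" is WQE size in words */
static void
free_list_push(uint32_t *wq_words, uint32_t stride, uint32_t *headp,
    uint32_t indx)
{
	wq_words[indx * stride] = *headp;	/* link to old head */
	*headp = indx;
}

/* Pop the next free entry, or FREE_NONE if none remain */
static uint32_t
free_list_pop(uint32_t *wq_words, uint32_t stride, uint32_t *headp)
{
	uint32_t indx = *headp;

	if (indx != FREE_NONE)
		*headp = wq_words[indx * stride];
	return (indx);
}
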
3254 3225
3255 3226
3256 3227 /*
3257 3228 * tavor_wrid_reaplist_add()
3258 3229 * Context: Can be called from interrupt or base context.
3259 3230 */
3260 3231 static void
3261 3232 tavor_wrid_reaplist_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wq)
3262 3233 {
3263 3234 ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3264 3235
3265 3236 TAVOR_TNF_ENTER(tavor_wrid_reaplist_add);
3266 3237
3267 3238 mutex_enter(&wq->wq_wrid_wql->wql_lock);
3268 3239
3269 3240 /*
3270 3241 * Add the "post" container (the last one on the current chain) to
3271 3242 * the CQ's "reapable" list
3272 3243 */
3273 3244 if ((cq->cq_wrid_reap_head == NULL) &&
3274 3245 (cq->cq_wrid_reap_tail == NULL)) {
3275 3246 cq->cq_wrid_reap_head = wq->wq_wrid_post;
3276 3247 cq->cq_wrid_reap_tail = wq->wq_wrid_post;
3277 3248 } else {
3278 3249 cq->cq_wrid_reap_tail->wl_reap_next = wq->wq_wrid_post;
3279 3250 cq->cq_wrid_reap_tail = wq->wq_wrid_post;
3280 3251 }
3281 3252
3282 3253 mutex_exit(&wq->wq_wrid_wql->wql_lock);
3283 3254 }
3284 3255
3285 3256
3286 3257 int
3287 3258 tavor_wrid_wqhdr_compare(const void *p1, const void *p2)
3288 3259 {
3289 3260 tavor_workq_compare_t *cmpp;
3290 3261 tavor_workq_hdr_t *curr;
3291 3262
3292 3263 cmpp = (tavor_workq_compare_t *)p1;
3293 3264 curr = (tavor_workq_hdr_t *)p2;
3294 3265
3295 3266 if (cmpp->cmp_qpn < curr->wq_qpn)
3296 3267 return (-1);
3297 3268 else if (cmpp->cmp_qpn > curr->wq_qpn)
3298 3269 return (+1);
3299 3270 else if (cmpp->cmp_type < curr->wq_type)
3300 3271 return (-1);
3301 3272 else if (cmpp->cmp_type > curr->wq_type)
3302 3273 return (+1);
3303 3274 else
3304 3275 return (0);
3305 3276 }
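
/*
 * Illustrative sketch (editor's addition, not driver source): how a
 * three-way comparator like the one above is wired into an AVL tree
 * with avl_create() (from <sys/avl.h>; offsetof() from
 * <sys/sysmacros.h>).  The comparator must return only -1, 0, or +1,
 * ordering first by QP number and then by work queue type.  The
 * "wq_avl_link" field name is an assumption for the avl_node_t
 * embedded in each tavor_workq_hdr_t.
 */
static void
wqhdr_avl_init(avl_tree_t *treep)
{
	avl_create(treep, tavor_wrid_wqhdr_compare,
	    sizeof (tavor_workq_hdr_t),
	    offsetof(tavor_workq_hdr_t, wq_avl_link));
}
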
3306 3277
3307 3278
3308 3279 /*
3309 3280 * tavor_wrid_wqhdr_find()
3310 3281 * Context: Can be called from interrupt or base context.
3311 3282 */
3312 3283 static tavor_workq_hdr_t *
3313 3284 tavor_wrid_wqhdr_find(tavor_cqhdl_t cq, uint_t qpn, uint_t wq_type)
3314 3285 {
3315 3286 tavor_workq_hdr_t *curr;
3316 3287 tavor_workq_compare_t cmp;
3317 3288
3318 3289 TAVOR_TNF_ENTER(tavor_wrid_wqhdr_find);
3319 3290
3320 3291 ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3321 3292
3322 3293 /*
3323 3294 * Walk the CQ's work queue list, trying to find a send or recv queue
3324 3295 * with the same QP number. We do this even if we are going to later
3325 3296 * create a new entry because it helps us easily find the end of the
3326 3297 * list.
3327 3298 */
3328 3299 cmp.cmp_qpn = qpn;
3329 3300 cmp.cmp_type = wq_type;
3330 -#ifdef __lock_lint
3331 - tavor_wrid_wqhdr_compare(NULL, NULL);
3332 -#endif
3333 3301 curr = avl_find(&cq->cq_wrid_wqhdr_avl_tree, &cmp, NULL);
3334 3302
3335 3303 TAVOR_TNF_EXIT(tavor_wrid_wqhdr_find);
3336 3304 return (curr);
3337 3305 }
3338 3306
3339 3307
3340 3308 /*
3341 3309 * tavor_wrid_wqhdr_create()
3342 3310 * Context: Can be called from interrupt or base context.
3343 3311 */
3344 3312 static tavor_workq_hdr_t *
3345 3313 tavor_wrid_wqhdr_create(tavor_state_t *state, tavor_cqhdl_t cq, uint_t qpn,
3346 3314 uint_t wq_type, uint_t create_wql)
3347 3315 {
3348 3316 tavor_workq_hdr_t *wqhdr_tmp;
3349 3317
3350 3318 TAVOR_TNF_ENTER(tavor_wrid_wqhdr_create);
3351 3319
3352 3320 ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3353 3321
3354 3322 /*
3355 3323 	 * Allocate space for a work queue header structure and initialize it.
3356 3324 * Each work queue header structure includes a "wq_wrid_wql"
3357 3325 * which needs to be initialized. Note that this allocation has to be
3358 3326 * a NOSLEEP operation because we are holding the "cq_wrid_wqhdr_lock"
3359 3327 * and, therefore, could get raised to the interrupt level.
3360 3328 */
3361 3329 wqhdr_tmp = (tavor_workq_hdr_t *)kmem_zalloc(
3362 3330 sizeof (tavor_workq_hdr_t), KM_NOSLEEP);
3363 3331 if (wqhdr_tmp == NULL) {
3364 3332 TAVOR_TNF_EXIT(tavor_wrid_wqhdr_create);
3365 3333 return (NULL);
3366 3334 }
3367 - _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*wqhdr_tmp))
3368 3335 wqhdr_tmp->wq_qpn = qpn;
3369 3336 wqhdr_tmp->wq_type = wq_type;
3370 3337
3371 3338 if (create_wql) {
3372 3339 wqhdr_tmp->wq_wrid_wql = tavor_wrid_wql_create(state);
3373 3340 if (wqhdr_tmp->wq_wrid_wql == NULL) {
3374 3341 kmem_free(wqhdr_tmp, sizeof (tavor_workq_hdr_t));
3375 3342 TAVOR_TNF_EXIT(tavor_wrid_wqhdr_create);
3376 3343 return (NULL);
3377 3344 }
3378 3345 }
3379 3346
3380 3347 wqhdr_tmp->wq_wrid_poll = NULL;
3381 3348 wqhdr_tmp->wq_wrid_post = NULL;
3382 3349
3383 3350 /* Chain the newly allocated work queue header to the CQ's list */
3384 3351 tavor_cq_wqhdr_add(cq, wqhdr_tmp);
3385 3352
3386 3353 TAVOR_TNF_EXIT(tavor_wrid_wqhdr_create);
3387 3354 return (wqhdr_tmp);
3388 3355 }
3389 3356
3390 3357
3391 3358 /*
3392 3359 * tavor_wrid_wql_create()
3393 3360 * Context: Can be called from interrupt or base context.
3394 3361 */
3395 3362 tavor_wq_lock_t *
3396 3363 tavor_wrid_wql_create(tavor_state_t *state)
3397 3364 {
3398 3365 tavor_wq_lock_t *wql;
3399 3366
3400 3367 TAVOR_TNF_ENTER(tavor_wrid_wql_create);
3401 3368
3402 3369 /*
3403 3370 * Allocate the WQL and initialize it.
3404 3371 */
3405 3372 wql = kmem_zalloc(sizeof (tavor_wq_lock_t), KM_NOSLEEP);
3406 3373 if (wql == NULL) {
3407 3374 		TAVOR_TNF_EXIT(tavor_wrid_wql_create);
3408 3375 return (NULL);
3409 3376 }
3410 3377
3411 3378 mutex_init(&wql->wql_lock, NULL, MUTEX_DRIVER,
3412 3379 DDI_INTR_PRI(state->ts_intrmsi_pri));
3413 3380
3414 3381 /* Add refcount to WQL */
3415 3382 tavor_wql_refcnt_inc(wql);
3416 3383
3417 3384 TAVOR_TNF_EXIT(tavor_wrid_wql_create);
3418 3385 return (wql);
3419 3386 }
3420 3387
3421 3388
3422 3389 /*
3423 3390 * tavor_wrid_get_wqeaddrsz()
3424 3391 * Context: Can be called from interrupt or base context.
3425 3392 */
3426 3393 static uint32_t
3427 3394 tavor_wrid_get_wqeaddrsz(tavor_workq_hdr_t *wq)
3428 3395 {
3429 3396 tavor_wrid_entry_t *wre;
3430 3397 uint32_t wqeaddrsz;
3431 3398 uint32_t head;
3432 3399
3433 3400 /*
3434 3401 * If the container is empty, then there is no next entry. So just
3435 3402 * return zero. Note: the "head == tail" condition here can only
3436 3403 * mean that the container is empty because we have previously pulled
3437 3404 * something from the container.
3438 3405 *
3439 3406 * If the container is not empty, then find the next entry and return
3440 3407 * the contents of its "wqeaddrsz" field.
3441 3408 */
3442 3409 if (wq->wq_wrid_poll->wl_head == wq->wq_wrid_poll->wl_tail) {
3443 3410 wqeaddrsz = 0;
3444 3411 } else {
3445 3412 /*
3446 3413 * We don't need to calculate the "next" head pointer here
3447 3414 * because "head" should already point to the next entry on
3448 3415 * the list (since we just pulled something off - in
3449 3416 * tavor_wrid_find_match() - and moved the head index forward.)
3450 3417 */
3451 3418 head = wq->wq_wrid_poll->wl_head;
3452 3419 wre = &wq->wq_wrid_poll->wl_wre[head];
3453 3420 wqeaddrsz = wre->wr_wqeaddrsz;
3454 3421 }
3455 3422 return (wqeaddrsz);
3456 3423 }
3457 3424
3458 3425
3459 3426 /*
3460 3427 * tavor_wrid_wqhdr_add()
3461 3428 * Context: Can be called from interrupt or base context.
3462 3429 */
3463 3430 static void
3464 3431 tavor_wrid_wqhdr_add(tavor_workq_hdr_t *wqhdr,
3465 3432 tavor_wrid_list_hdr_t *wridlist)
3466 3433 {
3467 3434 ASSERT(MUTEX_HELD(&wqhdr->wq_wrid_wql->wql_lock));
3468 3435
3469 3436 /* Chain the new WRID list "container" to the work queue list */
3470 3437 if ((wqhdr->wq_wrid_post == NULL) &&
3471 3438 (wqhdr->wq_wrid_poll == NULL)) {
3472 3439 wqhdr->wq_wrid_poll = wridlist;
3473 3440 wqhdr->wq_wrid_post = wridlist;
3474 3441 } else {
3475 3442 wqhdr->wq_wrid_post->wl_next = wridlist;
3476 3443 wridlist->wl_prev = wqhdr->wq_wrid_post;
3477 3444 wqhdr->wq_wrid_post = wridlist;
3478 3445 }
3479 3446 }
3480 3447
3481 3448
3482 3449 /*
3483 3450 * tavor_wrid_wqhdr_remove()
3484 3451 * Context: Can be called from interrupt or base context.
3485 3452 *
3486 3453 * Note: this is only called to remove the most recently added WRID list
3487 3454 * container (i.e. in tavor_from_reset() above)
3488 3455 */
3489 3456 static void
3490 3457 tavor_wrid_wqhdr_remove(tavor_workq_hdr_t *wqhdr,
3491 3458 tavor_wrid_list_hdr_t *wridlist)
3492 3459 {
3493 3460 tavor_wrid_list_hdr_t *prev, *next;
3494 3461
3495 3462 ASSERT(MUTEX_HELD(&wqhdr->wq_wrid_wql->wql_lock));
3496 3463
3497 3464 /* Unlink the WRID list "container" from the work queue list */
3498 3465 prev = wridlist->wl_prev;
3499 3466 next = wridlist->wl_next;
3500 3467 if (prev != NULL) {
3501 3468 prev->wl_next = next;
3502 3469 }
3503 3470 if (next != NULL) {
3504 3471 next->wl_prev = prev;
3505 3472 }
3506 3473
3507 3474 /*
3508 3475 * Update any pointers in the work queue hdr that may point to this
3509 3476 * WRID list container
3510 3477 */
3511 3478 if (wqhdr->wq_wrid_post == wridlist) {
3512 3479 wqhdr->wq_wrid_post = prev;
3513 3480 }
3514 3481 if (wqhdr->wq_wrid_poll == wridlist) {
3515 3482 wqhdr->wq_wrid_poll = NULL;
3516 3483 }
3517 3484 }
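
/*
 * Illustrative sketch (editor's addition, not driver source): the
 * NULL-safe doubly-linked unlink used above and again in
 * tavor_wrid_list_reap() below.  Either neighbor may be absent, and
 * any owner pointers aimed at the departing node must be re-aimed
 * after the splice.  Types are hypothetical.
 */
typedef struct dnode {
	struct dnode	*d_prev;
	struct dnode	*d_next;
} dnode_t;

static void
dlist_unlink(dnode_t **headp, dnode_t *np)
{
	if (np->d_prev != NULL)
		np->d_prev->d_next = np->d_next;
	if (np->d_next != NULL)
		np->d_next->d_prev = np->d_prev;
	if (*headp == np)		/* re-aim the owner's pointer */
		*headp = np->d_next;
	np->d_prev = np->d_next = NULL;
}
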
3518 3485
3519 3486
3520 3487 /*
3521 3488 * tavor_wrid_list_reap()
3522 3489 * Context: Can be called from interrupt or base context.
3523 3490 * Note: The "wqhdr_list_lock" must be held.
3524 3491 */
3525 3492 static tavor_workq_hdr_t *
3526 3493 tavor_wrid_list_reap(tavor_wrid_list_hdr_t *wridlist)
3527 3494 {
3528 3495 tavor_workq_hdr_t *wqhdr, *consume_wqhdr = NULL;
3529 3496 tavor_wrid_list_hdr_t *prev, *next;
3530 3497 uint32_t size;
3531 3498
3532 3499 TAVOR_TNF_ENTER(tavor_wrid_list_reap);
3533 3500
3534 3501 /* Get the back pointer to the work queue header (see below) */
3535 3502 wqhdr = wridlist->wl_wqhdr;
3536 3503 mutex_enter(&wqhdr->wq_wrid_wql->wql_lock);
3537 3504
3538 3505 /* Unlink the WRID list "container" from the work queue list */
3539 3506 prev = wridlist->wl_prev;
3540 3507 next = wridlist->wl_next;
3541 3508 if (prev != NULL) {
3542 3509 prev->wl_next = next;
3543 3510 }
3544 3511 if (next != NULL) {
3545 3512 next->wl_prev = prev;
3546 3513 }
3547 3514
3548 3515 /*
3549 3516 * If the back pointer to the work queue header shows that it
3550 3517 * was pointing to the entry we are about to remove, then the work
3551 3518 * queue header is reapable as well.
3552 3519 */
3553 3520 if ((wqhdr->wq_wrid_poll == wridlist) &&
3554 3521 (wqhdr->wq_wrid_post == wridlist)) {
3555 3522 consume_wqhdr = wqhdr;
3556 3523 }
3557 3524
3558 3525 /* Be sure to update the "poll" and "post" container pointers */
3559 3526 if (wqhdr->wq_wrid_poll == wridlist) {
3560 3527 wqhdr->wq_wrid_poll = next;
3561 3528 }
3562 3529 if (wqhdr->wq_wrid_post == wridlist) {
3563 3530 wqhdr->wq_wrid_post = NULL;
3564 3531 }
3565 3532
3566 3533 /* Calculate the size and free the container */
3567 3534 size = (wridlist->wl_size * sizeof (tavor_wrid_entry_t));
3568 3535 kmem_free(wridlist->wl_wre, size);
3569 3536 kmem_free(wridlist, sizeof (tavor_wrid_list_hdr_t));
3570 3537
3571 3538 mutex_exit(&wqhdr->wq_wrid_wql->wql_lock);
3572 3539
3573 3540 TAVOR_TNF_EXIT(tavor_wrid_list_reap);
3574 3541 return (consume_wqhdr);
3575 3542 }
3576 3543
3577 3544
3578 3545 /*
3579 3546 * tavor_wrid_wqhdr_lock_both()
3580 3547 * Context: Can be called from interrupt or base context.
3581 3548 */
3582 3549 static void
3583 3550 tavor_wrid_wqhdr_lock_both(tavor_qphdl_t qp)
3584 3551 {
3585 3552 tavor_cqhdl_t sq_cq, rq_cq;
3586 3553
3587 3554 sq_cq = qp->qp_sq_cqhdl;
3588 3555 rq_cq = qp->qp_rq_cqhdl;
3589 3556
3590 -_NOTE(MUTEX_ACQUIRED_AS_SIDE_EFFECT(&sq_cq->cq_wrid_wqhdr_lock))
3591 -_NOTE(MUTEX_ACQUIRED_AS_SIDE_EFFECT(&rq_cq->cq_wrid_wqhdr_lock))
3592 -
3593 3557 /*
3594 3558 * If both work queues (send and recv) share a completion queue, then
3595 3559 * grab the common lock. If they use different CQs (hence different
3596 3560 * "cq_wrid_wqhdr_list" locks), then grab the send one first, then the
3597 3561 * receive. We do this consistently and correctly in
3598 3562 * tavor_wrid_wqhdr_unlock_both() below to avoid introducing any kind
3599 - * of dead lock condition. Note: We add the "__lock_lint" code here
3600 - * to fake out warlock into thinking we've grabbed both locks (when,
3601 - * in fact, we only needed the one).
3563 + * of deadlock condition.
3602 3564 */
3603 3565 if (sq_cq == rq_cq) {
3604 3566 mutex_enter(&sq_cq->cq_wrid_wqhdr_lock);
3605 -#ifdef __lock_lint
3606 - mutex_enter(&rq_cq->cq_wrid_wqhdr_lock);
3607 -#endif
3608 3567 } else {
3609 3568 mutex_enter(&sq_cq->cq_wrid_wqhdr_lock);
3610 3569 mutex_enter(&rq_cq->cq_wrid_wqhdr_lock);
3611 3570 }
3612 3571 }
3613 3572
3614 3573 /*
3615 3574 * tavor_wrid_wqhdr_unlock_both()
3616 3575 * Context: Can be called from interrupt or base context.
3617 3576 */
3618 3577 static void
3619 3578 tavor_wrid_wqhdr_unlock_both(tavor_qphdl_t qp)
3620 3579 {
3621 3580 tavor_cqhdl_t sq_cq, rq_cq;
3622 3581
3623 3582 sq_cq = qp->qp_sq_cqhdl;
3624 3583 rq_cq = qp->qp_rq_cqhdl;
3625 3584
3626 -_NOTE(LOCK_RELEASED_AS_SIDE_EFFECT(&rq_cq->cq_wrid_wqhdr_lock))
3627 -_NOTE(LOCK_RELEASED_AS_SIDE_EFFECT(&sq_cq->cq_wrid_wqhdr_lock))
3628 -
3629 3585 /*
3630 3586 * See tavor_wrid_wqhdr_lock_both() above for more detail
3631 3587 */
3632 3588 if (sq_cq == rq_cq) {
3633 -#ifdef __lock_lint
3634 - mutex_exit(&rq_cq->cq_wrid_wqhdr_lock);
3635 -#endif
3636 3589 mutex_exit(&sq_cq->cq_wrid_wqhdr_lock);
3637 3590 } else {
3638 3591 mutex_exit(&rq_cq->cq_wrid_wqhdr_lock);
3639 3592 mutex_exit(&sq_cq->cq_wrid_wqhdr_lock);
3640 3593 }
3641 3594 }
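
/*
 * Illustrative sketch (editor's addition, not driver source): the
 * lock-ordering discipline the two routines above rely on.  When two
 * locks must be held at once, acquiring them in one fixed order (send
 * CQ before receive CQ) and releasing in the reverse order prevents
 * an ABBA deadlock; when both work queues share a CQ, the single
 * common lock is taken exactly once.  Plain pthread stand-in.
 */
#include <pthread.h>

static void
lock_pair(pthread_mutex_t *sq, pthread_mutex_t *rq)
{
	if (sq == rq) {
		(void) pthread_mutex_lock(sq);	/* shared CQ: one lock */
	} else {
		(void) pthread_mutex_lock(sq);	/* fixed order: send... */
		(void) pthread_mutex_lock(rq);	/* ...then receive */
	}
}

static void
unlock_pair(pthread_mutex_t *sq, pthread_mutex_t *rq)
{
	if (sq == rq) {
		(void) pthread_mutex_unlock(sq);
	} else {
		(void) pthread_mutex_unlock(rq);	/* reverse order */
		(void) pthread_mutex_unlock(sq);
	}
}
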
3642 3595
3643 3596
3644 3597 /*
3645 3598 * tavor_cq_wqhdr_add()
3646 3599 * Context: Can be called from interrupt or base context.
3647 3600 */
3648 3601 static void
3649 3602 tavor_cq_wqhdr_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr)
3650 3603 {
3651 3604 tavor_workq_compare_t cmp;
3652 3605 avl_index_t where;
3653 3606
3654 3607 ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3655 3608
3656 3609 cmp.cmp_qpn = wqhdr->wq_qpn;
3657 3610 cmp.cmp_type = wqhdr->wq_type;
3658 -#ifdef __lock_lint
3659 - tavor_wrid_wqhdr_compare(NULL, NULL);
3660 -#endif
3661 3611 (void) avl_find(&cq->cq_wrid_wqhdr_avl_tree, &cmp, &where);
3662 3612 /*
3663 3613 	 * Insert the new work queue header into the AVL tree at the
3664 3614 	 * insertion point identified by the avl_find() call above.
3665 3615 */
3666 3616 avl_insert(&cq->cq_wrid_wqhdr_avl_tree, wqhdr, where);
3667 3617 }
3668 3618
3669 3619
3670 3620 /*
3671 3621 * tavor_cq_wqhdr_remove()
3672 3622 * Context: Can be called from interrupt or base context.
3673 3623 */
3674 3624 static void
3675 3625 tavor_cq_wqhdr_remove(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr)
3676 3626 {
3677 3627 ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3678 3628
3679 -#ifdef __lock_lint
3680 - tavor_wrid_wqhdr_compare(NULL, NULL);
3681 -#endif
3682 3629 /* Remove "wqhdr" from the work queue header list on "cq" */
3683 3630 avl_remove(&cq->cq_wrid_wqhdr_avl_tree, wqhdr);
3684 3631
3685 3632 /*
3686 3633 * Release reference to WQL; If this is the last reference, this call
3687 3634 * also has the side effect of freeing up the 'wq_wrid_wql' memory.
3688 3635 */
3689 3636 tavor_wql_refcnt_dec(wqhdr->wq_wrid_wql);
3690 3637
3691 3638 /* Free the memory associated with "wqhdr" */
3692 3639 kmem_free(wqhdr, sizeof (tavor_workq_hdr_t));
3693 3640 }
3694 3641
3695 3642
3696 3643 /*
3697 3644 * tavor_wql_refcnt_inc()
3698 3645 * Context: Can be called from interrupt or base context
3699 3646 */
3700 3647 void
3701 3648 tavor_wql_refcnt_inc(tavor_wq_lock_t *wql)
3702 3649 {
3703 3650 ASSERT(wql != NULL);
3704 3651
3705 3652 mutex_enter(&wql->wql_lock);
3706 3653 wql->wql_refcnt++;
3707 3654 mutex_exit(&wql->wql_lock);
3708 3655 }
3709 3656
3710 3657 /*
3711 3658 * tavor_wql_refcnt_dec()
3712 3659 * Context: Can be called from interrupt or base context
3713 3660 */
3714 3661 void
3715 3662 tavor_wql_refcnt_dec(tavor_wq_lock_t *wql)
3716 3663 {
3717 3664 int refcnt;
3718 3665
3719 3666 ASSERT(wql != NULL);
3720 3667
3721 3668 mutex_enter(&wql->wql_lock);
3722 3669 wql->wql_refcnt--;
3723 3670 refcnt = wql->wql_refcnt;
3724 3671 mutex_exit(&wql->wql_lock);
3725 3672
3726 3673 	/*
3728 3675 * Free up WQL memory if we're the last one associated with this
3729 3676 * structure.
3730 3677 */
3731 3678 if (refcnt == 0) {
3732 3679 mutex_destroy(&wql->wql_lock);
3733 3680 kmem_free(wql, sizeof (tavor_wq_lock_t));
3734 3681 }
3735 3682 }
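
/*
 * Illustrative sketch (editor's addition, not driver source): the
 * mutex-protected reference count implemented by the two routines
 * above.  The count is snapshotted inside the lock, but the teardown
 * happens outside it, since the mutex being destroyed lives inside
 * the object being freed.  Plain pthread stand-in with hypothetical
 * names.
 */
#include <pthread.h>
#include <stdlib.h>

typedef struct refobj {
	pthread_mutex_t	ro_lock;
	int		ro_refcnt;
} refobj_t;

static void
refobj_rele(refobj_t *rp)
{
	int refcnt;

	(void) pthread_mutex_lock(&rp->ro_lock);
	refcnt = --rp->ro_refcnt;
	(void) pthread_mutex_unlock(&rp->ro_lock);

	/* Last reference: destroy the lock and free the object */
	if (refcnt == 0) {
		(void) pthread_mutex_destroy(&rp->ro_lock);
		free(rp);
	}
}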