7127 remove -Wno-missing-braces from Makefile.uts
--- old/usr/src/uts/common/io/ib/clients/ibd/ibd.c
+++ new/usr/src/uts/common/io/ib/clients/ibd/ibd.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
24 24 */
25 25
26 26 /*
27 27 * An implementation of the IPoIB standard based on PSARC 2001/289.
28 28 */
29 29
30 30 #include <sys/types.h>
31 31 #include <sys/conf.h>
32 32 #include <sys/ddi.h>
33 33 #include <sys/sunddi.h>
34 34 #include <sys/modctl.h>
35 35 #include <sys/stropts.h>
36 36 #include <sys/stream.h>
37 37 #include <sys/strsun.h>
38 38 #include <sys/strsubr.h>
39 39 #include <sys/dlpi.h>
40 40 #include <sys/mac_provider.h>
41 41
42 42 #include <sys/pattr.h> /* for HCK_FULLCKSUM */
43 43 #include <sys/sysmacros.h> /* for offsetof */
44 44 #include <sys/disp.h> /* for async thread pri */
45 45 #include <sys/atomic.h> /* for atomic_add*() */
46 46 #include <sys/ethernet.h> /* for ETHERTYPE_IPV6 */
47 47 #include <netinet/in.h> /* for netinet/ip.h below */
48 48 #include <netinet/ip.h> /* for struct ip */
49 49 #include <netinet/udp.h> /* for struct udphdr */
50 50 #include <inet/common.h> /* for inet/ip.h below */
51 51 #include <inet/ip.h> /* for ipha_t */
52 52 #include <inet/ip6.h> /* for ip6_t */
53 53 #include <inet/tcp.h> /* for tcph_t */
54 54 #include <netinet/icmp6.h> /* for icmp6_t */
55 55 #include <sys/callb.h>
56 56 #include <sys/modhash.h>
57 57
58 58 #include <sys/ib/clients/ibd/ibd.h>
59 59 #include <sys/ib/mgt/sm_attr.h> /* for SM_INIT_TYPE_* */
60 60 #include <sys/note.h>
61 61 #include <sys/multidata.h>
62 62
63 63 #include <sys/ib/mgt/ibmf/ibmf.h> /* for ibd_get_portspeed */
64 64
65 65 #include <sys/priv_names.h>
66 66 #include <sys/dls.h>
67 67 #include <sys/dld_ioc.h>
68 68 #include <sys/policy.h>
69 69 #include <sys/ibpart.h>
70 70 #include <sys/file.h>
71 71
72 72 /*
73 73 * The write-up below includes details on the following:
74 74 * 1. The dladm administrative model.
75 75 * 2. Late HCA initialization feature.
76 76 * 3. Brussels support and its implications for the current architecture.
77 77 *
78 78 * 1. The dladm administrative model.
79 79 * ------------------------------------------
80 80 * With the dladm model, ibnex will create one ibd instance per port. These
81 81 * instances will be created independent of the port state.
82 82 *
83 83 * The ibd driver is two-faceted: one side of it works as the port driver and
84 84 * the other as the partition object driver.
85 85 *
86 86 * The port instance is a child of the HCA, and will have an entry in the devfs.
87 87 * A DDI attach only happens for the port driver, and its attach is
88 88 * handled in ibd_port_attach(). Similarly, a DDI detach for the port driver is
89 89 * handled in ibd_port_unattach().
90 90 *
91 91 * The partition object is only a registrant to the mac layer via mac_register()
92 92 * and does not have an entry in the device tree. There is no DDI softstate
93 93 * managed by the DDI framework for the partition objects. However, the state is
94 94 * managed inside the ibd driver, and every partition object hangs off the
95 95 * "ibd_objlist_head".
96 96 *
97 97 * The partition object first comes into existence when a user runs the
98 98 * 'create-part' subcommand of dladm. This is like invoking the attach entry
99 99 * point of the partition object. The partition object goes away with the
100 100 * 'delete-part' subcommand of dladm. This is like invoking the detach entry
101 101 * point of the partition object.
102 102 *
103 103 * The create-part and delete-part subcommands result in dld ioctls that end up
104 104 * calling ibd_create_partition() and ibd_delete_partition() respectively.
105 105 * These ioctls are registered with the dld layer in _init() via a call to
106 106 * dld_ioc_register().
107 107 *
108 108 * The port instance by itself cannot be plumbed. Only the partition
109 109 * objects can be plumbed; they alone participate in I/O, not the
110 110 * port driver.
111 111 *
112 112 * There are some info ioctls supported in ibd which are used by dladm(1M) to
113 113 * display useful information. The info entry point for ibd is
114 114 * ibd_get_partition_info().
115 115 *
116 116 * 2. Late HCA initialization feature.
117 117 * ------------------------------------
118 118 * As mentioned in section 1, the user creates the partition objects via
119 119 * dladm(1M). It is possible that:
120 120 * a) The physical port itself is down and the SM cannot be reached.
121 121 * b) The PKEY specified by the user has not been created in the SM yet.
122 122 * c) An IPoIB broadcast group for the specified PKEY is not present.
123 123 *
124 124 * In all of the above cases, complete initialization of the partition object is
125 125 * not possible. However, the new model allows the creation of partition
126 126 * objects even in such cases but will defer the initialization for later.
127 127 * When such a partition object is plumbed, the link state will be displayed as
128 128 * "down".
129 129 * The driver, at this point, is listening to events that herald the
130 130 * availability of resources -
131 131 * i) LINK_UP when the link becomes available
132 132 * ii) PORT_CHANGE when the PKEY has been created
133 133 * iii) MCG_CREATED when the IPoIB broadcast group for the given pkey has been
134 134 * created
135 135 * via ibd_async_handler() for events i) and ii), and via
136 136 * ibd_snet_notices_handler() for iii).
137 137 * The driver handles these events (as and when they arrive) and completes the
138 138 * initialization of the partition object and transitions it to a usable state.
139 139 *
140 140 * 3. Brussels support and its implications for the current architecture.
141 141 * ---------------------------------------------------------------------
142 142 * The Brussels support introduces two new interfaces to the ibd driver -
143 143 * ibd_m_getprop() and ibd_m_setprop().
144 144 * These interfaces allow setting and retrieval of certain properties.
145 145 * Some of them are public properties while most others are private properties
146 146 * meant to be used by developers. Tuning the latter kind can cause
147 147 * performance issues and should not be done without understanding the
148 148 * implications. All properties are specific to an instance of either the
149 149 * partition object or the port driver.
150 150 *
151 151 * The public properties are : mtu and linkmode.
152 152 * mtu is a read-only property.
153 153 * linkmode can take two values - UD and CM.
154 154 *
155 155 * Changing the linkmode requires some bookkeeping in the driver. The
156 156 * capabilities need to be re-reported to the mac layer. This is done by
157 157 * calling mac_capab_update(). The maxsdu is updated by calling
158 158 * mac_maxsdu_update2().
159 159 * The private properties retain their values across the change of linkmode.
160 160 * NOTE:
161 161 * - The port driver does not support any property apart from mtu.
162 162 * - All other properties are only meant for the partition object.
163 163 * - The properties cannot be set when an instance is plumbed. The
164 164 * instance has to be unplumbed to effect any setting.
165 165 */
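
To make the administrative model described above concrete, a hypothetical dladm session that exercises the create-part/delete-part ioctl path might look like the following; the link name, partition name and pkey value are made up for illustration, while the ioctl-to-handler mapping comes from ibd_dld_ioctl_list later in this file:

	# dladm create-part -l ibp0 -P 0x8001 p8001.ibp0
		(dld ioctl IBD_CREATE_IBPART -> ibd_create_partition())
	# dladm show-part p8001.ibp0
		(dld ioctl IBD_INFO_IBPART -> ibd_get_partition_info())
	# dladm delete-part p8001.ibp0
		(dld ioctl IBD_DELETE_IBPART -> ibd_delete_partition())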
166 166
167 167 /*
168 168 * Driver wide tunables
169 169 *
170 170 * ibd_tx_softintr
171 171 * ibd_rx_softintr
172 172 * The softintr mechanism allows ibd to avoid event queue overflows if
173 173 * the receive/completion handlers are expected to be expensive. These are enabled
174 174 * by default.
175 175 *
176 176 * ibd_log_sz
177 177 * This specifies the size of the ibd log buffer in bytes. The buffer is
178 178 * allocated and logging is enabled only when IBD_LOGGING is defined.
179 179 *
180 180 */
181 181 uint_t ibd_rx_softintr = 1;
182 182 uint_t ibd_tx_softintr = 1;
183 183
184 184 #ifdef IBD_LOGGING
185 185 uint_t ibd_log_sz = 0x20000;
186 186 #endif
187 187
188 188 #ifdef IBD_LOGGING
189 189 #define IBD_LOG_SZ ibd_log_sz
190 190 #endif
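
Since these tunables are plain module-global variables, they can in principle be overridden at boot via /etc/system or patched live with mdb. The lines below are only a sketch; they assume the module name matches the "ibp" driver name registered via mac_init_ops() later in this listing:

	set ibp:ibd_rx_softintr = 0		(/etc/system)
	> ibd_tx_softintr/W 0			(mdb -kw)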
191 191
192 192 /* Post IBD_RX_POST_CNT receive work requests at a time. */
193 193 #define IBD_RX_POST_CNT 8
194 194
195 195 /* Hash into 1 << IBD_LOG_RX_POST number of rx post queues */
196 196 #define IBD_LOG_RX_POST 4
197 197
198 198 /* Minimum number of receive work requests driver needs to always have */
199 199 #define IBD_RWQE_MIN ((IBD_RX_POST_CNT << IBD_LOG_RX_POST) * 4)
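
With the defaults above, this arithmetic works out as follows (a quick sanity check of the macros, not additional source):

	IBD_RWQE_MIN = (8 << 4) * 4 = 128 * 4 = 512 rwqes,
	spread across 1 << IBD_LOG_RX_POST = 16 rx post queues.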
200 200
201 201 /*
202 202 * LSO parameters
203 203 */
204 204 #define IBD_LSO_MAXLEN 65536
205 205 #define IBD_LSO_BUFSZ 8192
206 206
207 207 /*
208 208 * Async operation states
209 209 */
210 210 #define IBD_OP_NOTSTARTED 0
211 211 #define IBD_OP_ONGOING 1
212 212 #define IBD_OP_COMPLETED 2
213 213 #define IBD_OP_ERRORED 3
214 214 #define IBD_OP_ROUTERED 4
215 215
216 216 /*
217 217 * Start/stop in-progress flags; note that restart must always remain
218 218 * the OR of start and stop flag values.
219 219 */
220 220 #define IBD_DRV_START_IN_PROGRESS 0x10000000
221 221 #define IBD_DRV_STOP_IN_PROGRESS 0x20000000
222 222 #define IBD_DRV_RESTART_IN_PROGRESS 0x30000000
223 223 #define IBD_DRV_DELETE_IN_PROGRESS IBD_DRV_RESTART_IN_PROGRESS
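
The invariant stated in the comment holds for the values as defined (0x10000000 | 0x20000000 == 0x30000000). If one wanted the build to enforce it, a compile-time assertion (CTASSERT from <sys/debug.h>) along these lines would do; this is an illustrative sketch, not part of the change under review:

	CTASSERT(IBD_DRV_RESTART_IN_PROGRESS ==
	    (IBD_DRV_START_IN_PROGRESS | IBD_DRV_STOP_IN_PROGRESS));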
224 224
225 225 /*
226 226 * Miscellaneous constants
227 227 */
228 228 #define IB_MGID_IPV4_LOWGRP_MASK 0xFFFFFFFF
229 229 #define IBD_DEF_MAX_SDU 2044
230 230 #define IBD_DEF_MAX_MTU (IBD_DEF_MAX_SDU + IPOIB_HDRSIZE)
231 231 #define IBD_DEF_RC_MAX_SDU 65520
232 232 #define IBD_DEF_RC_MAX_MTU (IBD_DEF_RC_MAX_SDU + IPOIB_HDRSIZE)
233 233 #define IBD_DEFAULT_QKEY 0xB1B
234 234 #ifdef IBD_LOGGING
235 235 #define IBD_DMAX_LINE 100
236 236 #endif
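
For reference, IPOIB_HDRSIZE is the 4-byte IPoIB encapsulation header (16-bit ethertype plus 16-bit reserved field); assuming that value, the MTU constants above expand to:

	IBD_DEF_MAX_MTU    =  2044 + 4 =  2048
	IBD_DEF_RC_MAX_MTU = 65520 + 4 = 65524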
237 237
238 238 /*
239 239 * Enumerations for link states
240 240 */
241 241 typedef enum {
242 242 IBD_LINK_DOWN,
243 243 IBD_LINK_UP,
244 244 IBD_LINK_UP_ABSENT
245 245 } ibd_link_op_t;
246 246
247 247 /*
248 248 * Driver State Pointer
249 249 */
250 250 void *ibd_list;
251 251
252 252 /*
253 253 * Driver Global Data
254 254 */
255 255 ibd_global_state_t ibd_gstate;
256 256
257 257 /*
258 258 * Partition object list
259 259 */
260 260 ibd_state_t *ibd_objlist_head = NULL;
261 261 kmutex_t ibd_objlist_lock;
262 262
263 263 int ibd_rc_conn_timeout = 60 * 10; /* 10 minutes */
264 264
265 265 /*
266 266 * Logging
267 267 */
268 268 #ifdef IBD_LOGGING
269 269 kmutex_t ibd_lbuf_lock;
270 270 uint8_t *ibd_lbuf;
271 271 uint32_t ibd_lbuf_ndx;
272 272 #endif
273 273
274 274 /*
275 275 * Required system entry points
276 276 */
277 277 static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
278 278 static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
279 279
280 280 /*
281 281 * Required driver entry points for GLDv3
282 282 */
283 283 static int ibd_m_stat(void *, uint_t, uint64_t *);
284 284 static int ibd_m_start(void *);
285 285 static void ibd_m_stop(void *);
286 286 static int ibd_m_promisc(void *, boolean_t);
287 287 static int ibd_m_multicst(void *, boolean_t, const uint8_t *);
288 288 static int ibd_m_unicst(void *, const uint8_t *);
289 289 static mblk_t *ibd_m_tx(void *, mblk_t *);
290 290 static boolean_t ibd_m_getcapab(void *, mac_capab_t, void *);
291 291
292 292 static int ibd_m_setprop(void *, const char *, mac_prop_id_t, uint_t,
293 293 const void *);
294 294 static int ibd_m_getprop(void *, const char *, mac_prop_id_t, uint_t, void *);
295 295 static void ibd_m_propinfo(void *, const char *, mac_prop_id_t,
296 296 mac_prop_info_handle_t);
297 297 static int ibd_set_priv_prop(ibd_state_t *, const char *, uint_t,
298 298 const void *);
299 299 static int ibd_get_priv_prop(ibd_state_t *, const char *, uint_t, void *);
300 300
301 301 /*
302 302 * Private driver entry points for GLDv3
303 303 */
304 304
305 305 /*
306 306 * Initialization
307 307 */
308 308 static int ibd_state_init(ibd_state_t *, dev_info_t *);
309 309 static int ibd_init_txlist(ibd_state_t *);
310 310 static int ibd_init_rxlist(ibd_state_t *);
311 311 static int ibd_acache_init(ibd_state_t *);
312 312 #ifdef IBD_LOGGING
313 313 static void ibd_log_init(void);
314 314 #endif
315 315
316 316 /*
317 317 * Termination/cleanup
318 318 */
319 319 static void ibd_state_fini(ibd_state_t *);
320 320 static void ibd_fini_txlist(ibd_state_t *);
321 321 static void ibd_fini_rxlist(ibd_state_t *);
322 322 static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *);
323 323 static void ibd_tx_cleanup_list(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *);
324 324 static void ibd_acache_fini(ibd_state_t *);
325 325 #ifdef IBD_LOGGING
326 326 static void ibd_log_fini(void);
327 327 #endif
328 328
329 329 /*
330 330 * Allocation/acquire/map routines
331 331 */
332 332 static int ibd_alloc_tx_copybufs(ibd_state_t *);
333 333 static int ibd_alloc_rx_copybufs(ibd_state_t *);
334 334 static int ibd_alloc_tx_lsobufs(ibd_state_t *);
335 335 static ibd_swqe_t *ibd_acquire_swqe(ibd_state_t *);
336 336 static int ibd_acquire_lsobufs(ibd_state_t *, uint_t, ibt_wr_ds_t *,
337 337 uint32_t *);
338 338
339 339 /*
340 340 * Free/release/unmap routines
341 341 */
342 342 static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *);
343 343 static void ibd_free_tx_copybufs(ibd_state_t *);
344 344 static void ibd_free_rx_copybufs(ibd_state_t *);
345 345 static void ibd_free_rx_rsrcs(ibd_state_t *);
346 346 static void ibd_free_tx_lsobufs(ibd_state_t *);
347 347 static void ibd_release_swqe(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *, int);
348 348 static void ibd_release_lsobufs(ibd_state_t *, ibt_wr_ds_t *, uint32_t);
349 349 static void ibd_free_lsohdr(ibd_swqe_t *, mblk_t *);
350 350
351 351 /*
352 352 * Handlers/callback routines
353 353 */
354 354 static uint_t ibd_intr(caddr_t);
355 355 static uint_t ibd_tx_recycle(caddr_t);
356 356 static void ibd_rcq_handler(ibt_cq_hdl_t, void *);
357 357 static void ibd_scq_handler(ibt_cq_hdl_t, void *);
358 358 static void ibd_poll_rcq(ibd_state_t *, ibt_cq_hdl_t);
359 359 static void ibd_poll_scq(ibd_state_t *, ibt_cq_hdl_t);
360 360 static void ibd_drain_rcq(ibd_state_t *, ibt_cq_hdl_t);
361 361 static void ibd_drain_scq(ibd_state_t *, ibt_cq_hdl_t);
362 362 static void ibd_freemsg_cb(char *);
363 363 static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
364 364 ibt_async_event_t *);
365 365 static void ibdpd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
366 366 ibt_async_event_t *);
367 367 static void ibd_snet_notices_handler(void *, ib_gid_t,
368 368 ibt_subnet_event_code_t, ibt_subnet_event_t *);
369 369
370 370 /*
371 371 * Send/receive routines
372 372 */
373 373 static boolean_t ibd_send(ibd_state_t *, mblk_t *);
374 374 static void ibd_post_send(ibd_state_t *, ibd_swqe_t *);
375 375 static void ibd_post_recv(ibd_state_t *, ibd_rwqe_t *);
376 376 static mblk_t *ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *);
377 377
378 378 /*
379 379 * Threads
380 380 */
381 381 static void ibd_async_work(ibd_state_t *);
382 382
383 383 /*
384 384 * Async tasks
385 385 */
386 386 static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *);
387 387 static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int);
388 388 static void ibd_async_setprom(ibd_state_t *);
389 389 static void ibd_async_unsetprom(ibd_state_t *);
390 390 static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t);
391 391 static void ibd_async_trap(ibd_state_t *, ibd_req_t *);
392 392 static void ibd_async_txsched(ibd_state_t *);
393 393 static void ibd_async_link(ibd_state_t *, ibd_req_t *);
394 394
395 395 /*
396 396 * Async task helpers
397 397 */
398 398 static ibd_mce_t *ibd_async_mcache(ibd_state_t *, ipoib_mac_t *, boolean_t *);
399 399 static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t);
400 400 static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *);
401 401 static boolean_t ibd_get_allroutergroup(ibd_state_t *,
402 402 ipoib_mac_t *, ipoib_mac_t *);
403 403 static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t);
404 404 static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *);
405 405 static ibt_status_t ibd_iba_join(ibd_state_t *, ib_gid_t, ibd_mce_t *);
406 406 static ibt_status_t ibd_find_bgroup(ibd_state_t *);
407 407 static void ibd_n2h_gid(ipoib_mac_t *, ib_gid_t *);
408 408 static void ibd_h2n_mac(ipoib_mac_t *, ib_qpn_t, ib_sn_prefix_t, ib_guid_t);
409 409 static uint64_t ibd_get_portspeed(ibd_state_t *);
410 410 static boolean_t ibd_async_safe(ibd_state_t *);
411 411 static void ibd_async_done(ibd_state_t *);
412 412 static ibd_ace_t *ibd_acache_lookup(ibd_state_t *, ipoib_mac_t *, int *, int);
413 413 static ibd_ace_t *ibd_acache_get_unref(ibd_state_t *);
414 414 static void ibd_link_mod(ibd_state_t *, ibt_async_code_t);
415 415 static int ibd_locate_pkey(ib_pkey_t *, uint16_t, ib_pkey_t, uint16_t *);
416 416
417 417 /*
418 418 * Helpers for attach/start routines
419 419 */
420 420 static int ibd_register_mac(ibd_state_t *, dev_info_t *);
421 421 static int ibd_record_capab(ibd_state_t *);
422 422 static int ibd_get_port_details(ibd_state_t *);
423 423 static int ibd_alloc_cqs(ibd_state_t *);
424 424 static int ibd_setup_ud_channel(ibd_state_t *);
425 425 static int ibd_start(ibd_state_t *);
426 426 static int ibd_undo_start(ibd_state_t *, link_state_t);
427 427 static void ibd_set_mac_progress(ibd_state_t *, uint_t);
428 428 static void ibd_clr_mac_progress(ibd_state_t *, uint_t);
429 429 static int ibd_part_attach(ibd_state_t *state, dev_info_t *dip);
430 430 static void ibd_part_unattach(ibd_state_t *state);
431 431 static int ibd_port_attach(dev_info_t *);
432 432 static int ibd_port_unattach(ibd_state_t *state, dev_info_t *dip);
433 433 static int ibd_get_port_state(ibd_state_t *, link_state_t *);
434 434 static int ibd_part_busy(ibd_state_t *);
435 435
436 436 /*
437 437 * Miscellaneous helpers
438 438 */
439 439 static int ibd_sched_poll(ibd_state_t *, int, int);
440 440 static void ibd_resume_transmission(ibd_state_t *);
441 441 static int ibd_setup_lso(ibd_swqe_t *, mblk_t *, uint32_t, ibt_ud_dest_hdl_t);
442 442 static int ibd_prepare_sgl(ibd_state_t *, mblk_t *, ibd_swqe_t *, uint_t);
443 443 static void *list_get_head(list_t *);
444 444 static int ibd_hash_key_cmp(mod_hash_key_t, mod_hash_key_t);
445 445 static uint_t ibd_hash_by_id(void *, mod_hash_key_t);
446 446
447 447 ibt_status_t ibd_get_part_attr(datalink_id_t, ibt_part_attr_t *);
448 448 ibt_status_t ibd_get_all_part_attr(ibt_part_attr_t **, int *);
449 449
450 450 #ifdef IBD_LOGGING
451 451 static void ibd_log(const char *, ...);
452 452 #endif
453 453
454 454 DDI_DEFINE_STREAM_OPS(ibd_dev_ops, nulldev, nulldev, ibd_attach, ibd_detach,
455 455 nodev, NULL, D_MP, NULL, ddi_quiesce_not_needed);
456 456
457 457 /* Module Driver Info */
458 458 static struct modldrv ibd_modldrv = {
459 459 &mod_driverops, /* This one is a driver */
460 460 "InfiniBand GLDv3 Driver", /* short description */
461 461 &ibd_dev_ops /* driver specific ops */
462 462 };
463 463
464 464 /* Module Linkage */
465 465 static struct modlinkage ibd_modlinkage = {
466 - MODREV_1, (void *)&ibd_modldrv, NULL
466 + MODREV_1, { (void *)&ibd_modldrv, NULL }
467 467 };
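
This hunk is the substance of the review: struct modlinkage (sys/modctl.h) declares its second member as an array of linkage pointers (ml_linkage), so the old flat initializer relied on brace elision, which gcc reports under -Wmissing-braces once that suppression is removed from Makefile.uts. A minimal sketch of the two forms, both of which initialize the same object:

	/* before: brace elision, triggers -Wmissing-braces */
	static struct modlinkage ibd_modlinkage = {
		MODREV_1, (void *)&ibd_modldrv, NULL
	};

	/* after: the array member is braced explicitly */
	static struct modlinkage ibd_modlinkage = {
		MODREV_1, { (void *)&ibd_modldrv, NULL }
	};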
468 468
469 469 /*
470 470 * Module (static) info passed to IBTL during ibt_attach
471 471 */
472 472 static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = {
473 473 IBTI_V_CURR,
474 474 IBT_NETWORK,
475 475 ibd_async_handler,
476 476 NULL,
477 477 "IBPART"
478 478 };
479 479
480 480 static struct ibt_clnt_modinfo_s ibdpd_clnt_modinfo = {
481 481 IBTI_V_CURR,
482 482 IBT_NETWORK,
483 483 ibdpd_async_handler,
484 484 NULL,
485 485 "IPIB"
486 486 };
487 487
488 488 /*
489 489 * GLDv3 entry points
490 490 */
491 491 #define IBD_M_CALLBACK_FLAGS \
492 492 (MC_GETCAPAB | MC_SETPROP | MC_GETPROP | MC_PROPINFO)
493 493
494 494 static mac_callbacks_t ibd_m_callbacks = {
495 495 IBD_M_CALLBACK_FLAGS,
496 496 ibd_m_stat,
497 497 ibd_m_start,
498 498 ibd_m_stop,
499 499 ibd_m_promisc,
500 500 ibd_m_multicst,
501 501 ibd_m_unicst,
502 502 ibd_m_tx,
503 503 NULL,
504 504 NULL,
505 505 ibd_m_getcapab,
506 506 NULL,
507 507 NULL,
508 508 ibd_m_setprop,
509 509 ibd_m_getprop,
510 510 ibd_m_propinfo
511 511 };
512 512
513 513 /* Private properties */
514 514 char *ibd_priv_props[] = {
515 515 "_ibd_broadcast_group",
516 516 "_ibd_coalesce_completions",
517 517 "_ibd_create_broadcast_group",
518 518 "_ibd_hash_size",
519 519 "_ibd_lso_enable",
520 520 "_ibd_num_ah",
521 521 "_ibd_num_lso_bufs",
522 522 "_ibd_rc_enable_srq",
523 523 "_ibd_rc_num_rwqe",
524 524 "_ibd_rc_num_srq",
525 525 "_ibd_rc_num_swqe",
526 526 "_ibd_rc_rx_comp_count",
527 527 "_ibd_rc_rx_comp_usec",
528 528 "_ibd_rc_rx_copy_thresh",
529 529 "_ibd_rc_rx_rwqe_thresh",
530 530 "_ibd_rc_tx_comp_count",
531 531 "_ibd_rc_tx_comp_usec",
532 532 "_ibd_rc_tx_copy_thresh",
533 533 "_ibd_ud_num_rwqe",
534 534 "_ibd_ud_num_swqe",
535 535 "_ibd_ud_rx_comp_count",
536 536 "_ibd_ud_rx_comp_usec",
537 537 "_ibd_ud_tx_comp_count",
538 538 "_ibd_ud_tx_comp_usec",
539 539 "_ibd_ud_tx_copy_thresh",
540 540 NULL
541 541 };
542 542
543 543 static int ibd_create_partition(void *, intptr_t, int, cred_t *, int *);
544 544 static int ibd_delete_partition(void *, intptr_t, int, cred_t *, int *);
545 545 static int ibd_get_partition_info(void *, intptr_t, int, cred_t *, int *);
546 546
547 547 static dld_ioc_info_t ibd_dld_ioctl_list[] = {
548 548 {IBD_CREATE_IBPART, DLDCOPYINOUT, sizeof (ibpart_ioctl_t),
549 549 ibd_create_partition, secpolicy_dl_config},
550 550 {IBD_DELETE_IBPART, DLDCOPYIN, sizeof (ibpart_ioctl_t),
551 551 ibd_delete_partition, secpolicy_dl_config},
552 552 {IBD_INFO_IBPART, DLDCOPYIN, sizeof (ibd_ioctl_t),
553 553 ibd_get_partition_info, NULL}
554 554 };
555 555
556 556 /*
557 557 * Fill/clear <scope> and <p_key> in multicast/broadcast address
558 558 */
559 559 #define IBD_FILL_SCOPE_PKEY(maddr, scope, pkey) \
560 560 { \
561 561 *(uint32_t *)((char *)(maddr) + 4) |= \
562 562 htonl((uint32_t)(scope) << 16); \
563 563 *(uint32_t *)((char *)(maddr) + 8) |= \
564 564 htonl((uint32_t)(pkey) << 16); \
565 565 }
566 566
567 567 #define IBD_CLEAR_SCOPE_PKEY(maddr) \
568 568 { \
569 569 *(uint32_t *)((char *)(maddr) + 4) &= \
570 570 htonl(~((uint32_t)0xF << 16)); \
571 571 *(uint32_t *)((char *)(maddr) + 8) &= \
572 572 htonl(~((uint32_t)0xFFFF << 16)); \
573 573 }
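
A worked example of what these macros touch (the particular scope and pkey values are made up): the GID portion of the 20-byte IPoIB address starts at byte offset 4, so the first OR lands the scope nibble in GID byte 1 and the second lands the pkey in GID bytes 4-5. Filling scope 0x2 (link-local) and pkey 0x8001 into the IPv4 broadcast template thus gives:

	ff10:401b:0000:0000:0000:0000:ffff:ffff   (template, scope/pkey cleared)
	ff12:401b:8001:0000:0000:0000:ffff:ffff   (after IBD_FILL_SCOPE_PKEY)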
574 574
575 575 /*
576 576 * Rudimentary debugging support
577 577 */
578 578 #ifdef DEBUG
579 579 int ibd_debuglevel = 100;
580 580 void
581 581 debug_print(int l, char *fmt, ...)
582 582 {
583 583 va_list ap;
584 584
585 585 if (l < ibd_debuglevel)
586 586 return;
587 587 va_start(ap, fmt);
588 588 vcmn_err(CE_CONT, fmt, ap);
589 589 va_end(ap);
590 590 }
591 591 #endif
592 592
593 593 /*
594 594 * Common routine to print warning messages; adds in the HCA GUID, port number
595 595 * and PKEY to identify the IBA interface.
596 596 */
597 597 void
598 598 ibd_print_warn(ibd_state_t *state, char *fmt, ...)
599 599 {
600 600 ib_guid_t hca_guid;
601 601 char ibd_print_buf[MAXNAMELEN + 256];
602 602 int len;
603 603 va_list ap;
604 604 char part_name[MAXNAMELEN];
605 605 datalink_id_t linkid = state->id_plinkid;
606 606
607 607 hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip,
608 608 0, "hca-guid", 0);
609 609 (void) dls_mgmt_get_linkinfo(linkid, part_name, NULL, NULL, NULL);
610 610 len = snprintf(ibd_print_buf, sizeof (ibd_print_buf),
611 611 "%s%d: HCA GUID %016llx port %d PKEY %02x link %s ",
612 612 ddi_driver_name(state->id_dip), ddi_get_instance(state->id_dip),
613 613 (u_longlong_t)hca_guid, state->id_port, state->id_pkey,
614 614 part_name);
615 615 va_start(ap, fmt);
616 616 (void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len,
617 617 fmt, ap);
618 618 cmn_err(CE_NOTE, "!%s", ibd_print_buf);
619 619 va_end(ap);
620 620 }
621 621
622 622 /*
623 623 * Warlock directives
624 624 */
625 625
626 626 /*
627 627 * id_lso_lock
628 628 *
629 629 * state->id_lso->bkt_nfree may be accessed without a lock to
630 630 * determine the threshold at which we have to ask the nw layer
631 631 * to resume transmission (see ibd_resume_transmission()).
632 632 */
633 633 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_lso_lock,
634 634 ibd_state_t::id_lso))
635 635 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_lso))
636 636 _NOTE(SCHEME_PROTECTS_DATA("init", ibd_state_t::id_lso_policy))
637 637 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_lsobkt_t::bkt_nfree))
638 638
639 639 /*
640 640 * id_scq_poll_lock
641 641 */
642 642 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_scq_poll_lock,
643 643 ibd_state_t::id_scq_poll_busy))
644 644
645 645 /*
646 646 * id_txpost_lock
647 647 */
648 648 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
649 649 ibd_state_t::id_tx_head))
650 650 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
651 651 ibd_state_t::id_tx_busy))
652 652
653 653 /*
654 654 * id_acache_req_lock
655 655 */
656 656 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
657 657 ibd_state_t::id_acache_req_cv))
658 658 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
659 659 ibd_state_t::id_req_list))
660 660 _NOTE(SCHEME_PROTECTS_DATA("atomic",
661 661 ibd_acache_s::ac_ref))
662 662
663 663 /*
664 664 * id_ac_mutex
665 665 *
666 666 * This mutex is actually supposed to protect id_ah_op as well,
667 667 * but this path of the code isn't clean (see update of id_ah_op
668 668 * in ibd_async_acache(), immediately after the call to
669 669 * ibd_async_mcache()). For now, we'll skip this check by
670 670 * declaring that id_ah_op is protected by some internal scheme
671 671 * that warlock isn't aware of.
672 672 */
673 673 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
674 674 ibd_state_t::id_ah_active))
675 675 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
676 676 ibd_state_t::id_ah_free))
677 677 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
678 678 ibd_state_t::id_ah_addr))
679 679 _NOTE(SCHEME_PROTECTS_DATA("ac mutex should protect this",
680 680 ibd_state_t::id_ah_op))
681 681 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
682 682 ibd_state_t::id_ah_error))
683 683 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
684 684 ibd_state_t::id_ac_hot_ace))
685 685 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_ah_error))
686 686
687 687 /*
688 688 * id_mc_mutex
689 689 */
690 690 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
691 691 ibd_state_t::id_mc_full))
692 692 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
693 693 ibd_state_t::id_mc_non))
694 694
695 695 /*
696 696 * id_trap_lock
697 697 */
698 698 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
699 699 ibd_state_t::id_trap_cv))
700 700 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
701 701 ibd_state_t::id_trap_stop))
702 702 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
703 703 ibd_state_t::id_trap_inprog))
704 704
705 705 /*
706 706 * id_prom_op
707 707 */
708 708 _NOTE(SCHEME_PROTECTS_DATA("only by async thread",
709 709 ibd_state_t::id_prom_op))
710 710
711 711 /*
712 712 * id_sched_lock
713 713 */
714 714 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_sched_lock,
715 715 ibd_state_t::id_sched_needed))
716 716
717 717 /*
718 718 * id_link_mutex
719 719 */
720 720 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex,
721 721 ibd_state_t::id_link_state))
722 722 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_link_state))
723 723 _NOTE(SCHEME_PROTECTS_DATA("only async thr and ibd_m_start",
724 724 ibd_state_t::id_link_speed))
725 725 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_sgid))
726 726
727 727 /*
728 728 * id_tx_list.dl_mutex
729 729 */
730 730 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
731 731 ibd_state_t::id_tx_list.dl_head))
732 732 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
733 733 ibd_state_t::id_tx_list.dl_pending_sends))
734 734 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
735 735 ibd_state_t::id_tx_list.dl_cnt))
736 736
737 737 /*
738 738 * id_rx_list.dl_mutex
739 739 */
740 740 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
741 741 ibd_state_t::id_rx_list.dl_bufs_outstanding))
742 742 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
743 743 ibd_state_t::id_rx_list.dl_cnt))
744 744
745 745 /*
746 746 * rc_timeout_lock
747 747 */
748 748 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::rc_timeout_lock,
749 749 ibd_state_t::rc_timeout_start))
750 750 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::rc_timeout_lock,
751 751 ibd_state_t::rc_timeout))
752 752
753 753
754 754 /*
755 755 * Items protected by atomic updates
756 756 */
757 757 _NOTE(SCHEME_PROTECTS_DATA("atomic update only",
758 758 ibd_state_s::id_brd_rcv
759 759 ibd_state_s::id_brd_xmt
760 760 ibd_state_s::id_multi_rcv
761 761 ibd_state_s::id_multi_xmt
762 762 ibd_state_s::id_num_intrs
763 763 ibd_state_s::id_rcv_bytes
764 764 ibd_state_s::id_rcv_pkt
765 765 ibd_state_s::id_rx_post_queue_index
766 766 ibd_state_s::id_tx_short
767 767 ibd_state_s::id_xmt_bytes
768 768 ibd_state_s::id_xmt_pkt
769 769 ibd_state_s::rc_rcv_trans_byte
770 770 ibd_state_s::rc_rcv_trans_pkt
771 771 ibd_state_s::rc_rcv_copy_byte
772 772 ibd_state_s::rc_rcv_copy_pkt
773 773 ibd_state_s::rc_xmt_bytes
774 774 ibd_state_s::rc_xmt_small_pkt
775 775 ibd_state_s::rc_xmt_fragmented_pkt
776 776 ibd_state_s::rc_xmt_map_fail_pkt
777 777 ibd_state_s::rc_xmt_map_succ_pkt
778 778 ibd_rc_chan_s::rcq_invoking))
779 779
780 780 /*
781 781 * Non-mutex protection schemes for data elements. Almost all of
782 782 * these are non-shared items.
783 783 */
784 784 _NOTE(SCHEME_PROTECTS_DATA("unshared or single-threaded",
785 785 callb_cpr
786 786 ib_gid_s
787 787 ib_header_info
788 788 ibd_acache_rq
789 789 ibd_acache_s::ac_mce
790 790 ibd_acache_s::ac_chan
791 791 ibd_mcache::mc_fullreap
792 792 ibd_mcache::mc_jstate
793 793 ibd_mcache::mc_req
794 794 ibd_rwqe_s
795 795 ibd_swqe_s
796 796 ibd_wqe_s
797 797 ibt_wr_ds_s::ds_va
798 798 ibt_wr_lso_s
799 799 ipoib_mac::ipoib_qpn
800 800 mac_capab_lso_s
801 801 msgb::b_next
802 802 msgb::b_cont
803 803 msgb::b_rptr
804 804 msgb::b_wptr
805 805 ibd_state_s::id_bgroup_created
806 806 ibd_state_s::id_mac_state
807 807 ibd_state_s::id_mtu
808 808 ibd_state_s::id_ud_num_rwqe
809 809 ibd_state_s::id_ud_num_swqe
810 810 ibd_state_s::id_qpnum
811 811 ibd_state_s::id_rcq_hdl
812 812 ibd_state_s::id_rx_buf_sz
813 813 ibd_state_s::id_rx_bufs
814 814 ibd_state_s::id_rx_mr_hdl
815 815 ibd_state_s::id_rx_wqes
816 816 ibd_state_s::id_rxwcs
817 817 ibd_state_s::id_rxwcs_size
818 818 ibd_state_s::id_rx_nqueues
819 819 ibd_state_s::id_rx_queues
820 820 ibd_state_s::id_scope
821 821 ibd_state_s::id_scq_hdl
822 822 ibd_state_s::id_tx_buf_sz
823 823 ibd_state_s::id_tx_bufs
824 824 ibd_state_s::id_tx_mr_hdl
825 825 ibd_state_s::id_tx_rel_list.dl_cnt
826 826 ibd_state_s::id_tx_wqes
827 827 ibd_state_s::id_txwcs
828 828 ibd_state_s::id_txwcs_size
829 829 ibd_state_s::rc_listen_hdl
830 830 ibd_state_s::rc_listen_hdl_OFED_interop
831 831 ibd_state_s::rc_srq_size
832 832 ibd_state_s::rc_srq_rwqes
833 833 ibd_state_s::rc_srq_rx_bufs
834 834 ibd_state_s::rc_srq_rx_mr_hdl
835 835 ibd_state_s::rc_tx_largebuf_desc_base
836 836 ibd_state_s::rc_tx_mr_bufs
837 837 ibd_state_s::rc_tx_mr_hdl
838 838 ipha_s
839 839 icmph_s
840 840 ibt_path_info_s::pi_sid
841 841 ibd_rc_chan_s::ace
842 842 ibd_rc_chan_s::chan_hdl
843 843 ibd_rc_chan_s::state
844 844 ibd_rc_chan_s::chan_state
845 845 ibd_rc_chan_s::is_tx_chan
846 846 ibd_rc_chan_s::rcq_hdl
847 847 ibd_rc_chan_s::rcq_size
848 848 ibd_rc_chan_s::scq_hdl
849 849 ibd_rc_chan_s::scq_size
850 850 ibd_rc_chan_s::rx_bufs
851 851 ibd_rc_chan_s::rx_mr_hdl
852 852 ibd_rc_chan_s::rx_rwqes
853 853 ibd_rc_chan_s::tx_wqes
854 854 ibd_rc_chan_s::tx_mr_bufs
855 855 ibd_rc_chan_s::tx_mr_hdl
856 856 ibd_rc_chan_s::tx_rel_list.dl_cnt
857 857 ibd_rc_chan_s::is_used
858 858 ibd_rc_tx_largebuf_s::lb_buf
859 859 ibd_rc_msg_hello_s
860 860 ibt_cm_return_args_s))
861 861
862 862 /*
863 863 * ibd_rc_chan_s::next is protected by two mutexes:
864 864 * 1) ibd_state_s::rc_pass_chan_list.chan_list_mutex
865 865 * 2) ibd_state_s::rc_obs_act_chan_list.chan_list_mutex.
866 866 */
867 867 _NOTE(SCHEME_PROTECTS_DATA("protected by two mutexes",
868 868 ibd_rc_chan_s::next))
869 869
870 870 /*
871 871 * ibd_state_s.rc_tx_large_bufs_lock
872 872 */
873 873 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
874 874 ibd_state_s::rc_tx_largebuf_free_head))
875 875 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
876 876 ibd_state_s::rc_tx_largebuf_nfree))
877 877 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
878 878 ibd_rc_tx_largebuf_s::lb_next))
879 879
880 880 /*
881 881 * ibd_acache_s.tx_too_big_mutex
882 882 */
883 883 _NOTE(MUTEX_PROTECTS_DATA(ibd_acache_s::tx_too_big_mutex,
884 884 ibd_acache_s::tx_too_big_ongoing))
885 885
886 886 /*
887 887 * tx_wqe_list.dl_mutex
888 888 */
889 889 _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
890 890 ibd_rc_chan_s::tx_wqe_list.dl_head))
891 891 _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
892 892 ibd_rc_chan_s::tx_wqe_list.dl_pending_sends))
893 893 _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
894 894 ibd_rc_chan_s::tx_wqe_list.dl_cnt))
895 895
896 896 /*
897 897 * ibd_state_s.rc_ace_recycle_lock
898 898 */
899 899 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_ace_recycle_lock,
900 900 ibd_state_s::rc_ace_recycle))
901 901
902 902 /*
903 903 * rc_srq_rwqe_list.dl_mutex
904 904 */
905 905 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
906 906 ibd_state_t::rc_srq_rwqe_list.dl_bufs_outstanding))
907 907 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
908 908 ibd_state_t::rc_srq_rwqe_list.dl_cnt))
909 909
910 910 /*
911 911 * Non-mutex protection schemes for data elements. They are counters
912 912 * for problem diagnosis. They do not need to be protected.
913 913 */
914 914 _NOTE(SCHEME_PROTECTS_DATA("counters for problem diagnosis",
915 915 ibd_state_s::rc_rcv_alloc_fail
916 916 ibd_state_s::rc_rcq_err
917 917 ibd_state_s::rc_ace_not_found
918 918 ibd_state_s::rc_xmt_drop_too_long_pkt
919 919 ibd_state_s::rc_xmt_icmp_too_long_pkt
920 920 ibd_state_s::rc_xmt_reenter_too_long_pkt
921 921 ibd_state_s::rc_swqe_short
922 922 ibd_state_s::rc_swqe_mac_update
923 923 ibd_state_s::rc_xmt_buf_short
924 924 ibd_state_s::rc_xmt_buf_mac_update
925 925 ibd_state_s::rc_scq_no_swqe
926 926 ibd_state_s::rc_scq_no_largebuf
927 927 ibd_state_s::rc_conn_succ
928 928 ibd_state_s::rc_conn_fail
929 929 ibd_state_s::rc_null_conn
930 930 ibd_state_s::rc_no_estab_conn
931 931 ibd_state_s::rc_act_close
932 932 ibd_state_s::rc_pas_close
933 933 ibd_state_s::rc_delay_ace_recycle
934 934 ibd_state_s::rc_act_close_simultaneous
935 935 ibd_state_s::rc_act_close_not_clean
936 936 ibd_state_s::rc_pas_close_rcq_invoking
937 937 ibd_state_s::rc_reset_cnt
938 938 ibd_state_s::rc_timeout_act
939 939 ibd_state_s::rc_timeout_pas
940 940 ibd_state_s::rc_stop_connect))
941 941
942 942 #ifdef DEBUG
943 943 /*
944 944 * Non-mutex protection schemes for data elements. They are counters
945 945 * for problem diagnosis. They do not need to be protected.
946 946 */
947 947 _NOTE(SCHEME_PROTECTS_DATA("counters for problem diagnosis",
948 948 ibd_state_s::rc_rwqe_short
949 949 ibd_rc_stat_s::rc_rcv_trans_byte
950 950 ibd_rc_stat_s::rc_rcv_trans_pkt
951 951 ibd_rc_stat_s::rc_rcv_copy_byte
952 952 ibd_rc_stat_s::rc_rcv_copy_pkt
953 953 ibd_rc_stat_s::rc_rcv_alloc_fail
954 954 ibd_rc_stat_s::rc_rcq_err
955 955 ibd_rc_stat_s::rc_rwqe_short
956 956 ibd_rc_stat_s::rc_xmt_bytes
957 957 ibd_rc_stat_s::rc_xmt_small_pkt
958 958 ibd_rc_stat_s::rc_xmt_fragmented_pkt
959 959 ibd_rc_stat_s::rc_xmt_map_fail_pkt
960 960 ibd_rc_stat_s::rc_xmt_map_succ_pkt
961 961 ibd_rc_stat_s::rc_ace_not_found
962 962 ibd_rc_stat_s::rc_scq_no_swqe
963 963 ibd_rc_stat_s::rc_scq_no_largebuf
964 964 ibd_rc_stat_s::rc_swqe_short
965 965 ibd_rc_stat_s::rc_swqe_mac_update
966 966 ibd_rc_stat_s::rc_xmt_buf_short
967 967 ibd_rc_stat_s::rc_xmt_buf_mac_update
968 968 ibd_rc_stat_s::rc_conn_succ
969 969 ibd_rc_stat_s::rc_conn_fail
970 970 ibd_rc_stat_s::rc_null_conn
971 971 ibd_rc_stat_s::rc_no_estab_conn
972 972 ibd_rc_stat_s::rc_act_close
973 973 ibd_rc_stat_s::rc_pas_close
974 974 ibd_rc_stat_s::rc_delay_ace_recycle
975 975 ibd_rc_stat_s::rc_act_close_simultaneous
976 976 ibd_rc_stat_s::rc_reset_cnt
977 977 ibd_rc_stat_s::rc_timeout_act
978 978 ibd_rc_stat_s::rc_timeout_pas))
979 979 #endif
980 980
981 981 int
982 982 _init()
983 983 {
984 984 int status;
985 985
986 986 status = ddi_soft_state_init(&ibd_list, max(sizeof (ibd_state_t),
987 987 PAGESIZE), 0);
988 988 if (status != 0) {
989 989 DPRINT(10, "_init:failed in ddi_soft_state_init()");
990 990 return (status);
991 991 }
992 992
993 993 mutex_init(&ibd_objlist_lock, NULL, MUTEX_DRIVER, NULL);
994 994
995 995 mac_init_ops(&ibd_dev_ops, "ibp");
996 996 status = mod_install(&ibd_modlinkage);
997 997 if (status != 0) {
998 998 DPRINT(10, "_init:failed in mod_install()");
999 999 ddi_soft_state_fini(&ibd_list);
1000 1000 mac_fini_ops(&ibd_dev_ops);
1001 1001 return (status);
1002 1002 }
1003 1003
1004 1004 mutex_init(&ibd_gstate.ig_mutex, NULL, MUTEX_DRIVER, NULL);
1005 1005 mutex_enter(&ibd_gstate.ig_mutex);
1006 1006 ibd_gstate.ig_ibt_hdl = NULL;
1007 1007 ibd_gstate.ig_ibt_hdl_ref_cnt = 0;
1008 1008 ibd_gstate.ig_service_list = NULL;
1009 1009 mutex_exit(&ibd_gstate.ig_mutex);
1010 1010
1011 1011 if (dld_ioc_register(IBPART_IOC, ibd_dld_ioctl_list,
1012 1012 DLDIOCCNT(ibd_dld_ioctl_list)) != 0) {
1013 1013 return (EIO);
1014 1014 }
1015 1015
1016 1016 ibt_register_part_attr_cb(ibd_get_part_attr, ibd_get_all_part_attr);
1017 1017
1018 1018 #ifdef IBD_LOGGING
1019 1019 ibd_log_init();
1020 1020 #endif
1021 1021 return (0);
1022 1022 }
1023 1023
1024 1024 int
1025 1025 _info(struct modinfo *modinfop)
1026 1026 {
1027 1027 return (mod_info(&ibd_modlinkage, modinfop));
1028 1028 }
1029 1029
1030 1030 int
1031 1031 _fini()
1032 1032 {
1033 1033 int status;
1034 1034
1035 1035 status = mod_remove(&ibd_modlinkage);
1036 1036 if (status != 0)
1037 1037 return (status);
1038 1038
1039 1039 ibt_unregister_part_attr_cb();
1040 1040
1041 1041 mac_fini_ops(&ibd_dev_ops);
1042 1042 mutex_destroy(&ibd_objlist_lock);
1043 1043 ddi_soft_state_fini(&ibd_list);
1044 1044 mutex_destroy(&ibd_gstate.ig_mutex);
1045 1045 #ifdef IBD_LOGGING
1046 1046 ibd_log_fini();
1047 1047 #endif
1048 1048 return (0);
1049 1049 }
1050 1050
1051 1051 /*
1052 1052 * Convert the GID part of the mac address from network byte order
1053 1053 * to host order.
1054 1054 */
1055 1055 static void
1056 1056 ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid)
1057 1057 {
1058 1058 ib_sn_prefix_t nbopref;
1059 1059 ib_guid_t nboguid;
1060 1060
1061 1061 bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t));
1062 1062 bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t));
1063 1063 dgid->gid_prefix = b2h64(nbopref);
1064 1064 dgid->gid_guid = b2h64(nboguid);
1065 1065 }
1066 1066
1067 1067 /*
1068 1068 * Create the IPoIB address in network byte order from host order inputs.
1069 1069 */
1070 1070 static void
1071 1071 ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix,
1072 1072 ib_guid_t guid)
1073 1073 {
1074 1074 ib_sn_prefix_t nbopref;
1075 1075 ib_guid_t nboguid;
1076 1076
1077 1077 mac->ipoib_qpn = htonl(qpn);
1078 1078 nbopref = h2b64(prefix);
1079 1079 nboguid = h2b64(guid);
1080 1080 bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t));
1081 1081 bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t));
1082 1082 }
1083 1083
1084 1084 /*
1085 1085 * Send to the appropriate all-routers group when the IBA multicast group
1086 1086 * does not exist, based on whether the target group is v4 or v6.
1087 1087 */
1088 1088 static boolean_t
1089 1089 ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac,
1090 1090 ipoib_mac_t *rmac)
1091 1091 {
1092 1092 boolean_t retval = B_TRUE;
1093 1093 uint32_t adjscope = state->id_scope << 16;
1094 1094 uint32_t topword;
1095 1095
1096 1096 /*
1097 1097 * Copy the first 4 bytes in without assuming any alignment of
1098 1098 * input mac address; this will have IPoIB signature, flags and
1099 1099 * scope bits.
1100 1100 */
1101 1101 bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t));
1102 1102 topword = ntohl(topword);
1103 1103
1104 1104 /*
1105 1105 * Generate proper address for IPv4/v6, adding in the Pkey properly.
1106 1106 */
1107 1107 if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) ||
1108 1108 (topword == (IB_MCGID_IPV6_PREFIX | adjscope)))
1109 1109 ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) |
1110 1110 ((uint32_t)(state->id_pkey << 16))),
1111 1111 (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP));
1112 1112 else
1113 1113 /*
1114 1114 * Does not have proper bits in the mgid address.
1115 1115 */
1116 1116 retval = B_FALSE;
1117 1117
1118 1118 return (retval);
1119 1119 }
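
A small aside on the router-group address built above (arithmetic only): INADDR_ALLRTRS_GROUP is 224.0.0.2 and INADDR_UNSPEC_GROUP is 224.0.0.0, so the GUID argument passed to ibd_h2n_mac() is simply 2; the constructed MGID keeps the caller's prefix, scope and pkey and its suffix ends in ...:0000:0002.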
1120 1120
1121 1121 /*
1122 1122 * Membership states for different mcg's are tracked by two lists:
1123 1123 * the "non" list is used for promiscuous mode, when all mcg traffic
1124 1124 * needs to be inspected. This type of membership is never used for
1125 1125 * transmission, so there can not be an AH in the active list
1126 1126 * corresponding to a member in this list. This list does not need
1127 1127 * any protection, since all operations are performed by the async
1128 1128 * thread.
1129 1129 *
1130 1130 * "Full" and "SendOnly" membership is tracked using a single list,
1131 1131 * the "full" list. This is because this single list can then be
1132 1132 * searched during transmit to a multicast group (if an AH for the
1133 1133 * mcg is not found in the active list), since at least one type
1134 1134 * of membership must be present before initiating the transmit.
1135 1135 * This list is also emptied during driver detach, since sendonly
1136 1136 * membership acquired during transmit is dropped at detach time
1137 1137 * along with ipv4 broadcast full membership. Insert/deletes to
1138 1138 * this list are done only by the async thread, but it is also
1139 1139 * searched in program context (see multicast disable case), thus
1140 1140 * the id_mc_mutex protects the list. The driver detach path also
1141 1141 * deconstructs the "full" list, but it ensures that the async
1142 1142 * thread will not be accessing the list (by blocking out mcg
1143 1143 * trap handling and making sure no more Tx reaping will happen).
1144 1144 *
1145 1145 * Currently, an IBA attach is done in the SendOnly case too,
1146 1146 * although this is not required.
1147 1147 */
1148 1148 #define IBD_MCACHE_INSERT_FULL(state, mce) \
1149 1149 list_insert_head(&state->id_mc_full, mce)
1150 1150 #define IBD_MCACHE_INSERT_NON(state, mce) \
1151 1151 list_insert_head(&state->id_mc_non, mce)
1152 1152 #define IBD_MCACHE_FIND_FULL(state, mgid) \
1153 1153 ibd_mcache_find(mgid, &state->id_mc_full)
1154 1154 #define IBD_MCACHE_FIND_NON(state, mgid) \
1155 1155 ibd_mcache_find(mgid, &state->id_mc_non)
1156 1156 #define IBD_MCACHE_PULLOUT_FULL(state, mce) \
1157 1157 list_remove(&state->id_mc_full, mce)
1158 1158 #define IBD_MCACHE_PULLOUT_NON(state, mce) \
1159 1159 list_remove(&state->id_mc_non, mce)
1160 1160
1161 1161 static void *
1162 1162 list_get_head(list_t *list)
1163 1163 {
1164 1164 list_node_t *lhead = list_head(list);
1165 1165
1166 1166 if (lhead != NULL)
1167 1167 list_remove(list, lhead);
1168 1168 return (lhead);
1169 1169 }
1170 1170
1171 1171 /*
1172 1172 * This is always guaranteed to be able to queue the work.
1173 1173 */
1174 1174 void
1175 1175 ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op)
1176 1176 {
1177 1177 /* Initialize request */
1178 1178 DPRINT(1, "ibd_queue_work_slot : op: %d \n", op);
1179 1179 ptr->rq_op = op;
1180 1180
1181 1181 /*
1182 1182 * Queue provided slot onto request pool.
1183 1183 */
1184 1184 mutex_enter(&state->id_acache_req_lock);
1185 1185 list_insert_tail(&state->id_req_list, ptr);
1186 1186
1187 1187 /* Go, fetch, async thread */
1188 1188 cv_signal(&state->id_acache_req_cv);
1189 1189 mutex_exit(&state->id_acache_req_lock);
1190 1190 }
1191 1191
1192 1192 /*
1193 1193 * Main body of the per interface async thread.
1194 1194 */
1195 1195 static void
1196 1196 ibd_async_work(ibd_state_t *state)
1197 1197 {
1198 1198 ibd_req_t *ptr;
1199 1199 callb_cpr_t cprinfo;
1200 1200
1201 1201 mutex_enter(&state->id_acache_req_lock);
1202 1202 CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock,
1203 1203 callb_generic_cpr, "ibd_async_work");
1204 1204
1205 1205 for (;;) {
1206 1206 ptr = list_get_head(&state->id_req_list);
1207 1207 if (ptr != NULL) {
1208 1208 mutex_exit(&state->id_acache_req_lock);
1209 1209
1210 1210 /*
1211 1211 * If we are in late hca initialization mode, do not
1212 1212 * process any async request other than TRAP. TRAP
1213 1213 * is used for indicating creation of a broadcast group;
1214 1214 * in which case, we need to join/create the group.
1215 1215 */
1216 1216 if ((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) &&
1217 1217 (ptr->rq_op != IBD_ASYNC_TRAP)) {
1218 1218 goto free_req_and_continue;
1219 1219 }
1220 1220
1221 1221 /*
1222 1222 * Once we have done the operation, there is no
1223 1223 * guarantee the request slot is going to be valid,
1224 1224 * it might be freed up (as in IBD_ASYNC_LEAVE, REAP,
1225 1225 * TRAP).
1226 1226 *
1227 1227 * Perform the request.
1228 1228 */
1229 1229 switch (ptr->rq_op) {
1230 1230 case IBD_ASYNC_GETAH:
1231 1231 ibd_async_acache(state, &ptr->rq_mac);
1232 1232 break;
1233 1233 case IBD_ASYNC_JOIN:
1234 1234 case IBD_ASYNC_LEAVE:
1235 1235 ibd_async_multicast(state,
1236 1236 ptr->rq_gid, ptr->rq_op);
1237 1237 break;
1238 1238 case IBD_ASYNC_PROMON:
1239 1239 ibd_async_setprom(state);
1240 1240 break;
1241 1241 case IBD_ASYNC_PROMOFF:
1242 1242 ibd_async_unsetprom(state);
1243 1243 break;
1244 1244 case IBD_ASYNC_REAP:
1245 1245 ibd_async_reap_group(state,
1246 1246 ptr->rq_ptr, ptr->rq_gid,
1247 1247 IB_MC_JSTATE_FULL);
1248 1248 /*
1249 1249 * the req buf is contained in the mce
1250 1250 * structure, so we do not need
1251 1251 * to free it here.
1252 1252 */
1253 1253 ptr = NULL;
1254 1254 break;
1255 1255 case IBD_ASYNC_TRAP:
1256 1256 ibd_async_trap(state, ptr);
1257 1257 break;
1258 1258 case IBD_ASYNC_SCHED:
1259 1259 ibd_async_txsched(state);
1260 1260 break;
1261 1261 case IBD_ASYNC_LINK:
1262 1262 ibd_async_link(state, ptr);
1263 1263 break;
1264 1264 case IBD_ASYNC_EXIT:
1265 1265 mutex_enter(&state->id_acache_req_lock);
1266 1266 #ifndef __lock_lint
1267 1267 CALLB_CPR_EXIT(&cprinfo);
1268 1268 #else
1269 1269 mutex_exit(&state->id_acache_req_lock);
1270 1270 #endif
1271 1271 return;
1272 1272 case IBD_ASYNC_RC_TOO_BIG:
1273 1273 ibd_async_rc_process_too_big(state,
1274 1274 ptr);
1275 1275 break;
1276 1276 case IBD_ASYNC_RC_CLOSE_ACT_CHAN:
1277 1277 ibd_async_rc_close_act_chan(state, ptr);
1278 1278 break;
1279 1279 case IBD_ASYNC_RC_RECYCLE_ACE:
1280 1280 ibd_async_rc_recycle_ace(state, ptr);
1281 1281 break;
1282 1282 case IBD_ASYNC_RC_CLOSE_PAS_CHAN:
1283 1283 (void) ibd_rc_pas_close(ptr->rq_ptr,
1284 1284 B_TRUE, B_TRUE);
1285 1285 break;
1286 1286 }
1287 1287 free_req_and_continue:
1288 1288 if (ptr != NULL)
1289 1289 kmem_cache_free(state->id_req_kmc, ptr);
1290 1290
1291 1291 mutex_enter(&state->id_acache_req_lock);
1292 1292 } else {
1293 1293 #ifndef __lock_lint
1294 1294 /*
1295 1295 * Nothing to do: wait till new request arrives.
1296 1296 */
1297 1297 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1298 1298 cv_wait(&state->id_acache_req_cv,
1299 1299 &state->id_acache_req_lock);
1300 1300 CALLB_CPR_SAFE_END(&cprinfo,
1301 1301 &state->id_acache_req_lock);
1302 1302 #endif
1303 1303 }
1304 1304 }
1305 1305
1306 1306 /*NOTREACHED*/
1307 1307 _NOTE(NOT_REACHED)
1308 1308 }
1309 1309
1310 1310 /*
1311 1311 * Return when it is safe to queue requests to the async daemon; primarily
1312 1312 * for subnet trap and async event handling. Disallow requests before the
1313 1313 * daemon is created, and when interface deinitialization starts.
1314 1314 */
1315 1315 static boolean_t
1316 1316 ibd_async_safe(ibd_state_t *state)
1317 1317 {
1318 1318 mutex_enter(&state->id_trap_lock);
1319 1319 if (state->id_trap_stop) {
1320 1320 mutex_exit(&state->id_trap_lock);
1321 1321 return (B_FALSE);
1322 1322 }
1323 1323 state->id_trap_inprog++;
1324 1324 mutex_exit(&state->id_trap_lock);
1325 1325 return (B_TRUE);
1326 1326 }
1327 1327
1328 1328 /*
1329 1329 * Wake up ibd_m_stop() if the unplumb code is waiting for pending subnet
1330 1330 * trap or event handling to complete to kill the async thread and deconstruct
1331 1331 * the mcg/ace list.
1332 1332 */
1333 1333 static void
1334 1334 ibd_async_done(ibd_state_t *state)
1335 1335 {
1336 1336 mutex_enter(&state->id_trap_lock);
1337 1337 if (--state->id_trap_inprog == 0)
1338 1338 cv_signal(&state->id_trap_cv);
1339 1339 mutex_exit(&state->id_trap_lock);
1340 1340 }
1341 1341
1342 1342 /*
1343 1343 * Hash functions:
1344 1344 * ibd_hash_by_id: Returns the qpn as the hash entry into bucket.
1345 1345 * ibd_hash_key_cmp: Compares two keys; returns 0 if they match, else 1.
1346 1346 * These operate on mac addresses input into ibd_send, but there is no
1347 1347 * guarantee on the alignment of the ipoib_mac_t structure.
1348 1348 */
1349 1349 /*ARGSUSED*/
1350 1350 static uint_t
1351 1351 ibd_hash_by_id(void *hash_data, mod_hash_key_t key)
1352 1352 {
1353 1353 ulong_t ptraddr = (ulong_t)key;
1354 1354 uint_t hval;
1355 1355
1356 1356 /*
1357 1357 * If the input address is 4 byte aligned, we can just dereference
1358 1358 * it. This is most common, since IP will send in a 4 byte aligned
1359 1359 * IP header, which implies the 24 byte IPoIB pseudo header will be
1360 1360 * 4 byte aligned too.
1361 1361 */
1362 1362 if ((ptraddr & 3) == 0)
1363 1363 return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn);
1364 1364
1365 1365 bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t));
1366 1366 return (hval);
1367 1367 }
1368 1368
1369 1369 static int
1370 1370 ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
1371 1371 {
1372 1372 if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0)
1373 1373 return (0);
1374 1374 else
1375 1375 return (1);
1376 1376 }
1377 1377
1378 1378 /*
1379 1379 * Initialize all the per interface caches and lists; AH cache,
1380 1380 * MCG list etc.
1381 1381 */
1382 1382 static int
1383 1383 ibd_acache_init(ibd_state_t *state)
1384 1384 {
1385 1385 ibd_ace_t *ce;
1386 1386 int i;
1387 1387
1388 1388 mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL);
1389 1389 mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL);
1390 1390 mutex_enter(&state->id_ac_mutex);
1391 1391 list_create(&state->id_ah_free, sizeof (ibd_ace_t),
1392 1392 offsetof(ibd_ace_t, ac_list));
1393 1393 list_create(&state->id_ah_active, sizeof (ibd_ace_t),
1394 1394 offsetof(ibd_ace_t, ac_list));
1395 1395 state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash",
1396 1396 state->id_hash_size, mod_hash_null_keydtor, mod_hash_null_valdtor,
1397 1397 ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP);
1398 1398 list_create(&state->id_mc_full, sizeof (ibd_mce_t),
1399 1399 offsetof(ibd_mce_t, mc_list));
1400 1400 list_create(&state->id_mc_non, sizeof (ibd_mce_t),
1401 1401 offsetof(ibd_mce_t, mc_list));
1402 1402 state->id_ac_hot_ace = NULL;
1403 1403
1404 1404 state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) *
1405 1405 state->id_num_ah, KM_SLEEP);
1406 1406 for (i = 0; i < state->id_num_ah; i++, ce++) {
1407 1407 if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS,
1408 1408 state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) {
1409 1409 mutex_exit(&state->id_ac_mutex);
1410 1410 ibd_acache_fini(state);
1411 1411 return (DDI_FAILURE);
1412 1412 } else {
1413 1413 CLEAR_REFCYCLE(ce);
1414 1414 ce->ac_mce = NULL;
1415 1415 mutex_init(&ce->tx_too_big_mutex, NULL,
1416 1416 MUTEX_DRIVER, NULL);
1417 1417 IBD_ACACHE_INSERT_FREE(state, ce);
1418 1418 }
1419 1419 }
1420 1420 mutex_exit(&state->id_ac_mutex);
1421 1421 return (DDI_SUCCESS);
1422 1422 }
1423 1423
1424 1424 static void
1425 1425 ibd_acache_fini(ibd_state_t *state)
1426 1426 {
1427 1427 ibd_ace_t *ptr;
1428 1428
1429 1429 mutex_enter(&state->id_ac_mutex);
1430 1430
1431 1431 while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) {
1432 1432 ASSERT(GET_REF(ptr) == 0);
1433 1433 mutex_destroy(&ptr->tx_too_big_mutex);
1434 1434 (void) ibt_free_ud_dest(ptr->ac_dest);
1435 1435 }
1436 1436
1437 1437 while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) {
1438 1438 ASSERT(GET_REF(ptr) == 0);
1439 1439 mutex_destroy(&ptr->tx_too_big_mutex);
1440 1440 (void) ibt_free_ud_dest(ptr->ac_dest);
1441 1441 }
1442 1442
1443 1443 list_destroy(&state->id_ah_free);
1444 1444 list_destroy(&state->id_ah_active);
1445 1445 list_destroy(&state->id_mc_full);
1446 1446 list_destroy(&state->id_mc_non);
1447 1447 kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * state->id_num_ah);
1448 1448 mutex_exit(&state->id_ac_mutex);
1449 1449 mutex_destroy(&state->id_ac_mutex);
1450 1450 mutex_destroy(&state->id_mc_mutex);
1451 1451 }
1452 1452
1453 1453 /*
1454 1454 * Search AH active hash list for a cached path to input destination.
1455 1455 * If we are "just looking", hold == F. When we are in the Tx path,
1456 1456 * we set hold == T to grab a reference on the AH so that it can not
1457 1457 * be recycled to a new destination while the Tx request is posted.
1458 1458 */
1459 1459 ibd_ace_t *
1460 1460 ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num)
1461 1461 {
1462 1462 ibd_ace_t *ptr;
1463 1463
1464 1464 ASSERT(mutex_owned(&state->id_ac_mutex));
1465 1465
1466 1466 /*
1467 1467 * Do hash search.
1468 1468 */
1469 1469 if (mod_hash_find(state->id_ah_active_hash,
1470 1470 (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) {
1471 1471 if (hold)
1472 1472 INC_REF(ptr, num);
1473 1473 return (ptr);
1474 1474 }
1475 1475 return (NULL);
1476 1476 }
1477 1477
1478 1478 /*
1479 1479 * This is called by the tx side; if an initialized AH is found in
1480 1480 * the active list, it is locked down and can be used; if no entry
1481 1481 * is found, an async request is queued to do path resolution.
1482 1482 */
1483 1483 static ibd_ace_t *
1484 1484 ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe)
1485 1485 {
1486 1486 ibd_ace_t *ptr;
1487 1487 ibd_req_t *req;
1488 1488
1489 1489 /*
1490 1490 * Only attempt to print when we can; in the mdt pattr case, the
1491 1491 * address is not aligned properly.
1492 1492 */
1493 1493 if (((ulong_t)mac & 3) == 0) {
1494 1494 DPRINT(4,
1495 1495 "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X",
1496 1496 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1497 1497 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
1498 1498 htonl(mac->ipoib_gidsuff[1]));
1499 1499 }
1500 1500
1501 1501 mutex_enter(&state->id_ac_mutex);
1502 1502
1503 1503 if (((ptr = state->id_ac_hot_ace) != NULL) &&
1504 1504 (memcmp(&ptr->ac_mac, mac, sizeof (*mac)) == 0)) {
1505 1505 INC_REF(ptr, numwqe);
1506 1506 mutex_exit(&state->id_ac_mutex);
1507 1507 return (ptr);
1508 1508 }
1509 1509 if (((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL)) {
1510 1510 state->id_ac_hot_ace = ptr;
1511 1511 mutex_exit(&state->id_ac_mutex);
1512 1512 return (ptr);
1513 1513 }
1514 1514
1515 1515 /*
1516 1516 * Implementation of a single outstanding async request; if
1517 1517 * the operation is not started yet, queue a request and move
1518 1518 * to ongoing state. Remember in id_ah_addr for which address
1519 1519 * we are queueing the request, in case we need to flag an error;
1520 1520 * Any further requests, for the same or a different address, are
1521 1521 * sent back to GLDv3 to be retried until the operation completes.
1522 1522 * The async thread will update id_ah_op with an error indication
1523 1523 * or will set it to indicate the next look up can start; either
1524 1524 * way, it will mac_tx_update() so that all blocked requests come
1525 1525 * back here.
1526 1526 */
1527 1527 *err = EAGAIN;
1528 1528 if (state->id_ah_op == IBD_OP_NOTSTARTED) {
1529 1529 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
1530 1530 if (req != NULL) {
1531 1531 /*
1532 1532 * We did not even find the entry; queue a request
1533 1533 * for it.
1534 1534 */
1535 1535 bcopy(mac, &(req->rq_mac), IPOIB_ADDRL);
1536 1536 state->id_ah_op = IBD_OP_ONGOING;
1537 1537 ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH);
1538 1538 bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL);
1539 1539 }
1540 1540 } else if ((state->id_ah_op != IBD_OP_ONGOING) &&
1541 1541 (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) {
1542 1542 /*
1543 1543 * Check the status of the pathrecord lookup request
1544 1544 * we had queued before.
1545 1545 */
1546 1546 if (state->id_ah_op == IBD_OP_ERRORED) {
1547 1547 *err = EFAULT;
1548 1548 state->id_ah_error++;
1549 1549 } else {
1550 1550 /*
1551 1551 * IBD_OP_ROUTERED case: We need to send to the
1552 1552 * all-router MCG. If we can find the AH for
1553 1553 * the mcg, the Tx will be attempted. If we
1554 1554 * do not find the AH, we return NORESOURCES
1555 1555 * to retry.
1556 1556 */
1557 1557 ipoib_mac_t routermac;
1558 1558
1559 1559 (void) ibd_get_allroutergroup(state, mac, &routermac);
1560 1560 ptr = ibd_acache_find(state, &routermac, B_TRUE,
1561 1561 numwqe);
1562 1562 }
1563 1563 state->id_ah_op = IBD_OP_NOTSTARTED;
1564 1564 } else if ((state->id_ah_op != IBD_OP_ONGOING) &&
1565 1565 (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) {
1566 1566 /*
1567 1567 * This case can happen when we get a higher band
1568 1568 * packet. The easiest way is to reset the state machine
1569 1569 * to accommodate the higher priority packet.
1570 1570 */
1571 1571 state->id_ah_op = IBD_OP_NOTSTARTED;
1572 1572 }
1573 1573 mutex_exit(&state->id_ac_mutex);
1574 1574
1575 1575 return (ptr);
1576 1576 }
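A minimal illustrative sketch (not part of this changeset) of how a Tx-path
caller might consume the contract above: a non-NULL return means an AH
reference is held and the send can be posted; NULL with *err == EAGAIN means
path resolution has been queued and the packet should be retried later; NULL
with any other error means the earlier lookup failed. The function name and
the drop/retry conventions below are assumptions for illustration only.

static boolean_t
ibd_tx_ah_sketch(ibd_state_t *state, ipoib_mac_t *dest, mblk_t *mp)
{
	ibd_ace_t *ace;
	int err = 0;

	if ((ace = ibd_acache_lookup(state, dest, &err, 1)) != NULL)
		return (B_TRUE);	/* AH reference held; post the Tx */

	if (err == EAGAIN)
		return (B_FALSE);	/* resolution queued; retry this mblk */

	freemsg(mp);			/* lookup errored out; drop the packet */
	return (B_TRUE);
}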
1577 1577
1578 1578 /*
1579 1579 * Grab a not-currently-in-use AH/PathRecord from the active
1580 1580 * list to recycle to a new destination. Only the async thread
1581 1581 * executes this code.
1582 1582 */
1583 1583 static ibd_ace_t *
1584 1584 ibd_acache_get_unref(ibd_state_t *state)
1585 1585 {
1586 1586 ibd_ace_t *ptr = list_tail(&state->id_ah_active);
1587 1587 boolean_t try_rc_chan_recycle = B_FALSE;
1588 1588
1589 1589 ASSERT(mutex_owned(&state->id_ac_mutex));
1590 1590
1591 1591 /*
1592 1592 * Do plain linear search.
1593 1593 */
1594 1594 while (ptr != NULL) {
1595 1595 /*
1596 1596 * Note that it is possible that the "cycle" bit
1597 1597 * is set on the AH w/o any reference count. The
1598 1598 * mcg must have been deleted, and the tx cleanup
1599 1599 * just decremented the reference count to 0, but
1600 1600 * hasn't gotten around to grabbing the id_ac_mutex
1601 1601 * to move the AH into the free list.
1602 1602 */
1603 1603 if (GET_REF(ptr) == 0) {
1604 1604 if (ptr->ac_chan != NULL) {
1605 1605 ASSERT(state->id_enable_rc == B_TRUE);
1606 1606 if (!try_rc_chan_recycle) {
1607 1607 try_rc_chan_recycle = B_TRUE;
1608 1608 ibd_rc_signal_ace_recycle(state, ptr);
1609 1609 }
1610 1610 } else {
1611 1611 IBD_ACACHE_PULLOUT_ACTIVE(state, ptr);
1612 1612 break;
1613 1613 }
1614 1614 }
1615 1615 ptr = list_prev(&state->id_ah_active, ptr);
1616 1616 }
1617 1617 return (ptr);
1618 1618 }
1619 1619
1620 1620 /*
1621 1621  * Invoked to clean up an AH from the active list in case of multicast
1622 1622  * disable and to handle sendonly memberships during mcg traps. It is
1623 1623  * also used for port up processing of multicast and unicast AHs.
1624 1624 * Normally, the AH is taken off the active list, and put into
1625 1625 * the free list to be recycled for a new destination. In case
1626 1626 * Tx requests on the AH have not completed yet, the AH is marked
1627 1627 * for reaping (which will put the AH on the free list) once the Tx's
1628 1628 * complete; in this case, depending on the "force" input, we take
1629 1629 * out the AH from the active list right now, or leave it also for
1630 1630 * the reap operation. Returns TRUE if the AH is taken off the active
1631 1631 * list (and either put into the free list right now, or arranged for
1632 1632 * later), FALSE otherwise.
1633 1633 */
1634 1634 boolean_t
1635 1635 ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force)
1636 1636 {
1637 1637 ibd_ace_t *acactive;
1638 1638 boolean_t ret = B_TRUE;
1639 1639
1640 1640 ASSERT(mutex_owned(&state->id_ac_mutex));
1641 1641
1642 1642 if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) {
1643 1643
1644 1644 /*
1645 1645 * Note that the AH might already have the cycle bit set
1646 1646 * on it; this might happen if sequences of multicast
1647 1647 * enables and disables are coming so fast, that posted
1648 1648 * Tx's to the mcg have not completed yet, and the cycle
1649 1649 * bit is set successively by each multicast disable.
1650 1650 */
1651 1651 if (SET_CYCLE_IF_REF(acactive)) {
1652 1652 if (!force) {
1653 1653 /*
1654 1654 * The ace is kept on the active list, further
1655 1655 * Tx's can still grab a reference on it; the
1656 1656 * ace is reaped when all pending Tx's
1657 1657 * referencing the AH complete.
1658 1658 */
1659 1659 ret = B_FALSE;
1660 1660 } else {
1661 1661 /*
1662 1662				 * In the mcg trap case, as well as in the
1663 1663				 * port up multi/unicast case, we always
1664 1664				 * pull the AH from the active list.
1665 1665 */
1666 1666 ASSERT(acactive->ac_chan == NULL);
1667 1667 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
1668 1668 acactive->ac_mce = NULL;
1669 1669 }
1670 1670 } else {
1671 1671 /*
1672 1672			 * The ref count is 0, so reclaim immediately
1673 1673			 * after pulling the ace out of the active
1674 1674			 * list.
1675 1675 */
1676 1676 ASSERT(acactive->ac_chan == NULL);
1677 1677 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
1678 1678 acactive->ac_mce = NULL;
1679 1679 IBD_ACACHE_INSERT_FREE(state, acactive);
1680 1680 }
1681 1681
1682 1682 }
1683 1683 return (ret);
1684 1684 }
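As a reading aid (not part of the diff), a hedged caller fragment showing the
return-value contract: with force == B_FALSE, a B_FALSE return means
outstanding Tx references kept the ace on the active list, and it is only
reaped once those Tx's complete. The mcg address variable is hypothetical.

	ipoib_mac_t mcmac;	/* hypothetical: mac of the mcg being disabled */
	boolean_t recycled;

	mutex_enter(&state->id_ac_mutex);
	recycled = ibd_acache_recycle(state, &mcmac, B_FALSE);
	mutex_exit(&state->id_ac_mutex);

	if (!recycled) {
		/* pending Tx's still hold the AH; it will be reaped later */
	}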
1685 1685
1686 1686 /*
1687 1687 * Helper function for async path record lookup. If we are trying to
1688 1688 * Tx to a MCG, check our membership, possibly trying to join the
1689 1689 * group if required. If that fails, try to send the packet to the
1690 1690 * all router group (indicated by the redirect output), pointing
1691 1691 * the input mac address to the router mcg address.
1692 1692 */
1693 1693 static ibd_mce_t *
1694 1694 ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect)
1695 1695 {
1696 1696 ib_gid_t mgid;
1697 1697 ibd_mce_t *mce;
1698 1698 ipoib_mac_t routermac;
1699 1699
1700 1700 *redirect = B_FALSE;
1701 1701 ibd_n2h_gid(mac, &mgid);
1702 1702
1703 1703 /*
1704 1704 * Check the FullMember+SendOnlyNonMember list.
1705 1705 * Since we are the only one who manipulates the
1706 1706 * id_mc_full list, no locks are needed.
1707 1707 */
1708 1708 mce = IBD_MCACHE_FIND_FULL(state, mgid);
1709 1709 if (mce != NULL) {
1710 1710 DPRINT(4, "ibd_async_mcache : already joined to group");
1711 1711 return (mce);
1712 1712 }
1713 1713
1714 1714 /*
1715 1715 * Not found; try to join(SendOnlyNonMember) and attach.
1716 1716 */
1717 1717 DPRINT(4, "ibd_async_mcache : not joined to group");
1718 1718 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
1719 1719 NULL) {
1720 1720 DPRINT(4, "ibd_async_mcache : nonmem joined to group");
1721 1721 return (mce);
1722 1722 }
1723 1723
1724 1724 /*
1725 1725 * MCGroup not present; try to join the all-router group. If
1726 1726 * any of the following steps succeed, we will be redirecting
1727 1727 * to the all router group.
1728 1728 */
1729 1729 DPRINT(4, "ibd_async_mcache : nonmem join failed");
1730 1730 if (!ibd_get_allroutergroup(state, mac, &routermac))
1731 1731 return (NULL);
1732 1732 *redirect = B_TRUE;
1733 1733 ibd_n2h_gid(&routermac, &mgid);
1734 1734 bcopy(&routermac, mac, IPOIB_ADDRL);
1735 1735 DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n",
1736 1736 mgid.gid_prefix, mgid.gid_guid);
1737 1737
1738 1738 /*
1739 1739 * Are we already joined to the router group?
1740 1740 */
1741 1741 if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) {
1742 1742		DPRINT(4, "ibd_async_mcache : using already joined router "
1743 1743 "group\n");
1744 1744 return (mce);
1745 1745 }
1746 1746
1747 1747 /*
1748 1748 * Can we join(SendOnlyNonMember) the router group?
1749 1749 */
1750 1750 DPRINT(4, "ibd_async_mcache : attempting join to router grp");
1751 1751 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
1752 1752 NULL) {
1753 1753 DPRINT(4, "ibd_async_mcache : joined to router grp");
1754 1754 return (mce);
1755 1755 }
1756 1756
1757 1757 return (NULL);
1758 1758 }
1759 1759
1760 1760 /*
1761 1761 * Async path record lookup code.
1762 1762 */
1763 1763 static void
1764 1764 ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac)
1765 1765 {
1766 1766 ibd_ace_t *ce;
1767 1767 ibd_mce_t *mce = NULL;
1768 1768 ibt_path_attr_t path_attr;
1769 1769 ibt_path_info_t path_info;
1770 1770 ib_gid_t destgid;
1771 1771 char ret = IBD_OP_NOTSTARTED;
1772 1772
1773 1773 DPRINT(4, "ibd_async_acache : %08X:%08X:%08X:%08X:%08X",
1774 1774 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1775 1775 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
1776 1776 htonl(mac->ipoib_gidsuff[1]));
1777 1777
1778 1778 /*
1779 1779 * Check whether we are trying to transmit to a MCG.
1780 1780 * In that case, we need to make sure we are a member of
1781 1781 * the MCG.
1782 1782 */
1783 1783 if (mac->ipoib_qpn == htonl(IB_MC_QPN)) {
1784 1784 boolean_t redirected;
1785 1785
1786 1786 /*
1787 1787		 * If we cannot find or join the group or even
1788 1788 * redirect, error out.
1789 1789 */
1790 1790 if ((mce = ibd_async_mcache(state, mac, &redirected)) ==
1791 1791 NULL) {
1792 1792 state->id_ah_op = IBD_OP_ERRORED;
1793 1793 return;
1794 1794 }
1795 1795
1796 1796 /*
1797 1797		 * If we got redirected, we need to determine whether the
1798 1798		 * AH for the new mcg is already in the cache and, if so,
1799 1799		 * avoid pulling in a new one; otherwise proceed to get the
1800 1800 * path for the new mcg. There is no guarantee that
1801 1801 * if the AH is currently in the cache, it will still be
1802 1802 * there when we look in ibd_acache_lookup(), but that's
1803 1803 * okay, we will come back here.
1804 1804 */
1805 1805 if (redirected) {
1806 1806 ret = IBD_OP_ROUTERED;
1807 1807 DPRINT(4, "ibd_async_acache : redirected to "
1808 1808 "%08X:%08X:%08X:%08X:%08X",
1809 1809 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1810 1810 htonl(mac->ipoib_gidpref[1]),
1811 1811 htonl(mac->ipoib_gidsuff[0]),
1812 1812 htonl(mac->ipoib_gidsuff[1]));
1813 1813
1814 1814 mutex_enter(&state->id_ac_mutex);
1815 1815 if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) {
1816 1816 state->id_ah_op = IBD_OP_ROUTERED;
1817 1817 mutex_exit(&state->id_ac_mutex);
1818 1818 DPRINT(4, "ibd_async_acache : router AH found");
1819 1819 return;
1820 1820 }
1821 1821 mutex_exit(&state->id_ac_mutex);
1822 1822 }
1823 1823 }
1824 1824
1825 1825 /*
1826 1826 * Get an AH from the free list.
1827 1827 */
1828 1828 mutex_enter(&state->id_ac_mutex);
1829 1829 if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) {
1830 1830 /*
1831 1831 * No free ones; try to grab an unreferenced active
1832 1832 * one. Maybe we need to make the active list LRU,
1833 1833 * but that will create more work for Tx callbacks.
1834 1834 * Is there a way of not having to pull out the
1835 1835 * entry from the active list, but just indicate it
1836 1836 * is being recycled? Yes, but that creates one more
1837 1837 * check in the fast lookup path.
1838 1838 */
1839 1839 if ((ce = ibd_acache_get_unref(state)) == NULL) {
1840 1840 /*
1841 1841 * Pretty serious shortage now.
1842 1842 */
1843 1843 state->id_ah_op = IBD_OP_NOTSTARTED;
1844 1844 mutex_exit(&state->id_ac_mutex);
1845 1845 DPRINT(10, "ibd_async_acache : failed to find AH "
1846 1846 "slot\n");
1847 1847 return;
1848 1848 }
1849 1849 /*
1850 1850 * We could check whether ac_mce points to a SendOnly
1851 1851 * member and drop that membership now. Or do it lazily
1852 1852 * at detach time.
1853 1853 */
1854 1854 ce->ac_mce = NULL;
1855 1855 }
1856 1856 mutex_exit(&state->id_ac_mutex);
1857 1857 ASSERT(ce->ac_mce == NULL);
1858 1858
1859 1859 /*
1860 1860 * Update the entry.
1861 1861 */
1862 1862 bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL);
1863 1863
1864 1864 bzero(&path_info, sizeof (path_info));
1865 1865 bzero(&path_attr, sizeof (ibt_path_attr_t));
1866 1866 path_attr.pa_sgid = state->id_sgid;
1867 1867 path_attr.pa_num_dgids = 1;
1868 1868 ibd_n2h_gid(&ce->ac_mac, &destgid);
1869 1869 path_attr.pa_dgids = &destgid;
1870 1870 path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
1871 1871 path_attr.pa_pkey = state->id_pkey;
1872 1872 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_PKEY, &path_attr, 1,
1873 1873 &path_info, NULL) != IBT_SUCCESS) {
1874 1874 DPRINT(10, "ibd_async_acache : failed in ibt_get_paths");
1875 1875 goto error;
1876 1876 }
1877 1877 if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey,
1878 1878 ntohl(ce->ac_mac.ipoib_qpn),
1879 1879 &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) {
1880 1880 DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest");
1881 1881 goto error;
1882 1882 }
1883 1883
1884 1884 /*
1885 1885 * mce is set whenever an AH is being associated with a
1886 1886 * MCG; this will come in handy when we leave the MCG. The
1887 1887 * lock protects Tx fastpath from scanning the active list.
1888 1888 */
1889 1889 if (mce != NULL)
1890 1890 ce->ac_mce = mce;
1891 1891
1892 1892 /*
1893 1893	 * Initiate an RC mode connection for a unicast address.
1894 1894 */
1895 1895 if (state->id_enable_rc && (mac->ipoib_qpn != htonl(IB_MC_QPN)) &&
1896 1896 (htonl(mac->ipoib_qpn) & IBD_MAC_ADDR_RC)) {
1897 1897 ASSERT(ce->ac_chan == NULL);
1898 1898 DPRINT(10, "ibd_async_acache: call "
1899 1899 "ibd_rc_try_connect(ace=%p)", ce);
1900 1900 ibd_rc_try_connect(state, ce, &path_info);
1901 1901 if (ce->ac_chan == NULL) {
1902 1902 DPRINT(10, "ibd_async_acache: fail to setup RC"
1903 1903 " channel");
1904 1904 state->rc_conn_fail++;
1905 1905 goto error;
1906 1906 }
1907 1907 }
1908 1908
1909 1909 mutex_enter(&state->id_ac_mutex);
1910 1910 IBD_ACACHE_INSERT_ACTIVE(state, ce);
1911 1911 state->id_ah_op = ret;
1912 1912 mutex_exit(&state->id_ac_mutex);
1913 1913 return;
1914 1914 error:
1915 1915 /*
1916 1916 * We might want to drop SendOnly membership here if we
1917 1917 * joined above. The lock protects Tx callbacks inserting
1918 1918 * into the free list.
1919 1919 */
1920 1920 mutex_enter(&state->id_ac_mutex);
1921 1921 state->id_ah_op = IBD_OP_ERRORED;
1922 1922 IBD_ACACHE_INSERT_FREE(state, ce);
1923 1923 mutex_exit(&state->id_ac_mutex);
1924 1924 }
1925 1925
1926 1926 /*
1927 1927  * While restoring the port's presence on the subnet on a port up, it is possible
1928 1928 * that the port goes down again.
1929 1929 */
1930 1930 static void
1931 1931 ibd_async_link(ibd_state_t *state, ibd_req_t *req)
1932 1932 {
1933 1933 ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr;
1934 1934 link_state_t lstate = (opcode == IBD_LINK_DOWN) ? LINK_STATE_DOWN :
1935 1935 LINK_STATE_UP;
1936 1936 ibd_mce_t *mce, *pmce;
1937 1937 ibd_ace_t *ace, *pace;
1938 1938
1939 1939 DPRINT(10, "ibd_async_link(): %d", opcode);
1940 1940
1941 1941 /*
1942 1942 * On a link up, revalidate the link speed/width. No point doing
1943 1943 * this on a link down, since we will be unable to do SA operations,
1944 1944 * defaulting to the lowest speed. Also notice that we update our
1945 1945 * notion of speed before calling mac_link_update(), which will do
1946 1946 * necessary higher level notifications for speed changes.
1947 1947 */
1948 1948 if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) {
1949 1949 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))
1950 1950 state->id_link_speed = ibd_get_portspeed(state);
1951 1951 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))
1952 1952 }
1953 1953
1954 1954 /*
1955 1955 * Do all the work required to establish our presence on
1956 1956 * the subnet.
1957 1957 */
1958 1958 if (opcode == IBD_LINK_UP_ABSENT) {
1959 1959 /*
1960 1960 * If in promiscuous mode ...
1961 1961 */
1962 1962 if (state->id_prom_op == IBD_OP_COMPLETED) {
1963 1963 /*
1964 1964 * Drop all nonmembership.
1965 1965 */
1966 1966 ibd_async_unsetprom(state);
1967 1967
1968 1968 /*
1969 1969 * Then, try to regain nonmembership to all mcg's.
1970 1970 */
1971 1971 ibd_async_setprom(state);
1972 1972
1973 1973 }
1974 1974
1975 1975 /*
1976 1976 * Drop all sendonly membership (which also gets rid of the
1977 1977 * AHs); try to reacquire all full membership.
1978 1978 */
1979 1979 mce = list_head(&state->id_mc_full);
1980 1980 while ((pmce = mce) != NULL) {
1981 1981 mce = list_next(&state->id_mc_full, mce);
1982 1982 if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON)
1983 1983 ibd_leave_group(state,
1984 1984 pmce->mc_info.mc_adds_vect.av_dgid,
1985 1985 IB_MC_JSTATE_SEND_ONLY_NON);
1986 1986 else
1987 1987 ibd_reacquire_group(state, pmce);
1988 1988 }
1989 1989
1990 1990 /*
1991 1991 * Recycle all active AHs to free list (and if there are
1992 1992 * pending posts, make sure they will go into the free list
1993 1993 * once the Tx's complete). Grab the lock to prevent
1994 1994 * concurrent Tx's as well as Tx cleanups.
1995 1995 */
1996 1996 mutex_enter(&state->id_ac_mutex);
1997 1997 ace = list_head(&state->id_ah_active);
1998 1998 while ((pace = ace) != NULL) {
1999 1999 boolean_t cycled;
2000 2000
2001 2001 ace = list_next(&state->id_ah_active, ace);
2002 2002 mce = pace->ac_mce;
2003 2003 if (pace->ac_chan != NULL) {
2004 2004 ASSERT(mce == NULL);
2005 2005 ASSERT(state->id_enable_rc == B_TRUE);
2006 2006 if (pace->ac_chan->chan_state ==
2007 2007 IBD_RC_STATE_ACT_ESTAB) {
2008 2008 INC_REF(pace, 1);
2009 2009 IBD_ACACHE_PULLOUT_ACTIVE(state, pace);
2010 2010 pace->ac_chan->chan_state =
2011 2011 IBD_RC_STATE_ACT_CLOSING;
2012 2012 ibd_rc_signal_act_close(state, pace);
2013 2013 } else {
2014 2014 state->rc_act_close_simultaneous++;
2015 2015 DPRINT(40, "ibd_async_link: other "
2016 2016 "thread is closing it, ace=%p, "
2017 2017 "ac_chan=%p, chan_state=%d",
2018 2018 pace, pace->ac_chan,
2019 2019 pace->ac_chan->chan_state);
2020 2020 }
2021 2021 } else {
2022 2022 cycled = ibd_acache_recycle(state,
2023 2023 &pace->ac_mac, B_TRUE);
2024 2024 }
2025 2025 /*
2026 2026 * If this is for an mcg, it must be for a fullmember,
2027 2027 * since we got rid of send-only members above when
2028 2028 * processing the mce list.
2029 2029 */
2030 2030 ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate ==
2031 2031 IB_MC_JSTATE_FULL)));
2032 2032
2033 2033 /*
2034 2034 * Check if the fullmember mce needs to be torn down,
2035 2035 * ie whether the DLPI disable has already been done.
2036 2036 * If so, do some of the work of tx_cleanup, namely
2037 2037 * causing leave (which will fail), detach and
2038 2038 * mce-freeing. tx_cleanup will put the AH into free
2039 2039 * list. The reason to duplicate some of this
2040 2040 * tx_cleanup work is because we want to delete the
2041 2041 * AH right now instead of waiting for tx_cleanup, to
2042 2042 * force subsequent Tx's to reacquire an AH.
2043 2043 */
2044 2044 if ((mce != NULL) && (mce->mc_fullreap))
2045 2045 ibd_async_reap_group(state, mce,
2046 2046 mce->mc_info.mc_adds_vect.av_dgid,
2047 2047 mce->mc_jstate);
2048 2048 }
2049 2049 mutex_exit(&state->id_ac_mutex);
2050 2050 }
2051 2051
2052 2052 /*
2053 2053 * mac handle is guaranteed to exist since driver does ibt_close_hca()
2054 2054 * (which stops further events from being delivered) before
2055 2055 * mac_unregister(). At this point, it is guaranteed that mac_register
2056 2056 * has already been done.
2057 2057 */
2058 2058 mutex_enter(&state->id_link_mutex);
2059 2059 state->id_link_state = lstate;
2060 2060 mac_link_update(state->id_mh, lstate);
2061 2061 mutex_exit(&state->id_link_mutex);
2062 2062
2063 2063 ibd_async_done(state);
2064 2064 }
2065 2065
2066 2066 /*
2067 2067 * Check the pkey table to see if we can find the pkey we're looking for.
2068 2068 * Set the pkey index in 'pkix' if found. Return 0 on success and -1 on
2069 2069 * failure.
2070 2070 */
2071 2071 static int
2072 2072 ibd_locate_pkey(ib_pkey_t *pkey_tbl, uint16_t pkey_tbl_sz, ib_pkey_t pkey,
2073 2073 uint16_t *pkix)
2074 2074 {
2075 2075 uint16_t ndx;
2076 2076
2077 2077 ASSERT(pkix != NULL);
2078 2078
2079 2079 for (ndx = 0; ndx < pkey_tbl_sz; ndx++) {
2080 2080 if (pkey_tbl[ndx] == pkey) {
2081 2081 *pkix = ndx;
2082 2082 return (0);
2083 2083 }
2084 2084 }
2085 2085 return (-1);
2086 2086 }
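A small usage fragment (illustrative only) that mirrors the call made later in
ibd_link_mod(): scan the freshly queried pkey table for our pkey and recover
its (possibly relocated) index.

	uint16_t pkix;

	if (ibd_locate_pkey(port_infop->p_pkey_tbl,
	    port_infop->p_pkey_tbl_sz, state->id_pkey, &pkix) == 0) {
		/* pkey still present; pkix holds its current index */
	} else {
		/* pkey no longer in the table; treat the port as down */
	}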
2087 2087
2088 2088 /*
2089 2089 * Late HCA Initialization:
2090 2090  * If the plumb had succeeded without an active port or the pkey being
2091 2091  * available, and the availability of either is now being indicated via
2092 2092  * PORT_UP or PORT_CHANGE respectively, attempt to start the interface.
2093 2093 *
2094 2094 * Normal Operation:
2095 2095 * When the link is notified up, we need to do a few things, based
2096 2096 * on the port's current p_init_type_reply claiming a reinit has been
2097 2097 * done or not. The reinit steps are:
2098 2098 * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify
2099 2099 * the old Pkey and GID0 are correct.
2100 2100 * 2. Register for mcg traps (already done by ibmf).
2101 2101 * 3. If PreservePresenceReply indicates the SM has restored port's presence
2102 2102 * in subnet, nothing more to do. Else go to next steps (on async daemon).
2103 2103 * 4. Give up all sendonly memberships.
2104 2104 * 5. Acquire all full memberships.
2105 2105 * 6. In promiscuous mode, acquire all non memberships.
2106 2106 * 7. Recycle all AHs to free list.
2107 2107 */
2108 2108 static void
2109 2109 ibd_link_mod(ibd_state_t *state, ibt_async_code_t code)
2110 2110 {
2111 2111 ibt_hca_portinfo_t *port_infop = NULL;
2112 2112 ibt_status_t ibt_status;
2113 2113 uint_t psize, port_infosz;
2114 2114 ibd_link_op_t opcode;
2115 2115 ibd_req_t *req;
2116 2116 link_state_t new_link_state = LINK_STATE_UP;
2117 2117 uint8_t itreply;
2118 2118 uint16_t pkix;
2119 2119 int ret;
2120 2120
2121 2121 /*
2122 2122 * Let's not race with a plumb or an unplumb; if we detect a
2123 2123 * pkey relocation event later on here, we may have to restart.
2124 2124 */
2125 2125 ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
2126 2126
2127 2127 mutex_enter(&state->id_link_mutex);
2128 2128
2129 2129 /*
2130 2130 * If the link state is unknown, a plumb has not yet been attempted
2131 2131 * on the interface. Nothing to do.
2132 2132 */
2133 2133 if (state->id_link_state == LINK_STATE_UNKNOWN) {
2134 2134 mutex_exit(&state->id_link_mutex);
2135 2135 goto link_mod_return;
2136 2136 }
2137 2137
2138 2138 /*
2139 2139	 * If the link state is down because of a plumb failure, we are not
2140 2140	 * in late HCA init, and we were not successfully plumbed, nothing to do.
2141 2141 */
2142 2142 if ((state->id_link_state == LINK_STATE_DOWN) &&
2143 2143 ((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) == 0) &&
2144 2144 ((state->id_mac_state & IBD_DRV_STARTED) == 0)) {
2145 2145 mutex_exit(&state->id_link_mutex);
2146 2146 goto link_mod_return;
2147 2147 }
2148 2148
2149 2149 /*
2150 2150 * If this routine was called in response to a port down event,
2151 2151	 * we just need to decide whether the new link state should be reported.
2152 2152 */
2153 2153 if (code == IBT_ERROR_PORT_DOWN) {
2154 2154 new_link_state = LINK_STATE_DOWN;
2155 2155 goto update_link_state;
2156 2156 }
2157 2157
2158 2158 /*
2159 2159 * If it's not a port down event we've received, try to get the port
2160 2160 * attributes first. If we fail here, the port is as good as down.
2161 2161 * Otherwise, if the link went down by the time the handler gets
2162 2162 * here, give up - we cannot even validate the pkey/gid since those
2163 2163 * are not valid and this is as bad as a port down anyway.
2164 2164 */
2165 2165 ibt_status = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
2166 2166 &port_infop, &psize, &port_infosz);
2167 2167 if ((ibt_status != IBT_SUCCESS) || (psize != 1) ||
2168 2168 (port_infop->p_linkstate != IBT_PORT_ACTIVE)) {
2169 2169 new_link_state = LINK_STATE_DOWN;
2170 2170 goto update_link_state;
2171 2171 }
2172 2172
2173 2173 /*
2174 2174	 * If in the previous attempt the pkey was not found, either due to the
2175 2175	 * port state being down or due to its absence in the pkey table,
2176 2176 * look for it now and try to start the interface.
2177 2177 */
2178 2178 if (state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) {
2179 2179 mutex_exit(&state->id_link_mutex);
2180 2180 if ((ret = ibd_start(state)) != 0) {
2181 2181 DPRINT(10, "ibd_linkmod: cannot start from late HCA "
2182 2182 "init, ret=%d", ret);
2183 2183 }
2184 2184 ibt_free_portinfo(port_infop, port_infosz);
2185 2185 goto link_mod_return;
2186 2186 }
2187 2187
2188 2188 /*
2189 2189 * Check the SM InitTypeReply flags. If both NoLoadReply and
2190 2190 * PreserveContentReply are 0, we don't know anything about the
2191 2191 * data loaded into the port attributes, so we need to verify
2192 2192 * if gid0 and pkey are still valid.
2193 2193 */
2194 2194 itreply = port_infop->p_init_type_reply;
2195 2195 if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) &&
2196 2196 ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0)) {
2197 2197 /*
2198 2198 * Check to see if the subnet part of GID0 has changed. If
2199 2199 * not, check the simple case first to see if the pkey
2200 2200 * index is the same as before; finally check to see if the
2201 2201 * pkey has been relocated to a different index in the table.
2202 2202 */
2203 2203 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid))
2204 2204 if (bcmp(port_infop->p_sgid_tbl,
2205 2205 &state->id_sgid, sizeof (ib_gid_t)) != 0) {
2206 2206
2207 2207 new_link_state = LINK_STATE_DOWN;
2208 2208
2209 2209 } else if (port_infop->p_pkey_tbl[state->id_pkix] ==
2210 2210 state->id_pkey) {
2211 2211
2212 2212 new_link_state = LINK_STATE_UP;
2213 2213
2214 2214 } else if (ibd_locate_pkey(port_infop->p_pkey_tbl,
2215 2215 port_infop->p_pkey_tbl_sz, state->id_pkey, &pkix) == 0) {
2216 2216
2217 2217 ibt_free_portinfo(port_infop, port_infosz);
2218 2218 mutex_exit(&state->id_link_mutex);
2219 2219
2220 2220 /*
2221 2221 * Currently a restart is required if our pkey has moved
2222 2222 * in the pkey table. If we get the ibt_recycle_ud() to
2223 2223 * work as documented (expected), we may be able to
2224 2224 * avoid a complete restart. Note that we've already
2225 2225 * marked both the start and stop 'in-progress' flags,
2226 2226 * so it is ok to go ahead and do this restart.
2227 2227 */
2228 2228 (void) ibd_undo_start(state, LINK_STATE_DOWN);
2229 2229 if ((ret = ibd_start(state)) != 0) {
2230 2230 DPRINT(10, "ibd_restart: cannot restart, "
2231 2231 "ret=%d", ret);
2232 2232 }
2233 2233
2234 2234 goto link_mod_return;
2235 2235 } else {
2236 2236 new_link_state = LINK_STATE_DOWN;
2237 2237 }
2238 2238 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid))
2239 2239 }
2240 2240
2241 2241 update_link_state:
2242 2242 if (port_infop) {
2243 2243 ibt_free_portinfo(port_infop, port_infosz);
2244 2244 }
2245 2245
2246 2246 /*
2247 2247 * If we're reporting a link up, check InitTypeReply to see if
2248 2248 * the SM has ensured that the port's presence in mcg, traps,
2249 2249 * etc. is intact.
2250 2250 */
2251 2251 if (new_link_state == LINK_STATE_DOWN) {
2252 2252 opcode = IBD_LINK_DOWN;
2253 2253 } else {
2254 2254 if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) ==
2255 2255 SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) {
2256 2256 opcode = IBD_LINK_UP;
2257 2257 } else {
2258 2258 opcode = IBD_LINK_UP_ABSENT;
2259 2259 }
2260 2260 }
2261 2261
2262 2262 /*
2263 2263 * If the old state is the same as the new state, and the SM indicated
2264 2264 * no change in the port parameters, nothing to do.
2265 2265 */
2266 2266 if ((state->id_link_state == new_link_state) && (opcode !=
2267 2267 IBD_LINK_UP_ABSENT)) {
2268 2268 mutex_exit(&state->id_link_mutex);
2269 2269 goto link_mod_return;
2270 2270 }
2271 2271
2272 2272 /*
2273 2273 * Ok, so there was a link state change; see if it's safe to ask
2274 2274 * the async thread to do the work
2275 2275 */
2276 2276 if (!ibd_async_safe(state)) {
2277 2277 state->id_link_state = new_link_state;
2278 2278 mutex_exit(&state->id_link_mutex);
2279 2279 goto link_mod_return;
2280 2280 }
2281 2281
2282 2282 mutex_exit(&state->id_link_mutex);
2283 2283
2284 2284 /*
2285 2285 * Queue up a request for ibd_async_link() to handle this link
2286 2286 * state change event
2287 2287 */
2288 2288 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
2289 2289 req->rq_ptr = (void *)opcode;
2290 2290 ibd_queue_work_slot(state, req, IBD_ASYNC_LINK);
2291 2291
2292 2292 link_mod_return:
2293 2293 ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
2294 2294 }
2295 2295
2296 2296 /*
2297 2297 * For the port up/down events, IBTL guarantees there will not be concurrent
2298 2298 * invocations of the handler. IBTL might coalesce link transition events,
2299 2299 * and not invoke the handler for _each_ up/down transition, but it will
2300 2300  * invoke the handler with the last known state.
2301 2301 */
2302 2302 static void
2303 2303 ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
2304 2304 ibt_async_code_t code, ibt_async_event_t *event)
2305 2305 {
2306 2306 ibd_state_t *state = (ibd_state_t *)clnt_private;
2307 2307
2308 2308 switch (code) {
2309 2309 case IBT_ERROR_CATASTROPHIC_CHAN:
2310 2310 ibd_print_warn(state, "catastrophic channel error");
2311 2311 break;
2312 2312 case IBT_ERROR_CQ:
2313 2313 ibd_print_warn(state, "completion queue error");
2314 2314 break;
2315 2315 case IBT_PORT_CHANGE_EVENT:
2316 2316 /*
2317 2317 * Events will be delivered to all instances that have
2318 2318 * done ibt_open_hca() but not yet done ibt_close_hca().
2319 2319 * Only need to do work for our port; IBTF will deliver
2320 2320 * events for other ports on the hca we have ibt_open_hca'ed
2321 2321 * too. Note that id_port is initialized in ibd_attach()
2322 2322 * before we do an ibt_open_hca() in ibd_attach().
2323 2323 */
2324 2324 ASSERT(state->id_hca_hdl == hca_hdl);
2325 2325 if (state->id_port != event->ev_port)
2326 2326 break;
2327 2327
2328 2328 if ((event->ev_port_flags & IBT_PORT_CHANGE_PKEY) ==
2329 2329 IBT_PORT_CHANGE_PKEY) {
2330 2330 ibd_link_mod(state, code);
2331 2331 }
2332 2332 break;
2333 2333 case IBT_ERROR_PORT_DOWN:
2334 2334 case IBT_CLNT_REREG_EVENT:
2335 2335 case IBT_EVENT_PORT_UP:
2336 2336 /*
2337 2337 * Events will be delivered to all instances that have
2338 2338 * done ibt_open_hca() but not yet done ibt_close_hca().
2339 2339 * Only need to do work for our port; IBTF will deliver
2340 2340 * events for other ports on the hca we have ibt_open_hca'ed
2341 2341 * too. Note that id_port is initialized in ibd_attach()
2342 2342 * before we do an ibt_open_hca() in ibd_attach().
2343 2343 */
2344 2344 ASSERT(state->id_hca_hdl == hca_hdl);
2345 2345 if (state->id_port != event->ev_port)
2346 2346 break;
2347 2347
2348 2348 ibd_link_mod(state, code);
2349 2349 break;
2350 2350
2351 2351 case IBT_HCA_ATTACH_EVENT:
2352 2352 case IBT_HCA_DETACH_EVENT:
2353 2353 /*
2354 2354		 * When a new card is plugged into the system, attach_event is
2355 2355 * invoked. Additionally, a cfgadm needs to be run to make the
2356 2356 * card known to the system, and an ifconfig needs to be run to
2357 2357 * plumb up any ibd interfaces on the card. In the case of card
2358 2358 * unplug, a cfgadm is run that will trigger any RCM scripts to
2359 2359 * unplumb the ibd interfaces on the card; when the card is
2360 2360 * actually unplugged, the detach_event is invoked;
2361 2361 * additionally, if any ibd instances are still active on the
2362 2362 * card (eg there were no associated RCM scripts), driver's
2363 2363 * detach routine is invoked.
2364 2364 */
2365 2365 break;
2366 2366 default:
2367 2367 break;
2368 2368 }
2369 2369 }
2370 2370
2371 2371 static int
2372 2372 ibd_register_mac(ibd_state_t *state, dev_info_t *dip)
2373 2373 {
2374 2374 mac_register_t *macp;
2375 2375 int ret;
2376 2376
2377 2377 if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
2378 2378 DPRINT(10, "ibd_register_mac: mac_alloc() failed");
2379 2379 return (DDI_FAILURE);
2380 2380 }
2381 2381
2382 2382 /*
2383 2383 * Note that when we register with mac during attach, we don't
2384 2384 * have the id_macaddr yet, so we'll simply be registering a
2385 2385 * zero macaddr that we'll overwrite later during plumb (in
2386 2386 * ibd_m_start()). Similar is the case with id_mtu - we'll
2387 2387 * update the mac layer with the correct mtu during plumb.
2388 2388 */
2389 2389 macp->m_type_ident = MAC_PLUGIN_IDENT_IB;
2390 2390 macp->m_driver = state;
2391 2391 macp->m_dip = dip;
2392 2392 macp->m_src_addr = (uint8_t *)&state->id_macaddr;
2393 2393 macp->m_callbacks = &ibd_m_callbacks;
2394 2394 macp->m_min_sdu = 0;
2395 2395 macp->m_multicast_sdu = IBD_DEF_MAX_SDU;
2396 2396 if (state->id_type == IBD_PORT_DRIVER) {
2397 2397 macp->m_max_sdu = IBD_DEF_RC_MAX_SDU;
2398 2398 } else if (state->id_enable_rc) {
2399 2399 macp->m_max_sdu = state->rc_mtu - IPOIB_HDRSIZE;
2400 2400 } else {
2401 2401 macp->m_max_sdu = IBD_DEF_MAX_SDU;
2402 2402 }
2403 2403 macp->m_priv_props = ibd_priv_props;
2404 2404
2405 2405 /*
2406 2406 * Register ourselves with the GLDv3 interface
2407 2407 */
2408 2408 if ((ret = mac_register(macp, &state->id_mh)) != 0) {
2409 2409 mac_free(macp);
2410 2410 DPRINT(10,
2411 2411 "ibd_register_mac: mac_register() failed, ret=%d", ret);
2412 2412 return (DDI_FAILURE);
2413 2413 }
2414 2414
2415 2415 mac_free(macp);
2416 2416 return (DDI_SUCCESS);
2417 2417 }
2418 2418
2419 2419 static int
2420 2420 ibd_record_capab(ibd_state_t *state)
2421 2421 {
2422 2422 ibt_hca_attr_t hca_attrs;
2423 2423 ibt_status_t ibt_status;
2424 2424
2425 2425 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))
2426 2426
2427 2427 /*
2428 2428 * Query the HCA and fetch its attributes
2429 2429 */
2430 2430 ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
2431 2431 ASSERT(ibt_status == IBT_SUCCESS);
2432 2432
2433 2433 /*
2434 2434 * 1. Set the Hardware Checksum capability. Currently we only consider
2435 2435 * full checksum offload.
2436 2436 */
2437 2437 if (state->id_enable_rc) {
2438 2438 state->id_hwcksum_capab = 0;
2439 2439 } else {
2440 2440 if ((hca_attrs.hca_flags & IBT_HCA_CKSUM_FULL)
2441 2441 == IBT_HCA_CKSUM_FULL) {
2442 2442 state->id_hwcksum_capab = IBT_HCA_CKSUM_FULL;
2443 2443 }
2444 2444 }
2445 2445
2446 2446 /*
2447 2447 * 2. Set LSO policy, capability and maximum length
2448 2448 */
2449 2449 if (state->id_enable_rc) {
2450 2450 state->id_lso_capable = B_FALSE;
2451 2451 state->id_lso_maxlen = 0;
2452 2452 } else {
2453 2453 if (hca_attrs.hca_max_lso_size > 0) {
2454 2454 state->id_lso_capable = B_TRUE;
2455 2455 if (hca_attrs.hca_max_lso_size > IBD_LSO_MAXLEN)
2456 2456 state->id_lso_maxlen = IBD_LSO_MAXLEN;
2457 2457 else
2458 2458 state->id_lso_maxlen =
2459 2459 hca_attrs.hca_max_lso_size;
2460 2460 } else {
2461 2461 state->id_lso_capable = B_FALSE;
2462 2462 state->id_lso_maxlen = 0;
2463 2463 }
2464 2464 }
2465 2465
2466 2466 /*
2467 2467 * 3. Set Reserved L_Key capability
2468 2468 */
2469 2469 if ((hca_attrs.hca_flags2 & IBT_HCA2_RES_LKEY) == IBT_HCA2_RES_LKEY) {
2470 2470 state->id_hca_res_lkey_capab = 1;
2471 2471 state->id_res_lkey = hca_attrs.hca_reserved_lkey;
2472 2472 state->rc_enable_iov_map = B_TRUE;
2473 2473 } else {
2474 2474 /* If no reserved lkey, we will not use ibt_map_mem_iov */
2475 2475 state->rc_enable_iov_map = B_FALSE;
2476 2476 }
2477 2477
2478 2478 /*
2479 2479 * 4. Set maximum sqseg value after checking to see if extended sgl
2480 2480 * size information is provided by the hca
2481 2481 */
2482 2482 if (hca_attrs.hca_flags & IBT_HCA_WQE_SIZE_INFO) {
2483 2483 state->id_max_sqseg = hca_attrs.hca_ud_send_sgl_sz;
2484 2484 state->rc_tx_max_sqseg = hca_attrs.hca_conn_send_sgl_sz;
2485 2485 } else {
2486 2486 state->id_max_sqseg = hca_attrs.hca_max_sgl;
2487 2487 state->rc_tx_max_sqseg = hca_attrs.hca_max_sgl;
2488 2488 }
2489 2489 if (state->id_max_sqseg > IBD_MAX_SQSEG) {
2490 2490 state->id_max_sqseg = IBD_MAX_SQSEG;
2491 2491 } else if (state->id_max_sqseg < IBD_MAX_SQSEG) {
2492 2492 ibd_print_warn(state, "Set #sgl = %d instead of default %d",
2493 2493 state->id_max_sqseg, IBD_MAX_SQSEG);
2494 2494 }
2495 2495 if (state->rc_tx_max_sqseg > IBD_MAX_SQSEG) {
2496 2496 state->rc_tx_max_sqseg = IBD_MAX_SQSEG;
2497 2497 } else if (state->rc_tx_max_sqseg < IBD_MAX_SQSEG) {
2498 2498 ibd_print_warn(state, "RC mode: Set #sgl = %d instead of "
2499 2499 "default %d", state->rc_tx_max_sqseg, IBD_MAX_SQSEG);
2500 2500 }
2501 2501
2502 2502 /*
2503 2503 * Translating the virtual address regions into physical regions
2504 2504 * for using the Reserved LKey feature results in a wr sgl that
2505 2505 * is a little longer. Since failing ibt_map_mem_iov() is costly,
2506 2506 * we'll fix a high-water mark (65%) for when we should stop.
2507 2507 */
2508 2508 state->id_max_sqseg_hiwm = (state->id_max_sqseg * 65) / 100;
2509 2509 state->rc_max_sqseg_hiwm = (state->rc_tx_max_sqseg * 65) / 100;
2510 2510
2511 2511 /*
2512 2512 * 5. Set number of recv and send wqes after checking hca maximum
2513 2513 * channel size. Store the max channel size in the state so that it
2514 2514 * can be referred to when the swqe/rwqe change is requested via
2515 2515 * dladm.
2516 2516 */
2517 2517
2518 2518 state->id_hca_max_chan_sz = hca_attrs.hca_max_chan_sz;
2519 2519
2520 2520 if (hca_attrs.hca_max_chan_sz < state->id_ud_num_rwqe)
2521 2521 state->id_ud_num_rwqe = hca_attrs.hca_max_chan_sz;
2522 2522
2523 2523 state->id_rx_bufs_outstanding_limit = state->id_ud_num_rwqe -
2524 2524 IBD_RWQE_MIN;
2525 2525
2526 2526 if (hca_attrs.hca_max_chan_sz < state->id_ud_num_swqe)
2527 2527 state->id_ud_num_swqe = hca_attrs.hca_max_chan_sz;
2528 2528
2529 2529 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))
2530 2530
2531 2531 return (DDI_SUCCESS);
2532 2532 }
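To make the sgl sizing above concrete with hypothetical numbers: if the HCA
reports a send sgl size of 32 and IBD_MAX_SQSEG is larger, id_max_sqseg stays
at 32 (after the warning message is printed) and the high-water mark becomes
(32 * 65) / 100 == 20 under integer division, i.e. the point beyond which, per
the comment in the function, the driver should stop counting on the Reserved
LKey / ibt_map_mem_iov() path for a work request's sgl.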
2533 2533
2534 2534 static int
2535 2535 ibd_part_busy(ibd_state_t *state)
2536 2536 {
2537 2537 if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) != 0) {
2538 2538 DPRINT(10, "ibd_part_busy: failed: rx bufs outstanding\n");
2539 2539 return (DDI_FAILURE);
2540 2540 }
2541 2541
2542 2542 if (state->rc_srq_rwqe_list.dl_bufs_outstanding != 0) {
2543 2543 DPRINT(10, "ibd_part_busy: failed: srq bufs outstanding\n");
2544 2544 return (DDI_FAILURE);
2545 2545 }
2546 2546
2547 2547 /*
2548 2548 * "state->id_ah_op == IBD_OP_ONGOING" means this IPoIB port is
2549 2549 * connecting to a remote IPoIB port. We can't remove this port.
2550 2550 */
2551 2551 if (state->id_ah_op == IBD_OP_ONGOING) {
2552 2552 DPRINT(10, "ibd_part_busy: failed: connecting\n");
2553 2553 return (DDI_FAILURE);
2554 2554 }
2555 2555
2556 2556 return (DDI_SUCCESS);
2557 2557 }
2558 2558
2559 2559
2560 2560 static void
2561 2561 ibd_part_unattach(ibd_state_t *state)
2562 2562 {
2563 2563 uint32_t progress = state->id_mac_state;
2564 2564 ibt_status_t ret;
2565 2565
2566 2566 /* make sure rx resources are freed */
2567 2567 ibd_free_rx_rsrcs(state);
2568 2568
2569 2569 if (progress & IBD_DRV_RC_SRQ_ALLOCD) {
2570 2570 ASSERT(state->id_enable_rc);
2571 2571 ibd_rc_fini_srq_list(state);
2572 2572 state->id_mac_state &= (~IBD_DRV_RC_SRQ_ALLOCD);
2573 2573 }
2574 2574
2575 2575 if (progress & IBD_DRV_MAC_REGISTERED) {
2576 2576 (void) mac_unregister(state->id_mh);
2577 2577 state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED);
2578 2578 }
2579 2579
2580 2580 if (progress & IBD_DRV_ASYNC_THR_CREATED) {
2581 2581 /*
2582 2582 * No new async requests will be posted since the device
2583 2583 * link state has been marked as unknown; completion handlers
2584 2584		 * have been turned off, so the Tx handler will not cause any
2585 2585 * more IBD_ASYNC_REAP requests.
2586 2586 *
2587 2587 * Queue a request for the async thread to exit, which will
2588 2588 * be serviced after any pending ones. This can take a while,
2589 2589		 * especially if the SM is unreachable, since IBMF will slowly
2590 2590		 * time out each SM request issued by the async thread. Reap
2591 2591		 * the thread before continuing on; we do not want it to be
2592 2592 * lingering in modunloaded code.
2593 2593 */
2594 2594 ibd_queue_work_slot(state, &state->id_ah_req, IBD_ASYNC_EXIT);
2595 2595 thread_join(state->id_async_thrid);
2596 2596
2597 2597 state->id_mac_state &= (~IBD_DRV_ASYNC_THR_CREATED);
2598 2598 }
2599 2599
2600 2600 if (progress & IBD_DRV_REQ_LIST_INITED) {
2601 2601 list_destroy(&state->id_req_list);
2602 2602 mutex_destroy(&state->id_acache_req_lock);
2603 2603 cv_destroy(&state->id_acache_req_cv);
2604 2604 state->id_mac_state &= ~IBD_DRV_REQ_LIST_INITED;
2605 2605 }
2606 2606
2607 2607 if (progress & IBD_DRV_PD_ALLOCD) {
2608 2608 if ((ret = ibt_free_pd(state->id_hca_hdl,
2609 2609 state->id_pd_hdl)) != IBT_SUCCESS) {
2610 2610 ibd_print_warn(state, "failed to free "
2611 2611 "protection domain, ret=%d", ret);
2612 2612 }
2613 2613 state->id_pd_hdl = NULL;
2614 2614 state->id_mac_state &= (~IBD_DRV_PD_ALLOCD);
2615 2615 }
2616 2616
2617 2617 if (progress & IBD_DRV_HCA_OPENED) {
2618 2618 if ((ret = ibt_close_hca(state->id_hca_hdl)) !=
2619 2619 IBT_SUCCESS) {
2620 2620 ibd_print_warn(state, "failed to close "
2621 2621 "HCA device, ret=%d", ret);
2622 2622 }
2623 2623 state->id_hca_hdl = NULL;
2624 2624 state->id_mac_state &= (~IBD_DRV_HCA_OPENED);
2625 2625 }
2626 2626
2627 2627 mutex_enter(&ibd_gstate.ig_mutex);
2628 2628 if (progress & IBD_DRV_IBTL_ATTACH_DONE) {
2629 2629 if ((ret = ibt_detach(state->id_ibt_hdl)) !=
2630 2630 IBT_SUCCESS) {
2631 2631 ibd_print_warn(state,
2632 2632 "ibt_detach() failed, ret=%d", ret);
2633 2633 }
2634 2634 state->id_ibt_hdl = NULL;
2635 2635 state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE);
2636 2636 ibd_gstate.ig_ibt_hdl_ref_cnt--;
2637 2637 }
2638 2638 if ((ibd_gstate.ig_ibt_hdl_ref_cnt == 0) &&
2639 2639 (ibd_gstate.ig_ibt_hdl != NULL)) {
2640 2640 if ((ret = ibt_detach(ibd_gstate.ig_ibt_hdl)) !=
2641 2641 IBT_SUCCESS) {
2642 2642 ibd_print_warn(state, "ibt_detach(): global "
2643 2643 "failed, ret=%d", ret);
2644 2644 }
2645 2645 ibd_gstate.ig_ibt_hdl = NULL;
2646 2646 }
2647 2647 mutex_exit(&ibd_gstate.ig_mutex);
2648 2648
2649 2649 if (progress & IBD_DRV_TXINTR_ADDED) {
2650 2650 ddi_remove_softintr(state->id_tx);
2651 2651 state->id_tx = NULL;
2652 2652 state->id_mac_state &= (~IBD_DRV_TXINTR_ADDED);
2653 2653 }
2654 2654
2655 2655 if (progress & IBD_DRV_RXINTR_ADDED) {
2656 2656 ddi_remove_softintr(state->id_rx);
2657 2657 state->id_rx = NULL;
2658 2658 state->id_mac_state &= (~IBD_DRV_RXINTR_ADDED);
2659 2659 }
2660 2660
2661 2661 #ifdef DEBUG
2662 2662 if (progress & IBD_DRV_RC_PRIVATE_STATE) {
2663 2663 kstat_delete(state->rc_ksp);
2664 2664 state->id_mac_state &= (~IBD_DRV_RC_PRIVATE_STATE);
2665 2665 }
2666 2666 #endif
2667 2667
2668 2668 if (progress & IBD_DRV_STATE_INITIALIZED) {
2669 2669 ibd_state_fini(state);
2670 2670 state->id_mac_state &= (~IBD_DRV_STATE_INITIALIZED);
2671 2671 }
2672 2672 }
2673 2673
2674 2674 int
2675 2675 ibd_part_attach(ibd_state_t *state, dev_info_t *dip)
2676 2676 {
2677 2677 ibt_status_t ret;
2678 2678 int rv;
2679 2679 kthread_t *kht;
2680 2680
2681 2681 /*
2682 2682 * Initialize mutexes and condition variables
2683 2683 */
2684 2684 if (ibd_state_init(state, dip) != DDI_SUCCESS) {
2685 2685 DPRINT(10, "ibd_part_attach: failed in ibd_state_init()");
2686 2686 return (DDI_FAILURE);
2687 2687 }
2688 2688 state->id_mac_state |= IBD_DRV_STATE_INITIALIZED;
2689 2689
2690 2690 /*
2691 2691 * Allocate rx,tx softintr
2692 2692 */
2693 2693 if (ibd_rx_softintr == 1) {
2694 2694 if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx,
2695 2695 NULL, NULL, ibd_intr, (caddr_t)state)) != DDI_SUCCESS) {
2696 2696 DPRINT(10, "ibd_part_attach: failed in "
2697 2697 "ddi_add_softintr(id_rx), ret=%d", rv);
2698 2698 return (DDI_FAILURE);
2699 2699 }
2700 2700 state->id_mac_state |= IBD_DRV_RXINTR_ADDED;
2701 2701 }
2702 2702 if (ibd_tx_softintr == 1) {
2703 2703 if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx,
2704 2704 NULL, NULL, ibd_tx_recycle,
2705 2705 (caddr_t)state)) != DDI_SUCCESS) {
2706 2706 DPRINT(10, "ibd_part_attach: failed in "
2707 2707 "ddi_add_softintr(id_tx), ret=%d", rv);
2708 2708 return (DDI_FAILURE);
2709 2709 }
2710 2710 state->id_mac_state |= IBD_DRV_TXINTR_ADDED;
2711 2711 }
2712 2712
2713 2713 /*
2714 2714 * Attach to IBTL
2715 2715 */
2716 2716 mutex_enter(&ibd_gstate.ig_mutex);
2717 2717 if (ibd_gstate.ig_ibt_hdl == NULL) {
2718 2718 if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state,
2719 2719 &ibd_gstate.ig_ibt_hdl)) != IBT_SUCCESS) {
2720 2720 DPRINT(10, "ibd_part_attach: global: failed in "
2721 2721 "ibt_attach(), ret=%d", ret);
2722 2722 mutex_exit(&ibd_gstate.ig_mutex);
2723 2723 return (DDI_FAILURE);
2724 2724 }
2725 2725 }
2726 2726 if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state,
2727 2727 &state->id_ibt_hdl)) != IBT_SUCCESS) {
2728 2728 DPRINT(10, "ibd_part_attach: failed in ibt_attach(), ret=%d",
2729 2729 ret);
2730 2730 mutex_exit(&ibd_gstate.ig_mutex);
2731 2731 return (DDI_FAILURE);
2732 2732 }
2733 2733 ibd_gstate.ig_ibt_hdl_ref_cnt++;
2734 2734 mutex_exit(&ibd_gstate.ig_mutex);
2735 2735 state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE;
2736 2736
2737 2737 /*
2738 2738 * Open the HCA
2739 2739 */
2740 2740 if ((ret = ibt_open_hca(state->id_ibt_hdl, state->id_hca_guid,
2741 2741 &state->id_hca_hdl)) != IBT_SUCCESS) {
2742 2742 DPRINT(10, "ibd_part_attach: ibt_open_hca() failed, ret=%d",
2743 2743 ret);
2744 2744 return (DDI_FAILURE);
2745 2745 }
2746 2746 state->id_mac_state |= IBD_DRV_HCA_OPENED;
2747 2747
2748 2748 #ifdef DEBUG
2749 2749 /* Initialize Driver Counters for Reliable Connected Mode */
2750 2750 if (state->id_enable_rc) {
2751 2751 if (ibd_rc_init_stats(state) != DDI_SUCCESS) {
2752 2752 DPRINT(10, "ibd_part_attach: failed in "
2753 2753 "ibd_rc_init_stats");
2754 2754 return (DDI_FAILURE);
2755 2755 }
2756 2756 state->id_mac_state |= IBD_DRV_RC_PRIVATE_STATE;
2757 2757 }
2758 2758 #endif
2759 2759
2760 2760 /*
2761 2761 * Record capabilities
2762 2762 */
2763 2763 (void) ibd_record_capab(state);
2764 2764
2765 2765 /*
2766 2766 * Allocate a protection domain on the HCA
2767 2767 */
2768 2768 if ((ret = ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS,
2769 2769 &state->id_pd_hdl)) != IBT_SUCCESS) {
2770 2770 DPRINT(10, "ibd_part_attach: ibt_alloc_pd() failed, ret=%d",
2771 2771 ret);
2772 2772 return (DDI_FAILURE);
2773 2773 }
2774 2774 state->id_mac_state |= IBD_DRV_PD_ALLOCD;
2775 2775
2776 2776
2777 2777 /*
2778 2778	 * We need to initialize the req_list that is required for the
2779 2779 * operation of the async_thread.
2780 2780 */
2781 2781 mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL);
2782 2782 cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL);
2783 2783 list_create(&state->id_req_list, sizeof (ibd_req_t),
2784 2784 offsetof(ibd_req_t, rq_list));
2785 2785 state->id_mac_state |= IBD_DRV_REQ_LIST_INITED;
2786 2786
2787 2787 /*
2788 2788 * Create the async thread; thread_create never fails.
2789 2789 */
2790 2790 kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0,
2791 2791 TS_RUN, minclsyspri);
2792 2792 state->id_async_thrid = kht->t_did;
2793 2793 state->id_mac_state |= IBD_DRV_ASYNC_THR_CREATED;
2794 2794
2795 2795 return (DDI_SUCCESS);
2796 2796 }
2797 2797
2798 2798 /*
2799 2799 * Attach device to the IO framework.
2800 2800 */
2801 2801 static int
2802 2802 ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2803 2803 {
2804 2804 int ret;
2805 2805
2806 2806 switch (cmd) {
2807 2807 case DDI_ATTACH:
2808 2808 ret = ibd_port_attach(dip);
2809 2809 break;
2810 2810 default:
2811 2811 ret = DDI_FAILURE;
2812 2812 break;
2813 2813 }
2814 2814 return (ret);
2815 2815 }
2816 2816
2817 2817 /*
2818 2818 * Detach device from the IO framework.
2819 2819 */
2820 2820 static int
2821 2821 ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2822 2822 {
2823 2823 ibd_state_t *state;
2824 2824 int instance;
2825 2825
2826 2826 /*
2827 2827 * IBD doesn't support suspend/resume
2828 2828 */
2829 2829 if (cmd != DDI_DETACH)
2830 2830 return (DDI_FAILURE);
2831 2831
2832 2832 /*
2833 2833 * Get the instance softstate
2834 2834 */
2835 2835 instance = ddi_get_instance(dip);
2836 2836 state = ddi_get_soft_state(ibd_list, instance);
2837 2837
2838 2838 /*
2839 2839	 * Release all resources we're still holding. Note that if we'd
2840 2840 * done ibd_attach(), ibd_m_start() and ibd_m_stop() correctly
2841 2841 * so far, we should find all the flags we need in id_mac_state.
2842 2842 */
2843 2843 return (ibd_port_unattach(state, dip));
2844 2844 }
2845 2845
2846 2846 /*
2847 2847 * Pre ibt_attach() driver initialization
2848 2848 */
2849 2849 static int
2850 2850 ibd_state_init(ibd_state_t *state, dev_info_t *dip)
2851 2851 {
2852 2852 char buf[64];
2853 2853
2854 2854 mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL);
2855 2855 state->id_link_state = LINK_STATE_UNKNOWN;
2856 2856
2857 2857 mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL);
2858 2858 cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL);
2859 2859 state->id_trap_stop = B_TRUE;
2860 2860 state->id_trap_inprog = 0;
2861 2861
2862 2862 mutex_init(&state->id_scq_poll_lock, NULL, MUTEX_DRIVER, NULL);
2863 2863 mutex_init(&state->id_rcq_poll_lock, NULL, MUTEX_DRIVER, NULL);
2864 2864 state->id_dip = dip;
2865 2865
2866 2866 mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL);
2867 2867
2868 2868 mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2869 2869 mutex_init(&state->id_tx_rel_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2870 2870 mutex_init(&state->id_txpost_lock, NULL, MUTEX_DRIVER, NULL);
2871 2871 state->id_tx_busy = 0;
2872 2872 mutex_init(&state->id_lso_lock, NULL, MUTEX_DRIVER, NULL);
2873 2873
2874 2874 state->id_rx_list.dl_bufs_outstanding = 0;
2875 2875 state->id_rx_list.dl_cnt = 0;
2876 2876 mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2877 2877 mutex_init(&state->id_rx_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2878 2878 (void) sprintf(buf, "ibd_req%d_%x_%u", ddi_get_instance(dip),
2879 2879 state->id_pkey, state->id_plinkid);
2880 2880 state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t),
2881 2881 0, NULL, NULL, NULL, NULL, NULL, 0);
2882 2882
2883 2883 /* For Reliable Connected Mode */
2884 2884 mutex_init(&state->rc_rx_lock, NULL, MUTEX_DRIVER, NULL);
2885 2885 mutex_init(&state->rc_tx_large_bufs_lock, NULL, MUTEX_DRIVER, NULL);
2886 2886 mutex_init(&state->rc_srq_rwqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2887 2887 mutex_init(&state->rc_srq_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2888 2888 mutex_init(&state->rc_pass_chan_list.chan_list_mutex, NULL,
2889 2889 MUTEX_DRIVER, NULL);
2890 2890 mutex_init(&state->rc_timeout_lock, NULL, MUTEX_DRIVER, NULL);
2891 2891
2892 2892 /*
2893 2893 * Make the default link mode as RC. If this fails during connection
2894 2894 * setup, the link mode is automatically transitioned to UD.
2895 2895 * Also set the RC MTU.
2896 2896 */
2897 2897 state->id_enable_rc = IBD_DEF_LINK_MODE;
2898 2898 state->rc_mtu = IBD_DEF_RC_MAX_MTU;
2899 2899 state->id_mtu = IBD_DEF_MAX_MTU;
2900 2900
2901 2901	/* Initialize all tunables to defaults */
2902 2902 state->id_lso_policy = IBD_DEF_LSO_POLICY;
2903 2903 state->id_num_lso_bufs = IBD_DEF_NUM_LSO_BUFS;
2904 2904 state->id_num_ah = IBD_DEF_NUM_AH;
2905 2905 state->id_hash_size = IBD_DEF_HASH_SIZE;
2906 2906 state->id_create_broadcast_group = IBD_DEF_CREATE_BCAST_GROUP;
2907 2907 state->id_allow_coalesce_comp_tuning = IBD_DEF_COALESCE_COMPLETIONS;
2908 2908 state->id_ud_rx_comp_count = IBD_DEF_UD_RX_COMP_COUNT;
2909 2909 state->id_ud_rx_comp_usec = IBD_DEF_UD_RX_COMP_USEC;
2910 2910 state->id_ud_tx_comp_count = IBD_DEF_UD_TX_COMP_COUNT;
2911 2911 state->id_ud_tx_comp_usec = IBD_DEF_UD_TX_COMP_USEC;
2912 2912 state->id_rc_rx_comp_count = IBD_DEF_RC_RX_COMP_COUNT;
2913 2913 state->id_rc_rx_comp_usec = IBD_DEF_RC_RX_COMP_USEC;
2914 2914 state->id_rc_tx_comp_count = IBD_DEF_RC_TX_COMP_COUNT;
2915 2915 state->id_rc_tx_comp_usec = IBD_DEF_RC_TX_COMP_USEC;
2916 2916 state->id_ud_tx_copy_thresh = IBD_DEF_UD_TX_COPY_THRESH;
2917 2917 state->id_rc_rx_copy_thresh = IBD_DEF_RC_RX_COPY_THRESH;
2918 2918 state->id_rc_tx_copy_thresh = IBD_DEF_RC_TX_COPY_THRESH;
2919 2919 state->id_ud_num_rwqe = IBD_DEF_UD_NUM_RWQE;
2920 2920 state->id_ud_num_swqe = IBD_DEF_UD_NUM_SWQE;
2921 2921 state->id_rc_num_rwqe = IBD_DEF_RC_NUM_RWQE;
2922 2922 state->id_rc_num_swqe = IBD_DEF_RC_NUM_SWQE;
2923 2923 state->rc_enable_srq = IBD_DEF_RC_ENABLE_SRQ;
2924 2924 state->id_rc_num_srq = IBD_DEF_RC_NUM_SRQ;
2925 2925 state->id_rc_rx_rwqe_thresh = IBD_DEF_RC_RX_RWQE_THRESH;
2926 2926
2927 2927 return (DDI_SUCCESS);
2928 2928 }
2929 2929
2930 2930 /*
2931 2931 * Post ibt_detach() driver deconstruction
2932 2932 */
2933 2933 static void
2934 2934 ibd_state_fini(ibd_state_t *state)
2935 2935 {
2936 2936 kmem_cache_destroy(state->id_req_kmc);
2937 2937
2938 2938 mutex_destroy(&state->id_rx_list.dl_mutex);
2939 2939 mutex_destroy(&state->id_rx_free_list.dl_mutex);
2940 2940
2941 2941 mutex_destroy(&state->id_txpost_lock);
2942 2942 mutex_destroy(&state->id_tx_list.dl_mutex);
2943 2943 mutex_destroy(&state->id_tx_rel_list.dl_mutex);
2944 2944 mutex_destroy(&state->id_lso_lock);
2945 2945
2946 2946 mutex_destroy(&state->id_sched_lock);
2947 2947 mutex_destroy(&state->id_scq_poll_lock);
2948 2948 mutex_destroy(&state->id_rcq_poll_lock);
2949 2949
2950 2950 cv_destroy(&state->id_trap_cv);
2951 2951 mutex_destroy(&state->id_trap_lock);
2952 2952 mutex_destroy(&state->id_link_mutex);
2953 2953
2954 2954 /* For Reliable Connected Mode */
2955 2955 mutex_destroy(&state->rc_timeout_lock);
2956 2956 mutex_destroy(&state->rc_srq_free_list.dl_mutex);
2957 2957 mutex_destroy(&state->rc_srq_rwqe_list.dl_mutex);
2958 2958 mutex_destroy(&state->rc_pass_chan_list.chan_list_mutex);
2959 2959 mutex_destroy(&state->rc_tx_large_bufs_lock);
2960 2960 mutex_destroy(&state->rc_rx_lock);
2961 2961 }
2962 2962
2963 2963 /*
2964 2964 * Fetch link speed from SA for snmp ifspeed reporting.
2965 2965 */
2966 2966 static uint64_t
2967 2967 ibd_get_portspeed(ibd_state_t *state)
2968 2968 {
2969 2969 int ret;
2970 2970 ibt_path_info_t path;
2971 2971 ibt_path_attr_t path_attr;
2972 2972 uint8_t num_paths;
2973 2973 uint64_t ifspeed;
2974 2974
2975 2975 /*
2976 2976 * Due to serdes 8b10b encoding on the wire, 2.5 Gbps on wire
2977 2977 * translates to 2 Gbps data rate. Thus, 1X single data rate is
2978 2978 * 2000000000. Start with that as default.
2979 2979 */
2980 2980 ifspeed = 2000000000;
2981 2981
2982 2982 bzero(&path_attr, sizeof (path_attr));
2983 2983
2984 2984 /*
2985 2985 * Get the port speed from Loopback path information.
2986 2986 */
2987 2987 path_attr.pa_dgids = &state->id_sgid;
2988 2988 path_attr.pa_num_dgids = 1;
2989 2989 path_attr.pa_sgid = state->id_sgid;
2990 2990
2991 2991 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS,
2992 2992 &path_attr, 1, &path, &num_paths) != IBT_SUCCESS)
2993 2993 goto earlydone;
2994 2994
2995 2995 if (num_paths < 1)
2996 2996 goto earlydone;
2997 2997
2998 2998 /*
2999 2999 * In case SA does not return an expected value, report the default
3000 3000 * speed as 1X.
3001 3001 */
3002 3002 ret = 1;
3003 3003 switch (path.pi_prim_cep_path.cep_adds_vect.av_srate) {
3004 3004 case IBT_SRATE_2: /* 1X SDR i.e 2.5 Gbps */
3005 3005 ret = 1;
3006 3006 break;
3007 3007 case IBT_SRATE_10: /* 4X SDR or 1X QDR i.e 10 Gbps */
3008 3008 ret = 4;
3009 3009 break;
3010 3010 case IBT_SRATE_30: /* 12X SDR i.e 30 Gbps */
3011 3011 ret = 12;
3012 3012 break;
3013 3013 case IBT_SRATE_5: /* 1X DDR i.e 5 Gbps */
3014 3014 ret = 2;
3015 3015 break;
3016 3016 case IBT_SRATE_20: /* 4X DDR or 8X SDR i.e 20 Gbps */
3017 3017 ret = 8;
3018 3018 break;
3019 3019 case IBT_SRATE_40: /* 8X DDR or 4X QDR i.e 40 Gbps */
3020 3020 ret = 16;
3021 3021 break;
3022 3022 case IBT_SRATE_60: /* 12X DDR i.e 60 Gbps */
3023 3023 ret = 24;
3024 3024 break;
3025 3025 case IBT_SRATE_80: /* 8X QDR i.e 80 Gbps */
3026 3026 ret = 32;
3027 3027 break;
3028 3028 case IBT_SRATE_120: /* 12X QDR i.e 120 Gbps */
3029 3029 ret = 48;
3030 3030 break;
3031 3031 }
3032 3032
3033 3033 ifspeed *= ret;
3034 3034
3035 3035 earlydone:
3036 3036 return (ifspeed);
3037 3037 }
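A quick worked example of the computation above: the base ifspeed is
2000000000 (the 2 Gbps of data rate left from 2.5 Gbps of 1X signalling after
8b10b encoding), so a 4X DDR link (IBT_SRATE_20) takes the ret = 8 branch and
reports 8 * 2000000000 = 16000000000, i.e. 16 Gbps, as the snmp ifspeed; if
ibt_get_paths() fails or returns no path, the 1X default of 2 Gbps is reported
instead.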
3038 3038
3039 3039 /*
3040 3040 * Search input mcg list (id_mc_full or id_mc_non) for an entry
3041 3041 * representing the input mcg mgid.
3042 3042 */
3043 3043 static ibd_mce_t *
3044 3044 ibd_mcache_find(ib_gid_t mgid, struct list *mlist)
3045 3045 {
3046 3046 ibd_mce_t *ptr = list_head(mlist);
3047 3047
3048 3048 /*
3049 3049 * Do plain linear search.
3050 3050 */
3051 3051 while (ptr != NULL) {
3052 3052 if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid,
3053 3053 sizeof (ib_gid_t)) == 0)
3054 3054 return (ptr);
3055 3055 ptr = list_next(mlist, ptr);
3056 3056 }
3057 3057 return (NULL);
3058 3058 }
3059 3059
3060 3060 /*
3061 3061 * Execute IBA JOIN.
3062 3062 */
3063 3063 static ibt_status_t
3064 3064 ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce)
3065 3065 {
3066 3066 ibt_mcg_attr_t mcg_attr;
3067 3067
3068 3068 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
3069 3069 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
3070 3070 mcg_attr.mc_mgid = mgid;
3071 3071 mcg_attr.mc_join_state = mce->mc_jstate;
3072 3072 mcg_attr.mc_scope = state->id_scope;
3073 3073 mcg_attr.mc_pkey = state->id_pkey;
3074 3074 mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow;
3075 3075 mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
3076 3076 mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass;
3077 3077 return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info,
3078 3078 NULL, NULL));
3079 3079 }
3080 3080
3081 3081 /*
3082 3082 * This code JOINs the port in the proper way (depending on the join
3083 3083 * state) so that IBA fabric will forward mcg packets to/from the port.
3084 3084 * It also attaches the QPN to the mcg so it can receive those mcg
3085 3085 * packets. This code makes sure not to attach the mcg to the QP if
3086 3086 * that has been previously done due to the mcg being joined with a
3087 3087 * different join state, even though this is not required by SWG_0216,
3088 3088 * refid 3610.
3089 3089 */
3090 3090 static ibd_mce_t *
3091 3091 ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
3092 3092 {
3093 3093 ibt_status_t ibt_status;
3094 3094 ibd_mce_t *mce, *tmce, *omce = NULL;
3095 3095 boolean_t do_attach = B_TRUE;
3096 3096
3097 3097 DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n",
3098 3098 jstate, mgid.gid_prefix, mgid.gid_guid);
3099 3099
3100 3100 /*
3101 3101 * For enable_multicast Full member joins, we need to do some
3102 3102 * extra work. If there is already an mce on the list that
3103 3103 * indicates full membership, that means the membership has
3104 3104 * not yet been dropped (since the disable_multicast was issued)
3105 3105 * because there are pending Tx's to the mcg; in that case, just
3106 3106 * mark the mce not to be reaped when the Tx completion queues
3107 3107 * an async reap operation.
3108 3108 *
3109 3109 * If there is already an mce on the list indicating sendonly
3110 3110 * membership, try to promote to full membership. Be careful
3111 3111 * not to deallocate the old mce, since there might be an AH
3112 3112 * pointing to it; instead, update the old mce with new data
3113 3113 * that tracks the full membership.
3114 3114 */
3115 3115 if ((jstate == IB_MC_JSTATE_FULL) && ((omce =
3116 3116 IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) {
3117 3117 if (omce->mc_jstate == IB_MC_JSTATE_FULL) {
3118 3118 ASSERT(omce->mc_fullreap);
3119 3119 omce->mc_fullreap = B_FALSE;
3120 3120 return (omce);
3121 3121 } else {
3122 3122 ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON);
3123 3123 }
3124 3124 }
3125 3125
3126 3126 /*
3127 3127 * Allocate the ibd_mce_t to track this JOIN.
3128 3128 */
3129 3129 mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP);
3130 3130 mce->mc_fullreap = B_FALSE;
3131 3131 mce->mc_jstate = jstate;
3132 3132
3133 3133 if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) {
3134 3134 DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d",
3135 3135 ibt_status);
3136 3136 kmem_free(mce, sizeof (ibd_mce_t));
3137 3137 return (NULL);
3138 3138 }
3139 3139
3140 3140 /*
3141 3141 * Is an IBA attach required? Not if the interface is already joined
3142 3142 * to the mcg in a different appropriate join state.
3143 3143 */
3144 3144 if (jstate == IB_MC_JSTATE_NON) {
3145 3145 tmce = IBD_MCACHE_FIND_FULL(state, mgid);
3146 3146 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
3147 3147 do_attach = B_FALSE;
3148 3148 } else if (jstate == IB_MC_JSTATE_FULL) {
3149 3149 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
3150 3150 do_attach = B_FALSE;
3151 3151 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
3152 3152 do_attach = B_FALSE;
3153 3153 }
3154 3154
3155 3155 if (do_attach) {
3156 3156 /*
3157 3157 * Do the IBA attach.
3158 3158 */
3159 3159 DPRINT(10, "ibd_join_group: ibt_attach_mcg \n");
3160 3160 if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl,
3161 3161 &mce->mc_info)) != IBT_SUCCESS) {
3162 3162 DPRINT(10, "ibd_join_group : failed qp attachment "
3163 3163 "%d\n", ibt_status);
3164 3164 /*
3165 3165 * NOTE that we should probably preserve the join info
3166 3166 * in the list and later try to leave again at detach
3167 3167 * time.
3168 3168 */
3169 3169 (void) ibt_leave_mcg(state->id_sgid, mgid,
3170 3170 state->id_sgid, jstate);
3171 3171 kmem_free(mce, sizeof (ibd_mce_t));
3172 3172 return (NULL);
3173 3173 }
3174 3174 }
3175 3175
3176 3176 /*
3177 3177 * Insert the ibd_mce_t in the proper list.
3178 3178 */
3179 3179 if (jstate == IB_MC_JSTATE_NON) {
3180 3180 IBD_MCACHE_INSERT_NON(state, mce);
3181 3181 } else {
3182 3182 /*
3183 3183 * Set up the mc_req fields used for reaping the
3184 3184 * mcg in case of delayed tx completion (see
3185 3185 * ibd_tx_cleanup()). Also done for sendonly join in
3186 3186 * case we are promoted to fullmembership later and
3187 3187 * keep using the same mce.
3188 3188 */
3189 3189 mce->mc_req.rq_gid = mgid;
3190 3190 mce->mc_req.rq_ptr = mce;
3191 3191 /*
3192 3192 * Check whether this is the case of trying to join
3193 3193 * full member, and we were already joined send only.
3194 3194 * We try to drop our SendOnly membership, but it is
3195 3195 * possible that the mcg does not exist anymore (and
3196 3196 * the subnet trap never reached us), so the leave
3197 3197 * operation might fail.
3198 3198 */
3199 3199 if (omce != NULL) {
3200 3200 (void) ibt_leave_mcg(state->id_sgid, mgid,
3201 3201 state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON);
3202 3202 omce->mc_jstate = IB_MC_JSTATE_FULL;
3203 3203 bcopy(&mce->mc_info, &omce->mc_info,
3204 3204 sizeof (ibt_mcg_info_t));
3205 3205 kmem_free(mce, sizeof (ibd_mce_t));
3206 3206 return (omce);
3207 3207 }
3208 3208 mutex_enter(&state->id_mc_mutex);
3209 3209 IBD_MCACHE_INSERT_FULL(state, mce);
3210 3210 mutex_exit(&state->id_mc_mutex);
3211 3211 }
3212 3212
3213 3213 return (mce);
3214 3214 }
3215 3215
3216 3216 /*
3217 3217 * Called during port up event handling to attempt to reacquire full
3218 3218 * membership to an mcg. Stripped down version of ibd_join_group().
3219 3219 * Note that it is possible that the mcg might have gone away, and
3220 3220 * gets recreated at this point.
3221 3221 */
3222 3222 static void
3223 3223 ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce)
3224 3224 {
3225 3225 ib_gid_t mgid;
3226 3226
3227 3227 /*
3228 3228 * If the mc_fullreap flag is set, or this join fails, a subsequent
3229 3229 * reap/leave is going to try to leave the group. We could prevent
3230 3230 * that by adding a boolean flag into ibd_mce_t, if required.
3231 3231 */
3232 3232 if (mce->mc_fullreap)
3233 3233 return;
3234 3234
3235 3235 mgid = mce->mc_info.mc_adds_vect.av_dgid;
3236 3236
3237 3237 DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix,
3238 3238 mgid.gid_guid);
3239 3239
3240 3240 /* While reacquiring, leave and then join the MCG */
3241 3241 (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid,
3242 3242 mce->mc_jstate);
3243 3243 if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS)
3244 3244 ibd_print_warn(state, "Failure on port up to rejoin "
3245 3245 "multicast gid %016llx:%016llx",
3246 3246 (u_longlong_t)mgid.gid_prefix,
3247 3247 (u_longlong_t)mgid.gid_guid);
3248 3248 }
3249 3249
3250 3250 /*
3251 3251 * This code handles delayed Tx completion cleanups for mcg's to which
3252 3252 * disable_multicast has been issued, regular mcg related cleanups during
3253 3253 * disable_multicast, disable_promiscuous and mcg traps, as well as
3254 3254 * cleanups during driver detach time. Depending on the join state,
3255 3255 * it deletes the mce from the appropriate list and issues the IBA
3256 3256 * leave/detach; except in the disable_multicast case when the mce
3257 3257 * is left on the active list for a subsequent Tx completion cleanup.
3258 3258 */
3259 3259 static void
3260 3260 ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid,
3261 3261 uint8_t jstate)
3262 3262 {
3263 3263 ibd_mce_t *tmce;
3264 3264 boolean_t do_detach = B_TRUE;
3265 3265
3266 3266 /*
3267 3267 * Before detaching, we must check whether the other list
3268 3268 * contains the mcg; if we detach blindly, the consumer
3269 3269 * who set up the other list will also stop receiving
3270 3270 * traffic.
3271 3271 */
3272 3272 if (jstate == IB_MC_JSTATE_FULL) {
3273 3273 /*
3274 3274 * The following check is only relevant while coming
3275 3275 * from the Tx completion path in the reap case.
3276 3276 */
3277 3277 if (!mce->mc_fullreap)
3278 3278 return;
3279 3279 mutex_enter(&state->id_mc_mutex);
3280 3280 IBD_MCACHE_PULLOUT_FULL(state, mce);
3281 3281 mutex_exit(&state->id_mc_mutex);
3282 3282 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
3283 3283 do_detach = B_FALSE;
3284 3284 } else if (jstate == IB_MC_JSTATE_NON) {
3285 3285 IBD_MCACHE_PULLOUT_NON(state, mce);
3286 3286 tmce = IBD_MCACHE_FIND_FULL(state, mgid);
3287 3287 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
3288 3288 do_detach = B_FALSE;
3289 3289 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
3290 3290 mutex_enter(&state->id_mc_mutex);
3291 3291 IBD_MCACHE_PULLOUT_FULL(state, mce);
3292 3292 mutex_exit(&state->id_mc_mutex);
3293 3293 do_detach = B_FALSE;
3294 3294 }
3295 3295
3296 3296 /*
3297 3297 * If we are reacting to a mcg trap and leaving our sendonly or
3298 3298 * non membership, the mcg is possibly already gone, so attempting
3299 3299 * to leave might fail. On the other hand, we must try to leave
3300 3300 * anyway, since this might be a trap from long ago, and we could
3301 3301 * have potentially sendonly joined to a recent incarnation of
3302 3302  	 * the mcg and are about to lose track of this information.
3303 3303 */
3304 3304 if (do_detach) {
3305 3305 DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : "
3306 3306 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
3307 3307 (void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info);
3308 3308 }
3309 3309
3310 3310 (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate);
3311 3311 kmem_free(mce, sizeof (ibd_mce_t));
3312 3312 }
3313 3313
3314 3314 /*
3315 3315 * Async code executed due to multicast and promiscuous disable requests
3316 3316 * and mcg trap handling; also executed during driver detach. Mostly, a
3317 3317 * leave and detach is done; except for the fullmember case when Tx
3318 3318  	 * requests are pending, in which case arrangements are made for subsequent
3319 3319 * cleanup on Tx completion.
3320 3320 */
3321 3321 static void
3322 3322 ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
3323 3323 {
3324 3324 ipoib_mac_t mcmac;
3325 3325 boolean_t recycled;
3326 3326 ibd_mce_t *mce;
3327 3327
3328 3328 DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n",
3329 3329 jstate, mgid.gid_prefix, mgid.gid_guid);
3330 3330
3331 3331 if (jstate == IB_MC_JSTATE_NON) {
3332 3332 recycled = B_TRUE;
3333 3333 mce = IBD_MCACHE_FIND_NON(state, mgid);
3334 3334 /*
3335 3335 * In case we are handling a mcg trap, we might not find
3336 3336 * the mcg in the non list.
3337 3337 */
3338 3338 if (mce == NULL) {
3339 3339 return;
3340 3340 }
3341 3341 } else {
3342 3342 mce = IBD_MCACHE_FIND_FULL(state, mgid);
3343 3343
3344 3344 /*
3345 3345 * In case we are handling a mcg trap, make sure the trap
3346 3346 * is not arriving late; if we have an mce that indicates
3347 3347 * that we are already a fullmember, that would be a clear
3348 3348 * indication that the trap arrived late (ie, is for a
3349 3349 * previous incarnation of the mcg).
3350 3350 */
3351 3351 if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) {
3352 3352 if ((mce == NULL) || (mce->mc_jstate ==
3353 3353 IB_MC_JSTATE_FULL)) {
3354 3354 return;
3355 3355 }
3356 3356 } else {
3357 3357 ASSERT(jstate == IB_MC_JSTATE_FULL);
3358 3358
3359 3359 /*
3360 3360 * If join group failed, mce will be NULL here.
3361 3361 * This is because in GLDv3 driver, set multicast
3362 3362 * will always return success.
3363 3363 */
3364 3364 if (mce == NULL) {
3365 3365 return;
3366 3366 }
3367 3367
3368 3368 mce->mc_fullreap = B_TRUE;
3369 3369 }
3370 3370
3371 3371 /*
3372 3372 * If no pending Tx's remain that reference the AH
3373 3373 * for the mcg, recycle it from active to free list.
3374 3374 * Else in the IB_MC_JSTATE_FULL case, just mark the AH,
3375 3375 * so the last completing Tx will cause an async reap
3376 3376 * operation to be invoked, at which time we will drop our
3377 3377 * membership to the mcg so that the pending Tx's complete
3378 3378 * successfully. Refer to comments on "AH and MCE active
3379 3379 * list manipulation" at top of this file. The lock protects
3380 3380 * against Tx fast path and Tx cleanup code.
3381 3381 */
3382 3382 mutex_enter(&state->id_ac_mutex);
3383 3383 ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid);
3384 3384 recycled = ibd_acache_recycle(state, &mcmac, (jstate ==
3385 3385 IB_MC_JSTATE_SEND_ONLY_NON));
3386 3386 mutex_exit(&state->id_ac_mutex);
3387 3387 }
3388 3388
3389 3389 if (recycled) {
3390 3390 DPRINT(2, "ibd_leave_group : leave_group reaping : "
3391 3391 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
3392 3392 ibd_async_reap_group(state, mce, mgid, jstate);
3393 3393 }
3394 3394 }
3395 3395
3396 3396 /*
3397 3397 * Find the broadcast address as defined by IPoIB; implicitly
3398 3398 * determines the IBA scope, mtu, tclass etc of the link the
3399 3399 * interface is going to be a member of.
3400 3400 */
3401 3401 static ibt_status_t
3402 3402 ibd_find_bgroup(ibd_state_t *state)
3403 3403 {
3404 3404 ibt_mcg_attr_t mcg_attr;
3405 3405 uint_t numg;
3406 3406 uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL,
3407 3407 IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL,
3408 3408 IB_MC_SCOPE_GLOBAL };
3409 3409 int i, mcgmtu;
3410 3410 boolean_t found = B_FALSE;
3411 3411 int ret;
3412 3412 ibt_mcg_info_t mcg_info;
3413 3413
3414 3414 state->id_bgroup_created = B_FALSE;
3415 3415 state->id_bgroup_present = B_FALSE;
3416 3416
3417 3417 query_bcast_grp:
3418 3418 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
3419 3419 mcg_attr.mc_pkey = state->id_pkey;
3420 3420 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
3421 3421 state->id_mgid.gid_guid = IB_MGID_IPV4_LOWGRP_MASK;
3422 3422 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))
3423 3423
3424 3424 for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) {
3425 3425 state->id_scope = mcg_attr.mc_scope = scopes[i];
3426 3426
3427 3427 /*
3428 3428 * Look for the IPoIB broadcast group.
3429 3429 */
3430 3430 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
3431 3431 state->id_mgid.gid_prefix =
3432 3432 (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
3433 3433 ((uint64_t)state->id_scope << 48) |
3434 3434 ((uint32_t)(state->id_pkey << 16)));
3435 3435 mcg_attr.mc_mgid = state->id_mgid;
3436 3436 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))
3437 3437 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1,
3438 3438 &state->id_mcinfo, &numg) == IBT_SUCCESS) {
3439 3439 found = B_TRUE;
3440 3440 break;
3441 3441 }
3442 3442 }
3443 3443
3444 3444 if (!found) {
3445 3445 if (state->id_create_broadcast_group) {
3446 3446 /*
3447 3447 * If we created the broadcast group, but failed to
3448 3448 * find it, we can't do anything except leave the
3449 3449 * one we created and return failure.
3450 3450 */
3451 3451 if (state->id_bgroup_created) {
3452 3452 ibd_print_warn(state, "IPoIB broadcast group "
3453 3453 "absent. Unable to query after create.");
3454 3454 goto find_bgroup_fail;
3455 3455 }
3456 3456
3457 3457 /*
3458 3458 * Create the ipoib broadcast group if it didn't exist
3459 3459 */
3460 3460 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
3461 3461 mcg_attr.mc_qkey = IBD_DEFAULT_QKEY;
3462 3462 mcg_attr.mc_join_state = IB_MC_JSTATE_FULL;
3463 3463 mcg_attr.mc_scope = IB_MC_SCOPE_SUBNET_LOCAL;
3464 3464 mcg_attr.mc_pkey = state->id_pkey;
3465 3465 mcg_attr.mc_flow = 0;
3466 3466 mcg_attr.mc_sl = 0;
3467 3467 mcg_attr.mc_tclass = 0;
3468 3468 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
3469 3469 state->id_mgid.gid_prefix =
3470 3470 (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
3471 3471 ((uint64_t)IB_MC_SCOPE_SUBNET_LOCAL << 48) |
3472 3472 ((uint32_t)(state->id_pkey << 16)));
3473 3473 mcg_attr.mc_mgid = state->id_mgid;
3474 3474 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))
3475 3475
3476 3476 if ((ret = ibt_join_mcg(state->id_sgid, &mcg_attr,
3477 3477 &mcg_info, NULL, NULL)) != IBT_SUCCESS) {
3478 3478 ibd_print_warn(state, "IPoIB broadcast group "
3479 3479 "absent, create failed: ret = %d\n", ret);
3480 3480 state->id_bgroup_created = B_FALSE;
3481 3481 return (IBT_FAILURE);
3482 3482 }
3483 3483 state->id_bgroup_created = B_TRUE;
3484 3484 goto query_bcast_grp;
3485 3485 } else {
3486 3486 ibd_print_warn(state, "IPoIB broadcast group absent");
3487 3487 return (IBT_FAILURE);
3488 3488 }
3489 3489 }
3490 3490
3491 3491 /*
3492 3492 * Assert that the mcg mtu <= id_mtu. Fill in updated id_mtu.
3493 3493 */
3494 3494 mcgmtu = (128 << state->id_mcinfo->mc_mtu);
3495 3495 if (state->id_mtu < mcgmtu) {
3496 3496 ibd_print_warn(state, "IPoIB broadcast group MTU %d "
3497 3497 "greater than port's maximum MTU %d", mcgmtu,
3498 3498 state->id_mtu);
3499 3499 ibt_free_mcg_info(state->id_mcinfo, 1);
3500 3500 goto find_bgroup_fail;
3501 3501 }
3502 3502 state->id_mtu = mcgmtu;
3503 3503 state->id_bgroup_present = B_TRUE;
3504 3504
3505 3505 return (IBT_SUCCESS);
3506 3506
3507 3507 find_bgroup_fail:
3508 3508 if (state->id_bgroup_created) {
3509 3509 (void) ibt_leave_mcg(state->id_sgid,
3510 3510 mcg_info.mc_adds_vect.av_dgid, state->id_sgid,
3511 3511 IB_MC_JSTATE_FULL);
3512 3512 }
3513 3513
3514 3514 return (IBT_FAILURE);
3515 3515 }
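/*
 * Editorial note, not part of ibd.c: a sketch of the two computations in
 * ibd_find_bgroup() above. The broadcast MGID prefix folds the multicast
 * scope and the partition key into the IPoIB IPv4 GID prefix, and the
 * SA-reported MTU enumeration is decoded with a shift. The helpers below
 * are local to this note; ipv4_prefix stands in for IB_MCGID_IPV4_PREFIX.
 */
#include <sys/types.h>

static uint64_t
example_bcast_gid_prefix(uint64_t ipv4_prefix, uint8_t scope, uint16_t pkey)
{
	/* same shape as the gid_prefix assembly in the loop above */
	return ((ipv4_prefix << 32) |
	    ((uint64_t)scope << 48) |
	    ((uint64_t)pkey << 16));
}

static uint_t
example_mcg_mtu_bytes(uint_t mc_mtu_enum)
{
	/* IB MTU enum values 1..5 map to 256, 512, 1024, 2048, 4096 bytes */
	return (128 << mc_mtu_enum);
}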
3516 3516
3517 3517 static int
3518 3518 ibd_alloc_tx_copybufs(ibd_state_t *state)
3519 3519 {
3520 3520 ibt_mr_attr_t mem_attr;
3521 3521
3522 3522 /*
3523 3523 * Allocate one big chunk for all regular tx copy bufs
3524 3524 */
3525 3525 state->id_tx_buf_sz = state->id_mtu;
3526 3526 if (state->id_lso_policy && state->id_lso_capable &&
3527 3527 (state->id_ud_tx_copy_thresh > state->id_mtu)) {
3528 3528 state->id_tx_buf_sz = state->id_ud_tx_copy_thresh;
3529 3529 }
3530 3530
3531 3531 state->id_tx_bufs = kmem_zalloc(state->id_ud_num_swqe *
3532 3532 state->id_tx_buf_sz, KM_SLEEP);
3533 3533
3534 3534 state->id_tx_wqes = kmem_zalloc(state->id_ud_num_swqe *
3535 3535 sizeof (ibd_swqe_t), KM_SLEEP);
3536 3536
3537 3537 /*
3538 3538 * Do one memory registration on the entire txbuf area
3539 3539 */
3540 3540 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_tx_bufs;
3541 3541 mem_attr.mr_len = state->id_ud_num_swqe * state->id_tx_buf_sz;
3542 3542 mem_attr.mr_as = NULL;
3543 3543 mem_attr.mr_flags = IBT_MR_SLEEP;
3544 3544 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
3545 3545 &state->id_tx_mr_hdl, &state->id_tx_mr_desc) != IBT_SUCCESS) {
3546 3546 DPRINT(10, "ibd_alloc_tx_copybufs: ibt_register_mr failed");
3547 3547 kmem_free(state->id_tx_wqes,
3548 3548 state->id_ud_num_swqe * sizeof (ibd_swqe_t));
3549 3549 kmem_free(state->id_tx_bufs,
3550 3550 state->id_ud_num_swqe * state->id_tx_buf_sz);
3551 3551 state->id_tx_bufs = NULL;
3552 3552 return (DDI_FAILURE);
3553 3553 }
3554 3554
3555 3555 return (DDI_SUCCESS);
3556 3556 }
3557 3557
3558 3558 static int
3559 3559 ibd_alloc_tx_lsobufs(ibd_state_t *state)
3560 3560 {
3561 3561 ibt_mr_attr_t mem_attr;
3562 3562 ibd_lsobuf_t *buflist;
3563 3563 ibd_lsobuf_t *lbufp;
3564 3564 ibd_lsobuf_t *tail;
3565 3565 ibd_lsobkt_t *bktp;
3566 3566 uint8_t *membase;
3567 3567 uint8_t *memp;
3568 3568 uint_t memsz;
3569 3569 int i;
3570 3570
3571 3571 /*
3572 3572 * Allocate the lso bucket
3573 3573 */
3574 3574 bktp = kmem_zalloc(sizeof (ibd_lsobkt_t), KM_SLEEP);
3575 3575
3576 3576 /*
3577 3577 * Allocate the entire lso memory and register it
3578 3578 */
3579 3579 memsz = state->id_num_lso_bufs * IBD_LSO_BUFSZ;
3580 3580 membase = kmem_zalloc(memsz, KM_SLEEP);
3581 3581
3582 3582 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)membase;
3583 3583 mem_attr.mr_len = memsz;
3584 3584 mem_attr.mr_as = NULL;
3585 3585 mem_attr.mr_flags = IBT_MR_SLEEP;
3586 3586 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl,
3587 3587 &mem_attr, &bktp->bkt_mr_hdl, &bktp->bkt_mr_desc) != IBT_SUCCESS) {
3588 3588 DPRINT(10, "ibd_alloc_tx_lsobufs: ibt_register_mr failed");
3589 3589 kmem_free(membase, memsz);
3590 3590 kmem_free(bktp, sizeof (ibd_lsobkt_t));
3591 3591 return (DDI_FAILURE);
3592 3592 }
3593 3593
3594 3594 mutex_enter(&state->id_lso_lock);
3595 3595
3596 3596 /*
3597 3597 * Now allocate the buflist. Note that the elements in the buflist and
3598 3598 * the buffers in the lso memory have a permanent 1-1 relation, so we
3599 3599 * can always derive the address of a buflist entry from the address of
3600 3600 * an lso buffer.
3601 3601 */
3602 3602 buflist = kmem_zalloc(state->id_num_lso_bufs * sizeof (ibd_lsobuf_t),
3603 3603 KM_SLEEP);
3604 3604
3605 3605 /*
3606 3606 * Set up the lso buf chain
3607 3607 */
3608 3608 memp = membase;
3609 3609 lbufp = buflist;
3610 3610 for (i = 0; i < state->id_num_lso_bufs; i++) {
3611 3611 lbufp->lb_isfree = 1;
3612 3612 lbufp->lb_buf = memp;
3613 3613 lbufp->lb_next = lbufp + 1;
3614 3614
3615 3615 tail = lbufp;
3616 3616
3617 3617 memp += IBD_LSO_BUFSZ;
3618 3618 lbufp++;
3619 3619 }
3620 3620 tail->lb_next = NULL;
3621 3621
3622 3622 /*
3623 3623 * Set up the LSO buffer information in ibd state
3624 3624 */
3625 3625 bktp->bkt_bufl = buflist;
3626 3626 bktp->bkt_free_head = buflist;
3627 3627 bktp->bkt_mem = membase;
3628 3628 bktp->bkt_nelem = state->id_num_lso_bufs;
3629 3629 bktp->bkt_nfree = bktp->bkt_nelem;
3630 3630
3631 3631 state->id_lso = bktp;
3632 3632 mutex_exit(&state->id_lso_lock);
3633 3633
3634 3634 return (DDI_SUCCESS);
3635 3635 }
3636 3636
3637 3637 /*
3638 3638 * Statically allocate Tx buffer list(s).
3639 3639 */
3640 3640 static int
3641 3641 ibd_init_txlist(ibd_state_t *state)
3642 3642 {
3643 3643 ibd_swqe_t *swqe;
3644 3644 ibt_lkey_t lkey;
3645 3645 int i;
3646 3646 uint_t len;
3647 3647 uint8_t *bufaddr;
3648 3648
3649 3649 if (ibd_alloc_tx_copybufs(state) != DDI_SUCCESS)
3650 3650 return (DDI_FAILURE);
3651 3651
3652 3652 if (state->id_lso_policy && state->id_lso_capable) {
3653 3653 if (ibd_alloc_tx_lsobufs(state) != DDI_SUCCESS)
3654 3654 state->id_lso_capable = B_FALSE;
3655 3655 }
3656 3656
3657 3657 mutex_enter(&state->id_tx_list.dl_mutex);
3658 3658 state->id_tx_list.dl_head = NULL;
3659 3659 state->id_tx_list.dl_pending_sends = B_FALSE;
3660 3660 state->id_tx_list.dl_cnt = 0;
3661 3661 mutex_exit(&state->id_tx_list.dl_mutex);
3662 3662 mutex_enter(&state->id_tx_rel_list.dl_mutex);
3663 3663 state->id_tx_rel_list.dl_head = NULL;
3664 3664 state->id_tx_rel_list.dl_pending_sends = B_FALSE;
3665 3665 state->id_tx_rel_list.dl_cnt = 0;
3666 3666 mutex_exit(&state->id_tx_rel_list.dl_mutex);
3667 3667
3668 3668 /*
3669 3669 * Allocate and setup the swqe list
3670 3670 */
3671 3671 lkey = state->id_tx_mr_desc.md_lkey;
3672 3672 bufaddr = state->id_tx_bufs;
3673 3673 len = state->id_tx_buf_sz;
3674 3674 swqe = state->id_tx_wqes;
3675 3675 mutex_enter(&state->id_tx_list.dl_mutex);
3676 3676 for (i = 0; i < state->id_ud_num_swqe; i++, swqe++, bufaddr += len) {
3677 3677 swqe->swqe_next = NULL;
3678 3678 swqe->swqe_im_mblk = NULL;
3679 3679
3680 3680 swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t)
3681 3681 bufaddr;
3682 3682 swqe->swqe_copybuf.ic_sgl.ds_key = lkey;
3683 3683 swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */
3684 3684
3685 3685 swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe;
3686 3686 swqe->w_swr.wr_flags = IBT_WR_NO_FLAGS;
3687 3687 swqe->w_swr.wr_trans = IBT_UD_SRV;
3688 3688
3689 3689 /* These are set in send */
3690 3690 swqe->w_swr.wr_nds = 0;
3691 3691 swqe->w_swr.wr_sgl = NULL;
3692 3692 swqe->w_swr.wr_opcode = IBT_WRC_SEND;
3693 3693
3694 3694 /* add to list */
3695 3695 state->id_tx_list.dl_cnt++;
3696 3696 swqe->swqe_next = state->id_tx_list.dl_head;
3697 3697 state->id_tx_list.dl_head = SWQE_TO_WQE(swqe);
3698 3698 }
3699 3699 mutex_exit(&state->id_tx_list.dl_mutex);
3700 3700
3701 3701 return (DDI_SUCCESS);
3702 3702 }
3703 3703
3704 3704 static int
3705 3705 ibd_acquire_lsobufs(ibd_state_t *state, uint_t req_sz, ibt_wr_ds_t *sgl_p,
3706 3706 uint32_t *nds_p)
3707 3707 {
3708 3708 ibd_lsobkt_t *bktp;
3709 3709 ibd_lsobuf_t *lbufp;
3710 3710 ibd_lsobuf_t *nextp;
3711 3711 ibt_lkey_t lso_lkey;
3712 3712 uint_t frag_sz;
3713 3713 uint_t num_needed;
3714 3714 int i;
3715 3715
3716 3716 ASSERT(sgl_p != NULL);
3717 3717 ASSERT(nds_p != NULL);
3718 3718 ASSERT(req_sz != 0);
3719 3719
3720 3720 /*
3721 3721 * Determine how many bufs we'd need for the size requested
3722 3722 */
3723 3723 num_needed = req_sz / IBD_LSO_BUFSZ;
3724 3724 if ((frag_sz = req_sz % IBD_LSO_BUFSZ) != 0)
3725 3725 num_needed++;
3726 3726
3727 3727 mutex_enter(&state->id_lso_lock);
3728 3728
3729 3729 /*
3730 3730 * If we don't have enough lso bufs, return failure
3731 3731 */
3732 3732 ASSERT(state->id_lso != NULL);
3733 3733 bktp = state->id_lso;
3734 3734 if (bktp->bkt_nfree < num_needed) {
3735 3735 mutex_exit(&state->id_lso_lock);
3736 3736 return (-1);
3737 3737 }
3738 3738
3739 3739 /*
3740 3740 * Pick the first 'num_needed' bufs from the free list
3741 3741 */
3742 3742 lso_lkey = bktp->bkt_mr_desc.md_lkey;
3743 3743 lbufp = bktp->bkt_free_head;
3744 3744 for (i = 0; i < num_needed; i++) {
3745 3745 ASSERT(lbufp->lb_isfree != 0);
3746 3746 ASSERT(lbufp->lb_buf != NULL);
3747 3747
3748 3748 nextp = lbufp->lb_next;
3749 3749
3750 3750 sgl_p[i].ds_va = (ib_vaddr_t)(uintptr_t)lbufp->lb_buf;
3751 3751 sgl_p[i].ds_key = lso_lkey;
3752 3752 sgl_p[i].ds_len = IBD_LSO_BUFSZ;
3753 3753
3754 3754 lbufp->lb_isfree = 0;
3755 3755 lbufp->lb_next = NULL;
3756 3756
3757 3757 lbufp = nextp;
3758 3758 }
3759 3759 bktp->bkt_free_head = lbufp;
3760 3760
3761 3761 /*
3762 3762 * If the requested size is not a multiple of IBD_LSO_BUFSZ, we need
3763 3763  	 * to adjust the last sgl entry's length. Since we know we need at least
3764 3764 * one, the i-1 use below is ok.
3765 3765 */
3766 3766 if (frag_sz) {
3767 3767 sgl_p[i-1].ds_len = frag_sz;
3768 3768 }
3769 3769
3770 3770 /*
3771 3771 * Update nfree count and return
3772 3772 */
3773 3773 bktp->bkt_nfree -= num_needed;
3774 3774
3775 3775 mutex_exit(&state->id_lso_lock);
3776 3776
3777 3777 *nds_p = num_needed;
3778 3778
3779 3779 return (0);
3780 3780 }
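/*
 * Editorial note, not part of ibd.c: the buffer-count arithmetic used by
 * ibd_acquire_lsobufs() above, shown in isolation. A request that is not
 * a multiple of the LSO buffer size takes one extra buffer and the last
 * SGL entry is trimmed to the remainder. The constant is only a stand-in
 * for IBD_LSO_BUFSZ; the value here is illustrative.
 */
#include <sys/types.h>

#define	EXAMPLE_LSO_BUFSZ	8192

static uint_t
example_lso_bufs_needed(uint_t req_sz, uint_t *last_len)
{
	uint_t num_needed = req_sz / EXAMPLE_LSO_BUFSZ;
	uint_t frag_sz = req_sz % EXAMPLE_LSO_BUFSZ;

	/* req_sz is assumed non-zero, as asserted in the driver */
	if (frag_sz != 0) {
		num_needed++;
		*last_len = frag_sz;	/* last sgl entry is shortened */
	} else {
		*last_len = EXAMPLE_LSO_BUFSZ;
	}
	return (num_needed);
}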
3781 3781
3782 3782 static void
3783 3783 ibd_release_lsobufs(ibd_state_t *state, ibt_wr_ds_t *sgl_p, uint32_t nds)
3784 3784 {
3785 3785 ibd_lsobkt_t *bktp;
3786 3786 ibd_lsobuf_t *lbufp;
3787 3787 uint8_t *lso_mem_end;
3788 3788 uint_t ndx;
3789 3789 int i;
3790 3790
3791 3791 mutex_enter(&state->id_lso_lock);
3792 3792
3793 3793 bktp = state->id_lso;
3794 3794 ASSERT(bktp != NULL);
3795 3795
3796 3796 lso_mem_end = bktp->bkt_mem + bktp->bkt_nelem * IBD_LSO_BUFSZ;
3797 3797 for (i = 0; i < nds; i++) {
3798 3798 uint8_t *va;
3799 3799
3800 3800 va = (uint8_t *)(uintptr_t)sgl_p[i].ds_va;
3801 3801 ASSERT(va >= bktp->bkt_mem && va < lso_mem_end);
3802 3802
3803 3803 /*
3804 3804 * Figure out the buflist element this sgl buffer corresponds
3805 3805 * to and put it back at the head
3806 3806 */
3807 3807 ndx = (va - bktp->bkt_mem) / IBD_LSO_BUFSZ;
3808 3808 lbufp = bktp->bkt_bufl + ndx;
3809 3809
3810 3810 ASSERT(lbufp->lb_isfree == 0);
3811 3811 ASSERT(lbufp->lb_buf == va);
3812 3812
3813 3813 lbufp->lb_isfree = 1;
3814 3814 lbufp->lb_next = bktp->bkt_free_head;
3815 3815 bktp->bkt_free_head = lbufp;
3816 3816 }
3817 3817 bktp->bkt_nfree += nds;
3818 3818
3819 3819 mutex_exit(&state->id_lso_lock);
3820 3820 }
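/*
 * Editorial note, not part of ibd.c: because the buflist entries and the
 * LSO buffers come from two parallel arrays with a fixed 1-1 mapping (see
 * ibd_alloc_tx_lsobufs()), ibd_release_lsobufs() above can recover the
 * owning buflist element from a buffer address by plain index arithmetic.
 * The type and helper below are local to this note.
 */
#include <sys/types.h>

typedef struct example_lsobuf {
	int	lb_isfree;		/* stand-in for ibd_lsobuf_t */
} example_lsobuf_t;

static example_lsobuf_t *
example_buf_to_entry(example_lsobuf_t *bufl, uint8_t *mem_base, uint8_t *va,
    uint_t bufsz)
{
	/* equivalent to: ndx = (va - bkt_mem) / IBD_LSO_BUFSZ; bkt_bufl + ndx */
	return (bufl + (uint_t)(va - mem_base) / bufsz);
}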
3821 3821
3822 3822 static void
3823 3823 ibd_free_tx_copybufs(ibd_state_t *state)
3824 3824 {
3825 3825 /*
3826 3826 * Unregister txbuf mr
3827 3827 */
3828 3828 if (ibt_deregister_mr(state->id_hca_hdl,
3829 3829 state->id_tx_mr_hdl) != IBT_SUCCESS) {
3830 3830 DPRINT(10, "ibd_free_tx_copybufs: ibt_deregister_mr failed");
3831 3831 }
3832 3832 state->id_tx_mr_hdl = NULL;
3833 3833
3834 3834 /*
3835 3835 * Free txbuf memory
3836 3836 */
3837 3837 kmem_free(state->id_tx_wqes, state->id_ud_num_swqe *
3838 3838 sizeof (ibd_swqe_t));
3839 3839 kmem_free(state->id_tx_bufs, state->id_ud_num_swqe *
3840 3840 state->id_tx_buf_sz);
3841 3841 state->id_tx_wqes = NULL;
3842 3842 state->id_tx_bufs = NULL;
3843 3843 }
3844 3844
3845 3845 static void
3846 3846 ibd_free_tx_lsobufs(ibd_state_t *state)
3847 3847 {
3848 3848 ibd_lsobkt_t *bktp;
3849 3849
3850 3850 mutex_enter(&state->id_lso_lock);
3851 3851
3852 3852 if ((bktp = state->id_lso) == NULL) {
3853 3853 mutex_exit(&state->id_lso_lock);
3854 3854 return;
3855 3855 }
3856 3856
3857 3857 /*
3858 3858 * First, free the buflist
3859 3859 */
3860 3860 ASSERT(bktp->bkt_bufl != NULL);
3861 3861 kmem_free(bktp->bkt_bufl, bktp->bkt_nelem * sizeof (ibd_lsobuf_t));
3862 3862
3863 3863 /*
3864 3864 * Unregister the LSO memory and free it
3865 3865 */
3866 3866 ASSERT(bktp->bkt_mr_hdl != NULL);
3867 3867 if (ibt_deregister_mr(state->id_hca_hdl,
3868 3868 bktp->bkt_mr_hdl) != IBT_SUCCESS) {
3869 3869 DPRINT(10,
3870 3870 "ibd_free_lsobufs: ibt_deregister_mr failed");
3871 3871 }
3872 3872 ASSERT(bktp->bkt_mem);
3873 3873 kmem_free(bktp->bkt_mem, bktp->bkt_nelem * IBD_LSO_BUFSZ);
3874 3874
3875 3875 /*
3876 3876 * Finally free the bucket
3877 3877 */
3878 3878 kmem_free(bktp, sizeof (ibd_lsobkt_t));
3879 3879 state->id_lso = NULL;
3880 3880
3881 3881 mutex_exit(&state->id_lso_lock);
3882 3882 }
3883 3883
3884 3884 /*
3885 3885 * Free the statically allocated Tx buffer list.
3886 3886 */
3887 3887 static void
3888 3888 ibd_fini_txlist(ibd_state_t *state)
3889 3889 {
3890 3890 /*
3891 3891 * Free the allocated swqes
3892 3892 */
3893 3893 mutex_enter(&state->id_tx_list.dl_mutex);
3894 3894 mutex_enter(&state->id_tx_rel_list.dl_mutex);
3895 3895 state->id_tx_list.dl_head = NULL;
3896 3896 state->id_tx_list.dl_pending_sends = B_FALSE;
3897 3897 state->id_tx_list.dl_cnt = 0;
3898 3898 state->id_tx_rel_list.dl_head = NULL;
3899 3899 state->id_tx_rel_list.dl_pending_sends = B_FALSE;
3900 3900 state->id_tx_rel_list.dl_cnt = 0;
3901 3901 mutex_exit(&state->id_tx_rel_list.dl_mutex);
3902 3902 mutex_exit(&state->id_tx_list.dl_mutex);
3903 3903
3904 3904 ibd_free_tx_lsobufs(state);
3905 3905 ibd_free_tx_copybufs(state);
3906 3906 }
3907 3907
3908 3908 /*
3909 3909   * Post a list of rwqes, NULL terminated.
3910 3910 */
3911 3911 static void
3912 3912 ibd_post_recv_list(ibd_state_t *state, ibd_rwqe_t *rwqe)
3913 3913 {
3914 3914 uint_t i;
3915 3915 uint_t num_posted;
3916 3916 ibt_status_t ibt_status;
3917 3917 ibt_recv_wr_t wrs[IBD_RX_POST_CNT];
3918 3918
3919 3919 while (rwqe) {
3920 3920 /* Post up to IBD_RX_POST_CNT receive work requests */
3921 3921 for (i = 0; i < IBD_RX_POST_CNT; i++) {
3922 3922 wrs[i] = rwqe->w_rwr;
3923 3923 rwqe = WQE_TO_RWQE(rwqe->rwqe_next);
3924 3924 if (rwqe == NULL) {
3925 3925 i++;
3926 3926 break;
3927 3927 }
3928 3928 }
3929 3929
3930 3930 /*
3931 3931 * If posting fails for some reason, we'll never receive
3932 3932  		 * completion notification, so we'll need to clean up. But
3933 3933 * we need to make sure we don't clean up nodes whose
3934 3934 * wrs have been successfully posted. We assume that the
3935 3935 * hca driver returns on the first failure to post and
3936 3936 * therefore the first 'num_posted' entries don't need
3937 3937 * cleanup here.
3938 3938 */
3939 3939 atomic_add_32(&state->id_rx_list.dl_cnt, i);
3940 3940
3941 3941 num_posted = 0;
3942 3942 ibt_status = ibt_post_recv(state->id_chnl_hdl, wrs, i,
3943 3943 &num_posted);
3944 3944 if (ibt_status != IBT_SUCCESS) {
3945 3945 /* This cannot happen unless the device has an error. */
3946 3946 ibd_print_warn(state, "ibd_post_recv: FATAL: "
3947 3947 "posting multiple wrs failed: "
3948 3948 "requested=%d, done=%d, ret=%d",
3949 3949 IBD_RX_POST_CNT, num_posted, ibt_status);
3950 3950 atomic_add_32(&state->id_rx_list.dl_cnt,
3951 3951 num_posted - i);
3952 3952 }
3953 3953 }
3954 3954 }
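/*
 * Editorial note, not part of ibd.c: the counter handling above credits
 * dl_cnt with 'i' before posting and, on a partial failure, adds back
 * (num_posted - i). For example, with i = 16 pre-credited and only
 * num_posted = 10 accepted by the HCA, the adjustment is 10 - 16 = -6,
 * leaving dl_cnt raised by exactly the 10 buffers the hardware now owns.
 */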
3955 3955
3956 3956 /*
3957 3957 * Grab a list of rwqes from the array of lists, and post the list.
3958 3958 */
3959 3959 static void
3960 3960 ibd_post_recv_intr(ibd_state_t *state)
3961 3961 {
3962 3962 ibd_rx_queue_t *rxp;
3963 3963 ibd_rwqe_t *list;
3964 3964
3965 3965 /* rotate through the rx_queue array, expecting an adequate number */
3966 3966 state->id_rx_post_queue_index =
3967 3967 (state->id_rx_post_queue_index + 1) &
3968 3968 (state->id_rx_nqueues - 1);
3969 3969
3970 3970 rxp = state->id_rx_queues + state->id_rx_post_queue_index;
3971 3971 mutex_enter(&rxp->rx_post_lock);
3972 3972 list = WQE_TO_RWQE(rxp->rx_head);
3973 3973 rxp->rx_head = NULL;
3974 3974 rxp->rx_cnt = 0;
3975 3975 mutex_exit(&rxp->rx_post_lock);
3976 3976 ibd_post_recv_list(state, list);
3977 3977 }
3978 3978
3979 3979 /* macro explained below */
3980 3980 #define RX_QUEUE_HASH(rwqe) \
3981 3981 (((uintptr_t)(rwqe) >> 8) & (state->id_rx_nqueues - 1))
3982 3982
3983 3983 /*
3984 3984   * Add an rwqe to one of the Rx lists. If the list is large enough
3985 3985 * (exactly IBD_RX_POST_CNT), post the list to the hardware.
3986 3986 *
3987 3987 * Note: one of 2^N lists is chosen via a hash. This is done
3988 3988 * because using one list is contentious. If the first list is busy
3989 3989 * (mutex_tryenter fails), use a second list (just call mutex_enter).
3990 3990 *
3991 3991   * The number 8 in RX_QUEUE_HASH is an arbitrary choice that provides
3992 3992   * an even distribution of rwqes across the 2^N queues.
3993 3993 */
3994 3994 static void
3995 3995 ibd_post_recv(ibd_state_t *state, ibd_rwqe_t *rwqe)
3996 3996 {
3997 3997 ibd_rx_queue_t *rxp;
3998 3998
3999 3999 rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe);
4000 4000
4001 4001 if (!mutex_tryenter(&rxp->rx_post_lock)) {
4002 4002 /* Failed. Try a different queue ("ptr + 16" ensures that). */
4003 4003 rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe + 16);
4004 4004 mutex_enter(&rxp->rx_post_lock);
4005 4005 }
4006 4006 rwqe->rwqe_next = rxp->rx_head;
4007 4007 if (++rxp->rx_cnt >= IBD_RX_POST_CNT - 2) {
4008 4008 uint_t active = atomic_inc_32_nv(&state->id_rx_post_active);
4009 4009
4010 4010 /* only call ibt_post_recv() every Nth time through here */
4011 4011 if ((active & (state->id_rx_nqueues - 1)) == 0) {
4012 4012 rxp->rx_head = NULL;
4013 4013 rxp->rx_cnt = 0;
4014 4014 mutex_exit(&rxp->rx_post_lock);
4015 4015 ibd_post_recv_list(state, rwqe);
4016 4016 return;
4017 4017 }
4018 4018 }
4019 4019 rxp->rx_head = RWQE_TO_WQE(rwqe);
4020 4020 mutex_exit(&rxp->rx_post_lock);
4021 4021 }
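/*
 * Editorial note, not part of ibd.c: a sketch of the queue selection used
 * by RX_QUEUE_HASH above. With id_rx_nqueues a power of two (2^N), the
 * hash simply takes N bits of the rwqe address starting at bit 8; on a
 * mutex_tryenter() failure the driver re-hashes rwqe + 16 (an offset of
 * sixteen rwqe structures), which, per the comment in the code, is enough
 * to select a different queue. The helper below is local to this note.
 */
#include <sys/types.h>

static uint_t
example_rx_queue_hash(uintptr_t rwqe_addr, uint_t nqueues)
{
	/* nqueues must be a power of two for the mask to act as a hash */
	return ((rwqe_addr >> 8) & (nqueues - 1));
}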
4022 4022
4023 4023 static int
4024 4024 ibd_alloc_rx_copybufs(ibd_state_t *state)
4025 4025 {
4026 4026 ibt_mr_attr_t mem_attr;
4027 4027 int i;
4028 4028
4029 4029 /*
4030 4030 * Allocate one big chunk for all regular rx copy bufs
4031 4031 */
4032 4032 state->id_rx_buf_sz = state->id_mtu + IPOIB_GRH_SIZE;
4033 4033
4034 4034 state->id_rx_bufs = kmem_zalloc(state->id_ud_num_rwqe *
4035 4035 state->id_rx_buf_sz, KM_SLEEP);
4036 4036
4037 4037 state->id_rx_wqes = kmem_zalloc(state->id_ud_num_rwqe *
4038 4038 sizeof (ibd_rwqe_t), KM_SLEEP);
4039 4039
4040 4040 state->id_rx_nqueues = 1 << IBD_LOG_RX_POST;
4041 4041 state->id_rx_queues = kmem_zalloc(state->id_rx_nqueues *
4042 4042 sizeof (ibd_rx_queue_t), KM_SLEEP);
4043 4043 for (i = 0; i < state->id_rx_nqueues; i++) {
4044 4044 ibd_rx_queue_t *rxp = state->id_rx_queues + i;
4045 4045 mutex_init(&rxp->rx_post_lock, NULL, MUTEX_DRIVER, NULL);
4046 4046 }
4047 4047
4048 4048 /*
4049 4049 * Do one memory registration on the entire rxbuf area
4050 4050 */
4051 4051 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_rx_bufs;
4052 4052 mem_attr.mr_len = state->id_ud_num_rwqe * state->id_rx_buf_sz;
4053 4053 mem_attr.mr_as = NULL;
4054 4054 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
4055 4055 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
4056 4056 &state->id_rx_mr_hdl, &state->id_rx_mr_desc) != IBT_SUCCESS) {
4057 4057 DPRINT(10, "ibd_alloc_rx_copybufs: ibt_register_mr failed");
4058 4058 kmem_free(state->id_rx_wqes,
4059 4059 state->id_ud_num_rwqe * sizeof (ibd_rwqe_t));
4060 4060 kmem_free(state->id_rx_bufs,
4061 4061 state->id_ud_num_rwqe * state->id_rx_buf_sz);
4062 4062 state->id_rx_bufs = NULL;
4063 4063 state->id_rx_wqes = NULL;
4064 4064 return (DDI_FAILURE);
4065 4065 }
4066 4066
4067 4067 return (DDI_SUCCESS);
4068 4068 }
4069 4069
4070 4070 /*
4071 4071 * Allocate the statically allocated Rx buffer list.
4072 4072 */
4073 4073 static int
4074 4074 ibd_init_rxlist(ibd_state_t *state)
4075 4075 {
4076 4076 ibd_rwqe_t *rwqe, *next;
4077 4077 ibd_wqe_t *list;
4078 4078 ibt_lkey_t lkey;
4079 4079 int i;
4080 4080 uint_t len;
4081 4081 uint8_t *bufaddr;
4082 4082
4083 4083 mutex_enter(&state->id_rx_free_list.dl_mutex);
4084 4084 if (state->id_rx_free_list.dl_head != NULL) {
4085 4085 /* rx rsrcs were never freed. Just repost them */
4086 4086 len = state->id_rx_buf_sz;
4087 4087 list = state->id_rx_free_list.dl_head;
4088 4088 state->id_rx_free_list.dl_head = NULL;
4089 4089 state->id_rx_free_list.dl_cnt = 0;
4090 4090 mutex_exit(&state->id_rx_free_list.dl_mutex);
4091 4091 for (rwqe = WQE_TO_RWQE(list); rwqe != NULL;
4092 4092 rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) {
4093 4093 if ((rwqe->rwqe_im_mblk = desballoc(
4094 4094 rwqe->rwqe_copybuf.ic_bufaddr, len, 0,
4095 4095 &rwqe->w_freemsg_cb)) == NULL) {
4096 4096 /* allow freemsg_cb to free the rwqes */
4097 4097 if (atomic_dec_32_nv(&state->id_running) != 0) {
4098 4098 cmn_err(CE_WARN, "ibd_init_rxlist: "
4099 4099 "id_running was not 1\n");
4100 4100 }
4101 4101 DPRINT(10, "ibd_init_rxlist : "
4102 4102 "failed in desballoc()");
4103 4103 for (rwqe = WQE_TO_RWQE(list); rwqe != NULL;
4104 4104 rwqe = next) {
4105 4105 next = WQE_TO_RWQE(rwqe->rwqe_next);
4106 4106 if (rwqe->rwqe_im_mblk) {
4107 4107 atomic_inc_32(&state->
4108 4108 id_rx_list.
4109 4109 dl_bufs_outstanding);
4110 4110 freemsg(rwqe->rwqe_im_mblk);
4111 4111 } else
4112 4112 ibd_free_rwqe(state, rwqe);
4113 4113 }
4114 4114 atomic_inc_32(&state->id_running);
4115 4115 return (DDI_FAILURE);
4116 4116 }
4117 4117 }
4118 4118 ibd_post_recv_list(state, WQE_TO_RWQE(list));
4119 4119 return (DDI_SUCCESS);
4120 4120 }
4121 4121 mutex_exit(&state->id_rx_free_list.dl_mutex);
4122 4122
4123 4123 if (ibd_alloc_rx_copybufs(state) != DDI_SUCCESS)
4124 4124 return (DDI_FAILURE);
4125 4125
4126 4126 /*
4127 4127 * Allocate and setup the rwqe list
4128 4128 */
4129 4129 len = state->id_rx_buf_sz;
4130 4130 lkey = state->id_rx_mr_desc.md_lkey;
4131 4131 rwqe = state->id_rx_wqes;
4132 4132 bufaddr = state->id_rx_bufs;
4133 4133 list = NULL;
4134 4134 for (i = 0; i < state->id_ud_num_rwqe; i++, rwqe++, bufaddr += len) {
4135 4135 rwqe->w_state = state;
4136 4136 rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb;
4137 4137 rwqe->w_freemsg_cb.free_arg = (char *)rwqe;
4138 4138
4139 4139 rwqe->rwqe_copybuf.ic_bufaddr = bufaddr;
4140 4140
4141 4141 if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0,
4142 4142 &rwqe->w_freemsg_cb)) == NULL) {
4143 4143 DPRINT(10, "ibd_init_rxlist : failed in desballoc()");
4144 4144 /* allow freemsg_cb to free the rwqes */
4145 4145 if (atomic_dec_32_nv(&state->id_running) != 0) {
4146 4146 cmn_err(CE_WARN, "ibd_init_rxlist: "
4147 4147 "id_running was not 1\n");
4148 4148 }
4149 4149 DPRINT(10, "ibd_init_rxlist : "
4150 4150 "failed in desballoc()");
4151 4151 for (rwqe = WQE_TO_RWQE(list); rwqe != NULL;
4152 4152 rwqe = next) {
4153 4153 next = WQE_TO_RWQE(rwqe->rwqe_next);
4154 4154 freemsg(rwqe->rwqe_im_mblk);
4155 4155 }
4156 4156 atomic_inc_32(&state->id_running);
4157 4157
4158 4158 /* remove reference to free'd rwqes */
4159 4159 mutex_enter(&state->id_rx_free_list.dl_mutex);
4160 4160 state->id_rx_free_list.dl_head = NULL;
4161 4161 state->id_rx_free_list.dl_cnt = 0;
4162 4162 mutex_exit(&state->id_rx_free_list.dl_mutex);
4163 4163
4164 4164 ibd_fini_rxlist(state);
4165 4165 return (DDI_FAILURE);
4166 4166 }
4167 4167
4168 4168 rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey;
4169 4169 rwqe->rwqe_copybuf.ic_sgl.ds_va =
4170 4170 (ib_vaddr_t)(uintptr_t)bufaddr;
4171 4171 rwqe->rwqe_copybuf.ic_sgl.ds_len = len;
4172 4172 rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
4173 4173 rwqe->w_rwr.wr_nds = 1;
4174 4174 rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;
4175 4175
4176 4176 rwqe->rwqe_next = list;
4177 4177 list = RWQE_TO_WQE(rwqe);
4178 4178 }
4179 4179 ibd_post_recv_list(state, WQE_TO_RWQE(list));
4180 4180
4181 4181 return (DDI_SUCCESS);
4182 4182 }
4183 4183
4184 4184 static void
4185 4185 ibd_free_rx_copybufs(ibd_state_t *state)
4186 4186 {
4187 4187 int i;
4188 4188
4189 4189 /*
4190 4190 * Unregister rxbuf mr
4191 4191 */
4192 4192 if (ibt_deregister_mr(state->id_hca_hdl,
4193 4193 state->id_rx_mr_hdl) != IBT_SUCCESS) {
4194 4194 DPRINT(10, "ibd_free_rx_copybufs: ibt_deregister_mr failed");
4195 4195 }
4196 4196 state->id_rx_mr_hdl = NULL;
4197 4197
4198 4198 /*
4199 4199 * Free rxbuf memory
4200 4200 */
4201 4201 for (i = 0; i < state->id_rx_nqueues; i++) {
4202 4202 ibd_rx_queue_t *rxp = state->id_rx_queues + i;
4203 4203 mutex_destroy(&rxp->rx_post_lock);
4204 4204 }
4205 4205 kmem_free(state->id_rx_queues, state->id_rx_nqueues *
4206 4206 sizeof (ibd_rx_queue_t));
4207 4207 kmem_free(state->id_rx_wqes, state->id_ud_num_rwqe *
4208 4208 sizeof (ibd_rwqe_t));
4209 4209 kmem_free(state->id_rx_bufs, state->id_ud_num_rwqe *
4210 4210 state->id_rx_buf_sz);
4211 4211 state->id_rx_queues = NULL;
4212 4212 state->id_rx_wqes = NULL;
4213 4213 state->id_rx_bufs = NULL;
4214 4214 }
4215 4215
4216 4216 static void
4217 4217 ibd_free_rx_rsrcs(ibd_state_t *state)
4218 4218 {
4219 4219 mutex_enter(&state->id_rx_free_list.dl_mutex);
4220 4220 if (state->id_rx_free_list.dl_head == NULL) {
4221 4221 /* already freed */
4222 4222 mutex_exit(&state->id_rx_free_list.dl_mutex);
4223 4223 return;
4224 4224 }
4225 4225 ASSERT(state->id_rx_free_list.dl_cnt == state->id_ud_num_rwqe);
4226 4226 ibd_free_rx_copybufs(state);
4227 4227 state->id_rx_free_list.dl_cnt = 0;
4228 4228 state->id_rx_free_list.dl_head = NULL;
4229 4229 mutex_exit(&state->id_rx_free_list.dl_mutex);
4230 4230 }
4231 4231
4232 4232 /*
4233 4233 * Free the statically allocated Rx buffer list.
4234 4234 */
4235 4235 static void
4236 4236 ibd_fini_rxlist(ibd_state_t *state)
4237 4237 {
4238 4238 ibd_rwqe_t *rwqe;
4239 4239 int i;
4240 4240
4241 4241  	/* run through the rx_queues, calling freemsg() */
4242 4242 for (i = 0; i < state->id_rx_nqueues; i++) {
4243 4243 ibd_rx_queue_t *rxp = state->id_rx_queues + i;
4244 4244 mutex_enter(&rxp->rx_post_lock);
4245 4245 for (rwqe = WQE_TO_RWQE(rxp->rx_head); rwqe;
4246 4246 rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) {
4247 4247 freemsg(rwqe->rwqe_im_mblk);
4248 4248 rxp->rx_cnt--;
4249 4249 }
4250 4250 rxp->rx_head = NULL;
4251 4251 mutex_exit(&rxp->rx_post_lock);
4252 4252 }
4253 4253
4254 4254 /* cannot free rx resources unless gld returned everything */
4255 4255 if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) == 0)
4256 4256 ibd_free_rx_rsrcs(state);
4257 4257 }
4258 4258
4259 4259 /*
4260 4260 * Free an allocated recv wqe.
4261 4261 */
4262 4262 /* ARGSUSED */
4263 4263 static void
4264 4264 ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
4265 4265 {
4266 4266 /*
4267 4267 * desballoc() failed (no memory).
4268 4268 *
4269 4269 * This rwqe is placed on a free list so that it
4270 4270 * can be reinstated when memory is available.
4271 4271 *
4272 4272 * NOTE: no code currently exists to reinstate
4273 4273 * these "lost" rwqes.
4274 4274 */
4275 4275 mutex_enter(&state->id_rx_free_list.dl_mutex);
4276 4276 state->id_rx_free_list.dl_cnt++;
4277 4277 rwqe->rwqe_next = state->id_rx_free_list.dl_head;
4278 4278 state->id_rx_free_list.dl_head = RWQE_TO_WQE(rwqe);
4279 4279 mutex_exit(&state->id_rx_free_list.dl_mutex);
4280 4280 }
4281 4281
4282 4282 /*
4283 4283 * IBA Rx completion queue handler. Guaranteed to be single
4284 4284 * threaded and nonreentrant for this CQ.
4285 4285 */
4286 4286 /* ARGSUSED */
4287 4287 static void
4288 4288 ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
4289 4289 {
4290 4290 ibd_state_t *state = (ibd_state_t *)arg;
4291 4291
4292 4292 atomic_inc_64(&state->id_num_intrs);
4293 4293
4294 4294 if (ibd_rx_softintr == 1) {
4295 4295 mutex_enter(&state->id_rcq_poll_lock);
4296 4296 if (state->id_rcq_poll_busy & IBD_CQ_POLLING) {
4297 4297 state->id_rcq_poll_busy |= IBD_REDO_CQ_POLLING;
4298 4298 mutex_exit(&state->id_rcq_poll_lock);
4299 4299 return;
4300 4300 } else {
4301 4301 mutex_exit(&state->id_rcq_poll_lock);
4302 4302 ddi_trigger_softintr(state->id_rx);
4303 4303 }
4304 4304 } else
4305 4305 (void) ibd_intr((caddr_t)state);
4306 4306 }
4307 4307
4308 4308 /*
4309 4309 * CQ handler for Tx completions, when the Tx CQ is in
4310 4310 * interrupt driven mode.
4311 4311 */
4312 4312 /* ARGSUSED */
4313 4313 static void
4314 4314 ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
4315 4315 {
4316 4316 ibd_state_t *state = (ibd_state_t *)arg;
4317 4317
4318 4318 atomic_inc_64(&state->id_num_intrs);
4319 4319
4320 4320 if (ibd_tx_softintr == 1) {
4321 4321 mutex_enter(&state->id_scq_poll_lock);
4322 4322 if (state->id_scq_poll_busy & IBD_CQ_POLLING) {
4323 4323 state->id_scq_poll_busy |= IBD_REDO_CQ_POLLING;
4324 4324 mutex_exit(&state->id_scq_poll_lock);
4325 4325 return;
4326 4326 } else {
4327 4327 mutex_exit(&state->id_scq_poll_lock);
4328 4328 ddi_trigger_softintr(state->id_tx);
4329 4329 }
4330 4330 } else
4331 4331 (void) ibd_tx_recycle((caddr_t)state);
4332 4332 }
4333 4333
4334 4334 /*
4335 4335 * Multicast group create/delete trap handler. These will be delivered
4336 4336 * on a kernel thread (handling can thus block) and can be invoked
4337 4337 * concurrently. The handler can be invoked anytime after it is
4338 4338 * registered and before ibt_detach().
4339 4339 */
4340 4340 /* ARGSUSED */
4341 4341 static void
4342 4342 ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code,
4343 4343 ibt_subnet_event_t *event)
4344 4344 {
4345 4345 ibd_state_t *state = (ibd_state_t *)arg;
4346 4346 ibd_req_t *req;
4347 4347
4348 4348 /*
4349 4349 * The trap handler will get invoked once for every event for
4350 4350 * every port. The input "gid" is the GID0 of the port the
4351 4351 * trap came in on; we just need to act on traps that came
4352 4352 * to our port, meaning the port on which the ipoib interface
4353 4353 * resides. Since ipoib uses GID0 of the port, we just match
4354 4354 * the gids to check whether we need to handle the trap.
4355 4355 */
4356 4356 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid))
4357 4357 if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0)
4358 4358 return;
4359 4359 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid))
4360 4360
4361 4361 DPRINT(10, "ibd_notices_handler : %d\n", code);
4362 4362
4363 4363 switch (code) {
4364 4364 case IBT_SM_EVENT_UNAVAILABLE:
4365 4365 /*
4366 4366 * If we are in promiscuous mode or have
4367 4367 * sendnonmembers, we need to print a warning
4368 4368 * message right now. Else, just store the
4369 4369 * information, print when we enter promiscuous
4370 4370 * mode or attempt nonmember send. We might
4371 4371 * also want to stop caching sendnonmember.
4372 4372 */
4373 4373 ibd_print_warn(state, "IBA multicast support "
4374 4374 "degraded due to unavailability of multicast "
4375 4375 "traps");
4376 4376 break;
4377 4377 case IBT_SM_EVENT_AVAILABLE:
4378 4378 /*
4379 4379 * If we printed a warning message above or
4380 4380 * while trying to nonmember send or get into
4381 4381 * promiscuous mode, print an okay message.
4382 4382 */
4383 4383 ibd_print_warn(state, "IBA multicast support "
4384 4384 "restored due to availability of multicast "
4385 4385 "traps");
4386 4386 break;
4387 4387 case IBT_SM_EVENT_MCG_CREATED:
4388 4388 case IBT_SM_EVENT_MCG_DELETED:
4389 4389 /*
4390 4390 * If it is a "deleted" event and we are in late hca
4391 4391 * init, nothing to do.
4392 4392 */
4393 4393 if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) ==
4394 4394 IBD_DRV_IN_LATE_HCA_INIT) && (code ==
4395 4395 IBT_SM_EVENT_MCG_DELETED)) {
4396 4396 break;
4397 4397 }
4398 4398 /*
4399 4399 * Common processing of creation/deletion traps.
4400 4400 * First check if the instance is being
4401 4401 * [de]initialized; back off then, without doing
4402 4402 * anything more, since we are not sure if the
4403 4403 * async thread is around, or whether we might
4404 4404 * be racing with the detach code in ibd_m_stop()
4405 4405 * that scans the mcg list.
4406 4406 */
4407 4407 if (!ibd_async_safe(state))
4408 4408 return;
4409 4409
4410 4410 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
4411 4411 req->rq_gid = event->sm_notice_gid;
4412 4412 req->rq_ptr = (void *)code;
4413 4413 ibd_queue_work_slot(state, req, IBD_ASYNC_TRAP);
4414 4414 break;
4415 4415 }
4416 4416 }
4417 4417
4418 4418 static void
4419 4419 ibd_async_trap(ibd_state_t *state, ibd_req_t *req)
4420 4420 {
4421 4421 ib_gid_t mgid = req->rq_gid;
4422 4422 ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr;
4423 4423 int ret;
4424 4424 ib_pkey_t pkey = (mgid.gid_prefix >> 16) & 0xffff;
4425 4425
4426 4426 DPRINT(10, "ibd_async_trap : %d\n", code);
4427 4427
4428 4428 /*
4429 4429 * Check if we have already joined the IPoIB broadcast group for our
4430 4430 * PKEY. If joined, perform the rest of the operation.
4431 4431   * Else, the interface is not initialized. Do the initialization here
4432 4432 * by calling ibd_start() and return.
4433 4433 */
4434 4434
4435 4435 if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) ==
4436 4436 IBD_DRV_IN_LATE_HCA_INIT) && (state->id_bgroup_present == 0) &&
4437 4437 (code == IBT_SM_EVENT_MCG_CREATED)) {
4438 4438 /*
4439 4439 * If we are in late HCA init and a notification for the
4440 4440 * creation of a MCG came in, check if it is the IPoIB MCG for
4441 4441 * this pkey. If not, return.
4442 4442 */
4443 4443 if ((mgid.gid_guid != IB_MGID_IPV4_LOWGRP_MASK) || (pkey !=
4444 4444 state->id_pkey)) {
4445 4445 ibd_async_done(state);
4446 4446 return;
4447 4447 }
4448 4448 ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
4449 4449 /*
4450 4450 * Check if there is still a necessity to start the interface.
4451 4451 * It is possible that the user attempted unplumb at just about
4452 4452 * the same time, and if unplumb succeeded, we have nothing to
4453 4453 * do.
4454 4454 */
4455 4455 if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) ==
4456 4456 IBD_DRV_IN_LATE_HCA_INIT) &&
4457 4457 ((ret = ibd_start(state)) != 0)) {
4458 4458 DPRINT(10, "ibd_async_trap: cannot start from late HCA "
4459 4459 "init, ret=%d", ret);
4460 4460 }
4461 4461 ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
4462 4462 ibd_async_done(state);
4463 4463 return;
4464 4464 }
4465 4465
4466 4466 /*
4467 4467 * Atomically search the nonmember and sendonlymember lists and
4468 4468 * delete.
4469 4469 */
4470 4470 ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON);
4471 4471
4472 4472 if (state->id_prom_op == IBD_OP_COMPLETED) {
4473 4473 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
4474 4474
4475 4475 /*
4476 4476 * If in promiscuous mode, try to join/attach to the new
4477 4477 * mcg. Given the unreliable out-of-order mode of trap
4478 4478 * delivery, we can never be sure whether it is a problem
4479 4479 * if the join fails. Thus, we warn the admin of a failure
4480 4480 * if this was a creation trap. Note that the trap might
4481 4481 * actually be reporting a long past event, and the mcg
4482 4482 * might already have been deleted, thus we might be warning
4483 4483 * in vain.
4484 4484 */
4485 4485 if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) ==
4486 4486 NULL) && (code == IBT_SM_EVENT_MCG_CREATED))
4487 4487 ibd_print_warn(state, "IBA promiscuous mode missed "
4488 4488 "new multicast gid %016llx:%016llx",
4489 4489 (u_longlong_t)mgid.gid_prefix,
4490 4490 (u_longlong_t)mgid.gid_guid);
4491 4491 }
4492 4492
4493 4493 /*
4494 4494 * Free the request slot allocated by the subnet event thread.
4495 4495 */
4496 4496 ibd_async_done(state);
4497 4497 }
4498 4498
4499 4499 /*
4500 4500 * GLDv3 entry point to get capabilities.
4501 4501 */
4502 4502 static boolean_t
4503 4503 ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
4504 4504 {
4505 4505 ibd_state_t *state = arg;
4506 4506
4507 4507 if (state->id_type == IBD_PORT_DRIVER)
4508 4508 return (B_FALSE);
4509 4509
4510 4510 switch (cap) {
4511 4511 case MAC_CAPAB_HCKSUM: {
4512 4512 uint32_t *txflags = cap_data;
4513 4513
4514 4514 /*
4515 4515  		 * We either do full checksum or don't do it at all
4516 4516 */
4517 4517 if (state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL)
4518 4518 *txflags = HCK_FULLCKSUM | HCKSUM_INET_FULL_V4;
4519 4519 else
4520 4520 return (B_FALSE);
4521 4521 break;
4522 4522 }
4523 4523
4524 4524 case MAC_CAPAB_LSO: {
4525 4525 mac_capab_lso_t *cap_lso = cap_data;
4526 4526
4527 4527 /*
4528 4528 * In addition to the capability and policy, since LSO
4529 4529 * relies on hw checksum, we'll not enable LSO if we
4530 4530 * don't have hw checksum. Of course, if the HCA doesn't
4531 4531 * provide the reserved lkey capability, enabling LSO will
4532 4532 * actually affect performance adversely, so we'll disable
4533 4533 * LSO even for that case.
4534 4534 */
4535 4535 if (!state->id_lso_policy || !state->id_lso_capable)
4536 4536 return (B_FALSE);
4537 4537
4538 4538 if ((state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) == 0)
4539 4539 return (B_FALSE);
4540 4540
4541 4541 if (state->id_hca_res_lkey_capab == 0) {
4542 4542 ibd_print_warn(state, "no reserved-lkey capability, "
4543 4543 "disabling LSO");
4544 4544 return (B_FALSE);
4545 4545 }
4546 4546
4547 4547 cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
4548 4548 cap_lso->lso_basic_tcp_ipv4.lso_max = state->id_lso_maxlen - 1;
4549 4549 break;
4550 4550 }
4551 4551
4552 4552 default:
4553 4553 return (B_FALSE);
4554 4554 }
4555 4555
4556 4556 return (B_TRUE);
4557 4557 }
4558 4558
4559 4559 /*
4560 4560 * callback function for set/get of properties
4561 4561 */
4562 4562 static int
4563 4563 ibd_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
4564 4564 uint_t pr_valsize, const void *pr_val)
4565 4565 {
4566 4566 ibd_state_t *state = arg;
4567 4567 int err = 0;
4568 4568 uint32_t link_mode;
4569 4569
4570 4570 /* Cannot set properties on a port driver */
4571 4571 if (state->id_type == IBD_PORT_DRIVER) {
4572 4572 return (ENOTSUP);
4573 4573 }
4574 4574
4575 4575 switch (pr_num) {
4576 4576 case MAC_PROP_IB_LINKMODE:
4577 4577 if (state->id_mac_state & IBD_DRV_STARTED) {
4578 4578 err = EBUSY;
4579 4579 break;
4580 4580 }
4581 4581 if (pr_val == NULL) {
4582 4582 err = EINVAL;
4583 4583 break;
4584 4584 }
4585 4585 bcopy(pr_val, &link_mode, sizeof (link_mode));
4586 4586 if (link_mode != IBD_LINK_MODE_UD &&
4587 4587 link_mode != IBD_LINK_MODE_RC) {
4588 4588 err = EINVAL;
4589 4589 } else {
4590 4590 if (link_mode == IBD_LINK_MODE_RC) {
4591 4591 if (state->id_enable_rc) {
4592 4592 return (0);
4593 4593 }
4594 4594 state->id_enable_rc = 1;
4595 4595 /* inform MAC framework of new MTU */
4596 4596 err = mac_maxsdu_update2(state->id_mh,
4597 4597 state->rc_mtu - IPOIB_HDRSIZE,
4598 4598 state->id_mtu - IPOIB_HDRSIZE);
4599 4599 } else {
4600 4600 if (!state->id_enable_rc) {
4601 4601 return (0);
4602 4602 }
4603 4603 state->id_enable_rc = 0;
4604 4604 err = mac_maxsdu_update2(state->id_mh,
4605 4605 state->id_mtu - IPOIB_HDRSIZE,
4606 4606 state->id_mtu - IPOIB_HDRSIZE);
4607 4607 }
4608 4608 (void) ibd_record_capab(state);
4609 4609 mac_capab_update(state->id_mh);
4610 4610 }
4611 4611 break;
4612 4612 case MAC_PROP_PRIVATE:
4613 4613 err = ibd_set_priv_prop(state, pr_name,
4614 4614 pr_valsize, pr_val);
4615 4615 break;
4616 4616 default:
4617 4617 err = ENOTSUP;
4618 4618 break;
4619 4619 }
4620 4620 return (err);
4621 4621 }
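As a usage note, MAC_PROP_IB_LINKMODE above is what gets exercised when the link mode is changed administratively: the driver rejects the change while started (EBUSY) and otherwise updates the SDU and re-records its capabilities. Assuming the corresponding public dladm property is named "linkmode" with values "ud" and "cm" (that naming lives outside this file), and "ibd0" as a hypothetical partition link name, the flow would be roughly:

	# dladm set-linkprop -p linkmode=cm ibd0	(switch to RC/connected mode)
	# dladm set-linkprop -p linkmode=ud ibd0	(switch back to UD/datagram mode)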
4622 4622
4623 4623 static int
4624 4624 ibd_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
4625 4625 uint_t pr_valsize, void *pr_val)
4626 4626 {
4627 4627 ibd_state_t *state = arg;
4628 4628 int err = 0;
4629 4629
4630 4630 switch (pr_num) {
4631 4631 case MAC_PROP_MTU:
4632 4632 break;
4633 4633 default:
4634 4634 if (state->id_type == IBD_PORT_DRIVER) {
4635 4635 return (ENOTSUP);
4636 4636 }
4637 4637 break;
4638 4638 }
4639 4639
4640 4640 switch (pr_num) {
4641 4641 case MAC_PROP_IB_LINKMODE:
4642 4642 *(uint_t *)pr_val = state->id_enable_rc;
4643 4643 break;
4644 4644 case MAC_PROP_PRIVATE:
4645 4645 err = ibd_get_priv_prop(state, pr_name, pr_valsize,
4646 4646 pr_val);
4647 4647 break;
4648 4648 default:
4649 4649 err = ENOTSUP;
4650 4650 break;
4651 4651 }
4652 4652 return (err);
4653 4653 }
4654 4654
4655 4655 static void
4656 4656 ibd_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num,
4657 4657 mac_prop_info_handle_t prh)
4658 4658 {
4659 4659 ibd_state_t *state = arg;
4660 4660
4661 4661 switch (pr_num) {
4662 4662 case MAC_PROP_IB_LINKMODE: {
4663 4663 mac_prop_info_set_default_uint32(prh, IBD_DEF_LINK_MODE);
4664 4664 break;
4665 4665 }
4666 4666 case MAC_PROP_MTU: {
4667 4667 uint32_t min, max;
4668 4668 if (state->id_type == IBD_PORT_DRIVER) {
4669 4669 min = 1500;
4670 4670 max = IBD_DEF_RC_MAX_SDU;
4671 4671 } else if (state->id_enable_rc) {
4672 4672 min = max = IBD_DEF_RC_MAX_SDU;
4673 4673 } else {
4674 4674 min = max = state->id_mtu - IPOIB_HDRSIZE;
4675 4675 }
4676 4676 mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
4677 4677 mac_prop_info_set_range_uint32(prh, min, max);
4678 4678 break;
4679 4679 }
4680 4680 case MAC_PROP_PRIVATE: {
4681 4681 char valstr[64];
4682 4682 int value;
4683 4683
4684 4684 if (strcmp(pr_name, "_ibd_broadcast_group") == 0) {
4685 4685 mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
4686 4686 return;
4687 4687 } else if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) {
4688 4688 value = IBD_DEF_COALESCE_COMPLETIONS;
4689 4689 } else if (strcmp(pr_name,
4690 4690 "_ibd_create_broadcast_group") == 0) {
4691 4691 value = IBD_DEF_CREATE_BCAST_GROUP;
4692 4692 } else if (strcmp(pr_name, "_ibd_hash_size") == 0) {
4693 4693 value = IBD_DEF_HASH_SIZE;
4694 4694 } else if (strcmp(pr_name, "_ibd_lso_enable") == 0) {
4695 4695 value = IBD_DEF_LSO_POLICY;
4696 4696 } else if (strcmp(pr_name, "_ibd_num_ah") == 0) {
4697 4697 value = IBD_DEF_NUM_AH;
4698 4698 } else if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) {
4699 4699 value = IBD_DEF_NUM_LSO_BUFS;
4700 4700 } else if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) {
4701 4701 value = IBD_DEF_RC_ENABLE_SRQ;
4702 4702 } else if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) {
4703 4703 value = IBD_DEF_RC_NUM_RWQE;
4704 4704 } else if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) {
4705 4705 value = IBD_DEF_RC_NUM_SRQ;
4706 4706 } else if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) {
4707 4707 value = IBD_DEF_RC_NUM_SWQE;
4708 4708 } else if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) {
4709 4709 value = IBD_DEF_RC_RX_COMP_COUNT;
4710 4710 } else if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) {
4711 4711 value = IBD_DEF_RC_RX_COMP_USEC;
4712 4712 } else if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) {
4713 4713 value = IBD_DEF_RC_RX_COPY_THRESH;
4714 4714 } else if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) {
4715 4715 value = IBD_DEF_RC_RX_RWQE_THRESH;
4716 4716 } else if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) {
4717 4717 value = IBD_DEF_RC_TX_COMP_COUNT;
4718 4718 } else if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) {
4719 4719 value = IBD_DEF_RC_TX_COMP_USEC;
4720 4720 } else if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) {
4721 4721 value = IBD_DEF_RC_TX_COPY_THRESH;
4722 4722 } else if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) {
4723 4723 value = IBD_DEF_UD_NUM_RWQE;
4724 4724 } else if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) {
4725 4725 value = IBD_DEF_UD_NUM_SWQE;
4726 4726 } else if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) {
4727 4727 value = IBD_DEF_UD_RX_COMP_COUNT;
4728 4728 } else if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) {
4729 4729 value = IBD_DEF_UD_RX_COMP_USEC;
4730 4730 } else if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) {
4731 4731 value = IBD_DEF_UD_TX_COMP_COUNT;
4732 4732 } else if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) {
4733 4733 value = IBD_DEF_UD_TX_COMP_USEC;
4734 4734 } else if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) {
4735 4735 value = IBD_DEF_UD_TX_COPY_THRESH;
4736 4736 } else {
4737 4737 return;
4738 4738 }
4739 4739
4740 4740 (void) snprintf(valstr, sizeof (valstr), "%d", value);
4741 4741 mac_prop_info_set_default_str(prh, valstr);
4742 4742 break;
4743 4743 }
4744 4744 } /* switch (pr_num) */
4745 4745 }
4746 4746
4747 4747 /* ARGSUSED2 */
4748 4748 static int
4749 4749 ibd_set_priv_prop(ibd_state_t *state, const char *pr_name,
4750 4750 uint_t pr_valsize, const void *pr_val)
4751 4751 {
4752 4752 int err = 0;
4753 4753 long result;
4754 4754
4755 4755 if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) {
4756 4756 if (pr_val == NULL) {
4757 4757 return (EINVAL);
4758 4758 }
4759 4759 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4760 4760 if (result < 0 || result > 1) {
4761 4761 err = EINVAL;
4762 4762 } else {
4763 4763 state->id_allow_coalesce_comp_tuning = (result == 1) ?
4764 4764 B_TRUE: B_FALSE;
4765 4765 }
4766 4766 return (err);
4767 4767 }
4768 4768 if (strcmp(pr_name, "_ibd_create_broadcast_group") == 0) {
4769 4769 if (state->id_mac_state & IBD_DRV_STARTED) {
4770 4770 return (EBUSY);
4771 4771 }
4772 4772 if (pr_val == NULL) {
4773 4773 return (EINVAL);
4774 4774 }
4775 4775 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4776 4776 if (result < 0 || result > 1) {
4777 4777 err = EINVAL;
4778 4778 } else {
4779 4779 state->id_create_broadcast_group = (result == 1) ?
4780 4780 B_TRUE: B_FALSE;
4781 4781 }
4782 4782 return (err);
4783 4783 }
4784 4784 if (strcmp(pr_name, "_ibd_hash_size") == 0) {
4785 4785 if (state->id_mac_state & IBD_DRV_STARTED) {
4786 4786 return (EBUSY);
4787 4787 }
4788 4788 if (pr_val == NULL) {
4789 4789 return (EINVAL);
4790 4790 }
4791 4791 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4792 4792 if (result < IBD_MIN_HASH_SIZE || result > IBD_MAX_HASH_SIZE) {
4793 4793 err = EINVAL;
4794 4794 } else {
4795 4795 state->id_hash_size = (uint32_t)result;
4796 4796 }
4797 4797 return (err);
4798 4798 }
4799 4799 if (strcmp(pr_name, "_ibd_lso_enable") == 0) {
4800 4800 if (state->id_mac_state & IBD_DRV_STARTED) {
4801 4801 return (EBUSY);
4802 4802 }
4803 4803 if (pr_val == NULL) {
4804 4804 return (EINVAL);
4805 4805 }
4806 4806 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4807 4807 if (result < 0 || result > 1) {
4808 4808 err = EINVAL;
4809 4809 } else {
4810 4810 state->id_lso_policy = (result == 1) ?
4811 4811 B_TRUE: B_FALSE;
4812 4812 }
4813 4813 mac_capab_update(state->id_mh);
4814 4814 return (err);
4815 4815 }
4816 4816 if (strcmp(pr_name, "_ibd_num_ah") == 0) {
4817 4817 if (state->id_mac_state & IBD_DRV_STARTED) {
4818 4818 return (EBUSY);
4819 4819 }
4820 4820 if (pr_val == NULL) {
4821 4821 return (EINVAL);
4822 4822 }
4823 4823 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4824 4824 if (result < IBD_MIN_NUM_AH || result > IBD_MAX_NUM_AH) {
4825 4825 err = EINVAL;
4826 4826 } else {
4827 4827 state->id_num_ah = (uint32_t)result;
4828 4828 }
4829 4829 return (err);
4830 4830 }
4831 4831 if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) {
4832 4832 if (state->id_mac_state & IBD_DRV_STARTED) {
4833 4833 return (EBUSY);
4834 4834 }
4835 4835 if (!state->id_lso_policy || !state->id_lso_capable) {
4836 4836 return (EINVAL);
4837 4837 }
4838 4838 if (pr_val == NULL) {
4839 4839 return (EINVAL);
4840 4840 }
4841 4841 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4842 4842 if (result < IBD_MIN_NUM_LSO_BUFS ||
4843 4843 result > IBD_MAX_NUM_LSO_BUFS) {
4844 4844 err = EINVAL;
4845 4845 } else {
4846 4846 state->id_num_lso_bufs = (uint32_t)result;
4847 4847 }
4848 4848 return (err);
4849 4849 }
4850 4850 if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) {
4851 4851 if (state->id_mac_state & IBD_DRV_STARTED) {
4852 4852 return (EBUSY);
4853 4853 }
4854 4854 if (pr_val == NULL) {
4855 4855 return (EINVAL);
4856 4856 }
4857 4857 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4858 4858 if (result < 0 || result > 1) {
4859 4859 err = EINVAL;
4860 4860 } else {
4861 4861 state->rc_enable_srq = (result == 1) ?
4862 4862 B_TRUE: B_FALSE;
4863 4863 }
4864 4864 if (!state->rc_enable_srq) {
4865 4865 state->id_rc_num_srq = 0;
4866 4866 }
4867 4867 return (err);
4868 4868 }
4869 4869 if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) {
4870 4870 if (state->id_mac_state & IBD_DRV_STARTED) {
4871 4871 return (EBUSY);
4872 4872 }
4873 4873 if (pr_val == NULL) {
4874 4874 return (EINVAL);
4875 4875 }
4876 4876 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4877 4877 if (result < IBD_MIN_RC_NUM_RWQE ||
4878 4878 result > IBD_MAX_RC_NUM_RWQE) {
4879 4879 err = EINVAL;
4880 4880 } else {
4881 4881 state->id_rc_num_rwqe = (uint32_t)result;
4882 4882 if (state->id_allow_coalesce_comp_tuning &&
4883 4883 state->id_rc_rx_comp_count > state->id_rc_num_rwqe)
4884 4884 state->id_rc_rx_comp_count =
4885 4885 state->id_rc_num_rwqe;
4886 4886 if (state->id_rc_num_srq > state->id_rc_num_rwqe)
4887 4887 state->id_rc_num_srq =
4888 4888 state->id_rc_num_rwqe - 1;
4889 4889 /*
4890 4890 * If rx_rwqe_threshold is greater than the number of
4891 4891 * rwqes, pull it back to 25% of number of rwqes.
4892 4892 */
4893 4893 if (state->id_rc_rx_rwqe_thresh > state->id_rc_num_rwqe)
4894 4894 state->id_rc_rx_rwqe_thresh =
4895 4895 (state->id_rc_num_rwqe >> 2);
4896 4896
4897 4897 }
4898 4898 return (err);
4899 4899 }
4900 4900 if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) {
4901 4901 if (state->id_mac_state & IBD_DRV_STARTED) {
4902 4902 return (EBUSY);
4903 4903 }
4904 4904 if (pr_val == NULL) {
4905 4905 return (EINVAL);
4906 4906 }
4907 4907 if (!state->rc_enable_srq)
4908 4908 return (EINVAL);
4909 4909
4910 4910 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4911 4911 if (result < IBD_MIN_RC_NUM_SRQ ||
4912 4912 result >= state->id_rc_num_rwqe) {
4913 4913 err = EINVAL;
4914 4914 } else
4915 4915 state->id_rc_num_srq = (uint32_t)result;
4916 4916 return (err);
4917 4917 }
4918 4918 if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) {
4919 4919 if (state->id_mac_state & IBD_DRV_STARTED) {
4920 4920 return (EBUSY);
4921 4921 }
4922 4922 if (pr_val == NULL) {
4923 4923 return (EINVAL);
4924 4924 }
4925 4925 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4926 4926 if (result < IBD_MIN_RC_NUM_SWQE ||
4927 4927 result > IBD_MAX_RC_NUM_SWQE) {
4928 4928 err = EINVAL;
4929 4929 } else {
4930 4930 state->id_rc_num_swqe = (uint32_t)result;
4931 4931 if (state->id_allow_coalesce_comp_tuning &&
4932 4932 state->id_rc_tx_comp_count > state->id_rc_num_swqe)
4933 4933 state->id_rc_tx_comp_count =
4934 4934 state->id_rc_num_swqe;
4935 4935 }
4936 4936 return (err);
4937 4937 }
4938 4938 if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) {
4939 4939 if (!state->id_allow_coalesce_comp_tuning) {
4940 4940 return (ENOTSUP);
4941 4941 }
4942 4942 if (pr_val == NULL) {
4943 4943 return (EINVAL);
4944 4944 }
4945 4945 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4946 4946 if (result < 1 || result > state->id_rc_num_rwqe) {
4947 4947 err = EINVAL;
4948 4948 } else {
4949 4949 state->id_rc_rx_comp_count = (uint32_t)result;
4950 4950 }
4951 4951 return (err);
4952 4952 }
4953 4953 if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) {
4954 4954 if (!state->id_allow_coalesce_comp_tuning) {
4955 4955 return (ENOTSUP);
4956 4956 }
4957 4957 if (pr_val == NULL) {
4958 4958 return (EINVAL);
4959 4959 }
4960 4960 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4961 4961 if (result < 1) {
4962 4962 err = EINVAL;
4963 4963 } else {
4964 4964 state->id_rc_rx_comp_usec = (uint32_t)result;
4965 4965 }
4966 4966 return (err);
4967 4967 }
4968 4968 if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) {
4969 4969 if (state->id_mac_state & IBD_DRV_STARTED) {
4970 4970 return (EBUSY);
4971 4971 }
4972 4972 if (pr_val == NULL) {
4973 4973 return (EINVAL);
4974 4974 }
4975 4975 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4976 4976 if (result < IBD_MIN_RC_RX_COPY_THRESH ||
4977 4977 result > state->rc_mtu) {
4978 4978 err = EINVAL;
4979 4979 } else {
4980 4980 state->id_rc_rx_copy_thresh = (uint32_t)result;
4981 4981 }
4982 4982 return (err);
4983 4983 }
4984 4984 if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) {
4985 4985 if (state->id_mac_state & IBD_DRV_STARTED) {
4986 4986 return (EBUSY);
4987 4987 }
4988 4988 if (pr_val == NULL) {
4989 4989 return (EINVAL);
4990 4990 }
4991 4991 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4992 4992 if (result < IBD_MIN_RC_RX_RWQE_THRESH ||
4993 4993 result >= state->id_rc_num_rwqe) {
4994 4994 err = EINVAL;
4995 4995 } else {
4996 4996 state->id_rc_rx_rwqe_thresh = (uint32_t)result;
4997 4997 }
4998 4998 return (err);
4999 4999 }
5000 5000 if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) {
5001 5001 if (!state->id_allow_coalesce_comp_tuning) {
5002 5002 return (ENOTSUP);
5003 5003 }
5004 5004 if (pr_val == NULL) {
5005 5005 return (EINVAL);
5006 5006 }
5007 5007 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5008 5008 if (result < 1 || result > state->id_rc_num_swqe) {
5009 5009 err = EINVAL;
5010 5010 } else {
5011 5011 state->id_rc_tx_comp_count = (uint32_t)result;
5012 5012 }
5013 5013 return (err);
5014 5014 }
5015 5015 if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) {
5016 5016 if (!state->id_allow_coalesce_comp_tuning) {
5017 5017 return (ENOTSUP);
5018 5018 }
5019 5019 if (pr_val == NULL) {
5020 5020 return (EINVAL);
5021 5021 }
5022 5022 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5023 5023 if (result < 1)
5024 5024 err = EINVAL;
5025 5025 else {
5026 5026 state->id_rc_tx_comp_usec = (uint32_t)result;
5027 5027 }
5028 5028 return (err);
5029 5029 }
5030 5030 if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) {
5031 5031 if (state->id_mac_state & IBD_DRV_STARTED) {
5032 5032 return (EBUSY);
5033 5033 }
5034 5034 if (pr_val == NULL) {
5035 5035 return (EINVAL);
5036 5036 }
5037 5037 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5038 5038 if (result < IBD_MIN_RC_TX_COPY_THRESH ||
5039 5039 result > state->rc_mtu) {
5040 5040 err = EINVAL;
5041 5041 } else {
5042 5042 state->id_rc_tx_copy_thresh = (uint32_t)result;
5043 5043 }
5044 5044 return (err);
5045 5045 }
5046 5046 if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) {
5047 5047 if (state->id_mac_state & IBD_DRV_STARTED) {
5048 5048 return (EBUSY);
5049 5049 }
5050 5050 if (pr_val == NULL) {
5051 5051 return (EINVAL);
5052 5052 }
5053 5053 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5054 5054 if (result < IBD_MIN_UD_NUM_RWQE ||
5055 5055 result > IBD_MAX_UD_NUM_RWQE) {
5056 5056 err = EINVAL;
5057 5057 } else {
5058 5058 if (result > state->id_hca_max_chan_sz) {
5059 5059 state->id_ud_num_rwqe =
5060 5060 state->id_hca_max_chan_sz;
5061 5061 } else {
5062 5062 state->id_ud_num_rwqe = (uint32_t)result;
5063 5063 }
5064 5064 if (state->id_allow_coalesce_comp_tuning &&
5065 5065 state->id_ud_rx_comp_count > state->id_ud_num_rwqe)
5066 5066 state->id_ud_rx_comp_count =
5067 5067 state->id_ud_num_rwqe;
5068 5068 }
5069 5069 return (err);
5070 5070 }
5071 5071 if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) {
5072 5072 if (state->id_mac_state & IBD_DRV_STARTED) {
5073 5073 return (EBUSY);
5074 5074 }
5075 5075 if (pr_val == NULL) {
5076 5076 return (EINVAL);
5077 5077 }
5078 5078 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5079 5079 if (result < IBD_MIN_UD_NUM_SWQE ||
5080 5080 result > IBD_MAX_UD_NUM_SWQE) {
5081 5081 err = EINVAL;
5082 5082 } else {
5083 5083 if (result > state->id_hca_max_chan_sz) {
5084 5084 state->id_ud_num_swqe =
5085 5085 state->id_hca_max_chan_sz;
5086 5086 } else {
5087 5087 state->id_ud_num_swqe = (uint32_t)result;
5088 5088 }
5089 5089 if (state->id_allow_coalesce_comp_tuning &&
5090 5090 state->id_ud_tx_comp_count > state->id_ud_num_swqe)
5091 5091 state->id_ud_tx_comp_count =
5092 5092 state->id_ud_num_swqe;
5093 5093 }
5094 5094 return (err);
5095 5095 }
5096 5096 if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) {
5097 5097 if (!state->id_allow_coalesce_comp_tuning) {
5098 5098 return (ENOTSUP);
5099 5099 }
5100 5100 if (pr_val == NULL) {
5101 5101 return (EINVAL);
5102 5102 }
5103 5103 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5104 5104 if (result < 1 || result > state->id_ud_num_rwqe) {
5105 5105 err = EINVAL;
5106 5106 } else {
5107 5107 state->id_ud_rx_comp_count = (uint32_t)result;
5108 5108 }
5109 5109 return (err);
5110 5110 }
5111 5111 if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) {
5112 5112 if (!state->id_allow_coalesce_comp_tuning) {
5113 5113 return (ENOTSUP);
5114 5114 }
5115 5115 if (pr_val == NULL) {
5116 5116 return (EINVAL);
5117 5117 }
5118 5118 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5119 5119 if (result < 1) {
5120 5120 err = EINVAL;
5121 5121 } else {
5122 5122 state->id_ud_rx_comp_usec = (uint32_t)result;
5123 5123 }
5124 5124 return (err);
5125 5125 }
5126 5126 if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) {
5127 5127 if (!state->id_allow_coalesce_comp_tuning) {
5128 5128 return (ENOTSUP);
5129 5129 }
5130 5130 if (pr_val == NULL) {
5131 5131 return (EINVAL);
5132 5132 }
5133 5133 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5134 5134 if (result < 1 || result > state->id_ud_num_swqe) {
5135 5135 err = EINVAL;
5136 5136 } else {
5137 5137 state->id_ud_tx_comp_count = (uint32_t)result;
5138 5138 }
5139 5139 return (err);
5140 5140 }
5141 5141 if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) {
5142 5142 if (!state->id_allow_coalesce_comp_tuning) {
5143 5143 return (ENOTSUP);
5144 5144 }
5145 5145 if (pr_val == NULL) {
5146 5146 return (EINVAL);
5147 5147 }
5148 5148 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5149 5149 if (result < 1) {
5150 5150 err = EINVAL;
5151 5151 } else {
5152 5152 state->id_ud_tx_comp_usec = (uint32_t)result;
5153 5153 }
5154 5154 return (err);
5155 5155 }
5156 5156 if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) {
5157 5157 if (state->id_mac_state & IBD_DRV_STARTED) {
5158 5158 return (EBUSY);
5159 5159 }
5160 5160 if (pr_val == NULL) {
5161 5161 return (EINVAL);
5162 5162 }
5163 5163 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5164 5164 if (result < IBD_MIN_UD_TX_COPY_THRESH ||
5165 5165 result > IBD_MAX_UD_TX_COPY_THRESH) {
5166 5166 err = EINVAL;
5167 5167 } else {
5168 5168 state->id_ud_tx_copy_thresh = (uint32_t)result;
5169 5169 }
5170 5170 return (err);
5171 5171 }
5172 5172 return (ENOTSUP);
5173 5173 }
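The boolean private properties above all share the same parse-and-range-check shape: a NULL check, ddi_strtol(), and a 0/1 range check. A minimal sketch of that pattern factored into a helper follows; it is purely illustrative, since no such helper exists in the driver.

	/* hypothetical helper mirroring the pattern used above */
	static int
	ibd_parse_bool_prop(const void *pr_val, boolean_t *outp)
	{
		long result;

		if (pr_val == NULL)
			return (EINVAL);
		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
		if (result < 0 || result > 1)
			return (EINVAL);
		*outp = (result == 1) ? B_TRUE : B_FALSE;
		return (0);
	}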
5174 5174
5175 5175 static int
5176 5176 ibd_get_priv_prop(ibd_state_t *state, const char *pr_name, uint_t pr_valsize,
5177 5177 void *pr_val)
5178 5178 {
5179 5179 int err = ENOTSUP;
5180 5180 int value;
5181 5181
5182 5182 if (strcmp(pr_name, "_ibd_broadcast_group") == 0) {
5183 5183 value = state->id_bgroup_present;
5184 5184 err = 0;
5185 5185 goto done;
5186 5186 }
5187 5187 if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) {
5188 5188 value = state->id_allow_coalesce_comp_tuning;
5189 5189 err = 0;
5190 5190 goto done;
5191 5191 }
5192 5192 if (strcmp(pr_name, "_ibd_create_broadcast_group") == 0) {
5193 5193 value = state->id_create_broadcast_group;
5194 5194 err = 0;
5195 5195 goto done;
5196 5196 }
5197 5197 if (strcmp(pr_name, "_ibd_hash_size") == 0) {
5198 5198 value = state->id_hash_size;
5199 5199 err = 0;
5200 5200 goto done;
5201 5201 }
5202 5202 if (strcmp(pr_name, "_ibd_lso_enable") == 0) {
5203 5203 value = state->id_lso_policy;
5204 5204 err = 0;
5205 5205 goto done;
5206 5206 }
5207 5207 if (strcmp(pr_name, "_ibd_num_ah") == 0) {
5208 5208 value = state->id_num_ah;
5209 5209 err = 0;
5210 5210 goto done;
5211 5211 }
5212 5212 if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) {
5213 5213 value = state->id_num_lso_bufs;
5214 5214 err = 0;
5215 5215 goto done;
5216 5216 }
5217 5217 if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) {
5218 5218 value = state->rc_enable_srq;
5219 5219 err = 0;
5220 5220 goto done;
5221 5221 }
5222 5222 if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) {
5223 5223 value = state->id_rc_num_rwqe;
5224 5224 err = 0;
5225 5225 goto done;
5226 5226 }
5227 5227 if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) {
5228 5228 value = state->id_rc_num_srq;
5229 5229 err = 0;
5230 5230 goto done;
5231 5231 }
5232 5232 if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) {
5233 5233 value = state->id_rc_num_swqe;
5234 5234 err = 0;
5235 5235 goto done;
5236 5236 }
5237 5237 if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) {
5238 5238 value = state->id_rc_rx_comp_count;
5239 5239 err = 0;
5240 5240 goto done;
5241 5241 }
5242 5242 if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) {
5243 5243 value = state->id_rc_rx_comp_usec;
5244 5244 err = 0;
5245 5245 goto done;
5246 5246 }
5247 5247 if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) {
5248 5248 value = state->id_rc_rx_copy_thresh;
5249 5249 err = 0;
5250 5250 goto done;
5251 5251 }
5252 5252 if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) {
5253 5253 value = state->id_rc_rx_rwqe_thresh;
5254 5254 err = 0;
5255 5255 goto done;
5256 5256 }
5257 5257 if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) {
5258 5258 value = state->id_rc_tx_comp_count;
5259 5259 err = 0;
5260 5260 goto done;
5261 5261 }
5262 5262 if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) {
5263 5263 value = state->id_rc_tx_comp_usec;
5264 5264 err = 0;
5265 5265 goto done;
5266 5266 }
5267 5267 if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) {
5268 5268 value = state->id_rc_tx_copy_thresh;
5269 5269 err = 0;
5270 5270 goto done;
5271 5271 }
5272 5272 if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) {
5273 5273 value = state->id_ud_num_rwqe;
5274 5274 err = 0;
5275 5275 goto done;
5276 5276 }
5277 5277 if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) {
5278 5278 value = state->id_ud_num_swqe;
5279 5279 err = 0;
5280 5280 goto done;
5281 5281 }
5282 5282 if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) {
5283 5283 value = state->id_ud_rx_comp_count;
5284 5284 err = 0;
5285 5285 goto done;
5286 5286 }
5287 5287 if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) {
5288 5288 value = state->id_ud_rx_comp_usec;
5289 5289 err = 0;
5290 5290 goto done;
5291 5291 }
5292 5292 if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) {
5293 5293 value = state->id_ud_tx_comp_count;
5294 5294 err = 0;
5295 5295 goto done;
5296 5296 }
5297 5297 if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) {
5298 5298 value = state->id_ud_tx_comp_usec;
5299 5299 err = 0;
5300 5300 goto done;
5301 5301 }
5302 5302 if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) {
5303 5303 value = state->id_ud_tx_copy_thresh;
5304 5304 err = 0;
5305 5305 goto done;
5306 5306 }
5307 5307 done:
5308 5308 if (err == 0) {
5309 5309 (void) snprintf(pr_val, pr_valsize, "%d", value);
5310 5310 }
5311 5311 return (err);
5312 5312 }
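The property names handled by the two routines above are what a user sees through dladm; private properties carry the leading underscore. For example, with a hypothetical partition link named ibd0, tuning and inspecting the LSO policy and address-handle count would look roughly like:

	# dladm set-linkprop -p _ibd_lso_enable=0 ibd0
	# dladm show-linkprop -p _ibd_lso_enable,_ibd_num_ah ibd0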
5313 5313
5314 5314 static int
5315 5315 ibd_get_port_details(ibd_state_t *state)
5316 5316 {
5317 5317 ibt_hca_portinfo_t *port_infop;
5318 5318 ibt_status_t ret;
5319 5319 uint_t psize, port_infosz;
5320 5320
5321 5321 mutex_enter(&state->id_link_mutex);
5322 5322
5323 5323 /*
5324 5324 * Query for port information
5325 5325 */
5326 5326 ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
5327 5327 &port_infop, &psize, &port_infosz);
5328 5328 if ((ret != IBT_SUCCESS) || (psize != 1)) {
5329 5329 mutex_exit(&state->id_link_mutex);
5330 5330 DPRINT(10, "ibd_get_port_details: ibt_query_hca_ports() "
5331 5331 "failed, ret=%d", ret);
5332 5332 return (ENETDOWN);
5333 5333 }
5334 5334
5335 5335 /*
5336 5336 * If the link is active, verify the pkey
5337 5337 */
5338 5338 if (port_infop->p_linkstate == IBT_PORT_ACTIVE) {
5339 5339 if ((ret = ibt_pkey2index(state->id_hca_hdl, state->id_port,
5340 5340 state->id_pkey, &state->id_pkix)) != IBT_SUCCESS) {
5341 5341 state->id_link_state = LINK_STATE_DOWN;
5342 5342 } else {
5343 5343 state->id_link_state = LINK_STATE_UP;
5344 5344 }
5345 5345 state->id_mtu = (128 << port_infop->p_mtu);
5346 5346 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid))
5347 5347 state->id_sgid = *port_infop->p_sgid_tbl;
5348 5348 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid))
5349 5349 /*
5350 5350 * Now that the port is active, record the port speed
5351 5351 */
5352 5352 state->id_link_speed = ibd_get_portspeed(state);
5353 5353 } else {
5354 5354 /* Make sure that these are handled in PORT_UP/CHANGE */
5355 5355 state->id_mtu = 0;
5356 5356 state->id_link_state = LINK_STATE_DOWN;
5357 5357 state->id_link_speed = 0;
5358 5358 }
5359 5359 mutex_exit(&state->id_link_mutex);
5360 5360 ibt_free_portinfo(port_infop, port_infosz);
5361 5361
5362 5362 return (0);
5363 5363 }
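A short worked example of the MTU computation above: p_mtu is the IB-encoded MTU (1 = 256 bytes through 5 = 4096 bytes), so the driver derives the link MTU with a shift, and the SDU later reported to GLDv3 subtracts the IPoIB header. The values below are illustrative only.

	uint_t mtu = 128 << 4;			/* p_mtu == 4 (2K)  ->  2048 bytes */
	uint_t sdu = mtu - IPOIB_HDRSIZE;	/* payload MTU handed to GLDv3 */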
5364 5364
5365 5365 static int
5366 5366 ibd_alloc_cqs(ibd_state_t *state)
5367 5367 {
5368 5368 ibt_hca_attr_t hca_attrs;
5369 5369 ibt_cq_attr_t cq_attr;
5370 5370 ibt_status_t ret;
5371 5371 uint32_t real_size;
5372 5372 uint_t num_rwqe_change = 0;
5373 5373 uint_t num_swqe_change = 0;
5374 5374
5375 5375 ret = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
5376 5376 ASSERT(ret == IBT_SUCCESS);
5377 5377
5378 5378 /*
5379 5379 * Allocate Rx/combined CQ:
5380 5380 * Theoretically, there is no point in having more than #rwqe
5381 5381 * plus #swqe cqe's, except that the CQ will be signaled for
5382 5382 * overflow when the last wqe completes, if none of the previous
5383 5383 	 * cqe's have been polled. Thus, we size the CQ one entry larger
5384 5384 	 * than the wqe count to make sure such an overflow does not occur.
5385 5385 */
5386 5386 cq_attr.cq_sched = NULL;
5387 5387 cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
5388 5388
5389 5389 /*
5390 5390 * Allocate Receive CQ.
5391 5391 */
5392 5392 if (hca_attrs.hca_max_cq_sz >= (state->id_ud_num_rwqe + 1)) {
5393 5393 cq_attr.cq_size = state->id_ud_num_rwqe + 1;
5394 5394 } else {
5395 5395 cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
5396 5396 num_rwqe_change = state->id_ud_num_rwqe;
5397 5397 state->id_ud_num_rwqe = cq_attr.cq_size - 1;
5398 5398 }
5399 5399
5400 5400 if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
5401 5401 &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) {
5402 5402 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rcq) "
5403 5403 "failed, ret=%d\n", ret);
5404 5404 return (DDI_FAILURE);
5405 5405 }
5406 5406
5407 5407 if ((ret = ibt_modify_cq(state->id_rcq_hdl, state->id_ud_rx_comp_count,
5408 5408 state->id_ud_rx_comp_usec, 0)) != IBT_SUCCESS) {
5409 5409 DPRINT(10, "ibd_alloc_cqs: Receive CQ interrupt "
5410 5410 "moderation failed, ret=%d\n", ret);
5411 5411 }
5412 5412
5413 5413 /* make the #rx wc's the same as max rx chain size */
5414 5414 state->id_rxwcs_size = IBD_MAX_RX_MP_LEN;
5415 5415 state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) *
5416 5416 state->id_rxwcs_size, KM_SLEEP);
5417 5417
5418 5418 /*
5419 5419 * Allocate Send CQ.
5420 5420 */
5421 5421 if (hca_attrs.hca_max_cq_sz >= (state->id_ud_num_swqe + 1)) {
5422 5422 cq_attr.cq_size = state->id_ud_num_swqe + 1;
5423 5423 } else {
5424 5424 cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
5425 5425 num_swqe_change = state->id_ud_num_swqe;
5426 5426 state->id_ud_num_swqe = cq_attr.cq_size - 1;
5427 5427 }
5428 5428
5429 5429 if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
5430 5430 &state->id_scq_hdl, &real_size)) != IBT_SUCCESS) {
5431 5431 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(scq) "
5432 5432 "failed, ret=%d\n", ret);
5433 5433 kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) *
5434 5434 state->id_rxwcs_size);
5435 5435 (void) ibt_free_cq(state->id_rcq_hdl);
5436 5436 return (DDI_FAILURE);
5437 5437 }
5438 5438 if ((ret = ibt_modify_cq(state->id_scq_hdl, state->id_ud_tx_comp_count,
5439 5439 state->id_ud_tx_comp_usec, 0)) != IBT_SUCCESS) {
5440 5440 DPRINT(10, "ibd_alloc_cqs: Send CQ interrupt "
5441 5441 "moderation failed, ret=%d\n", ret);
5442 5442 }
5443 5443
5444 5444 state->id_txwcs_size = IBD_TX_POLL_THRESH;
5445 5445 state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) *
5446 5446 state->id_txwcs_size, KM_SLEEP);
5447 5447
5448 5448 /*
5449 5449 	 * Print a message in case we could not allocate as many wqe's
5450 5450 	 * as were requested.
5451 5451 */
5452 5452 if (num_rwqe_change) {
5453 5453 ibd_print_warn(state, "Setting #rwqe = %d instead of default "
5454 5454 "%d", state->id_ud_num_rwqe, num_rwqe_change);
5455 5455 }
5456 5456 if (num_swqe_change) {
5457 5457 ibd_print_warn(state, "Setting #swqe = %d instead of default "
5458 5458 "%d", state->id_ud_num_swqe, num_swqe_change);
5459 5459 }
5460 5460
5461 5461 return (DDI_SUCCESS);
5462 5462 }
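The sizing logic above boils down to clamping each CQ to the HCA limit and then shrinking the wqe count to fit underneath it. A compressed sketch of that idiom, where hca_max and nwqe stand in for the fields used above:

	cq_attr.cq_size = MIN(hca_max, nwqe + 1);	/* one spare cqe to avoid overflow */
	nwqe = cq_attr.cq_size - 1;			/* wqe count trails the cq size */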
5463 5463
5464 5464 static int
5465 5465 ibd_setup_ud_channel(ibd_state_t *state)
5466 5466 {
5467 5467 ibt_ud_chan_alloc_args_t ud_alloc_attr;
5468 5468 ibt_ud_chan_query_attr_t ud_chan_attr;
5469 5469 ibt_status_t ret;
5470 5470
5471 5471 ud_alloc_attr.ud_flags = IBT_ALL_SIGNALED;
5472 5472 if (state->id_hca_res_lkey_capab)
5473 5473 ud_alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY;
5474 5474 if (state->id_lso_policy && state->id_lso_capable)
5475 5475 ud_alloc_attr.ud_flags |= IBT_USES_LSO;
5476 5476
5477 5477 ud_alloc_attr.ud_hca_port_num = state->id_port;
5478 5478 ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg;
5479 5479 ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG;
5480 5480 ud_alloc_attr.ud_sizes.cs_sq = state->id_ud_num_swqe;
5481 5481 ud_alloc_attr.ud_sizes.cs_rq = state->id_ud_num_rwqe;
5482 5482 ud_alloc_attr.ud_qkey = state->id_mcinfo->mc_qkey;
5483 5483 ud_alloc_attr.ud_scq = state->id_scq_hdl;
5484 5484 ud_alloc_attr.ud_rcq = state->id_rcq_hdl;
5485 5485 ud_alloc_attr.ud_pd = state->id_pd_hdl;
5486 5486 ud_alloc_attr.ud_pkey_ix = state->id_pkix;
5487 5487 ud_alloc_attr.ud_clone_chan = NULL;
5488 5488
5489 5489 if ((ret = ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS,
5490 5490 &ud_alloc_attr, &state->id_chnl_hdl, NULL)) != IBT_SUCCESS) {
5491 5491 DPRINT(10, "ibd_setup_ud_channel: ibt_alloc_ud_channel() "
5492 5492 "failed, ret=%d\n", ret);
5493 5493 return (DDI_FAILURE);
5494 5494 }
5495 5495
5496 5496 if ((ret = ibt_query_ud_channel(state->id_chnl_hdl,
5497 5497 &ud_chan_attr)) != IBT_SUCCESS) {
5498 5498 DPRINT(10, "ibd_setup_ud_channel: ibt_query_ud_channel() "
5499 5499 "failed, ret=%d\n", ret);
5500 5500 (void) ibt_free_channel(state->id_chnl_hdl);
5501 5501 return (DDI_FAILURE);
5502 5502 }
5503 5503
5504 5504 state->id_qpnum = ud_chan_attr.ud_qpn;
5505 5505
5506 5506 return (DDI_SUCCESS);
5507 5507 }
5508 5508
5509 5509 static int
5510 5510 ibd_undo_start(ibd_state_t *state, link_state_t cur_link_state)
5511 5511 {
5512 5512 uint32_t progress = state->id_mac_state;
5513 5513 uint_t attempts;
5514 5514 ibt_status_t ret;
5515 5515 ib_gid_t mgid;
5516 5516 ibd_mce_t *mce;
5517 5517 uint8_t jstate;
5518 5518 timeout_id_t tid;
5519 5519
5520 5520 if (atomic_dec_32_nv(&state->id_running) != 0)
5521 5521 cmn_err(CE_WARN, "ibd_undo_start: id_running was not 1\n");
5522 5522
5523 5523 /*
5524 5524 * Before we try to stop/undo whatever we did in ibd_start(),
5525 5525 * we need to mark the link state appropriately to prevent the
5526 5526 * ip layer from using this instance for any new transfers. Note
5527 5527 * that if the original state of the link was "up" when we're
5528 5528 * here, we'll set the final link state to "unknown", to behave
5529 5529 * in the same fashion as other ethernet drivers.
5530 5530 */
5531 5531 mutex_enter(&state->id_link_mutex);
5532 5532 if (cur_link_state == LINK_STATE_DOWN) {
5533 5533 state->id_link_state = cur_link_state;
5534 5534 } else {
5535 5535 state->id_link_state = LINK_STATE_UNKNOWN;
5536 5536 }
5537 5537 mutex_exit(&state->id_link_mutex);
5538 5538 bzero(&state->id_macaddr, sizeof (ipoib_mac_t));
5539 5539 mac_link_update(state->id_mh, state->id_link_state);
5540 5540
5541 5541 state->id_mac_state &= (~IBD_DRV_PORT_DETAILS_OBTAINED);
5542 5542 if (progress & IBD_DRV_STARTED) {
5543 5543 state->id_mac_state &= (~IBD_DRV_STARTED);
5544 5544 }
5545 5545
5546 5546 if (progress & IBD_DRV_IN_LATE_HCA_INIT) {
5547 5547 state->id_mac_state &= (~IBD_DRV_IN_LATE_HCA_INIT);
5548 5548 }
5549 5549
5550 5550 /* Stop listen under Reliable Connected Mode */
5551 5551 if (progress & IBD_DRV_RC_LISTEN) {
5552 5552 ASSERT(state->id_enable_rc);
5553 5553 if (state->rc_listen_hdl != NULL) {
5554 5554 ibd_rc_stop_listen(state);
5555 5555 }
5556 5556 state->id_mac_state &= (~IBD_DRV_RC_LISTEN);
5557 5557 }
5558 5558
5559 5559 /* Stop timeout routine */
5560 5560 if (progress & IBD_DRV_RC_TIMEOUT) {
5561 5561 ASSERT(state->id_enable_rc);
5562 5562 mutex_enter(&state->rc_timeout_lock);
5563 5563 state->rc_timeout_start = B_FALSE;
5564 5564 tid = state->rc_timeout;
5565 5565 state->rc_timeout = 0;
5566 5566 mutex_exit(&state->rc_timeout_lock);
5567 5567 if (tid != 0)
5568 5568 (void) untimeout(tid);
5569 5569 state->id_mac_state &= (~IBD_DRV_RC_TIMEOUT);
5570 5570 }
5571 5571
5572 5572 if ((state->id_enable_rc) && (progress & IBD_DRV_ACACHE_INITIALIZED)) {
5573 5573 attempts = 100;
5574 5574 while (state->id_ah_op == IBD_OP_ONGOING) {
5575 5575 /*
5576 5576 * "state->id_ah_op == IBD_OP_ONGOING" means this IPoIB
5577 5577 * port is connecting to a remote IPoIB port. Wait for
5578 5578 * the end of this connecting operation.
5579 5579 */
5580 5580 delay(drv_usectohz(100000));
5581 5581 if (--attempts == 0) {
5582 5582 state->rc_stop_connect++;
5583 5583 DPRINT(40, "ibd_undo_start: connecting");
5584 5584 break;
5585 5585 }
5586 5586 }
5587 5587 mutex_enter(&state->id_sched_lock);
5588 5588 state->id_sched_needed = 0;
5589 5589 mutex_exit(&state->id_sched_lock);
5590 5590 (void) ibd_rc_close_all_chan(state);
5591 5591 }
5592 5592
5593 5593 /*
5594 5594 * First, stop receive interrupts; this stops the driver from
5595 5595 * handing up buffers to higher layers. Wait for receive buffers
5596 5596 * to be returned and give up after 1 second.
5597 5597 */
5598 5598 if (progress & IBD_DRV_RCQ_NOTIFY_ENABLED) {
5599 5599 attempts = 10;
5600 5600 while (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding,
5601 5601 0) > 0) {
5602 5602 delay(drv_usectohz(100000));
5603 5603 if (--attempts == 0) {
5604 5604 /*
5605 5605 * There are pending bufs with the network
5606 5606 * layer and we have no choice but to wait
5607 5607 				 * for it to be done with them. Reap all the
5608 5608 * Tx/Rx completions that were posted since
5609 5609 * we turned off the notification and
5610 5610 * return failure.
5611 5611 */
5612 5612 cmn_err(CE_CONT, "!ibd: bufs outstanding\n");
5613 5613 DPRINT(2, "ibd_undo_start: "
5614 5614 "reclaiming failed");
5615 5615 break;
5616 5616 }
5617 5617 }
5618 5618 state->id_mac_state &= (~IBD_DRV_RCQ_NOTIFY_ENABLED);
5619 5619 }
5620 5620
5621 5621 if (progress & IBD_DRV_RC_LARGEBUF_ALLOCD) {
5622 5622 ibd_rc_fini_tx_largebuf_list(state);
5623 5623 state->id_mac_state &= (~IBD_DRV_RC_LARGEBUF_ALLOCD);
5624 5624 }
5625 5625
5626 5626 if (progress & IBD_DRV_RC_SRQ_ALLOCD) {
5627 5627 ASSERT(state->id_enable_rc);
5628 5628 if (state->rc_srq_rwqe_list.dl_bufs_outstanding == 0) {
5629 5629 if (state->id_ah_op == IBD_OP_ONGOING) {
5630 5630 delay(drv_usectohz(10000));
5631 5631 if (state->id_ah_op == IBD_OP_ONGOING) {
5632 5632 /*
5633 5633 * "state->id_ah_op == IBD_OP_ONGOING"
5634 5634 * means this IPoIB port is connecting
5635 5635 * to a remote IPoIB port. We can't
5636 5636 * delete SRQ here.
5637 5637 */
5638 5638 state->rc_stop_connect++;
5639 5639 DPRINT(40, "ibd_undo_start: "
5640 5640 "connecting");
5641 5641 } else {
5642 5642 ibd_rc_fini_srq_list(state);
5643 5643 state->id_mac_state &=
5644 5644 (~IBD_DRV_RC_SRQ_ALLOCD);
5645 5645 }
5646 5646 } else {
5647 5647 ibd_rc_fini_srq_list(state);
5648 5648 state->id_mac_state &= (~IBD_DRV_RC_SRQ_ALLOCD);
5649 5649 }
5650 5650 } else {
5651 5651 DPRINT(40, "ibd_undo_start: srq bufs outstanding\n");
5652 5652 }
5653 5653 }
5654 5654
5655 5655 if (progress & IBD_DRV_SM_NOTICES_REGISTERED) {
5656 5656 ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL);
5657 5657
5658 5658 mutex_enter(&state->id_trap_lock);
5659 5659 state->id_trap_stop = B_TRUE;
5660 5660 while (state->id_trap_inprog > 0)
5661 5661 cv_wait(&state->id_trap_cv, &state->id_trap_lock);
5662 5662 mutex_exit(&state->id_trap_lock);
5663 5663
5664 5664 state->id_mac_state &= (~IBD_DRV_SM_NOTICES_REGISTERED);
5665 5665 }
5666 5666
5667 5667 if (progress & IBD_DRV_SCQ_NOTIFY_ENABLED) {
5668 5668 /*
5669 5669 * Flushing the channel ensures that all pending WQE's
5670 5670 * are marked with flush_error and handed to the CQ. It
5671 5671 * does not guarantee the invocation of the CQ handler.
5672 5672 * This call is guaranteed to return successfully for
5673 5673 * UD QPNs.
5674 5674 */
5675 5675 if ((ret = ibt_flush_channel(state->id_chnl_hdl)) !=
5676 5676 IBT_SUCCESS) {
5677 5677 DPRINT(10, "ibd_undo_start: flush_channel "
5678 5678 "failed, ret=%d", ret);
5679 5679 }
5680 5680
5681 5681 /*
5682 5682 * Give some time for the TX CQ handler to process the
5683 5683 * completions.
5684 5684 */
5685 5685 attempts = 10;
5686 5686 mutex_enter(&state->id_tx_list.dl_mutex);
5687 5687 mutex_enter(&state->id_tx_rel_list.dl_mutex);
5688 5688 while (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt
5689 5689 != state->id_ud_num_swqe) {
5690 5690 if (--attempts == 0)
5691 5691 break;
5692 5692 mutex_exit(&state->id_tx_rel_list.dl_mutex);
5693 5693 mutex_exit(&state->id_tx_list.dl_mutex);
5694 5694 delay(drv_usectohz(100000));
5695 5695 mutex_enter(&state->id_tx_list.dl_mutex);
5696 5696 mutex_enter(&state->id_tx_rel_list.dl_mutex);
5697 5697 }
5698 5698 ibt_set_cq_handler(state->id_scq_hdl, 0, 0);
5699 5699 if (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt !=
5700 5700 state->id_ud_num_swqe) {
5701 5701 cmn_err(CE_WARN, "tx resources not freed\n");
5702 5702 }
5703 5703 mutex_exit(&state->id_tx_rel_list.dl_mutex);
5704 5704 mutex_exit(&state->id_tx_list.dl_mutex);
5705 5705
5706 5706 attempts = 10;
5707 5707 while (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) {
5708 5708 if (--attempts == 0)
5709 5709 break;
5710 5710 delay(drv_usectohz(100000));
5711 5711 }
5712 5712 ibt_set_cq_handler(state->id_rcq_hdl, 0, 0);
5713 5713 if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) {
5714 5714 cmn_err(CE_WARN, "rx resources not freed\n");
5715 5715 }
5716 5716
5717 5717 state->id_mac_state &= (~IBD_DRV_SCQ_NOTIFY_ENABLED);
5718 5718 }
5719 5719
5720 5720 if (progress & IBD_DRV_BCAST_GROUP_JOINED) {
5721 5721 /*
5722 5722 * Drop all residual full/non membership. This includes full
5723 5723 * membership to the broadcast group, and any nonmembership
5724 5724 * acquired during transmits. We do this after the Tx completion
5725 5725 * handlers are done, since those might result in some late
5726 5726 * leaves; this also eliminates a potential race with that
5727 5727 * path wrt the mc full list insert/delete. Trap handling
5728 5728 * has also been suppressed at this point. Thus, no locks
5729 5729 * are required while traversing the mc full list.
5730 5730 */
5731 5731 DPRINT(2, "ibd_undo_start: clear full cache entries");
5732 5732 mce = list_head(&state->id_mc_full);
5733 5733 while (mce != NULL) {
5734 5734 mgid = mce->mc_info.mc_adds_vect.av_dgid;
5735 5735 jstate = mce->mc_jstate;
5736 5736 mce = list_next(&state->id_mc_full, mce);
5737 5737 ibd_leave_group(state, mgid, jstate);
5738 5738 }
5739 5739 state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_JOINED);
5740 5740 }
5741 5741
5742 5742 if (progress & IBD_DRV_RXLIST_ALLOCD) {
5743 5743 ibd_fini_rxlist(state);
5744 5744 state->id_mac_state &= (~IBD_DRV_RXLIST_ALLOCD);
5745 5745 }
5746 5746
5747 5747 if (progress & IBD_DRV_TXLIST_ALLOCD) {
5748 5748 ibd_fini_txlist(state);
5749 5749 state->id_mac_state &= (~IBD_DRV_TXLIST_ALLOCD);
5750 5750 }
5751 5751
5752 5752 if (progress & IBD_DRV_UD_CHANNEL_SETUP) {
5753 5753 if ((ret = ibt_free_channel(state->id_chnl_hdl)) !=
5754 5754 IBT_SUCCESS) {
5755 5755 DPRINT(10, "ibd_undo_start: free_channel "
5756 5756 "failed, ret=%d", ret);
5757 5757 }
5758 5758
5759 5759 state->id_mac_state &= (~IBD_DRV_UD_CHANNEL_SETUP);
5760 5760 }
5761 5761
5762 5762 if (progress & IBD_DRV_CQS_ALLOCD) {
5763 5763 kmem_free(state->id_txwcs,
5764 5764 sizeof (ibt_wc_t) * state->id_txwcs_size);
5765 5765 if ((ret = ibt_free_cq(state->id_scq_hdl)) !=
5766 5766 IBT_SUCCESS) {
5767 5767 DPRINT(10, "ibd_undo_start: free_cq(scq) "
5768 5768 "failed, ret=%d", ret);
5769 5769 }
5770 5770
5771 5771 kmem_free(state->id_rxwcs,
5772 5772 sizeof (ibt_wc_t) * state->id_rxwcs_size);
5773 5773 if ((ret = ibt_free_cq(state->id_rcq_hdl)) != IBT_SUCCESS) {
5774 5774 DPRINT(10, "ibd_undo_start: free_cq(rcq) failed, "
5775 5775 "ret=%d", ret);
5776 5776 }
5777 5777
5778 5778 state->id_txwcs = NULL;
5779 5779 state->id_rxwcs = NULL;
5780 5780 state->id_scq_hdl = NULL;
5781 5781 state->id_rcq_hdl = NULL;
5782 5782
5783 5783 state->id_mac_state &= (~IBD_DRV_CQS_ALLOCD);
5784 5784 }
5785 5785
5786 5786 if (progress & IBD_DRV_ACACHE_INITIALIZED) {
5787 5787 mutex_enter(&state->id_ac_mutex);
5788 5788 mod_hash_destroy_hash(state->id_ah_active_hash);
5789 5789 mutex_exit(&state->id_ac_mutex);
5790 5790 ibd_acache_fini(state);
5791 5791
5792 5792 state->id_mac_state &= (~IBD_DRV_ACACHE_INITIALIZED);
5793 5793 }
5794 5794
5795 5795 if (progress & IBD_DRV_BCAST_GROUP_FOUND) {
5796 5796 /*
5797 5797 * If we'd created the ipoib broadcast group and had
5798 5798 * successfully joined it, leave it now
5799 5799 */
5800 5800 if (state->id_bgroup_created) {
5801 5801 mgid = state->id_mcinfo->mc_adds_vect.av_dgid;
5802 5802 jstate = IB_MC_JSTATE_FULL;
5803 5803 (void) ibt_leave_mcg(state->id_sgid, mgid,
5804 5804 state->id_sgid, jstate);
5805 5805 }
5806 5806 ibt_free_mcg_info(state->id_mcinfo, 1);
5807 5807
5808 5808 state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_FOUND);
5809 5809 }
5810 5810
5811 5811 return (DDI_SUCCESS);
5812 5812 }
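ibd_undo_start() relies on the id_mac_state progress bits that ibd_start() sets as each step completes, so teardown only undoes what was actually done. The general idiom, with IBD_DRV_FOO_SETUP and the foo routines as hypothetical stand-ins:

	/* in ibd_start(): */
	if (ibd_setup_foo(state) != DDI_SUCCESS)
		goto start_fail;
	state->id_mac_state |= IBD_DRV_FOO_SETUP;

	/* in ibd_undo_start(): */
	if (progress & IBD_DRV_FOO_SETUP) {
		ibd_teardown_foo(state);
		state->id_mac_state &= (~IBD_DRV_FOO_SETUP);
	}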
5813 5813
5814 5814 /*
5815 5815  * This pair of routines is used to set/clear the condition that
5816 5816 * the caller is likely to do something to change the id_mac_state.
5817 5817 * If there's already someone doing either a start or a stop (possibly
5818 5818 * due to the async handler detecting a pkey relocation event, a plumb
5819 5819 * or dlpi_open, or an unplumb or dlpi_close coming in), we wait until
5820 5820 * that's done.
5821 5821 */
5822 5822 static void
5823 5823 ibd_set_mac_progress(ibd_state_t *state, uint_t flag)
5824 5824 {
5825 5825 mutex_enter(&state->id_macst_lock);
5826 5826 while (state->id_mac_state & IBD_DRV_RESTART_IN_PROGRESS)
5827 5827 cv_wait(&state->id_macst_cv, &state->id_macst_lock);
5828 5828
5829 5829 state->id_mac_state |= flag;
5830 5830 mutex_exit(&state->id_macst_lock);
5831 5831 }
5832 5832
5833 5833 static void
5834 5834 ibd_clr_mac_progress(ibd_state_t *state, uint_t flag)
5835 5835 {
5836 5836 mutex_enter(&state->id_macst_lock);
5837 5837 state->id_mac_state &= (~flag);
5838 5838 cv_signal(&state->id_macst_cv);
5839 5839 mutex_exit(&state->id_macst_lock);
5840 5840 }
5841 5841
5842 5842 /*
5843 5843 * GLDv3 entry point to start hardware.
5844 5844 */
5845 5845 /*ARGSUSED*/
5846 5846 static int
5847 5847 ibd_m_start(void *arg)
5848 5848 {
5849 5849 ibd_state_t *state = arg;
5850 5850 int ret;
5851 5851
5852 5852 if (state->id_type == IBD_PORT_DRIVER)
5853 5853 return (EINVAL);
5854 5854
5855 5855 ibd_set_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
5856 5856 if (state->id_mac_state & IBD_DRV_IN_DELETION) {
5857 5857 ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
5858 5858 return (EIO);
5859 5859 }
5860 5860
5861 5861 ret = ibd_start(state);
5862 5862 ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
5863 5863 return (ret);
5864 5864 }
5865 5865
5866 5866 static int
5867 5867 ibd_start(ibd_state_t *state)
5868 5868 {
5869 5869 int err;
5870 5870 ibt_status_t ret;
5871 5871 int late_hca_init = 0;
5872 5872
5873 5873 if (state->id_mac_state & IBD_DRV_STARTED)
5874 5874 return (DDI_SUCCESS);
5875 5875
5876 5876 /*
5877 5877 * We do not increment the running flag when calling ibd_start() as
5878 5878 * a result of some event which moves the state away from late HCA
5879 5879 	 * initialization, viz. MCG_CREATED, PORT_CHANGE or link availability.
5880 5880 */
5881 5881 if (!(state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) &&
5882 5882 (atomic_inc_32_nv(&state->id_running) != 1)) {
5883 5883 DPRINT(10, "ibd_start: id_running is non-zero");
5884 5884 cmn_err(CE_WARN, "ibd_start: id_running was not 0\n");
5885 5885 atomic_dec_32(&state->id_running);
5886 5886 return (EINVAL);
5887 5887 }
5888 5888
5889 5889 /*
5890 5890 * Get port details; if we fail here, something bad happened.
5891 5891 * Fail plumb.
5892 5892 */
5893 5893 if ((err = ibd_get_port_details(state)) != 0) {
5894 5894 DPRINT(10, "ibd_start: ibd_get_port_details() failed");
5895 5895 goto start_fail;
5896 5896 }
5897 5897 /*
5898 5898 * If state->id_link_state is DOWN, it indicates that either the port
5899 5899 * is down, or the pkey is not available. In both cases, resort to late
5900 5900 * initialization. Register for subnet notices, and return success.
5901 5901 */
5902 5902 state->id_mac_state |= IBD_DRV_PORT_DETAILS_OBTAINED;
5903 5903 if (state->id_link_state == LINK_STATE_DOWN) {
5904 5904 late_hca_init = 1;
5905 5905 goto late_hca_init_return;
5906 5906 }
5907 5907
5908 5908 /*
5909 5909 * Find the IPoIB broadcast group
5910 5910 */
5911 5911 if (ibd_find_bgroup(state) != IBT_SUCCESS) {
5912 5912 /* Resort to late initialization */
5913 5913 late_hca_init = 1;
5914 5914 goto reg_snet_notices;
5915 5915 }
5916 5916 state->id_mac_state |= IBD_DRV_BCAST_GROUP_FOUND;
5917 5917
5918 5918 /*
5919 5919 * Initialize per-interface caches and lists; if we fail here,
5920 5920 * it is most likely due to a lack of resources
5921 5921 */
5922 5922 if (ibd_acache_init(state) != DDI_SUCCESS) {
5923 5923 DPRINT(10, "ibd_start: ibd_acache_init() failed");
5924 5924 err = ENOMEM;
5925 5925 goto start_fail;
5926 5926 }
5927 5927 state->id_mac_state |= IBD_DRV_ACACHE_INITIALIZED;
5928 5928
5929 5929 /*
5930 5930 * Allocate send and receive completion queues
5931 5931 */
5932 5932 if (ibd_alloc_cqs(state) != DDI_SUCCESS) {
5933 5933 DPRINT(10, "ibd_start: ibd_alloc_cqs() failed");
5934 5934 err = ENOMEM;
5935 5935 goto start_fail;
5936 5936 }
5937 5937 state->id_mac_state |= IBD_DRV_CQS_ALLOCD;
5938 5938
5939 5939 /*
5940 5940 * Setup a UD channel
5941 5941 */
5942 5942 if (ibd_setup_ud_channel(state) != DDI_SUCCESS) {
5943 5943 err = ENOMEM;
5944 5944 DPRINT(10, "ibd_start: ibd_setup_ud_channel() failed");
5945 5945 goto start_fail;
5946 5946 }
5947 5947 state->id_mac_state |= IBD_DRV_UD_CHANNEL_SETUP;
5948 5948
5949 5949 /*
5950 5950 * Allocate and initialize the tx buffer list
5951 5951 */
5952 5952 if (ibd_init_txlist(state) != DDI_SUCCESS) {
5953 5953 DPRINT(10, "ibd_start: ibd_init_txlist() failed");
5954 5954 err = ENOMEM;
5955 5955 goto start_fail;
5956 5956 }
5957 5957 state->id_mac_state |= IBD_DRV_TXLIST_ALLOCD;
5958 5958
5959 5959 /*
5960 5960 * Create the send cq handler here
5961 5961 */
5962 5962 ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state);
5963 5963 if ((ret = ibt_enable_cq_notify(state->id_scq_hdl,
5964 5964 IBT_NEXT_COMPLETION)) != IBT_SUCCESS) {
5965 5965 DPRINT(10, "ibd_start: ibt_enable_cq_notify(scq) "
5966 5966 "failed, ret=%d", ret);
5967 5967 err = EINVAL;
5968 5968 goto start_fail;
5969 5969 }
5970 5970 state->id_mac_state |= IBD_DRV_SCQ_NOTIFY_ENABLED;
5971 5971
5972 5972 /*
5973 5973 * Allocate and initialize the rx buffer list
5974 5974 */
5975 5975 if (ibd_init_rxlist(state) != DDI_SUCCESS) {
5976 5976 DPRINT(10, "ibd_start: ibd_init_rxlist() failed");
5977 5977 err = ENOMEM;
5978 5978 goto start_fail;
5979 5979 }
5980 5980 state->id_mac_state |= IBD_DRV_RXLIST_ALLOCD;
5981 5981
5982 5982 /*
5983 5983 * Join IPoIB broadcast group
5984 5984 */
5985 5985 if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) {
5986 5986 DPRINT(10, "ibd_start: ibd_join_group() failed");
5987 5987 err = ENOTACTIVE;
5988 5988 goto start_fail;
5989 5989 }
5990 5990 state->id_mac_state |= IBD_DRV_BCAST_GROUP_JOINED;
5991 5991
5992 5992 /*
5993 5993 * When we did mac_register() in ibd_attach(), we didn't register
5994 5994 * the real macaddr and we didn't have the true port mtu. Now that
5995 5995 * we're almost ready, set the local mac address and broadcast
5996 5996 * addresses and update gldv3 about the real values of these
5997 5997 * parameters.
5998 5998 */
5999 5999 if (state->id_enable_rc) {
6000 6000 ibd_h2n_mac(&state->id_macaddr,
6001 6001 IBD_MAC_ADDR_RC + state->id_qpnum,
6002 6002 state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
6003 6003 ibd_h2n_mac(&state->rc_macaddr_loopback, state->id_qpnum,
6004 6004 state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
6005 6005 } else {
6006 6006 ibd_h2n_mac(&state->id_macaddr, state->id_qpnum,
6007 6007 state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
6008 6008 }
6009 6009 ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK,
6010 6010 state->id_mgid.gid_prefix, state->id_mgid.gid_guid);
6011 6011
6012 6012 if (!state->id_enable_rc) {
6013 6013 (void) mac_maxsdu_update2(state->id_mh,
6014 6014 state->id_mtu - IPOIB_HDRSIZE,
6015 6015 state->id_mtu - IPOIB_HDRSIZE);
6016 6016 }
6017 6017 mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr);
6018 6018
6019 6019 /*
6020 6020 * Setup the receive cq handler
6021 6021 */
6022 6022 ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state);
6023 6023 if ((ret = ibt_enable_cq_notify(state->id_rcq_hdl,
6024 6024 IBT_NEXT_COMPLETION)) != IBT_SUCCESS) {
6025 6025 DPRINT(10, "ibd_start: ibt_enable_cq_notify(rcq) "
6026 6026 "failed, ret=%d", ret);
6027 6027 err = EINVAL;
6028 6028 goto start_fail;
6029 6029 }
6030 6030 state->id_mac_state |= IBD_DRV_RCQ_NOTIFY_ENABLED;
6031 6031
6032 6032 reg_snet_notices:
6033 6033 /*
6034 6034 	 * In the normal initialization sequence, set up the subnet
6035 6035 	 * notices handler after we've initialized the acache/
6036 6036 * mcache and started the async thread, both of which are required for
6037 6037 * the trap handler to function properly.
6038 6038 *
6039 6039 * Now that the async thread has been started (and we've already done
6040 6040 * a mac_register() during attach so mac_tx_update() can be called
6041 6041 * if necessary without any problem), we can enable the trap handler
6042 6042 * to queue requests to the async thread.
6043 6043 *
6044 6044 	 * In case of late hca initialization, the subnet notices handler
6045 6045 	 * will only handle MCG created/deleted events. The action performed
6046 6046 	 * as part of handling these events is to start the interface. So,
6047 6047 	 * the acache/mcache initialization is not a prerequisite for
6048 6048 	 * registering the subnet notices handler in such cases. Also, if we
6049 6049 	 * are in ibd_start() as a result of, say, some event handled after
6050 6050 	 * entering the late hca initialization phase, there is no need to
6051 6051 	 * register again.
6051 6051 */
6052 6052 if ((state->id_mac_state & IBD_DRV_SM_NOTICES_REGISTERED) == 0) {
6053 6053 ibt_register_subnet_notices(state->id_ibt_hdl,
6054 6054 ibd_snet_notices_handler, state);
6055 6055 mutex_enter(&state->id_trap_lock);
6056 6056 state->id_trap_stop = B_FALSE;
6057 6057 mutex_exit(&state->id_trap_lock);
6058 6058 state->id_mac_state |= IBD_DRV_SM_NOTICES_REGISTERED;
6059 6059 }
6060 6060
6061 6061 late_hca_init_return:
6062 6062 if (late_hca_init == 1) {
6063 6063 state->id_mac_state |= IBD_DRV_IN_LATE_HCA_INIT;
6064 6064 /*
6065 6065 * In case of late initialization, mark the link state as down,
6066 6066 		 * regardless of the actual link state as reported in the
6067 6067 * port_info.
6068 6068 */
6069 6069 state->id_link_state = LINK_STATE_DOWN;
6070 6070 mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr);
6071 6071 mac_link_update(state->id_mh, state->id_link_state);
6072 6072 return (DDI_SUCCESS);
6073 6073 }
6074 6074
6075 6075 if (state->id_enable_rc) {
6076 6076 if (state->rc_enable_srq) {
6077 6077 if (state->id_mac_state & IBD_DRV_RC_SRQ_ALLOCD) {
6078 6078 if (ibd_rc_repost_srq_free_list(state) !=
6079 6079 IBT_SUCCESS) {
6080 6080 err = ENOMEM;
6081 6081 goto start_fail;
6082 6082 }
6083 6083 } else {
6084 6084 /* Allocate SRQ resource */
6085 6085 if (ibd_rc_init_srq_list(state) !=
6086 6086 IBT_SUCCESS) {
6087 6087 err = ENOMEM;
6088 6088 goto start_fail;
6089 6089 }
6090 6090 state->id_mac_state |= IBD_DRV_RC_SRQ_ALLOCD;
6091 6091 }
6092 6092 }
6093 6093
6094 6094 if (ibd_rc_init_tx_largebuf_list(state) != IBT_SUCCESS) {
6095 6095 DPRINT(10, "ibd_start: ibd_rc_init_tx_largebuf_list() "
6096 6096 "failed");
6097 6097 err = ENOMEM;
6098 6098 goto start_fail;
6099 6099 }
6100 6100 state->id_mac_state |= IBD_DRV_RC_LARGEBUF_ALLOCD;
6101 6101
6102 6102 /* RC: begin to listen only after everything is available */
6103 6103 if (ibd_rc_listen(state) != IBT_SUCCESS) {
6104 6104 DPRINT(10, "ibd_start: ibd_rc_listen() failed");
6105 6105 err = EINVAL;
6106 6106 goto start_fail;
6107 6107 }
6108 6108 state->id_mac_state |= IBD_DRV_RC_LISTEN;
6109 6109 }
6110 6110
6111 6111 /*
6112 6112 * Indicate link status to GLDv3 and higher layers. By default,
6113 6113 * we assume we are in up state (which must have been true at
6114 6114 * least at the time the broadcast mcg's were probed); if there
6115 6115 * were any up/down transitions till the time we come here, the
6116 6116 	 * async handler will have updated the last known state, which we
6117 6117 * use to tell GLDv3. The async handler will not send any
6118 6118 * notifications to GLDv3 till we reach here in the initialization
6119 6119 * sequence.
6120 6120 */
6121 6121 mac_link_update(state->id_mh, state->id_link_state);
6122 6122 state->id_mac_state &= ~IBD_DRV_IN_LATE_HCA_INIT;
6123 6123 state->id_mac_state |= IBD_DRV_STARTED;
6124 6124
6125 6125 /* Start timer after everything is ready */
6126 6126 if (state->id_enable_rc) {
6127 6127 mutex_enter(&state->rc_timeout_lock);
6128 6128 state->rc_timeout_start = B_TRUE;
6129 6129 state->rc_timeout = timeout(ibd_rc_conn_timeout_call, state,
6130 6130 SEC_TO_TICK(ibd_rc_conn_timeout));
6131 6131 mutex_exit(&state->rc_timeout_lock);
6132 6132 state->id_mac_state |= IBD_DRV_RC_TIMEOUT;
6133 6133 }
6134 6134
6135 6135 return (DDI_SUCCESS);
6136 6136
6137 6137 start_fail:
6138 6138 /*
6139 6139 * If we ran into a problem during ibd_start() and ran into
6140 6140 * some other problem during undoing our partial work, we can't
6141 6141 * do anything about it. Ignore any errors we might get from
6142 6142 * ibd_undo_start() and just return the original error we got.
6143 6143 */
6144 6144 (void) ibd_undo_start(state, LINK_STATE_DOWN);
6145 6145 return (err);
6146 6146 }
6147 6147
6148 6148 /*
6149 6149 * GLDv3 entry point to stop hardware from receiving packets.
6150 6150 */
6151 6151 /*ARGSUSED*/
6152 6152 static void
6153 6153 ibd_m_stop(void *arg)
6154 6154 {
6155 6155 ibd_state_t *state = (ibd_state_t *)arg;
6156 6156
6157 6157 if (state->id_type == IBD_PORT_DRIVER)
6158 6158 return;
6159 6159
6160 6160 ibd_set_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);
6161 6161
6162 6162 (void) ibd_undo_start(state, state->id_link_state);
6163 6163
6164 6164 ibd_clr_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);
6165 6165 }
6166 6166
6167 6167 /*
6168 6168 * GLDv3 entry point to modify device's mac address. We do not
6169 6169 * allow address modifications.
6170 6170 */
6171 6171 static int
6172 6172 ibd_m_unicst(void *arg, const uint8_t *macaddr)
6173 6173 {
6174 6174 ibd_state_t *state = arg;
6175 6175
6176 6176 if (state->id_type == IBD_PORT_DRIVER)
6177 6177 return (EINVAL);
6178 6178
6179 6179 /*
6180 6180 * Don't bother even comparing the macaddr if we haven't
6181 6181 * completed ibd_m_start().
6182 6182 */
6183 6183 if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
6184 6184 return (0);
6185 6185
6186 6186 if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0)
6187 6187 return (0);
6188 6188 else
6189 6189 return (EINVAL);
6190 6190 }
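The comparison above works because the IPoIB hardware address is a fixed 20-byte (IPOIB_ADDRL) encoding that ibd_h2n_mac() packs during ibd_start(): a 4-byte QPN field followed by the 16-byte port GID (prefix, then GUID), in network byte order. A sketch of that layout, with the byte offsets assumed from the packing order used in ibd_start():

	/*
	 * ipoib_mac_t layout (20 bytes):
	 *   bytes  0..3   QPN field (IB_QPN_MASK for the broadcast address)
	 *   bytes  4..11  GID prefix
	 *   bytes 12..19  GID GUID
	 */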
6191 6191
6192 6192 /*
6193 6193 * The blocking part of the IBA join/leave operations are done out
6194 6194 * of here on the async thread.
6195 6195 */
6196 6196 static void
6197 6197 ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op)
6198 6198 {
6199 6199 DPRINT(3, "ibd_async_multicast : async_setmc op %d :"
6200 6200 "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid);
6201 6201
6202 6202 if (op == IBD_ASYNC_JOIN) {
6203 6203 if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) {
6204 6204 ibd_print_warn(state, "Join multicast group failed :"
6205 6205 "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
6206 6206 }
6207 6207 } else {
6208 6208 /*
6209 6209 * Here, we must search for the proper mcg_info and
6210 6210 * use that to leave the group.
6211 6211 */
6212 6212 ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL);
6213 6213 }
6214 6214 }
6215 6215
6216 6216 /*
6217 6217 * GLDv3 entry point for multicast enable/disable requests.
6218 6218 * This function queues the operation to the async thread and
6219 6219  * returns success for a valid multicast address.
6220 6220 */
6221 6221 static int
6222 6222 ibd_m_multicst(void *arg, boolean_t add, const uint8_t *mcmac)
6223 6223 {
6224 6224 ibd_state_t *state = (ibd_state_t *)arg;
6225 6225 ipoib_mac_t maddr, *mcast;
6226 6226 ib_gid_t mgid;
6227 6227 ibd_req_t *req;
6228 6228
6229 6229 if (state->id_type == IBD_PORT_DRIVER)
6230 6230 return (EINVAL);
6231 6231
6232 6232 /*
6233 6233 	 * If we haven't completed ibd_m_start(), the async thread won't
6234 6234 	 * have been started and id_bcaddr won't be set, so there's
6235 6235 * no point in continuing.
6236 6236 */
6237 6237 if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
6238 6238 return (0);
6239 6239
6240 6240 /*
6241 6241 * The incoming multicast address might not be aligned properly
6242 6242 * on a 4 byte boundary to be considered an ipoib_mac_t. We force
6243 6243 * it to look like one though, to get the offsets of the mc gid,
6244 6244 * since we know we are not going to dereference any values with
6245 6245 * the ipoib_mac_t pointer.
6246 6246 */
6247 6247 bcopy(mcmac, &maddr, sizeof (ipoib_mac_t));
6248 6248 mcast = &maddr;
6249 6249
6250 6250 /*
6251 6251 * Check validity of MCG address. We could additionally check
6252 6252 	 * that an enable/disable is not being issued on the "broadcast"
6253 6253 * mcg, but since this operation is only invokable by privileged
6254 6254 * programs anyway, we allow the flexibility to those dlpi apps.
6255 6255 * Note that we do not validate the "scope" of the IBA mcg.
6256 6256 */
6257 6257 if ((ntohl(mcast->ipoib_qpn) & IB_QPN_MASK) != IB_MC_QPN)
6258 6258 return (EINVAL);
6259 6259
6260 6260 /*
6261 6261 * fill in multicast pkey and scope
6262 6262 */
6263 6263 IBD_FILL_SCOPE_PKEY(mcast, state->id_scope, state->id_pkey);
6264 6264
6265 6265 /*
6266 6266 * If someone is trying to JOIN/LEAVE the broadcast group, we do
6267 6267 	 * nothing (i.e. we stay JOINed to the broadcast group joined in
6268 6268 	 * ibd_m_start()), to mimic ethernet behavior. IPv4 specifically
6269 6269 	 * requires us to be joined to broadcast groups at all times.
6270 6270 * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also
6271 6271 * depends on this.
6272 6272 */
6273 6273 if (bcmp(mcast, &state->id_bcaddr, IPOIB_ADDRL) == 0)
6274 6274 return (0);
6275 6275
6276 6276 ibd_n2h_gid(mcast, &mgid);
6277 6277 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
6278 6278 if (req == NULL)
6279 6279 return (ENOMEM);
6280 6280
6281 6281 req->rq_gid = mgid;
6282 6282
6283 6283 if (add) {
6284 6284 DPRINT(1, "ibd_m_multicst : %016llx:%016llx\n",
6285 6285 mgid.gid_prefix, mgid.gid_guid);
6286 6286 ibd_queue_work_slot(state, req, IBD_ASYNC_JOIN);
6287 6287 } else {
6288 6288 DPRINT(1, "ibd_m_multicst : unset_multicast : "
6289 6289 "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
6290 6290 ibd_queue_work_slot(state, req, IBD_ASYNC_LEAVE);
6291 6291 }
6292 6292 return (0);
6293 6293 }
6294 6294
6295 6295 /*
6296 6296  * The blocking part of the IBA promiscuous disable operation is
6297 6297  * done out of here on the async thread: leave every non-member
6298 6298  * mcg that was joined while promiscuous mode was on, and mark
6299 6299  * the promiscuous operation as not started.
6300 6300 */
6301 6301 static void
6302 6302 ibd_async_unsetprom(ibd_state_t *state)
6303 6303 {
6304 6304 ibd_mce_t *mce = list_head(&state->id_mc_non);
6305 6305 ib_gid_t mgid;
6306 6306
6307 6307 DPRINT(2, "ibd_async_unsetprom : async_unset_promisc");
6308 6308
6309 6309 while (mce != NULL) {
6310 6310 mgid = mce->mc_info.mc_adds_vect.av_dgid;
6311 6311 mce = list_next(&state->id_mc_non, mce);
6312 6312 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
6313 6313 }
6314 6314 state->id_prom_op = IBD_OP_NOTSTARTED;
6315 6315 }
6316 6316
6317 6317 /*
6318 6318  * The blocking part of the IBA promiscuous enable operation is
6319 6319  * done out of here on the async thread: query the fabric for all
6320 6320  * active multicast groups matching our pkey/scope/qkey/mtu and
6321 6321  * join each of them as a NonMember.
6322 6322 */
6323 6323 static void
6324 6324 ibd_async_setprom(ibd_state_t *state)
6325 6325 {
6326 6326 ibt_mcg_attr_t mcg_attr;
6327 6327 ibt_mcg_info_t *mcg_info;
6328 6328 ib_gid_t mgid;
6329 6329 uint_t numg;
6330 6330 int i;
6331 6331 char ret = IBD_OP_COMPLETED;
6332 6332
6333 6333 DPRINT(2, "ibd_async_setprom : async_set_promisc");
6334 6334
6335 6335 /*
6336 6336 * Obtain all active MC groups on the IB fabric with
6337 6337 * specified criteria (scope + Pkey + Qkey + mtu).
6338 6338 */
6339 6339 bzero(&mcg_attr, sizeof (mcg_attr));
6340 6340 mcg_attr.mc_pkey = state->id_pkey;
6341 6341 mcg_attr.mc_scope = state->id_scope;
6342 6342 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
6343 6343 mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu;
6344 6344 mcg_attr.mc_mtu_req.r_selector = IBT_EQU;
6345 6345 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) !=
6346 6346 IBT_SUCCESS) {
6347 6347 ibd_print_warn(state, "Could not get list of IBA multicast "
6348 6348 "groups");
6349 6349 ret = IBD_OP_ERRORED;
6350 6350 goto done;
6351 6351 }
6352 6352
6353 6353 /*
6354 6354 * Iterate over the returned mcg's and join as NonMember
6355 6355 * to the IP mcg's.
6356 6356 */
6357 6357 for (i = 0; i < numg; i++) {
6358 6358 /*
6359 6359 * Do a NonMember JOIN on the MC group.
6360 6360 */
6361 6361 mgid = mcg_info[i].mc_adds_vect.av_dgid;
6362 6362 if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL)
6363 6363 ibd_print_warn(state, "IBA promiscuous mode missed "
6364 6364 "multicast gid %016llx:%016llx",
6365 6365 (u_longlong_t)mgid.gid_prefix,
6366 6366 (u_longlong_t)mgid.gid_guid);
6367 6367 }
6368 6368
6369 6369 ibt_free_mcg_info(mcg_info, numg);
6370 6370 DPRINT(4, "ibd_async_setprom : async_set_promisc completes");
6371 6371 done:
6372 6372 state->id_prom_op = ret;
6373 6373 }
6374 6374
6375 6375 /*
6376 6376 * GLDv3 entry point for multicast promiscuous enable/disable requests.
6377 6377 * GLDv3 assumes phys state receives more packets than multi state,
6378 6378 * which is not true for IPoIB. Thus, treat the multi and phys
6379 6379 * promiscuous states the same way to work with GLDv3's assumption.
6380 6380 */
6381 6381 static int
6382 6382 ibd_m_promisc(void *arg, boolean_t on)
6383 6383 {
6384 6384 ibd_state_t *state = (ibd_state_t *)arg;
6385 6385 ibd_req_t *req;
6386 6386
6387 6387 if (state->id_type == IBD_PORT_DRIVER)
6388 6388 return (EINVAL);
6389 6389
6390 6390 /*
6391 6391 	 * The async thread won't have been started if we haven't
6392 6392 	 * completed ibd_m_start().
6393 6393 */
6394 6394 if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
6395 6395 return (0);
6396 6396
6397 6397 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
6398 6398 if (req == NULL)
6399 6399 return (ENOMEM);
6400 6400 if (on) {
6401 6401 DPRINT(1, "ibd_m_promisc : set_promisc : %d", on);
6402 6402 ibd_queue_work_slot(state, req, IBD_ASYNC_PROMON);
6403 6403 } else {
6404 6404 DPRINT(1, "ibd_m_promisc : unset_promisc");
6405 6405 ibd_queue_work_slot(state, req, IBD_ASYNC_PROMOFF);
6406 6406 }
6407 6407
6408 6408 return (0);
6409 6409 }
6410 6410
6411 6411 /*
6412 6412 * GLDv3 entry point for gathering statistics.
6413 6413 */
6414 6414 static int
6415 6415 ibd_m_stat(void *arg, uint_t stat, uint64_t *val)
6416 6416 {
6417 6417 ibd_state_t *state = (ibd_state_t *)arg;
6418 6418
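	/*
	 * Several of these counters aggregate the UD path totals with
	 * the Reliable Connected (RC) mode totals.
	 */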
6419 6419 switch (stat) {
6420 6420 case MAC_STAT_IFSPEED:
6421 6421 *val = state->id_link_speed;
6422 6422 break;
6423 6423 case MAC_STAT_MULTIRCV:
6424 6424 *val = state->id_multi_rcv;
6425 6425 break;
6426 6426 case MAC_STAT_BRDCSTRCV:
6427 6427 *val = state->id_brd_rcv;
6428 6428 break;
6429 6429 case MAC_STAT_MULTIXMT:
6430 6430 *val = state->id_multi_xmt;
6431 6431 break;
6432 6432 case MAC_STAT_BRDCSTXMT:
6433 6433 *val = state->id_brd_xmt;
6434 6434 break;
6435 6435 case MAC_STAT_RBYTES:
6436 6436 *val = state->id_rcv_bytes + state->rc_rcv_trans_byte
6437 6437 + state->rc_rcv_copy_byte;
6438 6438 break;
6439 6439 case MAC_STAT_IPACKETS:
6440 6440 *val = state->id_rcv_pkt + state->rc_rcv_trans_pkt
6441 6441 + state->rc_rcv_copy_pkt;
6442 6442 break;
6443 6443 case MAC_STAT_OBYTES:
6444 6444 *val = state->id_xmt_bytes + state->rc_xmt_bytes;
6445 6445 break;
6446 6446 case MAC_STAT_OPACKETS:
6447 6447 *val = state->id_xmt_pkt + state->rc_xmt_small_pkt +
6448 6448 state->rc_xmt_fragmented_pkt +
6449 6449 state->rc_xmt_map_fail_pkt + state->rc_xmt_map_succ_pkt;
6450 6450 break;
6451 6451 case MAC_STAT_OERRORS:
6452 6452 *val = state->id_ah_error; /* failed AH translation */
6453 6453 break;
6454 6454 case MAC_STAT_IERRORS:
6455 6455 *val = 0;
6456 6456 break;
6457 6457 case MAC_STAT_NOXMTBUF:
6458 6458 *val = state->id_tx_short + state->rc_swqe_short +
6459 6459 state->rc_xmt_buf_short;
6460 6460 break;
6461 6461 case MAC_STAT_NORCVBUF:
6462 6462 default:
6463 6463 return (ENOTSUP);
6464 6464 }
6465 6465
6466 6466 return (0);
6467 6467 }
6468 6468
6469 6469 static void
6470 6470 ibd_async_txsched(ibd_state_t *state)
6471 6471 {
6472 6472 ibd_resume_transmission(state);
6473 6473 }
6474 6474
6475 6475 static void
6476 6476 ibd_resume_transmission(ibd_state_t *state)
6477 6477 {
6478 6478 int flag;
6479 6479 int met_thresh = 0;
6480 6480 int thresh = 0;
6481 6481 int ret = -1;
6482 6482
6483 6483 mutex_enter(&state->id_sched_lock);
6484 6484 if (state->id_sched_needed & IBD_RSRC_SWQE) {
6485 6485 mutex_enter(&state->id_tx_list.dl_mutex);
6486 6486 mutex_enter(&state->id_tx_rel_list.dl_mutex);
6487 6487 met_thresh = state->id_tx_list.dl_cnt +
6488 6488 state->id_tx_rel_list.dl_cnt;
6489 6489 mutex_exit(&state->id_tx_rel_list.dl_mutex);
6490 6490 mutex_exit(&state->id_tx_list.dl_mutex);
6491 6491 thresh = IBD_FREE_SWQES_THRESH;
6492 6492 flag = IBD_RSRC_SWQE;
6493 6493 } else if (state->id_sched_needed & IBD_RSRC_LSOBUF) {
6494 6494 ASSERT(state->id_lso != NULL);
6495 6495 mutex_enter(&state->id_lso_lock);
6496 6496 met_thresh = state->id_lso->bkt_nfree;
6497 6497 thresh = IBD_FREE_LSOS_THRESH;
6498 6498 mutex_exit(&state->id_lso_lock);
6499 6499 flag = IBD_RSRC_LSOBUF;
6500 6500 if (met_thresh > thresh)
6501 6501 state->id_sched_lso_cnt++;
6502 6502 }
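	/*
	 * If enough resources have been freed, clear the corresponding
	 * flag and ask GLDv3 to resume transmission.
	 */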
6503 6503 if (met_thresh > thresh) {
6504 6504 state->id_sched_needed &= ~flag;
6505 6505 state->id_sched_cnt++;
6506 6506 ret = 0;
6507 6507 }
6508 6508 mutex_exit(&state->id_sched_lock);
6509 6509
6510 6510 if (ret == 0)
6511 6511 mac_tx_update(state->id_mh);
6512 6512 }
6513 6513
6514 6514 /*
6515 6515  * Release the send wqe back into the free list.
6516 6516 */
6517 6517 static void
6518 6518 ibd_release_swqe(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail, int n)
6519 6519 {
6520 6520 /*
6521 6521 * Add back on Tx list for reuse.
6522 6522 */
6523 6523 ASSERT(tail->swqe_next == NULL);
6524 6524 mutex_enter(&state->id_tx_rel_list.dl_mutex);
6525 6525 state->id_tx_rel_list.dl_pending_sends = B_FALSE;
6526 6526 tail->swqe_next = state->id_tx_rel_list.dl_head;
6527 6527 state->id_tx_rel_list.dl_head = SWQE_TO_WQE(head);
6528 6528 state->id_tx_rel_list.dl_cnt += n;
6529 6529 mutex_exit(&state->id_tx_rel_list.dl_mutex);
6530 6530 }
6531 6531
6532 6532 /*
6533 6533  * Acquire a send wqe from the free list.
6534 6534  * Returns the acquired swqe, or NULL if none is available.
6535 6535 */
6536 6536 static ibd_swqe_t *
6537 6537 ibd_acquire_swqe(ibd_state_t *state)
6538 6538 {
6539 6539 ibd_swqe_t *wqe;
6540 6540
6541 6541 mutex_enter(&state->id_tx_rel_list.dl_mutex);
6542 6542 if (state->id_tx_rel_list.dl_head != NULL) {
6543 6543 /* transfer id_tx_rel_list to id_tx_list */
6544 6544 state->id_tx_list.dl_head =
6545 6545 state->id_tx_rel_list.dl_head;
6546 6546 state->id_tx_list.dl_cnt =
6547 6547 state->id_tx_rel_list.dl_cnt;
6548 6548 state->id_tx_list.dl_pending_sends = B_FALSE;
6549 6549
6550 6550 /* clear id_tx_rel_list */
6551 6551 state->id_tx_rel_list.dl_head = NULL;
6552 6552 state->id_tx_rel_list.dl_cnt = 0;
6553 6553 mutex_exit(&state->id_tx_rel_list.dl_mutex);
6554 6554
6555 6555 wqe = WQE_TO_SWQE(state->id_tx_list.dl_head);
6556 6556 state->id_tx_list.dl_cnt -= 1;
6557 6557 state->id_tx_list.dl_head = wqe->swqe_next;
6558 6558 } else { /* no free swqe */
6559 6559 mutex_exit(&state->id_tx_rel_list.dl_mutex);
6560 6560 state->id_tx_list.dl_pending_sends = B_TRUE;
6561 6561 DPRINT(5, "ibd_acquire_swqe: out of Tx wqe");
6562 6562 state->id_tx_short++;
6563 6563 wqe = NULL;
6564 6564 }
6565 6565 return (wqe);
6566 6566 }
6567 6567
6568 6568 static int
6569 6569 ibd_setup_lso(ibd_swqe_t *node, mblk_t *mp, uint32_t mss,
6570 6570 ibt_ud_dest_hdl_t ud_dest)
6571 6571 {
6572 6572 mblk_t *nmp;
6573 6573 int iph_len, tcph_len;
6574 6574 ibt_wr_lso_t *lso;
6575 6575 uintptr_t ip_start, tcp_start;
6576 6576 uint8_t *dst;
6577 6577 uint_t pending, mblen;
6578 6578
6579 6579 /*
6580 6580 * The code in ibd_send would've set 'wr.ud.udwr_dest' by default;
6581 6581 * we need to adjust it here for lso.
6582 6582 */
6583 6583 lso = &(node->w_swr.wr.ud_lso);
6584 6584 lso->lso_ud_dest = ud_dest;
6585 6585 lso->lso_mss = mss;
6586 6586
6587 6587 /*
6588 6588 * Calculate the LSO header size and set it in the UD LSO structure.
6589 6589 * Note that the only assumption we make is that each of the IPoIB,
6590 6590 * IP and TCP headers will be contained in a single mblk fragment;
6591 6591 * together, the headers may span multiple mblk fragments.
6592 6592 */
6593 6593 nmp = mp;
6594 6594 ip_start = (uintptr_t)(nmp->b_rptr) + IPOIB_HDRSIZE;
6595 6595 if (ip_start >= (uintptr_t)(nmp->b_wptr)) {
6596 6596 ip_start = (uintptr_t)nmp->b_cont->b_rptr
6597 6597 + (ip_start - (uintptr_t)(nmp->b_wptr));
6598 6598 nmp = nmp->b_cont;
6599 6599
6600 6600 }
6601 6601 iph_len = IPH_HDR_LENGTH((ipha_t *)ip_start);
6602 6602
6603 6603 tcp_start = ip_start + iph_len;
6604 6604 if (tcp_start >= (uintptr_t)(nmp->b_wptr)) {
6605 6605 tcp_start = (uintptr_t)nmp->b_cont->b_rptr
6606 6606 + (tcp_start - (uintptr_t)(nmp->b_wptr));
6607 6607 nmp = nmp->b_cont;
6608 6608 }
6609 6609 tcph_len = TCP_HDR_LENGTH((tcph_t *)tcp_start);
6610 6610 lso->lso_hdr_sz = IPOIB_HDRSIZE + iph_len + tcph_len;
6611 6611
6612 6612 /*
6613 6613 * If the lso header fits entirely within a single mblk fragment,
6614 6614 * we'll avoid an additional copy of the lso header here and just
6615 6615 * pass the b_rptr of the mblk directly.
6616 6616 *
6617 6617 * If this isn't true, we'd have to allocate for it explicitly.
6618 6618 */
6619 6619 if (lso->lso_hdr_sz <= MBLKL(mp)) {
6620 6620 lso->lso_hdr = mp->b_rptr;
6621 6621 } else {
6622 6622 /* On work completion, remember to free this allocated hdr */
6623 6623 lso->lso_hdr = kmem_zalloc(lso->lso_hdr_sz, KM_NOSLEEP);
6624 6624 if (lso->lso_hdr == NULL) {
6625 6625 DPRINT(10, "ibd_setup_lso: couldn't allocate lso hdr, "
6626 6626 "sz = %d", lso->lso_hdr_sz);
6627 6627 lso->lso_hdr_sz = 0;
6628 6628 lso->lso_mss = 0;
6629 6629 return (-1);
6630 6630 }
6631 6631 }
6632 6632
6633 6633 /*
6634 6634 * Copy in the lso header only if we need to
6635 6635 */
6636 6636 if (lso->lso_hdr != mp->b_rptr) {
6637 6637 dst = lso->lso_hdr;
6638 6638 pending = lso->lso_hdr_sz;
6639 6639
6640 6640 for (nmp = mp; nmp && pending; nmp = nmp->b_cont) {
6641 6641 mblen = MBLKL(nmp);
6642 6642 if (pending > mblen) {
6643 6643 bcopy(nmp->b_rptr, dst, mblen);
6644 6644 dst += mblen;
6645 6645 pending -= mblen;
6646 6646 } else {
6647 6647 bcopy(nmp->b_rptr, dst, pending);
6648 6648 break;
6649 6649 }
6650 6650 }
6651 6651 }
6652 6652
6653 6653 return (0);
6654 6654 }
6655 6655
6656 6656 static void
6657 6657 ibd_free_lsohdr(ibd_swqe_t *node, mblk_t *mp)
6658 6658 {
6659 6659 ibt_wr_lso_t *lso;
6660 6660
6661 6661 if ((!node) || (!mp))
6662 6662 return;
6663 6663
6664 6664 /*
6665 6665 * Free any header space that we might've allocated if we
6666 6666 * did an LSO
6667 6667 */
6668 6668 if (node->w_swr.wr_opcode == IBT_WRC_SEND_LSO) {
6669 6669 lso = &(node->w_swr.wr.ud_lso);
6670 6670 if ((lso->lso_hdr) && (lso->lso_hdr != mp->b_rptr)) {
6671 6671 kmem_free(lso->lso_hdr, lso->lso_hdr_sz);
6672 6672 lso->lso_hdr = NULL;
6673 6673 lso->lso_hdr_sz = 0;
6674 6674 }
6675 6675 }
6676 6676 }
6677 6677
6678 6678 static void
6679 6679 ibd_post_send(ibd_state_t *state, ibd_swqe_t *node)
6680 6680 {
6681 6681 uint_t i;
6682 6682 uint_t num_posted;
6683 6683 uint_t n_wrs;
6684 6684 ibt_status_t ibt_status;
6685 6685 ibt_send_wr_t wrs[IBD_MAX_TX_POST_MULTIPLE];
6686 6686 ibd_swqe_t *tx_head, *elem;
6687 6687 ibd_swqe_t *nodes[IBD_MAX_TX_POST_MULTIPLE];
6688 6688
6689 6689 /* post the one request, then check for more */
6690 6690 ibt_status = ibt_post_send(state->id_chnl_hdl,
6691 6691 &node->w_swr, 1, NULL);
6692 6692 if (ibt_status != IBT_SUCCESS) {
6693 6693 ibd_print_warn(state, "ibd_post_send: "
6694 6694 "posting one wr failed: ret=%d", ibt_status);
6695 6695 ibd_tx_cleanup(state, node);
6696 6696 }
6697 6697
6698 6698 tx_head = NULL;
6699 6699 for (;;) {
6700 6700 if (tx_head == NULL) {
6701 6701 mutex_enter(&state->id_txpost_lock);
6702 6702 tx_head = state->id_tx_head;
6703 6703 if (tx_head == NULL) {
6704 6704 state->id_tx_busy = 0;
6705 6705 mutex_exit(&state->id_txpost_lock);
6706 6706 return;
6707 6707 }
6708 6708 state->id_tx_head = NULL;
6709 6709 mutex_exit(&state->id_txpost_lock);
6710 6710 }
6711 6711
6712 6712 /*
6713 6713 * Collect pending requests, IBD_MAX_TX_POST_MULTIPLE wrs
6714 6714 * at a time if possible, and keep posting them.
6715 6715 */
6716 6716 for (n_wrs = 0, elem = tx_head;
6717 6717 (elem) && (n_wrs < IBD_MAX_TX_POST_MULTIPLE);
6718 6718 elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) {
6719 6719 nodes[n_wrs] = elem;
6720 6720 wrs[n_wrs] = elem->w_swr;
6721 6721 }
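		/*
		 * Remember where we stopped; any wrs not collected in this
		 * batch are picked up on the next pass of the outer loop.
		 */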
6722 6722 tx_head = elem;
6723 6723
6724 6724 ASSERT(n_wrs != 0);
6725 6725
6726 6726 /*
6727 6727 * If posting fails for some reason, we'll never receive
6728 6728 * completion intimation, so we'll need to cleanup. But
6729 6729 * we need to make sure we don't clean up nodes whose
6730 6730 * wrs have been successfully posted. We assume that the
6731 6731 * hca driver returns on the first failure to post and
6732 6732 * therefore the first 'num_posted' entries don't need
6733 6733 * cleanup here.
6734 6734 */
6735 6735 num_posted = 0;
6736 6736 ibt_status = ibt_post_send(state->id_chnl_hdl,
6737 6737 wrs, n_wrs, &num_posted);
6738 6738 if (ibt_status != IBT_SUCCESS) {
6739 6739 ibd_print_warn(state, "ibd_post_send: "
6740 6740 "posting multiple wrs failed: "
6741 6741 "requested=%d, done=%d, ret=%d",
6742 6742 n_wrs, num_posted, ibt_status);
6743 6743
6744 6744 for (i = num_posted; i < n_wrs; i++)
6745 6745 ibd_tx_cleanup(state, nodes[i]);
6746 6746 }
6747 6747 }
6748 6748 }
6749 6749
6750 6750 static int
6751 6751 ibd_prepare_sgl(ibd_state_t *state, mblk_t *mp, ibd_swqe_t *node,
6752 6752 uint_t lsohdr_sz)
6753 6753 {
6754 6754 ibt_wr_ds_t *sgl;
6755 6755 ibt_status_t ibt_status;
6756 6756 mblk_t *nmp;
6757 6757 mblk_t *data_mp;
6758 6758 uchar_t *bufp;
6759 6759 size_t blksize;
6760 6760 size_t skip;
6761 6761 size_t avail;
6762 6762 uint_t pktsize;
6763 6763 uint_t frag_len;
6764 6764 uint_t pending_hdr;
6765 6765 int nmblks;
6766 6766 int i;
6767 6767
6768 6768 /*
6769 6769 * Let's skip ahead to the data if this is LSO
6770 6770 */
6771 6771 data_mp = mp;
6772 6772 pending_hdr = 0;
6773 6773 if (lsohdr_sz) {
6774 6774 pending_hdr = lsohdr_sz;
6775 6775 for (nmp = mp; nmp; nmp = nmp->b_cont) {
6776 6776 frag_len = nmp->b_wptr - nmp->b_rptr;
6777 6777 if (frag_len > pending_hdr)
6778 6778 break;
6779 6779 pending_hdr -= frag_len;
6780 6780 }
6781 6781 data_mp = nmp; /* start of data past lso header */
6782 6782 ASSERT(data_mp != NULL);
6783 6783 }
6784 6784
6785 6785 /*
6786 6786 * Calculate the size of message data and number of msg blocks
6787 6787 */
6788 6788 pktsize = 0;
6789 6789 for (nmblks = 0, nmp = data_mp; nmp != NULL;
6790 6790 nmp = nmp->b_cont, nmblks++) {
6791 6791 pktsize += MBLKL(nmp);
6792 6792 }
6793 6793 pktsize -= pending_hdr;
6794 6794
6795 6795 /*
6796 6796 * We only do ibt_map_mem_iov() if the pktsize is above the
6797 6797 * "copy-threshold", and if the number of mp fragments is less than
6798 6798 * the maximum acceptable.
6799 6799 */
6800 6800 if ((state->id_hca_res_lkey_capab) &&
6801 6801 (pktsize > state->id_ud_tx_copy_thresh) &&
6802 6802 (nmblks < state->id_max_sqseg_hiwm)) {
6803 6803 ibt_iov_t iov_arr[IBD_MAX_SQSEG];
6804 6804 ibt_iov_attr_t iov_attr;
6805 6805
6806 6806 iov_attr.iov_as = NULL;
6807 6807 iov_attr.iov = iov_arr;
6808 6808 iov_attr.iov_buf = NULL;
6809 6809 iov_attr.iov_list_len = nmblks;
6810 6810 iov_attr.iov_wr_nds = state->id_max_sqseg;
6811 6811 iov_attr.iov_lso_hdr_sz = lsohdr_sz;
6812 6812 iov_attr.iov_flags = IBT_IOV_SLEEP;
6813 6813
6814 6814 for (nmp = data_mp, i = 0; i < nmblks; i++, nmp = nmp->b_cont) {
6815 6815 iov_arr[i].iov_addr = (caddr_t)(void *)nmp->b_rptr;
6816 6816 iov_arr[i].iov_len = MBLKL(nmp);
6817 6817 if (i == 0) {
6818 6818 iov_arr[i].iov_addr += pending_hdr;
6819 6819 iov_arr[i].iov_len -= pending_hdr;
6820 6820 }
6821 6821 }
6822 6822
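		/*
		 * Map the mblk fragments for DMA; on success, the work
		 * request's sgl points directly at the mapped fragments.
		 */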
6823 6823 node->w_buftype = IBD_WQE_MAPPED;
6824 6824 node->w_swr.wr_sgl = node->w_sgl;
6825 6825
6826 6826 ibt_status = ibt_map_mem_iov(state->id_hca_hdl, &iov_attr,
6827 6827 (ibt_all_wr_t *)&node->w_swr, &node->w_mi_hdl);
6828 6828 if (ibt_status != IBT_SUCCESS) {
6829 6829 ibd_print_warn(state, "ibd_send: ibt_map_mem_iov "
6830 6830 "failed, nmblks=%d, ret=%d\n", nmblks, ibt_status);
6831 6831 goto ibd_copy_path;
6832 6832 }
6833 6833
6834 6834 return (0);
6835 6835 }
6836 6836
6837 6837 ibd_copy_path:
6838 6838 if (pktsize <= state->id_tx_buf_sz) {
6839 6839 node->swqe_copybuf.ic_sgl.ds_len = pktsize;
6840 6840 node->w_swr.wr_nds = 1;
6841 6841 node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl;
6842 6842 node->w_buftype = IBD_WQE_TXBUF;
6843 6843
6844 6844 /*
6845 6845 * Even though this is the copy path for transfers less than
6846 6846 * id_tx_buf_sz, it could still be an LSO packet. If so, it
6847 6847 * is possible the first data mblk fragment (data_mp) still
6848 6848 * contains part of the LSO header that we need to skip.
6849 6849 */
6850 6850 bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va;
6851 6851 for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) {
6852 6852 blksize = MBLKL(nmp) - pending_hdr;
6853 6853 bcopy(nmp->b_rptr + pending_hdr, bufp, blksize);
6854 6854 bufp += blksize;
6855 6855 pending_hdr = 0;
6856 6856 }
6857 6857
6858 6858 return (0);
6859 6859 }
6860 6860
6861 6861 /*
6862 6862 * Copy path for transfers greater than id_tx_buf_sz
6863 6863 */
6864 6864 node->w_swr.wr_sgl = node->w_sgl;
6865 6865 if (ibd_acquire_lsobufs(state, pktsize,
6866 6866 node->w_swr.wr_sgl, &(node->w_swr.wr_nds)) != 0) {
6867 6867 DPRINT(10, "ibd_prepare_sgl: lso bufs acquire failed");
6868 6868 return (-1);
6869 6869 }
6870 6870 node->w_buftype = IBD_WQE_LSOBUF;
6871 6871
6872 6872 /*
6873 6873 * Copy the larger-than-id_tx_buf_sz packet into a set of
6874 6874 * fixed-sized, pre-mapped LSO buffers. Note that we might
6875 6875 * need to skip part of the LSO header in the first fragment
6876 6876 * as before.
6877 6877 */
6878 6878 nmp = data_mp;
6879 6879 skip = pending_hdr;
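	/*
	 * Fill each LSO buffer in turn from the remaining mblk
	 * fragments, skipping any leading LSO header bytes.
	 */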
6880 6880 for (i = 0; i < node->w_swr.wr_nds; i++) {
6881 6881 sgl = node->w_swr.wr_sgl + i;
6882 6882 bufp = (uchar_t *)(uintptr_t)sgl->ds_va;
6883 6883 avail = IBD_LSO_BUFSZ;
6884 6884 while (nmp && avail) {
6885 6885 blksize = MBLKL(nmp) - skip;
6886 6886 if (blksize > avail) {
6887 6887 bcopy(nmp->b_rptr + skip, bufp, avail);
6888 6888 skip += avail;
6889 6889 avail = 0;
6890 6890 } else {
6891 6891 bcopy(nmp->b_rptr + skip, bufp, blksize);
6892 6892 skip = 0;
6893 6893 avail -= blksize;
6894 6894 bufp += blksize;
6895 6895 nmp = nmp->b_cont;
6896 6896 }
6897 6897 }
6898 6898 }
6899 6899
6900 6900 return (0);
6901 6901 }
6902 6902
6903 6903 /*
6904 6904 * Schedule a completion queue polling to reap the resource we're
6905 6905 * short on. If we implement the change to reap tx completions
6906 6906 * in a separate thread, we'll need to wake up that thread here.
6907 6907 */
6908 6908 static int
6909 6909 ibd_sched_poll(ibd_state_t *state, int resource_type, int q_flag)
6910 6910 {
6911 6911 ibd_req_t *req;
6912 6912
6913 6913 mutex_enter(&state->id_sched_lock);
6914 6914 state->id_sched_needed |= resource_type;
6915 6915 mutex_exit(&state->id_sched_lock);
6916 6916
6917 6917 /*
6918 6918 * If we are asked to queue a work entry, we need to do it
6919 6919 */
6920 6920 if (q_flag) {
6921 6921 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
6922 6922 if (req == NULL)
6923 6923 return (-1);
6924 6924
6925 6925 ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED);
6926 6926 }
6927 6927
6928 6928 return (0);
6929 6929 }
6930 6930
6931 6931 /*
6932 6932 * The passed in packet has this format:
6933 6933 * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data
6934 6934 */
6935 6935 static boolean_t
6936 6936 ibd_send(ibd_state_t *state, mblk_t *mp)
6937 6937 {
6938 6938 ibd_ace_t *ace;
6939 6939 ibd_swqe_t *node;
6940 6940 ipoib_mac_t *dest;
6941 6941 ib_header_info_t *ipibp;
6942 6942 ip6_t *ip6h;
6943 6943 uint_t pktsize;
6944 6944 uint32_t mss;
6945 6945 uint32_t hckflags;
6946 6946 uint32_t lsoflags = 0;
6947 6947 uint_t lsohdr_sz = 0;
6948 6948 int ret, len;
6949 6949 boolean_t dofree = B_FALSE;
6950 6950 boolean_t rc;
6951 6951 /* if (rc_chan == NULL) send by UD; else send by RC; */
6952 6952 ibd_rc_chan_t *rc_chan;
6953 6953 int nmblks;
6954 6954 mblk_t *nmp;
6955 6955
6956 6956 /*
6957 6957 * If we aren't done with the device initialization and start,
6958 6958 * we shouldn't be here.
6959 6959 */
6960 6960 if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
6961 6961 return (B_FALSE);
6962 6962
6963 6963 /*
6964 6964 * Obtain an address handle for the destination.
6965 6965 */
6966 6966 ipibp = (ib_header_info_t *)mp->b_rptr;
6967 6967 dest = (ipoib_mac_t *)&ipibp->ib_dst;
6968 6968 if ((ntohl(dest->ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
6969 6969 IBD_FILL_SCOPE_PKEY(dest, state->id_scope, state->id_pkey);
6970 6970
6971 6971 rc_chan = NULL;
6972 6972 ace = ibd_acache_lookup(state, dest, &ret, 1);
6973 6973 if (state->id_enable_rc && (ace != NULL) &&
6974 6974 (ace->ac_mac.ipoib_qpn != htonl(IB_MC_QPN))) {
6975 6975 if (ace->ac_chan == NULL) {
6976 6976 state->rc_null_conn++;
6977 6977 } else {
6978 6978 if (ace->ac_chan->chan_state ==
6979 6979 IBD_RC_STATE_ACT_ESTAB) {
6980 6980 rc_chan = ace->ac_chan;
6981 6981 rc_chan->is_used = B_TRUE;
6982 6982 mutex_enter(&rc_chan->tx_wqe_list.dl_mutex);
6983 6983 node = WQE_TO_SWQE(
6984 6984 rc_chan->tx_wqe_list.dl_head);
6985 6985 if (node != NULL) {
6986 6986 rc_chan->tx_wqe_list.dl_cnt -= 1;
6987 6987 rc_chan->tx_wqe_list.dl_head =
6988 6988 node->swqe_next;
6989 6989 } else {
6990 6990 node = ibd_rc_acquire_swqes(rc_chan);
6991 6991 }
6992 6992 mutex_exit(&rc_chan->tx_wqe_list.dl_mutex);
6993 6993
6994 6994 if (node == NULL) {
6995 6995 state->rc_swqe_short++;
6996 6996 mutex_enter(&state->id_sched_lock);
6997 6997 state->id_sched_needed |=
6998 6998 IBD_RSRC_RC_SWQE;
6999 6999 mutex_exit(&state->id_sched_lock);
7000 7000 ibd_dec_ref_ace(state, ace);
7001 7001 return (B_FALSE);
7002 7002 }
7003 7003 } else {
7004 7004 state->rc_no_estab_conn++;
7005 7005 }
7006 7006 }
7007 7007 }
7008 7008
7009 7009 if (rc_chan == NULL) {
7010 7010 mutex_enter(&state->id_tx_list.dl_mutex);
7011 7011 node = WQE_TO_SWQE(state->id_tx_list.dl_head);
7012 7012 if (node != NULL) {
7013 7013 state->id_tx_list.dl_cnt -= 1;
7014 7014 state->id_tx_list.dl_head = node->swqe_next;
7015 7015 } else {
7016 7016 node = ibd_acquire_swqe(state);
7017 7017 }
7018 7018 mutex_exit(&state->id_tx_list.dl_mutex);
7019 7019 if (node == NULL) {
7020 7020 /*
7021 7021 * If we don't have an swqe available, schedule a
7022 7022 * transmit completion queue cleanup and hold off on
7023 7023 * sending more packets until we have some free swqes
7024 7024 */
7025 7025 if (ibd_sched_poll(state, IBD_RSRC_SWQE, 0) == 0) {
7026 7026 if (ace != NULL) {
7027 7027 ibd_dec_ref_ace(state, ace);
7028 7028 }
7029 7029 return (B_FALSE);
7030 7030 }
7031 7031
7032 7032 /*
7033 7033 * If a poll cannot be scheduled, we have no choice but
7034 7034 * to drop this packet
7035 7035 */
7036 7036 ibd_print_warn(state, "ibd_send: no swqe, pkt drop");
7037 7037 if (ace != NULL) {
7038 7038 ibd_dec_ref_ace(state, ace);
7039 7039 }
7040 7040 return (B_TRUE);
7041 7041 }
7042 7042 }
7043 7043
7044 7044 /*
7045 7045 * Initialize the commonly used fields in swqe to NULL to protect
7046 7046 * against ibd_tx_cleanup accidentally misinterpreting these on a
7047 7047 * failure.
7048 7048 */
7049 7049 node->swqe_im_mblk = NULL;
7050 7050 node->w_swr.wr_nds = 0;
7051 7051 node->w_swr.wr_sgl = NULL;
7052 7052 node->w_swr.wr_opcode = IBT_WRC_SEND;
7053 7053
7054 7054 /*
7055 7055 * Calculate the size of message data and number of msg blocks
7056 7056 */
7057 7057 pktsize = 0;
7058 7058 for (nmblks = 0, nmp = mp; nmp != NULL;
7059 7059 nmp = nmp->b_cont, nmblks++) {
7060 7060 pktsize += MBLKL(nmp);
7061 7061 }
7062 7062
7063 7063 if (bcmp(&ipibp->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
7064 7064 atomic_inc_64(&state->id_brd_xmt);
7065 7065 else if ((ntohl(ipibp->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
7066 7066 atomic_inc_64(&state->id_multi_xmt);
7067 7067
7068 7068 if (ace != NULL) {
7069 7069 node->w_ahandle = ace;
7070 7070 node->w_swr.wr.ud.udwr_dest = ace->ac_dest;
7071 7071 } else {
7072 7072 DPRINT(5,
7073 7073 "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X",
7074 7074 ((ret == EFAULT) ? "failed" : "queued"),
7075 7075 htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]),
7076 7076 htonl(dest->ipoib_gidpref[1]),
7077 7077 htonl(dest->ipoib_gidsuff[0]),
7078 7078 htonl(dest->ipoib_gidsuff[1]));
7079 7079 state->rc_ace_not_found++;
7080 7080 node->w_ahandle = NULL;
7081 7081
7082 7082 /*
7083 7083 		 * If ibd_acache_lookup() returns EFAULT, it means ibd
7084 7084 		 * cannot find a path for the specified dest address; we
7085 7085 		 * should drop such a packet.  We should also drop the
7086 7086 		 * packet if we cannot schedule a poll via the
7087 7087 		 * async thread.  For the normal case, ibd will return the
7088 7088 		 * packet to the upper layer and wait for the AH to be created.
7089 7089 *
7090 7090 * Note that we always queue a work slot entry for the async
7091 7091 * thread when we fail AH lookup (even in intr mode); this is
7092 7092 * due to the convoluted way the code currently looks for AH.
7093 7093 */
7094 7094 if (ret == EFAULT) {
7095 7095 dofree = B_TRUE;
7096 7096 rc = B_TRUE;
7097 7097 } else if (ibd_sched_poll(state, IBD_RSRC_SWQE, 1) != 0) {
7098 7098 dofree = B_TRUE;
7099 7099 rc = B_TRUE;
7100 7100 } else {
7101 7101 dofree = B_FALSE;
7102 7102 rc = B_FALSE;
7103 7103 }
7104 7104 goto ibd_send_fail;
7105 7105 }
7106 7106
7107 7107 /*
7108 7108 * For ND6 packets, padding is at the front of the source lladdr.
7109 7109 * Insert the padding at front.
7110 7110 */
7111 7111 if (ntohs(ipibp->ipib_rhdr.ipoib_type) == ETHERTYPE_IPV6) {
7112 7112 if (MBLKL(mp) < sizeof (ib_header_info_t) + IPV6_HDR_LEN) {
7113 7113 if (!pullupmsg(mp, IPV6_HDR_LEN +
7114 7114 sizeof (ib_header_info_t))) {
7115 7115 DPRINT(10, "ibd_send: pullupmsg failure ");
7116 7116 dofree = B_TRUE;
7117 7117 rc = B_TRUE;
7118 7118 goto ibd_send_fail;
7119 7119 }
7120 7120 ipibp = (ib_header_info_t *)mp->b_rptr;
7121 7121 }
7122 7122 ip6h = (ip6_t *)((uchar_t *)ipibp +
7123 7123 sizeof (ib_header_info_t));
7124 7124 len = ntohs(ip6h->ip6_plen);
7125 7125 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
7126 7126 mblk_t *pad;
7127 7127
7128 7128 pad = allocb(4, 0);
7129 7129 pad->b_wptr = (uchar_t *)pad->b_rptr + 4;
7130 7130 linkb(mp, pad);
7131 7131 if (MBLKL(mp) < sizeof (ib_header_info_t) +
7132 7132 IPV6_HDR_LEN + len + 4) {
7133 7133 if (!pullupmsg(mp, sizeof (ib_header_info_t) +
7134 7134 IPV6_HDR_LEN + len + 4)) {
7135 7135 DPRINT(10, "ibd_send: pullupmsg "
7136 7136 "failure ");
7137 7137 dofree = B_TRUE;
7138 7138 rc = B_TRUE;
7139 7139 goto ibd_send_fail;
7140 7140 }
7141 7141 ip6h = (ip6_t *)((uchar_t *)mp->b_rptr +
7142 7142 sizeof (ib_header_info_t));
7143 7143 }
7144 7144
7145 7145 /* LINTED: E_CONSTANT_CONDITION */
7146 7146 IBD_PAD_NSNA(ip6h, len, IBD_SEND);
7147 7147 }
7148 7148 }
7149 7149
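	/*
	 * Advance past the prepended address bytes; they are not
	 * transmitted as part of the payload.
	 */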
7150 7150 ASSERT(mp->b_wptr - mp->b_rptr >= sizeof (ib_addrs_t));
7151 7151 mp->b_rptr += sizeof (ib_addrs_t);
7152 7152 pktsize -= sizeof (ib_addrs_t);
7153 7153
7154 7154 if (rc_chan) { /* send in RC mode */
7155 7155 ibt_iov_t iov_arr[IBD_MAX_SQSEG];
7156 7156 ibt_iov_attr_t iov_attr;
7157 7157 uint_t i;
7158 7158 size_t blksize;
7159 7159 uchar_t *bufp;
7160 7160 ibd_rc_tx_largebuf_t *lbufp;
7161 7161
7162 7162 atomic_add_64(&state->rc_xmt_bytes, pktsize);
7163 7163
7164 7164 /*
7165 7165 		 * The upper layer does the Tx checksum, so we don't need
7166 7166 		 * to do any checksum here.
7167 7167 */
7168 7168 ASSERT(node->w_swr.wr_trans == IBT_RC_SRV);
7169 7169
7170 7170 /*
7171 7171 * We only do ibt_map_mem_iov() if the pktsize is above
7172 7172 * the "copy-threshold", and if the number of mp
7173 7173 * fragments is less than the maximum acceptable.
7174 7174 */
7175 7175 if (pktsize <= state->id_rc_tx_copy_thresh) {
7176 7176 atomic_inc_64(&state->rc_xmt_small_pkt);
7177 7177 /*
7178 7178 			 * Only unicast packets are processed in Reliable
7179 7179 			 * Connected mode.
7180 7180 */
7181 7181 node->swqe_copybuf.ic_sgl.ds_len = pktsize;
7182 7182 node->w_swr.wr_nds = 1;
7183 7183 node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl;
7184 7184 node->w_buftype = IBD_WQE_TXBUF;
7185 7185
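			/*
			 * Copy the whole packet into the swqe's pre-mapped
			 * copybuf.
			 */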
7186 7186 bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va;
7187 7187 for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
7188 7188 blksize = MBLKL(nmp);
7189 7189 bcopy(nmp->b_rptr, bufp, blksize);
7190 7190 bufp += blksize;
7191 7191 }
7192 7192 freemsg(mp);
7193 7193 ASSERT(node->swqe_im_mblk == NULL);
7194 7194 } else {
7195 7195 if ((state->rc_enable_iov_map) &&
7196 7196 (nmblks < state->rc_max_sqseg_hiwm)) {
7197 7197
7198 7198 /* do ibt_map_mem_iov() */
7199 7199 iov_attr.iov_as = NULL;
7200 7200 iov_attr.iov = iov_arr;
7201 7201 iov_attr.iov_buf = NULL;
7202 7202 iov_attr.iov_wr_nds = state->rc_tx_max_sqseg;
7203 7203 iov_attr.iov_lso_hdr_sz = 0;
7204 7204 iov_attr.iov_flags = IBT_IOV_SLEEP;
7205 7205
7206 7206 i = 0;
7207 7207 for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
7208 7208 iov_arr[i].iov_len = MBLKL(nmp);
7209 7209 if (iov_arr[i].iov_len != 0) {
7210 7210 iov_arr[i].iov_addr = (caddr_t)
7211 7211 (void *)nmp->b_rptr;
7212 7212 i++;
7213 7213 }
7214 7214 }
7215 7215 iov_attr.iov_list_len = i;
7216 7216 node->w_swr.wr_sgl = node->w_sgl;
7217 7217
7218 7218 ret = ibt_map_mem_iov(state->id_hca_hdl,
7219 7219 &iov_attr, (ibt_all_wr_t *)&node->w_swr,
7220 7220 &node->w_mi_hdl);
7221 7221 if (ret != IBT_SUCCESS) {
7222 7222 atomic_inc_64(
7223 7223 &state->rc_xmt_map_fail_pkt);
7224 7224 DPRINT(30, "ibd_send: ibt_map_mem_iov("
7225 7225 ") failed, nmblks=%d, real_nmblks"
7226 7226 "=%d, ret=0x%x", nmblks, i, ret);
7227 7227 goto ibd_rc_large_copy;
7228 7228 }
7229 7229
7230 7230 atomic_inc_64(&state->rc_xmt_map_succ_pkt);
7231 7231 node->w_buftype = IBD_WQE_MAPPED;
7232 7232 node->swqe_im_mblk = mp;
7233 7233 } else {
7234 7234 atomic_inc_64(&state->rc_xmt_fragmented_pkt);
7235 7235 ibd_rc_large_copy:
7236 7236 mutex_enter(&state->rc_tx_large_bufs_lock);
7237 7237 if (state->rc_tx_largebuf_nfree == 0) {
7238 7238 state->rc_xmt_buf_short++;
7239 7239 mutex_exit
7240 7240 (&state->rc_tx_large_bufs_lock);
7241 7241 mutex_enter(&state->id_sched_lock);
7242 7242 state->id_sched_needed |=
7243 7243 IBD_RSRC_RC_TX_LARGEBUF;
7244 7244 mutex_exit(&state->id_sched_lock);
7245 7245 dofree = B_FALSE;
7246 7246 rc = B_FALSE;
7247 7247 /*
7248 7248 * If we don't have Tx large bufs,
7249 7249 * return failure. node->w_buftype
7250 7250 * should not be IBD_WQE_RC_COPYBUF,
7251 7251 				 * otherwise it will cause problems
7252 7252 				 * in ibd_rc_tx_cleanup().
7253 7253 */
7254 7254 node->w_buftype = IBD_WQE_TXBUF;
7255 7255 goto ibd_send_fail;
7256 7256 }
7257 7257
7258 7258 lbufp = state->rc_tx_largebuf_free_head;
7259 7259 ASSERT(lbufp->lb_buf != NULL);
7260 7260 state->rc_tx_largebuf_free_head =
7261 7261 lbufp->lb_next;
7262 7262 lbufp->lb_next = NULL;
7263 7263 /* Update nfree count */
7264 7264 state->rc_tx_largebuf_nfree --;
7265 7265 mutex_exit(&state->rc_tx_large_bufs_lock);
7266 7266 bufp = lbufp->lb_buf;
7267 7267 node->w_sgl[0].ds_va =
7268 7268 (ib_vaddr_t)(uintptr_t)bufp;
7269 7269 node->w_sgl[0].ds_key =
7270 7270 state->rc_tx_mr_desc.md_lkey;
7271 7271 node->w_sgl[0].ds_len = pktsize;
7272 7272 node->w_swr.wr_sgl = node->w_sgl;
7273 7273 node->w_swr.wr_nds = 1;
7274 7274 node->w_buftype = IBD_WQE_RC_COPYBUF;
7275 7275 node->w_rc_tx_largebuf = lbufp;
7276 7276
7277 7277 for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
7278 7278 blksize = MBLKL(nmp);
7279 7279 if (blksize != 0) {
7280 7280 bcopy(nmp->b_rptr, bufp,
7281 7281 blksize);
7282 7282 bufp += blksize;
7283 7283 }
7284 7284 }
7285 7285 freemsg(mp);
7286 7286 ASSERT(node->swqe_im_mblk == NULL);
7287 7287 }
7288 7288 }
7289 7289
7290 7290 node->swqe_next = NULL;
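		/*
		 * If a post is already in progress on this channel, just
		 * queue behind it; otherwise post the wqe ourselves.
		 */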
7291 7291 mutex_enter(&rc_chan->tx_post_lock);
7292 7292 if (rc_chan->tx_busy) {
7293 7293 if (rc_chan->tx_head) {
7294 7294 rc_chan->tx_tail->swqe_next =
7295 7295 SWQE_TO_WQE(node);
7296 7296 } else {
7297 7297 rc_chan->tx_head = node;
7298 7298 }
7299 7299 rc_chan->tx_tail = node;
7300 7300 mutex_exit(&rc_chan->tx_post_lock);
7301 7301 } else {
7302 7302 rc_chan->tx_busy = 1;
7303 7303 mutex_exit(&rc_chan->tx_post_lock);
7304 7304 ibd_rc_post_send(rc_chan, node);
7305 7305 }
7306 7306
7307 7307 return (B_TRUE);
7308 7308 } /* send by RC */
7309 7309
7310 7310 if ((state->id_enable_rc) && (pktsize > state->id_mtu)) {
7311 7311 /*
7312 7312 		 * The pktsize is too long. The packet size from GLD should
7313 7313 		 * be <= state->id_mtu + sizeof (ib_addrs_t).
7314 7314 */
7315 7315 if (ace->ac_mac.ipoib_qpn != htonl(IB_MC_QPN)) {
7316 7316 ibd_req_t *req;
7317 7317
7318 7318 mutex_enter(&ace->tx_too_big_mutex);
7319 7319 if (ace->tx_too_big_ongoing) {
7320 7320 mutex_exit(&ace->tx_too_big_mutex);
7321 7321 state->rc_xmt_reenter_too_long_pkt++;
7322 7322 dofree = B_TRUE;
7323 7323 } else {
7324 7324 ace->tx_too_big_ongoing = B_TRUE;
7325 7325 mutex_exit(&ace->tx_too_big_mutex);
7326 7326 state->rc_xmt_icmp_too_long_pkt++;
7327 7327
7328 7328 req = kmem_cache_alloc(state->id_req_kmc,
7329 7329 KM_NOSLEEP);
7330 7330 if (req == NULL) {
7331 7331 ibd_print_warn(state, "ibd_send: alloc "
7332 7332 "ibd_req_t fail");
7333 7333 /* Drop it. */
7334 7334 dofree = B_TRUE;
7335 7335 } else {
7336 7336 req->rq_ptr = mp;
7337 7337 req->rq_ptr2 = ace;
7338 7338 ibd_queue_work_slot(state, req,
7339 7339 IBD_ASYNC_RC_TOO_BIG);
7340 7340 dofree = B_FALSE;
7341 7341 }
7342 7342 }
7343 7343 } else {
7344 7344 			ibd_print_warn(state, "Reliable Connected mode is on. "
7345 7345 			    "Multicast packet length %d > %d is too long to "
7346 7346 			    "send, drop it",
7347 7347 			    pktsize, state->id_mtu);
7348 7348 state->rc_xmt_drop_too_long_pkt++;
7349 7349 /* Drop it. */
7350 7350 dofree = B_TRUE;
7351 7351 }
7352 7352 rc = B_TRUE;
7353 7353 goto ibd_send_fail;
7354 7354 }
7355 7355
7356 7356 atomic_add_64(&state->id_xmt_bytes, pktsize);
7357 7357 atomic_inc_64(&state->id_xmt_pkt);
7358 7358
7359 7359 /*
7360 7360 	 * Do LSO and checksum related work here. For an LSO send, set the
7361 7361 	 * ud destination, the opcode and the LSO header information in the
7362 7362 	 * work request.
7363 7363 */
7364 7364 mac_lso_get(mp, &mss, &lsoflags);
7365 7365 if ((lsoflags & HW_LSO) != HW_LSO) {
7366 7366 node->w_swr.wr_opcode = IBT_WRC_SEND;
7367 7367 lsohdr_sz = 0;
7368 7368 } else {
7369 7369 if (ibd_setup_lso(node, mp, mss, ace->ac_dest) != 0) {
7370 7370 /*
7371 7371 * The routine can only fail if there's no memory; we
7372 7372 * can only drop the packet if this happens
7373 7373 */
7374 7374 ibd_print_warn(state,
7375 7375 "ibd_send: no memory, lso posting failed");
7376 7376 dofree = B_TRUE;
7377 7377 rc = B_TRUE;
7378 7378 goto ibd_send_fail;
7379 7379 }
7380 7380
7381 7381 node->w_swr.wr_opcode = IBT_WRC_SEND_LSO;
7382 7382 lsohdr_sz = (node->w_swr.wr.ud_lso).lso_hdr_sz;
7383 7383 }
7384 7384
7385 7385 mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &hckflags);
7386 7386 if ((hckflags & HCK_FULLCKSUM) == HCK_FULLCKSUM)
7387 7387 node->w_swr.wr_flags |= IBT_WR_SEND_CKSUM;
7388 7388 else
7389 7389 node->w_swr.wr_flags &= ~IBT_WR_SEND_CKSUM;
7390 7390
7391 7391 /*
7392 7392 * Prepare the sgl for posting; the routine can only fail if there's
7393 7393 * no lso buf available for posting. If this is the case, we should
7394 7394 * probably resched for lso bufs to become available and then try again.
7395 7395 */
7396 7396 if (ibd_prepare_sgl(state, mp, node, lsohdr_sz) != 0) {
7397 7397 if (ibd_sched_poll(state, IBD_RSRC_LSOBUF, 1) != 0) {
7398 7398 dofree = B_TRUE;
7399 7399 rc = B_TRUE;
7400 7400 } else {
7401 7401 dofree = B_FALSE;
7402 7402 rc = B_FALSE;
7403 7403 }
7404 7404 goto ibd_send_fail;
7405 7405 }
7406 7406 node->swqe_im_mblk = mp;
7407 7407
7408 7408 /*
7409 7409 * Queue the wqe to hardware; since we can now simply queue a
7410 7410 * post instead of doing it serially, we cannot assume anything
7411 7411 * about the 'node' after ibd_post_send() returns.
7412 7412 */
7413 7413 node->swqe_next = NULL;
7414 7414
7415 7415 mutex_enter(&state->id_txpost_lock);
7416 7416 if (state->id_tx_busy) {
7417 7417 if (state->id_tx_head) {
7418 7418 state->id_tx_tail->swqe_next =
7419 7419 SWQE_TO_WQE(node);
7420 7420 } else {
7421 7421 state->id_tx_head = node;
7422 7422 }
7423 7423 state->id_tx_tail = node;
7424 7424 mutex_exit(&state->id_txpost_lock);
7425 7425 } else {
7426 7426 state->id_tx_busy = 1;
7427 7427 mutex_exit(&state->id_txpost_lock);
7428 7428 ibd_post_send(state, node);
7429 7429 }
7430 7430
7431 7431 return (B_TRUE);
7432 7432
7433 7433 ibd_send_fail:
7434 7434 if (node && mp)
7435 7435 ibd_free_lsohdr(node, mp);
7436 7436
7437 7437 if (dofree)
7438 7438 freemsg(mp);
7439 7439
7440 7440 if (node != NULL) {
7441 7441 if (rc_chan) {
7442 7442 ibd_rc_tx_cleanup(node);
7443 7443 } else {
7444 7444 ibd_tx_cleanup(state, node);
7445 7445 }
7446 7446 }
7447 7447
7448 7448 return (rc);
7449 7449 }
7450 7450
7451 7451 /*
7452 7452  * GLDv3 entry point for transmitting datagrams.
7453 7453 */
7454 7454 static mblk_t *
7455 7455 ibd_m_tx(void *arg, mblk_t *mp)
7456 7456 {
7457 7457 ibd_state_t *state = (ibd_state_t *)arg;
7458 7458 mblk_t *next;
7459 7459
7460 7460 if (state->id_type == IBD_PORT_DRIVER) {
7461 7461 freemsgchain(mp);
7462 7462 return (NULL);
7463 7463 }
7464 7464
7465 7465 if ((state->id_link_state != LINK_STATE_UP) ||
7466 7466 !(state->id_mac_state & IBD_DRV_STARTED)) {
7467 7467 freemsgchain(mp);
7468 7468 mp = NULL;
7469 7469 }
7470 7470
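	/*
	 * Walk the chain; stop at the first packet that could not be
	 * sent and return the remainder to the framework.
	 */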
7471 7471 while (mp != NULL) {
7472 7472 next = mp->b_next;
7473 7473 mp->b_next = NULL;
7474 7474 if (ibd_send(state, mp) == B_FALSE) {
7475 7475 /* Send fail */
7476 7476 mp->b_next = next;
7477 7477 break;
7478 7478 }
7479 7479 mp = next;
7480 7480 }
7481 7481
7482 7482 return (mp);
7483 7483 }
7484 7484
7485 7485 /*
7486 7486  * This handles Tx and Rx completions. With separate CQs, it handles
7487 7487 * only Rx completions.
7488 7488 */
7489 7489 static uint_t
7490 7490 ibd_intr(caddr_t arg)
7491 7491 {
7492 7492 ibd_state_t *state = (ibd_state_t *)arg;
7493 7493
7494 7494 ibd_poll_rcq(state, state->id_rcq_hdl);
7495 7495
7496 7496 return (DDI_INTR_CLAIMED);
7497 7497 }
7498 7498
7499 7499 /*
7500 7500 * Poll and fully drain the send cq
7501 7501 */
7502 7502 static void
7503 7503 ibd_drain_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
7504 7504 {
7505 7505 ibt_wc_t *wcs = state->id_txwcs;
7506 7506 uint_t numwcs = state->id_txwcs_size;
7507 7507 ibd_wqe_t *wqe;
7508 7508 ibd_swqe_t *head, *tail;
7509 7509 ibt_wc_t *wc;
7510 7510 uint_t num_polled;
7511 7511 int i;
7512 7512
7513 7513 while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) {
7514 7514 head = tail = NULL;
7515 7515 for (i = 0, wc = wcs; i < num_polled; i++, wc++) {
7516 7516 wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
7517 7517 if (wc->wc_status != IBT_WC_SUCCESS) {
7518 7518 /*
7519 7519 * Channel being torn down.
7520 7520 */
7521 7521 if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
7522 7522 DPRINT(5, "ibd_drain_scq: flush error");
7523 7523 DPRINT(10, "ibd_drain_scq: Bad "
7524 7524 "status %d", wc->wc_status);
7525 7525 } else {
7526 7526 DPRINT(10, "ibd_drain_scq: "
7527 7527 "unexpected wc_status %d",
7528 7528 wc->wc_status);
7529 7529 }
7530 7530 /*
7531 7531 * Fallthrough to invoke the Tx handler to
7532 7532 * release held resources, e.g., AH refcount.
7533 7533 */
7534 7534 }
7535 7535 /*
7536 7536 * Add this swqe to the list to be cleaned up.
7537 7537 */
7538 7538 if (head)
7539 7539 tail->swqe_next = wqe;
7540 7540 else
7541 7541 head = WQE_TO_SWQE(wqe);
7542 7542 tail = WQE_TO_SWQE(wqe);
7543 7543 }
7544 7544 tail->swqe_next = NULL;
7545 7545 ibd_tx_cleanup_list(state, head, tail);
7546 7546
7547 7547 /*
7548 7548 * Resume any blocked transmissions if possible
7549 7549 */
7550 7550 ibd_resume_transmission(state);
7551 7551 }
7552 7552 }
7553 7553
7554 7554 /*
7555 7555 * Poll and fully drain the receive cq
7556 7556 */
7557 7557 static void
7558 7558 ibd_drain_rcq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
7559 7559 {
7560 7560 ibt_wc_t *wcs = state->id_rxwcs;
7561 7561 uint_t numwcs = state->id_rxwcs_size;
7562 7562 ibd_rwqe_t *rwqe;
7563 7563 ibt_wc_t *wc;
7564 7564 uint_t num_polled;
7565 7565 int i;
7566 7566 mblk_t *head, *tail, *mp;
7567 7567
7568 7568 while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) {
7569 7569 head = tail = NULL;
7570 7570 for (i = 0, wc = wcs; i < num_polled; i++, wc++) {
7571 7571 rwqe = (ibd_rwqe_t *)(uintptr_t)wc->wc_id;
7572 7572 if (wc->wc_status != IBT_WC_SUCCESS) {
7573 7573 /*
7574 7574 * Channel being torn down.
7575 7575 */
7576 7576 if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
7577 7577 DPRINT(5, "ibd_drain_rcq: "
7578 7578 "expected flushed rwqe");
7579 7579 } else {
7580 7580 DPRINT(5, "ibd_drain_rcq: "
7581 7581 "unexpected wc_status %d",
7582 7582 wc->wc_status);
7583 7583 }
7584 7584 atomic_inc_32(
7585 7585 &state->id_rx_list.dl_bufs_outstanding);
7586 7586 freemsg(rwqe->rwqe_im_mblk);
7587 7587 continue;
7588 7588 }
7589 7589 mp = ibd_process_rx(state, rwqe, wc);
7590 7590 if (mp == NULL)
7591 7591 continue;
7592 7592
7593 7593 /*
7594 7594 * Add this mp to the list to send to the nw layer.
7595 7595 */
7596 7596 if (head)
7597 7597 tail->b_next = mp;
7598 7598 else
7599 7599 head = mp;
7600 7600 tail = mp;
7601 7601 }
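		/*
		 * Hand the chain of received mblks to GLDv3 in one call.
		 */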
7602 7602 if (head)
7603 7603 mac_rx(state->id_mh, state->id_rh, head);
7604 7604
7605 7605 /*
7606 7606 * Account for #rwqes polled.
7607 7607 * Post more here, if less than one fourth full.
7608 7608 */
7609 7609 if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, -num_polled) <
7610 7610 (state->id_ud_num_rwqe / 4))
7611 7611 ibd_post_recv_intr(state);
7612 7612 }
7613 7613 }
7614 7614
7615 7615 /*
7616 7616 * Common code for interrupt handling as well as for polling
7617 7617 * for all completed wqe's while detaching.
7618 7618 */
7619 7619 static void
7620 7620 ibd_poll_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
7621 7621 {
7622 7622 int flag, redo_flag;
7623 7623 int redo = 1;
7624 7624
7625 7625 flag = IBD_CQ_POLLING;
7626 7626 redo_flag = IBD_REDO_CQ_POLLING;
7627 7627
7628 7628 mutex_enter(&state->id_scq_poll_lock);
7629 7629 if (state->id_scq_poll_busy & flag) {
7630 7630 ibd_print_warn(state, "ibd_poll_scq: multiple polling threads");
7631 7631 state->id_scq_poll_busy |= redo_flag;
7632 7632 mutex_exit(&state->id_scq_poll_lock);
7633 7633 return;
7634 7634 }
7635 7635 state->id_scq_poll_busy |= flag;
7636 7636 mutex_exit(&state->id_scq_poll_lock);
7637 7637
7638 7638 /*
7639 7639 	 * In some cases (e.g. detaching), this code can be invoked on
7640 7640 	 * any cpu after disabling cq notification (thus no concurrency
7641 7641 	 * exists). Apart from that, the following applies normally:
7642 7642 	 * transmit completion handling could happen on any cpu if the
7643 7643 	 * Tx CQ is poll driven, but always on the Tx interrupt cpu if the
7644 7644 	 * Tx CQ is interrupt driven.
7645 7645 */
7646 7646
7647 7647 /*
7648 7648 * Poll and drain the CQ
7649 7649 */
7650 7650 ibd_drain_scq(state, cq_hdl);
7651 7651
7652 7652 /*
7653 7653 * Enable CQ notifications and redrain the cq to catch any
7654 7654 * completions we might have missed after the ibd_drain_scq()
7655 7655 * above and before the ibt_enable_cq_notify() that follows.
7656 7656 * Finally, service any new requests to poll the cq that
7657 7657 * could've come in after the ibt_enable_cq_notify().
7658 7658 */
7659 7659 do {
7660 7660 if (ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION) !=
7661 7661 IBT_SUCCESS) {
7662 7662 DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed");
7663 7663 }
7664 7664
7665 7665 ibd_drain_scq(state, cq_hdl);
7666 7666
7667 7667 mutex_enter(&state->id_scq_poll_lock);
7668 7668 if (state->id_scq_poll_busy & redo_flag)
7669 7669 state->id_scq_poll_busy &= ~redo_flag;
7670 7670 else {
7671 7671 state->id_scq_poll_busy &= ~flag;
7672 7672 redo = 0;
7673 7673 }
7674 7674 mutex_exit(&state->id_scq_poll_lock);
7675 7675
7676 7676 } while (redo);
7677 7677 }
7678 7678
7679 7679 /*
7680 7680 * Common code for interrupt handling as well as for polling
7681 7681 * for all completed wqe's while detaching.
7682 7682 */
7683 7683 static void
7684 7684 ibd_poll_rcq(ibd_state_t *state, ibt_cq_hdl_t rcq)
7685 7685 {
7686 7686 int flag, redo_flag;
7687 7687 int redo = 1;
7688 7688
7689 7689 flag = IBD_CQ_POLLING;
7690 7690 redo_flag = IBD_REDO_CQ_POLLING;
7691 7691
7692 7692 mutex_enter(&state->id_rcq_poll_lock);
7693 7693 if (state->id_rcq_poll_busy & flag) {
7694 7694 ibd_print_warn(state, "ibd_poll_rcq: multiple polling threads");
7695 7695 state->id_rcq_poll_busy |= redo_flag;
7696 7696 mutex_exit(&state->id_rcq_poll_lock);
7697 7697 return;
7698 7698 }
7699 7699 state->id_rcq_poll_busy |= flag;
7700 7700 mutex_exit(&state->id_rcq_poll_lock);
7701 7701
7702 7702 /*
7703 7703 * Poll and drain the CQ
7704 7704 */
7705 7705 ibd_drain_rcq(state, rcq);
7706 7706
7707 7707 /*
7708 7708 * Enable CQ notifications and redrain the cq to catch any
7709 7709 	 * completions we might have missed after the ibd_drain_rcq()
7710 7710 * above and before the ibt_enable_cq_notify() that follows.
7711 7711 * Finally, service any new requests to poll the cq that
7712 7712 * could've come in after the ibt_enable_cq_notify().
7713 7713 */
7714 7714 do {
7715 7715 if (ibt_enable_cq_notify(rcq, IBT_NEXT_COMPLETION) !=
7716 7716 IBT_SUCCESS) {
7717 7717 DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed");
7718 7718 }
7719 7719
7720 7720 ibd_drain_rcq(state, rcq);
7721 7721
7722 7722 mutex_enter(&state->id_rcq_poll_lock);
7723 7723 if (state->id_rcq_poll_busy & redo_flag)
7724 7724 state->id_rcq_poll_busy &= ~redo_flag;
7725 7725 else {
7726 7726 state->id_rcq_poll_busy &= ~flag;
7727 7727 redo = 0;
7728 7728 }
7729 7729 mutex_exit(&state->id_rcq_poll_lock);
7730 7730
7731 7731 } while (redo);
7732 7732 }
7733 7733
7734 7734 /*
7735 7735 * Unmap the memory area associated with a given swqe.
7736 7736 */
7737 7737 void
7738 7738 ibd_unmap_mem(ibd_state_t *state, ibd_swqe_t *swqe)
7739 7739 {
7740 7740 ibt_status_t stat;
7741 7741
7742 7742 DPRINT(20, "ibd_unmap_mem: wqe=%p, seg=%d\n", swqe, swqe->w_swr.wr_nds);
7743 7743
7744 7744 if (swqe->w_mi_hdl) {
7745 7745 if ((stat = ibt_unmap_mem_iov(state->id_hca_hdl,
7746 7746 swqe->w_mi_hdl)) != IBT_SUCCESS) {
7747 7747 DPRINT(10,
7748 7748 "failed in ibt_unmap_mem_iov, ret=%d\n", stat);
7749 7749 }
7750 7750 swqe->w_mi_hdl = NULL;
7751 7751 }
7752 7752 swqe->w_swr.wr_nds = 0;
7753 7753 }
7754 7754
7755 7755 void
7756 7756 ibd_dec_ref_ace(ibd_state_t *state, ibd_ace_t *ace)
7757 7757 {
7758 7758 /*
7759 7759 * The recycling logic can be eliminated from here
7760 7760 * and put into the async thread if we create another
7761 7761 * list to hold ACE's for unjoined mcg's.
7762 7762 */
7763 7763 if (DEC_REF_DO_CYCLE(ace)) {
7764 7764 ibd_mce_t *mce;
7765 7765
7766 7766 /*
7767 7767 * Check with the lock taken: we decremented
7768 7768 * reference count without the lock, and some
7769 7769 * transmitter might already have bumped the
7770 7770 * reference count (possible in case of multicast
7771 7771 * disable when we leave the AH on the active
7772 7772 * list). If not still 0, get out, leaving the
7773 7773 * recycle bit intact.
7774 7774 *
7775 7775 * Atomically transition the AH from active
7776 7776 * to free list, and queue a work request to
7777 7777 * leave the group and destroy the mce. No
7778 7778 * transmitter can be looking at the AH or
7779 7779 * the MCE in between, since we have the
7780 7780 * ac_mutex lock. In the SendOnly reap case,
7781 7781 * it is not necessary to hold the ac_mutex
7782 7782 * and recheck the ref count (since the AH was
7783 7783 * taken off the active list), we just do it
7784 7784 * to have uniform processing with the Full
7785 7785 * reap case.
7786 7786 */
7787 7787 mutex_enter(&state->id_ac_mutex);
7788 7788 mce = ace->ac_mce;
7789 7789 if (GET_REF_CYCLE(ace) == 0) {
7790 7790 CLEAR_REFCYCLE(ace);
7791 7791 /*
7792 7792 * Identify the case of fullmember reap as
7793 7793 * opposed to mcg trap reap. Also, port up
7794 7794 * might set ac_mce to NULL to indicate Tx
7795 7795 * cleanup should do no more than put the
7796 7796 * AH in the free list (see ibd_async_link).
7797 7797 */
7798 7798 if (mce != NULL) {
7799 7799 ace->ac_mce = NULL;
7800 7800 IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
7801 7801 /*
7802 7802 * mc_req was initialized at mce
7803 7803 * creation time.
7804 7804 */
7805 7805 ibd_queue_work_slot(state,
7806 7806 &mce->mc_req, IBD_ASYNC_REAP);
7807 7807 }
7808 7808 IBD_ACACHE_INSERT_FREE(state, ace);
7809 7809 }
7810 7810 mutex_exit(&state->id_ac_mutex);
7811 7811 }
7812 7812 }
7813 7813
7814 7814 /*
7815 7815 * Common code that deals with clean ups after a successful or
7816 7816 * erroneous transmission attempt.
7817 7817 */
7818 7818 static void
7819 7819 ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe)
7820 7820 {
7821 7821 ibd_ace_t *ace = swqe->w_ahandle;
7822 7822
7823 7823 DPRINT(20, "ibd_tx_cleanup %p\n", swqe);
7824 7824
7825 7825 /*
7826 7826 * If this was a dynamic mapping in ibd_send(), we need to
7827 7827 * unmap here. If this was an lso buffer we'd used for sending,
7828 7828 * we need to release the lso buf to the pool, since the resource
7829 7829 * is scarce. However, if this was simply a normal send using
7830 7830 * the copybuf (present in each swqe), we don't need to release it.
7831 7831 */
7832 7832 if (swqe->swqe_im_mblk != NULL) {
7833 7833 if (swqe->w_buftype == IBD_WQE_MAPPED) {
7834 7834 ibd_unmap_mem(state, swqe);
7835 7835 } else if (swqe->w_buftype == IBD_WQE_LSOBUF) {
7836 7836 ibd_release_lsobufs(state,
7837 7837 swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds);
7838 7838 }
7839 7839 ibd_free_lsohdr(swqe, swqe->swqe_im_mblk);
7840 7840 freemsg(swqe->swqe_im_mblk);
7841 7841 swqe->swqe_im_mblk = NULL;
7842 7842 }
7843 7843
7844 7844 /*
7845 7845 * Drop the reference count on the AH; it can be reused
7846 7846 * now for a different destination if there are no more
7847 7847 * posted sends that will use it. This can be eliminated
7848 7848 * if we can always associate each Tx buffer with an AH.
7849 7849 * The ace can be null if we are cleaning up from the
7850 7850 * ibd_send() error path.
7851 7851 */
7852 7852 if (ace != NULL) {
7853 7853 ibd_dec_ref_ace(state, ace);
7854 7854 }
7855 7855
7856 7856 /*
7857 7857 * Release the send wqe for reuse.
7858 7858 */
7859 7859 swqe->swqe_next = NULL;
7860 7860 ibd_release_swqe(state, swqe, swqe, 1);
7861 7861 }
7862 7862
7863 7863 static void
7864 7864 ibd_tx_cleanup_list(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail)
7865 7865 {
7866 7866 ibd_ace_t *ace;
7867 7867 ibd_swqe_t *swqe;
7868 7868 int n = 0;
7869 7869
7870 7870 DPRINT(20, "ibd_tx_cleanup_list %p %p\n", head, tail);
7871 7871
7872 7872 for (swqe = head; swqe != NULL; swqe = WQE_TO_SWQE(swqe->swqe_next)) {
7873 7873
7874 7874 /*
7875 7875 * If this was a dynamic mapping in ibd_send(), we need to
7876 7876 * unmap here. If this was an lso buffer we'd used for sending,
7877 7877 * we need to release the lso buf to the pool, since the
7878 7878 * resource is scarce. However, if this was simply a normal
7879 7879 * send using the copybuf (present in each swqe), we don't need
7880 7880 * to release it.
7881 7881 */
7882 7882 if (swqe->swqe_im_mblk != NULL) {
7883 7883 if (swqe->w_buftype == IBD_WQE_MAPPED) {
7884 7884 ibd_unmap_mem(state, swqe);
7885 7885 } else if (swqe->w_buftype == IBD_WQE_LSOBUF) {
7886 7886 ibd_release_lsobufs(state,
7887 7887 swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds);
7888 7888 }
7889 7889 ibd_free_lsohdr(swqe, swqe->swqe_im_mblk);
7890 7890 freemsg(swqe->swqe_im_mblk);
7891 7891 swqe->swqe_im_mblk = NULL;
7892 7892 }
7893 7893
7894 7894 /*
7895 7895 * Drop the reference count on the AH; it can be reused
7896 7896 * now for a different destination if there are no more
7897 7897 * posted sends that will use it. This can be eliminated
7898 7898 * if we can always associate each Tx buffer with an AH.
7899 7899 * The ace can be null if we are cleaning up from the
7900 7900 * ibd_send() error path.
7901 7901 */
7902 7902 ace = swqe->w_ahandle;
7903 7903 if (ace != NULL) {
7904 7904 ibd_dec_ref_ace(state, ace);
7905 7905 }
7906 7906 n++;
7907 7907 }
7908 7908
7909 7909 /*
7910 7910 * Release the send wqes for reuse.
7911 7911 */
7912 7912 ibd_release_swqe(state, head, tail, n);
7913 7913 }
7914 7914
7915 7915 /*
7916 7916 * Processing to be done after receipt of a packet; hand off to GLD
7917 7917 * in the format it expects. The received packet has this
7918 7918 * format: 2b sap :: 00 :: data.
7919 7919 */
7920 7920 static mblk_t *
7921 7921 ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc)
7922 7922 {
7923 7923 ib_header_info_t *phdr;
7924 7924 mblk_t *mp;
7925 7925 ipoib_hdr_t *ipibp;
7926 7926 ipha_t *iphap;
7927 7927 ip6_t *ip6h;
7928 7928 int len;
7929 7929 ib_msglen_t pkt_len = wc->wc_bytes_xfer;
7930 7930 uint32_t bufs;
7931 7931
7932 7932 /*
7933 7933 * Track the number of buffers handed to the upper layer that must be returned.
7934 7934 */
7935 7935 bufs = atomic_inc_32_nv(&state->id_rx_list.dl_bufs_outstanding);
7936 7936
7937 7937 /* Never run out of rwqes, use allocb when running low */
7938 7938 if (bufs >= state->id_rx_bufs_outstanding_limit) {
7939 7939 atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding);
7940 7940 atomic_inc_32(&state->id_rx_allocb);
7941 7941 mp = allocb(pkt_len, BPRI_HI);
7942 7942 if (mp) {
7943 7943 bcopy(rwqe->rwqe_im_mblk->b_rptr, mp->b_rptr, pkt_len);
7944 7944 ibd_post_recv(state, rwqe);
7945 7945 } else { /* no memory */
7946 7946 atomic_inc_32(&state->id_rx_allocb_failed);
7947 7947 ibd_post_recv(state, rwqe);
7948 7948 return (NULL);
7949 7949 }
7950 7950 } else {
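/*
 * Below the outstanding limit: loan the rwqe's own mblk up the
 * stack. ibd_freemsg_cb() re-arms and reposts the rwqe when the
 * upper layer eventually frees the message.
 */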
7951 7951 mp = rwqe->rwqe_im_mblk;
7952 7952 }
7953 7953
7954 7954
7955 7955 /*
7956 7956 * Adjust write pointer depending on how much data came in.
7957 7957 */
7958 7958 mp->b_wptr = mp->b_rptr + pkt_len;
7959 7959
7960 7960 /*
7961 7961 * Make sure this is NULL or we're in trouble.
7962 7962 */
7963 7963 if (mp->b_next != NULL) {
7964 7964 ibd_print_warn(state,
7965 7965 "ibd_process_rx: got duplicate mp from rcq?");
7966 7966 mp->b_next = NULL;
7967 7967 }
7968 7968
7969 7969 /*
7970 7970 * The IB link may deliver one of the IB link layer
7971 7971 * headers, the Global Routing Header (GRH), with the packet.
7972 7972 * The ibd driver uses the information in the GRH to build the
7973 7973 * Header_info structure and passes it with the datagram up
7974 7974 * to GLDv3.
7975 7975 * If the GRH is not valid, indicate this to GLDv3 by setting
7976 7976 * the VerTcFlow field to 0.
7977 7977 */
7978 7978 phdr = (ib_header_info_t *)mp->b_rptr;
7979 7979 if (wc->wc_flags & IBT_WC_GRH_PRESENT) {
7980 7980 phdr->ib_grh.ipoib_sqpn = htonl(wc->wc_qpn);
7981 7981
7982 7982 /* if it is loop back packet, just drop it. */
7983 7983 if (state->id_enable_rc) {
7984 7984 if (bcmp(&phdr->ib_grh.ipoib_sqpn,
7985 7985 &state->rc_macaddr_loopback,
7986 7986 IPOIB_ADDRL) == 0) {
7987 7987 freemsg(mp);
7988 7988 return (NULL);
7989 7989 }
7990 7990 } else {
7991 7991 if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->id_macaddr,
7992 7992 IPOIB_ADDRL) == 0) {
7993 7993 freemsg(mp);
7994 7994 return (NULL);
7995 7995 }
7996 7996 }
7997 7997
7998 7998 ovbcopy(&phdr->ib_grh.ipoib_sqpn, &phdr->ib_src,
7999 7999 sizeof (ipoib_mac_t));
8000 8000 if (*(uint8_t *)(phdr->ib_grh.ipoib_dgid_pref) == 0xFF) {
8001 8001 phdr->ib_dst.ipoib_qpn = htonl(IB_MC_QPN);
8002 8002 IBD_CLEAR_SCOPE_PKEY(&phdr->ib_dst);
8003 8003 } else {
8004 8004 phdr->ib_dst.ipoib_qpn = state->id_macaddr.ipoib_qpn;
8005 8005 }
8006 8006 } else {
8007 8007 /*
8008 8008 * It cannot be an IBA multicast packet; it must have been
8009 8009 * unicast to us. Just copy the interface address to dst.
8010 8010 */
8011 8011 phdr->ib_grh.ipoib_vertcflow = 0;
8012 8012 ovbcopy(&state->id_macaddr, &phdr->ib_dst,
8013 8013 sizeof (ipoib_mac_t));
8014 8014 }
8015 8015
8016 8016 /*
8017 8017 * For ND6 packets, padding is at the front of the source/target
8018 8018 * lladdr. However, the inet6 layer is not aware of it, hence remove
8019 8019 * the padding from such packets.
8020 8020 */
8021 8021 ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t));
8022 8022 if (ntohs(ipibp->ipoib_type) == ETHERTYPE_IPV6) {
8023 8023 ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
8024 8024 len = ntohs(ip6h->ip6_plen);
8025 8025 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
8026 8026 /* LINTED: E_CONSTANT_CONDITION */
8027 8027 IBD_PAD_NSNA(ip6h, len, IBD_RECV);
8028 8028 }
8029 8029 }
8030 8030
8031 8031 /*
8032 8032 * Update statistics
8033 8033 */
8034 8034 atomic_add_64(&state->id_rcv_bytes, pkt_len);
8035 8035 atomic_inc_64(&state->id_rcv_pkt);
8036 8036 if (bcmp(&phdr->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
8037 8037 atomic_inc_64(&state->id_brd_rcv);
8038 8038 else if ((ntohl(phdr->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
8039 8039 atomic_inc_64(&state->id_multi_rcv);
8040 8040
8041 8041 iphap = (ipha_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
8042 8042 /*
8043 8043 * Set the receive checksum status in mp.
8044 8044 * Hardware checksumming can be considered valid only if:
8045 8045 * 1. CQE.IP_OK bit is set
8046 8046 * 2. CQE.CKSUM = 0xffff
8047 8047 * 3. IPv6 routing header is not present in the packet
8048 8048 * 4. There are no IP options in the IP header
8049 8049 */
8050 8050
8051 8051 if (((wc->wc_flags & IBT_WC_CKSUM_OK) == IBT_WC_CKSUM_OK) &&
8052 8052 (wc->wc_cksum == 0xFFFF) &&
8053 8053 (iphap->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)) {
8054 8054 mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM_OK);
8055 8055 }
8056 8056
8057 8057 return (mp);
8058 8058 }
8059 8059
8060 8060 /*
8061 8061 * Callback code invoked from STREAMS when the receive data buffer is
8062 8062 * free for recycling.
8063 8063 */
8064 8064 static void
8065 8065 ibd_freemsg_cb(char *arg)
8066 8066 {
8067 8067 ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
8068 8068 ibd_state_t *state = rwqe->w_state;
8069 8069
8070 8070 atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding);
8071 8071
8072 8072 /*
8073 8073 * If the driver is stopped, just free the rwqe.
8074 8074 */
8075 8075 if (atomic_add_32_nv(&state->id_running, 0) == 0) {
8076 8076 DPRINT(6, "ibd_freemsg: wqe being freed");
8077 8077 rwqe->rwqe_im_mblk = NULL;
8078 8078 ibd_free_rwqe(state, rwqe);
8079 8079 return;
8080 8080 }
8081 8081
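/*
 * Re-arm the rwqe: wrap the same copy buffer in a fresh mblk via
 * desballoc() and repost it to the receive queue.
 */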
8082 8082 rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
8083 8083 state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
8084 8084 if (rwqe->rwqe_im_mblk == NULL) {
8085 8085 ibd_free_rwqe(state, rwqe);
8086 8086 DPRINT(6, "ibd_freemsg: desballoc failed");
8087 8087 return;
8088 8088 }
8089 8089
8090 8090 ibd_post_recv(state, rwqe);
8091 8091 }
8092 8092
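/*
 * Reclaim send resources by polling the send CQ. The DDI_INTR_CLAIMED
 * return value lets this routine be used as a soft interrupt handler.
 */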
8093 8093 static uint_t
8094 8094 ibd_tx_recycle(caddr_t arg)
8095 8095 {
8096 8096 ibd_state_t *state = (ibd_state_t *)arg;
8097 8097
8098 8098 /*
8099 8099 * Poll for completed entries
8100 8100 */
8101 8101 ibd_poll_scq(state, state->id_scq_hdl);
8102 8102
8103 8103 return (DDI_INTR_CLAIMED);
8104 8104 }
8105 8105
8106 8106 #ifdef IBD_LOGGING
8107 8107 static void
8108 8108 ibd_log_init(void)
8109 8109 {
8110 8110 ibd_lbuf = kmem_zalloc(IBD_LOG_SZ, KM_SLEEP);
8111 8111 ibd_lbuf_ndx = 0;
8112 8112
8113 8113 mutex_init(&ibd_lbuf_lock, NULL, MUTEX_DRIVER, NULL);
8114 8114 }
8115 8115
8116 8116 static void
8117 8117 ibd_log_fini(void)
8118 8118 {
8119 8119 if (ibd_lbuf)
8120 8120 kmem_free(ibd_lbuf, IBD_LOG_SZ);
8121 8121 ibd_lbuf_ndx = 0;
8122 8122 ibd_lbuf = NULL;
8123 8123
8124 8124 mutex_destroy(&ibd_lbuf_lock);
8125 8125 }
8126 8126
8127 8127 static void
8128 8128 ibd_log(const char *fmt, ...)
8129 8129 {
8130 8130 va_list ap;
8131 8131 uint32_t off;
8132 8132 uint32_t msglen;
8133 8133 char tmpbuf[IBD_DMAX_LINE];
8134 8134
8135 8135 if (ibd_lbuf == NULL)
8136 8136 return;
8137 8137
8138 8138 va_start(ap, fmt);
8139 8139 msglen = vsnprintf(tmpbuf, IBD_DMAX_LINE, fmt, ap);
8140 8140 va_end(ap);
8141 8141
8142 8142 if (msglen >= IBD_DMAX_LINE)
8143 8143 msglen = IBD_DMAX_LINE - 1;
8144 8144
8145 8145 mutex_enter(&ibd_lbuf_lock);
8146 8146
8147 8147 off = ibd_lbuf_ndx; /* current msg should go here */
8148 8148 if ((ibd_lbuf_ndx) && (ibd_lbuf[ibd_lbuf_ndx-1] != '\n'))
8149 8149 ibd_lbuf[ibd_lbuf_ndx-1] = '\n';
8150 8150
8151 8151 ibd_lbuf_ndx += msglen; /* place where next msg should start */
8152 8152 ibd_lbuf[ibd_lbuf_ndx] = 0; /* current msg should terminate */
8153 8153
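/*
 * Wrap to the start of the buffer once the next-message index gets
 * within 2 * IBD_DMAX_LINE bytes of the end; this headroom ensures
 * the bcopy() done after dropping the lock cannot run past the end
 * of ibd_lbuf.
 */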
8154 8154 if (ibd_lbuf_ndx >= (IBD_LOG_SZ - 2 * IBD_DMAX_LINE))
8155 8155 ibd_lbuf_ndx = 0;
8156 8156
8157 8157 mutex_exit(&ibd_lbuf_lock);
8158 8158
8159 8159 bcopy(tmpbuf, ibd_lbuf+off, msglen); /* no lock needed for this */
8160 8160 }
8161 8161 #endif
8162 8162
8163 8163 /* ARGSUSED */
8164 8164 static int
8165 8165 ibd_create_partition(void *karg, intptr_t arg, int mode, cred_t *credp,
8166 8166 int *rvalp)
8167 8167 {
8168 8168 ibd_create_ioctl_t *cmd = karg;
8169 8169 ibd_state_t *state, *port_state, *p;
8170 8170 int i, err, rval = 0;
8171 8171 mac_register_t *macp;
8172 8172 ibt_hca_portinfo_t *pinfop = NULL;
8173 8173 ibt_status_t ibt_status;
8174 8174 uint_t psize, pinfosz;
8175 8175 boolean_t force_create = B_FALSE;
8176 8176
8177 8177 cmd->ibdioc.ioc_status = 0;
8178 8178
8179 8179 if (cmd->ibdioc.ioc_port_inst < 0) {
8180 8180 cmd->ibdioc.ioc_status = IBD_INVALID_PORT_INST;
8181 8181 return (EINVAL);
8182 8182 }
8183 8183 port_state = ddi_get_soft_state(ibd_list, cmd->ibdioc.ioc_port_inst);
8184 8184 if (port_state == NULL) {
8185 8185 DPRINT(10, "ibd_create_partition: failed to get state %d",
8186 8186 cmd->ibdioc.ioc_port_inst);
8187 8187 cmd->ibdioc.ioc_status = IBD_INVALID_PORT_INST;
8188 8188 return (EINVAL);
8189 8189 }
8190 8190
8191 8191 /* Limited PKeys not supported */
8192 8192 if (cmd->ioc_pkey <= IB_PKEY_INVALID_FULL) {
8193 8193 rval = EINVAL;
8194 8194 goto part_create_return;
8195 8195 }
8196 8196
8197 8197 if (cmd->ioc_force_create == 0) {
8198 8198 /*
8199 8199 * Check if the port pkey table contains the pkey for which
8200 8200 * this partition is being created.
8201 8201 */
8202 8202 ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl,
8203 8203 port_state->id_port, &pinfop, &psize, &pinfosz);
8204 8204
8205 8205 if ((ibt_status != IBT_SUCCESS) || (psize != 1)) {
8206 8206 rval = EINVAL;
8207 8207 goto part_create_return;
8208 8208 }
8209 8209
8210 8210 if (pinfop->p_linkstate != IBT_PORT_ACTIVE) {
8211 8211 rval = ENETDOWN;
8212 8212 cmd->ibdioc.ioc_status = IBD_PORT_IS_DOWN;
8213 8213 goto part_create_return;
8214 8214 }
8215 8215
8216 8216 for (i = 0; i < pinfop->p_pkey_tbl_sz; i++) {
8217 8217 if (pinfop->p_pkey_tbl[i] == cmd->ioc_pkey) {
8218 8218 break;
8219 8219 }
8220 8220 }
8221 8221 if (i == pinfop->p_pkey_tbl_sz) {
8222 8222 rval = EINVAL;
8223 8223 cmd->ibdioc.ioc_status = IBD_PKEY_NOT_PRESENT;
8224 8224 goto part_create_return;
8225 8225 }
8226 8226 } else {
8227 8227 force_create = B_TRUE;
8228 8228 }
8229 8229
8230 8230 mutex_enter(&ibd_objlist_lock);
8231 8231 for (p = ibd_objlist_head; p; p = p->id_next) {
8232 8232 if ((p->id_port_inst == cmd->ibdioc.ioc_port_inst) &&
8233 8233 (p->id_pkey == cmd->ioc_pkey) &&
8234 8234 (p->id_plinkid == cmd->ioc_partid)) {
8235 8235 mutex_exit(&ibd_objlist_lock);
8236 8236 rval = EEXIST;
8237 8237 cmd->ibdioc.ioc_status = IBD_PARTITION_EXISTS;
8238 8238 goto part_create_return;
8239 8239 }
8240 8240 }
8241 8241 mutex_exit(&ibd_objlist_lock);
8242 8242
8243 8243 state = kmem_zalloc(sizeof (ibd_state_t), KM_SLEEP);
8244 8244
8245 8245 state->id_type = IBD_PARTITION_OBJ;
8246 8246
8247 8247 state->id_plinkid = cmd->ioc_partid;
8248 8248 state->id_dlinkid = cmd->ibdioc.ioc_linkid;
8249 8249 state->id_port_inst = cmd->ibdioc.ioc_port_inst;
8250 8250
8251 8251 state->id_dip = port_state->id_dip;
8252 8252 state->id_port = port_state->id_port;
8253 8253 state->id_pkey = cmd->ioc_pkey;
8254 8254 state->id_hca_guid = port_state->id_hca_guid;
8255 8255 state->id_port_guid = port_state->id_port_guid;
8256 8256 state->id_force_create = force_create;
8257 8257
8258 8258 mutex_init(&state->id_macst_lock, NULL, MUTEX_DRIVER, NULL);
8259 8259 cv_init(&state->id_macst_cv, NULL, CV_DEFAULT, NULL);
8260 8260
8261 8261 if (ibd_part_attach(state, state->id_dip) != DDI_SUCCESS) {
8262 8262 rval = EIO;
8263 8263 cmd->ibdioc.ioc_status = IBD_NO_HW_RESOURCE;
8264 8264 goto fail;
8265 8265 }
8266 8266
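/* Register the new partition link with the GLDv3/MAC layer. */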
8267 8267 if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
8268 8268 rval = EAGAIN;
8269 8269 goto fail;
8270 8270 }
8271 8271
8272 8272 macp->m_type_ident = MAC_PLUGIN_IDENT_IB;
8273 8273 macp->m_dip = port_state->id_dip;
8274 8274 macp->m_instance = (uint_t)-1;
8275 8275 macp->m_driver = state;
8276 8276 macp->m_src_addr = (uint8_t *)&state->id_macaddr;
8277 8277 macp->m_callbacks = &ibd_m_callbacks;
8278 8278 macp->m_min_sdu = 0;
8279 8279 macp->m_multicast_sdu = IBD_DEF_MAX_SDU;
8280 8280 if (state->id_enable_rc) {
8281 8281 macp->m_max_sdu = IBD_DEF_RC_MAX_SDU;
8282 8282 } else {
8283 8283 macp->m_max_sdu = IBD_DEF_MAX_SDU;
8284 8284 }
8285 8285 macp->m_priv_props = ibd_priv_props;
8286 8286
8287 8287 err = mac_register(macp, &state->id_mh);
8288 8288 mac_free(macp);
8289 8289
8290 8290 if (err != 0) {
8291 8291 DPRINT(10, "ibd_create_partition: mac_register() failed %d",
8292 8292 err);
8293 8293 rval = err;
8294 8294 goto fail;
8295 8295 }
8296 8296
8297 8297 err = dls_devnet_create(state->id_mh,
8298 8298 cmd->ioc_partid, crgetzoneid(credp));
8299 8299 if (err != 0) {
8300 8300 DPRINT(10, "ibd_create_partition: dls_devnet_create() failed "
8301 8301 "%d", err);
8302 8302 rval = err;
8303 8303 (void) mac_unregister(state->id_mh);
8304 8304 goto fail;
8305 8305 }
8306 8306
8307 8307 /*
8308 8308 * Add the new partition state structure to the list
8309 8309 */
8310 8310 mutex_enter(&ibd_objlist_lock);
8311 8311 if (ibd_objlist_head)
8312 8312 state->id_next = ibd_objlist_head;
8313 8313
8314 8314 ibd_objlist_head = state;
8315 8315 mutex_exit(&ibd_objlist_lock);
8316 8316
8317 8317 part_create_return:
8318 8318 if (pinfop) {
8319 8319 ibt_free_portinfo(pinfop, pinfosz);
8320 8320 }
8321 8321 return (rval);
8322 8322
8323 8323 fail:
8324 8324 if (pinfop) {
8325 8325 ibt_free_portinfo(pinfop, pinfosz);
8326 8326 }
8327 8327 ibd_part_unattach(state);
8328 8328 kmem_free(state, sizeof (ibd_state_t));
8329 8329 return (rval);
8330 8330 }
8331 8331
8332 8332 /* ARGSUSED */
8333 8333 static int
8334 8334 ibd_delete_partition(void *karg, intptr_t arg, int mode, cred_t *credp,
8335 8335 int *rvalp)
8336 8336 {
8337 8337 int err;
8338 8338 datalink_id_t tmpid;
8339 8339 ibd_state_t *node, *prev;
8340 8340 ibd_delete_ioctl_t *cmd = karg;
8341 8341
8342 8342 prev = NULL;
8343 8343
8344 8344 mutex_enter(&ibd_objlist_lock);
8345 8345 node = ibd_objlist_head;
8346 8346
8347 8347 /* Find the ibd state structure corresponding to the partition */
8348 8348 while (node != NULL) {
8349 8349 if (node->id_plinkid == cmd->ioc_partid)
8350 8350 break;
8351 8351 prev = node;
8352 8352 node = node->id_next;
8353 8353 }
8354 8354
8355 8355 if (node == NULL) {
8356 8356 mutex_exit(&ibd_objlist_lock);
8357 8357 return (ENOENT);
8358 8358 }
8359 8359
8360 8360 if ((err = dls_devnet_destroy(node->id_mh, &tmpid, B_TRUE)) != 0) {
8361 8361 DPRINT(10, "ibd_delete_partition: dls_devnet_destroy() failed "
8362 8362 "%d", err);
8363 8363 mutex_exit(&ibd_objlist_lock);
8364 8364 return (err);
8365 8365 }
8366 8366
8367 8367 /*
8368 8368 * Call ibd_part_unattach() only after making sure that the
8369 8369 * instance has not been started and is not in late HCA init mode.
8370 8370 */
8371 8371 ibd_set_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS);
8372 8372
8373 8373 err = 0;
8374 8374 if ((node->id_mac_state & IBD_DRV_STARTED) ||
8375 8375 (node->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) ||
8376 8376 (ibd_part_busy(node) != DDI_SUCCESS) ||
8377 8377 ((err = mac_disable(node->id_mh)) != 0)) {
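/*
 * The partition is started, in late HCA init, busy, or could not be
 * mac_disable()d: undo the dls_devnet_destroy() above and fail the
 * delete.
 */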
8378 8378 (void) dls_devnet_create(node->id_mh, cmd->ioc_partid,
8379 8379 crgetzoneid(credp));
8380 8380 ibd_clr_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS);
8381 8381 mutex_exit(&ibd_objlist_lock);
8382 8382 return (err != 0 ? err : EBUSY);
8383 8383 }
8384 8384
8385 8385 node->id_mac_state |= IBD_DRV_IN_DELETION;
8386 8386
8387 8387 ibd_part_unattach(node);
8388 8388
8389 8389 ibd_clr_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS);
8390 8390
8391 8391 /* Remove the partition state structure from the linked list */
8392 8392 if (prev == NULL)
8393 8393 ibd_objlist_head = node->id_next;
8394 8394 else
8395 8395 prev->id_next = node->id_next;
8396 8396 mutex_exit(&ibd_objlist_lock);
8397 8397
8398 8398 if ((err = mac_unregister(node->id_mh)) != 0) {
8399 8399 DPRINT(10, "ibd_delete_partition: mac_unregister() failed %d",
8400 8400 err);
8401 8401 }
8402 8402
8403 8403 cv_destroy(&node->id_macst_cv);
8404 8404 mutex_destroy(&node->id_macst_lock);
8405 8405
8406 8406 kmem_free(node, sizeof (ibd_state_t));
8407 8407
8408 8408 return (0);
8409 8409 }
8410 8410
8411 8411 /* ARGSUSED */
8412 8412 static int
8413 8413 ibd_get_partition_info(void *karg, intptr_t arg, int mode, cred_t *cred,
8414 8414 int *rvalp)
8415 8415 {
8416 8416 ibd_ioctl_t cmd;
8417 8417 ibpart_ioctl_t partioc;
8418 8418 ibport_ioctl_t portioc;
8419 8419 #ifdef _MULTI_DATAMODEL
8420 8420 ibport_ioctl32_t portioc32;
8421 8421 #endif
8422 8422 ibd_state_t *state, *port_state;
8423 8423 int size;
8424 8424 ibt_hca_portinfo_t *pinfop = NULL;
8425 8425 ibt_status_t ibt_status;
8426 8426 uint_t psize, pinfosz;
8427 8427 int rval = 0;
8428 8428
8429 8429 size = sizeof (ibd_ioctl_t);
8430 8430 if (ddi_copyin((void *)arg, &cmd, size, mode)) {
8431 8431 return (EFAULT);
8432 8432 }
8433 8433 cmd.ioc_status = 0;
8434 8434 switch (cmd.ioc_info_cmd) {
8435 8435 case IBD_INFO_CMD_IBPART:
8436 8436 size = sizeof (ibpart_ioctl_t);
8437 8437 if (ddi_copyin((void *)arg, &partioc, size, mode)) {
8438 8438 return (EFAULT);
8439 8439 }
8440 8440
8441 8441 mutex_enter(&ibd_objlist_lock);
8442 8442 /* Find the ibd state structure corresponding to the partition */
8443 8443 for (state = ibd_objlist_head; state; state = state->id_next) {
8444 8444 if (state->id_plinkid == cmd.ioc_linkid) {
8445 8445 break;
8446 8446 }
8447 8447 }
8448 8448
8449 8449 if (state == NULL) {
8450 8450 mutex_exit(&ibd_objlist_lock);
8451 8451 return (ENOENT);
8452 8452 }
8453 8453
8454 8454 partioc.ibdioc.ioc_linkid = state->id_dlinkid;
8455 8455 partioc.ibdioc.ioc_port_inst = state->id_port_inst;
8456 8456 partioc.ibdioc.ioc_portnum = state->id_port;
8457 8457 partioc.ibdioc.ioc_hcaguid = state->id_hca_guid;
8458 8458 partioc.ibdioc.ioc_portguid = state->id_port_guid;
8459 8459 partioc.ibdioc.ioc_status = 0;
8460 8460 partioc.ioc_partid = state->id_plinkid;
8461 8461 partioc.ioc_pkey = state->id_pkey;
8462 8462 partioc.ioc_force_create = state->id_force_create;
8463 8463 if (ddi_copyout((void *)&partioc, (void *)arg, size, mode)) {
8464 8464 mutex_exit(&ibd_objlist_lock);
8465 8465 return (EFAULT);
8466 8466 }
8467 8467 mutex_exit(&ibd_objlist_lock);
8468 8468
8469 8469 break;
8470 8470
8471 8471 case IBD_INFO_CMD_IBPORT:
8472 8472 if ((cmd.ioc_port_inst < 0) || ((port_state =
8473 8473 ddi_get_soft_state(ibd_list, cmd.ioc_port_inst)) == NULL)) {
8474 8474 DPRINT(10, "ibd_get_partition_info: failed to get"
8475 8475 " state %d", cmd.ioc_port_inst);
8476 8476 size = sizeof (ibd_ioctl_t);
8477 8477 cmd.ioc_status = IBD_INVALID_PORT_INST;
8478 8478 if (ddi_copyout((void *)&cmd, (void *)arg, size,
8479 8479 mode)) {
8480 8480 return (EFAULT);
8481 8481 }
8482 8482 return (EINVAL);
8483 8483 }
8484 8484 ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl,
8485 8485 port_state->id_port, &pinfop, &psize, &pinfosz);
8486 8486 if ((ibt_status != IBT_SUCCESS) || (psize != 1)) {
8487 8487 return (EINVAL);
8488 8488 }
8489 8489 #ifdef _MULTI_DATAMODEL
8490 8490 switch (ddi_model_convert_from(mode & FMODELS)) {
8491 8491 case DDI_MODEL_ILP32: {
8492 8492 size = sizeof (ibport_ioctl32_t);
8493 8493 if (ddi_copyin((void *)arg, &portioc32, size, mode)) {
8494 8494 rval = EFAULT;
8495 8495 goto fail;
8496 8496 }
8497 8497 portioc32.ibdioc.ioc_status = 0;
8498 8498 portioc32.ibdioc.ioc_portnum = port_state->id_port;
8499 8499 portioc32.ibdioc.ioc_hcaguid =
8500 8500 port_state->id_hca_guid;
8501 8501 portioc32.ibdioc.ioc_portguid =
8502 8502 port_state->id_port_guid;
8503 8503 if (portioc32.ioc_pkey_tbl_sz !=
8504 8504 pinfop->p_pkey_tbl_sz) {
8505 8505 rval = EINVAL;
8506 8506 size = sizeof (ibd_ioctl_t);
8507 8507 portioc32.ibdioc.ioc_status =
8508 8508 IBD_INVALID_PKEY_TBL_SIZE;
8509 8509 if (ddi_copyout((void *)&portioc32.ibdioc,
8510 8510 (void *)arg, size, mode)) {
8511 8511 rval = EFAULT;
8512 8512 goto fail;
8513 8513 }
8514 8514 goto fail;
8515 8515 }
8516 8516 size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t);
8517 8517 if (ddi_copyout((void *)pinfop->p_pkey_tbl,
8518 8518 (void *)(uintptr_t)portioc32.ioc_pkeys, size,
8519 8519 mode)) {
8520 8520 rval = EFAULT;
8521 8521 goto fail;
8522 8522 }
8523 8523 size = sizeof (ibport_ioctl32_t);
8524 8524 if (ddi_copyout((void *)&portioc32, (void *)arg, size,
8525 8525 mode)) {
8526 8526 rval = EFAULT;
8527 8527 goto fail;
8528 8528 }
8529 8529 break;
8530 8530 }
8531 8531 case DDI_MODEL_NONE:
8532 8532 size = sizeof (ibport_ioctl_t);
8533 8533 if (ddi_copyin((void *)arg, &portioc, size, mode)) {
8534 8534 rval = EFAULT;
8535 8535 goto fail;
8536 8536 }
8537 8537 portioc.ibdioc.ioc_status = 0;
8538 8538 portioc.ibdioc.ioc_portnum = port_state->id_port;
8539 8539 portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
8540 8540 portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
8541 8541 if (portioc.ioc_pkey_tbl_sz != pinfop->p_pkey_tbl_sz) {
8542 8542 rval = EINVAL;
8543 8543 size = sizeof (ibd_ioctl_t);
8544 8544 portioc.ibdioc.ioc_status =
8545 8545 IBD_INVALID_PKEY_TBL_SIZE;
8546 8546 if (ddi_copyout((void *)&portioc.ibdioc,
8547 8547 (void *)arg, size, mode)) {
8548 8548 rval = EFAULT;
8549 8549 goto fail;
8550 8550 }
8551 8551 goto fail;
8552 8552 }
8553 8553 size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t);
8554 8554 if (ddi_copyout((void *)pinfop->p_pkey_tbl,
8555 8555 (void *)(portioc.ioc_pkeys), size, mode)) {
8556 8556 rval = EFAULT;
8557 8557 goto fail;
8558 8558 }
8559 8559 size = sizeof (ibport_ioctl_t);
8560 8560 if (ddi_copyout((void *)&portioc, (void *)arg, size,
8561 8561 mode)) {
8562 8562 rval = EFAULT;
8563 8563 goto fail;
8564 8564 }
8565 8565 break;
8566 8566 }
8567 8567 #else /* ! _MULTI_DATAMODEL */
8568 8568 size = sizeof (ibport_ioctl_t);
8569 8569 if (ddi_copyin((void *)arg, &portioc, size, mode)) {
8570 8570 rval = EFAULT;
8571 8571 goto fail;
8572 8572 }
8573 8573 portioc.ibdioc.ioc_status = 0;
8574 8574 portioc.ibdioc.ioc_portnum = port_state->id_port;
8575 8575 portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
8576 8576 portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
8577 8577 if (portioc.ioc_pkey_tbl_sz != pinfop->p_pkey_tbl_sz) {
8578 8578 rval = EINVAL;
8579 8579 size = sizeof (ibd_ioctl_t);
8580 8580 portioc.ibdioc.ioc_status = IBD_INVALID_PKEY_TBL_SIZE;
8581 8581 if (ddi_copyout((void *)&portioc.ibdioc, (void *)arg,
8582 8582 size, mode)) {
8583 8583 rval = EFAULT;
8584 8584 goto fail;
8585 8585 }
8586 8586 goto fail;
8587 8587 }
8588 8588 size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t);
8589 8589 if (ddi_copyout((void *)pinfop->p_pkey_tbl,
8590 8590 (void *)(portioc.ioc_pkeys), size, mode)) {
8591 8591 rval = EFAULT;
8592 8592 goto fail;
8593 8593 }
8594 8594 size = sizeof (ibport_ioctl_t);
8595 8595 if (ddi_copyout((void *)&portioc, (void *)arg, size,
8596 8596 mode)) {
8597 8597 rval = EFAULT;
8598 8598 goto fail;
8599 8599 }
8600 8600 #endif /* _MULTI_DATAMODEL */
8601 8601
8602 8602 break;
8603 8603
8604 8604 case IBD_INFO_CMD_PKEYTBLSZ:
8605 8605 if ((cmd.ioc_port_inst < 0) || ((port_state =
8606 8606 ddi_get_soft_state(ibd_list, cmd.ioc_port_inst)) == NULL)) {
8607 8607 DPRINT(10, "ibd_get_partition_info: failed to get"
8608 8608 " state %d", cmd.ioc_port_inst);
8609 8609 size = sizeof (ibd_ioctl_t);
8610 8610 cmd.ioc_status = IBD_INVALID_PORT_INST;
8611 8611 if (ddi_copyout((void *)&cmd, (void *)arg, size,
8612 8612 mode)) {
8613 8613 return (EFAULT);
8614 8614 }
8615 8615 return (EINVAL);
8616 8616 }
8617 8617 ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl,
8618 8618 port_state->id_port, &pinfop, &psize, &pinfosz);
8619 8619 if ((ibt_status != IBT_SUCCESS) || (psize != 1)) {
8620 8620 return (EINVAL);
8621 8621 }
8622 8622 #ifdef _MULTI_DATAMODEL
8623 8623 switch (ddi_model_convert_from(mode & FMODELS)) {
8624 8624 case DDI_MODEL_ILP32: {
8625 8625 size = sizeof (ibport_ioctl32_t);
8626 8626 if (ddi_copyin((void *)arg, &portioc32, size, mode)) {
8627 8627 rval = EFAULT;
8628 8628 goto fail;
8629 8629 }
8630 8630 portioc32.ibdioc.ioc_status = 0;
8631 8631 portioc32.ibdioc.ioc_portnum = port_state->id_port;
8632 8632 portioc32.ibdioc.ioc_hcaguid =
8633 8633 port_state->id_hca_guid;
8634 8634 portioc32.ibdioc.ioc_portguid =
8635 8635 port_state->id_port_guid;
8636 8636 portioc32.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz;
8637 8637 if (ddi_copyout((void *)&portioc32, (void *)arg, size,
8638 8638 mode)) {
8639 8639 rval = EFAULT;
8640 8640 goto fail;
8641 8641 }
8642 8642 break;
8643 8643 }
8644 8644 case DDI_MODEL_NONE:
8645 8645 size = sizeof (ibport_ioctl_t);
8646 8646 if (ddi_copyin((void *)arg, &portioc, size, mode)) {
8647 8647 rval = EFAULT;
8648 8648 goto fail;
8649 8649 }
8650 8650 portioc.ibdioc.ioc_status = 0;
8651 8651 portioc.ibdioc.ioc_portnum = port_state->id_port;
8652 8652 portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
8653 8653 portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
8654 8654 portioc.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz;
8655 8655 if (ddi_copyout((void *)&portioc, (void *)arg, size,
8656 8656 mode)) {
8657 8657 rval = EFAULT;
8658 8658 goto fail;
8659 8659 }
8660 8660 break;
8661 8661 }
8662 8662 #else /* ! _MULTI_DATAMODEL */
8663 8663 size = sizeof (ibport_ioctl_t);
8664 8664 if (ddi_copyin((void *)arg, &portioc, size, mode)) {
8665 8665 rval = EFAULT;
8666 8666 goto fail;
8667 8667 }
8668 8668 portioc.ibdioc.ioc_status = 0;
8669 8669 portioc.ibdioc.ioc_portnum = port_state->id_port;
8670 8670 portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
8671 8671 portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
8672 8672 portioc.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz;
8673 8673 if (ddi_copyout((void *)&portioc, (void *)arg, size,
8674 8674 mode)) {
8675 8675 rval = EFAULT;
8676 8676 goto fail;
8677 8677 }
8678 8678 #endif /* _MULTI_DATAMODEL */
8679 8679 break;
8680 8680
8681 8681 default:
8682 8682 return (EINVAL);
8683 8683
8684 8684 } /* switch (cmd.ioc_info_cmd) */
8685 8685 fail:
8686 8686 if (pinfop) {
8687 8687 ibt_free_portinfo(pinfop, pinfosz);
8688 8688 }
8689 8689 return (rval);
8690 8690 }
8691 8691
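/*
 * IBTF asynchronous event handler for the port driver. On port up/down
 * events it refreshes the cached link state and, if it has changed,
 * notifies the MAC layer via mac_link_update().
 */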
8692 8692 /* ARGSUSED */
8693 8693 static void
8694 8694 ibdpd_async_handler(void *arg, ibt_hca_hdl_t hca_hdl,
8695 8695 ibt_async_code_t code, ibt_async_event_t *event)
8696 8696 {
8697 8697 ibd_state_t *state = (ibd_state_t *)arg;
8698 8698 link_state_t lstate;
8699 8699
8700 8700 switch (code) {
8701 8701 case IBT_EVENT_PORT_UP:
8702 8702 case IBT_ERROR_PORT_DOWN:
8703 8703 if (ibd_get_port_state(state, &lstate) != 0)
8704 8704 break;
8705 8705
8706 8706 if (state->id_link_state != lstate) {
8707 8707 state->id_link_state = lstate;
8708 8708 mac_link_update(state->id_mh, lstate);
8709 8709 }
8710 8710 break;
8711 8711 default:
8712 8712 break;
8713 8713 }
8714 8714 }
8715 8715
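/*
 * Query the HCA port and translate its state into a link_state_t. As a
 * side effect, refresh the cached SGID and link speed. Returns 0 and
 * sets *lstate on success, -1 otherwise.
 */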
8716 8716 static int
8717 8717 ibd_get_port_state(ibd_state_t *state, link_state_t *lstate)
8718 8718 {
8719 8719 ibt_hca_portinfo_t *port_infop;
8720 8720 uint_t psize, port_infosz;
8721 8721 ibt_status_t ret;
8722 8722
8723 8723 ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
8724 8724 &port_infop, &psize, &port_infosz);
8725 8725 if ((ret != IBT_SUCCESS) || (psize != 1))
8726 8726 return (-1);
8727 8727
8728 8728 state->id_sgid = *port_infop->p_sgid_tbl;
8729 8729 state->id_link_speed = ibd_get_portspeed(state);
8730 8730
8731 8731 if (port_infop->p_linkstate == IBT_PORT_ACTIVE)
8732 8732 *lstate = LINK_STATE_UP;
8733 8733 else
8734 8734 *lstate = LINK_STATE_DOWN;
8735 8735
8736 8736 ibt_free_portinfo(port_infop, port_infosz);
8737 8737 return (0);
8738 8738 }
8739 8739
8740 8740 static int
8741 8741 ibd_port_attach(dev_info_t *dip)
8742 8742 {
8743 8743 ibd_state_t *state;
8744 8744 link_state_t lstate;
8745 8745 int instance;
8746 8746 ibt_status_t ret;
8747 8747
8748 8748 /*
8749 8749 * Allocate softstate structure
8750 8750 */
8751 8751 instance = ddi_get_instance(dip);
8752 8752 if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE) {
8753 8753 DPRINT(10, "ibd_port_attach: ddi_soft_state_zalloc() failed");
8754 8754 return (DDI_FAILURE);
8755 8755 }
8756 8756
8757 8757 state = ddi_get_soft_state(ibd_list, instance);
8758 8758
8759 8759 state->id_dip = dip;
8760 8760 state->id_type = IBD_PORT_DRIVER;
8761 8761
8762 8762 if ((state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
8763 8763 "port-number", 0)) == 0) {
8764 8764 DPRINT(10, "ibd_port_attach: invalid port number (%d)",
8765 8765 state->id_port);
8766 8766 return (DDI_FAILURE);
8767 8767 }
8768 8768 if ((state->id_hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0,
8769 8769 "hca-guid", 0)) == 0) {
8770 8770 DPRINT(10, "ibd_port_attach: hca has invalid guid (0x%llx)",
8771 8771 state->id_hca_guid);
8772 8772 return (DDI_FAILURE);
8773 8773 }
8774 8774 if ((state->id_port_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0,
8775 8775 "port-guid", 0)) == 0) {
8776 8776 DPRINT(10, "ibd_port_attach: port has invalid guid (0x%llx)",
8777 8777 state->id_port_guid);
8778 8778 return (DDI_FAILURE);
8779 8779 }
8780 8780
8781 8781 /*
8782 8782 * Attach to IBTL
8783 8783 */
8784 8784 if ((ret = ibt_attach(&ibdpd_clnt_modinfo, dip, state,
8785 8785 &state->id_ibt_hdl)) != IBT_SUCCESS) {
8786 8786 DPRINT(10, "ibd_port_attach: failed in ibt_attach(), ret=%d",
8787 8787 ret);
8788 8788 goto done;
8789 8789 }
8790 8790
8791 8791 state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE;
8792 8792
8793 8793 if ((ret = ibt_open_hca(state->id_ibt_hdl, state->id_hca_guid,
8794 8794 &state->id_hca_hdl)) != IBT_SUCCESS) {
8795 8795 DPRINT(10, "ibd_port_attach: ibt_open_hca() failed, ret=%d",
8796 8796 ret);
8797 8797 goto done;
8798 8798 }
8799 8799 state->id_mac_state |= IBD_DRV_HCA_OPENED;
8800 8800
8801 8801 /* Update link status */
8802 8802
8803 8803 if (ibd_get_port_state(state, &lstate) != 0) {
8804 8804 DPRINT(10,
8805 8805 "ibd_port_attach: ibd_get_port_state() failed");
8806 8806 goto done;
8807 8807 }
8808 8808 state->id_link_state = lstate;
8809 8809 /*
8810 8810 * Register ibd interfaces with the Nemo framework
8811 8811 */
8812 8812 if (ibd_register_mac(state, dip) != IBT_SUCCESS) {
8813 8813 DPRINT(10, "ibd_port_attach: failed in ibd_register_mac()");
8814 8814 goto done;
8815 8815 }
8816 8816 state->id_mac_state |= IBD_DRV_MAC_REGISTERED;
8817 8817
8818 8818 mac_link_update(state->id_mh, lstate);
8819 8819
8820 8820 return (DDI_SUCCESS);
8821 8821 done:
8822 8822 (void) ibd_port_unattach(state, dip);
8823 8823 return (DDI_FAILURE);
8824 8824 }
8825 8825
8826 8826 static int
8827 8827 ibd_port_unattach(ibd_state_t *state, dev_info_t *dip)
8828 8828 {
8829 8829 int instance;
8830 8830 uint32_t progress = state->id_mac_state;
8831 8831 ibt_status_t ret;
8832 8832
8833 8833 if (progress & IBD_DRV_MAC_REGISTERED) {
8834 8834 (void) mac_unregister(state->id_mh);
8835 8835 state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED);
8836 8836 }
8837 8837
8838 8838 if (progress & IBD_DRV_HCA_OPENED) {
8839 8839 if ((ret = ibt_close_hca(state->id_hca_hdl)) !=
8840 8840 IBT_SUCCESS) {
8841 8841 ibd_print_warn(state, "failed to close "
8842 8842 "HCA device, ret=%d", ret);
8843 8843 }
8844 8844 state->id_hca_hdl = NULL;
8845 8845 state->id_mac_state &= (~IBD_DRV_HCA_OPENED);
8846 8846 }
8847 8847
8848 8848 if (progress & IBD_DRV_IBTL_ATTACH_DONE) {
8849 8849 if ((ret = ibt_detach(state->id_ibt_hdl)) != IBT_SUCCESS) {
8850 8850 ibd_print_warn(state,
8851 8851 "ibt_detach() failed, ret=%d", ret);
8852 8852 }
8853 8853 state->id_ibt_hdl = NULL;
8854 8854 state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE);
8855 8855 }
8856 8856 instance = ddi_get_instance(dip);
8857 8857 ddi_soft_state_free(ibd_list, instance);
8858 8858
8859 8859 return (DDI_SUCCESS);
8860 8860 }
8861 8861
8862 8862 ibt_status_t
8863 8863 ibd_get_part_attr(datalink_id_t linkid, ibt_part_attr_t *attr)
8864 8864 {
8865 8865 ibd_state_t *state;
8866 8866
8867 8867 mutex_enter(&ibd_objlist_lock);
8868 8868
8869 8869 /* Find the ibd state structure corresponding to the partition */
8870 8870 for (state = ibd_objlist_head; state; state = state->id_next) {
8871 8871 if (state->id_plinkid == linkid) {
8872 8872 break;
8873 8873 }
8874 8874 }
8875 8875
8876 8876 if (state == NULL) {
8877 8877 mutex_exit(&ibd_objlist_lock);
8878 8878 return (IBT_NO_SUCH_OBJECT);
8879 8879 }
8880 8880
8881 8881 attr->pa_dlinkid = state->id_dlinkid;
8882 8882 attr->pa_plinkid = state->id_plinkid;
8883 8883 attr->pa_port = state->id_port;
8884 8884 attr->pa_hca_guid = state->id_hca_guid;
8885 8885 attr->pa_port_guid = state->id_port_guid;
8886 8886 attr->pa_pkey = state->id_pkey;
8887 8887
8888 8888 mutex_exit(&ibd_objlist_lock);
8889 8889
8890 8890 return (IBT_SUCCESS);
8891 8891 }
8892 8892
8893 8893 ibt_status_t
8894 8894 ibd_get_all_part_attr(ibt_part_attr_t **attr_list, int *nparts)
8895 8895 {
8896 8896 ibd_state_t *state;
8897 8897 int n = 0;
8898 8898 ibt_part_attr_t *attr;
8899 8899
8900 8900 mutex_enter(&ibd_objlist_lock);
8901 8901
8902 8902 for (state = ibd_objlist_head; state; state = state->id_next)
8903 8903 n++;
8904 8904
8905 8905 *nparts = n;
8906 8906 if (n == 0) {
8907 8907 *attr_list = NULL;
8908 8908 mutex_exit(&ibd_objlist_lock);
8909 8909 return (IBT_SUCCESS);
8910 8910 }
8911 8911
8912 8912 *attr_list = kmem_alloc(sizeof (ibt_part_attr_t) * n, KM_SLEEP);
8913 8913 attr = *attr_list;
8914 8914 for (state = ibd_objlist_head; state; state = state->id_next) {
8915 8915 #ifdef DEBUG
8916 8916 ASSERT(n > 0);
8917 8917 n--;
8918 8918 #endif
8919 8919 attr->pa_dlinkid = state->id_dlinkid;
8920 8920 attr->pa_plinkid = state->id_plinkid;
8921 8921 attr->pa_port = state->id_port;
8922 8922 attr->pa_hca_guid = state->id_hca_guid;
8923 8923 attr->pa_port_guid = state->id_port_guid;
8924 8924 attr->pa_pkey = state->id_pkey;
8925 8925 attr++;
8926 8926 }
8927 8927
8928 8928 mutex_exit(&ibd_objlist_lock);
8929 8929 return (IBT_SUCCESS);
8930 8930 }
8454 lines elided
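Below is a minimal, hypothetical sketch of a kernel consumer of
ibd_get_all_part_attr(). The function name example_list_partitions and the
cleanup via kmem_free() (sized to mirror the kmem_alloc() in
ibd_get_all_part_attr()) are assumptions made for illustration; they are not
part of the driver source above.

/*
 * Hypothetical consumer sketch (illustration only): enumerate all IB
 * partition links and log their attributes.
 */
static void
example_list_partitions(void)
{
	ibt_part_attr_t *attrs;
	int nparts, i;

	if (ibd_get_all_part_attr(&attrs, &nparts) != IBT_SUCCESS)
		return;

	for (i = 0; i < nparts; i++) {
		cmn_err(CE_CONT, "partition link %d: pkey 0x%x, port %d\n",
		    (int)attrs[i].pa_plinkid, attrs[i].pa_pkey,
		    attrs[i].pa_port);
	}

	/* Assumed cleanup: mirrors the kmem_alloc() in the provider. */
	if (nparts > 0)
		kmem_free(attrs, sizeof (ibt_part_attr_t) * nparts);
}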