1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26 /*
27 * An implementation of the IPoIB standard based on PSARC 2001/289.
28 */
29
30 #include <sys/types.h>
31 #include <sys/conf.h>
32 #include <sys/ddi.h>
33 #include <sys/sunddi.h>
34 #include <sys/modctl.h>
35 #include <sys/stropts.h>
36 #include <sys/stream.h>
37 #include <sys/strsun.h>
38 #include <sys/strsubr.h>
39 #include <sys/dlpi.h>
40 #include <sys/mac_provider.h>
41
42 #include <sys/pattr.h> /* for HCK_FULLCKSUM */
43 #include <sys/sysmacros.h> /* for offsetof */
44 #include <sys/disp.h> /* for async thread pri */
45 #include <sys/atomic.h> /* for atomic_add*() */
46 #include <sys/ethernet.h> /* for ETHERTYPE_IPV6 */
47 #include <netinet/in.h> /* for netinet/ip.h below */
48 #include <netinet/ip.h> /* for struct ip */
49 #include <netinet/udp.h> /* for struct udphdr */
50 #include <inet/common.h> /* for inet/ip.h below */
51 #include <inet/ip.h> /* for ipha_t */
52 #include <inet/ip6.h> /* for ip6_t */
53 #include <inet/tcp.h> /* for tcph_t */
54 #include <netinet/icmp6.h> /* for icmp6_t */
55 #include <sys/callb.h>
56 #include <sys/modhash.h>
57
58 #include <sys/ib/clients/ibd/ibd.h>
59 #include <sys/ib/mgt/sm_attr.h> /* for SM_INIT_TYPE_* */
60 #include <sys/note.h>
61 #include <sys/multidata.h>
62
63 #include <sys/ib/mgt/ibmf/ibmf.h> /* for ibd_get_portspeed */
64
65 #include <sys/priv_names.h>
66 #include <sys/dls.h>
67 #include <sys/dld_ioc.h>
68 #include <sys/policy.h>
69 #include <sys/ibpart.h>
70 #include <sys/file.h>
71
72 /*
73 * The write-up below includes details on the following:
74 * 1. The dladm administrative model.
75 * 2. Late HCA initialization feature.
 * 3. Brussels support and its implications for the current architecture.
77 *
78 * 1. The dladm administrative model.
79 * ------------------------------------------
80 * With the dladm model, ibnex will create one ibd instance per port. These
 * instances will be created independently of the port state.
82 *
 * The ibd driver is two-faceted: one side of it works as the port driver and
 * the other as the partition object driver.
85 *
86 * The port instance is a child of the HCA, and will have an entry in the devfs.
87 * A DDI attach only happens for the port driver, and its attach is
 * handled in ibd_port_attach(). Similarly, a DDI detach for the port driver is
89 * handled in ibd_port_unattach().
90 *
91 * The partition object is only a registrant to the mac layer via mac_register()
92 * and does not have an entry in the device tree. There is no DDI softstate
93 * managed by the DDI framework for the partition objects. However, the state is
94 * managed inside the ibd driver, and every partition object hangs off the
95 * "ibd_objlist_head".
96 *
97 * The partition object first comes into existence when a user runs the
98 * 'create-part' subcommand of dladm. This is like invoking the attach entry
99 * point of the partition object. The partition object goes away with the
100 * 'delete-part' subcommand of dladm. This is like invoking the detach entry
101 * point of the partition object.
102 *
103 * The create-part and delete-part subcommands result in dld ioctls that end up
 * calling ibd_create_partition() and ibd_delete_partition() respectively.
 * These ioctls are registered with the dld layer in _init() via a call to
106 * dld_ioc_register().
107 *
 * The port instance by itself cannot be plumbed. Only the partition
 * objects can be plumbed; they alone participate in I/O, not the
 * port driver.
111 *
112 * There are some info ioctls supported in ibd which are used by dladm(1M) to
113 * display useful information. The info entry point for ibd is
114 * ibd_get_partition_info().
115 *
116 * 2. Late HCA initialization feature.
117 * ------------------------------------
118 * As mentioned in section 1, the user creates the partition objects via
119 * dladm(1M). It is possible that:
120 * a) The physical port itself is down and the SM cannot be reached.
 * b) The PKEY specified by the user has not been created in the SM yet.
122 * c) An IPoIB broadcast group for the specified PKEY is not present.
123 *
124 * In all of the above cases, complete initialization of the partition object is
 * not possible. However, the new model allows the creation of partition
 * objects even in such cases, deferring their initialization until later.
127 * When such a partition object is plumbed, the link state will be displayed as
128 * "down".
129 * The driver, at this point, is listening to events that herald the
130 * availability of resources -
131 * i) LINK_UP when the link becomes available
132 * ii) PORT_CHANGE when the PKEY has been created
133 * iii) MCG_CREATED when the IPoIB broadcast group for the given pkey has been
134 * created
135 * via ibd_async_handler() for events i) and ii), and via
136 * ibd_snet_notices_handler() for iii.
137 * The driver handles these events (as and when they arrive) and completes the
138 * initialization of the partition object and transitions it to a usable state.
139 *
 * 3. Brussels support and its implications for the current architecture.
 * ---------------------------------------------------------------------
 * The Brussels support introduces two new interfaces to the ibd driver -
143 * ibd_m_getprop() and ibd_m_setprop().
144 * These interfaces allow setting and retrieval of certain properties.
 * Some of them are public properties while most others are private
 * properties meant to be used by developers. Tuning the latter kind can
 * cause performance issues and should not be done without understanding the
148 * implications. All properties are specific to an instance of either the
149 * partition object or the port driver.
150 *
 * The public properties are: mtu and linkmode.
152 * mtu is a read-only property.
153 * linkmode can take two values - UD and CM.
154 *
155 * Changing the linkmode requires some bookkeeping in the driver. The
156 * capabilities need to be re-reported to the mac layer. This is done by
157 * calling mac_capab_update(). The maxsdu is updated by calling
158 * mac_maxsdu_update2().
159 * The private properties retain their values across the change of linkmode.
160 * NOTE:
161 * - The port driver does not support any property apart from mtu.
162 * - All other properties are only meant for the partition object.
163 * - The properties cannot be set when an instance is plumbed. The
164 * instance has to be unplumbed to effect any setting.
165 */
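
/*
 * For reference, a typical administrative sequence corresponding to the
 * model above (a sketch only; see dladm(1M) for the authoritative syntax,
 * and note that the link and partition names below are examples):
 *
 *	# dladm create-part -l ibp0 -P 0xffff part.ffff.ibp0
 *	# dladm show-part
 *	# dladm delete-part part.ffff.ibp0
 *
 * create-part and delete-part reach this driver through the dld ioctls
 * registered in _init(), i.e. ibd_create_partition() and
 * ibd_delete_partition() respectively.
 */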
166
167 /*
168 * Driver wide tunables
169 *
170 * ibd_tx_softintr
171 * ibd_rx_softintr
172 * The softintr mechanism allows ibd to avoid event queue overflows if
 * the receive/completion handlers are expensive. These are enabled
174 * by default.
175 *
176 * ibd_log_sz
177 * This specifies the size of the ibd log buffer in bytes. The buffer is
178 * allocated and logging is enabled only when IBD_LOGGING is defined.
179 *
180 */
181 uint_t ibd_rx_softintr = 1;
182 uint_t ibd_tx_softintr = 1;
183
184 #ifdef IBD_LOGGING
185 uint_t ibd_log_sz = 0x20000;
186 #endif
187
188 #ifdef IBD_LOGGING
189 #define IBD_LOG_SZ ibd_log_sz
190 #endif
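
/*
 * A sketch of how these tunables could be overridden at boot via
 * /etc/system (illustrative only; use the module name the driver is
 * actually delivered under, e.g. "ibp" per the mac_init_ops() call
 * in _init()):
 *
 *	set ibp:ibd_rx_softintr = 0
 *	set ibp:ibd_tx_softintr = 0
 */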
191
192 /* Post IBD_RX_POST_CNT receive work requests at a time. */
193 #define IBD_RX_POST_CNT 8
194
/* Hash into (1 << IBD_LOG_RX_POST) rx post queues */
196 #define IBD_LOG_RX_POST 4
197
/* Minimum number of receive work requests the driver needs to always have */
199 #define IBD_RWQE_MIN ((IBD_RX_POST_CNT << IBD_LOG_RX_POST) * 4)
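/* With the values above this works out to (8 << 4) * 4 = 512 rwqes. */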
200
201 /*
202 * LSO parameters
203 */
204 #define IBD_LSO_MAXLEN 65536
205 #define IBD_LSO_BUFSZ 8192
206
207 /*
208 * Async operation states
209 */
210 #define IBD_OP_NOTSTARTED 0
211 #define IBD_OP_ONGOING 1
212 #define IBD_OP_COMPLETED 2
213 #define IBD_OP_ERRORED 3
214 #define IBD_OP_ROUTERED 4
215
216 /*
217 * Start/stop in-progress flags; note that restart must always remain
218 * the OR of start and stop flag values.
219 */
220 #define IBD_DRV_START_IN_PROGRESS 0x10000000
221 #define IBD_DRV_STOP_IN_PROGRESS 0x20000000
222 #define IBD_DRV_RESTART_IN_PROGRESS 0x30000000
223 #define IBD_DRV_DELETE_IN_PROGRESS IBD_DRV_RESTART_IN_PROGRESS
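/* Indeed, 0x10000000 | 0x20000000 == 0x30000000, as required above. */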
224
225 /*
226 * Miscellaneous constants
227 */
228 #define IB_MGID_IPV4_LOWGRP_MASK 0xFFFFFFFF
229 #define IBD_DEF_MAX_SDU 2044
230 #define IBD_DEF_MAX_MTU (IBD_DEF_MAX_SDU + IPOIB_HDRSIZE)
231 #define IBD_DEF_RC_MAX_SDU 65520
232 #define IBD_DEF_RC_MAX_MTU (IBD_DEF_RC_MAX_SDU + IPOIB_HDRSIZE)
233 #define IBD_DEFAULT_QKEY 0xB1B
234 #ifdef IBD_LOGGING
235 #define IBD_DMAX_LINE 100
236 #endif
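
/*
 * Assuming IPOIB_HDRSIZE is the 4-byte IPoIB encapsulation header, the
 * derived maximums above work out to IBD_DEF_MAX_MTU = 2048 and
 * IBD_DEF_RC_MAX_MTU = 65524.
 */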
237
238 /*
239 * Enumerations for link states
240 */
241 typedef enum {
242 IBD_LINK_DOWN,
243 IBD_LINK_UP,
244 IBD_LINK_UP_ABSENT
245 } ibd_link_op_t;
246
247 /*
248 * Driver State Pointer
249 */
250 void *ibd_list;
251
252 /*
253 * Driver Global Data
254 */
255 ibd_global_state_t ibd_gstate;
256
257 /*
258 * Partition object list
259 */
260 ibd_state_t *ibd_objlist_head = NULL;
261 kmutex_t ibd_objlist_lock;
262
263 int ibd_rc_conn_timeout = 60 * 10; /* 10 minutes */
264
265 /*
266 * Logging
267 */
268 #ifdef IBD_LOGGING
269 kmutex_t ibd_lbuf_lock;
270 uint8_t *ibd_lbuf;
271 uint32_t ibd_lbuf_ndx;
272 #endif
273
274 /*
275 * Required system entry points
276 */
277 static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
278 static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
279
280 /*
281 * Required driver entry points for GLDv3
282 */
283 static int ibd_m_stat(void *, uint_t, uint64_t *);
284 static int ibd_m_start(void *);
285 static void ibd_m_stop(void *);
286 static int ibd_m_promisc(void *, boolean_t);
287 static int ibd_m_multicst(void *, boolean_t, const uint8_t *);
288 static int ibd_m_unicst(void *, const uint8_t *);
289 static mblk_t *ibd_m_tx(void *, mblk_t *);
290 static boolean_t ibd_m_getcapab(void *, mac_capab_t, void *);
291
292 static int ibd_m_setprop(void *, const char *, mac_prop_id_t, uint_t,
293 const void *);
294 static int ibd_m_getprop(void *, const char *, mac_prop_id_t, uint_t, void *);
295 static void ibd_m_propinfo(void *, const char *, mac_prop_id_t,
296 mac_prop_info_handle_t);
297 static int ibd_set_priv_prop(ibd_state_t *, const char *, uint_t,
298 const void *);
299 static int ibd_get_priv_prop(ibd_state_t *, const char *, uint_t, void *);
300
301 /*
302 * Private driver entry points for GLDv3
303 */
304
305 /*
306 * Initialization
307 */
308 static int ibd_state_init(ibd_state_t *, dev_info_t *);
309 static int ibd_init_txlist(ibd_state_t *);
310 static int ibd_init_rxlist(ibd_state_t *);
311 static int ibd_acache_init(ibd_state_t *);
312 #ifdef IBD_LOGGING
313 static void ibd_log_init(void);
314 #endif
315
316 /*
317 * Termination/cleanup
318 */
319 static void ibd_state_fini(ibd_state_t *);
320 static void ibd_fini_txlist(ibd_state_t *);
321 static void ibd_fini_rxlist(ibd_state_t *);
322 static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *);
323 static void ibd_tx_cleanup_list(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *);
324 static void ibd_acache_fini(ibd_state_t *);
325 #ifdef IBD_LOGGING
326 static void ibd_log_fini(void);
327 #endif
328
329 /*
330 * Allocation/acquire/map routines
331 */
332 static int ibd_alloc_tx_copybufs(ibd_state_t *);
333 static int ibd_alloc_rx_copybufs(ibd_state_t *);
334 static int ibd_alloc_tx_lsobufs(ibd_state_t *);
335 static ibd_swqe_t *ibd_acquire_swqe(ibd_state_t *);
336 static int ibd_acquire_lsobufs(ibd_state_t *, uint_t, ibt_wr_ds_t *,
337 uint32_t *);
338
339 /*
340 * Free/release/unmap routines
341 */
342 static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *);
343 static void ibd_free_tx_copybufs(ibd_state_t *);
344 static void ibd_free_rx_copybufs(ibd_state_t *);
345 static void ibd_free_rx_rsrcs(ibd_state_t *);
346 static void ibd_free_tx_lsobufs(ibd_state_t *);
347 static void ibd_release_swqe(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *, int);
348 static void ibd_release_lsobufs(ibd_state_t *, ibt_wr_ds_t *, uint32_t);
349 static void ibd_free_lsohdr(ibd_swqe_t *, mblk_t *);
350
351 /*
352 * Handlers/callback routines
353 */
354 static uint_t ibd_intr(caddr_t);
355 static uint_t ibd_tx_recycle(caddr_t);
356 static void ibd_rcq_handler(ibt_cq_hdl_t, void *);
357 static void ibd_scq_handler(ibt_cq_hdl_t, void *);
358 static void ibd_poll_rcq(ibd_state_t *, ibt_cq_hdl_t);
359 static void ibd_poll_scq(ibd_state_t *, ibt_cq_hdl_t);
360 static void ibd_drain_rcq(ibd_state_t *, ibt_cq_hdl_t);
361 static void ibd_drain_scq(ibd_state_t *, ibt_cq_hdl_t);
362 static void ibd_freemsg_cb(char *);
363 static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
364 ibt_async_event_t *);
365 static void ibdpd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
366 ibt_async_event_t *);
367 static void ibd_snet_notices_handler(void *, ib_gid_t,
368 ibt_subnet_event_code_t, ibt_subnet_event_t *);
369
370 /*
371 * Send/receive routines
372 */
373 static boolean_t ibd_send(ibd_state_t *, mblk_t *);
374 static void ibd_post_send(ibd_state_t *, ibd_swqe_t *);
375 static void ibd_post_recv(ibd_state_t *, ibd_rwqe_t *);
376 static mblk_t *ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *);
377
378 /*
379 * Threads
380 */
381 static void ibd_async_work(ibd_state_t *);
382
383 /*
384 * Async tasks
385 */
386 static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *);
387 static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int);
388 static void ibd_async_setprom(ibd_state_t *);
389 static void ibd_async_unsetprom(ibd_state_t *);
390 static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t);
391 static void ibd_async_trap(ibd_state_t *, ibd_req_t *);
392 static void ibd_async_txsched(ibd_state_t *);
393 static void ibd_async_link(ibd_state_t *, ibd_req_t *);
394
395 /*
396 * Async task helpers
397 */
398 static ibd_mce_t *ibd_async_mcache(ibd_state_t *, ipoib_mac_t *, boolean_t *);
399 static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t);
400 static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *);
401 static boolean_t ibd_get_allroutergroup(ibd_state_t *,
402 ipoib_mac_t *, ipoib_mac_t *);
403 static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t);
404 static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *);
405 static ibt_status_t ibd_iba_join(ibd_state_t *, ib_gid_t, ibd_mce_t *);
406 static ibt_status_t ibd_find_bgroup(ibd_state_t *);
407 static void ibd_n2h_gid(ipoib_mac_t *, ib_gid_t *);
408 static void ibd_h2n_mac(ipoib_mac_t *, ib_qpn_t, ib_sn_prefix_t, ib_guid_t);
409 static uint64_t ibd_get_portspeed(ibd_state_t *);
410 static boolean_t ibd_async_safe(ibd_state_t *);
411 static void ibd_async_done(ibd_state_t *);
412 static ibd_ace_t *ibd_acache_lookup(ibd_state_t *, ipoib_mac_t *, int *, int);
413 static ibd_ace_t *ibd_acache_get_unref(ibd_state_t *);
414 static void ibd_link_mod(ibd_state_t *, ibt_async_code_t);
415 static int ibd_locate_pkey(ib_pkey_t *, uint16_t, ib_pkey_t, uint16_t *);
416
417 /*
418 * Helpers for attach/start routines
419 */
420 static int ibd_register_mac(ibd_state_t *, dev_info_t *);
421 static int ibd_record_capab(ibd_state_t *);
422 static int ibd_get_port_details(ibd_state_t *);
423 static int ibd_alloc_cqs(ibd_state_t *);
424 static int ibd_setup_ud_channel(ibd_state_t *);
425 static int ibd_start(ibd_state_t *);
426 static int ibd_undo_start(ibd_state_t *, link_state_t);
427 static void ibd_set_mac_progress(ibd_state_t *, uint_t);
428 static void ibd_clr_mac_progress(ibd_state_t *, uint_t);
429 static int ibd_part_attach(ibd_state_t *state, dev_info_t *dip);
430 static void ibd_part_unattach(ibd_state_t *state);
431 static int ibd_port_attach(dev_info_t *);
432 static int ibd_port_unattach(ibd_state_t *state, dev_info_t *dip);
433 static int ibd_get_port_state(ibd_state_t *, link_state_t *);
434 static int ibd_part_busy(ibd_state_t *);
435
436 /*
437 * Miscellaneous helpers
438 */
439 static int ibd_sched_poll(ibd_state_t *, int, int);
440 static void ibd_resume_transmission(ibd_state_t *);
441 static int ibd_setup_lso(ibd_swqe_t *, mblk_t *, uint32_t, ibt_ud_dest_hdl_t);
442 static int ibd_prepare_sgl(ibd_state_t *, mblk_t *, ibd_swqe_t *, uint_t);
443 static void *list_get_head(list_t *);
444 static int ibd_hash_key_cmp(mod_hash_key_t, mod_hash_key_t);
445 static uint_t ibd_hash_by_id(void *, mod_hash_key_t);
446
447 ibt_status_t ibd_get_part_attr(datalink_id_t, ibt_part_attr_t *);
448 ibt_status_t ibd_get_all_part_attr(ibt_part_attr_t **, int *);
449
450 #ifdef IBD_LOGGING
451 static void ibd_log(const char *, ...);
452 #endif
453
454 DDI_DEFINE_STREAM_OPS(ibd_dev_ops, nulldev, nulldev, ibd_attach, ibd_detach,
455 nodev, NULL, D_MP, NULL, ddi_quiesce_not_needed);
456
457 /* Module Driver Info */
458 static struct modldrv ibd_modldrv = {
459 &mod_driverops, /* This one is a driver */
460 "InfiniBand GLDv3 Driver", /* short description */
461 &ibd_dev_ops /* driver specific ops */
462 };
463
464 /* Module Linkage */
465 static struct modlinkage ibd_modlinkage = {
466 MODREV_1, (void *)&ibd_modldrv, NULL
467 };
468
469 /*
470 * Module (static) info passed to IBTL during ibt_attach
471 */
472 static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = {
473 IBTI_V_CURR,
474 IBT_NETWORK,
475 ibd_async_handler,
476 NULL,
477 "IBPART"
478 };
479
480 static struct ibt_clnt_modinfo_s ibdpd_clnt_modinfo = {
481 IBTI_V_CURR,
482 IBT_NETWORK,
483 ibdpd_async_handler,
484 NULL,
485 "IPIB"
486 };
487
488 /*
489 * GLDv3 entry points
490 */
491 #define IBD_M_CALLBACK_FLAGS \
492 (MC_GETCAPAB | MC_SETPROP | MC_GETPROP | MC_PROPINFO)
493
494 static mac_callbacks_t ibd_m_callbacks = {
495 IBD_M_CALLBACK_FLAGS,
496 ibd_m_stat,
497 ibd_m_start,
498 ibd_m_stop,
499 ibd_m_promisc,
500 ibd_m_multicst,
501 ibd_m_unicst,
502 ibd_m_tx,
503 NULL,
504 NULL,
505 ibd_m_getcapab,
506 NULL,
507 NULL,
508 ibd_m_setprop,
509 ibd_m_getprop,
510 ibd_m_propinfo
511 };
512
513 /* Private properties */
514 char *ibd_priv_props[] = {
515 "_ibd_broadcast_group",
516 "_ibd_coalesce_completions",
517 "_ibd_create_broadcast_group",
518 "_ibd_hash_size",
519 "_ibd_lso_enable",
520 "_ibd_num_ah",
521 "_ibd_num_lso_bufs",
522 "_ibd_rc_enable_srq",
523 "_ibd_rc_num_rwqe",
524 "_ibd_rc_num_srq",
525 "_ibd_rc_num_swqe",
526 "_ibd_rc_rx_comp_count",
527 "_ibd_rc_rx_comp_usec",
528 "_ibd_rc_rx_copy_thresh",
529 "_ibd_rc_rx_rwqe_thresh",
530 "_ibd_rc_tx_comp_count",
531 "_ibd_rc_tx_comp_usec",
532 "_ibd_rc_tx_copy_thresh",
533 "_ibd_ud_num_rwqe",
534 "_ibd_ud_num_swqe",
535 "_ibd_ud_rx_comp_count",
536 "_ibd_ud_rx_comp_usec",
537 "_ibd_ud_tx_comp_count",
538 "_ibd_ud_tx_comp_usec",
539 "_ibd_ud_tx_copy_thresh",
540 NULL
541 };
542
543 static int ibd_create_partition(void *, intptr_t, int, cred_t *, int *);
544 static int ibd_delete_partition(void *, intptr_t, int, cred_t *, int *);
545 static int ibd_get_partition_info(void *, intptr_t, int, cred_t *, int *);
546
547 static dld_ioc_info_t ibd_dld_ioctl_list[] = {
548 {IBD_CREATE_IBPART, DLDCOPYINOUT, sizeof (ibpart_ioctl_t),
549 ibd_create_partition, secpolicy_dl_config},
550 {IBD_DELETE_IBPART, DLDCOPYIN, sizeof (ibpart_ioctl_t),
551 ibd_delete_partition, secpolicy_dl_config},
552 {IBD_INFO_IBPART, DLDCOPYIN, sizeof (ibd_ioctl_t),
553 ibd_get_partition_info, NULL}
554 };
555
556 /*
557 * Fill/clear <scope> and <p_key> in multicast/broadcast address
558 */
559 #define IBD_FILL_SCOPE_PKEY(maddr, scope, pkey) \
560 { \
561 *(uint32_t *)((char *)(maddr) + 4) |= \
562 htonl((uint32_t)(scope) << 16); \
563 *(uint32_t *)((char *)(maddr) + 8) |= \
564 htonl((uint32_t)(pkey) << 16); \
565 }
566
567 #define IBD_CLEAR_SCOPE_PKEY(maddr) \
568 { \
569 *(uint32_t *)((char *)(maddr) + 4) &= \
570 htonl(~((uint32_t)0xF << 16)); \
571 *(uint32_t *)((char *)(maddr) + 8) &= \
572 htonl(~((uint32_t)0xFFFF << 16)); \
573 }
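
/*
 * Illustrative use of the macros above (a sketch, not code lifted from this
 * driver; "mac" and "state" stand for the usual ipoib_mac_t pointer and
 * per-instance softstate): stamp the interface's scope and pkey into a copy
 * of an IPoIB multicast address before a group lookup, then clear them:
 *
 *	ipoib_mac_t mcast;
 *
 *	bcopy(mac, &mcast, IPOIB_ADDRL);
 *	IBD_FILL_SCOPE_PKEY(&mcast, state->id_scope, state->id_pkey);
 *	(look up or join the group using &mcast)
 *	IBD_CLEAR_SCOPE_PKEY(&mcast);
 *
 * The scope nibble lands in the second byte of the MGID and the pkey in
 * MGID bytes 4-5, matching the IPoIB multicast GID layout.
 */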
574
575 /*
576 * Rudimentary debugging support
577 */
578 #ifdef DEBUG
579 int ibd_debuglevel = 100;
580 void
581 debug_print(int l, char *fmt, ...)
582 {
583 va_list ap;
584
585 if (l < ibd_debuglevel)
586 return;
587 va_start(ap, fmt);
588 vcmn_err(CE_CONT, fmt, ap);
589 va_end(ap);
590 }
591 #endif
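
/*
 * Note that debug_print() only emits messages logged at a level at or above
 * ibd_debuglevel, so with the default of 100 nothing is printed. One way to
 * make a DEBUG kernel chattier (illustrative only) is to lower the threshold
 * with mdb -kw:
 *
 *	> ibd_debuglevel/W 0t4
 */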
592
593 /*
594 * Common routine to print warning messages; adds in hca guid, port number
595 * and pkey to be able to identify the IBA interface.
596 */
597 void
598 ibd_print_warn(ibd_state_t *state, char *fmt, ...)
599 {
600 ib_guid_t hca_guid;
601 char ibd_print_buf[MAXNAMELEN + 256];
602 int len;
603 va_list ap;
604 char part_name[MAXNAMELEN];
605 datalink_id_t linkid = state->id_plinkid;
606
607 hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip,
608 0, "hca-guid", 0);
609 (void) dls_mgmt_get_linkinfo(linkid, part_name, NULL, NULL, NULL);
610 len = snprintf(ibd_print_buf, sizeof (ibd_print_buf),
611 "%s%d: HCA GUID %016llx port %d PKEY %02x link %s ",
612 ddi_driver_name(state->id_dip), ddi_get_instance(state->id_dip),
613 (u_longlong_t)hca_guid, state->id_port, state->id_pkey,
614 part_name);
615 va_start(ap, fmt);
616 (void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len,
617 fmt, ap);
618 cmn_err(CE_NOTE, "!%s", ibd_print_buf);
619 va_end(ap);
620 }
621
622 int
623 _init()
624 {
625 int status;
626
627 status = ddi_soft_state_init(&ibd_list, max(sizeof (ibd_state_t),
628 PAGESIZE), 0);
629 if (status != 0) {
630 DPRINT(10, "_init:failed in ddi_soft_state_init()");
631 return (status);
632 }
633
634 mutex_init(&ibd_objlist_lock, NULL, MUTEX_DRIVER, NULL);
635
636 mac_init_ops(&ibd_dev_ops, "ibp");
637 status = mod_install(&ibd_modlinkage);
638 if (status != 0) {
639 DPRINT(10, "_init:failed in mod_install()");
640 ddi_soft_state_fini(&ibd_list);
641 mac_fini_ops(&ibd_dev_ops);
642 return (status);
643 }
644
645 mutex_init(&ibd_gstate.ig_mutex, NULL, MUTEX_DRIVER, NULL);
646 mutex_enter(&ibd_gstate.ig_mutex);
647 ibd_gstate.ig_ibt_hdl = NULL;
648 ibd_gstate.ig_ibt_hdl_ref_cnt = 0;
649 ibd_gstate.ig_service_list = NULL;
650 mutex_exit(&ibd_gstate.ig_mutex);
651
652 if (dld_ioc_register(IBPART_IOC, ibd_dld_ioctl_list,
653 DLDIOCCNT(ibd_dld_ioctl_list)) != 0) {
654 return (EIO);
655 }
656
657 ibt_register_part_attr_cb(ibd_get_part_attr, ibd_get_all_part_attr);
658
659 #ifdef IBD_LOGGING
660 ibd_log_init();
661 #endif
662 return (0);
663 }
664
665 int
666 _info(struct modinfo *modinfop)
667 {
668 return (mod_info(&ibd_modlinkage, modinfop));
669 }
670
671 int
672 _fini()
673 {
674 int status;
675
676 status = mod_remove(&ibd_modlinkage);
677 if (status != 0)
678 return (status);
679
680 ibt_unregister_part_attr_cb();
681
682 mac_fini_ops(&ibd_dev_ops);
683 mutex_destroy(&ibd_objlist_lock);
684 ddi_soft_state_fini(&ibd_list);
685 mutex_destroy(&ibd_gstate.ig_mutex);
686 #ifdef IBD_LOGGING
687 ibd_log_fini();
688 #endif
689 return (0);
690 }
691
692 /*
693 * Convert the GID part of the mac address from network byte order
694 * to host order.
695 */
696 static void
697 ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid)
698 {
699 ib_sn_prefix_t nbopref;
700 ib_guid_t nboguid;
701
702 bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t));
703 bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t));
704 dgid->gid_prefix = b2h64(nbopref);
705 dgid->gid_guid = b2h64(nboguid);
706 }
707
708 /*
709 * Create the IPoIB address in network byte order from host order inputs.
710 */
711 static void
712 ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix,
713 ib_guid_t guid)
714 {
715 ib_sn_prefix_t nbopref;
716 ib_guid_t nboguid;
717
718 mac->ipoib_qpn = htonl(qpn);
719 nbopref = h2b64(prefix);
720 nboguid = h2b64(guid);
721 bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t));
722 bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t));
723 }
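
/*
 * The two helpers above are inverses over the GID portion of the address;
 * an illustrative round trip (dgid and qpn are assumed to be host-order
 * values already in hand):
 *
 *	ipoib_mac_t mac;
 *	ib_gid_t check;
 *
 *	ibd_h2n_mac(&mac, qpn, dgid.gid_prefix, dgid.gid_guid);
 *	ibd_n2h_gid(&mac, &check);
 *	(check.gid_prefix == dgid.gid_prefix and
 *	check.gid_guid == dgid.gid_guid now hold)
 */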
724
725 /*
726 * Send to the appropriate all-routers group when the IBA multicast group
727 * does not exist, based on whether the target group is v4 or v6.
728 */
729 static boolean_t
730 ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac,
731 ipoib_mac_t *rmac)
732 {
733 boolean_t retval = B_TRUE;
734 uint32_t adjscope = state->id_scope << 16;
735 uint32_t topword;
736
737 /*
738 * Copy the first 4 bytes in without assuming any alignment of
739 * input mac address; this will have IPoIB signature, flags and
740 * scope bits.
741 */
742 bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t));
743 topword = ntohl(topword);
744
745 /*
746 * Generate proper address for IPv4/v6, adding in the Pkey properly.
747 */
748 if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) ||
749 (topword == (IB_MCGID_IPV6_PREFIX | adjscope)))
750 ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) |
751 ((uint32_t)(state->id_pkey << 16))),
752 (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP));
753 else
754 /*
755 * Does not have proper bits in the mgid address.
756 */
757 retval = B_FALSE;
758
759 return (retval);
760 }
761
762 /*
763 * Membership states for different mcg's are tracked by two lists:
764 * the "non" list is used for promiscuous mode, when all mcg traffic
765 * needs to be inspected. This type of membership is never used for
 * transmission, so there cannot be an AH in the active list
767 * corresponding to a member in this list. This list does not need
768 * any protection, since all operations are performed by the async
769 * thread.
770 *
771 * "Full" and "SendOnly" membership is tracked using a single list,
772 * the "full" list. This is because this single list can then be
773 * searched during transmit to a multicast group (if an AH for the
774 * mcg is not found in the active list), since at least one type
775 * of membership must be present before initiating the transmit.
776 * This list is also emptied during driver detach, since sendonly
777 * membership acquired during transmit is dropped at detach time
 * along with ipv4 broadcast full membership. Inserts/deletes to
779 * this list are done only by the async thread, but it is also
780 * searched in program context (see multicast disable case), thus
781 * the id_mc_mutex protects the list. The driver detach path also
782 * deconstructs the "full" list, but it ensures that the async
783 * thread will not be accessing the list (by blocking out mcg
784 * trap handling and making sure no more Tx reaping will happen).
785 *
786 * Currently, an IBA attach is done in the SendOnly case too,
787 * although this is not required.
788 */
789 #define IBD_MCACHE_INSERT_FULL(state, mce) \
790 list_insert_head(&state->id_mc_full, mce)
791 #define IBD_MCACHE_INSERT_NON(state, mce) \
792 list_insert_head(&state->id_mc_non, mce)
793 #define IBD_MCACHE_FIND_FULL(state, mgid) \
794 ibd_mcache_find(mgid, &state->id_mc_full)
795 #define IBD_MCACHE_FIND_NON(state, mgid) \
796 ibd_mcache_find(mgid, &state->id_mc_non)
797 #define IBD_MCACHE_PULLOUT_FULL(state, mce) \
798 list_remove(&state->id_mc_full, mce)
799 #define IBD_MCACHE_PULLOUT_NON(state, mce) \
800 list_remove(&state->id_mc_non, mce)
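
/*
 * A minimal sketch of how the "full" list is consulted on the transmit
 * path (the real flow is in ibd_async_mcache() below):
 *
 *	ib_gid_t mgid;
 *	ibd_mce_t *mce;
 *
 *	ibd_n2h_gid(mac, &mgid);
 *	if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) == NULL)
 *		mce = ibd_join_group(state, mgid,
 *		    IB_MC_JSTATE_SEND_ONLY_NON);
 */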
801
802 static void *
803 list_get_head(list_t *list)
804 {
805 list_node_t *lhead = list_head(list);
806
807 if (lhead != NULL)
808 list_remove(list, lhead);
809 return (lhead);
810 }
811
812 /*
813 * This is always guaranteed to be able to queue the work.
814 */
815 void
816 ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op)
817 {
818 /* Initialize request */
819 DPRINT(1, "ibd_queue_work_slot : op: %d \n", op);
820 ptr->rq_op = op;
821
822 /*
 * Queue the provided slot onto the request list.
824 */
825 mutex_enter(&state->id_acache_req_lock);
826 list_insert_tail(&state->id_req_list, ptr);
827
828 /* Go, fetch, async thread */
829 cv_signal(&state->id_acache_req_cv);
830 mutex_exit(&state->id_acache_req_lock);
831 }
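
/*
 * Typical usage (a sketch; compare the GETAH path in ibd_acache_lookup()
 * below): allocate a request from the per-instance kmem cache, fill in the
 * operation-specific fields, and hand it to the async thread:
 *
 *	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
 *	if (req != NULL) {
 *		bcopy(mac, &req->rq_mac, IPOIB_ADDRL);
 *		ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH);
 *	}
 */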
832
833 /*
834 * Main body of the per interface async thread.
835 */
836 static void
837 ibd_async_work(ibd_state_t *state)
838 {
839 ibd_req_t *ptr;
840 callb_cpr_t cprinfo;
841
842 mutex_enter(&state->id_acache_req_lock);
843 CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock,
844 callb_generic_cpr, "ibd_async_work");
845
846 for (;;) {
847 ptr = list_get_head(&state->id_req_list);
848 if (ptr != NULL) {
849 mutex_exit(&state->id_acache_req_lock);
850
851 /*
852 * If we are in late hca initialization mode, do not
 * process any async request other than TRAP. TRAP
 * indicates the creation of a broadcast group, in which
 * case we need to join/create the group.
856 */
857 if ((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) &&
858 (ptr->rq_op != IBD_ASYNC_TRAP)) {
859 goto free_req_and_continue;
860 }
861
862 /*
863 * Once we have done the operation, there is no
 * guarantee the request slot is going to be valid;
865 * it might be freed up (as in IBD_ASYNC_LEAVE, REAP,
866 * TRAP).
867 *
868 * Perform the request.
869 */
870 switch (ptr->rq_op) {
871 case IBD_ASYNC_GETAH:
872 ibd_async_acache(state, &ptr->rq_mac);
873 break;
874 case IBD_ASYNC_JOIN:
875 case IBD_ASYNC_LEAVE:
876 ibd_async_multicast(state,
877 ptr->rq_gid, ptr->rq_op);
878 break;
879 case IBD_ASYNC_PROMON:
880 ibd_async_setprom(state);
881 break;
882 case IBD_ASYNC_PROMOFF:
883 ibd_async_unsetprom(state);
884 break;
885 case IBD_ASYNC_REAP:
886 ibd_async_reap_group(state,
887 ptr->rq_ptr, ptr->rq_gid,
888 IB_MC_JSTATE_FULL);
889 /*
 * The req buf is contained in the mce
 * structure, so we do not need
892 * to free it here.
893 */
894 ptr = NULL;
895 break;
896 case IBD_ASYNC_TRAP:
897 ibd_async_trap(state, ptr);
898 break;
899 case IBD_ASYNC_SCHED:
900 ibd_async_txsched(state);
901 break;
902 case IBD_ASYNC_LINK:
903 ibd_async_link(state, ptr);
904 break;
905 case IBD_ASYNC_EXIT:
906 mutex_enter(&state->id_acache_req_lock);
907 CALLB_CPR_EXIT(&cprinfo);
908 return;
909 case IBD_ASYNC_RC_TOO_BIG:
910 ibd_async_rc_process_too_big(state,
911 ptr);
912 break;
913 case IBD_ASYNC_RC_CLOSE_ACT_CHAN:
914 ibd_async_rc_close_act_chan(state, ptr);
915 break;
916 case IBD_ASYNC_RC_RECYCLE_ACE:
917 ibd_async_rc_recycle_ace(state, ptr);
918 break;
919 case IBD_ASYNC_RC_CLOSE_PAS_CHAN:
920 (void) ibd_rc_pas_close(ptr->rq_ptr,
921 B_TRUE, B_TRUE);
922 break;
923 }
924 free_req_and_continue:
925 if (ptr != NULL)
926 kmem_cache_free(state->id_req_kmc, ptr);
927
928 mutex_enter(&state->id_acache_req_lock);
929 } else {
930 /*
931 * Nothing to do: wait till new request arrives.
932 */
933 CALLB_CPR_SAFE_BEGIN(&cprinfo);
934 cv_wait(&state->id_acache_req_cv,
935 &state->id_acache_req_lock);
936 CALLB_CPR_SAFE_END(&cprinfo,
937 &state->id_acache_req_lock);
938 }
939 }
940
941 /*NOTREACHED*/
942 _NOTE(NOT_REACHED)
943 }
944
945 /*
946 * Return when it is safe to queue requests to the async daemon; primarily
947 * for subnet trap and async event handling. Disallow requests before the
 * daemon is created, and when interface deinitialization starts.
949 */
950 static boolean_t
951 ibd_async_safe(ibd_state_t *state)
952 {
953 mutex_enter(&state->id_trap_lock);
954 if (state->id_trap_stop) {
955 mutex_exit(&state->id_trap_lock);
956 return (B_FALSE);
957 }
958 state->id_trap_inprog++;
959 mutex_exit(&state->id_trap_lock);
960 return (B_TRUE);
961 }
962
963 /*
964 * Wake up ibd_m_stop() if the unplumb code is waiting for pending subnet
965 * trap or event handling to complete to kill the async thread and deconstruct
966 * the mcg/ace list.
967 */
968 static void
969 ibd_async_done(ibd_state_t *state)
970 {
971 mutex_enter(&state->id_trap_lock);
972 if (--state->id_trap_inprog == 0)
973 cv_signal(&state->id_trap_cv);
974 mutex_exit(&state->id_trap_lock);
975 }
976
977 /*
978 * Hash functions:
979 * ibd_hash_by_id: Returns the qpn as the hash entry into bucket.
 * ibd_hash_key_cmp: Compares two keys; returns 0 if they match, else 1.
981 * These operate on mac addresses input into ibd_send, but there is no
982 * guarantee on the alignment of the ipoib_mac_t structure.
983 */
984 /*ARGSUSED*/
985 static uint_t
986 ibd_hash_by_id(void *hash_data, mod_hash_key_t key)
987 {
988 ulong_t ptraddr = (ulong_t)key;
989 uint_t hval;
990
991 /*
992 * If the input address is 4 byte aligned, we can just dereference
993 * it. This is most common, since IP will send in a 4 byte aligned
 * IP header, which implies the 24 byte IPoIB pseudo header will be
995 * 4 byte aligned too.
996 */
997 if ((ptraddr & 3) == 0)
998 return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn);
999
1000 bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t));
1001 return (hval);
1002 }
1003
1004 static int
1005 ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
1006 {
1007 if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0)
1008 return (0);
1009 else
1010 return (1);
1011 }
1012
1013 /*
1014 * Initialize all the per interface caches and lists; AH cache,
1015 * MCG list etc.
1016 */
1017 static int
1018 ibd_acache_init(ibd_state_t *state)
1019 {
1020 ibd_ace_t *ce;
1021 int i;
1022
1023 mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL);
1024 mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL);
1025 mutex_enter(&state->id_ac_mutex);
1026 list_create(&state->id_ah_free, sizeof (ibd_ace_t),
1027 offsetof(ibd_ace_t, ac_list));
1028 list_create(&state->id_ah_active, sizeof (ibd_ace_t),
1029 offsetof(ibd_ace_t, ac_list));
1030 state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash",
1031 state->id_hash_size, mod_hash_null_keydtor, mod_hash_null_valdtor,
1032 ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP);
1033 list_create(&state->id_mc_full, sizeof (ibd_mce_t),
1034 offsetof(ibd_mce_t, mc_list));
1035 list_create(&state->id_mc_non, sizeof (ibd_mce_t),
1036 offsetof(ibd_mce_t, mc_list));
1037 state->id_ac_hot_ace = NULL;
1038
1039 state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) *
1040 state->id_num_ah, KM_SLEEP);
1041 for (i = 0; i < state->id_num_ah; i++, ce++) {
1042 if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS,
1043 state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) {
1044 mutex_exit(&state->id_ac_mutex);
1045 ibd_acache_fini(state);
1046 return (DDI_FAILURE);
1047 } else {
1048 CLEAR_REFCYCLE(ce);
1049 ce->ac_mce = NULL;
1050 mutex_init(&ce->tx_too_big_mutex, NULL,
1051 MUTEX_DRIVER, NULL);
1052 IBD_ACACHE_INSERT_FREE(state, ce);
1053 }
1054 }
1055 mutex_exit(&state->id_ac_mutex);
1056 return (DDI_SUCCESS);
1057 }
1058
1059 static void
1060 ibd_acache_fini(ibd_state_t *state)
1061 {
1062 ibd_ace_t *ptr;
1063
1064 mutex_enter(&state->id_ac_mutex);
1065
1066 while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) {
1067 ASSERT(GET_REF(ptr) == 0);
1068 mutex_destroy(&ptr->tx_too_big_mutex);
1069 (void) ibt_free_ud_dest(ptr->ac_dest);
1070 }
1071
1072 while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) {
1073 ASSERT(GET_REF(ptr) == 0);
1074 mutex_destroy(&ptr->tx_too_big_mutex);
1075 (void) ibt_free_ud_dest(ptr->ac_dest);
1076 }
1077
1078 list_destroy(&state->id_ah_free);
1079 list_destroy(&state->id_ah_active);
1080 list_destroy(&state->id_mc_full);
1081 list_destroy(&state->id_mc_non);
1082 kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * state->id_num_ah);
1083 mutex_exit(&state->id_ac_mutex);
1084 mutex_destroy(&state->id_ac_mutex);
1085 mutex_destroy(&state->id_mc_mutex);
1086 }
1087
1088 /*
1089 * Search AH active hash list for a cached path to input destination.
1090 * If we are "just looking", hold == F. When we are in the Tx path,
1091 * we set hold == T to grab a reference on the AH so that it can not
1092 * be recycled to a new destination while the Tx request is posted.
1093 */
1094 ibd_ace_t *
1095 ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num)
1096 {
1097 ibd_ace_t *ptr;
1098
1099 ASSERT(mutex_owned(&state->id_ac_mutex));
1100
1101 /*
1102 * Do hash search.
1103 */
1104 if (mod_hash_find(state->id_ah_active_hash,
1105 (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) {
1106 if (hold)
1107 INC_REF(ptr, num);
1108 return (ptr);
1109 }
1110 return (NULL);
1111 }
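
/*
 * Sketch of the intended calling convention on the Tx path (illustrative
 * only): the lookup is done under id_ac_mutex and holds one reference per
 * WQE so the entry cannot be recycled underneath the posted send:
 *
 *	mutex_enter(&state->id_ac_mutex);
 *	ace = ibd_acache_find(state, mac, B_TRUE, 1);
 *	mutex_exit(&state->id_ac_mutex);
 *	(the reference is dropped when the send completes)
 */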
1112
1113 /*
1114 * This is called by the tx side; if an initialized AH is found in
1115 * the active list, it is locked down and can be used; if no entry
1116 * is found, an async request is queued to do path resolution.
1117 */
1118 static ibd_ace_t *
1119 ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe)
1120 {
1121 ibd_ace_t *ptr;
1122 ibd_req_t *req;
1123
1124 /*
1125 * Only attempt to print when we can; in the mdt pattr case, the
1126 * address is not aligned properly.
1127 */
1128 if (((ulong_t)mac & 3) == 0) {
1129 DPRINT(4,
1130 "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X",
1131 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1132 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
1133 htonl(mac->ipoib_gidsuff[1]));
1134 }
1135
1136 mutex_enter(&state->id_ac_mutex);
1137
1138 if (((ptr = state->id_ac_hot_ace) != NULL) &&
1139 (memcmp(&ptr->ac_mac, mac, sizeof (*mac)) == 0)) {
1140 INC_REF(ptr, numwqe);
1141 mutex_exit(&state->id_ac_mutex);
1142 return (ptr);
1143 }
1144 if (((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL)) {
1145 state->id_ac_hot_ace = ptr;
1146 mutex_exit(&state->id_ac_mutex);
1147 return (ptr);
1148 }
1149
1150 /*
1151 * Implementation of a single outstanding async request; if
1152 * the operation is not started yet, queue a request and move
1153 * to ongoing state. Remember in id_ah_addr for which address
 * we are queueing the request, in case we need to flag an error.
 * Any further requests, for the same or a different address, until
 * the operation completes, are sent back to GLDv3 to be retried.
1157 * The async thread will update id_ah_op with an error indication
1158 * or will set it to indicate the next look up can start; either
1159 * way, it will mac_tx_update() so that all blocked requests come
1160 * back here.
1161 */
1162 *err = EAGAIN;
1163 if (state->id_ah_op == IBD_OP_NOTSTARTED) {
1164 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
1165 if (req != NULL) {
1166 /*
1167 * We did not even find the entry; queue a request
1168 * for it.
1169 */
1170 bcopy(mac, &(req->rq_mac), IPOIB_ADDRL);
1171 state->id_ah_op = IBD_OP_ONGOING;
1172 ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH);
1173 bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL);
1174 }
1175 } else if ((state->id_ah_op != IBD_OP_ONGOING) &&
1176 (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) {
1177 /*
1178 * Check the status of the pathrecord lookup request
1179 * we had queued before.
1180 */
1181 if (state->id_ah_op == IBD_OP_ERRORED) {
1182 *err = EFAULT;
1183 state->id_ah_error++;
1184 } else {
1185 /*
1186 * IBD_OP_ROUTERED case: We need to send to the
1187 * all-router MCG. If we can find the AH for
1188 * the mcg, the Tx will be attempted. If we
1189 * do not find the AH, we return NORESOURCES
1190 * to retry.
1191 */
1192 ipoib_mac_t routermac;
1193
1194 (void) ibd_get_allroutergroup(state, mac, &routermac);
1195 ptr = ibd_acache_find(state, &routermac, B_TRUE,
1196 numwqe);
1197 }
1198 state->id_ah_op = IBD_OP_NOTSTARTED;
1199 } else if ((state->id_ah_op != IBD_OP_ONGOING) &&
1200 (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) {
1201 /*
1202 * This case can happen when we get a higher band
1203 * packet. The easiest way is to reset the state machine
1204 * to accommodate the higher priority packet.
1205 */
1206 state->id_ah_op = IBD_OP_NOTSTARTED;
1207 }
1208 mutex_exit(&state->id_ac_mutex);
1209
1210 return (ptr);
1211 }
1212
1213 /*
1214 * Grab a not-currently-in-use AH/PathRecord from the active
1215 * list to recycle to a new destination. Only the async thread
1216 * executes this code.
1217 */
1218 static ibd_ace_t *
1219 ibd_acache_get_unref(ibd_state_t *state)
1220 {
1221 ibd_ace_t *ptr = list_tail(&state->id_ah_active);
1222 boolean_t try_rc_chan_recycle = B_FALSE;
1223
1224 ASSERT(mutex_owned(&state->id_ac_mutex));
1225
1226 /*
1227 * Do plain linear search.
1228 */
1229 while (ptr != NULL) {
1230 /*
1231 * Note that it is possible that the "cycle" bit
1232 * is set on the AH w/o any reference count. The
1233 * mcg must have been deleted, and the tx cleanup
1234 * just decremented the reference count to 0, but
1235 * hasn't gotten around to grabbing the id_ac_mutex
1236 * to move the AH into the free list.
1237 */
1238 if (GET_REF(ptr) == 0) {
1239 if (ptr->ac_chan != NULL) {
1240 ASSERT(state->id_enable_rc == B_TRUE);
1241 if (!try_rc_chan_recycle) {
1242 try_rc_chan_recycle = B_TRUE;
1243 ibd_rc_signal_ace_recycle(state, ptr);
1244 }
1245 } else {
1246 IBD_ACACHE_PULLOUT_ACTIVE(state, ptr);
1247 break;
1248 }
1249 }
1250 ptr = list_prev(&state->id_ah_active, ptr);
1251 }
1252 return (ptr);
1253 }
1254
1255 /*
 * Invoked to clean up AH from active list in case of multicast
 * disable, to handle sendonly memberships during mcg traps, and
 * for port up processing of multicast and unicast AHs.
1259 * Normally, the AH is taken off the active list, and put into
1260 * the free list to be recycled for a new destination. In case
1261 * Tx requests on the AH have not completed yet, the AH is marked
1262 * for reaping (which will put the AH on the free list) once the Tx's
1263 * complete; in this case, depending on the "force" input, we take
1264 * out the AH from the active list right now, or leave it also for
1265 * the reap operation. Returns TRUE if the AH is taken off the active
1266 * list (and either put into the free list right now, or arranged for
1267 * later), FALSE otherwise.
1268 */
1269 boolean_t
1270 ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force)
1271 {
1272 ibd_ace_t *acactive;
1273 boolean_t ret = B_TRUE;
1274
1275 ASSERT(mutex_owned(&state->id_ac_mutex));
1276
1277 if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) {
1278
1279 /*
1280 * Note that the AH might already have the cycle bit set
1281 * on it; this might happen if sequences of multicast
1282 * enables and disables are coming so fast, that posted
1283 * Tx's to the mcg have not completed yet, and the cycle
1284 * bit is set successively by each multicast disable.
1285 */
1286 if (SET_CYCLE_IF_REF(acactive)) {
1287 if (!force) {
1288 /*
1289 * The ace is kept on the active list, further
1290 * Tx's can still grab a reference on it; the
1291 * ace is reaped when all pending Tx's
1292 * referencing the AH complete.
1293 */
1294 ret = B_FALSE;
1295 } else {
1296 /*
1297 * In the mcg trap case, we always pull the
 * AH from the active list, and likewise in the
 * port up multi/unicast case.
1300 */
1301 ASSERT(acactive->ac_chan == NULL);
1302 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
1303 acactive->ac_mce = NULL;
1304 }
1305 } else {
1306 /*
 * The ref count is 0, so reclaim the ace
 * immediately after pulling it out of
 * the active list.
1310 */
1311 ASSERT(acactive->ac_chan == NULL);
1312 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
1313 acactive->ac_mce = NULL;
1314 IBD_ACACHE_INSERT_FREE(state, acactive);
1315 }
1316
1317 }
1318 return (ret);
1319 }
1320
1321 /*
1322 * Helper function for async path record lookup. If we are trying to
1323 * Tx to a MCG, check our membership, possibly trying to join the
1324 * group if required. If that fails, try to send the packet to the
1325 * all router group (indicated by the redirect output), pointing
1326 * the input mac address to the router mcg address.
1327 */
1328 static ibd_mce_t *
1329 ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect)
1330 {
1331 ib_gid_t mgid;
1332 ibd_mce_t *mce;
1333 ipoib_mac_t routermac;
1334
1335 *redirect = B_FALSE;
1336 ibd_n2h_gid(mac, &mgid);
1337
1338 /*
1339 * Check the FullMember+SendOnlyNonMember list.
1340 * Since we are the only one who manipulates the
1341 * id_mc_full list, no locks are needed.
1342 */
1343 mce = IBD_MCACHE_FIND_FULL(state, mgid);
1344 if (mce != NULL) {
1345 DPRINT(4, "ibd_async_mcache : already joined to group");
1346 return (mce);
1347 }
1348
1349 /*
1350 * Not found; try to join(SendOnlyNonMember) and attach.
1351 */
1352 DPRINT(4, "ibd_async_mcache : not joined to group");
1353 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
1354 NULL) {
1355 DPRINT(4, "ibd_async_mcache : nonmem joined to group");
1356 return (mce);
1357 }
1358
1359 /*
1360 * MCGroup not present; try to join the all-router group. If
1361 * any of the following steps succeed, we will be redirecting
1362 * to the all router group.
1363 */
1364 DPRINT(4, "ibd_async_mcache : nonmem join failed");
1365 if (!ibd_get_allroutergroup(state, mac, &routermac))
1366 return (NULL);
1367 *redirect = B_TRUE;
1368 ibd_n2h_gid(&routermac, &mgid);
1369 bcopy(&routermac, mac, IPOIB_ADDRL);
1370 DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n",
1371 mgid.gid_prefix, mgid.gid_guid);
1372
1373 /*
1374 * Are we already joined to the router group?
1375 */
1376 if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) {
1377 DPRINT(4, "ibd_async_mcache : using already joined router"
1378 "group\n");
1379 return (mce);
1380 }
1381
1382 /*
1383 * Can we join(SendOnlyNonMember) the router group?
1384 */
1385 DPRINT(4, "ibd_async_mcache : attempting join to router grp");
1386 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
1387 NULL) {
1388 DPRINT(4, "ibd_async_mcache : joined to router grp");
1389 return (mce);
1390 }
1391
1392 return (NULL);
1393 }
1394
1395 /*
1396 * Async path record lookup code.
1397 */
1398 static void
1399 ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac)
1400 {
1401 ibd_ace_t *ce;
1402 ibd_mce_t *mce = NULL;
1403 ibt_path_attr_t path_attr;
1404 ibt_path_info_t path_info;
1405 ib_gid_t destgid;
1406 char ret = IBD_OP_NOTSTARTED;
1407
1408 DPRINT(4, "ibd_async_acache : %08X:%08X:%08X:%08X:%08X",
1409 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1410 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
1411 htonl(mac->ipoib_gidsuff[1]));
1412
1413 /*
1414 * Check whether we are trying to transmit to a MCG.
1415 * In that case, we need to make sure we are a member of
1416 * the MCG.
1417 */
1418 if (mac->ipoib_qpn == htonl(IB_MC_QPN)) {
1419 boolean_t redirected;
1420
1421 /*
 * If we cannot find or join the group or even
1423 * redirect, error out.
1424 */
1425 if ((mce = ibd_async_mcache(state, mac, &redirected)) ==
1426 NULL) {
1427 state->id_ah_op = IBD_OP_ERRORED;
1428 return;
1429 }
1430
1431 /*
1432 * If we got redirected, we need to determine whether
 * the AH for the new mcg is already in the cache, and
 * if so not pull it in again; otherwise proceed to get the
1435 * path for the new mcg. There is no guarantee that
1436 * if the AH is currently in the cache, it will still be
1437 * there when we look in ibd_acache_lookup(), but that's
1438 * okay, we will come back here.
1439 */
1440 if (redirected) {
1441 ret = IBD_OP_ROUTERED;
1442 DPRINT(4, "ibd_async_acache : redirected to "
1443 "%08X:%08X:%08X:%08X:%08X",
1444 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1445 htonl(mac->ipoib_gidpref[1]),
1446 htonl(mac->ipoib_gidsuff[0]),
1447 htonl(mac->ipoib_gidsuff[1]));
1448
1449 mutex_enter(&state->id_ac_mutex);
1450 if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) {
1451 state->id_ah_op = IBD_OP_ROUTERED;
1452 mutex_exit(&state->id_ac_mutex);
1453 DPRINT(4, "ibd_async_acache : router AH found");
1454 return;
1455 }
1456 mutex_exit(&state->id_ac_mutex);
1457 }
1458 }
1459
1460 /*
1461 * Get an AH from the free list.
1462 */
1463 mutex_enter(&state->id_ac_mutex);
1464 if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) {
1465 /*
1466 * No free ones; try to grab an unreferenced active
1467 * one. Maybe we need to make the active list LRU,
1468 * but that will create more work for Tx callbacks.
1469 * Is there a way of not having to pull out the
1470 * entry from the active list, but just indicate it
1471 * is being recycled? Yes, but that creates one more
1472 * check in the fast lookup path.
1473 */
1474 if ((ce = ibd_acache_get_unref(state)) == NULL) {
1475 /*
1476 * Pretty serious shortage now.
1477 */
1478 state->id_ah_op = IBD_OP_NOTSTARTED;
1479 mutex_exit(&state->id_ac_mutex);
1480 DPRINT(10, "ibd_async_acache : failed to find AH "
1481 "slot\n");
1482 return;
1483 }
1484 /*
1485 * We could check whether ac_mce points to a SendOnly
1486 * member and drop that membership now. Or do it lazily
1487 * at detach time.
1488 */
1489 ce->ac_mce = NULL;
1490 }
1491 mutex_exit(&state->id_ac_mutex);
1492 ASSERT(ce->ac_mce == NULL);
1493
1494 /*
1495 * Update the entry.
1496 */
1497 bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL);
1498
1499 bzero(&path_info, sizeof (path_info));
1500 bzero(&path_attr, sizeof (ibt_path_attr_t));
1501 path_attr.pa_sgid = state->id_sgid;
1502 path_attr.pa_num_dgids = 1;
1503 ibd_n2h_gid(&ce->ac_mac, &destgid);
1504 path_attr.pa_dgids = &destgid;
1505 path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
1506 path_attr.pa_pkey = state->id_pkey;
1507 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_PKEY, &path_attr, 1,
1508 &path_info, NULL) != IBT_SUCCESS) {
1509 DPRINT(10, "ibd_async_acache : failed in ibt_get_paths");
1510 goto error;
1511 }
1512 if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey,
1513 ntohl(ce->ac_mac.ipoib_qpn),
1514 &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) {
1515 DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest");
1516 goto error;
1517 }
1518
1519 /*
1520 * mce is set whenever an AH is being associated with a
1521 * MCG; this will come in handy when we leave the MCG. The
1522 * lock protects Tx fastpath from scanning the active list.
1523 */
1524 if (mce != NULL)
1525 ce->ac_mce = mce;
1526
1527 /*
 * Initiate an RC mode connection for a unicast address
1529 */
1530 if (state->id_enable_rc && (mac->ipoib_qpn != htonl(IB_MC_QPN)) &&
1531 (htonl(mac->ipoib_qpn) & IBD_MAC_ADDR_RC)) {
1532 ASSERT(ce->ac_chan == NULL);
1533 DPRINT(10, "ibd_async_acache: call "
1534 "ibd_rc_try_connect(ace=%p)", ce);
1535 ibd_rc_try_connect(state, ce, &path_info);
1536 if (ce->ac_chan == NULL) {
1537 DPRINT(10, "ibd_async_acache: fail to setup RC"
1538 " channel");
1539 state->rc_conn_fail++;
1540 goto error;
1541 }
1542 }
1543
1544 mutex_enter(&state->id_ac_mutex);
1545 IBD_ACACHE_INSERT_ACTIVE(state, ce);
1546 state->id_ah_op = ret;
1547 mutex_exit(&state->id_ac_mutex);
1548 return;
1549 error:
1550 /*
1551 * We might want to drop SendOnly membership here if we
1552 * joined above. The lock protects Tx callbacks inserting
1553 * into the free list.
1554 */
1555 mutex_enter(&state->id_ac_mutex);
1556 state->id_ah_op = IBD_OP_ERRORED;
1557 IBD_ACACHE_INSERT_FREE(state, ce);
1558 mutex_exit(&state->id_ac_mutex);
1559 }
1560
1561 /*
1562 * While restoring port's presence on the subnet on a port up, it is possible
1563 * that the port goes down again.
1564 */
1565 static void
1566 ibd_async_link(ibd_state_t *state, ibd_req_t *req)
1567 {
1568 ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr;
1569 link_state_t lstate = (opcode == IBD_LINK_DOWN) ? LINK_STATE_DOWN :
1570 LINK_STATE_UP;
1571 ibd_mce_t *mce, *pmce;
1572 ibd_ace_t *ace, *pace;
1573
1574 DPRINT(10, "ibd_async_link(): %d", opcode);
1575
1576 /*
1577 * On a link up, revalidate the link speed/width. No point doing
1578 * this on a link down, since we will be unable to do SA operations,
1579 * defaulting to the lowest speed. Also notice that we update our
1580 * notion of speed before calling mac_link_update(), which will do
1581 * necessary higher level notifications for speed changes.
1582 */
1583 if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) {
1584 state->id_link_speed = ibd_get_portspeed(state);
1585 }
1586
1587 /*
1588 * Do all the work required to establish our presence on
1589 * the subnet.
1590 */
1591 if (opcode == IBD_LINK_UP_ABSENT) {
1592 /*
1593 * If in promiscuous mode ...
1594 */
1595 if (state->id_prom_op == IBD_OP_COMPLETED) {
1596 /*
1597 * Drop all nonmembership.
1598 */
1599 ibd_async_unsetprom(state);
1600
1601 /*
1602 * Then, try to regain nonmembership to all mcg's.
1603 */
1604 ibd_async_setprom(state);
1605
1606 }
1607
1608 /*
1609 * Drop all sendonly membership (which also gets rid of the
1610 * AHs); try to reacquire all full membership.
1611 */
1612 mce = list_head(&state->id_mc_full);
1613 while ((pmce = mce) != NULL) {
1614 mce = list_next(&state->id_mc_full, mce);
1615 if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON)
1616 ibd_leave_group(state,
1617 pmce->mc_info.mc_adds_vect.av_dgid,
1618 IB_MC_JSTATE_SEND_ONLY_NON);
1619 else
1620 ibd_reacquire_group(state, pmce);
1621 }
1622
1623 /*
1624 * Recycle all active AHs to free list (and if there are
1625 * pending posts, make sure they will go into the free list
1626 * once the Tx's complete). Grab the lock to prevent
1627 * concurrent Tx's as well as Tx cleanups.
1628 */
1629 mutex_enter(&state->id_ac_mutex);
1630 ace = list_head(&state->id_ah_active);
1631 while ((pace = ace) != NULL) {
1632 boolean_t cycled;
1633
1634 ace = list_next(&state->id_ah_active, ace);
1635 mce = pace->ac_mce;
1636 if (pace->ac_chan != NULL) {
1637 ASSERT(mce == NULL);
1638 ASSERT(state->id_enable_rc == B_TRUE);
1639 if (pace->ac_chan->chan_state ==
1640 IBD_RC_STATE_ACT_ESTAB) {
1641 INC_REF(pace, 1);
1642 IBD_ACACHE_PULLOUT_ACTIVE(state, pace);
1643 pace->ac_chan->chan_state =
1644 IBD_RC_STATE_ACT_CLOSING;
1645 ibd_rc_signal_act_close(state, pace);
1646 } else {
1647 state->rc_act_close_simultaneous++;
1648 DPRINT(40, "ibd_async_link: other "
1649 "thread is closing it, ace=%p, "
1650 "ac_chan=%p, chan_state=%d",
1651 pace, pace->ac_chan,
1652 pace->ac_chan->chan_state);
1653 }
1654 } else {
1655 cycled = ibd_acache_recycle(state,
1656 &pace->ac_mac, B_TRUE);
1657 }
1658 /*
1659 * If this is for an mcg, it must be for a fullmember,
1660 * since we got rid of send-only members above when
1661 * processing the mce list.
1662 */
1663 ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate ==
1664 IB_MC_JSTATE_FULL)));
1665
1666 /*
1667 * Check if the fullmember mce needs to be torn down,
 * i.e. whether the DLPI disable has already been done.
1669 * If so, do some of the work of tx_cleanup, namely
1670 * causing leave (which will fail), detach and
1671 * mce-freeing. tx_cleanup will put the AH into free
1672 * list. The reason to duplicate some of this
1673 * tx_cleanup work is because we want to delete the
1674 * AH right now instead of waiting for tx_cleanup, to
1675 * force subsequent Tx's to reacquire an AH.
1676 */
1677 if ((mce != NULL) && (mce->mc_fullreap))
1678 ibd_async_reap_group(state, mce,
1679 mce->mc_info.mc_adds_vect.av_dgid,
1680 mce->mc_jstate);
1681 }
1682 mutex_exit(&state->id_ac_mutex);
1683 }
1684
1685 /*
1686 * mac handle is guaranteed to exist since driver does ibt_close_hca()
1687 * (which stops further events from being delivered) before
1688 * mac_unregister(). At this point, it is guaranteed that mac_register
1689 * has already been done.
1690 */
1691 mutex_enter(&state->id_link_mutex);
1692 state->id_link_state = lstate;
1693 mac_link_update(state->id_mh, lstate);
1694 mutex_exit(&state->id_link_mutex);
1695
1696 ibd_async_done(state);
1697 }
1698
1699 /*
1700 * Check the pkey table to see if we can find the pkey we're looking for.
1701 * Set the pkey index in 'pkix' if found. Return 0 on success and -1 on
1702 * failure.
1703 */
1704 static int
1705 ibd_locate_pkey(ib_pkey_t *pkey_tbl, uint16_t pkey_tbl_sz, ib_pkey_t pkey,
1706 uint16_t *pkix)
1707 {
1708 uint16_t ndx;
1709
1710 ASSERT(pkix != NULL);
1711
1712 for (ndx = 0; ndx < pkey_tbl_sz; ndx++) {
1713 if (pkey_tbl[ndx] == pkey) {
1714 *pkix = ndx;
1715 return (0);
1716 }
1717 }
1718 return (-1);
1719 }
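
/*
 * Illustrative use (a sketch; the real callers are on the port/start
 * paths): map our pkey to its index in the port's pkey table as reported
 * by ibt_query_hca_ports():
 *
 *	uint16_t pkix;
 *
 *	if (ibd_locate_pkey(port_infop->p_pkey_tbl,
 *	    port_infop->p_pkey_tbl_sz, state->id_pkey, &pkix) == 0)
 *		(state->id_pkey lives at index pkix)
 */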
1720
1721 /*
1722 * Late HCA Initialization:
 * If the plumb had succeeded without an active port or the pkey being
 * available, and either is now being indicated as available via PORT_UP
 * or PORT_CHANGE respectively, try a start of the interface.
1726 *
1727 * Normal Operation:
1728 * When the link is notified up, we need to do a few things, based
 * on whether the port's current p_init_type_reply claims a reinit has
 * been done or not. The reinit steps are:
1731 * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify
1732 * the old Pkey and GID0 are correct.
1733 * 2. Register for mcg traps (already done by ibmf).
1734 * 3. If PreservePresenceReply indicates the SM has restored port's presence
1735 * in subnet, nothing more to do. Else go to next steps (on async daemon).
1736 * 4. Give up all sendonly memberships.
1737 * 5. Acquire all full memberships.
1738 * 6. In promiscuous mode, acquire all non memberships.
1739 * 7. Recycle all AHs to free list.
1740 */
1741 static void
1742 ibd_link_mod(ibd_state_t *state, ibt_async_code_t code)
1743 {
1744 ibt_hca_portinfo_t *port_infop = NULL;
1745 ibt_status_t ibt_status;
1746 uint_t psize, port_infosz;
1747 ibd_link_op_t opcode;
1748 ibd_req_t *req;
1749 link_state_t new_link_state = LINK_STATE_UP;
1750 uint8_t itreply;
1751 uint16_t pkix;
1752 int ret;
1753
1754 /*
1755 * Let's not race with a plumb or an unplumb; if we detect a
1756 * pkey relocation event later on here, we may have to restart.
1757 */
1758 ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
1759
1760 mutex_enter(&state->id_link_mutex);
1761
1762 /*
1763 * If the link state is unknown, a plumb has not yet been attempted
1764 * on the interface. Nothing to do.
1765 */
1766 if (state->id_link_state == LINK_STATE_UNKNOWN) {
1767 mutex_exit(&state->id_link_mutex);
1768 goto link_mod_return;
1769 }
1770
1771 /*
1772 * If the link is down because the plumb failed, we are not in late
1773 * HCA init, and the interface never started successfully, nothing to do.
1774 */
1775 if ((state->id_link_state == LINK_STATE_DOWN) &&
1776 ((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) == 0) &&
1777 ((state->id_mac_state & IBD_DRV_STARTED) == 0)) {
1778 mutex_exit(&state->id_link_mutex);
1779 goto link_mod_return;
1780 }
1781
1782 /*
1783 * If this routine was called in response to a port down event,
1784 * we just need to see whether the change should be reported.
1785 */
1786 if (code == IBT_ERROR_PORT_DOWN) {
1787 new_link_state = LINK_STATE_DOWN;
1788 goto update_link_state;
1789 }
1790
1791 /*
1792 * If it's not a port down event we've received, try to get the port
1793 * attributes first. If we fail here, the port is as good as down.
1794 * Otherwise, if the link went down by the time the handler gets
1795 * here, give up - we cannot even validate the pkey/gid since those
1796 * are not valid and this is as bad as a port down anyway.
1797 */
1798 ibt_status = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
1799 &port_infop, &psize, &port_infosz);
1800 if ((ibt_status != IBT_SUCCESS) || (psize != 1) ||
1801 (port_infop->p_linkstate != IBT_PORT_ACTIVE)) {
1802 new_link_state = LINK_STATE_DOWN;
1803 goto update_link_state;
1804 }
1805
1806 /*
1807 * If the pkey was not found in the previous attempt, either because
1808 * the port was down or because the pkey was absent from the pkey
1809 * table, look for it now and try to start the interface.
1810 */
1811 if (state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) {
1812 mutex_exit(&state->id_link_mutex);
1813 if ((ret = ibd_start(state)) != 0) {
1814 DPRINT(10, "ibd_link_mod: cannot start from late HCA "
1815 "init, ret=%d", ret);
1816 }
1817 ibt_free_portinfo(port_infop, port_infosz);
1818 goto link_mod_return;
1819 }
1820
1821 /*
1822 * Check the SM InitTypeReply flags. If both NoLoadReply and
1823 * PreserveContentReply are 0, we don't know anything about the
1824 * data loaded into the port attributes, so we need to verify
1825 * if gid0 and pkey are still valid.
1826 */
1827 itreply = port_infop->p_init_type_reply;
1828 if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) &&
1829 ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0)) {
1830 /*
1831 * Check to see if the subnet part of GID0 has changed. If
1832 * not, check the simple case first to see if the pkey
1833 * index is the same as before; finally check to see if the
1834 * pkey has been relocated to a different index in the table.
1835 */
1836 if (bcmp(port_infop->p_sgid_tbl,
1837 &state->id_sgid, sizeof (ib_gid_t)) != 0) {
1838
1839 new_link_state = LINK_STATE_DOWN;
1840
1841 } else if (port_infop->p_pkey_tbl[state->id_pkix] ==
1842 state->id_pkey) {
1843
1844 new_link_state = LINK_STATE_UP;
1845
1846 } else if (ibd_locate_pkey(port_infop->p_pkey_tbl,
1847 port_infop->p_pkey_tbl_sz, state->id_pkey, &pkix) == 0) {
1848
1849 ibt_free_portinfo(port_infop, port_infosz);
1850 mutex_exit(&state->id_link_mutex);
1851
1852 /*
1853 * Currently a restart is required if our pkey has moved
1854 * in the pkey table. If we get the ibt_recycle_ud() to
1855 * work as documented (expected), we may be able to
1856 * avoid a complete restart. Note that we've already
1857 * marked both the start and stop 'in-progress' flags,
1858 * so it is ok to go ahead and do this restart.
1859 */
1860 (void) ibd_undo_start(state, LINK_STATE_DOWN);
1861 if ((ret = ibd_start(state)) != 0) {
1862 DPRINT(10, "ibd_restart: cannot restart, "
1863 "ret=%d", ret);
1864 }
1865
1866 goto link_mod_return;
1867 } else {
1868 new_link_state = LINK_STATE_DOWN;
1869 }
1870 }
1871
1872 update_link_state:
1873 if (port_infop) {
1874 ibt_free_portinfo(port_infop, port_infosz);
1875 }
1876
1877 /*
1878 * If we're reporting a link up, check InitTypeReply to see if
1879 * the SM has ensured that the port's presence in mcg, traps,
1880 * etc. is intact.
1881 */
1882 if (new_link_state == LINK_STATE_DOWN) {
1883 opcode = IBD_LINK_DOWN;
1884 } else {
1885 if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) ==
1886 SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) {
1887 opcode = IBD_LINK_UP;
1888 } else {
1889 opcode = IBD_LINK_UP_ABSENT;
1890 }
1891 }
1892
1893 /*
1894 * If the old state is the same as the new state, and the SM indicated
1895 * no change in the port parameters, nothing to do.
1896 */
1897 if ((state->id_link_state == new_link_state) && (opcode !=
1898 IBD_LINK_UP_ABSENT)) {
1899 mutex_exit(&state->id_link_mutex);
1900 goto link_mod_return;
1901 }
1902
1903 /*
1904 * Ok, so there was a link state change; see if it's safe to ask
1905 * the async thread to do the work
1906 */
1907 if (!ibd_async_safe(state)) {
1908 state->id_link_state = new_link_state;
1909 mutex_exit(&state->id_link_mutex);
1910 goto link_mod_return;
1911 }
1912
1913 mutex_exit(&state->id_link_mutex);
1914
1915 /*
1916 * Queue up a request for ibd_async_link() to handle this link
1917 * state change event
1918 */
1919 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
1920 req->rq_ptr = (void *)opcode;
1921 ibd_queue_work_slot(state, req, IBD_ASYNC_LINK);
1922
1923 link_mod_return:
1924 ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
1925 }
1926
1927 /*
1928 * For the port up/down events, IBTL guarantees there will not be concurrent
1929 * invocations of the handler. IBTL might coalesce link transition events,
1930 * and not invoke the handler for _each_ up/down transition, but it will
1931 * invoke the handler with the last known state.
1932 */
1933 static void
1934 ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
1935 ibt_async_code_t code, ibt_async_event_t *event)
1936 {
1937 ibd_state_t *state = (ibd_state_t *)clnt_private;
1938
1939 switch (code) {
1940 case IBT_ERROR_CATASTROPHIC_CHAN:
1941 ibd_print_warn(state, "catastrophic channel error");
1942 break;
1943 case IBT_ERROR_CQ:
1944 ibd_print_warn(state, "completion queue error");
1945 break;
1946 case IBT_PORT_CHANGE_EVENT:
1947 /*
1948 * Events will be delivered to all instances that have
1949 * done ibt_open_hca() but not yet done ibt_close_hca().
1950 * Only need to do work for our port; IBTF will deliver
1951 * events for other ports on the hca we have ibt_open_hca'ed
1952 * too. Note that id_port is initialized in ibd_attach()
1953 * before ibt_open_hca() is called there.
1954 */
1955 ASSERT(state->id_hca_hdl == hca_hdl);
1956 if (state->id_port != event->ev_port)
1957 break;
1958
1959 if ((event->ev_port_flags & IBT_PORT_CHANGE_PKEY) ==
1960 IBT_PORT_CHANGE_PKEY) {
1961 ibd_link_mod(state, code);
1962 }
1963 break;
1964 case IBT_ERROR_PORT_DOWN:
1965 case IBT_CLNT_REREG_EVENT:
1966 case IBT_EVENT_PORT_UP:
1967 /*
1968 * Events will be delivered to all instances that have
1969 * done ibt_open_hca() but not yet done ibt_close_hca().
1970 * Only need to do work for our port; IBTF will deliver
1971 * events for other ports on the hca we have ibt_open_hca'ed
1972 * too. Note that id_port is initialized in ibd_attach()
1973 * before ibt_open_hca() is called there.
1974 */
1975 ASSERT(state->id_hca_hdl == hca_hdl);
1976 if (state->id_port != event->ev_port)
1977 break;
1978
1979 ibd_link_mod(state, code);
1980 break;
1981
1982 case IBT_HCA_ATTACH_EVENT:
1983 case IBT_HCA_DETACH_EVENT:
1984 /*
1985 * When a new card is plugged into the system, attach_event is
1986 * invoked. Additionally, a cfgadm needs to be run to make the
1987 * card known to the system, and an ifconfig needs to be run to
1988 * plumb up any ibd interfaces on the card. In the case of card
1989 * unplug, a cfgadm is run that will trigger any RCM scripts to
1990 * unplumb the ibd interfaces on the card; when the card is
1991 * actually unplugged, the detach_event is invoked;
1992 * additionally, if any ibd instances are still active on the
1993 * card (e.g., there were no associated RCM scripts), the
1994 * driver's detach routine is invoked.
1995 */
1996 break;
1997 default:
1998 break;
1999 }
2000 }
2001
2002 static int
2003 ibd_register_mac(ibd_state_t *state, dev_info_t *dip)
2004 {
2005 mac_register_t *macp;
2006 int ret;
2007
2008 if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
2009 DPRINT(10, "ibd_register_mac: mac_alloc() failed");
2010 return (DDI_FAILURE);
2011 }
2012
2013 /*
2014 * Note that when we register with mac during attach, we don't
2015 * have the id_macaddr yet, so we'll simply be registering a
2016 * zero macaddr that we'll overwrite later during plumb (in
2017 * ibd_m_start()). The same applies to id_mtu; we'll
2018 * update the mac layer with the correct mtu during plumb.
2019 */
2020 macp->m_type_ident = MAC_PLUGIN_IDENT_IB;
2021 macp->m_driver = state;
2022 macp->m_dip = dip;
2023 macp->m_src_addr = (uint8_t *)&state->id_macaddr;
2024 macp->m_callbacks = &ibd_m_callbacks;
2025 macp->m_min_sdu = 0;
2026 macp->m_multicast_sdu = IBD_DEF_MAX_SDU;
2027 if (state->id_type == IBD_PORT_DRIVER) {
2028 macp->m_max_sdu = IBD_DEF_RC_MAX_SDU;
2029 } else if (state->id_enable_rc) {
2030 macp->m_max_sdu = state->rc_mtu - IPOIB_HDRSIZE;
2031 } else {
2032 macp->m_max_sdu = IBD_DEF_MAX_SDU;
2033 }
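/*
 * Note on the numbers above: the SDU advertised to mac excludes the
 * IPoIB encapsulation header (assumed here to be the 4-byte
 * IPOIB_HDRSIZE), so, for illustration, an RC channel with an assumed
 * rc_mtu of 65520 bytes is advertised as a 65516-byte max SDU.
 */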
2034 macp->m_priv_props = ibd_priv_props;
2035
2036 /*
2037 * Register ourselves with the GLDv3 interface
2038 */
2039 if ((ret = mac_register(macp, &state->id_mh)) != 0) {
2040 mac_free(macp);
2041 DPRINT(10,
2042 "ibd_register_mac: mac_register() failed, ret=%d", ret);
2043 return (DDI_FAILURE);
2044 }
2045
2046 mac_free(macp);
2047 return (DDI_SUCCESS);
2048 }
2049
2050 static int
2051 ibd_record_capab(ibd_state_t *state)
2052 {
2053 ibt_hca_attr_t hca_attrs;
2054 ibt_status_t ibt_status;
2055
2056 /*
2057 * Query the HCA and fetch its attributes
2058 */
2059 ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
2060 ASSERT(ibt_status == IBT_SUCCESS);
2061
2062 /*
2063 * 1. Set the Hardware Checksum capability. Currently we only consider
2064 * full checksum offload.
2065 */
2066 if (state->id_enable_rc) {
2067 state->id_hwcksum_capab = 0;
2068 } else {
2069 if ((hca_attrs.hca_flags & IBT_HCA_CKSUM_FULL)
2070 == IBT_HCA_CKSUM_FULL) {
2071 state->id_hwcksum_capab = IBT_HCA_CKSUM_FULL;
2072 }
2073 }
2074
2075 /*
2076 * 2. Set LSO policy, capability and maximum length
2077 */
2078 if (state->id_enable_rc) {
2079 state->id_lso_capable = B_FALSE;
2080 state->id_lso_maxlen = 0;
2081 } else {
2082 if (hca_attrs.hca_max_lso_size > 0) {
2083 state->id_lso_capable = B_TRUE;
2084 if (hca_attrs.hca_max_lso_size > IBD_LSO_MAXLEN)
2085 state->id_lso_maxlen = IBD_LSO_MAXLEN;
2086 else
2087 state->id_lso_maxlen =
2088 hca_attrs.hca_max_lso_size;
2089 } else {
2090 state->id_lso_capable = B_FALSE;
2091 state->id_lso_maxlen = 0;
2092 }
2093 }
2094
2095 /*
2096 * 3. Set Reserved L_Key capability
2097 */
2098 if ((hca_attrs.hca_flags2 & IBT_HCA2_RES_LKEY) == IBT_HCA2_RES_LKEY) {
2099 state->id_hca_res_lkey_capab = 1;
2100 state->id_res_lkey = hca_attrs.hca_reserved_lkey;
2101 state->rc_enable_iov_map = B_TRUE;
2102 } else {
2103 /* If no reserved lkey, we will not use ibt_map_mem_iov */
2104 state->rc_enable_iov_map = B_FALSE;
2105 }
2106
2107 /*
2108 * 4. Set maximum sqseg value after checking to see if extended sgl
2109 * size information is provided by the hca
2110 */
2111 if (hca_attrs.hca_flags & IBT_HCA_WQE_SIZE_INFO) {
2112 state->id_max_sqseg = hca_attrs.hca_ud_send_sgl_sz;
2113 state->rc_tx_max_sqseg = hca_attrs.hca_conn_send_sgl_sz;
2114 } else {
2115 state->id_max_sqseg = hca_attrs.hca_max_sgl;
2116 state->rc_tx_max_sqseg = hca_attrs.hca_max_sgl;
2117 }
2118 if (state->id_max_sqseg > IBD_MAX_SQSEG) {
2119 state->id_max_sqseg = IBD_MAX_SQSEG;
2120 } else if (state->id_max_sqseg < IBD_MAX_SQSEG) {
2121 ibd_print_warn(state, "Set #sgl = %d instead of default %d",
2122 state->id_max_sqseg, IBD_MAX_SQSEG);
2123 }
2124 if (state->rc_tx_max_sqseg > IBD_MAX_SQSEG) {
2125 state->rc_tx_max_sqseg = IBD_MAX_SQSEG;
2126 } else if (state->rc_tx_max_sqseg < IBD_MAX_SQSEG) {
2127 ibd_print_warn(state, "RC mode: Set #sgl = %d instead of "
2128 "default %d", state->rc_tx_max_sqseg, IBD_MAX_SQSEG);
2129 }
2130
2131 /*
2132 * Translating the virtual address regions into physical regions
2133 * for using the Reserved LKey feature results in a wr sgl that
2134 * is a little longer. Since failing ibt_map_mem_iov() is costly,
2135 * we'll fix a high-water mark (65%) for when we should stop.
2136 */
2137 state->id_max_sqseg_hiwm = (state->id_max_sqseg * 65) / 100;
2138 state->rc_max_sqseg_hiwm = (state->rc_tx_max_sqseg * 65) / 100;
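/*
 * For illustration, with an assumed 48-entry send SGL the high-water
 * mark above works out to (48 * 65) / 100 = 31 segments.
 */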
2139
2140 /*
2141 * 5. Set number of recv and send wqes after checking hca maximum
2142 * channel size. Store the max channel size in the state so that it
2143 * can be referred to when the swqe/rwqe change is requested via
2144 * dladm.
2145 */
2146
2147 state->id_hca_max_chan_sz = hca_attrs.hca_max_chan_sz;
2148
2149 if (hca_attrs.hca_max_chan_sz < state->id_ud_num_rwqe)
2150 state->id_ud_num_rwqe = hca_attrs.hca_max_chan_sz;
2151
2152 state->id_rx_bufs_outstanding_limit = state->id_ud_num_rwqe -
2153 IBD_RWQE_MIN;
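/*
 * In other words, loaning of Rx buffers up the stack is capped so
 * that at least IBD_RWQE_MIN rwqes remain available for reposting.
 */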
2154
2155 if (hca_attrs.hca_max_chan_sz < state->id_ud_num_swqe)
2156 state->id_ud_num_swqe = hca_attrs.hca_max_chan_sz;
2157
2158 return (DDI_SUCCESS);
2159 }
2160
2161 static int
2162 ibd_part_busy(ibd_state_t *state)
2163 {
2164 if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) != 0) {
2165 DPRINT(10, "ibd_part_busy: failed: rx bufs outstanding\n");
2166 return (DDI_FAILURE);
2167 }
2168
2169 if (state->rc_srq_rwqe_list.dl_bufs_outstanding != 0) {
2170 DPRINT(10, "ibd_part_busy: failed: srq bufs outstanding\n");
2171 return (DDI_FAILURE);
2172 }
2173
2174 /*
2175 * "state->id_ah_op == IBD_OP_ONGOING" means this IPoIB port is
2176 * connecting to a remote IPoIB port. We can't remove this port.
2177 */
2178 if (state->id_ah_op == IBD_OP_ONGOING) {
2179 DPRINT(10, "ibd_part_busy: failed: connecting\n");
2180 return (DDI_FAILURE);
2181 }
2182
2183 return (DDI_SUCCESS);
2184 }
2185
2186
2187 static void
2188 ibd_part_unattach(ibd_state_t *state)
2189 {
2190 uint32_t progress = state->id_mac_state;
2191 ibt_status_t ret;
2192
2193 /* make sure rx resources are freed */
2194 ibd_free_rx_rsrcs(state);
2195
2196 if (progress & IBD_DRV_RC_SRQ_ALLOCD) {
2197 ASSERT(state->id_enable_rc);
2198 ibd_rc_fini_srq_list(state);
2199 state->id_mac_state &= (~IBD_DRV_RC_SRQ_ALLOCD);
2200 }
2201
2202 if (progress & IBD_DRV_MAC_REGISTERED) {
2203 (void) mac_unregister(state->id_mh);
2204 state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED);
2205 }
2206
2207 if (progress & IBD_DRV_ASYNC_THR_CREATED) {
2208 /*
2209 * No new async requests will be posted since the device
2210 * link state has been marked as unknown; completion handlers
2211 * have been turned off, so Tx handler will not cause any
2212 * more IBD_ASYNC_REAP requests.
2213 *
2214 * Queue a request for the async thread to exit, which will
2215 * be serviced after any pending ones. This can take a while,
2216 * especially if the SM is unreachable, since IBMF will slowly
2217 * timeout each SM request issued by the async thread. Reap
2218 * the thread before continuing on, we do not want it to be
2219 * lingering in modunloaded code.
2220 */
2221 ibd_queue_work_slot(state, &state->id_ah_req, IBD_ASYNC_EXIT);
2222 thread_join(state->id_async_thrid);
2223
2224 state->id_mac_state &= (~IBD_DRV_ASYNC_THR_CREATED);
2225 }
2226
2227 if (progress & IBD_DRV_REQ_LIST_INITED) {
2228 list_destroy(&state->id_req_list);
2229 mutex_destroy(&state->id_acache_req_lock);
2230 cv_destroy(&state->id_acache_req_cv);
2231 state->id_mac_state &= ~IBD_DRV_REQ_LIST_INITED;
2232 }
2233
2234 if (progress & IBD_DRV_PD_ALLOCD) {
2235 if ((ret = ibt_free_pd(state->id_hca_hdl,
2236 state->id_pd_hdl)) != IBT_SUCCESS) {
2237 ibd_print_warn(state, "failed to free "
2238 "protection domain, ret=%d", ret);
2239 }
2240 state->id_pd_hdl = NULL;
2241 state->id_mac_state &= (~IBD_DRV_PD_ALLOCD);
2242 }
2243
2244 if (progress & IBD_DRV_HCA_OPENED) {
2245 if ((ret = ibt_close_hca(state->id_hca_hdl)) !=
2246 IBT_SUCCESS) {
2247 ibd_print_warn(state, "failed to close "
2248 "HCA device, ret=%d", ret);
2249 }
2250 state->id_hca_hdl = NULL;
2251 state->id_mac_state &= (~IBD_DRV_HCA_OPENED);
2252 }
2253
2254 mutex_enter(&ibd_gstate.ig_mutex);
2255 if (progress & IBD_DRV_IBTL_ATTACH_DONE) {
2256 if ((ret = ibt_detach(state->id_ibt_hdl)) !=
2257 IBT_SUCCESS) {
2258 ibd_print_warn(state,
2259 "ibt_detach() failed, ret=%d", ret);
2260 }
2261 state->id_ibt_hdl = NULL;
2262 state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE);
2263 ibd_gstate.ig_ibt_hdl_ref_cnt--;
2264 }
2265 if ((ibd_gstate.ig_ibt_hdl_ref_cnt == 0) &&
2266 (ibd_gstate.ig_ibt_hdl != NULL)) {
2267 if ((ret = ibt_detach(ibd_gstate.ig_ibt_hdl)) !=
2268 IBT_SUCCESS) {
2269 ibd_print_warn(state, "ibt_detach(): global "
2270 "failed, ret=%d", ret);
2271 }
2272 ibd_gstate.ig_ibt_hdl = NULL;
2273 }
2274 mutex_exit(&ibd_gstate.ig_mutex);
2275
2276 if (progress & IBD_DRV_TXINTR_ADDED) {
2277 ddi_remove_softintr(state->id_tx);
2278 state->id_tx = NULL;
2279 state->id_mac_state &= (~IBD_DRV_TXINTR_ADDED);
2280 }
2281
2282 if (progress & IBD_DRV_RXINTR_ADDED) {
2283 ddi_remove_softintr(state->id_rx);
2284 state->id_rx = NULL;
2285 state->id_mac_state &= (~IBD_DRV_RXINTR_ADDED);
2286 }
2287
2288 #ifdef DEBUG
2289 if (progress & IBD_DRV_RC_PRIVATE_STATE) {
2290 kstat_delete(state->rc_ksp);
2291 state->id_mac_state &= (~IBD_DRV_RC_PRIVATE_STATE);
2292 }
2293 #endif
2294
2295 if (progress & IBD_DRV_STATE_INITIALIZED) {
2296 ibd_state_fini(state);
2297 state->id_mac_state &= (~IBD_DRV_STATE_INITIALIZED);
2298 }
2299 }
2300
2301 int
2302 ibd_part_attach(ibd_state_t *state, dev_info_t *dip)
2303 {
2304 ibt_status_t ret;
2305 int rv;
2306 kthread_t *kht;
2307
2308 /*
2309 * Initialize mutexes and condition variables
2310 */
2311 if (ibd_state_init(state, dip) != DDI_SUCCESS) {
2312 DPRINT(10, "ibd_part_attach: failed in ibd_state_init()");
2313 return (DDI_FAILURE);
2314 }
2315 state->id_mac_state |= IBD_DRV_STATE_INITIALIZED;
2316
2317 /*
2318 * Allocate rx,tx softintr
2319 */
2320 if (ibd_rx_softintr == 1) {
2321 if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx,
2322 NULL, NULL, ibd_intr, (caddr_t)state)) != DDI_SUCCESS) {
2323 DPRINT(10, "ibd_part_attach: failed in "
2324 "ddi_add_softintr(id_rx), ret=%d", rv);
2325 return (DDI_FAILURE);
2326 }
2327 state->id_mac_state |= IBD_DRV_RXINTR_ADDED;
2328 }
2329 if (ibd_tx_softintr == 1) {
2330 if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx,
2331 NULL, NULL, ibd_tx_recycle,
2332 (caddr_t)state)) != DDI_SUCCESS) {
2333 DPRINT(10, "ibd_part_attach: failed in "
2334 "ddi_add_softintr(id_tx), ret=%d", rv);
2335 return (DDI_FAILURE);
2336 }
2337 state->id_mac_state |= IBD_DRV_TXINTR_ADDED;
2338 }
2339
2340 /*
2341 * Attach to IBTL
2342 */
2343 mutex_enter(&ibd_gstate.ig_mutex);
2344 if (ibd_gstate.ig_ibt_hdl == NULL) {
2345 if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state,
2346 &ibd_gstate.ig_ibt_hdl)) != IBT_SUCCESS) {
2347 DPRINT(10, "ibd_part_attach: global: failed in "
2348 "ibt_attach(), ret=%d", ret);
2349 mutex_exit(&ibd_gstate.ig_mutex);
2350 return (DDI_FAILURE);
2351 }
2352 }
2353 if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state,
2354 &state->id_ibt_hdl)) != IBT_SUCCESS) {
2355 DPRINT(10, "ibd_part_attach: failed in ibt_attach(), ret=%d",
2356 ret);
2357 mutex_exit(&ibd_gstate.ig_mutex);
2358 return (DDI_FAILURE);
2359 }
2360 ibd_gstate.ig_ibt_hdl_ref_cnt++;
2361 mutex_exit(&ibd_gstate.ig_mutex);
2362 state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE;
2363
2364 /*
2365 * Open the HCA
2366 */
2367 if ((ret = ibt_open_hca(state->id_ibt_hdl, state->id_hca_guid,
2368 &state->id_hca_hdl)) != IBT_SUCCESS) {
2369 DPRINT(10, "ibd_part_attach: ibt_open_hca() failed, ret=%d",
2370 ret);
2371 return (DDI_FAILURE);
2372 }
2373 state->id_mac_state |= IBD_DRV_HCA_OPENED;
2374
2375 #ifdef DEBUG
2376 /* Initialize Driver Counters for Reliable Connected Mode */
2377 if (state->id_enable_rc) {
2378 if (ibd_rc_init_stats(state) != DDI_SUCCESS) {
2379 DPRINT(10, "ibd_part_attach: failed in "
2380 "ibd_rc_init_stats");
2381 return (DDI_FAILURE);
2382 }
2383 state->id_mac_state |= IBD_DRV_RC_PRIVATE_STATE;
2384 }
2385 #endif
2386
2387 /*
2388 * Record capabilities
2389 */
2390 (void) ibd_record_capab(state);
2391
2392 /*
2393 * Allocate a protection domain on the HCA
2394 */
2395 if ((ret = ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS,
2396 &state->id_pd_hdl)) != IBT_SUCCESS) {
2397 DPRINT(10, "ibd_part_attach: ibt_alloc_pd() failed, ret=%d",
2398 ret);
2399 return (DDI_FAILURE);
2400 }
2401 state->id_mac_state |= IBD_DRV_PD_ALLOCD;
2402
2403
2404 /*
2405 * Initialize the req_list that is required for the operation of
2406 * the async thread.
2407 */
2408 mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL);
2409 cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL);
2410 list_create(&state->id_req_list, sizeof (ibd_req_t),
2411 offsetof(ibd_req_t, rq_list));
2412 state->id_mac_state |= IBD_DRV_REQ_LIST_INITED;
2413
2414 /*
2415 * Create the async thread; thread_create never fails.
2416 */
2417 kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0,
2418 TS_RUN, minclsyspri);
2419 state->id_async_thrid = kht->t_did;
2420 state->id_mac_state |= IBD_DRV_ASYNC_THR_CREATED;
2421
2422 return (DDI_SUCCESS);
2423 }
2424
2425 /*
2426 * Attach device to the IO framework.
2427 */
2428 static int
2429 ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2430 {
2431 int ret;
2432
2433 switch (cmd) {
2434 case DDI_ATTACH:
2435 ret = ibd_port_attach(dip);
2436 break;
2437 default:
2438 ret = DDI_FAILURE;
2439 break;
2440 }
2441 return (ret);
2442 }
2443
2444 /*
2445 * Detach device from the IO framework.
2446 */
2447 static int
2448 ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2449 {
2450 ibd_state_t *state;
2451 int instance;
2452
2453 /*
2454 * IBD doesn't support suspend/resume
2455 */
2456 if (cmd != DDI_DETACH)
2457 return (DDI_FAILURE);
2458
2459 /*
2460 * Get the instance softstate
2461 */
2462 instance = ddi_get_instance(dip);
2463 state = ddi_get_soft_state(ibd_list, instance);
2464
2465 /*
2466 * Release all resources we're still holding. If ibd_attach(),
2467 * ibd_m_start() and ibd_m_stop() have run correctly so far, all
2468 * the flags we need should be present in id_mac_state.
2469 */
2470 return (ibd_port_unattach(state, dip));
2471 }
2472
2473 /*
2474 * Pre ibt_attach() driver initialization
2475 */
2476 static int
2477 ibd_state_init(ibd_state_t *state, dev_info_t *dip)
2478 {
2479 char buf[64];
2480
2481 mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL);
2482 state->id_link_state = LINK_STATE_UNKNOWN;
2483
2484 mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL);
2485 cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL);
2486 state->id_trap_stop = B_TRUE;
2487 state->id_trap_inprog = 0;
2488
2489 mutex_init(&state->id_scq_poll_lock, NULL, MUTEX_DRIVER, NULL);
2490 mutex_init(&state->id_rcq_poll_lock, NULL, MUTEX_DRIVER, NULL);
2491 state->id_dip = dip;
2492
2493 mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL);
2494
2495 mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2496 mutex_init(&state->id_tx_rel_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2497 mutex_init(&state->id_txpost_lock, NULL, MUTEX_DRIVER, NULL);
2498 state->id_tx_busy = 0;
2499 mutex_init(&state->id_lso_lock, NULL, MUTEX_DRIVER, NULL);
2500
2501 state->id_rx_list.dl_bufs_outstanding = 0;
2502 state->id_rx_list.dl_cnt = 0;
2503 mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2504 mutex_init(&state->id_rx_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2505 (void) sprintf(buf, "ibd_req%d_%x_%u", ddi_get_instance(dip),
2506 state->id_pkey, state->id_plinkid);
2507 state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t),
2508 0, NULL, NULL, NULL, NULL, NULL, 0);
2509
2510 /* For Reliable Connected Mode */
2511 mutex_init(&state->rc_rx_lock, NULL, MUTEX_DRIVER, NULL);
2512 mutex_init(&state->rc_tx_large_bufs_lock, NULL, MUTEX_DRIVER, NULL);
2513 mutex_init(&state->rc_srq_rwqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2514 mutex_init(&state->rc_srq_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2515 mutex_init(&state->rc_pass_chan_list.chan_list_mutex, NULL,
2516 MUTEX_DRIVER, NULL);
2517 mutex_init(&state->rc_timeout_lock, NULL, MUTEX_DRIVER, NULL);
2518
2519 /*
2520 * Make RC the default link mode. If RC fails during connection
2521 * setup, the link mode automatically falls back to UD.
2522 * Also set the RC MTU.
2523 */
2524 state->id_enable_rc = IBD_DEF_LINK_MODE;
2525 state->rc_mtu = IBD_DEF_RC_MAX_MTU;
2526 state->id_mtu = IBD_DEF_MAX_MTU;
2527
2528 /* Initialize all tunables to their defaults */
2529 state->id_lso_policy = IBD_DEF_LSO_POLICY;
2530 state->id_num_lso_bufs = IBD_DEF_NUM_LSO_BUFS;
2531 state->id_num_ah = IBD_DEF_NUM_AH;
2532 state->id_hash_size = IBD_DEF_HASH_SIZE;
2533 state->id_create_broadcast_group = IBD_DEF_CREATE_BCAST_GROUP;
2534 state->id_allow_coalesce_comp_tuning = IBD_DEF_COALESCE_COMPLETIONS;
2535 state->id_ud_rx_comp_count = IBD_DEF_UD_RX_COMP_COUNT;
2536 state->id_ud_rx_comp_usec = IBD_DEF_UD_RX_COMP_USEC;
2537 state->id_ud_tx_comp_count = IBD_DEF_UD_TX_COMP_COUNT;
2538 state->id_ud_tx_comp_usec = IBD_DEF_UD_TX_COMP_USEC;
2539 state->id_rc_rx_comp_count = IBD_DEF_RC_RX_COMP_COUNT;
2540 state->id_rc_rx_comp_usec = IBD_DEF_RC_RX_COMP_USEC;
2541 state->id_rc_tx_comp_count = IBD_DEF_RC_TX_COMP_COUNT;
2542 state->id_rc_tx_comp_usec = IBD_DEF_RC_TX_COMP_USEC;
2543 state->id_ud_tx_copy_thresh = IBD_DEF_UD_TX_COPY_THRESH;
2544 state->id_rc_rx_copy_thresh = IBD_DEF_RC_RX_COPY_THRESH;
2545 state->id_rc_tx_copy_thresh = IBD_DEF_RC_TX_COPY_THRESH;
2546 state->id_ud_num_rwqe = IBD_DEF_UD_NUM_RWQE;
2547 state->id_ud_num_swqe = IBD_DEF_UD_NUM_SWQE;
2548 state->id_rc_num_rwqe = IBD_DEF_RC_NUM_RWQE;
2549 state->id_rc_num_swqe = IBD_DEF_RC_NUM_SWQE;
2550 state->rc_enable_srq = IBD_DEF_RC_ENABLE_SRQ;
2551 state->id_rc_num_srq = IBD_DEF_RC_NUM_SRQ;
2552 state->id_rc_rx_rwqe_thresh = IBD_DEF_RC_RX_RWQE_THRESH;
2553
2554 return (DDI_SUCCESS);
2555 }
2556
2557 /*
2558 * Post ibt_detach() driver deconstruction
2559 */
2560 static void
2561 ibd_state_fini(ibd_state_t *state)
2562 {
2563 kmem_cache_destroy(state->id_req_kmc);
2564
2565 mutex_destroy(&state->id_rx_list.dl_mutex);
2566 mutex_destroy(&state->id_rx_free_list.dl_mutex);
2567
2568 mutex_destroy(&state->id_txpost_lock);
2569 mutex_destroy(&state->id_tx_list.dl_mutex);
2570 mutex_destroy(&state->id_tx_rel_list.dl_mutex);
2571 mutex_destroy(&state->id_lso_lock);
2572
2573 mutex_destroy(&state->id_sched_lock);
2574 mutex_destroy(&state->id_scq_poll_lock);
2575 mutex_destroy(&state->id_rcq_poll_lock);
2576
2577 cv_destroy(&state->id_trap_cv);
2578 mutex_destroy(&state->id_trap_lock);
2579 mutex_destroy(&state->id_link_mutex);
2580
2581 /* For Reliable Connected Mode */
2582 mutex_destroy(&state->rc_timeout_lock);
2583 mutex_destroy(&state->rc_srq_free_list.dl_mutex);
2584 mutex_destroy(&state->rc_srq_rwqe_list.dl_mutex);
2585 mutex_destroy(&state->rc_pass_chan_list.chan_list_mutex);
2586 mutex_destroy(&state->rc_tx_large_bufs_lock);
2587 mutex_destroy(&state->rc_rx_lock);
2588 }
2589
2590 /*
2591 * Fetch link speed from SA for snmp ifspeed reporting.
2592 */
2593 static uint64_t
2594 ibd_get_portspeed(ibd_state_t *state)
2595 {
2596 int ret;
2597 ibt_path_info_t path;
2598 ibt_path_attr_t path_attr;
2599 uint8_t num_paths;
2600 uint64_t ifspeed;
2601
2602 /*
2603 * Due to serdes 8b/10b encoding, 2.5 Gbps on the wire
2604 * translates to a 2 Gbps data rate. Thus, 1X single data rate is
2605 * 2000000000 bps. Start with that as the default.
2606 */
2607 ifspeed = 2000000000;
2608
2609 bzero(&path_attr, sizeof (path_attr));
2610
2611 /*
2612 * Get the port speed from Loopback path information.
2613 */
2614 path_attr.pa_dgids = &state->id_sgid;
2615 path_attr.pa_num_dgids = 1;
2616 path_attr.pa_sgid = state->id_sgid;
2617
2618 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS,
2619 &path_attr, 1, &path, &num_paths) != IBT_SUCCESS)
2620 goto earlydone;
2621
2622 if (num_paths < 1)
2623 goto earlydone;
2624
2625 /*
2626 * In case SA does not return an expected value, report the default
2627 * speed as 1X.
2628 */
2629 ret = 1;
2630 switch (path.pi_prim_cep_path.cep_adds_vect.av_srate) {
2631 case IBT_SRATE_2: /* 1X SDR i.e 2.5 Gbps */
2632 ret = 1;
2633 break;
2634 case IBT_SRATE_10: /* 4X SDR or 1X QDR i.e 10 Gbps */
2635 ret = 4;
2636 break;
2637 case IBT_SRATE_30: /* 12X SDR i.e 30 Gbps */
2638 ret = 12;
2639 break;
2640 case IBT_SRATE_5: /* 1X DDR i.e 5 Gbps */
2641 ret = 2;
2642 break;
2643 case IBT_SRATE_20: /* 4X DDR or 8X SDR i.e 20 Gbps */
2644 ret = 8;
2645 break;
2646 case IBT_SRATE_40: /* 8X DDR or 4X QDR i.e 40 Gbps */
2647 ret = 16;
2648 break;
2649 case IBT_SRATE_60: /* 12X DDR i.e 60 Gbps */
2650 ret = 24;
2651 break;
2652 case IBT_SRATE_80: /* 8X QDR i.e 80 Gbps */
2653 ret = 32;
2654 break;
2655 case IBT_SRATE_120: /* 12X QDR i.e 120 Gbps */
2656 ret = 48;
2657 break;
2658 }
2659
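/*
 * For example, a 4X DDR link (IBT_SRATE_20, ret == 8) is reported
 * below as 2000000000 * 8 = 16 Gbps of data rate.
 */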
2660 ifspeed *= ret;
2661
2662 earlydone:
2663 return (ifspeed);
2664 }
2665
2666 /*
2667 * Search input mcg list (id_mc_full or id_mc_non) for an entry
2668 * representing the input mcg mgid.
2669 */
2670 static ibd_mce_t *
2671 ibd_mcache_find(ib_gid_t mgid, struct list *mlist)
2672 {
2673 ibd_mce_t *ptr = list_head(mlist);
2674
2675 /*
2676 * Do plain linear search.
2677 */
2678 while (ptr != NULL) {
2679 if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid,
2680 sizeof (ib_gid_t)) == 0)
2681 return (ptr);
2682 ptr = list_next(mlist, ptr);
2683 }
2684 return (NULL);
2685 }
2686
2687 /*
2688 * Execute IBA JOIN.
2689 */
2690 static ibt_status_t
2691 ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce)
2692 {
2693 ibt_mcg_attr_t mcg_attr;
2694
2695 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
2696 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
2697 mcg_attr.mc_mgid = mgid;
2698 mcg_attr.mc_join_state = mce->mc_jstate;
2699 mcg_attr.mc_scope = state->id_scope;
2700 mcg_attr.mc_pkey = state->id_pkey;
2701 mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow;
2702 mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
2703 mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass;
2704 return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info,
2705 NULL, NULL));
2706 }
2707
2708 /*
2709 * This code JOINs the port in the proper way (depending on the join
2710 * state) so that IBA fabric will forward mcg packets to/from the port.
2711 * It also attaches the QPN to the mcg so it can receive those mcg
2712 * packets. This code makes sure not to attach the mcg to the QP if
2713 * that has been previously done due to the mcg being joined with a
2714 * different join state, even though this is not required by SWG_0216,
2715 * refid 3610.
2716 */
2717 static ibd_mce_t *
2718 ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
2719 {
2720 ibt_status_t ibt_status;
2721 ibd_mce_t *mce, *tmce, *omce = NULL;
2722 boolean_t do_attach = B_TRUE;
2723
2724 DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n",
2725 jstate, mgid.gid_prefix, mgid.gid_guid);
2726
2727 /*
2728 * For enable_multicast Full member joins, we need to do some
2729 * extra work. If there is already an mce on the list that
2730 * indicates full membership, that means the membership has
2731 * not yet been dropped (since the disable_multicast was issued)
2732 * because there are pending Tx's to the mcg; in that case, just
2733 * mark the mce not to be reaped when the Tx completion queues
2734 * an async reap operation.
2735 *
2736 * If there is already an mce on the list indicating sendonly
2737 * membership, try to promote to full membership. Be careful
2738 * not to deallocate the old mce, since there might be an AH
2739 * pointing to it; instead, update the old mce with new data
2740 * that tracks the full membership.
2741 */
2742 if ((jstate == IB_MC_JSTATE_FULL) && ((omce =
2743 IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) {
2744 if (omce->mc_jstate == IB_MC_JSTATE_FULL) {
2745 ASSERT(omce->mc_fullreap);
2746 omce->mc_fullreap = B_FALSE;
2747 return (omce);
2748 } else {
2749 ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON);
2750 }
2751 }
2752
2753 /*
2754 * Allocate the ibd_mce_t to track this JOIN.
2755 */
2756 mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP);
2757 mce->mc_fullreap = B_FALSE;
2758 mce->mc_jstate = jstate;
2759
2760 if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) {
2761 DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d",
2762 ibt_status);
2763 kmem_free(mce, sizeof (ibd_mce_t));
2764 return (NULL);
2765 }
2766
2767 /*
2768 * Is an IBA attach required? Not if the interface is already joined
2769 * to the mcg in a different appropriate join state.
2770 */
2771 if (jstate == IB_MC_JSTATE_NON) {
2772 tmce = IBD_MCACHE_FIND_FULL(state, mgid);
2773 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
2774 do_attach = B_FALSE;
2775 } else if (jstate == IB_MC_JSTATE_FULL) {
2776 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
2777 do_attach = B_FALSE;
2778 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
2779 do_attach = B_FALSE;
2780 }
2781
2782 if (do_attach) {
2783 /*
2784 * Do the IBA attach.
2785 */
2786 DPRINT(10, "ibd_join_group: ibt_attach_mcg \n");
2787 if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl,
2788 &mce->mc_info)) != IBT_SUCCESS) {
2789 DPRINT(10, "ibd_join_group : failed qp attachment "
2790 "%d\n", ibt_status);
2791 /*
2792 * NOTE that we should probably preserve the join info
2793 * in the list and later try to leave again at detach
2794 * time.
2795 */
2796 (void) ibt_leave_mcg(state->id_sgid, mgid,
2797 state->id_sgid, jstate);
2798 kmem_free(mce, sizeof (ibd_mce_t));
2799 return (NULL);
2800 }
2801 }
2802
2803 /*
2804 * Insert the ibd_mce_t in the proper list.
2805 */
2806 if (jstate == IB_MC_JSTATE_NON) {
2807 IBD_MCACHE_INSERT_NON(state, mce);
2808 } else {
2809 /*
2810 * Set up the mc_req fields used for reaping the
2811 * mcg in case of delayed tx completion (see
2812 * ibd_tx_cleanup()). Also done for sendonly join in
2813 * case we are promoted to fullmembership later and
2814 * keep using the same mce.
2815 */
2816 mce->mc_req.rq_gid = mgid;
2817 mce->mc_req.rq_ptr = mce;
2818 /*
2819 * Check whether we are trying to join as a full
2820 * member while already joined send-only. In that case
2821 * we try to drop our SendOnly membership, but it is
2822 * possible that the mcg does not exist anymore (and
2823 * the subnet trap never reached us), so the leave
2824 * operation might fail.
2825 */
2826 if (omce != NULL) {
2827 (void) ibt_leave_mcg(state->id_sgid, mgid,
2828 state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON);
2829 omce->mc_jstate = IB_MC_JSTATE_FULL;
2830 bcopy(&mce->mc_info, &omce->mc_info,
2831 sizeof (ibt_mcg_info_t));
2832 kmem_free(mce, sizeof (ibd_mce_t));
2833 return (omce);
2834 }
2835 mutex_enter(&state->id_mc_mutex);
2836 IBD_MCACHE_INSERT_FULL(state, mce);
2837 mutex_exit(&state->id_mc_mutex);
2838 }
2839
2840 return (mce);
2841 }
2842
2843 /*
2844 * Called during port up event handling to attempt to reacquire full
2845 * membership to an mcg. Stripped down version of ibd_join_group().
2846 * Note that it is possible that the mcg might have gone away, and
2847 * gets recreated at this point.
2848 */
2849 static void
2850 ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce)
2851 {
2852 ib_gid_t mgid;
2853
2854 /*
2855 * If the mc_fullreap flag is set, or this join fails, a subsequent
2856 * reap/leave is going to try to leave the group. We could prevent
2857 * that by adding a boolean flag into ibd_mce_t, if required.
2858 */
2859 if (mce->mc_fullreap)
2860 return;
2861
2862 mgid = mce->mc_info.mc_adds_vect.av_dgid;
2863
2864 DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix,
2865 mgid.gid_guid);
2866
2867 /* While reacquiring, leave and then join the MCG */
2868 (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid,
2869 mce->mc_jstate);
2870 if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS)
2871 ibd_print_warn(state, "Failure on port up to rejoin "
2872 "multicast gid %016llx:%016llx",
2873 (u_longlong_t)mgid.gid_prefix,
2874 (u_longlong_t)mgid.gid_guid);
2875 }
2876
2877 /*
2878 * This code handles delayed Tx completion cleanups for mcg's to which
2879 * disable_multicast has been issued, regular mcg related cleanups during
2880 * disable_multicast, disable_promiscuous and mcg traps, as well as
2881 * cleanups during driver detach time. Depending on the join state,
2882 * it deletes the mce from the appropriate list and issues the IBA
2883 * leave/detach; except in the disable_multicast case when the mce
2884 * is left on the active list for a subsequent Tx completion cleanup.
2885 */
2886 static void
2887 ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid,
2888 uint8_t jstate)
2889 {
2890 ibd_mce_t *tmce;
2891 boolean_t do_detach = B_TRUE;
2892
2893 /*
2894 * Before detaching, we must check whether the other list
2895 * contains the mcg; if we detach blindly, the consumer
2896 * who set up the other list will also stop receiving
2897 * traffic.
2898 */
2899 if (jstate == IB_MC_JSTATE_FULL) {
2900 /*
2901 * The following check is only relevant while coming
2902 * from the Tx completion path in the reap case.
2903 */
2904 if (!mce->mc_fullreap)
2905 return;
2906 mutex_enter(&state->id_mc_mutex);
2907 IBD_MCACHE_PULLOUT_FULL(state, mce);
2908 mutex_exit(&state->id_mc_mutex);
2909 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
2910 do_detach = B_FALSE;
2911 } else if (jstate == IB_MC_JSTATE_NON) {
2912 IBD_MCACHE_PULLOUT_NON(state, mce);
2913 tmce = IBD_MCACHE_FIND_FULL(state, mgid);
2914 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
2915 do_detach = B_FALSE;
2916 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
2917 mutex_enter(&state->id_mc_mutex);
2918 IBD_MCACHE_PULLOUT_FULL(state, mce);
2919 mutex_exit(&state->id_mc_mutex);
2920 do_detach = B_FALSE;
2921 }
2922
2923 /*
2924 * If we are reacting to a mcg trap and leaving our sendonly or
2925 * non membership, the mcg is possibly already gone, so attempting
2926 * to leave might fail. On the other hand, we must try to leave
2927 * anyway, since this might be a trap from long ago, and we could
2928 * have potentially sendonly joined to a recent incarnation of
2929 * the mcg and are about to lose track of this information.
2930 */
2931 if (do_detach) {
2932 DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : "
2933 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
2934 (void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info);
2935 }
2936
2937 (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate);
2938 kmem_free(mce, sizeof (ibd_mce_t));
2939 }
2940
2941 /*
2942 * Async code executed due to multicast and promiscuous disable requests
2943 * and mcg trap handling; also executed during driver detach. Mostly, a
2944 * leave and detach is done; except for the fullmember case when Tx
2945 * requests are pending, whence arrangements are made for subsequent
2946 * cleanup on Tx completion.
2947 */
2948 static void
2949 ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
2950 {
2951 ipoib_mac_t mcmac;
2952 boolean_t recycled;
2953 ibd_mce_t *mce;
2954
2955 DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n",
2956 jstate, mgid.gid_prefix, mgid.gid_guid);
2957
2958 if (jstate == IB_MC_JSTATE_NON) {
2959 recycled = B_TRUE;
2960 mce = IBD_MCACHE_FIND_NON(state, mgid);
2961 /*
2962 * In case we are handling a mcg trap, we might not find
2963 * the mcg in the non list.
2964 */
2965 if (mce == NULL) {
2966 return;
2967 }
2968 } else {
2969 mce = IBD_MCACHE_FIND_FULL(state, mgid);
2970
2971 /*
2972 * In case we are handling a mcg trap, make sure the trap
2973 * is not arriving late; if we have an mce that indicates
2974 * that we are already a fullmember, that would be a clear
2975 * indication that the trap arrived late (ie, is for a
2976 * previous incarnation of the mcg).
2977 */
2978 if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) {
2979 if ((mce == NULL) || (mce->mc_jstate ==
2980 IB_MC_JSTATE_FULL)) {
2981 return;
2982 }
2983 } else {
2984 ASSERT(jstate == IB_MC_JSTATE_FULL);
2985
2986 /*
2987 * If the group join failed, mce will be NULL here,
2988 * because in a GLDv3 driver the set-multicast entry
2989 * point always returns success.
2990 */
2991 if (mce == NULL) {
2992 return;
2993 }
2994
2995 mce->mc_fullreap = B_TRUE;
2996 }
2997
2998 /*
2999 * If no pending Tx's remain that reference the AH
3000 * for the mcg, recycle it from active to free list.
3001 * Else in the IB_MC_JSTATE_FULL case, just mark the AH,
3002 * so the last completing Tx will cause an async reap
3003 * operation to be invoked, at which time we will drop our
3004 * membership to the mcg so that the pending Tx's complete
3005 * successfully. Refer to comments on "AH and MCE active
3006 * list manipulation" at top of this file. The lock protects
3007 * against Tx fast path and Tx cleanup code.
3008 */
3009 mutex_enter(&state->id_ac_mutex);
3010 ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid);
3011 recycled = ibd_acache_recycle(state, &mcmac, (jstate ==
3012 IB_MC_JSTATE_SEND_ONLY_NON));
3013 mutex_exit(&state->id_ac_mutex);
3014 }
3015
3016 if (recycled) {
3017 DPRINT(2, "ibd_leave_group : leave_group reaping : "
3018 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
3019 ibd_async_reap_group(state, mce, mgid, jstate);
3020 }
3021 }
3022
3023 /*
3024 * Find the broadcast address as defined by IPoIB; implicitly
3025 * determines the IBA scope, mtu, tclass etc of the link the
3026 * interface is going to be a member of.
3027 */
3028 static ibt_status_t
3029 ibd_find_bgroup(ibd_state_t *state)
3030 {
3031 ibt_mcg_attr_t mcg_attr;
3032 uint_t numg;
3033 uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL,
3034 IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL,
3035 IB_MC_SCOPE_GLOBAL };
3036 int i, mcgmtu;
3037 boolean_t found = B_FALSE;
3038 int ret;
3039 ibt_mcg_info_t mcg_info;
3040
3041 state->id_bgroup_created = B_FALSE;
3042 state->id_bgroup_present = B_FALSE;
3043
3044 query_bcast_grp:
3045 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
3046 mcg_attr.mc_pkey = state->id_pkey;
3047 state->id_mgid.gid_guid = IB_MGID_IPV4_LOWGRP_MASK;
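/*
 * A sketch of the MGID being built below: per the IPoIB spec, the
 * IPv4 broadcast group is ff12:401b:<pkey>::ffff:ffff. For example,
 * with the default pkey 0xffff and link-local scope, the loop
 * queries for ff12:401b:ffff:0000:0000:0000:ffff:ffff.
 */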
3048
3049 for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) {
3050 state->id_scope = mcg_attr.mc_scope = scopes[i];
3051
3052 /*
3053 * Look for the IPoIB broadcast group.
3054 */
3055 state->id_mgid.gid_prefix =
3056 (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
3057 ((uint64_t)state->id_scope << 48) |
3058 ((uint32_t)(state->id_pkey << 16)));
3059 mcg_attr.mc_mgid = state->id_mgid;
3060 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1,
3061 &state->id_mcinfo, &numg) == IBT_SUCCESS) {
3062 found = B_TRUE;
3063 break;
3064 }
3065 }
3066
3067 if (!found) {
3068 if (state->id_create_broadcast_group) {
3069 /*
3070 * If we created the broadcast group, but failed to
3071 * find it, we can't do anything except leave the
3072 * one we created and return failure.
3073 */
3074 if (state->id_bgroup_created) {
3075 ibd_print_warn(state, "IPoIB broadcast group "
3076 "absent. Unable to query after create.");
3077 goto find_bgroup_fail;
3078 }
3079
3080 /*
3081 * Create the ipoib broadcast group if it didn't exist
3082 */
3083 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
3084 mcg_attr.mc_qkey = IBD_DEFAULT_QKEY;
3085 mcg_attr.mc_join_state = IB_MC_JSTATE_FULL;
3086 mcg_attr.mc_scope = IB_MC_SCOPE_SUBNET_LOCAL;
3087 mcg_attr.mc_pkey = state->id_pkey;
3088 mcg_attr.mc_flow = 0;
3089 mcg_attr.mc_sl = 0;
3090 mcg_attr.mc_tclass = 0;
3091 state->id_mgid.gid_prefix =
3092 (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
3093 ((uint64_t)IB_MC_SCOPE_SUBNET_LOCAL << 48) |
3094 ((uint32_t)(state->id_pkey << 16)));
3095 mcg_attr.mc_mgid = state->id_mgid;
3096
3097 if ((ret = ibt_join_mcg(state->id_sgid, &mcg_attr,
3098 &mcg_info, NULL, NULL)) != IBT_SUCCESS) {
3099 ibd_print_warn(state, "IPoIB broadcast group "
3100 "absent, create failed: ret = %d\n", ret);
3101 state->id_bgroup_created = B_FALSE;
3102 return (IBT_FAILURE);
3103 }
3104 state->id_bgroup_created = B_TRUE;
3105 goto query_bcast_grp;
3106 } else {
3107 ibd_print_warn(state, "IPoIB broadcast group absent");
3108 return (IBT_FAILURE);
3109 }
3110 }
3111
3112 /*
3113 * Verify the mcg mtu fits within id_mtu, then adopt the mcg mtu.
3114 */
3115 mcgmtu = (128 << state->id_mcinfo->mc_mtu);
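/*
 * mc_mtu is an IB MTU code, not a byte count; e.g., code 4 (the 2K
 * MTU encoding) expands to 128 << 4 = 2048 bytes here.
 */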
3116 if (state->id_mtu < mcgmtu) {
3117 ibd_print_warn(state, "IPoIB broadcast group MTU %d "
3118 "greater than port's maximum MTU %d", mcgmtu,
3119 state->id_mtu);
3120 ibt_free_mcg_info(state->id_mcinfo, 1);
3121 goto find_bgroup_fail;
3122 }
3123 state->id_mtu = mcgmtu;
3124 state->id_bgroup_present = B_TRUE;
3125
3126 return (IBT_SUCCESS);
3127
3128 find_bgroup_fail:
3129 if (state->id_bgroup_created) {
3130 (void) ibt_leave_mcg(state->id_sgid,
3131 mcg_info.mc_adds_vect.av_dgid, state->id_sgid,
3132 IB_MC_JSTATE_FULL);
3133 }
3134
3135 return (IBT_FAILURE);
3136 }
3137
3138 static int
3139 ibd_alloc_tx_copybufs(ibd_state_t *state)
3140 {
3141 ibt_mr_attr_t mem_attr;
3142
3143 /*
3144 * Allocate one big chunk for all regular tx copy bufs
3145 */
3146 state->id_tx_buf_sz = state->id_mtu;
3147 if (state->id_lso_policy && state->id_lso_capable &&
3148 (state->id_ud_tx_copy_thresh > state->id_mtu)) {
3149 state->id_tx_buf_sz = state->id_ud_tx_copy_thresh;
3150 }
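/*
 * A sizing sketch with hypothetical numbers: with 4000 send wqes and
 * a 2048-byte copy buffer each, the single allocation and memory
 * registration below cover roughly 8 MB.
 */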
3151
3152 state->id_tx_bufs = kmem_zalloc(state->id_ud_num_swqe *
3153 state->id_tx_buf_sz, KM_SLEEP);
3154
3155 state->id_tx_wqes = kmem_zalloc(state->id_ud_num_swqe *
3156 sizeof (ibd_swqe_t), KM_SLEEP);
3157
3158 /*
3159 * Do one memory registration on the entire txbuf area
3160 */
3161 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_tx_bufs;
3162 mem_attr.mr_len = state->id_ud_num_swqe * state->id_tx_buf_sz;
3163 mem_attr.mr_as = NULL;
3164 mem_attr.mr_flags = IBT_MR_SLEEP;
3165 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
3166 &state->id_tx_mr_hdl, &state->id_tx_mr_desc) != IBT_SUCCESS) {
3167 DPRINT(10, "ibd_alloc_tx_copybufs: ibt_register_mr failed");
3168 kmem_free(state->id_tx_wqes,
3169 state->id_ud_num_swqe * sizeof (ibd_swqe_t));
3170 kmem_free(state->id_tx_bufs,
3171 state->id_ud_num_swqe * state->id_tx_buf_sz);
3172 state->id_tx_bufs = NULL;
3173 return (DDI_FAILURE);
3174 }
3175
3176 return (DDI_SUCCESS);
3177 }
3178
3179 static int
3180 ibd_alloc_tx_lsobufs(ibd_state_t *state)
3181 {
3182 ibt_mr_attr_t mem_attr;
3183 ibd_lsobuf_t *buflist;
3184 ibd_lsobuf_t *lbufp;
3185 ibd_lsobuf_t *tail;
3186 ibd_lsobkt_t *bktp;
3187 uint8_t *membase;
3188 uint8_t *memp;
3189 uint_t memsz;
3190 int i;
3191
3192 /*
3193 * Allocate the lso bucket
3194 */
3195 bktp = kmem_zalloc(sizeof (ibd_lsobkt_t), KM_SLEEP);
3196
3197 /*
3198 * Allocate the entire lso memory and register it
3199 */
3200 memsz = state->id_num_lso_bufs * IBD_LSO_BUFSZ;
3201 membase = kmem_zalloc(memsz, KM_SLEEP);
3202
3203 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)membase;
3204 mem_attr.mr_len = memsz;
3205 mem_attr.mr_as = NULL;
3206 mem_attr.mr_flags = IBT_MR_SLEEP;
3207 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl,
3208 &mem_attr, &bktp->bkt_mr_hdl, &bktp->bkt_mr_desc) != IBT_SUCCESS) {
3209 DPRINT(10, "ibd_alloc_tx_lsobufs: ibt_register_mr failed");
3210 kmem_free(membase, memsz);
3211 kmem_free(bktp, sizeof (ibd_lsobkt_t));
3212 return (DDI_FAILURE);
3213 }
3214
3215 mutex_enter(&state->id_lso_lock);
3216
3217 /*
3218 * Now allocate the buflist. Note that the elements in the buflist and
3219 * the buffers in the lso memory have a permanent 1-1 relation, so we
3220 * can always derive the address of a buflist entry from the address of
3221 * an lso buffer.
3222 */
3223 buflist = kmem_zalloc(state->id_num_lso_bufs * sizeof (ibd_lsobuf_t),
3224 KM_SLEEP);
3225
3226 /*
3227 * Set up the lso buf chain
3228 */
3229 memp = membase;
3230 lbufp = buflist;
3231 for (i = 0; i < state->id_num_lso_bufs; i++) {
3232 lbufp->lb_isfree = 1;
3233 lbufp->lb_buf = memp;
3234 lbufp->lb_next = lbufp + 1;
3235
3236 tail = lbufp;
3237
3238 memp += IBD_LSO_BUFSZ;
3239 lbufp++;
3240 }
3241 tail->lb_next = NULL;
3242
3243 /*
3244 * Set up the LSO buffer information in ibd state
3245 */
3246 bktp->bkt_bufl = buflist;
3247 bktp->bkt_free_head = buflist;
3248 bktp->bkt_mem = membase;
3249 bktp->bkt_nelem = state->id_num_lso_bufs;
3250 bktp->bkt_nfree = bktp->bkt_nelem;
3251
3252 state->id_lso = bktp;
3253 mutex_exit(&state->id_lso_lock);
3254
3255 return (DDI_SUCCESS);
3256 }
3257
3258 /*
3259 * Statically allocate Tx buffer list(s).
3260 */
3261 static int
3262 ibd_init_txlist(ibd_state_t *state)
3263 {
3264 ibd_swqe_t *swqe;
3265 ibt_lkey_t lkey;
3266 int i;
3267 uint_t len;
3268 uint8_t *bufaddr;
3269
3270 if (ibd_alloc_tx_copybufs(state) != DDI_SUCCESS)
3271 return (DDI_FAILURE);
3272
3273 if (state->id_lso_policy && state->id_lso_capable) {
3274 if (ibd_alloc_tx_lsobufs(state) != DDI_SUCCESS)
3275 state->id_lso_capable = B_FALSE;
3276 }
3277
3278 mutex_enter(&state->id_tx_list.dl_mutex);
3279 state->id_tx_list.dl_head = NULL;
3280 state->id_tx_list.dl_pending_sends = B_FALSE;
3281 state->id_tx_list.dl_cnt = 0;
3282 mutex_exit(&state->id_tx_list.dl_mutex);
3283 mutex_enter(&state->id_tx_rel_list.dl_mutex);
3284 state->id_tx_rel_list.dl_head = NULL;
3285 state->id_tx_rel_list.dl_pending_sends = B_FALSE;
3286 state->id_tx_rel_list.dl_cnt = 0;
3287 mutex_exit(&state->id_tx_rel_list.dl_mutex);
3288
3289 /*
3290 * Allocate and setup the swqe list
3291 */
3292 lkey = state->id_tx_mr_desc.md_lkey;
3293 bufaddr = state->id_tx_bufs;
3294 len = state->id_tx_buf_sz;
3295 swqe = state->id_tx_wqes;
3296 mutex_enter(&state->id_tx_list.dl_mutex);
3297 for (i = 0; i < state->id_ud_num_swqe; i++, swqe++, bufaddr += len) {
3298 swqe->swqe_next = NULL;
3299 swqe->swqe_im_mblk = NULL;
3300
3301 swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t)
3302 bufaddr;
3303 swqe->swqe_copybuf.ic_sgl.ds_key = lkey;
3304 swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */
3305
3306 swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe;
3307 swqe->w_swr.wr_flags = IBT_WR_NO_FLAGS;
3308 swqe->w_swr.wr_trans = IBT_UD_SRV;
3309
3310 /* These are set in send */
3311 swqe->w_swr.wr_nds = 0;
3312 swqe->w_swr.wr_sgl = NULL;
3313 swqe->w_swr.wr_opcode = IBT_WRC_SEND;
3314
3315 /* add to list */
3316 state->id_tx_list.dl_cnt++;
3317 swqe->swqe_next = state->id_tx_list.dl_head;
3318 state->id_tx_list.dl_head = SWQE_TO_WQE(swqe);
3319 }
3320 mutex_exit(&state->id_tx_list.dl_mutex);
3321
3322 return (DDI_SUCCESS);
3323 }
3324
3325 static int
3326 ibd_acquire_lsobufs(ibd_state_t *state, uint_t req_sz, ibt_wr_ds_t *sgl_p,
3327 uint32_t *nds_p)
3328 {
3329 ibd_lsobkt_t *bktp;
3330 ibd_lsobuf_t *lbufp;
3331 ibd_lsobuf_t *nextp;
3332 ibt_lkey_t lso_lkey;
3333 uint_t frag_sz;
3334 uint_t num_needed;
3335 int i;
3336
3337 ASSERT(sgl_p != NULL);
3338 ASSERT(nds_p != NULL);
3339 ASSERT(req_sz != 0);
3340
3341 /*
3342 * Determine how many bufs we'd need for the size requested
3343 */
3344 num_needed = req_sz / IBD_LSO_BUFSZ;
3345 if ((frag_sz = req_sz % IBD_LSO_BUFSZ) != 0)
3346 num_needed++;
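/*
 * For example, assuming an 8 KB IBD_LSO_BUFSZ, a 150000-byte request
 * needs 19 buffers; the last sgl entry is trimmed to the 2544-byte
 * remainder further below.
 */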
3347
3348 mutex_enter(&state->id_lso_lock);
3349
3350 /*
3351 * If we don't have enough lso bufs, return failure
3352 */
3353 ASSERT(state->id_lso != NULL);
3354 bktp = state->id_lso;
3355 if (bktp->bkt_nfree < num_needed) {
3356 mutex_exit(&state->id_lso_lock);
3357 return (-1);
3358 }
3359
3360 /*
3361 * Pick the first 'num_needed' bufs from the free list
3362 */
3363 lso_lkey = bktp->bkt_mr_desc.md_lkey;
3364 lbufp = bktp->bkt_free_head;
3365 for (i = 0; i < num_needed; i++) {
3366 ASSERT(lbufp->lb_isfree != 0);
3367 ASSERT(lbufp->lb_buf != NULL);
3368
3369 nextp = lbufp->lb_next;
3370
3371 sgl_p[i].ds_va = (ib_vaddr_t)(uintptr_t)lbufp->lb_buf;
3372 sgl_p[i].ds_key = lso_lkey;
3373 sgl_p[i].ds_len = IBD_LSO_BUFSZ;
3374
3375 lbufp->lb_isfree = 0;
3376 lbufp->lb_next = NULL;
3377
3378 lbufp = nextp;
3379 }
3380 bktp->bkt_free_head = lbufp;
3381
3382 /*
3383 * If the requested size is not a multiple of IBD_LSO_BUFSZ, we need
3384 * to adjust the last sgl entry's length. Since we know we need at least
3385 * one, the i-1 use below is ok.
3386 */
3387 if (frag_sz) {
3388 sgl_p[i-1].ds_len = frag_sz;
3389 }
3390
3391 /*
3392 * Update nfree count and return
3393 */
3394 bktp->bkt_nfree -= num_needed;
3395
3396 mutex_exit(&state->id_lso_lock);
3397
3398 *nds_p = num_needed;
3399
3400 return (0);
3401 }
3402
3403 static void
3404 ibd_release_lsobufs(ibd_state_t *state, ibt_wr_ds_t *sgl_p, uint32_t nds)
3405 {
3406 ibd_lsobkt_t *bktp;
3407 ibd_lsobuf_t *lbufp;
3408 uint8_t *lso_mem_end;
3409 uint_t ndx;
3410 int i;
3411
3412 mutex_enter(&state->id_lso_lock);
3413
3414 bktp = state->id_lso;
3415 ASSERT(bktp != NULL);
3416
3417 lso_mem_end = bktp->bkt_mem + bktp->bkt_nelem * IBD_LSO_BUFSZ;
3418 for (i = 0; i < nds; i++) {
3419 uint8_t *va;
3420
3421 va = (uint8_t *)(uintptr_t)sgl_p[i].ds_va;
3422 ASSERT(va >= bktp->bkt_mem && va < lso_mem_end);
3423
3424 /*
3425 * Figure out the buflist element this sgl buffer corresponds
3426 * to and put it back at the head
3427 */
3428 ndx = (va - bktp->bkt_mem) / IBD_LSO_BUFSZ;
3429 lbufp = bktp->bkt_bufl + ndx;
3430
3431 ASSERT(lbufp->lb_isfree == 0);
3432 ASSERT(lbufp->lb_buf == va);
3433
3434 lbufp->lb_isfree = 1;
3435 lbufp->lb_next = bktp->bkt_free_head;
3436 bktp->bkt_free_head = lbufp;
3437 }
3438 bktp->bkt_nfree += nds;
3439
3440 mutex_exit(&state->id_lso_lock);
3441 }
3442
3443 static void
3444 ibd_free_tx_copybufs(ibd_state_t *state)
3445 {
3446 /*
3447 * Unregister txbuf mr
3448 */
3449 if (ibt_deregister_mr(state->id_hca_hdl,
3450 state->id_tx_mr_hdl) != IBT_SUCCESS) {
3451 DPRINT(10, "ibd_free_tx_copybufs: ibt_deregister_mr failed");
3452 }
3453 state->id_tx_mr_hdl = NULL;
3454
3455 /*
3456 * Free txbuf memory
3457 */
3458 kmem_free(state->id_tx_wqes, state->id_ud_num_swqe *
3459 sizeof (ibd_swqe_t));
3460 kmem_free(state->id_tx_bufs, state->id_ud_num_swqe *
3461 state->id_tx_buf_sz);
3462 state->id_tx_wqes = NULL;
3463 state->id_tx_bufs = NULL;
3464 }
3465
3466 static void
3467 ibd_free_tx_lsobufs(ibd_state_t *state)
3468 {
3469 ibd_lsobkt_t *bktp;
3470
3471 mutex_enter(&state->id_lso_lock);
3472
3473 if ((bktp = state->id_lso) == NULL) {
3474 mutex_exit(&state->id_lso_lock);
3475 return;
3476 }
3477
3478 /*
3479 * First, free the buflist
3480 */
3481 ASSERT(bktp->bkt_bufl != NULL);
3482 kmem_free(bktp->bkt_bufl, bktp->bkt_nelem * sizeof (ibd_lsobuf_t));
3483
3484 /*
3485 * Unregister the LSO memory and free it
3486 */
3487 ASSERT(bktp->bkt_mr_hdl != NULL);
3488 if (ibt_deregister_mr(state->id_hca_hdl,
3489 bktp->bkt_mr_hdl) != IBT_SUCCESS) {
3490 DPRINT(10,
3491 "ibd_free_lsobufs: ibt_deregister_mr failed");
3492 }
3493 ASSERT(bktp->bkt_mem);
3494 kmem_free(bktp->bkt_mem, bktp->bkt_nelem * IBD_LSO_BUFSZ);
3495
3496 /*
3497 * Finally free the bucket
3498 */
3499 kmem_free(bktp, sizeof (ibd_lsobkt_t));
3500 state->id_lso = NULL;
3501
3502 mutex_exit(&state->id_lso_lock);
3503 }
3504
3505 /*
3506 * Free the statically allocated Tx buffer list.
3507 */
3508 static void
3509 ibd_fini_txlist(ibd_state_t *state)
3510 {
3511 /*
3512 * Free the allocated swqes
3513 */
3514 mutex_enter(&state->id_tx_list.dl_mutex);
3515 mutex_enter(&state->id_tx_rel_list.dl_mutex);
3516 state->id_tx_list.dl_head = NULL;
3517 state->id_tx_list.dl_pending_sends = B_FALSE;
3518 state->id_tx_list.dl_cnt = 0;
3519 state->id_tx_rel_list.dl_head = NULL;
3520 state->id_tx_rel_list.dl_pending_sends = B_FALSE;
3521 state->id_tx_rel_list.dl_cnt = 0;
3522 mutex_exit(&state->id_tx_rel_list.dl_mutex);
3523 mutex_exit(&state->id_tx_list.dl_mutex);
3524
3525 ibd_free_tx_lsobufs(state);
3526 ibd_free_tx_copybufs(state);
3527 }
3528
3529 /*
 * Post a NULL-terminated list of rwqes to the hardware.
3531 */
3532 static void
3533 ibd_post_recv_list(ibd_state_t *state, ibd_rwqe_t *rwqe)
3534 {
3535 uint_t i;
3536 uint_t num_posted;
3537 ibt_status_t ibt_status;
3538 ibt_recv_wr_t wrs[IBD_RX_POST_CNT];
3539
3540 while (rwqe) {
3541 /* Post up to IBD_RX_POST_CNT receive work requests */
3542 for (i = 0; i < IBD_RX_POST_CNT; i++) {
3543 wrs[i] = rwqe->w_rwr;
3544 rwqe = WQE_TO_RWQE(rwqe->rwqe_next);
3545 if (rwqe == NULL) {
3546 i++;
3547 break;
3548 }
3549 }
3550
3551 /*
		 * If posting fails for some reason, we'll never receive
		 * a completion notification, so we'll need to clean up.
		 * But we must make sure we don't clean up nodes whose
		 * wrs have been successfully posted. We assume that the
		 * hca driver returns on the first failure to post and
		 * that therefore the first 'num_posted' entries don't
		 * need cleanup here.
3559 */
3560 atomic_add_32(&state->id_rx_list.dl_cnt, i);
3561
3562 num_posted = 0;
3563 ibt_status = ibt_post_recv(state->id_chnl_hdl, wrs, i,
3564 &num_posted);
3565 if (ibt_status != IBT_SUCCESS) {
3566 /* This cannot happen unless the device has an error. */
3567 ibd_print_warn(state, "ibd_post_recv: FATAL: "
3568 "posting multiple wrs failed: "
3569 "requested=%d, done=%d, ret=%d",
3570 IBD_RX_POST_CNT, num_posted, ibt_status);
3571 atomic_add_32(&state->id_rx_list.dl_cnt,
3572 num_posted - i);
3573 }
3574 }
3575 }
3576
3577 /*
3578 * Grab a list of rwqes from the array of lists, and post the list.
3579 */
3580 static void
3581 ibd_post_recv_intr(ibd_state_t *state)
3582 {
3583 ibd_rx_queue_t *rxp;
3584 ibd_rwqe_t *list;
3585
3586 /* rotate through the rx_queue array, expecting an adequate number */
3587 state->id_rx_post_queue_index =
3588 (state->id_rx_post_queue_index + 1) &
3589 (state->id_rx_nqueues - 1);
3590
3591 rxp = state->id_rx_queues + state->id_rx_post_queue_index;
3592 mutex_enter(&rxp->rx_post_lock);
3593 list = WQE_TO_RWQE(rxp->rx_head);
3594 rxp->rx_head = NULL;
3595 rxp->rx_cnt = 0;
3596 mutex_exit(&rxp->rx_post_lock);
3597 ibd_post_recv_list(state, list);
3598 }
3599
3600 /* macro explained below */
3601 #define RX_QUEUE_HASH(rwqe) \
3602 (((uintptr_t)(rwqe) >> 8) & (state->id_rx_nqueues - 1))
3603
3604 /*
 * Add an rwqe to one of the Rx lists. If the list has grown large
 * enough (close to IBD_RX_POST_CNT), post the list to the hardware.
 *
 * Note: one of 2^N lists is chosen via a hash. This is done
 * because using a single list is a point of contention. If the first
 * list is busy (mutex_tryenter fails), fall back to a second list
 * (just call mutex_enter).
 *
 * The shift by 8 in RX_QUEUE_HASH is an arbitrary choice that
 * distributes the rwqes evenly across the 2^N queues.
3614 */
3615 static void
3616 ibd_post_recv(ibd_state_t *state, ibd_rwqe_t *rwqe)
3617 {
3618 ibd_rx_queue_t *rxp;
3619
3620 rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe);
3621
3622 if (!mutex_tryenter(&rxp->rx_post_lock)) {
3623 /* Failed. Try a different queue ("ptr + 16" ensures that). */
3624 rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe + 16);
3625 mutex_enter(&rxp->rx_post_lock);
3626 }
3627 rwqe->rwqe_next = rxp->rx_head;
3628 if (++rxp->rx_cnt >= IBD_RX_POST_CNT - 2) {
3629 uint_t active = atomic_inc_32_nv(&state->id_rx_post_active);
3630
3631 /* only call ibt_post_recv() every Nth time through here */
3632 if ((active & (state->id_rx_nqueues - 1)) == 0) {
3633 rxp->rx_head = NULL;
3634 rxp->rx_cnt = 0;
3635 mutex_exit(&rxp->rx_post_lock);
3636 ibd_post_recv_list(state, rwqe);
3637 return;
3638 }
3639 }
3640 rxp->rx_head = RWQE_TO_WQE(rwqe);
3641 mutex_exit(&rxp->rx_post_lock);
3642 }
3643
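/*
 * Allocate the Rx copy buffer area and the rwqes in one chunk each,
 * initialize the array of Rx post queues, and register the entire
 * copy buffer area as a single memory region.
 */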
3644 static int
3645 ibd_alloc_rx_copybufs(ibd_state_t *state)
3646 {
3647 ibt_mr_attr_t mem_attr;
3648 int i;
3649
3650 /*
3651 * Allocate one big chunk for all regular rx copy bufs
3652 */
3653 state->id_rx_buf_sz = state->id_mtu + IPOIB_GRH_SIZE;
3654
3655 state->id_rx_bufs = kmem_zalloc(state->id_ud_num_rwqe *
3656 state->id_rx_buf_sz, KM_SLEEP);
3657
3658 state->id_rx_wqes = kmem_zalloc(state->id_ud_num_rwqe *
3659 sizeof (ibd_rwqe_t), KM_SLEEP);
3660
3661 state->id_rx_nqueues = 1 << IBD_LOG_RX_POST;
3662 state->id_rx_queues = kmem_zalloc(state->id_rx_nqueues *
3663 sizeof (ibd_rx_queue_t), KM_SLEEP);
3664 for (i = 0; i < state->id_rx_nqueues; i++) {
3665 ibd_rx_queue_t *rxp = state->id_rx_queues + i;
3666 mutex_init(&rxp->rx_post_lock, NULL, MUTEX_DRIVER, NULL);
3667 }
3668
3669 /*
3670 * Do one memory registration on the entire rxbuf area
3671 */
3672 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_rx_bufs;
3673 mem_attr.mr_len = state->id_ud_num_rwqe * state->id_rx_buf_sz;
3674 mem_attr.mr_as = NULL;
3675 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3676 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
3677 &state->id_rx_mr_hdl, &state->id_rx_mr_desc) != IBT_SUCCESS) {
3678 DPRINT(10, "ibd_alloc_rx_copybufs: ibt_register_mr failed");
3679 kmem_free(state->id_rx_wqes,
3680 state->id_ud_num_rwqe * sizeof (ibd_rwqe_t));
3681 kmem_free(state->id_rx_bufs,
3682 state->id_ud_num_rwqe * state->id_rx_buf_sz);
3683 state->id_rx_bufs = NULL;
3684 state->id_rx_wqes = NULL;
3685 return (DDI_FAILURE);
3686 }
3687
3688 return (DDI_SUCCESS);
3689 }
3690
3691 /*
3692 * Allocate the statically allocated Rx buffer list.
3693 */
3694 static int
3695 ibd_init_rxlist(ibd_state_t *state)
3696 {
3697 ibd_rwqe_t *rwqe, *next;
3698 ibd_wqe_t *list;
3699 ibt_lkey_t lkey;
3700 int i;
3701 uint_t len;
3702 uint8_t *bufaddr;
3703
3704 mutex_enter(&state->id_rx_free_list.dl_mutex);
3705 if (state->id_rx_free_list.dl_head != NULL) {
3706 /* rx rsrcs were never freed. Just repost them */
3707 len = state->id_rx_buf_sz;
3708 list = state->id_rx_free_list.dl_head;
3709 state->id_rx_free_list.dl_head = NULL;
3710 state->id_rx_free_list.dl_cnt = 0;
3711 mutex_exit(&state->id_rx_free_list.dl_mutex);
3712 for (rwqe = WQE_TO_RWQE(list); rwqe != NULL;
3713 rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) {
3714 if ((rwqe->rwqe_im_mblk = desballoc(
3715 rwqe->rwqe_copybuf.ic_bufaddr, len, 0,
3716 &rwqe->w_freemsg_cb)) == NULL) {
3717 /* allow freemsg_cb to free the rwqes */
3718 if (atomic_dec_32_nv(&state->id_running) != 0) {
3719 cmn_err(CE_WARN, "ibd_init_rxlist: "
3720 "id_running was not 1\n");
3721 }
3722 DPRINT(10, "ibd_init_rxlist : "
3723 "failed in desballoc()");
3724 for (rwqe = WQE_TO_RWQE(list); rwqe != NULL;
3725 rwqe = next) {
3726 next = WQE_TO_RWQE(rwqe->rwqe_next);
3727 if (rwqe->rwqe_im_mblk) {
3728 atomic_inc_32(&state->
3729 id_rx_list.
3730 dl_bufs_outstanding);
3731 freemsg(rwqe->rwqe_im_mblk);
3732 } else
3733 ibd_free_rwqe(state, rwqe);
3734 }
3735 atomic_inc_32(&state->id_running);
3736 return (DDI_FAILURE);
3737 }
3738 }
3739 ibd_post_recv_list(state, WQE_TO_RWQE(list));
3740 return (DDI_SUCCESS);
3741 }
3742 mutex_exit(&state->id_rx_free_list.dl_mutex);
3743
3744 if (ibd_alloc_rx_copybufs(state) != DDI_SUCCESS)
3745 return (DDI_FAILURE);
3746
3747 /*
3748 * Allocate and setup the rwqe list
3749 */
3750 len = state->id_rx_buf_sz;
3751 lkey = state->id_rx_mr_desc.md_lkey;
3752 rwqe = state->id_rx_wqes;
3753 bufaddr = state->id_rx_bufs;
3754 list = NULL;
3755 for (i = 0; i < state->id_ud_num_rwqe; i++, rwqe++, bufaddr += len) {
3756 rwqe->w_state = state;
3757 rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb;
3758 rwqe->w_freemsg_cb.free_arg = (char *)rwqe;
3759
3760 rwqe->rwqe_copybuf.ic_bufaddr = bufaddr;
3761
3762 if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0,
3763 &rwqe->w_freemsg_cb)) == NULL) {
3764 DPRINT(10, "ibd_init_rxlist : failed in desballoc()");
3765 /* allow freemsg_cb to free the rwqes */
3766 if (atomic_dec_32_nv(&state->id_running) != 0) {
3767 cmn_err(CE_WARN, "ibd_init_rxlist: "
3768 "id_running was not 1\n");
3769 }
3772 for (rwqe = WQE_TO_RWQE(list); rwqe != NULL;
3773 rwqe = next) {
3774 next = WQE_TO_RWQE(rwqe->rwqe_next);
3775 freemsg(rwqe->rwqe_im_mblk);
3776 }
3777 atomic_inc_32(&state->id_running);
3778
3779 /* remove reference to free'd rwqes */
3780 mutex_enter(&state->id_rx_free_list.dl_mutex);
3781 state->id_rx_free_list.dl_head = NULL;
3782 state->id_rx_free_list.dl_cnt = 0;
3783 mutex_exit(&state->id_rx_free_list.dl_mutex);
3784
3785 ibd_fini_rxlist(state);
3786 return (DDI_FAILURE);
3787 }
3788
3789 rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey;
3790 rwqe->rwqe_copybuf.ic_sgl.ds_va =
3791 (ib_vaddr_t)(uintptr_t)bufaddr;
3792 rwqe->rwqe_copybuf.ic_sgl.ds_len = len;
3793 rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
3794 rwqe->w_rwr.wr_nds = 1;
3795 rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;
3796
3797 rwqe->rwqe_next = list;
3798 list = RWQE_TO_WQE(rwqe);
3799 }
3800 ibd_post_recv_list(state, WQE_TO_RWQE(list));
3801
3802 return (DDI_SUCCESS);
3803 }
3804
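/*
 * Undo ibd_alloc_rx_copybufs(): deregister the Rx buffer memory
 * region, destroy the Rx post queue locks, and free the queue array,
 * the rwqes and the copy buffer area.
 */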
3805 static void
3806 ibd_free_rx_copybufs(ibd_state_t *state)
3807 {
3808 int i;
3809
3810 /*
3811 * Unregister rxbuf mr
3812 */
3813 if (ibt_deregister_mr(state->id_hca_hdl,
3814 state->id_rx_mr_hdl) != IBT_SUCCESS) {
3815 DPRINT(10, "ibd_free_rx_copybufs: ibt_deregister_mr failed");
3816 }
3817 state->id_rx_mr_hdl = NULL;
3818
3819 /*
3820 * Free rxbuf memory
3821 */
3822 for (i = 0; i < state->id_rx_nqueues; i++) {
3823 ibd_rx_queue_t *rxp = state->id_rx_queues + i;
3824 mutex_destroy(&rxp->rx_post_lock);
3825 }
3826 kmem_free(state->id_rx_queues, state->id_rx_nqueues *
3827 sizeof (ibd_rx_queue_t));
3828 kmem_free(state->id_rx_wqes, state->id_ud_num_rwqe *
3829 sizeof (ibd_rwqe_t));
3830 kmem_free(state->id_rx_bufs, state->id_ud_num_rwqe *
3831 state->id_rx_buf_sz);
3832 state->id_rx_queues = NULL;
3833 state->id_rx_wqes = NULL;
3834 state->id_rx_bufs = NULL;
3835 }
3836
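/*
 * Free the Rx copy buffers once all the rwqes are back on the free
 * list; if the free list is empty, the resources have already been
 * freed and there is nothing to do.
 */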
3837 static void
3838 ibd_free_rx_rsrcs(ibd_state_t *state)
3839 {
3840 mutex_enter(&state->id_rx_free_list.dl_mutex);
3841 if (state->id_rx_free_list.dl_head == NULL) {
3842 /* already freed */
3843 mutex_exit(&state->id_rx_free_list.dl_mutex);
3844 return;
3845 }
3846 ASSERT(state->id_rx_free_list.dl_cnt == state->id_ud_num_rwqe);
3847 ibd_free_rx_copybufs(state);
3848 state->id_rx_free_list.dl_cnt = 0;
3849 state->id_rx_free_list.dl_head = NULL;
3850 mutex_exit(&state->id_rx_free_list.dl_mutex);
3851 }
3852
3853 /*
3854 * Free the statically allocated Rx buffer list.
3855 */
3856 static void
3857 ibd_fini_rxlist(ibd_state_t *state)
3858 {
3859 ibd_rwqe_t *rwqe;
3860 int i;
3861
	/* run through the rx_queues, calling freemsg() */
3863 for (i = 0; i < state->id_rx_nqueues; i++) {
3864 ibd_rx_queue_t *rxp = state->id_rx_queues + i;
3865 mutex_enter(&rxp->rx_post_lock);
3866 for (rwqe = WQE_TO_RWQE(rxp->rx_head); rwqe;
3867 rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) {
3868 freemsg(rwqe->rwqe_im_mblk);
3869 rxp->rx_cnt--;
3870 }
3871 rxp->rx_head = NULL;
3872 mutex_exit(&rxp->rx_post_lock);
3873 }
3874
	/* cannot free rx resources until GLDv3 has returned all buffers */
3876 if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) == 0)
3877 ibd_free_rx_rsrcs(state);
3878 }
3879
3880 /*
3881 * Free an allocated recv wqe.
3882 */
3883 /* ARGSUSED */
3884 static void
3885 ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
3886 {
3887 /*
3888 * desballoc() failed (no memory).
3889 *
3890 * This rwqe is placed on a free list so that it
3891 * can be reinstated when memory is available.
3892 *
3893 * NOTE: no code currently exists to reinstate
3894 * these "lost" rwqes.
3895 */
3896 mutex_enter(&state->id_rx_free_list.dl_mutex);
3897 state->id_rx_free_list.dl_cnt++;
3898 rwqe->rwqe_next = state->id_rx_free_list.dl_head;
3899 state->id_rx_free_list.dl_head = RWQE_TO_WQE(rwqe);
3900 mutex_exit(&state->id_rx_free_list.dl_mutex);
3901 }
3902
3903 /*
3904 * IBA Rx completion queue handler. Guaranteed to be single
3905 * threaded and nonreentrant for this CQ.
3906 */
3907 /* ARGSUSED */
3908 static void
3909 ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
3910 {
3911 ibd_state_t *state = (ibd_state_t *)arg;
3912
3913 atomic_inc_64(&state->id_num_intrs);
3914
3915 if (ibd_rx_softintr == 1) {
3916 mutex_enter(&state->id_rcq_poll_lock);
3917 if (state->id_rcq_poll_busy & IBD_CQ_POLLING) {
3918 state->id_rcq_poll_busy |= IBD_REDO_CQ_POLLING;
3919 mutex_exit(&state->id_rcq_poll_lock);
3920 return;
3921 } else {
3922 mutex_exit(&state->id_rcq_poll_lock);
3923 ddi_trigger_softintr(state->id_rx);
3924 }
3925 } else
3926 (void) ibd_intr((caddr_t)state);
3927 }
3928
3929 /*
3930 * CQ handler for Tx completions, when the Tx CQ is in
3931 * interrupt driven mode.
3932 */
3933 /* ARGSUSED */
3934 static void
3935 ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
3936 {
3937 ibd_state_t *state = (ibd_state_t *)arg;
3938
3939 atomic_inc_64(&state->id_num_intrs);
3940
3941 if (ibd_tx_softintr == 1) {
3942 mutex_enter(&state->id_scq_poll_lock);
3943 if (state->id_scq_poll_busy & IBD_CQ_POLLING) {
3944 state->id_scq_poll_busy |= IBD_REDO_CQ_POLLING;
3945 mutex_exit(&state->id_scq_poll_lock);
3946 return;
3947 } else {
3948 mutex_exit(&state->id_scq_poll_lock);
3949 ddi_trigger_softintr(state->id_tx);
3950 }
3951 } else
3952 (void) ibd_tx_recycle((caddr_t)state);
3953 }
3954
3955 /*
3956 * Multicast group create/delete trap handler. These will be delivered
3957 * on a kernel thread (handling can thus block) and can be invoked
3958 * concurrently. The handler can be invoked anytime after it is
3959 * registered and before ibt_detach().
3960 */
3961 /* ARGSUSED */
3962 static void
3963 ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code,
3964 ibt_subnet_event_t *event)
3965 {
3966 ibd_state_t *state = (ibd_state_t *)arg;
3967 ibd_req_t *req;
3968
3969 /*
3970 * The trap handler will get invoked once for every event for
3971 * every port. The input "gid" is the GID0 of the port the
3972 * trap came in on; we just need to act on traps that came
3973 * to our port, meaning the port on which the ipoib interface
3974 * resides. Since ipoib uses GID0 of the port, we just match
3975 * the gids to check whether we need to handle the trap.
3976 */
3977 if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0)
3978 return;
3979
	DPRINT(10, "ibd_snet_notices_handler : %d\n", code);
3981
3982 switch (code) {
3983 case IBT_SM_EVENT_UNAVAILABLE:
3984 /*
3985 * If we are in promiscuous mode or have
3986 * sendnonmembers, we need to print a warning
3987 * message right now. Else, just store the
3988 * information, print when we enter promiscuous
3989 * mode or attempt nonmember send. We might
3990 * also want to stop caching sendnonmember.
3991 */
3992 ibd_print_warn(state, "IBA multicast support "
3993 "degraded due to unavailability of multicast "
3994 "traps");
3995 break;
3996 case IBT_SM_EVENT_AVAILABLE:
3997 /*
3998 * If we printed a warning message above or
3999 * while trying to nonmember send or get into
4000 * promiscuous mode, print an okay message.
4001 */
4002 ibd_print_warn(state, "IBA multicast support "
4003 "restored due to availability of multicast "
4004 "traps");
4005 break;
4006 case IBT_SM_EVENT_MCG_CREATED:
4007 case IBT_SM_EVENT_MCG_DELETED:
4008 /*
		 * If it is a "deleted" event and we are in late HCA
		 * init, there is nothing to do.
4011 */
4012 if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) ==
4013 IBD_DRV_IN_LATE_HCA_INIT) && (code ==
4014 IBT_SM_EVENT_MCG_DELETED)) {
4015 break;
4016 }
4017 /*
4018 * Common processing of creation/deletion traps.
4019 * First check if the instance is being
4020 * [de]initialized; back off then, without doing
4021 * anything more, since we are not sure if the
4022 * async thread is around, or whether we might
4023 * be racing with the detach code in ibd_m_stop()
4024 * that scans the mcg list.
4025 */
4026 if (!ibd_async_safe(state))
4027 return;
4028
4029 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
4030 req->rq_gid = event->sm_notice_gid;
4031 req->rq_ptr = (void *)code;
4032 ibd_queue_work_slot(state, req, IBD_ASYNC_TRAP);
4033 break;
4034 }
4035 }
4036
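/*
 * Process an MCG created/deleted trap on the async thread. If we are
 * still in late HCA init and the trap reports creation of the IPoIB
 * broadcast group for our pkey, complete the startup via ibd_start();
 * otherwise drop any stale nonmember/sendonly-nonmember state for the
 * reported mcg and, in promiscuous mode, try to join the new mcg.
 */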
4037 static void
4038 ibd_async_trap(ibd_state_t *state, ibd_req_t *req)
4039 {
4040 ib_gid_t mgid = req->rq_gid;
4041 ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr;
4042 int ret;
4043 ib_pkey_t pkey = (mgid.gid_prefix >> 16) & 0xffff;
4044
4045 DPRINT(10, "ibd_async_trap : %d\n", code);
4046
4047 /*
	 * Check if we have already joined the IPoIB broadcast group for our
	 * PKEY. If we have, perform the rest of the operation. Otherwise,
	 * the interface is not yet initialized; do the initialization here
	 * by calling ibd_start() and return.
4052 */
4053
4054 if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) ==
4055 IBD_DRV_IN_LATE_HCA_INIT) && (state->id_bgroup_present == 0) &&
4056 (code == IBT_SM_EVENT_MCG_CREATED)) {
4057 /*
4058 * If we are in late HCA init and a notification for the
4059 * creation of a MCG came in, check if it is the IPoIB MCG for
4060 * this pkey. If not, return.
4061 */
4062 if ((mgid.gid_guid != IB_MGID_IPV4_LOWGRP_MASK) || (pkey !=
4063 state->id_pkey)) {
4064 ibd_async_done(state);
4065 return;
4066 }
4067 ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
4068 /*
4069 * Check if there is still a necessity to start the interface.
4070 * It is possible that the user attempted unplumb at just about
4071 * the same time, and if unplumb succeeded, we have nothing to
4072 * do.
4073 */
4074 if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) ==
4075 IBD_DRV_IN_LATE_HCA_INIT) &&
4076 ((ret = ibd_start(state)) != 0)) {
4077 DPRINT(10, "ibd_async_trap: cannot start from late HCA "
4078 "init, ret=%d", ret);
4079 }
4080 ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
4081 ibd_async_done(state);
4082 return;
4083 }
4084
4085 /*
4086 * Atomically search the nonmember and sendonlymember lists and
4087 * delete.
4088 */
4089 ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON);
4090
4091 if (state->id_prom_op == IBD_OP_COMPLETED) {
4092 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
4093
4094 /*
4095 * If in promiscuous mode, try to join/attach to the new
4096 * mcg. Given the unreliable out-of-order mode of trap
4097 * delivery, we can never be sure whether it is a problem
4098 * if the join fails. Thus, we warn the admin of a failure
4099 * if this was a creation trap. Note that the trap might
4100 * actually be reporting a long past event, and the mcg
4101 * might already have been deleted, thus we might be warning
4102 * in vain.
4103 */
4104 if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) ==
4105 NULL) && (code == IBT_SM_EVENT_MCG_CREATED))
4106 ibd_print_warn(state, "IBA promiscuous mode missed "
4107 "new multicast gid %016llx:%016llx",
4108 (u_longlong_t)mgid.gid_prefix,
4109 (u_longlong_t)mgid.gid_guid);
4110 }
4111
4112 /*
4113 * Free the request slot allocated by the subnet event thread.
4114 */
4115 ibd_async_done(state);
4116 }
4117
4118 /*
4119 * GLDv3 entry point to get capabilities.
4120 */
4121 static boolean_t
4122 ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
4123 {
4124 ibd_state_t *state = arg;
4125
4126 if (state->id_type == IBD_PORT_DRIVER)
4127 return (B_FALSE);
4128
4129 switch (cap) {
4130 case MAC_CAPAB_HCKSUM: {
4131 uint32_t *txflags = cap_data;
4132
4133 /*
		 * We either do full checksum offload or none at all
4135 */
4136 if (state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL)
4137 *txflags = HCK_FULLCKSUM | HCKSUM_INET_FULL_V4;
4138 else
4139 return (B_FALSE);
4140 break;
4141 }
4142
4143 case MAC_CAPAB_LSO: {
4144 mac_capab_lso_t *cap_lso = cap_data;
4145
4146 /*
4147 * In addition to the capability and policy, since LSO
4148 * relies on hw checksum, we'll not enable LSO if we
4149 * don't have hw checksum. Of course, if the HCA doesn't
4150 * provide the reserved lkey capability, enabling LSO will
4151 * actually affect performance adversely, so we'll disable
4152 * LSO even for that case.
4153 */
4154 if (!state->id_lso_policy || !state->id_lso_capable)
4155 return (B_FALSE);
4156
4157 if ((state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) == 0)
4158 return (B_FALSE);
4159
4160 if (state->id_hca_res_lkey_capab == 0) {
4161 ibd_print_warn(state, "no reserved-lkey capability, "
4162 "disabling LSO");
4163 return (B_FALSE);
4164 }
4165
4166 cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
4167 cap_lso->lso_basic_tcp_ipv4.lso_max = state->id_lso_maxlen - 1;
4168 break;
4169 }
4170
4171 default:
4172 return (B_FALSE);
4173 }
4174
4175 return (B_TRUE);
4176 }
4177
4178 /*
 * GLDv3 entry point to set a property value.
4180 */
4181 static int
4182 ibd_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
4183 uint_t pr_valsize, const void *pr_val)
4184 {
4185 ibd_state_t *state = arg;
4186 int err = 0;
4187 uint32_t link_mode;
4188
4189 /* Cannot set properties on a port driver */
4190 if (state->id_type == IBD_PORT_DRIVER) {
4191 return (ENOTSUP);
4192 }
4193
4194 switch (pr_num) {
4195 case MAC_PROP_IB_LINKMODE:
4196 if (state->id_mac_state & IBD_DRV_STARTED) {
4197 err = EBUSY;
4198 break;
4199 }
4200 if (pr_val == NULL) {
4201 err = EINVAL;
4202 break;
4203 }
4204 bcopy(pr_val, &link_mode, sizeof (link_mode));
4205 if (link_mode != IBD_LINK_MODE_UD &&
4206 link_mode != IBD_LINK_MODE_RC) {
4207 err = EINVAL;
4208 } else {
4209 if (link_mode == IBD_LINK_MODE_RC) {
4210 if (state->id_enable_rc) {
4211 return (0);
4212 }
4213 state->id_enable_rc = 1;
4214 /* inform MAC framework of new MTU */
4215 err = mac_maxsdu_update2(state->id_mh,
4216 state->rc_mtu - IPOIB_HDRSIZE,
4217 state->id_mtu - IPOIB_HDRSIZE);
4218 } else {
4219 if (!state->id_enable_rc) {
4220 return (0);
4221 }
4222 state->id_enable_rc = 0;
4223 err = mac_maxsdu_update2(state->id_mh,
4224 state->id_mtu - IPOIB_HDRSIZE,
4225 state->id_mtu - IPOIB_HDRSIZE);
4226 }
4227 (void) ibd_record_capab(state);
4228 mac_capab_update(state->id_mh);
4229 }
4230 break;
4231 case MAC_PROP_PRIVATE:
4232 err = ibd_set_priv_prop(state, pr_name,
4233 pr_valsize, pr_val);
4234 break;
4235 default:
4236 err = ENOTSUP;
4237 break;
4238 }
4239 return (err);
4240 }
4241
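/*
 * GLDv3 entry point to get a property value.
 */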
4242 static int
4243 ibd_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
4244 uint_t pr_valsize, void *pr_val)
4245 {
4246 ibd_state_t *state = arg;
4247 int err = 0;
4248
4249 switch (pr_num) {
4250 case MAC_PROP_MTU:
4251 break;
4252 default:
4253 if (state->id_type == IBD_PORT_DRIVER) {
4254 return (ENOTSUP);
4255 }
4256 break;
4257 }
4258
4259 switch (pr_num) {
4260 case MAC_PROP_IB_LINKMODE:
4261 *(uint_t *)pr_val = state->id_enable_rc;
4262 break;
4263 case MAC_PROP_PRIVATE:
4264 err = ibd_get_priv_prop(state, pr_name, pr_valsize,
4265 pr_val);
4266 break;
4267 default:
4268 err = ENOTSUP;
4269 break;
4270 }
4271 return (err);
4272 }
4273
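/*
 * GLDv3 entry point to report property attributes (permissions,
 * defaults and ranges) for the link mode, MTU and private properties.
 */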
4274 static void
4275 ibd_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num,
4276 mac_prop_info_handle_t prh)
4277 {
4278 ibd_state_t *state = arg;
4279
4280 switch (pr_num) {
4281 case MAC_PROP_IB_LINKMODE: {
4282 mac_prop_info_set_default_uint32(prh, IBD_DEF_LINK_MODE);
4283 break;
4284 }
4285 case MAC_PROP_MTU: {
4286 uint32_t min, max;
4287 if (state->id_type == IBD_PORT_DRIVER) {
4288 min = 1500;
4289 max = IBD_DEF_RC_MAX_SDU;
4290 } else if (state->id_enable_rc) {
4291 min = max = IBD_DEF_RC_MAX_SDU;
4292 } else {
4293 min = max = state->id_mtu - IPOIB_HDRSIZE;
4294 }
4295 mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
4296 mac_prop_info_set_range_uint32(prh, min, max);
4297 break;
4298 }
4299 case MAC_PROP_PRIVATE: {
4300 char valstr[64];
4301 int value;
4302
4303 if (strcmp(pr_name, "_ibd_broadcast_group") == 0) {
4304 mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
4305 return;
4306 } else if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) {
4307 value = IBD_DEF_COALESCE_COMPLETIONS;
4308 } else if (strcmp(pr_name,
4309 "_ibd_create_broadcast_group") == 0) {
4310 value = IBD_DEF_CREATE_BCAST_GROUP;
4311 } else if (strcmp(pr_name, "_ibd_hash_size") == 0) {
4312 value = IBD_DEF_HASH_SIZE;
4313 } else if (strcmp(pr_name, "_ibd_lso_enable") == 0) {
4314 value = IBD_DEF_LSO_POLICY;
4315 } else if (strcmp(pr_name, "_ibd_num_ah") == 0) {
4316 value = IBD_DEF_NUM_AH;
4317 } else if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) {
4318 value = IBD_DEF_NUM_LSO_BUFS;
4319 } else if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) {
4320 value = IBD_DEF_RC_ENABLE_SRQ;
4321 } else if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) {
4322 value = IBD_DEF_RC_NUM_RWQE;
4323 } else if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) {
4324 value = IBD_DEF_RC_NUM_SRQ;
4325 } else if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) {
4326 value = IBD_DEF_RC_NUM_SWQE;
4327 } else if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) {
4328 value = IBD_DEF_RC_RX_COMP_COUNT;
4329 } else if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) {
4330 value = IBD_DEF_RC_RX_COMP_USEC;
4331 } else if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) {
4332 value = IBD_DEF_RC_RX_COPY_THRESH;
4333 } else if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) {
4334 value = IBD_DEF_RC_RX_RWQE_THRESH;
4335 } else if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) {
4336 value = IBD_DEF_RC_TX_COMP_COUNT;
4337 } else if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) {
4338 value = IBD_DEF_RC_TX_COMP_USEC;
4339 } else if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) {
4340 value = IBD_DEF_RC_TX_COPY_THRESH;
4341 } else if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) {
4342 value = IBD_DEF_UD_NUM_RWQE;
4343 } else if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) {
4344 value = IBD_DEF_UD_NUM_SWQE;
4345 } else if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) {
4346 value = IBD_DEF_UD_RX_COMP_COUNT;
4347 } else if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) {
4348 value = IBD_DEF_UD_RX_COMP_USEC;
4349 } else if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) {
4350 value = IBD_DEF_UD_TX_COMP_COUNT;
4351 } else if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) {
4352 value = IBD_DEF_UD_TX_COMP_USEC;
4353 } else if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) {
4354 value = IBD_DEF_UD_TX_COPY_THRESH;
4355 } else {
4356 return;
4357 }
4358
4359 (void) snprintf(valstr, sizeof (valstr), "%d", value);
4360 mac_prop_info_set_default_str(prh, valstr);
4361 break;
4362 }
4363 } /* switch (pr_num) */
4364 }
4365
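/*
 * Set a private (driver-specific) property. Most of these can only be
 * changed while the interface is stopped; the completion moderation
 * tunables additionally require _ibd_coalesce_completions to be
 * enabled.
 */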
4366 /* ARGSUSED2 */
4367 static int
4368 ibd_set_priv_prop(ibd_state_t *state, const char *pr_name,
4369 uint_t pr_valsize, const void *pr_val)
4370 {
4371 int err = 0;
4372 long result;
4373
4374 if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) {
4375 if (pr_val == NULL) {
4376 return (EINVAL);
4377 }
4378 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4379 if (result < 0 || result > 1) {
4380 err = EINVAL;
4381 } else {
4382 state->id_allow_coalesce_comp_tuning = (result == 1) ?
4383 B_TRUE: B_FALSE;
4384 }
4385 return (err);
4386 }
4387 if (strcmp(pr_name, "_ibd_create_broadcast_group") == 0) {
4388 if (state->id_mac_state & IBD_DRV_STARTED) {
4389 return (EBUSY);
4390 }
4391 if (pr_val == NULL) {
4392 return (EINVAL);
4393 }
4394 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4395 if (result < 0 || result > 1) {
4396 err = EINVAL;
4397 } else {
4398 state->id_create_broadcast_group = (result == 1) ?
4399 B_TRUE: B_FALSE;
4400 }
4401 return (err);
4402 }
4403 if (strcmp(pr_name, "_ibd_hash_size") == 0) {
4404 if (state->id_mac_state & IBD_DRV_STARTED) {
4405 return (EBUSY);
4406 }
4407 if (pr_val == NULL) {
4408 return (EINVAL);
4409 }
4410 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4411 if (result < IBD_MIN_HASH_SIZE || result > IBD_MAX_HASH_SIZE) {
4412 err = EINVAL;
4413 } else {
4414 state->id_hash_size = (uint32_t)result;
4415 }
4416 return (err);
4417 }
4418 if (strcmp(pr_name, "_ibd_lso_enable") == 0) {
4419 if (state->id_mac_state & IBD_DRV_STARTED) {
4420 return (EBUSY);
4421 }
4422 if (pr_val == NULL) {
4423 return (EINVAL);
4424 }
4425 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4426 if (result < 0 || result > 1) {
4427 err = EINVAL;
4428 } else {
4429 state->id_lso_policy = (result == 1) ?
4430 B_TRUE: B_FALSE;
4431 }
4432 mac_capab_update(state->id_mh);
4433 return (err);
4434 }
4435 if (strcmp(pr_name, "_ibd_num_ah") == 0) {
4436 if (state->id_mac_state & IBD_DRV_STARTED) {
4437 return (EBUSY);
4438 }
4439 if (pr_val == NULL) {
4440 return (EINVAL);
4441 }
4442 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4443 if (result < IBD_MIN_NUM_AH || result > IBD_MAX_NUM_AH) {
4444 err = EINVAL;
4445 } else {
4446 state->id_num_ah = (uint32_t)result;
4447 }
4448 return (err);
4449 }
4450 if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) {
4451 if (state->id_mac_state & IBD_DRV_STARTED) {
4452 return (EBUSY);
4453 }
4454 if (!state->id_lso_policy || !state->id_lso_capable) {
4455 return (EINVAL);
4456 }
4457 if (pr_val == NULL) {
4458 return (EINVAL);
4459 }
4460 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4461 if (result < IBD_MIN_NUM_LSO_BUFS ||
4462 result > IBD_MAX_NUM_LSO_BUFS) {
4463 err = EINVAL;
4464 } else {
4465 state->id_num_lso_bufs = (uint32_t)result;
4466 }
4467 return (err);
4468 }
4469 if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) {
4470 if (state->id_mac_state & IBD_DRV_STARTED) {
4471 return (EBUSY);
4472 }
4473 if (pr_val == NULL) {
4474 return (EINVAL);
4475 }
4476 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4477 if (result < 0 || result > 1) {
4478 err = EINVAL;
4479 } else {
4480 state->rc_enable_srq = (result == 1) ?
4481 B_TRUE: B_FALSE;
4482 }
4483 if (!state->rc_enable_srq) {
4484 state->id_rc_num_srq = 0;
4485 }
4486 return (err);
4487 }
4488 if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) {
4489 if (state->id_mac_state & IBD_DRV_STARTED) {
4490 return (EBUSY);
4491 }
4492 if (pr_val == NULL) {
4493 return (EINVAL);
4494 }
4495 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4496 if (result < IBD_MIN_RC_NUM_RWQE ||
4497 result > IBD_MAX_RC_NUM_RWQE) {
4498 err = EINVAL;
4499 } else {
4500 state->id_rc_num_rwqe = (uint32_t)result;
4501 if (state->id_allow_coalesce_comp_tuning &&
4502 state->id_rc_rx_comp_count > state->id_rc_num_rwqe)
4503 state->id_rc_rx_comp_count =
4504 state->id_rc_num_rwqe;
4505 if (state->id_rc_num_srq > state->id_rc_num_rwqe)
4506 state->id_rc_num_srq =
4507 state->id_rc_num_rwqe - 1;
4508 /*
4509 * If rx_rwqe_threshold is greater than the number of
4510 * rwqes, pull it back to 25% of number of rwqes.
4511 */
4512 if (state->id_rc_rx_rwqe_thresh > state->id_rc_num_rwqe)
4513 state->id_rc_rx_rwqe_thresh =
4514 (state->id_rc_num_rwqe >> 2);
4515
4516 }
4517 return (err);
4518 }
4519 if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) {
4520 if (state->id_mac_state & IBD_DRV_STARTED) {
4521 return (EBUSY);
4522 }
4523 if (pr_val == NULL) {
4524 return (EINVAL);
4525 }
4526 if (!state->rc_enable_srq)
4527 return (EINVAL);
4528
4529 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4530 if (result < IBD_MIN_RC_NUM_SRQ ||
4531 result >= state->id_rc_num_rwqe) {
4532 err = EINVAL;
4533 } else
4534 state->id_rc_num_srq = (uint32_t)result;
4535 return (err);
4536 }
4537 if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) {
4538 if (state->id_mac_state & IBD_DRV_STARTED) {
4539 return (EBUSY);
4540 }
4541 if (pr_val == NULL) {
4542 return (EINVAL);
4543 }
4544 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4545 if (result < IBD_MIN_RC_NUM_SWQE ||
4546 result > IBD_MAX_RC_NUM_SWQE) {
4547 err = EINVAL;
4548 } else {
4549 state->id_rc_num_swqe = (uint32_t)result;
4550 if (state->id_allow_coalesce_comp_tuning &&
4551 state->id_rc_tx_comp_count > state->id_rc_num_swqe)
4552 state->id_rc_tx_comp_count =
4553 state->id_rc_num_swqe;
4554 }
4555 return (err);
4556 }
4557 if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) {
4558 if (!state->id_allow_coalesce_comp_tuning) {
4559 return (ENOTSUP);
4560 }
4561 if (pr_val == NULL) {
4562 return (EINVAL);
4563 }
4564 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4565 if (result < 1 || result > state->id_rc_num_rwqe) {
4566 err = EINVAL;
4567 } else {
4568 state->id_rc_rx_comp_count = (uint32_t)result;
4569 }
4570 return (err);
4571 }
4572 if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) {
4573 if (!state->id_allow_coalesce_comp_tuning) {
4574 return (ENOTSUP);
4575 }
4576 if (pr_val == NULL) {
4577 return (EINVAL);
4578 }
4579 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4580 if (result < 1) {
4581 err = EINVAL;
4582 } else {
4583 state->id_rc_rx_comp_usec = (uint32_t)result;
4584 }
4585 return (err);
4586 }
4587 if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) {
4588 if (state->id_mac_state & IBD_DRV_STARTED) {
4589 return (EBUSY);
4590 }
4591 if (pr_val == NULL) {
4592 return (EINVAL);
4593 }
4594 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4595 if (result < IBD_MIN_RC_RX_COPY_THRESH ||
4596 result > state->rc_mtu) {
4597 err = EINVAL;
4598 } else {
4599 state->id_rc_rx_copy_thresh = (uint32_t)result;
4600 }
4601 return (err);
4602 }
4603 if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) {
4604 if (state->id_mac_state & IBD_DRV_STARTED) {
4605 return (EBUSY);
4606 }
4607 if (pr_val == NULL) {
4608 return (EINVAL);
4609 }
4610 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4611 if (result < IBD_MIN_RC_RX_RWQE_THRESH ||
4612 result >= state->id_rc_num_rwqe) {
4613 err = EINVAL;
4614 } else {
4615 state->id_rc_rx_rwqe_thresh = (uint32_t)result;
4616 }
4617 return (err);
4618 }
4619 if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) {
4620 if (!state->id_allow_coalesce_comp_tuning) {
4621 return (ENOTSUP);
4622 }
4623 if (pr_val == NULL) {
4624 return (EINVAL);
4625 }
4626 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4627 if (result < 1 || result > state->id_rc_num_swqe) {
4628 err = EINVAL;
4629 } else {
4630 state->id_rc_tx_comp_count = (uint32_t)result;
4631 }
4632 return (err);
4633 }
4634 if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) {
4635 if (!state->id_allow_coalesce_comp_tuning) {
4636 return (ENOTSUP);
4637 }
4638 if (pr_val == NULL) {
4639 return (EINVAL);
4640 }
4641 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4642 if (result < 1)
4643 err = EINVAL;
4644 else {
4645 state->id_rc_tx_comp_usec = (uint32_t)result;
4646 }
4647 return (err);
4648 }
4649 if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) {
4650 if (state->id_mac_state & IBD_DRV_STARTED) {
4651 return (EBUSY);
4652 }
4653 if (pr_val == NULL) {
4654 return (EINVAL);
4655 }
4656 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4657 if (result < IBD_MIN_RC_TX_COPY_THRESH ||
4658 result > state->rc_mtu) {
4659 err = EINVAL;
4660 } else {
4661 state->id_rc_tx_copy_thresh = (uint32_t)result;
4662 }
4663 return (err);
4664 }
4665 if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) {
4666 if (state->id_mac_state & IBD_DRV_STARTED) {
4667 return (EBUSY);
4668 }
4669 if (pr_val == NULL) {
4670 return (EINVAL);
4671 }
4672 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4673 if (result < IBD_MIN_UD_NUM_RWQE ||
4674 result > IBD_MAX_UD_NUM_RWQE) {
4675 err = EINVAL;
4676 } else {
4677 if (result > state->id_hca_max_chan_sz) {
4678 state->id_ud_num_rwqe =
4679 state->id_hca_max_chan_sz;
4680 } else {
4681 state->id_ud_num_rwqe = (uint32_t)result;
4682 }
4683 if (state->id_allow_coalesce_comp_tuning &&
4684 state->id_ud_rx_comp_count > state->id_ud_num_rwqe)
4685 state->id_ud_rx_comp_count =
4686 state->id_ud_num_rwqe;
4687 }
4688 return (err);
4689 }
4690 if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) {
4691 if (state->id_mac_state & IBD_DRV_STARTED) {
4692 return (EBUSY);
4693 }
4694 if (pr_val == NULL) {
4695 return (EINVAL);
4696 }
4697 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4698 if (result < IBD_MIN_UD_NUM_SWQE ||
4699 result > IBD_MAX_UD_NUM_SWQE) {
4700 err = EINVAL;
4701 } else {
4702 if (result > state->id_hca_max_chan_sz) {
4703 state->id_ud_num_swqe =
4704 state->id_hca_max_chan_sz;
4705 } else {
4706 state->id_ud_num_swqe = (uint32_t)result;
4707 }
4708 if (state->id_allow_coalesce_comp_tuning &&
4709 state->id_ud_tx_comp_count > state->id_ud_num_swqe)
4710 state->id_ud_tx_comp_count =
4711 state->id_ud_num_swqe;
4712 }
4713 return (err);
4714 }
4715 if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) {
4716 if (!state->id_allow_coalesce_comp_tuning) {
4717 return (ENOTSUP);
4718 }
4719 if (pr_val == NULL) {
4720 return (EINVAL);
4721 }
4722 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4723 if (result < 1 || result > state->id_ud_num_rwqe) {
4724 err = EINVAL;
4725 } else {
4726 state->id_ud_rx_comp_count = (uint32_t)result;
4727 }
4728 return (err);
4729 }
4730 if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) {
4731 if (!state->id_allow_coalesce_comp_tuning) {
4732 return (ENOTSUP);
4733 }
4734 if (pr_val == NULL) {
4735 return (EINVAL);
4736 }
4737 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4738 if (result < 1) {
4739 err = EINVAL;
4740 } else {
4741 state->id_ud_rx_comp_usec = (uint32_t)result;
4742 }
4743 return (err);
4744 }
4745 if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) {
4746 if (!state->id_allow_coalesce_comp_tuning) {
4747 return (ENOTSUP);
4748 }
4749 if (pr_val == NULL) {
4750 return (EINVAL);
4751 }
4752 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4753 if (result < 1 || result > state->id_ud_num_swqe) {
4754 err = EINVAL;
4755 } else {
4756 state->id_ud_tx_comp_count = (uint32_t)result;
4757 }
4758 return (err);
4759 }
4760 if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) {
4761 if (!state->id_allow_coalesce_comp_tuning) {
4762 return (ENOTSUP);
4763 }
4764 if (pr_val == NULL) {
4765 return (EINVAL);
4766 }
4767 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4768 if (result < 1) {
4769 err = EINVAL;
4770 } else {
4771 state->id_ud_tx_comp_usec = (uint32_t)result;
4772 }
4773 return (err);
4774 }
4775 if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) {
4776 if (state->id_mac_state & IBD_DRV_STARTED) {
4777 return (EBUSY);
4778 }
4779 if (pr_val == NULL) {
4780 return (EINVAL);
4781 }
4782 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4783 if (result < IBD_MIN_UD_TX_COPY_THRESH ||
4784 result > IBD_MAX_UD_TX_COPY_THRESH) {
4785 err = EINVAL;
4786 } else {
4787 state->id_ud_tx_copy_thresh = (uint32_t)result;
4788 }
4789 return (err);
4790 }
4791 return (ENOTSUP);
4792 }
4793
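/*
 * Get the current value of a private (driver-specific) property and
 * return it as a decimal string.
 */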
4794 static int
4795 ibd_get_priv_prop(ibd_state_t *state, const char *pr_name, uint_t pr_valsize,
4796 void *pr_val)
4797 {
4798 int err = ENOTSUP;
4799 int value;
4800
4801 if (strcmp(pr_name, "_ibd_broadcast_group") == 0) {
4802 value = state->id_bgroup_present;
4803 err = 0;
4804 goto done;
4805 }
4806 if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) {
4807 value = state->id_allow_coalesce_comp_tuning;
4808 err = 0;
4809 goto done;
4810 }
4811 if (strcmp(pr_name, "_ibd_create_broadcast_group") == 0) {
4812 value = state->id_create_broadcast_group;
4813 err = 0;
4814 goto done;
4815 }
4816 if (strcmp(pr_name, "_ibd_hash_size") == 0) {
4817 value = state->id_hash_size;
4818 err = 0;
4819 goto done;
4820 }
4821 if (strcmp(pr_name, "_ibd_lso_enable") == 0) {
4822 value = state->id_lso_policy;
4823 err = 0;
4824 goto done;
4825 }
4826 if (strcmp(pr_name, "_ibd_num_ah") == 0) {
4827 value = state->id_num_ah;
4828 err = 0;
4829 goto done;
4830 }
4831 if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) {
4832 value = state->id_num_lso_bufs;
4833 err = 0;
4834 goto done;
4835 }
4836 if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) {
4837 value = state->rc_enable_srq;
4838 err = 0;
4839 goto done;
4840 }
4841 if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) {
4842 value = state->id_rc_num_rwqe;
4843 err = 0;
4844 goto done;
4845 }
4846 if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) {
4847 value = state->id_rc_num_srq;
4848 err = 0;
4849 goto done;
4850 }
4851 if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) {
4852 value = state->id_rc_num_swqe;
4853 err = 0;
4854 goto done;
4855 }
4856 if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) {
4857 value = state->id_rc_rx_comp_count;
4858 err = 0;
4859 goto done;
4860 }
4861 if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) {
4862 value = state->id_rc_rx_comp_usec;
4863 err = 0;
4864 goto done;
4865 }
4866 if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) {
4867 value = state->id_rc_rx_copy_thresh;
4868 err = 0;
4869 goto done;
4870 }
4871 if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) {
4872 value = state->id_rc_rx_rwqe_thresh;
4873 err = 0;
4874 goto done;
4875 }
4876 if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) {
4877 value = state->id_rc_tx_comp_count;
4878 err = 0;
4879 goto done;
4880 }
4881 if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) {
4882 value = state->id_rc_tx_comp_usec;
4883 err = 0;
4884 goto done;
4885 }
4886 if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) {
4887 value = state->id_rc_tx_copy_thresh;
4888 err = 0;
4889 goto done;
4890 }
4891 if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) {
4892 value = state->id_ud_num_rwqe;
4893 err = 0;
4894 goto done;
4895 }
4896 if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) {
4897 value = state->id_ud_num_swqe;
4898 err = 0;
4899 goto done;
4900 }
4901 if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) {
4902 value = state->id_ud_rx_comp_count;
4903 err = 0;
4904 goto done;
4905 }
4906 if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) {
4907 value = state->id_ud_rx_comp_usec;
4908 err = 0;
4909 goto done;
4910 }
4911 if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) {
4912 value = state->id_ud_tx_comp_count;
4913 err = 0;
4914 goto done;
4915 }
4916 if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) {
4917 value = state->id_ud_tx_comp_usec;
4918 err = 0;
4919 goto done;
4920 }
4921 if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) {
4922 value = state->id_ud_tx_copy_thresh;
4923 err = 0;
4924 goto done;
4925 }
4926 done:
4927 if (err == 0) {
4928 (void) snprintf(pr_val, pr_valsize, "%d", value);
4929 }
4930 return (err);
4931 }
4932
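/*
 * Query the HCA port and record the port details: link state, pkey
 * index, MTU, SGID and port speed. Returns ENETDOWN if the port
 * information cannot be obtained.
 */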
4933 static int
4934 ibd_get_port_details(ibd_state_t *state)
4935 {
4936 ibt_hca_portinfo_t *port_infop;
4937 ibt_status_t ret;
4938 uint_t psize, port_infosz;
4939
4940 mutex_enter(&state->id_link_mutex);
4941
4942 /*
4943 * Query for port information
4944 */
4945 ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
4946 &port_infop, &psize, &port_infosz);
4947 if ((ret != IBT_SUCCESS) || (psize != 1)) {
4948 mutex_exit(&state->id_link_mutex);
4949 DPRINT(10, "ibd_get_port_details: ibt_query_hca_ports() "
4950 "failed, ret=%d", ret);
4951 return (ENETDOWN);
4952 }
4953
4954 /*
4955 * If the link is active, verify the pkey
4956 */
4957 if (port_infop->p_linkstate == IBT_PORT_ACTIVE) {
4958 if ((ret = ibt_pkey2index(state->id_hca_hdl, state->id_port,
4959 state->id_pkey, &state->id_pkix)) != IBT_SUCCESS) {
4960 state->id_link_state = LINK_STATE_DOWN;
4961 } else {
4962 state->id_link_state = LINK_STATE_UP;
4963 }
4964 state->id_mtu = (128 << port_infop->p_mtu);
4965 state->id_sgid = *port_infop->p_sgid_tbl;
4966 /*
4967 * Now that the port is active, record the port speed
4968 */
4969 state->id_link_speed = ibd_get_portspeed(state);
4970 } else {
4971 /* Make sure that these are handled in PORT_UP/CHANGE */
4972 state->id_mtu = 0;
4973 state->id_link_state = LINK_STATE_DOWN;
4974 state->id_link_speed = 0;
4975 }
4976 mutex_exit(&state->id_link_mutex);
4977 ibt_free_portinfo(port_infop, port_infosz);
4978
4979 return (0);
4980 }
4981
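/*
 * Allocate the Rx and Tx completion queues, sizing them against the
 * HCA limits and setting up interrupt moderation on each.
 */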
4982 static int
4983 ibd_alloc_cqs(ibd_state_t *state)
4984 {
4985 ibt_hca_attr_t hca_attrs;
4986 ibt_cq_attr_t cq_attr;
4987 ibt_status_t ret;
4988 uint32_t real_size;
4989 uint_t num_rwqe_change = 0;
4990 uint_t num_swqe_change = 0;
4991
4992 ret = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
4993 ASSERT(ret == IBT_SUCCESS);
4994
4995 /*
	 * Allocate the Rx and Tx completion queues:
	 * Theoretically, there is no point in having more than #rwqe
	 * plus #swqe cqe's, except that the CQ will be signaled for
	 * overflow when the last wqe completes, if none of the previous
	 * cqe's have been polled. Thus, we keep the number of wqe's just
	 * below the CQ size to make sure such an overflow does not occur.
5002 */
5003 cq_attr.cq_sched = NULL;
5004 cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
5005
5006 /*
5007 * Allocate Receive CQ.
5008 */
5009 if (hca_attrs.hca_max_cq_sz >= (state->id_ud_num_rwqe + 1)) {
5010 cq_attr.cq_size = state->id_ud_num_rwqe + 1;
5011 } else {
5012 cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
5013 num_rwqe_change = state->id_ud_num_rwqe;
5014 state->id_ud_num_rwqe = cq_attr.cq_size - 1;
5015 }
5016
5017 if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
5018 &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) {
5019 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rcq) "
5020 "failed, ret=%d\n", ret);
5021 return (DDI_FAILURE);
5022 }
5023
5024 if ((ret = ibt_modify_cq(state->id_rcq_hdl, state->id_ud_rx_comp_count,
5025 state->id_ud_rx_comp_usec, 0)) != IBT_SUCCESS) {
5026 DPRINT(10, "ibd_alloc_cqs: Receive CQ interrupt "
5027 "moderation failed, ret=%d\n", ret);
5028 }
5029
5030 /* make the #rx wc's the same as max rx chain size */
5031 state->id_rxwcs_size = IBD_MAX_RX_MP_LEN;
5032 state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) *
5033 state->id_rxwcs_size, KM_SLEEP);
5034
5035 /*
5036 * Allocate Send CQ.
5037 */
5038 if (hca_attrs.hca_max_cq_sz >= (state->id_ud_num_swqe + 1)) {
5039 cq_attr.cq_size = state->id_ud_num_swqe + 1;
5040 } else {
5041 cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
5042 num_swqe_change = state->id_ud_num_swqe;
5043 state->id_ud_num_swqe = cq_attr.cq_size - 1;
5044 }
5045
5046 if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
5047 &state->id_scq_hdl, &real_size)) != IBT_SUCCESS) {
5048 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(scq) "
5049 "failed, ret=%d\n", ret);
5050 kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) *
5051 state->id_rxwcs_size);
5052 (void) ibt_free_cq(state->id_rcq_hdl);
5053 return (DDI_FAILURE);
5054 }
5055 if ((ret = ibt_modify_cq(state->id_scq_hdl, state->id_ud_tx_comp_count,
5056 state->id_ud_tx_comp_usec, 0)) != IBT_SUCCESS) {
5057 DPRINT(10, "ibd_alloc_cqs: Send CQ interrupt "
5058 "moderation failed, ret=%d\n", ret);
5059 }
5060
5061 state->id_txwcs_size = IBD_TX_POLL_THRESH;
5062 state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) *
5063 state->id_txwcs_size, KM_SLEEP);
5064
5065 /*
	 * Print a message if we could not allocate as many wqe's
	 * as were requested.
5068 */
5069 if (num_rwqe_change) {
5070 ibd_print_warn(state, "Setting #rwqe = %d instead of default "
5071 "%d", state->id_ud_num_rwqe, num_rwqe_change);
5072 }
5073 if (num_swqe_change) {
5074 ibd_print_warn(state, "Setting #swqe = %d instead of default "
5075 "%d", state->id_ud_num_swqe, num_swqe_change);
5076 }
5077
5078 return (DDI_SUCCESS);
5079 }
5080
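/*
 * Allocate the UD channel used for all IPoIB UD traffic, tying it to
 * the previously allocated CQs and PD, and record its QP number.
 */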
5081 static int
5082 ibd_setup_ud_channel(ibd_state_t *state)
5083 {
5084 ibt_ud_chan_alloc_args_t ud_alloc_attr;
5085 ibt_ud_chan_query_attr_t ud_chan_attr;
5086 ibt_status_t ret;
5087
5088 ud_alloc_attr.ud_flags = IBT_ALL_SIGNALED;
5089 if (state->id_hca_res_lkey_capab)
5090 ud_alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY;
5091 if (state->id_lso_policy && state->id_lso_capable)
5092 ud_alloc_attr.ud_flags |= IBT_USES_LSO;
5093
5094 ud_alloc_attr.ud_hca_port_num = state->id_port;
5095 ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg;
5096 ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG;
5097 ud_alloc_attr.ud_sizes.cs_sq = state->id_ud_num_swqe;
5098 ud_alloc_attr.ud_sizes.cs_rq = state->id_ud_num_rwqe;
5099 ud_alloc_attr.ud_qkey = state->id_mcinfo->mc_qkey;
5100 ud_alloc_attr.ud_scq = state->id_scq_hdl;
5101 ud_alloc_attr.ud_rcq = state->id_rcq_hdl;
5102 ud_alloc_attr.ud_pd = state->id_pd_hdl;
5103 ud_alloc_attr.ud_pkey_ix = state->id_pkix;
5104 ud_alloc_attr.ud_clone_chan = NULL;
5105
5106 if ((ret = ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS,
5107 &ud_alloc_attr, &state->id_chnl_hdl, NULL)) != IBT_SUCCESS) {
5108 DPRINT(10, "ibd_setup_ud_channel: ibt_alloc_ud_channel() "
5109 "failed, ret=%d\n", ret);
5110 return (DDI_FAILURE);
5111 }
5112
5113 if ((ret = ibt_query_ud_channel(state->id_chnl_hdl,
5114 &ud_chan_attr)) != IBT_SUCCESS) {
5115 DPRINT(10, "ibd_setup_ud_channel: ibt_query_ud_channel() "
5116 "failed, ret=%d\n", ret);
5117 (void) ibt_free_channel(state->id_chnl_hdl);
5118 return (DDI_FAILURE);
5119 }
5120
5121 state->id_qpnum = ud_chan_attr.ud_qpn;
5122
5123 return (DDI_SUCCESS);
5124 }
5125
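/*
 * Undo the initialization done in ibd_start(), driven by the progress
 * bits recorded in id_mac_state.
 */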
5126 static int
5127 ibd_undo_start(ibd_state_t *state, link_state_t cur_link_state)
5128 {
5129 uint32_t progress = state->id_mac_state;
5130 uint_t attempts;
5131 ibt_status_t ret;
5132 ib_gid_t mgid;
5133 ibd_mce_t *mce;
5134 uint8_t jstate;
5135 timeout_id_t tid;
5136
5137 if (atomic_dec_32_nv(&state->id_running) != 0)
5138 cmn_err(CE_WARN, "ibd_undo_start: id_running was not 1\n");
5139
5140 /*
5141 * Before we try to stop/undo whatever we did in ibd_start(),
5142 * we need to mark the link state appropriately to prevent the
5143 * ip layer from using this instance for any new transfers. Note
5144 * that if the original state of the link was "up" when we're
5145 * here, we'll set the final link state to "unknown", to behave
5146 * in the same fashion as other ethernet drivers.
5147 */
5148 mutex_enter(&state->id_link_mutex);
5149 if (cur_link_state == LINK_STATE_DOWN) {
5150 state->id_link_state = cur_link_state;
5151 } else {
5152 state->id_link_state = LINK_STATE_UNKNOWN;
5153 }
5154 mutex_exit(&state->id_link_mutex);
5155 bzero(&state->id_macaddr, sizeof (ipoib_mac_t));
5156 mac_link_update(state->id_mh, state->id_link_state);
5157
5158 state->id_mac_state &= (~IBD_DRV_PORT_DETAILS_OBTAINED);
5159 if (progress & IBD_DRV_STARTED) {
5160 state->id_mac_state &= (~IBD_DRV_STARTED);
5161 }
5162
5163 if (progress & IBD_DRV_IN_LATE_HCA_INIT) {
5164 state->id_mac_state &= (~IBD_DRV_IN_LATE_HCA_INIT);
5165 }
5166
5167 /* Stop listen under Reliable Connected Mode */
5168 if (progress & IBD_DRV_RC_LISTEN) {
5169 ASSERT(state->id_enable_rc);
5170 if (state->rc_listen_hdl != NULL) {
5171 ibd_rc_stop_listen(state);
5172 }
5173 state->id_mac_state &= (~IBD_DRV_RC_LISTEN);
5174 }
5175
5176 /* Stop timeout routine */
5177 if (progress & IBD_DRV_RC_TIMEOUT) {
5178 ASSERT(state->id_enable_rc);
5179 mutex_enter(&state->rc_timeout_lock);
5180 state->rc_timeout_start = B_FALSE;
5181 tid = state->rc_timeout;
5182 state->rc_timeout = 0;
5183 mutex_exit(&state->rc_timeout_lock);
5184 if (tid != 0)
5185 (void) untimeout(tid);
5186 state->id_mac_state &= (~IBD_DRV_RC_TIMEOUT);
5187 }
5188
5189 if ((state->id_enable_rc) && (progress & IBD_DRV_ACACHE_INITIALIZED)) {
5190 attempts = 100;
5191 while (state->id_ah_op == IBD_OP_ONGOING) {
5192 /*
5193 * "state->id_ah_op == IBD_OP_ONGOING" means this IPoIB
5194 * port is connecting to a remote IPoIB port. Wait for
5195 * the end of this connecting operation.
5196 */
5197 delay(drv_usectohz(100000));
5198 if (--attempts == 0) {
5199 state->rc_stop_connect++;
5200 DPRINT(40, "ibd_undo_start: connecting");
5201 break;
5202 }
5203 }
5204 mutex_enter(&state->id_sched_lock);
5205 state->id_sched_needed = 0;
5206 mutex_exit(&state->id_sched_lock);
5207 (void) ibd_rc_close_all_chan(state);
5208 }
5209
5210 /*
5211 * First, stop receive interrupts; this stops the driver from
5212 * handing up buffers to higher layers. Wait for receive buffers
5213 * to be returned and give up after 1 second.
5214 */
5215 if (progress & IBD_DRV_RCQ_NOTIFY_ENABLED) {
5216 attempts = 10;
5217 while (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding,
5218 0) > 0) {
5219 delay(drv_usectohz(100000));
5220 if (--attempts == 0) {
5221 /*
5222 * There are pending bufs with the network
5223 * layer and we have no choice but to wait
5224 * for them to be done with. Reap all the
5225 * Tx/Rx completions that were posted since
5226 * we turned off the notification and
5227 * return failure.
5228 */
5229 cmn_err(CE_CONT, "!ibd: bufs outstanding\n");
5230 DPRINT(2, "ibd_undo_start: "
5231 "reclaiming failed");
5232 break;
5233 }
5234 }
5235 state->id_mac_state &= (~IBD_DRV_RCQ_NOTIFY_ENABLED);
5236 }
5237
5238 if (progress & IBD_DRV_RC_LARGEBUF_ALLOCD) {
5239 ibd_rc_fini_tx_largebuf_list(state);
5240 state->id_mac_state &= (~IBD_DRV_RC_LARGEBUF_ALLOCD);
5241 }
5242
5243 if (progress & IBD_DRV_RC_SRQ_ALLOCD) {
5244 ASSERT(state->id_enable_rc);
5245 if (state->rc_srq_rwqe_list.dl_bufs_outstanding == 0) {
5246 if (state->id_ah_op == IBD_OP_ONGOING) {
5247 delay(drv_usectohz(10000));
5248 if (state->id_ah_op == IBD_OP_ONGOING) {
5249 /*
5250 * "state->id_ah_op == IBD_OP_ONGOING"
5251 * means this IPoIB port is connecting
5252 * to a remote IPoIB port. We can't
5253 * delete SRQ here.
5254 */
5255 state->rc_stop_connect++;
5256 DPRINT(40, "ibd_undo_start: "
5257 "connecting");
5258 } else {
5259 ibd_rc_fini_srq_list(state);
5260 state->id_mac_state &=
5261 (~IBD_DRV_RC_SRQ_ALLOCD);
5262 }
5263 } else {
5264 ibd_rc_fini_srq_list(state);
5265 state->id_mac_state &= (~IBD_DRV_RC_SRQ_ALLOCD);
5266 }
5267 } else {
5268 DPRINT(40, "ibd_undo_start: srq bufs outstanding\n");
5269 }
5270 }
5271
5272 if (progress & IBD_DRV_SM_NOTICES_REGISTERED) {
5273 ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL);
5274
5275 mutex_enter(&state->id_trap_lock);
5276 state->id_trap_stop = B_TRUE;
5277 while (state->id_trap_inprog > 0)
5278 cv_wait(&state->id_trap_cv, &state->id_trap_lock);
5279 mutex_exit(&state->id_trap_lock);
5280
5281 state->id_mac_state &= (~IBD_DRV_SM_NOTICES_REGISTERED);
5282 }
5283
5284 if (progress & IBD_DRV_SCQ_NOTIFY_ENABLED) {
5285 /*
5286 * Flushing the channel ensures that all pending WQE's
5287 * are marked with flush_error and handed to the CQ. It
5288 * does not guarantee the invocation of the CQ handler.
5289 * This call is guaranteed to return successfully for
5290 * UD QPNs.
5291 */
5292 if ((ret = ibt_flush_channel(state->id_chnl_hdl)) !=
5293 IBT_SUCCESS) {
5294 DPRINT(10, "ibd_undo_start: flush_channel "
5295 "failed, ret=%d", ret);
5296 }
5297
5298 /*
5299 * Give some time for the TX CQ handler to process the
5300 * completions.
5301 */
5302 attempts = 10;
5303 mutex_enter(&state->id_tx_list.dl_mutex);
5304 mutex_enter(&state->id_tx_rel_list.dl_mutex);
5305 while (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt
5306 != state->id_ud_num_swqe) {
5307 if (--attempts == 0)
5308 break;
5309 mutex_exit(&state->id_tx_rel_list.dl_mutex);
5310 mutex_exit(&state->id_tx_list.dl_mutex);
5311 delay(drv_usectohz(100000));
5312 mutex_enter(&state->id_tx_list.dl_mutex);
5313 mutex_enter(&state->id_tx_rel_list.dl_mutex);
5314 }
5315 ibt_set_cq_handler(state->id_scq_hdl, 0, 0);
5316 if (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt !=
5317 state->id_ud_num_swqe) {
5318 cmn_err(CE_WARN, "tx resources not freed\n");
5319 }
5320 mutex_exit(&state->id_tx_rel_list.dl_mutex);
5321 mutex_exit(&state->id_tx_list.dl_mutex);
5322
5323 attempts = 10;
5324 while (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) {
5325 if (--attempts == 0)
5326 break;
5327 delay(drv_usectohz(100000));
5328 }
5329 ibt_set_cq_handler(state->id_rcq_hdl, 0, 0);
5330 if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) {
5331 cmn_err(CE_WARN, "rx resources not freed\n");
5332 }
5333
5334 state->id_mac_state &= (~IBD_DRV_SCQ_NOTIFY_ENABLED);
5335 }
5336
5337 if (progress & IBD_DRV_BCAST_GROUP_JOINED) {
5338 /*
5339 * Drop all residual full/non membership. This includes full
5340 * membership to the broadcast group, and any nonmembership
5341 * acquired during transmits. We do this after the Tx completion
5342 * handlers are done, since those might result in some late
5343 * leaves; this also eliminates a potential race with that
5344 * path wrt the mc full list insert/delete. Trap handling
5345 * has also been suppressed at this point. Thus, no locks
5346 * are required while traversing the mc full list.
5347 */
5348 DPRINT(2, "ibd_undo_start: clear full cache entries");
5349 mce = list_head(&state->id_mc_full);
5350 while (mce != NULL) {
5351 mgid = mce->mc_info.mc_adds_vect.av_dgid;
5352 jstate = mce->mc_jstate;
5353 mce = list_next(&state->id_mc_full, mce);
5354 ibd_leave_group(state, mgid, jstate);
5355 }
5356 state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_JOINED);
5357 }
5358
5359 if (progress & IBD_DRV_RXLIST_ALLOCD) {
5360 ibd_fini_rxlist(state);
5361 state->id_mac_state &= (~IBD_DRV_RXLIST_ALLOCD);
5362 }
5363
5364 if (progress & IBD_DRV_TXLIST_ALLOCD) {
5365 ibd_fini_txlist(state);
5366 state->id_mac_state &= (~IBD_DRV_TXLIST_ALLOCD);
5367 }
5368
5369 if (progress & IBD_DRV_UD_CHANNEL_SETUP) {
5370 if ((ret = ibt_free_channel(state->id_chnl_hdl)) !=
5371 IBT_SUCCESS) {
5372 DPRINT(10, "ibd_undo_start: free_channel "
5373 "failed, ret=%d", ret);
5374 }
5375
5376 state->id_mac_state &= (~IBD_DRV_UD_CHANNEL_SETUP);
5377 }
5378
5379 if (progress & IBD_DRV_CQS_ALLOCD) {
5380 kmem_free(state->id_txwcs,
5381 sizeof (ibt_wc_t) * state->id_txwcs_size);
5382 if ((ret = ibt_free_cq(state->id_scq_hdl)) !=
5383 IBT_SUCCESS) {
5384 DPRINT(10, "ibd_undo_start: free_cq(scq) "
5385 "failed, ret=%d", ret);
5386 }
5387
5388 kmem_free(state->id_rxwcs,
5389 sizeof (ibt_wc_t) * state->id_rxwcs_size);
5390 if ((ret = ibt_free_cq(state->id_rcq_hdl)) != IBT_SUCCESS) {
5391 DPRINT(10, "ibd_undo_start: free_cq(rcq) failed, "
5392 "ret=%d", ret);
5393 }
5394
5395 state->id_txwcs = NULL;
5396 state->id_rxwcs = NULL;
5397 state->id_scq_hdl = NULL;
5398 state->id_rcq_hdl = NULL;
5399
5400 state->id_mac_state &= (~IBD_DRV_CQS_ALLOCD);
5401 }
5402
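/*
* Tear down the active AH hash table and the rest of the
* address cache state set up by ibd_acache_init().
*/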
5403 if (progress & IBD_DRV_ACACHE_INITIALIZED) {
5404 mutex_enter(&state->id_ac_mutex);
5405 mod_hash_destroy_hash(state->id_ah_active_hash);
5406 mutex_exit(&state->id_ac_mutex);
5407 ibd_acache_fini(state);
5408
5409 state->id_mac_state &= (~IBD_DRV_ACACHE_INITIALIZED);
5410 }
5411
5412 if (progress & IBD_DRV_BCAST_GROUP_FOUND) {
5413 /*
5414 * If we'd created the ipoib broadcast group and had
5415 * successfully joined it, leave it now
5416 */
5417 if (state->id_bgroup_created) {
5418 mgid = state->id_mcinfo->mc_adds_vect.av_dgid;
5419 jstate = IB_MC_JSTATE_FULL;
5420 (void) ibt_leave_mcg(state->id_sgid, mgid,
5421 state->id_sgid, jstate);
5422 }
5423 ibt_free_mcg_info(state->id_mcinfo, 1);
5424
5425 state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_FOUND);
5426 }
5427
5428 return (DDI_SUCCESS);
5429 }
5430
5431 /*
5432 * This pair of routines is used to set/clear the condition that
5433 * the caller is likely to do something to change the id_mac_state.
5434 * If there's already someone doing either a start or a stop (possibly
5435 * due to the async handler detecting a pkey relocation event, a plumb
5436 * or dlpi_open, or an unplumb or dlpi_close coming in), we wait until
5437 * that's done.
5438 */
5439 static void
5440 ibd_set_mac_progress(ibd_state_t *state, uint_t flag)
5441 {
5442 mutex_enter(&state->id_macst_lock);
5443 while (state->id_mac_state & IBD_DRV_RESTART_IN_PROGRESS)
5444 cv_wait(&state->id_macst_cv, &state->id_macst_lock);
5445
5446 state->id_mac_state |= flag;
5447 mutex_exit(&state->id_macst_lock);
5448 }
5449
5450 static void
5451 ibd_clr_mac_progress(ibd_state_t *state, uint_t flag)
5452 {
5453 mutex_enter(&state->id_macst_lock);
5454 state->id_mac_state &= (~flag);
5455 cv_signal(&state->id_macst_cv);
5456 mutex_exit(&state->id_macst_lock);
5457 }
5458
5459 /*
5460 * GLDv3 entry point to start hardware.
5461 */
5462 /*ARGSUSED*/
5463 static int
5464 ibd_m_start(void *arg)
5465 {
5466 ibd_state_t *state = arg;
5467 int ret;
5468
5469 if (state->id_type == IBD_PORT_DRIVER)
5470 return (EINVAL);
5471
5472 ibd_set_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
5473 if (state->id_mac_state & IBD_DRV_IN_DELETION) {
5474 ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
5475 return (EIO);
5476 }
5477
5478 ret = ibd_start(state);
5479 ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
5480 return (ret);
5481 }
5482
5483 static int
5484 ibd_start(ibd_state_t *state)
5485 {
5486 int err;
5487 ibt_status_t ret;
5488 int late_hca_init = 0;
5489
5490 if (state->id_mac_state & IBD_DRV_STARTED)
5491 return (DDI_SUCCESS);
5492
5493 /*
5494 * We do not increment the running flag when calling ibd_start() as
5495 * a result of some event which moves the state away from late HCA
5496 * initialization viz. MCG_CREATED, PORT_CHANGE or link availability.
5497 */
5498 if (!(state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) &&
5499 (atomic_inc_32_nv(&state->id_running) != 1)) {
5500 DPRINT(10, "ibd_start: id_running is non-zero");
5501 cmn_err(CE_WARN, "ibd_start: id_running was not 0\n");
5502 atomic_dec_32(&state->id_running);
5503 return (EINVAL);
5504 }
5505
5506 /*
5507 * Get port details; if we fail here, something bad happened.
5508 * Fail plumb.
5509 */
5510 if ((err = ibd_get_port_details(state)) != 0) {
5511 DPRINT(10, "ibd_start: ibd_get_port_details() failed");
5512 goto start_fail;
5513 }
5514 /*
5515 * If state->id_link_state is DOWN, it indicates that either the port
5516 * is down, or the pkey is not available. In both cases, resort to late
5517 * initialization. Register for subnet notices, and return success.
5518 */
5519 state->id_mac_state |= IBD_DRV_PORT_DETAILS_OBTAINED;
5520 if (state->id_link_state == LINK_STATE_DOWN) {
5521 late_hca_init = 1;
5522 goto late_hca_init_return;
5523 }
5524
5525 /*
5526 * Find the IPoIB broadcast group
5527 */
5528 if (ibd_find_bgroup(state) != IBT_SUCCESS) {
5529 /* Resort to late initialization */
5530 late_hca_init = 1;
5531 goto reg_snet_notices;
5532 }
5533 state->id_mac_state |= IBD_DRV_BCAST_GROUP_FOUND;
5534
5535 /*
5536 * Initialize per-interface caches and lists; if we fail here,
5537 * it is most likely due to a lack of resources
5538 */
5539 if (ibd_acache_init(state) != DDI_SUCCESS) {
5540 DPRINT(10, "ibd_start: ibd_acache_init() failed");
5541 err = ENOMEM;
5542 goto start_fail;
5543 }
5544 state->id_mac_state |= IBD_DRV_ACACHE_INITIALIZED;
5545
5546 /*
5547 * Allocate send and receive completion queues
5548 */
5549 if (ibd_alloc_cqs(state) != DDI_SUCCESS) {
5550 DPRINT(10, "ibd_start: ibd_alloc_cqs() failed");
5551 err = ENOMEM;
5552 goto start_fail;
5553 }
5554 state->id_mac_state |= IBD_DRV_CQS_ALLOCD;
5555
5556 /*
5557 * Setup a UD channel
5558 */
5559 if (ibd_setup_ud_channel(state) != DDI_SUCCESS) {
5560 err = ENOMEM;
5561 DPRINT(10, "ibd_start: ibd_setup_ud_channel() failed");
5562 goto start_fail;
5563 }
5564 state->id_mac_state |= IBD_DRV_UD_CHANNEL_SETUP;
5565
5566 /*
5567 * Allocate and initialize the tx buffer list
5568 */
5569 if (ibd_init_txlist(state) != DDI_SUCCESS) {
5570 DPRINT(10, "ibd_start: ibd_init_txlist() failed");
5571 err = ENOMEM;
5572 goto start_fail;
5573 }
5574 state->id_mac_state |= IBD_DRV_TXLIST_ALLOCD;
5575
5576 /*
5577 * Create the send cq handler here
5578 */
5579 ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state);
5580 if ((ret = ibt_enable_cq_notify(state->id_scq_hdl,
5581 IBT_NEXT_COMPLETION)) != IBT_SUCCESS) {
5582 DPRINT(10, "ibd_start: ibt_enable_cq_notify(scq) "
5583 "failed, ret=%d", ret);
5584 err = EINVAL;
5585 goto start_fail;
5586 }
5587 state->id_mac_state |= IBD_DRV_SCQ_NOTIFY_ENABLED;
5588
5589 /*
5590 * Allocate and initialize the rx buffer list
5591 */
5592 if (ibd_init_rxlist(state) != DDI_SUCCESS) {
5593 DPRINT(10, "ibd_start: ibd_init_rxlist() failed");
5594 err = ENOMEM;
5595 goto start_fail;
5596 }
5597 state->id_mac_state |= IBD_DRV_RXLIST_ALLOCD;
5598
5599 /*
5600 * Join IPoIB broadcast group
5601 */
5602 if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) {
5603 DPRINT(10, "ibd_start: ibd_join_group() failed");
5604 err = ENOTACTIVE;
5605 goto start_fail;
5606 }
5607 state->id_mac_state |= IBD_DRV_BCAST_GROUP_JOINED;
5608
5609 /*
5610 * When we did mac_register() in ibd_attach(), we didn't register
5611 * the real macaddr and we didn't have the true port mtu. Now that
5612 * we're almost ready, set the local mac address and broadcast
5613 * addresses and update gldv3 about the real values of these
5614 * parameters.
5615 */
5616 if (state->id_enable_rc) {
5617 ibd_h2n_mac(&state->id_macaddr,
5618 IBD_MAC_ADDR_RC + state->id_qpnum,
5619 state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
5620 ibd_h2n_mac(&state->rc_macaddr_loopback, state->id_qpnum,
5621 state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
5622 } else {
5623 ibd_h2n_mac(&state->id_macaddr, state->id_qpnum,
5624 state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
5625 }
5626 ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK,
5627 state->id_mgid.gid_prefix, state->id_mgid.gid_guid);
5628
5629 if (!state->id_enable_rc) {
5630 (void) mac_maxsdu_update2(state->id_mh,
5631 state->id_mtu - IPOIB_HDRSIZE,
5632 state->id_mtu - IPOIB_HDRSIZE);
5633 }
5634 mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr);
5635
5636 /*
5637 * Setup the receive cq handler
5638 */
5639 ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state);
5640 if ((ret = ibt_enable_cq_notify(state->id_rcq_hdl,
5641 IBT_NEXT_COMPLETION)) != IBT_SUCCESS) {
5642 DPRINT(10, "ibd_start: ibt_enable_cq_notify(rcq) "
5643 "failed, ret=%d", ret);
5644 err = EINVAL;
5645 goto start_fail;
5646 }
5647 state->id_mac_state |= IBD_DRV_RCQ_NOTIFY_ENABLED;
5648
5649 reg_snet_notices:
5650 /*
5651 * In the normal initialization sequence, set up the subnet notices
5652 * handler only after we've initialized the acache/mcache and started
5653 * the async thread, both of which are required for the trap handler
5654 * to function properly.
5655 *
5656 * Now that the async thread has been started (and we've already done
5657 * a mac_register() during attach so mac_tx_update() can be called
5658 * if necessary without any problem), we can enable the trap handler
5659 * to queue requests to the async thread.
5660 *
5661 * In the late HCA initialization case, the subnet notices handler
5662 * only handles the MCG created/deleted events, and the action taken
5663 * for those events is to start the interface. So the acache/mcache
5664 * initialization is not a prerequisite for registering the handler
5665 * in that case. Also, if we are in ibd_start() as a result of, say,
5666 * some event handling after entering the late HCA initialization
5667 * phase, there is no need to register again.
5668 */
5669 if ((state->id_mac_state & IBD_DRV_SM_NOTICES_REGISTERED) == 0) {
5670 ibt_register_subnet_notices(state->id_ibt_hdl,
5671 ibd_snet_notices_handler, state);
5672 mutex_enter(&state->id_trap_lock);
5673 state->id_trap_stop = B_FALSE;
5674 mutex_exit(&state->id_trap_lock);
5675 state->id_mac_state |= IBD_DRV_SM_NOTICES_REGISTERED;
5676 }
5677
5678 late_hca_init_return:
5679 if (late_hca_init == 1) {
5680 state->id_mac_state |= IBD_DRV_IN_LATE_HCA_INIT;
5681 /*
5682 * In the late initialization case, mark the link state as down,
5683 * regardless of the actual link state reported in the
5684 * port_info.
5685 */
5686 state->id_link_state = LINK_STATE_DOWN;
5687 mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr);
5688 mac_link_update(state->id_mh, state->id_link_state);
5689 return (DDI_SUCCESS);
5690 }
5691
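/*
* RC mode: make sure the SRQ free list (when SRQs are enabled) and
* the large Tx buffer pool are ready before we start listening for
* incoming RC connection requests.
*/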
5692 if (state->id_enable_rc) {
5693 if (state->rc_enable_srq) {
5694 if (state->id_mac_state & IBD_DRV_RC_SRQ_ALLOCD) {
5695 if (ibd_rc_repost_srq_free_list(state) !=
5696 IBT_SUCCESS) {
5697 err = ENOMEM;
5698 goto start_fail;
5699 }
5700 } else {
5701 /* Allocate SRQ resource */
5702 if (ibd_rc_init_srq_list(state) !=
5703 IBT_SUCCESS) {
5704 err = ENOMEM;
5705 goto start_fail;
5706 }
5707 state->id_mac_state |= IBD_DRV_RC_SRQ_ALLOCD;
5708 }
5709 }
5710
5711 if (ibd_rc_init_tx_largebuf_list(state) != IBT_SUCCESS) {
5712 DPRINT(10, "ibd_start: ibd_rc_init_tx_largebuf_list() "
5713 "failed");
5714 err = ENOMEM;
5715 goto start_fail;
5716 }
5717 state->id_mac_state |= IBD_DRV_RC_LARGEBUF_ALLOCD;
5718
5719 /* RC: begin to listen only after everything is available */
5720 if (ibd_rc_listen(state) != IBT_SUCCESS) {
5721 DPRINT(10, "ibd_start: ibd_rc_listen() failed");
5722 err = EINVAL;
5723 goto start_fail;
5724 }
5725 state->id_mac_state |= IBD_DRV_RC_LISTEN;
5726 }
5727
5728 /*
5729 * Indicate link status to GLDv3 and higher layers. By default,
5730 * we assume we are in up state (which must have been true at
5731 * least at the time the broadcast mcg's were probed); if there
5732 * were any up/down transitions till the time we come here, the
5733 * async handler will have updated the last known state, which we
5734 * use to tell GLDv3. The async handler will not send any
5735 * notifications to GLDv3 till we reach here in the initialization
5736 * sequence.
5737 */
5738 mac_link_update(state->id_mh, state->id_link_state);
5739 state->id_mac_state &= ~IBD_DRV_IN_LATE_HCA_INIT;
5740 state->id_mac_state |= IBD_DRV_STARTED;
5741
5742 /* Start timer after everything is ready */
5743 if (state->id_enable_rc) {
5744 mutex_enter(&state->rc_timeout_lock);
5745 state->rc_timeout_start = B_TRUE;
5746 state->rc_timeout = timeout(ibd_rc_conn_timeout_call, state,
5747 SEC_TO_TICK(ibd_rc_conn_timeout));
5748 mutex_exit(&state->rc_timeout_lock);
5749 state->id_mac_state |= IBD_DRV_RC_TIMEOUT;
5750 }
5751
5752 return (DDI_SUCCESS);
5753
5754 start_fail:
5755 /*
5756 * If we ran into a problem during ibd_start() and ran into
5757 * some other problem during undoing our partial work, we can't
5758 * do anything about it. Ignore any errors we might get from
5759 * ibd_undo_start() and just return the original error we got.
5760 */
5761 (void) ibd_undo_start(state, LINK_STATE_DOWN);
5762 return (err);
5763 }
5764
5765 /*
5766 * GLDv3 entry point to stop hardware from receiving packets.
5767 */
5768 /*ARGSUSED*/
5769 static void
5770 ibd_m_stop(void *arg)
5771 {
5772 ibd_state_t *state = (ibd_state_t *)arg;
5773
5774 if (state->id_type == IBD_PORT_DRIVER)
5775 return;
5776
5777 ibd_set_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);
5778
5779 (void) ibd_undo_start(state, state->id_link_state);
5780
5781 ibd_clr_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);
5782 }
5783
5784 /*
5785 * GLDv3 entry point to modify device's mac address. We do not
5786 * allow address modifications.
5787 */
5788 static int
5789 ibd_m_unicst(void *arg, const uint8_t *macaddr)
5790 {
5791 ibd_state_t *state = arg;
5792
5793 if (state->id_type == IBD_PORT_DRIVER)
5794 return (EINVAL);
5795
5796 /*
5797 * Don't bother even comparing the macaddr if we haven't
5798 * completed ibd_m_start().
5799 */
5800 if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
5801 return (0);
5802
5803 if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0)
5804 return (0);
5805 else
5806 return (EINVAL);
5807 }
5808
5809 /*
5810 * The blocking part of the IBA join/leave operations is done out
5811 * of here on the async thread.
5812 */
5813 static void
5814 ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op)
5815 {
5816 DPRINT(3, "ibd_async_multicast : async_setmc op %d :"
5817 "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid);
5818
5819 if (op == IBD_ASYNC_JOIN) {
5820 if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) {
5821 ibd_print_warn(state, "Join multicast group failed :"
5822 "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
5823 }
5824 } else {
5825 /*
5826 * Here, we must search for the proper mcg_info and
5827 * use that to leave the group.
5828 */
5829 ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL);
5830 }
5831 }
5832
5833 /*
5834 * GLDv3 entry point for multicast enable/disable requests.
5835 * This function queues the operation to the async thread and
5836 * returns success for a valid multicast address.
5837 */
5838 static int
5839 ibd_m_multicst(void *arg, boolean_t add, const uint8_t *mcmac)
5840 {
5841 ibd_state_t *state = (ibd_state_t *)arg;
5842 ipoib_mac_t maddr, *mcast;
5843 ib_gid_t mgid;
5844 ibd_req_t *req;
5845
5846 if (state->id_type == IBD_PORT_DRIVER)
5847 return (EINVAL);
5848
5849 /*
5850 * If we haven't completed ibd_m_start(), the async thread wouldn't
5851 * have been started and id_bcaddr wouldn't be set, so there's
5852 * no point in continuing.
5853 */
5854 if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
5855 return (0);
5856
5857 /*
5858 * The incoming multicast address might not be aligned properly
5859 * on a 4 byte boundary to be considered an ipoib_mac_t. We copy
5860 * it into a properly aligned local (maddr) so that the mc gid
5861 * and qpn fields can be read safely through an ipoib_mac_t
5862 * pointer.
5863 */
5864 bcopy(mcmac, &maddr, sizeof (ipoib_mac_t));
5865 mcast = &maddr;
5866
5867 /*
5868 * Check validity of MCG address. We could additionally check
5869 * that an enable/disable is not being issued on the "broadcast"
5870 * mcg, but since this operation is only invokable by privileged
5871 * programs anyway, we allow the flexibility to those dlpi apps.
5872 * Note that we do not validate the "scope" of the IBA mcg.
5873 */
5874 if ((ntohl(mcast->ipoib_qpn) & IB_QPN_MASK) != IB_MC_QPN)
5875 return (EINVAL);
5876
5877 /*
5878 * fill in multicast pkey and scope
5879 */
5880 IBD_FILL_SCOPE_PKEY(mcast, state->id_scope, state->id_pkey);
5881
5882 /*
5883 * If someone is trying to JOIN/LEAVE the broadcast group, we do
5884 * nothing (i.e. we stay JOINed to the broadcast group done in
5885 * ibd_m_start()), to mimic ethernet behavior. IPv4 specifically
5886 * requires us to be joined to broadcast groups at all times.
5887 * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also
5888 * depends on this.
5889 */
5890 if (bcmp(mcast, &state->id_bcaddr, IPOIB_ADDRL) == 0)
5891 return (0);
5892
5893 ibd_n2h_gid(mcast, &mgid);
5894 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
5895 if (req == NULL)
5896 return (ENOMEM);
5897
5898 req->rq_gid = mgid;
5899
5900 if (add) {
5901 DPRINT(1, "ibd_m_multicst : %016llx:%016llx\n",
5902 mgid.gid_prefix, mgid.gid_guid);
5903 ibd_queue_work_slot(state, req, IBD_ASYNC_JOIN);
5904 } else {
5905 DPRINT(1, "ibd_m_multicst : unset_multicast : "
5906 "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
5907 ibd_queue_work_slot(state, req, IBD_ASYNC_LEAVE);
5908 }
5909 return (0);
5910 }
5911
5912 /*
5913 * The blocking part of the IBA promiscuous disable operation is
5914 * done out of here on the async thread: we leave all the
5915 * non-member groups that were joined while promiscuous mode
5916 * was enabled.
5917 */
5918 static void
5919 ibd_async_unsetprom(ibd_state_t *state)
5920 {
5921 ibd_mce_t *mce = list_head(&state->id_mc_non);
5922 ib_gid_t mgid;
5923
5924 DPRINT(2, "ibd_async_unsetprom : async_unset_promisc");
5925
5926 while (mce != NULL) {
5927 mgid = mce->mc_info.mc_adds_vect.av_dgid;
5928 mce = list_next(&state->id_mc_non, mce);
5929 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
5930 }
5931 state->id_prom_op = IBD_OP_NOTSTARTED;
5932 }
5933
5934 /*
5935 * The blocking part of the IBA promiscuous enable operation is
5936 * done out of here on the async thread: we query the fabric for
5937 * all active multicast groups matching our pkey/scope/qkey/mtu
5938 * criteria and join each of them as a NonMember.
5939 */
5940 static void
5941 ibd_async_setprom(ibd_state_t *state)
5942 {
5943 ibt_mcg_attr_t mcg_attr;
5944 ibt_mcg_info_t *mcg_info;
5945 ib_gid_t mgid;
5946 uint_t numg;
5947 int i;
5948 char ret = IBD_OP_COMPLETED;
5949
5950 DPRINT(2, "ibd_async_setprom : async_set_promisc");
5951
5952 /*
5953 * Obtain all active MC groups on the IB fabric with
5954 * specified criteria (scope + Pkey + Qkey + mtu).
5955 */
5956 bzero(&mcg_attr, sizeof (mcg_attr));
5957 mcg_attr.mc_pkey = state->id_pkey;
5958 mcg_attr.mc_scope = state->id_scope;
5959 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
5960 mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu;
5961 mcg_attr.mc_mtu_req.r_selector = IBT_EQU;
5962 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) !=
5963 IBT_SUCCESS) {
5964 ibd_print_warn(state, "Could not get list of IBA multicast "
5965 "groups");
5966 ret = IBD_OP_ERRORED;
5967 goto done;
5968 }
5969
5970 /*
5971 * Iterate over the returned mcg's and join as NonMember
5972 * to the IP mcg's.
5973 */
5974 for (i = 0; i < numg; i++) {
5975 /*
5976 * Do a NonMember JOIN on the MC group.
5977 */
5978 mgid = mcg_info[i].mc_adds_vect.av_dgid;
5979 if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL)
5980 ibd_print_warn(state, "IBA promiscuous mode missed "
5981 "multicast gid %016llx:%016llx",
5982 (u_longlong_t)mgid.gid_prefix,
5983 (u_longlong_t)mgid.gid_guid);
5984 }
5985
5986 ibt_free_mcg_info(mcg_info, numg);
5987 DPRINT(4, "ibd_async_setprom : async_set_promisc completes");
5988 done:
5989 state->id_prom_op = ret;
5990 }
5991
5992 /*
5993 * GLDv3 entry point for multicast promiscuous enable/disable requests.
5994 * GLDv3 assumes phys state receives more packets than multi state,
5995 * which is not true for IPoIB. Thus, treat the multi and phys
5996 * promiscuous states the same way to work with GLDv3's assumption.
5997 */
5998 static int
5999 ibd_m_promisc(void *arg, boolean_t on)
6000 {
6001 ibd_state_t *state = (ibd_state_t *)arg;
6002 ibd_req_t *req;
6003
6004 if (state->id_type == IBD_PORT_DRIVER)
6005 return (EINVAL);
6006
6007 /*
6008 * The async thread wouldn't have been started if we haven't
6009 * completed ibd_m_start().
6010 */
6011 if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
6012 return (0);
6013
6014 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
6015 if (req == NULL)
6016 return (ENOMEM);
6017 if (on) {
6018 DPRINT(1, "ibd_m_promisc : set_promisc : %d", on);
6019 ibd_queue_work_slot(state, req, IBD_ASYNC_PROMON);
6020 } else {
6021 DPRINT(1, "ibd_m_promisc : unset_promisc");
6022 ibd_queue_work_slot(state, req, IBD_ASYNC_PROMOFF);
6023 }
6024
6025 return (0);
6026 }
6027
6028 /*
6029 * GLDv3 entry point for gathering statistics.
6030 */
6031 static int
6032 ibd_m_stat(void *arg, uint_t stat, uint64_t *val)
6033 {
6034 ibd_state_t *state = (ibd_state_t *)arg;
6035
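/*
* For the byte/packet counters, fold the RC (reliable connected)
* mode counters into the UD ones so GLDv3 sees the aggregate
* traffic for the link.
*/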
6036 switch (stat) {
6037 case MAC_STAT_IFSPEED:
6038 *val = state->id_link_speed;
6039 break;
6040 case MAC_STAT_MULTIRCV:
6041 *val = state->id_multi_rcv;
6042 break;
6043 case MAC_STAT_BRDCSTRCV:
6044 *val = state->id_brd_rcv;
6045 break;
6046 case MAC_STAT_MULTIXMT:
6047 *val = state->id_multi_xmt;
6048 break;
6049 case MAC_STAT_BRDCSTXMT:
6050 *val = state->id_brd_xmt;
6051 break;
6052 case MAC_STAT_RBYTES:
6053 *val = state->id_rcv_bytes + state->rc_rcv_trans_byte
6054 + state->rc_rcv_copy_byte;
6055 break;
6056 case MAC_STAT_IPACKETS:
6057 *val = state->id_rcv_pkt + state->rc_rcv_trans_pkt
6058 + state->rc_rcv_copy_pkt;
6059 break;
6060 case MAC_STAT_OBYTES:
6061 *val = state->id_xmt_bytes + state->rc_xmt_bytes;
6062 break;
6063 case MAC_STAT_OPACKETS:
6064 *val = state->id_xmt_pkt + state->rc_xmt_small_pkt +
6065 state->rc_xmt_fragmented_pkt +
6066 state->rc_xmt_map_fail_pkt + state->rc_xmt_map_succ_pkt;
6067 break;
6068 case MAC_STAT_OERRORS:
6069 *val = state->id_ah_error; /* failed AH translation */
6070 break;
6071 case MAC_STAT_IERRORS:
6072 *val = 0;
6073 break;
6074 case MAC_STAT_NOXMTBUF:
6075 *val = state->id_tx_short + state->rc_swqe_short +
6076 state->rc_xmt_buf_short;
6077 break;
6078 case MAC_STAT_NORCVBUF:
6079 default:
6080 return (ENOTSUP);
6081 }
6082
6083 return (0);
6084 }
6085
6086 static void
6087 ibd_async_txsched(ibd_state_t *state)
6088 {
6089 ibd_resume_transmission(state);
6090 }
6091
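/*
* Called when a previously noted swqe/LSO-buffer shortage may have been
* relieved by Tx completions: if the free count is back above the
* relevant threshold, clear the id_sched_needed flag and tell GLDv3 to
* resume transmission via mac_tx_update().
*/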
6092 static void
6093 ibd_resume_transmission(ibd_state_t *state)
6094 {
6095 int flag;
6096 int met_thresh = 0;
6097 int thresh = 0;
6098 int ret = -1;
6099
6100 mutex_enter(&state->id_sched_lock);
6101 if (state->id_sched_needed & IBD_RSRC_SWQE) {
6102 mutex_enter(&state->id_tx_list.dl_mutex);
6103 mutex_enter(&state->id_tx_rel_list.dl_mutex);
6104 met_thresh = state->id_tx_list.dl_cnt +
6105 state->id_tx_rel_list.dl_cnt;
6106 mutex_exit(&state->id_tx_rel_list.dl_mutex);
6107 mutex_exit(&state->id_tx_list.dl_mutex);
6108 thresh = IBD_FREE_SWQES_THRESH;
6109 flag = IBD_RSRC_SWQE;
6110 } else if (state->id_sched_needed & IBD_RSRC_LSOBUF) {
6111 ASSERT(state->id_lso != NULL);
6112 mutex_enter(&state->id_lso_lock);
6113 met_thresh = state->id_lso->bkt_nfree;
6114 thresh = IBD_FREE_LSOS_THRESH;
6115 mutex_exit(&state->id_lso_lock);
6116 flag = IBD_RSRC_LSOBUF;
6117 if (met_thresh > thresh)
6118 state->id_sched_lso_cnt++;
6119 }
6120 if (met_thresh > thresh) {
6121 state->id_sched_needed &= ~flag;
6122 state->id_sched_cnt++;
6123 ret = 0;
6124 }
6125 mutex_exit(&state->id_sched_lock);
6126
6127 if (ret == 0)
6128 mac_tx_update(state->id_mh);
6129 }
6130
6131 /*
6132 * Release a chain of send wqes back into the free (Tx release) list.
6133 */
6134 static void
6135 ibd_release_swqe(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail, int n)
6136 {
6137 /*
6138 * Add back on Tx list for reuse.
6139 */
6140 ASSERT(tail->swqe_next == NULL);
6141 mutex_enter(&state->id_tx_rel_list.dl_mutex);
6142 state->id_tx_rel_list.dl_pending_sends = B_FALSE;
6143 tail->swqe_next = state->id_tx_rel_list.dl_head;
6144 state->id_tx_rel_list.dl_head = SWQE_TO_WQE(head);
6145 state->id_tx_rel_list.dl_cnt += n;
6146 mutex_exit(&state->id_tx_rel_list.dl_mutex);
6147 }
6148
6149 /*
6150 * Acquire a send wqe from the free list.
6151 * Returns the swqe pointer, or NULL if none is currently available.
6152 */
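/*
* Swqes are released onto id_tx_rel_list by the Tx completion path
* (ibd_release_swqe()) and transferred over to id_tx_list in bulk here
* when the sender finds its own list empty.
*/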
6153 static ibd_swqe_t *
6154 ibd_acquire_swqe(ibd_state_t *state)
6155 {
6156 ibd_swqe_t *wqe;
6157
6158 mutex_enter(&state->id_tx_rel_list.dl_mutex);
6159 if (state->id_tx_rel_list.dl_head != NULL) {
6160 /* transfer id_tx_rel_list to id_tx_list */
6161 state->id_tx_list.dl_head =
6162 state->id_tx_rel_list.dl_head;
6163 state->id_tx_list.dl_cnt =
6164 state->id_tx_rel_list.dl_cnt;
6165 state->id_tx_list.dl_pending_sends = B_FALSE;
6166
6167 /* clear id_tx_rel_list */
6168 state->id_tx_rel_list.dl_head = NULL;
6169 state->id_tx_rel_list.dl_cnt = 0;
6170 mutex_exit(&state->id_tx_rel_list.dl_mutex);
6171
6172 wqe = WQE_TO_SWQE(state->id_tx_list.dl_head);
6173 state->id_tx_list.dl_cnt -= 1;
6174 state->id_tx_list.dl_head = wqe->swqe_next;
6175 } else { /* no free swqe */
6176 mutex_exit(&state->id_tx_rel_list.dl_mutex);
6177 state->id_tx_list.dl_pending_sends = B_TRUE;
6178 DPRINT(5, "ibd_acquire_swqe: out of Tx wqe");
6179 state->id_tx_short++;
6180 wqe = NULL;
6181 }
6182 return (wqe);
6183 }
6184
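/*
* Fill in the LSO portion of the send work request: the mss, the UD
* destination and a pointer to a contiguous copy of the IPoIB/IP/TCP
* headers (allocated here only if those headers span multiple mblk
* fragments).
*/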
6185 static int
6186 ibd_setup_lso(ibd_swqe_t *node, mblk_t *mp, uint32_t mss,
6187 ibt_ud_dest_hdl_t ud_dest)
6188 {
6189 mblk_t *nmp;
6190 int iph_len, tcph_len;
6191 ibt_wr_lso_t *lso;
6192 uintptr_t ip_start, tcp_start;
6193 uint8_t *dst;
6194 uint_t pending, mblen;
6195
6196 /*
6197 * The code in ibd_send would've set 'wr.ud.udwr_dest' by default;
6198 * we need to adjust it here for lso.
6199 */
6200 lso = &(node->w_swr.wr.ud_lso);
6201 lso->lso_ud_dest = ud_dest;
6202 lso->lso_mss = mss;
6203
6204 /*
6205 * Calculate the LSO header size and set it in the UD LSO structure.
6206 * Note that the only assumption we make is that each of the IPoIB,
6207 * IP and TCP headers will be contained in a single mblk fragment;
6208 * together, the headers may span multiple mblk fragments.
6209 */
6210 nmp = mp;
6211 ip_start = (uintptr_t)(nmp->b_rptr) + IPOIB_HDRSIZE;
6212 if (ip_start >= (uintptr_t)(nmp->b_wptr)) {
6213 ip_start = (uintptr_t)nmp->b_cont->b_rptr
6214 + (ip_start - (uintptr_t)(nmp->b_wptr));
6215 nmp = nmp->b_cont;
6216
6217 }
6218 iph_len = IPH_HDR_LENGTH((ipha_t *)ip_start);
6219
6220 tcp_start = ip_start + iph_len;
6221 if (tcp_start >= (uintptr_t)(nmp->b_wptr)) {
6222 tcp_start = (uintptr_t)nmp->b_cont->b_rptr
6223 + (tcp_start - (uintptr_t)(nmp->b_wptr));
6224 nmp = nmp->b_cont;
6225 }
6226 tcph_len = TCP_HDR_LENGTH((tcph_t *)tcp_start);
6227 lso->lso_hdr_sz = IPOIB_HDRSIZE + iph_len + tcph_len;
6228
6229 /*
6230 * If the lso header fits entirely within a single mblk fragment,
6231 * we'll avoid an additional copy of the lso header here and just
6232 * pass the b_rptr of the mblk directly.
6233 *
6234 * If this isn't true, we'd have to allocate for it explicitly.
6235 */
6236 if (lso->lso_hdr_sz <= MBLKL(mp)) {
6237 lso->lso_hdr = mp->b_rptr;
6238 } else {
6239 /* On work completion, remember to free this allocated hdr */
6240 lso->lso_hdr = kmem_zalloc(lso->lso_hdr_sz, KM_NOSLEEP);
6241 if (lso->lso_hdr == NULL) {
6242 DPRINT(10, "ibd_setup_lso: couldn't allocate lso hdr, "
6243 "sz = %d", lso->lso_hdr_sz);
6244 lso->lso_hdr_sz = 0;
6245 lso->lso_mss = 0;
6246 return (-1);
6247 }
6248 }
6249
6250 /*
6251 * Copy in the lso header only if we need to
6252 */
6253 if (lso->lso_hdr != mp->b_rptr) {
6254 dst = lso->lso_hdr;
6255 pending = lso->lso_hdr_sz;
6256
6257 for (nmp = mp; nmp && pending; nmp = nmp->b_cont) {
6258 mblen = MBLKL(nmp);
6259 if (pending > mblen) {
6260 bcopy(nmp->b_rptr, dst, mblen);
6261 dst += mblen;
6262 pending -= mblen;
6263 } else {
6264 bcopy(nmp->b_rptr, dst, pending);
6265 break;
6266 }
6267 }
6268 }
6269
6270 return (0);
6271 }
6272
6273 static void
6274 ibd_free_lsohdr(ibd_swqe_t *node, mblk_t *mp)
6275 {
6276 ibt_wr_lso_t *lso;
6277
6278 if ((!node) || (!mp))
6279 return;
6280
6281 /*
6282 * Free any header space that we might've allocated if we
6283 * did an LSO
6284 */
6285 if (node->w_swr.wr_opcode == IBT_WRC_SEND_LSO) {
6286 lso = &(node->w_swr.wr.ud_lso);
6287 if ((lso->lso_hdr) && (lso->lso_hdr != mp->b_rptr)) {
6288 kmem_free(lso->lso_hdr, lso->lso_hdr_sz);
6289 lso->lso_hdr = NULL;
6290 lso->lso_hdr_sz = 0;
6291 }
6292 }
6293 }
6294
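/*
* Post the given swqe on the UD channel, then keep draining any work
* that other senders queued on id_tx_head while we held id_tx_busy,
* posting up to IBD_MAX_TX_POST_MULTIPLE work requests per
* ibt_post_send() call.
*/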
6295 static void
6296 ibd_post_send(ibd_state_t *state, ibd_swqe_t *node)
6297 {
6298 uint_t i;
6299 uint_t num_posted;
6300 uint_t n_wrs;
6301 ibt_status_t ibt_status;
6302 ibt_send_wr_t wrs[IBD_MAX_TX_POST_MULTIPLE];
6303 ibd_swqe_t *tx_head, *elem;
6304 ibd_swqe_t *nodes[IBD_MAX_TX_POST_MULTIPLE];
6305
6306 /* post the one request, then check for more */
6307 ibt_status = ibt_post_send(state->id_chnl_hdl,
6308 &node->w_swr, 1, NULL);
6309 if (ibt_status != IBT_SUCCESS) {
6310 ibd_print_warn(state, "ibd_post_send: "
6311 "posting one wr failed: ret=%d", ibt_status);
6312 ibd_tx_cleanup(state, node);
6313 }
6314
6315 tx_head = NULL;
6316 for (;;) {
6317 if (tx_head == NULL) {
6318 mutex_enter(&state->id_txpost_lock);
6319 tx_head = state->id_tx_head;
6320 if (tx_head == NULL) {
6321 state->id_tx_busy = 0;
6322 mutex_exit(&state->id_txpost_lock);
6323 return;
6324 }
6325 state->id_tx_head = NULL;
6326 mutex_exit(&state->id_txpost_lock);
6327 }
6328
6329 /*
6330 * Collect pending requests, IBD_MAX_TX_POST_MULTIPLE wrs
6331 * at a time if possible, and keep posting them.
6332 */
6333 for (n_wrs = 0, elem = tx_head;
6334 (elem) && (n_wrs < IBD_MAX_TX_POST_MULTIPLE);
6335 elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) {
6336 nodes[n_wrs] = elem;
6337 wrs[n_wrs] = elem->w_swr;
6338 }
6339 tx_head = elem;
6340
6341 ASSERT(n_wrs != 0);
6342
6343 /*
6344 * If posting fails for some reason, we'll never receive
6345 * completion intimation, so we'll need to cleanup. But
6346 * we need to make sure we don't clean up nodes whose
6347 * wrs have been successfully posted. We assume that the
6348 * hca driver returns on the first failure to post and
6349 * therefore the first 'num_posted' entries don't need
6350 * cleanup here.
6351 */
6352 num_posted = 0;
6353 ibt_status = ibt_post_send(state->id_chnl_hdl,
6354 wrs, n_wrs, &num_posted);
6355 if (ibt_status != IBT_SUCCESS) {
6356 ibd_print_warn(state, "ibd_post_send: "
6357 "posting multiple wrs failed: "
6358 "requested=%d, done=%d, ret=%d",
6359 n_wrs, num_posted, ibt_status);
6360
6361 for (i = num_posted; i < n_wrs; i++)
6362 ibd_tx_cleanup(state, nodes[i]);
6363 }
6364 }
6365 }
6366
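/*
* Build the scatter/gather list for a UD send: either map the mblk
* chain directly with ibt_map_mem_iov(), or copy the data into the
* swqe's pre-mapped Tx buffer (or into a set of LSO buffers for larger
* packets), skipping any LSO header bytes that are carried separately.
*/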
6367 static int
6368 ibd_prepare_sgl(ibd_state_t *state, mblk_t *mp, ibd_swqe_t *node,
6369 uint_t lsohdr_sz)
6370 {
6371 ibt_wr_ds_t *sgl;
6372 ibt_status_t ibt_status;
6373 mblk_t *nmp;
6374 mblk_t *data_mp;
6375 uchar_t *bufp;
6376 size_t blksize;
6377 size_t skip;
6378 size_t avail;
6379 uint_t pktsize;
6380 uint_t frag_len;
6381 uint_t pending_hdr;
6382 int nmblks;
6383 int i;
6384
6385 /*
6386 * Let's skip ahead to the data if this is LSO
6387 */
6388 data_mp = mp;
6389 pending_hdr = 0;
6390 if (lsohdr_sz) {
6391 pending_hdr = lsohdr_sz;
6392 for (nmp = mp; nmp; nmp = nmp->b_cont) {
6393 frag_len = nmp->b_wptr - nmp->b_rptr;
6394 if (frag_len > pending_hdr)
6395 break;
6396 pending_hdr -= frag_len;
6397 }
6398 data_mp = nmp; /* start of data past lso header */
6399 ASSERT(data_mp != NULL);
6400 }
6401
6402 /*
6403 * Calculate the size of message data and number of msg blocks
6404 */
6405 pktsize = 0;
6406 for (nmblks = 0, nmp = data_mp; nmp != NULL;
6407 nmp = nmp->b_cont, nmblks++) {
6408 pktsize += MBLKL(nmp);
6409 }
6410 pktsize -= pending_hdr;
6411
6412 /*
6413 * We only do ibt_map_mem_iov() if the pktsize is above the
6414 * "copy-threshold", and if the number of mp fragments is less than
6415 * the maximum acceptable.
6416 */
6417 if ((state->id_hca_res_lkey_capab) &&
6418 (pktsize > state->id_ud_tx_copy_thresh) &&
6419 (nmblks < state->id_max_sqseg_hiwm)) {
6420 ibt_iov_t iov_arr[IBD_MAX_SQSEG];
6421 ibt_iov_attr_t iov_attr;
6422
6423 iov_attr.iov_as = NULL;
6424 iov_attr.iov = iov_arr;
6425 iov_attr.iov_buf = NULL;
6426 iov_attr.iov_list_len = nmblks;
6427 iov_attr.iov_wr_nds = state->id_max_sqseg;
6428 iov_attr.iov_lso_hdr_sz = lsohdr_sz;
6429 iov_attr.iov_flags = IBT_IOV_SLEEP;
6430
6431 for (nmp = data_mp, i = 0; i < nmblks; i++, nmp = nmp->b_cont) {
6432 iov_arr[i].iov_addr = (caddr_t)(void *)nmp->b_rptr;
6433 iov_arr[i].iov_len = MBLKL(nmp);
6434 if (i == 0) {
6435 iov_arr[i].iov_addr += pending_hdr;
6436 iov_arr[i].iov_len -= pending_hdr;
6437 }
6438 }
6439
6440 node->w_buftype = IBD_WQE_MAPPED;
6441 node->w_swr.wr_sgl = node->w_sgl;
6442
6443 ibt_status = ibt_map_mem_iov(state->id_hca_hdl, &iov_attr,
6444 (ibt_all_wr_t *)&node->w_swr, &node->w_mi_hdl);
6445 if (ibt_status != IBT_SUCCESS) {
6446 ibd_print_warn(state, "ibd_send: ibt_map_mem_iov "
6447 "failed, nmblks=%d, ret=%d\n", nmblks, ibt_status);
6448 goto ibd_copy_path;
6449 }
6450
6451 return (0);
6452 }
6453
6454 ibd_copy_path:
6455 if (pktsize <= state->id_tx_buf_sz) {
6456 node->swqe_copybuf.ic_sgl.ds_len = pktsize;
6457 node->w_swr.wr_nds = 1;
6458 node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl;
6459 node->w_buftype = IBD_WQE_TXBUF;
6460
6461 /*
6462 * Even though this is the copy path for transfers less than
6463 * id_tx_buf_sz, it could still be an LSO packet. If so, it
6464 * is possible the first data mblk fragment (data_mp) still
6465 * contains part of the LSO header that we need to skip.
6466 */
6467 bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va;
6468 for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) {
6469 blksize = MBLKL(nmp) - pending_hdr;
6470 bcopy(nmp->b_rptr + pending_hdr, bufp, blksize);
6471 bufp += blksize;
6472 pending_hdr = 0;
6473 }
6474
6475 return (0);
6476 }
6477
6478 /*
6479 * Copy path for transfers greater than id_tx_buf_sz
6480 */
6481 node->w_swr.wr_sgl = node->w_sgl;
6482 if (ibd_acquire_lsobufs(state, pktsize,
6483 node->w_swr.wr_sgl, &(node->w_swr.wr_nds)) != 0) {
6484 DPRINT(10, "ibd_prepare_sgl: lso bufs acquire failed");
6485 return (-1);
6486 }
6487 node->w_buftype = IBD_WQE_LSOBUF;
6488
6489 /*
6490 * Copy the larger-than-id_tx_buf_sz packet into a set of
6491 * fixed-sized, pre-mapped LSO buffers. Note that we might
6492 * need to skip part of the LSO header in the first fragment
6493 * as before.
6494 */
6495 nmp = data_mp;
6496 skip = pending_hdr;
6497 for (i = 0; i < node->w_swr.wr_nds; i++) {
6498 sgl = node->w_swr.wr_sgl + i;
6499 bufp = (uchar_t *)(uintptr_t)sgl->ds_va;
6500 avail = IBD_LSO_BUFSZ;
6501 while (nmp && avail) {
6502 blksize = MBLKL(nmp) - skip;
6503 if (blksize > avail) {
6504 bcopy(nmp->b_rptr + skip, bufp, avail);
6505 skip += avail;
6506 avail = 0;
6507 } else {
6508 bcopy(nmp->b_rptr + skip, bufp, blksize);
6509 skip = 0;
6510 avail -= blksize;
6511 bufp += blksize;
6512 nmp = nmp->b_cont;
6513 }
6514 }
6515 }
6516
6517 return (0);
6518 }
6519
6520 /*
6521 * Schedule a completion queue poll to reap the resource we're
6522 * short on. If we implement the change to reap tx completions
6523 * in a separate thread, we'll need to wake up that thread here.
6524 */
6525 static int
6526 ibd_sched_poll(ibd_state_t *state, int resource_type, int q_flag)
6527 {
6528 ibd_req_t *req;
6529
6530 mutex_enter(&state->id_sched_lock);
6531 state->id_sched_needed |= resource_type;
6532 mutex_exit(&state->id_sched_lock);
6533
6534 /*
6535 * If we are asked to queue a work entry, we need to do it
6536 */
6537 if (q_flag) {
6538 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
6539 if (req == NULL)
6540 return (-1);
6541
6542 ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED);
6543 }
6544
6545 return (0);
6546 }
6547
6548 /*
6549 * The passed in packet has this format:
6550 * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data
6551 */
6552 static boolean_t
6553 ibd_send(ibd_state_t *state, mblk_t *mp)
6554 {
6555 ibd_ace_t *ace;
6556 ibd_swqe_t *node;
6557 ipoib_mac_t *dest;
6558 ib_header_info_t *ipibp;
6559 ip6_t *ip6h;
6560 uint_t pktsize;
6561 uint32_t mss;
6562 uint32_t hckflags;
6563 uint32_t lsoflags = 0;
6564 uint_t lsohdr_sz = 0;
6565 int ret, len;
6566 boolean_t dofree = B_FALSE;
6567 boolean_t rc;
6568 /* if (rc_chan == NULL) send by UD; else send by RC; */
6569 ibd_rc_chan_t *rc_chan;
6570 int nmblks;
6571 mblk_t *nmp;
6572
6573 /*
6574 * If we aren't done with the device initialization and start,
6575 * we shouldn't be here.
6576 */
6577 if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
6578 return (B_FALSE);
6579
6580 /*
6581 * Obtain an address handle for the destination.
6582 */
6583 ipibp = (ib_header_info_t *)mp->b_rptr;
6584 dest = (ipoib_mac_t *)&ipibp->ib_dst;
6585 if ((ntohl(dest->ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
6586 IBD_FILL_SCOPE_PKEY(dest, state->id_scope, state->id_pkey);
6587
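/*
* If RC mode is enabled and this is a unicast destination with an
* established RC connection, send on that channel using one of its
* private swqes; if the channel is out of swqes, record the shortage
* and ask GLDv3 to back off. Otherwise rc_chan stays NULL and we fall
* through to the UD path below.
*/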
6588 rc_chan = NULL;
6589 ace = ibd_acache_lookup(state, dest, &ret, 1);
6590 if (state->id_enable_rc && (ace != NULL) &&
6591 (ace->ac_mac.ipoib_qpn != htonl(IB_MC_QPN))) {
6592 if (ace->ac_chan == NULL) {
6593 state->rc_null_conn++;
6594 } else {
6595 if (ace->ac_chan->chan_state ==
6596 IBD_RC_STATE_ACT_ESTAB) {
6597 rc_chan = ace->ac_chan;
6598 rc_chan->is_used = B_TRUE;
6599 mutex_enter(&rc_chan->tx_wqe_list.dl_mutex);
6600 node = WQE_TO_SWQE(
6601 rc_chan->tx_wqe_list.dl_head);
6602 if (node != NULL) {
6603 rc_chan->tx_wqe_list.dl_cnt -= 1;
6604 rc_chan->tx_wqe_list.dl_head =
6605 node->swqe_next;
6606 } else {
6607 node = ibd_rc_acquire_swqes(rc_chan);
6608 }
6609 mutex_exit(&rc_chan->tx_wqe_list.dl_mutex);
6610
6611 if (node == NULL) {
6612 state->rc_swqe_short++;
6613 mutex_enter(&state->id_sched_lock);
6614 state->id_sched_needed |=
6615 IBD_RSRC_RC_SWQE;
6616 mutex_exit(&state->id_sched_lock);
6617 ibd_dec_ref_ace(state, ace);
6618 return (B_FALSE);
6619 }
6620 } else {
6621 state->rc_no_estab_conn++;
6622 }
6623 }
6624 }
6625
6626 if (rc_chan == NULL) {
6627 mutex_enter(&state->id_tx_list.dl_mutex);
6628 node = WQE_TO_SWQE(state->id_tx_list.dl_head);
6629 if (node != NULL) {
6630 state->id_tx_list.dl_cnt -= 1;
6631 state->id_tx_list.dl_head = node->swqe_next;
6632 } else {
6633 node = ibd_acquire_swqe(state);
6634 }
6635 mutex_exit(&state->id_tx_list.dl_mutex);
6636 if (node == NULL) {
6637 /*
6638 * If we don't have an swqe available, schedule a
6639 * transmit completion queue cleanup and hold off on
6640 * sending more packets until we have some free swqes
6641 */
6642 if (ibd_sched_poll(state, IBD_RSRC_SWQE, 0) == 0) {
6643 if (ace != NULL) {
6644 ibd_dec_ref_ace(state, ace);
6645 }
6646 return (B_FALSE);
6647 }
6648
6649 /*
6650 * If a poll cannot be scheduled, we have no choice but
6651 * to drop this packet
6652 */
6653 ibd_print_warn(state, "ibd_send: no swqe, pkt drop");
6654 if (ace != NULL) {
6655 ibd_dec_ref_ace(state, ace);
6656 }
6657 return (B_TRUE);
6658 }
6659 }
6660
6661 /*
6662 * Initialize the commonly used fields in swqe to NULL to protect
6663 * against ibd_tx_cleanup accidentally misinterpreting these on a
6664 * failure.
6665 */
6666 node->swqe_im_mblk = NULL;
6667 node->w_swr.wr_nds = 0;
6668 node->w_swr.wr_sgl = NULL;
6669 node->w_swr.wr_opcode = IBT_WRC_SEND;
6670
6671 /*
6672 * Calculate the size of message data and number of msg blocks
6673 */
6674 pktsize = 0;
6675 for (nmblks = 0, nmp = mp; nmp != NULL;
6676 nmp = nmp->b_cont, nmblks++) {
6677 pktsize += MBLKL(nmp);
6678 }
6679
6680 if (bcmp(&ipibp->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
6681 atomic_inc_64(&state->id_brd_xmt);
6682 else if ((ntohl(ipibp->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
6683 atomic_inc_64(&state->id_multi_xmt);
6684
6685 if (ace != NULL) {
6686 node->w_ahandle = ace;
6687 node->w_swr.wr.ud.udwr_dest = ace->ac_dest;
6688 } else {
6689 DPRINT(5,
6690 "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X",
6691 ((ret == EFAULT) ? "failed" : "queued"),
6692 htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]),
6693 htonl(dest->ipoib_gidpref[1]),
6694 htonl(dest->ipoib_gidsuff[0]),
6695 htonl(dest->ipoib_gidsuff[1]));
6696 state->rc_ace_not_found++;
6697 node->w_ahandle = NULL;
6698
6699 /*
6700 * Here if ibd_acache_lookup() returns EFAULT, it means ibd
6701 * cannot find a path for the specific dest address. We
6702 * should get rid of this kind of packet. We also should get
6703 * rid of the packet if we cannot schedule a poll via the
6704 * async thread. For the normal case, ibd will return the
6705 * packet to the upper layer and wait for AH creation.
6706 *
6707 * Note that we always queue a work slot entry for the async
6708 * thread when we fail AH lookup (even in intr mode); this is
6709 * due to the convoluted way the code currently looks for AH.
6710 */
6711 if (ret == EFAULT) {
6712 dofree = B_TRUE;
6713 rc = B_TRUE;
6714 } else if (ibd_sched_poll(state, IBD_RSRC_SWQE, 1) != 0) {
6715 dofree = B_TRUE;
6716 rc = B_TRUE;
6717 } else {
6718 dofree = B_FALSE;
6719 rc = B_FALSE;
6720 }
6721 goto ibd_send_fail;
6722 }
6723
6724 /*
6725 * For ND6 packets, padding is at the front of the source lladdr.
6726 * Insert the padding at front.
6727 */
6728 if (ntohs(ipibp->ipib_rhdr.ipoib_type) == ETHERTYPE_IPV6) {
6729 if (MBLKL(mp) < sizeof (ib_header_info_t) + IPV6_HDR_LEN) {
6730 if (!pullupmsg(mp, IPV6_HDR_LEN +
6731 sizeof (ib_header_info_t))) {
6732 DPRINT(10, "ibd_send: pullupmsg failure ");
6733 dofree = B_TRUE;
6734 rc = B_TRUE;
6735 goto ibd_send_fail;
6736 }
6737 ipibp = (ib_header_info_t *)mp->b_rptr;
6738 }
6739 ip6h = (ip6_t *)((uchar_t *)ipibp +
6740 sizeof (ib_header_info_t));
6741 len = ntohs(ip6h->ip6_plen);
6742 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
6743 mblk_t *pad;
6744
6745 pad = allocb(4, 0);
if (pad == NULL) {
DPRINT(10, "ibd_send: allocb failure ");
dofree = B_TRUE;
rc = B_TRUE;
goto ibd_send_fail;
}
6746 pad->b_wptr = (uchar_t *)pad->b_rptr + 4;
6747 linkb(mp, pad);
6748 if (MBLKL(mp) < sizeof (ib_header_info_t) +
6749 IPV6_HDR_LEN + len + 4) {
6750 if (!pullupmsg(mp, sizeof (ib_header_info_t) +
6751 IPV6_HDR_LEN + len + 4)) {
6752 DPRINT(10, "ibd_send: pullupmsg "
6753 "failure ");
6754 dofree = B_TRUE;
6755 rc = B_TRUE;
6756 goto ibd_send_fail;
6757 }
6758 ip6h = (ip6_t *)((uchar_t *)mp->b_rptr +
6759 sizeof (ib_header_info_t));
6760 }
6761
6762 /* LINTED: E_CONSTANT_CONDITION */
6763 IBD_PAD_NSNA(ip6h, len, IBD_SEND);
6764 }
6765 }
6766
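/*
* Strip off the IPoIB pseudo header (the ib_addrs_t carrying the
* destination address) before handing the payload to the HCA.
*/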
6767 ASSERT(mp->b_wptr - mp->b_rptr >= sizeof (ib_addrs_t));
6768 mp->b_rptr += sizeof (ib_addrs_t);
6769 pktsize -= sizeof (ib_addrs_t);
6770
6771 if (rc_chan) { /* send in RC mode */
6772 ibt_iov_t iov_arr[IBD_MAX_SQSEG];
6773 ibt_iov_attr_t iov_attr;
6774 uint_t i;
6775 size_t blksize;
6776 uchar_t *bufp;
6777 ibd_rc_tx_largebuf_t *lbufp;
6778
6779 atomic_add_64(&state->rc_xmt_bytes, pktsize);
6780
6781 /*
6782 * The upper layer does the Tx checksum, so we don't need to
6783 * do any checksumming here.
6784 */
6785 ASSERT(node->w_swr.wr_trans == IBT_RC_SRV);
6786
6787 /*
6788 * We only do ibt_map_mem_iov() if the pktsize is above
6789 * the "copy-threshold", and if the number of mp
6790 * fragments is less than the maximum acceptable.
6791 */
6792 if (pktsize <= state->id_rc_tx_copy_thresh) {
6793 atomic_inc_64(&state->rc_xmt_small_pkt);
6794 /*
6795 * Only unicast packets are processed in Reliable
6796 * Connected mode.
6797 */
6798 node->swqe_copybuf.ic_sgl.ds_len = pktsize;
6799 node->w_swr.wr_nds = 1;
6800 node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl;
6801 node->w_buftype = IBD_WQE_TXBUF;
6802
6803 bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va;
6804 for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
6805 blksize = MBLKL(nmp);
6806 bcopy(nmp->b_rptr, bufp, blksize);
6807 bufp += blksize;
6808 }
6809 freemsg(mp);
6810 ASSERT(node->swqe_im_mblk == NULL);
6811 } else {
6812 if ((state->rc_enable_iov_map) &&
6813 (nmblks < state->rc_max_sqseg_hiwm)) {
6814
6815 /* do ibt_map_mem_iov() */
6816 iov_attr.iov_as = NULL;
6817 iov_attr.iov = iov_arr;
6818 iov_attr.iov_buf = NULL;
6819 iov_attr.iov_wr_nds = state->rc_tx_max_sqseg;
6820 iov_attr.iov_lso_hdr_sz = 0;
6821 iov_attr.iov_flags = IBT_IOV_SLEEP;
6822
6823 i = 0;
6824 for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
6825 iov_arr[i].iov_len = MBLKL(nmp);
6826 if (iov_arr[i].iov_len != 0) {
6827 iov_arr[i].iov_addr = (caddr_t)
6828 (void *)nmp->b_rptr;
6829 i++;
6830 }
6831 }
6832 iov_attr.iov_list_len = i;
6833 node->w_swr.wr_sgl = node->w_sgl;
6834
6835 ret = ibt_map_mem_iov(state->id_hca_hdl,
6836 &iov_attr, (ibt_all_wr_t *)&node->w_swr,
6837 &node->w_mi_hdl);
6838 if (ret != IBT_SUCCESS) {
6839 atomic_inc_64(
6840 &state->rc_xmt_map_fail_pkt);
6841 DPRINT(30, "ibd_send: ibt_map_mem_iov("
6842 ") failed, nmblks=%d, real_nmblks"
6843 "=%d, ret=0x%x", nmblks, i, ret);
6844 goto ibd_rc_large_copy;
6845 }
6846
6847 atomic_inc_64(&state->rc_xmt_map_succ_pkt);
6848 node->w_buftype = IBD_WQE_MAPPED;
6849 node->swqe_im_mblk = mp;
6850 } else {
6851 atomic_inc_64(&state->rc_xmt_fragmented_pkt);
6852 ibd_rc_large_copy:
6853 mutex_enter(&state->rc_tx_large_bufs_lock);
6854 if (state->rc_tx_largebuf_nfree == 0) {
6855 state->rc_xmt_buf_short++;
6856 mutex_exit
6857 (&state->rc_tx_large_bufs_lock);
6858 mutex_enter(&state->id_sched_lock);
6859 state->id_sched_needed |=
6860 IBD_RSRC_RC_TX_LARGEBUF;
6861 mutex_exit(&state->id_sched_lock);
6862 dofree = B_FALSE;
6863 rc = B_FALSE;
6864 /*
6865 * If we don't have Tx large bufs,
6866 * return failure. node->w_buftype
6867 * should not be IBD_WQE_RC_COPYBUF,
6868 * otherwise it will cause problems
6869 * in ibd_rc_tx_cleanup().
6870 */
6871 node->w_buftype = IBD_WQE_TXBUF;
6872 goto ibd_send_fail;
6873 }
6874
6875 lbufp = state->rc_tx_largebuf_free_head;
6876 ASSERT(lbufp->lb_buf != NULL);
6877 state->rc_tx_largebuf_free_head =
6878 lbufp->lb_next;
6879 lbufp->lb_next = NULL;
6880 /* Update nfree count */
6881 state->rc_tx_largebuf_nfree --;
6882 mutex_exit(&state->rc_tx_large_bufs_lock);
6883 bufp = lbufp->lb_buf;
6884 node->w_sgl[0].ds_va =
6885 (ib_vaddr_t)(uintptr_t)bufp;
6886 node->w_sgl[0].ds_key =
6887 state->rc_tx_mr_desc.md_lkey;
6888 node->w_sgl[0].ds_len = pktsize;
6889 node->w_swr.wr_sgl = node->w_sgl;
6890 node->w_swr.wr_nds = 1;
6891 node->w_buftype = IBD_WQE_RC_COPYBUF;
6892 node->w_rc_tx_largebuf = lbufp;
6893
6894 for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
6895 blksize = MBLKL(nmp);
6896 if (blksize != 0) {
6897 bcopy(nmp->b_rptr, bufp,
6898 blksize);
6899 bufp += blksize;
6900 }
6901 }
6902 freemsg(mp);
6903 ASSERT(node->swqe_im_mblk == NULL);
6904 }
6905 }
6906
6907 node->swqe_next = NULL;
6908 mutex_enter(&rc_chan->tx_post_lock);
6909 if (rc_chan->tx_busy) {
6910 if (rc_chan->tx_head) {
6911 rc_chan->tx_tail->swqe_next =
6912 SWQE_TO_WQE(node);
6913 } else {
6914 rc_chan->tx_head = node;
6915 }
6916 rc_chan->tx_tail = node;
6917 mutex_exit(&rc_chan->tx_post_lock);
6918 } else {
6919 rc_chan->tx_busy = 1;
6920 mutex_exit(&rc_chan->tx_post_lock);
6921 ibd_rc_post_send(rc_chan, node);
6922 }
6923
6924 return (B_TRUE);
6925 } /* send by RC */
6926
6927 if ((state->id_enable_rc) && (pktsize > state->id_mtu)) {
6928 /*
6929 * The packet is too long; its size from GLD should be <=
6930 * state->id_mtu + sizeof (ib_addrs_t).
6931 */
6932 if (ace->ac_mac.ipoib_qpn != htonl(IB_MC_QPN)) {
6933 ibd_req_t *req;
6934
6935 mutex_enter(&ace->tx_too_big_mutex);
6936 if (ace->tx_too_big_ongoing) {
6937 mutex_exit(&ace->tx_too_big_mutex);
6938 state->rc_xmt_reenter_too_long_pkt++;
6939 dofree = B_TRUE;
6940 } else {
6941 ace->tx_too_big_ongoing = B_TRUE;
6942 mutex_exit(&ace->tx_too_big_mutex);
6943 state->rc_xmt_icmp_too_long_pkt++;
6944
6945 req = kmem_cache_alloc(state->id_req_kmc,
6946 KM_NOSLEEP);
6947 if (req == NULL) {
6948 ibd_print_warn(state, "ibd_send: alloc "
6949 "ibd_req_t fail");
6950 /* Drop it. */
6951 dofree = B_TRUE;
6952 } else {
6953 req->rq_ptr = mp;
6954 req->rq_ptr2 = ace;
6955 ibd_queue_work_slot(state, req,
6956 IBD_ASYNC_RC_TOO_BIG);
6957 dofree = B_FALSE;
6958 }
6959 }
6960 } else {
6961 ibd_print_warn(state, "Reliable Connected mode is on. "
6962 "Multicast packet length %d > %d is too long to "
6963 "send packet (%d > %d), drop it",
6964 pktsize, state->id_mtu);
6965 state->rc_xmt_drop_too_long_pkt++;
6966 /* Drop it. */
6967 dofree = B_TRUE;
6968 }
6969 rc = B_TRUE;
6970 goto ibd_send_fail;
6971 }
6972
6973 atomic_add_64(&state->id_xmt_bytes, pktsize);
6974 atomic_inc_64(&state->id_xmt_pkt);
6975
6976 /*
6977 * Do LSO and checksum related work here. For LSO send, adjust the
6978 * ud destination, the opcode and the LSO header information in the
6979 * work request.
6980 */
6981 mac_lso_get(mp, &mss, &lsoflags);
6982 if ((lsoflags & HW_LSO) != HW_LSO) {
6983 node->w_swr.wr_opcode = IBT_WRC_SEND;
6984 lsohdr_sz = 0;
6985 } else {
6986 if (ibd_setup_lso(node, mp, mss, ace->ac_dest) != 0) {
6987 /*
6988 * The routine can only fail if there's no memory; we
6989 * can only drop the packet if this happens
6990 */
6991 ibd_print_warn(state,
6992 "ibd_send: no memory, lso posting failed");
6993 dofree = B_TRUE;
6994 rc = B_TRUE;
6995 goto ibd_send_fail;
6996 }
6997
6998 node->w_swr.wr_opcode = IBT_WRC_SEND_LSO;
6999 lsohdr_sz = (node->w_swr.wr.ud_lso).lso_hdr_sz;
7000 }
7001
7002 mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &hckflags);
7003 if ((hckflags & HCK_FULLCKSUM) == HCK_FULLCKSUM)
7004 node->w_swr.wr_flags |= IBT_WR_SEND_CKSUM;
7005 else
7006 node->w_swr.wr_flags &= ~IBT_WR_SEND_CKSUM;
7007
7008 /*
7009 * Prepare the sgl for posting; the routine can only fail if there's
7010 * no lso buf available for posting. If this is the case, we should
7011 * probably resched for lso bufs to become available and then try again.
7012 */
7013 if (ibd_prepare_sgl(state, mp, node, lsohdr_sz) != 0) {
7014 if (ibd_sched_poll(state, IBD_RSRC_LSOBUF, 1) != 0) {
7015 dofree = B_TRUE;
7016 rc = B_TRUE;
7017 } else {
7018 dofree = B_FALSE;
7019 rc = B_FALSE;
7020 }
7021 goto ibd_send_fail;
7022 }
7023 node->swqe_im_mblk = mp;
7024
7025 /*
7026 * Queue the wqe to hardware; since we can now simply queue a
7027 * post instead of doing it serially, we cannot assume anything
7028 * about the 'node' after ibd_post_send() returns.
7029 */
7030 node->swqe_next = NULL;
7031
7032 mutex_enter(&state->id_txpost_lock);
7033 if (state->id_tx_busy) {
7034 if (state->id_tx_head) {
7035 state->id_tx_tail->swqe_next =
7036 SWQE_TO_WQE(node);
7037 } else {
7038 state->id_tx_head = node;
7039 }
7040 state->id_tx_tail = node;
7041 mutex_exit(&state->id_txpost_lock);
7042 } else {
7043 state->id_tx_busy = 1;
7044 mutex_exit(&state->id_txpost_lock);
7045 ibd_post_send(state, node);
7046 }
7047
7048 return (B_TRUE);
7049
7050 ibd_send_fail:
7051 if (node && mp)
7052 ibd_free_lsohdr(node, mp);
7053
7054 if (dofree)
7055 freemsg(mp);
7056
7057 if (node != NULL) {
7058 if (rc_chan) {
7059 ibd_rc_tx_cleanup(node);
7060 } else {
7061 ibd_tx_cleanup(state, node);
7062 }
7063 }
7064
7065 return (rc);
7066 }
7067
7068 /*
7069 * GLDv3 entry point for transmitting datagram.
7070 */
7071 static mblk_t *
7072 ibd_m_tx(void *arg, mblk_t *mp)
7073 {
7074 ibd_state_t *state = (ibd_state_t *)arg;
7075 mblk_t *next;
7076
7077 if (state->id_type == IBD_PORT_DRIVER) {
7078 freemsgchain(mp);
7079 return (NULL);
7080 }
7081
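/*
* If the link isn't up yet, or ibd_m_start() hasn't completed,
* there's nothing useful to do with the chain; free it and return
* NULL so the frames are treated as consumed.
*/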
7082 if ((state->id_link_state != LINK_STATE_UP) ||
7083 !(state->id_mac_state & IBD_DRV_STARTED)) {
7084 freemsgchain(mp);
7085 mp = NULL;
7086 }
7087
7088 while (mp != NULL) {
7089 next = mp->b_next;
7090 mp->b_next = NULL;
7091 if (ibd_send(state, mp) == B_FALSE) {
7092 /* Send fail */
7093 mp->b_next = next;
7094 break;
7095 }
7096 mp = next;
7097 }
7098
7099 return (mp);
7100 }
7101
7102 /*
7103 * This handles Tx and Rx completions; with separate CQs, it handles
7104 * only Rx completions.
7105 */
7106 static uint_t
7107 ibd_intr(caddr_t arg)
7108 {
7109 ibd_state_t *state = (ibd_state_t *)arg;
7110
7111 ibd_poll_rcq(state, state->id_rcq_hdl);
7112
7113 return (DDI_INTR_CLAIMED);
7114 }
7115
7116 /*
7117 * Poll and fully drain the send cq
7118 */
7119 static void
7120 ibd_drain_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
7121 {
7122 ibt_wc_t *wcs = state->id_txwcs;
7123 uint_t numwcs = state->id_txwcs_size;
7124 ibd_wqe_t *wqe;
7125 ibd_swqe_t *head, *tail;
7126 ibt_wc_t *wc;
7127 uint_t num_polled;
7128 int i;
7129
7130 while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) {
7131 head = tail = NULL;
7132 for (i = 0, wc = wcs; i < num_polled; i++, wc++) {
7133 wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
7134 if (wc->wc_status != IBT_WC_SUCCESS) {
7135 /*
7136 * Channel being torn down.
7137 */
7138 if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
7139 DPRINT(5, "ibd_drain_scq: flush error");
7140 DPRINT(10, "ibd_drain_scq: Bad "
7141 "status %d", wc->wc_status);
7142 } else {
7143 DPRINT(10, "ibd_drain_scq: "
7144 "unexpected wc_status %d",
7145 wc->wc_status);
7146 }
7147 /*
7148 * Fallthrough to invoke the Tx handler to
7149 * release held resources, e.g., AH refcount.
7150 */
7151 }
7152 /*
7153 * Add this swqe to the list to be cleaned up.
7154 */
7155 if (head)
7156 tail->swqe_next = wqe;
7157 else
7158 head = WQE_TO_SWQE(wqe);
7159 tail = WQE_TO_SWQE(wqe);
7160 }
7161 tail->swqe_next = NULL;
7162 ibd_tx_cleanup_list(state, head, tail);
7163
7164 /*
7165 * Resume any blocked transmissions if possible
7166 */
7167 ibd_resume_transmission(state);
7168 }
7169 }
7170
7171 /*
7172 * Poll and fully drain the receive cq
7173 */
7174 static void
7175 ibd_drain_rcq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
7176 {
7177 ibt_wc_t *wcs = state->id_rxwcs;
7178 uint_t numwcs = state->id_rxwcs_size;
7179 ibd_rwqe_t *rwqe;
7180 ibt_wc_t *wc;
7181 uint_t num_polled;
7182 int i;
7183 mblk_t *head, *tail, *mp;
7184
7185 while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) {
7186 head = tail = NULL;
7187 for (i = 0, wc = wcs; i < num_polled; i++, wc++) {
7188 rwqe = (ibd_rwqe_t *)(uintptr_t)wc->wc_id;
7189 if (wc->wc_status != IBT_WC_SUCCESS) {
7190 /*
7191 * Channel being torn down.
7192 */
7193 if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
7194 DPRINT(5, "ibd_drain_rcq: "
7195 "expected flushed rwqe");
7196 } else {
7197 DPRINT(5, "ibd_drain_rcq: "
7198 "unexpected wc_status %d",
7199 wc->wc_status);
7200 }
7201 atomic_inc_32(
7202 &state->id_rx_list.dl_bufs_outstanding);
7203 freemsg(rwqe->rwqe_im_mblk);
7204 continue;
7205 }
7206 mp = ibd_process_rx(state, rwqe, wc);
7207 if (mp == NULL)
7208 continue;
7209
7210 /*
7211 * Add this mp to the list to send to the nw layer.
7212 */
7213 if (head)
7214 tail->b_next = mp;
7215 else
7216 head = mp;
7217 tail = mp;
7218 }
7219 if (head)
7220 mac_rx(state->id_mh, state->id_rh, head);
7221
7222 /*
7223 * Account for #rwqes polled.
7224 * Post more here, if less than one fourth full.
7225 */
7226 if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, -num_polled) <
7227 (state->id_ud_num_rwqe / 4))
7228 ibd_post_recv_intr(state);
7229 }
7230 }
7231
7232 /*
7233 * Common code for interrupt handling as well as for polling
7234 * for all completed wqe's while detaching.
7235 */
7236 static void
7237 ibd_poll_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
7238 {
7239 int flag, redo_flag;
7240 int redo = 1;
7241
7242 flag = IBD_CQ_POLLING;
7243 redo_flag = IBD_REDO_CQ_POLLING;
7244
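/*
* Only one thread drains the send CQ at a time; if another poller is
* already active, just mark that it should make one more pass
* (redo_flag) before it finishes.
*/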
7245 mutex_enter(&state->id_scq_poll_lock);
7246 if (state->id_scq_poll_busy & flag) {
7247 ibd_print_warn(state, "ibd_poll_scq: multiple polling threads");
7248 state->id_scq_poll_busy |= redo_flag;
7249 mutex_exit(&state->id_scq_poll_lock);
7250 return;
7251 }
7252 state->id_scq_poll_busy |= flag;
7253 mutex_exit(&state->id_scq_poll_lock);
7254
7255 /*
7256 * In some cases (eg detaching), this code can be invoked on
7257 * any cpu after disabling cq notification (thus no concurrency
7258 * exists). Apart from that, the following applies normally:
7259 * Transmit completion handling could be from any cpu if
7260 * Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ
7261 * is interrupt driven.
7262 */
7263
7264 /*
7265 * Poll and drain the CQ
7266 */
7267 ibd_drain_scq(state, cq_hdl);
7268
7269 /*
7270 * Enable CQ notifications and redrain the cq to catch any
7271 * completions we might have missed after the ibd_drain_scq()
7272 * above and before the ibt_enable_cq_notify() that follows.
7273 * Finally, service any new requests to poll the cq that
7274 * could've come in after the ibt_enable_cq_notify().
7275 */
7276 do {
7277 if (ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION) !=
7278 IBT_SUCCESS) {
7279 DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed");
7280 }
7281
7282 ibd_drain_scq(state, cq_hdl);
7283
7284 mutex_enter(&state->id_scq_poll_lock);
7285 if (state->id_scq_poll_busy & redo_flag)
7286 state->id_scq_poll_busy &= ~redo_flag;
7287 else {
7288 state->id_scq_poll_busy &= ~flag;
7289 redo = 0;
7290 }
7291 mutex_exit(&state->id_scq_poll_lock);
7292
7293 } while (redo);
7294 }
7295
7296 /*
7297 * Common code for interrupt handling as well as for polling
7298 * for all completed wqe's while detaching.
7299 */
7300 static void
7301 ibd_poll_rcq(ibd_state_t *state, ibt_cq_hdl_t rcq)
7302 {
7303 int flag, redo_flag;
7304 int redo = 1;
7305
7306 flag = IBD_CQ_POLLING;
7307 redo_flag = IBD_REDO_CQ_POLLING;
7308
7309 mutex_enter(&state->id_rcq_poll_lock);
7310 if (state->id_rcq_poll_busy & flag) {
7311 ibd_print_warn(state, "ibd_poll_rcq: multiple polling threads");
7312 state->id_rcq_poll_busy |= redo_flag;
7313 mutex_exit(&state->id_rcq_poll_lock);
7314 return;
7315 }
7316 state->id_rcq_poll_busy |= flag;
7317 mutex_exit(&state->id_rcq_poll_lock);
7318
7319 /*
7320 * Poll and drain the CQ
7321 */
7322 ibd_drain_rcq(state, rcq);
7323
7324 /*
7325 * Enable CQ notifications and redrain the cq to catch any
7326 * completions we might have missed after the ibd_drain_rcq()
7327 * above and before the ibt_enable_cq_notify() that follows.
7328 * Finally, service any new requests to poll the cq that
7329 * could've come in after the ibt_enable_cq_notify().
7330 */
7331 do {
7332 if (ibt_enable_cq_notify(rcq, IBT_NEXT_COMPLETION) !=
7333 IBT_SUCCESS) {
7334 DPRINT(10, "ibd_poll_rcq: ibt_enable_cq_notify() failed");
7335 }
7336
7337 ibd_drain_rcq(state, rcq);
7338
7339 mutex_enter(&state->id_rcq_poll_lock);
7340 if (state->id_rcq_poll_busy & redo_flag)
7341 state->id_rcq_poll_busy &= ~redo_flag;
7342 else {
7343 state->id_rcq_poll_busy &= ~flag;
7344 redo = 0;
7345 }
7346 mutex_exit(&state->id_rcq_poll_lock);
7347
7348 } while (redo);
7349 }
7350
7351 /*
7352 * Unmap the memory area associated with a given swqe.
7353 */
7354 void
7355 ibd_unmap_mem(ibd_state_t *state, ibd_swqe_t *swqe)
7356 {
7357 ibt_status_t stat;
7358
7359 DPRINT(20, "ibd_unmap_mem: wqe=%p, seg=%d\n", swqe, swqe->w_swr.wr_nds);
7360
7361 if (swqe->w_mi_hdl) {
7362 if ((stat = ibt_unmap_mem_iov(state->id_hca_hdl,
7363 swqe->w_mi_hdl)) != IBT_SUCCESS) {
7364 DPRINT(10,
7365 "failed in ibt_unmap_mem_iov, ret=%d\n", stat);
7366 }
7367 swqe->w_mi_hdl = NULL;
7368 }
7369 swqe->w_swr.wr_nds = 0;
7370 }
7371
7372 void
7373 ibd_dec_ref_ace(ibd_state_t *state, ibd_ace_t *ace)
7374 {
7375 /*
7376 * The recycling logic can be eliminated from here
7377 * and put into the async thread if we create another
7378 * list to hold ACE's for unjoined mcg's.
7379 */
7380 if (DEC_REF_DO_CYCLE(ace)) {
7381 ibd_mce_t *mce;
7382
7383 /*
7384 * Check with the lock taken: we decremented
7385 * reference count without the lock, and some
7386 * transmitter might already have bumped the
7387 * reference count (possible in case of multicast
7388 * disable when we leave the AH on the active
7389 * list). If not still 0, get out, leaving the
7390 * recycle bit intact.
7391 *
7392 * Atomically transition the AH from active
7393 * to free list, and queue a work request to
7394 * leave the group and destroy the mce. No
7395 * transmitter can be looking at the AH or
7396 * the MCE in between, since we have the
7397 * ac_mutex lock. In the SendOnly reap case,
7398 * it is not necessary to hold the ac_mutex
7399 * and recheck the ref count (since the AH was
7400 * taken off the active list); we just do it
7401 * to have uniform processing with the Full
7402 * reap case.
7403 */
7404 mutex_enter(&state->id_ac_mutex);
7405 mce = ace->ac_mce;
7406 if (GET_REF_CYCLE(ace) == 0) {
7407 CLEAR_REFCYCLE(ace);
7408 /*
7409 * Identify the case of fullmember reap as
7410 * opposed to mcg trap reap. Also, port up
7411 * might set ac_mce to NULL to indicate Tx
7412 * cleanup should do no more than put the
7413 * AH in the free list (see ibd_async_link).
7414 */
7415 if (mce != NULL) {
7416 ace->ac_mce = NULL;
7417 IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
7418 /*
7419 * mc_req was initialized at mce
7420 * creation time.
7421 */
7422 ibd_queue_work_slot(state,
7423 &mce->mc_req, IBD_ASYNC_REAP);
7424 }
7425 IBD_ACACHE_INSERT_FREE(state, ace);
7426 }
7427 mutex_exit(&state->id_ac_mutex);
7428 }
7429 }
7430
7431 /*
7432 * Common code that deals with clean ups after a successful or
7433 * erroneous transmission attempt.
7434 */
7435 static void
7436 ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe)
7437 {
7438 ibd_ace_t *ace = swqe->w_ahandle;
7439
7440 DPRINT(20, "ibd_tx_cleanup %p\n", swqe);
7441
7442 /*
7443 * If this was a dynamic mapping in ibd_send(), we need to
7444 * unmap here. If this was an lso buffer we'd used for sending,
7445 * we need to release the lso buf to the pool, since the resource
7446 * is scarce. However, if this was simply a normal send using
7447 * the copybuf (present in each swqe), we don't need to release it.
7448 */
7449 if (swqe->swqe_im_mblk != NULL) {
7450 if (swqe->w_buftype == IBD_WQE_MAPPED) {
7451 ibd_unmap_mem(state, swqe);
7452 } else if (swqe->w_buftype == IBD_WQE_LSOBUF) {
7453 ibd_release_lsobufs(state,
7454 swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds);
7455 }
7456 ibd_free_lsohdr(swqe, swqe->swqe_im_mblk);
7457 freemsg(swqe->swqe_im_mblk);
7458 swqe->swqe_im_mblk = NULL;
7459 }
7460
7461 /*
7462 * Drop the reference count on the AH; it can be reused
7463 * now for a different destination if there are no more
7464 * posted sends that will use it. This can be eliminated
7465 * if we can always associate each Tx buffer with an AH.
7466 * The ace can be null if we are cleaning up from the
7467 * ibd_send() error path.
7468 */
7469 if (ace != NULL) {
7470 ibd_dec_ref_ace(state, ace);
7471 }
7472
7473 /*
7474 * Release the send wqe for reuse.
7475 */
7476 swqe->swqe_next = NULL;
7477 ibd_release_swqe(state, swqe, swqe, 1);
7478 }
7479
7480 static void
7481 ibd_tx_cleanup_list(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail)
7482 {
7483 ibd_ace_t *ace;
7484 ibd_swqe_t *swqe;
7485 int n = 0;
7486
7487 DPRINT(20, "ibd_tx_cleanup_list %p %p\n", head, tail);
7488
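/*
 * Walk the swqe chain (linked through swqe_next), releasing the
 * per-wqe resources; the whole chain is then returned to the free
 * list with a single ibd_release_swqe() call.
 */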
7489 for (swqe = head; swqe != NULL; swqe = WQE_TO_SWQE(swqe->swqe_next)) {
7490
7491 /*
7492 * If this was a dynamic mapping in ibd_send(), we need to
7493 * unmap here. If this was an lso buffer we'd used for sending,
7494 * we need to release the lso buf to the pool, since the
7495 * resource is scarce. However, if this was simply a normal
7496 * send using the copybuf (present in each swqe), we don't need
7497 * to release it.
7498 */
7499 if (swqe->swqe_im_mblk != NULL) {
7500 if (swqe->w_buftype == IBD_WQE_MAPPED) {
7501 ibd_unmap_mem(state, swqe);
7502 } else if (swqe->w_buftype == IBD_WQE_LSOBUF) {
7503 ibd_release_lsobufs(state,
7504 swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds);
7505 }
7506 ibd_free_lsohdr(swqe, swqe->swqe_im_mblk);
7507 freemsg(swqe->swqe_im_mblk);
7508 swqe->swqe_im_mblk = NULL;
7509 }
7510
7511 /*
7512 * Drop the reference count on the AH; it can be reused
7513 * now for a different destination if there are no more
7514 * posted sends that will use it. This can be eliminated
7515 * if we can always associate each Tx buffer with an AH.
7516 * The ace can be null if we are cleaning up from the
7517 * ibd_send() error path.
7518 */
7519 ace = swqe->w_ahandle;
7520 if (ace != NULL) {
7521 ibd_dec_ref_ace(state, ace);
7522 }
7523 n++;
7524 }
7525
7526 /*
7527 * Release the send wqes for reuse.
7528 */
7529 ibd_release_swqe(state, head, tail, n);
7530 }
7531
7532 /*
7533 * Processing to be done after receipt of a packet; hand the packet
7534 * off to GLDv3 in the format it expects. The received packet has this
7535 * format: 2b sap :: 00 :: data.
7536 */
7537 static mblk_t *
7538 ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc)
7539 {
7540 ib_header_info_t *phdr;
7541 mblk_t *mp;
7542 ipoib_hdr_t *ipibp;
7543 ipha_t *iphap;
7544 ip6_t *ip6h;
7545 int len;
7546 ib_msglen_t pkt_len = wc->wc_bytes_xfer;
7547 uint32_t bufs;
7548
7549 /*
7550 * Track the number of rx buffers handed up that must be returned.
7551 */
7552 bufs = atomic_inc_32_nv(&state->id_rx_list.dl_bufs_outstanding);
7553
7554 /* Never run out of rwqes; copy via allocb and repost when running low */
7555 if (bufs >= state->id_rx_bufs_outstanding_limit) {
7556 atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding);
7557 atomic_inc_32(&state->id_rx_allocb);
7558 mp = allocb(pkt_len, BPRI_HI);
7559 if (mp) {
7560 bcopy(rwqe->rwqe_im_mblk->b_rptr, mp->b_rptr, pkt_len);
7561 ibd_post_recv(state, rwqe);
7562 } else { /* no memory */
7563 atomic_inc_32(&state->id_rx_allocb_failed);
7564 ibd_post_recv(state, rwqe);
7565 return (NULL);
7566 }
7567 } else {
7568 mp = rwqe->rwqe_im_mblk;
7569 }
7570
7572 /*
7573 * Adjust write pointer depending on how much data came in.
7574 */
7575 mp->b_wptr = mp->b_rptr + pkt_len;
7576
7577 /*
7578 * Make sure this is NULL or we're in trouble.
7579 */
7580 if (mp->b_next != NULL) {
7581 ibd_print_warn(state,
7582 "ibd_process_rx: got duplicate mp from rcq?");
7583 mp->b_next = NULL;
7584 }
7585
7586 /*
7587 * The IB link delivers one of the IB link-layer headers,
7588 * the Global Routing Header (GRH). The ibd driver uses
7589 * the information in the GRH to build the header_info
7590 * structure and passes it, along with the datagram, up
7591 * to GLDv3.
7592 * If the GRH is not valid, indicate that to GLDv3 by
7593 * setting the VerTcFlow field to 0.
7594 */
7595 phdr = (ib_header_info_t *)mp->b_rptr;
7596 if (wc->wc_flags & IBT_WC_GRH_PRESENT) {
7597 phdr->ib_grh.ipoib_sqpn = htonl(wc->wc_qpn);
7598
7599 /* If it is a loopback packet, just drop it. */
7600 if (state->id_enable_rc) {
7601 if (bcmp(&phdr->ib_grh.ipoib_sqpn,
7602 &state->rc_macaddr_loopback,
7603 IPOIB_ADDRL) == 0) {
7604 freemsg(mp);
7605 return (NULL);
7606 }
7607 } else {
7608 if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->id_macaddr,
7609 IPOIB_ADDRL) == 0) {
7610 freemsg(mp);
7611 return (NULL);
7612 }
7613 }
7614
7615 ovbcopy(&phdr->ib_grh.ipoib_sqpn, &phdr->ib_src,
7616 sizeof (ipoib_mac_t));
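/*
 * A destination GID whose prefix begins with 0xFF is an IPoIB
 * multicast address; anything else must have been unicast to our QPN.
 */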
7617 if (*(uint8_t *)(phdr->ib_grh.ipoib_dgid_pref) == 0xFF) {
7618 phdr->ib_dst.ipoib_qpn = htonl(IB_MC_QPN);
7619 IBD_CLEAR_SCOPE_PKEY(&phdr->ib_dst);
7620 } else {
7621 phdr->ib_dst.ipoib_qpn = state->id_macaddr.ipoib_qpn;
7622 }
7623 } else {
7624 /*
7625 * It cannot be an IBA multicast packet, so it must have been
7626 * unicast to us. Just copy the interface address to dst.
7627 */
7628 phdr->ib_grh.ipoib_vertcflow = 0;
7629 ovbcopy(&state->id_macaddr, &phdr->ib_dst,
7630 sizeof (ipoib_mac_t));
7631 }
7632
7633 /*
7634 * For ND6 packets, padding is at the front of the source/target
7635 * lladdr. However, the inet6 layer is not aware of it; hence remove
7636 * the padding from such packets.
7637 */
7638 ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t));
7639 if (ntohs(ipibp->ipoib_type) == ETHERTYPE_IPV6) {
7640 ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
7641 len = ntohs(ip6h->ip6_plen);
7642 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
7643 /* LINTED: E_CONSTANT_CONDITION */
7644 IBD_PAD_NSNA(ip6h, len, IBD_RECV);
7645 }
7646 }
7647
7648 /*
7649 * Update statistics
7650 */
7651 atomic_add_64(&state->id_rcv_bytes, pkt_len);
7652 atomic_inc_64(&state->id_rcv_pkt);
7653 if (bcmp(&phdr->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
7654 atomic_inc_64(&state->id_brd_rcv);
7655 else if ((ntohl(phdr->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
7656 atomic_inc_64(&state->id_multi_rcv);
7657
7658 iphap = (ipha_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
7659 /*
7660 * Set receive checksum status in mp
7661 * Hardware checksumming can be considered valid only if:
7662 * 1. CQE.IP_OK bit is set
7663 * 2. CQE.CKSUM = 0xffff
7664 * 3. IPv6 routing header is not present in the packet
7665 * 4. There are no IP options in the IP header
7666 */
7667
7668 if (((wc->wc_flags & IBT_WC_CKSUM_OK) == IBT_WC_CKSUM_OK) &&
7669 (wc->wc_cksum == 0xFFFF) &&
7670 (iphap->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)) {
7671 mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM_OK);
7672 }
7673
7674 return (mp);
7675 }
7676
7677 /*
7678 * Callback code invoked from STREAMs when the receive data buffer is
7679 * free for recycling.
7680 */
7681 static void
7682 ibd_freemsg_cb(char *arg)
7683 {
7684 ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
7685 ibd_state_t *state = rwqe->w_state;
7686
7687 atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding);
7688
7689 /*
7690 * If the driver is stopped, just free the rwqe.
7691 */
7692 if (atomic_add_32_nv(&state->id_running, 0) == 0) {
7693 DPRINT(6, "ibd_freemsg: wqe being freed");
7694 rwqe->rwqe_im_mblk = NULL;
7695 ibd_free_rwqe(state, rwqe);
7696 return;
7697 }
7698
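/*
 * Re-arm the receive buffer: wrap the copybuf in a fresh mblk (with
 * this same free callback) and repost the rwqe to the receive queue.
 */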
7699 rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
7700 state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
7701 if (rwqe->rwqe_im_mblk == NULL) {
7702 ibd_free_rwqe(state, rwqe);
7703 DPRINT(6, "ibd_freemsg: desballoc failed");
7704 return;
7705 }
7706
7707 ibd_post_recv(state, rwqe);
7708 }
7709
7710 static uint_t
7711 ibd_tx_recycle(caddr_t arg)
7712 {
7713 ibd_state_t *state = (ibd_state_t *)arg;
7714
7715 /*
7716 * Poll for completed entries
7717 */
7718 ibd_poll_scq(state, state->id_scq_hdl);
7719
7720 return (DDI_INTR_CLAIMED);
7721 }
7722
7723 #ifdef IBD_LOGGING
7724 static void
7725 ibd_log_init(void)
7726 {
7727 ibd_lbuf = kmem_zalloc(IBD_LOG_SZ, KM_SLEEP);
7728 ibd_lbuf_ndx = 0;
7729
7730 mutex_init(&ibd_lbuf_lock, NULL, MUTEX_DRIVER, NULL);
7731 }
7732
7733 static void
7734 ibd_log_fini(void)
7735 {
7736 if (ibd_lbuf)
7737 kmem_free(ibd_lbuf, IBD_LOG_SZ);
7738 ibd_lbuf_ndx = 0;
7739 ibd_lbuf = NULL;
7740
7741 mutex_destroy(&ibd_lbuf_lock);
7742 }
7743
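/*
 * Append one formatted message to the circular debug log. A typical
 * (illustrative) call looks like:
 *
 *	ibd_log("ibd_send: posted %d wqes", nwqes);
 *
 * Any printf-style format accepted by vsnprintf() may be used; the
 * arguments above are hypothetical.
 */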
7744 static void
7745 ibd_log(const char *fmt, ...)
7746 {
7747 va_list ap;
7748 uint32_t off;
7749 uint32_t msglen;
7750 char tmpbuf[IBD_DMAX_LINE];
7751
7752 if (ibd_lbuf == NULL)
7753 return;
7754
7755 va_start(ap, fmt);
7756 msglen = vsnprintf(tmpbuf, IBD_DMAX_LINE, fmt, ap);
7757 va_end(ap);
7758
7759 if (msglen >= IBD_DMAX_LINE)
7760 msglen = IBD_DMAX_LINE - 1;
7761
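/*
 * Reserve room for this message in the circular buffer under the
 * lock, wrapping back to the start when fewer than two full lines
 * remain. The copy itself is done outside the lock because the
 * reserved region belongs to this message alone.
 */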
7762 mutex_enter(&ibd_lbuf_lock);
7763
7764 off = ibd_lbuf_ndx; /* current msg should go here */
7765 if ((ibd_lbuf_ndx) && (ibd_lbuf[ibd_lbuf_ndx-1] != '\n'))
7766 ibd_lbuf[ibd_lbuf_ndx-1] = '\n';
7767
7768 ibd_lbuf_ndx += msglen; /* place where next msg should start */
7769 ibd_lbuf[ibd_lbuf_ndx] = 0; /* current msg should terminate */
7770
7771 if (ibd_lbuf_ndx >= (IBD_LOG_SZ - 2 * IBD_DMAX_LINE))
7772 ibd_lbuf_ndx = 0;
7773
7774 mutex_exit(&ibd_lbuf_lock);
7775
7776 bcopy(tmpbuf, ibd_lbuf+off, msglen); /* no lock needed for this */
7777 }
7778 #endif
7779
7780 /* ARGSUSED */
7781 static int
7782 ibd_create_partition(void *karg, intptr_t arg, int mode, cred_t *credp,
7783 int *rvalp)
7784 {
7785 ibd_create_ioctl_t *cmd = karg;
7786 ibd_state_t *state, *port_state, *p;
7787 int i, err, rval = 0;
7788 mac_register_t *macp;
7789 ibt_hca_portinfo_t *pinfop = NULL;
7790 ibt_status_t ibt_status;
7791 uint_t psize, pinfosz;
7792 boolean_t force_create = B_FALSE;
7793
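/*
 * Plumb a new partition link: validate the port instance and pkey,
 * reject duplicates, then allocate and attach the partition state,
 * register it with the mac layer and create its dls devnet node.
 */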
7794 cmd->ibdioc.ioc_status = 0;
7795
7796 if (cmd->ibdioc.ioc_port_inst < 0) {
7797 cmd->ibdioc.ioc_status = IBD_INVALID_PORT_INST;
7798 return (EINVAL);
7799 }
7800 port_state = ddi_get_soft_state(ibd_list, cmd->ibdioc.ioc_port_inst);
7801 if (port_state == NULL) {
7802 DPRINT(10, "ibd_create_partition: failed to get state %d",
7803 cmd->ibdioc.ioc_port_inst);
7804 cmd->ibdioc.ioc_status = IBD_INVALID_PORT_INST;
7805 return (EINVAL);
7806 }
7807
7808 /* Limited PKeys not supported */
7809 if (cmd->ioc_pkey <= IB_PKEY_INVALID_FULL) {
7810 rval = EINVAL;
7811 goto part_create_return;
7812 }
7813
7814 if (cmd->ioc_force_create == 0) {
7815 /*
7816 * Check if the port pkey table contains the pkey for which
7817 * this partition is being created.
7818 */
7819 ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl,
7820 port_state->id_port, &pinfop, &psize, &pinfosz);
7821
7822 if ((ibt_status != IBT_SUCCESS) || (psize != 1)) {
7823 rval = EINVAL;
7824 goto part_create_return;
7825 }
7826
7827 if (pinfop->p_linkstate != IBT_PORT_ACTIVE) {
7828 rval = ENETDOWN;
7829 cmd->ibdioc.ioc_status = IBD_PORT_IS_DOWN;
7830 goto part_create_return;
7831 }
7832
7833 for (i = 0; i < pinfop->p_pkey_tbl_sz; i++) {
7834 if (pinfop->p_pkey_tbl[i] == cmd->ioc_pkey) {
7835 break;
7836 }
7837 }
7838 if (i == pinfop->p_pkey_tbl_sz) {
7839 rval = EINVAL;
7840 cmd->ibdioc.ioc_status = IBD_PKEY_NOT_PRESENT;
7841 goto part_create_return;
7842 }
7843 } else {
7844 force_create = B_TRUE;
7845 }
7846
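/*
 * Fail with EEXIST if a partition with the same port instance, pkey
 * and partition link id is already plumbed.
 */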
7847 mutex_enter(&ibd_objlist_lock);
7848 for (p = ibd_objlist_head; p; p = p->id_next) {
7849 if ((p->id_port_inst == cmd->ibdioc.ioc_port_inst) &&
7850 (p->id_pkey == cmd->ioc_pkey) &&
7851 (p->id_plinkid == cmd->ioc_partid)) {
7852 mutex_exit(&ibd_objlist_lock);
7853 rval = EEXIST;
7854 cmd->ibdioc.ioc_status = IBD_PARTITION_EXISTS;
7855 goto part_create_return;
7856 }
7857 }
7858 mutex_exit(&ibd_objlist_lock);
7859
7860 state = kmem_zalloc(sizeof (ibd_state_t), KM_SLEEP);
7861
7862 state->id_type = IBD_PARTITION_OBJ;
7863
7864 state->id_plinkid = cmd->ioc_partid;
7865 state->id_dlinkid = cmd->ibdioc.ioc_linkid;
7866 state->id_port_inst = cmd->ibdioc.ioc_port_inst;
7867
7868 state->id_dip = port_state->id_dip;
7869 state->id_port = port_state->id_port;
7870 state->id_pkey = cmd->ioc_pkey;
7871 state->id_hca_guid = port_state->id_hca_guid;
7872 state->id_port_guid = port_state->id_port_guid;
7873 state->id_force_create = force_create;
7874
7875 mutex_init(&state->id_macst_lock, NULL, MUTEX_DRIVER, NULL);
7876 cv_init(&state->id_macst_cv, NULL, CV_DEFAULT, NULL);
7877
7878 if (ibd_part_attach(state, state->id_dip) != DDI_SUCCESS) {
7879 rval = EIO;
7880 cmd->ibdioc.ioc_status = IBD_NO_HW_RESOURCE;
7881 goto fail;
7882 }
7883
7884 if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
7885 rval = EAGAIN;
7886 goto fail;
7887 }
7888
7889 macp->m_type_ident = MAC_PLUGIN_IDENT_IB;
7890 macp->m_dip = port_state->id_dip;
7891 macp->m_instance = (uint_t)-1;
7892 macp->m_driver = state;
7893 macp->m_src_addr = (uint8_t *)&state->id_macaddr;
7894 macp->m_callbacks = &ibd_m_callbacks;
7895 macp->m_min_sdu = 0;
7896 macp->m_multicast_sdu = IBD_DEF_MAX_SDU;
7897 if (state->id_enable_rc) {
7898 macp->m_max_sdu = IBD_DEF_RC_MAX_SDU;
7899 } else {
7900 macp->m_max_sdu = IBD_DEF_MAX_SDU;
7901 }
7902 macp->m_priv_props = ibd_priv_props;
7903
7904 err = mac_register(macp, &state->id_mh);
7905 mac_free(macp);
7906
7907 if (err != 0) {
7908 DPRINT(10, "ibd_create_partition: mac_register() failed %d",
7909 err);
7910 rval = err;
7911 goto fail;
7912 }
7913
7914 err = dls_devnet_create(state->id_mh,
7915 cmd->ioc_partid, crgetzoneid(credp));
7916 if (err != 0) {
7917 DPRINT(10, "ibd_create_partition: dls_devnet_create() failed "
7918 "%d", err);
7919 rval = err;
7920 (void) mac_unregister(state->id_mh);
7921 goto fail;
7922 }
7923
7924 /*
7925 * Add the new partition state structure to the list
7926 */
7927 mutex_enter(&ibd_objlist_lock);
7928 if (ibd_objlist_head)
7929 state->id_next = ibd_objlist_head;
7930
7931 ibd_objlist_head = state;
7932 mutex_exit(&ibd_objlist_lock);
7933
7934 part_create_return:
7935 if (pinfop) {
7936 ibt_free_portinfo(pinfop, pinfosz);
7937 }
7938 return (rval);
7939
7940 fail:
7941 if (pinfop) {
7942 ibt_free_portinfo(pinfop, pinfosz);
7943 }
7944 ibd_part_unattach(state);
7945 kmem_free(state, sizeof (ibd_state_t));
7946 return (rval);
7947 }
7948
7949 /* ARGSUSED */
7950 static int
7951 ibd_delete_partition(void *karg, intptr_t arg, int mode, cred_t *credp,
7952 int *rvalp)
7953 {
7954 int err;
7955 datalink_id_t tmpid;
7956 ibd_state_t *node, *prev;
7957 ibd_delete_ioctl_t *cmd = karg;
7958
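/*
 * Unplumb an existing partition link: find its state on the partition
 * list, tear down the devnet node and mac registration, and free the
 * state once the instance has been quiesced.
 */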
7959 prev = NULL;
7960
7961 mutex_enter(&ibd_objlist_lock);
7962 node = ibd_objlist_head;
7963
7964 /* Find the ibd state structure corresponding to the partition */
7965 while (node != NULL) {
7966 if (node->id_plinkid == cmd->ioc_partid)
7967 break;
7968 prev = node;
7969 node = node->id_next;
7970 }
7971
7972 if (node == NULL) {
7973 mutex_exit(&ibd_objlist_lock);
7974 return (ENOENT);
7975 }
7976
7977 if ((err = dls_devnet_destroy(node->id_mh, &tmpid, B_TRUE)) != 0) {
7978 DPRINT(10, "ibd_delete_partition: dls_devnet_destroy() failed "
7979 "%d", err);
7980 mutex_exit(&ibd_objlist_lock);
7981 return (err);
7982 }
7983
7984 /*
7985 * Call ibd_part_unattach() only after making sure that the instance has
7986 * not been started yet and is also not in late hca init mode.
7987 */
7988 ibd_set_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS);
7989
7990 err = 0;
7991 if ((node->id_mac_state & IBD_DRV_STARTED) ||
7992 (node->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) ||
7993 (ibd_part_busy(node) != DDI_SUCCESS) ||
7994 ((err = mac_disable(node->id_mh)) != 0)) {
7995 (void) dls_devnet_create(node->id_mh, cmd->ioc_partid,
7996 crgetzoneid(credp));
7997 ibd_clr_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS);
7998 mutex_exit(&ibd_objlist_lock);
7999 return (err != 0 ? err : EBUSY);
8000 }
8001
8002 node->id_mac_state |= IBD_DRV_IN_DELETION;
8003
8004 ibd_part_unattach(node);
8005
8006 ibd_clr_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS);
8007
8008 /* Remove the partition state structure from the linked list */
8009 if (prev == NULL)
8010 ibd_objlist_head = node->id_next;
8011 else
8012 prev->id_next = node->id_next;
8013 mutex_exit(&ibd_objlist_lock);
8014
8015 if ((err = mac_unregister(node->id_mh)) != 0) {
8016 DPRINT(10, "ibd_delete_partition: mac_unregister() failed %d",
8017 err);
8018 }
8019
8020 cv_destroy(&node->id_macst_cv);
8021 mutex_destroy(&node->id_macst_lock);
8022
8023 kmem_free(node, sizeof (ibd_state_t));
8024
8025 return (0);
8026 }
8027
8028 /* ARGSUSED */
8029 static int
8030 ibd_get_partition_info(void *karg, intptr_t arg, int mode, cred_t *cred,
8031 int *rvalp)
8032 {
8033 ibd_ioctl_t cmd;
8034 ibpart_ioctl_t partioc;
8035 ibport_ioctl_t portioc;
8036 #ifdef _MULTI_DATAMODEL
8037 ibport_ioctl32_t portioc32;
8038 #endif
8039 ibd_state_t *state, *port_state;
8040 int size;
8041 ibt_hca_portinfo_t *pinfop = NULL;
8042 ibt_status_t ibt_status;
8043 uint_t psize, pinfosz;
8044 int rval = 0;
8045
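/*
 * Three queries are served here: IBD_INFO_CMD_IBPART returns the
 * attributes of one partition link, IBD_INFO_CMD_IBPORT copies out a
 * port's pkey table, and IBD_INFO_CMD_PKEYTBLSZ reports the size of
 * that table.
 */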
8046 size = sizeof (ibd_ioctl_t);
8047 if (ddi_copyin((void *)arg, &cmd, size, mode)) {
8048 return (EFAULT);
8049 }
8050 cmd.ioc_status = 0;
8051 switch (cmd.ioc_info_cmd) {
8052 case IBD_INFO_CMD_IBPART:
8053 size = sizeof (ibpart_ioctl_t);
8054 if (ddi_copyin((void *)arg, &partioc, size, mode)) {
8055 return (EFAULT);
8056 }
8057
8058 mutex_enter(&ibd_objlist_lock);
8059 /* Find the ibd state structure corresponding to the partition */
8060 for (state = ibd_objlist_head; state; state = state->id_next) {
8061 if (state->id_plinkid == cmd.ioc_linkid) {
8062 break;
8063 }
8064 }
8065
8066 if (state == NULL) {
8067 mutex_exit(&ibd_objlist_lock);
8068 return (ENOENT);
8069 }
8070
8071 partioc.ibdioc.ioc_linkid = state->id_dlinkid;
8072 partioc.ibdioc.ioc_port_inst = state->id_port_inst;
8073 partioc.ibdioc.ioc_portnum = state->id_port;
8074 partioc.ibdioc.ioc_hcaguid = state->id_hca_guid;
8075 partioc.ibdioc.ioc_portguid = state->id_port_guid;
8076 partioc.ibdioc.ioc_status = 0;
8077 partioc.ioc_partid = state->id_plinkid;
8078 partioc.ioc_pkey = state->id_pkey;
8079 partioc.ioc_force_create = state->id_force_create;
8080 if (ddi_copyout((void *)&partioc, (void *)arg, size, mode)) {
8081 mutex_exit(&ibd_objlist_lock);
8082 return (EFAULT);
8083 }
8084 mutex_exit(&ibd_objlist_lock);
8085
8086 break;
8087
8088 case IBD_INFO_CMD_IBPORT:
8089 if ((cmd.ioc_port_inst < 0) || ((port_state =
8090 ddi_get_soft_state(ibd_list, cmd.ioc_port_inst)) == NULL)) {
8091 DPRINT(10, "ibd_get_partition_info: failed to get"
8092 " state %d", cmd.ioc_port_inst);
8093 size = sizeof (ibd_ioctl_t);
8094 cmd.ioc_status = IBD_INVALID_PORT_INST;
8095 if (ddi_copyout((void *)&cmd, (void *)arg, size,
8096 mode)) {
8097 return (EFAULT);
8098 }
8099 return (EINVAL);
8100 }
8101 ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl,
8102 port_state->id_port, &pinfop, &psize, &pinfosz);
8103 if ((ibt_status != IBT_SUCCESS) || (psize != 1)) {
8104 return (EINVAL);
8105 }
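/*
 * Copy the pkey table out to the caller, honoring the caller's data
 * model when _MULTI_DATAMODEL is built.
 */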
8106 #ifdef _MULTI_DATAMODEL
8107 switch (ddi_model_convert_from(mode & FMODELS)) {
8108 case DDI_MODEL_ILP32: {
8109 size = sizeof (ibport_ioctl32_t);
8110 if (ddi_copyin((void *)arg, &portioc32, size, mode)) {
8111 rval = EFAULT;
8112 goto fail;
8113 }
8114 portioc32.ibdioc.ioc_status = 0;
8115 portioc32.ibdioc.ioc_portnum = port_state->id_port;
8116 portioc32.ibdioc.ioc_hcaguid =
8117 port_state->id_hca_guid;
8118 portioc32.ibdioc.ioc_portguid =
8119 port_state->id_port_guid;
8120 if (portioc32.ioc_pkey_tbl_sz !=
8121 pinfop->p_pkey_tbl_sz) {
8122 rval = EINVAL;
8123 size = sizeof (ibd_ioctl_t);
8124 portioc32.ibdioc.ioc_status =
8125 IBD_INVALID_PKEY_TBL_SIZE;
8126 if (ddi_copyout((void *)&portioc32.ibdioc,
8127 (void *)arg, size, mode)) {
8128 rval = EFAULT;
8129 goto fail;
8130 }
8131 goto fail;
8132 }
8133 size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t);
8134 if (ddi_copyout((void *)pinfop->p_pkey_tbl,
8135 (void *)(uintptr_t)portioc32.ioc_pkeys, size,
8136 mode)) {
8137 rval = EFAULT;
8138 goto fail;
8139 }
8140 size = sizeof (ibport_ioctl32_t);
8141 if (ddi_copyout((void *)&portioc32, (void *)arg, size,
8142 mode)) {
8143 rval = EFAULT;
8144 goto fail;
8145 }
8146 break;
8147 }
8148 case DDI_MODEL_NONE:
8149 size = sizeof (ibport_ioctl_t);
8150 if (ddi_copyin((void *)arg, &portioc, size, mode)) {
8151 rval = EFAULT;
8152 goto fail;
8153 }
8154 portioc.ibdioc.ioc_status = 0;
8155 portioc.ibdioc.ioc_portnum = port_state->id_port;
8156 portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
8157 portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
8158 if (portioc.ioc_pkey_tbl_sz != pinfop->p_pkey_tbl_sz) {
8159 rval = EINVAL;
8160 size = sizeof (ibd_ioctl_t);
8161 portioc.ibdioc.ioc_status =
8162 IBD_INVALID_PKEY_TBL_SIZE;
8163 if (ddi_copyout((void *)&portioc.ibdioc,
8164 (void *)arg, size, mode)) {
8165 rval = EFAULT;
8166 goto fail;
8167 }
8168 goto fail;
8169 }
8170 size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t);
8171 if (ddi_copyout((void *)pinfop->p_pkey_tbl,
8172 (void *)(portioc.ioc_pkeys), size, mode)) {
8173 rval = EFAULT;
8174 goto fail;
8175 }
8176 size = sizeof (ibport_ioctl_t);
8177 if (ddi_copyout((void *)&portioc, (void *)arg, size,
8178 mode)) {
8179 rval = EFAULT;
8180 goto fail;
8181 }
8182 break;
8183 }
8184 #else /* ! _MULTI_DATAMODEL */
8185 size = sizeof (ibport_ioctl_t);
8186 if (ddi_copyin((void *)arg, &portioc, size, mode)) {
8187 rval = EFAULT;
8188 goto fail;
8189 }
8190 portioc.ibdioc.ioc_status = 0;
8191 portioc.ibdioc.ioc_portnum = port_state->id_port;
8192 portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
8193 portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
8194 if (portioc.ioc_pkey_tbl_sz != pinfop->p_pkey_tbl_sz) {
8195 rval = EINVAL;
8196 size = sizeof (ibd_ioctl_t);
8197 portioc.ibdioc.ioc_status = IBD_INVALID_PKEY_TBL_SIZE;
8198 if (ddi_copyout((void *)&portioc.ibdioc, (void *)arg,
8199 size, mode)) {
8200 rval = EFAULT;
8201 goto fail;
8202 }
8203 goto fail;
8204 }
8205 size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t);
8206 if (ddi_copyout((void *)pinfop->p_pkey_tbl,
8207 (void *)(portioc.ioc_pkeys), size, mode)) {
8208 rval = EFAULT;
8209 goto fail;
8210 }
8211 size = sizeof (ibport_ioctl_t);
8212 if (ddi_copyout((void *)&portioc, (void *)arg, size,
8213 mode)) {
8214 rval = EFAULT;
8215 goto fail;
8216 }
8217 #endif /* _MULTI_DATAMODEL */
8218
8219 break;
8220
8221 case IBD_INFO_CMD_PKEYTBLSZ:
8222 if ((cmd.ioc_port_inst < 0) || ((port_state =
8223 ddi_get_soft_state(ibd_list, cmd.ioc_port_inst)) == NULL)) {
8224 DPRINT(10, "ibd_get_partition_info: failed to get"
8225 " state %d", cmd.ioc_port_inst);
8226 size = sizeof (ibd_ioctl_t);
8227 cmd.ioc_status = IBD_INVALID_PORT_INST;
8228 if (ddi_copyout((void *)&cmd, (void *)arg, size,
8229 mode)) {
8230 return (EFAULT);
8231 }
8232 return (EINVAL);
8233 }
8234 ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl,
8235 port_state->id_port, &pinfop, &psize, &pinfosz);
8236 if ((ibt_status != IBT_SUCCESS) || (psize != 1)) {
8237 return (EINVAL);
8238 }
8239 #ifdef _MULTI_DATAMODEL
8240 switch (ddi_model_convert_from(mode & FMODELS)) {
8241 case DDI_MODEL_ILP32: {
8242 size = sizeof (ibport_ioctl32_t);
8243 if (ddi_copyin((void *)arg, &portioc32, size, mode)) {
8244 rval = EFAULT;
8245 goto fail;
8246 }
8247 portioc32.ibdioc.ioc_status = 0;
8248 portioc32.ibdioc.ioc_portnum = port_state->id_port;
8249 portioc32.ibdioc.ioc_hcaguid =
8250 port_state->id_hca_guid;
8251 portioc32.ibdioc.ioc_portguid =
8252 port_state->id_port_guid;
8253 portioc32.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz;
8254 if (ddi_copyout((void *)&portioc32, (void *)arg, size,
8255 mode)) {
8256 rval = EFAULT;
8257 goto fail;
8258 }
8259 break;
8260 }
8261 case DDI_MODEL_NONE:
8262 size = sizeof (ibport_ioctl_t);
8263 if (ddi_copyin((void *)arg, &portioc, size, mode)) {
8264 rval = EFAULT;
8265 goto fail;
8266 }
8267 portioc.ibdioc.ioc_status = 0;
8268 portioc.ibdioc.ioc_portnum = port_state->id_port;
8269 portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
8270 portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
8271 portioc.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz;
8272 if (ddi_copyout((void *)&portioc, (void *)arg, size,
8273 mode)) {
8274 rval = EFAULT;
8275 goto fail;
8276 }
8277 break;
8278 }
8279 #else /* ! _MULTI_DATAMODEL */
8280 size = sizeof (ibport_ioctl_t);
8281 if (ddi_copyin((void *)arg, &portioc, size, mode)) {
8282 rval = EFAULT;
8283 goto fail;
8284 }
8285 portioc.ibdioc.ioc_status = 0;
8286 portioc.ibdioc.ioc_portnum = port_state->id_port;
8287 portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
8288 portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
8289 portioc.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz;
8290 if (ddi_copyout((void *)&portioc, (void *)arg, size,
8291 mode)) {
8292 rval = EFAULT;
8293 goto fail;
8294 }
8295 #endif /* _MULTI_DATAMODEL */
8296 break;
8297
8298 default:
8299 return (EINVAL);
8300
8301 } /* switch (cmd.ioc_info_cmd) */
8302 fail:
8303 if (pinfop) {
8304 ibt_free_portinfo(pinfop, pinfosz);
8305 }
8306 return (rval);
8307 }
8308
8309 /* ARGSUSED */
8310 static void
8311 ibdpd_async_handler(void *arg, ibt_hca_hdl_t hca_hdl,
8312 ibt_async_code_t code, ibt_async_event_t *event)
8313 {
8314 ibd_state_t *state = (ibd_state_t *)arg;
8315 link_state_t lstate;
8316
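/*
 * The port driver only tracks port state transitions here: map them
 * to a link state and notify the mac layer when the state changes.
 */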
8317 switch (code) {
8318 case IBT_EVENT_PORT_UP:
8319 case IBT_ERROR_PORT_DOWN:
8320 if (ibd_get_port_state(state, &lstate) != 0)
8321 break;
8322
8323 if (state->id_link_state != lstate) {
8324 state->id_link_state = lstate;
8325 mac_link_update(state->id_mh, lstate);
8326 }
8327 break;
8328 default:
8329 break;
8330 }
8331 }
8332
8333 static int
8334 ibd_get_port_state(ibd_state_t *state, link_state_t *lstate)
8335 {
8336 ibt_hca_portinfo_t *port_infop;
8337 uint_t psize, port_infosz;
8338 ibt_status_t ret;
8339
8340 ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
8341 &port_infop, &psize, &port_infosz);
8342 if ((ret != IBT_SUCCESS) || (psize != 1))
8343 return (-1);
8344
8345 state->id_sgid = *port_infop->p_sgid_tbl;
8346 state->id_link_speed = ibd_get_portspeed(state);
8347
8348 if (port_infop->p_linkstate == IBT_PORT_ACTIVE)
8349 *lstate = LINK_STATE_UP;
8350 else
8351 *lstate = LINK_STATE_DOWN;
8352
8353 ibt_free_portinfo(port_infop, port_infosz);
8354 return (0);
8355 }
8356
8357 static int
8358 ibd_port_attach(dev_info_t *dip)
8359 {
8360 ibd_state_t *state;
8361 link_state_t lstate;
8362 int instance;
8363 ibt_status_t ret;
8364
8365 /*
8366 * Allocate softstate structure
8367 */
8368 instance = ddi_get_instance(dip);
8369 if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE) {
8370 DPRINT(10, "ibd_port_attach: ddi_soft_state_zalloc() failed");
8371 return (DDI_FAILURE);
8372 }
8373
8374 state = ddi_get_soft_state(ibd_list, instance);
8375
8376 state->id_dip = dip;
8377 state->id_type = IBD_PORT_DRIVER;
8378
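/*
 * The port number, HCA guid and port guid arrive as properties on the
 * port devinfo node; a zero value for any of them is a fatal error.
 */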
8379 if ((state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
8380 "port-number", 0)) == 0) {
8381 DPRINT(10, "ibd_port_attach: invalid port number (%d)",
8382 state->id_port);
8383 goto done;
8384 }
8385 if ((state->id_hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0,
8386 "hca-guid", 0)) == 0) {
8387 DPRINT(10, "ibd_port_attach: hca has invalid guid (0x%llx)",
8388 state->id_hca_guid);
8389 goto done;
8390 }
8391 if ((state->id_port_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0,
8392 "port-guid", 0)) == 0) {
8393 DPRINT(10, "ibd_port_attach: port has invalid guid (0x%llx)",
8394 state->id_port_guid);
8395 goto done;
8396 }
8397
8398 /*
8399 * Attach to IBTL
8400 */
8401 if ((ret = ibt_attach(&ibdpd_clnt_modinfo, dip, state,
8402 &state->id_ibt_hdl)) != IBT_SUCCESS) {
8403 DPRINT(10, "ibd_port_attach: failed in ibt_attach(), ret=%d",
8404 ret);
8405 goto done;
8406 }
8407
8408 state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE;
8409
8410 if ((ret = ibt_open_hca(state->id_ibt_hdl, state->id_hca_guid,
8411 &state->id_hca_hdl)) != IBT_SUCCESS) {
8412 DPRINT(10, "ibd_port_attach: ibt_open_hca() failed, ret=%d",
8413 ret);
8414 goto done;
8415 }
8416 state->id_mac_state |= IBD_DRV_HCA_OPENED;
8417
8418 /* Update link status */
8419
8420 if (ibd_get_port_state(state, &lstate) != 0) {
8421 DPRINT(10, "ibd_port_attach: ibd_get_port_state() failed");
8423 goto done;
8424 }
8425 state->id_link_state = lstate;
8426 /*
8427 * Register ibd interfaces with the Nemo framework
8428 */
8429 if (ibd_register_mac(state, dip) != IBT_SUCCESS) {
8430 DPRINT(10, "ibd_port_attach: failed in ibd_register_mac()");
8431 goto done;
8432 }
8433 state->id_mac_state |= IBD_DRV_MAC_REGISTERED;
8434
8435 mac_link_update(state->id_mh, lstate);
8436
8437 return (DDI_SUCCESS);
8438 done:
8439 (void) ibd_port_unattach(state, dip);
8440 return (DDI_FAILURE);
8441 }
8442
8443 static int
8444 ibd_port_unattach(ibd_state_t *state, dev_info_t *dip)
8445 {
8446 int instance;
8447 uint32_t progress = state->id_mac_state;
8448 ibt_status_t ret;
8449
8450 if (progress & IBD_DRV_MAC_REGISTERED) {
8451 (void) mac_unregister(state->id_mh);
8452 state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED);
8453 }
8454
8455 if (progress & IBD_DRV_HCA_OPENED) {
8456 if ((ret = ibt_close_hca(state->id_hca_hdl)) !=
8457 IBT_SUCCESS) {
8458 ibd_print_warn(state, "failed to close "
8459 "HCA device, ret=%d", ret);
8460 }
8461 state->id_hca_hdl = NULL;
8462 state->id_mac_state &= (~IBD_DRV_HCA_OPENED);
8463 }
8464
8465 if (progress & IBD_DRV_IBTL_ATTACH_DONE) {
8466 if ((ret = ibt_detach(state->id_ibt_hdl)) != IBT_SUCCESS) {
8467 ibd_print_warn(state,
8468 "ibt_detach() failed, ret=%d", ret);
8469 }
8470 state->id_ibt_hdl = NULL;
8471 state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE);
8472 }
8473 instance = ddi_get_instance(dip);
8474 ddi_soft_state_free(ibd_list, instance);
8475
8476 return (DDI_SUCCESS);
8477 }
8478
8479 ibt_status_t
8480 ibd_get_part_attr(datalink_id_t linkid, ibt_part_attr_t *attr)
8481 {
8482 ibd_state_t *state;
8483
8484 mutex_enter(&ibd_objlist_lock);
8485
8486 /* Find the ibd state structure corresponding to the partition */
8487 for (state = ibd_objlist_head; state; state = state->id_next) {
8488 if (state->id_plinkid == linkid) {
8489 break;
8490 }
8491 }
8492
8493 if (state == NULL) {
8494 mutex_exit(&ibd_objlist_lock);
8495 return (IBT_NO_SUCH_OBJECT);
8496 }
8497
8498 attr->pa_dlinkid = state->id_dlinkid;
8499 attr->pa_plinkid = state->id_plinkid;
8500 attr->pa_port = state->id_port;
8501 attr->pa_hca_guid = state->id_hca_guid;
8502 attr->pa_port_guid = state->id_port_guid;
8503 attr->pa_pkey = state->id_pkey;
8504
8505 mutex_exit(&ibd_objlist_lock);
8506
8507 return (IBT_SUCCESS);
8508 }
8509
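/*
 * Build and return an array holding one ibt_part_attr_t entry for
 * each partition object currently on the list; *nparts is set to the
 * number of entries.
 */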
8510 ibt_status_t
8511 ibd_get_all_part_attr(ibt_part_attr_t **attr_list, int *nparts)
8512 {
8513 ibd_state_t *state;
8514 int n = 0;
8515 ibt_part_attr_t *attr;
8516
8517 mutex_enter(&ibd_objlist_lock);
8518
8519 for (state = ibd_objlist_head; state; state = state->id_next)
8520 n++;
8521
8522 *nparts = n;
8523 if (n == 0) {
8524 *attr_list = NULL;
8525 mutex_exit(&ibd_objlist_lock);
8526 return (IBT_SUCCESS);
8527 }
8528
8529 *attr_list = kmem_alloc(sizeof (ibt_part_attr_t) * n, KM_SLEEP);
8530 attr = *attr_list;
8531 for (state = ibd_objlist_head; state; state = state->id_next) {
8532 #ifdef DEBUG
8533 ASSERT(n > 0);
8534 n--;
8535 #endif
8536 attr->pa_dlinkid = state->id_dlinkid;
8537 attr->pa_plinkid = state->id_plinkid;
8538 attr->pa_port = state->id_port;
8539 attr->pa_hca_guid = state->id_hca_guid;
8540 attr->pa_port_guid = state->id_port_guid;
8541 attr->pa_pkey = state->id_pkey;
8542 attr++;
8543 }
8544
8545 mutex_exit(&ibd_objlist_lock);
8546 return (IBT_SUCCESS);
8547 }