8368 remove warlock leftovers from usr/src/uts
--- old/usr/src/uts/common/io/ib/clients/ibd/ibd.c
+++ new/usr/src/uts/common/io/ib/clients/ibd/ibd.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
24 24 */
25 25
26 26 /*
27 27 * An implementation of the IPoIB standard based on PSARC 2001/289.
28 28 */
29 29
30 30 #include <sys/types.h>
31 31 #include <sys/conf.h>
32 32 #include <sys/ddi.h>
33 33 #include <sys/sunddi.h>
34 34 #include <sys/modctl.h>
35 35 #include <sys/stropts.h>
36 36 #include <sys/stream.h>
37 37 #include <sys/strsun.h>
38 38 #include <sys/strsubr.h>
39 39 #include <sys/dlpi.h>
40 40 #include <sys/mac_provider.h>
41 41
42 42 #include <sys/pattr.h> /* for HCK_FULLCKSUM */
43 43 #include <sys/sysmacros.h> /* for offsetof */
44 44 #include <sys/disp.h> /* for async thread pri */
45 45 #include <sys/atomic.h> /* for atomic_add*() */
46 46 #include <sys/ethernet.h> /* for ETHERTYPE_IPV6 */
47 47 #include <netinet/in.h> /* for netinet/ip.h below */
48 48 #include <netinet/ip.h> /* for struct ip */
49 49 #include <netinet/udp.h> /* for struct udphdr */
50 50 #include <inet/common.h> /* for inet/ip.h below */
51 51 #include <inet/ip.h> /* for ipha_t */
52 52 #include <inet/ip6.h> /* for ip6_t */
53 53 #include <inet/tcp.h> /* for tcph_t */
54 54 #include <netinet/icmp6.h> /* for icmp6_t */
55 55 #include <sys/callb.h>
56 56 #include <sys/modhash.h>
57 57
58 58 #include <sys/ib/clients/ibd/ibd.h>
59 59 #include <sys/ib/mgt/sm_attr.h> /* for SM_INIT_TYPE_* */
60 60 #include <sys/note.h>
61 61 #include <sys/multidata.h>
62 62
63 63 #include <sys/ib/mgt/ibmf/ibmf.h> /* for ibd_get_portspeed */
64 64
65 65 #include <sys/priv_names.h>
66 66 #include <sys/dls.h>
67 67 #include <sys/dld_ioc.h>
68 68 #include <sys/policy.h>
69 69 #include <sys/ibpart.h>
70 70 #include <sys/file.h>
71 71
72 72 /*
73 73 * The write-up below includes details on the following:
74 74 * 1. The dladm administrative model.
75 75 * 2. Late HCA initialization feature.
76 76 * 3. Brussels support and its implications for the current architecture.
77 77 *
78 78 * 1. The dladm administrative model.
79 79 * ------------------------------------------
80 80 * With the dladm model, ibnex will create one ibd instance per port. These
81 81 * instances will be created independent of the port state.
82 82 *
83 83 * The ibd driver is two-faceted: one side of it works as the port driver and
84 84 * the other as the partition object driver.
85 85 *
86 86 * The port instance is a child of the HCA, and will have an entry in the devfs.
87 87 * A DDI attach only happens for the port driver, and its attach is
88 88 * handled in ibd_port_attach(). Similarly, a DDI detach for the port driver is
89 89 * handled in ibd_port_unattach().
90 90 *
91 91 * The partition object is only a registrant to the mac layer via mac_register()
92 92 * and does not have an entry in the device tree. There is no DDI softstate
93 93 * managed by the DDI framework for the partition objects. However, the state is
94 94 * managed inside the ibd driver, and every partition object hangs off the
95 95 * "ibd_objlist_head".
96 96 *
97 97 * The partition object first comes into existence when a user runs the
98 98 * 'create-part' subcommand of dladm. This is like invoking the attach entry
99 99 * point of the partition object. The partition object goes away with the
100 100 * 'delete-part' subcommand of dladm. This is like invoking the detach entry
101 101 * point of the partition object.
102 102 *
103 103 * The create-part and delete-part subcommands result in dld ioctls that end up
104 104 * calling ibd_create_partition() and ibd_delete_partition() respectively.
105 105 * These ioctls are registered with the dld layer in _init() via a call to
106 106 * dld_ioc_register().
107 107 *
108 108 * The port instance by itself cannot be plumbed. It is only the partition
109 109 * objects that can be plumbed; they alone, and not the port driver,
110 110 * participate in I/O.
111 111 *
112 112 * There are some info ioctls supported in ibd which are used by dladm(1M) to
113 113 * display useful information. The info entry point for ibd is
114 114 * ibd_get_partition_info().
115 115 *
116 116 * 2. Late HCA initialization feature.
117 117 * ------------------------------------
118 118 * As mentioned in section 1, the user creates the partition objects via
119 119 * dladm(1M). It is possible that:
120 120 * a) The physical port itself is down and the SM cannot be reached.
121 121 * b) The PKEY specified by the user has not been created in the SM yet.
122 122 * c) An IPoIB broadcast group for the specified PKEY is not present.
123 123 *
124 124 * In all of the above cases, complete initialization of the partition object is
125 125 * not possible. However, the new model allows the creation of partition
126 126 * objects even in such cases, but defers their initialization until later.
127 127 * When such a partition object is plumbed, the link state will be displayed as
128 128 * "down".
129 129 * The driver, at this point, is listening to events that herald the
130 130 * availability of resources -
131 131 * i) LINK_UP when the link becomes available
132 132 * ii) PORT_CHANGE when the PKEY has been created
133 133 * iii) MCG_CREATED when the IPoIB broadcast group for the given pkey has been
134 134 * created
135 135 * via ibd_async_handler() for events i) and ii), and via
136 136 * ibd_snet_notices_handler() for iii.
137 137 * The driver handles these events (as and when they arrive) and completes the
138 138 * initialization of the partition object and transitions it to a usable state.
139 139 *
140 140 * 3. Brussels support and its implications for the current architecture.
141 141 * ---------------------------------------------------------------------
142 142 * The Brussels support introduces two new interfaces to the ibd driver -
143 143 * ibd_m_getprop() and ibd_m_setprop().
144 144 * These interfaces allow setting and retrieval of certain properties.
145 145 * Some of them are public properties while most others are private properties
146 146 * meant to be used by developers. Tuning the latter kind can cause
147 147 * performance issues and should not be attempted without understanding the
148 148 * implications. All properties are specific to an instance of either the
149 149 * partition object or the port driver.
150 150 *
151 151 * The public properties are : mtu and linkmode.
152 152 * mtu is a read-only property.
153 153 * linkmode can take two values - UD and CM.
154 154 *
155 155 * Changing the linkmode requires some bookkeeping in the driver. The
156 156 * capabilities need to be re-reported to the mac layer. This is done by
157 157 * calling mac_capab_update(). The maxsdu is updated by calling
158 158 * mac_maxsdu_update2().
159 159 * The private properties retain their values across the change of linkmode.
160 160 * NOTE:
161 161 * - The port driver does not support any property apart from mtu.
162 162 * - All other properties are only meant for the partition object.
163 163 * - The properties cannot be set when an instance is plumbed. The
164 164 * instance has to be unplumbed to effect any setting.
165 165 */
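To make the linkmode bookkeeping of section 3 concrete, here is a minimal sketch (not taken from this file) of what a UD/CM switch has to do. It assumes the mac handle is kept in a field named id_mh, that mac_capab_update() takes just the handle, and that mac_maxsdu_update2() takes the handle plus the new unicast and multicast SDUs; ibd_linkmode_changed() itself is a hypothetical helper name.

/*
 * Illustrative sketch only; id_mh and the mac_maxsdu_update2() signature
 * are assumptions, and this helper does not exist in the driver.
 */
static void
ibd_linkmode_changed(ibd_state_t *state, boolean_t enable_rc)
{
	state->id_enable_rc = enable_rc;

	/* Re-report capabilities; LSO/checksum offload differ between UD and CM */
	mac_capab_update(state->id_mh);

	/*
	 * Advertise the SDU for the new mode; multicast always travels
	 * over UD, so the multicast SDU stays at the UD value.
	 */
	(void) mac_maxsdu_update2(state->id_mh,
	    enable_rc ? IBD_DEF_RC_MAX_SDU : IBD_DEF_MAX_SDU,
	    IBD_DEF_MAX_SDU);
}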
166 166
167 167 /*
168 168 * Driver wide tunables
169 169 *
170 170 * ibd_tx_softintr
171 171 * ibd_rx_softintr
172 172 * The softintr mechanism allows ibd to avoid event queue overflows if
173 173 * the receive/completion handlers turn out to be expensive. These are enabled
174 174 * by default.
175 175 *
176 176 * ibd_log_sz
177 177 * This specifies the size of the ibd log buffer in bytes. The buffer is
178 178 * allocated and logging is enabled only when IBD_LOGGING is defined.
179 179 *
180 180 */
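For illustration (assuming the module is delivered under the name ibd), these tunables can be set from /etc/system before the driver loads, e.g. "set ibd:ibd_rx_softintr = 0" to disable the receive-side soft interrupt; ibd_log_sz can be raised the same way on an IBD_LOGGING build.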
181 181 uint_t ibd_rx_softintr = 1;
182 182 uint_t ibd_tx_softintr = 1;
183 183
184 184 #ifdef IBD_LOGGING
185 185 uint_t ibd_log_sz = 0x20000;
186 186 #endif
187 187
188 188 #ifdef IBD_LOGGING
189 189 #define IBD_LOG_SZ ibd_log_sz
190 190 #endif
191 191
192 192 /* Post IBD_RX_POST_CNT receive work requests at a time. */
193 193 #define IBD_RX_POST_CNT 8
194 194
195 195 /* Hash into one of (1 << IBD_LOG_RX_POST) rx post queues */
196 196 #define IBD_LOG_RX_POST 4
197 197
198 198 /* Minimum number of receive work requests the driver must always have */
199 199 #define IBD_RWQE_MIN ((IBD_RX_POST_CNT << IBD_LOG_RX_POST) * 4)
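With the defaults above, IBD_RWQE_MIN works out to (8 << 4) * 4 = 512 receive work requests.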
200 200
201 201 /*
202 202 * LSO parameters
203 203 */
204 204 #define IBD_LSO_MAXLEN 65536
205 205 #define IBD_LSO_BUFSZ 8192
206 206
207 207 /*
208 208 * Async operation states
209 209 */
210 210 #define IBD_OP_NOTSTARTED 0
211 211 #define IBD_OP_ONGOING 1
212 212 #define IBD_OP_COMPLETED 2
213 213 #define IBD_OP_ERRORED 3
214 214 #define IBD_OP_ROUTERED 4
215 215
216 216 /*
217 217 * Start/stop in-progress flags; note that restart must always remain
218 218 * the OR of start and stop flag values.
219 219 */
220 220 #define IBD_DRV_START_IN_PROGRESS 0x10000000
221 221 #define IBD_DRV_STOP_IN_PROGRESS 0x20000000
222 222 #define IBD_DRV_RESTART_IN_PROGRESS 0x30000000
223 223 #define IBD_DRV_DELETE_IN_PROGRESS IBD_DRV_RESTART_IN_PROGRESS
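As a quick check of the note above: 0x10000000 | 0x20000000 == 0x30000000, so the restart flag is indeed the OR of the start and stop flags (and doubles as the delete-in-progress value).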
224 224
225 225 /*
226 226 * Miscellaneous constants
227 227 */
228 228 #define IB_MGID_IPV4_LOWGRP_MASK 0xFFFFFFFF
229 229 #define IBD_DEF_MAX_SDU 2044
230 230 #define IBD_DEF_MAX_MTU (IBD_DEF_MAX_SDU + IPOIB_HDRSIZE)
231 231 #define IBD_DEF_RC_MAX_SDU 65520
232 232 #define IBD_DEF_RC_MAX_MTU (IBD_DEF_RC_MAX_SDU + IPOIB_HDRSIZE)
233 233 #define IBD_DEFAULT_QKEY 0xB1B
234 234 #ifdef IBD_LOGGING
235 235 #define IBD_DMAX_LINE 100
236 236 #endif
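For reference, IPOIB_HDRSIZE is the 4-byte IPoIB encapsulation header, so the defaults above give 2044 + 4 = 2048 for the UD MTU and 65520 + 4 = 65524 for the RC MTU.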
237 237
238 238 /*
239 239 * Enumerations for link states
240 240 */
241 241 typedef enum {
242 242 IBD_LINK_DOWN,
243 243 IBD_LINK_UP,
244 244 IBD_LINK_UP_ABSENT
245 245 } ibd_link_op_t;
246 246
247 247 /*
248 248 * Driver State Pointer
249 249 */
250 250 void *ibd_list;
251 251
252 252 /*
253 253 * Driver Global Data
254 254 */
255 255 ibd_global_state_t ibd_gstate;
256 256
257 257 /*
258 258 * Partition object list
259 259 */
260 260 ibd_state_t *ibd_objlist_head = NULL;
261 261 kmutex_t ibd_objlist_lock;
262 262
263 263 int ibd_rc_conn_timeout = 60 * 10; /* 10 minutes */
264 264
265 265 /*
266 266 * Logging
267 267 */
268 268 #ifdef IBD_LOGGING
269 269 kmutex_t ibd_lbuf_lock;
270 270 uint8_t *ibd_lbuf;
271 271 uint32_t ibd_lbuf_ndx;
272 272 #endif
273 273
274 274 /*
275 275 * Required system entry points
276 276 */
277 277 static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
278 278 static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
279 279
280 280 /*
281 281 * Required driver entry points for GLDv3
282 282 */
283 283 static int ibd_m_stat(void *, uint_t, uint64_t *);
284 284 static int ibd_m_start(void *);
285 285 static void ibd_m_stop(void *);
286 286 static int ibd_m_promisc(void *, boolean_t);
287 287 static int ibd_m_multicst(void *, boolean_t, const uint8_t *);
288 288 static int ibd_m_unicst(void *, const uint8_t *);
289 289 static mblk_t *ibd_m_tx(void *, mblk_t *);
290 290 static boolean_t ibd_m_getcapab(void *, mac_capab_t, void *);
291 291
292 292 static int ibd_m_setprop(void *, const char *, mac_prop_id_t, uint_t,
293 293 const void *);
294 294 static int ibd_m_getprop(void *, const char *, mac_prop_id_t, uint_t, void *);
295 295 static void ibd_m_propinfo(void *, const char *, mac_prop_id_t,
296 296 mac_prop_info_handle_t);
297 297 static int ibd_set_priv_prop(ibd_state_t *, const char *, uint_t,
298 298 const void *);
299 299 static int ibd_get_priv_prop(ibd_state_t *, const char *, uint_t, void *);
300 300
301 301 /*
302 302 * Private driver entry points for GLDv3
303 303 */
304 304
305 305 /*
306 306 * Initialization
307 307 */
308 308 static int ibd_state_init(ibd_state_t *, dev_info_t *);
309 309 static int ibd_init_txlist(ibd_state_t *);
310 310 static int ibd_init_rxlist(ibd_state_t *);
311 311 static int ibd_acache_init(ibd_state_t *);
312 312 #ifdef IBD_LOGGING
313 313 static void ibd_log_init(void);
314 314 #endif
315 315
316 316 /*
317 317 * Termination/cleanup
318 318 */
319 319 static void ibd_state_fini(ibd_state_t *);
320 320 static void ibd_fini_txlist(ibd_state_t *);
321 321 static void ibd_fini_rxlist(ibd_state_t *);
322 322 static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *);
323 323 static void ibd_tx_cleanup_list(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *);
324 324 static void ibd_acache_fini(ibd_state_t *);
325 325 #ifdef IBD_LOGGING
326 326 static void ibd_log_fini(void);
327 327 #endif
328 328
329 329 /*
330 330 * Allocation/acquire/map routines
331 331 */
332 332 static int ibd_alloc_tx_copybufs(ibd_state_t *);
333 333 static int ibd_alloc_rx_copybufs(ibd_state_t *);
334 334 static int ibd_alloc_tx_lsobufs(ibd_state_t *);
335 335 static ibd_swqe_t *ibd_acquire_swqe(ibd_state_t *);
336 336 static int ibd_acquire_lsobufs(ibd_state_t *, uint_t, ibt_wr_ds_t *,
337 337 uint32_t *);
338 338
339 339 /*
340 340 * Free/release/unmap routines
341 341 */
342 342 static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *);
343 343 static void ibd_free_tx_copybufs(ibd_state_t *);
344 344 static void ibd_free_rx_copybufs(ibd_state_t *);
345 345 static void ibd_free_rx_rsrcs(ibd_state_t *);
346 346 static void ibd_free_tx_lsobufs(ibd_state_t *);
347 347 static void ibd_release_swqe(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *, int);
348 348 static void ibd_release_lsobufs(ibd_state_t *, ibt_wr_ds_t *, uint32_t);
349 349 static void ibd_free_lsohdr(ibd_swqe_t *, mblk_t *);
350 350
351 351 /*
352 352 * Handlers/callback routines
353 353 */
354 354 static uint_t ibd_intr(caddr_t);
355 355 static uint_t ibd_tx_recycle(caddr_t);
356 356 static void ibd_rcq_handler(ibt_cq_hdl_t, void *);
357 357 static void ibd_scq_handler(ibt_cq_hdl_t, void *);
358 358 static void ibd_poll_rcq(ibd_state_t *, ibt_cq_hdl_t);
359 359 static void ibd_poll_scq(ibd_state_t *, ibt_cq_hdl_t);
360 360 static void ibd_drain_rcq(ibd_state_t *, ibt_cq_hdl_t);
361 361 static void ibd_drain_scq(ibd_state_t *, ibt_cq_hdl_t);
362 362 static void ibd_freemsg_cb(char *);
363 363 static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
364 364 ibt_async_event_t *);
365 365 static void ibdpd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
366 366 ibt_async_event_t *);
367 367 static void ibd_snet_notices_handler(void *, ib_gid_t,
368 368 ibt_subnet_event_code_t, ibt_subnet_event_t *);
369 369
370 370 /*
371 371 * Send/receive routines
372 372 */
373 373 static boolean_t ibd_send(ibd_state_t *, mblk_t *);
374 374 static void ibd_post_send(ibd_state_t *, ibd_swqe_t *);
375 375 static void ibd_post_recv(ibd_state_t *, ibd_rwqe_t *);
376 376 static mblk_t *ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *);
377 377
378 378 /*
379 379 * Threads
380 380 */
381 381 static void ibd_async_work(ibd_state_t *);
382 382
383 383 /*
384 384 * Async tasks
385 385 */
386 386 static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *);
387 387 static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int);
388 388 static void ibd_async_setprom(ibd_state_t *);
389 389 static void ibd_async_unsetprom(ibd_state_t *);
390 390 static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t);
391 391 static void ibd_async_trap(ibd_state_t *, ibd_req_t *);
392 392 static void ibd_async_txsched(ibd_state_t *);
393 393 static void ibd_async_link(ibd_state_t *, ibd_req_t *);
394 394
395 395 /*
396 396 * Async task helpers
397 397 */
398 398 static ibd_mce_t *ibd_async_mcache(ibd_state_t *, ipoib_mac_t *, boolean_t *);
399 399 static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t);
400 400 static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *);
401 401 static boolean_t ibd_get_allroutergroup(ibd_state_t *,
402 402 ipoib_mac_t *, ipoib_mac_t *);
403 403 static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t);
404 404 static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *);
405 405 static ibt_status_t ibd_iba_join(ibd_state_t *, ib_gid_t, ibd_mce_t *);
406 406 static ibt_status_t ibd_find_bgroup(ibd_state_t *);
407 407 static void ibd_n2h_gid(ipoib_mac_t *, ib_gid_t *);
408 408 static void ibd_h2n_mac(ipoib_mac_t *, ib_qpn_t, ib_sn_prefix_t, ib_guid_t);
409 409 static uint64_t ibd_get_portspeed(ibd_state_t *);
410 410 static boolean_t ibd_async_safe(ibd_state_t *);
411 411 static void ibd_async_done(ibd_state_t *);
412 412 static ibd_ace_t *ibd_acache_lookup(ibd_state_t *, ipoib_mac_t *, int *, int);
413 413 static ibd_ace_t *ibd_acache_get_unref(ibd_state_t *);
414 414 static void ibd_link_mod(ibd_state_t *, ibt_async_code_t);
415 415 static int ibd_locate_pkey(ib_pkey_t *, uint16_t, ib_pkey_t, uint16_t *);
416 416
417 417 /*
418 418 * Helpers for attach/start routines
419 419 */
420 420 static int ibd_register_mac(ibd_state_t *, dev_info_t *);
421 421 static int ibd_record_capab(ibd_state_t *);
422 422 static int ibd_get_port_details(ibd_state_t *);
423 423 static int ibd_alloc_cqs(ibd_state_t *);
424 424 static int ibd_setup_ud_channel(ibd_state_t *);
425 425 static int ibd_start(ibd_state_t *);
426 426 static int ibd_undo_start(ibd_state_t *, link_state_t);
427 427 static void ibd_set_mac_progress(ibd_state_t *, uint_t);
428 428 static void ibd_clr_mac_progress(ibd_state_t *, uint_t);
429 429 static int ibd_part_attach(ibd_state_t *state, dev_info_t *dip);
430 430 static void ibd_part_unattach(ibd_state_t *state);
431 431 static int ibd_port_attach(dev_info_t *);
432 432 static int ibd_port_unattach(ibd_state_t *state, dev_info_t *dip);
433 433 static int ibd_get_port_state(ibd_state_t *, link_state_t *);
434 434 static int ibd_part_busy(ibd_state_t *);
435 435
436 436 /*
437 437 * Miscellaneous helpers
438 438 */
439 439 static int ibd_sched_poll(ibd_state_t *, int, int);
440 440 static void ibd_resume_transmission(ibd_state_t *);
441 441 static int ibd_setup_lso(ibd_swqe_t *, mblk_t *, uint32_t, ibt_ud_dest_hdl_t);
442 442 static int ibd_prepare_sgl(ibd_state_t *, mblk_t *, ibd_swqe_t *, uint_t);
443 443 static void *list_get_head(list_t *);
444 444 static int ibd_hash_key_cmp(mod_hash_key_t, mod_hash_key_t);
445 445 static uint_t ibd_hash_by_id(void *, mod_hash_key_t);
446 446
447 447 ibt_status_t ibd_get_part_attr(datalink_id_t, ibt_part_attr_t *);
448 448 ibt_status_t ibd_get_all_part_attr(ibt_part_attr_t **, int *);
449 449
450 450 #ifdef IBD_LOGGING
451 451 static void ibd_log(const char *, ...);
452 452 #endif
453 453
454 454 DDI_DEFINE_STREAM_OPS(ibd_dev_ops, nulldev, nulldev, ibd_attach, ibd_detach,
455 455 nodev, NULL, D_MP, NULL, ddi_quiesce_not_needed);
456 456
457 457 /* Module Driver Info */
458 458 static struct modldrv ibd_modldrv = {
459 459 &mod_driverops, /* This one is a driver */
460 460 "InfiniBand GLDv3 Driver", /* short description */
461 461 &ibd_dev_ops /* driver specific ops */
462 462 };
463 463
464 464 /* Module Linkage */
465 465 static struct modlinkage ibd_modlinkage = {
466 466 MODREV_1, (void *)&ibd_modldrv, NULL
467 467 };
468 468
469 469 /*
470 470 * Module (static) info passed to IBTL during ibt_attach
471 471 */
472 472 static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = {
473 473 IBTI_V_CURR,
474 474 IBT_NETWORK,
475 475 ibd_async_handler,
476 476 NULL,
477 477 "IBPART"
478 478 };
479 479
480 480 static struct ibt_clnt_modinfo_s ibdpd_clnt_modinfo = {
481 481 IBTI_V_CURR,
482 482 IBT_NETWORK,
483 483 ibdpd_async_handler,
484 484 NULL,
485 485 "IPIB"
486 486 };
487 487
488 488 /*
489 489 * GLDv3 entry points
490 490 */
491 491 #define IBD_M_CALLBACK_FLAGS \
492 492 (MC_GETCAPAB | MC_SETPROP | MC_GETPROP | MC_PROPINFO)
493 493
494 494 static mac_callbacks_t ibd_m_callbacks = {
495 495 IBD_M_CALLBACK_FLAGS,
496 496 ibd_m_stat,
497 497 ibd_m_start,
498 498 ibd_m_stop,
499 499 ibd_m_promisc,
500 500 ibd_m_multicst,
501 501 ibd_m_unicst,
502 502 ibd_m_tx,
503 503 NULL,
504 504 NULL,
505 505 ibd_m_getcapab,
506 506 NULL,
507 507 NULL,
508 508 ibd_m_setprop,
509 509 ibd_m_getprop,
510 510 ibd_m_propinfo
511 511 };
512 512
513 513 /* Private properties */
514 514 char *ibd_priv_props[] = {
515 515 "_ibd_broadcast_group",
516 516 "_ibd_coalesce_completions",
517 517 "_ibd_create_broadcast_group",
518 518 "_ibd_hash_size",
519 519 "_ibd_lso_enable",
520 520 "_ibd_num_ah",
521 521 "_ibd_num_lso_bufs",
522 522 "_ibd_rc_enable_srq",
523 523 "_ibd_rc_num_rwqe",
524 524 "_ibd_rc_num_srq",
525 525 "_ibd_rc_num_swqe",
526 526 "_ibd_rc_rx_comp_count",
527 527 "_ibd_rc_rx_comp_usec",
528 528 "_ibd_rc_rx_copy_thresh",
529 529 "_ibd_rc_rx_rwqe_thresh",
530 530 "_ibd_rc_tx_comp_count",
531 531 "_ibd_rc_tx_comp_usec",
532 532 "_ibd_rc_tx_copy_thresh",
533 533 "_ibd_ud_num_rwqe",
534 534 "_ibd_ud_num_swqe",
535 535 "_ibd_ud_rx_comp_count",
536 536 "_ibd_ud_rx_comp_usec",
537 537 "_ibd_ud_tx_comp_count",
538 538 "_ibd_ud_tx_comp_usec",
539 539 "_ibd_ud_tx_copy_thresh",
540 540 NULL
541 541 };
542 542
543 543 static int ibd_create_partition(void *, intptr_t, int, cred_t *, int *);
544 544 static int ibd_delete_partition(void *, intptr_t, int, cred_t *, int *);
545 545 static int ibd_get_partition_info(void *, intptr_t, int, cred_t *, int *);
546 546
547 547 static dld_ioc_info_t ibd_dld_ioctl_list[] = {
548 548 {IBD_CREATE_IBPART, DLDCOPYINOUT, sizeof (ibpart_ioctl_t),
549 549 ibd_create_partition, secpolicy_dl_config},
550 550 {IBD_DELETE_IBPART, DLDCOPYIN, sizeof (ibpart_ioctl_t),
551 551 ibd_delete_partition, secpolicy_dl_config},
552 552 {IBD_INFO_IBPART, DLDCOPYIN, sizeof (ibd_ioctl_t),
553 553 ibd_get_partition_info, NULL}
554 554 };
555 555
556 556 /*
557 557 * Fill/clear <scope> and <p_key> in multicast/broadcast address
558 558 */
559 559 #define IBD_FILL_SCOPE_PKEY(maddr, scope, pkey) \
560 560 { \
561 561 *(uint32_t *)((char *)(maddr) + 4) |= \
562 562 htonl((uint32_t)(scope) << 16); \
563 563 *(uint32_t *)((char *)(maddr) + 8) |= \
564 564 htonl((uint32_t)(pkey) << 16); \
565 565 }
566 566
567 567 #define IBD_CLEAR_SCOPE_PKEY(maddr) \
568 568 { \
569 569 *(uint32_t *)((char *)(maddr) + 4) &= \
570 570 htonl(~((uint32_t)0xF << 16)); \
571 571 *(uint32_t *)((char *)(maddr) + 8) &= \
572 572 htonl(~((uint32_t)0xFFFF << 16)); \
573 573 }
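As a worked example: the GID starts at offset 4 of the ipoib_mac_t (after the QPN), which is why the macros index maddr + 4 and maddr + 8. For the IPv4 broadcast group with link-local scope (2) and a pkey of, say, 0xffff, IBD_FILL_SCOPE_PKEY ORs the scope into byte 1 of the GID and the P_Key into GID bytes 4-5, turning ff10:401b:0000::ffff:ffff into the ff12:401b:ffff::ffff:ffff form defined by the IPoIB spec; IBD_CLEAR_SCOPE_PKEY masks the same two fields back to zero.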
574 574
575 575 /*
576 576 * Rudimentary debugging support
577 577 */
578 578 #ifdef DEBUG
579 579 int ibd_debuglevel = 100;
580 580 void
581 581 debug_print(int l, char *fmt, ...)
582 582 {
583 583 va_list ap;
584 584
585 585 if (l < ibd_debuglevel)
586 586 return;
587 587 va_start(ap, fmt);
588 588 vcmn_err(CE_CONT, fmt, ap);
589 589 va_end(ap);
590 590 }
591 591 #endif
592 592
593 593 /*
594 594 * Common routine to print warning messages; adds in hca guid, port number
595 595 * and pkey to be able to identify the IBA interface.
596 596 */
597 597 void
598 598 ibd_print_warn(ibd_state_t *state, char *fmt, ...)
599 599 {
600 600 ib_guid_t hca_guid;
601 601 char ibd_print_buf[MAXNAMELEN + 256];
602 602 int len;
603 603 va_list ap;
604 604 char part_name[MAXNAMELEN];
605 605 datalink_id_t linkid = state->id_plinkid;
606 606
607 607 hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip,
608 608 0, "hca-guid", 0);
609 609 (void) dls_mgmt_get_linkinfo(linkid, part_name, NULL, NULL, NULL);
610 610 len = snprintf(ibd_print_buf, sizeof (ibd_print_buf),
611 611 "%s%d: HCA GUID %016llx port %d PKEY %02x link %s ",
612 612 ddi_driver_name(state->id_dip), ddi_get_instance(state->id_dip),
613 613 (u_longlong_t)hca_guid, state->id_port, state->id_pkey,
614 614 part_name);
615 615 va_start(ap, fmt);
616 616 (void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len,
617 617 fmt, ap);
618 618 cmn_err(CE_NOTE, "!%s", ibd_print_buf);
619 619 va_end(ap);
620 620 }
621 621
622 -/*
623 - * Warlock directives
624 - */
625 -
626 -/*
627 - * id_lso_lock
628 - *
629 - * state->id_lso->bkt_nfree may be accessed without a lock to
630 - * determine the threshold at which we have to ask the nw layer
631 - * to resume transmission (see ibd_resume_transmission()).
632 - */
633 -_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_lso_lock,
634 - ibd_state_t::id_lso))
635 -_NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_lso))
636 -_NOTE(SCHEME_PROTECTS_DATA("init", ibd_state_t::id_lso_policy))
637 -_NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_lsobkt_t::bkt_nfree))
638 -
639 -/*
640 - * id_scq_poll_lock
641 - */
642 -_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_scq_poll_lock,
643 - ibd_state_t::id_scq_poll_busy))
644 -
645 -/*
646 - * id_txpost_lock
647 - */
648 -_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
649 - ibd_state_t::id_tx_head))
650 -_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
651 - ibd_state_t::id_tx_busy))
652 -
653 -/*
654 - * id_acache_req_lock
655 - */
656 -_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
657 - ibd_state_t::id_acache_req_cv))
658 -_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
659 - ibd_state_t::id_req_list))
660 -_NOTE(SCHEME_PROTECTS_DATA("atomic",
661 - ibd_acache_s::ac_ref))
662 -
663 -/*
664 - * id_ac_mutex
665 - *
666 - * This mutex is actually supposed to protect id_ah_op as well,
667 - * but this path of the code isn't clean (see update of id_ah_op
668 - * in ibd_async_acache(), immediately after the call to
669 - * ibd_async_mcache()). For now, we'll skip this check by
670 - * declaring that id_ah_op is protected by some internal scheme
671 - * that warlock isn't aware of.
672 - */
673 -_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
674 - ibd_state_t::id_ah_active))
675 -_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
676 - ibd_state_t::id_ah_free))
677 -_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
678 - ibd_state_t::id_ah_addr))
679 -_NOTE(SCHEME_PROTECTS_DATA("ac mutex should protect this",
680 - ibd_state_t::id_ah_op))
681 -_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
682 - ibd_state_t::id_ah_error))
683 -_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
684 - ibd_state_t::id_ac_hot_ace))
685 -_NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_ah_error))
686 -
687 -/*
688 - * id_mc_mutex
689 - */
690 -_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
691 - ibd_state_t::id_mc_full))
692 -_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
693 - ibd_state_t::id_mc_non))
694 -
695 -/*
696 - * id_trap_lock
697 - */
698 -_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
699 - ibd_state_t::id_trap_cv))
700 -_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
701 - ibd_state_t::id_trap_stop))
702 -_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
703 - ibd_state_t::id_trap_inprog))
704 -
705 -/*
706 - * id_prom_op
707 - */
708 -_NOTE(SCHEME_PROTECTS_DATA("only by async thread",
709 - ibd_state_t::id_prom_op))
710 -
711 -/*
712 - * id_sched_lock
713 - */
714 -_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_sched_lock,
715 - ibd_state_t::id_sched_needed))
716 -
717 -/*
718 - * id_link_mutex
719 - */
720 -_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex,
721 - ibd_state_t::id_link_state))
722 -_NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_link_state))
723 -_NOTE(SCHEME_PROTECTS_DATA("only async thr and ibd_m_start",
724 - ibd_state_t::id_link_speed))
725 -_NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_sgid))
726 -
727 -/*
728 - * id_tx_list.dl_mutex
729 - */
730 -_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
731 - ibd_state_t::id_tx_list.dl_head))
732 -_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
733 - ibd_state_t::id_tx_list.dl_pending_sends))
734 -_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
735 - ibd_state_t::id_tx_list.dl_cnt))
736 -
737 -/*
738 - * id_rx_list.dl_mutex
739 - */
740 -_NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
741 - ibd_state_t::id_rx_list.dl_bufs_outstanding))
742 -_NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
743 - ibd_state_t::id_rx_list.dl_cnt))
744 -
745 -/*
746 - * rc_timeout_lock
747 - */
748 -_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::rc_timeout_lock,
749 - ibd_state_t::rc_timeout_start))
750 -_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::rc_timeout_lock,
751 - ibd_state_t::rc_timeout))
752 -
753 -
754 -/*
755 - * Items protected by atomic updates
756 - */
757 -_NOTE(SCHEME_PROTECTS_DATA("atomic update only",
758 - ibd_state_s::id_brd_rcv
759 - ibd_state_s::id_brd_xmt
760 - ibd_state_s::id_multi_rcv
761 - ibd_state_s::id_multi_xmt
762 - ibd_state_s::id_num_intrs
763 - ibd_state_s::id_rcv_bytes
764 - ibd_state_s::id_rcv_pkt
765 - ibd_state_s::id_rx_post_queue_index
766 - ibd_state_s::id_tx_short
767 - ibd_state_s::id_xmt_bytes
768 - ibd_state_s::id_xmt_pkt
769 - ibd_state_s::rc_rcv_trans_byte
770 - ibd_state_s::rc_rcv_trans_pkt
771 - ibd_state_s::rc_rcv_copy_byte
772 - ibd_state_s::rc_rcv_copy_pkt
773 - ibd_state_s::rc_xmt_bytes
774 - ibd_state_s::rc_xmt_small_pkt
775 - ibd_state_s::rc_xmt_fragmented_pkt
776 - ibd_state_s::rc_xmt_map_fail_pkt
777 - ibd_state_s::rc_xmt_map_succ_pkt
778 - ibd_rc_chan_s::rcq_invoking))
779 -
780 -/*
781 - * Non-mutex protection schemes for data elements. Almost all of
782 - * these are non-shared items.
783 - */
784 -_NOTE(SCHEME_PROTECTS_DATA("unshared or single-threaded",
785 - callb_cpr
786 - ib_gid_s
787 - ib_header_info
788 - ibd_acache_rq
789 - ibd_acache_s::ac_mce
790 - ibd_acache_s::ac_chan
791 - ibd_mcache::mc_fullreap
792 - ibd_mcache::mc_jstate
793 - ibd_mcache::mc_req
794 - ibd_rwqe_s
795 - ibd_swqe_s
796 - ibd_wqe_s
797 - ibt_wr_ds_s::ds_va
798 - ibt_wr_lso_s
799 - ipoib_mac::ipoib_qpn
800 - mac_capab_lso_s
801 - msgb::b_next
802 - msgb::b_cont
803 - msgb::b_rptr
804 - msgb::b_wptr
805 - ibd_state_s::id_bgroup_created
806 - ibd_state_s::id_mac_state
807 - ibd_state_s::id_mtu
808 - ibd_state_s::id_ud_num_rwqe
809 - ibd_state_s::id_ud_num_swqe
810 - ibd_state_s::id_qpnum
811 - ibd_state_s::id_rcq_hdl
812 - ibd_state_s::id_rx_buf_sz
813 - ibd_state_s::id_rx_bufs
814 - ibd_state_s::id_rx_mr_hdl
815 - ibd_state_s::id_rx_wqes
816 - ibd_state_s::id_rxwcs
817 - ibd_state_s::id_rxwcs_size
818 - ibd_state_s::id_rx_nqueues
819 - ibd_state_s::id_rx_queues
820 - ibd_state_s::id_scope
821 - ibd_state_s::id_scq_hdl
822 - ibd_state_s::id_tx_buf_sz
823 - ibd_state_s::id_tx_bufs
824 - ibd_state_s::id_tx_mr_hdl
825 - ibd_state_s::id_tx_rel_list.dl_cnt
826 - ibd_state_s::id_tx_wqes
827 - ibd_state_s::id_txwcs
828 - ibd_state_s::id_txwcs_size
829 - ibd_state_s::rc_listen_hdl
830 - ibd_state_s::rc_listen_hdl_OFED_interop
831 - ibd_state_s::rc_srq_size
832 - ibd_state_s::rc_srq_rwqes
833 - ibd_state_s::rc_srq_rx_bufs
834 - ibd_state_s::rc_srq_rx_mr_hdl
835 - ibd_state_s::rc_tx_largebuf_desc_base
836 - ibd_state_s::rc_tx_mr_bufs
837 - ibd_state_s::rc_tx_mr_hdl
838 - ipha_s
839 - icmph_s
840 - ibt_path_info_s::pi_sid
841 - ibd_rc_chan_s::ace
842 - ibd_rc_chan_s::chan_hdl
843 - ibd_rc_chan_s::state
844 - ibd_rc_chan_s::chan_state
845 - ibd_rc_chan_s::is_tx_chan
846 - ibd_rc_chan_s::rcq_hdl
847 - ibd_rc_chan_s::rcq_size
848 - ibd_rc_chan_s::scq_hdl
849 - ibd_rc_chan_s::scq_size
850 - ibd_rc_chan_s::rx_bufs
851 - ibd_rc_chan_s::rx_mr_hdl
852 - ibd_rc_chan_s::rx_rwqes
853 - ibd_rc_chan_s::tx_wqes
854 - ibd_rc_chan_s::tx_mr_bufs
855 - ibd_rc_chan_s::tx_mr_hdl
856 - ibd_rc_chan_s::tx_rel_list.dl_cnt
857 - ibd_rc_chan_s::is_used
858 - ibd_rc_tx_largebuf_s::lb_buf
859 - ibd_rc_msg_hello_s
860 - ibt_cm_return_args_s))
861 -
862 -/*
863 - * ibd_rc_chan_s::next is protected by two mutexes:
864 - * 1) ibd_state_s::rc_pass_chan_list.chan_list_mutex
865 - * 2) ibd_state_s::rc_obs_act_chan_list.chan_list_mutex.
866 - */
867 -_NOTE(SCHEME_PROTECTS_DATA("protected by two mutexes",
868 - ibd_rc_chan_s::next))
869 -
870 -/*
871 - * ibd_state_s.rc_tx_large_bufs_lock
872 - */
873 -_NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
874 - ibd_state_s::rc_tx_largebuf_free_head))
875 -_NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
876 - ibd_state_s::rc_tx_largebuf_nfree))
877 -_NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
878 - ibd_rc_tx_largebuf_s::lb_next))
879 -
880 -/*
881 - * ibd_acache_s.tx_too_big_mutex
882 - */
883 -_NOTE(MUTEX_PROTECTS_DATA(ibd_acache_s::tx_too_big_mutex,
884 - ibd_acache_s::tx_too_big_ongoing))
885 -
886 -/*
887 - * tx_wqe_list.dl_mutex
888 - */
889 -_NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
890 - ibd_rc_chan_s::tx_wqe_list.dl_head))
891 -_NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
892 - ibd_rc_chan_s::tx_wqe_list.dl_pending_sends))
893 -_NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
894 - ibd_rc_chan_s::tx_wqe_list.dl_cnt))
895 -
896 -/*
897 - * ibd_state_s.rc_ace_recycle_lock
898 - */
899 -_NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_ace_recycle_lock,
900 - ibd_state_s::rc_ace_recycle))
901 -
902 -/*
903 - * rc_srq_rwqe_list.dl_mutex
904 - */
905 -_NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
906 - ibd_state_t::rc_srq_rwqe_list.dl_bufs_outstanding))
907 -_NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
908 - ibd_state_t::rc_srq_rwqe_list.dl_cnt))
909 -
910 -/*
911 - * Non-mutex protection schemes for data elements. They are counters
912 - * for problem diagnosis. Don't need be protected.
913 - */
914 -_NOTE(SCHEME_PROTECTS_DATA("counters for problem diagnosis",
915 - ibd_state_s::rc_rcv_alloc_fail
916 - ibd_state_s::rc_rcq_err
917 - ibd_state_s::rc_ace_not_found
918 - ibd_state_s::rc_xmt_drop_too_long_pkt
919 - ibd_state_s::rc_xmt_icmp_too_long_pkt
920 - ibd_state_s::rc_xmt_reenter_too_long_pkt
921 - ibd_state_s::rc_swqe_short
922 - ibd_state_s::rc_swqe_mac_update
923 - ibd_state_s::rc_xmt_buf_short
924 - ibd_state_s::rc_xmt_buf_mac_update
925 - ibd_state_s::rc_scq_no_swqe
926 - ibd_state_s::rc_scq_no_largebuf
927 - ibd_state_s::rc_conn_succ
928 - ibd_state_s::rc_conn_fail
929 - ibd_state_s::rc_null_conn
930 - ibd_state_s::rc_no_estab_conn
931 - ibd_state_s::rc_act_close
932 - ibd_state_s::rc_pas_close
933 - ibd_state_s::rc_delay_ace_recycle
934 - ibd_state_s::rc_act_close_simultaneous
935 - ibd_state_s::rc_act_close_not_clean
936 - ibd_state_s::rc_pas_close_rcq_invoking
937 - ibd_state_s::rc_reset_cnt
938 - ibd_state_s::rc_timeout_act
939 - ibd_state_s::rc_timeout_pas
940 - ibd_state_s::rc_stop_connect))
941 -
942 -#ifdef DEBUG
943 -/*
944 - * Non-mutex protection schemes for data elements. They are counters
945 - * for problem diagnosis. Don't need be protected.
946 - */
947 -_NOTE(SCHEME_PROTECTS_DATA("counters for problem diagnosis",
948 - ibd_state_s::rc_rwqe_short
949 - ibd_rc_stat_s::rc_rcv_trans_byte
950 - ibd_rc_stat_s::rc_rcv_trans_pkt
951 - ibd_rc_stat_s::rc_rcv_copy_byte
952 - ibd_rc_stat_s::rc_rcv_copy_pkt
953 - ibd_rc_stat_s::rc_rcv_alloc_fail
954 - ibd_rc_stat_s::rc_rcq_err
955 - ibd_rc_stat_s::rc_rwqe_short
956 - ibd_rc_stat_s::rc_xmt_bytes
957 - ibd_rc_stat_s::rc_xmt_small_pkt
958 - ibd_rc_stat_s::rc_xmt_fragmented_pkt
959 - ibd_rc_stat_s::rc_xmt_map_fail_pkt
960 - ibd_rc_stat_s::rc_xmt_map_succ_pkt
961 - ibd_rc_stat_s::rc_ace_not_found
962 - ibd_rc_stat_s::rc_scq_no_swqe
963 - ibd_rc_stat_s::rc_scq_no_largebuf
964 - ibd_rc_stat_s::rc_swqe_short
965 - ibd_rc_stat_s::rc_swqe_mac_update
966 - ibd_rc_stat_s::rc_xmt_buf_short
967 - ibd_rc_stat_s::rc_xmt_buf_mac_update
968 - ibd_rc_stat_s::rc_conn_succ
969 - ibd_rc_stat_s::rc_conn_fail
970 - ibd_rc_stat_s::rc_null_conn
971 - ibd_rc_stat_s::rc_no_estab_conn
972 - ibd_rc_stat_s::rc_act_close
973 - ibd_rc_stat_s::rc_pas_close
974 - ibd_rc_stat_s::rc_delay_ace_recycle
975 - ibd_rc_stat_s::rc_act_close_simultaneous
976 - ibd_rc_stat_s::rc_reset_cnt
977 - ibd_rc_stat_s::rc_timeout_act
978 - ibd_rc_stat_s::rc_timeout_pas))
979 -#endif
980 -
981 622 int
982 623 _init()
983 624 {
984 625 int status;
985 626
986 627 status = ddi_soft_state_init(&ibd_list, max(sizeof (ibd_state_t),
987 628 PAGESIZE), 0);
988 629 if (status != 0) {
989 630 DPRINT(10, "_init:failed in ddi_soft_state_init()");
990 631 return (status);
991 632 }
992 633
993 634 mutex_init(&ibd_objlist_lock, NULL, MUTEX_DRIVER, NULL);
994 635
995 636 mac_init_ops(&ibd_dev_ops, "ibp");
996 637 status = mod_install(&ibd_modlinkage);
997 638 if (status != 0) {
998 639 DPRINT(10, "_init:failed in mod_install()");
999 640 ddi_soft_state_fini(&ibd_list);
1000 641 mac_fini_ops(&ibd_dev_ops);
1001 642 return (status);
1002 643 }
1003 644
1004 645 mutex_init(&ibd_gstate.ig_mutex, NULL, MUTEX_DRIVER, NULL);
1005 646 mutex_enter(&ibd_gstate.ig_mutex);
1006 647 ibd_gstate.ig_ibt_hdl = NULL;
1007 648 ibd_gstate.ig_ibt_hdl_ref_cnt = 0;
1008 649 ibd_gstate.ig_service_list = NULL;
1009 650 mutex_exit(&ibd_gstate.ig_mutex);
1010 651
1011 652 if (dld_ioc_register(IBPART_IOC, ibd_dld_ioctl_list,
1012 653 DLDIOCCNT(ibd_dld_ioctl_list)) != 0) {
1013 654 return (EIO);
1014 655 }
1015 656
1016 657 ibt_register_part_attr_cb(ibd_get_part_attr, ibd_get_all_part_attr);
1017 658
1018 659 #ifdef IBD_LOGGING
1019 660 ibd_log_init();
1020 661 #endif
1021 662 return (0);
1022 663 }
1023 664
1024 665 int
1025 666 _info(struct modinfo *modinfop)
1026 667 {
1027 668 return (mod_info(&ibd_modlinkage, modinfop));
1028 669 }
1029 670
1030 671 int
1031 672 _fini()
1032 673 {
1033 674 int status;
1034 675
1035 676 status = mod_remove(&ibd_modlinkage);
1036 677 if (status != 0)
1037 678 return (status);
1038 679
1039 680 ibt_unregister_part_attr_cb();
1040 681
1041 682 mac_fini_ops(&ibd_dev_ops);
1042 683 mutex_destroy(&ibd_objlist_lock);
1043 684 ddi_soft_state_fini(&ibd_list);
1044 685 mutex_destroy(&ibd_gstate.ig_mutex);
1045 686 #ifdef IBD_LOGGING
1046 687 ibd_log_fini();
1047 688 #endif
1048 689 return (0);
1049 690 }
1050 691
1051 692 /*
1052 693 * Convert the GID part of the mac address from network byte order
1053 694 * to host order.
1054 695 */
1055 696 static void
1056 697 ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid)
1057 698 {
1058 699 ib_sn_prefix_t nbopref;
1059 700 ib_guid_t nboguid;
1060 701
1061 702 bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t));
1062 703 bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t));
1063 704 dgid->gid_prefix = b2h64(nbopref);
1064 705 dgid->gid_guid = b2h64(nboguid);
1065 706 }
1066 707
1067 708 /*
1068 709 * Create the IPoIB address in network byte order from host order inputs.
1069 710 */
1070 711 static void
1071 712 ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix,
1072 713 ib_guid_t guid)
1073 714 {
1074 715 ib_sn_prefix_t nbopref;
1075 716 ib_guid_t nboguid;
1076 717
1077 718 mac->ipoib_qpn = htonl(qpn);
1078 719 nbopref = h2b64(prefix);
1079 720 nboguid = h2b64(guid);
1080 721 bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t));
1081 722 bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t));
1082 723 }
1083 724
1084 725 /*
1085 726 * Send to the appropriate all-routers group when the IBA multicast group
1086 727 * does not exist, based on whether the target group is v4 or v6.
1087 728 */
1088 729 static boolean_t
1089 730 ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac,
1090 731 ipoib_mac_t *rmac)
1091 732 {
1092 733 boolean_t retval = B_TRUE;
1093 734 uint32_t adjscope = state->id_scope << 16;
1094 735 uint32_t topword;
1095 736
1096 737 /*
1097 738 * Copy the first 4 bytes in without assuming any alignment of
1098 739 * input mac address; this will have IPoIB signature, flags and
1099 740 * scope bits.
1100 741 */
1101 742 bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t));
1102 743 topword = ntohl(topword);
1103 744
1104 745 /*
1105 746 * Generate proper address for IPv4/v6, adding in the Pkey properly.
1106 747 */
1107 748 if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) ||
1108 749 (topword == (IB_MCGID_IPV6_PREFIX | adjscope)))
1109 750 ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) |
1110 751 ((uint32_t)(state->id_pkey << 16))),
1111 752 (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP));
1112 753 else
1113 754 /*
1114 755 * Does not have proper bits in the mgid address.
1115 756 */
1116 757 retval = B_FALSE;
1117 758
1118 759 return (retval);
1119 760 }
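Note on the last argument to ibd_h2n_mac() above: INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP is 0xe0000002 - 0xe0000000 = 2, i.e. the group part of the IPv4 all-routers address 224.0.0.2, which ends up in the low 32 bits of the MGID (compare IB_MGID_IPV4_LOWGRP_MASK above).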
1120 761
1121 762 /*
1122 763 * Membership states for different mcg's are tracked by two lists:
1123 764 * the "non" list is used for promiscuous mode, when all mcg traffic
1124 765 * needs to be inspected. This type of membership is never used for
1125 766 * transmission, so there can not be an AH in the active list
1126 767 * corresponding to a member in this list. This list does not need
1127 768 * any protection, since all operations are performed by the async
1128 769 * thread.
1129 770 *
1130 771 * "Full" and "SendOnly" membership is tracked using a single list,
1131 772 * the "full" list. This is because this single list can then be
1132 773 * searched during transmit to a multicast group (if an AH for the
1133 774 * mcg is not found in the active list), since at least one type
1134 775 * of membership must be present before initiating the transmit.
1135 776 * This list is also emptied during driver detach, since sendonly
1136 777 * membership acquired during transmit is dropped at detach time
1137 778 * along with ipv4 broadcast full membership. Insert/deletes to
1138 779 * this list are done only by the async thread, but it is also
1139 780 * searched in program context (see multicast disable case), thus
1140 781 * the id_mc_mutex protects the list. The driver detach path also
1141 782 * deconstructs the "full" list, but it ensures that the async
1142 783 * thread will not be accessing the list (by blocking out mcg
1143 784 * trap handling and making sure no more Tx reaping will happen).
1144 785 *
1145 786 * Currently, an IBA attach is done in the SendOnly case too,
1146 787 * although this is not required.
1147 788 */
1148 789 #define IBD_MCACHE_INSERT_FULL(state, mce) \
1149 790 list_insert_head(&state->id_mc_full, mce)
1150 791 #define IBD_MCACHE_INSERT_NON(state, mce) \
1151 792 list_insert_head(&state->id_mc_non, mce)
1152 793 #define IBD_MCACHE_FIND_FULL(state, mgid) \
1153 794 ibd_mcache_find(mgid, &state->id_mc_full)
1154 795 #define IBD_MCACHE_FIND_NON(state, mgid) \
1155 796 ibd_mcache_find(mgid, &state->id_mc_non)
1156 797 #define IBD_MCACHE_PULLOUT_FULL(state, mce) \
1157 798 list_remove(&state->id_mc_full, mce)
1158 799 #define IBD_MCACHE_PULLOUT_NON(state, mce) \
1159 800 list_remove(&state->id_mc_non, mce)
1160 801
1161 802 static void *
1162 803 list_get_head(list_t *list)
1163 804 {
1164 805 list_node_t *lhead = list_head(list);
1165 806
1166 807 if (lhead != NULL)
1167 808 list_remove(list, lhead);
1168 809 return (lhead);
1169 810 }
1170 811
1171 812 /*
1172 813 * This is always guaranteed to be able to queue the work.
1173 814 */
1174 815 void
1175 816 ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op)
1176 817 {
1177 818 /* Initialize request */
1178 819 DPRINT(1, "ibd_queue_work_slot : op: %d \n", op);
1179 820 ptr->rq_op = op;
1180 821
1181 822 /*
1182 823 * Queue provided slot onto request pool.
1183 824 */
1184 825 mutex_enter(&state->id_acache_req_lock);
1185 826 list_insert_tail(&state->id_req_list, ptr);
1186 827
1187 828 /* Go, fetch, async thread */
1188 829 cv_signal(&state->id_acache_req_cv);
1189 830 mutex_exit(&state->id_acache_req_lock);
1190 831 }
1191 832
1192 833 /*
1193 834 * Main body of the per interface async thread.
1194 835 */
1195 836 static void
1196 837 ibd_async_work(ibd_state_t *state)
1197 838 {
1198 839 ibd_req_t *ptr;
1199 840 callb_cpr_t cprinfo;
1200 841
1201 842 mutex_enter(&state->id_acache_req_lock);
1202 843 CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock,
1203 844 callb_generic_cpr, "ibd_async_work");
1204 845
1205 846 for (;;) {
1206 847 ptr = list_get_head(&state->id_req_list);
1207 848 if (ptr != NULL) {
1208 849 mutex_exit(&state->id_acache_req_lock);
1209 850
1210 851 /*
1211 852 * If we are in late hca initialization mode, do not
1212 853 * process any other async request other than TRAP. TRAP
1213 854 * is used for indicating creation of a broadcast group;
1214 855 * in which case, we need to join/create the group.
1215 856 */
1216 857 if ((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) &&
1217 858 (ptr->rq_op != IBD_ASYNC_TRAP)) {
1218 859 goto free_req_and_continue;
1219 860 }
1220 861
1221 862 /*
1222 863 * Once we have done the operation, there is no
1223 864 * guarantee the request slot is going to be valid,
1224 865 * it might be freed up (as in IBD_ASYNC_LEAVE, REAP,
1225 866 * TRAP).
1226 867 *
1227 868 * Perform the request.
1228 869 */
1229 870 switch (ptr->rq_op) {
1230 871 case IBD_ASYNC_GETAH:
1231 872 ibd_async_acache(state, &ptr->rq_mac);
1232 873 break;
1233 874 case IBD_ASYNC_JOIN:
1234 875 case IBD_ASYNC_LEAVE:
1235 876 ibd_async_multicast(state,
1236 877 ptr->rq_gid, ptr->rq_op);
1237 878 break;
1238 879 case IBD_ASYNC_PROMON:
1239 880 ibd_async_setprom(state);
1240 881 break;
1241 882 case IBD_ASYNC_PROMOFF:
1242 883 ibd_async_unsetprom(state);
1243 884 break;
1244 885 case IBD_ASYNC_REAP:
1245 886 ibd_async_reap_group(state,
1246 887 ptr->rq_ptr, ptr->rq_gid,
1247 888 IB_MC_JSTATE_FULL);
1248 889 /*
1249 890 * the req buf is contained in the mce
1250 891 * structure, so we do not need
1251 892 * to free it here.
1252 893 */
1253 894 ptr = NULL;
1254 895 break;
1255 896 case IBD_ASYNC_TRAP:
1256 897 ibd_async_trap(state, ptr);
1257 898 break;
1258 899 case IBD_ASYNC_SCHED:
1259 900 ibd_async_txsched(state);
1260 901 break;
1261 902 case IBD_ASYNC_LINK:
1262 903 ibd_async_link(state, ptr);
1263 904 break;
1264 905 case IBD_ASYNC_EXIT:
1265 906 mutex_enter(&state->id_acache_req_lock);
1266 -#ifndef __lock_lint
1267 907 CALLB_CPR_EXIT(&cprinfo);
1268 -#else
1269 - mutex_exit(&state->id_acache_req_lock);
1270 -#endif
1271 908 return;
1272 909 case IBD_ASYNC_RC_TOO_BIG:
1273 910 ibd_async_rc_process_too_big(state,
1274 911 ptr);
1275 912 break;
1276 913 case IBD_ASYNC_RC_CLOSE_ACT_CHAN:
1277 914 ibd_async_rc_close_act_chan(state, ptr);
1278 915 break;
1279 916 case IBD_ASYNC_RC_RECYCLE_ACE:
1280 917 ibd_async_rc_recycle_ace(state, ptr);
1281 918 break;
1282 919 case IBD_ASYNC_RC_CLOSE_PAS_CHAN:
1283 920 (void) ibd_rc_pas_close(ptr->rq_ptr,
1284 921 B_TRUE, B_TRUE);
1285 922 break;
1286 923 }
1287 924 free_req_and_continue:
1288 925 if (ptr != NULL)
1289 926 kmem_cache_free(state->id_req_kmc, ptr);
1290 927
1291 928 mutex_enter(&state->id_acache_req_lock);
1292 929 } else {
1293 -#ifndef __lock_lint
1294 930 /*
1295 931 * Nothing to do: wait till new request arrives.
1296 932 */
1297 933 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1298 934 cv_wait(&state->id_acache_req_cv,
1299 935 &state->id_acache_req_lock);
1300 936 CALLB_CPR_SAFE_END(&cprinfo,
1301 937 &state->id_acache_req_lock);
1302 -#endif
1303 938 }
1304 939 }
1305 940
1306 941 /*NOTREACHED*/
1307 942 _NOTE(NOT_REACHED)
1308 943 }
1309 944
1310 945 /*
1311 946 * Return when it is safe to queue requests to the async daemon; primarily
1312 947 * for subnet trap and async event handling. Disallow requests before the
1313 948 * daemon is created, and when interface deinitialization starts.
1314 949 */
1315 950 static boolean_t
1316 951 ibd_async_safe(ibd_state_t *state)
1317 952 {
1318 953 mutex_enter(&state->id_trap_lock);
1319 954 if (state->id_trap_stop) {
1320 955 mutex_exit(&state->id_trap_lock);
1321 956 return (B_FALSE);
1322 957 }
1323 958 state->id_trap_inprog++;
1324 959 mutex_exit(&state->id_trap_lock);
1325 960 return (B_TRUE);
1326 961 }
1327 962
1328 963 /*
1329 964 * Wake up ibd_m_stop() if the unplumb code is waiting for pending subnet
1330 965 * trap or event handling to complete to kill the async thread and deconstruct
1331 966 * the mcg/ace list.
1332 967 */
1333 968 static void
1334 969 ibd_async_done(ibd_state_t *state)
1335 970 {
1336 971 mutex_enter(&state->id_trap_lock);
1337 972 if (--state->id_trap_inprog == 0)
1338 973 cv_signal(&state->id_trap_cv);
1339 974 mutex_exit(&state->id_trap_lock);
1340 975 }
1341 976
1342 977 /*
1343 978 * Hash functions:
1344 979 * ibd_hash_by_id: Returns the qpn as the hash entry into bucket.
1345 980 * ibd_hash_key_cmp: Compares two keys; returns 0 on a match, else 1.
1346 981 * These operate on mac addresses input into ibd_send, but there is no
1347 982 * guarantee on the alignment of the ipoib_mac_t structure.
1348 983 */
1349 984 /*ARGSUSED*/
1350 985 static uint_t
1351 986 ibd_hash_by_id(void *hash_data, mod_hash_key_t key)
1352 987 {
1353 988 ulong_t ptraddr = (ulong_t)key;
1354 989 uint_t hval;
1355 990
1356 991 /*
1357 992 * If the input address is 4 byte aligned, we can just dereference
1358 993 * it. This is most common, since IP will send in a 4 byte aligned
1359 994 * IP header, which implies the 24 byte IPoIB pseudo header will be
1360 995 * 4 byte aligned too.
1361 996 */
1362 997 if ((ptraddr & 3) == 0)
1363 998 return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn);
1364 999
1365 1000 bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t));
1366 1001 return (hval);
1367 1002 }
1368 1003
1369 1004 static int
1370 1005 ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
1371 1006 {
1372 1007 if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0)
1373 1008 return (0);
1374 1009 else
1375 1010 return (1);
1376 1011 }
1377 1012
1378 1013 /*
1379 1014 * Initialize all the per interface caches and lists; AH cache,
1380 1015 * MCG list etc.
1381 1016 */
1382 1017 static int
1383 1018 ibd_acache_init(ibd_state_t *state)
1384 1019 {
1385 1020 ibd_ace_t *ce;
1386 1021 int i;
1387 1022
1388 1023 mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL);
1389 1024 mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL);
1390 1025 mutex_enter(&state->id_ac_mutex);
1391 1026 list_create(&state->id_ah_free, sizeof (ibd_ace_t),
1392 1027 offsetof(ibd_ace_t, ac_list));
1393 1028 list_create(&state->id_ah_active, sizeof (ibd_ace_t),
1394 1029 offsetof(ibd_ace_t, ac_list));
1395 1030 state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash",
1396 1031 state->id_hash_size, mod_hash_null_keydtor, mod_hash_null_valdtor,
1397 1032 ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP);
1398 1033 list_create(&state->id_mc_full, sizeof (ibd_mce_t),
1399 1034 offsetof(ibd_mce_t, mc_list));
1400 1035 list_create(&state->id_mc_non, sizeof (ibd_mce_t),
1401 1036 offsetof(ibd_mce_t, mc_list));
1402 1037 state->id_ac_hot_ace = NULL;
1403 1038
1404 1039 state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) *
1405 1040 state->id_num_ah, KM_SLEEP);
1406 1041 for (i = 0; i < state->id_num_ah; i++, ce++) {
1407 1042 if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS,
1408 1043 state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) {
1409 1044 mutex_exit(&state->id_ac_mutex);
1410 1045 ibd_acache_fini(state);
1411 1046 return (DDI_FAILURE);
1412 1047 } else {
1413 1048 CLEAR_REFCYCLE(ce);
1414 1049 ce->ac_mce = NULL;
1415 1050 mutex_init(&ce->tx_too_big_mutex, NULL,
1416 1051 MUTEX_DRIVER, NULL);
1417 1052 IBD_ACACHE_INSERT_FREE(state, ce);
1418 1053 }
1419 1054 }
1420 1055 mutex_exit(&state->id_ac_mutex);
1421 1056 return (DDI_SUCCESS);
1422 1057 }
1423 1058
1424 1059 static void
1425 1060 ibd_acache_fini(ibd_state_t *state)
1426 1061 {
1427 1062 ibd_ace_t *ptr;
1428 1063
1429 1064 mutex_enter(&state->id_ac_mutex);
1430 1065
1431 1066 while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) {
1432 1067 ASSERT(GET_REF(ptr) == 0);
1433 1068 mutex_destroy(&ptr->tx_too_big_mutex);
1434 1069 (void) ibt_free_ud_dest(ptr->ac_dest);
1435 1070 }
1436 1071
1437 1072 while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) {
1438 1073 ASSERT(GET_REF(ptr) == 0);
1439 1074 mutex_destroy(&ptr->tx_too_big_mutex);
1440 1075 (void) ibt_free_ud_dest(ptr->ac_dest);
1441 1076 }
1442 1077
1443 1078 list_destroy(&state->id_ah_free);
1444 1079 list_destroy(&state->id_ah_active);
1445 1080 list_destroy(&state->id_mc_full);
1446 1081 list_destroy(&state->id_mc_non);
1447 1082 kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * state->id_num_ah);
1448 1083 mutex_exit(&state->id_ac_mutex);
1449 1084 mutex_destroy(&state->id_ac_mutex);
1450 1085 mutex_destroy(&state->id_mc_mutex);
1451 1086 }
1452 1087
1453 1088 /*
1454 1089 * Search AH active hash list for a cached path to input destination.
1455 1090 * If we are "just looking", hold == F. When we are in the Tx path,
1456 1091 * we set hold == T to grab a reference on the AH so that it can not
1457 1092 * be recycled to a new destination while the Tx request is posted.
1458 1093 */
1459 1094 ibd_ace_t *
1460 1095 ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num)
1461 1096 {
1462 1097 ibd_ace_t *ptr;
1463 1098
1464 1099 ASSERT(mutex_owned(&state->id_ac_mutex));
1465 1100
1466 1101 /*
1467 1102 * Do hash search.
1468 1103 */
1469 1104 if (mod_hash_find(state->id_ah_active_hash,
1470 1105 (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) {
1471 1106 if (hold)
1472 1107 INC_REF(ptr, num);
1473 1108 return (ptr);
1474 1109 }
1475 1110 return (NULL);
1476 1111 }
1477 1112
1478 1113 /*
1479 1114 * This is called by the tx side; if an initialized AH is found in
1480 1115 * the active list, it is locked down and can be used; if no entry
1481 1116 * is found, an async request is queued to do path resolution.
1482 1117 */
1483 1118 static ibd_ace_t *
1484 1119 ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe)
1485 1120 {
1486 1121 ibd_ace_t *ptr;
1487 1122 ibd_req_t *req;
1488 1123
1489 1124 /*
1490 1125 * Only attempt to print when we can; in the mdt pattr case, the
1491 1126 * address is not aligned properly.
1492 1127 */
1493 1128 if (((ulong_t)mac & 3) == 0) {
1494 1129 DPRINT(4,
1495 1130 "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X",
1496 1131 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1497 1132 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
1498 1133 htonl(mac->ipoib_gidsuff[1]));
1499 1134 }
1500 1135
1501 1136 mutex_enter(&state->id_ac_mutex);
1502 1137
1503 1138 if (((ptr = state->id_ac_hot_ace) != NULL) &&
1504 1139 (memcmp(&ptr->ac_mac, mac, sizeof (*mac)) == 0)) {
1505 1140 INC_REF(ptr, numwqe);
1506 1141 mutex_exit(&state->id_ac_mutex);
1507 1142 return (ptr);
1508 1143 }
1509 1144 if (((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL)) {
1510 1145 state->id_ac_hot_ace = ptr;
1511 1146 mutex_exit(&state->id_ac_mutex);
1512 1147 return (ptr);
1513 1148 }
1514 1149
1515 1150 /*
1516 1151 * Implementation of a single outstanding async request; if
1517 1152 * the operation is not started yet, queue a request and move
1518 1153 * to ongoing state. Remember in id_ah_addr for which address
1519 1154 * we are queueing the request, in case we need to flag an error;
1520 1155 * Any further requests, for the same or different address, until
1521 1156 * the operation completes, is sent back to GLDv3 to be retried.
1522 1157 * The async thread will update id_ah_op with an error indication
1523 1158 * or will set it to indicate the next look up can start; either
1524 1159 * way, it will mac_tx_update() so that all blocked requests come
1525 1160 * back here.
1526 1161 */
1527 1162 *err = EAGAIN;
1528 1163 if (state->id_ah_op == IBD_OP_NOTSTARTED) {
1529 1164 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
1530 1165 if (req != NULL) {
1531 1166 /*
1532 1167 * We did not even find the entry; queue a request
1533 1168 * for it.
1534 1169 */
1535 1170 bcopy(mac, &(req->rq_mac), IPOIB_ADDRL);
1536 1171 state->id_ah_op = IBD_OP_ONGOING;
1537 1172 ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH);
1538 1173 bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL);
1539 1174 }
1540 1175 } else if ((state->id_ah_op != IBD_OP_ONGOING) &&
1541 1176 (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) {
1542 1177 /*
1543 1178 * Check the status of the pathrecord lookup request
1544 1179 * we had queued before.
1545 1180 */
1546 1181 if (state->id_ah_op == IBD_OP_ERRORED) {
1547 1182 *err = EFAULT;
1548 1183 state->id_ah_error++;
1549 1184 } else {
1550 1185 /*
1551 1186 * IBD_OP_ROUTERED case: We need to send to the
1552 1187 * all-router MCG. If we can find the AH for
1553 1188 * the mcg, the Tx will be attempted. If we
1554 1189 * do not find the AH, we return NORESOURCES
1555 1190 * to retry.
1556 1191 */
1557 1192 ipoib_mac_t routermac;
1558 1193
1559 1194 (void) ibd_get_allroutergroup(state, mac, &routermac);
1560 1195 ptr = ibd_acache_find(state, &routermac, B_TRUE,
1561 1196 numwqe);
1562 1197 }
1563 1198 state->id_ah_op = IBD_OP_NOTSTARTED;
1564 1199 } else if ((state->id_ah_op != IBD_OP_ONGOING) &&
1565 1200 (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) {
1566 1201 /*
1567 1202 * This case can happen when we get a higher band
1568 1203 * packet. The easiest way is to reset the state machine
1569 1204 * to accommodate the higher priority packet.
1570 1205 */
1571 1206 state->id_ah_op = IBD_OP_NOTSTARTED;
1572 1207 }
1573 1208 mutex_exit(&state->id_ac_mutex);
1574 1209
1575 1210 return (ptr);
1576 1211 }
1577 1212
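The comment inside ibd_acache_lookup() describes a single-outstanding-request protocol between the Tx path and the async thread, driven by id_ah_op and id_ah_addr. The following is a simplified sketch of those transitions, not driver code; everything except the notion of the IBD_OP_* states and the EAGAIN/EFAULT errnos is invented for illustration, and the real miss path also retries the all-router group AH in the ROUTERED case.

#include <errno.h>

typedef enum { OP_NOTSTARTED, OP_ONGOING, OP_ROUTERED, OP_ERRORED } ah_op_t;

static ah_op_t ah_op = OP_NOTSTARTED;	/* models id_ah_op */

/* Models the miss path: the errno a Tx caller sees when no AH is cached. */
static int
ah_lookup_miss(int addr_matches_queued)
{
	int err = EAGAIN;

	switch (ah_op) {
	case OP_NOTSTARTED:
		ah_op = OP_ONGOING;	/* queue an IBD_ASYNC_GETAH request */
		break;
	case OP_ONGOING:
		break;			/* one outstanding request; retry later */
	default:			/* OP_ERRORED or OP_ROUTERED */
		if (addr_matches_queued && ah_op == OP_ERRORED)
			err = EFAULT;	/* report the failed resolution once */
		ah_op = OP_NOTSTARTED;	/* next lookup may queue a new request */
		break;
	}
	return (err);
}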
1578 1213 /*
1579 1214 * Grab a not-currently-in-use AH/PathRecord from the active
1580 1215 * list to recycle to a new destination. Only the async thread
1581 1216 * executes this code.
1582 1217 */
1583 1218 static ibd_ace_t *
1584 1219 ibd_acache_get_unref(ibd_state_t *state)
1585 1220 {
1586 1221 ibd_ace_t *ptr = list_tail(&state->id_ah_active);
1587 1222 boolean_t try_rc_chan_recycle = B_FALSE;
1588 1223
1589 1224 ASSERT(mutex_owned(&state->id_ac_mutex));
1590 1225
1591 1226 /*
1592 1227 * Do plain linear search.
1593 1228 */
1594 1229 while (ptr != NULL) {
1595 1230 /*
1596 1231 * Note that it is possible that the "cycle" bit
1597 1232 * is set on the AH w/o any reference count. The
1598 1233 * mcg must have been deleted, and the tx cleanup
1599 1234 * just decremented the reference count to 0, but
1600 1235 * hasn't gotten around to grabbing the id_ac_mutex
1601 1236 * to move the AH into the free list.
1602 1237 */
1603 1238 if (GET_REF(ptr) == 0) {
1604 1239 if (ptr->ac_chan != NULL) {
1605 1240 ASSERT(state->id_enable_rc == B_TRUE);
1606 1241 if (!try_rc_chan_recycle) {
1607 1242 try_rc_chan_recycle = B_TRUE;
1608 1243 ibd_rc_signal_ace_recycle(state, ptr);
1609 1244 }
1610 1245 } else {
1611 1246 IBD_ACACHE_PULLOUT_ACTIVE(state, ptr);
1612 1247 break;
1613 1248 }
1614 1249 }
1615 1250 ptr = list_prev(&state->id_ah_active, ptr);
1616 1251 }
1617 1252 return (ptr);
1618 1253 }
1619 1254
1620 1255 /*
1621 1256 * Invoked to clean up AH from active list in case of multicast
1622 1257	 * disable, to handle sendonly memberships during mcg traps, and
1623 1258	 * for port up processing of multicast and unicast AHs.
1624 1259 * Normally, the AH is taken off the active list, and put into
1625 1260 * the free list to be recycled for a new destination. In case
1626 1261 * Tx requests on the AH have not completed yet, the AH is marked
1627 1262 * for reaping (which will put the AH on the free list) once the Tx's
1628 1263 * complete; in this case, depending on the "force" input, we take
1629 1264 * out the AH from the active list right now, or leave it also for
1630 1265 * the reap operation. Returns TRUE if the AH is taken off the active
1631 1266 * list (and either put into the free list right now, or arranged for
1632 1267 * later), FALSE otherwise.
1633 1268 */
1634 1269 boolean_t
1635 1270 ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force)
1636 1271 {
1637 1272 ibd_ace_t *acactive;
1638 1273 boolean_t ret = B_TRUE;
1639 1274
1640 1275 ASSERT(mutex_owned(&state->id_ac_mutex));
1641 1276
1642 1277 if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) {
1643 1278
1644 1279 /*
1645 1280 * Note that the AH might already have the cycle bit set
1646 1281 * on it; this might happen if sequences of multicast
1647 1282 * enables and disables are coming so fast, that posted
1648 1283 * Tx's to the mcg have not completed yet, and the cycle
1649 1284 * bit is set successively by each multicast disable.
1650 1285 */
1651 1286 if (SET_CYCLE_IF_REF(acactive)) {
1652 1287 if (!force) {
1653 1288 /*
1654 1289 * The ace is kept on the active list, further
1655 1290 * Tx's can still grab a reference on it; the
1656 1291 * ace is reaped when all pending Tx's
1657 1292 * referencing the AH complete.
1658 1293 */
1659 1294 ret = B_FALSE;
1660 1295 } else {
1661 1296 /*
1662 1297 * In the mcg trap case, we always pull the
1663 1298 * AH from the active list. And also the port
1664 1299 * up multi/unicast case.
1665 1300 */
1666 1301 ASSERT(acactive->ac_chan == NULL);
1667 1302 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
1668 1303 acactive->ac_mce = NULL;
1669 1304 }
1670 1305 } else {
1671 1306 /*
1672 1307			 * The ref count is 0, so reclaim the ace
1673 1308			 * immediately after pulling it out of the
1674 1309			 * active list.
1675 1310 */
1676 1311 ASSERT(acactive->ac_chan == NULL);
1677 1312 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
1678 1313 acactive->ac_mce = NULL;
1679 1314 IBD_ACACHE_INSERT_FREE(state, acactive);
1680 1315 }
1681 1316
1682 1317 }
1683 1318 return (ret);
1684 1319 }
1685 1320
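The recycle logic above hinges on a reference count plus a "cycle" bit in the ace, manipulated through GET_REF, INC_REF and SET_CYCLE_IF_REF. Below is a minimal sketch of that deferred-reap idiom; the struct layout and function names are hypothetical and the actual ibd_ace_t encoding may differ.

/* Hypothetical layout, for illustration only. */
typedef struct {
	unsigned int	ref;	/* Tx's still holding the entry */
	int		cycle;	/* reap once ref drops to zero */
} ace_sketch_t;

/* SET_CYCLE_IF_REF analogue: defer the reap if the entry is still in use. */
static int
recycle_sketch(ace_sketch_t *ace)
{
	if (ace->ref != 0) {
		ace->cycle = 1;		/* last Tx completion will free it */
		return (0);
	}
	return (1);			/* unreferenced: free it right now */
}

/* Tx completion path: the last reference performs the deferred reap. */
static void
tx_done_sketch(ace_sketch_t *ace)
{
	if (--ace->ref == 0 && ace->cycle) {
		ace->cycle = 0;
		/* move the entry to the free list here */
	}
}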
1686 1321 /*
1687 1322 * Helper function for async path record lookup. If we are trying to
1688 1323 * Tx to a MCG, check our membership, possibly trying to join the
1689 1324 * group if required. If that fails, try to send the packet to the
1690 1325 * all router group (indicated by the redirect output), pointing
1691 1326 * the input mac address to the router mcg address.
1692 1327 */
1693 1328 static ibd_mce_t *
1694 1329 ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect)
1695 1330 {
1696 1331 ib_gid_t mgid;
1697 1332 ibd_mce_t *mce;
1698 1333 ipoib_mac_t routermac;
1699 1334
1700 1335 *redirect = B_FALSE;
1701 1336 ibd_n2h_gid(mac, &mgid);
1702 1337
1703 1338 /*
1704 1339 * Check the FullMember+SendOnlyNonMember list.
1705 1340 * Since we are the only one who manipulates the
1706 1341 * id_mc_full list, no locks are needed.
1707 1342 */
1708 1343 mce = IBD_MCACHE_FIND_FULL(state, mgid);
1709 1344 if (mce != NULL) {
1710 1345 DPRINT(4, "ibd_async_mcache : already joined to group");
1711 1346 return (mce);
1712 1347 }
1713 1348
1714 1349 /*
1715 1350 * Not found; try to join(SendOnlyNonMember) and attach.
1716 1351 */
1717 1352 DPRINT(4, "ibd_async_mcache : not joined to group");
1718 1353 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
1719 1354 NULL) {
1720 1355 DPRINT(4, "ibd_async_mcache : nonmem joined to group");
1721 1356 return (mce);
1722 1357 }
1723 1358
1724 1359 /*
1725 1360 * MCGroup not present; try to join the all-router group. If
1726 1361 * any of the following steps succeed, we will be redirecting
1727 1362 * to the all router group.
1728 1363 */
1729 1364 DPRINT(4, "ibd_async_mcache : nonmem join failed");
1730 1365 if (!ibd_get_allroutergroup(state, mac, &routermac))
1731 1366 return (NULL);
1732 1367 *redirect = B_TRUE;
1733 1368 ibd_n2h_gid(&routermac, &mgid);
1734 1369 bcopy(&routermac, mac, IPOIB_ADDRL);
1735 1370 DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n",
1736 1371 mgid.gid_prefix, mgid.gid_guid);
1737 1372
1738 1373 /*
1739 1374 * Are we already joined to the router group?
1740 1375 */
1741 1376 if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) {
1742 1377		DPRINT(4, "ibd_async_mcache : using already joined router "
1743 1378 "group\n");
1744 1379 return (mce);
1745 1380 }
1746 1381
1747 1382 /*
1748 1383 * Can we join(SendOnlyNonMember) the router group?
1749 1384 */
1750 1385 DPRINT(4, "ibd_async_mcache : attempting join to router grp");
1751 1386 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
1752 1387 NULL) {
1753 1388 DPRINT(4, "ibd_async_mcache : joined to router grp");
1754 1389 return (mce);
1755 1390 }
1756 1391
1757 1392 return (NULL);
1758 1393 }
1759 1394
1760 1395 /*
1761 1396 * Async path record lookup code.
1762 1397 */
1763 1398 static void
1764 1399 ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac)
1765 1400 {
1766 1401 ibd_ace_t *ce;
1767 1402 ibd_mce_t *mce = NULL;
1768 1403 ibt_path_attr_t path_attr;
1769 1404 ibt_path_info_t path_info;
1770 1405 ib_gid_t destgid;
1771 1406 char ret = IBD_OP_NOTSTARTED;
1772 1407
1773 1408 DPRINT(4, "ibd_async_acache : %08X:%08X:%08X:%08X:%08X",
1774 1409 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1775 1410 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
1776 1411 htonl(mac->ipoib_gidsuff[1]));
1777 1412
1778 1413 /*
1779 1414 * Check whether we are trying to transmit to a MCG.
1780 1415 * In that case, we need to make sure we are a member of
1781 1416 * the MCG.
1782 1417 */
1783 1418 if (mac->ipoib_qpn == htonl(IB_MC_QPN)) {
1784 1419 boolean_t redirected;
1785 1420
1786 1421 /*
1787 1422 * If we can not find or join the group or even
1788 1423 * redirect, error out.
1789 1424 */
1790 1425 if ((mce = ibd_async_mcache(state, mac, &redirected)) ==
1791 1426 NULL) {
1792 1427 state->id_ah_op = IBD_OP_ERRORED;
1793 1428 return;
1794 1429 }
1795 1430
1796 1431 /*
1797 1432 * If we got redirected, we need to determine whether
1798 1433		 * the AH for the new mcg is already in the cache and,
1799 1434		 * if so, avoid pulling it in; otherwise proceed to get the
1800 1435 * path for the new mcg. There is no guarantee that
1801 1436 * if the AH is currently in the cache, it will still be
1802 1437 * there when we look in ibd_acache_lookup(), but that's
1803 1438 * okay, we will come back here.
1804 1439 */
1805 1440 if (redirected) {
1806 1441 ret = IBD_OP_ROUTERED;
1807 1442 DPRINT(4, "ibd_async_acache : redirected to "
1808 1443 "%08X:%08X:%08X:%08X:%08X",
1809 1444 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1810 1445 htonl(mac->ipoib_gidpref[1]),
1811 1446 htonl(mac->ipoib_gidsuff[0]),
1812 1447 htonl(mac->ipoib_gidsuff[1]));
1813 1448
1814 1449 mutex_enter(&state->id_ac_mutex);
1815 1450 if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) {
1816 1451 state->id_ah_op = IBD_OP_ROUTERED;
1817 1452 mutex_exit(&state->id_ac_mutex);
1818 1453 DPRINT(4, "ibd_async_acache : router AH found");
1819 1454 return;
1820 1455 }
1821 1456 mutex_exit(&state->id_ac_mutex);
1822 1457 }
1823 1458 }
1824 1459
1825 1460 /*
1826 1461 * Get an AH from the free list.
1827 1462 */
1828 1463 mutex_enter(&state->id_ac_mutex);
1829 1464 if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) {
1830 1465 /*
1831 1466 * No free ones; try to grab an unreferenced active
1832 1467 * one. Maybe we need to make the active list LRU,
1833 1468 * but that will create more work for Tx callbacks.
1834 1469 * Is there a way of not having to pull out the
1835 1470 * entry from the active list, but just indicate it
1836 1471 * is being recycled? Yes, but that creates one more
1837 1472 * check in the fast lookup path.
1838 1473 */
1839 1474 if ((ce = ibd_acache_get_unref(state)) == NULL) {
1840 1475 /*
1841 1476 * Pretty serious shortage now.
1842 1477 */
1843 1478 state->id_ah_op = IBD_OP_NOTSTARTED;
1844 1479 mutex_exit(&state->id_ac_mutex);
1845 1480 DPRINT(10, "ibd_async_acache : failed to find AH "
1846 1481 "slot\n");
1847 1482 return;
1848 1483 }
1849 1484 /*
1850 1485 * We could check whether ac_mce points to a SendOnly
1851 1486 * member and drop that membership now. Or do it lazily
1852 1487 * at detach time.
1853 1488 */
1854 1489 ce->ac_mce = NULL;
1855 1490 }
1856 1491 mutex_exit(&state->id_ac_mutex);
1857 1492 ASSERT(ce->ac_mce == NULL);
1858 1493
1859 1494 /*
1860 1495 * Update the entry.
1861 1496 */
1862 1497 bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL);
1863 1498
1864 1499 bzero(&path_info, sizeof (path_info));
1865 1500 bzero(&path_attr, sizeof (ibt_path_attr_t));
1866 1501 path_attr.pa_sgid = state->id_sgid;
1867 1502 path_attr.pa_num_dgids = 1;
1868 1503 ibd_n2h_gid(&ce->ac_mac, &destgid);
1869 1504 path_attr.pa_dgids = &destgid;
1870 1505 path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
1871 1506 path_attr.pa_pkey = state->id_pkey;
1872 1507 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_PKEY, &path_attr, 1,
1873 1508 &path_info, NULL) != IBT_SUCCESS) {
1874 1509 DPRINT(10, "ibd_async_acache : failed in ibt_get_paths");
1875 1510 goto error;
1876 1511 }
1877 1512 if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey,
1878 1513 ntohl(ce->ac_mac.ipoib_qpn),
1879 1514 &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) {
1880 1515 DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest");
1881 1516 goto error;
1882 1517 }
1883 1518
1884 1519 /*
1885 1520 * mce is set whenever an AH is being associated with a
1886 1521 * MCG; this will come in handy when we leave the MCG. The
1887 1522 * lock protects Tx fastpath from scanning the active list.
1888 1523 */
1889 1524 if (mce != NULL)
1890 1525 ce->ac_mce = mce;
1891 1526
1892 1527 /*
1893 1528 * initiate a RC mode connection for unicast address
1894 1529 */
1895 1530 if (state->id_enable_rc && (mac->ipoib_qpn != htonl(IB_MC_QPN)) &&
1896 1531 (htonl(mac->ipoib_qpn) & IBD_MAC_ADDR_RC)) {
1897 1532 ASSERT(ce->ac_chan == NULL);
1898 1533 DPRINT(10, "ibd_async_acache: call "
1899 1534 "ibd_rc_try_connect(ace=%p)", ce);
1900 1535 ibd_rc_try_connect(state, ce, &path_info);
1901 1536 if (ce->ac_chan == NULL) {
1902 1537 DPRINT(10, "ibd_async_acache: fail to setup RC"
1903 1538 " channel");
1904 1539 state->rc_conn_fail++;
1905 1540 goto error;
1906 1541 }
1907 1542 }
1908 1543
1909 1544 mutex_enter(&state->id_ac_mutex);
1910 1545 IBD_ACACHE_INSERT_ACTIVE(state, ce);
1911 1546 state->id_ah_op = ret;
1912 1547 mutex_exit(&state->id_ac_mutex);
1913 1548 return;
1914 1549 error:
1915 1550 /*
1916 1551 * We might want to drop SendOnly membership here if we
1917 1552 * joined above. The lock protects Tx callbacks inserting
1918 1553 * into the free list.
1919 1554 */
1920 1555 mutex_enter(&state->id_ac_mutex);
1921 1556 state->id_ah_op = IBD_OP_ERRORED;
1922 1557 IBD_ACACHE_INSERT_FREE(state, ce);
1923 1558 mutex_exit(&state->id_ac_mutex);
1924 1559 }
1925 1560
1926 1561 /*
1927 1562	 * While restoring the port's presence on the subnet after a port up,
1928 1563	 * it is possible that the port goes down again.
1929 1564 */
1930 1565 static void
1931 1566 ibd_async_link(ibd_state_t *state, ibd_req_t *req)
1932 1567 {
1933 1568 ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr;
1934 1569 link_state_t lstate = (opcode == IBD_LINK_DOWN) ? LINK_STATE_DOWN :
1935 1570 LINK_STATE_UP;
1936 1571 ibd_mce_t *mce, *pmce;
1937 1572 ibd_ace_t *ace, *pace;
1938 1573
1939 1574 DPRINT(10, "ibd_async_link(): %d", opcode);
1940 1575
1941 1576 /*
1942 1577 * On a link up, revalidate the link speed/width. No point doing
1943 1578 * this on a link down, since we will be unable to do SA operations,
1944 1579 * defaulting to the lowest speed. Also notice that we update our
1945 1580 * notion of speed before calling mac_link_update(), which will do
1946 1581 * necessary higher level notifications for speed changes.
1947 1582 */
1948 1583 if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) {
1949 - _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))
1950 1584 state->id_link_speed = ibd_get_portspeed(state);
1951 - _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))
1952 1585 }
1953 1586
1954 1587 /*
1955 1588 * Do all the work required to establish our presence on
1956 1589 * the subnet.
1957 1590 */
1958 1591 if (opcode == IBD_LINK_UP_ABSENT) {
1959 1592 /*
1960 1593 * If in promiscuous mode ...
1961 1594 */
1962 1595 if (state->id_prom_op == IBD_OP_COMPLETED) {
1963 1596 /*
1964 1597 * Drop all nonmembership.
1965 1598 */
1966 1599 ibd_async_unsetprom(state);
1967 1600
1968 1601 /*
1969 1602 * Then, try to regain nonmembership to all mcg's.
1970 1603 */
1971 1604 ibd_async_setprom(state);
1972 1605
1973 1606 }
1974 1607
1975 1608 /*
1976 1609 * Drop all sendonly membership (which also gets rid of the
1977 1610 * AHs); try to reacquire all full membership.
1978 1611 */
1979 1612 mce = list_head(&state->id_mc_full);
1980 1613 while ((pmce = mce) != NULL) {
1981 1614 mce = list_next(&state->id_mc_full, mce);
1982 1615 if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON)
1983 1616 ibd_leave_group(state,
1984 1617 pmce->mc_info.mc_adds_vect.av_dgid,
1985 1618 IB_MC_JSTATE_SEND_ONLY_NON);
1986 1619 else
1987 1620 ibd_reacquire_group(state, pmce);
1988 1621 }
1989 1622
1990 1623 /*
1991 1624 * Recycle all active AHs to free list (and if there are
1992 1625 * pending posts, make sure they will go into the free list
1993 1626 * once the Tx's complete). Grab the lock to prevent
1994 1627 * concurrent Tx's as well as Tx cleanups.
1995 1628 */
1996 1629 mutex_enter(&state->id_ac_mutex);
1997 1630 ace = list_head(&state->id_ah_active);
1998 1631 while ((pace = ace) != NULL) {
1999 1632 boolean_t cycled;
2000 1633
2001 1634 ace = list_next(&state->id_ah_active, ace);
2002 1635 mce = pace->ac_mce;
2003 1636 if (pace->ac_chan != NULL) {
2004 1637 ASSERT(mce == NULL);
2005 1638 ASSERT(state->id_enable_rc == B_TRUE);
2006 1639 if (pace->ac_chan->chan_state ==
2007 1640 IBD_RC_STATE_ACT_ESTAB) {
2008 1641 INC_REF(pace, 1);
2009 1642 IBD_ACACHE_PULLOUT_ACTIVE(state, pace);
2010 1643 pace->ac_chan->chan_state =
2011 1644 IBD_RC_STATE_ACT_CLOSING;
2012 1645 ibd_rc_signal_act_close(state, pace);
2013 1646 } else {
2014 1647 state->rc_act_close_simultaneous++;
2015 1648 DPRINT(40, "ibd_async_link: other "
2016 1649 "thread is closing it, ace=%p, "
2017 1650 "ac_chan=%p, chan_state=%d",
2018 1651 pace, pace->ac_chan,
2019 1652 pace->ac_chan->chan_state);
2020 1653 }
2021 1654 } else {
2022 1655 cycled = ibd_acache_recycle(state,
2023 1656 &pace->ac_mac, B_TRUE);
2024 1657 }
2025 1658 /*
2026 1659 * If this is for an mcg, it must be for a fullmember,
2027 1660 * since we got rid of send-only members above when
2028 1661 * processing the mce list.
2029 1662 */
2030 1663 ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate ==
2031 1664 IB_MC_JSTATE_FULL)));
2032 1665
2033 1666 /*
2034 1667 * Check if the fullmember mce needs to be torn down,
2035 1668 * ie whether the DLPI disable has already been done.
2036 1669 * If so, do some of the work of tx_cleanup, namely
2037 1670 * causing leave (which will fail), detach and
2038 1671 * mce-freeing. tx_cleanup will put the AH into free
2039 1672 * list. The reason to duplicate some of this
2040 1673 * tx_cleanup work is because we want to delete the
2041 1674 * AH right now instead of waiting for tx_cleanup, to
2042 1675 * force subsequent Tx's to reacquire an AH.
2043 1676 */
2044 1677 if ((mce != NULL) && (mce->mc_fullreap))
2045 1678 ibd_async_reap_group(state, mce,
2046 1679 mce->mc_info.mc_adds_vect.av_dgid,
2047 1680 mce->mc_jstate);
2048 1681 }
2049 1682 mutex_exit(&state->id_ac_mutex);
2050 1683 }
2051 1684
2052 1685 /*
2053 1686 * mac handle is guaranteed to exist since driver does ibt_close_hca()
2054 1687 * (which stops further events from being delivered) before
2055 1688 * mac_unregister(). At this point, it is guaranteed that mac_register
2056 1689 * has already been done.
2057 1690 */
2058 1691 mutex_enter(&state->id_link_mutex);
2059 1692 state->id_link_state = lstate;
2060 1693 mac_link_update(state->id_mh, lstate);
2061 1694 mutex_exit(&state->id_link_mutex);
2062 1695
2063 1696 ibd_async_done(state);
2064 1697 }
2065 1698
2066 1699 /*
2067 1700 * Check the pkey table to see if we can find the pkey we're looking for.
2068 1701 * Set the pkey index in 'pkix' if found. Return 0 on success and -1 on
2069 1702 * failure.
2070 1703 */
2071 1704 static int
2072 1705 ibd_locate_pkey(ib_pkey_t *pkey_tbl, uint16_t pkey_tbl_sz, ib_pkey_t pkey,
2073 1706 uint16_t *pkix)
2074 1707 {
2075 1708 uint16_t ndx;
2076 1709
2077 1710 ASSERT(pkix != NULL);
2078 1711
2079 1712 for (ndx = 0; ndx < pkey_tbl_sz; ndx++) {
2080 1713 if (pkey_tbl[ndx] == pkey) {
2081 1714 *pkix = ndx;
2082 1715 return (0);
2083 1716 }
2084 1717 }
2085 1718 return (-1);
2086 1719 }
2087 1720
2088 1721 /*
2089 1722 * Late HCA Initialization:
2090 1723 * If plumb had succeeded without the availability of an active port or the
2091 1724 * pkey, and either of their availability is now being indicated via PORT_UP
2092 1725 * or PORT_CHANGE respectively, try a start of the interface.
2093 1726 *
2094 1727 * Normal Operation:
2095 1728 * When the link is notified up, we need to do a few things, based
2096 1729	 * on whether the port's current p_init_type_reply claims a reinit
2097 1730	 * has been done. The reinit steps are:
2098 1731 * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify
2099 1732 * the old Pkey and GID0 are correct.
2100 1733 * 2. Register for mcg traps (already done by ibmf).
2101 1734 * 3. If PreservePresenceReply indicates the SM has restored port's presence
2102 1735 * in subnet, nothing more to do. Else go to next steps (on async daemon).
2103 1736 * 4. Give up all sendonly memberships.
2104 1737 * 5. Acquire all full memberships.
2105 1738 * 6. In promiscuous mode, acquire all non memberships.
2106 1739 * 7. Recycle all AHs to free list.
2107 1740 */
2108 1741 static void
2109 1742 ibd_link_mod(ibd_state_t *state, ibt_async_code_t code)
2110 1743 {
2111 1744 ibt_hca_portinfo_t *port_infop = NULL;
2112 1745 ibt_status_t ibt_status;
2113 1746 uint_t psize, port_infosz;
2114 1747 ibd_link_op_t opcode;
2115 1748 ibd_req_t *req;
2116 1749 link_state_t new_link_state = LINK_STATE_UP;
2117 1750 uint8_t itreply;
2118 1751 uint16_t pkix;
2119 1752 int ret;
2120 1753
2121 1754 /*
2122 1755 * Let's not race with a plumb or an unplumb; if we detect a
2123 1756 * pkey relocation event later on here, we may have to restart.
2124 1757 */
2125 1758 ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
2126 1759
2127 1760 mutex_enter(&state->id_link_mutex);
2128 1761
2129 1762 /*
2130 1763 * If the link state is unknown, a plumb has not yet been attempted
2131 1764 * on the interface. Nothing to do.
2132 1765 */
2133 1766 if (state->id_link_state == LINK_STATE_UNKNOWN) {
2134 1767 mutex_exit(&state->id_link_mutex);
2135 1768 goto link_mod_return;
2136 1769 }
2137 1770
2138 1771 /*
2139 1772 * If link state is down because of plumb failure, and we are not in
2140 1773 * late HCA init, and we were not successfully plumbed, nothing to do.
2141 1774 */
2142 1775 if ((state->id_link_state == LINK_STATE_DOWN) &&
2143 1776 ((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) == 0) &&
2144 1777 ((state->id_mac_state & IBD_DRV_STARTED) == 0)) {
2145 1778 mutex_exit(&state->id_link_mutex);
2146 1779 goto link_mod_return;
2147 1780 }
2148 1781
2149 1782 /*
2150 1783 * If this routine was called in response to a port down event,
2151 1784 * we just need to see if this should be informed.
2152 1785 */
2153 1786 if (code == IBT_ERROR_PORT_DOWN) {
2154 1787 new_link_state = LINK_STATE_DOWN;
2155 1788 goto update_link_state;
2156 1789 }
2157 1790
2158 1791 /*
2159 1792 * If it's not a port down event we've received, try to get the port
2160 1793 * attributes first. If we fail here, the port is as good as down.
2161 1794 * Otherwise, if the link went down by the time the handler gets
2162 1795 * here, give up - we cannot even validate the pkey/gid since those
2163 1796 * are not valid and this is as bad as a port down anyway.
2164 1797 */
2165 1798 ibt_status = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
2166 1799 &port_infop, &psize, &port_infosz);
2167 1800 if ((ibt_status != IBT_SUCCESS) || (psize != 1) ||
2168 1801 (port_infop->p_linkstate != IBT_PORT_ACTIVE)) {
2169 1802 new_link_state = LINK_STATE_DOWN;
2170 1803 goto update_link_state;
2171 1804 }
2172 1805
2173 1806 /*
2174 1807	 * If, in the previous attempt, the pkey was not found either due to
2175 1808	 * the port state being down or due to its absence in the pkey table,
2176 1809 * look for it now and try to start the interface.
2177 1810 */
2178 1811 if (state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) {
2179 1812 mutex_exit(&state->id_link_mutex);
2180 1813 if ((ret = ibd_start(state)) != 0) {
2181 1814 DPRINT(10, "ibd_linkmod: cannot start from late HCA "
2182 1815 "init, ret=%d", ret);
2183 1816 }
2184 1817 ibt_free_portinfo(port_infop, port_infosz);
2185 1818 goto link_mod_return;
2186 1819 }
2187 1820
2188 1821 /*
2189 1822 * Check the SM InitTypeReply flags. If both NoLoadReply and
2190 1823 * PreserveContentReply are 0, we don't know anything about the
2191 1824 * data loaded into the port attributes, so we need to verify
2192 1825 * if gid0 and pkey are still valid.
2193 1826 */
2194 1827 itreply = port_infop->p_init_type_reply;
2195 1828 if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) &&
2196 1829 ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0)) {
2197 1830 /*
2198 1831 * Check to see if the subnet part of GID0 has changed. If
2199 1832 * not, check the simple case first to see if the pkey
2200 1833 * index is the same as before; finally check to see if the
2201 1834 * pkey has been relocated to a different index in the table.
2202 1835 */
2203 - _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid))
2204 1836 if (bcmp(port_infop->p_sgid_tbl,
2205 1837 &state->id_sgid, sizeof (ib_gid_t)) != 0) {
2206 1838
2207 1839 new_link_state = LINK_STATE_DOWN;
2208 1840
2209 1841 } else if (port_infop->p_pkey_tbl[state->id_pkix] ==
2210 1842 state->id_pkey) {
2211 1843
2212 1844 new_link_state = LINK_STATE_UP;
2213 1845
2214 1846 } else if (ibd_locate_pkey(port_infop->p_pkey_tbl,
2215 1847 port_infop->p_pkey_tbl_sz, state->id_pkey, &pkix) == 0) {
2216 1848
2217 1849 ibt_free_portinfo(port_infop, port_infosz);
2218 1850 mutex_exit(&state->id_link_mutex);
2219 1851
2220 1852 /*
2221 1853 * Currently a restart is required if our pkey has moved
2222 1854 * in the pkey table. If we get the ibt_recycle_ud() to
2223 1855 * work as documented (expected), we may be able to
2224 1856 * avoid a complete restart. Note that we've already
2225 1857 * marked both the start and stop 'in-progress' flags,
2226 1858 * so it is ok to go ahead and do this restart.
2227 1859 */
2228 1860 (void) ibd_undo_start(state, LINK_STATE_DOWN);
2229 1861 if ((ret = ibd_start(state)) != 0) {
2230 1862 DPRINT(10, "ibd_restart: cannot restart, "
2231 1863 "ret=%d", ret);
2232 1864 }
2233 1865
2234 1866 goto link_mod_return;
2235 1867 } else {
2236 1868 new_link_state = LINK_STATE_DOWN;
2237 1869 }
2238 - _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid))
2239 1870 }
2240 1871
2241 1872 update_link_state:
2242 1873 if (port_infop) {
2243 1874 ibt_free_portinfo(port_infop, port_infosz);
2244 1875 }
2245 1876
2246 1877 /*
2247 1878 * If we're reporting a link up, check InitTypeReply to see if
2248 1879 * the SM has ensured that the port's presence in mcg, traps,
2249 1880 * etc. is intact.
2250 1881 */
2251 1882 if (new_link_state == LINK_STATE_DOWN) {
2252 1883 opcode = IBD_LINK_DOWN;
2253 1884 } else {
2254 1885 if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) ==
2255 1886 SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) {
2256 1887 opcode = IBD_LINK_UP;
2257 1888 } else {
2258 1889 opcode = IBD_LINK_UP_ABSENT;
2259 1890 }
2260 1891 }
2261 1892
2262 1893 /*
2263 1894 * If the old state is the same as the new state, and the SM indicated
2264 1895 * no change in the port parameters, nothing to do.
2265 1896 */
2266 1897 if ((state->id_link_state == new_link_state) && (opcode !=
2267 1898 IBD_LINK_UP_ABSENT)) {
2268 1899 mutex_exit(&state->id_link_mutex);
2269 1900 goto link_mod_return;
2270 1901 }
2271 1902
2272 1903 /*
2273 1904 * Ok, so there was a link state change; see if it's safe to ask
2274 1905 * the async thread to do the work
2275 1906 */
2276 1907 if (!ibd_async_safe(state)) {
2277 1908 state->id_link_state = new_link_state;
2278 1909 mutex_exit(&state->id_link_mutex);
2279 1910 goto link_mod_return;
2280 1911 }
2281 1912
2282 1913 mutex_exit(&state->id_link_mutex);
2283 1914
2284 1915 /*
2285 1916 * Queue up a request for ibd_async_link() to handle this link
2286 1917 * state change event
2287 1918 */
2288 1919 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
2289 1920 req->rq_ptr = (void *)opcode;
2290 1921 ibd_queue_work_slot(state, req, IBD_ASYNC_LINK);
2291 1922
2292 1923 link_mod_return:
2293 1924 ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
2294 1925 }
2295 1926
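For reference, the opcode selection at the end of ibd_link_mod() reduces to a small decision on the new link state and the SM's presence bit in InitTypeReply. The helper below is a hedged restatement for illustration only: its name and scalar interface are invented, it assumes the ibd driver headers already included at the top of this file, and only the SM_INIT_TYPE_* mask and IBD_LINK_* opcodes are taken from the code above.

/* Assumes <sys/ib/clients/ibd/ibd.h> and <sys/ib/mgt/sm_attr.h>. */
static ibd_link_op_t
link_opcode_sketch(link_state_t new_state, uint8_t itreply)
{
	if (new_state == LINK_STATE_DOWN)
		return (IBD_LINK_DOWN);

	/* SM preserved our mcg/trap presence: a plain link-up suffices. */
	if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) ==
	    SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY)
		return (IBD_LINK_UP);

	/* Presence lost: the async thread must rejoin and recycle AHs. */
	return (IBD_LINK_UP_ABSENT);
}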
2296 1927 /*
2297 1928 * For the port up/down events, IBTL guarantees there will not be concurrent
2298 1929 * invocations of the handler. IBTL might coalesce link transition events,
2299 1930 * and not invoke the handler for _each_ up/down transition, but it will
2300 1931	 * invoke the handler with the last known state.
2301 1932 */
2302 1933 static void
2303 1934 ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
2304 1935 ibt_async_code_t code, ibt_async_event_t *event)
2305 1936 {
2306 1937 ibd_state_t *state = (ibd_state_t *)clnt_private;
2307 1938
2308 1939 switch (code) {
2309 1940 case IBT_ERROR_CATASTROPHIC_CHAN:
2310 1941 ibd_print_warn(state, "catastrophic channel error");
2311 1942 break;
2312 1943 case IBT_ERROR_CQ:
2313 1944 ibd_print_warn(state, "completion queue error");
2314 1945 break;
2315 1946 case IBT_PORT_CHANGE_EVENT:
2316 1947 /*
2317 1948 * Events will be delivered to all instances that have
2318 1949 * done ibt_open_hca() but not yet done ibt_close_hca().
2319 1950 * Only need to do work for our port; IBTF will deliver
2320 1951 * events for other ports on the hca we have ibt_open_hca'ed
2321 1952 * too. Note that id_port is initialized in ibd_attach()
2322 1953		 * before we do the ibt_open_hca() call there.
2323 1954 */
2324 1955 ASSERT(state->id_hca_hdl == hca_hdl);
2325 1956 if (state->id_port != event->ev_port)
2326 1957 break;
2327 1958
2328 1959 if ((event->ev_port_flags & IBT_PORT_CHANGE_PKEY) ==
2329 1960 IBT_PORT_CHANGE_PKEY) {
2330 1961 ibd_link_mod(state, code);
2331 1962 }
2332 1963 break;
2333 1964 case IBT_ERROR_PORT_DOWN:
2334 1965 case IBT_CLNT_REREG_EVENT:
2335 1966 case IBT_EVENT_PORT_UP:
2336 1967 /*
2337 1968 * Events will be delivered to all instances that have
2338 1969 * done ibt_open_hca() but not yet done ibt_close_hca().
2339 1970 * Only need to do work for our port; IBTF will deliver
2340 1971 * events for other ports on the hca we have ibt_open_hca'ed
2341 1972 * too. Note that id_port is initialized in ibd_attach()
2342 1973		 * before we do the ibt_open_hca() call there.
2343 1974 */
2344 1975 ASSERT(state->id_hca_hdl == hca_hdl);
2345 1976 if (state->id_port != event->ev_port)
2346 1977 break;
2347 1978
2348 1979 ibd_link_mod(state, code);
2349 1980 break;
2350 1981
2351 1982 case IBT_HCA_ATTACH_EVENT:
2352 1983 case IBT_HCA_DETACH_EVENT:
2353 1984 /*
2354 1985 * When a new card is plugged to the system, attach_event is
2355 1986 * invoked. Additionally, a cfgadm needs to be run to make the
2356 1987 * card known to the system, and an ifconfig needs to be run to
2357 1988 * plumb up any ibd interfaces on the card. In the case of card
2358 1989 * unplug, a cfgadm is run that will trigger any RCM scripts to
2359 1990 * unplumb the ibd interfaces on the card; when the card is
2360 1991 * actually unplugged, the detach_event is invoked;
2361 1992 * additionally, if any ibd instances are still active on the
2362 1993 * card (eg there were no associated RCM scripts), driver's
2363 1994 * detach routine is invoked.
2364 1995 */
2365 1996 break;
2366 1997 default:
2367 1998 break;
2368 1999 }
2369 2000 }
2370 2001
2371 2002 static int
2372 2003 ibd_register_mac(ibd_state_t *state, dev_info_t *dip)
2373 2004 {
2374 2005 mac_register_t *macp;
2375 2006 int ret;
2376 2007
2377 2008 if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
2378 2009 DPRINT(10, "ibd_register_mac: mac_alloc() failed");
2379 2010 return (DDI_FAILURE);
2380 2011 }
2381 2012
2382 2013 /*
2383 2014 * Note that when we register with mac during attach, we don't
2384 2015 * have the id_macaddr yet, so we'll simply be registering a
2385 2016 * zero macaddr that we'll overwrite later during plumb (in
2386 2017 * ibd_m_start()). Similar is the case with id_mtu - we'll
2387 2018 * update the mac layer with the correct mtu during plumb.
2388 2019 */
2389 2020 macp->m_type_ident = MAC_PLUGIN_IDENT_IB;
2390 2021 macp->m_driver = state;
2391 2022 macp->m_dip = dip;
2392 2023 macp->m_src_addr = (uint8_t *)&state->id_macaddr;
2393 2024 macp->m_callbacks = &ibd_m_callbacks;
2394 2025 macp->m_min_sdu = 0;
2395 2026 macp->m_multicast_sdu = IBD_DEF_MAX_SDU;
2396 2027 if (state->id_type == IBD_PORT_DRIVER) {
2397 2028 macp->m_max_sdu = IBD_DEF_RC_MAX_SDU;
2398 2029 } else if (state->id_enable_rc) {
2399 2030 macp->m_max_sdu = state->rc_mtu - IPOIB_HDRSIZE;
2400 2031 } else {
2401 2032 macp->m_max_sdu = IBD_DEF_MAX_SDU;
2402 2033 }
2403 2034 macp->m_priv_props = ibd_priv_props;
2404 2035
2405 2036 /*
2406 2037 * Register ourselves with the GLDv3 interface
2407 2038 */
2408 2039 if ((ret = mac_register(macp, &state->id_mh)) != 0) {
2409 2040 mac_free(macp);
2410 2041 DPRINT(10,
2411 2042 "ibd_register_mac: mac_register() failed, ret=%d", ret);
2412 2043 return (DDI_FAILURE);
2413 2044 }
2414 2045
2415 2046 mac_free(macp);
2416 2047 return (DDI_SUCCESS);
2417 2048 }
2418 2049
2419 2050 static int
2420 2051 ibd_record_capab(ibd_state_t *state)
2421 2052 {
2422 2053 ibt_hca_attr_t hca_attrs;
2423 2054 ibt_status_t ibt_status;
2424 2055
2425 - _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))
2426 -
2427 2056 /*
2428 2057 * Query the HCA and fetch its attributes
2429 2058 */
2430 2059 ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
2431 2060 ASSERT(ibt_status == IBT_SUCCESS);
2432 2061
2433 2062 /*
2434 2063 * 1. Set the Hardware Checksum capability. Currently we only consider
2435 2064 * full checksum offload.
2436 2065 */
2437 2066 if (state->id_enable_rc) {
2438 2067 state->id_hwcksum_capab = 0;
2439 2068 } else {
2440 2069 if ((hca_attrs.hca_flags & IBT_HCA_CKSUM_FULL)
2441 2070 == IBT_HCA_CKSUM_FULL) {
2442 2071 state->id_hwcksum_capab = IBT_HCA_CKSUM_FULL;
2443 2072 }
2444 2073 }
2445 2074
2446 2075 /*
2447 2076 * 2. Set LSO policy, capability and maximum length
2448 2077 */
2449 2078 if (state->id_enable_rc) {
2450 2079 state->id_lso_capable = B_FALSE;
2451 2080 state->id_lso_maxlen = 0;
2452 2081 } else {
2453 2082 if (hca_attrs.hca_max_lso_size > 0) {
2454 2083 state->id_lso_capable = B_TRUE;
2455 2084 if (hca_attrs.hca_max_lso_size > IBD_LSO_MAXLEN)
2456 2085 state->id_lso_maxlen = IBD_LSO_MAXLEN;
2457 2086 else
2458 2087 state->id_lso_maxlen =
2459 2088 hca_attrs.hca_max_lso_size;
2460 2089 } else {
2461 2090 state->id_lso_capable = B_FALSE;
2462 2091 state->id_lso_maxlen = 0;
2463 2092 }
2464 2093 }
2465 2094
2466 2095 /*
2467 2096 * 3. Set Reserved L_Key capability
2468 2097 */
2469 2098 if ((hca_attrs.hca_flags2 & IBT_HCA2_RES_LKEY) == IBT_HCA2_RES_LKEY) {
2470 2099 state->id_hca_res_lkey_capab = 1;
2471 2100 state->id_res_lkey = hca_attrs.hca_reserved_lkey;
2472 2101 state->rc_enable_iov_map = B_TRUE;
2473 2102 } else {
2474 2103 /* If no reserved lkey, we will not use ibt_map_mem_iov */
2475 2104 state->rc_enable_iov_map = B_FALSE;
2476 2105 }
2477 2106
2478 2107 /*
2479 2108 * 4. Set maximum sqseg value after checking to see if extended sgl
2480 2109 * size information is provided by the hca
2481 2110 */
2482 2111 if (hca_attrs.hca_flags & IBT_HCA_WQE_SIZE_INFO) {
2483 2112 state->id_max_sqseg = hca_attrs.hca_ud_send_sgl_sz;
2484 2113 state->rc_tx_max_sqseg = hca_attrs.hca_conn_send_sgl_sz;
2485 2114 } else {
2486 2115 state->id_max_sqseg = hca_attrs.hca_max_sgl;
2487 2116 state->rc_tx_max_sqseg = hca_attrs.hca_max_sgl;
2488 2117 }
2489 2118 if (state->id_max_sqseg > IBD_MAX_SQSEG) {
2490 2119 state->id_max_sqseg = IBD_MAX_SQSEG;
2491 2120 } else if (state->id_max_sqseg < IBD_MAX_SQSEG) {
2492 2121 ibd_print_warn(state, "Set #sgl = %d instead of default %d",
2493 2122 state->id_max_sqseg, IBD_MAX_SQSEG);
2494 2123 }
2495 2124 if (state->rc_tx_max_sqseg > IBD_MAX_SQSEG) {
2496 2125 state->rc_tx_max_sqseg = IBD_MAX_SQSEG;
2497 2126 } else if (state->rc_tx_max_sqseg < IBD_MAX_SQSEG) {
2498 2127 ibd_print_warn(state, "RC mode: Set #sgl = %d instead of "
2499 2128 "default %d", state->rc_tx_max_sqseg, IBD_MAX_SQSEG);
2500 2129 }
2501 2130
2502 2131 /*
2503 2132 * Translating the virtual address regions into physical regions
2504 2133 * for using the Reserved LKey feature results in a wr sgl that
2505 2134 * is a little longer. Since failing ibt_map_mem_iov() is costly,
2506 2135 * we'll fix a high-water mark (65%) for when we should stop.
2507 2136 */
2508 2137 state->id_max_sqseg_hiwm = (state->id_max_sqseg * 65) / 100;
2509 2138 state->rc_max_sqseg_hiwm = (state->rc_tx_max_sqseg * 65) / 100;
2510 2139
2511 2140 /*
2512 2141 * 5. Set number of recv and send wqes after checking hca maximum
2513 2142 * channel size. Store the max channel size in the state so that it
2514 2143 * can be referred to when the swqe/rwqe change is requested via
2515 2144 * dladm.
2516 2145 */
2517 2146
2518 2147 state->id_hca_max_chan_sz = hca_attrs.hca_max_chan_sz;
2519 2148
2520 2149 if (hca_attrs.hca_max_chan_sz < state->id_ud_num_rwqe)
2521 2150 state->id_ud_num_rwqe = hca_attrs.hca_max_chan_sz;
2522 2151
2523 2152 state->id_rx_bufs_outstanding_limit = state->id_ud_num_rwqe -
2524 2153 IBD_RWQE_MIN;
2525 2154
2526 2155 if (hca_attrs.hca_max_chan_sz < state->id_ud_num_swqe)
2527 2156 state->id_ud_num_swqe = hca_attrs.hca_max_chan_sz;
2528 2157
2529 - _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))
2530 -
2531 2158 return (DDI_SUCCESS);
2532 2159 }
2533 2160
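As a worked example of the 65% high-water mark computed in ibd_record_capab(): if the HCA reported a 40-entry send sgl (an illustrative figure, not a value taken from this change), id_max_sqseg_hiwm would be (40 * 65) / 100 = 26, so per the rationale in the comment the driver would stop attempting the Reserved-LKey ibt_map_mem_iov() path once a message needed more than 26 segments.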
2534 2161 static int
2535 2162 ibd_part_busy(ibd_state_t *state)
2536 2163 {
2537 2164 if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) != 0) {
2538 2165 DPRINT(10, "ibd_part_busy: failed: rx bufs outstanding\n");
2539 2166 return (DDI_FAILURE);
2540 2167 }
2541 2168
2542 2169 if (state->rc_srq_rwqe_list.dl_bufs_outstanding != 0) {
2543 2170 DPRINT(10, "ibd_part_busy: failed: srq bufs outstanding\n");
2544 2171 return (DDI_FAILURE);
2545 2172 }
2546 2173
2547 2174 /*
2548 2175 * "state->id_ah_op == IBD_OP_ONGOING" means this IPoIB port is
2549 2176 * connecting to a remote IPoIB port. We can't remove this port.
2550 2177 */
2551 2178 if (state->id_ah_op == IBD_OP_ONGOING) {
2552 2179 DPRINT(10, "ibd_part_busy: failed: connecting\n");
2553 2180 return (DDI_FAILURE);
2554 2181 }
2555 2182
2556 2183 return (DDI_SUCCESS);
2557 2184 }
2558 2185
2559 2186
2560 2187 static void
2561 2188 ibd_part_unattach(ibd_state_t *state)
2562 2189 {
2563 2190 uint32_t progress = state->id_mac_state;
2564 2191 ibt_status_t ret;
2565 2192
2566 2193 /* make sure rx resources are freed */
2567 2194 ibd_free_rx_rsrcs(state);
2568 2195
2569 2196 if (progress & IBD_DRV_RC_SRQ_ALLOCD) {
2570 2197 ASSERT(state->id_enable_rc);
2571 2198 ibd_rc_fini_srq_list(state);
2572 2199 state->id_mac_state &= (~IBD_DRV_RC_SRQ_ALLOCD);
2573 2200 }
2574 2201
2575 2202 if (progress & IBD_DRV_MAC_REGISTERED) {
2576 2203 (void) mac_unregister(state->id_mh);
2577 2204 state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED);
2578 2205 }
2579 2206
2580 2207 if (progress & IBD_DRV_ASYNC_THR_CREATED) {
2581 2208 /*
2582 2209 * No new async requests will be posted since the device
2583 2210 * link state has been marked as unknown; completion handlers
2584 2211 * have been turned off, so Tx handler will not cause any
2585 2212 * more IBD_ASYNC_REAP requests.
2586 2213 *
2587 2214 * Queue a request for the async thread to exit, which will
2588 2215 * be serviced after any pending ones. This can take a while,
2589 2216 * specially if the SM is unreachable, since IBMF will slowly
2590 2217 * timeout each SM request issued by the async thread. Reap
2591 2218 * the thread before continuing on, we do not want it to be
2592 2219 * lingering in modunloaded code.
2593 2220 */
2594 2221 ibd_queue_work_slot(state, &state->id_ah_req, IBD_ASYNC_EXIT);
2595 2222 thread_join(state->id_async_thrid);
2596 2223
2597 2224 state->id_mac_state &= (~IBD_DRV_ASYNC_THR_CREATED);
2598 2225 }
2599 2226
2600 2227 if (progress & IBD_DRV_REQ_LIST_INITED) {
2601 2228 list_destroy(&state->id_req_list);
2602 2229 mutex_destroy(&state->id_acache_req_lock);
2603 2230 cv_destroy(&state->id_acache_req_cv);
2604 2231 state->id_mac_state &= ~IBD_DRV_REQ_LIST_INITED;
2605 2232 }
2606 2233
2607 2234 if (progress & IBD_DRV_PD_ALLOCD) {
2608 2235 if ((ret = ibt_free_pd(state->id_hca_hdl,
2609 2236 state->id_pd_hdl)) != IBT_SUCCESS) {
2610 2237 ibd_print_warn(state, "failed to free "
2611 2238 "protection domain, ret=%d", ret);
2612 2239 }
2613 2240 state->id_pd_hdl = NULL;
2614 2241 state->id_mac_state &= (~IBD_DRV_PD_ALLOCD);
2615 2242 }
2616 2243
2617 2244 if (progress & IBD_DRV_HCA_OPENED) {
2618 2245 if ((ret = ibt_close_hca(state->id_hca_hdl)) !=
2619 2246 IBT_SUCCESS) {
2620 2247 ibd_print_warn(state, "failed to close "
2621 2248 "HCA device, ret=%d", ret);
2622 2249 }
2623 2250 state->id_hca_hdl = NULL;
2624 2251 state->id_mac_state &= (~IBD_DRV_HCA_OPENED);
2625 2252 }
2626 2253
2627 2254 mutex_enter(&ibd_gstate.ig_mutex);
2628 2255 if (progress & IBD_DRV_IBTL_ATTACH_DONE) {
2629 2256 if ((ret = ibt_detach(state->id_ibt_hdl)) !=
2630 2257 IBT_SUCCESS) {
2631 2258 ibd_print_warn(state,
2632 2259 "ibt_detach() failed, ret=%d", ret);
2633 2260 }
2634 2261 state->id_ibt_hdl = NULL;
2635 2262 state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE);
2636 2263 ibd_gstate.ig_ibt_hdl_ref_cnt--;
2637 2264 }
2638 2265 if ((ibd_gstate.ig_ibt_hdl_ref_cnt == 0) &&
2639 2266 (ibd_gstate.ig_ibt_hdl != NULL)) {
2640 2267 if ((ret = ibt_detach(ibd_gstate.ig_ibt_hdl)) !=
2641 2268 IBT_SUCCESS) {
2642 2269 ibd_print_warn(state, "ibt_detach(): global "
2643 2270 "failed, ret=%d", ret);
2644 2271 }
2645 2272 ibd_gstate.ig_ibt_hdl = NULL;
2646 2273 }
2647 2274 mutex_exit(&ibd_gstate.ig_mutex);
2648 2275
2649 2276 if (progress & IBD_DRV_TXINTR_ADDED) {
2650 2277 ddi_remove_softintr(state->id_tx);
2651 2278 state->id_tx = NULL;
2652 2279 state->id_mac_state &= (~IBD_DRV_TXINTR_ADDED);
2653 2280 }
2654 2281
2655 2282 if (progress & IBD_DRV_RXINTR_ADDED) {
2656 2283 ddi_remove_softintr(state->id_rx);
2657 2284 state->id_rx = NULL;
2658 2285 state->id_mac_state &= (~IBD_DRV_RXINTR_ADDED);
2659 2286 }
2660 2287
2661 2288 #ifdef DEBUG
2662 2289 if (progress & IBD_DRV_RC_PRIVATE_STATE) {
2663 2290 kstat_delete(state->rc_ksp);
2664 2291 state->id_mac_state &= (~IBD_DRV_RC_PRIVATE_STATE);
2665 2292 }
2666 2293 #endif
2667 2294
2668 2295 if (progress & IBD_DRV_STATE_INITIALIZED) {
2669 2296 ibd_state_fini(state);
2670 2297 state->id_mac_state &= (~IBD_DRV_STATE_INITIALIZED);
2671 2298 }
2672 2299 }
2673 2300
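ibd_part_attach() below and ibd_part_unattach() above pair up through the id_mac_state progress bits: each successful setup step sets a flag, and teardown undoes only the steps whose flags are set, so a partially failed attach unwinds cleanly. A minimal sketch of the idiom follows, with invented step and function names.

#include <sys/types.h>

#define	STEP_A_DONE	0x1
#define	STEP_B_DONE	0x2

extern int setup_a(void), setup_b(void);
extern void teardown_a(void), teardown_b(void);

static uint32_t progress;		/* models id_mac_state */

static void
part_unattach_sketch(void)
{
	/* Undo in reverse order, and only what was actually set up. */
	if (progress & STEP_B_DONE) {
		teardown_b();
		progress &= ~STEP_B_DONE;
	}
	if (progress & STEP_A_DONE) {
		teardown_a();
		progress &= ~STEP_A_DONE;
	}
}

static int
part_attach_sketch(void)
{
	if (setup_a() != 0)
		return (-1);		/* nothing to unwind yet */
	progress |= STEP_A_DONE;

	if (setup_b() != 0)
		return (-1);		/* caller runs part_unattach_sketch() */
	progress |= STEP_B_DONE;
	return (0);
}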
2674 2301 int
2675 2302 ibd_part_attach(ibd_state_t *state, dev_info_t *dip)
2676 2303 {
2677 2304 ibt_status_t ret;
2678 2305 int rv;
2679 2306 kthread_t *kht;
2680 2307
2681 2308 /*
2682 2309 * Initialize mutexes and condition variables
2683 2310 */
2684 2311 if (ibd_state_init(state, dip) != DDI_SUCCESS) {
2685 2312 DPRINT(10, "ibd_part_attach: failed in ibd_state_init()");
2686 2313 return (DDI_FAILURE);
2687 2314 }
2688 2315 state->id_mac_state |= IBD_DRV_STATE_INITIALIZED;
2689 2316
2690 2317 /*
2691 2318 * Allocate rx,tx softintr
2692 2319 */
2693 2320 if (ibd_rx_softintr == 1) {
2694 2321 if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx,
2695 2322 NULL, NULL, ibd_intr, (caddr_t)state)) != DDI_SUCCESS) {
2696 2323 DPRINT(10, "ibd_part_attach: failed in "
2697 2324 "ddi_add_softintr(id_rx), ret=%d", rv);
2698 2325 return (DDI_FAILURE);
2699 2326 }
2700 2327 state->id_mac_state |= IBD_DRV_RXINTR_ADDED;
2701 2328 }
2702 2329 if (ibd_tx_softintr == 1) {
2703 2330 if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx,
2704 2331 NULL, NULL, ibd_tx_recycle,
2705 2332 (caddr_t)state)) != DDI_SUCCESS) {
2706 2333 DPRINT(10, "ibd_part_attach: failed in "
2707 2334 "ddi_add_softintr(id_tx), ret=%d", rv);
2708 2335 return (DDI_FAILURE);
2709 2336 }
2710 2337 state->id_mac_state |= IBD_DRV_TXINTR_ADDED;
2711 2338 }
2712 2339
2713 2340 /*
2714 2341 * Attach to IBTL
2715 2342 */
2716 2343 mutex_enter(&ibd_gstate.ig_mutex);
2717 2344 if (ibd_gstate.ig_ibt_hdl == NULL) {
2718 2345 if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state,
2719 2346 &ibd_gstate.ig_ibt_hdl)) != IBT_SUCCESS) {
2720 2347 DPRINT(10, "ibd_part_attach: global: failed in "
2721 2348 "ibt_attach(), ret=%d", ret);
2722 2349 mutex_exit(&ibd_gstate.ig_mutex);
2723 2350 return (DDI_FAILURE);
2724 2351 }
2725 2352 }
2726 2353 if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state,
2727 2354 &state->id_ibt_hdl)) != IBT_SUCCESS) {
2728 2355 DPRINT(10, "ibd_part_attach: failed in ibt_attach(), ret=%d",
2729 2356 ret);
2730 2357 mutex_exit(&ibd_gstate.ig_mutex);
2731 2358 return (DDI_FAILURE);
2732 2359 }
2733 2360 ibd_gstate.ig_ibt_hdl_ref_cnt++;
2734 2361 mutex_exit(&ibd_gstate.ig_mutex);
2735 2362 state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE;
2736 2363
2737 2364 /*
2738 2365 * Open the HCA
2739 2366 */
2740 2367 if ((ret = ibt_open_hca(state->id_ibt_hdl, state->id_hca_guid,
2741 2368 &state->id_hca_hdl)) != IBT_SUCCESS) {
2742 2369 DPRINT(10, "ibd_part_attach: ibt_open_hca() failed, ret=%d",
2743 2370 ret);
2744 2371 return (DDI_FAILURE);
2745 2372 }
2746 2373 state->id_mac_state |= IBD_DRV_HCA_OPENED;
2747 2374
2748 2375 #ifdef DEBUG
2749 2376 /* Initialize Driver Counters for Reliable Connected Mode */
2750 2377 if (state->id_enable_rc) {
2751 2378 if (ibd_rc_init_stats(state) != DDI_SUCCESS) {
2752 2379 DPRINT(10, "ibd_part_attach: failed in "
2753 2380 "ibd_rc_init_stats");
2754 2381 return (DDI_FAILURE);
2755 2382 }
2756 2383 state->id_mac_state |= IBD_DRV_RC_PRIVATE_STATE;
2757 2384 }
2758 2385 #endif
2759 2386
2760 2387 /*
2761 2388 * Record capabilities
2762 2389 */
2763 2390 (void) ibd_record_capab(state);
2764 2391
2765 2392 /*
2766 2393 * Allocate a protection domain on the HCA
2767 2394 */
2768 2395 if ((ret = ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS,
2769 2396 &state->id_pd_hdl)) != IBT_SUCCESS) {
2770 2397 DPRINT(10, "ibd_part_attach: ibt_alloc_pd() failed, ret=%d",
2771 2398 ret);
2772 2399 return (DDI_FAILURE);
2773 2400 }
2774 2401 state->id_mac_state |= IBD_DRV_PD_ALLOCD;
2775 2402
2776 2403
2777 2404 /*
2778 2405	 * We need to initialize the req_list that is required for the
2779 2406 * operation of the async_thread.
2780 2407 */
2781 2408 mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL);
2782 2409 cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL);
2783 2410 list_create(&state->id_req_list, sizeof (ibd_req_t),
2784 2411 offsetof(ibd_req_t, rq_list));
2785 2412 state->id_mac_state |= IBD_DRV_REQ_LIST_INITED;
2786 2413
2787 2414 /*
2788 2415 * Create the async thread; thread_create never fails.
2789 2416 */
2790 2417 kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0,
2791 2418 TS_RUN, minclsyspri);
2792 2419 state->id_async_thrid = kht->t_did;
2793 2420 state->id_mac_state |= IBD_DRV_ASYNC_THR_CREATED;
2794 2421
2795 2422 return (DDI_SUCCESS);
2796 2423 }
2797 2424
2798 2425 /*
2799 2426 * Attach device to the IO framework.
2800 2427 */
2801 2428 static int
2802 2429 ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2803 2430 {
2804 2431 int ret;
2805 2432
2806 2433 switch (cmd) {
2807 2434 case DDI_ATTACH:
2808 2435 ret = ibd_port_attach(dip);
2809 2436 break;
2810 2437 default:
2811 2438 ret = DDI_FAILURE;
2812 2439 break;
2813 2440 }
2814 2441 return (ret);
2815 2442 }
2816 2443
2817 2444 /*
2818 2445 * Detach device from the IO framework.
2819 2446 */
2820 2447 static int
2821 2448 ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2822 2449 {
2823 2450 ibd_state_t *state;
2824 2451 int instance;
2825 2452
2826 2453 /*
2827 2454 * IBD doesn't support suspend/resume
2828 2455 */
2829 2456 if (cmd != DDI_DETACH)
2830 2457 return (DDI_FAILURE);
2831 2458
2832 2459 /*
2833 2460 * Get the instance softstate
2834 2461 */
2835 2462 instance = ddi_get_instance(dip);
2836 2463 state = ddi_get_soft_state(ibd_list, instance);
2837 2464
2838 2465 /*
2839 2466 * Release all resources we're holding still. Note that if we'd
2840 2467 * done ibd_attach(), ibd_m_start() and ibd_m_stop() correctly
2841 2468 * so far, we should find all the flags we need in id_mac_state.
2842 2469 */
2843 2470 return (ibd_port_unattach(state, dip));
2844 2471 }
2845 2472
2846 2473 /*
2847 2474 * Pre ibt_attach() driver initialization
2848 2475 */
2849 2476 static int
2850 2477 ibd_state_init(ibd_state_t *state, dev_info_t *dip)
2851 2478 {
2852 2479 char buf[64];
2853 2480
2854 2481 mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL);
2855 2482 state->id_link_state = LINK_STATE_UNKNOWN;
2856 2483
2857 2484 mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL);
2858 2485 cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL);
2859 2486 state->id_trap_stop = B_TRUE;
2860 2487 state->id_trap_inprog = 0;
2861 2488
2862 2489 mutex_init(&state->id_scq_poll_lock, NULL, MUTEX_DRIVER, NULL);
2863 2490 mutex_init(&state->id_rcq_poll_lock, NULL, MUTEX_DRIVER, NULL);
2864 2491 state->id_dip = dip;
2865 2492
2866 2493 mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL);
2867 2494
2868 2495 mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2869 2496 mutex_init(&state->id_tx_rel_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2870 2497 mutex_init(&state->id_txpost_lock, NULL, MUTEX_DRIVER, NULL);
2871 2498 state->id_tx_busy = 0;
2872 2499 mutex_init(&state->id_lso_lock, NULL, MUTEX_DRIVER, NULL);
2873 2500
2874 2501 state->id_rx_list.dl_bufs_outstanding = 0;
2875 2502 state->id_rx_list.dl_cnt = 0;
2876 2503 mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2877 2504 mutex_init(&state->id_rx_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2878 2505 (void) sprintf(buf, "ibd_req%d_%x_%u", ddi_get_instance(dip),
2879 2506 state->id_pkey, state->id_plinkid);
2880 2507 state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t),
2881 2508 0, NULL, NULL, NULL, NULL, NULL, 0);
2882 2509
2883 2510 /* For Reliable Connected Mode */
2884 2511 mutex_init(&state->rc_rx_lock, NULL, MUTEX_DRIVER, NULL);
2885 2512 mutex_init(&state->rc_tx_large_bufs_lock, NULL, MUTEX_DRIVER, NULL);
2886 2513 mutex_init(&state->rc_srq_rwqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2887 2514 mutex_init(&state->rc_srq_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2888 2515 mutex_init(&state->rc_pass_chan_list.chan_list_mutex, NULL,
2889 2516 MUTEX_DRIVER, NULL);
2890 2517 mutex_init(&state->rc_timeout_lock, NULL, MUTEX_DRIVER, NULL);
2891 2518
2892 2519 /*
2893 2520	 * Set the default link mode to RC. If this fails during connection
2894 2521 * setup, the link mode is automatically transitioned to UD.
2895 2522 * Also set the RC MTU.
2896 2523 */
2897 2524 state->id_enable_rc = IBD_DEF_LINK_MODE;
2898 2525 state->rc_mtu = IBD_DEF_RC_MAX_MTU;
2899 2526 state->id_mtu = IBD_DEF_MAX_MTU;
2900 2527
2901 2528	/* Initialize all tunables to their defaults */
2902 2529 state->id_lso_policy = IBD_DEF_LSO_POLICY;
2903 2530 state->id_num_lso_bufs = IBD_DEF_NUM_LSO_BUFS;
2904 2531 state->id_num_ah = IBD_DEF_NUM_AH;
2905 2532 state->id_hash_size = IBD_DEF_HASH_SIZE;
2906 2533 state->id_create_broadcast_group = IBD_DEF_CREATE_BCAST_GROUP;
2907 2534 state->id_allow_coalesce_comp_tuning = IBD_DEF_COALESCE_COMPLETIONS;
2908 2535 state->id_ud_rx_comp_count = IBD_DEF_UD_RX_COMP_COUNT;
2909 2536 state->id_ud_rx_comp_usec = IBD_DEF_UD_RX_COMP_USEC;
2910 2537 state->id_ud_tx_comp_count = IBD_DEF_UD_TX_COMP_COUNT;
2911 2538 state->id_ud_tx_comp_usec = IBD_DEF_UD_TX_COMP_USEC;
2912 2539 state->id_rc_rx_comp_count = IBD_DEF_RC_RX_COMP_COUNT;
2913 2540 state->id_rc_rx_comp_usec = IBD_DEF_RC_RX_COMP_USEC;
2914 2541 state->id_rc_tx_comp_count = IBD_DEF_RC_TX_COMP_COUNT;
2915 2542 state->id_rc_tx_comp_usec = IBD_DEF_RC_TX_COMP_USEC;
2916 2543 state->id_ud_tx_copy_thresh = IBD_DEF_UD_TX_COPY_THRESH;
2917 2544 state->id_rc_rx_copy_thresh = IBD_DEF_RC_RX_COPY_THRESH;
2918 2545 state->id_rc_tx_copy_thresh = IBD_DEF_RC_TX_COPY_THRESH;
2919 2546 state->id_ud_num_rwqe = IBD_DEF_UD_NUM_RWQE;
2920 2547 state->id_ud_num_swqe = IBD_DEF_UD_NUM_SWQE;
2921 2548 state->id_rc_num_rwqe = IBD_DEF_RC_NUM_RWQE;
2922 2549 state->id_rc_num_swqe = IBD_DEF_RC_NUM_SWQE;
2923 2550 state->rc_enable_srq = IBD_DEF_RC_ENABLE_SRQ;
2924 2551 state->id_rc_num_srq = IBD_DEF_RC_NUM_SRQ;
2925 2552 state->id_rc_rx_rwqe_thresh = IBD_DEF_RC_RX_RWQE_THRESH;
2926 2553
2927 2554 return (DDI_SUCCESS);
2928 2555 }
2929 2556
2930 2557 /*
2931 2558 * Post ibt_detach() driver deconstruction
2932 2559 */
2933 2560 static void
2934 2561 ibd_state_fini(ibd_state_t *state)
2935 2562 {
2936 2563 kmem_cache_destroy(state->id_req_kmc);
2937 2564
2938 2565 mutex_destroy(&state->id_rx_list.dl_mutex);
2939 2566 mutex_destroy(&state->id_rx_free_list.dl_mutex);
2940 2567
2941 2568 mutex_destroy(&state->id_txpost_lock);
2942 2569 mutex_destroy(&state->id_tx_list.dl_mutex);
2943 2570 mutex_destroy(&state->id_tx_rel_list.dl_mutex);
2944 2571 mutex_destroy(&state->id_lso_lock);
2945 2572
2946 2573 mutex_destroy(&state->id_sched_lock);
2947 2574 mutex_destroy(&state->id_scq_poll_lock);
2948 2575 mutex_destroy(&state->id_rcq_poll_lock);
2949 2576
2950 2577 cv_destroy(&state->id_trap_cv);
2951 2578 mutex_destroy(&state->id_trap_lock);
2952 2579 mutex_destroy(&state->id_link_mutex);
2953 2580
2954 2581 /* For Reliable Connected Mode */
2955 2582 mutex_destroy(&state->rc_timeout_lock);
2956 2583 mutex_destroy(&state->rc_srq_free_list.dl_mutex);
2957 2584 mutex_destroy(&state->rc_srq_rwqe_list.dl_mutex);
2958 2585 mutex_destroy(&state->rc_pass_chan_list.chan_list_mutex);
2959 2586 mutex_destroy(&state->rc_tx_large_bufs_lock);
2960 2587 mutex_destroy(&state->rc_rx_lock);
2961 2588 }
2962 2589
2963 2590 /*
2964 2591 * Fetch link speed from SA for snmp ifspeed reporting.
2965 2592 */
2966 2593 static uint64_t
2967 2594 ibd_get_portspeed(ibd_state_t *state)
2968 2595 {
2969 2596 int ret;
2970 2597 ibt_path_info_t path;
2971 2598 ibt_path_attr_t path_attr;
2972 2599 uint8_t num_paths;
2973 2600 uint64_t ifspeed;
2974 2601
2975 2602 /*
2976 2603 * Due to serdes 8b10b encoding on the wire, 2.5 Gbps on wire
2977 2604 * translates to 2 Gbps data rate. Thus, 1X single data rate is
2978 2605 * 2000000000. Start with that as default.
2979 2606 */
2980 2607 ifspeed = 2000000000;
2981 2608
2982 2609 bzero(&path_attr, sizeof (path_attr));
2983 2610
2984 2611 /*
2985 2612 * Get the port speed from Loopback path information.
2986 2613 */
2987 2614 path_attr.pa_dgids = &state->id_sgid;
2988 2615 path_attr.pa_num_dgids = 1;
2989 2616 path_attr.pa_sgid = state->id_sgid;
2990 2617
2991 2618 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS,
2992 2619 &path_attr, 1, &path, &num_paths) != IBT_SUCCESS)
2993 2620 goto earlydone;
2994 2621
2995 2622 if (num_paths < 1)
2996 2623 goto earlydone;
2997 2624
2998 2625 /*
2999 2626 * In case SA does not return an expected value, report the default
3000 2627 * speed as 1X.
3001 2628 */
3002 2629 ret = 1;
3003 2630 switch (path.pi_prim_cep_path.cep_adds_vect.av_srate) {
3004 2631 case IBT_SRATE_2: /* 1X SDR i.e 2.5 Gbps */
3005 2632 ret = 1;
3006 2633 break;
3007 2634 case IBT_SRATE_10: /* 4X SDR or 1X QDR i.e 10 Gbps */
3008 2635 ret = 4;
3009 2636 break;
3010 2637 case IBT_SRATE_30: /* 12X SDR i.e 30 Gbps */
3011 2638 ret = 12;
3012 2639 break;
3013 2640 case IBT_SRATE_5: /* 1X DDR i.e 5 Gbps */
3014 2641 ret = 2;
3015 2642 break;
3016 2643 case IBT_SRATE_20: /* 4X DDR or 8X SDR i.e 20 Gbps */
3017 2644 ret = 8;
3018 2645 break;
3019 2646 case IBT_SRATE_40: /* 8X DDR or 4X QDR i.e 40 Gbps */
3020 2647 ret = 16;
3021 2648 break;
3022 2649 case IBT_SRATE_60: /* 12X DDR i.e 60 Gbps */
3023 2650 ret = 24;
3024 2651 break;
3025 2652 case IBT_SRATE_80: /* 8X QDR i.e 80 Gbps */
3026 2653 ret = 32;
3027 2654 break;
3028 2655 case IBT_SRATE_120: /* 12X QDR i.e 120 Gbps */
3029 2656 ret = 48;
3030 2657 break;
3031 2658 }
3032 2659
3033 2660 ifspeed *= ret;
3034 2661
3035 2662 earlydone:
3036 2663 return (ifspeed);
3037 2664 }
3038 2665
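To make the arithmetic in ibd_get_portspeed() concrete: 8b/10b coding carries 8 data bits in every 10 signalling bits, so the 2.5 Gb/s 1X signalling rate yields 2.0 Gb/s of data, which is the 2000000000 default above. A 4X DDR link (IBT_SRATE_20) therefore reports 8 * 2 Gb/s = 16000000000, and a 12X QDR link (IBT_SRATE_120) reports 48 * 2 Gb/s = 96000000000.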
3039 2666 /*
3040 2667 * Search input mcg list (id_mc_full or id_mc_non) for an entry
3041 2668 * representing the input mcg mgid.
3042 2669 */
3043 2670 static ibd_mce_t *
3044 2671 ibd_mcache_find(ib_gid_t mgid, struct list *mlist)
3045 2672 {
3046 2673 ibd_mce_t *ptr = list_head(mlist);
3047 2674
3048 2675 /*
3049 2676 * Do plain linear search.
3050 2677 */
3051 2678 while (ptr != NULL) {
3052 2679 if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid,
3053 2680 sizeof (ib_gid_t)) == 0)
3054 2681 return (ptr);
3055 2682 ptr = list_next(mlist, ptr);
3056 2683 }
3057 2684 return (NULL);
3058 2685 }
3059 2686
3060 2687 /*
3061 2688 * Execute IBA JOIN.
3062 2689 */
3063 2690 static ibt_status_t
3064 2691 ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce)
3065 2692 {
3066 2693 ibt_mcg_attr_t mcg_attr;
3067 2694
3068 2695 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
3069 2696 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
3070 2697 mcg_attr.mc_mgid = mgid;
3071 2698 mcg_attr.mc_join_state = mce->mc_jstate;
3072 2699 mcg_attr.mc_scope = state->id_scope;
3073 2700 mcg_attr.mc_pkey = state->id_pkey;
3074 2701 mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow;
3075 2702 mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
3076 2703 mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass;
3077 2704 return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info,
3078 2705 NULL, NULL));
3079 2706 }
3080 2707
3081 2708 /*
3082 2709 * This code JOINs the port in the proper way (depending on the join
3083 2710 * state) so that IBA fabric will forward mcg packets to/from the port.
3084 2711 * It also attaches the QPN to the mcg so it can receive those mcg
3085 2712 * packets. This code makes sure not to attach the mcg to the QP if
3086 2713 * that has been previously done due to the mcg being joined with a
3087 2714 * different join state, even though this is not required by SWG_0216,
3088 2715 * refid 3610.
3089 2716 */
3090 2717 static ibd_mce_t *
3091 2718 ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
3092 2719 {
3093 2720 ibt_status_t ibt_status;
3094 2721 ibd_mce_t *mce, *tmce, *omce = NULL;
3095 2722 boolean_t do_attach = B_TRUE;
3096 2723
3097 2724 DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n",
3098 2725 jstate, mgid.gid_prefix, mgid.gid_guid);
3099 2726
3100 2727 /*
3101 2728 * For enable_multicast Full member joins, we need to do some
3102 2729 * extra work. If there is already an mce on the list that
3103 2730 * indicates full membership, that means the membership has
3104 2731 * not yet been dropped (since the disable_multicast was issued)
3105 2732 * because there are pending Tx's to the mcg; in that case, just
3106 2733 * mark the mce not to be reaped when the Tx completion queues
3107 2734 * an async reap operation.
3108 2735 *
3109 2736 * If there is already an mce on the list indicating sendonly
3110 2737 * membership, try to promote to full membership. Be careful
3111 2738 * not to deallocate the old mce, since there might be an AH
3112 2739 * pointing to it; instead, update the old mce with new data
3113 2740 * that tracks the full membership.
3114 2741 */
3115 2742 if ((jstate == IB_MC_JSTATE_FULL) && ((omce =
3116 2743 IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) {
3117 2744 if (omce->mc_jstate == IB_MC_JSTATE_FULL) {
3118 2745 ASSERT(omce->mc_fullreap);
3119 2746 omce->mc_fullreap = B_FALSE;
3120 2747 return (omce);
3121 2748 } else {
3122 2749 ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON);
3123 2750 }
3124 2751 }
3125 2752
3126 2753 /*
3127 2754 * Allocate the ibd_mce_t to track this JOIN.
3128 2755 */
3129 2756 mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP);
3130 2757 mce->mc_fullreap = B_FALSE;
3131 2758 mce->mc_jstate = jstate;
3132 2759
3133 2760 if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) {
3134 2761 DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d",
3135 2762 ibt_status);
3136 2763 kmem_free(mce, sizeof (ibd_mce_t));
3137 2764 return (NULL);
3138 2765 }
3139 2766
3140 2767 /*
3141 2768 * Is an IBA attach required? Not if the interface is already joined
3142 2769 * to the mcg in a different appropriate join state.
3143 2770 */
3144 2771 if (jstate == IB_MC_JSTATE_NON) {
3145 2772 tmce = IBD_MCACHE_FIND_FULL(state, mgid);
3146 2773 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
3147 2774 do_attach = B_FALSE;
3148 2775 } else if (jstate == IB_MC_JSTATE_FULL) {
3149 2776 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
3150 2777 do_attach = B_FALSE;
3151 2778 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
3152 2779 do_attach = B_FALSE;
3153 2780 }
3154 2781
3155 2782 if (do_attach) {
3156 2783 /*
3157 2784 * Do the IBA attach.
3158 2785 */
3159 2786 DPRINT(10, "ibd_join_group: ibt_attach_mcg \n");
3160 2787 if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl,
3161 2788 &mce->mc_info)) != IBT_SUCCESS) {
3162 2789 DPRINT(10, "ibd_join_group : failed qp attachment "
3163 2790 "%d\n", ibt_status);
3164 2791 /*
3165 2792 * NOTE that we should probably preserve the join info
3166 2793 * in the list and later try to leave again at detach
3167 2794 * time.
3168 2795 */
3169 2796 (void) ibt_leave_mcg(state->id_sgid, mgid,
3170 2797 state->id_sgid, jstate);
3171 2798 kmem_free(mce, sizeof (ibd_mce_t));
3172 2799 return (NULL);
3173 2800 }
3174 2801 }
3175 2802
3176 2803 /*
3177 2804 * Insert the ibd_mce_t in the proper list.
3178 2805 */
3179 2806 if (jstate == IB_MC_JSTATE_NON) {
3180 2807 IBD_MCACHE_INSERT_NON(state, mce);
3181 2808 } else {
3182 2809 /*
3183 2810 * Set up the mc_req fields used for reaping the
3184 2811 * mcg in case of delayed tx completion (see
3185 2812 * ibd_tx_cleanup()). Also done for sendonly join in
3186 2813 * case we are promoted to fullmembership later and
3187 2814 * keep using the same mce.
3188 2815 */
3189 2816 mce->mc_req.rq_gid = mgid;
3190 2817 mce->mc_req.rq_ptr = mce;
3191 2818 /*
3192 2819 			 * Check whether we are trying to join as a full
3193 2820 			 * member while we were already joined send-only.
3194 2821 * We try to drop our SendOnly membership, but it is
3195 2822 * possible that the mcg does not exist anymore (and
3196 2823 * the subnet trap never reached us), so the leave
3197 2824 * operation might fail.
3198 2825 */
3199 2826 if (omce != NULL) {
3200 2827 (void) ibt_leave_mcg(state->id_sgid, mgid,
3201 2828 state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON);
3202 2829 omce->mc_jstate = IB_MC_JSTATE_FULL;
3203 2830 bcopy(&mce->mc_info, &omce->mc_info,
3204 2831 sizeof (ibt_mcg_info_t));
3205 2832 kmem_free(mce, sizeof (ibd_mce_t));
3206 2833 return (omce);
3207 2834 }
3208 2835 mutex_enter(&state->id_mc_mutex);
3209 2836 IBD_MCACHE_INSERT_FULL(state, mce);
3210 2837 mutex_exit(&state->id_mc_mutex);
3211 2838 }
3212 2839
3213 2840 return (mce);
3214 2841 }
3215 2842
3216 2843 /*
3217 2844 * Called during port up event handling to attempt to reacquire full
3218 2845 * membership to an mcg. Stripped down version of ibd_join_group().
3219 2846 * Note that it is possible that the mcg might have gone away, and
3220 2847 * gets recreated at this point.
3221 2848 */
3222 2849 static void
3223 2850 ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce)
3224 2851 {
3225 2852 ib_gid_t mgid;
3226 2853
3227 2854 /*
3228 2855 * If the mc_fullreap flag is set, or this join fails, a subsequent
3229 2856 * reap/leave is going to try to leave the group. We could prevent
3230 2857 * that by adding a boolean flag into ibd_mce_t, if required.
3231 2858 */
3232 2859 if (mce->mc_fullreap)
3233 2860 return;
3234 2861
3235 2862 mgid = mce->mc_info.mc_adds_vect.av_dgid;
3236 2863
3237 2864 DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix,
3238 2865 mgid.gid_guid);
3239 2866
3240 2867 /* While reacquiring, leave and then join the MCG */
3241 2868 (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid,
3242 2869 mce->mc_jstate);
3243 2870 if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS)
3244 2871 ibd_print_warn(state, "Failure on port up to rejoin "
3245 2872 "multicast gid %016llx:%016llx",
3246 2873 (u_longlong_t)mgid.gid_prefix,
3247 2874 (u_longlong_t)mgid.gid_guid);
3248 2875 }
3249 2876
3250 2877 /*
3251 2878 * This code handles delayed Tx completion cleanups for mcg's to which
3252 2879 * disable_multicast has been issued, regular mcg related cleanups during
3253 2880 * disable_multicast, disable_promiscuous and mcg traps, as well as
3254 2881 * cleanups during driver detach time. Depending on the join state,
3255 2882 * it deletes the mce from the appropriate list and issues the IBA
3256 2883 * leave/detach; except in the disable_multicast case when the mce
3257 2884 * is left on the active list for a subsequent Tx completion cleanup.
3258 2885 */
3259 2886 static void
3260 2887 ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid,
3261 2888 uint8_t jstate)
3262 2889 {
3263 2890 ibd_mce_t *tmce;
3264 2891 boolean_t do_detach = B_TRUE;
3265 2892
3266 2893 /*
3267 2894 * Before detaching, we must check whether the other list
3268 2895 * contains the mcg; if we detach blindly, the consumer
3269 2896 * who set up the other list will also stop receiving
3270 2897 * traffic.
3271 2898 */
3272 2899 if (jstate == IB_MC_JSTATE_FULL) {
3273 2900 /*
3274 2901 * The following check is only relevant while coming
3275 2902 * from the Tx completion path in the reap case.
3276 2903 */
3277 2904 if (!mce->mc_fullreap)
3278 2905 return;
3279 2906 mutex_enter(&state->id_mc_mutex);
3280 2907 IBD_MCACHE_PULLOUT_FULL(state, mce);
3281 2908 mutex_exit(&state->id_mc_mutex);
3282 2909 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
3283 2910 do_detach = B_FALSE;
3284 2911 } else if (jstate == IB_MC_JSTATE_NON) {
3285 2912 IBD_MCACHE_PULLOUT_NON(state, mce);
3286 2913 tmce = IBD_MCACHE_FIND_FULL(state, mgid);
3287 2914 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
3288 2915 do_detach = B_FALSE;
3289 2916 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
3290 2917 mutex_enter(&state->id_mc_mutex);
3291 2918 IBD_MCACHE_PULLOUT_FULL(state, mce);
3292 2919 mutex_exit(&state->id_mc_mutex);
3293 2920 do_detach = B_FALSE;
3294 2921 }
3295 2922
3296 2923 /*
3297 2924 * If we are reacting to a mcg trap and leaving our sendonly or
3298 2925 * non membership, the mcg is possibly already gone, so attempting
3299 2926 * to leave might fail. On the other hand, we must try to leave
3300 2927 * anyway, since this might be a trap from long ago, and we could
3301 2928 * have potentially sendonly joined to a recent incarnation of
3302 2929 	 * the mcg and are about to lose track of this information.
3303 2930 */
3304 2931 if (do_detach) {
3305 2932 DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : "
3306 2933 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
3307 2934 (void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info);
3308 2935 }
3309 2936
3310 2937 (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate);
3311 2938 kmem_free(mce, sizeof (ibd_mce_t));
3312 2939 }
3313 2940
3314 2941 /*
3315 2942 * Async code executed due to multicast and promiscuous disable requests
3316 2943 * and mcg trap handling; also executed during driver detach. Mostly, a
3317 2944 * leave and detach is done; except for the fullmember case when Tx
3318 2945  * requests are pending, in which case arrangements are made for subsequent
3319 2946 * cleanup on Tx completion.
3320 2947 */
3321 2948 static void
3322 2949 ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
3323 2950 {
3324 2951 ipoib_mac_t mcmac;
3325 2952 boolean_t recycled;
3326 2953 ibd_mce_t *mce;
3327 2954
3328 2955 DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n",
3329 2956 jstate, mgid.gid_prefix, mgid.gid_guid);
3330 2957
3331 2958 if (jstate == IB_MC_JSTATE_NON) {
3332 2959 recycled = B_TRUE;
3333 2960 mce = IBD_MCACHE_FIND_NON(state, mgid);
3334 2961 /*
3335 2962 * In case we are handling a mcg trap, we might not find
3336 2963 * the mcg in the non list.
3337 2964 */
3338 2965 if (mce == NULL) {
3339 2966 return;
3340 2967 }
3341 2968 } else {
3342 2969 mce = IBD_MCACHE_FIND_FULL(state, mgid);
3343 2970
3344 2971 /*
3345 2972 * In case we are handling a mcg trap, make sure the trap
3346 2973 * is not arriving late; if we have an mce that indicates
3347 2974 * that we are already a fullmember, that would be a clear
3348 2975 * indication that the trap arrived late (ie, is for a
3349 2976 * previous incarnation of the mcg).
3350 2977 */
3351 2978 if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) {
3352 2979 if ((mce == NULL) || (mce->mc_jstate ==
3353 2980 IB_MC_JSTATE_FULL)) {
3354 2981 return;
3355 2982 }
3356 2983 } else {
3357 2984 ASSERT(jstate == IB_MC_JSTATE_FULL);
3358 2985
3359 2986 /*
3360 2987 * If join group failed, mce will be NULL here.
3361 2988 			 * This is because in the GLDv3 driver, the
3362 2989 			 * set-multicast entry point always returns success.
3363 2990 */
3364 2991 if (mce == NULL) {
3365 2992 return;
3366 2993 }
3367 2994
3368 2995 mce->mc_fullreap = B_TRUE;
3369 2996 }
3370 2997
3371 2998 /*
3372 2999 * If no pending Tx's remain that reference the AH
3373 3000 * for the mcg, recycle it from active to free list.
3374 3001 * Else in the IB_MC_JSTATE_FULL case, just mark the AH,
3375 3002 * so the last completing Tx will cause an async reap
3376 3003 * operation to be invoked, at which time we will drop our
3377 3004 * membership to the mcg so that the pending Tx's complete
3378 3005 * successfully. Refer to comments on "AH and MCE active
3379 3006 * list manipulation" at top of this file. The lock protects
3380 3007 * against Tx fast path and Tx cleanup code.
3381 3008 */
3382 3009 mutex_enter(&state->id_ac_mutex);
3383 3010 ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid);
3384 3011 recycled = ibd_acache_recycle(state, &mcmac, (jstate ==
3385 3012 IB_MC_JSTATE_SEND_ONLY_NON));
3386 3013 mutex_exit(&state->id_ac_mutex);
3387 3014 }
3388 3015
3389 3016 if (recycled) {
3390 3017 DPRINT(2, "ibd_leave_group : leave_group reaping : "
3391 3018 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
3392 3019 ibd_async_reap_group(state, mce, mgid, jstate);
3393 3020 }
3394 3021 }
3395 3022
3396 3023 /*
3397 3024 * Find the broadcast address as defined by IPoIB; implicitly
3398 3025 * determines the IBA scope, mtu, tclass etc of the link the
3399 3026 * interface is going to be a member of.
3400 3027 */
3401 3028 static ibt_status_t
3402 3029 ibd_find_bgroup(ibd_state_t *state)
3403 3030 {
3404 3031 ibt_mcg_attr_t mcg_attr;
3405 3032 uint_t numg;
3406 3033 uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL,
3407 3034 IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL,
3408 3035 IB_MC_SCOPE_GLOBAL };
3409 3036 int i, mcgmtu;
3410 3037 boolean_t found = B_FALSE;
3411 3038 int ret;
3412 3039 ibt_mcg_info_t mcg_info;
3413 3040
3414 3041 state->id_bgroup_created = B_FALSE;
3415 3042 state->id_bgroup_present = B_FALSE;
3416 3043
3417 3044 query_bcast_grp:
3418 3045 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
3419 3046 mcg_attr.mc_pkey = state->id_pkey;
3420 - _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
3421 3047 state->id_mgid.gid_guid = IB_MGID_IPV4_LOWGRP_MASK;
3422 - _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))
3423 3048
3424 3049 for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) {
3425 3050 state->id_scope = mcg_attr.mc_scope = scopes[i];
3426 3051
3427 3052 /*
3428 3053 * Look for the IPoIB broadcast group.
3429 3054 */
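		/*
		 * Sketch of the resulting IPoIB MGID (assuming the usual
		 * values IB_MCGID_IPV4_PREFIX == 0xff10401b and
		 * IB_MGID_IPV4_LOWGRP_MASK == 0xffffffff): with the default
		 * pkey 0xffff and subnet-local scope 0x2, the prefix becomes
		 * 0xff12401bffff0000, i.e. the well-known broadcast group
		 * ff12:401b:ffff:0000:0000:0000:ffff:ffff.
		 */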
3430 - _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
3431 3055 state->id_mgid.gid_prefix =
3432 3056 (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
3433 3057 ((uint64_t)state->id_scope << 48) |
3434 3058 ((uint32_t)(state->id_pkey << 16)));
3435 3059 mcg_attr.mc_mgid = state->id_mgid;
3436 - _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))
3437 3060 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1,
3438 3061 &state->id_mcinfo, &numg) == IBT_SUCCESS) {
3439 3062 found = B_TRUE;
3440 3063 break;
3441 3064 }
3442 3065 }
3443 3066
3444 3067 if (!found) {
3445 3068 if (state->id_create_broadcast_group) {
3446 3069 /*
3447 3070 * If we created the broadcast group, but failed to
3448 3071 * find it, we can't do anything except leave the
3449 3072 * one we created and return failure.
3450 3073 */
3451 3074 if (state->id_bgroup_created) {
3452 3075 ibd_print_warn(state, "IPoIB broadcast group "
3453 3076 "absent. Unable to query after create.");
3454 3077 goto find_bgroup_fail;
3455 3078 }
3456 3079
3457 3080 /*
3458 3081 * Create the ipoib broadcast group if it didn't exist
3459 3082 */
3460 3083 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
3461 3084 mcg_attr.mc_qkey = IBD_DEFAULT_QKEY;
3462 3085 mcg_attr.mc_join_state = IB_MC_JSTATE_FULL;
3463 3086 mcg_attr.mc_scope = IB_MC_SCOPE_SUBNET_LOCAL;
3464 3087 mcg_attr.mc_pkey = state->id_pkey;
3465 3088 mcg_attr.mc_flow = 0;
3466 3089 mcg_attr.mc_sl = 0;
3467 3090 mcg_attr.mc_tclass = 0;
3468 - _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
3469 3091 state->id_mgid.gid_prefix =
3470 3092 (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
3471 3093 ((uint64_t)IB_MC_SCOPE_SUBNET_LOCAL << 48) |
3472 3094 ((uint32_t)(state->id_pkey << 16)));
3473 3095 mcg_attr.mc_mgid = state->id_mgid;
3474 - _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))
3475 3096
3476 3097 if ((ret = ibt_join_mcg(state->id_sgid, &mcg_attr,
3477 3098 &mcg_info, NULL, NULL)) != IBT_SUCCESS) {
3478 3099 ibd_print_warn(state, "IPoIB broadcast group "
3479 3100 "absent, create failed: ret = %d\n", ret);
3480 3101 state->id_bgroup_created = B_FALSE;
3481 3102 return (IBT_FAILURE);
3482 3103 }
3483 3104 state->id_bgroup_created = B_TRUE;
3484 3105 goto query_bcast_grp;
3485 3106 } else {
3486 3107 ibd_print_warn(state, "IPoIB broadcast group absent");
3487 3108 return (IBT_FAILURE);
3488 3109 }
3489 3110 }
3490 3111
3491 3112 /*
3492 3113 * Assert that the mcg mtu <= id_mtu. Fill in updated id_mtu.
3493 3114 */
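	/*
	 * mc_mtu is the IB-encoded MTU (1 = 256 bytes, ..., 5 = 4096 bytes),
	 * so 128 << mc_mtu converts it to bytes; e.g. a 2048-byte group MTU
	 * is encoded as 4, and 128 << 4 = 2048.
	 */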
3494 3115 mcgmtu = (128 << state->id_mcinfo->mc_mtu);
3495 3116 if (state->id_mtu < mcgmtu) {
3496 3117 ibd_print_warn(state, "IPoIB broadcast group MTU %d "
3497 3118 "greater than port's maximum MTU %d", mcgmtu,
3498 3119 state->id_mtu);
3499 3120 ibt_free_mcg_info(state->id_mcinfo, 1);
3500 3121 goto find_bgroup_fail;
3501 3122 }
3502 3123 state->id_mtu = mcgmtu;
3503 3124 state->id_bgroup_present = B_TRUE;
3504 3125
3505 3126 return (IBT_SUCCESS);
3506 3127
3507 3128 find_bgroup_fail:
3508 3129 if (state->id_bgroup_created) {
3509 3130 (void) ibt_leave_mcg(state->id_sgid,
3510 3131 mcg_info.mc_adds_vect.av_dgid, state->id_sgid,
3511 3132 IB_MC_JSTATE_FULL);
3512 3133 }
3513 3134
3514 3135 return (IBT_FAILURE);
3515 3136 }
3516 3137
3517 3138 static int
3518 3139 ibd_alloc_tx_copybufs(ibd_state_t *state)
3519 3140 {
3520 3141 ibt_mr_attr_t mem_attr;
3521 3142
3522 3143 /*
3523 3144 * Allocate one big chunk for all regular tx copy bufs
3524 3145 */
3525 3146 state->id_tx_buf_sz = state->id_mtu;
3526 3147 if (state->id_lso_policy && state->id_lso_capable &&
3527 3148 (state->id_ud_tx_copy_thresh > state->id_mtu)) {
3528 3149 state->id_tx_buf_sz = state->id_ud_tx_copy_thresh;
3529 3150 }
3530 3151
3531 3152 state->id_tx_bufs = kmem_zalloc(state->id_ud_num_swqe *
3532 3153 state->id_tx_buf_sz, KM_SLEEP);
3533 3154
3534 3155 state->id_tx_wqes = kmem_zalloc(state->id_ud_num_swqe *
3535 3156 sizeof (ibd_swqe_t), KM_SLEEP);
3536 3157
3537 3158 /*
3538 3159 * Do one memory registration on the entire txbuf area
3539 3160 */
3540 3161 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_tx_bufs;
3541 3162 mem_attr.mr_len = state->id_ud_num_swqe * state->id_tx_buf_sz;
3542 3163 mem_attr.mr_as = NULL;
3543 3164 mem_attr.mr_flags = IBT_MR_SLEEP;
3544 3165 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
3545 3166 &state->id_tx_mr_hdl, &state->id_tx_mr_desc) != IBT_SUCCESS) {
3546 3167 DPRINT(10, "ibd_alloc_tx_copybufs: ibt_register_mr failed");
3547 3168 kmem_free(state->id_tx_wqes,
3548 3169 state->id_ud_num_swqe * sizeof (ibd_swqe_t));
3549 3170 kmem_free(state->id_tx_bufs,
3550 3171 state->id_ud_num_swqe * state->id_tx_buf_sz);
3551 3172 state->id_tx_bufs = NULL;
3552 3173 return (DDI_FAILURE);
3553 3174 }
3554 3175
3555 3176 return (DDI_SUCCESS);
3556 3177 }
3557 3178
3558 3179 static int
3559 3180 ibd_alloc_tx_lsobufs(ibd_state_t *state)
3560 3181 {
3561 3182 ibt_mr_attr_t mem_attr;
3562 3183 ibd_lsobuf_t *buflist;
3563 3184 ibd_lsobuf_t *lbufp;
3564 3185 ibd_lsobuf_t *tail;
3565 3186 ibd_lsobkt_t *bktp;
3566 3187 uint8_t *membase;
3567 3188 uint8_t *memp;
3568 3189 uint_t memsz;
3569 3190 int i;
3570 3191
3571 3192 /*
3572 3193 * Allocate the lso bucket
3573 3194 */
3574 3195 bktp = kmem_zalloc(sizeof (ibd_lsobkt_t), KM_SLEEP);
3575 3196
3576 3197 /*
3577 3198 * Allocate the entire lso memory and register it
3578 3199 */
3579 3200 memsz = state->id_num_lso_bufs * IBD_LSO_BUFSZ;
3580 3201 membase = kmem_zalloc(memsz, KM_SLEEP);
3581 3202
3582 3203 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)membase;
3583 3204 mem_attr.mr_len = memsz;
3584 3205 mem_attr.mr_as = NULL;
3585 3206 mem_attr.mr_flags = IBT_MR_SLEEP;
3586 3207 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl,
3587 3208 &mem_attr, &bktp->bkt_mr_hdl, &bktp->bkt_mr_desc) != IBT_SUCCESS) {
3588 3209 DPRINT(10, "ibd_alloc_tx_lsobufs: ibt_register_mr failed");
3589 3210 kmem_free(membase, memsz);
3590 3211 kmem_free(bktp, sizeof (ibd_lsobkt_t));
3591 3212 return (DDI_FAILURE);
3592 3213 }
3593 3214
3594 3215 mutex_enter(&state->id_lso_lock);
3595 3216
3596 3217 /*
3597 3218 * Now allocate the buflist. Note that the elements in the buflist and
3598 3219 * the buffers in the lso memory have a permanent 1-1 relation, so we
3599 3220 * can always derive the address of a buflist entry from the address of
3600 3221 * an lso buffer.
3601 3222 */
3602 3223 buflist = kmem_zalloc(state->id_num_lso_bufs * sizeof (ibd_lsobuf_t),
3603 3224 KM_SLEEP);
3604 3225
3605 3226 /*
3606 3227 * Set up the lso buf chain
3607 3228 */
3608 3229 memp = membase;
3609 3230 lbufp = buflist;
3610 3231 for (i = 0; i < state->id_num_lso_bufs; i++) {
3611 3232 lbufp->lb_isfree = 1;
3612 3233 lbufp->lb_buf = memp;
3613 3234 lbufp->lb_next = lbufp + 1;
3614 3235
3615 3236 tail = lbufp;
3616 3237
3617 3238 memp += IBD_LSO_BUFSZ;
3618 3239 lbufp++;
3619 3240 }
3620 3241 tail->lb_next = NULL;
3621 3242
3622 3243 /*
3623 3244 * Set up the LSO buffer information in ibd state
3624 3245 */
3625 3246 bktp->bkt_bufl = buflist;
3626 3247 bktp->bkt_free_head = buflist;
3627 3248 bktp->bkt_mem = membase;
3628 3249 bktp->bkt_nelem = state->id_num_lso_bufs;
3629 3250 bktp->bkt_nfree = bktp->bkt_nelem;
3630 3251
3631 3252 state->id_lso = bktp;
3632 3253 mutex_exit(&state->id_lso_lock);
3633 3254
3634 3255 return (DDI_SUCCESS);
3635 3256 }
3636 3257
3637 3258 /*
3638 3259 * Statically allocate Tx buffer list(s).
3639 3260 */
3640 3261 static int
3641 3262 ibd_init_txlist(ibd_state_t *state)
3642 3263 {
3643 3264 ibd_swqe_t *swqe;
3644 3265 ibt_lkey_t lkey;
3645 3266 int i;
3646 3267 uint_t len;
3647 3268 uint8_t *bufaddr;
3648 3269
3649 3270 if (ibd_alloc_tx_copybufs(state) != DDI_SUCCESS)
3650 3271 return (DDI_FAILURE);
3651 3272
3652 3273 if (state->id_lso_policy && state->id_lso_capable) {
3653 3274 if (ibd_alloc_tx_lsobufs(state) != DDI_SUCCESS)
3654 3275 state->id_lso_capable = B_FALSE;
3655 3276 }
3656 3277
3657 3278 mutex_enter(&state->id_tx_list.dl_mutex);
3658 3279 state->id_tx_list.dl_head = NULL;
3659 3280 state->id_tx_list.dl_pending_sends = B_FALSE;
3660 3281 state->id_tx_list.dl_cnt = 0;
3661 3282 mutex_exit(&state->id_tx_list.dl_mutex);
3662 3283 mutex_enter(&state->id_tx_rel_list.dl_mutex);
3663 3284 state->id_tx_rel_list.dl_head = NULL;
3664 3285 state->id_tx_rel_list.dl_pending_sends = B_FALSE;
3665 3286 state->id_tx_rel_list.dl_cnt = 0;
3666 3287 mutex_exit(&state->id_tx_rel_list.dl_mutex);
3667 3288
3668 3289 /*
3669 3290 * Allocate and setup the swqe list
3670 3291 */
3671 3292 lkey = state->id_tx_mr_desc.md_lkey;
3672 3293 bufaddr = state->id_tx_bufs;
3673 3294 len = state->id_tx_buf_sz;
3674 3295 swqe = state->id_tx_wqes;
3675 3296 mutex_enter(&state->id_tx_list.dl_mutex);
3676 3297 for (i = 0; i < state->id_ud_num_swqe; i++, swqe++, bufaddr += len) {
3677 3298 swqe->swqe_next = NULL;
3678 3299 swqe->swqe_im_mblk = NULL;
3679 3300
3680 3301 swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t)
3681 3302 bufaddr;
3682 3303 swqe->swqe_copybuf.ic_sgl.ds_key = lkey;
3683 3304 swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */
3684 3305
3685 3306 swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe;
3686 3307 swqe->w_swr.wr_flags = IBT_WR_NO_FLAGS;
3687 3308 swqe->w_swr.wr_trans = IBT_UD_SRV;
3688 3309
3689 3310 /* These are set in send */
3690 3311 swqe->w_swr.wr_nds = 0;
3691 3312 swqe->w_swr.wr_sgl = NULL;
3692 3313 swqe->w_swr.wr_opcode = IBT_WRC_SEND;
3693 3314
3694 3315 /* add to list */
3695 3316 state->id_tx_list.dl_cnt++;
3696 3317 swqe->swqe_next = state->id_tx_list.dl_head;
3697 3318 state->id_tx_list.dl_head = SWQE_TO_WQE(swqe);
3698 3319 }
3699 3320 mutex_exit(&state->id_tx_list.dl_mutex);
3700 3321
3701 3322 return (DDI_SUCCESS);
3702 3323 }
3703 3324
3704 3325 static int
3705 3326 ibd_acquire_lsobufs(ibd_state_t *state, uint_t req_sz, ibt_wr_ds_t *sgl_p,
3706 3327 uint32_t *nds_p)
3707 3328 {
3708 3329 ibd_lsobkt_t *bktp;
3709 3330 ibd_lsobuf_t *lbufp;
3710 3331 ibd_lsobuf_t *nextp;
3711 3332 ibt_lkey_t lso_lkey;
3712 3333 uint_t frag_sz;
3713 3334 uint_t num_needed;
3714 3335 int i;
3715 3336
3716 3337 ASSERT(sgl_p != NULL);
3717 3338 ASSERT(nds_p != NULL);
3718 3339 ASSERT(req_sz != 0);
3719 3340
3720 3341 /*
3721 3342 * Determine how many bufs we'd need for the size requested
3722 3343 */
3723 3344 num_needed = req_sz / IBD_LSO_BUFSZ;
3724 3345 if ((frag_sz = req_sz % IBD_LSO_BUFSZ) != 0)
3725 3346 num_needed++;
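	/*
	 * Worked example (assuming the usual IBD_LSO_BUFSZ of 8192 bytes):
	 * a 20000-byte request yields num_needed = 3 and frag_sz = 3616,
	 * with frag_sz used further below to trim the last sgl entry.
	 */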
3726 3347
3727 3348 mutex_enter(&state->id_lso_lock);
3728 3349
3729 3350 /*
3730 3351 * If we don't have enough lso bufs, return failure
3731 3352 */
3732 3353 ASSERT(state->id_lso != NULL);
3733 3354 bktp = state->id_lso;
3734 3355 if (bktp->bkt_nfree < num_needed) {
3735 3356 mutex_exit(&state->id_lso_lock);
3736 3357 return (-1);
3737 3358 }
3738 3359
3739 3360 /*
3740 3361 * Pick the first 'num_needed' bufs from the free list
3741 3362 */
3742 3363 lso_lkey = bktp->bkt_mr_desc.md_lkey;
3743 3364 lbufp = bktp->bkt_free_head;
3744 3365 for (i = 0; i < num_needed; i++) {
3745 3366 ASSERT(lbufp->lb_isfree != 0);
3746 3367 ASSERT(lbufp->lb_buf != NULL);
3747 3368
3748 3369 nextp = lbufp->lb_next;
3749 3370
3750 3371 sgl_p[i].ds_va = (ib_vaddr_t)(uintptr_t)lbufp->lb_buf;
3751 3372 sgl_p[i].ds_key = lso_lkey;
3752 3373 sgl_p[i].ds_len = IBD_LSO_BUFSZ;
3753 3374
3754 3375 lbufp->lb_isfree = 0;
3755 3376 lbufp->lb_next = NULL;
3756 3377
3757 3378 lbufp = nextp;
3758 3379 }
3759 3380 bktp->bkt_free_head = lbufp;
3760 3381
3761 3382 /*
3762 3383 * If the requested size is not a multiple of IBD_LSO_BUFSZ, we need
3763 3384 	 * to adjust the last sgl entry's length. Since we know we need at least
3764 3385 * one, the i-1 use below is ok.
3765 3386 */
3766 3387 if (frag_sz) {
3767 3388 sgl_p[i-1].ds_len = frag_sz;
3768 3389 }
3769 3390
3770 3391 /*
3771 3392 * Update nfree count and return
3772 3393 */
3773 3394 bktp->bkt_nfree -= num_needed;
3774 3395
3775 3396 mutex_exit(&state->id_lso_lock);
3776 3397
3777 3398 *nds_p = num_needed;
3778 3399
3779 3400 return (0);
3780 3401 }
3781 3402
3782 3403 static void
3783 3404 ibd_release_lsobufs(ibd_state_t *state, ibt_wr_ds_t *sgl_p, uint32_t nds)
3784 3405 {
3785 3406 ibd_lsobkt_t *bktp;
3786 3407 ibd_lsobuf_t *lbufp;
3787 3408 uint8_t *lso_mem_end;
3788 3409 uint_t ndx;
3789 3410 int i;
3790 3411
3791 3412 mutex_enter(&state->id_lso_lock);
3792 3413
3793 3414 bktp = state->id_lso;
3794 3415 ASSERT(bktp != NULL);
3795 3416
3796 3417 lso_mem_end = bktp->bkt_mem + bktp->bkt_nelem * IBD_LSO_BUFSZ;
3797 3418 for (i = 0; i < nds; i++) {
3798 3419 uint8_t *va;
3799 3420
3800 3421 va = (uint8_t *)(uintptr_t)sgl_p[i].ds_va;
3801 3422 ASSERT(va >= bktp->bkt_mem && va < lso_mem_end);
3802 3423
3803 3424 /*
3804 3425 * Figure out the buflist element this sgl buffer corresponds
3805 3426 * to and put it back at the head
3806 3427 */
3807 3428 ndx = (va - bktp->bkt_mem) / IBD_LSO_BUFSZ;
3808 3429 lbufp = bktp->bkt_bufl + ndx;
3809 3430
3810 3431 ASSERT(lbufp->lb_isfree == 0);
3811 3432 ASSERT(lbufp->lb_buf == va);
3812 3433
3813 3434 lbufp->lb_isfree = 1;
3814 3435 lbufp->lb_next = bktp->bkt_free_head;
3815 3436 bktp->bkt_free_head = lbufp;
3816 3437 }
3817 3438 bktp->bkt_nfree += nds;
3818 3439
3819 3440 mutex_exit(&state->id_lso_lock);
3820 3441 }
3821 3442
3822 3443 static void
3823 3444 ibd_free_tx_copybufs(ibd_state_t *state)
3824 3445 {
3825 3446 /*
3826 3447 * Unregister txbuf mr
3827 3448 */
3828 3449 if (ibt_deregister_mr(state->id_hca_hdl,
3829 3450 state->id_tx_mr_hdl) != IBT_SUCCESS) {
3830 3451 DPRINT(10, "ibd_free_tx_copybufs: ibt_deregister_mr failed");
3831 3452 }
3832 3453 state->id_tx_mr_hdl = NULL;
3833 3454
3834 3455 /*
3835 3456 * Free txbuf memory
3836 3457 */
3837 3458 kmem_free(state->id_tx_wqes, state->id_ud_num_swqe *
3838 3459 sizeof (ibd_swqe_t));
3839 3460 kmem_free(state->id_tx_bufs, state->id_ud_num_swqe *
3840 3461 state->id_tx_buf_sz);
3841 3462 state->id_tx_wqes = NULL;
3842 3463 state->id_tx_bufs = NULL;
3843 3464 }
3844 3465
3845 3466 static void
3846 3467 ibd_free_tx_lsobufs(ibd_state_t *state)
3847 3468 {
3848 3469 ibd_lsobkt_t *bktp;
3849 3470
3850 3471 mutex_enter(&state->id_lso_lock);
3851 3472
3852 3473 if ((bktp = state->id_lso) == NULL) {
3853 3474 mutex_exit(&state->id_lso_lock);
3854 3475 return;
3855 3476 }
3856 3477
3857 3478 /*
3858 3479 * First, free the buflist
3859 3480 */
3860 3481 ASSERT(bktp->bkt_bufl != NULL);
3861 3482 kmem_free(bktp->bkt_bufl, bktp->bkt_nelem * sizeof (ibd_lsobuf_t));
3862 3483
3863 3484 /*
3864 3485 * Unregister the LSO memory and free it
3865 3486 */
3866 3487 ASSERT(bktp->bkt_mr_hdl != NULL);
3867 3488 if (ibt_deregister_mr(state->id_hca_hdl,
3868 3489 bktp->bkt_mr_hdl) != IBT_SUCCESS) {
3869 3490 DPRINT(10,
3870 3491 "ibd_free_lsobufs: ibt_deregister_mr failed");
3871 3492 }
3872 3493 ASSERT(bktp->bkt_mem);
3873 3494 kmem_free(bktp->bkt_mem, bktp->bkt_nelem * IBD_LSO_BUFSZ);
3874 3495
3875 3496 /*
3876 3497 * Finally free the bucket
3877 3498 */
3878 3499 kmem_free(bktp, sizeof (ibd_lsobkt_t));
3879 3500 state->id_lso = NULL;
3880 3501
3881 3502 mutex_exit(&state->id_lso_lock);
3882 3503 }
3883 3504
3884 3505 /*
3885 3506 * Free the statically allocated Tx buffer list.
3886 3507 */
3887 3508 static void
3888 3509 ibd_fini_txlist(ibd_state_t *state)
3889 3510 {
3890 3511 /*
3891 3512 * Free the allocated swqes
3892 3513 */
3893 3514 mutex_enter(&state->id_tx_list.dl_mutex);
3894 3515 mutex_enter(&state->id_tx_rel_list.dl_mutex);
3895 3516 state->id_tx_list.dl_head = NULL;
3896 3517 state->id_tx_list.dl_pending_sends = B_FALSE;
3897 3518 state->id_tx_list.dl_cnt = 0;
3898 3519 state->id_tx_rel_list.dl_head = NULL;
3899 3520 state->id_tx_rel_list.dl_pending_sends = B_FALSE;
3900 3521 state->id_tx_rel_list.dl_cnt = 0;
3901 3522 mutex_exit(&state->id_tx_rel_list.dl_mutex);
3902 3523 mutex_exit(&state->id_tx_list.dl_mutex);
3903 3524
3904 3525 ibd_free_tx_lsobufs(state);
3905 3526 ibd_free_tx_copybufs(state);
3906 3527 }
3907 3528
3908 3529 /*
3909 3530 * post a list of rwqes, NULL terminated.
3910 3531 */
3911 3532 static void
3912 3533 ibd_post_recv_list(ibd_state_t *state, ibd_rwqe_t *rwqe)
3913 3534 {
3914 3535 uint_t i;
3915 3536 uint_t num_posted;
3916 3537 ibt_status_t ibt_status;
3917 3538 ibt_recv_wr_t wrs[IBD_RX_POST_CNT];
3918 3539
3919 3540 while (rwqe) {
3920 3541 /* Post up to IBD_RX_POST_CNT receive work requests */
3921 3542 for (i = 0; i < IBD_RX_POST_CNT; i++) {
3922 3543 wrs[i] = rwqe->w_rwr;
3923 3544 rwqe = WQE_TO_RWQE(rwqe->rwqe_next);
3924 3545 if (rwqe == NULL) {
3925 3546 i++;
3926 3547 break;
3927 3548 }
3928 3549 }
3929 3550
3930 3551 /*
3931 3552 * If posting fails for some reason, we'll never receive
3932 3553 		 * a completion notification, so we'll need to clean up. But
3933 3554 * we need to make sure we don't clean up nodes whose
3934 3555 * wrs have been successfully posted. We assume that the
3935 3556 * hca driver returns on the first failure to post and
3936 3557 * therefore the first 'num_posted' entries don't need
3937 3558 * cleanup here.
3938 3559 */
3939 3560 atomic_add_32(&state->id_rx_list.dl_cnt, i);
3940 3561
3941 3562 num_posted = 0;
3942 3563 ibt_status = ibt_post_recv(state->id_chnl_hdl, wrs, i,
3943 3564 &num_posted);
3944 3565 if (ibt_status != IBT_SUCCESS) {
3945 3566 /* This cannot happen unless the device has an error. */
3946 3567 ibd_print_warn(state, "ibd_post_recv: FATAL: "
3947 3568 "posting multiple wrs failed: "
3948 3569 "requested=%d, done=%d, ret=%d",
3949 3570 IBD_RX_POST_CNT, num_posted, ibt_status);
3950 3571 atomic_add_32(&state->id_rx_list.dl_cnt,
3951 3572 num_posted - i);
3952 3573 }
3953 3574 }
3954 3575 }
3955 3576
3956 3577 /*
3957 3578 * Grab a list of rwqes from the array of lists, and post the list.
3958 3579 */
3959 3580 static void
3960 3581 ibd_post_recv_intr(ibd_state_t *state)
3961 3582 {
3962 3583 ibd_rx_queue_t *rxp;
3963 3584 ibd_rwqe_t *list;
3964 3585
3965 3586 /* rotate through the rx_queue array, expecting an adequate number */
3966 3587 state->id_rx_post_queue_index =
3967 3588 (state->id_rx_post_queue_index + 1) &
3968 3589 (state->id_rx_nqueues - 1);
3969 3590
3970 3591 rxp = state->id_rx_queues + state->id_rx_post_queue_index;
3971 3592 mutex_enter(&rxp->rx_post_lock);
3972 3593 list = WQE_TO_RWQE(rxp->rx_head);
3973 3594 rxp->rx_head = NULL;
3974 3595 rxp->rx_cnt = 0;
3975 3596 mutex_exit(&rxp->rx_post_lock);
3976 3597 ibd_post_recv_list(state, list);
3977 3598 }
3978 3599
3979 3600 /* macro explained below */
3980 3601 #define RX_QUEUE_HASH(rwqe) \
3981 3602 (((uintptr_t)(rwqe) >> 8) & (state->id_rx_nqueues - 1))
3982 3603
3983 3604 /*
3984 3605  * Add a rwqe to one of the Rx lists. If the list is large enough
3985 3606 * (exactly IBD_RX_POST_CNT), post the list to the hardware.
3986 3607 *
3987 3608 * Note: one of 2^N lists is chosen via a hash. This is done
3988 3609  * because using a single list causes contention. If the first list is busy
3989 3610 * (mutex_tryenter fails), use a second list (just call mutex_enter).
3990 3611 *
3991 3612 * The number 8 in RX_QUEUE_HASH is a random choice that provides
3992 3613 * even distribution of mapping rwqes to the 2^N queues.
3993 3614 */
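/*
 * Hypothetical illustration: if id_rx_nqueues were 16, RX_QUEUE_HASH()
 * would select the queue from bits 8..11 of the rwqe address; an rwqe at
 * an address ending in 0x5a00 maps to queue (0x5a00 >> 8) & 0xf = 0xa.
 */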
3994 3615 static void
3995 3616 ibd_post_recv(ibd_state_t *state, ibd_rwqe_t *rwqe)
3996 3617 {
3997 3618 ibd_rx_queue_t *rxp;
3998 3619
3999 3620 rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe);
4000 3621
4001 3622 if (!mutex_tryenter(&rxp->rx_post_lock)) {
4002 3623 /* Failed. Try a different queue ("ptr + 16" ensures that). */
4003 3624 rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe + 16);
4004 3625 mutex_enter(&rxp->rx_post_lock);
4005 3626 }
4006 3627 rwqe->rwqe_next = rxp->rx_head;
4007 3628 if (++rxp->rx_cnt >= IBD_RX_POST_CNT - 2) {
4008 3629 uint_t active = atomic_inc_32_nv(&state->id_rx_post_active);
4009 3630
4010 3631 /* only call ibt_post_recv() every Nth time through here */
4011 3632 if ((active & (state->id_rx_nqueues - 1)) == 0) {
4012 3633 rxp->rx_head = NULL;
4013 3634 rxp->rx_cnt = 0;
4014 3635 mutex_exit(&rxp->rx_post_lock);
4015 3636 ibd_post_recv_list(state, rwqe);
4016 3637 return;
4017 3638 }
4018 3639 }
4019 3640 rxp->rx_head = RWQE_TO_WQE(rwqe);
4020 3641 mutex_exit(&rxp->rx_post_lock);
4021 3642 }
4022 3643
4023 3644 static int
4024 3645 ibd_alloc_rx_copybufs(ibd_state_t *state)
4025 3646 {
4026 3647 ibt_mr_attr_t mem_attr;
4027 3648 int i;
4028 3649
4029 3650 /*
4030 3651 * Allocate one big chunk for all regular rx copy bufs
4031 3652 */
4032 3653 state->id_rx_buf_sz = state->id_mtu + IPOIB_GRH_SIZE;
4033 3654
4034 3655 state->id_rx_bufs = kmem_zalloc(state->id_ud_num_rwqe *
4035 3656 state->id_rx_buf_sz, KM_SLEEP);
4036 3657
4037 3658 state->id_rx_wqes = kmem_zalloc(state->id_ud_num_rwqe *
4038 3659 sizeof (ibd_rwqe_t), KM_SLEEP);
4039 3660
4040 3661 state->id_rx_nqueues = 1 << IBD_LOG_RX_POST;
4041 3662 state->id_rx_queues = kmem_zalloc(state->id_rx_nqueues *
4042 3663 sizeof (ibd_rx_queue_t), KM_SLEEP);
4043 3664 for (i = 0; i < state->id_rx_nqueues; i++) {
4044 3665 ibd_rx_queue_t *rxp = state->id_rx_queues + i;
4045 3666 mutex_init(&rxp->rx_post_lock, NULL, MUTEX_DRIVER, NULL);
4046 3667 }
4047 3668
4048 3669 /*
4049 3670 * Do one memory registration on the entire rxbuf area
4050 3671 */
4051 3672 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_rx_bufs;
4052 3673 mem_attr.mr_len = state->id_ud_num_rwqe * state->id_rx_buf_sz;
4053 3674 mem_attr.mr_as = NULL;
4054 3675 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
4055 3676 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
4056 3677 &state->id_rx_mr_hdl, &state->id_rx_mr_desc) != IBT_SUCCESS) {
4057 3678 DPRINT(10, "ibd_alloc_rx_copybufs: ibt_register_mr failed");
4058 3679 kmem_free(state->id_rx_wqes,
4059 3680 state->id_ud_num_rwqe * sizeof (ibd_rwqe_t));
4060 3681 kmem_free(state->id_rx_bufs,
4061 3682 state->id_ud_num_rwqe * state->id_rx_buf_sz);
4062 3683 state->id_rx_bufs = NULL;
4063 3684 state->id_rx_wqes = NULL;
4064 3685 return (DDI_FAILURE);
4065 3686 }
4066 3687
4067 3688 return (DDI_SUCCESS);
4068 3689 }
4069 3690
4070 3691 /*
4071 3692 * Allocate the statically allocated Rx buffer list.
4072 3693 */
4073 3694 static int
4074 3695 ibd_init_rxlist(ibd_state_t *state)
4075 3696 {
4076 3697 ibd_rwqe_t *rwqe, *next;
4077 3698 ibd_wqe_t *list;
4078 3699 ibt_lkey_t lkey;
4079 3700 int i;
4080 3701 uint_t len;
4081 3702 uint8_t *bufaddr;
4082 3703
4083 3704 mutex_enter(&state->id_rx_free_list.dl_mutex);
4084 3705 if (state->id_rx_free_list.dl_head != NULL) {
4085 3706 /* rx rsrcs were never freed. Just repost them */
4086 3707 len = state->id_rx_buf_sz;
4087 3708 list = state->id_rx_free_list.dl_head;
4088 3709 state->id_rx_free_list.dl_head = NULL;
4089 3710 state->id_rx_free_list.dl_cnt = 0;
4090 3711 mutex_exit(&state->id_rx_free_list.dl_mutex);
4091 3712 for (rwqe = WQE_TO_RWQE(list); rwqe != NULL;
4092 3713 rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) {
4093 3714 if ((rwqe->rwqe_im_mblk = desballoc(
4094 3715 rwqe->rwqe_copybuf.ic_bufaddr, len, 0,
4095 3716 &rwqe->w_freemsg_cb)) == NULL) {
4096 3717 /* allow freemsg_cb to free the rwqes */
4097 3718 if (atomic_dec_32_nv(&state->id_running) != 0) {
4098 3719 cmn_err(CE_WARN, "ibd_init_rxlist: "
4099 3720 "id_running was not 1\n");
4100 3721 }
4101 3722 DPRINT(10, "ibd_init_rxlist : "
4102 3723 "failed in desballoc()");
4103 3724 for (rwqe = WQE_TO_RWQE(list); rwqe != NULL;
4104 3725 rwqe = next) {
4105 3726 next = WQE_TO_RWQE(rwqe->rwqe_next);
4106 3727 if (rwqe->rwqe_im_mblk) {
4107 3728 atomic_inc_32(&state->
4108 3729 id_rx_list.
4109 3730 dl_bufs_outstanding);
4110 3731 freemsg(rwqe->rwqe_im_mblk);
4111 3732 } else
4112 3733 ibd_free_rwqe(state, rwqe);
4113 3734 }
4114 3735 atomic_inc_32(&state->id_running);
4115 3736 return (DDI_FAILURE);
4116 3737 }
4117 3738 }
4118 3739 ibd_post_recv_list(state, WQE_TO_RWQE(list));
4119 3740 return (DDI_SUCCESS);
4120 3741 }
4121 3742 mutex_exit(&state->id_rx_free_list.dl_mutex);
4122 3743
4123 3744 if (ibd_alloc_rx_copybufs(state) != DDI_SUCCESS)
4124 3745 return (DDI_FAILURE);
4125 3746
4126 3747 /*
4127 3748 * Allocate and setup the rwqe list
4128 3749 */
4129 3750 len = state->id_rx_buf_sz;
4130 3751 lkey = state->id_rx_mr_desc.md_lkey;
4131 3752 rwqe = state->id_rx_wqes;
4132 3753 bufaddr = state->id_rx_bufs;
4133 3754 list = NULL;
4134 3755 for (i = 0; i < state->id_ud_num_rwqe; i++, rwqe++, bufaddr += len) {
4135 3756 rwqe->w_state = state;
4136 3757 rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb;
4137 3758 rwqe->w_freemsg_cb.free_arg = (char *)rwqe;
4138 3759
4139 3760 rwqe->rwqe_copybuf.ic_bufaddr = bufaddr;
4140 3761
4141 3762 if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0,
4142 3763 &rwqe->w_freemsg_cb)) == NULL) {
4143 3764 DPRINT(10, "ibd_init_rxlist : failed in desballoc()");
4144 3765 /* allow freemsg_cb to free the rwqes */
4145 3766 if (atomic_dec_32_nv(&state->id_running) != 0) {
4146 3767 cmn_err(CE_WARN, "ibd_init_rxlist: "
4147 3768 "id_running was not 1\n");
4148 3769 }
4149 3770 DPRINT(10, "ibd_init_rxlist : "
4150 3771 "failed in desballoc()");
4151 3772 for (rwqe = WQE_TO_RWQE(list); rwqe != NULL;
4152 3773 rwqe = next) {
4153 3774 next = WQE_TO_RWQE(rwqe->rwqe_next);
4154 3775 freemsg(rwqe->rwqe_im_mblk);
4155 3776 }
4156 3777 atomic_inc_32(&state->id_running);
4157 3778
4158 3779 /* remove reference to free'd rwqes */
4159 3780 mutex_enter(&state->id_rx_free_list.dl_mutex);
4160 3781 state->id_rx_free_list.dl_head = NULL;
4161 3782 state->id_rx_free_list.dl_cnt = 0;
4162 3783 mutex_exit(&state->id_rx_free_list.dl_mutex);
4163 3784
4164 3785 ibd_fini_rxlist(state);
4165 3786 return (DDI_FAILURE);
4166 3787 }
4167 3788
4168 3789 rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey;
4169 3790 rwqe->rwqe_copybuf.ic_sgl.ds_va =
4170 3791 (ib_vaddr_t)(uintptr_t)bufaddr;
4171 3792 rwqe->rwqe_copybuf.ic_sgl.ds_len = len;
4172 3793 rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
4173 3794 rwqe->w_rwr.wr_nds = 1;
4174 3795 rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;
4175 3796
4176 3797 rwqe->rwqe_next = list;
4177 3798 list = RWQE_TO_WQE(rwqe);
4178 3799 }
4179 3800 ibd_post_recv_list(state, WQE_TO_RWQE(list));
4180 3801
4181 3802 return (DDI_SUCCESS);
4182 3803 }
4183 3804
4184 3805 static void
4185 3806 ibd_free_rx_copybufs(ibd_state_t *state)
4186 3807 {
4187 3808 int i;
4188 3809
4189 3810 /*
4190 3811 * Unregister rxbuf mr
4191 3812 */
4192 3813 if (ibt_deregister_mr(state->id_hca_hdl,
4193 3814 state->id_rx_mr_hdl) != IBT_SUCCESS) {
4194 3815 DPRINT(10, "ibd_free_rx_copybufs: ibt_deregister_mr failed");
4195 3816 }
4196 3817 state->id_rx_mr_hdl = NULL;
4197 3818
4198 3819 /*
4199 3820 * Free rxbuf memory
4200 3821 */
4201 3822 for (i = 0; i < state->id_rx_nqueues; i++) {
4202 3823 ibd_rx_queue_t *rxp = state->id_rx_queues + i;
4203 3824 mutex_destroy(&rxp->rx_post_lock);
4204 3825 }
4205 3826 kmem_free(state->id_rx_queues, state->id_rx_nqueues *
4206 3827 sizeof (ibd_rx_queue_t));
4207 3828 kmem_free(state->id_rx_wqes, state->id_ud_num_rwqe *
4208 3829 sizeof (ibd_rwqe_t));
4209 3830 kmem_free(state->id_rx_bufs, state->id_ud_num_rwqe *
4210 3831 state->id_rx_buf_sz);
4211 3832 state->id_rx_queues = NULL;
4212 3833 state->id_rx_wqes = NULL;
4213 3834 state->id_rx_bufs = NULL;
4214 3835 }
4215 3836
4216 3837 static void
4217 3838 ibd_free_rx_rsrcs(ibd_state_t *state)
4218 3839 {
4219 3840 mutex_enter(&state->id_rx_free_list.dl_mutex);
4220 3841 if (state->id_rx_free_list.dl_head == NULL) {
4221 3842 /* already freed */
4222 3843 mutex_exit(&state->id_rx_free_list.dl_mutex);
4223 3844 return;
4224 3845 }
4225 3846 ASSERT(state->id_rx_free_list.dl_cnt == state->id_ud_num_rwqe);
4226 3847 ibd_free_rx_copybufs(state);
4227 3848 state->id_rx_free_list.dl_cnt = 0;
4228 3849 state->id_rx_free_list.dl_head = NULL;
4229 3850 mutex_exit(&state->id_rx_free_list.dl_mutex);
4230 3851 }
4231 3852
4232 3853 /*
4233 3854 * Free the statically allocated Rx buffer list.
4234 3855 */
4235 3856 static void
4236 3857 ibd_fini_rxlist(ibd_state_t *state)
4237 3858 {
4238 3859 ibd_rwqe_t *rwqe;
4239 3860 int i;
4240 3861
4241 3862 /* run through the rx_queue's, calling freemsg() */
4242 3863 for (i = 0; i < state->id_rx_nqueues; i++) {
4243 3864 ibd_rx_queue_t *rxp = state->id_rx_queues + i;
4244 3865 mutex_enter(&rxp->rx_post_lock);
4245 3866 for (rwqe = WQE_TO_RWQE(rxp->rx_head); rwqe;
4246 3867 rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) {
4247 3868 freemsg(rwqe->rwqe_im_mblk);
4248 3869 rxp->rx_cnt--;
4249 3870 }
4250 3871 rxp->rx_head = NULL;
4251 3872 mutex_exit(&rxp->rx_post_lock);
4252 3873 }
4253 3874
4254 3875 /* cannot free rx resources unless gld returned everything */
4255 3876 if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) == 0)
4256 3877 ibd_free_rx_rsrcs(state);
4257 3878 }
4258 3879
4259 3880 /*
4260 3881 * Free an allocated recv wqe.
4261 3882 */
4262 3883 /* ARGSUSED */
4263 3884 static void
4264 3885 ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
4265 3886 {
4266 3887 /*
4267 3888 * desballoc() failed (no memory).
4268 3889 *
4269 3890 * This rwqe is placed on a free list so that it
4270 3891 * can be reinstated when memory is available.
4271 3892 *
4272 3893 * NOTE: no code currently exists to reinstate
4273 3894 * these "lost" rwqes.
4274 3895 */
4275 3896 mutex_enter(&state->id_rx_free_list.dl_mutex);
4276 3897 state->id_rx_free_list.dl_cnt++;
4277 3898 rwqe->rwqe_next = state->id_rx_free_list.dl_head;
4278 3899 state->id_rx_free_list.dl_head = RWQE_TO_WQE(rwqe);
4279 3900 mutex_exit(&state->id_rx_free_list.dl_mutex);
4280 3901 }
4281 3902
4282 3903 /*
4283 3904 * IBA Rx completion queue handler. Guaranteed to be single
4284 3905 * threaded and nonreentrant for this CQ.
4285 3906 */
4286 3907 /* ARGSUSED */
4287 3908 static void
4288 3909 ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
4289 3910 {
4290 3911 ibd_state_t *state = (ibd_state_t *)arg;
4291 3912
4292 3913 atomic_inc_64(&state->id_num_intrs);
4293 3914
4294 3915 if (ibd_rx_softintr == 1) {
4295 3916 mutex_enter(&state->id_rcq_poll_lock);
4296 3917 if (state->id_rcq_poll_busy & IBD_CQ_POLLING) {
4297 3918 state->id_rcq_poll_busy |= IBD_REDO_CQ_POLLING;
4298 3919 mutex_exit(&state->id_rcq_poll_lock);
4299 3920 return;
4300 3921 } else {
4301 3922 mutex_exit(&state->id_rcq_poll_lock);
4302 3923 ddi_trigger_softintr(state->id_rx);
4303 3924 }
4304 3925 } else
4305 3926 (void) ibd_intr((caddr_t)state);
4306 3927 }
4307 3928
4308 3929 /*
4309 3930 * CQ handler for Tx completions, when the Tx CQ is in
4310 3931 * interrupt driven mode.
4311 3932 */
4312 3933 /* ARGSUSED */
4313 3934 static void
4314 3935 ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
4315 3936 {
4316 3937 ibd_state_t *state = (ibd_state_t *)arg;
4317 3938
4318 3939 atomic_inc_64(&state->id_num_intrs);
4319 3940
4320 3941 if (ibd_tx_softintr == 1) {
4321 3942 mutex_enter(&state->id_scq_poll_lock);
4322 3943 if (state->id_scq_poll_busy & IBD_CQ_POLLING) {
4323 3944 state->id_scq_poll_busy |= IBD_REDO_CQ_POLLING;
4324 3945 mutex_exit(&state->id_scq_poll_lock);
4325 3946 return;
4326 3947 } else {
4327 3948 mutex_exit(&state->id_scq_poll_lock);
4328 3949 ddi_trigger_softintr(state->id_tx);
4329 3950 }
4330 3951 } else
4331 3952 (void) ibd_tx_recycle((caddr_t)state);
4332 3953 }
4333 3954
4334 3955 /*
4335 3956 * Multicast group create/delete trap handler. These will be delivered
4336 3957 * on a kernel thread (handling can thus block) and can be invoked
4337 3958 * concurrently. The handler can be invoked anytime after it is
4338 3959 * registered and before ibt_detach().
4339 3960 */
4340 3961 /* ARGSUSED */
4341 3962 static void
4342 3963 ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code,
4343 3964 ibt_subnet_event_t *event)
4344 3965 {
4345 3966 ibd_state_t *state = (ibd_state_t *)arg;
4346 3967 ibd_req_t *req;
4347 3968
4348 3969 /*
4349 3970 * The trap handler will get invoked once for every event for
4350 3971 * every port. The input "gid" is the GID0 of the port the
4351 3972 * trap came in on; we just need to act on traps that came
4352 3973 * to our port, meaning the port on which the ipoib interface
4353 3974 * resides. Since ipoib uses GID0 of the port, we just match
4354 3975 * the gids to check whether we need to handle the trap.
4355 3976 */
4356 - _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid))
4357 3977 if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0)
4358 3978 return;
4359 - _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid))
4360 3979
4361 3980 DPRINT(10, "ibd_notices_handler : %d\n", code);
4362 3981
4363 3982 switch (code) {
4364 3983 case IBT_SM_EVENT_UNAVAILABLE:
4365 3984 /*
4366 3985 * If we are in promiscuous mode or have
4367 3986 * sendnonmembers, we need to print a warning
4368 3987 * message right now. Else, just store the
4369 3988 * information, print when we enter promiscuous
4370 3989 * mode or attempt nonmember send. We might
4371 3990 * also want to stop caching sendnonmember.
4372 3991 */
4373 3992 ibd_print_warn(state, "IBA multicast support "
4374 3993 "degraded due to unavailability of multicast "
4375 3994 "traps");
4376 3995 break;
4377 3996 case IBT_SM_EVENT_AVAILABLE:
4378 3997 /*
4379 3998 * If we printed a warning message above or
4380 3999 * while trying to nonmember send or get into
4381 4000 * promiscuous mode, print an okay message.
4382 4001 */
4383 4002 ibd_print_warn(state, "IBA multicast support "
4384 4003 "restored due to availability of multicast "
4385 4004 "traps");
4386 4005 break;
4387 4006 case IBT_SM_EVENT_MCG_CREATED:
4388 4007 case IBT_SM_EVENT_MCG_DELETED:
4389 4008 /*
4390 4009 * If it is a "deleted" event and we are in late hca
4391 4010 * init, nothing to do.
4392 4011 */
4393 4012 if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) ==
4394 4013 IBD_DRV_IN_LATE_HCA_INIT) && (code ==
4395 4014 IBT_SM_EVENT_MCG_DELETED)) {
4396 4015 break;
4397 4016 }
4398 4017 /*
4399 4018 * Common processing of creation/deletion traps.
4400 4019 * First check if the instance is being
4401 4020 * [de]initialized; back off then, without doing
4402 4021 * anything more, since we are not sure if the
4403 4022 * async thread is around, or whether we might
4404 4023 * be racing with the detach code in ibd_m_stop()
4405 4024 * that scans the mcg list.
4406 4025 */
4407 4026 if (!ibd_async_safe(state))
4408 4027 return;
4409 4028
4410 4029 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
4411 4030 req->rq_gid = event->sm_notice_gid;
4412 4031 req->rq_ptr = (void *)code;
4413 4032 ibd_queue_work_slot(state, req, IBD_ASYNC_TRAP);
4414 4033 break;
4415 4034 }
4416 4035 }
4417 4036
4418 4037 static void
4419 4038 ibd_async_trap(ibd_state_t *state, ibd_req_t *req)
4420 4039 {
4421 4040 ib_gid_t mgid = req->rq_gid;
4422 4041 ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr;
4423 4042 int ret;
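	/* bits 16..31 of the MGID prefix carry the pkey (see ibd_find_bgroup()) */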
4424 4043 ib_pkey_t pkey = (mgid.gid_prefix >> 16) & 0xffff;
4425 4044
4426 4045 DPRINT(10, "ibd_async_trap : %d\n", code);
4427 4046
4428 4047 /*
4429 4048 * Check if we have already joined the IPoIB broadcast group for our
4430 4049 * PKEY. If joined, perform the rest of the operation.
4431 4050 * Else, the interface is not initialised. Do the initialisation here
4432 4051 * by calling ibd_start() and return.
4433 4052 */
4434 4053
4435 4054 if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) ==
4436 4055 IBD_DRV_IN_LATE_HCA_INIT) && (state->id_bgroup_present == 0) &&
4437 4056 (code == IBT_SM_EVENT_MCG_CREATED)) {
4438 4057 /*
4439 4058 * If we are in late HCA init and a notification for the
4440 4059 * creation of a MCG came in, check if it is the IPoIB MCG for
4441 4060 * this pkey. If not, return.
4442 4061 */
4443 4062 if ((mgid.gid_guid != IB_MGID_IPV4_LOWGRP_MASK) || (pkey !=
4444 4063 state->id_pkey)) {
4445 4064 ibd_async_done(state);
4446 4065 return;
4447 4066 }
4448 4067 ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
4449 4068 /*
4450 4069 * Check if there is still a necessity to start the interface.
4451 4070 * It is possible that the user attempted unplumb at just about
4452 4071 * the same time, and if unplumb succeeded, we have nothing to
4453 4072 * do.
4454 4073 */
4455 4074 if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) ==
4456 4075 IBD_DRV_IN_LATE_HCA_INIT) &&
4457 4076 ((ret = ibd_start(state)) != 0)) {
4458 4077 DPRINT(10, "ibd_async_trap: cannot start from late HCA "
4459 4078 "init, ret=%d", ret);
4460 4079 }
4461 4080 ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
4462 4081 ibd_async_done(state);
4463 4082 return;
4464 4083 }
4465 4084
4466 4085 /*
4467 4086 * Atomically search the nonmember and sendonlymember lists and
4468 4087 * delete.
4469 4088 */
4470 4089 ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON);
4471 4090
4472 4091 if (state->id_prom_op == IBD_OP_COMPLETED) {
4473 4092 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
4474 4093
4475 4094 /*
4476 4095 * If in promiscuous mode, try to join/attach to the new
4477 4096 * mcg. Given the unreliable out-of-order mode of trap
4478 4097 * delivery, we can never be sure whether it is a problem
4479 4098 * if the join fails. Thus, we warn the admin of a failure
4480 4099 * if this was a creation trap. Note that the trap might
4481 4100 * actually be reporting a long past event, and the mcg
4482 4101 * might already have been deleted, thus we might be warning
4483 4102 * in vain.
4484 4103 */
4485 4104 if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) ==
4486 4105 NULL) && (code == IBT_SM_EVENT_MCG_CREATED))
4487 4106 ibd_print_warn(state, "IBA promiscuous mode missed "
4488 4107 "new multicast gid %016llx:%016llx",
4489 4108 (u_longlong_t)mgid.gid_prefix,
4490 4109 (u_longlong_t)mgid.gid_guid);
4491 4110 }
4492 4111
4493 4112 /*
4494 4113 * Free the request slot allocated by the subnet event thread.
4495 4114 */
4496 4115 ibd_async_done(state);
4497 4116 }
4498 4117
4499 4118 /*
4500 4119 * GLDv3 entry point to get capabilities.
4501 4120 */
4502 4121 static boolean_t
4503 4122 ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
4504 4123 {
4505 4124 ibd_state_t *state = arg;
4506 4125
4507 4126 if (state->id_type == IBD_PORT_DRIVER)
4508 4127 return (B_FALSE);
4509 4128
4510 4129 switch (cap) {
4511 4130 case MAC_CAPAB_HCKSUM: {
4512 4131 uint32_t *txflags = cap_data;
4513 4132
4514 4133 /*
4515 4134 * We either do full checksum or not do it at all
4516 4135 */
4517 4136 if (state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL)
4518 4137 *txflags = HCK_FULLCKSUM | HCKSUM_INET_FULL_V4;
4519 4138 else
4520 4139 return (B_FALSE);
4521 4140 break;
4522 4141 }
4523 4142
4524 4143 case MAC_CAPAB_LSO: {
4525 4144 mac_capab_lso_t *cap_lso = cap_data;
4526 4145
4527 4146 /*
4528 4147 * In addition to the capability and policy, since LSO
4529 4148 * relies on hw checksum, we'll not enable LSO if we
4530 4149 * don't have hw checksum. Of course, if the HCA doesn't
4531 4150 * provide the reserved lkey capability, enabling LSO will
4532 4151 * actually affect performance adversely, so we'll disable
4533 4152 * LSO even for that case.
4534 4153 */
4535 4154 if (!state->id_lso_policy || !state->id_lso_capable)
4536 4155 return (B_FALSE);
4537 4156
4538 4157 if ((state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) == 0)
4539 4158 return (B_FALSE);
4540 4159
4541 4160 if (state->id_hca_res_lkey_capab == 0) {
4542 4161 ibd_print_warn(state, "no reserved-lkey capability, "
4543 4162 "disabling LSO");
4544 4163 return (B_FALSE);
4545 4164 }
4546 4165
4547 4166 cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
4548 4167 cap_lso->lso_basic_tcp_ipv4.lso_max = state->id_lso_maxlen - 1;
4549 4168 break;
4550 4169 }
4551 4170
4552 4171 default:
4553 4172 return (B_FALSE);
4554 4173 }
4555 4174
4556 4175 return (B_TRUE);
4557 4176 }
4558 4177
4559 4178 /*
4560 4179 * callback function for set/get of properties
4561 4180 */
4562 4181 static int
4563 4182 ibd_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
4564 4183 uint_t pr_valsize, const void *pr_val)
4565 4184 {
4566 4185 ibd_state_t *state = arg;
4567 4186 int err = 0;
4568 4187 uint32_t link_mode;
4569 4188
4570 4189 /* Cannot set properties on a port driver */
4571 4190 if (state->id_type == IBD_PORT_DRIVER) {
4572 4191 return (ENOTSUP);
4573 4192 }
4574 4193
4575 4194 switch (pr_num) {
4576 4195 case MAC_PROP_IB_LINKMODE:
4577 4196 if (state->id_mac_state & IBD_DRV_STARTED) {
4578 4197 err = EBUSY;
4579 4198 break;
4580 4199 }
4581 4200 if (pr_val == NULL) {
4582 4201 err = EINVAL;
4583 4202 break;
4584 4203 }
4585 4204 bcopy(pr_val, &link_mode, sizeof (link_mode));
4586 4205 if (link_mode != IBD_LINK_MODE_UD &&
4587 4206 link_mode != IBD_LINK_MODE_RC) {
4588 4207 err = EINVAL;
4589 4208 } else {
4590 4209 if (link_mode == IBD_LINK_MODE_RC) {
4591 4210 if (state->id_enable_rc) {
4592 4211 return (0);
4593 4212 }
4594 4213 state->id_enable_rc = 1;
4595 4214 /* inform MAC framework of new MTU */
4596 4215 err = mac_maxsdu_update2(state->id_mh,
4597 4216 state->rc_mtu - IPOIB_HDRSIZE,
4598 4217 state->id_mtu - IPOIB_HDRSIZE);
4599 4218 } else {
4600 4219 if (!state->id_enable_rc) {
4601 4220 return (0);
4602 4221 }
4603 4222 state->id_enable_rc = 0;
4604 4223 err = mac_maxsdu_update2(state->id_mh,
4605 4224 state->id_mtu - IPOIB_HDRSIZE,
4606 4225 state->id_mtu - IPOIB_HDRSIZE);
4607 4226 }
4608 4227 (void) ibd_record_capab(state);
4609 4228 mac_capab_update(state->id_mh);
4610 4229 }
4611 4230 break;
4612 4231 case MAC_PROP_PRIVATE:
4613 4232 err = ibd_set_priv_prop(state, pr_name,
4614 4233 pr_valsize, pr_val);
4615 4234 break;
4616 4235 default:
4617 4236 err = ENOTSUP;
4618 4237 break;
4619 4238 }
4620 4239 return (err);
4621 4240 }
4622 4241
4623 4242 static int
4624 4243 ibd_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
4625 4244 uint_t pr_valsize, void *pr_val)
4626 4245 {
4627 4246 ibd_state_t *state = arg;
4628 4247 int err = 0;
4629 4248
4630 4249 switch (pr_num) {
4631 4250 case MAC_PROP_MTU:
4632 4251 break;
4633 4252 default:
4634 4253 if (state->id_type == IBD_PORT_DRIVER) {
4635 4254 return (ENOTSUP);
4636 4255 }
4637 4256 break;
4638 4257 }
4639 4258
4640 4259 switch (pr_num) {
4641 4260 case MAC_PROP_IB_LINKMODE:
4642 4261 *(uint_t *)pr_val = state->id_enable_rc;
4643 4262 break;
4644 4263 case MAC_PROP_PRIVATE:
4645 4264 err = ibd_get_priv_prop(state, pr_name, pr_valsize,
4646 4265 pr_val);
4647 4266 break;
4648 4267 default:
4649 4268 err = ENOTSUP;
4650 4269 break;
4651 4270 }
4652 4271 return (err);
4653 4272 }
4654 4273
4655 4274 static void
4656 4275 ibd_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num,
4657 4276 mac_prop_info_handle_t prh)
4658 4277 {
4659 4278 ibd_state_t *state = arg;
4660 4279
4661 4280 switch (pr_num) {
4662 4281 case MAC_PROP_IB_LINKMODE: {
4663 4282 mac_prop_info_set_default_uint32(prh, IBD_DEF_LINK_MODE);
4664 4283 break;
4665 4284 }
4666 4285 case MAC_PROP_MTU: {
4667 4286 uint32_t min, max;
4668 4287 if (state->id_type == IBD_PORT_DRIVER) {
4669 4288 min = 1500;
4670 4289 max = IBD_DEF_RC_MAX_SDU;
4671 4290 } else if (state->id_enable_rc) {
4672 4291 min = max = IBD_DEF_RC_MAX_SDU;
4673 4292 } else {
4674 4293 min = max = state->id_mtu - IPOIB_HDRSIZE;
4675 4294 }
4676 4295 mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
4677 4296 mac_prop_info_set_range_uint32(prh, min, max);
4678 4297 break;
4679 4298 }
4680 4299 case MAC_PROP_PRIVATE: {
4681 4300 char valstr[64];
4682 4301 int value;
4683 4302
4684 4303 if (strcmp(pr_name, "_ibd_broadcast_group") == 0) {
4685 4304 mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ);
4686 4305 return;
4687 4306 } else if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) {
4688 4307 value = IBD_DEF_COALESCE_COMPLETIONS;
4689 4308 } else if (strcmp(pr_name,
4690 4309 "_ibd_create_broadcast_group") == 0) {
4691 4310 value = IBD_DEF_CREATE_BCAST_GROUP;
4692 4311 } else if (strcmp(pr_name, "_ibd_hash_size") == 0) {
4693 4312 value = IBD_DEF_HASH_SIZE;
4694 4313 } else if (strcmp(pr_name, "_ibd_lso_enable") == 0) {
4695 4314 value = IBD_DEF_LSO_POLICY;
4696 4315 } else if (strcmp(pr_name, "_ibd_num_ah") == 0) {
4697 4316 value = IBD_DEF_NUM_AH;
4698 4317 } else if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) {
4699 4318 value = IBD_DEF_NUM_LSO_BUFS;
4700 4319 } else if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) {
4701 4320 value = IBD_DEF_RC_ENABLE_SRQ;
4702 4321 } else if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) {
4703 4322 value = IBD_DEF_RC_NUM_RWQE;
4704 4323 } else if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) {
4705 4324 value = IBD_DEF_RC_NUM_SRQ;
4706 4325 } else if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) {
4707 4326 value = IBD_DEF_RC_NUM_SWQE;
4708 4327 } else if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) {
4709 4328 value = IBD_DEF_RC_RX_COMP_COUNT;
4710 4329 } else if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) {
4711 4330 value = IBD_DEF_RC_RX_COMP_USEC;
4712 4331 } else if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) {
4713 4332 value = IBD_DEF_RC_RX_COPY_THRESH;
4714 4333 } else if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) {
4715 4334 value = IBD_DEF_RC_RX_RWQE_THRESH;
4716 4335 } else if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) {
4717 4336 value = IBD_DEF_RC_TX_COMP_COUNT;
4718 4337 } else if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) {
4719 4338 value = IBD_DEF_RC_TX_COMP_USEC;
4720 4339 } else if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) {
4721 4340 value = IBD_DEF_RC_TX_COPY_THRESH;
4722 4341 } else if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) {
4723 4342 value = IBD_DEF_UD_NUM_RWQE;
4724 4343 } else if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) {
4725 4344 value = IBD_DEF_UD_NUM_SWQE;
4726 4345 } else if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) {
4727 4346 value = IBD_DEF_UD_RX_COMP_COUNT;
4728 4347 } else if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) {
4729 4348 value = IBD_DEF_UD_RX_COMP_USEC;
4730 4349 } else if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) {
4731 4350 value = IBD_DEF_UD_TX_COMP_COUNT;
4732 4351 } else if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) {
4733 4352 value = IBD_DEF_UD_TX_COMP_USEC;
4734 4353 } else if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) {
4735 4354 value = IBD_DEF_UD_TX_COPY_THRESH;
4736 4355 } else {
4737 4356 return;
4738 4357 }
4739 4358
4740 4359 (void) snprintf(valstr, sizeof (valstr), "%d", value);
4741 4360 mac_prop_info_set_default_str(prh, valstr);
4742 4361 break;
4743 4362 }
4744 4363 } /* switch (pr_num) */
4745 4364 }
4746 4365
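The three MAC property callbacks above (ibd_m_setprop, ibd_m_getprop and ibd_m_propinfo) only take effect once they are plugged into the driver's mac_callbacks_t. The sketch below shows that wiring using designated initializers and the mac_provider(9E) field and flag names; it is illustrative only, and the driver's own ibd_m_callbacks initializer, defined elsewhere in ibd.c, is the authoritative version.

	/*
	 * Illustrative sketch only: how a GLDv3 driver advertises its
	 * property callbacks.  Entries not listed (mc_ioctl, mc_open,
	 * mc_close, ...) default to NULL.
	 */
	static mac_callbacks_t ibd_m_callbacks_sketch = {
		.mc_callbacks	= MC_GETCAPAB | MC_SETPROP | MC_GETPROP |
		    MC_PROPINFO,
		.mc_getstat	= ibd_m_stat,
		.mc_start	= ibd_m_start,
		.mc_stop	= ibd_m_stop,
		.mc_setpromisc	= ibd_m_promisc,
		.mc_multicst	= ibd_m_multicst,
		.mc_unicst	= ibd_m_unicst,
		.mc_tx		= ibd_m_tx,
		.mc_getcapab	= ibd_m_getcapab,
		.mc_setprop	= ibd_m_setprop,
		.mc_getprop	= ibd_m_getprop,
		.mc_propinfo	= ibd_m_propinfo
	};
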
4747 4366 /* ARGSUSED2 */
4748 4367 static int
4749 4368 ibd_set_priv_prop(ibd_state_t *state, const char *pr_name,
4750 4369 uint_t pr_valsize, const void *pr_val)
4751 4370 {
4752 4371 int err = 0;
4753 4372 long result;
4754 4373
4755 4374 if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) {
4756 4375 if (pr_val == NULL) {
4757 4376 return (EINVAL);
4758 4377 }
4759 4378 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4760 4379 if (result < 0 || result > 1) {
4761 4380 err = EINVAL;
4762 4381 } else {
4763 4382 state->id_allow_coalesce_comp_tuning = (result == 1) ?
4764 4383 B_TRUE: B_FALSE;
4765 4384 }
4766 4385 return (err);
4767 4386 }
4768 4387 if (strcmp(pr_name, "_ibd_create_broadcast_group") == 0) {
4769 4388 if (state->id_mac_state & IBD_DRV_STARTED) {
4770 4389 return (EBUSY);
4771 4390 }
4772 4391 if (pr_val == NULL) {
4773 4392 return (EINVAL);
4774 4393 }
4775 4394 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4776 4395 if (result < 0 || result > 1) {
4777 4396 err = EINVAL;
4778 4397 } else {
4779 4398 state->id_create_broadcast_group = (result == 1) ?
4780 4399 B_TRUE: B_FALSE;
4781 4400 }
4782 4401 return (err);
4783 4402 }
4784 4403 if (strcmp(pr_name, "_ibd_hash_size") == 0) {
4785 4404 if (state->id_mac_state & IBD_DRV_STARTED) {
4786 4405 return (EBUSY);
4787 4406 }
4788 4407 if (pr_val == NULL) {
4789 4408 return (EINVAL);
4790 4409 }
4791 4410 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4792 4411 if (result < IBD_MIN_HASH_SIZE || result > IBD_MAX_HASH_SIZE) {
4793 4412 err = EINVAL;
4794 4413 } else {
4795 4414 state->id_hash_size = (uint32_t)result;
4796 4415 }
4797 4416 return (err);
4798 4417 }
4799 4418 if (strcmp(pr_name, "_ibd_lso_enable") == 0) {
4800 4419 if (state->id_mac_state & IBD_DRV_STARTED) {
4801 4420 return (EBUSY);
4802 4421 }
4803 4422 if (pr_val == NULL) {
4804 4423 return (EINVAL);
4805 4424 }
4806 4425 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4807 4426 if (result < 0 || result > 1) {
4808 4427 err = EINVAL;
4809 4428 } else {
4810 4429 state->id_lso_policy = (result == 1) ?
4811 4430 B_TRUE: B_FALSE;
4812 4431 }
4813 4432 mac_capab_update(state->id_mh);
4814 4433 return (err);
4815 4434 }
4816 4435 if (strcmp(pr_name, "_ibd_num_ah") == 0) {
4817 4436 if (state->id_mac_state & IBD_DRV_STARTED) {
4818 4437 return (EBUSY);
4819 4438 }
4820 4439 if (pr_val == NULL) {
4821 4440 return (EINVAL);
4822 4441 }
4823 4442 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4824 4443 if (result < IBD_MIN_NUM_AH || result > IBD_MAX_NUM_AH) {
4825 4444 err = EINVAL;
4826 4445 } else {
4827 4446 state->id_num_ah = (uint32_t)result;
4828 4447 }
4829 4448 return (err);
4830 4449 }
4831 4450 if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) {
4832 4451 if (state->id_mac_state & IBD_DRV_STARTED) {
4833 4452 return (EBUSY);
4834 4453 }
4835 4454 if (!state->id_lso_policy || !state->id_lso_capable) {
4836 4455 return (EINVAL);
4837 4456 }
4838 4457 if (pr_val == NULL) {
4839 4458 return (EINVAL);
4840 4459 }
4841 4460 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4842 4461 if (result < IBD_MIN_NUM_LSO_BUFS ||
4843 4462 result > IBD_MAX_NUM_LSO_BUFS) {
4844 4463 err = EINVAL;
4845 4464 } else {
4846 4465 state->id_num_lso_bufs = (uint32_t)result;
4847 4466 }
4848 4467 return (err);
4849 4468 }
4850 4469 if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) {
4851 4470 if (state->id_mac_state & IBD_DRV_STARTED) {
4852 4471 return (EBUSY);
4853 4472 }
4854 4473 if (pr_val == NULL) {
4855 4474 return (EINVAL);
4856 4475 }
4857 4476 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4858 4477 if (result < 0 || result > 1) {
4859 4478 err = EINVAL;
4860 4479 } else {
4861 4480 state->rc_enable_srq = (result == 1) ?
4862 4481 B_TRUE: B_FALSE;
4863 4482 }
4864 4483 if (!state->rc_enable_srq) {
4865 4484 state->id_rc_num_srq = 0;
4866 4485 }
4867 4486 return (err);
4868 4487 }
4869 4488 if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) {
4870 4489 if (state->id_mac_state & IBD_DRV_STARTED) {
4871 4490 return (EBUSY);
4872 4491 }
4873 4492 if (pr_val == NULL) {
4874 4493 return (EINVAL);
4875 4494 }
4876 4495 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4877 4496 if (result < IBD_MIN_RC_NUM_RWQE ||
4878 4497 result > IBD_MAX_RC_NUM_RWQE) {
4879 4498 err = EINVAL;
4880 4499 } else {
4881 4500 state->id_rc_num_rwqe = (uint32_t)result;
4882 4501 if (state->id_allow_coalesce_comp_tuning &&
4883 4502 state->id_rc_rx_comp_count > state->id_rc_num_rwqe)
4884 4503 state->id_rc_rx_comp_count =
4885 4504 state->id_rc_num_rwqe;
4886 4505 if (state->id_rc_num_srq > state->id_rc_num_rwqe)
4887 4506 state->id_rc_num_srq =
4888 4507 state->id_rc_num_rwqe - 1;
4889 4508 /*
4890 4509 * If rx_rwqe_threshold is greater than the number of
4891 4510 				 * rwqes, pull it back to 25% of the number of rwqes.
4892 4511 */
4893 4512 if (state->id_rc_rx_rwqe_thresh > state->id_rc_num_rwqe)
4894 4513 state->id_rc_rx_rwqe_thresh =
4895 4514 (state->id_rc_num_rwqe >> 2);
4896 4515
4897 4516 }
4898 4517 return (err);
4899 4518 }
4900 4519 if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) {
4901 4520 if (state->id_mac_state & IBD_DRV_STARTED) {
4902 4521 return (EBUSY);
4903 4522 }
4904 4523 if (pr_val == NULL) {
4905 4524 return (EINVAL);
4906 4525 }
4907 4526 if (!state->rc_enable_srq)
4908 4527 return (EINVAL);
4909 4528
4910 4529 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4911 4530 if (result < IBD_MIN_RC_NUM_SRQ ||
4912 4531 result >= state->id_rc_num_rwqe) {
4913 4532 err = EINVAL;
4914 4533 } else
4915 4534 state->id_rc_num_srq = (uint32_t)result;
4916 4535 return (err);
4917 4536 }
4918 4537 if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) {
4919 4538 if (state->id_mac_state & IBD_DRV_STARTED) {
4920 4539 return (EBUSY);
4921 4540 }
4922 4541 if (pr_val == NULL) {
4923 4542 return (EINVAL);
4924 4543 }
4925 4544 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4926 4545 if (result < IBD_MIN_RC_NUM_SWQE ||
4927 4546 result > IBD_MAX_RC_NUM_SWQE) {
4928 4547 err = EINVAL;
4929 4548 } else {
4930 4549 state->id_rc_num_swqe = (uint32_t)result;
4931 4550 if (state->id_allow_coalesce_comp_tuning &&
4932 4551 state->id_rc_tx_comp_count > state->id_rc_num_swqe)
4933 4552 state->id_rc_tx_comp_count =
4934 4553 state->id_rc_num_swqe;
4935 4554 }
4936 4555 return (err);
4937 4556 }
4938 4557 if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) {
4939 4558 if (!state->id_allow_coalesce_comp_tuning) {
4940 4559 return (ENOTSUP);
4941 4560 }
4942 4561 if (pr_val == NULL) {
4943 4562 return (EINVAL);
4944 4563 }
4945 4564 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4946 4565 if (result < 1 || result > state->id_rc_num_rwqe) {
4947 4566 err = EINVAL;
4948 4567 } else {
4949 4568 state->id_rc_rx_comp_count = (uint32_t)result;
4950 4569 }
4951 4570 return (err);
4952 4571 }
4953 4572 if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) {
4954 4573 if (!state->id_allow_coalesce_comp_tuning) {
4955 4574 return (ENOTSUP);
4956 4575 }
4957 4576 if (pr_val == NULL) {
4958 4577 return (EINVAL);
4959 4578 }
4960 4579 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4961 4580 if (result < 1) {
4962 4581 err = EINVAL;
4963 4582 } else {
4964 4583 state->id_rc_rx_comp_usec = (uint32_t)result;
4965 4584 }
4966 4585 return (err);
4967 4586 }
4968 4587 if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) {
4969 4588 if (state->id_mac_state & IBD_DRV_STARTED) {
4970 4589 return (EBUSY);
4971 4590 }
4972 4591 if (pr_val == NULL) {
4973 4592 return (EINVAL);
4974 4593 }
4975 4594 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4976 4595 if (result < IBD_MIN_RC_RX_COPY_THRESH ||
4977 4596 result > state->rc_mtu) {
4978 4597 err = EINVAL;
4979 4598 } else {
4980 4599 state->id_rc_rx_copy_thresh = (uint32_t)result;
4981 4600 }
4982 4601 return (err);
4983 4602 }
4984 4603 if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) {
4985 4604 if (state->id_mac_state & IBD_DRV_STARTED) {
4986 4605 return (EBUSY);
4987 4606 }
4988 4607 if (pr_val == NULL) {
4989 4608 return (EINVAL);
4990 4609 }
4991 4610 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
4992 4611 if (result < IBD_MIN_RC_RX_RWQE_THRESH ||
4993 4612 result >= state->id_rc_num_rwqe) {
4994 4613 err = EINVAL;
4995 4614 } else {
4996 4615 state->id_rc_rx_rwqe_thresh = (uint32_t)result;
4997 4616 }
4998 4617 return (err);
4999 4618 }
5000 4619 if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) {
5001 4620 if (!state->id_allow_coalesce_comp_tuning) {
5002 4621 return (ENOTSUP);
5003 4622 }
5004 4623 if (pr_val == NULL) {
5005 4624 return (EINVAL);
5006 4625 }
5007 4626 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5008 4627 if (result < 1 || result > state->id_rc_num_swqe) {
5009 4628 err = EINVAL;
5010 4629 } else {
5011 4630 state->id_rc_tx_comp_count = (uint32_t)result;
5012 4631 }
5013 4632 return (err);
5014 4633 }
5015 4634 if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) {
5016 4635 if (!state->id_allow_coalesce_comp_tuning) {
5017 4636 return (ENOTSUP);
5018 4637 }
5019 4638 if (pr_val == NULL) {
5020 4639 return (EINVAL);
5021 4640 }
5022 4641 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5023 4642 if (result < 1)
5024 4643 err = EINVAL;
5025 4644 else {
5026 4645 state->id_rc_tx_comp_usec = (uint32_t)result;
5027 4646 }
5028 4647 return (err);
5029 4648 }
5030 4649 if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) {
5031 4650 if (state->id_mac_state & IBD_DRV_STARTED) {
5032 4651 return (EBUSY);
5033 4652 }
5034 4653 if (pr_val == NULL) {
5035 4654 return (EINVAL);
5036 4655 }
5037 4656 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5038 4657 if (result < IBD_MIN_RC_TX_COPY_THRESH ||
5039 4658 result > state->rc_mtu) {
5040 4659 err = EINVAL;
5041 4660 } else {
5042 4661 state->id_rc_tx_copy_thresh = (uint32_t)result;
5043 4662 }
5044 4663 return (err);
5045 4664 }
5046 4665 if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) {
5047 4666 if (state->id_mac_state & IBD_DRV_STARTED) {
5048 4667 return (EBUSY);
5049 4668 }
5050 4669 if (pr_val == NULL) {
5051 4670 return (EINVAL);
5052 4671 }
5053 4672 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5054 4673 if (result < IBD_MIN_UD_NUM_RWQE ||
5055 4674 result > IBD_MAX_UD_NUM_RWQE) {
5056 4675 err = EINVAL;
5057 4676 } else {
5058 4677 if (result > state->id_hca_max_chan_sz) {
5059 4678 state->id_ud_num_rwqe =
5060 4679 state->id_hca_max_chan_sz;
5061 4680 } else {
5062 4681 state->id_ud_num_rwqe = (uint32_t)result;
5063 4682 }
5064 4683 if (state->id_allow_coalesce_comp_tuning &&
5065 4684 state->id_ud_rx_comp_count > state->id_ud_num_rwqe)
5066 4685 state->id_ud_rx_comp_count =
5067 4686 state->id_ud_num_rwqe;
5068 4687 }
5069 4688 return (err);
5070 4689 }
5071 4690 if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) {
5072 4691 if (state->id_mac_state & IBD_DRV_STARTED) {
5073 4692 return (EBUSY);
5074 4693 }
5075 4694 if (pr_val == NULL) {
5076 4695 return (EINVAL);
5077 4696 }
5078 4697 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5079 4698 if (result < IBD_MIN_UD_NUM_SWQE ||
5080 4699 result > IBD_MAX_UD_NUM_SWQE) {
5081 4700 err = EINVAL;
5082 4701 } else {
5083 4702 if (result > state->id_hca_max_chan_sz) {
5084 4703 state->id_ud_num_swqe =
5085 4704 state->id_hca_max_chan_sz;
5086 4705 } else {
5087 4706 state->id_ud_num_swqe = (uint32_t)result;
5088 4707 }
5089 4708 if (state->id_allow_coalesce_comp_tuning &&
5090 4709 state->id_ud_tx_comp_count > state->id_ud_num_swqe)
5091 4710 state->id_ud_tx_comp_count =
5092 4711 state->id_ud_num_swqe;
5093 4712 }
5094 4713 return (err);
5095 4714 }
5096 4715 if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) {
5097 4716 if (!state->id_allow_coalesce_comp_tuning) {
5098 4717 return (ENOTSUP);
5099 4718 }
5100 4719 if (pr_val == NULL) {
5101 4720 return (EINVAL);
5102 4721 }
5103 4722 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5104 4723 if (result < 1 || result > state->id_ud_num_rwqe) {
5105 4724 err = EINVAL;
5106 4725 } else {
5107 4726 state->id_ud_rx_comp_count = (uint32_t)result;
5108 4727 }
5109 4728 return (err);
5110 4729 }
5111 4730 if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) {
5112 4731 if (!state->id_allow_coalesce_comp_tuning) {
5113 4732 return (ENOTSUP);
5114 4733 }
5115 4734 if (pr_val == NULL) {
5116 4735 return (EINVAL);
5117 4736 }
5118 4737 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5119 4738 if (result < 1) {
5120 4739 err = EINVAL;
5121 4740 } else {
5122 4741 state->id_ud_rx_comp_usec = (uint32_t)result;
5123 4742 }
5124 4743 return (err);
5125 4744 }
5126 4745 if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) {
5127 4746 if (!state->id_allow_coalesce_comp_tuning) {
5128 4747 return (ENOTSUP);
5129 4748 }
5130 4749 if (pr_val == NULL) {
5131 4750 return (EINVAL);
5132 4751 }
5133 4752 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5134 4753 if (result < 1 || result > state->id_ud_num_swqe) {
5135 4754 err = EINVAL;
5136 4755 } else {
5137 4756 state->id_ud_tx_comp_count = (uint32_t)result;
5138 4757 }
5139 4758 return (err);
5140 4759 }
5141 4760 if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) {
5142 4761 if (!state->id_allow_coalesce_comp_tuning) {
5143 4762 return (ENOTSUP);
5144 4763 }
5145 4764 if (pr_val == NULL) {
5146 4765 return (EINVAL);
5147 4766 }
5148 4767 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5149 4768 if (result < 1) {
5150 4769 err = EINVAL;
5151 4770 } else {
5152 4771 state->id_ud_tx_comp_usec = (uint32_t)result;
5153 4772 }
5154 4773 return (err);
5155 4774 }
5156 4775 if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) {
5157 4776 if (state->id_mac_state & IBD_DRV_STARTED) {
5158 4777 return (EBUSY);
5159 4778 }
5160 4779 if (pr_val == NULL) {
5161 4780 return (EINVAL);
5162 4781 }
5163 4782 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
5164 4783 if (result < IBD_MIN_UD_TX_COPY_THRESH ||
5165 4784 result > IBD_MAX_UD_TX_COPY_THRESH) {
5166 4785 err = EINVAL;
5167 4786 } else {
5168 4787 state->id_ud_tx_copy_thresh = (uint32_t)result;
5169 4788 }
5170 4789 return (err);
5171 4790 }
5172 4791 return (ENOTSUP);
5173 4792 }
5174 4793
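Every branch of ibd_set_priv_prop() above that is gated on IBD_DRV_STARTED repeats the same parse/validate/assign sequence. A condensed sketch of that pattern follows; the helper name is hypothetical and does not exist in ibd.c, it is shown only to make the repeated shape easier to read.

	/*
	 * Hypothetical helper illustrating the pattern above: refuse changes
	 * while the driver is started, parse the string value with
	 * ddi_strtol(), range-check it, and only then store it.
	 */
	static int
	ibd_set_uint_prop_sketch(ibd_state_t *state, const void *pr_val,
	    long min, long max, uint32_t *out)
	{
		long result;

		if (state->id_mac_state & IBD_DRV_STARTED)
			return (EBUSY);
		if (pr_val == NULL)
			return (EINVAL);
		(void) ddi_strtol(pr_val, (char **)NULL, 0, &result);
		if (result < min || result > max)
			return (EINVAL);
		*out = (uint32_t)result;
		return (0);
	}

The branches for the completion-coalescing tunables differ only in their gate: they return ENOTSUP unless _ibd_coalesce_completions has been enabled, rather than checking IBD_DRV_STARTED.
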
5175 4794 static int
5176 4795 ibd_get_priv_prop(ibd_state_t *state, const char *pr_name, uint_t pr_valsize,
5177 4796 void *pr_val)
5178 4797 {
5179 4798 int err = ENOTSUP;
5180 4799 int value;
5181 4800
5182 4801 if (strcmp(pr_name, "_ibd_broadcast_group") == 0) {
5183 4802 value = state->id_bgroup_present;
5184 4803 err = 0;
5185 4804 goto done;
5186 4805 }
5187 4806 if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) {
5188 4807 value = state->id_allow_coalesce_comp_tuning;
5189 4808 err = 0;
5190 4809 goto done;
5191 4810 }
5192 4811 if (strcmp(pr_name, "_ibd_create_broadcast_group") == 0) {
5193 4812 value = state->id_create_broadcast_group;
5194 4813 err = 0;
5195 4814 goto done;
5196 4815 }
5197 4816 if (strcmp(pr_name, "_ibd_hash_size") == 0) {
5198 4817 value = state->id_hash_size;
5199 4818 err = 0;
5200 4819 goto done;
5201 4820 }
5202 4821 if (strcmp(pr_name, "_ibd_lso_enable") == 0) {
5203 4822 value = state->id_lso_policy;
5204 4823 err = 0;
5205 4824 goto done;
5206 4825 }
5207 4826 if (strcmp(pr_name, "_ibd_num_ah") == 0) {
5208 4827 value = state->id_num_ah;
5209 4828 err = 0;
5210 4829 goto done;
5211 4830 }
5212 4831 if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) {
5213 4832 value = state->id_num_lso_bufs;
5214 4833 err = 0;
5215 4834 goto done;
5216 4835 }
5217 4836 if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) {
5218 4837 value = state->rc_enable_srq;
5219 4838 err = 0;
5220 4839 goto done;
5221 4840 }
5222 4841 if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) {
5223 4842 value = state->id_rc_num_rwqe;
5224 4843 err = 0;
5225 4844 goto done;
5226 4845 }
5227 4846 if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) {
5228 4847 value = state->id_rc_num_srq;
5229 4848 err = 0;
5230 4849 goto done;
5231 4850 }
5232 4851 if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) {
5233 4852 value = state->id_rc_num_swqe;
5234 4853 err = 0;
5235 4854 goto done;
5236 4855 }
5237 4856 if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) {
5238 4857 value = state->id_rc_rx_comp_count;
5239 4858 err = 0;
5240 4859 goto done;
5241 4860 }
5242 4861 if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) {
5243 4862 value = state->id_rc_rx_comp_usec;
5244 4863 err = 0;
5245 4864 goto done;
5246 4865 }
5247 4866 if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) {
5248 4867 value = state->id_rc_rx_copy_thresh;
5249 4868 err = 0;
5250 4869 goto done;
5251 4870 }
5252 4871 if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) {
5253 4872 value = state->id_rc_rx_rwqe_thresh;
5254 4873 err = 0;
5255 4874 goto done;
5256 4875 }
5257 4876 if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) {
5258 4877 value = state->id_rc_tx_comp_count;
5259 4878 err = 0;
5260 4879 goto done;
5261 4880 }
5262 4881 if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) {
5263 4882 value = state->id_rc_tx_comp_usec;
5264 4883 err = 0;
5265 4884 goto done;
5266 4885 }
5267 4886 if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) {
5268 4887 value = state->id_rc_tx_copy_thresh;
5269 4888 err = 0;
5270 4889 goto done;
5271 4890 }
5272 4891 if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) {
5273 4892 value = state->id_ud_num_rwqe;
5274 4893 err = 0;
5275 4894 goto done;
5276 4895 }
5277 4896 if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) {
5278 4897 value = state->id_ud_num_swqe;
5279 4898 err = 0;
5280 4899 goto done;
5281 4900 }
5282 4901 if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) {
5283 4902 value = state->id_ud_rx_comp_count;
5284 4903 err = 0;
5285 4904 goto done;
5286 4905 }
5287 4906 if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) {
5288 4907 value = state->id_ud_rx_comp_usec;
5289 4908 err = 0;
5290 4909 goto done;
5291 4910 }
5292 4911 if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) {
5293 4912 value = state->id_ud_tx_comp_count;
5294 4913 err = 0;
5295 4914 goto done;
5296 4915 }
5297 4916 if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) {
5298 4917 value = state->id_ud_tx_comp_usec;
5299 4918 err = 0;
5300 4919 goto done;
5301 4920 }
5302 4921 if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) {
5303 4922 value = state->id_ud_tx_copy_thresh;
5304 4923 err = 0;
5305 4924 goto done;
5306 4925 }
5307 4926 done:
5308 4927 if (err == 0) {
5309 4928 (void) snprintf(pr_val, pr_valsize, "%d", value);
5310 4929 }
5311 4930 return (err);
5312 4931 }
5313 4932
5314 4933 static int
5315 4934 ibd_get_port_details(ibd_state_t *state)
5316 4935 {
5317 4936 ibt_hca_portinfo_t *port_infop;
5318 4937 ibt_status_t ret;
5319 4938 uint_t psize, port_infosz;
5320 4939
5321 4940 mutex_enter(&state->id_link_mutex);
5322 4941
5323 4942 /*
5324 4943 * Query for port information
5325 4944 */
5326 4945 ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
5327 4946 &port_infop, &psize, &port_infosz);
5328 4947 if ((ret != IBT_SUCCESS) || (psize != 1)) {
5329 4948 mutex_exit(&state->id_link_mutex);
5330 4949 DPRINT(10, "ibd_get_port_details: ibt_query_hca_ports() "
5331 4950 "failed, ret=%d", ret);
5332 4951 return (ENETDOWN);
5333 4952 }
5334 4953
5335 4954 /*
5336 4955 * If the link is active, verify the pkey
5337 4956 */
5338 4957 if (port_infop->p_linkstate == IBT_PORT_ACTIVE) {
5339 4958 if ((ret = ibt_pkey2index(state->id_hca_hdl, state->id_port,
5340 4959 state->id_pkey, &state->id_pkix)) != IBT_SUCCESS) {
5341 4960 state->id_link_state = LINK_STATE_DOWN;
5342 4961 } else {
5343 4962 state->id_link_state = LINK_STATE_UP;
5344 4963 }
5345 4964 state->id_mtu = (128 << port_infop->p_mtu);
5346 - _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid))
5347 4965 state->id_sgid = *port_infop->p_sgid_tbl;
5348 - _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid))
5349 4966 /*
5350 4967 * Now that the port is active, record the port speed
5351 4968 */
5352 4969 state->id_link_speed = ibd_get_portspeed(state);
5353 4970 } else {
5354 4971 /* Make sure that these are handled in PORT_UP/CHANGE */
5355 4972 state->id_mtu = 0;
5356 4973 state->id_link_state = LINK_STATE_DOWN;
5357 4974 state->id_link_speed = 0;
5358 4975 }
5359 4976 mutex_exit(&state->id_link_mutex);
5360 4977 ibt_free_portinfo(port_infop, port_infosz);
5361 4978
5362 4979 return (0);
5363 4980 }
5364 4981
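One detail worth calling out in ibd_get_port_details(): ibt_query_hca_ports() reports the port MTU as an IBA-encoded value rather than a byte count, which is why the code above expands it with (128 << p_mtu). A minimal sketch, assuming the standard IBA encoding where codes 1 through 5 correspond to 256 through 4096 bytes (the helper below is hypothetical and not part of ibd.c):

	/*
	 * Hypothetical helper: expand an IBA MTU code to bytes.
	 *	1 -> 256, 2 -> 512, 3 -> 1024, 4 -> 2048, 5 -> 4096
	 * The payload MTU later advertised to GLDv3 is id_mtu - IPOIB_HDRSIZE.
	 */
	static uint_t
	ibd_ib_mtu_to_bytes(uint_t ib_mtu_code)
	{
		return (128 << ib_mtu_code);
	}
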
5365 4982 static int
5366 4983 ibd_alloc_cqs(ibd_state_t *state)
5367 4984 {
5368 4985 ibt_hca_attr_t hca_attrs;
5369 4986 ibt_cq_attr_t cq_attr;
5370 4987 ibt_status_t ret;
5371 4988 uint32_t real_size;
5372 4989 uint_t num_rwqe_change = 0;
5373 4990 uint_t num_swqe_change = 0;
5374 4991
5375 4992 ret = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
5376 4993 ASSERT(ret == IBT_SUCCESS);
5377 4994
5378 4995 /*
5379 4996 * Allocate Rx/combined CQ:
5380 4997 * Theoretically, there is no point in having more than #rwqe
5381 4998 * plus #swqe cqe's, except that the CQ will be signaled for
5382 4999 * overflow when the last wqe completes, if none of the previous
5383 5000 * cqe's have been polled. Thus, we allocate just a few less wqe's
5384 5001 * to make sure such overflow does not occur.
5385 5002 */
5386 5003 cq_attr.cq_sched = NULL;
5387 5004 cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
5388 5005
5389 5006 /*
5390 5007 * Allocate Receive CQ.
5391 5008 */
5392 5009 if (hca_attrs.hca_max_cq_sz >= (state->id_ud_num_rwqe + 1)) {
5393 5010 cq_attr.cq_size = state->id_ud_num_rwqe + 1;
5394 5011 } else {
5395 5012 cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
5396 5013 num_rwqe_change = state->id_ud_num_rwqe;
5397 5014 state->id_ud_num_rwqe = cq_attr.cq_size - 1;
5398 5015 }
5399 5016
5400 5017 if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
5401 5018 &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) {
5402 5019 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rcq) "
5403 5020 "failed, ret=%d\n", ret);
5404 5021 return (DDI_FAILURE);
5405 5022 }
5406 5023
5407 5024 if ((ret = ibt_modify_cq(state->id_rcq_hdl, state->id_ud_rx_comp_count,
5408 5025 state->id_ud_rx_comp_usec, 0)) != IBT_SUCCESS) {
5409 5026 DPRINT(10, "ibd_alloc_cqs: Receive CQ interrupt "
5410 5027 "moderation failed, ret=%d\n", ret);
5411 5028 }
5412 5029
5413 5030 /* make the #rx wc's the same as max rx chain size */
5414 5031 state->id_rxwcs_size = IBD_MAX_RX_MP_LEN;
5415 5032 state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) *
5416 5033 state->id_rxwcs_size, KM_SLEEP);
5417 5034
5418 5035 /*
5419 5036 * Allocate Send CQ.
5420 5037 */
5421 5038 if (hca_attrs.hca_max_cq_sz >= (state->id_ud_num_swqe + 1)) {
5422 5039 cq_attr.cq_size = state->id_ud_num_swqe + 1;
5423 5040 } else {
5424 5041 cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
5425 5042 num_swqe_change = state->id_ud_num_swqe;
5426 5043 state->id_ud_num_swqe = cq_attr.cq_size - 1;
5427 5044 }
5428 5045
5429 5046 if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
5430 5047 &state->id_scq_hdl, &real_size)) != IBT_SUCCESS) {
5431 5048 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(scq) "
5432 5049 "failed, ret=%d\n", ret);
5433 5050 kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) *
5434 5051 state->id_rxwcs_size);
5435 5052 (void) ibt_free_cq(state->id_rcq_hdl);
5436 5053 return (DDI_FAILURE);
5437 5054 }
5438 5055 if ((ret = ibt_modify_cq(state->id_scq_hdl, state->id_ud_tx_comp_count,
5439 5056 state->id_ud_tx_comp_usec, 0)) != IBT_SUCCESS) {
5440 5057 DPRINT(10, "ibd_alloc_cqs: Send CQ interrupt "
5441 5058 "moderation failed, ret=%d\n", ret);
5442 5059 }
5443 5060
5444 5061 state->id_txwcs_size = IBD_TX_POLL_THRESH;
5445 5062 state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) *
5446 5063 state->id_txwcs_size, KM_SLEEP);
5447 5064
5448 5065 /*
5449 5066 	 * Print a message in case we could not allocate as many wqe's
5450 5067 	 * as were requested.
5451 5068 */
5452 5069 if (num_rwqe_change) {
5453 5070 ibd_print_warn(state, "Setting #rwqe = %d instead of default "
5454 5071 "%d", state->id_ud_num_rwqe, num_rwqe_change);
5455 5072 }
5456 5073 if (num_swqe_change) {
5457 5074 ibd_print_warn(state, "Setting #swqe = %d instead of default "
5458 5075 "%d", state->id_ud_num_swqe, num_swqe_change);
5459 5076 }
5460 5077
5461 5078 return (DDI_SUCCESS);
5462 5079 }
5463 5080
5464 5081 static int
5465 5082 ibd_setup_ud_channel(ibd_state_t *state)
5466 5083 {
5467 5084 ibt_ud_chan_alloc_args_t ud_alloc_attr;
5468 5085 ibt_ud_chan_query_attr_t ud_chan_attr;
5469 5086 ibt_status_t ret;
5470 5087
5471 5088 ud_alloc_attr.ud_flags = IBT_ALL_SIGNALED;
5472 5089 if (state->id_hca_res_lkey_capab)
5473 5090 ud_alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY;
5474 5091 if (state->id_lso_policy && state->id_lso_capable)
5475 5092 ud_alloc_attr.ud_flags |= IBT_USES_LSO;
5476 5093
5477 5094 ud_alloc_attr.ud_hca_port_num = state->id_port;
5478 5095 ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg;
5479 5096 ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG;
5480 5097 ud_alloc_attr.ud_sizes.cs_sq = state->id_ud_num_swqe;
5481 5098 ud_alloc_attr.ud_sizes.cs_rq = state->id_ud_num_rwqe;
5482 5099 ud_alloc_attr.ud_qkey = state->id_mcinfo->mc_qkey;
5483 5100 ud_alloc_attr.ud_scq = state->id_scq_hdl;
5484 5101 ud_alloc_attr.ud_rcq = state->id_rcq_hdl;
5485 5102 ud_alloc_attr.ud_pd = state->id_pd_hdl;
5486 5103 ud_alloc_attr.ud_pkey_ix = state->id_pkix;
5487 5104 ud_alloc_attr.ud_clone_chan = NULL;
5488 5105
5489 5106 if ((ret = ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS,
5490 5107 &ud_alloc_attr, &state->id_chnl_hdl, NULL)) != IBT_SUCCESS) {
5491 5108 DPRINT(10, "ibd_setup_ud_channel: ibt_alloc_ud_channel() "
5492 5109 "failed, ret=%d\n", ret);
5493 5110 return (DDI_FAILURE);
5494 5111 }
5495 5112
5496 5113 if ((ret = ibt_query_ud_channel(state->id_chnl_hdl,
5497 5114 &ud_chan_attr)) != IBT_SUCCESS) {
5498 5115 DPRINT(10, "ibd_setup_ud_channel: ibt_query_ud_channel() "
5499 5116 "failed, ret=%d\n", ret);
5500 5117 (void) ibt_free_channel(state->id_chnl_hdl);
5501 5118 return (DDI_FAILURE);
5502 5119 }
5503 5120
5504 5121 state->id_qpnum = ud_chan_attr.ud_qpn;
5505 5122
5506 5123 return (DDI_SUCCESS);
5507 5124 }
5508 5125
5509 5126 static int
5510 5127 ibd_undo_start(ibd_state_t *state, link_state_t cur_link_state)
5511 5128 {
5512 5129 uint32_t progress = state->id_mac_state;
5513 5130 uint_t attempts;
5514 5131 ibt_status_t ret;
5515 5132 ib_gid_t mgid;
5516 5133 ibd_mce_t *mce;
5517 5134 uint8_t jstate;
5518 5135 timeout_id_t tid;
5519 5136
5520 5137 if (atomic_dec_32_nv(&state->id_running) != 0)
5521 5138 cmn_err(CE_WARN, "ibd_undo_start: id_running was not 1\n");
5522 5139
5523 5140 /*
5524 5141 * Before we try to stop/undo whatever we did in ibd_start(),
5525 5142 * we need to mark the link state appropriately to prevent the
5526 5143 * ip layer from using this instance for any new transfers. Note
5527 5144 * that if the original state of the link was "up" when we're
5528 5145 * here, we'll set the final link state to "unknown", to behave
5529 5146 * in the same fashion as other ethernet drivers.
5530 5147 */
5531 5148 mutex_enter(&state->id_link_mutex);
5532 5149 if (cur_link_state == LINK_STATE_DOWN) {
5533 5150 state->id_link_state = cur_link_state;
5534 5151 } else {
5535 5152 state->id_link_state = LINK_STATE_UNKNOWN;
5536 5153 }
5537 5154 mutex_exit(&state->id_link_mutex);
5538 5155 bzero(&state->id_macaddr, sizeof (ipoib_mac_t));
5539 5156 mac_link_update(state->id_mh, state->id_link_state);
5540 5157
5541 5158 state->id_mac_state &= (~IBD_DRV_PORT_DETAILS_OBTAINED);
5542 5159 if (progress & IBD_DRV_STARTED) {
5543 5160 state->id_mac_state &= (~IBD_DRV_STARTED);
5544 5161 }
5545 5162
5546 5163 if (progress & IBD_DRV_IN_LATE_HCA_INIT) {
5547 5164 state->id_mac_state &= (~IBD_DRV_IN_LATE_HCA_INIT);
5548 5165 }
5549 5166
5550 5167 /* Stop listen under Reliable Connected Mode */
5551 5168 if (progress & IBD_DRV_RC_LISTEN) {
5552 5169 ASSERT(state->id_enable_rc);
5553 5170 if (state->rc_listen_hdl != NULL) {
5554 5171 ibd_rc_stop_listen(state);
5555 5172 }
5556 5173 state->id_mac_state &= (~IBD_DRV_RC_LISTEN);
5557 5174 }
5558 5175
5559 5176 /* Stop timeout routine */
5560 5177 if (progress & IBD_DRV_RC_TIMEOUT) {
5561 5178 ASSERT(state->id_enable_rc);
5562 5179 mutex_enter(&state->rc_timeout_lock);
5563 5180 state->rc_timeout_start = B_FALSE;
5564 5181 tid = state->rc_timeout;
5565 5182 state->rc_timeout = 0;
5566 5183 mutex_exit(&state->rc_timeout_lock);
5567 5184 if (tid != 0)
5568 5185 (void) untimeout(tid);
5569 5186 state->id_mac_state &= (~IBD_DRV_RC_TIMEOUT);
5570 5187 }
5571 5188
5572 5189 if ((state->id_enable_rc) && (progress & IBD_DRV_ACACHE_INITIALIZED)) {
5573 5190 attempts = 100;
5574 5191 while (state->id_ah_op == IBD_OP_ONGOING) {
5575 5192 /*
5576 5193 * "state->id_ah_op == IBD_OP_ONGOING" means this IPoIB
5577 5194 * port is connecting to a remote IPoIB port. Wait for
5578 5195 * the end of this connecting operation.
5579 5196 */
5580 5197 delay(drv_usectohz(100000));
5581 5198 if (--attempts == 0) {
5582 5199 state->rc_stop_connect++;
5583 5200 DPRINT(40, "ibd_undo_start: connecting");
5584 5201 break;
5585 5202 }
5586 5203 }
5587 5204 mutex_enter(&state->id_sched_lock);
5588 5205 state->id_sched_needed = 0;
5589 5206 mutex_exit(&state->id_sched_lock);
5590 5207 (void) ibd_rc_close_all_chan(state);
5591 5208 }
5592 5209
5593 5210 /*
5594 5211 * First, stop receive interrupts; this stops the driver from
5595 5212 * handing up buffers to higher layers. Wait for receive buffers
5596 5213 * to be returned and give up after 1 second.
5597 5214 */
5598 5215 if (progress & IBD_DRV_RCQ_NOTIFY_ENABLED) {
5599 5216 attempts = 10;
5600 5217 while (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding,
5601 5218 0) > 0) {
5602 5219 delay(drv_usectohz(100000));
5603 5220 if (--attempts == 0) {
5604 5221 /*
5605 5222 * There are pending bufs with the network
5606 5223 				 * layer and we have no choice but to wait
5607 5224 				 * until it is done with them. Reap all the
5608 5225 * Tx/Rx completions that were posted since
5609 5226 * we turned off the notification and
5610 5227 * return failure.
5611 5228 */
5612 5229 cmn_err(CE_CONT, "!ibd: bufs outstanding\n");
5613 5230 DPRINT(2, "ibd_undo_start: "
5614 5231 "reclaiming failed");
5615 5232 break;
5616 5233 }
5617 5234 }
5618 5235 state->id_mac_state &= (~IBD_DRV_RCQ_NOTIFY_ENABLED);
5619 5236 }
5620 5237
5621 5238 if (progress & IBD_DRV_RC_LARGEBUF_ALLOCD) {
5622 5239 ibd_rc_fini_tx_largebuf_list(state);
5623 5240 state->id_mac_state &= (~IBD_DRV_RC_LARGEBUF_ALLOCD);
5624 5241 }
5625 5242
5626 5243 if (progress & IBD_DRV_RC_SRQ_ALLOCD) {
5627 5244 ASSERT(state->id_enable_rc);
5628 5245 if (state->rc_srq_rwqe_list.dl_bufs_outstanding == 0) {
5629 5246 if (state->id_ah_op == IBD_OP_ONGOING) {
5630 5247 delay(drv_usectohz(10000));
5631 5248 if (state->id_ah_op == IBD_OP_ONGOING) {
5632 5249 /*
5633 5250 * "state->id_ah_op == IBD_OP_ONGOING"
5634 5251 * means this IPoIB port is connecting
5635 5252 * to a remote IPoIB port. We can't
5636 5253 * delete SRQ here.
5637 5254 */
5638 5255 state->rc_stop_connect++;
5639 5256 DPRINT(40, "ibd_undo_start: "
5640 5257 "connecting");
5641 5258 } else {
5642 5259 ibd_rc_fini_srq_list(state);
5643 5260 state->id_mac_state &=
5644 5261 (~IBD_DRV_RC_SRQ_ALLOCD);
5645 5262 }
5646 5263 } else {
5647 5264 ibd_rc_fini_srq_list(state);
5648 5265 state->id_mac_state &= (~IBD_DRV_RC_SRQ_ALLOCD);
5649 5266 }
5650 5267 } else {
5651 5268 DPRINT(40, "ibd_undo_start: srq bufs outstanding\n");
5652 5269 }
5653 5270 }
5654 5271
5655 5272 if (progress & IBD_DRV_SM_NOTICES_REGISTERED) {
5656 5273 ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL);
5657 5274
5658 5275 mutex_enter(&state->id_trap_lock);
5659 5276 state->id_trap_stop = B_TRUE;
5660 5277 while (state->id_trap_inprog > 0)
5661 5278 cv_wait(&state->id_trap_cv, &state->id_trap_lock);
5662 5279 mutex_exit(&state->id_trap_lock);
5663 5280
5664 5281 state->id_mac_state &= (~IBD_DRV_SM_NOTICES_REGISTERED);
5665 5282 }
5666 5283
5667 5284 if (progress & IBD_DRV_SCQ_NOTIFY_ENABLED) {
5668 5285 /*
5669 5286 * Flushing the channel ensures that all pending WQE's
5670 5287 * are marked with flush_error and handed to the CQ. It
5671 5288 * does not guarantee the invocation of the CQ handler.
5672 5289 * This call is guaranteed to return successfully for
5673 5290 * UD QPNs.
5674 5291 */
5675 5292 if ((ret = ibt_flush_channel(state->id_chnl_hdl)) !=
5676 5293 IBT_SUCCESS) {
5677 5294 DPRINT(10, "ibd_undo_start: flush_channel "
5678 5295 "failed, ret=%d", ret);
5679 5296 }
5680 5297
5681 5298 /*
5682 5299 * Give some time for the TX CQ handler to process the
5683 5300 * completions.
5684 5301 */
5685 5302 attempts = 10;
5686 5303 mutex_enter(&state->id_tx_list.dl_mutex);
5687 5304 mutex_enter(&state->id_tx_rel_list.dl_mutex);
5688 5305 while (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt
5689 5306 != state->id_ud_num_swqe) {
5690 5307 if (--attempts == 0)
5691 5308 break;
5692 5309 mutex_exit(&state->id_tx_rel_list.dl_mutex);
5693 5310 mutex_exit(&state->id_tx_list.dl_mutex);
5694 5311 delay(drv_usectohz(100000));
5695 5312 mutex_enter(&state->id_tx_list.dl_mutex);
5696 5313 mutex_enter(&state->id_tx_rel_list.dl_mutex);
5697 5314 }
5698 5315 ibt_set_cq_handler(state->id_scq_hdl, 0, 0);
5699 5316 if (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt !=
5700 5317 state->id_ud_num_swqe) {
5701 5318 cmn_err(CE_WARN, "tx resources not freed\n");
5702 5319 }
5703 5320 mutex_exit(&state->id_tx_rel_list.dl_mutex);
5704 5321 mutex_exit(&state->id_tx_list.dl_mutex);
5705 5322
5706 5323 attempts = 10;
5707 5324 while (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) {
5708 5325 if (--attempts == 0)
5709 5326 break;
5710 5327 delay(drv_usectohz(100000));
5711 5328 }
5712 5329 ibt_set_cq_handler(state->id_rcq_hdl, 0, 0);
5713 5330 if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) {
5714 5331 cmn_err(CE_WARN, "rx resources not freed\n");
5715 5332 }
5716 5333
5717 5334 state->id_mac_state &= (~IBD_DRV_SCQ_NOTIFY_ENABLED);
5718 5335 }
5719 5336
5720 5337 if (progress & IBD_DRV_BCAST_GROUP_JOINED) {
5721 5338 /*
5722 5339 * Drop all residual full/non membership. This includes full
5723 5340 * membership to the broadcast group, and any nonmembership
5724 5341 * acquired during transmits. We do this after the Tx completion
5725 5342 * handlers are done, since those might result in some late
5726 5343 * leaves; this also eliminates a potential race with that
5727 5344 * path wrt the mc full list insert/delete. Trap handling
5728 5345 * has also been suppressed at this point. Thus, no locks
5729 5346 * are required while traversing the mc full list.
5730 5347 */
5731 5348 DPRINT(2, "ibd_undo_start: clear full cache entries");
5732 5349 mce = list_head(&state->id_mc_full);
5733 5350 while (mce != NULL) {
5734 5351 mgid = mce->mc_info.mc_adds_vect.av_dgid;
5735 5352 jstate = mce->mc_jstate;
5736 5353 mce = list_next(&state->id_mc_full, mce);
5737 5354 ibd_leave_group(state, mgid, jstate);
5738 5355 }
5739 5356 state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_JOINED);
5740 5357 }
5741 5358
5742 5359 if (progress & IBD_DRV_RXLIST_ALLOCD) {
5743 5360 ibd_fini_rxlist(state);
5744 5361 state->id_mac_state &= (~IBD_DRV_RXLIST_ALLOCD);
5745 5362 }
5746 5363
5747 5364 if (progress & IBD_DRV_TXLIST_ALLOCD) {
5748 5365 ibd_fini_txlist(state);
5749 5366 state->id_mac_state &= (~IBD_DRV_TXLIST_ALLOCD);
5750 5367 }
5751 5368
5752 5369 if (progress & IBD_DRV_UD_CHANNEL_SETUP) {
5753 5370 if ((ret = ibt_free_channel(state->id_chnl_hdl)) !=
5754 5371 IBT_SUCCESS) {
5755 5372 DPRINT(10, "ibd_undo_start: free_channel "
5756 5373 "failed, ret=%d", ret);
5757 5374 }
5758 5375
5759 5376 state->id_mac_state &= (~IBD_DRV_UD_CHANNEL_SETUP);
5760 5377 }
5761 5378
5762 5379 if (progress & IBD_DRV_CQS_ALLOCD) {
5763 5380 kmem_free(state->id_txwcs,
5764 5381 sizeof (ibt_wc_t) * state->id_txwcs_size);
5765 5382 if ((ret = ibt_free_cq(state->id_scq_hdl)) !=
5766 5383 IBT_SUCCESS) {
5767 5384 DPRINT(10, "ibd_undo_start: free_cq(scq) "
5768 5385 "failed, ret=%d", ret);
5769 5386 }
5770 5387
5771 5388 kmem_free(state->id_rxwcs,
5772 5389 sizeof (ibt_wc_t) * state->id_rxwcs_size);
5773 5390 if ((ret = ibt_free_cq(state->id_rcq_hdl)) != IBT_SUCCESS) {
5774 5391 DPRINT(10, "ibd_undo_start: free_cq(rcq) failed, "
5775 5392 "ret=%d", ret);
5776 5393 }
5777 5394
5778 5395 state->id_txwcs = NULL;
5779 5396 state->id_rxwcs = NULL;
5780 5397 state->id_scq_hdl = NULL;
5781 5398 state->id_rcq_hdl = NULL;
5782 5399
5783 5400 state->id_mac_state &= (~IBD_DRV_CQS_ALLOCD);
5784 5401 }
5785 5402
5786 5403 if (progress & IBD_DRV_ACACHE_INITIALIZED) {
5787 5404 mutex_enter(&state->id_ac_mutex);
5788 5405 mod_hash_destroy_hash(state->id_ah_active_hash);
5789 5406 mutex_exit(&state->id_ac_mutex);
5790 5407 ibd_acache_fini(state);
5791 5408
5792 5409 state->id_mac_state &= (~IBD_DRV_ACACHE_INITIALIZED);
5793 5410 }
5794 5411
5795 5412 if (progress & IBD_DRV_BCAST_GROUP_FOUND) {
5796 5413 /*
5797 5414 * If we'd created the ipoib broadcast group and had
5798 5415 * successfully joined it, leave it now
5799 5416 */
5800 5417 if (state->id_bgroup_created) {
5801 5418 mgid = state->id_mcinfo->mc_adds_vect.av_dgid;
5802 5419 jstate = IB_MC_JSTATE_FULL;
5803 5420 (void) ibt_leave_mcg(state->id_sgid, mgid,
5804 5421 state->id_sgid, jstate);
5805 5422 }
5806 5423 ibt_free_mcg_info(state->id_mcinfo, 1);
5807 5424
5808 5425 state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_FOUND);
5809 5426 }
5810 5427
5811 5428 return (DDI_SUCCESS);
5812 5429 }
5813 5430
5814 5431 /*
5815 5432 * These pair of routines are used to set/clear the condition that
5816 5433 * the caller is likely to do something to change the id_mac_state.
5817 5434 * If there's already someone doing either a start or a stop (possibly
5818 5435 * due to the async handler detecting a pkey relocation event, a plumb
5819 5436 * or dlpi_open, or an unplumb or dlpi_close coming in), we wait until
5820 5437 * that's done.
5821 5438 */
5822 5439 static void
5823 5440 ibd_set_mac_progress(ibd_state_t *state, uint_t flag)
5824 5441 {
5825 5442 mutex_enter(&state->id_macst_lock);
5826 5443 while (state->id_mac_state & IBD_DRV_RESTART_IN_PROGRESS)
5827 5444 cv_wait(&state->id_macst_cv, &state->id_macst_lock);
5828 5445
5829 5446 state->id_mac_state |= flag;
5830 5447 mutex_exit(&state->id_macst_lock);
5831 5448 }
5832 5449
5833 5450 static void
5834 5451 ibd_clr_mac_progress(ibd_state_t *state, uint_t flag)
5835 5452 {
5836 5453 mutex_enter(&state->id_macst_lock);
5837 5454 state->id_mac_state &= (~flag);
5838 5455 cv_signal(&state->id_macst_cv);
5839 5456 mutex_exit(&state->id_macst_lock);
5840 5457 }
5841 5458
5842 5459 /*
5843 5460 * GLDv3 entry point to start hardware.
5844 5461 */
5845 5462 /*ARGSUSED*/
5846 5463 static int
5847 5464 ibd_m_start(void *arg)
5848 5465 {
5849 5466 ibd_state_t *state = arg;
5850 5467 int ret;
5851 5468
5852 5469 if (state->id_type == IBD_PORT_DRIVER)
5853 5470 return (EINVAL);
5854 5471
5855 5472 ibd_set_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
5856 5473 if (state->id_mac_state & IBD_DRV_IN_DELETION) {
5857 5474 ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
5858 5475 return (EIO);
5859 5476 }
5860 5477
5861 5478 ret = ibd_start(state);
5862 5479 ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
5863 5480 return (ret);
5864 5481 }
5865 5482
5866 5483 static int
5867 5484 ibd_start(ibd_state_t *state)
5868 5485 {
5869 5486 int err;
5870 5487 ibt_status_t ret;
5871 5488 int late_hca_init = 0;
5872 5489
5873 5490 if (state->id_mac_state & IBD_DRV_STARTED)
5874 5491 return (DDI_SUCCESS);
5875 5492
5876 5493 /*
5877 5494 * We do not increment the running flag when calling ibd_start() as
5878 5495 * a result of some event which moves the state away from late HCA
5879 5496 * initialization viz. MCG_CREATED, PORT_CHANGE or link availability.
5880 5497 */
5881 5498 if (!(state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) &&
5882 5499 (atomic_inc_32_nv(&state->id_running) != 1)) {
5883 5500 DPRINT(10, "ibd_start: id_running is non-zero");
5884 5501 cmn_err(CE_WARN, "ibd_start: id_running was not 0\n");
5885 5502 atomic_dec_32(&state->id_running);
5886 5503 return (EINVAL);
5887 5504 }
5888 5505
5889 5506 /*
5890 5507 * Get port details; if we fail here, something bad happened.
5891 5508 * Fail plumb.
5892 5509 */
5893 5510 if ((err = ibd_get_port_details(state)) != 0) {
5894 5511 DPRINT(10, "ibd_start: ibd_get_port_details() failed");
5895 5512 goto start_fail;
5896 5513 }
5897 5514 /*
5898 5515 * If state->id_link_state is DOWN, it indicates that either the port
5899 5516 * is down, or the pkey is not available. In both cases, resort to late
5900 5517 * initialization. Register for subnet notices, and return success.
5901 5518 */
5902 5519 state->id_mac_state |= IBD_DRV_PORT_DETAILS_OBTAINED;
5903 5520 if (state->id_link_state == LINK_STATE_DOWN) {
5904 5521 late_hca_init = 1;
5905 5522 goto late_hca_init_return;
5906 5523 }
5907 5524
5908 5525 /*
5909 5526 * Find the IPoIB broadcast group
5910 5527 */
5911 5528 if (ibd_find_bgroup(state) != IBT_SUCCESS) {
5912 5529 /* Resort to late initialization */
5913 5530 late_hca_init = 1;
5914 5531 goto reg_snet_notices;
5915 5532 }
5916 5533 state->id_mac_state |= IBD_DRV_BCAST_GROUP_FOUND;
5917 5534
5918 5535 /*
5919 5536 * Initialize per-interface caches and lists; if we fail here,
5920 5537 * it is most likely due to a lack of resources
5921 5538 */
5922 5539 if (ibd_acache_init(state) != DDI_SUCCESS) {
5923 5540 DPRINT(10, "ibd_start: ibd_acache_init() failed");
5924 5541 err = ENOMEM;
5925 5542 goto start_fail;
5926 5543 }
5927 5544 state->id_mac_state |= IBD_DRV_ACACHE_INITIALIZED;
5928 5545
5929 5546 /*
5930 5547 * Allocate send and receive completion queues
5931 5548 */
5932 5549 if (ibd_alloc_cqs(state) != DDI_SUCCESS) {
5933 5550 DPRINT(10, "ibd_start: ibd_alloc_cqs() failed");
5934 5551 err = ENOMEM;
5935 5552 goto start_fail;
5936 5553 }
5937 5554 state->id_mac_state |= IBD_DRV_CQS_ALLOCD;
5938 5555
5939 5556 /*
5940 5557 * Setup a UD channel
5941 5558 */
5942 5559 if (ibd_setup_ud_channel(state) != DDI_SUCCESS) {
5943 5560 err = ENOMEM;
5944 5561 DPRINT(10, "ibd_start: ibd_setup_ud_channel() failed");
5945 5562 goto start_fail;
5946 5563 }
5947 5564 state->id_mac_state |= IBD_DRV_UD_CHANNEL_SETUP;
5948 5565
5949 5566 /*
5950 5567 * Allocate and initialize the tx buffer list
5951 5568 */
5952 5569 if (ibd_init_txlist(state) != DDI_SUCCESS) {
5953 5570 DPRINT(10, "ibd_start: ibd_init_txlist() failed");
5954 5571 err = ENOMEM;
5955 5572 goto start_fail;
5956 5573 }
5957 5574 state->id_mac_state |= IBD_DRV_TXLIST_ALLOCD;
5958 5575
5959 5576 /*
5960 5577 * Create the send cq handler here
5961 5578 */
5962 5579 ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state);
5963 5580 if ((ret = ibt_enable_cq_notify(state->id_scq_hdl,
5964 5581 IBT_NEXT_COMPLETION)) != IBT_SUCCESS) {
5965 5582 DPRINT(10, "ibd_start: ibt_enable_cq_notify(scq) "
5966 5583 "failed, ret=%d", ret);
5967 5584 err = EINVAL;
5968 5585 goto start_fail;
5969 5586 }
5970 5587 state->id_mac_state |= IBD_DRV_SCQ_NOTIFY_ENABLED;
5971 5588
5972 5589 /*
5973 5590 * Allocate and initialize the rx buffer list
5974 5591 */
5975 5592 if (ibd_init_rxlist(state) != DDI_SUCCESS) {
5976 5593 DPRINT(10, "ibd_start: ibd_init_rxlist() failed");
5977 5594 err = ENOMEM;
5978 5595 goto start_fail;
5979 5596 }
5980 5597 state->id_mac_state |= IBD_DRV_RXLIST_ALLOCD;
5981 5598
5982 5599 /*
5983 5600 * Join IPoIB broadcast group
5984 5601 */
5985 5602 if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) {
5986 5603 DPRINT(10, "ibd_start: ibd_join_group() failed");
5987 5604 err = ENOTACTIVE;
5988 5605 goto start_fail;
5989 5606 }
5990 5607 state->id_mac_state |= IBD_DRV_BCAST_GROUP_JOINED;
5991 5608
5992 5609 /*
5993 5610 * When we did mac_register() in ibd_attach(), we didn't register
5994 5611 * the real macaddr and we didn't have the true port mtu. Now that
5995 5612 * we're almost ready, set the local mac address and broadcast
5996 5613 * addresses and update gldv3 about the real values of these
5997 5614 * parameters.
5998 5615 */
5999 5616 if (state->id_enable_rc) {
6000 5617 ibd_h2n_mac(&state->id_macaddr,
6001 5618 IBD_MAC_ADDR_RC + state->id_qpnum,
6002 5619 state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
6003 5620 ibd_h2n_mac(&state->rc_macaddr_loopback, state->id_qpnum,
6004 5621 state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
6005 5622 } else {
6006 5623 ibd_h2n_mac(&state->id_macaddr, state->id_qpnum,
6007 5624 state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
6008 5625 }
6009 5626 ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK,
6010 5627 state->id_mgid.gid_prefix, state->id_mgid.gid_guid);
6011 5628
6012 5629 if (!state->id_enable_rc) {
6013 5630 (void) mac_maxsdu_update2(state->id_mh,
6014 5631 state->id_mtu - IPOIB_HDRSIZE,
6015 5632 state->id_mtu - IPOIB_HDRSIZE);
6016 5633 }
6017 5634 mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr);
6018 5635
6019 5636 /*
6020 5637 * Setup the receive cq handler
6021 5638 */
6022 5639 ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state);
6023 5640 if ((ret = ibt_enable_cq_notify(state->id_rcq_hdl,
6024 5641 IBT_NEXT_COMPLETION)) != IBT_SUCCESS) {
6025 5642 DPRINT(10, "ibd_start: ibt_enable_cq_notify(rcq) "
6026 5643 "failed, ret=%d", ret);
6027 5644 err = EINVAL;
6028 5645 goto start_fail;
6029 5646 }
6030 5647 state->id_mac_state |= IBD_DRV_RCQ_NOTIFY_ENABLED;
6031 5648
6032 5649 reg_snet_notices:
6033 5650 /*
6034 5651 	 * In the case of the normal initialization sequence, set up the
6035 5652 	 * subnet notices handler after we've initialized the acache/
6036 5653 * mcache and started the async thread, both of which are required for
6037 5654 * the trap handler to function properly.
6038 5655 *
6039 5656 * Now that the async thread has been started (and we've already done
6040 5657 * a mac_register() during attach so mac_tx_update() can be called
6041 5658 * if necessary without any problem), we can enable the trap handler
6042 5659 * to queue requests to the async thread.
6043 5660 *
6044 5661 * In case of late hca initialization, the subnet notices handler will
6045 5662 * only handle MCG created/deleted event. The action performed as part
6046 5663 * of handling these events is to start the interface. So, the
6047 5664 * acache/mcache initialization is not a necessity in such cases for
6048 5665 * registering the subnet notices handler. Also, if we are in
6049 5666 * ibd_start() as a result of, say, some event handling after entering
6050 5667 	 * the late hca initialization phase, there is no need to register again.
6051 5668 */
6052 5669 if ((state->id_mac_state & IBD_DRV_SM_NOTICES_REGISTERED) == 0) {
6053 5670 ibt_register_subnet_notices(state->id_ibt_hdl,
6054 5671 ibd_snet_notices_handler, state);
6055 5672 mutex_enter(&state->id_trap_lock);
6056 5673 state->id_trap_stop = B_FALSE;
6057 5674 mutex_exit(&state->id_trap_lock);
6058 5675 state->id_mac_state |= IBD_DRV_SM_NOTICES_REGISTERED;
6059 5676 }
6060 5677
6061 5678 late_hca_init_return:
6062 5679 if (late_hca_init == 1) {
6063 5680 state->id_mac_state |= IBD_DRV_IN_LATE_HCA_INIT;
6064 5681 /*
6065 5682 		 * In the case of late initialization, mark the link state as down,
6066 5683 		 * regardless of the actual link state as reported in the
6067 5684 * port_info.
6068 5685 */
6069 5686 state->id_link_state = LINK_STATE_DOWN;
6070 5687 mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr);
6071 5688 mac_link_update(state->id_mh, state->id_link_state);
6072 5689 return (DDI_SUCCESS);
6073 5690 }
6074 5691
6075 5692 if (state->id_enable_rc) {
6076 5693 if (state->rc_enable_srq) {
6077 5694 if (state->id_mac_state & IBD_DRV_RC_SRQ_ALLOCD) {
6078 5695 if (ibd_rc_repost_srq_free_list(state) !=
6079 5696 IBT_SUCCESS) {
6080 5697 err = ENOMEM;
6081 5698 goto start_fail;
6082 5699 }
6083 5700 } else {
6084 5701 /* Allocate SRQ resource */
6085 5702 if (ibd_rc_init_srq_list(state) !=
6086 5703 IBT_SUCCESS) {
6087 5704 err = ENOMEM;
6088 5705 goto start_fail;
6089 5706 }
6090 5707 state->id_mac_state |= IBD_DRV_RC_SRQ_ALLOCD;
6091 5708 }
6092 5709 }
6093 5710
6094 5711 if (ibd_rc_init_tx_largebuf_list(state) != IBT_SUCCESS) {
6095 5712 DPRINT(10, "ibd_start: ibd_rc_init_tx_largebuf_list() "
6096 5713 "failed");
6097 5714 err = ENOMEM;
6098 5715 goto start_fail;
6099 5716 }
6100 5717 state->id_mac_state |= IBD_DRV_RC_LARGEBUF_ALLOCD;
6101 5718
6102 5719 /* RC: begin to listen only after everything is available */
6103 5720 if (ibd_rc_listen(state) != IBT_SUCCESS) {
6104 5721 DPRINT(10, "ibd_start: ibd_rc_listen() failed");
6105 5722 err = EINVAL;
6106 5723 goto start_fail;
6107 5724 }
6108 5725 state->id_mac_state |= IBD_DRV_RC_LISTEN;
6109 5726 }
6110 5727
6111 5728 /*
6112 5729 * Indicate link status to GLDv3 and higher layers. By default,
6113 5730 * we assume we are in up state (which must have been true at
6114 5731 * least at the time the broadcast mcg's were probed); if there
6115 5732 * were any up/down transitions till the time we come here, the
6116 5733 * async handler will have updated last known state, which we
6117 5734 * use to tell GLDv3. The async handler will not send any
6118 5735 * notifications to GLDv3 till we reach here in the initialization
6119 5736 * sequence.
6120 5737 */
6121 5738 mac_link_update(state->id_mh, state->id_link_state);
6122 5739 state->id_mac_state &= ~IBD_DRV_IN_LATE_HCA_INIT;
6123 5740 state->id_mac_state |= IBD_DRV_STARTED;
6124 5741
6125 5742 /* Start timer after everything is ready */
6126 5743 if (state->id_enable_rc) {
6127 5744 mutex_enter(&state->rc_timeout_lock);
6128 5745 state->rc_timeout_start = B_TRUE;
6129 5746 state->rc_timeout = timeout(ibd_rc_conn_timeout_call, state,
6130 5747 SEC_TO_TICK(ibd_rc_conn_timeout));
6131 5748 mutex_exit(&state->rc_timeout_lock);
6132 5749 state->id_mac_state |= IBD_DRV_RC_TIMEOUT;
6133 5750 }
6134 5751
6135 5752 return (DDI_SUCCESS);
6136 5753
6137 5754 start_fail:
6138 5755 /*
6139 5756 	 * If we ran into a problem during ibd_start() and then hit
6140 5757 	 * some other problem while undoing our partial work, we can't
6141 5758 * do anything about it. Ignore any errors we might get from
6142 5759 * ibd_undo_start() and just return the original error we got.
6143 5760 */
6144 5761 (void) ibd_undo_start(state, LINK_STATE_DOWN);
6145 5762 return (err);
6146 5763 }
6147 5764
6148 5765 /*
6149 5766 * GLDv3 entry point to stop hardware from receiving packets.
6150 5767 */
6151 5768 /*ARGSUSED*/
6152 5769 static void
6153 5770 ibd_m_stop(void *arg)
6154 5771 {
6155 5772 ibd_state_t *state = (ibd_state_t *)arg;
6156 5773
6157 5774 if (state->id_type == IBD_PORT_DRIVER)
6158 5775 return;
6159 5776
6160 5777 ibd_set_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);
6161 5778
6162 5779 (void) ibd_undo_start(state, state->id_link_state);
6163 5780
6164 5781 ibd_clr_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);
6165 5782 }
6166 5783
6167 5784 /*
6168 5785 * GLDv3 entry point to modify device's mac address. We do not
6169 5786 * allow address modifications.
6170 5787 */
6171 5788 static int
6172 5789 ibd_m_unicst(void *arg, const uint8_t *macaddr)
6173 5790 {
6174 5791 ibd_state_t *state = arg;
6175 5792
6176 5793 if (state->id_type == IBD_PORT_DRIVER)
6177 5794 return (EINVAL);
6178 5795
6179 5796 /*
6180 5797 * Don't bother even comparing the macaddr if we haven't
6181 5798 * completed ibd_m_start().
6182 5799 */
6183 5800 if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
6184 5801 return (0);
6185 5802
6186 5803 if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0)
6187 5804 return (0);
6188 5805 else
6189 5806 return (EINVAL);
6190 5807 }
6191 5808
6192 5809 /*
6193 5810  * The blocking part of the IBA join/leave operations is done out
6194 5811 * of here on the async thread.
6195 5812 */
6196 5813 static void
6197 5814 ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op)
6198 5815 {
6199 5816 DPRINT(3, "ibd_async_multicast : async_setmc op %d :"
6200 5817 "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid);
6201 5818
6202 5819 if (op == IBD_ASYNC_JOIN) {
6203 5820 if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) {
6204 5821 ibd_print_warn(state, "Join multicast group failed :"
6205 5822 "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
6206 5823 }
6207 5824 } else {
6208 5825 /*
6209 5826 * Here, we must search for the proper mcg_info and
6210 5827 * use that to leave the group.
6211 5828 */
6212 5829 ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL);
6213 5830 }
6214 5831 }
6215 5832
6216 5833 /*
6217 5834 * GLDv3 entry point for multicast enable/disable requests.
6218 5835 * This function queues the operation to the async thread and
6219 5836 * return success for a valid multicast address.
6220 5837  * returns success for a valid multicast address.
6221 5838 static int
6222 5839 ibd_m_multicst(void *arg, boolean_t add, const uint8_t *mcmac)
6223 5840 {
6224 5841 ibd_state_t *state = (ibd_state_t *)arg;
6225 5842 ipoib_mac_t maddr, *mcast;
6226 5843 ib_gid_t mgid;
6227 5844 ibd_req_t *req;
6228 5845
6229 5846 if (state->id_type == IBD_PORT_DRIVER)
6230 5847 return (EINVAL);
6231 5848
6232 5849 /*
6233 5850 * If we haven't completed ibd_m_start(), async thread wouldn't
6234 5851 * have been started and id_bcaddr wouldn't be set, so there's
6235 5852 * no point in continuing.
6236 5853 */
6237 5854 if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
6238 5855 return (0);
6239 5856
6240 5857 /*
6241 5858 * The incoming multicast address might not be aligned properly
6242 5859 * on a 4 byte boundary to be considered an ipoib_mac_t. We force
6243 5860 * it to look like one though, to get the offsets of the mc gid,
6244 5861 * since we know we are not going to dereference any values with
6245 5862 * the ipoib_mac_t pointer.
6246 5863 */
6247 5864 bcopy(mcmac, &maddr, sizeof (ipoib_mac_t));
6248 5865 mcast = &maddr;
6249 5866
6250 5867 /*
6251 5868 * Check validity of MCG address. We could additionally check
6252 5869  * that an enable/disable is not being issued on the "broadcast"
6253 5870 * mcg, but since this operation is only invokable by privileged
6254 5871 * programs anyway, we allow the flexibility to those dlpi apps.
6255 5872 * Note that we do not validate the "scope" of the IBA mcg.
6256 5873 */
6257 5874 if ((ntohl(mcast->ipoib_qpn) & IB_QPN_MASK) != IB_MC_QPN)
6258 5875 return (EINVAL);
6259 5876
6260 5877 /*
6261 5878 * fill in multicast pkey and scope
6262 5879 */
6263 5880 IBD_FILL_SCOPE_PKEY(mcast, state->id_scope, state->id_pkey);
6264 5881
6265 5882 /*
6266 5883 * If someone is trying to JOIN/LEAVE the broadcast group, we do
6267 5884 * nothing (i.e. we stay JOINed to the broadcast group done in
6268 5885 * ibd_m_start()), to mimic ethernet behavior. IPv4 specifically
6269 5886  * requires being joined to broadcast groups at all times.
6270 5887 * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also
6271 5888 * depends on this.
6272 5889 */
6273 5890 if (bcmp(mcast, &state->id_bcaddr, IPOIB_ADDRL) == 0)
6274 5891 return (0);
6275 5892
6276 5893 ibd_n2h_gid(mcast, &mgid);
6277 5894 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
6278 5895 if (req == NULL)
6279 5896 return (ENOMEM);
6280 5897
6281 5898 req->rq_gid = mgid;
6282 5899
6283 5900 if (add) {
6284 5901 DPRINT(1, "ibd_m_multicst : %016llx:%016llx\n",
6285 5902 mgid.gid_prefix, mgid.gid_guid);
6286 5903 ibd_queue_work_slot(state, req, IBD_ASYNC_JOIN);
6287 5904 } else {
6288 5905 DPRINT(1, "ibd_m_multicst : unset_multicast : "
6289 5906 "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
6290 5907 ibd_queue_work_slot(state, req, IBD_ASYNC_LEAVE);
6291 5908 }
6292 5909 return (0);
6293 5910 }
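
[Reviewer note, not part of the change] ibd_n2h_gid() is defined elsewhere in
this file and is not shown in this hunk. As a rough, hypothetical illustration
of the conversion it presumably performs (based on the ipoib_gidpref[] and
ipoib_gidsuff[] fields already referenced in this file; treat the field layout
as an assumption rather than the actual implementation):

static void
ibd_n2h_gid_sketch(const ipoib_mac_t *mac, ib_gid_t *gid)
{
	/* Assemble each 64-bit half of the GID from two 32-bit wire words */
	gid->gid_prefix = ((uint64_t)ntohl(mac->ipoib_gidpref[0]) << 32) |
	    ntohl(mac->ipoib_gidpref[1]);
	gid->gid_guid = ((uint64_t)ntohl(mac->ipoib_gidsuff[0]) << 32) |
	    ntohl(mac->ipoib_gidsuff[1]);
}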
6294 5911
6295 5912 /*
6296 5913  * The blocking part of the IBA promiscuous operations is done
6297 5914  * out of here on the async thread. An invocation may be due
6298 5915  * to an explicit (dlpi) promiscuous request or to a port
6299 5916  * up/down event.
6300 5917 */
6301 5918 static void
6302 5919 ibd_async_unsetprom(ibd_state_t *state)
6303 5920 {
6304 5921 ibd_mce_t *mce = list_head(&state->id_mc_non);
6305 5922 ib_gid_t mgid;
6306 5923
6307 5924 DPRINT(2, "ibd_async_unsetprom : async_unset_promisc");
6308 5925
6309 5926 while (mce != NULL) {
6310 5927 mgid = mce->mc_info.mc_adds_vect.av_dgid;
6311 5928 mce = list_next(&state->id_mc_non, mce);
6312 5929 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
6313 5930 }
6314 5931 state->id_prom_op = IBD_OP_NOTSTARTED;
6315 5932 }
6316 5933
6317 5934 /*
6318 5935  * The blocking part of the IBA promiscuous operations is done
6319 5936  * out of here on the async thread. An invocation may be due
6320 5937  * to an explicit (dlpi) promiscuous request or to a port
6321 5938  * up/down event.
6322 5939 */
6323 5940 static void
6324 5941 ibd_async_setprom(ibd_state_t *state)
6325 5942 {
6326 5943 ibt_mcg_attr_t mcg_attr;
6327 5944 ibt_mcg_info_t *mcg_info;
6328 5945 ib_gid_t mgid;
6329 5946 uint_t numg;
6330 5947 int i;
6331 5948 char ret = IBD_OP_COMPLETED;
6332 5949
6333 5950 DPRINT(2, "ibd_async_setprom : async_set_promisc");
6334 5951
6335 5952 /*
6336 5953 * Obtain all active MC groups on the IB fabric with
6337 5954 * specified criteria (scope + Pkey + Qkey + mtu).
6338 5955 */
6339 5956 bzero(&mcg_attr, sizeof (mcg_attr));
6340 5957 mcg_attr.mc_pkey = state->id_pkey;
6341 5958 mcg_attr.mc_scope = state->id_scope;
6342 5959 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
6343 5960 mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu;
6344 5961 mcg_attr.mc_mtu_req.r_selector = IBT_EQU;
6345 5962 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) !=
6346 5963 IBT_SUCCESS) {
6347 5964 ibd_print_warn(state, "Could not get list of IBA multicast "
6348 5965 "groups");
6349 5966 ret = IBD_OP_ERRORED;
6350 5967 goto done;
6351 5968 }
6352 5969
6353 5970 /*
6354 5971 * Iterate over the returned mcg's and join as NonMember
6355 5972 * to the IP mcg's.
6356 5973 */
6357 5974 for (i = 0; i < numg; i++) {
6358 5975 /*
6359 5976 * Do a NonMember JOIN on the MC group.
6360 5977 */
6361 5978 mgid = mcg_info[i].mc_adds_vect.av_dgid;
6362 5979 if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL)
6363 5980 ibd_print_warn(state, "IBA promiscuous mode missed "
6364 5981 "multicast gid %016llx:%016llx",
6365 5982 (u_longlong_t)mgid.gid_prefix,
6366 5983 (u_longlong_t)mgid.gid_guid);
6367 5984 }
6368 5985
6369 5986 ibt_free_mcg_info(mcg_info, numg);
6370 5987 DPRINT(4, "ibd_async_setprom : async_set_promisc completes");
6371 5988 done:
6372 5989 state->id_prom_op = ret;
6373 5990 }
6374 5991
6375 5992 /*
6376 5993 * GLDv3 entry point for multicast promiscuous enable/disable requests.
6377 5994 * GLDv3 assumes phys state receives more packets than multi state,
6378 5995 * which is not true for IPoIB. Thus, treat the multi and phys
6379 5996 * promiscuous states the same way to work with GLDv3's assumption.
6380 5997 */
6381 5998 static int
6382 5999 ibd_m_promisc(void *arg, boolean_t on)
6383 6000 {
6384 6001 ibd_state_t *state = (ibd_state_t *)arg;
6385 6002 ibd_req_t *req;
6386 6003
6387 6004 if (state->id_type == IBD_PORT_DRIVER)
6388 6005 return (EINVAL);
6389 6006
6390 6007 /*
6391 6008 	 * The async thread wouldn't have been started if we haven't
6392 6009 	 * completed ibd_m_start().
6393 6010 */
6394 6011 if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
6395 6012 return (0);
6396 6013
6397 6014 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
6398 6015 if (req == NULL)
6399 6016 return (ENOMEM);
6400 6017 if (on) {
6401 6018 DPRINT(1, "ibd_m_promisc : set_promisc : %d", on);
6402 6019 ibd_queue_work_slot(state, req, IBD_ASYNC_PROMON);
6403 6020 } else {
6404 6021 DPRINT(1, "ibd_m_promisc : unset_promisc");
6405 6022 ibd_queue_work_slot(state, req, IBD_ASYNC_PROMOFF);
6406 6023 }
6407 6024
6408 6025 return (0);
6409 6026 }
6410 6027
6411 6028 /*
6412 6029 * GLDv3 entry point for gathering statistics.
6413 6030 */
6414 6031 static int
6415 6032 ibd_m_stat(void *arg, uint_t stat, uint64_t *val)
6416 6033 {
6417 6034 ibd_state_t *state = (ibd_state_t *)arg;
6418 6035
6419 6036 switch (stat) {
6420 6037 case MAC_STAT_IFSPEED:
6421 6038 *val = state->id_link_speed;
6422 6039 break;
6423 6040 case MAC_STAT_MULTIRCV:
6424 6041 *val = state->id_multi_rcv;
6425 6042 break;
6426 6043 case MAC_STAT_BRDCSTRCV:
6427 6044 *val = state->id_brd_rcv;
6428 6045 break;
6429 6046 case MAC_STAT_MULTIXMT:
6430 6047 *val = state->id_multi_xmt;
6431 6048 break;
6432 6049 case MAC_STAT_BRDCSTXMT:
6433 6050 *val = state->id_brd_xmt;
6434 6051 break;
6435 6052 case MAC_STAT_RBYTES:
6436 6053 *val = state->id_rcv_bytes + state->rc_rcv_trans_byte
6437 6054 + state->rc_rcv_copy_byte;
6438 6055 break;
6439 6056 case MAC_STAT_IPACKETS:
6440 6057 *val = state->id_rcv_pkt + state->rc_rcv_trans_pkt
6441 6058 + state->rc_rcv_copy_pkt;
6442 6059 break;
6443 6060 case MAC_STAT_OBYTES:
6444 6061 *val = state->id_xmt_bytes + state->rc_xmt_bytes;
6445 6062 break;
6446 6063 case MAC_STAT_OPACKETS:
6447 6064 *val = state->id_xmt_pkt + state->rc_xmt_small_pkt +
6448 6065 state->rc_xmt_fragmented_pkt +
6449 6066 state->rc_xmt_map_fail_pkt + state->rc_xmt_map_succ_pkt;
6450 6067 break;
6451 6068 case MAC_STAT_OERRORS:
6452 6069 *val = state->id_ah_error; /* failed AH translation */
6453 6070 break;
6454 6071 case MAC_STAT_IERRORS:
6455 6072 *val = 0;
6456 6073 break;
6457 6074 case MAC_STAT_NOXMTBUF:
6458 6075 *val = state->id_tx_short + state->rc_swqe_short +
6459 6076 state->rc_xmt_buf_short;
6460 6077 break;
6461 6078 case MAC_STAT_NORCVBUF:
6462 6079 default:
6463 6080 return (ENOTSUP);
6464 6081 }
6465 6082
6466 6083 return (0);
6467 6084 }
6468 6085
6469 6086 static void
6470 6087 ibd_async_txsched(ibd_state_t *state)
6471 6088 {
6472 6089 ibd_resume_transmission(state);
6473 6090 }
6474 6091
6475 6092 static void
6476 6093 ibd_resume_transmission(ibd_state_t *state)
6477 6094 {
6478 6095 int flag;
6479 6096 int met_thresh = 0;
6480 6097 int thresh = 0;
6481 6098 int ret = -1;
6482 6099
6483 6100 mutex_enter(&state->id_sched_lock);
6484 6101 if (state->id_sched_needed & IBD_RSRC_SWQE) {
6485 6102 mutex_enter(&state->id_tx_list.dl_mutex);
6486 6103 mutex_enter(&state->id_tx_rel_list.dl_mutex);
6487 6104 met_thresh = state->id_tx_list.dl_cnt +
6488 6105 state->id_tx_rel_list.dl_cnt;
6489 6106 mutex_exit(&state->id_tx_rel_list.dl_mutex);
6490 6107 mutex_exit(&state->id_tx_list.dl_mutex);
6491 6108 thresh = IBD_FREE_SWQES_THRESH;
6492 6109 flag = IBD_RSRC_SWQE;
6493 6110 } else if (state->id_sched_needed & IBD_RSRC_LSOBUF) {
6494 6111 ASSERT(state->id_lso != NULL);
6495 6112 mutex_enter(&state->id_lso_lock);
6496 6113 met_thresh = state->id_lso->bkt_nfree;
6497 6114 thresh = IBD_FREE_LSOS_THRESH;
6498 6115 mutex_exit(&state->id_lso_lock);
6499 6116 flag = IBD_RSRC_LSOBUF;
6500 6117 if (met_thresh > thresh)
6501 6118 state->id_sched_lso_cnt++;
6502 6119 }
6503 6120 if (met_thresh > thresh) {
6504 6121 state->id_sched_needed &= ~flag;
6505 6122 state->id_sched_cnt++;
6506 6123 ret = 0;
6507 6124 }
6508 6125 mutex_exit(&state->id_sched_lock);
6509 6126
6510 6127 if (ret == 0)
6511 6128 mac_tx_update(state->id_mh);
6512 6129 }
6513 6130
6514 6131 /*
6515 6132  * Release the send wqe(s) back into the free list.
6516 6133 */
6517 6134 static void
6518 6135 ibd_release_swqe(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail, int n)
6519 6136 {
6520 6137 /*
6521 6138 * Add back on Tx list for reuse.
6522 6139 */
6523 6140 ASSERT(tail->swqe_next == NULL);
6524 6141 mutex_enter(&state->id_tx_rel_list.dl_mutex);
6525 6142 state->id_tx_rel_list.dl_pending_sends = B_FALSE;
6526 6143 tail->swqe_next = state->id_tx_rel_list.dl_head;
6527 6144 state->id_tx_rel_list.dl_head = SWQE_TO_WQE(head);
6528 6145 state->id_tx_rel_list.dl_cnt += n;
6529 6146 mutex_exit(&state->id_tx_rel_list.dl_mutex);
6530 6147 }
6531 6148
6532 6149 /*
6533 6150  * Acquire a send wqe from the free list.
6534 6151  * Returns the send wqe pointer, or NULL if none is available.
6535 6152 */
6536 6153 static ibd_swqe_t *
6537 6154 ibd_acquire_swqe(ibd_state_t *state)
6538 6155 {
6539 6156 ibd_swqe_t *wqe;
6540 6157
6541 6158 mutex_enter(&state->id_tx_rel_list.dl_mutex);
6542 6159 if (state->id_tx_rel_list.dl_head != NULL) {
6543 6160 /* transfer id_tx_rel_list to id_tx_list */
6544 6161 state->id_tx_list.dl_head =
6545 6162 state->id_tx_rel_list.dl_head;
6546 6163 state->id_tx_list.dl_cnt =
6547 6164 state->id_tx_rel_list.dl_cnt;
6548 6165 state->id_tx_list.dl_pending_sends = B_FALSE;
6549 6166
6550 6167 /* clear id_tx_rel_list */
6551 6168 state->id_tx_rel_list.dl_head = NULL;
6552 6169 state->id_tx_rel_list.dl_cnt = 0;
6553 6170 mutex_exit(&state->id_tx_rel_list.dl_mutex);
6554 6171
6555 6172 wqe = WQE_TO_SWQE(state->id_tx_list.dl_head);
6556 6173 state->id_tx_list.dl_cnt -= 1;
6557 6174 state->id_tx_list.dl_head = wqe->swqe_next;
6558 6175 } else { /* no free swqe */
6559 6176 mutex_exit(&state->id_tx_rel_list.dl_mutex);
6560 6177 state->id_tx_list.dl_pending_sends = B_TRUE;
6561 6178 DPRINT(5, "ibd_acquire_swqe: out of Tx wqe");
6562 6179 state->id_tx_short++;
6563 6180 wqe = NULL;
6564 6181 }
6565 6182 return (wqe);
6566 6183 }
6567 6184
6568 6185 static int
6569 6186 ibd_setup_lso(ibd_swqe_t *node, mblk_t *mp, uint32_t mss,
6570 6187 ibt_ud_dest_hdl_t ud_dest)
6571 6188 {
6572 6189 mblk_t *nmp;
6573 6190 int iph_len, tcph_len;
6574 6191 ibt_wr_lso_t *lso;
6575 6192 uintptr_t ip_start, tcp_start;
6576 6193 uint8_t *dst;
6577 6194 uint_t pending, mblen;
6578 6195
6579 6196 /*
6580 6197 * The code in ibd_send would've set 'wr.ud.udwr_dest' by default;
6581 6198 * we need to adjust it here for lso.
6582 6199 */
6583 6200 lso = &(node->w_swr.wr.ud_lso);
6584 6201 lso->lso_ud_dest = ud_dest;
6585 6202 lso->lso_mss = mss;
6586 6203
6587 6204 /*
6588 6205 * Calculate the LSO header size and set it in the UD LSO structure.
6589 6206 * Note that the only assumption we make is that each of the IPoIB,
6590 6207 * IP and TCP headers will be contained in a single mblk fragment;
6591 6208 * together, the headers may span multiple mblk fragments.
6592 6209 */
6593 6210 nmp = mp;
6594 6211 ip_start = (uintptr_t)(nmp->b_rptr) + IPOIB_HDRSIZE;
6595 6212 if (ip_start >= (uintptr_t)(nmp->b_wptr)) {
6596 6213 ip_start = (uintptr_t)nmp->b_cont->b_rptr
6597 6214 + (ip_start - (uintptr_t)(nmp->b_wptr));
6598 6215 nmp = nmp->b_cont;
6599 6216
6600 6217 }
6601 6218 iph_len = IPH_HDR_LENGTH((ipha_t *)ip_start);
6602 6219
6603 6220 tcp_start = ip_start + iph_len;
6604 6221 if (tcp_start >= (uintptr_t)(nmp->b_wptr)) {
6605 6222 tcp_start = (uintptr_t)nmp->b_cont->b_rptr
6606 6223 + (tcp_start - (uintptr_t)(nmp->b_wptr));
6607 6224 nmp = nmp->b_cont;
6608 6225 }
6609 6226 tcph_len = TCP_HDR_LENGTH((tcph_t *)tcp_start);
6610 6227 lso->lso_hdr_sz = IPOIB_HDRSIZE + iph_len + tcph_len;
6611 6228
6612 6229 /*
6613 6230 * If the lso header fits entirely within a single mblk fragment,
6614 6231 * we'll avoid an additional copy of the lso header here and just
6615 6232 * pass the b_rptr of the mblk directly.
6616 6233 *
6617 6234 * If this isn't true, we'd have to allocate for it explicitly.
6618 6235 */
6619 6236 if (lso->lso_hdr_sz <= MBLKL(mp)) {
6620 6237 lso->lso_hdr = mp->b_rptr;
6621 6238 } else {
6622 6239 /* On work completion, remember to free this allocated hdr */
6623 6240 lso->lso_hdr = kmem_zalloc(lso->lso_hdr_sz, KM_NOSLEEP);
6624 6241 if (lso->lso_hdr == NULL) {
6625 6242 DPRINT(10, "ibd_setup_lso: couldn't allocate lso hdr, "
6626 6243 "sz = %d", lso->lso_hdr_sz);
6627 6244 lso->lso_hdr_sz = 0;
6628 6245 lso->lso_mss = 0;
6629 6246 return (-1);
6630 6247 }
6631 6248 }
6632 6249
6633 6250 /*
6634 6251 * Copy in the lso header only if we need to
6635 6252 */
6636 6253 if (lso->lso_hdr != mp->b_rptr) {
6637 6254 dst = lso->lso_hdr;
6638 6255 pending = lso->lso_hdr_sz;
6639 6256
6640 6257 for (nmp = mp; nmp && pending; nmp = nmp->b_cont) {
6641 6258 mblen = MBLKL(nmp);
6642 6259 if (pending > mblen) {
6643 6260 bcopy(nmp->b_rptr, dst, mblen);
6644 6261 dst += mblen;
6645 6262 pending -= mblen;
6646 6263 } else {
6647 6264 bcopy(nmp->b_rptr, dst, pending);
6648 6265 break;
6649 6266 }
6650 6267 }
6651 6268 }
6652 6269
6653 6270 return (0);
6654 6271 }
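
[Reviewer note, not part of the change] The header-copy loop above has to cope
with an LSO header that straddles mblk fragments. A distilled, standalone
restatement of that gather pattern, using only standard STREAMS accessors
(hypothetical helper name, shown for clarity rather than as a proposed change):

static void
ibd_gather_hdr_sketch(mblk_t *mp, uint8_t *dst, uint_t hdr_sz)
{
	mblk_t *nmp;
	uint_t pending = hdr_sz;
	uint_t mblen;

	/* Walk the b_cont chain until hdr_sz bytes have been copied */
	for (nmp = mp; nmp != NULL && pending > 0; nmp = nmp->b_cont) {
		mblen = MBLKL(nmp);
		if (mblen > pending)
			mblen = pending;
		bcopy(nmp->b_rptr, dst, mblen);
		dst += mblen;
		pending -= mblen;
	}
}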
6655 6272
6656 6273 static void
6657 6274 ibd_free_lsohdr(ibd_swqe_t *node, mblk_t *mp)
6658 6275 {
6659 6276 ibt_wr_lso_t *lso;
6660 6277
6661 6278 if ((!node) || (!mp))
6662 6279 return;
6663 6280
6664 6281 /*
6665 6282 * Free any header space that we might've allocated if we
6666 6283 * did an LSO
6667 6284 */
6668 6285 if (node->w_swr.wr_opcode == IBT_WRC_SEND_LSO) {
6669 6286 lso = &(node->w_swr.wr.ud_lso);
6670 6287 if ((lso->lso_hdr) && (lso->lso_hdr != mp->b_rptr)) {
6671 6288 kmem_free(lso->lso_hdr, lso->lso_hdr_sz);
6672 6289 lso->lso_hdr = NULL;
6673 6290 lso->lso_hdr_sz = 0;
6674 6291 }
6675 6292 }
6676 6293 }
6677 6294
6678 6295 static void
6679 6296 ibd_post_send(ibd_state_t *state, ibd_swqe_t *node)
6680 6297 {
6681 6298 uint_t i;
6682 6299 uint_t num_posted;
6683 6300 uint_t n_wrs;
6684 6301 ibt_status_t ibt_status;
6685 6302 ibt_send_wr_t wrs[IBD_MAX_TX_POST_MULTIPLE];
6686 6303 ibd_swqe_t *tx_head, *elem;
6687 6304 ibd_swqe_t *nodes[IBD_MAX_TX_POST_MULTIPLE];
6688 6305
6689 6306 /* post the one request, then check for more */
6690 6307 ibt_status = ibt_post_send(state->id_chnl_hdl,
6691 6308 &node->w_swr, 1, NULL);
6692 6309 if (ibt_status != IBT_SUCCESS) {
6693 6310 ibd_print_warn(state, "ibd_post_send: "
6694 6311 "posting one wr failed: ret=%d", ibt_status);
6695 6312 ibd_tx_cleanup(state, node);
6696 6313 }
6697 6314
6698 6315 tx_head = NULL;
6699 6316 for (;;) {
6700 6317 if (tx_head == NULL) {
6701 6318 mutex_enter(&state->id_txpost_lock);
6702 6319 tx_head = state->id_tx_head;
6703 6320 if (tx_head == NULL) {
6704 6321 state->id_tx_busy = 0;
6705 6322 mutex_exit(&state->id_txpost_lock);
6706 6323 return;
6707 6324 }
6708 6325 state->id_tx_head = NULL;
6709 6326 mutex_exit(&state->id_txpost_lock);
6710 6327 }
6711 6328
6712 6329 /*
6713 6330 * Collect pending requests, IBD_MAX_TX_POST_MULTIPLE wrs
6714 6331 * at a time if possible, and keep posting them.
6715 6332 */
6716 6333 for (n_wrs = 0, elem = tx_head;
6717 6334 (elem) && (n_wrs < IBD_MAX_TX_POST_MULTIPLE);
6718 6335 elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) {
6719 6336 nodes[n_wrs] = elem;
6720 6337 wrs[n_wrs] = elem->w_swr;
6721 6338 }
6722 6339 tx_head = elem;
6723 6340
6724 6341 ASSERT(n_wrs != 0);
6725 6342
6726 6343 /*
6727 6344 * If posting fails for some reason, we'll never receive
6728 6345 		 * completion notification, so we'll need to clean up. But
6729 6346 * we need to make sure we don't clean up nodes whose
6730 6347 * wrs have been successfully posted. We assume that the
6731 6348 * hca driver returns on the first failure to post and
6732 6349 * therefore the first 'num_posted' entries don't need
6733 6350 * cleanup here.
6734 6351 */
6735 6352 num_posted = 0;
6736 6353 ibt_status = ibt_post_send(state->id_chnl_hdl,
6737 6354 wrs, n_wrs, &num_posted);
6738 6355 if (ibt_status != IBT_SUCCESS) {
6739 6356 ibd_print_warn(state, "ibd_post_send: "
6740 6357 "posting multiple wrs failed: "
6741 6358 "requested=%d, done=%d, ret=%d",
6742 6359 n_wrs, num_posted, ibt_status);
6743 6360
6744 6361 for (i = num_posted; i < n_wrs; i++)
6745 6362 ibd_tx_cleanup(state, nodes[i]);
6746 6363 }
6747 6364 }
6748 6365 }
6749 6366
6750 6367 static int
6751 6368 ibd_prepare_sgl(ibd_state_t *state, mblk_t *mp, ibd_swqe_t *node,
6752 6369 uint_t lsohdr_sz)
6753 6370 {
6754 6371 ibt_wr_ds_t *sgl;
6755 6372 ibt_status_t ibt_status;
6756 6373 mblk_t *nmp;
6757 6374 mblk_t *data_mp;
6758 6375 uchar_t *bufp;
6759 6376 size_t blksize;
6760 6377 size_t skip;
6761 6378 size_t avail;
6762 6379 uint_t pktsize;
6763 6380 uint_t frag_len;
6764 6381 uint_t pending_hdr;
6765 6382 int nmblks;
6766 6383 int i;
6767 6384
6768 6385 /*
6769 6386 * Let's skip ahead to the data if this is LSO
6770 6387 */
6771 6388 data_mp = mp;
6772 6389 pending_hdr = 0;
6773 6390 if (lsohdr_sz) {
6774 6391 pending_hdr = lsohdr_sz;
6775 6392 for (nmp = mp; nmp; nmp = nmp->b_cont) {
6776 6393 frag_len = nmp->b_wptr - nmp->b_rptr;
6777 6394 if (frag_len > pending_hdr)
6778 6395 break;
6779 6396 pending_hdr -= frag_len;
6780 6397 }
6781 6398 data_mp = nmp; /* start of data past lso header */
6782 6399 ASSERT(data_mp != NULL);
6783 6400 }
6784 6401
6785 6402 /*
6786 6403 * Calculate the size of message data and number of msg blocks
6787 6404 */
6788 6405 pktsize = 0;
6789 6406 for (nmblks = 0, nmp = data_mp; nmp != NULL;
6790 6407 nmp = nmp->b_cont, nmblks++) {
6791 6408 pktsize += MBLKL(nmp);
6792 6409 }
6793 6410 pktsize -= pending_hdr;
6794 6411
6795 6412 /*
6796 6413 * We only do ibt_map_mem_iov() if the pktsize is above the
6797 6414 * "copy-threshold", and if the number of mp fragments is less than
6798 6415 * the maximum acceptable.
6799 6416 */
6800 6417 if ((state->id_hca_res_lkey_capab) &&
6801 6418 (pktsize > state->id_ud_tx_copy_thresh) &&
6802 6419 (nmblks < state->id_max_sqseg_hiwm)) {
6803 6420 ibt_iov_t iov_arr[IBD_MAX_SQSEG];
6804 6421 ibt_iov_attr_t iov_attr;
6805 6422
6806 6423 iov_attr.iov_as = NULL;
6807 6424 iov_attr.iov = iov_arr;
6808 6425 iov_attr.iov_buf = NULL;
6809 6426 iov_attr.iov_list_len = nmblks;
6810 6427 iov_attr.iov_wr_nds = state->id_max_sqseg;
6811 6428 iov_attr.iov_lso_hdr_sz = lsohdr_sz;
6812 6429 iov_attr.iov_flags = IBT_IOV_SLEEP;
6813 6430
6814 6431 for (nmp = data_mp, i = 0; i < nmblks; i++, nmp = nmp->b_cont) {
6815 6432 iov_arr[i].iov_addr = (caddr_t)(void *)nmp->b_rptr;
6816 6433 iov_arr[i].iov_len = MBLKL(nmp);
6817 6434 if (i == 0) {
6818 6435 iov_arr[i].iov_addr += pending_hdr;
6819 6436 iov_arr[i].iov_len -= pending_hdr;
6820 6437 }
6821 6438 }
6822 6439
6823 6440 node->w_buftype = IBD_WQE_MAPPED;
6824 6441 node->w_swr.wr_sgl = node->w_sgl;
6825 6442
6826 6443 ibt_status = ibt_map_mem_iov(state->id_hca_hdl, &iov_attr,
6827 6444 (ibt_all_wr_t *)&node->w_swr, &node->w_mi_hdl);
6828 6445 if (ibt_status != IBT_SUCCESS) {
6829 6446 ibd_print_warn(state, "ibd_send: ibt_map_mem_iov "
6830 6447 "failed, nmblks=%d, ret=%d\n", nmblks, ibt_status);
6831 6448 goto ibd_copy_path;
6832 6449 }
6833 6450
6834 6451 return (0);
6835 6452 }
6836 6453
6837 6454 ibd_copy_path:
6838 6455 if (pktsize <= state->id_tx_buf_sz) {
6839 6456 node->swqe_copybuf.ic_sgl.ds_len = pktsize;
6840 6457 node->w_swr.wr_nds = 1;
6841 6458 node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl;
6842 6459 node->w_buftype = IBD_WQE_TXBUF;
6843 6460
6844 6461 /*
6845 6462 * Even though this is the copy path for transfers less than
6846 6463 * id_tx_buf_sz, it could still be an LSO packet. If so, it
6847 6464 * is possible the first data mblk fragment (data_mp) still
6848 6465 * contains part of the LSO header that we need to skip.
6849 6466 */
6850 6467 bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va;
6851 6468 for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) {
6852 6469 blksize = MBLKL(nmp) - pending_hdr;
6853 6470 bcopy(nmp->b_rptr + pending_hdr, bufp, blksize);
6854 6471 bufp += blksize;
6855 6472 pending_hdr = 0;
6856 6473 }
6857 6474
6858 6475 return (0);
6859 6476 }
6860 6477
6861 6478 /*
6862 6479 * Copy path for transfers greater than id_tx_buf_sz
6863 6480 */
6864 6481 node->w_swr.wr_sgl = node->w_sgl;
6865 6482 if (ibd_acquire_lsobufs(state, pktsize,
6866 6483 node->w_swr.wr_sgl, &(node->w_swr.wr_nds)) != 0) {
6867 6484 DPRINT(10, "ibd_prepare_sgl: lso bufs acquire failed");
6868 6485 return (-1);
6869 6486 }
6870 6487 node->w_buftype = IBD_WQE_LSOBUF;
6871 6488
6872 6489 /*
6873 6490 * Copy the larger-than-id_tx_buf_sz packet into a set of
6874 6491 * fixed-sized, pre-mapped LSO buffers. Note that we might
6875 6492 * need to skip part of the LSO header in the first fragment
6876 6493 * as before.
6877 6494 */
6878 6495 nmp = data_mp;
6879 6496 skip = pending_hdr;
6880 6497 for (i = 0; i < node->w_swr.wr_nds; i++) {
6881 6498 sgl = node->w_swr.wr_sgl + i;
6882 6499 bufp = (uchar_t *)(uintptr_t)sgl->ds_va;
6883 6500 avail = IBD_LSO_BUFSZ;
6884 6501 while (nmp && avail) {
6885 6502 blksize = MBLKL(nmp) - skip;
6886 6503 if (blksize > avail) {
6887 6504 bcopy(nmp->b_rptr + skip, bufp, avail);
6888 6505 skip += avail;
6889 6506 avail = 0;
6890 6507 } else {
6891 6508 bcopy(nmp->b_rptr + skip, bufp, blksize);
6892 6509 skip = 0;
6893 6510 avail -= blksize;
6894 6511 bufp += blksize;
6895 6512 nmp = nmp->b_cont;
6896 6513 }
6897 6514 }
6898 6515 }
6899 6516
6900 6517 return (0);
6901 6518 }
6902 6519
6903 6520 /*
6904 6521 * Schedule a completion queue polling to reap the resource we're
6905 6522 * short on. If we implement the change to reap tx completions
6906 6523 * in a separate thread, we'll need to wake up that thread here.
6907 6524 */
6908 6525 static int
6909 6526 ibd_sched_poll(ibd_state_t *state, int resource_type, int q_flag)
6910 6527 {
6911 6528 ibd_req_t *req;
6912 6529
6913 6530 mutex_enter(&state->id_sched_lock);
6914 6531 state->id_sched_needed |= resource_type;
6915 6532 mutex_exit(&state->id_sched_lock);
6916 6533
6917 6534 /*
6918 6535 * If we are asked to queue a work entry, we need to do it
6919 6536 */
6920 6537 if (q_flag) {
6921 6538 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
6922 6539 if (req == NULL)
6923 6540 return (-1);
6924 6541
6925 6542 ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED);
6926 6543 }
6927 6544
6928 6545 return (0);
6929 6546 }
6930 6547
6931 6548 /*
6932 6549 * The passed in packet has this format:
6933 6550 * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data
6934 6551 */
6935 6552 static boolean_t
6936 6553 ibd_send(ibd_state_t *state, mblk_t *mp)
6937 6554 {
6938 6555 ibd_ace_t *ace;
6939 6556 ibd_swqe_t *node;
6940 6557 ipoib_mac_t *dest;
6941 6558 ib_header_info_t *ipibp;
6942 6559 ip6_t *ip6h;
6943 6560 uint_t pktsize;
6944 6561 uint32_t mss;
6945 6562 uint32_t hckflags;
6946 6563 uint32_t lsoflags = 0;
6947 6564 uint_t lsohdr_sz = 0;
6948 6565 int ret, len;
6949 6566 boolean_t dofree = B_FALSE;
6950 6567 boolean_t rc;
6951 6568 /* if (rc_chan == NULL) send by UD; else send by RC; */
6952 6569 ibd_rc_chan_t *rc_chan;
6953 6570 int nmblks;
6954 6571 mblk_t *nmp;
6955 6572
6956 6573 /*
6957 6574 * If we aren't done with the device initialization and start,
6958 6575 * we shouldn't be here.
6959 6576 */
6960 6577 if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
6961 6578 return (B_FALSE);
6962 6579
6963 6580 /*
6964 6581 * Obtain an address handle for the destination.
6965 6582 */
6966 6583 ipibp = (ib_header_info_t *)mp->b_rptr;
6967 6584 dest = (ipoib_mac_t *)&ipibp->ib_dst;
6968 6585 if ((ntohl(dest->ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
6969 6586 IBD_FILL_SCOPE_PKEY(dest, state->id_scope, state->id_pkey);
6970 6587
6971 6588 rc_chan = NULL;
6972 6589 ace = ibd_acache_lookup(state, dest, &ret, 1);
6973 6590 if (state->id_enable_rc && (ace != NULL) &&
6974 6591 (ace->ac_mac.ipoib_qpn != htonl(IB_MC_QPN))) {
6975 6592 if (ace->ac_chan == NULL) {
6976 6593 state->rc_null_conn++;
6977 6594 } else {
6978 6595 if (ace->ac_chan->chan_state ==
6979 6596 IBD_RC_STATE_ACT_ESTAB) {
6980 6597 rc_chan = ace->ac_chan;
6981 6598 rc_chan->is_used = B_TRUE;
6982 6599 mutex_enter(&rc_chan->tx_wqe_list.dl_mutex);
6983 6600 node = WQE_TO_SWQE(
6984 6601 rc_chan->tx_wqe_list.dl_head);
6985 6602 if (node != NULL) {
6986 6603 rc_chan->tx_wqe_list.dl_cnt -= 1;
6987 6604 rc_chan->tx_wqe_list.dl_head =
6988 6605 node->swqe_next;
6989 6606 } else {
6990 6607 node = ibd_rc_acquire_swqes(rc_chan);
6991 6608 }
6992 6609 mutex_exit(&rc_chan->tx_wqe_list.dl_mutex);
6993 6610
6994 6611 if (node == NULL) {
6995 6612 state->rc_swqe_short++;
6996 6613 mutex_enter(&state->id_sched_lock);
6997 6614 state->id_sched_needed |=
6998 6615 IBD_RSRC_RC_SWQE;
6999 6616 mutex_exit(&state->id_sched_lock);
7000 6617 ibd_dec_ref_ace(state, ace);
7001 6618 return (B_FALSE);
7002 6619 }
7003 6620 } else {
7004 6621 state->rc_no_estab_conn++;
7005 6622 }
7006 6623 }
7007 6624 }
7008 6625
7009 6626 if (rc_chan == NULL) {
7010 6627 mutex_enter(&state->id_tx_list.dl_mutex);
7011 6628 node = WQE_TO_SWQE(state->id_tx_list.dl_head);
7012 6629 if (node != NULL) {
7013 6630 state->id_tx_list.dl_cnt -= 1;
7014 6631 state->id_tx_list.dl_head = node->swqe_next;
7015 6632 } else {
7016 6633 node = ibd_acquire_swqe(state);
7017 6634 }
7018 6635 mutex_exit(&state->id_tx_list.dl_mutex);
7019 6636 if (node == NULL) {
7020 6637 /*
7021 6638 * If we don't have an swqe available, schedule a
7022 6639 * transmit completion queue cleanup and hold off on
7023 6640 * sending more packets until we have some free swqes
7024 6641 */
7025 6642 if (ibd_sched_poll(state, IBD_RSRC_SWQE, 0) == 0) {
7026 6643 if (ace != NULL) {
7027 6644 ibd_dec_ref_ace(state, ace);
7028 6645 }
7029 6646 return (B_FALSE);
7030 6647 }
7031 6648
7032 6649 /*
7033 6650 * If a poll cannot be scheduled, we have no choice but
7034 6651 * to drop this packet
7035 6652 */
7036 6653 ibd_print_warn(state, "ibd_send: no swqe, pkt drop");
7037 6654 if (ace != NULL) {
7038 6655 ibd_dec_ref_ace(state, ace);
7039 6656 }
7040 6657 return (B_TRUE);
7041 6658 }
7042 6659 }
7043 6660
7044 6661 /*
7045 6662 * Initialize the commonly used fields in swqe to NULL to protect
7046 6663 * against ibd_tx_cleanup accidentally misinterpreting these on a
7047 6664 * failure.
7048 6665 */
7049 6666 node->swqe_im_mblk = NULL;
7050 6667 node->w_swr.wr_nds = 0;
7051 6668 node->w_swr.wr_sgl = NULL;
7052 6669 node->w_swr.wr_opcode = IBT_WRC_SEND;
7053 6670
7054 6671 /*
7055 6672 * Calculate the size of message data and number of msg blocks
7056 6673 */
7057 6674 pktsize = 0;
7058 6675 for (nmblks = 0, nmp = mp; nmp != NULL;
7059 6676 nmp = nmp->b_cont, nmblks++) {
7060 6677 pktsize += MBLKL(nmp);
7061 6678 }
7062 6679
7063 6680 if (bcmp(&ipibp->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
7064 6681 atomic_inc_64(&state->id_brd_xmt);
7065 6682 else if ((ntohl(ipibp->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
7066 6683 atomic_inc_64(&state->id_multi_xmt);
7067 6684
7068 6685 if (ace != NULL) {
7069 6686 node->w_ahandle = ace;
7070 6687 node->w_swr.wr.ud.udwr_dest = ace->ac_dest;
7071 6688 } else {
7072 6689 DPRINT(5,
7073 6690 "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X",
7074 6691 ((ret == EFAULT) ? "failed" : "queued"),
7075 6692 htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]),
7076 6693 htonl(dest->ipoib_gidpref[1]),
7077 6694 htonl(dest->ipoib_gidsuff[0]),
7078 6695 htonl(dest->ipoib_gidsuff[1]));
7079 6696 state->rc_ace_not_found++;
7080 6697 node->w_ahandle = NULL;
7081 6698
7082 6699 /*
7083 6700 * Here if ibd_acache_lookup() returns EFAULT, it means ibd
7084 6701 		 * cannot find a path for the specific dest address. We
7085 6702 * should get rid of this kind of packet. We also should get
7086 6703 * rid of the packet if we cannot schedule a poll via the
7087 6704 * async thread. For the normal case, ibd will return the
7088 6705 		 * packet to the upper layer and wait for AH creation.
7089 6706 *
7090 6707 * Note that we always queue a work slot entry for the async
7091 6708 * thread when we fail AH lookup (even in intr mode); this is
7092 6709 * due to the convoluted way the code currently looks for AH.
7093 6710 */
7094 6711 if (ret == EFAULT) {
7095 6712 dofree = B_TRUE;
7096 6713 rc = B_TRUE;
7097 6714 } else if (ibd_sched_poll(state, IBD_RSRC_SWQE, 1) != 0) {
7098 6715 dofree = B_TRUE;
7099 6716 rc = B_TRUE;
7100 6717 } else {
7101 6718 dofree = B_FALSE;
7102 6719 rc = B_FALSE;
7103 6720 }
7104 6721 goto ibd_send_fail;
7105 6722 }
7106 6723
7107 6724 /*
7108 6725 * For ND6 packets, padding is at the front of the source lladdr.
7109 6726 * Insert the padding at front.
7110 6727 */
7111 6728 if (ntohs(ipibp->ipib_rhdr.ipoib_type) == ETHERTYPE_IPV6) {
7112 6729 if (MBLKL(mp) < sizeof (ib_header_info_t) + IPV6_HDR_LEN) {
7113 6730 if (!pullupmsg(mp, IPV6_HDR_LEN +
7114 6731 sizeof (ib_header_info_t))) {
7115 6732 DPRINT(10, "ibd_send: pullupmsg failure ");
7116 6733 dofree = B_TRUE;
7117 6734 rc = B_TRUE;
7118 6735 goto ibd_send_fail;
7119 6736 }
7120 6737 ipibp = (ib_header_info_t *)mp->b_rptr;
7121 6738 }
7122 6739 ip6h = (ip6_t *)((uchar_t *)ipibp +
7123 6740 sizeof (ib_header_info_t));
7124 6741 len = ntohs(ip6h->ip6_plen);
7125 6742 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
7126 6743 mblk_t *pad;
7127 6744
7128 6745 			if ((pad = allocb(4, 0)) == NULL) {
				DPRINT(10, "ibd_send: allocb failure ");
				dofree = B_TRUE;
				rc = B_TRUE;
				goto ibd_send_fail;
			}
7129 6746 			pad->b_wptr = (uchar_t *)pad->b_rptr + 4;
7130 6747 			linkb(mp, pad);
7131 6748 if (MBLKL(mp) < sizeof (ib_header_info_t) +
7132 6749 IPV6_HDR_LEN + len + 4) {
7133 6750 if (!pullupmsg(mp, sizeof (ib_header_info_t) +
7134 6751 IPV6_HDR_LEN + len + 4)) {
7135 6752 DPRINT(10, "ibd_send: pullupmsg "
7136 6753 "failure ");
7137 6754 dofree = B_TRUE;
7138 6755 rc = B_TRUE;
7139 6756 goto ibd_send_fail;
7140 6757 }
7141 6758 ip6h = (ip6_t *)((uchar_t *)mp->b_rptr +
7142 6759 sizeof (ib_header_info_t));
7143 6760 }
7144 6761
7145 6762 /* LINTED: E_CONSTANT_CONDITION */
7146 6763 IBD_PAD_NSNA(ip6h, len, IBD_SEND);
7147 6764 }
7148 6765 }
7149 6766
7150 6767 ASSERT(mp->b_wptr - mp->b_rptr >= sizeof (ib_addrs_t));
7151 6768 mp->b_rptr += sizeof (ib_addrs_t);
7152 6769 pktsize -= sizeof (ib_addrs_t);
7153 6770
7154 6771 if (rc_chan) { /* send in RC mode */
7155 6772 ibt_iov_t iov_arr[IBD_MAX_SQSEG];
7156 6773 ibt_iov_attr_t iov_attr;
7157 6774 uint_t i;
7158 6775 size_t blksize;
7159 6776 uchar_t *bufp;
7160 6777 ibd_rc_tx_largebuf_t *lbufp;
7161 6778
7162 6779 atomic_add_64(&state->rc_xmt_bytes, pktsize);
7163 6780
7164 6781 /*
7165 6782 		 * The upper layer does the Tx checksum; we don't need to do
7166 6783 		 * any checksum here.
7167 6784 */
7168 6785 ASSERT(node->w_swr.wr_trans == IBT_RC_SRV);
7169 6786
7170 6787 /*
7171 6788 * We only do ibt_map_mem_iov() if the pktsize is above
7172 6789 * the "copy-threshold", and if the number of mp
7173 6790 * fragments is less than the maximum acceptable.
7174 6791 */
7175 6792 if (pktsize <= state->id_rc_tx_copy_thresh) {
7176 6793 atomic_inc_64(&state->rc_xmt_small_pkt);
7177 6794 /*
7178 6795 * Only process unicast packet in Reliable Connected
7179 6796 * mode.
7180 6797 */
7181 6798 node->swqe_copybuf.ic_sgl.ds_len = pktsize;
7182 6799 node->w_swr.wr_nds = 1;
7183 6800 node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl;
7184 6801 node->w_buftype = IBD_WQE_TXBUF;
7185 6802
7186 6803 bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va;
7187 6804 for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
7188 6805 blksize = MBLKL(nmp);
7189 6806 bcopy(nmp->b_rptr, bufp, blksize);
7190 6807 bufp += blksize;
7191 6808 }
7192 6809 freemsg(mp);
7193 6810 ASSERT(node->swqe_im_mblk == NULL);
7194 6811 } else {
7195 6812 if ((state->rc_enable_iov_map) &&
7196 6813 (nmblks < state->rc_max_sqseg_hiwm)) {
7197 6814
7198 6815 /* do ibt_map_mem_iov() */
7199 6816 iov_attr.iov_as = NULL;
7200 6817 iov_attr.iov = iov_arr;
7201 6818 iov_attr.iov_buf = NULL;
7202 6819 iov_attr.iov_wr_nds = state->rc_tx_max_sqseg;
7203 6820 iov_attr.iov_lso_hdr_sz = 0;
7204 6821 iov_attr.iov_flags = IBT_IOV_SLEEP;
7205 6822
7206 6823 i = 0;
7207 6824 for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
7208 6825 iov_arr[i].iov_len = MBLKL(nmp);
7209 6826 if (iov_arr[i].iov_len != 0) {
7210 6827 iov_arr[i].iov_addr = (caddr_t)
7211 6828 (void *)nmp->b_rptr;
7212 6829 i++;
7213 6830 }
7214 6831 }
7215 6832 iov_attr.iov_list_len = i;
7216 6833 node->w_swr.wr_sgl = node->w_sgl;
7217 6834
7218 6835 ret = ibt_map_mem_iov(state->id_hca_hdl,
7219 6836 &iov_attr, (ibt_all_wr_t *)&node->w_swr,
7220 6837 &node->w_mi_hdl);
7221 6838 if (ret != IBT_SUCCESS) {
7222 6839 atomic_inc_64(
7223 6840 &state->rc_xmt_map_fail_pkt);
7224 6841 DPRINT(30, "ibd_send: ibt_map_mem_iov("
7225 6842 ") failed, nmblks=%d, real_nmblks"
7226 6843 "=%d, ret=0x%x", nmblks, i, ret);
7227 6844 goto ibd_rc_large_copy;
7228 6845 }
7229 6846
7230 6847 atomic_inc_64(&state->rc_xmt_map_succ_pkt);
7231 6848 node->w_buftype = IBD_WQE_MAPPED;
7232 6849 node->swqe_im_mblk = mp;
7233 6850 } else {
7234 6851 atomic_inc_64(&state->rc_xmt_fragmented_pkt);
7235 6852 ibd_rc_large_copy:
7236 6853 mutex_enter(&state->rc_tx_large_bufs_lock);
7237 6854 if (state->rc_tx_largebuf_nfree == 0) {
7238 6855 state->rc_xmt_buf_short++;
7239 6856 mutex_exit
7240 6857 (&state->rc_tx_large_bufs_lock);
7241 6858 mutex_enter(&state->id_sched_lock);
7242 6859 state->id_sched_needed |=
7243 6860 IBD_RSRC_RC_TX_LARGEBUF;
7244 6861 mutex_exit(&state->id_sched_lock);
7245 6862 dofree = B_FALSE;
7246 6863 rc = B_FALSE;
7247 6864 /*
7248 6865 * If we don't have Tx large bufs,
7249 6866 * return failure. node->w_buftype
7250 6867 * should not be IBD_WQE_RC_COPYBUF,
7251 6868 				 * otherwise it will cause problems
7252 6869 * in ibd_rc_tx_cleanup()
7253 6870 */
7254 6871 node->w_buftype = IBD_WQE_TXBUF;
7255 6872 goto ibd_send_fail;
7256 6873 }
7257 6874
7258 6875 lbufp = state->rc_tx_largebuf_free_head;
7259 6876 ASSERT(lbufp->lb_buf != NULL);
7260 6877 state->rc_tx_largebuf_free_head =
7261 6878 lbufp->lb_next;
7262 6879 lbufp->lb_next = NULL;
7263 6880 /* Update nfree count */
7264 6881 state->rc_tx_largebuf_nfree --;
7265 6882 mutex_exit(&state->rc_tx_large_bufs_lock);
7266 6883 bufp = lbufp->lb_buf;
7267 6884 node->w_sgl[0].ds_va =
7268 6885 (ib_vaddr_t)(uintptr_t)bufp;
7269 6886 node->w_sgl[0].ds_key =
7270 6887 state->rc_tx_mr_desc.md_lkey;
7271 6888 node->w_sgl[0].ds_len = pktsize;
7272 6889 node->w_swr.wr_sgl = node->w_sgl;
7273 6890 node->w_swr.wr_nds = 1;
7274 6891 node->w_buftype = IBD_WQE_RC_COPYBUF;
7275 6892 node->w_rc_tx_largebuf = lbufp;
7276 6893
7277 6894 for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
7278 6895 blksize = MBLKL(nmp);
7279 6896 if (blksize != 0) {
7280 6897 bcopy(nmp->b_rptr, bufp,
7281 6898 blksize);
7282 6899 bufp += blksize;
7283 6900 }
7284 6901 }
7285 6902 freemsg(mp);
7286 6903 ASSERT(node->swqe_im_mblk == NULL);
7287 6904 }
7288 6905 }
7289 6906
7290 6907 node->swqe_next = NULL;
7291 6908 mutex_enter(&rc_chan->tx_post_lock);
7292 6909 if (rc_chan->tx_busy) {
7293 6910 if (rc_chan->tx_head) {
7294 6911 rc_chan->tx_tail->swqe_next =
7295 6912 SWQE_TO_WQE(node);
7296 6913 } else {
7297 6914 rc_chan->tx_head = node;
7298 6915 }
7299 6916 rc_chan->tx_tail = node;
7300 6917 mutex_exit(&rc_chan->tx_post_lock);
7301 6918 } else {
7302 6919 rc_chan->tx_busy = 1;
7303 6920 mutex_exit(&rc_chan->tx_post_lock);
7304 6921 ibd_rc_post_send(rc_chan, node);
7305 6922 }
7306 6923
7307 6924 return (B_TRUE);
7308 6925 } /* send by RC */
7309 6926
7310 6927 if ((state->id_enable_rc) && (pktsize > state->id_mtu)) {
7311 6928 /*
7312 6929 		 * Packet is too long. The packet size from GLD should be
7313 6930 		 * <= state->id_mtu + sizeof (ib_addrs_t).
7314 6931 */
7315 6932 if (ace->ac_mac.ipoib_qpn != htonl(IB_MC_QPN)) {
7316 6933 ibd_req_t *req;
7317 6934
7318 6935 mutex_enter(&ace->tx_too_big_mutex);
7319 6936 if (ace->tx_too_big_ongoing) {
7320 6937 mutex_exit(&ace->tx_too_big_mutex);
7321 6938 state->rc_xmt_reenter_too_long_pkt++;
7322 6939 dofree = B_TRUE;
7323 6940 } else {
7324 6941 ace->tx_too_big_ongoing = B_TRUE;
7325 6942 mutex_exit(&ace->tx_too_big_mutex);
7326 6943 state->rc_xmt_icmp_too_long_pkt++;
7327 6944
7328 6945 req = kmem_cache_alloc(state->id_req_kmc,
7329 6946 KM_NOSLEEP);
7330 6947 if (req == NULL) {
7331 6948 ibd_print_warn(state, "ibd_send: alloc "
7332 6949 "ibd_req_t fail");
7333 6950 /* Drop it. */
7334 6951 dofree = B_TRUE;
7335 6952 } else {
7336 6953 req->rq_ptr = mp;
7337 6954 req->rq_ptr2 = ace;
7338 6955 ibd_queue_work_slot(state, req,
7339 6956 IBD_ASYNC_RC_TOO_BIG);
7340 6957 dofree = B_FALSE;
7341 6958 }
7342 6959 }
7343 6960 } else {
7344 6961 			ibd_print_warn(state, "Reliable Connected mode is on. "
7345 6962 			    "Multicast packet length %d > %d is too long to "
7346 6963 			    "send, drop it",
7347 6964 			    pktsize, state->id_mtu);
7348 6965 state->rc_xmt_drop_too_long_pkt++;
7349 6966 /* Drop it. */
7350 6967 dofree = B_TRUE;
7351 6968 }
7352 6969 rc = B_TRUE;
7353 6970 goto ibd_send_fail;
7354 6971 }
7355 6972
7356 6973 atomic_add_64(&state->id_xmt_bytes, pktsize);
7357 6974 atomic_inc_64(&state->id_xmt_pkt);
7358 6975
7359 6976 /*
7360 6977 	 * Do LSO and checksum related work here. For an LSO send, adjust
7361 6978 	 * the ud destination, the opcode and the LSO header information
7362 6979 	 * in the work request.
7363 6980 */
7364 6981 mac_lso_get(mp, &mss, &lsoflags);
7365 6982 if ((lsoflags & HW_LSO) != HW_LSO) {
7366 6983 node->w_swr.wr_opcode = IBT_WRC_SEND;
7367 6984 lsohdr_sz = 0;
7368 6985 } else {
7369 6986 if (ibd_setup_lso(node, mp, mss, ace->ac_dest) != 0) {
7370 6987 /*
7371 6988 * The routine can only fail if there's no memory; we
7372 6989 * can only drop the packet if this happens
7373 6990 */
7374 6991 ibd_print_warn(state,
7375 6992 "ibd_send: no memory, lso posting failed");
7376 6993 dofree = B_TRUE;
7377 6994 rc = B_TRUE;
7378 6995 goto ibd_send_fail;
7379 6996 }
7380 6997
7381 6998 node->w_swr.wr_opcode = IBT_WRC_SEND_LSO;
7382 6999 lsohdr_sz = (node->w_swr.wr.ud_lso).lso_hdr_sz;
7383 7000 }
7384 7001
7385 7002 mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &hckflags);
7386 7003 if ((hckflags & HCK_FULLCKSUM) == HCK_FULLCKSUM)
7387 7004 node->w_swr.wr_flags |= IBT_WR_SEND_CKSUM;
7388 7005 else
7389 7006 node->w_swr.wr_flags &= ~IBT_WR_SEND_CKSUM;
7390 7007
7391 7008 /*
7392 7009 * Prepare the sgl for posting; the routine can only fail if there's
7393 7010 * no lso buf available for posting. If this is the case, we should
7394 7011 * probably resched for lso bufs to become available and then try again.
7395 7012 */
7396 7013 if (ibd_prepare_sgl(state, mp, node, lsohdr_sz) != 0) {
7397 7014 if (ibd_sched_poll(state, IBD_RSRC_LSOBUF, 1) != 0) {
7398 7015 dofree = B_TRUE;
7399 7016 rc = B_TRUE;
7400 7017 } else {
7401 7018 dofree = B_FALSE;
7402 7019 rc = B_FALSE;
7403 7020 }
7404 7021 goto ibd_send_fail;
7405 7022 }
7406 7023 node->swqe_im_mblk = mp;
7407 7024
7408 7025 /*
7409 7026 * Queue the wqe to hardware; since we can now simply queue a
7410 7027 * post instead of doing it serially, we cannot assume anything
7411 7028 * about the 'node' after ibd_post_send() returns.
7412 7029 */
7413 7030 node->swqe_next = NULL;
7414 7031
7415 7032 mutex_enter(&state->id_txpost_lock);
7416 7033 if (state->id_tx_busy) {
7417 7034 if (state->id_tx_head) {
7418 7035 state->id_tx_tail->swqe_next =
7419 7036 SWQE_TO_WQE(node);
7420 7037 } else {
7421 7038 state->id_tx_head = node;
7422 7039 }
7423 7040 state->id_tx_tail = node;
7424 7041 mutex_exit(&state->id_txpost_lock);
7425 7042 } else {
7426 7043 state->id_tx_busy = 1;
7427 7044 mutex_exit(&state->id_txpost_lock);
7428 7045 ibd_post_send(state, node);
7429 7046 }
7430 7047
7431 7048 return (B_TRUE);
7432 7049
7433 7050 ibd_send_fail:
7434 7051 if (node && mp)
7435 7052 ibd_free_lsohdr(node, mp);
7436 7053
7437 7054 if (dofree)
7438 7055 freemsg(mp);
7439 7056
7440 7057 if (node != NULL) {
7441 7058 if (rc_chan) {
7442 7059 ibd_rc_tx_cleanup(node);
7443 7060 } else {
7444 7061 ibd_tx_cleanup(state, node);
7445 7062 }
7446 7063 }
7447 7064
7448 7065 return (rc);
7449 7066 }
7450 7067
7451 7068 /*
7452 7069 * GLDv3 entry point for transmitting datagram.
7453 7070 */
7454 7071 static mblk_t *
7455 7072 ibd_m_tx(void *arg, mblk_t *mp)
7456 7073 {
7457 7074 ibd_state_t *state = (ibd_state_t *)arg;
7458 7075 mblk_t *next;
7459 7076
7460 7077 if (state->id_type == IBD_PORT_DRIVER) {
7461 7078 freemsgchain(mp);
7462 7079 return (NULL);
7463 7080 }
7464 7081
7465 7082 if ((state->id_link_state != LINK_STATE_UP) ||
7466 7083 !(state->id_mac_state & IBD_DRV_STARTED)) {
7467 7084 freemsgchain(mp);
7468 7085 mp = NULL;
7469 7086 }
7470 7087
7471 7088 while (mp != NULL) {
7472 7089 next = mp->b_next;
7473 7090 mp->b_next = NULL;
7474 7091 if (ibd_send(state, mp) == B_FALSE) {
7475 7092 /* Send fail */
7476 7093 mp->b_next = next;
7477 7094 break;
7478 7095 }
7479 7096 mp = next;
7480 7097 }
7481 7098
7482 7099 return (mp);
7483 7100 }
7484 7101
7485 7102 /*
7486 7103  * This handles Tx and Rx completions. With separate CQs, this handles
7487 7104 * only Rx completions.
7488 7105 */
7489 7106 static uint_t
7490 7107 ibd_intr(caddr_t arg)
7491 7108 {
7492 7109 ibd_state_t *state = (ibd_state_t *)arg;
7493 7110
7494 7111 ibd_poll_rcq(state, state->id_rcq_hdl);
7495 7112
7496 7113 return (DDI_INTR_CLAIMED);
7497 7114 }
7498 7115
7499 7116 /*
7500 7117 * Poll and fully drain the send cq
7501 7118 */
7502 7119 static void
7503 7120 ibd_drain_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
7504 7121 {
7505 7122 ibt_wc_t *wcs = state->id_txwcs;
7506 7123 uint_t numwcs = state->id_txwcs_size;
7507 7124 ibd_wqe_t *wqe;
7508 7125 ibd_swqe_t *head, *tail;
7509 7126 ibt_wc_t *wc;
7510 7127 uint_t num_polled;
7511 7128 int i;
7512 7129
7513 7130 while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) {
7514 7131 head = tail = NULL;
7515 7132 for (i = 0, wc = wcs; i < num_polled; i++, wc++) {
7516 7133 wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
7517 7134 if (wc->wc_status != IBT_WC_SUCCESS) {
7518 7135 /*
7519 7136 * Channel being torn down.
7520 7137 */
7521 7138 if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
7522 7139 DPRINT(5, "ibd_drain_scq: flush error");
7523 7140 DPRINT(10, "ibd_drain_scq: Bad "
7524 7141 "status %d", wc->wc_status);
7525 7142 } else {
7526 7143 DPRINT(10, "ibd_drain_scq: "
7527 7144 "unexpected wc_status %d",
7528 7145 wc->wc_status);
7529 7146 }
7530 7147 /*
7531 7148 * Fallthrough to invoke the Tx handler to
7532 7149 * release held resources, e.g., AH refcount.
7533 7150 */
7534 7151 }
7535 7152 /*
7536 7153 * Add this swqe to the list to be cleaned up.
7537 7154 */
7538 7155 if (head)
7539 7156 tail->swqe_next = wqe;
7540 7157 else
7541 7158 head = WQE_TO_SWQE(wqe);
7542 7159 tail = WQE_TO_SWQE(wqe);
7543 7160 }
7544 7161 tail->swqe_next = NULL;
7545 7162 ibd_tx_cleanup_list(state, head, tail);
7546 7163
7547 7164 /*
7548 7165 * Resume any blocked transmissions if possible
7549 7166 */
7550 7167 ibd_resume_transmission(state);
7551 7168 }
7552 7169 }
7553 7170
7554 7171 /*
7555 7172 * Poll and fully drain the receive cq
7556 7173 */
7557 7174 static void
7558 7175 ibd_drain_rcq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
7559 7176 {
7560 7177 ibt_wc_t *wcs = state->id_rxwcs;
7561 7178 uint_t numwcs = state->id_rxwcs_size;
7562 7179 ibd_rwqe_t *rwqe;
7563 7180 ibt_wc_t *wc;
7564 7181 uint_t num_polled;
7565 7182 int i;
7566 7183 mblk_t *head, *tail, *mp;
7567 7184
7568 7185 while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) {
7569 7186 head = tail = NULL;
7570 7187 for (i = 0, wc = wcs; i < num_polled; i++, wc++) {
7571 7188 rwqe = (ibd_rwqe_t *)(uintptr_t)wc->wc_id;
7572 7189 if (wc->wc_status != IBT_WC_SUCCESS) {
7573 7190 /*
7574 7191 * Channel being torn down.
7575 7192 */
7576 7193 if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
7577 7194 DPRINT(5, "ibd_drain_rcq: "
7578 7195 "expected flushed rwqe");
7579 7196 } else {
7580 7197 DPRINT(5, "ibd_drain_rcq: "
7581 7198 "unexpected wc_status %d",
7582 7199 wc->wc_status);
7583 7200 }
7584 7201 atomic_inc_32(
7585 7202 &state->id_rx_list.dl_bufs_outstanding);
7586 7203 freemsg(rwqe->rwqe_im_mblk);
7587 7204 continue;
7588 7205 }
7589 7206 mp = ibd_process_rx(state, rwqe, wc);
7590 7207 if (mp == NULL)
7591 7208 continue;
7592 7209
7593 7210 /*
7594 7211 * Add this mp to the list to send to the nw layer.
7595 7212 */
7596 7213 if (head)
7597 7214 tail->b_next = mp;
7598 7215 else
7599 7216 head = mp;
7600 7217 tail = mp;
7601 7218 }
7602 7219 if (head)
7603 7220 mac_rx(state->id_mh, state->id_rh, head);
7604 7221
7605 7222 /*
7606 7223 * Account for #rwqes polled.
7607 7224 * Post more here, if less than one fourth full.
7608 7225 */
7609 7226 if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, -num_polled) <
7610 7227 (state->id_ud_num_rwqe / 4))
7611 7228 ibd_post_recv_intr(state);
7612 7229 }
7613 7230 }
7614 7231
7615 7232 /*
7616 7233 * Common code for interrupt handling as well as for polling
7617 7234 * for all completed wqe's while detaching.
7618 7235 */
7619 7236 static void
7620 7237 ibd_poll_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
7621 7238 {
7622 7239 int flag, redo_flag;
7623 7240 int redo = 1;
7624 7241
7625 7242 flag = IBD_CQ_POLLING;
7626 7243 redo_flag = IBD_REDO_CQ_POLLING;
7627 7244
7628 7245 mutex_enter(&state->id_scq_poll_lock);
7629 7246 if (state->id_scq_poll_busy & flag) {
7630 7247 ibd_print_warn(state, "ibd_poll_scq: multiple polling threads");
7631 7248 state->id_scq_poll_busy |= redo_flag;
7632 7249 mutex_exit(&state->id_scq_poll_lock);
7633 7250 return;
7634 7251 }
7635 7252 state->id_scq_poll_busy |= flag;
7636 7253 mutex_exit(&state->id_scq_poll_lock);
7637 7254
7638 7255 /*
7639 7256 * In some cases (eg detaching), this code can be invoked on
7640 7257 * any cpu after disabling cq notification (thus no concurrency
7641 7258 * exists). Apart from that, the following applies normally:
7642 7259 * Transmit completion handling could be from any cpu if
7643 7260 * Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ
7644 7261 * is interrupt driven.
7645 7262 */
7646 7263
7647 7264 /*
7648 7265 * Poll and drain the CQ
7649 7266 */
7650 7267 ibd_drain_scq(state, cq_hdl);
7651 7268
7652 7269 /*
7653 7270 * Enable CQ notifications and redrain the cq to catch any
7654 7271 * completions we might have missed after the ibd_drain_scq()
7655 7272 * above and before the ibt_enable_cq_notify() that follows.
7656 7273 * Finally, service any new requests to poll the cq that
7657 7274 * could've come in after the ibt_enable_cq_notify().
7658 7275 */
7659 7276 do {
7660 7277 if (ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION) !=
7661 7278 IBT_SUCCESS) {
7662 7279 DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed");
7663 7280 }
7664 7281
7665 7282 ibd_drain_scq(state, cq_hdl);
7666 7283
7667 7284 mutex_enter(&state->id_scq_poll_lock);
7668 7285 if (state->id_scq_poll_busy & redo_flag)
7669 7286 state->id_scq_poll_busy &= ~redo_flag;
7670 7287 else {
7671 7288 state->id_scq_poll_busy &= ~flag;
7672 7289 redo = 0;
7673 7290 }
7674 7291 mutex_exit(&state->id_scq_poll_lock);
7675 7292
7676 7293 } while (redo);
7677 7294 }
7678 7295
7679 7296 /*
7680 7297 * Common code for interrupt handling as well as for polling
7681 7298 * for all completed wqe's while detaching.
7682 7299 */
7683 7300 static void
7684 7301 ibd_poll_rcq(ibd_state_t *state, ibt_cq_hdl_t rcq)
7685 7302 {
7686 7303 int flag, redo_flag;
7687 7304 int redo = 1;
7688 7305
7689 7306 flag = IBD_CQ_POLLING;
7690 7307 redo_flag = IBD_REDO_CQ_POLLING;
7691 7308
7692 7309 mutex_enter(&state->id_rcq_poll_lock);
7693 7310 if (state->id_rcq_poll_busy & flag) {
7694 7311 ibd_print_warn(state, "ibd_poll_rcq: multiple polling threads");
7695 7312 state->id_rcq_poll_busy |= redo_flag;
7696 7313 mutex_exit(&state->id_rcq_poll_lock);
7697 7314 return;
7698 7315 }
7699 7316 state->id_rcq_poll_busy |= flag;
7700 7317 mutex_exit(&state->id_rcq_poll_lock);
7701 7318
7702 7319 /*
7703 7320 * Poll and drain the CQ
7704 7321 */
7705 7322 ibd_drain_rcq(state, rcq);
7706 7323
7707 7324 /*
7708 7325 * Enable CQ notifications and redrain the cq to catch any
7709 7326 	 * completions we might have missed after the ibd_drain_rcq()
7710 7327 * above and before the ibt_enable_cq_notify() that follows.
7711 7328 * Finally, service any new requests to poll the cq that
7712 7329 * could've come in after the ibt_enable_cq_notify().
7713 7330 */
7714 7331 do {
7715 7332 if (ibt_enable_cq_notify(rcq, IBT_NEXT_COMPLETION) !=
7716 7333 IBT_SUCCESS) {
7717 7334 DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed");
7718 7335 }
7719 7336
7720 7337 ibd_drain_rcq(state, rcq);
7721 7338
7722 7339 mutex_enter(&state->id_rcq_poll_lock);
7723 7340 if (state->id_rcq_poll_busy & redo_flag)
7724 7341 state->id_rcq_poll_busy &= ~redo_flag;
7725 7342 else {
7726 7343 state->id_rcq_poll_busy &= ~flag;
7727 7344 redo = 0;
7728 7345 }
7729 7346 mutex_exit(&state->id_rcq_poll_lock);
7730 7347
7731 7348 } while (redo);
7732 7349 }
7733 7350
7734 7351 /*
7735 7352 * Unmap the memory area associated with a given swqe.
7736 7353 */
7737 7354 void
7738 7355 ibd_unmap_mem(ibd_state_t *state, ibd_swqe_t *swqe)
7739 7356 {
7740 7357 ibt_status_t stat;
7741 7358
7742 7359 DPRINT(20, "ibd_unmap_mem: wqe=%p, seg=%d\n", swqe, swqe->w_swr.wr_nds);
7743 7360
7744 7361 if (swqe->w_mi_hdl) {
7745 7362 if ((stat = ibt_unmap_mem_iov(state->id_hca_hdl,
7746 7363 swqe->w_mi_hdl)) != IBT_SUCCESS) {
7747 7364 DPRINT(10,
7748 7365 "failed in ibt_unmap_mem_iov, ret=%d\n", stat);
7749 7366 }
7750 7367 swqe->w_mi_hdl = NULL;
7751 7368 }
7752 7369 swqe->w_swr.wr_nds = 0;
7753 7370 }
7754 7371
7755 7372 void
7756 7373 ibd_dec_ref_ace(ibd_state_t *state, ibd_ace_t *ace)
7757 7374 {
7758 7375 /*
7759 7376 * The recycling logic can be eliminated from here
7760 7377 * and put into the async thread if we create another
7761 7378 * list to hold ACE's for unjoined mcg's.
7762 7379 */
7763 7380 if (DEC_REF_DO_CYCLE(ace)) {
7764 7381 ibd_mce_t *mce;
7765 7382
7766 7383 /*
7767 7384 * Check with the lock taken: we decremented
7768 7385 * reference count without the lock, and some
7769 7386 * transmitter might already have bumped the
7770 7387 * reference count (possible in case of multicast
7771 7388 * disable when we leave the AH on the active
7772 7389 * list). If not still 0, get out, leaving the
7773 7390 * recycle bit intact.
7774 7391 *
7775 7392 * Atomically transition the AH from active
7776 7393 * to free list, and queue a work request to
7777 7394 * leave the group and destroy the mce. No
7778 7395 * transmitter can be looking at the AH or
7779 7396 * the MCE in between, since we have the
7780 7397 * ac_mutex lock. In the SendOnly reap case,
7781 7398 * it is not necessary to hold the ac_mutex
7782 7399 * and recheck the ref count (since the AH was
7783 7400 * taken off the active list), we just do it
7784 7401 * to have uniform processing with the Full
7785 7402 * reap case.
7786 7403 */
7787 7404 mutex_enter(&state->id_ac_mutex);
7788 7405 mce = ace->ac_mce;
7789 7406 if (GET_REF_CYCLE(ace) == 0) {
7790 7407 CLEAR_REFCYCLE(ace);
7791 7408 /*
7792 7409 * Identify the case of fullmember reap as
7793 7410 * opposed to mcg trap reap. Also, port up
7794 7411 * might set ac_mce to NULL to indicate Tx
7795 7412 * cleanup should do no more than put the
7796 7413 * AH in the free list (see ibd_async_link).
7797 7414 */
7798 7415 if (mce != NULL) {
7799 7416 ace->ac_mce = NULL;
7800 7417 IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
7801 7418 /*
7802 7419 * mc_req was initialized at mce
7803 7420 * creation time.
7804 7421 */
7805 7422 ibd_queue_work_slot(state,
7806 7423 &mce->mc_req, IBD_ASYNC_REAP);
7807 7424 }
7808 7425 IBD_ACACHE_INSERT_FREE(state, ace);
7809 7426 }
7810 7427 mutex_exit(&state->id_ac_mutex);
7811 7428 }
7812 7429 }
7813 7430
7814 7431 /*
7815 7432 * Common code that deals with clean ups after a successful or
7816 7433 * erroneous transmission attempt.
7817 7434 */
7818 7435 static void
7819 7436 ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe)
7820 7437 {
7821 7438 ibd_ace_t *ace = swqe->w_ahandle;
7822 7439
7823 7440 DPRINT(20, "ibd_tx_cleanup %p\n", swqe);
7824 7441
7825 7442 /*
7826 7443 * If this was a dynamic mapping in ibd_send(), we need to
7827 7444 * unmap here. If this was an lso buffer we'd used for sending,
7828 7445 * we need to release the lso buf to the pool, since the resource
7829 7446 * is scarce. However, if this was simply a normal send using
7830 7447 * the copybuf (present in each swqe), we don't need to release it.
7831 7448 */
7832 7449 if (swqe->swqe_im_mblk != NULL) {
7833 7450 if (swqe->w_buftype == IBD_WQE_MAPPED) {
7834 7451 ibd_unmap_mem(state, swqe);
7835 7452 } else if (swqe->w_buftype == IBD_WQE_LSOBUF) {
7836 7453 ibd_release_lsobufs(state,
7837 7454 swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds);
7838 7455 }
7839 7456 ibd_free_lsohdr(swqe, swqe->swqe_im_mblk);
7840 7457 freemsg(swqe->swqe_im_mblk);
7841 7458 swqe->swqe_im_mblk = NULL;
7842 7459 }
7843 7460
7844 7461 /*
7845 7462 * Drop the reference count on the AH; it can be reused
7846 7463 * now for a different destination if there are no more
7847 7464 * posted sends that will use it. This can be eliminated
7848 7465 * if we can always associate each Tx buffer with an AH.
7849 7466 * The ace can be null if we are cleaning up from the
7850 7467 * ibd_send() error path.
7851 7468 */
7852 7469 if (ace != NULL) {
7853 7470 ibd_dec_ref_ace(state, ace);
7854 7471 }
7855 7472
7856 7473 /*
7857 7474 * Release the send wqe for reuse.
7858 7475 */
7859 7476 swqe->swqe_next = NULL;
7860 7477 ibd_release_swqe(state, swqe, swqe, 1);
7861 7478 }
7862 7479
7863 7480 static void
7864 7481 ibd_tx_cleanup_list(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail)
7865 7482 {
7866 7483 ibd_ace_t *ace;
7867 7484 ibd_swqe_t *swqe;
7868 7485 int n = 0;
7869 7486
7870 7487 DPRINT(20, "ibd_tx_cleanup_list %p %p\n", head, tail);
7871 7488
7872 7489 for (swqe = head; swqe != NULL; swqe = WQE_TO_SWQE(swqe->swqe_next)) {
7873 7490
7874 7491 /*
7875 7492 * If this was a dynamic mapping in ibd_send(), we need to
7876 7493 * unmap here. If this was an lso buffer we'd used for sending,
7877 7494 * we need to release the lso buf to the pool, since the
7878 7495 * resource is scarce. However, if this was simply a normal
7879 7496 * send using the copybuf (present in each swqe), we don't need
7880 7497 * to release it.
7881 7498 */
7882 7499 if (swqe->swqe_im_mblk != NULL) {
7883 7500 if (swqe->w_buftype == IBD_WQE_MAPPED) {
7884 7501 ibd_unmap_mem(state, swqe);
7885 7502 } else if (swqe->w_buftype == IBD_WQE_LSOBUF) {
7886 7503 ibd_release_lsobufs(state,
7887 7504 swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds);
7888 7505 }
7889 7506 ibd_free_lsohdr(swqe, swqe->swqe_im_mblk);
7890 7507 freemsg(swqe->swqe_im_mblk);
7891 7508 swqe->swqe_im_mblk = NULL;
7892 7509 }
7893 7510
7894 7511 /*
7895 7512 * Drop the reference count on the AH; it can be reused
7896 7513 * now for a different destination if there are no more
7897 7514 * posted sends that will use it. This can be eliminated
7898 7515 * if we can always associate each Tx buffer with an AH.
7899 7516 * The ace can be null if we are cleaning up from the
7900 7517 * ibd_send() error path.
7901 7518 */
7902 7519 ace = swqe->w_ahandle;
7903 7520 if (ace != NULL) {
7904 7521 ibd_dec_ref_ace(state, ace);
7905 7522 }
7906 7523 n++;
7907 7524 }
7908 7525
7909 7526 /*
7910 7527 * Release the send wqes for reuse.
7911 7528 */
7912 7529 ibd_release_swqe(state, head, tail, n);
7913 7530 }
7914 7531
7915 7532 /*
7916 7533 * Processing to be done after receipt of a packet; hand off to GLD
7917 7534 * in the format expected by GLD. The received packet has this
7918 7535 * format: 2b sap :: 00 :: data.
7919 7536 */
7920 7537 static mblk_t *
7921 7538 ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc)
7922 7539 {
7923 7540 ib_header_info_t *phdr;
7924 7541 mblk_t *mp;
7925 7542 ipoib_hdr_t *ipibp;
7926 7543 ipha_t *iphap;
7927 7544 ip6_t *ip6h;
7928 7545 int len;
7929 7546 ib_msglen_t pkt_len = wc->wc_bytes_xfer;
7930 7547 uint32_t bufs;
7931 7548
7932 7549 /*
7933 7550 	 * Track the number of buffers handed to the upper layer that need to be returned.
7934 7551 */
7935 7552 bufs = atomic_inc_32_nv(&state->id_rx_list.dl_bufs_outstanding);
7936 7553
7937 7554 /* Never run out of rwqes, use allocb when running low */
7938 7555 if (bufs >= state->id_rx_bufs_outstanding_limit) {
7939 7556 atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding);
7940 7557 atomic_inc_32(&state->id_rx_allocb);
7941 7558 mp = allocb(pkt_len, BPRI_HI);
7942 7559 if (mp) {
7943 7560 bcopy(rwqe->rwqe_im_mblk->b_rptr, mp->b_rptr, pkt_len);
7944 7561 ibd_post_recv(state, rwqe);
7945 7562 } else { /* no memory */
7946 7563 atomic_inc_32(&state->id_rx_allocb_failed);
7947 7564 ibd_post_recv(state, rwqe);
7948 7565 return (NULL);
7949 7566 }
7950 7567 } else {
7951 7568 mp = rwqe->rwqe_im_mblk;
7952 7569 }
7953 7570
7954 7571
7955 7572 /*
7956 7573 * Adjust write pointer depending on how much data came in.
7957 7574 */
7958 7575 mp->b_wptr = mp->b_rptr + pkt_len;
7959 7576
7960 7577 /*
7961 7578 * Make sure this is NULL or we're in trouble.
7962 7579 */
7963 7580 if (mp->b_next != NULL) {
7964 7581 ibd_print_warn(state,
7965 7582 "ibd_process_rx: got duplicate mp from rcq?");
7966 7583 mp->b_next = NULL;
7967 7584 }
7968 7585
7969 7586 /*
7970 7587 	 * The IB link delivers one of the IB link layer
7971 7588 	 * headers, the Global Routing Header (GRH). The ibd
7972 7589 	 * driver uses the information in the GRH to build the
7973 7590 	 * ib_header_info_t structure and passes it with the
7974 7591 	 * datagram up to GLDv3.
7975 7592 * If the GRH is not valid, indicate to GLDv3 by setting
7976 7593 * the VerTcFlow field to 0.
7977 7594 */
7978 7595 phdr = (ib_header_info_t *)mp->b_rptr;
7979 7596 if (wc->wc_flags & IBT_WC_GRH_PRESENT) {
7980 7597 phdr->ib_grh.ipoib_sqpn = htonl(wc->wc_qpn);
7981 7598
7982 7599 		/* If it is a loopback packet, just drop it. */
7983 7600 if (state->id_enable_rc) {
7984 7601 if (bcmp(&phdr->ib_grh.ipoib_sqpn,
7985 7602 &state->rc_macaddr_loopback,
7986 7603 IPOIB_ADDRL) == 0) {
7987 7604 freemsg(mp);
7988 7605 return (NULL);
7989 7606 }
7990 7607 } else {
7991 7608 if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->id_macaddr,
7992 7609 IPOIB_ADDRL) == 0) {
7993 7610 freemsg(mp);
7994 7611 return (NULL);
7995 7612 }
7996 7613 }
7997 7614
7998 7615 ovbcopy(&phdr->ib_grh.ipoib_sqpn, &phdr->ib_src,
7999 7616 sizeof (ipoib_mac_t));
8000 7617 if (*(uint8_t *)(phdr->ib_grh.ipoib_dgid_pref) == 0xFF) {
8001 7618 phdr->ib_dst.ipoib_qpn = htonl(IB_MC_QPN);
8002 7619 IBD_CLEAR_SCOPE_PKEY(&phdr->ib_dst);
8003 7620 } else {
8004 7621 phdr->ib_dst.ipoib_qpn = state->id_macaddr.ipoib_qpn;
8005 7622 }
8006 7623 } else {
8007 7624 /*
8008 7625 		 * It cannot be an IBA multicast packet; it must have been
8009 7626 		 * unicast to us. Just copy the interface address to dst.
8010 7627 */
8011 7628 phdr->ib_grh.ipoib_vertcflow = 0;
8012 7629 ovbcopy(&state->id_macaddr, &phdr->ib_dst,
8013 7630 sizeof (ipoib_mac_t));
8014 7631 }
8015 7632
8016 7633 /*
8017 7634 * For ND6 packets, padding is at the front of the source/target
8018 7635 	 * lladdr. However, the inet6 layer is not aware of it, so remove
8019 7636 * the padding from such packets.
8020 7637 */
8021 7638 ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t));
8022 7639 if (ntohs(ipibp->ipoib_type) == ETHERTYPE_IPV6) {
8023 7640 ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
8024 7641 len = ntohs(ip6h->ip6_plen);
8025 7642 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
8026 7643 /* LINTED: E_CONSTANT_CONDITION */
8027 7644 IBD_PAD_NSNA(ip6h, len, IBD_RECV);
8028 7645 }
8029 7646 }
8030 7647
8031 7648 /*
8032 7649 * Update statistics
8033 7650 */
8034 7651 atomic_add_64(&state->id_rcv_bytes, pkt_len);
8035 7652 atomic_inc_64(&state->id_rcv_pkt);
8036 7653 if (bcmp(&phdr->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
8037 7654 atomic_inc_64(&state->id_brd_rcv);
8038 7655 else if ((ntohl(phdr->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
8039 7656 atomic_inc_64(&state->id_multi_rcv);
8040 7657
8041 7658 iphap = (ipha_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
8042 7659 /*
8043 7660 * Set receive checksum status in mp
8044 7661 * Hardware checksumming can be considered valid only if:
8045 7662 * 1. CQE.IP_OK bit is set
8046 7663 * 2. CQE.CKSUM = 0xffff
8047 7664 * 3. IPv6 routing header is not present in the packet
8048 7665 	 * 4. No IP options are present in the IP header
8049 7666 */
8050 7667
8051 7668 if (((wc->wc_flags & IBT_WC_CKSUM_OK) == IBT_WC_CKSUM_OK) &&
8052 7669 (wc->wc_cksum == 0xFFFF) &&
8053 7670 (iphap->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)) {
8054 7671 mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM_OK);
8055 7672 }
8056 7673
8057 7674 return (mp);
8058 7675 }
8059 7676
8060 7677 /*
8061 7678  * Callback code invoked from STREAMS when the receive data buffer is
8062 7679 * free for recycling.
8063 7680 */
8064 7681 static void
8065 7682 ibd_freemsg_cb(char *arg)
8066 7683 {
8067 7684 ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
8068 7685 ibd_state_t *state = rwqe->w_state;
8069 7686
8070 7687 atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding);
8071 7688
8072 7689 /*
8073 7690 * If the driver is stopped, just free the rwqe.
8074 7691 */
8075 7692 if (atomic_add_32_nv(&state->id_running, 0) == 0) {
8076 7693 DPRINT(6, "ibd_freemsg: wqe being freed");
8077 7694 rwqe->rwqe_im_mblk = NULL;
8078 7695 ibd_free_rwqe(state, rwqe);
8079 7696 return;
8080 7697 }
8081 7698
8082 7699 rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
8083 7700 state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
8084 7701 if (rwqe->rwqe_im_mblk == NULL) {
8085 7702 ibd_free_rwqe(state, rwqe);
8086 7703 DPRINT(6, "ibd_freemsg: desballoc failed");
8087 7704 return;
8088 7705 }
8089 7706
8090 7707 ibd_post_recv(state, rwqe);
8091 7708 }
8092 7709
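/*
 * Reclaim Tx resources by polling the send completion queue. Given the
 * caddr_t argument and the DDI_INTR_CLAIMED return value, this appears
 * intended to run as a soft interrupt handler.
 */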
8093 7710 static uint_t
8094 7711 ibd_tx_recycle(caddr_t arg)
8095 7712 {
8096 7713 ibd_state_t *state = (ibd_state_t *)arg;
8097 7714
8098 7715 /*
8099 7716 * Poll for completed entries
8100 7717 */
8101 7718 ibd_poll_scq(state, state->id_scq_hdl);
8102 7719
8103 7720 return (DDI_INTR_CLAIMED);
8104 7721 }
8105 7722
8106 7723 #ifdef IBD_LOGGING
8107 7724 static void
8108 7725 ibd_log_init(void)
8109 7726 {
8110 7727 ibd_lbuf = kmem_zalloc(IBD_LOG_SZ, KM_SLEEP);
8111 7728 ibd_lbuf_ndx = 0;
8112 7729
8113 7730 mutex_init(&ibd_lbuf_lock, NULL, MUTEX_DRIVER, NULL);
8114 7731 }
8115 7732
8116 7733 static void
8117 7734 ibd_log_fini(void)
8118 7735 {
8119 7736 if (ibd_lbuf)
8120 7737 kmem_free(ibd_lbuf, IBD_LOG_SZ);
8121 7738 ibd_lbuf_ndx = 0;
8122 7739 ibd_lbuf = NULL;
8123 7740
8124 7741 mutex_destroy(&ibd_lbuf_lock);
8125 7742 }
8126 7743
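/*
 * Append a formatted message to the global circular log buffer ibd_lbuf.
 * The slot for the message is reserved (and the index wrapped when fewer
 * than two max-size lines of space remain) under ibd_lbuf_lock; the copy
 * into the reserved slot happens after the lock is dropped.
 */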
8127 7744 static void
8128 7745 ibd_log(const char *fmt, ...)
8129 7746 {
8130 7747 va_list ap;
8131 7748 uint32_t off;
8132 7749 uint32_t msglen;
8133 7750 char tmpbuf[IBD_DMAX_LINE];
8134 7751
8135 7752 if (ibd_lbuf == NULL)
8136 7753 return;
8137 7754
8138 7755 va_start(ap, fmt);
8139 7756 msglen = vsnprintf(tmpbuf, IBD_DMAX_LINE, fmt, ap);
8140 7757 va_end(ap);
8141 7758
8142 7759 if (msglen >= IBD_DMAX_LINE)
8143 7760 msglen = IBD_DMAX_LINE - 1;
8144 7761
8145 7762 mutex_enter(&ibd_lbuf_lock);
8146 7763
8147 7764 off = ibd_lbuf_ndx; /* current msg should go here */
8148 7765 if ((ibd_lbuf_ndx) && (ibd_lbuf[ibd_lbuf_ndx-1] != '\n'))
8149 7766 ibd_lbuf[ibd_lbuf_ndx-1] = '\n';
8150 7767
8151 7768 ibd_lbuf_ndx += msglen; /* place where next msg should start */
8152 7769 ibd_lbuf[ibd_lbuf_ndx] = 0; /* current msg should terminate */
8153 7770
8154 7771 if (ibd_lbuf_ndx >= (IBD_LOG_SZ - 2 * IBD_DMAX_LINE))
8155 7772 ibd_lbuf_ndx = 0;
8156 7773
8157 7774 mutex_exit(&ibd_lbuf_lock);
8158 7775
8159 7776 bcopy(tmpbuf, ibd_lbuf+off, msglen); /* no lock needed for this */
8160 7777 }
8161 7778 #endif
8162 7779
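/*
 * Ioctl handler that creates a partition object on a port instance:
 * validate the pkey (checking the port's pkey table and link state unless
 * a forced create was requested), reject duplicates, attach the new
 * partition, register it with the mac layer, create its datalink and add
 * it to the global partition list.
 */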
8163 7780 /* ARGSUSED */
8164 7781 static int
8165 7782 ibd_create_partition(void *karg, intptr_t arg, int mode, cred_t *credp,
8166 7783 int *rvalp)
8167 7784 {
8168 7785 ibd_create_ioctl_t *cmd = karg;
8169 7786 ibd_state_t *state, *port_state, *p;
8170 7787 int i, err, rval = 0;
8171 7788 mac_register_t *macp;
8172 7789 ibt_hca_portinfo_t *pinfop = NULL;
8173 7790 ibt_status_t ibt_status;
8174 7791 uint_t psize, pinfosz;
8175 7792 boolean_t force_create = B_FALSE;
8176 7793
8177 7794 cmd->ibdioc.ioc_status = 0;
8178 7795
8179 7796 if (cmd->ibdioc.ioc_port_inst < 0) {
8180 7797 cmd->ibdioc.ioc_status = IBD_INVALID_PORT_INST;
8181 7798 return (EINVAL);
8182 7799 }
8183 7800 port_state = ddi_get_soft_state(ibd_list, cmd->ibdioc.ioc_port_inst);
8184 7801 if (port_state == NULL) {
8185 7802 DPRINT(10, "ibd_create_partition: failed to get state %d",
8186 7803 cmd->ibdioc.ioc_port_inst);
8187 7804 cmd->ibdioc.ioc_status = IBD_INVALID_PORT_INST;
8188 7805 return (EINVAL);
8189 7806 }
8190 7807
8191 7808 /* Limited PKeys not supported */
8192 7809 if (cmd->ioc_pkey <= IB_PKEY_INVALID_FULL) {
8193 7810 rval = EINVAL;
8194 7811 goto part_create_return;
8195 7812 }
8196 7813
8197 7814 if (cmd->ioc_force_create == 0) {
8198 7815 /*
8199 7816 * Check if the port pkey table contains the pkey for which
8200 7817 * this partition is being created.
8201 7818 */
8202 7819 ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl,
8203 7820 port_state->id_port, &pinfop, &psize, &pinfosz);
8204 7821
8205 7822 if ((ibt_status != IBT_SUCCESS) || (psize != 1)) {
8206 7823 rval = EINVAL;
8207 7824 goto part_create_return;
8208 7825 }
8209 7826
8210 7827 if (pinfop->p_linkstate != IBT_PORT_ACTIVE) {
8211 7828 rval = ENETDOWN;
8212 7829 cmd->ibdioc.ioc_status = IBD_PORT_IS_DOWN;
8213 7830 goto part_create_return;
8214 7831 }
8215 7832
8216 7833 for (i = 0; i < pinfop->p_pkey_tbl_sz; i++) {
8217 7834 if (pinfop->p_pkey_tbl[i] == cmd->ioc_pkey) {
8218 7835 break;
8219 7836 }
8220 7837 }
8221 7838 if (i == pinfop->p_pkey_tbl_sz) {
8222 7839 rval = EINVAL;
8223 7840 cmd->ibdioc.ioc_status = IBD_PKEY_NOT_PRESENT;
8224 7841 goto part_create_return;
8225 7842 }
8226 7843 } else {
8227 7844 force_create = B_TRUE;
8228 7845 }
8229 7846
8230 7847 mutex_enter(&ibd_objlist_lock);
8231 7848 for (p = ibd_objlist_head; p; p = p->id_next) {
8232 7849 if ((p->id_port_inst == cmd->ibdioc.ioc_port_inst) &&
8233 7850 (p->id_pkey == cmd->ioc_pkey) &&
8234 7851 (p->id_plinkid == cmd->ioc_partid)) {
8235 7852 mutex_exit(&ibd_objlist_lock);
8236 7853 rval = EEXIST;
8237 7854 cmd->ibdioc.ioc_status = IBD_PARTITION_EXISTS;
8238 7855 goto part_create_return;
8239 7856 }
8240 7857 }
8241 7858 mutex_exit(&ibd_objlist_lock);
8242 7859
8243 7860 state = kmem_zalloc(sizeof (ibd_state_t), KM_SLEEP);
8244 7861
8245 7862 state->id_type = IBD_PARTITION_OBJ;
8246 7863
8247 7864 state->id_plinkid = cmd->ioc_partid;
8248 7865 state->id_dlinkid = cmd->ibdioc.ioc_linkid;
8249 7866 state->id_port_inst = cmd->ibdioc.ioc_port_inst;
8250 7867
8251 7868 state->id_dip = port_state->id_dip;
8252 7869 state->id_port = port_state->id_port;
8253 7870 state->id_pkey = cmd->ioc_pkey;
8254 7871 state->id_hca_guid = port_state->id_hca_guid;
8255 7872 state->id_port_guid = port_state->id_port_guid;
8256 7873 state->id_force_create = force_create;
8257 7874
8258 7875 mutex_init(&state->id_macst_lock, NULL, MUTEX_DRIVER, NULL);
8259 7876 cv_init(&state->id_macst_cv, NULL, CV_DEFAULT, NULL);
8260 7877
8261 7878 if (ibd_part_attach(state, state->id_dip) != DDI_SUCCESS) {
8262 7879 rval = EIO;
8263 7880 cmd->ibdioc.ioc_status = IBD_NO_HW_RESOURCE;
8264 7881 goto fail;
8265 7882 }
8266 7883
8267 7884 if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
8268 7885 rval = EAGAIN;
8269 7886 goto fail;
8270 7887 }
8271 7888
8272 7889 macp->m_type_ident = MAC_PLUGIN_IDENT_IB;
8273 7890 macp->m_dip = port_state->id_dip;
8274 7891 macp->m_instance = (uint_t)-1;
8275 7892 macp->m_driver = state;
8276 7893 macp->m_src_addr = (uint8_t *)&state->id_macaddr;
8277 7894 macp->m_callbacks = &ibd_m_callbacks;
8278 7895 macp->m_min_sdu = 0;
8279 7896 macp->m_multicast_sdu = IBD_DEF_MAX_SDU;
8280 7897 if (state->id_enable_rc) {
8281 7898 macp->m_max_sdu = IBD_DEF_RC_MAX_SDU;
8282 7899 } else {
8283 7900 macp->m_max_sdu = IBD_DEF_MAX_SDU;
8284 7901 }
8285 7902 macp->m_priv_props = ibd_priv_props;
8286 7903
8287 7904 err = mac_register(macp, &state->id_mh);
8288 7905 mac_free(macp);
8289 7906
8290 7907 if (err != 0) {
8291 7908 DPRINT(10, "ibd_create_partition: mac_register() failed %d",
8292 7909 err);
8293 7910 rval = err;
8294 7911 goto fail;
8295 7912 }
8296 7913
8297 7914 err = dls_devnet_create(state->id_mh,
8298 7915 cmd->ioc_partid, crgetzoneid(credp));
8299 7916 if (err != 0) {
8300 7917 DPRINT(10, "ibd_create_partition: dls_devnet_create() failed "
8301 7918 "%d", err);
8302 7919 rval = err;
8303 7920 (void) mac_unregister(state->id_mh);
8304 7921 goto fail;
8305 7922 }
8306 7923
8307 7924 /*
8308 7925 * Add the new partition state structure to the list
8309 7926 */
8310 7927 mutex_enter(&ibd_objlist_lock);
8311 7928 if (ibd_objlist_head)
8312 7929 state->id_next = ibd_objlist_head;
8313 7930
8314 7931 ibd_objlist_head = state;
8315 7932 mutex_exit(&ibd_objlist_lock);
8316 7933
8317 7934 part_create_return:
8318 7935 if (pinfop) {
8319 7936 ibt_free_portinfo(pinfop, pinfosz);
8320 7937 }
8321 7938 return (rval);
8322 7939
8323 7940 fail:
8324 7941 if (pinfop) {
8325 7942 ibt_free_portinfo(pinfop, pinfosz);
8326 7943 }
8327 7944 ibd_part_unattach(state);
8328 7945 kmem_free(state, sizeof (ibd_state_t));
8329 7946 return (rval);
8330 7947 }
8331 7948
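/*
 * Ioctl handler that deletes a partition object: destroy its datalink,
 * verify the instance is neither started, in late HCA init, nor busy
 * (re-creating the datalink if it is), then unattach it, remove it from
 * the global list, unregister it from the mac layer and free its state.
 */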
8332 7949 /* ARGSUSED */
8333 7950 static int
8334 7951 ibd_delete_partition(void *karg, intptr_t arg, int mode, cred_t *credp,
8335 7952 int *rvalp)
8336 7953 {
8337 7954 int err;
8338 7955 datalink_id_t tmpid;
8339 7956 ibd_state_t *node, *prev;
8340 7957 ibd_delete_ioctl_t *cmd = karg;
8341 7958
8342 7959 prev = NULL;
8343 7960
8344 7961 mutex_enter(&ibd_objlist_lock);
8345 7962 node = ibd_objlist_head;
8346 7963
8347 7964 /* Find the ibd state structure corresponding to the partition */
8348 7965 while (node != NULL) {
8349 7966 if (node->id_plinkid == cmd->ioc_partid)
8350 7967 break;
8351 7968 prev = node;
8352 7969 node = node->id_next;
8353 7970 }
8354 7971
8355 7972 if (node == NULL) {
8356 7973 mutex_exit(&ibd_objlist_lock);
8357 7974 return (ENOENT);
8358 7975 }
8359 7976
8360 7977 if ((err = dls_devnet_destroy(node->id_mh, &tmpid, B_TRUE)) != 0) {
8361 7978 DPRINT(10, "ibd_delete_partition: dls_devnet_destroy() failed "
8362 7979 "%d", err);
8363 7980 mutex_exit(&ibd_objlist_lock);
8364 7981 return (err);
8365 7982 }
8366 7983
8367 7984 /*
8368 7985 * Call ibd_part_unattach() only after making sure that the instance has
8369 7986 * not been started yet and is also not in late hca init mode.
8370 7987 */
8371 7988 ibd_set_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS);
8372 7989
8373 7990 err = 0;
8374 7991 if ((node->id_mac_state & IBD_DRV_STARTED) ||
8375 7992 (node->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) ||
8376 7993 (ibd_part_busy(node) != DDI_SUCCESS) ||
8377 7994 ((err = mac_disable(node->id_mh)) != 0)) {
8378 7995 (void) dls_devnet_create(node->id_mh, cmd->ioc_partid,
8379 7996 crgetzoneid(credp));
8380 7997 ibd_clr_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS);
8381 7998 mutex_exit(&ibd_objlist_lock);
8382 7999 return (err != 0 ? err : EBUSY);
8383 8000 }
8384 8001
8385 8002 node->id_mac_state |= IBD_DRV_IN_DELETION;
8386 8003
8387 8004 ibd_part_unattach(node);
8388 8005
8389 8006 ibd_clr_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS);
8390 8007
8391 8008 /* Remove the partition state structure from the linked list */
8392 8009 if (prev == NULL)
8393 8010 ibd_objlist_head = node->id_next;
8394 8011 else
8395 8012 prev->id_next = node->id_next;
8396 8013 mutex_exit(&ibd_objlist_lock);
8397 8014
8398 8015 if ((err = mac_unregister(node->id_mh)) != 0) {
8399 8016 DPRINT(10, "ibd_delete_partition: mac_unregister() failed %d",
8400 8017 err);
8401 8018 }
8402 8019
8403 8020 cv_destroy(&node->id_macst_cv);
8404 8021 mutex_destroy(&node->id_macst_lock);
8405 8022
8406 8023 kmem_free(node, sizeof (ibd_state_t));
8407 8024
8408 8025 return (0);
8409 8026 }
8410 8027
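/*
 * Ioctl handler that reports partition and port information: partition
 * attributes (IBD_INFO_CMD_IBPART), the port pkey table
 * (IBD_INFO_CMD_IBPORT) or the pkey table size (IBD_INFO_CMD_PKEYTBLSZ),
 * handling both ILP32 and LP64 callers.
 */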
8411 8028 /* ARGSUSED */
8412 8029 static int
8413 8030 ibd_get_partition_info(void *karg, intptr_t arg, int mode, cred_t *cred,
8414 8031 int *rvalp)
8415 8032 {
8416 8033 ibd_ioctl_t cmd;
8417 8034 ibpart_ioctl_t partioc;
8418 8035 ibport_ioctl_t portioc;
8419 8036 #ifdef _MULTI_DATAMODEL
8420 8037 ibport_ioctl32_t portioc32;
8421 8038 #endif
8422 8039 ibd_state_t *state, *port_state;
8423 8040 int size;
8424 8041 ibt_hca_portinfo_t *pinfop = NULL;
8425 8042 ibt_status_t ibt_status;
8426 8043 uint_t psize, pinfosz;
8427 8044 int rval = 0;
8428 8045
8429 8046 size = sizeof (ibd_ioctl_t);
8430 8047 if (ddi_copyin((void *)arg, &cmd, size, mode)) {
8431 8048 return (EFAULT);
8432 8049 }
8433 8050 cmd.ioc_status = 0;
8434 8051 switch (cmd.ioc_info_cmd) {
8435 8052 case IBD_INFO_CMD_IBPART:
8436 8053 size = sizeof (ibpart_ioctl_t);
8437 8054 if (ddi_copyin((void *)arg, &partioc, size, mode)) {
8438 8055 return (EFAULT);
8439 8056 }
8440 8057
8441 8058 mutex_enter(&ibd_objlist_lock);
8442 8059 		/* Find the ibd state structure corresponding to the partition */
8443 8060 for (state = ibd_objlist_head; state; state = state->id_next) {
8444 8061 if (state->id_plinkid == cmd.ioc_linkid) {
8445 8062 break;
8446 8063 }
8447 8064 }
8448 8065
8449 8066 if (state == NULL) {
8450 8067 mutex_exit(&ibd_objlist_lock);
8451 8068 return (ENOENT);
8452 8069 }
8453 8070
8454 8071 partioc.ibdioc.ioc_linkid = state->id_dlinkid;
8455 8072 partioc.ibdioc.ioc_port_inst = state->id_port_inst;
8456 8073 partioc.ibdioc.ioc_portnum = state->id_port;
8457 8074 partioc.ibdioc.ioc_hcaguid = state->id_hca_guid;
8458 8075 partioc.ibdioc.ioc_portguid = state->id_port_guid;
8459 8076 partioc.ibdioc.ioc_status = 0;
8460 8077 partioc.ioc_partid = state->id_plinkid;
8461 8078 partioc.ioc_pkey = state->id_pkey;
8462 8079 partioc.ioc_force_create = state->id_force_create;
8463 8080 if (ddi_copyout((void *)&partioc, (void *)arg, size, mode)) {
8464 8081 mutex_exit(&ibd_objlist_lock);
8465 8082 return (EFAULT);
8466 8083 }
8467 8084 mutex_exit(&ibd_objlist_lock);
8468 8085
8469 8086 break;
8470 8087
8471 8088 case IBD_INFO_CMD_IBPORT:
8472 8089 if ((cmd.ioc_port_inst < 0) || ((port_state =
8473 8090 ddi_get_soft_state(ibd_list, cmd.ioc_port_inst)) == NULL)) {
8474 8091 			DPRINT(10, "ibd_get_partition_info: failed to get"
8475 8092 " state %d", cmd.ioc_port_inst);
8476 8093 size = sizeof (ibd_ioctl_t);
8477 8094 cmd.ioc_status = IBD_INVALID_PORT_INST;
8478 8095 if (ddi_copyout((void *)&cmd, (void *)arg, size,
8479 8096 mode)) {
8480 8097 return (EFAULT);
8481 8098 }
8482 8099 return (EINVAL);
8483 8100 }
8484 8101 ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl,
8485 8102 port_state->id_port, &pinfop, &psize, &pinfosz);
8486 8103 if ((ibt_status != IBT_SUCCESS) || (psize != 1)) {
8487 8104 return (EINVAL);
8488 8105 }
8489 8106 #ifdef _MULTI_DATAMODEL
8490 8107 switch (ddi_model_convert_from(mode & FMODELS)) {
8491 8108 case DDI_MODEL_ILP32: {
8492 8109 size = sizeof (ibport_ioctl32_t);
8493 8110 if (ddi_copyin((void *)arg, &portioc32, size, mode)) {
8494 8111 rval = EFAULT;
8495 8112 goto fail;
8496 8113 }
8497 8114 portioc32.ibdioc.ioc_status = 0;
8498 8115 portioc32.ibdioc.ioc_portnum = port_state->id_port;
8499 8116 portioc32.ibdioc.ioc_hcaguid =
8500 8117 port_state->id_hca_guid;
8501 8118 portioc32.ibdioc.ioc_portguid =
8502 8119 port_state->id_port_guid;
8503 8120 if (portioc32.ioc_pkey_tbl_sz !=
8504 8121 pinfop->p_pkey_tbl_sz) {
8505 8122 rval = EINVAL;
8506 8123 size = sizeof (ibd_ioctl_t);
8507 8124 portioc32.ibdioc.ioc_status =
8508 8125 IBD_INVALID_PKEY_TBL_SIZE;
8509 8126 if (ddi_copyout((void *)&portioc32.ibdioc,
8510 8127 (void *)arg, size, mode)) {
8511 8128 rval = EFAULT;
8512 8129 goto fail;
8513 8130 }
8514 8131 goto fail;
8515 8132 }
8516 8133 size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t);
8517 8134 if (ddi_copyout((void *)pinfop->p_pkey_tbl,
8518 8135 (void *)(uintptr_t)portioc32.ioc_pkeys, size,
8519 8136 mode)) {
8520 8137 rval = EFAULT;
8521 8138 goto fail;
8522 8139 }
8523 8140 size = sizeof (ibport_ioctl32_t);
8524 8141 if (ddi_copyout((void *)&portioc32, (void *)arg, size,
8525 8142 mode)) {
8526 8143 rval = EFAULT;
8527 8144 goto fail;
8528 8145 }
8529 8146 break;
8530 8147 }
8531 8148 case DDI_MODEL_NONE:
8532 8149 size = sizeof (ibport_ioctl_t);
8533 8150 if (ddi_copyin((void *)arg, &portioc, size, mode)) {
8534 8151 rval = EFAULT;
8535 8152 goto fail;
8536 8153 }
8537 8154 portioc.ibdioc.ioc_status = 0;
8538 8155 portioc.ibdioc.ioc_portnum = port_state->id_port;
8539 8156 portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
8540 8157 portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
8541 8158 if (portioc.ioc_pkey_tbl_sz != pinfop->p_pkey_tbl_sz) {
8542 8159 rval = EINVAL;
8543 8160 size = sizeof (ibd_ioctl_t);
8544 8161 portioc.ibdioc.ioc_status =
8545 8162 IBD_INVALID_PKEY_TBL_SIZE;
8546 8163 if (ddi_copyout((void *)&portioc.ibdioc,
8547 8164 (void *)arg, size, mode)) {
8548 8165 rval = EFAULT;
8549 8166 goto fail;
8550 8167 }
8551 8168 goto fail;
8552 8169 }
8553 8170 size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t);
8554 8171 if (ddi_copyout((void *)pinfop->p_pkey_tbl,
8555 8172 (void *)(portioc.ioc_pkeys), size, mode)) {
8556 8173 rval = EFAULT;
8557 8174 goto fail;
8558 8175 }
8559 8176 size = sizeof (ibport_ioctl_t);
8560 8177 if (ddi_copyout((void *)&portioc, (void *)arg, size,
8561 8178 mode)) {
8562 8179 rval = EFAULT;
8563 8180 goto fail;
8564 8181 }
8565 8182 break;
8566 8183 }
8567 8184 #else /* ! _MULTI_DATAMODEL */
8568 8185 size = sizeof (ibport_ioctl_t);
8569 8186 if (ddi_copyin((void *)arg, &portioc, size, mode)) {
8570 8187 rval = EFAULT;
8571 8188 goto fail;
8572 8189 }
8573 8190 portioc.ibdioc.ioc_status = 0;
8574 8191 portioc.ibdioc.ioc_portnum = port_state->id_port;
8575 8192 portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
8576 8193 portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
8577 8194 if (portioc.ioc_pkey_tbl_sz != pinfop->p_pkey_tbl_sz) {
8578 8195 rval = EINVAL;
8579 8196 size = sizeof (ibd_ioctl_t);
8580 8197 portioc.ibdioc.ioc_status = IBD_INVALID_PKEY_TBL_SIZE;
8581 8198 if (ddi_copyout((void *)&portioc.ibdioc, (void *)arg,
8582 8199 size, mode)) {
8583 8200 rval = EFAULT;
8584 8201 goto fail;
8585 8202 }
8586 8203 goto fail;
8587 8204 }
8588 8205 size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t);
8589 8206 if (ddi_copyout((void *)pinfop->p_pkey_tbl,
8590 8207 (void *)(portioc.ioc_pkeys), size, mode)) {
8591 8208 rval = EFAULT;
8592 8209 goto fail;
8593 8210 }
8594 8211 size = sizeof (ibport_ioctl_t);
8595 8212 if (ddi_copyout((void *)&portioc, (void *)arg, size,
8596 8213 mode)) {
8597 8214 rval = EFAULT;
8598 8215 goto fail;
8599 8216 }
8600 8217 #endif /* _MULTI_DATAMODEL */
8601 8218
8602 8219 break;
8603 8220
8604 8221 case IBD_INFO_CMD_PKEYTBLSZ:
8605 8222 if ((cmd.ioc_port_inst < 0) || ((port_state =
8606 8223 ddi_get_soft_state(ibd_list, cmd.ioc_port_inst)) == NULL)) {
8607 8224 			DPRINT(10, "ibd_get_partition_info: failed to get"
8608 8225 " state %d", cmd.ioc_port_inst);
8609 8226 size = sizeof (ibd_ioctl_t);
8610 8227 cmd.ioc_status = IBD_INVALID_PORT_INST;
8611 8228 if (ddi_copyout((void *)&cmd, (void *)arg, size,
8612 8229 mode)) {
8613 8230 return (EFAULT);
8614 8231 }
8615 8232 return (EINVAL);
8616 8233 }
8617 8234 ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl,
8618 8235 port_state->id_port, &pinfop, &psize, &pinfosz);
8619 8236 if ((ibt_status != IBT_SUCCESS) || (psize != 1)) {
8620 8237 return (EINVAL);
8621 8238 }
8622 8239 #ifdef _MULTI_DATAMODEL
8623 8240 switch (ddi_model_convert_from(mode & FMODELS)) {
8624 8241 case DDI_MODEL_ILP32: {
8625 8242 size = sizeof (ibport_ioctl32_t);
8626 8243 if (ddi_copyin((void *)arg, &portioc32, size, mode)) {
8627 8244 rval = EFAULT;
8628 8245 goto fail;
8629 8246 }
8630 8247 portioc32.ibdioc.ioc_status = 0;
8631 8248 portioc32.ibdioc.ioc_portnum = port_state->id_port;
8632 8249 portioc32.ibdioc.ioc_hcaguid =
8633 8250 port_state->id_hca_guid;
8634 8251 portioc32.ibdioc.ioc_portguid =
8635 8252 port_state->id_port_guid;
8636 8253 portioc32.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz;
8637 8254 if (ddi_copyout((void *)&portioc32, (void *)arg, size,
8638 8255 mode)) {
8639 8256 rval = EFAULT;
8640 8257 goto fail;
8641 8258 }
8642 8259 break;
8643 8260 }
8644 8261 case DDI_MODEL_NONE:
8645 8262 size = sizeof (ibport_ioctl_t);
8646 8263 if (ddi_copyin((void *)arg, &portioc, size, mode)) {
8647 8264 rval = EFAULT;
8648 8265 goto fail;
8649 8266 }
8650 8267 portioc.ibdioc.ioc_status = 0;
8651 8268 portioc.ibdioc.ioc_portnum = port_state->id_port;
8652 8269 portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
8653 8270 portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
8654 8271 portioc.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz;
8655 8272 if (ddi_copyout((void *)&portioc, (void *)arg, size,
8656 8273 mode)) {
8657 8274 rval = EFAULT;
8658 8275 goto fail;
8659 8276 }
8660 8277 break;
8661 8278 }
8662 8279 #else /* ! _MULTI_DATAMODEL */
8663 8280 size = sizeof (ibport_ioctl_t);
8664 8281 if (ddi_copyin((void *)arg, &portioc, size, mode)) {
8665 8282 rval = EFAULT;
8666 8283 goto fail;
8667 8284 }
8668 8285 portioc.ibdioc.ioc_status = 0;
8669 8286 portioc.ibdioc.ioc_portnum = port_state->id_port;
8670 8287 portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
8671 8288 portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
8672 8289 portioc.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz;
8673 8290 if (ddi_copyout((void *)&portioc, (void *)arg, size,
8674 8291 mode)) {
8675 8292 rval = EFAULT;
8676 8293 goto fail;
8677 8294 }
8678 8295 #endif /* _MULTI_DATAMODEL */
8679 8296 break;
8680 8297
8681 8298 default:
8682 8299 return (EINVAL);
8683 8300
8684 8301 } /* switch (cmd.ioc_info_cmd) */
8685 8302 fail:
8686 8303 if (pinfop) {
8687 8304 ibt_free_portinfo(pinfop, pinfosz);
8688 8305 }
8689 8306 return (rval);
8690 8307 }
8691 8308
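/*
 * IBTL asynchronous event handler for the port driver instance. On port
 * up/down events it re-reads the port state and, if it has changed,
 * reports the new link state to the mac layer.
 */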
8692 8309 /* ARGSUSED */
8693 8310 static void
8694 8311 ibdpd_async_handler(void *arg, ibt_hca_hdl_t hca_hdl,
8695 8312 ibt_async_code_t code, ibt_async_event_t *event)
8696 8313 {
8697 8314 ibd_state_t *state = (ibd_state_t *)arg;
8698 8315 link_state_t lstate;
8699 8316
8700 8317 switch (code) {
8701 8318 case IBT_EVENT_PORT_UP:
8702 8319 case IBT_ERROR_PORT_DOWN:
8703 8320 if (ibd_get_port_state(state, &lstate) != 0)
8704 8321 break;
8705 8322
8706 8323 if (state->id_link_state != lstate) {
8707 8324 state->id_link_state = lstate;
8708 8325 mac_link_update(state->id_mh, lstate);
8709 8326 }
8710 8327 break;
8711 8328 default:
8712 8329 break;
8713 8330 }
8714 8331 }
8715 8332
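/*
 * Query the HCA port and translate its state into a GLDv3 link_state_t;
 * as a side effect, cache the port SGID and link speed in the soft state.
 * Returns 0 on success and -1 if the port query fails.
 */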
8716 8333 static int
8717 8334 ibd_get_port_state(ibd_state_t *state, link_state_t *lstate)
8718 8335 {
8719 8336 ibt_hca_portinfo_t *port_infop;
8720 8337 uint_t psize, port_infosz;
8721 8338 ibt_status_t ret;
8722 8339
8723 8340 ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
8724 8341 &port_infop, &psize, &port_infosz);
8725 8342 if ((ret != IBT_SUCCESS) || (psize != 1))
8726 8343 return (-1);
8727 8344
8728 8345 state->id_sgid = *port_infop->p_sgid_tbl;
8729 8346 state->id_link_speed = ibd_get_portspeed(state);
8730 8347
8731 8348 if (port_infop->p_linkstate == IBT_PORT_ACTIVE)
8732 8349 *lstate = LINK_STATE_UP;
8733 8350 else
8734 8351 *lstate = LINK_STATE_DOWN;
8735 8352
8736 8353 ibt_free_portinfo(port_infop, port_infosz);
8737 8354 return (0);
8738 8355 }
8739 8356
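/*
 * Attach processing for a port instance: allocate the soft state, read
 * the port-number, hca-guid and port-guid properties, attach to IBTL,
 * open the HCA, register with the mac layer and report the initial link
 * state. On failure, ibd_port_unattach() rolls back whatever completed.
 */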
8740 8357 static int
8741 8358 ibd_port_attach(dev_info_t *dip)
8742 8359 {
8743 8360 ibd_state_t *state;
8744 8361 link_state_t lstate;
8745 8362 int instance;
8746 8363 ibt_status_t ret;
8747 8364
8748 8365 /*
8749 8366 * Allocate softstate structure
8750 8367 */
8751 8368 instance = ddi_get_instance(dip);
8752 8369 if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE) {
8753 8370 DPRINT(10, "ibd_port_attach: ddi_soft_state_zalloc() failed");
8754 8371 return (DDI_FAILURE);
8755 8372 }
8756 8373
8757 8374 state = ddi_get_soft_state(ibd_list, instance);
8758 8375
8759 8376 state->id_dip = dip;
8760 8377 state->id_type = IBD_PORT_DRIVER;
8761 8378
8762 8379 if ((state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
8763 8380 "port-number", 0)) == 0) {
8764 8381 DPRINT(10, "ibd_port_attach: invalid port number (%d)",
8765 8382 state->id_port);
8766 8383 return (DDI_FAILURE);
8767 8384 }
8768 8385 if ((state->id_hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0,
8769 8386 "hca-guid", 0)) == 0) {
8770 8387 DPRINT(10, "ibd_port_attach: hca has invalid guid (0x%llx)",
8771 8388 state->id_hca_guid);
8772 8389 return (DDI_FAILURE);
8773 8390 }
8774 8391 if ((state->id_port_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0,
8775 8392 "port-guid", 0)) == 0) {
8776 8393 DPRINT(10, "ibd_port_attach: port has invalid guid (0x%llx)",
8777 8394 state->id_port_guid);
8778 8395 return (DDI_FAILURE);
8779 8396 }
8780 8397
8781 8398 /*
8782 8399 * Attach to IBTL
8783 8400 */
8784 8401 if ((ret = ibt_attach(&ibdpd_clnt_modinfo, dip, state,
8785 8402 &state->id_ibt_hdl)) != IBT_SUCCESS) {
8786 8403 DPRINT(10, "ibd_port_attach: failed in ibt_attach(), ret=%d",
8787 8404 ret);
8788 8405 goto done;
8789 8406 }
8790 8407
8791 8408 state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE;
8792 8409
8793 8410 if ((ret = ibt_open_hca(state->id_ibt_hdl, state->id_hca_guid,
8794 8411 &state->id_hca_hdl)) != IBT_SUCCESS) {
8795 8412 DPRINT(10, "ibd_port_attach: ibt_open_hca() failed, ret=%d",
8796 8413 ret);
8797 8414 goto done;
8798 8415 }
8799 8416 state->id_mac_state |= IBD_DRV_HCA_OPENED;
8800 8417
8801 8418 /* Update link status */
8802 8419
8803 8420 if (ibd_get_port_state(state, &lstate) != 0) {
8804 8421 		DPRINT(10,
8805 8422 		    "ibd_port_attach: ibd_get_port_state() failed");
8806 8423 goto done;
8807 8424 }
8808 8425 state->id_link_state = lstate;
8809 8426 /*
8810 8427 * Register ibd interfaces with the Nemo framework
8811 8428 */
8812 8429 if (ibd_register_mac(state, dip) != IBT_SUCCESS) {
8813 8430 DPRINT(10, "ibd_port_attach: failed in ibd_register_mac()");
8814 8431 goto done;
8815 8432 }
8816 8433 state->id_mac_state |= IBD_DRV_MAC_REGISTERED;
8817 8434
8818 8435 mac_link_update(state->id_mh, lstate);
8819 8436
8820 8437 return (DDI_SUCCESS);
8821 8438 done:
8822 8439 (void) ibd_port_unattach(state, dip);
8823 8440 return (DDI_FAILURE);
8824 8441 }
8825 8442
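/*
 * Undo ibd_port_attach() in reverse order, guided by the progress bits in
 * id_mac_state: unregister from the mac layer, close the HCA, detach from
 * IBTL and free the soft state.
 */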
8826 8443 static int
8827 8444 ibd_port_unattach(ibd_state_t *state, dev_info_t *dip)
8828 8445 {
8829 8446 int instance;
8830 8447 uint32_t progress = state->id_mac_state;
8831 8448 ibt_status_t ret;
8832 8449
8833 8450 if (progress & IBD_DRV_MAC_REGISTERED) {
8834 8451 (void) mac_unregister(state->id_mh);
8835 8452 state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED);
8836 8453 }
8837 8454
8838 8455 if (progress & IBD_DRV_HCA_OPENED) {
8839 8456 if ((ret = ibt_close_hca(state->id_hca_hdl)) !=
8840 8457 IBT_SUCCESS) {
8841 8458 ibd_print_warn(state, "failed to close "
8842 8459 "HCA device, ret=%d", ret);
8843 8460 }
8844 8461 state->id_hca_hdl = NULL;
8845 8462 state->id_mac_state &= (~IBD_DRV_HCA_OPENED);
8846 8463 }
8847 8464
8848 8465 if (progress & IBD_DRV_IBTL_ATTACH_DONE) {
8849 8466 if ((ret = ibt_detach(state->id_ibt_hdl)) != IBT_SUCCESS) {
8850 8467 ibd_print_warn(state,
8851 8468 "ibt_detach() failed, ret=%d", ret);
8852 8469 }
8853 8470 state->id_ibt_hdl = NULL;
8854 8471 state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE);
8855 8472 }
8856 8473 instance = ddi_get_instance(dip);
8857 8474 ddi_soft_state_free(ibd_list, instance);
8858 8475
8859 8476 return (DDI_SUCCESS);
8860 8477 }
8861 8478
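/*
 * Look up the partition object whose partition link id matches 'linkid'
 * and fill in its attributes for the caller.
 */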
8862 8479 ibt_status_t
8863 8480 ibd_get_part_attr(datalink_id_t linkid, ibt_part_attr_t *attr)
8864 8481 {
8865 8482 ibd_state_t *state;
8866 8483
8867 8484 mutex_enter(&ibd_objlist_lock);
8868 8485
8869 8486 	/* Find the ibd state structure corresponding to the partition */
8870 8487 for (state = ibd_objlist_head; state; state = state->id_next) {
8871 8488 if (state->id_plinkid == linkid) {
8872 8489 break;
8873 8490 }
8874 8491 }
8875 8492
8876 8493 if (state == NULL) {
8877 8494 mutex_exit(&ibd_objlist_lock);
8878 8495 return (IBT_NO_SUCH_OBJECT);
8879 8496 }
8880 8497
8881 8498 attr->pa_dlinkid = state->id_dlinkid;
8882 8499 attr->pa_plinkid = state->id_plinkid;
8883 8500 attr->pa_port = state->id_port;
8884 8501 attr->pa_hca_guid = state->id_hca_guid;
8885 8502 attr->pa_port_guid = state->id_port_guid;
8886 8503 attr->pa_pkey = state->id_pkey;
8887 8504
8888 8505 mutex_exit(&ibd_objlist_lock);
8889 8506
8890 8507 return (IBT_SUCCESS);
8891 8508 }
8892 8509
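/*
 * Return an array describing every partition object currently on the
 * list; the array is allocated here and holds *nparts entries, which the
 * caller is responsible for freeing.
 */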
8893 8510 ibt_status_t
8894 8511 ibd_get_all_part_attr(ibt_part_attr_t **attr_list, int *nparts)
8895 8512 {
8896 8513 ibd_state_t *state;
8897 8514 int n = 0;
8898 8515 ibt_part_attr_t *attr;
8899 8516
8900 8517 mutex_enter(&ibd_objlist_lock);
8901 8518
8902 8519 for (state = ibd_objlist_head; state; state = state->id_next)
8903 8520 n++;
8904 8521
8905 8522 *nparts = n;
8906 8523 if (n == 0) {
8907 8524 *attr_list = NULL;
8908 8525 mutex_exit(&ibd_objlist_lock);
8909 8526 return (IBT_SUCCESS);
8910 8527 }
8911 8528
8912 8529 *attr_list = kmem_alloc(sizeof (ibt_part_attr_t) * n, KM_SLEEP);
8913 8530 attr = *attr_list;
8914 8531 for (state = ibd_objlist_head; state; state = state->id_next) {
8915 8532 #ifdef DEBUG
8916 8533 ASSERT(n > 0);
8917 8534 n--;
8918 8535 #endif
8919 8536 attr->pa_dlinkid = state->id_dlinkid;
8920 8537 attr->pa_plinkid = state->id_plinkid;
8921 8538 attr->pa_port = state->id_port;
8922 8539 attr->pa_hca_guid = state->id_hca_guid;
8923 8540 attr->pa_port_guid = state->id_port_guid;
8924 8541 attr->pa_pkey = state->id_pkey;
8925 8542 attr++;
8926 8543 }
8927 8544
8928 8545 mutex_exit(&ibd_objlist_lock);
8929 8546 return (IBT_SUCCESS);
8930 8547 }
3572 lines elided |