/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * Copyright 2019, Joyent, Inc.
 */

#ifndef _SYS_IB_EOIB_EIB_IMPL_H
#define	_SYS_IB_EOIB_EIB_IMPL_H

#include <sys/ddi.h>
#include <sys/mac.h>
#include <sys/sunddi.h>
#include <sys/varargs.h>
#include <sys/vlan.h>
#include <sys/ib/ibtl/ibti.h>
#include <sys/ib/ibtl/ibvti.h>
#include <sys/ib/ib_pkt_hdrs.h>

#include <sys/ib/clients/eoib/fip.h>
#include <sys/ib/clients/eoib/eib.h>

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Driver specific constants
 *
 * EIB_E_FAILURE is parenthesized so that the negative literal expands
 * safely inside any larger expression.
 */
#define	EIB_E_SUCCESS			0
#define	EIB_E_FAILURE			(-1)
#define	EIB_MAX_LINE			128
#define	EIB_MAX_SGL			59
#define	EIB_MAX_POST_MULTIPLE		4
#define	EIB_MAX_PAYLOAD_HDR_SZ		160
#define	EIB_TX_COPY_THRESH		4096	/* greater than mtu */
#define	EIB_MAX_VNICS			64	/* do not change this */
#define	EIB_LOGIN_TIMEOUT_USEC		8000000
#define	EIB_RWR_CHUNK_SZ		8
#define	EIB_IPHDR_ALIGN_ROOM		32
#define	EIB_IP_HDR_ALIGN		2
#define	EIB_MAX_RX_PKTS_ONINTR		0x800
#define	EIB_MAX_LOGIN_ATTEMPTS		3
#define	EIB_MAX_VHUB_TBL_ATTEMPTS	3
#define	EIB_MAX_KA_ATTEMPTS		3
#define	EIB_MAX_ATTEMPTS		10
#define	EIB_DELAY_HALF_SECOND		500000
#define	EIB_GRH_SZ			(sizeof (ib_grh_t))

/*
 * Debug messages
 *
 * The EIB_MSGS_* values are individual bits, one per message class;
 * EIB_MSGS_DEFAULT selects the critical, error and warning classes.
 */
#define	EIB_MSGS_CRIT			0x01
#define	EIB_MSGS_ERR			0x02
#define	EIB_MSGS_WARN			0x04
#define	EIB_MSGS_DEBUG			0x08
#define	EIB_MSGS_ARGS			0x10
#define	EIB_MSGS_PKT			0x20
#define	EIB_MSGS_VERBOSE		0x40
#define	EIB_MSGS_DEFAULT		\
	(EIB_MSGS_CRIT | EIB_MSGS_ERR | EIB_MSGS_WARN)

#define	EIB_LOGSZ_DEFAULT		0x20000

/*
 * The critical, error and warning printers are always available.  The
 * debug, args, pkt and verbose printers expand to nothing unless the
 * driver is built with EIB_DEBUG defined.
 */
#define	EIB_DPRINTF_CRIT		eib_dprintf_crit
#define	EIB_DPRINTF_ERR			eib_dprintf_err
#define	EIB_DPRINTF_WARN		eib_dprintf_warn
#ifdef EIB_DEBUG
#define	EIB_DPRINTF_DEBUG		eib_dprintf_debug
#define	EIB_DPRINTF_ARGS		eib_dprintf_args
#define	EIB_DPRINTF_PKT			eib_dprintf_pkt
#define	EIB_DPRINTF_VERBOSE		eib_dprintf_verbose
#else
#define	EIB_DPRINTF_DEBUG(...)
#define	EIB_DPRINTF_ARGS(...)
#define	EIB_DPRINTF_PKT(...)
#define	EIB_DPRINTF_VERBOSE(...)
#endif

/*
 * EoIB threads to provide various services
 *
 * These are the name strings for the driver's service threads.
 */
#define	EIB_EVENTS_HDLR			"eib_events_handler"
#define	EIB_RWQES_REFILLER		"eib_rwqes_refiller"
#define	EIB_VNIC_CREATOR		"eib_vnic_creator"
#define	EIB_TXWQES_MONITOR		"eib_txwqe_monitor"
#define	EIB_LSOBUFS_MONITOR		"eib_lsobufs_monitor"

/*
 * Macro for finding the least significant bit set in a 64-bit unsigned int
 *
 * (-(v) & (v)) leaves only the lowest set bit of v; that value modulo 67
 * is then used as the index into the eib_setbit_mod67[] lookup table.
 * The argument is evaluated twice, so it must not have side effects.
 */
#define	EIB_FIND_LSB_SET(val64)	eib_setbit_mod67[((-(val64) & (val64)) % 67)]

/*
 * LSO buffers
 *
 * Under normal circumstances we should never need to use any buffer
 * that's larger than MTU.  Unfortunately, IB HCA has limitations
 * on the length of SGL that are much smaller than those for regular
 * ethernet NICs.  Since the network layer doesn't care to limit the
 * number of mblk fragments in any send mp chain, we end up having to
 * use these larger buffers occasionally.
 *
 * EIB_LSO_FREE_BUFS_THRESH works out to 1/32 of EIB_LSO_NUM_BUFS.
 */
#define	EIB_LSO_MAXLEN			65536
#define	EIB_LSO_BUFSZ			8192
#define	EIB_LSO_NUM_BUFS		1024
#define	EIB_LSO_FREE_BUFS_THRESH	(EIB_LSO_NUM_BUFS >> 5)

/*
 * Descriptor for one LSO buffer; descriptors link to one another
 * through lb_next.
 * NOTE(review): lb_buf and lb_isfree look like the buffer address and a
 * free/in-use indicator respectively -- confirm against the users.
 */
typedef struct eib_lsobuf_s {
	struct eib_lsobuf_s	*lb_next;
	uint8_t			*lb_buf;
	int			lb_isfree;
} eib_lsobuf_t;

136 typedef struct eib_lsobkt_s {
137 kmutex_t bk_lock;
138 kcondvar_t bk_cv;
139 uint_t bk_status;
140 uint8_t *bk_mem;
141 eib_lsobuf_t *bk_bufl;
142 eib_lsobuf_t *bk_free_head;
143 ibt_mr_hdl_t bk_mr_hdl;
144 ibt_lkey_t bk_lkey;
145 uint_t bk_nelem;
146 uint_t bk_nfree;
147 } eib_lsobkt_t;
148
149 #define EIB_LBUF_SHORT 0x1
150 #define EIB_LBUF_MONITOR_DIE 0x2
151
/*
 * The admin partition is only used for sending login and logout messages
 * and receiving login acknowledgements from the gateway.  While packets
 * going out on several vlans at the same time could result in multiple
 * vnic creations happening at the same time (and therefore multiple login
 * packets), we serialize the vnic creation via the vnic creator thread, so
 * we shouldn't need a lot of send wqes or receive wqes.  Note also that we
 * keep the cq size request to slightly less than a 2^n boundary to allow
 * the alloc cq routine to return the closest 2^n boundary as the real cq
 * size without wasting too much memory.
 */
#define	EIB_ADMIN_MAX_SWQE		30
#define	EIB_ADMIN_MAX_RWQE		30
#define	EIB_ADMIN_CQ_SIZE		\
	(EIB_ADMIN_MAX_SWQE + EIB_ADMIN_MAX_RWQE + 1)

/*
 * The control qp is per vhub partition, and is used to send and receive
 * vhub control messages such as vhub table request/response, vhub
 * update response and vnic alive messages.  While the vhub table response
 * and vhub update messages might take a few rwqes, the vhub table request
 * is made only once per vnic, and the vnic alive message is periodic
 * and uses a single swqe as well.  Per vnic, we should certainly not need
 * too many swqes/rwqes.
 */
#define	EIB_CTL_MAX_SWQE		30
#define	EIB_CTL_MAX_RWQE		30
#define	EIB_CTL_CQ_SIZE			\
	(EIB_CTL_MAX_SWQE + EIB_CTL_MAX_RWQE + 1)

/*
 * For the vNIC's data channel, there are three items that are of importance:
 * the constraints defined below, the hca_max_chan_sz attribute and the value of
 * (hca_max_cq_sz - 1).  The maximum limit on swqe/rwqe is set to the minimum
 * of these three values.
 *
 * While the total number of RWQEs posted to the data channel of any vNIC will
 * not exceed EIB_DATA_MAX_RWQE, we also do not want to acquire and post all of
 * it during the data channel initialization, since that is a lot of wqes for
 * one vnic to consume when we don't even know if the vnic will need it at all.
 * We post an initial set of EIB_DATA_RWQE_BKT rwqes, and slowly post more and
 * more sets as we see them being consumed, until we hit the hard limit of
 * EIB_DATA_MAX_RWQE.
 */
#define	EIB_DATA_MAX_SWQE		4000
#define	EIB_DATA_MAX_RWQE		4000
#define	EIB_DATA_RWQE_BKT		512

/*
 * vNIC data channel CQ moderation parameters
 *
 * One completion-count and one microsecond value for each of the tx
 * and rx directions.
 */
#define	EIB_TX_COMP_COUNT		10
#define	EIB_TX_COMP_USEC		300
#define	EIB_RX_COMP_COUNT		4
#define	EIB_RX_COMP_USEC		10

/*
 * qe_info masks (blk:ndx:type:flags)
 *
 * qe_info packs four 8-bit fields into one word: blk in bits 31..24,
 * ndx in bits 23..16, type in bits 15..8 and flags in bits 7..0.
 */
#define	EIB_WQEBLK_SHIFT		24
#define	EIB_WQEBLK_MASK			0xFF
#define	EIB_WQENDX_SHIFT		16
#define	EIB_WQENDX_MASK			0xFF
#define	EIB_WQETYP_SHIFT		8
#define	EIB_WQETYP_MASK			0xFF
#define	EIB_WQEFLGS_SHIFT		0
#define	EIB_WQEFLGS_MASK		0xFF

/*
 * Macros to get the bit fields from qe_info.  All four accessors have
 * the same shift-then-mask shape; the flags shift happens to be zero.
 */
#define	EIB_WQE_BLK(info)	\
	(((info) >> EIB_WQEBLK_SHIFT) & EIB_WQEBLK_MASK)
#define	EIB_WQE_NDX(info)	\
	(((info) >> EIB_WQENDX_SHIFT) & EIB_WQENDX_MASK)
#define	EIB_WQE_TYPE(info)	\
	(((info) >> EIB_WQETYP_SHIFT) & EIB_WQETYP_MASK)
#define	EIB_WQE_FLAGS(info)	\
	(((info) >> EIB_WQEFLGS_SHIFT) & EIB_WQEFLGS_MASK)

/*
 * Values for type and flags in qe_info
 */
#define	EIB_WQE_TX			0x1
#define	EIB_WQE_RX			0x2

/*
 * Flags for rx wqes/buffers
 */
#define	EIB_WQE_FLG_POSTED_TO_HCA	0x1
#define	EIB_WQE_FLG_WITH_NW		0x2

/*
 * Flags for tx wqes/buffers
 */
#define	EIB_WQE_FLG_BUFTYPE_LSO		0x4
#define	EIB_WQE_FLG_BUFTYPE_MAPPED	0x8

244 /*
245 * Send/Recv workq entries
246 */
247 typedef struct eib_wqe_s {
248 struct eib_wqe_pool_s *qe_pool;
249 uint8_t *qe_cpbuf;
250 uint8_t *qe_payload_hdr;
251 uint_t qe_bufsz;
252 uint_t qe_info;
253 int qe_vnic_inst;
254 ibt_ud_dest_hdl_t qe_dest;
255 frtn_t qe_frp;
256
257 mblk_t *qe_mp;
258 ibt_mi_hdl_t qe_iov_hdl;
259 ibt_all_wr_t qe_wr;
260 ibt_wr_ds_t qe_sgl;
261 ibt_wr_ds_t qe_big_sgl[EIB_MAX_SGL];
262 struct eib_wqe_s *qe_nxt_post;
263 struct eib_chan_s *qe_chan;
264 } eib_wqe_t;
265
/*
 * The wqe in-use/free status in EoIB is managed via a 2-level bitmap
 * logic.
 *
 * Each set of 64 wqes (a "wqe block") is managed by a single 64-bit
 * integer bitmap.  The free status of a set of 64 such wqe blocks (a
 * "wqe pool") is managed by one 64-bit integer bitmap (if any wqe in
 * the wqe block is free, the bit in the map is 1, otherwise it is 0).
 *
 * The maximum pool size is 4096 wqes, but this can easily be extended
 * to support more wqes using additional pools of wqes.
 *
 * Note that an entire pool of wqes is allocated via a single allocation,
 * the wqe addresses in a pool are all contiguous.  The tx/rx copy buffers
 * for a wqe pool are also allocated via a single allocation.
 */
#define	EIB_BLKS_PER_POOL		64
#define	EIB_WQES_PER_BLK		64	/* do not change this */
#define	EIB_WQES_PER_POOL		(EIB_BLKS_PER_POOL * EIB_WQES_PER_BLK)

/*
 * Both bitmap levels are single 64-bit words, so enforce the limits
 * described above at compile time.
 */
#if (EIB_WQES_PER_BLK != 64) || (EIB_BLKS_PER_POOL > 64)
#error "EoIB wqe block/pool bitmaps are single 64-bit words"
#endif

#define	EIB_WQE_SZ			(sizeof (eib_wqe_t))
#define	EIB_WQEBLK_SZ			(EIB_WQES_PER_BLK * EIB_WQE_SZ)

289 typedef struct eib_wqe_pool_s {
290 struct eib_wqe_pool_s *wp_next;
291 struct eib_s *wp_ss;
292 ib_vaddr_t wp_vaddr;
293 ib_memlen_t wp_memsz;
294 ibt_mr_hdl_t wp_mr;
295 ibt_lkey_t wp_lkey;
296 uint_t wp_nfree_lwm;
297 int wp_type;
298
299 kmutex_t wp_lock;
300 kcondvar_t wp_cv;
301 uint_t wp_status;
302 uint_t wp_nfree;
303 uint64_t wp_free_blks;
304 uint64_t wp_free_wqes[EIB_BLKS_PER_POOL];
305 struct eib_wqe_s *wp_wqe;
306 } eib_wqe_pool_t;
307
308 /*
309 * Values for wp_type
310 */
311 #define EIB_WP_TYPE_TX 0x1
312 #define EIB_WP_TYPE_RX 0x2
313
314 /*
315 * Values for wp_status (bit fields)
316 */
317 #define EIB_TXWQE_SHORT 0x1 /* only for tx wqe pool */
318 #define EIB_TXWQE_MONITOR_DIE 0x2 /* only for tx wqe pool */
319
320 #define EIB_RXWQE_SHORT 0x1 /* only for rx wqe pool */
321
/*
 * The low-water-mark is an indication of when wqe grabs for low-priority
 * qps should start to get refused (swqe grabs for control messages such
 * as keepalives and rwqe grabs for posting back to control qps will still
 * be allowed).  The high-water-mark is an indication of when normal
 * behavior should resume.
 */
#define	EIB_NFREE_SWQES_LWM		(EIB_WQES_PER_POOL / 64)  /* 1/64 */
#define	EIB_NFREE_SWQES_HWM		(EIB_WQES_PER_POOL / 32)  /* 1/32 */
#define	EIB_NFREE_RWQES_LWM		(EIB_WQES_PER_POOL / 10)  /* 10% */
#define	EIB_NFREE_RWQES_HWM		(EIB_WQES_PER_POOL / 5)   /* 20% */

/*
 * The "rwqes low" is used to determine when we should start using allocb()
 * to copy and send received mblks in the rx path.  It should be a little
 * above the rwqes low-water-mark, but less than the high-water-mark.
 * It is computed as the midpoint of the two marks.
 */
#define	EIB_NFREE_RWQES_LOW		\
	((EIB_NFREE_RWQES_LWM + EIB_NFREE_RWQES_HWM) / 2)

#define	EIB_WPRI_HI			1	/* for keepalive posts */
#define	EIB_WPRI_LO			2	/* for all other posts */

345 /*
346 * Multicast GID Layout: the multicast gid is specified in big-endian
347 * representation, as a collection of different-sized fields in the
348 * EoIB specification. On Solaris, the multicast gid is represented
349 * as a collection of two 8-byte fields (in ib_gid_t).
350 */
351 typedef struct eib_mgid_spec_s {
352 uint8_t sp_mgid_prefix[FIP_MGID_PREFIX_LEN];
353 uint8_t sp_type;
354 uint8_t sp_dmac[ETHERADDRL];
355 uint8_t sp_rss_hash;
356 uint8_t sp_vhub_id[FIP_VHUBID_LEN];
357 } eib_mgid_spec_t;
358
359 /*
360 * Values for sp_type in mgid as per EoIB specification
361 */
362 #define EIB_MGID_VHUB_DATA 0x0
363 #define EIB_MGID_VHUB_UPDATE 0x2
364 #define EIB_MGID_VHUB_TABLE 0x3
365
366 typedef union eib_mgid_s {
367 eib_mgid_spec_t gd_spec;
368 ib_gid_t gd_sol;
369 } eib_mgid_t;
370
371 /*
372 * Gateway properties handed over to us by the EoIB nexus
373 */
374 typedef struct eib_gw_props_s {
375 kmutex_t pp_gw_lock;
376
377 ib_guid_t pp_gw_system_guid;
378 ib_guid_t pp_gw_guid;
379 ib_sn_prefix_t pp_gw_sn_prefix;
380
381 uint_t pp_gw_adv_period;
382 uint_t pp_gw_ka_period;
383 uint_t pp_vnic_ka_period;
384
385 ib_qpn_t pp_gw_ctrl_qpn;
386 ib_lid_t pp_gw_lid;
387 uint16_t pp_gw_portid;
388
389 uint16_t pp_gw_num_net_vnics;
390 uint8_t pp_gw_flag_available;
391 uint8_t pp_gw_is_host_adm_vnics;
392 uint8_t pp_gw_sl;
393 uint8_t pp_gw_n_rss_qpn;
394
395 uint8_t *pp_gw_system_name;
396 uint8_t *pp_gw_port_name;
397 uint8_t *pp_gw_vendor_id;
398
399 clock_t pp_gw_ka_ticks; /* 2.5 x gw_ka_period */
400 clock_t pp_vnic_ka_ticks; /* vnic_ka_period */
401 } eib_gw_props_t;
402
403 /*
404 * Port-specific properties
405 */
406 typedef struct eib_props_s {
407 uint64_t ep_ifspeed;
408 ib_guid_t ep_hca_guid;
409 uint8_t ep_port_num;
410 ib_gid_t ep_sgid;
411 ib_lid_t ep_blid;
412 uint16_t ep_mtu;
413 ibt_srate_t ep_srate;
414 } eib_props_t;
415
416 /*
417 * Capabilities derived from HCA attributes
418 */
419 typedef struct eib_caps_s {
420 uint_t cp_lso_maxlen;
421 uint32_t cp_cksum_flags;
422 int cp_resv_lkey_capab;
423 ibt_lkey_t cp_resv_lkey;
424
425 uint_t cp_max_swqe;
426 uint_t cp_max_rwqe;
427 uint_t cp_max_sgl;
428 uint_t cp_hiwm_sgl;
429 } eib_caps_t;
430
431 /*
432 * List of multicast groups the vnic joined
433 */
434 typedef struct eib_mcg_s {
435 struct eib_mcg_s *mg_next;
436 ib_gid_t mg_rgid;
437 ib_gid_t mg_mgid;
438 uint8_t mg_join_state;
439 uint8_t mg_mac[ETHERADDRL];
440 ibt_mcg_info_t *mg_mcginfo;
441 } eib_mcg_t;
442
443 /*
444 * Admin/control/data channel information
445 */
446 typedef struct eib_chan_s {
447 ibt_channel_hdl_t ch_chan;
448 ib_qpn_t ch_qpn;
449
450 ibt_wc_t *ch_wc;
451 ibt_cq_hdl_t ch_cq_hdl;
452 uint_t ch_cq_sz;
453
454 ibt_wc_t *ch_rcv_wc;
455 ibt_cq_hdl_t ch_rcv_cq_hdl;
456 uint_t ch_rcv_cq_sz;
457
458 int ch_vnic_inst;
459 uint_t ch_max_swqes;
460 uint_t ch_max_rwqes;
461 uint_t ch_lwm_rwqes;
462 uint_t ch_rwqe_bktsz;
463 uint_t ch_ip_hdr_align;
464 boolean_t ch_alloc_mp;
465 boolean_t ch_tear_down;
466
467 kmutex_t ch_pkey_lock;
468 ib_pkey_t ch_pkey;
469 uint16_t ch_pkey_ix;
470
471 kmutex_t ch_cep_lock;
472 kcondvar_t ch_cep_cv;
473 ibt_cep_state_t ch_cep_state;
474
475 kmutex_t ch_tx_lock;
476 kcondvar_t ch_tx_cv;
477 uint_t ch_tx_posted;
478 boolean_t ch_tx_busy;
479 struct eib_wqe_s *ch_tx;
480 struct eib_wqe_s *ch_tx_tail;
481
482 kmutex_t ch_rx_lock;
483 kcondvar_t ch_rx_cv;
484 uint_t ch_rx_posted;
485 boolean_t ch_rx_refilling;
486
487 kmutex_t ch_vhub_lock;
488 struct eib_mcg_s *ch_vhub_table;
489 struct eib_mcg_s *ch_vhub_update;
490 struct eib_mcg_s *ch_vhub_data;
491
492 struct eib_chan_s *ch_rxpost_next;
493 } eib_chan_t;
494
/*
 * States for vNIC state machine during login
 */
#define	EIB_LOGIN_INIT			0
#define	EIB_LOGIN_ACK_WAIT		1
#define	EIB_LOGIN_ACK_RCVD		2
#define	EIB_LOGIN_NACK_RCVD		3
#define	EIB_LOGIN_TBL_WAIT		4
#define	EIB_LOGIN_TBL_INPROG		5
#define	EIB_LOGIN_TBL_DONE		6
#define	EIB_LOGIN_TBL_FAILED		7
#define	EIB_LOGIN_DONE			8
#define	EIB_LOGIN_TIMED_OUT		9
#define	EIB_LOGOUT_DONE			10

510 typedef struct eib_login_data_s {
511 ib_guid_t ld_gw_guid;
512 ib_lid_t ld_gw_lid;
513 uint_t ld_syndrome;
514 uint16_t ld_gw_port_id;
515 ib_qpn_t ld_gw_data_qpn;
516 ib_qpn_t ld_gw_ctl_qpn;
517 uint16_t ld_vnic_id; /* includes set msbit */
518 uint16_t ld_vhub_mtu;
519 uint16_t ld_vhub_pkey;
520 uint16_t ld_assigned_vlan;
521 uint8_t ld_gw_sl;
522 uint8_t ld_n_rss_mcgid;
523 uint8_t ld_n_mac_mcgid;
524 uint8_t ld_vnic_name[FIP_VNIC_NAME_LEN];
525 uint8_t ld_assigned_mac[ETHERADDRL];
526 uint8_t ld_gw_mgid_prefix[FIP_MGID_PREFIX_LEN];
527 uint8_t ld_vlan_in_packets;
528 uint32_t ld_vhub_id;
529 } eib_login_data_t;
530
/*
 * True when the least significant bit of the first octet of the address
 * is clear.
 */
#define	EIB_UNICAST_MAC(mac)		(((mac)[0] & 0x01) == 0)

533 /*
534 * Map to translate between DMAC and {qpn, lid, sl}
535 */
536 typedef struct eib_vhub_map_s {
537 struct eib_vhub_map_s *mp_next;
538 uint32_t mp_tusn;
539 ib_qpn_t mp_qpn;
540 ib_lid_t mp_lid;
541 uint8_t mp_mac[ETHERADDRL];
542 uint8_t mp_sl;
543 uint8_t mp_v_rss_type;
544 } eib_vhub_map_t;
545
546 /*
547 * Per-vNIC vHUB Table
548 */
549 #define EIB_TB_NBUCKETS 13
550 typedef struct eib_vhub_table_s {
551 kmutex_t tb_lock;
552 struct eib_vhub_map_s *tb_gateway;
553 struct eib_vhub_map_s *tb_unicast_miss;
554 struct eib_vhub_map_s *tb_vhub_multicast;
555 struct eib_vhub_map_s *tb_vnic_entry[EIB_TB_NBUCKETS];
556 struct eib_vhub_map_s *tb_mcast_entry[EIB_TB_NBUCKETS];
557
558 uint32_t tb_tusn;
559 uint8_t tb_eport_state;
560
561 uint16_t tb_entries_seen;
562 uint16_t tb_entries_in_table;
563 uint32_t tb_checksum;
564 } eib_vhub_table_t;
565
566 typedef struct eib_vhub_update_s {
567 kmutex_t up_lock;
568 eib_vhub_map_t *up_vnic_entry;
569 uint32_t up_tusn;
570 uint8_t up_eport_state;
571 } eib_vhub_update_t;
572
573 typedef struct eib_ether_hdr_s {
574 int eh_tagless;
575 uint16_t eh_ether_type;
576 uint16_t eh_vlan;
577 uint8_t eh_dmac[ETHERADDRL];
578 uint8_t eh_smac[ETHERADDRL];
579 } eib_ether_hdr_t;
580
581 /*
582 * vNIC Information
583 */
584 typedef struct eib_vnic_s {
585 struct eib_s *vn_ss;
586 eib_chan_t *vn_ctl_chan;
587 eib_chan_t *vn_data_chan;
588 int vn_instance;
589 uint16_t vn_vlan;
590 uint16_t vn_id;
591 uint8_t vn_macaddr[ETHERADDRL];
592 struct eib_login_data_s vn_login_data;
593
594 kmutex_t vn_lock;
595 kcondvar_t vn_cv;
596 uint_t vn_state;
597 struct eib_vhub_table_s *vn_vhub_table;
598 struct eib_vhub_update_s *vn_vhub_update;
599
600 ddi_softint_handle_t vn_ctl_si_hdl;
601 ddi_softint_handle_t vn_data_tx_si_hdl;
602 ddi_softint_handle_t vn_data_rx_si_hdl;
603 } eib_vnic_t;
604
605
606 /*
607 * Base NIC's mac state flags. The lock protects the starting/stopping
608 * bits. Access to the rest of the mac state is protected by these
609 * two bits.
610 */
611 #define EIB_NIC_STARTING 0x01
612 #define EIB_NIC_STOPPING 0x02
613 #define EIB_NIC_STARTED 0x80
614 #define EIB_NIC_RESTARTING (EIB_NIC_STARTING | EIB_NIC_STOPPING)
615
616 typedef struct eib_node_state_s {
617 kmutex_t ns_lock;
618 kcondvar_t ns_cv;
619 uint_t ns_nic_state;
620 link_state_t ns_link_state;
621 } eib_node_state_t;
622
/*
 * MIB-II statistics to report to the mac layer
 */
typedef struct eib_stats_s {
	uint64_t		st_obytes;	/* bytes sent out */
	uint64_t		st_opkts;	/* pkts sent out */
	uint64_t		st_brdcstxmit;	/* broadcast pkts transmitted */
	uint64_t		st_multixmit;	/* multicast pkts transmitted */
	uint64_t		st_oerrors;	/* transmit errors */
	uint64_t		st_noxmitbuf;	/* transmit pkts discarded */

	uint64_t		st_rbytes;	/* bytes received */
	uint64_t		st_ipkts;	/* pkts received */
	uint64_t		st_brdcstrcv;	/* broadcast pkts received */
	uint64_t		st_multircv;	/* multicast pkts received */
	uint64_t		st_ierrors;	/* receive errors */
	uint64_t		st_norcvbuf;	/* receive pkts discarded */
} eib_stats_t;

/*
 * Counter updates go through the 64-bit atomic add/inc/dec routines.
 */
#define	EIB_UPDATE_COUNTER(addr, val)	(atomic_add_64((addr), (val)))
#define	EIB_INCR_COUNTER(addr)		(atomic_inc_64((addr)))
#define	EIB_DECR_COUNTER(addr)		(atomic_dec_64((addr)))

646 /*
647 * Cache of address vectors with dlid as the key. Currently we use
648 * eib state structure's ei_lock to protect the individual address
649 * vector's fields. This is a lock granularity that's slightly
650 * bigger than ideal, but it should do for now.
651 */
652 #define EIB_AV_NBUCKETS 17
653 typedef struct eib_avect_s {
654 struct eib_avect_s *av_next;
655 ibt_adds_vect_t av_vect;
656 uint_t av_ref;
657 } eib_avect_t;
658
/*
 * vNIC creation and deletion are serialized by a non-zero value
 * to the ei_vnic_state member (i.e. only one vnic may be created
 * or deleted at a time).  The code makes sure to access/update
 * the ei_active_vnics member only after a successful setting of
 * ei_vnic_state.
 *
 * EIB_VN_BEING_MODIFIED is the combination of the other two bits.
 */
#define	EIB_VN_BEING_CREATED		0x01
#define	EIB_VN_BEING_DELETED		0x02
#define	EIB_VN_BEING_MODIFIED		\
	(EIB_VN_BEING_CREATED | EIB_VN_BEING_DELETED)

/*
 * All possible EoIB event work items that need to be handled
 */
#define	EIB_EV_NONE			0
#define	EIB_EV_PORT_DOWN		1
#define	EIB_EV_PORT_UP			2
#define	EIB_EV_PKEY_CHANGE		3
#define	EIB_EV_SGID_CHANGE		4
#define	EIB_EV_CLNT_REREG		5
#define	EIB_EV_GW_EPORT_DOWN		6
#define	EIB_EV_GW_DOWN			7
#define	EIB_EV_GW_UP			8
#define	EIB_EV_GW_INFO_UPDATE		9
#define	EIB_EV_MCG_DELETED		10
#define	EIB_EV_MCG_CREATED		11
#define	EIB_EV_SHUTDOWN			12

687 typedef struct eib_event_s {
688 struct eib_event_s *ev_next;
689 uint_t ev_code;
690 void *ev_arg;
691 } eib_event_t;
692
693 /*
694 * Work element for new vnic creation
695 */
696 typedef struct eib_vnic_req_s {
697 struct eib_vnic_req_s *vr_next;
698 uint_t vr_req;
699 uint8_t vr_mac[ETHERADDRL];
700 uint16_t vr_vlan;
701 } eib_vnic_req_t;
702
703 /*
704 * Values for vr_req
705 */
706 #define EIB_CR_REQ_NEW_VNIC 1
707 #define EIB_CR_REQ_FLUSH 2
708 #define EIB_CR_REQ_DIE 3
709
/*
 * Work element for vnics kept alive by the keepalive manager thread
 * and bitfield values for ei_ka_vnics_event.
 */
typedef struct eib_ka_vnics_s {
	struct eib_ka_vnics_s	*ka_next;
	struct eib_vnic_s	*ka_vnic;
} eib_ka_vnics_t;

#define	EIB_KA_VNICS_DIE		0x1
#define	EIB_KA_VNICS_TIMED_OUT		0x2

722 /*
723 * EoIB per-instance state
724 */
725 typedef struct eib_s {
726 ibt_clnt_hdl_t ei_ibt_hdl;
727 ibt_hca_hdl_t ei_hca_hdl;
728 ibt_pd_hdl_t ei_pd_hdl;
729 mac_handle_t ei_mac_hdl;
730
731 ddi_softint_handle_t ei_admin_si_hdl;
732 ddi_callback_id_t ei_login_ack_cb;
733 ddi_callback_id_t ei_gw_alive_cb;
734 ddi_callback_id_t ei_gw_info_cb;
735
736 ibt_hca_attr_t *ei_hca_attrs;
737 dev_info_t *ei_dip;
738 uint_t ei_instance;
739
740 struct eib_gw_props_s *ei_gw_props;
741 struct eib_props_s *ei_props;
742 struct eib_caps_s *ei_caps;
743 struct eib_stats_s *ei_stats;
744
745 struct eib_node_state_s *ei_node_state;
746 struct eib_chan_s *ei_admin_chan;
747
748 struct eib_wqe_pool_s *ei_tx;
749 struct eib_wqe_pool_s *ei_rx;
750 struct eib_lsobkt_s *ei_lso;
751
752 kmutex_t ei_vnic_lock;
753 kcondvar_t ei_vnic_cv;
754 uint_t ei_vnic_state;
755 uint64_t ei_active_vnics;
756 uint64_t ei_zombie_vnics;
757 uint64_t ei_rejoin_vnics;
758 struct eib_vnic_s *ei_vnic[EIB_MAX_VNICS];
759 struct eib_vnic_s *ei_vnic_pending;
760 int64_t ei_gw_last_heartbeat;
761 boolean_t ei_gw_unreachable;
762 uint8_t ei_gw_eport_state;
763
764 kmutex_t ei_av_lock;
765 struct eib_avect_s *ei_av[EIB_AV_NBUCKETS];
766
767 kmutex_t ei_ev_lock;
768 kcondvar_t ei_ev_cv;
769 struct eib_event_s *ei_event;
770
771 kmutex_t ei_rxpost_lock;
772 kcondvar_t ei_rxpost_cv;
773 uint_t ei_rxpost_die;
774 struct eib_chan_s *ei_rxpost;
775
776 kmutex_t ei_vnic_req_lock;
777 kcondvar_t ei_vnic_req_cv;
778 struct eib_vnic_req_s *ei_vnic_req;
779 struct eib_vnic_req_s *ei_failed_vnic_req;
780 struct eib_vnic_req_s *ei_pending_vnic_req;
781
782 kmutex_t ei_ka_vnics_lock;
783 kcondvar_t ei_ka_vnics_cv;
784 uint_t ei_ka_vnics_event;
785 struct eib_ka_vnics_s *ei_ka_vnics;
786
787 kt_did_t ei_txwqe_monitor;
788 kt_did_t ei_lsobufs_monitor;
789 kt_did_t ei_rwqes_refiller;
790 kt_did_t ei_vnic_creator;
791 kt_did_t ei_events_handler;
792 kt_did_t ei_keepalives_manager;
793 } eib_t;
794
/*
 * Private read-only datalink properties
 */
#define	EIB_DLPROP_GW_EPORT_STATE	"_eib_eport_state"
#define	EIB_DLPROP_HCA_GUID		"_eib_hca_guid"
#define	EIB_DLPROP_PORT_GUID		"_eib_port_guid"

802 /*
803 * FUNCTION PROTOTYPES FOR CROSS-FILE LINKAGE
804 */
805
806 /*
807 * FIP protocol related
808 */
809 extern int eib_fip_login(eib_t *, eib_vnic_t *, int *);
810 extern int eib_fip_heartbeat(eib_t *, eib_vnic_t *, int *);
811 extern int eib_fip_vhub_table(eib_t *, eib_vnic_t *, int *);
812 extern int eib_fip_logout(eib_t *, eib_vnic_t *, int *);
813 extern int eib_fip_parse_login_ack(eib_t *, uint8_t *, eib_login_data_t *);
814 extern int eib_fip_parse_ctl_pkt(uint8_t *, eib_vnic_t *);
815
816 /*
817 * Service threads and other handlers
818 */
819 extern void eib_events_handler(eib_t *);
820 extern void eib_svc_enqueue_event(eib_t *, eib_event_t *);
821 extern void eib_refill_rwqes(eib_t *);
822 extern void eib_vnic_creator(eib_t *);
823 extern void eib_monitor_tx_wqes(eib_t *);
824 extern void eib_monitor_lso_bufs(eib_t *);
825 extern void eib_manage_keepalives(eib_t *);
826 extern void eib_stop_events_handler(eib_t *);
827 extern void eib_stop_refill_rwqes(eib_t *);
828 extern void eib_stop_vnic_creator(eib_t *);
829 extern void eib_stop_monitor_tx_wqes(eib_t *);
830 extern int eib_stop_monitor_lso_bufs(eib_t *, boolean_t);
831 extern void eib_stop_manage_keepalives(eib_t *);
832 extern void eib_flush_vnic_reqs(eib_t *);
833 extern void eib_gw_info_cb(dev_info_t *, ddi_eventcookie_t, void *, void *);
834 extern void eib_gw_alive_cb(dev_info_t *, ddi_eventcookie_t, void *, void *);
835 extern void eib_login_ack_cb(dev_info_t *, ddi_eventcookie_t, void *, void *);
836
837 /*
838 * Admin QP related
839 */
840 extern int eib_adm_setup_qp(eib_t *, int *);
841 extern uint_t eib_adm_comp_handler(caddr_t, caddr_t);
842 extern void eib_rb_adm_setup_qp(eib_t *);
843
844 /*
845 * Control QP related
846 */
847 extern int eib_ctl_create_qp(eib_t *, eib_vnic_t *, int *);
848 extern uint_t eib_ctl_comp_handler(caddr_t, caddr_t);
849 extern void eib_rb_ctl_create_qp(eib_t *, eib_vnic_t *);
850
851 /*
852 * Data QP related
853 */
854 extern int eib_data_create_qp(eib_t *, eib_vnic_t *, int *);
855 extern uint_t eib_data_rx_comp_handler(caddr_t, caddr_t);
856 extern uint_t eib_data_tx_comp_handler(caddr_t, caddr_t);
857 extern void eib_data_rx_recycle(caddr_t);
858 extern void eib_data_post_tx(eib_vnic_t *, eib_wqe_t *);
859 extern void eib_data_parse_ether_hdr(mblk_t *, eib_ether_hdr_t *);
860 extern int eib_data_lookup_vnic(eib_t *, uint8_t *, uint16_t, eib_vnic_t **,
861 boolean_t *);
862 extern int eib_data_prepare_frame(eib_vnic_t *, eib_wqe_t *, mblk_t *,
863 eib_ether_hdr_t *);
864 extern void eib_rb_data_create_qp(eib_t *, eib_vnic_t *);
865
866 /*
867 * Resource related
868 */
869 extern int eib_rsrc_setup_bufs(eib_t *, int *);
870 extern int eib_rsrc_grab_swqes(eib_t *, eib_wqe_t **, uint_t, uint_t *, int);
871 extern int eib_rsrc_grab_rwqes(eib_t *, eib_wqe_t **, uint_t, uint_t *, int);
872 extern int eib_rsrc_grab_lsobufs(eib_t *, uint_t, ibt_wr_ds_t *, uint32_t *);
873 extern eib_wqe_t *eib_rsrc_grab_swqe(eib_t *, int);
874 extern eib_wqe_t *eib_rsrc_grab_rwqe(eib_t *, int);
875 extern void eib_rsrc_return_swqe(eib_t *, eib_wqe_t *, eib_chan_t *);
876 extern void eib_rsrc_return_rwqe(eib_t *, eib_wqe_t *, eib_chan_t *);
877 extern void eib_rsrc_return_lsobufs(eib_t *, ibt_wr_ds_t *, uint32_t);
878 extern void eib_rsrc_decr_posted_swqe(eib_t *, eib_chan_t *);
879 extern void eib_rsrc_decr_posted_rwqe(eib_t *, eib_chan_t *);
880 extern void eib_rsrc_txwqes_needed(eib_t *);
881 extern void eib_rsrc_lsobufs_needed(eib_t *);
882 extern boolean_t eib_rsrc_rxpool_low(eib_wqe_t *);
883 extern void eib_rb_rsrc_setup_bufs(eib_t *, boolean_t);
884
885 /*
886 * IBT related
887 */
888 extern int eib_ibt_hca_init(eib_t *);
889 extern void eib_ibt_link_mod(eib_t *);
890 extern int eib_ibt_modify_chan_pkey(eib_t *, eib_chan_t *, ib_pkey_t);
891 extern eib_avect_t *eib_ibt_hold_avect(eib_t *, ib_lid_t, uint8_t);
892 extern void eib_ibt_release_avect(eib_t *, eib_avect_t *);
893 extern void eib_ibt_free_avects(eib_t *);
894 extern void eib_ibt_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
895 ibt_async_event_t *);
896 extern void eib_ibt_record_capab(eib_t *, ibt_hca_attr_t *, eib_caps_t *);
897 extern void eib_rb_ibt_hca_init(eib_t *, uint_t);
898
899 /*
900 * Chan related
901 */
902 extern eib_chan_t *eib_chan_init(void);
903 extern void eib_chan_fini(eib_chan_t *);
904 extern int eib_chan_post_rx(eib_t *, eib_chan_t *, uint_t *);
905 extern int eib_chan_post_recv(eib_t *, eib_chan_t *, eib_wqe_t *);
906
907 /*
908 * Mac layer related
909 */
910 extern void eib_mac_set_nic_state(eib_t *, uint_t);
911 extern void eib_mac_clr_nic_state(eib_t *, uint_t);
912 extern void eib_mac_upd_nic_state(eib_t *, uint_t, uint_t);
913 extern uint_t eib_mac_get_nic_state(eib_t *);
914 extern void eib_mac_link_state(eib_t *, link_state_t, boolean_t);
915 extern void eib_mac_link_down(eib_t *, boolean_t);
916 extern void eib_mac_link_up(eib_t *, boolean_t);
917 extern int eib_mac_start(eib_t *);
918 extern void eib_mac_stop(eib_t *);
919 extern int eib_mac_multicast(eib_t *, boolean_t, uint8_t *);
920 extern int eib_mac_promisc(eib_t *, boolean_t);
921 extern int eib_mac_tx(eib_t *, mblk_t *);
922 extern int eib_mac_hca_portstate(eib_t *, ib_lid_t *, int *);
923
924 /*
925 * VNIC related
926 */
927 extern int eib_vnic_create(eib_t *, uint8_t *, uint16_t, eib_vnic_t **, int *);
928 extern void eib_vnic_delete(eib_t *, eib_vnic_t *);
929 extern int eib_vnic_wait_for_login_ack(eib_t *, eib_vnic_t *, int *);
930 extern void eib_vnic_login_ack(eib_t *, eib_login_data_t *);
931 extern int eib_vnic_wait_for_table(eib_t *, eib_vnic_t *, int *);
932 extern void eib_vnic_vhub_table_done(eib_vnic_t *, uint_t);
933 extern int eib_vnic_join_data_mcg(eib_t *, eib_vnic_t *, uint8_t *,
934 boolean_t, int *);
935 extern int eib_vnic_setup_dest(eib_vnic_t *, eib_wqe_t *, uint8_t *, uint16_t);
936 extern void eib_vnic_leave_data_mcg(eib_t *, eib_vnic_t *, uint8_t *);
937 extern void eib_vnic_init_tables(eib_t *, eib_vnic_t *);
938 extern void eib_vnic_fini_tables(eib_t *, eib_vnic_t *, boolean_t);
939 extern eib_chan_t *eib_vnic_get_data_chan(eib_t *, int);
940 extern void eib_vnic_need_new(eib_t *, uint8_t *, uint16_t);
941 extern void eib_vnic_enqueue_req(eib_t *, eib_vnic_req_t *);
942 extern void eib_vnic_resurrect_zombies(eib_t *, uint8_t *);
943 extern void eib_vnic_restart(eib_t *, int, uint8_t *);
944 extern void eib_vnic_rejoin_mcgs(eib_t *);
945 extern void eib_rb_vnic_create(eib_t *, eib_vnic_t *, uint_t);
946
947 /*
948 * Logging and other stuff
949 */
950 extern void eib_debug_init(void);
951 extern void eib_debug_fini(void);
952 extern void eib_dprintf_crit(int, const char *fmt, ...);
953 extern void eib_dprintf_err(int, const char *fmt, ...);
954 extern void eib_dprintf_warn(int, const char *fmt, ...);
955 #ifdef EIB_DEBUG
956 extern void eib_dprintf_debug(int, const char *fmt, ...);
957 extern void eib_dprintf_args(int, const char *fmt, ...);
958 extern void eib_dprintf_pkt(int, uint8_t *, uint_t);
959 extern void eib_dprintf_verbose(int, const char *fmt, ...);
960 #endif
961 extern int eib_get_props(eib_t *);
962 extern void eib_update_props(eib_t *, eib_gw_info_t *);
963 extern void eib_rb_get_props(eib_t *);
964
965 /*
966 * EoIB specific global variables
967 */
968 extern ib_gid_t eib_reserved_gid;
969 extern uint8_t eib_zero_mac[];
970 extern uint8_t eib_broadcast_mac[];
971 extern int eib_setbit_mod67[];
972 extern char *eib_pvt_props[];
973
/*
 * HW/FW workarounds
 */
extern int eib_wa_no_desc_list_len;
extern int eib_wa_no_cksum_offload;
extern int eib_wa_no_lso;
extern int eib_wa_no_mcast_entries;
extern int eib_wa_no_av_discover;
extern int eib_wa_no_good_vp_flag;
extern int eib_wa_no_good_vhub_cksum;

985 /*
986 * Miscellaneous externs
987 */
988 extern void freemsgchain(mblk_t *);
989 extern pri_t minclsyspri;
990
991 #ifdef __cplusplus
992 }
993 #endif
994
995 #endif /* _SYS_IB_EOIB_EIB_IMPL_H */