/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * An implementation of the IPoIB standard based on PSARC 2001/289.
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/stropts.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <sys/dlpi.h>
#include <sys/mac_provider.h>

#include <sys/pattr.h>		/* for HCK_FULLCKSUM */
#include <sys/sysmacros.h>	/* for offsetof */
#include <sys/disp.h>		/* for async thread pri */
#include <sys/atomic.h>		/* for atomic_add*() */
#include <sys/ethernet.h>	/* for ETHERTYPE_IPV6 */
#include <netinet/in.h>		/* for netinet/ip.h below */
#include <netinet/ip.h>		/* for struct ip */
#include <netinet/udp.h>	/* for struct udphdr */
#include <inet/common.h>	/* for inet/ip.h below */
#include <inet/ip.h>		/* for ipha_t */
#include <inet/ip6.h>		/* for ip6_t */
#include <inet/tcp.h>		/* for tcph_t */
#include <netinet/icmp6.h>	/* for icmp6_t */
#include <sys/callb.h>
#include <sys/modhash.h>

#include <sys/ib/clients/ibd/ibd.h>
#include <sys/ib/mgt/sm_attr.h>		/* for SM_INIT_TYPE_* */
#include <sys/note.h>
#include <sys/multidata.h>

#include <sys/ib/mgt/ibmf/ibmf.h>	/* for ibd_get_portspeed */

#include <sys/priv_names.h>
#include <sys/dls.h>
#include <sys/dld_ioc.h>
#include <sys/policy.h>
#include <sys/ibpart.h>
#include <sys/file.h>

/*
 * The write-up below includes details on the following:
 * 1. The dladm administrative model.
 * 2. The late HCA initialization feature.
 * 3. Brussels support and its implications for the current architecture.
 *
 * 1. The dladm administrative model.
 * ----------------------------------
 * With the dladm model, ibnex creates one ibd instance per port. These
 * instances are created independent of the port state.
 *
 * The ibd driver is two-faceted: one side works as the port driver and the
 * other as the partition object driver.
 *
 * The port instance is a child of the HCA and has an entry in devfs.
 * A DDI attach only happens for the port driver, and its attach is
 * handled in ibd_port_attach(). Similarly, a DDI detach for the port driver
 * is handled in ibd_port_unattach().
 *
 * The partition object is only a registrant with the mac layer via
 * mac_register() and does not have an entry in the device tree. There is
 * no DDI softstate managed by the DDI framework for the partition objects;
 * instead, that state is managed inside the ibd driver, and every partition
 * object hangs off the "ibd_objlist_head".
 *
 * The partition object first comes into existence when a user runs the
 * 'create-part' subcommand of dladm. This is like invoking the attach entry
 * point of the partition object. The partition object goes away with the
 * 'delete-part' subcommand of dladm, which is like invoking its detach entry
 * point.
 *
 * The create-part and delete-part subcommands result in dld ioctls that end
 * up calling ibd_create_partition() and ibd_delete_partition() respectively.
 * These ioctls are registered with the dld layer in _init() via a call to
 * dld_ioc_register().
 *
 * The port instance by itself cannot be plumbed. Only the partition objects
 * can be plumbed, and they alone participate in I/O, not the port driver.
 *
 * There are some info ioctls supported in ibd which are used by dladm(1M) to
 * display useful information. The info entry point for ibd is
 * ibd_get_partition_info().
 *
 * 2. The late HCA initialization feature.
 * ---------------------------------------
 * As mentioned in section 1, the user creates the partition objects via
 * dladm(1M). It is possible that:
 *	a) The physical port itself is down and the SM cannot be reached.
 *	b) The PKEY specified by the user has not been created in the SM yet.
 *	c) An IPoIB broadcast group for the specified PKEY is not present.
 *
 * In all of the above cases, complete initialization of the partition object
 * is not possible. However, the new model allows the creation of partition
 * objects even in such cases and defers the initialization until later.
 * When such a partition object is plumbed, the link state is displayed as
 * "down".
 * The driver, at this point, is listening for the events that herald the
 * availability of resources -
 * i)   LINK_UP when the link becomes available
 * ii)  PORT_CHANGE when the PKEY has been created
 * iii) MCG_CREATED when the IPoIB broadcast group for the given pkey has
 *      been created
 * via ibd_async_handler() for events i) and ii), and via
 * ibd_snet_notices_handler() for iii).
 * The driver handles these events (as and when they arrive), completes the
 * initialization of the partition object and transitions it to a usable
 * state.
 *
 * 3. Brussels support and its implications for the current architecture.
 * ----------------------------------------------------------------------
 * The Brussels support introduces two new interfaces to the ibd driver -
 * ibd_m_getprop() and ibd_m_setprop().
 * These interfaces allow setting and retrieval of certain properties.
 * Some of them are public properties while most others are private
 * properties meant to be used by developers. Tuning the latter kind can
 * cause performance issues and should not be done without understanding the
 * implications. All properties are specific to an instance of either the
 * partition object or the port driver.
 *
 * The public properties are: mtu and linkmode.
 * mtu is a read-only property.
 * linkmode can take two values - UD and CM.
 *
 * Changing the linkmode requires some bookkeeping in the driver. The
 * capabilities need to be re-reported to the mac layer. This is done by
 * calling mac_capab_update(). The maxsdu is updated by calling
 * mac_maxsdu_update2().
 * The private properties retain their values across the change of linkmode.
 * NOTE:
 * - The port driver does not support any property apart from mtu.
 * - All other properties are only meant for the partition object.
 * - The properties cannot be set when an instance is plumbed. The
 *   instance has to be unplumbed to effect any setting.
 */
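/*
 * Illustrative sketch (not part of the driver): one way code could walk the
 * partition object list described in section 1 above. Every object created
 * by 'create-part' hangs off ibd_objlist_head (declared further below in
 * this file) under the protection of ibd_objlist_lock. The "id_next" link
 * field used here is an assumption made for this example only.
 */
#ifdef IBD_EXAMPLES
static uint_t
ibd_example_count_partitions(void)
{
	ibd_state_t *state;
	uint_t nparts = 0;

	mutex_enter(&ibd_objlist_lock);
	for (state = ibd_objlist_head; state != NULL; state = state->id_next)
		nparts++;
	mutex_exit(&ibd_objlist_lock);

	return (nparts);
}
#endif	/* IBD_EXAMPLES */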
/*
 * Driver wide tunables
 *
 * ibd_tx_softintr
 * ibd_rx_softintr
 *     The softintr mechanism allows ibd to avoid event queue overflows if
 *     the receive/completion handlers are expensive. These are enabled
 *     by default.
 *
 * ibd_log_sz
 *     This specifies the size of the ibd log buffer in bytes. The buffer is
 *     allocated and logging is enabled only when IBD_LOGGING is defined.
 *
 */
uint_t ibd_rx_softintr = 1;
uint_t ibd_tx_softintr = 1;

#ifdef IBD_LOGGING
uint_t ibd_log_sz = 0x20000;
#endif

#ifdef IBD_LOGGING
#define	IBD_LOG_SZ			ibd_log_sz
#endif

/* Post IBD_RX_POST_CNT receive work requests at a time. */
#define	IBD_RX_POST_CNT			8

/* Hash into 1 << IBD_LOG_RX_POST number of rx post queues */
#define	IBD_LOG_RX_POST			4

/* Minimum number of receive work requests the driver needs to always have */
#define	IBD_RWQE_MIN	((IBD_RX_POST_CNT << IBD_LOG_RX_POST) * 4)

/*
 * LSO parameters
 */
#define	IBD_LSO_MAXLEN			65536
#define	IBD_LSO_BUFSZ			8192

/*
 * Async operation states
 */
#define	IBD_OP_NOTSTARTED		0
#define	IBD_OP_ONGOING			1
#define	IBD_OP_COMPLETED		2
#define	IBD_OP_ERRORED			3
#define	IBD_OP_ROUTERED			4

/*
 * Start/stop in-progress flags; note that restart must always remain
 * the OR of start and stop flag values.
219 */ 220 #define IBD_DRV_START_IN_PROGRESS 0x10000000 221 #define IBD_DRV_STOP_IN_PROGRESS 0x20000000 222 #define IBD_DRV_RESTART_IN_PROGRESS 0x30000000 223 #define IBD_DRV_DELETE_IN_PROGRESS IBD_DRV_RESTART_IN_PROGRESS 224 225 /* 226 * Miscellaneous constants 227 */ 228 #define IB_MGID_IPV4_LOWGRP_MASK 0xFFFFFFFF 229 #define IBD_DEF_MAX_SDU 2044 230 #define IBD_DEF_MAX_MTU (IBD_DEF_MAX_SDU + IPOIB_HDRSIZE) 231 #define IBD_DEF_RC_MAX_SDU 65520 232 #define IBD_DEF_RC_MAX_MTU (IBD_DEF_RC_MAX_SDU + IPOIB_HDRSIZE) 233 #define IBD_DEFAULT_QKEY 0xB1B 234 #ifdef IBD_LOGGING 235 #define IBD_DMAX_LINE 100 236 #endif 237 238 /* 239 * Enumerations for link states 240 */ 241 typedef enum { 242 IBD_LINK_DOWN, 243 IBD_LINK_UP, 244 IBD_LINK_UP_ABSENT 245 } ibd_link_op_t; 246 247 /* 248 * Driver State Pointer 249 */ 250 void *ibd_list; 251 252 /* 253 * Driver Global Data 254 */ 255 ibd_global_state_t ibd_gstate; 256 257 /* 258 * Partition object list 259 */ 260 ibd_state_t *ibd_objlist_head = NULL; 261 kmutex_t ibd_objlist_lock; 262 263 int ibd_rc_conn_timeout = 60 * 10; /* 10 minutes */ 264 265 /* 266 * Logging 267 */ 268 #ifdef IBD_LOGGING 269 kmutex_t ibd_lbuf_lock; 270 uint8_t *ibd_lbuf; 271 uint32_t ibd_lbuf_ndx; 272 #endif 273 274 /* 275 * Required system entry points 276 */ 277 static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); 278 static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); 279 280 /* 281 * Required driver entry points for GLDv3 282 */ 283 static int ibd_m_stat(void *, uint_t, uint64_t *); 284 static int ibd_m_start(void *); 285 static void ibd_m_stop(void *); 286 static int ibd_m_promisc(void *, boolean_t); 287 static int ibd_m_multicst(void *, boolean_t, const uint8_t *); 288 static int ibd_m_unicst(void *, const uint8_t *); 289 static mblk_t *ibd_m_tx(void *, mblk_t *); 290 static boolean_t ibd_m_getcapab(void *, mac_capab_t, void *); 291 292 static int ibd_m_setprop(void *, const char *, mac_prop_id_t, uint_t, 293 const void *); 294 static int ibd_m_getprop(void *, const char *, mac_prop_id_t, uint_t, void *); 295 static void ibd_m_propinfo(void *, const char *, mac_prop_id_t, 296 mac_prop_info_handle_t); 297 static int ibd_set_priv_prop(ibd_state_t *, const char *, uint_t, 298 const void *); 299 static int ibd_get_priv_prop(ibd_state_t *, const char *, uint_t, void *); 300 301 /* 302 * Private driver entry points for GLDv3 303 */ 304 305 /* 306 * Initialization 307 */ 308 static int ibd_state_init(ibd_state_t *, dev_info_t *); 309 static int ibd_init_txlist(ibd_state_t *); 310 static int ibd_init_rxlist(ibd_state_t *); 311 static int ibd_acache_init(ibd_state_t *); 312 #ifdef IBD_LOGGING 313 static void ibd_log_init(void); 314 #endif 315 316 /* 317 * Termination/cleanup 318 */ 319 static void ibd_state_fini(ibd_state_t *); 320 static void ibd_fini_txlist(ibd_state_t *); 321 static void ibd_fini_rxlist(ibd_state_t *); 322 static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *); 323 static void ibd_tx_cleanup_list(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *); 324 static void ibd_acache_fini(ibd_state_t *); 325 #ifdef IBD_LOGGING 326 static void ibd_log_fini(void); 327 #endif 328 329 /* 330 * Allocation/acquire/map routines 331 */ 332 static int ibd_alloc_tx_copybufs(ibd_state_t *); 333 static int ibd_alloc_rx_copybufs(ibd_state_t *); 334 static int ibd_alloc_tx_lsobufs(ibd_state_t *); 335 static ibd_swqe_t *ibd_acquire_swqe(ibd_state_t *); 336 static int ibd_acquire_lsobufs(ibd_state_t *, uint_t, ibt_wr_ds_t *, 337 uint32_t *); 338 339 /* 340 * Free/release/unmap 
routines 341 */ 342 static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *); 343 static void ibd_free_tx_copybufs(ibd_state_t *); 344 static void ibd_free_rx_copybufs(ibd_state_t *); 345 static void ibd_free_rx_rsrcs(ibd_state_t *); 346 static void ibd_free_tx_lsobufs(ibd_state_t *); 347 static void ibd_release_swqe(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *, int); 348 static void ibd_release_lsobufs(ibd_state_t *, ibt_wr_ds_t *, uint32_t); 349 static void ibd_free_lsohdr(ibd_swqe_t *, mblk_t *); 350 351 /* 352 * Handlers/callback routines 353 */ 354 static uint_t ibd_intr(caddr_t); 355 static uint_t ibd_tx_recycle(caddr_t); 356 static void ibd_rcq_handler(ibt_cq_hdl_t, void *); 357 static void ibd_scq_handler(ibt_cq_hdl_t, void *); 358 static void ibd_poll_rcq(ibd_state_t *, ibt_cq_hdl_t); 359 static void ibd_poll_scq(ibd_state_t *, ibt_cq_hdl_t); 360 static void ibd_drain_rcq(ibd_state_t *, ibt_cq_hdl_t); 361 static void ibd_drain_scq(ibd_state_t *, ibt_cq_hdl_t); 362 static void ibd_freemsg_cb(char *); 363 static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t, 364 ibt_async_event_t *); 365 static void ibdpd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t, 366 ibt_async_event_t *); 367 static void ibd_snet_notices_handler(void *, ib_gid_t, 368 ibt_subnet_event_code_t, ibt_subnet_event_t *); 369 370 /* 371 * Send/receive routines 372 */ 373 static boolean_t ibd_send(ibd_state_t *, mblk_t *); 374 static void ibd_post_send(ibd_state_t *, ibd_swqe_t *); 375 static void ibd_post_recv(ibd_state_t *, ibd_rwqe_t *); 376 static mblk_t *ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *); 377 378 /* 379 * Threads 380 */ 381 static void ibd_async_work(ibd_state_t *); 382 383 /* 384 * Async tasks 385 */ 386 static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *); 387 static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int); 388 static void ibd_async_setprom(ibd_state_t *); 389 static void ibd_async_unsetprom(ibd_state_t *); 390 static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t); 391 static void ibd_async_trap(ibd_state_t *, ibd_req_t *); 392 static void ibd_async_txsched(ibd_state_t *); 393 static void ibd_async_link(ibd_state_t *, ibd_req_t *); 394 395 /* 396 * Async task helpers 397 */ 398 static ibd_mce_t *ibd_async_mcache(ibd_state_t *, ipoib_mac_t *, boolean_t *); 399 static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t); 400 static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *); 401 static boolean_t ibd_get_allroutergroup(ibd_state_t *, 402 ipoib_mac_t *, ipoib_mac_t *); 403 static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t); 404 static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *); 405 static ibt_status_t ibd_iba_join(ibd_state_t *, ib_gid_t, ibd_mce_t *); 406 static ibt_status_t ibd_find_bgroup(ibd_state_t *); 407 static void ibd_n2h_gid(ipoib_mac_t *, ib_gid_t *); 408 static void ibd_h2n_mac(ipoib_mac_t *, ib_qpn_t, ib_sn_prefix_t, ib_guid_t); 409 static uint64_t ibd_get_portspeed(ibd_state_t *); 410 static boolean_t ibd_async_safe(ibd_state_t *); 411 static void ibd_async_done(ibd_state_t *); 412 static ibd_ace_t *ibd_acache_lookup(ibd_state_t *, ipoib_mac_t *, int *, int); 413 static ibd_ace_t *ibd_acache_get_unref(ibd_state_t *); 414 static void ibd_link_mod(ibd_state_t *, ibt_async_code_t); 415 static int ibd_locate_pkey(ib_pkey_t *, uint16_t, ib_pkey_t, uint16_t *); 416 417 /* 418 * Helpers for attach/start routines 419 */ 420 static int ibd_register_mac(ibd_state_t *, dev_info_t *); 421 
static int ibd_record_capab(ibd_state_t *); 422 static int ibd_get_port_details(ibd_state_t *); 423 static int ibd_alloc_cqs(ibd_state_t *); 424 static int ibd_setup_ud_channel(ibd_state_t *); 425 static int ibd_start(ibd_state_t *); 426 static int ibd_undo_start(ibd_state_t *, link_state_t); 427 static void ibd_set_mac_progress(ibd_state_t *, uint_t); 428 static void ibd_clr_mac_progress(ibd_state_t *, uint_t); 429 static int ibd_part_attach(ibd_state_t *state, dev_info_t *dip); 430 static void ibd_part_unattach(ibd_state_t *state); 431 static int ibd_port_attach(dev_info_t *); 432 static int ibd_port_unattach(ibd_state_t *state, dev_info_t *dip); 433 static int ibd_get_port_state(ibd_state_t *, link_state_t *); 434 static int ibd_part_busy(ibd_state_t *); 435 436 /* 437 * Miscellaneous helpers 438 */ 439 static int ibd_sched_poll(ibd_state_t *, int, int); 440 static void ibd_resume_transmission(ibd_state_t *); 441 static int ibd_setup_lso(ibd_swqe_t *, mblk_t *, uint32_t, ibt_ud_dest_hdl_t); 442 static int ibd_prepare_sgl(ibd_state_t *, mblk_t *, ibd_swqe_t *, uint_t); 443 static void *list_get_head(list_t *); 444 static int ibd_hash_key_cmp(mod_hash_key_t, mod_hash_key_t); 445 static uint_t ibd_hash_by_id(void *, mod_hash_key_t); 446 447 ibt_status_t ibd_get_part_attr(datalink_id_t, ibt_part_attr_t *); 448 ibt_status_t ibd_get_all_part_attr(ibt_part_attr_t **, int *); 449 450 #ifdef IBD_LOGGING 451 static void ibd_log(const char *, ...); 452 #endif 453 454 DDI_DEFINE_STREAM_OPS(ibd_dev_ops, nulldev, nulldev, ibd_attach, ibd_detach, 455 nodev, NULL, D_MP, NULL, ddi_quiesce_not_needed); 456 457 /* Module Driver Info */ 458 static struct modldrv ibd_modldrv = { 459 &mod_driverops, /* This one is a driver */ 460 "InfiniBand GLDv3 Driver", /* short description */ 461 &ibd_dev_ops /* driver specific ops */ 462 }; 463 464 /* Module Linkage */ 465 static struct modlinkage ibd_modlinkage = { 466 MODREV_1, (void *)&ibd_modldrv, NULL 467 }; 468 469 /* 470 * Module (static) info passed to IBTL during ibt_attach 471 */ 472 static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = { 473 IBTI_V_CURR, 474 IBT_NETWORK, 475 ibd_async_handler, 476 NULL, 477 "IBPART" 478 }; 479 480 static struct ibt_clnt_modinfo_s ibdpd_clnt_modinfo = { 481 IBTI_V_CURR, 482 IBT_NETWORK, 483 ibdpd_async_handler, 484 NULL, 485 "IPIB" 486 }; 487 488 /* 489 * GLDv3 entry points 490 */ 491 #define IBD_M_CALLBACK_FLAGS \ 492 (MC_GETCAPAB | MC_SETPROP | MC_GETPROP | MC_PROPINFO) 493 494 static mac_callbacks_t ibd_m_callbacks = { 495 IBD_M_CALLBACK_FLAGS, 496 ibd_m_stat, 497 ibd_m_start, 498 ibd_m_stop, 499 ibd_m_promisc, 500 ibd_m_multicst, 501 ibd_m_unicst, 502 ibd_m_tx, 503 NULL, 504 NULL, 505 ibd_m_getcapab, 506 NULL, 507 NULL, 508 ibd_m_setprop, 509 ibd_m_getprop, 510 ibd_m_propinfo 511 }; 512 513 /* Private properties */ 514 char *ibd_priv_props[] = { 515 "_ibd_broadcast_group", 516 "_ibd_coalesce_completions", 517 "_ibd_create_broadcast_group", 518 "_ibd_hash_size", 519 "_ibd_lso_enable", 520 "_ibd_num_ah", 521 "_ibd_num_lso_bufs", 522 "_ibd_rc_enable_srq", 523 "_ibd_rc_num_rwqe", 524 "_ibd_rc_num_srq", 525 "_ibd_rc_num_swqe", 526 "_ibd_rc_rx_comp_count", 527 "_ibd_rc_rx_comp_usec", 528 "_ibd_rc_rx_copy_thresh", 529 "_ibd_rc_rx_rwqe_thresh", 530 "_ibd_rc_tx_comp_count", 531 "_ibd_rc_tx_comp_usec", 532 "_ibd_rc_tx_copy_thresh", 533 "_ibd_ud_num_rwqe", 534 "_ibd_ud_num_swqe", 535 "_ibd_ud_rx_comp_count", 536 "_ibd_ud_rx_comp_usec", 537 "_ibd_ud_tx_comp_count", 538 "_ibd_ud_tx_comp_usec", 539 "_ibd_ud_tx_copy_thresh", 540 NULL 541 
}; 542 543 static int ibd_create_partition(void *, intptr_t, int, cred_t *, int *); 544 static int ibd_delete_partition(void *, intptr_t, int, cred_t *, int *); 545 static int ibd_get_partition_info(void *, intptr_t, int, cred_t *, int *); 546 547 static dld_ioc_info_t ibd_dld_ioctl_list[] = { 548 {IBD_CREATE_IBPART, DLDCOPYINOUT, sizeof (ibpart_ioctl_t), 549 ibd_create_partition, secpolicy_dl_config}, 550 {IBD_DELETE_IBPART, DLDCOPYIN, sizeof (ibpart_ioctl_t), 551 ibd_delete_partition, secpolicy_dl_config}, 552 {IBD_INFO_IBPART, DLDCOPYIN, sizeof (ibd_ioctl_t), 553 ibd_get_partition_info, NULL} 554 }; 555 556 /* 557 * Fill/clear <scope> and <p_key> in multicast/broadcast address 558 */ 559 #define IBD_FILL_SCOPE_PKEY(maddr, scope, pkey) \ 560 { \ 561 *(uint32_t *)((char *)(maddr) + 4) |= \ 562 htonl((uint32_t)(scope) << 16); \ 563 *(uint32_t *)((char *)(maddr) + 8) |= \ 564 htonl((uint32_t)(pkey) << 16); \ 565 } 566 567 #define IBD_CLEAR_SCOPE_PKEY(maddr) \ 568 { \ 569 *(uint32_t *)((char *)(maddr) + 4) &= \ 570 htonl(~((uint32_t)0xF << 16)); \ 571 *(uint32_t *)((char *)(maddr) + 8) &= \ 572 htonl(~((uint32_t)0xFFFF << 16)); \ 573 } 574 575 /* 576 * Rudimentary debugging support 577 */ 578 #ifdef DEBUG 579 int ibd_debuglevel = 100; 580 void 581 debug_print(int l, char *fmt, ...) 582 { 583 va_list ap; 584 585 if (l < ibd_debuglevel) 586 return; 587 va_start(ap, fmt); 588 vcmn_err(CE_CONT, fmt, ap); 589 va_end(ap); 590 } 591 #endif 592 593 /* 594 * Common routine to print warning messages; adds in hca guid, port number 595 * and pkey to be able to identify the IBA interface. 596 */ 597 void 598 ibd_print_warn(ibd_state_t *state, char *fmt, ...) 599 { 600 ib_guid_t hca_guid; 601 char ibd_print_buf[MAXNAMELEN + 256]; 602 int len; 603 va_list ap; 604 char part_name[MAXNAMELEN]; 605 datalink_id_t linkid = state->id_plinkid; 606 607 hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip, 608 0, "hca-guid", 0); 609 (void) dls_mgmt_get_linkinfo(linkid, part_name, NULL, NULL, NULL); 610 len = snprintf(ibd_print_buf, sizeof (ibd_print_buf), 611 "%s%d: HCA GUID %016llx port %d PKEY %02x link %s ", 612 ddi_driver_name(state->id_dip), ddi_get_instance(state->id_dip), 613 (u_longlong_t)hca_guid, state->id_port, state->id_pkey, 614 part_name); 615 va_start(ap, fmt); 616 (void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len, 617 fmt, ap); 618 cmn_err(CE_NOTE, "!%s", ibd_print_buf); 619 va_end(ap); 620 } 621 622 int 623 _init() 624 { 625 int status; 626 627 status = ddi_soft_state_init(&ibd_list, max(sizeof (ibd_state_t), 628 PAGESIZE), 0); 629 if (status != 0) { 630 DPRINT(10, "_init:failed in ddi_soft_state_init()"); 631 return (status); 632 } 633 634 mutex_init(&ibd_objlist_lock, NULL, MUTEX_DRIVER, NULL); 635 636 mac_init_ops(&ibd_dev_ops, "ibp"); 637 status = mod_install(&ibd_modlinkage); 638 if (status != 0) { 639 DPRINT(10, "_init:failed in mod_install()"); 640 ddi_soft_state_fini(&ibd_list); 641 mac_fini_ops(&ibd_dev_ops); 642 return (status); 643 } 644 645 mutex_init(&ibd_gstate.ig_mutex, NULL, MUTEX_DRIVER, NULL); 646 mutex_enter(&ibd_gstate.ig_mutex); 647 ibd_gstate.ig_ibt_hdl = NULL; 648 ibd_gstate.ig_ibt_hdl_ref_cnt = 0; 649 ibd_gstate.ig_service_list = NULL; 650 mutex_exit(&ibd_gstate.ig_mutex); 651 652 if (dld_ioc_register(IBPART_IOC, ibd_dld_ioctl_list, 653 DLDIOCCNT(ibd_dld_ioctl_list)) != 0) { 654 return (EIO); 655 } 656 657 ibt_register_part_attr_cb(ibd_get_part_attr, ibd_get_all_part_attr); 658 659 #ifdef IBD_LOGGING 660 ibd_log_init(); 661 #endif 662 return (0); 663 } 
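/*
 * Illustrative sketch (not part of the driver): how the
 * IBD_FILL_SCOPE_PKEY/IBD_CLEAR_SCOPE_PKEY macros defined above patch the
 * scope and P_Key fields of an IPoIB multicast/broadcast address in place.
 * The caller-supplied template address, scope and pkey arguments are
 * assumptions made for this example only.
 */
#ifdef IBD_EXAMPLES
static void
ibd_example_stamp_scope_pkey(ipoib_mac_t *mcast, uint8_t scope, ib_pkey_t pkey)
{
	/* Clear any stale scope and P_Key bits in the MGID portion ... */
	IBD_CLEAR_SCOPE_PKEY(mcast);

	/* ... then stamp in the desired scope and partition key. */
	IBD_FILL_SCOPE_PKEY(mcast, scope, pkey);
}
#endif	/* IBD_EXAMPLES */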
664 665 int 666 _info(struct modinfo *modinfop) 667 { 668 return (mod_info(&ibd_modlinkage, modinfop)); 669 } 670 671 int 672 _fini() 673 { 674 int status; 675 676 status = mod_remove(&ibd_modlinkage); 677 if (status != 0) 678 return (status); 679 680 ibt_unregister_part_attr_cb(); 681 682 mac_fini_ops(&ibd_dev_ops); 683 mutex_destroy(&ibd_objlist_lock); 684 ddi_soft_state_fini(&ibd_list); 685 mutex_destroy(&ibd_gstate.ig_mutex); 686 #ifdef IBD_LOGGING 687 ibd_log_fini(); 688 #endif 689 return (0); 690 } 691 692 /* 693 * Convert the GID part of the mac address from network byte order 694 * to host order. 695 */ 696 static void 697 ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid) 698 { 699 ib_sn_prefix_t nbopref; 700 ib_guid_t nboguid; 701 702 bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t)); 703 bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t)); 704 dgid->gid_prefix = b2h64(nbopref); 705 dgid->gid_guid = b2h64(nboguid); 706 } 707 708 /* 709 * Create the IPoIB address in network byte order from host order inputs. 710 */ 711 static void 712 ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix, 713 ib_guid_t guid) 714 { 715 ib_sn_prefix_t nbopref; 716 ib_guid_t nboguid; 717 718 mac->ipoib_qpn = htonl(qpn); 719 nbopref = h2b64(prefix); 720 nboguid = h2b64(guid); 721 bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t)); 722 bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t)); 723 } 724 725 /* 726 * Send to the appropriate all-routers group when the IBA multicast group 727 * does not exist, based on whether the target group is v4 or v6. 728 */ 729 static boolean_t 730 ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac, 731 ipoib_mac_t *rmac) 732 { 733 boolean_t retval = B_TRUE; 734 uint32_t adjscope = state->id_scope << 16; 735 uint32_t topword; 736 737 /* 738 * Copy the first 4 bytes in without assuming any alignment of 739 * input mac address; this will have IPoIB signature, flags and 740 * scope bits. 741 */ 742 bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t)); 743 topword = ntohl(topword); 744 745 /* 746 * Generate proper address for IPv4/v6, adding in the Pkey properly. 747 */ 748 if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) || 749 (topword == (IB_MCGID_IPV6_PREFIX | adjscope))) 750 ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) | 751 ((uint32_t)(state->id_pkey << 16))), 752 (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP)); 753 else 754 /* 755 * Does not have proper bits in the mgid address. 756 */ 757 retval = B_FALSE; 758 759 return (retval); 760 } 761 762 /* 763 * Membership states for different mcg's are tracked by two lists: 764 * the "non" list is used for promiscuous mode, when all mcg traffic 765 * needs to be inspected. This type of membership is never used for 766 * transmission, so there can not be an AH in the active list 767 * corresponding to a member in this list. This list does not need 768 * any protection, since all operations are performed by the async 769 * thread. 770 * 771 * "Full" and "SendOnly" membership is tracked using a single list, 772 * the "full" list. This is because this single list can then be 773 * searched during transmit to a multicast group (if an AH for the 774 * mcg is not found in the active list), since at least one type 775 * of membership must be present before initiating the transmit. 776 * This list is also emptied during driver detach, since sendonly 777 * membership acquired during transmit is dropped at detach time 778 * along with ipv4 broadcast full membership. 
Insert/deletes to 779 * this list are done only by the async thread, but it is also 780 * searched in program context (see multicast disable case), thus 781 * the id_mc_mutex protects the list. The driver detach path also 782 * deconstructs the "full" list, but it ensures that the async 783 * thread will not be accessing the list (by blocking out mcg 784 * trap handling and making sure no more Tx reaping will happen). 785 * 786 * Currently, an IBA attach is done in the SendOnly case too, 787 * although this is not required. 788 */ 789 #define IBD_MCACHE_INSERT_FULL(state, mce) \ 790 list_insert_head(&state->id_mc_full, mce) 791 #define IBD_MCACHE_INSERT_NON(state, mce) \ 792 list_insert_head(&state->id_mc_non, mce) 793 #define IBD_MCACHE_FIND_FULL(state, mgid) \ 794 ibd_mcache_find(mgid, &state->id_mc_full) 795 #define IBD_MCACHE_FIND_NON(state, mgid) \ 796 ibd_mcache_find(mgid, &state->id_mc_non) 797 #define IBD_MCACHE_PULLOUT_FULL(state, mce) \ 798 list_remove(&state->id_mc_full, mce) 799 #define IBD_MCACHE_PULLOUT_NON(state, mce) \ 800 list_remove(&state->id_mc_non, mce) 801 802 static void * 803 list_get_head(list_t *list) 804 { 805 list_node_t *lhead = list_head(list); 806 807 if (lhead != NULL) 808 list_remove(list, lhead); 809 return (lhead); 810 } 811 812 /* 813 * This is always guaranteed to be able to queue the work. 814 */ 815 void 816 ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op) 817 { 818 /* Initialize request */ 819 DPRINT(1, "ibd_queue_work_slot : op: %d \n", op); 820 ptr->rq_op = op; 821 822 /* 823 * Queue provided slot onto request pool. 824 */ 825 mutex_enter(&state->id_acache_req_lock); 826 list_insert_tail(&state->id_req_list, ptr); 827 828 /* Go, fetch, async thread */ 829 cv_signal(&state->id_acache_req_cv); 830 mutex_exit(&state->id_acache_req_lock); 831 } 832 833 /* 834 * Main body of the per interface async thread. 835 */ 836 static void 837 ibd_async_work(ibd_state_t *state) 838 { 839 ibd_req_t *ptr; 840 callb_cpr_t cprinfo; 841 842 mutex_enter(&state->id_acache_req_lock); 843 CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock, 844 callb_generic_cpr, "ibd_async_work"); 845 846 for (;;) { 847 ptr = list_get_head(&state->id_req_list); 848 if (ptr != NULL) { 849 mutex_exit(&state->id_acache_req_lock); 850 851 /* 852 * If we are in late hca initialization mode, do not 853 * process any other async request other than TRAP. TRAP 854 * is used for indicating creation of a broadcast group; 855 * in which case, we need to join/create the group. 856 */ 857 if ((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) && 858 (ptr->rq_op != IBD_ASYNC_TRAP)) { 859 goto free_req_and_continue; 860 } 861 862 /* 863 * Once we have done the operation, there is no 864 * guarantee the request slot is going to be valid, 865 * it might be freed up (as in IBD_ASYNC_LEAVE, REAP, 866 * TRAP). 867 * 868 * Perform the request. 869 */ 870 switch (ptr->rq_op) { 871 case IBD_ASYNC_GETAH: 872 ibd_async_acache(state, &ptr->rq_mac); 873 break; 874 case IBD_ASYNC_JOIN: 875 case IBD_ASYNC_LEAVE: 876 ibd_async_multicast(state, 877 ptr->rq_gid, ptr->rq_op); 878 break; 879 case IBD_ASYNC_PROMON: 880 ibd_async_setprom(state); 881 break; 882 case IBD_ASYNC_PROMOFF: 883 ibd_async_unsetprom(state); 884 break; 885 case IBD_ASYNC_REAP: 886 ibd_async_reap_group(state, 887 ptr->rq_ptr, ptr->rq_gid, 888 IB_MC_JSTATE_FULL); 889 /* 890 * the req buf contains in mce 891 * structure, so we do not need 892 * to free it here. 
893 */ 894 ptr = NULL; 895 break; 896 case IBD_ASYNC_TRAP: 897 ibd_async_trap(state, ptr); 898 break; 899 case IBD_ASYNC_SCHED: 900 ibd_async_txsched(state); 901 break; 902 case IBD_ASYNC_LINK: 903 ibd_async_link(state, ptr); 904 break; 905 case IBD_ASYNC_EXIT: 906 mutex_enter(&state->id_acache_req_lock); 907 CALLB_CPR_EXIT(&cprinfo); 908 return; 909 case IBD_ASYNC_RC_TOO_BIG: 910 ibd_async_rc_process_too_big(state, 911 ptr); 912 break; 913 case IBD_ASYNC_RC_CLOSE_ACT_CHAN: 914 ibd_async_rc_close_act_chan(state, ptr); 915 break; 916 case IBD_ASYNC_RC_RECYCLE_ACE: 917 ibd_async_rc_recycle_ace(state, ptr); 918 break; 919 case IBD_ASYNC_RC_CLOSE_PAS_CHAN: 920 (void) ibd_rc_pas_close(ptr->rq_ptr, 921 B_TRUE, B_TRUE); 922 break; 923 } 924 free_req_and_continue: 925 if (ptr != NULL) 926 kmem_cache_free(state->id_req_kmc, ptr); 927 928 mutex_enter(&state->id_acache_req_lock); 929 } else { 930 /* 931 * Nothing to do: wait till new request arrives. 932 */ 933 CALLB_CPR_SAFE_BEGIN(&cprinfo); 934 cv_wait(&state->id_acache_req_cv, 935 &state->id_acache_req_lock); 936 CALLB_CPR_SAFE_END(&cprinfo, 937 &state->id_acache_req_lock); 938 } 939 } 940 941 /*NOTREACHED*/ 942 _NOTE(NOT_REACHED) 943 } 944 945 /* 946 * Return when it is safe to queue requests to the async daemon; primarily 947 * for subnet trap and async event handling. Disallow requests before the 948 * daemon is created, and when interface deinitilization starts. 949 */ 950 static boolean_t 951 ibd_async_safe(ibd_state_t *state) 952 { 953 mutex_enter(&state->id_trap_lock); 954 if (state->id_trap_stop) { 955 mutex_exit(&state->id_trap_lock); 956 return (B_FALSE); 957 } 958 state->id_trap_inprog++; 959 mutex_exit(&state->id_trap_lock); 960 return (B_TRUE); 961 } 962 963 /* 964 * Wake up ibd_m_stop() if the unplumb code is waiting for pending subnet 965 * trap or event handling to complete to kill the async thread and deconstruct 966 * the mcg/ace list. 967 */ 968 static void 969 ibd_async_done(ibd_state_t *state) 970 { 971 mutex_enter(&state->id_trap_lock); 972 if (--state->id_trap_inprog == 0) 973 cv_signal(&state->id_trap_cv); 974 mutex_exit(&state->id_trap_lock); 975 } 976 977 /* 978 * Hash functions: 979 * ibd_hash_by_id: Returns the qpn as the hash entry into bucket. 980 * ibd_hash_key_cmp: Compares two keys, return 0 on success or else 1. 981 * These operate on mac addresses input into ibd_send, but there is no 982 * guarantee on the alignment of the ipoib_mac_t structure. 983 */ 984 /*ARGSUSED*/ 985 static uint_t 986 ibd_hash_by_id(void *hash_data, mod_hash_key_t key) 987 { 988 ulong_t ptraddr = (ulong_t)key; 989 uint_t hval; 990 991 /* 992 * If the input address is 4 byte aligned, we can just dereference 993 * it. This is most common, since IP will send in a 4 byte aligned 994 * IP header, which implies the 24 byte IPoIB psuedo header will be 995 * 4 byte aligned too. 996 */ 997 if ((ptraddr & 3) == 0) 998 return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn); 999 1000 bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t)); 1001 return (hval); 1002 } 1003 1004 static int 1005 ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2) 1006 { 1007 if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0) 1008 return (0); 1009 else 1010 return (1); 1011 } 1012 1013 /* 1014 * Initialize all the per interface caches and lists; AH cache, 1015 * MCG list etc. 
1016 */ 1017 static int 1018 ibd_acache_init(ibd_state_t *state) 1019 { 1020 ibd_ace_t *ce; 1021 int i; 1022 1023 mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL); 1024 mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL); 1025 mutex_enter(&state->id_ac_mutex); 1026 list_create(&state->id_ah_free, sizeof (ibd_ace_t), 1027 offsetof(ibd_ace_t, ac_list)); 1028 list_create(&state->id_ah_active, sizeof (ibd_ace_t), 1029 offsetof(ibd_ace_t, ac_list)); 1030 state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash", 1031 state->id_hash_size, mod_hash_null_keydtor, mod_hash_null_valdtor, 1032 ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP); 1033 list_create(&state->id_mc_full, sizeof (ibd_mce_t), 1034 offsetof(ibd_mce_t, mc_list)); 1035 list_create(&state->id_mc_non, sizeof (ibd_mce_t), 1036 offsetof(ibd_mce_t, mc_list)); 1037 state->id_ac_hot_ace = NULL; 1038 1039 state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) * 1040 state->id_num_ah, KM_SLEEP); 1041 for (i = 0; i < state->id_num_ah; i++, ce++) { 1042 if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS, 1043 state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) { 1044 mutex_exit(&state->id_ac_mutex); 1045 ibd_acache_fini(state); 1046 return (DDI_FAILURE); 1047 } else { 1048 CLEAR_REFCYCLE(ce); 1049 ce->ac_mce = NULL; 1050 mutex_init(&ce->tx_too_big_mutex, NULL, 1051 MUTEX_DRIVER, NULL); 1052 IBD_ACACHE_INSERT_FREE(state, ce); 1053 } 1054 } 1055 mutex_exit(&state->id_ac_mutex); 1056 return (DDI_SUCCESS); 1057 } 1058 1059 static void 1060 ibd_acache_fini(ibd_state_t *state) 1061 { 1062 ibd_ace_t *ptr; 1063 1064 mutex_enter(&state->id_ac_mutex); 1065 1066 while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) { 1067 ASSERT(GET_REF(ptr) == 0); 1068 mutex_destroy(&ptr->tx_too_big_mutex); 1069 (void) ibt_free_ud_dest(ptr->ac_dest); 1070 } 1071 1072 while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) { 1073 ASSERT(GET_REF(ptr) == 0); 1074 mutex_destroy(&ptr->tx_too_big_mutex); 1075 (void) ibt_free_ud_dest(ptr->ac_dest); 1076 } 1077 1078 list_destroy(&state->id_ah_free); 1079 list_destroy(&state->id_ah_active); 1080 list_destroy(&state->id_mc_full); 1081 list_destroy(&state->id_mc_non); 1082 kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * state->id_num_ah); 1083 mutex_exit(&state->id_ac_mutex); 1084 mutex_destroy(&state->id_ac_mutex); 1085 mutex_destroy(&state->id_mc_mutex); 1086 } 1087 1088 /* 1089 * Search AH active hash list for a cached path to input destination. 1090 * If we are "just looking", hold == F. When we are in the Tx path, 1091 * we set hold == T to grab a reference on the AH so that it can not 1092 * be recycled to a new destination while the Tx request is posted. 1093 */ 1094 ibd_ace_t * 1095 ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num) 1096 { 1097 ibd_ace_t *ptr; 1098 1099 ASSERT(mutex_owned(&state->id_ac_mutex)); 1100 1101 /* 1102 * Do hash search. 1103 */ 1104 if (mod_hash_find(state->id_ah_active_hash, 1105 (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) { 1106 if (hold) 1107 INC_REF(ptr, num); 1108 return (ptr); 1109 } 1110 return (NULL); 1111 } 1112 1113 /* 1114 * This is called by the tx side; if an initialized AH is found in 1115 * the active list, it is locked down and can be used; if no entry 1116 * is found, an async request is queued to do path resolution. 
1117 */ 1118 static ibd_ace_t * 1119 ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe) 1120 { 1121 ibd_ace_t *ptr; 1122 ibd_req_t *req; 1123 1124 /* 1125 * Only attempt to print when we can; in the mdt pattr case, the 1126 * address is not aligned properly. 1127 */ 1128 if (((ulong_t)mac & 3) == 0) { 1129 DPRINT(4, 1130 "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X", 1131 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1132 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), 1133 htonl(mac->ipoib_gidsuff[1])); 1134 } 1135 1136 mutex_enter(&state->id_ac_mutex); 1137 1138 if (((ptr = state->id_ac_hot_ace) != NULL) && 1139 (memcmp(&ptr->ac_mac, mac, sizeof (*mac)) == 0)) { 1140 INC_REF(ptr, numwqe); 1141 mutex_exit(&state->id_ac_mutex); 1142 return (ptr); 1143 } 1144 if (((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL)) { 1145 state->id_ac_hot_ace = ptr; 1146 mutex_exit(&state->id_ac_mutex); 1147 return (ptr); 1148 } 1149 1150 /* 1151 * Implementation of a single outstanding async request; if 1152 * the operation is not started yet, queue a request and move 1153 * to ongoing state. Remember in id_ah_addr for which address 1154 * we are queueing the request, in case we need to flag an error; 1155 * Any further requests, for the same or different address, until 1156 * the operation completes, is sent back to GLDv3 to be retried. 1157 * The async thread will update id_ah_op with an error indication 1158 * or will set it to indicate the next look up can start; either 1159 * way, it will mac_tx_update() so that all blocked requests come 1160 * back here. 1161 */ 1162 *err = EAGAIN; 1163 if (state->id_ah_op == IBD_OP_NOTSTARTED) { 1164 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 1165 if (req != NULL) { 1166 /* 1167 * We did not even find the entry; queue a request 1168 * for it. 1169 */ 1170 bcopy(mac, &(req->rq_mac), IPOIB_ADDRL); 1171 state->id_ah_op = IBD_OP_ONGOING; 1172 ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH); 1173 bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL); 1174 } 1175 } else if ((state->id_ah_op != IBD_OP_ONGOING) && 1176 (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) { 1177 /* 1178 * Check the status of the pathrecord lookup request 1179 * we had queued before. 1180 */ 1181 if (state->id_ah_op == IBD_OP_ERRORED) { 1182 *err = EFAULT; 1183 state->id_ah_error++; 1184 } else { 1185 /* 1186 * IBD_OP_ROUTERED case: We need to send to the 1187 * all-router MCG. If we can find the AH for 1188 * the mcg, the Tx will be attempted. If we 1189 * do not find the AH, we return NORESOURCES 1190 * to retry. 1191 */ 1192 ipoib_mac_t routermac; 1193 1194 (void) ibd_get_allroutergroup(state, mac, &routermac); 1195 ptr = ibd_acache_find(state, &routermac, B_TRUE, 1196 numwqe); 1197 } 1198 state->id_ah_op = IBD_OP_NOTSTARTED; 1199 } else if ((state->id_ah_op != IBD_OP_ONGOING) && 1200 (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) { 1201 /* 1202 * This case can happen when we get a higher band 1203 * packet. The easiest way is to reset the state machine 1204 * to accommodate the higher priority packet. 1205 */ 1206 state->id_ah_op = IBD_OP_NOTSTARTED; 1207 } 1208 mutex_exit(&state->id_ac_mutex); 1209 1210 return (ptr); 1211 } 1212 1213 /* 1214 * Grab a not-currently-in-use AH/PathRecord from the active 1215 * list to recycle to a new destination. Only the async thread 1216 * executes this code. 
1217 */ 1218 static ibd_ace_t * 1219 ibd_acache_get_unref(ibd_state_t *state) 1220 { 1221 ibd_ace_t *ptr = list_tail(&state->id_ah_active); 1222 boolean_t try_rc_chan_recycle = B_FALSE; 1223 1224 ASSERT(mutex_owned(&state->id_ac_mutex)); 1225 1226 /* 1227 * Do plain linear search. 1228 */ 1229 while (ptr != NULL) { 1230 /* 1231 * Note that it is possible that the "cycle" bit 1232 * is set on the AH w/o any reference count. The 1233 * mcg must have been deleted, and the tx cleanup 1234 * just decremented the reference count to 0, but 1235 * hasn't gotten around to grabbing the id_ac_mutex 1236 * to move the AH into the free list. 1237 */ 1238 if (GET_REF(ptr) == 0) { 1239 if (ptr->ac_chan != NULL) { 1240 ASSERT(state->id_enable_rc == B_TRUE); 1241 if (!try_rc_chan_recycle) { 1242 try_rc_chan_recycle = B_TRUE; 1243 ibd_rc_signal_ace_recycle(state, ptr); 1244 } 1245 } else { 1246 IBD_ACACHE_PULLOUT_ACTIVE(state, ptr); 1247 break; 1248 } 1249 } 1250 ptr = list_prev(&state->id_ah_active, ptr); 1251 } 1252 return (ptr); 1253 } 1254 1255 /* 1256 * Invoked to clean up AH from active list in case of multicast 1257 * disable and to handle sendonly memberships during mcg traps. 1258 * And for port up processing for multicast and unicast AHs. 1259 * Normally, the AH is taken off the active list, and put into 1260 * the free list to be recycled for a new destination. In case 1261 * Tx requests on the AH have not completed yet, the AH is marked 1262 * for reaping (which will put the AH on the free list) once the Tx's 1263 * complete; in this case, depending on the "force" input, we take 1264 * out the AH from the active list right now, or leave it also for 1265 * the reap operation. Returns TRUE if the AH is taken off the active 1266 * list (and either put into the free list right now, or arranged for 1267 * later), FALSE otherwise. 1268 */ 1269 boolean_t 1270 ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force) 1271 { 1272 ibd_ace_t *acactive; 1273 boolean_t ret = B_TRUE; 1274 1275 ASSERT(mutex_owned(&state->id_ac_mutex)); 1276 1277 if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) { 1278 1279 /* 1280 * Note that the AH might already have the cycle bit set 1281 * on it; this might happen if sequences of multicast 1282 * enables and disables are coming so fast, that posted 1283 * Tx's to the mcg have not completed yet, and the cycle 1284 * bit is set successively by each multicast disable. 1285 */ 1286 if (SET_CYCLE_IF_REF(acactive)) { 1287 if (!force) { 1288 /* 1289 * The ace is kept on the active list, further 1290 * Tx's can still grab a reference on it; the 1291 * ace is reaped when all pending Tx's 1292 * referencing the AH complete. 1293 */ 1294 ret = B_FALSE; 1295 } else { 1296 /* 1297 * In the mcg trap case, we always pull the 1298 * AH from the active list. And also the port 1299 * up multi/unicast case. 1300 */ 1301 ASSERT(acactive->ac_chan == NULL); 1302 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); 1303 acactive->ac_mce = NULL; 1304 } 1305 } else { 1306 /* 1307 * Determined the ref count is 0, thus reclaim 1308 * immediately after pulling out the ace from 1309 * the active list. 1310 */ 1311 ASSERT(acactive->ac_chan == NULL); 1312 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); 1313 acactive->ac_mce = NULL; 1314 IBD_ACACHE_INSERT_FREE(state, acactive); 1315 } 1316 1317 } 1318 return (ret); 1319 } 1320 1321 /* 1322 * Helper function for async path record lookup. 
If we are trying to 1323 * Tx to a MCG, check our membership, possibly trying to join the 1324 * group if required. If that fails, try to send the packet to the 1325 * all router group (indicated by the redirect output), pointing 1326 * the input mac address to the router mcg address. 1327 */ 1328 static ibd_mce_t * 1329 ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect) 1330 { 1331 ib_gid_t mgid; 1332 ibd_mce_t *mce; 1333 ipoib_mac_t routermac; 1334 1335 *redirect = B_FALSE; 1336 ibd_n2h_gid(mac, &mgid); 1337 1338 /* 1339 * Check the FullMember+SendOnlyNonMember list. 1340 * Since we are the only one who manipulates the 1341 * id_mc_full list, no locks are needed. 1342 */ 1343 mce = IBD_MCACHE_FIND_FULL(state, mgid); 1344 if (mce != NULL) { 1345 DPRINT(4, "ibd_async_mcache : already joined to group"); 1346 return (mce); 1347 } 1348 1349 /* 1350 * Not found; try to join(SendOnlyNonMember) and attach. 1351 */ 1352 DPRINT(4, "ibd_async_mcache : not joined to group"); 1353 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != 1354 NULL) { 1355 DPRINT(4, "ibd_async_mcache : nonmem joined to group"); 1356 return (mce); 1357 } 1358 1359 /* 1360 * MCGroup not present; try to join the all-router group. If 1361 * any of the following steps succeed, we will be redirecting 1362 * to the all router group. 1363 */ 1364 DPRINT(4, "ibd_async_mcache : nonmem join failed"); 1365 if (!ibd_get_allroutergroup(state, mac, &routermac)) 1366 return (NULL); 1367 *redirect = B_TRUE; 1368 ibd_n2h_gid(&routermac, &mgid); 1369 bcopy(&routermac, mac, IPOIB_ADDRL); 1370 DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n", 1371 mgid.gid_prefix, mgid.gid_guid); 1372 1373 /* 1374 * Are we already joined to the router group? 1375 */ 1376 if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) { 1377 DPRINT(4, "ibd_async_mcache : using already joined router" 1378 "group\n"); 1379 return (mce); 1380 } 1381 1382 /* 1383 * Can we join(SendOnlyNonMember) the router group? 1384 */ 1385 DPRINT(4, "ibd_async_mcache : attempting join to router grp"); 1386 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != 1387 NULL) { 1388 DPRINT(4, "ibd_async_mcache : joined to router grp"); 1389 return (mce); 1390 } 1391 1392 return (NULL); 1393 } 1394 1395 /* 1396 * Async path record lookup code. 1397 */ 1398 static void 1399 ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac) 1400 { 1401 ibd_ace_t *ce; 1402 ibd_mce_t *mce = NULL; 1403 ibt_path_attr_t path_attr; 1404 ibt_path_info_t path_info; 1405 ib_gid_t destgid; 1406 char ret = IBD_OP_NOTSTARTED; 1407 1408 DPRINT(4, "ibd_async_acache : %08X:%08X:%08X:%08X:%08X", 1409 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1410 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), 1411 htonl(mac->ipoib_gidsuff[1])); 1412 1413 /* 1414 * Check whether we are trying to transmit to a MCG. 1415 * In that case, we need to make sure we are a member of 1416 * the MCG. 1417 */ 1418 if (mac->ipoib_qpn == htonl(IB_MC_QPN)) { 1419 boolean_t redirected; 1420 1421 /* 1422 * If we can not find or join the group or even 1423 * redirect, error out. 1424 */ 1425 if ((mce = ibd_async_mcache(state, mac, &redirected)) == 1426 NULL) { 1427 state->id_ah_op = IBD_OP_ERRORED; 1428 return; 1429 } 1430 1431 /* 1432 * If we got redirected, we need to determine whether 1433 * the AH for the new mcg is in the cache already, and 1434 * not pull it in then; otherwise proceed to get the 1435 * path for the new mcg. 
There is no guarantee that 1436 * if the AH is currently in the cache, it will still be 1437 * there when we look in ibd_acache_lookup(), but that's 1438 * okay, we will come back here. 1439 */ 1440 if (redirected) { 1441 ret = IBD_OP_ROUTERED; 1442 DPRINT(4, "ibd_async_acache : redirected to " 1443 "%08X:%08X:%08X:%08X:%08X", 1444 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1445 htonl(mac->ipoib_gidpref[1]), 1446 htonl(mac->ipoib_gidsuff[0]), 1447 htonl(mac->ipoib_gidsuff[1])); 1448 1449 mutex_enter(&state->id_ac_mutex); 1450 if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) { 1451 state->id_ah_op = IBD_OP_ROUTERED; 1452 mutex_exit(&state->id_ac_mutex); 1453 DPRINT(4, "ibd_async_acache : router AH found"); 1454 return; 1455 } 1456 mutex_exit(&state->id_ac_mutex); 1457 } 1458 } 1459 1460 /* 1461 * Get an AH from the free list. 1462 */ 1463 mutex_enter(&state->id_ac_mutex); 1464 if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) { 1465 /* 1466 * No free ones; try to grab an unreferenced active 1467 * one. Maybe we need to make the active list LRU, 1468 * but that will create more work for Tx callbacks. 1469 * Is there a way of not having to pull out the 1470 * entry from the active list, but just indicate it 1471 * is being recycled? Yes, but that creates one more 1472 * check in the fast lookup path. 1473 */ 1474 if ((ce = ibd_acache_get_unref(state)) == NULL) { 1475 /* 1476 * Pretty serious shortage now. 1477 */ 1478 state->id_ah_op = IBD_OP_NOTSTARTED; 1479 mutex_exit(&state->id_ac_mutex); 1480 DPRINT(10, "ibd_async_acache : failed to find AH " 1481 "slot\n"); 1482 return; 1483 } 1484 /* 1485 * We could check whether ac_mce points to a SendOnly 1486 * member and drop that membership now. Or do it lazily 1487 * at detach time. 1488 */ 1489 ce->ac_mce = NULL; 1490 } 1491 mutex_exit(&state->id_ac_mutex); 1492 ASSERT(ce->ac_mce == NULL); 1493 1494 /* 1495 * Update the entry. 1496 */ 1497 bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL); 1498 1499 bzero(&path_info, sizeof (path_info)); 1500 bzero(&path_attr, sizeof (ibt_path_attr_t)); 1501 path_attr.pa_sgid = state->id_sgid; 1502 path_attr.pa_num_dgids = 1; 1503 ibd_n2h_gid(&ce->ac_mac, &destgid); 1504 path_attr.pa_dgids = &destgid; 1505 path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl; 1506 path_attr.pa_pkey = state->id_pkey; 1507 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_PKEY, &path_attr, 1, 1508 &path_info, NULL) != IBT_SUCCESS) { 1509 DPRINT(10, "ibd_async_acache : failed in ibt_get_paths"); 1510 goto error; 1511 } 1512 if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey, 1513 ntohl(ce->ac_mac.ipoib_qpn), 1514 &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) { 1515 DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest"); 1516 goto error; 1517 } 1518 1519 /* 1520 * mce is set whenever an AH is being associated with a 1521 * MCG; this will come in handy when we leave the MCG. The 1522 * lock protects Tx fastpath from scanning the active list. 
1523 */ 1524 if (mce != NULL) 1525 ce->ac_mce = mce; 1526 1527 /* 1528 * initiate a RC mode connection for unicast address 1529 */ 1530 if (state->id_enable_rc && (mac->ipoib_qpn != htonl(IB_MC_QPN)) && 1531 (htonl(mac->ipoib_qpn) & IBD_MAC_ADDR_RC)) { 1532 ASSERT(ce->ac_chan == NULL); 1533 DPRINT(10, "ibd_async_acache: call " 1534 "ibd_rc_try_connect(ace=%p)", ce); 1535 ibd_rc_try_connect(state, ce, &path_info); 1536 if (ce->ac_chan == NULL) { 1537 DPRINT(10, "ibd_async_acache: fail to setup RC" 1538 " channel"); 1539 state->rc_conn_fail++; 1540 goto error; 1541 } 1542 } 1543 1544 mutex_enter(&state->id_ac_mutex); 1545 IBD_ACACHE_INSERT_ACTIVE(state, ce); 1546 state->id_ah_op = ret; 1547 mutex_exit(&state->id_ac_mutex); 1548 return; 1549 error: 1550 /* 1551 * We might want to drop SendOnly membership here if we 1552 * joined above. The lock protects Tx callbacks inserting 1553 * into the free list. 1554 */ 1555 mutex_enter(&state->id_ac_mutex); 1556 state->id_ah_op = IBD_OP_ERRORED; 1557 IBD_ACACHE_INSERT_FREE(state, ce); 1558 mutex_exit(&state->id_ac_mutex); 1559 } 1560 1561 /* 1562 * While restoring port's presence on the subnet on a port up, it is possible 1563 * that the port goes down again. 1564 */ 1565 static void 1566 ibd_async_link(ibd_state_t *state, ibd_req_t *req) 1567 { 1568 ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr; 1569 link_state_t lstate = (opcode == IBD_LINK_DOWN) ? LINK_STATE_DOWN : 1570 LINK_STATE_UP; 1571 ibd_mce_t *mce, *pmce; 1572 ibd_ace_t *ace, *pace; 1573 1574 DPRINT(10, "ibd_async_link(): %d", opcode); 1575 1576 /* 1577 * On a link up, revalidate the link speed/width. No point doing 1578 * this on a link down, since we will be unable to do SA operations, 1579 * defaulting to the lowest speed. Also notice that we update our 1580 * notion of speed before calling mac_link_update(), which will do 1581 * necessary higher level notifications for speed changes. 1582 */ 1583 if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) { 1584 state->id_link_speed = ibd_get_portspeed(state); 1585 } 1586 1587 /* 1588 * Do all the work required to establish our presence on 1589 * the subnet. 1590 */ 1591 if (opcode == IBD_LINK_UP_ABSENT) { 1592 /* 1593 * If in promiscuous mode ... 1594 */ 1595 if (state->id_prom_op == IBD_OP_COMPLETED) { 1596 /* 1597 * Drop all nonmembership. 1598 */ 1599 ibd_async_unsetprom(state); 1600 1601 /* 1602 * Then, try to regain nonmembership to all mcg's. 1603 */ 1604 ibd_async_setprom(state); 1605 1606 } 1607 1608 /* 1609 * Drop all sendonly membership (which also gets rid of the 1610 * AHs); try to reacquire all full membership. 1611 */ 1612 mce = list_head(&state->id_mc_full); 1613 while ((pmce = mce) != NULL) { 1614 mce = list_next(&state->id_mc_full, mce); 1615 if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON) 1616 ibd_leave_group(state, 1617 pmce->mc_info.mc_adds_vect.av_dgid, 1618 IB_MC_JSTATE_SEND_ONLY_NON); 1619 else 1620 ibd_reacquire_group(state, pmce); 1621 } 1622 1623 /* 1624 * Recycle all active AHs to free list (and if there are 1625 * pending posts, make sure they will go into the free list 1626 * once the Tx's complete). Grab the lock to prevent 1627 * concurrent Tx's as well as Tx cleanups. 
1628 */ 1629 mutex_enter(&state->id_ac_mutex); 1630 ace = list_head(&state->id_ah_active); 1631 while ((pace = ace) != NULL) { 1632 boolean_t cycled; 1633 1634 ace = list_next(&state->id_ah_active, ace); 1635 mce = pace->ac_mce; 1636 if (pace->ac_chan != NULL) { 1637 ASSERT(mce == NULL); 1638 ASSERT(state->id_enable_rc == B_TRUE); 1639 if (pace->ac_chan->chan_state == 1640 IBD_RC_STATE_ACT_ESTAB) { 1641 INC_REF(pace, 1); 1642 IBD_ACACHE_PULLOUT_ACTIVE(state, pace); 1643 pace->ac_chan->chan_state = 1644 IBD_RC_STATE_ACT_CLOSING; 1645 ibd_rc_signal_act_close(state, pace); 1646 } else { 1647 state->rc_act_close_simultaneous++; 1648 DPRINT(40, "ibd_async_link: other " 1649 "thread is closing it, ace=%p, " 1650 "ac_chan=%p, chan_state=%d", 1651 pace, pace->ac_chan, 1652 pace->ac_chan->chan_state); 1653 } 1654 } else { 1655 cycled = ibd_acache_recycle(state, 1656 &pace->ac_mac, B_TRUE); 1657 } 1658 /* 1659 * If this is for an mcg, it must be for a fullmember, 1660 * since we got rid of send-only members above when 1661 * processing the mce list. 1662 */ 1663 ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate == 1664 IB_MC_JSTATE_FULL))); 1665 1666 /* 1667 * Check if the fullmember mce needs to be torn down, 1668 * ie whether the DLPI disable has already been done. 1669 * If so, do some of the work of tx_cleanup, namely 1670 * causing leave (which will fail), detach and 1671 * mce-freeing. tx_cleanup will put the AH into free 1672 * list. The reason to duplicate some of this 1673 * tx_cleanup work is because we want to delete the 1674 * AH right now instead of waiting for tx_cleanup, to 1675 * force subsequent Tx's to reacquire an AH. 1676 */ 1677 if ((mce != NULL) && (mce->mc_fullreap)) 1678 ibd_async_reap_group(state, mce, 1679 mce->mc_info.mc_adds_vect.av_dgid, 1680 mce->mc_jstate); 1681 } 1682 mutex_exit(&state->id_ac_mutex); 1683 } 1684 1685 /* 1686 * mac handle is guaranteed to exist since driver does ibt_close_hca() 1687 * (which stops further events from being delivered) before 1688 * mac_unregister(). At this point, it is guaranteed that mac_register 1689 * has already been done. 1690 */ 1691 mutex_enter(&state->id_link_mutex); 1692 state->id_link_state = lstate; 1693 mac_link_update(state->id_mh, lstate); 1694 mutex_exit(&state->id_link_mutex); 1695 1696 ibd_async_done(state); 1697 } 1698 1699 /* 1700 * Check the pkey table to see if we can find the pkey we're looking for. 1701 * Set the pkey index in 'pkix' if found. Return 0 on success and -1 on 1702 * failure. 1703 */ 1704 static int 1705 ibd_locate_pkey(ib_pkey_t *pkey_tbl, uint16_t pkey_tbl_sz, ib_pkey_t pkey, 1706 uint16_t *pkix) 1707 { 1708 uint16_t ndx; 1709 1710 ASSERT(pkix != NULL); 1711 1712 for (ndx = 0; ndx < pkey_tbl_sz; ndx++) { 1713 if (pkey_tbl[ndx] == pkey) { 1714 *pkix = ndx; 1715 return (0); 1716 } 1717 } 1718 return (-1); 1719 } 1720 1721 /* 1722 * Late HCA Initialization: 1723 * If plumb had succeeded without the availability of an active port or the 1724 * pkey, and either of their availability is now being indicated via PORT_UP 1725 * or PORT_CHANGE respectively, try a start of the interface. 1726 * 1727 * Normal Operation: 1728 * When the link is notified up, we need to do a few things, based 1729 * on the port's current p_init_type_reply claiming a reinit has been 1730 * done or not. The reinit steps are: 1731 * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify 1732 * the old Pkey and GID0 are correct. 1733 * 2. Register for mcg traps (already done by ibmf). 1734 * 3. 
If PreservePresenceReply indicates the SM has restored port's presence 1735 * in subnet, nothing more to do. Else go to next steps (on async daemon). 1736 * 4. Give up all sendonly memberships. 1737 * 5. Acquire all full memberships. 1738 * 6. In promiscuous mode, acquire all non memberships. 1739 * 7. Recycle all AHs to free list. 1740 */ 1741 static void 1742 ibd_link_mod(ibd_state_t *state, ibt_async_code_t code) 1743 { 1744 ibt_hca_portinfo_t *port_infop = NULL; 1745 ibt_status_t ibt_status; 1746 uint_t psize, port_infosz; 1747 ibd_link_op_t opcode; 1748 ibd_req_t *req; 1749 link_state_t new_link_state = LINK_STATE_UP; 1750 uint8_t itreply; 1751 uint16_t pkix; 1752 int ret; 1753 1754 /* 1755 * Let's not race with a plumb or an unplumb; if we detect a 1756 * pkey relocation event later on here, we may have to restart. 1757 */ 1758 ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS); 1759 1760 mutex_enter(&state->id_link_mutex); 1761 1762 /* 1763 * If the link state is unknown, a plumb has not yet been attempted 1764 * on the interface. Nothing to do. 1765 */ 1766 if (state->id_link_state == LINK_STATE_UNKNOWN) { 1767 mutex_exit(&state->id_link_mutex); 1768 goto link_mod_return; 1769 } 1770 1771 /* 1772 * If link state is down because of plumb failure, and we are not in 1773 * late HCA init, and we were not successfully plumbed, nothing to do. 1774 */ 1775 if ((state->id_link_state == LINK_STATE_DOWN) && 1776 ((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) == 0) && 1777 ((state->id_mac_state & IBD_DRV_STARTED) == 0)) { 1778 mutex_exit(&state->id_link_mutex); 1779 goto link_mod_return; 1780 } 1781 1782 /* 1783 * If this routine was called in response to a port down event, 1784 * we just need to see if this should be informed. 1785 */ 1786 if (code == IBT_ERROR_PORT_DOWN) { 1787 new_link_state = LINK_STATE_DOWN; 1788 goto update_link_state; 1789 } 1790 1791 /* 1792 * If it's not a port down event we've received, try to get the port 1793 * attributes first. If we fail here, the port is as good as down. 1794 * Otherwise, if the link went down by the time the handler gets 1795 * here, give up - we cannot even validate the pkey/gid since those 1796 * are not valid and this is as bad as a port down anyway. 1797 */ 1798 ibt_status = ibt_query_hca_ports(state->id_hca_hdl, state->id_port, 1799 &port_infop, &psize, &port_infosz); 1800 if ((ibt_status != IBT_SUCCESS) || (psize != 1) || 1801 (port_infop->p_linkstate != IBT_PORT_ACTIVE)) { 1802 new_link_state = LINK_STATE_DOWN; 1803 goto update_link_state; 1804 } 1805 1806 /* 1807 * If in the previous attempt, the pkey was not found either due to the 1808 * port state being down, or due to it's absence in the pkey table, 1809 * look for it now and try to start the interface. 1810 */ 1811 if (state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) { 1812 mutex_exit(&state->id_link_mutex); 1813 if ((ret = ibd_start(state)) != 0) { 1814 DPRINT(10, "ibd_linkmod: cannot start from late HCA " 1815 "init, ret=%d", ret); 1816 } 1817 ibt_free_portinfo(port_infop, port_infosz); 1818 goto link_mod_return; 1819 } 1820 1821 /* 1822 * Check the SM InitTypeReply flags. If both NoLoadReply and 1823 * PreserveContentReply are 0, we don't know anything about the 1824 * data loaded into the port attributes, so we need to verify 1825 * if gid0 and pkey are still valid. 
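 * Conversely, when either bit is set, the gid0/pkey verification below is
 * skipped and the existing pkey index and GID are used as-is.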
1826 */ 1827 itreply = port_infop->p_init_type_reply; 1828 if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) && 1829 ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0)) { 1830 /* 1831 * Check to see if the subnet part of GID0 has changed. If 1832 * not, check the simple case first to see if the pkey 1833 * index is the same as before; finally check to see if the 1834 * pkey has been relocated to a different index in the table. 1835 */ 1836 if (bcmp(port_infop->p_sgid_tbl, 1837 &state->id_sgid, sizeof (ib_gid_t)) != 0) { 1838 1839 new_link_state = LINK_STATE_DOWN; 1840 1841 } else if (port_infop->p_pkey_tbl[state->id_pkix] == 1842 state->id_pkey) { 1843 1844 new_link_state = LINK_STATE_UP; 1845 1846 } else if (ibd_locate_pkey(port_infop->p_pkey_tbl, 1847 port_infop->p_pkey_tbl_sz, state->id_pkey, &pkix) == 0) { 1848 1849 ibt_free_portinfo(port_infop, port_infosz); 1850 mutex_exit(&state->id_link_mutex); 1851 1852 /* 1853 * Currently a restart is required if our pkey has moved 1854 * in the pkey table. If we get the ibt_recycle_ud() to 1855 * work as documented (expected), we may be able to 1856 * avoid a complete restart. Note that we've already 1857 * marked both the start and stop 'in-progress' flags, 1858 * so it is ok to go ahead and do this restart. 1859 */ 1860 (void) ibd_undo_start(state, LINK_STATE_DOWN); 1861 if ((ret = ibd_start(state)) != 0) { 1862 DPRINT(10, "ibd_restart: cannot restart, " 1863 "ret=%d", ret); 1864 } 1865 1866 goto link_mod_return; 1867 } else { 1868 new_link_state = LINK_STATE_DOWN; 1869 } 1870 } 1871 1872 update_link_state: 1873 if (port_infop) { 1874 ibt_free_portinfo(port_infop, port_infosz); 1875 } 1876 1877 /* 1878 * If we're reporting a link up, check InitTypeReply to see if 1879 * the SM has ensured that the port's presence in mcg, traps, 1880 * etc. is intact. 1881 */ 1882 if (new_link_state == LINK_STATE_DOWN) { 1883 opcode = IBD_LINK_DOWN; 1884 } else { 1885 if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) == 1886 SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) { 1887 opcode = IBD_LINK_UP; 1888 } else { 1889 opcode = IBD_LINK_UP_ABSENT; 1890 } 1891 } 1892 1893 /* 1894 * If the old state is the same as the new state, and the SM indicated 1895 * no change in the port parameters, nothing to do. 1896 */ 1897 if ((state->id_link_state == new_link_state) && (opcode != 1898 IBD_LINK_UP_ABSENT)) { 1899 mutex_exit(&state->id_link_mutex); 1900 goto link_mod_return; 1901 } 1902 1903 /* 1904 * Ok, so there was a link state change; see if it's safe to ask 1905 * the async thread to do the work 1906 */ 1907 if (!ibd_async_safe(state)) { 1908 state->id_link_state = new_link_state; 1909 mutex_exit(&state->id_link_mutex); 1910 goto link_mod_return; 1911 } 1912 1913 mutex_exit(&state->id_link_mutex); 1914 1915 /* 1916 * Queue up a request for ibd_async_link() to handle this link 1917 * state change event 1918 */ 1919 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); 1920 req->rq_ptr = (void *)opcode; 1921 ibd_queue_work_slot(state, req, IBD_ASYNC_LINK); 1922 1923 link_mod_return: 1924 ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS); 1925 } 1926 1927 /* 1928 * For the port up/down events, IBTL guarantees there will not be concurrent 1929 * invocations of the handler. 
IBTL might coalesce link transition events, 1930 * and not invoke the handler for _each_ up/down transition, but it will 1931 * invoke the handler with last known state 1932 */ 1933 static void 1934 ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl, 1935 ibt_async_code_t code, ibt_async_event_t *event) 1936 { 1937 ibd_state_t *state = (ibd_state_t *)clnt_private; 1938 1939 switch (code) { 1940 case IBT_ERROR_CATASTROPHIC_CHAN: 1941 ibd_print_warn(state, "catastrophic channel error"); 1942 break; 1943 case IBT_ERROR_CQ: 1944 ibd_print_warn(state, "completion queue error"); 1945 break; 1946 case IBT_PORT_CHANGE_EVENT: 1947 /* 1948 * Events will be delivered to all instances that have 1949 * done ibt_open_hca() but not yet done ibt_close_hca(). 1950 * Only need to do work for our port; IBTF will deliver 1951 * events for other ports on the hca we have ibt_open_hca'ed 1952 * too. Note that id_port is initialized in ibd_attach() 1953 * before we do an ibt_open_hca() in ibd_attach(). 1954 */ 1955 ASSERT(state->id_hca_hdl == hca_hdl); 1956 if (state->id_port != event->ev_port) 1957 break; 1958 1959 if ((event->ev_port_flags & IBT_PORT_CHANGE_PKEY) == 1960 IBT_PORT_CHANGE_PKEY) { 1961 ibd_link_mod(state, code); 1962 } 1963 break; 1964 case IBT_ERROR_PORT_DOWN: 1965 case IBT_CLNT_REREG_EVENT: 1966 case IBT_EVENT_PORT_UP: 1967 /* 1968 * Events will be delivered to all instances that have 1969 * done ibt_open_hca() but not yet done ibt_close_hca(). 1970 * Only need to do work for our port; IBTF will deliver 1971 * events for other ports on the hca we have ibt_open_hca'ed 1972 * too. Note that id_port is initialized in ibd_attach() 1973 * before we do an ibt_open_hca() in ibd_attach(). 1974 */ 1975 ASSERT(state->id_hca_hdl == hca_hdl); 1976 if (state->id_port != event->ev_port) 1977 break; 1978 1979 ibd_link_mod(state, code); 1980 break; 1981 1982 case IBT_HCA_ATTACH_EVENT: 1983 case IBT_HCA_DETACH_EVENT: 1984 /* 1985 * When a new card is plugged to the system, attach_event is 1986 * invoked. Additionally, a cfgadm needs to be run to make the 1987 * card known to the system, and an ifconfig needs to be run to 1988 * plumb up any ibd interfaces on the card. In the case of card 1989 * unplug, a cfgadm is run that will trigger any RCM scripts to 1990 * unplumb the ibd interfaces on the card; when the card is 1991 * actually unplugged, the detach_event is invoked; 1992 * additionally, if any ibd instances are still active on the 1993 * card (eg there were no associated RCM scripts), driver's 1994 * detach routine is invoked. 1995 */ 1996 break; 1997 default: 1998 break; 1999 } 2000 } 2001 2002 static int 2003 ibd_register_mac(ibd_state_t *state, dev_info_t *dip) 2004 { 2005 mac_register_t *macp; 2006 int ret; 2007 2008 if ((macp = mac_alloc(MAC_VERSION)) == NULL) { 2009 DPRINT(10, "ibd_register_mac: mac_alloc() failed"); 2010 return (DDI_FAILURE); 2011 } 2012 2013 /* 2014 * Note that when we register with mac during attach, we don't 2015 * have the id_macaddr yet, so we'll simply be registering a 2016 * zero macaddr that we'll overwrite later during plumb (in 2017 * ibd_m_start()). Similar is the case with id_mtu - we'll 2018 * update the mac layer with the correct mtu during plumb. 
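 * The max_sdu advertised here likewise depends on what is known at this
 * point: a port-driver instance reports IBD_DEF_RC_MAX_SDU, an RC-enabled
 * partition reports rc_mtu minus the IPoIB header, and a UD partition
 * reports IBD_DEF_MAX_SDU (see the assignments below).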
2019 */ 2020 macp->m_type_ident = MAC_PLUGIN_IDENT_IB; 2021 macp->m_driver = state; 2022 macp->m_dip = dip; 2023 macp->m_src_addr = (uint8_t *)&state->id_macaddr; 2024 macp->m_callbacks = &ibd_m_callbacks; 2025 macp->m_min_sdu = 0; 2026 macp->m_multicast_sdu = IBD_DEF_MAX_SDU; 2027 if (state->id_type == IBD_PORT_DRIVER) { 2028 macp->m_max_sdu = IBD_DEF_RC_MAX_SDU; 2029 } else if (state->id_enable_rc) { 2030 macp->m_max_sdu = state->rc_mtu - IPOIB_HDRSIZE; 2031 } else { 2032 macp->m_max_sdu = IBD_DEF_MAX_SDU; 2033 } 2034 macp->m_priv_props = ibd_priv_props; 2035 2036 /* 2037 * Register ourselves with the GLDv3 interface 2038 */ 2039 if ((ret = mac_register(macp, &state->id_mh)) != 0) { 2040 mac_free(macp); 2041 DPRINT(10, 2042 "ibd_register_mac: mac_register() failed, ret=%d", ret); 2043 return (DDI_FAILURE); 2044 } 2045 2046 mac_free(macp); 2047 return (DDI_SUCCESS); 2048 } 2049 2050 static int 2051 ibd_record_capab(ibd_state_t *state) 2052 { 2053 ibt_hca_attr_t hca_attrs; 2054 ibt_status_t ibt_status; 2055 2056 /* 2057 * Query the HCA and fetch its attributes 2058 */ 2059 ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs); 2060 ASSERT(ibt_status == IBT_SUCCESS); 2061 2062 /* 2063 * 1. Set the Hardware Checksum capability. Currently we only consider 2064 * full checksum offload. 2065 */ 2066 if (state->id_enable_rc) { 2067 state->id_hwcksum_capab = 0; 2068 } else { 2069 if ((hca_attrs.hca_flags & IBT_HCA_CKSUM_FULL) 2070 == IBT_HCA_CKSUM_FULL) { 2071 state->id_hwcksum_capab = IBT_HCA_CKSUM_FULL; 2072 } 2073 } 2074 2075 /* 2076 * 2. Set LSO policy, capability and maximum length 2077 */ 2078 if (state->id_enable_rc) { 2079 state->id_lso_capable = B_FALSE; 2080 state->id_lso_maxlen = 0; 2081 } else { 2082 if (hca_attrs.hca_max_lso_size > 0) { 2083 state->id_lso_capable = B_TRUE; 2084 if (hca_attrs.hca_max_lso_size > IBD_LSO_MAXLEN) 2085 state->id_lso_maxlen = IBD_LSO_MAXLEN; 2086 else 2087 state->id_lso_maxlen = 2088 hca_attrs.hca_max_lso_size; 2089 } else { 2090 state->id_lso_capable = B_FALSE; 2091 state->id_lso_maxlen = 0; 2092 } 2093 } 2094 2095 /* 2096 * 3. Set Reserved L_Key capability 2097 */ 2098 if ((hca_attrs.hca_flags2 & IBT_HCA2_RES_LKEY) == IBT_HCA2_RES_LKEY) { 2099 state->id_hca_res_lkey_capab = 1; 2100 state->id_res_lkey = hca_attrs.hca_reserved_lkey; 2101 state->rc_enable_iov_map = B_TRUE; 2102 } else { 2103 /* If no reserved lkey, we will not use ibt_map_mem_iov */ 2104 state->rc_enable_iov_map = B_FALSE; 2105 } 2106 2107 /* 2108 * 4. 
Set maximum sqseg value after checking to see if extended sgl 2109 * size information is provided by the hca 2110 */ 2111 if (hca_attrs.hca_flags & IBT_HCA_WQE_SIZE_INFO) { 2112 state->id_max_sqseg = hca_attrs.hca_ud_send_sgl_sz; 2113 state->rc_tx_max_sqseg = hca_attrs.hca_conn_send_sgl_sz; 2114 } else { 2115 state->id_max_sqseg = hca_attrs.hca_max_sgl; 2116 state->rc_tx_max_sqseg = hca_attrs.hca_max_sgl; 2117 } 2118 if (state->id_max_sqseg > IBD_MAX_SQSEG) { 2119 state->id_max_sqseg = IBD_MAX_SQSEG; 2120 } else if (state->id_max_sqseg < IBD_MAX_SQSEG) { 2121 ibd_print_warn(state, "Set #sgl = %d instead of default %d", 2122 state->id_max_sqseg, IBD_MAX_SQSEG); 2123 } 2124 if (state->rc_tx_max_sqseg > IBD_MAX_SQSEG) { 2125 state->rc_tx_max_sqseg = IBD_MAX_SQSEG; 2126 } else if (state->rc_tx_max_sqseg < IBD_MAX_SQSEG) { 2127 ibd_print_warn(state, "RC mode: Set #sgl = %d instead of " 2128 "default %d", state->rc_tx_max_sqseg, IBD_MAX_SQSEG); 2129 } 2130 2131 /* 2132 * Translating the virtual address regions into physical regions 2133 * for using the Reserved LKey feature results in a wr sgl that 2134 * is a little longer. Since failing ibt_map_mem_iov() is costly, 2135 * we'll fix a high-water mark (65%) for when we should stop. 2136 */ 2137 state->id_max_sqseg_hiwm = (state->id_max_sqseg * 65) / 100; 2138 state->rc_max_sqseg_hiwm = (state->rc_tx_max_sqseg * 65) / 100; 2139 2140 /* 2141 * 5. Set number of recv and send wqes after checking hca maximum 2142 * channel size. Store the max channel size in the state so that it 2143 * can be referred to when the swqe/rwqe change is requested via 2144 * dladm. 2145 */ 2146 2147 state->id_hca_max_chan_sz = hca_attrs.hca_max_chan_sz; 2148 2149 if (hca_attrs.hca_max_chan_sz < state->id_ud_num_rwqe) 2150 state->id_ud_num_rwqe = hca_attrs.hca_max_chan_sz; 2151 2152 state->id_rx_bufs_outstanding_limit = state->id_ud_num_rwqe - 2153 IBD_RWQE_MIN; 2154 2155 if (hca_attrs.hca_max_chan_sz < state->id_ud_num_swqe) 2156 state->id_ud_num_swqe = hca_attrs.hca_max_chan_sz; 2157 2158 return (DDI_SUCCESS); 2159 } 2160 2161 static int 2162 ibd_part_busy(ibd_state_t *state) 2163 { 2164 if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) != 0) { 2165 DPRINT(10, "ibd_part_busy: failed: rx bufs outstanding\n"); 2166 return (DDI_FAILURE); 2167 } 2168 2169 if (state->rc_srq_rwqe_list.dl_bufs_outstanding != 0) { 2170 DPRINT(10, "ibd_part_busy: failed: srq bufs outstanding\n"); 2171 return (DDI_FAILURE); 2172 } 2173 2174 /* 2175 * "state->id_ah_op == IBD_OP_ONGOING" means this IPoIB port is 2176 * connecting to a remote IPoIB port. We can't remove this port. 
2177 */ 2178 if (state->id_ah_op == IBD_OP_ONGOING) { 2179 DPRINT(10, "ibd_part_busy: failed: connecting\n"); 2180 return (DDI_FAILURE); 2181 } 2182 2183 return (DDI_SUCCESS); 2184 } 2185 2186 2187 static void 2188 ibd_part_unattach(ibd_state_t *state) 2189 { 2190 uint32_t progress = state->id_mac_state; 2191 ibt_status_t ret; 2192 2193 /* make sure rx resources are freed */ 2194 ibd_free_rx_rsrcs(state); 2195 2196 if (progress & IBD_DRV_RC_SRQ_ALLOCD) { 2197 ASSERT(state->id_enable_rc); 2198 ibd_rc_fini_srq_list(state); 2199 state->id_mac_state &= (~IBD_DRV_RC_SRQ_ALLOCD); 2200 } 2201 2202 if (progress & IBD_DRV_MAC_REGISTERED) { 2203 (void) mac_unregister(state->id_mh); 2204 state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED); 2205 } 2206 2207 if (progress & IBD_DRV_ASYNC_THR_CREATED) { 2208 /* 2209 * No new async requests will be posted since the device 2210 * link state has been marked as unknown; completion handlers 2211 * have been turned off, so Tx handler will not cause any 2212 * more IBD_ASYNC_REAP requests. 2213 * 2214 * Queue a request for the async thread to exit, which will 2215 * be serviced after any pending ones. This can take a while, 2216 * specially if the SM is unreachable, since IBMF will slowly 2217 * timeout each SM request issued by the async thread. Reap 2218 * the thread before continuing on, we do not want it to be 2219 * lingering in modunloaded code. 2220 */ 2221 ibd_queue_work_slot(state, &state->id_ah_req, IBD_ASYNC_EXIT); 2222 thread_join(state->id_async_thrid); 2223 2224 state->id_mac_state &= (~IBD_DRV_ASYNC_THR_CREATED); 2225 } 2226 2227 if (progress & IBD_DRV_REQ_LIST_INITED) { 2228 list_destroy(&state->id_req_list); 2229 mutex_destroy(&state->id_acache_req_lock); 2230 cv_destroy(&state->id_acache_req_cv); 2231 state->id_mac_state &= ~IBD_DRV_REQ_LIST_INITED; 2232 } 2233 2234 if (progress & IBD_DRV_PD_ALLOCD) { 2235 if ((ret = ibt_free_pd(state->id_hca_hdl, 2236 state->id_pd_hdl)) != IBT_SUCCESS) { 2237 ibd_print_warn(state, "failed to free " 2238 "protection domain, ret=%d", ret); 2239 } 2240 state->id_pd_hdl = NULL; 2241 state->id_mac_state &= (~IBD_DRV_PD_ALLOCD); 2242 } 2243 2244 if (progress & IBD_DRV_HCA_OPENED) { 2245 if ((ret = ibt_close_hca(state->id_hca_hdl)) != 2246 IBT_SUCCESS) { 2247 ibd_print_warn(state, "failed to close " 2248 "HCA device, ret=%d", ret); 2249 } 2250 state->id_hca_hdl = NULL; 2251 state->id_mac_state &= (~IBD_DRV_HCA_OPENED); 2252 } 2253 2254 mutex_enter(&ibd_gstate.ig_mutex); 2255 if (progress & IBD_DRV_IBTL_ATTACH_DONE) { 2256 if ((ret = ibt_detach(state->id_ibt_hdl)) != 2257 IBT_SUCCESS) { 2258 ibd_print_warn(state, 2259 "ibt_detach() failed, ret=%d", ret); 2260 } 2261 state->id_ibt_hdl = NULL; 2262 state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE); 2263 ibd_gstate.ig_ibt_hdl_ref_cnt--; 2264 } 2265 if ((ibd_gstate.ig_ibt_hdl_ref_cnt == 0) && 2266 (ibd_gstate.ig_ibt_hdl != NULL)) { 2267 if ((ret = ibt_detach(ibd_gstate.ig_ibt_hdl)) != 2268 IBT_SUCCESS) { 2269 ibd_print_warn(state, "ibt_detach(): global " 2270 "failed, ret=%d", ret); 2271 } 2272 ibd_gstate.ig_ibt_hdl = NULL; 2273 } 2274 mutex_exit(&ibd_gstate.ig_mutex); 2275 2276 if (progress & IBD_DRV_TXINTR_ADDED) { 2277 ddi_remove_softintr(state->id_tx); 2278 state->id_tx = NULL; 2279 state->id_mac_state &= (~IBD_DRV_TXINTR_ADDED); 2280 } 2281 2282 if (progress & IBD_DRV_RXINTR_ADDED) { 2283 ddi_remove_softintr(state->id_rx); 2284 state->id_rx = NULL; 2285 state->id_mac_state &= (~IBD_DRV_RXINTR_ADDED); 2286 } 2287 2288 #ifdef DEBUG 2289 if (progress & 
IBD_DRV_RC_PRIVATE_STATE) { 2290 kstat_delete(state->rc_ksp); 2291 state->id_mac_state &= (~IBD_DRV_RC_PRIVATE_STATE); 2292 } 2293 #endif 2294 2295 if (progress & IBD_DRV_STATE_INITIALIZED) { 2296 ibd_state_fini(state); 2297 state->id_mac_state &= (~IBD_DRV_STATE_INITIALIZED); 2298 } 2299 } 2300 2301 int 2302 ibd_part_attach(ibd_state_t *state, dev_info_t *dip) 2303 { 2304 ibt_status_t ret; 2305 int rv; 2306 kthread_t *kht; 2307 2308 /* 2309 * Initialize mutexes and condition variables 2310 */ 2311 if (ibd_state_init(state, dip) != DDI_SUCCESS) { 2312 DPRINT(10, "ibd_part_attach: failed in ibd_state_init()"); 2313 return (DDI_FAILURE); 2314 } 2315 state->id_mac_state |= IBD_DRV_STATE_INITIALIZED; 2316 2317 /* 2318 * Allocate rx,tx softintr 2319 */ 2320 if (ibd_rx_softintr == 1) { 2321 if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx, 2322 NULL, NULL, ibd_intr, (caddr_t)state)) != DDI_SUCCESS) { 2323 DPRINT(10, "ibd_part_attach: failed in " 2324 "ddi_add_softintr(id_rx), ret=%d", rv); 2325 return (DDI_FAILURE); 2326 } 2327 state->id_mac_state |= IBD_DRV_RXINTR_ADDED; 2328 } 2329 if (ibd_tx_softintr == 1) { 2330 if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx, 2331 NULL, NULL, ibd_tx_recycle, 2332 (caddr_t)state)) != DDI_SUCCESS) { 2333 DPRINT(10, "ibd_part_attach: failed in " 2334 "ddi_add_softintr(id_tx), ret=%d", rv); 2335 return (DDI_FAILURE); 2336 } 2337 state->id_mac_state |= IBD_DRV_TXINTR_ADDED; 2338 } 2339 2340 /* 2341 * Attach to IBTL 2342 */ 2343 mutex_enter(&ibd_gstate.ig_mutex); 2344 if (ibd_gstate.ig_ibt_hdl == NULL) { 2345 if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state, 2346 &ibd_gstate.ig_ibt_hdl)) != IBT_SUCCESS) { 2347 DPRINT(10, "ibd_part_attach: global: failed in " 2348 "ibt_attach(), ret=%d", ret); 2349 mutex_exit(&ibd_gstate.ig_mutex); 2350 return (DDI_FAILURE); 2351 } 2352 } 2353 if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state, 2354 &state->id_ibt_hdl)) != IBT_SUCCESS) { 2355 DPRINT(10, "ibd_part_attach: failed in ibt_attach(), ret=%d", 2356 ret); 2357 mutex_exit(&ibd_gstate.ig_mutex); 2358 return (DDI_FAILURE); 2359 } 2360 ibd_gstate.ig_ibt_hdl_ref_cnt++; 2361 mutex_exit(&ibd_gstate.ig_mutex); 2362 state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE; 2363 2364 /* 2365 * Open the HCA 2366 */ 2367 if ((ret = ibt_open_hca(state->id_ibt_hdl, state->id_hca_guid, 2368 &state->id_hca_hdl)) != IBT_SUCCESS) { 2369 DPRINT(10, "ibd_part_attach: ibt_open_hca() failed, ret=%d", 2370 ret); 2371 return (DDI_FAILURE); 2372 } 2373 state->id_mac_state |= IBD_DRV_HCA_OPENED; 2374 2375 #ifdef DEBUG 2376 /* Initialize Driver Counters for Reliable Connected Mode */ 2377 if (state->id_enable_rc) { 2378 if (ibd_rc_init_stats(state) != DDI_SUCCESS) { 2379 DPRINT(10, "ibd_part_attach: failed in " 2380 "ibd_rc_init_stats"); 2381 return (DDI_FAILURE); 2382 } 2383 state->id_mac_state |= IBD_DRV_RC_PRIVATE_STATE; 2384 } 2385 #endif 2386 2387 /* 2388 * Record capabilities 2389 */ 2390 (void) ibd_record_capab(state); 2391 2392 /* 2393 * Allocate a protection domain on the HCA 2394 */ 2395 if ((ret = ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS, 2396 &state->id_pd_hdl)) != IBT_SUCCESS) { 2397 DPRINT(10, "ibd_part_attach: ibt_alloc_pd() failed, ret=%d", 2398 ret); 2399 return (DDI_FAILURE); 2400 } 2401 state->id_mac_state |= IBD_DRV_PD_ALLOCD; 2402 2403 2404 /* 2405 * We need to initialise the req_list that is required for the 2406 * operation of the async_thread. 
2407 */ 2408 mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL); 2409 cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL); 2410 list_create(&state->id_req_list, sizeof (ibd_req_t), 2411 offsetof(ibd_req_t, rq_list)); 2412 state->id_mac_state |= IBD_DRV_REQ_LIST_INITED; 2413 2414 /* 2415 * Create the async thread; thread_create never fails. 2416 */ 2417 kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0, 2418 TS_RUN, minclsyspri); 2419 state->id_async_thrid = kht->t_did; 2420 state->id_mac_state |= IBD_DRV_ASYNC_THR_CREATED; 2421 2422 return (DDI_SUCCESS); 2423 } 2424 2425 /* 2426 * Attach device to the IO framework. 2427 */ 2428 static int 2429 ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 2430 { 2431 int ret; 2432 2433 switch (cmd) { 2434 case DDI_ATTACH: 2435 ret = ibd_port_attach(dip); 2436 break; 2437 default: 2438 ret = DDI_FAILURE; 2439 break; 2440 } 2441 return (ret); 2442 } 2443 2444 /* 2445 * Detach device from the IO framework. 2446 */ 2447 static int 2448 ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 2449 { 2450 ibd_state_t *state; 2451 int instance; 2452 2453 /* 2454 * IBD doesn't support suspend/resume 2455 */ 2456 if (cmd != DDI_DETACH) 2457 return (DDI_FAILURE); 2458 2459 /* 2460 * Get the instance softstate 2461 */ 2462 instance = ddi_get_instance(dip); 2463 state = ddi_get_soft_state(ibd_list, instance); 2464 2465 /* 2466 * Release all resources we're holding still. Note that if we'd 2467 * done ibd_attach(), ibd_m_start() and ibd_m_stop() correctly 2468 * so far, we should find all the flags we need in id_mac_state. 2469 */ 2470 return (ibd_port_unattach(state, dip)); 2471 } 2472 2473 /* 2474 * Pre ibt_attach() driver initialization 2475 */ 2476 static int 2477 ibd_state_init(ibd_state_t *state, dev_info_t *dip) 2478 { 2479 char buf[64]; 2480 2481 mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL); 2482 state->id_link_state = LINK_STATE_UNKNOWN; 2483 2484 mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL); 2485 cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL); 2486 state->id_trap_stop = B_TRUE; 2487 state->id_trap_inprog = 0; 2488 2489 mutex_init(&state->id_scq_poll_lock, NULL, MUTEX_DRIVER, NULL); 2490 mutex_init(&state->id_rcq_poll_lock, NULL, MUTEX_DRIVER, NULL); 2491 state->id_dip = dip; 2492 2493 mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL); 2494 2495 mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2496 mutex_init(&state->id_tx_rel_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2497 mutex_init(&state->id_txpost_lock, NULL, MUTEX_DRIVER, NULL); 2498 state->id_tx_busy = 0; 2499 mutex_init(&state->id_lso_lock, NULL, MUTEX_DRIVER, NULL); 2500 2501 state->id_rx_list.dl_bufs_outstanding = 0; 2502 state->id_rx_list.dl_cnt = 0; 2503 mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2504 mutex_init(&state->id_rx_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2505 (void) sprintf(buf, "ibd_req%d_%x_%u", ddi_get_instance(dip), 2506 state->id_pkey, state->id_plinkid); 2507 state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t), 2508 0, NULL, NULL, NULL, NULL, NULL, 0); 2509 2510 /* For Reliable Connected Mode */ 2511 mutex_init(&state->rc_rx_lock, NULL, MUTEX_DRIVER, NULL); 2512 mutex_init(&state->rc_tx_large_bufs_lock, NULL, MUTEX_DRIVER, NULL); 2513 mutex_init(&state->rc_srq_rwqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2514 mutex_init(&state->rc_srq_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2515 mutex_init(&state->rc_pass_chan_list.chan_list_mutex, NULL, 2516 
MUTEX_DRIVER, NULL); 2517 mutex_init(&state->rc_timeout_lock, NULL, MUTEX_DRIVER, NULL); 2518 2519 /* 2520 * Make RC the default link mode. If RC connection setup fails, the link 2521 * mode is automatically transitioned to UD. 2522 * Also set the RC MTU. 2523 */ 2524 state->id_enable_rc = IBD_DEF_LINK_MODE; 2525 state->rc_mtu = IBD_DEF_RC_MAX_MTU; 2526 state->id_mtu = IBD_DEF_MAX_MTU; 2527 2528 /* Initialize all tunables to their defaults */ 2529 state->id_lso_policy = IBD_DEF_LSO_POLICY; 2530 state->id_num_lso_bufs = IBD_DEF_NUM_LSO_BUFS; 2531 state->id_num_ah = IBD_DEF_NUM_AH; 2532 state->id_hash_size = IBD_DEF_HASH_SIZE; 2533 state->id_create_broadcast_group = IBD_DEF_CREATE_BCAST_GROUP; 2534 state->id_allow_coalesce_comp_tuning = IBD_DEF_COALESCE_COMPLETIONS; 2535 state->id_ud_rx_comp_count = IBD_DEF_UD_RX_COMP_COUNT; 2536 state->id_ud_rx_comp_usec = IBD_DEF_UD_RX_COMP_USEC; 2537 state->id_ud_tx_comp_count = IBD_DEF_UD_TX_COMP_COUNT; 2538 state->id_ud_tx_comp_usec = IBD_DEF_UD_TX_COMP_USEC; 2539 state->id_rc_rx_comp_count = IBD_DEF_RC_RX_COMP_COUNT; 2540 state->id_rc_rx_comp_usec = IBD_DEF_RC_RX_COMP_USEC; 2541 state->id_rc_tx_comp_count = IBD_DEF_RC_TX_COMP_COUNT; 2542 state->id_rc_tx_comp_usec = IBD_DEF_RC_TX_COMP_USEC; 2543 state->id_ud_tx_copy_thresh = IBD_DEF_UD_TX_COPY_THRESH; 2544 state->id_rc_rx_copy_thresh = IBD_DEF_RC_RX_COPY_THRESH; 2545 state->id_rc_tx_copy_thresh = IBD_DEF_RC_TX_COPY_THRESH; 2546 state->id_ud_num_rwqe = IBD_DEF_UD_NUM_RWQE; 2547 state->id_ud_num_swqe = IBD_DEF_UD_NUM_SWQE; 2548 state->id_rc_num_rwqe = IBD_DEF_RC_NUM_RWQE; 2549 state->id_rc_num_swqe = IBD_DEF_RC_NUM_SWQE; 2550 state->rc_enable_srq = IBD_DEF_RC_ENABLE_SRQ; 2551 state->id_rc_num_srq = IBD_DEF_RC_NUM_SRQ; 2552 state->id_rc_rx_rwqe_thresh = IBD_DEF_RC_RX_RWQE_THRESH; 2553 2554 return (DDI_SUCCESS); 2555 } 2556 2557 /* 2558 * Post ibt_detach() driver teardown 2559 */ 2560 static void 2561 ibd_state_fini(ibd_state_t *state) 2562 { 2563 kmem_cache_destroy(state->id_req_kmc); 2564 2565 mutex_destroy(&state->id_rx_list.dl_mutex); 2566 mutex_destroy(&state->id_rx_free_list.dl_mutex); 2567 2568 mutex_destroy(&state->id_txpost_lock); 2569 mutex_destroy(&state->id_tx_list.dl_mutex); 2570 mutex_destroy(&state->id_tx_rel_list.dl_mutex); 2571 mutex_destroy(&state->id_lso_lock); 2572 2573 mutex_destroy(&state->id_sched_lock); 2574 mutex_destroy(&state->id_scq_poll_lock); 2575 mutex_destroy(&state->id_rcq_poll_lock); 2576 2577 cv_destroy(&state->id_trap_cv); 2578 mutex_destroy(&state->id_trap_lock); 2579 mutex_destroy(&state->id_link_mutex); 2580 2581 /* For Reliable Connected Mode */ 2582 mutex_destroy(&state->rc_timeout_lock); 2583 mutex_destroy(&state->rc_srq_free_list.dl_mutex); 2584 mutex_destroy(&state->rc_srq_rwqe_list.dl_mutex); 2585 mutex_destroy(&state->rc_pass_chan_list.chan_list_mutex); 2586 mutex_destroy(&state->rc_tx_large_bufs_lock); 2587 mutex_destroy(&state->rc_rx_lock); 2588 } 2589 2590 /* 2591 * Fetch link speed from SA for snmp ifspeed reporting. 2592 */ 2593 static uint64_t 2594 ibd_get_portspeed(ibd_state_t *state) 2595 { 2596 int ret; 2597 ibt_path_info_t path; 2598 ibt_path_attr_t path_attr; 2599 uint8_t num_paths; 2600 uint64_t ifspeed; 2601 2602 /* 2603 * Due to serdes 8b/10b encoding on the wire, 2.5 Gbps of signalling 2604 * translates to 2 Gbps of data rate. Thus, the 1X single data rate is 2605 * 2000000000. Start with that as the default.
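 * For example (illustrative arithmetic only): a loopback path reported at
 * IBT_SRATE_20 (4X DDR, 20 Gbps signalling) yields a multiplier of 8 in the
 * switch below, so the reported ifspeed becomes 8 * 2000000000 =
 * 16000000000, i.e. 16 Gbps of data for 20 Gbps on the wire.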
2606 */ 2607 ifspeed = 2000000000; 2608 2609 bzero(&path_attr, sizeof (path_attr)); 2610 2611 /* 2612 * Get the port speed from Loopback path information. 2613 */ 2614 path_attr.pa_dgids = &state->id_sgid; 2615 path_attr.pa_num_dgids = 1; 2616 path_attr.pa_sgid = state->id_sgid; 2617 2618 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS, 2619 &path_attr, 1, &path, &num_paths) != IBT_SUCCESS) 2620 goto earlydone; 2621 2622 if (num_paths < 1) 2623 goto earlydone; 2624 2625 /* 2626 * In case SA does not return an expected value, report the default 2627 * speed as 1X. 2628 */ 2629 ret = 1; 2630 switch (path.pi_prim_cep_path.cep_adds_vect.av_srate) { 2631 case IBT_SRATE_2: /* 1X SDR i.e 2.5 Gbps */ 2632 ret = 1; 2633 break; 2634 case IBT_SRATE_10: /* 4X SDR or 1X QDR i.e 10 Gbps */ 2635 ret = 4; 2636 break; 2637 case IBT_SRATE_30: /* 12X SDR i.e 30 Gbps */ 2638 ret = 12; 2639 break; 2640 case IBT_SRATE_5: /* 1X DDR i.e 5 Gbps */ 2641 ret = 2; 2642 break; 2643 case IBT_SRATE_20: /* 4X DDR or 8X SDR i.e 20 Gbps */ 2644 ret = 8; 2645 break; 2646 case IBT_SRATE_40: /* 8X DDR or 4X QDR i.e 40 Gbps */ 2647 ret = 16; 2648 break; 2649 case IBT_SRATE_60: /* 12X DDR i.e 60 Gbps */ 2650 ret = 24; 2651 break; 2652 case IBT_SRATE_80: /* 8X QDR i.e 80 Gbps */ 2653 ret = 32; 2654 break; 2655 case IBT_SRATE_120: /* 12X QDR i.e 120 Gbps */ 2656 ret = 48; 2657 break; 2658 } 2659 2660 ifspeed *= ret; 2661 2662 earlydone: 2663 return (ifspeed); 2664 } 2665 2666 /* 2667 * Search input mcg list (id_mc_full or id_mc_non) for an entry 2668 * representing the input mcg mgid. 2669 */ 2670 static ibd_mce_t * 2671 ibd_mcache_find(ib_gid_t mgid, struct list *mlist) 2672 { 2673 ibd_mce_t *ptr = list_head(mlist); 2674 2675 /* 2676 * Do plain linear search. 2677 */ 2678 while (ptr != NULL) { 2679 if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid, 2680 sizeof (ib_gid_t)) == 0) 2681 return (ptr); 2682 ptr = list_next(mlist, ptr); 2683 } 2684 return (NULL); 2685 } 2686 2687 /* 2688 * Execute IBA JOIN. 2689 */ 2690 static ibt_status_t 2691 ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce) 2692 { 2693 ibt_mcg_attr_t mcg_attr; 2694 2695 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 2696 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey; 2697 mcg_attr.mc_mgid = mgid; 2698 mcg_attr.mc_join_state = mce->mc_jstate; 2699 mcg_attr.mc_scope = state->id_scope; 2700 mcg_attr.mc_pkey = state->id_pkey; 2701 mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow; 2702 mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl; 2703 mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass; 2704 return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info, 2705 NULL, NULL)); 2706 } 2707 2708 /* 2709 * This code JOINs the port in the proper way (depending on the join 2710 * state) so that IBA fabric will forward mcg packets to/from the port. 2711 * It also attaches the QPN to the mcg so it can receive those mcg 2712 * packets. This code makes sure not to attach the mcg to the QP if 2713 * that has been previously done due to the mcg being joined with a 2714 * different join state, even though this is not required by SWG_0216, 2715 * refid 3610. 
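 * Concretely, per the do_attach logic below: a NON join skips the QP attach
 * when a FULL-member mce already exists, a FULL join skips it when a NON mce
 * exists, and a SEND_ONLY_NON join never attaches the QP.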
2716 */ 2717 static ibd_mce_t * 2718 ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) 2719 { 2720 ibt_status_t ibt_status; 2721 ibd_mce_t *mce, *tmce, *omce = NULL; 2722 boolean_t do_attach = B_TRUE; 2723 2724 DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n", 2725 jstate, mgid.gid_prefix, mgid.gid_guid); 2726 2727 /* 2728 * For enable_multicast Full member joins, we need to do some 2729 * extra work. If there is already an mce on the list that 2730 * indicates full membership, that means the membership has 2731 * not yet been dropped (since the disable_multicast was issued) 2732 * because there are pending Tx's to the mcg; in that case, just 2733 * mark the mce not to be reaped when the Tx completion queues 2734 * an async reap operation. 2735 * 2736 * If there is already an mce on the list indicating sendonly 2737 * membership, try to promote to full membership. Be careful 2738 * not to deallocate the old mce, since there might be an AH 2739 * pointing to it; instead, update the old mce with new data 2740 * that tracks the full membership. 2741 */ 2742 if ((jstate == IB_MC_JSTATE_FULL) && ((omce = 2743 IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) { 2744 if (omce->mc_jstate == IB_MC_JSTATE_FULL) { 2745 ASSERT(omce->mc_fullreap); 2746 omce->mc_fullreap = B_FALSE; 2747 return (omce); 2748 } else { 2749 ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON); 2750 } 2751 } 2752 2753 /* 2754 * Allocate the ibd_mce_t to track this JOIN. 2755 */ 2756 mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP); 2757 mce->mc_fullreap = B_FALSE; 2758 mce->mc_jstate = jstate; 2759 2760 if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) { 2761 DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d", 2762 ibt_status); 2763 kmem_free(mce, sizeof (ibd_mce_t)); 2764 return (NULL); 2765 } 2766 2767 /* 2768 * Is an IBA attach required? Not if the interface is already joined 2769 * to the mcg in a different appropriate join state. 2770 */ 2771 if (jstate == IB_MC_JSTATE_NON) { 2772 tmce = IBD_MCACHE_FIND_FULL(state, mgid); 2773 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) 2774 do_attach = B_FALSE; 2775 } else if (jstate == IB_MC_JSTATE_FULL) { 2776 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) 2777 do_attach = B_FALSE; 2778 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */ 2779 do_attach = B_FALSE; 2780 } 2781 2782 if (do_attach) { 2783 /* 2784 * Do the IBA attach. 2785 */ 2786 DPRINT(10, "ibd_join_group: ibt_attach_mcg \n"); 2787 if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl, 2788 &mce->mc_info)) != IBT_SUCCESS) { 2789 DPRINT(10, "ibd_join_group : failed qp attachment " 2790 "%d\n", ibt_status); 2791 /* 2792 * NOTE that we should probably preserve the join info 2793 * in the list and later try to leave again at detach 2794 * time. 2795 */ 2796 (void) ibt_leave_mcg(state->id_sgid, mgid, 2797 state->id_sgid, jstate); 2798 kmem_free(mce, sizeof (ibd_mce_t)); 2799 return (NULL); 2800 } 2801 } 2802 2803 /* 2804 * Insert the ibd_mce_t in the proper list. 2805 */ 2806 if (jstate == IB_MC_JSTATE_NON) { 2807 IBD_MCACHE_INSERT_NON(state, mce); 2808 } else { 2809 /* 2810 * Set up the mc_req fields used for reaping the 2811 * mcg in case of delayed tx completion (see 2812 * ibd_tx_cleanup()). Also done for sendonly join in 2813 * case we are promoted to fullmembership later and 2814 * keep using the same mce. 
2815 */ 2816 mce->mc_req.rq_gid = mgid; 2817 mce->mc_req.rq_ptr = mce; 2818 /* 2819 * Check whether this is the case of trying to join 2820 * full member, and we were already joined send only. 2821 * We try to drop our SendOnly membership, but it is 2822 * possible that the mcg does not exist anymore (and 2823 * the subnet trap never reached us), so the leave 2824 * operation might fail. 2825 */ 2826 if (omce != NULL) { 2827 (void) ibt_leave_mcg(state->id_sgid, mgid, 2828 state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON); 2829 omce->mc_jstate = IB_MC_JSTATE_FULL; 2830 bcopy(&mce->mc_info, &omce->mc_info, 2831 sizeof (ibt_mcg_info_t)); 2832 kmem_free(mce, sizeof (ibd_mce_t)); 2833 return (omce); 2834 } 2835 mutex_enter(&state->id_mc_mutex); 2836 IBD_MCACHE_INSERT_FULL(state, mce); 2837 mutex_exit(&state->id_mc_mutex); 2838 } 2839 2840 return (mce); 2841 } 2842 2843 /* 2844 * Called during port up event handling to attempt to reacquire full 2845 * membership to an mcg. Stripped down version of ibd_join_group(). 2846 * Note that it is possible that the mcg might have gone away, and 2847 * gets recreated at this point. 2848 */ 2849 static void 2850 ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce) 2851 { 2852 ib_gid_t mgid; 2853 2854 /* 2855 * If the mc_fullreap flag is set, or this join fails, a subsequent 2856 * reap/leave is going to try to leave the group. We could prevent 2857 * that by adding a boolean flag into ibd_mce_t, if required. 2858 */ 2859 if (mce->mc_fullreap) 2860 return; 2861 2862 mgid = mce->mc_info.mc_adds_vect.av_dgid; 2863 2864 DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix, 2865 mgid.gid_guid); 2866 2867 /* While reacquiring, leave and then join the MCG */ 2868 (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, 2869 mce->mc_jstate); 2870 if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS) 2871 ibd_print_warn(state, "Failure on port up to rejoin " 2872 "multicast gid %016llx:%016llx", 2873 (u_longlong_t)mgid.gid_prefix, 2874 (u_longlong_t)mgid.gid_guid); 2875 } 2876 2877 /* 2878 * This code handles delayed Tx completion cleanups for mcg's to which 2879 * disable_multicast has been issued, regular mcg related cleanups during 2880 * disable_multicast, disable_promiscuous and mcg traps, as well as 2881 * cleanups during driver detach time. Depending on the join state, 2882 * it deletes the mce from the appropriate list and issues the IBA 2883 * leave/detach; except in the disable_multicast case when the mce 2884 * is left on the active list for a subsequent Tx completion cleanup. 2885 */ 2886 static void 2887 ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid, 2888 uint8_t jstate) 2889 { 2890 ibd_mce_t *tmce; 2891 boolean_t do_detach = B_TRUE; 2892 2893 /* 2894 * Before detaching, we must check whether the other list 2895 * contains the mcg; if we detach blindly, the consumer 2896 * who set up the other list will also stop receiving 2897 * traffic. 2898 */ 2899 if (jstate == IB_MC_JSTATE_FULL) { 2900 /* 2901 * The following check is only relevant while coming 2902 * from the Tx completion path in the reap case. 
2903 */ 2904 if (!mce->mc_fullreap) 2905 return; 2906 mutex_enter(&state->id_mc_mutex); 2907 IBD_MCACHE_PULLOUT_FULL(state, mce); 2908 mutex_exit(&state->id_mc_mutex); 2909 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) 2910 do_detach = B_FALSE; 2911 } else if (jstate == IB_MC_JSTATE_NON) { 2912 IBD_MCACHE_PULLOUT_NON(state, mce); 2913 tmce = IBD_MCACHE_FIND_FULL(state, mgid); 2914 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) 2915 do_detach = B_FALSE; 2916 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */ 2917 mutex_enter(&state->id_mc_mutex); 2918 IBD_MCACHE_PULLOUT_FULL(state, mce); 2919 mutex_exit(&state->id_mc_mutex); 2920 do_detach = B_FALSE; 2921 } 2922 2923 /* 2924 * If we are reacting to a mcg trap and leaving our sendonly or 2925 * non membership, the mcg is possibly already gone, so attempting 2926 * to leave might fail. On the other hand, we must try to leave 2927 * anyway, since this might be a trap from long ago, and we could 2928 * have potentially sendonly joined to a recent incarnation of 2929 * the mcg and are about to loose track of this information. 2930 */ 2931 if (do_detach) { 2932 DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : " 2933 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); 2934 (void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info); 2935 } 2936 2937 (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate); 2938 kmem_free(mce, sizeof (ibd_mce_t)); 2939 } 2940 2941 /* 2942 * Async code executed due to multicast and promiscuous disable requests 2943 * and mcg trap handling; also executed during driver detach. Mostly, a 2944 * leave and detach is done; except for the fullmember case when Tx 2945 * requests are pending, whence arrangements are made for subsequent 2946 * cleanup on Tx completion. 2947 */ 2948 static void 2949 ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) 2950 { 2951 ipoib_mac_t mcmac; 2952 boolean_t recycled; 2953 ibd_mce_t *mce; 2954 2955 DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n", 2956 jstate, mgid.gid_prefix, mgid.gid_guid); 2957 2958 if (jstate == IB_MC_JSTATE_NON) { 2959 recycled = B_TRUE; 2960 mce = IBD_MCACHE_FIND_NON(state, mgid); 2961 /* 2962 * In case we are handling a mcg trap, we might not find 2963 * the mcg in the non list. 2964 */ 2965 if (mce == NULL) { 2966 return; 2967 } 2968 } else { 2969 mce = IBD_MCACHE_FIND_FULL(state, mgid); 2970 2971 /* 2972 * In case we are handling a mcg trap, make sure the trap 2973 * is not arriving late; if we have an mce that indicates 2974 * that we are already a fullmember, that would be a clear 2975 * indication that the trap arrived late (ie, is for a 2976 * previous incarnation of the mcg). 2977 */ 2978 if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) { 2979 if ((mce == NULL) || (mce->mc_jstate == 2980 IB_MC_JSTATE_FULL)) { 2981 return; 2982 } 2983 } else { 2984 ASSERT(jstate == IB_MC_JSTATE_FULL); 2985 2986 /* 2987 * If join group failed, mce will be NULL here. 2988 * This is because in GLDv3 driver, set multicast 2989 * will always return success. 2990 */ 2991 if (mce == NULL) { 2992 return; 2993 } 2994 2995 mce->mc_fullreap = B_TRUE; 2996 } 2997 2998 /* 2999 * If no pending Tx's remain that reference the AH 3000 * for the mcg, recycle it from active to free list. 
3001 * Else in the IB_MC_JSTATE_FULL case, just mark the AH, 3002 * so the last completing Tx will cause an async reap 3003 * operation to be invoked, at which time we will drop our 3004 * membership to the mcg so that the pending Tx's complete 3005 * successfully. Refer to comments on "AH and MCE active 3006 * list manipulation" at top of this file. The lock protects 3007 * against Tx fast path and Tx cleanup code. 3008 */ 3009 mutex_enter(&state->id_ac_mutex); 3010 ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid); 3011 recycled = ibd_acache_recycle(state, &mcmac, (jstate == 3012 IB_MC_JSTATE_SEND_ONLY_NON)); 3013 mutex_exit(&state->id_ac_mutex); 3014 } 3015 3016 if (recycled) { 3017 DPRINT(2, "ibd_leave_group : leave_group reaping : " 3018 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); 3019 ibd_async_reap_group(state, mce, mgid, jstate); 3020 } 3021 } 3022 3023 /* 3024 * Find the broadcast address as defined by IPoIB; implicitly 3025 * determines the IBA scope, mtu, tclass etc of the link the 3026 * interface is going to be a member of. 3027 */ 3028 static ibt_status_t 3029 ibd_find_bgroup(ibd_state_t *state) 3030 { 3031 ibt_mcg_attr_t mcg_attr; 3032 uint_t numg; 3033 uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL, 3034 IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL, 3035 IB_MC_SCOPE_GLOBAL }; 3036 int i, mcgmtu; 3037 boolean_t found = B_FALSE; 3038 int ret; 3039 ibt_mcg_info_t mcg_info; 3040 3041 state->id_bgroup_created = B_FALSE; 3042 state->id_bgroup_present = B_FALSE; 3043 3044 query_bcast_grp: 3045 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 3046 mcg_attr.mc_pkey = state->id_pkey; 3047 state->id_mgid.gid_guid = IB_MGID_IPV4_LOWGRP_MASK; 3048 3049 for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) { 3050 state->id_scope = mcg_attr.mc_scope = scopes[i]; 3051 3052 /* 3053 * Look for the IPoIB broadcast group. 3054 */ 3055 state->id_mgid.gid_prefix = 3056 (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) | 3057 ((uint64_t)state->id_scope << 48) | 3058 ((uint32_t)(state->id_pkey << 16))); 3059 mcg_attr.mc_mgid = state->id_mgid; 3060 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1, 3061 &state->id_mcinfo, &numg) == IBT_SUCCESS) { 3062 found = B_TRUE; 3063 break; 3064 } 3065 } 3066 3067 if (!found) { 3068 if (state->id_create_broadcast_group) { 3069 /* 3070 * If we created the broadcast group, but failed to 3071 * find it, we can't do anything except leave the 3072 * one we created and return failure. 3073 */ 3074 if (state->id_bgroup_created) { 3075 ibd_print_warn(state, "IPoIB broadcast group " 3076 "absent. 
Unable to query after create."); 3077 goto find_bgroup_fail; 3078 } 3079 3080 /* 3081 * Create the ipoib broadcast group if it didn't exist 3082 */ 3083 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 3084 mcg_attr.mc_qkey = IBD_DEFAULT_QKEY; 3085 mcg_attr.mc_join_state = IB_MC_JSTATE_FULL; 3086 mcg_attr.mc_scope = IB_MC_SCOPE_SUBNET_LOCAL; 3087 mcg_attr.mc_pkey = state->id_pkey; 3088 mcg_attr.mc_flow = 0; 3089 mcg_attr.mc_sl = 0; 3090 mcg_attr.mc_tclass = 0; 3091 state->id_mgid.gid_prefix = 3092 (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) | 3093 ((uint64_t)IB_MC_SCOPE_SUBNET_LOCAL << 48) | 3094 ((uint32_t)(state->id_pkey << 16))); 3095 mcg_attr.mc_mgid = state->id_mgid; 3096 3097 if ((ret = ibt_join_mcg(state->id_sgid, &mcg_attr, 3098 &mcg_info, NULL, NULL)) != IBT_SUCCESS) { 3099 ibd_print_warn(state, "IPoIB broadcast group " 3100 "absent, create failed: ret = %d\n", ret); 3101 state->id_bgroup_created = B_FALSE; 3102 return (IBT_FAILURE); 3103 } 3104 state->id_bgroup_created = B_TRUE; 3105 goto query_bcast_grp; 3106 } else { 3107 ibd_print_warn(state, "IPoIB broadcast group absent"); 3108 return (IBT_FAILURE); 3109 } 3110 } 3111 3112 /* 3113 * Assert that the mcg mtu <= id_mtu. Fill in updated id_mtu. 3114 */ 3115 mcgmtu = (128 << state->id_mcinfo->mc_mtu); 3116 if (state->id_mtu < mcgmtu) { 3117 ibd_print_warn(state, "IPoIB broadcast group MTU %d " 3118 "greater than port's maximum MTU %d", mcgmtu, 3119 state->id_mtu); 3120 ibt_free_mcg_info(state->id_mcinfo, 1); 3121 goto find_bgroup_fail; 3122 } 3123 state->id_mtu = mcgmtu; 3124 state->id_bgroup_present = B_TRUE; 3125 3126 return (IBT_SUCCESS); 3127 3128 find_bgroup_fail: 3129 if (state->id_bgroup_created) { 3130 (void) ibt_leave_mcg(state->id_sgid, 3131 mcg_info.mc_adds_vect.av_dgid, state->id_sgid, 3132 IB_MC_JSTATE_FULL); 3133 } 3134 3135 return (IBT_FAILURE); 3136 } 3137 3138 static int 3139 ibd_alloc_tx_copybufs(ibd_state_t *state) 3140 { 3141 ibt_mr_attr_t mem_attr; 3142 3143 /* 3144 * Allocate one big chunk for all regular tx copy bufs 3145 */ 3146 state->id_tx_buf_sz = state->id_mtu; 3147 if (state->id_lso_policy && state->id_lso_capable && 3148 (state->id_ud_tx_copy_thresh > state->id_mtu)) { 3149 state->id_tx_buf_sz = state->id_ud_tx_copy_thresh; 3150 } 3151 3152 state->id_tx_bufs = kmem_zalloc(state->id_ud_num_swqe * 3153 state->id_tx_buf_sz, KM_SLEEP); 3154 3155 state->id_tx_wqes = kmem_zalloc(state->id_ud_num_swqe * 3156 sizeof (ibd_swqe_t), KM_SLEEP); 3157 3158 /* 3159 * Do one memory registration on the entire txbuf area 3160 */ 3161 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_tx_bufs; 3162 mem_attr.mr_len = state->id_ud_num_swqe * state->id_tx_buf_sz; 3163 mem_attr.mr_as = NULL; 3164 mem_attr.mr_flags = IBT_MR_SLEEP; 3165 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 3166 &state->id_tx_mr_hdl, &state->id_tx_mr_desc) != IBT_SUCCESS) { 3167 DPRINT(10, "ibd_alloc_tx_copybufs: ibt_register_mr failed"); 3168 kmem_free(state->id_tx_wqes, 3169 state->id_ud_num_swqe * sizeof (ibd_swqe_t)); 3170 kmem_free(state->id_tx_bufs, 3171 state->id_ud_num_swqe * state->id_tx_buf_sz); 3172 state->id_tx_bufs = NULL; 3173 return (DDI_FAILURE); 3174 } 3175 3176 return (DDI_SUCCESS); 3177 } 3178 3179 static int 3180 ibd_alloc_tx_lsobufs(ibd_state_t *state) 3181 { 3182 ibt_mr_attr_t mem_attr; 3183 ibd_lsobuf_t *buflist; 3184 ibd_lsobuf_t *lbufp; 3185 ibd_lsobuf_t *tail; 3186 ibd_lsobkt_t *bktp; 3187 uint8_t *membase; 3188 uint8_t *memp; 3189 uint_t memsz; 3190 int i; 3191 3192 /* 3193 * Allocate the lso bucket 3194 */ 3195 
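/*
 * The bucket ties together: bkt_mem, one contiguous region of
 * id_num_lso_bufs * IBD_LSO_BUFSZ bytes registered under a single lkey;
 * bkt_bufl, a parallel array of ibd_lsobuf_t entries in permanent 1-1
 * correspondence with those buffers; and a free list threaded through
 * lb_next, with bkt_nfree tracking its length.
 */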
bktp = kmem_zalloc(sizeof (ibd_lsobkt_t), KM_SLEEP); 3196 3197 /* 3198 * Allocate the entire lso memory and register it 3199 */ 3200 memsz = state->id_num_lso_bufs * IBD_LSO_BUFSZ; 3201 membase = kmem_zalloc(memsz, KM_SLEEP); 3202 3203 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)membase; 3204 mem_attr.mr_len = memsz; 3205 mem_attr.mr_as = NULL; 3206 mem_attr.mr_flags = IBT_MR_SLEEP; 3207 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, 3208 &mem_attr, &bktp->bkt_mr_hdl, &bktp->bkt_mr_desc) != IBT_SUCCESS) { 3209 DPRINT(10, "ibd_alloc_tx_lsobufs: ibt_register_mr failed"); 3210 kmem_free(membase, memsz); 3211 kmem_free(bktp, sizeof (ibd_lsobkt_t)); 3212 return (DDI_FAILURE); 3213 } 3214 3215 mutex_enter(&state->id_lso_lock); 3216 3217 /* 3218 * Now allocate the buflist. Note that the elements in the buflist and 3219 * the buffers in the lso memory have a permanent 1-1 relation, so we 3220 * can always derive the address of a buflist entry from the address of 3221 * an lso buffer. 3222 */ 3223 buflist = kmem_zalloc(state->id_num_lso_bufs * sizeof (ibd_lsobuf_t), 3224 KM_SLEEP); 3225 3226 /* 3227 * Set up the lso buf chain 3228 */ 3229 memp = membase; 3230 lbufp = buflist; 3231 for (i = 0; i < state->id_num_lso_bufs; i++) { 3232 lbufp->lb_isfree = 1; 3233 lbufp->lb_buf = memp; 3234 lbufp->lb_next = lbufp + 1; 3235 3236 tail = lbufp; 3237 3238 memp += IBD_LSO_BUFSZ; 3239 lbufp++; 3240 } 3241 tail->lb_next = NULL; 3242 3243 /* 3244 * Set up the LSO buffer information in ibd state 3245 */ 3246 bktp->bkt_bufl = buflist; 3247 bktp->bkt_free_head = buflist; 3248 bktp->bkt_mem = membase; 3249 bktp->bkt_nelem = state->id_num_lso_bufs; 3250 bktp->bkt_nfree = bktp->bkt_nelem; 3251 3252 state->id_lso = bktp; 3253 mutex_exit(&state->id_lso_lock); 3254 3255 return (DDI_SUCCESS); 3256 } 3257 3258 /* 3259 * Statically allocate Tx buffer list(s). 
3260 */ 3261 static int 3262 ibd_init_txlist(ibd_state_t *state) 3263 { 3264 ibd_swqe_t *swqe; 3265 ibt_lkey_t lkey; 3266 int i; 3267 uint_t len; 3268 uint8_t *bufaddr; 3269 3270 if (ibd_alloc_tx_copybufs(state) != DDI_SUCCESS) 3271 return (DDI_FAILURE); 3272 3273 if (state->id_lso_policy && state->id_lso_capable) { 3274 if (ibd_alloc_tx_lsobufs(state) != DDI_SUCCESS) 3275 state->id_lso_capable = B_FALSE; 3276 } 3277 3278 mutex_enter(&state->id_tx_list.dl_mutex); 3279 state->id_tx_list.dl_head = NULL; 3280 state->id_tx_list.dl_pending_sends = B_FALSE; 3281 state->id_tx_list.dl_cnt = 0; 3282 mutex_exit(&state->id_tx_list.dl_mutex); 3283 mutex_enter(&state->id_tx_rel_list.dl_mutex); 3284 state->id_tx_rel_list.dl_head = NULL; 3285 state->id_tx_rel_list.dl_pending_sends = B_FALSE; 3286 state->id_tx_rel_list.dl_cnt = 0; 3287 mutex_exit(&state->id_tx_rel_list.dl_mutex); 3288 3289 /* 3290 * Allocate and setup the swqe list 3291 */ 3292 lkey = state->id_tx_mr_desc.md_lkey; 3293 bufaddr = state->id_tx_bufs; 3294 len = state->id_tx_buf_sz; 3295 swqe = state->id_tx_wqes; 3296 mutex_enter(&state->id_tx_list.dl_mutex); 3297 for (i = 0; i < state->id_ud_num_swqe; i++, swqe++, bufaddr += len) { 3298 swqe->swqe_next = NULL; 3299 swqe->swqe_im_mblk = NULL; 3300 3301 swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t) 3302 bufaddr; 3303 swqe->swqe_copybuf.ic_sgl.ds_key = lkey; 3304 swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */ 3305 3306 swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe; 3307 swqe->w_swr.wr_flags = IBT_WR_NO_FLAGS; 3308 swqe->w_swr.wr_trans = IBT_UD_SRV; 3309 3310 /* These are set in send */ 3311 swqe->w_swr.wr_nds = 0; 3312 swqe->w_swr.wr_sgl = NULL; 3313 swqe->w_swr.wr_opcode = IBT_WRC_SEND; 3314 3315 /* add to list */ 3316 state->id_tx_list.dl_cnt++; 3317 swqe->swqe_next = state->id_tx_list.dl_head; 3318 state->id_tx_list.dl_head = SWQE_TO_WQE(swqe); 3319 } 3320 mutex_exit(&state->id_tx_list.dl_mutex); 3321 3322 return (DDI_SUCCESS); 3323 } 3324 3325 static int 3326 ibd_acquire_lsobufs(ibd_state_t *state, uint_t req_sz, ibt_wr_ds_t *sgl_p, 3327 uint32_t *nds_p) 3328 { 3329 ibd_lsobkt_t *bktp; 3330 ibd_lsobuf_t *lbufp; 3331 ibd_lsobuf_t *nextp; 3332 ibt_lkey_t lso_lkey; 3333 uint_t frag_sz; 3334 uint_t num_needed; 3335 int i; 3336 3337 ASSERT(sgl_p != NULL); 3338 ASSERT(nds_p != NULL); 3339 ASSERT(req_sz != 0); 3340 3341 /* 3342 * Determine how many bufs we'd need for the size requested 3343 */ 3344 num_needed = req_sz / IBD_LSO_BUFSZ; 3345 if ((frag_sz = req_sz % IBD_LSO_BUFSZ) != 0) 3346 num_needed++; 3347 3348 mutex_enter(&state->id_lso_lock); 3349 3350 /* 3351 * If we don't have enough lso bufs, return failure 3352 */ 3353 ASSERT(state->id_lso != NULL); 3354 bktp = state->id_lso; 3355 if (bktp->bkt_nfree < num_needed) { 3356 mutex_exit(&state->id_lso_lock); 3357 return (-1); 3358 } 3359 3360 /* 3361 * Pick the first 'num_needed' bufs from the free list 3362 */ 3363 lso_lkey = bktp->bkt_mr_desc.md_lkey; 3364 lbufp = bktp->bkt_free_head; 3365 for (i = 0; i < num_needed; i++) { 3366 ASSERT(lbufp->lb_isfree != 0); 3367 ASSERT(lbufp->lb_buf != NULL); 3368 3369 nextp = lbufp->lb_next; 3370 3371 sgl_p[i].ds_va = (ib_vaddr_t)(uintptr_t)lbufp->lb_buf; 3372 sgl_p[i].ds_key = lso_lkey; 3373 sgl_p[i].ds_len = IBD_LSO_BUFSZ; 3374 3375 lbufp->lb_isfree = 0; 3376 lbufp->lb_next = NULL; 3377 3378 lbufp = nextp; 3379 } 3380 bktp->bkt_free_head = lbufp; 3381 3382 /* 3383 * If the requested size is not a multiple of IBD_LSO_BUFSZ, we need 3384 * to adjust the last sgl entry's length. 
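 * For example (illustrative numbers, assuming IBD_LSO_BUFSZ were 8192): a
 * req_sz of 20000 gives num_needed = 3 and frag_sz = 3616, so the third
 * sgl entry's ds_len is trimmed from 8192 down to 3616.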
Since we know we need atleast 3385 * one, the i-1 use below is ok. 3386 */ 3387 if (frag_sz) { 3388 sgl_p[i-1].ds_len = frag_sz; 3389 } 3390 3391 /* 3392 * Update nfree count and return 3393 */ 3394 bktp->bkt_nfree -= num_needed; 3395 3396 mutex_exit(&state->id_lso_lock); 3397 3398 *nds_p = num_needed; 3399 3400 return (0); 3401 } 3402 3403 static void 3404 ibd_release_lsobufs(ibd_state_t *state, ibt_wr_ds_t *sgl_p, uint32_t nds) 3405 { 3406 ibd_lsobkt_t *bktp; 3407 ibd_lsobuf_t *lbufp; 3408 uint8_t *lso_mem_end; 3409 uint_t ndx; 3410 int i; 3411 3412 mutex_enter(&state->id_lso_lock); 3413 3414 bktp = state->id_lso; 3415 ASSERT(bktp != NULL); 3416 3417 lso_mem_end = bktp->bkt_mem + bktp->bkt_nelem * IBD_LSO_BUFSZ; 3418 for (i = 0; i < nds; i++) { 3419 uint8_t *va; 3420 3421 va = (uint8_t *)(uintptr_t)sgl_p[i].ds_va; 3422 ASSERT(va >= bktp->bkt_mem && va < lso_mem_end); 3423 3424 /* 3425 * Figure out the buflist element this sgl buffer corresponds 3426 * to and put it back at the head 3427 */ 3428 ndx = (va - bktp->bkt_mem) / IBD_LSO_BUFSZ; 3429 lbufp = bktp->bkt_bufl + ndx; 3430 3431 ASSERT(lbufp->lb_isfree == 0); 3432 ASSERT(lbufp->lb_buf == va); 3433 3434 lbufp->lb_isfree = 1; 3435 lbufp->lb_next = bktp->bkt_free_head; 3436 bktp->bkt_free_head = lbufp; 3437 } 3438 bktp->bkt_nfree += nds; 3439 3440 mutex_exit(&state->id_lso_lock); 3441 } 3442 3443 static void 3444 ibd_free_tx_copybufs(ibd_state_t *state) 3445 { 3446 /* 3447 * Unregister txbuf mr 3448 */ 3449 if (ibt_deregister_mr(state->id_hca_hdl, 3450 state->id_tx_mr_hdl) != IBT_SUCCESS) { 3451 DPRINT(10, "ibd_free_tx_copybufs: ibt_deregister_mr failed"); 3452 } 3453 state->id_tx_mr_hdl = NULL; 3454 3455 /* 3456 * Free txbuf memory 3457 */ 3458 kmem_free(state->id_tx_wqes, state->id_ud_num_swqe * 3459 sizeof (ibd_swqe_t)); 3460 kmem_free(state->id_tx_bufs, state->id_ud_num_swqe * 3461 state->id_tx_buf_sz); 3462 state->id_tx_wqes = NULL; 3463 state->id_tx_bufs = NULL; 3464 } 3465 3466 static void 3467 ibd_free_tx_lsobufs(ibd_state_t *state) 3468 { 3469 ibd_lsobkt_t *bktp; 3470 3471 mutex_enter(&state->id_lso_lock); 3472 3473 if ((bktp = state->id_lso) == NULL) { 3474 mutex_exit(&state->id_lso_lock); 3475 return; 3476 } 3477 3478 /* 3479 * First, free the buflist 3480 */ 3481 ASSERT(bktp->bkt_bufl != NULL); 3482 kmem_free(bktp->bkt_bufl, bktp->bkt_nelem * sizeof (ibd_lsobuf_t)); 3483 3484 /* 3485 * Unregister the LSO memory and free it 3486 */ 3487 ASSERT(bktp->bkt_mr_hdl != NULL); 3488 if (ibt_deregister_mr(state->id_hca_hdl, 3489 bktp->bkt_mr_hdl) != IBT_SUCCESS) { 3490 DPRINT(10, 3491 "ibd_free_lsobufs: ibt_deregister_mr failed"); 3492 } 3493 ASSERT(bktp->bkt_mem); 3494 kmem_free(bktp->bkt_mem, bktp->bkt_nelem * IBD_LSO_BUFSZ); 3495 3496 /* 3497 * Finally free the bucket 3498 */ 3499 kmem_free(bktp, sizeof (ibd_lsobkt_t)); 3500 state->id_lso = NULL; 3501 3502 mutex_exit(&state->id_lso_lock); 3503 } 3504 3505 /* 3506 * Free the statically allocated Tx buffer list. 
3507 */ 3508 static void 3509 ibd_fini_txlist(ibd_state_t *state) 3510 { 3511 /* 3512 * Free the allocated swqes 3513 */ 3514 mutex_enter(&state->id_tx_list.dl_mutex); 3515 mutex_enter(&state->id_tx_rel_list.dl_mutex); 3516 state->id_tx_list.dl_head = NULL; 3517 state->id_tx_list.dl_pending_sends = B_FALSE; 3518 state->id_tx_list.dl_cnt = 0; 3519 state->id_tx_rel_list.dl_head = NULL; 3520 state->id_tx_rel_list.dl_pending_sends = B_FALSE; 3521 state->id_tx_rel_list.dl_cnt = 0; 3522 mutex_exit(&state->id_tx_rel_list.dl_mutex); 3523 mutex_exit(&state->id_tx_list.dl_mutex); 3524 3525 ibd_free_tx_lsobufs(state); 3526 ibd_free_tx_copybufs(state); 3527 } 3528 3529 /* 3530 * Post a NULL-terminated list of rwqes. 3531 */ 3532 static void 3533 ibd_post_recv_list(ibd_state_t *state, ibd_rwqe_t *rwqe) 3534 { 3535 uint_t i; 3536 uint_t num_posted; 3537 ibt_status_t ibt_status; 3538 ibt_recv_wr_t wrs[IBD_RX_POST_CNT]; 3539 3540 while (rwqe) { 3541 /* Post up to IBD_RX_POST_CNT receive work requests */ 3542 for (i = 0; i < IBD_RX_POST_CNT; i++) { 3543 wrs[i] = rwqe->w_rwr; 3544 rwqe = WQE_TO_RWQE(rwqe->rwqe_next); 3545 if (rwqe == NULL) { 3546 i++; 3547 break; 3548 } 3549 } 3550 3551 /* 3552 * If posting fails for some reason, we'll never receive a 3553 * completion notification, so we'll need to clean up. But 3554 * we need to make sure we don't clean up nodes whose 3555 * wrs have been successfully posted. We assume that the 3556 * hca driver returns on the first failure to post and 3557 * therefore the first 'num_posted' entries don't need 3558 * cleanup here. 3559 */ 3560 atomic_add_32(&state->id_rx_list.dl_cnt, i); 3561 3562 num_posted = 0; 3563 ibt_status = ibt_post_recv(state->id_chnl_hdl, wrs, i, 3564 &num_posted); 3565 if (ibt_status != IBT_SUCCESS) { 3566 /* This cannot happen unless the device has an error. */ 3567 ibd_print_warn(state, "ibd_post_recv: FATAL: " 3568 "posting multiple wrs failed: " 3569 "requested=%d, done=%d, ret=%d", 3570 IBD_RX_POST_CNT, num_posted, ibt_status); 3571 atomic_add_32(&state->id_rx_list.dl_cnt, 3572 num_posted - i); 3573 } 3574 } 3575 } 3576 3577 /* 3578 * Grab a list of rwqes from the array of lists, and post the list. 3579 */ 3580 static void 3581 ibd_post_recv_intr(ibd_state_t *state) 3582 { 3583 ibd_rx_queue_t *rxp; 3584 ibd_rwqe_t *list; 3585 3586 /* rotate through the rx_queue array, expecting an adequate number */ 3587 state->id_rx_post_queue_index = 3588 (state->id_rx_post_queue_index + 1) & 3589 (state->id_rx_nqueues - 1); 3590 3591 rxp = state->id_rx_queues + state->id_rx_post_queue_index; 3592 mutex_enter(&rxp->rx_post_lock); 3593 list = WQE_TO_RWQE(rxp->rx_head); 3594 rxp->rx_head = NULL; 3595 rxp->rx_cnt = 0; 3596 mutex_exit(&rxp->rx_post_lock); 3597 ibd_post_recv_list(state, list); 3598 } 3599 3600 /* macro explained below */ 3601 #define RX_QUEUE_HASH(rwqe) \ 3602 (((uintptr_t)(rwqe) >> 8) & (state->id_rx_nqueues - 1)) 3603 3604 /* 3605 * Add an rwqe to one of the Rx lists. If the list is large enough 3606 * (exactly IBD_RX_POST_CNT), post the list to the hardware. 3607 * 3608 * Note: one of 2^N lists is chosen via a hash. This is done 3609 * because a single list would be a point of contention. If the first list 3610 * is busy (mutex_tryenter fails), use a second list (just call mutex_enter). 3611 * 3612 * The shift by 8 in RX_QUEUE_HASH is an arbitrary choice that provides 3613 * an even distribution of rwqes across the 2^N queues.
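 * For example (hypothetical address, assuming id_rx_nqueues were 8): an
 * rwqe at 0xffffff0012345a00 maps to ((0xffffff0012345a00 >> 8) & 7) = 2,
 * i.e. queue 2; the retry in ibd_post_recv() hashes "rwqe + 16" so that a
 * busy queue can be sidestepped.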
3614 */ 3615 static void 3616 ibd_post_recv(ibd_state_t *state, ibd_rwqe_t *rwqe) 3617 { 3618 ibd_rx_queue_t *rxp; 3619 3620 rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe); 3621 3622 if (!mutex_tryenter(&rxp->rx_post_lock)) { 3623 /* Failed. Try a different queue ("ptr + 16" ensures that). */ 3624 rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe + 16); 3625 mutex_enter(&rxp->rx_post_lock); 3626 } 3627 rwqe->rwqe_next = rxp->rx_head; 3628 if (++rxp->rx_cnt >= IBD_RX_POST_CNT - 2) { 3629 uint_t active = atomic_inc_32_nv(&state->id_rx_post_active); 3630 3631 /* only call ibt_post_recv() every Nth time through here */ 3632 if ((active & (state->id_rx_nqueues - 1)) == 0) { 3633 rxp->rx_head = NULL; 3634 rxp->rx_cnt = 0; 3635 mutex_exit(&rxp->rx_post_lock); 3636 ibd_post_recv_list(state, rwqe); 3637 return; 3638 } 3639 } 3640 rxp->rx_head = RWQE_TO_WQE(rwqe); 3641 mutex_exit(&rxp->rx_post_lock); 3642 } 3643 3644 static int 3645 ibd_alloc_rx_copybufs(ibd_state_t *state) 3646 { 3647 ibt_mr_attr_t mem_attr; 3648 int i; 3649 3650 /* 3651 * Allocate one big chunk for all regular rx copy bufs 3652 */ 3653 state->id_rx_buf_sz = state->id_mtu + IPOIB_GRH_SIZE; 3654 3655 state->id_rx_bufs = kmem_zalloc(state->id_ud_num_rwqe * 3656 state->id_rx_buf_sz, KM_SLEEP); 3657 3658 state->id_rx_wqes = kmem_zalloc(state->id_ud_num_rwqe * 3659 sizeof (ibd_rwqe_t), KM_SLEEP); 3660 3661 state->id_rx_nqueues = 1 << IBD_LOG_RX_POST; 3662 state->id_rx_queues = kmem_zalloc(state->id_rx_nqueues * 3663 sizeof (ibd_rx_queue_t), KM_SLEEP); 3664 for (i = 0; i < state->id_rx_nqueues; i++) { 3665 ibd_rx_queue_t *rxp = state->id_rx_queues + i; 3666 mutex_init(&rxp->rx_post_lock, NULL, MUTEX_DRIVER, NULL); 3667 } 3668 3669 /* 3670 * Do one memory registration on the entire rxbuf area 3671 */ 3672 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_rx_bufs; 3673 mem_attr.mr_len = state->id_ud_num_rwqe * state->id_rx_buf_sz; 3674 mem_attr.mr_as = NULL; 3675 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3676 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 3677 &state->id_rx_mr_hdl, &state->id_rx_mr_desc) != IBT_SUCCESS) { 3678 DPRINT(10, "ibd_alloc_rx_copybufs: ibt_register_mr failed"); 3679 kmem_free(state->id_rx_wqes, 3680 state->id_ud_num_rwqe * sizeof (ibd_rwqe_t)); 3681 kmem_free(state->id_rx_bufs, 3682 state->id_ud_num_rwqe * state->id_rx_buf_sz); 3683 state->id_rx_bufs = NULL; 3684 state->id_rx_wqes = NULL; 3685 return (DDI_FAILURE); 3686 } 3687 3688 return (DDI_SUCCESS); 3689 } 3690 3691 /* 3692 * Allocate the statically allocated Rx buffer list. 3693 */ 3694 static int 3695 ibd_init_rxlist(ibd_state_t *state) 3696 { 3697 ibd_rwqe_t *rwqe, *next; 3698 ibd_wqe_t *list; 3699 ibt_lkey_t lkey; 3700 int i; 3701 uint_t len; 3702 uint8_t *bufaddr; 3703 3704 mutex_enter(&state->id_rx_free_list.dl_mutex); 3705 if (state->id_rx_free_list.dl_head != NULL) { 3706 /* rx rsrcs were never freed. 
Just repost them */ 3707 len = state->id_rx_buf_sz; 3708 list = state->id_rx_free_list.dl_head; 3709 state->id_rx_free_list.dl_head = NULL; 3710 state->id_rx_free_list.dl_cnt = 0; 3711 mutex_exit(&state->id_rx_free_list.dl_mutex); 3712 for (rwqe = WQE_TO_RWQE(list); rwqe != NULL; 3713 rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) { 3714 if ((rwqe->rwqe_im_mblk = desballoc( 3715 rwqe->rwqe_copybuf.ic_bufaddr, len, 0, 3716 &rwqe->w_freemsg_cb)) == NULL) { 3717 /* allow freemsg_cb to free the rwqes */ 3718 if (atomic_dec_32_nv(&state->id_running) != 0) { 3719 cmn_err(CE_WARN, "ibd_init_rxlist: " 3720 "id_running was not 1\n"); 3721 } 3722 DPRINT(10, "ibd_init_rxlist : " 3723 "failed in desballoc()"); 3724 for (rwqe = WQE_TO_RWQE(list); rwqe != NULL; 3725 rwqe = next) { 3726 next = WQE_TO_RWQE(rwqe->rwqe_next); 3727 if (rwqe->rwqe_im_mblk) { 3728 atomic_inc_32(&state-> 3729 id_rx_list. 3730 dl_bufs_outstanding); 3731 freemsg(rwqe->rwqe_im_mblk); 3732 } else 3733 ibd_free_rwqe(state, rwqe); 3734 } 3735 atomic_inc_32(&state->id_running); 3736 return (DDI_FAILURE); 3737 } 3738 } 3739 ibd_post_recv_list(state, WQE_TO_RWQE(list)); 3740 return (DDI_SUCCESS); 3741 } 3742 mutex_exit(&state->id_rx_free_list.dl_mutex); 3743 3744 if (ibd_alloc_rx_copybufs(state) != DDI_SUCCESS) 3745 return (DDI_FAILURE); 3746 3747 /* 3748 * Allocate and setup the rwqe list 3749 */ 3750 len = state->id_rx_buf_sz; 3751 lkey = state->id_rx_mr_desc.md_lkey; 3752 rwqe = state->id_rx_wqes; 3753 bufaddr = state->id_rx_bufs; 3754 list = NULL; 3755 for (i = 0; i < state->id_ud_num_rwqe; i++, rwqe++, bufaddr += len) { 3756 rwqe->w_state = state; 3757 rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb; 3758 rwqe->w_freemsg_cb.free_arg = (char *)rwqe; 3759 3760 rwqe->rwqe_copybuf.ic_bufaddr = bufaddr; 3761 3762 if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0, 3763 &rwqe->w_freemsg_cb)) == NULL) { 3764 DPRINT(10, "ibd_init_rxlist : failed in desballoc()"); 3765 /* allow freemsg_cb to free the rwqes */ 3766 if (atomic_dec_32_nv(&state->id_running) != 0) { 3767 cmn_err(CE_WARN, "ibd_init_rxlist: " 3768 "id_running was not 1\n"); 3769 } 3770 DPRINT(10, "ibd_init_rxlist : " 3771 "failed in desballoc()"); 3772 for (rwqe = WQE_TO_RWQE(list); rwqe != NULL; 3773 rwqe = next) { 3774 next = WQE_TO_RWQE(rwqe->rwqe_next); 3775 freemsg(rwqe->rwqe_im_mblk); 3776 } 3777 atomic_inc_32(&state->id_running); 3778 3779 /* remove reference to free'd rwqes */ 3780 mutex_enter(&state->id_rx_free_list.dl_mutex); 3781 state->id_rx_free_list.dl_head = NULL; 3782 state->id_rx_free_list.dl_cnt = 0; 3783 mutex_exit(&state->id_rx_free_list.dl_mutex); 3784 3785 ibd_fini_rxlist(state); 3786 return (DDI_FAILURE); 3787 } 3788 3789 rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey; 3790 rwqe->rwqe_copybuf.ic_sgl.ds_va = 3791 (ib_vaddr_t)(uintptr_t)bufaddr; 3792 rwqe->rwqe_copybuf.ic_sgl.ds_len = len; 3793 rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe; 3794 rwqe->w_rwr.wr_nds = 1; 3795 rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl; 3796 3797 rwqe->rwqe_next = list; 3798 list = RWQE_TO_WQE(rwqe); 3799 } 3800 ibd_post_recv_list(state, WQE_TO_RWQE(list)); 3801 3802 return (DDI_SUCCESS); 3803 } 3804 3805 static void 3806 ibd_free_rx_copybufs(ibd_state_t *state) 3807 { 3808 int i; 3809 3810 /* 3811 * Unregister rxbuf mr 3812 */ 3813 if (ibt_deregister_mr(state->id_hca_hdl, 3814 state->id_rx_mr_hdl) != IBT_SUCCESS) { 3815 DPRINT(10, "ibd_free_rx_copybufs: ibt_deregister_mr failed"); 3816 } 3817 state->id_rx_mr_hdl = NULL; 3818 3819 /* 3820 * Free rxbuf memory 3821 */ 3822 for (i = 0; i < 
state->id_rx_nqueues; i++) { 3823 ibd_rx_queue_t *rxp = state->id_rx_queues + i; 3824 mutex_destroy(&rxp->rx_post_lock); 3825 } 3826 kmem_free(state->id_rx_queues, state->id_rx_nqueues * 3827 sizeof (ibd_rx_queue_t)); 3828 kmem_free(state->id_rx_wqes, state->id_ud_num_rwqe * 3829 sizeof (ibd_rwqe_t)); 3830 kmem_free(state->id_rx_bufs, state->id_ud_num_rwqe * 3831 state->id_rx_buf_sz); 3832 state->id_rx_queues = NULL; 3833 state->id_rx_wqes = NULL; 3834 state->id_rx_bufs = NULL; 3835 } 3836 3837 static void 3838 ibd_free_rx_rsrcs(ibd_state_t *state) 3839 { 3840 mutex_enter(&state->id_rx_free_list.dl_mutex); 3841 if (state->id_rx_free_list.dl_head == NULL) { 3842 /* already freed */ 3843 mutex_exit(&state->id_rx_free_list.dl_mutex); 3844 return; 3845 } 3846 ASSERT(state->id_rx_free_list.dl_cnt == state->id_ud_num_rwqe); 3847 ibd_free_rx_copybufs(state); 3848 state->id_rx_free_list.dl_cnt = 0; 3849 state->id_rx_free_list.dl_head = NULL; 3850 mutex_exit(&state->id_rx_free_list.dl_mutex); 3851 } 3852 3853 /* 3854 * Free the statically allocated Rx buffer list. 3855 */ 3856 static void 3857 ibd_fini_rxlist(ibd_state_t *state) 3858 { 3859 ibd_rwqe_t *rwqe; 3860 int i; 3861 3862 /* run through the rx_queue's, calling freemsg() */ 3863 for (i = 0; i < state->id_rx_nqueues; i++) { 3864 ibd_rx_queue_t *rxp = state->id_rx_queues + i; 3865 mutex_enter(&rxp->rx_post_lock); 3866 for (rwqe = WQE_TO_RWQE(rxp->rx_head); rwqe; 3867 rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) { 3868 freemsg(rwqe->rwqe_im_mblk); 3869 rxp->rx_cnt--; 3870 } 3871 rxp->rx_head = NULL; 3872 mutex_exit(&rxp->rx_post_lock); 3873 } 3874 3875 /* cannot free rx resources unless gld returned everything */ 3876 if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) == 0) 3877 ibd_free_rx_rsrcs(state); 3878 } 3879 3880 /* 3881 * Free an allocated recv wqe. 3882 */ 3883 /* ARGSUSED */ 3884 static void 3885 ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe) 3886 { 3887 /* 3888 * desballoc() failed (no memory). 3889 * 3890 * This rwqe is placed on a free list so that it 3891 * can be reinstated when memory is available. 3892 * 3893 * NOTE: no code currently exists to reinstate 3894 * these "lost" rwqes. 3895 */ 3896 mutex_enter(&state->id_rx_free_list.dl_mutex); 3897 state->id_rx_free_list.dl_cnt++; 3898 rwqe->rwqe_next = state->id_rx_free_list.dl_head; 3899 state->id_rx_free_list.dl_head = RWQE_TO_WQE(rwqe); 3900 mutex_exit(&state->id_rx_free_list.dl_mutex); 3901 } 3902 3903 /* 3904 * IBA Rx completion queue handler. Guaranteed to be single 3905 * threaded and nonreentrant for this CQ. 3906 */ 3907 /* ARGSUSED */ 3908 static void 3909 ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 3910 { 3911 ibd_state_t *state = (ibd_state_t *)arg; 3912 3913 atomic_inc_64(&state->id_num_intrs); 3914 3915 if (ibd_rx_softintr == 1) { 3916 mutex_enter(&state->id_rcq_poll_lock); 3917 if (state->id_rcq_poll_busy & IBD_CQ_POLLING) { 3918 state->id_rcq_poll_busy |= IBD_REDO_CQ_POLLING; 3919 mutex_exit(&state->id_rcq_poll_lock); 3920 return; 3921 } else { 3922 mutex_exit(&state->id_rcq_poll_lock); 3923 ddi_trigger_softintr(state->id_rx); 3924 } 3925 } else 3926 (void) ibd_intr((caddr_t)state); 3927 } 3928 3929 /* 3930 * CQ handler for Tx completions, when the Tx CQ is in 3931 * interrupt driven mode. 
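 *
 * Like ibd_rcq_handler() above, the handler below uses the
 * id_scq_poll_busy flags so that a completion arriving while a poll is
 * already in progress only sets IBD_REDO_CQ_POLLING (telling the poller
 * to make another pass) instead of triggering another softintr.
 * Roughly:
 *
 *	if (busy & IBD_CQ_POLLING)
 *		busy |= IBD_REDO_CQ_POLLING;	poller loops once more
 *	else
 *		ddi_trigger_softintr(state->id_tx);
 *
 * and when ibd_tx_softintr is not set, ibd_tx_recycle() is called
 * directly instead.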
3932 */ 3933 /* ARGSUSED */ 3934 static void 3935 ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 3936 { 3937 ibd_state_t *state = (ibd_state_t *)arg; 3938 3939 atomic_inc_64(&state->id_num_intrs); 3940 3941 if (ibd_tx_softintr == 1) { 3942 mutex_enter(&state->id_scq_poll_lock); 3943 if (state->id_scq_poll_busy & IBD_CQ_POLLING) { 3944 state->id_scq_poll_busy |= IBD_REDO_CQ_POLLING; 3945 mutex_exit(&state->id_scq_poll_lock); 3946 return; 3947 } else { 3948 mutex_exit(&state->id_scq_poll_lock); 3949 ddi_trigger_softintr(state->id_tx); 3950 } 3951 } else 3952 (void) ibd_tx_recycle((caddr_t)state); 3953 } 3954 3955 /* 3956 * Multicast group create/delete trap handler. These will be delivered 3957 * on a kernel thread (handling can thus block) and can be invoked 3958 * concurrently. The handler can be invoked anytime after it is 3959 * registered and before ibt_detach(). 3960 */ 3961 /* ARGSUSED */ 3962 static void 3963 ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code, 3964 ibt_subnet_event_t *event) 3965 { 3966 ibd_state_t *state = (ibd_state_t *)arg; 3967 ibd_req_t *req; 3968 3969 /* 3970 * The trap handler will get invoked once for every event for 3971 * every port. The input "gid" is the GID0 of the port the 3972 * trap came in on; we just need to act on traps that came 3973 * to our port, meaning the port on which the ipoib interface 3974 * resides. Since ipoib uses GID0 of the port, we just match 3975 * the gids to check whether we need to handle the trap. 3976 */ 3977 if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0) 3978 return; 3979 3980 DPRINT(10, "ibd_notices_handler : %d\n", code); 3981 3982 switch (code) { 3983 case IBT_SM_EVENT_UNAVAILABLE: 3984 /* 3985 * If we are in promiscuous mode or have 3986 * sendnonmembers, we need to print a warning 3987 * message right now. Else, just store the 3988 * information, print when we enter promiscuous 3989 * mode or attempt nonmember send. We might 3990 * also want to stop caching sendnonmember. 3991 */ 3992 ibd_print_warn(state, "IBA multicast support " 3993 "degraded due to unavailability of multicast " 3994 "traps"); 3995 break; 3996 case IBT_SM_EVENT_AVAILABLE: 3997 /* 3998 * If we printed a warning message above or 3999 * while trying to nonmember send or get into 4000 * promiscuous mode, print an okay message. 4001 */ 4002 ibd_print_warn(state, "IBA multicast support " 4003 "restored due to availability of multicast " 4004 "traps"); 4005 break; 4006 case IBT_SM_EVENT_MCG_CREATED: 4007 case IBT_SM_EVENT_MCG_DELETED: 4008 /* 4009 * If it is a "deleted" event and we are in late hca 4010 * init, nothing to do. 4011 */ 4012 if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) == 4013 IBD_DRV_IN_LATE_HCA_INIT) && (code == 4014 IBT_SM_EVENT_MCG_DELETED)) { 4015 break; 4016 } 4017 /* 4018 * Common processing of creation/deletion traps. 4019 * First check if the instance is being 4020 * [de]initialized; back off then, without doing 4021 * anything more, since we are not sure if the 4022 * async thread is around, or whether we might 4023 * be racing with the detach code in ibd_m_stop() 4024 * that scans the mcg list. 
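 *
 * ibd_async_safe() and ibd_async_done() pair with the id_trap_stop /
 * id_trap_inprog handshake that ibd_undo_start() waits on during
 * teardown; a rough sketch of the expected pairing (not a quote of the
 * actual routines) is
 *
 *	ibd_async_safe():  under id_trap_lock, refuse if id_trap_stop is
 *	                   set, otherwise id_trap_inprog++
 *	ibd_async_done():  under id_trap_lock, id_trap_inprog-- and
 *	                   cv_signal(&state->id_trap_cv) once it drops
 *	                   to zero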
4025 */ 4026 if (!ibd_async_safe(state)) 4027 return; 4028 4029 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); 4030 req->rq_gid = event->sm_notice_gid; 4031 req->rq_ptr = (void *)code; 4032 ibd_queue_work_slot(state, req, IBD_ASYNC_TRAP); 4033 break; 4034 } 4035 } 4036 4037 static void 4038 ibd_async_trap(ibd_state_t *state, ibd_req_t *req) 4039 { 4040 ib_gid_t mgid = req->rq_gid; 4041 ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr; 4042 int ret; 4043 ib_pkey_t pkey = (mgid.gid_prefix >> 16) & 0xffff; 4044 4045 DPRINT(10, "ibd_async_trap : %d\n", code); 4046 4047 /* 4048 * Check if we have already joined the IPoIB broadcast group for our 4049 * PKEY. If joined, perform the rest of the operation. 4050 * Else, the interface is not initialised. Do the initialisation here 4051 * by calling ibd_start() and return. 4052 */ 4053 4054 if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) == 4055 IBD_DRV_IN_LATE_HCA_INIT) && (state->id_bgroup_present == 0) && 4056 (code == IBT_SM_EVENT_MCG_CREATED)) { 4057 /* 4058 * If we are in late HCA init and a notification for the 4059 * creation of a MCG came in, check if it is the IPoIB MCG for 4060 * this pkey. If not, return. 4061 */ 4062 if ((mgid.gid_guid != IB_MGID_IPV4_LOWGRP_MASK) || (pkey != 4063 state->id_pkey)) { 4064 ibd_async_done(state); 4065 return; 4066 } 4067 ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS); 4068 /* 4069 * Check if there is still a necessity to start the interface. 4070 * It is possible that the user attempted unplumb at just about 4071 * the same time, and if unplumb succeeded, we have nothing to 4072 * do. 4073 */ 4074 if (((state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) == 4075 IBD_DRV_IN_LATE_HCA_INIT) && 4076 ((ret = ibd_start(state)) != 0)) { 4077 DPRINT(10, "ibd_async_trap: cannot start from late HCA " 4078 "init, ret=%d", ret); 4079 } 4080 ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS); 4081 ibd_async_done(state); 4082 return; 4083 } 4084 4085 /* 4086 * Atomically search the nonmember and sendonlymember lists and 4087 * delete. 4088 */ 4089 ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON); 4090 4091 if (state->id_prom_op == IBD_OP_COMPLETED) { 4092 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); 4093 4094 /* 4095 * If in promiscuous mode, try to join/attach to the new 4096 * mcg. Given the unreliable out-of-order mode of trap 4097 * delivery, we can never be sure whether it is a problem 4098 * if the join fails. Thus, we warn the admin of a failure 4099 * if this was a creation trap. Note that the trap might 4100 * actually be reporting a long past event, and the mcg 4101 * might already have been deleted, thus we might be warning 4102 * in vain. 4103 */ 4104 if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == 4105 NULL) && (code == IBT_SM_EVENT_MCG_CREATED)) 4106 ibd_print_warn(state, "IBA promiscuous mode missed " 4107 "new multicast gid %016llx:%016llx", 4108 (u_longlong_t)mgid.gid_prefix, 4109 (u_longlong_t)mgid.gid_guid); 4110 } 4111 4112 /* 4113 * Free the request slot allocated by the subnet event thread. 4114 */ 4115 ibd_async_done(state); 4116 } 4117 4118 /* 4119 * GLDv3 entry point to get capabilities. 
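 *
 * A condensed view of what the routine below reports (all symbols come
 * from the code that follows):
 *
 *	MAC_CAPAB_HCKSUM -> HCK_FULLCKSUM | HCKSUM_INET_FULL_V4, but only
 *	                    when the HCA offers IBT_HCA_CKSUM_FULL
 *	MAC_CAPAB_LSO    -> LSO_TX_BASIC_TCP_IPV4 with lso_max set to
 *	                    id_lso_maxlen - 1, and only when both the LSO
 *	                    policy and capability are on, full hardware
 *	                    checksum is available and the HCA has the
 *	                    reserved-lkey capability
 *	anything else    -> B_FALSE (and port driver instances report no
 *	                    capabilities at all)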
4120 */ 4121 static boolean_t 4122 ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) 4123 { 4124 ibd_state_t *state = arg; 4125 4126 if (state->id_type == IBD_PORT_DRIVER) 4127 return (B_FALSE); 4128 4129 switch (cap) { 4130 case MAC_CAPAB_HCKSUM: { 4131 uint32_t *txflags = cap_data; 4132 4133 /* 4134 * We either do full checksum or not do it at all 4135 */ 4136 if (state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) 4137 *txflags = HCK_FULLCKSUM | HCKSUM_INET_FULL_V4; 4138 else 4139 return (B_FALSE); 4140 break; 4141 } 4142 4143 case MAC_CAPAB_LSO: { 4144 mac_capab_lso_t *cap_lso = cap_data; 4145 4146 /* 4147 * In addition to the capability and policy, since LSO 4148 * relies on hw checksum, we'll not enable LSO if we 4149 * don't have hw checksum. Of course, if the HCA doesn't 4150 * provide the reserved lkey capability, enabling LSO will 4151 * actually affect performance adversely, so we'll disable 4152 * LSO even for that case. 4153 */ 4154 if (!state->id_lso_policy || !state->id_lso_capable) 4155 return (B_FALSE); 4156 4157 if ((state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) == 0) 4158 return (B_FALSE); 4159 4160 if (state->id_hca_res_lkey_capab == 0) { 4161 ibd_print_warn(state, "no reserved-lkey capability, " 4162 "disabling LSO"); 4163 return (B_FALSE); 4164 } 4165 4166 cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4; 4167 cap_lso->lso_basic_tcp_ipv4.lso_max = state->id_lso_maxlen - 1; 4168 break; 4169 } 4170 4171 default: 4172 return (B_FALSE); 4173 } 4174 4175 return (B_TRUE); 4176 } 4177 4178 /* 4179 * callback function for set/get of properties 4180 */ 4181 static int 4182 ibd_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, 4183 uint_t pr_valsize, const void *pr_val) 4184 { 4185 ibd_state_t *state = arg; 4186 int err = 0; 4187 uint32_t link_mode; 4188 4189 /* Cannot set properties on a port driver */ 4190 if (state->id_type == IBD_PORT_DRIVER) { 4191 return (ENOTSUP); 4192 } 4193 4194 switch (pr_num) { 4195 case MAC_PROP_IB_LINKMODE: 4196 if (state->id_mac_state & IBD_DRV_STARTED) { 4197 err = EBUSY; 4198 break; 4199 } 4200 if (pr_val == NULL) { 4201 err = EINVAL; 4202 break; 4203 } 4204 bcopy(pr_val, &link_mode, sizeof (link_mode)); 4205 if (link_mode != IBD_LINK_MODE_UD && 4206 link_mode != IBD_LINK_MODE_RC) { 4207 err = EINVAL; 4208 } else { 4209 if (link_mode == IBD_LINK_MODE_RC) { 4210 if (state->id_enable_rc) { 4211 return (0); 4212 } 4213 state->id_enable_rc = 1; 4214 /* inform MAC framework of new MTU */ 4215 err = mac_maxsdu_update2(state->id_mh, 4216 state->rc_mtu - IPOIB_HDRSIZE, 4217 state->id_mtu - IPOIB_HDRSIZE); 4218 } else { 4219 if (!state->id_enable_rc) { 4220 return (0); 4221 } 4222 state->id_enable_rc = 0; 4223 err = mac_maxsdu_update2(state->id_mh, 4224 state->id_mtu - IPOIB_HDRSIZE, 4225 state->id_mtu - IPOIB_HDRSIZE); 4226 } 4227 (void) ibd_record_capab(state); 4228 mac_capab_update(state->id_mh); 4229 } 4230 break; 4231 case MAC_PROP_PRIVATE: 4232 err = ibd_set_priv_prop(state, pr_name, 4233 pr_valsize, pr_val); 4234 break; 4235 default: 4236 err = ENOTSUP; 4237 break; 4238 } 4239 return (err); 4240 } 4241 4242 static int 4243 ibd_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, 4244 uint_t pr_valsize, void *pr_val) 4245 { 4246 ibd_state_t *state = arg; 4247 int err = 0; 4248 4249 switch (pr_num) { 4250 case MAC_PROP_MTU: 4251 break; 4252 default: 4253 if (state->id_type == IBD_PORT_DRIVER) { 4254 return (ENOTSUP); 4255 } 4256 break; 4257 } 4258 4259 switch (pr_num) { 4260 case MAC_PROP_IB_LINKMODE: 4261 *(uint_t *)pr_val = 
state->id_enable_rc; 4262 break; 4263 case MAC_PROP_PRIVATE: 4264 err = ibd_get_priv_prop(state, pr_name, pr_valsize, 4265 pr_val); 4266 break; 4267 default: 4268 err = ENOTSUP; 4269 break; 4270 } 4271 return (err); 4272 } 4273 4274 static void 4275 ibd_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num, 4276 mac_prop_info_handle_t prh) 4277 { 4278 ibd_state_t *state = arg; 4279 4280 switch (pr_num) { 4281 case MAC_PROP_IB_LINKMODE: { 4282 mac_prop_info_set_default_uint32(prh, IBD_DEF_LINK_MODE); 4283 break; 4284 } 4285 case MAC_PROP_MTU: { 4286 uint32_t min, max; 4287 if (state->id_type == IBD_PORT_DRIVER) { 4288 min = 1500; 4289 max = IBD_DEF_RC_MAX_SDU; 4290 } else if (state->id_enable_rc) { 4291 min = max = IBD_DEF_RC_MAX_SDU; 4292 } else { 4293 min = max = state->id_mtu - IPOIB_HDRSIZE; 4294 } 4295 mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); 4296 mac_prop_info_set_range_uint32(prh, min, max); 4297 break; 4298 } 4299 case MAC_PROP_PRIVATE: { 4300 char valstr[64]; 4301 int value; 4302 4303 if (strcmp(pr_name, "_ibd_broadcast_group") == 0) { 4304 mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); 4305 return; 4306 } else if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) { 4307 value = IBD_DEF_COALESCE_COMPLETIONS; 4308 } else if (strcmp(pr_name, 4309 "_ibd_create_broadcast_group") == 0) { 4310 value = IBD_DEF_CREATE_BCAST_GROUP; 4311 } else if (strcmp(pr_name, "_ibd_hash_size") == 0) { 4312 value = IBD_DEF_HASH_SIZE; 4313 } else if (strcmp(pr_name, "_ibd_lso_enable") == 0) { 4314 value = IBD_DEF_LSO_POLICY; 4315 } else if (strcmp(pr_name, "_ibd_num_ah") == 0) { 4316 value = IBD_DEF_NUM_AH; 4317 } else if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) { 4318 value = IBD_DEF_NUM_LSO_BUFS; 4319 } else if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) { 4320 value = IBD_DEF_RC_ENABLE_SRQ; 4321 } else if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) { 4322 value = IBD_DEF_RC_NUM_RWQE; 4323 } else if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) { 4324 value = IBD_DEF_RC_NUM_SRQ; 4325 } else if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) { 4326 value = IBD_DEF_RC_NUM_SWQE; 4327 } else if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) { 4328 value = IBD_DEF_RC_RX_COMP_COUNT; 4329 } else if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) { 4330 value = IBD_DEF_RC_RX_COMP_USEC; 4331 } else if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) { 4332 value = IBD_DEF_RC_RX_COPY_THRESH; 4333 } else if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) { 4334 value = IBD_DEF_RC_RX_RWQE_THRESH; 4335 } else if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) { 4336 value = IBD_DEF_RC_TX_COMP_COUNT; 4337 } else if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) { 4338 value = IBD_DEF_RC_TX_COMP_USEC; 4339 } else if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) { 4340 value = IBD_DEF_RC_TX_COPY_THRESH; 4341 } else if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) { 4342 value = IBD_DEF_UD_NUM_RWQE; 4343 } else if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) { 4344 value = IBD_DEF_UD_NUM_SWQE; 4345 } else if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) { 4346 value = IBD_DEF_UD_RX_COMP_COUNT; 4347 } else if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) { 4348 value = IBD_DEF_UD_RX_COMP_USEC; 4349 } else if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) { 4350 value = IBD_DEF_UD_TX_COMP_COUNT; 4351 } else if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) { 4352 value = IBD_DEF_UD_TX_COMP_USEC; 4353 } else if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) { 4354 value = IBD_DEF_UD_TX_COPY_THRESH; 4355 
} else { 4356 return; 4357 } 4358 4359 (void) snprintf(valstr, sizeof (valstr), "%d", value); 4360 mac_prop_info_set_default_str(prh, valstr); 4361 break; 4362 } 4363 } /* switch (pr_num) */ 4364 } 4365 4366 /* ARGSUSED2 */ 4367 static int 4368 ibd_set_priv_prop(ibd_state_t *state, const char *pr_name, 4369 uint_t pr_valsize, const void *pr_val) 4370 { 4371 int err = 0; 4372 long result; 4373 4374 if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) { 4375 if (pr_val == NULL) { 4376 return (EINVAL); 4377 } 4378 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4379 if (result < 0 || result > 1) { 4380 err = EINVAL; 4381 } else { 4382 state->id_allow_coalesce_comp_tuning = (result == 1) ? 4383 B_TRUE: B_FALSE; 4384 } 4385 return (err); 4386 } 4387 if (strcmp(pr_name, "_ibd_create_broadcast_group") == 0) { 4388 if (state->id_mac_state & IBD_DRV_STARTED) { 4389 return (EBUSY); 4390 } 4391 if (pr_val == NULL) { 4392 return (EINVAL); 4393 } 4394 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4395 if (result < 0 || result > 1) { 4396 err = EINVAL; 4397 } else { 4398 state->id_create_broadcast_group = (result == 1) ? 4399 B_TRUE: B_FALSE; 4400 } 4401 return (err); 4402 } 4403 if (strcmp(pr_name, "_ibd_hash_size") == 0) { 4404 if (state->id_mac_state & IBD_DRV_STARTED) { 4405 return (EBUSY); 4406 } 4407 if (pr_val == NULL) { 4408 return (EINVAL); 4409 } 4410 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4411 if (result < IBD_MIN_HASH_SIZE || result > IBD_MAX_HASH_SIZE) { 4412 err = EINVAL; 4413 } else { 4414 state->id_hash_size = (uint32_t)result; 4415 } 4416 return (err); 4417 } 4418 if (strcmp(pr_name, "_ibd_lso_enable") == 0) { 4419 if (state->id_mac_state & IBD_DRV_STARTED) { 4420 return (EBUSY); 4421 } 4422 if (pr_val == NULL) { 4423 return (EINVAL); 4424 } 4425 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4426 if (result < 0 || result > 1) { 4427 err = EINVAL; 4428 } else { 4429 state->id_lso_policy = (result == 1) ? 4430 B_TRUE: B_FALSE; 4431 } 4432 mac_capab_update(state->id_mh); 4433 return (err); 4434 } 4435 if (strcmp(pr_name, "_ibd_num_ah") == 0) { 4436 if (state->id_mac_state & IBD_DRV_STARTED) { 4437 return (EBUSY); 4438 } 4439 if (pr_val == NULL) { 4440 return (EINVAL); 4441 } 4442 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4443 if (result < IBD_MIN_NUM_AH || result > IBD_MAX_NUM_AH) { 4444 err = EINVAL; 4445 } else { 4446 state->id_num_ah = (uint32_t)result; 4447 } 4448 return (err); 4449 } 4450 if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) { 4451 if (state->id_mac_state & IBD_DRV_STARTED) { 4452 return (EBUSY); 4453 } 4454 if (!state->id_lso_policy || !state->id_lso_capable) { 4455 return (EINVAL); 4456 } 4457 if (pr_val == NULL) { 4458 return (EINVAL); 4459 } 4460 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4461 if (result < IBD_MIN_NUM_LSO_BUFS || 4462 result > IBD_MAX_NUM_LSO_BUFS) { 4463 err = EINVAL; 4464 } else { 4465 state->id_num_lso_bufs = (uint32_t)result; 4466 } 4467 return (err); 4468 } 4469 if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) { 4470 if (state->id_mac_state & IBD_DRV_STARTED) { 4471 return (EBUSY); 4472 } 4473 if (pr_val == NULL) { 4474 return (EINVAL); 4475 } 4476 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4477 if (result < 0 || result > 1) { 4478 err = EINVAL; 4479 } else { 4480 state->rc_enable_srq = (result == 1) ? 
4481 B_TRUE: B_FALSE; 4482 } 4483 if (!state->rc_enable_srq) { 4484 state->id_rc_num_srq = 0; 4485 } 4486 return (err); 4487 } 4488 if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) { 4489 if (state->id_mac_state & IBD_DRV_STARTED) { 4490 return (EBUSY); 4491 } 4492 if (pr_val == NULL) { 4493 return (EINVAL); 4494 } 4495 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4496 if (result < IBD_MIN_RC_NUM_RWQE || 4497 result > IBD_MAX_RC_NUM_RWQE) { 4498 err = EINVAL; 4499 } else { 4500 state->id_rc_num_rwqe = (uint32_t)result; 4501 if (state->id_allow_coalesce_comp_tuning && 4502 state->id_rc_rx_comp_count > state->id_rc_num_rwqe) 4503 state->id_rc_rx_comp_count = 4504 state->id_rc_num_rwqe; 4505 if (state->id_rc_num_srq > state->id_rc_num_rwqe) 4506 state->id_rc_num_srq = 4507 state->id_rc_num_rwqe - 1; 4508 /* 4509 * If rx_rwqe_threshold is greater than the number of 4510 * rwqes, pull it back to 25% of number of rwqes. 4511 */ 4512 if (state->id_rc_rx_rwqe_thresh > state->id_rc_num_rwqe) 4513 state->id_rc_rx_rwqe_thresh = 4514 (state->id_rc_num_rwqe >> 2); 4515 4516 } 4517 return (err); 4518 } 4519 if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) { 4520 if (state->id_mac_state & IBD_DRV_STARTED) { 4521 return (EBUSY); 4522 } 4523 if (pr_val == NULL) { 4524 return (EINVAL); 4525 } 4526 if (!state->rc_enable_srq) 4527 return (EINVAL); 4528 4529 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4530 if (result < IBD_MIN_RC_NUM_SRQ || 4531 result >= state->id_rc_num_rwqe) { 4532 err = EINVAL; 4533 } else 4534 state->id_rc_num_srq = (uint32_t)result; 4535 return (err); 4536 } 4537 if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) { 4538 if (state->id_mac_state & IBD_DRV_STARTED) { 4539 return (EBUSY); 4540 } 4541 if (pr_val == NULL) { 4542 return (EINVAL); 4543 } 4544 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4545 if (result < IBD_MIN_RC_NUM_SWQE || 4546 result > IBD_MAX_RC_NUM_SWQE) { 4547 err = EINVAL; 4548 } else { 4549 state->id_rc_num_swqe = (uint32_t)result; 4550 if (state->id_allow_coalesce_comp_tuning && 4551 state->id_rc_tx_comp_count > state->id_rc_num_swqe) 4552 state->id_rc_tx_comp_count = 4553 state->id_rc_num_swqe; 4554 } 4555 return (err); 4556 } 4557 if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) { 4558 if (!state->id_allow_coalesce_comp_tuning) { 4559 return (ENOTSUP); 4560 } 4561 if (pr_val == NULL) { 4562 return (EINVAL); 4563 } 4564 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4565 if (result < 1 || result > state->id_rc_num_rwqe) { 4566 err = EINVAL; 4567 } else { 4568 state->id_rc_rx_comp_count = (uint32_t)result; 4569 } 4570 return (err); 4571 } 4572 if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) { 4573 if (!state->id_allow_coalesce_comp_tuning) { 4574 return (ENOTSUP); 4575 } 4576 if (pr_val == NULL) { 4577 return (EINVAL); 4578 } 4579 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4580 if (result < 1) { 4581 err = EINVAL; 4582 } else { 4583 state->id_rc_rx_comp_usec = (uint32_t)result; 4584 } 4585 return (err); 4586 } 4587 if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) { 4588 if (state->id_mac_state & IBD_DRV_STARTED) { 4589 return (EBUSY); 4590 } 4591 if (pr_val == NULL) { 4592 return (EINVAL); 4593 } 4594 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4595 if (result < IBD_MIN_RC_RX_COPY_THRESH || 4596 result > state->rc_mtu) { 4597 err = EINVAL; 4598 } else { 4599 state->id_rc_rx_copy_thresh = (uint32_t)result; 4600 } 4601 return (err); 4602 } 4603 if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) { 4604 if 
(state->id_mac_state & IBD_DRV_STARTED) { 4605 return (EBUSY); 4606 } 4607 if (pr_val == NULL) { 4608 return (EINVAL); 4609 } 4610 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4611 if (result < IBD_MIN_RC_RX_RWQE_THRESH || 4612 result >= state->id_rc_num_rwqe) { 4613 err = EINVAL; 4614 } else { 4615 state->id_rc_rx_rwqe_thresh = (uint32_t)result; 4616 } 4617 return (err); 4618 } 4619 if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) { 4620 if (!state->id_allow_coalesce_comp_tuning) { 4621 return (ENOTSUP); 4622 } 4623 if (pr_val == NULL) { 4624 return (EINVAL); 4625 } 4626 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4627 if (result < 1 || result > state->id_rc_num_swqe) { 4628 err = EINVAL; 4629 } else { 4630 state->id_rc_tx_comp_count = (uint32_t)result; 4631 } 4632 return (err); 4633 } 4634 if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) { 4635 if (!state->id_allow_coalesce_comp_tuning) { 4636 return (ENOTSUP); 4637 } 4638 if (pr_val == NULL) { 4639 return (EINVAL); 4640 } 4641 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4642 if (result < 1) 4643 err = EINVAL; 4644 else { 4645 state->id_rc_tx_comp_usec = (uint32_t)result; 4646 } 4647 return (err); 4648 } 4649 if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) { 4650 if (state->id_mac_state & IBD_DRV_STARTED) { 4651 return (EBUSY); 4652 } 4653 if (pr_val == NULL) { 4654 return (EINVAL); 4655 } 4656 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4657 if (result < IBD_MIN_RC_TX_COPY_THRESH || 4658 result > state->rc_mtu) { 4659 err = EINVAL; 4660 } else { 4661 state->id_rc_tx_copy_thresh = (uint32_t)result; 4662 } 4663 return (err); 4664 } 4665 if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) { 4666 if (state->id_mac_state & IBD_DRV_STARTED) { 4667 return (EBUSY); 4668 } 4669 if (pr_val == NULL) { 4670 return (EINVAL); 4671 } 4672 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4673 if (result < IBD_MIN_UD_NUM_RWQE || 4674 result > IBD_MAX_UD_NUM_RWQE) { 4675 err = EINVAL; 4676 } else { 4677 if (result > state->id_hca_max_chan_sz) { 4678 state->id_ud_num_rwqe = 4679 state->id_hca_max_chan_sz; 4680 } else { 4681 state->id_ud_num_rwqe = (uint32_t)result; 4682 } 4683 if (state->id_allow_coalesce_comp_tuning && 4684 state->id_ud_rx_comp_count > state->id_ud_num_rwqe) 4685 state->id_ud_rx_comp_count = 4686 state->id_ud_num_rwqe; 4687 } 4688 return (err); 4689 } 4690 if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) { 4691 if (state->id_mac_state & IBD_DRV_STARTED) { 4692 return (EBUSY); 4693 } 4694 if (pr_val == NULL) { 4695 return (EINVAL); 4696 } 4697 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4698 if (result < IBD_MIN_UD_NUM_SWQE || 4699 result > IBD_MAX_UD_NUM_SWQE) { 4700 err = EINVAL; 4701 } else { 4702 if (result > state->id_hca_max_chan_sz) { 4703 state->id_ud_num_swqe = 4704 state->id_hca_max_chan_sz; 4705 } else { 4706 state->id_ud_num_swqe = (uint32_t)result; 4707 } 4708 if (state->id_allow_coalesce_comp_tuning && 4709 state->id_ud_tx_comp_count > state->id_ud_num_swqe) 4710 state->id_ud_tx_comp_count = 4711 state->id_ud_num_swqe; 4712 } 4713 return (err); 4714 } 4715 if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) { 4716 if (!state->id_allow_coalesce_comp_tuning) { 4717 return (ENOTSUP); 4718 } 4719 if (pr_val == NULL) { 4720 return (EINVAL); 4721 } 4722 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4723 if (result < 1 || result > state->id_ud_num_rwqe) { 4724 err = EINVAL; 4725 } else { 4726 state->id_ud_rx_comp_count = (uint32_t)result; 4727 } 4728 return (err); 4729 } 
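	/*
	 * For illustration (the link name ibd0 is hypothetical): these
	 * private properties are normally reached through dladm, e.g.
	 *
	 *	# dladm set-linkprop -p _ibd_ud_rx_comp_usec=10 ibd0
	 *	# dladm show-linkprop -p _ibd_ud_rx_comp_usec ibd0
	 *
	 * Most of the tunables handled here return EBUSY while the
	 * datalink is started, and the completion-moderation ones return
	 * ENOTSUP until _ibd_coalesce_completions has been enabled.
	 */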
4730 if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) { 4731 if (!state->id_allow_coalesce_comp_tuning) { 4732 return (ENOTSUP); 4733 } 4734 if (pr_val == NULL) { 4735 return (EINVAL); 4736 } 4737 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4738 if (result < 1) { 4739 err = EINVAL; 4740 } else { 4741 state->id_ud_rx_comp_usec = (uint32_t)result; 4742 } 4743 return (err); 4744 } 4745 if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) { 4746 if (!state->id_allow_coalesce_comp_tuning) { 4747 return (ENOTSUP); 4748 } 4749 if (pr_val == NULL) { 4750 return (EINVAL); 4751 } 4752 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4753 if (result < 1 || result > state->id_ud_num_swqe) { 4754 err = EINVAL; 4755 } else { 4756 state->id_ud_tx_comp_count = (uint32_t)result; 4757 } 4758 return (err); 4759 } 4760 if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) { 4761 if (!state->id_allow_coalesce_comp_tuning) { 4762 return (ENOTSUP); 4763 } 4764 if (pr_val == NULL) { 4765 return (EINVAL); 4766 } 4767 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4768 if (result < 1) { 4769 err = EINVAL; 4770 } else { 4771 state->id_ud_tx_comp_usec = (uint32_t)result; 4772 } 4773 return (err); 4774 } 4775 if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) { 4776 if (state->id_mac_state & IBD_DRV_STARTED) { 4777 return (EBUSY); 4778 } 4779 if (pr_val == NULL) { 4780 return (EINVAL); 4781 } 4782 (void) ddi_strtol(pr_val, (char **)NULL, 0, &result); 4783 if (result < IBD_MIN_UD_TX_COPY_THRESH || 4784 result > IBD_MAX_UD_TX_COPY_THRESH) { 4785 err = EINVAL; 4786 } else { 4787 state->id_ud_tx_copy_thresh = (uint32_t)result; 4788 } 4789 return (err); 4790 } 4791 return (ENOTSUP); 4792 } 4793 4794 static int 4795 ibd_get_priv_prop(ibd_state_t *state, const char *pr_name, uint_t pr_valsize, 4796 void *pr_val) 4797 { 4798 int err = ENOTSUP; 4799 int value; 4800 4801 if (strcmp(pr_name, "_ibd_broadcast_group") == 0) { 4802 value = state->id_bgroup_present; 4803 err = 0; 4804 goto done; 4805 } 4806 if (strcmp(pr_name, "_ibd_coalesce_completions") == 0) { 4807 value = state->id_allow_coalesce_comp_tuning; 4808 err = 0; 4809 goto done; 4810 } 4811 if (strcmp(pr_name, "_ibd_create_broadcast_group") == 0) { 4812 value = state->id_create_broadcast_group; 4813 err = 0; 4814 goto done; 4815 } 4816 if (strcmp(pr_name, "_ibd_hash_size") == 0) { 4817 value = state->id_hash_size; 4818 err = 0; 4819 goto done; 4820 } 4821 if (strcmp(pr_name, "_ibd_lso_enable") == 0) { 4822 value = state->id_lso_policy; 4823 err = 0; 4824 goto done; 4825 } 4826 if (strcmp(pr_name, "_ibd_num_ah") == 0) { 4827 value = state->id_num_ah; 4828 err = 0; 4829 goto done; 4830 } 4831 if (strcmp(pr_name, "_ibd_num_lso_bufs") == 0) { 4832 value = state->id_num_lso_bufs; 4833 err = 0; 4834 goto done; 4835 } 4836 if (strcmp(pr_name, "_ibd_rc_enable_srq") == 0) { 4837 value = state->rc_enable_srq; 4838 err = 0; 4839 goto done; 4840 } 4841 if (strcmp(pr_name, "_ibd_rc_num_rwqe") == 0) { 4842 value = state->id_rc_num_rwqe; 4843 err = 0; 4844 goto done; 4845 } 4846 if (strcmp(pr_name, "_ibd_rc_num_srq") == 0) { 4847 value = state->id_rc_num_srq; 4848 err = 0; 4849 goto done; 4850 } 4851 if (strcmp(pr_name, "_ibd_rc_num_swqe") == 0) { 4852 value = state->id_rc_num_swqe; 4853 err = 0; 4854 goto done; 4855 } 4856 if (strcmp(pr_name, "_ibd_rc_rx_comp_count") == 0) { 4857 value = state->id_rc_rx_comp_count; 4858 err = 0; 4859 goto done; 4860 } 4861 if (strcmp(pr_name, "_ibd_rc_rx_comp_usec") == 0) { 4862 value = state->id_rc_rx_comp_usec; 4863 err = 0; 4864 
goto done; 4865 } 4866 if (strcmp(pr_name, "_ibd_rc_rx_copy_thresh") == 0) { 4867 value = state->id_rc_rx_copy_thresh; 4868 err = 0; 4869 goto done; 4870 } 4871 if (strcmp(pr_name, "_ibd_rc_rx_rwqe_thresh") == 0) { 4872 value = state->id_rc_rx_rwqe_thresh; 4873 err = 0; 4874 goto done; 4875 } 4876 if (strcmp(pr_name, "_ibd_rc_tx_comp_count") == 0) { 4877 value = state->id_rc_tx_comp_count; 4878 err = 0; 4879 goto done; 4880 } 4881 if (strcmp(pr_name, "_ibd_rc_tx_comp_usec") == 0) { 4882 value = state->id_rc_tx_comp_usec; 4883 err = 0; 4884 goto done; 4885 } 4886 if (strcmp(pr_name, "_ibd_rc_tx_copy_thresh") == 0) { 4887 value = state->id_rc_tx_copy_thresh; 4888 err = 0; 4889 goto done; 4890 } 4891 if (strcmp(pr_name, "_ibd_ud_num_rwqe") == 0) { 4892 value = state->id_ud_num_rwqe; 4893 err = 0; 4894 goto done; 4895 } 4896 if (strcmp(pr_name, "_ibd_ud_num_swqe") == 0) { 4897 value = state->id_ud_num_swqe; 4898 err = 0; 4899 goto done; 4900 } 4901 if (strcmp(pr_name, "_ibd_ud_rx_comp_count") == 0) { 4902 value = state->id_ud_rx_comp_count; 4903 err = 0; 4904 goto done; 4905 } 4906 if (strcmp(pr_name, "_ibd_ud_rx_comp_usec") == 0) { 4907 value = state->id_ud_rx_comp_usec; 4908 err = 0; 4909 goto done; 4910 } 4911 if (strcmp(pr_name, "_ibd_ud_tx_comp_count") == 0) { 4912 value = state->id_ud_tx_comp_count; 4913 err = 0; 4914 goto done; 4915 } 4916 if (strcmp(pr_name, "_ibd_ud_tx_comp_usec") == 0) { 4917 value = state->id_ud_tx_comp_usec; 4918 err = 0; 4919 goto done; 4920 } 4921 if (strcmp(pr_name, "_ibd_ud_tx_copy_thresh") == 0) { 4922 value = state->id_ud_tx_copy_thresh; 4923 err = 0; 4924 goto done; 4925 } 4926 done: 4927 if (err == 0) { 4928 (void) snprintf(pr_val, pr_valsize, "%d", value); 4929 } 4930 return (err); 4931 } 4932 4933 static int 4934 ibd_get_port_details(ibd_state_t *state) 4935 { 4936 ibt_hca_portinfo_t *port_infop; 4937 ibt_status_t ret; 4938 uint_t psize, port_infosz; 4939 4940 mutex_enter(&state->id_link_mutex); 4941 4942 /* 4943 * Query for port information 4944 */ 4945 ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port, 4946 &port_infop, &psize, &port_infosz); 4947 if ((ret != IBT_SUCCESS) || (psize != 1)) { 4948 mutex_exit(&state->id_link_mutex); 4949 DPRINT(10, "ibd_get_port_details: ibt_query_hca_ports() " 4950 "failed, ret=%d", ret); 4951 return (ENETDOWN); 4952 } 4953 4954 /* 4955 * If the link is active, verify the pkey 4956 */ 4957 if (port_infop->p_linkstate == IBT_PORT_ACTIVE) { 4958 if ((ret = ibt_pkey2index(state->id_hca_hdl, state->id_port, 4959 state->id_pkey, &state->id_pkix)) != IBT_SUCCESS) { 4960 state->id_link_state = LINK_STATE_DOWN; 4961 } else { 4962 state->id_link_state = LINK_STATE_UP; 4963 } 4964 state->id_mtu = (128 << port_infop->p_mtu); 4965 state->id_sgid = *port_infop->p_sgid_tbl; 4966 /* 4967 * Now that the port is active, record the port speed 4968 */ 4969 state->id_link_speed = ibd_get_portspeed(state); 4970 } else { 4971 /* Make sure that these are handled in PORT_UP/CHANGE */ 4972 state->id_mtu = 0; 4973 state->id_link_state = LINK_STATE_DOWN; 4974 state->id_link_speed = 0; 4975 } 4976 mutex_exit(&state->id_link_mutex); 4977 ibt_free_portinfo(port_infop, port_infosz); 4978 4979 return (0); 4980 } 4981 4982 static int 4983 ibd_alloc_cqs(ibd_state_t *state) 4984 { 4985 ibt_hca_attr_t hca_attrs; 4986 ibt_cq_attr_t cq_attr; 4987 ibt_status_t ret; 4988 uint32_t real_size; 4989 uint_t num_rwqe_change = 0; 4990 uint_t num_swqe_change = 0; 4991 4992 ret = ibt_query_hca(state->id_hca_hdl, &hca_attrs); 4993 ASSERT(ret == IBT_SUCCESS); 
4994 4995 /* 4996 * Allocate Rx/combined CQ: 4997 * Theoretically, there is no point in having more than #rwqe 4998 * plus #swqe cqe's, except that the CQ will be signaled for 4999 * overflow when the last wqe completes, if none of the previous 5000 * cqe's have been polled. Thus, we allocate just a few less wqe's 5001 * to make sure such overflow does not occur. 5002 */ 5003 cq_attr.cq_sched = NULL; 5004 cq_attr.cq_flags = IBT_CQ_NO_FLAGS; 5005 5006 /* 5007 * Allocate Receive CQ. 5008 */ 5009 if (hca_attrs.hca_max_cq_sz >= (state->id_ud_num_rwqe + 1)) { 5010 cq_attr.cq_size = state->id_ud_num_rwqe + 1; 5011 } else { 5012 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 5013 num_rwqe_change = state->id_ud_num_rwqe; 5014 state->id_ud_num_rwqe = cq_attr.cq_size - 1; 5015 } 5016 5017 if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 5018 &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) { 5019 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rcq) " 5020 "failed, ret=%d\n", ret); 5021 return (DDI_FAILURE); 5022 } 5023 5024 if ((ret = ibt_modify_cq(state->id_rcq_hdl, state->id_ud_rx_comp_count, 5025 state->id_ud_rx_comp_usec, 0)) != IBT_SUCCESS) { 5026 DPRINT(10, "ibd_alloc_cqs: Receive CQ interrupt " 5027 "moderation failed, ret=%d\n", ret); 5028 } 5029 5030 /* make the #rx wc's the same as max rx chain size */ 5031 state->id_rxwcs_size = IBD_MAX_RX_MP_LEN; 5032 state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) * 5033 state->id_rxwcs_size, KM_SLEEP); 5034 5035 /* 5036 * Allocate Send CQ. 5037 */ 5038 if (hca_attrs.hca_max_cq_sz >= (state->id_ud_num_swqe + 1)) { 5039 cq_attr.cq_size = state->id_ud_num_swqe + 1; 5040 } else { 5041 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 5042 num_swqe_change = state->id_ud_num_swqe; 5043 state->id_ud_num_swqe = cq_attr.cq_size - 1; 5044 } 5045 5046 if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 5047 &state->id_scq_hdl, &real_size)) != IBT_SUCCESS) { 5048 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(scq) " 5049 "failed, ret=%d\n", ret); 5050 kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) * 5051 state->id_rxwcs_size); 5052 (void) ibt_free_cq(state->id_rcq_hdl); 5053 return (DDI_FAILURE); 5054 } 5055 if ((ret = ibt_modify_cq(state->id_scq_hdl, state->id_ud_tx_comp_count, 5056 state->id_ud_tx_comp_usec, 0)) != IBT_SUCCESS) { 5057 DPRINT(10, "ibd_alloc_cqs: Send CQ interrupt " 5058 "moderation failed, ret=%d\n", ret); 5059 } 5060 5061 state->id_txwcs_size = IBD_TX_POLL_THRESH; 5062 state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) * 5063 state->id_txwcs_size, KM_SLEEP); 5064 5065 /* 5066 * Print message in case we could not allocate as many wqe's 5067 * as was requested. 
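 *
 * For example (the numbers are hypothetical): if the HCA reported
 * hca_max_cq_sz = 16384 while id_ud_num_swqe had been tuned to 20000,
 * the clamp above would set id_ud_num_swqe to 16383 and the warning
 * below would read along the lines of
 *
 *	Setting #swqe = 16383 instead of default 20000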
5068 */ 5069 if (num_rwqe_change) { 5070 ibd_print_warn(state, "Setting #rwqe = %d instead of default " 5071 "%d", state->id_ud_num_rwqe, num_rwqe_change); 5072 } 5073 if (num_swqe_change) { 5074 ibd_print_warn(state, "Setting #swqe = %d instead of default " 5075 "%d", state->id_ud_num_swqe, num_swqe_change); 5076 } 5077 5078 return (DDI_SUCCESS); 5079 } 5080 5081 static int 5082 ibd_setup_ud_channel(ibd_state_t *state) 5083 { 5084 ibt_ud_chan_alloc_args_t ud_alloc_attr; 5085 ibt_ud_chan_query_attr_t ud_chan_attr; 5086 ibt_status_t ret; 5087 5088 ud_alloc_attr.ud_flags = IBT_ALL_SIGNALED; 5089 if (state->id_hca_res_lkey_capab) 5090 ud_alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY; 5091 if (state->id_lso_policy && state->id_lso_capable) 5092 ud_alloc_attr.ud_flags |= IBT_USES_LSO; 5093 5094 ud_alloc_attr.ud_hca_port_num = state->id_port; 5095 ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg; 5096 ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG; 5097 ud_alloc_attr.ud_sizes.cs_sq = state->id_ud_num_swqe; 5098 ud_alloc_attr.ud_sizes.cs_rq = state->id_ud_num_rwqe; 5099 ud_alloc_attr.ud_qkey = state->id_mcinfo->mc_qkey; 5100 ud_alloc_attr.ud_scq = state->id_scq_hdl; 5101 ud_alloc_attr.ud_rcq = state->id_rcq_hdl; 5102 ud_alloc_attr.ud_pd = state->id_pd_hdl; 5103 ud_alloc_attr.ud_pkey_ix = state->id_pkix; 5104 ud_alloc_attr.ud_clone_chan = NULL; 5105 5106 if ((ret = ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS, 5107 &ud_alloc_attr, &state->id_chnl_hdl, NULL)) != IBT_SUCCESS) { 5108 DPRINT(10, "ibd_setup_ud_channel: ibt_alloc_ud_channel() " 5109 "failed, ret=%d\n", ret); 5110 return (DDI_FAILURE); 5111 } 5112 5113 if ((ret = ibt_query_ud_channel(state->id_chnl_hdl, 5114 &ud_chan_attr)) != IBT_SUCCESS) { 5115 DPRINT(10, "ibd_setup_ud_channel: ibt_query_ud_channel() " 5116 "failed, ret=%d\n", ret); 5117 (void) ibt_free_channel(state->id_chnl_hdl); 5118 return (DDI_FAILURE); 5119 } 5120 5121 state->id_qpnum = ud_chan_attr.ud_qpn; 5122 5123 return (DDI_SUCCESS); 5124 } 5125 5126 static int 5127 ibd_undo_start(ibd_state_t *state, link_state_t cur_link_state) 5128 { 5129 uint32_t progress = state->id_mac_state; 5130 uint_t attempts; 5131 ibt_status_t ret; 5132 ib_gid_t mgid; 5133 ibd_mce_t *mce; 5134 uint8_t jstate; 5135 timeout_id_t tid; 5136 5137 if (atomic_dec_32_nv(&state->id_running) != 0) 5138 cmn_err(CE_WARN, "ibd_undo_start: id_running was not 1\n"); 5139 5140 /* 5141 * Before we try to stop/undo whatever we did in ibd_start(), 5142 * we need to mark the link state appropriately to prevent the 5143 * ip layer from using this instance for any new transfers. Note 5144 * that if the original state of the link was "up" when we're 5145 * here, we'll set the final link state to "unknown", to behave 5146 * in the same fashion as other ethernet drivers. 
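 *
 * The teardown that follows is driven by the progress bits that
 * ibd_start() recorded in id_mac_state: each stage is undone only if
 * its bit is set, and the bit is cleared once the stage has been rolled
 * back.  Schematically (IBD_DRV_FOO and undo_foo() are placeholders for
 * the real flags and calls below):
 *
 *	if (progress & IBD_DRV_FOO) {
 *		undo_foo(state);
 *		state->id_mac_state &= (~IBD_DRV_FOO);
 *	}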
5147 */ 5148 mutex_enter(&state->id_link_mutex); 5149 if (cur_link_state == LINK_STATE_DOWN) { 5150 state->id_link_state = cur_link_state; 5151 } else { 5152 state->id_link_state = LINK_STATE_UNKNOWN; 5153 } 5154 mutex_exit(&state->id_link_mutex); 5155 bzero(&state->id_macaddr, sizeof (ipoib_mac_t)); 5156 mac_link_update(state->id_mh, state->id_link_state); 5157 5158 state->id_mac_state &= (~IBD_DRV_PORT_DETAILS_OBTAINED); 5159 if (progress & IBD_DRV_STARTED) { 5160 state->id_mac_state &= (~IBD_DRV_STARTED); 5161 } 5162 5163 if (progress & IBD_DRV_IN_LATE_HCA_INIT) { 5164 state->id_mac_state &= (~IBD_DRV_IN_LATE_HCA_INIT); 5165 } 5166 5167 /* Stop listen under Reliable Connected Mode */ 5168 if (progress & IBD_DRV_RC_LISTEN) { 5169 ASSERT(state->id_enable_rc); 5170 if (state->rc_listen_hdl != NULL) { 5171 ibd_rc_stop_listen(state); 5172 } 5173 state->id_mac_state &= (~IBD_DRV_RC_LISTEN); 5174 } 5175 5176 /* Stop timeout routine */ 5177 if (progress & IBD_DRV_RC_TIMEOUT) { 5178 ASSERT(state->id_enable_rc); 5179 mutex_enter(&state->rc_timeout_lock); 5180 state->rc_timeout_start = B_FALSE; 5181 tid = state->rc_timeout; 5182 state->rc_timeout = 0; 5183 mutex_exit(&state->rc_timeout_lock); 5184 if (tid != 0) 5185 (void) untimeout(tid); 5186 state->id_mac_state &= (~IBD_DRV_RC_TIMEOUT); 5187 } 5188 5189 if ((state->id_enable_rc) && (progress & IBD_DRV_ACACHE_INITIALIZED)) { 5190 attempts = 100; 5191 while (state->id_ah_op == IBD_OP_ONGOING) { 5192 /* 5193 * "state->id_ah_op == IBD_OP_ONGOING" means this IPoIB 5194 * port is connecting to a remote IPoIB port. Wait for 5195 * the end of this connecting operation. 5196 */ 5197 delay(drv_usectohz(100000)); 5198 if (--attempts == 0) { 5199 state->rc_stop_connect++; 5200 DPRINT(40, "ibd_undo_start: connecting"); 5201 break; 5202 } 5203 } 5204 mutex_enter(&state->id_sched_lock); 5205 state->id_sched_needed = 0; 5206 mutex_exit(&state->id_sched_lock); 5207 (void) ibd_rc_close_all_chan(state); 5208 } 5209 5210 /* 5211 * First, stop receive interrupts; this stops the driver from 5212 * handing up buffers to higher layers. Wait for receive buffers 5213 * to be returned and give up after 1 second. 5214 */ 5215 if (progress & IBD_DRV_RCQ_NOTIFY_ENABLED) { 5216 attempts = 10; 5217 while (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 5218 0) > 0) { 5219 delay(drv_usectohz(100000)); 5220 if (--attempts == 0) { 5221 /* 5222 * There are pending bufs with the network 5223 * layer and we have no choice but to wait 5224 * for them to be done with. Reap all the 5225 * Tx/Rx completions that were posted since 5226 * we turned off the notification and 5227 * return failure. 5228 */ 5229 cmn_err(CE_CONT, "!ibd: bufs outstanding\n"); 5230 DPRINT(2, "ibd_undo_start: " 5231 "reclaiming failed"); 5232 break; 5233 } 5234 } 5235 state->id_mac_state &= (~IBD_DRV_RCQ_NOTIFY_ENABLED); 5236 } 5237 5238 if (progress & IBD_DRV_RC_LARGEBUF_ALLOCD) { 5239 ibd_rc_fini_tx_largebuf_list(state); 5240 state->id_mac_state &= (~IBD_DRV_RC_LARGEBUF_ALLOCD); 5241 } 5242 5243 if (progress & IBD_DRV_RC_SRQ_ALLOCD) { 5244 ASSERT(state->id_enable_rc); 5245 if (state->rc_srq_rwqe_list.dl_bufs_outstanding == 0) { 5246 if (state->id_ah_op == IBD_OP_ONGOING) { 5247 delay(drv_usectohz(10000)); 5248 if (state->id_ah_op == IBD_OP_ONGOING) { 5249 /* 5250 * "state->id_ah_op == IBD_OP_ONGOING" 5251 * means this IPoIB port is connecting 5252 * to a remote IPoIB port. We can't 5253 * delete SRQ here. 
5254 */ 5255 state->rc_stop_connect++; 5256 DPRINT(40, "ibd_undo_start: " 5257 "connecting"); 5258 } else { 5259 ibd_rc_fini_srq_list(state); 5260 state->id_mac_state &= 5261 (~IBD_DRV_RC_SRQ_ALLOCD); 5262 } 5263 } else { 5264 ibd_rc_fini_srq_list(state); 5265 state->id_mac_state &= (~IBD_DRV_RC_SRQ_ALLOCD); 5266 } 5267 } else { 5268 DPRINT(40, "ibd_undo_start: srq bufs outstanding\n"); 5269 } 5270 } 5271 5272 if (progress & IBD_DRV_SM_NOTICES_REGISTERED) { 5273 ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL); 5274 5275 mutex_enter(&state->id_trap_lock); 5276 state->id_trap_stop = B_TRUE; 5277 while (state->id_trap_inprog > 0) 5278 cv_wait(&state->id_trap_cv, &state->id_trap_lock); 5279 mutex_exit(&state->id_trap_lock); 5280 5281 state->id_mac_state &= (~IBD_DRV_SM_NOTICES_REGISTERED); 5282 } 5283 5284 if (progress & IBD_DRV_SCQ_NOTIFY_ENABLED) { 5285 /* 5286 * Flushing the channel ensures that all pending WQE's 5287 * are marked with flush_error and handed to the CQ. It 5288 * does not guarantee the invocation of the CQ handler. 5289 * This call is guaranteed to return successfully for 5290 * UD QPNs. 5291 */ 5292 if ((ret = ibt_flush_channel(state->id_chnl_hdl)) != 5293 IBT_SUCCESS) { 5294 DPRINT(10, "ibd_undo_start: flush_channel " 5295 "failed, ret=%d", ret); 5296 } 5297 5298 /* 5299 * Give some time for the TX CQ handler to process the 5300 * completions. 5301 */ 5302 attempts = 10; 5303 mutex_enter(&state->id_tx_list.dl_mutex); 5304 mutex_enter(&state->id_tx_rel_list.dl_mutex); 5305 while (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt 5306 != state->id_ud_num_swqe) { 5307 if (--attempts == 0) 5308 break; 5309 mutex_exit(&state->id_tx_rel_list.dl_mutex); 5310 mutex_exit(&state->id_tx_list.dl_mutex); 5311 delay(drv_usectohz(100000)); 5312 mutex_enter(&state->id_tx_list.dl_mutex); 5313 mutex_enter(&state->id_tx_rel_list.dl_mutex); 5314 } 5315 ibt_set_cq_handler(state->id_scq_hdl, 0, 0); 5316 if (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt != 5317 state->id_ud_num_swqe) { 5318 cmn_err(CE_WARN, "tx resources not freed\n"); 5319 } 5320 mutex_exit(&state->id_tx_rel_list.dl_mutex); 5321 mutex_exit(&state->id_tx_list.dl_mutex); 5322 5323 attempts = 10; 5324 while (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) { 5325 if (--attempts == 0) 5326 break; 5327 delay(drv_usectohz(100000)); 5328 } 5329 ibt_set_cq_handler(state->id_rcq_hdl, 0, 0); 5330 if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) { 5331 cmn_err(CE_WARN, "rx resources not freed\n"); 5332 } 5333 5334 state->id_mac_state &= (~IBD_DRV_SCQ_NOTIFY_ENABLED); 5335 } 5336 5337 if (progress & IBD_DRV_BCAST_GROUP_JOINED) { 5338 /* 5339 * Drop all residual full/non membership. This includes full 5340 * membership to the broadcast group, and any nonmembership 5341 * acquired during transmits. We do this after the Tx completion 5342 * handlers are done, since those might result in some late 5343 * leaves; this also eliminates a potential race with that 5344 * path wrt the mc full list insert/delete. Trap handling 5345 * has also been suppressed at this point. Thus, no locks 5346 * are required while traversing the mc full list. 
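 *
 * Note that the walk below samples list_next() before calling
 * ibd_leave_group(), since leaving the group can unlink and free the
 * entry currently being looked at.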
5347 */ 5348 DPRINT(2, "ibd_undo_start: clear full cache entries"); 5349 mce = list_head(&state->id_mc_full); 5350 while (mce != NULL) { 5351 mgid = mce->mc_info.mc_adds_vect.av_dgid; 5352 jstate = mce->mc_jstate; 5353 mce = list_next(&state->id_mc_full, mce); 5354 ibd_leave_group(state, mgid, jstate); 5355 } 5356 state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_JOINED); 5357 } 5358 5359 if (progress & IBD_DRV_RXLIST_ALLOCD) { 5360 ibd_fini_rxlist(state); 5361 state->id_mac_state &= (~IBD_DRV_RXLIST_ALLOCD); 5362 } 5363 5364 if (progress & IBD_DRV_TXLIST_ALLOCD) { 5365 ibd_fini_txlist(state); 5366 state->id_mac_state &= (~IBD_DRV_TXLIST_ALLOCD); 5367 } 5368 5369 if (progress & IBD_DRV_UD_CHANNEL_SETUP) { 5370 if ((ret = ibt_free_channel(state->id_chnl_hdl)) != 5371 IBT_SUCCESS) { 5372 DPRINT(10, "ibd_undo_start: free_channel " 5373 "failed, ret=%d", ret); 5374 } 5375 5376 state->id_mac_state &= (~IBD_DRV_UD_CHANNEL_SETUP); 5377 } 5378 5379 if (progress & IBD_DRV_CQS_ALLOCD) { 5380 kmem_free(state->id_txwcs, 5381 sizeof (ibt_wc_t) * state->id_txwcs_size); 5382 if ((ret = ibt_free_cq(state->id_scq_hdl)) != 5383 IBT_SUCCESS) { 5384 DPRINT(10, "ibd_undo_start: free_cq(scq) " 5385 "failed, ret=%d", ret); 5386 } 5387 5388 kmem_free(state->id_rxwcs, 5389 sizeof (ibt_wc_t) * state->id_rxwcs_size); 5390 if ((ret = ibt_free_cq(state->id_rcq_hdl)) != IBT_SUCCESS) { 5391 DPRINT(10, "ibd_undo_start: free_cq(rcq) failed, " 5392 "ret=%d", ret); 5393 } 5394 5395 state->id_txwcs = NULL; 5396 state->id_rxwcs = NULL; 5397 state->id_scq_hdl = NULL; 5398 state->id_rcq_hdl = NULL; 5399 5400 state->id_mac_state &= (~IBD_DRV_CQS_ALLOCD); 5401 } 5402 5403 if (progress & IBD_DRV_ACACHE_INITIALIZED) { 5404 mutex_enter(&state->id_ac_mutex); 5405 mod_hash_destroy_hash(state->id_ah_active_hash); 5406 mutex_exit(&state->id_ac_mutex); 5407 ibd_acache_fini(state); 5408 5409 state->id_mac_state &= (~IBD_DRV_ACACHE_INITIALIZED); 5410 } 5411 5412 if (progress & IBD_DRV_BCAST_GROUP_FOUND) { 5413 /* 5414 * If we'd created the ipoib broadcast group and had 5415 * successfully joined it, leave it now 5416 */ 5417 if (state->id_bgroup_created) { 5418 mgid = state->id_mcinfo->mc_adds_vect.av_dgid; 5419 jstate = IB_MC_JSTATE_FULL; 5420 (void) ibt_leave_mcg(state->id_sgid, mgid, 5421 state->id_sgid, jstate); 5422 } 5423 ibt_free_mcg_info(state->id_mcinfo, 1); 5424 5425 state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_FOUND); 5426 } 5427 5428 return (DDI_SUCCESS); 5429 } 5430 5431 /* 5432 * These pair of routines are used to set/clear the condition that 5433 * the caller is likely to do something to change the id_mac_state. 5434 * If there's already someone doing either a start or a stop (possibly 5435 * due to the async handler detecting a pkey relocation event, a plumb 5436 * or dlpi_open, or an unplumb or dlpi_close coming in), we wait until 5437 * that's done. 5438 */ 5439 static void 5440 ibd_set_mac_progress(ibd_state_t *state, uint_t flag) 5441 { 5442 mutex_enter(&state->id_macst_lock); 5443 while (state->id_mac_state & IBD_DRV_RESTART_IN_PROGRESS) 5444 cv_wait(&state->id_macst_cv, &state->id_macst_lock); 5445 5446 state->id_mac_state |= flag; 5447 mutex_exit(&state->id_macst_lock); 5448 } 5449 5450 static void 5451 ibd_clr_mac_progress(ibd_state_t *state, uint_t flag) 5452 { 5453 mutex_enter(&state->id_macst_lock); 5454 state->id_mac_state &= (~flag); 5455 cv_signal(&state->id_macst_cv); 5456 mutex_exit(&state->id_macst_lock); 5457 } 5458 5459 /* 5460 * GLDv3 entry point to start hardware. 
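 *
 * Start requests funnel through ibd_set_mac_progress() /
 * ibd_clr_mac_progress() above, so a start does not overlap a restart
 * that is already in progress; a start attempted while the partition
 * object is being deleted fails with EIO, and port driver instances
 * cannot be started at all (EINVAL).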
5461 */ 5462 /*ARGSUSED*/ 5463 static int 5464 ibd_m_start(void *arg) 5465 { 5466 ibd_state_t *state = arg; 5467 int ret; 5468 5469 if (state->id_type == IBD_PORT_DRIVER) 5470 return (EINVAL); 5471 5472 ibd_set_mac_progress(state, IBD_DRV_START_IN_PROGRESS); 5473 if (state->id_mac_state & IBD_DRV_IN_DELETION) { 5474 ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS); 5475 return (EIO); 5476 } 5477 5478 ret = ibd_start(state); 5479 ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS); 5480 return (ret); 5481 } 5482 5483 static int 5484 ibd_start(ibd_state_t *state) 5485 { 5486 int err; 5487 ibt_status_t ret; 5488 int late_hca_init = 0; 5489 5490 if (state->id_mac_state & IBD_DRV_STARTED) 5491 return (DDI_SUCCESS); 5492 5493 /* 5494 * We do not increment the running flag when calling ibd_start() as 5495 * a result of some event which moves the state away from late HCA 5496 * initialization viz. MCG_CREATED, PORT_CHANGE or link availability. 5497 */ 5498 if (!(state->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) && 5499 (atomic_inc_32_nv(&state->id_running) != 1)) { 5500 DPRINT(10, "ibd_start: id_running is non-zero"); 5501 cmn_err(CE_WARN, "ibd_start: id_running was not 0\n"); 5502 atomic_dec_32(&state->id_running); 5503 return (EINVAL); 5504 } 5505 5506 /* 5507 * Get port details; if we fail here, something bad happened. 5508 * Fail plumb. 5509 */ 5510 if ((err = ibd_get_port_details(state)) != 0) { 5511 DPRINT(10, "ibd_start: ibd_get_port_details() failed"); 5512 goto start_fail; 5513 } 5514 /* 5515 * If state->id_link_state is DOWN, it indicates that either the port 5516 * is down, or the pkey is not available. In both cases, resort to late 5517 * initialization. Register for subnet notices, and return success. 5518 */ 5519 state->id_mac_state |= IBD_DRV_PORT_DETAILS_OBTAINED; 5520 if (state->id_link_state == LINK_STATE_DOWN) { 5521 late_hca_init = 1; 5522 goto late_hca_init_return; 5523 } 5524 5525 /* 5526 * Find the IPoIB broadcast group 5527 */ 5528 if (ibd_find_bgroup(state) != IBT_SUCCESS) { 5529 /* Resort to late initialization */ 5530 late_hca_init = 1; 5531 goto reg_snet_notices; 5532 } 5533 state->id_mac_state |= IBD_DRV_BCAST_GROUP_FOUND; 5534 5535 /* 5536 * Initialize per-interface caches and lists; if we fail here, 5537 * it is most likely due to a lack of resources 5538 */ 5539 if (ibd_acache_init(state) != DDI_SUCCESS) { 5540 DPRINT(10, "ibd_start: ibd_acache_init() failed"); 5541 err = ENOMEM; 5542 goto start_fail; 5543 } 5544 state->id_mac_state |= IBD_DRV_ACACHE_INITIALIZED; 5545 5546 /* 5547 * Allocate send and receive completion queues 5548 */ 5549 if (ibd_alloc_cqs(state) != DDI_SUCCESS) { 5550 DPRINT(10, "ibd_start: ibd_alloc_cqs() failed"); 5551 err = ENOMEM; 5552 goto start_fail; 5553 } 5554 state->id_mac_state |= IBD_DRV_CQS_ALLOCD; 5555 5556 /* 5557 * Setup a UD channel 5558 */ 5559 if (ibd_setup_ud_channel(state) != DDI_SUCCESS) { 5560 err = ENOMEM; 5561 DPRINT(10, "ibd_start: ibd_setup_ud_channel() failed"); 5562 goto start_fail; 5563 } 5564 state->id_mac_state |= IBD_DRV_UD_CHANNEL_SETUP; 5565 5566 /* 5567 * Allocate and initialize the tx buffer list 5568 */ 5569 if (ibd_init_txlist(state) != DDI_SUCCESS) { 5570 DPRINT(10, "ibd_start: ibd_init_txlist() failed"); 5571 err = ENOMEM; 5572 goto start_fail; 5573 } 5574 state->id_mac_state |= IBD_DRV_TXLIST_ALLOCD; 5575 5576 /* 5577 * Create the send cq handler here 5578 */ 5579 ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state); 5580 if ((ret = ibt_enable_cq_notify(state->id_scq_hdl, 5581 
IBT_NEXT_COMPLETION)) != IBT_SUCCESS) { 5582 DPRINT(10, "ibd_start: ibt_enable_cq_notify(scq) " 5583 "failed, ret=%d", ret); 5584 err = EINVAL; 5585 goto start_fail; 5586 } 5587 state->id_mac_state |= IBD_DRV_SCQ_NOTIFY_ENABLED; 5588 5589 /* 5590 * Allocate and initialize the rx buffer list 5591 */ 5592 if (ibd_init_rxlist(state) != DDI_SUCCESS) { 5593 DPRINT(10, "ibd_start: ibd_init_rxlist() failed"); 5594 err = ENOMEM; 5595 goto start_fail; 5596 } 5597 state->id_mac_state |= IBD_DRV_RXLIST_ALLOCD; 5598 5599 /* 5600 * Join IPoIB broadcast group 5601 */ 5602 if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) { 5603 DPRINT(10, "ibd_start: ibd_join_group() failed"); 5604 err = ENOTACTIVE; 5605 goto start_fail; 5606 } 5607 state->id_mac_state |= IBD_DRV_BCAST_GROUP_JOINED; 5608 5609 /* 5610 * When we did mac_register() in ibd_attach(), we didn't register 5611 * the real macaddr and we didn't have the true port mtu. Now that 5612 * we're almost ready, set the local mac address and broadcast 5613 * addresses and update gldv3 about the real values of these 5614 * parameters. 5615 */ 5616 if (state->id_enable_rc) { 5617 ibd_h2n_mac(&state->id_macaddr, 5618 IBD_MAC_ADDR_RC + state->id_qpnum, 5619 state->id_sgid.gid_prefix, state->id_sgid.gid_guid); 5620 ibd_h2n_mac(&state->rc_macaddr_loopback, state->id_qpnum, 5621 state->id_sgid.gid_prefix, state->id_sgid.gid_guid); 5622 } else { 5623 ibd_h2n_mac(&state->id_macaddr, state->id_qpnum, 5624 state->id_sgid.gid_prefix, state->id_sgid.gid_guid); 5625 } 5626 ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK, 5627 state->id_mgid.gid_prefix, state->id_mgid.gid_guid); 5628 5629 if (!state->id_enable_rc) { 5630 (void) mac_maxsdu_update2(state->id_mh, 5631 state->id_mtu - IPOIB_HDRSIZE, 5632 state->id_mtu - IPOIB_HDRSIZE); 5633 } 5634 mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr); 5635 5636 /* 5637 * Setup the receive cq handler 5638 */ 5639 ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state); 5640 if ((ret = ibt_enable_cq_notify(state->id_rcq_hdl, 5641 IBT_NEXT_COMPLETION)) != IBT_SUCCESS) { 5642 DPRINT(10, "ibd_start: ibt_enable_cq_notify(rcq) " 5643 "failed, ret=%d", ret); 5644 err = EINVAL; 5645 goto start_fail; 5646 } 5647 state->id_mac_state |= IBD_DRV_RCQ_NOTIFY_ENABLED; 5648 5649 reg_snet_notices: 5650 /* 5651 * In case of normal initialization sequence, 5652 * Setup the subnet notices handler after we've initialized the acache/ 5653 * mcache and started the async thread, both of which are required for 5654 * the trap handler to function properly. 5655 * 5656 * Now that the async thread has been started (and we've already done 5657 * a mac_register() during attach so mac_tx_update() can be called 5658 * if necessary without any problem), we can enable the trap handler 5659 * to queue requests to the async thread. 5660 * 5661 * In case of late hca initialization, the subnet notices handler will 5662 * only handle MCG created/deleted event. The action performed as part 5663 * of handling these events is to start the interface. So, the 5664 * acache/mcache initialization is not a necessity in such cases for 5665 * registering the subnet notices handler. Also, if we are in 5666 * ibd_start() as a result of, say, some event handling after entering 5667 * late hca initialization phase no need to register again. 
5668 */ 5669 if ((state->id_mac_state & IBD_DRV_SM_NOTICES_REGISTERED) == 0) { 5670 ibt_register_subnet_notices(state->id_ibt_hdl, 5671 ibd_snet_notices_handler, state); 5672 mutex_enter(&state->id_trap_lock); 5673 state->id_trap_stop = B_FALSE; 5674 mutex_exit(&state->id_trap_lock); 5675 state->id_mac_state |= IBD_DRV_SM_NOTICES_REGISTERED; 5676 } 5677 5678 late_hca_init_return: 5679 if (late_hca_init == 1) { 5680 state->id_mac_state |= IBD_DRV_IN_LATE_HCA_INIT; 5681 /* 5682 * In case of late initialization, mark the link state as down, 5683 * immaterial of the actual link state as reported in the 5684 * port_info. 5685 */ 5686 state->id_link_state = LINK_STATE_DOWN; 5687 mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr); 5688 mac_link_update(state->id_mh, state->id_link_state); 5689 return (DDI_SUCCESS); 5690 } 5691 5692 if (state->id_enable_rc) { 5693 if (state->rc_enable_srq) { 5694 if (state->id_mac_state & IBD_DRV_RC_SRQ_ALLOCD) { 5695 if (ibd_rc_repost_srq_free_list(state) != 5696 IBT_SUCCESS) { 5697 err = ENOMEM; 5698 goto start_fail; 5699 } 5700 } else { 5701 /* Allocate SRQ resource */ 5702 if (ibd_rc_init_srq_list(state) != 5703 IBT_SUCCESS) { 5704 err = ENOMEM; 5705 goto start_fail; 5706 } 5707 state->id_mac_state |= IBD_DRV_RC_SRQ_ALLOCD; 5708 } 5709 } 5710 5711 if (ibd_rc_init_tx_largebuf_list(state) != IBT_SUCCESS) { 5712 DPRINT(10, "ibd_start: ibd_rc_init_tx_largebuf_list() " 5713 "failed"); 5714 err = ENOMEM; 5715 goto start_fail; 5716 } 5717 state->id_mac_state |= IBD_DRV_RC_LARGEBUF_ALLOCD; 5718 5719 /* RC: begin to listen only after everything is available */ 5720 if (ibd_rc_listen(state) != IBT_SUCCESS) { 5721 DPRINT(10, "ibd_start: ibd_rc_listen() failed"); 5722 err = EINVAL; 5723 goto start_fail; 5724 } 5725 state->id_mac_state |= IBD_DRV_RC_LISTEN; 5726 } 5727 5728 /* 5729 * Indicate link status to GLDv3 and higher layers. By default, 5730 * we assume we are in up state (which must have been true at 5731 * least at the time the broadcast mcg's were probed); if there 5732 * were any up/down transitions till the time we come here, the 5733 * async handler will have updated last known state, which we 5734 * use to tell GLDv3. The async handler will not send any 5735 * notifications to GLDv3 till we reach here in the initialization 5736 * sequence. 5737 */ 5738 mac_link_update(state->id_mh, state->id_link_state); 5739 state->id_mac_state &= ~IBD_DRV_IN_LATE_HCA_INIT; 5740 state->id_mac_state |= IBD_DRV_STARTED; 5741 5742 /* Start timer after everything is ready */ 5743 if (state->id_enable_rc) { 5744 mutex_enter(&state->rc_timeout_lock); 5745 state->rc_timeout_start = B_TRUE; 5746 state->rc_timeout = timeout(ibd_rc_conn_timeout_call, state, 5747 SEC_TO_TICK(ibd_rc_conn_timeout)); 5748 mutex_exit(&state->rc_timeout_lock); 5749 state->id_mac_state |= IBD_DRV_RC_TIMEOUT; 5750 } 5751 5752 return (DDI_SUCCESS); 5753 5754 start_fail: 5755 /* 5756 * If we ran into a problem during ibd_start() and ran into 5757 * some other problem during undoing our partial work, we can't 5758 * do anything about it. Ignore any errors we might get from 5759 * ibd_undo_start() and just return the original error we got. 5760 */ 5761 (void) ibd_undo_start(state, LINK_STATE_DOWN); 5762 return (err); 5763 } 5764 5765 /* 5766 * GLDv3 entry point to stop hardware from receiving packets. 
 */
/*ARGSUSED*/
static void
ibd_m_stop(void *arg)
{
	ibd_state_t *state = (ibd_state_t *)arg;

	if (state->id_type == IBD_PORT_DRIVER)
		return;

	ibd_set_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);

	(void) ibd_undo_start(state, state->id_link_state);

	ibd_clr_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);
}

/*
 * GLDv3 entry point to modify the device's mac address. We do not
 * allow address modifications.
 */
static int
ibd_m_unicst(void *arg, const uint8_t *macaddr)
{
	ibd_state_t *state = arg;

	if (state->id_type == IBD_PORT_DRIVER)
		return (EINVAL);

	/*
	 * Don't bother even comparing the macaddr if we haven't
	 * completed ibd_m_start().
	 */
	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
		return (0);

	if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0)
		return (0);
	else
		return (EINVAL);
}

/*
 * The blocking part of the IBA join/leave operations is done out
 * of here on the async thread.
 */
static void
ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op)
{
	DPRINT(3, "ibd_async_multicast : async_setmc op %d :"
	    "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid);

	if (op == IBD_ASYNC_JOIN) {
		if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) {
			ibd_print_warn(state, "Join multicast group failed :"
			    "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
		}
	} else {
		/*
		 * Here, we must search for the proper mcg_info and
		 * use that to leave the group.
		 */
		ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL);
	}
}

/*
 * GLDv3 entry point for multicast enable/disable requests.
 * This function queues the operation to the async thread and
 * returns success for a valid multicast address.
 */
static int
ibd_m_multicst(void *arg, boolean_t add, const uint8_t *mcmac)
{
	ibd_state_t *state = (ibd_state_t *)arg;
	ipoib_mac_t maddr, *mcast;
	ib_gid_t mgid;
	ibd_req_t *req;

	if (state->id_type == IBD_PORT_DRIVER)
		return (EINVAL);

	/*
	 * If we haven't completed ibd_m_start(), the async thread wouldn't
	 * have been started and id_bcaddr wouldn't be set, so there's
	 * no point in continuing.
	 */
	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
		return (0);

	/*
	 * The incoming multicast address might not be aligned properly
	 * on a 4 byte boundary to be considered an ipoib_mac_t. We force
	 * it to look like one though, to get the offsets of the mc gid,
	 * since we know we are not going to dereference any values with
	 * the ipoib_mac_t pointer.
	 */
	bcopy(mcmac, &maddr, sizeof (ipoib_mac_t));
	mcast = &maddr;

	/*
	 * Check validity of MCG address. We could additionally check
	 * that an enable/disable is not being issued on the "broadcast"
	 * mcg, but since this operation is only invokable by privileged
	 * programs anyway, we allow the flexibility to those dlpi apps.
	 * Note that we do not validate the "scope" of the IBA mcg.
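	 *
	 * For reference, an illustrative (not authoritative) view of the
	 * 20-byte IPoIB hardware address is
	 *
	 *	| 4 bytes: flags + QPN | 16 bytes: GID |
	 *
	 * and the multicast mapping places the reserved multicast QPN
	 * (IB_MC_QPN) in the first field; that is what the check below
	 * keys off before stamping the scope and pkey into the GID
	 * portion via IBD_FILL_SCOPE_PKEY().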
	 */
	if ((ntohl(mcast->ipoib_qpn) & IB_QPN_MASK) != IB_MC_QPN)
		return (EINVAL);

	/*
	 * Fill in multicast pkey and scope
	 */
	IBD_FILL_SCOPE_PKEY(mcast, state->id_scope, state->id_pkey);

	/*
	 * If someone is trying to JOIN/LEAVE the broadcast group, we do
	 * nothing (i.e. we stay JOINed to the broadcast group done in
	 * ibd_m_start()), to mimic ethernet behavior. IPv4 specifically
	 * requires us to be joined to broadcast groups at all times.
	 * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also
	 * depends on this.
	 */
	if (bcmp(mcast, &state->id_bcaddr, IPOIB_ADDRL) == 0)
		return (0);

	ibd_n2h_gid(mcast, &mgid);
	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
	if (req == NULL)
		return (ENOMEM);

	req->rq_gid = mgid;

	if (add) {
		DPRINT(1, "ibd_m_multicst : %016llx:%016llx\n",
		    mgid.gid_prefix, mgid.gid_guid);
		ibd_queue_work_slot(state, req, IBD_ASYNC_JOIN);
	} else {
		DPRINT(1, "ibd_m_multicst : unset_multicast : "
		    "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
		ibd_queue_work_slot(state, req, IBD_ASYNC_LEAVE);
	}
	return (0);
}

/*
 * The blocking part of the IBA promiscuous operations is done
 * out of here on the async thread.
 */
static void
ibd_async_unsetprom(ibd_state_t *state)
{
	ibd_mce_t *mce = list_head(&state->id_mc_non);
	ib_gid_t mgid;

	DPRINT(2, "ibd_async_unsetprom : async_unset_promisc");

	while (mce != NULL) {
		mgid = mce->mc_info.mc_adds_vect.av_dgid;
		mce = list_next(&state->id_mc_non, mce);
		ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
	}
	state->id_prom_op = IBD_OP_NOTSTARTED;
}

/*
 * The blocking part of the IBA promiscuous operations is done
 * out of here on the async thread.
 */
static void
ibd_async_setprom(ibd_state_t *state)
{
	ibt_mcg_attr_t mcg_attr;
	ibt_mcg_info_t *mcg_info;
	ib_gid_t mgid;
	uint_t numg;
	int i;
	char ret = IBD_OP_COMPLETED;

	DPRINT(2, "ibd_async_setprom : async_set_promisc");

	/*
	 * Obtain all active MC groups on the IB fabric with
	 * specified criteria (scope + Pkey + Qkey + mtu).
	 */
	bzero(&mcg_attr, sizeof (mcg_attr));
	mcg_attr.mc_pkey = state->id_pkey;
	mcg_attr.mc_scope = state->id_scope;
	mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
	mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu;
	mcg_attr.mc_mtu_req.r_selector = IBT_EQU;
	if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) !=
	    IBT_SUCCESS) {
		ibd_print_warn(state, "Could not get list of IBA multicast "
		    "groups");
		ret = IBD_OP_ERRORED;
		goto done;
	}

	/*
	 * Iterate over the returned mcg's and join as NonMember
	 * to the IP mcg's.
	 */
	for (i = 0; i < numg; i++) {
		/*
		 * Do a NonMember JOIN on the MC group.
5977 */ 5978 mgid = mcg_info[i].mc_adds_vect.av_dgid; 5979 if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL) 5980 ibd_print_warn(state, "IBA promiscuous mode missed " 5981 "multicast gid %016llx:%016llx", 5982 (u_longlong_t)mgid.gid_prefix, 5983 (u_longlong_t)mgid.gid_guid); 5984 } 5985 5986 ibt_free_mcg_info(mcg_info, numg); 5987 DPRINT(4, "ibd_async_setprom : async_set_promisc completes"); 5988 done: 5989 state->id_prom_op = ret; 5990 } 5991 5992 /* 5993 * GLDv3 entry point for multicast promiscuous enable/disable requests. 5994 * GLDv3 assumes phys state receives more packets than multi state, 5995 * which is not true for IPoIB. Thus, treat the multi and phys 5996 * promiscuous states the same way to work with GLDv3's assumption. 5997 */ 5998 static int 5999 ibd_m_promisc(void *arg, boolean_t on) 6000 { 6001 ibd_state_t *state = (ibd_state_t *)arg; 6002 ibd_req_t *req; 6003 6004 if (state->id_type == IBD_PORT_DRIVER) 6005 return (EINVAL); 6006 6007 /* 6008 * Async thread wouldn't have been started if we haven't 6009 * passed ibd_m_start() 6010 */ 6011 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 6012 return (0); 6013 6014 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 6015 if (req == NULL) 6016 return (ENOMEM); 6017 if (on) { 6018 DPRINT(1, "ibd_m_promisc : set_promisc : %d", on); 6019 ibd_queue_work_slot(state, req, IBD_ASYNC_PROMON); 6020 } else { 6021 DPRINT(1, "ibd_m_promisc : unset_promisc"); 6022 ibd_queue_work_slot(state, req, IBD_ASYNC_PROMOFF); 6023 } 6024 6025 return (0); 6026 } 6027 6028 /* 6029 * GLDv3 entry point for gathering statistics. 6030 */ 6031 static int 6032 ibd_m_stat(void *arg, uint_t stat, uint64_t *val) 6033 { 6034 ibd_state_t *state = (ibd_state_t *)arg; 6035 6036 switch (stat) { 6037 case MAC_STAT_IFSPEED: 6038 *val = state->id_link_speed; 6039 break; 6040 case MAC_STAT_MULTIRCV: 6041 *val = state->id_multi_rcv; 6042 break; 6043 case MAC_STAT_BRDCSTRCV: 6044 *val = state->id_brd_rcv; 6045 break; 6046 case MAC_STAT_MULTIXMT: 6047 *val = state->id_multi_xmt; 6048 break; 6049 case MAC_STAT_BRDCSTXMT: 6050 *val = state->id_brd_xmt; 6051 break; 6052 case MAC_STAT_RBYTES: 6053 *val = state->id_rcv_bytes + state->rc_rcv_trans_byte 6054 + state->rc_rcv_copy_byte; 6055 break; 6056 case MAC_STAT_IPACKETS: 6057 *val = state->id_rcv_pkt + state->rc_rcv_trans_pkt 6058 + state->rc_rcv_copy_pkt; 6059 break; 6060 case MAC_STAT_OBYTES: 6061 *val = state->id_xmt_bytes + state->rc_xmt_bytes; 6062 break; 6063 case MAC_STAT_OPACKETS: 6064 *val = state->id_xmt_pkt + state->rc_xmt_small_pkt + 6065 state->rc_xmt_fragmented_pkt + 6066 state->rc_xmt_map_fail_pkt + state->rc_xmt_map_succ_pkt; 6067 break; 6068 case MAC_STAT_OERRORS: 6069 *val = state->id_ah_error; /* failed AH translation */ 6070 break; 6071 case MAC_STAT_IERRORS: 6072 *val = 0; 6073 break; 6074 case MAC_STAT_NOXMTBUF: 6075 *val = state->id_tx_short + state->rc_swqe_short + 6076 state->rc_xmt_buf_short; 6077 break; 6078 case MAC_STAT_NORCVBUF: 6079 default: 6080 return (ENOTSUP); 6081 } 6082 6083 return (0); 6084 } 6085 6086 static void 6087 ibd_async_txsched(ibd_state_t *state) 6088 { 6089 ibd_resume_transmission(state); 6090 } 6091 6092 static void 6093 ibd_resume_transmission(ibd_state_t *state) 6094 { 6095 int flag; 6096 int met_thresh = 0; 6097 int thresh = 0; 6098 int ret = -1; 6099 6100 mutex_enter(&state->id_sched_lock); 6101 if (state->id_sched_needed & IBD_RSRC_SWQE) { 6102 mutex_enter(&state->id_tx_list.dl_mutex); 6103 mutex_enter(&state->id_tx_rel_list.dl_mutex); 6104 met_thresh = 
state->id_tx_list.dl_cnt + 6105 state->id_tx_rel_list.dl_cnt; 6106 mutex_exit(&state->id_tx_rel_list.dl_mutex); 6107 mutex_exit(&state->id_tx_list.dl_mutex); 6108 thresh = IBD_FREE_SWQES_THRESH; 6109 flag = IBD_RSRC_SWQE; 6110 } else if (state->id_sched_needed & IBD_RSRC_LSOBUF) { 6111 ASSERT(state->id_lso != NULL); 6112 mutex_enter(&state->id_lso_lock); 6113 met_thresh = state->id_lso->bkt_nfree; 6114 thresh = IBD_FREE_LSOS_THRESH; 6115 mutex_exit(&state->id_lso_lock); 6116 flag = IBD_RSRC_LSOBUF; 6117 if (met_thresh > thresh) 6118 state->id_sched_lso_cnt++; 6119 } 6120 if (met_thresh > thresh) { 6121 state->id_sched_needed &= ~flag; 6122 state->id_sched_cnt++; 6123 ret = 0; 6124 } 6125 mutex_exit(&state->id_sched_lock); 6126 6127 if (ret == 0) 6128 mac_tx_update(state->id_mh); 6129 } 6130 6131 /* 6132 * Release the send wqe back into free list. 6133 */ 6134 static void 6135 ibd_release_swqe(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail, int n) 6136 { 6137 /* 6138 * Add back on Tx list for reuse. 6139 */ 6140 ASSERT(tail->swqe_next == NULL); 6141 mutex_enter(&state->id_tx_rel_list.dl_mutex); 6142 state->id_tx_rel_list.dl_pending_sends = B_FALSE; 6143 tail->swqe_next = state->id_tx_rel_list.dl_head; 6144 state->id_tx_rel_list.dl_head = SWQE_TO_WQE(head); 6145 state->id_tx_rel_list.dl_cnt += n; 6146 mutex_exit(&state->id_tx_rel_list.dl_mutex); 6147 } 6148 6149 /* 6150 * Acquire a send wqe from free list. 6151 * Returns error number and send wqe pointer. 6152 */ 6153 static ibd_swqe_t * 6154 ibd_acquire_swqe(ibd_state_t *state) 6155 { 6156 ibd_swqe_t *wqe; 6157 6158 mutex_enter(&state->id_tx_rel_list.dl_mutex); 6159 if (state->id_tx_rel_list.dl_head != NULL) { 6160 /* transfer id_tx_rel_list to id_tx_list */ 6161 state->id_tx_list.dl_head = 6162 state->id_tx_rel_list.dl_head; 6163 state->id_tx_list.dl_cnt = 6164 state->id_tx_rel_list.dl_cnt; 6165 state->id_tx_list.dl_pending_sends = B_FALSE; 6166 6167 /* clear id_tx_rel_list */ 6168 state->id_tx_rel_list.dl_head = NULL; 6169 state->id_tx_rel_list.dl_cnt = 0; 6170 mutex_exit(&state->id_tx_rel_list.dl_mutex); 6171 6172 wqe = WQE_TO_SWQE(state->id_tx_list.dl_head); 6173 state->id_tx_list.dl_cnt -= 1; 6174 state->id_tx_list.dl_head = wqe->swqe_next; 6175 } else { /* no free swqe */ 6176 mutex_exit(&state->id_tx_rel_list.dl_mutex); 6177 state->id_tx_list.dl_pending_sends = B_TRUE; 6178 DPRINT(5, "ibd_acquire_swqe: out of Tx wqe"); 6179 state->id_tx_short++; 6180 wqe = NULL; 6181 } 6182 return (wqe); 6183 } 6184 6185 static int 6186 ibd_setup_lso(ibd_swqe_t *node, mblk_t *mp, uint32_t mss, 6187 ibt_ud_dest_hdl_t ud_dest) 6188 { 6189 mblk_t *nmp; 6190 int iph_len, tcph_len; 6191 ibt_wr_lso_t *lso; 6192 uintptr_t ip_start, tcp_start; 6193 uint8_t *dst; 6194 uint_t pending, mblen; 6195 6196 /* 6197 * The code in ibd_send would've set 'wr.ud.udwr_dest' by default; 6198 * we need to adjust it here for lso. 6199 */ 6200 lso = &(node->w_swr.wr.ud_lso); 6201 lso->lso_ud_dest = ud_dest; 6202 lso->lso_mss = mss; 6203 6204 /* 6205 * Calculate the LSO header size and set it in the UD LSO structure. 6206 * Note that the only assumption we make is that each of the IPoIB, 6207 * IP and TCP headers will be contained in a single mblk fragment; 6208 * together, the headers may span multiple mblk fragments. 
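	 *
	 * As a worked example (assuming the 4-byte IPoIB encapsulation
	 * header and no IP or TCP options), the size computed below is
	 *
	 *	lso_hdr_sz = IPOIB_HDRSIZE + iph_len + tcph_len
	 *		   = 4 + 20 + 20 = 44 bytes
	 *
	 * which typically fits within the first mblk, so the no-copy
	 * case further below usually applies.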
6209 */ 6210 nmp = mp; 6211 ip_start = (uintptr_t)(nmp->b_rptr) + IPOIB_HDRSIZE; 6212 if (ip_start >= (uintptr_t)(nmp->b_wptr)) { 6213 ip_start = (uintptr_t)nmp->b_cont->b_rptr 6214 + (ip_start - (uintptr_t)(nmp->b_wptr)); 6215 nmp = nmp->b_cont; 6216 6217 } 6218 iph_len = IPH_HDR_LENGTH((ipha_t *)ip_start); 6219 6220 tcp_start = ip_start + iph_len; 6221 if (tcp_start >= (uintptr_t)(nmp->b_wptr)) { 6222 tcp_start = (uintptr_t)nmp->b_cont->b_rptr 6223 + (tcp_start - (uintptr_t)(nmp->b_wptr)); 6224 nmp = nmp->b_cont; 6225 } 6226 tcph_len = TCP_HDR_LENGTH((tcph_t *)tcp_start); 6227 lso->lso_hdr_sz = IPOIB_HDRSIZE + iph_len + tcph_len; 6228 6229 /* 6230 * If the lso header fits entirely within a single mblk fragment, 6231 * we'll avoid an additional copy of the lso header here and just 6232 * pass the b_rptr of the mblk directly. 6233 * 6234 * If this isn't true, we'd have to allocate for it explicitly. 6235 */ 6236 if (lso->lso_hdr_sz <= MBLKL(mp)) { 6237 lso->lso_hdr = mp->b_rptr; 6238 } else { 6239 /* On work completion, remember to free this allocated hdr */ 6240 lso->lso_hdr = kmem_zalloc(lso->lso_hdr_sz, KM_NOSLEEP); 6241 if (lso->lso_hdr == NULL) { 6242 DPRINT(10, "ibd_setup_lso: couldn't allocate lso hdr, " 6243 "sz = %d", lso->lso_hdr_sz); 6244 lso->lso_hdr_sz = 0; 6245 lso->lso_mss = 0; 6246 return (-1); 6247 } 6248 } 6249 6250 /* 6251 * Copy in the lso header only if we need to 6252 */ 6253 if (lso->lso_hdr != mp->b_rptr) { 6254 dst = lso->lso_hdr; 6255 pending = lso->lso_hdr_sz; 6256 6257 for (nmp = mp; nmp && pending; nmp = nmp->b_cont) { 6258 mblen = MBLKL(nmp); 6259 if (pending > mblen) { 6260 bcopy(nmp->b_rptr, dst, mblen); 6261 dst += mblen; 6262 pending -= mblen; 6263 } else { 6264 bcopy(nmp->b_rptr, dst, pending); 6265 break; 6266 } 6267 } 6268 } 6269 6270 return (0); 6271 } 6272 6273 static void 6274 ibd_free_lsohdr(ibd_swqe_t *node, mblk_t *mp) 6275 { 6276 ibt_wr_lso_t *lso; 6277 6278 if ((!node) || (!mp)) 6279 return; 6280 6281 /* 6282 * Free any header space that we might've allocated if we 6283 * did an LSO 6284 */ 6285 if (node->w_swr.wr_opcode == IBT_WRC_SEND_LSO) { 6286 lso = &(node->w_swr.wr.ud_lso); 6287 if ((lso->lso_hdr) && (lso->lso_hdr != mp->b_rptr)) { 6288 kmem_free(lso->lso_hdr, lso->lso_hdr_sz); 6289 lso->lso_hdr = NULL; 6290 lso->lso_hdr_sz = 0; 6291 } 6292 } 6293 } 6294 6295 static void 6296 ibd_post_send(ibd_state_t *state, ibd_swqe_t *node) 6297 { 6298 uint_t i; 6299 uint_t num_posted; 6300 uint_t n_wrs; 6301 ibt_status_t ibt_status; 6302 ibt_send_wr_t wrs[IBD_MAX_TX_POST_MULTIPLE]; 6303 ibd_swqe_t *tx_head, *elem; 6304 ibd_swqe_t *nodes[IBD_MAX_TX_POST_MULTIPLE]; 6305 6306 /* post the one request, then check for more */ 6307 ibt_status = ibt_post_send(state->id_chnl_hdl, 6308 &node->w_swr, 1, NULL); 6309 if (ibt_status != IBT_SUCCESS) { 6310 ibd_print_warn(state, "ibd_post_send: " 6311 "posting one wr failed: ret=%d", ibt_status); 6312 ibd_tx_cleanup(state, node); 6313 } 6314 6315 tx_head = NULL; 6316 for (;;) { 6317 if (tx_head == NULL) { 6318 mutex_enter(&state->id_txpost_lock); 6319 tx_head = state->id_tx_head; 6320 if (tx_head == NULL) { 6321 state->id_tx_busy = 0; 6322 mutex_exit(&state->id_txpost_lock); 6323 return; 6324 } 6325 state->id_tx_head = NULL; 6326 mutex_exit(&state->id_txpost_lock); 6327 } 6328 6329 /* 6330 * Collect pending requests, IBD_MAX_TX_POST_MULTIPLE wrs 6331 * at a time if possible, and keep posting them. 
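		 *
		 * For context, the producer side in ibd_send() admits only
		 * one poster at a time; a sketch of that path (mirroring
		 * the code later in this file) is
		 *
		 *	mutex_enter(&state->id_txpost_lock);
		 *	if (state->id_tx_busy) {
		 *		chain node onto id_tx_head/id_tx_tail;
		 *		mutex_exit(&state->id_txpost_lock);
		 *	} else {
		 *		state->id_tx_busy = 1;
		 *		mutex_exit(&state->id_txpost_lock);
		 *		ibd_post_send(state, node);
		 *	}
		 *
		 * which is why this loop can drain id_tx_head without any
		 * further serialization.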
6332 */ 6333 for (n_wrs = 0, elem = tx_head; 6334 (elem) && (n_wrs < IBD_MAX_TX_POST_MULTIPLE); 6335 elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) { 6336 nodes[n_wrs] = elem; 6337 wrs[n_wrs] = elem->w_swr; 6338 } 6339 tx_head = elem; 6340 6341 ASSERT(n_wrs != 0); 6342 6343 /* 6344 * If posting fails for some reason, we'll never receive 6345 * completion intimation, so we'll need to cleanup. But 6346 * we need to make sure we don't clean up nodes whose 6347 * wrs have been successfully posted. We assume that the 6348 * hca driver returns on the first failure to post and 6349 * therefore the first 'num_posted' entries don't need 6350 * cleanup here. 6351 */ 6352 num_posted = 0; 6353 ibt_status = ibt_post_send(state->id_chnl_hdl, 6354 wrs, n_wrs, &num_posted); 6355 if (ibt_status != IBT_SUCCESS) { 6356 ibd_print_warn(state, "ibd_post_send: " 6357 "posting multiple wrs failed: " 6358 "requested=%d, done=%d, ret=%d", 6359 n_wrs, num_posted, ibt_status); 6360 6361 for (i = num_posted; i < n_wrs; i++) 6362 ibd_tx_cleanup(state, nodes[i]); 6363 } 6364 } 6365 } 6366 6367 static int 6368 ibd_prepare_sgl(ibd_state_t *state, mblk_t *mp, ibd_swqe_t *node, 6369 uint_t lsohdr_sz) 6370 { 6371 ibt_wr_ds_t *sgl; 6372 ibt_status_t ibt_status; 6373 mblk_t *nmp; 6374 mblk_t *data_mp; 6375 uchar_t *bufp; 6376 size_t blksize; 6377 size_t skip; 6378 size_t avail; 6379 uint_t pktsize; 6380 uint_t frag_len; 6381 uint_t pending_hdr; 6382 int nmblks; 6383 int i; 6384 6385 /* 6386 * Let's skip ahead to the data if this is LSO 6387 */ 6388 data_mp = mp; 6389 pending_hdr = 0; 6390 if (lsohdr_sz) { 6391 pending_hdr = lsohdr_sz; 6392 for (nmp = mp; nmp; nmp = nmp->b_cont) { 6393 frag_len = nmp->b_wptr - nmp->b_rptr; 6394 if (frag_len > pending_hdr) 6395 break; 6396 pending_hdr -= frag_len; 6397 } 6398 data_mp = nmp; /* start of data past lso header */ 6399 ASSERT(data_mp != NULL); 6400 } 6401 6402 /* 6403 * Calculate the size of message data and number of msg blocks 6404 */ 6405 pktsize = 0; 6406 for (nmblks = 0, nmp = data_mp; nmp != NULL; 6407 nmp = nmp->b_cont, nmblks++) { 6408 pktsize += MBLKL(nmp); 6409 } 6410 pktsize -= pending_hdr; 6411 6412 /* 6413 * We only do ibt_map_mem_iov() if the pktsize is above the 6414 * "copy-threshold", and if the number of mp fragments is less than 6415 * the maximum acceptable. 
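	 *
	 * A rough decision sketch of this routine (for orientation only):
	 *
	 *	if (lkey capable && pktsize > id_ud_tx_copy_thresh &&
	 *	    nmblks < id_max_sqseg_hiwm)
	 *		map in place via ibt_map_mem_iov()   (IBD_WQE_MAPPED)
	 *	else if (pktsize <= id_tx_buf_sz)
	 *		copy into the swqe copybuf           (IBD_WQE_TXBUF)
	 *	else
	 *		copy into pre-mapped LSO buffers     (IBD_WQE_LSOBUF)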
6416 */ 6417 if ((state->id_hca_res_lkey_capab) && 6418 (pktsize > state->id_ud_tx_copy_thresh) && 6419 (nmblks < state->id_max_sqseg_hiwm)) { 6420 ibt_iov_t iov_arr[IBD_MAX_SQSEG]; 6421 ibt_iov_attr_t iov_attr; 6422 6423 iov_attr.iov_as = NULL; 6424 iov_attr.iov = iov_arr; 6425 iov_attr.iov_buf = NULL; 6426 iov_attr.iov_list_len = nmblks; 6427 iov_attr.iov_wr_nds = state->id_max_sqseg; 6428 iov_attr.iov_lso_hdr_sz = lsohdr_sz; 6429 iov_attr.iov_flags = IBT_IOV_SLEEP; 6430 6431 for (nmp = data_mp, i = 0; i < nmblks; i++, nmp = nmp->b_cont) { 6432 iov_arr[i].iov_addr = (caddr_t)(void *)nmp->b_rptr; 6433 iov_arr[i].iov_len = MBLKL(nmp); 6434 if (i == 0) { 6435 iov_arr[i].iov_addr += pending_hdr; 6436 iov_arr[i].iov_len -= pending_hdr; 6437 } 6438 } 6439 6440 node->w_buftype = IBD_WQE_MAPPED; 6441 node->w_swr.wr_sgl = node->w_sgl; 6442 6443 ibt_status = ibt_map_mem_iov(state->id_hca_hdl, &iov_attr, 6444 (ibt_all_wr_t *)&node->w_swr, &node->w_mi_hdl); 6445 if (ibt_status != IBT_SUCCESS) { 6446 ibd_print_warn(state, "ibd_send: ibt_map_mem_iov " 6447 "failed, nmblks=%d, ret=%d\n", nmblks, ibt_status); 6448 goto ibd_copy_path; 6449 } 6450 6451 return (0); 6452 } 6453 6454 ibd_copy_path: 6455 if (pktsize <= state->id_tx_buf_sz) { 6456 node->swqe_copybuf.ic_sgl.ds_len = pktsize; 6457 node->w_swr.wr_nds = 1; 6458 node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl; 6459 node->w_buftype = IBD_WQE_TXBUF; 6460 6461 /* 6462 * Even though this is the copy path for transfers less than 6463 * id_tx_buf_sz, it could still be an LSO packet. If so, it 6464 * is possible the first data mblk fragment (data_mp) still 6465 * contains part of the LSO header that we need to skip. 6466 */ 6467 bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va; 6468 for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) { 6469 blksize = MBLKL(nmp) - pending_hdr; 6470 bcopy(nmp->b_rptr + pending_hdr, bufp, blksize); 6471 bufp += blksize; 6472 pending_hdr = 0; 6473 } 6474 6475 return (0); 6476 } 6477 6478 /* 6479 * Copy path for transfers greater than id_tx_buf_sz 6480 */ 6481 node->w_swr.wr_sgl = node->w_sgl; 6482 if (ibd_acquire_lsobufs(state, pktsize, 6483 node->w_swr.wr_sgl, &(node->w_swr.wr_nds)) != 0) { 6484 DPRINT(10, "ibd_prepare_sgl: lso bufs acquire failed"); 6485 return (-1); 6486 } 6487 node->w_buftype = IBD_WQE_LSOBUF; 6488 6489 /* 6490 * Copy the larger-than-id_tx_buf_sz packet into a set of 6491 * fixed-sized, pre-mapped LSO buffers. Note that we might 6492 * need to skip part of the LSO header in the first fragment 6493 * as before. 6494 */ 6495 nmp = data_mp; 6496 skip = pending_hdr; 6497 for (i = 0; i < node->w_swr.wr_nds; i++) { 6498 sgl = node->w_swr.wr_sgl + i; 6499 bufp = (uchar_t *)(uintptr_t)sgl->ds_va; 6500 avail = IBD_LSO_BUFSZ; 6501 while (nmp && avail) { 6502 blksize = MBLKL(nmp) - skip; 6503 if (blksize > avail) { 6504 bcopy(nmp->b_rptr + skip, bufp, avail); 6505 skip += avail; 6506 avail = 0; 6507 } else { 6508 bcopy(nmp->b_rptr + skip, bufp, blksize); 6509 skip = 0; 6510 avail -= blksize; 6511 bufp += blksize; 6512 nmp = nmp->b_cont; 6513 } 6514 } 6515 } 6516 6517 return (0); 6518 } 6519 6520 /* 6521 * Schedule a completion queue polling to reap the resource we're 6522 * short on. If we implement the change to reap tx completions 6523 * in a separate thread, we'll need to wake up that thread here. 
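 *
 * Typical usage from the send path (a sketch; the real call sites are
 * in ibd_send() below):
 *
 *	if (ibd_sched_poll(state, IBD_RSRC_SWQE, 0) == 0)
 *		return (B_FALSE);	(GLDv3 will hold the packet)
 *
 * The flag recorded in id_sched_needed is later cleared by
 * ibd_resume_transmission(), which also calls mac_tx_update() so that
 * GLDv3 retries the held packets.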
6524 */ 6525 static int 6526 ibd_sched_poll(ibd_state_t *state, int resource_type, int q_flag) 6527 { 6528 ibd_req_t *req; 6529 6530 mutex_enter(&state->id_sched_lock); 6531 state->id_sched_needed |= resource_type; 6532 mutex_exit(&state->id_sched_lock); 6533 6534 /* 6535 * If we are asked to queue a work entry, we need to do it 6536 */ 6537 if (q_flag) { 6538 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 6539 if (req == NULL) 6540 return (-1); 6541 6542 ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED); 6543 } 6544 6545 return (0); 6546 } 6547 6548 /* 6549 * The passed in packet has this format: 6550 * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data 6551 */ 6552 static boolean_t 6553 ibd_send(ibd_state_t *state, mblk_t *mp) 6554 { 6555 ibd_ace_t *ace; 6556 ibd_swqe_t *node; 6557 ipoib_mac_t *dest; 6558 ib_header_info_t *ipibp; 6559 ip6_t *ip6h; 6560 uint_t pktsize; 6561 uint32_t mss; 6562 uint32_t hckflags; 6563 uint32_t lsoflags = 0; 6564 uint_t lsohdr_sz = 0; 6565 int ret, len; 6566 boolean_t dofree = B_FALSE; 6567 boolean_t rc; 6568 /* if (rc_chan == NULL) send by UD; else send by RC; */ 6569 ibd_rc_chan_t *rc_chan; 6570 int nmblks; 6571 mblk_t *nmp; 6572 6573 /* 6574 * If we aren't done with the device initialization and start, 6575 * we shouldn't be here. 6576 */ 6577 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 6578 return (B_FALSE); 6579 6580 /* 6581 * Obtain an address handle for the destination. 6582 */ 6583 ipibp = (ib_header_info_t *)mp->b_rptr; 6584 dest = (ipoib_mac_t *)&ipibp->ib_dst; 6585 if ((ntohl(dest->ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) 6586 IBD_FILL_SCOPE_PKEY(dest, state->id_scope, state->id_pkey); 6587 6588 rc_chan = NULL; 6589 ace = ibd_acache_lookup(state, dest, &ret, 1); 6590 if (state->id_enable_rc && (ace != NULL) && 6591 (ace->ac_mac.ipoib_qpn != htonl(IB_MC_QPN))) { 6592 if (ace->ac_chan == NULL) { 6593 state->rc_null_conn++; 6594 } else { 6595 if (ace->ac_chan->chan_state == 6596 IBD_RC_STATE_ACT_ESTAB) { 6597 rc_chan = ace->ac_chan; 6598 rc_chan->is_used = B_TRUE; 6599 mutex_enter(&rc_chan->tx_wqe_list.dl_mutex); 6600 node = WQE_TO_SWQE( 6601 rc_chan->tx_wqe_list.dl_head); 6602 if (node != NULL) { 6603 rc_chan->tx_wqe_list.dl_cnt -= 1; 6604 rc_chan->tx_wqe_list.dl_head = 6605 node->swqe_next; 6606 } else { 6607 node = ibd_rc_acquire_swqes(rc_chan); 6608 } 6609 mutex_exit(&rc_chan->tx_wqe_list.dl_mutex); 6610 6611 if (node == NULL) { 6612 state->rc_swqe_short++; 6613 mutex_enter(&state->id_sched_lock); 6614 state->id_sched_needed |= 6615 IBD_RSRC_RC_SWQE; 6616 mutex_exit(&state->id_sched_lock); 6617 ibd_dec_ref_ace(state, ace); 6618 return (B_FALSE); 6619 } 6620 } else { 6621 state->rc_no_estab_conn++; 6622 } 6623 } 6624 } 6625 6626 if (rc_chan == NULL) { 6627 mutex_enter(&state->id_tx_list.dl_mutex); 6628 node = WQE_TO_SWQE(state->id_tx_list.dl_head); 6629 if (node != NULL) { 6630 state->id_tx_list.dl_cnt -= 1; 6631 state->id_tx_list.dl_head = node->swqe_next; 6632 } else { 6633 node = ibd_acquire_swqe(state); 6634 } 6635 mutex_exit(&state->id_tx_list.dl_mutex); 6636 if (node == NULL) { 6637 /* 6638 * If we don't have an swqe available, schedule a 6639 * transmit completion queue cleanup and hold off on 6640 * sending more packets until we have some free swqes 6641 */ 6642 if (ibd_sched_poll(state, IBD_RSRC_SWQE, 0) == 0) { 6643 if (ace != NULL) { 6644 ibd_dec_ref_ace(state, ace); 6645 } 6646 return (B_FALSE); 6647 } 6648 6649 /* 6650 * If a poll cannot be scheduled, we have no choice but 6651 * to drop this packet 6652 */ 6653 
ibd_print_warn(state, "ibd_send: no swqe, pkt drop"); 6654 if (ace != NULL) { 6655 ibd_dec_ref_ace(state, ace); 6656 } 6657 return (B_TRUE); 6658 } 6659 } 6660 6661 /* 6662 * Initialize the commonly used fields in swqe to NULL to protect 6663 * against ibd_tx_cleanup accidentally misinterpreting these on a 6664 * failure. 6665 */ 6666 node->swqe_im_mblk = NULL; 6667 node->w_swr.wr_nds = 0; 6668 node->w_swr.wr_sgl = NULL; 6669 node->w_swr.wr_opcode = IBT_WRC_SEND; 6670 6671 /* 6672 * Calculate the size of message data and number of msg blocks 6673 */ 6674 pktsize = 0; 6675 for (nmblks = 0, nmp = mp; nmp != NULL; 6676 nmp = nmp->b_cont, nmblks++) { 6677 pktsize += MBLKL(nmp); 6678 } 6679 6680 if (bcmp(&ipibp->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0) 6681 atomic_inc_64(&state->id_brd_xmt); 6682 else if ((ntohl(ipibp->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) 6683 atomic_inc_64(&state->id_multi_xmt); 6684 6685 if (ace != NULL) { 6686 node->w_ahandle = ace; 6687 node->w_swr.wr.ud.udwr_dest = ace->ac_dest; 6688 } else { 6689 DPRINT(5, 6690 "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X", 6691 ((ret == EFAULT) ? "failed" : "queued"), 6692 htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]), 6693 htonl(dest->ipoib_gidpref[1]), 6694 htonl(dest->ipoib_gidsuff[0]), 6695 htonl(dest->ipoib_gidsuff[1])); 6696 state->rc_ace_not_found++; 6697 node->w_ahandle = NULL; 6698 6699 /* 6700 * Here if ibd_acache_lookup() returns EFAULT, it means ibd 6701 * can not find a path for the specific dest address. We 6702 * should get rid of this kind of packet. We also should get 6703 * rid of the packet if we cannot schedule a poll via the 6704 * async thread. For the normal case, ibd will return the 6705 * packet to upper layer and wait for AH creating. 6706 * 6707 * Note that we always queue a work slot entry for the async 6708 * thread when we fail AH lookup (even in intr mode); this is 6709 * due to the convoluted way the code currently looks for AH. 6710 */ 6711 if (ret == EFAULT) { 6712 dofree = B_TRUE; 6713 rc = B_TRUE; 6714 } else if (ibd_sched_poll(state, IBD_RSRC_SWQE, 1) != 0) { 6715 dofree = B_TRUE; 6716 rc = B_TRUE; 6717 } else { 6718 dofree = B_FALSE; 6719 rc = B_FALSE; 6720 } 6721 goto ibd_send_fail; 6722 } 6723 6724 /* 6725 * For ND6 packets, padding is at the front of the source lladdr. 6726 * Insert the padding at front. 
6727 */ 6728 if (ntohs(ipibp->ipib_rhdr.ipoib_type) == ETHERTYPE_IPV6) { 6729 if (MBLKL(mp) < sizeof (ib_header_info_t) + IPV6_HDR_LEN) { 6730 if (!pullupmsg(mp, IPV6_HDR_LEN + 6731 sizeof (ib_header_info_t))) { 6732 DPRINT(10, "ibd_send: pullupmsg failure "); 6733 dofree = B_TRUE; 6734 rc = B_TRUE; 6735 goto ibd_send_fail; 6736 } 6737 ipibp = (ib_header_info_t *)mp->b_rptr; 6738 } 6739 ip6h = (ip6_t *)((uchar_t *)ipibp + 6740 sizeof (ib_header_info_t)); 6741 len = ntohs(ip6h->ip6_plen); 6742 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) { 6743 mblk_t *pad; 6744 6745 pad = allocb(4, 0); 6746 pad->b_wptr = (uchar_t *)pad->b_rptr + 4; 6747 linkb(mp, pad); 6748 if (MBLKL(mp) < sizeof (ib_header_info_t) + 6749 IPV6_HDR_LEN + len + 4) { 6750 if (!pullupmsg(mp, sizeof (ib_header_info_t) + 6751 IPV6_HDR_LEN + len + 4)) { 6752 DPRINT(10, "ibd_send: pullupmsg " 6753 "failure "); 6754 dofree = B_TRUE; 6755 rc = B_TRUE; 6756 goto ibd_send_fail; 6757 } 6758 ip6h = (ip6_t *)((uchar_t *)mp->b_rptr + 6759 sizeof (ib_header_info_t)); 6760 } 6761 6762 /* LINTED: E_CONSTANT_CONDITION */ 6763 IBD_PAD_NSNA(ip6h, len, IBD_SEND); 6764 } 6765 } 6766 6767 ASSERT(mp->b_wptr - mp->b_rptr >= sizeof (ib_addrs_t)); 6768 mp->b_rptr += sizeof (ib_addrs_t); 6769 pktsize -= sizeof (ib_addrs_t); 6770 6771 if (rc_chan) { /* send in RC mode */ 6772 ibt_iov_t iov_arr[IBD_MAX_SQSEG]; 6773 ibt_iov_attr_t iov_attr; 6774 uint_t i; 6775 size_t blksize; 6776 uchar_t *bufp; 6777 ibd_rc_tx_largebuf_t *lbufp; 6778 6779 atomic_add_64(&state->rc_xmt_bytes, pktsize); 6780 6781 /* 6782 * Upper layer does Tx checksum, we don't need do any 6783 * checksum here. 6784 */ 6785 ASSERT(node->w_swr.wr_trans == IBT_RC_SRV); 6786 6787 /* 6788 * We only do ibt_map_mem_iov() if the pktsize is above 6789 * the "copy-threshold", and if the number of mp 6790 * fragments is less than the maximum acceptable. 6791 */ 6792 if (pktsize <= state->id_rc_tx_copy_thresh) { 6793 atomic_inc_64(&state->rc_xmt_small_pkt); 6794 /* 6795 * Only process unicast packet in Reliable Connected 6796 * mode. 
6797 */ 6798 node->swqe_copybuf.ic_sgl.ds_len = pktsize; 6799 node->w_swr.wr_nds = 1; 6800 node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl; 6801 node->w_buftype = IBD_WQE_TXBUF; 6802 6803 bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va; 6804 for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) { 6805 blksize = MBLKL(nmp); 6806 bcopy(nmp->b_rptr, bufp, blksize); 6807 bufp += blksize; 6808 } 6809 freemsg(mp); 6810 ASSERT(node->swqe_im_mblk == NULL); 6811 } else { 6812 if ((state->rc_enable_iov_map) && 6813 (nmblks < state->rc_max_sqseg_hiwm)) { 6814 6815 /* do ibt_map_mem_iov() */ 6816 iov_attr.iov_as = NULL; 6817 iov_attr.iov = iov_arr; 6818 iov_attr.iov_buf = NULL; 6819 iov_attr.iov_wr_nds = state->rc_tx_max_sqseg; 6820 iov_attr.iov_lso_hdr_sz = 0; 6821 iov_attr.iov_flags = IBT_IOV_SLEEP; 6822 6823 i = 0; 6824 for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) { 6825 iov_arr[i].iov_len = MBLKL(nmp); 6826 if (iov_arr[i].iov_len != 0) { 6827 iov_arr[i].iov_addr = (caddr_t) 6828 (void *)nmp->b_rptr; 6829 i++; 6830 } 6831 } 6832 iov_attr.iov_list_len = i; 6833 node->w_swr.wr_sgl = node->w_sgl; 6834 6835 ret = ibt_map_mem_iov(state->id_hca_hdl, 6836 &iov_attr, (ibt_all_wr_t *)&node->w_swr, 6837 &node->w_mi_hdl); 6838 if (ret != IBT_SUCCESS) { 6839 atomic_inc_64( 6840 &state->rc_xmt_map_fail_pkt); 6841 DPRINT(30, "ibd_send: ibt_map_mem_iov(" 6842 ") failed, nmblks=%d, real_nmblks" 6843 "=%d, ret=0x%x", nmblks, i, ret); 6844 goto ibd_rc_large_copy; 6845 } 6846 6847 atomic_inc_64(&state->rc_xmt_map_succ_pkt); 6848 node->w_buftype = IBD_WQE_MAPPED; 6849 node->swqe_im_mblk = mp; 6850 } else { 6851 atomic_inc_64(&state->rc_xmt_fragmented_pkt); 6852 ibd_rc_large_copy: 6853 mutex_enter(&state->rc_tx_large_bufs_lock); 6854 if (state->rc_tx_largebuf_nfree == 0) { 6855 state->rc_xmt_buf_short++; 6856 mutex_exit 6857 (&state->rc_tx_large_bufs_lock); 6858 mutex_enter(&state->id_sched_lock); 6859 state->id_sched_needed |= 6860 IBD_RSRC_RC_TX_LARGEBUF; 6861 mutex_exit(&state->id_sched_lock); 6862 dofree = B_FALSE; 6863 rc = B_FALSE; 6864 /* 6865 * If we don't have Tx large bufs, 6866 * return failure. 
					 * node->w_buftype should not be
					 * IBD_WQE_RC_COPYBUF, otherwise it
					 * will cause problems in
					 * ibd_rc_tx_cleanup().
					 */
					node->w_buftype = IBD_WQE_TXBUF;
					goto ibd_send_fail;
				}

				lbufp = state->rc_tx_largebuf_free_head;
				ASSERT(lbufp->lb_buf != NULL);
				state->rc_tx_largebuf_free_head =
				    lbufp->lb_next;
				lbufp->lb_next = NULL;
				/* Update nfree count */
				state->rc_tx_largebuf_nfree --;
				mutex_exit(&state->rc_tx_large_bufs_lock);
				bufp = lbufp->lb_buf;
				node->w_sgl[0].ds_va =
				    (ib_vaddr_t)(uintptr_t)bufp;
				node->w_sgl[0].ds_key =
				    state->rc_tx_mr_desc.md_lkey;
				node->w_sgl[0].ds_len = pktsize;
				node->w_swr.wr_sgl = node->w_sgl;
				node->w_swr.wr_nds = 1;
				node->w_buftype = IBD_WQE_RC_COPYBUF;
				node->w_rc_tx_largebuf = lbufp;

				for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
					blksize = MBLKL(nmp);
					if (blksize != 0) {
						bcopy(nmp->b_rptr, bufp,
						    blksize);
						bufp += blksize;
					}
				}
				freemsg(mp);
				ASSERT(node->swqe_im_mblk == NULL);
			}
		}

		node->swqe_next = NULL;
		mutex_enter(&rc_chan->tx_post_lock);
		if (rc_chan->tx_busy) {
			if (rc_chan->tx_head) {
				rc_chan->tx_tail->swqe_next =
				    SWQE_TO_WQE(node);
			} else {
				rc_chan->tx_head = node;
			}
			rc_chan->tx_tail = node;
			mutex_exit(&rc_chan->tx_post_lock);
		} else {
			rc_chan->tx_busy = 1;
			mutex_exit(&rc_chan->tx_post_lock);
			ibd_rc_post_send(rc_chan, node);
		}

		return (B_TRUE);
	} /* send by RC */

	if ((state->id_enable_rc) && (pktsize > state->id_mtu)) {
		/*
		 * The packet is too long. The packet size from GLD should be
		 * <= state->id_mtu + sizeof (ib_addrs_t).
		 */
		if (ace->ac_mac.ipoib_qpn != htonl(IB_MC_QPN)) {
			ibd_req_t *req;

			mutex_enter(&ace->tx_too_big_mutex);
			if (ace->tx_too_big_ongoing) {
				mutex_exit(&ace->tx_too_big_mutex);
				state->rc_xmt_reenter_too_long_pkt++;
				dofree = B_TRUE;
			} else {
				ace->tx_too_big_ongoing = B_TRUE;
				mutex_exit(&ace->tx_too_big_mutex);
				state->rc_xmt_icmp_too_long_pkt++;

				req = kmem_cache_alloc(state->id_req_kmc,
				    KM_NOSLEEP);
				if (req == NULL) {
					ibd_print_warn(state, "ibd_send: alloc "
					    "ibd_req_t fail");
					/* Drop it. */
					dofree = B_TRUE;
				} else {
					req->rq_ptr = mp;
					req->rq_ptr2 = ace;
					ibd_queue_work_slot(state, req,
					    IBD_ASYNC_RC_TOO_BIG);
					dofree = B_FALSE;
				}
			}
		} else {
			ibd_print_warn(state, "Reliable Connected mode is on. "
			    "Multicast packet length %d > MTU %d is too long "
			    "to send, drop it", pktsize, state->id_mtu);
			state->rc_xmt_drop_too_long_pkt++;
			/* Drop it. */
			dofree = B_TRUE;
		}
		rc = B_TRUE;
		goto ibd_send_fail;
	}

	atomic_add_64(&state->id_xmt_bytes, pktsize);
	atomic_inc_64(&state->id_xmt_pkt);

	/*
	 * Do LSO and checksum related work here. For LSO send, adjust the
	 * ud destination, the opcode and the LSO header information to the
	 * work request.
6980 */ 6981 mac_lso_get(mp, &mss, &lsoflags); 6982 if ((lsoflags & HW_LSO) != HW_LSO) { 6983 node->w_swr.wr_opcode = IBT_WRC_SEND; 6984 lsohdr_sz = 0; 6985 } else { 6986 if (ibd_setup_lso(node, mp, mss, ace->ac_dest) != 0) { 6987 /* 6988 * The routine can only fail if there's no memory; we 6989 * can only drop the packet if this happens 6990 */ 6991 ibd_print_warn(state, 6992 "ibd_send: no memory, lso posting failed"); 6993 dofree = B_TRUE; 6994 rc = B_TRUE; 6995 goto ibd_send_fail; 6996 } 6997 6998 node->w_swr.wr_opcode = IBT_WRC_SEND_LSO; 6999 lsohdr_sz = (node->w_swr.wr.ud_lso).lso_hdr_sz; 7000 } 7001 7002 mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &hckflags); 7003 if ((hckflags & HCK_FULLCKSUM) == HCK_FULLCKSUM) 7004 node->w_swr.wr_flags |= IBT_WR_SEND_CKSUM; 7005 else 7006 node->w_swr.wr_flags &= ~IBT_WR_SEND_CKSUM; 7007 7008 /* 7009 * Prepare the sgl for posting; the routine can only fail if there's 7010 * no lso buf available for posting. If this is the case, we should 7011 * probably resched for lso bufs to become available and then try again. 7012 */ 7013 if (ibd_prepare_sgl(state, mp, node, lsohdr_sz) != 0) { 7014 if (ibd_sched_poll(state, IBD_RSRC_LSOBUF, 1) != 0) { 7015 dofree = B_TRUE; 7016 rc = B_TRUE; 7017 } else { 7018 dofree = B_FALSE; 7019 rc = B_FALSE; 7020 } 7021 goto ibd_send_fail; 7022 } 7023 node->swqe_im_mblk = mp; 7024 7025 /* 7026 * Queue the wqe to hardware; since we can now simply queue a 7027 * post instead of doing it serially, we cannot assume anything 7028 * about the 'node' after ibd_post_send() returns. 7029 */ 7030 node->swqe_next = NULL; 7031 7032 mutex_enter(&state->id_txpost_lock); 7033 if (state->id_tx_busy) { 7034 if (state->id_tx_head) { 7035 state->id_tx_tail->swqe_next = 7036 SWQE_TO_WQE(node); 7037 } else { 7038 state->id_tx_head = node; 7039 } 7040 state->id_tx_tail = node; 7041 mutex_exit(&state->id_txpost_lock); 7042 } else { 7043 state->id_tx_busy = 1; 7044 mutex_exit(&state->id_txpost_lock); 7045 ibd_post_send(state, node); 7046 } 7047 7048 return (B_TRUE); 7049 7050 ibd_send_fail: 7051 if (node && mp) 7052 ibd_free_lsohdr(node, mp); 7053 7054 if (dofree) 7055 freemsg(mp); 7056 7057 if (node != NULL) { 7058 if (rc_chan) { 7059 ibd_rc_tx_cleanup(node); 7060 } else { 7061 ibd_tx_cleanup(state, node); 7062 } 7063 } 7064 7065 return (rc); 7066 } 7067 7068 /* 7069 * GLDv3 entry point for transmitting datagram. 7070 */ 7071 static mblk_t * 7072 ibd_m_tx(void *arg, mblk_t *mp) 7073 { 7074 ibd_state_t *state = (ibd_state_t *)arg; 7075 mblk_t *next; 7076 7077 if (state->id_type == IBD_PORT_DRIVER) { 7078 freemsgchain(mp); 7079 return (NULL); 7080 } 7081 7082 if ((state->id_link_state != LINK_STATE_UP) || 7083 !(state->id_mac_state & IBD_DRV_STARTED)) { 7084 freemsgchain(mp); 7085 mp = NULL; 7086 } 7087 7088 while (mp != NULL) { 7089 next = mp->b_next; 7090 mp->b_next = NULL; 7091 if (ibd_send(state, mp) == B_FALSE) { 7092 /* Send fail */ 7093 mp->b_next = next; 7094 break; 7095 } 7096 mp = next; 7097 } 7098 7099 return (mp); 7100 } 7101 7102 /* 7103 * this handles Tx and Rx completions. With separate CQs, this handles 7104 * only Rx completions. 
7105 */ 7106 static uint_t 7107 ibd_intr(caddr_t arg) 7108 { 7109 ibd_state_t *state = (ibd_state_t *)arg; 7110 7111 ibd_poll_rcq(state, state->id_rcq_hdl); 7112 7113 return (DDI_INTR_CLAIMED); 7114 } 7115 7116 /* 7117 * Poll and fully drain the send cq 7118 */ 7119 static void 7120 ibd_drain_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) 7121 { 7122 ibt_wc_t *wcs = state->id_txwcs; 7123 uint_t numwcs = state->id_txwcs_size; 7124 ibd_wqe_t *wqe; 7125 ibd_swqe_t *head, *tail; 7126 ibt_wc_t *wc; 7127 uint_t num_polled; 7128 int i; 7129 7130 while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) { 7131 head = tail = NULL; 7132 for (i = 0, wc = wcs; i < num_polled; i++, wc++) { 7133 wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id; 7134 if (wc->wc_status != IBT_WC_SUCCESS) { 7135 /* 7136 * Channel being torn down. 7137 */ 7138 if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) { 7139 DPRINT(5, "ibd_drain_scq: flush error"); 7140 DPRINT(10, "ibd_drain_scq: Bad " 7141 "status %d", wc->wc_status); 7142 } else { 7143 DPRINT(10, "ibd_drain_scq: " 7144 "unexpected wc_status %d", 7145 wc->wc_status); 7146 } 7147 /* 7148 * Fallthrough to invoke the Tx handler to 7149 * release held resources, e.g., AH refcount. 7150 */ 7151 } 7152 /* 7153 * Add this swqe to the list to be cleaned up. 7154 */ 7155 if (head) 7156 tail->swqe_next = wqe; 7157 else 7158 head = WQE_TO_SWQE(wqe); 7159 tail = WQE_TO_SWQE(wqe); 7160 } 7161 tail->swqe_next = NULL; 7162 ibd_tx_cleanup_list(state, head, tail); 7163 7164 /* 7165 * Resume any blocked transmissions if possible 7166 */ 7167 ibd_resume_transmission(state); 7168 } 7169 } 7170 7171 /* 7172 * Poll and fully drain the receive cq 7173 */ 7174 static void 7175 ibd_drain_rcq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) 7176 { 7177 ibt_wc_t *wcs = state->id_rxwcs; 7178 uint_t numwcs = state->id_rxwcs_size; 7179 ibd_rwqe_t *rwqe; 7180 ibt_wc_t *wc; 7181 uint_t num_polled; 7182 int i; 7183 mblk_t *head, *tail, *mp; 7184 7185 while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) { 7186 head = tail = NULL; 7187 for (i = 0, wc = wcs; i < num_polled; i++, wc++) { 7188 rwqe = (ibd_rwqe_t *)(uintptr_t)wc->wc_id; 7189 if (wc->wc_status != IBT_WC_SUCCESS) { 7190 /* 7191 * Channel being torn down. 7192 */ 7193 if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) { 7194 DPRINT(5, "ibd_drain_rcq: " 7195 "expected flushed rwqe"); 7196 } else { 7197 DPRINT(5, "ibd_drain_rcq: " 7198 "unexpected wc_status %d", 7199 wc->wc_status); 7200 } 7201 atomic_inc_32( 7202 &state->id_rx_list.dl_bufs_outstanding); 7203 freemsg(rwqe->rwqe_im_mblk); 7204 continue; 7205 } 7206 mp = ibd_process_rx(state, rwqe, wc); 7207 if (mp == NULL) 7208 continue; 7209 7210 /* 7211 * Add this mp to the list to send to the nw layer. 7212 */ 7213 if (head) 7214 tail->b_next = mp; 7215 else 7216 head = mp; 7217 tail = mp; 7218 } 7219 if (head) 7220 mac_rx(state->id_mh, state->id_rh, head); 7221 7222 /* 7223 * Account for #rwqes polled. 7224 * Post more here, if less than one fourth full. 7225 */ 7226 if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, -num_polled) < 7227 (state->id_ud_num_rwqe / 4)) 7228 ibd_post_recv_intr(state); 7229 } 7230 } 7231 7232 /* 7233 * Common code for interrupt handling as well as for polling 7234 * for all completed wqe's while detaching. 
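 *
 * The overall pattern used below (summarized here only as a sketch) is
 * the usual one for closing the race between polling and completion
 * notification:
 *
 *	ibd_drain_scq(state, cq_hdl);
 *	do {
 *		ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
 *		ibd_drain_scq(state, cq_hdl);
 *	} while (another poll request arrived in the meantime);
 *
 * with id_scq_poll_busy and IBD_REDO_CQ_POLLING ensuring that only one
 * thread runs the loop while later requests merely set the redo flag.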
7235 */ 7236 static void 7237 ibd_poll_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) 7238 { 7239 int flag, redo_flag; 7240 int redo = 1; 7241 7242 flag = IBD_CQ_POLLING; 7243 redo_flag = IBD_REDO_CQ_POLLING; 7244 7245 mutex_enter(&state->id_scq_poll_lock); 7246 if (state->id_scq_poll_busy & flag) { 7247 ibd_print_warn(state, "ibd_poll_scq: multiple polling threads"); 7248 state->id_scq_poll_busy |= redo_flag; 7249 mutex_exit(&state->id_scq_poll_lock); 7250 return; 7251 } 7252 state->id_scq_poll_busy |= flag; 7253 mutex_exit(&state->id_scq_poll_lock); 7254 7255 /* 7256 * In some cases (eg detaching), this code can be invoked on 7257 * any cpu after disabling cq notification (thus no concurrency 7258 * exists). Apart from that, the following applies normally: 7259 * Transmit completion handling could be from any cpu if 7260 * Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ 7261 * is interrupt driven. 7262 */ 7263 7264 /* 7265 * Poll and drain the CQ 7266 */ 7267 ibd_drain_scq(state, cq_hdl); 7268 7269 /* 7270 * Enable CQ notifications and redrain the cq to catch any 7271 * completions we might have missed after the ibd_drain_scq() 7272 * above and before the ibt_enable_cq_notify() that follows. 7273 * Finally, service any new requests to poll the cq that 7274 * could've come in after the ibt_enable_cq_notify(). 7275 */ 7276 do { 7277 if (ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION) != 7278 IBT_SUCCESS) { 7279 DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed"); 7280 } 7281 7282 ibd_drain_scq(state, cq_hdl); 7283 7284 mutex_enter(&state->id_scq_poll_lock); 7285 if (state->id_scq_poll_busy & redo_flag) 7286 state->id_scq_poll_busy &= ~redo_flag; 7287 else { 7288 state->id_scq_poll_busy &= ~flag; 7289 redo = 0; 7290 } 7291 mutex_exit(&state->id_scq_poll_lock); 7292 7293 } while (redo); 7294 } 7295 7296 /* 7297 * Common code for interrupt handling as well as for polling 7298 * for all completed wqe's while detaching. 7299 */ 7300 static void 7301 ibd_poll_rcq(ibd_state_t *state, ibt_cq_hdl_t rcq) 7302 { 7303 int flag, redo_flag; 7304 int redo = 1; 7305 7306 flag = IBD_CQ_POLLING; 7307 redo_flag = IBD_REDO_CQ_POLLING; 7308 7309 mutex_enter(&state->id_rcq_poll_lock); 7310 if (state->id_rcq_poll_busy & flag) { 7311 ibd_print_warn(state, "ibd_poll_rcq: multiple polling threads"); 7312 state->id_rcq_poll_busy |= redo_flag; 7313 mutex_exit(&state->id_rcq_poll_lock); 7314 return; 7315 } 7316 state->id_rcq_poll_busy |= flag; 7317 mutex_exit(&state->id_rcq_poll_lock); 7318 7319 /* 7320 * Poll and drain the CQ 7321 */ 7322 ibd_drain_rcq(state, rcq); 7323 7324 /* 7325 * Enable CQ notifications and redrain the cq to catch any 7326 * completions we might have missed after the ibd_drain_cq() 7327 * above and before the ibt_enable_cq_notify() that follows. 7328 * Finally, service any new requests to poll the cq that 7329 * could've come in after the ibt_enable_cq_notify(). 7330 */ 7331 do { 7332 if (ibt_enable_cq_notify(rcq, IBT_NEXT_COMPLETION) != 7333 IBT_SUCCESS) { 7334 DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed"); 7335 } 7336 7337 ibd_drain_rcq(state, rcq); 7338 7339 mutex_enter(&state->id_rcq_poll_lock); 7340 if (state->id_rcq_poll_busy & redo_flag) 7341 state->id_rcq_poll_busy &= ~redo_flag; 7342 else { 7343 state->id_rcq_poll_busy &= ~flag; 7344 redo = 0; 7345 } 7346 mutex_exit(&state->id_rcq_poll_lock); 7347 7348 } while (redo); 7349 } 7350 7351 /* 7352 * Unmap the memory area associated with a given swqe. 
7353 */ 7354 void 7355 ibd_unmap_mem(ibd_state_t *state, ibd_swqe_t *swqe) 7356 { 7357 ibt_status_t stat; 7358 7359 DPRINT(20, "ibd_unmap_mem: wqe=%p, seg=%d\n", swqe, swqe->w_swr.wr_nds); 7360 7361 if (swqe->w_mi_hdl) { 7362 if ((stat = ibt_unmap_mem_iov(state->id_hca_hdl, 7363 swqe->w_mi_hdl)) != IBT_SUCCESS) { 7364 DPRINT(10, 7365 "failed in ibt_unmap_mem_iov, ret=%d\n", stat); 7366 } 7367 swqe->w_mi_hdl = NULL; 7368 } 7369 swqe->w_swr.wr_nds = 0; 7370 } 7371 7372 void 7373 ibd_dec_ref_ace(ibd_state_t *state, ibd_ace_t *ace) 7374 { 7375 /* 7376 * The recycling logic can be eliminated from here 7377 * and put into the async thread if we create another 7378 * list to hold ACE's for unjoined mcg's. 7379 */ 7380 if (DEC_REF_DO_CYCLE(ace)) { 7381 ibd_mce_t *mce; 7382 7383 /* 7384 * Check with the lock taken: we decremented 7385 * reference count without the lock, and some 7386 * transmitter might already have bumped the 7387 * reference count (possible in case of multicast 7388 * disable when we leave the AH on the active 7389 * list). If not still 0, get out, leaving the 7390 * recycle bit intact. 7391 * 7392 * Atomically transition the AH from active 7393 * to free list, and queue a work request to 7394 * leave the group and destroy the mce. No 7395 * transmitter can be looking at the AH or 7396 * the MCE in between, since we have the 7397 * ac_mutex lock. In the SendOnly reap case, 7398 * it is not necessary to hold the ac_mutex 7399 * and recheck the ref count (since the AH was 7400 * taken off the active list), we just do it 7401 * to have uniform processing with the Full 7402 * reap case. 7403 */ 7404 mutex_enter(&state->id_ac_mutex); 7405 mce = ace->ac_mce; 7406 if (GET_REF_CYCLE(ace) == 0) { 7407 CLEAR_REFCYCLE(ace); 7408 /* 7409 * Identify the case of fullmember reap as 7410 * opposed to mcg trap reap. Also, port up 7411 * might set ac_mce to NULL to indicate Tx 7412 * cleanup should do no more than put the 7413 * AH in the free list (see ibd_async_link). 7414 */ 7415 if (mce != NULL) { 7416 ace->ac_mce = NULL; 7417 IBD_ACACHE_PULLOUT_ACTIVE(state, ace); 7418 /* 7419 * mc_req was initialized at mce 7420 * creation time. 7421 */ 7422 ibd_queue_work_slot(state, 7423 &mce->mc_req, IBD_ASYNC_REAP); 7424 } 7425 IBD_ACACHE_INSERT_FREE(state, ace); 7426 } 7427 mutex_exit(&state->id_ac_mutex); 7428 } 7429 } 7430 7431 /* 7432 * Common code that deals with clean ups after a successful or 7433 * erroneous transmission attempt. 7434 */ 7435 static void 7436 ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe) 7437 { 7438 ibd_ace_t *ace = swqe->w_ahandle; 7439 7440 DPRINT(20, "ibd_tx_cleanup %p\n", swqe); 7441 7442 /* 7443 * If this was a dynamic mapping in ibd_send(), we need to 7444 * unmap here. If this was an lso buffer we'd used for sending, 7445 * we need to release the lso buf to the pool, since the resource 7446 * is scarce. However, if this was simply a normal send using 7447 * the copybuf (present in each swqe), we don't need to release it. 7448 */ 7449 if (swqe->swqe_im_mblk != NULL) { 7450 if (swqe->w_buftype == IBD_WQE_MAPPED) { 7451 ibd_unmap_mem(state, swqe); 7452 } else if (swqe->w_buftype == IBD_WQE_LSOBUF) { 7453 ibd_release_lsobufs(state, 7454 swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds); 7455 } 7456 ibd_free_lsohdr(swqe, swqe->swqe_im_mblk); 7457 freemsg(swqe->swqe_im_mblk); 7458 swqe->swqe_im_mblk = NULL; 7459 } 7460 7461 /* 7462 * Drop the reference count on the AH; it can be reused 7463 * now for a different destination if there are no more 7464 * posted sends that will use it. 
This can be eliminated 7465 * if we can always associate each Tx buffer with an AH. 7466 * The ace can be null if we are cleaning up from the 7467 * ibd_send() error path. 7468 */ 7469 if (ace != NULL) { 7470 ibd_dec_ref_ace(state, ace); 7471 } 7472 7473 /* 7474 * Release the send wqe for reuse. 7475 */ 7476 swqe->swqe_next = NULL; 7477 ibd_release_swqe(state, swqe, swqe, 1); 7478 } 7479 7480 static void 7481 ibd_tx_cleanup_list(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail) 7482 { 7483 ibd_ace_t *ace; 7484 ibd_swqe_t *swqe; 7485 int n = 0; 7486 7487 DPRINT(20, "ibd_tx_cleanup_list %p %p\n", head, tail); 7488 7489 for (swqe = head; swqe != NULL; swqe = WQE_TO_SWQE(swqe->swqe_next)) { 7490 7491 /* 7492 * If this was a dynamic mapping in ibd_send(), we need to 7493 * unmap here. If this was an lso buffer we'd used for sending, 7494 * we need to release the lso buf to the pool, since the 7495 * resource is scarce. However, if this was simply a normal 7496 * send using the copybuf (present in each swqe), we don't need 7497 * to release it. 7498 */ 7499 if (swqe->swqe_im_mblk != NULL) { 7500 if (swqe->w_buftype == IBD_WQE_MAPPED) { 7501 ibd_unmap_mem(state, swqe); 7502 } else if (swqe->w_buftype == IBD_WQE_LSOBUF) { 7503 ibd_release_lsobufs(state, 7504 swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds); 7505 } 7506 ibd_free_lsohdr(swqe, swqe->swqe_im_mblk); 7507 freemsg(swqe->swqe_im_mblk); 7508 swqe->swqe_im_mblk = NULL; 7509 } 7510 7511 /* 7512 * Drop the reference count on the AH; it can be reused 7513 * now for a different destination if there are no more 7514 * posted sends that will use it. This can be eliminated 7515 * if we can always associate each Tx buffer with an AH. 7516 * The ace can be null if we are cleaning up from the 7517 * ibd_send() error path. 7518 */ 7519 ace = swqe->w_ahandle; 7520 if (ace != NULL) { 7521 ibd_dec_ref_ace(state, ace); 7522 } 7523 n++; 7524 } 7525 7526 /* 7527 * Release the send wqes for reuse. 7528 */ 7529 ibd_release_swqe(state, head, tail, n); 7530 } 7531 7532 /* 7533 * Processing to be done after receipt of a packet; hand off to GLD 7534 * in the format expected by GLD. The received packet has this 7535 * format: 2b sap :: 00 :: data. 7536 */ 7537 static mblk_t * 7538 ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc) 7539 { 7540 ib_header_info_t *phdr; 7541 mblk_t *mp; 7542 ipoib_hdr_t *ipibp; 7543 ipha_t *iphap; 7544 ip6_t *ip6h; 7545 int len; 7546 ib_msglen_t pkt_len = wc->wc_bytes_xfer; 7547 uint32_t bufs; 7548 7549 /* 7550 * Track number handed to upper layer that need to be returned. 7551 */ 7552 bufs = atomic_inc_32_nv(&state->id_rx_list.dl_bufs_outstanding); 7553 7554 /* Never run out of rwqes, use allocb when running low */ 7555 if (bufs >= state->id_rx_bufs_outstanding_limit) { 7556 atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding); 7557 atomic_inc_32(&state->id_rx_allocb); 7558 mp = allocb(pkt_len, BPRI_HI); 7559 if (mp) { 7560 bcopy(rwqe->rwqe_im_mblk->b_rptr, mp->b_rptr, pkt_len); 7561 ibd_post_recv(state, rwqe); 7562 } else { /* no memory */ 7563 atomic_inc_32(&state->id_rx_allocb_failed); 7564 ibd_post_recv(state, rwqe); 7565 return (NULL); 7566 } 7567 } else { 7568 mp = rwqe->rwqe_im_mblk; 7569 } 7570 7571 7572 /* 7573 * Adjust write pointer depending on how much data came in. 7574 */ 7575 mp->b_wptr = mp->b_rptr + pkt_len; 7576 7577 /* 7578 * Make sure this is NULL or we're in trouble. 

/*
 * Processing to be done after receipt of a packet; hand off to GLD
 * in the format expected by GLD. The received packet has this
 * format: 2b sap :: 00 :: data.
 */
static mblk_t *
ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc)
{
	ib_header_info_t *phdr;
	mblk_t *mp;
	ipoib_hdr_t *ipibp;
	ipha_t *iphap;
	ip6_t *ip6h;
	int len;
	ib_msglen_t pkt_len = wc->wc_bytes_xfer;
	uint32_t bufs;

	/*
	 * Track the number of buffers handed to the upper layer that
	 * need to be returned.
	 */
	bufs = atomic_inc_32_nv(&state->id_rx_list.dl_bufs_outstanding);

	/* Never run out of rwqes, use allocb when running low */
	if (bufs >= state->id_rx_bufs_outstanding_limit) {
		atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding);
		atomic_inc_32(&state->id_rx_allocb);
		mp = allocb(pkt_len, BPRI_HI);
		if (mp) {
			bcopy(rwqe->rwqe_im_mblk->b_rptr, mp->b_rptr, pkt_len);
			ibd_post_recv(state, rwqe);
		} else {	/* no memory */
			atomic_inc_32(&state->id_rx_allocb_failed);
			ibd_post_recv(state, rwqe);
			return (NULL);
		}
	} else {
		mp = rwqe->rwqe_im_mblk;
	}

	/*
	 * Adjust the write pointer depending on how much data came in.
	 */
	mp->b_wptr = mp->b_rptr + pkt_len;

	/*
	 * Make sure this is NULL or we're in trouble.
	 */
	if (mp->b_next != NULL) {
		ibd_print_warn(state,
		    "ibd_process_rx: got duplicate mp from rcq?");
		mp->b_next = NULL;
	}

	/*
	 * The IB link will deliver one of the IB link-layer headers,
	 * called the Global Routing Header (GRH). The ibd driver uses
	 * the information in the GRH to build the header_info structure
	 * and pass it with the datagram up to GLDv3.
	 * If the GRH is not valid, indicate that to GLDv3 by setting
	 * the VerTcFlow field to 0.
	 */
	phdr = (ib_header_info_t *)mp->b_rptr;
	if (wc->wc_flags & IBT_WC_GRH_PRESENT) {
		phdr->ib_grh.ipoib_sqpn = htonl(wc->wc_qpn);

		/* If it is a loopback packet, just drop it. */
		if (state->id_enable_rc) {
			if (bcmp(&phdr->ib_grh.ipoib_sqpn,
			    &state->rc_macaddr_loopback,
			    IPOIB_ADDRL) == 0) {
				freemsg(mp);
				return (NULL);
			}
		} else {
			if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->id_macaddr,
			    IPOIB_ADDRL) == 0) {
				freemsg(mp);
				return (NULL);
			}
		}

		ovbcopy(&phdr->ib_grh.ipoib_sqpn, &phdr->ib_src,
		    sizeof (ipoib_mac_t));
		if (*(uint8_t *)(phdr->ib_grh.ipoib_dgid_pref) == 0xFF) {
			phdr->ib_dst.ipoib_qpn = htonl(IB_MC_QPN);
			IBD_CLEAR_SCOPE_PKEY(&phdr->ib_dst);
		} else {
			phdr->ib_dst.ipoib_qpn = state->id_macaddr.ipoib_qpn;
		}
	} else {
		/*
		 * It cannot be an IBA multicast packet. It must have been
		 * unicast for us. Just copy the interface address to dst.
		 */
		phdr->ib_grh.ipoib_vertcflow = 0;
		ovbcopy(&state->id_macaddr, &phdr->ib_dst,
		    sizeof (ipoib_mac_t));
	}

	/*
	 * For ND6 packets, padding is at the front of the source/target
	 * lladdr. However, the inet6 layer is not aware of it; hence remove
	 * the padding from such packets.
	 */
	ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t));
	if (ntohs(ipibp->ipoib_type) == ETHERTYPE_IPV6) {
		ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
		len = ntohs(ip6h->ip6_plen);
		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
			/* LINTED: E_CONSTANT_CONDITION */
			IBD_PAD_NSNA(ip6h, len, IBD_RECV);
		}
	}

	/*
	 * Update statistics
	 */
	atomic_add_64(&state->id_rcv_bytes, pkt_len);
	atomic_inc_64(&state->id_rcv_pkt);
	if (bcmp(&phdr->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
		atomic_inc_64(&state->id_brd_rcv);
	else if ((ntohl(phdr->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
		atomic_inc_64(&state->id_multi_rcv);

	iphap = (ipha_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
	/*
	 * Set the receive checksum status in mp.
	 * Hardware checksumming can be considered valid only if:
	 * 1. the CQE.IP_OK bit is set,
	 * 2. CQE.CKSUM == 0xffff,
	 * 3. no IPv6 routing header is present in the packet, and
	 * 4. there are no IP options in the IP header.
	 */
	if (((wc->wc_flags & IBT_WC_CKSUM_OK) == IBT_WC_CKSUM_OK) &&
	    (wc->wc_cksum == 0xFFFF) &&
	    (iphap->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)) {
		mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM_OK);
	}

	return (mp);
}
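
/*
 * Illustrative note (conceptual, not taken from this file): the
 * full-checksum status set above with mac_hcksum_set() is what the
 * upper layer later reads back through the matching accessor, e.g.
 *
 *	uint32_t flags;
 *
 *	mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &flags);
 *	if (flags & HCK_FULLCKSUM_OK)
 *		... software checksum verification can be skipped ...
 *
 * Only HCK_FULLCKSUM_OK is advertised here; no partial-checksum
 * information is passed up on the receive side.
 */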

/*
 * Callback code invoked from STREAMS when the receive data buffer is
 * free for recycling.
 */
static void
ibd_freemsg_cb(char *arg)
{
	ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
	ibd_state_t *state = rwqe->w_state;

	atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding);

	/*
	 * If the driver is stopped, just free the rwqe.
	 */
	if (atomic_add_32_nv(&state->id_running, 0) == 0) {
		DPRINT(6, "ibd_freemsg: wqe being freed");
		rwqe->rwqe_im_mblk = NULL;
		ibd_free_rwqe(state, rwqe);
		return;
	}

	rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
	    state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
	if (rwqe->rwqe_im_mblk == NULL) {
		ibd_free_rwqe(state, rwqe);
		DPRINT(6, "ibd_freemsg: desballoc failed");
		return;
	}

	ibd_post_recv(state, rwqe);
}

static uint_t
ibd_tx_recycle(caddr_t arg)
{
	ibd_state_t *state = (ibd_state_t *)arg;

	/*
	 * Poll for completed entries
	 */
	ibd_poll_scq(state, state->id_scq_hdl);

	return (DDI_INTR_CLAIMED);
}

#ifdef IBD_LOGGING
static void
ibd_log_init(void)
{
	ibd_lbuf = kmem_zalloc(IBD_LOG_SZ, KM_SLEEP);
	ibd_lbuf_ndx = 0;

	mutex_init(&ibd_lbuf_lock, NULL, MUTEX_DRIVER, NULL);
}

static void
ibd_log_fini(void)
{
	if (ibd_lbuf)
		kmem_free(ibd_lbuf, IBD_LOG_SZ);
	ibd_lbuf_ndx = 0;
	ibd_lbuf = NULL;

	mutex_destroy(&ibd_lbuf_lock);
}

static void
ibd_log(const char *fmt, ...)
{
	va_list ap;
	uint32_t off;
	uint32_t msglen;
	char tmpbuf[IBD_DMAX_LINE];

	if (ibd_lbuf == NULL)
		return;

	va_start(ap, fmt);
	msglen = vsnprintf(tmpbuf, IBD_DMAX_LINE, fmt, ap);
	va_end(ap);

	if (msglen >= IBD_DMAX_LINE)
		msglen = IBD_DMAX_LINE - 1;

	mutex_enter(&ibd_lbuf_lock);

	off = ibd_lbuf_ndx;		/* current msg should go here */
	if ((ibd_lbuf_ndx) && (ibd_lbuf[ibd_lbuf_ndx-1] != '\n'))
		ibd_lbuf[ibd_lbuf_ndx-1] = '\n';

	ibd_lbuf_ndx += msglen;		/* place where next msg should start */
	ibd_lbuf[ibd_lbuf_ndx] = 0;	/* current msg should terminate */

	if (ibd_lbuf_ndx >= (IBD_LOG_SZ - 2 * IBD_DMAX_LINE))
		ibd_lbuf_ndx = 0;

	mutex_exit(&ibd_lbuf_lock);

	bcopy(tmpbuf, ibd_lbuf+off, msglen);	/* no lock needed for this */
}
#endif
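
/*
 * Illustrative usage of the debug log above (compiled in only when
 * IBD_LOGGING is defined): callers format one line at a time and the
 * text is appended to the circular ibd_lbuf, which wraps near the end
 * of the buffer so roughly the most recent IBD_LOG_SZ bytes are kept:
 *
 *	ibd_log("ibd: posted %d rwqes on port %d", nposted, state->id_port);
 *
 * The buffer can then be inspected from a crash dump or the live kernel
 * with a debugger; the exact inspection method is left to the developer.
 */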

/* ARGSUSED */
static int
ibd_create_partition(void *karg, intptr_t arg, int mode, cred_t *credp,
    int *rvalp)
{
	ibd_create_ioctl_t *cmd = karg;
	ibd_state_t *state, *port_state, *p;
	int i, err, rval = 0;
	mac_register_t *macp;
	ibt_hca_portinfo_t *pinfop = NULL;
	ibt_status_t ibt_status;
	uint_t psize, pinfosz;
	boolean_t force_create = B_FALSE;

	cmd->ibdioc.ioc_status = 0;

	if (cmd->ibdioc.ioc_port_inst < 0) {
		cmd->ibdioc.ioc_status = IBD_INVALID_PORT_INST;
		return (EINVAL);
	}
	port_state = ddi_get_soft_state(ibd_list, cmd->ibdioc.ioc_port_inst);
	if (port_state == NULL) {
		DPRINT(10, "ibd_create_partition: failed to get state %d",
		    cmd->ibdioc.ioc_port_inst);
		cmd->ibdioc.ioc_status = IBD_INVALID_PORT_INST;
		return (EINVAL);
	}

	/* Limited PKeys not supported */
	if (cmd->ioc_pkey <= IB_PKEY_INVALID_FULL) {
		rval = EINVAL;
		goto part_create_return;
	}

	if (cmd->ioc_force_create == 0) {
		/*
		 * Check if the port pkey table contains the pkey for which
		 * this partition is being created.
		 */
		ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl,
		    port_state->id_port, &pinfop, &psize, &pinfosz);

		if ((ibt_status != IBT_SUCCESS) || (psize != 1)) {
			rval = EINVAL;
			goto part_create_return;
		}

		if (pinfop->p_linkstate != IBT_PORT_ACTIVE) {
			rval = ENETDOWN;
			cmd->ibdioc.ioc_status = IBD_PORT_IS_DOWN;
			goto part_create_return;
		}

		for (i = 0; i < pinfop->p_pkey_tbl_sz; i++) {
			if (pinfop->p_pkey_tbl[i] == cmd->ioc_pkey) {
				break;
			}
		}
		if (i == pinfop->p_pkey_tbl_sz) {
			rval = EINVAL;
			cmd->ibdioc.ioc_status = IBD_PKEY_NOT_PRESENT;
			goto part_create_return;
		}
	} else {
		force_create = B_TRUE;
	}

	mutex_enter(&ibd_objlist_lock);
	for (p = ibd_objlist_head; p; p = p->id_next) {
		if ((p->id_port_inst == cmd->ibdioc.ioc_port_inst) &&
		    (p->id_pkey == cmd->ioc_pkey) &&
		    (p->id_plinkid == cmd->ioc_partid)) {
			mutex_exit(&ibd_objlist_lock);
			rval = EEXIST;
			cmd->ibdioc.ioc_status = IBD_PARTITION_EXISTS;
			goto part_create_return;
		}
	}
	mutex_exit(&ibd_objlist_lock);

	state = kmem_zalloc(sizeof (ibd_state_t), KM_SLEEP);

	state->id_type = IBD_PARTITION_OBJ;

	state->id_plinkid = cmd->ioc_partid;
	state->id_dlinkid = cmd->ibdioc.ioc_linkid;
	state->id_port_inst = cmd->ibdioc.ioc_port_inst;

	state->id_dip = port_state->id_dip;
	state->id_port = port_state->id_port;
	state->id_pkey = cmd->ioc_pkey;
	state->id_hca_guid = port_state->id_hca_guid;
	state->id_port_guid = port_state->id_port_guid;
	state->id_force_create = force_create;

	mutex_init(&state->id_macst_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&state->id_macst_cv, NULL, CV_DEFAULT, NULL);

	if (ibd_part_attach(state, state->id_dip) != DDI_SUCCESS) {
		rval = EIO;
		cmd->ibdioc.ioc_status = IBD_NO_HW_RESOURCE;
		goto fail;
	}

	if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
		rval = EAGAIN;
		goto fail;
	}

	macp->m_type_ident = MAC_PLUGIN_IDENT_IB;
	macp->m_dip = port_state->id_dip;
	macp->m_instance = (uint_t)-1;
	macp->m_driver = state;
	macp->m_src_addr = (uint8_t *)&state->id_macaddr;
	macp->m_callbacks = &ibd_m_callbacks;
	macp->m_min_sdu = 0;
	macp->m_multicast_sdu = IBD_DEF_MAX_SDU;
	if (state->id_enable_rc) {
		macp->m_max_sdu = IBD_DEF_RC_MAX_SDU;
	} else {
		macp->m_max_sdu = IBD_DEF_MAX_SDU;
	}
	macp->m_priv_props = ibd_priv_props;

	err = mac_register(macp, &state->id_mh);
	mac_free(macp);

	if (err != 0) {
		DPRINT(10, "ibd_create_partition: mac_register() failed %d",
		    err);
		rval = err;
		goto fail;
	}

	err = dls_devnet_create(state->id_mh,
	    cmd->ioc_partid, crgetzoneid(credp));
	if (err != 0) {
		DPRINT(10, "ibd_create_partition: dls_devnet_create() failed "
		    "%d", err);
		rval = err;
		(void) mac_unregister(state->id_mh);
		goto fail;
	}

	/*
	 * Add the new partition state structure to the list
	 */
	mutex_enter(&ibd_objlist_lock);
	if (ibd_objlist_head)
		state->id_next = ibd_objlist_head;

	ibd_objlist_head = state;
	mutex_exit(&ibd_objlist_lock);

part_create_return:
	if (pinfop) {
		ibt_free_portinfo(pinfop, pinfosz);
	}
	return (rval);

fail:
	if (pinfop) {
		ibt_free_portinfo(pinfop, pinfosz);
	}
	ibd_part_unattach(state);
	kmem_free(state, sizeof (ibd_state_t));
	return (rval);
}
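
/*
 * Administrative sketch (the exact command syntax below is assumed and
 * shown only for context): the ioctl above is normally reached through
 * dladm, roughly as
 *
 *	# dladm create-part -l <port-link> -P <pkey> <part-link>
 *	# dladm create-part -f -l <port-link> -P <pkey> <part-link>
 *
 * where the force variant corresponds to ioc_force_create and skips the
 * check that the PKEY is already present in the port's pkey table.
 */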

/* ARGSUSED */
static int
ibd_delete_partition(void *karg, intptr_t arg, int mode, cred_t *credp,
    int *rvalp)
{
	int err;
	datalink_id_t tmpid;
	ibd_state_t *node, *prev;
	ibd_delete_ioctl_t *cmd = karg;

	prev = NULL;

	mutex_enter(&ibd_objlist_lock);
	node = ibd_objlist_head;

	/* Find the ibd state structure corresponding to the partition */
	while (node != NULL) {
		if (node->id_plinkid == cmd->ioc_partid)
			break;
		prev = node;
		node = node->id_next;
	}

	if (node == NULL) {
		mutex_exit(&ibd_objlist_lock);
		return (ENOENT);
	}

	if ((err = dls_devnet_destroy(node->id_mh, &tmpid, B_TRUE)) != 0) {
		DPRINT(10, "ibd_delete_partition: dls_devnet_destroy() failed "
		    "%d", err);
		mutex_exit(&ibd_objlist_lock);
		return (err);
	}

	/*
	 * Call ibd_part_unattach() only after making sure that the instance
	 * has not been started yet and is also not in late HCA init mode.
	 */
	ibd_set_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS);

	err = 0;
	if ((node->id_mac_state & IBD_DRV_STARTED) ||
	    (node->id_mac_state & IBD_DRV_IN_LATE_HCA_INIT) ||
	    (ibd_part_busy(node) != DDI_SUCCESS) ||
	    ((err = mac_disable(node->id_mh)) != 0)) {
		(void) dls_devnet_create(node->id_mh, cmd->ioc_partid,
		    crgetzoneid(credp));
		ibd_clr_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS);
		mutex_exit(&ibd_objlist_lock);
		return (err != 0 ? err : EBUSY);
	}

	node->id_mac_state |= IBD_DRV_IN_DELETION;

	ibd_part_unattach(node);

	ibd_clr_mac_progress(node, IBD_DRV_DELETE_IN_PROGRESS);

	/* Remove the partition state structure from the linked list */
	if (prev == NULL)
		ibd_objlist_head = node->id_next;
	else
		prev->id_next = node->id_next;
	mutex_exit(&ibd_objlist_lock);

	if ((err = mac_unregister(node->id_mh)) != 0) {
		DPRINT(10, "ibd_delete_partition: mac_unregister() failed %d",
		    err);
	}

	cv_destroy(&node->id_macst_cv);
	mutex_destroy(&node->id_macst_lock);

	kmem_free(node, sizeof (ibd_state_t));

	return (0);
}
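
/*
 * Illustrative note on the teardown ordering above: the datalink entry
 * is destroyed first, and only then is the partition checked for being
 * started, in late HCA init, busy, or not disableable. If any of those
 * checks fail, the datalink is recreated with dls_devnet_create() so
 * that a failed delete, e.g. (assumed syntax)
 *
 *	# dladm delete-part <part-link>
 *
 * returns a busy-style error without leaving the link half removed.
 */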

/* ARGSUSED */
static int
ibd_get_partition_info(void *karg, intptr_t arg, int mode, cred_t *cred,
    int *rvalp)
{
	ibd_ioctl_t cmd;
	ibpart_ioctl_t partioc;
	ibport_ioctl_t portioc;
#ifdef _MULTI_DATAMODEL
	ibport_ioctl32_t portioc32;
#endif
	ibd_state_t *state, *port_state;
	int size;
	ibt_hca_portinfo_t *pinfop = NULL;
	ibt_status_t ibt_status;
	uint_t psize, pinfosz;
	int rval = 0;

	size = sizeof (ibd_ioctl_t);
	if (ddi_copyin((void *)arg, &cmd, size, mode)) {
		return (EFAULT);
	}
	cmd.ioc_status = 0;
	switch (cmd.ioc_info_cmd) {
	case IBD_INFO_CMD_IBPART:
		size = sizeof (ibpart_ioctl_t);
		if (ddi_copyin((void *)arg, &partioc, size, mode)) {
			return (EFAULT);
		}

		mutex_enter(&ibd_objlist_lock);
		/* Find the ibd state structure corresponding to the partition */
		for (state = ibd_objlist_head; state; state = state->id_next) {
			if (state->id_plinkid == cmd.ioc_linkid) {
				break;
			}
		}

		if (state == NULL) {
			mutex_exit(&ibd_objlist_lock);
			return (ENOENT);
		}

		partioc.ibdioc.ioc_linkid = state->id_dlinkid;
		partioc.ibdioc.ioc_port_inst = state->id_port_inst;
		partioc.ibdioc.ioc_portnum = state->id_port;
		partioc.ibdioc.ioc_hcaguid = state->id_hca_guid;
		partioc.ibdioc.ioc_portguid = state->id_port_guid;
		partioc.ibdioc.ioc_status = 0;
		partioc.ioc_partid = state->id_plinkid;
		partioc.ioc_pkey = state->id_pkey;
		partioc.ioc_force_create = state->id_force_create;
		if (ddi_copyout((void *)&partioc, (void *)arg, size, mode)) {
			mutex_exit(&ibd_objlist_lock);
			return (EFAULT);
		}
		mutex_exit(&ibd_objlist_lock);

		break;

	case IBD_INFO_CMD_IBPORT:
		if ((cmd.ioc_port_inst < 0) || ((port_state =
		    ddi_get_soft_state(ibd_list, cmd.ioc_port_inst)) == NULL)) {
			DPRINT(10, "ibd_get_partition_info: failed to get"
			    " state %d", cmd.ioc_port_inst);
			size = sizeof (ibd_ioctl_t);
			cmd.ioc_status = IBD_INVALID_PORT_INST;
			if (ddi_copyout((void *)&cmd, (void *)arg, size,
			    mode)) {
				return (EFAULT);
			}
			return (EINVAL);
		}
		ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl,
		    port_state->id_port, &pinfop, &psize, &pinfosz);
		if ((ibt_status != IBT_SUCCESS) || (psize != 1)) {
			return (EINVAL);
		}
#ifdef _MULTI_DATAMODEL
		switch (ddi_model_convert_from(mode & FMODELS)) {
		case DDI_MODEL_ILP32: {
			size = sizeof (ibport_ioctl32_t);
			if (ddi_copyin((void *)arg, &portioc32, size, mode)) {
				rval = EFAULT;
				goto fail;
			}
			portioc32.ibdioc.ioc_status = 0;
			portioc32.ibdioc.ioc_portnum = port_state->id_port;
			portioc32.ibdioc.ioc_hcaguid =
			    port_state->id_hca_guid;
			portioc32.ibdioc.ioc_portguid =
			    port_state->id_port_guid;
			if (portioc32.ioc_pkey_tbl_sz !=
			    pinfop->p_pkey_tbl_sz) {
				rval = EINVAL;
				size = sizeof (ibd_ioctl_t);
				portioc32.ibdioc.ioc_status =
				    IBD_INVALID_PKEY_TBL_SIZE;
				if (ddi_copyout((void *)&portioc32.ibdioc,
				    (void *)arg, size, mode)) {
					rval = EFAULT;
					goto fail;
				}
				goto fail;
			}
			size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t);
			if (ddi_copyout((void *)pinfop->p_pkey_tbl,
			    (void *)(uintptr_t)portioc32.ioc_pkeys, size,
			    mode)) {
				rval = EFAULT;
				goto fail;
			}
			size = sizeof (ibport_ioctl32_t);
			if (ddi_copyout((void *)&portioc32, (void *)arg, size,
			    mode)) {
				rval = EFAULT;
				goto fail;
			}
			break;
		}
		case DDI_MODEL_NONE:
			size = sizeof (ibport_ioctl_t);
			if (ddi_copyin((void *)arg, &portioc, size, mode)) {
				rval = EFAULT;
				goto fail;
			}
			portioc.ibdioc.ioc_status = 0;
			portioc.ibdioc.ioc_portnum = port_state->id_port;
			portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
			portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
			if (portioc.ioc_pkey_tbl_sz != pinfop->p_pkey_tbl_sz) {
				rval = EINVAL;
				size = sizeof (ibd_ioctl_t);
				portioc.ibdioc.ioc_status =
				    IBD_INVALID_PKEY_TBL_SIZE;
				if (ddi_copyout((void *)&portioc.ibdioc,
				    (void *)arg, size, mode)) {
					rval = EFAULT;
					goto fail;
				}
				goto fail;
			}
			size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t);
			if (ddi_copyout((void *)pinfop->p_pkey_tbl,
			    (void *)(portioc.ioc_pkeys), size, mode)) {
				rval = EFAULT;
				goto fail;
			}
			size = sizeof (ibport_ioctl_t);
			if (ddi_copyout((void *)&portioc, (void *)arg, size,
			    mode)) {
				rval = EFAULT;
				goto fail;
			}
			break;
		}
#else /* ! _MULTI_DATAMODEL */
		size = sizeof (ibport_ioctl_t);
		if (ddi_copyin((void *)arg, &portioc, size, mode)) {
			rval = EFAULT;
			goto fail;
		}
		portioc.ibdioc.ioc_status = 0;
		portioc.ibdioc.ioc_portnum = port_state->id_port;
		portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
		portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
		if (portioc.ioc_pkey_tbl_sz != pinfop->p_pkey_tbl_sz) {
			rval = EINVAL;
			size = sizeof (ibd_ioctl_t);
			portioc.ibdioc.ioc_status = IBD_INVALID_PKEY_TBL_SIZE;
			if (ddi_copyout((void *)&portioc.ibdioc, (void *)arg,
			    size, mode)) {
				rval = EFAULT;
				goto fail;
			}
			goto fail;
		}
		size = pinfop->p_pkey_tbl_sz * sizeof (ib_pkey_t);
		if (ddi_copyout((void *)pinfop->p_pkey_tbl,
		    (void *)(portioc.ioc_pkeys), size, mode)) {
			rval = EFAULT;
			goto fail;
		}
		size = sizeof (ibport_ioctl_t);
		if (ddi_copyout((void *)&portioc, (void *)arg, size,
		    mode)) {
			rval = EFAULT;
			goto fail;
		}
#endif /* _MULTI_DATAMODEL */

		break;

	case IBD_INFO_CMD_PKEYTBLSZ:
		if ((cmd.ioc_port_inst < 0) || ((port_state =
		    ddi_get_soft_state(ibd_list, cmd.ioc_port_inst)) == NULL)) {
			DPRINT(10, "ibd_get_partition_info: failed to get"
			    " state %d", cmd.ioc_port_inst);
			size = sizeof (ibd_ioctl_t);
			cmd.ioc_status = IBD_INVALID_PORT_INST;
			if (ddi_copyout((void *)&cmd, (void *)arg, size,
			    mode)) {
				return (EFAULT);
			}
			return (EINVAL);
		}
		ibt_status = ibt_query_hca_ports(port_state->id_hca_hdl,
		    port_state->id_port, &pinfop, &psize, &pinfosz);
		if ((ibt_status != IBT_SUCCESS) || (psize != 1)) {
			return (EINVAL);
		}
#ifdef _MULTI_DATAMODEL
		switch (ddi_model_convert_from(mode & FMODELS)) {
		case DDI_MODEL_ILP32: {
			size = sizeof (ibport_ioctl32_t);
			if (ddi_copyin((void *)arg, &portioc32, size, mode)) {
				rval = EFAULT;
				goto fail;
			}
			portioc32.ibdioc.ioc_status = 0;
			portioc32.ibdioc.ioc_portnum = port_state->id_port;
			portioc32.ibdioc.ioc_hcaguid =
			    port_state->id_hca_guid;
			portioc32.ibdioc.ioc_portguid =
			    port_state->id_port_guid;
			portioc32.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz;
			if (ddi_copyout((void *)&portioc32, (void *)arg, size,
			    mode)) {
				rval = EFAULT;
				goto fail;
			}
			break;
		}
		case DDI_MODEL_NONE:
			size = sizeof (ibport_ioctl_t);
			if (ddi_copyin((void *)arg, &portioc, size, mode)) {
				rval = EFAULT;
				goto fail;
			}
			portioc.ibdioc.ioc_status = 0;
			portioc.ibdioc.ioc_portnum = port_state->id_port;
			portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
			portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
			portioc.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz;
			if (ddi_copyout((void *)&portioc, (void *)arg, size,
			    mode)) {
				rval = EFAULT;
				goto fail;
			}
			break;
		}
#else /* ! _MULTI_DATAMODEL */
		size = sizeof (ibport_ioctl_t);
		if (ddi_copyin((void *)arg, &portioc, size, mode)) {
			rval = EFAULT;
			goto fail;
		}
		portioc.ibdioc.ioc_status = 0;
		portioc.ibdioc.ioc_portnum = port_state->id_port;
		portioc.ibdioc.ioc_hcaguid = port_state->id_hca_guid;
		portioc.ibdioc.ioc_portguid = port_state->id_port_guid;
		portioc.ioc_pkey_tbl_sz = pinfop->p_pkey_tbl_sz;
		if (ddi_copyout((void *)&portioc, (void *)arg, size,
		    mode)) {
			rval = EFAULT;
			goto fail;
		}
#endif /* _MULTI_DATAMODEL */
		break;

	default:
		return (EINVAL);

	}	/* switch (cmd.ioc_info_cmd) */
fail:
	if (pinfop) {
		ibt_free_portinfo(pinfop, pinfosz);
	}
	return (rval);
}
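
/*
 * Illustrative note: the #ifdef _MULTI_DATAMODEL sections above follow
 * the usual DDI pattern for ioctls whose structures contain pointers
 * and therefore have different layouts for 32-bit and 64-bit callers:
 *
 *	switch (ddi_model_convert_from(mode & FMODELS)) {
 *	case DDI_MODEL_ILP32:
 *		... ddi_copyin()/ddi_copyout() the ILP32 layout ...
 *		break;
 *	case DDI_MODEL_NONE:
 *		... native layout ...
 *		break;
 *	}
 *
 * Only ibport_ioctl_t needs this treatment here (it carries the
 * ioc_pkeys pointer); ibpart_ioctl_t is copied with a single layout in
 * the IBD_INFO_CMD_IBPART case.
 */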

/* ARGSUSED */
static void
ibdpd_async_handler(void *arg, ibt_hca_hdl_t hca_hdl,
    ibt_async_code_t code, ibt_async_event_t *event)
{
	ibd_state_t *state = (ibd_state_t *)arg;
	link_state_t lstate;

	switch (code) {
	case IBT_EVENT_PORT_UP:
	case IBT_ERROR_PORT_DOWN:
		if (ibd_get_port_state(state, &lstate) != 0)
			break;

		if (state->id_link_state != lstate) {
			state->id_link_state = lstate;
			mac_link_update(state->id_mh, lstate);
		}
		break;
	default:
		break;
	}
}

static int
ibd_get_port_state(ibd_state_t *state, link_state_t *lstate)
{
	ibt_hca_portinfo_t *port_infop;
	uint_t psize, port_infosz;
	ibt_status_t ret;

	ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
	    &port_infop, &psize, &port_infosz);
	if ((ret != IBT_SUCCESS) || (psize != 1))
		return (-1);

	state->id_sgid = *port_infop->p_sgid_tbl;
	state->id_link_speed = ibd_get_portspeed(state);

	if (port_infop->p_linkstate == IBT_PORT_ACTIVE)
		*lstate = LINK_STATE_UP;
	else
		*lstate = LINK_STATE_DOWN;

	ibt_free_portinfo(port_infop, port_infosz);
	return (0);
}
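
/*
 * Illustrative note: ibdpd_async_handler() is the async handler for the
 * port-driver IBTL attachment made in ibd_port_attach() below
 * (presumably wired up through ibdpd_clnt_modinfo). It reacts only to
 * port up/down transitions and simply refreshes the cached link state:
 *
 *	IBT_EVENT_PORT_UP / IBT_ERROR_PORT_DOWN
 *	    -> ibd_get_port_state() -> mac_link_update()
 *
 * ibd_get_port_state() also refreshes the cached SGID and link speed as
 * a side effect.
 */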

static int
ibd_port_attach(dev_info_t *dip)
{
	ibd_state_t *state;
	link_state_t lstate;
	int instance;
	ibt_status_t ret;

	/*
	 * Allocate softstate structure
	 */
	instance = ddi_get_instance(dip);
	if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE) {
		DPRINT(10, "ibd_port_attach: ddi_soft_state_zalloc() failed");
		return (DDI_FAILURE);
	}

	state = ddi_get_soft_state(ibd_list, instance);

	state->id_dip = dip;
	state->id_type = IBD_PORT_DRIVER;

	if ((state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
	    "port-number", 0)) == 0) {
		DPRINT(10, "ibd_port_attach: invalid port number (%d)",
		    state->id_port);
		return (DDI_FAILURE);
	}
	if ((state->id_hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0,
	    "hca-guid", 0)) == 0) {
		DPRINT(10, "ibd_port_attach: hca has invalid guid (0x%llx)",
		    state->id_hca_guid);
		return (DDI_FAILURE);
	}
	if ((state->id_port_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0,
	    "port-guid", 0)) == 0) {
		DPRINT(10, "ibd_port_attach: port has invalid guid (0x%llx)",
		    state->id_port_guid);
		return (DDI_FAILURE);
	}

	/*
	 * Attach to IBTL
	 */
	if ((ret = ibt_attach(&ibdpd_clnt_modinfo, dip, state,
	    &state->id_ibt_hdl)) != IBT_SUCCESS) {
		DPRINT(10, "ibd_port_attach: failed in ibt_attach(), ret=%d",
		    ret);
		goto done;
	}

	state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE;

	if ((ret = ibt_open_hca(state->id_ibt_hdl, state->id_hca_guid,
	    &state->id_hca_hdl)) != IBT_SUCCESS) {
		DPRINT(10, "ibd_port_attach: ibt_open_hca() failed, ret=%d",
		    ret);
		goto done;
	}
	state->id_mac_state |= IBD_DRV_HCA_OPENED;

	/* Update link status */

	if (ibd_get_port_state(state, &lstate) != 0) {
		DPRINT(10, "ibd_port_attach: ibd_get_port_state() failed");
		goto done;
	}
	state->id_link_state = lstate;
	/*
	 * Register ibd interfaces with the Nemo framework
	 */
	if (ibd_register_mac(state, dip) != IBT_SUCCESS) {
		DPRINT(10, "ibd_port_attach: failed in ibd_register_mac()");
		goto done;
	}
	state->id_mac_state |= IBD_DRV_MAC_REGISTERED;

	mac_link_update(state->id_mh, lstate);

	return (DDI_SUCCESS);
done:
	(void) ibd_port_unattach(state, dip);
	return (DDI_FAILURE);
}

static int
ibd_port_unattach(ibd_state_t *state, dev_info_t *dip)
{
	int instance;
	uint32_t progress = state->id_mac_state;
	ibt_status_t ret;

	if (progress & IBD_DRV_MAC_REGISTERED) {
		(void) mac_unregister(state->id_mh);
		state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED);
	}

	if (progress & IBD_DRV_HCA_OPENED) {
		if ((ret = ibt_close_hca(state->id_hca_hdl)) !=
		    IBT_SUCCESS) {
			ibd_print_warn(state, "failed to close "
			    "HCA device, ret=%d", ret);
		}
		state->id_hca_hdl = NULL;
		state->id_mac_state &= (~IBD_DRV_HCA_OPENED);
	}

	if (progress & IBD_DRV_IBTL_ATTACH_DONE) {
		if ((ret = ibt_detach(state->id_ibt_hdl)) != IBT_SUCCESS) {
			ibd_print_warn(state,
			    "ibt_detach() failed, ret=%d", ret);
		}
		state->id_ibt_hdl = NULL;
		state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE);
	}
	instance = ddi_get_instance(dip);
	ddi_soft_state_free(ibd_list, instance);

	return (DDI_SUCCESS);
}
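
/*
 * Illustrative note: ibd_port_attach()/ibd_port_unattach() follow the
 * common "progress bits" pattern: each attach step that completes sets a
 * flag in id_mac_state, and the unattach path tears down only the steps
 * whose flags are set, in reverse order:
 *
 *	IBD_DRV_IBTL_ATTACH_DONE  <->  ibt_attach()/ibt_detach()
 *	IBD_DRV_HCA_OPENED        <->  ibt_open_hca()/ibt_close_hca()
 *	IBD_DRV_MAC_REGISTERED    <->  mac_register()/mac_unregister()
 *
 * This is what lets the attach failure path simply call
 * ibd_port_unattach() no matter how far it got.
 */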

ibt_status_t
ibd_get_part_attr(datalink_id_t linkid, ibt_part_attr_t *attr)
{
	ibd_state_t *state;

	mutex_enter(&ibd_objlist_lock);

	/* Find the ibd state structure corresponding to the partition */
	for (state = ibd_objlist_head; state; state = state->id_next) {
		if (state->id_plinkid == linkid) {
			break;
		}
	}

	if (state == NULL) {
		mutex_exit(&ibd_objlist_lock);
		return (IBT_NO_SUCH_OBJECT);
	}

	attr->pa_dlinkid = state->id_dlinkid;
	attr->pa_plinkid = state->id_plinkid;
	attr->pa_port = state->id_port;
	attr->pa_hca_guid = state->id_hca_guid;
	attr->pa_port_guid = state->id_port_guid;
	attr->pa_pkey = state->id_pkey;

	mutex_exit(&ibd_objlist_lock);

	return (IBT_SUCCESS);
}

ibt_status_t
ibd_get_all_part_attr(ibt_part_attr_t **attr_list, int *nparts)
{
	ibd_state_t *state;
	int n = 0;
	ibt_part_attr_t *attr;

	mutex_enter(&ibd_objlist_lock);

	for (state = ibd_objlist_head; state; state = state->id_next)
		n++;

	*nparts = n;
	if (n == 0) {
		*attr_list = NULL;
		mutex_exit(&ibd_objlist_lock);
		return (IBT_SUCCESS);
	}

	*attr_list = kmem_alloc(sizeof (ibt_part_attr_t) * n, KM_SLEEP);
	attr = *attr_list;
	for (state = ibd_objlist_head; state; state = state->id_next) {
#ifdef DEBUG
		ASSERT(n > 0);
		n--;
#endif
		attr->pa_dlinkid = state->id_dlinkid;
		attr->pa_plinkid = state->id_plinkid;
		attr->pa_port = state->id_port;
		attr->pa_hca_guid = state->id_hca_guid;
		attr->pa_port_guid = state->id_port_guid;
		attr->pa_pkey = state->id_pkey;
		attr++;
	}

	mutex_exit(&ibd_objlist_lock);
	return (IBT_SUCCESS);
}
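
/*
 * Illustrative caller sketch (the freeing convention is assumed, not
 * shown in this file): ibd_get_all_part_attr() allocates the attribute
 * array with kmem_alloc(), so a consumer would be expected to release
 * it when done, e.g.
 *
 *	ibt_part_attr_t *attrs;
 *	int nparts;
 *
 *	if (ibd_get_all_part_attr(&attrs, &nparts) == IBT_SUCCESS) {
 *		... walk attrs[0 .. nparts - 1] ...
 *		if (nparts > 0)
 *			kmem_free(attrs, sizeof (ibt_part_attr_t) * nparts);
 *	}
 */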